這裡我用 pandas.DataFrame 的 groupby,依照特徵子集的取值幫我把樣本分組,
然後用 apply(list) 把每一組裡的樣本編號收集成一個 list 輸出,
就可以得到論文中一開始所說的「等價類」。
# equivalence relation
def eq_relation(f_list, data, item_name, subset=None):
    """Partition the observations into equivalence classes.

    Rows of ``data`` that share the same values on every column in
    ``f_list`` fall into the same class.

    Args:
        f_list: Feature subset (column names to group by).
        data: Observation set (used here as a pandas DataFrame).
        item_name: Name of the column that holds the sample ids.
        subset: Optional collection of sample ids; when given, only rows
            whose id is in it are partitioned. ``None`` keeps every row.

    Returns:
        A list of lists; each inner list holds the sample ids of one
        equivalence class.
    """
    ids = data[item_name]
    wanted = ids if subset is None else subset
    in_scope = data[ids.isin(wanted)]
    classes = in_scope.groupby(f_list)[item_name].apply(list)
    return classes.tolist()
由於目前所講到的粗糙集特徵選取只使用到 POS(正域,也就是內部),
所以這裡只寫了 POS 的部分:
就是一個一個看有哪些 P 的等價類被完整包含在某一個 Q 的等價類中,再把這些等價類的樣本數除以全部樣本數。
def pos_dep(f_list, q_list, data, item_name, subset=None):
    """Return the share of observations in the positive region of Q given P.

    An equivalence class induced by ``f_list`` (P) is positive when all of
    its sample ids lie inside a single equivalence class induced by
    ``q_list`` (Q).

    Args:
        f_list: Column names of the candidate feature subset P.
        q_list: Column names of the target attributes Q.
        data: Observation set (used here as a pandas DataFrame).
        item_name: Name of the column that holds the sample ids.
        subset: Optional collection of sample ids; when given, only those
            samples are partitioned. ``None`` means every sample in ``data``.

    Returns:
        ``0`` when ``f_list`` or ``q_list`` is empty or ``data`` has no
        rows; otherwise the number of distinct positive sample ids divided
        by the number of rows in ``data``.
    """
    if len(f_list) == 0 or len(q_list) == 0:
        return 0

    # NOTE(review): the denominator is the full table even when ``subset``
    # is given, as in the original expression -- confirm this is intended.
    total = len(data[item_name])
    if total == 0:
        # Nothing to classify; avoid dividing by zero.
        return 0

    # Previously the universe was only bound when ``subset`` was None, so
    # passing a subset raised UnboundLocalError.
    universe = data[item_name] if subset is None else subset

    p_classes = eq_relation(f_list, data, item_name, subset=universe)
    q_classes = [
        set(q_class)
        for q_class in eq_relation(q_list, data, item_name, subset=universe)
    ]

    # Set containment replaces the per-element list scans of the original.
    positive = set()
    for p_class in p_classes:
        if any(q_class.issuperset(p_class) for q_class in q_classes):
            positive.update(p_class)
    return len(positive) / total
最後就是模仿向前特徵選取(forward selection),
把 pos_dep 當作模型表現力,
每次只新增可以讓模型表現最好的那一個特徵,直到表現不再進步為止。
def rough_feature_selection(q_list, data, item_name, feature_list, subset=None):
    """Greedy forward feature selection scored by ``pos_dep``.

    Each round scores every not-yet-selected feature added to the current
    selection and keeps the single addition with the highest score, provided
    it strictly beats the best score so far. Selection stops after the first
    round that brings no improvement.

    Args:
        q_list: Column names of the target attributes Q.
        data: Observation set passed through to ``pos_dep``.
        item_name: Name of the column that holds the sample ids.
        feature_list: Candidate feature column names.
        subset: Accepted for signature compatibility.
            NOTE(review): it is not forwarded to ``pos_dep``, so it currently
            has no effect -- confirm whether it should be.

    Returns:
        A tuple ``(selected_features, best_score)``; ``([], 0)`` when no
        feature improves on a score of 0.
    """
    selected = []
    best_performance = 0
    improved = True
    while improved:
        improved = False
        round_best = selected
        for feat in feature_list:
            if feat in selected:
                continue
            candidate = selected + [feat]
            score = pos_dep(f_list=candidate,
                            q_list=q_list,
                            data=data,
                            item_name=item_name)
            # At the start of a round ``best_performance`` already equals
            # the score of ``selected``, so a separate comparison against
            # the current selection's score would be redundant.
            if score > best_performance:
                round_best = candidate
                best_performance = score
                improved = True
        selected = round_best
    return selected, best_performance
寫的還是很冗長請見諒
我還會再多多練習