Saturday, February 16, 2013
Evaluating missing features using the scikit-learn library in Python
This code is still a work in progress.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
# Make sure you provide only the important features (after eliminating unwanted ones).
def feature_predict(training_data, feature_num, num_of_rows, num_of_cols):
    """Predict the missing values of one categorical column from the others.

    Rows whose entry in column ``feature_num`` is the empty string ``''`` are
    treated as "missing". An ``ExtraTreesClassifier`` is trained on the rows
    where that column is present and then used to predict it for the rows
    where it is missing.

    Every column is label-encoded before training: the distinct values of a
    column are sorted and each value is replaced by its position in that
    sorted list. The predictor encodings are built from *all* rows, so an
    empty string in a predictor column is simply one more category.

    Args:
        training_data: 2-D NumPy array holding the data, with ``''`` marking
            a missing entry.
        feature_num: Index of the column whose missing entries are predicted.
        num_of_rows: Unused; kept so existing callers keep working.
        num_of_cols: Total number of columns in ``training_data``, including
            column ``feature_num``.

    Returns:
        A 1-D integer array with one entry per row that is missing column
        ``feature_num``, in row order. Each entry is an encoded class index,
        not the original label: index ``k`` stands for the ``k``-th smallest
        distinct known value of that column. The array is empty when no row
        is missing the column.

    Raises:
        ValueError: If some rows are missing the column but no row has a
            known value to learn from.
    """
    n_features = num_of_cols - 1
    target_column = training_data[:, feature_num]
    known = target_column != ''
    missing = target_column == ''

    # Drop the target column once, then split the rows by whether the target
    # is known, instead of re-slicing the full array for every use.
    predictors = np.concatenate(
        (training_data[:, :feature_num], training_data[:, feature_num + 1:]), 1)
    train_rows = predictors[known]
    valid_rows = predictors[missing]

    # Nothing to predict: return early rather than handing the classifier an
    # empty matrix.
    if len(valid_rows) == 0:
        return np.zeros(0, dtype=int)
    if len(train_rows) == 0:
        raise ValueError(
            "no rows with a known value in column %r to train on" % (feature_num,))

    # Encode the known target labels as their rank among the sorted labels.
    targets = target_column[known]
    label_codes = {label: code
                   for code, label in enumerate(sorted(set(targets)))}
    targets_new = np.array([label_codes[label] for label in targets], dtype=int)

    # One value -> code table per predictor column, built from every row so
    # that training rows and rows to predict share the same encoding.
    codebooks = [
        {value: code
         for code, value in enumerate(sorted(set(predictors[:, col])))}
        for col in range(n_features)
    ]

    clf = ExtraTreesClassifier(n_estimators=10)
    clf.fit(_encode_rows(train_rows, codebooks), targets_new)
    return clf.predict(_encode_rows(valid_rows, codebooks))


def _encode_rows(rows, codebooks):
    """Return ``rows`` as an int matrix, mapping column i through codebooks[i]."""
    encoded = np.zeros((len(rows), len(codebooks)), dtype=int)
    for col, codes in enumerate(codebooks):
        for row in range(len(rows)):
            # Fall back to 0 for a value absent from the table, matching the
            # zero-initialised default of the encoded matrix.
            encoded[row, col] = codes.get(rows[row, col], 0)
    return encoded
Subscribe to:
Posts (Atom)