Saturday, February 16, 2013

Predicting missing feature values using the scikit-learn library in Python


This code is still a work in progress. The idea: given a dataset stored as a NumPy array of strings, where missing entries in a categorical column are empty strings, train a classifier on the rows where that column is filled in and use it to predict the values for the rows where it is blank.
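As a rough, hypothetical sketch of the kind of input the function below expects (the column values here are made up; any array of categorical strings with '' marking the missing entries would do):

import numpy as np

# Hypothetical sample: 6 rows, 3 categorical columns stored as strings.
# Column 2 has two missing entries, marked by empty strings.
sample_data = np.array([['male',   '3', 'S'],
                        ['female', '1', 'C'],
                        ['female', '2', ''],
                        ['male',   '3', 'S'],
                        ['male',   '1', ''],
                        ['female', '2', 'Q']])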

import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier


# Make sure you provide only the important features (after eliminating unwanted ones).
# training_data is a numpy array of strings; feature_num is the index of the column
# whose missing (empty-string) values you want to predict.
# (num_of_rows is not used below.)
def feature_predict(training_data, feature_num, num_of_rows, num_of_cols):
       
    # fatt[i] will hold the sorted unique values of the i-th remaining column.
    fatt = np.zeros((num_of_cols - 1,), dtype=object)
   
   
    # Rows where the target feature is present become the training set;
    # drop the target column itself from the feature matrix.
    known = training_data[:, feature_num] != ''
    train_new = np.concatenate((training_data[known, 0:feature_num],
                                training_data[known, feature_num + 1:]), 1)
    targets = training_data[known, feature_num]
    # Encode the target labels as integers 0..k-1, in sorted order.
    targets_new = np.zeros(len(targets), dtype=int)

    tar = sorted(set(targets))
    tell = {}
    for i in xrange(len(tar)):
        tell[tar[i]] = i

    for i in xrange(len(targets)):
        targets_new[i] = tell[targets[i]]
   
   
   
    # Collect the sorted unique values of every remaining column, over all rows
    # (including those whose target feature is missing), so the same encoding
    # can be applied to both the training and the prediction rows.
    all_features = np.concatenate((training_data[:, 0:feature_num],
                                   training_data[:, feature_num + 1:]), 1)
    for i in xrange(num_of_cols - 1):
        fatt[i] = sorted(set(all_features[:, i]))
       
    # Rows where the target feature is missing are the ones to predict.
    missing = training_data[:, feature_num] == ''
    valid = np.concatenate((training_data[missing, 0:feature_num],
                            training_data[missing, feature_num + 1:]), 1)
   
    # Integer-encoded versions of the training and prediction feature matrices.
    feature = np.zeros((len(train_new), num_of_cols - 1), dtype=int)
    valid_new = np.zeros((len(valid), num_of_cols - 1), dtype=int)
   
    # Replace every categorical value with its index in the sorted list of
    # unique values for that column (a simple ordinal encoding).
    for i in xrange(num_of_cols - 1):
        for j in xrange(len(train_new)):
            for k, element in enumerate(fatt[i]):
                if train_new[j, i] == element:
                    feature[j, i] = k
                    break

    for i in xrange(num_of_cols - 1):
        for j in xrange(len(valid)):
            for k, element in enumerate(fatt[i]):
                if valid[j, i] == element:
                    valid_new[j, i] = k
                    break
   
    # Train an extremely randomized trees classifier on the rows with known
    # labels and predict the encoded label for each row that was missing one.
    clf = ExtraTreesClassifier(n_estimators=10)
    clf.fit(feature, targets_new)
    res = clf.predict(valid_new)

    # Map the integer predictions back to the original label values.
    return np.array([tar[int(r)] for r in res])
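
A minimal usage sketch, assuming the hypothetical sample_data array defined above (6 rows, 3 columns, predicting the missing values in column 2):

predicted = feature_predict(sample_data, 2, 6, 3)
print(predicted)  # the predicted values for the two rows whose column 2 was blank

ExtraTreesClassifier is used here mainly because it handles small, ordinally encoded categorical data with little tuning; the unused KNeighborsClassifier import suggests a nearest-neighbour variant was also being considered.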