# Regression tree.
# The input is a DataFrame of features; the corresponding y values (called
# labels here) are the scores for each document.
import pandas as pd
import numpy as np
from multiprocessing import Pool
from itertools import repeat
import scipy.optimize

node_id = 0  # global counter used to give each leaf a unique index
def get_splitting_points(args):
    # Given one feature column (as a list) and its column number, return the
    # candidate split values: the midpoints between each pair of adjacent
    # distinct values in the sorted column.
    attribute, col = args
    attribute.sort()
    possible_split = []
    for i in range(len(attribute) - 1):
        if attribute[i] != attribute[i + 1]:
            possible_split.append(np.mean((attribute[i], attribute[i + 1])))
    return possible_split, col
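# Illustrative example (made-up values): for the column [3, 1, 2, 2] at
# column number 0, the sorted values are [1, 2, 2, 3], so
# get_splitting_points(([3, 1, 2, 2], 0)) returns ([1.5, 2.5], 0).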
# In fit() the per-column results are collected into a dictionary whose key
# is the attribute (column) number and whose value is the whole list of
# possible splits for that column.

def find_best_split_parallel(args):
    # Evaluate every candidate split value for one feature and return the
    # best one, measured by the weighted squared error of the two children.
    best_ls = float('inf')
    best_split = None
    best_children = None
    split_point, data, label = args
    key, possible_split = split_point
    for split in possible_split:
        children = split_children(data, label, key, split)
        # weighted average of the left and right squared errors
        ls = (len(children[1]) * least_square(children[1])
              + len(children[3]) * least_square(children[3])) / len(label)
        if ls < best_ls:
            best_ls = ls
            best_split = (key, split)
            best_children = children
    return best_ls, best_split, best_children
def find_best_split(data, label, split_points):
    # split_points is a dictionary of candidate split values per column.
    # Search all columns in parallel and return the best (attribute, value)
    # pair together with the resulting children. Note that this exhaustive
    # search is an alternative to the fminbound-based search in create_tree
    # below, which does not call it.
    best_ls = float('inf')
    best_split = None
    best_children = None
    pool = Pool()
    for ls, split, children in pool.map(find_best_split_parallel,
                                        zip(split_points.items(), repeat(data), repeat(label))):
        if ls < best_ls:
            best_ls = ls
            best_split = split
            best_children = children
    pool.close()
    return best_split, best_children  # a tuple (attribute, value) and the children
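# Sketch of the expected input (made-up values): split_points maps each
# column number to that column's candidate splits, e.g.
#     split_points = {0: [1.5, 2.5], 1: [0.35, 0.7]}
# find_best_split then scores every (column, value) pair in parallel and
# keeps the pair with the lowest weighted squared error.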
def split_children(data, label, key, split):
    # Partition the rows (and their labels) by comparing column `key`
    # against the split value.
    column = data.iloc[:, key]
    left_index = [i for i in range(len(column)) if column.iloc[i] < split]
    right_index = [i for i in range(len(column)) if column.iloc[i] >= split]
    left_data = data.iloc[left_index, :]
    right_data = data.iloc[right_index, :]
    left_label = [label[i] for i in left_index]
    right_label = [label[i] for i in right_index]
    return left_data, left_label, right_data, right_label
def least_square(label):
    # Squared-error impurity of a node: the mean squared deviation of the
    # labels from their mean. Lower is better; a constant node scores 0.
    if not len(label):
        return 0
    label = np.asarray(label, dtype=float)
    return np.mean((label - label.mean()) ** 2)
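# Worked example (made-up labels): for label = [1.0, 3.0] the mean is 2.0
# and both squared deviations are 1.0, so least_square returns 1.0; for a
# constant node such as [2.0, 2.0] it returns 0.0.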
def create_leaf(label):
    # A leaf predicts the (rounded) mean of the labels that reach it.
    global node_id
    node_id += 1
    leaf = {'splitting_feature': None,
            'left': None,
            'right': None,
            'is_leaf': True,
            'index': node_id}
    leaf['value'] = round(np.mean(label), 3)
    return leaf
def find_splits_parallel(args):
    # For one feature column, let fminbound search for the split value that
    # minimizes error_function. Return fminbound's full output plus the
    # column number, so the caller knows which feature the result belongs to.
    var_space, label, col = args
    xopt, fval, ierr, numfunc = scipy.optimize.fminbound(
        error_function, min(var_space), max(var_space),
        args=(col, var_space, label), full_output=1)
    return xopt, fval, ierr, numfunc, col
def create_tree(data, all_pos_split, label, max_depth, ideal_ls, current_depth=0):
    remaining_features = all_pos_split
    # Stopping condition: no remaining feature offers any possible split,
    # so make the current node a leaf.
    if sum([len(v) != 0 for v in remaining_features.values()]) == 0:
        return create_leaf(label)
    # Additional stopping condition: limit the tree depth.
    elif current_depth > max_depth:
        return create_leaf(label)
    # For each feature (in parallel), let fminbound find that feature's best
    # split value, then keep the feature with the lowest overall error.
    min_error = None
    split_var = None
    min_split = None
    var_spaces = [data.iloc[:, col].tolist() for col in range(data.shape[1])]
    cols = list(range(data.shape[1]))
    pool = Pool()
    for split, error, ierr, numf, col in pool.map(find_splits_parallel,
                                                  zip(var_spaces, repeat(label), cols)):
        if min_error is None or error < min_error:
            min_error = error
            split_var = col
            min_split = split
    pool.close()
    splitting_feature = (split_var, min_split)
    children = split_children(data, label, split_var, min_split)
    left_data, left_label, right_data, right_label = children
    if len(left_label) == 0 or len(right_label) == 0:
        return create_leaf(label)
    # If a child's squared error is already below ideal_ls, that child is
    # close enough to constant: make it a leaf. Otherwise recurse on it.
    if least_square(left_label) < ideal_ls:
        left_tree = create_leaf(left_label)
    else:
        left_tree = create_tree(left_data, remaining_features, left_label, max_depth, ideal_ls, current_depth + 1)
    if least_square(right_label) < ideal_ls:
        right_tree = create_leaf(right_label)
    else:
        right_tree = create_tree(right_data, remaining_features, right_label, max_depth, ideal_ls, current_depth + 1)
    return {'is_leaf': False,
            'value': None,
            'splitting_feature': splitting_feature,
            'left': left_tree,
            'right': right_tree,
            'index': None}
def error_function(split_point, split_var, data, label):
    # Objective minimized by fminbound: the total squared error of the two
    # label groups produced by splitting the column `data` at split_point.
    # (split_var is passed through by fminbound's args and is unused here.)
    left_labels = []
    right_labels = []
    for i in range(len(data)):
        if data[i] <= split_point:
            left_labels.append(label[i])
        else:
            right_labels.append(label[i])
    return least_square(left_labels) + least_square(right_labels)
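# Worked example (made-up values): error_function(2.0, 0, [1.0, 3.0],
# [10.0, 20.0]) puts label 10.0 left and 20.0 right; each group is a single
# constant label, so both impurities are 0.0 and the total error is 0.0.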
def make_prediction(tree, x, annotate=False):
    if tree['is_leaf']:
        if annotate:
            print("At leaf, predicting %s" % tree['value'])
        return tree['value']
    else:
        # the value of x at this node's splitting feature
        split_feature_value = x[tree['splitting_feature'][0]]
        if annotate:
            print("Split on %s = %s" % (tree['splitting_feature'], split_feature_value))
        if split_feature_value < tree['splitting_feature'][1]:
            return make_prediction(tree['left'], x, annotate)
        else:
            return make_prediction(tree['right'], x, annotate)
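# Minimal sketch (hand-built tree, illustrative only): one split on
# feature 0 at value 2.0 with two constant leaves.
#   leaf_l = {'is_leaf': True, 'value': 1.0, 'splitting_feature': None,
#             'left': None, 'right': None, 'index': 1}
#   leaf_r = {'is_leaf': True, 'value': 5.0, 'splitting_feature': None,
#             'left': None, 'right': None, 'index': 2}
#   stump = {'is_leaf': False, 'value': None, 'splitting_feature': (0, 2.0),
#            'left': leaf_l, 'right': leaf_r, 'index': None}
#   make_prediction(stump, [1.5])  # -> 1.0, since 1.5 < 2.0 goes left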
class RegressionTree:
    def __init__(self, training_data, labels, max_depth=5, ideal_ls=100):
        self.training_data = training_data
        self.labels = labels
        self.max_depth = max_depth
        self.ideal_ls = ideal_ls
        self.tree = None

    def fit(self):
        global node_id
        node_id = 0
        # Collect the candidate splits for every column in parallel:
        # all_pos_split maps column number -> list of possible split values.
        all_pos_split = {}
        pool = Pool()
        splitting_data = [self.training_data.iloc[:, col].tolist() for col in range(self.training_data.shape[1])]
        cols = list(range(self.training_data.shape[1]))
        for dat, col in pool.map(get_splitting_points, zip(splitting_data, cols)):
            all_pos_split[col] = dat
        pool.close()
        self.tree = create_tree(self.training_data, all_pos_split, self.labels, self.max_depth, self.ideal_ls)

    def predict(self, test):
        prediction = np.array([make_prediction(self.tree, x) for x in test])
        return prediction
if __name__ == '__main__':
    # Read in the data; column X7 holds the labels, the rest are features.
    data = pd.read_excel("mlr06.xls")
    # Two test rows; each should supply one value per remaining feature column.
    test = [[478, 184, 40, 74, 11, 31],
            [1000, 10000, 10000, 10000, 10000, 100000]]
    label = data['X7']
    del data['X7']
    model = RegressionTree(data, label)
    model.fit()
    print(model.predict(test))
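
    # Alternative usage sketch (synthetic data, so it runs without
    # mlr06.xls; uncomment to try):
    # rng = np.random.RandomState(0)
    # X = pd.DataFrame(rng.rand(100, 3))
    # y = list(X[0] * 10 + rng.rand(100))
    # m = RegressionTree(X, y, max_depth=3, ideal_ls=0.5)
    # m.fit()
    # print(m.predict([[0.1, 0.5, 0.5], [0.9, 0.5, 0.5]]))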