grahamTraines_Logistic_Regression.py
__author__ = 'graham'
# Graham Traines
# CSCI 7090 Machine Learning
# Logistic Regression with Gradient Descent
# Demonstration
import numpy as np
import matplotlib.pyplot as mpl
"""
GO TO THE BOTTOM OF THE SCRIPT TO FIND THE START
"""
# method:
# ---Load data
def load_data_set(file_location, delimiter, column_y, column_x1):
"""
:param file_location: string for data file location
:param delimiter: ',' for CSV, etc.
:param column_y: the column containing the target values
:param column_x1: input data column -- right now, I only take 1
TODO: add a parameter that is a set so I can take multiple input columns
:return: Numpy Array of target values, input values, and bias term (bias term always = 1.0)
"""
    # read everything as strings (header row plus mixed column types);
    # numeric conversion happens when the final array is built below
    data = np.genfromtxt(file_location, delimiter=delimiter, dtype=str)
# Need to get everything after the headers
X = data[1:, column_x1]
Y = data[1:, column_y]
# we make the cases 1 and -1 to fit with the Likelihood Formula
# P(y | x) -> h(x) for y = +1
# P(y | x) -> 1 - h(x) for y = -1
y_numeric = [1.0 if entry == 'Yes' else -1.0 for entry in Y]
# Will use this for x0
ones = [1.0 for x in X]
    return np.array(list(zip(y_numeric, X, ones)), dtype=float)
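# A minimal usage sketch (hypothetical file name and column numbers -- the real
# calls happen in learn_class below):
#   data = load_data_set("Credit_Data.csv", ",", 1, 3)
#   data[:, 0] -> targets in {+1.0, -1.0}
#   data[:, 1] -> the chosen input column
#   data[:, 2] -> the bias term (always 1.0)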
def sigmoid(weights, x):
"""
Sigmoid function as presented in the CalTech lectures
I make it overly verbose so I can see what's going on inside
:param weights: vector of value weights, including bias weight
:param x: vector of input values, including bias term
:return: a number between 0 and 1
"""
sigmoid_func = lambda s: np.exp(s) / (1.0 + np.exp(s))
T = weights.transpose().dot(x)
theta_s = sigmoid_func(T)
return theta_s
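# A hedged, vectorized sketch (not part of the original assignment): the same
# sigmoid applied to every input row at once instead of one row at a time.
# Assumes weights has shape (d,) and x_matrix has shape (N, d) with the bias
# term already included in each row.
def sigmoid_all_rows(weights, x_matrix):
    s = x_matrix.dot(weights)
    return np.exp(s) / (1.0 + np.exp(s))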
def print_sigmoids(weights, x_values):
"""
For debugging/testing, look directly at the values instead of plotting
:param weights: vector of value weights, including bias weight
    :param x_values: array of input rows, each including the bias term
:return: void
"""
for x in x_values:
        print(x, sigmoid(weights, x))
return
def plot_sigmoid(weights, x_values, x_label, iterations):
"""
Plots probability vs. our random variable
:param weights: Vector of weights including bias weight
:param x_values: Array of X values, including bias term
    :param x_label: Name of the data column we are learning
    :param iterations: Number of gradient descent iterations (shown in the plot title)
    :return:
"""
y_label = 'Probability of True'
y_values = []
for x in x_values:
y_values.append(sigmoid(weights, x))
x_space = x_values[:, 0]
y_space = np.array(y_values)
mpl.plot(x_space, y_space, 'ro')
mpl.title("Sigmoid Function for Probability of Default Given " + x_label + " After " + str(iterations) + " Iterations")
mpl.xlabel(x_label)
mpl.ylabel(y_label)
mpl.autoscale(tight=True)
mpl.grid()
mpl.show()
return
def plot_data_set(x_values, y_values, x_label):
"""
    :param x_values: array of x values, including bias term (for simplicity, the bias is dropped in here)
:param y_values: array of target values
:param x_label: name of the data we are considering
:return:
"""
y_label = 'Defaulted (yes/no)'
x_space = x_values[:, 0]
y_space = np.array(y_values)
mpl.plot(x_space, y_space, 'go')
mpl.title("Data for " + x_label + " Compared to Defaults")
mpl.xlabel(x_label)
mpl.ylabel(y_label)
mpl.autoscale(tight=True)
mpl.grid()
mpl.show()
return
#method:
#---Gradient descent
def gradient_descent(little_epsilon, eta, y_set, x_set, iterations):
"""
:param little_epsilon: acceptable error rate
:param eta: learning step size
:param y_set: set of target values
    :param x_set: set of input values, including bias term (1.0)
    :param iterations: number of gradient descent update steps to run
    :return:
"""
assert type(y_set) is np.ndarray, "y_set is not numpy array"
assert type(x_set) is np.ndarray, "x_set is not numpy array"
# t = number of iterations
t = 0
# initialize w(0) to 0
weights_t = np.array([0.0, 0.0])
# depending on the data set, we could continue training until
# the gradient is smaller than our acceptable epsilon
# otherwise, not using the little_epsilon parameter at the moment
#while abs(grEin.any()) >= little_epsilon:
# iterate to the next step until it is time to stop
for i in range(0, iterations):
# grEin = gradient of in-sample Error
grEin = get_gradient(y_set, weights_t, x_set)
# update the weights
# delta w = eta * grEin
delta = (1.0 * eta) * grEin
# w(t + 1) = w(t) - delta(w(t))
weights_t = weights_t - delta
# The following block is for debugging/testing purposes
# print delta
#print abs(grEin)
#print grEin
#print eta * grEin
#print weights_t
# update iteration count
t += 1
# return the final weights vector W (weights_t)
return weights_t
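# Hedged sketch (my addition): the epsilon-based stopping rule described in the
# comments above but not used by gradient_descent. It stops once the norm of the
# gradient drops below little_epsilon, or after max_iterations steps.
def gradient_descent_until_converged(little_epsilon, eta, y_set, x_set, max_iterations=10000):
    weights_t = np.zeros(x_set.shape[1])
    for t in range(max_iterations):
        grEin = get_gradient(y_set, weights_t, x_set)
        if np.linalg.norm(grEin) < little_epsilon:
            break
        weights_t = weights_t - eta * grEin
    return weights_t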
#method:
#---Get gradient at this t with w(t)
def get_gradient(y_set, weights_t, data_set):
"""
:param y_set: the array of target values
:param weights_t: array of weights with bias term
:param data_set: array of training values with bias term
:return:
"""
assert type(y_set) is np.ndarray, "y_set is not a numpy array"
assert type(weights_t) is np.ndarray, "weights_t is not a numpy array"
    assert type(data_set) is np.ndarray, "data_set is not a numpy array"
# number of training examples
N = len(data_set)
# get the sum of error in the whole set
summation_error = 0.0
for n in range(0, N):
Xn = np.array(data_set[n, :])
summation_error += get_partial_gradient_sum(y_set[n], weights_t, Xn)
# average out the error
    # gradient of in-sample error = -(1/N) * sum over n in N of [ yn*Xn / (1 + e^(yn * W(t).T * Xn)) ]
gradient = (-1.0/N) * summation_error
return gradient
# method:
# returns the value for n: yn * Xn / (1 + e^(yn * W(t).T * Xn))
def get_partial_gradient_sum(yn, Wt, Xn):
'''
This is the inner term of the summation from the CalTech lectures/text
I separate it out for readability/inspection/debugging
    It should really be a matrix operation; this per-row version is slow (see the vectorized sketch below)
:param yn: target value for data row in question
:param Wt: vector of Weights for this iteration
:param Xn: vector of inputs for data row
    :return: Computed value for the summation term yn * Xn / (1 + e^(yn * W(t).T * Xn))
'''
#assert yn is np.ndarray, "y is not a numpy array"
#assert Xn is np.ndarray, "Xn is not a numpy array"
#assert Wt is np.ndarray, "Wt is not a numpy array"
#print Wt
#print yn
# y[n]X[n]
top_term = yn * Xn
# y[n] * W[t]T * X[n]
exponent_term = yn * (Wt.transpose().dot(Xn))
# 1 + e^exponent
divisor = 1.0 + np.exp(exponent_term)
partial = top_term / divisor
return partial
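# Hedged sketch (my addition, prompted by the note in the docstring above): the
# same in-sample gradient as get_gradient, computed as one matrix operation
# instead of a Python loop. Assumes y_set has shape (N,), weights_t shape (d,),
# and data_set shape (N, d) with the bias term included.
def get_gradient_vectorized(y_set, weights_t, data_set):
    # yn * W(t).T * Xn for every row at once
    exponent_terms = y_set * data_set.dot(weights_t)
    # yn * Xn / (1 + e^(yn * W(t).T * Xn)) for every row
    partials = (y_set[:, np.newaxis] * data_set) / (1.0 + np.exp(exponent_terms))[:, np.newaxis]
    # average and negate, matching the -(1/N) factor in get_gradient
    return -partials.mean(axis=0)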
#method:
#---Execute the learning and show results
def learn_class(data_set_file_location, data_column, column_name, target_column, little_epsilon, eta, iterations):
"""
:param data_set_file_location: Location of data file to load
:param data_column: Integer for input data column in the data file
:param column_name: Name of the data we are considering (just a string we make up)
:param target_column: Integer for target data column in the data file
:param little_epsilon: Tolerable in-sample error (not currently being used)
    :param eta: Learning step size
    :param iterations: Number of gradient descent iterations
    :return:
"""
global dataset, classes
# Load rows from credit_data.csv
    # Column 1 = Target, Column 3 = Balance, Column 4 = Income
dataset = load_data_set(data_set_file_location, ",", target_column, data_column)
# get possible classes (should be -1 and 1)
# in case we want to check the data
# classes = np.unique(dataset[:, 0])
# for d in dataset:
# if d[0] == 1:
# print d
X = np.array(dataset[:, 1:])
    if data_column == 4:
        # scale only the income values; the bias column must stay at 1.0
        X[:, 0] = X[:, 0] / 1000.0
Y_set = np.array(dataset[:, 0])
plot_data_set(X, Y_set, column_name)
#learn the functions
weights = gradient_descent(little_epsilon, eta, Y_set, X, iterations)
#return weights, intercept
    print("Coefficient: ", weights[0])
    print("Intercept: ", weights[1])
#print_sigmoids(weights, sgd_classifier.intercept_, XT)
plot_sigmoid(weights, X, column_name, iterations)
return
'''
!!!!!!!!!!!!!!!!!!
!!! START HERE !!!
!!!!!!!!!!!!!!!!!!
'''
# little_epsilon: tolerable in-sample error
# (not currently being used, the error never seems to reach low enough)
little_epsilon = 0.1
# eta: learning rate
eta = 0.001
# learning iterations
iterations = 200
# target data column: 1
# input data column: 3
#learn: P(Default|Balance)
learn_class("/home/graham/Desktop/MachineLearning/Mod6/Credit_Data.csv", 3, 'Balance', 1, little_epsilon, eta, iterations)
# target data column: 1
# input data column: 4
#learn: P(Default|Income)
learn_class("/home/graham/Desktop/MachineLearning/Mod6/Credit_Data.csv", 4, 'Income (* 1000)', 1, little_epsilon, eta, iterations)
#demonstrate correct classification: P(D|B)
#P(Default|Balance)
#P(Default|Income)
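# Hedged sketch (my addition): one way to turn learned weights into class labels,
# using a 0.5 probability threshold on the sigmoid output. learn_class above does
# not return its weights, so using this would require capturing them first (for
# example, by returning `weights` from learn_class).
def classify(weights, x_rows, threshold=0.5):
    return np.array([1.0 if sigmoid(weights, x) >= threshold else -1.0 for x in x_rows])
# Hypothetical usage (assuming learn_class is changed to return its weights):
#   weights = learn_class(data_file, 3, 'Balance', 1, little_epsilon, eta, iterations)
#   predictions = classify(weights, dataset[:, 1:])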