-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmine.py
233 lines (200 loc) · 10.6 KB
/
mine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
from validations import parseArguments
from diabetes import prepareDataDiabetes, describePatternsDiabetes
from posts import prepareDataPosts, describePatternsPosts
from anonymized1 import prepareDataAnonymizedSet1
from anonymized2 import prepareDataAnonymizedSet2
from prefixspan import PrefixSpan
import pandas as pd
import numpy as np
from time import time
from datetime import date, datetime
def minePatterns(sequences, threshold, minlen, ifclosed):
    """Run PrefixSpan over *sequences* and return the frequent patterns.

    sequences -- the sequences handed to the PrefixSpan constructor
    threshold -- first argument of ``PrefixSpan.frequent``
                 (presumably the minimum support count; see prefixspan docs)
    minlen    -- stored on the miner's ``minlen`` attribute before mining
    ifclosed  -- forwarded to ``frequent`` as ``closed=``

    The result of ``PrefixSpan.frequent`` is returned unchanged; the rest of
    this file reads each entry as ``(frequency, pattern)``.
    """
    miner = PrefixSpan(sequences)
    miner.minlen = minlen
    return miner.frequent(threshold, closed=ifclosed)
def groupDataFrameByDate(df):
    """Group the rows of *df* by the calendar date of their ``date_time``.

    A ``date`` column holding ``date_time.date()`` for every row is written
    onto *df* itself (the caller's frame is modified), and the frame is then
    grouped on that column.

    Returns the pandas GroupBy object, one group per distinct date.
    """
    # Series.map keeps working on an empty frame, where a row-wise
    # DataFrame.apply hands back a DataFrame that cannot be stored in a
    # single column. It also avoids building a Series object for every row.
    df["date"] = df["date_time"].map(lambda stamp: stamp.date())
    return df.groupby("date")
def getStatesSubsequences(direction, states):
    """Collect monotonic runs of state measurements, one day at a time.

    direction -- "up" keeps runs whose value never decreases, "down" keeps
                 runs whose value never increases; with any other value every
                 run is cut after a single measurement, so nothing is returned
    states    -- DataFrame with ``date_time`` and ``value`` columns; it is
                 grouped through groupDataFrameByDate, which also writes a
                 ``date`` column onto it

    Returns a list of runs. Each run is a list of at least two row Series
    (copies), in the order the rows appear within their day. A run never
    spans two days.
    """
    subsequences = []
    for _, group in groupDataFrameByDate(states):
        group = group.reset_index(drop=True)
        run = []
        previous = None  # previous measurement of the SAME day, None at day start
        for _, measurement in group.iterrows():
            # The neighbour must come from this day's group: indexing the full
            # ``states`` frame with a within-group position compares against
            # unrelated rows for every day after the first.
            extends_run = (
                previous is None
                or (direction == "up" and previous["value"] <= measurement["value"])
                or (direction == "down" and previous["value"] >= measurement["value"])
            )
            if not extends_run:
                if len(run) > 1:
                    subsequences.append(run)
                run = []
            run.append(measurement.copy())
            previous = measurement
        # flush the run that was still open when the day ended
        if len(run) > 1:
            subsequences.append(run)
    return subsequences
def getEventsSubsequences(stateSubsequences, events):
    """Pick the events that fall inside each state run's time window.

    stateSubsequences -- list of runs; the first and last item of each run
                         are read for their ``date_time`` and ``value``
    events            -- DataFrame with ``date_time`` and ``code`` columns

    For every run the events with ``date_time`` between the run's first and
    last timestamp (both inclusive) are copied out. Runs with no events in
    the window, or whose last value equals the first, are skipped. Each kept
    copy gets a ``difference`` column (last value minus first value).

    Returns ``(frames, codes)``: the kept event frames and, in the same
    order, their ``code`` columns as plain lists.
    """
    frames = []
    codes = []
    for run in stateSubsequences:
        first, last = run[0], run[-1]
        delta = last["value"] - first["value"]
        inWindow = (events.date_time >= first["date_time"]) & (events.date_time <= last["date_time"])
        window = events[inWindow].copy()
        # only windows that contain events and show an actual change are kept
        if len(window) != 0 and delta != 0:
            window["difference"] = delta
            frames.append(window)
            codes.append(window["code"].tolist())
    return frames, codes
def getElementIndex(element, list):
    """Return the position of the first *element* in *list*, or -1 if absent."""
    try:
        position = list.index(element)
    except ValueError:
        # .index signals "not found" with ValueError; callers expect -1
        position = -1
    return position
def checkIfPatternElementsInSequence(pattern, sequence):
    """Return True when every element of *pattern* occurs in *sequence*.

    Order and multiplicity are ignored; an empty pattern yields True.
    """
    # generator + ``in``: stops at the first missing element and builds no
    # intermediate lists
    return all(element in sequence for element in pattern)
def checkIfPatternElementsInSequenceInOrder(pattern, sequence):
    """Return True when *pattern* is a subsequence of *sequence*.

    The elements of *pattern* must all occur in *sequence* in the same
    relative order (gaps allowed); a repeated pattern element needs a
    separate occurrence. An empty pattern yields True. *sequence* is not
    modified.
    """
    # ``element in remaining`` consumes the iterator up to and including the
    # first hit, so each following element is searched only in what comes
    # after the previous match. This is one pass, with no slicing copies.
    remaining = iter(sequence)
    return all(element in remaining for element in pattern)
def _safeRatio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def addMeasuresToPatterns(patterns, eventsSubsequences, events, direction):
    """Extend every mined pattern with score, support and confidence.

    patterns           -- iterable of ``(frequency, pattern)`` tuples
    eventsSubsequences -- event frames with ``code``, ``date_time`` and
                          ``difference`` columns (one per state-change window)
    events             -- DataFrame of all events; grouped per day through
                          groupDataFrameByDate (which adds a ``date`` column)
    direction          -- when "down", the summed score is negated

    For each pattern:
      score          sum of ``difference`` (taken from the first row) over the
                     windows that contain the pattern in order
      support        frequency / number of days
      confidence     support / (occurrences over all days / number of days)
      supportReal    distinct dates of matching windows / number of days
      confidenceReal supportReal / (days containing the pattern / number of days)

    A ratio whose denominator is zero is reported as 0.0 instead of raising
    ZeroDivisionError.

    Returns a list of tuples
    ``(frequency, pattern, score, support, confidence, supportReal, confidenceReal)``.
    """
    events_by_date = groupDataFrameByDate(events)
    allSequencesCount = len(events_by_date)

    # The code lists do not depend on the pattern: build them once rather
    # than on every check of every pattern.
    dailyCodes = [group["code"].tolist() for _, group in events_by_date]
    changeCodes = [subsequence["code"].tolist() for subsequence in eventsSubsequences]

    patternsWithMeasures = []
    for pattern in patterns:
        items = pattern[1]
        score = 0.0
        occurInAll = 0
        supportingSequencesCount = 0
        occurInChangeRealSet = set()

        # state-changing windows: accumulate the score and the dates involved
        # (the in-order check alone suffices; it implies every item is present)
        for subsequence, codes in zip(eventsSubsequences, changeCodes):
            if checkIfPatternElementsInSequenceInOrder(items, codes):
                firstRow = subsequence.iloc[0]
                score = score + firstRow["difference"]
                occurInChangeRealSet.add(firstRow["date_time"].date())

        # whole days: count supporting days and repeated occurrences
        for codes in dailyCodes:
            remaining = codes
            if checkIfPatternElementsInSequenceInOrder(items, remaining):
                supportingSequencesCount = supportingSequencesCount + 1
            while checkIfPatternElementsInSequenceInOrder(items, remaining):
                occurInAll = occurInAll + 1
                # NOTE(review): the remainder starts after the FIRST occurrence
                # of the pattern's last item, which may lie before the end of
                # the match (pattern [a, b] in [b, a, b] is counted twice).
                # Kept as in the original because changing it alters the
                # reported confidence; confirm the intended counting rule.
                remaining = remaining[remaining.index(items[-1]) + 1:]
                if len(remaining) < len(items):
                    break

        occurInChange = pattern[0]
        support = _safeRatio(occurInChange, allSequencesCount)
        supportAll = _safeRatio(occurInAll, allSequencesCount)
        confidence = _safeRatio(support, supportAll)
        supportReal = _safeRatio(len(occurInChangeRealSet), allSequencesCount)
        supportAllReal = _safeRatio(supportingSequencesCount, allSequencesCount)
        confidenceReal = _safeRatio(supportReal, supportAllReal)

        if direction == "down":
            score = -score
        patternsWithMeasures.append(
            pattern + (score, support, confidence, supportReal, confidenceReal)
        )
    return patternsWithMeasures
def dataMining(events, states, direction, threshold, minlen, bide):
    """Run the mining pipeline for one direction and return measured patterns.

    Steps, each followed by a timestamped progress line on stdout:
      1. getStatesSubsequences  -- monotonic state runs for *direction*
      2. getEventsSubsequences  -- event frames and code lists per run
      3. minePatterns           -- frequent patterns over the code lists
                                   (*bide* is passed as the closed flag)
      4. addMeasuresToPatterns  -- score / support / confidence per pattern

    Returns the list produced by addMeasuresToPatterns: tuples of
    (occurrences, pattern, score, support, confidence, supportReal,
    confidenceReal).
    """
    def report(label):
        print(label, datetime.now())

    stateRuns = getStatesSubsequences(direction, states)
    report("States subsequences done\t")

    # eventFrames: one DataFrame per run; eventCodes: only their event codes
    eventFrames, eventCodes = getEventsSubsequences(stateRuns, events)
    report("Event subsequences done\t\t")

    mined = minePatterns(eventCodes, threshold, minlen, bide)
    report("Patterns done\t\t\t")

    measured = addMeasuresToPatterns(mined, eventFrames, events, direction)
    report("Measures done\t\t\t")
    return measured
def getOppositeDirection(direction):
    """Return "down" for "up" and "up" for "down".

    Raises ValueError for any other value, naming the offending argument.
    """
    if direction == "up":
        return "down"
    if direction == "down":
        return "up"
    raise ValueError('direction must be "up" or "down", got {!r}'.format(direction))
def patternsToCSV(patterns, areDescribed, filename = "patterns.csv"):
    """Write the patterns DataFrame to *filename* as CSV.

    patterns     -- DataFrame of results; column headers are assigned by
                    position from the fixed list below
    areDescribed -- when true, an eighth header "patternDescribed" is added
    filename     -- target path, "patterns.csv" by default

    An empty frame writes nothing and prints "No patterns found." instead.
    """
    if patterns.empty:
        print("No patterns found.")
        return

    columnNames = ["allOccurences", "pattern", "score", "support'", "confidence'", "support", "confidence"]
    if areDescribed:
        columnNames.append("patternDescribed")
    patterns.to_csv(filename, index = False, header = columnNames)
def updatePatternsByOppositeResults(patterns, patternsOpposite):
    """Normalise each pattern's score, netting out the opposite direction.

    patterns         -- measured patterns; index 0 is the occurrence count,
                        index 1 the pattern, index 2 the score
    patternsOpposite -- measured patterns mined for the opposite direction

    When the same pattern (index 1 equal) exists in *patternsOpposite*, the
    first such entry is used: the counts are added and the score becomes
    (own score - opposite score) / combined count. Otherwise the score is
    divided by the pattern's own count. All remaining fields are kept.

    Returns a new list of tuples; the inputs are left untouched.
    """
    updated = []
    for entry in patterns:
        occurrences, items, score, *others = entry
        counterpart = next(
            (candidate for candidate in patternsOpposite if items == candidate[1]),
            None,
        )
        if counterpart is None:
            updated.append((occurrences, items, score / occurrences, *others))
        else:
            combined = occurrences + counterpart[0]
            updated.append((combined, items, (score - counterpart[2]) / combined, *others))
    return updated
def describePatterns(file, df):
    """Add a human-readable ``patternDescribed`` column where one is available.

    file -- data set name; only 'diabetes' and 'posts' have a describer
    df   -- DataFrame of patterns whose column ``1`` holds the pattern itself

    Returns ``(areDescribed, df)``. ``areDescribed`` is True only when the
    column was added (the frame is modified in place in that case).
    """
    # With no patterns the frame has no column ``1``; looking it up would
    # raise KeyError before the caller can report that nothing was found.
    if df.empty:
        return False, df

    if file == 'diabetes':
        describer = describePatternsDiabetes
    elif file == 'posts':
        describer = describePatternsPosts
    else:
        return False, df

    df["patternDescribed"] = describer(df[1].tolist())
    return True, df
def main(file, direction, threshold, minlen, user, bide):
    """Mine patterns for one data set and write them to ``patterns.csv``.

    file      -- data set name: 'diabetes', 'posts', 'anonymized1' or
                 'anonymized2'
    direction -- "up" or "down"; the opposite direction is mined as well and
                 used to net out the scores
    threshold, minlen, bide -- forwarded to the mining step
    user      -- forwarded to the data preparation of every data set except
                 'posts'

    Progress lines with timestamps are printed after each stage.

    Raises ValueError when *file* names no known data set.
    """
    print("Start\t\t\t\t", datetime.now())

    # prepare raw data in expected format
    # states has date_time, value
    # events has date_time, code
    if file == 'diabetes':
        events, states = prepareDataDiabetes(user)
    elif file == 'posts':
        events, states = prepareDataPosts()
    elif file == 'anonymized1':
        events, states = prepareDataAnonymizedSet1(user)
    elif file == 'anonymized2':
        events, states = prepareDataAnonymizedSet2(user)
    else:
        # fail at the dispatch rather than later with an unbound-variable
        # error that does not mention the data set name
        raise ValueError("unknown data set: {!r}".format(file))
    print("Preparing done\t\t\t", datetime.now())

    patterns = dataMining(events, states, direction, threshold, minlen, bide)
    print("Pattern mining done\t\t", datetime.now())

    patternsOpposite = dataMining(events, states, getOppositeDirection(direction), threshold, minlen, bide)
    print("Opposite pattern mining done\t", datetime.now())

    patternsUpdatedScore = updatePatternsByOppositeResults(patterns, patternsOpposite)
    print("Updates pattens done\t\t", datetime.now())

    df = pd.DataFrame(patternsUpdatedScore)
    areDescribed, df = describePatterns(file, df)
    patternsToCSV(df, areDescribed, "patterns.csv")
    print("Done\t\t\t\t", datetime.now())
# events_sequences = [group["code"].tolist() for _, group in groupDataFrameByDate(events)]
# patterns = minePatterns(events_sequences, threshold, minlen, bide)
# print(patterns)
# print("Done\t\t\t\t", datetime.now())
if __name__ == "__main__":
    # script entry point: parse the command line and hand every option to main
    args = parseArguments()
    main(
        file=args.file,
        direction=args.direction,
        threshold=args.threshold,
        minlen=args.minlen,
        user=args.user,
        bide=args.bide,
    )