-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAprioriPruningOptimized.py
264 lines (200 loc) · 8.02 KB
/
AprioriPruningOptimized.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
import time
class AprioriPruningOptimized:
"""
Computation class for calculating Apriori with a more optimal candidate selection
"""
def __init__(self):
pass
@staticmethod
def _gettransobjects(line: str):
"""
Processes text line.
Extracts transactions objects and size.
:param line: Line of input text file.
:return: tuple (set of trans objects, transaction size)
"""
holder = line.split("\t")
transobjects = frozenset(holder[2].split(" "))
transize = int(holder[1])
return transobjects, transize
@staticmethod
def _generatecandidates(currentsize, candidates_list, candidates_set):
"""
Joins frequent sets of current size with each other only keeping
candidates of proper size.
:param currentsize: size (k) of candidates to generate
:param candidates_list: list of minimum support sets size currentsize - 1
:param candidates_set: set containing the same elements as candidates_list, used
for imporved lookup time.
:return:
"""
group_list = set()
i = 0
while i < len(candidates_list):
j = i + 1
set_1 = candidates_list[i]
while j < len(candidates_list):
set_2 = candidates_list[j]
holder = set_1.union(set_2)
add = False
if len(holder) == currentsize:
add = True
for x in holder:
"""
If any of the currentsize - 1 combinations of the
union set are not candidates then we skip
"""
if not holder.difference({x}) in candidates_set:
add = False
break
if add:
group_list.add(holder)
j += 1
i += 1
return group_list
def _performsupportcount(self, currentsize: int, transobjects: frozenset, candidates: list, iternum: float):
"""
Function performs the support count for a given transaction. It updates the running count table.
:param currentsize: Is the size of the current candidates
:param transobjects: A set that holds the transaction objects.
:param candidates: A list of the potential candidates
:param iternum: a number to iterate count by. equal to (1/number of transactions)
"""
if currentsize == 1:
self.keepTesting = True
for val in transobjects:
try:
self.countTable[currentsize][frozenset([val])] += iternum
except KeyError:
self.countTable[currentsize][frozenset([val])] = iternum
else:
for test_set in candidates:
if test_set.issubset(transobjects):
self.keepTesting = True
try:
self.countTable[currentsize][test_set] += iternum
except KeyError:
self.countTable[currentsize][test_set] = iternum
def apriori(self, filename: str, minimumsup: float = 50):
"""
Runs apriori algorithm over a txt file. Reads and processes transaction line by line.
Modfies class result table variable.
:param filename: filename/director for txt file.
:param minimumsup: the minimum support threshold.
:return:
"""
start = time.time()
self.filename = filename
self.numberFrequentItemSets = 0
"""
countTable holds current supports
dict of dict
key1 = size of rule, int
key2 = rule, frozenset
value = support, float
"""
self.countTable = {}
"""
result table hold all frequent sets and their supports
"""
self.resultTable = {}
self.minimumsup = minimumsup/100.
candidates = []
currentsize = 1
self.keepTesting = True
# While loop terminates when no more sets are frequent
while self.keepTesting:
file = open(self.filename, "r")
# Reading first line for number of transactions
self.trans_num = int(file.readline().rstrip())
iternum = 1./self.trans_num
self.countTable[currentsize] = {}
self.keepTesting = False
while True:
line = file.readline().rstrip()
# reached end of file
if not line:
break
transobjects, transize = self._gettransobjects(line)
if currentsize <= transize:
self._performsupportcount(currentsize=currentsize,
transobjects=transobjects,
candidates=candidates,
iternum=iternum)
file.close()
if not self.keepTesting:
end = time.time()
self.executiontime = end - start
continue
"""
This code iterates over countTable accomplishing 2 things:
1: Fills Candidate list
2: Constructs result Table
"""
candidates_list = []
candidates_set = set() # sets used for constant time look up
self.resultTable[currentsize] = {}
for key in self.countTable[currentsize]:
if self.countTable[currentsize][key] >= (self.minimumsup):
self.resultTable[currentsize][key] = self.countTable[currentsize][key]
candidates_list.append(key)
candidates_set.add(key)
self.numberFrequentItemSets += 1
currentsize += 1
candidates = self._generatecandidates(currentsize=currentsize,
candidates_list=candidates_list,
candidates_set=candidates_set)
def printterminal(self):
"""
Prints number of frequent item sets and the execution time to system out.
:return:
"""
print(f"|FPs| = {self.numberFrequentItemSets}")
print(f"Execution Time = {self.executiontime}s\n")
@staticmethod
def _frequentset_to_string(set_1):
"""
Takes in a set and formats it as a string for output.
:param set_1: Set to format
:return: Takes set and formats a string.
"""
if len(set_1) == 1:
for item in set_1:
return str(item)
else:
holder_str = ""
for item in set_1:
holder_str = holder_str + item + ", "
return holder_str[:-2]
def results_to_file(self, filename: str):
"""
Saves formatted results in filename
:return:
"""
keylist = self._getfrequentsetsorted()
file = open(filename, "w")
file.write(f"|FPs| = {self.numberFrequentItemSets}\n")
for size in self.resultTable:
keylist[size].sort()
for set_1 in keylist[size]:
set_1 = list(map(str, set_1))
file.write(f"{self._frequentset_to_string(set_1)} : {self.resultTable[size][frozenset(set_1)]}\n")
file.close()
def _getfrequentsetsorted(self):
"""
Used for output file formatting. Sorts all the frequent sets.
:return: returns dict that holds frequent sets as sorted lists
for file print out
"""
keylist = {}
for key in self.resultTable:
keylist[key] = []
for f_set in self.resultTable[key]:
holder = list(f_set)
try:
holder = list(map(int, holder))
except ValueError:
pass
holder.sort()
keylist[key].append(holder)
return keylist