-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerger.py
67 lines (57 loc) · 2.33 KB
/
merger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import math
from queue import Queue
import utils
class Merger:
    """
    Merge the partial dictionaries stored in one posting file into a single one.

    A posting file holds a sequence of dictionaries. Each dictionary maps a
    key to an entry that has at least a 'docs' value (concatenated on merge)
    and a 'df' value (summed on merge).
    """

    def __init__(self, path_to_files, file_type, docs_file, corpus_size=0):
        """
        Store the merge configuration and create the empty work queue.

        :param path_to_files: directory prefix used to build posting-file paths
        :param file_type: stored on the instance; not used by the methods
            shown in this class
        :param docs_file: stored on the instance; not used by the methods
            shown in this class
        :param corpus_size: stored on the instance; not used by the methods
            shown in this class
        """
        self.corpus_size = corpus_size
        self.queue = Queue()
        # The two dictionaries taken from the queue in the latest merge step.
        self.dict1 = None
        self.dict2 = None
        # Name of the posting file currently processed (set by merge()).
        self.files_name = None
        self.file_type = file_type
        self.path_to_files = path_to_files
        self.docs_file = docs_file

    def merge(self, group_id):
        """
        Merge every dictionary of the posting file `group_id` and save the result.

        All dictionaries of the file are loaded into the queue. While more
        than one is queued, the two oldest are merged and the result is put
        back at the end of the queue. The last remaining dictionary is saved
        over the same posting file.

        If the file holds exactly one dictionary, calculate_doc_weight is
        called for each of its keys before it is saved. If the file holds no
        dictionary at all, nothing is written.

        :param group_id: name of the posting file inside path_to_files
        """
        self.files_name = group_id
        self.collect_files()

        if self.queue.empty():
            # Nothing was loaded: Queue.get() would block forever here.
            return

        if self.queue.qsize() == 1:
            file_dict = self.queue.get()
            # NOTE(review): calculate_doc_weight is not defined in this class
            # as shown; presumably a subclass supplies it - confirm.
            for key in file_dict:
                self.calculate_doc_weight(file_dict, key)
        else:
            file_dict = self._reduce_queue()

        utils.save_obj(file_dict, self._posting_path())

    def collect_files(self):
        """
        Load every object of the current posting file into the queue.

        Reading stops at the first falsy object returned by utils.get_next
        (presumably its end-of-file marker - confirm against utils).
        """
        file_handle = utils.open_file(self._posting_path())
        try:
            obj = utils.get_next(file_handle)
            while obj:
                self.queue.put(obj)
                obj = utils.get_next(file_handle)
        finally:
            # Release the handle even when reading fails part-way through.
            utils.close_file(file_handle)

    def _posting_path(self):
        """Return the path of the posting file currently being processed."""
        return f"{self.path_to_files}\\{self.files_name}"

    def _reduce_queue(self):
        """
        Merge queued dictionaries pairwise and return the last one left.

        The queue must not be empty. On return the queue is empty, so the
        instance can be reused for another posting file without carrying
        this result over.
        """
        while self.queue.qsize() > 1:
            self.dict1 = self.queue.get()
            self.dict2 = self.queue.get()
            self.queue.put(self._merge_pair(self.dict1, self.dict2))
        return self.queue.get()

    @staticmethod
    def _merge_pair(first, second):
        """
        Return the union of two dictionaries, combining entries of shared keys.

        For a key found in both, the entry of `second` is kept and updated in
        place: its 'docs' become second's docs followed by first's docs, and
        its 'df' becomes the sum of both 'df' values.
        """
        merged = {**first, **second}
        for key in first.keys() & second.keys():
            entry = merged[key]  # same object as second[key]
            entry['docs'] = entry['docs'] + first[key]['docs']
            entry['df'] = entry['df'] + first[key]['df']
        return merged