-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimage_load_orchastrator.py
161 lines (131 loc) · 6.63 KB
/
image_load_orchastrator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import asyncio
from image_worker import ImageWorker
from os import path, listdir, mkdir
import time
from random import randrange
from typing import Dict, List
# Default directory to look for images in. It is not referenced by the code visible in this
# file -- presumably callers use it as the fallback working_dir; verify against the caller.
default_working_dir = "./images/"
# File extensions (without the leading dot) that ImageLoadOrchastrator.run() accepts; files whose
# text after the last "." is not in this list are skipped.
file_types = ["jpeg", "png", "jpg"]
class ImageLoadOrchastrator:
    """Singleton that hashes every image in a directory, groups look-alikes and moves them.

    Instances must be obtained through get_instance(); calling the constructor directly raises.
    The per-image work (hashing, comparing, moving, persisting) is delegated to ImageWorker.
    """

    # The single shared instance; populated on the first get_instance() call
    __instance = None

    @classmethod
    def get_instance(cls, working_dir, db_path, verbose, precision, reduced_size_factor) -> 'ImageLoadOrchastrator':
        """Return the shared orchestrator, creating and configuring it on the first call.

        The arguments are only applied when the instance is first created; later calls return
        the existing instance unchanged and ignore the values passed in.

        Args:
            working_dir: The directory containing the images to process.
            db_path: The path to the database files (SQLite3).
            verbose: Whether or not to be verbose.
            precision: How many differing characters are allowed in the hash when determining likeness.
            reduced_size_factor: The factor to reduce the image size by.

        Returns:
            The one ImageLoadOrchastrator instance.
        """
        if cls.__instance is None:
            # __new__ is used directly because __init__ deliberately raises
            instance = cls.__new__(cls)
            instance.working_dir = working_dir
            instance.db_path = db_path
            instance.verbose = verbose
            instance.precision = precision
            instance.reduced_size_factor = reduced_size_factor
            cls.__instance = instance
        return cls.__instance

    # Only allow creation through get_instance method
    def __init__(self):
        """Always raise; instances are created by get_instance().

        Raises:
            RuntimeError: On every call.
        """
        raise RuntimeError("Call get_instance() instead")

    @staticmethod
    def _format_duration(elapsed_seconds: float) -> str:
        """Format a duration in seconds as H:MM:SS.ss with zero-padded minutes and seconds."""
        hours, remainder = divmod(elapsed_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        # Width 5 for the seconds covers two integer digits, the dot and two decimals
        return f"{hours:.0f}:{minutes:02.0f}:{seconds:05.2f}"

    # Run the comparison routine
    async def run(self, comparison_method: str, avoid_db: bool) -> None:
        """Hash all supported images in the working dir, group similar ones and move each group.

        Args:
            comparison_method: Passed through to ImageWorker.construct() for every image.
            avoid_db: Passed to each ImageWorker; when true the results are also not saved
                to the database at the end of the run.

        Raises:
            Exception: If the configured working directory does not exist.
        """
        # Make sure the provided path to images exists
        if not path.isdir(self.working_dir):
            raise Exception("Working dir does not exist")
        # Get all regular files (not sub-directories) from the working dir provided
        files = [file for file in listdir(self.working_dir)
                 if path.isfile(path.join(self.working_dir, file))]
        # Initialize the start time (for stats purposes) and list of tasks
        tasks = []
        start = time.time()
        print(f"Starting... current time is {time.strftime('%H:%M:%S')}")
        # Create an async worker for each file (to get a hash of the file) and run
        for file in files:
            # Make sure the filetype is one of the allowed types; compare case-insensitively so
            # that files such as "IMG_0001.JPG" are not silently skipped
            if file.split(".")[-1].lower() not in file_types:
                continue
            # Create the ImageWorker for the image, start it, and append the task to our list of tasks
            worker = ImageWorker(self.working_dir, str(file), self.reduced_size_factor, avoid_db)
            tasks.append(asyncio.create_task(
                worker.construct(comparison_method, self.db_path, self.verbose)))
        # Wait until all workers are done and gather into a list of completed workers
        fulfilled_workers = await asyncio.gather(*tasks)
        # Get workers (trim out all exact matches) and find similar images
        workers = await self.get_workers(fulfilled_workers)
        # Get groupings of alike and exact matches
        groups = self.get_groupings(workers)
        # Move each image into its new folder for comparison
        self.move_groups(groups)
        # Finish by saving all images to the database if we're not avoiding it
        if not avoid_db:
            await self.save_image_data(workers)
        elapsed = time.time() - start
        print("Done, finished {file_len} files. Time is {time}, operation took {duration}"
              .format(file_len=len(files), time=time.strftime("%H:%M:%S"),
                      duration=self._format_duration(elapsed)))

    # Group all alike and exact images together
    @staticmethod
    def get_groupings(workers: Dict[str, "ImageWorker"]) -> List[List["ImageWorker"]]:
        """Collect the groups of alike/exact images that should be moved together.

        Args:
            workers: The de-duplicated workers keyed by MD5, as returned by get_workers().

        Returns:
            One list of workers per group. A worker's ``alike`` values form a group when there
            is more than one of them, or when the single entry has at least one exact match.
        """
        # A set gives O(1) membership checks; only membership is ever needed
        found = set()
        groups = []
        # Loop through all workers (excluding those trimmed)
        for md5, worker in workers.items():
            # Skip if the MD5 was already placed in an earlier group
            if md5 in found:
                continue
            # Copy the alike workers
            current = list(worker.alike.values())
            # Remember all MD5s of this group so that files aren't moved multiple times
            found.update(worker.alike)
            # Only append to the return list of groups if there are results
            if len(current) > 1 or (len(current) == 1 and len(current[0].exact) > 0):
                groups.append(current)
        return groups

    # Trim down workers with the exact same MD5
    async def get_workers(self, fulfilled_workers) -> Dict[str, "ImageWorker"]:
        """De-duplicate workers by MD5 and let each unique worker find its look-alikes.

        Args:
            fulfilled_workers: The constructed workers, in the order they were gathered.

        Returns:
            A dict mapping each distinct MD5 to the first worker seen with it. Later workers
            with the same MD5 are registered on that first worker through add_exact().
        """
        workers = {}
        # Search through all workers
        for index, worker in enumerate(fulfilled_workers):
            if worker.md5 in workers:
                # Same MD5 already seen: record this one as an exact match of the first
                workers[worker.md5].add_exact(worker)
                continue
            # First worker with this MD5: keep it and compare it against the rest of the list
            workers[worker.md5] = worker
            worker.check_alike(fulfilled_workers[index + 1:], self.precision)
        return workers

    # Loop through and move groups of images into new folders
    def move_groups(self, groups: List[List["ImageWorker"]]) -> None:
        """Create one new sub-directory of the working dir per group and move the group into it.

        Args:
            groups: The groups of workers as returned by get_groupings().
        """
        # Loop through and move each group
        for group in groups:
            # Name the folder after the hex form of a random number in [0, 2^50)
            random_suffix = hex(randrange(0, 2**50))
            # Get the new path
            new_path = path.join(self.working_dir, random_suffix)
            # Create the new directory
            mkdir(new_path)
            # Move all images of the group into the new directory
            for image in group:
                image.move(new_path)

    # Save all images to the database
    @staticmethod
    async def save_image_data(workers: Dict[str, "ImageWorker"]) -> None:
        """Run save_image_data() concurrently on every worker and wait for all of them.

        Args:
            workers: The workers keyed by MD5; only the values are used.
        """
        tasks = [asyncio.create_task(worker.save_image_data()) for worker in workers.values()]
        await asyncio.gather(*tasks)

    # Add an ignore similarity request
    def ignore_similarity(self, image_1_name: str, image_2_name: str) -> None:
        """Construct and save both images, then record that their similarity should be ignored.

        This is a blocking call that drives its own event loop via asyncio.run(), so it must be
        called from synchronous code (not from inside an already running event loop).

        Args:
            image_1_name: File name of the first image inside the working directory.
            image_2_name: File name of the second image inside the working directory.

        Raises:
            Exception: If the configured working directory does not exist.
        """
        if not path.isdir(self.working_dir):
            raise Exception("Working dir does not exist")
        image_1 = ImageWorker(self.working_dir, image_1_name, self.reduced_size_factor, False)
        image_2 = ImageWorker(self.working_dir, image_2_name, self.reduced_size_factor, False)

        async def record_ignore() -> None:
            # construct() and save_image_data() are coroutines (run() schedules them with
            # create_task), so they have to be awaited or they never execute
            await image_1.construct(None, self.db_path, self.verbose)
            await image_2.construct(None, self.db_path, self.verbose)
            await image_1.save_image_data()
            await image_2.save_image_data()
            # save_ignore_similarity() is defined elsewhere; await it only if it is a coroutine
            result = image_1.save_ignore_similarity(image_2)
            if asyncio.iscoroutine(result):
                await result

        asyncio.run(record_ignore())