Skip to content

Commit

Permalink
changed python version and index update logic of cfeature. all worked.
Browse files Browse the repository at this point in the history
  • Loading branch information
ryogrid committed Jan 25, 2025
1 parent 698a316 commit 07798d5
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 49 deletions.
65 changes: 36 additions & 29 deletions gen_cfeatures.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from PIL import Image
from huggingface_hub import hf_hub_download, HfFileSystem
from gensim.similarities import Similarity
from scipy.sparse import csr_matrix

try:
from imgutils.data import load_images, ImageTyping
Expand Down Expand Up @@ -294,9 +295,11 @@ def gen_image_ndarray(self, file_path) -> np.ndarray | None:
return None

def get_image_feature(self, file_path: str) -> np.ndarray:
if self.cindex is None:
self.cindex = Similarity.load('charactor-featues-idx')
if self.threshold == -1.0:
self.threshold = self.ccip_default_threshold(_DEFAULT_MODEL_NAMES) / 1.5
# if self.cindex is None:
# self.cindex = Similarity.load('charactor-featues-idx')
# self.threshold = self.ccip_default_threshold(_DEFAULT_MODEL_NAMES) / 1.5

img: np.ndarray = self.gen_image_ndarray(file_path)
return self.predict([img])[0]
Expand All @@ -311,6 +314,25 @@ def write_vecs_to_index(self, vecs: np.ndarray) -> bool:
self.cindex.add_documents([id_and_vals])
#self.cindex.add_documents([vec])

def get_current_cfeature_number(self, directory: str = '.') -> int:
    """Return the highest revision number among the character-feature index files.

    A file counts as an index file when its whole name is
    ``charactor-featues-idx`` (treated as revision 0) or
    ``charactor-featues-idx<NUMBER>`` with the digits appended directly.
    The pattern is anchored at both ends, so names carrying any other
    prefix or suffix (for example ``tmp-...``, ``....csv`` or ``....0``)
    are ignored.

    Args:
        directory: Directory to scan. Defaults to the current working
            directory, which is the location this method always scanned.

    Returns:
        The largest revision number found; 0 when only the un-numbered
        base index file exists.

    Raises:
        ValueError: If ``directory`` contains no index file at all.
    """
    pattern = re.compile(r'^charactor-featues-idx(\d*)$')

    # An empty digit group means the un-numbered base file, i.e. revision 0.
    numbers = [
        int(match.group(1) or 0)
        for match in map(pattern.match, os.listdir(directory))
        if match
    ]

    if not numbers:
        # Without this guard max() would fail with the opaque
        # "max() arg is an empty sequence" message.
        raise ValueError(
            f"no 'charactor-featues-idx' index file found in {directory!r}"
        )

    return max(numbers)

def process_directory(self, dir_path: str, added_date: datetime.date | None = None) -> None:
file_list: List[str] = self.list_files_recursive(dir_path)
Expand All @@ -321,44 +343,29 @@ def process_directory(self, dir_path: str, added_date: datetime.date | None = No
file_list = self.filter_files_by_date(file_list, added_date)
print(f'{len(file_list)} files found after {added_date}')

# Create backup directory with timestamp
backup_dir = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
os.makedirs(backup_dir, exist_ok=True)
# Backup existing index files
for file in Path('.').glob('charactor-featues-idx*'):
shutil.copy2(file, Path(backup_dir) / file.name)
print(f'Backed up {file} to {backup_dir}')

# find latest revision of index files (charactor-featues-idx.NUMBER)
files = os.listdir('.')

# Extract files matching the pattern "charactor-features-index" or "charactor-features-index.NUMBER"
pattern = re.compile(r'^charactor-featues-idx(\d*)$')
numbers = []

for file in files:
match = pattern.match(file)
if match:
# Get the numeric part (default to 0 if not present)
number = int(match.group(1)) if match.group(1) else 0
numbers.append(number)
# # Create backup directory with timestamp
# backup_dir = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
# os.makedirs(backup_dir, exist_ok=True)
# # Backup existing index files
# for file in Path('.').glob('charactor-featues-idx*'):
# shutil.copy2(file, Path(backup_dir) / file.name)
# print(f'Backed up {file} to {backup_dir}')

# Get the maximum number
max_number = max(numbers)
max_number = self.get_current_cfeature_number()

print('copying index files to new index files')

# copy all index data to new index files
if max_number == 0:
old_index = Similarity.load('charactor-featues-idx')
old_index = Similarity.load('charactor-featues-idx', mmap=None)
else:
old_index = Similarity.load('tmp-charactor-featues-idx' + str(max_number))
old_index = Similarity.load('tmp-charactor-featues-idx' + str(max_number), mmap=None)

for idx in range(0, len(old_index)):
if self.cindex is None:
self.cindex = Similarity('charactor-featues-idx' + str(max_number + 1), [old_index.vector_by_id(idx)], num_features=768)
self.cindex = Similarity('charactor-featues-idx' + str(max_number + 1), [csr_matrix(old_index.vector_by_id(idx)).toarray().squeeze()], num_features=768)
else:
self.cindex.add_documents([old_index.vector_by_id(idx)])
self.cindex.add_documents([csr_matrix(old_index.vector_by_id(idx)).toarray().squeeze()])

print('copying index files to new index files done')

Expand Down
30 changes: 10 additions & 20 deletions webui.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from gensim.similarities import MatrixSimilarity
from numpy import ndarray
from streamlit.runtime.state import SessionStateProxy
from icecream import ic
import pickle

import numpy as np
Expand Down Expand Up @@ -256,38 +257,25 @@ def get_cfeatures_based_reranked_scores(final_scores, topn, required_tags: List[
global cfeatures_idx
global predictor

if predictor is None:
predictor = Predictor()
predictor.embed_model = predictor._open_feat_model(_DEFAULT_MODEL_NAMES, executor='CPUExecutionProvider')
predictor.metric_model = predictor._open_metric_model(_DEFAULT_MODEL_NAMES, executor='CPUExecutionProvider')

if cfeature_filepath_idx is None:
cfeature_filepath_idx = []
with open('charactor-featues-idx.csv', 'r', encoding='utf-8') as f:
for line in f:
cfeature_filepath_idx.append(line.strip())

if cfeatures_idx is None:
# find latest revision of index files (charactor-featues-idx.NUMBER)
files = os.listdir('.')
# Extract files matching the pattern "charactor-features-index" or "charactor-features-index.NUMBER"
pattern = re.compile(r'^charactor-featues-idx(\d*)$')
numbers = []

for file in files:
match = pattern.match(file)
if match:
# Get the numeric part (default to 0 if not present)
number = int(match.group(1)) if match.group(1) else 0
numbers.append(number)

# Get the maximum number
max_number = max(numbers)
# get latest revision of index files (charactor-featues-idx.NUMBER)
max_number = predictor.get_current_cfeature_number()
if max_number == 0:
cfeatures_idx = MatrixSimilarity.load('charactor-featues-idx')
else:
cfeatures_idx = MatrixSimilarity.load('charactor-featues-idx' + str(max_number))

if predictor is None:
predictor = Predictor()
predictor.embed_model = predictor._open_feat_model(_DEFAULT_MODEL_NAMES, executor='CPUExecutionProvider')
predictor.metric_model = predictor._open_metric_model(_DEFAULT_MODEL_NAMES, executor='CPUExecutionProvider')

# when length of final_scores is larger than 10, calculate mean vector of cfeatures from top10 images
# and calculate similarity between the mean vector and all images
# then, sort the similarity and return images whose similarity is higher than threshold
Expand Down Expand Up @@ -316,6 +304,8 @@ def get_cfeatures_based_reranked_scores(final_scores, topn, required_tags: List[
diffs_by_cfeature_list: List[Tuple[int, float]] = []

for idx in range(0, len(cfeature_filepath_idx)):
# ic(cfeatures_idx.vector_by_id(idx))
# ic(len(cfeatures_idx.vector_by_id(idx)))
diff: float = predictor.ccip_difference(cfeatures_idx.vector_by_id(idx), weighted_mean_cfeatures)

is_include_required = False
Expand Down

0 comments on commit 07798d5

Please sign in to comment.