Changed Python version and index update logic of cfeature. All worked.

ryogrid · Jan 25, 2025 · 07798d5 · 07798d5
1 parent 698a316
commit 07798d5
Show file tree

Hide file tree

Showing 2 changed files with 46 additions and 49 deletions.
diff --git a/gen_cfeatures.py b/gen_cfeatures.py
@@ -19,6 +19,7 @@
 from PIL import Image
 from huggingface_hub import hf_hub_download, HfFileSystem
 from gensim.similarities import Similarity
+from scipy.sparse import csr_matrix
 
 try:
     from imgutils.data import load_images, ImageTyping
@@ -294,9 +295,11 @@ def gen_image_ndarray(self, file_path) -> np.ndarray | None:
             return None
 
     def get_image_feature(self, file_path: str) -> np.ndarray:
-        if self.cindex is None:
-            self.cindex = Similarity.load('charactor-featues-idx')
+        if self.threshold == -1.0:
             self.threshold = self.ccip_default_threshold(_DEFAULT_MODEL_NAMES) / 1.5
+        # if self.cindex is None:
+        #     self.cindex = Similarity.load('charactor-featues-idx')
+        #     self.threshold = self.ccip_default_threshold(_DEFAULT_MODEL_NAMES) / 1.5
 
         img: np.ndarray = self.gen_image_ndarray(file_path)
         return self.predict([img])[0]
@@ -311,6 +314,25 @@ def write_vecs_to_index(self, vecs: np.ndarray) -> bool:
                 self.cindex.add_documents([id_and_vals])
                 #self.cindex.add_documents([vec])
 
+    def get_current_cfeature_number(self) -> int:
+        # Return the highest revision number among the index files in the current working directory.
+        files = os.listdir('.')
+
+        # Match "charactor-featues-idx" or "charactor-featues-idxNUMBER" (digits follow directly, no dot).
+        pattern = re.compile(r'^charactor-featues-idx(\d*)$')
+        numbers = []
+
+        for file in files:
+            match = pattern.match(file)
+            if match:
+                # Numeric suffix as int; the bare name without a suffix counts as revision 0.
+                number = int(match.group(1)) if match.group(1) else 0
+                numbers.append(number)
+
+        # Highest revision found; NOTE: max() raises ValueError when no index file matched.
+        max_number = max(numbers)
+
+        return max_number
 
     def process_directory(self, dir_path: str, added_date: datetime.date | None = None) -> None:
         file_list: List[str] = self.list_files_recursive(dir_path)
@@ -321,44 +343,29 @@ def process_directory(self, dir_path: str, added_date: datetime.date | None = No
             file_list = self.filter_files_by_date(file_list, added_date)
             print(f'{len(file_list)} files found after {added_date}')
 
-            # Create backup directory with timestamp
-            backup_dir = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
-            os.makedirs(backup_dir, exist_ok=True)
-            # Backup existing index files
-            for file in Path('.').glob('charactor-featues-idx*'):
-                shutil.copy2(file, Path(backup_dir) / file.name)
-                print(f'Backed up {file} to {backup_dir}')
-
-            # find latest revision of index files (charactor-featues-idx.NUMBER)
-            files = os.listdir('.')
-
-            # Extract files matching the pattern "charactor-features-index" or "charactor-features-index.NUMBER"
-            pattern = re.compile(r'^charactor-featues-idx(\d*)$')
-            numbers = []
-
-            for file in files:
-                match = pattern.match(file)
-                if match:
-                    # Get the numeric part (default to 0 if not present)
-                    number = int(match.group(1)) if match.group(1) else 0
-                    numbers.append(number)
+            # # Create backup directory with timestamp
+            # backup_dir = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+            # os.makedirs(backup_dir, exist_ok=True)
+            # # Backup existing index files
+            # for file in Path('.').glob('charactor-featues-idx*'):
+            #     shutil.copy2(file, Path(backup_dir) / file.name)
+            #     print(f'Backed up {file} to {backup_dir}')
 
-            # Get the maximum number
-            max_number = max(numbers)
+            max_number = self.get_current_cfeature_number()
 
             print('copying index files to new index files')
 
             # copy all index data to new index files
             if max_number == 0:
-                old_index = Similarity.load('charactor-featues-idx')
+                old_index = Similarity.load('charactor-featues-idx', mmap=None)
             else:
-                old_index = Similarity.load('tmp-charactor-featues-idx' + str(max_number))
+                old_index = Similarity.load('tmp-charactor-featues-idx' + str(max_number), mmap=None)
 
             for idx in range(0, len(old_index)):
                 if self.cindex is None:
-                    self.cindex = Similarity('charactor-featues-idx' + str(max_number + 1), [old_index.vector_by_id(idx)], num_features=768)
+                    self.cindex = Similarity('charactor-featues-idx' + str(max_number + 1), [csr_matrix(old_index.vector_by_id(idx)).toarray().squeeze()], num_features=768)
                 else:
-                    self.cindex.add_documents([old_index.vector_by_id(idx)])
+                    self.cindex.add_documents([csr_matrix(old_index.vector_by_id(idx)).toarray().squeeze()])
 
             print('copying index files to new index files done')
 

diff --git a/webui.py b/webui.py
@@ -7,6 +7,7 @@
 from gensim.similarities import MatrixSimilarity
 from numpy import ndarray
 from streamlit.runtime.state import SessionStateProxy
+from icecream import ic
 import pickle
 
 import numpy as np
@@ -256,38 +257,25 @@ def get_cfeatures_based_reranked_scores(final_scores, topn, required_tags: List[
     global cfeatures_idx
     global predictor
 
+    if predictor is None:
+        predictor = Predictor()
+        predictor.embed_model = predictor._open_feat_model(_DEFAULT_MODEL_NAMES, executor='CPUExecutionProvider')
+        predictor.metric_model = predictor._open_metric_model(_DEFAULT_MODEL_NAMES, executor='CPUExecutionProvider')
+
     if cfeature_filepath_idx is None:
         cfeature_filepath_idx = []
         with open('charactor-featues-idx.csv', 'r', encoding='utf-8') as f:
             for line in f:
                 cfeature_filepath_idx.append(line.strip())
 
     if cfeatures_idx is None:
-        # find latest revision of index files (charactor-featues-idx.NUMBER)
-        files = os.listdir('.')
-        # Extract files matching the pattern "charactor-features-index" or "charactor-features-index.NUMBER"
-        pattern = re.compile(r'^charactor-featues-idx(\d*)$')
-        numbers = []
-
-        for file in files:
-            match = pattern.match(file)
-            if match:
-                # Get the numeric part (default to 0 if not present)
-                number = int(match.group(1)) if match.group(1) else 0
-                numbers.append(number)
-
-        # Get the maximum number
-        max_number = max(numbers)
+        # get latest revision of index files (charactor-featues-idx or charactor-featues-idxNUMBER)
+        max_number = predictor.get_current_cfeature_number()
         if max_number == 0:
             cfeatures_idx = MatrixSimilarity.load('charactor-featues-idx')
         else:
             cfeatures_idx = MatrixSimilarity.load('charactor-featues-idx' + str(max_number))
 
-    if predictor is None:
-        predictor = Predictor()
-        predictor.embed_model = predictor._open_feat_model(_DEFAULT_MODEL_NAMES, executor='CPUExecutionProvider')
-        predictor.metric_model = predictor._open_metric_model(_DEFAULT_MODEL_NAMES, executor='CPUExecutionProvider')
-
     # when length of final_scores is larger than 10, calculate mean vector of cfeatures from top10 images
     # and calculate similarity between the mean vector and all images
     # then, sort the similarity and return images whose similarity is higher than threshold
@@ -316,6 +304,8 @@ def get_cfeatures_based_reranked_scores(final_scores, topn, required_tags: List[
         diffs_by_cfeature_list: List[Tuple[int, float]] = []
 
         for idx in range(0, len(cfeature_filepath_idx)):
+            # ic(cfeatures_idx.vector_by_id(idx))
+            # ic(len(cfeatures_idx.vector_by_id(idx)))
             diff: float = predictor.ccip_difference(cfeatures_idx.vector_by_id(idx), weighted_mean_cfeatures)
 
             is_include_required = False