# Restored from a collapsed diff hunk (@@ -25,9 +25,37 @@) against
# web-ui-image-search-lsi.py; shown as the post-change code, one statement
# per line.
args: Optional[Arguments] = None


def normalize_and_apply_weight_to_quey_bow(
    query_bow: List[Tuple[int, int]], new_doc: str
) -> List[Tuple[int, float]]:
    """Apply per-tag weights to a bag-of-words query and project it with the model.

    ``new_doc`` is a whitespace-separated list of tags, each optionally
    written as ``tag:weight``. The count of every ``query_bow`` entry whose
    id matches a weighted tag is multiplied by that weight, the weighted
    query is passed through ``model[...]``, and every returned value is
    divided by the number of tags in ``new_doc``.

    ``dictionary`` and ``model`` are not parameters; both are resolved from
    module scope.
    NOTE(review): ``find_similar_documents`` receives its own ``model``
    argument, but this helper reads the module-level name -- confirm they
    are the same object.

    Args:
        query_bow: ``(token_id, count)`` pairs for the query. Not modified.
        new_doc: Raw query text, e.g. ``"cat:2 dog:0.5 bird"``.

    Returns:
        The ``(id, value)`` pairs produced by ``model[...]``, with each value
        divided by the tag count.
    """
    # split() without an argument drops the empty strings that repeated or
    # leading/trailing spaces would otherwise yield, so they neither count as
    # tags nor trigger a dictionary lookup.
    tags: List[str] = new_doc.split()

    # One multiplier per token id. A tag that appears several times has its
    # weights multiplied together.
    weight_by_id = {}
    for tag in tags:
        name, _, weight_text = tag.partition(":")
        try:
            # float() rather than int() so fractional weights are accepted.
            weight = float(weight_text) if weight_text else 1.0
        except ValueError:
            # Unparsable weight (e.g. "a:b" or "a:b:c"): keep the tag at
            # neutral weight instead of failing the whole query.
            weight = 1.0

        # Tags missing from the dictionary contribute nothing to the query,
        # so skip them rather than raising KeyError.
        token_id = dictionary.token2id.get(name)
        if token_id is None:
            continue
        weight_by_id[token_id] = weight_by_id.get(token_id, 1.0) * weight

    # Build a new list so the caller's query_bow is left untouched.
    weighted_bow: List[Tuple[int, float]] = [
        (token_id, count * weight_by_id.get(token_id, 1))
        for token_id, count in query_bow
    ]

    query_lsi: List[Tuple[int, float]] = model[weighted_bow]

    # Normalize by the number of tags; guard against an empty query.
    tag_num: int = len(tags) or 1
    return [(topic_id, value / tag_num) for topic_id, value in query_lsi]


def find_similar_documents(model: LsiModel, new_doc: str, topn: int = 50) -> List[Tuple[int, float]]:
    query_bow: List[Tuple[int, int]] = dictionary.doc2bow(simple_preprocess(new_doc))
    query_lsi = normalize_and_apply_weight_to_quey_bow(query_bow, new_doc)
    #query_lsi: List[Tuple[int, float]] = model[query_bow]
    sims: List[Tuple[int, float]] = index[query_lsi]
    # (the remainder of this function lies outside the visible excerpt)