diff --git a/tests/python_client/chaos/checker.py b/tests/python_client/chaos/checker.py
index f63bb5985b257..4508389fc9e62 100644
--- a/tests/python_client/chaos/checker.py
+++ b/tests/python_client/chaos/checker.py
@@ -408,7 +408,7 @@ def __init__(self, collection_name=None, partition_name=None, shards_num=2, dim=
         self.c_wrap.load(replica_number=self.replica_number)
         self.p_wrap.init_partition(self.c_name, self.p_name)
-        if insert_data:
+        if insert_data and self.c_wrap.num_entities == 0:
             log.info(f"collection {c_name} created, start to insert data")
             t0 = time.perf_counter()
             self.insert_data(nb=constants.ENTITIES_FOR_SEARCH, partition_name=self.p_name)
diff --git a/tests/python_client/chaos/testcases/test_get_collections.py b/tests/python_client/chaos/testcases/test_get_collections.py
index 62c6a5f1bfb7a..a90c00f2e67b2 100644
--- a/tests/python_client/chaos/testcases/test_get_collections.py
+++ b/tests/python_client/chaos/testcases/test_get_collections.py
@@ -17,7 +17,7 @@ class TestGetCollections(TestcaseBase):
     def test_get_collections_by_prefix(self,):
         self._connect()
         all_collections = self.utility_wrap.list_collections()[0]
-        all_collections = [c_name for c_name in all_collections if "Checker" in c_name]
+        all_collections = [c_name for c_name in all_collections if c_name.startswith("Checker")]
         selected_collections_map = {}
         for c_name in all_collections:
             if Collection(name=c_name).num_entities < constants.ENTITIES_FOR_SEARCH:
diff --git a/tests/python_client/common/common_func.py b/tests/python_client/common/common_func.py
index fead3dd43c75e..d0770c62a7010 100644
--- a/tests/python_client/common/common_func.py
+++ b/tests/python_client/common/common_func.py
@@ -127,7 +127,7 @@ def remove_punctuation(text):
 # Tokenize the corpus
 def jieba_split(text):
     text_without_punctuation = remove_punctuation(text)
-    return jieba.lcut(text_without_punctuation)
+    return jieba.cut_for_search(text_without_punctuation)
 
 def blank_space_split(text):
     text_without_punctuation = remove_punctuation(text)
@@ -156,7 +156,7 @@ def analyze_documents(texts, language="en"):
         if isinstance(text, str):
             new_texts.append(text)
     # Tokenize the corpus
-    tokenized = tokenizer.tokenize(new_texts, return_as="tuple")
+    tokenized = tokenizer.tokenize(new_texts, return_as="tuple", show_progress=False)
     # log.info(f"Tokenized: {tokenized}")
     # Create a frequency counter
     freq = Counter()
@@ -169,8 +169,13 @@ def analyze_documents(texts, language="en"):
 
     # Convert token ids back to words
     word_freq = Counter({id_to_word[token_id]: count for token_id, count in freq.items()})
-    log.debug(f"word freq {word_freq.most_common(10)}")
+
+    # if language in ["zh", "cn", "chinese"], remove the long words
+    # this is a trick to make the text match test case verification simple, because the long word can be still split
+    if language in ["zh", "cn", "chinese"]:
+        word_freq = Counter({word: count for word, count in word_freq.items() if 1< len(word) <= 3})
+    log.info(f"word freq {word_freq.most_common(10)}")
     return word_freq
diff --git a/tests/python_client/testcases/test_full_text_search.py b/tests/python_client/testcases/test_full_text_search.py
index b3445f75560b7..1386598f94ca4 100644
--- a/tests/python_client/testcases/test_full_text_search.py
+++ b/tests/python_client/testcases/test_full_text_search.py
@@ -506,7 +506,6 @@ def test_insert_for_full_text_search_default(self, tokenizer, text_lang, nullabl
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -658,7 +657,6 @@ def test_insert_for_full_text_search_enable_dynamic_field(self, tokenizer, text_
                 if i + batch_size < len(data)
                 else data[i: len(data)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -800,7 +798,6 @@ def test_insert_for_full_text_search_with_dataframe(self, tokenizer, text_lang,
         batch_size = 5000
         for i in range(0, len(df), batch_size):
             collection_w.insert(df[i: i + batch_size])
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -938,7 +935,6 @@ def test_insert_for_full_text_search_with_part_of_empty_string(self, tokenizer):
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         num_entities = collection_w.num_entities
         # query with count(*)
         res, _ = collection_w.query(
@@ -1190,7 +1186,6 @@ def test_upsert_for_full_text_search(self, tokenizer, nullable):
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -1348,7 +1343,6 @@ def test_upsert_for_full_text_search_with_no_varchar_data(self, tokenizer, nulla
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -1486,7 +1480,6 @@ def test_delete_for_full_text_search(self, tokenizer):
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -1651,7 +1644,6 @@ def test_create_index_for_full_text_search_default(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -1775,7 +1767,6 @@ def test_create_full_text_search_with_invalid_index_type(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -1884,7 +1875,6 @@ def test_create_full_text_search_index_with_invalid_metric_type(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -1993,8 +1983,6 @@ def test_create_index_using_bm25_metric_type_for_non_bm25_output_field(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
-
         error = {ct.err_code: 1100, ct.err_msg: "float vector index does not support metric type: BM25"}
         collection_w.create_index(
             "emb",
@@ -2091,7 +2079,6 @@ def test_create_full_text_search_with_invalid_bm25_params(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -2227,7 +2214,6 @@ def test_full_text_search_default(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -2316,7 +2302,7 @@ def test_full_text_search_default(
                     overlap) > 0, f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
 
     @pytest.mark.tags(CaseLabel.L0)
-    @pytest.mark.parametrize("nq", [10])
+    @pytest.mark.parametrize("nq", [2])
     @pytest.mark.parametrize("empty_percent", [0.5])
     @pytest.mark.parametrize("enable_partition_key", [True])
     @pytest.mark.parametrize("enable_inverted_index", [True])
@@ -2409,7 +2395,10 @@ def test_full_text_search_with_jieba_tokenizer(
         log.info(f"dataframe\n{df}")
         texts = df["text"].to_list()
         word_freq = cf.analyze_documents(texts, language=language)
-        tokens = list(word_freq.keys())
+        tokens = []
+        for item in word_freq.most_common(20):
+            if len(item[0]) == 2:
+                tokens.append(item[0])
         if len(tokens) == 0:
             log.info(f"empty tokens, add a dummy token")
             tokens = ["dummy"]
@@ -2420,7 +2409,6 @@ def test_full_text_search_with_jieba_tokenizer(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -2612,7 +2600,6 @@ def test_full_text_search_with_range_search(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -2778,7 +2765,6 @@ def test_full_text_search_with_search_iterator(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -2812,16 +2798,17 @@ def test_full_text_search_with_search_iterator(
             output_fields=["id", "text", "text_sparse_emb"],
             limit=limit
         )
+        iter_result = []
         while True:
             result = iterator.next()
             if not result:
                 iterator.close()
                 break
             else:
-                assert len(result) == batch_size
+                iter_result.append(len(result))
+        for r in iter_result[:-1]:
+            assert r == batch_size
 
-
-# @pytest.mark.skip("skip")
 class TestSearchWithFullTextSearchNegative(TestcaseBase):
     """
     ******************************************************************
@@ -2925,7 +2912,6 @@ def test_search_for_full_text_search_with_empty_string_search_data(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -3062,7 +3048,6 @@ def test_search_for_full_text_search_with_invalid_search_data(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -3200,7 +3185,6 @@ def test_hybrid_search_with_full_text_search(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "dense_emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},