Merge branch 'Feature/#338' into Feature/#335
# Conflicts:
#	autorag/nodes/passageaugmenter/base.py
bwook00 committed Apr 22, 2024
2 parents a71f8a4 + d933625 commit 37c1567
Showing 11 changed files with 83 additions and 55 deletions.
2 changes: 1 addition & 1 deletion autorag/VERSION
@@ -1 +1 @@
-0.1.3
+0.1.4
16 changes: 6 additions & 10 deletions autorag/data/corpus/langchain.py
@@ -25,16 +25,12 @@ def langchain_documents_to_parquet(langchain_documents: List[Document],
                      Default is False.
     :return: Corpus data as pd.DataFrame
     """
-    doc_ids = [str(uuid.uuid4()) for _ in langchain_documents]
-    corpus_df = pd.DataFrame([
-        {
-            'doc_id': doc_id,
-            'contents': doc.page_content,
-            'metadata': add_essential_metadata(doc.metadata, prev_id, next_id)
-        }
-        for doc, doc_id, prev_id, next_id in
-        zip(langchain_documents, doc_ids, [None] + doc_ids[:-1], doc_ids[1:] + [None])
-    ])
+
+    corpus_df = pd.DataFrame(list(map(lambda doc: {
+        'doc_id': str(uuid.uuid4()),
+        'contents': doc.page_content,
+        'metadata': add_essential_metadata(doc.metadata)
+    }, langchain_documents)))
 
     if output_filepath is not None:
         save_parquet_safe(corpus_df, output_filepath, upsert=upsert)
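Net effect of this file's change: the corpus builder no longer threads `prev_id`/`next_id` through neighbouring documents at build time; every document simply receives a fresh UUID, and the linkage keys are backfilled later by `cast_corpus_dataset`. A minimal sketch of the new shape, using a stand-in `Document` class rather than the real LangChain import:

```python
import uuid
from dataclasses import dataclass, field

import pandas as pd


@dataclass
class Document:  # stand-in for langchain's Document, attributes only
    page_content: str
    metadata: dict = field(default_factory=dict)


docs = [Document('alpha'), Document('beta')]
corpus_df = pd.DataFrame([{
    'doc_id': str(uuid.uuid4()),   # fresh id per document, no chaining
    'contents': doc.page_content,
    'metadata': doc.metadata,      # add_essential_metadata would stamp a datetime here
} for doc in docs])
assert 'prev_id' not in corpus_df['metadata'].iloc[0]  # defaults come from cast_corpus_dataset
```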
34 changes: 18 additions & 16 deletions autorag/data/corpus/llama_index.py
@@ -27,15 +27,12 @@ def llama_documents_to_parquet(llama_documents: List[Document],
                      Default is False.
     :return: Corpus data as pd.DataFrame
     """
-    doc_ids = [str(uuid.uuid4()) for _ in llama_documents]
-    doc_lst = [
-        {
-            'doc_id': doc_id,
-            'contents': doc.text,
-            'metadata': add_essential_metadata(doc.metadata, prev_id, next_id)
-        }
-        for doc, doc_id, prev_id, next_id in zip(llama_documents, doc_ids, [None] + doc_ids[:-1], doc_ids[1:] + [None])
-    ]
+
+    doc_lst = pd.DataFrame(list(map(lambda doc: {
+        'doc_id': str(uuid.uuid4()),
+        'contents': doc.text,
+        'metadata': add_essential_metadata(doc.metadata)
+    }, llama_documents)))
 
     processed_df = pd.DataFrame(doc_lst)
 
@@ -62,7 +59,6 @@ def llama_text_node_to_parquet(text_nodes: List[TextNode],
                      Default is False.
     :return: Corpus data as pd.DataFrame
     """
-
     corpus_df = pd.DataFrame(list(map(lambda node: {
         'doc_id': node.node_id,
         'contents': node.text,
@@ -78,10 +74,16 @@
 def add_essential_metadata_llama_text_node(metadata: Dict, relationships: Dict) -> Dict:
     if 'last_modified_datetime' not in metadata:
         metadata['last_modified_datetime'] = datetime.now()
-    prev_node = relationships.get(NodeRelationship.PREVIOUS, None)
-    if prev_node:
-        metadata['prev_id'] = prev_node.node_id
-    next_node = relationships.get(NodeRelationship.NEXT, None)
-    if next_node:
-        metadata['next_id'] = next_node.node_id
+
+    if 'prev_id' not in metadata:
+        if NodeRelationship.PREVIOUS in relationships:
+            prev_node = relationships.get(NodeRelationship.PREVIOUS, None)
+            if prev_node:
+                metadata['prev_id'] = prev_node.node_id
+
+    if 'next_id' not in metadata:
+        if NodeRelationship.NEXT in relationships:
+            next_node = relationships.get(NodeRelationship.NEXT, None)
+            if next_node:
+                metadata['next_id'] = next_node.node_id
     return metadata
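The guarded rewrite means ids already present in a node's metadata are no longer overwritten by its relationships. A rough equivalent with plain stand-ins for the llama_index `NodeRelationship` keys:

```python
from types import SimpleNamespace

PREVIOUS, NEXT = 'previous', 'next'  # stand-ins for NodeRelationship members


def add_prev_next(metadata: dict, relationships: dict) -> dict:
    # Mirrors the guarded logic above: only fill ids that are still missing.
    for key, rel in (('prev_id', PREVIOUS), ('next_id', NEXT)):
        if key not in metadata and rel in relationships:
            node = relationships.get(rel)
            if node:
                metadata[key] = node.node_id
    return metadata


meta = add_prev_next({'prev_id': 'custom'}, {PREVIOUS: SimpleNamespace(node_id='n0')})
assert meta['prev_id'] == 'custom'  # a pre-existing id now wins
assert 'next_id' not in meta        # no NEXT relationship, so nothing is added
```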
6 changes: 1 addition & 5 deletions autorag/data/utils/util.py
@@ -31,13 +31,9 @@ def get_file_metadata(file_path: str) -> Dict:
     }
 
 
-def add_essential_metadata(metadata: Dict, prev_id: str, next_id: str) -> Dict:
+def add_essential_metadata(metadata: Dict) -> Dict:
     if 'last_modified_datetime' not in metadata:
         metadata['last_modified_datetime'] = datetime.now()
-    if 'prev_id' not in metadata:
-        metadata['prev_id'] = prev_id
-    if 'next_id' not in metadata:
-        metadata['next_id'] = next_id
     return metadata
 
 
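With `prev_id`/`next_id` gone, the helper's contract shrinks to a single guarantee. A quick check of the new behavior:

```python
from datetime import datetime


def add_essential_metadata(metadata: dict) -> dict:
    # Slimmed-down helper: only the datetime key is guaranteed now.
    if 'last_modified_datetime' not in metadata:
        metadata['last_modified_datetime'] = datetime.now()
    return metadata


meta = add_essential_metadata({'source': 'a.txt'})
assert 'last_modified_datetime' in meta
assert 'prev_id' not in meta  # prev/next defaulting moved to cast_corpus_dataset
```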
40 changes: 25 additions & 15 deletions autorag/nodes/passageaugmenter/base.py
@@ -11,8 +11,9 @@
 
 from autorag import embedding_models
 from autorag.evaluate.metric.util import calculate_cosine_similarity
-from autorag.utils import result_to_dataframe, validate_qa_dataset, fetch_contents, sort_by_scores
-from autorag.utils.util import reconstruct_list, filter_dict_keys
+from autorag.utils import (result_to_dataframe, validate_qa_dataset, fetch_contents, sort_by_scores,
+                           validate_corpus_dataset, cast_corpus_dataset)
+from autorag.utils.util import reconstruct_list, filter_dict_keys, select_top_k
 
 logger = logging.getLogger("AutoRAG")
 
@@ -36,42 +37,51 @@ def wrapper(
         ids = previous_result["retrieved_ids"].tolist()
 
         corpus_df = pd.read_parquet(os.path.join(data_dir, "corpus.parquet"))
+        validate_corpus_dataset(corpus_df)
 
+        # get top_k
+        top_k = kwargs.pop("top_k")
+        assert top_k <= len(ids[0]), ("The number of top_k must be same or less than the number of retrieved "
+                                      "passages.")
+
         if func.__name__ == 'prev_next_augmenter':
+            corpus_df = cast_corpus_dataset(corpus_df)
             slim_corpus_df = corpus_df[["doc_id", "metadata"]]
             slim_corpus_df['metadata'] = slim_corpus_df['metadata'].apply(filter_dict_keys, keys=['prev_id', 'next_id'])
 
             mode = kwargs.pop("mode", 'both')
             num_passages = kwargs.pop("num_passages", 1)
 
             # get augmented ids
-            ids = func(ids_list=ids, corpus_df=slim_corpus_df, mode=mode, num_passages=num_passages)
+            augmented_ids = func(ids_list=ids, corpus_df=slim_corpus_df, mode=mode, num_passages=num_passages)
             # fetch contents from corpus to use augmented ids
-            contents = fetch_contents(corpus_df, ids)
+            augmented_contents = fetch_contents(corpus_df, augmented_ids)
         else:
             contents = fetch_contents(corpus_df, ids)
-            ids, contents = func(ids_list=ids, contents_list=contents, *args, **kwargs)
+            augmented_ids, augmented_contents = func(ids_list=ids, contents_list=contents, *args, **kwargs)
 
         # set embedding model for getting scores
         embedding_model_str = kwargs.pop("embedding_model", 'openai')
-        query_embeddings, contents_embeddings = embedding_query_content(queries, contents, embedding_model_str,
-                                                                        batch=128)
+        query_embeddings, contents_embeddings = embedding_query_content(queries, augmented_contents,
+                                                                        embedding_model_str, batch=128)
 
         # get scores from calculated cosine similarity
-        scores = [np.array([calculate_cosine_similarity(query_embedding, x) for x in content_embeddings]).tolist()
-                  for query_embedding, content_embeddings in zip(query_embeddings, contents_embeddings)]
+        augmented_scores = [
+            np.array([calculate_cosine_similarity(query_embedding, x) for x in content_embeddings]).tolist()
+            for query_embedding, content_embeddings in zip(query_embeddings, contents_embeddings)]
 
         # sort by scores
         df = pd.DataFrame({
-            'contents': contents,
-            'ids': ids,
-            'scores': scores,
+            'contents': augmented_contents,
+            'ids': augmented_ids,
+            'scores': augmented_scores,
         })
         df[['contents', 'ids', 'scores']] = df.apply(sort_by_scores, axis=1, result_type='expand')
-        augmented_contents, augmented_ids, augmented_scores = \
-            df['contents'].tolist(), df['ids'].tolist(), df['scores'].tolist()
 
-        return augmented_contents, augmented_ids, augmented_scores
+        # select by top_k
+        results = select_top_k(df, ['contents', 'ids', 'scores'], top_k)
+
+        return results['contents'].tolist(), results['ids'].tolist(), results['scores'].tolist()
 
     return wrapper
 
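`select_top_k` is new to this wrapper; judging from the call site it keeps the first `top_k` entries of each list column after the rows have been sorted by score. A hypothetical stand-in with that behavior (the real implementation lives in `autorag.utils.util` and may differ):

```python
import pandas as pd


def select_top_k_sketch(df: pd.DataFrame, columns: list, top_k: int) -> pd.DataFrame:
    # Hypothetical: truncate each per-query list to its first top_k entries.
    for col in columns:
        df[col] = df[col].apply(lambda lst: lst[:top_k])
    return df


df = pd.DataFrame({'contents': [['a', 'b', 'c']],
                   'ids': [['1', '2', '3']],
                   'scores': [[0.9, 0.8, 0.7]]})
out = select_top_k_sketch(df, ['contents', 'ids', 'scores'], top_k=2)
assert out['scores'].iloc[0] == [0.9, 0.8]  # lists were already score-sorted
```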
15 changes: 15 additions & 0 deletions autorag/utils/preprocess.py
@@ -69,4 +69,19 @@ def make_datetime_metadata(x):
     assert sum(df['metadata'].apply(lambda x: x.get('last_modified_datetime') is not None)) == len(df), \
         "Every metadata must have a datetime key."
 
+    def make_prev_next_id_metadata(x, id_type: str):
+        if x is None or x == {}:
+            return {id_type: None}
+        elif x.get(id_type) is None:
+            return {**x, id_type: None}
+        else:
+            return x
+
+    df['metadata'] = df['metadata'].apply(lambda x: make_prev_next_id_metadata(x, 'prev_id'))
+    df['metadata'] = df['metadata'].apply(lambda x: make_prev_next_id_metadata(x, 'next_id'))
+
+    # check every metadata have a prev_id, next_id key
+    assert all('prev_id' in metadata for metadata in df['metadata']), "Every metadata must have a prev_id key."
+    assert all('next_id' in metadata for metadata in df['metadata']), "Every metadata must have a next_id key."
+
     return df
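In effect, `cast_corpus_dataset` now backfills missing `prev_id`/`next_id` keys with `None`, so downstream nodes like the passage augmenter can rely on the keys existing. The same defaulting in isolation:

```python
def backfill_id(metadata, id_type):
    # Same shape as make_prev_next_id_metadata above: default a missing id to None.
    if not metadata:
        return {id_type: None}
    if metadata.get(id_type) is None:
        return {**metadata, id_type: None}
    return metadata


meta = backfill_id(backfill_id({'next_id': 'doc2'}, 'prev_id'), 'next_id')
assert meta == {'next_id': 'doc2', 'prev_id': None}
```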
6 changes: 6 additions & 0 deletions docs/source/nodes/passage_augmenter/passage_augmenter.md
@@ -11,6 +11,12 @@ The primary benefit of passage augmenter is that allows users to fetch additiona
 
 ## **Node Parameters**
 
+**Top_k**
+
+- **Description**: The `top_k` parameter is used at the node level to define the top 'k' results to be retrieved
+  from the corpus.
+- 📌 **Note**: The node's `top_k` must be the same as or less than the `top_k` parameter of the `retrieval` node.
+
 **embedding_model**
 
 - **Description**: The embedding model name to be used for calculating the cosine similarity between the query and the
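Putting the new node parameter together with the updated tests below, a passage augmenter call now looks roughly like this (the import path is assumed from the repository layout, and the project directory and previous result are placeholders for a real AutoRAG project and the retrieval node's output):

```python
import pandas as pd

from autorag.nodes.passageaugmenter import prev_next_augmenter

project_dir = "./sample_project"      # placeholder: must contain data/corpus.parquet
previous_result = pd.DataFrame({      # placeholder: normally the retrieval node's output
    'query': ['what is AutoRAG?'],
    'retrieved_ids': [['doc1', 'doc2', 'doc3']],
    'retrieved_contents': [['...', '...', '...']],
    'retrieve_scores': [[0.9, 0.8, 0.7]],
})

result_df = prev_next_augmenter(project_dir=project_dir,
                                previous_result=previous_result,
                                mode='next',      # 'both' is the default
                                num_passages=1,
                                top_k=3)          # must not exceed retrieval's top_k
```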
2 changes: 0 additions & 2 deletions tests/autorag/data/corpus/test_base.py
@@ -13,6 +13,4 @@ def validate_corpus(result_df: pd.DataFrame, length: int, parquet_filepath):
 
     assert ['test text'] * length == result_df['contents'].tolist()
     assert all(['last_modified_datetime' in metadata for metadata in result_df['metadata'].tolist()])
-    assert all(['prev_id' in metadata for metadata in result_df['metadata'].tolist()])
-    assert all(['next_id' in metadata for metadata in result_df['metadata'].tolist()])
     assert all([isinstance(doc_id, str) for doc_id in result_df['doc_id'].tolist()])
@@ -33,12 +33,12 @@ def test_prev_next_augmenter_multi_passages():
 
 
 def test_prev_next_augmenter_node():
-    result_df = prev_next_augmenter(project_dir=project_dir, previous_result=previous_result, mode='next')
+    result_df = prev_next_augmenter(project_dir=project_dir, previous_result=previous_result, mode='next', top_k=3)
     contents = result_df["retrieved_contents"].tolist()
     ids = result_df["retrieved_ids"].tolist()
     scores = result_df["retrieve_scores"].tolist()
     assert len(contents) == len(ids) == len(scores) == 2
-    assert len(contents[0]) == len(ids[0]) == len(scores[0]) == 4
+    assert len(contents[0]) == len(ids[0]) == len(scores[0]) == 3
     for content_list, id_list, score_list in zip(contents, ids, scores):
         for i, (content, _id, score) in enumerate(zip(content_list, id_list, score_list)):
             assert isinstance(content, str)
@@ -26,7 +26,7 @@ def node_line_dir():
 
 
 def test_run_passage_augmenter_node(node_line_dir):
     modules = [prev_next_augmenter]
-    module_params = [{'num_passages': 1}]
+    module_params = [{'top_k': 2, 'num_passages': 1}]
     strategies = {
         'metrics': ['retrieval_f1', 'retrieval_recall'],
     }
@@ -52,7 +52,7 @@ def test_run_passage_augmenter_node(node_line_dir):
     assert summary_df['passage_augmenter_retrieval_f1'][0] == pytest.approx(result_df['retrieval_f1'].mean())
     assert summary_df['passage_augmenter_retrieval_recall'][0] == pytest.approx(result_df['retrieval_recall'].mean())
     assert summary_df['module_name'][0] == "prev_next_augmenter"
-    assert summary_df['module_params'][0] == {'num_passages': 1}
+    assert summary_df['module_params'][0] == {'top_k': 2, 'num_passages': 1}
     assert summary_df['execution_time'][0] > 0
     # test the best file is saved properly
     best_path = summary_df[summary_df['is_best']]['filename'].values[0]
9 changes: 7 additions & 2 deletions tests/autorag/utils/test_preprocess.py
@@ -1,7 +1,7 @@
 from datetime import datetime
 
-import pytest
 import pandas as pd
+import pytest
 
 from autorag.utils import validate_qa_dataset, validate_corpus_dataset, cast_qa_dataset, cast_corpus_dataset
 
@@ -22,7 +22,8 @@ def corpus_df():
     return pd.DataFrame({
         'doc_id': ['doc1', 'doc2', 'doc3'],
         'contents': ['content1', 'content2', 'content3'],
-        'metadata': [{}, {'test_key': 'test_value'}, {'last_modified_datetime': datetime(2022, 12, 1, 3, 4, 5)}]
+        'metadata': [{'prev_id': None, 'next_id': 'doc2'}, {'test_key': 'test_value'},
+                     {'last_modified_datetime': datetime(2022, 12, 1, 3, 4, 5)}]
     })


@@ -68,5 +69,9 @@ def test_cast_corpus_dataset(corpus_df):
     # Cast the dataset and check for a datetime key in metadata
     casted_df = cast_corpus_dataset(corpus_df)
     assert all('last_modified_datetime' in x for x in casted_df['metadata'])
+    assert casted_df['metadata'].iloc[0]['prev_id'] is None
+    assert casted_df['metadata'].iloc[0]['next_id'] == 'doc2'
     assert casted_df['metadata'].iloc[1]['test_key'] == 'test_value'
+    assert casted_df['metadata'].iloc[1]['prev_id'] is None
+    assert casted_df['metadata'].iloc[1]['next_id'] is None
     assert casted_df['metadata'].iloc[2]['last_modified_datetime'] == datetime(2022, 12, 1, 3, 4, 5)
