docs: Add minimal getting started code to showcase haystack + RAG (#5578)

* init
* Change question
* Add TODO comment
* Addressing feedback
* Add local folder option. Move additional functions inside haystack.utils for easier imports
* Apply Daria's review suggestions
* Add integration test
* change string formatting
* Add outputparser to HF
* Exclude anthropic test

Co-authored-by: Daria Fokina <[email protected]>
Co-authored-by: Silvano Cerza <[email protected]>
1 parent 10d6886 · commit d048bb5
Showing 4 changed files with 144 additions and 0 deletions.
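The commit message notes that the helper functions were moved into haystack.utils for easier imports; assuming the Haystack v1.x release this commit targets, everything the example below needs comes from two imports:

from haystack.document_stores import InMemoryDocumentStore
from haystack.utils import build_pipeline, add_example_data, print_answers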
@@ -0,0 +1,34 @@
from haystack.document_stores import InMemoryDocumentStore
from haystack.utils import build_pipeline, add_example_data, print_answers


def getting_started(provider, API_KEY):
    """
    This getting_started example shows you how to use LLMs with your data via a technique called Retrieval-Augmented Generation (RAG).
    :param provider: We are model agnostic :) Here, you can choose from: "anthropic", "cohere", "huggingface", and "openai".
    :param API_KEY: The API key matching the provider.
    """

    # We support many different databases. Here we load a simple and lightweight in-memory database.
    document_store = InMemoryDocumentStore(use_bm25=True)

    # Pipelines are the main abstraction in Haystack; they connect components like LLMs and databases.
    pipeline = build_pipeline(provider, API_KEY, document_store)

    # Download and add Game of Thrones TXT articles to Haystack's database.
    # You can also provide a folder with your local documents.
    # You might need to install additional dependencies - look inside the function for more information.
    add_example_data(document_store, "data/GoT_getting_started")

    # Ask a question on the data you just added.
    result = pipeline.run(query="Who is the father of Arya Stark?")

    # For details such as which documents were used to generate the answer, look into the <result> object.
    print_answers(result, details="medium")
    return result


if __name__ == "__main__":
    getting_started(provider="openai", API_KEY="ADD KEY HERE")
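To try the example without hardcoding a key, one might read it from the environment instead. This is a hypothetical usage sketch, not part of the commit; it assumes the file above is importable as examples.getting_started (the import path the integration test below uses) and that OPENAI_API_KEY is set:

# Hypothetical usage sketch (not part of the commit): pass the key from the environment.
import os

from examples.getting_started import getting_started

result = getting_started(provider="openai", API_KEY=os.environ["OPENAI_API_KEY"])
# pipeline.run() returns a dict; "answers" holds haystack.schema.Answer objects.
print(result["answers"][0].answer)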
@@ -0,0 +1,25 @@
import os

import pytest

from examples.getting_started import getting_started
from haystack.schema import Answer, Document


@pytest.mark.integration
@pytest.mark.parametrize("provider", ["cohere", "huggingface", "openai"])
def test_getting_started(provider):
    if provider == "anthropic":
        api_key = os.environ.get("ANTHROPIC_API_KEY", "")
    elif provider == "cohere":
        api_key = os.environ.get("COHERE_API_KEY", "")
    elif provider == "huggingface":
        api_key = os.environ.get("HUGGINGFACE_API_KEY", "")
    elif provider == "openai":
        api_key = os.environ.get("OPENAI_API_KEY", "")
    result = getting_started(provider=provider, API_KEY=api_key)

    # Testing only for functionality. Since model predictions from APIs might change, we cannot test those directly.
    assert isinstance(result, dict)
    assert isinstance(result["answers"][0], Answer)
    assert isinstance(result["documents"][0], Document)
@@ -0,0 +1,84 @@
import logging
import os

from haystack.utils import convert_files_to_docs
from haystack.utils import fetch_archive_from_http

logger = logging.getLogger(__name__)


def build_pipeline(provider, API_KEY, document_store):
    # Importing top-level causes a circular import
    from haystack.nodes import AnswerParser, PromptNode, PromptTemplate, BM25Retriever
    from haystack.pipelines import Pipeline

    provider = provider.lower()
    # A retriever selects the right documents when given a question.
    retriever = BM25Retriever(document_store=document_store, top_k=5)
    # Load the prompt for retrieval-augmented generation from https://prompthub.deepset.ai/?prompt=deepset%2Fquestion-answering-with-references
    question_answering_with_references = PromptTemplate(
        prompt="deepset/question-answering-with-references",
        output_parser=AnswerParser(reference_pattern=r"Document\[(\d+)\]"),
    )
    # Load the LLM
    if provider == "anthropic":
        prompt_node = PromptNode(
            model_name_or_path="claude-2", api_key=API_KEY, default_prompt_template=question_answering_with_references
        )
    elif provider == "cohere":
        prompt_node = PromptNode(
            model_name_or_path="command", api_key=API_KEY, default_prompt_template=question_answering_with_references
        )
    elif provider == "huggingface":
        # TODO: swap out for meta-llama/Llama-2-7b-chat-hf or the 40b model once supported in Haystack+HF API free tier
        # The tiiuae/falcon-7b-instruct model cannot handle a complex prompt with references, so we use a very simple one.
        simple_QA = PromptTemplate(
            prompt="deepset/question-answering", output_parser=AnswerParser(reference_pattern=r"Document\[(\d+)\]")
        )
        prompt_node = PromptNode(
            model_name_or_path="tiiuae/falcon-7b-instruct", api_key=API_KEY, default_prompt_template=simple_QA
        )
    elif provider == "openai":
        prompt_node = PromptNode(
            model_name_or_path="gpt-3.5-turbo-0301",
            api_key=API_KEY,
            default_prompt_template=question_answering_with_references,
        )
    else:
        # Fail fast instead of hitting an UnboundLocalError on prompt_node below.
        raise ValueError('Given <provider> unknown. Please use any of "anthropic", "cohere", "huggingface", or "openai".')
    # Compose the query pipeline
    query_pipeline = Pipeline()
    query_pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])
    query_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["retriever"])

    return query_pipeline


def add_example_data(document_store, dir):
    # Importing top-level causes a circular import
    from haystack.nodes import TextConverter, PreProcessor

    if dir == "data/GoT_getting_started":
        # Download and add Game of Thrones TXT files
        fetch_archive_from_http(
            url="https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip",
            output_dir=dir,
        )
        files_to_index = [dir + "/" + f for f in os.listdir(dir)]
        converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
        docs = [converter.convert(file_path=file, meta=None)[0] for file in files_to_index]
    else:
        # Here you can add a local folder with your files (.txt, .pdf, .docx).
        # You might need to install additional packages with "pip install farm-haystack[ocr,preprocessing,file-conversion,pdf]".
        # For more details, see: https://haystack.deepset.ai/tutorials/08_preprocessing.
        # Be aware that some of your data will be sent to external APIs if you use this functionality!
        files_to_index = [dir + "/" + f for f in os.listdir(dir)]
        logger.info("Adding %s files from local disk at %s.", len(files_to_index), dir)
        docs = convert_files_to_docs(dir_path=dir)

    preprocessor = PreProcessor(
        split_by="word", split_length=200, split_overlap=0, split_respect_sentence_boundary=True
    )
    docs_processed = preprocessor.process(docs)

    document_store.write_documents(documents=docs_processed)
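Putting the two helpers together on your own files rather than the bundled dataset: a hypothetical end-to-end sketch, where the folder name my_docs and the use of an OpenAI key are assumptions, not part of the commit. Because the path differs from "data/GoT_getting_started", add_example_data() takes the convert_files_to_docs() branch for local files:

# Hypothetical sketch (not part of the commit): index a local folder and query it.
import os

from haystack.document_stores import InMemoryDocumentStore
from haystack.utils import add_example_data, build_pipeline, print_answers

document_store = InMemoryDocumentStore(use_bm25=True)
pipeline = build_pipeline("openai", os.environ["OPENAI_API_KEY"], document_store)

# "my_docs" is an assumed local folder of .txt/.pdf/.docx files.
add_example_data(document_store, "my_docs")

result = pipeline.run(query="Who is the father of Arya Stark?")
print_answers(result, details="medium")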