diff --git a/nesis/rag/core/components/ingest/ingest_helper.py b/nesis/rag/core/components/ingest/ingest_helper.py index f643a4e..3f5702f 100644 --- a/nesis/rag/core/components/ingest/ingest_helper.py +++ b/nesis/rag/core/components/ingest/ingest_helper.py @@ -25,29 +25,29 @@ logger = logging.getLogger(__name__) -FILE_READER_CLS: Dict[str, Type[BaseReader]] = { - ".hwp": HWPReader, - ".pdf": PDFReader, - ".doc": DocxReader, - ".docx": DocxReader, - ".pptx": PptxReader, - ".ppt": PptxReader, - ".pptm": PptxReader, - ".jpg": ImageReader, - ".png": ImageReader, - ".jpeg": ImageReader, - ".mp3": VideoAudioReader, - ".mp4": VideoAudioReader, - ".csv": PandasCSVReader, - ".epub": EpubReader, - ".md": MarkdownReader, - ".mbox": MboxReader, - ".ipynb": IPYNBReader, - ".json": JSONReader, - ".xls": ExcelReader, - ".xlsx": ExcelReader, - ".ods": OdsReader, - ".tiff": TiffReader, +FILE_READER_CLS: Dict[str, BaseReader] = { + ".hwp": HWPReader(), + ".pdf": PDFReader(), + ".doc": DocxReader(), + ".docx": DocxReader(), + ".pptx": PptxReader(), + ".ppt": PptxReader(), + ".pptm": PptxReader(), + ".jpg": ImageReader(), + ".png": ImageReader(), + ".jpeg": ImageReader(), + ".mp3": VideoAudioReader(), + ".mp4": VideoAudioReader(), + ".csv": PandasCSVReader(), + ".epub": EpubReader(), + ".md": MarkdownReader(), + ".mbox": MboxReader(), + ".ipynb": IPYNBReader(), + ".json": JSONReader(), + ".xls": ExcelReader(), + ".xlsx": ExcelReader(), + ".ods": OdsReader(), + ".tiff": TiffReader(), } @@ -73,8 +73,8 @@ def transform_file_into_documents( def _load_file_to_documents(file_name: str, file_data: Path) -> list[Document]: logger.debug("Transforming file_name=%s into documents", file_name) extension = Path(file_name).suffix - reader_cls = FILE_READER_CLS.get(extension) - if reader_cls is None: + reader = FILE_READER_CLS.get(extension) + if reader is None: logger.debug( "No reader found for extension=%s, using default string reader", extension, @@ -84,7 +84,7 @@ def _load_file_to_documents(file_name: str, file_data: Path) -> list[Document]: return string_reader.load_data([file_data.read_text()]) logger.debug("Specific reader found for extension=%s", extension) - return reader_cls().load_data(file_data) + return reader.load_data(file_data) @staticmethod def _exclude_metadata(documents: list[Document]) -> None: diff --git a/nesis/rag/core/components/ingest/readers.py b/nesis/rag/core/components/ingest/readers.py index af0744b..66b8b44 100644 --- a/nesis/rag/core/components/ingest/readers.py +++ b/nesis/rag/core/components/ingest/readers.py @@ -3,17 +3,29 @@ from pathlib import Path from typing import List, Optional, Dict -from fsspec import AbstractFileSystem -from llama_index.core.readers.base import BaseReader import pandas as pd -from llama_index.core import Document - from PIL import Image, ImageSequence -import os -import os.path -import glob +from fsspec import AbstractFileSystem +from llama_index.core import Document +from llama_index.core.readers.base import BaseReader from llama_index.readers.file import ImageReader +from llama_index.core.readers.base import BaseReader +from llama_index.core.readers.json import JSONReader +from llama_index.readers.file import ( + DocxReader, + EpubReader, + HWPReader, + ImageReader, + IPYNBReader, + MarkdownReader, + MboxReader, + PandasCSVReader, + PDFReader, + PptxReader, + VideoAudioReader, +) # pants: no-infer-dep + class ExcelReader(BaseReader): """ @@ -66,18 +78,19 @@ def load_data( fs: Optional[AbstractFileSystem] = None, ) -> List[Document]: - im = Image.open(file.absolute()) - documents: List[Document] = [] - for idx, page in enumerate(ImageSequence.Iterator(im)): - temp_file_name = tempfile.NamedTemporaryFile( - dir=tempfile.gettempdir(), prefix=f"{file.name.split('.')[0]}-{idx}-" - ) - path = pathlib.Path(temp_file_name.name).with_suffix(".png") + with Image.open(file.absolute()) as image: + documents: List[Document] = [] + for idx, page in enumerate(ImageSequence.Iterator(image)): + with tempfile.NamedTemporaryFile( + dir=tempfile.gettempdir(), + prefix=f"{file.name.split('.')[0]}-{idx}-", + ) as temp_file_name: + path = pathlib.Path(temp_file_name.name).with_suffix(".png") - page.save(path) - page_documents: List[Document] = self._load_page_data( - file=path, extra_info=extra_info, fs=fs - ) - documents += page_documents + page.save(path) + page_documents: List[Document] = self._load_page_data( + file=path, extra_info=extra_info, fs=fs + ) + documents += page_documents return documents diff --git a/nesis/rag/tests/rag/core/components/__init__.py b/nesis/rag/tests/rag/core/components/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/nesis/rag/tests/rag/core/components/test_ingestion_helper.py b/nesis/rag/tests/rag/core/components/test_ingestion_helper.py new file mode 100644 index 0000000..a38ea8d --- /dev/null +++ b/nesis/rag/tests/rag/core/components/test_ingestion_helper.py @@ -0,0 +1,59 @@ +import pathlib + +import pytest +from llama_index.core import Document + +from nesis.rag import tests +from nesis.rag.core.components.ingest.ingest_helper import IngestionHelper + + +@pytest.mark.parametrize( + "file_name, expected", + [ + ("file-sample_150kB.pdf", ["Lorem", "ipsum"]), + ("file-sample_500kB.docx", ["Lorem", "ipsum"]), + ("samplepptx.pptx", None), + ("rfc791.txt", ["INTERNET", "PROTOCOL"]), + ("sales_data_sample.json", None), + ("website-traffic-dashboard.csv", ["web", "traffic", "dashboard"]), + ("website-traffic-dashboard.ods", ["web", "traffic", "dashboard"]), + ("website-traffic-dashboard.xlsx", ["web", "traffic", "dashboard"]), + # ("website-traffic-dashboard.png", ["web", "traffic", "dashboard"]), + ("website-traffic-dashboard.pdf", ["web", "traffic", "dashboard"]), + # ("website-traffic-dashboard.jpg", ["web", "traffic", "dashboard"]), + # ("website-traffic-dashboard.tiff", ["web", "traffic", "dashboard"]), + ("introduction-to-nesis.mp3", ["canada", "england"]), + ("introduction-to-nesis.mp4", ["canada", "england"]), + ], +) +def test_ingestion(file_name: str, expected: list[str]): + """ + Test to ensure we can ingest all files. This test helps make sure we have all the necessary libraries installed. + This test also tests for accuracy of the extracted data. + """ + file_path: pathlib.Path = ( + pathlib.Path(tests.__file__).parent.absolute() / "resources" / file_name + ) + + document_list: list[Document] = IngestionHelper.transform_file_into_documents( + file_name=file_path.name, + file_data=file_path, + metadata={ + "file_name": str(file_path.absolute()), + "datasource": "rfc-documents", + }, + ) + + assert len(document_list) > 0 + if expected: + assert ( + len( + [ + doc + for doc in document_list + for content in expected + if content.lower() in doc.text.lower() + ] + ) + > 0 + ) diff --git a/nesis/rag/tests/rag/core/server/test_ingestion_service.py b/nesis/rag/tests/rag/core/server/test_ingestion_service.py index 3bedbf3..1cdd728 100644 --- a/nesis/rag/tests/rag/core/server/test_ingestion_service.py +++ b/nesis/rag/tests/rag/core/server/test_ingestion_service.py @@ -30,9 +30,6 @@ def settings() -> Settings: "file-sample_500kB.docx", "samplepptx.pptx", "rfc791.txt", - "free-hugs.jpg", - "free-hugs.jpeg", - "free-hugs.png", "sales_data_sample.json", "website-traffic-dashboard.csv", "website-traffic-dashboard.ods", @@ -46,7 +43,8 @@ def settings() -> Settings: ) def test_ingestion_supported(injector, file_name): """ - Test to ensure we can ingest all files. This test helps make sure we have all the necessary libraries installed + Test to ensure we can ingest all files. This test helps make sure we have all the necessary libraries installed. + This test DOES NOT test for accuracy of the extracted data. """ file_path: pathlib.Path = ( pathlib.Path(tests.__file__).parent.absolute() / "resources" / file_name diff --git a/nesis/rag/tests/resources/free-hugs.jpeg b/nesis/rag/tests/resources/free-hugs.jpeg deleted file mode 100644 index 8426608..0000000 Binary files a/nesis/rag/tests/resources/free-hugs.jpeg and /dev/null differ diff --git a/nesis/rag/tests/resources/free-hugs.jpg b/nesis/rag/tests/resources/free-hugs.jpg deleted file mode 100644 index 8426608..0000000 Binary files a/nesis/rag/tests/resources/free-hugs.jpg and /dev/null differ diff --git a/nesis/rag/tests/resources/free-hugs.png b/nesis/rag/tests/resources/free-hugs.png deleted file mode 100644 index 69d3460..0000000 Binary files a/nesis/rag/tests/resources/free-hugs.png and /dev/null differ diff --git a/nesis/rag/tests/resources/introduction-to-nesis.mp3 b/nesis/rag/tests/resources/introduction-to-nesis.mp3 new file mode 100644 index 0000000..26dbad5 Binary files /dev/null and b/nesis/rag/tests/resources/introduction-to-nesis.mp3 differ