Skip to content

Commit

Permalink
Add ingestion helper test
Browse files Browse the repository at this point in the history
  • Loading branch information
mawandm committed Apr 13, 2024
1 parent d79db1a commit 69d9416
Show file tree
Hide file tree
Showing 9 changed files with 119 additions and 49 deletions.
52 changes: 26 additions & 26 deletions nesis/rag/core/components/ingest/ingest_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,29 +25,29 @@
logger = logging.getLogger(__name__)


FILE_READER_CLS: Dict[str, Type[BaseReader]] = {
".hwp": HWPReader,
".pdf": PDFReader,
".doc": DocxReader,
".docx": DocxReader,
".pptx": PptxReader,
".ppt": PptxReader,
".pptm": PptxReader,
".jpg": ImageReader,
".png": ImageReader,
".jpeg": ImageReader,
".mp3": VideoAudioReader,
".mp4": VideoAudioReader,
".csv": PandasCSVReader,
".epub": EpubReader,
".md": MarkdownReader,
".mbox": MboxReader,
".ipynb": IPYNBReader,
".json": JSONReader,
".xls": ExcelReader,
".xlsx": ExcelReader,
".ods": OdsReader,
".tiff": TiffReader,
# Maps a file extension (leading dot included) to the reader used to load that
# kind of file. _load_file_to_documents looks up Path(file_name).suffix in this
# mapping as-is, so only the exact spellings listed here are matched; any other
# extension falls back to the default string reader.
#
# The values are reader *instances*, built once when this module is imported
# and then reused for every load_data() call.
# NOTE(review): some of these constructors may do non-trivial work at import
# time, and the shared instances are presumably stateless across calls -
# confirm both against the reader implementations.
FILE_READER_CLS: Dict[str, BaseReader] = {
".hwp": HWPReader(),
".pdf": PDFReader(),
".doc": DocxReader(),
".docx": DocxReader(),
".pptx": PptxReader(),
".ppt": PptxReader(),
".pptm": PptxReader(),
".jpg": ImageReader(),
".png": ImageReader(),
".jpeg": ImageReader(),
".mp3": VideoAudioReader(),
".mp4": VideoAudioReader(),
".csv": PandasCSVReader(),
".epub": EpubReader(),
".md": MarkdownReader(),
".mbox": MboxReader(),
".ipynb": IPYNBReader(),
".json": JSONReader(),
".xls": ExcelReader(),
".xlsx": ExcelReader(),
".ods": OdsReader(),
".tiff": TiffReader(),
}


Expand All @@ -73,8 +73,8 @@ def transform_file_into_documents(
def _load_file_to_documents(file_name: str, file_data: Path) -> list[Document]:
logger.debug("Transforming file_name=%s into documents", file_name)
extension = Path(file_name).suffix
reader_cls = FILE_READER_CLS.get(extension)
if reader_cls is None:
reader = FILE_READER_CLS.get(extension)
if reader is None:
logger.debug(
"No reader found for extension=%s, using default string reader",
extension,
Expand All @@ -84,7 +84,7 @@ def _load_file_to_documents(file_name: str, file_data: Path) -> list[Document]:
return string_reader.load_data([file_data.read_text()])

logger.debug("Specific reader found for extension=%s", extension)
return reader_cls().load_data(file_data)
return reader.load_data(file_data)

@staticmethod
def _exclude_metadata(documents: list[Document]) -> None:
Expand Down
51 changes: 32 additions & 19 deletions nesis/rag/core/components/ingest/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,29 @@
from pathlib import Path
from typing import List, Optional, Dict

from fsspec import AbstractFileSystem
from llama_index.core.readers.base import BaseReader
import pandas as pd
from llama_index.core import Document

from PIL import Image, ImageSequence
import os
import os.path
import glob
from fsspec import AbstractFileSystem
from llama_index.core import Document
from llama_index.core.readers.base import BaseReader
from llama_index.readers.file import ImageReader

from llama_index.core.readers.base import BaseReader
from llama_index.core.readers.json import JSONReader
from llama_index.readers.file import (
DocxReader,
EpubReader,
HWPReader,
ImageReader,
IPYNBReader,
MarkdownReader,
MboxReader,
PandasCSVReader,
PDFReader,
PptxReader,
VideoAudioReader,
) # pants: no-infer-dep


class ExcelReader(BaseReader):
"""
Expand Down Expand Up @@ -66,18 +78,19 @@ def load_data(
fs: Optional[AbstractFileSystem] = None,
) -> List[Document]:

im = Image.open(file.absolute())
documents: List[Document] = []
for idx, page in enumerate(ImageSequence.Iterator(im)):
temp_file_name = tempfile.NamedTemporaryFile(
dir=tempfile.gettempdir(), prefix=f"{file.name.split('.')[0]}-{idx}-"
)
path = pathlib.Path(temp_file_name.name).with_suffix(".png")
with Image.open(file.absolute()) as image:
documents: List[Document] = []
for idx, page in enumerate(ImageSequence.Iterator(image)):
with tempfile.NamedTemporaryFile(
dir=tempfile.gettempdir(),
prefix=f"{file.name.split('.')[0]}-{idx}-",
) as temp_file_name:
path = pathlib.Path(temp_file_name.name).with_suffix(".png")

page.save(path)
page_documents: List[Document] = self._load_page_data(
file=path, extra_info=extra_info, fs=fs
)
documents += page_documents
page.save(path)
page_documents: List[Document] = self._load_page_data(
file=path, extra_info=extra_info, fs=fs
)
documents += page_documents

return documents
Empty file.
59 changes: 59 additions & 0 deletions nesis/rag/tests/rag/core/components/test_ingestion_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import pathlib

import pytest
from llama_index.core import Document

from nesis.rag import tests
from nesis.rag.core.components.ingest.ingest_helper import IngestionHelper


@pytest.mark.parametrize(
    "file_name, expected",
    [
        ("file-sample_150kB.pdf", ["Lorem", "ipsum"]),
        ("file-sample_500kB.docx", ["Lorem", "ipsum"]),
        ("samplepptx.pptx", None),
        ("rfc791.txt", ["INTERNET", "PROTOCOL"]),
        ("sales_data_sample.json", None),
        ("website-traffic-dashboard.csv", ["web", "traffic", "dashboard"]),
        ("website-traffic-dashboard.ods", ["web", "traffic", "dashboard"]),
        ("website-traffic-dashboard.xlsx", ["web", "traffic", "dashboard"]),
        # NOTE(review): the image cases below are disabled; the reason is not
        # visible from this file - confirm before re-enabling.
        # ("website-traffic-dashboard.png", ["web", "traffic", "dashboard"]),
        ("website-traffic-dashboard.pdf", ["web", "traffic", "dashboard"]),
        # ("website-traffic-dashboard.jpg", ["web", "traffic", "dashboard"]),
        # ("website-traffic-dashboard.tiff", ["web", "traffic", "dashboard"]),
        ("introduction-to-nesis.mp3", ["canada", "england"]),
        ("introduction-to-nesis.mp4", ["canada", "england"]),
    ],
)
def test_ingestion(file_name: str, expected: list[str]):
    """
    Test to ensure we can ingest all files. This test helps make sure we have
    all the necessary libraries installed.
    This test also tests for accuracy of the extracted data.

    :param file_name: name of a file in the ``resources`` directory that sits
        next to the ``nesis.rag.tests`` package module.
    :param expected: keywords to look for in the extracted text. The check
        passes when at least one keyword occurs (case-insensitively) in at
        least one extracted document. Pass ``None`` to skip the content check
        and only verify that some document was produced.
    """
    file_path: pathlib.Path = (
        pathlib.Path(tests.__file__).parent.absolute() / "resources" / file_name
    )

    document_list: list[Document] = IngestionHelper.transform_file_into_documents(
        file_name=file_path.name,
        file_data=file_path,
        metadata={
            "file_name": str(file_path.absolute()),
            "datasource": "rfc-documents",
        },
    )

    assert len(document_list) > 0, f"No documents were extracted from {file_name}"

    if expected:
        # any() stops at the first hit instead of materialising every
        # (document, keyword) match just to count them.
        assert any(
            content.lower() in doc.text.lower()
            for doc in document_list
            for content in expected
        ), f"None of {expected} found in the documents extracted from {file_name}"
6 changes: 2 additions & 4 deletions nesis/rag/tests/rag/core/server/test_ingestion_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,6 @@ def settings() -> Settings:
"file-sample_500kB.docx",
"samplepptx.pptx",
"rfc791.txt",
"free-hugs.jpg",
"free-hugs.jpeg",
"free-hugs.png",
"sales_data_sample.json",
"website-traffic-dashboard.csv",
"website-traffic-dashboard.ods",
Expand All @@ -46,7 +43,8 @@ def settings() -> Settings:
)
def test_ingestion_supported(injector, file_name):
"""
Test to ensure we can ingest all files. This test helps make sure we have all the necessary libraries installed
Test to ensure we can ingest all files. This test helps make sure we have all the necessary libraries installed.
This test DOES NOT test for accuracy of the extracted data.
"""
file_path: pathlib.Path = (
pathlib.Path(tests.__file__).parent.absolute() / "resources" / file_name
Expand Down
Binary file removed nesis/rag/tests/resources/free-hugs.jpeg
Binary file not shown.
Binary file removed nesis/rag/tests/resources/free-hugs.jpg
Binary file not shown.
Binary file removed nesis/rag/tests/resources/free-hugs.png
Binary file not shown.
Binary file not shown.

0 comments on commit 69d9416

Please sign in to comment.