Skip to content

Commit

Permalink
Add custom readers
Browse files Browse the repository at this point in the history
  • Loading branch information
mawandm committed Apr 12, 2024
1 parent c865562 commit e6c9f2b
Show file tree
Hide file tree
Showing 22 changed files with 76,402 additions and 2,833 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ jobs:
password: ${{ secrets.DOCKER_HUB_PASSWORD }}

- name: Build and push Docker image
uses: docker/build-push-action@v3
uses: docker/build-push-action@v5
with:
context: .
push: true
Expand All @@ -49,7 +49,7 @@ jobs:
password: ${{ secrets.DOCKER_HUB_PASSWORD }}

- name: Build and push frontend Docker image
uses: docker/build-push-action@v3
uses: docker/build-push-action@v5
with:
context: .
push: true
Expand Down Expand Up @@ -77,7 +77,7 @@ jobs:
password: ${{ secrets.DOCKER_HUB_PASSWORD }}

- name: Build and push RAG docker image
uses: docker/build-push-action@v3
uses: docker/build-push-action@v5
with:
context: .
push: true
Expand Down
8 changes: 6 additions & 2 deletions nesis/rag/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
FROM python:3.11-buster as build
COPY nesis/rag/requirements.txt /app/nesis/rag/requirements.txt
COPY nesis/rag/requirements-huggingface.txt /app/nesis/rag/requirements-huggingface.txt
COPY nesis/rag/requirements-torch-cpu-x86.txt /app/nesis/rag/requirements-torch-cpu-x86.txt

# Create the virtualenv and install every requirement set into it.
# All requirement files are referenced by the absolute path they were copied to above:
# no WORKDIR is set in this stage, so a bare file name would not resolve.
RUN apt-get update \
    && python -m venv /app/.venv \
    && /app/.venv/bin/pip install -r /app/nesis/rag/requirements.txt \
    -r /app/nesis/rag/requirements-torch-cpu-x86.txt -r /app/nesis/rag/requirements-huggingface.txt \
    --default-timeout=1200



ARG NESIS_VERSION
FROM python:3.11.6-slim-bookworm
# Install ffmpeg and create the unprivileged nesis user.
# -y is required because the image build has no terminal to confirm the install prompt;
# the apt cache is cleaned after the install so the downloaded archives do not stay in the layer.
RUN apt-get update \
    && apt-get install -y ffmpeg \
    && apt-get clean \
    && adduser --system --home /app --shell /bin/bash nesis

WORKDIR /app

Expand Down
7 changes: 6 additions & 1 deletion nesis/rag/core/components/ingest/ingest_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
VideoAudioReader,
) # pants: no-infer-dep

from nesis.rag.core.components.ingest.readers import ExcelReader, TiffReader, OdsReader

logger = logging.getLogger(__name__)


Expand All @@ -42,7 +44,10 @@
".mbox": MboxReader,
".ipynb": IPYNBReader,
".json": JSONReader,
".xlsx": JSONReader,
".xls": ExcelReader,
".xlsx": ExcelReader,
".ods": OdsReader,
".tiff": TiffReader,
}


Expand Down
83 changes: 83 additions & 0 deletions nesis/rag/core/components/ingest/readers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import pathlib
import tempfile
from pathlib import Path
from typing import List, Optional, Dict

from fsspec import AbstractFileSystem
from llama_index.core.readers.base import BaseReader
import pandas as pd
from llama_index.core import Document

from PIL import Image, ImageSequence
import os
import os.path
import glob
from llama_index.readers.file import ImageReader


class ExcelReader(BaseReader):
    """
    Reader for MS Excel workbooks, backed by pandas.

    The workbook is loaded with ``pandas.read_excel`` (default arguments) and the resulting
    frame is rendered to plain text.
    """

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """
        Render the spreadsheet at ``file`` as a single text document.

        :param file: path of the workbook to read
        :param extra_info: optional metadata for the document; an empty dict is used when falsy
        :param fs: unused by this reader
        :return: a one-element list holding the rendered document
        """
        workbook_path = file.absolute()
        frame = pd.read_excel(workbook_path)
        rendered = frame.to_string()
        metadata = extra_info or {}
        return [Document(text=rendered, metadata=metadata)]


class OdsReader(BaseReader):
    """
    Reader for OpenDocument spreadsheets (.ods), backed by pandas with the ``odf`` engine.
    """

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """
        Render the spreadsheet at ``file`` as a single text document.

        :param file: path of the spreadsheet to read
        :param extra_info: optional metadata for the document; an empty dict is used when falsy
        :param fs: unused by this reader
        :return: a one-element list holding the rendered document
        """
        sheet_path = file.absolute()
        frame = pd.read_excel(sheet_path, engine="odf")
        rendered = frame.to_string()
        metadata = extra_info or {}
        return [Document(text=rendered, metadata=metadata)]


class TiffReader(BaseReader):
    """
    A simple tiff file reader. Each page is written out as a png and then handed to an image
    reader, which converts it into llama-index documents.
    """

    @staticmethod
    def _load_page_data(
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """
        Load a single page image through a fresh ``ImageReader``.

        :param file: path of the page image
        :param extra_info: optional metadata forwarded to the image reader
        :param fs: optional filesystem forwarded to the image reader
        :return: the documents produced by the image reader for this page
        """
        return ImageReader().load_data(file.absolute(), extra_info=extra_info, fs=fs)

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """
        Split the tiff at ``file`` into pages and load every page as an image.

        :param file: path of the tiff file
        :param extra_info: optional metadata forwarded for every page
        :param fs: optional filesystem forwarded to the image reader
        :return: the documents of all pages, in page order
        """
        documents: List[Document] = []

        # The with-block closes the tiff file handle once every page has been processed,
        # instead of leaving it open until the image object is garbage collected.
        with Image.open(file.absolute()) as im:
            for idx, page in enumerate(ImageSequence.Iterator(im)):
                # The temporary file only reserves a unique name; it is closed (and thereby
                # removed) as soon as the page has been written next to it with a .png suffix.
                with tempfile.NamedTemporaryFile(
                    dir=tempfile.gettempdir(),
                    prefix=f"{file.name.split('.')[0]}-{idx}-",
                ) as placeholder:
                    path = pathlib.Path(placeholder.name).with_suffix(".png")
                    page.save(path)

                # NOTE(review): the png is deliberately left on disk, as before. The image
                # reader may record its path in the returned documents - confirm nothing
                # downstream needs the file before adding cleanup here.
                documents += self._load_page_data(
                    file=path, extra_info=extra_info, fs=fs
                )

        return documents
2 changes: 1 addition & 1 deletion nesis/rag/requirements-huggingface.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Use pytorch@cpu
torch @ https://download.pytorch.org/whl/cpu/torch-2.2.2%2Bcpu-cp311-cp311-linux_x86_64.whl
llama-index-embeddings-huggingface==0.1.3

3 changes: 2 additions & 1 deletion nesis/rag/requirements-test.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pylint==2.13.8
pytest==8.1.1
coverage==7.4.4
coverage==7.4.4

3 changes: 3 additions & 0 deletions nesis/rag/requirements-torch-cpu-x86.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Use pytorch@cpu
torch @ https://download.pytorch.org/whl/cpu/torch-2.2.2%2Bcpu-cp311-cp311-linux_x86_64.whl

5 changes: 5 additions & 0 deletions nesis/rag/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,8 @@ boto3==1.34.75
python-multipart==0.0.9
python_pptx==0.6.23
docx2txt==0.8
openpyxl==3.1.2
openai-whisper @ git+https://github.com/openai/whisper.git
pydub==0.25.1
ffprobe==0.5
odfpy==1.4.1
14 changes: 13 additions & 1 deletion nesis/rag/tests/rag/core/server/test_ingestion_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,21 @@ def settings() -> Settings:
"file_name",
[
"file-sample_150kB.pdf",
# "file-sample_100kB.doc",
"file-sample_500kB.docx",
"samplepptx.pptx",
"rfc791.txt",
"free-hugs.jpg",
"free-hugs.jpeg",
"free-hugs.png",
"sales_data_sample.json",
"website-traffic-dashboard.csv",
"website-traffic-dashboard.ods",
"website-traffic-dashboard.xlsx",
"website-traffic-dashboard.png",
"website-traffic-dashboard.pdf",
"website-traffic-dashboard.jpg",
"website-traffic-dashboard.tiff",
"introduction-to-nesis.mp4",
],
)
def test_ingestion_supported(injector, file_name):
Expand Down
Binary file added nesis/rag/tests/resources/free-hugs.jpeg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added nesis/rag/tests/resources/free-hugs.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added nesis/rag/tests/resources/free-hugs.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit e6c9f2b

Please sign in to comment.