-
Notifications
You must be signed in to change notification settings - Fork 21
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
22 changed files
with
76,402 additions
and
2,833 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
import pathlib | ||
import tempfile | ||
from pathlib import Path | ||
from typing import List, Optional, Dict | ||
|
||
from fsspec import AbstractFileSystem | ||
from llama_index.core.readers.base import BaseReader | ||
import pandas as pd | ||
from llama_index.core import Document | ||
|
||
from PIL import Image, ImageSequence | ||
import os | ||
import os.path | ||
import glob | ||
from llama_index.readers.file import ImageReader | ||
|
||
|
||
class ExcelReader(BaseReader):
    """
    Reader for MS Excel files that renders the sheet as plain text via pandas
    """

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """
        Read ``file`` with ``pd.read_excel`` (pandas defaults) and wrap its text rendering in one document.

        :param file: path of the spreadsheet; it is resolved to an absolute path before reading
        :param extra_info: optional metadata attached to the document; an empty dict is used when falsy
        :param fs: accepted for interface compatibility; not used by this reader
        :return: a single-element list holding the document
        """
        frame = pd.read_excel(file.absolute())
        rendered = frame.to_string()
        metadata = extra_info or {}
        return [Document(text=rendered, metadata=metadata)]
|
||
|
||
class OdsReader(BaseReader):
    """
    Reader for open document spreadsheets that renders the sheet as plain text via pandas
    """

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """
        Read ``file`` with ``pd.read_excel`` using the ``odf`` engine and wrap its text rendering in one document.

        :param file: path of the spreadsheet; it is resolved to an absolute path before reading
        :param extra_info: optional metadata attached to the document; an empty dict is used when falsy
        :param fs: accepted for interface compatibility; not used by this reader
        :return: a single-element list holding the document
        """
        frame = pd.read_excel(file.absolute(), engine="odf")
        rendered = frame.to_string()
        metadata = extra_info or {}
        return [Document(text=rendered, metadata=metadata)]
|
||
|
||
class TiffReader(BaseReader):
    """
    A simple tiff file reader. Converts each page into a png and then uses an image reader to convert the pages
    into llama-index documents
    """

    @staticmethod
    def _load_page_data(
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """
        Load one page image through ``ImageReader``.

        :param file: path of the page image; it is resolved to an absolute path before loading
        :param extra_info: optional metadata forwarded to the image reader
        :param fs: optional filesystem forwarded to the image reader
        :return: the documents produced by ``ImageReader().load_data``
        """
        return ImageReader().load_data(file.absolute(), extra_info=extra_info, fs=fs)

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """
        Split a tiff into pages, save every page as a png in the temp directory and load each png as documents.

        :param file: path of the tiff file; it is opened from the local filesystem
        :param extra_info: optional metadata forwarded to every page load
        :param fs: optional filesystem forwarded to the page loader
        :return: the documents of all pages, in page order
        """
        documents: List[Document] = []
        # Same naming as before: everything in the file name up to the first dot.
        page_prefix = file.name.split(".")[0]

        # The context manager closes the source image's file handle even if a page fails.
        with Image.open(file.absolute()) as im:
            for idx, page in enumerate(ImageSequence.Iterator(im)):
                # mkstemp reserves the unique ".png" path itself, so no throwaway temp file is
                # needed just to borrow a name; the descriptor is closed immediately because
                # the page is written through the path.
                fd, png_name = tempfile.mkstemp(
                    suffix=".png",
                    prefix=f"{page_prefix}-{idx}-",
                    dir=tempfile.gettempdir(),
                )
                os.close(fd)
                path = pathlib.Path(png_name)

                page.save(path)
                # NOTE(review): the png files are intentionally left on disk, as before; the
                # returned documents may refer to them by path - confirm before adding cleanup.
                # NOTE(review): `fs` is forwarded although the png lives on the local
                # filesystem - confirm this is intended for non-local filesystems.
                page_documents: List[Document] = self._load_page_data(
                    file=path, extra_info=extra_info, fs=fs
                )
                documents += page_documents

        return documents
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
# Use pytorch@cpu | ||
torch @ https://download.pytorch.org/whl/cpu/torch-2.2.2%2Bcpu-cp311-cp311-linux_x86_64.whl | ||
llama-index-embeddings-huggingface==0.1.3 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
pylint==2.13.8 | ||
pytest==8.1.1 | ||
coverage==7.4.4 | ||
coverage==7.4.4 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Use pytorch@cpu | ||
torch @ https://download.pytorch.org/whl/cpu/torch-2.2.2%2Bcpu-cp311-cp311-linux_x86_64.whl | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Oops, something went wrong.