Skip to content

Commit

Permalink
refactor(extraction): code lint (#16)
Browse files: browse the repository at this point in the history
* refactor(extraction): code lint

* code lint
  • Loading branch information
guillaume-chervet authored Jun 12, 2023
1 parent e9962ce commit 4299d5b
Showing 1 changed file with 5 additions and 1 deletion.
6 changes: 5 additions & 1 deletion packages/extraction/src/mlopspython_extraction/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,14 @@ def convert_pixmap_to_rgb(pixmap) -> Pixmap:
else:
return fitz.Pixmap(fitz.csRGB, pixmap)


@dataclass
class ImageResult:
    """One image produced by ``extract_images_stream`` for a PDF document.

    Instances are created positionally as
    ``ImageResult(image_bytes_io, index_page, index_image)``, so the field
    order below is part of the interface and must not change.
    """

    # In-memory buffer holding the serialized bytes of the extracted image.
    image_bytes_io: BytesIO
    # Index of the page the image was found on; used to build the output
    # file name ("..._page{index_page}_index{index_image}.png").
    index_page: int
    # Index of the image; used in the output file name.
    # NOTE(review): presumably the position of the image within its page;
    # confirm against the loop in extract_images_stream.
    index_image: int


def extract_images_stream(pdf_bytes) -> Iterable[ImageResult]:
with fitz.open(stream=pdf_bytes, filetype="pdf") as document:
number_pages = len(document) - 1
Expand All @@ -32,6 +34,7 @@ def extract_images_stream(pdf_bytes) -> Iterable[ImageResult]:
image_bytes_io = BytesIO(convert_pixmap_to_rgb(image_pix).tobytes())
yield ImageResult(image_bytes_io, index_page, index_image)


@dataclass
class ExtractImagesResult:
number_files_input: int
Expand All @@ -46,7 +49,8 @@ def extract_images(pdfs_directory_path: str, images_directory_path: str) -> Extr
with open(pdf_path, "rb") as pdf_stream:
pdf_bytes = pdf_stream.read()
for image_stream in extract_images_stream(pdf_bytes):
filename = "{0}_page{1}_index{2}.png".format(pdf_path.stem, str(image_stream.index_page), str(image_stream.index_image))
filename = "{0}_page{1}_index{2}.png".format(pdf_path.stem, str(image_stream.index_page),
str(image_stream.index_image))
number_images_output = number_images_output + 1
with open(Path(images_directory_path) / filename, "wb") as file_stream:
file_stream.write(image_stream.image_bytes_io.getbuffer())
Expand Down

0 comments on commit 4299d5b

Please sign in to comment.