Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: upgrade endpoint to docling v2 #13

Merged
merged 2 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Containerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ RUN if [ "$CPU_ONLY" = "true" ]; then \
ENV HF_HOME=/tmp/
ENV TORCH_HOME=/tmp/

RUN poetry run python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
RUN poetry run python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);'

# On container environments, always set a thread budget to avoid undesired thread congestion.
ENV OMP_NUM_THREADS=4
Expand All @@ -29,4 +29,4 @@ COPY ./docling_serve /docling-serve/docling_serve

EXPOSE 5000

CMD ["poetry", "run", "uvicorn", "--port", "5000", "docling_serve.app:app"]
CMD ["poetry", "run", "uvicorn", "--port", "5000", "--host", "0.0.0.0", "docling_serve.app:app"]
245 changes: 214 additions & 31 deletions docling_serve/app.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,55 @@
import base64
import hashlib
from contextlib import asynccontextmanager
from enum import Enum
from io import BytesIO
from pathlib import Path
from typing import Any, Dict, Union
from typing import Any, Dict, List, Optional, Tuple, Union

import httpx
from docling.datamodel.base_models import (
ConversionStatus,
DocumentStream,
PipelineOptions,
ErrorItem,
InputFormat,
)
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrOptions,
PdfPipelineOptions,
RapidOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.utils.profiling import ProfilingItem
from docling_core.types.doc import DoclingDocument, ImageRefMode
from docling_core.utils.file import resolve_remote_filename
from fastapi import FastAPI, HTTPException, Response
from pydantic import AnyHttpUrl, BaseModel


# TODO: import enum from Docling, once it is exposed
class OcrEngine(str, Enum):
    """OCR backends a client may select through ``ConvertOptions.ocr_engine``.

    Members also subclass ``str``, so each one compares equal to its plain
    string value.
    """

    EASYOCR = "easyocr"
    TESSERACT = "tesseract"
    RAPIDOCR = "rapidocr"


class ConvertOptions(BaseModel):
    """Per-request switches controlling conversion and the returned formats."""

    # Which representations ``convert_document`` puts into the response.
    output_docling_document: bool = True
    output_markdown: bool = False
    output_html: bool = False

    # Forwarded to ``PdfPipelineOptions.do_ocr``.
    do_ocr: bool = True
    # Selects which OCR options class is instantiated for the pipeline.
    ocr_engine: OcrEngine = OcrEngine.EASYOCR
    # When not None, assigned to the OCR options' ``lang``; None leaves the
    # engine's own default untouched.
    ocr_lang: Optional[List[str]] = None
    # Forwarded as ``force_full_page_ocr`` to the selected OCR options.
    force_ocr: bool = False

    # Forwarded to ``PdfPipelineOptions.do_table_structure``.
    do_table_structure: bool = True

    # Drives both ``generate_page_images`` and ``generate_picture_images``,
    # and decides whether exports embed images or emit placeholders.
    include_images: bool = True
    # Forwarded to ``PdfPipelineOptions.images_scale``.
    images_scale: float = 2.0


from docling_serve.settings import Settings
class DocumentConvertBase(BaseModel):
    """Base for conversion request bodies; carries the shared ``options``."""

    # Falls back to a ConvertOptions with every field at its default.
    options: ConvertOptions = ConvertOptions()


class HttpSource(BaseModel):
Expand All @@ -28,37 +62,124 @@ class FileSource(BaseModel):
filename: str


class ConvertDocumentHttpSourceRequest(BaseModel):
class ConvertDocumentHttpSourceRequest(DocumentConvertBase):
http_source: HttpSource


class ConvertDocumentFileSourceRequest(BaseModel):
class ConvertDocumentFileSourceRequest(DocumentConvertBase):
file_source: FileSource


class DocumentResponse(BaseModel):
    """Converted document in the representations the request asked for.

    Each field stays ``None`` unless the matching ``output_*`` option was
    set on the request (see ``convert_document``).
    """

    markdown: Optional[str] = None
    docling_document: Optional[DoclingDocument] = None
    html: Optional[str] = None


class ConvertDocumentResponse(BaseModel):
content_md: str
document: DocumentResponse
status: ConversionStatus
errors: List[ErrorItem] = []
timings: Dict[str, ProfilingItem] = {}


class ConvertDocumentErrorResponse(BaseModel):
    """Error payload model that currently carries only the conversion status.

    NOTE(review): not referenced by any endpoint in the visible part of this
    file - confirm whether it is still needed.
    """

    status: ConversionStatus
    # errors: List[ErrorItem] = []


# Request body accepted by the conversion endpoints: either a file-source
# request or an HTTP-source request.
ConvertDocumentRequest = Union[
ConvertDocumentFileSourceRequest, ConvertDocumentHttpSourceRequest
]


models = {}
class MarkdownTextResponse(Response):
    """``Response`` subclass whose media type is ``text/markdown``."""

    media_type = "text/markdown"


class HealthCheckResponse(BaseModel):
    """Payload returned by the ``/health`` endpoint."""

    status: str = "ok"


def _ocr_engine_unavailable(engine: OcrEngine) -> HTTPException:
    """Build the HTTP 400 error used when *engine*'s package cannot be imported."""
    return HTTPException(
        status_code=400,
        detail="The requested OCR engine"
        f" (ocr_engine={engine.value})"
        " is not available on this system. Please choose another OCR engine "
        "or contact your system administrator.",
    )


def get_pdf_pipeline_opts(options: ConvertOptions) -> Tuple[PdfPipelineOptions, str]:
    """Translate request options into PDF pipeline options plus a cache key.

    Args:
        options: The conversion options supplied with the request.

    Returns:
        A ``(pipeline_options, options_hash)`` tuple, where ``options_hash``
        is the SHA-1 hex digest of the pipeline options' JSON dump.

    Raises:
        HTTPException: 400 if the package backing the requested OCR engine
            cannot be imported.
        RuntimeError: If ``options.ocr_engine`` is not a handled engine.
    """
    engine = options.ocr_engine
    ocr_options: OcrOptions

    # Each branch imports the backing package purely as an availability
    # probe, so a missing engine becomes a client-facing 400 here rather than
    # a failure later inside the conversion pipeline.
    if engine == OcrEngine.EASYOCR:
        try:
            import easyocr  # noqa: F401
        except ImportError as exc:
            raise _ocr_engine_unavailable(engine) from exc
        ocr_options = EasyOcrOptions(force_full_page_ocr=options.force_ocr)
    elif engine == OcrEngine.TESSERACT:
        try:
            import tesserocr  # noqa: F401
        except ImportError as exc:
            raise _ocr_engine_unavailable(engine) from exc
        ocr_options = TesseractOcrOptions(force_full_page_ocr=options.force_ocr)
    elif engine == OcrEngine.RAPIDOCR:
        try:
            from rapidocr_onnxruntime import RapidOCR  # noqa: F401
        except ImportError as exc:
            raise _ocr_engine_unavailable(engine) from exc
        ocr_options = RapidOcrOptions(force_full_page_ocr=options.force_ocr)
    else:
        raise RuntimeError(f"Unexpected OCR engine type {options.ocr_engine}")

    # Only override the language list when the caller supplied one.
    if options.ocr_lang is not None:
        ocr_options.lang = options.ocr_lang

    pipeline_options = PdfPipelineOptions(
        do_ocr=options.do_ocr,
        ocr_options=ocr_options,
        do_table_structure=options.do_table_structure,
        generate_page_images=options.include_images,
        generate_picture_images=options.include_images,
        images_scale=options.images_scale,
    )

    # The digest is only a lookup key for the ``converters`` cache, so equal
    # option sets reuse one DocumentConverter; it is not used for security.
    options_hash = hashlib.sha1(pipeline_options.model_dump_json().encode()).hexdigest()

    return pipeline_options, options_hash


# Cache of DocumentConverter instances keyed by the options hash returned by
# get_pdf_pipeline_opts. The default-options converter is added in lifespan;
# others are created on first use in _convert_document. Cleared on shutdown.
converters: Dict[str, DocumentConverter] = {}


@asynccontextmanager
async def lifespan(app: FastAPI):
# Converter
settings = Settings()
pipeline_options = PipelineOptions()
pipeline_options.do_ocr = settings.do_ocr
pipeline_options.do_table_structure = settings.do_table_structure
models["converter"] = DocumentConverter(pipeline_options=pipeline_options)
# settings = Settings()

# Converter with default options
pipeline_options, options_hash = get_pdf_pipeline_opts(ConvertOptions())
converters[options_hash] = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
}
)

converters[options_hash].initialize_pipeline(InputFormat.PDF)

yield

models.clear()
converters.clear()


app = FastAPI(
Expand All @@ -67,10 +188,14 @@ async def lifespan(app: FastAPI):
)


@app.post("/convert")
def convert_pdf_document(
@app.get("/health")
def health() -> HealthCheckResponse:
    """Return a ``HealthCheckResponse`` with its default ``status``."""
    return HealthCheckResponse()


def _convert_document(
body: ConvertDocumentRequest,
) -> ConvertDocumentResponse:
) -> ConversionResult:

filename: str
buf: BytesIO
Expand All @@ -81,16 +206,74 @@ def convert_pdf_document(
elif isinstance(body, ConvertDocumentHttpSourceRequest):
http_res = httpx.get(body.http_source.url, headers=body.http_source.headers)
buf = BytesIO(http_res.content)
filename = Path(
body.http_source.url
).name # TODO: use better way to detect filename, e.g. from Content-Disposition
filename = resolve_remote_filename(
http_url=AnyHttpUrl(body.http_source.url),
response_headers=dict(**http_res.headers),
)

doc_input = DocumentStream(name=filename, stream=buf)

pipeline_options, options_hash = get_pdf_pipeline_opts(body.options)
if options_hash not in converters:
converters[options_hash] = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
}
)

result: ConversionResult = converters[options_hash].convert(doc_input)

if result is None or result.status == ConversionStatus.SKIPPED:
raise HTTPException(status_code=400, detail=result.errors)

if result is None or result.status not in {
ConversionStatus.SUCCESS,
}:
raise HTTPException(
status_code=500, detail={"errors": result.errors, "status": result.status}
)

return result

docs_input = DocumentConversionInput.from_streams(
[DocumentStream(filename=filename, stream=buf)]

@app.post(
    "/convert",
)
def convert_document(
    body: ConvertDocumentRequest,
) -> ConvertDocumentResponse:
    """Convert the requested document and return the selected representations.

    Args:
        body: File-source or HTTP-source request, including ``options``.

    Returns:
        A ``ConvertDocumentResponse`` with the conversion status, timings and
        a ``DocumentResponse`` holding only the requested output formats.

    Raises:
        HTTPException: Propagated from ``_convert_document`` when the
            conversion is skipped or does not succeed.
    """
    result = _convert_document(body=body)

    # Images are embedded in exports only when the request asked for them.
    image_mode = (
        ImageRefMode.EMBEDDED
        if body.options.include_images
        else ImageRefMode.PLACEHOLDER
    )

    doc_resp = DocumentResponse()
    if body.options.output_docling_document:
        doc_resp.docling_document = result.document
    if body.options.output_markdown:
        doc_resp.markdown = result.document.export_to_markdown(image_mode=image_mode)
    if body.options.output_html:
        doc_resp.html = result.document.export_to_html(image_mode=image_mode)

    return ConvertDocumentResponse(
        document=doc_resp, status=result.status, timings=result.timings
    )
result: ConversionResult = next(models["converter"].convert(docs_input), None)

if result is None or result.status != ConversionStatus.SUCCESS:
raise HTTPException(status_code=500, detail={"errors": result.errors})

return ConvertDocumentResponse(content_md=result.render_as_markdown())
@app.post("/convert/markdown", response_class=MarkdownTextResponse)
def convert_document_md(
    body: ConvertDocumentRequest,
) -> MarkdownTextResponse:
    """Convert the requested document and return it as a Markdown response.

    Args:
        body: File-source or HTTP-source request, including ``options``.

    Returns:
        A ``MarkdownTextResponse`` whose body is the Markdown export.

    Raises:
        HTTPException: Propagated from ``_convert_document`` when the
            conversion is skipped or does not succeed.
    """
    result = _convert_document(body=body)

    # Images are embedded in the export only when the request asked for them.
    image_mode = (
        ImageRefMode.EMBEDDED
        if body.options.include_images
        else ImageRefMode.PLACEHOLDER
    )
    return MarkdownTextResponse(
        result.document.export_to_markdown(image_mode=image_mode)
    )
2 changes: 0 additions & 2 deletions docling_serve/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,5 @@


class Settings(BaseSettings):
do_ocr: bool = True
do_table_structure: bool = True

model_config = SettingsConfigDict(env_prefix="DOCLING_")
Loading
Loading