Skip to content

Commit

Permalink
update cli
Browse files (browse the repository at this point in the history)
  • Loading branch information
myhloli committed Jun 4, 2024
1 parent bc05526 commit 88f2245
Showing 1 changed file with 57 additions and 52 deletions.
109 changes: 57 additions & 52 deletions magic_pdf/cli/magicpdf.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -27,6 +27,7 @@
import click
from loguru import logger
from pathlib import Path
from magic_pdf.libs.version import __version__

from magic_pdf.libs.MakeContentConfig import DropMode
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
Expand All @@ -52,7 +53,7 @@ def prepare_env(pdf_file_name, method):
get_local_dir(), "magic-pdf", pdf_file_name, method
)

local_image_dir = os.path.join(local_parent_dir, "images")
local_image_dir = os.path.join(str(local_parent_dir), "images")
local_md_dir = local_parent_dir
os.makedirs(local_image_dir, exist_ok=True)
os.makedirs(local_md_dir, exist_ok=True)
Expand Down Expand Up @@ -102,6 +103,7 @@ def cli():


@cli.command()
@click.version_option(__version__, "--version", "-v", help="显示版本信息")
@click.option("--json", type=str, help="输入一个S3路径")
@click.option(
"--method",
Expand Down Expand Up @@ -158,63 +160,65 @@ def read_s3_path(s3path):
)


@cli.command()
@click.option("--local_json", type=str, help="输入一个本地jsonl路径")
@click.option(
"--method",
type=parse_pdf_methods,
help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
default="auto",
)
def local_json_command(local_json, method):
def read_s3_path(s3path):
bucket, key = parse_s3path(s3path)
@cli.command()
@click.version_option(__version__, "--version", "-v", help="显示版本信息")
@click.option("--local_json", type=str, help="输入一个本地jsonl路径")
@click.option(
"--method",
type=parse_pdf_methods,
help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
default="auto",
)
def local_json_command(local_json, method):
def read_s3_path(s3path):
bucket, key = parse_s3path(s3path)

s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
s3_rw = S3ReaderWriter(
s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
)
may_range_params = parse_s3_range_params(s3path)
if may_range_params is None or 2 != len(may_range_params):
byte_start, byte_end = 0, None
else:
byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
byte_end += byte_start - 1
return s3_rw.read_jsonl(
remove_non_official_s3_args(s3path),
byte_start,
byte_end,
AbsReaderWriter.MODE_BIN,
s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
s3_rw = S3ReaderWriter(
s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
)
may_range_params = parse_s3_range_params(s3path)
if may_range_params is None or 2 != len(may_range_params):
byte_start, byte_end = 0, None
else:
byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
byte_end += byte_start - 1
return s3_rw.read_jsonl(
remove_non_official_s3_args(s3path),
byte_start,
byte_end,
AbsReaderWriter.MODE_BIN,
)

with open(local_json, "r", encoding="utf-8") as f:
for json_line in f:
jso = json_parse.loads(json_line)

s3_file_path = jso.get("file_location")
if s3_file_path is None:
s3_file_path = jso.get("path")
pdf_file_name = Path(s3_file_path).stem
pdf_data = read_s3_path(s3_file_path)
local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)

local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
local_md_dir
)

with open(local_json, "r", encoding="utf-8") as f:
for json_line in f:
jso = json_parse.loads(json_line)

s3_file_path = jso.get("file_location")
if s3_file_path is None:
s3_file_path = jso.get("path")
pdf_file_name = Path(s3_file_path).stem
pdf_data = read_s3_path(s3_file_path)
local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)

local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
local_md_dir
)

_do_parse(
pdf_file_name,
pdf_data,
jso["doc_layout_result"],
method,
local_image_rw,
local_md_rw,
os.path.basename(local_image_dir),
local_md_dir
)
_do_parse(
pdf_file_name,
pdf_data,
jso["doc_layout_result"],
method,
local_image_rw,
local_md_rw,
os.path.basename(local_image_dir),
local_md_dir
)


@cli.command()
@click.version_option(__version__, "--version", "-v", help="显示版本信息")
@click.option(
"--pdf", type=click.Path(exists=True), required=True, help="PDF文件的路径"
)
Expand Down Expand Up @@ -256,6 +260,7 @@ def read_fn(path):
)



if __name__ == "__main__":
"""
python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
Expand Down

0 comments on commit 88f2245

Please sign in to comment.