From 0ffe78279782cee24d0821427a9cecdbcc0ce266 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=83=A1=E7=8F=A9?= Date: Wed, 6 Nov 2024 08:28:21 +0000 Subject: [PATCH] video processing demo with ocr * added README for LLM preprocess demo * Fix pip problem in gpu ci * Add aes module in LLM demo * param parser for the LLM preprocessing main * rename the LLM preprocessing demo path * video processing demo with ocr See merge request: !840 --- .codebase/pipelines/ci.yaml | 4 +- bmf/demo/LLM_video_preprocessing/README.md | 18 ++ .../LLM_video_preprocessing/aesmod_module.py | 153 ++++++++++ .../LLM_video_preprocessing/clip_process.py | 262 ++++++++++++++++++ bmf/demo/LLM_video_preprocessing/main.py | 60 ++++ .../LLM_video_preprocessing/media_info.py | 67 +++++ .../LLM_video_preprocessing/module_utils.py | 115 ++++++++ bmf/demo/LLM_video_preprocessing/ocr_crop.py | 195 +++++++++++++ .../LLM_video_preprocessing/operator_base.py | 30 ++ .../LLM_video_preprocessing/prepare_env.sh | 3 + .../LLM_video_preprocessing/requirement.txt | 4 + bmf/demo/LLM_video_preprocessing/split.py | 17 ++ 12 files changed, 926 insertions(+), 2 deletions(-) create mode 100644 bmf/demo/LLM_video_preprocessing/README.md create mode 100644 bmf/demo/LLM_video_preprocessing/aesmod_module.py create mode 100644 bmf/demo/LLM_video_preprocessing/clip_process.py create mode 100644 bmf/demo/LLM_video_preprocessing/main.py create mode 100644 bmf/demo/LLM_video_preprocessing/media_info.py create mode 100644 bmf/demo/LLM_video_preprocessing/module_utils.py create mode 100644 bmf/demo/LLM_video_preprocessing/ocr_crop.py create mode 100644 bmf/demo/LLM_video_preprocessing/operator_base.py create mode 100644 bmf/demo/LLM_video_preprocessing/prepare_env.sh create mode 100644 bmf/demo/LLM_video_preprocessing/requirement.txt create mode 100644 bmf/demo/LLM_video_preprocessing/split.py diff --git a/.codebase/pipelines/ci.yaml b/.codebase/pipelines/ci.yaml index 6a32df33..ff3f03a7 100644 --- a/.codebase/pipelines/ci.yaml +++ b/.codebase/pipelines/ci.yaml @@ -238,7 +238,7 @@ jobs: - (cd output/demo/video_enhance && pip install basicsr==1.4.2 realesrgan && (sed -i '8s/from torchvision.transforms.functional_tensor import rgb_to_grayscale/from torchvision.transforms.functional import rgb_to_grayscale/' /usr/local/lib/python3.8/dist-packages/basicsr/data/degradations.py) && python3 enhance_demo.py) - (cd output/demo/aesthetic_assessment && pip install onnxruntime && python3 aesthetic_assessment_demo.py) - (cd output/demo/face_detect && cp ../../models/version-RFB-640.onnx . 
&& trtexec --onnx=version-RFB-640.onnx --buildOnly --saveEngine=version-RFB-640.engine && cp version-RFB-640.engine ../../models && python3 detect_trt_sample.py) - - (cd output/demo/colorization_python && git clone https://github.com/eefengwei/DeOldify.git DeOldify && pip3 install -r ./DeOldify/requirements-colab.txt && pip3 install Ipython && mkdir -p ./DeOldify/models && wget -c https://huggingface.co/spensercai/DeOldify/resolve/main/ColorizeVideo_gen.pth -O ./DeOldify/models/ColorizeVideo_gen.pth && python3 deoldify_demo.py) + - (cd output/demo/colorization_python && git clone https://github.com/eefengwei/DeOldify.git DeOldify && pip install "numpy>=2.0.0,<3.0.0" && pip3 install -r ./DeOldify/requirements-colab.txt && pip3 install Ipython && mkdir -p ./DeOldify/models && wget -c https://huggingface.co/spensercai/DeOldify/resolve/main/ColorizeVideo_gen.pth -O ./DeOldify/models/ColorizeVideo_gen.pth && python3 deoldify_demo.py) clang_build_test: name: clang_build_test @@ -330,4 +330,4 @@ jobs: - go mod init - commit=$(cat ../../../gosdk_version.txt) && go get github.com/babitmf/bmf-gosdk@${commit} - go build main.go - - python3 test_go.py \ No newline at end of file + - python3 test_go.py diff --git a/bmf/demo/LLM_video_preprocessing/README.md b/bmf/demo/LLM_video_preprocessing/README.md new file mode 100644 index 00000000..8692d3a4 --- /dev/null +++ b/bmf/demo/LLM_video_preprocessing/README.md @@ -0,0 +1,18 @@ +This is a demo of video preprocessing for training LLM video/image generation models. + +Because the demo is built on BMF, algorithms can be flexibly built and integrated into the whole preprocessing pipeline. + +Two parts are included: +1. Clip processing +The input video is split at scene changes, subtitles are detected and cropped out by the OCR module, and video quality is assessed by the BMF-provided aesthetic module. +After that, the finalized video clips are encoded as the output. +## Prerequisites +Please pip install all the dependencies in `requirement.txt`. +## Run +``` +python main.py --input_file <input video file> +``` +The output info and clips will be stored in `clip_output` under the current path. + +2. 
Caption +Please reference the README in "bmf/bmf/demo/fast_caption_module" \ No newline at end of file diff --git a/bmf/demo/LLM_video_preprocessing/aesmod_module.py b/bmf/demo/LLM_video_preprocessing/aesmod_module.py new file mode 100644 index 00000000..cf23e3cf --- /dev/null +++ b/bmf/demo/LLM_video_preprocessing/aesmod_module.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from module_utils import SyncModule +import os +import time +import json +import pdb +import os.path as osp +import numpy as np + +os.environ["OMP_NUM_THREADS"] = "8" +import onnxruntime as ort +import torch +import logging +import cv2 + + +def get_logger(): + return logging.getLogger("main") + + +LOGGER = get_logger() + + +def flex_resize_aesv2(img, desired_size=[448, 672], pad_color=[0, 0, 0]): + old_h, old_w = img.shape[:2] # old_size is in (height, width) format + if desired_size[0] >= desired_size[1]: + if old_h < old_w: # rotate the honrizontal video + img = np.rot90(img, k=1, axes=(1, 0)) + else: + if old_h > old_w: # rotate the vertical video + img = np.rot90(img, k=1, axes=(1, 0)) + old_h, old_w = img.shape[:2] + + if old_w / old_h > (desired_size[1] / desired_size[0]): + ratio = desired_size[0] / old_h + else: + ratio = desired_size[1] / old_w + img = cv2.resize(img, None, fx=ratio, fy=ratio) + h, w, _ = img.shape + h_crop = (h - desired_size[0]) // 2 + w_crop = (w - desired_size[1]) // 2 + img = img[h_crop:h_crop + desired_size[0], + w_crop:w_crop + desired_size[1], :] + return img + + +class Aesmod: + + def __init__(self, model_path, model_version, output_path): + self._frm_idx = 0 + self._frm_scores = [] + self._model_version = model_version + self._output_path = output_path + + # model_dir = osp.join(osp.abspath(osp.dirname("__file__")), "models") + # aesmod_ort_model_path = osp.realpath( + # osp.join(model_dir, "aes_transonnx_update3.onnx") + # ) + self.use_gpu = False + aesmod_ort_model_path = model_path + print(aesmod_ort_model_path) + LOGGER.info("loading aesthetic ort inference session") + self.ort_session = ort.InferenceSession(aesmod_ort_model_path) + + self.resize_reso = [672, 448] + + def preprocess(self, frame): + frame = flex_resize_aesv2(frame) + # print('using flex_resize_aesv2', frame.shape) + frame = (frame.astype(np.float32) / 255.0 - + np.array([0.485, 0.456, 0.406], dtype="float32")) / (np.array( + [0.229, 0.224, 0.225], dtype="float32")) + frame = np.transpose(frame, (2, 0, 1)) + frame = np.expand_dims(frame, 0) + return frame + + @staticmethod + def tensor_to_list(tensor): + if tensor.requires_grad: + return tensor.detach().cpu().flatten().tolist() + else: + return tensor.cpu().flatten().tolist() + + @staticmethod + def score_pred_mapping(raw_scores, raw_min=2.60, raw_max=7.42): + pred_score = np.clip( + np.sum([x * (i + 1) for i, x in enumerate(raw_scores)]), raw_min, + raw_max) + pred_score = np.sqrt( + (pred_score - raw_min) / (raw_max - raw_min)) * 100 + return float(np.clip(pred_score, 0, 100.0)) + + def process(self, frames): + frames = [ + frame + if frame.flags["C_CONTIGUOUS"] else np.ascontiguousarray(frame) + for frame in frames + ] + frame = self.preprocess(frames[0]) + print("after preprocess shape", frame.shape) + if not frame.flags["C_CONTIGUOUS"]: + frame = np.ascontiguousarray(frame, dtype=np.float32) + + t1 = time.time() + if self.use_gpu: + with torch.no_grad(): + input_batch = torch.from_numpy(frame).contiguous().cuda() + preds, _ = self.trt_model(input_batch) + raw_score = self.tensor_to_list(preds) + else: + + raw_score = 
self.ort_session.run(None, {"input": frame}) + raw_score = raw_score[0][0] + score = self.score_pred_mapping(raw_score) + self._frm_scores.append(score) + self._frm_idx += 1 + t2 = time.time() + LOGGER.info(f"[Aesmod] inference time: {(t2 - t1) * 1000:0.1f} ms") + return frames[0] + + def clean(self): + nr_score = round(np.mean(self._frm_scores), 2) + results = { + "aesthetic": nr_score, + "aesthetic_version": self._model_version + } + LOGGER.info(f"overall prediction {json.dumps(results)}") + with open(self._output_path, "w") as outfile: + json.dump(results, outfile, indent=4, ensure_ascii=False) + + +class aesmod_module(SyncModule): + + def __init__(self, node=None, option=None): + output_path = option.get("result_path", 0) + params = option.get("params", {}) + model_version = params.get("model_version", "v1.0") + model_path = params.get("model_path", + "../../models/aes_transonnx_update3.onnx") + self._nrp = Aesmod(model_path, model_version, output_path) + SyncModule.__init__(self, + node, + nb_in=1, + in_fmt="rgb24", + out_fmt="rgb24") + + def core_process(self, frames): + return self._nrp.process(frames) + + def clean(self): + self._nrp.clean() diff --git a/bmf/demo/LLM_video_preprocessing/clip_process.py b/bmf/demo/LLM_video_preprocessing/clip_process.py new file mode 100644 index 00000000..6ee1d7f4 --- /dev/null +++ b/bmf/demo/LLM_video_preprocessing/clip_process.py @@ -0,0 +1,262 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import logging +import bmf +from pydantic.v1.utils import deep_update +import os +import json +from pathlib import Path + +logger = logging.getLogger("main") + +DEFAULT_OPERATOR_CONFIG = { + "ocr_crop": { + "name": "ocr_crop", + "module_name": "ocr_crop", + "pre_module": True, + "params": {}, + }, + "aesmod_module": { + "name": "aesmod_module", + "module_name": "aesmod_module", + "pre_module": False, + "params": {} + }, +} + +ClipOutputPath = "clip_output" + +FIXED_RES = 480 + +def prepare_dir(base_path): + Path(base_path).mkdir(parents=True, exist_ok=True) + +def get_operator_tmp_result_path(base_path, operator_name): + file_name = f"{operator_name}_result.json" + return os.path.join(base_path, file_name) + + +def get_operator_result_path(base_path, operator_name, clip_index): + file_name = f"clip_{clip_index}_{operator_name}_result.json" + return os.path.join(base_path, file_name) + + +def get_jpg_serial_path(base_path, clip_index, fixed_width): + img_path = f"clip_{clip_index}_{fixed_width}_img" + return os.path.join( + base_path, img_path, "clip_{}_{}_img_%04d.jpg".format(clip_index, fixed_width) + ) + +def get_jpg_dir_path(base_path, clip_index, fixed_width): + img_path = f"clip_{clip_index}_{fixed_width}_img" + return os.path.join(base_path, img_path) + +def get_clip_path(base_path, clip_index, fixed_width): + file_name = "clip_{}_{}.mp4".format(clip_index, fixed_width) + return os.path.join(base_path, file_name) + + +def bmf_output(stream, configs, clip_index): + if len(configs) == 0: + return + s = stream + resize_streams = dict() + + for c in configs: + o = s + res_str = c["res"] + if res_str != "orig": + stream_key = res_str + if stream_key not in resize_streams: + res = int(res_str) + o = o.scale( + "if(lt(iw,ih),{},-2):if(lt(iw,ih),-2,{})".format(res, res) + ).ff_filter("setsar", sar="1/1") + resize_streams[stream_key] = o + else: + # get saved resized stream + o = resize_streams[stream_key] + + elif "limit" in c: + res = int(c["limit"]) + o = o.scale( + f"if(lt(iw,ih),min({res},iw),-2):if(lt(iw,ih),-2,min({res},ih))" + ).ff_filter("setsar", 
sar="1/1") + + # encode + if c["type"] == "jpg": + bmf.encode( + o, + None, + { + "output_path": get_jpg_serial_path( + ClipOutputPath, clip_index, c["res"] + ), + "video_params": { + "vsync": "vfr", + "codec": "jpg", + "qscale": int(c["quality"]) if "quality" in c else 2, + "pix_fmt": "yuvj444p", + "threads": "4", + }, + "format": "image2", + }, + ) + + elif c["type"] == "mp4": + bmf.encode( + o, + None, + { + "output_path": get_clip_path(ClipOutputPath, clip_index, c["res"]), + "video_params": { + "vsync": "vfr", + "codec": "h264", + "preset": "veryfast", + "threads": "4", + }, + }, + ) + + +class ClipProcess: + def __init__(self, input_file, timelines, mode, config): + self.input_file = input_file + self.timelines = timelines + self.mode = mode + self.config = config + modes = mode.split(",") + operator_options = [] + self.output_configs = ( + config["output_configs"] if "output_configs" in config else {} + ) + for operator_name in modes: + if operator_name in DEFAULT_OPERATOR_CONFIG: + option = DEFAULT_OPERATOR_CONFIG[operator_name] + config_operator_option = ( + config[operator_name] if operator_name in config else {} + ) + operator_option = deep_update(option, config_operator_option) + operator_options.append(operator_option) + logger.info(f"operator_options: {operator_options}") + self.operator_options = operator_options + prepare_dir(ClipOutputPath) + + def operator_process(self, timeline): + if len(self.operator_options) == 0: + return True + + decode_param = {} + decode_param["input_path"] = self.input_file + decode_param["dec_params"] = {"threads": "4"} + decode_param["durations"] = timeline + graph = bmf.graph() + v = graph.decode(decode_param)["video"] + v = v.scale( + f"if(lt(iw,ih),min({FIXED_RES},iw),-2):if(lt(iw,ih),-2,min({FIXED_RES},ih))" + ).ff_filter("setsar", sar="1/1") + for operator_option in self.operator_options: + operator_name = operator_option["name"] + if "module_name" in operator_option: + if operator_option["pre_module"]: + v = v.module( + operator_option["module_name"], + pre_module=self.operator_premodules[operator_name], + ) + else: + v = v.module( + operator_option["module_name"], + option={ + "result_path": get_operator_tmp_result_path( + ClipOutputPath, operator_name + ), + "params": operator_option["params"], + }, + ) + pkts = v.start() + count = 0 + for _, pkt in enumerate(pkts): + if pkt.is_(bmf.VideoFrame): + count += 1 + logger.info(f"operator process get videoframe count: {count}") + return count > 0 + + def process_one_clip(self, timeline, clip_index): + passthrough = self.operator_process(timeline) + + if not passthrough: + return + + if len(self.output_configs) == 0: + return + + for output in self.output_configs: + if output["type"] == "jpg": + res = output["res"] + img_dir = get_jpg_dir_path(ClipOutputPath, clip_index, res) + prepare_dir(img_dir) + + decode_param = {} + decode_param["input_path"] = self.input_file + decode_param["dec_params"] = {"threads": "4"} + decode_param["durations"] = timeline + graph = bmf.graph({"optimize_graph": False}) + v = graph.decode(decode_param)["video"] + + for operator_option in self.operator_options: + operator_name = operator_option["name"] + if operator_name == "ocr_crop": + result_path = get_operator_tmp_result_path( + ClipOutputPath, operator_name + ) + if not os.path.exists(result_path): + continue + with open(result_path, "r") as f: + operator_res = json.load(f) + if ( + "result" in operator_res + and "nms_crop_box" in operator_res["result"] + ): + nms_crop_box = 
operator_res["result"]["nms_crop_box"] + left, top, right, bottom = nms_crop_box + v = v.ff_filter( + "crop", + f"w=iw*{right - left}:h=ih*{bottom - top}:x=iw*{left}:y=ih*{top}", + ) + + bmf_output(v, self.output_configs, clip_index) + graph.run() + + def process_clip_result(self, clip_index): + for operator_option in self.operator_options: + operator_name = operator_option["name"] + tmp_path = get_operator_tmp_result_path(ClipOutputPath, operator_name) + if os.path.exists(tmp_path): + os.rename( + tmp_path, + get_operator_result_path(ClipOutputPath, operator_name, clip_index), + ) + + def process(self): + # create premodule + operator_premodules = {} + for op_option in self.operator_options: + if "pre_module" in op_option and op_option["pre_module"]: + operator_name = op_option["name"] + operator_premodule = bmf.create_module( + op_option["module_name"], + option={ + "result_path": get_operator_tmp_result_path( + ClipOutputPath, operator_name + ), + "params": op_option["params"], + }, + ) + operator_premodules[operator_name] = operator_premodule + + self.operator_premodules = operator_premodules + + for i in range(len(self.timelines)): + timeline = self.timelines[i] + self.process_one_clip(timeline, i) + self.process_clip_result(i) diff --git a/bmf/demo/LLM_video_preprocessing/main.py b/bmf/demo/LLM_video_preprocessing/main.py new file mode 100644 index 00000000..e7e95d98 --- /dev/null +++ b/bmf/demo/LLM_video_preprocessing/main.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import logging +from media_info import MediaInfo +from split import get_split_info +import torch +from clip_process import ClipProcess +import argparse +import os + +logger = logging.getLogger('main') + +scene_thres = 0.3 + +def get_timeline_list(pts_list, last): + current = 0 + timelines = [] + for pts in pts_list: + pts = pts/1000000 + if pts > current: + timelines.append([current,pts]) + current = pts + # last + if last > current: + timelines.append([current,last]) + return timelines + +def video_processing_demo(input_file, mode, config): + media_info = MediaInfo("ffprobe", input_file) + duration = media_info.video_duration() + logger.info(f"duration:{duration}") + + pts_list = get_split_info(input_file, scene_thres) + timelines = get_timeline_list(pts_list, duration) + logger.info(f"timelines:{timelines}") + cp = ClipProcess(input_file, timelines, mode, config) + cp.process() + +def demo_run(args): + if args.input_file == '': + print("input file needed") + return -1 + model_path = "../../models/aes_transonnx_update3.onnx" + if not os.path.exists(model_path): + print( + "please download model first, use 'wget https://github.com/BabitMF/bmf/releases/download/files/models.tar.gz && tar zxvf models.tar.gz -C ../../' " + ) + exit(0) + torch.set_num_threads(4) + mode = "ocr_crop,aesmod_module" + config = {"output_configs":[{"res":"orig", "type":"jpg"}, {"res":"480", "type":"mp4"}]} + video_processing_demo(args.input_file, mode, config) + +if __name__ == '__main__': + logger.setLevel(logging.INFO) + parser = argparse.ArgumentParser() + parser.add_argument('--input_file', type=str, help="input local file", default = '') + args = parser.parse_args() + demo_run(args) diff --git a/bmf/demo/LLM_video_preprocessing/media_info.py b/bmf/demo/LLM_video_preprocessing/media_info.py new file mode 100644 index 00000000..0c73c26e --- /dev/null +++ b/bmf/demo/LLM_video_preprocessing/media_info.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import logging +import subprocess +import json + 
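+# MediaInfo wraps ffprobe: it runs `ffprobe -show_format -show_streams` on the input, parses the JSON output, and keeps the first video and audio streams. +# video_wxh() returns the source resolution; video_duration() returns the reported duration in seconds, or -1 if ffprobe does not report one.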
+logger = logging.getLogger('main') + +class MediaInfo: + def __init__(self, ffprobe_bin, input_file): + self.__ffprobe_bin = ffprobe_bin + self.__streams = dict() + self.__v_stream = {} + self.__v_stream_idx = 0 + self.__a_stream = {} + self.__a_stream_idx = 0 + show_options = " -show_format -show_streams" + ff_cmd = "%s -hide_banner -loglevel error -print_format json %s %s" % ( + self.__ffprobe_bin, + show_options, + input_file, + ) + res = subprocess.Popen(ff_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = res.communicate() + status = res.returncode + if status: + raise Exception('ffprobe failed') + msg = stdout.decode(errors='ignore') + dict_info = json.loads(msg) + if 'streams' not in dict_info or len(dict_info['streams']) <= 0: + raise Exception('ffprobe no streams') + streams = dict_info.get('streams') + self.__streams = streams + self.__auto_select_stream() + + def __auto_select_stream(self): + v_streams = [] + a_streams = [] + streams = self.__streams + for stream in streams: + if stream.get('codec_type') == 'video': + v_streams.append(stream) + elif stream.get('codec_type') == 'audio': + a_streams.append(stream) + if len(v_streams) > 0: + self.__v_stream = v_streams[0] + if len(a_streams) > 0: + self.__a_stream = a_streams[0] + + def video_wxh(self): + v_stream = self.__v_stream + src_w = int(v_stream.get('width', '-1')) + src_h = int(v_stream.get('height', '-1')) + if src_w < 0 or src_h < 0: + raise Exception('invalid resolution') + return src_w, src_h + + def video_duration(self): + v_stream = self.__v_stream + video_duration = -1 + if 'duration' in v_stream: + video_duration = float(v_stream.get('duration', -1)) + return video_duration + + + diff --git a/bmf/demo/LLM_video_preprocessing/module_utils.py b/bmf/demo/LLM_video_preprocessing/module_utils.py new file mode 100644 index 00000000..0e4cf475 --- /dev/null +++ b/bmf/demo/LLM_video_preprocessing/module_utils.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -* +import bmf + +from bmf import VideoFrame +from bmf.lib._bmf.sdk import ffmpeg +import bmf.hml.hmp as mp + + +def generate_out_packets(packet, np_arr, out_fmt): + # video_frame = bmf.VideoFrame.from_ndarray(np_arr, format=out_fmt) + rgbformat = mp.PixelInfo(mp.kPF_RGB24) + image = mp.Frame(mp.from_numpy(np_arr), rgbformat) + video_frame = VideoFrame(image) + + video_frame.pts = packet.get(VideoFrame).pts + video_frame.time_base = packet.get(VideoFrame).time_base + + pkt = bmf.Packet(video_frame) + pkt.timestamp = packet.timestamp + return pkt + + +class SyncModule(bmf.Module): + + def __init__(self, + node=None, + nb_in=1, + in_fmt="yuv420p", + out_fmt="yuv420p"): + """ + nb_in: the number of frames for core_process function + in_fmt: the pixel format of frames for core_process function + out_fmt: the pixel format of frame returned by core_process function + """ + self._node = node + + self._margin_num = (nb_in - 1) // 2 + self._out_frame_index = self._margin_num + self._in_frame_num = nb_in + + self._in_fmt = in_fmt + self._out_fmt = out_fmt + + self._in_packets = [] + self._frames = [] + self._eof = False + + def process(self, task): + + input_queue = task.get_inputs()[0] + # output_queue = task.get_outputs()[0] + + while not input_queue.empty(): + pkt = input_queue.get() + pkt_timestamp = pkt.timestamp + + if pkt_timestamp == bmf.Timestamp.EOF: + output_queue = task.get_outputs().get(0, None) + self._eof = True + for _ in range(self._margin_num): + self._in_packets.append(self._in_packets[-1]) + 
self._frames.append(self._frames[-1]) + self._consume() + + # output_queue.put(bmf.Packet.generate_eof_packet()) + task.set_timestamp(bmf.Timestamp.DONE) + if output_queue: + output_queue.put(pkt) + return bmf.ProcessResult.OK + + pkt_data = pkt.get(VideoFrame) + if pkt_data is not None: + self._in_packets.append(pkt) + # self._frames.append(pkt.get(VideoFrame).to_ndarray(format=self._in_fmt)) + + self._frames.append( + ffmpeg.reformat(pkt.get(VideoFrame), + self._in_fmt).frame().plane(0).numpy()) + + # padding first frame. + if len(self._in_packets) == 1: + for _ in range(self._margin_num): + self._in_packets.append(self._in_packets[0]) + self._frames.append(self._frames[0]) + + self._consume() + + return bmf.ProcessResult.OK + + def _consume(self, output_queue=None): + while len(self._in_packets) >= self._in_frame_num: + out_frame = self.core_process(self._frames[:self._in_frame_num]) + out_packet = generate_out_packets( + self._in_packets[self._out_frame_index], out_frame, + self._out_fmt) + # output_queue.put(out_packet) + self._in_packets.pop(0) + self._frames.pop(0) + + def core_process(self, frames): + """ + user defined, process frames to output one frame, pass through by default + frames: input frames, list format + """ + return frames[0] + + def clean(self): + pass + + def close(self): + self.clean() + + def reset(self): + self._eof = False diff --git a/bmf/demo/LLM_video_preprocessing/ocr_crop.py b/bmf/demo/LLM_video_preprocessing/ocr_crop.py new file mode 100644 index 00000000..d2c17e1e --- /dev/null +++ b/bmf/demo/LLM_video_preprocessing/ocr_crop.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from dataclasses import dataclass, field +from operator_base import BaseModule +import easyocr +from bmf import VideoFrame, av_time_base, Log, LogLevel, Packet, Timestamp +from bmf.lib._bmf.sdk import ffmpeg +import json +import numpy as np + +def point_in_area(point, area_coord): + px,py=point + x0, y0, x1, y1 = area_coord + if px > x0 and px < x1 and py > y0 and py < y1: + return True + return False + +def hole_in_area(hole, area_coord): + for p in hole: + if point_in_area(p, area_coord): + return True + return False + +def split_area_by_hole(hole, area_coord): + res = [] + if not hole_in_area(hole, area_coord): + res.append(area_coord) + return res + + hole_x0 = hole[0][0] + hole_x1 = hole[1][0] + hole_y0 = hole[0][1] + hole_y1 = hole[2][1] + + x0, y0, x1, y1 = area_coord + if hole_x0 > x0: + res.append((x0, y0, hole_x0, y1)) + if hole_x1 < x1: + res.append((hole_x1, y0, x1, y1)) + if hole_y0 > y0: + res.append((x0, y0, x1, hole_y0)) + if hole_y1 < y1: + res.append((x0, hole_y1, x1, y1)) + return res + +def get_area(area_coord): + x0, y0, x1, y1 = area_coord + return (x1-x0)*(y1-y0) + +def calc_max_area_(blackholes, area_coord, max_area_coord, max_area_thres): + holes = blackholes[0:] + max_area = get_area(max_area_coord) + if len(holes) == 0: + area = get_area(area_coord) + if max_area < area: + max_area = area + max_area_coord = area_coord + else: + hole = holes[0] + sub_holes = holes[1:] + res = split_area_by_hole(hole, area_coord) + for sub_area_coord in res: + if get_area(sub_area_coord) <= max_area: + continue + max_sub_area_coord = calc_max_area_(sub_holes, sub_area_coord, max_area_coord, max_area_thres) + area = get_area(max_sub_area_coord) + if max_area < area: + max_area = area + max_area_coord = max_sub_area_coord + return max_area_coord + +# return the maximum area without blackholes +# stop calculate if area ratio is ls less than ratio_thres +def 
calc_max_area(blackholes, width, height, ratio_thres): + max_area_coord = (0,0,0,0) + max_area_thres = int(width*height*ratio_thres) + max_area_coord = calc_max_area_(blackholes, (0, 0, width, height), max_area_coord, max_area_thres) + x0, y0, x1, y1 = max_area_coord + res = (x0/width, y0/height, x1/width, y1/height) + if get_area(max_area_coord) > max_area_thres: + return True, res + return False, res + +@dataclass +class ocr_crop_params: + nms_remain_area_ratio_thres: float = field(default=0) + word_prob_threshold: float = field(default=0.3) + sample_every_seconds: float = field(default=1) + + +class ocr_crop(BaseModule): + def __init__(self, node, option): + super().__init__(node, option) + self.reader = easyocr.Reader(["ch_sim", "en"]) + self.params = ocr_crop_params( + **option["params"] if option is not None and "params" in option else {} + ) + self.reset() + + def reset(self): + self.input_packets = [] + self.ocr_results = [] + self.frame_width = None + self.frame_height = None + self.last_processed_frame_timestamp = None + + def write_result(self, ok, nms_crop_box): + output = {} + output["video_width"] = self.frame_width + output["video_height"] = self.frame_height + output["nms_remain_area_ratio_thres"] = self.params.nms_remain_area_ratio_thres + output["word_prob_threshold"] = self.params.word_prob_threshold + output["get_max_area"] = ok + output["nms_crop_box"] = nms_crop_box + res = {} + res["result"] = output + + with open(self.result_path, "w", encoding="utf-8") as f: + json.dump(res, f, ensure_ascii=False, indent=4) + + def process_ocr_results(self): + """ + ocr_results = [( + [ + [np.int32(86), np.int32(80)], + [np.int32(134), np.int32(80)], + [np.int32(134), np.int32(128)], + [np.int32(86), np.int32(128)], + ], + "西", + np.float64(0.6629598563364745), + )] + """ + + ocr_areas = [] + for frame_results in self.ocr_results: + for res in frame_results: + # nms points coordinate + if res[2] > self.params.word_prob_threshold: + ocr_areas.append(res[0]) + # debug + print(ocr_areas) + return calc_max_area( + ocr_areas, + self.frame_width, + self.frame_height, + self.params.nms_remain_area_ratio_thres, + ) + + def on_eof(self, task, pkt): + output_queue = task.get_outputs().get(0, None) + # crop and send out + ok, max_nms_area = self.process_ocr_results() + self.write_result(ok, max_nms_area) + if ok and output_queue: + x0, y0, x1, y1 = max_nms_area + x0 = int(x0 * self.frame_width) + x1 = int(x1 * self.frame_width) + y0 = int(y0 * self.frame_height) + y1 = int(y1 * self.frame_height) + crop_str = f"crop=w={x1-x0}:h={y1-y0}:x={x0}:y={y0}" + Log.log(LogLevel.INFO, f"crop_str: {crop_str}") + for packet in self.input_packets: + vf = packet.get(VideoFrame) + video_frame = ffmpeg.siso_filter(vf, crop_str) + video_frame.pts = vf.pts + video_frame.time_base = vf.time_base + output_pkt = Packet(video_frame) + output_pkt.timestamp = packet.timestamp + output_queue.put(output_pkt) + if output_queue: + output_queue.put(pkt) + task.timestamp = Timestamp.DONE + self.reset() + + def on_pkt(self, task, pkt): + if not pkt.is_(VideoFrame): + return + + current_timestamp = pkt.timestamp * av_time_base + if ( + self.last_processed_frame_timestamp is None + or current_timestamp - self.last_processed_frame_timestamp >= 1 + ): + vf = pkt.get(VideoFrame) + if self.frame_width is None: + self.frame_width = vf.width + self.frame_height = vf.height + frame = ffmpeg.reformat(vf, "rgb24").frame().plane(0).numpy() + results = self.reader.readtext(frame) + self.ocr_results.append(results) + 
self.last_processed_frame_timestamp = pkt.timestamp * av_time_base + + self.input_packets.append(pkt) diff --git a/bmf/demo/LLM_video_preprocessing/operator_base.py b/bmf/demo/LLM_video_preprocessing/operator_base.py new file mode 100644 index 00000000..a5b14344 --- /dev/null +++ b/bmf/demo/LLM_video_preprocessing/operator_base.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from bmf import Module, Timestamp, ProcessResult + + +class BaseModule(Module): + def __init__(self, node, option): + self.node_ = node + self.result_path = ( + option.get("result_path") + if option is not None and "result_path" in option + else f"{node}.json" + ) + + def process(self, task): + for _, input_packets in task.get_inputs().items(): + while not input_packets.empty(): + pkt = input_packets.get() + if pkt.timestamp == Timestamp.EOF: + self.on_eof(task, pkt) + else: + self.on_pkt(task, pkt) + return ProcessResult.OK + + def on_eof(self, task, pkt): + return + + def on_pkt(self, task, pkt): + return diff --git a/bmf/demo/LLM_video_preprocessing/prepare_env.sh b/bmf/demo/LLM_video_preprocessing/prepare_env.sh new file mode 100644 index 00000000..98b452d5 --- /dev/null +++ b/bmf/demo/LLM_video_preprocessing/prepare_env.sh @@ -0,0 +1,3 @@ +pip3 install -r requirement.txt +(wget https://github.com/BabitMF/bmf/releases/download/files/files.tar.gz && tar zxvf files.tar.gz -C ../../) +(wget https://github.com/BabitMF/bmf/releases/download/files/models.tar.gz && tar zxvf models.tar.gz -C ../../) diff --git a/bmf/demo/LLM_video_preprocessing/requirement.txt b/bmf/demo/LLM_video_preprocessing/requirement.txt new file mode 100644 index 00000000..468305b9 --- /dev/null +++ b/bmf/demo/LLM_video_preprocessing/requirement.txt @@ -0,0 +1,4 @@ +easyocr +pydantic +BabitMF==0.0.11 +onnxruntime \ No newline at end of file diff --git a/bmf/demo/LLM_video_preprocessing/split.py b/bmf/demo/LLM_video_preprocessing/split.py new file mode 100644 index 00000000..b621f4c9 --- /dev/null +++ b/bmf/demo/LLM_video_preprocessing/split.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import bmf + +def get_split_info(video_path, scene_thres): + graph = bmf.graph() + v = graph.decode({"input_path":video_path})['video'] + v = v.ff_filter('select', f"gt(scene,{scene_thres})").pass_through() + pkts = v.start() + scene_change_list = [] + for _, pkt in enumerate(pkts): + if pkt.is_(bmf.VideoFrame): + scene_change_list.append(pkt.timestamp) + return scene_change_list + +#print(get_split_info("/home/huheng.1989/Videos/hdr_720p.mp4", 0.3))
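For orientation, here is how the pieces in this patch compose end to end: `split.get_split_info` returns the timestamps (in microseconds, given the conversion in `main.py`) of frames whose scene-change score exceeds the threshold, `main.get_timeline_list` turns them into `[start, end]` ranges in seconds, and `ClipProcess` decodes each range with the `durations` option and re-encodes it. The minimal sketch below walks the same flow with the operator modules omitted; the input path and the hard-coded 60-second duration are placeholders (the real demo reads the duration via `MediaInfo`/ffprobe).

```python
import bmf
from split import get_split_info

def get_timeline_list(pts_list, last):
    # Same logic as main.py: microsecond timestamps -> [start, end] ranges in seconds.
    current, timelines = 0, []
    for pts in pts_list:
        pts = pts / 1000000
        if pts > current:
            timelines.append([current, pts])
            current = pts
    if last > current:
        timelines.append([current, last])  # trailing clip up to the end of the video
    return timelines

input_file = "test.mp4"                        # placeholder input path
pts_list = get_split_info(input_file, 0.3)     # scene-change timestamps in microseconds
timelines = get_timeline_list(pts_list, 60.0)  # assume a 60 s source for this sketch

for i, timeline in enumerate(timelines):
    graph = bmf.graph()
    video = graph.decode({
        "input_path": input_file,
        "durations": timeline,                 # decode only this clip's [start, end] range
    })["video"]
    bmf.encode(video, None, {
        "output_path": f"clip_{i}.mp4",
        "video_params": {"codec": "h264", "preset": "veryfast"},
    })
    graph.run()
```

In the full demo, `ClipProcess` additionally runs each clip through `ocr_crop` (as a pre-module) and `aesmod_module`, crops the clip according to the OCR result before encoding, and writes per-clip result JSON files alongside the outputs in `clip_output`.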