From 5e379b765d6d7e9c4a41cfa0426b2386d813b47b Mon Sep 17 00:00:00 2001
From: YuanJianhao508 <820010508@qq.com>
Date: Wed, 29 May 2024 14:54:50 +0100
Subject: [PATCH] first update

---
 README.md | 55 +-
 llava/__init__.py | 1 +
 llava/constants.py | 18 +
 llava/conversation.py | 398 +
 llava/eval/eval_gpt_mmvet.py | 276 +
 llava/eval/eval_gpt_review.py | 113 +
 llava/eval/eval_gpt_review_bench.py | 121 +
 llava/eval/eval_gpt_review_visual.py | 118 +
 llava/eval/eval_gqa.py | 499 +
 llava/eval/eval_pope.py | 81 +
 llava/eval/eval_science_qa.py | 114 +
 llava/eval/eval_science_qa_gpt4.py | 104 +
 llava/eval/eval_science_qa_gpt4_requery.py | 149 +
 llava/eval/eval_textvqa.py | 65 +
 .../eval/generate_webpage_data_from_table.py | 111 +
 llava/eval/m4c_evaluator.py | 334 +
 llava/eval/model_qa.py | 85 +
 llava/eval/model_vqa.py | 112 +
 llava/eval/model_vqa_loader.py | 148 +
 llava/eval/model_vqa_mmbench.py | 175 +
 llava/eval/model_vqa_science.py | 150 +
 llava/eval/qa_baseline_gpt35.py | 74 +
 llava/eval/run_llava.py | 97 +
 llava/eval/summarize_gpt_review.py | 60 +
 .../table/caps_boxes_coco2014_val_80.jsonl | 80 +
 llava/eval/table/model.jsonl | 5 +
 llava/eval/table/prompt.jsonl | 4 +
 llava/eval/table/question.jsonl | 80 +
 llava/eval/table/reviewer.jsonl | 4 +
 llava/eval/table/rule.json | 11 +
 .../video/eval_benchmark_1_correctness.py | 191 +
 .../eval_benchmark_2_detailed_orientation.py | 191 +
 llava/eval/video/eval_benchmark_3_context.py | 191 +
 llava/eval/video/eval_benchmark_4_temporal.py | 190 +
 .../video/eval_benchmark_5_consistency.py | 198 +
 llava/eval/video/eval_video_qa.py | 206 +
 .../run_inference_benchmark_consistency.py | 96 +
 .../video/run_inference_benchmark_general.py | 87 +
 llava/eval/video/run_inference_video_qa.py | 171 +
 .../eval/video/run_inference_video_qa_act.py | 171 +
 llava/eval/webpage/index.html | 162 +
 llava/eval/webpage/script.js | 245 +
 llava/eval/webpage/styles.css | 105 +
 llava/mm_utils.py | 123 +
 llava/model/__init__.py | 2 +
 llava/model/apply_delta.py | 48 +
 llava/model/builder.py | 173 +
 llava/model/consolidate.py | 29 +
 llava/model/language_model/llava_llama.py | 145 +
 llava/model/language_model/llava_llama_v1.py | 155 +
 llava/model/language_model/llava_mpt.py | 113 +
 llava/model/llava_arch.py | 346 +
 llava/model/make_delta.py | 52 +
 llava/model/multimodal_encoder/builder.py | 69 +
 .../model/multimodal_encoder/clip_encoder.py | 78 +
 llava/model/multimodal_encoder/mae_encoder.py | 80 +
 llava/model/multimodal_projector/builder.py | 257 +
 llava/model/utils.py | 20 +
 llava/serve/__init__.py | 0
 llava/serve/cli.py | 145 +
 llava/serve/controller.py | 298 +
 llava/serve/eval_custom.py | 159 +
 llava/serve/eval_custom_chunck.py | 184 +
 llava/serve/eval_custom_predsig.py | 166 +
 llava/serve/examples/desert.jpg | Bin 0 -> 262144 bytes
 llava/serve/examples/extreme_ironing.jpg | Bin 0 -> 62587 bytes
 llava/serve/examples/sample_demo_1.mp4 | Bin 0 -> 262144 bytes
 llava/serve/examples/sample_demo_13.mp4 | Bin 0 -> 262144 bytes
 llava/serve/examples/sample_demo_22.mp4 | Bin 0 -> 262144 bytes
 llava/serve/examples/sample_demo_3.mp4 | Bin 0 -> 262144 bytes
 llava/serve/examples/sample_demo_8.mp4 | Bin 0 -> 262144 bytes
 llava/serve/examples/sample_demo_9.mp4 | Bin 0 -> 262144 bytes
 llava/serve/examples/sample_img_13.png | Bin 0 -> 48079 bytes
 llava/serve/examples/sample_img_22.png | Bin 0 -> 60836 bytes
 llava/serve/examples/sample_img_8.png | Bin 0 -> 262144 bytes
 llava/serve/examples/waterview.jpg | Bin 0 -> 95499 bytes
 llava/serve/gradio_utils.py | 141 +
 llava/serve/gradio_web_server.py | 253 +
 llava/serve/model_worker.py | 285 +
 llava/serve/register_worker.py | 26 +
 llava/serve/test_message.py | 62 +
 llava/serve/video_caption.py | 148 +
 llava/train/llama_flash_attn_monkey_patch.py | 115 +
 .../train/llama_xformers_attn_monkey_patch.py | 129 +
 llava/train/llava_trainer.py | 176 +
 llava/train/train.py | 1095 +
 llava/train/train_mem.py | 18 +
 llava/train/train_xformers.py | 13 +
 llava/utils.py | 126 +
 pyproject.toml | 36 +
 retrieval/BDDX_RAG_hybird_vpmatch.json | 22796 +++++++++++++++
 retrieval/BDDX_RAG_neg_3.json | 17697 ++++++++++++
 retrieval/BDDX_RAG_pos_3.json | 19893 ++++++++++++++
 retrieval/BDDX_RAG_tuned_vpmatch_t13.json | 22928 ++++++++++++++++
 retrieval/BDDX_RAG_visual_vpmatch.json | 22796 +++++++++++++++
 retrieval/bddx_vpath_info_match.json | 1878 ++
 retrieval/check_sim.py | 49 +
 retrieval/embeddings_project.npz | Bin 0 -> 1048576 bytes
 retrieval/final_match.py | 238 +
 retrieval/matching.py | 327 +
 retrieval/project.py | 105 +
 retrieval/projector/best_model.pth | Bin 0 -> 786432 bytes
 retrieval/rag.py | 85 +
 retrieval/train.py | 195 +
 retrieval/train_projector.sh | 5 +
 retrieval/vpath_info_match.py | 55 +
 scripts/finetune.sh | 46 +
 scripts/pretrain.sh | 43 +
 scripts/zero2.json | 23 +
 scripts/zero3.json | 28 +
 scripts/zero3_offload.json | 56 +
 video_process/create_bddx_json.py | 170 +
 112 files changed, 120523 insertions(+), 1 deletion(-)
 create mode 100644 llava/__init__.py
 create mode 100644 llava/constants.py
 create mode 100644 llava/conversation.py
 create mode 100644 llava/eval/eval_gpt_mmvet.py
 create mode 100644 llava/eval/eval_gpt_review.py
 create mode 100644 llava/eval/eval_gpt_review_bench.py
 create mode 100644 llava/eval/eval_gpt_review_visual.py
 create mode 100644 llava/eval/eval_gqa.py
 create mode 100644 llava/eval/eval_pope.py
 create mode 100644 llava/eval/eval_science_qa.py
 create mode 100644 llava/eval/eval_science_qa_gpt4.py
 create mode 100644 llava/eval/eval_science_qa_gpt4_requery.py
 create mode 100644 llava/eval/eval_textvqa.py
 create mode 100644 llava/eval/generate_webpage_data_from_table.py
 create mode 100644 llava/eval/m4c_evaluator.py
 create mode 100644 llava/eval/model_qa.py
 create mode 100644 llava/eval/model_vqa.py
 create mode 100644 llava/eval/model_vqa_loader.py
 create mode 100644 llava/eval/model_vqa_mmbench.py
 create mode 100644 llava/eval/model_vqa_science.py
 create mode 100644 llava/eval/qa_baseline_gpt35.py
 create mode 100644 llava/eval/run_llava.py
 create mode 100644 llava/eval/summarize_gpt_review.py
 create mode 100644 llava/eval/table/caps_boxes_coco2014_val_80.jsonl
 create mode 100644 llava/eval/table/model.jsonl
 create mode 100644 llava/eval/table/prompt.jsonl
 create mode 100644 llava/eval/table/question.jsonl
 create mode 100644 llava/eval/table/reviewer.jsonl
 create mode 100644 llava/eval/table/rule.json
 create mode 100644 llava/eval/video/eval_benchmark_1_correctness.py
 create mode 100644 llava/eval/video/eval_benchmark_2_detailed_orientation.py
 create mode 100644 llava/eval/video/eval_benchmark_3_context.py
 create mode 100644 llava/eval/video/eval_benchmark_4_temporal.py
 create mode 100644 llava/eval/video/eval_benchmark_5_consistency.py
 create mode 100644 llava/eval/video/eval_video_qa.py
 create mode 100644 llava/eval/video/run_inference_benchmark_consistency.py
 create mode 100644 llava/eval/video/run_inference_benchmark_general.py
 create mode 100644 llava/eval/video/run_inference_video_qa.py
 create mode 100644 llava/eval/video/run_inference_video_qa_act.py
 create mode 100644 llava/eval/webpage/index.html
 create mode 100644 llava/eval/webpage/script.js
 create mode 100644 llava/eval/webpage/styles.css
 create mode 100644 llava/mm_utils.py
 create mode 100644 llava/model/__init__.py
 create mode 100644 llava/model/apply_delta.py
 create mode 100644 llava/model/builder.py
 create mode 100644 llava/model/consolidate.py
 create mode 100644 llava/model/language_model/llava_llama.py
 create mode 100644 llava/model/language_model/llava_llama_v1.py
 create mode 100644 llava/model/language_model/llava_mpt.py
 create mode 100644 llava/model/llava_arch.py
 create mode 100644 llava/model/make_delta.py
 create mode 100644 llava/model/multimodal_encoder/builder.py
 create mode 100644 llava/model/multimodal_encoder/clip_encoder.py
 create mode 100644 llava/model/multimodal_encoder/mae_encoder.py
 create mode 100644 llava/model/multimodal_projector/builder.py
 create mode 100644 llava/model/utils.py
 create mode 100644 llava/serve/__init__.py
 create mode 100644 llava/serve/cli.py
 create mode 100644 llava/serve/controller.py
 create mode 100644 llava/serve/eval_custom.py
 create mode 100644 llava/serve/eval_custom_chunck.py
 create mode 100644 llava/serve/eval_custom_predsig.py
 create mode 100644 llava/serve/examples/desert.jpg
 create mode 100644 llava/serve/examples/extreme_ironing.jpg
 create mode 100644 llava/serve/examples/sample_demo_1.mp4
 create mode 100644 llava/serve/examples/sample_demo_13.mp4
 create mode 100644 llava/serve/examples/sample_demo_22.mp4
 create mode 100644 llava/serve/examples/sample_demo_3.mp4
 create mode 100644 llava/serve/examples/sample_demo_8.mp4
 create mode 100644 llava/serve/examples/sample_demo_9.mp4
 create mode 100644 llava/serve/examples/sample_img_13.png
 create mode 100644 llava/serve/examples/sample_img_22.png
 create mode 100644 llava/serve/examples/sample_img_8.png
 create mode 100644 llava/serve/examples/waterview.jpg
 create mode 100644 llava/serve/gradio_utils.py
 create mode 100644 llava/serve/gradio_web_server.py
 create mode 100644 llava/serve/model_worker.py
 create mode 100644 llava/serve/register_worker.py
 create mode 100644 llava/serve/test_message.py
 create mode 100644 llava/serve/video_caption.py
 create mode 100644 llava/train/llama_flash_attn_monkey_patch.py
 create mode 100644 llava/train/llama_xformers_attn_monkey_patch.py
 create mode 100644 llava/train/llava_trainer.py
 create mode 100644 llava/train/train.py
 create mode 100644 llava/train/train_mem.py
 create mode 100644 llava/train/train_xformers.py
 create mode 100644 llava/utils.py
 create mode 100644 pyproject.toml
 create mode 100644 retrieval/BDDX_RAG_hybird_vpmatch.json
 create mode 100644 retrieval/BDDX_RAG_neg_3.json
 create mode 100644 retrieval/BDDX_RAG_pos_3.json
 create mode 100644 retrieval/BDDX_RAG_tuned_vpmatch_t13.json
 create mode 100644 retrieval/BDDX_RAG_visual_vpmatch.json
 create mode 100644 retrieval/bddx_vpath_info_match.json
 create mode 100644 retrieval/check_sim.py
 create mode 100644 retrieval/embeddings_project.npz
 create mode 100644 retrieval/final_match.py
 create mode 100644 retrieval/matching.py
 create mode 100644 retrieval/project.py
 create mode 100644 retrieval/projector/best_model.pth
 create mode 100644 retrieval/rag.py
 create mode 100644 retrieval/train.py
 create mode 100644 retrieval/train_projector.sh
 create mode 100644 retrieval/vpath_info_match.py
 create mode 100644 scripts/finetune.sh
 create mode 100644 scripts/pretrain.sh
 create mode 100644 scripts/zero2.json
 create mode 100644 scripts/zero3.json
 create mode 100644 scripts/zero3_offload.json
 create mode 100644 video_process/create_bddx_json.py

diff --git a/README.md b/README.md
index b1accae..3efd4d0 100644
--- a/README.md
+++ b/README.md
@@ -12,4 +12,57 @@ Official GitHub repository for "RAG-Driver: Generalisable Driving Explanations w
 **RAG-Driver** is a Multi-Modal Large Language Model with Retrieval-augmented In-context Learning capacity designed for generalisable and explainable end-to-end driving with strong zeroshot generalisation capacity.
 
 ## News
-Codes and models will be released soon
+## 📰 News
+* **[2024.05.27]** Code update is in progress; this repo is under active maintenance.
+
+
+## TODO List
+- [ ] Uploading the processed version of BDDX.
+- [ ] Uploading the model checkpoint.
+- [ ] Releasing the Spoken-SAX dataset.
+- [ ] Further cleaning of the retrieval engine codebase.
+
+## Usage
+
+### Requirements and Installation
+* Python >= 3.10
+* PyTorch == 2.0.1
+* CUDA Version >= 11.7
+* Install required packages:
+```bash
+git clone https://github.com/YuanJianhao508/RAG-Driver.git
+cd RAG-Driver
+conda create -n ragdriver python=3.10 -y
+conda activate ragdriver
+pip install --upgrade pip  # enable PEP 660 support
+pip install -e .
+pip install -e ".[train]"
+pip install flash-attn --no-build-isolation
+pip install decord opencv-python git+https://github.com/facebookresearch/pytorchvideo.git@28fe037d212663c6a24f373b94cc5d478c8c1a1d
+```
+
+### Instruction Tuning on the BDD-X Dataset
+
+```bash
+bash ./scripts/finetune.sh
+```
+
+- Download the pre-trained Video-LLaVA LLM and projector checkpoints from [here](https://huggingface.co/LanguageBind/Video-LLaVA-7B) and [here](https://huggingface.co/LanguageBind/Video-LLaVA-Pretrain-7B), and specify their paths in '--model_name_or_path' and '--pretrain_mm_mlp_adapter'.
+- Download the pre-trained LanguageBind video encoder from [here](https://huggingface.co/LanguageBind/LanguageBind_Video_merge) and specify its path in '--video_tower'.
+- Adjust the batch size '--per_device_train_batch_size' and the gradient accumulation steps '--gradient_accumulation_steps' based on the number of GPUs available; ensure the effective batch size (i.e. --per_device_train_batch_size * --gradient_accumulation_steps * number of GPUs) equals 128 (for example, 16 per device * 1 accumulation step * 8 GPUs = 128).
+
+
+## Citations
+If you find our paper and code useful in your research, please consider citing:
+```BibTeX
+@article{yuan2024rag,
+  title={RAG-Driver: Generalisable Driving Explanations with Retrieval-Augmented In-Context Learning in Multi-Modal Large Language Model},
+  author={Yuan, Jianhao and Sun, Shuyang and Omeiza, Daniel and Zhao, Bo and Newman, Paul and Kunze, Lars and Gadd, Matthew},
+  journal={arXiv preprint arXiv:2402.10828},
+  year={2024}
+}
+```
+
+## Acknowledgement
+This repo is built on [Video-LLaVA](https://github.com/haotian-liu/LLaVA) and [ADAPT](https://github.com/jxbbb/ADAPT). We thank all the authors for their open-sourced codebases.
\ No newline at end of file
diff --git a/llava/__init__.py b/llava/__init__.py
new file mode 100644
index 0000000..4d1f016
--- /dev/null
+++ b/llava/__init__.py
@@ -0,0 +1 @@
+from .model import LlavaLlamaForCausalLM
diff --git a/llava/constants.py b/llava/constants.py
new file mode 100644
index 0000000..f1bcfae
--- /dev/null
+++ b/llava/constants.py
@@ -0,0 +1,18 @@
+CONTROLLER_HEART_BEAT_EXPIRATION = 30
+WORKER_HEART_BEAT_INTERVAL = 15
+
+LOGDIR = "."
+
+# Model Constants
+IGNORE_INDEX = -100
+X_TOKEN_INDEX = {'IMAGE': -200, 'VIDEO': -201, 'AUDIO': -202, 'THERMAL': -203, 'DEPTH': -204}
+X_INDEX_TOKEN = {v: k for k, v in X_TOKEN_INDEX.items()}
+# IMAGE_TOKEN_INDEX = -200
+DEFAULT_X_TOKEN = {'IMAGE': "", 'VIDEO': "