diff --git a/BLOG.md b/BLOG.md
index 22d33052..9cc905e5 100644
--- a/BLOG.md
+++ b/BLOG.md
@@ -1,5 +1,30 @@
# InternVL's Blog
+## InternVL-Chat-V1.2-Plus
+
+> Date: 2024/02/21
+> Developed by: Zhe Chen, Weiyun Wang, Wenhai Wang, Erfei Cui, Zhangwei Gao, Xizhou Zhu, Lewei Lu, Tong Lu, Yu Qiao, Jifeng Dai
+
+[InternVL-Chat-V1.2-Plus](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2-Plus) shares the model architecture of InternVL-Chat-V1.2; the two differ only in their SFT data. InternVL-Chat-V1.2 is fine-tuned on an SFT dataset of 1.2M samples, while the Plus version uses a 12M-sample SFT dataset.
+
+### Performance
+
+\* Proprietary model &emsp; † Training set observed.
+
+| name | image size | MMMU<br>(val) | MMMU<br>(test) | MathVista<br>(testmini) | MMB<br>(test) | MMB-CN<br>(test) | MMVP | MME | ScienceQA<br>(image) | POPE | TextVQA<br>(val) | SEEDv1<br>(image) | VizWiz<br>(test) | GQA<br>(test) |
+| ----------------------- | ---------- | ------------- | -------------- | ----------------------- | ------------- | ---------------- | ---- | -------- | -------------------- | ---- | ---------------- | ----------------- | ---------------- | ------------- |
+| GPT-4V\* | unknown | 56.8 | 55.7 | 49.9 | 77.0 | 74.4 | 38.7 | 1409/517 | - | - | 78.0 | 71.6 | - | - |
+| Gemini Ultra\* | unknown | 59.4 | - | 53.0 | - | - | - | - | - | - | 82.3 | - | - | - |
+| Gemini Pro\* | unknown | 47.9 | - | 45.2 | 73.6 | 74.3 | 40.7 | 1497/437 | - | - | 74.6 | 70.7 | - | - |
+| Qwen-VL-Plus\*           | unknown    | 45.2          | 40.8           | 43.3                     | 67.0          | 70.7             | -    | 1681/502 | -                    | -    | 78.9             | 65.7              | -                | -             |
+| Qwen-VL-Max\*            | unknown    | 51.4          | 46.8           | 51.0                     | 77.6          | 75.7             | -    | -        | -                    | -    | 79.5             | -                 | -                | -             |
+|                          |            |               |                |                          |               |                  |      |          |                      |      |                  |                   |                  |               |
+| LLaVA-NEXT-34B           | 672x672    | 51.1          | 44.7           | 46.5                     | 79.3          | 79.0             | -    | 1631/397 | 81.8                 | 87.7 | 69.5             | 75.9              | 63.8             | 67.1†         |
+| InternVL-Chat-V1.2       | 448x448    | 51.6          | 46.2           | 47.7                     | 82.2          | 81.2             | 56.7 | 1672/509 | 83.3                 | 88.0 | 69.7             | 75.6              | 60.0             | 64.0†         |
+| InternVL-Chat-V1.2-Plus  | 448x448    | 50.3          | 45.6           | 59.9                     | 83.8          | 82.0             | 58.7 | 1624/551 | 98.1†                | 88.7 | 71.3†            | 76.4              | -                | 66.9†         |
+
+- MMBench results are collected from the [leaderboard](https://mmbench.opencompass.org.cn/leaderboard).
+
## InternVL-Chat-V1.2
> Date: 2024/02/12
@@ -31,8 +56,8 @@ For more details about data preparation, please see [here](./internvl_chat#prepa
| Qwen-VL-Plus\* | unknown | 45.2 | 40.8 | 43.3 | 67.0 | 70.7 | - | 1681/502 | - | - | 78.9 | 65.7 | - | - |
| Qwen-VL-Max\* | unknown | 51.4 | 46.8 | 51.0 | 77.6 | 75.7 | - | - | - | - | 79.5 | - | - | - |
| | | | | | | | | | | | | | | |
 | LLaVA-NEXT-34B | 672x672 | 51.1 | 44.7 | 46.5 | 79.3 | 79.0 | - | 1631/397 | 81.8 | 87.7 | 69.5 | 75.9 | 63.8 | 67.1 |
 | InternVL-Chat-V1.2 | 448x448 | 51.6 | 46.2 | 47.7 | 82.2 | 81.2 | 56.7 | 1672/509 | 83.3 | 88.0 | 69.7 | 75.6 | 60.0 | 64.0 |
- MMBench results are collected from the [leaderboard](https://mmbench.opencompass.org.cn/leaderboard).
- In most benchmarks, InternVL-Chat-V1.2 achieves better performance than LLaVA-NeXT-34B.
diff --git a/README.md b/README.md
index bca7eef6..42e40dec 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,10 @@
# InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks —— An Open-Source Alternative to ViT-22B
-\[[InternVL-Chat-V1.2 Blog](./BLOG.md)\] \[[Paper](https://arxiv.org/abs/2312.14238)\] \[[Chat Demo](https://internvl.opengvlab.com/)\] \[[Quick Start](#quick-start-with-huggingface)\] \[[中文解读](https://mp.weixin.qq.com/s/bdfAJRqOF9tUk8Vy9KC_XQ)\]
+\[[Update Blog](./BLOG.md)\] \[[Paper](https://arxiv.org/abs/2312.14238)\] \[[Chat Demo](https://internvl.opengvlab.com/)\] \[[Quick Start](#quick-start-with-huggingface)\] \[[中文解读](https://mp.weixin.qq.com/s/bdfAJRqOF9tUk8Vy9KC_XQ)\]
## News🚀🚀🚀
+- `2024/02/21`: [InternVL-Chat-V1.2-Plus](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2-Plus) achieves SOTA performance on MathVista (59.9), MMBench (83.8), and MMVP (58.7). See our [blog](BLOG.md) for more details.
- `2024/02/12`: InternVL-Chat-V1.2 has been released, utilizing [Nous-Hermes-2-Yi-34B](https://huggingface.co/NousResearch/Nous-Hermes-2-Yi-34B) as the LLM. It achieves 51.6 on MMMU val and 82.2 on MMBench test. For more details, please refer to our [blog](BLOG.md) or try our [demo](https://internvl.opengvlab.com/). The model is now available on [HuggingFace](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2), and both training/evaluation data and scripts are open-sourced.
- `2024/02/04`: [InternVL-Chat-V1.1](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-1) achieves 44.67% on [MMVP](https://github.com/tsb0601/MMVP), higher than GPT-4V!
- `2024/01/27`: We release 448 resolution model, achieving 76.6 on MMBench dev, see [here](https://github.com/OpenGVLab/InternVL/tree/main/internvl_chat#-evaluation-chinese-models).
@@ -27,13 +28,14 @@ InternVL scales up the ViT to _**6B parameters**_ and aligns it with LLM.
**Vision Large Language Model**
-| Model | Date | Download | Note |
-| ----------------------- | ---------- | ------------------------------------------------------------------------------------ | -------------------------------- |
-| InternVL-Chat-13B | 2023.12.25 | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL-Chat-ViT-6B-Vicuna-7B) | English multimodal dialogue |
-| InternVL-Chat-19B | 2023.12.25 | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL-Chat-ViT-6B-Vicuna-13B) | English multimodal dialogue |
-| InternVL-Chat-19B-448px | 2024.02.03 | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL-Chat-ViT-6B-Vicuna-13B-448px) | 448 resolution |
-| InternVL-Chat-V1.1 | 2024.01.24 | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-1) | support Chinese and stronger OCR |
-| InternVL-Chat-V1.2 | 2024.02.11 | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2) | scaling up LLM to 34B (🔥new) |
+| Model | Date | Download | Note |
+| ----------------------- | ---------- | ------------------------------------------------------------------------------------ | ---------------------------------- |
+| InternVL-Chat-13B | 2023.12.25 | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL-Chat-ViT-6B-Vicuna-7B) | English multimodal dialogue |
+| InternVL-Chat-19B | 2023.12.25 | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL-Chat-ViT-6B-Vicuna-13B) | English multimodal dialogue |
+| InternVL-Chat-19B-448px | 2024.02.03 | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL-Chat-ViT-6B-Vicuna-13B-448px) | 448 resolution |
+| InternVL-Chat-V1.1 | 2024.01.24 | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-1) | support Chinese and stronger OCR |
+| InternVL-Chat-V1.2 | 2024.02.11 | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2) | scaling up LLM to 34B (🔥new) |
+| InternVL-Chat-V1.2-Plus | 2024.02.21 | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2-Plus) | more SFT data, stronger performance (🔥new) |
## What can InternVL do?
@@ -503,6 +505,41 @@ caption = tokenizer.decode(pred[0].cpu(), skip_special_tokens=True).strip()
using InternVL-Chat (click to expand)
+- Single GPU
+
+```python
+import torch
+from PIL import Image
+from transformers import AutoModel, CLIPImageProcessor
+from transformers import AutoTokenizer
+
+path = "OpenGVLab/InternVL-Chat-Chinese-V1-1"
+# load the model in bfloat16 on a single GPU
+model = AutoModel.from_pretrained(
+ path,
+ torch_dtype=torch.bfloat16,
+ low_cpu_mem_usage=True,
+ trust_remote_code=True).eval().cuda()
+
+tokenizer = AutoTokenizer.from_pretrained(path)
+# the model expects a 448x448 RGB input
+image = Image.open('./examples/image2.jpg').convert('RGB')
+image = image.resize((448, 448))
+image_processor = CLIPImageProcessor.from_pretrained(path)
+
+pixel_values = image_processor(images=image, return_tensors='pt').pixel_values
+pixel_values = pixel_values.to(torch.bfloat16).cuda()
+
+generation_config = dict(
+ num_beams=1,
+ max_new_tokens=512,
+    do_sample=False,  # greedy decoding
+)
+
+question = "请详细描述图片"  # "Please describe the image in detail."
+response = model.chat(tokenizer, pixel_values, question, generation_config)
+print(question, response)
+```
+
+- Multiple GPUs
+
```python
import torch
from PIL import Image
diff --git a/internvl_chat/README.md b/internvl_chat/README.md
index 85d94577..cbcd9338 100644
--- a/internvl_chat/README.md
+++ b/internvl_chat/README.md
@@ -133,41 +133,52 @@ PARTITION='your partition' GPUS=32 PER_DEVICE_BATCH_SIZE=8 sh shell/hermes2_yi34
PARTITION='your partition' GPUS=64 PER_DEVICE_BATCH_SIZE=8 sh shell/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_finetune.sh
```
-The hyperparameters used for finetuning are listed in the following table.
+The hyperparameters used for fine-tuning are listed in the table below. You can also inspect the training logs in TensorBoard [here](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2/tensorboard). The global batch size of 512 corresponds to the command above: 64 GPUs with a per-device batch size of 8 and a single gradient-accumulation step.
-| Hyperparameter | Trainable Param | Global Batch Size | Learning rate | Epochs | Max length | Weight decay |
-| ------------------ | --------------- | ----------------- | ------------- | ------ | ---------- | ------------ |
-| InternVL-Chat-V1.2 | 40B | 512 | 1e-5 | 1 | 2048 | 0.05 |
+| Hyperparameter | Trainable Param | Global Batch Size | Learning rate | Epochs | Max length | Weight decay |
+| ------------------ | ---------------- | ----------------- | ------------- | ------ | ---------- | ------------ |
+| InternVL-Chat-V1.2 | 40B (full model) | 512 | 1e-5 | 1 | 2048 | 0.05 |
-## 📊 Evaluation
+## Continued Fine-tuning
-\* Training set observed.
+You can continue fine-tuning from the checkpoint of the previous training stage using this [script](./shell/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_finetune_continue.sh).
+
+Before fine-tuning, set `--meta_path` to the path of your custom training-data meta file; a sketch of the expected format is shown after the launch command below.
+
+```sh
+# using 16 GPUs
+PARTITION='your partition' GPUS=16 sh shell/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_finetune_continue.sh
+```
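+
+For reference, here is a minimal sketch of a meta file. The dataset name, paths, and field names below are illustrative assumptions in the spirit of the repository's data-loading conventions, not a guaranteed schema; check your annotation loader for the exact fields it expects.
+
+```json
+{
+  "my_custom_dataset": {
+    "root": "path/to/your/images/",
+    "annotation": "path/to/your/annotations.jsonl",
+    "repeat_time": 1,
+    "length": 10000
+  }
+}
+```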
+
+## 📊 Evaluation
**MultiModal Benchmark**
-| model | MME | MMBdev/test | MMB-CNdev/test | POPE | MMVP | MathVista |
-| --------------------------------------------------------------------------------- | -------------- | ---------------------- | ------------------------- | ---- | ---- | --------- |
-| [InternVL-Chat-V1.1](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-1) | 1672.3 / 341.1 | 76.6 / 75.4 | 71.5 / 70.1 | 87.2 | 44.7 | 34.5 |
-| [InternVL-Chat-V1.2](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2) | 1672.1 / 509.3 | 81.4 / 82.2 | 79.5 / 81.2 | 88.0 | 56.7 | 47.7 |
+\* Training set observed.
-| model | MMMUval/test | CMMMUval/test | TinyLVLM | LLaVAbench | MM-Vet |
-| --------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------- | ------------------------ | ------------------- | --------------------- | ------ |
-| [InternVL-Chat-V1.1](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-1) | 39.1 / 35.3 | 34.8 / 34.0 | 344.5 | 76.3 | 45.0 |
-| [InternVL-Chat-V1.2](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2) | 51.6 / [46.2](https://eval.ai/web/challenges/challenge-page/2179/leaderboard/5377) | - | 350.3 | - | 48.9 |
+| name | model size | MathVista<br>(testmini) | MMB<br>(dev/test) | MMB-CN<br>(dev/test) | MMMU<br>(val/test) | CMMMU<br>(val/test) | MMVP | MME | POPE | Tiny LVLM | SEEDv1<br>(image) | LLaVA Wild | MM-Vet |
+| ------------------------------------------------------------------------------------------- | ---------- | ----------------------- | ----------------- | -------------------- | ---------------------------------------------------------------------------------- | ------------------- | ---- | -------------- | ---- | --------- | ----------------- | ---------- | ------ |
+| [InternVL-Chat-V1.1](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-1) | 19B | 34.5 | 76.7 / 75.4 | 71.9 / 70.3 | 39.1 / 35.3 | 34.8 / 34.0 | 44.7 | 1675.1 / 348.6 | 87.1 | 343.2 | 73.2 | 73.2 | 46.7 |
+| [InternVL-Chat-V1.2](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2) | 40B | 47.7 | 81.4 / 82.2 | 79.5 / 81.2 | 51.6 / [46.2](https://eval.ai/web/challenges/challenge-page/2179/leaderboard/5377) | TODO | 56.7 | 1672.1 / 509.3 | 88.0 | 350.3 | 75.6 | 85.0 | 48.9 |
+| [InternVL-Chat-V1.2-Plus](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2-Plus) | 40B | 59.9 | 83.4 / 83.8 | 81.6 / 82.0 | 50.3 / 45.6 | TODO | 58.7 | 1623.6 / 550.7 | 88.7 | 353.9 | 76.4 | 84.6 | 47.9 |
-**Visual Question Answering**
+**Image Captioning & Visual Question Answering**
+
+\* Training set observed.
-| model | VQAv2test | OKVQAval | TextVQAval | VizWizval/test | AI2Dtest | GQAtest | SQAtest |
-| --------------------------------------------------------------------------------- | -------------------- | ------------------- | --------------------- | ------------------------- | ------------------- | ------------------ | ------------------ |
-| [InternVL-Chat-V1.1](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-1) | 80.9\* | 64.2\* | 65.8 | 58.3 / 57.3 | 70.2\* | 62.4\* | 91.2\* |
-| [InternVL-Chat-V1.2](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2) | - | 62.5\* | 69.7 | 61.9 / 60.0 | 71.6\* | 64.0\* | 83.3 |
+| name | model size | COCO<br>(test) | Flickr30K<br>(test) | NoCaps<br>(val) | VQAv2<br>(testdev) | OKVQA<br>(val) | TextVQA<br>(val) | VizWiz<br>(val/test) | AI2D<br>(test) | GQA<br>(test) | ScienceQA<br>(image) |
+| ------------------------------------------------------------------------------------------- | ---------- | -------------- | ------------------- | --------------- | ------------------ | -------------- | ---------------- | -------------------- | -------------- | ------------- | -------------------- |
+| [InternVL-Chat-V1.1](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-1) | 19B | 142.2\* | 85.3 | 120.8 | 80.9\* | 64.1\* | 65.9 | 59.0 / 57.3 | 70.3\* | 62.5\* | 90.1\* |
+| [InternVL-Chat-V1.2](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2) | 40B | 113.9 | 92.4 | 112.5 | - | 62.5\* | 69.7 | 61.9 / 60.0 | 71.6\* | 64.0\* | 83.3 |
+| [InternVL-Chat-V1.2-Plus](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2-Plus) | 40B | 143.4\* | 90.5 | 125.8 | - | 67.6\* | 71.3\* | 61.3 / - | 74.2\* | 66.9\* | 98.1\* |
-**Image Captioning**
+**Visual Grounding**
-| model | COCOtest | Flickr30Ktest | NoCapsval |
-| --------------------------------------------------------------------------------- | ------------------- | ------------------------ | -------------------- |
-| [InternVL-Chat-V1.1](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-1) | 141.8\* | 84.3 | 120.4 |
-| [InternVL-Chat-V1.2](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2) | 113.9 | 92.4 | 112.5 |
+| name | model size | RefCOCO<br>(val) | RefCOCO<br>(testA) | RefCOCO<br>(testB) | RefCOCO+<br>(val) | RefCOCO+<br>(testA) | RefCOCO+<br>(testB) | RefCOCO-g<br>(val) | RefCOCO-g<br>(test) |
+| ------------------------------------------------------------------------------------------- | ---------- | ---------------- | ------------------ | ------------------ | ----------------- | ------------------- | ------------------- | ------------------ | ------------------- |
+| [InternVL-Chat-V1.1](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-1) | 19B | 84.7 | 89.9 | 78.6 | 78.5 | 85.6 | 70.1 | 81.0 | 81.4 |
+| [InternVL-Chat-V1.2](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2) | 40B | 74.4 | 80.3 | 66.5 | 70.7 | 77.6 | 62.0 | 69.2 | 70.0 |
+| [InternVL-Chat-V1.2-Plus](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2-Plus) | 40B | 90.2 | 93.4 | 85.5 | 85.3 | 90.4 | 79.7 | 88.5 | 88.8 |
## 📊 Evaluation (Legacy Models)
diff --git a/internvl_chat/eval/llava_bench/evaluate_llava_bench.py b/internvl_chat/eval/llava_bench/evaluate_llava_bench.py
index 74cccc25..669f66b1 100644
--- a/internvl_chat/eval/llava_bench/evaluate_llava_bench.py
+++ b/internvl_chat/eval/llava_bench/evaluate_llava_bench.py
@@ -62,7 +62,7 @@ def evaluate_chat_model():
max_new_tokens=ds_collections[ds_name]['max_new_tokens'],
min_new_tokens=ds_collections[ds_name]['min_new_tokens'],
length_penalty=1,
- repetition_penalty=1.5,
+ # repetition_penalty=1.5,
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
@@ -97,7 +97,7 @@ def evaluate_chat_model():
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--num-beams', type=int, default=5)
- parser.add_argument('--temperature', type=float, default=1.0)
+    parser.add_argument('--temperature', type=float, default=0.0)  # 0 => greedy decoding (do_sample=False)
parser.add_argument('--out-dir', type=str, default='results')
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()
diff --git a/internvl_chat/eval/mmbench/evaluate_mmbench.py b/internvl_chat/eval/mmbench/evaluate_mmbench.py
index 962fa53c..e77b543e 100644
--- a/internvl_chat/eval/mmbench/evaluate_mmbench.py
+++ b/internvl_chat/eval/mmbench/evaluate_mmbench.py
@@ -188,7 +188,7 @@ def evaluate_chat_model():
max_new_tokens=ds_collections[ds_name]['max_new_tokens'],
min_new_tokens=ds_collections[ds_name]['min_new_tokens'],
length_penalty=1,
- repetition_penalty=1.2,
+ # repetition_penalty=1.2,
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
diff --git a/internvl_chat/eval/mmvet/evaluate_mmvet.py b/internvl_chat/eval/mmvet/evaluate_mmvet.py
index 793923f1..377de0da 100644
--- a/internvl_chat/eval/mmvet/evaluate_mmvet.py
+++ b/internvl_chat/eval/mmvet/evaluate_mmvet.py
@@ -74,7 +74,7 @@ def evaluate_chat_model():
max_new_tokens=ds_collections[ds_name]['max_new_tokens'],
min_new_tokens=ds_collections[ds_name]['min_new_tokens'],
length_penalty=1.0,
- repetition_penalty=1.2,
+ # repetition_penalty=1.2,
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
diff --git a/internvl_chat/eval/refcoco/evaluate_grounding.py b/internvl_chat/eval/refcoco/evaluate_grounding.py
index 8c55b043..f33baf70 100644
--- a/internvl_chat/eval/refcoco/evaluate_grounding.py
+++ b/internvl_chat/eval/refcoco/evaluate_grounding.py
@@ -128,16 +128,14 @@ def evaluate_chat_model():
)
outputs = []
- for _, (pixel_values, questions, bboxes, hws) in tqdm(enumerate(dataloader)):
+ for _, (pixel_values, questions, bboxes, hws) in enumerate(tqdm(dataloader)):
pixel_values = pixel_values.to(torch.bfloat16).cuda()
generation_config = dict(
- do_sample=args.sample,
num_beams=args.num_beams,
max_new_tokens=100,
- min_new_tokens=20,
+ min_new_tokens=1,
length_penalty=1,
- top_k=args.top_k,
- top_p=args.top_p,
+ do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
pred = model.chat(
@@ -182,7 +180,8 @@ def evaluate_chat_model():
dtype=torch.float32).view(-1, 4)
predict_bbox = torch.tensor(predict_bbox,
dtype=torch.float32).view(-1, 4)
- predict_bbox = predict_bbox / divisor
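+            # predicted boxes are integer coordinates in [0, 1000]; a coordinate
+            # sum < 4 suggests an already-normalized [0, 1] box, so skip the rescale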
+ if predict_bbox.sum() >= 4:
+ predict_bbox = predict_bbox / 1000
predict_bbox[:, 0::2] *= output['hw'][1]
predict_bbox[:, 1::2] *= output['hw'][0]
iou, _ = box_iou(predict_bbox, target_bbox)
@@ -217,10 +216,8 @@ def evaluate_chat_model():
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--num-beams', type=int, default=5)
parser.add_argument('--out-dir', type=str, default='results')
- parser.add_argument('--top-k', type=int, default=50)
- parser.add_argument('--top-p', type=float, default=0.9)
- parser.add_argument('--sample', type=bool, default=True)
- parser.add_argument('--temperature', type=float, default=1.0)
+ parser.add_argument('--sample', type=bool, default=False)
+    parser.add_argument('--temperature', type=float, default=0.0)  # 0 => greedy decoding (do_sample=False)
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()
@@ -240,6 +237,7 @@ def evaluate_chat_model():
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
tokenizer = LlamaTokenizer.from_pretrained(args.checkpoint)
+    # unified pattern: matches both '[x1,y1,x2,y2]' and '[[x1,y1,x2,y2]]' outputs
+    PATTERN = re.compile(r'\[*\[(.*?),(.*?),(.*?),(.*?)\]\]*')
if 'qllama' in args.checkpoint.lower():
from internvl.model.internvl_chat_with_qllama import InternVLChatModel
@@ -247,8 +245,6 @@ def evaluate_chat_model():
args.checkpoint, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).cuda().eval()
image_size = model.internvl.config.force_image_size or model.config.internvl_config.vision_config.image_size
pad2square = model.config.pad2square
- PATTERN = re.compile(r'\[(.*?),(.*?),(.*?),(.*?)\]')
- divisor = 1 # TODO: divisor
prompt = 'Please provide the bounding box coordinate of the region this sentence describes: {}'
else:
from internvl.model.internvl_chat import InternVLChatModel
@@ -256,8 +252,6 @@ def evaluate_chat_model():
args.checkpoint, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).cuda().eval()
image_size = model.config.force_image_size or model.config.vision_config.image_size
pad2square = model.config.pad2square
- PATTERN = re.compile(r'\[\[(.*?),(.*?),(.*?),(.*?)\]\]')
- divisor = 1 # TODO: divisor
prompt = 'Please provide the bounding box coordinate of the region this sentence describes: [{}]'
total_params = sum(p.numel() for p in model.parameters()) / 1e9
diff --git a/internvl_chat/evaluate.sh b/internvl_chat/evaluate.sh
index 1fa94c7f..e9a0d084 100644
--- a/internvl_chat/evaluate.sh
+++ b/internvl_chat/evaluate.sh
@@ -231,6 +231,16 @@ if [ ${DATASET} == "refcoco" ]; then
eval/refcoco/evaluate_grounding.py --checkpoint ${CHECKPOINT}
fi
+if [ ${DATASET} == "refcoco-val" ]; then
+ torchrun \
+ --nnodes=1 \
+ --node_rank=0 \
+ --master_addr=127.0.0.1 \
+ --nproc_per_node=${GPUS} \
+ --master_port=${MASTER_PORT} \
+ eval/refcoco/evaluate_grounding.py --checkpoint ${CHECKPOINT} --datasets refcoco_val
+fi
+
if [ ${DATASET} == "llava-bench" ]; then
rm -rf results/llava_bench_results_review.jsonl
python eval/llava_bench/evaluate_llava_bench.py --checkpoint ${CHECKPOINT}
diff --git a/internvl_chat/internvl/dist_utils.py b/internvl_chat/internvl/dist_utils.py
index 81875e7e..0eb8ae27 100644
--- a/internvl_chat/internvl/dist_utils.py
+++ b/internvl_chat/internvl/dist_utils.py
@@ -47,7 +47,8 @@ def _init_dist_pytorch(backend, **kwargs):
rank = int(os.environ['RANK'])
num_gpus = torch.cuda.device_count()
torch.cuda.set_device(rank % num_gpus)
- dist.init_process_group(backend=backend, **kwargs)
+ # dist.init_process_group(backend=backend, **kwargs)
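+    # initialize the process group through DeepSpeed so it is set up consistently with the DeepSpeed/ZeRO runtime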
+ deepspeed.init_distributed(dist_backend=backend)
def _init_dist_mpi(backend, **kwargs):
diff --git a/internvl_chat/internvl/model/internvl_chat/modeling_internvl_chat.py b/internvl_chat/internvl/model/internvl_chat/modeling_internvl_chat.py
index 3246741d..5697ff06 100644
--- a/internvl_chat/internvl/model/internvl_chat/modeling_internvl_chat.py
+++ b/internvl_chat/internvl/model/internvl_chat/modeling_internvl_chat.py
@@ -33,6 +33,7 @@ def __init__(self, config: InternVLChatConfig, vision_model=None, language_model
self.select_layer = config.select_layer
self.template = config.template
self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2))
+ self.downsample_ratio = config.downsample_ratio
logger.info(f'num_image_token: {self.num_image_token}')
if vision_model is not None:
self.vision_model = vision_model
@@ -182,13 +183,13 @@ def extract_feature(self, pixel_values):
vit_embeds = self.vision_model(
pixel_values=pixel_values,
output_hidden_states=True,
- return_dict=True).hidden_states[-4]
+ return_dict=True).hidden_states[self.select_layer]
vit_embeds = vit_embeds[:, 1:, :]
# if torch.distributed.get_rank() == 0:
# print("before pixel shuffle:", vit_embeds.shape)
h = w = int(vit_embeds.shape[1] ** 0.5)
vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
- vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=0.5)
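+        # pixel shuffle with scale_factor < 1 merges neighboring visual tokens into channels;
+        # token count scales by downsample_ratio**2 (e.g. a 32x32 grid becomes 16x16 at ratio 0.5)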
+ vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
# if torch.distributed.get_rank() == 0:
# print("after pixel shuffle:", vit_embeds.shape)
diff --git a/internvl_chat/internvl/serve/model_worker.py b/internvl_chat/internvl/serve/model_worker.py
index 7f145c22..8fd9f045 100644
--- a/internvl_chat/internvl/serve/model_worker.py
+++ b/internvl_chat/internvl/serve/model_worker.py
@@ -189,7 +189,7 @@ def generate_stream(self, params):
input_ids=input_ids,
do_sample=do_sample,
temperature=temperature,
- repetition_penalty=1.1,
+            repetition_penalty=1.0,  # 1.0 disables the repetition penalty
top_p=top_p,
max_new_tokens=max_new_tokens,
streamer=streamer,
diff --git a/internvl_chat/internvl/train/internvl_chat_finetune.py b/internvl_chat/internvl/train/internvl_chat_finetune.py
index a7a8c756..1d9b357b 100644
--- a/internvl_chat/internvl/train/internvl_chat_finetune.py
+++ b/internvl_chat/internvl/train/internvl_chat_finetune.py
@@ -514,7 +514,8 @@ def main():
# Parse input arguments
# See all possible arguments in src/transformers/training_args.py
# If use DeepSpeed zero3, init_dist must before HfArgumentParser
- init_dist(launcher='slurm', backend='nccl')
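+    # the launcher is now configurable via the LAUNCHER env var (e.g. LAUNCHER=pytorch for torchrun); slurm remains the default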
+ launcher = os.environ.get('LAUNCHER', 'slurm')
+ init_dist(launcher=launcher, backend='nccl')
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
if len(sys.argv) == 2 and sys.argv[1].endswith('.json'):
# If we pass only one argument to the script, and it's the path to a json file,
diff --git a/internvl_chat/internvl/train/internvl_chat_pretrain.py b/internvl_chat/internvl/train/internvl_chat_pretrain.py
index 21d48d4c..60039ecd 100644
--- a/internvl_chat/internvl/train/internvl_chat_pretrain.py
+++ b/internvl_chat/internvl/train/internvl_chat_pretrain.py
@@ -500,7 +500,8 @@ def main():
# Parse input arguments
# See all possible arguments in src/transformers/training_args.py
# If use DeepSpeed zero3, init_dist must before HfArgumentParser
- init_dist(launcher='slurm', backend='nccl')
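+    # the launcher is now configurable via the LAUNCHER env var (e.g. LAUNCHER=pytorch for torchrun); slurm remains the default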
+ launcher = os.environ.get('LAUNCHER', 'slurm')
+ init_dist(launcher=launcher, backend='nccl')
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
if len(sys.argv) == 2 and sys.argv[1].endswith('.json'):
# If we pass only one argument to the script, and it's the path to a json file,
diff --git a/internvl_chat/pyproject.toml b/internvl_chat/pyproject.toml
index f0491d13..e6b2ab0a 100644
--- a/internvl_chat/pyproject.toml
+++ b/internvl_chat/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "internvl_chat"
-version = "1.2.1"
+version = "1.2.2"
description = "Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks."
readme = "README.md"
requires-python = ">=3.8"
diff --git a/internvl_chat/shell/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_finetune_continue.sh b/internvl_chat/shell/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_finetune_continue.sh
new file mode 100644
index 00000000..8d387cfa
--- /dev/null
+++ b/internvl_chat/shell/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_finetune_continue.sh
@@ -0,0 +1,72 @@
+set -x
+
+PARTITION=${PARTITION:-"INTERN2"}
+GPUS=${GPUS:-16}
+GPUS_PER_NODE=${GPUS_PER_NODE:-8}
+QUOTA_TYPE=${QUOTA_TYPE:-"reserved"}
+NODES=$((GPUS / GPUS_PER_NODE))
+CPUS_PER_TASK=${CPUS_PER_TASK:-1}
+SRUN_ARGS=${SRUN_ARGS:-""}
+BATCH_SIZE=${BATCH_SIZE:-128}
+PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4}
+GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS))
+
+
+export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+export MASTER_PORT=34223
+
+OUTPUT_DIR='work_dirs/internvl_chat_v1_2_hermes2_yi34b_448_finetune_continue'
+
+if [ ! -d "$OUTPUT_DIR" ]; then
+ mkdir -p "$OUTPUT_DIR"
+fi
+
+# number of gpus: 16
+# batch size per gpu: 4
+# gradient accumulation steps: 2  (128 / 4 / 16)
+# total batch size: 128
+# epoch: 1
+srun -p ${PARTITION} \
+ --gres=gpu:${GPUS_PER_NODE} \
+ --nodes=${NODES} \
+ --ntasks=${GPUS} \
+ --ntasks-per-node=${GPUS_PER_NODE} \
+ --cpus-per-task=${CPUS_PER_TASK} \
+ --kill-on-bad-exit=1 \
+ --quotatype=${QUOTA_TYPE} \
+ ${SRUN_ARGS} \
+ python -u internvl/train/internvl_chat_finetune.py \
+ --model_name_or_path "./pretrained/InternVL-Chat-Chinese-V1-2" \
+ --conv_style "Hermes-2" \
+ --output_dir ${OUTPUT_DIR} \
+ --meta_path "./path/to/your/custom/meta/file" \
+ --overwrite_output_dir True \
+ --force_image_size 448 \
+ --down_sample_ratio 0.5 \
+ --drop_path_rate 0.0 \
+ --pad2square False \
+ --freeze_llm False \
+ --freeze_mlp False \
+ --freeze_backbone True \
+ --vision_select_layer -1 \
+ --use_data_resampling False \
+ --dataloader_num_workers 2 \
+ --bf16 True \
+ --num_train_epochs 1 \
+ --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \
+ --gradient_accumulation_steps ${GRADIENT_ACC} \
+ --evaluation_strategy "no" \
+ --save_strategy "steps" \
+ --save_steps 200 \
+ --save_total_limit 1 \
+ --learning_rate 1e-5 \
+ --weight_decay 0.05 \
+ --warmup_ratio 0.03 \
+ --lr_scheduler_type "cosine" \
+ --logging_steps 1 \
+ --max_seq_length 2048 \
+ --do_train True \
+ --grad_checkpoint True \
+ --deepspeed "zero_stage3_config.json" \
+ --report_to "tensorboard" \
+ 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
diff --git a/internvl_chat/zero_stage1_config.json b/internvl_chat/zero_stage1_config.json
index 71fca6e4..9cd513d0 100644
--- a/internvl_chat/zero_stage1_config.json
+++ b/internvl_chat/zero_stage1_config.json
@@ -20,15 +20,6 @@
"bf16": {
"enabled": "auto"
},
- "scheduler": {
- "type": "WarmupDecayLR",
- "params": {
- "warmup_min_lr": "auto",
- "warmup_max_lr": "auto",
- "warmup_num_steps": "auto",
- "total_num_steps": "auto"
- }
- },
"optimizer": {
"type": "AdamW",
"params": {
diff --git a/internvl_chat/zero_stage1_config_wo_opt.json b/internvl_chat/zero_stage1_config_wo_opt.json
deleted file mode 100644
index e832ceac..00000000
--- a/internvl_chat/zero_stage1_config_wo_opt.json
+++ /dev/null
@@ -1,29 +0,0 @@
-{
- "zero_optimization": {
- "stage": 1,
- "allgather_partitions": true,
- "allgather_bucket_size": 1e9,
- "overlap_comm": true,
- "reduce_scatter": true,
- "reduce_bucket_size": 1e9,
- "contiguous_gradients": true
- },
- "fp16": {
- "enabled": "auto",
- "auto_cast": true,
- "loss_scale": 0,
- "initial_scale_power": 32,
- "loss_scale_window": 1000,
- "hysteresis": 2,
- "min_loss_scale": 1
- },
- "bf16": {
- "enabled": "auto"
- },
- "gradient_accumulation_steps": "auto",
- "gradient_clipping": "auto",
- "steps_per_print": 2000,
- "train_batch_size": "auto",
- "train_micro_batch_size_per_gpu": "auto",
- "wall_clock_breakdown": false
-}
diff --git a/internvl_chat/zero_stage2_config.json b/internvl_chat/zero_stage2_config.json
index b6f859ac..9e831dca 100644
--- a/internvl_chat/zero_stage2_config.json
+++ b/internvl_chat/zero_stage2_config.json
@@ -20,15 +20,6 @@
"bf16": {
"enabled": "auto"
},
- "scheduler": {
- "type": "WarmupDecayLR",
- "params": {
- "warmup_min_lr": "auto",
- "warmup_max_lr": "auto",
- "warmup_num_steps": "auto",
- "total_num_steps": "auto"
- }
- },
"optimizer": {
"type": "AdamW",
"params": {