From dda5784315c65e3885fd25e59e8d46f5be48bd9d Mon Sep 17 00:00:00 2001
From: Haodong Duan
Date: Wed, 11 Dec 2024 18:11:53 +0800
Subject: [PATCH] [Fix] Update SenseChat Vision

---
 vlmeval/api/sensechat_vision.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/vlmeval/api/sensechat_vision.py b/vlmeval/api/sensechat_vision.py
index e4ace4ca5..35c5a0d13 100644
--- a/vlmeval/api/sensechat_vision.py
+++ b/vlmeval/api/sensechat_vision.py
@@ -40,7 +40,7 @@ def dump_image(self, line, dataset):
         """
         ROOT = LMUDataRoot()
         assert isinstance(dataset, str)
-        img_root = osp.join(ROOT, 'images', img_root_map[dataset] if dataset in img_root_map else dataset)
+        img_root = osp.join(ROOT, 'images', img_root_map(dataset))
         os.makedirs(img_root, exist_ok=True)
         if 'image' in line:
             if isinstance(line['image'], list):
@@ -141,8 +141,8 @@ def build_prompt(self, line, dataset=None):
             for key, item in options.items():
                 question += f'\n{key}. {item}'
             prompt = {
-                'multiple-choice': 'You are an expert in {}. Please solve the university-level {} examination question, which includes interleaved images and text. Your output should be divided into two parts: First, reason about the correct answer. Then write the answer in the following format where X is exactly one of the choices given by the problem: "ANSWER: X". If you are uncertain of the correct answer, guess the most likely one.', # noqa: E501
-                'open': 'You are an expert in {}. Please solve the university-level {} examination question, which includes interleaved images and text. Your output should be divided into two parts: First, reason about the correct answer. Then write the answer in the following format where X is only the answer and nothing else: "ANSWER: X"' # noqa: E501
+                'multiple-choice': 'Answer with carefully thought step by step. Apply the thinking process recursively at both macro and micro levels. Verify consistency of reasoning and look for potential flaws or gaps during thinking. When realize mistakes, explain why the previous thinking was incorrect, fix it and then continue thinking.\n\n', # noqa
+                'open': 'Answer with carefully thought step by step. Apply the thinking process recursively at both macro and micro levels. Verify consistency of reasoning and look for potential flaws or gaps during thinking. When realize mistakes, explain why the previous thinking was incorrect, fix it and then continue thinking.\n\n' # noqa
             }
             subject = '_'.join(line['id'].split('_')[1:-1])
             prompt = prompt[line['question_type']].format(subject, subject) + '\n' + question
@@ -168,7 +168,7 @@ def generate_inner(self, inputs, **kwargs) -> str:
         inputs = [inputs] if isinstance(inputs, str) else inputs
         dataset = kwargs.get('dataset', None)
 
-        if dataset is not None and listinstr(['ChartQA_TEST'], dataset):
+        if dataset is not None and listinstr(['ChartQA_TEST','MathVista_MINI'], dataset):
             self.max_num = 12
         elif dataset is not None and listinstr(['DocVQA_VAL', 'DocVQA_TEST'], dataset):
             self.max_num = 18
@@ -182,9 +182,11 @@ def generate_inner(self, inputs, **kwargs) -> str:
         elif listinstr(['AI2D_TEST'], dataset):
             self.max_new_tokens = 10
         elif 'MMMU' in dataset:
-            self.max_new_tokens = 1024
+            self.max_new_tokens = 4096 # 1024
         elif 'MMBench' in dataset:
             self.max_new_tokens = 100
+        elif 'MathVista_MINI' in dataset:
+            self.max_new_tokens = 4096
 
         prompt, image = self.message_to_promptimg(message=inputs, dataset=dataset)
 
@@ -212,7 +214,11 @@ def generate_inner(self, inputs, **kwargs) -> str:
 
         data = {
             'messages': message,
-            'max_new_tokens': self.max_new_tokens,
+            'max_new_tokens': self.max_new_tokens, # 1024
+            'temperature': 0,
+            "top_k": 0,
+            "top_p": 0.99,
+            'repetition_penalty': 1.05,
             'model': self.model,
             'stream': False,
         }
@@ -232,8 +238,6 @@ def generate_inner(self, inputs, **kwargs) -> str:
         try:
             assert response.status_code == 200
             response = response.json()['data']['choices'][0]['message'].strip()
-            if dataset is not None and 'MMMU' in dataset:
-                response = response.split('ANSWER: ')[-1].strip()
             if self.verbose:
                 self.logger.info(f'inputs: {inputs}\nanswer: {response}')
             return 0, response, 'Succeeded! '
@@ -241,7 +245,7 @@ def generate_inner(self, inputs, **kwargs) -> str:
             if self.verbose:
                 self.logger.error('---------------------------ERROR---------------------------')
                 self.logger.error(response.json())
-                self.logger.error(f'{type(err)}: {err}')
+                self.logger.error(err)
                 self.logger.error('---------------------------request_id---------------------------' + request_id)
                 self.logger.error(
                     'api error' + response.json()['error']['message']