Commit 0cac89d (1 parent: 9293061)
Showing 2 changed files with 87 additions and 14 deletions.
78 changes: 78 additions & 0 deletions
base_decoder/train/tools/data_preprocess/data_preprocess_chart2text.py
@@ -0,0 +1,78 @@
import os
import csv
from glob import glob
from PIL import Image
import numpy as np
import torch
from tqdm.contrib import tzip
from transformers import Pix2StructProcessor
import random
from random import sample
import json
from tqdm import tqdm

splits = ["images", "multiColumn"]
subsets = ["matplot", "statista"]  # "human",
root = " "  # this path needs to point to the root of your dataset
save_root = " "  # save path for the preprocessed .npy files
os.makedirs(save_root, exist_ok=True)

# Two processors: one renders chart images into flattened patches (inputs),
# the other tokenizes the linearized tables (labels). With is_vqa=False,
# images are processed without rendered header text and text-only calls
# go straight to the tokenizer.
input_tokenizer = Pix2StructProcessor.from_pretrained("pix2struct-base")
label_tokenizer = Pix2StructProcessor.from_pretrained("pix2struct-base")
label_tokenizer.image_processor.is_vqa = False
input_tokenizer.image_processor.is_vqa = False

for split in tqdm(splits):
    for subset in subsets:

        img_folder_path = os.path.join(root, split, subset)
        tables_folder_path = os.path.join(root, split, "table")

        all_image_name = glob(os.path.join(img_folder_path, "*.png"))
        # Sample 100% of the images (i.e., a shuffled copy); lower the factor to subsample.
        sel_image_name = sample(all_image_name, round(len(all_image_name) * 1))
        imgnames = []
        imgs = []
        texts = []
        for item in tqdm(sel_image_name):
            imgname = os.path.split(item)[-1]

            image = Image.open(os.path.join(img_folder_path, imgname))
            table_path = os.path.join(tables_folder_path, imgname.replace(".png", ".csv"))
            # Linearize the table: cells joined with " \t ", rows terminated with " \n ".
            text = ""
            with open(table_path, 'r') as csv_file:
                csv_reader = csv.reader(csv_file)
                for line in csv_reader:  # read the CSV row by row
                    text = text + " \\t ".join(line) + " \\n "

            imgnames.append(imgname)
            imgs.append(image)
            texts.append(text)

        for idx, (name, img, la) in enumerate(tzip(imgnames, imgs, texts)):
            inputs = input_tokenizer(
                images=img,
                return_tensors="pt",
                padding="max_length",
                # truncation=True,
                max_patches=input_tokenizer.image_processor.max_patches,
                max_length=1280,
            )

            labels = label_tokenizer(
                text=la,
                return_tensors="pt",
                padding="max_length",
                # truncation=True,
                add_special_tokens=True,
                max_length=1280,
            ).input_ids

            # Save image patches, attention mask, and label token ids as .npy files.
            np.save(f"{save_root}/{name.split('.')[0]}_{idx}_chart2text_input_flattened_patches.npy", inputs.data['flattened_patches'].numpy())
            np.save(f"{save_root}/{name.split('.')[0]}_{idx}_chart2text_input_attention_mask.npy", inputs.data['attention_mask'].numpy())
            np.save(f"{save_root}/{name.split('.')[0]}_{idx}_chart2text_label.npy", labels.numpy())
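For downstream training, the arrays written above have to be read back into tensors. Below is a minimal sketch (not part of the commit) of a PyTorch dataset over the saved files; the ChartNpyDataset name and the glob pattern are assumptions derived only from the file naming used by the np.save calls in the script.

# Minimal sketch, assuming a PyTorch-style training loop. The dataset class
# name and the "*_chart2text_input_flattened_patches.npy" pattern are
# hypothetical, inferred from the file names written by the script above.
import os
from glob import glob
import numpy as np
import torch
from torch.utils.data import Dataset

class ChartNpyDataset(Dataset):
    def __init__(self, save_root):
        # One entry per saved chart; the two sibling files share the same prefix.
        self.patch_files = sorted(
            glob(os.path.join(save_root, "*_chart2text_input_flattened_patches.npy"))
        )

    def __len__(self):
        return len(self.patch_files)

    def __getitem__(self, i):
        patch_file = self.patch_files[i]
        prefix = patch_file.replace("_input_flattened_patches.npy", "")
        # Drop the leading batch dimension added by the processor before saving.
        flattened_patches = torch.from_numpy(np.load(patch_file)).squeeze(0)
        attention_mask = torch.from_numpy(np.load(prefix + "_input_attention_mask.npy")).squeeze(0)
        labels = torch.from_numpy(np.load(prefix + "_label.npy")).squeeze(0)
        return {
            "flattened_patches": flattened_patches,
            "attention_mask": attention_mask,
            "labels": labels,
        }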