Skip to content

Commit

Permalink
update chart2text_preprocess
Browse files Browse the repository at this point in the history
  • Loading branch information
renqiux0302 authored Apr 4, 2024
1 parent 9293061 commit 0cac89d
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 14 deletions.
23 changes: 9 additions & 14 deletions auxiliary_decoder/train/templates/train_data.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,18 @@
"output": "2018"
},
{
"instruction": "Given the following triplet data (marked by <data>) with the title (marked by <title>) and the question related to the data (marked by <question>), give the answer with no output of hints, explanations or notes.",
"input": "<data> [(2007,Not too much/not at all,0),(2007,Great deal) fair amount,26),(2009,Not too much/not at all,51),(2009,Great deal) fair amount,36),(2011,Not too much/not at all,0),(2011,Great deal) fair amount,0),(2013,Not too much/not at all,60),(2013,Great deal) fair amount,0),(2015,Not too much/not at all,0),(2015,Great deal) fair amount,0),(2017,Not too much/not at all,0),(2017,Great deal) fair amount,0),(2018,Not too much/not at all,772),(2018,Great deal) fair amount,0)] <title> Belief that U.S. considers other countries' interests in foreign policy returns to 2007 level In making international policy decisions, to what extent do you think the US. takes the interest of your country into account? <question> What does the blue line represent?",
"output": "Not too much/not at all"
},
{
"instruction": "Given the following triplet data (marked by <data>) with the title (marked by <title>) and the question related to the data (marked by <question>), give the answer with no output of hints, explanations or notes.",
"input": "<data> [(2007,Not too much/not at all,0),(2007,Great deal) fair amount,26),(2009,Not too much/not at all,51),(2009,Great deal) fair amount,36),(2011,Not too much/not at all,0),(2011,Great deal) fair amount,0),(2013,Not too much/not at all,60),(2013,Great deal) fair amount,0),(2015,Not too much/not at all,0),(2015,Great deal) fair amount,0),(2017,Not too much/not at all,0),(2017,Great deal) fair amount,0),(2018,Not too much/not at all,772),(2018,Great deal) fair amount,0)] <title> Belief that U.S. considers other countries' interests in foreign policy returns to 2007 level In making international policy decisions, to what extent do you think the US. takes the interest of your country into account? <question> What is the max value of blue line?",
"output": "0.72"
"instruction": "Generate a descriptive text according to the CSV data, title, and chart type.",
"input": "<data> [(Facebook,Users(million),2500),(Facebook,Usage Time(minutes),150),(YouTube,Users(million),2000),(YouTube,Usage Time(minutes),200),(Instagram,Users(million),1500),(Instagram,Usage Time(minutes),130),(Twitter,Users(million),1000),(Twitter,Usage Time(minutes),100)] <title> Social Media Usage by Number of Users and Time Spent in 2021 <type> bar_chart",
"output": "The bar chart illustrates social media usage by the number of users in millions and the time spent in minutes during 2021. Facebook leads with 2500 million users and an average usage time of 150 minutes. YouTube follows with 2000 million users and a higher average usage time of 200 minutes. Instagram has 1500 million users with 130 minutes of usage time, while Twitter has the least, with 1000 million users spending an average of 100 minutes. Each platform's engagement is visually compared by users and time spent."
},
{
"instruction": "Given the following triplet data (marked by <data>) with the title (marked by <title>) and the question related to the data (marked by <question>), give the answer with no output of hints, explanations or notes.",
"input": "<data> [(2010,Budget deficit,60),(2010,Jobs,0),(2010,Economy,0.0),(2012,Budget deficit,0),(2012,Jobs,0),(2012,Economy,0.0),(2014,Budget deficit,63),(2014,Jobs,0),(2014,Economy,0.0),(2016,Budget deficit,0),(2016,Jobs,0),(2016,Economy,0.0),(2018,Budget deficit,48),(2018,Jobs,62),(2018,Economy,0.71)] <title> Downward trend in public views of economic issues as 'top priority' % who say_ is a top priority for the president and Congress <question> What's the percentage of respondents who say Job is a top priority for the president and Congress in 2016?",
"output": "68"
"instruction": "Produce a concise synthesis or key insights based on the graphical data representation.",
"input": "<data> [(2019,Philosophy(%),40),(2019,Psychology(%),30),(2019,Sociology(%),35),(2020,Philosophy(%),42),(2020,Psychology(%),31),(2020,Sociology(%),38),(2021,Philosophy(%),44),(2021,Psychology(%),32),(2021,Sociology(%),40)] <title> Percentage of Social Sciences and Humanities in three fields from 2019 to 2021 ",
"output": "Over three consecutive years, 2019 to 2021, there has been a modest but consistent increase in the percentage of students majoring in Philosophy, Psychology, and Sociology. Philosophy saw a rise from 40% to 44%, Psychology inched up from 30% to 32%, and Sociology grew from 35% to 40%. This upward trend across the board suggests a growing interest among students in the social sciences and humanities fields, with Philosophy showing the most significant increase over the period. These findings highlight a strengthening regard for disciplines that delve into human thought and societal structures."
},
{
"instruction": "Given the following triplet data (marked by <data>) with the title (marked by <title>) and the question related to the data (marked by <question>), give the answer with no output of hints, explanations or notes.",
"input": "<data> [(2010,Budget deficit,60),(2010,Jobs,0),(2010,Economy,0.0),(2012,Budget deficit,0),(2012,Jobs,0),(2012,Economy,0.0),(2014,Budget deficit,63),(2014,Jobs,0),(2014,Economy,0.0),(2016,Budget deficit,0),(2016,Jobs,0),(2016,Economy,0.0),(2018,Budget deficit,48),(2018,Jobs,62),(2018,Economy,0.71)] <title> Downward trend in public views of economic issues as 'top priority' % who say_ is a top priority for the president and Congress <question> Which line has the lowest value of 71%?",
"output": "Economy"
"instruction": "Write a Python script to redraw the chart based on the following triplet-format data (marked by <data>) list and the chart type (marked by <type>). The chart title, if provided with the mark of <title>, can be plotted.",
"input": "<data> [(Air,Passengers(thousand),200),(Air,Distance(km),5000),(Rail,Passengers(thousand),500),(Rail,Distance(km),1500),(Road,Passengers(thousand),1000),(Road,Distance(km),1000)] <title> Number of passengers and distance traveled by different modes of transportation in 2021 <type> bar_chart",
"output": "\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nplt.figure(figsize=(10, 6))\nax = plt.subplot()\n\ndata = np.array([[200, 5000], [500, 1500], [1000, 1000]])\ncategories = [\"Air\", \"Rail\", \"Road\"]\nbar_width = 0.4\n\npassengers = data[:, 0]\ndistance = data[:, 1]\n\nax.bar(np.arange(len(categories)), passengers, width=bar_width, label=\"Passengers(thousand)\")\nax.bar(np.arange(len(categories)) + bar_width, distance, width=bar_width, label=\"Distance(km)\")\n\nax.set_xticks(np.arange(len(categories)))\nax.set_xticklabels(categories, rotation=0, wrap=True)\nax.set_title(\"Number of passengers and distance traveled by different modes of transportation in 2021\")\nax.legend(loc=\"best\")\n\nplt.tight_layout()\nplt.savefig('./bar_chart.png')\nplt.clf()"
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import os
import csv
from glob import glob
from PIL import Image
import numpy as np
import torch
from tqdm.contrib import tzip
from transformers import Pix2StructProcessor
import random
from random import sample
import json
from tqdm import tqdm

# Pre-tokenise chart images and their CSV tables into .npy files.
#
# For every image found under <root>/<split>/<subset>/*.png the matching table
# <root>/<split>/table/<image name>.csv is read, flattened into one string, and
# both are run through the Pix2Struct processor.  Three arrays are written to
# save_root per image: the flattened image patches, the image attention mask,
# and the tokenised label ids.

splits = ["images", "multiColumn"]
subsets = ["matplot", "statista"]  # the "human" subset is currently disabled
root = " "  # placeholder: set this to the root directory of your dataset
save_root = " "  # placeholder: set this to the directory the .npy files go to
sample_ratio = 1  # fraction of the images of each subset that is processed
os.makedirs(save_root, exist_ok=True)


# The same checkpoint is loaded twice: once for the image side, once for the
# text labels.  VQA mode is switched off on both so that no header text is
# expected by the image processor.
input_tokenizer = Pix2StructProcessor.from_pretrained("pix2struct-base")
label_tokenizer = Pix2StructProcessor.from_pretrained("pix2struct-base")
label_tokenizer.image_processor.is_vqa = False
input_tokenizer.image_processor.is_vqa = False

for split in tqdm(splits):
    for subset in subsets:
        img_folder_path = os.path.join(root, split, subset)
        tables_folder_path = os.path.join(root, split, "table")

        all_image_name = glob(os.path.join(img_folder_path, "*.png"))
        # With sample_ratio == 1 this keeps every image but in random order.
        sel_image_name = sample(
            all_image_name, round(len(all_image_name) * sample_ratio)
        )

        # Images are handled one at a time.  Collecting every opened PIL image
        # in a list first keeps one file handle (and later the decoded pixels)
        # alive per image, which does not scale to large subsets.
        for idx, item in enumerate(tqdm(sel_image_name)):
            imgname = os.path.split(item)[-1]
            table_path = os.path.join(
                tables_folder_path, imgname.replace(".png", ".csv")
            )

            # newline="" is what the csv module asks for so that it can do its
            # own line-ending handling.  Cells are joined with a literal
            # backslash-t and rows are terminated with a literal backslash-n
            # (two characters each), not with real tab/newline characters.
            with open(table_path, "r", newline="") as csv_file:
                text = "".join(
                    " \\t ".join(row) + " \\n " for row in csv.reader(csv_file)
                )

            # The context manager closes the image file as soon as the
            # processor has produced its tensors.
            with Image.open(os.path.join(img_folder_path, imgname)) as image:
                inputs = input_tokenizer(
                    images=image,
                    return_tensors="pt",
                    padding="max_length",
                    max_patches=input_tokenizer.image_processor.max_patches,
                    max_length=1280,
                )

            # NOTE(review): truncation is left disabled, as before; presumably
            # tables longer than max_length tokens produce longer label arrays
            # - confirm that downstream loading copes with that.
            labels = label_tokenizer(
                text=text,
                return_tensors="pt",
                padding="max_length",
                add_special_tokens=True,
                max_length=1280,
            ).input_ids

            # File names are "<name up to first dot>_<index in subset>_chart2text_*".
            prefix = f"{save_root}/{imgname.split('.')[0]}_{idx}_chart2text"
            np.save(
                f"{prefix}_input_flattened_patches.npy",
                inputs.data['flattened_patches'].numpy(),
            )
            np.save(
                f"{prefix}_input_attention_mask.npy",
                inputs.data['attention_mask'].numpy(),
            )
            np.save(f"{prefix}_label.npy", labels.numpy())




0 comments on commit 0cac89d

Please sign in to comment.