add batch_processor

DFKI-NLP · Oct 18, 2023 · b7ef657 · b7ef657
1 parent 2e4c4d8
commit b7ef657
Show file tree

Hide file tree

Showing 12 changed files with 732 additions and 1 deletion.
diff --git a/batch_processor/README.md b/batch_processor/README.md
@@ -0,0 +1,39 @@
+# batch processor
+Tool for processing CSV files with an LLM.
+### Usage
+Help:
+```
+$ python batch_processor.py -h
+usage: batch_processor [-h] [--api API] [-i INPUT_FILE] [-o OUTPUT_FILE] [-ic INPUT_COLUMN] [-oc OUTPUT_COLUMN] [-p PARAMETERS]
+
+Process a csv data in api
+
+options:
+  -h, --help            show this help message and exit
+  --api API             API URL;
+  -i INPUT_FILE, --input_file INPUT_FILE
+                        input file path; defaults to 'input.csv'
+  -o OUTPUT_FILE, --output_file OUTPUT_FILE
+                        output file path; defaults to 'output.csv'
+  -ic INPUT_COLUMN, --input_column INPUT_COLUMN
+                        name of the input data column; defaults to 'input'
+  -oc OUTPUT_COLUMN, --output_column OUTPUT_COLUMN
+                        name of the output data column; defaults to 'output'
+  -p PARAMETERS, --parameters PARAMETERS
+                        parameters for the model; defaults to 'parameters.json'
+```
+Each entry in the INPUT_COLUMN of INPUT_FILE is sent to the API.
+
+OUTPUT_FILE is a copy of an input file with a new column OUTPUT_COLUMN (default = 'output') which contains API responses. 
+
+Make sure to set the maximum number of new tokens in the parameters file (see `-p`) as needed. 100 tokens is at most about one paragraph.
+
+If the input file already has an OUTPUT_COLUMN column, only rows with an empty output are processed. Processing may be interrupted (via CTRL+C on UNIX) at any point and continued later by using the output file as the input.
+
+Example usage
+```
+$ python3.11 batch_processor.py -i output.csv
+processing output.csv: 100%|########################################| 60/60 [00:00<00:00, 673.91rows/s]
+Processed 60 rows in 0.08912 seconds. (673.3 rows/second)
+Saved results into output.csv
+```
diff --git a/batch_processor/batch_processor.py b/batch_processor/batch_processor.py
@@ -0,0 +1,68 @@
+import requests
+import argparse
+import pandas as pd
+import json
+from tqdm import tqdm
+
+def call_llm(input_str, params, api_url):
+    body = {'inputs': input_str, 'parameters': params}
+    x = requests.post(api_url, json = body)
+    x.raise_for_status()  # raises exception when not a 2xx response
+    if x.status_code != 204:
+        return x.json()
+
+
+def batch_process(args):
+
+    df = pd.read_csv(args.input_file, dtype=str)
+    if args.input_column not in df.columns:
+        raise ValueError(f"Input csv must have an {args.input_column} column")
+    # Create a column if it doesn't exist already
+    df[args.output_column] = df.get(args.output_column, None)
+
+    with open(args.parameters, 'r') as f:
+        params = json.load(f)
+
+    processed = 0
+    try:
+        with tqdm(total=df.shape[0], ascii=True, desc=f"processing {args.input_file}", unit="rows", position=0, leave = True) as pbar:
+            for i, row in enumerate(df.itertuples()):
+                #Skip rows which already have an output
+                if not pd.isnull(row.__getattribute__(args.output_column)):
+                    pbar.update()
+                    continue 
+                input_str = str(row.__getattribute__(args.input_column))
+                resp = call_llm(input_str, params, args.api)
+                # print(resp)
+                df.loc[row.Index, args.output_column] = resp['generated_text']
+                df.to_csv(args.output_file, index=False)
+                processed += 1
+                pbar.update()
+    except KeyboardInterrupt:
+        pbar.close()
+
+    timer = pbar.format_dict["elapsed"]
+    print(f"Processed {processed} rows in {timer:.4} seconds. ({processed/timer:.4} rows/second)")
+    print(f"Saved results into {args.output_file}")
+
+
+if __name__ == "__main__":
+    ap = argparse.ArgumentParser(prog="batch_processor",
+                                 description="Process a csv data in api")
+
+    ap.add_argument("--api", type=str, help= "API URL")
+    ap.add_argument("-i", "--input_file", default="input.csv", type=str, help = "input file path; defaults to 'input.csv'")
+    ap.add_argument("-o", "--output_file", default="output.csv", type=str, help = "output file path; defaults to 'output.csv'")
+    ap.add_argument("-ic", "--input_column", default="input", type=str, help = "name of the input data column; defaults to 'input'")
+    ap.add_argument("-oc", "--output_column", default="output", type=str, help = "name of the output data column; defaults to 'output'")
+
+    # #params
+    # ap.add_argument("--max_new_tokens", type=int, default=100, help = "maximum model return size in tokens; defaults to 100")
+    # ap.add_argument("--temperature", type=float, default=1., help = "model output temperature between 0 and 2, defines how random is output; defaults to 1")
+    ap.add_argument("-p", "--parameters", default="parameters.json", type=str, help = "parameters for the model; defaults to 'parameters.json'" )
+    args = ap.parse_args()
+
+    try:
+        batch_process(args)
+    except KeyboardInterrupt:
+        pass
diff --git a/batch_processor/example_data/leam_llm.csv b/batch_processor/example_data/leam_llm.csv
@@ -0,0 +1,266 @@
+Programm alignment,input,count
+Pre,"Schreibe eine kurze Zusammenfassung dieser Tweets auf deutsch:Heute im Background:  #KI-Aufsicht: Spanien geht mit neuer Behörde voran | #LEAM-Studie: Der Weg zu großen KI-Modellen in Deutschland | Verfehlter #Netzausbau: Harte Sanktionen für die Mobilfunker? | Hier anmelden und lesen:
+----------
+Große KI-Modelle in Deutschland? Mit #LEAM ein innovatives KI-Ökosystem schaffen und in leistungsfähige Infrastruktur investieren. Hierdurch soll die digitale Souveränität sowie Wettbewerbsfähigkeit Deutschlands sichergestellt werden ➡ http://bit.ly/3GYVEDO
+----------
+Ein Artikel der 
+@faznet
+ zu unserer heutigen #LEAM Konferenz. 
+
+https://m.faz.net/aktuell/wirtschaft/grosse-ki-modelle-wie-die-deutsche-antwort-auf-chat-gpt-entstehen-soll-18624692.amp.html",3
+Begrüßung,"Schreibe eine kurze Zusammenfassung dieser Tweets auf deutsch:Today at the #LEAM conference in #Berlin. Let’s talk about large European Ai model / foundation models and how to realize it. With 
+@ki_verband
+
+@MMerantix
+
+@DFKI
+  - more on http://LEAM.Ai",1
+"Keynote | Prof. Dr. Hans Uszkoreit (Scientific Director, DFKI)","Schreibe eine kurze Zusammenfassung dieser Tweets auf deutsch:Wir freuen uns sehr über die ⁦
+@BMWK
+⁩ geförderte #Leam Studie des ⁦
+@ki_verband
+⁩ zu Großen #KI-Modellen für DEU und EU: Wir wollen die Grundlage für erfolgreiche KI-(Geschäfts-)Modelle legen, Innovationen fördern und Startups u KMU für die Datenökonomie rüsten. 1/2",1
+Wo steht Deutschland im internationalen Vergleich? | Paneldiskussion,"Schreibe eine kurze Zusammenfassung dieser Tweets auf deutsch:350 Millionen Euro würde ein Rechenzentrum für große #KI Modelle kosten. ""Das entspricht 50km Autobahn"", sagt Hans Uszkoreit vom 
+@DFKI
+ Noch besser ist der Vergleich beim Energieverbrauch #LEAM
+----------
+Aus der Reihe ""Orte, an die mich mein Job führt"": Allianz Forum, Präsentation der ##Leam Machbarkeitsstudie zu Large European AI models und KI-Foundation Modellen. Ich bin so gespannt, was da noch auf uns wartet nach #gpt3!
+----------
+Wo steht #Deutschland / #EU bei #KI im internationalen Vergleich? 
+Kurz: ziemlich weit hinter USA und China. 
+Die Herausforderungen sind vielfältig und komplex sagt 
+@nicolebuettner
+. #Zusammenarbeit ist wichtiger als Wettbewerb. 
+Achtung: ohne #Rechenpower geht es nicht. 
+#LEAM
+----------
+Die auch hier bei #LEAM jetzt mehrfach zitierte #KIHalluzination Merkel arbeite jetzt für McKinsey stammt übrigens von Youchat 
+@YouSearchEngine
+ der sonst hochinteressanten Antwort eines Deutschen auf #ChatGPT Mehr dazu ⏬
+----------
+Heute übergibt der 
+@ki_verband
+  die #Machbarkeitsstudie für große KI-Modelle für Deutschland und Europa. Die Initiative #LEAM – Large European #AIModels – befasst sich mit dem Aufbau eines Hochleistungsrechenzentrums für  #KünstlicheIntelligenz. #LLM
+----------
+The #LEAM feasibility study is out now❗On behalf of BMWK, we have been working with our partners to gather arguments for developing AI foundation models in Europe. At the conference, 
+@nicolebuettner
+ explains how we can move Europe forward in the international AI competition🚀
+----------
+Bei der #LEAM-Konferenz des 
+@ki_verband
+ geht es heute um #KI-Foundation-Modelle, wie den #ChatGPT von 
+@OpenAI
+. 🤖 Wie kann sich 🇩🇪 im internat. Vergleich behaupten & welche 🇪🇺 Infrastrukturen brauchen wir dafür? Antworten in der 
+@BMWK
+-LEAM-Studie: https://bit.ly/3H6KAVa 🚀",7
+Was braucht die Industrie auf dem Weg zum Weltmarktführer? | Paneldiskussion,"Schreibe eine kurze Zusammenfassung dieser Tweets auf deutsch:Michael Koch, Director of Data Analytics & Artificial Intelligence 
+@lufthansa
+ at #LEAM: for productive use we always will have to customize foundations models with our own data. Could knowledge graphs be the best way to enable domain customization? 
+@coreonapp
+----------
+Large European AI Models, LEAM - Ambrosys takes part. Dr. Markus Abel and Thomas Seidler share their ideas.
+----------
+@schnabelu
+ von 
+@DIEZEIT
+: ‚Fragen die CEOs nicht: „was kümmert es uns, ob wir von einem dt. oder US KI-Modell abhängig sind? Wir sind doch ein internationales Unternehmen!“‘
+Marion Legler von Bayer: „wir müssen mit eigenen Daten trainieren können - können wir dort nicht“ #LEAM",3
+Wie können uns Sprachmodelle den Alltag und die Arbeit erleichtern? | Paneldiskussion,"Schreibe eine kurze Zusammenfassung dieser Tweets auf deutsch:„Wenn man Grundlagenforschung macht, darf man die Leute nicht jeden Monat nach den konkret nutzbaren Fortschritten fragen“ Claudia Pohlink, Chief Expert Data bei der Deutschen Bahn, auf der #LEAM Konferenz des 
+@ki_verband
+
+
+#KI #AI
+----------
+#LEAM panel on language models in daily use. 
+@j_bubenzer
+, 
+@hansuz
+, 
+@gdm3000
+, Thomas Lindemann from 
+@fyrfeed
+. Customer don’t know what innovative products can do for them. Focus on short term gains prevents innovation.
+----------
+Nicolas Flores-Herr Head of Business Area Document Analytics 
+@eis_bonn
+: the real disruption of models like #ChatGPT is, that white collar workers can now “talk with data”. #LEAM",3
+Post,"Schreibe eine kurze Zusammenfassung dieser Tweets auf deutsch:My favorite slide at today's #leam conference im Berlin! Great comparison at the cost side. Overall, great conference and inspiring get-up-and-go mood towards leveraging the benefits of large language models in DE! Definitely a must to follow this exciting initiative 
+@ki_verband
+----------
+That was a lot of fun and EXTREMELY insightful. Thanks 
+@ki_verband
+ for the LEAM Konferenz!
+
+#aimadeingermany #leam #germany
+----------
+This is cool! #leam 
+@ki_verband
+----------
+Gestern haben wir gemeinsam mit allen Autor:innen die Studie ""Große KI-Modelle für Deutschland"" der #LEAM Initiative in Berlin vorgestellt. Eine Erkenntnis: Um international mithalten zu können, brauchen wir leistungsfähige KI-Rechenzentren in DE. ⬇
+https://eco.de/presse/machbarkeitsstudie-ermittelt-voraussetzungen-fuer-ki-anwendungen-wie-beispielsweise-chatbots-in-deutschland/
+----------
+👉 Die Initiative #LEAM strebt den Aufbau einer europäischen KI-Infrastruktur mit einem Hochleistungsrechenzentrum in Deutschland an. Das Investitionsvolumen wird auf 350 bis 400 Millionen Euro geschätzt. 🧵
+----------
+💡Did you know that such advanced foundation models for video generation are not yet available in Europe? To change that the #LEAM initiative aims to create a dedicated #supercomputing infrastructure in 🇩🇪.
+🔎 Curious? You can find the complete study here: https://iais.fraunhofer.de/de/publikationen/studien.html
+----------
+👉 The #LEAM initiative aims to establish a European AI infrastructure with a high-performance computing center in Germany. The investment volume is estimated at 350 to 400 million euros.
+----------
+Die #LEAM-Machbarkeitsstudie ist da. https://leam.ai
+----------
+Heard on a #LEAM panel on transformative #AI with #DAX30 companies: ""We're not software companies, we don't do that. Someone else should do that for us."" Wake up! Companies that don't do #software and #AI will be replaced by companies who do! Independent of the industry!
+----------
+Bei der #LEAM Konferenz ging es um #KI und die Wettbewerbsfähigkeit Europas. Zukunftsträchtige KI-Technologien wie 
+@OpenGPTX
+
+(großes Sprachmodell, #gaiax) brauchen zum KI-Training enorme Rechnerkapazitäten. Startschuss für ein großes KI-Rechenzentrum in Deutschland? 
+
+#ai #gaiax
+----------
+#ChatGPT ist in aller Munde, ist aber nur die Spitze des Eisbergs. Uns beschäftigt #KI in der Verwaltung und damit auch #DigitaleSouveränität & Datensicherheit. Mehr dazu in der Machbarkeits #Studie, die auf der #LEAM Konferenz vorgestellt wurde.
+----------
+LEAM Machbarkeitsstudie: Experten der Initiative #LEAM  (Large European AI Models) des 
+@ki_verband
+, i. A. des 
+@BMWK
+,  sprechen sich deutlich für die Entwicklung von #KI-Grundlagenmodellen nach europäischen Standards aus.
+----------
+Braucht Deutschland eigene KI-Modelle à la #ChatGPT? Die #LEAM-Initiative aus Industrie und Wissenschaftlern fordert ein neues Rechenzentrum und mehr – 
+@AlaArmbruster
+
+@CarstenKnop
+ und 
+@MaxosTaxos
+ diskutieren darüber im neuen #DigitecPodcast:
+----------
+Braucht Deutschland eigene KI-Modelle à la #ChatGPT? Die #LEAM-Initiative aus Industrie und Wissenschaftlern fordert ein neues Rechenzentrum und mehr – 
+@AlaArmbruster
+
+@CarstenKnop
+ und 
+@MaxosTaxos
+ diskutieren darüber im neuen #DigitecPodcast:
+----------
+Braucht Deutschland eigene KI-Modelle à la #ChatGPT? Die #LEAM-Initiative aus Industrie und Wissenschaftlern fordert ein neues Rechenzentrum und mehr – 
+@AlaArmbruster
+
+@CarstenKnop
+ und 
+@MaxosTaxos
+ diskutieren darüber im neuen #DigitecPodcast:
+----------
+Dank an 
+@hannesbajohr
+ für diesen großartigen Beitrag! Er macht deutlich, wie tiefgreifend generative KI-Systeme unsere Kommunikation verändern werden - mit  Gefahr der Konvergenz zu einer ""Durchschnittssprache"". Initiativen wie #leam sind wichtiger denn je!
+----------
+Unser Newsletter ist da! 📨🤖
+
+Themen sind:
+👉#KICampus 2.0
+👉#ChatGPT-Livestream
+👉#UFFestival mit #KI-Track
+👉#Fellowship-Sammelband
+👉#Curriculum-Barcamp
+👉#LEAM-Konferenz 
+
+Jetzt lesen & gerne weiterempfehlen:
+➡️http://mailchi.mp/ki-campus/januar-2023
+
+#TwitterCampus #DigitaleBildung #OER
+----------
+Hörenswerte #podcast Folge zu #GPT3 
+und die Frage,ob D eigene (große #KI-Modelle benötigt?
+#Digitec 
+@faznet
+ mit 
+@carstenknop
+ und 
+@AlaArmbruster
+
+➡️Eine Antwort könnte #LEAM 
+@ki_verband
+ sein. 
+➡️Eine weitere ist meiner Meinung nach frühzeitige KI-Bildung:
+----------
+ChatGPT writing somethin about European AI.
+
+#chatgpt #leam #artificialintelligence
+----------
+Was great to be a panel speaker at the
+#LEAM Conference
+at the Brandenburg Gate in Berlin
+
+Europe needs large AI models, but we need to make sure they are built responsibly.
+
+Exciting discussions/talks by 
+@hansuz
+
+@nicolebuettner
+
+@percyliang
+ (prerecorded) 
+@Breitseiten
+ + many more!
+----------
+SAP takes part in LEAM - Large European AI Models
+initiative from the German AI Association to research and promote the development of large AI models  #ai #development #sap #research #llms #chatGPT #nlp
+----------
+Replying to 
+@max_fksr
+wann gibt's denn ein Konzept für das #Digitalbudget, damit Initiativen wie #LEAM für eine deutsche/europäische Alternative unterstützt werden können?
+----------
+“We are now in the #industrialization phase of #AI.” “We will now grow from a 4% AI adaption rate to 85% [until End ‘24]”
+“#Foundationmodels will enable this”
+
+Thomas Weber, AI chief product of 
+@huawei
+ on #WAICF23 
+
+@JonasAndrulis
+
+@ki_verband
+ #LEAM 
+@RonjaKemmer
+ #AI
+----------
+Stehe ich dahinter. Wenn der Staat seine eigene Angst vor Wagniskapital verliert, dann können wir hier in Deutschland große Dinge tun.
+Mein Favorit: Große #ki Foundation Modelle im Sinne von #leam um die deutsche und europäische Technologiesouveränität zu gewährleisten!
+----------
+Replying to 
+@_SilkeHahn
+
+@JonasAndrulis
+ and 
+@tarnkappe_info
+Wie geschrieben ""vorerst"" - beim Thema ""Intelligent"" sind wir noch nicht und da müssen andere Ansätze her. Wenn wir aber weiter Machbarkeitsstudien #LEAM durchführen müssen um Geld zu bekommen, dämpft das meinen Optimismus.
+----------
+Replying to 
+@didijo
+
+@JonasAndrulis
+ and 
+@tarnkappe_info
+#LEAM kommt um Jahre zu spät damit um die Ecke. Die Idee mag fantastisch sein und ja, Deutschland bräuchte das. Aber eine Machbarkeitsstudie (!) jetzt, wo die anderen schon ausrollen zum Abräumen: wie behäbig. Bis das brummt, ist der Rest der Welt schon um Lichtjahre weiter.
+----------
+Replying to 
+@_SilkeHahn
+
+@didijo
+ and 2 others
+Im besten Fall sind das keine konkurrierenden sondern komplementären Dinge. Auf der #LEAM Konferenz wurden fantastische Projekte vorgestellt in denen mit Luminous die industrielle Revolution souverän wird. #LEAM bringt unser AI-Leadership zusammen & teilt Information. 1/2
+----------
+Schön zu sehen dass unsere Bemühungen das 🇪🇺 #ki Ökosystem anzukurbeln auch auf politischer Ebene gesehen wird. Danke an 
+@Breitseiten
+, 
+@ki_verband
+ für die Unterstützung von #leam aber auch 
+@JonasAndrulis
+ und 
+@RichardSocher
+ für ihre Vorreiterrollen
+----------
+Last month the 
+@ki_verband
+ officially presented the #LEAM feasibility study that has received much acclaim. You can now re-watch the event in full via Youtube below! 👀
+----------
+nteresting initiative in the EU about AI (LEAM = Large European AI Models) #ai #leam #europeanaistrategy #aicommunity #aiawareness #politics #innovationleader #innovationsleadership",30