Merge pull request #51 from namiyousef/develop

Merges develop into main for early access release
namiyousef · Apr 19, 2022 · df836ae · df836ae
2 parents a37f047 + bf92b1f
commit df836ae
Show file tree

Hide file tree

Showing 14 changed files with 3,373 additions and 1,413 deletions.
diff --git a/README.md b/README.md
@@ -89,12 +89,86 @@ Considering all combinations, the processor API provies functionality for the fo
 
 ### Data Augmentation (Adversarial Examples)
 
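+The DataProcessor is designed such that it allows easy data augmentation: custom augmenter functions (each taking a text string and returning the modified text) can be passed to `process` through its `processors` argument.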
 
 ### Evaluation
 
-# Quick Start
 - TODO
 
+# Quick Start
+
+A quick start showing how to use the DataProcessor for the AAE dataset.
+```python
+from argminer.data import ArgumentMiningDataset, TUDarmstadtProcessor
+from argminer.evaluation import inference
+from argminer.config import LABELS_MAP_DICT
+from torch.utils.data import DataLoader
+from torch.optim import Adam
+from transformers import AutoModelForTokenClassification, AutoTokenizer
+
+# set path to data source
+path = 'ArgumentAnnotatedEssay-2.0'
+
+processor = TUDarmstadtProcessor(path)
+processor = processor.preprocess()
+
+# augmenter
+def hello_world_augmenter(text):
+    text = ['Hello'] + text.split() + ['World']
+    text = ' '.join(text)
+    return text
+
+processor = processor.process('bieo', processors=[hello_world_augmenter]).postprocess()
+
+df_dict = processor.get_tts(test_size=0.3)
+df_train = df_dict['train'][['text', 'labels']]
+df_test = df_dict['test'][['text', 'labels']]
+
+df_label_map = LABELS_MAP_DICT['TUDarmstadt']['bieo']
+
+max_length = 1024
+
+# datasets
+tokenizer = AutoTokenizer.from_pretrained('google/bigbird-roberta-base', add_prefix_space=True)
+model = AutoModelForTokenClassification.from_pretrained('google/bigbird-roberta-base')
+optimizer = Adam(model.parameters())
+
+trainset = ArgumentMiningDataset(
+    df_label_map, df_train, tokenizer, max_length
+)
+testset = ArgumentMiningDataset(
+    df_label_map, df_train, tokenizer, max_length, is_train=False
+)
+
+train_loader = DataLoader(trainset)
+test_loader = DataLoader(testset)
+
+# sample training script (very simplistic, see run.py in cluster/cluster_setup/job_files for a full-fledged one)
+epochs = 1
+for epoch in range(epochs):
+    model.train()
+
+    for i, (inputs, targets) in enumerate(train_loader):
+
+        optimizer.zero_grad()
+
+        loss, outputs = model(
+            labels=targets,
+            input_ids=inputs['input_ids'],
+            attention_mask=inputs['attention_mask'],
+            return_dict=False
+        )
+
+        # backward pass
+
+        loss.backward()
+        optimizer.step()
+
+# run inference
+df_metrics, df_scores = inference(model, test_loader)
+
+```
+
 
 
 # References

diff --git a/argminer/__init__.py b/argminer/__init__.py
@@ -1,4 +1,4 @@
 import argminer
 
 
-__version__ = '0.0.17'
+__version__ = '0.1.0'
diff --git a/argminer/api/__init__.py b/argminer/api/__init__.py
diff --git a/argminer/api/specs/__init__.py b/argminer/api/specs/__init__.py
diff --git a/argminer/api/specs/api.yaml b/argminer/api/specs/api.yaml
@@ -0,0 +1,265 @@
+openapi: 3.0.5
+info:
+  version: 0.0.1
+  title: ArgMiner
+
+paths:
+  /model_info:
+    get:
+      tags:
+        - model_info
+      operationId: argminer.api.views.model_info
+      parameters:
+        - $ref: '#/components/parameters/OurModels'
+      summary: get summary information about model
+      responses:
+        200:
+          # a response entry must be a Response Object (description + content);
+          # the schema is referenced from inside its media type
+          description: Summary information about the requested model
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ModelInfoResponse'
+  /health_check:
+    get:
+      tags:
+        - health_check
+      operationId: argminer.api.views.health_check
+      summary: Check API alive
+      responses:
+        200:
+          description: 'Checks to see if API is alive'
+          #$ref: '#/components/responses/Success'
+        #500:
+        #  $ref: '#/components/responses/InternalServerError
+  /evaluate:
+    post:
+      tags:
+        - evaluate
+      operationId: argminer.api.views.evaluate
+      summary: Evaluates models on a given test set
+      parameters:
+        - $ref: '#/components/parameters/ModelName'
+        - $ref: '#/components/parameters/LabellingStrategy'
+        - $ref: '#/components/parameters/AggregationStrategy'
+        - $ref: '#/components/parameters/LabellingStrategyScope'
+        - $ref: '#/components/parameters/MaxLength'
+        - $ref: '#/components/parameters/BatchSize'
+        - $ref: '#/components/parameters/Labels'
+        #- $ref: '#/components/parameters/TextSegments'
+      requestBody:
+        $ref: '#/components/requestBodies/TextSegments'
+      responses:
+            200:
+              description: Successfully evaluated on test dataset
+              content:
+                application/json:
+                  schema:
+                    $ref: '#/components/schemas/InferenceResponse'
+            404:
+              description: HuggingFace retrieve error
+              content:
+                application/json:
+                  schema:
+                    $ref: '#/components/schemas/HuggingFaceError'
+
+
+  /predict:
+    post:
+      tags:
+        - predict
+      operationId: argminer.api.views.predict
+      summary: Predicts labels on a given sample of text
+      parameters:
+        - $ref: '#/components/parameters/OurModels'
+      requestBody:
+        $ref: '#/components/requestBodies/FreeText'
+      responses:
+        200:
+          description: Successfully predicted on a piece of text
+          content:
+              application/json:
+                schema:
+                  $ref: '#/components/schemas/PredictionResponse'
+components:
+  requestBodies:
+    TextSegments:
+      content:
+        application/json:
+          schema:
+            $ref: '#/components/schemas/TextSegments'
+    FreeText:
+      content:
+        text/plain:
+          schema:
+            type: string
+
+  parameters:
+    OurModels:
+      name: model_name
+      in: query
+      schema:
+        $ref: '#/components/schemas/OurModels'
+    Dataset:
+      name: dataset_name
+      in: query
+      schema:
+        $ref: '#/components/schemas/Dataset'
+    LabellingStrategy:
+      name: strategy
+      in: query
+      schema:
+        $ref: '#/components/schemas/LabellingStrategy'
+    AggregationStrategy:
+      name: agg_strategy
+      in: query
+      schema:
+        $ref: '#/components/schemas/AggregationStrategy'
+    ModelName:
+      name: model_name
+      in: query
+      schema:
+        $ref: '#/components/schemas/ModelName'
+      examples:
+        Ours:
+          value: ucabqfe/roberta_PER_io
+          summary: roberta on PERSUADE with io label
+        HuggingFace: # Distinct name
+          value: google/bigbird-roberta-base
+          summary: Official Bigbird model from HuggingFace
+    LabellingStrategyScope:
+      name: strategy_level
+      in: query
+      schema:
+        $ref: '#/components/schemas/LabellingStrategyScope'
+    MaxLength:
+      name: max_length
+      in: query
+      schema:
+        $ref: '#/components/schemas/MaxLength'
+    BatchSize:
+      name: batch_size
+      in: query
+      schema:
+        $ref: '#/components/schemas/BatchSize'
+    Labels:
+      name: label_map
+      in: query
+      schema:
+        $ref: '#/components/schemas/Labels'
+    TextSegments:
+      name: text_segments
+      in: query
+      schema:
+        $ref: '#/components/schemas/TextSegments'
+      explode: true
+
+  schemas:
+    Dataset:
+      type: string
+      example: AAE
+      nullable: false
+      enum: [AAE, PERSUADE]
+      description: Name of the dataset
+    LabellingStrategy:
+      type: string
+      example: io
+      enum: [io, bio, bieo, bixo]
+      nullable: false
+      description: Strategy to label words in a given text segment
+    AggregationStrategy:
+      type: string
+      example: first
+      enum: [first, mean, max]
+      nullable: false
+      description: aggregation strategy for mapping back from tokens to words
+    ModelName:
+      type: string
+      example: google/bigbird-roberta-base
+      nullable: false
+      description: name of the model to use. Can be any generic one from HuggingFace or one of our models
+    LabellingStrategyScope:
+      type: string
+      nullable: false
+      enum: [standard, wordLevel]
+      default: standard
+      description: level to apply labelling strategy at. If standard then inside subtokens labelled as I-.
+    MaxLength:
+      type: integer
+      nullable: false
+      default: 512
+      description: maximum number of tokens per passage
+    BatchSize:
+      type: integer
+      nullable: false
+      default: 32
+      description: batch size for inference
+    Labels:
+      type: array
+      # uniqueItems constrains the array itself, so it sits beside `items`
+      # rather than inside the per-item string schema
+      uniqueItems: true
+      items:
+        type: string
+        example: Other
+
+      description: Labels present in data
+    TextSegments:
+      type: array
+      items:
+        type: array
+        items:
+          type: string
+          example: "Claim:: NLP is the best ML field!"
+
+    HuggingFaceError:
+      type: object
+      properties:
+        type:
+          type: string
+          example: tokenizer
+        name:
+          type: string
+          example: model
+        error:
+          type: string
+
+    DimensionMismatchError:
+      type: object
+      properties:
+        error:
+          type: string
+        expected:
+          type: string
+        received:
+          type: string
+
+    InferenceResponse:
+      type: object
+      properties:
+
+        score_table:
+          type: string
+    OurModels:
+      type: string
+      enum: [ucabqfe/roberta_AAE_bieo, ucabqfe/roberta_AAE_bio, ucabqfe/roberta_AAE_io, ucabqfe/roberta_PER_bieo, ucabqfe/roberta_PER_bio, ucabqfe/roberta_PER_io, ucabqfe/bigBird_AAE_bieo, ucabqfe/bigBird_AAE_bio, ucabqfe/bigBird_AAE_io, ucabqfe/bigBird_PER_bieo, ucabqfe/bigBird_PER_bio, ucabqfe/bigBird_PER_io]
+      default: ucabqfe/roberta_AAE_bieo
+
+    ModelInfoResponse:
+      type: object
+      properties:
+        hugging_face_model_name:
+          type: string
+          description: name of base model used for training
+        labels:
+          type: string
+    PredictionResponse:
+      type: object
+
+
+#definitions:
+#  User:
+#    type: object
+#    properties:
+#      id:
+#        type: integer
+#        description: The user ID.
+#      username:
+#        type: string
+#        description: The user name.
+
+
+
diff --git a/argminer/api/utils.py b/argminer/api/utils.py
@@ -0,0 +1,17 @@
+from argminer.data import DataProcessor
+import pandas as pd
+def _generate_df_text_from_input(text_segments, strategy):
+    text_segments_split = [
+        (text_segment.split('::') for text_segment in doc) for doc in text_segments
+    ]
+
+
+    df = pd.concat([
+        pd.DataFrame.from_records(
+            doc,
+            columns=['label', 'text']
+        ).assign(doc_id=i) for i, doc in enumerate(text_segments_split)
+    ])
+
+    processor = DataProcessor('').from_json(status='preprocessed', df=df).process(strategy).postprocess()
+    return processor.dataframe