Skip to content

Commit

Permalink
Merge pull request #51 from namiyousef/develop
Browse files Browse the repository at this point in the history
Merges develop into main for early access release
  • Loading branch information
namiyousef authored Apr 19, 2022
2 parents a37f047 + bf92b1f commit df836ae
Show file tree
Hide file tree
Showing 14 changed files with 3,373 additions and 1,413 deletions.
76 changes: 75 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,86 @@ Considering all combinations, the processor API provies functionality for the fo

### Data Augmentation (Adversarial Examples)

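The DataProcessor is designed such that it allows easy data augmentation: custom functions that take a text and return a modified text can be passed to `process` through the `processors` argument (see the Quick Start below for an example).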

### Evaluation

# Quick Start
- TODO

# Quick Start

A quick start showing how to use the DataProcessor for the AAE dataset.
```python
from argminer.data import ArgumentMiningDataset, TUDarmstadtProcessor
from argminer.evaluation import inference
from argminer.config import LABELS_MAP_DICT
from torch.utils.data import DataLoader
from torch.optim import Adam
from transformers import AutoModelForTokenClassification, AutoTokenizer

# set path to data source
path = 'ArgumentAnnotatedEssay-2.0'

processor = TUDarmstadtProcessor(path)
processor = processor.preprocess()


# augmenter
def hello_world_augmenter(text):
    """Return `text` with 'Hello' prepended and 'World' appended as words."""
    text = ['Hello'] + text.split() + ['World']
    text = ' '.join(text)
    return text


# label the text with the 'bieo' strategy, applying the augmenter on the way
processor = processor.process('bieo', processors=[hello_world_augmenter]).postprocess()

# train/test split; only the text and label columns are needed downstream
df_dict = processor.get_tts(test_size=0.3)
df_train = df_dict['train'][['text', 'labels']]
df_test = df_dict['test'][['text', 'labels']]

df_label_map = LABELS_MAP_DICT['TUDarmstadt']['bieo']

max_length = 1024

# datasets
tokenizer = AutoTokenizer.from_pretrained('google/bigbird-roberta-base', add_prefix_space=True)
# NOTE(review): without `num_labels` the token-classification head defaults to
# 2 labels; presumably it should match the size of the label map - confirm.
model = AutoModelForTokenClassification.from_pretrained('google/bigbird-roberta-base')
optimizer = Adam(model.parameters())

trainset = ArgumentMiningDataset(
    df_label_map, df_train, tokenizer, max_length
)
# the test set must be built from the held-out split, not the training split
testset = ArgumentMiningDataset(
    df_label_map, df_test, tokenizer, max_length, is_train=False
)

train_loader = DataLoader(trainset)
test_loader = DataLoader(testset)

# sample training script (very simplistic, see run.py in cluster/cluster_setup/job_files for a full-fledged one)
epochs = 1
for epoch in range(epochs):
    model.train()

    for i, (inputs, targets) in enumerate(train_loader):

        optimizer.zero_grad()

        # with return_dict=False the model returns a tuple; loss comes first
        # because labels are supplied
        loss, outputs = model(
            labels=targets,
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            return_dict=False
        )

        # backward pass

        loss.backward()
        optimizer.step()

# run inference
df_metrics, df_scores = inference(model, test_loader)

```



# References
Expand Down
2 changes: 1 addition & 1 deletion argminer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import argminer


__version__ = '0.0.17'
__version__ = '0.1.0'
Empty file added argminer/api/__init__.py
Empty file.
Empty file added argminer/api/specs/__init__.py
Empty file.
265 changes: 265 additions & 0 deletions argminer/api/specs/api.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,265 @@
# OpenAPI description of the ArgMiner REST API.
# 3.0.5 is not a published OpenAPI version; 3.0.3 is a published 3.0.x
# release and is what validators expect.
openapi: 3.0.3
info:
  version: 0.0.1
  title: ArgMiner

# Endpoints. Each operationId names the Python view function that serves it.
# Response codes are quoted because OpenAPI requires them to be strings.
paths:
  /model_info:
    get:
      tags:
        - model_info
      operationId: argminer.api.views.model_info
      parameters:
        - $ref: '#/components/parameters/OurModels'
      summary: get summary information about model
      responses:
        "200":
          # A response must be a Response Object (description + content);
          # the schema is referenced from inside the media type.
          description: Summary information about the requested model
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ModelInfoResponse'
  /health_check:
    get:
      tags:
        - health_check
      operationId: argminer.api.views.health_check
      summary: Check API alive
      responses:
        "200":
          description: 'Checks to see if API is alive'
          #$ref: '#/components/responses/Success'
        #500:
        #  $ref: '#/components/responses/InternalServerError
  /evaluate:
    post:
      tags:
        - evaluate
      operationId: argminer.api.views.evaluate
      summary: Evaluates models on a given test set
      parameters:
        - $ref: '#/components/parameters/ModelName'
        - $ref: '#/components/parameters/LabellingStrategy'
        - $ref: '#/components/parameters/AggregationStrategy'
        - $ref: '#/components/parameters/LabellingStrategyScope'
        - $ref: '#/components/parameters/MaxLength'
        - $ref: '#/components/parameters/BatchSize'
        - $ref: '#/components/parameters/Labels'
        #- $ref: '#/components/parameters/TextSegments'
      requestBody:
        $ref: '#/components/requestBodies/TextSegments'
      responses:
        "200":
          description: Successfully evaluated on test dataset
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/InferenceResponse'
        "404":
          description: HuggingFace retrieve error
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HuggingFaceError'


  /predict:
    post:
      tags:
        - predict
      operationId: argminer.api.views.predict
      summary: Predicts labels on a given sample of text
      parameters:
        - $ref: '#/components/parameters/OurModels'
      requestBody:
        $ref: '#/components/requestBodies/FreeText'
      responses:
        "200":
          description: Successfully predicted on a piece of text
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/PredictionResponse'
# Reusable request bodies, query parameters and schemas referenced from `paths`.
components:
  requestBodies:
    # `required` defaults to false; both endpoints operate on the body, so a
    # request without one is rejected up front.
    TextSegments:
      required: true
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/TextSegments'
    FreeText:
      required: true
      content:
        text/plain:
          schema:
            type: string

  parameters:
    OurModels:
      name: model_name
      in: query
      schema:
        $ref: '#/components/schemas/OurModels'
    Dataset:
      name: dataset_name
      in: query
      schema:
        $ref: '#/components/schemas/Dataset'
    LabellingStrategy:
      name: strategy
      in: query
      schema:
        $ref: '#/components/schemas/LabellingStrategy'
    AggregationStrategy:
      name: agg_strategy
      in: query
      schema:
        $ref: '#/components/schemas/AggregationStrategy'
    ModelName:
      name: model_name
      in: query
      schema:
        $ref: '#/components/schemas/ModelName'
      examples:
        Ours:
          value: ucabqfe/roberta_PER_io
          summary: roberta on PERSUADE with io label
        HuggingFace: # Distinct name
          value: google/bigbird-roberta-base
          summary: Official Bigbird model from HuggingFace
    LabellingStrategyScope:
      name: strategy_level
      in: query
      schema:
        $ref: '#/components/schemas/LabellingStrategyScope'
    MaxLength:
      name: max_length
      in: query
      schema:
        $ref: '#/components/schemas/MaxLength'
    BatchSize:
      name: batch_size
      in: query
      schema:
        $ref: '#/components/schemas/BatchSize'
    Labels:
      name: label_map
      in: query
      schema:
        $ref: '#/components/schemas/Labels'
    TextSegments:
      name: text_segments
      in: query
      schema:
        $ref: '#/components/schemas/TextSegments'
      explode: true

  schemas:
    Dataset:
      type: string
      example: AAE
      nullable: false
      enum: [AAE, PERSUADE]
      description: Name of the dataset
    LabellingStrategy:
      type: string
      example: io
      enum: [io, bio, bieo, bixo]
      nullable: false
      description: Strategy to label words in a given text segment
    AggregationStrategy:
      type: string
      example: first
      enum: [first, mean, max]
      nullable: false
      description: aggregation strategy for mapping back from tokens to words
    ModelName:
      type: string
      example: google/bigbird-roberta-base
      nullable: false
      description: name of the model to use. Can be any generic one from HuggingFace or one of our models
    LabellingStrategyScope:
      type: string
      nullable: false
      enum: [standard, wordLevel]
      default: standard
      description: level to apply labelling strategy at. If standard then inside subtokens labelled as I-.
    MaxLength:
      type: integer
      nullable: false
      default: 512
      description: maximum number of tokens per passage
    BatchSize:
      type: integer
      nullable: false
      default: 32
      description: batch size for inference
    Labels:
      type: array
      items:
        type: string
        example: Other
      uniqueItems: true

      description: Labels present in data
    # Outer array: documents; inner array: "<label>:: <text>" segments of one
    # document (see the example below).
    TextSegments:
      type: array
      items:
        type: array
        items:
          type: string
          example: "Claim:: NLP is the best ML field!"

    HuggingFaceError:
      type: object
      properties:
        type:
          type: string
          example: tokenizer
        name:
          type: string
          example: model
        error:
          type: string

    DimensionMismatchError:
      type: object
      properties:
        error:
          type: string
        expected:
          type: string
        received:
          type: string

    InferenceResponse:
      type: object
      properties:

        score_table:
          type: string
    OurModels:
      type: string
      enum: [ucabqfe/roberta_AAE_bieo, ucabqfe/roberta_AAE_bio, ucabqfe/roberta_AAE_io, ucabqfe/roberta_PER_bieo, ucabqfe/roberta_PER_bio, ucabqfe/roberta_PER_io, ucabqfe/bigBird_AAE_bieo, ucabqfe/bigBird_AAE_bio, ucabqfe/bigBird_AAE_io, ucabqfe/bigBird_PER_bieo, ucabqfe/bigBird_PER_bio, ucabqfe/bigBird_PER_io]
      default: ucabqfe/roberta_AAE_bieo

    ModelInfoResponse:
      type: object
      properties:
        hugging_face_model_name:
          type: string
          description: name of base model used for training
        labels:
          type: string
    PredictionResponse:
      type: object

#definitions:
# User:
# type: object
# properties:
# id:
# type: integer
# description: The user ID.
# username:
# type: string
# description: The user name.



17 changes: 17 additions & 0 deletions argminer/api/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from argminer.data import DataProcessor
import pandas as pd
def _generate_df_text_from_input(text_segments, strategy):
    """Turn raw ``'<label>::<text>'`` segments into a processed dataframe.

    Args:
        text_segments: iterable of documents, where each document is an
            iterable of strings of the form ``'<label>::<text>'``.
        strategy: labelling strategy, forwarded to ``DataProcessor.process``.

    Returns:
        The ``dataframe`` attribute of the ``DataProcessor`` after ``process``
        and ``postprocess`` have run on the parsed segments.

    Raises:
        ValueError: if a segment does not contain the ``'::'`` separator.
    """
    doc_frames = []
    for doc_id, doc in enumerate(text_segments):
        records = []
        for text_segment in doc:
            # Split on the FIRST '::' only: the text itself may contain '::',
            # and an unbounded split would produce more than two fields.
            label, separator, text = text_segment.partition('::')
            if not separator:
                raise ValueError(
                    f"text segment {text_segment!r} in document {doc_id} "
                    "is missing the '::' label separator"
                )
            records.append((label, text))

        doc_frames.append(
            pd.DataFrame.from_records(
                records,
                columns=['label', 'text']
            ).assign(doc_id=doc_id)
        )

    df = pd.concat(doc_frames)

    processor = DataProcessor('').from_json(status='preprocessed', df=df).process(strategy).postprocess()
    return processor.dataframe
Loading

0 comments on commit df836ae

Please sign in to comment.