Skip to content

Commit

Permalink
Merge branch 'feature-modelbuild' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
bbengfort committed Aug 17, 2016
2 parents 2fb7a5f + 61fca20 commit bc93eae
Show file tree
Hide file tree
Showing 7 changed files with 248 additions and 3 deletions.
22 changes: 22 additions & 0 deletions arbiter/migrations/0002_estimator_corpus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.9.7 on 2016-08-17 00:51
from __future__ import unicode_literals

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    """
    Adds a nullable ``corpus`` foreign key to the ``estimator`` model that
    points at ``corpus.Corpus``.
    """

    # The corpus app migration that creates the Corpus model must be applied
    # first, along with the initial arbiter migration.
    dependencies = [
        ('corpus', '0003_corpus'),
        ('arbiter', '0001_initial'),
    ]

    operations = [
        migrations.AddField(
            model_name='estimator',
            name='corpus',
            # Optional relationship (blank/null, default None) with CASCADE
            # as the on_delete handler.
            field=models.ForeignKey(blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, to='corpus.Corpus'),
        ),
    ]
1 change: 1 addition & 0 deletions arbiter/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ class Estimator(TimeStampedModel):
estimator = PickledObjectField(**nullable) # The pickled object model
build_time = models.DurationField(**nullable) # The amount of time it took to build
owner = models.ForeignKey('auth.User', **nullable) # The owner, if any, of the model
corpus = models.ForeignKey('corpus.Corpus', **nullable) # The corpus the estimator was trained on

This comment has been minimized.

Copy link
@bbengfort

bbengfort Aug 17, 2016

Author Member

@lauralorenz one part of the spec was to ensure that an estimator could look up what documents it was trained on - here is the key that provides a relationship from the estimator to a corpus (the collection of documents).


class Meta:
db_table = "estimators"
Expand Down
3 changes: 2 additions & 1 deletion corpus/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
##########################################################################

from django.contrib import admin
from corpus.models import Document, Annotation, Label
from corpus.models import Document, Annotation, Label, Corpus

##########################################################################
## Register Admin
Expand All @@ -27,3 +27,4 @@
# Expose every corpus model in the admin site with the default options.
admin.site.register([Label, Annotation, Document, Corpus])
27 changes: 26 additions & 1 deletion corpus/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
##########################################################################

from django.db import models

from django.apps import apps

##########################################################################
## Annotation Manager
Expand All @@ -37,3 +37,28 @@ def democratic(self):
Filters the annotations for only democratic annotations.
"""
return self.filter(label__slug='democratic')


##########################################################################
## Corpus Manager
##########################################################################

class CorpusManager(models.Manager):
    """
    Manager for corpus objects that adds a per-user construction helper.
    """

    def create_for_user(self, user, **kwargs):
        """
        Builds and returns a new corpus owned by the given user, populated
        with every document that the user has annotated so far. Any extra
        keyword arguments are passed through as fields of the new corpus.

        NOTE: documents whose label is None may be part of the corpus, and
        a document's label can still change after the corpus is created.
        """
        # Resolve the document model by name at call time rather than
        # importing it at module level.
        document_model = apps.get_model('corpus.Document')

        # The given user always wins over any 'user' passed in kwargs.
        fields = dict(kwargs, user=user)
        corpus = self.create(**fields)

        # Attach every document that carries an annotation by this user.
        annotated = document_model.objects.filter(annotations__user=user)
        corpus.documents.set(annotated)

        return corpus
38 changes: 38 additions & 0 deletions corpus/migrations/0003_corpus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.9.7 on 2016-08-17 00:51
from __future__ import unicode_literals

import autoslug.fields
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
import model_utils.fields


class Migration(migrations.Migration):
    """
    Creates the ``Corpus`` model, stored in the ``corpora`` table.
    """

    dependencies = [
        # Needed because the user foreign key targets AUTH_USER_MODEL.
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
        ('corpus', '0002_auto_20160802_1030'),
    ]

    operations = [
        migrations.CreateModel(
            name='Corpus',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                # Timestamp fields supplied by model_utils.
                ('created', model_utils.fields.AutoCreatedField(default=django.utils.timezone.now, editable=False, verbose_name='created')),
                ('modified', model_utils.fields.AutoLastModifiedField(default=django.utils.timezone.now, editable=False, verbose_name='modified')),
                # The title is optional; the unique slug is populated from it.
                ('title', models.CharField(blank=True, default=None, max_length=255, null=True)),
                ('slug', autoslug.fields.AutoSlugField(editable=False, populate_from='title', unique=True)),
                # Both relations are reachable in reverse as ``corpora``.
                ('documents', models.ManyToManyField(related_name='corpora', to='corpus.Document')),
                ('user', models.ForeignKey(blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='corpora', to=settings.AUTH_USER_MODEL)),
            ],
            options={
                'db_table': 'corpora',
                'get_latest_by': 'created',
                # Newest corpora first by default.
                'ordering': ['-created'],
            },
        ),
    ]
63 changes: 62 additions & 1 deletion corpus/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@
from django.core.urlresolvers import reverse
from model_utils.models import TimeStampedModel
from picklefield.fields import PickledObjectField
from corpus.managers import AnnotationManager
from corpus.managers import AnnotationManager, CorpusManager

from operator import itemgetter

##########################################################################
## Document Model
Expand Down Expand Up @@ -53,6 +55,31 @@ class Meta:
get_latest_by = "created"
unique_together = ("long_url", "short_url")

def label(self, user=None):
    """
    Computes the label for this document.

    If a user is specified, returns the label of that user's annotation,
    or None if the user has not annotated the document. Otherwise returns
    the majority voted label across all annotators, or None if there are
    no votes or if the highest vote count is shared by several labels.
    """
    # If a user is supplied, only their annotation is considered.
    if user is not None:
        annotation = self.annotations.filter(user=user).first()
        return annotation.label if annotation else None

    # Otherwise aggregate the votes per label for this document.
    # TODO: Add annotator agreement logic here!
    labels = self.labels.annotate(votes=models.Count('id'))
    ranked = sorted(
        ((label, label.votes) for label in labels),
        key=itemgetter(1), reverse=True
    )

    if not ranked:
        return None

    # A tie only matters between the leading labels: a lone label wins
    # outright, and labels trailing the leaders cannot break their tie.
    if len(ranked) > 1 and ranked[0][1] == ranked[1][1]:
        return None

    return ranked[0][0]

def get_absolute_url(self):
"""
Returns the detail view url for the object
Expand Down Expand Up @@ -120,3 +147,37 @@ def __str__(self):
##########################################################################
## Corpus Model
##########################################################################

class Corpus(TimeStampedModel):

This comment has been minimized.

Copy link
@bbengfort

bbengfort Aug 17, 2016

Author Member

@lauralorenz The collection of documents that an estimator is trained on. Can be tied to a user or not. Was thinking of making the Many2Many relationship between corpora and documents through a model called "LabeledDocuments" so that we could fix a specific annotation to a document at the time of training as well as eliminate null labels from the training process. What do you think?

This comment has been minimized.

Copy link
@lauralorenz

lauralorenz Aug 17, 2016

I agree especially re: fixing annotations to protect model reproducibility and detail since labels can change underneath the model over time. I was actually originally concerned with the content possibly changing too, is that possible? Can/should documents get redownloaded and reprocessed?

For removing null labels, obviously advantageous against supervised estimators which is all this is used against now but do we want flexibility for clustering against labelless or label-sparse corpora in this app?

This comment has been minimized.

Copy link
@lauralorenz

lauralorenz Aug 17, 2016

PS @bbengfort ^ probs should tag you on these for email notifications eh

This comment has been minimized.

Copy link
@bbengfort

bbengfort Aug 17, 2016

Author Member

Hehe, thanks @lauralorenz -- email notifications are good!

If we're going to be doing clustering or unsupervised methods - the estimator would simply omit the labels; but I take your point, we'd still need to be able to create a corpus that allowed documents to be added with a null label. So in that case, the spec for the LabeledDocuments M2M relationship will be that the label is nullable?

I'll go ahead and add that to the Issue.

This comment has been minimized.

Copy link
@bbengfort

bbengfort Aug 17, 2016

Author Member

@lauralorenz As for content changing -- well that's a good question. Currently the URL is unique in the Document model, and the signal that does the loading and preprocessing checks to make sure that the content is null - so things as they stand will not allow the redownloading/processing of a link from the first time it was accessed. I'll send a line note to the section where this occurs.

This comment has been minimized.

Copy link
@lauralorenz

lauralorenz Aug 17, 2016

@bbengfort ok got it, and saw the other line notes and that makes sense. We'll have to keep it this way or provide some other versioning structure if we ever want to allow any sort of extra (dare we even consider human??) content post-processing, but preserve estimator reproducibility.

"""
A model that maintains a mapping of documents to estimators for use in
tracking the training data that is used to fit a text classifier object.
"""

title = models.CharField(max_length=255, **nullable)
slug = AutoSlugField(populate_from='title', unique=True)
documents = models.ManyToManyField('corpus.Document', related_name='corpora')
user = models.ForeignKey('auth.User', related_name='corpora', **nullable)

objects = CorpusManager()

class Meta:
db_table = "corpora"
get_latest_by = "created"
ordering = ["-created"]
verbose_name = "corpus"
verbose_name_plural = "corpora"

def __str__(self):
    """
    Returns the title if the corpus has one, otherwise a generated
    description made of the document count, the creation date and, when
    the corpus has a user, that user.
    """
    # An explicit title always takes precedence.
    if self.title:
        return self.title

    # Assemble the descriptive string piece by piece.
    parts = [
        "{} document corpus created on {}".format(
            self.documents.count(), self.created.strftime("%Y-%m-%d")
        )
    ]

    if self.user:
        parts.append(" by {}".format(self.user))

    return "".join(parts)
97 changes: 97 additions & 0 deletions corpus/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,103 @@ def tagged(self, **kwargs):
]


##########################################################################
## Django Query Corpus Reader
##########################################################################

class QueryCorpusReader(object):
    """
    Provides the corpus loader with an interface to a query that returns a
    collection of documents. The reader takes in a query that yields a list
    of documents and modifies it such that it is only fetching the
    preprocessed content in a streaming fashion.
    """

    def __init__(self, query, user=None):
        """
        Pass in a QuerySet or Query object for selecting a group of documents.
        Can also optionally pass in a user to determine labeling scheme.
        """
        self.user = user
        self.query = query

    def fileids(self, categories=None):
        """
        Returns the primary keys of the documents that make up this corpus,
        or of the documents in the given category(s) if specified.
        Categories can be either a single string or a list of strings, using
        the same string form that ``categories()`` reports (so the string
        'None' selects documents without a label).

        NOTE: filtering by category computes the label of every document in
        Python, which is inefficient in terms of database queries.
        """
        # If categories is None, return all fileids.
        if categories is None:
            return self.query.values_list('id', flat=True)

        # Convert to a list if a singleton is passed
        if isinstance(categories, str):
            categories = [categories]

        # Convert to a quick lookup data structure
        categories = set(categories)

        # Manually loop through all documents (bummer)
        matched = []
        for doc in self.query:
            label = doc.label(self.user)
            # categories() reports labels as strings, so match on the string
            # form; label objects passed in directly are still accepted.
            if label in categories or str(label) in categories:
                matched.append(doc.id)
        return matched

    def categories(self, fileids=None):
        """
        Returns a list of the distinct categories (labels converted to
        strings) defined for this corpus, or for the given file(s) if
        specified. Fileids can be either a list of integers or a single
        integer. The order of the returned list is not defined.

        NOTE: computes the label of every selected document in Python.
        """
        docs = self.query

        if fileids is not None:
            # Convert to a list if a singleton is passed
            if isinstance(fileids, int):
                fileids = [fileids]
            docs = docs.filter(id__in=fileids)

        # HACK: use a unique query on the database
        return list({str(doc.label(self.user)) for doc in docs})

    def tagged(self, fileids=None, categories=None):
        """
        Yields the items of the content of each selected document, one at a
        time. This is the primary access point for the model build process.

        Documents are selected by fileids (a list of integers or a single
        integer); categories are only consulted when no fileids are given.
        Documents whose content is None are skipped.
        """
        if fileids is None:
            fileids = self.fileids(categories)

        if isinstance(fileids, int):
            fileids = [fileids]

        # Only fetch the content column for the selected documents.
        contents = self.query.filter(id__in=fileids).values_list(
            'content', flat=True
        )

        for content in contents:
            # Content is nullable until the document has been processed.
            if content is None:
                continue

            for para in content:
                yield para


##########################################################################
## Django Corpus Model Reader
##########################################################################

class CorpusModelReader(QueryCorpusReader):
    """
    A query corpus reader specifically for a corpus model: it takes a corpus
    object and automatically references the documents of that corpus.
    """

    def __init__(self, corpus):
        """
        Keeps a reference to the corpus and initializes the reader with all
        of the corpus documents and the user of the corpus.
        """
        self.corpus = corpus

        documents = corpus.documents.all()
        owner = corpus.user
        super(CorpusModelReader, self).__init__(documents, owner)


if __name__ == '__main__':
path = os.path.join(os.path.dirname(__file__), "fixtures", "debates")
corpus = TranscriptCorpusReader(path)
Expand Down

0 comments on commit bc93eae

Please sign in to comment.