-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# -*- coding: utf-8 -*- | ||
# Generated by Django 1.9.7 on 2016-08-17 00:51 | ||
from __future__ import unicode_literals | ||
|
||
from django.db import migrations, models | ||
import django.db.models.deletion | ||
|
||
|
||
class Migration(migrations.Migration):
    """
    Adds an optional ``corpus`` foreign key to the ``estimator`` model so
    that an estimator can reference a corpus.
    """

    # Both the corpus table and the initial arbiter schema must exist first.
    dependencies = [
        ('corpus', '0003_corpus'),
        ('arbiter', '0001_initial'),
    ]

    operations = [
        migrations.AddField(
            model_name='estimator',
            name='corpus',
            # Nullable with a None default: rows are not required to
            # reference a corpus.
            field=models.ForeignKey(
                to='corpus.Corpus',
                on_delete=django.db.models.deletion.CASCADE,
                blank=True,
                null=True,
                default=None,
            ),
        ),
    ]
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,7 +18,7 @@ | |
########################################################################## | ||
|
||
from django.db import models | ||
|
||
from django.apps import apps | ||
|
||
########################################################################## | ||
## Annotation Manager | ||
|
@@ -37,3 +37,28 @@ def democratic(self): | |
Filters the annotations for only democratic annotations. | ||
""" | ||
return self.filter(label__slug='democratic') | ||
|
||
|
||
########################################################################## | ||
## Corpus Manager | ||
########################################################################## | ||
|
||
class CorpusManager(models.Manager):
    """
    Manager for corpus objects that adds a helper for building a corpus
    from the documents a particular user has annotated.
    """

    def create_for_user(self, user, **kwargs):
        """
        Creates and returns a corpus owned by the given user that contains
        every document the user has annotated so far. Any extra keyword
        arguments are passed through as fields of the new corpus.
        """
        # Look the model up through the app registry instead of importing
        # it (presumably to avoid a circular import with corpus.models).
        document_model = apps.get_model('corpus.Document')

        # Create the corpus row first so that its documents can be assigned.
        corpus = self.create(user=user, **kwargs)

        # Attach every document that has an annotation by this user.
        annotated = document_model.objects.filter(annotations__user=user)
        corpus.documents.set(annotated)

        return corpus
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
# -*- coding: utf-8 -*- | ||
# Generated by Django 1.9.7 on 2016-08-17 00:51 | ||
from __future__ import unicode_literals | ||
|
||
import autoslug.fields | ||
from django.conf import settings | ||
from django.db import migrations, models | ||
import django.db.models.deletion | ||
import django.utils.timezone | ||
import model_utils.fields | ||
|
||
|
||
class Migration(migrations.Migration):
    """
    Creates the ``Corpus`` model: a timestamped, optionally titled
    collection of documents that may belong to a user.
    """

    dependencies = [
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
        ('corpus', '0002_auto_20160802_1030'),
    ]

    operations = [
        migrations.CreateModel(
            name='Corpus',
            fields=[
                ('id', models.AutoField(
                    auto_created=True,
                    primary_key=True,
                    serialize=False,
                    verbose_name='ID',
                )),
                ('created', model_utils.fields.AutoCreatedField(
                    default=django.utils.timezone.now,
                    editable=False,
                    verbose_name='created',
                )),
                ('modified', model_utils.fields.AutoLastModifiedField(
                    default=django.utils.timezone.now,
                    editable=False,
                    verbose_name='modified',
                )),
                ('title', models.CharField(
                    blank=True,
                    default=None,
                    max_length=255,
                    null=True,
                )),
                ('slug', autoslug.fields.AutoSlugField(
                    editable=False,
                    populate_from='title',
                    unique=True,
                )),
                ('documents', models.ManyToManyField(
                    related_name='corpora',
                    to='corpus.Document',
                )),
                ('user', models.ForeignKey(
                    blank=True,
                    default=None,
                    null=True,
                    on_delete=django.db.models.deletion.CASCADE,
                    related_name='corpora',
                    to=settings.AUTH_USER_MODEL,
                )),
            ],
            options={
                'db_table': 'corpora',
                'get_latest_by': 'created',
                'ordering': ['-created'],
            },
        ),
    ]
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,7 +23,9 @@ | |
from django.core.urlresolvers import reverse | ||
from model_utils.models import TimeStampedModel | ||
from picklefield.fields import PickledObjectField | ||
from corpus.managers import AnnotationManager | ||
from corpus.managers import AnnotationManager, CorpusManager | ||
|
||
from operator import itemgetter | ||
|
||
########################################################################## | ||
## Document Model | ||
|
@@ -53,6 +55,31 @@ class Meta: | |
get_latest_by = "created" | ||
unique_together = ("long_url", "short_url") | ||
|
||
def label(self, user=None):
    """
    Returns the label for this document.

    If a user is specified then returns the label from that user's
    annotation, or None if the user has no annotation on this document.
    Otherwise returns the majority voted label for the document, or None
    if there are no votes or if the top vote is tied.
    """
    # If a user is supplied, only their annotation matters.
    if user is not None:
        annotation = self.annotations.filter(user=user).first()
        return annotation.label if annotation else None

    # Otherwise aggregate the votes per label.
    # TODO: Add annotator agreement logic here!
    labels = self.labels.annotate(votes=models.Count('id'))
    ranked = sorted(labels, key=lambda lbl: lbl.votes, reverse=True)

    # No votes at all means there is no label to report.
    if not ranked:
        return None

    # A lone label wins outright. With several labels, the leader must
    # strictly beat the runner-up, otherwise the vote is a tie.
    if len(ranked) > 1 and ranked[0].votes == ranked[1].votes:
        return None

    return ranked[0]
|
||
def get_absolute_url(self): | ||
""" | ||
Returns the detail view url for the object | ||
|
@@ -120,3 +147,37 @@ def __str__(self): | |
########################################################################## | ||
## Corpus Model | ||
########################################################################## | ||
|
||
class Corpus(TimeStampedModel):
    """
    A collection of documents, optionally titled and optionally owned by a
    user. Used to keep track of the training data that an estimator (text
    classifier) was fit on.
    """

    title = models.CharField(max_length=255, **nullable)
    slug = AutoSlugField(populate_from='title', unique=True)
    documents = models.ManyToManyField(
        'corpus.Document', related_name='corpora'
    )
    user = models.ForeignKey(
        'auth.User', related_name='corpora', **nullable
    )

    # Adds the create_for_user helper to the default manager.
    objects = CorpusManager()

    class Meta:
        db_table = "corpora"
        get_latest_by = "created"
        ordering = ["-created"]
        verbose_name = "corpus"
        verbose_name_plural = "corpora"

    def __str__(self):
        """
        Returns the title if there is one, otherwise a description built
        from the document count, the creation date, and the owner (if any).
        """
        if self.title:
            return self.title

        # No title: describe the corpus by its size and creation date.
        total = self.documents.count()
        created_on = self.created.strftime("%Y-%m-%d")
        description = "{} document corpus created on {}".format(
            total, created_on
        )

        # Only mention the owner when the corpus has one.
        if not self.user:
            return description

        return description + " by {}".format(self.user)
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -52,6 +52,103 @@ def tagged(self, **kwargs): | |
] | ||
|
||
|
||
########################################################################## | ||
## Django Query Corpus Reader | ||
########################################################################## | ||
|
||
class QueryCorpusReader(object):
    """
    A corpus reader backed by a document query instead of files on disk.

    The reader wraps a query that yields documents and exposes file ids,
    categories, and document content. Category names are the string form
    of each document's label, so ``fileids`` and ``categories`` agree on
    what a category is.
    """

    def __init__(self, query, user=None):
        """
        Pass in a QuerySet or Query object for selecting a group of documents.
        Can also optionally pass in a user to determine labeling scheme.
        """
        self.user = user
        self.query = query

    def _category(self, doc):
        """
        Returns the category name of a document: the string form of the
        label reported by ``doc.label`` for this reader's user. A document
        without a label therefore falls in the ``'None'`` category.
        """
        return str(doc.label(self.user))

    def fileids(self, categories=None):
        """
        Returns the primary keys of the documents that make up this corpus
        or that make up the given category(s) if specified.
        Categories can be either a single string or a list of strings, using
        the same names that ``categories`` returns.
        """
        # If categories is None, return all fileids straight from the query.
        if categories is None:
            return self.query.values_list('id', flat=True)

        # Convert to a list if a singleton is passed
        if isinstance(categories, str):
            categories = [categories]

        # Convert to a quick lookup data structure
        categories = set(categories)

        # Labels are computed per document, so every document must be
        # visited. Compare category *names*: the label object itself would
        # never match the strings passed in.
        return [
            doc.id for doc in self.query
            if self._category(doc) in categories
        ]

    def categories(self, fileids=None):
        """
        Returns a list of the distinct category names defined for this
        corpus, or for the given file(s) if specified.
        Fileids can be either a list of integers or a single integer.
        """
        docs = self.query

        if fileids is not None:
            # Convert to a list if a singleton is passed
            if isinstance(fileids, int):
                fileids = [fileids]
            docs = docs.filter(id__in=fileids)

        # HACK: labels are computed in Python per document; a unique query
        # on the database would be preferable.
        return list({self._category(doc) for doc in docs})

    def tagged(self, fileids=None, categories=None):
        """
        Yields each element (paragraph) of the content of every selected
        document. If fileids is not given, documents are selected by
        category instead; categories is ignored when fileids is given.
        """
        if fileids is None:
            fileids = self.fileids(categories)

        # Convert to a list if a singleton is passed
        if isinstance(fileids, int):
            fileids = [fileids]

        # Only fetch the content column and stream it out piece by piece.
        contents = self.query.filter(id__in=fileids).values_list(
            'content', flat=True
        )
        for content in contents:
            for para in content:
                yield para
|
||
|
||
########################################################################## | ||
## Django Corpus Model Reader | ||
########################################################################## | ||
|
||
class CorpusModelReader(QueryCorpusReader):
    """
    A query corpus reader that is built directly from a corpus object,
    reading the corpus' documents on behalf of the corpus' user.
    """

    def __init__(self, corpus):
        """
        Pass in the corpus object whose documents should be read.
        """
        self.corpus = corpus

        # The corpus supplies both the document query and the labeling user.
        documents = corpus.documents.all()
        owner = corpus.user
        super(CorpusModelReader, self).__init__(documents, owner)
|
||
|
||
if __name__ == '__main__':
    # Load the debates fixture that sits next to this module.
    here = os.path.dirname(__file__)
    path = os.path.join(here, "fixtures", "debates")
    corpus = TranscriptCorpusReader(path)
|
@lauralorenz one part of the spec was to ensure that an estimator can look up which documents it was trained on. This foreign key provides that relationship, linking the estimator to a corpus (the collection of documents).