From 53673f1e93eed0c780faa7f4d341b525dafcd194 Mon Sep 17 00:00:00 2001 From: Benjamin Bengfort Date: Tue, 16 Aug 2016 21:16:21 -0400 Subject: [PATCH 1/3] corpus and corpus manager --- arbiter/migrations/0002_estimator_corpus.py | 22 ++++++++++++ arbiter/models.py | 1 + corpus/admin.py | 3 +- corpus/managers.py | 27 ++++++++++++++- corpus/migrations/0003_corpus.py | 38 +++++++++++++++++++++ corpus/models.py | 36 ++++++++++++++++++- 6 files changed, 124 insertions(+), 3 deletions(-) create mode 100644 arbiter/migrations/0002_estimator_corpus.py create mode 100644 corpus/migrations/0003_corpus.py diff --git a/arbiter/migrations/0002_estimator_corpus.py b/arbiter/migrations/0002_estimator_corpus.py new file mode 100644 index 0000000..66e4f99 --- /dev/null +++ b/arbiter/migrations/0002_estimator_corpus.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.9.7 on 2016-08-17 00:51 +from __future__ import unicode_literals + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('corpus', '0003_corpus'), + ('arbiter', '0001_initial'), + ] + + operations = [ + migrations.AddField( + model_name='estimator', + name='corpus', + field=models.ForeignKey(blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, to='corpus.Corpus'), + ), + ] diff --git a/arbiter/models.py b/arbiter/models.py index 8282d0d..687ed52 100644 --- a/arbiter/models.py +++ b/arbiter/models.py @@ -44,6 +44,7 @@ class Estimator(TimeStampedModel): estimator = PickledObjectField(**nullable) # The pickled object model build_time = models.DurationField(**nullable) # The amount of time it took to buld owner = models.ForeignKey('auth.User', **nullable) # The owner, if any, of the model + corpus = models.ForeignKey('corpus.Corpus', **nullable) # The corpus the estimator was trained on class Meta: db_table = "estimators" diff --git a/corpus/admin.py b/corpus/admin.py index 1f5d626..09efd96 
100644 --- a/corpus/admin.py +++ b/corpus/admin.py @@ -18,7 +18,7 @@ ########################################################################## from django.contrib import admin -from corpus.models import Document, Annotation, Label +from corpus.models import Document, Annotation, Label, Corpus ########################################################################## ## Register Admin @@ -27,3 +27,4 @@ admin.site.register(Label) admin.site.register(Annotation) admin.site.register(Document) +admin.site.register(Corpus) diff --git a/corpus/managers.py b/corpus/managers.py index 97277f4..4644513 100644 --- a/corpus/managers.py +++ b/corpus/managers.py @@ -18,7 +18,7 @@ ########################################################################## from django.db import models - +from django.apps import apps ########################################################################## ## Annotation Manager @@ -37,3 +37,28 @@ def democratic(self): Filters the annotations for only democratic annotations. """ return self.filter(label__slug='democratic') + + +########################################################################## +## Corpus Manager +########################################################################## + +class CorpusManager(models.Manager): + + def create_for_user(self, user, **kwargs): + """ + Creates a user-specific corpus containing all the documents that the + user has tagged to date. Can pass in any additional fields as well. + """ + # Lazy load the document model + Document = apps.get_model('corpus.Document') + + # Add the user to the kwargs and construct the corpus. + kwargs['user'] = user + corpus = self.create(**kwargs) + + # Now add all the documents the user has annotated to date. 
+ docs = Document.objects.filter(annotations__user=user) + corpus.documents.set(docs) + + return corpus diff --git a/corpus/migrations/0003_corpus.py b/corpus/migrations/0003_corpus.py new file mode 100644 index 0000000..3a5e481 --- /dev/null +++ b/corpus/migrations/0003_corpus.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.9.7 on 2016-08-17 00:51 +from __future__ import unicode_literals + +import autoslug.fields +from django.conf import settings +from django.db import migrations, models +import django.db.models.deletion +import django.utils.timezone +import model_utils.fields + + +class Migration(migrations.Migration): + + dependencies = [ + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ('corpus', '0002_auto_20160802_1030'), + ] + + operations = [ + migrations.CreateModel( + name='Corpus', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('created', model_utils.fields.AutoCreatedField(default=django.utils.timezone.now, editable=False, verbose_name='created')), + ('modified', model_utils.fields.AutoLastModifiedField(default=django.utils.timezone.now, editable=False, verbose_name='modified')), + ('title', models.CharField(blank=True, default=None, max_length=255, null=True)), + ('slug', autoslug.fields.AutoSlugField(editable=False, populate_from='title', unique=True)), + ('documents', models.ManyToManyField(related_name='corpora', to='corpus.Document')), + ('user', models.ForeignKey(blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='corpora', to=settings.AUTH_USER_MODEL)), + ], + options={ + 'db_table': 'corpora', + 'get_latest_by': 'created', + 'ordering': ['-created'], + }, + ), + ] diff --git a/corpus/models.py b/corpus/models.py index 4863e4a..371d844 100644 --- a/corpus/models.py +++ b/corpus/models.py @@ -23,7 +23,7 @@ from django.core.urlresolvers import reverse from model_utils.models import TimeStampedModel from 
picklefield.fields import PickledObjectField -from corpus.managers import AnnotationManager +from corpus.managers import AnnotationManager, CorpusManager ########################################################################## ## Document Model @@ -120,3 +120,37 @@ def __str__(self): ########################################################################## ## Corpus Model ########################################################################## + +class Corpus(TimeStampedModel): + """ + A model that maintains a mapping of documents to estimators for use in + tracking the training data that is used to fit a text classifier object. + """ + + title = models.CharField(max_length=255, **nullable) + slug = AutoSlugField(populate_from='title', unique=True) + documents = models.ManyToManyField('corpus.Document', related_name='corpora') + user = models.ForeignKey('auth.User', related_name='corpora', **nullable) + + objects = CorpusManager() + + class Meta: + db_table = "corpora" + get_latest_by = "created" + ordering = ["-created"] + verbose_name = "corpus" + verbose_name_plural = "corpora" + + def __str__(self): + if self.title: + return self.title + + # Construct the descriptive string. 
+ s = "{} document corpus created on {}".format( + self.documents.count(), self.created.strftime("%Y-%m-%d") + ) + + if self.user: + s += " by {}".format(self.user) + + return s From 4fce5b4751aaecafc99363d7b5170c718f4fbb1d Mon Sep 17 00:00:00 2001 From: Benjamin Bengfort Date: Wed, 17 Aug 2016 06:38:32 -0400 Subject: [PATCH 2/3] initial query corpus reader --- corpus/models.py | 27 ++++++++++++++ corpus/reader.py | 93 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) diff --git a/corpus/models.py b/corpus/models.py index 371d844..acfecb7 100644 --- a/corpus/models.py +++ b/corpus/models.py @@ -25,6 +25,8 @@ from picklefield.fields import PickledObjectField from corpus.managers import AnnotationManager, CorpusManager +from operator import itemgetter + ########################################################################## ## Document Model ########################################################################## @@ -53,6 +55,31 @@ class Meta: get_latest_by = "created" unique_together = ("long_url", "short_url") + def label(self, user=None): + """ + If a user is specified then returns the label for that user. Otherwise + returns the majority voted label for the document in the corpus. + """ + # If a user is supplied get their annotation and return the label. + if user is not None: + annotation = self.annotations.filter(user=user).first() + if annotation: return annotation.label + + # Otherwise aggregate the annotations per document. + # TODO: Add annotator agreement logic here! 
+ else: + labels = self.labels.annotate(votes=models.Count('id')) + votes = [(label, label.votes) for label in labels] + if votes: + # Check if a tie + if len(set(vote[1] for vote in votes)) == 1: + return None + + # Return the maximum + return max(votes, key=itemgetter(1))[0] + + return None + def get_absolute_url(self): """ Returns the detail view url for the object diff --git a/corpus/reader.py b/corpus/reader.py index bc03c81..eb4552e 100644 --- a/corpus/reader.py +++ b/corpus/reader.py @@ -52,6 +52,99 @@ def tagged(self, **kwargs): ] +########################################################################## +## Django Query Corpus Reader +########################################################################## + +class QueryCorpusReader(object): + """ + The query corpus reader takes in a query that yields a list of documents + and modifies it such that it is only fetching the preprocessed content in + a streaming fashion. + """ + + def __init__(self, query, user=None): + """ + Pass in a QuerySet or Query object for selecting a group of documents. + Can also optionally pass in a user to determine labeling scheme. + """ + self.user = user + self.query = query + + def fileids(self, categories=None): + """ + Returns a list of file primary keys for the files that make up this + corpus or that make up the given category(s) if specified. + + Categories can be either a single string or a list of strings. + """ + # If categories is None, return all fileids. 
+ if categories is None: + return self.query.values_list('id', flat=True) + + # Convert to a list if a singleton is passed + if isinstance(categories, str): + categories = [categories,] + + # Convert to a quick lookup data structure + categories = set(categories) + + # Manually loop through all documents (bummer) + return [ + doc.id for doc in self.query + if doc.label(self.user) in categories + ] + + def categories(self, fileids=None): + """ + Return a list of the categories (labels) defined for this corpus, or + for the given file(s) if fileids is specified. + + Fileids can be either a list of integers or a single integer. + """ + # If fileids is None, return all categories + # HACK: use a unique query on the database + return list(set([ + doc.label(self.user) for doc in self.query + ])) + + # Convert to a list if a singleton is passed + if isinstance(fileids, int): + fileids = [fileids,] + + return list(set([ + doc.label(self.user) for doc in self.query.filter(id__in=fileids) + ])) + + def tagged(self, fileids=None, categories=None): + """ + Returns the content of each document. + """ + if fileids is None: + fileids = self.fileids(categories) + + if isinstance(fileids, int): + fileids = [fileids,] + + return self.query.filter(id__in=fileids).values_list('content', flat=True) + + +########################################################################## +## Django Corpus Model Reader +########################################################################## + +class CorpusModelReader(QueryCorpusReader): + """ + Takes a corpus object and automatically references documents. 
+ """ + + def __init__(self, corpus): + self.corpus = corpus + super(CorpusModelReader, self).__init__( + corpus.documents.all(), corpus.user + ) + + if __name__ == '__main__': path = os.path.join(os.path.dirname(__file__), "fixtures", "debates") corpus = TranscriptCorpusReader(path) From 61fca203b816063bdd463e86497bf52b5949a83a Mon Sep 17 00:00:00 2001 From: Benjamin Bengfort Date: Wed, 17 Aug 2016 06:59:38 -0400 Subject: [PATCH 3/3] corpus reader works with loader --- corpus/reader.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/corpus/reader.py b/corpus/reader.py index eb4552e..c054e6b 100644 --- a/corpus/reader.py +++ b/corpus/reader.py @@ -104,16 +104,18 @@ def categories(self, fileids=None): """ # If fileids is None, return all categories # HACK: use a unique query on the database - return list(set([ - doc.label(self.user) for doc in self.query - ])) + if fileids is None: + return list(set([ + str(doc.label(self.user)) for doc in self.query + ])) # Convert to a list if a singleton is passed if isinstance(fileids, int): fileids = [fileids,] return list(set([ - doc.label(self.user) for doc in self.query.filter(id__in=fileids) + str(doc.label(self.user)) + for doc in self.query.filter(id__in=fileids) ])) def tagged(self, fileids=None, categories=None): @@ -126,7 +128,9 @@ def tagged(self, fileids=None, categories=None): if isinstance(fileids, int): fileids = [fileids,] - return self.query.filter(id__in=fileids).values_list('content', flat=True) + for doc in self.query.filter(id__in=fileids).values_list('content', flat=True): + for para in doc: + yield para ##########################################################################