Merge branch 'feature-modelbuild' into develop

DistrictDataLabs · Aug 17, 2016 · bc93eae · bbengfort · Aug 17, 2016 · bbengfort
2 parents 2fb7a5f + 61fca20
commit bc93eae
Show file tree

Hide file tree

Showing 7 changed files with 248 additions and 3 deletions.
diff --git a/arbiter/migrations/0002_estimator_corpus.py b/arbiter/migrations/0002_estimator_corpus.py
@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.9.7 on 2016-08-17 00:51
+from __future__ import unicode_literals
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+    # Schema migration that adds a ``corpus`` foreign key to the ``estimator``
+    # model of the arbiter app.
+
+    # Must run after the arbiter app's initial migration and after the corpus
+    # app's 0003_corpus migration (presumably the one that introduces the
+    # 'corpus.Corpus' model referenced below -- confirm against that file).
+    dependencies = [
+        ('corpus', '0003_corpus'),
+        ('arbiter', '0001_initial'),
+    ]
+
+    operations = [
+        # The new field is optional (blank/null allowed, default None) and
+        # deleting the referenced corpus cascades to the estimator.
+        migrations.AddField(
+            model_name='estimator',
+            name='corpus',
+            field=models.ForeignKey(blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, to='corpus.Corpus'),
+        ),
+    ]
diff --git a/arbiter/models.py b/arbiter/models.py
@@ -44,6 +44,7 @@ class Estimator(TimeStampedModel):
     estimator   = PickledObjectField(**nullable)                 # The pickled object model
     build_time  = models.DurationField(**nullable)               # The amount of time it took to build
     owner       = models.ForeignKey('auth.User', **nullable)     # The owner, if any, of the model
+    corpus      = models.ForeignKey('corpus.Corpus', **nullable) # The corpus the estimator was trained on
 
     class Meta:
         db_table = "estimators"

diff --git a/corpus/admin.py b/corpus/admin.py
@@ -18,7 +18,7 @@
 ##########################################################################
 
 from django.contrib import admin
-from corpus.models import Document, Annotation, Label
+from corpus.models import Document, Annotation, Label, Corpus
 
 ##########################################################################
 ## Register Admin
@@ -27,3 +27,4 @@
 admin.site.register(Label)
 admin.site.register(Annotation)
 admin.site.register(Document)
+# Corpus is registered without a custom admin class, like the models above.
+admin.site.register(Corpus)
diff --git a/corpus/managers.py b/corpus/managers.py
@@ -18,7 +18,7 @@
 ##########################################################################
 
 from django.db import models
-
+from django.apps import apps
 
 ##########################################################################
 ## Annotation Manager
@@ -37,3 +37,28 @@ def democratic(self):
         Filters the annotations for only democratic annotations.
         """
         return self.filter(label__slug='democratic')
+
+
+##########################################################################
+## Corpus Manager
+##########################################################################
+
+class CorpusManager(models.Manager):
+    """
+    Manager for corpus objects that adds a helper for building a corpus
+    out of a single user's annotated documents.
+    """
+
+    def create_for_user(self, user, **kwargs):
+        """
+        Builds and saves a corpus owned by the given user, then fills it with
+        every document that user has annotated so far. Any extra keyword
+        arguments are forwarded as fields of the new corpus.
+
+        Returns the newly created corpus.
+        """
+        # Resolve the document model through the app registry at call time
+        # rather than importing it at module level.
+        document_model = apps.get_model('corpus.Document')
+
+        # Create the corpus with the user attached alongside the other fields.
+        corpus = self.create(user=user, **kwargs)
+
+        # Attach every document that carries an annotation by this user.
+        annotated = document_model.objects.filter(annotations__user=user)
+        corpus.documents.set(annotated)
+
+        return corpus
diff --git a/corpus/migrations/0003_corpus.py b/corpus/migrations/0003_corpus.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.9.7 on 2016-08-17 00:51
+from __future__ import unicode_literals
+
+import autoslug.fields
+from django.conf import settings
+from django.db import migrations, models
+import django.db.models.deletion
+import django.utils.timezone
+import model_utils.fields
+
+
+class Migration(migrations.Migration):
+    # Schema migration that creates the ``Corpus`` model (table "corpora").
+
+    # Depends on the (swappable) user model, which the ``user`` foreign key
+    # points at, and on the corpus app's 0002 migration.
+    dependencies = [
+        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+        ('corpus', '0002_auto_20160802_1030'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='Corpus',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                # Creation and last-modification timestamps.
+                ('created', model_utils.fields.AutoCreatedField(default=django.utils.timezone.now, editable=False, verbose_name='created')),
+                ('modified', model_utils.fields.AutoLastModifiedField(default=django.utils.timezone.now, editable=False, verbose_name='modified')),
+                # Optional title; the unique slug is populated from it.
+                ('title', models.CharField(blank=True, default=None, max_length=255, null=True)),
+                ('slug', autoslug.fields.AutoSlugField(editable=False, populate_from='title', unique=True)),
+                # Documents in the corpus; the reverse accessor on a document is ``corpora``.
+                ('documents', models.ManyToManyField(related_name='corpora', to='corpus.Document')),
+                # Optional owner; deleting the user cascades to their corpora.
+                ('user', models.ForeignKey(blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='corpora', to=settings.AUTH_USER_MODEL)),
+            ],
+            options={
+                'db_table': 'corpora',
+                # "Latest" and the default ordering are both keyed on creation time.
+                'get_latest_by': 'created',
+                'ordering': ['-created'],
+            },
+        ),
+    ]
diff --git a/corpus/models.py b/corpus/models.py
@@ -23,7 +23,9 @@
 from django.core.urlresolvers import reverse
 from model_utils.models import TimeStampedModel
 from picklefield.fields import PickledObjectField
-from corpus.managers import AnnotationManager
+from corpus.managers import AnnotationManager, CorpusManager
+
+from operator import itemgetter
 
 ##########################################################################
 ## Document Model
@@ -53,6 +55,31 @@ class Meta:
         get_latest_by = "created"
         unique_together = ("long_url", "short_url")
 
+    def label(self, user=None):
+        """
+        Returns the label assigned to this document.
+
+        If a user is specified then returns the label of that user's
+        annotation, or None if the user has not annotated the document.
+        Otherwise returns the majority voted label for the document, or None
+        if the document has no labels or the highest vote count is shared by
+        more than one label.
+        """
+        # If a user is supplied get their annotation and return the label.
+        if user is not None:
+            annotation = self.annotations.filter(user=user).first()
+            return annotation.label if annotation else None
+
+        # Otherwise aggregate the annotations per document.
+        # TODO: Add annotator agreement logic here!
+        labels = self.labels.annotate(votes=models.Count('id'))
+        votes  = [(label, label.votes) for label in labels]
+        if not votes:
+            return None
+
+        # Only the labels sharing the highest vote count can win. A single
+        # leader wins outright -- including the case where just one label was
+        # ever applied -- while several leaders mean a genuine tie.
+        top = max(votes, key=itemgetter(1))[1]
+        leaders = [candidate for candidate, count in votes if count == top]
+        return leaders[0] if len(leaders) == 1 else None
+
     def get_absolute_url(self):
         """
         Returns the detail view url for the object
@@ -120,3 +147,37 @@ def __str__(self):
 ##########################################################################
 ## Corpus Model
 ##########################################################################
+
+class Corpus(TimeStampedModel):
+    """
+    A named (or anonymous) collection of documents, optionally owned by a
+    user, used to keep track of the training data that a text classifier
+    was fit on.
+    """
+
+    title = models.CharField(max_length=255, **nullable)
+    slug = AutoSlugField(populate_from='title', unique=True)
+    documents = models.ManyToManyField('corpus.Document', related_name='corpora')
+    user = models.ForeignKey('auth.User', related_name='corpora', **nullable)
+
+    objects = CorpusManager()
+
+    class Meta:
+        db_table = "corpora"
+        get_latest_by = "created"
+        ordering = ["-created"]
+        verbose_name = "corpus"
+        verbose_name_plural = "corpora"
+
+    def __str__(self):
+        # A titled corpus is identified by its title alone.
+        if self.title:
+            return self.title
+
+        # Untitled: describe the corpus by its size and creation date.
+        size = self.documents.count()
+        created_on = self.created.strftime("%Y-%m-%d")
+        description = "{} document corpus created on {}".format(size, created_on)
+
+        # Mention the owner only when there is one.
+        if not self.user:
+            return description
+        return description + " by {}".format(self.user)
diff --git a/corpus/reader.py b/corpus/reader.py
@@ -52,6 +52,103 @@ def tagged(self, **kwargs):
             ]
 
 
+##########################################################################
+## Django Query Corpus Reader
+##########################################################################
+
+class QueryCorpusReader(object):
+    """
+    The query corpus reader takes in a query that yields a list of documents
+    and exposes it through corpus reader style methods (fileids, categories
+    and tagged), fetching only the content field when streaming documents.
+    """
+
+    def __init__(self, query, user=None):
+        """
+        Pass in a QuerySet or Query object for selecting a group of documents.
+        Can also optionally pass in a user to determine labeling scheme: the
+        user is handed to each document's label() method.
+        """
+        self.user  = user
+        self.query = query
+
+    def fileids(self, categories=None):
+        """
+        Returns the primary keys of the documents that make up this corpus
+        or that make up the given category(s) if specified.
+
+        Categories can be either a single string or a list of strings, as
+        returned by categories(). Without categories the result is the
+        values_list() of the query rather than a materialized list.
+        """
+        # If categories is None, return all fileids.
+        if categories is None:
+            return self.query.values_list('id', flat=True)
+
+        # Convert to a list if a singleton is passed
+        if isinstance(categories, str):
+            categories = [categories,]
+
+        # Convert to a quick lookup data structure
+        categories = set(categories)
+
+        # Manually loop through all documents (bummer)
+        fileids = []
+        for doc in self.query:
+            label = doc.label(self.user)
+
+            # categories() reports labels by their string form, so match on
+            # that form as documented; label objects passed in directly are
+            # still matched as before.
+            if label in categories or str(label) in categories:
+                fileids.append(doc.id)
+
+        return fileids
+
+    def categories(self, fileids=None):
+        """
+        Returns a list of the distinct categories (the string form of each
+        document's label) defined for this corpus, or for the given file(s)
+        if specified. The order of the list is not defined.
+
+        Fileids can be either a list of integers or a single integer.
+        """
+        query = self.query
+
+        if fileids is not None:
+            # Convert to a list if a singleton is passed
+            if isinstance(fileids, int):
+                fileids = [fileids,]
+            query = query.filter(id__in=fileids)
+
+        # TODO: replace this per-document loop with a distinct database query
+        return list(set(str(doc.label(self.user)) for doc in query))
+
+    def tagged(self, fileids=None, categories=None):
+        """
+        Generator that iterates over the content of each selected document
+        and yields its items one at a time (presumably the preprocessed
+        paragraphs -- confirm against the Document model).
+
+        Documents are selected by fileids if given (a list of integers or a
+        single integer); categories are only consulted when fileids is None.
+        """
+        if fileids is None:
+            fileids = self.fileids(categories)
+
+        if isinstance(fileids, int):
+            fileids = [fileids,]
+
+        # Only fetch the content field of the matching documents.
+        contents = self.query.filter(id__in=fileids).values_list('content', flat=True)
+        for content in contents:
+            for para in content:
+                yield para
+
+
+##########################################################################
+## Django Corpus Model Reader
+##########################################################################
+
+class CorpusModelReader(QueryCorpusReader):
+    """
+    A query corpus reader bound to a corpus object: it reads all of the
+    corpus' documents and passes the corpus' user on as the labeling user.
+    """
+
+    def __init__(self, corpus):
+        # Keep a handle on the corpus itself, then configure the base reader
+        # from its documents and its user.
+        self.corpus = corpus
+        documents, owner = corpus.documents.all(), corpus.user
+        super(CorpusModelReader, self).__init__(documents, owner)
+
+
 if __name__ == '__main__':
     path = os.path.join(os.path.dirname(__file__), "fixtures", "debates")
     corpus = TranscriptCorpusReader(path)