diff --git a/.gitignore b/.gitignore index 72364f9..db73985 100644 --- a/.gitignore +++ b/.gitignore @@ -87,3 +87,10 @@ ENV/ # Rope project settings .ropeproject + +# Corpora and Fixtures +corpus/fixtures/debates +corpus/fixtures/*.pickle + +# Mac Stuff +.DS_Store diff --git a/README.md b/README.md index 05e959d..560dfea 100644 --- a/README.md +++ b/README.md @@ -22,11 +22,21 @@ The release versions that are deployed to the web servers are also tagged in Git The versioning uses a three part version system, "a.b.c" - "a" represents a major release that may not be backwards compatible. "b" is incremented on minor releases that may contain extra features, but are backwards compatible. "c" releases are bug fixes or other micro changes that developers should feel free to immediately update to. +### Version 0.2 Beta 3 + +* **tag**: [v0.2b3](https://github.com/DistrictDataLabs/partisan-discourse/releases/tag/v0.2b3) +* **deployment**: Monday, August 29, 2016 +* **commit**: [see tag](#) + +This is an intermediate release to ensure that some front end components become visible in advance of the first release. In particular the annotation help button on the document view and the about page that gives built with attribution. This release also fixes the "." in usernames bug that would not allow people to log in or access their profile. + +This release also contains components that are not officially ready but sit quietly in the background waiting to be deployed. This includes the management command to build models, the corpus object to link data sets, estimator models for Scikit-Learn pipeline/estimator data storage and more. These elements will be discussed in detail in future releases. + ### Version 0.1 Beta 1 * **tag**: [v0.1b1](https://github.com/DistrictDataLabs/partisan-discourse/releases/tag/v0.1b1) * **deployment**: Monday, July 18, 2016 -* **commit**: [see tag](#) +* **commit**: [393211a](https://github.com/DistrictDataLabs/partisan-discourse/commit/393211acd1a270e04ae7b69f750be2f3e7305230) This is the first beta release of the Partisan Discourse application. Right now this simple web application allows users to sign in, then add links to go fetch web content to the global corpus. These links are then preprocessed using NLP foo. Users can tag the documents as Republican or Democrat, allowing us to build a political classifier. diff --git a/arbiter/__init__.py b/arbiter/__init__.py new file mode 100644 index 0000000..4bfdcd0 --- /dev/null +++ b/arbiter/__init__.py @@ -0,0 +1,25 @@ +# arbiter +# A Django app that implements a MMS for the red/blue models. +# +# Author: Benjamin Bengfort +# Created: Tue Aug 02 09:13:41 2016 -0400 +# +# Copyright (C) 2016 District Data Labs +# For license information, see LICENSE.txt +# +# ID: __init__.py [cd70726] benjamin@bengfort.com $ + +""" +A Django app that implements a MMS for the red/blue models. +""" + +########################################################################## +## Imports +########################################################################## + + +########################################################################## +## Configuration +########################################################################## + +default_app_config = 'arbiter.apps.ArbiterConfig' diff --git a/arbiter/admin.py b/arbiter/admin.py new file mode 100644 index 0000000..ab33bec --- /dev/null +++ b/arbiter/admin.py @@ -0,0 +1,28 @@ +# arbiter.admin +# Django admin CMS definitions and registrations for the arbiter app. 
+# +# Author: Benjamin Bengfort +# Created: Tue Aug 02 09:18:18 2016 -0400 +# +# Copyright (C) 2016 District Data Labs +# For license information, see LICENSE.txt +# +# ID: admin.py [cd70726] benjamin@bengfort.com $ + +""" +Django admin CMS definitions and registrations for the arbiter app. +""" + +########################################################################## +## Imports +########################################################################## + +from django.contrib import admin +from arbiter.models import Estimator, Score + +########################################################################## +## Register Admin +########################################################################## + +admin.site.register(Estimator) +admin.site.register(Score) diff --git a/arbiter/apps.py b/arbiter/apps.py new file mode 100644 index 0000000..a88772c --- /dev/null +++ b/arbiter/apps.py @@ -0,0 +1,34 @@ +# arbiter.apps +# Application definition for the arbiter app. +# +# Author: Benjamin Bengfort +# Created: Tue Aug 02 09:14:47 2016 -0400 +# +# Copyright (C) 2016 District Data Labs +# For license information, see LICENSE.txt +# +# ID: apps.py [cd70726] benjamin@bengfort.com $ + +""" +Application definition for the arbiter app. +""" + +########################################################################## +## Imports +########################################################################## + +from django.apps import AppConfig + + +########################################################################## +## Corpus Config +########################################################################## + +class ArbiterConfig(AppConfig): + + name = 'arbiter' + verbose_name = 'Arbiter' + + def ready(self): + pass + # import arbiter.signals diff --git a/arbiter/management/__init__.py b/arbiter/management/__init__.py new file mode 100644 index 0000000..f976217 --- /dev/null +++ b/arbiter/management/__init__.py @@ -0,0 +1,18 @@ +# arbiter.management +# A module that specifies Django management commands for the arbiter app. +# +# Author: Benjamin Bengfort +# Created: Tue Aug 02 10:36:54 2016 -0400 +# +# Copyright (C) 2016 District Data Labs +# For license information, see LICENSE.txt +# +# ID: __init__.py [feb4803] benjamin@bengfort.com $ + +""" +A module that specifies Django management commands for the arbiter app. +""" + +########################################################################## +## Imports +########################################################################## diff --git a/arbiter/management/commands/__init__.py b/arbiter/management/commands/__init__.py new file mode 100644 index 0000000..76b9134 --- /dev/null +++ b/arbiter/management/commands/__init__.py @@ -0,0 +1,18 @@ +# arbiter.management.commands +# Module that contains each individual management command for Django. +# +# Author: Benjamin Bengfort +# Created: Tue Aug 02 10:37:24 2016 -0400 +# +# Copyright (C) 2016 District Data Labs +# For license information, see LICENSE.txt +# +# ID: __init__.py [feb4803] benjamin@bengfort.com $ + +""" +Module that contains each individual management command for Django. 
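+
+For example, the train command defined in this package is invoked through
+manage.py (an illustrative invocation; the full set of flags is defined in
+train.py):
+
+    $ python manage.py train --model maxent --folds 12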
+""" + +########################################################################## +## Imports +########################################################################## diff --git a/arbiter/management/commands/train.py b/arbiter/management/commands/train.py new file mode 100644 index 0000000..9c1e091 --- /dev/null +++ b/arbiter/management/commands/train.py @@ -0,0 +1,262 @@ +# arbiter.management.commands.train +# Command to train red/blue classifiers from the command line. +# +# Author: Benjamin Bengfort +# Created: Tue Aug 02 10:38:54 2016 -0400 +# +# Copyright (C) 2016 District Data Labs +# For license information, see LICENSE.txt +# +# ID: train.py [feb4803] benjamin@bengfort.com $ + +""" +Command to train red/blue classifiers from the command line. +""" + +########################################################################## +## Imports +########################################################################## + +import numpy as np + +from datetime import datetime +from arbiter.models import Estimator, Score +from django.contrib.auth.models import User +from corpus.reader import TranscriptCorpusReader +from corpus.models import Corpus, Document, LabeledDocument +from corpus.reader import QueryCorpusReader, CorpusModelReader +from corpus.learn import CorpusLoader, build_model +from django.core.management.base import BaseCommand, CommandError + +from sklearn.naive_bayes import MultinomialNB +from sklearn.linear_model import SGDClassifier +from sklearn.linear_model import LogisticRegression + + +########################################################################## +## Training Command +########################################################################## + +class Command(BaseCommand): + + help = "Trains red/blue classifiers and stores them in the database." + + # The types of estimators that this command knows how to train + estimators = { + 'maxent': (LogisticRegression, {}), + 'svm': (SGDClassifier, {'loss':'hinge', 'penalty':'l2', 'alpha':1e-3}), + 'nbayes': (MultinomialNB, {}), + } + + # The minimum number of documents to train an estimator + min_docs = 12 + + def add_arguments(self, parser): + """ + Add command line argparse arguments. + """ + # Model selection argument + parser.add_argument( + '-m', '--model', choices=self.estimators, default='maxent', + help='specify the model form to fit on the given corpus', + ) + + # Number of folds for cross-validation + parser.add_argument( + '-f', '--folds', type=int, default=12, + help='number of folds to use in cross-validation', + ) + + # Optional ownership argument/build model for user + parser.add_argument( + '-u', '--username', default=None, metavar='NAME', + help='specify a user to build the model for or to assign ownership', + ) + + # Path on disk to build a corpus from transcripts + parser.add_argument( + '-t', '--transcripts', default=None, type=str, metavar='PATH', + help='specify a path on disk to the directory containing transcripts', + ) + + # Specify a corpus id to specifically build for + parser.add_argument( + '-c', '--corpus', type=int, default=None, metavar='ID', + help='specify the id of a corpus to build the model for', + ) + + def handle(self, *args, **options): + """ + Handles the model training process as follows: + + 1. If a transcript path is specified build that and assign to the + owner if given in the arguments (ignore other args) + 2. If a corpus id is specified, build the model for that corpus and + assign to the owner if given in the arguments + 3. 
If just a username is given, construct a user-specific corpus
+           and build a model for that corpus
+        4. If none of those arguments are given, construct a corpus that
+           utilizes the entire state of the current database, and build
+           a model for that corpus.
+
+        Note that items 1 and 2 do not create a corpus, whereas 3 and 4 do.
+        """
+
+        # Get the owner from the options
+        owner = self.get_user(options['username'])
+
+        # Create the reader from the options
+        if options['transcripts']:
+            # Get the transcripts reader
+            reader = TranscriptCorpusReader(options['transcripts'])
+            corpus = None
+            description = "transcripts located at {}".format(options['transcripts'])
+        else:
+            # Get or create the corpus object
+            reader, corpus = self.get_corpus(owner=owner, **options)
+            if corpus:
+                description = str(reader.corpus)
+            else:
+                description = "Corpus read by {}".format(
+                    reader.__class__.__name__
+                )
+
+        # Build the model from the corpus and owner.
+        estimator = self.build_model(reader, owner, description, **options)
+
+        # If a corpus was used, assign it to the estimator and save.
+        if corpus:
+            estimator.corpus = corpus
+            estimator.save()
+
+    def build_model(self, reader, owner, description, **options):
+        """
+        Builds and cross-validates the model once the reader has been
+        constructed, then saves the fitted estimator and its evaluation
+        scores to the database, returning the Estimator object.
+        """
+        # Get the details from the command line arguments
+        model, kwargs = self.estimators[options['model']]
+
+        # Construct the loader from the passed in reader object.
+        loader = CorpusLoader(reader, options['folds'])
+
+        # Inform the user that the training process is beginning
+        self.stdout.write((
+            "Starting training of {} {} models on {}\n"
+            "This may take quite a bit of time, please be patient!\n"
+        ).format(
+            loader.n_folds + 1, model.__name__, description
+        ))
+
+        # GO! Build the model forever! Whooo!!!
+        (clf, scores), total_time = build_model(loader, model, **kwargs)
+
+        # Save the estimator model
+        estimator = Estimator.objects.create(
+            model_type = Estimator.TYPES.classifier,
+            model_class = model.__name__,
+            model_form = repr(clf),
+            estimator = clf,
+            build_time = total_time,
+            owner = owner,
+        )
+
+        # Save the scores objects.
+        for metric, values in scores.items():
+
+            # Handle the time key in particular.
+            if metric == 'times':
+                Score.objects.create(
+                    metric = Score.METRICS.time,
+                    score = values['final'].total_seconds(),
+                    folds = [td.total_seconds() for td in values['folds']],
+                    estimator = estimator,
+                )
+                continue
+
+            # Handle generic scores for the model
+            for label, folds in values.items():
+                if metric == 'support' and label == 'average':
+                    # This will be an array of None values, so skip.
+                    continue
+
+                Score.objects.create(
+                    metric = metric,
+                    score = np.asarray(folds).mean(),
+                    label = label,
+                    folds = folds,
+                    estimator = estimator,
+                )
+
+        # Report model construction complete
+        self.stdout.write(
+            "Training complete in {}! Estimator saved to the database\n".format(total_time)
+        )
+
+        return estimator
+
+    def get_user(self, username):
+        """
+        Returns a user or None, raising a command error if no user with the
+        specified username is found in the database.
+        """
+        if username is None: return None
+        try:
+            return User.objects.get(username=username)
+        except User.DoesNotExist:
+            raise CommandError(
+                "No user with username '{}' in the database".format(username)
+            )
+
+    def get_corpus(self, owner=None, **options):
+        """
+        Uses the supplied options to get or create a corpus from the args
+        that have been passed in. Note that this can raise a CommandError if
+        a constructed corpus does not contain enough documents.
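+
+        (If both a corpus id and a username are supplied, the corpus id
+        takes precedence, since it is checked first below.)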
+ + Returns a corpus model reader object as well as a corpus. + """ + + # If an ID is supplied fetch the corpus from the database. + if options['corpus']: + try: + corpus = Corpus.objects.get(id=options['corpus']) + reader = CorpusModelReader(corpus) + return reader, corpus + except Corpus.DoesNotExist: + raise CommandError( + "No corpus with id {} in the database".format(options['corpus']) + ) + + # If an owner is supplied then create a corpus for that specific user. + if owner is not None: + corpus = Corpus.objects.create_for_user( + owner, title="{} user corpus created on {}".format( + owner.username, datetime.now().strftime("%Y-%m-%d") + ) + ) + + # Create a corpus from every document that has annotator agreement! + else: + corpus = Corpus.objects.create( + labeled=True, title="global corpus created on {}".format( + datetime.now().strftime("%Y-%m-%d") + ) + ) + + for document in Document.objects.all(): + label = document.label() + if label is not None: + LabeledDocument.objects.create( + corpus=corpus, document=document, label=label, + ) + + # Perform the check for the corpus count. + if corpus.documents.count() < self.min_docs: + corpus.delete() # Delete any too small corpora + raise CommandError( + "Could not create a corpus with less than {} documents".format(self.min_docs) + ) + + # Otherwise return the corpus + return CorpusModelReader(corpus), corpus diff --git a/arbiter/migrations/0001_initial.py b/arbiter/migrations/0001_initial.py new file mode 100644 index 0000000..09c5e4e --- /dev/null +++ b/arbiter/migrations/0001_initial.py @@ -0,0 +1,58 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.9.7 on 2016-08-02 17:06 +from __future__ import unicode_literals + +from django.conf import settings +import django.contrib.postgres.fields +from django.db import migrations, models +import django.db.models.deletion +import django.utils.timezone +import model_utils.fields +import picklefield.fields + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.CreateModel( + name='Estimator', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('created', model_utils.fields.AutoCreatedField(default=django.utils.timezone.now, editable=False, verbose_name='created')), + ('modified', model_utils.fields.AutoLastModifiedField(default=django.utils.timezone.now, editable=False, verbose_name='modified')), + ('model_type', models.CharField(choices=[('classifier', 'classifier'), ('regression', 'regression'), ('clusters', 'clusters'), ('decomposition', 'decomposition')], max_length=32)), + ('model_class', models.CharField(blank=True, default=None, max_length=255, null=True)), + ('model_form', models.CharField(blank=True, default=None, max_length=512, null=True)), + ('estimator', picklefield.fields.PickledObjectField(blank=True, default=None, editable=False, null=True)), + ('build_time', models.DurationField(blank=True, default=None, null=True)), + ('owner', models.ForeignKey(blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)), + ], + options={ + 'get_latest_by': 'created', + 'db_table': 'estimators', + }, + ), + migrations.CreateModel( + name='Score', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('created', model_utils.fields.AutoCreatedField(default=django.utils.timezone.now, editable=False, 
verbose_name='created')), + ('modified', model_utils.fields.AutoLastModifiedField(default=django.utils.timezone.now, editable=False, verbose_name='modified')), + ('metric', models.CharField(choices=[('accuracy', 'accuracy'), ('auc', 'auc'), ('brier', 'brier'), ('f1', 'f1'), ('fbeta', 'fbeta'), ('hamming', 'hamming'), ('hinge', 'hinge'), ('jaccard', 'jaccard'), ('logloss', 'logloss'), ('mcc', 'mcc'), ('precision', 'precision'), ('recall', 'recall'), ('roc', 'roc'), ('support', 'support'), ('mae', 'mae'), ('mse', 'mse'), ('mdae', 'mdae'), ('r2', 'r2'), ('rand', 'rand'), ('completeness', 'completeness'), ('homogeneity', 'homogeneity'), ('mutual', 'mutual'), ('silhouette', 'silhouette'), ('v', 'v'), ('time', 'time')], max_length=32)), + ('score', models.FloatField(blank=True, default=None, null=True)), + ('label', models.CharField(blank=True, default=None, max_length=32, null=True)), + ('folds', django.contrib.postgres.fields.ArrayField(base_field=models.FloatField(), blank=True, default=None, null=True, size=None)), + ('estimator', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='scores', to='arbiter.Estimator')), + ], + options={ + 'get_latest_by': 'created', + 'db_table': 'evaluations', + }, + ), + ] diff --git a/arbiter/migrations/0002_estimator_corpus.py b/arbiter/migrations/0002_estimator_corpus.py new file mode 100644 index 0000000..ee7d159 --- /dev/null +++ b/arbiter/migrations/0002_estimator_corpus.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.9.7 on 2016-08-21 20:40 +from __future__ import unicode_literals + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('corpus', '0002_corpus_models'), + ('arbiter', '0001_initial'), + ] + + operations = [ + migrations.AddField( + model_name='estimator', + name='corpus', + field=models.ForeignKey(blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, to='corpus.Corpus'), + ), + ] diff --git a/arbiter/migrations/__init__.py b/arbiter/migrations/__init__.py new file mode 100644 index 0000000..aee5044 --- /dev/null +++ b/arbiter/migrations/__init__.py @@ -0,0 +1,18 @@ +# arbiter.migrations +# Database migrations for arbiter models. +# +# Author: Benjamin Bengfort +# Created: Tue Aug 02 09:13:04 2016 -0400 +# +# Copyright (C) 2016 District Data Labs +# For license information, see LICENSE.txt +# +# ID: __init__.py [cd70726] benjamin@bengfort.com $ + +""" +Database migrations for arbiter models. +""" + +########################################################################## +## Imports +########################################################################## diff --git a/arbiter/models.py b/arbiter/models.py new file mode 100644 index 0000000..122c424 --- /dev/null +++ b/arbiter/models.py @@ -0,0 +1,96 @@ +# arbiter.models +# Model definitions for the arbiter app. +# +# Author: Benjamin Bengfort +# Created: Tue Aug 02 09:16:07 2016 -0400 +# +# Copyright (C) 2016 District Data Labs +# For license information, see LICENSE.txt +# +# ID: models.py [cd70726] benjamin@bengfort.com $ + +""" +Model definitions for the arbiter app. 
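+
+An illustrative sketch of how an Estimator row is created (this mirrors the
+train management command; ``clf`` is assumed to be a fitted Scikit-Learn
+estimator):
+
+    # clf is assumed to be a fitted Scikit-Learn classifier
+    Estimator.objects.create(
+        model_type=Estimator.TYPES.classifier,
+        model_class=clf.__class__.__name__,
+        model_form=repr(clf),
+        estimator=clf,
+    )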
+""" + +########################################################################## +## Imports +########################################################################## + +from django.db import models +from model_utils import Choices +from partisan.utils import nullable +from model_utils.models import TimeStampedModel +from picklefield.fields import PickledObjectField +from django.contrib.postgres.fields import ArrayField + + +########################################################################## +## Estimator Model +########################################################################## + + +class Estimator(TimeStampedModel): + """ + Stores a Scikit-Learn Estimator object as a pickle in the database. + """ + + # Model types to help decide on evaluation criteria + TYPES = Choices('classifier', 'regression', 'clusters', 'decomposition') + + model_type = models.CharField(choices=TYPES, max_length=32) # The type of the estimator + model_class = models.CharField(max_length=255, **nullable) # The class name of the estimator + model_form = models.CharField(max_length=512, **nullable) # The repr of the estimator + estimator = PickledObjectField(**nullable) # The pickled object model + build_time = models.DurationField(**nullable) # The amount of time it took to buld + owner = models.ForeignKey('auth.User', **nullable) # The owner, if any, of the model + corpus = models.ForeignKey('corpus.Corpus', **nullable) # The corpus the estimator was trained on + + class Meta: + db_table = "estimators" + get_latest_by = "created" + + def __str__(self): + s = "{} {} ({})".format( + self.model_class, self.model_type.title(), self.created.strftime('%Y-%m-%d') + ) + + if self.owner: + s += " for {}".format(self.owner) + + return s + + +class Score(TimeStampedModel): + """ + Stores an evaluation metric for an estimator. + """ + + # Metrics define how a specific estimator is scored + METRICS = Choices( + 'accuracy', 'auc', 'brier', 'f1', 'fbeta', 'hamming', 'hinge', + 'jaccard', 'logloss', 'mcc', 'precision', 'recall', 'roc', 'support', + 'mae', 'mse', 'mdae', 'r2', + 'rand', 'completeness', 'homogeneity', 'mutual', 'silhouette', 'v', + 'time', + ) + + metric = models.CharField(choices=METRICS, max_length=32) # The type of the score + score = models.FloatField(**nullable) # The actual value of the score + label = models.CharField(max_length=32, **nullable) # The label, if any, of the score + folds = ArrayField(models.FloatField(), **nullable) # Cross-validation scores + estimator = models.ForeignKey(Estimator, related_name='scores') # The estimator being evaluated + + class Meta: + db_table = "evaluations" + get_latest_by = "created" + + def __str__(self): + s = "{} score for {} = {:0.3f}".format( + self.metric.title(), self.estimator, self.score + ) + + if self.label: + s = "{} ".format(self.label.title()) + s + + return s diff --git a/arbiter/tests.py b/arbiter/tests.py new file mode 100644 index 0000000..d7f55ce --- /dev/null +++ b/arbiter/tests.py @@ -0,0 +1,25 @@ +# arbiter.tests +# Tests for the arbiter app. +# +# Author: Benjamin Bengfort +# Created: Tue Aug 02 09:17:12 2016 -0400 +# +# Copyright (C) 2016 District Data Labs +# For license information, see LICENSE.txt +# +# ID: tests.py [cd70726] benjamin@bengfort.com $ + +""" +Tests for the arbiter app. 
+""" + +########################################################################## +## Imports +########################################################################## + +from django.test import TestCase + + +########################################################################## +## Tests +########################################################################## diff --git a/arbiter/views.py b/arbiter/views.py new file mode 100644 index 0000000..2dd9f49 --- /dev/null +++ b/arbiter/views.py @@ -0,0 +1,22 @@ +# arbiter.views +# Views for the arbiter app. +# +# Author: Benjamin Bengfort +# Created: Tue Aug 02 09:17:46 2016 -0400 +# +# Copyright (C) 2016 District Data Labs +# For license information, see LICENSE.txt +# +# ID: views.py [cd70726] benjamin@bengfort.com $ + +""" +Views for the arbiter app. +""" + +########################################################################## +## Imports +########################################################################## + +from django.shortcuts import render + +# Create your views here. diff --git a/corpus/admin.py b/corpus/admin.py index 1f5d626..09efd96 100644 --- a/corpus/admin.py +++ b/corpus/admin.py @@ -18,7 +18,7 @@ ########################################################################## from django.contrib import admin -from corpus.models import Document, Annotation, Label +from corpus.models import Document, Annotation, Label, Corpus ########################################################################## ## Register Admin @@ -27,3 +27,4 @@ admin.site.register(Label) admin.site.register(Annotation) admin.site.register(Document) +admin.site.register(Corpus) diff --git a/corpus/fixtures/debates.zip b/corpus/fixtures/debates.zip new file mode 100644 index 0000000..01897b4 Binary files /dev/null and b/corpus/fixtures/debates.zip differ diff --git a/corpus/learn.py b/corpus/learn.py new file mode 100644 index 0000000..282da09 --- /dev/null +++ b/corpus/learn.py @@ -0,0 +1,385 @@ +# corpus.learn +# Machine learning for the corpus with Scikit-Learn. +# +# Author: Benjamin Bengfort +# Created: Mon Jul 25 17:23:50 2016 -0400 +# +# Copyright (C) 2016 District Data Labs +# For license information, see LICENSE.txt +# +# ID: learn.py [3100e46] benjamin@bengfort.com $ + +""" +Machine learning for the corpus with Scikit-Learn. +""" + +########################################################################## +## Imports +########################################################################## + +import nltk +import unicodedata + +from nltk.corpus import wordnet as wn +from sklearn.pipeline import Pipeline +from sklearn.pipeline import FeatureUnion +from sklearn.cross_validation import KFold +from sklearn.decomposition import TruncatedSVD +from sklearn.feature_extraction import DictVectorizer +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.metrics import precision_recall_fscore_support + +from partisan.utils import identity, timeit +from collections import Counter, defaultdict + + +########################################################################## +## Corpus Loader (Not a transformer) +########################################################################## + +class CorpusLoader(object): + """ + The corpus loader knows how to deal with an NLTK corpus at the top of a + pipeline by simply taking as input a corpus to read from. It exposes both + the data and the labels and can be set up to do cross-validation. 
+ + If a number of folds is passed in for cross-validation, then the loader + is smart about how to access data for train/test splits. Otherwise it will + simply yield all documents in the corpus. + """ + + def __init__(self, corpus, folds=None, shuffle=True): + self.n_docs = len(corpus.fileids()) + self.corpus = corpus + self.folds = folds + + if folds is not None: + # Generate the KFold cross validation for the loader. + self.folds = KFold(self.n_docs, folds, shuffle) + + @property + def n_folds(self): + """ + Returns the number of folds if it exists; 0 otherwise. + """ + if self.folds is None: return 0 + return self.folds.n_folds + + def fileids(self, fold=None, train=False, test=False): + """ + Returns a listing of the documents filtering to retreive specific + data from the folds/splits. If no fold, train, or test is specified + then the method will return all fileids. + + If a fold is specified (should be an integer between 0 and folds), + then the loader will return documents from that fold. Further, train + or test must be specified to split the fold correctly. + """ + if fold is None: + # If no fold is specified, return all the fileids. + return self.corpus.fileids() + + # Otherwise, identify the fold specifically and get the train/test idx + for fold_idx, (train_idx, test_idx) in enumerate(self.folds): + if fold_idx == fold: break + else: + # We have discovered no correct fold. + raise ValueError( + "{} is not a fold, specify an integer less than {}".format( + fold, self.folds.n_folds + ) + ) + + # Now determine if we're in train or test mode. + if not (test or train) or (test and train): + raise ValueError( + "Please specify either train or test flag" + ) + + # Select only the indices to filter upon. + indices = train_idx if train else test_idx + return [ + fileid for doc_idx, fileid in enumerate(self.corpus.fileids()) + if doc_idx in indices + ] + + def labels(self, fold=None, train=False, test=False): + """ + Fit will load a list of the labels from the corpus categories. + + If a fold is specified (should be an integer between 0 and folds), + then the loader will return documents from that fold. Further, train + or test must be specified to split the fold correctly. + """ + return [ + self.corpus.categories(fileids=fileid)[0] + for fileid in self.fileids(fold, train, test) + ] + + def documents(self, fold=None, train=False, test=False): + """ + A generator of documents being streamed from disk. Each document is + a list of paragraphs, which are a list of sentences, which in turn is + a list of tuples of (token, tag) pairs. All preprocessing is done by + NLTK and the CorpusReader object this object wraps. + + If a fold is specified (should be an integer between 0 and folds), + then the loader will return documents from that fold. Further, train + or test must be specified to split the fold correctly. This method + allows us to maintain the generator properties of document reads. + """ + for fileid in self.fileids(fold, train, test): + yield list(self.corpus.tagged(fileids=fileid)) + + +########################################################################## +## Normalize Transformer +########################################################################## + +class TextNormalizer(BaseEstimator, TransformerMixin): + """ + Takes a list of tokens, removes stopwords and punctuation and lowercases + as well as lemmatizes the words for the first step in feature extraction. 
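+
+    A hypothetical before/after (Penn Treebank tags assumed):
+
+        [[[("The", "DT"), ("senators", "NNS"), ("voted", "VBD")]]]
+        --> ["senator", "vote"]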
+ + Note that this transformer expects as input to transform a list of tuples, + (token, tag) pairs, that represent a single document. + """ + + def __init__(self, stopwords=None): + self.stopwords = set(stopwords or nltk.corpus.stopwords.words('english')) + self.lemmatizer = nltk.WordNetLemmatizer() + + def is_punct(self, token): + """ + Determines if the entire token is punctuation. + """ + return all( + unicodedata.category(char).startswith('P') for char in token + ) + + def is_stopword(self, token): + """ + Determines if the token is a stopword or not. + """ + return token.lower() in self.stopwords + + def tagwn(self, tag): + """ + Returns the WordNet tag from the Penn Treebank tag. + """ + return { + 'N': wn.NOUN, + 'V': wn.VERB, + 'R': wn.ADV, + 'J': wn.ADJ + }.get(tag[0], wn.NOUN) + + def lemmatize(self, token, tag): + """ + Lemmatizes the token according to the part of speech tag. + """ + return self.lemmatizer.lemmatize(token, self.tagwn(tag)) + + def normalize(self, document): + """ + Normalize each (token, tag) pair in the words data set. + """ + return [ + self.lemmatize(token, tag).lower() + for paragraph in document + for sentence in paragraph + for (token, tag) in sentence + if not self.is_punct(token) and not self.is_stopword(token) + ] + + def fit(self, X, y=None): + """ + At the moment, fitting doesn't require any analysis. + """ + return self + + def transform(self, documents): + """ + Transform a corpus of documents into normalized features. + """ + for document in documents: + yield self.normalize(document) + + +########################################################################## +## Statitics Transformer +########################################################################## + +class TextStats(BaseEstimator, TransformerMixin): + """ + Computes the document statistics like length and number of sentences. + """ + + def fit(self, X, y=None): + return self + + def transform(self, documents): + """ + Returns a dictionary of text features in advance of a DictVectorizer. + """ + for document in documents: + # Collect token and vocabulary counts + counts = Counter( + item[0] for para in document for sent in para for item in sent + ) + + # Yield structured information about the document + yield { + 'paragraphs': len(document), + 'sentences': sum(len(para) for para in document), + 'words': sum(counts.values()), + 'vocab': len(counts), + } + + +########################################################################## +## Model Building Functions +########################################################################## + +def construct_pipeline(classifier): + """ + This function creates a feature extraction pipeline that accepts data + from a CorpusLoader and appends the classification model to the end of + the pipeline, returning a newly constructed Pipeline object that is + ready to be fit and trained! 
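+
+    An illustrative usage sketch (mirroring how build_model below uses it):
+
+        # train_docs/train_labels as produced by a CorpusLoader (assumed)
+        model = construct_pipeline(LogisticRegression())
+        model.fit(train_docs, train_labels)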
+ """ + + return Pipeline([ + # Create a Feature Union of Text Stats and Bag of Words + ('union', FeatureUnion( + transformer_list = [ + + # Pipeline for pulling document structure features + ('stats', Pipeline([ + ('stats', TextStats()), + ('vect', DictVectorizer()), + ])), + + # Pipeline for creating a bag of words TF-IDF vector + ('bow', Pipeline([ + ('tokens', TextNormalizer()), + ('tfidf', TfidfVectorizer( + tokenizer=identity, preprocessor=None, lowercase=False + )), + ('best', TruncatedSVD(n_components=1000)), + ])), + + ], + + # weight components in feature union + transformer_weights = { + 'stats': 0.15, + 'bow': 0.85, + }, + )), + + # Append the estimator to the end of the pipeline + ('classifier', classifier), + ]) + + +@timeit +def build_model(loader, model, **kwargs): + """ + This function creates a pipeline from the feature extraction method in + construct_pipeline and the passed in model and model keyword arguments, + then trains the model with the given loader using all folds, then the + complete dataset given by the loader object. It returns the fitted + pipeline object along with scores and timing information. + """ + + # TODO: Add multiprocessing to parallelize build_inner_fold + # TODO: Add verbosity to inform user on command line what is happening + # TODO: Equip this method to be used by Celery workers + + @timeit + def build_inner_fold(loader, classifier, fold=None): + """ + A timed inner function that will return a set of evaluation scores + if a fold is passed in, otherwise will build the model on the entire + dataset and return the fitted model. + """ + + # Get the training data from the loader + X_train = list(loader.documents(fold, train=True)) + y_train = list(loader.labels(fold, train=True)) + + # Construct the pipeline from the instantiated classifier + model = construct_pipeline(classifier) + model.fit(X_train, y_train) + + # If folds is None, then return the fitted model. + if fold is None: return model + + # Otherwise get the test data from the fold to perform an evaluation. + X_test = list(loader.documents(fold, test=True)) + y_test = list(loader.labels(fold, test=True)) + y_pred = model.predict(X_test) + + # Get the per-class scores as a well-structured object + keys = ('precision', 'recall', 'f1', 'support') + scores = precision_recall_fscore_support(y_test, y_pred, labels=model.classes_) + scores = map(lambda s: dict(zip(model.classes_, s)), scores) + scores = dict(zip(keys, scores)) + + # Get the weighted scores and add to the scores object + weighted = precision_recall_fscore_support(y_test, y_pred, average='weighted', pos_label=None) + for key, wscore in zip(keys, weighted): + scores[key]['average'] = float(wscore) if wscore is not None else None + + return scores + + + # Now that the inner function works, let's run the model build process on + # each fold for cross-validation and a final time to complete the model. 
+ scores = defaultdict(lambda: defaultdict(list)) + for fold in range(loader.n_folds): + + classifier = model(**kwargs) # Instantiate the classifier + score, time = build_inner_fold(loader, classifier, fold) # Fit the model for this fold + + # Update the scores as a list of scores for each run + for name, values in score.items(): + for label, value in values.items(): + scores[name][label].append(value) + + # Add the time to the scores listing + scores['times']['folds'].append(time) + + # Build the final model + classifier = model(**kwargs) + classifier, build_time = build_inner_fold(loader, classifier) + scores['times']['final'] = build_time + + # Return everything we've constructed (*whew) + return classifier, scores + + +if __name__ == '__main__': + import os + import pickle + + from corpus.reader import TranscriptCorpusReader + from sklearn.linear_model import LogisticRegression + + path = os.path.join(os.path.dirname(__file__), "fixtures", "debates") + saveto = os.path.join(os.path.dirname(__file__), "fixtures", "maxent-debates.pickle") + corpus = TranscriptCorpusReader(path) + loader = CorpusLoader(corpus, 12) + + model = LogisticRegression + (model, scores), total_time = build_model(loader, model) + + with open(saveto, 'wb') as f: + pickle.dump(model, f) + + with open('scores.pickle', 'wb') as f: + pickle.dump(scores, f) + + print("Finished build process in {}".format(total_time)) diff --git a/corpus/managers.py b/corpus/managers.py index 97277f4..e315174 100644 --- a/corpus/managers.py +++ b/corpus/managers.py @@ -18,7 +18,7 @@ ########################################################################## from django.db import models - +from django.apps import apps ########################################################################## ## Annotation Manager @@ -37,3 +37,37 @@ def democratic(self): Filters the annotations for only democratic annotations. """ return self.filter(label__slug='democratic') + + +########################################################################## +## Corpus Manager +########################################################################## + +class CorpusManager(models.Manager): + + def create_for_user(self, user, **kwargs): + """ + Creates a user-specific corpus containing all the documents that the + user has tagged to date. Can pass in any additional fields as well. + """ + # Lazy load the document model + Document = apps.get_model('corpus.Document') + LabeledDocument = apps.get_model('corpus.LabeledDocument') + + # Add the user to the kwargs and construct the corpus. + kwargs['user'] = user + corpus = self.create(**kwargs) + + # Now add all the documents the user has annotated to date. 
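+        # For a labeled corpus, only documents whose label for this user can
+        # be determined are included; an unlabeled corpus keeps every
+        # annotated document with label=None.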
+ for doc in Document.objects.filter(annotations__user=user): + if corpus.labeled: + label = doc.label(user) + if label is None: continue + else: + label = None + + LabeledDocument.objects.create( + corpus=corpus, document=doc, label=label, + ) + + return corpus diff --git a/corpus/migrations/0002_corpus_models.py b/corpus/migrations/0002_corpus_models.py new file mode 100644 index 0000000..72beca0 --- /dev/null +++ b/corpus/migrations/0002_corpus_models.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.9.7 on 2016-08-21 20:40 +from __future__ import unicode_literals + +import autoslug.fields +from django.conf import settings +from django.db import migrations, models +import django.db.models.deletion +import django.utils.timezone +import model_utils.fields + + +class Migration(migrations.Migration): + + dependencies = [ + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ('corpus', '0001_initial'), + ] + + operations = [ + migrations.CreateModel( + name='Corpus', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('created', model_utils.fields.AutoCreatedField(default=django.utils.timezone.now, editable=False, verbose_name='created')), + ('modified', model_utils.fields.AutoLastModifiedField(default=django.utils.timezone.now, editable=False, verbose_name='modified')), + ('title', models.CharField(blank=True, default=None, max_length=255, null=True)), + ('slug', autoslug.fields.AutoSlugField(editable=False, populate_from='title', unique=True)), + ('labeled', models.BooleanField()), + ], + options={ + 'ordering': ['-created'], + 'verbose_name': 'corpus', + 'db_table': 'corpora', + 'get_latest_by': 'created', + 'verbose_name_plural': 'corpora', + }, + ), + migrations.CreateModel( + name='LabeledDocument', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('created', model_utils.fields.AutoCreatedField(default=django.utils.timezone.now, editable=False, verbose_name='created')), + ('modified', model_utils.fields.AutoLastModifiedField(default=django.utils.timezone.now, editable=False, verbose_name='modified')), + ('corpus', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='corpus.Corpus')), + ('document', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='corpus.Document')), + ('label', models.ForeignKey(blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, to='corpus.Label')), + ], + options={ + 'db_table': 'corpora_documents', + }, + ), + migrations.AlterModelOptions( + name='annotation', + options={'get_latest_by': 'modified', 'ordering': ['-modified']}, + ), + migrations.AddField( + model_name='corpus', + name='documents', + field=models.ManyToManyField(related_name='corpora', through='corpus.LabeledDocument', to='corpus.Document'), + ), + migrations.AddField( + model_name='corpus', + name='user', + field=models.ForeignKey(blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='corpora', to=settings.AUTH_USER_MODEL), + ), + ] diff --git a/corpus/migrations/0003_auto_20160821_1649.py b/corpus/migrations/0003_auto_20160821_1649.py new file mode 100644 index 0000000..b0abeba --- /dev/null +++ b/corpus/migrations/0003_auto_20160821_1649.py @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.9.7 on 2016-08-21 20:49 +from __future__ import unicode_literals + +from django.db import migrations, models +import 
django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('corpus', '0002_corpus_models'), + ] + + operations = [ + migrations.AlterField( + model_name='corpus', + name='labeled', + field=models.BooleanField(default=True), + ), + migrations.AlterField( + model_name='labeleddocument', + name='corpus', + field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='labels', to='corpus.Corpus'), + ), + migrations.AlterField( + model_name='labeleddocument', + name='document', + field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='+', to='corpus.Document'), + ), + ] diff --git a/corpus/migrations/0004_label_description.py b/corpus/migrations/0004_label_description.py new file mode 100644 index 0000000..1db7c03 --- /dev/null +++ b/corpus/migrations/0004_label_description.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.9.7 on 2016-08-30 01:02 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('corpus', '0003_auto_20160821_1649'), + ] + + operations = [ + migrations.AddField( + model_name='label', + name='description', + field=models.CharField(blank=True, default=None, max_length=512, null=True), + ), + ] diff --git a/corpus/models.py b/corpus/models.py index 4e15494..059d370 100644 --- a/corpus/models.py +++ b/corpus/models.py @@ -23,7 +23,9 @@ from django.core.urlresolvers import reverse from model_utils.models import TimeStampedModel from picklefield.fields import PickledObjectField -from corpus.managers import AnnotationManager +from corpus.managers import AnnotationManager, CorpusManager + +from operator import itemgetter ########################################################################## ## Document Model @@ -38,7 +40,7 @@ class Document(TimeStampedModel): long_url = models.URLField(max_length=2000, unique=True) # The long url for the document short_url = models.URLField(max_length=30, **nullable) # The bit.ly shortened url raw_html = models.TextField(**nullable) # The html content fetched (hopefully) - content = PickledObjectField(**nullable) # The preprocessed NLP content in a parsable text representation + content = PickledObjectField(**nullable) # The preprocessed NLP content in a parsable text representation signature = models.CharField(max_length=44, editable=False, **nullable) # A base64 encoded hash of the content n_words = models.SmallIntegerField(**nullable) # The word count of the document n_vocab = models.SmallIntegerField(**nullable) # The size of the vocabulary used @@ -53,6 +55,41 @@ class Meta: get_latest_by = "created" unique_together = ("long_url", "short_url") + def label(self, user=None): + """ + If a user is specified then returns the label for that user. Otherwise + returns the majority voted label for the document in the corpus. + """ + # If a user is supplied get their annotation and return the label. + if user is not None: + annotation = self.annotations.filter(user=user).first() + if annotation: return annotation.label + + # Otherwise aggregate the annotations per document. + # TODO: Add annotator aggreement logic here! + else: + labels = self.labels.annotate(votes=models.Count('id')) + votes = [(label, label.votes) for label in labels] + if votes: + # If we have more than one thing being voted for. 
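+                # e.g. (illustrative) votes == [(<democratic>, 3), (<republican>, 1)]
+                # selects democratic below; an all-way tie returns None.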
+ if len(votes) > 1: + # Check if a tie between all labels + if all([v[1] == o[1] for o in votes for v in votes]): + return None + + # Select the label that has the most votes + vote = max(votes, key=itemgetter(1)) + + # Otherwise we've just got one thing being voted for + else: + vote = votes[0] + + # Make sure that there are enough votes for an article + if vote[1] > 0: + return vote[0] + + return None + def get_absolute_url(self): """ Returns the detail view url for the object @@ -76,7 +113,8 @@ class Label(TimeStampedModel): name = models.CharField(max_length=64, unique=True) # The name of the label slug = AutoSlugField(populate_from='name', unique=True) # A unique slug of the label parent = models.ForeignKey('self', **nullable) # If there is a label hierarchy - documents = models.ManyToManyField( + description = models.CharField(max_length=512, **nullable) # Short description of what the labels means + documents = models.ManyToManyField( 'corpus.Document', through='corpus.Annotation', related_name='labels' ) @@ -120,3 +158,56 @@ def __str__(self): ########################################################################## ## Corpus Model ########################################################################## + +class Corpus(TimeStampedModel): + """ + A model that maintains a mapping of documents to estimators for use in + tracking the training data that is used to fit a text classifier object. + """ + + title = models.CharField(max_length=255, **nullable) + slug = AutoSlugField(populate_from='title', unique=True) + documents = models.ManyToManyField('corpus.Document', through='LabeledDocument', related_name='corpora') + user = models.ForeignKey('auth.User', related_name='corpora', **nullable) + labeled = models.BooleanField(default=True) + + objects = CorpusManager() + + class Meta: + db_table = "corpora" + get_latest_by = "created" + ordering = ["-created"] + verbose_name = "corpus" + verbose_name_plural = "corpora" + + def __str__(self): + if self.title: + return self.title + + # Construct the descriptive string. + s = "{} document corpus created on {}".format( + self.documents.count(), self.created.strftime("%Y-%m-%d") + ) + + if self.user: + s += " by {}".format(self.user) + + return s + + +class LabeledDocument(TimeStampedModel): + """ + A model that tracks the relationship between documents and corpora and + ensures that every document has a static label (or not) so that any model + that has been generated is reproducible. + """ + + corpus = models.ForeignKey('corpus.Corpus', related_name='labels') + document = models.ForeignKey('corpus.Document', related_name='+') + label = models.ForeignKey('corpus.Label', **nullable) + + class Meta: + db_table = "corpora_documents" + + def __str__(self): + return "{} ({})".format(self.document, self.label) diff --git a/corpus/reader.py b/corpus/reader.py new file mode 100644 index 0000000..fee3023 --- /dev/null +++ b/corpus/reader.py @@ -0,0 +1,216 @@ +# corpus.reader +# A simple corpus reader object for training models. +# +# Author: Tony Ojeda +# Created: Mon Jul 25 17:14:45 2016 -0400 +# +# Copyright (C) 2016 District Data Labs +# For license information, see LICENSE.txt +# +# ID: reader.py [01cb9f8] ojedatony1616@gmail.com $ + +""" +A simple corpus reader object for training models. 
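+
+For example (an illustrative sketch, matching the __main__ block at the
+bottom of this module):
+
+    reader = TranscriptCorpusReader("corpus/fixtures/debates")
+    for para in reader.tagged():
+        ...  # paragraphs of part-of-speech tagged sentences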
+""" + +########################################################################## +## Imports +########################################################################## + +import os +import nltk + +from corpus.models import Label +from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader + + +########################################################################## +## Module Constants +########################################################################## + +DOC_PATTERN = r'(?!\.)[\w_\s]+/[\w\s\d\-]+\.txt' +CAT_PATTERN = r'([\w_\s]+)/.*' + + +########################################################################## +## Transcript Corpus Reader +########################################################################## + +class TranscriptCorpusReader(CategorizedPlaintextCorpusReader): + + def __init__(self, root, **kwargs): + CategorizedPlaintextCorpusReader.__init__( + self, root, DOC_PATTERN, cat_pattern=CAT_PATTERN + ) + + def tagged(self, **kwargs): + """ + Returns part-of-speech tagged words in sentences in paragraphs. + """ + for para in self.paras(**kwargs): + yield [ + nltk.pos_tag(sent) for sent in para + ] + + +########################################################################## +## Django Query Corpus Reader +########################################################################## + +class QueryCorpusReader(object): + """ + The query corpus reader takes in a query that yields a list of documents + and modifies it such that it is only fetching the preprocessed content in + a streaming fashion. + """ + + def __init__(self, query, user=None): + """ + Pass in a QuerySet or Query object for selecting a group of documents. + Can also optionally pass in a user to determine labeling scheme. + """ + self.user = user + self.query = query + + def fileids(self, categories=None): + """ + Returns a list of file primary keys for the files that make up this + corpus or that make up the given category(s) if specified. + + Categories can be either a single string or a list of strings. + """ + # If categories is None, return all fileids. + if categories is None: + return self.query.values_list('id', flat=True) + + # Convert to a list if a singleton is passed + if isinstance(categories, (str, Label)): + categories = [categories,] + + # Convert to a quick lookup data structure + categories = set(categories) + + # Manually loop through all documents (bummer) + return [ + doc.id for doc in self.query + if doc.label(self.user) in categories + ] + + def categories(self, fileids=None): + """ + Return a list of file identifiers of the categories defined for this + corpus or the file(s) if it is given. + + Fileids can be either a list of integers or a single integer. + """ + # If fileids is None, return all categories + # HACK: use a unique query on the database + if fileids is None: + return list(set([ + str(doc.label(self.user)) for doc in self.query + ])) + + # Convert to a list if a singleton is passed + if isinstance(fileids, int): + fileids = [fileids,] + + return list(set([ + str(doc.label(self.user)) + for doc in self.query.filter(id__in=fileids) + ])) + + def tagged(self, fileids=None, categories=None): + """ + Returns the content of each document. 
+ """ + if fileids is None: + fileids = self.fileids(categories) + + if isinstance(fileids, int): + fileids = [fileids,] + + for doc in self.query.filter(id__in=fileids).values_list('content', flat=True): + for para in doc: + yield para + + +########################################################################## +## Django Corpus Model Reader +########################################################################## + +class CorpusModelReader(QueryCorpusReader): + """ + Takes a corpus object and automatically references documents. + + Note this class takes advantage of the LabeledDocument through model + between documents and corpora in order to perform queries on the database. + The QueryCorpusReader relies on the label() method of a document for + label discovery and therefore cannot do filtering or querying based on + data that is stored in the database. + """ + + def __init__(self, corpus): + self.corpus = corpus + super(CorpusModelReader, self).__init__( + corpus.documents.all(), corpus.user + ) + + def fileids(self, categories=None): + """ + Returns a list of file primary keys for the files that make up this + corpus or that make up the given category(s) if specified. + + Categories can be either a single string or a list of strings. + """ + # If categories is None, return all fileids. + if categories is None: + return self.query.values_list('id', flat=True) + + # Convert to a list if a singleton is passed + if isinstance(categories, (str, Label)): + categories = [categories,] + + # Convert to a quick lookup data structure + categories = set(categories) + + # Filter the labeled documents based on the label. + query = self.corpus.labels.filter(label__in=categories) + return query.values_list('document_id', flat=True) + + def categories(self, fileids=None): + """ + Return a list of file identifiers of the categories defined for this + corpus or the file(s) if it is given. + + Fileids can be either a list of integers or a single integer. + """ + # If fileids is None, return all categories + if fileids is None: + labels = self.corpus.labels.values_list('label', flat=True).distinct() + return Label.objects.filter(id__in=labels).values_list('slug', flat=True) + + # Convert to a list if a singleton is passed + if isinstance(fileids, int): + fileids = [fileids,] + + labels = self.corpus.labels.filter(document_id__in=fileids) + labels = labels.values_list('label', flat=True).distinct() + return Label.objects.filter(id__in=labels).values_list('slug', flat=True) + + +if __name__ == '__main__': + path = os.path.join(os.path.dirname(__file__), "fixtures", "debates") + corpus = TranscriptCorpusReader(path) + + print("{} documents, {} categories".format( + len(corpus.fileids()), len(corpus.categories()) + )) + + print(", ".join([ + "{} {} documents".format(len(corpus.fileids(categories=cat)), cat) + for cat in corpus.categories() + ])) + + print("{} paragraphs, {} sentences, {} words".format( + len(corpus.paras()), len(corpus.sents()), len(corpus.words()) + )) diff --git a/docs/about.md b/docs/about.md new file mode 100644 index 0000000..e6117f5 --- /dev/null +++ b/docs/about.md @@ -0,0 +1,24 @@ +# About + +This small web application is intended to highlight how to operationalize machine learning models in a web application. We use a political text classifier to demonstrate how individualized models can be managed and stored inside of Django. + +**Why this topic?** Political partisanship has been one of the defining characteristics of the 2016 U.S. Presidential race. 
This application models political partisanship generally, and then it allows users to provide feedback to the model.
+
+#### Establishing the Initial Model
+
+We've used the 2016 Presidential Primary debate transcripts to train machine learning models to classify text into one of two buckets: Republican or Democratic. The pipeline parses and vectorizes the text from the transcripts, removes non-predictive portions (such as moderator comments and candidate names), and assigns a term frequency-inverse document frequency (TF-IDF) value to each word in the transcripts. We then fit and cross-validated a logistic regression model using 12 folds; it classified transcript text as Republican or Democratic with 89% accuracy.
+
+Here's a flowchart explaining how we fit and trained our initial model:
+
+![Building the initial model](img/initialmodel.png)
+
+#### Operationalizing Machine Learning
+
+Machine learning models become much more valuable when users can tweak individualized models. Here, we use the [Django web application framework](http://www.djangoproject.com) to create and apply a model for every user. Django is a fast, secure, and scalable Python web framework that comes with a variety of powerful web application tools.
+
+This project is designed as a template for future interactive machine learning projects. Combining machine learning with a web application framework has broad applications. Here are a couple of ways that this design could be used for future projects:
+
+* Any type of recommender website (a la Amazon or Netflix) with many users
+* Any type of targeted advertising within a specific website that has some way to track users
+
+For more information on our technical design, see the [Partisan Discourse Architecture](/architecture.md).
diff --git a/docs/architecture.md b/docs/architecture.md
new file mode 100644
index 0000000..915e953
--- /dev/null
+++ b/docs/architecture.md
@@ -0,0 +1,105 @@
+# Architecture
+
+**Here is a high-level overview of the Partisan Discourse architecture:**
+
+![Architecture](img/PDarchitecture.png)
+
+### Combining Machine Learning and Django
+
+How and why would someone combine machine learning and Django? Two Partisan Discourse contributors [gave a presentation at PyCon 2016](https://us.pycon.org/2016/schedule/presentation/1614/) about just that:
+
+![General Architecture](img/ddl-ml-web-app.png)
+
+### Code Base
+
+The Partisan Discourse repository is a Django project that includes a dedicated app for fitting, testing, storing, and operationalizing machine learning models.
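+
+Broadly, the `corpus` app manages documents, labels, corpus readers, and the learning pipeline, while the `arbiter` app stores fitted estimators with their evaluation scores and provides the `train` management command.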
+ +Here's what the directory structure looks like: + +``` +├── LICENSE +├── Makefile +├── Procfile +├── README.md +├── arbiter +│   ├── __init__.py +│   ├── __pycache__ +│   ├── admin.py +│   ├── apps.py +│   ├── management +│   ├── migrations +│   ├── models.py +│   ├── tests.py +│   └── views.py +├── bin +│   ├── install_nltk_data +│   └── post_compile +├── corpus +│   ├── __init__.py +│   ├── __init__.pyc +│   ├── __pycache__ +│   ├── admin.py +│   ├── apps.py +│   ├── apps.pyc +│   ├── bitly.py +│   ├── exceptions.py +│   ├── fixtures +│   ├── learn.py +│   ├── managers.py +│   ├── migrations +│   ├── models.py +│   ├── nlp.py +│   ├── reader.py +│   ├── serializers.py +│   ├── signals.py +│   ├── tests.py +│   ├── urls.py +│   └── views.py +├── docs +│   ├── img +│   └── index.md +├── manage.py +├── members +│   ├── __init__.py +│   ├── __init__.pyc +│   ├── __pycache__ +│   ├── admin.py +│   ├── apps.py +│   ├── apps.pyc +│   ├── migrations +│   ├── models.py +│   ├── permissions.py +│   ├── serializers.py +│   ├── signals.py +│   ├── tests.py +│   ├── urls.py +│   └── views.py +├── mkdocs.yml +├── partisan +│   ├── __init__.py +│   ├── __init__.pyc +│   ├── __pycache__ +│   ├── assets +│   ├── settings +│   ├── templates +│   ├── tests +│   ├── urls.py +│   ├── utils.py +│   ├── utils.pyc +│   ├── version.py +│   ├── version.pyc +│   ├── views.py +│   └── wsgi.py +├── requirements.txt +├── runtime.txt +└── venv + ├── bin + ├── include + ├── lib + ├── man + └── pip-selfcheck.json + +``` + +To see the code base, head to the **[Github repository](https://github.com/DistrictDataLabs/partisan-discourse).** diff --git a/docs/img/PDarchitecture.png b/docs/img/PDarchitecture.png new file mode 100644 index 0000000..60402f8 Binary files /dev/null and b/docs/img/PDarchitecture.png differ diff --git a/docs/img/ddl-ml-web-app.png b/docs/img/ddl-ml-web-app.png new file mode 100644 index 0000000..fab6ba4 Binary files /dev/null and b/docs/img/ddl-ml-web-app.png differ diff --git a/docs/img/initialmodel.png b/docs/img/initialmodel.png new file mode 100644 index 0000000..50ae1b2 Binary files /dev/null and b/docs/img/initialmodel.png differ diff --git a/docs/img/user_interface.png b/docs/img/user_interface.png new file mode 100644 index 0000000..5cfa1f3 Binary files /dev/null and b/docs/img/user_interface.png differ diff --git a/docs/img/user_portal.png b/docs/img/user_portal.png new file mode 100644 index 0000000..02843d4 Binary files /dev/null and b/docs/img/user_portal.png differ diff --git a/docs/img/wordcloud.png b/docs/img/wordcloud.png new file mode 100644 index 0000000..6bcee32 Binary files /dev/null and b/docs/img/wordcloud.png differ diff --git a/docs/index.md b/docs/index.md index 7a3ae90..e70d956 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,3 +1,162 @@ -# Partisan Discourse Documentation +# Partisan Discourse -The future home of the documentation for Partisan discourse. +**A web application that identifies bias in political discourse and serves as a template for operationalized machine learning.** + +[![Build Status][travis_img]][travis_href] +[![Coverage Status][coveralls_img]][coveralls_href] +[![Stories in Ready][waffle_img]][waffle_href] +[![Political Parties](img/partisan.jpg)][partisan.jpg] + +For an in-depth description of the project's purpose, head to the [About](about.md) page. + +## Quick Start Guide for Developers + +**Note:** +This project uses Django and Python 3. 
If you're unfamiliar with Django, it may be helpful to go through their [tutorial](https://docs.djangoproject.com/en/1.10/intro/tutorial01/).
+
+1. **Clone the repository:**
+
+        $ git clone git@github.com:DistrictDataLabs/partisan-discourse.git
+        $ cd partisan-discourse
+
+
+2. **Create a virtualenv and install the dependencies:**
+
+        $ virtualenv venv
+        $ source venv/bin/activate
+        $ pip install -r requirements.txt
+
+
+3. **Download NLTK data, which the app uses for NLP purposes.** Open a Python REPL and run:
+
+        >>> import nltk
+        >>> nltk.download()
+
+
+    You should have a directory within your home directory called "nltk_data". You'll use the path to that directory in your .env file, which we'll establish soon.
+
+4. **Google OAuth Client ID and Secret.** Google OAuth is for enabling API access from Google+, which is the method by which users interact with our app.
+    - Head to the [Google developer console page](https://console.developers.google.com), and click on the "Credentials" tab on the left-hand side of the screen. In the center of the screen, click the "Create a Project" button. Name the project "Partisan Discourse," and if you agree to the terms of service, select the "yes" radio button and click "Create."
+    - In the upper-middle section of the page, click "OAuth consent screen." Fill in the product name with "Partisan Discourse;" you can leave the rest blank as long as you're just developing. Click the "Save" button.
+    - Back under the main "Credentials" page, click the "Create credentials" drop-down button, and select the "OAuth Client ID" option. The page will prompt you to select an application type; select the "Web application" radio button.
+    - Name the app "Partisan Discourse." In the "Authorized JavaScript origins" field, enter **http://127.0.0.1:8000**. Under "Authorized redirect URIs," enter **http://127.0.0.1:8000/complete/google-oauth2/**. Click "create," and you should receive your OAuth client ID and secret.
+
+
+    Save your ID and secret; you'll use them soon to allow users to interact with your API using their Google+ information.
+
+
+5. **Bitly Access Token.** Partisan Discourse shortens URLs using the Bitly service. Go to Bit.ly and create an account.
+    - After creating an account, click on the menu icon at the top right corner of the screen. Go to Settings > Advanced Settings, and click on the "OAuth" link under the "For Developers" section. You should arrive at the [Bitly OAuth page](https://bitly.com/a/oauth_apps).
+    - Click on the "Register an Application" link, then click on the "Get Registration Code" button. Check the email account that you used to sign up for Bitly, and click on the link in the email that says "Verify your Email."
+    - After verifying your email, head back to the [manage my apps](https://bitly.com/a/oauth_apps) page, and click "generate token."
+    - You should have received another email from Bitly; in it, click "Complete Registration." Name it Partisan Discourse, and set the link to **http://127.0.0.1:8000/** (Django's default port is 8000). The redirect link should be **http://127.0.0.1:8000/complete/google-oauth2/**. The description that we use is "A web application that identifies party in political discourse and serves as an example of operationalized machine learning." Click "Create Your App" at the bottom of the page.
+    - At this point, you should see a page that gives you a client ID, a client secret, and a generic access token.
+
+
+    Save your access token; you'll use it soon to allow app users to connect to the Bitly API.
+
+
+6. 
**Create a .env file for your environment variables:**
+
+        $ touch .env
+        $ open .env
+
+    Paste in the following:
+
+        WEB_CONCURRENCY=2
+        DATABASE_URL=postgres://django@localhost/partisan
+        DJANGO_SETTINGS_MODULE=partisan.settings.development
+        SECRET_KEY=""
+        EMAIL_HOST_USER=""
+        EMAIL_HOST_PASSWORD=""
+        GOOGLE_OAUTH2_CLIENT_ID="your-oauth-client-ID"
+        GOOGLE_OAUTH2_CLIENT_SECRET="your-oauth-client-secret"
+        BITLY_ACCESS_TOKEN="your-bitly-access-token"
+        NLTK_DATA=/path/to/your/NLTK/data
+
+    Here's an explanation of each of the .env lines and their purposes:
+
+    - **Web concurrency:** Heroku hosting setting.
+    - **Database URL:** describes how to connect to Postgres locally. Here, you're specifying to use the Postgres protocol, which is necessary for Partisan Discourse. The "django@localhost/partisan" portion specifies that the user should be "django", the host "localhost", and the database "partisan".
+    - **Django settings module:** specifies where to retrieve the settings documents. Instead of just using a settings.py, we use different settings documents for production, development, and testing. Changing the DJANGO_SETTINGS_MODULE in your .env file will change the settings that you use accordingly.
+    - **Secret key:** additional security. Using the secret key in your .env file (and including your .env file in your .gitignore file) will help prevent unauthorized access to your app.
+    - **Email host user and password:** this allows Django to send email on your behalf.
+    - **Google OAuth Client ID and secret** (referenced earlier): replace the quotation marks with your client ID and secret.
+    - **Bitly Access Token:** URL shortening (referenced earlier); replace the quotation marks with your access token.
+    - **NLTK data:** paste the path to your nltk_data directory. It should have installed to your home directory.
+
+
+
+7. **Set up your Postgres database.** Ensure that Postgres is running. Next, create the proper role ("django") and database ("partisan") by typing the following into your psql terminal window:
+
+        =# CREATE ROLE django WITH LOGIN;
+        =# CREATE DATABASE partisan WITH OWNER django;
+
+    When you view your databases, you should now see one with the name "partisan" and the owner "django."
+
+8. **Migrate the app.** Django uses migrations to propagate changes you've made in your models (like adding a field) into your database schema. Enter the following into your terminal:
+
+        $ python manage.py migrate
+
+
+9. **Run the server.** By default, Django uses port 8000.
+
+        $ python manage.py runserver
+
+    In your web browser, go to address 127.0.0.1:8000. If you see a login page, then congratulations! You have successfully set up Partisan Discourse locally!
+
+
+
+
+
+## Contributing to Partisan Discourse
+
+1. Head to the [Github repository](https://github.com/DistrictDataLabs/partisan-discourse) and fork the project.
+
+
+2. Check out the appropriate branch:
+
+        $ git fetch origin develop
+        $ git checkout develop
+
+
+3. After making changes that you feel would benefit the project, push them to Github:
+
+
+        $ git add .
+        $ git commit -m "bug fix #[insert fix number here]"
+        $ git push origin develop
+
+
+4. Submit a pull request. Head to the Partisan Discourse [Github repository](https://github.com/DistrictDataLabs/partisan-discourse), click on "pull requests," and then click on "new pull request." Choose the proper base fork and branch (Partisan Discourse; develop) and the proper head fork and branch (your fork; develop).
+
+If you find an issue but can't fix it, be sure to submit an issue [here](https://github.com/DistrictDataLabs/partisan-discourse/issues).
+
+
+
+#### Changelog
+
+The release versions that are deployed to the web servers are also tagged in GitHub. You can see the tags through the GitHub web application and download the tarball of the version you'd like.
+
+The versioning uses a three part version system, "a.b.c" - "a" represents a major release that may not be backwards compatible. "b" is incremented on minor releases that may contain extra features, but are backwards compatible. "c" releases are bug fixes or other micro changes that developers should feel free to immediately update to.
+
+#### Version 0.1 Beta 1
+
+* **tag**: [v0.1b1](https://github.com/DistrictDataLabs/partisan-discourse/releases/tag/v0.1b1)
+* **deployment**: Monday, July 18, 2016
+* **commit**: [see tag](#)
+
+This is the first beta release of the Partisan Discourse application. Right now this simple web application allows users to sign in, then add links to go fetch web content to the global corpus. These links are then preprocessed using NLP. Users can tag the documents as Republican or Democrat, allowing us to build a political classifier.
+
+#### Attribution
+
+The image used in this README, [Partisan Fail][partisan.jpg] by [David Colarusso](https://www.flickr.com/photos/dcolarusso/), is licensed under [CC BY-NC 2.0](https://creativecommons.org/licenses/by-nc/2.0/).
+
+
+[travis_img]: https://travis-ci.org/DistrictDataLabs/partisan-discourse.svg
+[travis_href]: https://travis-ci.org/DistrictDataLabs/partisan-discourse
+[waffle_img]: https://badge.waffle.io/DistrictDataLabs/partisan-discourse.png?label=ready&title=Ready
+[waffle_href]: https://waffle.io/DistrictDataLabs/partisan-discourse
+[coveralls_img]: https://coveralls.io/repos/github/DistrictDataLabs/partisan-discourse/badge.svg?branch=master
+[coveralls_href]: https://coveralls.io/github/DistrictDataLabs/partisan-discourse?branch=master
+[partisan.jpg]: https://flic.kr/p/a3bXVU
diff --git a/docs/userguide.md b/docs/userguide.md
new file mode 100644
index 0000000..d1f43b7
--- /dev/null
+++ b/docs/userguide.md
@@ -0,0 +1,19 @@
+# User Guide
+
+Partisan Discourse is straightforward to use. But before users can begin interacting with the app, make sure that it's running (see the last step of the [developers' quick start guide](index.md) if you have questions).
+
+Then, users can do the following:
+
+**Log into the app.** If it's hosted locally (per the developers' quick start guide), then just type **127.0.0.1:8000** into your browser. You should see the following splash page:
+
+
+![User Portal](img/user_portal.png)
+
+
+Click "Sign in with Google" to access the Google OAuth API. Now, you should see this page:
+
+
+![User Interface](img/user_interface.png)
+
+
+**Visit the website of any current political news article and paste the URL into the appropriate box at the top of the screen.** Click "Go." On the right-hand side of the screen, click on the buttons "Democratic" and "Republican" to classify the article one way or the other.
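The members/urls.py change just below broadens the member detail URL pattern from `[\w-]+` to `[\w\.\-]+` so that usernames containing periods resolve instead of returning a 404. A minimal sketch of the behavioral difference; the capture group name `slug` here is illustrative, not taken from the diff:

```python
import re

# Illustrative patterns only; `slug` stands in for the actual group name.
OLD = re.compile(r'^(?P<slug>[\w-]+)/$')     # rejects a username like "jane.doe"
NEW = re.compile(r'^(?P<slug>[\w\.\-]+)/$')  # accepts it

print(OLD.match('jane.doe/'))                # None: the profile page would 404
print(NEW.match('jane.doe/').group('slug'))  # 'jane.doe'
```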
diff --git a/members/urls.py b/members/urls.py index ae1e57f..9f8698e 100644 --- a/members/urls.py +++ b/members/urls.py @@ -26,5 +26,5 @@ urlpatterns = ( url(r'^members/$', MemberListView.as_view(), name='list'), - url(r'^(?P[\w-]+)/$', MemberView.as_view(), name='detail'), + url(r'^(?P[\w\.\-]+)/$', MemberView.as_view(), name='detail'), ) diff --git a/mkdocs.yml b/mkdocs.yml index c97182f..6bc7211 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1 +1,13 @@ -site_name: My Docs +site_name: Partisan Discourse +repo_name: GitHub +repo_url: https://github.com/DistrictDataLabs/partisan-discourse +site_description: A web application that identifies party in political discourse and an example of operationalized machine learning. +site_author: District Data Labs +copyright: Copyright © 2015-2016 District Data Labs, All Rights Reserved +theme: readthedocs + +pages: + - Quick Start: index.md + - About: about.md + - Architecture: architecture.md + - User Guide: userguide.md diff --git a/partisan/assets/css/style.css b/partisan/assets/css/style.css index a4b14a0..bf22ab8 100644 --- a/partisan/assets/css/style.css +++ b/partisan/assets/css/style.css @@ -84,3 +84,10 @@ div.header-buttons { .text-democratic { color: #008cba; } + + +.jumbotron { + background-color: #008cba; + margin-top: -41px; + color: #FEFEFE; +} diff --git a/partisan/assets/js/annotate.js b/partisan/assets/js/annotate.js index 944f8ef..abcee8e 100644 --- a/partisan/assets/js/annotate.js +++ b/partisan/assets/js/annotate.js @@ -9,35 +9,65 @@ * - jquery */ -(function($) { - $(document).ready(function() { - var annotateForm = $("#annotateForm"); +(function() { + + // AnnotateView wraps a form with annotation buttons to provide interactive + // functionality with the API to select or deselect annotations in the DOM. + AnnotateView = function(selector, options) { + + this.form = null; + this.options = {}; + this.buttons = []; + this.labelInput = null; + + // Initializes the view + this.init = function(selector, options) { + + // Set default options + options = options || {}; + this.options = _.defaults(options, this.options); + + // Get the required jQuery elements + this.form = $(selector); + this.buttons = this.form.find("button[type=submit]"); + this.labelInput = this.form.find("input[name=label]"); + + // Bind the required event handlers + var self = this; + this.buttons.click(function(e) { return self.onClick(this, e); }); + this.form.submit(function(e) { return self.onSubmit(this, e); }); - annotateForm.find("button[type=submit]").click(function(e) { - // When the annotate button is clicked, set the val of the form. 
- var target = $(e.target); + // Return this for chaining + return this; - if (!target.data('selected')) { + } + + // When one of the buttons is clicked + this.onClick = function(caller, event) { + var button = $(caller); + + if (!button.data('selected')) { // Label the annotation with the slug of the button - var label = target.data('label-slug'); - annotateForm.find("#label").val(label); + var label = button.data('label-slug'); + this.labelInput.val(label); } else { - // Null the label on the annotation - annotateForm.find("#label").val(""); + // Null the label on the annotation (to "deselect" the annotation) + this.labelInput.val(""); } + } - }); - - annotateForm.submit(function(e) { - e.preventDefault(); + // When the form is submitted + this.onSubmit = function(caller, event) { + var self = this; + event.preventDefault(); // Get the action and method from the form - var method = annotateForm.attr('method'); - var action = annotateForm.attr('action'); + var method = this.form.attr('method'); + var action = this.form.attr('action'); // Get the data from the form var data = { - 'label': annotateForm.find('#label').val() + 'label': this.labelInput.val() } // Now make the AJAX request to the endpoint @@ -49,20 +79,19 @@ }).done(function(data) { // On successful post of the annotation reset the buttons. - var labelSlug = data.label - console.log("Setting annotation to", labelSlug); + console.log("Setting annotation to", data.label); // Go through each button and set the data as required. - $.each(annotateForm.find("button[type=submit]"), function(idx, btn) { + $.each(self.buttons, function(idx, btn) { btn = $(btn); - if (btn.data('label-slug') == labelSlug) { + if (btn.data('label-slug') == data.label) { // Ok this is the newly selected button // Set the selected attribute to true and the class to primary. btn.data('selected', true); btn.removeClass('btn-default'); btn.find("i").addClass('icon-white'); - btn.addClass('btn-' + labelSlug); + btn.addClass('btn-' + data.label); } else { // This is not the newly selected button @@ -83,7 +112,14 @@ }); return false; - }); + } + return this.init(selector, options) + }; + + // When the document is ready bind all annotation views + $(document).ready(function() { + annotators = _.map($(".annotate-form"), AnnotateView); }); -})(jQuery); + +})(); diff --git a/partisan/settings/base.py b/partisan/settings/base.py index 61b4027..af93ce2 100644 --- a/partisan/settings/base.py +++ b/partisan/settings/base.py @@ -115,6 +115,7 @@ def environ_setting(name, default=None): 'django_gravatar', # Partisan Discourse apps + 'arbiter', 'corpus', 'members', ] @@ -276,6 +277,8 @@ def environ_setting(name, default=None): ('Benjamin Bengfort', 'bbengfort@districtdatalabs.com'), ('Tony Ojeda', 'tojeda@districtdatalabs.com'), ('Rebecca Bilbro', 'rbilbro@districtdatalabs.com'), + ('Laura Lorenz', 'llorenz@districtdatalabs.com'), + ('Sam Goodgame', 'samuel.goodgame@gmail.com'), ) SERVER_EMAIL = 'DDL Admin ' diff --git a/partisan/templates/components/footer.html b/partisan/templates/components/footer.html index beaf936..3e157bc 100644 --- a/partisan/templates/components/footer.html +++ b/partisan/templates/components/footer.html @@ -3,6 +3,7 @@
    +
  • About
  • Terms
  • Privacy
  • API
  • diff --git a/partisan/templates/corpus/document.html b/partisan/templates/corpus/document.html index 0c3e92f..5d316f2 100644 --- a/partisan/templates/corpus/document.html +++ b/partisan/templates/corpus/document.html @@ -80,7 +80,7 @@

    {{ document.title }}

-
+
{% for label in labels %} @@ -94,6 +94,9 @@

{{ document.title }}

{{ label }} {% endfor %} +
{% csrf_token %}
@@ -116,6 +119,34 @@

{{ document.title }}

{% endblock %} +{% block modals %} + {{ block.super }} + + +{% endblock %} + {% block javascripts %} {{ block.super }} diff --git a/partisan/templates/site/about.html b/partisan/templates/site/about.html new file mode 100644 index 0000000..35a6b3a --- /dev/null +++ b/partisan/templates/site/about.html @@ -0,0 +1,106 @@ +{% extends 'page.html' %} +{% load staticfiles %} + +{% block content %} + +
+
+

About Partisan Discourse

+

Learn about the team, motivation, and technology behind the Partisan Discourse application.

+
+
+ +
+
+ + + + + +
+ + + +

Partisan Discourse is a Django application designed to highlight the operation and management of multiple machine learning models in a web application.

+ +

This application is intended to show the operation of multiple machine learning models trained to predict the categorization of text documents, similar to sentiment analysis, but in this case to detect partisanship. More than one model is at work in this application: an initial model trained on the transcripts of the Democratic and Republican debates, a global model trained on all document annotations, models trained by experts, and per-user models for all participants. The point isn't necessarily to meaningfully detect partisanship but rather to provide an understandable framework for how machine learning works in web applications.

+ +

The original work on political classification using text modeling was performed by students in the Georgetown University Data Science Certificate Program. They described a method to classify text as “red” or “blue” based on a model trained on transcripts from the 2016 Presidential Candidate Debates. They applied the model to a variety of news sources to determine if they could detect bias. In the process of their exploration, they noted that bias was subjective and that models could be influenced by that bias.

+ +

By allowing users to add their own documents and annotate them, we hope to demonstrate how web applications can learn and grow from user interaction, as well as how bias and subjectivity affect applications.

+ + + +

Development and research of the Partisan Discourse application are incubated by District Data Labs as an open source web application.

+ +

Core Team

+ + +

DDL Contributing Faculty

+ + +

Open Source Contributors

+ +

Partisan Discourse is an open source application; you can contribute by opening an issue or submitting a pull request. Please refer to the documentation for contribution guidelines and for help getting started with development.

+ + + +

We didn't develop Partisan Discourse from scratch; we used many open source technologies that we'd like to credit here.

+ +

Back-End

+ +

The backend is written entirely in Python, using Python tools!

+ + + +

Front-End

+ +

We are not designers, and the design of the website relies completely on Bootstrap components!

+ + + +

Iconography

+ +

Graphical elements take a lot of work to build, and we appreciate those who allow us to use them!

+ + + +
+ +
+ +
+
+
+{% endblock %}
diff --git a/partisan/tests/test_init.py b/partisan/tests/test_init.py
index b0bed66..cc1e62e 100644
--- a/partisan/tests/test_init.py
+++ b/partisan/tests/test_init.py
@@ -23,7 +23,7 @@
 ## Module variables
 ##########################################################################
 
-EXPECTED_VERSION = "0.1b2"
+EXPECTED_VERSION = "0.2b3"
 
 ##########################################################################
 ## Initialization Tests
diff --git a/partisan/urls.py b/partisan/urls.py
index 467815f..7f27fa9 100644
--- a/partisan/urls.py
+++ b/partisan/urls.py
@@ -63,6 +63,7 @@
     # Application URLs
     url(r'^$', HomePageView.as_view(), name='home'),
+    url(r'^about/$', TemplateView.as_view(template_name='site/about.html'), name='about'),
     url(r'^terms/$', TemplateView.as_view(template_name='site/legal/terms.html'), name='terms'),
     url(r'^privacy/$', TemplateView.as_view(template_name='site/legal/privacy.html'), name='privacy'),
diff --git a/partisan/utils.py b/partisan/utils.py
index 49b82fa..4bd2686 100644
--- a/partisan/utils.py
+++ b/partisan/utils.py
@@ -24,7 +24,7 @@
 from functools import wraps
 from markdown import markdown
-
+from datetime import datetime
 
 ##########################################################################
 ## Utilities
@@ -74,8 +74,15 @@ def htmlize(text):
     return text
 
 
+def identity(arg):
+    """
+    Simple identity function that works as a passthrough.
+    """
+    return arg
+
+
 ##########################################################################
-## Memoization
+## Decorators
 ##########################################################################
 
@@ -95,3 +102,18 @@ def fget_memoized(self):
         return getattr(self, attr_name)
 
     return property(fget_memoized)
+
+
+def timeit(func):
+    """
+    Simple wall clock timer for a function. Returns the function's result
+    along with a datetime.timedelta object suitable for storage in a
+    models.DurationField.
+    """
+
+    @wraps(func)
+    def func_timed(*args, **kwargs):
+        start = datetime.now()
+        result = func(*args, **kwargs)
+        return result, datetime.now() - start
+
+    return func_timed
diff --git a/partisan/version.py b/partisan/version.py
index bac560c..f22db70 100644
--- a/partisan/version.py
+++ b/partisan/version.py
@@ -19,10 +19,10 @@
 __version_info__ = {
     'major': 0,
-    'minor': 1,
+    'minor': 2,
     'micro': 0,
     'releaselevel': 'beta',
-    'serial': 2,
+    'serial': 3,
 }
diff --git a/requirements.txt b/requirements.txt
index 43193d9..876661a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -41,6 +41,11 @@ lxml==3.6.0
 chardet==2.3.0
 cssselect==0.9.2
 
+## Machine Learning Dependencies
+scikit-learn==0.17.1
+numpy==1.11.1
+scipy==0.18.0
+
 ## Testing
 nose==1.3.7
 coverage==4.1
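A usage note on the `timeit` decorator added to partisan/utils.py above: the wrapper returns the function's result together with the elapsed `datetime.timedelta`, so callers unpack a pair rather than a bare value. A small sketch; the `build_model` function is a hypothetical stand-in, not code from this changeset:

```python
from partisan.utils import timeit


@timeit
def build_model(corpus):
    # Hypothetical stand-in for fitting an estimator on a corpus.
    return "fitted-model"

# The decorated call returns (result, wall clock duration).
model, duration = build_model("debates")

# `duration` is a datetime.timedelta, suitable for storing in a
# models.DurationField as the docstring suggests.
print(model, duration)
```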