mms database layer and management command to train

DistrictDataLabs · Aug 2, 2016 · feb4803 · feb4803
1 parent cd70726
commit feb4803
Show file tree

Hide file tree

Showing 7 changed files with 236 additions and 14 deletions.
diff --git a/arbiter/management/__init__.py b/arbiter/management/__init__.py
@@ -0,0 +1,18 @@
+# arbiter.management
+# A module that specifies Django management commands for the arbiter app.
+#
+# Author:   Benjamin Bengfort <[email protected]>
+# Created:  Tue Aug 02 10:36:54 2016 -0400
+#
+# Copyright (C) 2016 District Data Labs
+# For license information, see LICENSE.txt
+#
+# ID: __init__.py [] [email protected] $
+
+"""
+A module that specifies Django management commands for the arbiter app.
+"""
+
+##########################################################################
+## Imports
+##########################################################################
diff --git a/arbiter/management/commands/__init__.py b/arbiter/management/commands/__init__.py
@@ -0,0 +1,18 @@
+# arbiter.management.commands
+# Module that contains each individual management command for Django.
+#
+# Author:   Benjamin Bengfort <[email protected]>
+# Created:  Tue Aug 02 10:37:24 2016 -0400
+#
+# Copyright (C) 2016 District Data Labs
+# For license information, see LICENSE.txt
+#
+# ID: __init__.py [] [email protected] $
+
+"""
+Module that contains each individual management command for Django.
+"""
+
+##########################################################################
+## Imports
+##########################################################################
diff --git a/arbiter/management/commands/train.py b/arbiter/management/commands/train.py
@@ -0,0 +1,151 @@
+# arbiter.management.commands.train
+# Command to train red/blue classifiers from the command line.
+#
+# Author:   Benjamin Bengfort <[email protected]>
+# Created:  Tue Aug 02 10:38:54 2016 -0400
+#
+# Copyright (C) 2016 District Data Labs
+# For license information, see LICENSE.txt
+#
+# ID: train.py [] [email protected] $
+
+"""
+Command to train red/blue classifiers from the command line.
+"""
+
+##########################################################################
+## Imports
+##########################################################################
+
+import numpy as np
+
+from arbiter.models import Estimator, Score
+from django.contrib.auth.models import User
+from corpus.reader import TranscriptCorpusReader
+from corpus.learn import CorpusLoader, build_model
+from django.core.management.base import BaseCommand, CommandError
+
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.linear_model import SGDClassifier
+from sklearn.linear_model import LogisticRegression
+
+
+##########################################################################
+## Training Command
+##########################################################################
+
+class Command(BaseCommand):
+
+    help = "Trains red/blue classifiers and stores them in the database."
+
+    estimators = {
+        'maxent': (LogisticRegression, {}),
+        'svm': (SGDClassifier, {'loss':'hinge', 'penalty':'l2', 'alpha':1e-3}),
+        'nbayes': (MultinomialNB, {}),
+    }
+
+    def add_arguments(self, parser):
+        """
+        Add command line argparse arguments.
+        """
+        # Model selection argument
+        parser.add_argument(
+            '-m', '--model', choices=self.estimators, default='maxent',
+            help='specify the model form to fit on the given corpus',
+        )
+
+        # Number of folds for cross-validation
+        parser.add_argument(
+            '-f', '--folds', type=int, default=12,
+            help='number of folds to use in cross-validation',
+        )
+
+        # Optional ownership argument
+        parser.add_argument(
+            '-u', '--username', default=None,
+            help='specify the username to associate with the model',
+        )
+
+        # TODO: Change this to allow for a query or a path on disk
+        parser.add_argument('corpus', nargs=1, help='path to the corpus on disk')
+
+    def handle(self, *args, **options):
+        """
+        Handles the model training process
+        """
+
+        # Get the details from the command line arguments
+        model, kwargs = self.estimators[options['model']]
+        owner  = self.get_user(options['username'])
+
+        # Construct the corpus and loader in preparation for training.
+        # TODO: Make the corpus loader construction a method to handle querysets
+        corpus = TranscriptCorpusReader(options['corpus'][0])
+        loader = CorpusLoader(corpus, options['folds'])
+
+        # Inform the user that the training process is beginning
+        self.stdout.write((
+            "Starting training of {} {} models on the corpus at {}\n"
+            "This may take quite a bit of time, please be patient!\n"
+        ).format(
+            loader.n_folds + 1, model.__name__, options['corpus'][0]
+        ))
+
+        # GO! Build the model forever! Whooo!!!
+        (clf, scores), total_time = build_model(loader, model, **kwargs)
+
+        # Save the estimator model
+        estimator = Estimator.objects.create(
+            model_type  = Estimator.TYPES.classifier,
+            model_class = model.__name__,
+            model_form  = repr(clf),
+            estimator   = clf,
+            build_time  = total_time,
+            owner       = owner,
+        )
+
+        # Save the scores objects.
+        for metric, values in scores.items():
+
+            # Handle the time key in particular.
+            if metric == 'times':
+                Score.objects.create(
+                    metric    = Score.METRICS.time,
+                    score     = values['final'].total_seconds(),
+                    folds     = [td.total_seconds() for td in values['folds']],
+                    estimator = estimator,
+                )
+                continue
+
+            # Handle generic scores for the model
+            for label, folds in values.items():
+                if metric == 'support' and label == 'average':
+                    # This will be an array of None values, so skip.
+                    continue
+
+                Score.objects.create(
+                    metric    = metric,
+                    score     = np.asarray(folds).mean(),
+                    label     = label,
+                    folds     = folds,
+                    estimator = estimator,
+                )
+
+
+        # Report model construction complete
+        self.stdout.write(
+            "Training complete in {}! Estimator saved to the database\n".format(total_time)
+        )
+
+    def get_user(self, username):
+        """
+        Returns the user with the given username, or None if no username is
+        given; raises a CommandError if no such user exists in the database.
+        """
+        if username is None: return None
+        try:
+            return User.objects.get(username=username)
+        except User.DoesNotExist:
+            raise CommandError(
+                "No user with username '{}' in the database".format(username)
+            )
diff --git a/arbiter/migrations/0001_initial.py b/arbiter/migrations/0001_initial.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-# Generated by Django 1.9.7 on 2016-08-02 14:31
+# Generated by Django 1.9.7 on 2016-08-02 17:06
 from __future__ import unicode_literals
 
 from django.conf import settings
@@ -34,7 +34,8 @@ class Migration(migrations.Migration):
                 ('owner', models.ForeignKey(blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
             ],
             options={
-                'abstract': False,
+                'get_latest_by': 'created',
+                'db_table': 'estimators',
             },
         ),
         migrations.CreateModel(
@@ -43,14 +44,15 @@ class Migration(migrations.Migration):
                 ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                 ('created', model_utils.fields.AutoCreatedField(default=django.utils.timezone.now, editable=False, verbose_name='created')),
                 ('modified', model_utils.fields.AutoLastModifiedField(default=django.utils.timezone.now, editable=False, verbose_name='modified')),
-                ('metric', models.CharField(choices=[('accuracy', 'accuracy'), ('auc', 'auc'), ('brier', 'brier'), ('f1', 'f1'), ('fbeta', 'fbeta'), ('hamming', 'hamming'), ('hinge', 'hinge'), ('jaccard', 'jaccard'), ('logloss', 'logloss'), ('mcc', 'mcc'), ('precision', 'precision'), ('recall', 'recall'), ('roc', 'roc'), ('support', 'support'), ('mae', 'mae'), ('mse', 'mse'), ('mdae', 'mdae'), ('r2', 'r2'), ('rand', 'rand'), ('completeness', 'completeness'), ('homogeneity', 'homogeneity'), ('mutual', 'mutual'), ('silhouette', 'silhouette'), ('v', 'v')], max_length=32)),
+                ('metric', models.CharField(choices=[('accuracy', 'accuracy'), ('auc', 'auc'), ('brier', 'brier'), ('f1', 'f1'), ('fbeta', 'fbeta'), ('hamming', 'hamming'), ('hinge', 'hinge'), ('jaccard', 'jaccard'), ('logloss', 'logloss'), ('mcc', 'mcc'), ('precision', 'precision'), ('recall', 'recall'), ('roc', 'roc'), ('support', 'support'), ('mae', 'mae'), ('mse', 'mse'), ('mdae', 'mdae'), ('r2', 'r2'), ('rand', 'rand'), ('completeness', 'completeness'), ('homogeneity', 'homogeneity'), ('mutual', 'mutual'), ('silhouette', 'silhouette'), ('v', 'v'), ('time', 'time')], max_length=32)),
                 ('score', models.FloatField(blank=True, default=None, null=True)),
                 ('label', models.CharField(blank=True, default=None, max_length=32, null=True)),
                 ('folds', django.contrib.postgres.fields.ArrayField(base_field=models.FloatField(), blank=True, default=None, null=True, size=None)),
                 ('estimator', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='scores', to='arbiter.Estimator')),
             ],
             options={
-                'abstract': False,
+                'get_latest_by': 'created',
+                'db_table': 'evaluations',
             },
         ),
     ]
diff --git a/arbiter/models.py b/arbiter/models.py
@@ -45,6 +45,20 @@ class Estimator(TimeStampedModel):
     build_time  = models.DurationField(**nullable)               # The amount of time it took to build
     owner       = models.ForeignKey('auth.User', **nullable)     # The owner, if any, of the model
 
+    class Meta:
+        db_table = "estimators"
+        get_latest_by = "created"
+
+    def __str__(self):
+        s =  "{} {} ({})".format(
+            self.model_class, self.model_type.title(), self.created.strftime('%Y-%m-%d')
+        )
+
+        if self.owner:
+            s += " for {}".format(self.owner)
+
+        return s
+
 
 class Score(TimeStampedModel):
     """
@@ -57,10 +71,25 @@ class Score(TimeStampedModel):
         'jaccard', 'logloss', 'mcc', 'precision', 'recall', 'roc', 'support',
         'mae', 'mse', 'mdae', 'r2',
         'rand', 'completeness', 'homogeneity', 'mutual', 'silhouette', 'v',
+        'time',
     )
 
     metric    = models.CharField(choices=METRICS, max_length=32)    # The type of the score
     score     = models.FloatField(**nullable)                       # The actual value of the score
     label     = models.CharField(max_length=32, **nullable)         # The label, if any, of the score
     folds     = ArrayField(models.FloatField(), **nullable)         # Cross-validation scores
     estimator = models.ForeignKey(Estimator, related_name='scores') # The estimator being evaluated
+
+    class Meta:
+        db_table = "evaluations"
+        get_latest_by = "created"
+
+    def __str__(self):
+        s = "{} score for {} = {:0.3f}".format(
+            self.metric.title(), self.estimator, self.score
+        )
+
+        if self.label:
+            s = "{} ".format(self.label.title()) + s
+
+        return s
diff --git a/corpus/learn.py b/corpus/learn.py
@@ -294,6 +294,10 @@ def build_model(loader, model, **kwargs):
     pipeline object along with scores and timing information.
     """
 
+    # TODO: Add multiprocessing to parallelize build_inner_fold
+    # TODO: Add verbosity to inform user on command line what is happening
+    # TODO: Equip this method to be used by Celery workers
+
     @timeit
     def build_inner_fold(loader, classifier, fold=None):
         """
@@ -319,9 +323,8 @@ def build_inner_fold(loader, classifier, fold=None):
         y_pred  = model.predict(X_test)
 
         # Get the per-class scores as a well-structured object
-        keys = ('precision', 'recall', 'fscore', 'support')
+        keys = ('precision', 'recall', 'f1', 'support')
         scores = precision_recall_fscore_support(y_test, y_pred, labels=model.classes_)
-        scores = map(lambda s: map(float, s), scores)
         scores = map(lambda s: dict(zip(model.classes_, s)), scores)
         scores = dict(zip(keys, scores))
 
@@ -360,7 +363,6 @@ def build_inner_fold(loader, classifier, fold=None):
 
 if __name__ == '__main__':
     import os
-    import json
     import pickle
 
     from corpus.reader import TranscriptCorpusReader
@@ -377,5 +379,7 @@ def build_inner_fold(loader, classifier, fold=None):
     with open(saveto, 'wb') as f:
         pickle.dump(model, f)
 
-    print(json.dumps(scores, indent=2))
-    print(total_time)
+    with open('scores.pickle', 'wb') as f:
+        pickle.dump(scores, f)
+
+    print("Finished build process in {}".format(total_time))
diff --git a/partisan/utils.py b/partisan/utils.py
@@ -18,14 +18,13 @@
 ##########################################################################
 
 import re
-import time
 import base64
 import bleach
 import hashlib
 
 from functools import wraps
 from markdown import markdown
-
+from datetime import datetime
 
 ##########################################################################
 ## Utilities
@@ -107,13 +106,14 @@ def fget_memoized(self):
 
 def timeit(func):
     """
-    Simple wall clock timer for a function that runs in seconds.
+    Simple wall clock timer for a function. Returns the function's result and
+    the elapsed datetime.timedelta, suitable for a models.DurationField.
     """
 
     @wraps(func)
     def func_timed(*args, **kwargs):
-        start  = time.time()
+        start  = datetime.now()
         result = func(*args, **kwargs)
-        return result, time.time() - start
+        return result, datetime.now() - start
 
     return func_timed