From feb480363ade8035fff9953d7a088a24bf6ff7a9 Mon Sep 17 00:00:00 2001 From: Benjamin Bengfort Date: Tue, 2 Aug 2016 13:47:53 -0400 Subject: [PATCH] mms database layer and management command to train --- arbiter/management/__init__.py | 18 +++ arbiter/management/commands/__init__.py | 18 +++ arbiter/management/commands/train.py | 151 ++++++++++++++++++++++++ arbiter/migrations/0001_initial.py | 10 +- arbiter/models.py | 29 +++++ corpus/learn.py | 14 ++- partisan/utils.py | 10 +- 7 files changed, 236 insertions(+), 14 deletions(-) create mode 100644 arbiter/management/__init__.py create mode 100644 arbiter/management/commands/__init__.py create mode 100644 arbiter/management/commands/train.py diff --git a/arbiter/management/__init__.py b/arbiter/management/__init__.py new file mode 100644 index 0000000..a4c6a78 --- /dev/null +++ b/arbiter/management/__init__.py @@ -0,0 +1,18 @@ +# arbiter.management +# A module that specifies Django management commands for the arbiter app. +# +# Author: Benjamin Bengfort +# Created: Tue Aug 02 10:36:54 2016 -0400 +# +# Copyright (C) 2016 District Data Labs +# For license information, see LICENSE.txt +# +# ID: __init__.py [] benjamin@bengfort.com $ + +""" +A module that specifies Django management commands for the arbiter app. +""" + +########################################################################## +## Imports +########################################################################## diff --git a/arbiter/management/commands/__init__.py b/arbiter/management/commands/__init__.py new file mode 100644 index 0000000..e25d4ce --- /dev/null +++ b/arbiter/management/commands/__init__.py @@ -0,0 +1,18 @@ +# arbiter.management.commands +# Module that contains each individual management command for Django. +# +# Author: Benjamin Bengfort +# Created: Tue Aug 02 10:37:24 2016 -0400 +# +# Copyright (C) 2016 District Data Labs +# For license information, see LICENSE.txt +# +# ID: __init__.py [] benjamin@bengfort.com $ + +""" +Module that contains each individual management command for Django. +""" + +########################################################################## +## Imports +########################################################################## diff --git a/arbiter/management/commands/train.py b/arbiter/management/commands/train.py new file mode 100644 index 0000000..06d4a3a --- /dev/null +++ b/arbiter/management/commands/train.py @@ -0,0 +1,151 @@ +# arbiter.management.commands.train +# Command to train red/blue classifiers from the command line. +# +# Author: Benjamin Bengfort +# Created: Tue Aug 02 10:38:54 2016 -0400 +# +# Copyright (C) 2016 District Data Labs +# For license information, see LICENSE.txt +# +# ID: train.py [] benjamin@bengfort.com $ + +""" +Command to train red/blue classifiers from the command line. +""" + +########################################################################## +## Imports +########################################################################## + +import numpy as np + +from arbiter.models import Estimator, Score +from django.contrib.auth.models import User +from corpus.reader import TranscriptCorpusReader +from corpus.learn import CorpusLoader, build_model +from django.core.management.base import BaseCommand, CommandError + +from sklearn.naive_bayes import MultinomialNB +from sklearn.linear_model import SGDClassifier +from sklearn.linear_model import LogisticRegression + + +########################################################################## +## Training Command +########################################################################## + +class Command(BaseCommand): + + help = "Trains red/blue classifiers and stores them in the database." + + estimators = { + 'maxent': (LogisticRegression, {}), + 'svm': (SGDClassifier, {'loss':'hinge', 'penalty':'l2', 'alpha':1e-3}), + 'nbayes': (MultinomialNB, {}), + } + + def add_arguments(self, parser): + """ + Add command line argparse arguments. + """ + # Model selection argument + parser.add_argument( + '-m', '--model', choices=self.estimators, default='maxent', + help='specify the model form to fit on the given corpus', + ) + + # Number of folds for cross-validation + parser.add_argument( + '-f', '--folds', type=int, default=12, + help='number of folds to use in cross-validation', + ) + + # Optional ownership argument + parser.add_argument( + '-u', '--username', default=None, + help='specify the username to associate with the model', + ) + + # TODO: Change this to allow for a query or a path on disk + parser.add_argument('corpus', nargs=1, help='path to the corpus on disk') + + def handle(self, *args, **options): + """ + Handles the model training process + """ + + # Get the details from the command line arguments + model, kwargs = self.estimators[options['model']] + owner = self.get_user(options['username']) + + # Construct the corpus and loader in preparation for training. + # TODO: Make the corpus loader construction a method to handle querysets + corpus = TranscriptCorpusReader(options['corpus'][0]) + loader = CorpusLoader(corpus, options['folds']) + + # Inform the user that the training process is beginning + self.stdout.write(( + "Starting training of {} {} models on the corpus at {}\n" + "This may take quite a bit of time, please be patient!\n" + ).format( + loader.n_folds + 1, model.__name__, options['corpus'][0] + )) + + # GO! Build the model forever! Whooo!!! + (clf, scores), total_time = build_model(loader, model, **kwargs) + + # Save the estimator model + estimator = Estimator.objects.create( + model_type = Estimator.TYPES.classifier, + model_class = model.__name__, + model_form = repr(clf), + estimator = clf, + build_time = total_time, + owner = owner, + ) + + # Save the scores objects. + for metric, values in scores.items(): + + # Handle the time key in particular. + if metric == 'times': + Score.objects.create( + metric = Score.METRICS.time, + score = values['final'].total_seconds(), + folds = [td.total_seconds() for td in values['folds']], + estimator = estimator, + ) + continue + + # Handle generic scores for the model + for label, folds in values.items(): + if metric == 'support' and label == 'average': + # This will be an array of None values, so skip. + continue + + Score.objects.create( + metric = metric, + score = np.asarray(folds).mean(), + label = label, + folds = folds, + estimator = estimator, + ) + + + # Report model construction complete + self.stdout.write( + "Training complete in {}! Estimator saved to the database\n".format(total_time) + ) + + def get_user(self, username): + """ + Returns a user or None, raising a command error if no user with the + specified username is found in the database. + """ + if username is None: return None + try: + return User.objects.get(username=username) + except User.DoesNotExist: + raise CommandError( + "No user with username '{}' in the database".format(username) + ) diff --git a/arbiter/migrations/0001_initial.py b/arbiter/migrations/0001_initial.py index ab6df83..09c5e4e 100644 --- a/arbiter/migrations/0001_initial.py +++ b/arbiter/migrations/0001_initial.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Generated by Django 1.9.7 on 2016-08-02 14:31 +# Generated by Django 1.9.7 on 2016-08-02 17:06 from __future__ import unicode_literals from django.conf import settings @@ -34,7 +34,8 @@ class Migration(migrations.Migration): ('owner', models.ForeignKey(blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)), ], options={ - 'abstract': False, + 'get_latest_by': 'created', + 'db_table': 'estimators', }, ), migrations.CreateModel( @@ -43,14 +44,15 @@ class Migration(migrations.Migration): ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), ('created', model_utils.fields.AutoCreatedField(default=django.utils.timezone.now, editable=False, verbose_name='created')), ('modified', model_utils.fields.AutoLastModifiedField(default=django.utils.timezone.now, editable=False, verbose_name='modified')), - ('metric', models.CharField(choices=[('accuracy', 'accuracy'), ('auc', 'auc'), ('brier', 'brier'), ('f1', 'f1'), ('fbeta', 'fbeta'), ('hamming', 'hamming'), ('hinge', 'hinge'), ('jaccard', 'jaccard'), ('logloss', 'logloss'), ('mcc', 'mcc'), ('precision', 'precision'), ('recall', 'recall'), ('roc', 'roc'), ('support', 'support'), ('mae', 'mae'), ('mse', 'mse'), ('mdae', 'mdae'), ('r2', 'r2'), ('rand', 'rand'), ('completeness', 'completeness'), ('homogeneity', 'homogeneity'), ('mutual', 'mutual'), ('silhouette', 'silhouette'), ('v', 'v')], max_length=32)), + ('metric', models.CharField(choices=[('accuracy', 'accuracy'), ('auc', 'auc'), ('brier', 'brier'), ('f1', 'f1'), ('fbeta', 'fbeta'), ('hamming', 'hamming'), ('hinge', 'hinge'), ('jaccard', 'jaccard'), ('logloss', 'logloss'), ('mcc', 'mcc'), ('precision', 'precision'), ('recall', 'recall'), ('roc', 'roc'), ('support', 'support'), ('mae', 'mae'), ('mse', 'mse'), ('mdae', 'mdae'), ('r2', 'r2'), ('rand', 'rand'), ('completeness', 'completeness'), ('homogeneity', 'homogeneity'), ('mutual', 'mutual'), ('silhouette', 'silhouette'), ('v', 'v'), ('time', 'time')], max_length=32)), ('score', models.FloatField(blank=True, default=None, null=True)), ('label', models.CharField(blank=True, default=None, max_length=32, null=True)), ('folds', django.contrib.postgres.fields.ArrayField(base_field=models.FloatField(), blank=True, default=None, null=True, size=None)), ('estimator', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='scores', to='arbiter.Estimator')), ], options={ - 'abstract': False, + 'get_latest_by': 'created', + 'db_table': 'evaluations', }, ), ] diff --git a/arbiter/models.py b/arbiter/models.py index 935bc16..8282d0d 100644 --- a/arbiter/models.py +++ b/arbiter/models.py @@ -45,6 +45,20 @@ class Estimator(TimeStampedModel): build_time = models.DurationField(**nullable) # The amount of time it took to buld owner = models.ForeignKey('auth.User', **nullable) # The owner, if any, of the model + class Meta: + db_table = "estimators" + get_latest_by = "created" + + def __str__(self): + s = "{} {} ({})".format( + self.model_class, self.model_type.title(), self.created.strftime('%Y-%m-%d') + ) + + if self.owner: + s += " for {}".format(self.owner) + + return s + class Score(TimeStampedModel): """ @@ -57,6 +71,7 @@ class Score(TimeStampedModel): 'jaccard', 'logloss', 'mcc', 'precision', 'recall', 'roc', 'support', 'mae', 'mse', 'mdae', 'r2', 'rand', 'completeness', 'homogeneity', 'mutual', 'silhouette', 'v', + 'time', ) metric = models.CharField(choices=METRICS, max_length=32) # The type of the score @@ -64,3 +79,17 @@ class Score(TimeStampedModel): label = models.CharField(max_length=32, **nullable) # The label, if any, of the score folds = ArrayField(models.FloatField(), **nullable) # Cross-validation scores estimator = models.ForeignKey(Estimator, related_name='scores') # The estimator being evaluated + + class Meta: + db_table = "evaluations" + get_latest_by = "created" + + def __str__(self): + s = "{} score for {} = {:0.3f}".format( + self.metric.title(), self.estimator, self.score + ) + + if self.label: + s = "{} ".format(self.label.title()) + s + + return s diff --git a/corpus/learn.py b/corpus/learn.py index 0bd86bc..e18a0ea 100644 --- a/corpus/learn.py +++ b/corpus/learn.py @@ -294,6 +294,10 @@ def build_model(loader, model, **kwargs): pipeline object along with scores and timing information. """ + # TODO: Add multiprocessing to parallelize build_inner_fold + # TODO: Add verbosity to inform user on command line what is happening + # TODO: Equip this method to be used by Celery workers + @timeit def build_inner_fold(loader, classifier, fold=None): """ @@ -319,9 +323,8 @@ def build_inner_fold(loader, classifier, fold=None): y_pred = model.predict(X_test) # Get the per-class scores as a well-structured object - keys = ('precision', 'recall', 'fscore', 'support') + keys = ('precision', 'recall', 'f1', 'support') scores = precision_recall_fscore_support(y_test, y_pred, labels=model.classes_) - scores = map(lambda s: map(float, s), scores) scores = map(lambda s: dict(zip(model.classes_, s)), scores) scores = dict(zip(keys, scores)) @@ -360,7 +363,6 @@ def build_inner_fold(loader, classifier, fold=None): if __name__ == '__main__': import os - import json import pickle from corpus.reader import TranscriptCorpusReader @@ -377,5 +379,7 @@ def build_inner_fold(loader, classifier, fold=None): with open(saveto, 'wb') as f: pickle.dump(model, f) - print(json.dumps(scores, indent=2)) - print(total_time) + with open('scores.pickle', 'wb') as f: + pickle.dump(scores, f) + + print("Finished build process in {}".format(total_time)) diff --git a/partisan/utils.py b/partisan/utils.py index 19fd247..4bd2686 100644 --- a/partisan/utils.py +++ b/partisan/utils.py @@ -18,14 +18,13 @@ ########################################################################## import re -import time import base64 import bleach import hashlib from functools import wraps from markdown import markdown - +from datetime import datetime ########################################################################## ## Utilities @@ -107,13 +106,14 @@ def fget_memoized(self): def timeit(func): """ - Simple wall clock timer for a function that runs in seconds. + Simple wall clock timer for a function that runs in seconds. Returns a + datetime.timedelta object for use in a models.DurationField. """ @wraps(func) def func_timed(*args, **kwargs): - start = time.time() + start = datetime.now() result = func(*args, **kwargs) - return result, time.time() - start + return result, datetime.now() - start return func_timed