Skip to content

Commit

Permalink
mms database layer and management command to train
Browse files Browse the repository at this point in the history
  • Loading branch information
bbengfort committed Aug 2, 2016
1 parent cd70726 commit feb4803
Show file tree
Hide file tree
Showing 7 changed files with 236 additions and 14 deletions.
18 changes: 18 additions & 0 deletions arbiter/management/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# arbiter.management
# A module that specifies Django management commands for the arbiter app.
#
# Author: Benjamin Bengfort <[email protected]>
# Created: Tue Aug 02 10:36:54 2016 -0400
#
# Copyright (C) 2016 District Data Labs
# For license information, see LICENSE.txt
#
# ID: __init__.py [] [email protected] $

"""
A module that specifies Django management commands for the arbiter app.
"""

##########################################################################
## Imports
##########################################################################
18 changes: 18 additions & 0 deletions arbiter/management/commands/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# arbiter.management.commands
# Module that contains each individual management command for Django.
#
# Author: Benjamin Bengfort <[email protected]>
# Created: Tue Aug 02 10:37:24 2016 -0400
#
# Copyright (C) 2016 District Data Labs
# For license information, see LICENSE.txt
#
# ID: __init__.py [] [email protected] $

"""
Module that contains each individual management command for Django.
"""

##########################################################################
## Imports
##########################################################################
151 changes: 151 additions & 0 deletions arbiter/management/commands/train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
# arbiter.management.commands.train
# Command to train red/blue classifiers from the command line.
#
# Author: Benjamin Bengfort <[email protected]>
# Created: Tue Aug 02 10:38:54 2016 -0400
#
# Copyright (C) 2016 District Data Labs
# For license information, see LICENSE.txt
#
# ID: train.py [] [email protected] $

"""
Command to train red/blue classifiers from the command line.
"""

##########################################################################
## Imports
##########################################################################

import numpy as np

from arbiter.models import Estimator, Score
from django.contrib.auth.models import User
from corpus.reader import TranscriptCorpusReader
from corpus.learn import CorpusLoader, build_model
from django.core.management.base import BaseCommand, CommandError

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression


##########################################################################
## Training Command
##########################################################################

class Command(BaseCommand):
    """
    Management command that fits a red/blue classifier on a transcript
    corpus and stores the fitted estimator and its scores in the database.
    """

    help = "Trains red/blue classifiers and stores them in the database."

    # Maps each --model choice to (estimator class, constructor kwargs).
    estimators = {
        'maxent': (LogisticRegression, {}),
        'svm': (SGDClassifier, {'loss':'hinge', 'penalty':'l2', 'alpha':1e-3}),
        'nbayes': (MultinomialNB, {}),
    }

    def add_arguments(self, parser):
        """
        Add command line argparse arguments.
        """
        # Model selection argument
        parser.add_argument(
            '-m', '--model', choices=self.estimators, default='maxent',
            help='specify the model form to fit on the given corpus',
        )

        # Number of folds for cross-validation
        parser.add_argument(
            '-f', '--folds', type=int, default=12,
            help='number of folds to use in cross-validation',
        )

        # Optional ownership argument
        parser.add_argument(
            '-u', '--username', default=None,
            help='specify the username to associate with the model',
        )

        # TODO: Change this to allow for a query or a path on disk
        parser.add_argument('corpus', nargs=1, help='path to the corpus on disk')

    def handle(self, *args, **options):
        """
        Handles the model training process: builds the selected model on the
        corpus, then saves the estimator and its scores to the database.

        Raises a CommandError (via get_user) if a username was given but no
        matching user exists.
        """
        # Imported locally so this method does not depend on a change to the
        # module level import block.
        from django.db import transaction

        # Get the details from the command line arguments
        model, kwargs = self.estimators[options['model']]
        owner = self.get_user(options['username'])

        # The corpus positional uses nargs=1, so argparse delivers a list.
        path = options['corpus'][0]

        # Construct the corpus and loader in preparation for training.
        # TODO: Make the corpus loader construction a method to handle querysets
        corpus = TranscriptCorpusReader(path)
        loader = CorpusLoader(corpus, options['folds'])

        # Inform the user that the training process is beginning
        # NOTE(review): n_folds + 1 presumably counts one model per fold plus
        # the final model -- confirm against build_model.
        self.stdout.write((
            "Starting training of {} {} models on the corpus at {}\n"
            "This may take quite a bit of time, please be patient!\n"
        ).format(
            loader.n_folds + 1, model.__name__, path
        ))

        # GO! Build the model forever! Whooo!!!
        (clf, scores), total_time = build_model(loader, model, **kwargs)

        # Save the estimator and all of its scores in a single transaction so
        # that a failure while writing a score does not leave a partially
        # evaluated estimator behind.
        with transaction.atomic():
            estimator = Estimator.objects.create(
                model_type = Estimator.TYPES.classifier,
                model_class = model.__name__,
                model_form = repr(clf),
                estimator = clf,
                build_time = total_time,
                owner = owner,
            )
            self._save_scores(estimator, scores)

        # Report model construction complete
        self.stdout.write(
            "Training complete in {}! Estimator saved to the database\n".format(total_time)
        )

    def _save_scores(self, estimator, scores):
        """
        Creates one Score row per (metric, label) pair found in the scores
        mapping, plus a single row for the 'times' entry.
        """
        for metric, values in scores.items():

            # Handle the time key in particular: its 'final' and 'folds'
            # entries are stored as seconds.
            if metric == 'times':
                Score.objects.create(
                    metric = Score.METRICS.time,
                    score = values['final'].total_seconds(),
                    folds = [td.total_seconds() for td in values['folds']],
                    estimator = estimator,
                )
                continue

            # Handle generic scores for the model
            for label, folds in values.items():
                if metric == 'support' and label == 'average':
                    # This will be an array of None values, so skip.
                    continue

                Score.objects.create(
                    metric = metric,
                    score = np.asarray(folds).mean(),
                    label = label,
                    folds = folds,
                    estimator = estimator,
                )

    def get_user(self, username):
        """
        Returns a user or None, raising a command error if no user with the
        specified username is found in the database.
        """
        if username is None:
            return None

        try:
            return User.objects.get(username=username)
        except User.DoesNotExist:
            raise CommandError(
                "No user with username '{}' in the database".format(username)
            )
10 changes: 6 additions & 4 deletions arbiter/migrations/0001_initial.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.9.7 on 2016-08-02 14:31
# Generated by Django 1.9.7 on 2016-08-02 17:06
from __future__ import unicode_literals

from django.conf import settings
Expand Down Expand Up @@ -34,7 +34,8 @@ class Migration(migrations.Migration):
('owner', models.ForeignKey(blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
],
options={
'abstract': False,
'get_latest_by': 'created',
'db_table': 'estimators',
},
),
migrations.CreateModel(
Expand All @@ -43,14 +44,15 @@ class Migration(migrations.Migration):
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('created', model_utils.fields.AutoCreatedField(default=django.utils.timezone.now, editable=False, verbose_name='created')),
('modified', model_utils.fields.AutoLastModifiedField(default=django.utils.timezone.now, editable=False, verbose_name='modified')),
('metric', models.CharField(choices=[('accuracy', 'accuracy'), ('auc', 'auc'), ('brier', 'brier'), ('f1', 'f1'), ('fbeta', 'fbeta'), ('hamming', 'hamming'), ('hinge', 'hinge'), ('jaccard', 'jaccard'), ('logloss', 'logloss'), ('mcc', 'mcc'), ('precision', 'precision'), ('recall', 'recall'), ('roc', 'roc'), ('support', 'support'), ('mae', 'mae'), ('mse', 'mse'), ('mdae', 'mdae'), ('r2', 'r2'), ('rand', 'rand'), ('completeness', 'completeness'), ('homogeneity', 'homogeneity'), ('mutual', 'mutual'), ('silhouette', 'silhouette'), ('v', 'v')], max_length=32)),
('metric', models.CharField(choices=[('accuracy', 'accuracy'), ('auc', 'auc'), ('brier', 'brier'), ('f1', 'f1'), ('fbeta', 'fbeta'), ('hamming', 'hamming'), ('hinge', 'hinge'), ('jaccard', 'jaccard'), ('logloss', 'logloss'), ('mcc', 'mcc'), ('precision', 'precision'), ('recall', 'recall'), ('roc', 'roc'), ('support', 'support'), ('mae', 'mae'), ('mse', 'mse'), ('mdae', 'mdae'), ('r2', 'r2'), ('rand', 'rand'), ('completeness', 'completeness'), ('homogeneity', 'homogeneity'), ('mutual', 'mutual'), ('silhouette', 'silhouette'), ('v', 'v'), ('time', 'time')], max_length=32)),
('score', models.FloatField(blank=True, default=None, null=True)),
('label', models.CharField(blank=True, default=None, max_length=32, null=True)),
('folds', django.contrib.postgres.fields.ArrayField(base_field=models.FloatField(), blank=True, default=None, null=True, size=None)),
('estimator', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='scores', to='arbiter.Estimator')),
],
options={
'abstract': False,
'get_latest_by': 'created',
'db_table': 'evaluations',
},
),
]
29 changes: 29 additions & 0 deletions arbiter/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,20 @@ class Estimator(TimeStampedModel):
build_time = models.DurationField(**nullable) # The amount of time it took to build
owner = models.ForeignKey('auth.User', **nullable) # The owner, if any, of the model

class Meta:
    # Field consulted by latest()/earliest() when no field is given
    get_latest_by = "created"

    # Explicit name of the database table backing this model
    db_table = "estimators"

def __str__(self):
    """
    Describes the estimator by its model class, title-cased model type and
    creation date, followed by the owner when one is set.
    """
    description = "{} {} ({})".format(
        self.model_class,
        self.model_type.title(),
        self.created.strftime('%Y-%m-%d'),
    )

    # Without an owner the base description is the whole string.
    if not self.owner:
        return description

    return "{} for {}".format(description, self.owner)


class Score(TimeStampedModel):
"""
Expand All @@ -57,10 +71,25 @@ class Score(TimeStampedModel):
'jaccard', 'logloss', 'mcc', 'precision', 'recall', 'roc', 'support',
'mae', 'mse', 'mdae', 'r2',
'rand', 'completeness', 'homogeneity', 'mutual', 'silhouette', 'v',
'time',
)

metric = models.CharField(choices=METRICS, max_length=32) # The type of the score
score = models.FloatField(**nullable) # The actual value of the score
label = models.CharField(max_length=32, **nullable) # The label, if any, of the score
folds = ArrayField(models.FloatField(), **nullable) # Cross-validation scores
estimator = models.ForeignKey(Estimator, related_name='scores') # The estimator being evaluated

class Meta:
    # Field consulted by latest()/earliest() when no field is given
    get_latest_by = "created"

    # Explicit name of the database table backing this model
    db_table = "evaluations"

def __str__(self):
    """
    Describes the score as "<Metric> score for <estimator> = <value>",
    prefixed with the title-cased label when one is set.

    The score field is nullable, so a missing value is rendered as "n/a"
    instead of being passed to a float format spec (which would raise a
    TypeError for None).
    """
    if self.score is None:
        value = "n/a"
    else:
        value = "{:0.3f}".format(self.score)

    s = "{} score for {} = {}".format(
        self.metric.title(), self.estimator, value
    )

    if self.label:
        s = "{} ".format(self.label.title()) + s

    return s
14 changes: 9 additions & 5 deletions corpus/learn.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,10 @@ def build_model(loader, model, **kwargs):
pipeline object along with scores and timing information.
"""

# TODO: Add multiprocessing to parallelize build_inner_fold
# TODO: Add verbosity to inform user on command line what is happening
# TODO: Equip this method to be used by Celery workers

@timeit
def build_inner_fold(loader, classifier, fold=None):
"""
Expand All @@ -319,9 +323,8 @@ def build_inner_fold(loader, classifier, fold=None):
y_pred = model.predict(X_test)

# Get the per-class scores as a well-structured object
keys = ('precision', 'recall', 'fscore', 'support')
keys = ('precision', 'recall', 'f1', 'support')
scores = precision_recall_fscore_support(y_test, y_pred, labels=model.classes_)
scores = map(lambda s: map(float, s), scores)
scores = map(lambda s: dict(zip(model.classes_, s)), scores)
scores = dict(zip(keys, scores))

Expand Down Expand Up @@ -360,7 +363,6 @@ def build_inner_fold(loader, classifier, fold=None):

if __name__ == '__main__':
import os
import json
import pickle

from corpus.reader import TranscriptCorpusReader
Expand All @@ -377,5 +379,7 @@ def build_inner_fold(loader, classifier, fold=None):
with open(saveto, 'wb') as f:
pickle.dump(model, f)

print(json.dumps(scores, indent=2))
print(total_time)
with open('scores.pickle', 'wb') as f:
pickle.dump(scores, f)

print("Finished build process in {}".format(total_time))
10 changes: 5 additions & 5 deletions partisan/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,13 @@
##########################################################################

import re
import time
import base64
import bleach
import hashlib

from functools import wraps
from markdown import markdown

from datetime import datetime

##########################################################################
## Utilities
Expand Down Expand Up @@ -107,13 +106,14 @@ def fget_memoized(self):

def timeit(func):
    """
    Simple wall clock timer for a function. The decorated function returns
    a (result, elapsed) tuple where elapsed is a datetime.timedelta object
    for use in a models.DurationField.
    """

    @wraps(func)
    def func_timed(*args, **kwargs):
        # Both endpoints must be datetimes so the difference is a timedelta.
        start = datetime.now()
        result = func(*args, **kwargs)
        return result, datetime.now() - start

    return func_timed

0 comments on commit feb4803

Please sign in to comment.