
Commit

Updated documentation.
Jesse Myrberg committed Jul 27, 2017
1 parent 393358e commit 2b2170a
Showing 7 changed files with 153 additions and 35 deletions.
155 changes: 127 additions & 28 deletions README.md
@@ -1,36 +1,135 @@
# finnlem

**finnlem** is a [neural network](https://en.wikipedia.org/wiki/Artificial_neural_network) based [lemmatizer](https://en.wikipedia.org/wiki/Lemmatisation) for the [Finnish language](https://en.wikipedia.org/wiki/Finnish_language).

A trained network maps Finnish words to their base forms:
```
Original Base Form
'koira' --> 'koira'
'koiran' --> 'koira'
'koiraa' --> 'koira'
'koiraksi' --> 'koira'
'koirasta' --> 'koira'
```
The model is a [tensorflow](https://www.tensorflow.org) implementation of a [sequence-to-sequence](https://arxiv.org/abs/1406.1078) recurrent neural network.
This repository contains the code and data needed for training and making predictions with the model. The [datasets](src/data/datasets) contain over 2M samples in total.
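For orientation, the sketch below shows the encoder half of such a character-level sequence-to-sequence model in TF 1.x style, contemporary with this repository. It is illustrative only: the variable names and sizes are assumptions, not the repository's actual code.
```python
import tensorflow as tf  # TF 1.x style, matching the 2017-era codebase

# Hypothetical sizes; the repository's actual hyperparameters may differ.
vocab_size, emb_dim, hidden_units = 50, 64, 128

source_ids = tf.placeholder(tf.int32, [None, None])  # batch of character id sequences
source_len = tf.placeholder(tf.int32, [None])        # true length of each sequence

# Embed characters, then run an LSTM encoder over each sequence.
char_emb = tf.get_variable('char_emb', [vocab_size, emb_dim])
inputs = tf.nn.embedding_lookup(char_emb, source_ids)
cell = tf.nn.rnn_cell.LSTMCell(hidden_units)
enc_outputs, enc_state = tf.nn.dynamic_rnn(cell, inputs,
                                           sequence_length=source_len,
                                           dtype=tf.float32)
# enc_state initializes the decoder, which emits the lemma character by character.
```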

## Features
![Tensorboard](doc/tensorboard.JPG)
![Sequence-to-sequence graph](doc/tensorboard-graph.JPG)
* Easy-to-use Python wrappers for sequence-to-sequence modeling
* Automatic session handling, model checkpointing and logging
* Support for TensorBoard
* Sequence-to-sequence model features: [Bahdanau](https://arxiv.org/abs/1409.0473) and [Luong](https://arxiv.org/abs/1508.04025) attention, residual connections, dropout, beam search decoding, ...

## Installation
You should have recent versions of the following installed (as of 7/2017); one possible pip-based setup is sketched after the list:
* keras
* nltk
* numpy
* pandas
* tensorflow (1.3.0 or greater, with CUDA 8.0 and cuDNN 6.0 or greater)
* unidecode
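
A minimal setup sketch, assuming pip is available (package names as published on PyPI; the repository does not pin exact versions):
```
pip install keras nltk numpy pandas unidecode
pip install tensorflow-gpu   # GPU build, per the CUDA/cuDNN note above; use "tensorflow" for CPU-only
```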

After this, clone this repository to your local machine.

## Example usage

### Python

The following is a simple example of using some of the features in the Python API.
See the source code documentation for more detailed descriptions of the available functions and parameters.

#### 1. Fit a dictionary with default parameters
```python
from dictionary import Dictionary

# Documents to fit in dictionary
docs = ['abcdefghijklmnopqrstuvwxyz','åäö','DNP','#-']

# Create a new Dictionary object
d = Dictionary()

# Fit characters of each document
d.fit(docs)

# Save for later usage
d.save('./data/dictionaries/lemmatizer.dict')
```

#### 2. Create and train a Seq2Seq model with default parameters
```python
from model_wrappers import Seq2Seq

# Create a new model
model = Seq2Seq(model_dir='./data/models/lemmatizer',
                dict_path='./data/dictionaries/lemmatizer.dict')

# Create some documents to train on
source_docs = ['koira','koiran','koiraa','koirana','koiraksi','koirassa']*128
target_docs = ['koira','koira','koira','koira','koira','koira']*128

# Train 100 batches, save checkpoint every 25th batch
for i in range(100):
    loss, global_step = model.train(source_docs, target_docs, save_every_n_batch=25)
    print('Global step %d loss: %f' % (global_step, loss))
```
#### 3. Make predictions on test set
```python
test_docs = ['koiraa','koirana','koiraksi']
pred_docs = model.decode(test_docs)
print(pred_docs) # --> [['koira'],['koira'],['koira']]
```


### Command line

The following is a bit more complicated example of using the command line to train and predict from files.

#### 1. Fit a dictionary with default parameters
```
python -m dict_train ^
    --dict-save-path ./data/dictionaries/lemmatizer.dict ^
    --dict-train-path ./data/dictionaries/lemmatizer.vocab
```
The dictionary train path file(s) should contain one document per line ([example](src/data/dictionaries/lemmatizer.vocab)).
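For illustration, a minimal dictionary training file could look like this (hypothetical contents, one document per line):
```
koira juoksee pihalla
kissa istuu puussa
talo on suuri
```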

#### 2. Create and train a Seq2Seq model with default parameters
```
python -m model_train ^
    --model-dir ./data/models/lemmatizer ^
    --dict-path ./data/dictionaries/lemmatizer.dict ^
    --train-data-path ./data/datasets/lemmatizer_train.csv ^
    --optimizer 'adam' ^
    --learning-rate 0.0001 ^
    --dropout-rate 0.2 ^
    --batch-size 128 ^
    --file-batch-size 8192 ^
    --max-file-pool-size 50 ^
    --shuffle-files True ^
    --shuffle-file-batches True ^
    --save-every-n-batch 500 ^
    --validate-every-n-batch 100 ^
    --validation-data-path ./data/datasets/lemmatizer_validation.csv ^
    --validate-n-rows 5000
```
The model train and validation data path file(s) should contain one source and target document per line,
separated by a comma ([example](src/data/datasets/lemmatizer_validation.csv)).
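For illustration, a few rows in this format might look as follows (hypothetical contents, source followed by target):
```
koiran,koira
koiraa,koira
koirassa,koira
```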

#### 3. Make predictions on test set
```
python -m model_decode ^
    --model-dir ./data/models/lemmatizer ^
    --test-data-path ./data/datasets/lemmatizer_test.csv ^
    --decoded-data-path ./data/decoded/lemmatizer_decoded.csv
```
The source data file(s) given to `model_decode` should contain either (examples below):
* one source document per line, or
* one source and target document per line, separated by a comma ([example](src/data/datasets/lemmatizer_test.csv))
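
For illustration, either of the following layouts would be accepted (hypothetical contents):
```
koiraksi
koirasta
```
or, with targets included:
```
koiraksi,koira
koirasta,koira
```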


## Acknowledgements and references
* [JayParks/tf-seq2seq](https://github.com/JayParks/tf-seq2seq): Example sequence-to-sequence implementation in tensorflow
* [Omorfi](https://github.com/flammie/omorfi): Finnish open source morphology tool
* [FinnTreeBank](http://www.ling.helsinki.fi/kieliteknologia/tutkimus/treebank/): Source for datasets
* [Finnish Dependency Parser](http://bionlp.utu.fi/finnish-parser.html): Source for datasets


7 changes: 7 additions & 0 deletions doc/cmd_params.md
@@ -0,0 +1,7 @@
# List of available command line parameters

## dict_train

## model_train

## model_decode
5 changes: 5 additions & 0 deletions doc/python_api_params.md
@@ -0,0 +1,5 @@
# List of relevant Python API objects, methods and parameters

## Dictionary

## Seq2Seq model
Binary file added doc/tensorboard-graph.JPG
Binary file added doc/tensorboard.JPG
19 changes: 13 additions & 6 deletions src/data_utils.py
@@ -1,20 +1,22 @@
# -*- coding: utf8 -*-
"""Utilities for data processing and batching."""

from collections import deque
import itertools
import math
import time

from multiprocessing import cpu_count
from multiprocessing.pool import Pool

import numpy as np
import pandas as pd


SEED = 2018
"""SEED: Seed value for random generators"""


def batchify(it, batch_size=32, shuffle=False, max_batches=None):
    """Return iterable in batches."""
@@ -34,6 +36,7 @@ def batchify(it, batch_size=32, shuffle=False, max_batches=None):
    if len(batch) > 0:
        yield batch


def rebatch(batches,
            in_batch_size_limit=8192,
            out_batch_size=32,
@@ -73,11 +76,13 @@ def rebatch(batches,
        np.random.shuffle(in_batches)
    yield out_batch


def read_file(filename, nrows=None):
    """Read one file entirely."""
    ar = pd.read_csv(filename, nrows=nrows).values
    return ar


def read_file_batched(filename,
                      file_batch_size=8192,
                      file_batch_shuffle=False,
@@ -106,6 +111,7 @@ def read_file_batched(filename,
    elif return_mode == 'dict_list':
        yield batch_df.to_dict('list')


def read_files_batched(filenames,
                       file_batch_size=8192,
                       file_batch_shuffle=False,
@@ -176,6 +182,7 @@ def callback(batch):
pd_kwargs=pd_kwargs):
yield batch


def read_files_cycled(filenames,
                      max_file_pool_size=8,
                      file_batch_size=8192,
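
As a usage sketch of the `batchify` helper above (assuming the signature shown in the diff; the exact output format is illustrative):
```python
from data_utils import batchify

# Split any iterable into fixed-size batches; the final batch may be smaller.
for batch in batchify(range(10), batch_size=4):
    print(batch)  # -> [0, 1, 2, 3], then [4, 5, 6, 7], then [8, 9]
```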
2 changes: 1 addition & 1 deletion src/model_decode.py
@@ -18,7 +18,7 @@
parser.add_argument("--model-dir", required=True,
type=str, action='store',
help='Model checkpoint and log save path')
parser.add_argument("--source-data-path", required=True,
parser.add_argument("--test-data-path", required=True,
type=str, action='store',
help='Path to source data to decode')
parser.add_argument("--decoded-data-path", required=True,
Expand Down
