#!/usr/bin/env python
from __future__ import division
__author__ = 'Maximilian Bisani'
__version__ = '$LastChangedRevision: 1668 $'
__date__ = '$LastChangedDate: 2007-06-02 18:14:47 +0200 (Sat, 02 Jun 2007) $'
__copyright__ = 'Copyright (c) 2004-2005 RWTH Aachen University'
__license__ = """
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License Version 2 (June
1991) as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, you will find it at
http://www.gnu.org/licenses/gpl.html, or write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110,
USA.
Should a provision of no. 9 and 10 of the GNU General Public License
be invalid or become invalid, a valid provision is deemed to have been
agreed upon which comes closest to what the parties intended
commercially. In any case guarantee/warranty shall be limited to gross
negligent actions or intended actions or fraudulent concealment.
"""
notice = """
generated by %(__name__)s %(__version__)s %(__date__)s
copyright: %(__copyright__)s
author: %(__author__)s
""" % globals()
import mGramCounts, itertools, operator
from pprint import pprint
from misc import restartable, once, reversed, sorted, set
# ===========================================================================
class Discount(object):
    def __init__(self, countsOfCounts=None):
        if countsOfCounts:
            self.estimateParameters(countsOfCounts)

    def report(self, f):
        import pprint
        pprint.pprint(self.__dict__, f)

    def __call__(self, r):
        """
        requirements:
        result <= r
        """
        raise NotImplementedError

class AbsoluteDiscounting(Discount):
    discount = 0.2

    def estimateParameters(self, countsOfCounts, floor=None):
        n = dict(countsOfCounts)
        try:
            self.discount = n[1] / (n[1] + 2 * n[2])
        except KeyError:
            pass
        if floor:
            epsilon = 0.01
            self.discount = max(self.discount, floor.discount + epsilon)

    def __call__(self, value):
        return max(value - self.discount, 0.0)

    def report(self, f):
        print >> f, 'D =', self.discount

class TripleAbsoluteDiscounting(Discount):
    """
    Chen and Goodman's modified Kneser-Ney smoothing with three
    discounting parameters for n=1, n=2 and n>=3.
    """
    discount1 = 0.1
    discount2 = 0.2
    discount3plus = 0.3

    def estimateParameters(self, countsOfCounts, floor=None):
        n = dict(countsOfCounts)
        Y = n[1] / (n[1] + 2 * n[2])
        self.discount1 = 1 - 2*Y * n[2] / n[1]
        self.discount2 = 2 - 3*Y * n[3] / n[2]
        self.discount3plus = 3 - 4*Y * n[4] / n[3]
        if floor:
            epsilon = 0.01
            self.discount1 = max(self.discount1, floor.discount1 + epsilon)
            self.discount2 = max(self.discount2, floor.discount2 + epsilon)
            self.discount3plus = max(self.discount3plus, floor.discount3plus + epsilon)

    def __call__(self, value):
        if value >= 3:
            return value - self.discount3plus
        elif value >= 2:
            return value - self.discount2
        elif value >= self.discount1:
            return value - self.discount1
        return 0.0

    def report(self, f):
        print >> f, 'D1 =', self.discount1
        print >> f, 'D2 =', self.discount2
        print >> f, 'D3+ =', self.discount3plus
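
# A minimal sketch (not part of the original module) of how the three
# Kneser-Ney discounts are obtained from counts-of-counts; the numbers are
# made up. Each pair is (count value r, number of distinct m-grams that were
# seen exactly r times).
def _exampleTripleAbsoluteDiscounting():
    import sys
    countsOfCounts = [(1, 1000), (2, 400), (3, 200), (4, 120)]
    d = TripleAbsoluteDiscounting()
    d.estimateParameters(countsOfCounts)
    # Y = n1/(n1 + 2*n2), D1 = 1 - 2*Y*n2/n1, D2 = 2 - 3*Y*n3/n2, ...
    d.report(sys.stdout)
    return d(1), d(2), d(5)   # discounted counts for r = 1, 2 and r >= 3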

class ZipfGoodTuringDiscounting(Discount):
    """
    Use the "Simple Good-Turing" technique.
    """
    cacheSize = 250
    kDefault = 2

    def estimateParameters(self, countsOfCounts, floor=None):
        import SimpleGoodTuring
        self.n = dict(countsOfCounts)
        self.k = 0
        while (self.k+2 in self.n) and (self.k < self.kDefault):
            self.k += 1
        nSmoothed = SimpleGoodTuring.zipfFit(countsOfCounts)
        self.alpha = nSmoothed.alpha
        if self.alpha > -1:
            self.alpha = -1.0
        if self.cacheSize < self.k: self.cacheSize = self.k
        self.store = [self.rStar(r) for r in range(self.cacheSize)]
        if floor:
            self.alpha = min(self.alpha, floor.alpha)
            self.store = map(min, self.store, floor.store)

    def rStar(self, r):
        """
        rStar = (r + 1) * nSmoothed(r + 1) / nSmoothed(r)
        with nSmoothed = exp(alpha * ln(r) + beta)
        """
        if r == 0:
            return None
        elif r <= self.k:
            return (r + 1) * self.n[r + 1] / self.n[r]
        else:
            return (r+1) * ((r+1) / r)**self.alpha

    def __call__(self, value):
        value = int(value)
        try:
            return self.store[value]
        except IndexError:
            return self.rStar(value)

    def report(self, f):
        print >> f, 'k =', self.k
        print >> f, 'alpha =', self.alpha
        print >> f, 'r* =', self.store[1:6]
        print >> f, 'r - r* =', [ r - self(r) for r in range(1, 6) ]
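
# Illustration only (not part of the original module): above the highest
# reliably observed count k, rStar follows the Zipf fit
# r* = (r+1) * ((r+1)/r)**alpha, so the implied discount r - r* changes only
# slowly with r. The alpha used here is made up; in the class above it is
# estimated by SimpleGoodTuring.zipfFit.
def _exampleZipfRStar(alpha=-1.2):
    return [(r, (r + 1) * ((r + 1) / r)**alpha) for r in range(1, 6)]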

# ===========================================================================
if True:
    from SparseVector import leftJoinInterpolateAndAddOneSparse
else:
    def leftJoinInterpolateAndAddOneSparse(left, scale, right, extraKey, extraValue):
        result = [(extraKey, extraValue)]
        for k, v in left:
            result.append((k, v + scale * right[k]))
        return Counts(result)
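
# What leftJoinInterpolateAndAddOneSparse computes (comment added for
# clarity): each key k of `left` is mapped to left[k] + scale * right[k] and
# one extra pair (extraKey, extraValue) is prepended; the back-off
# distribution `right` is only consulted at keys present in `left`.
# Hypothetical numbers: with left = [(2, 3.0), (5, 1.0)], scale = 0.5,
# right[2] = 0.2, right[5] = 0.4, extraKey = 0 and extraValue = 0.5 the
# result pairs are [(0, 0.5), (2, 3.1), (5, 1.2)].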
# ===========================================================================
from groupedCounts import Counts, contract, store, sumCounts, sumLotsOfCounts, CountsAccumulator
from IterMap import assertIsSorted, aggregate, leftJoin
import marshal, os, tempfile, SparseVector
MGram = tuple

class LanguageModelBuilder(object):
    minCounts = [1, 1, 2, 3]
    discountType = [ ZipfGoodTuringDiscounting ]
    vocabulary = None
    highestOrder = None
    discounts = None

    def setVocabulary(self, vocabulary):
        self.vocabulary = vocabulary
        self.sentenceStart = vocabulary.index('<s>')
        predictedWords = set(self.vocabulary.indices())
        predictedWords.remove(self.sentenceStart)
        predictedWords.remove(self.vocabulary.noneIndex)
        self.predictedWords = list(predictedWords)
        self.predictedWords.sort()

    def setHighestOrder(self, highestOrder):
        self.highestOrder = highestOrder

    def setDiscountTypes(self, types):
        self.discountType = types

    def estimateDiscounts(self, countsOfCounts):
        self.discounts = []
        lowerOrderDiscount = None
        for order in range(self.highestOrder + 1):
            discount = self.discountType[min(order, len(self.discountType)-1)]()
            discount.estimateParameters(countsOfCounts[order], floor=lowerOrderDiscount)
            self.discounts.append(discount)
            lowerOrderDiscount = discount

    def setCountCutoffs(self, cutoffs):
        self.minCounts = cutoffs

    def countCutoff(self, order):
        return self.minCounts[min(order, len(self.minCounts)-1)]

    def discount(self, order):
        return self.discounts[order]

    def rawCountsForOrder(self, allCounts, order):
        for history, values in assertIsSorted(allCounts):
            if len(history) < order: continue
            history, oldest = history[:order-1], history[order-1]
            yield history, oldest, values

    def groupedCounts(self, allCounts, order):
        it = self.rawCountsForOrder(allCounts, order)
        history, oldest, values = it.next()
        group = []
        accu = CountsAccumulator()
        accu.set(values)
        for h, o, v in it:
            if h == history:
                if o == oldest:
                    accu += v
                else:
                    group.append((oldest, accu.sum()))
                    oldest = o
                    accu.set(v)
            else:
                group.append((oldest, accu.sum()))
                yield history, group
                history = h
                oldest = o
                accu.set(v)
                group = []
        group.append((oldest, accu.sum()))
        yield history, group
    groupedCounts = restartable(groupedCounts)

    def effectiveCounts(self, counts, minCount, discount):
        total = counts.sum()
        effectiveCounts = Counts([
            (predicted, discount(value))
            for predicted, value in counts.threshold(minCount)])
        return effectiveCounts, total

    def parametrizeOrder(self, order):
        self.log('\nbuilding order', order)
        minCount = self.countCutoff(order)
        self.log('count cutoff: ignoring counts < %d' % minCount)
        discount = self.discount(order)
        self.log('discounting:')
        if self.logFile: discount.report(self.logFile)
        return minCount, discount

    def makeZeroOrder(self, allCounts):
        minCount, discount = self.parametrizeOrder(0)
        counts = sumLotsOfCounts(itertools.imap(lambda item: item[1], allCounts))
        effectiveCounts, total = self.effectiveCounts(counts, minCount, discount)
        effectiveTotal = effectiveCounts.sum()
        seenWords = set([w for w, n in effectiveCounts])
        assert self.sentenceStart not in seenWords
        unseenWords = set(self.predictedWords) - seenWords
        assert self.sentenceStart not in unseenWords
        self.log('number of unseen words', len(unseenWords))
        pZero = 1 / len(self.predictedWords)
        backOffMass = total - effectiveTotal
        nZero = backOffMass * pZero
        interpolatedCounts = []
        for predicted, effectiveCount in effectiveCounts:
            interpolatedCounts.append((predicted, effectiveCount + nZero))
        for predicted in unseenWords:
            interpolatedCounts.append((predicted, nZero))
        interpolatedCounts = Counts(interpolatedCounts)
        self.log('%d predicted events' % (interpolatedCounts.size))
        return [(MGram(()), (interpolatedCounts, total))]
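
    # Note on the zero-order interpolation above (comment added for clarity):
    # backOffMass = total - effectiveTotal is the probability mass removed by
    # discounting and cutoffs, and pZero = 1/|predictedWords| is the uniform
    # zerogram distribution. Every predicted word therefore receives
    # nZero = backOffMass * pZero extra counts, so after the later division
    # by `total` the unigram distribution is the discounted relative
    # frequency interpolated with the uniform distribution; unseen words end
    # up with exactly nZero / total.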

    class StoredEffectiveCounts(object):
        def __init__(self):
            self.fname = tempfile.mkstemp('counts')[1]
            self.file = open(self.fname, 'wb')

        def add(self, history, values, total):
            marshal.dump(history, self.file)
            SparseVector.dump(values, self.file)
            marshal.dump(total, self.file)

        def finalize(self):
            self.file.close()
            self.file = None

        def __iter__(self):
            assert self.file is None
            file = open(self.fname, 'rb')
            while True:
                try:
                    history = marshal.load(file)
                    values = SparseVector.load(file)
                    total = marshal.load(file)
                    yield history, (values, total)
                except EOFError:
                    break
            file.close()

        def __del__(self):
            os.unlink(self.fname)

    def build(self, allCounts, result):
        assert self.vocabulary
        assert self.highestOrder is not None
        assert self.discounts is not None
        result.vocabulary = self.vocabulary

        allEffectiveCounts = self.makeZeroOrder(allCounts)
        result_add = result.topSection(0)
        for history, (values, total) in allEffectiveCounts:
            probabilities = values / total
            result_add(history, probabilities)

        for order in range(1, self.highestOrder + 1):
            minCount, discount = self.parametrizeOrder(order)
            allLowerOrderEffectiveCounts = allEffectiveCounts
            groupedCounts = self.groupedCounts(allCounts, order)
            result_add = result.boSection(order - 1)
            allEffectiveCounts = self.StoredEffectiveCounts()
            nHistories = nPredicted = 0
            for (lowerOrderHistory, (lowerOrderEffectiveCounts, lowerOrderTotal), counts) \
                    in leftJoin(allLowerOrderEffectiveCounts, groupedCounts):
                if counts is None:
                    lowerOrderDistribution = lowerOrderEffectiveCounts / lowerOrderTotal
                    result_add(lowerOrderHistory, lowerOrderDistribution)
                    continue

                effectiveCounts = []
                for oldest, values in counts:
                    effVals, total = self.effectiveCounts(values, minCount, discount)
                    if effVals:
                        effectiveCounts.append((oldest, effVals, total))
                effectiveMarginalCounts = sumCounts([
                    values for oldest, values, total in effectiveCounts ])
                effectiveMarginalTotal = effectiveMarginalCounts.sum()

                lowerOrderDistribution = []
                den = lowerOrderTotal - effectiveMarginalTotal
                for predicted, lowerOrderEffectiveCount in lowerOrderEffectiveCounts:
                    num = lowerOrderEffectiveCount - effectiveMarginalCounts[predicted]
                    if num <= 0.0 or den <= 0.0:
                        self.log('warning: marginal inversion encountered',
                                 repr((lowerOrderHistory, predicted,
                                       lowerOrderEffectiveCount,
                                       effectiveMarginalCounts[predicted], den)))
                    else:
                        lowerOrderDistribution.append((predicted, num / den))
                lowerOrderDistribution = Counts(lowerOrderDistribution)
                result_add(lowerOrderHistory, lowerOrderDistribution)

                for oldest, effectiveCountsGroup, total in effectiveCounts:
                    history = lowerOrderHistory + MGram((oldest,))
                    effectiveTotal = effectiveCountsGroup.sum()
                    backOffMass = total - effectiveTotal
                    assert backOffMass >= 0
                    interpolatedCounts = leftJoinInterpolateAndAddOneSparse(
                        effectiveCountsGroup,
                        backOffMass,
                        lowerOrderDistribution,
                        self.vocabulary.noneIndex,
                        backOffMass)
                    allEffectiveCounts.add(history, interpolatedCounts, total)
                    nHistories += 1
                    nPredicted += interpolatedCounts.size
            allEffectiveCounts.finalize()
            self.log('%d predicted events in %d histories' % (nPredicted, nHistories))

            result_add = result.topSection(order)
            for history, (values, total) in allEffectiveCounts:
                probabilities = values / total
                result_add(history, probabilities)

        result.finalize()
        return result

    logFile = None

    def setLogFile(self, f):
        self.logFile = f

    def log(self, *args):
        if self.logFile is not None:
            print >> self.logFile, ' '.join(map(str, args))

    def make(self, vocabulary, counts, order):
        self.setVocabulary(vocabulary)
        self.setHighestOrder(order)
        coc = [ mGramCounts.countsOfCounts(mGramCounts.mGramReduceToOrder(counts, order))
                for order in range(order + 1) ]
        self.estimateDiscounts(coc)
        result = Lm(order)
        counts = store(contract(counts))
        self.build(counts, result)
        return result
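
# A minimal usage sketch (not from the original file; it assumes `vocab` is a
# vocabulary object as returned by loadVocabulary below, containing '<s>',
# and `counts` is a sorted m-gram count stream as produced by the
# mGramCounts tools):
#
#     builder = LanguageModelBuilder()
#     builder.setLogFile(sys.stdout)
#     lm = builder.make(vocab, counts, 2)   # highest history order 2 => trigram
#     p = lm(history, predicted)            # back-off probability look-up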
# ===========================================================================
import math, sys
from IterMap import outerJoin

class LmDummy(object):
    def boSection(self, order):
        return self.ignore

    def topSection(self, order):
        return self.ignore

    def ignore(self, history, probabilities):
        pass

    def finalize(self):
        pass

class LmArpaWriter(LmDummy):
    vocabulary = None

    def __init__(self, file, highestOrder, notice=None):
        self.file = file
        self.highestOrder = highestOrder
        self.data = []
        if notice:
            print >> self.file, notice

    def boSection(self, order):
        return self.add

    def topSection(self, order):
        if order == self.highestOrder:
            return self.add
        else:
            return self.ignore

    def add(self, history, probabilities):
        order = len(history)
        try:
            li = self.data[order]
        except IndexError:
            while len(self.data) <= order: self.data.append([])
            li = self.data[order]
        li.append((history, probabilities))

    def finalize(self):
        self.writeArpa(self.file)
        self.file.close()

    def writeArpa(self, f):
        M = len(self.data)

        def probabilities(m):
            for history, probabilities in self.data[m]:
                gram = tuple(map(self.vocabulary.symbol, reversed(history)))
                for predicted, probability in probabilities:
                    if predicted is not self.vocabulary.noneIndex:
                        yield gram + (self.vocabulary.symbol(predicted),), probability

        def backOffs(m):
            for history, probabilities in self.data[m]:
                if self.vocabulary.noneIndex in probabilities:
                    gram = tuple(map(self.vocabulary.symbol, reversed(history)))
                    yield gram, probabilities[self.vocabulary.noneIndex]

        def joined(m):
            if m+1 < M:
                return outerJoin(sorted(probabilities(m)), sorted(backOffs(m+1)))
            else:
                return outerJoin(sorted(probabilities(m)), [])

        print >> f
        print >> f, '\\data\\'
        for m in range(M):
            n = 0
            for x in joined(m): n += 1
            print >> f, 'ngram %d=%d' % (m+1, n)
        print >> f
        for m in range(M):
            print >> f, '\\%d-grams:' % (m+1)
            for gram, probability, backOff in joined(m):
                if probability is None:
                    score = -99
                else:
                    score = math.log10(probability)
                if backOff is None or backOff == 1.0:
                    print >> f, '%f\t%s' % (score, ' '.join(gram))
                else:
                    print >> f, '%f\t%s\t%f' % (score, ' '.join(gram), math.log10(backOff))
            print >> f
        print >> f, '\\end\\'
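
# For reference, writeArpa produces the standard ARPA layout (values below
# are made up):
#
#   \data\
#   ngram 1=3
#   ngram 2=2
#
#   \1-grams:
#   -1.204120	A	-0.301030
#   ...
#   \end\
#
# Each line is "log10(p)<TAB>w1 ... wN[<TAB>log10(back-off)]"; grams that
# only carry a back-off weight are written with the dummy score -99.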

class LmEstarWriter(LmDummy):
    """
    Write LM in Even Simpler Than ARPA (ESTAR) format.
    The ESTAR format goes like this:

    \data\
    \include: foobar-3.lm \
    \history: \
    </s> 0.0356978
    A 0.0144545
    ...
    \history: <s> \
    __backoff__ 0.277374
    A 0.0293255
    ABOUT 0.00264798
    \end\

    - Any content before \data\ or after \end\ is ignored.
    - The tag \include: \ names a file (relative path to the current
      file) that is read as if its contents (between \data\ and \end\)
      were included at this position.
    - Histories are given in reverse order (i.e. most recent word first).
    - Some implementations expect that shorter histories come before
      longer ones.
    - Other lines are predicted words.
    - The special word __backoff__ is the back-off weight.
    - Numbers are plain probability values (not logarithms).
    - The file is in UTF-8 encoding.
    - For obvious reasons word tokens cannot be "__backoff__" or a
      backslash and cannot contain white-space.
    """
    vocabulary = None

    def __init__(self, filePrefix, fileSuffix, notice=None):
        self.filePrefix = filePrefix
        self.fileSuffix = fileSuffix
        self.notice = notice

    class Writer:
        def __init__(self, file, vocabulary, notice):
            self.file = file
            self.vocabulary = vocabulary
            print >> self.file, notice
            print >> self.file, '\\data\\'

        def include(self, fname):
            print >> self.file, '\\include: %s \\' % fname

        def __call__(self, history, probabilities):
            history_string = ' '.join(map(self.vocabulary.symbol, history))
            print >> self.file, '\\history:', history_string, '\\'
            for predicted, probability in probabilities:
                symbol = self.vocabulary.symbol(predicted)
                if symbol is None:
                    symbol = '__backoff__'
                print >> self.file, '\t%s\t%g' % (symbol, probability)

        def __del__(self):
            print >> self.file, '\\end\\'
            self.file.close()

    def filename(self, which):
        return '%s.estar-%s%s' % (self.filePrefix, which, self.fileSuffix)

    def topSection(self, order):
        f = gOpenOut(self.filename('%d' % (order + 1)))
        comment = 'This is a %d-gram model file.\n' % (order + 1)
        if self.notice: comment = self.notice + '\n' + comment
        part = self.Writer(f, self.vocabulary, comment)
        for oo in range(order):
            part.include(os.path.basename(self.filename('%dbo' % (oo + 1))))
        return part

    def boSection(self, order):
        f = gOpenOut(self.filename('%dbo' % (order + 1)))
        comment = 'This is a modified back-off %d-gram distribution file.\n' % (order + 1)
        if self.notice: comment = self.notice + '\n' + comment
        part = self.Writer(f, self.vocabulary, comment)
        return part

    def finalize(self):
        pass
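
# Note (added for clarity): each topSection(order) call opens a model file
# named '<filePrefix>.estar-<order+1><fileSuffix>' and each boSection(order)
# call a matching '...estar-<order+1>bo...' file; the model file \include:s
# the back-off files of all lower orders, so the highest-order file is the
# entry point when reading the model back.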

class LmNode(object):
    __slots__ = ['history', 'backOffWeight', 'probabilities', 'parent', 'children']

    def __init__(self, history):
        self.parent = None
        self.children = []
        self.history = history
        self.backOffWeight = 1.0
        self.probabilities = {}

    def add(self, predicted, probability):
        self.probabilities[predicted] = probability

class Lm(LmDummy):
    Node = LmNode

    def __init__(self, highestOrder):
        self.highestOrder = highestOrder
        self.nodes = {}

    def boSection(self, order):
        return self.add

    def topSection(self, order):
        if order == self.highestOrder:
            return self.add
        else:
            return self.ignore

    def add(self, history, probabilities):
        n = self.nodes[history] = self.Node(history)
        for predicted, probability in probabilities:
            if predicted is None:
                n.backOffWeight = probability
            else:
                n.probabilities[predicted] = probability

    def finalize(self):
        for n in self.nodes.values():
            if len(n.history) == 0: continue
            shorterHistory = n.history[:-1]
            n.parent = self.nodes[shorterHistory]
            n.parent.children.append(n)

    def __call__(self, history, predicted):
        backOffWeight = 1.0
        while True:
            if history in self.nodes:
                n = self.nodes[history]
                if predicted in n.probabilities:
                    return backOffWeight * n.probabilities[predicted]
                backOffWeight *= n.backOffWeight
            if history:
                history = history[:-1]
            else:
                break
        return backOffWeight

    def getList(self, node, parentProbabilities=None):
        if node is None: return []
        if parentProbabilities is None:
            parentProbabilities = self.getList(node.parent)
        result = []
        for w, p, pp in outerJoin(node.probabilities, parentProbabilities):
            if p is not None:
                result.append((w, p))
            else:
                result.append((w, node.backOffWeight * pp))
        return result

    def checkNormalisation(self, node=None, parentProbabilities=None):
        if node is None: return self.checkNormalisation(self.root)
        probabilities = self.getList(node, parentProbabilities)
        total = sum([ p for w, p in probabilities ])
        if abs(total - 1.0) > 1e-6:
            print >> sys.stdout, 'warning: denormalized history:', node.history, total
        for child in node.children:
            self.checkNormalisation(child, probabilities)
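
# Illustration of the back-off look-up in Lm.__call__ (comment added for
# clarity; histories are stored most-recent-word-first, which is why the
# ARPA writer reverses them). Looking up p(w | u v), i.e. history (v, u):
#
#     lm((v, u), w)
#       = lm.nodes[(v, u)].probabilities[w]                   if w is stored there
#       = bo(v, u) * lm.nodes[(v,)].probabilities[w]          else, if stored there
#       = bo(v, u) * bo(v) * lm.nodes[()].probabilities[w]    otherwise
#
# where bo(.) is the node's backOffWeight.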
# ===========================================================================
from mGramCounts import FileStorage, TextStorage, loadVocabulary
from groupedCounts import StoredCounts, NonMonotonousHistoriesError
from misc import gOpenIn, gOpenOut

class SentenceStartRemover(object):
    def __init__(self, vocabulary, counts):
        self.sentenceStart = vocabulary.index('<s>')
        self.counts = counts

    def __iter__(self):
        it = iter(self.counts)
        for (history, predicted), value in it:
            if predicted != self.sentenceStart:
                yield (history, predicted), value
            else:
                assert history == ()
        # (history, predicted), value = it.next()
        # assert (history, predicted) == ((), self.sentenceStart)
        # for (history, predicted), value in it:
        #     history = MGram(history)
        #     yield (history, predicted), value

def loadCounts(fname, vocabulary, binaryCountFile=None):
    try:
        counts = TextStorage(fname, vocabulary.index)
        ### work around
        counts = SentenceStartRemover(vocabulary, counts)
        counts = contract(counts)
        counts = store(counts, big=True, filename=binaryCountFile)
    except NonMonotonousHistoriesError, exc:
        h1, h2 = exc.args
        print h1, map(vocabulary.symbol, h1)
        print h2, map(vocabulary.symbol, h2)
        raise
    return counts

def makeLmWriter(options):
    if options.lm_format == 'arpa':
        fname = options.lm
        print >> sys.stdout, 'will write LM to', fname, '...'
        lm = LmArpaWriter(gOpenOut(fname), options.order - 1, notice)
    elif options.lm_format == 'estar':
        filePrefix, fileSuffix = os.path.splitext(options.lm)
        print >> sys.stdout, 'will write LM to %s-*%s ...' % (filePrefix, fileSuffix)
        lm = LmEstarWriter(filePrefix, fileSuffix, notice)
    else:
        raise ValueError(options.lm_format)
    return lm

def maximumCountsOrder(countsOfCounts):
    for order, coc in enumerate(countsOfCounts):
        coc = [ (freq, count) for freq, count in coc if count > 0 ]
        if len(coc) < 2:
            break
    else:
        order = len(countsOfCounts)
    return order - 1
import os

def main(options, args):
    builder = LanguageModelBuilder()
    builder.setLogFile(sys.stdout)

    vocabulary = loadVocabulary(options.vocabulary)
    builder.setVocabulary(vocabulary)
    builder.setHighestOrder(options.order - 1)
    if options.count_cutoffs:
        cutoffs = map(int, options.count_cutoffs.split())
        builder.setCountCutoffs(cutoffs)

    binaryCountFile = options.read + '.bin'
    if os.path.isfile(binaryCountFile):
        counts = StoredCounts(binaryCountFile)
    else:
        counts = loadCounts(options.read, vocabulary, binaryCountFile)

    if options.counts_of_counts:
        coc = eval(gOpenIn(options.counts_of_counts).read())
    else:
        coc = [ mGramCounts.countsOfCounts(mGramCounts.mGramReduceToOrder(counts, order))
                for order in range(options.order) ]
    maximumOrder = maximumCountsOrder(coc)
    if builder.highestOrder > maximumOrder:
        print 'warning: no counts for orders above %d' % (maximumOrder+1)
        builder.setHighestOrder(maximumOrder)
    builder.estimateDiscounts(coc)

    if options.lm:
        lm = makeLmWriter(options)
    else:
        lm = LmDummy()
    builder.build(counts, lm)

    if __debug__ and False: ### TESTING
        print >> sys.stdout, 'verifying normalization ...'
        lm2 = Lm(lm)
        lm2.checkNormalisation()

if __name__ == '__main__':
    import optparse, tool
    options = optparse.OptionParser()
    tool.addOptions(options)
    options.add_option('-v', '--vocabulary')
    options.add_option('-r', '--read')
    options.add_option('-U', '--count-cutoffs',
                       help='set count cutoff values to n_i for order i',
                       metavar='n_0 n_1 ...')
    options.add_option('-C', '--counts-of-counts',
                       help='read counts-of-counts from FILE', metavar='FILE')
    options.add_option('-M', '--order', type='int', default=3)
    options.add_option('-f', '--lm-format', default='arpa',
                       help='valid choices are: arpa, estar')
    options.add_option('-l', '--lm')
    options.add_option('--storage-class', default='file')
    options.add_option('--memory-limit', type='int')
    options, args = options.parse_args()
    tool.run(main, options, args)
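
# Example invocation (a sketch; file names are hypothetical):
#
#   python LanguageModel.py -v corpus.vocab -r corpus.counts -M 3 -f arpa -l corpus.3gram.lm
#
# This loads the vocabulary and text-format m-gram counts (re-using
# corpus.counts.bin if it already exists), estimates discounts from
# counts-of-counts and writes a trigram back-off model in ARPA format.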