From 47fa305a695aea4ba6c8dff71c47c0f67adf10cf Mon Sep 17 00:00:00 2001 From: Johannes Englisch Date: Thu, 14 Feb 2013 20:18:54 +0100 Subject: [PATCH] created git repository --- MANIFEST.in | 5 + README-win.txt | 47 +++++ README.txt | 53 +++++ formatgloss.pyw | 24 +++ formatgloss_cli.py | 53 +++++ formatglosslib/__init__.py | 1 + formatglosslib/gui.py | 345 +++++++++++++++++++++++++++++++++ formatglosslib/tbgloss.py | 382 +++++++++++++++++++++++++++++++++++++ setup.cfg | 2 + setup.py | 65 +++++++ stdeb.cfg | 3 + 11 files changed, 980 insertions(+) create mode 100644 MANIFEST.in create mode 100644 README-win.txt create mode 100644 README.txt create mode 100755 formatgloss.pyw create mode 100755 formatgloss_cli.py create mode 100644 formatglosslib/__init__.py create mode 100644 formatglosslib/gui.py create mode 100644 formatglosslib/tbgloss.py create mode 100644 setup.cfg create mode 100755 setup.py create mode 100644 stdeb.cfg diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..626416a --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,5 @@ +include MANIFEST.in +include README.txt +include README-win.txt +include setup.cfg +include stdeb.cfg \ No newline at end of file diff --git a/README-win.txt b/README-win.txt new file mode 100644 index 0000000..729f369 --- /dev/null +++ b/README-win.txt @@ -0,0 +1,47 @@ +Formatgloss +=========== + + +## Description ## + +The linguistic software Toolbox can produce interlinearised glosses of data +acquired during fieldwork. However glosses containing combining diacritics are +often misaligned. + +The present script scans text files for Toolbox glosses and realigns them +taking diacritics into consideration. It comes with a command-line interface +as well as a graphical user interface written in wxPython. 
+ + +## Usage ## + +Command-line interface: + + formatgloss_cli.exe file + +Graphical user interface: + + formatgloss.exe + + +## License ## + +Copyright (c) 2013 Johannes Englisch + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..3cd2c32 --- /dev/null +++ b/README.txt @@ -0,0 +1,53 @@ +Formatgloss +=========== + + +## Description ## + +The linguistic software Toolbox can produce interlinearised glosses of data +acquired during fieldwork. However glosses containing combining diacritics are +often misaligned. + +The present script scans text files for Toolbox glosses and realigns them +taking diacritics into consideration. It comes with a command-line interface +as well as a graphical user interface written in wxPython. 
+ + +## Requirements ## + + * This script requires at least Python 2.6 + * The graphical user interface requires wxPython + + +## Usage ## + +Command-line interface: + + formatgloss_cli.py file + +Graphical user interface: + + formatgloss.pyw + + +## License ## + +Copyright (c) 2013 Johannes Englisch + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/formatgloss.pyw b/formatgloss.pyw new file mode 100755 index 0000000..476e3c0 --- /dev/null +++ b/formatgloss.pyw @@ -0,0 +1,24 @@ +#! /usr/bin/env python + +'''Graphical user interface to formatgloss. + +This script opens a Toolbox file and reformats glosses in it. Opening the +input file and saving the reformatted file are accompanied by a wxPython +user interface. 
def format_faulty_gloss(gloss):
    '''Build the warning banner printed for a gloss that failed to parse.

    The banner is made of a centred ``WARNING`` headline (preceded by an
    empty line), a fixed notice, the error message in parentheses, a
    dashed rule, the gloss text itself and a closing rule.

    :param gloss: faulty gloss; its ``error`` attribute and its ``str()``
                  representation are used
    :type gloss: tbgloss.ToolboxGloss
    :return: multi-line banner, without trailing newline
    :rtype: str

    '''
    banner = [
        '\n{0:=^60}'.format(' WARNING '),
        'Could not parse following gloss:',
        '({0})'.format(gloss.error),
        '-' * 60,
        str(gloss),
        '=' * 60,
    ]
    return '\n'.join(banner)
+ + :param args: command-line arguments + :type args: list of str + + ''' + if len(args) != 2: + sys.stderr.write('Error: needs exactly one text file.\n') + return + with open(args[1]) as input_file: + lines = input_file.readlines() + lines = [line.decode(tbgloss.INPUT_ENC).rstrip() for line in lines] + toolbox_file = tbgloss.ToolboxFile(lines) + print str(toolbox_file) + for gloss in toolbox_file.get_glosses(): + if gloss.is_faulty: + sys.stderr.write(format_faulty_gloss(gloss) + '\n') + + +if __name__ == '__main__': + main(sys.argv) diff --git a/formatglosslib/__init__.py b/formatglosslib/__init__.py new file mode 100644 index 0000000..44e41fe --- /dev/null +++ b/formatglosslib/__init__.py @@ -0,0 +1 @@ +'''This package provides the modules for Formatgloss.''' diff --git a/formatglosslib/gui.py b/formatglosslib/gui.py new file mode 100644 index 0000000..200eccb --- /dev/null +++ b/formatglosslib/gui.py @@ -0,0 +1,345 @@ +'''Graphical user interface to Formatgloss. + +This module contains the wxPython user interface to formatgloss. +This script opens a Toolbox file and reformats glosses in it. Opening the +input file and saving the reformatted file are accompanied by a wxPython +user interface. + +''' + + +import os.path +import wx +import formatglosslib.tbgloss as tbgloss + + +# window strings +MSG_TITLE = 'Toolbox gloss formatter' + +# status messages +MSG_NOFILE = 'No text file opened.' +MSG_FILEEMPTY = 'File "{0}" is empty.' +MSG_SUCCESS = 'Opened file "{0}".' +MSG_REPORT = '''Report + * {number} glosses detected + * {faulty} glosses cannot be formatted''' + +# dialogs +MSG_OPEN = 'Open Toolbox text file' +MSG_SAVE = 'Save formatted Toolbox text file' +MSG_ERROR = 'Error' +MSG_INFO = 'Information' + + +class OpenDialog(wx.FileDialog): + ''''Open file' dialog''' + def __init__(self, parent): + '''Initialise 'open file' dialog. 
class SaveDialog(wx.FileDialog):
    ''''Save file' dialog

    File chooser used for picking the output file.  It is created with
    ``wx.FD_SAVE | wx.FD_OVERWRITE_PROMPT`` and ``MSG_SAVE`` as its
    message.

    '''
    def __init__(self, parent):
        '''Initialise 'save file' dialog.

        :param parent: parent window of the dialog
        :type parent: wx.Window

        '''
        super(SaveDialog, self).__init__(parent,
                                         message=MSG_SAVE,
                                         style=wx.FD_SAVE |
                                               wx.FD_OVERWRITE_PROMPT)
class GlossReport(wx.StaticText):
    '''Static text which reports occurring faulty Toolbox glosses'''

    def __init__(self, parent, label):
        '''Initialise gloss report.

        :param parent: parent window of the gloss report
        :type parent: wx.Window
        :param label: initial label text
        :type label: str

        '''
        super(GlossReport, self).__init__(parent, label=label)
        self.toolbox_file = tbgloss.ToolboxFile()
        # name of the file the report refers to; only needed for the
        # 'file is empty' message
        self.filename = ''

    def _update(self):
        '''Update text of the report'''
        if not self.toolbox_file:
            # The file name is stored on the report itself.  Asking the
            # parent for it does not work: the parent of this widget is
            # the panel inside the main frame, which has no ``filename``.
            self.SetLabel(
                MSG_FILEEMPTY.format(os.path.basename(self.filename)))
            return
        gloss_list = self.toolbox_file.get_glosses()
        faulty = sum(1 for gloss in gloss_list if gloss.is_faulty)
        self.SetLabel(MSG_REPORT.format(number=len(gloss_list),
                                        faulty=faulty))

    def set_toolbox_file(self, toolbox_file, filename=''):
        '''Set a new toolbox file and update report

        :param toolbox_file: new Toolbox file
        :type toolbox_file: tbgloss.ToolboxFile
        :param filename: path of the file, shown if the file is empty
        :type filename: str

        '''
        self.toolbox_file = toolbox_file
        self.filename = filename
        self._update()


class ResultsWindow(wx.Frame):
    '''Main frame of the application

    The frame offers opening a Toolbox text file, inspecting the glosses
    that could not be formatted and saving the reformatted file.

    '''

    def __init__(self, parent):
        '''Initialise main frame.

        :param parent: parent window of the frame
        :type parent: wx.Window

        '''
        super(ResultsWindow, self).__init__(parent)
        self.toolbox_file = tbgloss.ToolboxFile()
        self.filename = ''
        self.init_ui()
        self.reset_window()
        self.Show()

    def init_ui(self):
        '''Initialise user interface of the frame'''
        # window settings
        self.SetTitle(MSG_TITLE)
        self.SetSize((300, 170))

        # menu
        menubar = wx.MenuBar()
        menufile = wx.Menu()
        menufile.Append(id=wx.ID_OPEN, text='&Open...\tCtrl-O')
        self.filesave = menufile.Append(id=wx.ID_SAVE, text='&Save...\tCtrl-S')
        menufile.AppendSeparator()
        self.fileclose = menufile.Append(id=wx.ID_CLOSE,
                                         text='&Close file\tCtrl-W')
        menufile.Append(id=wx.ID_EXIT, text='&Quit\tCtrl-Q')
        menubar.Append(menufile, '&File')
        menutools = wx.Menu()
        self.toolsshow = menutools.Append(id=wx.ID_ANY, text='Show &errors')
        menubar.Append(menutools, '&Tools')
        self.SetMenuBar(menubar)

        # widgets
        # workaround for the 'ugly dark-grey background problem' in Windows
        panel = wx.Panel(parent=self)
        self.label = wx.StaticText(parent=panel, label=MSG_NOFILE)
        line = wx.StaticLine(parent=panel)
        self.report = GlossReport(parent=panel, label='')
        openbutton = wx.Button(parent=panel, id=wx.ID_OPEN, label='&Open')
        self.showbutton = wx.Button(parent=panel, label='Show &Errors')
        self.savebutton = wx.Button(parent=panel, id=wx.ID_SAVE, label='&Save')

        # layout
        vsizer = wx.BoxSizer(orient=wx.VERTICAL)
        vsizer.Add(self.label, flag=wx.ALL, border=5)
        vsizer.Add(line, flag=wx.EXPAND | wx.ALL, border=5)
        vsizer.Add(self.report,
                   proportion=1,
                   flag=wx.EXPAND | wx.ALL,
                   border=5)
        gsizer = wx.GridSizer(rows=1, cols=3)
        gsizer.Add(openbutton, flag=wx.ALL, border=5)
        gsizer.Add(self.showbutton, flag=wx.ALL, border=5)
        gsizer.Add(self.savebutton, flag=wx.ALL, border=5)
        vsizer.Add(gsizer)
        panel.SetSizer(vsizer)

        # events
        self.Bind(event=wx.EVT_BUTTON, handler=self.on_open, id=wx.ID_OPEN)
        self.Bind(event=wx.EVT_BUTTON, handler=self.on_show,
                  source=self.showbutton)
        self.Bind(event=wx.EVT_BUTTON, handler=self.on_save, id=wx.ID_SAVE)
        self.Bind(event=wx.EVT_MENU, handler=self.on_open, id=wx.ID_OPEN)
        self.Bind(event=wx.EVT_MENU, handler=self.on_save, id=wx.ID_SAVE)
        self.Bind(event=wx.EVT_MENU, handler=self.on_close, id=wx.ID_CLOSE)
        self.Bind(event=wx.EVT_MENU, handler=self.on_quit, id=wx.ID_EXIT)
        self.Bind(event=wx.EVT_MENU, handler=self.on_show,
                  source=self.toolsshow)

    def disable_save(self):
        '''Disable 'save' widgets'''
        self.filesave.Enable(False)
        self.fileclose.Enable(False)
        self.savebutton.Enable(False)

    def disable_show(self):
        '''Disable 'show' widgets'''
        self.toolsshow.Enable(False)
        self.showbutton.Enable(False)

    def enable_save(self):
        '''Enable disabled 'save' widgets'''
        self.filesave.Enable()
        self.fileclose.Enable()
        self.savebutton.Enable()
        self.savebutton.SetDefault()
        self.savebutton.SetFocus()

    def enable_show(self):
        '''Enable disabled 'show' widgets'''
        self.toolsshow.Enable()
        self.showbutton.Enable()

    def on_open(self, event):
        '''Handle 'open' event'''
        self.read_toolbox_file()

    def on_show(self, event):
        '''Handle 'show glosses' event'''
        ShowGlossesDialog(parent=self, toolbox_file=self.toolbox_file)

    def on_close(self, event):
        '''Handle 'close' event'''
        self.reset_window()

    def on_quit(self, event):
        '''Handle 'exit' event'''
        self.Close()

    def _ask_path(self, dialog_class):
        '''Show a file dialog and return the chosen path.

        :param dialog_class: dialog to show (OpenDialog or SaveDialog)
        :type dialog_class: type
        :return: chosen path, or an empty string if the dialog was
                 dismissed without confirming
        :rtype: str

        '''
        dlg = dialog_class(parent=self)
        try:
            # only trust the path if the user actually confirmed
            if dlg.ShowModal() != wx.ID_OK:
                return ''
            return dlg.GetPath()
        finally:
            dlg.Destroy()

    def on_save(self, event):
        '''Handle 'save' event'''
        filename = self._ask_path(SaveDialog)
        if not filename:
            return
        try:
            with open(filename, 'w') as outputfile:
                outputfile.write(str(self.toolbox_file))
        except IOError as error:
            ErrorDialog(parent=self, message=str(error))
        else:
            MessageDialog(parent=self, message='File was saved successfully.')

    def read_toolbox_file(self):
        '''Read Toolbox glosses from file

        Asks for a file, decodes it with ``tbgloss.INPUT_ENC`` and shows
        the report.  Files that cannot be read or decoded are reported in
        an error dialog and leave the window unchanged.

        '''
        filename = self._ask_path(OpenDialog)
        if not filename:
            return
        try:
            with open(filename, 'r') as inputfile:
                content = inputfile.readlines()
            content = [unicode(line.strip(), tbgloss.INPUT_ENC)
                       for line in content]
        except (IOError, UnicodeError) as error:
            # UnicodeError: the file is not encoded in INPUT_ENC
            ErrorDialog(parent=self, message=str(error))
            return
        self.toolbox_file = tbgloss.ToolboxFile(content)
        self.filename = filename
        self.report.set_toolbox_file(self.toolbox_file, filename)
        self.label.SetLabel(MSG_SUCCESS.format(os.path.basename(filename)))
        self.enable_save()
        if any(gloss.is_faulty
               for gloss in self.toolbox_file.get_glosses()):
            self.enable_show()
        else:
            self.disable_show()

    def reset_window(self):
        '''Reset window to defaults'''
        self.filename = ''
        self.disable_save()
        self.disable_show()
        self.label.SetLabel(MSG_NOFILE)
        self.report.SetLabel('')
def true_len(string):
    r'''Return the 'true' length of a string without counting diacritics.

    Every character that is listed in the module-level ``DIACRITICS``
    table is skipped; every other character counts as one.

    :param string: input string
    :type string: unicode
    :return: number of characters which are not combining diacritics
    :rtype: int

    >>> true_len('Completely normal string')
    24
    >>> true_len(u'String with Diacritics: \xe9 e\u0301 \u0268 \u0268\u0301')
    31

    '''
    # count lazily instead of building a throwaway list
    return sum(1 for char in string if char not in DIACRITICS)
class MorphemeMap(object):
    r'''Two-way lookup between the words and the morphemes of a gloss.

    A word line and its segmented morpheme line are scanned once; every
    morpheme is assigned to the word it belongs to.  The line markers
    (``\t``, ``\mb``) are treated like ordinary tokens and therefore
    occupy index 0 on both sides:

    >>> word_line = '\\t das gut gebaute Haus'
    >>> morpheme_line = '\\mb das gut ge- bau -t -e Haus'
    >>> morph_map = MorphemeMap(word_line, morpheme_line)
    >>> morph_map.get_word(4)
    3
    >>> morph_map.get_morphemes(3)
    [3, 4, 5, 6]

    Each morpheme must belong to exactly one word and each word needs at
    least one morpheme; if the counts do not work out an exception is
    raised:

    >>> word_line = '\\t das sehr gut gebaute Haus'
    >>> morpheme_line = '\\mb das gut ge- bau -t -e Haus'
    >>> morph_map = MorphemeMap(word_line, morpheme_line)
    Traceback (most recent call last):
        ...
    MorphemeMapError: could not assign all words to morphemes

    '''

    def __init__(self, word_line, morpheme_line):
        '''Build the mapping from two whitespace-separated lines.

        :param word_line: line containing complete words
        :type word_line: unicode
        :param morpheme_line: line containing single morphemes
        :type morpheme_line: unicode
        :raises MorphemeMapError: if words and morphemes do not match up

        '''
        last_word = len(word_line.split()) - 1
        current = 0        # word the next morpheme is attached to
        complete = False   # may the current word be finished already?
        self.mapping = []
        for segment in morpheme_line.split():
            if segment == '-':
                # compound words are glossed 'first - second' in Toolbox
                complete = False
            elif complete and not segment.startswith('-'):
                # neither a suffix nor part of a compound: next word
                current += 1
                complete = False
            if not segment.endswith('-'):
                complete = True
            self.mapping.append(current)
        if current < last_word:
            raise MorphemeMapError('could not assign all words to morphemes')
        if current > last_word:
            raise MorphemeMapError('could not assign all morphemes to a word')

    def get_word(self, index):
        '''Look up the word a morpheme belongs to.

        :param index: index of the morpheme
        :type index: int
        :return: index of the associated word
        :rtype: int

        '''
        return self.mapping[index]

    def get_morphemes(self, index):
        '''Look up all morphemes which make up a word.

        :param index: index of the word
        :type index: int
        :return: indices of all associated morphemes, in ascending order
        :rtype: list of int

        '''
        hits = []
        for position, owner in enumerate(self.mapping):
            if owner == index:
                hits.append(position)
        return hits
'\\ps no a -ai n'] + >>> tb_gloss = ToolboxGloss(text_line, gloss_lines) + >>> print tb_gloss + \t das blaue Haus + \mb das blau -e + \gl the blue -N.SG house + \ps no a -ai n + >>> print tb_gloss.error + could not assign all words to morphemes + + ''' + + def __init__(self, text_line, gloss_lines): + '''Initialise ToolboxGloss. + + :param text_line: line containing complete words + :type text_line: unicode + :param gloss_lines: lines segmented into morphemes + :type gloss_lines: list of unicode + + ''' + self.is_faulty = False + self.error = None + self.text_line = text_line + self.gloss_lines = gloss_lines + self.word_width = None + self.morph_width = None + try: + self.morph_map = MorphemeMap(text_line, gloss_lines[0]) + self._calc_morph_widths() + self._calc_word_widths() + except MorphemeMapError as index_error: + self._error(str(index_error)) + except GlossError as gloss_error: + self._error(str(gloss_error)) + + def _error(self, message=None): + '''Set error message and mark gloss as faulty. 
class ToolboxFile(object):  # pylint: disable=R0903
    r'''Representation of a Toolbox text file.

    This class takes a list of lines as an input and reformats them in order to
    correct the alignment of Toolbox glosses and remove unneeded white space.
    The input lines have to be in unicode which is the encoding used internally
    by the class.  The reformatted Toolbox file can be accessed by both the
    str() and the unicode() function.

    Within the ToolboxFile class, each entry of ``lines`` is either a plain
    line or a ToolboxGloss object.  The class scans the file and wraps every
    run of four consecutive lines starting with ``\t``, ``\mb``, ``\gl`` and
    ``\ps`` in a ToolboxGloss object:

    >>> lines = ['\\ref 001',
    ...          '\\t das blaue Haus',
    ...          '\\mb das blau -e Haus',
    ...          '\\gl the blue -N.SG house',
    ...          '\\ps no a -ai n',
    ...          '\\f The blue house']
    >>> tb_file = ToolboxFile(lines)
    >>> print tb_file.lines
    ['\\ref 001', <__main__.ToolboxGloss object ...>, '\\f The blue house']
    >>> print tb_file
    \ref 001
    \t  das blaue      Haus
    \mb das blau -e    Haus
    \gl the blue -N.SG house
    \ps no  a    -ai   n
    \f The blue house

    '''

    # line markers which, in this order, make up one gloss
    _GLOSS_MARKERS = ('\\t ', '\\mb ', '\\gl ', '\\ps ')

    def __init__(self, lines=None):
        '''Initialise ToolboxFile.

        The given lines are copied, the caller's list is never modified.

        :param lines: lines of the file
        :type lines: list of unicode

        '''
        self.lines = list(lines) if lines else []
        size = len(self._GLOSS_MARKERS)
        index = 0
        # a gloss needs ``size`` lines, so stop when fewer are left
        while index + size <= len(self.lines):
            window = self.lines[index:index + size]
            if all(line.startswith(marker)
                   for line, marker in zip(window, self._GLOSS_MARKERS)):
                # collapse the lines of the gloss into a single entry
                self.lines[index:index + size] = [
                    ToolboxGloss(window[0], window[1:])]
            index += 1

    def __len__(self):
        '''Return length of the toolbox file'''
        return len(self.lines)

    def __delitem__(self, key):
        '''Delete line from a Toolbox file'''
        del self.lines[key]

    def __getitem__(self, key):
        '''Return line of a Toolbox file'''
        return self.lines[key]

    def __setitem__(self, key, new_line):
        '''Exchange a line in the Toolbox file'''
        self.lines[key] = new_line

    def __iter__(self):
        '''Return iterator of the toolbox file'''
        # __iter__ has to return an iterator, not the list itself
        return iter(self.lines)

    def __str__(self):
        '''Return Toolbox file as a non-unicode string'''
        return self.__unicode__().encode(INPUT_ENC)

    def __unicode__(self):
        '''Return Toolbox file as a unicode string'''
        return '\n'.join(unicode(line) for line in self.lines)

    def get_glosses(self):
        '''Return list of glosses in the Toolbox file'''
        return [line for line in self.lines if isinstance(line, ToolboxGloss)]
+ +''' + + +from distutils.core import setup +try: + import py2exe +except ImportError: + HAS_PY2EXE = False +else: + HAS_PY2EXE = True + + +description = '''Reformatter of text files by the fieldwork software Toolbox. + +The linguistic software Toolbox can produce interlinearised glosses of +data acquired during fieldwork. However glosses containing combining +diacritics are often misaligned. + +The present script scans text files for Toolbox glosses and realigns +them taking diacritics into consideration. It comes with a command-line +as well as with a GUI written in wxPython. + +''' + +# TODO url, download_url +config = {'name': 'Formatgloss', + 'version': '1.0', + 'author': 'Johannes Englisch', + 'author_email': 'cyberjoe0815@hotmail.com', + 'description': 'Reformats glosses in Toolbox files', + 'long_description': description, + 'classifiers': ['Development Status :: 4 - Beta', + 'Environment :: Console', + 'Environment :: MacOS X', + 'Environment :: Win32 (MS Windows)', + 'Environment :: X11 Applications :: GTK', + 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: MIT License', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 2', + 'Topic :: Text Processing :: Linguistic'], + 'packages': ['formatglosslib'], + 'scripts': ['formatgloss_cli.py', 'formatgloss.pyw'], + 'requires': ['wx']} + +config_py2exe = {'console': ['formatgloss_cli.py'], + 'windows': ['formatgloss.pyw']} +py2exe_options = {'dist_dir': 'dist/%s-%s-win32' % (config['name'], + config['version']), + 'bundle_files': 1} + +if HAS_PY2EXE: + config.update(config_py2exe) + if not config.has_key('options'): + config['options'] = dict() + config['options']['py2exe'] = py2exe_options + +setup(**config) \ No newline at end of file diff --git a/stdeb.cfg b/stdeb.cfg new file mode 100644 index 0000000..8d4aaba --- /dev/null +++ b/stdeb.cfg @@ -0,0 +1,3 @@ +[DEFAULT] +Depends: python-wxgtk2.8 +XS-Python-Version: >= 2.6