Add Japanese support to DepCCGParser (#24)

Co-authored-by: Ian Fan <[email protected]>
CQCL · Aug 16, 2022 · d911686 · d911686
1 parent d25a75b
commit d911686
Showing 1 changed file with 74 additions and 17 deletions.
diff --git a/lambeq/text2diagram/depccg_parser.py b/lambeq/text2diagram/depccg_parser.py
@@ -42,15 +42,18 @@
 
 if TYPE_CHECKING:
     import depccg
-    from depccg.annotator import annotate_XX, english_annotator
+    from depccg.annotator import (annotate_XX, english_annotator,
+                                  japanese_annotator)
     from depccg.cat import Category
 
 
 def _import_depccg() -> None:
-    global depccg, Category, annotate_XX, english_annotator
+    global depccg, Category
+    global annotate_XX, english_annotator, japanese_annotator
     import depccg
     import depccg.allennlp.utils
-    from depccg.annotator import annotate_XX, english_annotator
+    from depccg.annotator import (annotate_XX, english_annotator,
+                                  japanese_annotator)
     from depccg.cat import Category
     import depccg.lang
     import depccg.parsing
@@ -86,34 +89,61 @@ def __init__(self,
                  lang: str = 'en',
                  model: Optional[str] = None,
                  use_model_unary_rules: bool = False,
-                 annotator: Optional[str] = None,
+                 annotator: str = 'janome',
+                 tokenize: Optional[bool] = None,
                  device: int = -1,
-                 root_cats: Iterable[str] = ('S[dcl]', 'S[wq]', 'S[q]',
-                                             'S[qem]', 'NP'),
+                 root_cats: Optional[Iterable[str]] = None,
                  verbose: str = VerbosityLevel.PROGRESS.value,
                  **kwargs: Any) -> None:
         """Instantiate a parser based on `depccg`.
 
         Parameters
         ----------
         lang : { 'en', 'ja' }
-            The language to use. Use of 'ja' is experimental and has not
-            been tested.
+            The language to use: 'en' for English, 'ja' for Japanese.
         model : str, optional
             The name of the model variant to use, if any.
-            (At time of writing) `depccg` supports 'elmo', 'rebank' and
+            At time of writing, `depccg` supports 'elmo', 'rebank' and
             'elmo_rebank' for English only.
         use_model_unary_rules : bool, default: False
             Use the unary rules supplied by the model instead of the
             ones by `lambeq`.
-        annotator : str, optional
-            The annotator to use, if any. (At time of writing) `depccg`
-            supports 'candc' and 'spacy'.
+        annotator : str, default: 'janome'
+            The annotator to use, if any.
+            At time of writing `depccg` supports 'candc' and 'spacy' for
+            English, and 'janome' and 'jigg' for Japanese.
+            By default, no annotator is used for English, and 'janome'
+            is used for Japanese.
+        tokenize : bool, optional
+            Whether to tokenise the input when annotating. This option
+            should only be specified when using the 'spacy' annotator.
         device : int, optional
             The ID of the GPU to use. By default, uses the CPU.
-        root_cats : iterable of str, default: ['S[dcl]', 'S[wq]',
-                                               'S[q]', 'S[qem]', 'NP']
-            A list of categories allowed at the root of the parse.
+        root_cats : iterable of str, optional
+            A list of categories allowed at the root of the parse. By
+            default, the English categories are:
+                - S[dcl]
+                - S[wq]
+                - S[q]
+                - S[qem]
+                - NP
+            and the Japanese categories are:
+                - NP[case=nc,mod=nm,fin=f]
+                - NP[case=nc,mod=nm,fin=t]
+                - S[mod=nm,form=attr,fin=t]
+                - S[mod=nm,form=base,fin=f]
+                - S[mod=nm,form=base,fin=t]
+                - S[mod=nm,form=cont,fin=f]
+                - S[mod=nm,form=cont,fin=t]
+                - S[mod=nm,form=da,fin=f]
+                - S[mod=nm,form=da,fin=t]
+                - S[mod=nm,form=hyp,fin=t]
+                - S[mod=nm,form=imp,fin=f]
+                - S[mod=nm,form=imp,fin=t]
+                - S[mod=nm,form=r,fin=t]
+                - S[mod=nm,form=s,fin=t]
+                - S[mod=nm,form=stem,fin=f]
+                - S[mod=nm,form=stem,fin=t]
         verbose : str, default: 'progress',
             Controls the command-line output of the parser. Only
             'progress' option is available for this parser.
@@ -127,9 +157,36 @@ def __init__(self,
                              '"progress" level of verbosity. '
                              f'`{self.verbose}` was given.')
         _import_depccg()
+        if lang.lower() == 'en':
+            if root_cats is None:
+                root_cats = ['S[dcl]', 'S[wq]', 'S[q]', 'S[qem]', 'NP']
+            self.annotator_fun = english_annotator.get(annotator, annotate_XX)
+            self.tokenize = tokenize if tokenize is not None else False
+        elif lang.lower() == 'ja':
+            if root_cats is None:
+                root_cats = ['NP[case=nc,mod=nm,fin=f]',
+                             'NP[case=nc,mod=nm,fin=t]',
+                             'S[mod=nm,form=attr,fin=t]',
+                             'S[mod=nm,form=base,fin=f]',
+                             'S[mod=nm,form=base,fin=t]',
+                             'S[mod=nm,form=cont,fin=f]',
+                             'S[mod=nm,form=cont,fin=t]',
+                             'S[mod=nm,form=da,fin=f]',
+                             'S[mod=nm,form=da,fin=t]',
+                             'S[mod=nm,form=hyp,fin=t]',
+                             'S[mod=nm,form=imp,fin=f]',
+                             'S[mod=nm,form=imp,fin=t]',
+                             'S[mod=nm,form=r,fin=t]',
+                             'S[mod=nm,form=s,fin=t]',
+                             'S[mod=nm,form=stem,fin=f]',
+                             'S[mod=nm,form=stem,fin=t]']
+            self.annotator_fun = japanese_annotator.get(annotator, annotate_XX)
+            self.tokenize = tokenize if tokenize is not None else True
+        else:
+            raise ValueError('DepCCGParser does not support language: '
+                             f'`{lang}`.')
 
         depccg.lang.set_global_language_to(lang)
-        self.annotator_fun = english_annotator.get(annotator, annotate_XX)
         self.supertagger, config = depccg.instance_models.load_model(model,
                                                                      device)
         (self.apply_binary_rules,
@@ -347,7 +404,7 @@ def sentence2diagram(
     def _depccg_parse(
             self,
             sentences: list[list[str]]) -> list[list[depccg.tree.ScoredTree]]:
-        doc = self.annotator_fun(sentences)
+        doc = self.annotator_fun(sentences, tokenize=self.tokenize)
         score_result, categories = self.supertagger.predict_doc(
                 [[token.word for token in sentence] for sentence in doc])