From 385cf9ed1d2462d4ced2b7cdffccc5df2dc7ca3d Mon Sep 17 00:00:00 2001 From: dlutz2 Date: Mon, 26 Sep 2016 09:43:03 -0400 Subject: [PATCH] Switching to Emory lemmatizer --- pom.xml | 8 ++++---- src/org/opensextant/howler/utils/OWLUtils.java | 15 ++++++++++++--- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/pom.xml b/pom.xml index c9aa1c0..f02f976 100644 --- a/pom.xml +++ b/pom.xml @@ -109,11 +109,11 @@ 1.7.14 - + - edu.stanford.nlp - stanford-corenlp - 3.6.0 + edu.emory.mathcs.nlp + nlp4j-morphology + 1.1.2 diff --git a/src/org/opensextant/howler/utils/OWLUtils.java b/src/org/opensextant/howler/utils/OWLUtils.java index fb03991..306d6ec 100644 --- a/src/org/opensextant/howler/utils/OWLUtils.java +++ b/src/org/opensextant/howler/utils/OWLUtils.java @@ -45,12 +45,16 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import edu.stanford.nlp.process.Morphology; +import edu.emory.mathcs.nlp.common.util.StringUtils; +import edu.emory.mathcs.nlp.component.morph.MorphAnalyzer; +import edu.emory.mathcs.nlp.component.morph.english.EnglishMorphAnalyzer; public class OWLUtils { static Map numbers = new HashMap(); - + + static MorphAnalyzer lemmatizer = new EnglishMorphAnalyzer(); + static { numbers.put("one", 1); numbers.put("two", 2); @@ -199,12 +203,17 @@ public static String normalize(String word, String pos, boolean lower) { return word; } + // don't change numbers or Fixed vocab + if (pos.equals("CD") || pos.equals("FIXED")) { + return word; + } + // don't normalize verbs yet if (pos.startsWith("V")) { return word; } - return (Morphology.lemmaStatic(word, pos, lower)); + return lemmatizer.lemmatize(StringUtils.toSimplifiedForm(word, lower), pos); } public static SubjectPredicateObject rewriteSPO(