/**
 * Utilities for conjoining Hangul jamo (Unicode block U+1100..U+11FF).
 *
 * <p>The conversion tables are written with Unicode escapes instead of literal Hangul
 * characters so the compiled constants do not depend on the encoding the compiler
 * assumes for this source file.
 */
public class HangulJamoUtil {

    /** Range of initial consonants (choseong) that have a compatibility counterpart. */
    private static final char CHOSUNG_FIRST = 0x1100;
    private static final char CHOSUNG_LAST = 0x1112;

    /**
     * Compatibility jamo for U+1100..U+1112, in code-point order
     * (G, GG, N, D, DD, R, M, B, BB, S, SS, IEUNG, J, JJ, C, K, T, P, H).
     * The targets are not contiguous, hence a table rather than an offset.
     */
    private static final String CHOSUNG_COMPAT =
            "\u3131\u3132\u3134\u3137\u3138\u3139\u3141\u3142\u3143\u3145"
                    + "\u3146\u3147\u3148\u3149\u314A\u314B\u314C\u314D\u314E";

    /** Range of vowels (jungseong) that have a compatibility counterpart. */
    private static final char JUNGSUNG_FIRST = 0x1161;
    private static final char JUNGSUNG_LAST = 0x1175;

    /**
     * Compatibility code point of the first vowel (A). The 21 vowels U+1161..U+1175 map
     * onto U+314F..U+3163 in the same order, so a constant offset is sufficient.
     */
    private static final char JUNGSUNG_COMPAT_FIRST = 0x314F;

    /** Range of final consonants (jongseong) that have a compatibility counterpart. */
    private static final char JONGSUNG_FIRST = 0x11A8;
    private static final char JONGSUNG_LAST = 0x11C2;

    /**
     * Compatibility jamo for U+11A8..U+11C2, in code-point order
     * (G, GG, GS, N, NJ, NH, D, L, LG, LM, LB, LS, LT, LP, LH, M, B, BS, S, SS,
     * NG, J, C, K, T, P, H).
     */
    private static final String JONGSUNG_COMPAT =
            "\u3131\u3132\u3133\u3134\u3135\u3136\u3137\u3139\u313A\u313B"
                    + "\u313C\u313D\u313E\u313F\u3140\u3141\u3142\u3144\u3145\u3146"
                    + "\u3147\u3148\u314A\u314B\u314C\u314D\u314E";

    /**
     * Replaces every conjoining jamo that has a Hangul Compatibility Jamo counterpart
     * with that counterpart and leaves every other character untouched.
     *
     * <p>Handled ranges: initial consonants U+1100..U+1112, vowels U+1161..U+1175 and
     * final consonants U+11A8..U+11C2. Precomposed syllables, archaic jamo and
     * non-Hangul text pass through unchanged, so the result always has the same length
     * as the input.
     *
     * @param source text to convert; must not be {@code null}
     * @return a new string of the same length with the mapped jamo replaced
     * @throws NullPointerException if {@code source} is {@code null}
     */
    public static String ToHangulCompatibilityJamo(String source) {
        // One output char per input char, so the final size is known up front.
        StringBuilder dest = new StringBuilder(source.length());
        for (int i = 0; i < source.length(); i++) {
            dest.append(toCompatibilityJamo(source.charAt(i)));
        }
        return dest.toString();
    }

    /**
     * Maps a single conjoining jamo to its compatibility form.
     *
     * @param ch character to map
     * @return the compatibility jamo, or {@code ch} itself when it is outside the three
     *         handled ranges
     */
    private static char toCompatibilityJamo(char ch) {
        if (ch >= CHOSUNG_FIRST && ch <= CHOSUNG_LAST) {
            return CHOSUNG_COMPAT.charAt(ch - CHOSUNG_FIRST);
        }
        if (ch >= JUNGSUNG_FIRST && ch <= JUNGSUNG_LAST) {
            return (char) (JUNGSUNG_COMPAT_FIRST + (ch - JUNGSUNG_FIRST));
        }
        if (ch >= JONGSUNG_FIRST && ch <= JONGSUNG_LAST) {
            return JONGSUNG_COMPAT.charAt(ch - JONGSUNG_FIRST);
        }
        return ch;
    }

    /**
     * Collects, in order of appearance, every character of {@code source} that belongs
     * to the {@link Character.UnicodeBlock#HANGUL_JAMO} block.
     *
     * @param source text to scan; must not be {@code null}
     * @return a new mutable list of the matching characters, empty when there are none
     * @throws NullPointerException if {@code source} is {@code null}
     */
    public static List<Character> getHangulJamos(String source) {
        List<Character> jamoList = new ArrayList<>();
        for (int i = 0; i < source.length(); i++) {
            char ch = source.charAt(i);
            if (Character.UnicodeBlock.of(ch) == Character.UnicodeBlock.HANGUL_JAMO) {
                jamoList.add(ch);
            }
        }
        return jamoList;
    }
}
a/src/main/resources/models_full/transition.model and b/src/main/resources/models_full/transition.model differ diff --git a/src/main/resources/models_light/irregular.model b/src/main/resources/models_light/irregular.model index a2e24361..16245fec 100644 Binary files a/src/main/resources/models_light/irregular.model and b/src/main/resources/models_light/irregular.model differ diff --git a/src/main/resources/models_light/observation.model b/src/main/resources/models_light/observation.model index fa2e320c..743d4afb 100644 Binary files a/src/main/resources/models_light/observation.model and b/src/main/resources/models_light/observation.model differ diff --git a/src/main/resources/models_light/transition.model b/src/main/resources/models_light/transition.model index 3de51688..82659d59 100644 Binary files a/src/main/resources/models_light/transition.model and b/src/main/resources/models_light/transition.model differ diff --git a/src/test/java/kr/co/shineware/nlp/komoran/core/KomoranTest.java b/src/test/java/kr/co/shineware/nlp/komoran/core/KomoranTest.java index fdd5c67a..6cd2d8a9 100644 --- a/src/test/java/kr/co/shineware/nlp/komoran/core/KomoranTest.java +++ b/src/test/java/kr/co/shineware/nlp/komoran/core/KomoranTest.java @@ -7,11 +7,14 @@ import kr.co.shineware.nlp.komoran.util.ElapsedTimeChecker; import kr.co.shineware.util.common.file.FileUtil; import kr.co.shineware.util.common.model.Pair; +import kr.co.shineware.util.common.string.StringUtil; import org.junit.Before; import org.junit.Ignore; import org.junit.Test; import java.io.*; +import java.nio.charset.StandardCharsets; +import java.text.Normalizer; import java.util.ArrayList; import java.util.List; @@ -25,9 +28,24 @@ public void init() { this.komoran = new Komoran(DEFAULT_MODEL.LIGHT); } + @Test + public void getUnicode() throws UnsupportedEncodingException { + String korean = "되ᄅ"; + korean = "난"; + printcodePointAndUnicodeBlock(korean); + } + + private void printcodePointAndUnicodeBlock(String korean) { + 
korean = StringUtil.korean2JasoString(korean); + for(int i=0;i nbestResult = this.komoran.analyze("가을", 2); + List nbestResult = this.komoran.analyze("치뜬", 1); for (KomoranResult result : nbestResult) { System.out.println(result.getPlainText()); } diff --git a/src/test/java/kr/co/shineware/nlp/komoran/core/SejongToTrainingData.java b/src/test/java/kr/co/shineware/nlp/komoran/core/SejongToTrainingData.java index 742c8614..73586df4 100644 --- a/src/test/java/kr/co/shineware/nlp/komoran/core/SejongToTrainingData.java +++ b/src/test/java/kr/co/shineware/nlp/komoran/core/SejongToTrainingData.java @@ -1,6 +1,7 @@ package kr.co.shineware.nlp.komoran.core; import kr.co.shineware.nlp.komoran.constant.SEJONGTAGS; +import kr.co.shineware.nlp.komoran.util.HangulJamoUtil; import kr.co.shineware.util.common.file.FileUtil; import kr.co.shineware.util.common.string.StringUtil; import org.junit.Before; @@ -90,6 +91,10 @@ public void loadKEBilingualCorpusAndConvertUTF8() throws IOException { answer += line.replaceAll("", "/").replaceAll("<.+?>", "") + " "; } if (line.startsWith("")) { + + problem = convertUnicodeJamoToJamoCompatibility(problem); + answer = convertUnicodeJamoToJamoCompatibility(answer); + if (!isValidFormat(problem + "\t" + answer)) { problem = ""; answer = ""; @@ -150,7 +155,7 @@ private boolean isValidFormat(String convertedPair) { } String pos = morphPosToken[1].trim(); if (!sejongTagSet.contains(pos)) { - System.out.println("Wrong POS : (" + pos + ")" + convertedPair); +// System.out.println("Wrong POS : (" + pos + ")" + convertedPair); return false; } for (int i = 0; i < pos.length(); i++) { @@ -201,6 +206,7 @@ public void loadKJBilingualCorpusAndConvertUTF8() throws IOException { } if (isHeadArea || isSentenceArea) { + line = convertUnicodeJamoToJamoCompatibility(line); String[] entity = line.split("\t"); if (entity.length != 2) { continue; @@ -213,7 +219,7 @@ public void loadKJBilingualCorpusAndConvertUTF8() throws IOException { } answer = 
answer.replaceAll("\\+", " ").replaceAll(" {2}", " +"); if (!isValidFormat(problem + "\t" + answer)) { - System.out.println(filename + ":" + lineCount + ":" + line); +// System.out.println(filename + ":" + lineCount + ":" + line); continue; } bw.write(problem + "\t" + answer); @@ -248,6 +254,7 @@ public void loadSejongSpeechTextAndConvertUTF8() throws IOException { } if (isTextArea) { + line = convertUnicodeJamoToJamoCompatibility(line); String[] entity = line.split("\t"); if (entity.length != 3) { continue; @@ -266,7 +273,7 @@ public void loadSejongSpeechTextAndConvertUTF8() throws IOException { continue; } if (!isValidFormat(problem + "\t" + answer)) { - System.out.println(filename + ":" + lineCount + ":" + line); +// System.out.println(filename + ":" + lineCount + ":" + line); continue; } bw.write(problem + "\t" + answer); @@ -313,11 +320,12 @@ public void loadSejongTextAndConvertUTF8() throws IOException { if (isHeadArea || isPhraseArea) { try { + line = convertUnicodeJamoToJamoCompatibility(line); String problem = line.split("\t")[1]; String answers = line.split("\t")[2]; answers = answers.replaceAll(" \\+ ", " "); if (!isValidFormat(problem + "\t" + answers)) { - System.out.println(filename + ":" + lineCount + ":" + line); +// System.out.println(filename + ":" + lineCount + ":" + line); continue; } bw.write(problem + "\t" + answers); @@ -334,4 +342,8 @@ public void loadSejongTextAndConvertUTF8() throws IOException { } bw.close(); } + + private String convertUnicodeJamoToJamoCompatibility(String line) { + return HangulJamoUtil.ToHangulCompatibilityJamo(line); + } } diff --git a/src/test/java/kr/co/shineware/nlp/komoran/core/Training.java b/src/test/java/kr/co/shineware/nlp/komoran/core/Training.java index 67ac354f..045dafef 100644 --- a/src/test/java/kr/co/shineware/nlp/komoran/core/Training.java +++ b/src/test/java/kr/co/shineware/nlp/komoran/core/Training.java @@ -2,19 +2,18 @@ import kr.co.shineware.nlp.komoran.corpus.builder.CorpusBuilder; import 
kr.co.shineware.nlp.komoran.modeler.builder.ModelBuilder; -import kr.co.shineware.util.common.file.FileUtil; import org.junit.Ignore; import org.junit.Test; import java.io.File; -import java.util.List; +@Ignore public class Training { @Test public void training() { CorpusBuilder corpusBuilder = new CorpusBuilder(); corpusBuilder.setExclusiveIrrRule("resources/irrDic.remove.txt"); - corpusBuilder.buildPath("tagged_corpus", "tag"); + corpusBuilder.buildPath("D:\\data\\komoran_training_data","refine.txt"); corpusBuilder.save("corpus_build"); ModelBuilder modelBuilder = new ModelBuilder(); diff --git a/src/test/java/kr/co/shineware/nlp/komoran/util/HangulJamoUtilTest.java b/src/test/java/kr/co/shineware/nlp/komoran/util/HangulJamoUtilTest.java new file mode 100644 index 00000000..c4b43f84 --- /dev/null +++ b/src/test/java/kr/co/shineware/nlp/komoran/util/HangulJamoUtilTest.java @@ -0,0 +1,40 @@ +package kr.co.shineware.nlp.komoran.util; + +import org.junit.Ignore; +import org.junit.Test; + +import java.util.List; + +@Ignore +public class HangulJamoUtilTest { + + @Test + public void toHangulCompatibilityJamo() { + String source = "나쁨에\t나쁘/VA ᄆ/ETN 에/JKB:3\n" + + "뿌연\t뿌옇/VA ᆫ/ETM:40\n" + + "쓰임이나\t쓰이/VV ᄆ/ETN 이나/JC:1\n" + + "얽혀져\t얽히/VV 어/EC 지/VX 어/EC:1\n" + + "뿌여\t뿌옇/VA:1\n" + + "읽힌\t읽히/VV ᆫ/ETM:4\n" + + "흘려보낼\t흘려보내/VV ᆯ/ETM:2\n" + + "치달릴\t치달리/VV ᆯ/ETM:2\n" + + "흘려보낸\t흘려보내/VV ᆫ/ETM:1\n" + + "들러간\t들러가/VV ᆫ/ETM:1"; + System.out.println(hasOnlyValidHangul(source)); + System.out.println(source); + System.out.println(); + System.out.println(HangulJamoUtil.ToHangulCompatibilityJamo(source)); + } + + private boolean hasOnlyValidHangul(String line) { + List jamoList = HangulJamoUtil.getHangulJamos(line); + if (jamoList.size() != 0) { + return false; + } + return true; + } + + @Test + public void getHangulJamos() { + } +} \ No newline at end of file