import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.Locale;

/**
 * Light stemmer for Czech: removes case endings from nouns and adjectives,
 * removes possessive adjective endings from names, and takes care of
 * palatalisation.
 *
 * <p>All non-ASCII letters are written as Unicode escapes in the code so the
 * file compiles under any source encoding. Comments spell them in an ASCII
 * notation: a' = U+00E1, c^ = U+010D, e' = U+00E9, e^ = U+011B, i' = U+00ED,
 * s^ = U+0161, u* = U+016F, y' = U+00FD, z^ = U+017E.
 *
 * <p>An instance holds no mutable state, so {@link #stem(String)} may be
 * called repeatedly and from several threads on the same instance.
 *
 * @author Dolamic Ljiljana University of Neuchatel
 */
public class CzechStemmerLight {

    /** -ci, -ce, -c^i, -c^e: the two trailing characters are rewritten to "k". */
    private static final String[] TO_K = {"ci", "ce", "\u010di", "\u010de"};

    /** -zi, -ze, -z^i, -z^e: the two trailing characters are rewritten to "h". */
    private static final String[] TO_H = {"zi", "ze", "\u017ei", "\u017ee"};

    /** -c^te^, -c^ti, -c^ti': the three trailing characters are rewritten to "ck". */
    private static final String[] TO_CK = {"\u010dt\u011b", "\u010dti", "\u010dt\u00ed"};

    /** -s^te^, -s^ti, -s^ti': the three trailing characters are rewritten to "sk". */
    private static final String[] TO_SK = {"\u0161t\u011b", "\u0161ti", "\u0161t\u00ed"};

    /**
     * Three-letter case endings of which the last two letters are dropped and
     * the remaining vowel is handed to {@link #palatalise(StringBuilder)}:
     * -ech, -ich, -i'ch, -e'ho, -e^mi, -emi, -e'mu, -e^te, -e^ti, -iho, -i'ho,
     * -i'mi, -imu.
     */
    private static final String[] CASE3_PALATALISE = {
        "ech", "ich", "\u00edch",
        "\u00e9ho", "\u011bmi", "emi", "\u00e9mu", "\u011bte", "\u011bti",
        "iho", "\u00edho", "\u00edmi", "imu",
    };

    /**
     * Three-letter case endings that are simply removed:
     * -a'ch, -ata, -aty, -y'ch, -ama, -ami, -ove', -ovi, -y'mi.
     */
    private static final String[] CASE3_DROP = {
        "\u00e1ch", "ata", "aty", "\u00fdch", "ama", "ami", "ov\u00e9", "ovi", "\u00fdmi",
    };

    /**
     * Two-letter case endings of which the last letter is dropped and the
     * remaining vowel is handed to {@link #palatalise(StringBuilder)}:
     * -em, -es, -e'm, -i'm.
     */
    private static final String[] CASE2_PALATALISE = {"em", "es", "\u00e9m", "\u00edm"};

    /** Two-letter case endings that are simply removed: -u*m, -at, -a'm, -os, -us, -y'm, -mi, -ou. */
    private static final String[] CASE2_DROP = {
        "\u016fm", "at", "\u00e1m", "os", "us", "\u00fdm", "mi", "ou",
    };

    /** One-letter endings handed straight to {@link #palatalise(StringBuilder)}: -e, -i, -i', -e^. */
    private static final String[] CASE1_PALATALISE = {"e", "i", "\u00ed", "\u011b"};

    /** One-letter endings that are simply removed: -u, -y, -u*, -a, -o, -a', -e', -y'. */
    private static final String[] CASE1_DROP = {
        "u", "y", "\u016f", "a", "o", "\u00e1", "\u00e9", "\u00fd",
    };

    /**
     * Default constructor.
     */
    public CzechStemmerLight() {
    }

    /**
     * Stems a single word.
     *
     * <p>The word is lower-cased, then at most one case ending and at most one
     * possessive ending (-ov, -u*v, -in) are removed, in that order.
     *
     * @param input the word to stem; must not be {@code null}
     * @return the stemmed, lower-cased word
     * @throws NullPointerException if {@code input} is {@code null}
     */
    public String stem(String input) {
        // Locale.ROOT keeps the result independent of the JVM default locale;
        // with a Turkish default, plain toLowerCase() maps 'I' to a dotless i.
        StringBuilder word = new StringBuilder(input.toLowerCase(Locale.ROOT));

        removeCase(word);
        removePossessives(word);

        return word.toString();
    }

    /**
     * Undoes palatalisation at the end of {@code word}, or, when no
     * palatalised ending is present, removes the last character.
     *
     * <p>Callers invoke this with the (first letter of the) ending still in
     * place, so exactly one of the rewrites or the single-character removal
     * is applied.
     */
    private static void palatalise(StringBuilder word) {
        int len = word.length();
        if (len == 0) {
            return; // nothing to rewrite or remove
        }

        if (endsWithAny(word, TO_K)) {
            word.replace(len - 2, len, "k");
        } else if (endsWithAny(word, TO_H)) {
            word.replace(len - 2, len, "h");
        } else if (endsWithAny(word, TO_CK)) {
            word.replace(len - 3, len, "ck");
        } else if (endsWithAny(word, TO_SK)) {
            word.replace(len - 3, len, "sk");
        } else {
            word.setLength(len - 1);
        }
    }

    /**
     * Removes the possessive endings -ov, -u*v and -in from words longer than
     * five characters; -in additionally goes through palatalisation.
     */
    private static void removePossessives(StringBuilder word) {
        int len = word.length();
        if (len <= 5) {
            return;
        }

        if (endsWith(word, "ov") || endsWith(word, "\u016fv")) {
            word.setLength(len - 2);
            return;
        }
        if (endsWith(word, "in")) {
            // Drop the 'n' only; palatalise() consumes the remaining 'i'.
            word.setLength(len - 1);
            palatalise(word);
        }
    }

    /**
     * Removes at most one case ending from {@code word}.
     *
     * <p>Endings are tried from longest to shortest, and each ending length
     * is only considered when the word is longer than a minimum (8, 7, 6, 5
     * and 4 characters for endings of 5, 4, 3, 2 and 1 letters respectively).
     * The first matching ending wins.
     */
    private static void removeCase(StringBuilder word) {
        int len = word.length();

        if (len > 7 && endsWith(word, "atech")) {
            word.setLength(len - 5);
            return;
        }

        if (len > 6) {
            if (endsWith(word, "\u011btem")) { // -e^tem
                // Keep the leading e^ so palatalise() can inspect it.
                word.setLength(len - 3);
                palatalise(word);
                return;
            }
            if (endsWith(word, "at\u016fm")) { // -atu*m
                word.setLength(len - 4);
                return;
            }
        }

        if (len > 5) {
            if (endsWithAny(word, CASE3_PALATALISE)) {
                word.setLength(len - 2);
                palatalise(word);
                return;
            }
            if (endsWithAny(word, CASE3_DROP)) {
                word.setLength(len - 3);
                return;
            }
        }

        if (len > 4) {
            if (endsWithAny(word, CASE2_PALATALISE)) {
                word.setLength(len - 1);
                palatalise(word);
                return;
            }
            if (endsWithAny(word, CASE2_DROP)) {
                word.setLength(len - 2);
                return;
            }
        }

        if (len > 3) {
            if (endsWithAny(word, CASE1_PALATALISE)) {
                palatalise(word);
                return;
            }
            if (endsWithAny(word, CASE1_DROP)) {
                word.setLength(len - 1);
            }
        }
    }

    /** Returns whether {@code word} ends with {@code suffix}; false when the word is shorter. */
    private static boolean endsWith(CharSequence word, String suffix) {
        int offset = word.length() - suffix.length();
        if (offset < 0) {
            return false;
        }
        for (int i = 0; i < suffix.length(); i++) {
            if (word.charAt(offset + i) != suffix.charAt(i)) {
                return false;
            }
        }
        return true;
    }

    /** Returns whether {@code word} ends with at least one of {@code suffixes}. */
    private static boolean endsWithAny(CharSequence word, String[] suffixes) {
        for (String suffix : suffixes) {
            if (endsWith(word, suffix)) {
                return true;
            }
        }
        return false;
    }

    /** Prints the command-line synopsis to standard error. */
    private static void usage() {
        System.err.println("Usage: TestApp <algorithm> [<input file>] [-o <output file>]");
    }

    /**
     * Stems and writes the word collected in {@code pending} followed by a
     * newline, then empties {@code pending}.
     */
    private static void writeStem(CzechStemmerLight stemmer, StringBuilder pending, Writer output)
            throws IOException {
        output.write(stemmer.stem(pending.toString()));
        output.write('\n');
        pending.setLength(0);
    }

    /**
     * Reads whitespace-separated words from {@code reader} and writes one stem
     * per line to {@code output}, flushing at the end.
     *
     * <p>Every whitespace character ends a word, so consecutive whitespace
     * characters produce empty output lines. A final word that is not followed
     * by whitespace is stemmed as well.
     */
    private static void stemWords(CzechStemmerLight stemmer, Reader reader, Writer output)
            throws IOException {
        StringBuilder pending = new StringBuilder();
        int character;
        while ((character = reader.read()) != -1) {
            char ch = (char) character;
            if (Character.isWhitespace(ch)) {
                writeStem(stemmer, pending, output);
            } else {
                // Only ASCII is lower-cased here; stem() lower-cases the rest.
                pending.append(ch < 127 ? Character.toLowerCase(ch) : ch);
            }
        }
        if (pending.length() > 0) {
            // Input ended without trailing whitespace: do not lose the last word.
            writeStem(stemmer, pending, output);
        }
        output.flush();
    }

    /**
     * Command-line entry point: stems every word of the input, one stem per
     * output line. Input and output are UTF-8.
     *
     * <p>Arguments: {@code args[0]} is required but never read (presumably the
     * algorithm name, for command-line compatibility with Snowball's TestApp
     * -- TODO confirm); an optional input file follows (standard input when
     * absent), then an optional {@code -o <output file>} (standard output when
     * absent). Invalid arguments print the usage text and return.
     *
     * @param args command-line arguments as described above
     * @throws IOException if a file cannot be opened or an I/O error occurs
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 1) {
            usage();
            return;
        }

        int arg = 1;

        String inputPath = null;
        if (args.length > arg && !args[arg].equals("-o")) {
            inputPath = args[arg++];
        }

        String outputPath = null;
        if (args.length > arg) {
            if (args.length != arg + 2 || !args[arg].equals("-o")) {
                usage();
                return;
            }
            outputPath = args[arg + 1];
        }

        // Files are opened only after the arguments are known to be valid, and
        // only file streams are closed; System.in / System.out stay open.
        InputStream instream = inputPath != null ? new FileInputStream(inputPath) : System.in;
        try {
            OutputStream outstream =
                    outputPath != null ? new FileOutputStream(outputPath) : System.out;
            try {
                Reader reader =
                        new BufferedReader(new InputStreamReader(instream, StandardCharsets.UTF_8));
                Writer output =
                        new BufferedWriter(new OutputStreamWriter(outstream, StandardCharsets.UTF_8));
                stemWords(new CzechStemmerLight(), reader, output);
            } finally {
                if (outputPath != null) {
                    outstream.close();
                }
            }
        } finally {
            if (inputPath != null) {
                instream.close();
            }
        }
    }
}
+//
+// Some sources also list 'm' and 'n' as syllabic consonants for Czech but they
+// seem to be much rarer and including them makes no difference to the results
+// of stemming any words in our sample vocabulary list.
+define v_or_syllabic_c v + 'lr'
+
+define mark_regions as (
+
+    $p1 = limit  // default: p1 at the end of the word
+    test(hop 3 setmark x)  // x = position after the first 3 characters, when the word has that many
+
+    do (
+        // A syllabic consonant must occur between two consonants, or be
+        // preceded by a consonant and at the end of the word.
+        //
+        // Instead of literally testing that, we handle the first character
+        // specially by only checking if it's a vowel; for subsequent
+        // characters we know that the character before is a consonant because
+        // otherwise we'd have stopped already.
+        //
+        // We also don't actually need to check the character after, since
+        // if it's a vowel then that vowel means we'd end up at the same
+        // position after `gopast non-v` anyway, and if it's the end of the
+        // word then there's no non-v after it.
+        (v or (next gopast v_or_syllabic_c)) gopast non-v
+        setmark p1
+        try($p1 < x $p1 = x) // at least 3
+    )
+)
+
+backwardmode (
+
+    define R1 as $p1 <= cursor  // true when the cursor is at or after p1
+
+    define palatalise_e as (
+        [substring] among (
+            'c' '{c^}' (<- 'k')  // c, c-caron -> k
+            'z' '{z^}' (<- 'h')  // z, z-caron -> h
+        )
+    )
+
+    define palatalise_ecaron_or_iacute as (
+        [substring] among (
+            '{c^}t' (<- 'ck')  // c-caron + t -> ck
+            '{s^}t' (<- 'sk')  // s-caron + t -> sk
+        )
+    )
+
+    define palatalise_i as (  // all four rewrites of the two routines above
+        [substring] among (
+            'c' '{c^}' (<- 'k')
+            'z' '{z^}' (<- 'h')
+            '{c^}t' (<- 'ck')
+            '{s^}t' (<- 'sk')
+        )
+    )
+
+    define possessive_suffix as (
+        [substring] R1 among (  // the matched suffix must satisfy R1
+            'ov' '{u*}v'
+            (delete)
+            'in'
+            (
+                delete
+                try palatalise_i
+            )
+        )
+    )
+
+    define case_suffix as (
+        setlimit tomark p1 for ( [substring] ) among (  // suffix search is limited at p1
+            'atech'
+            'at{u*}m'
+            '{a'}ch' '{y'}ch' 'ov{e'}' '{y'}mi'
+            'ata' 'aty' 'ama' 'ami' 'ovi'
+            'at' '{a'}m' 'us' '{u*}m' '{y'}m' 'mi' 'ou'
+            '{e'}ho' '{e'}m' '{e'}mu'
+            'u' 'y' '{u*}' 'a' 'o' '{a'}' '{e'}' '{y'}'
+            (delete)  // plain removal, no palatalisation
+            '{e^}' '{e^}tem' '{e^}mi' '{e^}te' '{e^}ti'
+            (
+                delete
+                try palatalise_ecaron_or_iacute
+            )
+            'e' 'ech' 'em' 'emi' 'ete' 'etem'
+            (
+                delete
+                try palatalise_e
+            )
+            'i'
+            (
+                delete
+                try palatalise_i
+            )
+            '{i'}' '{i'}ch' '{i'}ho' '{i'}m' '{i'}mi' '{i'}mu'
+            (
+                delete
+                try palatalise_ecaron_or_iacute
+            )
+        )
+    )
+)
+
+define stem as (
+    do mark_regions
+    backwards (
+        do case_suffix  // case endings first, then possessive endings
+        do possessive_suffix
+    )
+)
+
+// Ljiljana Dolamic and Jacques Savoy. 2009.
+// Indexing and stemming approaches for the Czech language.
+// Inf. Process. Manage. 45, 6 (November 2009), 714-720.
+// based on Java code by Ljiljana Dolamic:
+// http://members.unine.ch/jacques.savoy/clef/CzechStemmerLight.txt
diff --git a/libstemmer/modules.txt b/libstemmer/modules.txt
index cd36a219..58df1464 100644
--- a/libstemmer/modules.txt
+++ b/libstemmer/modules.txt
@@ -13,6 +13,7 @@ arabic UTF_8 arabic,ar,ara
 armenian UTF_8 armenian,hy,hye,arm
 basque UTF_8,ISO_8859_1 basque,eu,eus,baq
 catalan UTF_8,ISO_8859_1 catalan,ca,cat
+czech UTF_8,ISO_8859_2 czech,cs,ces,cze
 danish UTF_8,ISO_8859_1 danish,da,dan
 dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld
 english UTF_8,ISO_8859_1 english,en,eng