From d7d3c8eb453efc1ef601268812369c59ae58d146 Mon Sep 17 00:00:00 2001
From: Clinton Gormley
Date: Fri, 23 May 2014 10:07:09 +0200
Subject: [PATCH] Moved the multilingual chapter to the beginning of languages

---
 02_Dealing_with_language.asciidoc             |  17 +--
 200_Identifying_words.asciidoc                |  12 ---
 200_Language_analyzers.asciidoc               |  10 ++
 200_Language_analyzers/00_Intro.asciidoc      |  47 ++++++++
 200_Language_analyzers/10_Using.asciidoc      | 101 ++++++++++++++++++
 .../20_Configuring.asciidoc                   |  65 +++++++++++
 .../30_Multiple_languages.asciidoc            |  12 +++
 210_Identifying_words.asciidoc                |  21 ++++
 .../00_Intro.asciidoc                         |   2 +-
 .../10_Standard_analyzer.asciidoc             |   0
 .../20_Standard_tokenizer.asciidoc            |   0
 .../30_ICU_plugin.asciidoc                    |   0
 .../40_ICU_tokenizer.asciidoc                 |   0
 .../50_Tidying_text.asciidoc                  |   0
 210_Token_normalization.asciidoc              |  14 ---
 220_Stemming.asciidoc                         |  20 ----
 220_Token_normalization.asciidoc              |  17 +++
 .../00_Intro.asciidoc                         |   0
 .../10_Lowercasing.asciidoc                   |   0
 .../20_Removing_diacritics.asciidoc           |   0
 .../30_Unicode_world.asciidoc                 |   0
 .../40_Case_folding.asciidoc                  |   0
 .../50_Character_folding.asciidoc             |   2 +-
 .../60_Sorting_and_collations.asciidoc        |   0
 230_Stemming.asciidoc                         |  32 ++++++
 230_Stemming/00_Intro.asciidoc                |  74 +++++++++++++
 230_Stemming/10_Algorithmic_stemmers.asciidoc |  63 +++++++++++
 240_Multilingual.asciidoc                     |  10 --
 ...opwords.asciidoc => 240_Stopwords.asciidoc |   0
 ...Synonyms.asciidoc => 260_Synonyms.asciidoc |   0
 ...ng.asciidoc => 270_Fuzzy_matching.asciidoc |   0
 .../010_Intro.asciidoc                        |   0
 ...esters.asciidoc => 280_Suggesters.asciidoc |   0
 book.asciidoc                                 |  16 +--
 34 files changed, 462 insertions(+), 73 deletions(-)
 delete mode 100644 200_Identifying_words.asciidoc
 create mode 100644 200_Language_analyzers.asciidoc
 create mode 100644 200_Language_analyzers/00_Intro.asciidoc
 create mode 100644 200_Language_analyzers/10_Using.asciidoc
 create mode 100644 200_Language_analyzers/20_Configuring.asciidoc
 create mode 100644 200_Language_analyzers/30_Multiple_languages.asciidoc
 create mode 100644 210_Identifying_words.asciidoc
 rename {200_Identifying_words => 210_Identifying_words}/00_Intro.asciidoc (94%)
 rename {200_Identifying_words => 210_Identifying_words}/10_Standard_analyzer.asciidoc (100%)
 rename {200_Identifying_words => 210_Identifying_words}/20_Standard_tokenizer.asciidoc (100%)
 rename {200_Identifying_words => 210_Identifying_words}/30_ICU_plugin.asciidoc (100%)
 rename {200_Identifying_words => 210_Identifying_words}/40_ICU_tokenizer.asciidoc (100%)
 rename {200_Identifying_words => 210_Identifying_words}/50_Tidying_text.asciidoc (100%)
 delete mode 100644 210_Token_normalization.asciidoc
 delete mode 100644 220_Stemming.asciidoc
 create mode 100644 220_Token_normalization.asciidoc
 rename {210_Token_normalization => 220_Token_normalization}/00_Intro.asciidoc (100%)
 rename {210_Token_normalization => 220_Token_normalization}/10_Lowercasing.asciidoc (100%)
 rename {210_Token_normalization => 220_Token_normalization}/20_Removing_diacritics.asciidoc (100%)
 rename {210_Token_normalization => 220_Token_normalization}/30_Unicode_world.asciidoc (100%)
 rename {210_Token_normalization => 220_Token_normalization}/40_Case_folding.asciidoc (100%)
 rename {210_Token_normalization => 220_Token_normalization}/50_Character_folding.asciidoc (98%)
 rename {210_Token_normalization => 220_Token_normalization}/60_Sorting_and_collations.asciidoc (100%)
 create mode 100644 230_Stemming.asciidoc
 create mode 100644 230_Stemming/00_Intro.asciidoc
 create mode 100644 230_Stemming/10_Algorithmic_stemmers.asciidoc
 delete mode 100644 240_Multilingual.asciidoc
 rename 230_Stopwords.asciidoc => 240_Stopwords.asciidoc (100%)
 rename 250_Synonyms.asciidoc => 260_Synonyms.asciidoc (100%)
 rename 260_Fuzzy_matching.asciidoc => 270_Fuzzy_matching.asciidoc (100%)
 rename {260_Fuzzy_matching => 270_Fuzzy_matching}/010_Intro.asciidoc (100%)
 rename 270_Suggesters.asciidoc => 280_Suggesters.asciidoc (100%)

diff --git a/02_Dealing_with_language.asciidoc b/02_Dealing_with_language.asciidoc
index 5f1d206f4..ec27cbeb8 100644
--- a/02_Dealing_with_language.asciidoc
+++ b/02_Dealing_with_language.asciidoc
@@ -7,11 +7,13 @@
 [quote,Matt Groening]
 ``I know all those words, but that sentence makes no sense to me.''
 
-In <> we covered the mechanics of search, but we didn't pay
-much attention to the words themselves. It is not enough for full text search
-to just match the exact words that the user has queried. Instead, we need to
+Full text search is a battle between _precision_ -- returning as few
+irrelevant documents as possible -- and _recall_ -- returning as many relevant
+documents as possible. While matching only the exact words that the user has
+queried would be precise, it is not enough. We would miss out on many
+documents that the user would consider to be relevant. Instead, we need to
 spread the net wider, to also search for words that are not exactly the same
-as the original, but are related.
+as the original but are related.
 
 Wouldn't you expect a search for ``quick brown fox'' to match a document
 containing ``fast brown foxes'', ``Johnny Walker'' to match ``Johnnie
@@ -46,8 +48,9 @@ There are several lines of attack:
 that we know exists in the index, and a _did-you-mean_ suggester to redirect
 users who may have mistyped a search term. See <>.
 
-But before we can manipulate individual words, we need to divide text up into
-words, which means that we need to know what constitutes a _word_, which we
-will tackle in <>.
+Before we can manipulate individual words, we need to divide text up into
+words, which means that we need to know what constitutes a _word_. We will
+tackle this in <>.
+
+But first, let's take a look at how to get started quickly and easily.
 --
diff --git a/200_Identifying_words.asciidoc b/200_Identifying_words.asciidoc
deleted file mode 100644
index 44f9e3f90..000000000
--- a/200_Identifying_words.asciidoc
+++ /dev/null
@@ -1,12 +0,0 @@
-include::200_Identifying_words/00_Intro.asciidoc[]
-
-include::200_Identifying_words/10_Standard_analyzer.asciidoc[]
-
-include::200_Identifying_words/20_Standard_tokenizer.asciidoc[]
-
-include::200_Identifying_words/30_ICU_plugin.asciidoc[]
-
-include::200_Identifying_words/40_ICU_tokenizer.asciidoc[]
-
-include::200_Identifying_words/50_Tidying_text.asciidoc[]
-
diff --git a/200_Language_analyzers.asciidoc b/200_Language_analyzers.asciidoc
new file mode 100644
index 000000000..598858b73
--- /dev/null
+++ b/200_Language_analyzers.asciidoc
@@ -0,0 +1,10 @@
+include::200_Language_analyzers/00_Intro.asciidoc[]
+
+include::200_Language_analyzers/10_Using.asciidoc[]
+
+include::200_Language_analyzers/20_Configuring.asciidoc[]
+
+include::200_Language_analyzers/30_Multiple_languages.asciidoc[]
+
+
+
diff --git a/200_Language_analyzers/00_Intro.asciidoc b/200_Language_analyzers/00_Intro.asciidoc
new file mode 100644
index 000000000..0d3f5618d
--- /dev/null
+++ b/200_Language_analyzers/00_Intro.asciidoc
@@ -0,0 +1,47 @@
+[[language-analyzers]]
+== Language analyzers
+
+Elasticsearch ships with a collection of language analyzers which provide
+good, basic, out-of-the-box support for a number of the world's most common
+languages:
+
+Arabic, Armenian, Basque, Brazilian, Bulgarian, Catalan, Chinese,
+Czech, Danish, Dutch, English, Finnish, French, Galician, German, Greek,
+Hindi, Hungarian, Indonesian, Italian, Japanese, Korean, Norwegian, Persian,
+Portuguese, Romanian, Russian, Spanish, Swedish, Turkish, and Thai.
+
+These analyzers typically perform four roles:
+
+* Tokenize text into individual words:
++
+`The quick brown foxes` -> [`The`, `quick`, `brown`, `foxes`]
+
+* Lowercase tokens:
++
+`The` -> `the`
+
+* Remove common _stopwords_:
++
+[`The`, `quick`, `brown`, `foxes`] -> [`quick`, `brown`, `foxes`]
+
+* Stem tokens to their root form:
++
+`foxes` -> `fox`
+
+Each analyzer may also apply other transformations specific to its language in
+order to make words from that language more searchable:
+
+* the `english` analyzer removes the possessive `'s`:
++
+`John's` -> `john`
+
+* the `french` analyzer removes _elisions_ like `l'` and `qu'` and
+  _diacritics_ like `¨` or `^`:
++
+`l'église` -> `eglis`
+
+* the `german` analyzer normalizes terms, replacing `ä` and `ae` with `a`, or
+  `ß` with `ss`, among others:
++
+`äußerst` -> `ausserst`
+
diff --git a/200_Language_analyzers/10_Using.asciidoc b/200_Language_analyzers/10_Using.asciidoc
new file mode 100644
index 000000000..f93429c97
--- /dev/null
+++ b/200_Language_analyzers/10_Using.asciidoc
@@ -0,0 +1,101 @@
+[[using-language-analyzers]]
+=== Using language analyzers
+
+The built-in language analyzers are available globally and don't need to be
+configured before being used. They can be specified directly in the field
+mapping:
+
+[source,js]
+--------------------------------------------------
+PUT /my_index
+{
+  "mappings": {
+    "blog": {
+      "properties": {
+        "title": {
+          "type":     "string",
+          "analyzer": "english" <1>
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+<1> The `title` field will use the `english` analyzer instead of the default
+    `standard` analyzer.
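+
+A search on this field uses the same analyzer at query time, so a query for
+`foxes` will also match documents that mention only `fox`. (A quick sketch --
+the query below is an illustration, not part of the original example.)
+
+[source,js]
+--------------------------------------------------
+GET /my_index/blog/_search
+{
+  "query": {
+    "match": {
+      "title": "foxes" <1>
+    }
+  }
+}
+--------------------------------------------------
+<1> The query string is analyzed with the `english` analyzer too, so `foxes`
+    is stemmed to `fox` before the terms are looked up in the index.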
+
+Of course, by passing text through the `english` analyzer, we lose
+information:
+
+[source,js]
+--------------------------------------------------
+GET /my_index/_analyze?field=title <1>
+I'm not happy about the foxes
+--------------------------------------------------
+<1> Emits tokens: `i'm`, `happi`, `about`, `fox`
+
+We can't tell if the document mentions one `fox` or many `foxes`; the word
+`not` is a stopword and is removed, so we can't tell whether the document is
+happy about foxes or *not*. By using the `english` analyzer, we have increased
+recall as we can match more loosely, but we have reduced our ability to rank
+documents accurately.
+
+To get the best of both worlds, we can use <> to
+index the `title` field twice: once with the `english` analyzer and once with
+the `standard` analyzer:
+
+[source,js]
+--------------------------------------------------
+PUT /my_index
+{
+  "mappings": {
+    "blog": {
+      "properties": {
+        "title": { <1>
+          "type": "string",
+          "fields": {
+            "english": { <2>
+              "type":     "string",
+              "analyzer": "english"
+            }
+          }
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+<1> The main `title` field uses the `standard` analyzer.
+<2> The `title.english` sub-field uses the `english` analyzer.
+
+With this mapping in place, we can index some test documents to demonstrate
+how to use both fields at query time:
+
+[source,js]
+--------------------------------------------------
+PUT /my_index/blog/1
+{ "title": "I'm happy for this fox" }
+
+PUT /my_index/blog/2
+{ "title": "I'm not happy about my fox problem" }
+
+GET /_search
+{
+  "query": {
+    "multi_match": {
+      "type":   "most_fields", <1>
+      "query":  "not happy foxes",
+      "fields": [ "title", "title.english" ]
+    }
+  }
+}
+--------------------------------------------------
+<1> Use the <> query type to match the
+    same text in as many fields as possible.
+
+Even though neither of our documents contains the word `foxes`, both documents
+are returned as results thanks to the word stemming on the `title.english`
+field. The second document is ranked as more relevant because the word `not`
+matches on the `title` field.
+
+
diff --git a/200_Language_analyzers/20_Configuring.asciidoc b/200_Language_analyzers/20_Configuring.asciidoc
new file mode 100644
index 000000000..159a001fc
--- /dev/null
+++ b/200_Language_analyzers/20_Configuring.asciidoc
@@ -0,0 +1,65 @@
+[[configuring-language-analyzers]]
+=== Configuring language analyzers
+
+While the language analyzers can be used out of the box without any
+configuration, most of them do allow you to control aspects of their
+behaviour, specifically:
+
+Stem word exclusion::
++
+Imagine, for instance, that users searching for the ``World Health
+Organization'' are instead getting results for ``organ health''. The reason
+for this confusion is that both ``organ'' and ``organization'' are stemmed to
+the same root word: `organ`. Often this isn't a problem, but in this
+particular collection of documents this leads to confusing results. We would
+like to prevent the words `organization` and `organizations` from being
+stemmed.
+
+Custom stopwords::
++
+The default list of stopwords used in English is:
++
+    a, an, and, are, as, at, be, but, by, for, if, in, into, is, it,
+    no, not, of, on, or, such, that, the, their, then, there, these,
+    they, this, to, was, will, with
++
+The unusual thing about `no` and `not` is that they invert the meaning of the
+words that follow them.
+Perhaps we decide that these two words are important and that we shouldn't
+treat them as stopwords.
+
+In order to customize the behaviour of the `english` analyzer, we need to
+create a custom analyzer which uses the `english` analyzer as its base, but
+adds some configuration:
+
+[source,js]
+--------------------------------------------------
+PUT /my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_english": {
+          "type": "english",
+          "stem_exclusion": [ "organization", "organizations" ], <1>
+          "stopwords": [ <2>
+            "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
+            "if", "in", "into", "is", "it", "of", "on", "or", "such", "that",
+            "the", "their", "then", "there", "these", "they", "this", "to",
+            "was", "will", "with"
+          ]
+        }
+      }
+    }
+  }
+}
+
+GET /my_index/_analyze?analyzer=my_english <3>
+The World Health Organization does not sell organs.
+--------------------------------------------------
+<1> Prevents `organization` and `organizations` from being stemmed.
+<2> Specifies a custom list of stopwords.
+<3> Emits tokens `world`, `health`, `organization`, `doe`, `not`, `sell`, `organ`.
+
+We will discuss stemming and stopwords in much more detail in <> and
+<> respectively.
+
diff --git a/200_Language_analyzers/30_Multiple_languages.asciidoc b/200_Language_analyzers/30_Multiple_languages.asciidoc
new file mode 100644
index 000000000..dd5c20afe
--- /dev/null
+++ b/200_Language_analyzers/30_Multiple_languages.asciidoc
@@ -0,0 +1,12 @@
+[[multiple-languages]]
+=== Handling multiple languages
+
+If you only have to deal with a single language, count yourself lucky.
+Finding the right strategy for handling documents written in several languages
+can be challenging. There are three possible scenarios:
+
+* each document contains text in a single language
+* a document may contain more than one language, but each field contains
+  text in a single language
+* a single field may contain a mixture of languages
+
diff --git a/210_Identifying_words.asciidoc b/210_Identifying_words.asciidoc
new file mode 100644
index 000000000..5848af766
--- /dev/null
+++ b/210_Identifying_words.asciidoc
@@ -0,0 +1,21 @@
+include::210_Identifying_words/00_Intro.asciidoc[]
+
+include::210_Identifying_words/10_Standard_analyzer.asciidoc[]
+
+include::210_Identifying_words/20_Standard_tokenizer.asciidoc[]
+
+include::210_Identifying_words/30_ICU_plugin.asciidoc[]
+
+include::210_Identifying_words/40_ICU_tokenizer.asciidoc[]
+
+include::210_Identifying_words/50_Tidying_text.asciidoc[]
+
+//////////////////
+
+Compound words
+
+language specific
+  - kuromoji
+  - chinese
+
+//////////////////
diff --git a/200_Identifying_words/00_Intro.asciidoc b/210_Identifying_words/00_Intro.asciidoc
similarity index 94%
rename from 200_Identifying_words/00_Intro.asciidoc
rename to 210_Identifying_words/00_Intro.asciidoc
index 8b7872619..f8a5cfe0e 100644
--- a/200_Identifying_words/00_Intro.asciidoc
+++ b/210_Identifying_words/00_Intro.asciidoc
@@ -15,7 +15,7 @@ constituent parts.
 Asian languages are even more complex: some have no whitespace between
 words, sentences or even paragraphs. Some words can be represented by a
 single character, but the same single character, when placed next to other
-characters, can form just a part of a longer word with a quite different
+characters, can form just one part of a longer word with a quite different
 meaning.
 
 It should be obvious that there is no ``silver bullet'' analyzer that will
diff --git a/200_Identifying_words/10_Standard_analyzer.asciidoc b/210_Identifying_words/10_Standard_analyzer.asciidoc
similarity index 100%
rename from 200_Identifying_words/10_Standard_analyzer.asciidoc
rename to 210_Identifying_words/10_Standard_analyzer.asciidoc
diff --git a/200_Identifying_words/20_Standard_tokenizer.asciidoc b/210_Identifying_words/20_Standard_tokenizer.asciidoc
similarity index 100%
rename from 200_Identifying_words/20_Standard_tokenizer.asciidoc
rename to 210_Identifying_words/20_Standard_tokenizer.asciidoc
diff --git a/200_Identifying_words/30_ICU_plugin.asciidoc b/210_Identifying_words/30_ICU_plugin.asciidoc
similarity index 100%
rename from 200_Identifying_words/30_ICU_plugin.asciidoc
rename to 210_Identifying_words/30_ICU_plugin.asciidoc
diff --git a/200_Identifying_words/40_ICU_tokenizer.asciidoc b/210_Identifying_words/40_ICU_tokenizer.asciidoc
similarity index 100%
rename from 200_Identifying_words/40_ICU_tokenizer.asciidoc
rename to 210_Identifying_words/40_ICU_tokenizer.asciidoc
diff --git a/200_Identifying_words/50_Tidying_text.asciidoc b/210_Identifying_words/50_Tidying_text.asciidoc
similarity index 100%
rename from 200_Identifying_words/50_Tidying_text.asciidoc
rename to 210_Identifying_words/50_Tidying_text.asciidoc
diff --git a/210_Token_normalization.asciidoc b/210_Token_normalization.asciidoc
deleted file mode 100644
index d73d4567c..000000000
--- a/210_Token_normalization.asciidoc
+++ /dev/null
@@ -1,14 +0,0 @@
-include::210_Token_normalization/00_Intro.asciidoc[]
-
-include::210_Token_normalization/10_Lowercasing.asciidoc[]
-
-include::210_Token_normalization/20_Removing_diacritics.asciidoc[]
-
-include::210_Token_normalization/30_Unicode_world.asciidoc[]
-
-include::210_Token_normalization/40_Case_folding.asciidoc[]
-
-include::210_Token_normalization/50_Character_folding.asciidoc[]
-
-include::210_Token_normalization/60_Sorting_and_collations.asciidoc[]
-
diff --git a/220_Stemming.asciidoc b/220_Stemming.asciidoc
deleted file mode 100644
index 05dbb3f70..000000000
--- a/220_Stemming.asciidoc
+++ /dev/null
@@ -1,20 +0,0 @@
-[[stemming]]
-== Reducing words to their root form
-
-Stemming vs lemmatization
-
-algorithmic stemmers
-    porter stem token filter
-    stemmer token filter
-    kstem token filter
-    snowball token filter
-
-dictionary based
-    hunspell
-
-controlling stemmers
-    stemmer override token filter
-    keyword marker token filter
-
-    keyword repeat token filter
-    unique token filter
diff --git a/220_Token_normalization.asciidoc b/220_Token_normalization.asciidoc
new file mode 100644
index 000000000..6ddb3695d
--- /dev/null
+++ b/220_Token_normalization.asciidoc
@@ -0,0 +1,17 @@
+include::220_Token_normalization/00_Intro.asciidoc[]
+
+include::220_Token_normalization/10_Lowercasing.asciidoc[]
+
+include::220_Token_normalization/20_Removing_diacritics.asciidoc[]
+
+include::220_Token_normalization/30_Unicode_world.asciidoc[]
+
+include::220_Token_normalization/40_Case_folding.asciidoc[]
+
+include::220_Token_normalization/50_Character_folding.asciidoc[]
+
+// TODO: Add normalization character filter with ngram tokenizer for decompounding german
+// German ngrams should be 4, not 3
+
+include::220_Token_normalization/60_Sorting_and_collations.asciidoc[]
+
diff --git a/210_Token_normalization/00_Intro.asciidoc b/220_Token_normalization/00_Intro.asciidoc
similarity index 100%
rename from 210_Token_normalization/00_Intro.asciidoc
rename to 220_Token_normalization/00_Intro.asciidoc
diff --git a/210_Token_normalization/10_Lowercasing.asciidoc b/220_Token_normalization/10_Lowercasing.asciidoc
similarity index 100%
rename from 210_Token_normalization/10_Lowercasing.asciidoc
rename to 220_Token_normalization/10_Lowercasing.asciidoc
diff --git a/210_Token_normalization/20_Removing_diacritics.asciidoc b/220_Token_normalization/20_Removing_diacritics.asciidoc
similarity index 100%
rename from 210_Token_normalization/20_Removing_diacritics.asciidoc
rename to 220_Token_normalization/20_Removing_diacritics.asciidoc
diff --git a/210_Token_normalization/30_Unicode_world.asciidoc b/220_Token_normalization/30_Unicode_world.asciidoc
similarity index 100%
rename from 210_Token_normalization/30_Unicode_world.asciidoc
rename to 220_Token_normalization/30_Unicode_world.asciidoc
diff --git a/210_Token_normalization/40_Case_folding.asciidoc b/220_Token_normalization/40_Case_folding.asciidoc
similarity index 100%
rename from 210_Token_normalization/40_Case_folding.asciidoc
rename to 220_Token_normalization/40_Case_folding.asciidoc
diff --git a/210_Token_normalization/50_Character_folding.asciidoc b/220_Token_normalization/50_Character_folding.asciidoc
similarity index 98%
rename from 210_Token_normalization/50_Character_folding.asciidoc
rename to 220_Token_normalization/50_Character_folding.asciidoc
index eb674bb88..9ed9f792b 100644
--- a/210_Token_normalization/50_Character_folding.asciidoc
+++ b/220_Token_normalization/50_Character_folding.asciidoc
@@ -44,7 +44,7 @@ http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html[_UnicodeSet
 characters may be folded. For instance, to exclude the Swedish letters `å`,
 `ä`, `ö`, ++Å++, `Ä` and `Ö` from folding, you would specify a character
 class representing all Unicode characters, except for those letters: `[^åäöÅÄÖ]`
-(`^` means ``except'').
+(`^` means ``everything except'').
 
 [source,js]
 --------------------------------------------------
diff --git a/210_Token_normalization/60_Sorting_and_collations.asciidoc b/220_Token_normalization/60_Sorting_and_collations.asciidoc
similarity index 100%
rename from 210_Token_normalization/60_Sorting_and_collations.asciidoc
rename to 220_Token_normalization/60_Sorting_and_collations.asciidoc
diff --git a/230_Stemming.asciidoc b/230_Stemming.asciidoc
new file mode 100644
index 000000000..d9216e3c9
--- /dev/null
+++ b/230_Stemming.asciidoc
@@ -0,0 +1,32 @@
+include::230_Stemming/00_Intro.asciidoc[]
+
+include::230_Stemming/10_Algorithmic_stemmers.asciidoc[]
+
+algorithmic stemmers
+    porter stem token filter
+    stemmer token filter
+    kstem token filter
+    snowball token filter
+
+dictionary based
+    hunspell
+
+controlling stemmers
+    stemmer override token filter
+    keyword marker token filter
+
+    keyword repeat token filter
+    unique token filter
+
+Mix stemmers for different languages
+
+porter stem token filter
+stemmer token filter
+stemmer override token filter
+keyword marker token filter
+keyword repeat token filter
+kstem token filter
+snowball token filter
+elision token filter
+unique token filter
+normalization token filter
diff --git a/230_Stemming/00_Intro.asciidoc b/230_Stemming/00_Intro.asciidoc
new file mode 100644
index 000000000..3c0a5a666
--- /dev/null
+++ b/230_Stemming/00_Intro.asciidoc
@@ -0,0 +1,74 @@
+[[stemming]]
+== Reducing words to their root form
+
+Most languages of the world are *inflected*, meaning that words can change
+their form to express differences in:
+
+[horizontal]
+_number_:: fox, foxes
+_tense_:: pay, paid, paying
+_gender_:: waiter, waitress
+_person_:: hear, hears
+_case_:: I, me, my
+_aspect_:: ate, eaten
+_mood_:: so *be* it, *were* it so
+
+While inflection aids expressivity, it interferes with retrievability as a
+single root _word sense_ or meaning may be represented by many different
+sequences of letters. English is a weakly inflected language -- you could
+ignore inflections and still get reasonable search results -- but some other
+languages are highly inflected and need extra work in order to attain
+reasonable results.
+
+_Stemming_ attempts to remove the difference between inflected forms of a
+word, in order to reduce each word to its root form. For instance `foxes` may
+be reduced to the root `fox`, to remove the difference between singular and
+plural in the same way that we removed the difference between lower case and
+upper case.
+
+The root form of a word may not even be a real word. The words `jumping` and
+`jumpiness` may both be stemmed to `jumpi`. It doesn't matter -- as long as
+the same terms are produced at index time and at search time, search will just
+work.
+
+If stemming were easy, there would be only one implementation. Unfortunately,
+stemming is an inexact science which suffers from two issues: _understemming_
+and _overstemming_.
+
+Understemming is the failure to reduce words with the same meaning to the same
+root. For example, `jumped` and `jumps` may be reduced to `jump`, while
+`jumping` may be reduced to `jumpi`. Understemming reduces recall --
+relevant documents are not returned.
+
+Overstemming is the failure to keep two words with distinct meaning separate.
+For instance `general` and `generate` may both be reduced to `gener`.
+Overstemming reduces precision -- irrelevant documents are returned when they
+shouldn't be.
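+
+You can see what a stemmer actually emits by passing a few words through the
+`english` analyzer with the `analyze` API. (A quick sketch -- the exact
+tokens depend on which stemmer the analyzer uses, so treat the output as
+indicative.)
+
+[source,js]
+--------------------------------------------------
+GET /_analyze?analyzer=english <1>
+The jumpiness of the foxes generates general results
+--------------------------------------------------
+<1> Emits tokens like: `jumpi`, `fox`, `gener`, `gener`, `result`
+
+Note how `jumpiness` is reduced to the artificial root `jumpi`, while
+`generates` and `general` are both conflated to `gener` -- the overstemming
+described above.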
+
+.Lemmatisation
+**********************************************
+
+A _lemma_ is the canonical or dictionary form of a set of related words -- the
+lemma of `paying`, `paid` and `pays` is `pay`. Usually the lemma resembles
+the words it is related to but sometimes it doesn't -- the lemma of `is`,
+`was`, `am` and `being` is `be`.
+
+Lemmatisation, like stemming, tries to group related words, but it goes one
+step further than stemming in that it tries to group words by their _word
+sense_ or meaning. The same word may represent two different meanings --
+``wake'' can mean ``to wake up'' or ``a funeral''. While lemmatisation would
+try to distinguish these two word senses, stemming would incorrectly conflate
+them.
+
+Lemmatisation is a much more complicated and expensive process that needs to
+understand the context in which words appear in order to make decisions
+about what they mean. For now, stemmers are the best tools that we have
+available.
+
+**********************************************
+
+There are two types of stemmers available: algorithmic stemmers and dictionary
+stemmers.
+
+
+
diff --git a/230_Stemming/10_Algorithmic_stemmers.asciidoc b/230_Stemming/10_Algorithmic_stemmers.asciidoc
new file mode 100644
index 000000000..f21097cca
--- /dev/null
+++ b/230_Stemming/10_Algorithmic_stemmers.asciidoc
@@ -0,0 +1,63 @@
+[[algorithmic-stemmers]]
+=== Algorithmic stemmers
+
+Most of the stemmers available in Elasticsearch are algorithmic in that they
+apply a set of rules to a word in order to reduce it to its root form,
+such as stripping the final `s` or `es` from plurals. They don't have to
+know anything about individual words in order to stem them.
+
+These algorithmic stemmers have the advantage that they are available out of
+the box, are fast, use little memory, and work well for regular words. The
+downside is that they don't cope well with irregular words like `be`, `are`,
+`am`.
+
+One of the earliest stemming algorithms was the Porter stemmer for English,
+which is still the recommended English stemmer today. Martin Porter
+subsequently went on to create the
+http://snowball.tartarus.org/[Snowball language] for creating stemming
+algorithms and a number of the stemmers available in Elasticsearch are
+written in Snowball.
+
+********************************************
+
+The `kstem` token filter is a stemmer for English which combines the
+algorithmic approach with a built-in dictionary. The dictionary contains a
+list of root words and exceptions in order to avoid conflating words
+incorrectly.
+
+********************************************
+
+All of the algorithmic stemmers can be accessed by creating a custom
+{ref}analysis-stemmer-tokenfilter.html[`stemmer` token filter] and specifying the
+`language`. For instance, perhaps you find the default stemmer used by the
+`german` analyzer to be too aggressive and you want to replace it with the
+`light_german` stemmer. At the same time, you would like to take advantage of
+the <>.
+You would do so like this:
+
+[source,js]
+--------------------------------------------------
+PUT /my_index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "light_german_stemmer": { <1>
+          "type":     "stemmer",
+          "language": "light_german"
+        }
+      },
+      "analyzer": {
+        "light_german_analyzer": { <2>
+          "tokenizer": "icu_tokenizer",
+          "filter": [
+            "lowercase",
+            "light_german_stemmer",
+            "icu_folding"
+          ]
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+<1> Defines a custom `stemmer` token filter which uses the `light_german`
+    language.
+<2> Uses the new stemmer in a custom analyzer, along with the `icu_tokenizer`
+    and `icu_folding` token filters.
+
diff --git a/240_Multilingual.asciidoc b/240_Multilingual.asciidoc
deleted file mode 100644
index 4744c05e6..000000000
--- a/240_Multilingual.asciidoc
+++ /dev/null
@@ -1,10 +0,0 @@
-[[multi-lingual]]
-== Handling multiple languages
-
-Predominant language
-Mixed documents
-    - _analyzer
-
-Separate indices / term freqs / fields
-
-false cognates
diff --git a/230_Stopwords.asciidoc b/240_Stopwords.asciidoc
similarity index 100%
rename from 230_Stopwords.asciidoc
rename to 240_Stopwords.asciidoc
diff --git a/250_Synonyms.asciidoc b/260_Synonyms.asciidoc
similarity index 100%
rename from 250_Synonyms.asciidoc
rename to 260_Synonyms.asciidoc
diff --git a/260_Fuzzy_matching.asciidoc b/270_Fuzzy_matching.asciidoc
similarity index 100%
rename from 260_Fuzzy_matching.asciidoc
rename to 270_Fuzzy_matching.asciidoc
diff --git a/260_Fuzzy_matching/010_Intro.asciidoc b/270_Fuzzy_matching/010_Intro.asciidoc
similarity index 100%
rename from 260_Fuzzy_matching/010_Intro.asciidoc
rename to 270_Fuzzy_matching/010_Intro.asciidoc
diff --git a/270_Suggesters.asciidoc b/280_Suggesters.asciidoc
similarity index 100%
rename from 270_Suggesters.asciidoc
rename to 280_Suggesters.asciidoc
diff --git a/book.asciidoc b/book.asciidoc
index 23e553cd8..002565b3a 100644
--- a/book.asciidoc
+++ b/book.asciidoc
@@ -52,21 +52,21 @@ include::180_Non_Prose.asciidoc[]
 
 include::02_Dealing_with_language.asciidoc[]
 
-include::200_Identifying_words.asciidoc[]
+include::200_Language_analyzers.asciidoc[]
 
-include::210_Token_normalization.asciidoc[]
+include::210_Identifying_words.asciidoc[]
 
-include::220_Stemming.asciidoc[]
+include::220_Token_normalization.asciidoc[]
 
-include::230_Stopwords.asciidoc[]
+include::230_Stemming.asciidoc[]
 
-include::240_Multilingual.asciidoc[]
+include::240_Stopwords.asciidoc[]
 
-include::250_Synonyms.asciidoc[]
+include::260_Synonyms.asciidoc[]
 
-include::260_Fuzzy_matching.asciidoc[]
+include::270_Fuzzy_matching.asciidoc[]
 
-include::270_Suggesters.asciidoc[]
+include::280_Suggesters.asciidoc[]
 
 // Part 4