From d7d3c8eb453efc1ef601268812369c59ae58d146 Mon Sep 17 00:00:00 2001
From: Clinton Gormley
Date: Fri, 23 May 2014 10:07:09 +0200
Subject: [PATCH] Moved the multilingual chapter to the beginning of languages

---
 02_Dealing_with_language.asciidoc             |  17 +--
 200_Identifying_words.asciidoc                |  12 ---
 200_Language_analyzers.asciidoc               |  10 ++
 200_Language_analyzers/00_Intro.asciidoc      |  47 ++++++++
 200_Language_analyzers/10_Using.asciidoc      | 101 ++++++++++++++++++
 .../20_Configuring.asciidoc                   |  65 +++++++++++
 .../30_Multiple_languages.asciidoc            |  12 +++
 210_Identifying_words.asciidoc                |  21 ++++
 .../00_Intro.asciidoc                         |   2 +-
 .../10_Standard_analyzer.asciidoc             |   0
 .../20_Standard_tokenizer.asciidoc            |   0
 .../30_ICU_plugin.asciidoc                    |   0
 .../40_ICU_tokenizer.asciidoc                 |   0
 .../50_Tidying_text.asciidoc                  |   0
 210_Token_normalization.asciidoc              |  14 ---
 220_Stemming.asciidoc                         |  20 ----
 220_Token_normalization.asciidoc              |  17 +++
 .../00_Intro.asciidoc                         |   0
 .../10_Lowercasing.asciidoc                   |   0
 .../20_Removing_diacritics.asciidoc           |   0
 .../30_Unicode_world.asciidoc                 |   0
 .../40_Case_folding.asciidoc                  |   0
 .../50_Character_folding.asciidoc             |   2 +-
 .../60_Sorting_and_collations.asciidoc        |   0
 230_Stemming.asciidoc                         |  32 ++++++
 230_Stemming/00_Intro.asciidoc                |  74 +++++++++++++
 230_Stemming/10_Algorithmic_stemmers.asciidoc |  63 +++++++++++
 240_Multilingual.asciidoc                     |  10 --
 ...opwords.asciidoc => 240_Stopwords.asciidoc |   0
 ...Synonyms.asciidoc => 260_Synonyms.asciidoc |   0
 ...ng.asciidoc => 270_Fuzzy_matching.asciidoc |   0
 .../010_Intro.asciidoc                        |   0
 ...esters.asciidoc => 280_Suggesters.asciidoc |   0
 book.asciidoc                                 |  16 +--
 34 files changed, 462 insertions(+), 73 deletions(-)
 delete mode 100644 200_Identifying_words.asciidoc
 create mode 100644 200_Language_analyzers.asciidoc
 create mode 100644 200_Language_analyzers/00_Intro.asciidoc
 create mode 100644 200_Language_analyzers/10_Using.asciidoc
 create mode 100644 200_Language_analyzers/20_Configuring.asciidoc
 create mode 100644 200_Language_analyzers/30_Multiple_languages.asciidoc
 create mode 100644 210_Identifying_words.asciidoc
 rename {200_Identifying_words => 210_Identifying_words}/00_Intro.asciidoc (94%)
 rename {200_Identifying_words => 210_Identifying_words}/10_Standard_analyzer.asciidoc (100%)
 rename {200_Identifying_words => 210_Identifying_words}/20_Standard_tokenizer.asciidoc (100%)
 rename {200_Identifying_words => 210_Identifying_words}/30_ICU_plugin.asciidoc (100%)
 rename {200_Identifying_words => 210_Identifying_words}/40_ICU_tokenizer.asciidoc (100%)
 rename {200_Identifying_words => 210_Identifying_words}/50_Tidying_text.asciidoc (100%)
 delete mode 100644 210_Token_normalization.asciidoc
 delete mode 100644 220_Stemming.asciidoc
 create mode 100644 220_Token_normalization.asciidoc
 rename {210_Token_normalization => 220_Token_normalization}/00_Intro.asciidoc (100%)
 rename {210_Token_normalization => 220_Token_normalization}/10_Lowercasing.asciidoc (100%)
 rename {210_Token_normalization => 220_Token_normalization}/20_Removing_diacritics.asciidoc (100%)
 rename {210_Token_normalization => 220_Token_normalization}/30_Unicode_world.asciidoc (100%)
 rename {210_Token_normalization => 220_Token_normalization}/40_Case_folding.asciidoc (100%)
 rename {210_Token_normalization => 220_Token_normalization}/50_Character_folding.asciidoc (98%)
 rename {210_Token_normalization => 220_Token_normalization}/60_Sorting_and_collations.asciidoc (100%)
 create mode 100644 230_Stemming.asciidoc
 create mode 100644 230_Stemming/00_Intro.asciidoc
 create mode 100644 230_Stemming/10_Algorithmic_stemmers.asciidoc
 delete mode 100644 240_Multilingual.asciidoc
 rename 230_Stopwords.asciidoc => 240_Stopwords.asciidoc (100%)
 rename 250_Synonyms.asciidoc => 260_Synonyms.asciidoc (100%)
 rename 260_Fuzzy_matching.asciidoc => 270_Fuzzy_matching.asciidoc (100%)
 rename {260_Fuzzy_matching => 270_Fuzzy_matching}/010_Intro.asciidoc (100%)
 rename 270_Suggesters.asciidoc => 280_Suggesters.asciidoc (100%)

diff --git a/02_Dealing_with_language.asciidoc b/02_Dealing_with_language.asciidoc
index 5f1d206f4..ec27cbeb8 100644
--- a/02_Dealing_with_language.asciidoc
+++ b/02_Dealing_with_language.asciidoc
@@ -7,11 +7,13 @@
 [quote,Matt Groening]
 ``I know all those words, but that sentence makes no sense to me.''
 
-In <> we covered the mechanics of search, but we didn't pay
-much attention to the words themselves. It is not enough for full text search
-to just match the exact words that the user has queried. Instead, we need to
+Full text search is a battle between _precision_ -- returning as few
+irrelevant documents as possible -- and _recall_ -- returning as many relevant
+documents as possible. While matching only the exact words that the user has
+queried would be precise, it is not enough. We would miss out on many
+documents that the user would consider to be relevant. Instead, we need to
 spread the net wider, to also search for words that are not exactly the same
-as the original, but are related.
+as the original but are related.
 
 Wouldn't you expect a search for ``quick brown fox'' to match a document
 containing ``fast brown foxes'', ``Johnny Walker'' to match ``Johnnie
@@ -46,8 +48,9 @@ There are several lines of attack:
 that we know exists in the index, and a _did-you-mean_ suggester to redirect
 users who may have mistyped a search term. See <>.
 
-But before we can manipulate individual words, we need to divide text up into
-words, which means that we need to know what constitutes a _word_, which we
-will tackle in <>.
+Before we can manipulate individual words, we need to divide text up into
+words, which means that we need to know what constitutes a _word_. We will
+tackle this in <>.
+
+But first, let's take a look at how to get started quickly and easily.
 --
diff --git a/200_Identifying_words.asciidoc b/200_Identifying_words.asciidoc
deleted file mode 100644
index 44f9e3f90..000000000
--- a/200_Identifying_words.asciidoc
+++ /dev/null
@@ -1,12 +0,0 @@
-include::200_Identifying_words/00_Intro.asciidoc[]
-
-include::200_Identifying_words/10_Standard_analyzer.asciidoc[]
-
-include::200_Identifying_words/20_Standard_tokenizer.asciidoc[]
-
-include::200_Identifying_words/30_ICU_plugin.asciidoc[]
-
-include::200_Identifying_words/40_ICU_tokenizer.asciidoc[]
-
-include::200_Identifying_words/50_Tidying_text.asciidoc[]
-
diff --git a/200_Language_analyzers.asciidoc b/200_Language_analyzers.asciidoc
new file mode 100644
index 000000000..598858b73
--- /dev/null
+++ b/200_Language_analyzers.asciidoc
@@ -0,0 +1,10 @@
+include::200_Language_analyzers/00_Intro.asciidoc[]
+
+include::200_Language_analyzers/10_Using.asciidoc[]
+
+include::200_Language_analyzers/20_Configuring.asciidoc[]
+
+include::200_Language_analyzers/30_Multiple_languages.asciidoc[]
+
+
+
diff --git a/200_Language_analyzers/00_Intro.asciidoc b/200_Language_analyzers/00_Intro.asciidoc
new file mode 100644
index 000000000..0d3f5618d
--- /dev/null
+++ b/200_Language_analyzers/00_Intro.asciidoc
@@ -0,0 +1,47 @@
+[[language-analyzers]]
+== Language analyzers
+
+Elasticsearch ships with a collection of language analyzers which provide
+good, basic, out-of-the-box support for a number of the world's most common
+languages:
+
+Arabic, Armenian, Basque, Brazilian, Bulgarian, Catalan, Chinese,
+Czech, Danish, Dutch, English, Finnish, French, Galician, German, Greek,
+Hindi, Hungarian, Indonesian, Italian, Japanese, Korean, Norwegian, Persian,
+Portuguese, Romanian, Russian, Spanish, Swedish, Turkish, and Thai.
+
+These analyzers typically perform four roles:
+
+* Tokenize text into individual words:
++
+`The quick brown foxes` -> [`The`, `quick`, `brown`, `foxes`]
+
+* Lowercase tokens:
++
+`The` -> `the`
+
+* Remove common _stopwords_:
++
+[`The`, `quick`, `brown`, `foxes`] -> [`quick`, `brown`, `foxes`]
+
+* Stem tokens to their root form:
++
+`foxes` -> `fox`
+
+Each analyzer may also apply other transformations specific to its language in
+order to make words from that language more searchable:
+
+* the `english` analyzer removes the possessive `'s`:
++
+`John's` -> `john`
+
+* the `french` analyzer removes _elisions_ like `l'` and `qu'` and
+  _diacritics_ like `¨` or `^`:
++
+`l'église` -> `eglis`
+
+* the `german` analyzer normalizes terms, replacing `ä` and `ae` with `a`, or
+  `ß` with `ss`, among others:
++
+`äußerst` -> `ausserst`
+
diff --git a/200_Language_analyzers/10_Using.asciidoc b/200_Language_analyzers/10_Using.asciidoc
new file mode 100644
index 000000000..f93429c97
--- /dev/null
+++ b/200_Language_analyzers/10_Using.asciidoc
@@ -0,0 +1,101 @@
+[[using-language-analyzers]]
+=== Using language analyzers
+
+The built-in language analyzers are available globally and don't need to be
+configured before being used. They can be specified directly in the field
+mapping:
+
+[source,js]
+--------------------------------------------------
+PUT /my_index
+{
+  "mappings": {
+    "blog": {
+      "properties": {
+        "title": {
+          "type":     "string",
+          "analyzer": "english" <1>
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+<1> The `title` field will use the `english` analyzer instead of the default
+    `standard` analyzer.
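+
+A search on this field uses the same analyzer at query time, so a query for
+`foxes` will also match documents that mention only `fox`. (A quick sketch --
+the query below is an illustration, not part of the original example.)
+
+[source,js]
+--------------------------------------------------
+GET /my_index/blog/_search
+{
+  "query": {
+    "match": {
+      "title": "foxes" <1>
+    }
+  }
+}
+--------------------------------------------------
+<1> The query string is analyzed with the `english` analyzer too, so `foxes`
+    is stemmed to `fox` before the terms are looked up in the index.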
+
+Of course, by passing text through the `english` analyzer, we lose
+information:
+
+[source,js]
+--------------------------------------------------
+GET /my_index/_analyze?field=title <1>
+I'm not happy about the foxes
+--------------------------------------------------
+<1> Emits tokens: `i'm`, `happi`, `about`, `fox`
+
+We can't tell if the document mentions one `fox` or many `foxes`; the word
+`not` is a stopword and is removed, so we can't tell whether the document is
+happy about foxes or *not*. By using the `english` analyzer, we have increased
+recall as we can match more loosely, but we have reduced our ability to rank
+documents accurately.
+
+To get the best of both worlds, we can use <> to
+index the `title` field twice: once with the `english` analyzer and once with
+the `standard` analyzer:
+
+[source,js]
+--------------------------------------------------
+PUT /my_index
+{
+  "mappings": {
+    "blog": {
+      "properties": {
+        "title": { <1>
+          "type": "string",
+          "fields": {
+            "english": { <2>
+              "type":     "string",
+              "analyzer": "english"
+            }
+          }
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+<1> The main `title` field uses the `standard` analyzer.
+<2> The `title.english` sub-field uses the `english` analyzer.
+
+With this mapping in place, we can index some test documents to demonstrate
+how to use both fields at query time:
+
+[source,js]
+--------------------------------------------------
+PUT /my_index/blog/1
+{ "title": "I'm happy for this fox" }
+
+PUT /my_index/blog/2
+{ "title": "I'm not happy about my fox problem" }
+
+GET /_search
+{
+  "query": {
+    "multi_match": {
+      "type":   "most_fields", <1>
+      "query":  "not happy foxes",
+      "fields": [ "title", "title.english" ]
+    }
+  }
+}
+--------------------------------------------------
+<1> Use the <> query type to match the
+    same text in as many fields as possible.
+
+Even though neither of our documents contains the word `foxes`, both documents
+are returned as results thanks to the word stemming on the `title.english`
+field. The second document is ranked as more relevant because the word `not`
+matches on the `title` field.
+
+
diff --git a/200_Language_analyzers/20_Configuring.asciidoc b/200_Language_analyzers/20_Configuring.asciidoc
new file mode 100644
index 000000000..159a001fc
--- /dev/null
+++ b/200_Language_analyzers/20_Configuring.asciidoc
@@ -0,0 +1,65 @@
+[[configuring-language-analyzers]]
+=== Configuring language analyzers
+
+While the language analyzers can be used out of the box without any
+configuration, most of them do allow you to control aspects of their
+behaviour, specifically:
+
+Stem word exclusion::
++
+Imagine, for instance, that users searching for the ``World Health
+Organization'' are instead getting results for ``organ health''. The reason
+for this confusion is that both ``organ'' and ``organization'' are stemmed to
+the same root word: `organ`. Often this isn't a problem, but in this
+particular collection of documents this leads to confusing results. We would
+like to prevent the words `organization` and `organizations` from being
+stemmed.
+
+Custom stopwords::
++
+The default list of stopwords used in English is:
++
+    a, an, and, are, as, at, be, but, by, for, if, in, into, is, it,
+    no, not, of, on, or, such, that, the, their, then, there, these,
+    they, this, to, was, will, with
++
+The unusual thing about `no` and `not` is that they invert the meaning of the
+words that follow them.
+Perhaps we decide that these two words are important and that we shouldn't
+treat them as stopwords.
+
+In order to customize the behaviour of the `english` analyzer, we need to
+create a custom analyzer which uses the `english` analyzer as its base, but
+adds some configuration:
+
+[source,js]
+--------------------------------------------------
+PUT /my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_english": {
+          "type": "english",
+          "stem_exclusion": [ "organization", "organizations" ], <1>
+          "stopwords": [ <2>
+            "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
+            "if", "in", "into", "is", "it", "of", "on", "or", "such", "that",
+            "the", "their", "then", "there", "these", "they", "this", "to",
+            "was", "will", "with"
+          ]
+        }
+      }
+    }
+  }
+}
+
+GET /my_index/_analyze?analyzer=my_english <3>
+The World Health Organization does not sell organs.
+--------------------------------------------------
+<1> Prevents `organization` and `organizations` from being stemmed.
+<2> Specifies a custom list of stopwords.
+<3> Emits tokens `world`, `health`, `organization`, `doe`, `not`, `sell`, `organ`.
+
+We will discuss stemming and stopwords in much more detail in <> and
+<> respectively.
+
diff --git a/200_Language_analyzers/30_Multiple_languages.asciidoc b/200_Language_analyzers/30_Multiple_languages.asciidoc
new file mode 100644
index 000000000..dd5c20afe
--- /dev/null
+++ b/200_Language_analyzers/30_Multiple_languages.asciidoc
@@ -0,0 +1,12 @@
+[[multiple-languages]]
+=== Handling multiple languages
+
+If you only have to deal with a single language, count yourself lucky.
+Finding the right strategy for handling documents written in several languages
+can be challenging. There are three possible scenarios:
+
+* each document contains text in a single language
+* a document may contain more than one language, but each field contains
+  text in a single language
+* a single field may contain a mixture of languages
+
diff --git a/210_Identifying_words.asciidoc b/210_Identifying_words.asciidoc
new file mode 100644
index 000000000..5848af766
--- /dev/null
+++ b/210_Identifying_words.asciidoc
@@ -0,0 +1,21 @@
+include::210_Identifying_words/00_Intro.asciidoc[]
+
+include::210_Identifying_words/10_Standard_analyzer.asciidoc[]
+
+include::210_Identifying_words/20_Standard_tokenizer.asciidoc[]
+
+include::210_Identifying_words/30_ICU_plugin.asciidoc[]
+
+include::210_Identifying_words/40_ICU_tokenizer.asciidoc[]
+
+include::210_Identifying_words/50_Tidying_text.asciidoc[]
+
+//////////////////
+
+Compound words
+
+language specific
+  - kuromoji
+  - chinese
+
+//////////////////
diff --git a/200_Identifying_words/00_Intro.asciidoc b/210_Identifying_words/00_Intro.asciidoc
similarity index 94%
rename from 200_Identifying_words/00_Intro.asciidoc
rename to 210_Identifying_words/00_Intro.asciidoc
index 8b7872619..f8a5cfe0e 100644
--- a/200_Identifying_words/00_Intro.asciidoc
+++ b/210_Identifying_words/00_Intro.asciidoc
@@ -15,7 +15,7 @@ constituent parts.
 Asian languages are even more complex: some have no whitespace between
 words, sentences or even paragraphs. Some words can be represented by a
 single character, but the same single character, when placed next to other
-characters, can form just a part of a longer word with a quite different
+characters, can form just one part of a longer word with a quite different
 meaning.
 
 It should be obvious that there is no ``silver bullet'' analyzer that will
diff --git a/200_Identifying_words/10_Standard_analyzer.asciidoc b/210_Identifying_words/10_Standard_analyzer.asciidoc
similarity index 100%
rename from 200_Identifying_words/10_Standard_analyzer.asciidoc
rename to 210_Identifying_words/10_Standard_analyzer.asciidoc
diff --git a/200_Identifying_words/20_Standard_tokenizer.asciidoc b/210_Identifying_words/20_Standard_tokenizer.asciidoc
similarity index 100%
rename from 200_Identifying_words/20_Standard_tokenizer.asciidoc
rename to 210_Identifying_words/20_Standard_tokenizer.asciidoc
diff --git a/200_Identifying_words/30_ICU_plugin.asciidoc b/210_Identifying_words/30_ICU_plugin.asciidoc
similarity index 100%
rename from 200_Identifying_words/30_ICU_plugin.asciidoc
rename to 210_Identifying_words/30_ICU_plugin.asciidoc
diff --git a/200_Identifying_words/40_ICU_tokenizer.asciidoc b/210_Identifying_words/40_ICU_tokenizer.asciidoc
similarity index 100%
rename from 200_Identifying_words/40_ICU_tokenizer.asciidoc
rename to 210_Identifying_words/40_ICU_tokenizer.asciidoc
diff --git a/200_Identifying_words/50_Tidying_text.asciidoc b/210_Identifying_words/50_Tidying_text.asciidoc
similarity index 100%
rename from 200_Identifying_words/50_Tidying_text.asciidoc
rename to 210_Identifying_words/50_Tidying_text.asciidoc
diff --git a/210_Token_normalization.asciidoc b/210_Token_normalization.asciidoc
deleted file mode 100644
index d73d4567c..000000000
--- a/210_Token_normalization.asciidoc
+++ /dev/null
@@ -1,14 +0,0 @@
-include::210_Token_normalization/00_Intro.asciidoc[]
-
-include::210_Token_normalization/10_Lowercasing.asciidoc[]
-
-include::210_Token_normalization/20_Removing_diacritics.asciidoc[]
-
-include::210_Token_normalization/30_Unicode_world.asciidoc[]
-
-include::210_Token_normalization/40_Case_folding.asciidoc[]
-
-include::210_Token_normalization/50_Character_folding.asciidoc[]
-
-include::210_Token_normalization/60_Sorting_and_collations.asciidoc[]
-
diff --git a/220_Stemming.asciidoc b/220_Stemming.asciidoc
deleted file mode 100644
index 05dbb3f70..000000000
--- a/220_Stemming.asciidoc
+++ /dev/null
@@ -1,20 +0,0 @@
-[[stemming]]
-== Reducing words to their root form
-
-Stemming vs lemmatization
-
-algorithmic stemmers
-    porter stem token filter
-    stemmer token filter
-    kstem token filter
-    snowball token filter
-
-dictionary based
-    hunspell
-
-controlling stemmers
-    stemmer override token filter
-    keyword marker token filter
-
-    keyword repeat token filter
-    unique token filter
diff --git a/220_Token_normalization.asciidoc b/220_Token_normalization.asciidoc
new file mode 100644
index 000000000..6ddb3695d
--- /dev/null
+++ b/220_Token_normalization.asciidoc
@@ -0,0 +1,17 @@
+include::220_Token_normalization/00_Intro.asciidoc[]
+
+include::220_Token_normalization/10_Lowercasing.asciidoc[]
+
+include::220_Token_normalization/20_Removing_diacritics.asciidoc[]
+
+include::220_Token_normalization/30_Unicode_world.asciidoc[]
+
+include::220_Token_normalization/40_Case_folding.asciidoc[]
+
+include::220_Token_normalization/50_Character_folding.asciidoc[]
+
+// TODO: Add normalization character filter with ngram tokenizer for decompounding german
+// German ngrams should be 4, not 3
+
+include::220_Token_normalization/60_Sorting_and_collations.asciidoc[]
+
diff --git a/210_Token_normalization/00_Intro.asciidoc b/220_Token_normalization/00_Intro.asciidoc
similarity index 100%
rename from 210_Token_normalization/00_Intro.asciidoc
rename to 220_Token_normalization/00_Intro.asciidoc
diff --git a/210_Token_normalization/10_Lowercasing.asciidoc b/220_Token_normalization/10_Lowercasing.asciidoc
similarity index 100%
rename from 210_Token_normalization/10_Lowercasing.asciidoc
rename to 220_Token_normalization/10_Lowercasing.asciidoc
diff --git a/210_Token_normalization/20_Removing_diacritics.asciidoc b/220_Token_normalization/20_Removing_diacritics.asciidoc
similarity index 100%
rename from 210_Token_normalization/20_Removing_diacritics.asciidoc
rename to 220_Token_normalization/20_Removing_diacritics.asciidoc
diff --git a/210_Token_normalization/30_Unicode_world.asciidoc b/220_Token_normalization/30_Unicode_world.asciidoc
similarity index 100%
rename from 210_Token_normalization/30_Unicode_world.asciidoc
rename to 220_Token_normalization/30_Unicode_world.asciidoc
diff --git a/210_Token_normalization/40_Case_folding.asciidoc b/220_Token_normalization/40_Case_folding.asciidoc
similarity index 100%
rename from 210_Token_normalization/40_Case_folding.asciidoc
rename to 220_Token_normalization/40_Case_folding.asciidoc
diff --git a/210_Token_normalization/50_Character_folding.asciidoc b/220_Token_normalization/50_Character_folding.asciidoc
similarity index 98%
rename from 210_Token_normalization/50_Character_folding.asciidoc
rename to 220_Token_normalization/50_Character_folding.asciidoc
index eb674bb88..9ed9f792b 100644
--- a/210_Token_normalization/50_Character_folding.asciidoc
+++ b/220_Token_normalization/50_Character_folding.asciidoc
@@ -44,7 +44,7 @@ http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html[_UnicodeSet
 characters may be folded. For instance, to exclude the Swedish letters `å`,
 `ä`, `ö`, ++Å++, `Ä` and `Ö` from folding, you would specify a character
 class representing all Unicode characters, except for those letters: `[^åäöÅÄÖ]`
-(`^` means ``except'').
+(`^` means ``everything except'').
 
 [source,js]
 --------------------------------------------------
diff --git a/210_Token_normalization/60_Sorting_and_collations.asciidoc b/220_Token_normalization/60_Sorting_and_collations.asciidoc
similarity index 100%
rename from 210_Token_normalization/60_Sorting_and_collations.asciidoc
rename to 220_Token_normalization/60_Sorting_and_collations.asciidoc
diff --git a/230_Stemming.asciidoc b/230_Stemming.asciidoc
new file mode 100644
index 000000000..d9216e3c9
--- /dev/null
+++ b/230_Stemming.asciidoc
@@ -0,0 +1,32 @@
+include::230_Stemming/00_Intro.asciidoc[]
+
+include::230_Stemming/10_Algorithmic_stemmers.asciidoc[]
+
+algorithmic stemmers
+    porter stem token filter
+    stemmer token filter
+    kstem token filter
+    snowball token filter
+
+dictionary based
+    hunspell
+
+controlling stemmers
+    stemmer override token filter
+    keyword marker token filter
+
+    keyword repeat token filter
+    unique token filter
+
+Mix stemmers for different languages
+
+porter stem token filter
+stemmer token filter
+stemmer override token filter
+keyword marker token filter
+keyword repeat token filter
+kstem token filter
+snowball token filter
+elision token filter
+unique token filter
+normalization token filter
diff --git a/230_Stemming/00_Intro.asciidoc b/230_Stemming/00_Intro.asciidoc
new file mode 100644
index 000000000..3c0a5a666
--- /dev/null
+++ b/230_Stemming/00_Intro.asciidoc
@@ -0,0 +1,74 @@
+[[stemming]]
+== Reducing words to their root form
+
+Most languages of the world are *inflected*, meaning that words can change
+their form to express differences in:
+
+[horizontal]
+_number_:: fox, foxes
+_tense_:: pay, paid, paying
+_gender_:: waiter, waitress
+_person_:: hear, hears
+_case_:: I, me, my
+_aspect_:: ate, eaten
+_mood_:: so *be* it, *were* it so
+
+While inflection aids expressivity, it interferes with retrievability as a
+single root _word sense_ or meaning may be represented by many different
+sequences of letters. English is a weakly inflected language -- you could
+ignore inflections and still get reasonable search results -- but some other
+languages are highly inflected and need extra work in order to attain
+reasonable results.
+
+_Stemming_ attempts to remove the difference between inflected forms of a
+word, in order to reduce each word to its root form. For instance `foxes` may
+be reduced to the root `fox`, to remove the difference between singular and
+plural in the same way that we removed the difference between lower case and
+upper case.
+
+The root form of a word may not even be a real word. The words `jumping` and
+`jumpiness` may both be stemmed to `jumpi`. It doesn't matter -- as long as
+the same terms are produced at index time and at search time, search will just
+work.
+
+If stemming were easy, there would be only one implementation. Unfortunately,
+stemming is an inexact science which suffers from two issues: _understemming_
+and _overstemming_.
+
+Understemming is the failure to reduce words with the same meaning to the same
+root. For example, `jumped` and `jumps` may be reduced to `jump`, while
+`jumping` may be reduced to `jumpi`. Understemming reduces recall --
+relevant documents are not returned.
+
+Overstemming is the failure to keep two words with distinct meaning separate.
+For instance `general` and `generate` may both be reduced to `gener`.
+Overstemming reduces precision -- irrelevant documents are returned when they
+shouldn't be.
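+
+You can see what a stemmer actually emits by passing a few words through the
+`english` analyzer with the `analyze` API. (A quick sketch -- the exact
+tokens depend on which stemmer the analyzer uses, so treat the output as
+indicative.)
+
+[source,js]
+--------------------------------------------------
+GET /_analyze?analyzer=english <1>
+The jumpiness of the foxes generates general results
+--------------------------------------------------
+<1> Emits tokens like: `jumpi`, `fox`, `gener`, `gener`, `result`
+
+Note how `jumpiness` is reduced to the artificial root `jumpi`, while
+`generates` and `general` are both conflated to `gener` -- the overstemming
+described above.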
+
+.Lemmatisation
+**********************************************
+
+A _lemma_ is the canonical or dictionary form of a set of related words -- the
+lemma of `paying`, `paid` and `pays` is `pay`. Usually the lemma resembles
+the words it is related to but sometimes it doesn't -- the lemma of `is`,
+`was`, `am` and `being` is `be`.
+
+Lemmatisation, like stemming, tries to group related words, but it goes one
+step further than stemming in that it tries to group words by their _word
+sense_ or meaning. The same word may represent two different meanings --
+``wake'' can mean ``to wake up'' or ``a funeral''. While lemmatisation would
+try to distinguish these two word senses, stemming would incorrectly conflate
+them.
+
+Lemmatisation is a much more complicated and expensive process that needs to
+understand the context in which words appear in order to make decisions
+about what they mean. For now, stemmers are the best tools that we have
+available.
+
+**********************************************
+
+There are two types of stemmers available: algorithmic stemmers and dictionary
+stemmers.
+
+
+
diff --git a/230_Stemming/10_Algorithmic_stemmers.asciidoc b/230_Stemming/10_Algorithmic_stemmers.asciidoc
new file mode 100644
index 000000000..f21097cca
--- /dev/null
+++ b/230_Stemming/10_Algorithmic_stemmers.asciidoc
@@ -0,0 +1,63 @@
+[[algorithmic-stemmers]]
+=== Algorithmic stemmers
+
+Most of the stemmers available in Elasticsearch are algorithmic in that they
+apply a set of rules to a word in order to reduce it to its root form,
+such as stripping the final `s` or `es` from plurals. They don't have to
+know anything about individual words in order to stem them.
+
+These algorithmic stemmers have the advantage that they are available out of
+the box, are fast, use little memory, and work well for regular words. The
+downside is that they don't cope well with irregular words like `be`, `are`,
+`am`.
+
+One of the earliest stemming algorithms was the Porter stemmer for English,
+which is still the recommended English stemmer today. Martin Porter
+subsequently went on to create the
+http://snowball.tartarus.org/[Snowball language] for creating stemming
+algorithms and a number of the stemmers available in Elasticsearch are
+written in Snowball.
+
+********************************************
+
+The `kstem` token filter is a stemmer for English which combines the
+algorithmic approach with a built-in dictionary. The dictionary contains a
+list of root words and exceptions in order to avoid conflating words
+incorrectly.
+
+********************************************
+
+All of the algorithmic stemmers can be accessed by creating a custom
+{ref}analysis-stemmer-tokenfilter.html[`stemmer` token filter] and specifying the
+`language`. For instance, perhaps you find the default stemmer used by the
+`german` analyzer to be too aggressive and you want to replace it with the
+`light_german` stemmer. At the same time, you would like to take advantage of
+the <>.
+You would do so like this:
+
+[source,js]
+--------------------------------------------------
+PUT /my_index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "light_german_stemmer": { <1>
+          "type":     "stemmer",
+          "language": "light_german"
+        }
+      },
+      "analyzer": {
+        "light_german_analyzer": { <2>
+          "tokenizer": "icu_tokenizer",
+          "filter": [
+            "lowercase",
+            "light_german_stemmer",
+            "icu_folding"
+          ]
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+<1> Defines a custom `stemmer` token filter which uses the `light_german`
+    language.
+<2> Uses the new stemmer in a custom analyzer, along with the `icu_tokenizer`
+    and `icu_folding` token filters.
+
diff --git a/240_Multilingual.asciidoc b/240_Multilingual.asciidoc
deleted file mode 100644
index 4744c05e6..000000000
--- a/240_Multilingual.asciidoc
+++ /dev/null
@@ -1,10 +0,0 @@
-[[multi-lingual]]
-== Handling multiple languages
-
-Predominant language
-Mixed documents
-    - _analyzer
-
-Separate indices / term freqs / fields
-
-false cognates
diff --git a/230_Stopwords.asciidoc b/240_Stopwords.asciidoc
similarity index 100%
rename from 230_Stopwords.asciidoc
rename to 240_Stopwords.asciidoc
diff --git a/250_Synonyms.asciidoc b/260_Synonyms.asciidoc
similarity index 100%
rename from 250_Synonyms.asciidoc
rename to 260_Synonyms.asciidoc
diff --git a/260_Fuzzy_matching.asciidoc b/270_Fuzzy_matching.asciidoc
similarity index 100%
rename from 260_Fuzzy_matching.asciidoc
rename to 270_Fuzzy_matching.asciidoc
diff --git a/260_Fuzzy_matching/010_Intro.asciidoc b/270_Fuzzy_matching/010_Intro.asciidoc
similarity index 100%
rename from 260_Fuzzy_matching/010_Intro.asciidoc
rename to 270_Fuzzy_matching/010_Intro.asciidoc
diff --git a/270_Suggesters.asciidoc b/280_Suggesters.asciidoc
similarity index 100%
rename from 270_Suggesters.asciidoc
rename to 280_Suggesters.asciidoc
diff --git a/book.asciidoc b/book.asciidoc
index 23e553cd8..002565b3a 100644
--- a/book.asciidoc
+++ b/book.asciidoc
@@ -52,21 +52,21 @@ include::180_Non_Prose.asciidoc[]
 
 include::02_Dealing_with_language.asciidoc[]
 
-include::200_Identifying_words.asciidoc[]
+include::200_Language_analyzers.asciidoc[]
 
-include::210_Token_normalization.asciidoc[]
+include::210_Identifying_words.asciidoc[]
 
-include::220_Stemming.asciidoc[]
+include::220_Token_normalization.asciidoc[]
 
-include::230_Stopwords.asciidoc[]
+include::230_Stemming.asciidoc[]
 
-include::240_Multilingual.asciidoc[]
+include::240_Stopwords.asciidoc[]
 
-include::250_Synonyms.asciidoc[]
+include::260_Synonyms.asciidoc[]
 
-include::260_Fuzzy_matching.asciidoc[]
+include::270_Fuzzy_matching.asciidoc[]
 
-include::270_Suggesters.asciidoc[]
+include::280_Suggesters.asciidoc[]
 
 // Part 4