Skip to content

Commit

Permalink
Rename latin tokenizer to "ascii_stemmer"
Browse files Browse the repository at this point in the history
  • Loading branch information
ang-zeyu committed Dec 30, 2022
1 parent c0e0f06 commit 98159f1
Show file tree
Hide file tree
Showing 28 changed files with 98 additions and 71 deletions.
14 changes: 7 additions & 7 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,15 @@ releaseAsciiLanguage:

# These 2 are separate as the prior needs to be published first
preReleaseOtherLanguages:
cd packages/infisearch_languages/infisearch_lang_latin &&\
cd packages/infisearch_languages/infisearch_lang_ascii_stemmer &&\
cargo package &&\
cargo package --list
cd packages/infisearch_languages/infisearch_lang_chinese &&\
cargo package &&\
cargo package --list

releaseOtherLanguages:
cd packages/infisearch_languages/infisearch_lang_latin &&\
cd packages/infisearch_languages/infisearch_lang_ascii_stemmer &&\
cargo publish
cd packages/infisearch_languages/infisearch_lang_chinese &&\
cargo publish
Expand Down
2 changes: 1 addition & 1 deletion docs/src/SUMMARY.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@
- [CSP](./csp.md)
- [Search UI](./search_configuration.md)
- [Styling](./search_configuration_styling.md)
- [Language](./language.md)
- [Indexer](./indexer_configuration.md)
- [Fields](./indexer/fields.md)
- [Files](./indexer/files.md)
- [Miscellaneous](./indexer/misc.md)
- [Language](./language.md)
- [Search API](./search_api.md)
- [Query Syntax](./search_syntax.md)
- [Larger Collections](./larger_collections.md)
Expand Down
20 changes: 13 additions & 7 deletions docs/src/language.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,19 @@

There are 3 language modules available. To configure these, you will need to serve the appropriate [language bundle](./getting_started.md#hosting-the-files) in your HTML (or edit the CDN link accordingly), and edit the indexer configuration file.

## Ascii Tokenizer
```json
{
"lang_config": {
// ... options go here ...
}
}
```

#### CDN link
## Ascii Tokenizer

The default tokenizer splits on sentences, then whitespaces to obtain tokens.
The default tokenizer should work for any language that relies on ASCII characters, or their accented variants (e.g. "á").

An [asciiFoldingFilter](https://github.com/tantivy-search/tantivy/blob/main/src/tokenizer/ascii_folding_filter.rs) is then applied to these tokens, followed by punctuation and non-word-character boundary removal.
The text is first split on sentences, then whitespaces to obtain tokens. An [asciiFoldingFilter](https://github.com/tantivy-search/tantivy/blob/main/src/tokenizer/ascii_folding_filter.rs) is then applied to normalize diacritics, followed by punctuation and non-word-character boundary removal.

```json
{
Expand All @@ -34,13 +40,13 @@ An [asciiFoldingFilter](https://github.com/tantivy-search/tantivy/blob/main/src/
<script src="https://cdn.jsdelivr.net/gh/ang-zeyu/infisearch@0.9.1/packages/search-ui/dist/search-ui.ascii.bundle.js"></script>
```

## Latin Tokenizer
## Ascii Tokenizer with Stemmer

This is essentially the same as the ascii tokenizer, but adds a `stemmer` option.

```json
{
"lang": "latin",
"lang": "ascii_stemmer",
"options": {
// ----------------------------------
// Ascii Tokenizer options also apply
Expand All @@ -60,7 +66,7 @@ If you do not need stemming, use the `ascii` tokenizer, which has a smaller wasm
**CDN Link**

```html
<script src="https://cdn.jsdelivr.net/gh/ang-zeyu/[email protected]/packages/search-ui/dist/search-ui.latin.bundle.js"></script>
<script src="https://cdn.jsdelivr.net/gh/ang-zeyu/infisearch@0.9.1/packages/search-ui/dist/search-ui.ascii-stemmer.bundle.js"></script>
```

## Chinese Tokenizer
Expand Down
48 changes: 34 additions & 14 deletions e2e/e2e.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,16 @@ const testSuite = async (configFile, with_positions, with_filters) => {

await reloadPage(lang);

const isAsciiStemmer = lang === 'ascii_stemmer';

// ------------------------------------------------------
// Various basic tests on docid=0
await typeText('+npm +run +dev +installmdbook');
await assertSingle('use the npm run dev script');
await assertSingle(
isAsciiStemmer
? 'then run npm run devServer1 to'
: 'use the npm run dev script',
);

if (with_positions) {
await typeText('+"npm run dev" +(installmdbook 8080)');
Expand Down Expand Up @@ -399,8 +405,12 @@ const testSuite = async (configFile, with_positions, with_filters) => {
'detecting such terms',
'the change detection',
'detectedd as per the earlier section',
'mobile device detection',
];
if (!isAsciiStemmer) {
// The stemmer version generates 'detect' and 'detectedd',
// weighting the other sub result more heavily and discarding this one.
expectedPrefixResults.push('mobile device detection');
}
await typeText('detec');
await assertMultiple(expectedPrefixResults, 2);

Expand All @@ -427,7 +437,11 @@ const testSuite = async (configFile, with_positions, with_filters) => {
await assertSingle('lorem ipsum is simply dummy text');

await typePhraseOrAnd('test many json 2', with_positions);
await assertSingle('test many json 2');
if (isAsciiStemmer) {
await assertMultiple(['test many json 2', 'estimate from testing'], 2);
} else {
await assertSingle('test many json 2');
}
// ------------------------------------------------------

// ------------------------------------------------------
Expand All @@ -454,8 +468,14 @@ const testSuite = async (configFile, with_positions, with_filters) => {
runFullIndex(configFile);

// 1, to be deleted later

// For AND queries, make it slightly more specific
const test404Query = 'This URL is invaldi' + (with_positions ? '' : ' navigation');
const testContributionsQuery = 'Contributions of any form' + (with_positions ? '' : ' development');
const testContributionsUpdatedQuery = 'Contributions of all forms' + (with_positions ? '' : ' atquejxusd');

await reloadPage(lang);
await typePhraseOrAnd('This URL is invaldi', with_positions);
await typePhraseOrAnd(test404Query, with_positions);
await waitNoResults();

fs.copyFileSync(
Expand All @@ -465,11 +485,11 @@ const testSuite = async (configFile, with_positions, with_filters) => {
runIncrementalIndex(configFile);

await reloadPage(lang);
await typePhraseOrAnd('This URL is invaldi', with_positions);
await typePhraseOrAnd(test404Query, with_positions);
await assertSingle('this url is invalid');

// 2, to be updated later
await typePhraseOrAnd('Contributions of any form', with_positions);
await typePhraseOrAnd(testContributionsQuery, with_positions);
await waitNoResults();

const contributingHtmlOutputPath = path.join(__dirname, 'input/contributing.html');
Expand All @@ -480,7 +500,7 @@ const testSuite = async (configFile, with_positions, with_filters) => {
runIncrementalIndex(configFile);

await reloadPage(lang);
await typePhraseOrAnd('Contributions of any form', with_positions);
await typePhraseOrAnd(testContributionsQuery, with_positions);
await assertSingle('contributions of any form');

// ------------------------------------------------------
Expand All @@ -494,7 +514,7 @@ const testSuite = async (configFile, with_positions, with_filters) => {
runIncrementalIndex(configFile);

await reloadPage(lang);
await typePhraseOrAnd('This URL is invaldi', with_positions);
await typePhraseOrAnd(test404Query, with_positions);
await waitNoResults();

expectNumDeletedDocs(1);
Expand All @@ -504,7 +524,7 @@ const testSuite = async (configFile, with_positions, with_filters) => {
// ------------------------------------------------------
// Test incremental indexing update

await typePhraseOrAnd('Contributions of all forms', with_positions);
await typePhraseOrAnd(testContributionsUpdatedQuery, with_positions);
await waitNoResults();

let contributingHtml = fs.readFileSync(contributingHtmlOutputPath, 'utf-8');
Expand All @@ -515,10 +535,10 @@ const testSuite = async (configFile, with_positions, with_filters) => {
runIncrementalIndex(configFile);

await reloadPage(lang);
await typePhraseOrAnd('Contributions of any form', with_positions);
await typePhraseOrAnd(testContributionsQuery, with_positions);
await waitNoResults();

await typePhraseOrAnd('Contributions of all forms', with_positions);
await typePhraseOrAnd(testContributionsUpdatedQuery, with_positions);
await assertSingle('contributions of all forms');

await typeText('atquejxusd ');
Expand All @@ -532,10 +552,10 @@ const testSuite = async (configFile, with_positions, with_filters) => {
runIncrementalIndex(configFile);

await reloadPage(lang);
await typePhraseOrAnd('Contributions of any form', with_positions);
await typePhraseOrAnd(testContributionsQuery, with_positions);
await waitNoResults();

await typePhraseOrAnd('Contributions of all forms', with_positions);
await typePhraseOrAnd(testContributionsUpdatedQuery, with_positions);
await waitNoResults();

await typeText('atquejxusd');
Expand Down Expand Up @@ -678,7 +698,7 @@ const mainTest = async () => {
outputConfig = readOutputConfig();
expect(outputConfig.indexingConfig.plNamesToCache).toHaveLength(0);

// Latin tokenizer
// Ascii with stemmer tokenizer
// No positions
cleanup();
console.log('Starting infi_search_4 tests');
Expand Down
2 changes: 1 addition & 1 deletion e2e/input/infi_search_4.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"fields": {}
},
"lang_config": {
"lang": "ascii",
"lang": "ascii_stemmer",
"options": {
"ignore_stop_words": false
}
Expand Down
2 changes: 1 addition & 1 deletion packages/infisearch/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ log = { version = "0.4", features = ["max_level_info", "release_max_level_info"]
log4rs = "1.0"
infisearch_common = { path = "../infisearch_common", version="=0.9.1", features = ["indexer"] }
infisearch_lang_ascii = { path = "../infisearch_languages/infisearch_lang_ascii", version="=0.9.1", features = ["indexer"] }
infisearch_lang_latin = { path = "../infisearch_languages/infisearch_lang_latin", version="=0.9.1", features = ["indexer"] }
infisearch_lang_ascii_stemmer = { path = "../infisearch_languages/infisearch_lang_ascii_stemmer", version="=0.9.1", features = ["indexer"] }
infisearch_lang_chinese = { path = "../infisearch_languages/infisearch_lang_chinese", version="=0.9.1", features = ["indexer"] }
num_cpus = "1"
path-absolutize = { version = "3.0", features = ["lazy_static_cache"] }
Expand Down
4 changes: 2 additions & 2 deletions packages/infisearch/src/indexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ use infisearch_common::language::InfiLanguageConfig;
use infisearch_common::METADATA_FILE;
use infisearch_common::tokenize::IndexerTokenizer;
use infisearch_lang_ascii::ascii;
use infisearch_lang_latin::latin;
use infisearch_lang_ascii_stemmer::ascii_stemmer;
use infisearch_lang_chinese::chinese;

use crate::dictionary_writer::DictWriter;
Expand Down Expand Up @@ -247,7 +247,7 @@ impl Indexer {
fn resolve_tokenizer(lang_config: &InfiLanguageConfig) -> Arc<dyn IndexerTokenizer + Send + Sync> {
match lang_config.lang.as_str() {
"ascii" => Arc::new(ascii::new_with_options(lang_config)),
"latin" => Arc::new(latin::new_with_options(lang_config)),
"ascii_stemmer" => Arc::new(ascii_stemmer::new_with_options(lang_config)),
"chinese" => Arc::new(chinese::new_with_options(lang_config)),
_ => panic!("Unsupported language {}", lang_config.lang),
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[package]
name = "infisearch_lang_latin"
name = "infisearch_lang_ascii_stemmer"
version = "0.9.1"
authors = ["Ze Yu <[email protected]>"]
edition = "2018"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pub mod ascii_stemmer;

This file was deleted.

4 changes: 2 additions & 2 deletions packages/infisearch_search/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ crate-type = ["cdylib"]
[features]
default = ["lang_ascii"]
lang_ascii = ["infisearch_lang_ascii"]
lang_latin = ["infisearch_lang_latin"]
lang_ascii_stemmer = ["infisearch_lang_ascii_stemmer"]
lang_chinese = ["infisearch_lang_chinese"]
perf = ["web-sys"]

Expand All @@ -20,7 +20,7 @@ byteorder = "1"
js-sys = "0.3.51"
infisearch_common = { path = "../infisearch_common", features = [] }
infisearch_lang_ascii = { path = "../infisearch_languages/infisearch_lang_ascii", optional = true, features = [] }
infisearch_lang_latin = { path = "../infisearch_languages/infisearch_lang_latin", optional = true, features = [] }
infisearch_lang_ascii_stemmer = { path = "../infisearch_languages/infisearch_lang_ascii_stemmer", optional = true, features = [] }
infisearch_lang_chinese = { path = "../infisearch_languages/infisearch_lang_chinese", optional = true, features = [] }
smartstring = "0.2.7"
wasm-bindgen = { version = "0.2" }
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"name": "@infisearch/lang-latin",
"name": "@infisearch/lang-ascii-stemmer",
"collaborators": [
"Ze Yu <[email protected]>"
],
Expand Down
10 changes: 5 additions & 5 deletions packages/infisearch_search/src/searcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ use crate::utils;

#[cfg(feature = "lang_ascii")]
use infisearch_lang_ascii::ascii;
#[cfg(feature = "lang_latin")]
use infisearch_lang_latin::latin;
#[cfg(feature = "lang_ascii_stemmer")]
use infisearch_lang_ascii_stemmer::ascii_stemmer;
#[cfg(feature = "lang_chinese")]
use infisearch_lang_chinese::chinese;

Expand Down Expand Up @@ -80,9 +80,9 @@ fn get_tokenizer(lang_config: &InfiLanguageConfig) -> Box<dyn SearchTokenizer> {
Box::new(ascii::new_with_options(lang_config))
}

#[cfg(feature = "lang_latin")]
#[cfg(feature = "lang_ascii_stemmer")]
fn get_tokenizer(lang_config: &InfiLanguageConfig) -> Box<dyn SearchTokenizer> {
Box::new(latin::new_with_options(lang_config))
Box::new(ascii_stemmer::new_with_options(lang_config))
}

#[cfg(feature = "lang_chinese")]
Expand Down Expand Up @@ -441,7 +441,7 @@ pub mod test {
with_positions: true,
},
lang_config: InfiLanguageConfig {
lang: "latin".to_owned(),
lang: "ascii_stemmer".to_owned(),
options: InfiLanguageConfigOpts::default(),
},
field_infos,
Expand Down
2 changes: 1 addition & 1 deletion packages/infisearch_search/src/searcher/query_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -701,7 +701,7 @@ pub mod test {

pub fn parse_wo_pos(query: &str) -> Vec<QueryPart> {
let tokenizer = ascii::new_with_options(&InfiLanguageConfig {
lang: "latin".to_owned(),
lang: "ascii_stemmer".to_owned(),
options: InfiLanguageConfigOpts::default(),
});

Expand Down
3 changes: 2 additions & 1 deletion packages/mdbook-infisearch/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,8 @@ const base_url = '{}';
const mode = {};
{}
</script>",
base_url, lang,
base_url,
lang.replace("_", "-"),
base_url,
base_url,
mode,
Expand Down
Loading

0 comments on commit 98159f1

Please sign in to comment.