-
Notifications
You must be signed in to change notification settings - Fork 3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Pack tokenizer Rust code and add Lindera support
Signed-off-by: aoiasd <[email protected]>
- Loading branch information
Showing
20 changed files
with
2,052 additions
and
319 deletions.
There are no files selected for viewing
1,806 changes: 1,643 additions & 163 deletions
1,806
internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
40 changes: 40 additions & 0 deletions
40
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/build_in_analyzer.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
use tantivy::tokenizer::*; | ||
|
||
use crate::analyzer::tokenizers::*; | ||
use crate::analyzer::filter::*; | ||
use crate::analyzer::stop_words; | ||
|
||
// default build-in analyzer | ||
pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer { | ||
let builder = standard_builder().filter(LowerCaser); | ||
|
||
if stop_words.len() > 0 { | ||
return builder.filter(StopWordFilter::remove(stop_words)).build(); | ||
} | ||
|
||
builder.build() | ||
} | ||
|
||
pub fn chinese_analyzer(stop_words: Vec<String>) -> TextAnalyzer { | ||
let builder = jieba_builder().filter(CnAlphaNumOnlyFilter); | ||
if stop_words.len() > 0 { | ||
return builder.filter(StopWordFilter::remove(stop_words)).build(); | ||
} | ||
|
||
builder.build() | ||
} | ||
|
||
pub fn english_analyzer(stop_words: Vec<String>) -> TextAnalyzer { | ||
let builder = standard_builder() | ||
.filter(LowerCaser) | ||
.filter(Stemmer::new(Language::English)) | ||
.filter(StopWordFilter::remove( | ||
stop_words::ENGLISH.iter().map(|&word| word.to_owned()), | ||
)); | ||
|
||
if stop_words.len() > 0 { | ||
return builder.filter(StopWordFilter::remove(stop_words)).build(); | ||
} | ||
|
||
builder.build() | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
9 changes: 9 additions & 0 deletions
9
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/mod.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
// Analyzer module root: wires together the tokenizer, filter, and stop-word
// submodules for the tantivy binding.
mod analyzer;          // analyzer construction (provides create_analyzer)
mod stop_words;        // bundled stop-word tables (e.g. ENGLISH — see build_in_analyzer)
mod tokenizers;        // tokenizer builders used by the built-in analyzers
mod build_in_analyzer; // ready-made analyzers (standard/chinese/english)
mod filter;            // token filters applied by the analyzers
mod util;              // NOTE(review): contents not visible here — shared helpers, presumably

// Crate-facing entry points; everything else stays module-private.
pub(crate) use self::analyzer::create_analyzer;
pub(crate) use self::build_in_analyzer::standard_analyzer;
File renamed without changes.
File renamed without changes.
Oops, something went wrong.