Commit 9e8951e: Pack tokenizer rust code and support lindera

Signed-off-by: aoiasd <[email protected]>
aoiasd committed Feb 11, 2025
1 parent 78b2d20 commit 9e8951e
Showing 20 changed files with 2,052 additions and 319 deletions.
1,806 changes: 1,643 additions & 163 deletions internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml
@@ -5,8 +5,17 @@ edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[features]
default = ["lindera-ipadic", "lindera-ko-dic", "lindera-cc-cedict"]
lindera-ipadic = ["lindera/ipadic"]
lindera-ipadic-neologd = ["lindera/ipadic-neologd"]
lindera-unidic = ["lindera/unidic"]
lindera-ko-dic = ["lindera/ko-dic"]
lindera-cc-cedict = ["lindera/cc-cedict"]

[dependencies]
tantivy = { git = "https://github.com/milvus-io/tantivy", tag = "v0.1.0" } # we made a private fix for Milvus; this should be removed in the future once Milvus fixes the bug upstream.
lindera = "0.38.1"
futures = "0.3.21"
libc = "0.2"
scopeguard = "1.2"
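
Each new Cargo feature above simply forwards to the matching lindera dictionary feature, with the Japanese (ipadic), Korean (ko-dic) and Chinese (cc-cedict) dictionaries enabled by default. As a minimal sketch of how code in the binding could gate behaviour on these features (the names below are illustrative and not taken from this commit):

// Compile-time gating on the dictionary features declared in Cargo.toml.
// Function name and error message are illustrative only.
fn ipadic_enabled() -> bool {
    cfg!(feature = "lindera-ipadic")
}

#[cfg(not(any(
    feature = "lindera-ipadic",
    feature = "lindera-ipadic-neologd",
    feature = "lindera-unidic",
    feature = "lindera-ko-dic",
    feature = "lindera-cc-cedict",
)))]
compile_error!("enable at least one lindera dictionary feature");
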
@@ -318,11 +318,11 @@ bool tantivy_token_stream_advance(void *token_stream);

const char *tantivy_token_stream_get_token(void *token_stream);

RustResult tantivy_create_tokenizer(const char *analyzer_params);
RustResult tantivy_create_analyzer(const char *analyzer_params);

void *tantivy_clone_tokenizer(void *ptr);
void *tantivy_clone_analyzer(void *ptr);

void tantivy_free_tokenizer(void *tokenizer);
void tantivy_free_analyzer(void *tokenizer);

bool tantivy_index_exist(const char *path);

@@ -1,80 +1,19 @@
use log::warn;
use serde_json as json;
use std::collections::HashMap;
use tantivy::tokenizer::StopWordFilter;
use tantivy::tokenizer::*;
use serde_json as json;

use crate::error::Result;
use crate::error::TantivyBindingError;
use crate::jieba_tokenizer::JiebaTokenizer;
use crate::stop_words;
use crate::tokenizer_filter::*;
use crate::util::*;

// default build-in analyzer
pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
let builder = standard_builder().filter(LowerCaser);

if stop_words.len() > 0 {
return builder.filter(StopWordFilter::remove(stop_words)).build();
}

builder.build()
}

fn chinese_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
let builder = jieba_builder().filter(CnAlphaNumOnlyFilter);
if stop_words.len() > 0 {
return builder.filter(StopWordFilter::remove(stop_words)).build();
}

builder.build()
}
use crate::analyzer::{
build_in_analyzer::*,
tokenizers::get_builder_with_tokenizer,
filter::*,
util::*
};

fn english_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
let builder = standard_builder()
.filter(LowerCaser)
.filter(Stemmer::new(Language::English))
.filter(StopWordFilter::remove(
stop_words::ENGLISH.iter().map(|&word| word.to_owned()),
));

if stop_words.len() > 0 {
return builder.filter(StopWordFilter::remove(stop_words)).build();
}

builder.build()
}

fn standard_builder() -> TextAnalyzerBuilder {
TextAnalyzer::builder(SimpleTokenizer::default()).dynamic()
}

fn whitespace_builder() -> TextAnalyzerBuilder {
TextAnalyzer::builder(WhitespaceTokenizer::default()).dynamic()
}

fn jieba_builder() -> TextAnalyzerBuilder {
TextAnalyzer::builder(JiebaTokenizer::new()).dynamic()
}

fn get_builder_by_name(name: &String) -> Result<TextAnalyzerBuilder> {
match name.as_str() {
"standard" => Ok(standard_builder()),
"whitespace" => Ok(whitespace_builder()),
"jieba" => Ok(jieba_builder()),
other => {
warn!("unsupported tokenizer: {}", other);
Err(TantivyBindingError::InternalError(format!(
"unsupported tokenizer: {}",
other
)))
}
}
}

struct AnalyzerBuilder<'a> {
// builder: TextAnalyzerBuilder
filters: HashMap<String, SystemFilter>,
params: &'a json::Map<String, json::Value>,
}
@@ -87,20 +26,21 @@ impl AnalyzerBuilder<'_> {
}
}

fn get_tokenizer_name(&self) -> Result<String>{
fn get_tokenizer_params(&self) -> Result<&json::Value>{
let tokenizer=self.params.get("tokenizer");
if tokenizer.is_none(){
return Err(TantivyBindingError::InternalError(format!(
"tokenizer name or type must be set"
)));
}
if !tokenizer.unwrap().is_string() {
return Err(TantivyBindingError::InternalError(format!(
"tokenizer name should be string"
)));
let value = tokenizer.unwrap();
if value.is_object() || value.is_string() {
return Ok(tokenizer.unwrap())
}

Ok(tokenizer.unwrap().as_str().unwrap().to_string())
Err(TantivyBindingError::InternalError(format!(
"tokenizer name should be string or dict"
)))
}
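
The practical effect of replacing get_tokenizer_name with get_tokenizer_params is that the "tokenizer" field may now be either a plain name or an object carrying tokenizer-specific options. A small illustration (the object form is the one exercised by the new lindera test further down; "jieba" is one of the names the old get_builder_by_name accepted):

// Illustrative only: both shapes are accepted after this change.
fn example_tokenizer_params() -> (&'static str, &'static str) {
    let by_name = r#"{"tokenizer": "jieba"}"#;
    let with_params = r#"{"tokenizer": {"type": "lindera", "dict_kind": "ipadic"}}"#;
    (by_name, with_params)
}
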

fn add_custom_filter(
@@ -196,7 +136,7 @@ impl AnalyzerBuilder<'_> {
let str_list = get_string_list(value, "filter stop_words")?;
Ok(get_stop_words_list(str_list))
}
None => Ok(vec![]),
_ => Ok(vec![]),
}
}

@@ -227,16 +167,16 @@ impl AnalyzerBuilder<'_> {
};

//build custom analyzer
let tokenizer_name = self.get_tokenizer_name()?;
let mut builder = get_builder_by_name(&tokenizer_name)?;
let tokenizer_params = self.get_tokenizer_params()?;
let mut builder = get_builder_with_tokenizer(&tokenizer_params)?;

// build with option
builder = self.build_option(builder)?;
Ok(builder.build())
}
}

pub(crate) fn create_tokenizer_with_filter(params: &String) -> Result<TextAnalyzer> {
pub(crate) fn create_analyzer_with_filter(params: &String) -> Result<TextAnalyzer> {
match json::from_str::<json::Value>(&params) {
Ok(value) => {
if value.is_null() {
@@ -280,16 +220,16 @@ pub(crate) fn create_tokenizer_with_filter(params: &String) -> Result<TextAnalyz
}
}

pub(crate) fn create_tokenizer(params: &str) -> Result<TextAnalyzer> {
pub(crate) fn create_analyzer(params: &str) -> Result<TextAnalyzer> {
if params.len() == 0 {
return Ok(standard_analyzer(vec![]));
}
create_tokenizer_with_filter(&format!("{{\"analyzer\":{}}}", params))
create_analyzer_with_filter(&format!("{{\"analyzer\":{}}}", params))
}

#[cfg(test)]
mod tests {
use crate::tokenizer::create_tokenizer;
use crate::analyzer::analyzer::create_analyzer;

#[test]
fn test_standard_analyzer() {
"stop_words": ["_english_"]
}"#;

let tokenizer = create_tokenizer(&params.to_string());
let tokenizer = create_analyzer(&params.to_string());
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
}

"type": "chinese"
}"#;

let tokenizer = create_tokenizer(&params.to_string());
let tokenizer = create_analyzer(&params.to_string());
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
let mut bining = tokenizer.unwrap();
let mut stream = bining.token_stream("系统安全;,'';lxyz密码");

print!("test tokens :{:?}\n", results)
}

#[test]
fn test_lindera_analyzer() {
let params = r#"{
"tokenizer": {
"type": "lindera",
"dict_kind": "ipadic"
}
}"#;

let tokenizer = create_analyzer(&params.to_string());
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());

let mut bining = tokenizer.unwrap();
let mut stream = bining.token_stream("東京スカイツリーの最寄り駅はとうきょうスカイツリー駅です");

let mut results = Vec::<String>::new();
while stream.advance() {
let token = stream.token();
results.push(token.text.clone());
}

print!("test tokens :{:?}\n", results)
}
}
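
The lindera tokenizer selected by "type": "lindera" is added in another file of this commit that is not rendered in this excerpt. As a rough sketch only, assuming lindera 0.38's dictionary and segmenter API (DictionaryKind, load_dictionary_from_kind, Segmenter, Tokenizer); the actual adapter onto tantivy's tokenizer trait may differ:

use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
use lindera::mode::Mode;
use lindera::segmenter::Segmenter;
use lindera::tokenizer::Tokenizer as LinderaTokenizer;

// Rough sketch: map the "dict_kind" value from the analyzer params onto a
// lindera tokenizer. Error handling and the tantivy adapter are omitted.
fn build_lindera(dict_kind: &str) -> Result<LinderaTokenizer, Box<dyn std::error::Error>> {
    let kind = match dict_kind {
        "ipadic" => DictionaryKind::IPADIC,
        "ko-dic" => DictionaryKind::KoDic,
        "cc-cedict" => DictionaryKind::CcCedict,
        other => return Err(format!("unsupported dict_kind: {}", other).into()),
    };
    let dictionary = load_dictionary_from_kind(kind)?;
    let segmenter = Segmenter::new(Mode::Normal, dictionary, None);
    Ok(LinderaTokenizer::new(segmenter))
}
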
@@ -0,0 +1,40 @@
use tantivy::tokenizer::*;

use crate::analyzer::tokenizers::*;
use crate::analyzer::filter::*;
use crate::analyzer::stop_words;

// default build-in analyzer
pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
let builder = standard_builder().filter(LowerCaser);

if stop_words.len() > 0 {
return builder.filter(StopWordFilter::remove(stop_words)).build();
}

builder.build()
}

pub fn chinese_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
let builder = jieba_builder().filter(CnAlphaNumOnlyFilter);
if stop_words.len() > 0 {
return builder.filter(StopWordFilter::remove(stop_words)).build();
}

builder.build()
}

pub fn english_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
let builder = standard_builder()
.filter(LowerCaser)
.filter(Stemmer::new(Language::English))
.filter(StopWordFilter::remove(
stop_words::ENGLISH.iter().map(|&word| word.to_owned()),
));

if stop_words.len() > 0 {
return builder.filter(StopWordFilter::remove(stop_words)).build();
}

builder.build()
}
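
For reference, a minimal sketch of how these built-in analyzers are consumed; the token_stream / advance / token calls are tantivy's TextAnalyzer API, the same one used by the tests earlier in this diff:

use tantivy::tokenizer::TextAnalyzer;

// Illustrative usage of the built-in analyzers defined above.
fn collect_tokens(mut analyzer: TextAnalyzer, text: &str) -> Vec<String> {
    let mut stream = analyzer.token_stream(text);
    let mut tokens = Vec::new();
    while stream.advance() {
        tokens.push(stream.token().text.clone());
    }
    tokens
}

// e.g. collect_tokens(standard_analyzer(vec![]), "The quick brown fox jumps")
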
@@ -2,9 +2,8 @@ use regex;
use serde_json as json;
use tantivy::tokenizer::*;

use crate::error::Result;
use crate::error::TantivyBindingError;
use crate::util::*;
use crate::error::{Result,TantivyBindingError};
use crate::analyzer::util::*;

pub(crate) enum SystemFilter {
Invalid,
@@ -79,7 +78,7 @@ fn get_decompounder_filter(params: &json::Map<String, json::Value>) -> Result<Sy
for element in stop_words {
match element.as_str() {
Some(word) => str_list.push(word.to_string()),
None => {
_ => {
return Err(TantivyBindingError::InternalError(
"decompounder word list item should be string".to_string(),
))
@@ -114,12 +113,10 @@ fn get_stemmer_filter(params: &json::Map<String, json::Value>) -> Result<SystemF
}

trait LanguageParser {
type Error;
fn into_language(self) -> Result<Language>;
}

impl LanguageParser for &str {
type Error = TantivyBindingError;
fn into_language(self) -> Result<Language> {
match self.to_lowercase().as_str() {
"arabig" => Ok(Language::Arabic),
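
The filter changes above only touch error handling and the LanguageParser trait; the parameter shapes being parsed stay the same. For illustration, with key names inferred from the surrounding functions rather than confirmed by this hunk:

// Presumed inputs to get_decompounder_filter and get_stemmer_filter.
fn example_filter_params() -> (&'static str, &'static str) {
    let decompounder = r#"{"type": "decompounder", "word_list": ["data", "base"]}"#;
    let stemmer = r#"{"type": "stemmer", "language": "english"}"#;
    (decompounder, stemmer)
}
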
@@ -0,0 +1,9 @@
mod analyzer;
mod stop_words;
mod tokenizers;
mod build_in_analyzer;
mod filter;
mod util;

pub(crate) use self::analyzer::create_analyzer;
pub(crate) use self::build_in_analyzer::standard_analyzer;
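
These re-exports define the packed module's surface inside the crate: callers import create_analyzer and standard_analyzer from crate::analyzer instead of the old crate::tokenizer path. A small sketch of the resulting call site, matching the rename visible in the test imports above:

// Old path (removed): use crate::tokenizer::create_tokenizer;
use crate::analyzer::create_analyzer;

fn build_default_analyzer() -> tantivy::tokenizer::TextAnalyzer {
    // Empty params fall back to the standard analyzer, per create_analyzer above.
    create_analyzer("").expect("default analyzer should build")
}
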