From a33bc6b30061ca28c874a3c9d7f47e670054a92b Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Fri, 25 Oct 2024 17:43:23 +0900 Subject: [PATCH 1/5] add new line for the matrix size --- sudachi-cli/src/build.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sudachi-cli/src/build.rs b/sudachi-cli/src/build.rs index eea11ecf..eb2e716c 100644 --- a/sudachi-cli/src/build.rs +++ b/sudachi-cli/src/build.rs @@ -210,7 +210,7 @@ fn dump_pos(grammar: &Grammar, w: &mut W) { fn dump_matrix(grammar: &Grammar, w: &mut W) { let conn = grammar.conn_matrix(); - write!(w, "{} {}", conn.num_left(), conn.num_right()).unwrap(); + write!(w, "{} {}\n", conn.num_left(), conn.num_right()).unwrap(); for left in 0..conn.num_left() { for right in 0..conn.num_right() { From 848e637fc271735e3ede206a84d03b776722b708 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 28 Oct 2024 16:03:29 +0900 Subject: [PATCH 2/5] dump pos_id --- sudachi-cli/src/build.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sudachi-cli/src/build.rs b/sudachi-cli/src/build.rs index eb2e716c..62248809 100644 --- a/sudachi-cli/src/build.rs +++ b/sudachi-cli/src/build.rs @@ -196,7 +196,8 @@ fn dump_part(dict: PathBuf, part: String, output: PathBuf) { } fn dump_pos(grammar: &Grammar, w: &mut W) { - for p in grammar.pos_list.iter() { + for (id, p) in grammar.pos_list.iter().enumerate() { + write!(w, "{},", id).unwrap(); for (i, e) in p.iter().enumerate() { w.write_all(e.as_bytes()).unwrap(); if (i + 1) == p.len() { From ec9a0f4dff64d7a6792504ef7e37ebcf57bdc978 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 28 Oct 2024 17:16:55 +0900 Subject: [PATCH 3/5] dump winfo in lexicon format --- sudachi-cli/src/build.rs | 97 +++++++++++++++++++++++++++++++++------- 1 file changed, 80 insertions(+), 17 deletions(-) diff --git a/sudachi-cli/src/build.rs b/sudachi-cli/src/build.rs index 62248809..ec7f4620 100644 --- a/sudachi-cli/src/build.rs +++ b/sudachi-cli/src/build.rs @@ -27,6 +27,7 @@ use sudachi::dic::build::report::DictPartReport; use sudachi::dic::build::DictBuilder; use sudachi::dic::dictionary::JapaneseDictionary; use sudachi::dic::grammar::Grammar; +use sudachi::dic::lexicon::word_infos::WordInfo; use sudachi::dic::lexicon_set::LexiconSet; use sudachi::dic::word_id::WordId; use sudachi::dic::DictionaryLoader; @@ -79,6 +80,7 @@ pub(crate) enum BuildCli { dict: PathBuf, part: String, output: PathBuf, + // todo: dump user dict }, } @@ -189,7 +191,7 @@ fn dump_part(dict: PathBuf, part: String, output: PathBuf) { match part.as_str() { "pos" => dump_pos(dict.grammar(), &mut writer), "matrix" => dump_matrix(dict.grammar(), &mut writer), - "winfo" => dump_word_info(dict.lexicon(), &mut writer).unwrap(), + "winfo" => dump_word_info(&dict, &mut writer).unwrap(), _ => unimplemented!(), } writer.flush().unwrap(); @@ -221,23 +223,28 @@ fn dump_matrix(grammar: &Grammar, w: &mut W) { } } -fn dump_word_info(lex: &LexiconSet, w: &mut W) -> SudachiResult<()> { +fn dump_word_info(dict: &dyn DictionaryAccess, w: &mut W) -> SudachiResult<()> { + let grammar = dict.grammar(); + let lex = dict.lexicon(); let size = lex.size(); for i in 0..size { let wid = WordId::checked(0, i)?; let (left, right, cost) = lex.get_word_param(wid); let winfo = lex.get_word_info(wid)?; + write!(w, "{},", unicode_escape(winfo.surface()))?; write!(w, "{},{},{},", left, right, cost)?; - write!(w, "{},", winfo.surface())?; - write!(w, "{},", winfo.head_word_length())?; - write!(w, "{},", winfo.normalized_form())?; - write!(w, "{},", winfo.dictionary_form_word_id())?; - write!(w, "{},", winfo.reading_form())?; - dump_wids(w, winfo.a_unit_split())?; + write!(w, "{},", unicode_escape(winfo.surface()))?; // writing + write!(w, "{},", pos_string(grammar, winfo.pos_id()))?; + write!(w, "{},", unicode_escape(winfo.reading_form()))?; + write!(w, "{},", unicode_escape(winfo.normalized_form()))?; + let dict_form = dictionary_form_string(grammar, lex, winfo.dictionary_form_word_id()); + write!(w, "{},", dict_form)?; + write!(w, "{},", split_mode(&winfo))?; + dump_wids(w, grammar, lex, winfo.a_unit_split())?; w.write_all(b",")?; - dump_wids(w, winfo.b_unit_split())?; + dump_wids(w, grammar, lex, winfo.b_unit_split())?; w.write_all(b",")?; - dump_wids(w, winfo.word_structure())?; + dump_wids(w, grammar, lex, winfo.word_structure())?; w.write_all(b",")?; dump_gids(w, winfo.synonym_group_ids())?; w.write_all(b"\n")?; @@ -245,23 +252,79 @@ fn dump_word_info(lex: &LexiconSet, w: &mut W) -> SudachiResult<()> { Ok(()) } -fn dump_wids(w: &mut W, data: &[WordId]) -> SudachiResult<()> { +fn unicode_escape(raw: &str) -> String { + // replace '"' and ',' + let escaped = raw + .to_string() + .replace("\"", "\\u0022") + .replace(",", "\\u002c"); + escaped +} + +fn split_mode(winfo: &WordInfo) -> &str { + // todo: check + let asplits = winfo.a_unit_split(); + if asplits.len() == 0 { + return "A"; + } + let bsplits = winfo.b_unit_split(); + if bsplits.len() == 0 { + return "B"; + } + return "C"; +} + +fn pos_string(grammar: &Grammar, posid: u16) -> String { + let pos_parts = grammar.pos_components(posid); + pos_parts.join(",") +} + +fn dictionary_form_string(grammar: &Grammar, lex: &LexiconSet, wid: i32) -> String { + if wid < 0 { + return "*".to_string(); + } + let wid_with_dic = WordId::checked(0, wid as u32).expect("invalid wordid"); + format!("\"{}\"", wordref_string(grammar, lex, &wid_with_dic)) +} + +fn wordref_string(grammar: &Grammar, lex: &LexiconSet, wid: &WordId) -> String { + let winfo = lex.get_word_info(*wid).expect("failed to get wordinfo"); + format!( + "{},{},{}", + unicode_escape(winfo.surface()), + pos_string(grammar, winfo.pos_id()), + unicode_escape(winfo.reading_form()), + ) +} + +fn dump_wids( + w: &mut W, + grammar: &Grammar, + lex: &LexiconSet, + data: &[WordId], +) -> SudachiResult<()> { + if data.len() == 0 { + write!(w, "*")?; + return Ok(()); + } + w.write_all(b"\"")?; for (i, e) in data.iter().enumerate() { - let prefix = match e.dic() { - 0 => "", - _ => "U", - }; - write!(w, "{}{}", prefix, e.word())?; + write!(w, "{}", wordref_string(grammar, lex, e))?; if i + 1 != data.len() { w.write_all(b"/")?; } } + w.write_all(b"\"")?; Ok(()) } fn dump_gids(w: &mut W, data: &[u32]) -> SudachiResult<()> { + if data.len() == 0 { + write!(w, "*")?; + return Ok(()); + } for (i, e) in data.iter().enumerate() { - write!(w, "{}", e)?; + write!(w, "{:06}", e)?; if i + 1 != data.len() { w.write_all(b"/")?; } From d11cafc56f675345156c6cb5d98854bed7bc4e7b Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Tue, 29 Oct 2024 16:46:54 +0900 Subject: [PATCH 4/5] dump user dict --- sudachi-cli/src/build.rs | 110 ++++++++++++++++++++++++++++++--------- 1 file changed, 86 insertions(+), 24 deletions(-) diff --git a/sudachi-cli/src/build.rs b/sudachi-cli/src/build.rs index ec7f4620..f1d1dc27 100644 --- a/sudachi-cli/src/build.rs +++ b/sudachi-cli/src/build.rs @@ -27,6 +27,7 @@ use sudachi::dic::build::report::DictPartReport; use sudachi::dic::build::DictBuilder; use sudachi::dic::dictionary::JapaneseDictionary; use sudachi::dic::grammar::Grammar; +use sudachi::dic::header::HeaderVersion; use sudachi::dic::lexicon::word_infos::WordInfo; use sudachi::dic::lexicon_set::LexiconSet; use sudachi::dic::word_id::WordId; @@ -77,10 +78,17 @@ pub(crate) enum BuildCli { #[command(name = "dump")] Dump { - dict: PathBuf, + /// target dictionary to dump + dictionary: PathBuf, + /// dump target (matrix, pos, winfo) part: String, + /// output file output: PathBuf, - // todo: dump user dict + + /// reference system dictionary. + /// required to dump winfo of an user dictionary + #[arg(short = 's', long = "system")] + system: Option, }, } @@ -103,7 +111,12 @@ pub fn build_main(subcommand: BuildCli) { match subcommand { BuildCli::System { common, matrix } => build_system(common, matrix), BuildCli::User { common, dictionary } => build_user(common, dictionary), - BuildCli::Dump { dict, part, output } => dump_part(dict, part, output), + BuildCli::Dump { + dictionary, + part, + output, + system, + } => dump_part(dictionary, system, part, output), } } @@ -178,26 +191,30 @@ fn output_file(p: &Path) -> File { .unwrap_or_else(|e| panic!("failed to open {:?} for writing:\n{:?}", p, e)) } -fn dump_part(dict: PathBuf, part: String, output: PathBuf) { - let file = File::open(&dict).expect("open failed"); - let data = unsafe { Mmap::map(&file) }.expect("mmap failed"); +fn dump_part(dict: PathBuf, system: Option, part: String, output: PathBuf) { + let file = File::open(&dict).expect("open dict failed"); + let data = unsafe { Mmap::map(&file) }.expect("mmap dict failed"); let loader = unsafe { DictionaryLoader::read_any_dictionary(&data) }.expect("failed to load dictionary"); - let dict = loader.to_loaded().expect("should contain grammar"); let outf = output_file(&output); let mut writer = BufWriter::new(outf); match part.as_str() { - "pos" => dump_pos(dict.grammar(), &mut writer), - "matrix" => dump_matrix(dict.grammar(), &mut writer), - "winfo" => dump_word_info(&dict, &mut writer).unwrap(), + "pos" => dump_pos(loader, &mut writer), + "matrix" => dump_matrix(loader, &mut writer), + "winfo" => dump_word_info(loader, system, &mut writer).unwrap(), _ => unimplemented!(), } writer.flush().unwrap(); } -fn dump_pos(grammar: &Grammar, w: &mut W) { +fn dump_pos(dict: DictionaryLoader, w: &mut W) { + let dict = dict + .to_loaded() + .expect("target dict should contain grammar"); + let grammar = dict.grammar(); + for (id, p) in grammar.pos_list.iter().enumerate() { write!(w, "{},", id).unwrap(); for (i, e) in p.iter().enumerate() { @@ -211,10 +228,18 @@ fn dump_pos(grammar: &Grammar, w: &mut W) { } } -fn dump_matrix(grammar: &Grammar, w: &mut W) { +fn dump_matrix(dict: DictionaryLoader, w: &mut W) { + if let HeaderVersion::UserDict(_) = dict.header.version { + panic!("user dictionary does not have connection matrix.") + } + + let dict = dict + .to_loaded() + .expect("target dict should contain grammar"); + let grammar = dict.grammar(); let conn = grammar.conn_matrix(); - write!(w, "{} {}\n", conn.num_left(), conn.num_right()).unwrap(); + write!(w, "{} {}\n", conn.num_left(), conn.num_right()).unwrap(); for left in 0..conn.num_left() { for right in 0..conn.num_right() { let cost = conn.cost(left as _, right as _); @@ -223,28 +248,66 @@ fn dump_matrix(grammar: &Grammar, w: &mut W) { } } -fn dump_word_info(dict: &dyn DictionaryAccess, w: &mut W) -> SudachiResult<()> { - let grammar = dict.grammar(); - let lex = dict.lexicon(); - let size = lex.size(); +fn dump_word_info( + dict: DictionaryLoader, + system: Option, + w: &mut W, +) -> SudachiResult<()> { + let is_user = match dict.header.version { + HeaderVersion::UserDict(_) => true, + HeaderVersion::SystemDict(_) => false, + }; + let did = if is_user { 1 } else { 0 }; + let size = dict.lexicon.size(); + + let data = system.map(|system_path| { + let file = File::open(&system_path).expect("open system failed"); + unsafe { Mmap::map(&file) }.expect("mmap system failed") + }); + let system = data.as_ref().map(|data| { + let loader = DictionaryLoader::read_system_dictionary(data) + .expect("failed to load system dictionary"); + loader + .to_loaded() + .expect("failed to load system dictionary") + }); + + let (base, user) = if is_user { + ( + system.expect("system dictionary is required to dump user dictionary lexicon"), + Some(dict), + ) + } else { + (dict.to_loaded().expect("failed to load dictionary"), None) + }; + + let mut lex = base.lexicon_set; + let mut grammar = base.grammar; + if let Some(udic) = user { + lex.append(udic.lexicon, grammar.pos_list.len())?; + if let Some(g) = udic.grammar { + grammar.merge(g) + } + } + for i in 0..size { - let wid = WordId::checked(0, i)?; + let wid = WordId::checked(did, i)?; let (left, right, cost) = lex.get_word_param(wid); let winfo = lex.get_word_info(wid)?; write!(w, "{},", unicode_escape(winfo.surface()))?; write!(w, "{},{},{},", left, right, cost)?; write!(w, "{},", unicode_escape(winfo.surface()))?; // writing - write!(w, "{},", pos_string(grammar, winfo.pos_id()))?; + write!(w, "{},", pos_string(&grammar, winfo.pos_id()))?; write!(w, "{},", unicode_escape(winfo.reading_form()))?; write!(w, "{},", unicode_escape(winfo.normalized_form()))?; - let dict_form = dictionary_form_string(grammar, lex, winfo.dictionary_form_word_id()); + let dict_form = dictionary_form_string(&grammar, &lex, winfo.dictionary_form_word_id()); write!(w, "{},", dict_form)?; write!(w, "{},", split_mode(&winfo))?; - dump_wids(w, grammar, lex, winfo.a_unit_split())?; + dump_wids(w, &grammar, &lex, winfo.a_unit_split())?; w.write_all(b",")?; - dump_wids(w, grammar, lex, winfo.b_unit_split())?; + dump_wids(w, &grammar, &lex, winfo.b_unit_split())?; w.write_all(b",")?; - dump_wids(w, grammar, lex, winfo.word_structure())?; + dump_wids(w, &grammar, &lex, winfo.word_structure())?; w.write_all(b",")?; dump_gids(w, winfo.synonym_group_ids())?; w.write_all(b"\n")?; @@ -262,7 +325,6 @@ fn unicode_escape(raw: &str) -> String { } fn split_mode(winfo: &WordInfo) -> &str { - // todo: check let asplits = winfo.a_unit_split(); if asplits.len() == 0 { return "A"; From ecddc0beb899fb4cfcb9e2d36c45d6372f4ad90d Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Tue, 29 Oct 2024 16:51:53 +0900 Subject: [PATCH 5/5] fix clippy warnings --- sudachi-cli/src/build.rs | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/sudachi-cli/src/build.rs b/sudachi-cli/src/build.rs index f1d1dc27..dbb03444 100644 --- a/sudachi-cli/src/build.rs +++ b/sudachi-cli/src/build.rs @@ -187,12 +187,12 @@ fn output_file(p: &Path) -> File { OpenOptions::new() .write(true) .create_new(true) - .open(&p) + .open(p) .unwrap_or_else(|e| panic!("failed to open {:?} for writing:\n{:?}", p, e)) } fn dump_part(dict: PathBuf, system: Option, part: String, output: PathBuf) { - let file = File::open(&dict).expect("open dict failed"); + let file = File::open(dict).expect("open dict failed"); let data = unsafe { Mmap::map(&file) }.expect("mmap dict failed"); let loader = unsafe { DictionaryLoader::read_any_dictionary(&data) }.expect("failed to load dictionary"); @@ -239,11 +239,11 @@ fn dump_matrix(dict: DictionaryLoader, w: &mut W) { let grammar = dict.grammar(); let conn = grammar.conn_matrix(); - write!(w, "{} {}\n", conn.num_left(), conn.num_right()).unwrap(); + writeln!(w, "{} {}", conn.num_left(), conn.num_right()).unwrap(); for left in 0..conn.num_left() { for right in 0..conn.num_right() { let cost = conn.cost(left as _, right as _); - write!(w, "{} {} {}\n", left, right, cost).unwrap(); + writeln!(w, "{} {} {}", left, right, cost).unwrap(); } } } @@ -261,7 +261,7 @@ fn dump_word_info( let size = dict.lexicon.size(); let data = system.map(|system_path| { - let file = File::open(&system_path).expect("open system failed"); + let file = File::open(system_path).expect("open system failed"); unsafe { Mmap::map(&file) }.expect("mmap system failed") }); let system = data.as_ref().map(|data| { @@ -317,23 +317,21 @@ fn dump_word_info( fn unicode_escape(raw: &str) -> String { // replace '"' and ',' - let escaped = raw - .to_string() - .replace("\"", "\\u0022") - .replace(",", "\\u002c"); - escaped + raw.to_string() + .replace('"', "\\u0022") + .replace(',', "\\u002c") } fn split_mode(winfo: &WordInfo) -> &str { let asplits = winfo.a_unit_split(); - if asplits.len() == 0 { + if asplits.is_empty() { return "A"; } let bsplits = winfo.b_unit_split(); - if bsplits.len() == 0 { + if bsplits.is_empty() { return "B"; } - return "C"; + "C" } fn pos_string(grammar: &Grammar, posid: u16) -> String { @@ -365,7 +363,7 @@ fn dump_wids( lex: &LexiconSet, data: &[WordId], ) -> SudachiResult<()> { - if data.len() == 0 { + if data.is_empty() { write!(w, "*")?; return Ok(()); } @@ -381,7 +379,7 @@ fn dump_wids( } fn dump_gids(w: &mut W, data: &[u32]) -> SudachiResult<()> { - if data.len() == 0 { + if data.is_empty() { write!(w, "*")?; return Ok(()); }