Skip to content

Commit

Permalink
Import terms from Ethereum.org Translations' Glossary (#20)
Browse files Browse the repository at this point in the history
Co-authored-by: Sloth Service <[email protected]>
  • Loading branch information
ChihChengLiang and slothservice authored Jan 23, 2024
1 parent b6473d6 commit 569e358
Show file tree
Hide file tree
Showing 9 changed files with 1,220 additions and 26 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/format.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,4 @@ jobs:
- name: Check term sorting
run: |
cargo build
cargo run -- --check
cargo run --bin terms -- --check
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
/target
index.html
build/index.html
build/index.html
*.tbx
88 changes: 86 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@ edition = "2021"
[dependencies]
serde = { version="1.0.164", features=["derive"]}
toml = "0.7.4"
serde-xml-rs = "0.5"
regex = "1.9.6"
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Run the main script
.PHONY: run
run:
cargo run --release
cargo run --release --bin terms


# Default target
Expand Down
120 changes: 120 additions & 0 deletions src/bin/parse_tbx.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
use std::{
fs::{self, File},
io::Write,
};

use regex::Regex;
use serde::Deserialize;
use serde_xml_rs::from_str;
use terms::terms::{Term, Terms};

/// Deserialization target for one `langSet` element of a term entry.
///
/// `main` looks these up by `lang` to pair the `"en"` term with its `"zh-TW"`
/// counterpart.
#[derive(Debug, Deserialize)]
struct LangSet {
    /// Language code, deserialized under the name `xmllang`. `main` rewrites
    /// every `xml:lang` in the document to `xmllang` before parsing, so this
    /// is presumably the element's `xml:lang` attribute.
    #[serde(rename = "xmllang")]
    lang: String,
    /// The nested `tig` element; the field name already matches the XML name.
    tig: Tig,
}

/// Deserialization target for the `tig` element nested in a [`LangSet`].
#[derive(Debug, Deserialize)]
struct Tig {
    /// Content of the `term` child; the field name already matches the XML name.
    term: String,
    /// Optional `termNote` child. It is deserialized but never read by this
    /// program, hence the leading underscore.
    #[serde(rename = "termNote")]
    _term_note: Option<TermNote>,
}

/// Deserialization target for the optional `termNote` child of a [`Tig`].
///
/// Nothing in this file reads it; it only describes the shape the parser
/// should accept.
#[derive(Debug, Deserialize)]
struct TermNote {
// Mapped from the XML name `type`; the underscore prefix marks the field as
// intentionally unused.
#[serde(rename = "type")]
_note_type: String,
}

/// Imports glossary translations from a TBX export into `terms.toml`.
///
/// Steps:
/// 1. Read `Ethereum.org_Translationss_Glossary.tbx` from the working directory.
/// 2. Rewrite every `xml:lang` to `xmllang`, the name the deserialization
///    structs in this file are mapped to.
/// 3. For each `termEntry` that has an `en` language set: append a [`Term`]
///    to the loaded terms when a `zh-TW` language set exists; otherwise log
///    the English term to stdout and to `untranslated.txt`.
/// 4. Sort the terms and write them back to `terms.toml`.
///
/// If the TBX file cannot be read or parsed, the error is printed to stderr
/// and the process exits with status 1 *before* `terms.toml` or
/// `untranslated.txt` is touched, so a failed import leaves both files alone.
///
/// NOTE(review): every matching entry is appended unconditionally; whether a
/// second run produces duplicates depends on `Terms`, which is not visible
/// here — confirm.
///
/// # Panics
///
/// Panics if `terms.toml` cannot be loaded or written, or if
/// `untranslated.txt` cannot be created or written.
fn main() {
    let tbx_path = "Ethereum.org_Translationss_Glossary.tbx";

    // Read the whole export up front; nothing else is worth doing without it.
    let raw_xml = match fs::read_to_string(tbx_path) {
        Ok(content) => content,
        Err(err) => {
            eprintln!("Error reading the XML file: {:?}", err);
            std::process::exit(1);
        }
    };

    // The structs below are mapped to `xmllang`, so rename `xml:lang`
    // throughout the document before handing it to the parser.
    let lang_attr = Regex::new(r#"xml:lang"#).expect("literal pattern is a valid regex");
    let xml = lang_attr.replace_all(&raw_xml, "xmllang");

    // Parse before opening any output file so a malformed export cannot
    // truncate `untranslated.txt` or rewrite `terms.toml`.
    let data: martif = match from_str(&xml) {
        Ok(data) => data,
        Err(e) => {
            eprintln!("Error parsing XML: {:?}", e);
            std::process::exit(1);
        }
    };

    let terms_path = "terms.toml";
    let mut terms = Terms::load_terms(terms_path).expect("failed to load terms.toml");

    let mut untranslated =
        File::create("untranslated.txt").expect("failed to create untranslated.txt");

    for entry in &data.text.body.term_entry {
        let en = entry.lang_set.iter().find(|lang_set| lang_set.lang == "en");
        let zh_tw = entry
            .lang_set
            .iter()
            .find(|lang_set| lang_set.lang == "zh-TW");

        match (en, zh_tw) {
            // Both sides present: record the translation.
            (Some(en), Some(zh_tw)) => terms.terms.push(Term {
                term: en.tig.term.clone(),
                tags: vec![],
                translation: zh_tw.tig.term.clone(),
            }),
            // English term without a zh-TW counterpart: report it.
            (Some(en), None) => {
                let line = format!("untranslated: {}", en.tig.term);
                println!("{}", line);
                writeln!(untranslated, "{}", line)
                    .expect("failed to write to untranslated.txt");
            }
            // No English source term: nothing to key the entry on, skip it.
            (None, _) => {}
        }
    }

    terms.sort_terms();
    terms
        .to_file(terms_path)
        .expect("failed to write terms.toml");
}

/// Top-level deserialization target that `main` parses the whole document into.
///
/// The type name is lowercase, presumably to mirror the document's root
/// element name, which is why the naming lint is silenced.
#[derive(Debug, Deserialize)]
#[allow(non_camel_case_types)]
struct martif {
    /// Document-level language value, deserialized under the name `xmllang`
    /// (see the `xml:lang` rewrite in `main`). Never read, hence the underscore.
    #[serde(rename = "xmllang")]
    _lang: String,
    /// The `text` child; the field name already matches the XML name.
    text: Text,
}

/// Deserialization target for the `text` element under the document root.
#[derive(Debug, Deserialize)]
struct Text {
    /// The `body` child; the field name already matches the XML name.
    body: Body,
}

/// Deserialization target for the `body` element: the list of glossary entries.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct Body {
    /// Every `termEntry` child (the camelCase form of `term_entry`), in the
    /// order the parser yields them.
    term_entry: Vec<TermEntry>,
}

/// Deserialization target for one `termEntry`: a glossary concept with one
/// [`LangSet`] per language it is available in.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct TermEntry {
    /// Every `langSet` child (the camelCase form of `lang_set`).
    lang_set: Vec<LangSet>,
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pub mod terms;
Loading

0 comments on commit 569e358

Please sign in to comment.