Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Import terms from Ethereum.org Translations' Glossary #20

Merged
merged 8 commits into from
Jan 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/format.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,4 @@ jobs:
- name: Check term sorting
run: |
cargo build
cargo run -- --check
cargo run --bin terms -- --check
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
/target
index.html
build/index.html
build/index.html
*.tbx
88 changes: 86 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@ edition = "2021"
[dependencies]
serde = { version="1.0.164", features=["derive"]}
toml = "0.7.4"
serde-xml-rs = "0.5"
regex = "1.9.6"
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Run the main script
.PHONY: run
run:
cargo run --release
cargo run --release --bin terms


# Default target
Expand Down
120 changes: 120 additions & 0 deletions src/bin/parse_tbx.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
use std::{
fs::{self, File},
io::Write,
};

use regex::Regex;
use serde::Deserialize;
use serde_xml_rs::from_str;
use terms::terms::{Term, Terms};

/// One `langSet` element of a term entry: a language code plus the term
/// data recorded for that language.
#[derive(Debug, Deserialize)]
struct LangSet {
    /// Language code such as `en` or `zh-TW`. Read from `xmllang`, which is
    /// what `main` rewrites `xml:lang` to before deserializing.
    #[serde(rename = "xmllang")]
    lang: String,
    /// The nested `tig` element holding the term text for this language.
    tig: Tig,
}

/// A `tig` element: the term text and an optional `termNote`.
#[derive(Debug, Deserialize)]
struct Tig {
    /// Text of the `term` child element.
    term: String,
    /// Optional `termNote` child; deserialized but not read by this binary
    /// (hence the leading underscore).
    #[serde(rename = "termNote")]
    _term_note: Option<TermNote>,
}

/// A `termNote` element attached to a `tig`.
///
/// Only its `type` value is captured, and nothing in this file reads it.
#[derive(Debug, Deserialize)]
struct TermNote {
// `type` is a Rust keyword, so the value is mapped onto a differently named
// field; the leading underscore marks it as intentionally unused.
#[serde(rename = "type")]
_note_type: String,
}

/// Imports English → zh-TW term pairs from a TBX glossary export into `terms.toml`.
///
/// Steps:
/// 1. Read the TBX file and rewrite every `xml:lang` to `xmllang`, the name
///    the deserialization structs in this file are mapped to.
/// 2. Deserialize the document into [`martif`].
/// 3. For every term entry that has an `en` langSet: append a new `Term` when
///    a `zh-TW` langSet exists, otherwise report the English term on stdout
///    and in `untranslated.txt`.
/// 4. Sort the terms and write them back to `terms.toml`.
///
/// A failure to read or parse the TBX file is reported on stderr and aborts
/// the import *before* any output file is created or rewritten.
///
/// # Panics
///
/// Panics if `terms.toml` cannot be loaded or written, or if
/// `untranslated.txt` cannot be created or written.
fn main() {
    // TBX export to import from.
    let file_path = "Ethereum.org_Translationss_Glossary.tbx";

    // Read the XML data from the file into a string.
    let mut xml_str = match fs::read_to_string(file_path) {
        Ok(content) => content,
        Err(err) => {
            eprintln!("Error reading the XML file: {:?}", err);
            return;
        }
    };

    // Replace 'xml:lang' with 'xmllang' throughout the XML string so it
    // matches the `xmllang` renames on the structs below.
    let pattern = r#"xml:lang"#;
    let regex = Regex::new(pattern).expect("literal pattern is a valid regex");
    xml_str = regex.replace_all(&xml_str, "xmllang").to_string();

    // Bail out on a parse failure, mirroring the read-failure path above, so
    // that a broken input never truncates `untranslated.txt` or rewrites
    // `terms.toml`.
    let parsed: Result<martif, serde_xml_rs::Error> = from_str(&xml_str);
    let data = match parsed {
        Ok(data) => data,
        Err(e) => {
            eprintln!("Error parsing XML: {:?}", e);
            return;
        }
    };

    let path = "terms.toml";
    let mut terms = Terms::load_terms(path).expect("failed to load terms.toml");

    let file_path = "untranslated.txt";
    let mut untranslated = File::create(file_path).expect("failed to create untranslated.txt");

    for entry in data.text.body.term_entry.iter() {
        let en = entry.lang_set.iter().find(|lang_set| lang_set.lang == "en");
        let zh_tw = entry
            .lang_set
            .iter()
            .find(|lang_set| lang_set.lang == "zh-TW");

        match (en, zh_tw) {
            // Both languages present: record the pair as a new term.
            (Some(en), Some(zh_tw)) => {
                terms.terms.push(Term {
                    term: en.tig.term.to_string(),
                    tags: vec![],
                    translation: zh_tw.tig.term.to_string(),
                });
            }
            // English only: report it so it can be translated later.
            (Some(en), None) => {
                let line = format!("untranslated: {}", en.tig.term);
                println!("{}", line);
                writeln!(untranslated, "{}", line).expect("failed to write untranslated.txt");
            }
            // Entries without an English term are skipped.
            (None, _) => {}
        }
    }

    terms.sort_terms();
    terms.to_file(path).expect("failed to write terms.toml");
}

/// Top-level value the whole TBX document is deserialized into.
// The lowercase type name presumably mirrors the document's root element;
// the lint is silenced rather than renaming the type.
#[allow(non_camel_case_types)]
#[derive(Debug, Deserialize)]
struct martif {
    /// Document-level language (`xmllang`, i.e. `xml:lang` after `main`'s
    /// rewrite). Required for deserialization but otherwise unused.
    #[serde(rename = "xmllang")]
    _lang: String,
    /// The `text` element wrapping the glossary body.
    text: Text,
}

/// The `text` element; only its `body` child is deserialized.
#[derive(Debug, Deserialize)]
struct Text {
    /// The `body` element containing all term entries.
    body: Body,
}

/// The `body` element: the list of glossary entries.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct Body {
    /// Every `termEntry` child, in document order.
    term_entry: Vec<TermEntry>,
}

/// A single `termEntry`: one concept with one `langSet` per language.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct TermEntry {
    /// Every `langSet` child of this entry.
    lang_set: Vec<LangSet>,
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pub mod terms;
Loading
Loading