diff --git a/Cargo.toml b/Cargo.toml index 588279e..aaeeaca 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,6 +22,7 @@ name = "dialogflow" anyhow = "1.0" axum = {version = "0.7", features = ["query", "tokio", "macros", "multipart"]} bigdecimal = "0.4" +# bytes = "1.9" # candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.5.1" } candle = { version = "0.8", package = "candle-core", default-features = false } # candle = { git = "https://github.com/huggingface/candle.git", package = "candle-core", default-features = false } @@ -74,11 +75,13 @@ lettre = { version = "0.11", features = ["tokio1", "smtp-transport", "tokio1-nat unicase = "2.8.0" sqlx = { version = "0.8", default-features = false, features = ["runtime-tokio", "sqlite", "macros"] } lopdf = "0.34.0" -docx-rs = "0.4.17" +# docx-rs = "0.4.17" # lancedb = "0.13.0" libsqlite3-sys = { version = "0.30", features = ["bundled"] } sqlite-vec = "0.1.6" validator = "0.19.0" +zip = "2.2" +quick-xml = "0.37" # triple_accel = "0.4.0" [build-dependencies] diff --git a/src/kb/crud.rs b/src/kb/crud.rs index 12282d6..0f1d161 100644 --- a/src/kb/crud.rs +++ b/src/kb/crud.rs @@ -7,7 +7,7 @@ use axum::{ Json, }; -use super::doc; +use super::docx; use super::dto::QuestionAnswerPair; use crate::result::{Error, Result}; use crate::robot::dto::RobotQuery; @@ -62,7 +62,7 @@ async fn upload_doc_inner(robot_id: &str, mut multipart: Multipart) -> Result<() data.len() ); - let text = doc::parse_docx(data.to_vec())?; + let text = docx::parse_docx(data.to_vec())?; log::info!("Extract text: {text}"); } } diff --git a/src/kb/doc.rs b/src/kb/docx.rs similarity index 56% rename from src/kb/doc.rs rename to src/kb/docx.rs index 053e9bc..2877342 100644 --- a/src/kb/doc.rs +++ b/src/kb/docx.rs @@ -4,12 +4,15 @@ use core::time::Duration; // use std::io::Read; // use std::path::Path; use std::fs::OpenOptions; +use std::io::{BufReader, Cursor, Read}; use std::sync::OnceLock; use std::vec::Vec; -use docx_rs::read_docx; use futures_util::StreamExt; +use quick_xml::events::Event; +use quick_xml::Reader; use sqlx::{pool::PoolOptions, Row, Sqlite}; +use zip::ZipArchive; use crate::result::{Error, Result}; @@ -79,53 +82,37 @@ pub(crate) async fn init_tables(robot_id: &str) -> Result<()> { Ok(()) } -pub(super) fn parse_docx(buf: Vec) -> Result { +pub(super) fn parse_docx(b: Vec) -> Result { // let mut file = File::open("./numbering.docx")?; // let mut buf = Vec::with_capacity(3096); // file.read_to_end(&mut buf)?; let mut doc_text = String::with_capacity(3096); - let docx = read_docx(&buf)?; - let doc = docx.document; - for d in doc.children.iter() { - match d { - docx_rs::DocumentChild::Paragraph(paragraph) => { - for p in paragraph.children() { - match p { - docx_rs::ParagraphChild::Run(run) => { - for r in run.children.iter() { - match r { - docx_rs::RunChild::Text(text) => { - log::info!("Docx text={}", text.text); - doc_text.push_str(&text.text); - // doc_text.push('\n'); - // doc_text.push('\n'); - } - docx_rs::RunChild::Sym(sym) => { - log::info!("meet sym"); - doc_text.push_str(&sym.char); - } - docx_rs::RunChild::Break(_) => { - log::info!("meet break"); - doc_text.push('\n'); - } - docx_rs::RunChild::Tab(_) => { - log::info!("meet tab"); - doc_text.push('\n'); - } - _ => {} - } - } - } - docx_rs::ParagraphChild::Hyperlink(hyperlink) => { - log::info!("hyperlink: {:?}", hyperlink.link) - } - _ => {} - } - } + let reader = Cursor::new(b); + let mut archive = ZipArchive::new(reader)?; + let mut zip_file = archive.by_name("word/document.xml")?; + let mut cache = String::with_capacity(zip_file.size() as usize); + zip_file.read_to_string(&mut cache)?; + + // 创建 XML 解析器 + let mut reader = Reader::from_str(&cache); + reader.config_mut().trim_text(false); + let mut in_paragraph = false; + + // 读取 XML 内容 + loop { + match reader.read_event() { + Ok(Event::Start(ref e)) if e.name().as_ref() == b"w:p" => in_paragraph = true, + Ok(Event::End(ref e)) if e.name().as_ref() == b"w:p" => { + doc_text.push('\n'); + in_paragraph = false; + } + Ok(Event::Empty(ref e)) if e.name().as_ref() == b"w:p" => doc_text.push('\n'), + Ok(Event::Text(e)) if in_paragraph => { + doc_text.push_str(&e.unescape()?); } - docx_rs::DocumentChild::Table(_table) => {} - docx_rs::DocumentChild::TableOfContents(_table_of_contents) => {} - _ => {} + Ok(Event::Eof) => break, + Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e), + _ => (), } } Ok(doc_text) diff --git a/src/kb/mod.rs b/src/kb/mod.rs index 045510f..2bd2cbf 100644 --- a/src/kb/mod.rs +++ b/src/kb/mod.rs @@ -1,4 +1,4 @@ pub(crate) mod crud; -pub(crate) mod doc; +pub(crate) mod docx; pub(crate) mod dto; pub(crate) mod qa; diff --git a/src/result/mod.rs b/src/result/mod.rs index 62ac8a3..4534da9 100644 --- a/src/result/mod.rs +++ b/src/result/mod.rs @@ -195,8 +195,14 @@ impl From for Error { } } -impl From for Error { - fn from(err: docx_rs::ReaderError) -> Self { +impl From for Error { + fn from(err: zip::result::ZipError) -> Self { + Error::ErrorWithMessage(format!("Read docx file failed: {:?}", err)) + } +} + +impl From for Error { + fn from(err: quick_xml::errors::Error) -> Self { Error::ErrorWithMessage(format!("Read docx file failed: {:?}", err)) } } diff --git a/src/robot/crud.rs b/src/robot/crud.rs index dc7e43f..7214dab 100644 --- a/src/robot/crud.rs +++ b/src/robot/crud.rs @@ -66,7 +66,7 @@ async fn new(d: &RobotData, is_en: bool) -> Result<()> { settings::init(&d.robot_id)?; crate::intent::phrase::init_tables(&d.robot_id).await?; crate::kb::qa::init_tables(&d.robot_id).await?; - crate::kb::doc::init_tables(&d.robot_id).await?; + crate::kb::docx::init_tables(&d.robot_id).await?; // 意图 intent::init(&d.robot_id, is_en)?; // 变量 diff --git a/src/web/server.rs b/src/web/server.rs index da6e2a4..3f03db3 100644 --- a/src/web/server.rs +++ b/src/web/server.rs @@ -73,7 +73,7 @@ pub async fn start_app() { .await .expect("Failed initialize knowledge base QnA vector database."); - crate::kb::doc::init_datasource() + crate::kb::docx::init_datasource() .await .expect("Failed initialize knowledge base QnA vector database."); @@ -379,7 +379,7 @@ async fn shutdown_signal(sender: tokio::sync::oneshot::Sender<()>) { crate::intent::phrase::shutdown_db().await; crate::kb::qa::shutdown_db().await; - crate::kb::doc::shutdown_db().await; + crate::kb::docx::shutdown_db().await; let m = if *IS_EN { "This program has been terminated"