Skip to content

Commit

Permalink
Support extracting text from docx file
Browse files Browse the repository at this point in the history
  • Loading branch information
dialogflowchatbot committed Dec 16, 2024
1 parent 606164c commit ab2f031
Show file tree
Hide file tree
Showing 7 changed files with 48 additions and 52 deletions.
5 changes: 4 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ name = "dialogflow"
anyhow = "1.0"
axum = {version = "0.7", features = ["query", "tokio", "macros", "multipart"]}
bigdecimal = "0.4"
# bytes = "1.9"
# candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.5.1" }
candle = { version = "0.8", package = "candle-core", default-features = false }
# candle = { git = "https://github.com/huggingface/candle.git", package = "candle-core", default-features = false }
Expand Down Expand Up @@ -74,11 +75,13 @@ lettre = { version = "0.11", features = ["tokio1", "smtp-transport", "tokio1-nat
unicase = "2.8.0"
sqlx = { version = "0.8", default-features = false, features = ["runtime-tokio", "sqlite", "macros"] }
lopdf = "0.34.0"
docx-rs = "0.4.17"
# docx-rs = "0.4.17"
# lancedb = "0.13.0"
libsqlite3-sys = { version = "0.30", features = ["bundled"] }
sqlite-vec = "0.1.6"
validator = "0.19.0"
zip = "2.2"
quick-xml = "0.37"
# triple_accel = "0.4.0"

[build-dependencies]
Expand Down
4 changes: 2 additions & 2 deletions src/kb/crud.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use axum::{
Json,
};

use super::doc;
use super::docx;
use super::dto::QuestionAnswerPair;
use crate::result::{Error, Result};
use crate::robot::dto::RobotQuery;
Expand Down Expand Up @@ -62,7 +62,7 @@ async fn upload_doc_inner(robot_id: &str, mut multipart: Multipart) -> Result<()
data.len()
);

let text = doc::parse_docx(data.to_vec())?;
let text = docx::parse_docx(data.to_vec())?;
log::info!("Extract text: {text}");
}
}
Expand Down
73 changes: 30 additions & 43 deletions src/kb/doc.rs → src/kb/docx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,15 @@ use core::time::Duration;
// use std::io::Read;
// use std::path::Path;
use std::fs::OpenOptions;
use std::io::{BufReader, Cursor, Read};
use std::sync::OnceLock;
use std::vec::Vec;

use docx_rs::read_docx;
use futures_util::StreamExt;
use quick_xml::events::Event;
use quick_xml::Reader;
use sqlx::{pool::PoolOptions, Row, Sqlite};
use zip::ZipArchive;

use crate::result::{Error, Result};

Expand Down Expand Up @@ -79,53 +82,37 @@ pub(crate) async fn init_tables(robot_id: &str) -> Result<()> {
Ok(())
}

pub(super) fn parse_docx(buf: Vec<u8>) -> Result<String> {
pub(super) fn parse_docx(b: Vec<u8>) -> Result<String> {
// let mut file = File::open("./numbering.docx")?;
// let mut buf = Vec::with_capacity(3096);
// file.read_to_end(&mut buf)?;
let mut doc_text = String::with_capacity(3096);
let docx = read_docx(&buf)?;
let doc = docx.document;
for d in doc.children.iter() {
match d {
docx_rs::DocumentChild::Paragraph(paragraph) => {
for p in paragraph.children() {
match p {
docx_rs::ParagraphChild::Run(run) => {
for r in run.children.iter() {
match r {
docx_rs::RunChild::Text(text) => {
log::info!("Docx text={}", text.text);
doc_text.push_str(&text.text);
// doc_text.push('\n');
// doc_text.push('\n');
}
docx_rs::RunChild::Sym(sym) => {
log::info!("meet sym");
doc_text.push_str(&sym.char);
}
docx_rs::RunChild::Break(_) => {
log::info!("meet break");
doc_text.push('\n');
}
docx_rs::RunChild::Tab(_) => {
log::info!("meet tab");
doc_text.push('\n');
}
_ => {}
}
}
}
docx_rs::ParagraphChild::Hyperlink(hyperlink) => {
log::info!("hyperlink: {:?}", hyperlink.link)
}
_ => {}
}
}
let reader = Cursor::new(b);
let mut archive = ZipArchive::new(reader)?;
let mut zip_file = archive.by_name("word/document.xml")?;
let mut cache = String::with_capacity(zip_file.size() as usize);
zip_file.read_to_string(&mut cache)?;

// 创建 XML 解析器
let mut reader = Reader::from_str(&cache);
reader.config_mut().trim_text(false);
let mut in_paragraph = false;

// 读取 XML 内容
loop {
match reader.read_event() {
Ok(Event::Start(ref e)) if e.name().as_ref() == b"w:p" => in_paragraph = true,
Ok(Event::End(ref e)) if e.name().as_ref() == b"w:p" => {
doc_text.push('\n');
in_paragraph = false;
}
Ok(Event::Empty(ref e)) if e.name().as_ref() == b"w:p" => doc_text.push('\n'),
Ok(Event::Text(e)) if in_paragraph => {
doc_text.push_str(&e.unescape()?);
}
docx_rs::DocumentChild::Table(_table) => {}
docx_rs::DocumentChild::TableOfContents(_table_of_contents) => {}
_ => {}
Ok(Event::Eof) => break,
Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
_ => (),
}
}
Ok(doc_text)
Expand Down
2 changes: 1 addition & 1 deletion src/kb/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
pub(crate) mod crud;
pub(crate) mod doc;
pub(crate) mod docx;
pub(crate) mod dto;
pub(crate) mod qa;
10 changes: 8 additions & 2 deletions src/result/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -195,8 +195,14 @@ impl From<axum::extract::multipart::MultipartError> for Error {
}
}

impl From<docx_rs::ReaderError> for Error {
fn from(err: docx_rs::ReaderError) -> Self {
/// Lets `?` turn a zip archive failure into the crate-level [`Error`].
impl From<zip::result::ZipError> for Error {
    /// Wraps the debug rendering of `err` in an `ErrorWithMessage`.
    fn from(err: zip::result::ZipError) -> Self {
        let message = format!("Read docx file failed: {:?}", err);
        Self::ErrorWithMessage(message)
    }
}

/// Lets `?` turn a quick-xml failure into the crate-level [`Error`].
impl From<quick_xml::errors::Error> for Error {
    /// Wraps the debug rendering of `err` in an `ErrorWithMessage`.
    fn from(err: quick_xml::errors::Error) -> Self {
        let message = format!("Read docx file failed: {:?}", err);
        Self::ErrorWithMessage(message)
    }
}
2 changes: 1 addition & 1 deletion src/robot/crud.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ async fn new(d: &RobotData, is_en: bool) -> Result<()> {
settings::init(&d.robot_id)?;
crate::intent::phrase::init_tables(&d.robot_id).await?;
crate::kb::qa::init_tables(&d.robot_id).await?;
crate::kb::doc::init_tables(&d.robot_id).await?;
crate::kb::docx::init_tables(&d.robot_id).await?;
// 意图
intent::init(&d.robot_id, is_en)?;
// 变量
Expand Down
4 changes: 2 additions & 2 deletions src/web/server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ pub async fn start_app() {
.await
.expect("Failed initialize knowledge base QnA vector database.");

crate::kb::doc::init_datasource()
crate::kb::docx::init_datasource()
.await
.expect("Failed initialize knowledge base QnA vector database.");

Expand Down Expand Up @@ -379,7 +379,7 @@ async fn shutdown_signal(sender: tokio::sync::oneshot::Sender<()>) {

crate::intent::phrase::shutdown_db().await;
crate::kb::qa::shutdown_db().await;
crate::kb::doc::shutdown_db().await;
crate::kb::docx::shutdown_db().await;

let m = if *IS_EN {
"This program has been terminated"
Expand Down

0 comments on commit ab2f031

Please sign in to comment.