Skip to content

Commit

Permalink
Able to keep header and footer
Browse files Browse the repository at this point in the history
  • Loading branch information
vidy committed Dec 9, 2024
1 parent 59e6331 commit ea8ddf1
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 17 deletions.
11 changes: 8 additions & 3 deletions examples/text.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,20 @@ fn main() {

// for (page_nr, page) in file.pages().enumerate() {
let page: pdf::object::PageRc = file.get_page(0).unwrap();
let flow = pdf_text::run(&file, &page, &resolver, Default::default()).expect("can't render page");
let flow = pdf_text::run(&file, &page, &resolver, Default::default(), false).expect("can't render page");

println!("# page {}", 0 + 1);
for run in flow.runs {
for line in run.lines {
for w in line.words {
println!(": {}", w.text);
println!("{}", w.text);
}
}
println!();
}
for line in flow.lines {
for w in line.words {
println!("{}", w.text);
}
}
// }
}
3 changes: 3 additions & 0 deletions src/flow.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ pub(crate) fn build<E: Encoder>(mut flow: &mut Flow, spans: &[TextSpan<E>], node
Class::Header => RunType::Header,
_ => RunType::Paragraph,
};

flow.add_line(words, t);
}
}
Expand All @@ -121,6 +122,8 @@ pub(crate) fn build<E: Encoder>(mut flow: &mut Flow, spans: &[TextSpan<E>], node
Class::Header => RunType::Header,
_ => RunType::Paragraph,
};


flow.add_line(words, t);
}
NodeTag::Paragraph => {
Expand Down
4 changes: 2 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ mod text;
mod classify;
pub mod flow;

pub fn run<B: Backend>(file: &pdf::file::CachedFile<B>, page: &Page, resolve: &impl Resolve, transform: Transform2F) -> Result<Flow, PdfError> {
pub fn run<B: Backend>(file: &pdf::file::CachedFile<B>, page: &Page, resolve: &impl Resolve, transform: Transform2F, without_header_and_footer: bool) -> Result<Flow, PdfError> {
let mut cache = TraceCache::new(OutlineBuilder::default());

let mut clip_paths = vec![];
Expand Down Expand Up @@ -88,7 +88,7 @@ pub fn run<B: Backend>(file: &pdf::file::CachedFile<B>, page: &Page, resolve: &i
visit_item(item);
}

let root = node::build(&spans, bbox, &lines);
let root = node::build(&spans, bbox, &lines, without_header_and_footer);

let mut flow = Flow::new();
flow::build(&mut flow, &spans, &root, bbox.min_x());
Expand Down
39 changes: 30 additions & 9 deletions src/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ mod gap;
mod line;
mod render;

use gap::{dist_x, dist_y, gaps, left_right_gap, top_bottom_gap};
use gap::{dist_x, dist_y, gap_list, gaps, left_right_gap, top_bottom_gap};
use line::{analyze_lines, overlapping_lines, Lines};
use pdf_render::TextSpan;
use pathfinder_geometry::rect::RectF;
Expand All @@ -15,18 +15,30 @@ use crate::util::avg;
#[cfg(feature="ocr")]
use tesseract_plumbing::Text;

use std::boxed;
use std::mem::take;
use table::Table;
use font::Encoder;

pub fn build<E: Encoder>(spans: &[TextSpan<E>], bbox: RectF, lines: &[[f32; 4]]) -> Node {
/// Builds the layout tree ([`Node`]) for a page from its text spans.
///
/// * `spans` – the text spans; each span's `rect` is paired with its index in this slice.
/// * `bbox` – forwarded to `exclude_header_and_footer`.
/// * `lines` – raw line segments, run through `analyze_lines` before splitting.
/// * `without_header_and_footer` – when `true`, the boxes are first narrowed by
///   `exclude_header_and_footer`; when `false`, every span takes part in the split.
///
/// Returns `Node::singleton(&[])` when `spans` is empty.
pub fn build<E: Encoder>(spans: &[TextSpan<E>], bbox: RectF, lines: &[[f32; 4]], without_header_and_footer: bool) -> Node {
    if spans.is_empty() {
        return Node::singleton(&[]);
    }

    // Pair every span rectangle with its index so a box can be mapped back to `spans`.
    let mut boxes: Vec<(RectF, usize)> = spans.iter().enumerate().map(|(i, t)| (t.rect, i)).collect();
    let mut boxes = boxes.as_mut_slice();
    if without_header_and_footer {
        boxes = exclude_header_and_footer(boxes, bbox, spans);
    }

    let lines = analyze_lines(lines);
    split(boxes, spans, &lines)
}

pub fn exclude_header_and_footer<'a, E: Encoder>(boxes: &'a mut [(RectF, usize)], bbox: RectF, spans: &[TextSpan<E>]) -> &'a mut [(RectF, usize)]
{
let avg_font_size: f32 = avg(spans.iter().map(|s| s.font_size)).unwrap();

let probably_header = |boxes: &[(RectF, usize)]| {
let class = classify(boxes.iter().filter_map(|&(_, i)| spans.get(i)));
Expand All @@ -41,11 +53,15 @@ pub fn build<E: Encoder>(spans: &[TextSpan<E>], bbox: RectF, lines: &[[f32; 4]])
let x_gaps: Vec<f32> = gap::gaps(avg_font_size, boxes, |r| (r.min_x(), r.max_x()))
.collect();

let count = split_by(boxes, &x_gaps, |r| r.min_x()).filter(|cell| probably_header(cell)).count();
count == x_gaps.len() + 1
let is_footer = split_by(boxes, x_gaps.as_slice(), |r| r.min_x())
.all(|cell| probably_header(cell));

is_footer
};

sort_y(boxes);

let mut boxes = boxes;
let (top, bottom) = top_bottom_gap(boxes, bbox);
if let Some(bottom) = bottom {
if probably_footer(&mut boxes[bottom..]) {
Expand All @@ -69,10 +85,9 @@ pub fn build<E: Encoder>(spans: &[TextSpan<E>], bbox: RectF, lines: &[[f32; 4]])
boxes = &mut boxes[left..];
}
}
let lines = analyze_lines(lines);
split(boxes, &spans, &lines)
}

boxes
}

#[derive(Copy, Clone, Debug)]
struct Span {
Expand Down Expand Up @@ -326,8 +341,13 @@ fn split<E: Encoder>(boxes: &mut [(RectF, usize)], spans: &[TextSpan<E>], lines:
return Node::singleton(boxes);
}

// for b in boxes.iter(){
// dbg!(b.0, b.1, spans.get(b.1).unwrap().text.as_str());
// }

sort_x(boxes);
let max_x_gap = dist_x(boxes);

sort_y(boxes);
let max_y_gap = dist_y(boxes);

Expand Down Expand Up @@ -362,8 +382,8 @@ fn split<E: Encoder>(boxes: &mut [(RectF, usize)], spans: &[TextSpan<E>], lines:
}

sort_y(boxes);
for row in split_by(boxes, &y_gaps, |r| r.min_y()) {

for row in split_by(boxes, &y_gaps, |r| r.min_y()) {
if x_gaps.len() > 0 {
sort_x(row);
for cell in split_by(row, &x_gaps, |r| r.min_x()) {
Expand Down Expand Up @@ -400,6 +420,7 @@ fn split<E: Encoder>(boxes: &mut [(RectF, usize)], spans: &[TextSpan<E>], lines:
tag,
}
}

#[allow(dead_code)]
fn split_v(boxes: &mut [(RectF, usize)]) -> Node {
let num_boxes = boxes.len();
Expand Down
4 changes: 3 additions & 1 deletion src/node/gap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ pub fn gap_list<'a>(boxes: &'a [(RectF, usize)], span: impl Fn(&RectF) -> (f32,
let mut boxes = boxes.iter();
let &(ref r, _) = boxes.next().unwrap();
let (_, mut last_max) = span(r);

boxes.enumerate().filter_map(move |(idx, &(ref r, _))| {
// (min, max) extent of the box along the axis selected by `span`
let (min, max) = span(&r);
let r = if min > last_max {
Some((last_max, min, idx+1))
Expand All @@ -25,6 +25,7 @@ pub fn gaps<'a>(threshold: f32, boxes: &'a [(RectF, usize)], span: impl Fn(&Rect
boxes.filter_map(move |&(ref r, _)| {
let (min, max) = span(&r);
let r = if min - last_max >= threshold {
// The middle position of the gap
Some(0.5 * (last_max + min))
} else {
None
Expand All @@ -34,6 +35,7 @@ pub fn gaps<'a>(threshold: f32, boxes: &'a [(RectF, usize)], span: impl Fn(&Rect
})
}

/// Return the size of the gap and the middle position of the gap.
pub fn max_gap(boxes: &[(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32)) -> Option<(f32, f32)> {
gap_list(boxes, span)
.max_by_key(|&(a, b, _)| NotNan::new(b - a).unwrap())
Expand Down
13 changes: 11 additions & 2 deletions src/node/line.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,17 @@ pub fn analyze_lines(lines: &[[f32; 4]]) -> Lines {

let mut line_grid = vec![false; vlines.len() * hlines.len()];
for &[x1, y1, x2, y2] in lines {
// vertical line (x1 == x2)
if x1 == x2 {
let v_idx = vlines.iter().position(|&(a, b)| a <= x1 && x1 <= b).unwrap_or(vlines.len());
let h_start = hlines.iter().position(|&(a, b)| y1 >= a).unwrap_or(hlines.len());
let h_end = hlines.iter().position(|&(a, b)| y2 <= b).unwrap_or(hlines.len());
for h in h_start .. h_end {
line_grid[v_idx * hlines.len() + h] = true;
}
} else if y1 == y2 {
}
// horizontal line (y1 == y2)
else if y1 == y2 {
let h_idx = hlines.iter().position(|&(a, b)| a <= y1 && y1 <= b).unwrap_or(hlines.len());
let v_start = vlines.iter().position(|&(a, b)| x1 >= a).unwrap_or(vlines.len());
let v_end = vlines.iter().position(|&(a, b)| x2 <= b).unwrap_or(vlines.len());
Expand All @@ -60,19 +63,25 @@ pub fn analyze_lines(lines: &[[f32; 4]]) -> Lines {
}
}


//println!("hlines: {:?}", hlines);
//println!("vlines: {:?}", vlines);

Lines { hlines, vlines, line_grid }
}

/// Result of `analyze_lines`: the ranges and occupancy grid it derives from the
/// page's line segments.
#[derive(Debug)]
pub struct Lines {
/// `(a, b)` ranges; `analyze_lines` matches a segment's y coordinate against
/// them with `a <= y && y <= b`.
pub hlines: Vec<(f32, f32)>,
/// `(a, b)` ranges; `analyze_lines` matches a segment's x coordinate against
/// them with `a <= x && x <= b`.
pub vlines: Vec<(f32, f32)>,
/// Flat grid allocated with `vlines.len() * hlines.len()` entries, all `false`
/// initially; the `x1 == x2` branch of `analyze_lines` sets entries at
/// `v_idx * hlines.len() + h`.
pub line_grid: Vec<bool>,
}

/// Deals with things like superscript and subscript, which fall outside the usual bounds
/// but need to be assigned to the correct line.
///
/// example, two lines:
/// hello world
/// m³2 test a number℡
pub fn overlapping_lines(boxes: &mut [(RectF, usize)]) -> Node {
sort_y(boxes);
let avg_height = avg(boxes.iter().map(|(r, _)| r.height())).unwrap();
Expand Down

0 comments on commit ea8ddf1

Please sign in to comment.