Skip to content

Commit

Permalink
Updated word splitting in PDF text imports
Browse files Browse the repository at this point in the history
  • Loading branch information
Balearica committed Dec 20, 2024
1 parent a9afa1b commit 6f28688
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 0 deletions.
Binary file modified mupdf/libmupdf.wasm
Binary file not shown.
Binary file added tests/assets/fti_filing_p25.pdf
Binary file not shown.
15 changes: 15 additions & 0 deletions tests/module/importPdfText.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,21 @@ describe('Check that PDF imports split lines correctly.', function () {
});
}).timeout(120000);

// Suite covering how text imported from a PDF is segmented into words.
describe('Check that PDF imports split words correctly.', function () {
  // A regular `function` is used (not an arrow) so that `this` is the Mocha suite context.
  this.timeout(10000);

  it('Should correctly split words not separated by space or any character defined in may_add_space', async () => {
    const fixturePath = `${ASSETS_PATH_KARMA}/fti_filing_p25.pdf`;
    await scribe.importFiles([fixturePath]);

    // Line index 4 of the first OCR entry must begin with two distinct words.
    // NOTE(review): presumably `active[0]` is the first page of the fixture — confirm.
    const { words } = scribe.data.ocr.active[0].lines[4];
    assert.strictEqual(words[0].text, '☒');
    assert.strictEqual(words[1].text, 'ANNUAL');
  }).timeout(10000);

  // Shut scribe down once every test in this suite has run.
  after(async () => {
    await scribe.terminate();
  });
}).timeout(120000);

describe('Check that line baselines are imported correctly.', function () {
this.timeout(10000);

Expand Down

0 comments on commit 6f28688

Please sign in to comment.