Skip to content

Commit

Permalink
Updated OCR import to remove space characters
Browse files Browse the repository at this point in the history
  • Loading branch information
Balearica committed Dec 1, 2024
1 parent 07f19f1 commit 0630433
Showing 1 changed file with 4 additions and 1 deletion.
5 changes: 4 additions & 1 deletion js/import/convertPageBlocks.js
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ export async function convertPageBlocks({
continue;
}

- const wordObj = new ocr.OcrWord(lineObj, word.text, wordbox, id);
+ const wordObj = new ocr.OcrWord(lineObj, word.text.trim(), wordbox, id);
wordObj.lang = word.language;
wordObj.conf = word.confidence;

Expand All @@ -150,6 +150,9 @@ export async function convertPageBlocks({
for (let m = 0; m < word.symbols.length; m++) {
const symbol = word.symbols[m];

+ // The LSTM model sometimes produces space characters.
+ if (!symbol.text?.trim()) continue;
+
const symbolbox = {
left: symbol.bbox.x0, top: symbol.bbox.y0, right: symbol.bbox.x1, bottom: symbol.bbox.y1,
};
Expand Down

0 comments on commit 0630433

Please sign in to comment.