diff --git a/docs/image-format.md b/docs/image-format.md index 742671bb0..b5435ff4e 100644 --- a/docs/image-format.md +++ b/docs/image-format.md @@ -8,7 +8,9 @@ On a browser, an image can be: - an `img`, `video`, or `canvas` element - a `File` object (from a file `<input>`) - a path or URL to an accessible image +- a base64-encoded image that matches the regexp `data:image\/([a-zA-Z]*);base64,([^"]*)` In Node.js, an image can be - a path to a local image - a Buffer storing binary image +- a base64-encoded image that matches the regexp `data:image\/([a-zA-Z]*);base64,([^"]*)` diff --git a/src/browser/b64toU8Array.js b/src/browser/b64toU8Array.js new file mode 100644 index 000000000..06c84125e --- /dev/null +++ b/src/browser/b64toU8Array.js @@ -0,0 +1 @@ +module.exports = s => new Uint8Array(atob(s).split('').map(c => c.charCodeAt(0))); diff --git a/src/browser/index.js b/src/browser/index.js index 1be4ee3e1..b7accd8ee 100644 --- a/src/browser/index.js +++ b/src/browser/index.js @@ -10,6 +10,7 @@ const check = require('check-types'); const resolveURL = require('resolve-url'); const axios = require('axios'); +const b64toU8Array = require('./b64toU8Array'); const { defaultOptions } = require('../common/options'); const { version } = require('../../package.json'); @@ -38,6 +39,7 @@ const readFromBlobOrFile = (blob, res) => { * @access private * @param {string, object} image - image source, supported formats: * string: URL string, can be relative path + * string: base64 image * img HTMLElement: extract image source from src attribute * video HTMLElement: extract image source from poster attribute * canvas HTMLElement: extract image data by converting to Blob @@ -46,6 +48,11 @@ const readFromBlobOrFile = (blob, res) => { */ const loadImage = (image) => { if (check.string(image)) { + // Base64 Image + if (/data:image\/([a-zA-Z]*);base64,([^"]*)/.test(image)) { + return Promise.resolve(b64toU8Array(image.split(',')[1])); + } + // Image URL return axios.get(resolveURL(image), { responseType: 
'arraybuffer', }) diff --git a/src/browser/worker.js b/src/browser/worker.js index 55a6cbe06..4eabe74b5 100644 --- a/src/browser/worker.js +++ b/src/browser/worker.js @@ -10,6 +10,7 @@ const check = require('check-types'); const workerUtils = require('../common/workerUtils'); +const b64toU8Array = require('./b64toU8Array'); /* * register message handler @@ -42,7 +43,7 @@ workerUtils.setAdapter({ } return global.TesseractCore; }, - b64toU8Array: s => new Uint8Array(atob(s).split('').map(c => c.charCodeAt(0))), + b64toU8Array, writeFile: (path, data, type) => { postMessage({ jobId: 'Download', diff --git a/src/node/b64toU8Array.js b/src/node/b64toU8Array.js new file mode 100644 index 000000000..349587fe7 --- /dev/null +++ b/src/node/b64toU8Array.js @@ -0,0 +1 @@ +module.exports = s => Buffer.from(s, 'base64'); diff --git a/src/node/index.js b/src/node/index.js index 54dcbdd13..54d87a176 100644 --- a/src/node/index.js +++ b/src/node/index.js @@ -13,6 +13,7 @@ const axios = require('axios'); const isURL = require('is-url'); const { fork } = require('child_process'); const path = require('path'); +const b64toU8Array = require('./b64toU8Array'); const { defaultOptions } = require('../common/options'); const readFile = util.promisify(fs.readFile); @@ -25,6 +26,7 @@ const readFile = util.promisify(fs.readFile); * @access public * @param {string} image - image source, supported formats: * string: URL string or file path + * string: base64 image * buffer: image buffer * @returns {array} binary image in array format */ @@ -36,6 +38,10 @@ const loadImage = (image) => { .then(resp => resp.data); } + if (/data:image\/([a-zA-Z]*);base64,([^"]*)/.test(image)) { + return Promise.resolve(b64toU8Array(image.split(',')[1])); + } + if (Buffer.isBuffer(image)) { return Promise.resolve(image); } diff --git a/src/node/worker.js b/src/node/worker.js index 99848dec1..1dafb2b1e 100644 --- a/src/node/worker.js +++ b/src/node/worker.js @@ -10,6 +10,7 @@ const check = require('check-types'); 
const workerUtils = require('../common/workerUtils'); +const b64toU8Array = require('./b64toU8Array'); let TesseractCore = null; @@ -33,7 +34,7 @@ workerUtils.setAdapter({ } return TesseractCore; }, - b64toU8Array: s => Buffer.from(s, 'base64'), + b64toU8Array, writeFile: (path, data) => { const fs = require('fs'); fs.writeFile(path, data, (err) => { diff --git a/tests/recognize.test.js b/tests/recognize.test.js index 4d3defe9c..a53f0f73e 100644 --- a/tests/recognize.test.js +++ b/tests/recognize.test.js @@ -1,6 +1,8 @@ const { TesseractWorker } = Tesseract; const isBrowser = typeof window !== 'undefined' && typeof window.document !== 'undefined'; +const SIMPLE_PNG = ''; +const SIMPLE_JPG = ''; const IMAGE_PATH = 'http://localhost:3000/tests/assets/images'; const SIMPLE_TEXT = 'Tesseract.js\n'; const COMSIC_TEXT = 'HellO World\nfrom beyond\nthe Cosmic Void\n'; @@ -23,6 +25,24 @@ const getWorker = options => ( ); describe('recognize()', () => { + describe('should recognize base64 image', () => { + [ + { format: 'png', image: SIMPLE_PNG, ans: SIMPLE_TEXT }, + { format: 'jpg', image: SIMPLE_JPG, ans: SIMPLE_TEXT }, + ].forEach(({ format, image, ans }) => ( + it(`recongize ${format} in base64`, (done) => { + const worker = getWorker(); + worker + .recognize(image) + .then(({ text }) => { + expect(text).to.be(ans); + worker.terminate(); + done(); + }); + }).timeout(30000) + )); + }); + describe('should recognize different langs', () => { [ { name: 'chinese.png', lang: 'chi_tra', ans: CHINESE_TEXT }, @@ -39,7 +59,7 @@ describe('recognize()', () => { }).timeout(30000) )); }); - + describe('should read bmp, jpg, png and pbm format images', () => { FORMATS.forEach(format => ( it(`support ${format} format`, (done) => {