Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
brianc committed Nov 13, 2013
0 parents commit 991294d
Show file tree
Hide file tree
Showing 7 changed files with 103 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
node_modules
1 change: 1 addition & 0 deletions .npmignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
test/
Empty file added README.md
Empty file.
37 changes: 37 additions & 0 deletions index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
var _ = require('lodash')
var Parser = require('pdf2json')

//clear the pdf logger
//overwrites util._logN with a no-op function at module load time.
//NOTE(review): presumably pdf2json writes its diagnostics through this hook,
//so replacing it keeps parser chatter off the console - confirm against the
//pdf2json version pinned in package.json
require('util')._logN = function() { }

//given a path to a pdf
//turn it into a json structure
module.exports = function(path, cb) {
var parser = new Parser()
parser.on('pdfParser_dataReady', function(result) {

//attach some handy methods to the result object

//get text on a particular page
result.data.Pages.forEach(function(page) {
page.getText = function() {
return _(page.Texts).map('R').flatten().map('T').map(decodeURIComponent).value()
}
})

//get text from a page by page number (0 indexed)
result.getTextOnPage = function(number) {
return result.data.Pages[number].getText()
}

//get all text in document
result.getText = function() {
return _(this.data.Pages).map(function(p) { return p.getText() }).flatten().value()
}

return cb(null, result.getText(), result)
})

parser.on('pdfParser_dataError', cb)
parser.loadPDF(path)
}
30 changes: 30 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"name": "pdf-text",
"version": "0.0.0",
"description": "Extract an array of text chunks from a pdf",
"main": "index.js",
"scripts": {
"test": "mocha"
},
"repository": {
"type": "git",
"url": "git://github.com/brianc/node-pdf-text.git"
},
"keywords": [
"pdf",
"text"
],
"author": "Brian M. Carlson",
"license": "MIT",
"bugs": {
"url": "https://github.com/brianc/node-pdf-text/issues"
},
"devDependencies": {
"mocha": "~1.14.0",
"okay": "~0.3.0"
},
"dependencies": {
"pdf2json": "~0.5.6",
"lodash": "~2.3.0"
}
}
34 changes: 34 additions & 0 deletions test/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
var ok = require('okay')
var pdfText = require('../')
var assert = require('assert')

//integration tests: parse the bundled w4.pdf fixture once, then check the
//text array and the decorated result object handed to the callback
describe('pdf-text', function() {
  //parse once up front; the outcome is stored on the mocha context so each
  //test below can read it through `this`
  before(function(done) {
    //local binding on purpose: a bare `self = this` creates an implicit
    //global, which leaks between test files and throws in strict mode
    var ctx = this
    //ok(done, fn) comes from the `okay` module; presumably it forwards an
    //error to done and otherwise calls fn with the remaining arguments
    pdfText(__dirname + '/w4.pdf', ok(done, function(text, res) {
      ctx.text = text
      ctx.result = res
      done()
    }))
  })

  it('returns array of text', function() {
    assert(Array.isArray(this.text))
  })

  it('returns the pdf document', function() {
    assert(this.result)
    assert.equal(this.text.length, this.result.getText().length)
  })

  //presence checks compare against -1 explicitly: indexOf returns -1 when
  //the item is absent, which is truthy, so a bare assert(indexOf(...)) can
  //never fail on a missing chunk (and wrongly fails on a match at index 0)
  it('returns correct text', function() {
    assert.equal(this.text.indexOf('Form W-4 (2013)'), 0)
    assert.notEqual(this.text.indexOf('Additional amount, if any, you want withheld from each paycheck'), -1)
    assert.notEqual(this.text.indexOf('See the instructions for your income tax return.'), -1)
  })

  it('returns correct results per page', function() {
    //this chunk is expected to be absent from the first page
    assert.equal(this.result.getTextOnPage(0).indexOf('Additional amount, if any, you want withheld from each paycheck'), -1)
    //and this one is expected to be present on the second page
    assert.notEqual(this.result.getTextOnPage(1).indexOf('See the instructions for your income tax return.'), -1)
  })
})
Binary file added test/w4.pdf
Binary file not shown.

0 comments on commit 991294d

Please sign in to comment.