diff --git a/.gitignore b/.gitignore index ade411a..7e39c34 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,9 @@ Resumes/ # Installed node modules node_modules/ +# Shell script to run module against test resumes in Resumes directory +parse_test_resumes.sh + # Sublime Text project configuration files linkedin-pdf-to-json.sublime-project linkedin-pdf-to-json.sublime-workspace \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..383faeb --- /dev/null +++ b/LICENSE @@ -0,0 +1,8 @@ +The MIT License (MIT) +Copyright (c) 2016 Isaac Mast + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md index 7a1aaa6..9cbdbfe 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,17 @@ # linkedin-pdf-to-json -JavaScript recursive descent parser for storing text from LinkedIn profile PDFs in a JSON object. 
\ No newline at end of file +JavaScript recursive descent parser for storing text from LinkedIn profile PDFs in a JSON object. + +## Unsupported Sections + +Currently unsupported sections include: + +* Education +* Publications +* Projects +* Certifications +* Specialties +* Honors and Awards +* Interests +* Courses +* Recommendations \ No newline at end of file diff --git a/index.js b/index.js index 5ed3c6b..bdbf765 100644 --- a/index.js +++ b/index.js @@ -1,249 +1,518 @@ -// Recursive descent parser for LinkedIn profile PDFs written in JavaScript. +// JavaScript recursive descent parser for storing text retrieved from LinkedIn profile PDFs in JSON format. // Author: Isaac Mast [https://github.com/isaacmast] - -var pdfText = require('pdf-text'); - -var linkedinPdfToJson = module.exports = {}, - CONTENT, // the string array retrieved from pdf-text module, but after the removal of page numbers - index = 0, // the CONTENT array index - jobCount = 0, // the number of jobs in the current section - json = {}, // the JSON object to hold the parsed data - jobProperty, // the json object job key (dynamically generated using generateJobKey()) - last, // the previously parsed text string - section, // the current section of the PDF - text, // the current text string being parsed - token = 'START', // the current token - PATH_TO_PDF = process.argv[2]; // the path to be parsed PDF passed from the command line - -var SECTION_HEADERS = { - summary: 'Summary', - education: 'Education', - experience: 'Experience', - skills: 'Skills & Expertise', - volunteerXP: 'Volunteer Experience', - personHasRecommended: 'person has recommended', - peopleHaveRecommended: 'people have recommended' -}; - -// available token values -var EOF_TOKEN = 'EOF', - SECTION_HEADER_TOKEN = 'SECTION HEADER', - JOB_TOKEN = 'JOB', - JOB_DATE_TOKEN = 'JOB DATE', - JOB_DURATION_TOKEN = 'JOB DURATION', - SECTION_CONTENT_TOKEN = 'SECTION CONTENT'; - -// Retrieves the text from the PDF, storing each line as an 
element in the chunks string array. -pdfText(PATH_TO_PDF, function(error, chunks) { - if (error) { - console.log(error); - return; - } - - console.log(chunks); - linkedinPdfToJson.parse(chunks); -}); - -//=========================== -// GRAMMAR LOGIC -//=========================== - -linkedinPdfToJson.parse = function(chunks) { - this.removePageNumbers(chunks); - CONTENT = chunks; - this.setBasicInfo(); - this.getNextToken(); - while (token) { - if (token === SECTION_HEADER_TOKEN) { - this.sectionHeader(); - this.resetJobCount(); - } - this.getNextToken(); - } -}; - -linkedinPdfToJson.sectionHeader = function() { - if (json[CONTENT[index].toLowerCase()]) { - throw 'JSON property (' + CONTENT[index] + ') already exists.'; - } - json[CONTENT[index].toLowerCase()] = {}; - console.log(json); - this.getNextToken(); - if (section === SECTION_HEADERS.summary) { - - } else if (section === SECTION_HEADERS.experience) { - while (token === JOB_TOKEN) { - jobCount++; - this.job(); - this.getNextToken(); +// GitHub repo: https://github.com/isaacmast/linkedin-pdf-to-json + +var linkedinPdfToJson = function() { + + var pdfText = require('pdf-text'); + + var CONTENT, // the string array retrieved from pdf-text module, but after the removal of page numbers + index = 0, // the current index of the CONTENT array + jobCount = 0, // the number of jobs in the current section + json = {}, // the JSON object to hold the parsed data + last, // the previously parsed text string + section, // the current section of the PDF + text, // the current text string being parsed + token = 'START', // the current token + PATH_TO_PDF = process.argv[2]; // the path to be parsed PDF passed from the command line + + // possible section headers that are currently supported + var SECTION_HEADERS = { + 'Summary': 'summary', + 'Languages': 'languages', + 'Experience': 'experience', + 'Skills & Expertise': 'skills', + 'Volunteer Experience': 'volunteerXP', + 'Unsupported': 'unsupported' + }; + + // currently 
unsupported sections + var UNSUPPORTED_SECTIONS = ['Publications', 'Projects', 'Certifications', 'Education', 'Specialties', 'Honors and Awards', 'Interests', 'Courses', 'recommendations']; + + // available token values + var TOKENS = { + 'EOF': 'eof', + 'SECTION_HEADER': 'section_header', + 'NAME': 'name', + 'JOB': 'job', + 'JOB_DATE': 'date_range', + 'JOB_DURATION': 'duration', + 'SECTION_CONTENT': 'section_content', + 'SKILL': 'skill', + 'LANGUAGE': 'language', + 'LANGUAGE_PROFICIENCY': 'proficiency', + 'UNSUPPORTED': 'unsupported', + 'UNKNOWN': 'unknown' + }; + + // Retrieves the text from the PDF, storing each line as an element in the chunks string array. + pdfText(PATH_TO_PDF, function(error, chunks) { + if (error) { + console.log(error); + return; } - } else { - error(); - } -}; -// Creates and populates a new JSON job property under the appropriate section header. -linkedinPdfToJson.job = function() { - jobProperty = this.generateJobKey(); - json[section][jobProperty] = json[section][jobProperty] || {}; - this.setJobTitle(); - if (token === JOB_DATE_TOKEN) { - this.setJobDate(); + // console.log(chunks); + // console.log(); + this.parse(chunks); + // console.log(); + // console.log(json); + // console.log(); + // console.log('...Parsing complete'); + // console.log(); + }); + + //=========================== + // GRAMMAR LOGIC + //=========================== + + // Parses the PDF using the chunks array retrieved from the pdf-text node module + this.parse = function(chunks) { + // console.log('Parsing (' + chunks[2] + ')...'); + this.sanitize(chunks); + CONTENT = chunks; + this.setBasicInfo(); this.getNextToken(); - if (token === JOB_DURATION_TOKEN) { - this.setJobDuration(); + while (token !== TOKENS.EOF) { + if (token === TOKENS.SECTION_HEADER || token === TOKENS.UNSUPPORTED || token === TOKENS.UNKNOWN) { + this.section(); + } else { + this.error(); + } + } + }; + + // Parses a section of the PDF. 
+ this.section = function() { + // console.log('ZZZ SECTION'); + // console.log('section = ' + section); + if (section !== 'unsupported') { + json[section] = json[section] || {}; + } + if (section === SECTION_HEADERS.Summary) { + this.getNextToken(); + if (token === TOKENS.SECTION_CONTENT) { + this.summary(); + } else { + this.error(); + } + } else if (section === SECTION_HEADERS.Education) { + this.getNextToken(); + if (token === TOKENS.SCHOOL) { + this.education(); + } else { + this.error(); + } + } else if (section === SECTION_HEADERS.Experience || section === SECTION_HEADERS['Volunteer Experience']) { + this.getNextToken(); + if (token === TOKENS.JOB) { + while (token === TOKENS.JOB) { + jobCount++; + this.job(); + } + this.resetJobCount(); + } else { + this.error(); + } + } else if (section === SECTION_HEADERS.Languages) { + this.getNextToken(); + if (token === TOKENS.LANGUAGE) { + while (section === SECTION_HEADERS.Languages) { + this.languages(); + } + } else { + this.error(); + } + } else if (section === SECTION_HEADERS['Skills & Expertise']) { this.getNextToken(); - while (token === SECTION_CONTENT_TOKEN) { + if (token === TOKENS.SKILL) { + this.skillsAndExpertise(); + } else { + this.error(); + } + // TODO: Implement the rest of the unsupported sections + // This marks the start of the unsupported sections + // See global UNSUPPORTED_SECTIONS variable at the top for a list of + // all the unsupported sections. + } else if (token === TOKENS.UNSUPPORTED || token === TOKENS.UNKNOWN) { + json[section] = json[section] || []; + while (token === TOKENS.UNSUPPORTED || token === TOKENS.UNKNOWN) { + if (json[section].indexOf(text) === -1) { // check is mainly for '...........' section separators to avoid redundant '.' 
elements + json[section].push(text); + } + this.getNextToken(); } } else { this.error(); } - } else { - this.error(); - } -}; - -// Determines the next token based on the next text chunk -linkedinPdfToJson.getNextToken = function() { - console.log('Setting token...'); - index++; - text = CONTENT[index]; - console.log('text = ' + text); - if (this.isSectionHeader(text)) { - token = SECTION_HEADER_TOKEN; - section = text; - } else if (this.isJobTitle(text)) { - text = text.replace(/\s{2}at\s{2}/, ' at '); - token = JOB_TOKEN; - } else if (this.isJobDate(text)) { - text = text.trim(); - token = JOB_DATE_TOKEN; - } else if (this.isJobPeriod(text)) { - token = JOB_DURATION_TOKEN; - } else if (this.isSectionContent(text)) { - token = SECTION_CONTENT_TOKEN; - } else if (this.isEndOfFile(text)) { - token = EOF_TOKEN; - } else { - this.error(); - } - console.log('current token = ' + token); -}; - -//=========================== -// GENERATORS AND SETTERS -//=========================== - -// Generates a new JSON job key by concatenating the job count to the string 'job_'. -// @return the generated JSON job key. -linkedinPdfToJson.generateJobKey = function() { - return 'job_' + jobCount.toString(); -}; - -// Resets the job count to 0. -linkedinPdfToJson.resetJobCount = function() { - jobCount = 0; -}; - -// Sets the title for the currently parsed job under the appropriate section. -linkedinPdfToJson.setJobTitle = function() { - json[section][jobProperty].title = text; -}; - -// Sets the date for the currently parsed job under the appropriate section. -linkedinPdfToJson.setJobDate = function() { - json[section][jobProperty].date = text; -}; - -// Sets the duration for the currently parsed job under the appropriate section. 
-linkedinPdfToJson.setJobDuration = function() { - json[section][jobProperty].duration = text; -}; - -//=========================== -// INITIAL HELPERS -//=========================== - -// Removes unnecessary 'Page' and '{0}' elements from chunks string array. -// @param chunks - array of string elements representing the top-to-bottom -// flow of text from the PDF. -linkedinPdfToJson.removePageNumbers = function(chunks) { - for (var i = 0; i < chunks.length; i++) { - if (chunks[i] === 'Page' && chunks[i + 1].match(/\d+/)) { - chunks.splice(i, 2); + }; + + // Parses the summary section of the PDF. + // For description fields in a LinkedIn profile section, the user may choose to format their descriptions + // by outlining them with letters or numbers or by using bullet points/bullet-like symbols e.g. 1., a., -, •, #, ~, * . + // The goal of this function is to retain that user defined formatting by putting each bulleted line in its own object property. + // For non-formatted descriptions the text is simply concatenated into a single string. + this.summary = function() { + // console.log('ZZZ SUMMARY'); + var textProperty = 'text'; + var textCount = 0; + var inBulleted = false; + var hasBulleted = this.hasBulletedText(); + if (hasBulleted) { + while (token === TOKENS.SECTION_CONTENT) { + var bulleted = this.isBulleted(); + if (bulleted) { + inBulleted = true; + textCount++; + textProperty = this.generateTextProperty(textCount); + json[section][textProperty] = text; + } else if (inBulleted && !!text.match(/^\s\S/)) { + json[section][textProperty] = json[section][textProperty] + text; + } else { + inBulleted = false; + textCount++; + textProperty = this.generateTextProperty(textCount); + json[section][textProperty] = text; + } + this.getNextToken(); + } + } else { + while (token === TOKENS.SECTION_CONTENT) { + var jobText = json[section][textProperty]; + json[section][textProperty] = jobText ? 
jobText + text : text; + this.getNextToken(); + } } - } -}; - -// Sets the name, current job, and potentially email properties of the json object -// based on the PDF text. -// The email property may not be set if it's not provided in the PDF. -// These properties can just be assumed since it's standard across all LinkedIn profile PDFs. -linkedinPdfToJson.setBasicInfo = function() { - json.name = CONTENT[0]; - json.currentJob = CONTENT[1]; - json.email = ''; - index += 2; - if (!this.isSectionHeader(CONTENT[2])) { - json.email = CONTENT[2]; - } -}; - -//=========================== -// ERRORS -//=========================== - -linkedinPdfToJson.error = function() { - throw 'Unable to parse "' + text + '".'; -}; - -//=========================== -// TOKEN CHECKS -//=========================== - -// Determines whether the passed in text chunk is a LinkedIn profile section header. -// @param chunk - the current text chunk from the PDF. -// @return true if chunk is present in SECTION_HEADERS array. -// @return false if chunk is not present in SECTION_HEADERS array. -linkedinPdfToJson.isSectionHeader = function(chunk) { - return SECTION_HEADERS.indexOf(chunk) !== -1 ? true : false; -}; - -// Checks if the chunk is a job title e.g. 'Software Developer at Foobar'. -// Job titles follow this general format: 'job_title at company'. -// NOTE: LinkedIn PDF job titles have two spaces before and after the 'at'. -// @param chunk - the current text chunk from the PDF. -// @return true if the current token is 'HEADER' and chunk matches the regex. -// @return false otherwise. -linkedinPdfToJson.isJobTitle = function(chunk) { - return (token === SECTION_HEADER_TOKEN && !!chunk.match(/\s{2}at\s{2}/)) ? true : false; -}; - -// Checks if the chunk is a job date range e.g. 'September 2014 - December 2014'. -// Job dates follow this general format: '[month_name] year - [present|[[month_name] year]]]'. -// @param chunk - the current text chunk from the PDF. 
-// @return true if the current token is 'JOB TITLE' and chunk matches the regex. -// @return false otherwise. -linkedinPdfToJson.isJobDate = function(chunk) { - return (token === JOB_TOKEN && !!chunk.match(/\w*\s*\d+\s{2}\-\s{2}\w*\s*\d*/)) ? true : false; -}; - -// Checks if the chunk is a job period e.g. '(1 year 2 months)'. -// Job periods follow this general format: '(number month(s)|year)|(number year(s)[ number month(s)])'. -// @param chunk - the current text chunk from the PDF. -// @return true if the current token is 'JOB DATE' and chunk matches the regex. -// @return false otherwise. -linkedinPdfToJson.isJobPeriod = function(chunk) { - return (token === JOB_DATE_TOKEN && !!chunk.match(/\(\d+\s\w+\s*\d*\s*\w*\)/)) ? true : false; -}; - -linkedinPdfToJson.isSectionContent = function(chunk) { - return (token === JOB_DURATION_TOKEN && !this.isSectionHeader(chunk)) ? true : false; -}; + }; + + // Creates and populates a new JSON job property under the appropriate section header. + // For description fields in a LinkedIn profile section, the user may choose to format their descriptions + // by outlining them with letters or numbers or by using bullet points/bullet-like symbols e.g. 1., a., -, •, #, ~, * . + // The goal of this function is to retain that user defined formatting by putting each text chunk in its own object property if the + // job description contains bulleted text. + // For non-formatted descriptions the text is simply concatenated into a single string and stored in a property called 'text'. + this.job = function() { + // console.log('ZZZ JOB'); + var jobProperty = this.generateJobProperty(); + json[section][jobProperty] = json[section][jobProperty] || {}; + while (token === TOKENS.JOB && token !== TOKENS.JOB_DATE) { + var currentTitle = json[section][jobProperty].title; + json[section][jobProperty].title = currentTitle ? 
currentTitle + text : text; + this.getNextToken(); + } + if (token === TOKENS.JOB_DATE) { + json[section][jobProperty][TOKENS.JOB_DATE] = text; + this.getNextToken(); + if (token === TOKENS.JOB_DURATION) { + json[section][jobProperty].duration = text; + this.getNextToken(); + } + if (token === TOKENS.SECTION_CONTENT) { + var textProperty = 'text'; + var textCount = 0; + var inBulleted = false; + var hasBulleted = this.hasBulletedText(); + if (hasBulleted) { + while (token === TOKENS.SECTION_CONTENT) { + var bulleted = this.isBulleted(); + if (bulleted) { + inBulleted = true; + textCount++; + textProperty = this.generateTextProperty(textCount); + json[section][jobProperty][textProperty] = text; + } else if (inBulleted && !!text.match(/^\s\S/)) { + json[section][jobProperty][textProperty] = json[section][jobProperty][textProperty] + text; + } else { + inBulleted = false; + textCount++; + textProperty = this.generateTextProperty(textCount); + json[section][jobProperty][textProperty] = text; + } + this.getNextToken(); + } + } else { + while (token === TOKENS.SECTION_CONTENT) { + var jobText = json[section][jobProperty][textProperty]; + json[section][jobProperty][textProperty] = jobText ? jobText + text : text; + this.getNextToken(); + } + } + } + } + }; + + this.languages = function() { + json[section] = json[section] || {}; + var languageCount = 0; + while (token === TOKENS.LANGUAGE) { + languageCount++; + var languageProperty = this.generateLanguageProperty(languageCount); + json[section][languageProperty] = json[section][languageProperty] || {}; + json[section][languageProperty][TOKENS.LANGUAGE] = text; + this.getNextToken(); + if (token === TOKENS.LANGUAGE_PROFICIENCY) { + json[section][languageProperty][TOKENS.LANGUAGE_PROFICIENCY] = text; + this.getNextToken(); + } + } + }; -linkedinPdfToJson.isEndOfFile = function(chunk) { - return !chunk ? 
true : false; + this.skillsAndExpertise = function() { + // console.log('ZZZ SKILLS'); + json[section] = []; + while (token === TOKENS.SKILL) { + json[section].push(text); + this.getNextToken(); + } + }; + + //=========================== + // GENERATORS/SETTERS/HELPERS + //=========================== + + // Generates a new JSON job key by concatenating the job count to the string 'job'. + // @return the generated JSON job key. + this.generateJobProperty = function() { + return 'job' + jobCount.toString(); + }; + + // Resets the job count to 0. + this.resetJobCount = function() { + jobCount = 0; + }; + + // Generates a new JSON text key by concatenating the text count to the string 'text'. + // @return the generated JSON text key. + this.generateTextProperty = function(textCount) { + return 'text' + textCount.toString(); + }; + + // Removes each 'Page' element together with the page-number element that follows it, and removes the final 'Contact {person} on LinkedIn' element, from the chunks array. + // @param chunks - array of string elements representing the top-to-bottom + // flow of text from the PDF. + this.sanitize = function(chunks) { + for (var i = 0; i < chunks.length; i++) { + if (chunks[i] === 'Page' && chunks[i + 1].match(/\d+/)) { + chunks.splice(i, 2); + } + } + chunks.splice(chunks.length - 1, 1); + }; + + this.generateSchoolProperty = function(schoolCount) { + return 'school' + schoolCount.toString(); + }; + + this.generateLanguageProperty = function(languageCount) { + return 'language' + languageCount.toString(); + }; + + // Sets the name, current job, and potentially email properties of the json object + // based on the PDF text. + // The email property may not be set if it's not provided in the PDF. + // The positions of these properties can be assumed since they are standard across all LinkedIn profile PDFs. 
+ this.setBasicInfo = function() { + json.name = CONTENT[index]; + index++; + json.currentJob = CONTENT[index]; + if (!this.isSectionHeader(CONTENT[index + 1]) && UNSUPPORTED_SECTIONS.indexOf(CONTENT[index + 1]) === -1) { + index++; + json.email = CONTENT[index]; + } + }; + + // Searches through the current job description for bulleted text. + // @return true if a text chunk from the current job description is bulleted. + // @return false otherwise. + this.hasBulletedText = function() { + var currentToken = token, + currentText = text, + currentSection = section, + currentIndex = index; + while (token === TOKENS.SECTION_CONTENT) { + if (this.isBulleted()) { + token = currentToken; + text = currentText; + section = currentSection; + index = currentIndex; + return true; + } + this.getNextToken(); + } + token = currentToken; + text = currentText; + section = currentSection; + index = currentIndex; + return false; + }; + + //=========================== + // TOKEN CHECKS + //=========================== + + // TODO: Simplify by using subsections for grade, activities, etc. 
+ // Determines the next token based on the next text chunk + this.getNextToken = function() { + // console.log(); + // console.log(json); + // console.log('Setting token...'); + // console.log('previous token = ' + token); + index++; + last = text; + text = CONTENT[index]; + // console.log('text (untrimmed) = ' + '"' + text + '"'); + if (this.isEndOfFile()) { + token = section = TOKENS.EOF; + section = SECTION_HEADERS.Unsupported; + } else if (this.isSectionHeader()) { + token = TOKENS.SECTION_HEADER; + section = SECTION_HEADERS[text.trim()]; + } else if (this.isUnsupported()) { + token = TOKENS.UNSUPPORTED; + section = SECTION_HEADERS.Unsupported; + } else if (this.isInUnsupported()) { + token = TOKENS.UNKNOWN; + } else if (this.isSkill()) { + token = TOKENS.SKILL; + } else if (this.isJobTitle()) { + text = text.replace(/\s{2,}at\s{2,}/, ' at '); + token = TOKENS.JOB; + } else if (this.isDateRange()) { + text = text.trim().replace(/\s{2,}\-\s{2,}/, ' - '); + token = TOKENS.JOB_DATE; + } else if (this.isJobDuration()) { + token = TOKENS.JOB_DURATION; + } else if (this.isLanguageProficiency()) { + token = TOKENS.LANGUAGE_PROFICIENCY; + } else if (this.isLanguage()) { + token = TOKENS.LANGUAGE; + } else if (this.isSectionContent()) { + token = TOKENS.SECTION_CONTENT; + } else { + this.error(true); + } + // console.log('text (trimmed) = ' + '"' + text + '"'); + // console.log('new token = ' + token); + // console.log('Token set!'); + }; + + // Determines if the text chunk is preceded by a bullet/bullet-like symbol or outlined with numbers or letters e.g. •, -, A., 1., etc. + // @param previous (optional) - a specific text chunk to evaluate. + // @return true if the chunk has been preceded by a bullet or bullet-like symbol. + // @return false otherwise. + this.isBulleted = function(previous) { + var chunk = previous || text; + return !!chunk.match(/^([A-z0-9](?=\.)|[\-\•\#\~\*])/); + }; + + // Checks if the text chunk is the end of the file. 
+ // @return true if the text chunk is the end of the file i.e. undefined. + // @return false otherwise. + this.isEndOfFile = function() { + return !text; + }; + + // Determines whether the passed in text chunk is a LinkedIn profile section header. + // @param chunk (optional) - a specific text chunk to evaluate. + // @return true if text chunk is present in SECTION_HEADERS object. + // @return false otherwise. + this.isSectionHeader = function(chunk) { + chunk = chunk || text; + return SECTION_HEADERS.hasOwnProperty(chunk.trim()); + }; + + // Checks if the text chunk is a section that is currently unsupported + // @return true if the text chunk is the section header of an unsupported section. + // @return false otherwise. + this.isUnsupported = function() { + var chunk = text; + if (chunk === json.name && CONTENT[index + 1] === json.currentJob) { + chunk = 'recommendations'; + } + return chunk ? UNSUPPORTED_SECTIONS.indexOf(chunk.trim()) !== -1 : false; + }; + + this.isInUnsupported = function() { + return token === TOKENS.UNSUPPORTED || token === TOKENS.UNKNOWN; + }; + + // Checks if the text chunk is a skill. + // @return true if the text chunk is a skill under the Skills & Expertise section. + // @return false otherwise. + this.isSkill = function() { + return (token === TOKENS.SKILL || token === TOKENS.SECTION_HEADER) && section === SECTION_HEADERS['Skills & Expertise']; + }; + + // Checks if the text chunk is a job title. + // Job titles follow this general format: 'job_title at company'. + // NOTE: LinkedIn PDF job titles have two spaces before and three spaces after the 'at'. + // Job titles are also required by LinkedIn to fill out an Experience or Volunteer Experience section. + // @return true if the text chunk is the job title of the currently parsed job. + // @return false otherwise. 
+ this.isJobTitle = function() { + return ((token === TOKENS.SECTION_HEADER || token === TOKENS.SECTION_CONTENT || token === TOKENS.JOB_DURATION || token === TOKENS.JOB_DATE) && !!text.match(/\s{2,}at\s{2,}/)) || (token === TOKENS.JOB && !this.isDateRange()); + }; + + // Checks if the text chunk is a job date range e.g. 'September 2014 - December 2014'. + // Job dates follow this general format: '[month_name] year - [present|[[month_name] year]]'. + // NOTE: Job dates are required by LinkedIn to fill out an Experience or Volunteer Experience section. + // This is also used when parsing the Education section to gather basic education info. + // @param chunk (optional) - a specific text chunk to evaluate. + // @return true if the text chunk is a date range of the currently parsed job. + // @return false otherwise. + this.isDateRange = function(chunk) { + chunk = chunk || text; + return (token === TOKENS.JOB || token === TOKENS.EDU_BASIC_INFO) && !!chunk.match(/^\w*\s*\d+\s+\-\s+\w*\s*\d*/); + }; + + // Checks if the text chunk is a job period e.g. '(1 year 2 months)'. + // Job periods follow this general format: '(number month(s)|year)|(number year(s)[ number month(s)])'. + // NOTE: Job durations are always present for jobs since they are calculated by LinkedIn based on the job date, + // which is required by LinkedIn to fill out an Experience or Volunteer Experience section. + // @return true if the text chunk is a time duration of the currently parsed job. + // @return false otherwise. + this.isJobDuration = function() { + return token === TOKENS.JOB_DATE && !!text.match(/\(\d+\s\w+\s*\d*\s*\w*\)|^\(less than a year\)/); + }; + + this.isLanguageProficiency = function() { + return token === TOKENS.LANGUAGE && text.match(/proficiency\)$/); + }; + + this.isLanguage = function() { + return (section === SECTION_HEADERS.Languages || token === TOKENS.LANGUAGE); + }; + + // Checks if the text chunk is part of a section. 
+ // @return true if the text chunk is part of the current section's text content. + // @return false otherwise. + this.isSectionContent = function() { + return token === TOKENS.JOB_DURATION || token === TOKENS.JOB_DATE || token === TOKENS.SECTION_CONTENT || token === TOKENS.SECTION_HEADER; + }; + + //=========================== + // ERROR HANDLING + //=========================== + + // Throws a runtime parsing error. + // @param tokenError - boolean flag for registering a token processing error rather than a normal parsing error. + // @throw an error message stating the text chunk that was unable to be parsed. + this.error = function(tokenError) { + tokenError && this.tokenError() || this.parsingError(); + }; + + // Throws an error resulting from a failure during a call to getNextToken(). + // @throw TokenError with the name of the person and the text chunk that caused the error to be thrown. + this.tokenError = function() { + throw 'linkedin-pdf-to-json TokenError: (' + json.name + ') Unable to set token for the following text chunk: \'' + text + '\''; + }; + + // Throws an error resulting from a failure during parsing. + // @throw ParsingError with the name of the person and the text chunk that caused the error to be thrown. 
+ this.parsingError = function() { + throw 'linkedin-pdf-to-json ParsingError: (' + json.name + ') Unable to parse the following text chunk: \'' + text + '\''; + }; }; -// return linkedinPdfToJson; \ No newline at end of file +module.exports = linkedinPdfToJson; \ No newline at end of file diff --git a/package.json b/package.json index 20824c5..e9d08f4 100644 --- a/package.json +++ b/package.json @@ -1,29 +1,29 @@ { - "name": "linkedin-pdf-to-json", - "version": "1.0.0", - "description": "Converts a LinkedIn profile PDF to JSON.", - "main": "index.js", - "scripts": { - "test": "echo \"Error: no test specified\" && exit 1" - }, - "repository": { - "type": "git", - "url": "git+https://github.com/isaacmast/linkedin-pdf-to-json.git" - }, - "keywords": [ - "linkedin", - "pdf", - "json", - "profile", - "parse" - ], - "author": "Isaac Mast ", - "dependencies": { - "pdf-text": "^0.4.0" - }, - "license": "ISC", - "bugs": { - "url": "https://github.com/isaacmast/linkedin-pdf-to-json/issues" - }, - "homepage": "https://github.com/isaacmast/linkedin-pdf-to-json#readme" -} + "name": "linkedin-pdf-to-json", + "version": "1.0.0", + "description": "Converts a LinkedIn profile PDF to JSON.", + "main": "index.js", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + "repository": { + "type": "git", + "url": "git+https://github.com/isaacmast/linkedin-pdf-to-json.git" + }, + "keywords": [ + "linkedin", + "pdf", + "json", + "profile", + "parse" + ], + "author": "Isaac Mast ", + "dependencies": { + "pdf-text": "~0.4.0" + }, + "license": "MIT", + "bugs": { + "url": "https://github.com/isaacmast/linkedin-pdf-to-json/issues" + }, + "homepage": "https://github.com/isaacmast/linkedin-pdf-to-json#readme" +} \ No newline at end of file