Initial Commit
DisK0nn3cT committed Mar 29, 2018
0 parents commit 8597e87
Showing 4 changed files with 300 additions and 0 deletions.
29 changes: 29 additions & 0 deletions README.md
@@ -0,0 +1,29 @@
# ScrapedIn
A tool to scrape LinkedIn.

This tool assists in performing reconnaissance using the LinkedIn.com website/API. Provide a search string just as you would on the website itself and let ScrapedIn do the dirty work. Output is stored as an XLSX file, but it is intended to be used with Google Spreadsheets: after importing the XLSX into Google Spreadsheets you will get a "dataset" worksheet and a "report" worksheet.
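
## Usage
Set your LinkedIn credentials in `config.py`, then run the scraper with search keywords and an output filename (the `-u`/`--keywords` and `-o`/`--output` flags come from the script's argument parser; the keyword and filename below are hypothetical):

```
python ScrapedIn.py --keywords "security engineer" --output security_engineers
```

Results are written to `results/<output>.xlsx`; note the `results/` directory must exist before running.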

## dataset
- first name
- last name
- occupation
- location
- industry
- profile URL
- picture URL

## report
- Picture (displayed)
- Full Name, Occupation
- Link to Profile

### Disclaimer
This tool is for educational purposes only; using it violates LinkedIn.com's TOS. Use at your own risk.

## Screenshots

![alt tag](https://lh5.googleusercontent.com/Og8_HVL2sMhcw2Q20YTy5XxWF5-mZkJjFOtyG9CThVYhhT-yulm2Mv3zbo0OU4tNZInJCWeowmRwOF8=w3202-h1722-rw)

![alt tag](https://lh4.googleusercontent.com/bFDcl7Iai3iEULLxRKX8MpGN0zdl_a5besT9MDpRq88L-REROKmPAhDV2jNVtMPxD4ucoMJDDFdWxvI=w3202-h1722-rw)

![alt tag](https://lh6.googleusercontent.com/EqDaZcEFjWqHIkaHmhmJB3Ru2yzefebJVaxmFYuZ8S-r0r2daGkh5ehxvTrwaQaumnG_LY5Ua-iW59c=w3202-h1722-rw)
89 changes: 89 additions & 0 deletions SI_login.py
@@ -0,0 +1,89 @@
#!/usr/bin/python

__author__ = 'Danny Chrastil'
__email__ = '[email protected]'
__description__ = 'Python Requests doesn\'t handle LinkedIn authentication well. This uses urllib instead'
__version__ = '0.2'

import cookielib
import os
import urllib
import urllib2
import re
import string
import sys
import config
from bs4 import BeautifulSoup

def linkedIn():
global opener
cookie_filename = "cookies.txt"

# Simulate browser with cookies enabled
cj = cookielib.MozillaCookieJar(cookie_filename)
if os.access(cookie_filename, os.F_OK):
cj.load()

# Load Proxy settings
if len(config.proxylist) > 0:
#print "[Status] Setting up proxy (%s)" % config.proxylist[0]
proxy_handler = urllib2.ProxyHandler({'https':config.proxylist[0]})
opener = urllib2.build_opener(
proxy_handler,
urllib2.HTTPRedirectHandler(),
urllib2.HTTPHandler(debuglevel=0),
urllib2.HTTPSHandler(debuglevel=0),
urllib2.HTTPCookieProcessor(cj)
)
else:
opener = urllib2.build_opener(
urllib2.HTTPRedirectHandler(),
urllib2.HTTPHandler(debuglevel=0),
urllib2.HTTPSHandler(debuglevel=0),
urllib2.HTTPCookieProcessor(cj)
)

# Get CSRF Token
#print "[Status] Obtaining a CSRF token"
html = loadPage("https://www.linkedin.com/")
soup = BeautifulSoup(html, "html.parser")
csrf = soup.find(id="loginCsrfParam-login")['value']
#print csrf
# Authenticate
login_data = urllib.urlencode({
'session_key': config.linkedin['username'],
'session_password': config.linkedin['password'],
'loginCsrfParam': csrf,
})
#print "[Status] Authenticating to Linkedin"
html = loadPage("https://www.linkedin.com/uas/login-submit", login_data)
soup = BeautifulSoup(html, "html.parser")
    try:
        # Print the li_at session cookie so the caller can capture it from stdout
        print cj._cookies['.www.linkedin.com']['/']['li_at'].value
    except KeyError:
        # Exit with a message on stderr, keeping stdout empty so the caller
        # can detect the failed login
        sys.exit("[Error] li_at cookie not found; authentication failed")
    cj.save()
    os.remove(cookie_filename)

def loadPage(url, data=None):
    try:
        if data is not None:
            response = opener.open(url, data)
        else:
            response = opener.open(url)
        #return response.headers.get('Set-Cookie')
        return ''.join(response.readlines())
    except Exception:
        # If the URL fails to load for any reason (network error, 404,
        # temporary IP block by LinkedIn), exit rather than retry blindly
        print "\n[Fatal] Request failed; your IP may have been temporarily blocked"
        sys.exit(1)

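# When run directly (ScrapedIn.py invokes this script via subprocess), the
# li_at session cookie is printed to stdout for the caller to capture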
linkedIn()

159 changes: 159 additions & 0 deletions ScrapedIn.py
@@ -0,0 +1,159 @@
#!/usr/bin/python

__title__ = "ScrapedIn - Tool to Scrape LinkedIn"
__author__ = 'Danny Chrastil'
__email__ = '[email protected]'
__description__ = "A recon tool that allows you to scrape profile search results from LinkedIn"
__disclaimer__ = "This tool violates TOS of LinkedIn.com. For educational purposes only. Use at your own risk"
__version__ = '2.0'

import sys
import re
import time
import xlsxwriter
import json
import argparse
import requests
import subprocess
import urllib
import math
from thready import threaded
reload(sys)
sys.setdefaultencoding('utf-8')

""" Setup Argument Parameters """
parser = argparse.ArgumentParser(description='LinkedIn reconnaissance tool')
parser.add_argument('-u', '--keywords', help='Keywords to search')
parser.add_argument('-o', '--output', help='Output file (do not include extension)')
args = parser.parse_args()

def get_search():
# Fetch the initial page to get results/page counts
#url = 'https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List()&keywords=%s&origin=GLOBAL_SEARCH_HEADER&q=guided&searchId=1489295486936&start=0' % search
url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v-%%3EPEOPLE,facetGeoRegion-%%3Ear%%3A0)&keywords=%s&origin=FACETED_SEARCH&q=guided&start=0" % search
#url = 'https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v->PEOPLE,facetCurrentCompany->31752)&origin=GLOBAL_SEARCH_HEADER&q=guided&start=0'
#url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v->PEOPLE,facetCurrentCompany->31752)&origin=OTHER&q=guided&start=0"
#url = 'https://www.linkedin.com/search/results/people/?facetCurrentCompany=%5B"75769"%5D'

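    # The Voyager API checks that the Csrf-Token header matches the JSESSIONID
    # cookie value; any matching 'ajax:<number>' pair passes (this one is
    # hard-coded)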
headers = {'Csrf-Token':'ajax:7736867257193100830'}
cookies['JSESSIONID'] = 'ajax:7736867257193100830'
cookies['X-RestLi-Protocol-Version'] = '2.0.0'
r = requests.get(url, cookies=cookies, headers=headers)
content = json.loads(r.text)
data_total = content['paging']['total']

    # Calculate the number of pages at 40 results/page, rounding up so a
    # partial final page is still fetched
    pages = int(math.ceil(data_total / 40.0))
    if pages == 0:
        pages = 1
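    # e.g. 130 results -> ceil(130/40.0) = 4 pages, fetched at start offsets 0, 40, 80, 120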

print "[Info] %i Results Found" % data_total
    if data_total > 1000:
        # LinkedIn caps search results at 1000, i.e. 25 pages of 40
        pages = 25
print "[Notice] LinkedIn only allows 1000 results. Refine keywords to capture all data"
print "[Info] Fetching %i Pages" % pages
print

# Set record position for XLSX
recordpos = 1

for p in range(pages):
# Request results for each page using the start offset
        #url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List()&keywords=%s&origin=GLOBAL_SEARCH_HEADER&q=guided&searchId=1489295486936&start=%i" % (search, p*40)
url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v-%%3EPEOPLE,facetGeoRegion-%%3Ear%%3A0)&keywords=%s&origin=FACETED_SEARCH&q=guided&start=%i" % (search, p*40)
#url = 'https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v->PEOPLE,facetCurrentCompany->31752)&origin=GLOBAL_SEARCH_HEADER&q=guided&start=%i' % (p*40)
#url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v->PEOPLE,facetCurrentCompany->%s)&origin=OTHER&q=guided&start=%i" % (p*40)
#url = 'https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v->PEOPLE,facetCurrentCompany->75769)&keywords=%s&origin=GLOBAL_SEARCH_HEADER&q=guided&searchId=1489295486936&start=%i' % (search, p*40)
#url = 'https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(facetGeoRegion-%%3Ear%%3A0)&keywords=%s&origin=GLOBAL_SEARCH_HEADER&q=guided&searchId=1489295486936&start=%i' % (search, p*40)
#print url
#print
r = requests.get(url, cookies=cookies, headers=headers)
content = r.text.encode('UTF-8')
content = json.loads(content)
print "[Info] Fetching page %i with %i results" % (p+1,len(content['elements'][0]['elements']))
        for c in content['elements'][0]['elements']:
            try:
                profile = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']
                if not profile['headless']:
                    try:
                        data_industry = profile['industry']
                    except KeyError:
                        data_industry = ""
                    data_firstname = profile['miniProfile']['firstName']
                    data_lastname = profile['miniProfile']['lastName']
                    data_slug = "https://www.linkedin.com/in/%s" % profile['miniProfile']['publicIdentifier']
                    data_occupation = profile['miniProfile']['occupation']
                    data_location = profile['location']
                    try:
                        data_picture = "https://media.licdn.com/mpr/mpr/shrinknp_400_400%s" % profile['miniProfile']['picture']['com.linkedin.voyager.common.MediaProcessorImage']['id']
                    except KeyError:
                        #print "[Notice] No picture found for %s %s, %s" % (data_firstname, data_lastname, data_occupation)
                        data_picture = ""

# Write data to XLSX file
worksheet1.write('A%i' % recordpos, data_firstname)
worksheet1.write('B%i' % recordpos, data_lastname)
worksheet1.write('C%i' % recordpos, data_occupation)
worksheet1.write('D%i' % recordpos, data_location)
worksheet1.write('E%i' % recordpos, data_industry)
worksheet1.write('F%i' % recordpos, data_slug)
worksheet1.write('G%i' % recordpos, data_picture)
worksheet2.write('A%i' % recordpos, '=IMAGE(dataset!G%i)' % recordpos)
worksheet2.write('B%i' % recordpos, '=dataset!A%i&" "&dataset!B%i&"\n"&dataset!C%i&"\n"&dataset!D%i&"\n"&dataset!E%i' % (recordpos,recordpos,recordpos,recordpos,recordpos))
worksheet2.write('C%i' % recordpos, '=HYPERLINK(dataset!F%i)' % recordpos)
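                    # =IMAGE() is a Google Sheets function: the report formulas
                    # render properly only after the XLSX is imported into
                    # Google Spreadsheets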
worksheet2.set_row(recordpos-1,125)
# Increment Record Position
recordpos = recordpos + 1
else:
print "[Notice] Headless profile found. Skipping"
            except KeyError:
                print "[Notice] Unexpected result format. Skipping"
                continue
print

def authenticate():
try:
session = subprocess.Popen(['python', 'SI_login.py'], stdout=subprocess.PIPE).communicate()[0].replace("\n","")
if len(session) == 0:
sys.exit("[Error] Unable to login to LinkedIn.com")
print "[Info] Obtained new session: %s" % session
cookies = dict(li_at=session)
    except Exception, e:
        sys.exit("[Fatal] Could not authenticate to LinkedIn. %s" % e)
return cookies

if __name__ == '__main__':
title = """
__ _ _____
/ _\ ___ _ __ __ _ _ __ ___ __| | \_ \_ __
\ \ / __| '__/ _` | '_ \ / _ \/ _` | / /\/ '_ \
_\ \ (__| | | (_| | |_) | __/ (_| /\/ /_ | | | |
\__/\___|_| \__,_| .__/ \___|\__,_\____/ |_| |_|
|_|
tool to scrape linkedin v2.0
"""
print title.decode('UTF-8')

# Prompt user for data variables
    search = args.keywords if args.keywords is not None else raw_input("Enter search keywords (use quotes for more precise results)\n")
    outfile = args.output if args.output is not None else raw_input("Enter filename for output (exclude file extension)\n")
print

# URL Encode for the querystring
search = urllib.quote_plus(search)
cookies = authenticate()

    # Initialize the XLSX file (the results/ directory must already exist)
    workbook = xlsxwriter.Workbook('results/%s.xlsx' % outfile)
worksheet1 = workbook.add_worksheet('dataset')
worksheet2 = workbook.add_worksheet('report')
worksheet2.set_column(0,0, 25)
worksheet2.set_column(1,2, 75)

# Initialize Scraping
get_search()

    # Close XLSX file
workbook.close()
23 changes: 23 additions & 0 deletions config.py
@@ -0,0 +1,23 @@
#!/usr/bin/python

## [LINKEDIN CREDENTIALS] ##
# it may be preferable to use a fake
# account to avoid account suspension

linkedin = dict(
username = '',
password = '',
)
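
# Example (hypothetical values):
# linkedin = dict(
#     username = 'burner.account@example.com',
#     password = 'S3cretPassw0rd',
# )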

## [PROXY LIST] ##
# Leave empty to use your own IP address;
# using a proxy helps avoid being blocked
# for sending too much traffic

proxylist = []
#proxylist.append('http://127.0.0.1:8080')

## [MISCELLANEOUS] ##

timeout = 10
