Initial Commit
DisK0nn3cT committed Mar 29, 2018
0 parents commit 8597e87
Showing 4 changed files with 300 additions and 0 deletions.
29 changes: 29 additions & 0 deletions README.md
@@ -0,0 +1,29 @@
# ScrapedIn
A tool to scrape LinkedIn.

This tool assists in performing reconnaissance using the LinkedIn.com website/API. Provide a search string just as you would on the website itself and let ScrapedIn do the dirty work. Output is stored as an XLSX file, but it is intended to be used with Google Spreadsheets: after importing the XLSX into Google Spreadsheets you will get a "dataset" worksheet and a "report" worksheet.
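
## Usage
Set your LinkedIn credentials in `config.py`, then run the scraper with search keywords and an output filename (the `-u`/`--keywords` and `-o`/`--output` flags come from the script's argument parser; the keyword and filename below are hypothetical):

```
python ScrapedIn.py --keywords "security engineer" --output security_engineers
```

Results are written to `results/<output>.xlsx`; note the `results/` directory must exist before running.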

## dataset
- first name
- last name
- occupation
- location
- industry
- profile URL
- picture URL

## report
- Picture (displayed)
- Full Name, Occupation
- Link to Profile

### Disclaimer
This tool is for educational purposes only; using it violates LinkedIn.com's TOS. Use at your own risk.

## Screenshots

![alt tag](https://lh5.googleusercontent.com/Og8_HVL2sMhcw2Q20YTy5XxWF5-mZkJjFOtyG9CThVYhhT-yulm2Mv3zbo0OU4tNZInJCWeowmRwOF8=w3202-h1722-rw)

![alt tag](https://lh4.googleusercontent.com/bFDcl7Iai3iEULLxRKX8MpGN0zdl_a5besT9MDpRq88L-REROKmPAhDV2jNVtMPxD4ucoMJDDFdWxvI=w3202-h1722-rw)

![alt tag](https://lh6.googleusercontent.com/EqDaZcEFjWqHIkaHmhmJB3Ru2yzefebJVaxmFYuZ8S-r0r2daGkh5ehxvTrwaQaumnG_LY5Ua-iW59c=w3202-h1722-rw)
89 changes: 89 additions & 0 deletions SI_login.py
@@ -0,0 +1,89 @@
#!/usr/bin/python

__author__ = 'Danny Chrastil'
__email__ = '[email protected]'
__description__ = 'Python Requests doesn\'t handle LinkedIn authentication well. This uses urllib instead'
__version__ = '0.2'

import cookielib
import os
import urllib
import urllib2
import re
import string
import sys
import config
from bs4 import BeautifulSoup

def linkedIn():
global opener
cookie_filename = "cookies.txt"

# Simulate browser with cookies enabled
cj = cookielib.MozillaCookieJar(cookie_filename)
if os.access(cookie_filename, os.F_OK):
cj.load()

# Load Proxy settings
if len(config.proxylist) > 0:
#print "[Status] Setting up proxy (%s)" % config.proxylist[0]
proxy_handler = urllib2.ProxyHandler({'https':config.proxylist[0]})
opener = urllib2.build_opener(
proxy_handler,
urllib2.HTTPRedirectHandler(),
urllib2.HTTPHandler(debuglevel=0),
urllib2.HTTPSHandler(debuglevel=0),
urllib2.HTTPCookieProcessor(cj)
)
else:
opener = urllib2.build_opener(
urllib2.HTTPRedirectHandler(),
urllib2.HTTPHandler(debuglevel=0),
urllib2.HTTPSHandler(debuglevel=0),
urllib2.HTTPCookieProcessor(cj)
)

# Get CSRF Token
#print "[Status] Obtaining a CSRF token"
html = loadPage("https://www.linkedin.com/")
soup = BeautifulSoup(html, "html.parser")
csrf = soup.find(id="loginCsrfParam-login")['value']
#print csrf
# Authenticate
login_data = urllib.urlencode({
'session_key': config.linkedin['username'],
'session_password': config.linkedin['password'],
'loginCsrfParam': csrf,
})
#print "[Status] Authenticating to Linkedin"
html = loadPage("https://www.linkedin.com/uas/login-submit", login_data)
soup = BeautifulSoup(html, "html.parser")
    try:
        # Print the li_at session cookie so the caller can capture it from stdout
        print cj._cookies['.www.linkedin.com']['/']['li_at'].value
    except KeyError:
        # Exit with a message on stderr, keeping stdout empty so the caller
        # can detect the failed login
        sys.exit("[Error] li_at cookie not found; authentication failed")
    cj.save()
    os.remove(cookie_filename)

def loadPage(url, data=None):
    try:
        if data is not None:
            response = opener.open(url, data)
        else:
            response = opener.open(url)
        #return response.headers.get('Set-Cookie')
        return ''.join(response.readlines())
    except Exception:
        # If the URL fails to load for any reason (network error, 404,
        # temporary IP block by LinkedIn), exit rather than retry blindly
        print "\n[Fatal] Request failed; your IP may have been temporarily blocked"
        sys.exit(1)

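# When run directly (ScrapedIn.py invokes this script via subprocess), the
# li_at session cookie is printed to stdout for the caller to capture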
linkedIn()

159 changes: 159 additions & 0 deletions ScrapedIn.py
@@ -0,0 +1,159 @@
#!/usr/bin/python

__title__ = "ScrapedIn - Tool to Scrape LinkedIn"
__author__ = 'Danny Chrastil'
__email__ = '[email protected]'
__description__ = "A recon tool that allows you to scrape profile search results from LinkedIn"
__disclaimer__ = "This tool violates TOS of LinkedIn.com. For educational purposes only. Use at your own risk"
__version__ = '2.0'

import sys
import re
import time
import xlsxwriter
import json
import argparse
import requests
import subprocess
import urllib
import math
from thready import threaded
reload(sys)
sys.setdefaultencoding('utf-8')

""" Setup Argument Parameters """
parser = argparse.ArgumentParser(description='LinkedIn reconnaissance tool')
parser.add_argument('-u', '--keywords', help='Keywords to search')
parser.add_argument('-o', '--output', help='Output file (do not include extension)')
args = parser.parse_args()

def get_search():
# Fetch the initial page to get results/page counts
#url = 'https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List()&keywords=%s&origin=GLOBAL_SEARCH_HEADER&q=guided&searchId=1489295486936&start=0' % search
url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v-%%3EPEOPLE,facetGeoRegion-%%3Ear%%3A0)&keywords=%s&origin=FACETED_SEARCH&q=guided&start=0" % search
#url = 'https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v->PEOPLE,facetCurrentCompany->31752)&origin=GLOBAL_SEARCH_HEADER&q=guided&start=0'
#url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v->PEOPLE,facetCurrentCompany->31752)&origin=OTHER&q=guided&start=0"
#url = 'https://www.linkedin.com/search/results/people/?facetCurrentCompany=%5B"75769"%5D'

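    # The Voyager API checks that the Csrf-Token header matches the JSESSIONID
    # cookie value; any matching 'ajax:<number>' pair passes (this one is
    # hard-coded)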
headers = {'Csrf-Token':'ajax:7736867257193100830'}
cookies['JSESSIONID'] = 'ajax:7736867257193100830'
cookies['X-RestLi-Protocol-Version'] = '2.0.0'
r = requests.get(url, cookies=cookies, headers=headers)
content = json.loads(r.text)
data_total = content['paging']['total']

    # Calculate the number of pages at 40 results/page, rounding up so a
    # partial final page is still fetched
    pages = int(math.ceil(data_total / 40.0))
    if pages == 0:
        pages = 1
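    # e.g. 130 results -> ceil(130/40.0) = 4 pages, fetched at start offsets 0, 40, 80, 120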

print "[Info] %i Results Found" % data_total
    if data_total > 1000:
        # LinkedIn caps search results at 1000, i.e. 25 pages of 40
        pages = 25
print "[Notice] LinkedIn only allows 1000 results. Refine keywords to capture all data"
print "[Info] Fetching %i Pages" % pages
print

# Set record position for XLSX
recordpos = 1

for p in range(pages):
# Request results for each page using the start offset
        #url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List()&keywords=%s&origin=GLOBAL_SEARCH_HEADER&q=guided&searchId=1489295486936&start=%i" % (search, p*40)
url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v-%%3EPEOPLE,facetGeoRegion-%%3Ear%%3A0)&keywords=%s&origin=FACETED_SEARCH&q=guided&start=%i" % (search, p*40)
#url = 'https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v->PEOPLE,facetCurrentCompany->31752)&origin=GLOBAL_SEARCH_HEADER&q=guided&start=%i' % (p*40)
#url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v->PEOPLE,facetCurrentCompany->%s)&origin=OTHER&q=guided&start=%i" % (p*40)
#url = 'https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v->PEOPLE,facetCurrentCompany->75769)&keywords=%s&origin=GLOBAL_SEARCH_HEADER&q=guided&searchId=1489295486936&start=%i' % (search, p*40)
#url = 'https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(facetGeoRegion-%%3Ear%%3A0)&keywords=%s&origin=GLOBAL_SEARCH_HEADER&q=guided&searchId=1489295486936&start=%i' % (search, p*40)
#print url
#print
r = requests.get(url, cookies=cookies, headers=headers)
content = r.text.encode('UTF-8')
content = json.loads(content)
print "[Info] Fetching page %i with %i results" % (p+1,len(content['elements'][0]['elements']))
        for c in content['elements'][0]['elements']:
            try:
                profile = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']
                if not profile['headless']:
                    try:
                        data_industry = profile['industry']
                    except KeyError:
                        data_industry = ""
                    data_firstname = profile['miniProfile']['firstName']
                    data_lastname = profile['miniProfile']['lastName']
                    data_slug = "https://www.linkedin.com/in/%s" % profile['miniProfile']['publicIdentifier']
                    data_occupation = profile['miniProfile']['occupation']
                    data_location = profile['location']
                    try:
                        data_picture = "https://media.licdn.com/mpr/mpr/shrinknp_400_400%s" % profile['miniProfile']['picture']['com.linkedin.voyager.common.MediaProcessorImage']['id']
                    except KeyError:
                        #print "[Notice] No picture found for %s %s, %s" % (data_firstname, data_lastname, data_occupation)
                        data_picture = ""

# Write data to XLSX file
worksheet1.write('A%i' % recordpos, data_firstname)
worksheet1.write('B%i' % recordpos, data_lastname)
worksheet1.write('C%i' % recordpos, data_occupation)
worksheet1.write('D%i' % recordpos, data_location)
worksheet1.write('E%i' % recordpos, data_industry)
worksheet1.write('F%i' % recordpos, data_slug)
worksheet1.write('G%i' % recordpos, data_picture)
worksheet2.write('A%i' % recordpos, '=IMAGE(dataset!G%i)' % recordpos)
worksheet2.write('B%i' % recordpos, '=dataset!A%i&" "&dataset!B%i&"\n"&dataset!C%i&"\n"&dataset!D%i&"\n"&dataset!E%i' % (recordpos,recordpos,recordpos,recordpos,recordpos))
worksheet2.write('C%i' % recordpos, '=HYPERLINK(dataset!F%i)' % recordpos)
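                    # =IMAGE() is a Google Sheets function: the report formulas
                    # render properly only after the XLSX is imported into
                    # Google Spreadsheets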
worksheet2.set_row(recordpos-1,125)
# Increment Record Position
recordpos = recordpos + 1
else:
print "[Notice] Headless profile found. Skipping"
            except KeyError:
                print "[Notice] Unexpected result format. Skipping"
                continue
print

def authenticate():
try:
session = subprocess.Popen(['python', 'SI_login.py'], stdout=subprocess.PIPE).communicate()[0].replace("\n","")
if len(session) == 0:
sys.exit("[Error] Unable to login to LinkedIn.com")
print "[Info] Obtained new session: %s" % session
cookies = dict(li_at=session)
    except Exception, e:
        sys.exit("[Fatal] Could not authenticate to LinkedIn. %s" % e)
return cookies

if __name__ == '__main__':
title = """
__ _ _____
/ _\ ___ _ __ __ _ _ __ ___ __| | \_ \_ __
\ \ / __| '__/ _` | '_ \ / _ \/ _` | / /\/ '_ \
_\ \ (__| | | (_| | |_) | __/ (_| /\/ /_ | | | |
\__/\___|_| \__,_| .__/ \___|\__,_\____/ |_| |_|
|_|
tool to scrape linkedin v2.0
"""
print title.decode('UTF-8')

# Prompt user for data variables
    search = args.keywords if args.keywords is not None else raw_input("Enter search keywords (use quotes for more precise results)\n")
    outfile = args.output if args.output is not None else raw_input("Enter filename for output (exclude file extension)\n")
print

# URL Encode for the querystring
search = urllib.quote_plus(search)
cookies = authenticate()

    # Initialize the XLSX file (the results/ directory must already exist)
    workbook = xlsxwriter.Workbook('results/%s.xlsx' % outfile)
worksheet1 = workbook.add_worksheet('dataset')
worksheet2 = workbook.add_worksheet('report')
worksheet2.set_column(0,0, 25)
worksheet2.set_column(1,2, 75)

# Initialize Scraping
get_search()

    # Close XLSX file
workbook.close()
23 changes: 23 additions & 0 deletions config.py
@@ -0,0 +1,23 @@
#!/usr/bin/python

## [LINKEDIN CREDENTIALS] ##
# it may be preferable to use a fake
# account to avoid account suspension

linkedin = dict(
username = '',
password = '',
)
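
# Example (hypothetical values):
# linkedin = dict(
#     username = 'burner.account@example.com',
#     password = 'S3cretPassw0rd',
# )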

## [PROXY LIST] ##
# Leave empty to use your own IP address;
# using a proxy helps avoid being blocked
# for sending too much traffic

proxylist = []
#proxylist.append('http://127.0.0.1:8080')

## [MISCELLANEOUS] ##

timeout = 10
