Added cookie support
Charlie committed Apr 17, 2016
1 parent fb628ef commit 4bb032d
Showing 3 changed files with 42 additions and 9 deletions.
README.md: 5 additions & 0 deletions
@@ -21,6 +21,11 @@ If you wish to login with HTTP Basic Auth then crawl:
 ./xsscrapy.py -u http://example.com/login_page -l loginname --basic
 ```
 
+If you wish to use cookies:
+```shell
+./xsscrapy.py -u http://example.com/login_page --cookie "SessionID=abcdef1234567890"
+```
+
 If you wish to limit simultaneous connections to 20:
 ```shell
 ./xsscrapy.py -u http://example.com -c 20
xsscrapy.py: 7 additions & 3 deletions
@@ -19,6 +19,7 @@ def get_args():
     parser.add_argument('-c', '--connections', default='30', help="Set the max number of simultaneous connections allowed, default=30")
     parser.add_argument('-r', '--ratelimit', default='0', help="Rate in requests per minute, default=0")
     parser.add_argument('--basic', help="Use HTTP Basic Auth to login", action="store_true")
+    parser.add_argument('-k', '--cookie', help="Cookie in key=value form, e.g. --cookie SessionID=afgh3193e9103bca9318031bcdf")
     args = parser.parse_args()
     return args
 
@@ -28,9 +29,12 @@ def main():
     if rate not in [None, '0']:
         rate = str(60 / float(rate))
     try:
-        execute(['scrapy', 'crawl', 'xsscrapy',
-                 '-a', 'url=%s' % args.url, '-a', 'user=%s' % args.login, '-a',
-                 'pw=%s' % args.password, '-a', 'basic=%s' % args.basic,
+        cookie_key = args.cookie.split('=', 1)[0] if args.cookie else None
+        cookie_value = ''.join(args.cookie.split('=', 1)[1:]) if args.cookie else None
+        execute(['scrapy', 'crawl', 'xsscrapy',
+                 '-a', 'url=%s' % args.url, '-a', 'user=%s' % args.login, '-a',
+                 'pw=%s' % args.password, '-a', 'basic=%s' % args.basic,
+                 '-a', 'cookie_key=%s' % cookie_key, '-a', 'cookie_value=%s' % cookie_value,
                  '-s', 'CONCURRENT_REQUESTS=%s' % args.connections,
                  '-s', 'DOWNLOAD_DELAY=%s' % rate])
     except KeyboardInterrupt:
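
The `--cookie` value is split on the first `=` only, so session tokens that themselves contain `=` (base64 padding, for example) survive intact. A minimal sketch of that parsing step, using a made-up cookie string:

```python
# Minimal sketch of the split above; the cookie string is hypothetical.
cookie = "SessionID=dG9rZW4=="          # note the '=' padding inside the value

parts = cookie.split('=', 1)            # split on the first '=' only
cookie_key = parts[0]                   # 'SessionID'
cookie_value = ''.join(parts[1:])       # 'dG9rZW4==' ('' if there is no '=')

print(cookie_key, cookie_value)
```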
xsscrapy/spiders/xss_spider.py: 30 additions & 6 deletions
@@ -45,13 +45,24 @@ def __init__(self, *args, **kwargs):
         # gruyere or the second cookie delim
         self.test_str = '\'"(){}<x>:/'
 
-        # Login details
+        # Login details. Either user or cookie
         self.login_user = kwargs.get('user')
+        self.login_cookie_key = kwargs.get('cookie_key')
+        self.login_cookie_value = kwargs.get('cookie_value')
 
+        # Turn 'None' strings into real Nones
         if self.login_user == 'None':
             self.login_user = None
-        else:
-            # Don't hit links with 'logout' in them since self.login_user exists
+        if self.login_cookie_key == 'None':
+            self.login_cookie_key = None
+        if self.login_cookie_value == 'None':
+            self.login_cookie_value = None
+
+        if self.login_user or (self.login_cookie_key and self.login_cookie_value):
+            # Don't hit links with 'logout' in them since self.login_user or cookies exist
             self.rules = (Rule(LinkExtractor(deny=('logout')), callback='parse_resp', follow=True), )
 
         # If password is not set and login user is then get password, otherwise set it
         if kwargs.get('pw') == 'None' and self.login_user is not None:
             self.login_pass = raw_input("Please enter the password: ")
         else:
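
The `deny=('logout')` rule matters most in cookie mode: following a logout link mid-crawl would invalidate the session cookie. A minimal sketch of how the extractor filters such links, over hypothetical HTML (assumes Scrapy is installed):

```python
# Minimal sketch; the page and URLs are hypothetical.
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

html = b'<a href="/profile">me</a> <a href="/logout">log out</a>'
resp = HtmlResponse('http://example.com/', body=html, encoding='utf-8')

links = LinkExtractor(deny=('logout')).extract_links(resp)
print([l.url for l in links])  # only http://example.com/profile survives
```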
@@ -83,11 +94,24 @@ def start_requests(self):
             otherwise pass it to the normal callback function '''
         if self.login_user and self.login_pass:
             if self.basic_auth == 'true':
-                yield Request(url=self.start_urls[0]) # Take out the callback arg so crawler falls back to the rules' callback
+                # Take out the callback arg so crawler falls back to the rules' callback
+                if self.login_cookie_key and self.login_cookie_value:
+                    yield Request(url=self.start_urls[0], cookies={self.login_cookie_key: self.login_cookie_value})
+                else:
+                    yield Request(url=self.start_urls[0])
             else:
-                yield Request(url=self.start_urls[0], callback=self.login)
+                if self.login_cookie_key and self.login_cookie_value:
+                    yield Request(url=self.start_urls[0],
+                                  cookies={self.login_cookie_key: self.login_cookie_value},
+                                  callback=self.login)
+                else:
+                    yield Request(url=self.start_urls[0], callback=self.login)
         else:
-            yield Request(url=self.start_urls[0]) # Take out the callback arg so crawler falls back to the rules' callback
+            # Take out the callback arg so crawler falls back to the rules' callback
+            if self.login_cookie_key and self.login_cookie_value:
+                yield Request(url=self.start_urls[0], cookies={self.login_cookie_key: self.login_cookie_value})
+            else:
+                yield Request(url=self.start_urls[0])
 
     def login(self, response):
         ''' Fill out the login form and return the request'''
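
Two spider-side details here are easy to miss: Scrapy passes `-a` arguments in as strings, so a missing value arrives as the literal string 'None' and must be converted back, and the `cookies` argument on `Request` seeds Scrapy's cookie middleware, which re-sends the cookie on later requests to the same domain. A minimal sketch of both, with hypothetical values (assumes Scrapy is installed):

```python
# Minimal sketch; URL and cookie values are hypothetical.
from scrapy import Request

# 1) -a spider arguments always arrive as strings, hence the 'None' checks.
cookie_key = 'None'                                    # what kwargs.get() returns
cookie_key = None if cookie_key == 'None' else cookie_key

# 2) The cookies kwarg is picked up by Scrapy's CookiesMiddleware, so
#    later requests to the same domain carry the cookie automatically.
if cookie_key:
    req = Request('http://example.com/', cookies={cookie_key: 'abcdef1234567890'})
else:
    req = Request('http://example.com/')
```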
