Merge branch 'release/0.4.1'

bibcure · Aug 27, 2017 · 606db91
2 parents c66e7e8 + 577a3b5
commit 606db91
Show file tree

Hide file tree

Showing 7 changed files with 28 additions and 128 deletions.
diff --git a/LICENSE.txt b/LICENSE.txt
diff --git a/README b/README
@@ -15,7 +15,7 @@ Install
 
 ::
 
-    $ sudo pip install scihub2pdf
+    $ sudo python /usr/bin/pip install scihub2pdf
 
 If you want to download files from scihub you will need to get PhantomJS
 
@@ -24,7 +24,7 @@ OSX
 
 ::
 
-    $ brew install phantomjs
+    $ npm install -g phantomjs
 
 Linux Using npm
 ~~~~~~~~~~~~~~~
@@ -41,7 +41,7 @@ Given a bibtex file
 
 ::
 
-    $ scihub2pdf -i input.bib 
+    $ scihub2pdf -i input.bib
 
 Given a DOI number...
 
@@ -109,8 +109,8 @@ Given a text file like
 
 ::
 
-    Some Title 1 
-    Some Title 2 
+    Some Title 1
+    Some Title 2
     .....
 
 download all pdf's
@@ -132,5 +132,5 @@ download all pdf's
 
 ::
 
-    $ scihub2pdf -i arxiv_ids.txt --txt 
+    $ scihub2pdf -i arxiv_ids.txt --txt
 
diff --git a/README.md b/README.md
@@ -11,13 +11,13 @@ database of libgen,  Sci-Hub and Arxiv.
 ## Install
 
 ```
-$ sudo pip install scihub2pdf
+$ sudo python /usr/bin/pip install scihub2pdf
 ```
 If you want  to download files from scihub you will need to get  PhantomJS
 
 ### OSX
 ```
-$ brew install phantomjs
+$ npm install -g phantomjs
 ```
 ### Linux Using npm
 
@@ -32,7 +32,7 @@ $ sudo npm install -g phantomjs
 
 Given a bibtex file
 ```
-$ scihub2pdf -i input.bib 
+$ scihub2pdf -i input.bib
 ```
 
 Given a DOI number...
@@ -91,8 +91,8 @@ $ scihub2pdf -i dois.txt --txt
 Given a text file like
 
 ```
-Some Title 1 
-Some Title 2 
+Some Title 1
+Some Title 2
 .....
 ```
 download all pdf's
@@ -109,5 +109,5 @@ arXiv:1708.05948
 ```
 download all pdf's
 ```
-$ scihub2pdf -i arxiv_ids.txt --txt 
+$ scihub2pdf -i arxiv_ids.txt --txt
 ```
diff --git a/scihub2pdf/__init__.py b/scihub2pdf/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.4.0"
+__version__ = "0.4.1"
 __license__ = "AGPLv3"
 __author__ = "Bruno Messias"
 __author_email__ = "[email protected]"
diff --git a/scihub2pdf/scihub.py b/scihub2pdf/scihub.py
@@ -2,11 +2,13 @@
 
 import requests
 from selenium import webdriver
-from selenium.common.exceptions import NoSuchElementException
+from selenium.common.exceptions import NoSuchElementException, WebDriverException
+
 from PIL import Image
 from scihub2pdf.tools import norm_url, download_pdf
 from base64 import b64decode as b64d
 from six import string_types
+import sys
 try:
     from StringIO import StringIO
     from io import BytesIO
@@ -45,116 +47,14 @@ def __init__(self,
 
 
     def start(self):
-        self.driver = webdriver.PhantomJS()
-        self.s = requests.Session()
-
-    def get_session(self):
-        cookies = self.driver.get_cookies()
-        for cookie in cookies:
-            self.s.cookies.set(cookie['name'], cookie['value'])
-
-        return self.s
-
-    def download(self):
-        found, r = download_pdf(
-            self.s,
-            self.pdf_file,
-            self.pdf_url,
-            self.headers)
-
-        if not found:
-            self.driver.save_screenshot(self.pdf_file+".png")
-
-        return found,  r
-
-    def navigate_to(self, doi, pdf_file):
-        self.doi = doi
-        self.pdf_file = pdf_file
-        self.sci_url = self.domain_scihub+doi
-        print("\n\tDOI: ", doi)
-        print("\tSci-Hub Link: ", self.sci_url)
-        r = requests.get(self.sci_url)
-        found = r.status_code == 200
-        if found:
-            self.driver.get(self.sci_url)
-            self.driver.set_window_size(1120, 550)
-        else:
-            print("\tSomething is wrong with sci-hub,")
-            print("\tstatus_code: ", r.status_code)
-        return found, r
-
-    def get_captcha_img(self):
-        self.driver.execute_script("document.getElementById('content').style.zIndex = 9999;")
-        self.driver.switch_to.frame(self.el_iframe)
-        self.driver.execute_script("document.getElementById('captcha').style.zIndex = 9999;")
-        location = self.el_captcha.location
-        size = self.el_captcha.size
-        captcha_screenshot = self.driver.get_screenshot_as_base64()
-        image_b64d = b64d(captcha_screenshot)
-        if isinstance(image_b64d, string_types):
-            image = Image.open(StringIO(image_b64d))
-        else:
-            image = Image.open(BytesIO(image_b64d))
-
-        left = location['x']
-        top = location['y']
-        right = location['x'] + size['width']
-        bottom = location['y'] + size['height']
-        image = image.crop((left, top, right, bottom))
-        self.driver.switch_to.default_content()
-        return image
-
-
-    def solve_captcha(self, captcha_text):
-
-        # self.driver.save_screenshot(self.pdf_file+"before_solve.png")
-        self.driver.switch_to.frame(self.el_iframe)
-        self.el_input_text.send_keys(captcha_text)
-        # self.driver.save_screenshot(self.pdf_file+"send_keys.png")
-        self.el_form.submit()
-
-        self.driver.switch_to.default_content()
-        # with self.wait_for_page_load(timeout=10):
-        found, r = self.navigate_to(self.doi, self.pdf_file)
-        # self.driver.save_screenshot(self.pdf_file+"after_submit.png")
-        return self.check_captcha()
-
-    def get_iframe(self):
-        self.has_iframe, self.el_iframe = self.get_el(self.xpath_pdf)
-        if self.has_iframe:
-            self.pdf_url = norm_url(self.el_iframe.get_attribute("src"))
-        else:
-            self.driver.save_screenshot(self.pdf_file+".png")
-
-        return self.has_iframe
-
-    def get_el(self, xpath):
         try:
-            el = self.driver.find_element_by_xpath(
-                xpath
-            )
-            found = True
-        except NoSuchElementException:
-            el = None
-            found = False
-
-        return found, el
-
-    def check_captcha(self):
-        print("\tchecking if has captcha...")
-        has_iframe = self.get_iframe()
-        if has_iframe is False:
-            print("\tNo pdf found. Maybe, the sci-hub dosen't have the file")
-            print("\tTry to open the link in your browser.")
-            return False, has_iframe
-
-        self.driver.save_screenshot(self.pdf_file+"check_captcha.png")
-        self.driver.switch_to.frame(self.el_iframe)
-        self.has_captcha, self.el_captcha = self.get_el(self.xpath_captcha)
-        if self.has_captcha:
-            found, self.el_input_text = self.get_el(self.xpath_input)
-            found, self.el_form = self.get_el(self.xpath_form)
-
-        self.driver.switch_to.default_content()
-
-        return self.has_captcha, has_iframe
+            self.driver = webdriver.PhantomJS()
+        except WebDriverException:
+            print("\n\t Install PhantomJS for download files in sci-hub.\n")
+            print("\t OSX:")
+            print("\t\t npm install -g phantomjs")
+            print("\n\t Linux with npm:")
+            print("\t\t sudo apt-get install npm\n")
+            print("\t\t sudo npm install -g phantomjs\n")
+
+            sys.exit(1)
diff --git a/scihub2pdf/tools.py b/scihub2pdf/tools.py
diff --git a/setup.py b/setup.py
@@ -6,7 +6,7 @@
 
 setup(
     name="scihub2pdf",
-    version="0.4.0",
+    version="0.4.1",
     packages=find_packages(exclude=["build", ]),
     scripts=["scihub2pdf/bin/scihub2pdf"],
     long_description=README_TEXT,
@@ -24,7 +24,7 @@
     description="Downloads pdfs via a DOI number(or arxivId), article title or a bibtex file, sci-hub",
     author="Bruno Messias",
     author_email="[email protected]",
-    download_url="https://github.com/bibcure/scihub2pdf/archive/0.4.0.tar.gz",
+    download_url="https://github.com/bibcure/scihub2pdf/archive/0.4.1.tar.gz",
     keywords=["bibtex", "sci-hub", "libgen", "doi",  "science", "scientific-journals"],
 
     classifiers=[