diff --git a/LICENSE.txt b/LICENSE.txt old mode 100644 new mode 100755 diff --git a/README b/README index 2706393..e6cd03c 100755 --- a/README +++ b/README @@ -15,7 +15,7 @@ Install :: - $ sudo pip install scihub2pdf + $ sudo python /usr/bin/pip install scihub2pdf If you want to download files from scihub you will need to get PhantomJS @@ -24,7 +24,7 @@ OSX :: - $ brew install phantomjs + $ npm install -g phantomjs Linux Using npm ~~~~~~~~~~~~~~~ @@ -41,7 +41,7 @@ Given a bibtex file :: - $ scihub2pdf -i input.bib + $ scihub2pdf -i input.bib Given a DOI number... @@ -109,8 +109,8 @@ Given a text file like :: - Some Title 1 - Some Title 2 + Some Title 1 + Some Title 2 ..... download all pdf's @@ -132,5 +132,5 @@ download all pdf's :: - $ scihub2pdf -i arxiv_ids.txt --txt + $ scihub2pdf -i arxiv_ids.txt --txt diff --git a/README.md b/README.md index 5018822..772bc5f 100755 --- a/README.md +++ b/README.md @@ -11,13 +11,13 @@ database of libgen, Sci-Hub and Arxiv. ## Install ``` -$ sudo pip install scihub2pdf +$ sudo python /usr/bin/pip install scihub2pdf ``` If you want to download files from scihub you will need to get PhantomJS ### OSX ``` -$ brew install phantomjs +$ npm install -g phantomjs ``` ### Linux Using npm @@ -32,7 +32,7 @@ $ sudo npm install -g phantomjs Given a bibtex file ``` -$ scihub2pdf -i input.bib +$ scihub2pdf -i input.bib ``` Given a DOI number... @@ -91,8 +91,8 @@ $ scihub2pdf -i dois.txt --txt Given a text file like ``` -Some Title 1 -Some Title 2 +Some Title 1 +Some Title 2 ..... ``` download all pdf's @@ -109,5 +109,5 @@ arXiv:1708.05948 ``` download all pdf's ``` -$ scihub2pdf -i arxiv_ids.txt --txt +$ scihub2pdf -i arxiv_ids.txt --txt ``` diff --git a/scihub2pdf/__init__.py b/scihub2pdf/__init__.py index 4c29e3c..7b60e12 100755 --- a/scihub2pdf/__init__.py +++ b/scihub2pdf/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.4.0" +__version__ = "0.4.1" __license__ = "AGPLv3" __author__ = "Bruno Messias" __author_email__ = "messias.physics@gmail.com" diff --git a/scihub2pdf/scihub.py b/scihub2pdf/scihub.py old mode 100644 new mode 100755 index 99fd21f..bd08822 --- a/scihub2pdf/scihub.py +++ b/scihub2pdf/scihub.py @@ -2,11 +2,13 @@ import requests from selenium import webdriver -from selenium.common.exceptions import NoSuchElementException +from selenium.common.exceptions import NoSuchElementException, WebDriverException + from PIL import Image from scihub2pdf.tools import norm_url, download_pdf from base64 import b64decode as b64d from six import string_types +import sys try: from StringIO import StringIO from io import BytesIO @@ -45,116 +47,14 @@ def __init__(self, def start(self): - self.driver = webdriver.PhantomJS() - self.s = requests.Session() - - def get_session(self): - cookies = self.driver.get_cookies() - for cookie in cookies: - self.s.cookies.set(cookie['name'], cookie['value']) - - return self.s - - def download(self): - found, r = download_pdf( - self.s, - self.pdf_file, - self.pdf_url, - self.headers) - - if not found: - self.driver.save_screenshot(self.pdf_file+".png") - - return found, r - - def navigate_to(self, doi, pdf_file): - self.doi = doi - self.pdf_file = pdf_file - self.sci_url = self.domain_scihub+doi - print("\n\tDOI: ", doi) - print("\tSci-Hub Link: ", self.sci_url) - r = requests.get(self.sci_url) - found = r.status_code == 200 - if found: - self.driver.get(self.sci_url) - self.driver.set_window_size(1120, 550) - else: - print("\tSomething is wrong with sci-hub,") - print("\tstatus_code: ", r.status_code) - return found, r - - def get_captcha_img(self): - self.driver.execute_script("document.getElementById('content').style.zIndex = 9999;") - self.driver.switch_to.frame(self.el_iframe) - self.driver.execute_script("document.getElementById('captcha').style.zIndex = 9999;") - location = self.el_captcha.location - size = self.el_captcha.size - captcha_screenshot = self.driver.get_screenshot_as_base64() - image_b64d = b64d(captcha_screenshot) - if isinstance(image_b64d, string_types): - image = Image.open(StringIO(image_b64d)) - else: - image = Image.open(BytesIO(image_b64d)) - - left = location['x'] - top = location['y'] - right = location['x'] + size['width'] - bottom = location['y'] + size['height'] - image = image.crop((left, top, right, bottom)) - self.driver.switch_to.default_content() - return image - - - def solve_captcha(self, captcha_text): - - # self.driver.save_screenshot(self.pdf_file+"before_solve.png") - self.driver.switch_to.frame(self.el_iframe) - self.el_input_text.send_keys(captcha_text) - # self.driver.save_screenshot(self.pdf_file+"send_keys.png") - self.el_form.submit() - - self.driver.switch_to.default_content() - # with self.wait_for_page_load(timeout=10): - found, r = self.navigate_to(self.doi, self.pdf_file) - # self.driver.save_screenshot(self.pdf_file+"after_submit.png") - return self.check_captcha() - - def get_iframe(self): - self.has_iframe, self.el_iframe = self.get_el(self.xpath_pdf) - if self.has_iframe: - self.pdf_url = norm_url(self.el_iframe.get_attribute("src")) - else: - self.driver.save_screenshot(self.pdf_file+".png") - - return self.has_iframe - - def get_el(self, xpath): try: - el = self.driver.find_element_by_xpath( - xpath - ) - found = True - except NoSuchElementException: - el = None - found = False - - return found, el - - def check_captcha(self): - print("\tchecking if has captcha...") - has_iframe = self.get_iframe() - if has_iframe is False: - print("\tNo pdf found. Maybe, the sci-hub dosen't have the file") - print("\tTry to open the link in your browser.") - return False, has_iframe - - self.driver.save_screenshot(self.pdf_file+"check_captcha.png") - self.driver.switch_to.frame(self.el_iframe) - self.has_captcha, self.el_captcha = self.get_el(self.xpath_captcha) - if self.has_captcha: - found, self.el_input_text = self.get_el(self.xpath_input) - found, self.el_form = self.get_el(self.xpath_form) - - self.driver.switch_to.default_content() - - return self.has_captcha, has_iframe + self.driver = webdriver.PhantomJS() + except WebDriverException: + print("\n\t Install PhantomJS for download files in sci-hub.\n") + print("\t OSX:") + print("\t\t npm install -g phantomjs") + print("\n\t Linux with npm:") + print("\t\t sudo apt-get install npm\n") + print("\t\t sudo npm install -g phantomjs\n") + + sys.exit(1) diff --git a/scihub2pdf/tools.py b/scihub2pdf/tools.py old mode 100644 new mode 100755 diff --git a/setup.py b/setup.py index d927ca6..87ff368 100755 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name="scihub2pdf", - version="0.4.0", + version="0.4.1", packages=find_packages(exclude=["build", ]), scripts=["scihub2pdf/bin/scihub2pdf"], long_description=README_TEXT, @@ -24,7 +24,7 @@ description="Downloads pdfs via a DOI number(or arxivId), article title or a bibtex file, sci-hub", author="Bruno Messias", author_email="messias.physics@gmail.com", - download_url="https://github.com/bibcure/scihub2pdf/archive/0.4.0.tar.gz", + download_url="https://github.com/bibcure/scihub2pdf/archive/0.4.1.tar.gz", keywords=["bibtex", "sci-hub", "libgen", "doi", "science", "scientific-journals"], classifiers=[