diff --git a/scihub2pdf/__init__.py b/scihub2pdf/__init__.py index 7b60e12..9102f91 100755 --- a/scihub2pdf/__init__.py +++ b/scihub2pdf/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.4.1" +__version__ = "0.4.2" __license__ = "AGPLv3" __author__ = "Bruno Messias" __author_email__ = "messias.physics@gmail.com" diff --git a/scihub2pdf/bin/scihub2pdf b/scihub2pdf/bin/scihub2pdf index af1fe17..4d01736 100755 --- a/scihub2pdf/bin/scihub2pdf +++ b/scihub2pdf/bin/scihub2pdf @@ -175,7 +175,7 @@ def main(): parser = bibtexparser.bparser.BibTexParser() parser.alt_dict = dict_parser with io.open(args[0].inputfile, "r", encoding="utf-8") as inputfile: - refs_string = args[0].inputfile.read() + refs_string = inputfile.read() bibtex = bibtexparser.loads(refs_string, parser=parser) diff --git a/scihub2pdf/scihub.py b/scihub2pdf/scihub.py index bd08822..30568b0 100755 --- a/scihub2pdf/scihub.py +++ b/scihub2pdf/scihub.py @@ -45,9 +45,9 @@ def __init__(self, self.pdf_file = None self.s = None - def start(self): try: + self.s = requests.Session() self.driver = webdriver.PhantomJS() except WebDriverException: print("\n\t Install PhantomJS for download files in sci-hub.\n") @@ -58,3 +58,114 @@ def start(self): print("\t\t sudo npm install -g phantomjs\n") sys.exit(1) + + def get_session(self): + cookies = self.driver.get_cookies() + for cookie in cookies: + self.s.cookies.set(cookie['name'], cookie['value']) + + return self.s + + def download(self): + found, r = download_pdf( + self.s, + self.pdf_file, + self.pdf_url, + self.headers) + + if not found: + self.driver.save_screenshot(self.pdf_file+".png") + + return found, r + + def navigate_to(self, doi, pdf_file): + self.doi = doi + self.pdf_file = pdf_file + self.sci_url = self.domain_scihub+doi + print("\n\tDOI: ", doi) + print("\tSci-Hub Link: ", self.sci_url) + r = requests.get(self.sci_url) + found = r.status_code == 200 + if found: + self.driver.get(self.sci_url) + self.driver.set_window_size(1120, 550) + else: + print("\tSomething is wrong with sci-hub,") + print("\tstatus_code: ", r.status_code) + return found, r + + def get_captcha_img(self): + self.driver.execute_script("document.getElementById('content').style.zIndex = 9999;") + self.driver.switch_to.frame(self.el_iframe) + self.driver.execute_script("document.getElementById('captcha').style.zIndex = 9999;") + location = self.el_captcha.location + size = self.el_captcha.size + captcha_screenshot = self.driver.get_screenshot_as_base64() + image_b64d = b64d(captcha_screenshot) + if isinstance(image_b64d, string_types): + image = Image.open(StringIO(image_b64d)) + else: + image = Image.open(BytesIO(image_b64d)) + + left = location['x'] + top = location['y'] + right = location['x'] + size['width'] + bottom = location['y'] + size['height'] + image = image.crop((left, top, right, bottom)) + self.driver.switch_to.default_content() + return image + + + def solve_captcha(self, captcha_text): + + # self.driver.save_screenshot(self.pdf_file+"before_solve.png") + self.driver.switch_to.frame(self.el_iframe) + self.el_input_text.send_keys(captcha_text) + # self.driver.save_screenshot(self.pdf_file+"send_keys.png") + self.el_form.submit() + + self.driver.switch_to.default_content() + # with self.wait_for_page_load(timeout=10): + found, r = self.navigate_to(self.doi, self.pdf_file) + # self.driver.save_screenshot(self.pdf_file+"after_submit.png") + return self.check_captcha() + + def get_iframe(self): + self.has_iframe, self.el_iframe = self.get_el(self.xpath_pdf) + if self.has_iframe: + self.pdf_url = norm_url(self.el_iframe.get_attribute("src")) + else: + self.driver.save_screenshot(self.pdf_file+".png") + + return self.has_iframe + + def get_el(self, xpath): + try: + el = self.driver.find_element_by_xpath( + xpath + ) + found = True + except NoSuchElementException: + el = None + found = False + + return found, el + + def check_captcha(self): + print("\tchecking if has captcha...") + has_iframe = self.get_iframe() + if has_iframe is False: + print("\tNo pdf found. Maybe, the sci-hub dosen't have the file") + print("\tTry to open the link in your browser.") + return False, has_iframe + + self.driver.save_screenshot(self.pdf_file+"check_captcha.png") + self.driver.switch_to.frame(self.el_iframe) + self.has_captcha, self.el_captcha = self.get_el(self.xpath_captcha) + if self.has_captcha: + found, self.el_input_text = self.get_el(self.xpath_input) + found, self.el_form = self.get_el(self.xpath_form) + + self.driver.switch_to.default_content() + + return self.has_captcha, has_iframe diff --git a/setup.py b/setup.py index 87ff368..83328a8 100755 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name="scihub2pdf", - version="0.4.1", + version="0.4.2", packages=find_packages(exclude=["build", ]), scripts=["scihub2pdf/bin/scihub2pdf"], long_description=README_TEXT, @@ -24,7 +24,7 @@ description="Downloads pdfs via a DOI number(or arxivId), article title or a bibtex file, sci-hub", author="Bruno Messias", author_email="messias.physics@gmail.com", - download_url="https://github.com/bibcure/scihub2pdf/archive/0.4.1.tar.gz", + download_url="https://github.com/bibcure/scihub2pdf/archive/0.4.2.tar.gz", keywords=["bibtex", "sci-hub", "libgen", "doi", "science", "scientific-journals"], classifiers=[