Skip to content

Commit

Permalink
Merge branch 'release/0.4.1'
Browse files Browse the repository at this point in the history
  • Loading branch information
devmessias committed Aug 27, 2017
2 parents c66e7e8 + 577a3b5 commit 606db91
Show file tree
Hide file tree
Showing 7 changed files with 28 additions and 128 deletions.
Empty file modified LICENSE.txt
100644 → 100755
Empty file.
12 changes: 6 additions & 6 deletions README
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Install

::

$ sudo pip install scihub2pdf
$ sudo python /usr/bin/pip install scihub2pdf

If you want to download files from scihub you will need to get PhantomJS

Expand All @@ -24,7 +24,7 @@ OSX

::

$ brew install phantomjs
$ npm install -g phantomjs

Linux Using npm
~~~~~~~~~~~~~~~
Expand All @@ -41,7 +41,7 @@ Given a bibtex file

::

$ scihub2pdf -i input.bib
$ scihub2pdf -i input.bib

Given a DOI number...

Expand Down Expand Up @@ -109,8 +109,8 @@ Given a text file like

::

Some Title 1
Some Title 2
Some Title 1
Some Title 2
.....

download all pdf's
Expand All @@ -132,5 +132,5 @@ download all pdf's

::

$ scihub2pdf -i arxiv_ids.txt --txt
$ scihub2pdf -i arxiv_ids.txt --txt

12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,13 @@ database of libgen, Sci-Hub and Arxiv.
## Install

```
$ sudo pip install scihub2pdf
$ sudo python /usr/bin/pip install scihub2pdf
```
If you want to download files from scihub you will need to get PhantomJS

### OSX
```
$ brew install phantomjs
$ npm install -g phantomjs
```
### Linux Using npm

Expand All @@ -32,7 +32,7 @@ $ sudo npm install -g phantomjs

Given a bibtex file
```
$ scihub2pdf -i input.bib
$ scihub2pdf -i input.bib
```

Given a DOI number...
Expand Down Expand Up @@ -91,8 +91,8 @@ $ scihub2pdf -i dois.txt --txt
Given a text file like

```
Some Title 1
Some Title 2
Some Title 1
Some Title 2
.....
```
download all pdf's
Expand All @@ -109,5 +109,5 @@ arXiv:1708.05948
```
download all pdf's
```
$ scihub2pdf -i arxiv_ids.txt --txt
$ scihub2pdf -i arxiv_ids.txt --txt
```
2 changes: 1 addition & 1 deletion scihub2pdf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.4.0"
__version__ = "0.4.1"
__license__ = "AGPLv3"
__author__ = "Bruno Messias"
__author_email__ = "[email protected]"
126 changes: 13 additions & 113 deletions scihub2pdf/scihub.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@

import requests
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoSuchElementException, WebDriverException

from PIL import Image
from scihub2pdf.tools import norm_url, download_pdf
from base64 import b64decode as b64d
from six import string_types
import sys
try:
from StringIO import StringIO
from io import BytesIO
Expand Down Expand Up @@ -45,116 +47,14 @@ def __init__(self,


def start(self):
    """Create the headless browser and the HTTP session used by this object.

    Stores a PhantomJS webdriver on ``self.driver`` and a fresh
    ``requests.Session`` on ``self.s``.
    """
    # The browser is created first so a failure to launch PhantomJS leaves
    # ``self.s`` untouched, exactly as before.
    browser = webdriver.PhantomJS()
    self.driver = browser
    http_session = requests.Session()
    self.s = http_session

def get_session(self):
    """Copy the browser's cookies into the requests session and return it.

    Each cookie reported by ``self.driver.get_cookies()`` is set on
    ``self.s.cookies`` using its ``'name'`` and ``'value'`` entries.

    Returns:
        The session object stored on ``self.s``.
    """
    name_value_pairs = (
        (entry['name'], entry['value'])
        for entry in self.driver.get_cookies()
    )
    for cookie_name, cookie_value in name_value_pairs:
        self.s.cookies.set(cookie_name, cookie_value)

    return self.s

def download(self):
    """Fetch the PDF through ``download_pdf`` and report the outcome.

    ``download_pdf`` is called with the session, the target file name, the
    PDF URL and the headers stored on this object. When it reports that
    nothing was found, a screenshot of the current browser page is saved
    next to the target file as ``<pdf_file>.png``.

    Returns:
        The ``(found, r)`` pair produced by ``download_pdf``.
    """
    outcome = download_pdf(
        self.s,
        self.pdf_file,
        self.pdf_url,
        self.headers)
    found, r = outcome

    if found:
        return found, r

    # Keep a picture of what the browser was showing when the download failed.
    self.driver.save_screenshot(self.pdf_file+".png")
    return found, r

def navigate_to(self, doi, pdf_file):
    """Point the browser at the Sci-Hub page for ``doi``.

    Records ``doi`` and ``pdf_file`` on the object, builds ``self.sci_url``
    as ``self.domain_scihub + doi`` and probes that URL with a plain
    ``requests.get``. The browser is only sent to the page when the probe
    answers with status code 200; otherwise the status code is printed.

    Args:
        doi: DOI string appended to the Sci-Hub domain.
        pdf_file: Target file name remembered for later steps.

    Returns:
        ``(found, r)`` where ``found`` is True when the probe returned 200
        and ``r`` is the probe's response object.
    """
    self.doi = doi
    self.pdf_file = pdf_file
    self.sci_url = self.domain_scihub+doi
    print("\n\tDOI: ", doi)
    print("\tSci-Hub Link: ", self.sci_url)

    response = requests.get(self.sci_url)
    reachable = response.status_code == 200
    if not reachable:
        print("\tSomething is wrong with sci-hub,")
        print("\tstatus_code: ", response.status_code)
        return reachable, response

    self.driver.get(self.sci_url)
    self.driver.set_window_size(1120, 550)
    return reachable, response

def get_captcha_img(self):
    """Return the captcha picture cropped out of a browser screenshot.

    The ``content`` element and, inside the iframe, the ``captcha`` element
    get their z-index set to 9999 before a full screenshot is taken. The
    screenshot is then cropped to the box given by ``self.el_captcha``'s
    ``location`` and ``size``.

    The driver is always switched back to the top-level document, even when
    a step inside the iframe raises, so later lookups do not run against
    the iframe by accident.

    Returns:
        The cropped image object produced by ``Image.open(...).crop(...)``.
    """
    self.driver.execute_script("document.getElementById('content').style.zIndex = 9999;")
    self.driver.switch_to.frame(self.el_iframe)
    try:
        self.driver.execute_script("document.getElementById('captcha').style.zIndex = 9999;")
        location = self.el_captcha.location
        size = self.el_captcha.size
        captcha_screenshot = self.driver.get_screenshot_as_base64()
        image_b64d = b64d(captcha_screenshot)
        # The decoded payload is a text string on some interpreters and a
        # bytes object on others; pick the matching in-memory file wrapper.
        if isinstance(image_b64d, string_types):
            image = Image.open(StringIO(image_b64d))
        else:
            image = Image.open(BytesIO(image_b64d))

        # NOTE(review): assumes screenshot pixels line up with the element
        # coordinates reported by the driver - confirm for scaled displays.
        left = location['x']
        top = location['y']
        right = location['x'] + size['width']
        bottom = location['y'] + size['height']
        image = image.crop((left, top, right, bottom))
    finally:
        # Leave the iframe no matter what happened above.
        self.driver.switch_to.default_content()
    return image


def solve_captcha(self, captcha_text):
    """Submit ``captcha_text`` to the captcha form and re-check the page.

    The text is typed into ``self.el_input_text`` and ``self.el_form`` is
    submitted from inside the iframe. The driver is switched back to the
    top-level document afterwards, including when typing or submitting
    raises. The Sci-Hub page for the current DOI is then loaded again.

    Args:
        captcha_text: The characters to type into the captcha input.

    Returns:
        Whatever ``self.check_captcha()`` returns after the reload.
    """
    self.driver.switch_to.frame(self.el_iframe)
    try:
        self.el_input_text.send_keys(captcha_text)
        self.el_form.submit()
    finally:
        # Never leave the driver focused inside the iframe.
        self.driver.switch_to.default_content()

    # Reload the article page; its return value is not needed here because
    # check_captcha() inspects the freshly loaded page itself.
    self.navigate_to(self.doi, self.pdf_file)
    return self.check_captcha()

def get_iframe(self):
    """Look up the PDF iframe and remember its source URL.

    The element matching ``self.xpath_pdf`` is searched via ``get_el``; the
    result is stored in ``self.has_iframe`` / ``self.el_iframe``. When the
    iframe exists its ``src`` attribute, passed through ``norm_url``, is
    stored in ``self.pdf_url``. Otherwise a screenshot is saved as
    ``<pdf_file>.png``.

    Returns:
        ``self.has_iframe``.
    """
    located, frame = self.get_el(self.xpath_pdf)
    self.has_iframe, self.el_iframe = located, frame

    if not self.has_iframe:
        self.driver.save_screenshot(self.pdf_file+".png")
        return self.has_iframe

    self.pdf_url = norm_url(self.el_iframe.get_attribute("src"))
    return self.has_iframe

def get_el(self, xpath):
    """Find a single element by XPath without raising when it is absent.

    Args:
        xpath: XPath expression handed to the driver.

    Returns:
        ``(True, element)`` when the driver finds a match, or
        ``(False, None)`` when it raises ``NoSuchElementException``.
    """
    try:
        element = self.driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False, None

    return True, element

def check_captcha(self):
    """Check whether the loaded page shows a PDF iframe and a captcha.

    When ``get_iframe()`` returns False, a hint is printed and nothing else
    is inspected. Otherwise a screenshot is saved as
    ``<pdf_file>check_captcha.png`` and the captcha element is searched for
    inside the iframe; when present, the captcha's input and form elements
    are looked up as well and stored on the object.

    The driver is always switched back to the top-level document, even if
    one of the lookups inside the iframe raises.

    Returns:
        ``(has_captcha, has_iframe)``; ``(False, has_iframe)`` when no
        iframe was found.
    """
    print("\tchecking if has captcha...")
    has_iframe = self.get_iframe()
    if has_iframe is False:
        print("\tNo pdf found. Maybe, the sci-hub dosen't have the file")
        print("\tTry to open the link in your browser.")
        return False, has_iframe

    self.driver.save_screenshot(self.pdf_file+"check_captcha.png")
    self.driver.switch_to.frame(self.el_iframe)
    try:
        self.has_captcha, self.el_captcha = self.get_el(self.xpath_captcha)
        if self.has_captcha:
            # Only the elements are kept; the found flags are not used.
            _, self.el_input_text = self.get_el(self.xpath_input)
            _, self.el_form = self.get_el(self.xpath_form)
    finally:
        # Leave the iframe so later lookups run against the main document.
        self.driver.switch_to.default_content()

    return self.has_captcha, has_iframe
self.driver = webdriver.PhantomJS()
except WebDriverException:
print("\n\t Install PhantomJS for download files in sci-hub.\n")
print("\t OSX:")
print("\t\t npm install -g phantomjs")
print("\n\t Linux with npm:")
print("\t\t sudo apt-get install npm\n")
print("\t\t sudo npm install -g phantomjs\n")

sys.exit(1)
Empty file modified scihub2pdf/tools.py
100644 → 100755
Empty file.
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

setup(
name="scihub2pdf",
version="0.4.0",
version="0.4.1",
packages=find_packages(exclude=["build", ]),
scripts=["scihub2pdf/bin/scihub2pdf"],
long_description=README_TEXT,
Expand All @@ -24,7 +24,7 @@
description="Downloads pdfs via a DOI number(or arxivId), article title or a bibtex file, sci-hub",
author="Bruno Messias",
author_email="[email protected]",
download_url="https://github.com/bibcure/scihub2pdf/archive/0.4.0.tar.gz",
download_url="https://github.com/bibcure/scihub2pdf/archive/0.4.1.tar.gz",
keywords=["bibtex", "sci-hub", "libgen", "doi", "science", "scientific-journals"],

classifiers=[
Expand Down

0 comments on commit 606db91

Please sign in to comment.