Skip to content

Commit

Permalink
Merge branch 'release/0.4.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
devmessias committed Aug 27, 2017
2 parents d8abe0c + 34107c4 commit c66e7e8
Show file tree
Hide file tree
Showing 12 changed files with 1,218 additions and 237 deletions.
661 changes: 661 additions & 0 deletions LICENSE.txt

Large diffs are not rendered by default.

19 changes: 18 additions & 1 deletion README
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ scihub2pdf is a module of
`bibcure <https://github.com/bibcure/bibcure>`__

Downloads pdfs via a DOI number, article title or a bibtex file, using
the database of libgen or Sci-Hub.
the database of libgen, Sci-Hub and Arxiv.

Install
-------
Expand All @@ -17,6 +17,23 @@ Install

$ sudo pip install scihub2pdf

If you want to download files from Sci-Hub, you will need to install PhantomJS.

OSX
~~~

::

$ brew install phantomjs

Linux Using npm
~~~~~~~~~~~~~~~

::

$ sudo apt-get install npm
$ sudo npm install -g phantomjs

Features and how to use
-----------------------

Expand Down
16 changes: 15 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,27 @@
scihub2pdf is a module of [bibcure](https://github.com/bibcure/bibcure)

Downloads pdfs via a DOI number, article title or a bibtex file, using the
database of libgen or Sci-Hub.
database of libgen, Sci-Hub and Arxiv.

## Install

```
$ sudo pip install scihub2pdf
```
If you want to download files from Sci-Hub, you will need to install PhantomJS.

### OSX
```
$ brew install phantomjs
```
### Linux Using npm

```
$ sudo apt-get install npm
$ sudo npm install -g phantomjs
```



## Features and how to use

Expand Down
2 changes: 2 additions & 0 deletions requeriments.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@ lxml
title2bib
Pillow
arxivcheck
selenium
six
6 changes: 4 additions & 2 deletions scihub2pdf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@

__version__ = "0.3.1"
__version__ = "0.4.0"
__license__ = "AGPLv3"
__author__ = "Bruno Messias"
__author_email__ = "[email protected]"
35 changes: 35 additions & 0 deletions scihub2pdf/arxiv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from __future__ import unicode_literals, print_function, absolute_import
import requests
from arxivcheck.arxiv import get_arxiv_pdf_link
from scihub2pdf.tools import download_pdf


class Arxiv(object):
    """Resolve and download arXiv PDFs via arxivcheck.

    Mirrors the scraper interface used by download.py
    (start() -> navigate_to() -> download()).
    """

    def __init__(self, headers=None):
        # Fix: the original default was a shared mutable dict (headers={});
        # use None as the sentinel to avoid the mutable-default pitfall.
        self.headers = {} if headers is None else headers
        self.s = None          # requests.Session, created by start()
        self.pdf_file = None   # destination path, set by navigate_to()
        self.pdf_url = None    # resolved PDF URL, set by navigate_to()

    def start(self):
        """Open the HTTP session later used by download()."""
        self.s = requests.Session()

    def download(self):
        """Download self.pdf_url into self.pdf_file.

        Returns (found, response) from scihub2pdf.tools.download_pdf.
        Call navigate_to() first so pdf_url and pdf_file are set.
        """
        found, r = download_pdf(
            self.s,
            self.pdf_file,
            self.pdf_url,
            self.headers)

        return found, r

    def navigate_to(self, value, pdf_file, field="id"):
        """Resolve an arXiv id or title to a direct PDF link.

        value: arXiv identifier (field="id") or article title (field="ti").
        pdf_file: destination path for a subsequent download() call.
        Returns (found, pdf_url).
        """
        self.pdf_file = pdf_file
        found, self.pdf_url = get_arxiv_pdf_link(value, field)

        print("\n\t"+value)
        print("\tLINK: ", self.pdf_url)
        if not found:
            print("\tArxiv ", value, " not found\n")
            print(self.pdf_url)
        return found, self.pdf_url
48 changes: 37 additions & 11 deletions scihub2pdf/bin/scihub2pdf
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
#!/usr/bib/env python
from __future__ import print_function
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function, absolute_import
import sys
import bibtexparser
import argparse
import textwrap
# from sci2pdf.libgen import download_pdf_from_bibs, download_from_doi
# from sci2pdf.libgen import download_from_title
from scihub2pdf.scihub import download_pdf_from_bibs, download_from_doi
from scihub2pdf.scihub import download_from_title, download_from_arxiv
from unidecode import unidecode
from scihub2pdf.download import (download_pdf_from_bibs, download_from_doi,
download_from_title, download_from_arxiv,
start_scihub, start_libgen, start_arxiv)
import re
import io

pyversion = sys.version_info[0]

def main():
parser = argparse.ArgumentParser(
Expand Down Expand Up @@ -85,7 +88,6 @@ def main():
parser.add_argument(
"--input", "-i",
dest="inputfile",
type=argparse.FileType("r"),
help="bibtex input file"
)
parser.add_argument(
Expand All @@ -98,7 +100,7 @@ def main():
"--uselibgen",
dest="uselibgen",
action="store_true",
help="Use libgen.io instead sci-hub. Sci-hub has annoying captcha but is stable. Libgen has no captcha but is unstable."
help="Use libgen.io instead sci-hub."
)
parser.add_argument(
"--location", "-l",
Expand All @@ -123,23 +125,32 @@ def main():
location = args[0].location

if use_libgen:
start_libgen()
print("\n\t Using Libgen.\n")
else:
start_scihub()
print("\n\t Using Sci-Hub.\n")

start_arxiv()

if inline_search:
value = " ".join(args[1])

#values = [c if pyversion == 3 else c.decode(sys.stdout.encoding) for c in args[1]]
values = [c if pyversion == 3 else c.decode(sys.stdout.encoding) for c in args[1]]
value = " ".join(values)
is_arxiv = bool(re.match("arxiv:", value, re.I))
if is_arxiv:
field = "ti" if title_search else "id"
download_from_arxiv(value, field, location)
download_from_arxiv(value, location, field)
elif title_search:
download_from_title(value, location, use_libgen)
else:
download_from_doi(value, location, use_libgen)
else:
if is_txt:
file_values = args[0].inputfile.read()
with io.open(args[0].inputfile, "r", encoding = "utf-8") as inputfile:
file_values = inputfile.read()

for value in file_values.split("\n"):
is_arxiv = bool(re.match("arxiv:", value, re.I))
if value != "":
Expand All @@ -152,7 +163,22 @@ def main():
download_from_doi(value, location, use_libgen)

else:
bibtex = bibtexparser.loads(args[0].inputfile.read())
dict_parser = {
'keywords': 'keyword',
'keyw': 'keyword',
'subjects': 'subject',
'urls': 'url',
'link': 'url',
'links': 'url',
'editors': 'editor',
'authors': 'author'}
parser = bibtexparser.bparser.BibTexParser()
parser.alt_dict = dict_parser
with io.open(args[0].inputfile, "r", encoding="utf-8") as inputfile:
refs_string = args[0].inputfile.read()

bibtex = bibtexparser.loads(refs_string,
parser=parser)
bibs = bibtex.entries
if len(bibs) == 0:
print("Input File is empty or corrupted.")
Expand Down
180 changes: 180 additions & 0 deletions scihub2pdf/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
from __future__ import unicode_literals, print_function, absolute_import

from builtins import input
import bibtexparser
# from . import __version__
# from lxml.etree import ParserError
import re
from title2bib.crossref import get_bib_from_title
from scihub2pdf.scihub import SciHub
from scihub2pdf.libgen import LibGen
from scihub2pdf.arxiv import Arxiv
# Default HTTP headers sent with every scraper request; all optional
# entries are currently disabled.
headers = {
    # "Connection": "keep-alive",
    # "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
}
# Disabled mirror/state self-check: fetched current mirror URLs, XPaths
# and an update hint from the bibcure/scihub_state repository.
# print("\n\t Checking state of scihub...")
# url_state = "https://raw.githubusercontent.com/bibcure/scihub_state/master/state.txt"
# try:
#     r = requests.get(url_state, headers=headers)
#     state_scihub = [i.split(">>")[1] for i in r.iter_lines()]
#     url_captcha_scihub = state_scihub[0]
#     url_libgen = state_scihub[1]
#     url_scihub = state_scihub[2]
#     xpath_libgen_a = state_scihub[3]
#     xpath_scihub_captcha = state_scihub[4]
#     xpath_scihub_iframe = state_scihub[5]
#     xpath_scihub_pdf = state_scihub[6]
#     has_update = state_scihub[7] != __version__
#     if has_update:
#         print("\n\t\tWill be better if you upgrade scihub2pdf.")
#         print("\t\tFor that, just do:\n")
#         print("\t\t\t sudo pip install scihub2pdf --upgrade\n")
# except:
#     s = None

# LibGen mirror endpoint and the XPath of the download anchor on its page.
libgen_url = "http://libgen.io/scimag/ads.php"
libgen_xpath_pdf_url = "/html/body/table/tr/td[3]/a"
# Sci-Hub page XPaths: captcha image, PDF iframe, captcha form input/form.
xpath_captcha = "//*[@id='captcha']"
xpath_pdf = "//*[@id='pdf']"
xpath_input = "/html/body/div/table/tbody/tr/td/form/input"
xpath_form = "/html/body/div/table/tbody/tr/td/form"
domain_scihub = "http://sci-hub.cc/"  # Sci-Hub mirror in use

# Module-level scraper singletons shared by the download_* helpers below;
# each must be activated via the corresponding start_*() before use.
ScrapSci = SciHub(headers,
                  xpath_captcha,
                  xpath_pdf,
                  xpath_input,
                  xpath_form,
                  domain_scihub
                  )
ScrapArx = Arxiv(headers)
ScrapLib = LibGen(headers=headers,
                  libgen_url=libgen_url,
                  xpath_pdf_url=libgen_xpath_pdf_url)


def start_scihub():
    # Activate the module-level Sci-Hub scraper (opens its session/driver).
    ScrapSci.start()

def start_libgen():
    # Activate the module-level LibGen scraper (opens its session).
    ScrapLib.start()

def start_arxiv():
    # Activate the module-level arXiv scraper (opens its requests session).
    ScrapArx.start()


def download_from_libgen(doi, pdf_file):
    """Try to fetch *doi* from LibGen and save it to *pdf_file*.

    Returns (found, response); found is False as soon as any step of the
    navigate -> parse -> link-extraction -> download chain fails.
    """
    reached, response = ScrapLib.navigate_to(doi, pdf_file)
    if reached:
        parsed, _tree = ScrapLib.generate_tree()
        if parsed:
            has_link, _url = ScrapLib.get_pdf_url()
            if has_link:
                # All preconditions met: the download result is final.
                return ScrapLib.download()
    # Any earlier failure reports the navigation response.
    return False, response


def download_from_arxiv(value, location, field="id"):
    """Download a single arXiv paper.

    value: arXiv identifier (field="id") or title (field="ti").
    location: either a full path ending in ".pdf" (used as-is) or a
        directory prefix to which a name derived from *value* is appended.
    Returns (found, pdf_url).
    """
    if location.endswith(".pdf"):
        pdf_file = location
    else:
        # Fix: sanitize the id before using it as a file name. Old-style
        # ids such as "math/0211159" contain "/", which would otherwise
        # be treated as a directory separator (mirrors download_from_doi).
        pdf_file = location + value.replace("/", "_") + ".pdf"

    found, pdf_url = ScrapArx.navigate_to(value, pdf_file, field)
    if found:
        found, r = ScrapArx.download()

    return found, pdf_url


def download_from_scihub(doi, pdf_file):
    """Fetch *doi* from Sci-Hub, solving captchas interactively.

    Keeps prompting the user (via input()) for the captcha text for as
    long as Sci-Hub serves a captcha page. Returns (found, response);
    ``found`` reflects whether the PDF iframe was eventually reached.
    """
    found, r = ScrapSci.navigate_to(doi, pdf_file)
    if not found:
        return False, r

    has_captcha, has_iframe = ScrapSci.check_captcha()
    while (has_captcha and has_iframe):
        captcha_img = ScrapSci.get_captcha_img()
        captcha_img.show()
        captcha_text = input("\tPut captcha:\n\t")
        has_captcha, has_iframe = ScrapSci.solve_captcha(captcha_text)

    if has_iframe:
        # NOTE(review): download()'s own success flag is discarded here —
        # the caller only learns whether the PDF iframe was reached.
        # (Removed the original dead store `found = has_iframe`.)
        found, r = ScrapSci.download()

    return has_iframe, r


def download_pdf_from_bibs(bibs, location="",
                           use_libgen=False):
    """Download every entry of a parsed bibtex entry list.

    Entries whose "journal" field looks like an arXiv id are fetched from
    arXiv immediately; entries carrying a "doi" go through LibGen
    (use_libgen=True) or Sci-Hub. Entries with neither are skipped.
    """

    def put_pdf_location(bib):
        # File name: the bibtex key when present, otherwise the DOI with
        # "/" replaced so it is a valid path component.
        pdf_name = bib["ID"] if "ID" in bib else bib["doi"].replace("/", "_")
        pdf_name += ".pdf"
        bib["pdf_file"] = location + pdf_name
        return bib

    bibs_with_doi = []
    for bib in bibs:
        journal = bib.get("journal")
        if journal is not None and re.match("arxiv:", journal, re.I):
            pdf_file = location + journal + ".pdf"
            download_from_arxiv(journal, pdf_file, "id")
        elif "doi" in bib:
            # Covers both original branches: journal present but not
            # arXiv, and no journal at all.
            bibs_with_doi.append(bib)

    bibs_with_doi = [put_pdf_location(bib) for bib in bibs_with_doi]
    if use_libgen:
        # LibGen has no captcha, so entries can be fetched back to back.
        for bib in bibs_with_doi:
            download_from_libgen(bib["doi"], bib["pdf_file"])
    else:
        for bib in bibs_with_doi:
            found, bib = download_from_scihub(bib["doi"], bib["pdf_file"])


def download_from_doi(doi, location="", use_libgen=False):
    """Download the paper identified by *doi* into *location*.

    The file name is the DOI with "/" replaced by "_" so it is a valid
    path component; the backend is LibGen or Sci-Hub per *use_libgen*.
    """
    safe_name = "{}.pdf".format(doi.replace("/", "_"))
    fetch = download_from_libgen if use_libgen else download_from_scihub
    fetch(doi, location + safe_name)


def download_from_title(title, location="", use_libgen=False):
    """Resolve *title* to a bibtex entry and download its PDF.

    Uses title2bib (CrossRef) to obtain a bibtex string, then downloads
    via LibGen or Sci-Hub depending on *use_libgen*. Prints a message
    when the entry cannot be resolved or carries no DOI.
    """
    found, bib_string = get_bib_from_title(title)
    if found:
        entries = bibtexparser.loads(bib_string).entries
        if not entries:
            # Robustness fix: the original indexed entries[0] directly and
            # raised IndexError on an unparsable/empty bibtex string.
            print("\tCould not parse a bibtex entry for this title")
            return
        bib = entries[0]
        if "doi" in bib:
            pdf_name = "{}.pdf".format(
                bib["doi"].replace("/", "_")
            )
            bib["pdf_file"] = location + pdf_name
            if use_libgen:
                download_from_libgen(bib["doi"], bib["pdf_file"])
            else:
                found, bib = download_from_scihub(bib["doi"], bib["pdf_file"])
        else:
            print("\tAbsent DOI")
Loading

0 comments on commit c66e7e8

Please sign in to comment.