Skip to content

Commit

Permalink
Merge branch 'release/0.2.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
devmessias committed Aug 24, 2017
2 parents 2811f15 + b09aaa6 commit 4d6f74a
Show file tree
Hide file tree
Showing 6 changed files with 94 additions and 29 deletions.
24 changes: 23 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# SciHub to PDF(Beta)


## Description

scihub2pdf is a module of [bibcure](https://github.com/bibcure/bibcure)
Expand Down Expand Up @@ -27,8 +28,15 @@ $ scihub2pdf 10.1038/s41524-017-0032-0

Given a title...
```
$ sci2bib --title An useful paper
$ scihub2pdf --title An useful paper
```

Arxiv...
```
$ scihub2pdf arxiv:0901.2686
$ scihub2pdf --title arxiv:Periodic table for topological insulators
```

Location folder as argument
```
$ scihub2pdf -i input.bib -l somefolder/
Expand All @@ -52,3 +60,17 @@ $ scihub2pdf -i input.bib --uselibgen
- Unstable
- No CAPTCHA
- Slow

## Using bibcure modules

Given a text file like
```
10.1063/1.3149495
10.7717/peerj.3714
.....
```
download all the PDFs
```
$ doi2bib -i input_dois.txt > refs.bib
$ scihub2pdf -i refs.bib
```
1 change: 1 addition & 0 deletions requeriments.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ requests
lxml
title2bib
Pillow
arxivcheck
2 changes: 1 addition & 1 deletion scihub2pdf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@

__version__ = "0.1.0"
__version__ = "0.2.0"
14 changes: 12 additions & 2 deletions scihub2pdf/bin/scihub2pdf
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import textwrap
# from sci2pdf.libgen import download_pdf_from_bibs, download_from_doi
# from sci2pdf.libgen import download_from_title
from scihub2pdf.scihub import download_pdf_from_bibs, download_from_doi
from scihub2pdf.scihub import download_from_title
from scihub2pdf.scihub import download_from_title, download_from_arxiv



Expand All @@ -33,6 +33,12 @@ def main():
$ scihub2pdf --title An useful paper
Arxiv...
$ scihub2pdf arxiv:0901.2686
$ scihub2pdf --title arxiv:Periodic table for topological insulators
-----------------------------------------------------
@author: Bruno Messias
@email: [email protected]
Expand Down Expand Up @@ -79,7 +85,11 @@ def main():

if inline_search:
value = " ".join(args[1])
if title_search:
is_arxiv = value.startswith("arxiv:")
if is_arxiv:
field = "ti" if title_search else "id"
download_from_arxiv(value, field, location)
elif title_search:
download_from_title(value, location, use_libgen)
else:
download_from_doi(value, location, use_libgen)
Expand Down
74 changes: 53 additions & 21 deletions scihub2pdf/scihub.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@
import requests
from lxml import html
from title2bib.crossref import get_bib_from_title
from arxivcheck.arxiv import get_arxiv_pdf_link
import bibtexparser
from builtins import input
from PIL import Image
from . import __version__
import re
try:
from StringIO import StringIO
except ImportError:
Expand All @@ -15,7 +17,7 @@
"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
}
print("\n\t Checking state of scihub...")
url_state ="https://raw.githubusercontent.com/bibcure/scihub_state/master/state.txt"
url_state = "https://raw.githubusercontent.com/bibcure/scihub_state/master/state.txt"
try:
r = requests.get(url_state)
state_scihub = [i.split(">>")[1] for i in r.iter_lines()]
Expand Down Expand Up @@ -66,7 +68,7 @@ def download_from_libgen(bib, s):
return

download_link = norm_url(html_a[0].attrib["href"])
bib["scihub"] = download_link
bib["pdf_link"] = download_link
download_pdf(bib, s)

return
Expand Down Expand Up @@ -108,7 +110,7 @@ def download_from_scihub(bib, s):
print("\n\t", url, "\n")
return

iframe_url = iframe_url[0]
iframe_url = norm_url(iframe_url[0])
ri = s.get(iframe_url, headers=headers)
html_tree_ri = html.fromstring(ri.content)

Expand All @@ -123,13 +125,13 @@ def download_from_scihub(bib, s):
return

download_link = norm_url(html_pdf[0].attrib["src"])
bib["scihub"] = download_link
bib["pdf_link"] = download_link
download_pdf(bib, s)
return


def download_pdf(bib, s):
r = s.get(bib["scihub"], headers=headers)
r = s.get(bib["pdf_link"], headers=headers)
if r.status_code == 200:
pdf_file = open(bib["pdf_file"], "wb")
pdf_file.write(r.content)
Expand All @@ -147,29 +149,43 @@ def download_pdf_from_bibs(bibs, location="",
def put_pdf_location(bib):
pdf_name = bib["ID"] if "ID" in bib else bib["doi"].replace("/", "_")
pdf_name += ".pdf"
pdf_file = location+pdf_name

bib["pdf_file"] = pdf_file
bib["pdf_file"] = location+pdf_name
return bib

bibs_with_doi = list(filter(lambda bib: "doi" in bib, bibs))

bibs = list(map(put_pdf_location, bibs_with_doi))

# bibs_with_doi = list(filter(lambda bib: "doi" in bib, bibs))
bibs_with_doi = []
# bibs_arxiv = []
for bib in bibs:
if "journal" in bib:
if bool(re.match("arxiv:", bib["journal"], re.I)):
download_from_arxiv(bib["journal"], "id", location)
elif "doi" in bib:
bibs_with_doi.append(bib)

elif "doi" in bib:
bibs_with_doi.append(bib)
# bibs_journal = list(filter(lambda bib: "journal" in bib, bibs))
# bibs_arxiv = list(
# filter(
# lambda bib: bool(re.match("arxiv:", bib["journal"], re.I)) in bib, bibs_journal
# )
# )

bibs_with_doi = list(map(put_pdf_location, bibs_with_doi))

# libgen has no captcha, try to use multiprocessing?
with requests.Session() as s:
if use_libgen:
#libgen has no captcha, try to use multiprocessing?
list(map(lambda bib: download_from_libgen(bib, s), bibs))
list(map(lambda bib: download_from_libgen(bib, s), bibs_with_doi))
else:
for bib in bibs:
for bib in bibs_with_doi:
download_from_scihub(bib, s)


def download_from_doi(doi, location="", use_libgen=False):
bib = {"doi": doi}
pdf_name = "sci2pdf-{}.pdf".format(doi.replace("/", "_"))
pdf_file = location+pdf_name
bib["pdf_file"] = pdf_file
pdf_name = "{}.pdf".format(doi.replace("/", "_"))
bib["pdf_file"] = location+pdf_name

with requests.Session() as s:
if use_libgen:
Expand All @@ -184,15 +200,31 @@ def download_from_title(title, location="", use_libgen=False):
if found:
bib = bibtexparser.loads(bib_string).entries[0]
if "doi" in bib:
pdf_name = "sci2pdf-{}.pdf".format(
pdf_name = "{}.pdf".format(
bib["doi"].replace("/", "_")
)
pdf_file = location+pdf_name
bib["pdf_file"] = pdf_file
bib["pdf_file"] = location+pdf_name
with requests.Session() as s:
if use_libgen:
download_from_libgen(bib, s)
else:
download_from_scihub(bib, s)
else:
print("Absent DOI")


def download_from_arxiv(value, field="id", location=""):
    """Look up a paper on arXiv and save its PDF.

    Parameters
    ----------
    value : str
        arXiv identifier or title, optionally carrying an ``arxiv:``
        prefix in any letter case (e.g. ``arXiv:0901.2686``).
    field : str
        Search field forwarded unchanged to ``get_arxiv_pdf_link``;
        callers in this project pass ``"id"`` or ``"ti"``.
    location : str
        String prepended verbatim to the output file name, so a folder
        must include its trailing separator.

    Returns
    -------
    None
        Prints ``"Arxiv not found."`` when no PDF link is obtained.
    """
    # Strip the prefix case-insensitively: the bibtex code path detects
    # "arxiv:" with re.I, so values such as "arXiv:0901.2686" reach this
    # function and must not leak the prefix into the query or file name.
    value = re.sub(r"arxiv:", "", value, flags=re.I)
    found, pdf_link = get_arxiv_pdf_link(value, field)
    if not found or pdf_link is None:
        print("Arxiv not found.")
        return

    bib = {
        # "/" appears in old-style arXiv ids and is not valid in a file name.
        "pdf_file": location + "{}.pdf".format(value.replace("/", "_")),
        "pdf_link": pdf_link,
    }
    # Use the session as a context manager so its connections are released.
    with requests.Session() as s:
        download_pdf(bib, s)

8 changes: 4 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,23 @@

setup(
name="scihub2pdf",
version="0.1.0",
version="0.2.0",
packages = find_packages(exclude=["build",]),
# packages = find_packages(),
scripts=["scihub2pdf/bin/scihub2pdf"],
long_description = README_TEXT,
install_requires = ["bibtexparser",
"title2bib",
"arxivcheck",
"future",
"Pillow",
"requests",
"lxml"],
include_package_data=True,
license="GPLv3",
description="Downloads pdfs via a DOI number, article title or a bibtex file, sci-hub",
description="Downloads pdfs via a DOI number(or arxivId), article title or a bibtex file, sci-hub",
author="Bruno Messias",
author_email="[email protected]",
download_url="https://github.com/bibcure/scihub2pdf/archive/0.1.0.tar.gz",
download_url="https://github.com/bibcure/scihub2pdf/archive/0.2.0.tar.gz",
keywords=["bibtex", "sci-hub", "libgen", "doi", "science","scientific-journals"],

classifiers=[
Expand Down

0 comments on commit 4d6f74a

Please sign in to comment.