From 85e256d223947022a89a74b030b559eaa69e36f3 Mon Sep 17 00:00:00 2001 From: bruno_messias Date: Thu, 24 Aug 2017 09:59:11 -0300 Subject: [PATCH 1/5] Update requeriments --- requeriments.txt | 1 + setup.py | 1 + 2 files changed, 2 insertions(+) diff --git a/requeriments.txt b/requeriments.txt index 9bd1a53..08b39ab 100755 --- a/requeriments.txt +++ b/requeriments.txt @@ -4,3 +4,4 @@ requests lxml title2bib Pillow +arxivcheck diff --git a/setup.py b/setup.py index 6fd27b6..9f172b5 100755 --- a/setup.py +++ b/setup.py @@ -13,6 +13,7 @@ long_description = README_TEXT, install_requires = ["bibtexparser", "title2bib", + "arxivcheck", "future", "Pillow", "requests", From 2b5885b1059043e2bcc8e5241d01de29e6c4853c Mon Sep 17 00:00:00 2001 From: bruno_messias Date: Thu, 24 Aug 2017 10:43:51 -0300 Subject: [PATCH 2/5] Allows the user to download files from arxiv --- scihub2pdf/bin/scihub2pdf | 8 ++++++-- scihub2pdf/scihub.py | 41 ++++++++++++++++++++++++++------------- 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/scihub2pdf/bin/scihub2pdf b/scihub2pdf/bin/scihub2pdf index 5aefa44..a796710 100755 --- a/scihub2pdf/bin/scihub2pdf +++ b/scihub2pdf/bin/scihub2pdf @@ -7,7 +7,7 @@ import textwrap # from sci2pdf.libgen import download_pdf_from_bibs, download_from_doi # from sci2pdf.libgen import download_from_title from scihub2pdf.scihub import download_pdf_from_bibs, download_from_doi -from scihub2pdf.scihub import download_from_title +from scihub2pdf.scihub import download_from_title, download_from_arxiv @@ -79,7 +79,11 @@ def main(): if inline_search: value = " ".join(args[1]) - if title_search: + is_arxiv = value.startswith("arxiv:") + if is_arxiv: + field = "ti" if title_search else "id" + download_from_arxiv(value, field, location) + elif title_search: download_from_title(value, location, use_libgen) else: download_from_doi(value, location, use_libgen) diff --git a/scihub2pdf/scihub.py b/scihub2pdf/scihub.py index 057b485..ac78e90 100755 --- 
a/scihub2pdf/scihub.py +++ b/scihub2pdf/scihub.py @@ -2,6 +2,7 @@ import requests from lxml import html from title2bib.crossref import get_bib_from_title +from arxivcheck.arxiv import get_arxiv_pdf_link import bibtexparser from builtins import input from PIL import Image @@ -15,7 +16,7 @@ "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36" } print("\n\t Checking state of scihub...") -url_state ="https://raw.githubusercontent.com/bibcure/scihub_state/master/state.txt" +url_state = "https://raw.githubusercontent.com/bibcure/scihub_state/master/state.txt" try: r = requests.get(url_state) state_scihub = [i.split(">>")[1] for i in r.iter_lines()] @@ -66,7 +67,7 @@ def download_from_libgen(bib, s): return download_link = norm_url(html_a[0].attrib["href"]) - bib["scihub"] = download_link + bib["pdf_link"] = download_link download_pdf(bib, s) return @@ -123,13 +124,13 @@ def download_from_scihub(bib, s): return download_link = norm_url(html_pdf[0].attrib["src"]) - bib["scihub"] = download_link + bib["pdf_link"] = download_link download_pdf(bib, s) return def download_pdf(bib, s): - r = s.get(bib["scihub"], headers=headers) + r = s.get(bib["pdf_link"], headers=headers) if r.status_code == 200: pdf_file = open(bib["pdf_file"], "wb") pdf_file.write(r.content) @@ -147,18 +148,16 @@ def download_pdf_from_bibs(bibs, location="", def put_pdf_location(bib): pdf_name = bib["ID"] if "ID" in bib else bib["doi"].replace("/", "_") pdf_name += ".pdf" - pdf_file = location+pdf_name - - bib["pdf_file"] = pdf_file + bib["pdf_file"] = location+pdf_name return bib bibs_with_doi = list(filter(lambda bib: "doi" in bib, bibs)) bibs = list(map(put_pdf_location, bibs_with_doi)) + # libgen has no captcha, try to use multiprocessing? with requests.Session() as s: if use_libgen: - #libgen has no captcha, try to use multiprocessing? 
list(map(lambda bib: download_from_libgen(bib, s), bibs)) else: for bib in bibs: @@ -167,9 +166,8 @@ def put_pdf_location(bib): def download_from_doi(doi, location="", use_libgen=False): bib = {"doi": doi} - pdf_name = "sci2pdf-{}.pdf".format(doi.replace("/", "_")) - pdf_file = location+pdf_name - bib["pdf_file"] = pdf_file + pdf_name = "{}.pdf".format(doi.replace("/", "_")) + bib["pdf_file"] = location+pdf_name with requests.Session() as s: if use_libgen: @@ -184,11 +182,10 @@ def download_from_title(title, location="", use_libgen=False): if found: bib = bibtexparser.loads(bib_string).entries[0] if "doi" in bib: - pdf_name = "sci2pdf-{}.pdf".format( + pdf_name = "{}.pdf".format( bib["doi"].replace("/", "_") ) - pdf_file = location+pdf_name - bib["pdf_file"] = pdf_file + bib["pdf_file"] = location+pdf_name with requests.Session() as s: if use_libgen: download_from_libgen(bib, s) @@ -196,3 +193,19 @@ def download_from_title(title, location="", use_libgen=False): download_from_scihub(bib, s) else: print("Absent DOI") + + +def download_from_arxiv(value, field="id", location=""): + value = value.replace("arxiv:", "") + found, pdf_link = get_arxiv_pdf_link(value, field) + if found and pdf_link is not None: + bib = {} + pdf_name = "{}.pdf".format(value.replace("/", "_")) + bib["pdf_file"] = location+pdf_name + + bib["pdf_link"] = pdf_link + s = requests.Session() + download_pdf(bib, s) + else: + print("Arxiv not found.") + From d2d3f56bd13a3af52845ee85f02e120f5fbfb03c Mon Sep 17 00:00:00 2001 From: bruno_messias Date: Thu, 24 Aug 2017 11:09:30 -0300 Subject: [PATCH 3/5] Allows the user to download files from arxiv(bibtex) --- scihub2pdf/scihub.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/scihub2pdf/scihub.py b/scihub2pdf/scihub.py index ac78e90..b76eb95 100755 --- a/scihub2pdf/scihub.py +++ b/scihub2pdf/scihub.py @@ -7,6 +7,7 @@ from builtins import input from PIL import Image from . 
import __version__ +import re try: from StringIO import StringIO except ImportError: @@ -151,16 +152,33 @@ def put_pdf_location(bib): bib["pdf_file"] = location+pdf_name return bib - bibs_with_doi = list(filter(lambda bib: "doi" in bib, bibs)) - - bibs = list(map(put_pdf_location, bibs_with_doi)) + # bibs_with_doi = list(filter(lambda bib: "doi" in bib, bibs)) + bibs_with_doi = [] + # bibs_arxiv = [] + for bib in bibs: + if "journal" in bib: + if bool(re.match("arxiv:", bib["journal"], re.I)): + download_from_arxiv(bib["journal"], "id", location) + elif "doi" in bib: + bibs_with_doi.append(bib) + + elif "doi" in bib: + bibs_with_doi.append(bib) + # bibs_journal = list(filter(lambda bib: "journal" in bib, bibs)) + # bibs_arxiv = list( + # filter( + # lambda bib: bool(re.match("arxiv:", bib["journal"], re.I)) in bib, bibs_journal + # ) + # ) + + bibs_with_doi = list(map(put_pdf_location, bibs_with_doi)) # libgen has no captcha, try to use multiprocessing? with requests.Session() as s: if use_libgen: - list(map(lambda bib: download_from_libgen(bib, s), bibs)) + list(map(lambda bib: download_from_libgen(bib, s), bibs_with_doi)) else: - for bib in bibs: + for bib in bibs_with_doi: download_from_scihub(bib, s) @@ -196,7 +214,8 @@ def download_from_title(title, location="", use_libgen=False): def download_from_arxiv(value, field="id", location=""): - value = value.replace("arxiv:", "") + + value = re.sub("arxiv\:", "", value) found, pdf_link = get_arxiv_pdf_link(value, field) if found and pdf_link is not None: bib = {} From 5917b1eaab364ea94e8aa49577ba2ab30aa71530 Mon Sep 17 00:00:00 2001 From: bruno_messias Date: Thu, 24 Aug 2017 11:15:48 -0300 Subject: [PATCH 4/5] Fix url bug iframe. Sometimes, scihub gives a url iframe like //domain... 
--- scihub2pdf/scihub.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scihub2pdf/scihub.py b/scihub2pdf/scihub.py index b76eb95..a03a6b7 100755 --- a/scihub2pdf/scihub.py +++ b/scihub2pdf/scihub.py @@ -110,7 +110,7 @@ def download_from_scihub(bib, s): print("\n\t", url, "\n") return - iframe_url = iframe_url[0] + iframe_url = norm_url(iframe_url[0]) ri = s.get(iframe_url, headers=headers) html_tree_ri = html.fromstring(ri.content) From b09aaa63b26fc308b31ba32e2f4012eb5128867c Mon Sep 17 00:00:00 2001 From: bruno_messias Date: Thu, 24 Aug 2017 11:35:55 -0300 Subject: [PATCH 5/5] Update version info --- README.md | 24 +++++++++++++++++++++++- scihub2pdf/__init__.py | 2 +- scihub2pdf/bin/scihub2pdf | 6 ++++++ setup.py | 7 +++---- 4 files changed, 33 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index a3aeb7f..c54c75f 100755 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ # SciHub to PDF(Beta) + ## Description scihub2pdf is a module of [bibcure](https://github.com/bibcure/bibcure) @@ -27,8 +28,15 @@ $ scihub2pdf 10.1038/s41524-017-0032-0 Given a title... ``` -$ sci2bib --title An useful paper +$ scihub2pdf --title An useful paper +``` + +Given an arXiv id (or, with --title, an arXiv paper title)... +``` +$ scihub2pdf arxiv:0901.2686 +$ scihub2pdf --title arxiv:Periodic table for topological insulators ``` + Location folder as argument ``` $ scihub2pdf -i input.bib -l somefoler/ @@ -52,3 +60,17 @@ $ scihub2pdf -i input.bib --uselibgen - Unstalbe - No CAPTCHA - Slow + +## Using bibcure modules + +Given a text file like +``` +10.1063/1.3149495 +10.7717/peerj.3714 +..... 
+``` +download all the PDFs +``` +$ doi2bib -i input_dois.txt > refs.bib +$ scihub2pdf -i refs.bib +``` diff --git a/scihub2pdf/__init__.py b/scihub2pdf/__init__.py index 36e61c6..15e7287 100755 --- a/scihub2pdf/__init__.py +++ b/scihub2pdf/__init__.py @@ -1,2 +1,2 @@ -__version__ = "0.1.0" +__version__ = "0.2.0" diff --git a/scihub2pdf/bin/scihub2pdf b/scihub2pdf/bin/scihub2pdf index a796710..39593a9 100755 --- a/scihub2pdf/bin/scihub2pdf +++ b/scihub2pdf/bin/scihub2pdf @@ -33,6 +33,12 @@ def main(): $ scihub2pdf --title An useful paper + Arxiv... + + $ scihub2pdf arxiv:0901.2686 + + $ scihub2pdf --title arxiv:Periodic table for topological insulators + ----------------------------------------------------- @author: Bruno Messias @email: messias.physics@gmail.com diff --git a/setup.py b/setup.py index 9f172b5..6121b5f 100755 --- a/setup.py +++ b/setup.py @@ -6,9 +6,8 @@ setup( name="scihub2pdf", - version="0.1.0", + version="0.2.0", packages = find_packages(exclude=["build",]), - # packages = find_packages(), scripts=["scihub2pdf/bin/scihub2pdf"], long_description = README_TEXT, install_requires = ["bibtexparser", @@ -20,10 +19,10 @@ "lxml"], include_package_data=True, license="GPLv3", - description="Downloads pdfs via a DOI number, article title or a bibtex file, sci-hub", + description="Downloads pdfs via a DOI number(or arxivId), article title or a bibtex file, sci-hub", author="Bruno Messias", author_email="messias.physics@gmail.com", - download_url="https://github.com/bibcure/scihub2pdf/archive/0.1.0.tar.gz", + download_url="https://github.com/bibcure/scihub2pdf/archive/0.2.0.tar.gz", keywords=["bibtex", "sci-hub", "libgen", "doi", "science","scientific-journals"], classifiers=[