diff --git a/README.md b/README.md index a3aeb7f..c54c75f 100755 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ # SciHub to PDF(Beta) + ## Description scihub2pdf is a module of [bibcure](https://github.com/bibcure/bibcure) @@ -27,8 +28,15 @@ $ scihub2pdf 10.1038/s41524-017-0032-0 Given a title... ``` -$ sci2bib --title An useful paper +$ scihub2pdf --title An useful paper +``` + +Arxiv... +``` +$ scihub2pdf arxiv:0901.2686 +$ scihub2pdf --title arxiv:Periodic table for topological insulators ``` + Location folder as argument ``` $ scihub2pdf -i input.bib -l somefoler/ @@ -52,3 +60,17 @@ $ scihub2pdf -i input.bib --uselibgen - Unstalbe - No CAPTCHA - Slow + +## Using bibcure modules + +Given a text file like +``` +10.1063/1.3149495 +10.7717/peerj.3714 +..... +``` +download all the PDFs +``` +$ doi2bib -i input_dois.txt > refs.bib +$ scihub2pdf -i refs.bib +``` diff --git a/requeriments.txt b/requeriments.txt index 9bd1a53..08b39ab 100755 --- a/requeriments.txt +++ b/requeriments.txt @@ -4,3 +4,4 @@ requests lxml title2bib Pillow +arxivcheck diff --git a/scihub2pdf/__init__.py b/scihub2pdf/__init__.py index 36e61c6..15e7287 100755 --- a/scihub2pdf/__init__.py +++ b/scihub2pdf/__init__.py @@ -1,2 +1,2 @@ -__version__ = "0.1.0" +__version__ = "0.2.0" diff --git a/scihub2pdf/bin/scihub2pdf b/scihub2pdf/bin/scihub2pdf index 5aefa44..39593a9 100755 --- a/scihub2pdf/bin/scihub2pdf +++ b/scihub2pdf/bin/scihub2pdf @@ -7,7 +7,7 @@ import textwrap # from sci2pdf.libgen import download_pdf_from_bibs, download_from_doi # from sci2pdf.libgen import download_from_title from scihub2pdf.scihub import download_pdf_from_bibs, download_from_doi -from scihub2pdf.scihub import download_from_title +from scihub2pdf.scihub import download_from_title, download_from_arxiv @@ -33,6 +33,12 @@ def main(): $ scihub2pdf --title An useful paper + Arxiv... 
+ + $ scihub2pdf arxiv:0901.2686 + + $ scihub2pdf --title arxiv:Periodic table for topological insulators + ----------------------------------------------------- @author: Bruno Messias @email: messias.physics@gmail.com @@ -79,7 +85,11 @@ def main(): if inline_search: value = " ".join(args[1]) - if title_search: + is_arxiv = value.startswith("arxiv:") + if is_arxiv: + field = "ti" if title_search else "id" + download_from_arxiv(value, field, location) + elif title_search: download_from_title(value, location, use_libgen) else: download_from_doi(value, location, use_libgen) diff --git a/scihub2pdf/scihub.py b/scihub2pdf/scihub.py index 057b485..a03a6b7 100755 --- a/scihub2pdf/scihub.py +++ b/scihub2pdf/scihub.py @@ -2,10 +2,12 @@ import requests from lxml import html from title2bib.crossref import get_bib_from_title +from arxivcheck.arxiv import get_arxiv_pdf_link import bibtexparser from builtins import input from PIL import Image from . import __version__ +import re try: from StringIO import StringIO except ImportError: @@ -15,7 +17,7 @@ "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36" } print("\n\t Checking state of scihub...") -url_state ="https://raw.githubusercontent.com/bibcure/scihub_state/master/state.txt" +url_state = "https://raw.githubusercontent.com/bibcure/scihub_state/master/state.txt" try: r = requests.get(url_state) state_scihub = [i.split(">>")[1] for i in r.iter_lines()] @@ -66,7 +68,7 @@ def download_from_libgen(bib, s): return download_link = norm_url(html_a[0].attrib["href"]) - bib["scihub"] = download_link + bib["pdf_link"] = download_link download_pdf(bib, s) return @@ -108,7 +110,7 @@ def download_from_scihub(bib, s): print("\n\t", url, "\n") return - iframe_url = iframe_url[0] + iframe_url = norm_url(iframe_url[0]) ri = s.get(iframe_url, headers=headers) html_tree_ri = html.fromstring(ri.content) @@ -123,13 +125,13 @@ def download_from_scihub(bib, s): return download_link = 
norm_url(html_pdf[0].attrib["src"]) - bib["scihub"] = download_link + bib["pdf_link"] = download_link download_pdf(bib, s) return def download_pdf(bib, s): - r = s.get(bib["scihub"], headers=headers) + r = s.get(bib["pdf_link"], headers=headers) if r.status_code == 200: pdf_file = open(bib["pdf_file"], "wb") pdf_file.write(r.content) @@ -147,29 +149,43 @@ def download_pdf_from_bibs(bibs, location="", def put_pdf_location(bib): pdf_name = bib["ID"] if "ID" in bib else bib["doi"].replace("/", "_") pdf_name += ".pdf" - pdf_file = location+pdf_name - - bib["pdf_file"] = pdf_file + bib["pdf_file"] = location+pdf_name return bib - bibs_with_doi = list(filter(lambda bib: "doi" in bib, bibs)) - - bibs = list(map(put_pdf_location, bibs_with_doi)) - + # bibs_with_doi = list(filter(lambda bib: "doi" in bib, bibs)) + bibs_with_doi = [] + # bibs_arxiv = [] + for bib in bibs: + if "journal" in bib: + if bool(re.match("arxiv:", bib["journal"], re.I)): + download_from_arxiv(bib["journal"], "id", location) + elif "doi" in bib: + bibs_with_doi.append(bib) + + elif "doi" in bib: + bibs_with_doi.append(bib) + # bibs_journal = list(filter(lambda bib: "journal" in bib, bibs)) + # bibs_arxiv = list( + # filter( + # lambda bib: bool(re.match("arxiv:", bib["journal"], re.I)) in bib, bibs_journal + # ) + # ) + + bibs_with_doi = list(map(put_pdf_location, bibs_with_doi)) + + # libgen has no captcha, try to use multiprocessing? with requests.Session() as s: if use_libgen: - #libgen has no captcha, try to use multiprocessing? 
- list(map(lambda bib: download_from_libgen(bib, s), bibs)) + list(map(lambda bib: download_from_libgen(bib, s), bibs_with_doi)) else: - for bib in bibs: + for bib in bibs_with_doi: download_from_scihub(bib, s) def download_from_doi(doi, location="", use_libgen=False): bib = {"doi": doi} - pdf_name = "sci2pdf-{}.pdf".format(doi.replace("/", "_")) - pdf_file = location+pdf_name - bib["pdf_file"] = pdf_file + pdf_name = "{}.pdf".format(doi.replace("/", "_")) + bib["pdf_file"] = location+pdf_name with requests.Session() as s: if use_libgen: @@ -184,11 +200,10 @@ def download_from_title(title, location="", use_libgen=False): if found: bib = bibtexparser.loads(bib_string).entries[0] if "doi" in bib: - pdf_name = "sci2pdf-{}.pdf".format( + pdf_name = "{}.pdf".format( bib["doi"].replace("/", "_") ) - pdf_file = location+pdf_name - bib["pdf_file"] = pdf_file + bib["pdf_file"] = location+pdf_name with requests.Session() as s: if use_libgen: download_from_libgen(bib, s) @@ -196,3 +211,20 @@ def download_from_title(title, location="", use_libgen=False): download_from_scihub(bib, s) else: print("Absent DOI") + + +def download_from_arxiv(value, field="id", location=""): + + value = re.sub("arxiv\:", "", value) + found, pdf_link = get_arxiv_pdf_link(value, field) + if found and pdf_link is not None: + bib = {} + pdf_name = "{}.pdf".format(value.replace("/", "_")) + bib["pdf_file"] = location+pdf_name + + bib["pdf_link"] = pdf_link + s = requests.Session() + download_pdf(bib, s) + else: + print("Arxiv not found.") + diff --git a/setup.py b/setup.py index 6fd27b6..6121b5f 100755 --- a/setup.py +++ b/setup.py @@ -6,23 +6,23 @@ setup( name="scihub2pdf", - version="0.1.0", + version="0.2.0", packages = find_packages(exclude=["build",]), - # packages = find_packages(), scripts=["scihub2pdf/bin/scihub2pdf"], long_description = README_TEXT, install_requires = ["bibtexparser", "title2bib", + "arxivcheck", "future", "Pillow", "requests", "lxml"], include_package_data=True, 
license="GPLv3", - description="Downloads pdfs via a DOI number, article title or a bibtex file, sci-hub", + description="Downloads pdfs via a DOI number(or arxivId), article title or a bibtex file, sci-hub", author="Bruno Messias", author_email="messias.physics@gmail.com", - download_url="https://github.com/bibcure/scihub2pdf/archive/0.1.0.tar.gz", + download_url="https://github.com/bibcure/scihub2pdf/archive/0.2.0.tar.gz", keywords=["bibtex", "sci-hub", "libgen", "doi", "science","scientific-journals"], classifiers=[