Merge branch 'release/0.2.0'

bibcure · Aug 24, 2017 · 4d6f74a · 4d6f74a
2 parents 2811f15 + b09aaa6
commit 4d6f74a
Show file tree

Hide file tree

Showing 6 changed files with 94 additions and 29 deletions.
diff --git a/README.md b/README.md
@@ -1,5 +1,6 @@
 # SciHub to PDF(Beta)
 
+
 ## Description
 
 scihub2pdf is a module of [bibcure](https://github.com/bibcure/bibcure)
@@ -27,8 +28,15 @@ $ scihub2pdf 10.1038/s41524-017-0032-0
 
 Given a title...
 ```
-$ sci2bib --title An useful paper
+$ scihub2pdf --title An useful paper
+```
+
+Arxiv...
+```
+$ scihub2pdf arxiv:0901.2686
+$ scihub2pdf --title arxiv:Periodic table for topological insulators
 ```
+
 Location folder as argument
 ```
 $ scihub2pdf -i input.bib -l somefoler/
@@ -52,3 +60,17 @@ $ scihub2pdf -i input.bib --uselibgen
 - Unstable
 - No CAPTCHA
 - Slow
+
+## Using bibcure modules
+
+Given a text file like
+```
+10.1063/1.3149495
+10.7717/peerj.3714
+.....
+```
+download all the PDFs
+```
+$ doi2bib -i input_dois.txt > refs.bib
+$ scihub2pdf -i refs.bib
+```
diff --git a/requeriments.txt b/requeriments.txt
@@ -4,3 +4,4 @@ requests
 lxml
 title2bib
 Pillow
+arxivcheck
diff --git a/scihub2pdf/__init__.py b/scihub2pdf/__init__.py
@@ -1,2 +1,2 @@
 
-__version__ = "0.1.0"
+__version__ = "0.2.0"
diff --git a/scihub2pdf/bin/scihub2pdf b/scihub2pdf/bin/scihub2pdf
@@ -7,7 +7,7 @@ import textwrap
 # from sci2pdf.libgen import download_pdf_from_bibs, download_from_doi
 # from sci2pdf.libgen import download_from_title
 from scihub2pdf.scihub import download_pdf_from_bibs, download_from_doi
-from scihub2pdf.scihub import download_from_title
+from scihub2pdf.scihub import download_from_title, download_from_arxiv
 
 
 
@@ -33,6 +33,12 @@ def main():
 
         $ scihub2pdf --title An useful paper
 
+        Arxiv...
+
+        $ scihub2pdf arxiv:0901.2686
+
+        $ scihub2pdf --title arxiv:Periodic table for topological insulators
+
        -----------------------------------------------------
             @author: Bruno Messias
             @email: [email protected]
@@ -79,7 +85,11 @@ def main():
 
     if inline_search:
         value = " ".join(args[1])
-        if title_search:
+        is_arxiv = value.startswith("arxiv:")
+        if is_arxiv:
+            field = "ti" if title_search else "id"
+            download_from_arxiv(value, field, location)
+        elif title_search:
             download_from_title(value, location, use_libgen)
         else:
             download_from_doi(value, location, use_libgen)

diff --git a/scihub2pdf/scihub.py b/scihub2pdf/scihub.py
@@ -2,10 +2,12 @@
 import requests
 from lxml import html
 from title2bib.crossref import get_bib_from_title
+from arxivcheck.arxiv import get_arxiv_pdf_link
 import bibtexparser
 from builtins import input
 from PIL import Image
 from . import __version__
+import re
 try:
     from StringIO import StringIO
 except ImportError:
@@ -15,7 +17,7 @@
     "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
 }
 print("\n\t Checking state of scihub...")
-url_state ="https://raw.githubusercontent.com/bibcure/scihub_state/master/state.txt"
+url_state = "https://raw.githubusercontent.com/bibcure/scihub_state/master/state.txt"
 try:
     r = requests.get(url_state)
     state_scihub = [i.split(">>")[1] for i in r.iter_lines()]
@@ -66,7 +68,7 @@ def download_from_libgen(bib, s):
         return
 
     download_link = norm_url(html_a[0].attrib["href"])
-    bib["scihub"] = download_link
+    bib["pdf_link"] = download_link
     download_pdf(bib, s)
 
     return
@@ -108,7 +110,7 @@ def download_from_scihub(bib, s):
         print("\n\t", url, "\n")
         return
 
-    iframe_url = iframe_url[0]
+    iframe_url = norm_url(iframe_url[0])
     ri = s.get(iframe_url, headers=headers)
     html_tree_ri = html.fromstring(ri.content)
 
@@ -123,13 +125,13 @@ def download_from_scihub(bib, s):
         return
 
     download_link = norm_url(html_pdf[0].attrib["src"])
-    bib["scihub"] = download_link
+    bib["pdf_link"] = download_link
     download_pdf(bib, s)
     return
 
 
 def download_pdf(bib, s):
-    r = s.get(bib["scihub"], headers=headers)
+    r = s.get(bib["pdf_link"], headers=headers)
     if r.status_code == 200:
         pdf_file = open(bib["pdf_file"], "wb")
         pdf_file.write(r.content)
@@ -147,29 +149,43 @@ def download_pdf_from_bibs(bibs, location="",
     def put_pdf_location(bib):
         pdf_name = bib["ID"] if "ID" in bib else bib["doi"].replace("/", "_")
         pdf_name += ".pdf"
-        pdf_file = location+pdf_name
-
-        bib["pdf_file"] = pdf_file
+        bib["pdf_file"] = location+pdf_name
         return bib
 
-    bibs_with_doi = list(filter(lambda bib: "doi" in bib, bibs))
-
-    bibs = list(map(put_pdf_location, bibs_with_doi))
-
+    # bibs_with_doi = list(filter(lambda bib: "doi" in bib, bibs))
+    bibs_with_doi = []
+    # bibs_arxiv = []
+    for bib in bibs:
+        if "journal" in bib:
+            if bool(re.match("arxiv:", bib["journal"], re.I)):
+                download_from_arxiv(bib["journal"], "id", location)
+            elif "doi" in bib:
+                bibs_with_doi.append(bib)
+
+        elif "doi" in bib:
+            bibs_with_doi.append(bib)
+    # bibs_journal = list(filter(lambda bib: "journal" in bib, bibs))
+    # bibs_arxiv = list(
+        # filter(
+            # lambda bib: bool(re.match("arxiv:", bib["journal"], re.I)) in bib, bibs_journal
+        # )
+    # )
+
+    bibs_with_doi = list(map(put_pdf_location, bibs_with_doi))
+
+    # libgen has no  captcha, try to use multiprocessing?
     with requests.Session() as s:
         if use_libgen:
-            #libgen has no  captcha, try to use multiprocessing?
-            list(map(lambda bib: download_from_libgen(bib, s), bibs))
+            list(map(lambda bib: download_from_libgen(bib, s), bibs_with_doi))
         else:
-            for bib in bibs:
+            for bib in bibs_with_doi:
                 download_from_scihub(bib, s)
 
 
 def download_from_doi(doi, location="", use_libgen=False):
     bib = {"doi": doi}
-    pdf_name = "sci2pdf-{}.pdf".format(doi.replace("/", "_"))
-    pdf_file = location+pdf_name
-    bib["pdf_file"] = pdf_file
+    pdf_name = "{}.pdf".format(doi.replace("/", "_"))
+    bib["pdf_file"] = location+pdf_name
 
     with requests.Session() as s:
         if use_libgen:
@@ -184,15 +200,31 @@ def download_from_title(title, location="", use_libgen=False):
     if found:
         bib = bibtexparser.loads(bib_string).entries[0]
         if "doi" in bib:
-            pdf_name = "sci2pdf-{}.pdf".format(
+            pdf_name = "{}.pdf".format(
                 bib["doi"].replace("/", "_")
             )
-            pdf_file = location+pdf_name
-            bib["pdf_file"] = pdf_file
+            bib["pdf_file"] = location+pdf_name
             with requests.Session() as s:
                 if use_libgen:
                     download_from_libgen(bib, s)
                 else:
                     download_from_scihub(bib, s)
         else:
             print("Absent DOI")
+
+
+def download_from_arxiv(value, field="id", location=""):
+
+    value = re.sub("arxiv\:", "", value)
+    found, pdf_link = get_arxiv_pdf_link(value, field)
+    if found and pdf_link is not None:
+        bib = {}
+        pdf_name = "{}.pdf".format(value.replace("/", "_"))
+        bib["pdf_file"] = location+pdf_name
+
+        bib["pdf_link"] = pdf_link
+        s = requests.Session()
+        download_pdf(bib, s)
+    else:
+        print("Arxiv not found.")
+
diff --git a/setup.py b/setup.py
@@ -6,23 +6,23 @@
 
 setup(
     name="scihub2pdf",
-    version="0.1.0",
+    version="0.2.0",
     packages = find_packages(exclude=["build",]),
-    # packages = find_packages(),
     scripts=["scihub2pdf/bin/scihub2pdf"],
     long_description = README_TEXT,
     install_requires = ["bibtexparser",
                         "title2bib",
+                        "arxivcheck",
                         "future",
                         "Pillow",
                         "requests",
                         "lxml"],
     include_package_data=True,
     license="GPLv3",
-    description="Downloads pdfs via a DOI number, article title or a bibtex file, sci-hub",
+    description="Downloads pdfs via a DOI number(or arxivId), article title or a bibtex file, sci-hub",
     author="Bruno Messias",
     author_email="[email protected]",
-    download_url="https://github.com/bibcure/scihub2pdf/archive/0.1.0.tar.gz",
+    download_url="https://github.com/bibcure/scihub2pdf/archive/0.2.0.tar.gz",
     keywords=["bibtex", "sci-hub", "libgen", "doi",  "science","scientific-journals"],
 
     classifiers=[
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,3 +4,4 @@ requests @@
     lxml
     title2bib
     Pillow
+    arxivcheck