Skip to content

Commit

Permalink
Merge branch 'release/0.4.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
devmessias committed Aug 27, 2017
2 parents d8abe0c + 34107c4 commit c66e7e8
Show file tree
Hide file tree
Showing 12 changed files with 1,218 additions and 237 deletions.
661 changes: 661 additions & 0 deletions LICENSE.txt

Large diffs are not rendered by default.

19 changes: 18 additions & 1 deletion README
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ scihub2pdf is a module of
`bibcure <https://github.com/bibcure/bibcure>`__

Downloads pdfs via a DOI number, article title or a bibtex file, using
the database of libgen or Sci-Hub.
the database of libgen, Sci-Hub and Arxiv.

Install
-------
Expand All @@ -17,6 +17,23 @@ Install

$ sudo pip install scihub2pdf

If you want to download files from Sci-Hub, you will need to install PhantomJS.

OSX
~~~

::

$ brew install phantomjs

Linux Using npm
~~~~~~~~~~~~~~~

::

$ sudo apt-get install npm
$ sudo npm install -g phantomjs

Features and how to use
-----------------------

Expand Down
16 changes: 15 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,27 @@
scihub2pdf is a module of [bibcure](https://github.com/bibcure/bibcure)

Downloads pdfs via a DOI number, article title or a bibtex file, using the
database of libgen or Sci-Hub.
database of libgen, Sci-Hub and Arxiv.

## Install

```
$ sudo pip install scihub2pdf
```
If you want to download files from Sci-Hub, you will need to install PhantomJS.

### OSX
```
$ brew install phantomjs
```
### Linux Using npm

```
$ sudo apt-get install npm
$ sudo npm install -g phantomjs
```



## Features and how to use

Expand Down
2 changes: 2 additions & 0 deletions requeriments.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@ lxml
title2bib
Pillow
arxivcheck
selenium
six
6 changes: 4 additions & 2 deletions scihub2pdf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@

__version__ = "0.3.1"
__version__ = "0.4.0"
__license__ = "AGPLv3"
__author__ = "Bruno Messias"
__author_email__ = "[email protected]"
35 changes: 35 additions & 0 deletions scihub2pdf/arxiv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from __future__ import unicode_literals, print_function, absolute_import
import requests
from arxivcheck.arxiv import get_arxiv_pdf_link
from scihub2pdf.tools import download_pdf


class Arxiv(object):
    """Resolve and download arXiv PDFs via arxivcheck.

    Mirrors the scraper interface used by download.py
    (start() -> navigate_to() -> download()).
    """

    def __init__(self, headers=None):
        # Fix: the original default was a shared mutable dict (headers={});
        # use None as the sentinel to avoid the mutable-default pitfall.
        self.headers = {} if headers is None else headers
        self.s = None          # requests.Session, created by start()
        self.pdf_file = None   # destination path, set by navigate_to()
        self.pdf_url = None    # resolved PDF URL, set by navigate_to()

    def start(self):
        """Open the HTTP session later used by download()."""
        self.s = requests.Session()

    def download(self):
        """Download self.pdf_url into self.pdf_file.

        Returns (found, response) from scihub2pdf.tools.download_pdf.
        Call navigate_to() first so pdf_url and pdf_file are set.
        """
        found, r = download_pdf(
            self.s,
            self.pdf_file,
            self.pdf_url,
            self.headers)

        return found, r

    def navigate_to(self, value, pdf_file, field="id"):
        """Resolve an arXiv id or title to a direct PDF link.

        value: arXiv identifier (field="id") or article title (field="ti").
        pdf_file: destination path for a subsequent download() call.
        Returns (found, pdf_url).
        """
        self.pdf_file = pdf_file
        found, self.pdf_url = get_arxiv_pdf_link(value, field)

        print("\n\t"+value)
        print("\tLINK: ", self.pdf_url)
        if not found:
            print("\tArxiv ", value, " not found\n")
            print(self.pdf_url)
        return found, self.pdf_url
48 changes: 37 additions & 11 deletions scihub2pdf/bin/scihub2pdf
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
#!/usr/bib/env python
from __future__ import print_function
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function, absolute_import
import sys
import bibtexparser
import argparse
import textwrap
# from sci2pdf.libgen import download_pdf_from_bibs, download_from_doi
# from sci2pdf.libgen import download_from_title
from scihub2pdf.scihub import download_pdf_from_bibs, download_from_doi
from scihub2pdf.scihub import download_from_title, download_from_arxiv
from unidecode import unidecode
from scihub2pdf.download import (download_pdf_from_bibs, download_from_doi,
download_from_title, download_from_arxiv,
start_scihub, start_libgen, start_arxiv)
import re
import io

pyversion = sys.version_info[0]

def main():
parser = argparse.ArgumentParser(
Expand Down Expand Up @@ -85,7 +88,6 @@ def main():
parser.add_argument(
"--input", "-i",
dest="inputfile",
type=argparse.FileType("r"),
help="bibtex input file"
)
parser.add_argument(
Expand All @@ -98,7 +100,7 @@ def main():
"--uselibgen",
dest="uselibgen",
action="store_true",
help="Use libgen.io instead sci-hub. Sci-hub has annoying captcha but is stable. Libgen has no captcha but is unstable."
help="Use libgen.io instead sci-hub."
)
parser.add_argument(
"--location", "-l",
Expand All @@ -123,23 +125,32 @@ def main():
location = args[0].location

if use_libgen:
start_libgen()
print("\n\t Using Libgen.\n")
else:
start_scihub()
print("\n\t Using Sci-Hub.\n")

start_arxiv()

if inline_search:
value = " ".join(args[1])

#values = [c if pyversion == 3 else c.decode(sys.stdout.encoding) for c in args[1]]
values = [c if pyversion == 3 else c.decode(sys.stdout.encoding) for c in args[1]]
value = " ".join(values)
is_arxiv = bool(re.match("arxiv:", value, re.I))
if is_arxiv:
field = "ti" if title_search else "id"
download_from_arxiv(value, field, location)
download_from_arxiv(value, location, field)
elif title_search:
download_from_title(value, location, use_libgen)
else:
download_from_doi(value, location, use_libgen)
else:
if is_txt:
file_values = args[0].inputfile.read()
with io.open(args[0].inputfile, "r", encoding = "utf-8") as inputfile:
file_values = inputfile.read()

for value in file_values.split("\n"):
is_arxiv = bool(re.match("arxiv:", value, re.I))
if value != "":
Expand All @@ -152,7 +163,22 @@ def main():
download_from_doi(value, location, use_libgen)

else:
bibtex = bibtexparser.loads(args[0].inputfile.read())
dict_parser = {
'keywords': 'keyword',
'keyw': 'keyword',
'subjects': 'subject',
'urls': 'url',
'link': 'url',
'links': 'url',
'editors': 'editor',
'authors': 'author'}
parser = bibtexparser.bparser.BibTexParser()
parser.alt_dict = dict_parser
with io.open(args[0].inputfile, "r", encoding="utf-8") as inputfile:
refs_string = args[0].inputfile.read()

bibtex = bibtexparser.loads(refs_string,
parser=parser)
bibs = bibtex.entries
if len(bibs) == 0:
print("Input File is empty or corrupted.")
Expand Down
180 changes: 180 additions & 0 deletions scihub2pdf/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
from __future__ import unicode_literals, print_function, absolute_import

from builtins import input
import bibtexparser
# from . import __version__
# from lxml.etree import ParserError
import re
from title2bib.crossref import get_bib_from_title
from scihub2pdf.scihub import SciHub
from scihub2pdf.libgen import LibGen
from scihub2pdf.arxiv import Arxiv
# Default HTTP headers sent with every scraper request; all optional
# entries are currently disabled.
headers = {
    # "Connection": "keep-alive",
    # "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
}
# Disabled mirror/state self-check: fetched current mirror URLs, XPaths
# and an update hint from the bibcure/scihub_state repository.
# print("\n\t Checking state of scihub...")
# url_state = "https://raw.githubusercontent.com/bibcure/scihub_state/master/state.txt"
# try:
#     r = requests.get(url_state, headers=headers)
#     state_scihub = [i.split(">>")[1] for i in r.iter_lines()]
#     url_captcha_scihub = state_scihub[0]
#     url_libgen = state_scihub[1]
#     url_scihub = state_scihub[2]
#     xpath_libgen_a = state_scihub[3]
#     xpath_scihub_captcha = state_scihub[4]
#     xpath_scihub_iframe = state_scihub[5]
#     xpath_scihub_pdf = state_scihub[6]
#     has_update = state_scihub[7] != __version__
#     if has_update:
#         print("\n\t\tWill be better if you upgrade scihub2pdf.")
#         print("\t\tFor that, just do:\n")
#         print("\t\t\t sudo pip install scihub2pdf --upgrade\n")
# except:
#     s = None

# LibGen mirror endpoint and the XPath of the download anchor on its page.
libgen_url = "http://libgen.io/scimag/ads.php"
libgen_xpath_pdf_url = "/html/body/table/tr/td[3]/a"
# Sci-Hub page XPaths: captcha image, PDF iframe, captcha form input/form.
xpath_captcha = "//*[@id='captcha']"
xpath_pdf = "//*[@id='pdf']"
xpath_input = "/html/body/div/table/tbody/tr/td/form/input"
xpath_form = "/html/body/div/table/tbody/tr/td/form"
domain_scihub = "http://sci-hub.cc/"  # Sci-Hub mirror in use

# Module-level scraper singletons shared by the download_* helpers below;
# each must be activated via the corresponding start_*() before use.
ScrapSci = SciHub(headers,
                  xpath_captcha,
                  xpath_pdf,
                  xpath_input,
                  xpath_form,
                  domain_scihub
                  )
ScrapArx = Arxiv(headers)
ScrapLib = LibGen(headers=headers,
                  libgen_url=libgen_url,
                  xpath_pdf_url=libgen_xpath_pdf_url)


def start_scihub():
    # Activate the module-level Sci-Hub scraper (opens its session/driver).
    ScrapSci.start()

def start_libgen():
    # Activate the module-level LibGen scraper (opens its session).
    ScrapLib.start()

def start_arxiv():
    # Activate the module-level arXiv scraper (opens its requests session).
    ScrapArx.start()


def download_from_libgen(doi, pdf_file):
    """Try to fetch *doi* from LibGen and save it to *pdf_file*.

    Returns (found, response); found is False as soon as any step of the
    navigate -> parse -> link-extraction -> download chain fails.
    """
    reached, response = ScrapLib.navigate_to(doi, pdf_file)
    if reached:
        parsed, _tree = ScrapLib.generate_tree()
        if parsed:
            has_link, _url = ScrapLib.get_pdf_url()
            if has_link:
                # All preconditions met: the download result is final.
                return ScrapLib.download()
    # Any earlier failure reports the navigation response.
    return False, response


def download_from_arxiv(value, location, field="id"):
    """Download a single arXiv paper.

    value: arXiv identifier (field="id") or title (field="ti").
    location: either a full path ending in ".pdf" (used as-is) or a
        directory prefix to which a name derived from *value* is appended.
    Returns (found, pdf_url).
    """
    if location.endswith(".pdf"):
        pdf_file = location
    else:
        # Fix: sanitize the id before using it as a file name. Old-style
        # ids such as "math/0211159" contain "/", which would otherwise
        # be treated as a directory separator (mirrors download_from_doi).
        pdf_file = location + value.replace("/", "_") + ".pdf"

    found, pdf_url = ScrapArx.navigate_to(value, pdf_file, field)
    if found:
        found, r = ScrapArx.download()

    return found, pdf_url


def download_from_scihub(doi, pdf_file):
    """Fetch *doi* from Sci-Hub, solving captchas interactively.

    Keeps prompting the user (via input()) for the captcha text for as
    long as Sci-Hub serves a captcha page. Returns (found, response);
    ``found`` reflects whether the PDF iframe was eventually reached.
    """
    found, r = ScrapSci.navigate_to(doi, pdf_file)
    if not found:
        return False, r

    has_captcha, has_iframe = ScrapSci.check_captcha()
    while (has_captcha and has_iframe):
        captcha_img = ScrapSci.get_captcha_img()
        captcha_img.show()
        captcha_text = input("\tPut captcha:\n\t")
        has_captcha, has_iframe = ScrapSci.solve_captcha(captcha_text)

    if has_iframe:
        # NOTE(review): download()'s own success flag is discarded here —
        # the caller only learns whether the PDF iframe was reached.
        # (Removed the original dead store `found = has_iframe`.)
        found, r = ScrapSci.download()

    return has_iframe, r


def download_pdf_from_bibs(bibs, location="",
                           use_libgen=False):
    """Download every entry of a parsed bibtex entry list.

    Entries whose "journal" field looks like an arXiv id are fetched from
    arXiv immediately; entries carrying a "doi" go through LibGen
    (use_libgen=True) or Sci-Hub. Entries with neither are skipped.
    """

    def put_pdf_location(bib):
        # File name: the bibtex key when present, otherwise the DOI with
        # "/" replaced so it is a valid path component.
        pdf_name = bib["ID"] if "ID" in bib else bib["doi"].replace("/", "_")
        pdf_name += ".pdf"
        bib["pdf_file"] = location + pdf_name
        return bib

    bibs_with_doi = []
    for bib in bibs:
        journal = bib.get("journal")
        if journal is not None and re.match("arxiv:", journal, re.I):
            pdf_file = location + journal + ".pdf"
            download_from_arxiv(journal, pdf_file, "id")
        elif "doi" in bib:
            # Covers both original branches: journal present but not
            # arXiv, and no journal at all.
            bibs_with_doi.append(bib)

    bibs_with_doi = [put_pdf_location(bib) for bib in bibs_with_doi]
    if use_libgen:
        # LibGen has no captcha, so entries can be fetched back to back.
        for bib in bibs_with_doi:
            download_from_libgen(bib["doi"], bib["pdf_file"])
    else:
        for bib in bibs_with_doi:
            found, bib = download_from_scihub(bib["doi"], bib["pdf_file"])


def download_from_doi(doi, location="", use_libgen=False):
    """Download the paper identified by *doi* into *location*.

    The file name is the DOI with "/" replaced by "_" so it is a valid
    path component; the backend is LibGen or Sci-Hub per *use_libgen*.
    """
    safe_name = "{}.pdf".format(doi.replace("/", "_"))
    fetch = download_from_libgen if use_libgen else download_from_scihub
    fetch(doi, location + safe_name)


def download_from_title(title, location="", use_libgen=False):
    """Resolve *title* to a bibtex entry and download its PDF.

    Uses title2bib (CrossRef) to obtain a bibtex string, then downloads
    via LibGen or Sci-Hub depending on *use_libgen*. Prints a message
    when the entry cannot be resolved or carries no DOI.
    """
    found, bib_string = get_bib_from_title(title)
    if found:
        entries = bibtexparser.loads(bib_string).entries
        if not entries:
            # Robustness fix: the original indexed entries[0] directly and
            # raised IndexError on an unparsable/empty bibtex string.
            print("\tCould not parse a bibtex entry for this title")
            return
        bib = entries[0]
        if "doi" in bib:
            pdf_name = "{}.pdf".format(
                bib["doi"].replace("/", "_")
            )
            bib["pdf_file"] = location + pdf_name
            if use_libgen:
                download_from_libgen(bib["doi"], bib["pdf_file"])
            else:
                found, bib = download_from_scihub(bib["doi"], bib["pdf_file"])
        else:
            print("\tAbsent DOI")
Loading

0 comments on commit c66e7e8

Please sign in to comment.