Add new features
* Argument parsing (-d, --no-html, --overwrite)
* Skip to the next download if the current one fails
* Title printing is shortened
* Better URL handling
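
With these flags, a typical invocation might look like the line below (the directory name is only an example; pdfs is the default):

python download.py -d pdfs --no-html --overwrite

i.e. download everything into pdfs/, skip HTML-only links, and delete any existing pdfs/ directory before starting.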
mjanv committed Oct 24, 2016
1 parent 5460c72 commit 7b5edb4
Showing 1 changed file with 88 additions and 35 deletions.
download.py: 123 changes (88 additions, 35 deletions)
@@ -1,53 +1,106 @@
 import os
 import re
 import urllib2
 import shutil
 import argparse
 import mistune
 import bs4 as BeautifulSoup

 def download_pdf(link, location, name):
-    try:
-        response = urllib2.urlopen(link)
-        file = open(os.path.join(location, name), 'w')
-        file.write(response.read())
-        file.close()
-    except urllib2.HTTPError:
-        print('>>> Error 404: cannot be downloaded!\n')
+    try:
+        response = urllib2.urlopen(link)
+        file = open(os.path.join(location, name), 'w')
+        file.write(response.read())
+        file.close()
+    except urllib2.HTTPError:
+        print('>>> Error 404: cannot be downloaded!\n')
+        raise

 def clean_pdf_link(link):
-    if 'arxiv' in link:
-        link = link.replace('abs', 'pdf')
-    return link
+    if 'arxiv' in link:
+        link = link.replace('abs', 'pdf')
+        if not(link.endswith('.pdf')):
+            link = '.'.join((link, 'pdf'))
+    if 'github' in link:
+        link = '.'.join((link, 'html'))
+    return link

+def clean_text(text, replacements = {' ': '_', '/': '_', '.': '', '"': ''}):
+    for key, rep in replacements.items():
+        text = text.replace(key, rep)
+    return text

+def print_title(title, pattern = "-"):
+    print('\n'.join(("", title, pattern * len(title))))

+def get_extension(link):
+    extension = os.path.splitext(link)[1][1:]
+    if extension in ['pdf', 'html']:
+        return extension
+    if 'pdf' in extension:
+        return 'pdf'
+    return 'pdf'

+def shorten_title(title):
+    m1 = re.search('[[0-9]*]', title)
+    m2 = re.search('".*"', title)
+    if m1:
+        title = m1.group(0)
+    if m2:
+        title = ' '.join((title, m2.group(0)))
+    return title[:50] + ' [...]'

-def clean_header(text):
-    return text.replace(' ', '_').replace('/', '_')

 if __name__ == '__main__':

-    with open('README.md') as readme:
-        readme_html = mistune.markdown(readme.read())
-        readme_soup = BeautifulSoup.BeautifulSoup(readme_html, "html.parser")
+    parser = argparse.ArgumentParser(description = 'Download all the PDF/HTML links into README.md')
+    parser.add_argument('-d', action="store", dest="directory")
+    parser.add_argument('--no-html', action="store_true", dest="nohtml", default = False)
+    parser.add_argument('--overwrite', action="store_true", default = False)
+    results = parser.parse_args()

+    output_directory = 'pdfs' if results.directory is None else results.directory

+    forbidden_extensions = ['html', 'htm'] if results.nohtml else []

-    point = readme_soup.find_all('h1')[1]
+    if results.overwrite and os.path.exists(output_directory):
+        shutil.rmtree(output_directory)

-    while point is not None:
-        if point.name == 'h1':
-            level1_directory = os.path.join('pdfs', clean_header(point.text))
-            os.makedirs(level1_directory)
-            print('\n'.join(("", point.text, "+" * len(point.text))))
+    with open('README.md') as readme:
+        readme_html = mistune.markdown(readme.read())
+        readme_soup = BeautifulSoup.BeautifulSoup(readme_html, "html.parser")

-        elif point.name == 'h2':
-            current_directory = os.path.join(level1_directory, clean_header(point.text))
-            os.mkdir(current_directory)
-            print('\n'.join(("", point.text, "-" * len(point.text))))
+    point = readme_soup.find_all('h1')[1]

-        elif point.name == 'p':
-            link = clean_pdf_link(point.find('a').attrs['href'])
-            extension = os.path.splitext(link)[1][1:]
-            extension = 'pdf' if extension not in ['pdf', 'html'] else extension
-            name = point.text.split('[' + extension + ']')[0].replace('.', '').replace('/', '_')
-            if link is not None:
-                print(name + ' (' + link + ')')
-                download_pdf(link, current_directory, '.'.join((name, extension)))

-        point = point.next_sibling
+    failures = []
+    while point is not None:
+        if point.name:
+            if re.search('h[1-2]', point.name):
+                if point.name == 'h1':
+                    h1_directory = os.path.join(output_directory, clean_text(point.text))
+                    current_directory = h1_directory
+                elif point.name == 'h2':
+                    current_directory = os.path.join(h1_directory, clean_text(point.text))
+                os.makedirs(current_directory)
+                print_title(point.text)

+            if point.name == 'p':
+                link = point.find('a')
+                if link is not None:
+                    link = clean_pdf_link(link.attrs['href'])
+                    ext = get_extension(link)
+                    if not ext in forbidden_extensions:
+                        print(shorten_title(point.text) + ' (' + link + ')')
+                        try:
+                            name = clean_text(point.text.split('[' + ext + ']')[0])
+                            download_pdf(link, current_directory, '.'.join((name, ext)))
+                        except:
+                            failures.append(point.text)

+        point = point.next_sibling

     print('Done!')
+    if failures:
+        print('Some downloads have failed:')
+        for fail in failures:
+            print('> ' + fail)
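
As a rough illustration of the new URL handling and shortened title printing, here is a minimal sketch (Python 2, like the script; the links and title are made-up examples, and it assumes download.py and its dependencies are importable):

from download import clean_pdf_link, shorten_title

print(clean_pdf_link('https://arxiv.org/abs/1512.03385'))
# https://arxiv.org/pdf/1512.03385.pdf   ('abs' -> 'pdf', '.pdf' appended)

print(clean_pdf_link('https://github.com/user/repo'))
# https://github.com/user/repo.html      (GitHub links are saved as HTML)

print(shorten_title('[1] "Deep Residual Learning" (2015). [pdf]'))
# [1] "Deep Residual Learning" [...]     (keeps the reference number and quoted title)

The ' [...]' suffix is always appended and titles are cut at 50 characters, so console output stays short even for long paper references.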
