Add new features
* Argument parsing (-d, --no-html, --overwrite)
* Skip to the next download if the current one fails
* Title printing is shortened
* Better URL handling
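
With these flags, a typical invocation might look like the line below (the directory name is only an example; pdfs is the default):

python download.py -d pdfs --no-html --overwrite

i.e. download everything into pdfs/, skip HTML-only links, and delete any existing pdfs/ directory before starting.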
mjanv committed Oct 24, 2016
1 parent 5460c72 commit 7b5edb4
Showing 1 changed file with 88 additions and 35 deletions.
download.py: 123 changes (88 additions, 35 deletions)
@@ -1,53 +1,106 @@
 import os
 import re
 import urllib2
 import shutil
 import argparse
 import mistune
 import bs4 as BeautifulSoup

 def download_pdf(link, location, name):
-    try:
-        response = urllib2.urlopen(link)
-        file = open(os.path.join(location, name), 'w')
-        file.write(response.read())
-        file.close()
-    except urllib2.HTTPError:
-        print('>>> Error 404: cannot be downloaded!\n')
+    try:
+        response = urllib2.urlopen(link)
+        file = open(os.path.join(location, name), 'w')
+        file.write(response.read())
+        file.close()
+    except urllib2.HTTPError:
+        print('>>> Error 404: cannot be downloaded!\n')
+        raise

 def clean_pdf_link(link):
-    if 'arxiv' in link:
-        link = link.replace('abs', 'pdf')
-    return link
+    if 'arxiv' in link:
+        link = link.replace('abs', 'pdf')
+        if not(link.endswith('.pdf')):
+            link = '.'.join((link, 'pdf'))
+    if 'github' in link:
+        link = '.'.join((link, 'html'))
+    return link

+def clean_text(text, replacements = {' ': '_', '/': '_', '.': '', '"': ''}):
+    for key, rep in replacements.items():
+        text = text.replace(key, rep)
+    return text

+def print_title(title, pattern = "-"):
+    print('\n'.join(("", title, pattern * len(title))))

+def get_extension(link):
+    extension = os.path.splitext(link)[1][1:]
+    if extension in ['pdf', 'html']:
+        return extension
+    if 'pdf' in extension:
+        return 'pdf'
+    return 'pdf'

+def shorten_title(title):
+    m1 = re.search('[[0-9]*]', title)
+    m2 = re.search('".*"', title)
+    if m1:
+        title = m1.group(0)
+    if m2:
+        title = ' '.join((title, m2.group(0)))
+    return title[:50] + ' [...]'

-def clean_header(text):
-    return text.replace(' ', '_').replace('/', '_')

 if __name__ == '__main__':

-    with open('README.md') as readme:
-        readme_html = mistune.markdown(readme.read())
-        readme_soup = BeautifulSoup.BeautifulSoup(readme_html, "html.parser")
+    parser = argparse.ArgumentParser(description = 'Download all the PDF/HTML links into README.md')
+    parser.add_argument('-d', action="store", dest="directory")
+    parser.add_argument('--no-html', action="store_true", dest="nohtml", default = False)
+    parser.add_argument('--overwrite', action="store_true", default = False)
+    results = parser.parse_args()

+    output_directory = 'pdfs' if results.directory is None else results.directory

+    forbidden_extensions = ['html', 'htm'] if results.nohtml else []

-    point = readme_soup.find_all('h1')[1]
+    if results.overwrite and os.path.exists(output_directory):
+        shutil.rmtree(output_directory)

-    while point is not None:
-        if point.name == 'h1':
-            level1_directory = os.path.join('pdfs', clean_header(point.text))
-            os.makedirs(level1_directory)
-            print('\n'.join(("", point.text, "+" * len(point.text))))
+    with open('README.md') as readme:
+        readme_html = mistune.markdown(readme.read())
+        readme_soup = BeautifulSoup.BeautifulSoup(readme_html, "html.parser")

-        elif point.name == 'h2':
-            current_directory = os.path.join(level1_directory, clean_header(point.text))
-            os.mkdir(current_directory)
-            print('\n'.join(("", point.text, "-" * len(point.text))))
+    point = readme_soup.find_all('h1')[1]

-        elif point.name == 'p':
-            link = clean_pdf_link(point.find('a').attrs['href'])
-            extension = os.path.splitext(link)[1][1:]
-            extension = 'pdf' if extension not in ['pdf', 'html'] else extension
-            name = point.text.split('[' + extension + ']')[0].replace('.', '').replace('/', '_')
-            if link is not None:
-                print(name + ' (' + link + ')')
-                download_pdf(link, current_directory, '.'.join((name, extension)))

-        point = point.next_sibling
+    failures = []
+    while point is not None:
+        if point.name:
+            if re.search('h[1-2]', point.name):
+                if point.name == 'h1':
+                    h1_directory = os.path.join(output_directory, clean_text(point.text))
+                    current_directory = h1_directory
+                elif point.name == 'h2':
+                    current_directory = os.path.join(h1_directory, clean_text(point.text))
+                os.makedirs(current_directory)
+                print_title(point.text)

+            if point.name == 'p':
+                link = point.find('a')
+                if link is not None:
+                    link = clean_pdf_link(link.attrs['href'])
+                    ext = get_extension(link)
+                    if not ext in forbidden_extensions:
+                        print(shorten_title(point.text) + ' (' + link + ')')
+                        try:
+                            name = clean_text(point.text.split('[' + ext + ']')[0])
+                            download_pdf(link, current_directory, '.'.join((name, ext)))
+                        except:
+                            failures.append(point.text)

+        point = point.next_sibling

     print('Done!')
+    if failures:
+        print('Some downloads have failed:')
+        for fail in failures:
+            print('> ' + fail)
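
As a rough illustration of the new URL handling and shortened title printing, here is a minimal sketch (Python 2, like the script; the links and title are made-up examples, and it assumes download.py and its dependencies are importable):

from download import clean_pdf_link, shorten_title

print(clean_pdf_link('https://arxiv.org/abs/1512.03385'))
# https://arxiv.org/pdf/1512.03385.pdf   ('abs' -> 'pdf', '.pdf' appended)

print(clean_pdf_link('https://github.com/user/repo'))
# https://github.com/user/repo.html      (GitHub links are saved as HTML)

print(shorten_title('[1] "Deep Residual Learning" (2015). [pdf]'))
# [1] "Deep Residual Learning" [...]     (keeps the reference number and quoted title)

The ' [...]' suffix is always appended and titles are cut at 50 characters, so console output stays short even for long paper references.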
