Skip to content

Commit

Permalink
Add script for automatic papers download
Browse files Browse the repository at this point in the history
   virtualenv --python=2.7 py
   source py/bin/activate
   pip install -r requirements.txt
   python download.py
  • Loading branch information
mjanv committed Oct 23, 2016
1 parent 4b783b1 commit 3313775
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 0 deletions.
52 changes: 52 additions & 0 deletions download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import os
import urllib2
import mistune
import bs4 as BeautifulSoup

def download_pdf(link, location, name):
    """Download ``link`` and store it as ``name`` inside ``location``.

    The payload is fetched completely before the destination file is
    created, so a failed download never leaves an empty file behind.

    Args:
        link: URL of the document to fetch.
        location: Existing directory that receives the file.
        name: File name (including extension) to write.

    Returns:
        None. An HTTP error is reported on stdout and the file is skipped.
    """
    try:
        response = urllib2.urlopen(link)
        try:
            content = response.read()
        finally:
            # urllib2 responses are not context managers; close explicitly
            # so the connection is released even if read() fails.
            response.close()
    except urllib2.HTTPError:
        print('>>> Error 404: cannot be downloaded!\n')
        return

    # Binary mode: the payload is raw bytes and must not go through
    # newline translation.
    with open(os.path.join(location, name), 'wb') as pdf_file:
        pdf_file.write(content)

def clean_pdf_link(link):
    """Return ``link`` with an arXiv abstract path rewritten to the PDF path.

    Only the first ``/abs/`` path segment of a link containing ``arxiv`` is
    turned into ``/pdf/``; other occurrences of the letters ``abs`` (for
    example in a query string) are left alone. Any other link is returned
    unchanged.

    Args:
        link: URL taken from the README.

    Returns:
        The (possibly rewritten) URL.
    """
    if 'arxiv' in link:
        # Replace the path segment only, and only once.
        link = link.replace('/abs/', '/pdf/', 1)
    return link

def clean_header(text):
    """Return ``text`` with every space and forward slash turned into ``_``."""
    cleaned = text
    for unwanted in (' ', '/'):
        cleaned = cleaned.replace(unwanted, '_')
    return cleaned

def _ensure_directory(path):
    """Create ``path`` (with any missing parents) unless it already exists."""
    if not os.path.isdir(path):
        os.makedirs(path)


if __name__ == '__main__':
    # Render README.md to HTML, then walk the sibling elements that follow
    # the second <h1>: <h1>/<h2> headings become nested directories under
    # "pdfs", and the first link of every <p> is downloaded into the
    # directory of the most recent heading.

    with open('README.md') as readme:
        readme_html = mistune.markdown(readme.read())
    readme_soup = BeautifulSoup.BeautifulSoup(readme_html, "html.parser")

    top_headings = readme_soup.find_all('h1')
    if len(top_headings) < 2:
        raise SystemExit('README.md must contain at least two level-1 headings.')

    # The first <h1> is skipped (presumably the document title -- confirm
    # against the README layout).
    point = top_headings[1]

    level1_directory = None
    current_directory = None

    while point is not None:
        if point.name == 'h1':
            level1_directory = os.path.join('pdfs', clean_header(point.text))
            _ensure_directory(level1_directory)
            # Papers listed before any <h2> land directly in the <h1> folder
            # instead of a stale sub-folder from the previous section.
            current_directory = level1_directory
            print('\n'.join((point.text, "+" * len(point.text), "")))

        elif point.name == 'h2':
            current_directory = os.path.join(level1_directory, clean_header(point.text))
            _ensure_directory(current_directory)
            print('\n'.join((point.text, "+" * len(point.text), "")))

        elif point.name == 'p':
            anchor = point.find('a')
            href = anchor.get('href') if anchor is not None else None
            # Paragraphs without a usable link are skipped.
            if href:
                link = clean_pdf_link(href)
                extension = os.path.splitext(link)[1][1:]
                # Keep the text before the "[<extension>]" marker and strip
                # characters that would confuse the file name.
                name = point.text.split('[' + extension + ']')[0].replace('.', '').replace('/', '_')
                print(name + ' (' + link + ')')
                download_pdf(link, current_directory, '.'.join((name, extension)))

        # Non-tag siblings (e.g. whitespace strings) match no branch above
        # and are simply stepped over.
        point = point.next_sibling


2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
mistune>=0.7.2
beautifulsoup4>=4.4.1

0 comments on commit 3313775

Please sign in to comment.