"""Download the English DBpedia core dumps and merge them into DBpedia.ttl.

The script lists the DBpedia download index, fetches every ``.ttl.bz2``
archive linked from it, decompresses each archive with ``bzip2`` and finally
concatenates all resulting ``.ttl`` files into a single ``DBpedia.ttl`` in the
current working directory.
"""
import os
import subprocess
from time import sleep

import mechanize
import requests

br = mechanize.Browser()
br.open('http://downloads.dbpedia.org/current/core-i18n/en/')

# Keep a copy of the index page on disk; it is removed again at the end.
# Binary mode: the response body is written out exactly as it was read.
with open("source.html", "wb") as f:
    f.write(br.response().read())

filetype = ".ttl.bz2"
# Test the link target itself. str(link) is mechanize's
# "Link(base_url=..., url=..., text=..., ...)" summary, not a URL.
myfiles = [l for l in br.links() if filetype in l.url]


def download_big_file(l, chunk_size=1024):
    """Stream one linked archive to disk, then decompress it in place.

    Args:
        l: a mechanize link. ``l.absolute_url`` is fetched and ``l.text`` is
            used as the local file name.
        chunk_size: number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            that an error page is never saved as if it were an archive.
    """
    local_filename = l.text
    r = requests.get(l.absolute_url, stream=True)
    try:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    finally:
        # A streamed response holds its connection until it is closed.
        r.close()
    print("%s  is downloaded. Extracting..." % local_filename)
    # Argument list instead of a shell string: the file name comes from the
    # remote page and must not be interpreted by a shell. "--" stops bzip2
    # from reading a name that starts with "-" as an option.
    subprocess.call(["bzip2", "-d", "--", local_filename])


for l in myfiles:
    sleep(5)  # pause between downloads so the server can breathe
    download_big_file(l)

print("making dbpedia.ttl ...")
# a.ttt deliberately does not match *.ttl, so the merged output is not
# deleted by the "rm *.ttl" below before it is renamed.
os.system("cat *.ttl > a.ttt")
os.system("rm source.html")
os.system("rm *.ttl")
os.system("mv a.ttt DBpedia.ttl")
print("DBpedia.ttl is created. have fun!")