Skip to content

Commit

Permalink
rewrite
Browse files Browse the repository at this point in the history
  • Loading branch information
afshinsadeghi committed Mar 26, 2018
1 parent 9e17a32 commit 5080365
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 20 deletions.
6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

41 changes: 21 additions & 20 deletions dbpediaDown.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,40 @@
import mechanize
import os
from time import sleep
import requests

br = mechanize.Browser()

br.open('http://downloads.dbpedia.org/current/core-i18n/en/')

f=open("source.html","w")
f = open("source.html", "w")
f.write(br.response().read())

filetypes=[".ttl.bz2"]
myfiles=[]
filetype = ".ttl.bz2"
myfiles = []
for l in br.links():
for t in filetypes:
if t in str(l):
myfiles.append(l)


def downloadlink(l):
# f=open(l.text,"w")
# br.click_link(l)
br._factory.is_html = True
br.retrieve(l, l.text)
# f.write(br.response().read())
print l.text," is downloaded. Extracting..."
if filetype in str(l):
myfiles.append(l)


def download_big_file(l, chunk_size=1024 * 1024):
    """Stream one linked archive to disk, then decompress it with bzip2.

    Args:
        l: link object for the file to fetch. ``l.text`` is used as the local
            file name and ``l.absolute_url`` as the download address.
        chunk_size: number of bytes requested per read while streaming the
            response body to disk.

    Raises:
        requests.HTTPError: if the server answers with an error status, so an
            HTML error page is never saved and handed to bzip2 as an archive.
    """
    import subprocess  # local import: nothing else in the script needs it

    local_filename = l.text
    # str(l) on a mechanize Link is its repr ("Link(base_url=..., url=...)"),
    # not a URL, so use the resolved address; str(l) remains the fallback for
    # link-like objects that do not expose absolute_url.
    url = getattr(l, "absolute_url", None) or str(l)
    r = requests.get(url, stream=True)
    try:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    finally:
        # Release the connection even when the download fails part-way.
        r.close()
    # Two spaces after the name reproduce the original print output exactly.
    print("%s  is downloaded. Extracting..." % local_filename)
    # Argument list instead of a shell string: the file name comes from remote
    # link text and must not be interpreted by a shell. bzip2 -d replaces the
    # archive with the decompressed file, so no separate rm is needed.
    subprocess.call(["bzip2", "-d", local_filename])

for l in myfiles:
sleep(5) # sleep so to let the server breath
downloadlink(l)

# Download and extract every collected link, one at a time.
for l in myfiles:
sleep(5) # pause 5 seconds before each download to give the server a break
download_big_file(l)

# Merge the extracted .ttl files into one file and remove the intermediates.
print "making dbpedia.ttl ..."
# The merged output is first written as a.ttt: that name does not match
# *.ttl, so it survives the "rm *.ttl" below before being renamed.
os.system("cat *.ttl > a.ttt")
# source.html is the saved copy of the download index page written earlier.
os.system("rm source.html")
os.system("rm *.ttl")
os.system("mv a.ttt DBpedia.ttl")
print "DBpedia.ttl is created. have fun!"
print "DBpedia.ttl is created. have fun!"

0 comments on commit 5080365

Please sign in to comment.