forked from practical-recommender-systems/moviegeek
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpopulate_moviegeek.py
69 lines (45 loc) · 1.74 KB
/
populate_moviegeek.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# Standalone script that downloads a movie list and stores it through the
# MovieGeeks Django models (Movie, Genre).
import os
import urllib.request
from tqdm import tqdm
# Point Django at the project's settings module, unless the environment
# already provides DJANGO_SETTINGS_MODULE (setdefault never overrides it).
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'prs_project.settings')
import django
# NOTE(review): setup() is deliberately called before the model import below;
# presumably the models cannot be imported until Django is initialised, so
# keep this ordering when reorganising the imports.
django.setup()
from moviegeeks.models import Movie, Genre
def create_movie(movie_id, title, genres):
    """Create or update one movie and link it to its genres.

    The ``Movie`` with the given ``movie_id`` is fetched (or created when it
    does not exist yet), its title and year are filled in from ``title`` and
    one ``Genre`` per ``|``-separated name in ``genres`` is attached to it.

    Args:
        movie_id: Value used for the ``Movie.movie_id`` lookup.
        title: Raw title text, expected to look like ``"Name (1995)"``.
            When no trailing ``(year)`` can be found, the whole text is
            stored as the title and the year is left untouched.
        genres: ``|``-separated genre names; may be empty or ``None``.

    Returns:
        The saved ``Movie`` instance.
    """
    movie = Movie.objects.get_or_create(movie_id=movie_id)[0]

    # Split on the LAST "(" so parentheses inside the name itself
    # ("Name (Subtitle) (1995)") are not mistaken for the year.
    name, paren, tail = title.rpartition("(")
    year = tail.strip().rstrip(")").strip()

    if paren and year.isdecimal():
        # Drop the blank that separates the name from "(year)".
        movie.title = name.strip()
        movie.year = year
    else:
        # No usable "(year)" suffix: keep the full text instead of raising
        # IndexError or storing non-numeric text as the year.
        movie.title = title.strip()

    if genres:
        for genre_name in genres.split(sep="|"):
            genre_name = genre_name.strip()
            if not genre_name:
                # A stray "|" would otherwise create a genre with no name.
                continue
            # get_or_create already persists a newly created Genre, so no
            # additional save() call is needed here.
            genre = Genre.objects.get_or_create(name=genre_name)[0]
            movie.genres.add(genre)

    movie.save()

    return movie
def download_movies(URL='https://raw.githubusercontent.com/sidooms/MovieTweetings/master/latest/movies.dat'):
    """Download the resource at ``URL`` and return its content as text.

    Args:
        URL: Location of the movie list; defaults to the latest
            MovieTweetings ``movies.dat`` file.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the connection even when read() fails,
    # instead of leaving it open until garbage collection.
    with urllib.request.urlopen(URL) as response:
        data = response.read()
    return data.decode('utf-8')
def delete_db():
    """Delete every ``Movie`` and ``Genre`` row, printing progress messages."""
    print('truncate db')

    # Delete unconditionally: deleting an empty queryset is harmless, whereas
    # the former "count > 1" guard left a single remaining movie (and all
    # genres whenever there were fewer than two movies) in the database.
    Movie.objects.all().delete()
    Genre.objects.all().delete()

    print('finished truncate db')
def populate():
    """Download the movie list and store every well-formed record.

    The latest MovieTweetings list is fetched first; when it turns out to be
    empty, the 100K snapshot is downloaded instead. Each line of the form
    ``<id>::<title>::<genres>`` is passed to ``create_movie``; lines with a
    different number of fields are skipped.
    """
    movies = download_movies()

    # A payload holding nothing but whitespace/newlines carries no records
    # either, so it must trigger the fallback just like an empty string.
    if not movies.strip():
        print('The latest dataset seems to be empty. Older movie list downloaded.')
        print('Please have a look at https://github.com/sidooms/MovieTweetings/issues and see if there is an issue')
        movies = download_movies('https://raw.githubusercontent.com/sidooms/MovieTweetings/master/snapshots/100K/movies.dat')

    print('movie data downloaded')

    for line in tqdm(movies.split(sep="\n")):
        # Strip the "\r" left behind by CRLF line endings so it does not end
        # up inside the last field (the genres).
        fields = line.rstrip("\r").split(sep="::")
        if len(fields) == 3:
            movie_id, title, genres = fields
            create_movie(movie_id, title, genres)
def _main():
    """Run the import: announce the start, wipe old data, then repopulate."""
    print("Starting MovieGeeks Population script...")
    # Order matters: existing rows are removed before the fresh import.
    delete_db()
    populate()


if __name__ == '__main__':
    _main()