Skip to content

Commit

Permalink
side info
Browse files Browse the repository at this point in the history
  • Loading branch information
yongqyu committed Nov 1, 2018
1 parent 83a85e3 commit e4d3b5e
Show file tree
Hide file tree
Showing 6 changed files with 251 additions and 50 deletions.
194 changes: 187 additions & 7 deletions Preprocess.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Train/Valid/Test split"
"# 1M split"
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 89,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -20,7 +20,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 90,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -36,7 +36,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 91,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -48,12 +48,12 @@
"\n",
"rating_train = ratings[:len_train]\n",
"rating_val = ratings[len_train:len_val]\n",
"rating_test = ratings[len_val:len_test]"
"rating_test = ratings[len_val:]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 92,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -64,7 +64,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -79,6 +79,186 @@
" rating_mtx[r, u, v] = 1\n",
" torch.save(rating_mtx, './data/rating_%d.pkl'%i)"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"users_headers = ['user id', 'gender', 'age', 'occupation', 'zip code']\n",
"users_df = pd.read_csv('./data/ml_1m/users.dat', sep = '::', header = None, names = users_headers, engine = 'python', encoding = 'latin-1')\n",
"movie_headers = ['movie id', 'movie title', 'genre']\n",
"movie_df = pd.read_csv('./data/ml_1m/movies.dat', sep = '::', header = None, names = movie_headers, engine = 'python', encoding = 'latin-1')\n"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"occupation = set(users_df['occupation'].values.tolist())\n",
"age_dict = {1:0., 18:1., 25:2., 35:3., 45:4., 50:5., 56:6.}\n",
"gender_dict = {'M': 0., 'F': 1.}\n",
"occupation_dict = {f: i for i, f in enumerate(occupation, start=2)}\n",
"\n",
"num_feats = 2 + len(occupation_dict)\n",
"\n",
"u_features = np.zeros((num_users, num_feats), dtype=np.float32)\n",
"for _, row in users_df.iterrows():\n",
" u_id = row['user id']-1\n",
" # age\n",
" u_features[u_id, 0] = age_dict[row['age']]\n",
" # gender\n",
" u_features[u_id, 1] = gender_dict[row['gender']]\n",
" # occupation\n",
" u_features[u_id, occupation_dict[row['occupation']]] = 1.\n",
"torch.save(torch.from_numpy(u_features), './data/ml_1m/u_features.pkl')"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [],
"source": [
"genre_dict = {'Action':0, 'Adventure':1, 'Animation':2, \"Children's\":3, 'Comedy':4,\n",
" 'Crime':5, 'Documentary':6, 'Drama':7, 'Fantasy':8, 'Film-Noir':9, 'Horror':10,\n",
" 'Musical':11, 'Mystery':12, 'Romance':13, 'Sci-Fi':14, 'Thriller':15,\n",
" 'War':16, 'Western':17}\n",
"num_genres = len(genre_dict)\n",
"\n",
"v_features = np.zeros((num_items, num_genres), dtype=np.float32)\n",
"for movie_id, g_vec in zip(movie_df['movie id'].values.tolist(), movie_df['genre'].values.tolist()):\n",
" # check if movie_id was listed in ratings file and therefore in mapping dictionary\n",
" for j in [genre_dict[g] for g in g_vec.split('|')]:\n",
" v_features[movie_id-1][j] = 1\n",
"\n",
"torch.save(torch.from_numpy(v_features), './data/ml_1m/v_features.pkl')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 100K split"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"train = pd.read_csv('./data/ml_100k/u1.base', sep = '\\t', header = None, engine = 'python', encoding = 'latin-1')\n",
"test = pd.read_csv('./data/ml_100k/u1.test', sep = '\\t', header = None, engine = 'python', encoding = 'latin-1')\n"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"train_length = len(train)\n",
"train = train.sample(frac=1)\n",
"\n",
"len_train = int(train_length*0.9)\n",
"\n",
"rating_train = train[:len_train]\n",
"rating_val = train[len_train:]\n",
"rating_test = test"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"num_users = 943\n",
"num_items = 1682\n",
"rating_cnt= 5"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"for i, ratings in enumerate([rating_train, rating_val, rating_test]):\n",
" rating_mtx = torch.zeros(rating_cnt, num_users, num_items)\n",
" \n",
" for index, row in ratings.iterrows():\n",
" u = row[0]-1\n",
" v = row[1]-1\n",
" r = row[2]-1\n",
" \n",
" rating_mtx[r, u, v] = 1\n",
" torch.save(rating_mtx, './data/ml_100k/rating_%d.pkl'%i)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"users_headers = ['user id', 'age', 'gender', 'occupation', 'zip code']\n",
"users_df = pd.read_csv('./data/ml_100k/u.user', sep = '|', header = None, names = users_headers, engine = 'python', encoding = 'latin-1')\n",
"movie_headers = ['movie id', 'movie title', 'release date', 'video release date',\n",
" 'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation',\n",
" 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',\n",
" 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',\n",
" 'Thriller', 'War', 'Western']\n",
"movie_df = pd.read_csv('./data/ml_100k/u.item', sep = '|', header = None, names = movie_headers, engine = 'python', encoding = 'latin-1')\n"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"occupation = set(users_df['occupation'].values.tolist())\n",
"age = users_df['age'].values\n",
"age_max = age.max()\n",
"gender_dict = {'M': 0., 'F': 1.}\n",
"occupation_dict = {f: i for i, f in enumerate(occupation, start=2)}\n",
"\n",
"num_feats = 2 + len(occupation_dict)\n",
"\n",
"u_features = np.zeros((num_users, num_feats), dtype=np.float32)\n",
"for _, row in users_df.iterrows():\n",
" u_id = row['user id']-1\n",
" # age\n",
" u_features[u_id, 0] = row['age'] / np.float(age_max)\n",
" # gender\n",
" u_features[u_id, 1] = gender_dict[row['gender']]\n",
" # occupation\n",
" u_features[u_id, occupation_dict[row['occupation']]] = 1.\n",
"torch.save(torch.from_numpy(u_features), './data/ml_100k/u_features.pkl')"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"genre_headers = movie_df.columns.values[6:]\n",
"num_genres = genre_headers.shape[0]\n",
"\n",
"v_features = np.zeros((num_items, num_genres), dtype=np.float32)\n",
"for movie_id, g_vec in zip(movie_df['movie id'].values.tolist(), movie_df[genre_headers].values.tolist()):\n",
" # check if movie_id was listed in ratings file and therefore in mapping dictionary\n",
" v_features[movie_id-1] = g_vec\n",
"torch.save(torch.from_numpy(v_features), './data/ml_100k/v_features.pkl')"
]
}
],
"metadata": {
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

PyTorch-based implementation of Graph Convolutional Matrix Completion for recommender systems, based on the [Kipf and Welling](https://arxiv.org/abs/1706.02263) (2017) paper. Our implementation also follows their [source code](https://github.com/riannevdberg/gc-mc).

This code only covers the Movielens 1M Dataset.
This code only covers the MovieLens 1M and 100K datasets.

After downloading [ml_1m](https://grouplens.org/datasets/movielens/) to the ```./data``` directory, you need to preprocess it with ```Preprocess.ipynb```.

Expand Down
11 changes: 6 additions & 5 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,14 @@ def get_args():
parser.add_argument('--mode', type=str, default="train",
help='train / test')
parser.add_argument('--model-path', type=str, default="./models")
parser.add_argument('--data-path', type=str, default="./data")
parser.add_argument('--data-path', type=str, default="./data/ml_1m/")
parser.add_argument('--data-shuffle', type=bool, default=True)
parser.add_argument('--batch-size', type=int, default=128)
parser.add_argument('--num-epochs', type=int, default=200)
parser.add_argument('--val-step', type=int, default=5)
parser.add_argument('--test-epoch', type=int, default=50)
parser.add_argument('--start-epoch', type=int, default=0)
parser.add_argument('--neg-cnt', type=int, default=100)
parser.add_argument('--at-k', type=int, default=10)
parser.add_argument('--lr', type=float, default=0.01)
parser.add_argument('--weight_decay', type=float, default=0.005)
parser.add_argument('--dropout', type=float, default=0.7)
Expand All @@ -28,9 +27,11 @@ def get_args():
parser.add_argument('--item-cnt', type=int, default=3953)
parser.add_argument('--class-cnt', type=int, default=5)

parser.add_argument('--train-path', type=str, default='./data/rating_train.pkl')
parser.add_argument('--val-path', type=str, default='./data/rating_val.pkl')
parser.add_argument('--test-path', type=str, default='./data/rating_test.pkl')
parser.add_argument('--users-path', type=str, default='u_features.pkl')
parser.add_argument('--movie-path', type=str, default='v_features.pkl')
parser.add_argument('--train-path', type=str, default='rating_train.pkl')
parser.add_argument('--val-path', type=str, default='rating_val.pkl')
parser.add_argument('--test-path', type=str, default='rating_test.pkl')

args = parser.parse_args()

Expand Down
21 changes: 12 additions & 9 deletions layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,31 +11,34 @@ class GraphConvolution(Module):
Simple GCN layer, similar to https://arxiv.org/abs/1609.02907
"""

def __init__(self, in_features, hidden, num_classes, dropout, bias=True):
def __init__(self, u_features, v_features, hidden, num_classes, dropout, bias=True):
super(GraphConvolution, self).__init__()
self.in_features = in_features
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

self.dropout = nn.Dropout(dropout)
self.weight = [Parameter(torch.randn(in_features, hidden)).to(self.device)
for _ in range(num_classes)]
self.u_weight = [Parameter(torch.randn(u_features, hidden)).to(self.device)
for _ in range(num_classes)]
self.v_weight = [Parameter(torch.randn(v_features, hidden)).to(self.device)
for _ in range(num_classes)]
if bias:
self.bias = Parameter(torch.randn(hidden)).to(self.device)
else:
self.bias = None
for weight in self.weight:
for weight in self.u_weight+self.v_weight:
nn.init.xavier_normal_(weight)


def forward(self, input, adj, degree, r):
def forward(self, u, v, adj, degree, r):
adj = torch.cat((torch.cat((torch.zeros(adj.size(0), adj.size(0)).to(self.device), adj), 1),
torch.cat((adj.t(), torch.zeros(adj.size(1), adj.size(1)).to(self.device)), 1)), 0)
diag = torch.diag(degree)
adj = torch.spmm(diag, adj)

input = self.dropout(input)
weight = torch.sum(torch.stack([self.weight[i] for i in range(r+1)], 0), 0)
support = torch.mm(input, weight)
u = self.dropout(u)
v = self.dropout(v)
u_weight = torch.sum(torch.stack([self.u_weight[i] for i in range(r+1)], 0), 0)
v_weight = torch.sum(torch.stack([self.v_weight[i] for i in range(r+1)], 0), 0)
support = torch.cat((torch.mm(u, u_weight), torch.mm(v, v_weight)), 0)
if self.bias is not None:
support += self.bias
output = torch.spmm(adj, support)
Expand Down
Loading

0 comments on commit e4d3b5e

Please sign in to comment.