side info

SongFGH · Nov 1, 2018 · e4d3b5e · e4d3b5e
1 parent 83a85e3
commit e4d3b5e
Show file tree

Hide file tree

Showing 6 changed files with 251 additions and 50 deletions.
diff --git a/Preprocess.ipynb b/Preprocess.ipynb
@@ -4,12 +4,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Train/Valid/Test split"
+    "# 1M split"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 89,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -20,7 +20,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 90,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -36,7 +36,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 91,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -48,12 +48,12 @@
     "\n",
     "rating_train = ratings[:len_train]\n",
     "rating_val   = ratings[len_train:len_val]\n",
-    "rating_test  = ratings[len_val:len_test]"
+    "rating_test  = ratings[len_val:]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 92,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -64,7 +64,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 93,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -79,6 +79,186 @@
     "        rating_mtx[r, u, v] = 1\n",
     "    torch.save(rating_mtx, './data/rating_%d.pkl'%i)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 94,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "users_headers = ['user id', 'gender', 'age', 'occupation', 'zip code']\n",
+    "users_df = pd.read_csv('./data/ml_1m/users.dat', sep = '::', header = None, names = users_headers, engine = 'python', encoding = 'latin-1')\n",
+    "movie_headers = ['movie id', 'movie title', 'genre']\n",
+    "movie_df = pd.read_csv('./data/ml_1m/movies.dat', sep = '::', header = None, names = movie_headers, engine = 'python', encoding = 'latin-1')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 97,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "occupation = set(users_df['occupation'].values.tolist())\n",
+    "age_dict = {1:0., 18:1., 25:2., 35:3., 45:4., 50:5., 56:6.}\n",
+    "gender_dict = {'M': 0., 'F': 1.}\n",
+    "occupation_dict = {f: i for i, f in enumerate(occupation, start=2)}\n",
+    "\n",
+    "num_feats = 2 + len(occupation_dict)\n",
+    "\n",
+    "u_features = np.zeros((num_users, num_feats), dtype=np.float32)\n",
+    "for _, row in users_df.iterrows():\n",
+    "    u_id = row['user id']-1\n",
+    "    # age\n",
+    "    u_features[u_id, 0] = age_dict[row['age']]\n",
+    "    # gender\n",
+    "    u_features[u_id, 1] = gender_dict[row['gender']]\n",
+    "    # occupation\n",
+    "    u_features[u_id, occupation_dict[row['occupation']]] = 1.\n",
+    "torch.save(torch.from_numpy(u_features), './data/ml_1m/u_features.pkl')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 109,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "genre_dict = {'Action':0, 'Adventure':1, 'Animation':2, \"Children's\":3, 'Comedy':4,\n",
+    "              'Crime':5, 'Documentary':6, 'Drama':7, 'Fantasy':8, 'Film-Noir':9, 'Horror':10,\n",
+    "              'Musical':11, 'Mystery':12, 'Romance':13, 'Sci-Fi':14, 'Thriller':15,\n",
+    "              'War':16, 'Western':17}\n",
+    "num_genres = len(genre_dict)\n",
+    "\n",
+    "v_features = np.zeros((num_items, num_genres), dtype=np.float32)\n",
+    "for movie_id, g_vec in zip(movie_df['movie id'].values.tolist(), movie_df['genre'].values.tolist()):\n",
+    "    # row index is movie_id-1; set a 1 in the column of every genre listed for the movie\n",
+    "    for j in [genre_dict[g] for g in g_vec.split('|')]:\n",
+    "        v_features[movie_id-1][j] = 1\n",
+    "\n",
+    "torch.save(torch.from_numpy(v_features), './data/ml_1m/v_features.pkl')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 100K split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train = pd.read_csv('./data/ml_100k/u1.base', sep = '\\t', header = None, engine = 'python', encoding = 'latin-1')\n",
+    "test  = pd.read_csv('./data/ml_100k/u1.test', sep = '\\t', header = None, engine = 'python', encoding = 'latin-1')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_length = len(train)\n",
+    "train = train.sample(frac=1)\n",
+    "\n",
+    "len_train = int(train_length*0.9)\n",
+    "\n",
+    "rating_train = train[:len_train]\n",
+    "rating_val   = train[len_train:]\n",
+    "rating_test  = test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "num_users = 943\n",
+    "num_items = 1682\n",
+    "rating_cnt= 5"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i, ratings in enumerate([rating_train, rating_val, rating_test]):\n",
+    "    rating_mtx = torch.zeros(rating_cnt, num_users, num_items)\n",
+    "    \n",
+    "    for index, row in ratings.iterrows():\n",
+    "        u = row[0]-1\n",
+    "        v = row[1]-1\n",
+    "        r = row[2]-1\n",
+    "        \n",
+    "        rating_mtx[r, u, v] = 1\n",
+    "    torch.save(rating_mtx, './data/ml_100k/rating_%d.pkl'%i)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "users_headers = ['user id', 'age', 'gender', 'occupation', 'zip code']\n",
+    "users_df = pd.read_csv('./data/ml_100k/u.user', sep = '|', header = None, names = users_headers, engine = 'python', encoding = 'latin-1')\n",
+    "movie_headers = ['movie id', 'movie title', 'release date', 'video release date',\n",
+    "                 'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation',\n",
+    "                 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',\n",
+    "                 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',\n",
+    "                 'Thriller', 'War', 'Western']\n",
+    "movie_df = pd.read_csv('./data/ml_100k/u.item', sep = '|', header = None, names = movie_headers, engine = 'python', encoding = 'latin-1')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "occupation = set(users_df['occupation'].values.tolist())\n",
+    "age = users_df['age'].values\n",
+    "age_max = age.max()\n",
+    "gender_dict = {'M': 0., 'F': 1.}\n",
+    "occupation_dict = {f: i for i, f in enumerate(occupation, start=2)}\n",
+    "\n",
+    "num_feats = 2 + len(occupation_dict)\n",
+    "\n",
+    "u_features = np.zeros((num_users, num_feats), dtype=np.float32)\n",
+    "for _, row in users_df.iterrows():\n",
+    "    u_id = row['user id']-1\n",
+    "    # age\n",
+    "    u_features[u_id, 0] = row['age'] / np.float(age_max)\n",
+    "    # gender\n",
+    "    u_features[u_id, 1] = gender_dict[row['gender']]\n",
+    "    # occupation\n",
+    "    u_features[u_id, occupation_dict[row['occupation']]] = 1.\n",
+    "torch.save(torch.from_numpy(u_features), './data/ml_100k/u_features.pkl')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "genre_headers = movie_df.columns.values[6:]\n",
+    "num_genres = genre_headers.shape[0]\n",
+    "\n",
+    "v_features = np.zeros((num_items, num_genres), dtype=np.float32)\n",
+    "for movie_id, g_vec in zip(movie_df['movie id'].values.tolist(), movie_df[genre_headers].values.tolist()):\n",
+    "    # row index is movie_id-1; copy the movie's genre_headers columns into that row\n",
+    "    v_features[movie_id-1] = g_vec\n",
+    "torch.save(torch.from_numpy(v_features), './data/ml_100k/v_features.pkl')"
+   ]
   }
  ],
  "metadata": {

diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 
 PyTorch-based implementation of Graph Convolutional Matrix Completion for recommender systems, based on the [Kipf and Welling](https://arxiv.org/abs/1706.02263) (2017) paper. Our implementation also follows their [source code](https://github.com/riannevdberg/gc-mc).
 
-This code only covers the Movielens 1M Dataset.
+This code only covers the MovieLens 1M and 100K datasets.
 
 After downloading [ml_1m](https://grouplens.org/datasets/movielens/) to the ```./data``` directory, you need to preprocess it with ```Preprocess.ipynb```.
 

diff --git a/config.py b/config.py
@@ -7,15 +7,14 @@ def get_args():
     parser.add_argument('--mode', type=str, default="train",
                                   help='train / test')
     parser.add_argument('--model-path', type=str, default="./models")
-    parser.add_argument('--data-path', type=str, default="./data")
+    parser.add_argument('--data-path', type=str, default="./data/ml_1m/")
     parser.add_argument('--data-shuffle', type=bool, default=True)
     parser.add_argument('--batch-size', type=int, default=128)
     parser.add_argument('--num-epochs', type=int, default=200)
     parser.add_argument('--val-step', type=int, default=5)
     parser.add_argument('--test-epoch', type=int, default=50)
     parser.add_argument('--start-epoch', type=int, default=0)
     parser.add_argument('--neg-cnt', type=int, default=100)
-    parser.add_argument('--at-k', type=int, default=10)
     parser.add_argument('--lr', type=float, default=0.01)
     parser.add_argument('--weight_decay', type=float, default=0.005)
     parser.add_argument('--dropout', type=float, default=0.7)
@@ -28,9 +27,11 @@ def get_args():
     parser.add_argument('--item-cnt', type=int, default=3953)
     parser.add_argument('--class-cnt', type=int, default=5)
 
-    parser.add_argument('--train-path', type=str, default='./data/rating_train.pkl')
-    parser.add_argument('--val-path', type=str, default='./data/rating_val.pkl')
-    parser.add_argument('--test-path', type=str, default='./data/rating_test.pkl')
+    parser.add_argument('--users-path', type=str, default='u_features.pkl')
+    parser.add_argument('--movie-path', type=str, default='v_features.pkl')
+    parser.add_argument('--train-path', type=str, default='rating_train.pkl')
+    parser.add_argument('--val-path', type=str, default='rating_val.pkl')
+    parser.add_argument('--test-path', type=str, default='rating_test.pkl')
 
     args = parser.parse_args()
 

diff --git a/layers.py b/layers.py
@@ -11,31 +11,34 @@ class GraphConvolution(Module):
     Simple GCN layer, similar to https://arxiv.org/abs/1609.02907
     """
 
-    def __init__(self, in_features, hidden, num_classes, dropout, bias=True):
+    def __init__(self, u_features, v_features, hidden, num_classes, dropout, bias=True):
         super(GraphConvolution, self).__init__()
-        self.in_features = in_features
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
         self.dropout = nn.Dropout(dropout)
-        self.weight = [Parameter(torch.randn(in_features, hidden)).to(self.device)
-                       for _ in range(num_classes)]
+        self.u_weight = [Parameter(torch.randn(u_features, hidden)).to(self.device)
+                         for _ in range(num_classes)]
+        self.v_weight = [Parameter(torch.randn(v_features, hidden)).to(self.device)
+                         for _ in range(num_classes)]
         if bias:
             self.bias = Parameter(torch.randn(hidden)).to(self.device)
         else:
             self.bias = None
-        for weight in self.weight:
+        for weight in self.u_weight+self.v_weight:
             nn.init.xavier_normal_(weight)
 
 
-    def forward(self, input, adj, degree, r):
+    def forward(self, u, v, adj, degree, r):
         adj = torch.cat((torch.cat((torch.zeros(adj.size(0), adj.size(0)).to(self.device), adj), 1),
                          torch.cat((adj.t(), torch.zeros(adj.size(1), adj.size(1)).to(self.device)), 1)), 0)
         diag = torch.diag(degree)
         adj = torch.spmm(diag, adj)
 
-        input = self.dropout(input)
-        weight = torch.sum(torch.stack([self.weight[i] for i in range(r+1)], 0), 0)
-        support = torch.mm(input, weight)
+        u = self.dropout(u)
+        v = self.dropout(v)
+        u_weight = torch.sum(torch.stack([self.u_weight[i] for i in range(r+1)], 0), 0)
+        v_weight = torch.sum(torch.stack([self.v_weight[i] for i in range(r+1)], 0), 0)
+        support = torch.cat((torch.mm(u, u_weight), torch.mm(v, v_weight)), 0)
         if self.bias is not None:
             support += self.bias
         output = torch.spmm(adj, support)