Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add recommend() function for the base class of Recommender #538

Merged
merged 68 commits into from
Oct 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
68 commits
Select commit Hold shift + click to select a range
139f0eb
Dataset uid_map and iid_map are the global ones
tqtg Oct 25, 2023
89fa6d2
Add recommend API in Recommender
tqtg Oct 25, 2023
b2c8b06
Update and add new test cases
tqtg Oct 25, 2023
6124089
Fix user_iter() and item_iter()
tqtg Oct 25, 2023
0843a2e
Fix conflicts
tqtg Oct 25, 2023
33ba820
Rename functions
tqtg Oct 25, 2023
cd250be
Update AMR model
tqtg Oct 25, 2023
e67fb5b
Update BaselineOnly model
tqtg Oct 25, 2023
b6a6269
Update BiVAE model
tqtg Oct 25, 2023
ffd4354
Update BPR model
tqtg Oct 25, 2023
d969288
Update WBPR model
tqtg Oct 25, 2023
5779c7c
Update C2PF model
tqtg Oct 25, 2023
934e934
Update CausalRec model
tqtg Oct 25, 2023
f7bb13d
Update CDL model
tqtg Oct 25, 2023
6e5f436
Update CDR model
tqtg Oct 25, 2023
e3fb683
Update COE model
tqtg Oct 25, 2023
3c6e8bd
Update ComparER models
tqtg Oct 25, 2023
05748a6
Update ConvMF model
tqtg Oct 25, 2023
ded098a
Update Recommender APIs
tqtg Oct 25, 2023
92937af
Update CTR model
tqtg Oct 25, 2023
632dc5b
Update CVAE model
tqtg Oct 25, 2023
bd36ffc
Update model
tqtg Oct 25, 2023
66b26ab
Update model
tqtg Oct 25, 2023
8353533
Update model
tqtg Oct 25, 2023
437aca3
Update model
tqtg Oct 25, 2023
1f12696
Update model
tqtg Oct 25, 2023
278fc56
Update model
tqtg Oct 25, 2023
a0c3476
Update model
tqtg Oct 25, 2023
c85e7a8
Update model
tqtg Oct 25, 2023
ba614c9
Update model
tqtg Oct 25, 2023
2a0cfd3
Update model
tqtg Oct 25, 2023
cfc23f8
Update model
tqtg Oct 25, 2023
4cebdd6
Update model
tqtg Oct 25, 2023
7f051c9
Update model
tqtg Oct 25, 2023
8d03e9c
Update model
tqtg Oct 25, 2023
35b287d
Update model
tqtg Oct 25, 2023
d30a489
Update model
tqtg Oct 25, 2023
59ff4c4
Update model
tqtg Oct 25, 2023
81a823d
Update model
tqtg Oct 26, 2023
c448858
Update model
tqtg Oct 26, 2023
4c96965
Update model
tqtg Oct 26, 2023
36d2e66
Update model
tqtg Oct 26, 2023
3ef2f3d
Update model
tqtg Oct 26, 2023
882b12a
Update model
tqtg Oct 26, 2023
6b6b4df
Update model
tqtg Oct 26, 2023
a60a041
Update model
tqtg Oct 26, 2023
c27afb8
Update model
tqtg Oct 26, 2023
b745204
Update model
tqtg Oct 26, 2023
4bc3d0d
Update model
tqtg Oct 26, 2023
5a0b7d6
Update model
tqtg Oct 26, 2023
8e98d87
Update model
tqtg Oct 26, 2023
c05b6a3
Update model
tqtg Oct 26, 2023
17aa1dc
Update model
tqtg Oct 26, 2023
6e7fafb
Update model
tqtg Oct 26, 2023
f315fda
Update model
tqtg Oct 26, 2023
9bae9e3
Update model
tqtg Oct 26, 2023
a3cc3e1
Update model
tqtg Oct 26, 2023
0124979
Update model
tqtg Oct 26, 2023
1638d05
Update model
tqtg Oct 26, 2023
d0d7c98
Update model
tqtg Oct 26, 2023
8124b44
hell yeah last one
tqtg Oct 26, 2023
7510bda
remove cpp file
tqtg Oct 26, 2023
62d2563
Update test cases
tqtg Oct 26, 2023
a429d61
Let models use knows_user() and knows_item() instead of using train_set
tqtg Oct 26, 2023
8a4f2f8
Let models manage total_users and total_items instead of train_set, t…
tqtg Oct 26, 2023
654f278
Keep num_users and num_items, maybe changing to train_users and train…
tqtg Oct 26, 2023
ec273b8
Provide train_set for init() of ComparERSub
tqtg Oct 30, 2023
5c1a933
Provide train_set for init_params() of ComparERObj
tqtg Oct 30, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ __pycache__/

# C extensions
*.so
cornac/models/*/*.cpp
cornac/models/*/cython/*.cpp
cornac/utils/*.cpp

# Distribution / packaging
bin/
Expand Down
89 changes: 24 additions & 65 deletions cornac/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,13 @@ class Dataset(object):

global_mean: float
Average value over the rating observations.

uir_tuple: tuple
Tuple of three numpy arrays (user_indices, item_indices, rating_values).

timestamps: numpy.array
Numpy array of timestamps corresponding to feedback in `uir_tuple`.
This is only available when input data is in `UIRT` format.

"""

def __init__(
Expand Down Expand Up @@ -99,12 +98,8 @@ def __init__(
self.min_rating = np.min(r_values)
self.global_mean = np.mean(r_values)

self.__total_users = None
self.__total_items = None
self.__user_ids = None
self.__item_ids = None
self.__user_indices = None
self.__item_indices = None

self.__user_data = None
self.__item_data = None
Expand All @@ -114,47 +109,19 @@ def __init__(
self.__csc_matrix = None
self.__dok_matrix = None

@property
def total_users(self):
"""Total number of users including test and validation users if exists"""
return self.__total_users if self.__total_users is not None else self.num_users

@total_users.setter
def total_users(self, input_value):
"""Set total number of users for the dataset"""
assert input_value >= self.num_users
self.__total_users = input_value

@property
def total_items(self):
"""Total number of items including test and validation items if exists"""
return self.__total_items if self.__total_items is not None else self.num_items

@total_items.setter
def total_items(self, input_value):
"""Set total number of items for the dataset"""
assert input_value >= self.num_items
self.__total_items = input_value

@property
def user_ids(self):
"""An iterator over the raw user ids"""
return self.uid_map.keys()
"""Return the list of raw user ids"""
if self.__user_ids is None:
self.__user_ids = list(self.uid_map.keys())
return self.__user_ids

@property
def item_ids(self):
"""An iterator over the raw item ids"""
return self.iid_map.keys()

@property
def user_indices(self):
"""An iterator over the user indices"""
return self.uid_map.values()

@property
def item_indices(self):
"""An iterator over the item indices"""
return self.iid_map.values()
"""Return the list of raw item ids"""
if self.__item_ids is None:
self.__item_ids = list(self.iid_map.keys())
return self.__item_ids

@property
def user_data(self):
Expand Down Expand Up @@ -185,7 +152,7 @@ def item_data(self):
@property
def chrono_user_data(self):
"""Data organized by user sorted chronologically (timestamps required).
A dictionary where keys are users, values are tuples of three chronologically
A dictionary where keys are users, values are tuples of three chronologically
sorted lists (items, ratings, timestamps) interacted by the corresponding users.
"""
if self.timestamps is None:
Expand Down Expand Up @@ -214,7 +181,7 @@ def chrono_user_data(self):
@property
def chrono_item_data(self):
"""Data organized by item sorted chronologically (timestamps required).
A dictionary where keys are items, values are tuples of three chronologically
A dictionary where keys are items, values are tuples of three chronologically
sorted lists (users, ratings, timestamps) interacted with the corresponding items.
"""
if self.timestamps is None:
Expand Down Expand Up @@ -272,7 +239,7 @@ def dok_matrix(self):
"""The user-item interaction matrix in DOK sparse format"""
if self.__dok_matrix is None:
self.__dok_matrix = dok_matrix(
(self.num_users, self.num_items), dtype='float'
(self.num_users, self.num_items), dtype="float"
)
for u, i, r in zip(*self.uir_tuple):
self.__dok_matrix[u, i] = r
Expand Down Expand Up @@ -364,27 +331,29 @@ def build(
raise ValueError("data is empty after being filtered!")

uir_tuple = (
np.asarray(u_indices, dtype='int'),
np.asarray(i_indices, dtype='int'),
np.asarray(r_values, dtype='float'),
np.asarray(u_indices, dtype="int"),
np.asarray(i_indices, dtype="int"),
np.asarray(r_values, dtype="float"),
)

timestamps = (
np.fromiter((int(data[i][3]) for i in valid_idx), dtype='int')
np.fromiter((int(data[i][3]) for i in valid_idx), dtype="int")
if fmt == "UIRT"
else None
)

return cls(
dataset = cls(
num_users=len(global_uid_map),
num_items=len(global_iid_map),
uid_map=uid_map,
iid_map=iid_map,
uid_map=global_uid_map,
iid_map=global_iid_map,
uir_tuple=uir_tuple,
timestamps=timestamps,
seed=seed,
)

return dataset

@classmethod
def from_uir(cls, data, seed=None):
"""Constructing Dataset from UIR (User, Item, Rating) triplet data.
Expand All @@ -407,7 +376,7 @@ def from_uir(cls, data, seed=None):

@classmethod
def from_uirt(cls, data, seed=None):
"""Constructing Dataset from UIRT (User, Item, Rating, Timestamp)
"""Constructing Dataset from UIRT (User, Item, Rating, Timestamp)
quadruplet data.

Parameters
Expand Down Expand Up @@ -528,7 +497,6 @@ def uij_iter(self, batch_size=1, shuffle=False, neg_sampling="uniform"):
batch of negative items (array of 'int')

"""

if neg_sampling.lower() == "uniform":
neg_population = np.arange(self.num_items)
elif neg_sampling.lower() == "popularity":
Expand Down Expand Up @@ -564,7 +532,7 @@ def user_iter(self, batch_size=1, shuffle=False):
-------
iterator : batch of user indices (array of 'int')
"""
user_indices = np.fromiter(self.user_indices, dtype='int')
user_indices = np.fromiter(set(self.uir_tuple[0]), dtype="int")
for batch_ids in self.idx_iter(len(user_indices), batch_size, shuffle):
yield user_indices[batch_ids]

Expand All @@ -582,18 +550,10 @@ def item_iter(self, batch_size=1, shuffle=False):
-------
iterator : batch of item indices (array of 'int')
"""
item_indices = np.fromiter(self.item_indices, 'int')
item_indices = np.fromiter(set(self.uir_tuple[1]), "int")
for batch_ids in self.idx_iter(len(item_indices), batch_size, shuffle):
yield item_indices[batch_ids]

def is_unk_user(self, user_idx):
"""Return whether or not a user is unknown given the user index"""
return user_idx >= self.num_users

def is_unk_item(self, item_idx):
"""Return whether or not an item is unknown given the item index"""
return item_idx >= self.num_items

def add_modalities(self, **kwargs):
self.user_feature = kwargs.get("user_feature", None)
self.item_feature = kwargs.get("item_feature", None)
Expand All @@ -605,4 +565,3 @@ def add_modalities(self, **kwargs):
self.item_graph = kwargs.get("item_graph", None)
self.sentiment = kwargs.get("sentiment", None)
self.review_text = kwargs.get("review_text", None)

21 changes: 10 additions & 11 deletions cornac/eval_methods/base_method.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ def rating_eval(model, metrics, test_set, user_based=False, verbose=False):
gt_mat = test_set.csr_matrix
pd_mat = csr_matrix((r_preds, (u_indices, i_indices)), shape=gt_mat.shape)

test_user_indices = set(u_indices)
for mt in metrics:
if user_based: # averaging over users
user_results.append(
Expand All @@ -93,7 +94,7 @@ def rating_eval(model, metrics, test_set, user_based=False, verbose=False):
gt_ratings=gt_mat.getrow(user_idx).data,
pd_ratings=pd_mat.getrow(user_idx).data,
).item()
for user_idx in test_set.user_indices
for user_idx in test_user_indices
}
)
avg_results.append(sum(user_results[-1].values()) / len(user_results[-1]))
Expand Down Expand Up @@ -159,7 +160,7 @@ def ranking_eval(
avg_results = []
user_results = [{} for _ in enumerate(metrics)]

gt_mat = test_set.csr_matrix
test_mat = test_set.csr_matrix
train_mat = train_set.csr_matrix
val_mat = None if val_set is None else val_set.csr_matrix

Expand All @@ -170,10 +171,11 @@ def pos_items(csr_row):
if rating >= rating_threshold
]

test_user_indices = set(test_set.uir_tuple[0])
for user_idx in tqdm(
test_set.user_indices, desc="Ranking", disable=not verbose, miniters=100
test_user_indices, desc="Ranking", disable=not verbose, miniters=100
):
test_pos_items = pos_items(gt_mat.getrow(user_idx))
test_pos_items = pos_items(test_mat.getrow(user_idx))
if len(test_pos_items) == 0:
continue

Expand All @@ -183,9 +185,9 @@ def pos_items(csr_row):

val_pos_items = [] if val_mat is None else pos_items(val_mat.getrow(user_idx))
train_pos_items = (
[]
if train_set.is_unk_user(user_idx)
else pos_items(train_mat.getrow(user_idx))
pos_items(train_mat.getrow(user_idx))
if user_idx < train_mat.shape[0]
else []
)

# binary mask for ground-truth negative items, removing all positive items
Expand All @@ -196,7 +198,7 @@ def pos_items(csr_row):
if exclude_unknowns:
u_gt_pos_mask = u_gt_pos_mask[: train_set.num_items]
u_gt_neg_mask = u_gt_neg_mask[: train_set.num_items]

item_indices = np.nonzero(u_gt_pos_mask + u_gt_neg_mask)[0]
u_gt_pos_items = np.nonzero(u_gt_pos_mask)[0]
u_gt_neg_items = np.nonzero(u_gt_neg_mask)[0]
Expand Down Expand Up @@ -538,9 +540,6 @@ def _build_datasets(self, train_data, test_data, val_data=None):
print("Total users = {}".format(self.total_users))
print("Total items = {}".format(self.total_items))

self.train_set.total_users = self.total_users
self.train_set.total_items = self.total_items

def _build_modalities(self):
for user_modality in [
self.user_feature,
Expand Down
39 changes: 19 additions & 20 deletions cornac/eval_methods/propensity_stratified_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,38 +25,38 @@ def ranking_eval(
props=None,
):
"""Evaluate model on provided ranking metrics.

Parameters
----------
model: :obj:`cornac.models.Recommender`, required
Recommender model to be evaluated.

metrics: :obj:`iterable`, required
List of ranking metrics :obj:`cornac.metrics.RankingMetric`.

train_set: :obj:`cornac.data.Dataset`, required
Dataset to be used for model training. This will be used to exclude
observations that already appeared during training.

test_set: :obj:`cornac.data.Dataset`, required
Dataset to be used for evaluation.

val_set: :obj:`cornac.data.Dataset`, optional, default: None
Dataset to be used for model selection. This will be used to exclude
observations that already appeared during validation.

rating_threshold: float, optional, default: 1.0
The threshold to convert ratings into positive or negative feedback.

exclude_unknowns: bool, optional, default: True
Ignore unknown users and items during evaluation.

verbose: bool, optional, default: False
Output evaluation progress.

props: dictionary, optional, default: None
Item propensity scores.

Returns
-------
res: (List, List)
Expand All @@ -82,12 +82,13 @@ def pos_items(csr_row):
if rating >= rating_threshold
]

for user_idx in tqdm.tqdm(test_set.user_indices, disable=not verbose, miniters=100):
test_user_indices = set(test_set.uir_tuple[0])
for user_idx in tqdm.tqdm(test_user_indices, disable=not verbose, miniters=100):
test_pos_items = pos_items(gt_mat.getrow(user_idx))
if len(test_pos_items) == 0:
continue

u_gt_pos = np.zeros(test_set.num_items, dtype='float')
u_gt_pos = np.zeros(test_set.num_items, dtype="float")
u_gt_pos[test_pos_items] = 1

val_pos_items = [] if val_mat is None else pos_items(val_mat.getrow(user_idx))
Expand All @@ -97,7 +98,7 @@ def pos_items(csr_row):
else pos_items(train_mat.getrow(user_idx))
)

u_gt_neg = np.ones(test_set.num_items, dtype='int')
u_gt_neg = np.ones(test_set.num_items, dtype="int")
u_gt_neg[test_pos_items + val_pos_items + train_pos_items] = 0

item_indices = None if exclude_unknowns else np.arange(test_set.num_items)
Expand Down Expand Up @@ -256,7 +257,7 @@ def _estimate_propensities(self):
item_freq[i] += 1

# fit the exponential param
data = np.array([e for e in item_freq.values()], dtype='float')
data = np.array([e for e in item_freq.values()], dtype="float")
results = powerlaw.Fit(data, discrete=True, fit_method="Likelihood")
alpha = results.power_law.alpha
fmin = results.power_law.xmin
Expand All @@ -276,9 +277,7 @@ def _build_stratified_dataset(self, test_data):
self.stratified_sets = {}

# match the corresponding propensity score for each feedback
test_props = np.array(
[self.props[i] for u, i, r in test_data], dtype='float'
)
test_props = np.array([self.props[i] for u, i, r in test_data], dtype="float")

# stratify
minp = min(test_props) - 0.01 * min(test_props)
Expand Down Expand Up @@ -338,11 +337,11 @@ def evaluate(self, model, metrics, user_based, show_validation=True):
metrics: :obj:`iterable`
List of metrics.

user_based: bool, required
Evaluation strategy for the rating metrics. Whether results
user_based: bool, required
Evaluation strategy for the rating metrics. Whether results
are averaged based on number of users or number of ratings.

show_validation: bool, optional, default: True
show_validation: bool, optional, default: True
Whether to show the results on validation set (if it exists).

Returns
Expand Down
Loading
Loading