Merge branch 'icml-2024' into debiased-bs
Vincent-Maladiere committed Dec 19, 2023
2 parents 4322afe + a80546a commit d82db3b
Showing 15 changed files with 832 additions and 136 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -161,5 +161,8 @@ cython_debug/
#.idea/

.vscode

doc/generated/


# This dataset should not be redistributed, because users have to sign an agreement.
hazardous/data/seer_cancer_cardio_raw_data.txt
Binary file added doc/_static/seer_extract.png
Binary file added doc/_static/seer_home.png
1 change: 1 addition & 0 deletions doc/api.rst
@@ -40,6 +40,7 @@ Datasets
:nosignatures:

data.make_synthetic_competing_weibull
data.load_seer


Inverse Probability Censoring Weight
46 changes: 46 additions & 0 deletions doc/downloading_seer.rst
@@ -0,0 +1,46 @@

.. _downloading_seer:

How to get the SEER dataset
===========================

.. currentmodule:: hazardous

SEER is a reference dataset for cancer statistics in the US, widely used in competing risks analysis.

Below is a quick guide to obtaining it.
Note that you will need a **Windows computer** (or an emulator) to download the files.

1. Head to the `SEER data access webpage <https://seerdataaccess.cancer.gov/seer-data-access>`_, enter your email address and click on "Research Data Requests".
2. Fill in the form.
3. Confirm your email address and wait a few days for your request to be approved.
4. After receiving the confirmation email, go to the `SEER*Stat Software webpage <https://seer.cancer.gov/seerstat/software/>`_ to download the software on Windows.
5. Open it and sign in with the SEER credentials received by email.

.. image:: _static/seer_home.png
:width: 300


.. raw:: html

<br></br>


6. Use SEER*Stat to open the ``data/seer.sl`` file.

.. image:: _static/seer_extract.png
:width: 400


.. raw:: html

<br></br>


You should obtain a ``.txt`` file, which can be loaded with :func:`hazardous.data.load_seer`.
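
A minimal sketch of the loading step (the exact call is an assumption for
illustration: check the docstring of :func:`hazardous.data.load_seer` for the
actual signature and return type):

.. code-block:: python

    from hazardous.data import load_seer

    # Path to the file exported by SEER*Stat; it cannot be
    # redistributed, so keep it out of version control.
    seer_path = "hazardous/data/seer_cancer_cardio_raw_data.txt"
    data = load_seer(seer_path)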


.. raw:: html

<br></br>

1 change: 1 addition & 0 deletions doc/index.rst
@@ -40,3 +40,4 @@ of this library at this time.

api
auto_examples/index
downloading_seer
117 changes: 74 additions & 43 deletions examples/plot_complex_data.py
@@ -13,42 +13,41 @@
"""

# %%
import numpy as np
import hazardous.data._competing_weibull as competing_w
from time import perf_counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils import check_random_state
import pandas as pd
import seaborn as sns

from hazardous import GradientBoostingIncidence
from lifelines import AalenJohansenFitter

seed = 0
rng = check_random_state(seed)


# %%
# In the following cell, we verify that the synthetic dataset is well defined.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from hazardous.data._competing_weibull import compute_shape_and_scale

df_features = pd.DataFrame(rng.randn(100_000, 10))
df_features.columns = [f"feature_{i}" for i in range(10)]

df_shape_scale_star = competing_w.compute_shape_and_scale(
df_features,
X = rng.randn(10_000, 10)
column_names = [f"feature_{i}" for i in range(X.shape[1])]
X = pd.DataFrame(X, columns=column_names)

SS_star = compute_shape_and_scale(
X,
features_rate=0.2,
n_events=3,
degree_interaction=2,
random_state=0,
)

fig, axes = plt.subplots(2, 3, figsize=(10, 7))
for idx, col in enumerate(df_shape_scale_star.columns):
sns.histplot(df_shape_scale_star[col], ax=axes[idx // 3][idx % 3])
for idx, col in enumerate(SS_star.columns):
sns.histplot(SS_star[col], ax=axes[idx // 3][idx % 3])

# %%
from hazardous.data._competing_weibull import make_synthetic_competing_weibull


X, y_censored, y_uncensored = competing_w.make_synthetic_competing_weibull(
X, y_censored, y_uncensored = make_synthetic_competing_weibull(
n_samples=30_000,
base_scale=1_000,
n_features=10,
@@ -57,20 +56,35 @@
independent_censoring=True,
features_censoring_rate=0.2,
return_uncensored_data=True,
return_X_y=True,
feature_rounding=3,
target_rounding=None,
censoring_relative_scale=1.5,
random_state=0,
complex_features=True,
return_X_y=True,
random_state=seed,
)

print(y_censored["event"].value_counts().sort_index())

sns.histplot(
y_censored,
x="duration",
hue="event",
multiple="stack",
palette="magma",
)

# %%
n_samples = len(X)
calculate_variance = n_samples <= 5_000
from lifelines import AalenJohansenFitter


calculate_variance = X.shape[0] <= 5_000
aj = AalenJohansenFitter(calculate_variance=calculate_variance, seed=0)
aj

# %%
from hazardous._gradient_boosting_incidence import GradientBoostingIncidence

y_censored["event"].value_counts()

gb_incidence = GradientBoostingIncidence(
learning_rate=0.1,
@@ -82,7 +96,7 @@
show_progressbar=False,
random_state=seed,
)

gb_incidence

# %%
#
@@ -92,9 +106,18 @@
# Let's now estimate the CIFs on uncensored data and plot them against the
# theoretical CIFs:

import numpy as np
from time import perf_counter


def plot_cumulative_incidence_functions(
X, y, gb_incidence=None, aj=None, X_test=None, y_test=None
X,
y,
gb_incidence=None,
aj=None,
X_test=None,
y_test=None,
verbose=False,
):
n_events = y["event"].max()
t_max = y["duration"].max()
@@ -113,17 +136,28 @@ def plot_cumulative_incidence_functions(
gb_incidence.set_params(event_of_interest=event_id)
gb_incidence.fit(X, y)
duration = perf_counter() - tic
print(f"GB Incidence for event {event_id} fit in {duration:.3f} s")

if verbose:
print(f"GB Incidence for event {event_id} fit in {duration:.3f} s")

tic = perf_counter()
cifs_pred = gb_incidence.predict_cumulative_incidence(X, coarse_timegrid)
cif_mean = cifs_pred.mean(axis=0)
duration = perf_counter() - tic
print(f"GB Incidence for event {event_id} prediction in {duration:.3f} s")
print("Brier score on training data:", -gb_incidence.score(X, y))
if X_test is not None:

if verbose:
print(
"Brier score on testing data:", -gb_incidence.score(X_test, y_test)
f"GB Incidence for event {event_id} prediction in {duration:.3f} s"
)

if verbose:
brier_score_train = -gb_incidence.score(X, y)
print(f"Brier score on training data: {brier_score_train:.3f}")
if X_test is not None:
brier_score_test = -gb_incidence.score(X_test, y_test)
print(
f"Brier score on testing data: {brier_score_test:.3f}",
)
ax.plot(
coarse_timegrid,
cif_mean,
@@ -135,39 +169,36 @@ def plot_cumulative_incidence_functions(
tic = perf_counter()
aj.fit(y["duration"], y["event"], event_of_interest=event_id)
duration = perf_counter() - tic
print(f"Aalen-Johansen for event {event_id} fit in {duration:.3f} s")
if verbose:
print(f"Aalen-Johansen for event {event_id} fit in {duration:.3f} s")

aj.plot(label="Aalen-Johansen", ax=ax)
ax.set_xlim(0, 8_000)
ax.set_ylim(0, 0.5)

if event_id == 1:
ax.legend(loc="lower right")
else:
ax.legend().remove()

if verbose:
print("=" * 16, "\n")


# %%
from sklearn.model_selection import train_test_split


X_train, X_test, y_train_c, y_test_c = train_test_split(
X, y_censored, test_size=0.3, random_state=seed
)
y_train_u = y_uncensored.loc[y_train_c.index]
y_test_u = y_uncensored.loc[y_test_c.index]

gb_incidence = GradientBoostingIncidence(
learning_rate=0.1,
n_iter=20,
max_leaf_nodes=15,
hard_zero_fraction=0.1,
min_samples_leaf=5,
loss="ibs",
show_progressbar=False,
random_state=seed,
)

plot_cumulative_incidence_functions(
X_train,
y_train_u,
gb_incidence=None,
gb_incidence=gb_incidence,
aj=aj,
X_test=X_test,
y_test=y_test_u,
@@ -176,7 +207,7 @@ def plot_cumulative_incidence_functions(
plot_cumulative_incidence_functions(
X_train,
y_train_c,
gb_incidence=None,
gb_incidence=gb_incidence,
aj=aj,
X_test=X_test,
y_test=y_test_c,
3 changes: 2 additions & 1 deletion hazardous/data/__init__.py
@@ -1,3 +1,4 @@
from ._competing_weibull import make_synthetic_competing_weibull
from ._seer import load_seer

__all__ = ["make_synthetic_competing_weibull"]
__all__ = ["make_synthetic_competing_weibull", "load_seer"]