Merge branch 'icml-2024' into debiased-bs
Vincent-Maladiere committed Dec 19, 2023
2 parents 4322afe + a80546a commit d82db3b
Showing 15 changed files with 832 additions and 136 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -161,5 +161,8 @@ cython_debug/
#.idea/

.vscode

doc/generated/


# This dataset should not be redistributed, because users have to sign an agreement.
hazardous/data/seer_cancer_cardio_raw_data.txt
Binary file added doc/_static/seer_extract.png
Binary file added doc/_static/seer_home.png
1 change: 1 addition & 0 deletions doc/api.rst
@@ -40,6 +40,7 @@ Datasets
:nosignatures:

data.make_synthetic_competing_weibull
data.load_seer


Inverse Probability Censoring Weight
46 changes: 46 additions & 0 deletions doc/downloading_seer.rst
@@ -0,0 +1,46 @@

.. _downloading_seer:

How to get the SEER dataset
===========================

.. currentmodule:: hazardous

SEER is a reference dataset for cancer statistics in the US, widely used in competing risks analysis.

Below is a quick guide to obtaining it.
Note that you will need a **Windows computer** (or an emulator) to download the files.

1. Head to the `SEER data access webpage <https://seerdataaccess.cancer.gov/seer-data-access>`_, enter your email address and click on "Research Data Requests".
2. Fill in the form.
3. Confirm your email address and wait a few days for your request to be approved.
4. After receiving the confirmation email, go to the `SEER*Stat Software webpage <https://seer.cancer.gov/seerstat/software/>`_ to download the software on Windows.
5. Open it and sign in with the SEER credentials received by email.

.. image:: _static/seer_home.png
:width: 300


.. raw:: html

<br></br>


6. Use SEER*Stat to open the ``data/seer.sl`` file.

.. image:: _static/seer_extract.png
:width: 400


.. raw:: html

<br></br>


You should obtain a ``.txt`` file, which can be loaded with :func:`hazardous.data.load_seer`.
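
A minimal sketch of the loading step (the exact call is an assumption for
illustration: check the docstring of :func:`hazardous.data.load_seer` for the
actual signature and return type):

.. code-block:: python

    from hazardous.data import load_seer

    # Path to the file exported by SEER*Stat; it cannot be
    # redistributed, so keep it out of version control.
    seer_path = "hazardous/data/seer_cancer_cardio_raw_data.txt"
    data = load_seer(seer_path)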


.. raw:: html

<br></br>

1 change: 1 addition & 0 deletions doc/index.rst
@@ -40,3 +40,4 @@ of this library at this time.

api
auto_examples/index
downloading_seer
117 changes: 74 additions & 43 deletions examples/plot_complex_data.py
@@ -13,42 +13,41 @@
"""

# %%
import numpy as np
import hazardous.data._competing_weibull as competing_w
from time import perf_counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils import check_random_state
import pandas as pd
import seaborn as sns

from hazardous import GradientBoostingIncidence
from lifelines import AalenJohansenFitter

seed = 0
rng = check_random_state(seed)


# %%
# In the following cell, we verify that the synthetic dataset is well defined.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from hazardous.data._competing_weibull import compute_shape_and_scale

df_features = pd.DataFrame(rng.randn(100_000, 10))
df_features.columns = [f"feature_{i}" for i in range(10)]

df_shape_scale_star = competing_w.compute_shape_and_scale(
df_features,
X = rng.randn(10_000, 10)
column_names = [f"feature_{i}" for i in range(X.shape[1])]
X = pd.DataFrame(X, columns=column_names)

SS_star = compute_shape_and_scale(
X,
features_rate=0.2,
n_events=3,
degree_interaction=2,
random_state=0,
)

fig, axes = plt.subplots(2, 3, figsize=(10, 7))
for idx, col in enumerate(df_shape_scale_star.columns):
sns.histplot(df_shape_scale_star[col], ax=axes[idx // 3][idx % 3])
for idx, col in enumerate(SS_star.columns):
sns.histplot(SS_star[col], ax=axes[idx // 3][idx % 3])

# %%
from hazardous.data._competing_weibull import make_synthetic_competing_weibull


X, y_censored, y_uncensored = competing_w.make_synthetic_competing_weibull(
X, y_censored, y_uncensored = make_synthetic_competing_weibull(
n_samples=30_000,
base_scale=1_000,
n_features=10,
@@ -57,20 +56,35 @@
independent_censoring=True,
features_censoring_rate=0.2,
return_uncensored_data=True,
return_X_y=True,
feature_rounding=3,
target_rounding=None,
censoring_relative_scale=1.5,
random_state=0,
complex_features=True,
return_X_y=True,
random_state=seed,
)

print(y_censored["event"].value_counts().sort_index())

sns.histplot(
y_censored,
x="duration",
hue="event",
multiple="stack",
palette="magma",
)

# %%
n_samples = len(X)
calculate_variance = n_samples <= 5_000
from lifelines import AalenJohansenFitter


calculate_variance = X.shape[0] <= 5_000
aj = AalenJohansenFitter(calculate_variance=calculate_variance, seed=0)
aj

# %%
from hazardous._gradient_boosting_incidence import GradientBoostingIncidence

y_censored["event"].value_counts()

gb_incidence = GradientBoostingIncidence(
learning_rate=0.1,
@@ -82,7 +96,7 @@
show_progressbar=False,
random_state=seed,
)

gb_incidence

# %%
#
@@ -92,9 +106,18 @@
# Let's now estimate the CIFs on uncensored data and plot them against the
# theoretical CIFs:

import numpy as np
from time import perf_counter


def plot_cumulative_incidence_functions(
X, y, gb_incidence=None, aj=None, X_test=None, y_test=None
X,
y,
gb_incidence=None,
aj=None,
X_test=None,
y_test=None,
verbose=False,
):
n_events = y["event"].max()
t_max = y["duration"].max()
@@ -113,17 +136,28 @@ def plot_cumulative_incidence_functions(
gb_incidence.set_params(event_of_interest=event_id)
gb_incidence.fit(X, y)
duration = perf_counter() - tic
print(f"GB Incidence for event {event_id} fit in {duration:.3f} s")

if verbose:
print(f"GB Incidence for event {event_id} fit in {duration:.3f} s")

tic = perf_counter()
cifs_pred = gb_incidence.predict_cumulative_incidence(X, coarse_timegrid)
cif_mean = cifs_pred.mean(axis=0)
duration = perf_counter() - tic
print(f"GB Incidence for event {event_id} prediction in {duration:.3f} s")
print("Brier score on training data:", -gb_incidence.score(X, y))
if X_test is not None:

if verbose:
print(
"Brier score on testing data:", -gb_incidence.score(X_test, y_test)
f"GB Incidence for event {event_id} prediction in {duration:.3f} s"
)

if verbose:
brier_score_train = -gb_incidence.score(X, y)
print(f"Brier score on training data: {brier_score_train:.3f}")
if X_test is not None:
brier_score_test = -gb_incidence.score(X_test, y_test)
print(
f"Brier score on testing data: {brier_score_test:.3f}",
)
ax.plot(
coarse_timegrid,
cif_mean,
@@ -135,39 +169,36 @@ def plot_cumulative_incidence_functions(
tic = perf_counter()
aj.fit(y["duration"], y["event"], event_of_interest=event_id)
duration = perf_counter() - tic
print(f"Aalen-Johansen for event {event_id} fit in {duration:.3f} s")
if verbose:
print(f"Aalen-Johansen for event {event_id} fit in {duration:.3f} s")

aj.plot(label="Aalen-Johansen", ax=ax)
ax.set_xlim(0, 8_000)
ax.set_ylim(0, 0.5)

if event_id == 1:
ax.legend(loc="lower right")
else:
ax.legend().remove()

if verbose:
print("=" * 16, "\n")


# %%
from sklearn.model_selection import train_test_split


X_train, X_test, y_train_c, y_test_c = train_test_split(
X, y_censored, test_size=0.3, random_state=seed
)
y_train_u = y_uncensored.loc[y_train_c.index]
y_test_u = y_uncensored.loc[y_test_c.index]

gb_incidence = GradientBoostingIncidence(
learning_rate=0.1,
n_iter=20,
max_leaf_nodes=15,
hard_zero_fraction=0.1,
min_samples_leaf=5,
loss="ibs",
show_progressbar=False,
random_state=seed,
)

plot_cumulative_incidence_functions(
X_train,
y_train_u,
gb_incidence=None,
gb_incidence=gb_incidence,
aj=aj,
X_test=X_test,
y_test=y_test_u,
@@ -176,7 +207,7 @@ def plot_cumulative_incidence_functions(
plot_cumulative_incidence_functions(
X_train,
y_train_c,
gb_incidence=None,
gb_incidence=gb_incidence,
aj=aj,
X_test=X_test,
y_test=y_test_c,
3 changes: 2 additions & 1 deletion hazardous/data/__init__.py
@@ -1,3 +1,4 @@
from ._competing_weibull import make_synthetic_competing_weibull
from ._seer import load_seer

__all__ = ["make_synthetic_competing_weibull"]
__all__ = ["make_synthetic_competing_weibull", "load_seer"]