Add comments to all and add README.
navanhalem committed Oct 14, 2021
1 parent b2c7544 commit cf3e7d7
Showing 6 changed files with 399 additions and 113,748 deletions.
14 changes: 14 additions & 0 deletions README.md
@@ -0,0 +1,14 @@
# XAI in practice

This XAI workshop was developed as part of an internal project within the RIG to gain more hands-on experience with XAI techniques and to share that knowledge.
The goal of the workshop is to practice applying XAI techniques.
The structure of the [book](https://ema.drwhy.ai/) by Biecek and Burzykowski is used as its basis.

This repo contains several scripts, each covering a different XAI technique. The notebook combines all of these scripts.

The notebook contains accompanying text and can be worked through on its own.

The requirements file lists the libraries needed to run the scripts. To install them, run the following command:
```
pip install -r requirements.txt
```
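After installing, the individual scripts (for example `lime_images.py` or `workshop.py`) can be run directly with Python; the notebook covers the same material with accompanying text.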
File renamed without changes
10 changes: 6 additions & 4 deletions lime_images.py
@@ -1,17 +1,19 @@
import os

import numpy as np
import skimage
from lime import lime_image
from matplotlib import pyplot as plt
from skimage import io, transform
from skimage.segmentation import mark_boundaries
from tensorflow.keras.applications import inception_v3 as inc_net
from tensorflow.keras.applications.imagenet_utils import decode_predictions

# Load the inception V3 model
inet_model = inc_net.InceptionV3()

# Read the image and transform it into an image that can be read by the inception V3 model
image = skimage.io.imread(os.path.join('data', 'cat_image.jpg'))
image = skimage.transform.resize(image, (299, 299))
image = (image - 0.5) * 2
image = np.expand_dims(image, axis=0)
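# The rest of this file is collapsed in the diff. What follows is a minimal sketch,
# assuming the standard lime_image API already imported above, of how the explanation
# typically continues; the actual remainder of the commit may differ.

# Show the model's top predictions for the image
preds = inet_model.predict(image)
print(decode_predictions(preds, top=5))

# Explain the prediction with LIME and overlay the most important superpixels
explainer = lime_image.LimeImageExplainer(random_state=1)
explanation = explainer.explain_instance(image[0].astype('double'),
                                         inet_model.predict,
                                         top_labels=5, hide_color=0, num_samples=1000)
temp, mask = explanation.get_image_and_mask(explanation.top_labels[0],
                                            positive_only=True, num_features=5,
                                            hide_rest=False)
plt.imshow(mark_boundaries(temp / 2 + 0.5, mask))
plt.show()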
1 change: 0 additions & 1 deletion requirements.txt
@@ -9,4 +9,3 @@ dalex==1.3.0
alibi==0.6.0
tensorflow==2.5.1
scikit_learn==0.24.2
scikit_image==0.18.2
114,027 changes: 331 additions & 113,696 deletions workshop.ipynb

Large diffs are not rendered by default.

95 changes: 48 additions & 47 deletions workshop_code.py → workshop.py
@@ -1,113 +1,128 @@
import itertools
import random
from collections import defaultdict
from random import sample

import dalex as dx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from alibi.explainers import CounterFactual  # needed for the counterfactual step below
from lime.lime_tabular import LimeTabularExplainer
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.inspection import plot_partial_dependence
from sklearn.model_selection import train_test_split
from tqdm import tqdm

np.random.seed(1)
sns.set(rc={'figure.figsize': (8, 8)})

# Load the dataset
data = fetch_california_housing(as_frame=True)

# Plot the mean house value per block with relation to the geographical location of the block
sns.scatterplot(data=data.frame, x="Longitude", y="Latitude",
size="MedHouseVal", hue="MedHouseVal",
palette="viridis", alpha=0.5)
plt.legend(title="MedHouseVal", bbox_to_anchor=(1.05, 0.95),
loc="upper left")
_ = plt.title("Median house value depending on spatial location")

# Plot the average rooms values against the average bedrooms values
sns.scatterplot(data=data.frame, x="AveRooms", y="AveBedrms")
_ = plt.title("Relation between AvgRooms en AvgBedrms")

# Correct the average rooms and the average bedrooms values
# Plot the corrected average rooms values against the corrected average bedrooms values
data.frame = data.frame[(data.frame.AveRooms < 50) & (data.frame.AveBedrms < 10)]
sns.scatterplot(data=data.frame, x="AveRooms", y="AveBedrms")
_ = plt.title("Relation between AvgRooms en AvgBedrms")

# Plot the population values against the average occupation values
sns.scatterplot(data=data.frame, x="Population", y="AveOccup")
_ = plt.title("Relation between Population en AveOccup")

# Correct the population and the average occupation values
# Plot the corrected population values against the corrected average occupation values
data.frame = data.frame[(data.frame.Population < 17500) & (data.frame.AveOccup < 20)]
sns.scatterplot(data=data.frame, x="Population", y="AveOccup")
_ = plt.title("Relation between Population en AveOccup")

# Plot the distribution of house age
sns.histplot(data=data.frame, x="HouseAge")
_ = plt.title("Distribution of HouseAge")

# Plot the distribution of median income
sns.histplot(data=data.frame, x="MedInc")
_ = plt.title("Distribution of median income")

# Plot the distribution of median house value
sns.histplot(data=data.frame, x="MedHouseVal")
_ = plt.title("Distribution of median house value")

# Plot the mean house value per block with relation to the geographical location of the block after corrections
sns.scatterplot(data=data.frame, x="Longitude", y="Latitude",
size="MedHouseVal", hue="MedHouseVal",
palette="viridis", alpha=0.5)
plt.legend(title="MedHouseVal", bbox_to_anchor=(1.05, 0.95),
loc="upper left")
_ = plt.title("Median house value depending on spatial location")

# Split the data into train and test sets
np.random.seed(1)
X_train, X_test, y_train, y_test = train_test_split(data.frame[data.feature_names],
                                                    data.frame[data.target_names].iloc[:, 0],
                                                    test_size=0.2)

# Create a regression model (alternative models commented out below)
model = RandomForestRegressor(random_state=0).fit(X_train, y_train)
# model = SVR().fit(X_train, y_train)
# model = MLPRegressor(hidden_layer_sizes=(16, 32, 16)).fit(X_train, y_train)

# Take a sample from the test data
X_test_sample = pd.DataFrame(X_test.iloc[1, :]).T
y_test_sample = y_test.iloc[1]
print(X_test_sample)
print(y_test_sample)

# Create the explainer object
exp = dx.Explainer(model, X_train, y_train)

# Breakdown plot with latitude first, then longitude
breakdown = exp.predict_parts(X_test_sample,
                              type='break_down',
                              order=np.array(['Latitude', 'Longitude', 'AveRooms', 'AveBedrms',
                                              'Population', 'AveOccup', 'HouseAge', 'MedInc']),
                              random_state=1)
breakdown.plot()
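# With correlated features such as Latitude and Longitude, the attributions depend on
# the order in which the variables are added; compare with the reversed ordering below.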

# Breakdown plot with longitude first, then latitude
breakdown = exp.predict_parts(X_test_sample,
                              type='break_down',
                              order=np.array(['Longitude', 'Latitude', 'AveRooms', 'AveBedrms',
                                              'Population', 'AveOccup', 'HouseAge', 'MedInc']),
                              random_state=1)
breakdown.plot()

# Breakdown plot with interactions
breakdown_interaction_10 = exp.predict_parts(X_test_sample,
                                             type='break_down_interactions',
                                             interaction_preference=10,
                                             random_state=1)
breakdown_interaction_10.plot()
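# interaction_preference controls how eagerly dalex attributes effects to pairwise
# interactions; values above the default of 1 make interactions more likely to appear.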

# Create n random breakdown plots (in order to show the differences)
n = 50
random.seed(1)
contributions = defaultdict(list)
for ordering in tqdm(sample(list(itertools.permutations(data.feature_names)), n)):
    breakdown = exp.predict_parts(X_test_sample, type='break_down', order=list(ordering))
    for item in list(zip(breakdown.result.variable_name, breakdown.result.contribution))[1:-1]:
        contributions[item[0]].append(item[1])

sns.boxplot(data=pd.DataFrame(contributions))
_ = plt.title(f'Contribution values for different variables, N={n}')
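# The Shapley value of a feature is the average of such contributions over orderings;
# dalex's type='shap' below estimates it by averaging over B=25 random orderings by default.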

# Calculate the Shapley values and plot them
shapley_values = exp.predict_parts(X_test_sample, type='shap', random_state=1)
shapley_values.plot()

@@ -119,18 +134,20 @@
random_state=1,
kernel_width=1)
lime = explainer.explain_instance(X_test_sample.iloc[0, :], model.predict)
lime.as_pyplot_figure()
# lime.show_in_notebook(show_table=True)

# Create a ceteris paribus (CP) profile for the sample
cp_sample = exp.predict_profile(X_test_sample)
print(cp_sample.result)
cp_sample.plot()
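# The CP profile shows how the prediction for this single sample would change if one
# feature were varied while all other features are held fixed.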

# Manually change the MedInc value of the sample; the prediction now corresponds to that indicated by the CP profile
X_test_sample_high_medinc = X_test_sample.copy()
X_test_sample_high_medinc['MedInc'] = 7.895
print(model.predict(X_test_sample_high_medinc))

## CP oscillations
# Calculate the CP oscillations profiles, for uniform sampling and sampling from the empirical distribution
prediction = model.predict(X_test_sample)
cp_sample_res = exp.predict_profile(X_test_sample).result

@@ -148,49 +165,33 @@
cp_oscillations_unif[feature] = cp_oscillations_abs / len(feature_sublist)
cp_oscillations_emp[feature] = cp_oscillations_abs * emp_corr

# Plot CP oscillation values for uniform distribution
data_unif = pd.DataFrame([(k, sum(v)) for k, v in cp_oscillations_unif.items()])
data_unif.columns = ['var', 'oscillations']
sns.barplot(x="oscillations",
y="var",
data=data_unif.sort_values(by=['oscillations'], ascending=False, axis=0),
color='lightblue')
sns.barplot(x="oscillations", y="var", data=data_unif.sort_values(by=['oscillations'], ascending=False, axis=0), color='lightblue')
_ = plt.title("cp oscillations for uniform distribution")
plt.show()

# Plot CP oscillation values for empirical distribution
data_emp = pd.DataFrame([(k, sum(v)) for k, v in cp_oscillations_emp.items()])
data_emp.columns = ['var', 'oscillations']
sns.barplot(x="oscillations",
y="var",
data=data_emp.sort_values(by=['oscillations'], ascending=False, axis=0),
color='lightblue')
sns.barplot(x="oscillations", y="var", data=data_emp.sort_values(by=['oscillations'], ascending=False, axis=0), color='lightblue')
_ = plt.title("cp oscillations for empirical distribution")
plt.show()

# Calculate and plot variable importance
mp_rf = exp.model_parts()
mp_rf.plot()
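# model_parts computes permutation-based variable importance: the increase in loss
# after the values of a single variable are shuffled.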

# Calculate and plot grouped variable importance
mp_rf_grouped = exp.model_parts(variable_groups={'Location': ['Latitude', 'Longitude'],
                                                 'House': ['AveBedrms', 'AveRooms', 'HouseAge'],
                                                 'People': ['Population', 'MedInc', 'AveOccup']})
mp_rf_grouped.plot()

# Plot partial dependence profile plots
partial_dependence = exp.model_profile(variables=['MedInc', 'AveOccup'], N=100)
partial_dependence.plot(geom='profiles')

sns.set(rc={'figure.figsize': (16, 6)})
plot_partial_dependence(model, X_train, ['Longitude', 'Latitude', ['Longitude', 'Latitude']])
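# Note: plot_partial_dependence was deprecated and later removed in newer scikit-learn
# releases in favour of PartialDependenceDisplay.from_estimator; the pinned
# scikit_learn==0.24.2 still provides it.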

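# Turn the regression task into a binary classification task: label each block group
# by whether its median house value exceeds the training median, then refit as a classifier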
y_train_med = y_train.median()
y_train_bool = y_train > y_train_med
y_test_bool = y_test > y_train_med

X_test_sample = pd.DataFrame(X_test.iloc[1, :]).T
y_test_sample = y_test_bool.iloc[1]

model = RandomForestClassifier(random_state=0).fit(X_train, y_train_bool)
print(model.score(X_test, y_test_bool))

# Set up a counterfactual explainer for the classifier (alibi's CounterFactual)
shape = (1,) + X_train.shape[1:]
cf = CounterFactual(model.predict_proba, shape)
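# A hedged sketch (not part of the original commit) of how the counterfactual search
# might be completed with alibi 0.6.0. Assumption: this Wachter-style CounterFactual
# builds a TF1-style graph, so eager execution would have to be disabled *before* the
# explainer above is constructed:
#
#     import tensorflow as tf
#     tf.compat.v1.disable_eager_execution()
#
#     explanation = cf.explain(X_test_sample.values)
#     print(explanation.cf['X'])      # feature values of the counterfactual instance
#     print(explanation.cf['class'])  # the class the counterfactual is assigned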
