Add comments to all and add README.
navanhalem committed Oct 14, 2021
1 parent b2c7544 commit cf3e7d7
Showing 6 changed files with 399 additions and 113,748 deletions.
14 changes: 14 additions & 0 deletions README.md
@@ -0,0 +1,14 @@
# XAI in practice

This XAI workshop was developed as part of an internal project within the RIG to gain more hands-on experience with XAI techniques and to share that knowledge.
The goal of the workshop is to practice applying XAI techniques.
The structure of the [book](https://ema.drwhy.ai/) by Biecek and Burzykowski is used as its basis.

This repo contains several scripts, each covering a different XAI technique. The notebook combines all of these scripts.

The notebook contains accompanying text and can be worked through on its own.

The requirements file lists the libraries needed to run the scripts. To install them, run the following command:
```
pip install -r requirements.txt
```
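After installing, the individual scripts (for example `lime_images.py` or `workshop.py`) can be run directly with Python; the notebook covers the same material with accompanying text.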
File renamed without changes
10 changes: 6 additions & 4 deletions lime_images.py
@@ -1,17 +1,19 @@
import os

import numpy as np
import skimage
from lime import lime_image
from matplotlib import pyplot as plt
from skimage import io, transform
from skimage.segmentation import mark_boundaries
from tensorflow.keras.applications import inception_v3 as inc_net
from tensorflow.keras.applications.imagenet_utils import decode_predictions

# Load the inception V3 model
inet_model = inc_net.InceptionV3()

# Read the image and transform it into an image that can be read by the inception V3 model
image = skimage.io.imread(os.path.join('data', 'cat_image.jpg'))
image = skimage.transform.resize(image, (299, 299))
image = (image - 0.5) * 2
image = np.expand_dims(image, axis=0)
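# The rest of this file is collapsed in the diff. What follows is a minimal sketch,
# assuming the standard lime_image API already imported above, of how the explanation
# typically continues; the actual remainder of the commit may differ.

# Show the model's top predictions for the image
preds = inet_model.predict(image)
print(decode_predictions(preds, top=5))

# Explain the prediction with LIME and overlay the most important superpixels
explainer = lime_image.LimeImageExplainer(random_state=1)
explanation = explainer.explain_instance(image[0].astype('double'),
                                         inet_model.predict,
                                         top_labels=5, hide_color=0, num_samples=1000)
temp, mask = explanation.get_image_and_mask(explanation.top_labels[0],
                                            positive_only=True, num_features=5,
                                            hide_rest=False)
plt.imshow(mark_boundaries(temp / 2 + 0.5, mask))
plt.show()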
1 change: 0 additions & 1 deletion requirements.txt
@@ -9,4 +9,3 @@ dalex==1.3.0
alibi==0.6.0
tensorflow==2.5.1
scikit_learn==0.24.2
scikit_image==0.18.2
114,027 changes: 331 additions & 113,696 deletions workshop.ipynb

Large diffs are not rendered by default.

95 changes: 48 additions & 47 deletions workshop_code.py → workshop.py
@@ -1,113 +1,128 @@
import itertools
import random
from collections import defaultdict
from random import sample

import dalex as dx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from alibi.explainers import CounterFactual  # needed for the counterfactual step below
from lime.lime_tabular import LimeTabularExplainer
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.inspection import plot_partial_dependence
from sklearn.model_selection import train_test_split
from tqdm import tqdm

np.random.seed(1)
sns.set(rc={'figure.figsize': (8, 8)})

# Load the dataset
data = fetch_california_housing(as_frame=True)

# Plot the mean house value per block with relation to the geographical location of the block
sns.scatterplot(data=data.frame, x="Longitude", y="Latitude",
size="MedHouseVal", hue="MedHouseVal",
palette="viridis", alpha=0.5)
plt.legend(title="MedHouseVal", bbox_to_anchor=(1.05, 0.95),
loc="upper left")
_ = plt.title("Median house value depending on spatial location")

# Plot the average rooms values against the average bedrooms values
sns.scatterplot(data=data.frame, x="AveRooms", y="AveBedrms")
_ = plt.title("Relation between AvgRooms en AvgBedrms")

# Correct the average rooms and the average bedrooms values
# Plot the corrected average rooms values against the corrected average bedrooms values
data.frame = data.frame[(data.frame.AveRooms < 50) & (data.frame.AveBedrms < 10)]
sns.scatterplot(data=data.frame, x="AveRooms", y="AveBedrms")
_ = plt.title("Relation between AvgRooms en AvgBedrms")

# Plot the population values against the average occupation values
sns.scatterplot(data=data.frame, x="Population", y="AveOccup")
_ = plt.title("Relation between Population en AveOccup")

# Correct the population and the average occupation values
# Plot the corrected population values against the corrected average occupation values
data.frame = data.frame[(data.frame.Population < 17500) & (data.frame.AveOccup < 20)]
sns.scatterplot(data=data.frame, x="Population", y="AveOccup")
_ = plt.title("Relation between Population en AveOccup")

# Plot the distribution of house age
sns.histplot(data=data.frame, x="HouseAge")
_ = plt.title("Distribution of HouseAge")

# Plot the distribution of median income
sns.histplot(data=data.frame, x="MedInc")
_ = plt.title("Distribution of median income")

# Plot the distribution of median house value
sns.histplot(data=data.frame, x="MedHouseVal")
_ = plt.title("Distribution of median house value")

# Plot the mean house value per block with relation to the geographical location of the block after corrections
sns.scatterplot(data=data.frame, x="Longitude", y="Latitude",
size="MedHouseVal", hue="MedHouseVal",
palette="viridis", alpha=0.5)
plt.legend(title="MedHouseVal", bbox_to_anchor=(1.05, 0.95),
loc="upper left")
_ = plt.title("Median house value depending on spatial location")

# Split the data into train and test sets
np.random.seed(1)
X_train, X_test, y_train, y_test = train_test_split(data.frame[data.feature_names],
                                                    data.frame[data.target_names].iloc[:, 0],
                                                    test_size=0.2)

# Create a regression model (alternative models commented out below)
model = RandomForestRegressor(random_state=0).fit(X_train, y_train)
# model = SVR().fit(X_train, y_train)
# model = MLPRegressor(hidden_layer_sizes=(16, 32, 16)).fit(X_train, y_train)

# Take a sample from the test data
X_test_sample = pd.DataFrame(X_test.iloc[1, :]).T
y_test_sample = y_test.iloc[1]
print(X_test_sample)
print(y_test_sample)

# Create the explainer object
exp = dx.Explainer(model, X_train, y_train)

# Breakdown plot with latitude first, then longitude
breakdown = exp.predict_parts(X_test_sample,
                              type='break_down',
                              order=np.array(['Latitude', 'Longitude', 'AveRooms', 'AveBedrms',
                                              'Population', 'AveOccup', 'HouseAge', 'MedInc']),
                              random_state=1)
breakdown.plot()
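# With correlated features such as Latitude and Longitude, the attributions depend on
# the order in which the variables are added; compare with the reversed ordering below.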

# Breakdown plot with longitude first, then latitude
breakdown = exp.predict_parts(X_test_sample,
                              type='break_down',
                              order=np.array(['Longitude', 'Latitude', 'AveRooms', 'AveBedrms',
                                              'Population', 'AveOccup', 'HouseAge', 'MedInc']),
                              random_state=1)
breakdown.plot()

# Breakdown plot with interactions
breakdown_interaction_10 = exp.predict_parts(X_test_sample,
                                             type='break_down_interactions',
                                             interaction_preference=10,
                                             random_state=1)
breakdown_interaction_10.plot()
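# interaction_preference controls how eagerly dalex attributes effects to pairwise
# interactions; values above the default of 1 make interactions more likely to appear.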

# Create n random breakdown plots (in order to show the differences)
n = 50
random.seed(1)
contributions = defaultdict(list)
for ordering in tqdm(sample(list(itertools.permutations(data.feature_names)), n)):
    breakdown = exp.predict_parts(X_test_sample, type='break_down', order=list(ordering))
    for item in list(zip(breakdown.result.variable_name, breakdown.result.contribution))[1:-1]:
        contributions[item[0]].append(item[1])

sns.boxplot(data=pd.DataFrame(contributions))
_ = plt.title(f'Contribution values for different variables, N={n}')
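# The Shapley value of a feature is the average of such contributions over orderings;
# dalex's type='shap' below estimates it by averaging over B=25 random orderings by default.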

# Calculate the Shapley values and plot them
shapley_values = exp.predict_parts(X_test_sample, type='shap', random_state=1)
shapley_values.plot()

@@ -119,18 +134,20 @@
random_state=1,
kernel_width=1)
lime = explainer.explain_instance(X_test_sample.iloc[0, :], model.predict)
lime.as_pyplot_figure()
# lime.show_in_notebook(show_table=True)

# Create a ceteris paribus (CP) profile for the sample
cp_sample = exp.predict_profile(X_test_sample)
print(cp_sample.result)
cp_sample.plot()
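# The CP profile shows how the prediction for this single sample would change if one
# feature were varied while all other features are held fixed.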

# Manually change the MedInc value of the sample; the prediction now corresponds to that indicated by the CP profile
X_test_sample_high_medinc = X_test_sample.copy()
X_test_sample_high_medinc['MedInc'] = 7.895
print(model.predict(X_test_sample_high_medinc))

## CP oscillations
# Calculate the CP oscillations profiles, for uniform sampling and sampling from the empirical distribution
prediction = model.predict(X_test_sample)
cp_sample_res = exp.predict_profile(X_test_sample).result

@@ -148,49 +165,33 @@
cp_oscillations_unif[feature] = cp_oscillations_abs / len(feature_sublist)
cp_oscillations_emp[feature] = cp_oscillations_abs * emp_corr

# Plot CP oscillation values for uniform distribution
data_unif = pd.DataFrame([(k, sum(v)) for k, v in cp_oscillations_unif.items()])
data_unif.columns = ['var', 'oscillations']
sns.barplot(x="oscillations",
y="var",
data=data_unif.sort_values(by=['oscillations'], ascending=False, axis=0),
color='lightblue')
sns.barplot(x="oscillations", y="var", data=data_unif.sort_values(by=['oscillations'], ascending=False, axis=0), color='lightblue')
_ = plt.title("cp oscillations for uniform distribution")
plt.show()

# Plot CP oscillation values for empirical distribution
data_emp = pd.DataFrame([(k, sum(v)) for k, v in cp_oscillations_emp.items()])
data_emp.columns = ['var', 'oscillations']
sns.barplot(x="oscillations",
y="var",
data=data_emp.sort_values(by=['oscillations'], ascending=False, axis=0),
color='lightblue')
sns.barplot(x="oscillations", y="var", data=data_emp.sort_values(by=['oscillations'], ascending=False, axis=0), color='lightblue')
_ = plt.title("cp oscillations for empirical distribution")
plt.show()

# Calculate and plot variable importance
mp_rf = exp.model_parts()
mp_rf.plot()
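# model_parts computes permutation-based variable importance: the increase in loss
# after the values of a single variable are shuffled.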

# Calculate and plot grouped variable importance
mp_rf_grouped = exp.model_parts(variable_groups={'Location': ['Latitude', 'Longitude'],
                                                 'House': ['AveBedrms', 'AveRooms', 'HouseAge'],
                                                 'People': ['Population', 'MedInc', 'AveOccup']})
mp_rf_grouped.plot()

# Plot partial dependence profile plots
partial_dependence = exp.model_profile(variables=['MedInc', 'AveOccup'], N=100)
partial_dependence.plot(geom='profiles')

sns.set(rc={'figure.figsize': (16, 6)})
plot_partial_dependence(model, X_train, ['Longitude', 'Latitude', ['Longitude', 'Latitude']])
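# Note: plot_partial_dependence was deprecated and later removed in newer scikit-learn
# releases in favour of PartialDependenceDisplay.from_estimator; the pinned
# scikit_learn==0.24.2 still provides it.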

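# Turn the regression task into a binary classification task: label each block group
# by whether its median house value exceeds the training median, then refit as a classifier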
y_train_med = y_train.median()
y_train_bool = y_train > y_train_med
y_test_bool = y_test > y_train_med

X_test_sample = pd.DataFrame(X_test.iloc[1, :]).T
y_test_sample = y_test_bool.iloc[1]

model = RandomForestClassifier(random_state=0).fit(X_train, y_train_bool)
print(model.score(X_test, y_test_bool))

# Set up a counterfactual explainer for the classifier (alibi's CounterFactual)
shape = (1,) + X_train.shape[1:]
cf = CounterFactual(model.predict_proba, shape)
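# A hedged sketch (not part of the original commit) of how the counterfactual search
# might be completed with alibi 0.6.0. Assumption: this Wachter-style CounterFactual
# builds a TF1-style graph, so eager execution would have to be disabled *before* the
# explainer above is constructed:
#
#     import tensorflow as tf
#     tf.compat.v1.disable_eager_execution()
#
#     explanation = cf.explain(X_test_sample.values)
#     print(explanation.cf['X'])      # feature values of the counterfactual instance
#     print(explanation.cf['class'])  # the class the counterfactual is assigned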
