Skip to content

Commit

Permalink
Merge pull request #66 from prio-data/integrate_new_vtl
Browse files Browse the repository at this point in the history
integrated new views_tensor utils and implemented drift self-test
  • Loading branch information
jimdale authored Sep 30, 2024
2 parents 64e5eea + 6d12c70 commit 9415b3d
Show file tree
Hide file tree
Showing 6 changed files with 542 additions and 62 deletions.
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "viewser"
version = "6.5.3"
version = "6.6.0"
description = "The Views 3 CLI tool"
authors = ["peder2911 <[email protected]>"]
readme = "README.md"
Expand Down Expand Up @@ -28,7 +28,7 @@ strconv = "^0.4.2"
pyarrow = ">9.0.0"
views-storage = "^1.1.0"
tqdm = "^4.66.0"
views_tensor_utilities = "<1.0.0"
views_tensor_utilities = ">=1.0.0"
pyod = "<1.1.0"

[tool.poetry.scripts]
Expand Down
79 changes: 45 additions & 34 deletions viewser/commands/queryset/config_drift.py
Original file line number Diff line number Diff line change
@@ -1,65 +1,76 @@
import numpy as np
from . import integrity_checks as ic

default_dne = -np.inf
default_missing = np.nan


default_config_dict = {

'global_missingness': {'threshold': 0.05,
'test_function': ic.get_global_nan_fracs,
'message': 'dataset missingness'},
'test_function': 'global_nan_fracs',
'message': 'dataset missingness',
'self_test': 0.10},

'global_zeros': {'threshold': 0.95,
'test_function': ic.get_global_zero_fracs,
'message': 'dataset zero'},
'global_zeros': {'threshold': 0.75,
'test_function': 'global_zero_fracs',
'message': 'dataset zero',
'self_test': 0.999},

'time_missingness': {'threshold': 0.01,
'test_function': ic.get_time_nan_fracs,
'message': 'time-unit missingness'},
'test_function': 'time_nan_fracs',
'message': 'time-unit missingness',
'self_test': 0.02},

'space_missingness': {'threshold': 0.03,
'test_function': ic.get_space_nan_fracs,
'message': 'space-unit missingness'},
'test_function': 'space_nan_fracs',
'message': 'space-unit missingness',
'self_test': 0.06},

'feature_missingness': {'threshold': 0.01,
'test_function': ic.get_feature_nan_fracs,
'message': 'feature missingness'},
'test_function': 'feature_nan_fracs',
'message': 'feature missingness',
'self_test': 0.02},

'time_zeros': {'threshold': 0.95,
'test_function': ic.get_time_zero_fracs,
'message': 'time-unit zero'},
'time_zeros': {'threshold': 0.75,
'test_function': 'time_zero_fracs',
'message': 'time-unit zero',
'self_test': 0.9999},

'space_zeros': {'threshold': 0.95,
'test_function': ic.get_space_zero_fracs,
'message': 'space-unit zero'},
'test_function': 'space_zero_fracs',
'message': 'space-unit zero',
'self_test': 0.99},

'feature_zeros': {'threshold': 0.95,
'test_function': ic.get_feature_zero_fracs,
'message': 'feature zero'},
'feature_zeros': {'threshold': 0.75,
'test_function': 'feature_zero_fracs',
'message': 'feature zero',
'self_test': 0.9999},

'delta_completeness': {'threshold': 1.25,
'test_function': ic.get_delta_completeness,
'message': 'feature delta_completeness'},
'delta_completeness': {'threshold': 1.01,
'test_function': 'delta_completeness',
'message': 'feature delta_completeness',
'self_test': 0.99},

'delta_zeroes': {'threshold': 1.25,
'test_function': ic.get_delta_zeroes,
'message': 'feature delta_zeroes'},
'delta_zeroes': {'threshold': 1.01,
'test_function': 'delta_zeroes',
'message': 'feature delta_zeroes',
'self_test': 0.99},

'extreme_values': {'threshold': 4.0,
'test_function': ic.get_extreme_values,
'message': 'feature extreme values'},
'test_function': 'extreme_values',
'message': 'feature extreme values',
'self_test': 8.0},

'ks_drift': {'threshold': 100.,
'test_function': ic.get_ks_drift,
'message': 'feature KS drift'},
'test_function': 'ks_drift',
'message': 'feature KS drift',
'self_test': None},

'ecod_drift': {'threshold': 0.05,
'test_function': ic.get_ecod_drift,
'message': 'dataset ECOD drift'},
'test_function': 'ecod_drift',
'message': 'dataset ECOD drift',
'self_test': None},

'standard_partition_length': 10,
'test_partition_length': 1
'test_partition_length': 1,

}
90 changes: 84 additions & 6 deletions viewser/commands/queryset/drift_detection.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import numpy as np
import scipy
from views_tensor_utilities import objects, mappings
from . import config_drift as config
from . import integrity_checks as ic
from . import self_test as st
#import viewser.commands.queryset.models.self_test_data as std
import datetime


Expand Down Expand Up @@ -60,7 +62,10 @@ def generate_alarms(self):
"""

results, translation_dict = self.test_function(
# print(self.test_function,self.test_partition_length,self.standard_partition_length,self.data.shape,
# self.features)

results, translation_dict = getattr(ic, self.test_function)(
tensor=self.data,
index=self.index,
features=self.features,
Expand Down Expand Up @@ -107,17 +112,90 @@ class InputGate:
"""

def __init__(self, df, drift_config_dict=None):
def __init__(self, df, drift_config_dict=None, self_test=False, self_test_data=None):
self.config_dict = drift_config_dict
self.tensor_container = objects.ViewsDataframe(df, cast_to_dtype=np.float64).to_numpy_time_space()
self.numeric_part = self.tensor_container.get_numeric_part()
self.default_config_dict = config.default_config_dict

if self_test:
self.__self_test(self_test_data)

self.tensor_container = (objects.ViewsDataframe(df, split_strategy='float_string', cast_strategy='to_64').
to_numpy_time_space())
self.numeric_part = self.tensor_container.get_numeric_views_tensors()[0]
self.tensor = self.numeric_part.tensor
self.index = self.tensor_container.index
self.columns = self.numeric_part.columns

self.default_config_dict = config.default_config_dict
self.testers = []

def __self_test(self, self_test_data):
    """
    __self_test

    Drive the self-test machinery for the drift detection system.

    The supplied standard dataframe is converted to a time-space tensor container. For every key of
    the drift config that has a test definition in the default config, the perturbation function named
    'perturb_<test_function>' is looked up in the self_test module and applied to the standard data in
    a way designed to trigger an alert. A Tester is then built on the perturbed data using the default
    threshold and message, all testers are run, and the number whose first alert contains 'alarm' is
    printed.

    By design, all integrity checks should raise an alarm on their perturbed data. If this is not the
    case, it implies a problem with one or more of the integrity-checking routines, or with the input
    data, which must be investigated.

    Args:
        self_test_data: standard dataframe to perturb (passed straight to objects.ViewsDataframe).

    Returns:
        None. The outcome is reported on stdout only.
    """

    self_test_container = objects.ViewsDataframe(
        self_test_data,
        split_strategy='float_string',
        cast_strategy='to_64',
    ).to_numpy_time_space()

    self_test_index = self_test_container.index
    self_test_features = self_test_container.get_numeric_views_tensors()[0].columns
    self_test_tensor = self_test_container.get_numeric_numpy_tensors()[0]

    # drift_config_dict defaults to None in __init__; fall back to the default config instead of
    # crashing on None.keys()
    config_dict = self.config_dict if self.config_dict is not None else self.default_config_dict

    # A user config that omits the partition lengths previously made every test raise a swallowed
    # KeyError, yielding a misleading '0/0 tests failed'; use the defaults in that case
    test_partition_length = config_dict.get(
        'test_partition_length', self.default_config_dict['test_partition_length'])
    standard_partition_length = config_dict.get(
        'standard_partition_length', self.default_config_dict['standard_partition_length'])

    testers = []

    for key in config_dict:
        default_entry = self.default_config_dict.get(key)

        # Scalar settings (e.g. the partition lengths) and keys unknown to the default config do not
        # describe a test, so there is nothing to perturb
        if not isinstance(default_entry, dict) or 'test_function' not in default_entry:
            continue

        # Work on a copy: the bookkeeping keys added below must not leak into the shared
        # module-level default config
        self_test_dict = dict(default_entry)
        self_test_dict['index'] = self_test_index
        self_test_dict['test_partition_length'] = test_partition_length
        self_test_dict['standard_partition_length'] = standard_partition_length

        perturb = getattr(st, 'perturb_' + self_test_dict['test_function'], None)
        if perturb is None:
            print(f'self-test: no perturbation function for {key}, skipping')
            continue

        # The self-test stays best-effort (one broken perturbation must not stop the others), but
        # the failure is reported instead of being silently discarded
        try:
            perturbed_self_test_data = perturb(self_test_tensor, **self_test_dict)

            tester = Tester(test_function=self_test_dict['test_function'],
                            test_partition_length=test_partition_length,
                            standard_partition_length=standard_partition_length,
                            threshold=self_test_dict['threshold'],
                            message=self_test_dict['message'],
                            data=perturbed_self_test_data,
                            index=self_test_index,
                            features=self_test_features,
                            )
        except Exception as exc:
            print(f'self-test: could not set up {key}: {exc!r}')
            continue

        testers.append(tester)

    alerts = [tester.generate_alarms() for tester in testers]

    # A 'failure' here means the perturbed data raised an alarm, which is the expected outcome
    nfailures = sum('alarm' in str(alert[0]) for alert in alerts)

    print(f'{nfailures}/{len(testers)} tests failed')

    print()
    print()
    print('******END*******')
    print()
    print()

def assemble_alerts(self):
"""
assemble_alerts
Expand Down
Loading

0 comments on commit 9415b3d

Please sign in to comment.