Merge pull request #66 from prio-data/integrate_new_vtl

integrated new views_tensor utils and implemented drift self-test
prio-data · Sep 30, 2024 · 9415b3d · 9415b3d
2 parents 64e5eea + 6d12c70
commit 9415b3d
Show file tree

Hide file tree

Showing 6 changed files with 542 additions and 62 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "viewser"
-version = "6.5.3"
+version = "6.6.0"
 description = "The Views 3 CLI tool"
 authors = ["peder2911 <[email protected]>"]
 readme = "README.md"
@@ -28,7 +28,7 @@ strconv = "^0.4.2"
 pyarrow = ">9.0.0"
 views-storage = "^1.1.0"
 tqdm = "^4.66.0"
-views_tensor_utilities = "<1.0.0"
+views_tensor_utilities = ">=1.0.0"
 pyod = "<1.1.0"
 
 [tool.poetry.scripts]

diff --git a/viewser/commands/queryset/config_drift.py b/viewser/commands/queryset/config_drift.py
@@ -1,65 +1,76 @@
 import numpy as np
-from . import integrity_checks as ic
 
 default_dne = -np.inf
 default_missing = np.nan
 
-
 default_config_dict = {
 
     'global_missingness':  {'threshold': 0.05,
-                            'test_function': ic.get_global_nan_fracs,
-                            'message': 'dataset missingness'},
+                            'test_function': 'global_nan_fracs',
+                            'message': 'dataset missingness',
+                            'self_test': 0.10},
 
-    'global_zeros':        {'threshold': 0.95,
-                            'test_function': ic.get_global_zero_fracs,
-                            'message': 'dataset zero'},
+    'global_zeros':        {'threshold': 0.75,
+                            'test_function': 'global_zero_fracs',
+                            'message': 'dataset zero',
+                            'self_test': 0.999},
 
     'time_missingness':    {'threshold': 0.01,
-                            'test_function': ic.get_time_nan_fracs,
-                            'message': 'time-unit missingness'},
+                            'test_function': 'time_nan_fracs',
+                            'message': 'time-unit missingness',
+                            'self_test': 0.02},
 
     'space_missingness':   {'threshold': 0.03,
-                            'test_function': ic.get_space_nan_fracs,
-                            'message': 'space-unit missingness'},
+                            'test_function': 'space_nan_fracs',
+                            'message': 'space-unit missingness',
+                            'self_test': 0.06},
 
     'feature_missingness': {'threshold': 0.01,
-                            'test_function': ic.get_feature_nan_fracs,
-                            'message': 'feature missingness'},
+                            'test_function': 'feature_nan_fracs',
+                            'message': 'feature missingness',
+                            'self_test': 0.02},
 
-    'time_zeros':          {'threshold': 0.95,
-                            'test_function': ic.get_time_zero_fracs,
-                            'message': 'time-unit zero'},
+    'time_zeros':          {'threshold': 0.75,
+                            'test_function': 'time_zero_fracs',
+                            'message': 'time-unit zero',
+                            'self_test': 0.9999},
 
     'space_zeros':         {'threshold': 0.95,
-                            'test_function': ic.get_space_zero_fracs,
-                            'message': 'space-unit zero'},
+                            'test_function': 'space_zero_fracs',
+                            'message': 'space-unit zero',
+                            'self_test': 0.99},
 
-    'feature_zeros':       {'threshold': 0.95,
-                            'test_function': ic.get_feature_zero_fracs,
-                            'message': 'feature zero'},
+    'feature_zeros':       {'threshold': 0.75,
+                            'test_function': 'feature_zero_fracs',
+                            'message': 'feature zero',
+                            'self_test': 0.9999},
 
-    'delta_completeness':  {'threshold': 1.25,
-                            'test_function': ic.get_delta_completeness,
-                            'message': 'feature delta_completeness'},
+    'delta_completeness':  {'threshold': 1.01,
+                            'test_function': 'delta_completeness',
+                            'message': 'feature delta_completeness',
+                            'self_test': 0.99},
 
-    'delta_zeroes':        {'threshold': 1.25,
-                            'test_function': ic.get_delta_zeroes,
-                            'message': 'feature delta_zeroes'},
+    'delta_zeroes':        {'threshold': 1.01,
+                            'test_function': 'delta_zeroes',
+                            'message': 'feature delta_zeroes',
+                            'self_test': 0.99},
 
     'extreme_values':      {'threshold': 4.0,
-                            'test_function': ic.get_extreme_values,
-                            'message': 'feature extreme values'},
+                            'test_function': 'extreme_values',
+                            'message': 'feature extreme values',
+                            'self_test': 8.0},
 
     'ks_drift':            {'threshold': 100.,
-                            'test_function': ic.get_ks_drift,
-                            'message': 'feature KS drift'},
+                            'test_function': 'ks_drift',
+                            'message': 'feature KS drift',
+                            'self_test': None},
 
     'ecod_drift':          {'threshold': 0.05,
-                            'test_function': ic.get_ecod_drift,
-                            'message': 'dataset ECOD drift'},
+                            'test_function': 'ecod_drift',
+                            'message': 'dataset ECOD drift',
+                            'self_test': None},
 
     'standard_partition_length': 10,
-    'test_partition_length': 1
+    'test_partition_length': 1,
 
     }
diff --git a/viewser/commands/queryset/drift_detection.py b/viewser/commands/queryset/drift_detection.py
@@ -1,7 +1,9 @@
 import numpy as np
-import scipy
 from views_tensor_utilities import objects, mappings
 from . import config_drift as config
+from . import integrity_checks as ic
+from . import self_test as st
+#import viewser.commands.queryset.models.self_test_data as std
 import datetime
 
 
@@ -60,7 +62,10 @@ def generate_alarms(self):
 
         """
 
-        results, translation_dict = self.test_function(
+#        print(self.test_function,self.test_partition_length,self.standard_partition_length,self.data.shape,
+#              self.features)
+
+        results, translation_dict = getattr(ic, self.test_function)(
                                               tensor=self.data,
                                               index=self.index,
                                               features=self.features,
@@ -107,17 +112,90 @@ class InputGate:
 
     """
 
-    def __init__(self, df, drift_config_dict=None):
+    def __init__(self, df, drift_config_dict=None, self_test=False, self_test_data=None):
         self.config_dict = drift_config_dict
-        self.tensor_container = objects.ViewsDataframe(df, cast_to_dtype=np.float64).to_numpy_time_space()
-        self.numeric_part = self.tensor_container.get_numeric_part()
+        self.default_config_dict = config.default_config_dict
+
+        if self_test:
+            self.__self_test(self_test_data)
+
+        self.tensor_container = (objects.ViewsDataframe(df, split_strategy='float_string', cast_strategy='to_64').
+                                 to_numpy_time_space())
+        self.numeric_part = self.tensor_container.get_numeric_views_tensors()[0]
         self.tensor = self.numeric_part.tensor
         self.index = self.tensor_container.index
         self.columns = self.numeric_part.columns
 
-        self.default_config_dict = config.default_config_dict
         self.testers = []
 
+    def __self_test(self, self_test_data):

+        """
+        __self_test

+        Method driving the self-test machinery for the drift detection system

+        A standard dataframe is supplied and custom perturbation functions, one per integrity-checking function,
+        are called upon to perturb the standard data in ways designed to trigger alerts from the drift-detector.
+        Perturbed data is passed to Tester objects as normal and alerts are collected.
+        By design, all integrity checks should be failed. If this is not the case, it implies a problem with one
+        or more of the integrity-checking routines, or with the input data, which must be investigated.

+        Only keys of self.config_dict which map to a test definition (a dict) in the default config are tested;
+        any other key (e.g. the partition lengths) is ignored. A check whose set-up or perturbation raises is
+        reported and skipped, so one broken check does not abort the whole self-test.

+        Args:
+            self_test_data: dataframe of standard data, handed to views_tensor_utilities' ViewsDataframe

+        Raises:
+            ValueError: if self_test_data is None

+        """

+        if self_test_data is None:
+            raise ValueError('self_test requested but no self_test_data was supplied')

+        self_test_container = (objects.ViewsDataframe(self_test_data,
+                                                      split_strategy='float_string',
+                                                      cast_strategy='to_64').to_numpy_time_space())

+        self_test_index = self_test_container.index
+        self_test_features = self_test_container.get_numeric_views_tensors()[0].columns

+        # keep the tensor under its own name rather than rebinding the dataframe argument
+        self_test_tensor = self_test_container.get_numeric_numpy_tensors()[0]

+        testers = []

+        for key in self.config_dict:
+            default_entry = self.default_config_dict.get(key)

+            # partition lengths and keys unknown to the default config are not test definitions
+            if not isinstance(default_entry, dict):
+                continue

+            try:
+                # work on a copy: the default config is module-level state shared by every InputGate,
+                # so the index and partition lengths must not be written back into it
+                self_test_dict = dict(default_entry)
+                self_test_dict['index'] = self_test_index
+                self_test_dict['test_partition_length'] = self.config_dict['test_partition_length']
+                self_test_dict['standard_partition_length'] = self.config_dict['standard_partition_length']

+                perturb = getattr(st, 'perturb_' + self_test_dict['test_function'])
+                perturbed_self_test_data = perturb(self_test_tensor, **self_test_dict)

+                testers.append(Tester(test_function=self_test_dict['test_function'],
+                                      test_partition_length=self_test_dict['test_partition_length'],
+                                      standard_partition_length=self_test_dict['standard_partition_length'],
+                                      threshold=self_test_dict['threshold'],
+                                      message=self_test_dict['message'],
+                                      data=perturbed_self_test_data,
+                                      index=self_test_index,
+                                      features=self_test_features,
+                                      ))

+            except Exception as error:
+                # still best-effort, but say which check was dropped instead of hiding it
+                print(f'self-test for {key} skipped: {type(error).__name__}: {error}')

+        alerts = [tester.generate_alarms() for tester in testers]

+        # an alert counts as a failed check when its first entry mentions an alarm
+        nfailures = sum('alarm' in str(alert[0]) for alert in alerts)

+        print(f'{nfailures}/{len(testers)} tests failed')

+        print()
+        print()
+        print('******END*******')
+        print()
+        print()
+
     def assemble_alerts(self):
         """
         assemble_alerts