V1 4 6 fixes (#54)

* clip to (-20, 20) before log transform * Fix ordinal transform * 1.4.6 version * Add pretest and sample size params * Avoid dangerous math in UT
janoPig · Jan 27, 2024 · 1eb4693 · 1eb4693
1 parent 9753114
commit 1eb4693
Show file tree

Hide file tree

Showing 9 changed files with 28 additions and 9 deletions.
diff --git a/.gitignore b/.gitignore
@@ -16,3 +16,4 @@ test_reg.py
 test_clf.py
 profile_cumtime.txt
 profile_tottime.txt
+TODO.txt
diff --git a/HROCH/classifier.py b/HROCH/classifier.py
@@ -68,11 +68,13 @@ class NonlinearLogisticRegressor(SymbolicSolver, ClassifierMixin):
      algo_settings : dict, default = None
         If not defined SymbolicSolver.ALGO_SETTINGS is used.
         ```python
-        algo_settings = {'neighbours_count':15, 'alpha':0.15, 'beta':0.5}
+        algo_settings = {'neighbours_count':15, 'alpha':0.15, 'beta':0.5, 'pretest_size':1, 'sample_size':16}
         ```
         - 'neighbours_count' : (int) Number tested neighbours in each iteration
         - 'alpha' : (float) Score worsening limit for a iteration
         - 'beta' : (float) Tree breadth-wise expanding factor in a range from 0 to 1
+        - 'pretest_size' : (int) Number of batches (a batch is a 64-row sample) used for fast fitness pre-evaluation
+        - 'sample_size' : (int) Number of batches of sample used to calculate the score during training
 
     code_settings : dict, default = None
         If not defined SymbolicSolver.CODE_SETTINGS is used.

diff --git a/HROCH/fuzzy.py b/HROCH/fuzzy.py
@@ -55,11 +55,13 @@ class FuzzyRegressor(SymbolicSolver, ClassifierMixin):
      algo_settings : dict, default = None
         If not defined SymbolicSolver.ALGO_SETTINGS is used.
         ```python
-        algo_settings = {'neighbours_count':15, 'alpha':0.15, 'beta':0.5}
+        algo_settings = {'neighbours_count':15, 'alpha':0.15, 'beta':0.5, 'pretest_size':1, 'sample_size':16}
         ```
         - 'neighbours_count' : (int) Number tested neighbours in each iteration
         - 'alpha' : (float) Score worsening limit for a iteration
         - 'beta' : (float) Tree breadth-wise expanding factor in a range from 0 to 1
+        - 'pretest_size' : (int) Number of batches (a batch is a 64-row sample) used for fast fitness pre-evaluation
+        - 'sample_size' : (int) Number of batches of sample used to calculate the score during training
 
     code_settings : dict, default = None
         If not defined SymbolicSolver.CODE_SETTINGS is used.

diff --git a/HROCH/hroch.bin b/HROCH/hroch.bin
diff --git a/HROCH/hroch.dll b/HROCH/hroch.dll
diff --git a/HROCH/hroch.py b/HROCH/hroch.py
@@ -57,6 +57,8 @@ class FitParams(ctypes.Structure):
                 ("verbose", ctypes.c_uint),
                 ("pop_sel", ctypes.c_uint),
                 ("metric", ctypes.c_uint),
+                ("pretest_size", ctypes.c_uint),
+                ("sample_size", ctypes.c_uint),
                 ("neighbours_count", ctypes.c_uint),
                 ("alpha", ctypes.c_double),
                 ("beta", ctypes.c_double),
@@ -163,7 +165,7 @@ def _predict(self, X: numpy.ndarray, c=None, transform=True, check_input=True):
     def __transform(self, y):
         if self.transformation is not None:
             if self.transformation == 'LOGISTIC':
-                y = 1.0/(1.0+numpy.exp(-numpy.clip(y,a_min=-100.0, a_max=100.0)))
+                y = 1.0/(1.0+numpy.exp(-numpy.clip(y,a_min=-20.0, a_max=20.0)))
             elif self.transformation == 'ORDINAL':
                 y = numpy.round(y)
 
@@ -488,11 +490,13 @@ class SymbolicSolver(BaseEstimator):
     algo_settings : dict, default = None
         If not defined SymbolicSolver.ALGO_SETTINGS is used.
         ```python
-        algo_settings = {'neighbours_count':15, 'alpha':0.15, 'beta':0.5}
+        algo_settings = {'neighbours_count':15, 'alpha':0.15, 'beta':0.5, 'pretest_size':1, 'sample_size':16}
         ```
         - 'neighbours_count' : (int) Number tested neighbours in each iteration
         - 'alpha' : (float) Score worsening limit for a iteration
         - 'beta' : (float) Tree breadth-wise expanding factor in a range from 0 to 1
+        - 'pretest_size' : (int) Number of batches (a batch is a 64-row sample) used for fast fitness pre-evaluation
+        - 'sample_size' : (int) Number of batches of sample used to calculate the score during training
 
     code_settings : dict, default = None
         If not defined SymbolicSolver.CODE_SETTINGS is used.
@@ -571,7 +575,7 @@ class SymbolicSolver(BaseEstimator):
 
     FUZZY = {'nop': 0.01, 'f_and': 1.0, 'f_or': 1.0, 'f_xor': 1.0, 'f_not': 1.0}
 
-    ALGO_SETTINGS = {'neighbours_count':15, 'alpha':0.15, 'beta':0.5}
+    ALGO_SETTINGS = {'neighbours_count':15, 'alpha':0.15, 'beta':0.5, 'pretest_size':1, 'sample_size':16}
     CODE_SETTINGS = {'min_size': 32, 'max_size':32, 'const_size':8}
     POPULATION_SETTINGS = {'size': 64, 'tournament':4}
     INIT_CONST_SETTINGS = {'const_min':-1.0, 'const_max':1.0, 'predefined_const_prob':0.0, 'predefined_const_set': []}
@@ -738,6 +742,8 @@ def val(d, key, v):
             verbose=self.verbose,
             pop_sel=val(population_settings, 'tournament', 4),
             metric=self.__parse_metric(self.metric),
+            pretest_size=val(algo_settings, 'pretest_size', 1),
+            sample_size=val(algo_settings, 'sample_size', 16),
             neighbours_count=val(algo_settings, 'neighbours_count', 15),
             alpha=val(algo_settings, 'alpha', 0.15),
             beta=val(algo_settings, 'beta', 0.5),
@@ -930,7 +936,7 @@ def __parse_transformation(self, transformation: str):
         elif transformation == 'PSEUDOLOG':
             return 2
         elif transformation == 'ORDINAL':
-            return 4
+            return 3
         return 0
 
     def __create_model(self, m: MathModel):

diff --git a/HROCH/regressor.py b/HROCH/regressor.py
@@ -64,11 +64,13 @@ class SymbolicRegressor(SymbolicSolver, RegressorMixin):
      algo_settings : dict, default = None
         If not defined SymbolicSolver.ALGO_SETTINGS is used.
         ```python
-        algo_settings = {'neighbours_count':15, 'alpha':0.15, 'beta':0.5}
+        algo_settings = {'neighbours_count':15, 'alpha':0.15, 'beta':0.5, 'pretest_size':1, 'sample_size':16}
         ```
         - 'neighbours_count' : (int) Number tested neighbours in each iteration
         - 'alpha' : (float) Score worsening limit for a iteration
         - 'beta' : (float) Tree breadth-wise expanding factor in a range from 0 to 1
+        - 'pretest_size' : (int) Number of batches (a batch is a 64-row sample) used for fast fitness pre-evaluation
+        - 'sample_size' : (int) Number of batches of sample used to calculate the score during training
 
     code_settings : dict, default = None
         If not defined SymbolicSolver.CODE_SETTINGS is used.

diff --git a/setup.py b/setup.py
@@ -92,7 +92,7 @@
 
 setup(
     name='HROCH',
-    version='1.4.5',
+    version='1.4.6',
     description='Symbolic regression and classification',
     long_description=ldesc,
     long_description_content_type="text/markdown",

diff --git a/test/sklear_test.py b/test/sklear_test.py
@@ -7,7 +7,13 @@
     'check_sample_weights_invariance': [{'kind': 'zeros'}], # mixing samples in this test leads to inconsistent results for small iter_limit
 }
 
-common_params = {'iter_limit':1000, 'time_limit':0.0, 'random_state':42, 'num_threads':1}
+common_params = {
+    'iter_limit':1000,
+    'time_limit':0.0,
+    'random_state':42,
+    'num_threads':1,
+    'problem':{'add':1.0, 'mul':1.0, 'sub':0.1}, # avoid dangerous div or sqrt
+    }
 binary_estimators = [SymbolicRegressor, NonlinearLogisticRegressor, FuzzyRegressor, RegressorMathModel, ClassifierMathModel]
 class TestSklearnCheck(unittest.TestCase):
     def __test_estimator(self, estimator):