From 9d377483d68032cab58fd47643bd0bc0a1f7939d Mon Sep 17 00:00:00 2001 From: Jose Angel Hernao Date: Sat, 18 Sep 2021 11:49:10 -0500 Subject: [PATCH] Add preparing tests --- tests/creators/creator_preparing.py | 70 ++++++++++ tests/test_created__preparing.py | 199 ++++++++++++++++++++++++++++ tests/test_preparing.py | 130 ++++++++++++++++++ 3 files changed, 399 insertions(+) create mode 100644 tests/creators/creator_preparing.py create mode 100644 tests/test_created__preparing.py create mode 100644 tests/test_preparing.py diff --git a/tests/creators/creator_preparing.py b/tests/creators/creator_preparing.py new file mode 100644 index 000000000..baeb5e8c7 --- /dev/null +++ b/tests/creators/creator_preparing.py @@ -0,0 +1,70 @@ +import datetime +import sys +sys.path.append("../..") + + +def create(): + from optimus import Optimus + from optimus.tests.creator import TestCreator, default_configs + + op = Optimus("pandas") + df = op.create.dataframe({ + 'NullType': [None, None, None, None, None, None], + 'attributes': [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]], + 'date arrival': ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], + 'function(binary)': [bytearray('Leader', 'utf-8'), bytearray('Espionage', 'utf-8'), bytearray('Security', 'utf-8'), bytearray('First Lieutenant', 'utf-8'), bytearray('None', 'utf-8'), bytearray('Battle Station', 'utf-8')], + 'height(ft)': [-28, 17, 26, 13, None, 300], + 'japanese name': [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']], + ('last date seen', 'date'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], + 'last position seen': ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], + 'rank': [10, 7, 7, 8, 10, 8], + ('Cybertronian', 'bool'): [True, True, True, True, True, False], + ('Date 
Type'): [datetime.datetime(2016, 9, 10), datetime.datetime(2015, 8, 10), datetime.datetime(2014, 6, 24), datetime.datetime(2013, 6, 24), datetime.datetime(2012, 5, 10), datetime.datetime(2011, 4, 10)], + ('age', 'int'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], + ('function', 'string'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', 'None', 'Battle Station'], + ('names', 'str'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], + ('timestamp', 'time'): [datetime.datetime(2014, 6, 24, 0, 0), datetime.datetime(2014, 6, 24, 0, 0), datetime.datetime(2014, 6, 24, 0, 0), datetime.datetime(2014, 6, 24, 0, 0), datetime.datetime(2014, 6, 24, 0, 0), datetime.datetime(2014, 6, 24, 0, 0)], + ('weight(t)', 'float'): [4.3, 2.0, 4.0, 1.8, 5.7, None] + }) + + t = TestCreator(op, df, name="preparing", configs=default_configs) + + t.create(method="cols.impute", variant="all", cols="*", strategy="constant", fill_value=float("inf")) + t.create(method="cols.impute", variant="numeric_single_mean",cols=["height(ft)"], strategy="mean") + t.create(method="cols.impute", variant="numeric_single_median",cols=["height(ft)"], strategy="median") + t.create(method="cols.impute", variant="numeric_single_most_frequent",cols=["height(ft)"], strategy="most_frequent") + t.create(method="cols.impute", variant="numeric_single_constant",cols=["height(ft)"], strategy="constant") + t.create(method="cols.impute", variant="numeric_multiple_mean",cols=["rank","age","weight(t)"], strategy="mean", output_cols=["rk","ag","wt"]) + t.create(method="cols.impute", variant="numeric_multiple_median",cols=["rank","age","weight(t)"], strategy="median", output_cols=["rk","ag","wt"]) + t.create(method="cols.impute", variant="numeric_multiple_most_frequent",cols=["rank","age","weight(t)"], strategy="most_frequent", output_cols=["rk","ag","wt"]) + t.create(method="cols.impute", variant="numeric_multiple_constamt",cols=["rank","age","weight(t)"], strategy="constant", 
fill_value=12, output_cols=["rk","ag","wt"]) + t.create(method="cols.impute", variant="string_single_most_frequent",cols=["function"], strategy="most_frequent") + t.create(method="cols.impute", variant="string_single_constant",cols=["function"], strategy="constant", fill_value=0.32132) + t.create(method="cols.impute", variant="string_multiple_most_frequent",cols=["names","function"], strategy="most_frequent") + t.create(method="cols.impute", variant="string_multiple_constamt",cols=["names","function"], strategy="constant", fill_value=float("-inf")) + t.create(method="cols.impute", variant="multiple_most_frequent", cols=["rank","age","weight(t)","names","function"], strategy="most_frequent", output_cols=["rk","ag","wt","nm","fn"]) + t.create(method="cols.impute", variant="multiple_constant", cols=["rank","age","weight(t)","names","function"], strategy="constant", fill_value=-13, output_cols=["rk","ag","wt","nm","fn"]) + + df = op.create.dataframe({ + 'NullType': [None, None, None, None, None, None], + 'attributes': [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]], + 'date arrival': ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], + 'function(binary)': [bytearray('Leader', 'utf-8'), bytearray('Espionage', 'utf-8'), bytearray('Security', 'utf-8'), bytearray('First Lieutenant', 'utf-8'), bytearray('None', 'utf-8'), bytearray('Battle Station', 'utf-8')], + 'japanese name': [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']], + ('last date seen', 'date'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], + 'last position seen': ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], + ('Cybertronian', 'bool'): [True, True, True, True, True, False], + ('function', 'string'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', 'None', 
'Battle Station'], + ('names', 'str'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'] + }) + + t.create(method="cols.one_hot_encode", variant="all", cols="*") + t.create(method="cols.one_hot_encode", variant="all_prefix", cols="*", prefix="cols") + t.create(method="cols.one_hot_encode", variant="string", cols=["names"]) + t.create(method="cols.one_hot_encode", variant="string_prefix", cols=["names"], prefix="name") + t.create(method="cols.one_hot_encode", variant="multiple", cols=["NullType","japanese name","last date seen"]) + t.create(method="cols.one_hot_encode", variant="multiple_prefix", cols=["NullType","japanese name","last date seen"], prefix="type") + + t.run() + +create() \ No newline at end of file diff --git a/tests/test_created__preparing.py b/tests/test_created__preparing.py new file mode 100644 index 000000000..9814cd94a --- /dev/null +++ b/tests/test_created__preparing.py @@ -0,0 +1,199 @@ +import datetime +from optimus.tests.base import TestBase +from optimus.helpers.json import json_encoding +from optimus.helpers.functions import deep_sort, df_dicts_equal, results_equal + + +def Timestamp(t): + return datetime.datetime.strptime(t, "%Y-%m-%d %H:%M:%S") + + +nan = float("nan") +inf = float("inf") + + +class TestPreparingPandas(TestBase): + config = {'engine': 'pandas'} + dict = {('NullType', 'object'): [None, None, None, None, None, None], ('attributes', 'object'): [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('function(binary)', 'object'): [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('japanese name', 'object'): [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], 
['Meister'], ['Megatron'], ['Metroflex']], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', 'None', 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]} + maxDiff = None + + def test_cols_impute_all(self): + df = self.df + result = df.cols.impute(cols='*', strategy='constant', fill_value=inf) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_impute_multiple_constant(self): + df = self.df + result = df.cols.impute(cols=['rank', 'age', 'weight(t)', 'names', 'function'], strategy='constant', fill_value=-13, output_cols=['rk', 'ag', 'wt', 'nm', 'fn']) + expected = self.create_dataframe(dict={('NullType', 'object'): [None, None, None, None, None, None], ('attributes', 'object'): [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, 
None]], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('function(binary)', 'object'): [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('japanese name', 'object'): [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('rk', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('ag', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', 'None', 'Battle Station'], ('fn', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', 'None', 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('nm', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, 
nan], ('wt', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, -13.0]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_impute_multiple_most_frequent(self): + df = self.df + result = df.cols.impute(cols=['rank', 'age', 'weight(t)', 'names', 'function'], strategy='most_frequent', output_cols=['rk', 'ag', 'wt', 'nm', 'fn']) + expected = self.create_dataframe(dict={('NullType', 'object'): [None, None, None, None, None, None], ('attributes', 'object'): [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('function(binary)', 'object'): [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('japanese name', 'object'): [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('rk', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('ag', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', 'None', 
'Battle Station'], ('fn', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', 'None', 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('nm', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan], ('wt', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, 1.8]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_impute_numeric_multiple_constamt(self): + df = self.df + result = df.cols.impute(cols=['rank', 'age', 'weight(t)'], strategy='constant', fill_value=12, output_cols=['rk', 'ag', 'wt']) + expected = self.create_dataframe(dict={('NullType', 'object'): [None, None, None, None, None, None], ('attributes', 'object'): [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('function(binary)', 'object'): [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('japanese name', 'object'): [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], 
('rk', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('ag', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', 'None', 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan], ('wt', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, 12.0]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_impute_numeric_multiple_mean(self): + df = self.df + result = df.cols.impute(cols=['rank', 'age', 'weight(t)'], strategy='mean', output_cols=['rk', 'ag', 'wt']) + expected = self.create_dataframe(dict={('NullType', 'object'): [None, None, None, None, None, None], ('attributes', 'object'): [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('function(binary)', 'object'): [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('japanese name', 'object'): [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], 
['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'float64'): [10.0, 7.0, 7.0, 8.0, 10.0, 8.0], ('rk', 'float64'): [10.0, 7.0, 7.0, 8.0, 10.0, 8.0], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'float64'): [5000000.0, 5000000.0, 5000000.0, 5000000.0, 5000000.0, 5000000.0], ('ag', 'float64'): [5000000.0, 5000000.0, 5000000.0, 5000000.0, 5000000.0, 5000000.0], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', 'None', 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan], ('wt', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, 3.56]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_impute_numeric_multiple_median(self): + df = self.df + result = df.cols.impute(cols=['rank', 'age', 'weight(t)'], strategy='median', output_cols=['rk', 'ag', 'wt']) + expected = self.create_dataframe(dict={('NullType', 'object'): [None, None, None, None, None, None], ('attributes', 'object'): [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]], ('date arrival', 
'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('function(binary)', 'object'): [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('japanese name', 'object'): [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'float64'): [10.0, 7.0, 7.0, 8.0, 10.0, 8.0], ('rk', 'float64'): [10.0, 7.0, 7.0, 8.0, 10.0, 8.0], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'float64'): [5000000.0, 5000000.0, 5000000.0, 5000000.0, 5000000.0, 5000000.0], ('ag', 'float64'): [5000000.0, 5000000.0, 5000000.0, 5000000.0, 5000000.0, 5000000.0], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', 'None', 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan], ('wt', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, 4.0]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def 
test_cols_impute_numeric_multiple_most_frequent(self): + df = self.df + result = df.cols.impute(cols=['rank', 'age', 'weight(t)'], strategy='most_frequent', output_cols=['rk', 'ag', 'wt']) + expected = self.create_dataframe(dict={('NullType', 'object'): [None, None, None, None, None, None], ('attributes', 'object'): [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('function(binary)', 'object'): [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('japanese name', 'object'): [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('rk', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('ag', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', 'None', 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), 
Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan], ('wt', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, 1.8]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_impute_numeric_single_constant(self): + df = self.df + result = df.cols.impute(cols=['height(ft)'], strategy='constant') + expected = self.create_dataframe(dict={('NullType', 'object'): [None, None, None, None, None, None], ('attributes', 'object'): [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('function(binary)', 'object'): [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, 0.0, 300.0], ('japanese name', 'object'): [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First 
Lieutenant', 'None', 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_impute_numeric_single_mean(self): + df = self.df + result = df.cols.impute(cols=['height(ft)'], strategy='mean') + expected = self.create_dataframe(dict={('NullType', 'object'): [None, None, None, None, None, None], ('attributes', 'object'): [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('function(binary)', 'object'): [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, 65.6, 300.0], ('japanese name', 'object'): [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 
00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', 'None', 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_impute_numeric_single_median(self): + df = self.df + result = df.cols.impute(cols=['height(ft)'], strategy='median') + expected = self.create_dataframe(dict={('NullType', 'object'): [None, None, None, None, None, None], ('attributes', 'object'): [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('function(binary)', 'object'): [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, 17.0, 300.0], ('japanese name', 'object'): [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), 
Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', 'None', 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_impute_numeric_single_most_frequent(self): + df = self.df + result = df.cols.impute(cols=['height(ft)'], strategy='most_frequent') + expected = self.create_dataframe(dict={('NullType', 'object'): [None, None, None, None, None, None], ('attributes', 'object'): [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('function(binary)', 'object'): [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, -28.0, 300.0], ('japanese name', 'object'): [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], 
('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', 'None', 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_impute_string_multiple_constamt(self): + df = self.df + result = df.cols.impute(cols=['names', 'function'], strategy='constant', fill_value=-inf) + expected = self.create_dataframe(dict={('NullType', 'object'): [None, None, None, None, None, None], ('attributes', 'object'): [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('function(binary)', 'object'): [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('japanese name', 'object'): [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', 
'2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', 'None', 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_impute_string_multiple_most_frequent(self): + df = self.df + result = df.cols.impute(cols=['names', 'function'], strategy='most_frequent') + expected = self.create_dataframe(dict={('NullType', 'object'): [None, None, None, None, None, None], ('attributes', 'object'): [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('function(binary)', 'object'): [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('japanese name', 'object'): 
[['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', 'None', 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_impute_string_single_constant(self): + df = self.df + result = df.cols.impute(cols=['function'], strategy='constant', fill_value=0.32132) + expected = self.create_dataframe(dict={('NullType', 'object'): [None, None, None, None, None, None], ('attributes', 'object'): [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('function(binary)', 'object'): [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), 
bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('japanese name', 'object'): [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', 'None', 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_impute_string_single_most_frequent(self): + df = self.df + result = df.cols.impute(cols=['function'], strategy='most_frequent') + expected = self.create_dataframe(dict={('NullType', 'object'): [None, None, None, None, None, None], ('attributes', 'object'): [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]], ('date arrival', 'object'): ['1980/04/10', 
'1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('function(binary)', 'object'): [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('japanese name', 'object'): [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', 'None', 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_one_hot_encode_all(self): + df = self.df + result = df.cols.one_hot_encode(cols='*') + expected = self.create_dataframe(dict={('NullType', 'object'): [None, None, None, None, None, None], ('attributes', 'object'): [[8.5344, 4300.0], [5.334, 
2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('function(binary)', 'object'): [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('japanese name', 'object'): [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', 'None', 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan], ('NullType_None', 'uint8'): [1, 1, 1, 1, 1, 1], ('attributes_[3.9624, 1800.0]', 'uint8'): [0, 0, 0, 1, 0, 0], ('attributes_[5.334, 2000.0]', 'uint8'): [0, 1, 0, 0, 0, 0], ('attributes_[7.9248, 4000.0]', 'uint8'): [0, 0, 1, 0, 0, 0], 
('attributes_[8.5344, 4300.0]', 'uint8'): [1, 0, 0, 0, 0, 0], ('attributes_[91.44, None]', 'uint8'): [0, 0, 0, 0, 0, 1], ('attributes_[None, 5700.0]', 'uint8'): [0, 0, 0, 0, 1, 0], ('date arrival_1980/04/10', 'uint8'): [1, 1, 1, 1, 1, 1], ("function(binary)_bytearray(b'Battle Station')", 'uint8'): [0, 0, 0, 0, 0, 1], ("function(binary)_bytearray(b'Espionage')", 'uint8'): [0, 1, 0, 0, 0, 0], ("function(binary)_bytearray(b'First Lieutenant')", 'uint8'): [0, 0, 0, 1, 0, 0], ("function(binary)_bytearray(b'Leader')", 'uint8'): [1, 0, 0, 0, 0, 0], ("function(binary)_bytearray(b'None')", 'uint8'): [0, 0, 0, 0, 1, 0], ("function(binary)_bytearray(b'Security')", 'uint8'): [0, 0, 1, 0, 0, 0], ("japanese name_['Bumble', 'Goldback']", 'uint8'): [0, 1, 0, 0, 0, 0], ("japanese name_['Inochi', 'Convoy']", 'uint8'): [1, 0, 0, 0, 0, 0], ("japanese name_['Megatron']", 'uint8'): [0, 0, 0, 0, 1, 0], ("japanese name_['Meister']", 'uint8'): [0, 0, 0, 1, 0, 0], ("japanese name_['Metroflex']", 'uint8'): [0, 0, 0, 0, 0, 1], ("japanese name_['Roadbuster']", 'uint8'): [0, 0, 1, 0, 0, 0], ('last date seen_2011/04/10', 'uint8'): [0, 0, 0, 0, 0, 1], ('last date seen_2012/05/10', 'uint8'): [0, 0, 0, 0, 1, 0], ('last date seen_2013/06/10', 'uint8'): [0, 0, 0, 1, 0, 0], ('last date seen_2014/07/10', 'uint8'): [0, 0, 1, 0, 0, 0], ('last date seen_2015/08/10', 'uint8'): [0, 1, 0, 0, 0, 0], ('last date seen_2016/09/10', 'uint8'): [1, 0, 0, 0, 0, 0], ('last position seen_10.642707,-71.612534', 'uint8'): [0, 1, 0, 0, 0, 0], ('last position seen_19.442735,-99.201111', 'uint8'): [1, 0, 0, 0, 0, 0], ('last position seen_33.670666,-117.841553', 'uint8'): [0, 0, 0, 1, 0, 0], ('last position seen_37.789563,-122.400356', 'uint8'): [0, 0, 1, 0, 0, 0], ('last position seen_None', 'uint8'): [0, 0, 0, 0, 1, 1], ('function_Battle Station', 'uint8'): [0, 0, 0, 0, 0, 1], ('function_Espionage', 'uint8'): [0, 1, 0, 0, 0, 0], ('function_First Lieutenant', 'uint8'): [0, 0, 0, 1, 0, 0], ('function_Leader', 'uint8'): [1, 
0, 0, 0, 0, 0], ('function_None', 'uint8'): [0, 0, 0, 0, 1, 0], ('function_Security', 'uint8'): [0, 0, 1, 0, 0, 0], ('names_Jazz', 'uint8'): [0, 0, 0, 1, 0, 0], ('names_Megatron', 'uint8'): [0, 0, 0, 0, 1, 0], ('names_Metroplex_)^$', 'uint8'): [0, 0, 0, 0, 0, 1], ('names_Optimus', 'uint8'): [1, 0, 0, 0, 0, 0], ('names_bumbl#ebéé ', 'uint8'): [0, 1, 0, 0, 0, 0], ('names_ironhide&', 'uint8'): [0, 0, 1, 0, 0, 0]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_one_hot_encode_all_prefix(self): + df = self.df + result = df.cols.one_hot_encode(cols='*', prefix='cols') + expected = self.create_dataframe(dict={('NullType', 'object'): [None, None, None, None, None, None], ('attributes', 'object'): [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('function(binary)', 'object'): [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('japanese name', 'object'): [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): 
[5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', 'None', 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan], ('cols_None', + 'cols_None uint8\ncols_None uint8\ncols_None uint8\ndtype: object'): [0, 0, 0, 0, 1, 0], ('cols_[3.9624, 1800.0]', 'uint8'): [0, 0, 0, 1, 0, 0], ('cols_[5.334, 2000.0]', 'uint8'): [0, 1, 0, 0, 0, 0], ('cols_[7.9248, 4000.0]', 'uint8'): [0, 0, 1, 0, 0, 0], ('cols_[8.5344, 4300.0]', 'uint8'): [1, 0, 0, 0, 0, 0], ('cols_[91.44, None]', 'uint8'): [0, 0, 0, 0, 0, 1], ('cols_[None, 5700.0]', 'uint8'): [0, 0, 0, 0, 1, 0], ('cols_1980/04/10', 'uint8'): [1, 1, 1, 1, 1, 1], ("cols_bytearray(b'Battle Station')", 'uint8'): [0, 0, 0, 0, 0, 1], ("cols_bytearray(b'Espionage')", 'uint8'): [0, 1, 0, 0, 0, 0], ("cols_bytearray(b'First Lieutenant')", 'uint8'): [0, 0, 0, 1, 0, 0], ("cols_bytearray(b'Leader')", 'uint8'): [1, 0, 0, 0, 0, 0], ("cols_bytearray(b'None')", 'uint8'): [0, 0, 0, 0, 1, 0], ("cols_bytearray(b'Security')", 'uint8'): [0, 0, 1, 0, 0, 0], ("cols_['Bumble', 'Goldback']", 'uint8'): [0, 1, 0, 0, 0, 0], ("cols_['Inochi', 'Convoy']", 'uint8'): [1, 0, 0, 0, 0, 0], ("cols_['Megatron']", 'uint8'): [0, 0, 0, 0, 1, 0], ("cols_['Meister']", 'uint8'): [0, 0, 0, 1, 0, 0], ("cols_['Metroflex']", 'uint8'): [0, 0, 0, 0, 0, 1], ("cols_['Roadbuster']", 'uint8'): [0, 0, 1, 0, 0, 0], ('cols_2011/04/10', 'uint8'): [0, 0, 0, 0, 0, 1], ('cols_2012/05/10', 'uint8'): [0, 0, 0, 0, 1, 0], ('cols_2013/06/10', 'uint8'): [0, 0, 0, 1, 0, 0], ('cols_2014/07/10', 'uint8'): [0, 0, 1, 0, 0, 0], ('cols_2015/08/10', 'uint8'): [0, 1, 0, 0, 
0, 0], ('cols_2016/09/10', 'uint8'): [1, 0, 0, 0, 0, 0], ('cols_10.642707,-71.612534', 'uint8'): [0, 1, 0, 0, 0, 0], ('cols_19.442735,-99.201111', 'uint8'): [1, 0, 0, 0, 0, 0], ('cols_33.670666,-117.841553', 'uint8'): [0, 0, 0, 1, 0, 0], ('cols_37.789563,-122.400356', 'uint8'): [0, 0, 1, 0, 0, 0], ('cols_Battle Station', 'uint8'): [0, 0, 0, 0, 0, 1], ('cols_Espionage', 'uint8'): [0, 1, 0, 0, 0, 0], ('cols_First Lieutenant', 'uint8'): [0, 0, 0, 1, 0, 0], ('cols_Leader', 'uint8'): [1, 0, 0, 0, 0, 0], ('cols_Security', 'uint8'): [0, 0, 1, 0, 0, 0], ('cols_Jazz', 'uint8'): [0, 0, 0, 1, 0, 0], ('cols_Megatron', 'uint8'): [0, 0, 0, 0, 1, 0], ('cols_Metroplex_)^$', 'uint8'): [0, 0, 0, 0, 0, 1], ('cols_Optimus', 'uint8'): [1, 0, 0, 0, 0, 0], ('cols_bumbl#ebéé ', 'uint8'): [0, 1, 0, 0, 0, 0], ('cols_ironhide&', 'uint8'): [0, 0, 1, 0, 0, 0]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_one_hot_encode_multiple(self): + df = self.df + result = df.cols.one_hot_encode(cols=['NullType', 'japanese name', 'last date seen']) + expected = self.create_dataframe(dict={('NullType', 'object'): [None, None, None, None, None, None], ('attributes', 'object'): [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('function(binary)', 'object'): [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('japanese name', 'object'): [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): 
['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', 'None', 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan], ('NullType_None', 'uint8'): [1, 1, 1, 1, 1, 1], ("japanese name_['Bumble', 'Goldback']", 'uint8'): [0, 1, 0, 0, 0, 0], ("japanese name_['Inochi', 'Convoy']", 'uint8'): [1, 0, 0, 0, 0, 0], ("japanese name_['Megatron']", 'uint8'): [0, 0, 0, 0, 1, 0], ("japanese name_['Meister']", 'uint8'): [0, 0, 0, 1, 0, 0], ("japanese name_['Metroflex']", 'uint8'): [0, 0, 0, 0, 0, 1], ("japanese name_['Roadbuster']", 'uint8'): [0, 0, 1, 0, 0, 0], ('last date seen_2011/04/10', 'uint8'): [0, 0, 0, 0, 0, 1], ('last date seen_2012/05/10', 'uint8'): [0, 0, 0, 0, 1, 0], ('last date seen_2013/06/10', 'uint8'): [0, 0, 0, 1, 0, 0], ('last date seen_2014/07/10', 'uint8'): [0, 0, 1, 0, 0, 0], ('last date seen_2015/08/10', 'uint8'): [0, 1, 0, 0, 0, 0], ('last date seen_2016/09/10', 'uint8'): [1, 0, 0, 0, 0, 0]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_one_hot_encode_multiple_prefix(self): + df = self.df + 
result = df.cols.one_hot_encode(cols=['NullType', 'japanese name', 'last date seen'], prefix='type') + expected = self.create_dataframe(dict={('NullType', 'object'): [None, None, None, None, None, None], ('attributes', 'object'): [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('function(binary)', 'object'): [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('japanese name', 'object'): [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', 'None', 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 
4.0, 1.8, 5.7, nan], ('type_None', 'uint8'): [1, 1, 1, 1, 1, 1], ("type_['Bumble', 'Goldback']", 'uint8'): [0, 1, 0, 0, 0, 0], ("type_['Inochi', 'Convoy']", 'uint8'): [1, 0, 0, 0, 0, 0], ("type_['Megatron']", 'uint8'): [0, 0, 0, 0, 1, 0], ("type_['Meister']", 'uint8'): [0, 0, 0, 1, 0, 0], ("type_['Metroflex']", 'uint8'): [0, 0, 0, 0, 0, 1], ("type_['Roadbuster']", 'uint8'): [0, 0, 1, 0, 0, 0], ('type_2011/04/10', 'uint8'): [0, 0, 0, 0, 0, 1], ('type_2012/05/10', 'uint8'): [0, 0, 0, 0, 1, 0], ('type_2013/06/10', 'uint8'): [0, 0, 0, 1, 0, 0], ('type_2014/07/10', 'uint8'): [0, 0, 1, 0, 0, 0], ('type_2015/08/10', 'uint8'): [0, 1, 0, 0, 0, 0], ('type_2016/09/10', 'uint8'): [1, 0, 0, 0, 0, 0]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_one_hot_encode_string(self): + df = self.df + result = df.cols.one_hot_encode(cols=['names']) + expected = self.create_dataframe(dict={('NullType', 'object'): [None, None, None, None, None, None], ('attributes', 'object'): [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('function(binary)', 'object'): [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('japanese name', 'object'): [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, 
True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', 'None', 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan], ('names_Jazz', 'uint8'): [0, 0, 0, 1, 0, 0], ('names_Megatron', 'uint8'): [0, 0, 0, 0, 1, 0], ('names_Metroplex_)^$', 'uint8'): [0, 0, 0, 0, 0, 1], ('names_Optimus', 'uint8'): [1, 0, 0, 0, 0, 0], ('names_bumbl#ebéé ', 'uint8'): [0, 1, 0, 0, 0, 0], ('names_ironhide&', 'uint8'): [0, 0, 1, 0, 0, 0]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_one_hot_encode_string_prefix(self): + df = self.df + result = df.cols.one_hot_encode(cols=['names'], prefix='name') + expected = self.create_dataframe(dict={('NullType', 'object'): [None, None, None, None, None, None], ('attributes', 'object'): [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('function(binary)', 'object'): [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('japanese name', 'object'): [['Inochi', 
'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', 'None', 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan], ('name_Jazz', 'uint8'): [0, 0, 0, 1, 0, 0], ('name_Megatron', 'uint8'): [0, 0, 0, 0, 1, 0], ('name_Metroplex_)^$', 'uint8'): [0, 0, 0, 0, 0, 1], ('name_Optimus', 'uint8'): [1, 0, 0, 0, 0, 0], ('name_bumbl#ebéé ', 'uint8'): [0, 1, 0, 0, 0, 0], ('name_ironhide&', 'uint8'): [0, 0, 1, 0, 0, 0]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + +class TestPreparingDask(TestPreparingPandas): + config = {'engine': 'dask', 'n_partitions': 1} + + +class TestPreparingPartitionDask(TestPreparingPandas): + config = {'engine': 'dask', 'n_partitions': 2} + + +try: + import cudf # pyright: reportMissingImports=false +except Exception: # engine not importable here: skip its suite + pass +else: + class 
TestPreparingCUDF(TestPreparingPandas): + config = {'engine': 'cudf'} + + +try: + import dask_cudf # pyright: reportMissingImports=false +except Exception: # engine not importable here: skip its suite + pass +else: + class TestPreparingDC(TestPreparingPandas): + config = {'engine': 'dask_cudf', 'n_partitions': 1} + + +try: + import dask_cudf # pyright: reportMissingImports=false +except Exception: # engine not importable here: skip its suite + pass +else: + class TestPreparingPartitionDC(TestPreparingPandas): + config = {'engine': 'dask_cudf', 'n_partitions': 2} + + +try: + import pyspark # pyright: reportMissingImports=false +except Exception: # engine not importable here: skip its suite + pass +else: + class TestPreparingSpark(TestPreparingPandas): + config = {'engine': 'spark'} + + +try: + import vaex # pyright: reportMissingImports=false +except Exception: # engine not importable here: skip its suite + pass +else: + class TestPreparingVaex(TestPreparingPandas): + config = {'engine': 'vaex'} diff --git a/tests/test_preparing.py b/tests/test_preparing.py new file mode 100644 index 000000000..e1d09b6f0 --- /dev/null +++ b/tests/test_preparing.py @@ -0,0 +1,130 @@ +import datetime +from optimus.tests.base import TestBase +from optimus.helpers.json import json_encoding +from optimus.helpers.functions import deep_sort, df_dicts_equal, results_equal + + +def Timestamp(t): + return datetime.datetime.strptime(t, "%Y-%m-%d %H:%M:%S") + + +nan = float("nan") +inf = float("inf") + + +class TestPreparingStIPandas(TestBase): + config = {'engine': 'pandas'} + dict = {('NullType', 'object'): [None, None, None, None, None, None], ('attributes', 'object'): [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('function(binary)', 'object'): [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('japanese name', 'object'): [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], 
['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', 'None', 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]} + maxDiff = None + + def test_cols_string_to_index_all(self): + df = self.df + result = df.cols.string_to_index(cols="*") + expected = self.create_dataframe(dict={'NullType': [None, None, None, None, None, None],'NullType_string_to_index': [0, 0, 0, 0, 0, 0],'attributes': [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]],'attributes_string_to_index': [3, 1, 2, 0, 5, 4],'date arrival': ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'],'date arrival_string_to_index': [0, 0, 0, 0, 0, 0],'function(binary)': [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle 
Station')],'function(binary)_string_to_index': [3, 1, 5, 2, 4, 0],'height(ft)': [-28.0, 17.0, 26.0, 13.0, nan, 300.0],'height(ft)_string_to_index': [0, 2, 3, 1, 5, 4],'japanese name': [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']],'japanese name_string_to_index': [1, 0, 5, 3, 2, 4],'last date seen': ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'],'last date seen_string_to_index': [5, 4, 3, 2, 1, 0],'last position seen': ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None],'last position seen_string_to_index': [1, 0, 3, 2, 4, 4],'rank': [10, 7, 7, 8, 10, 8],'rank_string_to_index': [0, 1, 1, 2, 0, 2],'Cybertronian': [True, True, True, True, True, False],'Cybertronian_string_to_index': [1, 1, 1, 1, 1, 0],'Date Type': [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')],'Date Type_string_to_index': [5, 4, 3, 2, 1, 0],'age': [5000000, 5000000, 5000000, 5000000, 5000000, 5000000],'age_string_to_index': [0, 0, 0, 0, 0, 0],'function': ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'],'function_string_to_index': [3, 1, 5, 2, 4, 0],'names': ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'],'names_string_to_index': [3, 4, 5, 0, 1, 2],'timestamp': [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')],'timestamp_string_to_index': [0, 0, 0, 0, 0, 0],'weight(t)': [4.3, 2.0, 4.0, 1.8, 5.7, nan],'weight(t)_string_to_index': [3, 1, 2, 0, 4, 5]}) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_string_to_index_numeric(self): + df = self.df 
+ result = df.cols.string_to_index(cols=["rank"]) + expected = self.create_dataframe(dict={'NullType': [None, None, None, None, None, None],'attributes': [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]],'date arrival': ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'],'function(binary)': [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')],'height(ft)': [-28.0, 17.0, 26.0, 13.0, nan, 300.0],'japanese name': [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']],'last date seen': ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'],'last position seen': ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None],'rank': [10, 7, 7, 8, 10, 8],'rank_string_to_index': [0, 1, 1, 2, 0, 2],'Cybertronian': [True, True, True, True, True, False],'Date Type': [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')],'age': [5000000, 5000000, 5000000, 5000000, 5000000, 5000000],'function': ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'],'names': ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'],'timestamp': [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')],'weight(t)': [4.3, 2.0, 4.0, 1.8, 5.7, nan]}) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_string_to_index_string(self): + df = self.df + result = df.cols.string_to_index(cols=["names"]) + expected = 
self.create_dataframe(dict={'NullType': [None, None, None, None, None, None],'attributes': [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]],'date arrival': ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'],'function(binary)': [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')],'height(ft)': [-28.0, 17.0, 26.0, 13.0, nan, 300.0],'japanese name': [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']],'last date seen': ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'],'last position seen': ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None],'rank': [10, 7, 7, 8, 10, 8],'Cybertronian': [True, True, True, True, True, False],'Date Type': [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')],'age': [5000000, 5000000, 5000000, 5000000, 5000000, 5000000],'function': ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'],'names': ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'],'names_string_to_index': [3, 4, 5, 0, 1, 2],'timestamp': [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')],'weight(t)': [4.3, 2.0, 4.0, 1.8, 5.7, nan]}) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_string_to_index_multiple(self): + df = self.df + result = df.cols.string_to_index(cols=["attributes","Date Type","Cybertronian"], output_cols=["at","dt","ct"]) + expected = 
self.create_dataframe(dict={'NullType': [None, None, None, None, None, None],'attributes': [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]],'at': [3, 1, 2, 0, 5, 4],'date arrival': ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'],'function(binary)': [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')],'height(ft)': [-28.0, 17.0, 26.0, 13.0, nan, 300.0],'japanese name': [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']],'last date seen': ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'],'last position seen': ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None],'rank': [10, 7, 7, 8, 10, 8],'Cybertronian': [True, True, True, True, True, False],'ct': [1, 1, 1, 1, 1, 0],'Date Type': [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')],'dt': [5, 4, 3, 2, 1, 0],'age': [5000000, 5000000, 5000000, 5000000, 5000000, 5000000],'function': ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'],'names': ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'],'timestamp': [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')],'weight(t)': [4.3, 2.0, 4.0, 1.8, 5.7, nan]}) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_index_to_string_all(self): + df = self.df.cols.string_to_index(cols="*") + result = df.rows.numeric(cols="*") + expected = 
self.create_dataframe(dict={'NullType': [None, None, None, None, None, None],'NullType_string_to_index': [0, 0, 0, 0, 0, 0],'attributes': [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]],'attributes_string_to_index': [3, 1, 2, 0, 5, 4],'date arrival': ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'],'date arrival_string_to_index': [0, 0, 0, 0, 0, 0],'function(binary)': [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')],'function(binary)_string_to_index': [3, 1, 5, 2, 4, 0],'height(ft)': [-28.0, 17.0, 26.0, 13.0, nan, 300.0],'height(ft)_string_to_index': [0, 2, 3, 1, 5, 4],'japanese name': [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']],'japanese name_string_to_index': [1, 0, 5, 3, 2, 4],'last date seen': ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'],'last date seen_string_to_index': [5, 4, 3, 2, 1, 0],'last position seen': ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None],'last position seen_string_to_index': [1, 0, 3, 2, 4, 4],'rank': [10, 7, 7, 8, 10, 8],'rank_string_to_index': [0, 1, 1, 2, 0, 2],'Cybertronian': [True, True, True, True, True, False],'Cybertronian_string_to_index': [1, 1, 1, 1, 1, 0],'Date Type': [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')],'Date Type_string_to_index': [5, 4, 3, 2, 1, 0],'age': [5000000, 5000000, 5000000, 5000000, 5000000, 5000000],'age_string_to_index': [0, 0, 0, 0, 0, 0],'function': ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'],'function_string_to_index': [3, 1, 5, 2, 4, 0],'names': ['Optimus', 
'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'],'names_string_to_index': [3, 4, 5, 0, 1, 2],'timestamp': [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')],'timestamp_string_to_index': [0, 0, 0, 0, 0, 0],'weight(t)': [4.3, 2.0, 4.0, 1.8, 5.7, nan],'weight(t)_string_to_index': [3, 1, 2, 0, 4, 5]}) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + result = df.cols.index_to_string(cols=['NullType_string_to_index','attributes_string_to_index','date arrival_string_to_index','function(binary)_string_to_index','height(ft)_string_to_index','japanese name_string_to_index','last date seen_string_to_index','last position seen_string_to_index','rank_string_to_index','Cybertronian_string_to_index','Date Type_string_to_index','age_string_to_index','function_string_to_index','names_string_to_index','timestamp_string_to_index','weight(t)_string_to_index']) + expected = self.create_dataframe(dict={'NullType': [None, None, None, None, None, None],'NullType_string_to_index': [0, 0, 0, 0, 0, 0],'NullType_string_to_index_index_to_string': [None, None, None, None, None, None],'attributes': [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]],'attributes_string_to_index': [3, 1, 2, 0, 5, 4],'attributes_string_to_index_index_to_string': [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]],'date arrival': ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'],'date arrival_string_to_index': [0, 0, 0, 0, 0, 0],'date arrival_string_to_index_index_to_string': ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'],'function(binary)': [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), 
bytearray(b'None'), bytearray(b'Battle Station')],'function(binary)_string_to_index': [3, 1, 5, 2, 4, 0],'function(binary)_string_to_index_index_to_string': [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')],'height(ft)': [-28.0, 17.0, 26.0, 13.0, nan, 300.0],'height(ft)_string_to_index': [0, 2, 3, 1, 5, 4],'height(ft)_string_to_index_index_to_string': [-28.0, 17.0, 26.0, 13.0, nan, 300.0],'japanese name': [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']],'japanese name_string_to_index': [1, 0, 5, 3, 2, 4],'japanese name_string_to_index_index_to_string': [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']],'last date seen': ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'],'last date seen_string_to_index': [5, 4, 3, 2, 1, 0],'last date seen_string_to_index_index_to_string': ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'],'last position seen': ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None],'last position seen_string_to_index': [1, 0, 3, 2, 4, 4],'last position seen_string_to_index_index_to_string': ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None],'rank': [10, 7, 7, 8, 10, 8],'rank_string_to_index': [0, 1, 1, 2, 0, 2],'rank_string_to_index_index_to_string': [10, 7, 7, 8, 10, 8],'Cybertronian': [True, True, True, True, True, False],'Cybertronian_string_to_index': [1, 1, 1, 1, 1, 0],'Cybertronian_string_to_index_index_to_string': [True, True, True, True, True, False],'Date Type': [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 
00:00:00')],'Date Type_string_to_index': [5, 4, 3, 2, 1, 0],'Date Type_string_to_index_index_to_string': [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')],'age': [5000000, 5000000, 5000000, 5000000, 5000000, 5000000],'age_string_to_index': [0, 0, 0, 0, 0, 0],'age_string_to_index_index_to_string': [5000000, 5000000, 5000000, 5000000, 5000000, 5000000],'function': ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'],'function_string_to_index': [3, 1, 5, 2, 4, 0],'function_string_to_index_index_to_string': ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'],'names': ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'],'names_string_to_index': [3, 4, 5, 0, 1, 2],'names_string_to_index_index_to_string': ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'],'timestamp': [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')],'timestamp_string_to_index': [0, 0, 0, 0, 0, 0],'timestamp_string_to_index_index_to_string': [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')],'weight(t)': [4.3, 2.0, 4.0, 1.8, 5.7, nan],'weight(t)_string_to_index': [3, 1, 2, 0, 4, 5],'weight(t)_string_to_index_index_to_string': [4.3, 2.0, 4.0, 1.8, 5.7, nan]}) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_index_to_string_numeric(self): + df = self.df.cols.string_to_index(cols=["rank"]) + result = df.rows.numeric(cols="*") + expected = self.create_dataframe(dict={'NullType': [None, None, None, None, None, 
None],'attributes': [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]],'date arrival': ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'],'function(binary)': [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')],'height(ft)': [-28.0, 17.0, 26.0, 13.0, nan, 300.0],'japanese name': [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']],'last date seen': ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'],'last position seen': ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None],'rank': [10, 7, 7, 8, 10, 8],'rank_string_to_index': [0, 1, 1, 2, 0, 2],'Cybertronian': [True, True, True, True, True, False],'Date Type': [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')],'age': [5000000, 5000000, 5000000, 5000000, 5000000, 5000000],'function': ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'],'names': ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'],'timestamp': [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')],'weight(t)': [4.3, 2.0, 4.0, 1.8, 5.7, nan]}) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + result = df.cols.index_to_string(cols=['rank_string_to_index']) + expected = self.create_dataframe(dict={'NullType': [None, None, None, None, None, None],'attributes': [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, 
None]],'date arrival': ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'],'function(binary)': [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')],'height(ft)': [-28.0, 17.0, 26.0, 13.0, nan, 300.0],'japanese name': [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']],'last date seen': ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'],'last position seen': ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None],'rank': [10, 7, 7, 8, 10, 8],'rank_string_to_index': [0, 1, 1, 2, 0, 2],'rank_string_to_index_index_to_string': ['10', '7', '7', '8', '10', '8'],'Cybertronian': [True, True, True, True, True, False],'Date Type': [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')],'age': [5000000, 5000000, 5000000, 5000000, 5000000, 5000000],'function': ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'],'names': ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'],'timestamp': [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')],'weight(t)': [4.3, 2.0, 4.0, 1.8, 5.7, nan]}) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_index_to_string_string(self): + df = self.df.cols.string_to_index(cols=["names"]) + result = df.rows.numeric(cols="*") + expected = self.create_dataframe(dict={'NullType': [None, None, None, None, None, None],'attributes': [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], 
[None, 5700.0], [91.44, None]],'date arrival': ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'],'function(binary)': [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')],'height(ft)': [-28.0, 17.0, 26.0, 13.0, nan, 300.0],'japanese name': [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']],'last date seen': ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'],'last position seen': ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None],'rank': [10, 7, 7, 8, 10, 8],'Cybertronian': [True, True, True, True, True, False],'Date Type': [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')],'age': [5000000, 5000000, 5000000, 5000000, 5000000, 5000000],'function': ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'],'names': ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'],'names_string_to_index': [3, 4, 5, 0, 1, 2],'timestamp': [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')],'weight(t)': [4.3, 2.0, 4.0, 1.8, 5.7, nan]}) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + result = df.cols.index_to_string(cols=['names_string_to_index']) + expected = self.create_dataframe(dict={'NullType': [None, None, None, None, None, None],'attributes': [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]],'date arrival': ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', 
'1980/04/10'],'function(binary)': [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')],'height(ft)': [-28.0, 17.0, 26.0, 13.0, nan, 300.0],'japanese name': [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']],'last date seen': ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'],'last position seen': ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None],'rank': [10, 7, 7, 8, 10, 8],'Cybertronian': [True, True, True, True, True, False],'Date Type': [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')],'age': [5000000, 5000000, 5000000, 5000000, 5000000, 5000000],'function': ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'],'names': ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'],'names_string_to_index': [3, 4, 5, 0, 1, 2],'names_string_to_index_index_to_string': ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'],'timestamp': [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')],'weight(t)': [4.3, 2.0, 4.0, 1.8, 5.7, nan]}) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_index_to_string_multiple(self): + df = self.df.cols.string_to_index(cols=["attributes","Date Type","Cybertronian"], output_cols=["at","dt","ct"]) + result = df.rows.numeric(cols="*") + expected = self.create_dataframe(dict={'NullType': [None, None, None, None, None, None],'attributes': [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], 
[3.9624, 1800.0], [None, 5700.0], [91.44, None]],'at': [3, 1, 2, 0, 5, 4],'date arrival': ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'],'function(binary)': [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')],'height(ft)': [-28.0, 17.0, 26.0, 13.0, nan, 300.0],'japanese name': [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']],'last date seen': ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'],'last position seen': ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None],'rank': [10, 7, 7, 8, 10, 8],'Cybertronian': [True, True, True, True, True, False],'ct': [1, 1, 1, 1, 1, 0],'Date Type': [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')],'dt': [5, 4, 3, 2, 1, 0],'age': [5000000, 5000000, 5000000, 5000000, 5000000, 5000000],'function': ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'],'names': ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'],'timestamp': [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')],'weight(t)': [4.3, 2.0, 4.0, 1.8, 5.7, nan]}) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + result = df.cols.index_to_string(cols=["at","dt","ct"]) + expected = self.create_dataframe(dict={'NullType': [None, None, None, None, None, None],'attributes': [[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]],'at': [3, 1, 2, 0, 5, 4],'at_index_to_string': 
[[8.5344, 4300.0], [5.334, 2000.0], [7.9248, 4000.0], [3.9624, 1800.0], [None, 5700.0], [91.44, None]],'date arrival': ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'],'function(binary)': [bytearray(b'Leader'), bytearray(b'Espionage'), bytearray(b'Security'), bytearray(b'First Lieutenant'), bytearray(b'None'), bytearray(b'Battle Station')],'height(ft)': [-28.0, 17.0, 26.0, 13.0, nan, 300.0],'japanese name': [['Inochi', 'Convoy'], ['Bumble', 'Goldback'], ['Roadbuster'], ['Meister'], ['Megatron'], ['Metroflex']],'last date seen': ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'],'last position seen': ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None],'rank': [10, 7, 7, 8, 10, 8],'Cybertronian': [True, True, True, True, True, False],'ct': [1, 1, 1, 1, 1, 0],'ct_index_to_string': [True, True, True, True, True, False],'Date Type': [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')],'dt': [5, 4, 3, 2, 1, 0],'dt_index_to_string': [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')],'age': [5000000, 5000000, 5000000, 5000000, 5000000, 5000000],'function': ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'],'names': ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'],'timestamp': [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')],'weight(t)': [4.3, 2.0, 4.0, 1.8, 5.7, nan]}) + self.assertTrue(result.equals(expected, decimal=True, 
assertion=True))
+
+class TestPreparingStIDask(TestPreparingStIPandas):  # inherits every test; only `config` is overridden
+    config = {'engine': 'dask', 'n_partitions': 1}
+
+
+class TestPreparingStIPartitionDask(TestPreparingStIPandas):  # same inherited tests, n_partitions=2
+    config = {'engine': 'dask', 'n_partitions': 2}
+
+
+try:  # the cudf variant is declared only when cudf imports cleanly
+    import cudf  # pyright: reportMissingImports=false
+except Exception:  # not a bare except: KeyboardInterrupt / SystemExit must still propagate
+    pass
+else:
+    class TestPreparingStICUDF(TestPreparingStIPandas):
+        config = {'engine': 'cudf'}
+
+
+try:  # dask_cudf variant, one partition
+    import dask_cudf  # pyright: reportMissingImports=false
+except Exception:  # any import-time failure skips this engine; interpreter exits are not swallowed
+    pass
+else:
+    class TestPreparingStIDC(TestPreparingStIPandas):
+        config = {'engine': 'dask_cudf', 'n_partitions': 1}
+
+
+try:  # dask_cudf variant, two partitions (separate guard, same dependency as above)
+    import dask_cudf  # pyright: reportMissingImports=false
+except Exception:  # any import-time failure skips this engine; interpreter exits are not swallowed
+    pass
+else:
+    class TestPreparingStIPartitionDC(TestPreparingStIPandas):
+        config = {'engine': 'dask_cudf', 'n_partitions': 2}
+
+
+try:  # spark variant is declared only when pyspark imports cleanly
+    import pyspark  # pyright: reportMissingImports=false
+except Exception:  # any import-time failure skips this engine; interpreter exits are not swallowed
+    pass
+else:
+    class TestPreparingStISpark(TestPreparingStIPandas):
+        config = {'engine': 'spark'}
+
+
+try:  # vaex variant is declared only when vaex imports cleanly
+    import vaex  # pyright: reportMissingImports=false
+except Exception:  # any import-time failure skips this engine; interpreter exits are not swallowed
+    pass
+else:
+    class TestPreparingStIVaex(TestPreparingStIPandas):
+        config = {'engine': 'vaex'}