Skip to content

Commit

Permalink
ML-655: none value should remain None in CsvSource (#237)
Browse files Browse the repository at this point in the history
* ML-655: none value should remain None in CsvSource

* pr comments
  • Loading branch information
katyakats authored Jun 15, 2021
1 parent 239fdea commit c6179ac
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 0 deletions.
2 changes: 2 additions & 0 deletions storey/sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,8 @@ def _infer_type(self, value):
def _parse_field(self, field, index):
typ = self._types[index]
if typ == 's':
if field == '':
return None
return field
if typ == 'f':
return float(field) if field != '' else math.nan
Expand Down
24 changes: 24 additions & 0 deletions tests/test_flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -2589,3 +2589,27 @@ def test_csv_none_value_first_row(tmpdir):

for c in columns:
assert read_back_df.dtypes.to_dict()[c] == data.dtypes.to_dict()[c]


def test_csv_none_value_string(tmpdir):
    """Check that a missing string cell read by CSVSource matches pandas' result.

    A two-row CSV is written in which the second row has no value in the
    'str' column. The CSV is run through CSVSource -> ParquetTarget, and the
    resulting 'str' column is compared with the one pandas produces when the
    same CSV is read with ``pd.read_csv`` and round-tripped through parquet.
    """
    parquet_path = f'{tmpdir}/test_csv_none_value_first_row_{uuid.uuid4().hex}.parquet'
    csv_path = f'{tmpdir}/test_csv_none_value_first_row_{uuid.uuid4().hex}.csv'

    rows = [['katya', 'strrrr'], ['dina', None]]
    pd.DataFrame(rows, columns=['first_name', 'str']).to_csv(csv_path)

    # Flow under test: CSV in, parquet out.
    source = CSVSource(csv_path, header=True, key_field='first_name', build_dict=True)
    target = ParquetTarget(parquet_path)
    flow = build_flow([source, target])
    flow.run().await_termination()
    actual = pd.read_parquet(parquet_path)

    # Reference result: pandas reads the same CSV and writes it to parquet.
    # This overwrites the flow's output file, which was already loaded above.
    pd.read_csv(csv_path).to_parquet(parquet_path)
    expected = pd.read_parquet(parquet_path)

    assert expected['str'].compare(actual['str']).empty

0 comments on commit c6179ac

Please sign in to comment.