revert to original join function but add checks for empty df
danlu1 committed Mar 18, 2024
1 parent 2b31d08 commit 0804228
Showing 2 changed files with 111 additions and 16 deletions.
28 changes: 13 additions & 15 deletions genie/load.py
@@ -192,22 +192,21 @@ def _update_table(
     # Columns must be in the same order
     new_dataset = new_dataset[orig_database_cols]
     database[primary_key_cols] = database[primary_key_cols].applymap(str)
-    database[primary_key] = (
-        database[primary_key_cols]
-        .stack()
-        .groupby(level=0)
-        .agg(" ".join)
-        .apply(lambda x: x.strip())
-    )
+
+    if database.empty:
+        database[primary_key] = ""
+    else:
+        database[primary_key] = database[primary_key_cols].apply(
+            lambda x: " ".join(x), axis=1
+        )

     new_dataset[primary_key_cols] = new_dataset[primary_key_cols].applymap(str)
-    new_dataset[primary_key] = (
-        new_dataset[primary_key_cols]
-        .stack()
-        .groupby(level=0)
-        .agg(" ".join)
-        .apply(lambda x: x.strip())
-    )
+    if new_dataset.empty:
+        new_dataset[primary_key] = ""
+    else:
+        new_dataset[primary_key] = new_dataset[primary_key_cols].apply(
+            lambda x: " ".join(x), axis=1
+        )
     allupdates = pd.DataFrame(columns=col_order)
     to_append_rows = process_functions._append_rows(new_dataset, database, primary_key)
     to_update_rows = process_functions._update_rows(new_dataset, database, primary_key)
@@ -222,7 +221,6 @@ def _update_table(
     update_all_file = tempfile.NamedTemporaryFile(
         dir=process_functions.SCRIPT_DIR, delete=False
     )
-
     with open(update_all_file.name, "w") as updatefile:
         # Must write out the headers in case there are no appends or updates
         updatefile.write(",".join(col_order) + "\n")
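Note on the guard added above: on an empty DataFrame, `apply(..., axis=1)` never invokes the row lambda and, depending on the pandas version, can come back as an empty DataFrame rather than a Series, so assigning the result to a single primary-key column misbehaves. A minimal standalone sketch of the guarded join, assuming illustrative column names and a hypothetical helper `join_primary_key` (not the repository code):

    import pandas as pd

    def join_primary_key(df, key_cols, key_name):
        """Concatenate key columns into one space-separated primary key."""
        df[key_cols] = df[key_cols].applymap(str)
        if df.empty:
            # Short-circuit: apply(axis=1) on an empty frame may not return
            # a Series, so create the (empty) key column explicitly.
            df[key_name] = ""
        else:
            df[key_name] = df[key_cols].apply(lambda x: " ".join(x), axis=1)
        return df

    populated = join_primary_key(
        pd.DataFrame({"test": ["test1"], "foo": [1]}), ["test", "foo"], "pk"
    )
    print(populated["pk"].tolist())  # ['test1 1']

    empty = join_primary_key(pd.DataFrame(columns=["test", "foo"]), ["test", "foo"], "pk")
    print(empty.columns.tolist())  # ['test', 'foo', 'pk'] -- column exists, zero rows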
99 changes: 98 additions & 1 deletion tests/test_load.py
@@ -1,9 +1,13 @@
-from unittest.mock import patch
+from unittest.mock import patch, mock_open

 import synapseclient
 from synapseclient.core.exceptions import SynapseTimeoutError

 from genie import load, __version__
+from genie import process_functions
+import pandas as pd
+import tempfile
+import os


 def test_store_file(syn):
@@ -67,3 +71,96 @@ def test_store_table_error(syn):
     with patch.object(syn, "store", side_effect=SynapseTimeoutError) as patch_store:
         load.store_table(syn, "full/path", "syn1234")
         patch_store.assert_called_once()
+
+
+def test__update_table_non_empty_dataframe(syn):
+    """Test _update_table function with both new_dataset and database as non-empty dataframes"""
+    database_synid = "syn123"
+    primary_key_cols = ["test", "foo"]
+    to_delete = False
+    new_dataset = pd.DataFrame(
+        {
+            "test": ["test1", "test2", "test3", "test4"],
+            "foo": [1, 2, 3, 4],
+            "baz": [float("nan"), float("nan"), float("nan"), 3.2],
+        }
+    )
+    database = pd.DataFrame(
+        {
+            "test": ["test1", "test2", "test3"],
+            "foo": [1, 2, 3],
+            "baz": [float("nan"), float("nan"), float("nan")],
+        }
+    )
+    expected_results = ["ROW_ID,ROW_VERSION,test,foo,baz\n", ",,test4,4,3.2\n"]
+    with patch("os.unlink") as mock_unlink, patch(
+        "tempfile.NamedTemporaryFile"
+    ) as mock_tempfile, patch.object(syn, "store") as syn_store:
+        with patch("builtins.open", mock_open()) as mock_file_open:
+            # set the tempfile name
+            mock_tempfile.return_value.name = "test.csv"
+            load._update_table(
+                syn, database, new_dataset, database_synid, primary_key_cols, to_delete
+            )
+            mock_file_open.assert_called_once_with("test.csv", "w")
+            mock_file_handle = mock_file_open()
+            write_calls = mock_file_handle.write.call_args_list
+            results = [call_args[0][0] for call_args in write_calls]
+            assert results == expected_results
+            mock_unlink.assert_called_once_with("test.csv")
+
+
+def test__update_table_empty_dataframe(syn):
+    """Test _update_table function with empty new_dataset"""
+    database_synid = "syn123"
+    primary_key_cols = ["test", "foo"]
+    to_delete = False
+    new_dataset = pd.DataFrame(columns=["test", "foo", "baz"])
+    database = pd.DataFrame(
+        {
+            "test": ["test1", "test2", "test3"],
+            "foo": [1, 2, 3],
+            "baz": [float("nan"), float("nan"), float("nan")],
+        }
+    )
+    expected_results = ["ROW_ID,ROW_VERSION,test,foo,baz\n"]
+    with patch("os.unlink") as mock_unlink, patch(
+        "tempfile.NamedTemporaryFile"
+    ) as mock_tempfile, patch.object(syn, "store") as syn_store:
+        with patch("builtins.open", mock_open()) as mock_file_open:
+            # set the tempfile name
+            mock_tempfile.return_value.name = "test.csv"
+            load._update_table(
+                syn, database, new_dataset, database_synid, primary_key_cols, to_delete
+            )
+            mock_file_open.assert_called_once_with("test.csv", "w")
+            mock_file_handle = mock_file_open()
+            write_calls = mock_file_handle.write.call_args_list
+            results = [call_args[0][0] for call_args in write_calls]
+            assert results == expected_results
+            mock_unlink.assert_called_once_with("test.csv")
+
+
+def test__update_table_empty_dataframes(syn):
+    """Test _update_table function with empty new_dataset and database"""
+    database_synid = "syn123"
+    primary_key_cols = ["test", "foo"]
+    to_delete = False
+    new_dataset = pd.DataFrame(columns=["test", "foo", "baz"])
+    database = pd.DataFrame(columns=["test", "foo", "baz"])
+    expected_results = ["ROW_ID,ROW_VERSION,test,foo,baz\n"]
+    with patch("os.unlink") as mock_unlink, patch(
+        "tempfile.NamedTemporaryFile"
+    ) as mock_tempfile, patch.object(syn, "store") as syn_store:
+        with patch("builtins.open", mock_open()) as mock_file_open:
+            # set the tempfile name
+            mock_tempfile.return_value.name = "test.csv"
+            load._update_table(
+                syn, database, new_dataset, database_synid, primary_key_cols, to_delete
+            )
+            mock_file_open.assert_called_once_with("test.csv", "w")
+            mock_file_handle = mock_file_open()
+            write_calls = mock_file_handle.write.call_args_list
+            results = [call_args[0][0] for call_args in write_calls]
+            assert results == expected_results
+            mock_unlink.assert_called_once_with("test.csv")
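For reference, the write-capture pattern these tests lean on: patching `builtins.open` with `mock_open` routes every `write()` through one shared file-handle mock, so the CSV rows `_update_table` emits can be asserted without touching disk. A minimal sketch of just that mechanism (standalone, not repository code):

    from unittest.mock import mock_open, patch

    # mock_open supplies a shared file-handle mock; each write() is recorded
    # on handle.write.call_args_list instead of hitting the filesystem.
    with patch("builtins.open", mock_open()) as mock_file_open:
        with open("test.csv", "w") as handle:
            handle.write("ROW_ID,ROW_VERSION,test,foo,baz\n")

    writes = [call.args[0] for call in mock_file_open().write.call_args_list]
    assert writes == ["ROW_ID,ROW_VERSION,test,foo,baz\n"]

Because the same handle mock is returned on every call, `mock_file_open()` inside the assertions retrieves the handle the code under test wrote to.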
