Skip to content

Commit

Permalink
[GEN-1263] Fix bed file duplicates (#562)
Browse files Browse the repository at this point in the history
* add bed database fix and tests

* fix tests

* fix upload_table param
  • Loading branch information
rxu17 authored May 13, 2024
1 parent 71cb5fd commit ae4465a
Show file tree
Hide file tree
Showing 4 changed files with 134 additions and 14 deletions.
7 changes: 6 additions & 1 deletion genie/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,12 @@ def update_table(
else:
newData = newData[database.columns]
_update_table(
syn, database, newData, databaseSynId, databaseEnt.primaryKey, toDelete
syn=syn,
database=database,
new_dataset=newData,
database_synid=databaseSynId,
primary_key_cols=databaseEnt.primaryKey,
to_delete=toDelete,
)


Expand Down
30 changes: 19 additions & 11 deletions genie_registry/bed.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,26 +579,34 @@ def preprocess(self, newpath):
seq_assay_id = seq_assay_id.upper().replace("_", "-")
return {"seq_assay_id": seq_assay_id}

def process_steps(self, beddf, newPath, parentId, databaseSynId, seq_assay_id):
"""
Process bed file, update bed database, write bed file to path
def process_steps(
self,
beddf: pd.DataFrame,
newPath: str,
parentId: str,
databaseSynId: str,
seq_assay_id: str,
) -> str:
"""Process bed file, update bed database, write bed file to path
Args:
beddf: Bed dataframe
newPath: Path to new bed file
parentId: Synapse id to store gene panel file
databaseSynId: Synapse id of bed database
seq_assay_id: GENIE seq assay id
beddf (pd.DataFrame): input bed data
newPath (str): Path to new bed file
parentId (str): Synapse id to store gene panel file
databaseSynId (str): Synapse id of bed database
seq_assay_id (str): GENIE seq assay id
Returns:
string: Path to new bed file
str: Path to new bed file
"""
final_beddf = self._process(beddf, seq_assay_id, newPath, parentId)
final_beddf = self._process(
beddf=beddf, seq_assay_id=seq_assay_id, newpath=newPath, parentid=parentId
)
load.update_table(
syn=self.syn,
databaseSynId=databaseSynId,
newData=final_beddf,
filterBy=self.center,
filterBy=seq_assay_id,
filterByColumn="SEQ_ASSAY_ID",
toDelete=True,
)
Expand Down
50 changes: 49 additions & 1 deletion tests/test_bed.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

import genie_registry.bed
from genie_registry.bed import bed
from genie import validate
from genie import load, validate

if not shutil.which("bedtools"):
pytest.skip("bedtools is not found, skipping bed tests", allow_module_level=True)
Expand Down Expand Up @@ -203,6 +203,54 @@ def test_clinicalreport___process(bed_class):
)


def test_process_steps_calls_expected_values(bed_class):
    """Check the arguments ``process_steps`` hands to its collaborators.

    ``bed._process``, ``load.update_table`` and ``pd.DataFrame.to_csv`` are
    all patched, so the test only inspects the recorded calls and the value
    returned by ``process_steps``.
    """
    new_bed_path = "somepath"
    parent_synid = "synAAAAA"
    database_synid = "synZZZZZ"
    seq_assay_id = "SAGE-PANEL-1"
    bed_frame = pd.DataFrame(
        {
            "a": ["2", "9", "12"],
            "b": [69688533, 99401860, 53701241],
            "c": [69901480, 99417584, 53718647],
            "d": ["AAK1", "AAED1", "AAAS"],
            "e": [True, True, False],
            "f": [True, True, False],
        }
    )

    with patch.object(
        bed, "_process", return_value=bed_frame
    ) as mock_process, patch.object(
        load, "update_table"
    ) as mock_update_table, patch.object(
        pd.DataFrame, "to_csv"
    ) as mock_to_csv:
        returned_path = bed_class.process_steps(
            beddf=bed_frame,
            newPath=new_bed_path,
            parentId=parent_synid,
            databaseSynId=database_synid,
            seq_assay_id=seq_assay_id,
        )

        # The same dataframe object is passed in and asserted on, so the
        # mock's argument comparison never has to compare frames element-wise.
        expected_process_kwargs = {
            "beddf": bed_frame,
            "seq_assay_id": seq_assay_id,
            "newpath": new_bed_path,
            "parentid": parent_synid,
        }
        mock_process.assert_called_once_with(**expected_process_kwargs)

        # The database update is filtered by the seq assay id on SEQ_ASSAY_ID.
        expected_update_kwargs = {
            "syn": bed_class.syn,
            "databaseSynId": database_synid,
            "newData": bed_frame,
            "filterBy": seq_assay_id,
            "filterByColumn": "SEQ_ASSAY_ID",
            "toDelete": True,
        }
        mock_update_table.assert_called_once_with(**expected_update_kwargs)

        mock_to_csv.assert_called_once_with(new_bed_path, sep="\t", index=False)
        assert returned_path == new_bed_path


def test_filetype(bed_class):
    """The ``bed_class`` fixture must report ``"bed"`` as its ``_fileType``."""
    expected_file_type = "bed"
    assert bed_class._fileType == expected_file_type

Expand Down
61 changes: 60 additions & 1 deletion tests/test_load.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from unittest.mock import patch
from unittest.mock import Mock, patch

import pandas as pd
from pandas.testing import assert_frame_equal
import pytest
import synapseclient
from synapseclient.core.exceptions import SynapseTimeoutError

Expand Down Expand Up @@ -67,3 +70,59 @@ def test_store_table_error(syn):
with patch.object(syn, "store", side_effect=SynapseTimeoutError) as patch_store:
load.store_table(syn, "full/path", "syn1234")
patch_store.assert_called_once()


@pytest.mark.parametrize(
    "cols_subset, to_delete, subsetted_data",
    [
        pytest.param(
            None,
            True,
            pd.DataFrame({"CENTER": ["test-center"], "data": [123]}),
            id="to_delete_is_true",
        ),
        pytest.param(
            None,
            False,
            pd.DataFrame({"CENTER": ["test-center"], "data": [123]}),
            id="to_delete_is_false",
        ),
        pytest.param(
            ["CENTER"],
            True,
            pd.DataFrame({"CENTER": ["test-center"]}),
            id="col_is_not_none",
        ),
        pytest.param(
            ["CENTER", "col_extra"],
            False,
            pd.DataFrame({"CENTER": ["test-center"]}),
            id="col_has_column_not_in_db",
        ),
    ],
)
def test_that_update_table_has_expected_calls(
    syn, cols_subset, to_delete, subsetted_data
):
    """Check the query ``update_table`` issues and what it passes on.

    ``syn.get``, ``syn.tableQuery`` and ``load._update_table`` are patched;
    the test inspects the table query string and the keyword arguments
    recorded on ``_update_table``.
    """
    table_synid = "synZZZZ"
    existing_rows = pd.DataFrame({"CENTER": ["test-center"], "data": [123]})
    incoming_rows = pd.DataFrame({"CENTER": ["test-center"], "data": [123]})

    database_entity = Mock(primaryKey="PRIMARY_KEY")
    query_result = Mock()
    query_result.asDataFrame.return_value = existing_rows

    with patch.object(syn, "get", return_value=database_entity), patch.object(
        syn, "tableQuery", return_value=query_result
    ) as mock_table_query, patch.object(
        load, "_update_table"
    ) as mock__update_table:
        load.update_table(
            syn,
            databaseSynId=table_synid,
            newData=incoming_rows,
            filterBy="test-center",
            filterByColumn="CENTER",
            col=cols_subset,
            toDelete=to_delete,
        )
        mock_table_query.assert_called_with(
            f"SELECT * FROM {table_synid} where CENTER ='test-center'"
        )

        # Read the recorded kwargs directly: dataframes need
        # assert_frame_equal rather than the mock's own == comparison.
        forwarded = mock__update_table.call_args.kwargs
        assert forwarded["syn"] == syn
        assert_frame_equal(forwarded["database"], subsetted_data)
        assert_frame_equal(forwarded["new_dataset"], incoming_rows)
        assert forwarded["database_synid"] == table_synid
        assert forwarded["primary_key_cols"] == "PRIMARY_KEY"
        assert forwarded["to_delete"] == to_delete

0 comments on commit ae4465a

Please sign in to comment.