jeromekelleher · jeromekelleher · Dec 17, 2024 · Dec 17, 2024
diff --git a/sc2ts/dataset.py b/sc2ts/dataset.py
@@ -179,6 +179,7 @@ class Variant:
 class Dataset(collections.abc.Mapping):
 
     def __init__(self, path, chunk_cache_size=1, date_field="date"):
+        logger.info(f"Loading dataset @{path} using {date_field} as date field")
         self.date_field = date_field
         self.path = pathlib.Path(path)
         if self.path.suffix == ".zip":

diff --git a/sc2ts/inference.py b/sc2ts/inference.py
@@ -506,6 +506,7 @@ def extend(
     date,
     base_ts,
     match_db,
+    date_field="date",
     include_samples=None,
     num_mismatches=None,
     hmm_cost_threshold=None,
@@ -557,7 +558,7 @@ def extend(
 
     start_time = time.time()  # wall time
     base_ts = tszip.load(base_ts)
-    ds = _dataset.Dataset(dataset)
+    ds = _dataset.Dataset(dataset, date_field=date_field)
 
     with MatchDb(match_db) as matches:
         tables = _extend(
@@ -1462,7 +1463,7 @@ def match_tsinfer(
             coord_map, mirror_coordinates, ts.sites_position
         )
         logger.debug(
-            f"HMM@T={mismatch_threshold}: {sample.strain} {sample.pango}"
+            f"HMM@T={mismatch_threshold}: {sample.strain} {sample.pango} "
             f"hmm_cost={sample.hmm_match.cost} match={sample.hmm_match.summary()}"
         )
 

diff --git a/tests/test_dataset.py b/tests/test_dataset.py
@@ -269,6 +269,13 @@ def test_examples(self, fx_dataset):
             ],
         )
 
+    def test_date_field(self, fx_dataset):
+        ds1 = sc2ts.Dataset(fx_dataset.path, date_field="date")
+        ds2 = sc2ts.Dataset(fx_dataset.path, date_field="Collection_date")
+        diffs = np.where(ds1.metadata.sample_date != ds2.metadata.sample_date)[0]
+        # The point is just to see if they are different here
+        assert len(diffs) == 6
+
 
 class TestDatasetAlignments:
 

diff --git a/tests/test_inference.py b/tests/test_inference.py
@@ -386,6 +386,20 @@ def test_2020_01_25(self, tmp_path, fx_ts_map, fx_dataset):
         }
         ts.tables.assert_equals(fx_ts_map["2020-01-25"].tables, ignore_provenance=True)
 
+    def test_using_collection_date(self, tmp_path, fx_ts_map, fx_dataset):
+
+        ts = run_extend(
+            dataset=fx_dataset,
+            base_ts=fx_ts_map[self.dates[0]],
+            date="2020-06-16",
+            match_db=sc2ts.MatchDb.initialise(tmp_path / "match.db"),
+            date_field="Collection_date",
+            hmm_cost_threshold=25,
+        )
+        assert ts.num_samples == 1
+        # This sample has an HMM cost of 21 (the threshold passed above is 25)
+        assert ts.metadata["sc2ts"]["samples_strain"] == ["SRR15736313"]
+
     @pytest.mark.parametrize("num_threads", [0, 1, 3, 10])
     def test_2020_02_02(self, tmp_path, fx_ts_map, fx_dataset, num_threads):
         ts = run_extend(