feat: a new Haplotypes.merge() method (#256)

CAST-genomics · Oct 14, 2024 · cf4ccb2 · cf4ccb2
1 parent ab877f7
commit cf4ccb2
Show file tree

Hide file tree

Showing 4 changed files with 86 additions and 7 deletions.
diff --git a/.devcontainer.json b/.devcontainer.json
@@ -5,10 +5,7 @@
 	// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
 	"image": "mcr.microsoft.com/devcontainers/base:jammy",
 	"features": {
-		"ghcr.io/rocker-org/devcontainer-features/miniforge:1": {
-			"version": "latest",
-			"variant": "Miniforge"
-		}
+		"ghcr.io/rocker-org/devcontainer-features/miniforge:2": {}
 	},
 
 	// Features to add to the dev container. More info: https://containers.dev/features.

diff --git a/docs/api/data.rst b/docs/api/data.rst
@@ -350,6 +350,23 @@ Using the ``transform()`` function, you can obtain a full instance of the :class
 	hap_gts = haplotypes.transform(genotypes)
 	hap_gts   # a GenotypesVCF instance where haplotypes are variants
 
+Subsetting and merging
+**********************
+If you want to keep only a few haplotypes from an existing Haplotypes object, you can pass a tuple of haplotype IDs to the ``subset()`` method:
+
+.. code-block:: python
+
+	haplotypes = data.Haplotypes.load('tests/data/basic.hap')
+	haplotypes = haplotypes.subset(haplotypes=("chr21.q.3365*1",))
+
+You can also merge multiple Haplotypes objects using the ``merge()`` class method:
+
+.. code-block:: python
+
+	haps1 = data.Haplotypes.load('tests/data/basic.hap')
+	haps2 = data.Haplotypes.load('tests/data/simple.hap')
+	haplotypes = Haplotypes.merge((haps1, haps2), fname='new.hap')
+
 Haplotype
 +++++++++
 The :class:`Haplotype` class stores haplotype lines from the **.hap** file. Each property in the object is a field in the line. A separate ``variants`` property stores a tuple of :class:`Variant` objects belonging to this haplotype.

diff --git a/haptools/data/haplotypes.py b/haptools/data/haplotypes.py
@@ -345,6 +345,11 @@ class Haplotype:
     variants: tuple = field(default_factory=tuple, init=False)
     _extras: tuple = field(default=tuple(), init=False, repr=False)
 
+    def __len__(self):
+        if self.variants is not None:
+            return len(self.variants)
+        return 0
+
     @property
     def ID(self):
         """
@@ -773,6 +778,11 @@ def __init__(
         self.types = {"H": haplotype, "V": variant, "R": repeat}
         self.type_ids = None
 
+    def __len__(self):
+        if self.data is not None:
+            return len(self.data)
+        return 0
+
     @classmethod
     def load(
         cls: Haplotypes,
@@ -1451,3 +1461,42 @@ def subset(self, haplotypes: tuple[str], inplace: bool = False):
         hps.index(force=True)
         if not inplace:
             return hps
+
+    @classmethod
+    def merge(cls, objs: tuple[Haplotypes], **kwargs) -> Haplotypes:
+        """
+        Merge Haplotypes objects with different sets of haplotypes together
+
+        .. note::
+            The input Haplotype instances are expected to have distinct indices and
+            must all be of the same type
+
+        Parameters
+        ----------
+        objs: tuple[Haplotypes]
+            The objects that should be merged together
+        **kwargs
+            Any parameters to pass to :py:meth:`~.Haplotypes._init__`
+
+        Raises
+        ------
+        ValueError
+            If the set of indices among the haplotypes is not distinct
+
+        Returns
+        -------
+        Haplotypes
+            A new object containing all of the Haplotypes from each input object
+        """
+        hps = cls(**kwargs)
+        hps.data = {}
+        for haps in objs:
+            for hap in haps.data.values():
+                if hap.id in hps.data:
+                    raise ValueError(
+                        "Failed to merge haplotype objects because their indices are "
+                        "not distinct"
+                    )
+                hps.data[hap.id] = hap
+        hps.index()
+        return hps
diff --git a/tests/test_data.py b/tests/test_data.py
@@ -1195,11 +1195,14 @@ def test_load(self):
         # can we load this data from the hap file?
         haps = Haplotypes.load(DATADIR / "basic.hap")
         assert expected == haps.data
+        assert len(haps) == len(expected)
+        assert len(haps.data["chr21.q.3365*1"]) == 4
 
         # also check the indexed file
         # it should be the same
         haps = Haplotypes.load(DATADIR / "basic.hap.gz")
         assert expected == haps.data
+        assert len(haps) == len(expected)
 
     def test_load_no_header(self):
         expected = self._basic_haps()
@@ -1380,11 +1383,10 @@ def test_read_subset(self):
         assert expected == haps.data
 
     def test_subset(self):
-        expected = Haplotypes(DATADIR / "basic.hap")
+        expected = Haplotypes.load(DATADIR / "basic.hap")
         expected.read(haplotypes={"chr21.q.3365*1"})
 
-        haps = Haplotypes(DATADIR / "basic.hap")
-        haps.read()
+        haps = Haplotypes.load(DATADIR / "basic.hap")
         haps = haps.subset(haplotypes=("chr21.q.3365*1",))
 
         assert len(expected.data) == len(haps.data)
@@ -1717,6 +1719,20 @@ def test_sort(self):
 
         test_hap1.fname.unlink()
 
+    def test_merge(self):
+        haps1 = Haplotypes.load(DATADIR / "basic.hap")
+        haps2 = Haplotypes.load(DATADIR / "example.hap.gz")
+        # should raise value error because indices aren't distinct
+        with pytest.raises(ValueError) as info:
+            Haplotypes.merge((haps1, haps2), fname="new.hap")
+        haps2 = Haplotypes.load(DATADIR / "simple.hap")
+        haplotypes = Haplotypes.merge((haps1, haps2), fname="new.hap")
+        # now check that they got merged
+        haps1_vals = haps1.data.values()
+        haps2_vals = haps2.data.values()
+        for hp in haplotypes.data.values():
+            assert (hp in haps1_vals) or (hp in haps2_vals)
+
 
 class TestGenotypesVCF:
     def _get_fake_genotypes_refalt(self, with_phase=False):