Skip to content

Commit

Permalink
feat: a new Haplotypes.merge() method (#256)
Browse files Browse the repository at this point in the history
  • Loading branch information
aryarm authored Oct 14, 2024
1 parent ab877f7 commit cf4ccb2
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 7 deletions.
5 changes: 1 addition & 4 deletions .devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,7 @@
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
"image": "mcr.microsoft.com/devcontainers/base:jammy",
"features": {
"ghcr.io/rocker-org/devcontainer-features/miniforge:1": {
"version": "latest",
"variant": "Miniforge"
}
"ghcr.io/rocker-org/devcontainer-features/miniforge:2": {}
},

// Features to add to the dev container. More info: https://containers.dev/features.
Expand Down
17 changes: 17 additions & 0 deletions docs/api/data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,23 @@ Using the ``transform()`` function, you can obtain a full instance of the :class
hap_gts = haplotypes.transform(genotypes)
hap_gts # a GenotypesVCF instance where haplotypes are variants
Subsetting and merging
**********************
If you want to keep only a few haplotypes from an existing Haplotypes object, you can pass a tuple of haplotype IDs to the ``subset()`` method:

.. code-block:: python
haplotypes = data.Haplotypes.load('tests/data/basic.hap')
haplotypes = haplotypes.subset(haplotypes=("chr21.q.3365*1",))
You can also merge multiple Haplotypes objects using the ``merge()`` class method:

.. code-block:: python
haps1 = data.Haplotypes.load('tests/data/basic.hap')
haps2 = data.Haplotypes.load('tests/data/simple.hap')
haplotypes = Haplotypes.merge((haps1, haps2), fname='new.hap')
Haplotype
+++++++++
The :class:`Haplotype` class stores haplotype lines from the **.hap** file. Each property in the object is a field in the line. A separate ``variants`` property stores a tuple of :class:`Variant` objects belonging to this haplotype.
Expand Down
49 changes: 49 additions & 0 deletions haptools/data/haplotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,11 @@ class Haplotype:
variants: tuple = field(default_factory=tuple, init=False)
_extras: tuple = field(default=tuple(), init=False, repr=False)

def __len__(self):
if self.variants is not None:
return len(self.variants)
return 0

@property
def ID(self):
"""
Expand Down Expand Up @@ -773,6 +778,11 @@ def __init__(
self.types = {"H": haplotype, "V": variant, "R": repeat}
self.type_ids = None

def __len__(self):
if self.data is not None:
return len(self.data)
return 0

@classmethod
def load(
cls: Haplotypes,
Expand Down Expand Up @@ -1451,3 +1461,42 @@ def subset(self, haplotypes: tuple[str], inplace: bool = False):
hps.index(force=True)
if not inplace:
return hps

@classmethod
def merge(cls, objs: tuple[Haplotypes], **kwargs) -> Haplotypes:
"""
Merge Haplotypes objects with different sets of haplotypes together
.. note::
The input Haplotype instances are expected to have distinct indices and
must all be of the same type
Parameters
----------
objs: tuple[Haplotypes]
The objects that should be merged together
**kwargs
Any parameters to pass to :py:meth:`~.Haplotypes._init__`
Raises
------
ValueError
If the set of indices among the haplotypes is not distinct
Returns
-------
Haplotypes
A new object containing all of the Haplotypes from each input object
"""
hps = cls(**kwargs)
hps.data = {}
for haps in objs:
for hap in haps.data.values():
if hap.id in hps.data:
raise ValueError(
"Failed to merge haplotype objects because their indices are "
"not distinct"
)
hps.data[hap.id] = hap
hps.index()
return hps
22 changes: 19 additions & 3 deletions tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -1195,11 +1195,14 @@ def test_load(self):
# can we load this data from the hap file?
haps = Haplotypes.load(DATADIR / "basic.hap")
assert expected == haps.data
assert len(haps) == len(expected)
assert len(haps.data["chr21.q.3365*1"]) == 4

# also check the indexed file
# it should be the same
haps = Haplotypes.load(DATADIR / "basic.hap.gz")
assert expected == haps.data
assert len(haps) == len(expected)

def test_load_no_header(self):
expected = self._basic_haps()
Expand Down Expand Up @@ -1380,11 +1383,10 @@ def test_read_subset(self):
assert expected == haps.data

def test_subset(self):
expected = Haplotypes(DATADIR / "basic.hap")
expected = Haplotypes.load(DATADIR / "basic.hap")
expected.read(haplotypes={"chr21.q.3365*1"})

haps = Haplotypes(DATADIR / "basic.hap")
haps.read()
haps = Haplotypes.load(DATADIR / "basic.hap")
haps = haps.subset(haplotypes=("chr21.q.3365*1",))

assert len(expected.data) == len(haps.data)
Expand Down Expand Up @@ -1717,6 +1719,20 @@ def test_sort(self):

test_hap1.fname.unlink()

def test_merge(self):
haps1 = Haplotypes.load(DATADIR / "basic.hap")
haps2 = Haplotypes.load(DATADIR / "example.hap.gz")
# should raise value error because indices aren't distinct
with pytest.raises(ValueError) as info:
Haplotypes.merge((haps1, haps2), fname="new.hap")
haps2 = Haplotypes.load(DATADIR / "simple.hap")
haplotypes = Haplotypes.merge((haps1, haps2), fname="new.hap")
# now check that they got merged
haps1_vals = haps1.data.values()
haps2_vals = haps2.data.values()
for hp in haplotypes.data.values():
assert (hp in haps1_vals) or (hp in haps2_vals)


class TestGenotypesVCF:
def _get_fake_genotypes_refalt(self, with_phase=False):
Expand Down

0 comments on commit cf4ccb2

Please sign in to comment.