From b6dcd0c44a524e2bdae2bad8f6e31fe25dee752e Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 5 Nov 2024 07:17:45 -0800 Subject: [PATCH] docs: creating an `.hanc` file from a `.bp` file (#260) --- docs/api/data.rst | 2 ++ docs/api/examples.rst | 45 ++++++++++++++++++++++++++++++++++++++ haptools/data/genotypes.py | 3 ++- tests/data/simple.hanc | 10 +++++++++ tests/test_data.py | 28 ++++++++++++++++++++++++ 5 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 tests/data/simple.hanc diff --git a/docs/api/data.rst b/docs/api/data.rst index 4b8d7c81..f236af14 100644 --- a/docs/api/data.rst +++ b/docs/api/data.rst @@ -636,6 +636,8 @@ You'll have to call ``__iter()__`` manually if you want to specify any function for sample, blocks in breakpoints.__iter__(samples={"HG00097", "HG00099"}): print(sample, blocks) +.. _api-data-bp2anc: + Obtaining ancestral labels for a list of positions ************************************************** In the end, we're usually only interested in the ancestral labels of a set of variant positions, as a matrix of values. The ``population_array()`` method generates a numpy array denoting the ancestral label of each sample for each variant you specify. diff --git a/docs/api/examples.rst b/docs/api/examples.rst index 09882ce9..c6199adb 100644 --- a/docs/api/examples.rst +++ b/docs/api/examples.rst @@ -108,3 +108,48 @@ You can use the :ref:`data API ` and the :ref:`simphenotype API `. + +**Input:** + +* Breakpoints in a :ref:`.bp file ` +* A list of variants in a :ref:`PLINK2 PVAR file ` + +**Output:** + +* An ``.hanc`` per-site ancestry file as described in `the admix-simu documentation `_: + +.. include:: ../../tests/data/simple.hanc + :literal: + +.. code-block:: python + + import numpy as np + from pathlib import Path + from haptools import data + + output = Path("output.hanc") + + # load breakpoints from the bp file and encode each population label as an int + breakpoints = data.Breakpoints.load("tests/data/simple.bp") + breakpoints.encode() + print(breakpoints.labels) + + # load the SNPs array from a PVAR file + snps = data.GenotypesPLINK("tests/data/simple.pgen") + snps.read_variants() + snps = snps.variants[["chrom", "pos"]] + + # create array of per-site ancestry values + arr = breakpoints.population_array(variants=snps) + # reshape from n x p x 2 to n*2 x p + # so rows are haplotypes and columns are variants + arr = arr.transpose((0, 2, 1)).reshape(-1, arr.shape[1]) + + # write to haplotype ancestry file + np.savetxt(output, arr, fmt="%i", delimiter="") diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index afba858e..953d8210 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1236,7 +1236,8 @@ def read_variants( if variants is not None: max_variants = len(variants) if max_variants is None: - raise ValueError("Provide either the variants or max_variants parameter!") + p = pgenlib.PvarReader(bytes(str(self.fname.with_suffix(".pvar")), "utf8")) + max_variants = p.get_variant_ct() # first, preallocate the array and the indices of each variant self.variants = np.empty((max_variants,), dtype=self.variants.dtype) indices = np.empty((max_variants,), dtype=np.uint32) diff --git a/tests/data/simple.hanc b/tests/data/simple.hanc new file mode 100644 index 00000000..dd70bf02 --- /dev/null +++ b/tests/data/simple.hanc @@ -0,0 +1,10 @@ +0000 +1020 +0000 +0100 +0010 +0000 +2101 +0100 +0002 +0102 diff --git a/tests/test_data.py b/tests/test_data.py index 4c6efa21..d5d23fd8 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -2221,3 +2221,31 @@ def test_gts2hap(self): with open(DATADIR / "apoe.hap") as expected: assert hp_file.read() == expected.read() hp.fname.unlink() + + def test_bp2anc(self): + output = Path("output.hanc") + + # load breakpoints from the bp file and encode each population label as an int + breakpoints = Breakpoints.load(DATADIR / "simple.bp") + breakpoints.encode() + # print(breakpoints.labels) + + # load the SNPs array from a PVAR file + snps = GenotypesPLINK(DATADIR / "simple.pgen") + snps.read_variants() + snps = snps.variants[["chrom", "pos"]] + + # create array of per-site ancestry values + arr = breakpoints.population_array(variants=snps) + # reshape from n x p x 2 to n*2 x p + # so rows are haplotypes and columns are variants + arr = arr.transpose((0, 2, 1)).reshape(-1, arr.shape[1]) + + # write to haplotype ancestry file + np.savetxt(output, arr, fmt="%i", delimiter="") + + # validate the output and clean up afterwards + with open(output) as anc_file: + with open(DATADIR / "simple.hanc") as expected: + assert anc_file.read() == expected.read() + output.unlink()