From b6dcd0c44a524e2bdae2bad8f6e31fe25dee752e Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Tue, 5 Nov 2024 07:17:45 -0800
Subject: [PATCH] docs: creating an `.hanc` file from a `.bp` file (#260)

---
 docs/api/data.rst          |  2 ++
 docs/api/examples.rst      | 45 ++++++++++++++++++++++++++++++++++++++
 haptools/data/genotypes.py |  3 ++-
 tests/data/simple.hanc     | 10 +++++++++
 tests/test_data.py         | 28 ++++++++++++++++++++++++
 5 files changed, 87 insertions(+), 1 deletion(-)
 create mode 100644 tests/data/simple.hanc
diff --git a/docs/api/data.rst b/docs/api/data.rst
index 4b8d7c81..f236af14 100644
--- a/docs/api/data.rst
+++ b/docs/api/data.rst
@@ -636,6 +636,8 @@ You'll have to call ``__iter()__`` manually if you want to specify any function
 	for sample, blocks in breakpoints.__iter__(samples={"HG00097", "HG00099"}):
 	    print(sample, blocks)
 
+.. _api-data-bp2anc:
+
 Obtaining ancestral labels for a list of positions
 **************************************************
 In the end, we're usually only interested in the ancestral labels of a set of variant positions, as a matrix of values. The ``population_array()`` method generates a numpy array denoting the ancestral label of each sample for each variant you specify.
diff --git a/docs/api/examples.rst b/docs/api/examples.rst
index 09882ce9..c6199adb 100644
--- a/docs/api/examples.rst
+++ b/docs/api/examples.rst
@@ -108,3 +108,48 @@ You can use the :ref:`data API <api-data>` and the :ref:`simphenotype API <api-h
         hp.data[ID].variants = (data.Variant(start=pos, end=end, id=ID, allele=allele),)
 
     hp.write()
+
+.. _api-examples-bp2anc:
+
+Converting a ``.bp`` file into a ``.hanc`` per-site ancestry file
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+You can obtain the ancestry of a list of variants directly from a ``.bp`` file using the :ref:`data API <api-data-bp2anc>`.
+
+**Input:**
+
+* Breakpoints in a :ref:`.bp file <formats-breakpoints>`
+* A list of variants in a :ref:`PLINK2 PVAR file <formats-genotypesplink>`
+
+**Output:**
+
+* An ``.hanc`` per-site ancestry file as described in `the admix-simu documentation <https://github.com/williamslab/admix-simu/tree/master?tab=readme-ov-file#per-site-ancestry-values>`_:
+
+.. include:: ../../tests/data/simple.hanc
+  :literal:
+
+.. code-block:: python
+
+    import numpy as np
+    from pathlib import Path
+    from haptools import data
+
+    output = Path("output.hanc")
+
+    # load breakpoints from the bp file and encode each population label as an int
+    breakpoints = data.Breakpoints.load("tests/data/simple.bp")
+    breakpoints.encode()
+    print(breakpoints.labels)
+
+    # load the SNPs array from a PVAR file
+    snps = data.GenotypesPLINK("tests/data/simple.pgen")
+    snps.read_variants()
+    snps = snps.variants[["chrom", "pos"]]
+
+    # create array of per-site ancestry values
+    arr = breakpoints.population_array(variants=snps)
+    # reshape from n x p x 2 to n*2 x p
+    # so rows are haplotypes and columns are variants
+    arr = arr.transpose((0, 2, 1)).reshape(-1, arr.shape[1])
+
+    # write to haplotype ancestry file
+    np.savetxt(output, arr, fmt="%i", delimiter="")
diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index afba858e..953d8210 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -1236,7 +1236,8 @@ def read_variants(
         if variants is not None:
             max_variants = len(variants)
         if max_variants is None:
-            raise ValueError("Provide either the variants or max_variants parameter!")
+            p = pgenlib.PvarReader(bytes(str(self.fname.with_suffix(".pvar")), "utf8"))
+            max_variants = p.get_variant_ct()
         # first, preallocate the array and the indices of each variant
         self.variants = np.empty((max_variants,), dtype=self.variants.dtype)
         indices = np.empty((max_variants,), dtype=np.uint32)
diff --git a/tests/data/simple.hanc b/tests/data/simple.hanc
new file mode 100644
index 00000000..dd70bf02
--- /dev/null
+++ b/tests/data/simple.hanc
@@ -0,0 +1,10 @@
+0000
+1020
+0000
+0100
+0010
+0000
+2101
+0100
+0002
+0102
diff --git a/tests/test_data.py b/tests/test_data.py
index 4c6efa21..d5d23fd8 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -2221,3 +2221,31 @@ def test_gts2hap(self):
             with open(DATADIR / "apoe.hap") as expected:
                 assert hp_file.read() == expected.read()
         hp.fname.unlink()
+
+    def test_bp2anc(self):
+        output = Path("output.hanc")
+
+        # load breakpoints from the bp file and encode each population label as an int
+        breakpoints = Breakpoints.load(DATADIR / "simple.bp")
+        breakpoints.encode()
+        # print(breakpoints.labels)
+
+        # load the SNPs array from a PVAR file
+        snps = GenotypesPLINK(DATADIR / "simple.pgen")
+        snps.read_variants()
+        snps = snps.variants[["chrom", "pos"]]
+
+        # create array of per-site ancestry values
+        arr = breakpoints.population_array(variants=snps)
+        # reshape from n x p x 2 to n*2 x p
+        # so rows are haplotypes and columns are variants
+        arr = arr.transpose((0, 2, 1)).reshape(-1, arr.shape[1])
+
+        # write to haplotype ancestry file
+        np.savetxt(output, arr, fmt="%i", delimiter="")
+
+        # validate the output and clean up afterwards
+        with open(output) as anc_file:
+            with open(DATADIR / "simple.hanc") as expected:
+                assert anc_file.read() == expected.read()
+        output.unlink()