Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Local alleles #266

Merged
merged 13 commits into from
Jul 10, 2024
Prev Previous commit
Next Next commit
Only add LAA if PL is present
Will-Tyler committed Jul 10, 2024
commit 8951cab13f6f54d7c379e0557e5083e78ca4c9a1
16 changes: 10 additions & 6 deletions bio2zarr/vcf2zarr/icf.py
Original file line number Diff line number Diff line change
@@ -239,6 +239,7 @@ def scan_vcf(path, target_num_partitions, *, local_alleles):
# Indicates whether vcf2zarr can introduce local alleles
can_localize = False
should_add_laa_field = True
has_PL = False
fields = fixed_vcf_field_definitions()
for h in vcf.header_iter():
if h["HeaderType"] in ["INFO", "FORMAT"]:
@@ -247,12 +248,15 @@ def scan_vcf(path, target_num_partitions, *, local_alleles):
field.vcf_type = "Integer"
field.vcf_number = "."
fields.append(field)
if field.category == "FORMAT" and field.name in {"GT", "AD"}:
can_localize = True
if (field.category, field.name) == ("FORMAT", "LAA"):
should_add_laa_field = False

if local_alleles and can_localize and should_add_laa_field:
if field.category == "FORMAT":
if field.name in {"GT", "AD", "PL"}:
can_localize = True
if field.name == "PL":
has_PL = True
if field.name == "LAA":
should_add_laa_field = False

if local_alleles and can_localize and should_add_laa_field and has_PL:
laa_field = VcfField(
category="FORMAT",
name="LAA",
50 changes: 38 additions & 12 deletions tests/test_icf.py
Original file line number Diff line number Diff line change
@@ -27,11 +27,6 @@ def icf(self, tmp_path_factory):
out = tmp_path_factory.mktemp("data") / "example.exploded"
return vcf2zarr.explode(out, [self.data_path], local_alleles=False)

@pytest.fixture(scope="class")
def icf_local_alleles(self, tmp_path_factory):
out = tmp_path_factory.mktemp("data") / "example.exploded"
return vcf2zarr.explode(out, [self.data_path])

def test_format_version(self, icf):
assert icf.metadata.format_version == icf_mod.ICF_METADATA_FORMAT_VERSION

@@ -54,13 +49,6 @@ def test_summary_table(self, icf):
fields = [d["name"] for d in data]
assert tuple(sorted(fields)) == self.fields

def test_summary_table_local_allleles(self, icf_local_alleles):
data = icf_local_alleles.summary_table()
fields = [d["name"] for d in data]
fields.sort()
expected = tuple(sorted((*self.fields, "FORMAT/LAA")))
assert tuple(fields) == expected

def test_inspect(self, icf):
assert icf.summary_table() == vcf2zarr.inspect(icf.path)

@@ -103,6 +91,44 @@ def test_INFO_NS(self, icf):
assert icf["INFO/NS"].values == [None, None, 3, 3, 2, 3, 3, None, None]


class TestLocalAllelesExample:
data_path = "tests/data/vcf/local_alleles.vcf.gz"

fields = (
"ALT",
"CHROM",
"FILTERS",
"FORMAT/AD",
"FORMAT/DP",
"FORMAT/GQ",
"FORMAT/GT",
"FORMAT/LAA",
"FORMAT/PL",
"ID",
"INFO/AA",
"INFO/AC",
"INFO/AF",
"INFO/AN",
"INFO/DB",
"INFO/DP",
"INFO/H2",
"INFO/NS",
"POS",
"QUAL",
"REF",
)

@pytest.fixture(scope="class")
def icf(self, tmp_path_factory):
out = tmp_path_factory.mktemp("data") / "example.exploded"
return vcf2zarr.explode(out, [self.data_path])

def test_summary_table(self, icf):
data = icf.summary_table()
fields = [d["name"] for d in data]
assert tuple(sorted(fields)) == self.fields


class TestIcfWriterExample:
data_path = "tests/data/vcf/sample.vcf.gz"

14 changes: 2 additions & 12 deletions tests/test_vcf_examples.py
Original file line number Diff line number Diff line change
@@ -233,18 +233,8 @@ def test_call_HQ(self, ds):
nt.assert_array_equal(ds["call_HQ"], call_HQ)

def test_call_LAA(self, ds):
call_LAA = [
[[-2, -2], [-2, -2], [1, -2]],
[[-2, -2], [-2, -2], [1, -2]],
[[-2, -2], [1, -2], [1, -2]],
[[-2, -2], [1, -2], [-2, -2]],
[[1, 2], [1, 2], [2, -2]],
[[-2, -2], [-2, -2], [-2, -2]],
[[1, -2], [2, -2], [-2, -2]],
[[-2, -2], [-2, -2], [-2, -2]],
[[-2, -2], [1, -2], [2, -2]],
]
nt.assert_array_equal(ds["call_LAA"], call_LAA)
# The small example VCF does not have a PL field
assert "call_LA" not in ds

def test_no_genotypes(self, ds, tmp_path):
path = "tests/data/vcf/sample_no_genotypes.vcf.gz"