diff --git a/include/genotyper.h b/include/genotyper.h index a0ffd89b..dda83a3c 100644 --- a/include/genotyper.h +++ b/include/genotyper.h @@ -36,6 +36,7 @@ enum class NoCallReason { OverlappingVariants, /// site spans multiple variants which overlap each other MonoallelicSite, /// site is monoallelic; no assertion about the presence of either ref or alt allele InputNonCalled, /// the relevant input gVCF record is itself non-called + HaploidCall, /// genotype consists of the only allele }; /// A single allele call and metadata; diploid samples each have two calls diff --git a/src/genotyper.cc b/src/genotyper.cc index 15419dff..abd81390 100644 --- a/src/genotyper.cc +++ b/src/genotyper.cc @@ -96,6 +96,11 @@ static inline bool is_deletion(const string& ref, const string& alt) { Status revise_genotypes(const genotyper_config& cfg, const unified_site& us, const map& sample_mapping, const bcf_hdr_t* hdr, bcf1_t_plus& vr) { + // Skip revision of genotypes for haploid variants. It breaks analysis as + // there're only `n_allele` PL values, not `n_allele * (n_allele + 1) / 2` + // values as in diploid variants. + if (vr.was_haploid) + return Status::OK(); assert(!vr.is_ref); // Speed optimization: our prior on genotypes will be effectively flat // if there are no lost ALT alleles or homozygous-ALT genotypes called, so @@ -510,6 +515,10 @@ static Status translate_genotypes(const genotyper_config& cfg, const unified_sit #define fill_allele(rec,depth,in_ofs,out_ofs) \ assert(rec); \ genotypes[2*ij.second+(out_ofs)].RNC = NoCallReason::InputNonCalled; \ + if (rec->was_haploid && \ + bcf_gt_is_missing(rec->gt[2*ij.first+(in_ofs)])) { \ + genotypes[2*ij.second+(out_ofs)].RNC = NoCallReason::HaploidCall; \ + } \ if (rec->gt[2*ij.first+in_ofs] != bcf_int32_vector_end && \ !bcf_gt_is_missing(rec->gt[2*ij.first+(in_ofs)])) { \ auto al = bcf_gt_allele(rec->gt[2*ij.first+(in_ofs)]); \ @@ -632,6 +641,10 @@ static Status translate_monoallelic(const genotyper_config& cfg, const unified_s #define fill_monoallelic(ofs) \ genotypes[2*ij.second+(ofs)].RNC = NoCallReason::InputNonCalled; \ + if (record->was_haploid && \ + bcf_gt_is_missing(record->gt[2*ij.first+(ofs)])) { \ + genotypes[2*ij.second+(ofs)].RNC = NoCallReason::HaploidCall; \ + } \ if (record->gt[2*ij.first+ofs] != bcf_int32_vector_end && \ !bcf_gt_is_missing(record->gt[2*ij.first+(ofs)])) { \ auto al = bcf_gt_allele(record->gt[2*ij.first+(ofs)]); \ @@ -840,7 +853,8 @@ Status genotype_site(const genotyper_config& cfg, MetadataCache& cache, BCFData& // Clean up emission order of alleles for(size_t i=0; i < samples.size(); i++) { if ((genotypes[2*i].allele != bcf_gt_missing && genotypes[2*i+1].allele == bcf_gt_missing) || - genotypes[2*i].allele > genotypes[2*i+1].allele) { + genotypes[2*i].allele > genotypes[2*i+1].allele || + (genotypes[2*i].allele == bcf_gt_missing && genotypes[2*i].RNC == NoCallReason::HaploidCall)) { swap(genotypes[2*i], genotypes[2*i+1]); } } @@ -920,6 +934,7 @@ Status genotype_site(const genotyper_config& cfg, MetadataCache& cache, BCFData& RNC_CASE(OverlappingVariants,"O") RNC_CASE(MonoallelicSite,"1") RNC_CASE(InputNonCalled, "I") + RNC_CASE(HaploidCall, "H") default: assert(c.RNC == NoCallReason::MissingData); } diff --git a/src/genotyper_utils.h b/src/genotyper_utils.h index b4698a25..654a0b3d 100644 --- a/src/genotyper_utils.h +++ b/src/genotyper_utils.h @@ -957,6 +957,20 @@ Status update_format_fields(const genotyper_config& cfg, const string& dataset, records_to_use = &all_records; } + bool has_haploid_records = false; + for (const auto& record : *records_to_use) { + if (record->was_haploid) { + has_haploid_records = true; + break; + } + } + if ((format_helper->field_info.number == RetainedFieldNumber::GENOTYPE) && has_haploid_records) { + for (const auto& p : sample_mapping) { + S(format_helper->censor(p.second, false)); + } + continue; + } + for (const auto& record : *records_to_use) { s = format_helper->add_record_data(dataset, dataset_header, record->p.get(), sample_mapping, record->allele_mapping, site.alleles.size());