Skip to content

Commit

Permalink
Merge branch 'release/1.4.1'
Browse files Browse the repository at this point in the history
  • Loading branch information
carjed committed Jun 24, 2019
2 parents 3203f56 + 08b3060 commit e34a3c9
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 10 deletions.
2 changes: 1 addition & 1 deletion env.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ dependencies:
- matplotlib
- biopython=1.71
- colorama
- cyvcf2=0.8
- cyvcf2=0.11
- pyfaidx=0.5.4
- joblib=0.11
# - gcc_linux-64
Expand Down
48 changes: 39 additions & 9 deletions util.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,15 @@ def process_vcf(self, inputfile):

sg_dict = indexGroups(self.args.samplefile, self.args.groupvar)
samples = sorted(list(set(sg_dict.values())))

# get boolean vector of samples that are in sample file
samples_keep_match = np.isin(all_samples, list(sg_dict.keys()))

# get indices of matching samples
samples_keep_idx = np.where(samples_keep_match)

# get list of individual sample ids to keep
samples_keep = sorted(list(set(sg_dict.keys())))

util_log.debug("%s samples will be pooled into %s groups: %s",
len(all_samples), len(samples), ",".join(samples))
Expand Down Expand Up @@ -411,19 +420,40 @@ def process_vcf(self, inputfile):

# currently only works with singletons--
if (self.args.samplefile and self.args.groupvar):

gt_new = record.gt_types

if record.gt_types.sum() == 0:
numsites_skip += 1
continue
if (self.args.impute and 3 in gt_new):
gt_complete = gt_new[gt_new != 3]
freq = sum(gt_complete) / len(gt_complete)
gt_new[gt_new == 3] = freq

carrier = all_samples[record.gt_types.tolist().index(1)]
if carrier not in sg_dict:
else:
gt_new[gt_new == 3] = 0

# if not any("/" in b for b in record.gt_bases):
if self.args.haploid:
gt_new = np.divide(gt_new, 2.)

# get array of genotypes only for samples in samplefile
gt_sub = gt_new[samples_keep_idx]

if gt_sub.sum() == 0:
numsites_skip += 1
continue

sample_gp = sg_dict[carrier]
ind = samples.index(sample_gp)
M[ind, st] += 1

# initialize dict of group allele counts = 0
sg_counts = {k: 0 for k in sorted(list(set(sg_dict.values())))}

# initialize dict of allele counts per sample
d2 = dict(zip(samples_keep, gt_sub))

# iterate per-sample counts and update per-group counts
for key, value in d2.items():
sg_counts[sg_dict[key]] += value

# add to matrix
M[:, st] = M[:, st] + list(sg_counts.values())
numsites_keep += 1

else:
Expand Down

0 comments on commit e34a3c9

Please sign in to comment.