Skip to content

Commit

Permalink
Merge branch 'release/1.1.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
carjed committed Nov 8, 2018
2 parents 0b25d98 + 5b19dee commit 3ab80f5
Show file tree
Hide file tree
Showing 7 changed files with 261 additions and 10 deletions.
Binary file added assets/figS2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
101 changes: 101 additions & 0 deletions assets/randomly_generated_tumors.txt

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ optional arguments:
input is MAF file, specify column name of the grouping
variable to pool samples by. If left blank, matrix
will be constructed per sample/tumor ID as usual
-u, --impute if using VCF input mode, missing genotypes (i.e.,
"./.") will be imputed as the allele frequency of the
samples with non-missing genotypes
-C [INT], --minsnvs [INT]
minimum # of SNVs per individual to be included in
analysis. Default is 0.
Expand Down Expand Up @@ -136,6 +139,12 @@ In some cases it may be necessary (or desired) to run *Helmsman* with samples po

Note that the `--samplefile` option will operate as above--only samples present in this file will be considered when generating the mutation spectra matrix.

#### Impute missing genotypes

`--impute`

By default, when using VCF mode, missing genotypes (i.e., "./.") are coded as 0, so the mutation spectra matrix is not incremented for the samples missing a genotype at that site. The `--impute` option instead sets each missing genotype to the mean genotype value (i.e., the average number of alternate alleles per sample, sometimes loosely referred to as the allele frequency) of the samples with non-missing genotypes at that site. For example, for a given site, if there are 11 samples in the VCF file, with one sample missing a genotype, 5 samples homozygous for the reference allele (each coded as 0), and 5 heterozygous (each coded as 1), this option will impute the genotype for the missing sample as 5/10 = 0.5. Users should exercise caution when using this option, as assumptions about the allele frequency across samples may not be valid for VCF files containing heterogeneous tumor samples.

### MAF mode

`--mode maf --input /path/to/input.maf`
Expand Down
6 changes: 6 additions & 0 deletions helmsman.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,12 @@
type=str,
metavar='STR')

parser.add_argument("-u", "--impute",
help="if using VCF input mode, missing genotypes \
(i.e., \"./.\") will be imputed as the allele \
frequency of the samples with non-missing genotypes",
action="store_true")

#-----------------------------------------------------------------------------
# Pre-filtering args
#-----------------------------------------------------------------------------
Expand Down
41 changes: 41 additions & 0 deletions out/subtype_count_matrix.txt

Large diffs are not rendered by default.

41 changes: 41 additions & 0 deletions out/subtype_count_matrix_spectra.txt

Large diffs are not rendered by default.

73 changes: 63 additions & 10 deletions util.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@ def processVCF(args, inputvcf, subtypes_dict, par):
numsites_keep = 0
numsites_skip = 0
chrseq = '0'
chr_check = "none"

for record in vcf_reader:

Expand All @@ -283,10 +284,32 @@ def processVCF(args, inputvcf, subtypes_dict, par):
numsites_skip += 1
continue

row_chr = record.CHROM

# check chromosome formatting matches between MAF and fasta files
if numsites_keep == 0:
if "chr1" in fasta_reader and "chr" not in row_chr:
chr_check = "add"
util_log.debug("formatting mismatch: 'chr' only in fasta file")
elif "chr1" not in fasta_reader and "chr" in row_chr:
chr_check = "delete"
util_log.debug("formatting mismatch: 'chr' only in MAF file")
else:
util_log.debug("chromosome formatting matches")

if chr_check == "add":
row_chr = "chr" + row_chr
elif chr_check == "delete":
row_chr = row_chr.replace('chr', '')

if row_chr != chrseq:
sequence = fasta_reader[row_chr]
chrseq = row_chr

# check and update chromosome sequence
if record.CHROM != chrseq:
sequence = fasta_reader[record.CHROM]
chrseq = record.CHROM
# if record.CHROM != chrseq:
# sequence = fasta_reader[record.CHROM]
# chrseq = record.CHROM

lseq = sequence[record.POS-(nbp+1):record.POS+nbp].seq

Expand Down Expand Up @@ -320,7 +343,12 @@ def processVCF(args, inputvcf, subtypes_dict, par):

else:
gt_new = record.gt_types
gt_new[gt_new == 3] = 0
if (args.impute and 3 in gt_new):
gt_complete = gt_new[gt_new!=3]
freq = sum(gt_complete)/len(gt_complete)
gt_new[gt_new == 3] = freq
else:
gt_new[gt_new == 3] = 0
M[:,st] = M[:,st]+gt_new
numsites_keep += 1

Expand Down Expand Up @@ -356,18 +384,43 @@ def processMAF(args, subtypes_dict):

reader = csv.DictReader(filter(lambda row: row[0]!='#', f), delimiter='\t')
counter = 0
chr_check = "none"
for row in reader:

if(row['Variant_Type'] != "SNP"): continue

pos = int(row['Start_position'])
if(row['Variant_Type'] not in ["SNP", "SNV"]): continue

if 'Start_Position' in row:
pos = int(row['Start_Position'])
else:
pos = int(row['Start_position'])
ref = row['Reference_Allele']
alt = row['Tumor_Seq_Allele2']
row_chr = row['Chromosome']
sample = row[args.groupvar]

if row['Chromosome'] != chrseq:
sequence = fasta_reader[row['Chromosome']]
chrseq = row['Chromosome']
# check chromosome formatting matches between MAF and fasta files
if counter == 0:
if "chr1" in fasta_reader and "chr" not in row_chr:
chr_check = "add"
util_log.debug("formatting mismatch: 'chr' only in fasta file")
elif "chr1" not in fasta_reader and "chr" in row_chr:
chr_check = "delete"
util_log.debug("formatting mismatch: 'chr' only in MAF file")
else:
util_log.debug("chromosome formatting matches")

if chr_check == "add":
row_chr = "chr" + row_chr
elif chr_check == "delete":
row_chr = row_chr.replace('chr', '')

if row_chr != chrseq:
sequence = fasta_reader[row_chr]
chrseq = row_chr

# if row['Chromosome'] != chrseq:
# sequence = fasta_reader[row['Chromosome']]
# chrseq = row['Chromosome']

counter += 1
mu_type = ref + alt
Expand Down

0 comments on commit 3ab80f5

Please sign in to comment.