Skip to content

Commit

Permalink
Merge branch 'release/1.4.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
carjed committed May 28, 2019
2 parents 2ab6f7b + 2dd7670 commit 3203f56
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 23 deletions.
2 changes: 1 addition & 1 deletion docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ Note that the `chisq` outlier detection mode is generally more robust to low-SNV

`--maxac [INT]`

By default, *Helmsman* will evaluate only singleton SNVs (`--maxac 1`). To include more common SNVs, increase the value of this parameter. To include all SNVs, use `--maxac 0`. This is not recommended when using *Helmsman* for QC functionality, but may be useful for more general mutation signature analysis.
By default, *Helmsman* will evaluate all SNVs present in the input data [`--maxac 0`]. To include more common SNVs, up to a specific allele count threshold, increase the value of this parameter. For example, `--maxac 2` will include only singletons and doubletons.

## Output options

Expand Down
18 changes: 14 additions & 4 deletions helmsman.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,12 @@ def main():
# get latest version from github tags
# via https://stackoverflow.com/questions/14989858
try:
version = subprocess.check_output(["git",
"describe"]).strip().decode('utf-8')
v_dir = os.path.dirname(os.path.realpath(__file__)) + "/.git/refs/tags"
files = os.listdir(v_dir)
files = [os.path.join(v_dir, f) for f in files] # add path to each file
files.sort(key=lambda x: os.path.getmtime(x))
version = files[-1]
version = os.path.basename(version)
except AttributeError:
version = "[version not found]"

Expand Down Expand Up @@ -277,8 +281,14 @@ def main():

log.info("----------------------------------")
try:
version = subprocess.check_output(["git",
"describe"]).strip().decode('utf-8')
# version = subprocess.check_output(["git",
# "describe"]).strip().decode('utf-8')
v_dir = os.path.dirname(os.path.realpath(__file__)) + "/.git/refs/tags"
files = os.listdir(v_dir)
files = [os.path.join(v_dir, f) for f in files] # add path to each file
files.sort(key=lambda x: os.path.getmtime(x))
version = files[-1]
version = os.path.basename(version)
log.info("%s %s", sys.argv[0], version)
except AttributeError:
version = "[version not found]"
Expand Down
90 changes: 72 additions & 18 deletions util.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,10 @@ def get_logger(name=None,
# Manipulate sequence motifs etc.
###############################################################################
def getCategory(mu_type):
""" collapse mutation types per strand symmetry """
"""
collapse mutation types per strand symmetry
"""

# if re.match("^[ACGT]*$", mu_type):
if mu_type in ('AC', 'TG'):
Expand All @@ -123,7 +126,10 @@ def getCategory(mu_type):


def getMotif(sequence):
""" query reference genome for local sequence motif """
"""
query reference genome for local sequence motif
"""

motif = Seq(sequence, IUPAC.unambiguous_dna)
altmotif = motif.reverse_complement()
Expand All @@ -140,7 +146,11 @@ def getMotif(sequence):


def indexSubtypes(motiflength):
""" define k-mer mutation subtypes """
"""
define k-mer mutation subtypes
"""

categories = ["T_G", "T_C", "T_A", "C_G", "C_T", "C_A"]
bases = ["A", "C", "G", "T"]
flank = (motiflength - 1) // 2
Expand Down Expand Up @@ -178,7 +188,11 @@ def indexSubtypes(motiflength):


def indexGroups(samplefile, groupvar):
""" Build dictionary with sample ID as key, group ID as value """
"""
Build dictionary with sample ID as key, group ID as value
"""

sg_dict = {}

f = open(samplefile, 'r', encoding="utf-8")
Expand All @@ -191,7 +205,11 @@ def indexGroups(samplefile, groupvar):


def get_samples(sample_file):
""" get samples from input M matrix when using aggregation mode """
"""
get samples from input M matrix when using aggregation mode
"""

samples = np.loadtxt(
sample_file, dtype='S120', skiprows=1, delimiter='\t', usecols=(0, ))

Expand All @@ -201,7 +219,11 @@ def get_samples(sample_file):


def parseSampleFile(samplefile):
""" get list of samples to keep if samplefile supplied """
"""
get list of samples to keep if samplefile supplied
"""

# f = open(args.input, 'r', encoding = "ISO-8859-1")
f = open(samplefile, 'r', encoding="utf-8")
reader = csv.DictReader(f, delimiter='\t')
Expand All @@ -213,7 +235,11 @@ def parseSampleFile(samplefile):


def get_samples_vcf(args, inputvcf):
""" get samples from VCF file """
"""
get samples from VCF file
"""

if args.samplefile:
keep_samples = parseSampleFile(args.samplefile)
vcf_reader = VCF(
Expand All @@ -235,6 +261,7 @@ class processInput:
- MAF format
- plain text format
- Aggregation of existing subtype count matrices
"""

def __init__(self, mode, args, subtypes_dict, par=False):
Expand Down Expand Up @@ -268,7 +295,7 @@ def __init__(self, mode, args, subtypes_dict, par=False):

samples = np.array([])
for vcf in vcf_list:
samples = np.append(samples, self.get_samples_vcf(vcf))
samples = np.append(samples, get_samples_vcf(args, vcf))

else:
nrow, ncol = results[1].shape
Expand All @@ -282,7 +309,10 @@ def __init__(self, mode, args, subtypes_dict, par=False):
count_matrix, samples)

def process_vcf(self, inputfile):
""" Main function for parsing VCF """
"""
Main function for parsing VCF
"""
# initialize reference genome
fasta_reader = Fasta(self.args.fastafile, read_ahead=1000000)

Expand Down Expand Up @@ -428,7 +458,11 @@ def process_vcf(self, inputfile):
return out

def process_maf(self):
""" process MAF files """
"""
process MAF files
"""

fasta_reader = Fasta(self.args.fastafile, read_ahead=1000000)

nbp = (self.args.length - 1) // 2
Expand Down Expand Up @@ -513,7 +547,11 @@ def process_maf(self):
return out

def process_agg(self):
""" aggregate M matrices from list of input files """
"""
aggregate M matrices from list of input files
"""

inputM = self.args.input
colnames = ["ID"]
M_colnames = colnames + list(sorted(self.subtypes_dict.keys()))
Expand Down Expand Up @@ -564,6 +602,7 @@ def process_txt(self):
"""
process tab-delimited text file, containing the following columns:
CHR POS REF ALT SAMPLE_ID
"""

fasta_reader = Fasta(self.args.fastafile, read_ahead=1000000)
Expand Down Expand Up @@ -617,7 +656,10 @@ def process_txt(self):


class DecompModel:
""" Class for fitting PCA or NMF models """
"""
Class for fitting PCA and NMF models
"""

def __init__(self, M_run, rank, seed, decomp):
self.M_run = M_run / (M_run.sum(axis=1) + 1e-8)[:, None]
Expand Down Expand Up @@ -665,21 +707,21 @@ def __init__(self, M_run, rank, seed, decomp):
elif self.decomp == "nmf":

if self.rank > 0:
model = self.NMFmod(self.rank)
model = self.run_nmf_model(self.rank)
self.modrank = self.rank

elif self.rank == 0:
util_log.debug("Finding optimal rank for %s decomposition",
decomp)
self.evarprev = 0
for i in range(1, self.M_run.shape[0]):
model = self.NMFmod(rank=i)
model = self.run_nmf_model(rank=i)
model_fit = model()
evar = model_fit.fit.evar()
self.modrank = i

if (i > 2 and evar - evarprev < 0.001):
model = self.NMFmod(rank=i - 1)
model = self.run_nmf_model(rank=i - 1)
self.modrank = i - 1
break

Expand All @@ -697,7 +739,11 @@ def __init__(self, M_run, rank, seed, decomp):
# Specify NMF model
# options can be added/modified per
# http://nimfa.biolab.si/nimfa.methods.factorization.nmf.html
def NMFmod(self, rank):
def run_nmf_model(self, rank):
"""
Run NMF model
"""

prng = np.random.RandomState(self.seed)
W_init = prng.rand(self.M_run.shape[0], rank)
Expand All @@ -717,8 +763,12 @@ def NMFmod(self, rank):


class writeOutput:
"""
Class of functions for writing the output of Helmsman.
"""

def __init__(self, dat_paths, samples, subtypes_dict):
""" abc """
self.dat_paths = dat_paths
self.samples = samples
self.subtypes_dict = subtypes_dict
Expand Down Expand Up @@ -762,7 +812,11 @@ def writeM(self, count_matrix):


def writeR(package, projectdir, matrixname):
""" auto-generate R script """
"""
auto-generate R script
"""

rscript_path = projectdir + "/" + "Helmsman_to_" + package + ".R"
rscript = open(rscript_path, "w+")
print("library(\"" + package + "\")", file=rscript)
Expand Down

0 comments on commit 3203f56

Please sign in to comment.