fix bug when using rowwise parameter. fixes #12
carjed committed May 28, 2019
1 parent 4a1d502 commit 2dd7670
Showing 1 changed file with 72 additions and 18 deletions.
util.py: 90 changes (72 additions & 18 deletions)
@@ -102,7 +102,10 @@ def get_logger(name=None,
# Manipulate sequence motifs etc.
###############################################################################
def getCategory(mu_type):
""" collapse mutation types per strand symmetry """
"""
collapse mutation types per strand symmetry
"""

# if re.match("^[ACGT]*$", mu_type):
if mu_type in ('AC', 'TG'):
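
The branch above is the first of the strand-symmetric pairs: a mutation type and its reverse complement ('AC' and 'TG' here) fall into the same category. A minimal sketch of that collapsing rule, written against the pyrimidine-reference (C_/T_) labels that appear in the categories list further down; the helper below is illustrative, not the project's code:

COMPLEMENT = {"A": "T", "C": "G", "G": "C", "T": "A"}

def collapse(mu_type):
    """Map a two-letter mutation type to its strand-symmetric category."""
    ref, alt = mu_type[0], mu_type[1]
    # Re-express purine-reference mutations on the opposite strand so that
    # every category is reported relative to a C or T reference base.
    if ref in ("A", "G"):
        ref, alt = COMPLEMENT[ref], COMPLEMENT[alt]
    return ref + "_" + alt

assert collapse("AC") == collapse("TG") == "T_G"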
@@ -123,7 +126,10 @@ def getCategory(mu_type):


def getMotif(sequence):
""" query reference genome for local sequence motif """
"""
query reference genome for local sequence motif
"""

motif = Seq(sequence, IUPAC.unambiguous_dna)
altmotif = motif.reverse_complement()
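
getMotif compares the local sequence against its reverse complement and keeps one canonical orientation. A small sketch of that pattern using plain Biopython (recent Biopython releases drop the IUPAC alphabet argument); the selection rule shown, a pyrimidine at the central position, is an assumption for illustration:

from Bio.Seq import Seq

def canonical_motif(sequence):
    """Return the motif in a single, strand-independent orientation."""
    motif = Seq(sequence)
    altmotif = motif.reverse_complement()
    mid = len(sequence) // 2
    # Assumed rule: keep whichever orientation has C or T at the center,
    # matching the C_/T_ categories used elsewhere in util.py.
    return str(motif) if sequence[mid] in "CT" else str(altmotif)

print(canonical_motif("AGA"))  # -> "TCT"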
@@ -140,7 +146,11 @@ def getMotif(sequence):


def indexSubtypes(motiflength):
""" define k-mer mutation subtypes """
"""
define k-mer mutation subtypes
"""

categories = ["T_G", "T_C", "T_A", "C_G", "C_T", "C_A"]
bases = ["A", "C", "G", "T"]
flank = (motiflength - 1) // 2
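
As a quick sanity check on this indexing, the number of k-mer subtypes is the six collapsed categories times the number of flanking-base combinations. A short sketch, assuming the flanking bases vary freely over A/C/G/T:

from itertools import product

def count_subtypes(motiflength, n_categories=6, bases="ACGT"):
    """Count k-mer mutation subtypes for a given motif length."""
    flank = (motiflength - 1) // 2
    return n_categories * len(list(product(bases, repeat=2 * flank)))

print(count_subtypes(3))  # 96 trinucleotide subtypes
print(count_subtypes(5))  # 1536 pentanucleotide subtypes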
@@ -178,7 +188,11 @@ def indexSubtypes(motiflength):


def indexGroups(samplefile, groupvar):
""" Build dictionary with sample ID as key, group ID as value """
"""
Build dictionary with sample ID as key, group ID as value
"""

sg_dict = {}

f = open(samplefile, 'r', encoding="utf-8")
@@ -191,7 +205,11 @@ def indexGroups(samplefile, groupvar):


def get_samples(sample_file):
""" get samples from input M matrix when using aggregation mode """
"""
get samples from input M matrix when using aggregation mode
"""

samples = np.loadtxt(
sample_file, dtype='S120', skiprows=1, delimiter='\t', usecols=(0, ))

@@ -201,7 +219,11 @@ def get_samples(sample_file):


def parseSampleFile(samplefile):
""" get list of samples to keep if samplefile supplied """
"""
get list of samples to keep if samplefile supplied
"""

# f = open(args.input, 'r', encoding = "ISO-8859-1")
f = open(samplefile, 'r', encoding="utf-8")
reader = csv.DictReader(f, delimiter='\t')
@@ -213,7 +235,11 @@ def parseSampleFile(samplefile):


def get_samples_vcf(args, inputvcf):
""" get samples from VCF file """
"""
get samples from VCF file
"""

if args.samplefile:
keep_samples = parseSampleFile(args.samplefile)
vcf_reader = VCF(
@@ -235,6 +261,7 @@ class processInput:
- MAF format
- plain text format
- Aggregation of existing subtype count matrices
"""

def __init__(self, mode, args, subtypes_dict, par=False):
@@ -268,7 +295,7 @@ def __init__(self, mode, args, subtypes_dict, par=False):

samples = np.array([])
for vcf in vcf_list:
samples = np.append(samples, self.get_samples_vcf(vcf))
samples = np.append(samples, get_samples_vcf(args, vcf))

else:
nrow, ncol = results[1].shape
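
This hunk contains the bug fix itself: the per-VCF sample lists are now gathered with the module-level get_samples_vcf(args, vcf) defined earlier in util.py, rather than via self, which would only work if processInput defined a method of that name. A stripped-down illustration of the difference, with a hypothetical class and dummy data rather than the project's code:

def get_samples_vcf(args, vcf):
    """Module-level helper: needs args passed in explicitly."""
    return ["S1", "S2"]

class Processor:
    def collect(self, vcf_list, args):
        samples = []
        for vcf in vcf_list:
            # samples += self.get_samples_vcf(vcf)  # AttributeError: Processor has no such method
            samples += get_samples_vcf(args, vcf)   # correct: call the module-level function
        return samples

print(Processor().collect(["a.vcf", "b.vcf"], args=None))  # ['S1', 'S2', 'S1', 'S2']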
@@ -282,7 +309,10 @@ def __init__(self, mode, args, subtypes_dict, par=False):
count_matrix, samples)

def process_vcf(self, inputfile):
""" Main function for parsing VCF """
"""
Main function for parsing VCF
"""
# initialize reference genome
fasta_reader = Fasta(self.args.fastafile, read_ahead=1000000)

@@ -428,7 +458,11 @@ def process_vcf(self, inputfile):
return out

def process_maf(self):
""" process MAF files """
"""
process MAF files
"""

fasta_reader = Fasta(self.args.fastafile, read_ahead=1000000)

nbp = (self.args.length - 1) // 2
@@ -513,7 +547,11 @@ def process_maf(self):
return out

def process_agg(self):
""" aggregate M matrices from list of input files """
"""
aggregate M matrices from list of input files
"""

inputM = self.args.input
colnames = ["ID"]
M_colnames = colnames + list(sorted(self.subtypes_dict.keys()))
@@ -564,6 +602,7 @@ def process_txt(self):
"""
process tab-delimited text file, containing the following columns:
CHR POS REF ALT SAMPLE_ID
"""

fasta_reader = Fasta(self.args.fastafile, read_ahead=1000000)
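
For reference, a single data row in this plain-text mode would look like the following (tab-separated, hypothetical values):

chr20	61098	C	T	SAMPLE_01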
@@ -617,7 +656,10 @@ def process_txt(self):


class DecompModel:
""" Class for fitting PCA or NMF models """
"""
Class for fitting PCA and NMF models
"""

def __init__(self, M_run, rank, seed, decomp):
self.M_run = M_run / (M_run.sum(axis=1) + 1e-8)[:, None]
@@ -665,21 +707,21 @@ def __init__(self, M_run, rank, seed, decomp):
elif self.decomp == "nmf":

if self.rank > 0:
model = self.NMFmod(self.rank)
model = self.run_nmf_model(self.rank)
self.modrank = self.rank

elif self.rank == 0:
util_log.debug("Finding optimal rank for %s decomposition",
decomp)
self.evarprev = 0
for i in range(1, self.M_run.shape[0]):
model = self.NMFmod(rank=i)
model = self.run_nmf_model(rank=i)
model_fit = model()
evar = model_fit.fit.evar()
self.modrank = i

if (i > 2 and evar - evarprev < 0.001):
model = self.NMFmod(rank=i - 1)
model = self.run_nmf_model(rank=i - 1)
self.modrank = i - 1
break

@@ -697,7 +739,11 @@ def __init__(self, M_run, rank, seed, decomp):
# Specify NMF model
# options can be added/modified per
# http://nimfa.biolab.si/nimfa.methods.factorization.nmf.html
def NMFmod(self, rank):
def run_nmf_model(self, rank):
"""
Run NMF model
"""

prng = np.random.RandomState(self.seed)
W_init = prng.rand(self.M_run.shape[0], rank)
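
The fixed RandomState makes the factorization reproducible for a given seed: W_init (and presumably a matching H_init in the lines not shown) starts from the same random matrix on every run. A rough stand-in using scikit-learn's NMF with custom initialization, since the project itself drives nimfa (see the URL in the comment above); names and parameters here are illustrative only:

import numpy as np
from sklearn.decomposition import NMF  # stand-in for nimfa, illustration only

def run_nmf_sketch(M, rank, seed):
    prng = np.random.RandomState(seed)
    W_init = prng.rand(M.shape[0], rank)   # same seed -> same starting factors
    H_init = prng.rand(rank, M.shape[1])
    model = NMF(n_components=rank, init="custom", max_iter=500)
    W = model.fit_transform(M, W=W_init, H=H_init)
    return W, model.components_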
@@ -717,8 +763,12 @@ def NMFmod(self, rank):


class writeOutput:
"""
Class of functions for writing the output of Helmsman.
"""

def __init__(self, dat_paths, samples, subtypes_dict):
""" abc """
self.dat_paths = dat_paths
self.samples = samples
self.subtypes_dict = subtypes_dict
@@ -762,7 +812,11 @@ def writeM(self, count_matrix):


def writeR(package, projectdir, matrixname):
""" auto-generate R script """
"""
auto-generate R script
"""

rscript_path = projectdir + "/" + "Helmsman_to_" + package + ".R"
rscript = open(rscript_path, "w+")
print("library(\"" + package + "\")", file=rscript)
