diff --git a/docs/usage.md b/docs/usage.md index ce39e64..5de59c7 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -207,7 +207,7 @@ Note that the `chisq` outlier detection mode is generally more robust to low-SNV `--maxac [INT]` -By default, *Helmsman* will evaluate only singleton SNVs (`--maxac 1`). To include more common SNVs, increase the value of this parameter. To include all SNVs, use `--maxac 0`. This is not recommended when using *Helmsman* for QC functionality, but may be useful for more general mutation signature analysis. +By default, *Helmsman* will evaluate all SNVs present in the input data [`--maxac 0`]. To include more common SNVs, up to a specific allele count threshold, increase the value of this parameter. For example, `--maxac 2` will include only singletons and doubletons. ## Output options diff --git a/helmsman.py b/helmsman.py index 3b2336d..c10f615 100644 --- a/helmsman.py +++ b/helmsman.py @@ -26,8 +26,12 @@ def main(): # get latest version from github tags # via https://stackoverflow.com/questions/14989858 try: - version = subprocess.check_output(["git", - "describe"]).strip().decode('utf-8') + v_dir = os.path.dirname(os.path.realpath(__file__)) + "/.git/refs/tags" + files = os.listdir(v_dir) + files = [os.path.join(v_dir, f) for f in files] # add path to each file + files.sort(key=lambda x: os.path.getmtime(x)) + version = files[-1] + version = os.path.basename(version) except AttributeError: version = "[version not found]" @@ -277,8 +281,14 @@ def main(): log.info("----------------------------------") try: - version = subprocess.check_output(["git", - "describe"]).strip().decode('utf-8') + # version = subprocess.check_output(["git", + # "describe"]).strip().decode('utf-8') + v_dir = os.path.dirname(os.path.realpath(__file__)) + "/.git/refs/tags" + files = os.listdir(v_dir) + files = [os.path.join(v_dir, f) for f in files] # add path to each file + files.sort(key=lambda x: os.path.getmtime(x)) + version = files[-1] + version = os.path.basename(version) log.info("%s %s", sys.argv[0], version) except AttributeError: version = "[version not found]" diff --git a/util.py b/util.py index 69d45d3..0d8e9d5 100644 --- a/util.py +++ b/util.py @@ -102,7 +102,10 @@ def get_logger(name=None, # Manipulate sequence motifs etc. ############################################################################### def getCategory(mu_type): - """ collapse mutation types per strand symmetry """ + """ + collapse mutation types per strand symmetry + + """ # if re.match("^[ACGT]*$", mu_type): if mu_type in ('AC', 'TG'): @@ -123,7 +126,10 @@ def getCategory(mu_type): def getMotif(sequence): - """ query reference genome for local sequence motif """ + """ + query reference genome for local sequence motif + + """ motif = Seq(sequence, IUPAC.unambiguous_dna) altmotif = motif.reverse_complement() @@ -140,7 +146,11 @@ def getMotif(sequence): def indexSubtypes(motiflength): - """ define k-mer mutation subtypes """ + """ + define k-mer mutation subtypes + + """ + categories = ["T_G", "T_C", "T_A", "C_G", "C_T", "C_A"] bases = ["A", "C", "G", "T"] flank = (motiflength - 1) // 2 @@ -178,7 +188,11 @@ def indexSubtypes(motiflength): def indexGroups(samplefile, groupvar): - """ Build dictionary with sample ID as key, group ID as value """ + """ + Build dictionary with sample ID as key, group ID as value + + """ + sg_dict = {} f = open(samplefile, 'r', encoding="utf-8") @@ -191,7 +205,11 @@ def indexGroups(samplefile, groupvar): def get_samples(sample_file): - """ get samples from input M matrix when using aggregation mode """ + """ + get samples from input M matrix when using aggregation mode + + """ + samples = np.loadtxt( sample_file, dtype='S120', skiprows=1, delimiter='\t', usecols=(0, )) @@ -201,7 +219,11 @@ def get_samples(sample_file): def parseSampleFile(samplefile): - """ get list of samples to keep if samplefile supplied """ + """ + get list of samples to keep if samplefile supplied + + """ + # f = open(args.input, 'r', encoding = "ISO-8859-1") f = open(samplefile, 'r', encoding="utf-8") reader = csv.DictReader(f, delimiter='\t') @@ -213,7 +235,11 @@ def parseSampleFile(samplefile): def get_samples_vcf(args, inputvcf): - """ get samples from VCF file """ + """ + get samples from VCF file + + """ + if args.samplefile: keep_samples = parseSampleFile(args.samplefile) vcf_reader = VCF( @@ -235,6 +261,7 @@ class processInput: - MAF format - plain text format - Aggregation of existing subtype count matrices + """ def __init__(self, mode, args, subtypes_dict, par=False): @@ -268,7 +295,7 @@ def __init__(self, mode, args, subtypes_dict, par=False): samples = np.array([]) for vcf in vcf_list: - samples = np.append(samples, self.get_samples_vcf(vcf)) + samples = np.append(samples, get_samples_vcf(args, vcf)) else: nrow, ncol = results[1].shape @@ -282,7 +309,10 @@ def __init__(self, mode, args, subtypes_dict, par=False): count_matrix, samples) def process_vcf(self, inputfile): - """ Main function for parsing VCF """ + """ + Main function for parsing VCF + + """ # initialize reference genome fasta_reader = Fasta(self.args.fastafile, read_ahead=1000000) @@ -428,7 +458,11 @@ def process_vcf(self, inputfile): return out def process_maf(self): - """ process MAF files """ + """ + process MAF files + + """ + fasta_reader = Fasta(self.args.fastafile, read_ahead=1000000) nbp = (self.args.length - 1) // 2 @@ -513,7 +547,11 @@ def process_maf(self): return out def process_agg(self): - """ aggregate M matrices from list of input files """ + """ + aggregate M matrices from list of input files + + """ + inputM = self.args.input colnames = ["ID"] M_colnames = colnames + list(sorted(self.subtypes_dict.keys())) @@ -564,6 +602,7 @@ def process_txt(self): """ process tab-delimited text file, containing the following columns: CHR POS REF ALT SAMPLE_ID + """ fasta_reader = Fasta(self.args.fastafile, read_ahead=1000000) @@ -617,7 +656,10 @@ def process_txt(self): class DecompModel: - """ Class for fitting PCA or NMF models """ + """ + Class for fitting PCA and NMF models + + """ def __init__(self, M_run, rank, seed, decomp): self.M_run = M_run / (M_run.sum(axis=1) + 1e-8)[:, None] @@ -665,7 +707,7 @@ def __init__(self, M_run, rank, seed, decomp): elif self.decomp == "nmf": if self.rank > 0: - model = self.NMFmod(self.rank) + model = self.run_nmf_model(self.rank) self.modrank = self.rank elif self.rank == 0: @@ -673,13 +715,13 @@ def __init__(self, M_run, rank, seed, decomp): decomp) self.evarprev = 0 for i in range(1, self.M_run.shape[0]): - model = self.NMFmod(rank=i) + model = self.run_nmf_model(rank=i) model_fit = model() evar = model_fit.fit.evar() self.modrank = i if (i > 2 and evar - evarprev < 0.001): - model = self.NMFmod(rank=i - 1) + model = self.run_nmf_model(rank=i - 1) self.modrank = i - 1 break @@ -697,7 +739,11 @@ def __init__(self, M_run, rank, seed, decomp): # Specify NMF model # options can be added/modified per # http://nimfa.biolab.si/nimfa.methods.factorization.nmf.html - def NMFmod(self, rank): + def run_nmf_model(self, rank): + """ + Run NMF model + + """ prng = np.random.RandomState(self.seed) W_init = prng.rand(self.M_run.shape[0], rank) @@ -717,8 +763,12 @@ def NMFmod(self, rank): class writeOutput: + """ + Class of functions for writing the output of Helmsman. + + """ + def __init__(self, dat_paths, samples, subtypes_dict): - """ abc """ self.dat_paths = dat_paths self.samples = samples self.subtypes_dict = subtypes_dict @@ -762,7 +812,11 @@ def writeM(self, count_matrix): def writeR(package, projectdir, matrixname): - """ auto-generate R script """ + """ + auto-generate R script + + """ + rscript_path = projectdir + "/" + "Helmsman_to_" + package + ".R" rscript = open(rscript_path, "w+") print("library(\"" + package + "\")", file=rscript)