Merge branch 'release/1.4.0'

carjed · May 28, 2019 · 3203f56 · 3203f56
2 parents 2ab6f7b + 2dd7670
commit 3203f56
Show file tree

Hide file tree

Showing 3 changed files with 87 additions and 23 deletions.
diff --git a/docs/usage.md b/docs/usage.md
@@ -207,7 +207,7 @@ Note that the `chisq` outlier detection mode is generally more robust to low-SNV
 
 `--maxac [INT]`
 
-By default, *Helmsman* will evaluate only singleton SNVs (`--maxac 1`). To include more common SNVs, increase the value of this parameter. To include all SNVs, use `--maxac 0`. This is not recommended when using *Helmsman* for QC functionality, but may be useful for more general mutation signature analysis.
+By default, *Helmsman* will evaluate all SNVs present in the input data [`--maxac 0`]. To include more common SNVs, up to a specific allele count threshold, increase the value of this parameter. For example, `--maxac 2` will include only singletons and doubletons.
 
 ## Output options
 

diff --git a/helmsman.py b/helmsman.py
@@ -26,8 +26,12 @@ def main():
     # get latest version from github tags
     # via https://stackoverflow.com/questions/14989858
     try:
-        version = subprocess.check_output(["git",
-                                           "describe"]).strip().decode('utf-8')
+        v_dir = os.path.dirname(os.path.realpath(__file__)) + "/.git/refs/tags"
+        files = os.listdir(v_dir)
+        files = [os.path.join(v_dir, f) for f in files] # add path to each file
+        files.sort(key=lambda x: os.path.getmtime(x))
+        version = files[-1]
+        version = os.path.basename(version)
     except AttributeError:
         version = "[version not found]"
 
@@ -277,8 +281,14 @@ def main():
 
     log.info("----------------------------------")
     try:
-        version = subprocess.check_output(["git",
-                                           "describe"]).strip().decode('utf-8')
+        # version = subprocess.check_output(["git",
+        #                                   "describe"]).strip().decode('utf-8')
+        v_dir = os.path.dirname(os.path.realpath(__file__)) + "/.git/refs/tags"
+        files = os.listdir(v_dir)
+        files = [os.path.join(v_dir, f) for f in files] # add path to each file
+        files.sort(key=lambda x: os.path.getmtime(x))
+        version = files[-1]
+        version = os.path.basename(version)
         log.info("%s %s", sys.argv[0], version)
     except AttributeError:
         version = "[version not found]"

diff --git a/util.py b/util.py
@@ -102,7 +102,10 @@ def get_logger(name=None,
 # Manipulate sequence motifs etc.
 ###############################################################################
 def getCategory(mu_type):
-    """ collapse mutation types per strand symmetry """
+    """
+    collapse mutation types per strand symmetry
+
+    """
 
     # if re.match("^[ACGT]*$", mu_type):
     if mu_type in ('AC', 'TG'):
@@ -123,7 +126,10 @@ def getCategory(mu_type):
 
 
 def getMotif(sequence):
-    """ query reference genome for local sequence motif """
+    """
+    query reference genome for local sequence motif
+
+    """
 
     motif = Seq(sequence, IUPAC.unambiguous_dna)
     altmotif = motif.reverse_complement()
@@ -140,7 +146,11 @@ def getMotif(sequence):
 
 
 def indexSubtypes(motiflength):
-    """ define k-mer mutation subtypes """
+    """
+    define k-mer mutation subtypes
+
+    """
+
     categories = ["T_G", "T_C", "T_A", "C_G", "C_T", "C_A"]
     bases = ["A", "C", "G", "T"]
     flank = (motiflength - 1) // 2
@@ -178,7 +188,11 @@ def indexSubtypes(motiflength):
 
 
 def indexGroups(samplefile, groupvar):
-    """ Build dictionary with sample ID as key, group ID as value """
+    """
+    Build dictionary with sample ID as key, group ID as value
+
+    """
+
     sg_dict = {}
 
     f = open(samplefile, 'r', encoding="utf-8")
@@ -191,7 +205,11 @@ def indexGroups(samplefile, groupvar):
 
 
 def get_samples(sample_file):
-    """ get samples from input M matrix when using aggregation mode """
+    """
+    get samples from input M matrix when using aggregation mode
+
+    """
+
     samples = np.loadtxt(
         sample_file, dtype='S120', skiprows=1, delimiter='\t', usecols=(0, ))
 
@@ -201,7 +219,11 @@ def get_samples(sample_file):
 
 
 def parseSampleFile(samplefile):
-    """ get list of samples to keep if samplefile supplied """
+    """
+    get list of samples to keep if samplefile supplied
+
+    """
+
     # f = open(args.input, 'r', encoding = "ISO-8859-1")
     f = open(samplefile, 'r', encoding="utf-8")
     reader = csv.DictReader(f, delimiter='\t')
@@ -213,7 +235,11 @@ def parseSampleFile(samplefile):
 
 
 def get_samples_vcf(args, inputvcf):
-    """ get samples from VCF file """
+    """
+    get samples from VCF file
+
+    """
+
     if args.samplefile:
         keep_samples = parseSampleFile(args.samplefile)
         vcf_reader = VCF(
@@ -235,6 +261,7 @@ class processInput:
     - MAF format
     - plain text format
     - Aggregation of existing subtype count matrices
+
     """
 
     def __init__(self, mode, args, subtypes_dict, par=False):
@@ -268,7 +295,7 @@ def __init__(self, mode, args, subtypes_dict, par=False):
 
                     samples = np.array([])
                     for vcf in vcf_list:
-                        samples = np.append(samples, self.get_samples_vcf(vcf))
+                        samples = np.append(samples, get_samples_vcf(args, vcf))
 
                 else:
                     nrow, ncol = results[1].shape
@@ -282,7 +309,10 @@ def __init__(self, mode, args, subtypes_dict, par=False):
                     count_matrix, samples)
 
     def process_vcf(self, inputfile):
-        """ Main function for parsing VCF """
+        """
+        Main function for parsing VCF
+
+        """
         # initialize reference genome
         fasta_reader = Fasta(self.args.fastafile, read_ahead=1000000)
 
@@ -428,7 +458,11 @@ def process_vcf(self, inputfile):
         return out
 
     def process_maf(self):
-        """ process MAF files """
+        """
+        process MAF files
+
+        """
+
         fasta_reader = Fasta(self.args.fastafile, read_ahead=1000000)
 
         nbp = (self.args.length - 1) // 2
@@ -513,7 +547,11 @@ def process_maf(self):
         return out
 
     def process_agg(self):
-        """ aggregate M matrices from list of input files """
+        """
+        aggregate M matrices from list of input files
+
+        """
+
         inputM = self.args.input
         colnames = ["ID"]
         M_colnames = colnames + list(sorted(self.subtypes_dict.keys()))
@@ -564,6 +602,7 @@ def process_txt(self):
         """
         process tab-delimited text file, containing the following columns:
         CHR    POS    REF    ALT    SAMPLE_ID
+
         """
 
         fasta_reader = Fasta(self.args.fastafile, read_ahead=1000000)
@@ -617,7 +656,10 @@ def process_txt(self):
 
 
 class DecompModel:
-    """ Class for fitting PCA or NMF models """
+    """
+    Class for fitting PCA and NMF models
+
+    """
 
     def __init__(self, M_run, rank, seed, decomp):
         self.M_run = M_run / (M_run.sum(axis=1) + 1e-8)[:, None]
@@ -665,21 +707,21 @@ def __init__(self, M_run, rank, seed, decomp):
         elif self.decomp == "nmf":
 
             if self.rank > 0:
-                model = self.NMFmod(self.rank)
+                model = self.run_nmf_model(self.rank)
                 self.modrank = self.rank
 
             elif self.rank == 0:
                 util_log.debug("Finding optimal rank for %s decomposition",
                                decomp)
                 self.evarprev = 0
                 for i in range(1, self.M_run.shape[0]):
-                    model = self.NMFmod(rank=i)
+                    model = self.run_nmf_model(rank=i)
                     model_fit = model()
                     evar = model_fit.fit.evar()
                     self.modrank = i
 
                     if (i > 2 and evar - evarprev < 0.001):
-                        model = self.NMFmod(rank=i - 1)
+                        model = self.run_nmf_model(rank=i - 1)
                         self.modrank = i - 1
                         break
 
@@ -697,7 +739,11 @@ def __init__(self, M_run, rank, seed, decomp):
     # Specify NMF model
     # options can be added/modified per
     # http://nimfa.biolab.si/nimfa.methods.factorization.nmf.html
-    def NMFmod(self, rank):
+    def run_nmf_model(self, rank):
+        """
+        Run NMF model
+
+        """
 
         prng = np.random.RandomState(self.seed)
         W_init = prng.rand(self.M_run.shape[0], rank)
@@ -717,8 +763,12 @@ def NMFmod(self, rank):
 
 
 class writeOutput:
+    """
+    Class of functions for writing the output of Helmsman.
+
+    """
+
     def __init__(self, dat_paths, samples, subtypes_dict):
-        """ abc """
         self.dat_paths = dat_paths
         self.samples = samples
         self.subtypes_dict = subtypes_dict
@@ -762,7 +812,11 @@ def writeM(self, count_matrix):
 
 
 def writeR(package, projectdir, matrixname):
-    """ auto-generate R script """
+    """
+    auto-generate R script
+
+    """
+
     rscript_path = projectdir + "/" + "Helmsman_to_" + package + ".R"
     rscript = open(rscript_path, "w+")
     print("library(\"" + package + "\")", file=rscript)