Update to fix 'txt' processing label mismatch (#18) (#20)

* Fixing label misordering, adding N/unusual trinucleotide handling
* Clean up excess debug messages
* Cleaned implementation of skipping trinucleotides with N

Co-authored-by: Jakob McBroome <[email protected]>
carjed · Jul 22, 2020 · ffde86c · ffde86c
1 parent 0ddfa32
commit ffde86c
Showing 1 changed file with 6 additions and 4 deletions.
diff --git a/util.py b/util.py
@@ -671,7 +671,9 @@ def process_txt(self):
                         # eprint("lseq:", lseq)
                     motif_a = getMotif(lseq)
                     subtype = str(category + "." + motif_a)
-                    st = self.subtypes_dict[subtype]
+
+                    if subtype not in self.subtypes_dict:
+                        continue
 
                     if sample not in samples_dict:
                         samples_dict[sample] = {}
@@ -680,9 +682,9 @@ def process_txt(self):
                         samples_dict[sample][subtype] = 1
                     else:
                         samples_dict[sample][subtype] += 1
-
-            M = pd.DataFrame(samples_dict).T.fillna(0).values
-            samples = sorted(samples_dict)
+            mdf = pd.DataFrame(samples_dict).T.fillna(0)
+            samples = mdf.index.tolist() #instead of using samples_dict with sorted(), which leads to mismatching, simply retain the explicit ordering of the matrix dataframe.
+            M = mdf.values 
 
         out = collections.namedtuple('Out', ['M', 'samples'])(M, samples)
         return out