From fa98106b1458e5e6310d4c336f1447ad450a7d09 Mon Sep 17 00:00:00 2001
From: Mark Keller <7525285+keller-mark@users.noreply.github.com>
Date: Sun, 22 Jul 2018 19:01:40 -0400
Subject: [PATCH] Added initial clustering endpoint

---
 app/main.py            | 10 +++++++
 app/plot_processing.py | 62 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+)

diff --git a/app/main.py b/app/main.py
index 7898675..8de8d63 100644
--- a/app/main.py
+++ b/app/main.py
@@ -98,6 +98,16 @@ def route_samples_with_signatures():
 
     return response_json(app, output)
 
+@app.route('/clustering', methods=['POST'])
+def route_clustering():
+    req = request.get_json(force=True)
+    signatures = json_or(req, 'signatures', [], r'.*')
+    projects = json_or(req, 'sources', [], PROJ_RE)
+
+    output = PlotProcessing.clustering(signatures, projects)
+
+    return response_json(app, output)
+
 if __name__ == '__main__':
     app.run(
         host='0.0.0.0',
diff --git a/app/plot_processing.py b/app/plot_processing.py
index 646a83c..feff1b6 100644
--- a/app/plot_processing.py
+++ b/app/plot_processing.py
@@ -1,10 +1,12 @@
 import pandas as pd
 import numpy as np
+import scipy.cluster
 import os
 import io
 import re
 import sys
 import json
+from functools import reduce
 from yaml import load
 from yaml import Loader
 from web_constants import *
@@ -255,3 +257,63 @@ def chromosome_bands():
         df = pd.read_csv(CHROMOSOME_BANDS_FILE, sep='\t')
 
         return PlotProcessing.pd_as_file(df, index_val=False)
+
+    @staticmethod
+    def clustering(sigs, projects):
+        signatures = Signatures(SIGS_FILE, SIGS_META_FILE, chosen_sigs=sigs)
+        sig_names = signatures.get_chosen_names()
+        full_exps_df = pd.DataFrame(index=[], columns=sig_names)
+
+        project_metadata = PlotProcessing.project_metadata()
+        for proj_id in projects:
+            if project_metadata[proj_id]["has_counts"]:
+                # counts data
+                counts_filepath = project_metadata[proj_id]["counts_path"]
+                counts_df = PlotProcessing.pd_fetch_tsv(counts_filepath, index_col=0)
+                counts_df = counts_df.dropna(how='any', axis='index')
+
+                if len(counts_df) > 0:
+                    # compute exposures
+                    exps_df = signatures.get_exposures(counts_df)
+
+                    full_exps_df = full_exps_df.append(exps_df, ignore_index=False)
+
+        # Do hierarchical clustering
+        # Reference: https://gist.github.com/mdml/7537455
+        observation_vectors = full_exps_df.values
+        Z = scipy.cluster.hierarchy.linkage(observation_vectors, method='ward')
+        T = scipy.cluster.hierarchy.to_tree(Z)
+
+        # Create dictionary for labeling nodes by their IDs
+        labels = list(full_exps_df.index.values)
+        id2label = dict(zip(range(len(labels)), labels))
+
+        # Create a nested dictionary from the ClusterNode's returned by SciPy
+        def add_node(node, parent):
+            # First create the new node and append it to its parent's children
+            new_node = dict( node_id=node.id, children=[] )
+            parent["children"].append( new_node )
+            # Recursively add the current node's children
+            if node.left: add_node( node.left, new_node )
+            if node.right: add_node( node.right, new_node )
+
+        # Initialize nested dictionary for d3, then recursively iterate through tree
+        tree_dict = dict(children=[], name="root")
+        add_node( T, tree_dict )
+
+        # Label each node with the names of each leaf in its subtree
+        def label_tree( n ):
+            # If the node is a leaf, then we have its name
+            if len(n["children"]) == 0:
+                leaf_names = [ id2label[n["node_id"]] ]
+            # If not, flatten all the leaves in the node's subtree
+            else:
+                leaf_names = reduce(lambda ls, c: ls + label_tree(c), n["children"], [])
+            # Delete the node id since we don't need it anymore and it makes for cleaner JSON
+            del n["node_id"]
+            # Labeling convention: "-"-separated leaf names
+            n["name"] = name = "-".join(sorted(map(str, leaf_names)))
+            return leaf_names
+
+        label_tree( tree_dict["children"][0] )
+        return tree_dict
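
The nested-dictionary construction in the patch follows the pattern from the referenced gist (https://gist.github.com/mdml/7537455). As a standalone sketch of that linkage -> to_tree -> nested-dict conversion, the snippet below runs the same steps on a small made-up exposures matrix so the shape of the JSON the endpoint would return is easier to see; the sample labels and exposure values are hypothetical and are not taken from the repository's data.

    # Standalone sketch of the linkage -> to_tree -> nested-dict conversion above.
    # The labels and values below are made up for illustration only.
    import json
    from functools import reduce

    import numpy as np
    import scipy.cluster.hierarchy as hierarchy

    # Hypothetical per-sample signature exposures (rows: samples, columns: signatures).
    labels = ["SAMPLE-A", "SAMPLE-B", "SAMPLE-C", "SAMPLE-D"]
    exposures = np.array([
        [0.9, 0.1, 0.0],
        [0.8, 0.2, 0.0],
        [0.1, 0.1, 0.8],
        [0.0, 0.2, 0.8],
    ])

    # Ward linkage over the observation vectors, then convert to a ClusterNode tree.
    Z = hierarchy.linkage(exposures, method='ward')
    T = hierarchy.to_tree(Z)
    id2label = dict(zip(range(len(labels)), labels))

    def add_node(node, parent):
        # Append a dict for this ClusterNode to its parent, then recurse into children.
        new_node = dict(node_id=node.id, children=[])
        parent["children"].append(new_node)
        if node.left: add_node(node.left, new_node)
        if node.right: add_node(node.right, new_node)

    def label_tree(n):
        # Leaves are named from id2label; internal nodes join their leaves' names.
        if len(n["children"]) == 0:
            leaf_names = [id2label[n["node_id"]]]
        else:
            leaf_names = reduce(lambda ls, c: ls + label_tree(c), n["children"], [])
        del n["node_id"]
        n["name"] = "-".join(sorted(map(str, leaf_names)))
        return leaf_names

    tree_dict = dict(children=[], name="root")
    add_node(T, tree_dict)
    label_tree(tree_dict["children"][0])
    print(json.dumps(tree_dict, indent=2))

A client would then POST a JSON body with "signatures" and "sources" keys to the new route, for example: curl -X POST -d '{"signatures": ["<signature name>"], "sources": ["<project id>"]}' http://<host>:<port>/clustering, where the signature name, project id, host, and port are placeholders.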