From fa98106b1458e5e6310d4c336f1447ad450a7d09 Mon Sep 17 00:00:00 2001
From: Mark Keller <7525285+keller-mark@users.noreply.github.com>
Date: Sun, 22 Jul 2018 19:01:40 -0400
Subject: [PATCH] Added initial clustering endpoint

---
 app/main.py            | 10 +++++++
 app/plot_processing.py | 62 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+)

diff --git a/app/main.py b/app/main.py
index 7898675..8de8d63 100644
--- a/app/main.py
+++ b/app/main.py
@@ -98,6 +98,16 @@ def route_samples_with_signatures():
 
     return response_json(app, output)
 
+@app.route('/clustering', methods=['POST'])
+def route_clustering():
+    req = request.get_json(force=True)
+    signatures = json_or(req, 'signatures', [], r'.*')
+    projects = json_or(req, 'sources', [], PROJ_RE)
+
+    output = PlotProcessing.clustering(signatures, projects)
+
+    return response_json(app, output)
+
 if __name__ == '__main__':
     app.run(
         host='0.0.0.0',
diff --git a/app/plot_processing.py b/app/plot_processing.py
index 646a83c..feff1b6 100644
--- a/app/plot_processing.py
+++ b/app/plot_processing.py
@@ -1,10 +1,12 @@
 import pandas as pd
 import numpy as np
+import scipy.cluster
 import os
 import io
 import re
 import sys
 import json
+from functools import reduce
 from yaml import load
 from yaml import Loader
 from web_constants import *
@@ -255,3 +257,63 @@ def chromosome_bands():
         df = pd.read_csv(CHROMOSOME_BANDS_FILE, sep='\t')
 
         return PlotProcessing.pd_as_file(df, index_val=False)
+
+    @staticmethod
+    def clustering(sigs, projects):
+        signatures = Signatures(SIGS_FILE, SIGS_META_FILE, chosen_sigs=sigs)
+        sig_names = signatures.get_chosen_names()
+        full_exps_df = pd.DataFrame(index=[], columns=sig_names)
+
+        project_metadata = PlotProcessing.project_metadata()
+        for proj_id in projects:
+            if project_metadata[proj_id]["has_counts"]:
+                # counts data
+                counts_filepath = project_metadata[proj_id]["counts_path"]
+                counts_df = PlotProcessing.pd_fetch_tsv(counts_filepath, index_col=0)
+                counts_df = counts_df.dropna(how='any', axis='index')
+
+                if len(counts_df) > 0:
+                    # compute exposures
+                    exps_df = signatures.get_exposures(counts_df)
+
+                    full_exps_df = full_exps_df.append(exps_df, ignore_index=False)
+
+        # Do hierarchical clustering
+        # Reference: https://gist.github.com/mdml/7537455
+        observation_vectors = full_exps_df.values
+        Z = scipy.cluster.hierarchy.linkage(observation_vectors, method='ward')
+        T = scipy.cluster.hierarchy.to_tree(Z)
+
+        # Create dictionary for labeling nodes by their IDs
+        labels = list(full_exps_df.index.values)
+        id2label = dict(zip(range(len(labels)), labels))
+
+        # Create a nested dictionary from the ClusterNode's returned by SciPy
+        def add_node(node, parent):
+            # First create the new node and append it to its parent's children
+            new_node = dict( node_id=node.id, children=[] )
+            parent["children"].append( new_node )
+            # Recursively add the current node's children
+            if node.left: add_node( node.left, new_node )
+            if node.right: add_node( node.right, new_node )
+
+        # Initialize nested dictionary for d3, then recursively iterate through tree
+        tree_dict = dict(children=[], name="root")
+        add_node( T, tree_dict )
+
+        # Label each node with the names of each leaf in its subtree
+        def label_tree( n ):
+            # If the node is a leaf, then we have its name
+            if len(n["children"]) == 0:
+                leaf_names = [ id2label[n["node_id"]] ]
+            # If not, flatten all the leaves in the node's subtree
+            else:
+                leaf_names = reduce(lambda ls, c: ls + label_tree(c), n["children"], [])
+            # Delete the node id since we don't need it anymore and it makes for cleaner JSON
+            del n["node_id"]
+            # Labeling convention: "-"-separated leaf names
+            n["name"] = name = "-".join(sorted(map(str, leaf_names)))
+            return leaf_names
+
+        label_tree( tree_dict["children"][0] )
+        return tree_dict
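
The nested-dictionary construction in the patch follows the pattern from the referenced gist (https://gist.github.com/mdml/7537455). As a standalone sketch of that linkage -> to_tree -> nested-dict conversion, the snippet below runs the same steps on a small made-up exposures matrix so the shape of the JSON the endpoint would return is easier to see; the sample labels and exposure values are hypothetical and are not taken from the repository's data.

    # Standalone sketch of the linkage -> to_tree -> nested-dict conversion above.
    # The labels and values below are made up for illustration only.
    import json
    from functools import reduce

    import numpy as np
    import scipy.cluster.hierarchy as hierarchy

    # Hypothetical per-sample signature exposures (rows: samples, columns: signatures).
    labels = ["SAMPLE-A", "SAMPLE-B", "SAMPLE-C", "SAMPLE-D"]
    exposures = np.array([
        [0.9, 0.1, 0.0],
        [0.8, 0.2, 0.0],
        [0.1, 0.1, 0.8],
        [0.0, 0.2, 0.8],
    ])

    # Ward linkage over the observation vectors, then convert to a ClusterNode tree.
    Z = hierarchy.linkage(exposures, method='ward')
    T = hierarchy.to_tree(Z)
    id2label = dict(zip(range(len(labels)), labels))

    def add_node(node, parent):
        # Append a dict for this ClusterNode to its parent, then recurse into children.
        new_node = dict(node_id=node.id, children=[])
        parent["children"].append(new_node)
        if node.left: add_node(node.left, new_node)
        if node.right: add_node(node.right, new_node)

    def label_tree(n):
        # Leaves are named from id2label; internal nodes join their leaves' names.
        if len(n["children"]) == 0:
            leaf_names = [id2label[n["node_id"]]]
        else:
            leaf_names = reduce(lambda ls, c: ls + label_tree(c), n["children"], [])
        del n["node_id"]
        n["name"] = "-".join(sorted(map(str, leaf_names)))
        return leaf_names

    tree_dict = dict(children=[], name="root")
    add_node(T, tree_dict)
    label_tree(tree_dict["children"][0])
    print(json.dumps(tree_dict, indent=2))

A client would then POST a JSON body with "signatures" and "sources" keys to the new route, for example: curl -X POST -d '{"signatures": ["<signature name>"], "sources": ["<project id>"]}' http://<host>:<port>/clustering, where the signature name, project id, host, and port are placeholders.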