Add network module. #13

Merged · 8 commits · May 5, 2023
88 changes: 88 additions & 0 deletions crcminer/compare.py
@@ -0,0 +1,88 @@

import os
import logging
import argparse

SCRIPT_PATH = os.path.abspath(__file__)
FORMAT = '[%(asctime)s] %(levelname)s %(message)s'
l = logging.getLogger()
lh = logging.StreamHandler()
lh.setFormatter(logging.Formatter(FORMAT))
l.addHandler(lh)
l.setLevel(logging.INFO)
debug = l.debug; info = l.info; warning = l.warning; error = l.error

def parse_bed(_infile):
    '''Parse a motif BED file into an edge list of (gene, motif) tuples.'''
    import pandas as pd
    df = pd.read_csv(_infile, sep='\t')
    df = df[['gene', 'motif']]  # these column names are mocked.
    df2 = df.dropna()
    df2 = df2.assign(gene=df2['gene'].str.split(','),
                     motif=df2['motif'].str.split(',')).explode('gene').explode('motif').reset_index(drop=True)
    # returns tuples, e.g.
    # [('A', 'BAR'), ('B', 'BAR'), ('C', 'BAR'), ('FOO', 'X'), ('FOO', 'Y'), ('FOO', 'Z'), ('JANE1', 'DOE')]
    return list(df2.itertuples(index=False, name=None))




def network_jaccard(edgelist1, edgelist2):
    '''
    Inputs:
        edgelist1: output from parse_bed for sampleX
        edgelist2: output from parse_bed for sampleY

    Output:
        float: Jaccard similarity index of the two edge lists

    Ref:
        https://medium.com/rapids-ai/similarity-in-graphs-jaccard-versus-the-overlap-coefficient-610e083b877d
    '''
    edgeintersection = len(set(edgelist1).intersection(edgelist2))
    edgeunion = (len(edgelist1) + len(edgelist2)) - edgeintersection

    jaccard = edgeintersection / edgeunion if edgeunion else 0.0
    info('Jaccard similarity: %s', jaccard)
    return jaccard

DESCRIPTION = '''
Compare network graphs from two samples
using the Jaccard similarity index.
'''

EPILOG = '''
'''

class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
                      argparse.RawDescriptionHelpFormatter):
    pass


parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG,
                                 formatter_class=CustomFormatter)

parser.add_argument('edgelist1', help='motif BED file for sampleX')
parser.add_argument('-v', '--verbose', action='store_true',
                    help='Set logging level to DEBUG')

args = parser.parse_args()

if args.verbose:
    l.setLevel(logging.DEBUG)

debug('%s begin', SCRIPT_PATH)

e1 = parse_bed(args.edgelist1)
# e1:
# [('A', 'BAR'), ('B', 'BAR'), ('C', 'BAR'), ('FOO', 'X'), ('FOO', 'Y'), ('FOO', 'Z'), ('JANE1', 'DOE')]
# Re-parse the same file and subset it, purely for testing.
e2 = parse_bed(args.edgelist1)
e2 = e2[0:3]
# [('A', 'BAR'), ('B', 'BAR'), ('C', 'BAR')]

network_jaccard(e1, e2)  # should report 0.42857142857142855
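As a quick sanity check on the 0.42857142857142855 value noted above, here is a minimal standalone sketch of the same edge-based Jaccard computation, using the mocked edge lists from the comments (it does not import the module itself, since compare.py parses command-line arguments at import time):

# Edge-based Jaccard index on the mocked edge lists from compare.py's comments.
e1 = [('A', 'BAR'), ('B', 'BAR'), ('C', 'BAR'),
      ('FOO', 'X'), ('FOO', 'Y'), ('FOO', 'Z'), ('JANE1', 'DOE')]
e2 = e1[0:3]  # [('A', 'BAR'), ('B', 'BAR'), ('C', 'BAR')]

intersection = len(set(e1).intersection(e2))   # 3 shared edges
union = len(e1) + len(e2) - intersection       # 7 + 3 - 3 = 7
print(intersection / union)                    # 0.42857142857142855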

31 changes: 31 additions & 0 deletions crcminer/crcminer.py
@@ -0,0 +1,31 @@
import click


# Command group
@click.group(name='CRCminer')
def cli_tools():
    """Tool related commands"""
    pass


@cli_tools.command(name='mine', help='mines for CRC')
@click.option('--fasta', type=click.Path(), help='fasta')
@click.option('--enhancer', type=click.Path(), help='ROSE2 output of annotated (super)enhancers')
@click.option('--mapping', type=click.Path(), help='Motif ID to gene ID mapping file.')
def test(fasta, enhancer, mapping):
    pass


@cli_tools.command(name='compare', help='compare two networks')
@click.option('--edgelist1', type=click.Path(), help='edgelist from sampleX')
@click.option('--edgelist2', type=click.Path(), help='edgelist from sampleY')
def test2(edgelist1, edgelist2):
    pass


@cli_tools.command(name='report', help='report visualization')
@click.option('--indir', type=click.Path(), help='input directory of CRCminer results')
def test3(indir):
    pass


# todo
# add more description

if __name__ == '__main__':
    cli_tools()
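If it helps review, here is a minimal smoke-test sketch of the Click wiring using click's built-in CliRunner; it assumes the crcminer directory is importable as a package (i.e. crcminer.crcminer), which may not match the final layout:

# Hypothetical smoke test for the CLI group above; the import path is an assumption.
from click.testing import CliRunner

from crcminer.crcminer import cli_tools

runner = CliRunner()
result = runner.invoke(cli_tools, ['mine', '--help'])
assert result.exit_code == 0
assert 'mines for CRC' in result.output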
191 changes: 191 additions & 0 deletions crcminer/network.py
@@ -0,0 +1,191 @@
import os
import logging
import argparse
from itertools import product

import networkx as nx
import pandas as pd


SCRIPT_PATH = os.path.abspath(__file__)
FORMAT = '[%(asctime)s] %(levelname)s %(message)s'
l = logging.getLogger()
lh = logging.StreamHandler()
lh.setFormatter(logging.Formatter(FORMAT))
l.addHandler(lh)
l.setLevel(logging.INFO)
debug = l.debug; info = l.info; warning = l.warning; error = l.error



def parse_bed(_infile):
    '''
    Input: motif bed file
        chr    st      en      gene    motif
        chr1   10      20      A,B,C   BAR
        chr1   11      21      FOO     X,Y,Z
        chr1   1111    1121    JANE1   DOE
        chr1   11131   11421   FOO1
        chr1   11151   11521   BAR1
    Output:
        edgelist: [['A', 'BAR'], ['B', 'BAR'], ['C', 'BAR'], ['FOO', 'X'], ['FOO', 'Y'], ['FOO', 'Z'], ['JANE1', 'DOE']]
    '''
    df = pd.read_csv(_infile, sep='\t')
    df = df[['gene', 'motif']]  # these column names are mocked.
    df2 = df.dropna()
    df2 = df2.assign(gene=df2['gene'].str.split(','),
                     motif=df2['motif'].str.split(',')).explode('gene').explode('motif').reset_index(drop=True)
    _list = df2.values.tolist()  # edge list of [gene, motif] pairs

    return _list


def networkX_helpers(input_TFbed):
    '''
    Input: an edge list of (TF, target) pairs, e.g. the output of parse_bed.
    Both nodes and edges are taken from this list, for example:
    my_graph.add_edges_from([('A', 'B'),
                             ('A', 'D'),
                             ('A', 'E'),
                             ('A', 'G'),
                             ('A', 'H'),
                             ('B', 'C'),
                             ('B', 'E'),
                             ('C', 'I'),
                             ('C', 'K'),
                             ('C', 'L'),
                             ('D', 'A'),
                             ('D', 'C'),
                             ('D', 'H'),
                             ('D', 'I'),
                             ('A', 'A'), ('H', 'H'), ('D', 'D'), ('H', 'A'), ('H', 'D')])
    Output:
        text file with in-degree, out-degree counts and CRC TF clique fractions.
    '''
    # Create a networkx graph object
    info("Initializing Graph")
    _graph = nx.DiGraph()

    # Add edges to the graph object
    info("Add edges to graph object")
    _graph.add_edges_from(input_TFbed)

    info("Calculating in-degree out-degree stats")

    # degrees_dict = {node: deg for (node, deg) in _graph.degree()}
    out_degree_dict = {node: deg for (node, deg) in _graph.out_degree()}
    in_degree_dict = {node: deg for (node, deg) in _graph.in_degree()}

    out_degree = pd.DataFrame(data={'TF': out_degree_dict.keys(), 'Out': out_degree_dict.values()})
    in_degree = pd.DataFrame(data={'TF': in_degree_dict.keys(), 'In': in_degree_dict.values()})

    NetworkMetricsOutput = pd.merge(out_degree, in_degree, left_on="TF", right_on="TF", how="outer")
    # TOTAL DEGREE
    NetworkMetricsOutput['Total'] = NetworkMetricsOutput['Out'] + NetworkMetricsOutput['In']

    info("Fetching self loops")
    # Self loops
    # autoregulatoryLoops = nx.selfloop_edges(_graph)  # not needed?

    selfLoops = list(nx.nodes_with_selfloops(_graph))

    info("Fetch mutually regulating pairs of self-looping TFs")
    nodePairs = []
    for ele in product(selfLoops, repeat=2):
        if ele[0] != ele[1] and _graph.has_edge(ele[0], ele[1]) and _graph.has_edge(ele[1], ele[0]):
            nodePairs.append(ele)

    info("Find self-regulating TF cliques")
    unDirGraph = nx.from_edgelist(nodePairs)
    cliqueGen = nx.find_cliques_recursive(unDirGraph)
    cliqueList = list(cliqueGen)

    # I am not sure what this does at this point but
    # this is a placeholder until SV confirms.
    # All cliques - not needed right now (SV)
    # cliqueGen_ALL = list(nx.find_cliques_recursive(_graph))

    info("Found %d candidate cliques", len(cliqueList))
    info("Score all the CRCs")
    # SCORING THE CRCs: sum the out-degree of each TF in the clique,
    # then divide by the number of TFs in the clique.
    cliqueRanking = []
    outDegreeDict = _graph.out_degree()

    for crcs in cliqueList:
        score = 0
        for gene in crcs:
            score += outDegreeDict[gene]
        score = score / len(crcs)
        if score > 0 and len(crcs) > 2:
            cliqueRanking.append((crcs, score))

    sortedRankedCliques = pd.DataFrame(sorted(cliqueRanking, reverse=True, key=lambda x: x[1]))

    factorEnrichmentDict = dict.fromkeys(selfLoops, 0)

    info("Calculate enrichment of each TF in a CRC clique")
    # Enrichment of each TF, calculated as
    # (number of CRC cliques with the given TF) / (number of CRC cliques).
    for crcClique in cliqueList:
        for TF in crcClique:
            factorEnrichmentDict[TF] += 1

    factorRankingTable = dict.fromkeys(selfLoops, 0)
    for TF in selfLoops:
        factorRankingTable[TF] = factorEnrichmentDict[TF] / float(len(cliqueRanking))

    FactorRank = pd.DataFrame(data={'TF': factorRankingTable.keys(),
                                    'TF_CliqueFraction': factorRankingTable.values()})

    # Currently unused (see the commented-out prints below); note that
    # cliques_containing_node is deprecated in recent networkx releases.
    TFSpecificCliques = nx.cliques_containing_node(unDirGraph)

    NetworkMetricsOutput = pd.merge(NetworkMetricsOutput, FactorRank, left_on="TF", right_on="TF", how="outer")

    info("Write to file")
    NetworkMetricsOutput.to_csv('TF_Degrees.csv', index=False)
    sortedRankedCliques.to_csv('Putative_CRC_Cliques.csv', index=False, header=False)

    # print(nx.cliques_containing_node(unDirGraph, "RFX3"))
    # print(len(nx.cliques_containing_node(unDirGraph, "RFX3")))
    # print(TFSpecificCliques)

    return _graph


DESCRIPTION = '''
Build a TF regulatory network (networkx) from a motif BED file and
report degree counts and CRC clique metrics.
'''



EPILOG = '''
'''

class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
                      argparse.RawDescriptionHelpFormatter):
    pass


parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG,
                                 formatter_class=CustomFormatter)

parser.add_argument('arg', help='motif BED file with gene and motif columns')
parser.add_argument('-v', '--verbose', action='store_true',
                    help='Set logging level to DEBUG')

args = parser.parse_args()

if args.verbose:
    l.setLevel(logging.DEBUG)

debug('%s begin', SCRIPT_PATH)

edge_list = parse_bed(args.arg)
networkX_helpers(edge_list)
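For reviewers, here is a minimal standalone sketch of the self-loop, clique, and scoring steps in networkX_helpers, run on the toy edge list from its docstring; variable names here are illustrative and not part of the module. With that toy graph, the only mutually regulating self-loop clique is {A, D, H}, with a mean out-degree score of 14/3 ≈ 4.67:

# Toy example of the CRC clique detection and scoring used in networkX_helpers.
from itertools import product

import networkx as nx

g = nx.DiGraph()
g.add_edges_from([('A', 'B'), ('A', 'D'), ('A', 'E'), ('A', 'G'), ('A', 'H'),
                  ('B', 'C'), ('B', 'E'), ('C', 'I'), ('C', 'K'), ('C', 'L'),
                  ('D', 'A'), ('D', 'C'), ('D', 'H'), ('D', 'I'),
                  ('A', 'A'), ('H', 'H'), ('D', 'D'), ('H', 'A'), ('H', 'D')])

# Self-regulating TFs (nodes with a self-loop): A, D, H
self_loops = list(nx.nodes_with_selfloops(g))

# Pairs of self-regulating TFs that also regulate each other
pairs = [(u, v) for u, v in product(self_loops, repeat=2)
         if u != v and g.has_edge(u, v) and g.has_edge(v, u)]

# Cliques of mutually regulating TFs -> candidate CRCs
cliques = list(nx.find_cliques_recursive(nx.from_edgelist(pairs)))  # [['A', 'D', 'H']]

# Score each clique by the mean out-degree of its members
for clique in cliques:
    score = sum(g.out_degree(tf) for tf in clique) / len(clique)
    print(sorted(clique), round(score, 2))  # ['A', 'D', 'H'] 4.67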