From 70c6fcbf6ee45d92fce6a27f177e720d3d584c2b Mon Sep 17 00:00:00 2001 From: sridhar0605 Date: Thu, 4 May 2023 10:51:11 -0500 Subject: [PATCH 1/8] networkx helpers --- crcminer/create_nodes_graph.py | 146 +++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 crcminer/create_nodes_graph.py diff --git a/crcminer/create_nodes_graph.py b/crcminer/create_nodes_graph.py new file mode 100644 index 0000000..82b7bde --- /dev/null +++ b/crcminer/create_nodes_graph.py @@ -0,0 +1,146 @@ + +import os +import logging +import argparse + +SCRIPT_PATH = os.path.abspath(__file__) +FORMAT = '[%(asctime)s] %(levelname)s %(message)s' +l = logging.getLogger() +lh = logging.StreamHandler() +lh.setFormatter(logging.Formatter(FORMAT)) +l.addHandler(lh) +l.setLevel(logging.INFO) +debug = l.debug; info = l.info; warning = l.warning; error = l.error + + + +def parse_bed(_infile): + import pandas as pd + df.read_csv(_infile, sep='\t') + df = df[['gene','motif']] # these names are mocked. + df2=df.dropna() + df2=df2.assign(gene=df2['gene'].str.split(','), + motif=df2['motif'].str.split(',')).explode('gene').explode('motif').reset_index(drop=True) + # print(df.head()) + _list=df2.values.tolist() # edgelist + + return _list + + +def networkX_helpers(input_TFbed): + ''' + Input is a node edge list + These will have both node and edge list + my_graph.add_edges_from([('A', 'B'), + ('A', 'D'), + ('A', 'E'), + ('A', 'G'), + ('A', 'H'), + ('B', 'C'), + ('B', 'E'), + ('C', 'I'), + ('C', 'K'), + ('C', 'L'), + ('D', 'A'), + ('D', 'C'), + ('D', 'H'), + ('D', 'I'), + ('A', 'A'),('H','H'),('D','D'),('H','A'),('H','D')]) + Output: + text file with indegree and outdegree counts. + + ''' + + import networkx as nx + + # Create a networkx graph object + info("Initializing Graph") + _graph = nx.DiGraph() + + # Add edges to to the graph object + info("Add edges to graph object") + _graph.add_edges_from(edge_list) + + info("Calculating indegree outdegree stats") + InDegreeDict = _graph.in_degree() + OutDegreeDict = _graph.out_degree() + + info("Fetching self Loops") + # Self loops + autoregulatoryLoops = nx.selfloop_edges(_graph) # not needed? + + selfLoops = list(nx.nodes_with_selfloops(_graph)) + + info("Fetch selfregulatory loops") + from itertools import product + pairs=[] + for ele in list(product(selfLoops,repeat=2)): + if ele[0] == ele[1]: + continue + pairs.append(ele) + + info("Fetch Clique") + unDirGraph = nx.from_edgelist(pairs) + cliqueGen = nx.find_cliques_recursive(unDirGraph) + cliqueList = list(cliqueGen) + + # I am not sure what this does at this point but + # this is a place holder until SV confirms + cliqueGen_ALL = list(nx.find_cliques_recursive(_graph)) + + info("Scores all the CRC's") + ''' + ## SCORING THE CRCs using sum outdegree for each TF and dividing by the number of TFs in the clique + ''' + cliqueRanking = [] + outDegreeDict = my_graph.out_degree() + + for crcs in cliqueList: + score = 0 + for gene in crcs: + score += outDegreeDict[gene] + score = score/len(crcs) + if score > 0 and len(crcs) > 2: + cliqueRanking.append((crcs, score)) + + + sortCliqueRanking = sorted(cliqueRanking, reverse=True, key=lambda x:x[1]) + + + return sortCliqueRanking # may be write this to table? + + + + + + +DESCRIPTION = ''' +Create networkx objects. 
+''' + + + +EPILOG = ''' +''' + +class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, + argparse.RawDescriptionHelpFormatter): + pass +parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG, + formatter_class=CustomFormatter) + +parser.add_argument('arg') +parser.add_argument('-v', '--verbose', action='store_true', + help='Set logging level to DEBUG') + +args = parser.parse_args() + +edge_list = parse_bed(args.arg) +networkX_helpers(edge_list) + + +if args.verbose: + l.setLevel(logging.DEBUG) + +debug('%s begin', SCRIPT_PATH) + From e8cf1745ca5d57cd5a650dca20af648effd28054 Mon Sep 17 00:00:00 2001 From: sridhar0605 Date: Thu, 4 May 2023 11:13:18 -0500 Subject: [PATCH 2/8] tidy --- crcminer/create_nodes_graph.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/crcminer/create_nodes_graph.py b/crcminer/create_nodes_graph.py index 82b7bde..7f75f73 100644 --- a/crcminer/create_nodes_graph.py +++ b/crcminer/create_nodes_graph.py @@ -1,4 +1,3 @@ - import os import logging import argparse @@ -105,15 +104,10 @@ def networkX_helpers(input_TFbed): sortCliqueRanking = sorted(cliqueRanking, reverse=True, key=lambda x:x[1]) - return sortCliqueRanking # may be write this to table? - - - - DESCRIPTION = ''' Create networkx objects. ''' From 460cdfda5db15a5d002490f120139989e4d60b14 Mon Sep 17 00:00:00 2001 From: sridhar0605 Date: Thu, 4 May 2023 13:47:15 -0500 Subject: [PATCH 3/8] jaccard index similarity between network and some code cleanup --- crcminer/compare_network_jaccardIndex.py | 88 ++++++++++++++++++++++++ crcminer/create_nodes_graph.py | 35 +++++++--- 2 files changed, 113 insertions(+), 10 deletions(-) create mode 100644 crcminer/compare_network_jaccardIndex.py diff --git a/crcminer/compare_network_jaccardIndex.py b/crcminer/compare_network_jaccardIndex.py new file mode 100644 index 0000000..d6f4afd --- /dev/null +++ b/crcminer/compare_network_jaccardIndex.py @@ -0,0 +1,88 @@ + +import os +import logging +import argparse + +SCRIPT_PATH = os.path.abspath(__file__) +FORMAT = '[%(asctime)s] %(levelname)s %(message)s' +l = logging.getLogger() +lh = logging.StreamHandler() +lh.setFormatter(logging.Formatter(FORMAT)) +l.addHandler(lh) +l.setLevel(logging.INFO) +debug = l.debug; info = l.info; warning = l.warning; error = l.error + +def parse_bed(_infile): + import pandas as pd + df = pd.read_csv(_infile, sep='\t') + df = df[['gene','motif']] # these names are mocked. 
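+    # Comma-separated gene/motif entries are split and exploded below so each
+    # row becomes a single (gene, motif) edge; edges are returned as tuples so
+    # that network_jaccard() can build sets from them.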
+ df2=df.dropna() + df2=df2.assign(gene=df2['gene'].str.split(','), + motif=df2['motif'].str.split(',')).explode('gene').explode('motif').reset_index(drop=True) + # print(df.head()) + _list=df2.values.tolist() # edgelist + # return tuple + # (['A', 'BAR'], ['B', 'BAR'], ['C', 'BAR'], ['FOO', 'X'], ['FOO', 'Y'], ['FOO', 'Z'], ['JANE1', 'DOE']) + return list(df2.itertuples(index=False, name=None)) + + + + +def network_jaccard(edgelist1, edgelist2): + ''' + inputs: + edgelist1: output from parse_bed for sampleX + + edgelist2: output from parse_bed for sampleY + + Output: + float: indicating similarity index + + ref: + https://medium.com/rapids-ai/similarity-in-graphs-jaccard-versus-the-overlap-coefficient-610e083b877d + + ''' + + edgeintersection = len(set(edgelist1).intersection(edgelist2)) + edgeunion = (len(edgelist1) + len(edgelist2)) - edgeintersection + + print(float((edgeintersection) / edgeunion)) + return float((edgeintersection) / edgeunion) + +DESCRIPTION = ''' +When you have two different network graph +objects between mutiple samples +compare them using jaccard similarity index +''' + +EPILOG = ''' +''' + +class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, + argparse.RawDescriptionHelpFormatter): + pass +parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG, + formatter_class=CustomFormatter) + +parser.add_argument('edgelist1') +parser.add_argument('-v', '--verbose', action='store_true', + help='Set logging level to DEBUG') + +args = parser.parse_args() + +e1 = parse_bed(args.edgelist1) +# print e1 +# [('A', 'BAR'), ('B', 'BAR'), ('C', 'BAR'), ('FOO', 'X'), ('FOO', 'Y'), ('FOO', 'Z'), ('JANE1', 'DOE')] +# copying the same set for testing. +e2 = parse_bed(args.edgelist1) +e2 = e2[0:3] # subsetting +# print(e2) +# [('A', 'BAR'), ('B', 'BAR'), ('C', 'BAR')] + +network_jaccard(e1, e2) # should print out 0.42857142857142855 + +if args.verbose: + l.setLevel(logging.DEBUG) + +debug('%s begin', SCRIPT_PATH) + diff --git a/crcminer/create_nodes_graph.py b/crcminer/create_nodes_graph.py index 7f75f73..b8a9979 100644 --- a/crcminer/create_nodes_graph.py +++ b/crcminer/create_nodes_graph.py @@ -14,16 +14,28 @@ def parse_bed(_infile): - import pandas as pd - df.read_csv(_infile, sep='\t') - df = df[['gene','motif']] # these names are mocked. - df2=df.dropna() - df2=df2.assign(gene=df2['gene'].str.split(','), - motif=df2['motif'].str.split(',')).explode('gene').explode('motif').reset_index(drop=True) - # print(df.head()) - _list=df2.values.tolist() # edgelist - - return _list + ''' + input: motif bed file + chr st en gene motif + chr1 10 20 A,B,C BAR + chr1 11 21 FOO X,Y,Z + chr1 1111 1121 JANE1 DOE + chr1 11131 11421 FOO1 + chr1 11151 11521 BAR1 + output: + edgelist: [['A', 'BAR'], ['B', 'BAR'], ['C', 'BAR'], ['FOO', 'X'], ['FOO', 'Y'], ['FOO', 'Z'], ['JANE1', 'DOE']] + + ''' + import pandas as pd + df = pd.read_csv(_infile, sep='\t') + df = df[['gene','motif']] # these names are mocked. + df2=df.dropna() + df2=df2.assign(gene=df2['gene'].str.split(','), + motif=df2['motif'].str.split(',')).explode('gene').explode('motif').reset_index(drop=True) + # print(df.head()) + _list=df2.values.tolist() # edgelist + + return _list def networkX_helpers(input_TFbed): @@ -64,6 +76,9 @@ def networkX_helpers(input_TFbed): InDegreeDict = _graph.in_degree() OutDegreeDict = _graph.out_degree() + # writing this out to a file + + info("Fetching self Loops") # Self loops autoregulatoryLoops = nx.selfloop_edges(_graph) # not needed? 
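A quick way to sanity-check the pieces added so far is to run them on a toy
edge list. The sketch below is only an illustration, not part of the patch
series: toy_edges_x and toy_edges_y are invented stand-ins for parse_bed()
output, the Jaccard calculation gives the same value as network_jaccard() for
duplicate-free edge lists, and the clique ranking follows the formula in
networkX_helpers() (sum of out-degrees divided by the number of TFs in the
clique, keeping cliques of more than two TFs with a positive score).

    import networkx as nx
    from itertools import product

    # Toy directed edges in the same (source, target) tuple shape that
    # parse_bed() emits; letters reuse the networkX_helpers() docstring example.
    toy_edges_x = [('A', 'B'), ('A', 'D'), ('B', 'C'), ('D', 'A'),
                   ('A', 'A'), ('D', 'D'), ('H', 'H'), ('H', 'A'),
                   ('A', 'H'), ('H', 'D'), ('D', 'H')]
    toy_edges_y = toy_edges_x[:6]  # pretend sample Y recovered only a subset

    # Jaccard similarity of the two edge sets (cf. network_jaccard()).
    shared = len(set(toy_edges_x) & set(toy_edges_y))
    union = len(set(toy_edges_x) | set(toy_edges_y))
    print('Jaccard index:', shared / union)  # 6/11, roughly 0.55, for this toy data

    # Clique ranking as in networkX_helpers(): self-looping (autoregulated) TFs
    # define an undirected graph whose maximal cliques are scored by mean
    # out-degree in the directed graph.
    graph = nx.DiGraph()
    graph.add_edges_from(toy_edges_x)
    self_loopers = list(nx.nodes_with_selfloops(graph))
    pairs = [(u, v) for u, v in product(self_loopers, repeat=2) if u != v]
    undirected = nx.from_edgelist(pairs)
    ranked = []
    for clique in nx.find_cliques(undirected):
        score = sum(graph.out_degree(tf) for tf in clique) / len(clique)
        if score > 0 and len(clique) > 2:
            ranked.append((clique, score))
    ranked.sort(key=lambda item: item[1], reverse=True)
    print('Ranked cliques:', ranked)  # one clique over {A, D, H}, score 10/3
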
From 15b7f356fba33a3794b9b55930bbfed5ac865552 Mon Sep 17 00:00:00 2001 From: sridhar0605 Date: Thu, 4 May 2023 14:40:19 -0500 Subject: [PATCH 4/8] click skeleton --- crcminer/crcminer.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 crcminer/crcminer.py diff --git a/crcminer/crcminer.py b/crcminer/crcminer.py new file mode 100644 index 0000000..363084f --- /dev/null +++ b/crcminer/crcminer.py @@ -0,0 +1,30 @@ +import click + + +# Command Group +@click.group(name='CRCminer') +def cli_tools(): + """Tool related commands""" + pass + +@cli_tools.command(name='mine', help='mines for CRC') +@click.option('--fasta', help='fasta') +@click.option('--enhancer', help='ROSE2 output of annotated (super)enhancers') +@click.option('--mapping', help='Motif ID to gene ID mapping file.') +def test(fasta,enchancer,mapping): + pass + +@cli_tools.command(name='compare', help='compare two networks') +@click.option('--edgelist1',help='edgelist from sampleX') +@click.option('--edgelist2',help='edgelist from sampleY') +def test2(edgelist1,edgelist2): + pass + +@cli_tools.command(name='report', help='report vizualization') +@click.option('--indir',help='input directory CRCminer results') + +def test3(indir): + pass + +if __name__ == '__main__': + cli_tools() \ No newline at end of file From 48dd761ef980b52149ecf19798487cca15488f00 Mon Sep 17 00:00:00 2001 From: sridhar0605 Date: Thu, 4 May 2023 14:54:35 -0500 Subject: [PATCH 5/8] click commands --- crcminer/crcminer.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/crcminer/crcminer.py b/crcminer/crcminer.py index 363084f..133178c 100644 --- a/crcminer/crcminer.py +++ b/crcminer/crcminer.py @@ -1,6 +1,4 @@ import click - - # Command Group @click.group(name='CRCminer') def cli_tools(): @@ -8,20 +6,20 @@ def cli_tools(): pass @cli_tools.command(name='mine', help='mines for CRC') -@click.option('--fasta', help='fasta') -@click.option('--enhancer', help='ROSE2 output of annotated (super)enhancers') -@click.option('--mapping', help='Motif ID to gene ID mapping file.') +@click.option('--fasta', type=click.Path(),help='fasta') +@click.option('--enhancer', type=click.Path(), help='ROSE2 output of annotated (super)enhancers') +@click.option('--mapping', type=click.Path(), help='Motif ID to gene ID mapping file.') def test(fasta,enchancer,mapping): pass @cli_tools.command(name='compare', help='compare two networks') -@click.option('--edgelist1',help='edgelist from sampleX') -@click.option('--edgelist2',help='edgelist from sampleY') +@click.option('--edgelist1', type=click.Path(), help='edgelist from sampleX') +@click.option('--edgelist2', type=click.Path(), help='edgelist from sampleY') def test2(edgelist1,edgelist2): pass @cli_tools.command(name='report', help='report vizualization') -@click.option('--indir',help='input directory CRCminer results') +@click.option('--indir',type=click.Path(),help='input directory CRCminer results') def test3(indir): pass From a514ca08e8380c6764b1fb086e708e5dcb96391f Mon Sep 17 00:00:00 2001 From: sridhar0605 Date: Thu, 4 May 2023 17:18:05 -0500 Subject: [PATCH 6/8] testing ssh --- crcminer/crcminer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/crcminer/crcminer.py b/crcminer/crcminer.py index 133178c..2603b6e 100644 --- a/crcminer/crcminer.py +++ b/crcminer/crcminer.py @@ -24,5 +24,8 @@ def test2(edgelist1,edgelist2): def test3(indir): pass +# todo +# add more description + if __name__ == '__main__': - cli_tools() \ No newline at end of 
file + cli_tools() From 90fb0c83dbb19bf3baecc4451eab829242027b6e Mon Sep 17 00:00:00 2001 From: sridhar0605 Date: Thu, 4 May 2023 19:28:32 -0500 Subject: [PATCH 7/8] sanitize names --- crcminer/{compare_network_jaccardIndex.py => compare.py} | 0 crcminer/{create_nodes_graph.py => network.py} | 6 +++--- 2 files changed, 3 insertions(+), 3 deletions(-) rename crcminer/{compare_network_jaccardIndex.py => compare.py} (100%) rename crcminer/{create_nodes_graph.py => network.py} (98%) diff --git a/crcminer/compare_network_jaccardIndex.py b/crcminer/compare.py similarity index 100% rename from crcminer/compare_network_jaccardIndex.py rename to crcminer/compare.py diff --git a/crcminer/create_nodes_graph.py b/crcminer/network.py similarity index 98% rename from crcminer/create_nodes_graph.py rename to crcminer/network.py index b8a9979..6f8c478 100644 --- a/crcminer/create_nodes_graph.py +++ b/crcminer/network.py @@ -1,6 +1,9 @@ import os import logging import argparse +import networkx as nx +import pandas as pd + SCRIPT_PATH = os.path.abspath(__file__) FORMAT = '[%(asctime)s] %(levelname)s %(message)s' @@ -26,7 +29,6 @@ def parse_bed(_infile): edgelist: [['A', 'BAR'], ['B', 'BAR'], ['C', 'BAR'], ['FOO', 'X'], ['FOO', 'Y'], ['FOO', 'Z'], ['JANE1', 'DOE']] ''' - import pandas as pd df = pd.read_csv(_infile, sep='\t') df = df[['gene','motif']] # these names are mocked. df2=df.dropna() @@ -62,8 +64,6 @@ def networkX_helpers(input_TFbed): ''' - import networkx as nx - # Create a networkx graph object info("Initializing Graph") _graph = nx.DiGraph() From 4555e99d565ec3d2575502093a29629353e33764 Mon Sep 17 00:00:00 2001 From: Srinidhi Varadharajan Date: Thu, 4 May 2023 21:36:16 -0500 Subject: [PATCH 8/8] Update network.py --- crcminer/network.py | 80 ++++++++++++++++++++++++++++++++------------- 1 file changed, 58 insertions(+), 22 deletions(-) diff --git a/crcminer/network.py b/crcminer/network.py index 6f8c478..230f600 100644 --- a/crcminer/network.py +++ b/crcminer/network.py @@ -41,6 +41,7 @@ def parse_bed(_infile): def networkX_helpers(input_TFbed): + ''' Input is a node edge list These will have both node and edge list @@ -60,54 +61,60 @@ def networkX_helpers(input_TFbed): ('D', 'I'), ('A', 'A'),('H','H'),('D','D'),('H','A'),('H','D')]) Output: - text file with indegree and outdegree counts. + text file with indegree, outdegree counts and CRC TF clique fractions. ''' - # Create a networkx graph object info("Initializing Graph") - _graph = nx.DiGraph() + _graph = nx.DiGraph() # Add edges to to the graph object info("Add edges to graph object") - _graph.add_edges_from(edge_list) + _graph.add_edges_from(input_nodelist) - info("Calculating indegree outdegree stats") - InDegreeDict = _graph.in_degree() - OutDegreeDict = _graph.out_degree() + info("Calculating in-degree out-degree stats") - # writing this out to a file + #degrees_dict = {node:deg for (node, deg) in _graph.degree()} + out_degree_dict = {node:deg for (node, deg) in _graph.out_degree()} + in_degree_dict = {node:deg for (node, deg) in _graph.in_degree()} + out_degree=pd.DataFrame(data={'TF':out_degree_dict.keys(), 'Out':out_degree_dict.values()}) + in_degree=pd.DataFrame(data={'TF':in_degree_dict.keys(), 'In':in_degree_dict.values()}) + NetworkMetricsOutput=pd.merge(out_degree,in_degree,left_on="TF",right_on="TF",how="outer") + # TOTAL DEGREE + NetworkMetricsOutput['Total'] = NetworkMetricsOutput['Out'] + NetworkMetricsOutput['In'] + info("Fetching self Loops") # Self loops - autoregulatoryLoops = nx.selfloop_edges(_graph) # not needed? 
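+    # TFs with self-loops are treated as autoregulated; below, pairs of such
+    # TFs that also regulate each other (edges in both directions) seed the
+    # undirected graph on which candidate CRC cliques are enumerated.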
+ #autoregulatoryLoops = nx.selfloop_edges(_graph) # not needed? selfLoops = list(nx.nodes_with_selfloops(_graph)) info("Fetch selfregulatory loops") from itertools import product - pairs=[] + nodePairs=[] for ele in list(product(selfLoops,repeat=2)): - if ele[0] == ele[1]: - continue - pairs.append(ele) + if ele[0] != ele[1] and _graph.has_edge(ele[0],ele[1]) and _graph.has_edge(ele[1],ele[0]): + nodePairs.append(ele) - info("Fetch Clique") - unDirGraph = nx.from_edgelist(pairs) + info("Fetch Self regulating Clique") + unDirGraph = nx.from_edgelist(nodePairs) cliqueGen = nx.find_cliques_recursive(unDirGraph) cliqueList = list(cliqueGen) # I am not sure what this does at this point but # this is a place holder until SV confirms - cliqueGen_ALL = list(nx.find_cliques_recursive(_graph)) + # All cliques - not needed right now (SV) + #cliqueGen_ALL = list(nx.find_cliques_recursive(_graph)) + print("hi",len(cliqueList)) info("Scores all the CRC's") ''' ## SCORING THE CRCs using sum outdegree for each TF and dividing by the number of TFs in the clique ''' cliqueRanking = [] - outDegreeDict = my_graph.out_degree() + outDegreeDict = _graph.out_degree() for crcs in cliqueList: score = 0 @@ -116,11 +123,40 @@ def networkX_helpers(input_TFbed): score = score/len(crcs) if score > 0 and len(crcs) > 2: cliqueRanking.append((crcs, score)) - - - sortCliqueRanking = sorted(cliqueRanking, reverse=True, key=lambda x:x[1]) - - return sortCliqueRanking # may be write this to table? + + + sortedRankedCliques = pd.DataFrame(sorted(cliqueRanking, reverse=True, key=lambda x:x[1])) + + factorEnrichmentDict = dict.fromkeys(selfLoops,0) + + info("Calculate enrichment of each TF in a CRC clique") + ''' + ## Enrichment of each TF calculated as (number of CRC cliques with the given TF)/(number of CRC cliques) + ''' + for crcClique in cliqueList: + for TF in crcClique: + factorEnrichmentDict[TF] += 1 + + factorRankingTable = dict.fromkeys(selfLoops,0) + for TF in selfLoops: + factorRankingTable[TF]=factorEnrichmentDict[TF]/float(len(cliqueRanking)) + + + FactorRank=pd.DataFrame(data={'TF':factorRankingTable.keys(), 'TF_CliqueFraction':factorRankingTable.values()}) + + TFSpecificCliques=nx.cliques_containing_node(unDirGraph) + + NetworkMetricsOutput=pd.merge(NetworkMetricsOutput,FactorRank,left_on="TF",right_on="TF",how="outer") + + info("Write to file") + NetworkMetricsOutput.to_csv('TF_Degrees.csv',index=False) + sortedRankedCliques.to_csv('Putative_CRC_Cliques.csv',index=False, header=False) + + #print(nx.cliques_containing_node(unDirGraph,"RFX3")) + #print(len(nx.cliques_containing_node(unDirGraph,"RFX3"))) + #print(TFSpecificCliques) + + return _graph DESCRIPTION = '''