Add network module. #13

Merged · 8 commits · May 5, 2023
88 changes: 88 additions & 0 deletions crcminer/compare.py
@@ -0,0 +1,88 @@

import os
import logging
import argparse

SCRIPT_PATH = os.path.abspath(__file__)
FORMAT = '[%(asctime)s] %(levelname)s %(message)s'
l = logging.getLogger()
lh = logging.StreamHandler()
lh.setFormatter(logging.Formatter(FORMAT))
l.addHandler(lh)
l.setLevel(logging.INFO)
debug = l.debug; info = l.info; warning = l.warning; error = l.error

def parse_bed(_infile):
    '''Parse a motif BED file into an edge list of (gene, motif) tuples.'''
    import pandas as pd
    df = pd.read_csv(_infile, sep='\t')
    df = df[['gene', 'motif']]  # these column names are mocked.
    df2 = df.dropna()
    df2 = df2.assign(gene=df2['gene'].str.split(','),
                     motif=df2['motif'].str.split(',')).explode('gene').explode('motif').reset_index(drop=True)
    # returns tuples, e.g.
    # [('A', 'BAR'), ('B', 'BAR'), ('C', 'BAR'), ('FOO', 'X'), ('FOO', 'Y'), ('FOO', 'Z'), ('JANE1', 'DOE')]
    return list(df2.itertuples(index=False, name=None))




def network_jaccard(edgelist1, edgelist2):
    '''
    Inputs:
        edgelist1: output from parse_bed for sampleX
        edgelist2: output from parse_bed for sampleY

    Output:
        float: Jaccard similarity index of the two edge lists

    Ref:
        https://medium.com/rapids-ai/similarity-in-graphs-jaccard-versus-the-overlap-coefficient-610e083b877d
    '''
    edgeintersection = len(set(edgelist1).intersection(edgelist2))
    edgeunion = (len(edgelist1) + len(edgelist2)) - edgeintersection

    jaccard = edgeintersection / edgeunion if edgeunion else 0.0
    info('Jaccard similarity: %s', jaccard)
    return jaccard

DESCRIPTION = '''
Compare network graphs from two samples
using the Jaccard similarity index.
'''

EPILOG = '''
'''

class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
                      argparse.RawDescriptionHelpFormatter):
    pass


parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG,
                                 formatter_class=CustomFormatter)

parser.add_argument('edgelist1', help='motif BED file for sampleX')
parser.add_argument('-v', '--verbose', action='store_true',
                    help='Set logging level to DEBUG')

args = parser.parse_args()

if args.verbose:
    l.setLevel(logging.DEBUG)

debug('%s begin', SCRIPT_PATH)

e1 = parse_bed(args.edgelist1)
# e1:
# [('A', 'BAR'), ('B', 'BAR'), ('C', 'BAR'), ('FOO', 'X'), ('FOO', 'Y'), ('FOO', 'Z'), ('JANE1', 'DOE')]
# Re-parse the same file and subset it, purely for testing.
e2 = parse_bed(args.edgelist1)
e2 = e2[0:3]
# [('A', 'BAR'), ('B', 'BAR'), ('C', 'BAR')]

network_jaccard(e1, e2)  # should report 0.42857142857142855
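As a quick sanity check on the 0.42857142857142855 value noted above, here is a minimal standalone sketch of the same edge-based Jaccard computation, using the mocked edge lists from the comments (it does not import the module itself, since compare.py parses command-line arguments at import time):

# Edge-based Jaccard index on the mocked edge lists from compare.py's comments.
e1 = [('A', 'BAR'), ('B', 'BAR'), ('C', 'BAR'),
      ('FOO', 'X'), ('FOO', 'Y'), ('FOO', 'Z'), ('JANE1', 'DOE')]
e2 = e1[0:3]  # [('A', 'BAR'), ('B', 'BAR'), ('C', 'BAR')]

intersection = len(set(e1).intersection(e2))   # 3 shared edges
union = len(e1) + len(e2) - intersection       # 7 + 3 - 3 = 7
print(intersection / union)                    # 0.42857142857142855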

31 changes: 31 additions & 0 deletions crcminer/crcminer.py
@@ -0,0 +1,31 @@
import click


# Command group
@click.group(name='CRCminer')
def cli_tools():
    """Tool related commands"""
    pass


@cli_tools.command(name='mine', help='mines for CRC')
@click.option('--fasta', type=click.Path(), help='fasta')
@click.option('--enhancer', type=click.Path(), help='ROSE2 output of annotated (super)enhancers')
@click.option('--mapping', type=click.Path(), help='Motif ID to gene ID mapping file.')
def test(fasta, enhancer, mapping):
    pass


@cli_tools.command(name='compare', help='compare two networks')
@click.option('--edgelist1', type=click.Path(), help='edgelist from sampleX')
@click.option('--edgelist2', type=click.Path(), help='edgelist from sampleY')
def test2(edgelist1, edgelist2):
    pass


@cli_tools.command(name='report', help='report visualization')
@click.option('--indir', type=click.Path(), help='input directory of CRCminer results')
def test3(indir):
    pass


# todo
# add more description

if __name__ == '__main__':
    cli_tools()
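If it helps review, here is a minimal smoke-test sketch of the Click wiring using click's built-in CliRunner; it assumes the crcminer directory is importable as a package (i.e. crcminer.crcminer), which may not match the final layout:

# Hypothetical smoke test for the CLI group above; the import path is an assumption.
from click.testing import CliRunner

from crcminer.crcminer import cli_tools

runner = CliRunner()
result = runner.invoke(cli_tools, ['mine', '--help'])
assert result.exit_code == 0
assert 'mines for CRC' in result.output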
191 changes: 191 additions & 0 deletions crcminer/network.py
@@ -0,0 +1,191 @@
import os
import logging
import argparse
from itertools import product

import networkx as nx
import pandas as pd


SCRIPT_PATH = os.path.abspath(__file__)
FORMAT = '[%(asctime)s] %(levelname)s %(message)s'
l = logging.getLogger()
lh = logging.StreamHandler()
lh.setFormatter(logging.Formatter(FORMAT))
l.addHandler(lh)
l.setLevel(logging.INFO)
debug = l.debug; info = l.info; warning = l.warning; error = l.error



def parse_bed(_infile):
    '''
    Input: motif bed file
        chr    st      en      gene    motif
        chr1   10      20      A,B,C   BAR
        chr1   11      21      FOO     X,Y,Z
        chr1   1111    1121    JANE1   DOE
        chr1   11131   11421   FOO1
        chr1   11151   11521   BAR1
    Output:
        edgelist: [['A', 'BAR'], ['B', 'BAR'], ['C', 'BAR'], ['FOO', 'X'], ['FOO', 'Y'], ['FOO', 'Z'], ['JANE1', 'DOE']]
    '''
    df = pd.read_csv(_infile, sep='\t')
    df = df[['gene', 'motif']]  # these column names are mocked.
    df2 = df.dropna()
    df2 = df2.assign(gene=df2['gene'].str.split(','),
                     motif=df2['motif'].str.split(',')).explode('gene').explode('motif').reset_index(drop=True)
    _list = df2.values.tolist()  # edge list of [gene, motif] pairs

    return _list


def networkX_helpers(input_TFbed):
    '''
    Input: an edge list of (TF, target) pairs, e.g. the output of parse_bed.
    Both nodes and edges are taken from this list, for example:
    my_graph.add_edges_from([('A', 'B'),
                             ('A', 'D'),
                             ('A', 'E'),
                             ('A', 'G'),
                             ('A', 'H'),
                             ('B', 'C'),
                             ('B', 'E'),
                             ('C', 'I'),
                             ('C', 'K'),
                             ('C', 'L'),
                             ('D', 'A'),
                             ('D', 'C'),
                             ('D', 'H'),
                             ('D', 'I'),
                             ('A', 'A'), ('H', 'H'), ('D', 'D'), ('H', 'A'), ('H', 'D')])
    Output:
        text file with in-degree, out-degree counts and CRC TF clique fractions.
    '''
    # Create a networkx graph object
    info("Initializing Graph")
    _graph = nx.DiGraph()

    # Add edges to the graph object
    info("Add edges to graph object")
    _graph.add_edges_from(input_TFbed)

    info("Calculating in-degree out-degree stats")

    # degrees_dict = {node: deg for (node, deg) in _graph.degree()}
    out_degree_dict = {node: deg for (node, deg) in _graph.out_degree()}
    in_degree_dict = {node: deg for (node, deg) in _graph.in_degree()}

    out_degree = pd.DataFrame(data={'TF': out_degree_dict.keys(), 'Out': out_degree_dict.values()})
    in_degree = pd.DataFrame(data={'TF': in_degree_dict.keys(), 'In': in_degree_dict.values()})

    NetworkMetricsOutput = pd.merge(out_degree, in_degree, left_on="TF", right_on="TF", how="outer")
    # TOTAL DEGREE
    NetworkMetricsOutput['Total'] = NetworkMetricsOutput['Out'] + NetworkMetricsOutput['In']

    info("Fetching self loops")
    # Self loops
    # autoregulatoryLoops = nx.selfloop_edges(_graph)  # not needed?

    selfLoops = list(nx.nodes_with_selfloops(_graph))

    info("Fetch mutually regulating pairs of self-looping TFs")
    nodePairs = []
    for ele in product(selfLoops, repeat=2):
        if ele[0] != ele[1] and _graph.has_edge(ele[0], ele[1]) and _graph.has_edge(ele[1], ele[0]):
            nodePairs.append(ele)

    info("Find self-regulating TF cliques")
    unDirGraph = nx.from_edgelist(nodePairs)
    cliqueGen = nx.find_cliques_recursive(unDirGraph)
    cliqueList = list(cliqueGen)

    # I am not sure what this does at this point but
    # this is a placeholder until SV confirms.
    # All cliques - not needed right now (SV)
    # cliqueGen_ALL = list(nx.find_cliques_recursive(_graph))

    info("Found %d candidate cliques", len(cliqueList))
    info("Score all the CRCs")
    # SCORING THE CRCs: sum the out-degree of each TF in the clique,
    # then divide by the number of TFs in the clique.
    cliqueRanking = []
    outDegreeDict = _graph.out_degree()

    for crcs in cliqueList:
        score = 0
        for gene in crcs:
            score += outDegreeDict[gene]
        score = score / len(crcs)
        if score > 0 and len(crcs) > 2:
            cliqueRanking.append((crcs, score))

    sortedRankedCliques = pd.DataFrame(sorted(cliqueRanking, reverse=True, key=lambda x: x[1]))

    factorEnrichmentDict = dict.fromkeys(selfLoops, 0)

    info("Calculate enrichment of each TF in a CRC clique")
    # Enrichment of each TF, calculated as
    # (number of CRC cliques with the given TF) / (number of CRC cliques).
    for crcClique in cliqueList:
        for TF in crcClique:
            factorEnrichmentDict[TF] += 1

    factorRankingTable = dict.fromkeys(selfLoops, 0)
    for TF in selfLoops:
        factorRankingTable[TF] = factorEnrichmentDict[TF] / float(len(cliqueRanking))

    FactorRank = pd.DataFrame(data={'TF': factorRankingTable.keys(),
                                    'TF_CliqueFraction': factorRankingTable.values()})

    # Currently unused (see the commented-out prints below); note that
    # cliques_containing_node is deprecated in recent networkx releases.
    TFSpecificCliques = nx.cliques_containing_node(unDirGraph)

    NetworkMetricsOutput = pd.merge(NetworkMetricsOutput, FactorRank, left_on="TF", right_on="TF", how="outer")

    info("Write to file")
    NetworkMetricsOutput.to_csv('TF_Degrees.csv', index=False)
    sortedRankedCliques.to_csv('Putative_CRC_Cliques.csv', index=False, header=False)

    # print(nx.cliques_containing_node(unDirGraph, "RFX3"))
    # print(len(nx.cliques_containing_node(unDirGraph, "RFX3")))
    # print(TFSpecificCliques)

    return _graph


DESCRIPTION = '''
Build a TF regulatory network (networkx) from a motif BED file and
report degree counts and CRC clique metrics.
'''



EPILOG = '''
'''

class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
                      argparse.RawDescriptionHelpFormatter):
    pass


parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG,
                                 formatter_class=CustomFormatter)

parser.add_argument('arg', help='motif BED file with gene and motif columns')
parser.add_argument('-v', '--verbose', action='store_true',
                    help='Set logging level to DEBUG')

args = parser.parse_args()

if args.verbose:
    l.setLevel(logging.DEBUG)

debug('%s begin', SCRIPT_PATH)

edge_list = parse_bed(args.arg)
networkX_helpers(edge_list)
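For reviewers, here is a minimal standalone sketch of the self-loop, clique, and scoring steps in networkX_helpers, run on the toy edge list from its docstring; variable names here are illustrative and not part of the module. With that toy graph, the only mutually regulating self-loop clique is {A, D, H}, with a mean out-degree score of 14/3 ≈ 4.67:

# Toy example of the CRC clique detection and scoring used in networkX_helpers.
from itertools import product

import networkx as nx

g = nx.DiGraph()
g.add_edges_from([('A', 'B'), ('A', 'D'), ('A', 'E'), ('A', 'G'), ('A', 'H'),
                  ('B', 'C'), ('B', 'E'), ('C', 'I'), ('C', 'K'), ('C', 'L'),
                  ('D', 'A'), ('D', 'C'), ('D', 'H'), ('D', 'I'),
                  ('A', 'A'), ('H', 'H'), ('D', 'D'), ('H', 'A'), ('H', 'D')])

# Self-regulating TFs (nodes with a self-loop): A, D, H
self_loops = list(nx.nodes_with_selfloops(g))

# Pairs of self-regulating TFs that also regulate each other
pairs = [(u, v) for u, v in product(self_loops, repeat=2)
         if u != v and g.has_edge(u, v) and g.has_edge(v, u)]

# Cliques of mutually regulating TFs -> candidate CRCs
cliques = list(nx.find_cliques_recursive(nx.from_edgelist(pairs)))  # [['A', 'D', 'H']]

# Score each clique by the mean out-degree of its members
for clique in cliques:
    score = sum(g.out_degree(tf) for tf in clique) / len(clique)
    print(sorted(clique), round(score, 2))  # ['A', 'D', 'H'] 4.67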