Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
marakeby committed Oct 26, 2020
0 parents commit cb7162f
Show file tree
Hide file tree
Showing 51 changed files with 6,887 additions and 0 deletions.
14 changes: 14 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Ignore everything by default; only the patterns re-included with "!" below are tracked.
*
# Repository metadata, the readme and .gitkeep placeholders stay tracked.
!.gitattributes
!.gitignore
!readme.md
!.gitkeep
# Track Python sources.
!*.py
# Re-include directories so the file patterns in this list can match inside them.
!*/
# Track Jupyter notebooks.
!*.ipynb
# These directories are ignored again, after the directory re-include above.
_testing/
_test/
logs/
.vscode/
.idea/
.ipynb_checkpoints/
13 changes: 13 additions & 0 deletions config_path.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from os.path import join, realpath, dirname

# Directory that contains this file (symlinks resolved); every other path is
# derived from it.
BASE_PATH = dirname(realpath(__file__))

# Locations under <BASE_PATH>/data.
DATA_PATH = join(BASE_PATH, 'data')
GENE_PATH = join(DATA_PATH, 'genes')
PATHWAY_PATH = join(DATA_PATH, 'pathways')
REACTOME_PATHWAY_PATH = join(PATHWAY_PATH, 'Reactome')
# Original (misspelled) name, kept so existing imports keep working.
REACTOM_PATHWAY_PATH = REACTOME_PATHWAY_PATH
PROSTATE_DATA_PATH = join(DATA_PATH, 'prostate_paper')

# Locations under <BASE_PATH>/run.
RUN_PATH = join(BASE_PATH, 'run')
LOG_PATH = join(RUN_PATH, 'logs')
PROSTATE_LOG_PATH = join(LOG_PATH, 'p1000')
PARAMS_PATH = join(RUN_PATH, 'params')
PROSTATE_PARAMS_PATH = join(PARAMS_PATH, 'P1000')
# Original (misspelled) name, kept so existing imports keep working.
POSTATE_PARAMS_PATH = PROSTATE_PARAMS_PATH
Empty file added data/__init__.py
Empty file.
68 changes: 68 additions & 0 deletions data/data_access.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import logging

# from data.io_lung.data_reader import IODataLung
# from data.io_melanoma.data_reader import IODataMelanoma
# from data.io_msk.data_reader import IODataMSK
# from data.prostate_final.data_reader import ProstateDataFinal
# from data.ras.data_reader import RASData
# from data.ras_tcga.data_reader import RAS_TCGAData
# from data.simulated.data_reader import SimulatedData
# from data.tcga_skcm.data_reader import SKCMData
# from data.prostate_jake.data_reader import ProstateDataJake
from data.prostate_paper.data_reader import ProstateDataPaper
# from data.tcga_skcm.data_reader import SKCMData
import numpy as np

# from data.LVI.data_reader import LVIDataReader


#
# from data.claims.data_reader import ClaimsData
# from data.io.data_reader import IOData
# from data.mel.data_reader import MelData
# from data.melanoma_io.data_reader import Mel_IO
# from data.profile.data_reader import ProfileData
# from data.prostate.data_reader import ProstateData

class Data():
    """Thin front end that picks a dataset reader by name and forwards to it.

    Only the ``'prostate_paper'`` type is wired up; any other ``type`` is
    logged and rejected with ``ValueError``.
    """

    def __init__(self, id, type, params, test_size=0.3, stratify=True):
        """Store the settings and build the reader for ``type``.

        :param id: accepted for interface compatibility; not used by this class
        :param type: dataset name; must be ``'prostate_paper'``
        :param params: keyword arguments forwarded to the reader's constructor
        :param test_size: stored on the instance; not otherwise used here
        :param stratify: stored on the instance; not otherwise used here
        :raises ValueError: when ``type`` is not a supported dataset name
        """
        self.test_size = test_size
        self.stratify = stratify
        self.data_type = type
        self.data_params = params

        # Guard clause: reject anything but the single supported reader.
        if self.data_type != 'prostate_paper':
            logging.error('unsupported data type')
            raise ValueError('unsupported data type')

        self.data_reader = ProstateDataPaper(**params)

    def get_train_validate_test(self):
        """Return the reader's train/validate/test split unchanged."""
        return self.data_reader.get_train_validate_test()

    def get_train_test(self):
        """Return a train/test split with the validation part merged into training.

        :return: ``(x_train, x_test, y_train, y_test, info_train, info_test, columns)``
        """
        split = self.data_reader.get_train_validate_test()
        (x_fit, x_validate, x_test,
         y_fit, y_validate, y_test,
         info_fit, info_validate, info_test,
         columns) = split

        # Fold the validation samples into the training set.
        x_merged = np.concatenate((x_fit, x_validate))
        y_merged = np.concatenate((y_fit, y_validate))
        info_merged = list(info_fit) + list(info_validate)
        return x_merged, x_test, y_merged, y_test, info_merged, info_test, columns

    def get_data(self):
        """Return ``(x, y, info, columns)`` exactly as held by the reader."""
        reader = self.data_reader
        return reader.x, reader.y, reader.info, reader.columns

    def get_relevant_features(self):
        """Return the reader's relevant features, or ``None`` when the reader
        has no ``relevant_features`` attribute."""
        reader = self.data_reader
        if not hasattr(reader, 'relevant_features'):
            return None
        return reader.get_relevant_features()

22 changes: 22 additions & 0 deletions data/data_access_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from data.data_access import Data

# Smoke script: build a Data adapter for the 'prostate_paper' reader and print
# the ``info`` part of what ``get_data`` returns.

# File name handed to the reader through the 'selected_genes' parameter.
selected_genes = 'tcga_prostate_expressed_genes_and_cancer_genes.csv'

# Keyword arguments for the reader; Data forwards them as ``**params``.
reader_params = {
    'data_type': ['mut_important', 'cnv_del', 'cnv_amp'],
    'drop_AR': False,
    'cnv_levels': 3,
    'mut_binary': True,
    'balanced_data': False,
    'combine_type': 'union',  # intersection
    'use_coding_genes_only': True,
    'selected_genes': selected_genes,
    'training_split': 0,
}

data_params = {
    'id': 'ALL',
    'type': 'prostate_paper',
    'params': reader_params,
}

data_adapter = Data(**data_params)
x, y, info, columns = data_adapter.get_data()

print (info)
Empty file added data/genes/__init__.py
Empty file.
59 changes: 59 additions & 0 deletions data/gmt_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import os
import re

import pandas as pd


# data_dir = os.path.dirname(__file__)
class GMT():
    """Reader/writer for GMT-style gene-set files (one tab-separated set per line)."""

    def __init__(self):
        pass

    def load_data(self, filename, genes_col=1, pathway_col=0):
        """Load a GMT file into a long-format DataFrame.

        Each line is stripped and split on tabs. Any ``_copy...`` suffix is
        removed from every field. One output row is produced per gene found
        from column ``genes_col`` onwards.

        :param filename: path of the GMT file to read
        :param genes_col: index of the first gene column (default 1; use 2 when
            an extra information column follows the pathway column)
        :param pathway_col: index of the column holding the pathway name
        :return: DataFrame with columns ``group`` (pathway) and ``gene``; empty,
            but with both columns present, when the file yields no rows
        """
        with open(filename) as gmt:
            data_list = gmt.readlines()

        # Echo the first line as a quick sanity check; an empty file has none.
        if data_list:
            print(data_list[0])

        records = []
        for row in data_list:
            fields = row.strip().split('\t')
            fields = [re.sub('_copy.*', '', f) for f in fields]
            fields = [re.sub('\\n.*', '', f) for f in fields]
            genes = fields[genes_col:]
            if not genes:
                # Rows without gene columns (e.g. blank lines) add nothing and
                # may be too short to contain the pathway column at all.
                continue
            pathway = fields[pathway_col]
            for gene in genes:
                records.append({'group': pathway, 'gene': gene})

        # Explicit columns keep 'group'/'gene' addressable on an empty result.
        df = pd.DataFrame(records, columns=['group', 'gene'])
        print(df.head())

        return df

    def load_data_dict(self, filename):
        """Load a GMT file as ``{first column: [columns from index 2 onwards]}``.

        :param filename: file name relative to this module's directory; an
            absolute path is used as given
        :return: dict mapping the first field of each non-blank line to the
            list of its fields from index 2 on
        """
        # The module-level ``data_dir`` this method used to rely on is commented
        # out in this file, so the directory is resolved here instead.
        module_dir = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(module_dir, filename)) as gmt:
            data_list = gmt.readlines()

        if data_list:
            print(data_list[0])

        gene_sets = {}
        for row in data_list:
            if not row.strip():
                continue  # skip blank lines rather than adding an empty key
            # Drop the line terminator so the last gene does not keep it.
            fields = row.rstrip('\r\n').split('\t')
            gene_sets[fields[0]] = fields[2:]

        return gene_sets

    def write_dict_to_file(self, dict, filename):
        """Write ``dict`` to ``filename``, one line per key: the key followed
        by its values.

        :param dict: mapping of set name to an iterable of members (the
            parameter name shadows the builtin but is kept for callers)
        :param filename: path of the file to (over)write
        """
        # NOTE(review): fields are joined with the separator found in the
        # original code, while load_data/load_data_dict split on tabs; confirm
        # whether a tab was intended here.
        lines = [str(k) + ' ' + ' '.join(str(e) for e in dict[k]) + '\n' for k in dict]
        with open(filename, 'w') as gmt:
            gmt.writelines(lines)
        return
Empty file added data/pathways/__init__.py
Empty file.
34 changes: 34 additions & 0 deletions data/pathways/gmt_pathway.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import logging
import numpy as np
import pandas as pd
from data.gmt_reader import GMT


def get_KEGG_map(input_list, filename='c2.cp.kegg.v6.1.symbols.gmt', genes_col=1, shuffle_genes=False):
    '''
    Build a gene-by-pathway membership matrix from a GMT file.

    :param input_list: list of inputs under consideration (e.g. genes); these become the rows of the map
    :param filename: a gmt formatted file e.g. pathway1 gene1 gene2 gene3
    #                                     pathway2 gene4 gene5 gene6
    :param genes_col: the start index of the gene columns
    :param shuffle_genes: {True, False}; when True the map is replaced by a random 0/1 matrix
        of the same shape, drawn with the same overall ratio of ones
    :return: (mapp, genes, pathways): mapp is a 2-D array with rows = genes and columns = pathways,
        holding how often the gene is listed for the pathway (0 when it is not a member);
        genes and pathways are the matching row and column labels
    '''
    d = GMT()
    df = d.load_data(filename, genes_col)
    df['value'] = 1
    mapp = pd.pivot_table(df, values='value', index='gene', columns='group', aggfunc=np.sum)
    mapp = mapp.fillna(0)
    # Left-join onto the requested inputs so every input gets a row, including
    # those that appear in no pathway (filled with 0 below).
    cols_df = pd.DataFrame(index=input_list)
    mapp = cols_df.merge(mapp, right_index=True, left_index=True, how='left')
    mapp = mapp.fillna(0)
    genes = mapp.index
    pathways = mapp.columns
    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
    # array on both old and new pandas.
    mapp = mapp.values

    if shuffle_genes:
        logging.info('shuffling')
        ones_ratio = np.sum(mapp) / float(np.prod(mapp.shape))
        logging.info('ones_ratio {}'.format(ones_ratio))
        mapp = np.random.choice([0, 1], size=mapp.shape, p=[1 - ones_ratio, ones_ratio])
        # Report the ratio actually realised by the random map, not the target.
        random_ratio = np.sum(mapp) / float(np.prod(mapp.shape))
        logging.info('random map ones_ratio {}'.format(random_ratio))
    return mapp, genes, pathways
8 changes: 8 additions & 0 deletions data/pathways/gmt_pathway_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from data.pathways.gmt_pathway import get_KEGG_map
from os.path import expanduser
# Smoke script: build the KEGG membership map for a few genes and print it.
input_genes = ['AR', 'AKT', 'EGFR']
filename = expanduser('~/Data/pathways/MsigDB/c2.cp.kegg.v6.1.symbols.gmt')
mapp, genes, pathways = get_KEGG_map(input_genes, filename)
# A single formatted argument prints the same text on Python 2 and Python 3
# (the old ``print 'label', value`` statements are a syntax error on Python 3).
print('genes {}'.format(genes))
print('pathways {}'.format(pathways))
print('mapp {}'.format(mapp))
173 changes: 173 additions & 0 deletions data/pathways/reactome.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
import re
from os.path import join, dirname, expanduser
import networkx as nx
import pandas as pd
from data.gmt_reader import GMT

# Directory holding this module; the Reactome input files live beneath it in
# Reactome/input.
data_dir = dirname(__file__)
reactome_base_dir = join(data_dir, 'Reactome', 'input')

# File names inside reactome_base_dir.
relations_file_name = 'ReactomePathwaysRelation.txt'  # loaded as child/parent pairs
pathway_names = 'ReactomePathways.txt'  # loaded as id / name / species
pathway_genes = 'ReactomePathways.gmt'  # loaded through GMT.load_data


def add_edges(G, node, n_levels):
    """Hang a chain of ``n_levels`` copy nodes below ``node`` and return ``G``.

    The chain is node -> node_copy1 -> node_copy2 -> ... -> node_copy<n_levels>.
    With ``n_levels`` of 0 or less no edge is added.

    :param G: graph-like object exposing ``add_edges_from``; modified in place
    :param node: name of the node the chain starts from
    :param n_levels: number of copy nodes to append
    :return: the same ``G`` that was passed in
    """
    chain = [node] + [node + '_copy' + str(k) for k in range(1, n_levels + 1)]
    # Pair each chain member with its successor to form the edges.
    G.add_edges_from(list(zip(chain[:-1], chain[1:])))
    return G


def complete_network(G, n_leveles=4):
    """Cut ``G`` to ``n_leveles`` levels below 'root' and pad short branches.

    The sub-graph of nodes within ``n_leveles`` steps of 'root' is taken. Each
    terminal node of that sub-graph (out-degree 0) that lies closer to the root
    gets a chain of ``_copy`` nodes appended, so that every such branch ends
    ``n_leveles`` edges away from 'root'.

    :param G: networkx graph containing a node named 'root'
    :param n_leveles: number of levels to keep below the root (name kept as-is
        for callers that pass it by keyword)
    :return: the padded sub-graph (a new graph; ``G`` itself is not modified)
    """
    sub_graph = nx.ego_graph(G, 'root', radius=n_leveles)
    terminal_nodes = [n for n, d in sub_graph.out_degree() if d == 0]

    # The previous version also ran a shortest-path search on the full graph for
    # every terminal node and discarded the result; that dead work is removed.
    for node in terminal_nodes:
        # Path length counted in nodes (root included), i.e. edges + 1.
        distance = len(nx.shortest_path(sub_graph, source='root', target=node))
        if distance <= n_leveles:
            diff = n_leveles - distance + 1
            sub_graph = add_edges(sub_graph, node, diff)

    return sub_graph


def get_nodes_at_level(net, distance):
    """Return the nodes lying exactly ``distance`` steps away from 'root'.

    :param net: networkx graph containing a node named 'root'
    :param distance: number of steps from the root
    :return: list of node names (in set iteration order)
    """
    # Everything reachable within ``distance`` steps of the root ...
    within = set(nx.ego_graph(net, 'root', radius=distance))

    if distance < 1.:
        return list(within)

    # ... minus everything that is strictly closer.
    closer = set(nx.ego_graph(net, 'root', radius=distance - 1))
    within.difference_update(closer)
    return list(within)


def get_layers_from_net(net, n_levels):
    """Describe the first ``n_levels`` levels of ``net`` as parent -> children maps.

    :param net: networkx directed graph containing a node named 'root'
    :param n_levels: number of levels to describe, starting at level 0 (the root)
    :return: list with one dict per level; each dict maps a node name at that
        level to the list of its successors' names, with any ``_copy...``
        suffix removed from both
    """
    def base_name(name):
        # Drop the suffix carried by padding nodes.
        return re.sub('_copy.*', '', name)

    layers = []
    for level in range(n_levels):
        layer = {}
        for node in get_nodes_at_level(net, level):
            layer[base_name(node)] = [base_name(child) for child in net.successors(node)]
        layers.append(layer)
    return layers


class Reactome():
    """Low-level access to the Reactome input files as pandas DataFrames.

    All three tables are loaded eagerly on construction:
    ``pathway_names``, ``hierarchy`` and ``pathway_genes``.
    """

    def __init__(self):
        self.pathway_names = self.load_names()
        self.hierarchy = self.load_hierarchy()
        self.pathway_genes = self.load_genes()

    def load_names(self):
        """Read the pathway-names file.

        :return: DataFrame with columns ``reactome_id``, ``pathway_name``, ``species``
        """
        # NOTE(review): read_csv is called without header/names, so the first
        # line of the file is consumed as a header and then relabelled; if the
        # file has no header row its first record is lost. Confirm.
        names = pd.read_csv(join(reactome_base_dir, pathway_names), sep='\t')
        names.columns = ['reactome_id', 'pathway_name', 'species']
        return names

    def load_genes(self):
        """Read the pathway-to-genes GMT file.

        :return: long-format DataFrame from ``GMT.load_data``, using column 1
            as the pathway and columns 3 onwards as genes
        """
        gmt_path = join(reactome_base_dir, pathway_genes)
        return GMT().load_data(gmt_path, pathway_col=1, genes_col=3)

    def load_hierarchy(self):
        """Read the pathway-relations file.

        :return: DataFrame with columns ``child`` and ``parent``
        """
        # NOTE(review): same header caveat as load_names applies here.
        relations = pd.read_csv(join(reactome_base_dir, relations_file_name), sep='\t')
        relations.columns = ['child', 'parent']
        return relations


class ReactomeNetwork():
    """Directed-graph view of the human Reactome hierarchy under a synthetic 'root' node."""

    def __init__(self):
        self.reactome = Reactome()  # low level access to reactome pathways and genes
        self.netx = self.get_reactome_networkx()

    def get_terminals(self):
        """Return the nodes of the network that have no outgoing edge."""
        return [n for n, d in self.netx.out_degree() if d == 0]

    def get_roots(self):
        """Return the nodes exactly one step below the synthetic 'root' node."""
        # roots = [n for n, d in self.netx.in_degree() if d == 0]
        return get_nodes_at_level(self.netx, distance=1)

    def get_reactome_networkx(self):
        """Return a DiGraph representation of the Reactome hierarchy.

        The graph is built once; later calls return the instance stored in
        ``self.netx``.
        """
        if hasattr(self, 'netx'):
            return self.netx
        hierarchy = self.reactome.hierarchy
        # filter hierarchy to have human pathways only
        human_hierarchy = hierarchy[hierarchy['child'].str.contains('HSA')]
        # NOTE(review): edges run from the 'child' column to the 'parent'
        # column, and all traversals from 'root' follow that direction; confirm
        # the column labels assigned in Reactome.load_hierarchy match the file.
        net = nx.from_pandas_edgelist(human_hierarchy, 'child', 'parent', create_using=nx.DiGraph())
        net.name = 'reactome'

        # Attach every node without an incoming edge to a single 'root' node.
        roots = [n for n, d in net.in_degree() if d == 0]
        net.add_edges_from([('root', n) for n in roots])

        return net

    def info(self):
        """Return a short textual summary of the network."""
        # networkx.info() was removed in networkx 3.0; str(graph) is the
        # replacement summary there.
        if hasattr(nx, 'info'):
            return nx.info(self.netx)
        return str(self.netx)

    def get_tree(self):
        """Return the breadth-first-search tree of the network rooted at 'root'."""
        return nx.bfs_tree(self.netx, 'root')

    def get_completed_network(self, n_levels):
        """Return the network cut to ``n_levels`` levels, short branches padded with copies."""
        return complete_network(self.netx, n_leveles=n_levels)

    def get_completed_tree(self, n_levels):
        """Same as ``get_completed_network`` but starting from the BFS tree."""
        return complete_network(self.get_tree(), n_leveles=n_levels)

    def get_layers(self, n_levels, direction='root_to_leaf'):
        """Return the layer-to-layer maps, ending with a pathway-to-genes layer.

        :param n_levels: number of pathway layers to return
        :param direction: ``'root_to_leaf'`` takes the first ``n_levels`` levels
            below the root; any other value builds five levels and keeps the
            last ``n_levels`` of them
        :return: list of dicts; all but the last map a node to its successors,
            the last maps each terminal pathway to an array of its unique genes
            (empty when no gene is listed for the pathway)
        """
        if direction == 'root_to_leaf':
            net = self.get_completed_network(n_levels)
            layers = get_layers_from_net(net, n_levels)
        else:
            full_depth = 5  # depth the network is built to before slicing
            net = self.get_completed_network(full_depth)
            layers = get_layers_from_net(net, full_depth)
            layers = layers[full_depth - n_levels:full_depth]

        # get the last layer (genes level)
        terminal_nodes = [n for n, d in net.out_degree() if d == 0]  # set of terminal pathways
        # we need to find genes belonging to these pathways
        genes_df = self.reactome.pathway_genes

        gene_layer = {}
        for p in terminal_nodes:
            pathway_name = re.sub('_copy.*', '', p)
            gene_layer[pathway_name] = genes_df[genes_df['group'] == pathway_name]['gene'].unique()

        layers.append(gene_layer)
        return layers
Loading

0 comments on commit cb7162f

Please sign in to comment.