Forked from marakeby/pnet_prostate_paper
Commit cb7162f (initial commit, 0 parents)
Showing 51 changed files with 6,887 additions and 0 deletions.
@@ -0,0 +1,14 @@
*
!.gitattributes
!.gitignore
!readme.md
!.gitkeep
!*.py
!*/
!*.ipynb
_testing/
_test/
logs/
.vscode/
.idea/
.ipynb_checkpoints/
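The ignore rules above form a whitelist: the leading * ignores everything, the ! entries re-include git metadata, the readme, Python sources, and notebooks, and !*/ re-includes directories so that whitelisted files inside subfolders can still be matched. The trailing entries then re-ignore local tooling and log directories, since later rules take precedence.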
@@ -0,0 +1,13 @@
from os.path import join, realpath, dirname

BASE_PATH = dirname(realpath(__file__))
DATA_PATH = join(BASE_PATH, 'data')
GENE_PATH = join(DATA_PATH, 'genes')
PATHWAY_PATH = join(DATA_PATH, 'pathways')
REACTOM_PATHWAY_PATH = join(PATHWAY_PATH, 'Reactome')
PROSTATE_DATA_PATH = join(DATA_PATH, 'prostate_paper')
RUN_PATH = join(BASE_PATH, 'run')
LOG_PATH = join(RUN_PATH, 'logs')
PROSTATE_LOG_PATH = join(LOG_PATH, 'p1000')
PARAMS_PATH = join(RUN_PATH, 'params')
POSTATE_PARAMS_PATH = join(PARAMS_PATH, 'P1000')
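The constants above centralize the repository's filesystem layout. A minimal usage sketch follows; it assumes this module is importable as config_path (the file name is not shown in this view), and the gene-list file name is hypothetical.

from os.path import join

from config_path import GENE_PATH, PROSTATE_LOG_PATH  # assumed module name

print(GENE_PATH)          # <repository root>/data/genes
print(PROSTATE_LOG_PATH)  # <repository root>/run/logs/p1000

# build a path to a hypothetical gene-list file under data/genes
gene_list_file = join(GENE_PATH, 'example_gene_list.csv')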
Empty file.
@@ -0,0 +1,68 @@
import logging

import numpy as np

from data.prostate_paper.data_reader import ProstateDataPaper

# alternative data readers, commented out in this commit:
# from data.io_lung.data_reader import IODataLung
# from data.io_melanoma.data_reader import IODataMelanoma
# from data.io_msk.data_reader import IODataMSK
# from data.prostate_final.data_reader import ProstateDataFinal
# from data.ras.data_reader import RASData
# from data.ras_tcga.data_reader import RAS_TCGAData
# from data.simulated.data_reader import SimulatedData
# from data.tcga_skcm.data_reader import SKCMData
# from data.prostate_jake.data_reader import ProstateDataJake
# from data.LVI.data_reader import LVIDataReader
# from data.claims.data_reader import ClaimsData
# from data.io.data_reader import IOData
# from data.mel.data_reader import MelData
# from data.melanoma_io.data_reader import Mel_IO
# from data.profile.data_reader import ProfileData
# from data.prostate.data_reader import ProstateData


class Data():
    def __init__(self, id, type, params, test_size=0.3, stratify=True):

        self.test_size = test_size
        self.stratify = stratify
        self.data_type = type
        self.data_params = params

        if self.data_type == 'prostate_paper':
            self.data_reader = ProstateDataPaper(**params)
        else:
            logging.error('unsupported data type')
            raise ValueError('unsupported data type')

    def get_train_validate_test(self):
        return self.data_reader.get_train_validate_test()

    def get_train_test(self):
        x_train, x_validate, x_test, y_train, y_validate, y_test, info_train, info_validate, info_test, columns = self.data_reader.get_train_validate_test()
        # combine the training and validation sets into one training set
        x_train = np.concatenate((x_train, x_validate))
        y_train = np.concatenate((y_train, y_validate))
        # info_train = pd.concat([info_train, info_validate])
        info_train = list(info_train) + list(info_validate)
        return x_train, x_test, y_train, y_test, info_train, info_test, columns

    def get_data(self):
        x = self.data_reader.x
        y = self.data_reader.y
        info = self.data_reader.info
        columns = self.data_reader.columns
        return x, y, info, columns

    def get_relevant_features(self):
        if hasattr(self.data_reader, 'relevant_features'):
            return self.data_reader.get_relevant_features()
        else:
            return None
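For reference, a minimal sketch of the combined train/test entry point follows. The parameter values mirror the test script in the next file, and running it requires the prostate_paper data files loaded by ProstateDataPaper, which are not part of this excerpt.

from data.data_access import Data

data_params = {'id': 'ALL', 'type': 'prostate_paper',
               'params': {
                   'data_type': ['mut_important', 'cnv_del', 'cnv_amp'],
                   'drop_AR': False,
                   'cnv_levels': 3,
                   'mut_binary': True,
                   'balanced_data': False,
                   'combine_type': 'union',
                   'use_coding_genes_only': True,
                   'selected_genes': 'tcga_prostate_expressed_genes_and_cancer_genes.csv',
                   'training_split': 0,
               }}

data_adapter = Data(**data_params)
# get_train_test() concatenates the training and validation folds and leaves the test fold untouched
x_train, x_test, y_train, y_test, info_train, info_test, columns = data_adapter.get_train_test()
print(x_train.shape)
print(x_test.shape)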
@@ -0,0 +1,22 @@
from data.data_access import Data

selected_genes = 'tcga_prostate_expressed_genes_and_cancer_genes.csv'

data_params = {'id': 'ALL', 'type': 'prostate_paper',
               'params': {
                   'data_type': ['mut_important', 'cnv_del', 'cnv_amp'],
                   'drop_AR': False,
                   'cnv_levels': 3,
                   'mut_binary': True,
                   'balanced_data': False,
                   'combine_type': 'union',  # alternative: 'intersection'
                   'use_coding_genes_only': True,
                   'selected_genes': selected_genes,
                   'training_split': 0,
               }
               }

data_adapter = Data(**data_params)
x, y, info, columns = data_adapter.get_data()

print(info)
Empty file.
@@ -0,0 +1,59 @@
import os
import re

import pandas as pd

# resolve bundled files relative to this module
data_dir = os.path.dirname(__file__)


class GMT():
    # genes_col: index of the first gene column (default 1; use 2 if, for example,
    # an extra information column follows the pathway column)
    # pathway_col: index of the pathway column (the first column, 0, by default)
    def load_data(self, filename, genes_col=1, pathway_col=0):

        data_dict_list = []
        with open(filename) as gmt:

            data_list = gmt.readlines()

            print data_list[0]
            for row in data_list:
                genes = row.strip().split('\t')
                genes = [re.sub('_copy.*', '', g) for g in genes]
                genes = [re.sub('\\n.*', '', g) for g in genes]
                for gene in genes[genes_col:]:
                    pathway = genes[pathway_col]
                    dict = {'group': pathway, 'gene': gene}
                    data_dict_list.append(dict)

        df = pd.DataFrame(data_dict_list)
        print df.head()

        return df

    def load_data_dict(self, filename):

        dict = {}
        with open(os.path.join(data_dir, filename)) as gmt:
            data_list = gmt.readlines()

            print data_list[0]
            for row in data_list:
                genes = row.split('\t')
                dict[genes[0]] = genes[2:]

        return dict

    def write_dict_to_file(self, dict, filename):
        lines = []
        with open(filename, 'w') as gmt:
            for k in dict:
                str1 = ' '.join(str(e) for e in dict[k])
                line = str(k) + ' ' + str1 + '\n'
                lines.append(line)
            gmt.writelines(lines)
        return

    def __init__(self):
        return
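A brief usage sketch of the reader above; the file name and contents are illustrative only. GMT files are tab-separated, one pathway per row, with the pathway identifier in the first column followed by its member genes.

from data.gmt_reader import GMT

# illustrative file 'example.gmt' (tab-separated):
# pathway_A    gene1    gene2
# pathway_B    gene2    gene3
gmt = GMT()
df = gmt.load_data('example.gmt')
# df holds one row per (pathway, gene) pair, in the columns 'group' and 'gene'
print(df.shape)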
Empty file.
@@ -0,0 +1,34 @@
import logging

import numpy as np
import pandas as pd

from data.gmt_reader import GMT


def get_KEGG_map(input_list, filename='c2.cp.kegg.v6.1.symbols.gmt', genes_col=1, shuffle_genes=False):
    '''
    :param input_list: list of inputs under consideration (e.g. genes)
    :param filename: a GMT-formatted file, one pathway per line, e.g.
        pathway1  gene1  gene2  gene3
        pathway2  gene4  gene5  gene6
    :param genes_col: index of the first gene column
    :param shuffle_genes: if True, replace the membership matrix with a random one of the same density
    :return: (mapp, genes, pathways) where mapp is a binary matrix with rows = genes and
        columns = pathways, and mapp[i, j] = 1 if gene i belongs to pathway j
    '''
    d = GMT()
    df = d.load_data(filename, genes_col)
    df['value'] = 1
    mapp = pd.pivot_table(df, values='value', index='gene', columns='group', aggfunc=np.sum)
    mapp = mapp.fillna(0)
    cols_df = pd.DataFrame(index=input_list)
    mapp = cols_df.merge(mapp, right_index=True, left_index=True, how='left')
    mapp = mapp.fillna(0)
    genes = mapp.index
    pathways = mapp.columns
    mapp = mapp.values  # .values replaces the deprecated DataFrame.as_matrix()

    if shuffle_genes:
        logging.info('shuffling')
        ones_ratio = np.sum(mapp) / np.prod(mapp.shape)
        logging.info('ones_ratio {}'.format(ones_ratio))
        mapp = np.random.choice([0, 1], size=mapp.shape, p=[1 - ones_ratio, ones_ratio])
        logging.info('random map ones_ratio {}'.format(ones_ratio))
    return mapp, genes, pathways
@@ -0,0 +1,8 @@
from data.pathways.gmt_pathway import get_KEGG_map
from os.path import expanduser

input_genes = ['AR', 'AKT', 'EGFR']
filename = expanduser('~/Data/pathways/MsigDB/c2.cp.kegg.v6.1.symbols.gmt')
mapp, genes, pathways = get_KEGG_map(input_genes, filename)
print 'genes', genes
print 'pathways', pathways
print 'mapp', mapp
@@ -0,0 +1,173 @@
import re
from os.path import join, dirname, expanduser

import networkx as nx
import pandas as pd

from data.gmt_reader import GMT

data_dir = dirname(__file__)
reactome_base_dir = join(data_dir, 'Reactome')
reactome_base_dir = join(reactome_base_dir, 'input')

relations_file_name = 'ReactomePathwaysRelation.txt'
pathway_names = 'ReactomePathways.txt'
pathway_genes = 'ReactomePathways.gmt'


def add_edges(G, node, n_levels):
    edges = []
    source = node
    for l in range(n_levels):
        target = node + '_copy' + str(l + 1)
        edge = (source, target)
        source = target
        edges.append(edge)

    G.add_edges_from(edges)
    return G


def complete_network(G, n_leveles=4):
    sub_graph = nx.ego_graph(G, 'root', radius=n_leveles)
    terminal_nodes = [n for n, d in sub_graph.out_degree() if d == 0]
    distances = [len(nx.shortest_path(G, source='root', target=node)) for node in terminal_nodes]
    # nunique, counts = np.unique(distances, return_counts=True)

    for node in terminal_nodes:
        distance = len(nx.shortest_path(sub_graph, source='root', target=node))
        if distance <= n_leveles:
            diff = n_leveles - distance + 1
            sub_graph = add_edges(sub_graph, node, diff)

    return sub_graph


def get_nodes_at_level(net, distance):
    # get all nodes within distance around the query node
    nodes = set(nx.ego_graph(net, 'root', radius=distance))

    # remove nodes that are not **at** the specified distance but closer
    if distance >= 1.:
        nodes -= set(nx.ego_graph(net, 'root', radius=distance - 1))

    return list(nodes)


def get_layers_from_net(net, n_levels):
    layers = []
    for i in range(n_levels):
        nodes = get_nodes_at_level(net, i)
        dict = {}
        for n in nodes:
            n_name = re.sub('_copy.*', '', n)
            next = net.successors(n)
            dict[n_name] = [re.sub('_copy.*', '', nex) for nex in next]
        layers.append(dict)
    return layers


class Reactome():

    def __init__(self):
        self.pathway_names = self.load_names()
        self.hierarchy = self.load_hierarchy()
        self.pathway_genes = self.load_genes()

    def load_names(self):
        filename = join(reactome_base_dir, pathway_names)
        df = pd.read_csv(filename, sep='\t')
        df.columns = ['reactome_id', 'pathway_name', 'species']
        return df

    def load_genes(self):
        filename = join(reactome_base_dir, pathway_genes)
        gmt = GMT()
        df = gmt.load_data(filename, pathway_col=1, genes_col=3)
        return df

    def load_hierarchy(self):
        filename = join(reactome_base_dir, relations_file_name)
        df = pd.read_csv(filename, sep='\t')
        df.columns = ['child', 'parent']
        return df


class ReactomeNetwork():

    def __init__(self):
        self.reactome = Reactome()  # low-level access to Reactome pathways and genes
        self.netx = self.get_reactome_networkx()

    def get_terminals(self):
        terminal_nodes = [n for n, d in self.netx.out_degree() if d == 0]
        return terminal_nodes

    def get_roots(self):
        # roots = [n for n, d in self.netx.in_degree() if d == 0]
        roots = get_nodes_at_level(self.netx, distance=1)
        return roots

    # get a DiGraph representation of the Reactome hierarchy
    def get_reactome_networkx(self):
        if hasattr(self, 'netx'):
            return self.netx
        hierarchy = self.reactome.hierarchy
        # filter hierarchy to have human pathways only
        human_hierarchy = hierarchy[hierarchy['child'].str.contains('HSA')]
        net = nx.from_pandas_edgelist(human_hierarchy, 'child', 'parent', create_using=nx.DiGraph())
        net.name = 'reactome'

        # add root node
        roots = [n for n, d in net.in_degree() if d == 0]
        root_node = 'root'
        edges = [(root_node, n) for n in roots]
        net.add_edges_from(edges)

        return net

    def info(self):
        return nx.info(self.netx)

    def get_tree(self):
        # convert to tree
        G = nx.bfs_tree(self.netx, 'root')
        return G

    def get_completed_network(self, n_levels):
        G = complete_network(self.netx, n_leveles=n_levels)
        return G

    def get_completed_tree(self, n_levels):
        G = self.get_tree()
        G = complete_network(G, n_leveles=n_levels)
        return G

    def get_layers(self, n_levels, direction='root_to_leaf'):
        if direction == 'root_to_leaf':
            net = self.get_completed_network(n_levels)
            layers = get_layers_from_net(net, n_levels)
            # layers = layers[:-2]
        else:
            net = self.get_completed_network(5)
            layers = get_layers_from_net(net, 5)
            layers = layers[5 - n_levels:5]

        # get the last layer (genes level)
        terminal_nodes = [n for n, d in net.out_degree() if d == 0]  # set of terminal pathways
        # we need to find genes belonging to these pathways
        genes_df = self.reactome.pathway_genes

        dict = {}
        missing_pathways = []
        for p in terminal_nodes:
            pathway_name = re.sub('_copy.*', '', p)
            genes = genes_df[genes_df['group'] == pathway_name]['gene'].unique()
            if len(genes) == 0:
                missing_pathways.append(pathway_name)
            dict[pathway_name] = genes

        layers.append(dict)
        return layers
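A short usage sketch of the network builder above. It assumes this module sits next to gmt_pathway in the data.pathways package and that the three Reactome input files named at the top of the file are available under data/pathways/Reactome/input; neither is shown in this excerpt.

from data.pathways.reactome import ReactomeNetwork  # assumed module path

reactome_net = ReactomeNetwork()
print(reactome_net.info())  # node and edge counts of the Reactome DiGraph

# one dict per level from the root, plus a final dict mapping terminal pathways to genes
layers = reactome_net.get_layers(n_levels=3)
print(len(layers))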