-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathcount_pvalue.py
143 lines (117 loc) · 4.65 KB
/
count_pvalue.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import csv
from operator import attrgetter
import pandas as pd
import time
from scipy.stats import chisquare, ranksums
import sys
import os
import xlsxwriter
import numpy as np
from scipy import stats
def Count_pvalue_replicates(AS, output_dir, s1_namelist, s2_namelist, g1_name, g2_name):
len_S1 = len(s1_namelist)
len_S2 = len(s2_namelist)
S1_list, S2_list = {}, {}
writer_list = []
output_columns = ['chrom', 'geneName', 'splicedExonStart', 'splicedExonEnd', 'P_value', 'ChromRegionLong']
for k, sample1 in enumerate(s1_namelist):
S1_df = pd.read_csv(os.path.join(output_dir, sample1+"_"+AS+".csv"), delimiter="\t")
S1_list[k] = list(S1_df.values.tolist())
for p, sample2 in enumerate(s2_namelist):
S2_df = pd.read_csv(os.path.join(output_dir, sample2+"_"+AS+".csv"), delimiter="\t")
S2_list[p] = list(S2_df.values.tolist())
AS_file_length = len(S2_df)
#### need to check this part, whether it can read the values
for i in range(1, AS_file_length):
X, Y = [], []
N1s, N2s = [], []
for k in range(len_S1):
full_list = S1_list[k]
chrom = full_list[i][0]
gene = full_list[i][1]
start = full_list[i][2]
end = full_list[i][3]
n1 = float(full_list[i][10])
N1 = float(full_list[i][11])
N1s.append(N1)
N1 = N1 + n1
if N1 > 0:
r1 = n1/N1
X.append(r1)
for p in range(len_S2):
full_list = S2_list[p]
n2 = float(full_list[i][10])
N2 = float(full_list[i][11])
N2s.append(N2)
N1 = N2 + n2
if N2 > 0:
r2 = n2/N2
Y.append(r2)
##### added N filter: 11.14.2022
##### both N should be >= 10
if (min(np.mean(N1s), np.mean(N2s)) >= 10) and sum(np.array(X))!=0 and sum(np.array(Y))!=0:
#stat, p_val = ranksums(X, Y)
stat, p_val = stats.ttest_ind(X, Y)
#print("P-val", p_val)
chrom_region_long = chrom+':'+gene+":"+str(start)+"-"+str(end)
writer_list.append((chrom, gene, start, end, p_val, chrom_region_long))
df_out = pd.DataFrame(writer_list, columns = output_columns)
df_out.to_csv(os.path.join(output_dir, AS+"_Output.csv"), sep='\t', index=False)
for sample1 in s1_namelist:
os.remove(os.path.join(output_dir, sample1+"_"+AS+".csv"))
for sample2 in s2_namelist:
os.remove(os.path.join(output_dir, sample2+"_"+AS+".csv"))
return
def Count_pvalue(AS, output_dir, s1_namelist, s2_namelist, g1_name, g2_name):
len_S1, len_S2 = len(s1_namelist), len(s2_namelist)
S1_list, S2_list = {}, {}
writer_list = []
output_columns = ['chrom', 'geneName', 'splicedExonStart', 'splicedExonEnd', 'P_value', 'ratioDifference', 'absoluteRatioDifference', g1_name+':n1', g1_name+':N1', g2_name+':n2', g2_name+':N2', 'chromRegionLLong']
for k, sample1 in enumerate(s1_namelist):
S1_df = pd.read_csv(os.path.join(output_dir, sample1+"_"+AS+".csv"), delimiter="\t")
S1_list[k] = list(S1_df.values.tolist())
for p, sample2 in enumerate(s2_namelist):
S2_df = pd.read_csv(os.path.join(output_dir, sample2+"_"+AS+".csv"), delimiter="\t")
S2_list[p] = list(S2_df.values.tolist())
AS_file_length = len(S2_df)
for i in range(1,AS_file_length):
S1_n_total, S1_N_total, S2_n_total, S2_N_total = 0.0, 0.0, 0.0, 0.0
for k in range(len_S1):
full_list = S1_list[k]
S1_n_total += float(full_list[i][10])
S1_N_total += float(full_list[i][11])
chrom = full_list[i][0]
gene = full_list[i][1]
start = full_list[i][2]
end = full_list[i][3]
n1 = S1_n_total/len_S1
N1 = S1_N_total/len_S1
for p in range(len_S2):
full_list = S2_list[p]
S2_n_total += float(full_list[i][10])
S2_N_total += float(full_list[i][11])
n2 = S2_n_total/len_S2
N2 = S2_N_total/len_S2
##### added N filter: 11.14.2022
##### both N should be >= 10 and no negative values of n and N are allowed
if (min(N1, N2) >= 10) and (n1 < N1) and (n2 < N2):
N1 = N1 + n1
N2 = N2 + n2
if N1>0 and N2>0:
ratio_diff = (n1/N1) - (n2/N2)
abs_ratio_diff = abs(ratio_diff)
P0 = (n1+n2)/(N1+N2)
n10 = N1 * P0
n20 = N2 * P0
exp = [n10, N1-n10, n20, N2-n20]
if 0 not in exp:
res = chisquare([n1, N1-n1, n2, N2-n2], f_exp=exp, ddof = 1)
chrom_region_long = chrom+':'+gene+":"+str(start)+"-"+str(end)
writer_list.append((chrom, gene, start, end, res[1], ratio_diff, abs_ratio_diff, n1, N1-n1, n2, N2-n2, chrom_region_long))
df_out = pd.DataFrame(writer_list, columns = output_columns)
df_out.to_csv(os.path.join(output_dir, AS+"_Output.csv"), sep='\t', index=False)
for sample1 in s1_namelist:
os.remove(os.path.join(output_dir, sample1+"_"+AS+".csv"))
for sample2 in s2_namelist:
os.remove(os.path.join(output_dir, sample2+"_"+AS+".csv"))
return