-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathcalculate_posterior.R
132 lines (106 loc) · 3.76 KB
/
calculate_posterior.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
args <- commandArgs(trailingOnly = TRUE)
counts<-args[1]
filename.posteriors<- args[2]
# clipping.info<- args[3]
filename.all<- args[3]
print(args)
# load libraries
library(data.table)
library(plyr)
library(dplyr)
library(rmutil)
library(readr)
library(rtracklayer)
# read counts file
df<- as.data.frame(fread(counts, sep="\t"))
print(paste("# of base positions=",nrow(df), sep=""))
# keep positions with minimum of 10 reads
df= subset(df, N >= 10)
print(paste("# of base positions after filtering for >10 reads=",nrow(df), sep=""))
# functions to calculate posterior mean and standard deviation
#Mean
calcMean <- function(successes, total, a, b) {
calcBetaMean <-
function(aa, bb) {
BetaMean <- (aa) / (aa + bb)
return(BetaMean)
}
posterior_a = a + successes
posterior_b = b + total - successes
posterior_mean <- calcBetaMean(posterior_a, posterior_b)
return(posterior_mean)
}
#Standard deviation
calcSd <- function(successes, total, a, b) {
calcBetaSd <-
function(aa, bb) {
BetaSd <-
sqrt((aa * bb) / (((aa + bb) ^ 2) * (aa + bb + 1)))
return(BetaSd)
}
posterior_a = a + successes
posterior_b = b + total - successes
posterior_sd <- calcBetaSd(posterior_a, posterior_b)
return(posterior_sd)
}
# estimate beta prior
estBetaParams <- function(mu, var) {
alpha <- ((1 - mu) / var - 1 / mu) * mu ^ 2
beta <- alpha * (1 / mu - 1)
return(params = list(alpha = alpha, beta = beta))
}
# features <- c("right", "left", "INS", "DEL")
features <- c("INS", "DEL")
# function to estimate posteriors
my.func<- function(i){
dat<- df[, c("bpos", "N", features[i])]
dat<- dat[,c(1,3,2)]
# get beta prior
mu<- mean(dat[,2]/dat[,3])
var<- var(dat[,2]/dat[,3])
beta.prior= estBetaParams(mu, var)
# subset dat
test<- dat[dat[,2]>0,]
# get pm and psd
test$pm= apply(test[, c(2,3)], 1, function(x) calcMean(x[1], x[2], beta.prior$alpha, beta.prior$beta))
test$psd= apply(test[, c(2,3)], 1, function(x) calcSd(x[1], x[2], beta.prior$alpha, beta.prior$beta))
colnames(test)[c(4,5)]<- paste(features[i], colnames(test)[c(4,5)], sep="_")
final<- test[, c(1,4,5)]
rownames(final)<- NULL
return(final)
}
pos.list<- llply(1:length(features), my.func, .progress = progress_text(char="+"))
# compile posteriors
print("output posteriors")
posteriors= Reduce(function(x, y) merge(x, y, all=TRUE, by="bpos"), pos.list)
posteriors[is.na(posteriors)]<- 0
write_tsv(posteriors,filename.posteriors)
# compile non indel and indel positions
print("add non tested positions")
non<- df$bpos[!df$bpos %in% posteriors$bpos]
non.df<- data.frame(matrix(nrow = length(non), ncol = ncol(posteriors)))
colnames(non.df)<- colnames(posteriors)
non.df$bpos<- non
non.df[is.na(non.df)]<- 0
input<- rbind(posteriors, non.df)
setDT(input, key="bpos")
# # load clipping info
# print("add clip information")
# if(!file.exists(clipping.info)){
# # create empty data frame when no clipping is present
# setcol<- c("right_mean_length","right_sd_length","right_IC","left_mean_length",
# "left_sd_length", "left_IC", "bpos")
# clip_info<- as.data.frame(matrix(nrow= nrow(input), ncol= length(setcol),data = 0))
# colnames(clip_info)<- setcol
# clip_info$bpos<- input$bpos
# }else{
# # read clip info file
# clip_info<- read.table(clipping.info)
# }
# setDT(clip_info, key = "bpos")
# merge data.table
# new.input<- merge(input, clip_info, by="bpos", all.x= T)
# new.input<- as.data.frame(new.input)
# new.input[is.na(new.input)]<- 0
# write_tsv(new.input, filename.all)
write_tsv(input, filename.all)