-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathorf_floss_cutoff.R
76 lines (64 loc) · 2.42 KB
/
orf_floss_cutoff.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#' usage:
#' Rscript --vanilla path/to/orf_floss_cutoff.R path_input path_output
#'
#' @path/to/orf_floss_cutoff.R: path to this script
#' @path_input: input floss score path (generated by `orf_floss.py`)
#' @path_output: output table of ORF floss score with upper bound attached
#' requires R >= 4.1, dplyr, readr, ggplot2
library(dplyr)
library(readr)
library(ggplot2)
args <- commandArgs(trailingOnly = TRUE)
path_input <- args[1]
path_output <- args[2]
# read data and clean
dtt <- read_tsv(path_input)
dtt <- rename(dtt, cnt = floss_cnt)
cds <- dtt |>
filter(type == 'CDS') |>
select(id:type) |>
arrange(id, -cnt) |>
filter(!duplicated(id), cnt > 0, !is.na(floss))
orf <- dtt |> filter(type == 'ORF') |> select(id:type)
# determine upper bound in each window
frange <- cds |>
mutate(grp = cut(rank(cnt), breaks = 200)) |>
group_by(grp) |>
summarise(cnt = median(cnt),
q1 = quantile(floss, probs = 0.25),
q3 = quantile(floss, probs = 0.75)) |>
mutate(upperbd = q3 + 1.5 * (q3 - q1))
# smoothing per window floss upper bound with loess
fit <- loess(upperbd ~ log10(cnt), data = frange)
frange$smoothed <- predict(fit, newdata = frange)
# predict floss upperbound
min_cnt <- first(frange$cnt)
min_cnt_floss <- first(frange$smoothed)
max_cnt <- last(frange$cnt)
max_cnt_floss <- last(frange$smoothed)
orf <- orf |>
mutate(floss_ubd = predict(fit, newdata = orf),
floss_ubd = case_when(
cnt <= min_cnt ~ min_cnt_floss,
cnt >= max_cnt ~ max_cnt_floss,
TRUE ~ floss_ubd))
orf_out <- orf |> select(
orf_id = id, tx_name, floss, floss_cnt = cnt, floss_ubd)
write_tsv(orf_out, file = path_output)
# plot
cds <- cds |>
mutate(floss_ubd = predict(fit, newdata = cds),
floss_ubd = case_when(
cnt <= min_cnt ~ min_cnt_floss,
cnt >= max_cnt ~ max_cnt_floss,
TRUE ~ floss_ubd))
p <- ggplot(bind_rows(cds, orf), aes(x = cnt, y = floss)) +
geom_point(aes(color = type), alpha = 0.2, size = 0.5) +
geom_line(data = frange, mapping = aes(x = cnt, y = smoothed)) +
scale_color_brewer(palette = 'Set1') +
scale_x_log10() +
theme_classic(base_size = 12) +
theme(legend.position = c(0.95, 0.95),
legend.justification = c(1, 1)) +
labs(x = 'Number of reads', y = 'FLOSS', color = NULL)
ggsave(paste0(path_output, '.jpg'), plot = p, width = 8, height = 6)