-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathswissprot_expected_homorepeats.Rmd
95 lines (78 loc) · 4.65 KB
/
swissprot_expected_homorepeats.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
---
title: 'Expected Homorepeats'
output:
html_document: default
pdf_document: default
date: "April, 2019"
---
# Housekeeping
```{r Housekeeping, echo=FALSE, message=FALSE, warning=FALSE}
# Before executing this file, do: setwd("path/to/your/git/on/swissrepeat/results")
setwd("/home/matteo/polybox/MSc_ACLS/swissrepeat/results")
source("local_config.R")
setwd(paste0(local_base_path,"/results"))
rm(list = ls(all = TRUE))
gc()
source("helpers.R")
# colour setup:
#library(RColorBrewer); display.brewer.all() # to display available colour palettes
colour_count = 13 # alternative: length(unique(sp_gathered$Kingdom))
getPalette = colorRampPalette(brewer.pal(9, "Dark2"))
cols1 <- c("#AA3939", "#AA7939", "#29506D", "#2D882D") #http://paletton.com/#uid=7000I0kllllaFw0g0qFqFg0w0aF
cols2 <- c("#FFAAAA", "#FFDBAA", "#718EA4", "#88CC88")
cols3 <- c("#801515", "#805215", "#123652", "#116611")
cols4 <- c("#550000", "#553100", "#042037", "#004400")
```
# Data Loading
```{r Data Loading , echo=FALSE, eval=TRUE}
tr_path = "results/tr_annotations/tr_annotations.csv"
tr_all = load_tr_annotations(tr_path)
sp_path = "data/swissprot_annotations.tsv"
sp_all = load_swissprot(sp_path, tr_all)
# Add meta_data from sp_all to tr_all. -> Do a left join.
tr_all_sp = merge(x = tr_all, y = sp_all, by = "ID", all.x = TRUE)
# Add disorder information from modiDB
discoor_path = "results/disorder_annotations/mobidb_coordinates.csv"
discoor_all = load_disorder_annotations(discoor_path)
discoor_all$disorder_region_length = (discoor_all$end - discoor_all$start) + 1
discoor_all$center = floor(discoor_all$start + (discoor_all$disorder_region_length/2))
discoor_all_sp = merge(x = discoor_all, y = sp_all, by = "ID", all.x = TRUE)
# Remove eventual regions with incorrect coordinates
discoor_all_sp = discoor_all_sp[(discoor_all_sp$center<discoor_all_sp$Length),]
#Aggregate disordered regions by protein, calculate disorder residues count in a protein
sp_protein= ddply(discoor_all, .(ID), summarize,
disorder_count=(sum(disorder_region_length)))
sp_protein = merge(x = sp_protein, y = sp_all, by = "ID", all.x = TRUE)
# load swissprot protein sequences
path_uniprot_sprot_fasta <- paste0(local_base_path, local_path_separator, "data", local_path_separator, "uniprot_sprot.fasta.gz")
sp_protein_sequences <- load_sp_protein_sequences(path_uniprot_sprot_fasta)
# load AA counts
aa_counts <- read.csv(paste0(local_base_path, local_path_separator, "data", local_path_separator, "aa_count.csv"), header = TRUE)
aa_counts <- aa_counts[,-1]
# Encode the order of the AA as increasing factor
aa_counts$aafac <- seq(1, length(df$aa))
# Add the disorder propensity from Uversky paper
aa_counts$disorderpropensity <- c(0.00, 0.004, 0.090, 0.113, 0.117, 0.195, 0.259, 0.263, 0.285, 0.291, 0.394, 0.401, 0.407, 0.437, 0.450, 0.588,0.665,0.713, 0.781,1.000, NA, NA, NA, NA, NA)
# # Remove NAs (*,-,.,+)
# aa_ignore = c("B", "X", "Z", "O", "U", "*","-",".","+", "other")
# dfSP <- subset(dfSP, !(dfSP[, 2] %in% aa_ignore))
# homo_path = "results/empirical_and_expected_homorepeat_counts/swiss_prot_homorepeat_frequencies/empirical_and_expected/swissprot.csv"
# homo_all = load_homorepeat_data(homo_path, aa_ignore, "all")
#
# homo_disorder_path = "results/empirical_and_expected_homorepeat_counts/swiss_prot_homorepeat_frequencies/empirical_and_expected/mobidb_consensus.csv"
# homo_d_all = load_homorepeat_data(homo_disorder_path, aa_ignore, "disorder")
#
# homo_order_path = "results/empirical_and_expected_homorepeat_counts/swiss_prot_homorepeat_frequencies/empirical_and_expected/mobidb_consensus_inverse.csv"
# homo_o_all = load_homorepeat_data(homo_order_path, aa_ignore, "order")
#
# homo_exp_path = "results/empirical_and_expected_homorepeat_counts/swiss_prot_homorepeat_frequencies/expected_on_unbound_sequence/swissprot.csv"
# # homo_exp_path = "results/empirical_and_expected_homorepeat_counts/swiss_prot_homorepeat_frequencies/expected_on_unbound_sequence/swissprot_kingdomwise.csv"
# # couldn't find the file "swissport_kingdomwise.csv" and don't know where it's generated... however swissprot exists in this directory and seems to work.
# homo_exp_all = load_expected_homorepeat_frequencies(homo_exp_path, aa_ignore, "all")
#
# homo_exp_path = "results/empirical_and_expected_homorepeat_counts/swiss_prot_homorepeat_frequencies/expected_on_unbound_sequence/mobidb_consensus.csv"
# homo_exp_d = load_expected_homorepeat_frequencies(homo_exp_path, aa_ignore, "disorder")
#
# homo_exp_path = "results/empirical_and_expected_homorepeat_counts/swiss_prot_homorepeat_frequencies/expected_on_unbound_sequence/mobidb_consensus_inverse.csv"
# homo_exp_o = load_expected_homorepeat_frequencies(homo_exp_path, aa_ignore, "order")
```