results/swissprot_expected_homorepeats.Rmd

---
  title: 'Expected Homorepeats'
output:
  html_document: default
pdf_document: default
date: "April, 2019"
---
  
  
# Housekeeping
```{r Housekeeping, echo=FALSE, message=FALSE, warning=FALSE}
# Before executing this file, do: setwd("path/to/your/git/on/swissrepeat/results")
setwd("/home/matteo/polybox/MSc_ACLS/swissrepeat/results")
source("local_config.R")
setwd(paste0(local_base_path,"/results"))

rm(list = ls(all = TRUE))
gc()
source("helpers.R")

# colour setup:
#library(RColorBrewer); display.brewer.all() # to display available colour palettes
colour_count = 13 # alternative: length(unique(sp_gathered$Kingdom))
getPalette = colorRampPalette(brewer.pal(9, "Dark2"))
cols1 <- c("#AA3939", "#AA7939", "#29506D", "#2D882D")  #http://paletton.com/#uid=7000I0kllllaFw0g0qFqFg0w0aF
cols2 <- c("#FFAAAA", "#FFDBAA", "#718EA4", "#88CC88") 
cols3 <- c("#801515", "#805215", "#123652", "#116611")
cols4 <- c("#550000", "#553100", "#042037", "#004400")
```

# Data Loading 
```{r Data Loading , echo=FALSE, eval=TRUE}
tr_path = "results/tr_annotations/tr_annotations.csv"
tr_all = load_tr_annotations(tr_path)

sp_path = "data/swissprot_annotations.tsv"
sp_all = load_swissprot(sp_path, tr_all)

# Add meta_data from sp_all to tr_all. -> Do a left join.
tr_all_sp = merge(x = tr_all, y = sp_all, by = "ID", all.x = TRUE)

# Add disorder information from modiDB
discoor_path = "results/disorder_annotations/mobidb_coordinates.csv"
discoor_all = load_disorder_annotations(discoor_path)

discoor_all$disorder_region_length =  (discoor_all$end - discoor_all$start) + 1
discoor_all$center = floor(discoor_all$start + (discoor_all$disorder_region_length/2))
discoor_all_sp = merge(x = discoor_all, y = sp_all, by = "ID", all.x = TRUE)

# Remove eventual regions with incorrect coordinates
discoor_all_sp = discoor_all_sp[(discoor_all_sp$center<discoor_all_sp$Length),]

#Aggregate disordered regions by protein, calculate disorder residues count in a protein
sp_protein= ddply(discoor_all, .(ID), summarize,
                  disorder_count=(sum(disorder_region_length)))
sp_protein = merge(x = sp_protein, y = sp_all, by = "ID", all.x = TRUE)

# load swissprot protein sequences
path_uniprot_sprot_fasta <- paste0(local_base_path, local_path_separator, "data", local_path_separator, "uniprot_sprot.fasta.gz")
sp_protein_sequences <- load_sp_protein_sequences(path_uniprot_sprot_fasta)

# load AA counts
aa_counts <- read.csv(paste0(local_base_path, local_path_separator, "data", local_path_separator, "aa_count.csv"), header = TRUE)
aa_counts <- aa_counts[,-1]
# Encode the order of the AA as increasing factor
aa_counts$aafac <- seq(1, length(df$aa))
# Add the disorder propensity from Uversky paper
aa_counts$disorderpropensity <- c(0.00, 0.004, 0.090, 0.113, 0.117, 0.195, 0.259, 0.263, 0.285, 0.291, 0.394, 0.401, 0.407, 0.437, 0.450, 0.588,0.665,0.713, 0.781,1.000, NA, NA, NA, NA, NA)


# # Remove NAs (*,-,.,+)
# aa_ignore = c("B", "X", "Z", "O", "U", "*","-",".","+", "other")
# dfSP <- subset(dfSP, !(dfSP[, 2] %in% aa_ignore))

# homo_path = "results/empirical_and_expected_homorepeat_counts/swiss_prot_homorepeat_frequencies/empirical_and_expected/swissprot.csv"
# homo_all = load_homorepeat_data(homo_path, aa_ignore, "all")
# 
# homo_disorder_path = "results/empirical_and_expected_homorepeat_counts/swiss_prot_homorepeat_frequencies/empirical_and_expected/mobidb_consensus.csv"
# homo_d_all = load_homorepeat_data(homo_disorder_path, aa_ignore, "disorder")
# 
# homo_order_path = "results/empirical_and_expected_homorepeat_counts/swiss_prot_homorepeat_frequencies/empirical_and_expected/mobidb_consensus_inverse.csv"
# homo_o_all = load_homorepeat_data(homo_order_path, aa_ignore, "order")
# 
# homo_exp_path = "results/empirical_and_expected_homorepeat_counts/swiss_prot_homorepeat_frequencies/expected_on_unbound_sequence/swissprot.csv"
# # homo_exp_path = "results/empirical_and_expected_homorepeat_counts/swiss_prot_homorepeat_frequencies/expected_on_unbound_sequence/swissprot_kingdomwise.csv"
# # couldn't find the file "swissport_kingdomwise.csv" and don't know where it's generated... however swissprot exists in this directory and seems to work.
# homo_exp_all = load_expected_homorepeat_frequencies(homo_exp_path, aa_ignore, "all")
# 
# homo_exp_path = "results/empirical_and_expected_homorepeat_counts/swiss_prot_homorepeat_frequencies/expected_on_unbound_sequence/mobidb_consensus.csv"
# homo_exp_d = load_expected_homorepeat_frequencies(homo_exp_path, aa_ignore, "disorder")
# 
# homo_exp_path = "results/empirical_and_expected_homorepeat_counts/swiss_prot_homorepeat_frequencies/expected_on_unbound_sequence/mobidb_consensus_inverse.csv"
# homo_exp_o = load_expected_homorepeat_frequencies(homo_exp_path, aa_ignore, "order")
```