-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpfam_clans_analysis.R
82 lines (69 loc) · 3.98 KB
/
pfam_clans_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
###############################
### NOTE: Code not working.
### In file src/disordered_repeats_annotate.py activate the idr_perc calculation.
### therefore all files need to be recalculated...
### This could be done with the most recent version of swiss-prot/uniprot and the other DB and TRAL...
###
### idr_perc is calculated by the length of the disordered region divided by the length of the TR to which it overlaps.
### IDEA: - calculate this here in R from the data from the cluster (?).
### - get the pfam data from here: https://pfam.xfam.org/help#tabview=tab13
### - connect the two and get some information out of it. (WHICH INFORMATION?????)
rm(list = ls(all = TRUE))
gc()
library("ggplot2")
library("plyr")
library("scales")
require(grid)
library(RColorBrewer)
library(stringr)
setwd("/home/matteo/polybox/MSc_ACLS/swissrepeat/results")
source("helpers.R")
sp_path = "data/swissprot_annotations.tsv"
sp_all = load_swissprot(sp_path)
# tr_consensus_disorder_path = "results/tr_annotations/tr_all_with_disorder.csv"
tr_consensus_disorder_path = "results/tr_idr_overlap/repeat_disorder_overlap_regions.csv"
tr_consensus_disorder = load_tr_annotations(tr_consensus_disorder_path)
tr_consensus_disorder_sp = merge(x = tr_consensus_disorder, y = sp_all, by = "ID", all.x = TRUE)
pfam_families=read.delim("/home/matteo/polybox/MSc_ACLS/swissrepeat/data/pfam_annotations.tab", header = TRUE, sep='\t')
pfam_clans=read.delim("/home/matteo/polybox/MSc_ACLS/swissrepeat/data/Pfam-A.clans.tsv", header = TRUE, sep='\t')
# Annotate protein with a clan
# no run, only to create file, takes long!
# add_clan <- function(proteinID){
# pfam_families_str<-as.character(pfam_families[pfam_families$ID==proteinID, "Pfam_ID"])
# pfam_families = unlist(strsplit(pfam_families_str, ";"))
# clans<-c()
# for (family in pfam_families) {
# if (family %in% pfam_clans$Pfam_ID) {
# this_clan=as.character(pfam_clans[pfam_clans$Pfam_ID==family, "CL_ID"])
# clans<-c(clans,this_clan)
# }
# }
# return(paste(clans, sep=";"))}
#
#
# if ("parallel" %in% rownames(installed.packages())){
# # Multiprocessing by forking only if 'parallel' package is already installed
# library(parallel)
# ncores <- detectCores()-1
# pfam_families$Pfam_clans=mclapply(pfam_families$ID, add_clan, mc.cores = ncores)
# } else{
# pfam_families$Pfam_clans=lapply(pfam_families$ID, add_clan)
# }
#
# pfam_families$Pfam_clans = vapply(pfam_families$Pfam_clans, paste, collapse = ", ", character(1L))
# write.table(pfam_families, file = "../data/pfam_annotations_with_clans.tab")
# load annotation file produced above:
pfam_families=read.delim("/home/matteo/polybox/MSc_ACLS/swissrepeat/data/pfam_annotations_with_clans.tab", header = TRUE, sep="")
############ Plotting ###############
tr_consensus_disorder_sp = merge(x = tr_consensus_disorder_sp, y = pfam_families, by = "ID", all.x = TRUE)
#tr_consensus_disorder_sp$Pfam_clans=lapply(as.character(tr_consensus_disorder_sp$ID), add_clan)
disorder_type_bin <- with(tr_consensus_disorder_sp, cut(idr_perc, breaks = c(-0.1,0, 0.1, 0.3, 0.5, 1.0)))
tr_consensus_disorder_sp$disorder_group = disorder_type_bin
sorted_clan_counts=sort(table(unlist(tr_consensus_disorder_sp[tr_consensus_disorder_sp$l_type=='micro'&tr_consensus_disorder_sp$idr_perc>=0.8,"Pfam_clans"])), decreasing=TRUE)
sorted_clan_counts_df = data.frame(x=names(sorted_clan_counts[1:30]), y=sorted_clan_counts[1:30], row.names=NULL)
sorted_clan_counts_df = merge(sorted_clan_counts_df,pfam_clans[!duplicated(pfam_clans[,c('CL_Name','CL_ID')]),c('CL_Name','CL_ID')], by.x = "x", by.y = "CL_ID",all.x = TRUE)
sorted_clan_counts_df=na.omit(sorted_clan_counts_df[ order(-sorted_clan_counts_df[,"y"]), ])
ggplot(sorted_clan_counts_df[sorted_clan_counts_df$x!="",],aes(x=CL_Name, y=y.Freq)) +
geom_bar(stat='identity') + scale_x_discrete(limits = sorted_clan_counts_df$CL_Name) +
labs(title="Pfam clans, micro repeats", x="Pfam clan", y="Number of repeats") +
theme_minimal() + theme(axis.text.x = element_text(angle=45, hjust=1))