-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathGender.R
133 lines (110 loc) · 5.54 KB
/
Gender.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#-------------------------------------------------------------------------------------------#
# Author : Andrew J Skelton |
# Language : R Statistical Programming Language |
# Study : WES PID |
# Data Owner : Newcastle University - Prof. Sophie Hambleton |
# Description : Validate Gender of samples, by assessing the coverage of the SRY Gene. |
# Using the Pedigree file, we can determine expected gender against known |
# gender to identify potential errors |
#-------------------------------------------------------------------------------------------#
##'Set the Working Directory, load essential packages
##'-----------------------------------------------------------------------------------------#
library(readr)
library(ggplot2)
library(dplyr)
setwd("/Volumes/WORKING_DATA/Exome_Project/../David/Raw_Data/")
ped_in <- read_tsv("Scripts/Ref/Samples.ped", col_names = F) %>%
as.data.frame %>% na.omit
# Set hard limits on
# extracted loci
low_lim = 2655000
hi_lim = 2656000
##'-----------------------------------------------------------------------------------------#
##' Read in Coverage Files
##' - Read coverage file, check that the file isn't empty
##' Tag the dataframe with the file/ sample name
##'-----------------------------------------------------------------------------------------#
coverage <- c()
cov_files <- list.files("Preprocessing/",
pattern = "*.cov",
full.names = T,
recursive = T)
cov_names <- sapply(cov_files, function(x){strsplit(basename(x),"_")[[1]][1]})
for(i in 1:length(cov_files)){
if(file.size(cov_files[i]) > 0) {
foo <- read_tsv(cov_files[i], col_names = F) %>%
as.data.frame %>%
mutate(Sample = cov_names[i])
coverage <- bind_rows(coverage, foo)
} else {
message(paste0("Empty File: ", cov_files[i]))
}
}
##'-----------------------------------------------------------------------------------------#
##' Read in Coverage Files
##' Code block for checking single samples
##'-----------------------------------------------------------------------------------------#
# coverage_in <- coverage %>% filter(Sample %in% cov_names[1:5])
# ggplot(coverage_in, aes(x = X5, y = X6, group = Sample, colour = Sample)) +
# geom_line(size = 1) +
# theme_bw()
##'-----------------------------------------------------------------------------------------#
##' Check Samples
##' - Loop through each of the processed Samples
##' Assign estimated gender based on SRY gene coverage
##' Check for mismatches, flag and log
##'-----------------------------------------------------------------------------------------#
sample_error <- c()
error_out <- c()
targets <- unique(coverage$Sample) %>% .[. %in% ped_in$X2]
for(i in targets){
foo <- coverage %>% filter(Sample == i, X5 > 1001, X5 < 2000) %>%
left_join(ped_in, by = c("Sample" = "X2"))
foo_known <- ifelse(foo$X5.y[1] == "1" ,"M","F")
foo_Pred <- ifelse(mean(foo$X6.x) > 5, "M", "F")
# message(paste0("Sample: ", i, " - ", foo_known, " ", foo_Pred, " ", mean(foo$X6.x)))
if(foo_known != foo_Pred){
message(paste0("Potential Sample Mismatch: ", i,
"\n Logged as ", foo_known,
", SRY coverage suggests ",
foo_Pred, "\n Mean Cov: ", mean(foo$X6.x), "\n"))
sample_error <- c(sample_error,i)
error_out <- bind_rows(error_out,
data.frame(SampleID = i,
MeanSRY = mean(foo$X6.x) %>% round(2),
Logged_Gender = foo_known,
Predicted_Gender = foo_Pred))
}
}
##'-----------------------------------------------------------------------------------------#
##' Plot Errors
##' - Take each of the potential mismatches
##' Graph the coverage for visual inspection
##'-----------------------------------------------------------------------------------------#
coverage_in <- coverage %>%
filter(Sample %in% sample_error) %>%
left_join(ped_in, by = c("Sample" = "X2")) %>%
mutate(Gender = ifelse(X5.y == "1" ,"M","F"))
gg <- ggplot(coverage_in, aes(x = X5.x, y = X6.x, group = Sample, colour = Gender)) +
geom_line(size = 1) +
facet_grid(Sample ~ .) +
xlab("Locus") +
ylab("Depth") +
theme_bw() +
guides(colour = guide_legend(title = "Known Gender")) +
ggtitle("Gender Mismatches")
print(gg)
##'-----------------------------------------------------------------------------------------#
##' Compile Results
##' - Output the results in an xlsx speadsheets for clinicians
##'-----------------------------------------------------------------------------------------#
wb <- openxlsx::createWorkbook()
openxlsx::addWorksheet(wb, "Gender_Mismatches")
openxlsx::writeData(wb, "Gender_Mismatches", error_out)
openxlsx::saveWorkbook(wb, "/Volumes/WORKING_DATA/Exome_Project/QC/GenderMismatch.xlsx",
overwrite = T)
png("/Volumes/WORKING_DATA/Exome_Project/QC/GenderMismatch.png",
width=8, height=9, units="in", res=600)
print(gg)
dev.off()
##'-----------------------------------------------------------------------------------------#