-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilter_v2.R
126 lines (101 loc) · 3.19 KB
/
filter_v2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
library(data.table)
library(tictoc)
library(statip)
options(scipen=999)
# Calculates the amount of missing data in a row or column where missing data is defined as a -, N, NA
missi <- function(x)
{
sum(1*(x == "./."))/ length(x)
}
## TRIQUAD ##
# Calculates the number of different homozygous calls present in the data (1 - 4).
# Does not take heterozygous calls into account as these can be excluded later via the mixes call.
# Also does not take into account the frequency of each allele.
triquad <- function(x)
{
xx <- x[x != "./."];
res <- 1*(sum(1*(xx=="0/0"))>0) + 1*(sum(1*(xx=="1/1"))>0) + 1*(sum(1*(xx=="0/1"))>0);
res
}
#------------ Extracting genotypes from the raw data:
setwd("/media/Data/Data/Documents_Karim/Fadel/Alfred/Data/raw")
vcf <- "Farafenni_1990.vcf.gz"
Genotypes = '../filtered/Genotypes.txt'
expression = '%CHROM\t%POS\t%REF\t%ALT[\t%GT]\n'
system(sprintf("bcftools query -f'%s' %s > %s", expression, vcf, Genotypes))
#------- Computing the missingness on SNPs and samples
#---- put the first four columns in a variable
Genotype = fread(Genotypes, header = FALSE)
first4Column = subset(Genotype, select=c(1:4))
Genotype = subset(Genotype, select=-c(1:4))
#===================
## Remove invariants
#==================
varsnp <- apply(Genotype, 1, triquad)
keepvar <- which(varsnp != 1)
data2 <- cbind(first4Column, Genotype)
data2 <- data2[keepvar,]
geno <- data2[,5:ncol(data2)]
first4Column <- data2[,1:4]
nrow(geno)
ncol(geno)
# Determine missingness on SNPs/ Isolates
#======================================
tic();misssnp <- apply(geno, 1, missi);toc()
tic();missiso <- apply(geno, 2, missi);toc()
par(mfrow=c(2,1))
plot(missiso, pch=16)
plot(misssnp, pch='.')
#=====================================
# Remove SNPs with too much missingness
#=====================================
keepsnp <- which(misssnp <= 0.8)
data2 <- cbind(first4Column, geno)
data2 <- data2[keepsnp,]
geno <- data2[,5:ncol(data2)]
first4Column <- data2[,1:4]
nrow(geno)
ncol(geno)
#=======================================
# Determine missingness on SNPs/ Isolates
#======================================
tic();misssnp <- apply(geno, 1, missi);toc()
tic();missiso <- apply(geno, 2, missi);toc()
par(mfrow=c(2,1))
plot(missiso, pch=16)
plot(misssnp, pch='.')
## Remove isos with too much missingness
isomissallow <- 0.2
geno <- as.data.frame(geno)
geno <- geno[,which(missiso < isomissallow)]
nrow(geno)
ncol(geno)
############# ADDED
#===================
## Remove invariants
#==================
varsnp <- apply(geno, 1, triquad)
keepvar <- which(varsnp != 1)
data2 <- cbind(first4Column, geno)
data2 <- data2[keepvar,]
geno <- data2[,5:ncol(data2)]
first4Column <- data2[,1:4]
nrow(geno)
ncol(geno)
# Determine missingness on SNPs/ Isolates
#======================================
tic();misssnp <- apply(geno, 1, missi);toc()
tic();missiso <- apply(geno, 2, missi);toc()
par(mfrow=c(2,1))
plot(missiso, pch=16)
plot(misssnp, pch='.')
#=====================================
# Remove SNPs with too much missingness
#=====================================
keepsnp <- which(misssnp <= 0.2)
data2 <- cbind(first4Column, geno)
data2 <- data2[keepsnp,]
geno <- data2[,5:ncol(data2)]
first4Column <- data2[,1:4]
nrow(geno)
ncol(geno)