-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathneutrality_test.R
110 lines (91 loc) · 3.35 KB
/
neutrality_test.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# neutrality_test.R
#
# Author: Mouhamadou Fadel DIOP
# Date: 2021-10-06
#
# Purpose:
# Estimate Neutrality test for Plasmodium falciparum genes
# Produce tajima's D excel file.
#
# ------------------------------------------------------------------
# NOTE - uncomment these lines to install packages as needed
# install.packages("data.table")
# install.packages("PopGenome")
# Load packages
require(data.table)
require(PopGenome)
args <- commandArgs(trailingOnly = TRUE)
print(args)
input.vcf <- as.character(args[1])
gtFile <- as.character(args[2])
pathToOutput <- as.character(args[3])
if(!file.exists(input.vcf))
stop(input.vcf,": No such file or directory")
if(!file.exists(gtFile))
stop(gtFile,": No such file or directory")
if(!dir.exists(pathToOutput))
dir.create(pathToOutput)
Filename <- gsub("\\.[0-9a-zA-Z]*", "", basename(input.vcf))
tempDir <- paste0(pathToOutput, "/", Filename)
system(paste0("mkdir -p ", tempDir))
## Compress and index VCF file
# system(paste0("bgzip ", input.vcf))
# system(paste0("tabix ", input.vcf, ".gz"))
## Load Plasmodium falciparum GFF file
geneFile <- fread(gtFile, header = FALSE)
inc <- 1
fuli <- NULL
for (i in 1:nrow(geneFile))
{
cat("\n")
cat("i = ", i)
cat("\n")
cat("Extract ", geneFile$V6[i], " from ", input.vcf)
cat("\n")
# Create output file
File <- paste0(pathToOutput, "/", Filename, '_tajima.xlsx')
# Extract gene from vcf file
region <- paste0(geneFile$V1[i], ":", geneFile$V4[i], "-", geneFile$V5[i])
output <- paste0(tempDir, '/', geneFile$V6[i], ".vcf.gz")
system(paste0("bcftools view -r ", region," -Oz -o ", output, " ", input.vcf))
system(paste0("tabix ", output))
## Compute neutrality test on gene with at least 3 SNPs
skipNum <- as.numeric(system(paste("zgrep \"##\"", output," | wc -l"), TRUE))
vcf <- read.table(output,
skip = skipNum,
header = TRUE,
comment.char = "",
stringsAsFactors = FALSE,
check.names = FALSE)
NbreSNPs = nrow(vcf)
if(NbreSNPs >= 3)
{
GENOME.class <- readVCF(output, NbreSNPs, geneFile$V1[i], geneFile$V4[i], geneFile$V5[i], include.unknown = TRUE)
Stats <- neutrality.stats(GENOME.class, FAST=TRUE)
if(inc == 1)
{
geneName <- gsub(".vcf.gz", "", basename(output))
ff <- as.data.frame(subset(get.neutrality(Stats)[[1]], select=-c(3,6:9)))
fuli <- cbind(geneName, geneFile$V1[i], rownames(ff), ff)
cat("Save output as ", File)
system(paste0("touch ", File))
write.table(fuli, File, col.names = TRUE, row.names = FALSE, quote = FALSE, sep = '\t')
}
else
{
geneName <- gsub(".vcf.gz", "", basename(output))
ff <- as.data.frame(subset(get.neutrality(Stats)[[1]], select=-c(3,6:9)))
fuli <- cbind(geneName, geneFile$V1[i], rownames(ff), ff)
# Save file
write.table(fuli, File, col.names = FALSE, row.names = FALSE, quote = FALSE, append = TRUE, sep = '\t')
}
inc <- inc + 1
file.remove(output, paste0(output, ".tbi"))
}
else
{
cat('\n')
file.remove(output, paste0(output, ".tbi"))
}
}
system(paste0("rm -rf ", tempDir))