# -*- coding: utf-8 -*-
###############################################################################
# Bioinformatic processing of multi-gene MiSeq data.
# First, use the PEAR paired-end read merger to merge the paired sequence files.
# PEAR is invoked as "pear" and requires sequence files that are NOT gzipped.
# Merging only works well for 18S and COI, as very few 16S and 26S read pairs overlap.
###############################################################################
import glob, os, subprocess, re
def run_PEAR(R1_file, R2_file, outfile, log):
    # Merge one pair of read files with PEAR (-v 50 = minimum 50 bp overlap)
    start_PEAR = "pear -f {0} -r {1} -v 50 -o {2}".format(R1_file, R2_file, outfile)
    print(start_PEAR)
    subprocess.call(start_PEAR.split(), stdout=log, stderr=subprocess.STDOUT)
os.chdir("Wx80_sequence_data") # Path to raw sequence data
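# If the raw files arrive gzipped, decompress them first: PEAR cannot read
# .gz input. This block is an illustrative addition, not part of the original
# pipeline; it assumes any gzipped files sit alongside the raw fastq files
# and is a no-op otherwise.
import gzip, shutil
for gz_file in glob.glob("*.fastq.gz"):
    with gzip.open(gz_file, "rb") as f_in, open(gz_file[:-3], "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)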
log = open("PEAR_log.txt", "a")
R1_files = glob.glob("*R1_001.fastq")
for R1_file in R1_files:
print("R1 file: {0}".format(R1_file))
R2_file = glob.glob("{0}*R2_001.fastq".format(R1_file[:13]))[0]
print("R2 file: {0}".format(R2_file))
outfile = "{0}_pear_contigs".format(R1_file[:13])
print("Outfile {0}".format(outfile))
run_PEAR(R1_file, R2_file, outfile, log)
log.close()
###############################################################################
# Next, split the merged (18S, COI) and/or unmerged (16S, 26S) sequence files
# by primer, and relabel the sequences by sample.
###############################################################################
from Bio import SeqIO, SeqUtils # This part uses Biopython
def split_relabel(infile, gene, m, sample_IDs, output_handle):
    ## Set up logfile (the header row is written once, at module level below)
    log = open("Primer_split_log.txt", "a")
    if m == "merged":
        label = re.split('Wx80_|_pear', infile)[1]
    elif m == "unmerged":
        label = re.split('Wx80_|_L001', infile)[1]
    reads_handle = open(infile)
    #reads_handle = gzip.open(infile) # Use if files are gzipped
    well = re.split('_S', label)[0] # NZGL well number
    sample_ID = sample_IDs.get(well) # Corresponding sample ID
    count = 0
    primer_f = primerlist.get(gene)[0]
    primer_r = primerlist.get(gene)[1]
    trimmed = ""
    for record in SeqIO.parse(reads_handle, "fastq"):
        # nt_search returns [pattern, pos1, pos2, ...]; a forward-primer hit
        # at the very start of the read means len > 1 and first position == 0
        primer_search = SeqUtils.nt_search(str(record.seq), primer_f)
        if len(primer_search) > 1 and primer_search[1] == 0:
            if m == "merged":
                trimmed = record[len(primer_f):-len(primer_r)] # Trim both primers
            elif m == "unmerged":
                trimmed = record[len(primer_f):] # Trim the forward primer only
            trimmed.id = "{0}|gene_{1}|sample_{2}".format(trimmed.id, gene, sample_ID) # Add gene and sample ID
            print(trimmed.id)
            print(len(trimmed.seq))
            SeqIO.write(trimmed, output_handle, "fastq")
            count += 1
    print("{0} {1} {2} {3}".format(gene, label, sample_ID, count))
    print("{0} {1}: Saved {2} reads".format(label, gene, count))
    log.write("{0} {1} {2}: {3}\n".format(gene, label, sample_ID, count))
    reads_handle.close()
    # output_handle stays open; the caller closes it after all input files
    log.close()
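# After relabelling, every retained read ID carries its gene and sample,
# e.g. (hypothetical ID): M00000:1:000-A1B2C:1:1101:1234:5678|gene_18S|sample_S2-01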
# Load a tab-delimited list of gene names and primer sequences.
primerlist = {}
primerfile = open("../Wx80_metadata/Barcode_split_primers.txt", "r")
for row in primerfile:
    gene, primer_f, primer_r = row.strip().split("\t")
    primerlist[gene] = primer_f, primer_r
    print("{0} {1} {2}".format(gene, primer_f, primer_r))
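# Each row of Barcode_split_primers.txt is expected to be tab-delimited,
# e.g. (hypothetical placeholders): 18S<tab>FORWARD_PRIMER<tab>REVERSE_PRIMER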
# Load a tab-delimited list of sample IDs
sample_IDs = {}
ID_file = open("../Wx80_metadata/NZGL01278_Sample_ID.txt", "r")
for row in ID_file:
    sample, name = row.strip().split("\t")
    if name.startswith("S2-"): # Exclude any non-Wx80 samples
        sample_IDs[sample] = name
        print("{0} {1}".format(sample, name))
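# Each row of NZGL01278_Sample_ID.txt is expected to pair a well number with
# a sample name, e.g. (hypothetical row): 1A01<tab>S2-01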
log = open("Primer_split_log.txt","a")
log.write("Reads file\tPrimer\tRead count\n")
log.close()
# Split and relabel all the sequences
for gene in "18S","COI":
outfile = open("Wx80_{0}_all_merged.fastq".format(gene), "a")
infiles = glob.glob("*_pear*.fastq")
for infile in infiles:
print ("Processing {0}".format(infile))
split_relabel(infile, gene, m = "merged", sample_IDs = sample_IDs, output_handle = outfile)
outfile.close()
for gene in "16S","26S":
outfile = open("Wx80_{0}_all_R1_reads.fastq".format(gene), "a")
infiles = glob.glob("*_R1_001.fastq")
for infile in infiles:
print ("Processing {0}".format(infile))
split_relabel(infile, gene, m = "unmerged", sample_IDs = sample_IDs, output_handle = outfile)
outfile.close()
primerfile.close()
ID_file.close() # sample_IDs is a dict; close the file handle it was read from
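# Quick sanity check (an illustrative addition, not part of the original
# pipeline): count the relabelled reads that each combined file received.
for combined in glob.glob("Wx80_*_all_*.fastq"):
    read_count = sum(1 for _ in SeqIO.parse(combined, "fastq"))
    print("{0}: {1} reads".format(combined, read_count))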
###############################################################################
# Then process each amplicon file into OTUs using USEARCH.
# For merged sequences, trim to length appropriate for each target.
# For unmerged R1 reads, trim to 250 bp.
###############################################################################
#usearch = "../Usearch/usearch7.0.1090_i86linux64"
usearch = "../Usearch/usearch8.0.1623_i86linux64"
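# Fail early (illustrative addition) if the usearch binary is missing, rather
# than partway through the pipeline.
if not os.path.isfile(usearch):
    raise IOError("usearch binary not found: {0}".format(usearch))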
def filter_trim_merged(input_file, label, minlen, maxee, log):
    # Trim and quality filter merged sequences
    filter_trim = "{0} -fastq_filter {1} -fastq_minlen {2} -fastq_maxee {3} -fastaout {4}_filter_trim.fasta".format(usearch, input_file, minlen, maxee, label)
    subprocess.call(filter_trim.split(), stdout=log, stderr=subprocess.STDOUT)
    log.write('\nFilter and trim reads:\n' + filter_trim + '\n')
def filter_trim_reads(input_file, label, trunclen, maxee, log):
    # Trim and quality filter unmerged R1 reads
    filter_trim = "{0} -fastq_filter {1} -fastq_trunclen {2} -fastq_maxee {3} -fastaout {4}_filter_trim.fasta".format(usearch, input_file, trunclen, maxee, label)
    subprocess.call(filter_trim.split(), stdout=log, stderr=subprocess.STDOUT)
    log.write('\nFilter and trim reads:\n' + filter_trim + '\n')
def get_usearch_OTUs(label, minsize, log):
    # Remove duplicate sequences
    derep = "{0} -derep_fulllength {1}_filter_trim.fasta -fastaout {1}_uniques.fasta -sizeout".format(usearch, label)
    subprocess.call(derep.split(), stdout=log, stderr=subprocess.STDOUT)
    log.write("\nDereplicate filtered/trimmed reads:\n" + derep + "\n")
    # Sort sequences by size (abundance)
    sortbysize = "{0} -sortbysize {1}_uniques.fasta -minsize {2} -fastaout {1}_uniques_sorted.fasta".format(usearch, label, minsize)
    subprocess.call(sortbysize.split(), stdout=log, stderr=subprocess.STDOUT)
    log.write("\nSort reads by size, minsize {0}:\n".format(minsize) + sortbysize + "\n")
    # Cluster the filtered, dereplicated, and sorted sequences into OTUs
    clusterOTUs = "{0} -cluster_otus {1}_uniques_sorted.fasta -otus {1}_OTUs.fasta -relabel OTU -sizein -sizeout -fastaout {1}_OTUS_uparse.fasta".format(usearch, label)
    subprocess.call(clusterOTUs.split(), stdout=log, stderr=subprocess.STDOUT)
    log.write("\nCluster sequences into OTUs at 97%:\n" + clusterOTUs + "\n")
    # Filter the OTUs for de novo chimeras
    chimera_denovo = "{0} -uchime_denovo {1}_OTUs.fasta -uchimeout {1}_OTUs_filtered.uchime -nonchimeras {1}_OTUs_good.fasta".format(usearch, label)
    log.write("\nChimera filter OTUs:\n" + chimera_denovo + "\n")
    subprocess.call(chimera_denovo.split(), stdout=log, stderr=subprocess.STDOUT)
    # Map the filtered sequence reads to the OTUs
    searchReads = "{0} -usearch_global {1}_filter_trim.fasta -db {1}_OTUs_good.fasta -strand plus -id 0.97 -uc {1}_OTUs_num_readmap.uc".format(usearch, label)
    subprocess.call(searchReads.split(), stdout=log, stderr=subprocess.STDOUT)
    log.write("\nMap the filtered/trimmed reads to the OTUs:\n" + searchReads + "\n")
    # Make an OTU abundance table
    makeOTUtable = "python make_otutable_from_readmap.py {0}_OTUs_num_readmap".format(label)
    subprocess.call(makeOTUtable.split(), stdout=log, stderr=subprocess.STDOUT)
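# make_otutable_from_readmap.py is a separate helper script, not reproduced
# here. A minimal sketch of the idea, assuming the standard tab-separated
# USEARCH .uc format (field 0 = record type, field 8 = query label,
# field 9 = target/OTU label): tally "H" (hit) records per OTU and sample,
# using the |sample_... tag added by split_relabel.
def sketch_otu_table(uc_path):
    counts = {}  # (OTU, sample) -> read count
    with open(uc_path) as uc:
        for line in uc:
            fields = line.rstrip("\n").split("\t")
            if fields[0] != "H": # Skip unmapped ("N") and other record types
                continue
            sample = fields[8].split("|sample_")[-1]
            key = (fields[9], sample)
            counts[key] = counts.get(key, 0) + 1
    return counts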
maxee_values = [2.5, 1.0, 0.5]
minsize_values = [1, 2]
genes = ["16S", "18S", "26S", "COI"]
for gene in genes:
    f = glob.glob("Wx80_{0}_all_*fastq".format(gene))[0]
    m = f.split("_")[3].split(".fastq")[0] # "merged" or "R1", taken from the filename
    if gene == "18S":
        minlen = 310
    elif gene == "COI":
        minlen = 300
    elif gene == "16S" or gene == "26S":
        trunclen = 250
    # To run with different maxee filtering values:
    #for maxee in maxee_values:
    # Or to run with different minsize values:
    for minsize in minsize_values:
        maxee = 1.0
        #minsize = 2
        label = "Wx80_{0}_{1}_MEE{2}_min{3}".format(gene, m, maxee, minsize)
        log = open("{0}_usearch_log.txt".format(label), "a")
        if m == "merged":
            filter_trim_merged(f, label, minlen, maxee, log)
            get_usearch_OTUs(label, minsize, log)
        elif m == "R1":
            filter_trim_reads(f, label, trunclen, maxee, log)
            get_usearch_OTUs(label, minsize, log)
        log.close()
        # Clean up the intermediate files for this run
        os.remove("{0}_filter_trim.fasta".format(label))
        os.remove("{0}_uniques.fasta".format(label))
        os.remove("{0}_uniques_sorted.fasta".format(label))