-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Upload scripts for phylogenomic analysis with GFs
- Loading branch information
1 parent
cadb045
commit 5fdcc49
Showing
10 changed files
with
835 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
import sys | ||
from Bio import SeqIO | ||
|
||
filename = sys.argv[1] # input file name | ||
frag_size = sys.argv[2] # alignment length | ||
n_seqs = sys.argv[3] # number of sequences in the alignments | ||
acceptable_seqs = 0 | ||
bad_seqs = 0 | ||
|
||
# calculate the %N per sequence, categorize them into 'bad' or 'acceptable', | ||
# and count the number of 'bad' and 'acceptable' sequences | ||
for seq_record in SeqIO.parse(sys.stdin, 'fasta'): | ||
sequence = str(seq_record.seq).upper() | ||
percent_n = round((sequence.count('N') / int(frag_size)) * 100, 2) | ||
print(f"{seq_record.id}\t{percent_n}%") | ||
# acceptable: %N is between 20% and 50% | ||
if 20 < percent_n <= 50: | ||
acceptable_seqs += 1 | ||
# bad: %N is higher than 50% | ||
elif percent_n > 50: | ||
bad_seqs += 1 | ||
|
||
# write output files | ||
with open('good.gfs', 'a') as fout1, open('bad.gfs', 'a') as fout2: | ||
# good alignments: do not contain 'bad' sequences and can contain up to 20% | ||
# of 'acceptable' sequences | ||
if bad_seqs == 0 and acceptable_seqs <= int(n_seqs) * 0.2: | ||
fout1.write(f"{filename}\t{acceptable_seqs}\t{bad_seqs}\n") | ||
# bad alignments: do not fit good GF criteria | ||
else: | ||
fout2.write(f"{filename}\t{acceptable_seqs}\t{bad_seqs}\n") | ||
|
||
fout1.close() | ||
fout2.close() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
#!/bin/bash | ||
# | ||
# Usage: | ||
# clean_fasta.sh <indir> <no_repeats.bed> <outdir> <njobs> | ||
# | ||
# Description: | ||
# Remove repetitive regions from the consensus sequences and calculate the | ||
# percentage of ambiguous sites in each sequence. | ||
# | ||
# Requirements: | ||
# bedtools | ||
# python3 | ||
# parallel | ||
# samtools | ||
# 'concatenate_fasta.py' (must be in same directory as 'clean_masked_fasta.sh') | ||
# | ||
# Important: | ||
# <no_repeats.bed> is the BED file containing non-repetitive regions in | ||
# scaffolds >= 1 Mbp (i.e. the file 'no_repeats_1mb_scaffolds.bed' generated | ||
# with 'process_assembly.sh'). | ||
|
||
# extract non-repetitive regions from masked FASTA | ||
parallel -j $4 bedtools getfasta -fi {} -bed $2 -fo $3/{/.}.clean.fa ::: $1/*.fa | ||
|
||
# concatenate FASTA regions per sample | ||
parallel -j $4 python3 ./concatenate_fasta.py {} $3/{/.}.concat.fa ::: $3/*.clean.fa | ||
|
||
# index FASTAs | ||
parallel -j $4 samtools faidx {} ::: $3/*.concat.fa | ||
|
||
# calculate proportion of N's per FASTA | ||
for fasta in $3/*.concat.fa; do | ||
sample=$(basename ${fasta%.clean.concat.fa}) | ||
seq_len=$(grep -v '^>' ${fasta} | tr -d '\n' | wc -c) | ||
n_count=$(grep -v '^>' ${fasta} | tr -cd N | wc -c) | ||
n_percent=$(python -c "print(f'{round((${n_count} / ${seq_len}) * 100, 2)}')") | ||
echo -e "${sample}\t${seq_len}\t${n_count}\t${n_percent}" >> $3/n_percent.tmp | ||
done | ||
cat <(echo -e "file\tlength_bp\t#N\t%N") <(sort -grk 4,4 $3/n_percent.tmp) > $3/n_percent.tbl | ||
|
||
# remove intermediate files | ||
rm $3/*.clean.fa $3/n_percent.tmp |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
from sys import argv | ||
|
||
infile = argv[1] | ||
outfile = argv[2] | ||
last_chrom = '' | ||
|
||
with open(infile) as fin, open(outfile, 'w') as fout: | ||
for line in fin.readlines(): | ||
if line.startswith('>'): # FASTA header line | ||
chrom = line.strip().split(':')[0] # extract chrom. id from header | ||
|
||
print(f"Concatenating chromosome '{chrom.replace('>', '')}'...") | ||
|
||
if not last_chrom: # first chromosome | ||
fout.write(f'{chrom}\n') # write header | ||
elif chrom != last_chrom: # second chromosome onwards | ||
fout.write(f'\n{chrom}\n') # write header | ||
else: | ||
pass | ||
|
||
else: # sequence line | ||
fout.write(line.strip()) # write sequence | ||
|
||
print("Done!") | ||
|
||
last_chrom = chrom # remember last concatenated chromosome |
File renamed without changes.
Oops, something went wrong.