Skip to content

Commit

Permalink
GZip compression detection no longer rely on extension
Browse files Browse the repository at this point in the history
  • Loading branch information
fethalen committed Apr 17, 2024
1 parent bede610 commit 82f60d6
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 21 deletions.
30 changes: 17 additions & 13 deletions src/Patchwork.jl
Original file line number Diff line number Diff line change
Expand Up @@ -120,10 +120,11 @@ const DIAMONDOUTPUT = "diamond_out"
const STATSOUTPUT = "sequence_stats"
const PLOTSOUTPUT = "plots"
const RULER = repeat('', 74)
const VERSION = "0.6.0"
const VERSION = "0.6.1"
# Default scoremodel taken from DIAMOND's defaults for BLOSUM62
const DEFAULT_SCOREMODEL = BioAlignments.AffineGapScoreModel(BioAlignments.BLOSUM62,
gap_open = -11, gap_extend = -1)
const GZIP_MAGIC_NUMBER = UInt8[0x1f, 0x8b]

"""
printinfo()
Expand Down Expand Up @@ -426,18 +427,20 @@ function main()
end
refseqs_count = countsequences(references_file)

if length(args["contigs"]) > 1
all(map(isfile, args["contigs"])) || error("One or more contig files not found")
if !(all(f -> isfastafile(f), args["contigs"]) || all(f -> isfastqfile(f), args["contigs"]))
error("Query files must be in the same format (FASTA or FASTQ).")
end
queries = cat(args["contigs"])
elseif length(args["contigs"]) == 1
contig_path = only(args["contigs"])
isfile(contig_path) || error("No contig file in path $contig_path")
queries = only(args["contigs"])
contigpaths = args["contigs"]
contigsformat = sequencefiles_format(contigpaths)
println("Sequence data file format detected: ", contigsformat)

# Concatenate multiple sequence files if necessary.
if length(contigpaths) > 1
println("Concatenating query sequence files...")
queries = cat(contigpaths)
elseif length(contigpaths) == 1
queries = only(contigpaths)
end

contigscount = countrecords(queries, contigsformat)

outdir = args["output-dir"]
alignmentoutput = outdir * "/" * ALIGNMENTOUTPUT
fastaoutput = outdir * "/" * FASTAOUTPUT
Expand Down Expand Up @@ -552,8 +555,9 @@ function main()

println(RULER)
percentmarkers = getpercentage(queryseqs_count, refseqs_count)
println("# markers in: ", refseqs_count)
println("# markers out: ", queryseqs_count, " (", percentmarkers, "%)")
println("# query sequences in: ", contigscount)
println("# reference sequences in: ", refseqs_count)
println("# alignments out: ", queryseqs_count, " (", percentmarkers, "%)")

CSV.write(*(statsoutput, "/statistics.csv"), statistics, delim = ",")
statssummary = select(describe(select(statistics, Not(:id))), Not([:nmissing, :eltype]))
Expand Down
75 changes: 75 additions & 0 deletions src/checkinput.jl
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,7 @@ function setpatchworkflags!(args::Dict{String, Any}) # Run this fct. before
setgapopen!(args)
setgapextend!(args)
checkgappenalty(args[getmatrixtype(args)], args["gapopen"], args["gapextend"])
contigfiles_exists(args)
end

# """
Expand Down Expand Up @@ -368,3 +369,77 @@ function collectdiamondflags(args::Dict{String, Any})::Vector{String}
# println(diamondflags)
return diamondflags
end

"""
contigfiles_exists(args::Dict{String, Any})
Verify that none of the provided contig files are missing. Missing files are reported and
results in an error.
"""
function contigfiles_exists(args::Dict{String, Any})
contigfiles = args["contigs"]
filesexists = map(isfile, contigfiles)
if !all(filesexists)
missingfiles = findall(iszero, filesexists)
plural = ""
if length(missingfiles) > 1
plural = "s"
end
println("The following contig file$plural were not found:")
for index in missingfiles
println(" ", contigfiles[index])
end
exit(1)
end
return contigfiles
end

"""
sequencefiles_format(sequencefiles::Vector{String})
Takes a list of `sequencefiles` as input and verifies that they are either in
FASTA order FASTQ format. Returns the string "FASTA" for the FASTA file format
and "FASTQ" for the FASTQ format. Generates an error message and exists if any
of the files are neither in FASTA or FASTQ format.
"""
function sequencefiles_format(sequencefiles::Vector{String})
if all(file -> isfastafile(file), sequencefiles)
return "FASTA"
elseif all(file -> isfastqfile(file), sequencefiles)
return "FASTQ"
else
println("Contig sequence files must be in the same format. FASTA or FASTQ.")
exit(1)
end
end

"""
isgzipcompressed(file::AbstractString)
Reads the first two bytes of the provided files and returns true if they correspond to the
GZip magic number `UInt8[0x1f, 0x8b]`. Failure to read the provided file results in `false`
being returned.
"""
function isgzipcompressed(file::AbstractString)
open(file, "r") do io
magic_number = UInt8[]
readbytes!(io, magic_number, 2)
return magic_number == GZIP_MAGIC_NUMBER
end
end

"""
countrecords(sequencefile::String, sequencefile_format::String)
Takes the path to a sequence file and the name of a sequence file format (FASTA or FASTQ)
as an input. Returns the number of sequence records in that file. An unrecognized file
format results in a 0 being returned.
"""
function countrecords(sequencefile::String, sequencefile_format::String)
if sequencefile_format == "FASTA"
return fasta_countrecords(sequencefile)
elseif sequencefile_format == "FASTQ"
return fastq_countrecords(sequencefile)
end
return 0
end
27 changes: 19 additions & 8 deletions src/fasta.jl
Original file line number Diff line number Diff line change
Expand Up @@ -172,13 +172,6 @@ function selectsequences(
return msa
end

function isgzipcompressed(file::AbstractString)
occursin(".", file) || return false
extension = rsplit(file, ".", limit=2)[2]
isequal(extension, "gz") && return true
return false
end

function isfastafile(
path::AbstractString,
ext::AbstractVector{String}=FASTAEXTENSIONS
Expand All @@ -192,7 +185,7 @@ function isfastafile(
try # in case the file is a tmp file without extension
read!(reader, record)
close(reader)
catch e
catch
close(reader)
return false
end
Expand Down Expand Up @@ -247,3 +240,21 @@ function countsequences(path::AbstractString)::Int
return count
end

function fasta_countrecords(file::AbstractString)
if isgzipcompressed(file)
reader = FASTA.Reader(GzipDecompressorStream(open(file)))
else
reader = FASTA.Reader(open(file))
end

recordcount = 0
for _ in reader
recordcount += 1
end

close(reader)

return recordcount
end


17 changes: 17 additions & 0 deletions src/fastq.jl
Original file line number Diff line number Diff line change
Expand Up @@ -136,3 +136,20 @@ function combinefiles(files::AbstractVector{String})
close(writer)
return file
end

function fastq_countrecords(file::AbstractString)
if isgzipcompressed(file)
reader = FASTQ.Reader(GzipDecompressorStream(open(file)))
else
reader = FASTQ.Reader(open(file))
end

recordcount = 0
for _ in reader
recordcount += 1
end

close(reader)

return recordcount
end

0 comments on commit 82f60d6

Please sign in to comment.