diff --git a/.gitattributes b/.gitattributes
index c92adacc..5fa86d47 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,49 +1,50 @@
-# Set the default behavior, in case people don't have core.autocrlf set.
-* text=auto
-
-*.sh eol=lf
-*.txt eol=lf
-*.md eol=lf
-*.eb eol=lf
-*.json eol=lf
-*.sed eol=lf
-*.c eol=lf
-*.cpp eol=lf
-*.h eol=lf
-*.asm eol=lf
-*.S eol=lf
-Makefile eol=lf
-
-# Declare our textual formats as binary so we can check in test files with Unix or Windows style end-of-lines
-*.vcf binary
-*.bcf binary
-*.sam binary
-*.bam binary
-*.cram binary
-*.gff3 binary
-*.gff binary
-*.gvf binary
-*.gtf binary
-genome_*.txt binary
-*.fa binary
-*.fasta binary
-*.fa binary
-*.faa binary
-*.ffn binary
-*.fnn binary
-*.fna binary
-*.frn binary
-*.fas binary
-*.fq binary
-*.fastq binary
-*.phy binary
-*.bed binary
-*.kraken binary
-*.chain binary
-*.locs binary
-*.genozip binary
-
-# installer files
-*.tar binary
-*.exe binary
-*.o binary
+# Set the default behavior, in case people don't have core.autocrlf set.
+* text=auto eol=lf
+
+*.sh eol=lf
+*.txt eol=lf
+*.md eol=lf
+*.eb eol=lf
+*.json eol=lf
+*.sed eol=lf
+*.c eol=lf
+*.cpp eol=lf
+*.h eol=lf
+*.asm eol=lf
+*.S eol=lf
+*.yaml eol=lf
+Makefile eol=lf
+
+# Declare our textual formats as binary so we can check in test files with Unix or Windows style end-of-lines
+*.vcf binary
+*.bcf binary
+*.sam binary
+*.bam binary
+*.cram binary
+*.gff3 binary
+*.gff binary
+*.gvf binary
+*.gtf binary
+genome_*.txt binary
+*.fa binary
+*.fasta binary
+*.fa binary
+*.faa binary
+*.ffn binary
+*.fnn binary
+*.fna binary
+*.frn binary
+*.fas binary
+*.fq binary
+*.fastq binary
+*.phy binary
+*.bed binary
+*.kraken binary
+*.chain binary
+*.locs binary
+*.genozip binary
+
+# installer files
+*.tar binary
+*.exe binary
+*.o binary
diff --git a/LICENSE.txt b/LICENSE.txt
index 47166469..16869f69 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -159,5 +159,5 @@ ABOVE STATED REMEDY FAILS OF ITS ESSENTIAL PURPOSE.
END OF TERMS AND CONDITIONS
-Genozip license version: 15.0.61
+Genozip license version: 15.0.62
diff --git a/RELEASE_NOTES.txt b/RELEASE_NOTES.txt
index 1310a4c4..25781ce4 100644
--- a/RELEASE_NOTES.txt
+++ b/RELEASE_NOTES.txt
@@ -3,6 +3,12 @@ Note on versioning:
- Minor version changes with bug fixes and minor feature updates
- Some minor versions are skipped due to failed deployment pipelines
+15.0.62
+- I/O optimizations for faster compression
+- Bug fixes
+- New diagnostic options: --show-gz-uncomp, --generate-gzil
+- Removed bash autocomplete for genozip as it didn't work very well. If this was installed, it can be removed by manually editing ~/.bash_completion
+
15.0.61 22/6/2024
- --optimize can now take an optional argument for fine-grained control of which fields get optimized: --optimize=QUAL,rx:f (optimize if possible, but only these fields) or --optimize=^QUAL,rx:f (optimize all fields possible, except for these fields)
- VCF: better compression of files generated by freebayes ; better compression of Type=Float annotations
diff --git a/installers/LICENSE.html b/installers/LICENSE.html
index d080673f..c0399d72 100644
--- a/installers/LICENSE.html
+++ b/installers/LICENSE.html
@@ -34,4 +34,4 @@
10. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides Genozip on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Genozip and assume any risks associated with Your exercise of permissions under this License.
11. LIMITATION OF LIABILITY. TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, STRICT LIABILITY OR OTHER LEGAL OR EQUITABLE THEORY, SHALL LICENSOR OR DEVELOPER BE LIABLE FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER ARISING AS A RESULT OF THIS LICENSE OR OUT OF THE USE OR INABILITY TO USE GENOZIP (INCLUDING BUT NOT LIMITED TO DAMAGES FOR LOSS OF GOODWILL, WORK STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, FILE CORRUPTION, DATA LOSS, OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR LOSSES), EVEN IF LICENSOR OR DEVELOPER HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL LICENSOR'S OR DEVELOPER'S TOTAL LIABILITY TO LICENSEE FOR ALL DAMAGES (OTHER THAN AS MAY BE REQUIRED BY APPLICABLE LAW IN CASES INVOLVING PERSONAL INJURY) EXCEED THE AMOUNT OF $500 USD. THE FOREGOING LIMITATIONS WILL APPLY EVEN IF THE ABOVE STATED REMEDY FAILS OF ITS ESSENTIAL PURPOSE.
END OF TERMS AND CONDITIONS
-Genozip license version: 15.0.61
+Genozip license version: 15.0.62
diff --git a/installers/genozip-installer.exe b/installers/genozip-installer.exe
index 4a5a1ce2..dee031fc 100644
Binary files a/installers/genozip-installer.exe and b/installers/genozip-installer.exe differ
diff --git a/installers/genozip-linux-x86_64.tar b/installers/genozip-linux-x86_64.tar
index 6cae53bb..cbfdb47a 100644
Binary files a/installers/genozip-linux-x86_64.tar and b/installers/genozip-linux-x86_64.tar differ
diff --git a/installers/genozip-osx-arm.tar b/installers/genozip-osx-arm.tar
index d5425f7a..b5fe66b9 100644
Binary files a/installers/genozip-osx-arm.tar and b/installers/genozip-osx-arm.tar differ
diff --git a/installers/genozip-osx-x86.tar b/installers/genozip-osx-x86.tar
index ab558f02..0b0f0892 100644
Binary files a/installers/genozip-osx-x86.tar and b/installers/genozip-osx-x86.tar differ
diff --git a/src/Makefile b/src/Makefile
index 95e0e9f8..cb17f75e 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -156,7 +156,7 @@ MY_SRCS = genozip.c genols.c context.c container.c strings.c stats.c arch.c tip.
sam_blasr.c sam_dragen.c sam_minimap2.c sam_10xGenomics.c sam_biobambam.c sam_pos.c sam_deep.c \
sam_star.c sam_abra2.c sam_optimize.c \
fastq.c fastq_desc.c fastq_seq.c fastq_qual.c fastq_deep.c fastq_saux.c deep.c \
- fasta.c gff.c bed.c me23.c locs.c generic.c lookback.c compressor.c \
+ fasta.c gff.c bed.c me23.c locs.c generic.c lookback.c compressor.c \
buffer.c buf_struct.c buf_list.c random_access.c sections.c base64.c bgzf.c coverage.c txtheader.c \
codec.c codec_bz2.c codec_lzma.c codec_acgt.c codec_domq.c codec_bsc.c codec_pacb.c \
codec_pbwt.c codec_none.c codec_htscodecs.c codec_longr.c codec_normq.c codec_homp.c codec_t0.c \
@@ -193,7 +193,7 @@ INCLUDES += dict_id_gen.h aes.h dispatcher.h profiler.h dict_id.h aliases.h txtf
buffer.h buf_struct.h buf_list.h file.h context.h context_struct.h container.h seg.h text_license.h version.h compressor.h \
crypt.h genozip.h piz.h vblock.h zfile.h random_access.h regions.h reconstruct.h tar.h qname.h qname_flavors.h codec.h \
lookback.h tokenizer.h codec_longr_alg.c gencomp.h dict_io.h recon_plan_io.h tip.h deep.h filename.h stats.h multiplexer.h \
- reference.h ref_private.h refhash.h ref_iupacs.h aligner.h mutex.h bgzf.h coverage.h threads.h local_type.h \
+ reference.h ref_private.h refhash.h ref_iupacs.h aligner.h mutex.h bgzf.h coverage.h threads.h local_type.h \
arch.h license.h file_types.h data_types.h base64.h txtheader.h writer.h zriter.h bases_filter.h genols.h contigs.h chrom.h \
vcf.h vcf_private.h sam.h sam_private.h me23.h fasta.h fasta_private.h gff.h bed.h locs.h generic.h \
fastq.h fastq_private.h user_message.h mac_compat.h b250.h zip_dyn_int.h qname_filter.h \
@@ -335,9 +335,9 @@ DEBUG_OBJS := $(addprefix $(OBJDIR)/,$(C_SRCS:.c=.debug-o)) $(addprefix $(OBJDIR
OPT_OBJS := $(addprefix $(OBJDIR)/,$(C_SRCS:.c=.opt-o)) $(addprefix $(OBJDIR)/,$(CXX_SRCS:.cpp=.opt-o)) $(IGZIP_OBJS) $(BSC_OPT_OBJS) # optimized but with debug info, for debugging issues that only manifest with compiler optimization
DEPS := $(addprefix $(OBJDIR)/,$(C_SRCS:.c=.d)) $(addprefix $(OBJDIR)/,$(CXX_SRCS:.cpp=.d)) $(IGZIP_DEPS)
-EXECUTABLES = genozip$(EXE) genounzip$(EXE) genocat$(EXE) genols$(EXE) autocomplete.sh
-DEBUG_EXECUTABLES = genozip-debug$(EXE) genounzip-debug$(EXE) genocat-debug$(EXE) genols-debug$(EXE) autocomplete-debug.sh
-OPT_EXECUTABLES = genozip-opt$(EXE) genounzip-opt$(EXE) genocat-opt$(EXE) genols-opt$(EXE) autocomplete-debug.sh
+EXECUTABLES = genozip$(EXE) genounzip$(EXE) genocat$(EXE) genols$(EXE) # autocomplete.sh
+DEBUG_EXECUTABLES = genozip-debug$(EXE) genounzip-debug$(EXE) genocat-debug$(EXE) genols-debug$(EXE) # autocomplete-debug.sh
+OPT_EXECUTABLES = genozip-opt$(EXE) genounzip-opt$(EXE) genocat-opt$(EXE) genols-opt$(EXE) # autocomplete-debug.sh
all : CFLAGS += $(OPTFLAGS)
all : CXXFLAGS += $(OPTFLAGS)
@@ -611,7 +611,7 @@ CONDA_RECIPE_DIR = $(CONDA_FEEDSTOCK)/recipe
# publish to conda-forge
conda/.conda-timestamp: conda/meta.yaml conda/README.md conda/build.sh conda/bld.bat
@echo "Publishing to conda-forge"
- @$(SH_VERIFY_ALL_COMMITTED)
+ @$(SH_VERIFY_ALL_STAGED)
@echo " "
@echo "Copying $^ to conda feedstock"
@(cd $(CONDA_FEEDSTOCK); git reset --hard; git pull)
diff --git a/src/aes.c b/src/aes.c
index 0555429d..516baeb0 100644
--- a/src/aes.c
+++ b/src/aes.c
@@ -255,12 +255,10 @@ void aes_initialize (VBlockP vb, bytes key)
rom aes_display_data (bytes data, unsigned data_len)
{
char *str = MALLOC (data_len * 2 + 1);
- return str_to_hex (STRa(data), str, false);
+ return str_to_hex_ (STRa(data), str, false);
}
StrText aes_display_key (bytes key)
{
- StrText s;
- str_to_hex (key, AES_KEYLEN, s.s, false);
- return s;
+ return str_to_hex (key, AES_KEYLEN);
}
diff --git a/src/aliases.c b/src/aliases.c
index 8515ddfa..b9c5b2e3 100644
--- a/src/aliases.c
+++ b/src/aliases.c
@@ -19,12 +19,10 @@ void show_aliases (void)
iprint0 ("No aliases in this file\n");
else {
- static rom names[] = ALIAS_TYPE_NAMES;
-
iprintf ("Contents of SEC_DICT_ID_ALIASES section (num_aliases=%u):\n", z_file->aliases.len32);
for_buf (DictIdAlias, alias, z_file->aliases)
- iprintf ("type=%-4s\talias=%s/%-8s\tdst=%s/%-8s\n", names[alias->alias_type],
+ iprintf ("type=%-4s\talias=%s/%-8s\tdst=%s/%-8s\n", (rom[])ALIAS_TYPE_NAMES[alias->alias_type],
dtype_name_z (alias->alias), dis_dict_id (alias->alias).s,
dtype_name_z (alias->dst), dis_dict_id (alias->dst).s);
}
diff --git a/src/bam_seg.c b/src/bam_seg.c
index f2c6fa14..13da9ead 100644
--- a/src/bam_seg.c
+++ b/src/bam_seg.c
@@ -218,7 +218,7 @@ void bam_seg_BIN (VBlockSAMP vb, ZipDataLineSAMP dl, uint16_t bin /* used only i
static inline void bam_seg_ref_id (VBlockSAMP vb, ZipDataLineSAMP dl, Did did_i, int32_t ref_id, int32_t compare_to_ref_i)
{
- ASSERT (ref_id == -1 || (sam_hdr_contigs && ref_id >= 0 && ref_id < (int32_t)sam_hdr_contigs->contigs.len),
+ ASSERT (ref_id == -1 || (sam_hdr_contigs && IN_RANGE (ref_id, 0, sam_hdr_contigs->contigs.len32-1)),
"%s: encountered %s.ref_id=%d but header has only %u contigs%s",
LN_NAME, CTX(did_i)->tag_name, ref_id, sam_hdr_contigs ? sam_hdr_contigs->contigs.len32 : 0,
MP(LONGRANGER) ? ". This is a known longranger bug (samtools won't accept this file either)." : "");
@@ -412,8 +412,8 @@ rom bam_seg_txt_line (VBlockP vb_, rom alignment /* BAM terminology for one line
// a non-sensical block_size might indicate an false-positive identification of a BAM alignment in bam_unconsumed
ASSERT (block_size + 4 >= sizeof (BAMAlignmentFixed) && block_size + 4 <= remaining_txt_len,
- "%s: (block_size+4)=%u is out of range - too small, or goes beyond end of txt data: remaining_txt_len=%u",
- LN_NAME, block_size+4, remaining_txt_len);
+ "%s: (block_size+4)=%u is out of range - too small, or goes beyond end of txt data: txt_data.len=%u remaining_txt_len=%u",
+ LN_NAME, block_size+4, vb->txt_data.len32, remaining_txt_len);
rom after = alignment + block_size + sizeof (uint32_t);
diff --git a/src/bgzf.c b/src/bgzf.c
index acaaba94..51305729 100644
--- a/src/bgzf.c
+++ b/src/bgzf.c
@@ -25,6 +25,7 @@
#include "writer.h"
#include "gencomp.h"
#include "filename.h"
+#include "strings.h"
#define LIBDEFLATE_MAX_LEVEL 12
#define ZLIB_MAX_LEVEL 9
@@ -45,10 +46,24 @@ typedef struct __attribute__ ((packed, aligned(2))) BgzfHeader { // 18 bytes
uint16_t bsize; // BGZF extra field - (compressed block size -1)
} BgzfHeader;
-typedef struct BgzfFooter {
+#define BGZF_HEADER_LEN ((int)sizeof(BgzfHeader))
+
+typedef struct GzipFooter {
uint32_t crc32; // CRC32 of uncompressed data
uint32_t isize; // Input (i.e. uncompressed) Size
-} BgzfFooter;
+} GzipFooter;
+
+#define GZIP_FOOTER_LEN ((int)sizeof(GzipFooter))
+
+typedef struct __attribute__ ((packed, aligned(2))) GzipHeader { // 10 bytes
+ uint8_t id1; // Gzip id - must be 31 (0x1f)
+ uint8_t id2; // Gzip id - must be 139 (0x8b)
+ uint8_t cm; // Compression Method - must be 8
+ uint8_t flg; // Flags - must be 0
+ uint32_t mtime; // Modification Time - must be 0
+ uint8_t xfl; // eXtra Flags - must be 0
+ uint8_t os; // Operating System - must be 3
+} GzipHeader;
static FlagsBgzf bgzf_recompression_levels[1+MAX_FLAG_BGZF] = {
{ .library = BGZF_LIBDEFLATE19, .level = 0, .has_eof_block = true }, // --bgzf=0 : BGZF blocks with no compression
@@ -61,6 +76,11 @@ static FlagsBgzf bgzf_recompression_levels[1+MAX_FLAG_BGZF] = {
#define bgzf_no_recompression (FlagsBgzf){ .library = BGZF_NO_LIBRARY, .level = BGZF_NO_BGZF, .has_eof_block = false }
+rom gzstatus_name (GzStatus st)
+{
+ return IN_RANGE(st, 0, NUM_GZ_STATUSES-1) ? (rom[])GZSTATUS_NAMES[st] : "InvalidGzStatus";
+}
+
// possible return values, see libdeflate_result in libdeflate.h
static rom libdeflate_error (int err)
{
@@ -74,7 +94,7 @@ static rom libdeflate_error (int err)
}
typedef struct { char s[100]; } BgzfBlockStr;
-static BgzfBlockStr display_bb (BgzfBlockZip *bb)
+static BgzfBlockStr display_bb (GzBlockZip *bb)
{
BgzfBlockStr s;
snprintf (s.s, sizeof (s.s), "{txt_index=%u txt_size=%u compressed_index=%u comp_size=%u is_decompressed=%u}",
@@ -97,46 +117,27 @@ void bgzf_initialize_discovery (FileP file)
{
ASSERTNOTINUSE (file->bgzf_plausible_levels);
- if (file->codec == CODEC_GZ) {
- if (flag.show_gz) {
- // attempt to detect GZ blocks (up to 64MB)
- segconf.vb_size = 65 MB;
- txt_file = file;
- z_file = CALLOC (sizeof (File));
- txtfile_read_vblock (evb);
- iprintf ("%s: is %s but not BGZF\n", txt_file->name, src_codec_name (txt_file->source_codec, flag.zip_comp_i).s); fflush (info_stream);
- FREE (z_file);
- exit_ok;
- }
+ if (file->codec == CODEC_BGZF) {
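+        // note: these (library, level) pairs are only candidates at this point; bgzf_discover_library_and_level
+        // later eliminates any whose re-compression of the test blocks does not byte-match the original file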
+ ARRAY_alloc (FlagsBgzf, ll, (LIBDEFLATE_MAX_LEVEL+1)+LIBDEFLATE_MAX_LEVEL+ZLIB_MAX_LEVEL,
+ false, file->bgzf_plausible_levels, evb, "txt_file->bgzf_plausible_levels");
- else if (flag.show_bgzf) {
- iprintf ("%s: is GZIP but not BGZF\n", file->name);
- fflush (info_stream);
- }
-
- else return;
- }
-
- else if (file->codec != CODEC_BGZF) {
- if (flag.show_gz || flag.show_bgzf) {
- iprintf ("%s: is not GZIP, it is %s\n", file->name, codec_name (file->source_codec));
- if (flag.show_gz) exit_ok;
- }
- else return;
- }
+ int next=0;
+ for (int l=0; l <= LIBDEFLATE_MAX_LEVEL; l++) // level=0 only here, bc it would be the same in all libraries
+ ll[next++] = (FlagsBgzf){ .library = BGZF_LIBDEFLATE19, .level = l};
- ARRAY_alloc (FlagsBgzf, ll, (LIBDEFLATE_MAX_LEVEL+1)+LIBDEFLATE_MAX_LEVEL+ZLIB_MAX_LEVEL,
- false, file->bgzf_plausible_levels, evb, "txt_file->bgzf_plausible_levels");
+ for (int l=1; l <= LIBDEFLATE_MAX_LEVEL; l++)
+ ll[next++] = (FlagsBgzf){ .library = BGZF_LIBDEFLATE7, .level = l};
- int next=0;
- for (int l=0; l <= LIBDEFLATE_MAX_LEVEL; l++) // level=0 only here, bc it would be the same in all libraries
- ll[next++] = (FlagsBgzf){ .library = BGZF_LIBDEFLATE19, .level = l};
+ for (int l=1; l <= ZLIB_MAX_LEVEL; l++)
+ ll[next++] = (FlagsBgzf){ .library = BGZF_ZLIB, .level = l};
+ }
- for (int l=1; l <= LIBDEFLATE_MAX_LEVEL; l++)
- ll[next++] = (FlagsBgzf){ .library = BGZF_LIBDEFLATE7, .level = l};
+ else if (file->codec == CODEC_GZIL) {
+ // bug 1101: we don't yet know the plausible levels for GZIL
+ }
- for (int l=1; l <= ZLIB_MAX_LEVEL; l++)
- ll[next++] = (FlagsBgzf){ .library = BGZF_ZLIB, .level = l};
+ else
+        ABORT ("Unsupported codec=%s for discovery", codec_name (file->codec));
}
// ZIP main thread
@@ -158,27 +159,31 @@ void bgzf_finalize_discovery (void)
if (n_levels == 0) {
bgzf_discover_finalize_testing (0, BGZF_COMP_LEVEL_UNKNOWN); // has BGZF, but cannot identify level
- if (flag.show_bgzf || flag.show_gz)
- iprintf ("%s: is a BGZF file, generated by an unidentified library\n", txt_name);
+ if (flag.show_bgzf)
+            iprintf ("Discover: %s: is a BGZF file, generated by an unidentified library\n", txt_name);
}
// case: one or more library/level combinations was verified with all test bgzf blocks (10 blocks, unless file is shorter)
else {
bgzf_discover_finalize_testing (B1ST(FlagsBgzf, txt_file->bgzf_plausible_levels)->library, B1ST(FlagsBgzf, txt_file->bgzf_plausible_levels)->level);
- if (flag.show_bgzf || flag.show_gz)
- iprintf ("%s: %s %s level %u\n", txt_name,
+ if (flag.show_bgzf)
+ iprintf ("Discover: %s: %s %s level %u\n", txt_name,
(n_levels == 1) ? "Identified as BGZF generated with" : "Multiple plausible levels, arbitrarily selecting",
bgzf_library_name (txt_file->bgzf_flags.library, true), txt_file->bgzf_flags.level);
}
- if (flag.show_gz) exit_ok;
+ if (flag.show_gz) {
+ iprintf ("%s: txt_codec=%s\n", txt_file->basename, txtfile_codec_name (z_file, flag.zip_comp_i).s); // same format as in txtfile_zip_finalize_codecs
+ exit_ok;
+ }
}
// ZIP: test a BGZF block against all the remaining plausible levels, and eliminate those that don't match.
static void bgzf_discover_library_and_level (VBlockP vb, int test_block_i, STRp(comp), STRp(uncomp))
{
- if (comp_len <= sizeof (BgzfHeader) + sizeof (BgzfFooter)) {
+ uint32_t header_len = TXT_IS_BGZF ? BGZF_HEADER_LEN : GZIL_HEADER_LEN;
+ if (comp_len <= header_len + GZIP_FOOTER_LEN) {
txt_file->bgzf_plausible_levels.len = 0;
if (flag.show_bgzf || flag.show_gz)
@@ -190,11 +195,11 @@ static void bgzf_discover_library_and_level (VBlockP vb, int test_block_i, STRp(
}
// ignore the header and footer of the block
- comp += sizeof (BgzfHeader);
- comp_len -= sizeof (BgzfHeader) + sizeof (BgzfFooter);
+ comp += header_len;
+ comp_len -= header_len + GZIP_FOOTER_LEN;
// compress with each of the remaining plausible levels - testing if the compression is identical to the actual
- STRl (recomp, BGZF_MAX_BLOCK_SIZE);
+ STRl (recomp, TXT_IS_BGZF ? BGZF_MAX_BLOCK_SIZE : GZIL_MAX_BLOCK_SIZE);
for_buf (FlagsBgzf, ll, txt_file->bgzf_plausible_levels) {
@@ -252,126 +257,258 @@ static void bgzf_discover_library_and_level (VBlockP vb, int test_block_i, STRp(
// ZIP SIDE - decompress BGZF-compressed file and prepare BGZF section
//--------------------------------------------------------------------
-// ZIP: reads and validates a BGZF block, and returns the uncompressed size or (only if soft_fail) an error
-static int32_t bgzf_read_block_raw (FILE *file, // txt_file is not yet assigned when called from file_open_txt_read
- uint8_t *block /* must be BGZF_MAX_BLOCK_SIZE in size */, uint32_t *block_size /* out */,
- rom basename, bool is_remote, FailType soft_fail,
- int64_t *disk_so_far)
+static void bgzf_update_file_isizes (FileP file)
{
- BgzfHeader *h = (BgzfHeader *)block;
+ // add isize to buffer that will be written to SEC_BGZF
+ if (file->gz_data.uncomp_len) { // don't store EOF block (bc isize=0 cannot be represented as (isize-1) )
+        #define BGZF_INITIAL_ALLOC 16 // just for the sake of a bit of efficiency: 16 chosen carefully so 16*63000 < 1 MB min vb_size but over segconf size
+ if (file->bgzf_isizes.len32 <= BGZF_INITIAL_ALLOC) { // entered thrice: when called from file_open_txt_read, segconf, and in first VB
+ buf_alloc (evb, &file->bgzf_isizes, 0, MAX_(BGZF_INITIAL_ALLOC, segconf.vb_size / 63000), uint16_t, 0, "txt_file->bgzf_isizes");
+ buf_alloc (evb, &file->bgzf_starts, 0, MAX_(BGZF_INITIAL_ALLOC, segconf.vb_size / 63000), uint64_t, 0, "txt_file->bgzf_starts");
+ }
- // read the header
- *block_size = fread (h, 1, sizeof (struct BgzfHeader), file);
+ buf_append_one (file->bgzf_isizes, BGEN16 ((uint16_t)(file->gz_data.uncomp_len - 1))); // -1 to make the range 0..65535
+ buf_append_one (file->bgzf_starts, file->disk_so_far - file->gz_data.len); // not BGEN bc not written to z_file. note: first block is read from file_open_txt_read before txt_file is assigned
+ }
+ else {
+ // if isize is 0, we're expecting an EOF block
+ ASSERT (str_issame_(file->gz_data.data, file->gz_data.comp_len, BGZF_EOF, BGZF_EOF_LEN),
+ "Corrupt BGZF block in %s offset=%"PRIu64" bgzf_block_size=%u: isize=0 but this is not an EOF block",
+ file->name, file->disk_so_far - file->gz_data.comp_len, file->gz_data.comp_len);
- if (disk_so_far) *disk_so_far += *block_size;
+ ASSERT (file->disk_so_far == file->disk_size || // expected
+ !file->disk_size || is_read_via_ext_decompressor (file) || file->redirected || file->is_remote, // cases in which we can't reliably test this condition
+ "Corrupt BGZF file %s (size=%"PRIu64"): BGZF EOF block encountered at offset=%"PRIu64" length=%u, but this is not the end of the file",
+ file->name, file->disk_size, file->disk_so_far - file->gz_data.comp_len, file->gz_data.comp_len);
- if (*block_size != sizeof (struct BgzfHeader)) {
- ASSERT (!ferror (file), //|| (disk_so_far && *disk_so_far == *block_size),
- "Error while reading %s: %s", basename, strerror (errno));
- return (disk_so_far && *disk_so_far == *block_size) ? BGZF_BLOCK_IS_NOT_GZIP : BGZF_ABRUBT_EOF ; // EOF without an EOF block (possibly a very short non-GZ file)
+ if (txt_file)
+ txt_file->bgzf_flags.has_eof_block = true;
}
-
- if (*block_size < 12) {
- ASSERT (soft_fail, "file %s appears truncated - it ends with a partial gzip block header", basename); // less than the minimal gz block header size
- return BGZF_BLOCK_IS_NOT_GZIP;
- }
+}
- // case: this is not a GZ / BGZF block at all (see: https://tools.ietf.org/html/rfc1952)
- if (h->id1 != 31 || h->id2 != 139) {
- ASSERT (soft_fail, "expecting %s to be compressed with gzip format, but it is not", basename);
- return BGZF_BLOCK_IS_NOT_GZIP;
+void inc_disk_gz_uncomp_or_trunc_(FileP file, uint64_t inc, FUNCLINE)
+{
+ __atomic_add_fetch (&file->disk_gz_uncomp_or_trunc, inc, __ATOMIC_RELAXED);
+
+ if (flag.show_gz_uncomp)
+ iprintf ("%s:%u: disk_gz_uncomp_or_trunc + %"PRIu64"\t= %"PRIu64"\n", func, code_line, inc, file->disk_gz_uncomp_or_trunc);
+}
+
+// ZIP main thread: reads gzil data from disk into gz_data, and updates gz_data.comp_len/uncomp_len
+// of the gzil block at the beginning of gz_data
+// returns: discovering: GZ_SUCCESS, GZ_IS_NOT_GZIL
+// otherwise: GZ_SUCCESS
+GzStatus gzil_read_block (FileP file, // txt_file is not yet assigned when called from txtfile_discover_gz_codec
+ bool discovering,
+ bool *is_eof) // is the block indicated by comp_len/uncomp_len the final block in the file
+{
+ START_TIMER;
+ FILE *fp = (FILE *)file->file;
+ file->gz_data.comp_len = file->gz_data.uncomp_len = 0; // init
+
+ // top up gz_data to GZIL_MAX_BLOCK_SIZE (or less if EOF)
+ // performance note: typical top-up is 200-250KB which is more than the block device read-ahead buffer of 128 KB, which means
+    // that fread will block on the second half. However, when using GZIL we are not decompressing in the main thread 
+    // (except for tiny amounts), so the disk never idles and we pass gzil data onward just as fast as the disk can read it.
+ // An R2 GZIL file (R2 always decompresses in the main thread) will be read through igzip for this reason - see txtfile_discover_gz_codec.
+ uint32_t bytes = txtfile_fread (file, fp, BAFT8(file->gz_data), GZIL_MAX_BLOCK_SIZE - file->gz_data.len32, &file->disk_so_far);
+ file->gz_data.len32 += bytes;
+
+ // case: eof: no data in gz header
+ if (file->gz_data.len32 == 0)
+ return discovering ? GZ_IS_NOT_GZIL // empty file (not expected to ever happen - we already checked that the file is not empty)
+ : GZ_SUCCESS; // no more data in this file - that's ok
+
+    // case: not a GZIL header
+ else if (file->gz_data.len32 >= GZIL_HEADER_LEN && memcmp (B1ST8(file->gz_data), GZIL_HEADER, GZIL_HEADER_LEN)) {
+ if (discovering)
+ return GZ_IS_NOT_GZIL;
+ else
+ ABORT ("Encountered a GZIP block that unexpectedly is not GZIL in %s offset=%"PRIu64"\nSolution: use --no-bgzf",
+ file->basename, (uint64_t)ftello64 (fp) - bytes);
}
- // case: this is GZIP block that is NOT a valid BGZF block (see: https://samtools.github.io/hts-specs/SAMv1.pdf)
- if (flag.no_bgzf || // user instructed us to treat BGZF data as normal GZIP data
- (!(*block_size == sizeof (struct BgzfHeader) && !memcmp (h, BGZF_PREFIX, BGZF_PREFIX_LEN)))) {
- ASSINP (soft_fail, "Encountered a GZIP block that unexpectedly is not BGZF in %s offset=%"PRIu64"\nSolution: use --no-bgzf",
- basename, (uint64_t)ftello64 (file) - *block_size);
+    // search for the block size by locating the beginning of the next block. Because FASTQ is quite compressible, 
+ // we expect the gz header of the next block to be in gz_data. note: we do this even if EOF,
+ // because gz_data might contain several gzil blocks. note: also NULL if data is too short.
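+    // note: the pattern searched for is the little-endian isize field (expected to be 1 MB for all but the last
+    // block) that ends one gzip member, immediately followed by the fixed gzip header that opens the next member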
+ uint8_t *isize_p = memmem (B1ST8(file->gz_data), file->gz_data.len32,
+ GZIL_ISIZE GZIL_HEADER, GZIL_ISIZE_LEN + GZIL_HEADER_LEN);
- return BGZF_BLOCK_GZIP_NOT_BGZIP;
+    // case: a non-last block was found. note: all GZIL blocks (except for the last) are expected to be 1 MB
+ if (isize_p && (file->gz_data.uncomp_len = LTEN32 (GET_UINT32(isize_p))) == 1 MB) {
+ file->gz_data.comp_len = BNUM (file->gz_data, isize_p + 4);
+        *is_eof = false; // we found the beginning of the next block, so this is definitely not the last block. this can happen even if feof(fp) tells us there is nothing more to read in the file, while there are still more blocks in gz_data.
}
- *block_size = LTEN16 (h->bsize) + 1;
+    // case: the remaining data could be a final gzil block; we will know for sure when trying to decompress it
+ else if (file->gz_data.len32 >= GZIL_HEADER_LEN + GZIP_FOOTER_LEN &&
+ (file->gz_data.uncomp_len = LTEN32 (GET_UINT32 (BAFT8(file->gz_data) - 4))) <= 1 MB) {
+ file->gz_data.comp_len = file->gz_data.len32;
+ *is_eof = true;
+ }
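+    // note: if the file is truncated, this heuristic can mistake a garbage trailing word < 1 MB for a valid isize;
+    // this is caught later, when decompression of the supposed final block fails in bgz_uncompress_one_block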
- uint32_t body_size = *block_size - sizeof (struct BgzfHeader);
- uint32_t bytes = fread (h+1, 1, body_size, file);
+    // case: data in gz_data does not contain a gzil block - either this is not gzil, or the file is truncated
+ else {
+ if (discovering)
+ return GZ_IS_NOT_GZIL;
- if (disk_so_far) *disk_so_far += bytes;
+ // data is not GZIL somewhere in the middle of the file...
+ ASSERT (feof (fp), "Encountered a GZIP block that unexpectedly is not GZIL in %s offset=%"PRIu64"\nSolution: use --no-bgzf",
+ file->basename, (uint64_t)ftello64 ((FILE *)file->file) - file->gz_data.len);
- int save_errno = errno; // we want to report errno of fread, not ftell.
+ // case: final data in file is not a full GZIL block and truncation allowed:
+ // account and then ignore the data that will not be gz-decompressed
+ if (flag.truncate) {
+            WARN ("FYI: %s is truncated - its final GZIL block is incomplete. Dropping final %u bytes of the GZ data.", txt_name, file->gz_data.len32);
- // if failed, always error, even if soft_fail
- ASSERT (bytes == body_size || flag.truncate,
- "%s %s (ftell=%"PRId64" err=\"%s\" bytes_read=%u but expecting=%u filesystem=%s). %s\n",
- feof (file) ? "Unexpected end of file while reading" : "Failed to read body",
- basename, ftello64 (file),
- (is_remote && save_errno == ESPIPE) ? "Disconnected from remote host" : strerror (save_errno),
- bytes, body_size, arch_get_txt_filesystem().s,
- feof (file) ? "If file is expected to be truncated, you may use --truncate-partial-last-line to disregard the final partial BGZF block." : "");
-
- return (bytes == body_size) ? BGZF_BLOCK_SUCCESS : BGZF_BLOCK_TRUNCATED;
+ inc_disk_gz_uncomp_or_trunc (file, file->gz_data.len);
+ file->gz_data.len32 = file->gz_data.uncomp_len = 0;
+ segconf.zip_txt_modified = true;
+
+ *is_eof = true;
+ }
+
+ else
+            ABORTINP ("%s is truncated mid-way through a GZIL block. Tip: If this is expected, use --truncate to discard the final partial GZIL block", txt_name);
+ }
+
+ if (!discovering && file->gz_data.uncomp_len)
+ file->bgzf_isizes.len++; // count GZIL blocks
+
+ COPY_TIMER_EVB (gzil_read_block);
+ return GZ_SUCCESS;
}
-// ZIP main thread: reads and validates a BGZF block, and returns the uncompressed size or (only if soft_fail) an error
-int32_t bgzf_read_block (FileP file, // txt_file is not yet assigned when called from file_open_txt_read
- uint8_t *block /* must be BGZF_MAX_BLOCK_SIZE in size */, uint32_t *block_size /* out */,
- FailType soft_fail)
+// ZIP: reads and validates a BGZF block
+// returns: discovering: GZ_SUCCESS, GZ_IS_NOT_GZIP, GZ_IS_GZIP_NOT_BGZF
+// otherwise: GZ_SUCCESS, GZ_EOF_WITHOUT_EOF_BLOCK, GZ_TRUNCATED
+static GzStatus bgzf_read_block_do (FileP file, // txt_file is not yet assigned when called from txtfile_discover_gz_codec
+ bool is_remote, bool discovering)
{
- START_TIMER;
+ FILE *fp = (FILE *)file->file;
+ file->gz_data.comp_len = file->gz_data.uncomp_len = 0; // init
+
+ // top-up if needed
+ if (file->gz_data.len32 < BGZF_MAX_BLOCK_SIZE && !feof (fp)) {
+ uint32_t chunk_size = flag.zip_uncompress_source_during_read ? 150 KB // a bit more than default block-device read-ahead buffer for best parallelization between disk read-ahead and CPU decompression
+ : BGZF_MAX_CHUCK_SIZE; // bigger block is faster if we are prepared to yield the CPU when waiting for the disk
+ file->gz_data.len32 += txtfile_fread (file, fp, BAFT8(file->gz_data), chunk_size - file->gz_data.len32, &file->disk_so_far);
+ }
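+    // after this top-up, gz_data should start with at least one complete BGZF block, unless the file is
+    // empty, truncated, or not BGZF at all - the checks below classify these cases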
- int ret = bgzf_read_block_raw ((FILE *)file->file, block, block_size, file->basename, file->is_remote, soft_fail, &file->disk_so_far);
- if (ret == BGZF_BLOCK_IS_NOT_GZIP || ret == BGZF_BLOCK_GZIP_NOT_BGZIP)
- return ret; // happens only if soft_fail
-
- if (ret == BGZF_ABRUBT_EOF) {
- ASSERT (!file->disk_size || // redirected or remote
- flag.truncate || // possibly compressing while downloading
- file->disk_so_far == file->disk_size, // entire file was read
- "Abrupt EOF in BGZF file %s: disk_so_far=%s disk_size=%s filesystem=%s",
- file->name, str_int_commas (file->disk_so_far).s, str_int_commas (file->disk_size).s, arch_get_filesystem_type (file).s);
-
- return 0; // no EOF block, that's fine
+ BgzfHeader *h = B1ST (BgzfHeader, file->gz_data);
+
+ // no data at all
+ if (file->gz_data.len32 == 0)
+ return discovering ? GZ_IS_NOT_GZIP // no data was read from this file at all
+ : GZ_EOF_WITHOUT_EOF_BLOCK; // EOF without an EOF block (we know this is a BGZF file bc this isn't the first block)
+
+ // truncated mid-way through header
+ else if (file->gz_data.len32 < BGZF_HEADER_LEN) {
+ if (discovering)
+        return GZ_IS_NOT_GZIP; // file smaller than a gzip header - it's not GZIP
+
+ else if (flag.truncate)
+ return GZ_TRUNCATED; // truncated file
+
+ else
+ ABORT ("file %s appears truncated - it ends with a partial gzip block header", file->basename); // less than the minimal gz block header size
}
- else if (ret == BGZF_BLOCK_TRUNCATED) {
- if (txt_file->bgzf_truncated_last_block) // we arrive here twice - show warning only on the second time
- WARN ("FYI: %s is truncated - its final BGZF block in incomplete. Dropping this defective BGZF block.", txt_name);
- txt_file->bgzf_truncated_last_block = true;
- return 0;
+ // case: this is not a GZ / BGZF block at all (see: https://tools.ietf.org/html/rfc1952)
+ else if (h->id1 != 31 || h->id2 != 139) {
+ if (discovering)
+ return GZ_IS_NOT_GZIP;
+ else
+ ABORT ("expecting %s to be compressed with gzip format, but it is not", file->basename);
}
- uint32_t isize_lt32 = GET_UINT32 (&block[*block_size - 4]);
- uint32_t isize = LTEN32 (isize_lt32); // 0...65536 per spec: "input size" = uncompressed data length
- ASSERT (isize <= 65536, "isize=%u ∉ [0,65536]", isize);
+    // case: this is a GZIP block (by the magic) but it is NOT a valid BGZF block (see: https://samtools.github.io/hts-specs/SAMv1.pdf)
+ else if (memcmp (h, BGZF_PREFIX, BGZF_PREFIX_LEN)) {
+ if (discovering)
+ return GZ_IS_GZIP_NOT_BGZF;
+ else
+ ABORT ("Encountered a GZIP block that unexpectedly is not BGZF in %s offset=%"PRIu64"\nSolution: use --no-bgzf",
+ file->basename, (uint64_t)ftello64 (fp) - file->gz_data.len32);
+ }
- // add isize to buffer that will be written to SEC_BGZF
- if (isize) { // don't store EOF block (bc isize=0 cannot be represented as (isize-1) )
- #define BGZF_INITIAL_ALLOC 16 // just of the sake of a bit of effeciency: 16 chosen carefully so 16*63000 < 1MB min vb_size but over segconf size
- if (file->bgzf_isizes.len32 <= BGZF_INITIAL_ALLOC) { // entered thrice: when called from file_open_txt_read, segconf, and in first VB
- buf_alloc (evb, &file->bgzf_isizes, 0, MAX_(BGZF_INITIAL_ALLOC, segconf.vb_size / 63000), uint16_t, 0, "txt_file->bgzf_isizes");
- buf_alloc (evb, &file->bgzf_starts, 0, MAX_(BGZF_INITIAL_ALLOC, segconf.vb_size / 63000), uint64_t, 0, "txt_file->bgzf_starts");
- }
+ uint32_t body_size = (LTEN16 (h->bsize) + 1) - BGZF_HEADER_LEN;
+
+ if (file->gz_data.len32 >= body_size) {
+ file->gz_data.comp_len = BGZF_HEADER_LEN + body_size;
- buf_append_one (file->bgzf_isizes, BGEN16 ((uint16_t)(isize - 1))); // -1 to make the range 0..65535
- buf_append_one (file->bgzf_starts, txt_file ? txt_file->disk_so_far - *block_size : 0); // not BGEN bc not written to z_file. note: first block is read from file_open_txt_read before txt_file is assigned
+ file->gz_data.uncomp_len = LTEN32 (GET_UINT32 (B8(file->gz_data, file->gz_data.comp_len-4))); // 0...65536 per spec: "isize / input size" = uncompressed data length
+ ASSERT (file->gz_data.uncomp_len <= 65536, "isize=%u ∉ [0,65536] in %s offset=%"PRIu64, file->gz_data.uncomp_len, file->basename, (uint64_t)ftello64 (fp) - file->gz_data.len32);
+
+ return GZ_SUCCESS;
}
- else {
- // if isize is 0, we're expecting an EOF block
- ASSERT (str_issame_((rom)block, *block_size, BGZF_EOF, BGZF_EOF_LEN),
- "Corrupt BGZF block in %s offset=%"PRIu64" bgzf_block_size=%u: isize=0 but this is not an EOF block",
- file->name, file->disk_so_far - *block_size, *block_size);
- ASSERT (file->disk_so_far == file->disk_size || // expected
- !file->disk_size || is_read_via_ext_decompressor (file) || file->redirected || file->is_remote, // cases in which we can't reliably test this condition
- "Corrupt BGZF file %s (size=%"PRIu64"): BGZF EOF block encountered at offset=%"PRIu64" length=%u, but this is not the end of the file",
- file->name, file->disk_size, file->disk_so_far - *block_size, *block_size);
+ // truncated first block: compress as non-GZIP
+ else if (discovering)
+ return GZ_IS_NOT_GZIP;
+
+ else if (flag.truncate)
+ return GZ_TRUNCATED;
+
+    else { // block is incomplete, and we are neither discovering nor allowing truncation - this is a fatal error
+ int save_errno = errno; // we want to report errno of fread, not ftell.
+ ABORT ("%s %s (ftell=%"PRId64" err=\"%s\" gz_data.len=%u but expecting=%u filesystem=%s). %s\n",
+ feof (fp) ? "Unexpected end of file while reading" : "Failed to read file",
+ file->basename, ftello64 (fp),
+ (is_remote && save_errno == ESPIPE) ? "Disconnected from remote host" : strerror (save_errno),
+ file->gz_data.len32, body_size, arch_get_txt_filesystem().s,
+               feof (fp) ? "Tip: If the file is expected to be truncated, you may use --truncate to disregard the final partial BGZF block." : "");
+ }
+}
- if (txt_file)
- txt_file->bgzf_flags.has_eof_block = true;
+// ZIP main thread: reads a BGZF block into gz_data
+GzStatus bgzf_read_block (FileP file, bool discovering)
+{
+ START_TIMER;
+
+ // with BGZF, gz_data is either empty or contains exactly 1 bgzf block
+ if (file->gz_data.comp_len) return GZ_SUCCESS; // we already have 1 block
+
+ GzStatus ret = bgzf_read_block_do (file, file->is_remote, discovering);
+
+ switch (ret) {
+ case GZ_SUCCESS: // successful read of a BGZF block
+ bgzf_update_file_isizes (file);
+ break;
+
+ case GZ_IS_NOT_GZIP:
+ case GZ_IS_GZIP_NOT_BGZF:
+ ASSERT (discovering, "ret=%d expected only if discovering", ret);
+ break; // file->gz_data contains data that is not BGZF data
+
+ case GZ_EOF_WITHOUT_EOF_BLOCK: // file ended without EOF block: that's fine
+            ASSERT0 (!discovering, "ret=GZ_EOF_WITHOUT_EOF_BLOCK unexpected when discovering");
+ ret = GZ_SUCCESS;
+ break; // note: if file was not entirely read, we will detect that at the end of zip_one_file
+
+ case GZ_TRUNCATED: // file ended mid-way through a BGZF block
+            ASSERT0 (!discovering, "ret=GZ_TRUNCATED unexpected when discovering");
+
+ // case: truncation allowed: account and then discard the data that will not be gz-decompressed
+ if (flag.truncate) {
+                WARN ("FYI: %s is truncated - its final BGZF block is incomplete. Dropping final %u bytes of the GZ data.", txt_name, file->gz_data.len32);
+
+ inc_disk_gz_uncomp_or_trunc (file, file->gz_data.len);
+ file->gz_data.len32 = file->gz_data.comp_len = file->gz_data.uncomp_len = 0; // discard partial BGZF block
+ segconf.zip_txt_modified = true;
+
+ ret = GZ_SUCCESS;
+ break;
+ }
+
+ else
+                ABORTINP ("%s is truncated mid-way through a BGZF block. Tip: If this is expected, use --truncate to discard the final partial BGZF block", txt_name);
+
+ default:
+ ABORT ("Unexpected ret=%s", gzstatus_name (ret));
}
COPY_TIMER_EVB (bgzf_read_block);
- return isize;
+ return ret;
}
// ZIP: BGZF section per component
@@ -402,32 +539,48 @@ void bgzf_compress_bgzf_section (void)
// uncompresses a BGZF block in vb->scratch referred to by bb, into its place in vb->txt_data as prescribed by bb
// might be called from main thread or compute threads
-void bgzf_uncompress_one_block (VBlockP vb, BgzfBlockZip *bb)
+void bgz_uncompress_one_block (VBlockP vb, GzBlockZip *bb, Codec codec)
{
- if (bb->is_decompressed) return; // already decompressed - nothing to do
+ if (bb->is_decompressed) return; // already decompressed (or a BGZF EOF block) - nothing to do
ASSERT0 (vb->gzip_compressor, "vb->gzip_compressor=NULL");
- BgzfHeader *h = (BgzfHeader *)Bc(vb->scratch, bb->compressed_index);
+ int header_len = TXT_IS_BGZF ? BGZF_HEADER_LEN : GZIL_HEADER_LEN;
+
+ uint8_t *h = B8(vb->scratch, bb->compressed_index);
// verify that entire block is within vb->scratch
- ASSERT (bb->compressed_index + sizeof (BgzfHeader) < vb->scratch.len && // we have at least the header - we can access bsize
- bb->compressed_index + (uint32_t)LTEN16 (h->bsize) + 1 <= vb->scratch.len,
- "%s: BGZF block size goes past the end of in vb->scratch: bb=%s compressed_index=%u vb->scratch.len=%"PRIu64,
- VB_NAME, display_bb (bb).s, bb->compressed_index, vb->scratch.len);
+ ASSERT (bb->compressed_index + header_len < vb->scratch.len && // we have at least the header - we can access bsize
+            bb->compressed_index + bb->comp_size <= vb->scratch.len,
+            "%s: %s block size goes past the end of vb->scratch: bb=%s compressed_index=%u vb->scratch.len=%"PRIu64,
+ VB_NAME, codec_name (txt_file->codec), display_bb (bb).s, bb->compressed_index, vb->scratch.len);
- ASSERT (h->id1==31 && h->id2==139, "%s: invalid BGZF block in vb->scratch: compressed_index=%u", VB_NAME, bb->compressed_index);
+ ASSERT (h[0]==31 && h[1]==139, "%s: invalid %s block in vb->scratch: compressed_index=%u", VB_NAME, codec_name (codec), bb->compressed_index);
if (flag.show_bgzf)
- iprintf ("%-7s %s i=%u compressed_index=%u size=%u txt_index=%u size=%u ",
+ iprintf ("UNCOMPRESS thread=%s %s i=%u comp_index=%u comp_len=%u txt_index=%u txt_len=%u eof=%s ",
threads_am_i_main_thread() ? "MAIN" : "COMPUTE", VB_NAME,
- BNUM (vb->bgzf_blocks, bb), bb->compressed_index, bb->comp_size, bb->txt_index, bb->txt_size);
+ BNUM (vb->gz_blocks, bb), bb->compressed_index, bb->comp_size, bb->txt_index, bb->txt_size, TF(bb->is_eof));
enum libdeflate_result ret =
libdeflate_deflate_decompress (vb->gzip_compressor,
- h+1, bb->comp_size - sizeof(BgzfHeader) - sizeof (BgzfFooter), // compressed
+ h + header_len, bb->comp_size - header_len - GZIP_FOOTER_LEN, // compressed
Btxt (bb->txt_index), bb->txt_size, NULL); // uncompressed
+    // account for this block both when it is decompressed and when it is discarded due to the truncate situation below.
+ inc_disk_gz_uncomp_or_trunc (txt_file, bb->comp_size);
+
+    // case: the final GZIL block is truncated, but we have --truncate, and its garbage last word
+    // happened to be < 1MB so it passed as a legitimate block in gzil_read_block. we drop this block now.
+ if (ret != LIBDEFLATE_SUCCESS && vb->txt_codec == CODEC_GZIL && bb->is_eof) {
+ if (flag.truncate)
+ return; // with bb->is_decompressed=false
+ else {
+ ABORT ("Failed to decompress the final %s block of the file: %s. Tip: If it is expected that the file is truncated, use --truncate to ignore the defective final block.",
+ codec_name (vb->txt_codec), libdeflate_error(ret));
+ }
+ }
+
ASSERT (ret == LIBDEFLATE_SUCCESS, "libdeflate_deflate_decompress failed: %s", libdeflate_error(ret));
bb->is_decompressed = true;
@@ -451,15 +604,15 @@ void bgzf_uncompress_one_block (VBlockP vb, BgzfBlockZip *bb)
bgzf_finalize_discovery();
}
-// ZIP: called from the compute thread: zip_compress_one_vb and main thread: txtfile_read_block_bgzf
-void bgzf_uncompress_vb (VBlockP vb)
+// ZIP: called from the compute thread: zip_compress_one_vb and main thread: txtfile_read_block_bgz
+void bgz_uncompress_vb (VBlockP vb, Codec codec)
{
START_TIMER;
vb->gzip_compressor = libdeflate_alloc_decompressor(vb, __FUNCLINE);
- for_buf (BgzfBlockZip, bb, vb->bgzf_blocks)
- bgzf_uncompress_one_block (vb, bb);
+ for_buf (GzBlockZip, bb, vb->gz_blocks)
+ bgz_uncompress_one_block (vb, bb, codec);
libdeflate_free_decompressor ((struct libdeflate_decompressor **)&vb->gzip_compressor, __FUNCLINE);
@@ -470,7 +623,7 @@ void bgzf_uncompress_vb (VBlockP vb)
else COPY_TIMER (bgzf_compute_thread);
}
- COPY_TIMER (bgzf_uncompress_vb);
+ COPY_TIMER (bgz_uncompress_vb);
}
// ZIP: decompresses a prescribed BGZF block when re-reading DEPN lines
@@ -480,19 +633,13 @@ static inline void bgzf_uncompress_one_prescribed_block (VBlockP vb, STRp(bgzf_b
BgzfHeader *h = (BgzfHeader *)bgzf_block;
- // verify that entire block is within vb->scratch
- ASSERT ((uint32_t)LTEN16 (h->bsize) + 1 == bgzf_block_len,
- "%s: BGZF reread: expecting block_size=%u but found %u", VB_NAME, bgzf_block_len, LTEN16 (h->bsize) + 1);
-
- ASSERT (h->id1==31 && h->id2==139, "%s: invalid BGZF block", VB_NAME);
-
if (flag.show_bgzf)
- iprintf ("REREAD vb=%s reread bb_i=%"PRIu64" comp_size=%u uncomp_size=%u ",
+ iprintf ("REREAD %s reread bb_i=%"PRIu64" comp_size=%u uncomp_size=%u ",
VB_NAME, bb_i, bgzf_block_len, uncomp_block_len);
enum libdeflate_result ret =
libdeflate_deflate_decompress (vb->gzip_compressor,
- h+1, bgzf_block_len - sizeof(BgzfHeader) - sizeof (BgzfFooter), // compressed
+ h+1, bgzf_block_len - BGZF_HEADER_LEN - GZIP_FOOTER_LEN, // compressed
STRa(uncomp_block), NULL); // uncompressed
ASSERT (ret == LIBDEFLATE_SUCCESS, "%s: libdeflate_deflate_decompress failed: %s. bgzf_block_len=%u uncomp_block_len=%u bb_i=%"PRIu64,
@@ -506,8 +653,30 @@ static inline void bgzf_uncompress_one_prescribed_block (VBlockP vb, STRp(bgzf_b
COPY_TIMER (bgzf_uncompress_one_prescribed_block);
}
-// ZIP: compute thread of a DEPN VB: actually re-reading data into txt_data according to vb->reread_prescription
-void bgzf_reread_uncompress_vb_as_prescribed (VBlockP vb, FILE *file)
+// ZIP: re-reads and validates one BGZF block
+static void bgzf_reread_one_prescribed_block (FILE *fp, uint64_t offset, qSTRp (bgzf_block))
+{
+ ASSERT (!fseeko64 (fp, offset, SEEK_SET),
+ "fseeko64(%s, %"PRIu64") failed while rereading BGZF depn lines: %s", txt_name, offset, strerror(errno));
+
+ // read the header
+ uint32_t header_bytes = txtfile_fread (txt_file, fp, bgzf_block, BGZF_HEADER_LEN, NULL);
+
+ // failed to read as prescribed
+    ASSERT (header_bytes == BGZF_HEADER_LEN && !memcmp (bgzf_block, BGZF_PREFIX, BGZF_PREFIX_LEN),
+            "failed to re-read a BGZF block header as prescribed BGZF: offset=%"PRIu64" bytes_read=%u header=%s", offset, header_bytes, str_to_hex ((bytes)bgzf_block, header_bytes).s);
+
+ uint32_t body_size = (LTEN16 (((BgzfHeader*)bgzf_block)->bsize) + 1) - BGZF_HEADER_LEN;
+ uint32_t body_bytes = txtfile_fread (txt_file, fp, bgzf_block + BGZF_HEADER_LEN, body_size, NULL);
+
+    ASSERT (body_bytes == body_size, "failed to re-read a BGZF block body as prescribed BGZF: offset=%"PRIu64" bytes_read=%u expected=%u",
+ offset, body_bytes, body_size);
+
+ *bgzf_block_len = BGZF_HEADER_LEN + body_size;
+}
+
+// ZIP: SAM/BAM: compute thread of a DEPN VB: actually re-reading data into txt_data according to vb->reread_prescription
+void bgzf_reread_uncompress_vb_as_prescribed (VBlockP vb, FILE *fp)
{
uint64_t last_offset = -1LL;
char uncomp_block[BGZF_MAX_BLOCK_SIZE];
@@ -518,19 +687,16 @@ void bgzf_reread_uncompress_vb_as_prescribed (VBlockP vb, FILE *file)
// a line might span 1 or more BGZF blocks
while (line->line_len) {
+ ASSERT (line->offset.bb_i < txt_file->bgzf_starts.len32, "Expecting bb_i=%"PRIu64" < bgzf_starts.len=%"PRIu64,
+ (uint64_t)line->offset.bb_i, txt_file->bgzf_starts.len);
uint64_t offset = *B64 (txt_file->bgzf_starts, line->offset.bb_i);
uint32_t isize = BGEN16 (*B16 (txt_file->bgzf_isizes, line->offset.bb_i)) + 1; // maximum isize is 65536 (not 65535)
if (offset != last_offset) {
- ASSERT (!fseeko64 (file, offset, SEEK_SET),
- "%s: fseeko64 on %s failed while rereading BGZF depn lines: %s", VB_NAME, txt_name, strerror(errno));
-
STRl (bgzf_block, BGZF_MAX_BLOCK_SIZE);
- int32_t ret = bgzf_read_block_raw (file, (uint8_t*)qSTRa(bgzf_block), txt_file->basename, false, HARD_FAIL, NULL);
- ASSERT (ret != BGZF_ABRUBT_EOF, "Unexpected BGZF_ABRUBT_EOF while re-reading BGZF block in %s: filesystem=%s offset=%"PRIu64" uncomp_block_size=%u",
- txt_name, arch_get_txt_filesystem().s, offset, isize);
+ bgzf_reread_one_prescribed_block (fp, offset, qSTRa(bgzf_block));
bgzf_uncompress_one_prescribed_block (vb, STRa(bgzf_block), uncomp_block, isize, line->offset.bb_i);
last_offset = offset;
@@ -556,37 +722,37 @@ void bgzf_libdeflate_1_7_initialize (void)
}
// ZIP: called by Seg to set the bgzf index of the next line
-void bgzf_zip_advance_index (VBlockP vb, uint32_t line_len)
+void bgz_zip_advance_index (VBlockP vb, uint32_t line_len)
{
- if (!vb->bgzf_blocks.len) return; // no BGZF blocks in this VB - all data came from "unconsumed_txt"
+ if (!vb->gz_blocks.len) return; // no BGZF blocks in this VB - all data came from "unconsumed_txt"
vb->line_bgzf_uoffset += line_len;
// udpate current_bb_i and bgzf_offset (note: line_len might span multiple bgzf blocks)
- BgzfBlockZip *bb;
- for (bb = B(BgzfBlockZip, vb->bgzf_blocks, vb->bgzf_blocks.current_bb_i);
+ GzBlockZip *bb;
+ for (bb = B(GzBlockZip, vb->gz_blocks, vb->gz_blocks.current_bb_i);
vb->line_bgzf_uoffset && vb->line_bgzf_uoffset >= bb->txt_size; // note: careful to also terminate on the edge case that line_bgzf_uoffset==0 and in the final VB block bb->txt_size==0
bb++)
vb->line_bgzf_uoffset -= bb->txt_size; // index into the next BGZF block
- vb->bgzf_blocks.current_bb_i = BNUM(vb->bgzf_blocks, bb);
+ vb->gz_blocks.current_bb_i = BNUM(vb->gz_blocks, bb);
}
-// ZIP: after reading data for a txt_header or VB, copy unconsumed bgzf_blocks to txt_file->unconsumed_bgzf_blocks
+// ZIP: after reading data for a txt_header or VB, copy unconsumed gz_blocks to txt_file->unconsumed_bgz_blocks
// The first block might be partially consumed.
-int64_t bgzf_copy_unconsumed_blocks (VBlockP vb)
+int64_t bgz_copy_unconsumed_blocks (VBlockP vb)
{
START_TIMER;
- ASSERTISZERO (txt_file->unconsumed_bgzf_blocks.len32);
+ ASSERTISZERO (txt_file->unconsumed_bgz_blocks.len32);
- if (!vb->bgzf_blocks.len) return 0; // not a BGZF-compressed file
+ if (!vb->gz_blocks.len) return 0; // not a BGZF-compressed file
- int32_t consumed = // amount of data in vb->bgzf_blocks that does NOT need to be copied to next VB bc it was consumed by this VB or the previous one
+ int32_t consumed = // amount of data in vb->gz_blocks that does NOT need to be copied to next VB bc it was consumed by this VB or the previous one
Ltxt + // amount of data consumed by this VB
- vb->bgzf_blocks.consumed_by_prev_vb; // amount of data in first BGZF block was conusmed by the previous VB
+        vb->gz_blocks.consumed_by_prev_vb;      // amount of data in the first BGZF block that was consumed by the previous VB
- ARRAY (BgzfBlockZip, bb, vb->bgzf_blocks);
+ ARRAY (GzBlockZip, bb, vb->gz_blocks);
bool done = false;
bool consumed_full_bgzf_blocks=false;
@@ -594,16 +760,16 @@ int64_t bgzf_copy_unconsumed_blocks (VBlockP vb)
for (uint32_t i=0; i < bb_len; i++) {
// if some of the BGZF blocks are not consumed (the first of them might be partially consumed) - move the blocks
- // to unconsumed_bgzf_blocks - to be moved to the next VB
+ // to unconsumed_bgz_blocks - to be moved to the next VB
if (consumed - bb[i].txt_size < 0 && !done/*enter only once*/) {
consumed_full_bgzf_blocks = (consumed == 0); // no partially-consumed block
// block i might be partially consumed or not consumed at all, subsequent blocks are not consumed at all
- buf_append (evb, txt_file->unconsumed_bgzf_blocks, BgzfBlockZip,
- B(BgzfBlockZip, vb->bgzf_blocks, i), vb->bgzf_blocks.len32 - i, "txt_file->unconsumed_bgzf_blocks");
+ buf_append (evb, txt_file->unconsumed_bgz_blocks, GzBlockZip,
+ B(GzBlockZip, vb->gz_blocks, i), vb->gz_blocks.len32 - i, "txt_file->unconsumed_bgz_blocks");
- txt_file->unconsumed_bgzf_blocks.consumed_by_prev_vb = consumed; // part of first BGZF block already consumed
+ txt_file->unconsumed_bgz_blocks.consumed_by_prev_vb = consumed; // part of first BGZF block already consumed
done = true;
}
else if (!done)
@@ -617,55 +783,55 @@ int64_t bgzf_copy_unconsumed_blocks (VBlockP vb)
// update bb.txt_index for next VB
// note: first bb.txt_data of the next VB is possibly negative if some of its data was consumed by the current VB
- int32_t txt_index = -txt_file->unconsumed_bgzf_blocks.consumed_by_prev_vb;
- for_buf (BgzfBlockZip, bb, txt_file->unconsumed_bgzf_blocks) {
+ int32_t txt_index = -txt_file->unconsumed_bgz_blocks.consumed_by_prev_vb;
+ for_buf (GzBlockZip, bb, txt_file->unconsumed_bgz_blocks) {
bb->txt_index = txt_index;
txt_index += bb->txt_size;
}
- COPY_TIMER (bgzf_copy_unconsumed_blocks);
+ COPY_TIMER (bgz_copy_unconsumed_blocks);
return consumed_full_bgzf_blocks ? compressed_size : 0;
}
// return blocks used by the segconf VB to the unconsumed blocks
-void bgzf_return_segconf_blocks (VBlockP vb)
+void bgz_return_segconf_blocks (VBlockP vb)
{
- buf_copy (evb, &txt_file->unconsumed_bgzf_blocks, &vb->bgzf_blocks, BgzfBlockZip, 0, 0, 0);
- txt_file->unconsumed_bgzf_blocks.consumed_by_prev_vb = vb->bgzf_blocks.consumed_by_prev_vb;
+ buf_copy (evb, &txt_file->unconsumed_bgz_blocks, &vb->gz_blocks, GzBlockZip, 0, 0, 0);
+ txt_file->unconsumed_bgz_blocks.consumed_by_prev_vb = vb->gz_blocks.consumed_by_prev_vb;
}
-// ZIP: before reading data for a VB, populate bgzf_blocks with some or all of the unconsumed blocks passed
+// ZIP: before reading data for a VB, populate gz_blocks with some or all of the unconsumed blocks passed
// from the previous VB or txt_header
-void bgzf_zip_init_vb (VBlockP vb)
+void bgz_zip_init_vb (VBlockP vb)
{
- vb->vb_bgzf_i = txt_file->bgzf_isizes.len + txt_file->bgzf_flags.has_eof_block; // index of first bgzf block to be used by the VB
+ vb->vb_bgz_i = txt_file->bgzf_isizes.len + txt_file->bgzf_flags.has_eof_block; // index of first bgzf block to be used by the VB
- if (!txt_file->unconsumed_bgzf_blocks.len) return; // happens when either unconsumed_bytes=0 or not a BGZF-compressed file
+ if (!txt_file->unconsumed_bgz_blocks.len) return; // happens when either unconsumed_bytes=0 or not a BGZF-compressed file
// data in the first BGZF block already consumed by previous VB or txt_header
- vb->bgzf_blocks.consumed_by_prev_vb = vb->line_bgzf_uoffset = txt_file->unconsumed_bgzf_blocks.consumed_by_prev_vb;
+ vb->gz_blocks.consumed_by_prev_vb = vb->line_bgzf_uoffset = txt_file->unconsumed_bgz_blocks.consumed_by_prev_vb;
- // copy all unconsumed BGZF blocks - we might not need all of them - the unconsumed ones will moved back in bgzf_copy_unconsumed_blocks
- buf_copy (vb, &vb->bgzf_blocks, &txt_file->unconsumed_bgzf_blocks, BgzfBlockZip, 0, 0, "bgzf_blocks");
+    // copy all unconsumed BGZF blocks - we might not need all of them - the unconsumed ones will be moved back in bgz_copy_unconsumed_blocks
+ buf_copy (vb, &vb->gz_blocks, &txt_file->unconsumed_bgz_blocks, GzBlockZip, 0, 0, "gz_blocks");
- vb->vb_bgzf_i -= txt_file->unconsumed_bgzf_blocks.len32;
+ vb->vb_bgz_i -= txt_file->unconsumed_bgz_blocks.len32;
- txt_file->unconsumed_bgzf_blocks.len32 = txt_file->unconsumed_bgzf_blocks.consumed_by_prev_vb = 0;
+ txt_file->unconsumed_bgz_blocks.len32 = txt_file->unconsumed_bgz_blocks.consumed_by_prev_vb = 0;
// sanity check
- int32_t available = -vb->bgzf_blocks.consumed_by_prev_vb; // possibly start negative
- for_buf (BgzfBlockZip, bb, vb->bgzf_blocks)
+ int32_t available = -vb->gz_blocks.consumed_by_prev_vb; // possibly start negative
+ for_buf (GzBlockZip, bb, vb->gz_blocks)
available += bb->txt_size;
- ASSERT (available >= Ltxt, "BGZF blocks in txt_file->unconsumed_bgzf_blocks cover only %d bytes, less than the needed unconsumed_bytes=%d",
- available, Ltxt);
+ ASSERT (available >= Ltxt, "%s blocks in txt_file->unconsumed_bgz_blocks cover only %d bytes, less than the needed unconsumed_bytes=%d",
+ codec_name (txt_file->codec), available, Ltxt);
}
//-----------------------------------------------------
// PIZ SIDE - setting up BGZF for a particular txt file
//-----------------------------------------------------
-static Buffer isizes = {}; // will be grabbed into txt_file->bgzf_isizes;
+static Buffer isizes = {}; // Will be grabbed into txt_file->bgzf_isizes.
static inline FlagsBgzf recompression_template (int bgzf_level)
{
@@ -857,12 +1023,12 @@ static void bgzf_compress_one_block (VBlockP vb, rom in, uint32_t isize,
ASSERT0 (vb->gzip_compressor, "vb->gzip_compressor=NULL");
- #define BGZF_MAX_CDATA_SIZE (BGZF_MAX_BLOCK_SIZE - sizeof (BgzfHeader) - sizeof (BgzfFooter))
+ #define BGZF_MAX_CDATA_SIZE (BGZF_MAX_BLOCK_SIZE - BGZF_HEADER_LEN - GZIP_FOOTER_LEN)
buf_alloc (vb, compressed, BGZF_MAX_BLOCK_SIZE, 0, char, 1.2, "scratch");
BgzfHeader *header = (BgzfHeader *)BAFTc (*compressed);
- buf_add (compressed, BGZF_EOF, sizeof (BgzfHeader)); // template of header - only bsize needs updating
+ buf_add (compressed, BGZF_EOF, BGZF_HEADER_LEN); // template of header - only bsize needs updating
uint32_t comp_index = compressed->len;
int out_size;
@@ -878,12 +1044,12 @@ static void bgzf_compress_one_block (VBlockP vb, rom in, uint32_t isize,
strm.next_in = (uint8_t *)in;
strm.avail_in = isize;
strm.next_out = BAFT8 (*compressed);
- strm.avail_out = BGZF_MAX_CDATA_SIZE + sizeof (BgzfFooter);
+ strm.avail_out = BGZF_MAX_CDATA_SIZE + GZIP_FOOTER_LEN;
int ret = isal_deflate_stateless (&strm);
ASSERT (ret == ISAL_DECOMP_OK, "%s: isal_deflate_stateless: %s. isize=%u", VB_NAME, isal_error (ret), isize);
- out_size = BGZF_MAX_CDATA_SIZE + sizeof (BgzfFooter) - strm.avail_out;
+ out_size = BGZF_MAX_CDATA_SIZE + GZIP_FOOTER_LEN - strm.avail_out;
}
else if (txt_file->bgzf_flags.library == BGZF_LIBDEFLATE19) { // libdeflate 1.19
@@ -895,7 +1061,7 @@ static void bgzf_compress_one_block (VBlockP vb, rom in, uint32_t isize,
// "scratch" to just under 64K while in our compression level it is just over 64K.
if (!out_size) {
void *high_compressor = libdeflate_alloc_compressor (vb, LIBDEFLATE_MAX_LEVEL, __FUNCLINE); // libdefate's highest level
- out_size = libdeflate_deflate_compress (vb->gzip_compressor, in, isize, BAFTc (*compressed), BGZF_MAX_CDATA_SIZE);
+ out_size = libdeflate_deflate_compress (high_compressor, in, isize, BAFTc (*compressed), BGZF_MAX_CDATA_SIZE);
libdeflate_free_compressor (high_compressor, __FUNCLINE);
}
}
@@ -935,17 +1101,17 @@ static void bgzf_compress_one_block (VBlockP vb, rom in, uint32_t isize,
ASSERT (out_size, "cannot compress block with %u bytes into a BGZF block with %u bytes", isize, BGZF_MAX_BLOCK_SIZE);
compressed->len += out_size;
- header->bsize = LTEN16 ((uint16_t)(sizeof (BgzfHeader) + out_size + sizeof (BgzfFooter) - 1));
+ header->bsize = LTEN16 ((uint16_t)(BGZF_HEADER_LEN + out_size + GZIP_FOOTER_LEN - 1));
- BgzfFooter footer = { .crc32 = LTEN32 (crc32 (0, in, isize)),
+ GzipFooter footer = { .crc32 = LTEN32 (crc32 (0, in, isize)),
.isize = LTEN32 (isize) };
- buf_add (compressed, (rom)&footer, sizeof (BgzfFooter));
+ buf_add (compressed, (rom)&footer, GZIP_FOOTER_LEN);
if (flag.show_bgzf)
#define C(i) (i < isize ? char_to_printable (in[i]).s : "")
- iprintf ("%-7s %s i=%d compressed_index=%u size=%u txt_index=%d size=%u txt_data[5]=%1s%1s%1s%1s%1s %s\n",
+ iprintf ("COMPRESS thread=%s %s i=%d compressed_index=%u size=%u txt_index=%d size=%u txt_data[5]=%1s%1s%1s%1s%1s %s\n",
threads_am_i_main_thread() ? "MAIN" : threads_am_i_writer_thread() ? "WRITER" : "COMPUTE", VB_NAME, block_i,
- comp_index - (int)sizeof (BgzfHeader), (unsigned)out_size + (int)sizeof (BgzfHeader) + (int)sizeof (BgzfFooter), txt_index, isize, C(0), C(1), C(2), C(3), C(4),
+ comp_index - (int)BGZF_HEADER_LEN, (unsigned)out_size + BGZF_HEADER_LEN + GZIP_FOOTER_LEN, txt_index, isize, C(0), C(1), C(2), C(3), C(4),
out_size == BGZF_EOF_LEN ? "EOF" : "");
#undef C
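For orientation, a minimal sketch (not part of the patch; bgzf_block_disk_size is a hypothetical helper) of the arithmetic behind the bsize and footer handling above, using only the constants visible in this hunk:

// Sketch only: total on-disk size of one BGZF block as assembled above - the gzip header template
// copied from BGZF_EOF, the deflate payload, and the crc32/isize footer.
// header->bsize stores this value minus 1, per the BGZF framing.
static inline uint32_t bgzf_block_disk_size (uint32_t deflate_payload_size)
{
    return BGZF_HEADER_LEN + deflate_payload_size + GZIP_FOOTER_LEN;
}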
@@ -984,7 +1150,7 @@ void bgzf_sign (uint64_t disk_size, uint8_t *signature)
}
// Entry point of BGZF compression compute thread.
-// bgzf-compress vb->txt_data into vb->z_data - using BGZF blocks as prescribed in vb->bgzf_blocks.
+// bgzf-compress vb->txt_data into vb->z_data - using BGZF blocks as prescribed in vb->gz_blocks.
// Note: we hope to reconstruct the exact same byte-level BGZF blocks as in the original file, but that
// will only happen if the GZIP library (eg libdeflate), its version and its parameters are the same
static void bgzf_compress_vb (VBlockP vb)
@@ -995,12 +1161,12 @@ static void bgzf_compress_vb (VBlockP vb)
iprintf ("%s: start BZGF re-compression: bgzf_library=%s bgzf_level=%d\n",
VB_NAME, bgzf_library_name (txt_file->bgzf_flags.library, true), txt_file->bgzf_flags.level);
- ASSERTNOTEMPTY (vb->bgzf_blocks);
+ ASSERTNOTEMPTY (vb->gz_blocks);
- buf_alloc (vb, &vb->z_data, 0, vb->bgzf_blocks.len32 * BGZF_MAX_BLOCK_SIZE/2, uint8_t, 1, "z_data"); // alloc based on estimated size
+ buf_alloc (vb, &vb->z_data, 0, vb->gz_blocks.len32 * BGZF_MAX_BLOCK_SIZE/2, uint8_t, 1, "z_data"); // alloc based on estimated size
bgzf_alloc_compressor (vb, txt_file->bgzf_flags);
- for_buf2 (BgzfBlockPiz, block, i, vb->bgzf_blocks) {
+ for_buf2 (BgzfBlockPiz, block, i, vb->gz_blocks) {
ASSERT (block->txt_index + block->txt_size <= Ltxt,
"block=%u out of range: expecting txt_index=%u txt_size=%u <= txt_data.len=%u",
@@ -1038,9 +1204,9 @@ static uint32_t bgzf_calculate_blocks_one_vb (VBlockP vb, bool is_last)
break; // the data at the end of this VB doesn't fill a whole BGZF block - pass it down to next vb
}
- buf_alloc (vb, &vb->bgzf_blocks, 1, Ltxt / 63000, BgzfBlockPiz, 1.5, "bgzf_blocks");
+ buf_alloc (vb, &vb->gz_blocks, 1, Ltxt / 63000, BgzfBlockPiz, 1.5, "gz_blocks");
- BNXT (BgzfBlockPiz, vb->bgzf_blocks) = (BgzfBlockPiz){ .txt_index = index, .txt_size = isize };
+ BNXT (BgzfBlockPiz, vb->gz_blocks) = (BgzfBlockPiz){ .txt_index = index, .txt_size = isize };
index += isize;
txt_file->bgzf_isizes.next++;
@@ -1097,3 +1263,26 @@ rom bgzf_library_name (BgzfLibraryType library, bool long_name)
: long_name ? (rom[])BGZF_LIB_NAMES_LONG[library]
: (rom[])BGZF_LIB_NAMES_SHRT[library];
}
+
+// used by test/Makefile
+void gzil_compress (void)
+{
+ void *compressor = libdeflate_alloc_compressor (evb, 5, __FUNCLINE);
+
+ uint8_t *in = MALLOC (1 MB), *out = MALLOC (2 MB);
+ uint32_t in_len;
+ for (int i=0; (in_len = fread (in, 1, 1 MB, stdin)); i++) {
+ GzipFooter footer = { .crc32 = LTEN32 (crc32 (0, in, in_len)),
+ .isize = LTEN32 (in_len) };
+
+ uint32_t out_len = libdeflate_deflate_compress (compressor, in, in_len, out, 2 MB);
+ ASSERT (out_len, "deflate failed: in_len=%u block_i=%u", in_len, i);
+
+ ASSERT0 (1 == fwrite (_S(GZIL_HEADER), 1, stdout), "fwrite failed #1");
+ ASSERT (1 == fwrite (STRa(out), 1, stdout), "fwrite failed: #2 out_len=%u", out_len);
+ ASSERT0 (1 == fwrite (&footer, sizeof (footer), 1, stdout), "fwrite failed #3");
+ }
+
+ fflush (stdout);
+ exit (0);
+}
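As a sanity check on the stream gzil_compress emits, here is a hedged sketch of how a single GZIL member (a GZIL_HEADER_LEN-byte gzip header, a raw deflate payload, and an 8-byte crc32+isize footer, per the definitions in src/bgzf.h below) could be verified. gzil_verify_member, comp and uncomp are hypothetical names, and unlike the wrapped allocator used above, this uses libdeflate's stock API:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <libdeflate.h>

// Sketch only: verify one GZIL member. The footer fields are little-endian on disk,
// so this check additionally assumes a little-endian host.
static bool gzil_verify_member (const uint8_t *comp, size_t comp_len, uint8_t *uncomp, size_t uncomp_size)
{
    if (comp_len < GZIL_HEADER_LEN + 8) return false;

    struct libdeflate_decompressor *d = libdeflate_alloc_decompressor();
    size_t actual = 0;
    enum libdeflate_result res = libdeflate_deflate_decompress (
        d, comp + GZIL_HEADER_LEN, comp_len - GZIL_HEADER_LEN - 8, uncomp, uncomp_size, &actual);
    libdeflate_free_decompressor (d);
    if (res != LIBDEFLATE_SUCCESS) return false;

    uint32_t crc, isize;                       // footer: crc32 of the uncompressed data, then its length
    memcpy (&crc,   comp + comp_len - 8, 4);
    memcpy (&isize, comp + comp_len - 4, 4);
    return crc == libdeflate_crc32 (0, uncomp, actual) && isize == (uint32_t)actual;
}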
diff --git a/src/bgzf.h b/src/bgzf.h
index 3a404758..53984b2c 100644
--- a/src/bgzf.h
+++ b/src/bgzf.h
@@ -10,7 +10,8 @@
#define BGZF_DEFAULT_LEVEL 2 // PIZ: used if --bgzf is not specified (it is actually faster than 1 if also writing to disk)
-#define BGZF_MAX_BLOCK_SIZE 65536 // maximum block size of both compressed and uncompressed data of one block
+#define BGZF_MAX_BLOCK_SIZE ((uint32_t)(64 KB)) // maximum block size of both compressed and uncompressed data of one block
+#define BGZF_MAX_CHUCK_SIZE ((uint32_t)(1 MB)) // max amount we read from disk at a time
// First 16 bytes of every BGZF block
#define BGZF_PREFIX_LEN 16
@@ -20,55 +21,61 @@
#define BGZF_EOF_LEN 28
#define BGZF_EOF BGZF_PREFIX "\x1b\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00"
-// data type of VBlock.bgzf_blocks
-typedef struct BgzfBlockZip {
- int32_t txt_index; // index of uncompressed block within vb->txt_data. If there is passed-down data from previous VB/txt_header, then txt_index of the first block will be negative (see bgzf_copy_unconsumed_blocks)
- uint32_t txt_size : 17; // max value is BGZF_MAX_BLOCK_SIZE
- uint32_t is_decompressed : 1; // has data been BGZF-decompressed by main thread
- uint32_t compressed_index, comp_size; // index within vb->scratch
-} BgzfBlockZip;
+// maximum block size of the uncompressed data. Since GZIL is used only for FASTQ, and FASTQ is quite compressible,
+// we assume that (GZIL_MAX_BLOCK_SIZE - GZIL_HEADER_LEN) is also an upper limit on the GZIL-compressed data
+#define GZIL_MAX_BLOCK_SIZE ((uint32_t)(1 MB))
+
+#define GZIL_HEADER "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x03"
+#define GZIL_HEADER_LEN 10
+
+#define GZIL_ISIZE "\x00\x00\x10\x00" // isize == 1MB in all blocks except the last
+#define GZIL_ISIZE_LEN 4
typedef struct BgzfBlockPiz {
int32_t txt_index, txt_size; // index of uncompressed block within vb->txt_data. The first block index will be negative if there is passed-down unconsumed data
} BgzfBlockPiz;
-extern void bgzf_libdeflate_1_7_initialize (void);
-extern void bgzf_sign (uint64_t disk_size, uint8_t *signature);
-
-//---------
// ZIP side
-//---------
+typedef enum { GZ_SUCCESS, GZ_IS_GZIP_NOT_BGZF, GZ_IS_NOT_GZIL, GZ_IS_NOT_GZIP, GZ_EOF_WITHOUT_EOF_BLOCK, GZ_TRUNCATED/*likely file is truncated*/, NUM_GZ_STATUSES } GzStatus;
+#define GZSTATUS_NAMES { "SUCCESS", "IS_GZIP_NOT_BGZF", "IS_NOT_GZIL", "IS_NOT_GZIP", "EOF_WITHOUT_EOF_BLOCK", "TRUNCATED" }
-#define BGZF_BLOCK_SUCCESS 0
-#define BGZF_BLOCK_GZIP_NOT_BGZIP -1
-#define BGZF_BLOCK_IS_NOT_GZIP -2
-#define BGZF_ABRUBT_EOF -3 // EOF without an EOF block
-#define BGZF_BLOCK_TRUNCATED -4 // likely file is truncated
-extern int32_t bgzf_read_block (FileP file, uint8_t *block, uint32_t *block_size, FailType soft_fail);
-extern void bgzf_uncompress_vb (VBlockP vb);
-extern void bgzf_uncompress_one_block (VBlockP vb, BgzfBlockZip *bb);
+// data type of VBlock.gz_blocks and txt_file->unconsumed_bgz_blocks : details of BGZF/GZIL blocks.
+typedef struct GzBlockZip {
+ int32_t txt_index; // index of uncompressed block within vb->txt_data. If there is passed-down data from previous VB/txt_header, then txt_index of the first block will be negative (see bgz_copy_unconsumed_blocks)
+ uint32_t txt_size : 30;
+ uint32_t is_decompressed : 1; // true if data has been GZ-decompressed by main thread
+ uint32_t is_eof : 1; // true if this is the last GZ-block in the file
+ uint32_t compressed_index, comp_size; // index within vb->scratch
+} GzBlockZip;
+
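To illustrate the negative-txt_index convention documented above (all numbers hypothetical): when the previous VB or the txt_header consumed the beginning of a GZ block, the block still appears in this VB's gz_blocks, shifted back so that txt_index + txt_size still marks the end of the block within txt_data:

// Sketch only: a block of 60000 uncompressed bytes whose first 1000 bytes were consumed
// by the previous VB; only its remaining 59000 bytes sit at the start of this VB's txt_data.
GzBlockZip bb = { .txt_index = -1000, .txt_size = 60000 };
int32_t remaining_in_this_vb = bb.txt_index + (int32_t)bb.txt_size; // 59000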
+extern GzStatus bgzf_read_block (FileP file, bool discovering);
+extern GzStatus gzil_read_block (FileP file, bool discovering, bool *is_eof);
+extern void bgz_uncompress_vb (VBlockP vb, Codec codec);
+extern void bgz_uncompress_one_block (VBlockP vb, GzBlockZip *bb, Codec codec);
extern void bgzf_reread_uncompress_vb_as_prescribed (VBlockP vb, FILE *file);
extern void bgzf_compress_bgzf_section (void);
-extern void bgzf_zip_advance_index (VBlockP vb, uint32_t line_len);
-extern int64_t bgzf_copy_unconsumed_blocks (VBlockP vb);
-extern void bgzf_zip_init_vb (VBlockP vb);
+extern void bgz_zip_advance_index (VBlockP vb, uint32_t line_len);
+extern int64_t bgz_copy_unconsumed_blocks (VBlockP vb);
+extern void bgz_zip_init_vb (VBlockP vb);
extern void bgzf_insert_back_segconf_blocks (VBlockP vb);
-extern void bgzf_return_segconf_blocks (VBlockP vb);
+extern void bgz_return_segconf_blocks (VBlockP vb);
+
+extern void inc_disk_gz_uncomp_or_trunc_(FileP file, uint64_t inc, FUNCLINE);
+#define inc_disk_gz_uncomp_or_trunc(file, inc) inc_disk_gz_uncomp_or_trunc_((file), (inc), __FUNCLINE)
// library / level discovery
extern void bgzf_initialize_discovery (FileP file);
extern void bgzf_finalize_discovery (void);
-#define consumed_by_prev_vb prm32[0] // bytes of the first BGZF block consumed by the prev VB or txt_header
-#define current_bb_i prm32[1] // index into vb->bgzf_blocks of first bgzf block of current line
-
-//---------
// PIZ side
-//---------
-
extern FlagsBgzf bgzf_piz_calculate_bgzf_flags (CompIType comp_i, Codec src_codec);
extern void bgzf_piz_set_txt_file_bgzf_info (FlagsBgzf bgzf_flags, bytes codec_info);
extern void bgzf_dispatch_compress (Dispatcher dispatcher, STRp (uncomp), CompIType comp_i, bool is_last);
extern void bgzf_write_finalize (void);
-extern rom bgzf_library_name (BgzfLibraryType library, bool long_name);
\ No newline at end of file
+// misc
+extern rom bgzf_library_name (BgzfLibraryType library, bool long_name);
+extern rom gzstatus_name (GzStatus st);
+extern void gzil_compress (void);
+extern void bgzf_libdeflate_1_7_initialize (void);
+extern void bgzf_sign (uint64_t disk_size, uint8_t *signature);
diff --git a/src/buf_struct.c b/src/buf_struct.c
index 82190af5..b1d2f7c0 100644
--- a/src/buf_struct.c
+++ b/src/buf_struct.c
@@ -77,9 +77,9 @@ void buf_initialize()
rom buf_type_name (ConstBufferP buf)
{
- static rom names[] = BUFTYPE_NAMES;
- if (buf->type >= 0 && buf->type < BUF_NUM_TYPES)
- return names[buf->type];
+ if (IN_RANGE (buf->type, 0, BUF_NUM_TYPES-1))
+ return (rom[])BUFTYPE_NAMES[buf->type];
+
else {
char *s = malloc (32); // used for error printing
snprintf (s, 32, "invalid_buf_type=%u", buf->type);
@@ -113,15 +113,15 @@ const BufDescType buf_desc (ConstBufferP buf)
}
// quick inline for internal buf_struct.c use check overflow and underflow in an allocated buffer
-static void no_integrity (ConstBufferP buf, FUNCLINE, rom buf_func)
+static inline void no_integrity (ConstBufferP buf, FUNCLINE, rom buf_func)
{
flag.quiet = false;
- ASSERTW (BUNDERFLOW(buf) == UNDERFLOW_TRAP, "called from %s:%u to %s: Error in %s: buffer has corrupt underflow trap",
- func, code_line, buf_func, buf_desc(buf).s);
+ ASSERTW (BUNDERFLOW(buf) == UNDERFLOW_TRAP, "called from %s:%u to %s: Error in %s: buffer has corrupt underflow trap: %s",
+ func, code_line, buf_func, buf_desc(buf).s, str_to_printable_(buf->memory, 8).s);
- ASSERTW (BOVERFLOW(buf) == OVERFLOW_TRAP, "called from %s:%u to %s: Error in %s: buffer has corrupt overflow trap",
- func, code_line, buf_func, buf_desc(buf).s);
+ ASSERTW (BOVERFLOW(buf) == OVERFLOW_TRAP, "called from %s:%u to %s: Error in %s: buffer has corrupt overflow trap: %s",
+ func, code_line, buf_func, buf_desc(buf).s, str_to_printable_(buf->memory + buf->size + sizeof(uint64_t), 8).s);
bool corruption_detected = buflist_test_overflows (buf->vb, buf_func);
if (corruption_detected) buflist_test_overflows_all_other_vb (buf->vb, buf_func, true); // corruption not from this VB - test the others
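For context, the memory layout these trap checks imply, inferred from the offsets used above (the trap constants themselves are defined elsewhere in buf_struct):

// Sketch: a regular allocation as implied by the checks above -
//   buf->memory : [ 8-byte UNDERFLOW_TRAP ][ buf->size data bytes ][ 8-byte OVERFLOW_TRAP ]
// hence the overflow trap starts at buf->memory + buf->size + sizeof(uint64_t),
// which is the address whose 8 bytes are printed when that trap is found corrupted.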
diff --git a/src/buf_struct.h b/src/buf_struct.h
index e8852935..271e062c 100644
--- a/src/buf_struct.h
+++ b/src/buf_struct.h
@@ -17,7 +17,7 @@
// Notes: 1 byte. BUF_UNALLOCATED must be 0. all values must be identical to
#define BUF_TYPE_BITS 3
typedef enum { BUF_UNALLOCATED=0, BUF_REGULAR, BITS_STANDALONE, BUF_SHM, BUF_DISOWNED, BUF_NUM_TYPES } BufferType;
-#define BUFTYPE_NAMES { "UNALLOCATED", "REGULAR", "BITS_STANDALONE", "SHM", "DISOWNED" }
+#define BUFTYPE_NAMES { "UNALLOCATED", "REGULAR", "BITS_STANDALONE", "SHM", "DISOWNED" }
typedef struct {
bool lock;
@@ -42,6 +42,9 @@ typedef struct Buffer { // 72 bytes
uint16_t prm16[4];
uint8_t prm8 [8];
void *pointer;
+ struct { int32_t uncomp_len, comp_len; }; // (signed) used for compressed buffers: uncomp_len is the uncompressed length of the comp_len compressed bytes in the buffer
+ struct { uint32_t consumed_by_prev_vb; // vb->gz_blocks: bytes of the first BGZF block consumed by the prev VB or txt_header
+ uint32_t current_bb_i; }; // index into vb->gz_blocks of first bgzf block of current line
};
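These named overlays replace the prm32[0]/prm32[1] #defines dropped from bgzf.h earlier in this patch; a hypothetical usage sketch on vb->gz_blocks:

// Sketch only (field values are made up):
vb->gz_blocks.consumed_by_prev_vb = 1000; // bytes of the first GZ block already used by the previous VB or txt_header
vb->gz_blocks.current_bb_i        = 0;    // index into vb->gz_blocks of the first GZ block of the current line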
union {
uint64_t len; // used by the buffer user according to its internal logic. not modified by malloc/realloc, zeroed by buf_free (in Bits - nwords)
@@ -94,7 +97,7 @@ extern void buf_initialize(void);
#define ASSERTNOTINUSE(buf) ASSERT (!buf_is_alloc (&(buf)) && !(buf).len && !(buf).param, "expecting %s to be free, but it's not: %s", #buf, buf_desc (&(buf)).s)
#define ASSERTISALLOCED(buf) ASSERT (buf_is_alloc (&(buf)), "%s is not allocated", #buf)
#define ASSERTISEMPTY(buf) ASSERT (buf_is_alloc (&(buf)) && !(buf).len, "expecting %s to be be allocated and empty, but it isn't: %s", #buf, buf_desc (&(buf)).s)
-#define ASSERTNOTEMPTY(buf) ASSERT ((buf).len, "expecting %s to be contain some data, but it doesn't: %s", #buf, buf_desc (&(buf)).s)
+#define ASSERTNOTEMPTY(buf) ASSERT ((buf).len && (buf).data, "expecting %s to contain some data, but it doesn't: %s", #buf, buf_desc (&(buf)).s)
extern void buf_alloc_do (VBlockP vb, BufferP buf, uint64_t requested_size, float grow_at_least_factor, rom name, FUNCLINE);
diff --git a/src/chrom.c b/src/chrom.c
index fbe57dbf..0425b25d 100644
--- a/src/chrom.c
+++ b/src/chrom.c
@@ -99,8 +99,8 @@ void chrom_2ref_load (Reference ref)
WordIndex chrom_index = BGEN32 (ent->chrom_index);
WordIndex ref_index = BGEN32 (ent->ref_index);
- ASSERT (chrom_index >= 0 && chrom_index < zctx->word_list.len, "chrom_index=%d ∉ [0,%d]", chrom_index, (int32_t)zctx->word_list.len-1);
- ASSERT (!num_ref_contigs /* ref not loaded */ || (ref_index >= -1 && ref_index < num_ref_contigs),
+ ASSERT (IN_RANGE(chrom_index, 0, zctx->word_list.len32-1), "chrom_index=%d ∉ [0,%d]", chrom_index, (int32_t)zctx->word_list.len-1);
+ ASSERT (!num_ref_contigs /* ref not loaded */ || IN_RANGE (ref_index, -1, num_ref_contigs-1),
"ref_index=%d ∉ [-1,%u] (chrom_index=%u i=%u len=%u)",
ref_index, num_ref_contigs-1, chrom_index, i, evb->scratch.len32);
@@ -135,7 +135,7 @@ WordIndex chrom_2ref_seg_get (Reference ref, ConstVBlockP vb, WordIndex chrom_in
: (chrom_index < ctx->chrom2ref_map.len32) ? *B(WordIndex, ctx->chrom2ref_map, chrom_index - ol_len) // possibly WORD_INDEX_NONE, see chrom_seg_ex
: WORD_INDEX_NONE;
- ASSSEG (ref_index >= WORD_INDEX_NONE && ref_index < (WordIndex)ref_num_contigs (ref),
+ ASSSEG (IN_RANGE (ref_index, WORD_INDEX_NONE, (WordIndex)ref_num_contigs (ref)-1),
"ref_index=%d out of range: ref->ranges.len=%u, chrom_index=%d", ref_index, ref_num_contigs (ref), chrom_index);
return ref_index;
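This and the following hunks replace open-coded range checks with IN_RANGE; judging by the call sites (e.g. 0..len32-1, -1..num_ref_contigs-1, FIRST_Q..LAST_Q), both bounds are inclusive, so its definition is presumably along these lines (an assumption, not copied from the source):

// Assumed definition, consistent with the call sites in this patch (inclusive on both ends):
#define IN_RANGE(x, min, max) ((x) >= (min) && (x) <= (max))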
diff --git a/src/codec.c b/src/codec.c
index 45127a3e..94fc0f2c 100644
--- a/src/codec.c
+++ b/src/codec.c
@@ -94,7 +94,7 @@ static uint32_t codec_est_size_default (Codec codec, uint64_t uncompressed_len)
// returns 4-character codec name
rom codec_name (Codec codec)
{
- return (codec >=0 && codec < NUM_CODECS) ? codec_args[codec].name : "BAD!";
+ return IN_RANGE (codec, 0, NUM_CODECS-1) ? codec_args[codec].name : "BAD!";
}
void codec_initialize (void)
@@ -311,7 +311,7 @@ Codec codec_assign_best_codec (VBlockP vb,
// save the assignment for future VBs, but not in --best, where each VB tests on its own.
// note: for local (except in --fast), we don't commit for vb=1 bc less representative of data
// (ok for --best as we count (BEST_LOCK_IN_THREASHOLD) anyway))
- if ((is_b250 || (is_local && (flag.best || flag.fast || vb->vblock_i > 1 || vb->is_eof))) && *selected_codec != CODEC_UNKNOWN && zctx)
+ if ((is_b250 || (is_local && (flag.best || flag.fast || vb->vblock_i > 1 || vb->is_last_vb_in_txt_file))) && *selected_codec != CODEC_UNKNOWN && zctx)
ctx_commit_codec_to_zf_ctx (vb, ctx, is_local, true);
done:
diff --git a/src/codec.h b/src/codec.h
index 9e7a6993..e7861e51 100644
--- a/src/codec.h
+++ b/src/codec.h
@@ -97,6 +97,7 @@ typedef struct {
{ 0, "SMUX", "+", codec_smux_compress, USE_SUBCODEC, codec_smux_reconstruct, codec_trivial_size, }, \
{ 0, "ORA", "+.ora", NA1, NA2, NA3, NA4 }, \
{ 0, "OQ", "+", codec_oq_compress, USE_SUBCODEC, codec_oq_reconstruct, codec_RANB_est_size, }, \
+ { 0, "GZIL", "+.gz", NA1, NA2, NA3, NA4 }, \
}
extern CodecArgs codec_args[NUM_CODECS];
diff --git a/src/codec_domq.c b/src/codec_domq.c
index d5f4cf23..c5f48e62 100644
--- a/src/codec_domq.c
+++ b/src/codec_domq.c
@@ -155,7 +155,7 @@ static void codec_domq_calc_histogram (VBlockP vb, ContextP qual_ctx, ContextP d
// validate bases and get line dom
uint32_t max_score_count=0;
for (int ascii_i=0; ascii_i < 256; ascii_i++) {
- ASSERT ((ascii_i >= FIRST_Q && ascii_i <= LAST_Q) || !line_ascii_histogram[ascii_i],
+ ASSERT (IN_RANGE (ascii_i, FIRST_Q, LAST_Q) || !line_ascii_histogram[ascii_i],
"%s/%u: QUAL value=%u ∉ [%u, %u] for %s",
VB_NAME, line_i, ascii_i, FIRST_Q, LAST_Q, qual_ctx->tag_name);
diff --git a/src/conda/meta.template.yaml b/src/conda/meta.template.yaml
index cd758315..7432a19e 100644
--- a/src/conda/meta.template.yaml
+++ b/src/conda/meta.template.yaml
@@ -26,17 +26,17 @@ build:
requirements:
build:
- - {{posix}}make
+ - {{ posix }}make
- nasm # [not arm64]
- {{ compiler('c') }} # [not win]
- {{ compiler('cxx') }} # [not win]
- {{ compiler('m2w64_c') }} # [win]
- - {{posix}}sed # [win]
- - {{posix}}coreutils # [win]
+ - {{ posix }}sed # [win]
+ - {{ posix }}coreutils # [win]
host:
- - {{native}}gcc-libs # [win]
+ - {{ native }}gcc-libs # [win]
run:
- - {{native}}gcc-libs # [win]
+ - {{ native }}gcc-libs # [win]
- curl
test:
diff --git a/src/crypt.c b/src/crypt.c
index ffeab106..06313ac5 100644
--- a/src/crypt.c
+++ b/src/crypt.c
@@ -147,6 +147,5 @@ void crypt_pad (uint8_t *data, uint32_t data_len, uint32_t padding_len)
rom encryption_name (EncryptionType encryption_type)
{
- static rom names[NUM_ENCRYPTION_TYPES] = ENC_NAMES;
- return type_name (encryption_type, &names[encryption_type], ARRAY_LEN(names));
+ return type_name (encryption_type, &(rom[])ENC_NAMES[encryption_type], NUM_ENCRYPTION_TYPES);
}
diff --git a/src/dict_id_gen.h b/src/dict_id_gen.h
index 5dfe2377..11e78f98 100644
--- a/src/dict_id_gen.h
+++ b/src/dict_id_gen.h
@@ -668,6 +668,7 @@
#define _SAM_SAG ((uint64_t)4669715)
#define _SAM_SAALN ((uint64_t)336286794003)
#define _SAM_FQ_AUX ((uint64_t)379387465990)
+#define _SAM_FQ_AUX_OLD ((uint64_t)1516192525)
#define _OPTION_AM_i ((uint64_t)1765428545)
#define _OPTION_AS_i ((uint64_t)1765430081)
#define _OPTION_CC_Z ((uint64_t)1513767747)
@@ -1710,7 +1711,7 @@ typedef enum { VCF_CHROM, VCF_POS, VCF_MATE_POS, VCF_ID, VCF_REFALT, VCF_MATE_CH
[INFO_DPB] = { { _INFO_DPB }, TAG(DPB) }, \
}
-typedef enum { SAM_RNAME, SAM_QNAME, SAM_Q0NAME, SAM_Q1NAME, SAM_Q2NAME, SAM_Q3NAME, SAM_Q4NAME, SAM_Q5NAME, SAM_Q6NAME, SAM_Q7NAME, SAM_Q8NAME, SAM_Q9NAME, SAM_QANAME, SAM_QBNAME, SAM_QCNAME, SAM_QDNAME, SAM_QENAME, SAM_QmNAME, SAM_QNAME2, SAM_Q0NAME2, SAM_Q1NAME2, SAM_Q2NAME2, SAM_Q3NAME2, SAM_Q4NAME2, SAM_Q5NAME2, SAM_Q6NAME2, SAM_Q7NAME2, SAM_Q8NAME2, SAM_Q9NAME2, SAM_QANAME2, SAM_QBNAME2, SAM_QCNAME2, SAM_QDNAME2, SAM_QeNAME2, SAM_QmNAME2, FASTQ_EXTRA, SAM_AUX, SAM_SQBITMAP, SAM_NONREF, SAM_NONREF_X, SAM_GPOS, SAM_GPOS_DELTA, SAM_GPOS_R2, SAM_STRAND, SAM_STRAND_R2, SAM_SEQMIS_A, SAM_SEQMIS_C, SAM_SEQMIS_G, SAM_SEQMIS_T, SAM_SEQINS_A, SAM_SEQINS_C, SAM_SEQINS_G, SAM_SEQINS_T, SAM_QUAL, SAM_DOMQRUNS, SAM_QUALMPLX, SAM_DIVRQUAL, SAM_CQUAL, SAM_CDOMQRUNS, SAM_CQUALMPLX, SAM_CDIVRQUAL, SAM_TOPLEVEL, SAM_BUDDY, SAM_TAXID, SAM_DEBUG_LINES, FASTQ_DEEP, FASTQ_DEEP_DELTA, FASTQ_E1L, FASTQ_E2L, FASTQ_LINE3, FASTQ_T0HIRD, FASTQ_T1HIRD, FASTQ_T2HIRD, FASTQ_T3HIRD, FASTQ_T4HIRD, FASTQ_T5HIRD, FASTQ_T6HIRD, FASTQ_T7HIRD, FASTQ_T8HIRD, FASTQ_T9HIRD, FASTQ_TAHIRD, FASTQ_TBHIRD, FASTQ_TCHIRD, FASTQ_TDHIRD, FASTQ_TEHIRD, FASTQ_TmHIRD, FASTQ_AUX_LENGTH, SAM_FLAG, SAM_FLAG0, SAM_FLAG1, SAM_POS, SAM_MAPQ, SAM_CIGAR, SAM_RNEXT, SAM_PNEXT, SAM_TLEN, SAM_QNAMESA, SAM_QUALSA, SAM_QUAL_FLANK, SAM_QUAL_FLANK_DOMQRUNS, SAM_QUAL_FLANK_QUALMPLX, SAM_QUAL_FLANK_DIVRQUAL, SAM_QUAL_PACBIO_DIFF, SAM_EOL, SAM_BAM_BIN, SAM_TOP2BAM, SAM_TOP2NONE, SAM_SAG, SAM_SAALN, SAM_FQ_AUX, OPTION_AM_i, OPTION_AS_i, OPTION_CC_Z, OPTION_CP_i, OPTION_CM_i, OPTION_FI_i, OPTION_H0_i, OPTION_H1_i, OPTION_H2_i, OPTION_MC_Z, OPTION_MC0_Z, OPTION_MD_Z, OPTION_MQ_i, OPTION_NH_i, OPTION_IH_i, OPTION_HI_i, OPTION_NM_i, OPTION_PQ_i, OPTION_SM_i, OPTION_TC_i, OPTION_UQ_i, OPTION_BQ_Z, OPTION_ML_B_C, OPTION_MM_Z, OPTION_U2_Z, OPTION_U2_DOMQRUNS, OPTION_U2_QUALMPLX, OPTION_U2_DIVRQUAL, OPTION_E2_Z, OPTION_2NONREF, OPTION_N2ONREFX, OPTION_2GPOS, OPTION_S2TRAND, OPTION_SA_Z, OPTION_SA_RNAME, OPTION_SA_STRAND, OPTION_SA_POS, OPTION_SA_CIGAR, OPTION_SA_NM, OPTION_SA_MAPQ, OPTION_SA_MAIN, OPTION_PG_Z, OPTION_PU_Z, OPTION_RG_Z, OPTION_LB_Z, OPTION_BC_Z, OPTION_BC_ARR, OPTION_QT_Z, OPTION_QT_ARR, OPTION_QT_DOMQRUNS, OPTION_QT_QUALMPLX, OPTION_QT_DIVRQUAL, OPTION_YB_Z, OPTION_CR_Z, OPTION_CR_Z_X, OPTION_CB_Z, OPTION_CB_ARR, OPTION_CB_SUFFIX, OPTION_CY_Z, OPTION_CY_ARR, OPTION_CY_DOMQRUNS, OPTION_CY_QUALMPLX, OPTION_CY_DIVRQUAL, OPTION_BZ_Z, OPTION_BZ_ARR, OPTION_BZ_DOMQRUNS, OPTION_BZ_QUALMPLX, OPTION_BZ_DIVRQUAL, OPTION_OX_Z, OPTION_MI_Z, OPTION_OQ_Z, OPTION_OQ_DOMQRUNS, OPTION_OQ_QUALMPLX, OPTION_OQ_DIVRQUAL, OPTION_OA_Z, OPTION_OA_RNAME, OPTION_OA_STRAND, OPTION_OA_POS, OPTION_OA_CIGAR, OPTION_OA_NM, OPTION_OA_MAPQ, OPTION_OC_Z, OPTION_OP_i, OPTION_X0_i, OPTION_X1_i, OPTION_XC_i, OPTION_XN_i, OPTION_XM_i, OPTION_XO_i, OPTION_XG_i, OPTION_XT_A, OPTION_XS_i, OPTION_XE_i, OPTION_XF_i, OPTION_XA_Z, OPTION_XA_LOOKBACK, OPTION_XA_RNAME, OPTION_XA_STRAND, OPTION_XA_POS, OPTION_XA_CIGAR, OPTION_XA_NM, OPTION_XA_STRAND_POS, OPTION_TS_A, OPTION_YF_Z, OPTION_YS_i, OPTION_YT_Z, OPTION_XA_i, OPTION_ZS_i, OPTION_Zs_Z, OPTION_Zs_POS, OPTION_Zs_TYPE, OPTION_Zs_RS, OPTION_ZA_i, OPTION_ZB_i, OPTION_ZC_B_i, OPTION_ZF_i, OPTION_ZG_i, OPTION_ZM_B_s, OPTION_ZP_B_f, OPTION_ZT_Z, OPTION_ZE_Z, OPTION_YE_Z, OPTION_ZK_Z, OPTION_YK_Z, OPTION_nM_i, OPTION_jM_B_c, OPTION_jI_B_i, OPTION_rB_B_i, OPTION_XS_A, OPTION_uT_A, OPTION_vA_i, OPTION_vG_Z, OPTION_vW_i, OPTION_UR_Z, OPTION_UB_Z, OPTION_UY_Z, OPTION_GN_Z, OPTION_GX_Z, OPTION_gn_Z, OPTION_gx_Z, OPTION_sS_Z, OPTION_sQ_Z, 
OPTION_sM_Z, OPTION_TX_Z, OPTION_TX_LOOKBACK, OPTION_TX_NEGATIVE, OPTION_TX_GENE, OPTION_TX_STRAND, OPTION_TX_POS, OPTION_TX_CIGAR, OPTION_TX_SAM_POS, OPTION_AN_Z, OPTION_AN_LOOKBACK, OPTION_AN_NEGATIVE, OPTION_AN_GENE, OPTION_AN_STRAND, OPTION_AN_POS, OPTION_AN_CIGAR, OPTION_AN_SAM_POS, OPTION_GR_Z, OPTION_GR_Z_X, OPTION_GY_Z, OPTION_GY_Z_X, OPTION_2R_Z, OPTION_2Y_Z, OPTION_2Y_DOMQRUNS, OPTION_2Y_QUALMPLX, OPTION_2Y_DIVRQUAL, OPTION_fb_Z, OPTION_fr_Z, OPTION_fq_Z, OPTION_fx_Z, OPTION_GP_i, OPTION_MP_i, OPTION_RX_Z, OPTION_RX_Z_X, OPTION_BX_Z, OPTION_QX_Z, OPTION_QX_DOMQRUNS, OPTION_QX_QUALMPLX, OPTION_QX_DIVRQUAL, OPTION_TR_Z, OPTION_TQ_Z, OPTION_TQ_DOMQRUNS, OPTION_TQ_QUALMPLX, OPTION_TQ_DIVRQUAL, OPTION_PC_i, OPTION_PS_i, OPTION_HP_i, OPTION_MI_i, OPTION_AM_A, OPTION_XM_A, OPTION_DM_Z, OPTION_XL_i, OPTION_XQ_i, OPTION_XT_i, OPTION_cx_i, OPTION_qs_i, OPTION_qe_i, OPTION_ws_i, OPTION_we_i, OPTION_zm_i, OPTION_np_i, OPTION_ec_f, OPTION_rq_f, OPTION_sn_B_f, OPTION_dt_Z, OPTION_st_Z, OPTION_mq_Z, OPTION_dq_Z, OPTION_iq_Z, OPTION_sq_Z, OPTION_iq_sq_dq, OPTION_ip_B_C, OPTION_ip_ARR, OPTION_pw_B_C, OPTION_fi_B_C, OPTION_ri_B_C, OPTION_fp_B_C, OPTION_rp_B_C, OPTION_fn_i, OPTION_rn_i, OPTION_sz_A, OPTION_sc_A, OPTION_ls_B_C, OPTION_ac_B_i, OPTION_ma_i, OPTION_bc_B_S, OPTION_bq_i, OPTION_bl_Z, OPTION_bt_Z, OPTION_ql_Z, OPTION_qt_Z, OPTION_bx_B_i, OPTION_ms_i, OPTION_mc_i, OPTION_cq_Z, OPTION_Z5_i, OPTION_Zq_i, OPTION_YH_Z, OPTION_YQ_Z, OPTION_XR_i, OPTION_QS_i, OPTION_QE_i, OPTION_XI_f, OPTION_CV_f, OPTION_XP_Z, OPTION_XO_Z, OPTION_XM_Z, OPTION_XG_Z, OPTION_XB_Z, OPTION_YS_Z, OPTION_XR_Z, OPTION_XB_A, OPTION_X4_Z, OPTION_X5_Z, OPTION_md_Z, OPTION_sd_f, OPTION_xq_i, OPTION_BD_Z, OPTION_BI_Z, OPTION_BD_BI, OPTION_XQ_Z, OPTION_tp_A, OPTION_cm_i, OPTION_s1_i, OPTION_s2_i, OPTION_nn_i, OPTION_ts_A, OPTION_cs_Z, OPTION_dv_f, OPTION_de_f, OPTION_rl_i, OPTION_tp_B_c, OPTION_tp_B_ARR, OPTION_bi_Z, OPTION_XV_Z, OPTION_XW_Z, OPTION_tm_Z, OPTION_t0_Z, OPTION_pr_i, OPTION_pt_i, OPTION_px_i, OPTION_py_i, OPTION_si_i, OPTION_a3_i, OPTION_tq_i, OPTION_tz_i, OPTION_DS_i, OPTION_ZX_i, OPTION_ZY_i, OPTION_ZW_f, OPTION_ZA_Z, OPTION_ZB_Z, OPTION_fi_Z, OPTION_XI_i, OPTION_XJ_i, OPTION_xc_i, OPTION_xm_i, OPTION_xd_i, OPTION_zd_Z, OPTION_zp_Z, OPTION_zn_Z, OPTION_ls_B_i, OPTION_YA_Z, OPTION_YO_Z, OPTION_YX_i, OPTION_YM_i, OPTION_XX_i, OPTION_XY_i, OPTION_YY_i, OPTION_rb_Z, OPTION_mb_Z, OPTION_tx_i, OPTION_CIGAR, SAM_E2_Z, SAM_U2_Z, NUM_SAM_FIELDS } SAMFields;
+typedef enum { SAM_RNAME, SAM_QNAME, SAM_Q0NAME, SAM_Q1NAME, SAM_Q2NAME, SAM_Q3NAME, SAM_Q4NAME, SAM_Q5NAME, SAM_Q6NAME, SAM_Q7NAME, SAM_Q8NAME, SAM_Q9NAME, SAM_QANAME, SAM_QBNAME, SAM_QCNAME, SAM_QDNAME, SAM_QENAME, SAM_QmNAME, SAM_QNAME2, SAM_Q0NAME2, SAM_Q1NAME2, SAM_Q2NAME2, SAM_Q3NAME2, SAM_Q4NAME2, SAM_Q5NAME2, SAM_Q6NAME2, SAM_Q7NAME2, SAM_Q8NAME2, SAM_Q9NAME2, SAM_QANAME2, SAM_QBNAME2, SAM_QCNAME2, SAM_QDNAME2, SAM_QeNAME2, SAM_QmNAME2, FASTQ_EXTRA, SAM_AUX, SAM_SQBITMAP, SAM_NONREF, SAM_NONREF_X, SAM_GPOS, SAM_GPOS_DELTA, SAM_GPOS_R2, SAM_STRAND, SAM_STRAND_R2, SAM_SEQMIS_A, SAM_SEQMIS_C, SAM_SEQMIS_G, SAM_SEQMIS_T, SAM_SEQINS_A, SAM_SEQINS_C, SAM_SEQINS_G, SAM_SEQINS_T, SAM_QUAL, SAM_DOMQRUNS, SAM_QUALMPLX, SAM_DIVRQUAL, SAM_CQUAL, SAM_CDOMQRUNS, SAM_CQUALMPLX, SAM_CDIVRQUAL, SAM_TOPLEVEL, SAM_BUDDY, SAM_TAXID, SAM_DEBUG_LINES, FASTQ_DEEP, FASTQ_DEEP_DELTA, FASTQ_E1L, FASTQ_E2L, FASTQ_LINE3, FASTQ_T0HIRD, FASTQ_T1HIRD, FASTQ_T2HIRD, FASTQ_T3HIRD, FASTQ_T4HIRD, FASTQ_T5HIRD, FASTQ_T6HIRD, FASTQ_T7HIRD, FASTQ_T8HIRD, FASTQ_T9HIRD, FASTQ_TAHIRD, FASTQ_TBHIRD, FASTQ_TCHIRD, FASTQ_TDHIRD, FASTQ_TEHIRD, FASTQ_TmHIRD, FASTQ_AUX_LENGTH, SAM_FLAG, SAM_FLAG0, SAM_FLAG1, SAM_POS, SAM_MAPQ, SAM_CIGAR, SAM_RNEXT, SAM_PNEXT, SAM_TLEN, SAM_QNAMESA, SAM_QUALSA, SAM_QUAL_FLANK, SAM_QUAL_FLANK_DOMQRUNS, SAM_QUAL_FLANK_QUALMPLX, SAM_QUAL_FLANK_DIVRQUAL, SAM_QUAL_PACBIO_DIFF, SAM_EOL, SAM_BAM_BIN, SAM_TOP2BAM, SAM_TOP2NONE, SAM_SAG, SAM_SAALN, SAM_FQ_AUX, SAM_FQ_AUX_OLD, OPTION_AM_i, OPTION_AS_i, OPTION_CC_Z, OPTION_CP_i, OPTION_CM_i, OPTION_FI_i, OPTION_H0_i, OPTION_H1_i, OPTION_H2_i, OPTION_MC_Z, OPTION_MC0_Z, OPTION_MD_Z, OPTION_MQ_i, OPTION_NH_i, OPTION_IH_i, OPTION_HI_i, OPTION_NM_i, OPTION_PQ_i, OPTION_SM_i, OPTION_TC_i, OPTION_UQ_i, OPTION_BQ_Z, OPTION_ML_B_C, OPTION_MM_Z, OPTION_U2_Z, OPTION_U2_DOMQRUNS, OPTION_U2_QUALMPLX, OPTION_U2_DIVRQUAL, OPTION_E2_Z, OPTION_2NONREF, OPTION_N2ONREFX, OPTION_2GPOS, OPTION_S2TRAND, OPTION_SA_Z, OPTION_SA_RNAME, OPTION_SA_STRAND, OPTION_SA_POS, OPTION_SA_CIGAR, OPTION_SA_NM, OPTION_SA_MAPQ, OPTION_SA_MAIN, OPTION_PG_Z, OPTION_PU_Z, OPTION_RG_Z, OPTION_LB_Z, OPTION_BC_Z, OPTION_BC_ARR, OPTION_QT_Z, OPTION_QT_ARR, OPTION_QT_DOMQRUNS, OPTION_QT_QUALMPLX, OPTION_QT_DIVRQUAL, OPTION_YB_Z, OPTION_CR_Z, OPTION_CR_Z_X, OPTION_CB_Z, OPTION_CB_ARR, OPTION_CB_SUFFIX, OPTION_CY_Z, OPTION_CY_ARR, OPTION_CY_DOMQRUNS, OPTION_CY_QUALMPLX, OPTION_CY_DIVRQUAL, OPTION_BZ_Z, OPTION_BZ_ARR, OPTION_BZ_DOMQRUNS, OPTION_BZ_QUALMPLX, OPTION_BZ_DIVRQUAL, OPTION_OX_Z, OPTION_MI_Z, OPTION_OQ_Z, OPTION_OQ_DOMQRUNS, OPTION_OQ_QUALMPLX, OPTION_OQ_DIVRQUAL, OPTION_OA_Z, OPTION_OA_RNAME, OPTION_OA_STRAND, OPTION_OA_POS, OPTION_OA_CIGAR, OPTION_OA_NM, OPTION_OA_MAPQ, OPTION_OC_Z, OPTION_OP_i, OPTION_X0_i, OPTION_X1_i, OPTION_XC_i, OPTION_XN_i, OPTION_XM_i, OPTION_XO_i, OPTION_XG_i, OPTION_XT_A, OPTION_XS_i, OPTION_XE_i, OPTION_XF_i, OPTION_XA_Z, OPTION_XA_LOOKBACK, OPTION_XA_RNAME, OPTION_XA_STRAND, OPTION_XA_POS, OPTION_XA_CIGAR, OPTION_XA_NM, OPTION_XA_STRAND_POS, OPTION_TS_A, OPTION_YF_Z, OPTION_YS_i, OPTION_YT_Z, OPTION_XA_i, OPTION_ZS_i, OPTION_Zs_Z, OPTION_Zs_POS, OPTION_Zs_TYPE, OPTION_Zs_RS, OPTION_ZA_i, OPTION_ZB_i, OPTION_ZC_B_i, OPTION_ZF_i, OPTION_ZG_i, OPTION_ZM_B_s, OPTION_ZP_B_f, OPTION_ZT_Z, OPTION_ZE_Z, OPTION_YE_Z, OPTION_ZK_Z, OPTION_YK_Z, OPTION_nM_i, OPTION_jM_B_c, OPTION_jI_B_i, OPTION_rB_B_i, OPTION_XS_A, OPTION_uT_A, OPTION_vA_i, OPTION_vG_Z, OPTION_vW_i, OPTION_UR_Z, OPTION_UB_Z, OPTION_UY_Z, OPTION_GN_Z, OPTION_GX_Z, OPTION_gn_Z, OPTION_gx_Z, OPTION_sS_Z, 
OPTION_sQ_Z, OPTION_sM_Z, OPTION_TX_Z, OPTION_TX_LOOKBACK, OPTION_TX_NEGATIVE, OPTION_TX_GENE, OPTION_TX_STRAND, OPTION_TX_POS, OPTION_TX_CIGAR, OPTION_TX_SAM_POS, OPTION_AN_Z, OPTION_AN_LOOKBACK, OPTION_AN_NEGATIVE, OPTION_AN_GENE, OPTION_AN_STRAND, OPTION_AN_POS, OPTION_AN_CIGAR, OPTION_AN_SAM_POS, OPTION_GR_Z, OPTION_GR_Z_X, OPTION_GY_Z, OPTION_GY_Z_X, OPTION_2R_Z, OPTION_2Y_Z, OPTION_2Y_DOMQRUNS, OPTION_2Y_QUALMPLX, OPTION_2Y_DIVRQUAL, OPTION_fb_Z, OPTION_fr_Z, OPTION_fq_Z, OPTION_fx_Z, OPTION_GP_i, OPTION_MP_i, OPTION_RX_Z, OPTION_RX_Z_X, OPTION_BX_Z, OPTION_QX_Z, OPTION_QX_DOMQRUNS, OPTION_QX_QUALMPLX, OPTION_QX_DIVRQUAL, OPTION_TR_Z, OPTION_TQ_Z, OPTION_TQ_DOMQRUNS, OPTION_TQ_QUALMPLX, OPTION_TQ_DIVRQUAL, OPTION_PC_i, OPTION_PS_i, OPTION_HP_i, OPTION_MI_i, OPTION_AM_A, OPTION_XM_A, OPTION_DM_Z, OPTION_XL_i, OPTION_XQ_i, OPTION_XT_i, OPTION_cx_i, OPTION_qs_i, OPTION_qe_i, OPTION_ws_i, OPTION_we_i, OPTION_zm_i, OPTION_np_i, OPTION_ec_f, OPTION_rq_f, OPTION_sn_B_f, OPTION_dt_Z, OPTION_st_Z, OPTION_mq_Z, OPTION_dq_Z, OPTION_iq_Z, OPTION_sq_Z, OPTION_iq_sq_dq, OPTION_ip_B_C, OPTION_ip_ARR, OPTION_pw_B_C, OPTION_fi_B_C, OPTION_ri_B_C, OPTION_fp_B_C, OPTION_rp_B_C, OPTION_fn_i, OPTION_rn_i, OPTION_sz_A, OPTION_sc_A, OPTION_ls_B_C, OPTION_ac_B_i, OPTION_ma_i, OPTION_bc_B_S, OPTION_bq_i, OPTION_bl_Z, OPTION_bt_Z, OPTION_ql_Z, OPTION_qt_Z, OPTION_bx_B_i, OPTION_ms_i, OPTION_mc_i, OPTION_cq_Z, OPTION_Z5_i, OPTION_Zq_i, OPTION_YH_Z, OPTION_YQ_Z, OPTION_XR_i, OPTION_QS_i, OPTION_QE_i, OPTION_XI_f, OPTION_CV_f, OPTION_XP_Z, OPTION_XO_Z, OPTION_XM_Z, OPTION_XG_Z, OPTION_XB_Z, OPTION_YS_Z, OPTION_XR_Z, OPTION_XB_A, OPTION_X4_Z, OPTION_X5_Z, OPTION_md_Z, OPTION_sd_f, OPTION_xq_i, OPTION_BD_Z, OPTION_BI_Z, OPTION_BD_BI, OPTION_XQ_Z, OPTION_tp_A, OPTION_cm_i, OPTION_s1_i, OPTION_s2_i, OPTION_nn_i, OPTION_ts_A, OPTION_cs_Z, OPTION_dv_f, OPTION_de_f, OPTION_rl_i, OPTION_tp_B_c, OPTION_tp_B_ARR, OPTION_bi_Z, OPTION_XV_Z, OPTION_XW_Z, OPTION_tm_Z, OPTION_t0_Z, OPTION_pr_i, OPTION_pt_i, OPTION_px_i, OPTION_py_i, OPTION_si_i, OPTION_a3_i, OPTION_tq_i, OPTION_tz_i, OPTION_DS_i, OPTION_ZX_i, OPTION_ZY_i, OPTION_ZW_f, OPTION_ZA_Z, OPTION_ZB_Z, OPTION_fi_Z, OPTION_XI_i, OPTION_XJ_i, OPTION_xc_i, OPTION_xm_i, OPTION_xd_i, OPTION_zd_Z, OPTION_zp_Z, OPTION_zn_Z, OPTION_ls_B_i, OPTION_YA_Z, OPTION_YO_Z, OPTION_YX_i, OPTION_YM_i, OPTION_XX_i, OPTION_XY_i, OPTION_YY_i, OPTION_rb_Z, OPTION_mb_Z, OPTION_tx_i, OPTION_CIGAR, SAM_E2_Z, SAM_U2_Z, NUM_SAM_FIELDS } SAMFields;
#define SAM_PREDEFINED { \
[SAM_RNAME] = { { _SAM_RNAME }, TAG(RNAME) }, \
@@ -1823,6 +1824,7 @@ typedef enum { SAM_RNAME, SAM_QNAME, SAM_Q0NAME, SAM_Q1NAME, SAM_Q2NAME, SAM_Q3N
[SAM_SAG] = { { _SAM_SAG }, TAG(SAG) }, \
[SAM_SAALN] = { { _SAM_SAALN }, TAG(SAALN) }, \
[SAM_FQ_AUX] = { { _SAM_FQ_AUX }, TAG(FQAUX) }, \
+ [SAM_FQ_AUX_OLD] = { { _SAM_FQ_AUX_OLD }, TAG(MC_Z) }, \
[OPTION_AM_i] = { { _OPTION_AM_i }, TAG(AM:i) }, \
[OPTION_AS_i] = { { _OPTION_AS_i }, TAG(AS:i) }, \
[OPTION_CC_Z] = { { _OPTION_CC_Z }, TAG(CC:Z) }, \
diff --git a/src/dispatcher.c b/src/dispatcher.c
index b192f344..5e54fbee 100644
--- a/src/dispatcher.c
+++ b/src/dispatcher.c
@@ -13,6 +13,7 @@
#include "threads.h"
#include "segconf.h"
#include "arch.h"
+#include "zip.h"
#define RR(x) ((x) % d->max_threads)
@@ -67,8 +68,7 @@ void dispatcher_increment_progress (rom where, int64_t increment)
// update target
if (IS_ZIP && !txt_file->est_num_lines)
- d->target_progress = (3 + segconf.zip_txt_modified) // read, (modify), seg, compress
- * txtfile_get_seggable_size();
+ d->target_progress = zip_get_target_progress();
// in unbind mode - dispatcher is not done if there's another component after this one
bool done = dispatcher_is_done (main_dispatcher);
@@ -388,7 +388,7 @@ bool dispatcher_is_input_exhausted (Dispatcher d)
Dispatcher dispatcher_fan_out_task (rom task_name,
rom filename, // NULL to continue with previous filename
- uint32_t target_progress, // used if progress_type=PROGRESS_PERCENT
+ uint64_t target_progress, // used if progress_type=PROGRESS_PERCENT
rom prog_msg, // implies progress_type=PROGRESS_MESSAGE
bool out_of_order,
bool test_mode,
diff --git a/src/dispatcher.h b/src/dispatcher.h
index 722cf096..702fa7c7 100644
--- a/src/dispatcher.h
+++ b/src/dispatcher.h
@@ -38,7 +38,7 @@ extern void dispatcher_set_no_data_available (Dispatcher dispatcher, bool abando
extern bool dispatcher_is_input_exhausted (Dispatcher dispatcher);
extern bool dispatcher_is_done (Dispatcher dispatcher);
extern void dispatcher_set_task_name (Dispatcher dispatcher, rom task_name);
-extern Dispatcher dispatcher_fan_out_task (rom task_name, rom filename, uint32_t target_progress, rom prog_msg, bool out_of_order, bool test_mode, bool force_single_thread, uint32_t previous_vb_i, uint32_t idle_sleep_microsec, bool free_when_done, DispatcherFunc prepare, DispatcherFunc compute, DispatcherFunc output);
+extern Dispatcher dispatcher_fan_out_task (rom task_name, rom filename, uint64_t target_progress, rom prog_msg, bool out_of_order, bool test_mode, bool force_single_thread, uint32_t previous_vb_i, uint32_t idle_sleep_microsec, bool free_when_done, DispatcherFunc prepare, DispatcherFunc compute, DispatcherFunc output);
#define PROGRESS_UNIT (txt_file->est_num_lines ? vb->lines.len : vb->txt_size) // ZIP
extern void dispatcher_increment_progress (rom where, int64_t increment);
extern void dispatcher_calc_avg_compute_vbs (Dispatcher d);
diff --git a/src/fasta.c b/src/fasta.c
index cdd6d15a..6049f002 100644
--- a/src/fasta.c
+++ b/src/fasta.c
@@ -268,7 +268,7 @@ static bool fasta_segconf_is_qualless_fastq (VBlockP vb)
#define NUM_FASTQ_TEST_LINES 2000
str_split_by_lines (vb->txt_data.data, vb->txt_data.len32, NUM_FASTQ_TEST_LINES);
- if (n_lines < 16 && !txt_file->is_eof)
+ if (n_lines < 16 && !txt_file->no_more_blocks)
return false; // not enough lines to determine (except if segconf is the entire file)
n_lines = ROUNDDOWN2 (n_lines) - 2; // keep whole pairs or lines, and drop last pair that might be truncated. now there are at least 4 pairs (8 lines)
diff --git a/src/fastq.c b/src/fastq.c
index 9458a8e5..47ffad28 100644
--- a/src/fastq.c
+++ b/src/fastq.c
@@ -134,8 +134,8 @@ int32_t fastq_unconsumed (VBlockP vb,
{
ASSERT (*i_out >= 0 && *i_out < Ltxt, "*i=%d is ∉ [0,%u]", *i_out, Ltxt);
- rom nl[16]={}; // newline pointers: nl[0] is the first from the end
- uint32_t l[16]={}; // lengths of segments excluding \n and \r: l[1] is the segment that starts at nl[1]+1 until nl[0]-1 (or nl[0]-2 if there is a \r). l[0] is not used.
+ rom nl[17]={}; // newline pointers: nl[0] is the first from the end. 17 entries, so the loop below can store a pointer at index max_lines
+ uint32_t l[17]={}; // lengths of segments excluding \n and \r: l[1] is the segment that starts at nl[1]+1 and runs until nl[0]-1 (or nl[0]-2 if there is a \r). l[0] is not used.
// search backwards for a sufficient number of newlines (eg. for normal FASTQ: best case: \nD\nS\nT\nQ\n ; worst case: \nD1\nS1\nT1\nQ1\nD2\nS2\nT2\nq2 (q2 is partial Q2))
int n=0;
@@ -143,7 +143,7 @@ int32_t fastq_unconsumed (VBlockP vb,
int min_lines = height * (segconf.is_interleaved ? 2 : 1); // minimum lines needed for testing
int max_lines = min_lines * 2; // maximum lines needed for testing
- for (rom c=Btxt (*i_out), first_c=Btxt (first_i) ; c >= first_c-1/*one beyond*/ && n < max_lines; c--)
+ for (rom c=Btxt (*i_out), first_c=Btxt (first_i) ; c >= first_c-1/*one beyond*/ && n <= max_lines; c--)
if (c == (first_c-1) || *c == '\n') { // we consider character before the start to also be a "virtual newline"
nl[n] = c;
if (n) l[n] = ((nl[n-1]) - (nl[n-1][-1] == '\r')) - (nl[n] + 1);
diff --git a/src/fastq_deep.c b/src/fastq_deep.c
index 99c58060..a02ca0a8 100644
--- a/src/fastq_deep.c
+++ b/src/fastq_deep.c
@@ -56,14 +56,12 @@ void fastq_deep_seg_initialize (VBlockFASTQP vb)
void fastq_deep_zip_finalize (void)
{
if (flag.show_deep) {
- static rom names[] = NO_DEEP_NAMES;
-
uint64_t total = z_file->deep_stats[NDP_FQ_READS];
iprint0 ("\nFASTQ reads breakdown by deepability:\n");
for (int i=0; i < NUM_DEEP_STATS; i++)
if (z_file->deep_stats[i])
- iprintf ("%-11.11s: %"PRIu64" (%.1f%%)\n", names[i], z_file->deep_stats[i], 100.0 * (double)z_file->deep_stats[i] / (double)total);
+ iprintf ("%-11.11s: %"PRIu64" (%.1f%%)\n", (rom[])NO_DEEP_NAMES[i], z_file->deep_stats[i], 100.0 * (double)z_file->deep_stats[i] / (double)total);
}
ARRAY (ZipZDeep, deep_ents, z_file->deep_ents);
diff --git a/src/fastq_seq.c b/src/fastq_seq.c
index ccbfc1a5..4d068284 100644
--- a/src/fastq_seq.c
+++ b/src/fastq_seq.c
@@ -265,7 +265,7 @@ static void fastq_update_coverage_aligned (VBlockFASTQP vb)
ASSPIZ0 (gpos != NO_GPOS, "expecting a GPOS, because sequence is aligned");
- WordIndex ref_index = ref_contig_get_by_gpos (gref, gpos, 0, NULL);
+ WordIndex ref_index = ref_contig_get_by_gpos (gref, gpos, 0, NULL, true); // if gpos falls in a gap between two contigs, the bulk of the seq is on the next contig while its beginning is in the gap
ASSPIZ0 (ref_index != WORD_INDEX_NONE, "expecting ref_index, because sequence is aligned");
if (flag.show_coverage)
diff --git a/src/file.c b/src/file.c
index 7d16e43c..8d802402 100644
--- a/src/file.c
+++ b/src/file.c
@@ -1,1518 +1,1410 @@
-// ------------------------------------------------------------------
-// file.c
-// Copyright (C) 2019-2024 Genozip Limited. Patent Pending.
-// Please see terms and conditions in the file LICENSE.txt
-//
-// WARNING: Genozip is proprietary, not open source software. Modifying the source code is strictly prohibited,
-// under penalties specified in the license.
-
-#include
-#include
-#include
-#ifdef _WIN32
-#include
-#include
-#endif
-#define Z_LARGE64
-#ifdef __APPLE__
- #define off64_t __int64_t
-#endif
-#include "bzlib/bzlib.h"
-#include "zlib/zlib.h"
-#include "file.h"
-#include "url.h"
-#include "codec.h"
-#include "bgzf.h"
-#include "progress.h"
-#include "tar.h"
-#include "writer.h"
-#include "filename.h"
-#include "huffman.h"
-
-// globals
-FileP z_file = NULL;
-FileP txt_file = NULL;
-
-static StreamP input_decompressor = NULL; // bcftools, xz, unzip, samtools or orad - only one at a time
-static StreamP output_compressor = NULL; // samtools (for cram), bcftools
-
-// global pointers - so the can be compared eg "if (mode == READ)"
-rom READ = "rb"; // use binary mode (b) in read and write so Windows doesn't add \r. "b" is accepted but ignored in Linux and MacOS.
-rom WRITE = "wb";
-rom WRITEREAD = "wb+"; // only supported for z_file and gencomp disk files
-
-rom file_exts[] = FILE_EXTS;
-
-static const struct { FileType in; Codec codec; FileType out; } txt_in_ft_by_dt[NUM_DATATYPES][50] = TXT_IN_FT_BY_DT;
-static const FileType txt_out_ft_by_dt[NUM_DATATYPES][20] = TXT_OUT_FT_BY_DT;
-static const FileType z_ft_by_dt[NUM_DATATYPES][20] = Z_FT_BY_DT;
-
-// get data type by file type
-DataType file_get_data_type_of_input_file (FileType ft)
-{
- // note: if make-reference, we scan the array from dt=0 (DT_REF), otherwise we ignore DT_REF
- for (DataType dt=!flag.make_reference; dt < NUM_DATATYPES; dt++)
- for (unsigned i=0; txt_in_ft_by_dt[dt][i].in; i++)
- if (txt_in_ft_by_dt[dt][i].in == ft)
- return dt;
-
- return (ft == UNKNOWN_FILE_TYPE) ? DT_GNRIC : DT_NONE;
-}
-
-DataType file_piz_get_dt_of_out_filename (void)
-{
- if (!flag.out_filename) return DT_NONE;
-
- FileType ft = file_get_type (flag.out_filename);
-
- for (DataType dt=1/*skip DT_REF*/; dt < NUM_DATATYPES; dt++)
- for (unsigned i=0; txt_out_ft_by_dt[dt][i]; i++)
- if (txt_out_ft_by_dt[dt][i] == ft)
- return dt;
-
- return DT_NONE;
-}
-
-// get genozip file type by txt file type
-FileType file_get_z_ft_by_txt_in_ft (DataType dt, FileType txt_ft)
-{
- for (unsigned i=0; txt_in_ft_by_dt[dt][i].in; i++)
- if (txt_in_ft_by_dt[dt][i].in == txt_ft) return txt_in_ft_by_dt[dt][i].out;
-
- return UNKNOWN_FILE_TYPE;
-}
-
-// get codec by txt file type
-Codec file_get_codec_by_txt_ft (DataType dt, FileType txt_ft, bool source)
-{
- if (source && txt_ft == BAM) return CODEC_BAM; // if !source, it would be CODEC_BGZF
- if (txt_ft == BCF || txt_ft == BCF_GZ || txt_ft == BCF_BGZF) return CODEC_BCF;
-
- for (unsigned i=0; txt_in_ft_by_dt[dt][i].in; i++)
- if (txt_in_ft_by_dt[dt][i].in == txt_ft)
- return txt_in_ft_by_dt[dt][i].codec;
-
- return CODEC_NONE;
-}
-
-// get codec by txt file type
-FileType file_get_txt_ft_by_codec (DataType dt, Codec codec)
-{
- for (unsigned i=0; txt_in_ft_by_dt[dt][i].in; i++)
- if (txt_in_ft_by_dt[dt][i].codec == codec)
- return txt_in_ft_by_dt[dt][i].in;
-
- return UNKNOWN_FILE_TYPE;
-}
-
-// get data_type of e.g. "myfile.fastq.genozip"
-// note: DT_SAM is returned for SAM/BAM/CRAM, and DT_VCF for VCF/BCF
-DataType file_get_dt_by_z_filename (rom z_filename)
-{
- FileType z_ft = file_get_type (z_filename);
-
- for (DataType dt=0; dt < NUM_DATATYPES; dt++)
- for (unsigned i=0; z_ft_by_dt[dt][i]; i++)
- if (z_ft == z_ft_by_dt[dt][i])
- return dt;
-
- return DT_NONE;
-}
-
-// returns default file type of the .genozip file of the given dt (e.g. DT_FASTA -> .fasta.genozip)
-FileType file_get_default_z_ft_of_data_type (DataType dt)
-{
- return z_ft_by_dt[dt][0];
-}
-
-// possible arguments for --input
-StrTextLong file_compressible_extensions (bool plain_only)
-{
- StrTextLong s;
- int s_len = 0;
-
- for (DataType dt=1; dt < NUM_DATATYPES; dt++) { // start from 1, excluding DT_REFERENCE
-
- if (dt == DT_GNRIC || dt == DT_ME23 || !txt_in_ft_by_dt[dt][0].in) continue;
-
- if (plain_only)
- for (unsigned i=0; txt_in_ft_by_dt[dt][i].in; i++) {
- Codec codec = txt_in_ft_by_dt[dt][i].codec;
- if (codec != CODEC_BGZF && codec != CODEC_GZ && codec != CODEC_BZ2 && codec != CODEC_XZ && codec != CODEC_ZIP && codec != CODEC_ORA)
- SNPRINTF (s, "%s ", &file_exts[txt_in_ft_by_dt[dt][i].in][1]);
- }
-
- else {
- SNPRINTF (s, "\n%-8s: ", dt_name (dt));
-
- for (unsigned i=0; txt_in_ft_by_dt[dt][i].in; i++)
- SNPRINTF (s, "%s ", &file_exts[txt_in_ft_by_dt[dt][i].in][1]);
- }
- }
-
- if (plain_only)
- SNPRINTF0 (s, "23andme generic ");
- else
- SNPRINTF0 (s, "\n23andMe : 23andme 23andme.zip"
- "\nOther : generic");
- return s;
-}
-
-FileType file_get_type (rom filename)
-{
- if (!filename) return UNKNOWN_FILE_TYPE;
-
- // 23andme files have the format "genome_Firstname_Lastname_optionalversion_timestamp.txt" or .zip
- if (strstr (filename, "genome") && strstr (filename, "Full")) {
- if (filename_has_ext (filename, ".txt")) return ME23;
- if (filename_has_ext (filename, ".zip")) return ME23_ZIP;
- if (filename_has_ext (filename, ".txt.genozip")) return ME23_GENOZIP;
- }
-
- for (FileType ft=UNKNOWN_FILE_TYPE+1; ft < AFTER_LAST_FILE_TYPE; ft++) {
-
- // files that end with .txt/.txt.genozip/.zip are not classified as ME23, we already handled ME23 above
- if (ft == ME23 || ft == ME23_ZIP || ft == ME23_GENOZIP) continue;
-
- if (filename_has_ext (filename, file_exts[ft]))
- return ft;
- }
-
- return UNKNOWN_FILE_TYPE; // this should never happen, because GNRIC_ is "" so it will catch everything
-}
-
-static FileType file_get_type_of_generic (rom filename)
-{
- if (!filename) return UNKNOWN_FILE_TYPE;
-
- if (filename_has_ext (filename, ".gz")) return GNRIC_GZ;
- if (filename_has_ext (filename, ".bz2")) return GNRIC_BZ2;
- if (filename_has_ext (filename, ".xz")) return GNRIC_XZ;
- if (filename_has_ext (filename, ".zip")) return GNRIC_ZIP;
- else return GNRIC;
-}
-
-static FileType file_get_type_force_dt (rom filename, DataType dt)
-{
- FileType ft = file_get_type (filename);
-
- // if ft cannot be determined by filename (i.e. comes out as generic), get it by data type
- switch (ft) {
- case GNRIC_GZ : return file_get_txt_ft_by_codec (dt, CODEC_GZ);
- case GNRIC_BZ2 : return file_get_txt_ft_by_codec (dt, CODEC_BZ2);
- case GNRIC_XZ : return file_get_txt_ft_by_codec (dt, CODEC_XZ);
- case GNRIC_ZIP : return file_get_txt_ft_by_codec (dt, CODEC_ZIP);
- default : return ft;
- }
-}
-
-// returns the filename without the extension eg myfile.1.sam.gz -> myfile.1.
-// if raw_name is given, memory is allocated sufficiently to concatenate a extension. Otherwise, filename is overwritten
-uint32_t file_get_raw_name_and_type (rom filename, rom *raw_name, FileType *out_ft)
-{
- unsigned len = strlen (filename);
-
- if (raw_name) {
- *raw_name = MALLOC (len + 30);
- memcpy (*(char **)raw_name, filename, len);
- (*(char **)raw_name)[len] = 0;
- }
- else
- raw_name = &filename; // overwrite filename
-
- FileType ft = file_get_type (filename);
- if (ft != UNKNOWN_FILE_TYPE) {
- len -= strlen (file_exts[ft]);
- (*(char **)raw_name)[len] = 0;
- }
-
- if (out_ft) *out_ft = ft;
-
- return len;
-}
-
-static void file_ask_user_to_confirm_overwrite (rom filename)
-{
- if (!strcmp (filename, "/dev/null")) return; // don't ask for /dev/null
-
- fprintf (stderr, "%s: output file %s already exists: in the future, you may use --force to overwrite\n", global_cmd, filename);
-
- if (!isatty(0) || !isatty(2)) exit_on_error(false); // if we stdin or stderr is redirected - we cannot ask the user an interactive question
-
- if (!str_query_user_yn ("Do you wish to overwrite it now?", QDEF_NO)) {
- fprintf (stderr, "No worries, I'm stopping here - no damage done!\n");
- exit (EXIT_OK);
- }
-
- fprintf (stderr, "\n");
-}
-
-static void file_redirect_output_to_stream (FileP file, rom exec_name,
- rom format_option_0, rom format_option_1, rom format_option_2, rom format_option_3)
-{
- char threads_str[20];
- snprintf (threads_str, sizeof (threads_str), "%u", global_max_threads);
-
- FILE *redirected_stdout_file = NULL;
- if (!flag.to_stdout) {
- redirected_stdout_file = fopen (file->name, file->mode); // exec_name will redirect its output to this file
- ASSINP (redirected_stdout_file, "cannot open file \"%s\": %s", file->name, strerror(errno));
- }
-
- char reason[40];
- snprintf (reason, sizeof (reason), "To output a %s file", file_exts[file->type]);
- output_compressor = stream_create (0, 0, 0, DEFAULT_PIPE_SIZE,
- redirected_stdout_file, // output is redirected unless flag.to_stdout
- 0, false, reason,
- exec_name,
- format_option_0,
- "--threads", threads_str,
- format_option_1, format_option_2, format_option_3,
- NULL);
-
- file->file = stream_to_stream_stdin (output_compressor);
-}
-
-// starting samtools 1.10, a PG record is added to the SAM header every "samtools view", and an option, --no-PG,
-// is provided to avoid this. See: https://github.com/samtools/samtools/releases/
-// returns "--no-PG" if the option exists, or NULL if not
-static rom file_samtools_no_PG (void)
-{
- static rom ret_str[2] = { NULL, "--no-PG" };
- static int has_no_PG = -1; // unknown
-
- if (has_no_PG >= 0) return ret_str[has_no_PG]; // we already tested
-
- #define SAMTOOLS_HELP_MAX_LEN 20000
- char samtools_help_text[SAMTOOLS_HELP_MAX_LEN];
-
- #define MIN_ACCEPTABLE_LEN 100
- int len=0;
-
- for (unsigned i=1; i < 15 && len < MIN_ACCEPTABLE_LEN; i++) {
- // Tested on samtools 1.11: The normal way to see help is "samtools help view" however it fails if stdin is not the terminal.
- // Instead, we use samtools view --threads, invalidly without an argument. This *sometimes* shows the help, and sometimes
- // just shows one line "samtools view:". We overcome this by repeating if the response is not long enough.
- StreamP samtools = stream_create (0, DEFAULT_PIPE_SIZE, DEFAULT_PIPE_SIZE, 0, 0, 0, 0, "To read/write CRAM files",
- "samtools", "view", "--threads", NULL);
- usleep (50000 * i); // wait for samtools
-
- // read both stderr and stdout from samtools
- len = read (fileno (stream_from_stream_stderr (samtools)), samtools_help_text, SAMTOOLS_HELP_MAX_LEN-1);
- len += read (fileno (stream_from_stream_stdout (samtools)), &samtools_help_text[len], SAMTOOLS_HELP_MAX_LEN-len-1);
-
- stream_close (&samtools, STREAM_DONT_WAIT_FOR_PROCESS);
- }
- samtools_help_text[len] = '\0'; // terminate string (more portable, strnstr and memmem are non-standard)
-
- ASSERT0 (len >= MIN_ACCEPTABLE_LEN, "no response from \"samtools view --threads\"");
-
- return ret_str[(has_no_PG = !!strstr (samtools_help_text, "--no-PG"))];
-}
-
-// show meaningful error if file is not a supported type and return TRUE if it file should be skipped
-static bool file_open_txt_read_test_valid_dt (ConstFileP file)
-{
- if (file->data_type == DT_NONE) {
-
- if (flag.multiple_files || tar_zip_is_tar()) {
- if (filename_has_ext (file->name, ".genozip")) {
-
- // case: --tar - include .genozip files verbatim
- if (tar_zip_is_tar()) {
- tar_copy_file (file->name, file->name);
- RETURNW (false, true, "Copied %s to the tar file", file_printname(file));
- }
- else
- RETURNW (false, true, "Skipping %s - it is already compressed", file_printname(file));
-
- }
-
- RETURNW (false, true, "Skipping %s - genozip doesn't know how to compress this file type (use --input to tell it)",
- file_printname (file));
- }
- else {
- ASSINP (!filename_has_ext (file->name, ".genozip"),
- "cannot compress %s because it is already compressed", file_printname(file));
-
- ABORT0 ("Unexpectedly, data_type==DT_NONE"); // not expecting to ever reach here, bc if file is not recognized, it should have been set to GENERIC
- }
- }
-
- return false; // all good - no need to skip this file
-}
-
-static void file_set_filename (FileP file, rom fn)
-{
- // copy filename
- unsigned fn_size = strlen (fn) + 1; // inc. \0
- file->name = MALLOC (fn_size);
- memcpy (file->name, fn, fn_size);
-}
-
-static void file_initialize_txt_file_data (FileP file)
-{
- #define TXT_INIT(buf) ({ buf_set_promiscuous (&file->buf, "txt_file->" #buf); })
-
- if (IS_ZIP) {
- mutex_initialize (file->recon_plan_mutex);
-
- // initialize evb "promiscuous" buffers - i.e. buffers that can be allocated by any thread
- // promiscuous buffers must be initialized by the main thread, and buffer.c does not verify their integrity.
- TXT_INIT(line_info[0]);
- TXT_INIT(line_info[1]);
- TXT_INIT(vb_info[0]);
- TXT_INIT(vb_info[1]);
- }
- else {
-
- }
-}
-
-FileP file_open_txt_read (rom filename)
-{
- FileP file = (FileP)CALLOC (sizeof(File));
-
- file->supertype = TXT_FILE;
- file->redirected = !filename; // later on, also CRAM, XZ, BCF will be set as redirected
- file->mode = READ;
- file->is_remote = filename && url_is_url (filename);
- flag.from_url = file->is_remote;
-
- int64_t url_file_size = 0; // will be -1 if the web/ftp site does not provide the file size
- rom error = NULL;
-
- thool is_file_exists = unknown;
-
- if (file->is_remote) {
- error = url_get_status (filename, &is_file_exists, &url_file_size); // accessing is expensive - get existance and size in one call
- if (!error && url_file_size >= 0) file->disk_size = (uint64_t)url_file_size;
- }
-
- else if (!file->redirected) { // not stdin
- is_file_exists = file_exists (filename);
- error = strerror (errno);
-
- if (is_file_exists)
- file->disk_size = file_get_size (filename);
- }
-
- // return null if genozip input file size is known to be 0, so we can skip it. note: file size of url might be unknown
- if (is_file_exists == yes && !file->disk_size && !url_file_size &&
- !(!file->is_remote && !file->redirected && file_is_fifo (filename))) // a fifo is allowed is be size 0 (as it always is)
- goto fail;
-
- if (!file->redirected) {
- ASSINP (is_file_exists != no, "Failed to open \"%s\" for reading: %s", filename, error);
-
- file_set_filename (file, filename);
-
- file->basename = filename_base (file->name, false, "", NULL, 0);
-
- // if user provided the type with --input, we use that, otherwise derive from the file name
- file->type = flag.stdin_type == GNRIC ? file_get_type_of_generic (file->basename)
- : flag.stdin_type ? flag.stdin_type
- : file_get_type (file->basename);
- }
-
- else { // stdin
- file->basename = filename_base (NULL, false, FILENAME_STDIN, NULL, 0);
- file->type = flag.stdin_type;
- }
-
- file->data_type = file_get_data_type_of_input_file (file->type);
-
- // show meaningful error if file is not a supported data type
- if (file_open_txt_read_test_valid_dt (file)) goto fail; // skip this file
-
- // open the file, based on the codec (as guessed by file extension)
- file->codec = file_get_codec_by_txt_ft (file->data_type, file->type, false);
- file->source_codec = file_get_codec_by_txt_ft (file->data_type, file->type, true);
-
- switch (file->codec) {
- case CODEC_CRAM: {
- // note: in CRAM, we read the header in advance in possible, directly (without samtools), so we can handle the case
- // that the reference file is wrong. In samtools, if we read beyond the header with a wrong ref, samtools will hang.
- if (!file->is_remote && !file->redirected) {
- cram_inspect_file (file); // if file is indeed CRAM, updates file->est_num_lines, file->header_size, and if not, updates file->data_type and file->codec/source_codec
- if (file->codec == CODEC_GZ || file->codec == CODEC_NONE) goto gz; // actually, this is a GZ file (possibly BAM)
- }
-
- StrTextSuperLong samtools_T_option = cram_get_samtools_option_T (gref);
-
- input_decompressor = stream_create (0, DEFAULT_PIPE_SIZE, DEFAULT_PIPE_SIZE, 0, 0,
- file->is_remote ? file->name : NULL, // url
- file->redirected,
- "To decompress a CRAM file",
- "samtools", "view",
- file->header_size ? SKIP_ARG : "-h", // don't output the SAM header if already read in cram_inspect_file
- "--bam", "--uncompressed", // BAM with BGZF blocks in which the payload is not compressed
- "--threads", "10", // in practice, samtools is able to consume ~4 cores
- file_samtools_no_PG() ? "--no-PG" : SKIP_ARG, // don't add a PG line to the header
- file->is_remote ? SKIP_ARG : file->name, // local file name
- samtools_T_option.s[0] ? samtools_T_option.s : NULL,
- NULL);
- file->file = stream_from_stream_stdout (input_decompressor);
- file->redirected = true;
- file->codec = CODEC_BGZF; // because the samtools stream is BGZF-blocks
- break;
- }
-
- case CODEC_GZ: // we test the first few bytes of the file to differentiate between NONE, GZ and BGZIP
- case CODEC_BGZF:
- case CODEC_NONE: gz: {
- file->file = file->is_remote ? url_open_remote_file (NULL, file->name)
- : file->redirected ? fdopen (STDIN_FILENO, "rb")
- : fopen (file->name, READ);
- ASSERT (file->file, "failed to open %s: %s", file->name, strerror (errno));
-
- if (!file->is_remote && !file->redirected)
- stream_set_inheritability (fileno (file->file), false); // Windows: allow file_remove in case of --replace
-
- // read the first potential BGZF block to test if this is GZ or BGZF
- uint8_t block[BGZF_MAX_BLOCK_SIZE];
- uint32_t block_size;
-
- ASSERTNOTINUSE (evb->scratch);
-
- int32_t bgzf_uncompressed_size = bgzf_read_block (file, block, &block_size, SOFT_FAIL);
-
- // case: this is indeed a bgzf - we put the still-compressed data in vb->scratch for later consumption
- // in txtfile_read_block_bgzf
- if (bgzf_uncompressed_size > 0) {
- if (file->source_codec != CODEC_CRAM && file->source_codec != CODEC_BAM && file->source_codec != CODEC_BCF)
- file->source_codec = CODEC_BGZF;
-
- file->codec = CODEC_BGZF;
-
- evb->scratch.count = bgzf_uncompressed_size;
- buf_add_more (evb, &evb->scratch, (char*)block, block_size, "scratch");
- }
-
- // for regulars files, we already skipped 0 size files. This can happen in STDIN
- else if (bgzf_uncompressed_size == 0) {
-
- ASSINP (!flags_pipe_in_process_died(), // only works for Linux
- "Pipe-in process %s (pid=%u) died without sending any data",
- flags_pipe_in_process_name(), flags_pipe_in_pid());
-
- ABORTINP ("No data exists in input file %s", file->name ? file->name : FILENAME_STDIN);
- }
-
- // case: this is non-BGZF GZIP format
- else if (bgzf_uncompressed_size == BGZF_BLOCK_GZIP_NOT_BGZIP) {
- file->codec = file->source_codec = CODEC_GZ;
-
- txtfile_init_read_igzip (file);
- buf_add_more (evb, &file->igzip_data, block, block_size, "igzip_data");
- }
-
- // case: this is not GZIP format at all. treat as a plain file, and put the data read in vb->scratch
- // for later consumption is txtfile_read_block_plain
- else if (bgzf_uncompressed_size == BGZF_BLOCK_IS_NOT_GZIP) {
-
- #define BZ2_MAGIC "BZh"
- #define XZ_MAGIC (char[]){ 0xFD, '7', 'z', 'X', 'Z', 0 }
- #define ZIP_MAGIC (char[]){ 0x50, 0x4b, 0x03, 0x04 }
- #define ORA_MAGIC (char[]){ 0x49, 0x7c } // https://support-docs.illumina.com/SW/ORA_Format_Specification/Content/SW/ORA/ORAFormatSpecification.htm
-
- // we already open the file, so not easy to re-open with BZ2_bzopen as it would require injecting the read data into the BZ2 buffers
- if (str_isprefix_((rom)block, block_size, BZ2_MAGIC, 3))
- ABORTINP0 ("The data seems to be in bz2 format. Please use --input to specify the type (eg: \"genozip --input sam.bz2\")");
-
- else if (str_isprefix_((rom)block, block_size, XZ_MAGIC, 6)) {
- if (file->redirected) ABORTINP0 ("Compressing piped-in data in xz format is not currently supported");
- if (file->is_remote) ABORTINP0 ("The data seems to be in xz format. Please use --input to specify the type (eg: \"genozip --input sam.xz\")");
- ABORTINP0 ("The data seems to be in xz format. Please use --input to specify the type (eg: \"genozip --input sam.xz\")");
- }
-
- else if (str_isprefix_((rom)block, block_size, ZIP_MAGIC, 4)) {
- if (file->redirected) ABORTINP0 ("Compressing piped-in data in zip format is not currently supported");
- if (file->is_remote) ABORTINP0 ("The data seems to be in zip format. Please use --input to specify the type (eg: \"genozip --input generic.zip\")");
- ABORTINP0 ("The data seems to be in zip format. Please use --input to specify the type (eg: \"genozip --input generic.zip\")");
- }
-
- else if (str_isprefix_((rom)block, block_size, ORA_MAGIC, 2)) {
- if (file->redirected) ABORTINP0 ("Compressing piped-in data in ora format is not currently supported");
- if (file->is_remote) ABORTINP0 ("The data seems to be in ora format. Please use --input to specify the type (eg: \"genozip --input fastq.ora\")");
- ABORTINP0 ("The data seems to be in ora format. Please use --input to specify the type (eg: \"genozip --input fastq.ora\")");
- }
-
- file->codec = CODEC_NONE;
- buf_add_more (evb, &evb->scratch, (char*)block, block_size, "scratch");
- }
-
- ASSINP (!file->redirected || file->codec == CODEC_NONE || file->codec == CODEC_BGZF || file->codec == CODEC_GZ,
- "genozip only supports piping in data that is either plain (uncompressed) or compressed in GZIP format (typically with .gz extension) (codec=%s)",
- codec_name (file->codec));
- break;
- }
- case CODEC_BZ2:
- if (file->is_remote) {
- FILE *url_fp = url_open_remote_file (NULL, file->name);
- file->file = BZ2_bzdopen (fileno(url_fp), READ); // we're abandoning the FILE structure (and leaking it, if libc implementation dynamically allocates it) and working only with the fd
- }
- else if (file->redirected)
- file->file = BZ2_bzdopen (STDIN_FILENO, READ);
-
- else {
- file->file = BZ2_bzopen (file->name, READ); // for local files we decompress ourselves
-
- if (file->file)
- stream_set_inheritability (BZ2_get_fd (file->file), false); // Windows: allow file_remove in case of --replace
- }
- break;
-
- case CODEC_XZ:
- if (file->redirected) ABORTINP0 ("Compressing piped-in data in xz format is not currently supported");
-
- input_decompressor = stream_create (0, DEFAULT_PIPE_SIZE, DEFAULT_PIPE_SIZE, 0, 0,
- file->is_remote ? file->name : NULL, // url
- file->redirected,
- "To uncompress an .xz file", "xz", // reason, exec_name
- file->is_remote ? SKIP_ARG : file->name, // local file name
- "--threads=8", "--decompress", "--keep", "--stdout",
- flag.quiet ? "--quiet" : SKIP_ARG,
- NULL);
- file->file = stream_from_stream_stdout (input_decompressor);
- file->redirected = true;
- file->codec = CODEC_NONE;
- break;
-
- case CODEC_ZIP:
- input_decompressor = stream_create (0, DEFAULT_PIPE_SIZE, DEFAULT_PIPE_SIZE, 0, 0,
- file->is_remote ? file->name : NULL, // url
- file->redirected,
- "To uncompress a .zip file", "unzip", // reason, exec_name
- "-p", // must be before file name
- file->is_remote ? SKIP_ARG : file->name, // local file name
- flag.quiet ? "--quiet" : SKIP_ARG,
- NULL);
- file->file = stream_from_stream_stdout (input_decompressor);
- file->redirected = true;
- file->codec = CODEC_NONE;
- break;
-
- case CODEC_BCF: {
- input_decompressor = stream_create (0, DEFAULT_PIPE_SIZE, DEFAULT_PIPE_SIZE, 0, 0,
- file->is_remote ? file->name : NULL, // url
- file->redirected,
- "To compress a BCF file",
- "bcftools", "view", "--threads", "8", "-Ov",
- file->is_remote ? SKIP_ARG : file->name, // local file name
- "--no-version", // BCF: do not append version and command line to the header
- NULL);
- file->file = stream_from_stream_stdout (input_decompressor);
- file->redirected = true;
- file->codec = CODEC_NONE;
- break;
- }
-
- case CODEC_ORA: {
- input_decompressor = stream_create (0, DEFAULT_PIPE_SIZE, DEFAULT_PIPE_SIZE, 0, 0,
- file->is_remote ? file->name : NULL, // url
- file->redirected,
- "To compress an Ora file",
- "orad",
- "--raw", "--quiet", "--stdout",
- "--threads", str_int_s (global_max_threads).s,
- (file->is_remote || file->redirected) ? "-" : file->name, // local file name
- NULL);
- file->file = stream_from_stream_stdout (input_decompressor);
- file->redirected = true;
- file->codec = CODEC_NONE;
- break;
- }
-
- default:
- ABORT ("%s: invalid filename extension for %s files: %s", global_cmd, dt_name (file->data_type), file->name);
- }
-
- bgzf_initialize_discovery (file);
-
- if (flag.show_codec)
- iprintf ("%s: source_code=%s\n", file->basename, codec_name (file->source_codec));
-
- if (!file->file) goto fail;
-
- file_initialize_txt_file_data (file);
-
- if (file->is_remote) FREE (error); // allocated by url_get_status
- return file;
-
-fail:
- if (file->is_remote) FREE (error);
- FREE (file->name);
- FREE (file->basename);
- FREE (file);
- return NULL;
-}
-
-FileP file_open_txt_write (rom filename, DataType data_type, BgzfLevel bgzf_level)
-{
- ASSERT (data_type > DT_NONE && data_type < NUM_DATATYPES ,"invalid data_type=%d", data_type);
-
- FileP file = (FileP)CALLOC (sizeof(File));
-
- file->supertype = TXT_FILE;
- file->mode = WRITE;
- file->data_type = data_type;
- file->redirected = !filename;
-
- file->codec = data_type == DT_CRAM ? CODEC_CRAM
- : data_type == DT_BCF ? CODEC_BCF
- : bgzf_level != BGZF_NO_BGZF ? CODEC_BGZF // see bgzf_piz_calculate_bgzf_flags
- : /* BGZF_NO_BGZF */ CODEC_NONE;
-
- if (!file->redirected) { // not stdout
- if (file_exists (filename) &&
- !file_is_fifo (filename) && // a fifo can be "overwritten" (that's just normal writing to a fifo)
- !flag.force && !flag.test)
-
- file_ask_user_to_confirm_overwrite (filename); // function doesn't return if user responds "no"
-
- file_set_filename (file, filename);
-
- file->type = file_get_type_force_dt (filename, data_type);
- }
-
- else // stdout
- file->type = txt_out_ft_by_dt[data_type][bgzf_level >= 1]; // plain file or .gz file
-
- file->basename = filename_base (file->name, false, FILENAME_STDOUT, NULL, 0);
-
- // don't actually open the output file if we're not going to write to it
- if (flag.no_writer) return file;
-
- // open the file, based on the codec
- switch (file->codec) {
- case CODEC_BGZF :
- case CODEC_NONE : file->file = file->redirected ? fdopen (STDOUT_FILENO, "wb") : fopen (file->name, WRITE); break;
-
- case CODEC_CRAM : {
- StrTextSuperLong samtools_T_option = cram_get_samtools_option_T (gref);
- file_redirect_output_to_stream (file, "samtools", "view", "-OCRAM",
- file_samtools_no_PG(),
- samtools_T_option.s[0] ? samtools_T_option.s : NULL);
- break;
- }
-
- case CODEC_BCF : {
- char comp_level[4] = { '-', 'l', '0' + MIN_(bgzf_level, 9), 0 };
-
- if (flag.show_bgzf)
- iprintf ("%s: launching external compressor \"bcftools\" with bgzf_level=%d\n", file->basename, bgzf_level);
-
- file_redirect_output_to_stream (file, "bcftools", "view", "-Ob", comp_level, NULL);
- break;
- }
-
- default: {} // never reaches here
- }
-
- ASSINP (file->file, "cannot open file \"%s\": %s", file->name, strerror(errno)); // errno will be retrieve even the open() was called through zlib and bzlib
-
- file_initialize_txt_file_data (file);
-
- return file;
-}
-
-// note: we insert all the z_file buffers into the buffer list in advance and mark them as promiscuous, to avoid this
-// thread satety issue: without this pre-allocation, some of these buffers will be first allocated by the first
-// compute thread to use it, causing buf_alloc to modify evb's buf_list - this is not permitted as the main
-// thread might be doing so concurrently resulting in a corrupted evb.buf_list.
-
-static void file_initialize_z_file_data (FileP file)
-{
- init_dict_id_to_did_map (file->d2d_map);
- profiler_new_z_file();
-
- #define Z_INIT(buf) ({ buf_set_promiscuous (&file->buf, "z_file->" #buf); })
-
- if (file->mode != READ) { // careful not to use IS_ZIP - which is set when reading aux files
- for (Did did_i=0; did_i < MAX_DICTS; did_i++) {
- ctx_zip_init_promiscuous (&file->contexts[did_i]); // must be done from main thread
- file->contexts[did_i].vb_1_pending_merges = -1; // uninitialized - will be initialized in ctx_set_vb_1_pending_merges
- }
- __atomic_thread_fence (__ATOMIC_RELEASE); // release all vb_1_pending_merges
-
- // initialize evb "promiscuous" buffers - i.e. buffers that can be allocated by any thread (obviously protected by eg a mutex)
- // promiscuous buffers must be initialized by the main thread, and buffer.c does not verify their integrity.
- Z_INIT (ra_buf);
- Z_INIT (sag_grps);
- Z_INIT (sag_alns);
- Z_INIT (sag_qnames);
- Z_INIT (sag_cigars); // union with solo_data
- Z_INIT (sag_seq);
- Z_INIT (sag_qual);
- Z_INIT (deep_index_by[BY_SEQ]);
- Z_INIT (deep_index_by[BY_QNAME]);
- Z_INIT (deep_ents);
- Z_INIT (section_list_buf);
- Z_INIT (contexts[CHROM].chrom2ref_map);
- }
- else {
- Z_INIT (sag_qual);
- Z_INIT (sag_cigars); // union with solo_data
- }
-
- if (flag.no_biopsy_line) // no need to initialize in --biopsy-line (as destroying it later will error)
- serializer_initialize (file->digest_serializer);
-
- clock_gettime (CLOCK_REALTIME, &file->start_time);
-}
-
-// get time since creation of z_file object in memory
-StrText file_get_z_run_time (FileP file)
-{
- TimeSpecType tb;
- clock_gettime(CLOCK_REALTIME, &tb);
-
- int seconds_so_far = ((tb.tv_sec - file->start_time.tv_sec)*1000 +
- ((int64_t)tb.tv_nsec - (int64_t)file->start_time.tv_nsec) / 1000000) / 1000;
-
- return str_human_time (seconds_so_far, true);
-}
-
-// opens z_file for read or write
-FileP file_open_z_read (rom filename)
-{
- START_TIMER;
-
- ASSINP0 (filename, "it is not possible to redirect genozip files from stdin");
-
- FileP file = (FileP)CALLOC (sizeof(File));
-
- file->supertype = Z_FILE;
- file->mode = READ;
- file->is_in_tar = (flag.t_offset > 0);
-
- if (flag.debug_tar)
- iprintf ("file_open_z_read: t_offset=%"PRIu64" t_size=%"PRIu64" %s\n", flag.t_offset, flag.t_size, filename);
-
- rom disk_filename = file->is_in_tar ? tar_get_tar_name() : filename;
- ASSINP (file_exists (disk_filename), "cannot open \"%s\" for reading: %s", disk_filename, strerror (errno));
-
- file->disk_size = file->is_in_tar ? flag.t_size : file_get_size (filename);
-
- file_set_filename (file, filename);
-
- file->type = file_get_type (file->name);
-
- file->basename = filename_base (file->name, false, NULL, NULL, 0);
-
- // if a FASTA file was given as an argument to --reference or --REFERENCE, get the .ref.genozip file,
- // possobily running --make-reference in a separate process if needed
- if (flag.reading_reference && (file_get_data_type_of_input_file (file_get_type (file->name)) == DT_FASTA) && (file_get_type (file->name) != FASTA_GENOZIP))
- disk_filename = ref_fasta_to_ref (file);
-
- ASSINP (!flag.reading_reference || filename_has_ext (file->name, REF_GENOZIP_),
- "You specified file \"%s\", however with --reference or --REFERENCE, you must specify a reference file (%s file or FASTA file)\n"
- "Tip: To create a genozip reference file from a FASTA file, use 'genozip --make-reference myfasta.fa'",
- file->name, REF_GENOZIP_);
-
- if ((!flag.seg_only && !flag.show_bam) || flag_loading_auxiliary) {
-
- // make sure file is a regular file (not FIFO, directory etc)
- struct stat sb;
- int cause=0, stat_errno=0;
- if (stat (disk_filename, &sb)) {
- cause = 6; // stat failed
- stat_errno = errno;
- }
-
- if ((sb.st_mode & S_IFMT) != S_IFREG) cause=7; // not regular file
-
- if (!cause) {
- file->file = fopen (disk_filename, READ);
-
- stream_set_inheritability (fileno (file->file), false); // Windows: allow file_remove in case of --replace
- }
-
- // verify that this is a genozip file
- // we read the Magic at the end of the file (as the magic at the beginning may be encrypted)
- uint32_t magic;
- if (cause ||
- (cause = 1 * !file->file) ||
- (cause = 2 * !sb.st_size) ||
- (cause = 3 * !file_seek (file, -(int)sizeof (magic), SEEK_END, READ, SOFT_FAIL)) ||
- (cause = 4 * !fread (&magic, sizeof (magic), 1, file->file)) ||
- (cause = 5 * (BGEN32 (magic) != GENOZIP_MAGIC && !(flag.show_headers && flag.force)))) {
-
- int fail_errno = errno;
- FCLOSE (file->file, disk_filename);
-
- if (flag.validate == VLD_REPORT_INVALID)
- flag.validate = VLD_INVALID_FOUND;
-
- else if (flag.validate == VLD_NO_REPORT)
- exit (EXIT_INVALID_GENOZIP_FILE); // silent exit with error code, if even a single file is not valid
-
- rom cause_str = cause==1 ? strerror (fail_errno)
- : cause==2 ? "file is empty"
- : cause==3 ? "file_seek failed"
- : cause==4 ? "fread failed"
- : cause==5 ? "Not a valid genozip file (bad magic)"
- : cause==6 ? strerror (stat_errno)
- : cause==7 ? "Not a regular file"
- : "no error";
-
- if (flag.multiple_files) {
-
- if (flag.validate == VLD_INVALID_FOUND) // outputs even if --quiet
- iprintf ("Cannot open %s: %s\n", disk_filename, cause_str);
-
- else if (flag.validate == VLD_NONE) { // silenced by --quiet
- static int once=0;
- WARN ("Skipping %s: %s%s", file->name, cause_str,
- !(once++) ? " (--quiet to silence this message)" : "");
- }
-
- file_close (&file);
-
- goto done;
- }
- else { // single file
- if (flag.validate == VLD_REPORT_VALID)
- exit (EXIT_INVALID_GENOZIP_FILE); // exit quietly - with a return code indicating invalidity
- else
- ABORTINP ("Cannot open %s: %s %s", disk_filename, cause_str,
- (cause==3 || cause==4) ? strerror(fail_errno) : "");
- }
- }
-
- // file is valid
- else if (flag.validate == VLD_REPORT_VALID)
- iprintf ("%s\n", file->name); // print just filename, so a script can use this output
- }
-
- file->data_type = DT_NONE; // we will get the data type from the genozip header, not by the file name
-
- file_initialize_z_file_data (file);
-
- ASSINP (file->file, "cannot open file \"%s\": %s", file->name, strerror(errno)); // errno will be retrieve even the open() was called through zlib and bzlib
-
-done:
- COPY_TIMER_EVB (file_open_z);
- return file;
-}
-
-// opens z_file for read or write
-FileP file_open_z_write (rom filename, FileMode mode, DataType data_type, Codec source_codec)
-{
- START_TIMER;
-
- ASSINP0 (filename, "it is not possible to redirect genozip files to stdout");
-
- FileP file = (FileP)CALLOC (sizeof(File));
-
- file->supertype = Z_FILE;
- file->mode = mode;
- file->is_in_tar = tar_zip_is_tar();
-
- if (file_exists (filename) &&
- !flag.force &&
- !flag.zip_no_z_file && // not zip with --seg-only
- !file->is_in_tar)
-
- file_ask_user_to_confirm_overwrite (filename); // function doesn't return if user responds "no"
-
- file_set_filename (file, filename);
-
- file->type = file_get_type_force_dt (file->name, data_type);
- file->data_type = data_type;
- file->source_codec = source_codec;
-
- file->basename = filename_base (file->name, false, NULL, NULL, 0);
-
- ASSINP (filename_has_ext (file->name, GENOZIP_EXT),
- "file %s must have a " GENOZIP_EXT " extension", file_printname (file));
-
- // set file->type according to the data type, overriding the previous setting - i.e. if the user
- // uses the --output option, he is unrestricted in the choice of a file name
- file->type = file_get_z_ft_by_txt_in_ft (file->data_type, txt_file->type);
-
- mutex_initialize (file->dicts_mutex);
- mutex_initialize (file->custom_merge_mutex);
- mutex_initialize (file->zriter_mutex);
-
- if (!flag.zip_no_z_file) {
-
- if (flag.force && !file->is_in_tar)
- unlink (file->name); // delete file if it already exists (needed in weird cases, eg symlink to non-existing file)
-
- // if we're writing to a tar file, we get the already-openned tar file
- if (file->is_in_tar)
- file->file = tar_open_file (file->name, file->name);
- // note: tar doesn't have a z_reread_file bc --pair and --deep are not yet supported with --tar
-
- else {
- file->file = fopen (file->name, file->mode);
-
- if (!flag.no_zriter)
- file->z_reread_file = fopen (file->name, READ);
-
-#ifndef _WIN32
- // set z_file permissions to be the same as the txt_file permissions (if possible)
- if (file->file && txt_file && txt_file->name && !txt_file->is_remote) {
- struct stat st;
- if (stat (txt_file->name, &st))
- WARN ("FYI: Failed to set permissions of %s because failed to stat(%s): %s", file->name, txt_file->name, strerror(errno));
-
- else
- chmod (file->name, st.st_mode); // ignore errors (e.g. this doesn't work on NTFS)
- }
-#endif
- }
- }
-
- file->genozip_version = code_version_major(); // to allow the VER macro to operate consistently across ZIP/PIZ
- file->genozip_minor_ver = code_version_minor();
-
- file_initialize_z_file_data (file);
-
- ASSINP (file->file || flag.zip_no_z_file,
- "cannot open file \"%s\": %s", file->name, strerror(errno)); // errno will be retrieve even the open() was called through zlib and bzlib
-
- COPY_TIMER_EVB (file_open_z);
- return file;
-}
-
-// index file is it is a disk file of a type that can be indexed
-static void file_index_txt (ConstFileP file)
-{
- ASSERTNOTNULL (file);
-
- RETURNW (file->name,, "%s: cannot create an index file when output goes to stdout", global_cmd);
-
- StreamP indexing = NULL;
-
- switch (file->data_type) {
- case DT_SAM:
- case DT_BAM:
- RETURNW (file->codec == CODEC_BGZF,, "%s: output file needs to be a .sam.gz or .bam to be indexed", global_cmd);
- indexing = stream_create (0, 0, 0, 0, 0, 0, 0, "to create an index", "samtools", "index", file->name, NULL);
- break;
-
- case DT_VCF:
- RETURNW (file->codec == CODEC_BGZF,, "%s: output file needs to be a .vcf.gz or .bcf to be indexed", global_cmd);
- RETURNW (vcf_header_get_has_fileformat(),, "%s: file needs to start with ##fileformat=VCF be indexed", global_cmd);
- indexing = stream_create (0, 0, 0, 0, 0, 0, 0, "to create an index", "bcftools", "index", file->name, NULL);
- break;
-
- case DT_FASTQ:
- case DT_FASTA:
- RETURNW (file->codec == CODEC_BGZF || file->codec == CODEC_NONE,,
- "%s: To be indexed, the output file cannot be compressed with %s", global_cmd, codec_name (file->codec));
- indexing = stream_create (0, 0, 0, 0, 0, 0, 0, "to create an index", "samtools", "faidx", file->name, NULL);
- break;
-
- default: break; // we don't know how to create an index for other data types
- }
-
- if (indexing) {
- progress_new_component (file->basename, "Indexing", false, NULL);
-
- stream_wait_for_exit (indexing, false);
-
- progress_finalize_component_time ("Done indexing", DIGEST_NONE);
- }
-}
-
-void file_close (FileP *file_p)
-{
- START_TIMER;
-
- FileP file = *file_p;
-
- if (z_file && file == z_file && !flag_loading_auxiliary &&
- flag.show_time_comp_i == COMP_ALL && !flag.show_time[0]) // show-time without the optional parameter
- profiler_add_evb_and_print_report();
-
- __atomic_store_n (file_p, (FileP)NULL, __ATOMIC_RELAXED);
-
- if (!file) return; // nothing to do
-
- if (file->file && file->supertype == TXT_FILE) {
-
- if (file->mode == READ && file->codec == CODEC_BZ2)
- BZ2_bzclose((BZFILE *)file->file);
-
- else if (file->mode == READ && is_read_via_ext_decompressor (file))
- stream_close (&input_decompressor, STREAM_WAIT_FOR_PROCESS);
-
- else if (file->mode == WRITE && is_written_via_ext_compressor (file->codec))
- stream_close (&output_compressor, STREAM_WAIT_FOR_PROCESS);
-
- // if its stdout - just flush, don't close - we might need it for the next file
- else if (file->mode == WRITE && flag.to_stdout)
- fflush ((FILE *)file->file);
-
- else if (file->is_remote)
- url_close_remote_file_stream ((FILE**)&file->file);
-
- else
- FCLOSE (file->file, file_printname (file));
-
- // create an index file using samtools, bcftools etc, if applicable
- if (file->mode == WRITE && flag.index_txt && !flag_loading_auxiliary)
- file_index_txt (file);
- }
-
- else if (file->file && file->supertype == Z_FILE) {
-
- // ZIP note: we need to destory all even if unused, because they were initialized in file_initialize_z_file_data
- if (IS_ZIP)
- for (Did did_i=0; did_i < (IS_ZIP ? MAX_DICTS : file->num_contexts); did_i++)
- mutex_destroy (file->ctx_mutex[did_i]);
-
- if (file->is_in_tar && file->mode != READ)
- tar_close_file (&file->file);
- else {
- FCLOSE (file->file, file_printname (file));
- FCLOSE (file->z_reread_file, file_printname (file));
- }
- serializer_destroy (file->digest_serializer);
- }
-
- // free resources if we are NOT near the end of the execution. If we are at the end of the execution
- // it is faster to just let the process die
-
- if (!flag.let_OS_cleanup_on_exit) {
-
- if (IS_PIZ && flag.deep && file->supertype == Z_FILE) { // in this case, deep_index and deep_ents are Buffers containing arrays of Buffers
- for_buf (Buffer, buf, file->deep_index) buf_destroy (*buf);
- for_buf (Buffer, buf, file->deep_ents) buf_destroy (*buf);
- huffman_destroy (&file->qname_huf);
- }
-
- buflist_destroy_file_bufs (file);
-
- mutex_destroy (file->zriter_mutex);
- mutex_destroy (file->dicts_mutex);
- mutex_destroy (file->custom_merge_mutex);
- mutex_destroy (file->qname_huf_mutex);
- mutex_destroy (file->recon_plan_mutex);
-
- FREE (file->name);
- FREE (file->basename);
- FREE (file);
- }
-
- COPY_TIMER_EVB (file_close);
-}
-
-void file_write_txt (const void *data, unsigned len)
-{
- if (!len) return; // nothing to do
-
- ASSERTNOTNULL (txt_file);
- ASSERTNOTNULL (txt_file->file);
- ASSERTNOTNULL (data);
-
- size_t bytes_written = fwrite (data, 1, len, (FILE *)txt_file->file); // use fwrite - let libc manage write buffers for us
-
- // if we're streaming our genounzip/genocat/genols output to another process and that process has
- // ended prematurely then exit quietly. In genozip we display an error because this means the resulting
- // .genozip file will be corrupted
- if (!txt_file->name && errno == EINVAL) exit (EXIT_DOWNSTREAM_LOST);
-
- // exit quietly if failed to write to stdout - likely downstream consumer (piped executable or terminal) was closed
- if (bytes_written < len && !txt_file->name) exit (EXIT_DOWNSTREAM_LOST);
-
- // error if failed to write to file
- ASSERT (bytes_written == len, "wrote only %u of the expected %u bytes to %s: %s", (int)bytes_written, len, txt_file->name, strerror(errno));
-}
-
-void file_remove (rom filename, bool fail_quietly)
-{
- chmod (filename, S_IRUSR | S_IWUSR); // make sure its +w so we don't get permission denied (ignore errors)
-
-#ifndef _WIN32
- int ret = remove (filename);
- ASSERTW (!ret || fail_quietly, "Warning: failed to remove %s: %s", filename, strerror (errno));
-#else
- ASSERTW (DeleteFile (filename) || fail_quietly, "Warning: failed to remove %s: %s", filename, str_win_error());
-#endif
-}
-
-bool file_rename (rom old_name, rom new_name, bool fail_quietly)
-{
- chmod (old_name, S_IRUSR | S_IWUSR); // make sure its +w so we don't get permission denied (ignore errors)
-
- int ret = rename (old_name, new_name);
- ASSERTW (!ret || fail_quietly, "Warning: failed to rename %s to %s: %s", old_name, new_name, strerror (errno));
-
- return !ret; // true if successful
-}
-
-// also updates filename to .gz (but not if .bam)
-void file_gzip (char *filename)
-{
- unsigned fn_len = strlen (filename);
-
- char command[fn_len + 50];
-
- int ret = 1;
-
- snprintf (command, sizeof (command), "bgzip -@%u -f \"%s\" %s", global_max_threads, filename, flag.is_windows ? "" : " > /dev/null 2>&1");
- ret = system (command); // note: runs sh on Unix, and cmd.exe on Windows
-
- if (ret && errno == ENOENT) { // no bgzip - try pigz
- snprintf (command, sizeof (command), "pigz -f \"%s\" %s", filename, flag.is_windows ? "" : " > /dev/null 2>&1");
- ret = system (command);
- }
-
- if (ret && errno == ENOENT) { // no pigz - try gzip
- snprintf (command, sizeof (command), "gzip -f \"%s\" %s", filename, flag.is_windows ? "" : " > /dev/null 2>&1");
- ret = system (command);
- }
-
- ASSERTW (!ret, "FYI: \"%s\" returned %d. No harm.", command, ret);
-
- if (!ret) {
- // special case: rename .bam.gz -> .bam
- if (fn_len >= 4 && !memcmp (&filename[fn_len-4], ".bam", 4)) {
- char gz_filename[fn_len + 10];
- snprintf (gz_filename, sizeof (gz_filename), "%s.gz", filename);
- file_remove (filename, true);
- file_rename (gz_filename, filename, false);
- }
- else
- strcpy (&filename[fn_len], ".gz");
- }
-}
-
-void file_mkfifo (rom filename)
-{
-#ifndef _WIN32
- file_remove (filename, true);
- ASSERT (!mkfifo (filename, 0666), "mkfifo failed for %s: %s", filename, strerror (errno));
-
-#else
- ABORT0 ("file_mkfifo not supported on Windows");
-#endif
-}
-
-bool file_is_fifo (rom filename)
-{
- if (flag.is_windows) return false; // we don't support FIFOs in Win32 yet
-
- struct stat st;
- ASSERT (!stat (filename, &st), "stat failed on %s", filename);
-
- return S_ISFIFO (st.st_mode);
-}
-
-bool file_exists (rom filename)
-{
-
- if (!filename || !filename[0]) return false;
- bool exists = !access (filename, F_OK);
-
-#ifdef _WIN32
- // TO DO: overcome this limitation, see: https://docs.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation
- if (!exists && strlen (filename) > MAX_PATH)
- WARN_ONCE ("Genozip limitation: filenames on Windows are limited to %u characters. Please contact "EMAIL_SUPPORT" for advice: %s", PATH_MAX, filename);
-#endif
-
- return exists;
-}
-
-// returns true if successful. depending on soft_fail, a failure will either emit an error
-// (and exit) or a warning (and return).
-bool file_seek (FileP file, int64_t offset,
- int whence, // SEEK_SET, SEEK_CUR or SEEK_END
- rom mode, // READ if seeking before reading, WRITE if before writing
- FailType fail_type)
-{
- ASSERTNOTNULL (file);
- ASSERTNOTNULL (file->file);
-
- if (file->supertype == Z_FILE) {
-
- if (whence == SEEK_END && file->is_in_tar && IS_PIZ) {
- offset += flag.t_offset + flag.t_size;
- whence = SEEK_SET;
- goto test_already_there;
- }
-
- // in SEEK_SET of a z_file that is being tarred, update the offset to the beginning of the file data in the tar file
- else if (whence == SEEK_SET) {
- offset += (IS_ZIP ? tar_file_offset() : flag.t_offset); // 0 if not using tar
-
- test_already_there:
- if (ftello64 (GET_FP(file, mode)) == offset) return true; // already at the right offset
- }
- }
-
- int ret = fseeko64 (GET_FP(file, mode), offset, whence);
-
- if (fail_type != HARD_FAIL) {
- if (!flag.to_stdout && fail_type==WARNING_FAIL) {
- ASSERTW (!ret, errno == EINVAL ? "Warning: Error while reading file %s (fseeko64 (whence=%d offset=%"PRId64")): it is too small%s"
- : "Warning: fseeko64 failed on file %s (whence=%d offset=%"PRId64"): %s",
- file_printname (file), whence, offset, errno == EINVAL ? "" : strerror (errno));
- }
- }
- else
- ASSERT (!ret, "fseeko64(offset=%"PRId64" whence=%d) failed on file %s (FILE*=%p remote=%s redirected=%s): %s",
- offset, whence, file_printname (file), file->file, TF(file->is_remote), TF(file->redirected), strerror (errno));
-
- return !ret;
-}
-
-int64_t file_tell_do (FileP file, FailType soft_fail, rom func, unsigned line)
-{
- ASSERTNOTNULL (file);
- ASSERTNOTNULL (file->file);
-
- if (IS_ZIP && file->supertype == TXT_FILE && file->codec == CODEC_GZ)
- return txt_file->disk_so_far;
-
- if (IS_ZIP && file->supertype == TXT_FILE && file->codec == CODEC_BZ2)
- return BZ2_consumed ((BZFILE *)file->file);
-
- int64_t offset = ftello64 ((FILE *)file->file);
- ASSERT (offset >= 0 || soft_fail, "called from %s:%u: ftello64 failed for %s (FILE*=%p remote=%s redirected=%s): %s",
- func, line, file->name, file->file, TF(file->is_remote), TF(file->redirected), strerror (errno));
-
- if (offset < 0) return -1; // soft fail
-
- // in in z_file that is being tarred, update the offset to the beginning of the file data in the tar file
- if (file->supertype == Z_FILE)
- offset -= tar_file_offset(); // 0 if not using tar
-
- return offset;
-}
-
-uint64_t file_get_size (rom filename)
-{
- struct stat64 st;
-
- int ret = stat64(filename, &st);
- ASSERT (!ret, "stat64 failed on '%s': %s", filename, strerror(errno));
-
- return st.st_size;
-}
-
-bool file_is_dir (rom filename)
-{
- ASSERTNOTNULL (filename);
- int filename_len = strlen (filename);
-
- // temporarily remove trailing /
- if (filename[filename_len-1] == '/' || filename[filename_len-1] == '\\')
- filename_len--;
-
- SAFE_NULT (filename);
-
- struct stat64 st;
- int ret = stat64 (filename, &st); // 0 if successful
-
- SAFE_RESTORE;
-
- return !ret && S_ISDIR (st.st_mode);
-}
-
-void file_mkdir (rom dirname)
-{
- if (file_is_dir (dirname)) return; // already exists - that's ok
-
-#ifdef _WIN32
- int ret = _mkdir (flag.out_dirname);
-#else
- int ret = mkdir (flag.out_dirname, 0777);
-#endif
- ASSERT (!ret, "mkdir(%s) failed: %s", flag.out_dirname, strerror (errno));
-}
-
-// reads an entire file into a buffer. if filename is "-", reads from stdin
-void file_get_file (VBlockP vb, rom filename, BufferP buf, rom buf_name,
- uint64_t max_size, // 0 to read entire file, or specify for max size
- FileContentVerificationType ver_type, bool add_string_terminator)
-{
- bool is_stdin = !strcmp (filename, "-");
- if (is_stdin && !max_size) max_size = 10000000; // max size for stdin
-
- uint64_t file_size = is_stdin ? 0 : file_get_size (filename);
-
- uint64_t size = is_stdin ? max_size
- : !file_size ? max_size
- : max_size ? MIN_(max_size, file_size)
- : file_size;
-
- buf_alloc (vb, buf, 0, size + add_string_terminator, char, 1, buf_name);
-
- FILE *file = is_stdin ? stdin : fopen (filename, "rb");
- ASSINP (file, "cannot open \"%s\": %s", filename, strerror (errno));
-
- buf->len = fread (buf->data, 1, size, file);
- ASSERT (is_stdin || max_size || buf->len == size, "Error reading file %s: %s", filename, strerror (errno));
-
- ASSINP (ver_type != VERIFY_ASCII || str_is_printable (STRb(*buf)), "Expecting %s to contain text (ASCII)", filename);
- ASSINP (ver_type != VERIFY_UTF8 || str_is_utf8 (STRb(*buf)), "Expecting %s to contain ASCII or UTF-8 text", filename);
-
- if (add_string_terminator)
- *BAFTc (*buf) = 0;
-
- FCLOSE (file, filename);
-}
-
-// writes data to a file and flushes it, returns true if successful
-
-static Mutex put_data_mutex = {};
-#define MAX_PUT_DATA_FILES_PER_EXECUTION 10 // maximum files deletable at abort
-static rom put_data_tmp_filenames[MAX_PUT_DATA_FILES_PER_EXECUTION] = {};
-static unsigned num_put_files=0;
-
-bool file_put_data (rom filename, const void *data, uint64_t len,
- mode_t mode) // optional - ignored if 0
-{
- int fn_len = strlen (filename);
-
- // remove invalid characters from filename
- if (flag.is_windows)
- for (char *c=(char*)filename ; *c ; c++)
- if (*c == ':' && (c-filename != 1)) *c = '-'; // ':' exist eg in SAM AUX names
-
- int tmp_filename_size = fn_len + 5;
- char *tmp_filename = MALLOC (tmp_filename_size);
- // we first write to tmp_filename, and after we complete and flush, we rename to the final name
- // so that if a file exists (in its final name) - then its guaranteed to be fully written
- snprintf (tmp_filename, tmp_filename_size, "%s.tmp", filename);
-
- file_remove (filename, true);
- file_remove (tmp_filename, true);
-
- FILE *file = fopen (tmp_filename, "wb");
- if (!file) return false;
-
- // save file name in put_data_tmp_filenames, to be deleted in case of aborting by file_put_data_abort
- mutex_initialize (put_data_mutex); // first caller initializes. not thread safe, but good enough for the purpose.
- mutex_lock (put_data_mutex);
-
- unsigned my_file_i = num_put_files;
-
- if (num_put_files < MAX_PUT_DATA_FILES_PER_EXECUTION)
- put_data_tmp_filenames[num_put_files++] = tmp_filename;
-
- mutex_unlock (put_data_mutex);
-
- // write in blocks (Windows hangs if the block is too big, a few GB)
- size_t written = 0;
- const uint64_t block_size = 1 << 24; // 16MB
- for (int i=0; i < (len + block_size - 1) / block_size; i++) // round up
- written += fwrite (&((rom)data)[i * block_size], 1, MIN_(block_size, len - i*block_size), file);
-
- SAVE_VALUE (errno);
- fflush (file);
- FCLOSE (file, tmp_filename);
- RESTORE_VALUE (errno); // in cases caller wants to print fwrite error
-
- if (written != len) {
- WARN ("Failed to write %s: wrote only %"PRIu64" bytes of the expected %"PRIu64, tmp_filename, (uint64_t)written, len);
- put_data_tmp_filenames[my_file_i] = NULL; // no need to lock mutex
- remove (tmp_filename);
- return false;
- }
-
- // we can't enter if file_put_data_abort is active, or it needs to wait for us before deleting tmp files
- mutex_lock (put_data_mutex);
-
- remove (filename);
- int renamed_failed = rename (tmp_filename, filename);
-
- put_data_tmp_filenames[my_file_i] = NULL; // remove tmp file name from list
-
- mutex_unlock (put_data_mutex);
-
- ASSERT (!renamed_failed, "Failed to rename %s to %s: %s", tmp_filename, filename, strerror (errno));
- FREE (tmp_filename);
-
- if (mode)
- ASSERT (!chmod (filename, mode), "Failed to chmod %s: %s", filename, strerror (errno));
-
- return true;
-}
-
-// error handling: unlink files currently being written (the actual writing will terminate when the thread terminates)
-void file_put_data_abort (void)
-{
- if (!put_data_mutex.initialized) return;
-
- mutex_lock (put_data_mutex);
-
- for (unsigned i=0; i < num_put_files; i++)
- if (put_data_tmp_filenames[i]) {
- // TODO: this works on Linux but not Windows - gets "Permission Denied" if file_put_data is in fflush()
- unlink (put_data_tmp_filenames[i]); // ignore errors
- remove (put_data_tmp_filenames[i]); // in case unlinked failed - eg NTFS - ignore errors
- }
-
- // mutex remains locked - no more files can be put after this point
-}
-
-void file_put_data_reset_after_fork (void)
-{
- put_data_mutex = (Mutex){};
-}
-
-PutLineFn file_put_line (VBlockP vb, STRp(line), rom msg)
-{
- PutLineFn fn;
- snprintf (fn.s, sizeof (fn.s), "line.%u.%d.%s%s", vb->vblock_i, vb->line_i, command==ZIP ? "zip" : "piz",
- file_plain_ext_by_dt ((VB_DT(SAM) && z_file->z_flags.txt_is_bin) ? DT_BAM : vb->data_type));
-
- file_put_data (fn.s, STRa(line), 0);
-
- if (IS_PIZ)
- WARN ("\n%s line=%s line_in_file(1-based)=%"PRId64". Dumped %s (dumping first occurance only)",
- msg, line_name(vb).s, writer_get_txt_line_i (vb, vb->line_i), fn.s);
- else
- WARN ("\n%s line=%s vb_size=%u MB. Dumped %s", msg, line_name(vb).s, (int)(segconf.vb_size >> 20), fn.s);
-
- return fn;
-}
-
-void file_assert_ext_decompressor (void)
-{
- if (!stream_wait_for_exit (input_decompressor, false)) return; // just a normal EOF - all good!
-
- if (flag.truncate) return; // truncated as requested - all good
-
- // read error from stderr
- #define INPUT_DECOMPRESSOR_RESPSONSE_LEN 4096
- char error_str[INPUT_DECOMPRESSOR_RESPSONSE_LEN];
-
- FILE *stderr_pipe = stream_from_stream_stderr (input_decompressor);
- int bytes_read = fread (error_str, 1, INPUT_DECOMPRESSOR_RESPSONSE_LEN-1, stderr_pipe);
- error_str[bytes_read] = 0; // string terminator
-
- ABORT ("%s: failed to read file: %s\n%s: %s",
- global_cmd, txt_name, stream_get_exec_name (input_decompressor), error_str);
-}
-
-// used when aborting due to an error. avoid the compressors outputting their own errors after our process is gone
-void file_kill_external_compressors (void)
-{
- stream_close (&input_decompressor, STREAM_KILL_PROCESS);
- stream_close (&output_compressor, STREAM_KILL_PROCESS);
-}
-
-rom ft_name (FileType ft)
-{
- return type_name (ft, &file_exts[ft], ARRAY_LEN(file_exts));
-}
-
-rom file_plain_ext_by_dt (DataType dt)
-{
- FileType plain_ft = txt_in_ft_by_dt[FAF ? DT_FASTA : dt][0].in;
-
- return file_exts[plain_ft];
-}
-
-bool file_buf_locate (FileP file, ConstBufferP buf)
-{
- return is_p_in_range (buf, file, sizeof (File));
-}
+// ------------------------------------------------------------------
+// file.c
+// Copyright (C) 2019-2024 Genozip Limited. Patent Pending.
+// Please see terms and conditions in the file LICENSE.txt
+//
+// WARNING: Genozip is proprietary, not open source software. Modifying the source code is strictly prohibited,
+// under penalties specified in the license.
+
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#ifdef _WIN32
+#include <windows.h>
+#include <direct.h>
+#endif
+#define Z_LARGE64
+#ifdef __APPLE__
+ #define off64_t __int64_t
+#endif
+#include "bzlib/bzlib.h"
+#include "zlib/zlib.h"
+#include "file.h"
+#include "url.h"
+#include "codec.h"
+#include "bgzf.h"
+#include "progress.h"
+#include "tar.h"
+#include "writer.h"
+#include "filename.h"
+#include "huffman.h"
+
+// globals
+FileP z_file = NULL;
+FileP txt_file = NULL;
+
+static StreamP input_decompressor = NULL; // bcftools, xz, unzip, samtools or orad - only one at a time
+static StreamP output_compressor = NULL; // samtools (for cram), bcftools
+
+// global pointers - so they can be compared eg "if (mode == READ)"
+rom READ = "rb"; // use binary mode (b) in read and write so Windows doesn't add \r. "b" is accepted but ignored in Linux and MacOS.
+rom WRITE = "wb";
+rom WRITEREAD = "wb+"; // only supported for z_file and gencomp disk files
+
+rom file_exts[] = FILE_EXTS;
+
+static const struct { FileType in; Codec codec; FileType out; } txt_in_ft_by_dt[NUM_DATATYPES][50] = TXT_IN_FT_BY_DT;
+static const FileType txt_out_ft_by_dt[NUM_DATATYPES][20] = TXT_OUT_FT_BY_DT;
+static const FileType z_ft_by_dt[NUM_DATATYPES][20] = Z_FT_BY_DT;
+
+// get data type by file type
+DataType file_get_data_type_of_input_file (FileType ft)
+{
+ // note: if make-reference, we scan the array from dt=0 (DT_REF), otherwise we ignore DT_REF
+ for (DataType dt=!flag.make_reference; dt < NUM_DATATYPES; dt++)
+ for (unsigned i=0; txt_in_ft_by_dt[dt][i].in; i++)
+ if (txt_in_ft_by_dt[dt][i].in == ft)
+ return dt;
+
+ return (ft == UNKNOWN_FILE_TYPE) ? DT_GNRIC : DT_NONE;
+}
+
+DataType file_piz_get_dt_of_out_filename (void)
+{
+ if (!flag.out_filename) return DT_NONE;
+
+ FileType ft = file_get_type (flag.out_filename);
+
+ for (DataType dt=1/*skip DT_REF*/; dt < NUM_DATATYPES; dt++)
+ for (unsigned i=0; txt_out_ft_by_dt[dt][i]; i++)
+ if (txt_out_ft_by_dt[dt][i] == ft)
+ return dt;
+
+ return DT_NONE;
+}
+
+// get genozip file type by txt file type
+FileType file_get_z_ft_by_txt_in_ft (DataType dt, FileType txt_ft)
+{
+ for (unsigned i=0; txt_in_ft_by_dt[dt][i].in; i++)
+ if (txt_in_ft_by_dt[dt][i].in == txt_ft) return txt_in_ft_by_dt[dt][i].out;
+
+ return UNKNOWN_FILE_TYPE;
+}
+
+// get codec by txt file type
+Codec file_get_codec_by_txt_ft (DataType dt, FileType txt_ft, bool source)
+{
+ if (source && txt_ft == BAM) return CODEC_BAM; // if !source, it would be CODEC_BGZF
+ if (txt_ft == BCF || txt_ft == BCF_GZ || txt_ft == BCF_BGZF) return CODEC_BCF;
+
+ for (unsigned i=0; txt_in_ft_by_dt[dt][i].in; i++)
+ if (txt_in_ft_by_dt[dt][i].in == txt_ft)
+ return txt_in_ft_by_dt[dt][i].codec;
+
+ return CODEC_NONE;
+}
+
+// get txt file type by codec
+FileType file_get_txt_ft_by_codec (DataType dt, Codec codec)
+{
+ for (unsigned i=0; txt_in_ft_by_dt[dt][i].in; i++)
+ if (txt_in_ft_by_dt[dt][i].codec == codec)
+ return txt_in_ft_by_dt[dt][i].in;
+
+ return UNKNOWN_FILE_TYPE;
+}
+
+// get data_type of e.g. "myfile.fastq.genozip"
+// note: DT_SAM is returned for SAM/BAM/CRAM, and DT_VCF for VCF/BCF
+DataType file_get_dt_by_z_filename (rom z_filename)
+{
+ FileType z_ft = file_get_type (z_filename);
+
+ for (DataType dt=0; dt < NUM_DATATYPES; dt++)
+ for (unsigned i=0; z_ft_by_dt[dt][i]; i++)
+ if (z_ft == z_ft_by_dt[dt][i])
+ return dt;
+
+ return DT_NONE;
+}
+
+// returns default file type of the .genozip file of the given dt (e.g. DT_FASTA -> .fasta.genozip)
+FileType file_get_default_z_ft_of_data_type (DataType dt)
+{
+ return z_ft_by_dt[dt][0];
+}
+
+// possible arguments for --input
+StrTextLong file_compressible_extensions (bool plain_only)
+{
+ StrTextLong s;
+ int s_len = 0;
+
+ for (DataType dt=1; dt < NUM_DATATYPES; dt++) { // start from 1, excluding DT_REFERENCE
+
+ if (dt == DT_GNRIC || dt == DT_ME23 || !txt_in_ft_by_dt[dt][0].in) continue;
+
+ if (plain_only)
+ for (unsigned i=0; txt_in_ft_by_dt[dt][i].in; i++) {
+ Codec codec = txt_in_ft_by_dt[dt][i].codec;
+ if (codec != CODEC_BGZF && codec != CODEC_GZ && codec != CODEC_BZ2 && codec != CODEC_XZ && codec != CODEC_ZIP && codec != CODEC_ORA)
+ SNPRINTF (s, "%s ", &file_exts[txt_in_ft_by_dt[dt][i].in][1]);
+ }
+
+ else {
+ SNPRINTF (s, "\n%-8s: ", dt_name (dt));
+
+ for (unsigned i=0; txt_in_ft_by_dt[dt][i].in; i++)
+ SNPRINTF (s, "%s ", &file_exts[txt_in_ft_by_dt[dt][i].in][1]);
+ }
+ }
+
+ if (plain_only)
+ SNPRINTF0 (s, "23andme generic ");
+ else
+ SNPRINTF0 (s, "\n23andMe : 23andme 23andme.zip"
+ "\nOther : generic");
+ return s;
+}
+
+FileType file_get_type (rom filename)
+{
+ if (!filename) return UNKNOWN_FILE_TYPE;
+
+ // 23andme files have the format "genome_Firstname_Lastname_optionalversion_timestamp.txt" or .zip
+ if (strstr (filename, "genome") && strstr (filename, "Full")) {
+ if (filename_has_ext (filename, ".txt")) return ME23;
+ if (filename_has_ext (filename, ".zip")) return ME23_ZIP;
+ if (filename_has_ext (filename, ".txt.genozip")) return ME23_GENOZIP;
+ }
+
+ for (FileType ft=UNKNOWN_FILE_TYPE+1; ft < AFTER_LAST_FILE_TYPE; ft++) {
+
+ // files that end with .txt/.txt.genozip/.zip are not classified as ME23, we already handled ME23 above
+ if (ft == ME23 || ft == ME23_ZIP || ft == ME23_GENOZIP) continue;
+
+ if (filename_has_ext (filename, file_exts[ft]))
+ return ft;
+ }
+
+ return UNKNOWN_FILE_TYPE; // this should never happen, because GNRIC_ is "" so it will catch everything
+}
+
+static FileType file_get_type_of_generic (rom filename)
+{
+ if (!filename) return UNKNOWN_FILE_TYPE;
+
+ if (filename_has_ext (filename, ".gz")) return GNRIC_GZ;
+ if (filename_has_ext (filename, ".bz2")) return GNRIC_BZ2;
+ if (filename_has_ext (filename, ".xz")) return GNRIC_XZ;
+ if (filename_has_ext (filename, ".zip")) return GNRIC_ZIP;
+ else return GNRIC;
+}
+
+static FileType file_get_type_force_dt (rom filename, DataType dt)
+{
+ FileType ft = file_get_type (filename);
+
+ // if ft cannot be determined by filename (i.e. comes out as generic), get it by data type
+ switch (ft) {
+ case GNRIC_GZ : return file_get_txt_ft_by_codec (dt, CODEC_GZ);
+ case GNRIC_BZ2 : return file_get_txt_ft_by_codec (dt, CODEC_BZ2);
+ case GNRIC_XZ : return file_get_txt_ft_by_codec (dt, CODEC_XZ);
+ case GNRIC_ZIP : return file_get_txt_ft_by_codec (dt, CODEC_ZIP);
+ default : return ft;
+ }
+}
+
+// returns the filename without the extension eg myfile.1.sam.gz -> myfile.1.
+// if raw_name is given, memory is allocated sufficiently to concatenate an extension. Otherwise, filename is overwritten
+uint32_t file_get_raw_name_and_type (rom filename, rom *raw_name, FileType *out_ft)
+{
+ unsigned len = strlen (filename);
+
+ if (raw_name) {
+ *raw_name = MALLOC (len + 30);
+ memcpy (*(char **)raw_name, filename, len);
+ (*(char **)raw_name)[len] = 0;
+ }
+ else
+ raw_name = &filename; // overwrite filename
+
+ FileType ft = file_get_type (filename);
+ if (ft != UNKNOWN_FILE_TYPE) {
+ len -= strlen (file_exts[ft]);
+ (*(char **)raw_name)[len] = 0;
+ }
+
+ if (out_ft) *out_ft = ft;
+
+ return len;
+}
+
+static void file_ask_user_to_confirm_overwrite (rom filename)
+{
+ if (!strcmp (filename, "/dev/null")) return; // don't ask for /dev/null
+
+ fprintf (stderr, "%s: output file %s already exists: in the future, you may use --force to overwrite\n", global_cmd, filename);
+
+    if (!isatty(0) || !isatty(2)) exit_on_error(false); // if stdin or stderr is redirected - we cannot ask the user an interactive question
+
+ if (!str_query_user_yn ("Do you wish to overwrite it now?", QDEF_NO)) {
+ fprintf (stderr, "No worries, I'm stopping here - no damage done!\n");
+ exit (EXIT_OK);
+ }
+
+ fprintf (stderr, "\n");
+}
+
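+// launch an external compressor (e.g. samtools or bcftools) and wire its stdin to file->file, so genozip writes
+// into it; unless output goes to stdout, the compressor's stdout is redirected to the target output file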
+static void file_redirect_output_to_stream (FileP file, rom exec_name,
+ rom format_option_0, rom format_option_1, rom format_option_2, rom format_option_3)
+{
+ char threads_str[20];
+ snprintf (threads_str, sizeof (threads_str), "%u", global_max_threads);
+
+ FILE *redirected_stdout_file = NULL;
+ if (!flag.to_stdout) {
+ redirected_stdout_file = fopen (file->name, file->mode); // exec_name will redirect its output to this file
+ ASSINP (redirected_stdout_file, "cannot open file \"%s\": %s", file->name, strerror(errno));
+ }
+
+ char reason[40];
+ snprintf (reason, sizeof (reason), "To output a %s file", file_exts[file->type]);
+ output_compressor = stream_create (0, 0, 0, DEFAULT_PIPE_SIZE,
+ redirected_stdout_file, // output is redirected unless flag.to_stdout
+ 0, false, reason,
+ exec_name,
+ format_option_0,
+ "--threads", threads_str,
+ format_option_1, format_option_2, format_option_3,
+ NULL);
+
+ file->file = stream_to_stream_stdin (output_compressor);
+}
+
+// starting samtools 1.10, a PG record is added to the SAM header every "samtools view", and an option, --no-PG,
+// is provided to avoid this. See: https://github.com/samtools/samtools/releases/
+// returns "--no-PG" if the option exists, or NULL if not
+static rom file_samtools_no_PG (void)
+{
+ static rom ret_str[2] = { NULL, "--no-PG" };
+ static int has_no_PG = -1; // unknown
+
+ if (has_no_PG >= 0) return ret_str[has_no_PG]; // we already tested
+
+ #define SAMTOOLS_HELP_MAX_LEN 20000
+ char samtools_help_text[SAMTOOLS_HELP_MAX_LEN];
+
+ #define MIN_ACCEPTABLE_LEN 100
+ int len=0;
+
+ for (unsigned i=1; i < 15 && len < MIN_ACCEPTABLE_LEN; i++) {
+ // Tested on samtools 1.11: The normal way to see help is "samtools help view" however it fails if stdin is not the terminal.
+ // Instead, we use samtools view --threads, invalidly without an argument. This *sometimes* shows the help, and sometimes
+ // just shows one line "samtools view:". We overcome this by repeating if the response is not long enough.
+ StreamP samtools = stream_create (0, DEFAULT_PIPE_SIZE, DEFAULT_PIPE_SIZE, 0, 0, 0, 0, "To read/write CRAM files",
+ "samtools", "view", "--threads", NULL);
+ usleep (50000 * i); // wait for samtools
+
+ // read both stderr and stdout from samtools
+ len = read (fileno (stream_from_stream_stderr (samtools)), samtools_help_text, SAMTOOLS_HELP_MAX_LEN-1);
+ len += read (fileno (stream_from_stream_stdout (samtools)), &samtools_help_text[len], SAMTOOLS_HELP_MAX_LEN-len-1);
+
+ stream_close (&samtools, STREAM_DONT_WAIT_FOR_PROCESS);
+ }
+ samtools_help_text[len] = '\0'; // terminate string (more portable, strnstr and memmem are non-standard)
+
+ ASSERT0 (len >= MIN_ACCEPTABLE_LEN, "no response from \"samtools view --threads\"");
+
+ return ret_str[(has_no_PG = !!strstr (samtools_help_text, "--no-PG"))];
+}
+
+// show a meaningful error if the file is not a supported type, and return TRUE if the file should be skipped
+static bool file_open_txt_read_test_valid_dt (ConstFileP file)
+{
+ if (file->data_type == DT_NONE) {
+
+ if (flag.multiple_files || tar_zip_is_tar()) {
+ if (filename_has_ext (file->name, ".genozip")) {
+
+ // case: --tar - include .genozip files verbatim
+ if (tar_zip_is_tar()) {
+ tar_copy_file (file->name, file->name);
+ RETURNW (false, true, "Copied %s to the tar file", file_printname(file));
+ }
+ else
+ RETURNW (false, true, "Skipping %s - it is already compressed", file_printname(file));
+
+ }
+
+ RETURNW (false, true, "Skipping %s - genozip doesn't know how to compress this file type (use --input to tell it)",
+ file_printname (file));
+ }
+ else {
+ ASSINP (!filename_has_ext (file->name, ".genozip"),
+ "cannot compress %s because it is already compressed", file_printname(file));
+
+ ABORT0 ("Unexpectedly, data_type==DT_NONE"); // not expecting to ever reach here, bc if file is not recognized, it should have been set to GENERIC
+ }
+ }
+
+ return false; // all good - no need to skip this file
+}
+
+static void file_set_filename (FileP file, rom fn)
+{
+ // copy filename
+ unsigned fn_size = strlen (fn) + 1; // inc. \0
+ file->name = MALLOC (fn_size);
+ memcpy (file->name, fn, fn_size);
+}
+
+static void file_initialize_txt_file_fields (FileP file)
+{
+ #define TXT_INIT(buf) ({ buf_set_promiscuous (&file->buf, "txt_file->" #buf); })
+
+ if (IS_ZIP) {
+ mutex_initialize (file->recon_plan_mutex);
+
+ // initialize evb "promiscuous" buffers - i.e. buffers that can be allocated by any thread
+ // promiscuous buffers must be initialized by the main thread, and buffer.c does not verify their integrity.
+ TXT_INIT(line_info[0]);
+ TXT_INIT(line_info[1]);
+ TXT_INIT(vb_info[0]);
+ TXT_INIT(vb_info[1]);
+ }
+ else {
+
+ }
+}
+
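+// launch an external decompressor (e.g. samtools, bcftools, xz, unzip or orad) and read its stdout as our txt input.
+// args holds up to 7 optional arguments (NULL entries are skipped); if name_if_not_remote is set and the file is local,
+// the filename is appended as the last argument. On return, file->codec is streamed_codec - the codec of the piped data.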
+static void file_open_ext_decompessor (FileP file, rom exec_name, rom subcommand, Codec streamed_codec, bool name_if_not_remote, rom args[7])
+{
+ char reason[64]; // used for error message if stream_create fails
+ snprintf (reason, sizeof(reason), "To compress a %s file", codec_name (file->codec));
+
+ input_decompressor =
+ stream_create (0, DEFAULT_PIPE_SIZE, DEFAULT_PIPE_SIZE, 0, 0,
+ file->is_remote ? file->name : NULL, // url
+ file->redirected, reason, exec_name,
+ subcommand ? subcommand : SKIP_ARG,
+ #define A(i) (args[i] ? args[i] : SKIP_ARG)
+ A(0), A(1), A(2), A(3), A(4), A(5), A(6),
+ (name_if_not_remote && !file->is_remote) ? file->name : SKIP_ARG,
+ NULL);
+
+ file->file = stream_from_stream_stdout (input_decompressor);
+ file->redirected = true;
+ file->codec = streamed_codec; // data received from input_decompressor is in this codec
+}
+
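+// open a .bz2 txt file for reading with libbz2 - from a local file, a remote URL's stream, or stdin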
+static void file_open_txt_read_bz2 (FileP file)
+{
+ file->file = file->is_remote ? BZ2_bzdopen (fileno (url_open_remote_file (NULL, file->name)), READ) // we're abandoning the FILE structure (and leaking it, if libc implementation dynamically allocates it) and working only with the fd
+ : file->redirected ? BZ2_bzdopen (STDIN_FILENO, READ)
+ : BZ2_bzopen (file->name, READ); // for local files we decompress ourselves
+
+ ASSERT (file->file, "failed to open BZ2 file %s", file->name);
+
+ if (!file->is_remote && !file->redirected) {
+ int fd = BZ2_get_fd (file->file);
+
+ stream_set_inheritability (fd, false); // Windows: allow file_remove in case of --replace
+ #ifdef __linux__
+ posix_fadvise (fd, 0, 0, POSIX_FADV_SEQUENTIAL); // ignore errors
+ #endif
+ }
+}
+
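+// open a plain or gzip-compressed txt file (local, remote or piped-in) and sniff the actual gz flavor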
+static void file_open_txt_read_gz (FileP file)
+{
+ file->file = file->is_remote ? url_open_remote_file (NULL, file->name)
+ : file->redirected ? fdopen (STDIN_FILENO, "rb")
+ : fopen (file->name, READ);
+
+ ASSERT (file->file, "failed to open %s: %s", file->name, strerror (errno));
+
+ if (!file->is_remote && !file->redirected) {
+ stream_set_inheritability (fileno (file->file), false); // Windows: allow file_remove in case of --replace
+ #ifdef __linux__
+ posix_fadvise (fileno ((FILE *)file->file), 0, 0, POSIX_FADV_SEQUENTIAL); // ignore errors
+ #endif
+ }
+
+ txtfile_discover_gz_codec (file); // decide between CODEC_GZ, CODEC_BGZF or CODEC_GZIL
+}
+
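+// open a txt file for reading: determine whether it is local, remote or piped-in, derive its data type and codec
+// from the file name (or --input), and open it either directly or via an external decompressor, per the codec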
+FileP file_open_txt_read (rom filename)
+{
+ FileP file = (FileP)CALLOC (sizeof(File));
+
+ file->supertype = TXT_FILE;
+ file->redirected = !filename; // later on, also CRAM, XZ, BCF will be set as redirected
+ file->mode = READ;
+ file->is_remote = filename && url_is_url (filename);
+ flag.from_url = file->is_remote;
+
+ int64_t url_file_size = 0; // will be -1 if the web/ftp site does not provide the file size
+ rom error = NULL;
+
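+    // tri-state: existence starts as unknown and is resolved below for remote and local (non-stdin) files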
+ thool is_file_exists = unknown;
+
+ if (file->is_remote) {
+        error = url_get_status (filename, &is_file_exists, &url_file_size); // accessing is expensive - get existence and size in one call
+ if (!error && url_file_size >= 0) file->disk_size = (uint64_t)url_file_size;
+ }
+
+ else if (!file->redirected) { // not stdin
+ is_file_exists = file_exists (filename);
+ error = strerror (errno);
+
+ if (is_file_exists)
+ file->disk_size = file_get_size (filename);
+ }
+
+ // return null if genozip input file size is known to be 0, so we can skip it. note: file size of url might be unknown
+ if (is_file_exists == yes && !file->disk_size && !url_file_size &&
+        !(!file->is_remote && !file->redirected && file_is_fifo (filename))) // a fifo is allowed to be size 0 (as it always is)
+ goto fail;
+
+ if (!file->redirected) {
+ ASSINP (is_file_exists != no, "Failed to open \"%s\" for reading: %s", filename, error);
+
+ file_set_filename (file, filename);
+
+ file->basename = filename_base (file->name, false, "", NULL, 0);
+
+ // if user provided the type with --input, we use that, otherwise derive from the file name
+ file->type = flag.stdin_type == GNRIC ? file_get_type_of_generic (file->basename)
+ : flag.stdin_type ? flag.stdin_type
+ : file_get_type (file->basename);
+ }
+
+ else { // stdin
+ file->basename = filename_base (NULL, false, FILENAME_STDIN, NULL, 0);
+ file->type = flag.stdin_type;
+ }
+
+ file->data_type = file_get_data_type_of_input_file (file->type);
+
+ // show meaningful error if file is not a supported data type
+ if (file_open_txt_read_test_valid_dt (file)) goto fail; // skip this file
+
+ // open the file, based on the codec (as guessed by file extension)
+ file->codec = file_get_codec_by_txt_ft (file->data_type, file->type, false);
+ file->source_codec = file_get_codec_by_txt_ft (file->data_type, file->type, true);
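+    // note: file->codec is the compression layer we will actually decode while reading, whereas file->source_codec
+    // records the original compression of the source file (e.g. for .bam, source_codec is BAM but the data is read as BGZF)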
+
+ switch (file->codec) {
+ case CODEC_GZ: case CODEC_BGZF: case CODEC_NONE: gz:
+ file_open_txt_read_gz (file);
+ break;
+
+ case CODEC_BZ2:
+ file_open_txt_read_bz2 (file);
+ break;
+
+ case CODEC_CRAM: {
+            // note: in CRAM, we read the header in advance if possible, directly (without samtools), so we can handle the case
+ // that the reference file is wrong. In samtools, if we read beyond the header with a wrong ref, samtools will hang.
+ if (!file->is_remote && !file->redirected) {
+ cram_inspect_file (file); // if file is indeed CRAM, updates file->est_num_lines, file->header_size, and if not, updates file->data_type and file->codec/source_codec
+ if (file->codec == CODEC_GZ || file->codec == CODEC_NONE) goto gz; // actually, this is a GZ file (possibly BAM)
+ }
+
+ StrTextSuperLong samtools_T_option = cram_get_samtools_option_T (gref);
+
+ file_open_ext_decompessor (file, "samtools", "view", CODEC_BGZF, true, (rom[7]){
+ "--bam", "--uncompressed", // BAM with BGZF blocks in which the payload is not compressed
+ "--threads=10", // in practice, samtools is able to consume ~4 cores
+ file_samtools_no_PG() ? "--no-PG" : SKIP_ARG, // don't add a PG line to the header
+ samtools_T_option.s[0] ? samtools_T_option.s : SKIP_ARG });
+
+ txtfile_discover_gz_codec (file); // also allocates gz_data
+ break;
+ }
+
+ case CODEC_BCF: {
+ file_open_ext_decompessor (file, "bcftools", "view", CODEC_NONE, true, (rom[7]){
+ "--threads=8", "-Ov", "--no-version" }); // BCF: do not append version and command line to the header
+ break;
+ }
+
+ case CODEC_XZ:
+ if (file->redirected) ABORTINP0 ("Compressing piped-in data in xz format is not currently supported");
+
+ file_open_ext_decompessor (file, "xz", NULL, CODEC_NONE, true, (rom[7]){
+ "--threads=8", "--decompress", "--keep", "--stdout",
+ flag.quiet ? "--quiet" : SKIP_ARG });
+ break;
+
+ case CODEC_ZIP:
+ file_open_ext_decompessor (file, "unzip", NULL, CODEC_NONE, true, (rom[7]){
+ "-p", flag.quiet ? "--quiet" : SKIP_ARG });
+ break;
+
+ case CODEC_ORA: {
+ file_open_ext_decompessor (file, "orad", NULL, CODEC_NONE, false, (rom[7]){
+ "--raw", "--quiet", "--stdout",
+ "--threads", str_int_s (global_max_threads).s,
+ (file->is_remote || file->redirected) ? "-" : file->name }); // local file name
+ break;
+ }
+
+ default:
+ ABORT ("%s: invalid filename extension for %s files: %s", global_cmd, dt_name (file->data_type), file->name);
+ }
+
+ if (!file->file) goto fail;
+
+ file_initialize_txt_file_fields (file);
+
+ if (file->is_remote) FREE (error); // allocated by url_get_status
+
+ return file;
+
+fail:
+ if (file->is_remote) FREE (error);
+ FREE (file->name);
+ FREE (file->basename);
+ FREE (file);
+ return NULL;
+}
+
+FileP file_open_txt_write (rom filename, DataType data_type, BgzfLevel bgzf_level)
+{
+ ASSERT (data_type > DT_NONE && data_type < NUM_DATATYPES ,"invalid data_type=%d", data_type);
+
+ FileP file = (FileP)CALLOC (sizeof(File));
+
+ file->supertype = TXT_FILE;
+ file->mode = WRITE;
+ file->data_type = data_type;
+ file->redirected = !filename;
+
+ file->codec = data_type == DT_CRAM ? CODEC_CRAM
+ : data_type == DT_BCF ? CODEC_BCF
+ : bgzf_level != BGZF_NO_BGZF ? CODEC_BGZF // see bgzf_piz_calculate_bgzf_flags
+ : /* BGZF_NO_BGZF */ CODEC_NONE;
+
+ if (!file->redirected) { // not stdout
+ if (file_exists (filename) &&
+ !file_is_fifo (filename) && // a fifo can be "overwritten" (that's just normal writing to a fifo)
+ !flag.force && !flag.test)
+
+ file_ask_user_to_confirm_overwrite (filename); // function doesn't return if user responds "no"
+
+ file_set_filename (file, filename);
+
+ file->type = file_get_type_force_dt (filename, data_type);
+ }
+
+ else // stdout
+ file->type = txt_out_ft_by_dt[data_type][bgzf_level >= 1]; // plain file or .gz file
+
+ file->basename = filename_base (file->name, false, FILENAME_STDOUT, NULL, 0);
+
+ // don't actually open the output file if we're not going to write to it
+ if (flag.no_writer) return file;
+
+ // open the file, based on the codec
+ switch (file->codec) {
+ case CODEC_BGZF :
+ case CODEC_NONE : file->file = file->redirected ? fdopen (STDOUT_FILENO, "wb") : fopen (file->name, WRITE); break;
+
+ case CODEC_CRAM : {
+ StrTextSuperLong samtools_T_option = cram_get_samtools_option_T (gref);
+ file_redirect_output_to_stream (file, "samtools", "view", "-OCRAM",
+ file_samtools_no_PG(),
+ samtools_T_option.s[0] ? samtools_T_option.s : NULL);
+ break;
+ }
+
+ case CODEC_BCF : {
+ char comp_level[4] = { '-', 'l', '0' + MIN_(bgzf_level, 9), 0 };
+
+ if (flag.show_bgzf)
+ iprintf ("%s: launching external compressor \"bcftools\" with bgzf_level=%d\n", file->basename, bgzf_level);
+
+ file_redirect_output_to_stream (file, "bcftools", "view", "-Ob", comp_level, NULL);
+ break;
+ }
+
+ default: {} // never reaches here
+ }
+
+    ASSINP (file->file, "cannot open file \"%s\": %s", file->name, strerror(errno)); // errno will be retrieved even if the open() was called through zlib or bzlib
+
+ file_initialize_txt_file_fields (file);
+
+ return file;
+}
+
+// note: we insert all the z_file buffers into the buffer list in advance and mark them as promiscuous, to avoid this
+// thread safety issue: without this pre-allocation, some of these buffers would be first allocated by the first
+// compute thread to use them, causing buf_alloc to modify evb's buf_list - this is not permitted, as the main
+// thread might be doing so concurrently, resulting in a corrupted evb.buf_list.
+
+static void file_initialize_z_file_data (FileP file)
+{
+ init_dict_id_to_did_map (file->d2d_map);
+ profiler_new_z_file();
+
+ #define Z_INIT(buf) ({ buf_set_promiscuous (&file->buf, "z_file->" #buf); })
+
+ if (file->mode != READ) { // careful not to use IS_ZIP - which is set when reading aux files
+ for (Did did_i=0; did_i < MAX_DICTS; did_i++) {
+ ctx_zip_init_promiscuous (&file->contexts[did_i]); // must be done from main thread
+ file->contexts[did_i].vb_1_pending_merges = -1; // uninitialized - will be initialized in ctx_set_vb_1_pending_merges
+ }
+ __atomic_thread_fence (__ATOMIC_RELEASE); // release all vb_1_pending_merges
+
+        // initialize z_file's "promiscuous" buffers - i.e. buffers that can be allocated by any thread (obviously protected by e.g. a mutex)
+ // promiscuous buffers must be initialized by the main thread, and buffer.c does not verify their integrity.
+ Z_INIT (ra_buf);
+ Z_INIT (sag_grps);
+ Z_INIT (sag_alns);
+ Z_INIT (sag_qnames);
+ Z_INIT (sag_cigars); // union with solo_data
+ Z_INIT (sag_seq);
+ Z_INIT (sag_qual);
+ Z_INIT (deep_index_by[BY_SEQ]);
+ Z_INIT (deep_index_by[BY_QNAME]);
+ Z_INIT (deep_ents);
+ Z_INIT (section_list_buf);
+ Z_INIT (contexts[CHROM].chrom2ref_map);
+ }
+ else {
+ Z_INIT (sag_qual);
+ Z_INIT (sag_cigars); // union with solo_data
+ }
+
+ if (flag.no_biopsy_line) // no need to initialize in --biopsy-line (as destroying it later will error)
+ serializer_initialize (file->digest_serializer);
+
+ clock_gettime (CLOCK_REALTIME, &file->start_time);
+}
+
+// get time since creation of z_file object in memory
+StrText file_get_z_run_time (FileP file)
+{
+ TimeSpecType tb;
+ clock_gettime(CLOCK_REALTIME, &tb);
+
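+    // elapsed wallclock: combine the tv_sec and tv_nsec deltas into milliseconds, then truncate to whole seconds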
+ int seconds_so_far = ((tb.tv_sec - file->start_time.tv_sec)*1000 +
+ ((int64_t)tb.tv_nsec - (int64_t)file->start_time.tv_nsec) / 1000000) / 1000;
+
+ return str_human_time (seconds_so_far, true);
+}
+
+// opens z_file for reading
+FileP file_open_z_read (rom filename)
+{
+ START_TIMER;
+
+ ASSINP0 (filename, "it is not possible to redirect genozip files from stdin");
+
+ FileP file = (FileP)CALLOC (sizeof(File));
+
+ file->supertype = Z_FILE;
+ file->mode = READ;
+ file->is_in_tar = (flag.t_offset > 0);
+
+ if (flag.debug_tar)
+ iprintf ("file_open_z_read: t_offset=%"PRIu64" t_size=%"PRIu64" %s\n", flag.t_offset, flag.t_size, filename);
+
+ rom disk_filename = file->is_in_tar ? tar_get_tar_name() : filename;
+ ASSINP (file_exists (disk_filename), "cannot open \"%s\" for reading: %s", disk_filename, strerror (errno));
+
+ file->disk_size = file->is_in_tar ? flag.t_size : file_get_size (filename);
+
+ file_set_filename (file, filename);
+
+ file->type = file_get_type (file->name);
+
+ file->basename = filename_base (file->name, false, NULL, NULL, 0);
+
+ // if a FASTA file was given as an argument to --reference or --REFERENCE, get the .ref.genozip file,
+    // possibly running --make-reference in a separate process if needed
+ if (flag.reading_reference && (file_get_data_type_of_input_file (file_get_type (file->name)) == DT_FASTA) && (file_get_type (file->name) != FASTA_GENOZIP))
+ disk_filename = ref_fasta_to_ref (file);
+
+ ASSINP (!flag.reading_reference || filename_has_ext (file->name, REF_GENOZIP_),
+ "You specified file \"%s\", however with --reference or --REFERENCE, you must specify a reference file (%s file or FASTA file)\n"
+ "Tip: To create a genozip reference file from a FASTA file, use 'genozip --make-reference myfasta.fa'",
+ file->name, REF_GENOZIP_);
+
+ if ((!flag.seg_only && !flag.show_bam) || flag_loading_auxiliary) {
+
+ // make sure file is a regular file (not FIFO, directory etc)
+ struct stat sb;
+ int cause=0, stat_errno=0;
+ if (stat (disk_filename, &sb)) {
+ cause = 6; // stat failed
+ stat_errno = errno;
+ }
+
+        if (!cause && (sb.st_mode & S_IFMT) != S_IFREG) cause=7; // not regular file (don't test if stat failed - sb would be uninitialized)
+
+        if (!cause) {
+            file->file = fopen (disk_filename, READ);
+
+            if (file->file)
+                stream_set_inheritability (fileno (file->file), false); // Windows: allow file_remove in case of --replace
+        }
+
+ // verify that this is a genozip file
+ // we read the Magic at the end of the file (as the magic at the beginning may be encrypted)
+ uint32_t magic;
+ if (cause ||
+ (cause = 1 * !file->file) ||
+ (cause = 2 * !sb.st_size) ||
+ (cause = 3 * !file_seek (file, -(int)sizeof (magic), SEEK_END, READ, SOFT_FAIL)) ||
+ (cause = 4 * !fread (&magic, sizeof (magic), 1, file->file)) ||
+ (cause = 5 * (BGEN32 (magic) != GENOZIP_MAGIC && !(flag.show_headers && flag.force)))) {
+
+ int fail_errno = errno;
+ FCLOSE (file->file, disk_filename);
+
+ if (flag.validate == VLD_REPORT_INVALID)
+ flag.validate = VLD_INVALID_FOUND;
+
+ else if (flag.validate == VLD_NO_REPORT)
+ exit (EXIT_INVALID_GENOZIP_FILE); // silent exit with error code, if even a single file is not valid
+
+ rom cause_str = cause==1 ? strerror (fail_errno)
+ : cause==2 ? "file is empty"
+ : cause==3 ? "file_seek failed"
+ : cause==4 ? "fread failed"
+ : cause==5 ? "Not a valid genozip file (bad magic)"
+ : cause==6 ? strerror (stat_errno)
+ : cause==7 ? "Not a regular file"
+ : "no error";
+
+ if (flag.multiple_files) {
+
+ if (flag.validate == VLD_INVALID_FOUND) // outputs even if --quiet
+ iprintf ("Cannot open %s: %s\n", disk_filename, cause_str);
+
+ else if (flag.validate == VLD_NONE) { // silenced by --quiet
+ static int once=0;
+ WARN ("Skipping %s: %s%s", file->name, cause_str,
+ !(once++) ? " (--quiet to silence this message)" : "");
+ }
+
+ file_close (&file);
+
+ goto done;
+ }
+ else { // single file
+ if (flag.validate == VLD_REPORT_VALID)
+ exit (EXIT_INVALID_GENOZIP_FILE); // exit quietly - with a return code indicating invalidity
+ else
+ ABORTINP ("Cannot open %s: %s %s", disk_filename, cause_str,
+ (cause==3 || cause==4) ? strerror(fail_errno) : "");
+ }
+ }
+
+ // file is valid
+ else if (flag.validate == VLD_REPORT_VALID)
+ iprintf ("%s\n", file->name); // print just filename, so a script can use this output
+ }
+
+    file->data_type = DT_NONE; // we will get the data type from the genozip header, not from the file name
+
+ file_initialize_z_file_data (file);
+
+    ASSINP (file->file, "cannot open file \"%s\": %s", file->name, strerror(errno)); // errno will be retrieved even if the open() was called through zlib or bzlib
+
+done:
+ COPY_TIMER_EVB (file_open_z);
+ return file;
+}
+
+// opens z_file for writing
+FileP file_open_z_write (rom filename, FileMode mode, DataType data_type, Codec source_codec)
+{
+ START_TIMER;
+
+ ASSINP0 (filename, "it is not possible to redirect genozip files to stdout");
+
+ FileP file = (FileP)CALLOC (sizeof(File));
+
+ file->supertype = Z_FILE;
+ file->mode = mode;
+ file->is_in_tar = tar_zip_is_tar();
+
+ if (file_exists (filename) &&
+ !flag.force &&
+ !flag.zip_no_z_file && // not zip with --seg-only
+ !file->is_in_tar)
+
+ file_ask_user_to_confirm_overwrite (filename); // function doesn't return if user responds "no"
+
+ file_set_filename (file, filename);
+
+ file->type = file_get_type_force_dt (file->name, data_type);
+ file->data_type = data_type;
+ file->source_codec = source_codec;
+
+ file->basename = filename_base (file->name, false, NULL, NULL, 0);
+
+ ASSINP (filename_has_ext (file->name, GENOZIP_EXT),
+ "file %s must have a " GENOZIP_EXT " extension", file_printname (file));
+
+ // set file->type according to the data type, overriding the previous setting - i.e. if the user
+ // uses the --output option, he is unrestricted in the choice of a file name
+ file->type = file_get_z_ft_by_txt_in_ft (file->data_type, txt_file->type);
+
+ mutex_initialize (file->dicts_mutex);
+ mutex_initialize (file->custom_merge_mutex);
+ mutex_initialize (file->zriter_mutex);
+
+ if (!flag.zip_no_z_file) {
+
+ if (flag.force && !file->is_in_tar)
+ unlink (file->name); // delete file if it already exists (needed in weird cases, eg symlink to non-existing file)
+
+        // if we're writing to a tar file, we get the already-opened tar file
+        if (file->is_in_tar)
+            file->file = tar_open_file (file->name, file->name);
+            // note: tar doesn't have a z_reread_file because --pair and --deep are not yet supported with --tar
+
+ else {
+ file->file = fopen (file->name, file->mode);
+
+ if (!flag.no_zriter)
+ file->z_reread_file = fopen (file->name, READ);
+
+#ifndef _WIN32
+ // set z_file permissions to be the same as the txt_file permissions (if possible)
+ if (file->file && txt_file && txt_file->name && !txt_file->is_remote) {
+ struct stat st;
+ if (stat (txt_file->name, &st))
+ WARN ("FYI: Failed to set permissions of %s because failed to stat(%s): %s", file->name, txt_file->name, strerror(errno));
+
+ else
+ chmod (file->name, st.st_mode); // ignore errors (e.g. this doesn't work on NTFS)
+ }
+#endif
+ }
+ }
+
+ file->genozip_version = code_version_major(); // to allow the VER macro to operate consistently across ZIP/PIZ
+ file->genozip_minor_ver = code_version_minor();
+
+ file_initialize_z_file_data (file);
+
+    ASSINP (file->file || flag.zip_no_z_file,
+            "cannot open file \"%s\": %s", file->name, strerror(errno)); // errno will be retrieved even if the open() was called through zlib or bzlib
+
+ COPY_TIMER_EVB (file_open_z);
+ return file;
+}
+
+// index the file if it is a disk file of a type that can be indexed
+static void file_index_txt (ConstFileP file)
+{
+ ASSERTNOTNULL (file);
+
+ RETURNW (file->name,, "%s: cannot create an index file when output goes to stdout", global_cmd);
+
+ StreamP indexing = NULL;
+
+ switch (file->data_type) {
+ case DT_SAM:
+ case DT_BAM:
+ RETURNW (file->codec == CODEC_BGZF,, "%s: output file needs to be a .sam.gz or .bam to be indexed", global_cmd);
+ indexing = stream_create (0, 0, 0, 0, 0, 0, 0, "to create an index", "samtools", "index", file->name, NULL);
+ break;
+
+ case DT_VCF:
+ RETURNW (file->codec == CODEC_BGZF,, "%s: output file needs to be a .vcf.gz or .bcf to be indexed", global_cmd);
+            RETURNW (vcf_header_get_has_fileformat(),, "%s: file needs to start with ##fileformat=VCF to be indexed", global_cmd);
+ indexing = stream_create (0, 0, 0, 0, 0, 0, 0, "to create an index", "bcftools", "index", file->name, NULL);
+ break;
+
+ case DT_FASTQ:
+ case DT_FASTA:
+ RETURNW (file->codec == CODEC_BGZF || file->codec == CODEC_NONE,,
+ "%s: To be indexed, the output file cannot be compressed with %s", global_cmd, codec_name (file->codec));
+ indexing = stream_create (0, 0, 0, 0, 0, 0, 0, "to create an index", "samtools", "faidx", file->name, NULL);
+ break;
+
+ default: break; // we don't know how to create an index for other data types
+ }
+
+ if (indexing) {
+ progress_new_component (file->basename, "Indexing", false, NULL);
+
+ stream_wait_for_exit (indexing, false);
+
+ progress_finalize_component_time ("Done indexing", DIGEST_NONE);
+ }
+}
+
+void file_close (FileP *file_p)
+{
+ START_TIMER;
+
+ FileP file = *file_p;
+
+ if (z_file && file == z_file && !flag_loading_auxiliary &&
+ flag.show_time_comp_i == COMP_ALL && !flag.show_time[0]) // show-time without the optional parameter
+ profiler_add_evb_and_print_report();
+
+ __atomic_store_n (file_p, (FileP)NULL, __ATOMIC_RELAXED);
+
+ if (!file) return; // nothing to do
+
+ if (file->file && file->supertype == TXT_FILE) {
+
+ if (file->mode == READ && file->codec == CODEC_BZ2)
+ BZ2_bzclose((BZFILE *)file->file);
+
+ else if (file->mode == READ && is_read_via_ext_decompressor (file))
+ stream_close (&input_decompressor, STREAM_WAIT_FOR_PROCESS);
+
+ else if (file->mode == WRITE && is_written_via_ext_compressor (file->codec))
+ stream_close (&output_compressor, STREAM_WAIT_FOR_PROCESS);
+
+        // if it's stdout - just flush, don't close - we might need it for the next file
+ else if (file->mode == WRITE && flag.to_stdout)
+ fflush ((FILE *)file->file);
+
+ else if (file->is_remote)
+ url_close_remote_file_stream ((FILE**)&file->file);
+
+ else
+ FCLOSE (file->file, file_printname (file));
+
+ // create an index file using samtools, bcftools etc, if applicable
+ if (file->mode == WRITE && flag.index_txt && !flag_loading_auxiliary)
+ file_index_txt (file);
+ }
+
+ else if (file->file && file->supertype == Z_FILE) {
+
+        // ZIP note: we need to destroy all, even if unused, because they were initialized in file_initialize_z_file_data
+ if (IS_ZIP)
+ for (Did did_i=0; did_i < (IS_ZIP ? MAX_DICTS : file->num_contexts); did_i++)
+ mutex_destroy (file->ctx_mutex[did_i]);
+
+ if (file->is_in_tar && file->mode != READ)
+ tar_close_file (&file->file);
+ else {
+ FCLOSE (file->file, file_printname (file));
+ FCLOSE (file->z_reread_file, file_printname (file));
+ }
+ serializer_destroy (file->digest_serializer);
+ }
+
+ // free resources if we are NOT near the end of the execution. If we are at the end of the execution
+ // it is faster to just let the process die
+
+ if (!flag.let_OS_cleanup_on_exit) {
+
+ if (IS_PIZ && flag.deep && file->supertype == Z_FILE) { // in this case, deep_index and deep_ents are Buffers containing arrays of Buffers
+ for_buf (Buffer, buf, file->deep_index) buf_destroy (*buf);
+ for_buf (Buffer, buf, file->deep_ents) buf_destroy (*buf);
+ huffman_destroy (&file->qname_huf);
+ }
+
+ buflist_destroy_file_bufs (file);
+
+ mutex_destroy (file->zriter_mutex);
+ mutex_destroy (file->dicts_mutex);
+ mutex_destroy (file->custom_merge_mutex);
+ mutex_destroy (file->qname_huf_mutex);
+ mutex_destroy (file->recon_plan_mutex);
+
+ FREE (file->name);
+ FREE (file->basename);
+ FREE (file);
+ }
+
+ COPY_TIMER_EVB (file_close);
+}
+
+void file_remove (rom filename, bool fail_quietly)
+{
+    chmod (filename, S_IRUSR | S_IWUSR); // make sure it's +w so we don't get permission denied (ignore errors)
+
+#ifndef _WIN32
+ int ret = remove (filename);
+ ASSERTW (!ret || fail_quietly, "Warning: failed to remove %s: %s", filename, strerror (errno));
+#else
+ ASSERTW (DeleteFile (filename) || fail_quietly, "Warning: failed to remove %s: %s", filename, str_win_error());
+#endif
+}
+
+bool file_rename (rom old_name, rom new_name, bool fail_quietly)
+{
+    chmod (old_name, S_IRUSR | S_IWUSR); // make sure it's +w so we don't get permission denied (ignore errors)
+
+ int ret = rename (old_name, new_name);
+ ASSERTW (!ret || fail_quietly, "Warning: failed to rename %s to %s: %s", old_name, new_name, strerror (errno));
+
+ return !ret; // true if successful
+}
+
+// also updates filename to .gz (but not if .bam)
+void file_gzip (char *filename)
+{
+ unsigned fn_len = strlen (filename);
+
+ char command[fn_len + 50];
+
+ int ret = 1;
+
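+    // try bgzip first (multi-threaded); if it is not installed, fall back to pigz, and then to plain gzip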
+ snprintf (command, sizeof (command), "bgzip -@%u -f \"%s\" %s", global_max_threads, filename, flag.is_windows ? "" : " > /dev/null 2>&1");
+ ret = system (command); // note: runs sh on Unix, and cmd.exe on Windows
+
+ if (ret && errno == ENOENT) { // no bgzip - try pigz
+ snprintf (command, sizeof (command), "pigz -f \"%s\" %s", filename, flag.is_windows ? "" : " > /dev/null 2>&1");
+ ret = system (command);
+ }
+
+ if (ret && errno == ENOENT) { // no pigz - try gzip
+ snprintf (command, sizeof (command), "gzip -f \"%s\" %s", filename, flag.is_windows ? "" : " > /dev/null 2>&1");
+ ret = system (command);
+ }
+
+ ASSERTW (!ret, "FYI: \"%s\" returned %d. No harm.", command, ret);
+
+ if (!ret) {
+ // special case: rename .bam.gz -> .bam
+ if (fn_len >= 4 && !memcmp (&filename[fn_len-4], ".bam", 4)) {
+ char gz_filename[fn_len + 10];
+ snprintf (gz_filename, sizeof (gz_filename), "%s.gz", filename);
+ file_remove (filename, true);
+ file_rename (gz_filename, filename, false);
+ }
+ else
+ strcpy (&filename[fn_len], ".gz");
+ }
+}
+
+void file_mkfifo (rom filename)
+{
+#ifndef _WIN32
+ file_remove (filename, true);
+ ASSERT (!mkfifo (filename, 0666), "mkfifo failed for %s: %s", filename, strerror (errno));
+
+#else
+ ABORT0 ("file_mkfifo not supported on Windows");
+#endif
+}
+
+bool file_is_fifo (rom filename)
+{
+ if (flag.is_windows) return false; // we don't support FIFOs in Win32 yet
+
+ struct stat st;
+ ASSERT (!stat (filename, &st), "stat failed on %s", filename);
+
+ return S_ISFIFO (st.st_mode);
+}
+
+bool file_exists (rom filename)
+{
+
+ if (!filename || !filename[0]) return false;
+ bool exists = !access (filename, F_OK);
+
+#ifdef _WIN32
+ // TO DO: overcome this limitation, see: https://docs.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation
+ if (!exists && strlen (filename) > MAX_PATH)
+        WARN_ONCE ("Genozip limitation: filenames on Windows are limited to %u characters. Please contact "EMAIL_SUPPORT" for advice: %s", MAX_PATH, filename);
+#endif
+
+ return exists;
+}
+
+// returns true if successful. depending on fail_type, a failure will either emit an error
+// (and exit) or a warning (and return).
+bool file_seek (FileP file, int64_t offset,
+ int whence, // SEEK_SET, SEEK_CUR or SEEK_END
+ rom mode, // READ if seeking before reading, WRITE if before writing
+ FailType fail_type)
+{
+ ASSERTNOTNULL (file);
+ ASSERTNOTNULL (file->file);
+
+ if (file->supertype == Z_FILE) {
+
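+        // SEEK_END within a z_file stored in a tar: convert to an absolute SEEK_SET position at the end of this file's data in the tar (t_offset + t_size)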
+ if (whence == SEEK_END && file->is_in_tar && IS_PIZ) {
+ offset += flag.t_offset + flag.t_size;
+ whence = SEEK_SET;
+ goto test_already_there;
+ }
+
+ // in SEEK_SET of a z_file that is being tarred, update the offset to the beginning of the file data in the tar file
+ else if (whence == SEEK_SET) {
+ offset += (IS_ZIP ? tar_file_offset() : flag.t_offset); // 0 if not using tar
+
+ test_already_there:
+ if (ftello64 (GET_FP(file, mode)) == offset) return true; // already at the right offset
+ }
+ }
+
+ int ret = fseeko64 (GET_FP(file, mode), offset, whence);
+
+ if (fail_type != HARD_FAIL) {
+ if (!flag.to_stdout && fail_type==WARNING_FAIL) {
+ ASSERTW (!ret, errno == EINVAL ? "Warning: Error while reading file %s (fseeko64 (whence=%d offset=%"PRId64")): it is too small%s"
+ : "Warning: fseeko64 failed on file %s (whence=%d offset=%"PRId64"): %s",
+ file_printname (file), whence, offset, errno == EINVAL ? "" : strerror (errno));
+ }
+ }
+ else
+ ASSERT (!ret, "fseeko64(offset=%"PRId64" whence=%d) failed on file %s (FILE*=%p remote=%s redirected=%s): %s",
+ offset, whence, file_printname (file), file->file, TF(file->is_remote), TF(file->redirected), strerror (errno));
+
+ return !ret;
+}
+
+int64_t file_tell_do (FileP file, FailType soft_fail, rom func, unsigned line)
+{
+ ASSERTNOTNULL (file);
+ ASSERTNOTNULL (file->file);
+
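+    // special cases: GZ and BZ2 inputs are decompressed internally, so report our own accounting of compressed bytes rather than ftell of the underlying FILE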
+ if (IS_ZIP && file->supertype == TXT_FILE && file->codec == CODEC_GZ)
+ return txt_file->disk_so_far;
+
+ if (IS_ZIP && file->supertype == TXT_FILE && file->codec == CODEC_BZ2)
+ return BZ2_consumed ((BZFILE *)file->file);
+
+ int64_t offset = ftello64 ((FILE *)file->file);
+ ASSERT (offset >= 0 || soft_fail, "called from %s:%u: ftello64 failed for %s (FILE*=%p remote=%s redirected=%s): %s",
+ func, line, file->name, file->file, TF(file->is_remote), TF(file->redirected), strerror (errno));
+
+ if (offset < 0) return -1; // soft fail
+
+    // if this z_file is being tarred, convert the absolute position to an offset relative to the beginning of the file's data within the tar
+ if (file->supertype == Z_FILE)
+ offset -= tar_file_offset(); // 0 if not using tar
+
+ return offset;
+}
+
+uint64_t file_get_size (rom filename)
+{
+ struct stat64 st;
+
+ int ret = stat64(filename, &st);
+ ASSERT (!ret, "stat64 failed on '%s': %s", filename, strerror(errno));
+
+ return st.st_size;
+}
+
+bool file_is_dir (rom filename)
+{
+ ASSERTNOTNULL (filename);
+ int filename_len = strlen (filename);
+
+ // temporarily remove trailing /
+ if (filename[filename_len-1] == '/' || filename[filename_len-1] == '\\')
+ filename_len--;
+
+ SAFE_NULT (filename);
+
+ struct stat64 st;
+ int ret = stat64 (filename, &st); // 0 if successful
+
+ SAFE_RESTORE;
+
+ return !ret && S_ISDIR (st.st_mode);
+}
+
+void file_mkdir (rom dirname)
+{
+ if (file_is_dir (dirname)) return; // already exists - that's ok
+
+#ifdef _WIN32
+    int ret = _mkdir (dirname);
+#else
+    int ret = mkdir (dirname, 0777);
+#endif
+    ASSERT (!ret, "mkdir(%s) failed: %s", dirname, strerror (errno));
+}
+
+// reads an entire file into a buffer. if filename is "-", reads from stdin
+void file_get_file (VBlockP vb, rom filename, BufferP buf, rom buf_name,
+ uint64_t max_size, // 0 to read entire file, or specify for max size
+ FileContentVerificationType ver_type, bool add_string_terminator)
+{
+ bool is_stdin = !strcmp (filename, "-");
+ if (is_stdin && !max_size) max_size = 10000000; // max size for stdin
+
+ uint64_t file_size = is_stdin ? 0 : file_get_size (filename);
+
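+    // how much to read: stdin and files of unknown size are capped at max_size; otherwise read up to max_size (if given) or the whole file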
+ uint64_t size = is_stdin ? max_size
+ : !file_size ? max_size
+ : max_size ? MIN_(max_size, file_size)
+ : file_size;
+
+ buf_alloc (vb, buf, 0, size + add_string_terminator, char, 1, buf_name);
+
+ FILE *file = is_stdin ? stdin : fopen (filename, "rb");
+ ASSINP (file, "cannot open \"%s\": %s", filename, strerror (errno));
+
+ buf->len = fread (buf->data, 1, size, file);
+ ASSERT (is_stdin || max_size || buf->len == size, "Error reading file %s: %s", filename, strerror (errno));
+
+ ASSINP (ver_type != VERIFY_ASCII || str_is_printable (STRb(*buf)), "Expecting %s to contain text (ASCII)", filename);
+ ASSINP (ver_type != VERIFY_UTF8 || str_is_utf8 (STRb(*buf)), "Expecting %s to contain ASCII or UTF-8 text", filename);
+
+ if (add_string_terminator)
+ *BAFTc (*buf) = 0;
+
+ FCLOSE (file, filename);
+}
+
+// writes data to a file and flushes it, returns true if successful
+
+static Mutex put_data_mutex = {};
+#define MAX_PUT_DATA_FILES_PER_EXECUTION 10 // maximum files deletable at abort
+static rom put_data_tmp_filenames[MAX_PUT_DATA_FILES_PER_EXECUTION] = {};
+static unsigned num_put_files=0;
+
+bool file_put_data (rom filename, const void *data, uint64_t len,
+ mode_t mode) // optional - ignored if 0
+{
+ int fn_len = strlen (filename);
+
+ // remove invalid characters from filename
+ if (flag.is_windows)
+ for (char *c=(char*)filename ; *c ; c++)
+            if (*c == ':' && (c-filename != 1)) *c = '-'; // ':' exists e.g. in SAM AUX names
+
+ int tmp_filename_size = fn_len + 5;
+ char *tmp_filename = MALLOC (tmp_filename_size);
+    // we first write to tmp_filename, and only after we complete writing and flushing do we rename it to the final name,
+    // so that if a file exists under its final name, it is guaranteed to be fully written
+ snprintf (tmp_filename, tmp_filename_size, "%s.tmp", filename);
+
+ file_remove (filename, true);
+ file_remove (tmp_filename, true);
+
+ FILE *file = fopen (tmp_filename, "wb");
+ if (!file) return false;
+
+ // save file name in put_data_tmp_filenames, to be deleted in case of aborting by file_put_data_abort
+ mutex_initialize (put_data_mutex); // first caller initializes. not thread safe, but good enough for the purpose.
+ mutex_lock (put_data_mutex);
+
+ unsigned my_file_i = num_put_files;
+
+ if (num_put_files < MAX_PUT_DATA_FILES_PER_EXECUTION)
+ put_data_tmp_filenames[num_put_files++] = tmp_filename;
+
+ mutex_unlock (put_data_mutex);
+
+ // write in blocks (Windows hangs if the block is too big, a few GB)
+ size_t written = 0;
+ const uint64_t block_size = 1 << 24; // 16MB
+ for (int i=0; i < (len + block_size - 1) / block_size; i++) // round up
+ written += fwrite (&((rom)data)[i * block_size], 1, MIN_(block_size, len - i*block_size), file);
+
+ SAVE_VALUE (errno);
+ fflush (file);
+ FCLOSE (file, tmp_filename);
+    RESTORE_VALUE (errno); // in case the caller wants to print the fwrite error
+
+ if (written != len) {
+ WARN ("Failed to write %s: wrote only %"PRIu64" bytes of the expected %"PRIu64, tmp_filename, (uint64_t)written, len);
+ put_data_tmp_filenames[my_file_i] = NULL; // no need to lock mutex
+ remove (tmp_filename);
+ return false;
+ }
+
+    // lock: we can't proceed while file_put_data_abort is active, and once we hold the lock, it must wait for us before deleting tmp files
+ mutex_lock (put_data_mutex);
+
+ remove (filename);
+ int renamed_failed = rename (tmp_filename, filename);
+
+ put_data_tmp_filenames[my_file_i] = NULL; // remove tmp file name from list
+
+ mutex_unlock (put_data_mutex);
+
+ ASSERT (!renamed_failed, "Failed to rename %s to %s: %s", tmp_filename, filename, strerror (errno));
+ FREE (tmp_filename);
+
+ if (mode)
+ ASSERT (!chmod (filename, mode), "Failed to chmod %s: %s", filename, strerror (errno));
+
+ return true;
+}
+
+// error handling: unlink files currently being written (the actual writing will terminate when the thread terminates)
+void file_put_data_abort (void)
+{
+ if (!put_data_mutex.initialized) return;
+
+ mutex_lock (put_data_mutex);
+
+ for (unsigned i=0; i < num_put_files; i++)
+ if (put_data_tmp_filenames[i]) {
+ // TODO: this works on Linux but not Windows - gets "Permission Denied" if file_put_data is in fflush()
+ unlink (put_data_tmp_filenames[i]); // ignore errors
+            remove (put_data_tmp_filenames[i]); // in case unlink failed - e.g. NTFS - ignore errors
+ }
+
+ // mutex remains locked - no more files can be put after this point
+}
+
+void file_put_data_reset_after_fork (void)
+{
+ put_data_mutex = (Mutex){};
+}
+
+PutLineFn file_put_line (VBlockP vb, STRp(line), rom msg)
+{
+ PutLineFn fn;
+ snprintf (fn.s, sizeof (fn.s), "line.%u.%d.%s%s", vb->vblock_i, vb->line_i, command==ZIP ? "zip" : "piz",
+ file_plain_ext_by_dt ((VB_DT(SAM) && z_file->z_flags.txt_is_bin) ? DT_BAM : vb->data_type));
+
+ file_put_data (fn.s, STRa(line), 0);
+
+ if (IS_PIZ)
+        WARN ("\n%s line=%s line_in_file(1-based)=%"PRId64". Dumped %s (dumping first occurrence only)",
+ msg, line_name(vb).s, writer_get_txt_line_i (vb, vb->line_i), fn.s);
+ else
+ WARN ("\n%s line=%s vb_size=%u MB. Dumped %s", msg, line_name(vb).s, (int)(segconf.vb_size >> 20), fn.s);
+
+ return fn;
+}
+
+void file_assert_ext_decompressor (void)
+{
+ if (!stream_wait_for_exit (input_decompressor, false)) return; // just a normal EOF - all good!
+
+ if (flag.truncate) return; // truncated as requested - all good
+
+ // read error from stderr
+    #define INPUT_DECOMPRESSOR_RESPONSE_LEN 4096
+    char error_str[INPUT_DECOMPRESSOR_RESPONSE_LEN];
+
+    FILE *stderr_pipe = stream_from_stream_stderr (input_decompressor);
+    int bytes_read = fread (error_str, 1, INPUT_DECOMPRESSOR_RESPONSE_LEN-1, stderr_pipe);
+ error_str[bytes_read] = 0; // string terminator
+
+ ABORT ("%s: failed to read file: %s\n%s: %s",
+ global_cmd, txt_name, stream_get_exec_name (input_decompressor), error_str);
+}
+
+// used when aborting due to an error. avoid the compressors outputting their own errors after our process is gone
+void file_kill_external_compressors (void)
+{
+ stream_close (&input_decompressor, STREAM_KILL_PROCESS);
+ stream_close (&output_compressor, STREAM_KILL_PROCESS);
+}
+
+rom ft_name (FileType ft)
+{
+ return type_name (ft, &file_exts[ft], ARRAY_LEN(file_exts));
+}
+
+rom file_plain_ext_by_dt (DataType dt)
+{
+ FileType plain_ft = txt_in_ft_by_dt[FAF ? DT_FASTA : dt][0].in;
+
+ return file_exts[plain_ft];
+}
+
+bool file_buf_locate (FileP file, ConstBufferP buf)
+{
+ return is_p_in_range (buf, file, sizeof (File));
+}
diff --git a/src/file.h b/src/file.h
index ff77684d..19dd76d7 100644
--- a/src/file.h
+++ b/src/file.h
@@ -32,17 +32,19 @@ typedef struct File {
FileType type;
bool is_remote; // true if file is downloaded from a url
bool redirected; // TXT_FILE: true if this file is redirected from stdin/stdout or a pipe
- bool is_eof; // we've read the entire file
+ bool no_more_blocks; // ZIP TXT_FILE: txtfile_read_block has completed returning all the file's data (note: it is possible that we read all the data on the disk file so feof(fp)=true, but no_more_blocks=false, because some of it is waiting in buffers: gz_data or state->avail_in)
bool is_in_tar; // z_file: file is embedded in tar file
bool is_scanned; // TXT_FILE: sam_sag_by_flag_scan_for_depn has been performed for this file
DataType data_type;
Codec source_codec; // TXT_FILE ZIP: codec of txt file before redirection (eg CRAM, XZ, ZIP...). Note: CODEC_BAM if BAM (with or without internal bgzf compression)
// Z_FILE PIZ: set to CODEC_BCF or CODEC_CRAM iff GenozipHeader.data_type is DT_BCF/DT_CRAM
- Codec codec; // TXT_FILE ZIP: generic codec used with this file. If redirected - as read by txtfile (eg for cram files this is BGZF)
+ Codec codec; // TXT_FILE ZIP: internal decompression codec used with this file. If redirected - as read by txtfile (eg for cram files this is BGZF)
+ Codec gunzip_method; // TXT_FILE ZIP: if codec∈{GZ,BGZF,GZIL}, method used to decompress it (either the same as codec, or GZ)
// these relate to actual bytes on the disk
int64_t disk_size; // ZIP: size of actual file on disk. 0 if not known (eg stdin or http stream).
int64_t disk_so_far; // ZIP: Z/TXT_FILE: data actually read/write to/from "disk" (using fread/fwrite), (TXT_FILE: possibley bgzf/gz/bz2 compressed ; 0 if external compressor is used for reading).
+ int64_t disk_gz_uncomp_or_trunc; // ZIP: TXT_FILE: gz-compressed data actually either decompressed or discarded due to truncate
int64_t est_seggable_size; // TXT_FILE ZIP, access via txtfile_get_seggable_size(). Estimated size of txt_data in file, i.e. excluding the header. It is exact for plain files, or based on test_vb if the file has source compression
int64_t est_num_lines; // TXT_FILE ZIP, an alternative for progress bar - by lines instead of bytes (used for CRAM)
@@ -119,23 +121,20 @@ typedef struct File {
Buffer section_list_save; // a copy of the section_list in case it is modified due to recon plan.
// TXT file: reading
- Buffer unconsumed_txt; // ZIP: excess uncompressed data read from the txt file - moved to the next VB
+ Buffer unconsumed_txt; // ZIP: excess uncompressed data read from the txt file - moved to the next VB: the final part of vb->txt_data that was not consumed
+ Buffer unconsumed_bgz_blocks; // ZIP TXT BGZF/GZIL: unconsumed or partially consumed bgzf/gzil blocks - moved to the next VB
+ Buffer gz_data; // ZIP TXT GZ: yet-unconsumed gz data read from disk. .comp_len/.uncomp_len refer to the first block in the buffer (in GZIL, but not BGZF, there might be additional data after the first block)
+ Buffer igzip_state; // ZIP TXT GZ (with igzip)
// TXT file: BGZF stuff reading and writing compressed txt files
- Buffer unconsumed_bgzf_blocks; // ZIP TXT BGZF: unconsumed or partially consumed bgzf blocks - moved to the next VB
- Buffer bgzf_isizes; // ZIP/PIZ: uncompressed size of the bgzf blocks in which this txt file is compressed (in BGEN16)
- Buffer bgzf_starts; // ZIP: offset in txt_file of each BGZF block
- Buffer bgzf_plausible_levels; // ZIP: discovering library/level. .count = number of BGZF blocks tested so far
- struct FlagsBgzf bgzf_flags; // corresponds to SectionHeader.flags in SEC_BGZF
- uint8_t bgzf_signature[3]; // PIZ: 3 LSB of size of source BGZF-compressed file, as passed in SectionHeaderTxtHeader.codec_info
-
- // TXT file: IGZIP stuff reading and writing compressed txt files
- Buffer igzip_data; // ZIP TXT GZ (with igzip): yet-uncompressed data read from disk
- Buffer igzip_state; // ZIP TXT GZ (with igzip)
- uint64_t gzip_start_Ltxt; // ZIP TXT GZ: Ltxt at the beginning for this gzip_section
+ Buffer bgzf_isizes; // ZIP/PIZ: BGZF: uncompressed size of the BGZF blocks in which this txt file is compressed (in BGEN16).
+ // ZIP : GZIL: only .len is used to count GZIL blocks (as their isize is always 1MB except for the last block)
+ Buffer bgzf_starts; // ZIP: offset in txt_file of each BGZF block
+ Buffer bgzf_plausible_levels; // ZIP: discovering library/level. .count = number of BGZF blocks tested so far
+ struct FlagsBgzf bgzf_flags; // corresponds to SectionHeader.flags in SEC_BGZF
+ uint8_t bgzf_signature[3]; // PIZ: 3 LSB of size of source BGZF-compressed file, as passed in SectionHeaderTxtHeader.codec_info
// TXT FILE: accounting for truncation when --truncate-partial-last-line is used
- bool bgzf_truncated_last_block; // ZIP: detected a truncated last block
uint32_t last_truncated_line_len; // ZIP: bytes truncated due to incomplete final line. note that if file is BGZF, then this truncated data is contained in the final intact BGZF blocks, after already discarding the final incomplete BGZF block
// TXT file: data used in --sex, --coverage and --idxstats
@@ -225,9 +224,10 @@ typedef struct File {
int64_t txt_data_so_far_bind_0_comp[MAX_NUM_COMPS]; // Z_FILE ZIP: per-component txt_size before modifications
Codec comp_codec[MAX_NUM_COMPS]; // Z_FILE ZIP: codec used for every txt file component (i.e. excluding generated components)
Codec comp_source_codec[MAX_NUM_COMPS]; // Z_FILE ZIP: source codec used for every txt file component (i.e. excluding generated components)
+ Codec comp_gunzip_method[MAX_NUM_COMPS]; // Z_FILE ZIP: gunzip_method used for every txt file component with codec∈{GZ,BGZF,GZIL}
FlagsBgzf comp_bgzf[MAX_NUM_COMPS]; // Z_FILE ZIP BGZF: library and level of BGZF of each comp
- bool gzip_section_size_single_block[MAX_NUM_COMPS]; // Z_FILE ZIP GZ: if true, disregard gzip_section_size as the entire file is just one GZ block
- uint64_t gzip_section_size[MAX_NUM_COMPS]; // Z_FILE ZIP GZ: size of one gzip section (in uncompressed terms) in case of concatenated gzip. -1 if they are not equal size.
+    uint64_t gz_isize[MAX_NUM_COMPS][2];        // Z_FILE ZIP GZ: isize (=uncomp_size) of the first two gz blocks of a multi-gz-block file (excluding BGZF and GZIL).
+ uint8_t gz_header[MAX_NUM_COMPS][12]; // Z_FILE ZIP GZ: first 12 bytes of the gz header (10 if FEXTRA=false) (excluding BGZF and GZIL).
} File;
#define z_has_gencomp (z_file && z_file->z_flags.has_gencomp) // ZIP/PIZ
@@ -240,7 +240,6 @@ extern StrText file_get_z_run_time (FileP file);
extern FileP file_open_txt_read (rom filename);
extern FileP file_open_txt_write (rom filename, DataType data_type, BgzfLevel bgzf_level);
extern void file_close (FileP *file_p);
-extern void file_write_txt (const void *data, unsigned len);
extern bool file_seek (FileP file, int64_t offset, int whence, rom mode, FailType soft_fail); // SEEK_SET, SEEK_CUR or SEEK_END
extern int64_t file_tell_do (FileP file, FailType soft_fail, FUNCLINE);
#define file_tell(file,soft_fail) file_tell_do ((file), (soft_fail), __FUNCLINE)
@@ -298,12 +297,17 @@ extern bool file_buf_locate (FileP file, ConstBufferP buf);
#define SRC_CODEC(x) (txt_file->source_codec == CODEC_##x)
+#define TXT_IS_PLAIN (txt_file->codec == CODEC_NONE)
+#define TXT_IS_BGZF (txt_file->codec == CODEC_BGZF)
+#define TXT_IS_GZIL (txt_file->codec == CODEC_GZIL)
+#define TXT_IS_GZ (txt_file->codec == CODEC_GZ)
+#define TXT_IS_BZ2 (txt_file->codec == CODEC_BZ2)
+
#define SC(x) (file->source_codec == CODEC_##x)
static inline bool is_read_via_ext_decompressor(ConstFileP file) { return SC(XZ)|| SC(ZIP) || SC(BCF)|| SC(CRAM) || SC(ORA); }
#undef SC
#define FC(x) (codec == CODEC_##x)
-static inline bool is_read_via_int_decompressor(Codec codec) { return FC(GZ)|| FC(BGZF)|| FC(BZ2); }
static inline bool is_written_via_ext_compressor(Codec codec) { return FC(BCF) || FC(CRAM); }
#undef FC
diff --git a/src/flags.c b/src/flags.c
index a76733e9..be666195 100644
--- a/src/flags.c
+++ b/src/flags.c
@@ -423,8 +423,8 @@ void flags_init_from_command_line (int argc, char **argv)
#define _g {"grep", required_argument, 0, 25 }
#define _gw {"grep-w", required_argument, 0, 'g' }
#define _n {"lines", required_argument, 0, 'n' }
- #define _nh {"head", optional_argument, 0, 22 } // genozip
- #define _nH {"head", required_argument, 0, 22 } // genocat
+ #define _nh {"head", required_argument, 0, 22 } // genozip
+ #define _nH {"head", optional_argument, 0, 22 } // genocat
#define _nt {"tail", optional_argument, 0, 23 }
#define _G {"drop-genotypes", no_argument, &flag.drop_genotypes, 1 }
#define _H1 {"no-header", no_argument, &flag.no_header, 1 }
@@ -473,6 +473,7 @@ void flags_init_from_command_line (int argc, char **argv)
#define _WB {"show-wrong-XB", no_argument, &flag.show_wrong_xb, 1 }
#define _su {"show-uncompress", no_argument, &flag.show_uncompress, 1 }
#define _so {"show-compress", no_argument, &flag.show_compress, 1 }
+ #define _gz {"show-gz-uncomp", no_argument, &flag.show_gz_uncomp, 1 }
#define _sv {"show-vblocks", optional_argument, 0, 141 }
#define _ai {"analyze-insertions", no_argument, &flag.analyze_ins, 1 }
#define _ov {"one-vb", required_argument, 0, 8 }
@@ -573,12 +574,13 @@ void flags_init_from_command_line (int argc, char **argv)
#define _ts {"t_size", required_argument, 0, 143, }
#define _lp {"license-prepare", required_argument, 0, 148, }
#define _00 {0, 0, 0, 0 }
+ #define _gg {"generate-gzil", no_argument, 0, 153 }
typedef const struct option Option;
- static Option genozip_lo[] = { _lg, _i, _I, _d, _f, _h, _x, _D, _L1, _L2, _q, _Q, _qq, _t, _Nt, _DL, _nb, _nz, _nc,_nu, _V, _z, _m, _th, _o, _p, _e, _E, _H1, _sL, _ss, _SS, _sd, _sT, _sb, _Sb, _lc, _lh, _lH, _s2, _s7, _S7, _S0, _S8, _S9, _sa, _st, _sm, _sh, _si, _Si, _Sh, _sr, _su, _so, _sv, _sn, _pn, _ai, _B, _xt, _dm, _dp, _dL, _dD, _dq, _dB, _dt, _dw, _dM, _dr, _dR, _dP, _dG, _dN, _dF, _DF, _dQ, _dH, _dO, _dC, _fQ, _fC, _fO, _fS, _fH, _fN, _dU, _dl, _dc, _dg, _dh,_dS, _bS, _9, _8, _pe, _fa, _bs, _lm, _nh, _rg, _hC, _rA, _rS, _me, _s5, _S5, _sM, _sA, _sB, _sP, _sc, _Sc, _AL, _sI, _cn, _s6, _oe, _al, _Lf, _dd, _T, _TT, _TL, _wM, _wm, _WM, _WB, _bi, _bl, _sk, _VV, _DV, _Ds, _DS, _sp, _Du, _DD, _DP, _SH, _Dd, _to, _ts, _hc, _dv, _TR, _NE, _lp, _Sd, _St, _um, _fP, _nF, _nI, _00 };
- static Option genounzip_lo[] = { _lg, _d, _f, _h, _x, _D, _L1, _L2, _q, _Q, _t, _DL, _nc, _V, _z, _m, _th, _u, _o, _p, _e, _sL, _ss, _SS, _sG, _sd, _sT, _sb, _lc, _lh, _lH, _s2, _s7, _S7, _S0, _S8, _S9, _sa, _st, _sm, _sh, _si, _Si, _Sh, _sr, _su, _sv, _sn, _pn, _ov, _xt, _dm, _dp, _dD, _dB, _dt, _dR, _dc, _lm, _sR, _pR, _hC, _rA, _rS, _me, _s5, _S5, _sM, _sA, _sB, _Sc, _AL, _sI, _cn, _cN, _s6, _oe, _dd, _T, _TT, _Dp, _Dh, _sp, _DD, _Dd, _to, _ts, _RC, _dv, _TR, _NE, _np, _00 };
- static Option genocat_lo[] = { _lg, _d, _f, _h, _D, _L1, _L2, _q, _Q, _nc, _V, _z, _zr, _zR, _zb, _zB, _zs, _zS, _zq, _zQ, _zf, _zF, _zc, _zC, _zv, _zV, _th, _o, _p, _e, _il, _r, _R, _Rg, _qf, _qF, _Qf, _QF, _SF, _s, _sf, _sq, _G, _1, _H0, _H1, _H2, _H3, _Gt, _So, _Io, _IU, _iu, _GT, _sL, _ss, _SS, _sG, _sd, _sT, _sb, _lc, _lh, _lH, _s2, _s7, _S7, _S0, _S8, _S9, _sa, _st, _sm, _sh, _si, _Si, _Sh, _sr, _su, _sv, _sn, _pn, _ov, _R1, _R2, _RX, _xt, _dm, _dp, _dD, _dB, _dt, _dR, _dc, _ds, _lm, _fs, _g, _gw, _n, _nt, _nH, _sR, _pR, _sC, _pC, _hC, _rA, _rI, _pI, _rS, _me, _s5, _S5, _sM, _sA, _sB, _Sc, _AL, _sI, _cn, _cN, _pg, _PG, _SX, _ix, _ct, _vl, _s6, _oe, _al, _dd, _T, _Dp, _Dh, _sp, _DD, _Dd, _DT, _RC, _dv, _TR, _NE, _np, _00 };
- static Option genols_lo[] = { _lg, _f, _h, _l, _L1, _L2, _q, _V, _p, _st, _sm, _dm, _dt, _sM, _b, _LC, _oe, _dd, _T, _sp, _DD, _dv, _NE, _00 };
+ static Option genozip_lo[] = { _lg, _i, _I, _d, _f, _h, _x, _D, _L1, _L2, _q, _Q, _qq, _t, _Nt, _DL, _nb, _nz, _nc,_nu, _V, _z, _m, _th, _o, _p, _e, _E, _H1, _sL, _ss, _SS, _sd, _sT, _sb, _Sb, _lc, _lh, _lH, _s2, _s7, _S7, _S0, _S8, _S9, _sa, _st, _sm, _sh, _si, _Si, _Sh, _sr, _su, _so, _gz, _sv, _sn, _pn, _ai, _B, _xt, _dm, _dp, _dL, _dD, _dq, _dB, _dt, _dw, _dM, _dr, _dR, _dP, _dG, _dN, _dF, _DF, _dQ, _dH, _dO, _dC, _fQ, _fC, _fO, _fS, _fH, _fN, _dU, _dl, _dc, _dg, _dh,_dS, _bS, _9, _8, _pe, _fa, _bs, _lm, _nh, _rg, _hC, _rA, _rS, _me, _s5, _S5, _sM, _sA, _sB, _sP, _sc, _Sc, _AL, _sI, _cn, _s6, _oe, _al, _Lf, _dd, _T, _TT, _TL, _wM, _wm, _WM, _WB, _bi, _bl, _sk, _VV, _DV, _Ds, _DS, _sp, _Du, _DD, _DP, _SH, _Dd, _to, _ts, _hc, _dv, _TR, _NE, _lp, _Sd, _St, _um, _fP, _nF, _nI, _gg, _00 };
+ static Option genounzip_lo[] = { _lg, _d, _f, _h, _x, _D, _L1, _L2, _q, _Q, _t, _DL, _nc, _V, _z, _m, _th, _u, _o, _p, _e, _sL, _ss, _SS, _sG, _sd, _sT, _sb, _lc, _lh, _lH, _s2, _s7, _S7, _S0, _S8, _S9, _sa, _st, _sm, _sh, _si, _Si, _Sh, _sr, _su, _sv, _sn, _pn, _ov, _xt, _dm, _dp, _dD, _dB, _dt, _dR, _dc, _lm, _sR, _pR, _hC, _rA, _rS, _me, _s5, _S5, _sM, _sA, _sB, _Sc, _AL, _sI, _cn, _cN, _s6, _oe, _dd, _T, _TT, _Dp, _Dh, _sp, _DD, _Dd, _to, _ts, _RC, _dv, _TR, _NE, _np, _00 };
+ static Option genocat_lo[] = { _lg, _d, _f, _h, _D, _L1, _L2, _q, _Q, _nc, _V, _z, _zr, _zR, _zb, _zB, _zs, _zS, _zq, _zQ, _zf, _zF, _zc, _zC, _zv, _zV, _th, _o, _p, _e, _il, _r, _R, _Rg, _qf, _qF, _Qf, _QF, _SF, _s, _sf, _sq, _G, _1, _H0, _H1, _H2, _H3, _Gt, _So, _Io, _IU, _iu, _GT, _sL, _ss, _SS, _sG, _sd, _sT, _sb, _lc, _lh, _lH, _s2, _s7, _S7, _S0, _S8, _S9, _sa, _st, _sm, _sh, _si, _Si, _Sh, _sr, _su, _sv, _sn, _pn, _ov, _R1, _R2, _RX, _xt, _dm, _dp, _dD, _dB, _dt, _dR, _dc, _ds, _lm, _fs, _g, _gw, _n, _nt, _nH, _sR, _pR, _sC, _pC, _hC, _rA, _rI, _pI, _rS, _me, _s5, _S5, _sM, _sA, _sB, _Sc, _AL, _sI, _cn, _cN, _pg, _PG, _SX, _ix, _ct, _vl, _s6, _oe, _al, _dd, _T, _Dp, _Dh, _sp, _DD, _Dd, _DT, _RC, _dv, _TR, _NE, _np, _00 };
+ static Option genols_lo[] = { _lg, _f, _h, _l, _L1, _L2, _q, _V, _p, _st, _sm, _dm, _dt, _sM, _b, _LC, _oe, _dd, _T, _sp, _DD, _dv, _NE, _00 };
static Option *long_options[NUM_EXE_TYPES] = { genozip_lo, genounzip_lo, genocat_lo, genols_lo }; // same order as ExeType
// include the option letter here for the short version (eg "-t") to work. ':' indicates an argument.
@@ -707,7 +709,7 @@ void flags_init_from_command_line (int argc, char **argv)
case 148 : license_prepare (optarg); break;
case 151 : ASSINP (str_get_int_range64 (optarg, strlen (optarg), 1, 0xffffffff, &flag.sendto), "Expecting the value of --sendto=%s to a number", optarg); break;
case 152 : user_message_init (optarg); break;
-
+ case 153 : gzil_compress(); // doesn't return
case 0 : break; // a long option that doesn't have short version will land here - already handled so nothing to do
default : // unrecognized option
@@ -958,10 +960,12 @@ static void flags_zip_verify_pair_rules (unsigned num_files, rom *filenames)
// if which --output is missing, we check if every pair of files has a consistent name (except in deep, where default z_name is based on the BAM filename)
if (!flag.out_filename && !flag.deep)
- for (unsigned i=0; i < num_files; i += 2)
+ for (unsigned i=0; i < num_files; i += 2) {
+                ASSINP (strcmp (filenames[i], filenames[i+1]), "you specified the same file twice: %s", filenames[i]);
ASSINP (filename_z_pair (filenames[i], filenames[i+1], true),
"to use %s without specifying --output, the naming of the files needs to be consistent and include the numbers 1 and 2 respectively, but these files don't: %s %s",
OT("pair", "2"), filenames[i], filenames[i+1]);
+ }
}
// ZIP: --deep: verify conditions
@@ -1064,7 +1068,7 @@ void flags_update (unsigned num_files, rom *filenames)
if (flag.show_dict || flag.show_b250 || flag.show_headers || flag.show_threads || flag.show_bgzf || flag.show_mutex || flag.show_containers || flag.show_stack ||
flag.dict_id_show_one_b250.num || flag.show_one_dict || flag.show_one_counts.num || flag.show_sag || flag.show_depn || flag.show_singletons_dict_id.num ||
flag.show_reference || flag.show_digest || flag.list_chroms || flag.show_coverage == COV_ONE || flag.show_ranges || flag.show_snips || flag.show_compress ||
- flag.show_alleles || flag.show_vblocks || flag.show_codec || flag.show_cache || flag.debug_gencomp || flag.show_qual || flag.show_aligner ||
+        flag.show_alleles || flag.show_vblocks || flag.show_codec || flag.show_cache || flag.debug_gencomp || flag.show_qual || flag.show_aligner || flag.show_gz_uncomp ||
flag.show_buddy || flag.debug_peek || flag.show_aliases || (flag.show_index && command==PIZ) || flag.count || flag.biopsy || flag.show_gz)
flag.quiet=true; // don't show progress or warnings
@@ -1735,8 +1739,5 @@ void flags_finalize (void)
rom pair_type_name (PairType p)
{
- rom names[] = PAIR_TYPE_NAMES;
- ASSERT (p >= 0 && p <= 3, "Invalid pair_type=%d", p);
-
- return names[p];
+ return IN_RANGE (p, 0, 3) ? (rom[])PAIR_TYPE_NAMES[p] : "InvalidPairType";
}
\ No newline at end of file
diff --git a/src/flags.h b/src/flags.h
index 4580e1ea..9288f895 100644
--- a/src/flags.h
+++ b/src/flags.h
@@ -139,7 +139,7 @@ typedef struct {
show_index, show_gheader, show_ref_contigs, show_ref_seq,
show_reference, show_ref_hash, show_ref_index, show_chrom2ref, show_ref_iupacs, show_ranges,
show_codec, show_cache, show_memory, show_snips,
- show_alleles, show_bgzf, show_gz, show_txt_contigs, show_lines,
+ show_alleles, show_bgzf, show_gz, show_txt_contigs, show_lines, show_gz_uncomp,
show_threads, show_uncompress, biopsy, skip_segconf, show_data_type,
debug_progress, show_hash, debug_memory, debug_threads, debug_stats, debug_generate, debug_recon_size, debug_seg,
debug_LONG, show_qual, debug_qname, debug_read_ctxs, debug_sag, debug_gencomp, debug_lines, debug_latest,
@@ -154,7 +154,6 @@ typedef struct {
show_containers, show_stack, show_aligner, show_buddy,
echo, // show the command line in case of an error
recover, // PIZ: attempted recovery from data corruption
-
#define SHOW_ALL_HEADERS (-1)
show_headers; // (1 + SectionType to display) or 0=flag off or -1=all sections
rom help, dump_section, show_is_set, show_time, show_mutex, show_vblocks, show_header_dict_name;
@@ -204,6 +203,7 @@ typedef struct {
missing_contexts_allowed, // PIZ: its not an error if contexts are missing - just reconstruct as an empty string
piz_txt_modified, // PIZ: output is NOT precisely identical to the compressed source, and hence we cannot use its BZGF blocks or verify digest
zip_lines_counted_at_init_vb, // ZIP: VB lines need to be counted at zip_init_vb instead of zip_update_txt_counters, requiring BGZF-uncompression of a VB by the main thread
+ zip_uncompress_source_during_read, // ZIP: uncompress source compression in main thread during read, rather than compute thread
explicit_ref, // ref->filename was set by --reference or --REFERENCE (as opposed to being read from the genozip header)
collect_coverage, // PIZ: collect coverage data for show_coverage/idxstats
deep_fq_only, // PIZ: SAM data is reconstructed by not written, only FASTQ data is written
diff --git a/src/gencomp.c b/src/gencomp.c
index 86cc5388..af042351 100644
--- a/src/gencomp.c
+++ b/src/gencomp.c
@@ -15,6 +15,7 @@
#include "bgzf.h"
#include "biopsy.h"
#include "stream.h"
+#include "dispatcher.h"
//-----------------------
// Types & macros
@@ -149,9 +150,9 @@ void gencomp_seg_add_line (VBlockP vb, CompIType comp_i, STRp(line)/*pointer int
// If we're might re-read depn lines from the txt file, we store their coordinates in the txt file
if (componentsP[comp_i].type == GCT_DEPN && depn_method == DEPN_REREAD) {
- if (txt_file->codec == CODEC_BGZF) {
- uint64_t bb_i = vb->vb_bgzf_i + vb->bgzf_blocks.current_bb_i;
- ASSERT (bb_i <= MAX_BB_I, "BGZF bb_i=%"PRIu64" exceeds maximum of %"PRIu64, bb_i, MAX_BB_I);
+ if (TXT_IS_BGZF) {
+ uint64_t bb_i = vb->vb_bgz_i + vb->gz_blocks.current_bb_i;
+ ASSERT (bb_i <= MAX_BB_I, "%s: BGZF bb_i=%"PRIu64" exceeds maximum of %"PRIu64, VB_NAME, bb_i, MAX_BB_I);
BLST (GencompLineIEntry, vb->gencomp_lines)->offset = (LineOffset){ .bb_i = bb_i, .uoffset = vb->line_bgzf_uoffset };
}
@@ -223,7 +224,7 @@ void gencomp_initialize (CompIType comp_i, GencompType gct)
buf_set_promiscuous (&depn.thread_data_comp, "depn.thread_data_comp");
// if we cannot re-read the depn lines from the file, we will offload them to disk
- if ((txt_file->codec != CODEC_BGZF && txt_file->codec != CODEC_NONE) ||
+ if ((!TXT_IS_BGZF && !TXT_IS_PLAIN) ||
txt_file->redirected || txt_file->is_remote ||
segconf.zip_txt_modified) {
depn_method = DEPN_OFFLOAD;
@@ -709,6 +710,10 @@ static void gencomp_get_txt_data_from_queue (VBlockP vb, GencompType gct)
if (flag.debug_gencomp)
debug_gencomp (vb->comp_i==1 ? "disp_comp1" : "disp_comp2", false);
+ if (flag_is_show_vblocks (ZIP_TASK_NAME))
+ iprintf ("TXT_DATA_FROM_GENCOMP_QUEUE(id=%d) vb=%s buf_i=%u Ltxt=%u n_lines=%u\n",
+ vb->id, VB_NAME, buf_i, Ltxt, vb->lines.len32);
+
mutex_unlock (gc_protected);
}
@@ -811,7 +816,7 @@ bool gencomp_get_txt_data (VBlockP vb)
// case: finished ingesting PRIM and no more out-of-band data, and all MAIN data has been flushed (which also means txt_file reached EOF,
// see zip_prepare_one_vb_for_dispatching) - so no more MAIN or GetQBit data will be available. time for DEPN data.
if (sam_finished_ingesting_prim && reread_depn_lines.next < reread_depn_lines.len)
- RETURN (txt_file->codec == CODEC_BGZF ? "REREAD_BGZF" : "REREAD_PLAIN", gencomp_prescribe_reread (vb));
+ RETURN (TXT_IS_BGZF ? "REREAD_BGZF" : "REREAD_PLAIN", gencomp_prescribe_reread (vb));
// no more data exists at this point OR we have GCT_DEPN, but not finished ingesting PRIM yet
// DEBUG_GENCOMP ("NO_DATA_AVAILABLE"); // commenting out because there are too many of
@@ -890,27 +895,31 @@ void gencomp_reread_lines_as_prescribed (VBlockP vb)
Ltxt = 0;
// open a file handle private to this VB
- FILE *file = fopen (txt_file->name, "rb");
- ASSERT (file, "%s: Failed to open %s for rereading depn lines: %s", VB_NAME, txt_file->name, strerror(errno));
+ FILE *fp = fopen (txt_file->name, "rb");
+ ASSERT (fp, "%s: Failed to open %s for rereading depn lines: %s", VB_NAME, txt_file->name, strerror(errno));
+
+ stream_set_inheritability (fileno (fp), false); // Windows: allow file_remove in case of --replace
- stream_set_inheritability (fileno (file), false); // Windows: allow file_remove in case of --replace
+ if (flag_is_show_vblocks (ZIP_TASK_NAME))
+ iprintf ("REREAD_DEPN(id=%d) vb=%s n_lines=%u codec=%s\n",
+ vb->id, VB_NAME, vb->reread_prescription.len32, codec_name (txt_file->codec));
- if (txt_file->codec == CODEC_BGZF)
- bgzf_reread_uncompress_vb_as_prescribed (vb, file);
+ if (TXT_IS_BGZF)
+ bgzf_reread_uncompress_vb_as_prescribed (vb, fp);
else { // CODEC_NONE
for_buf (RereadLine, line, vb->reread_prescription) {
- ASSERT (!fseeko64 (file, line->offset.offset, SEEK_SET),
+ ASSERT (!fseeko64 (fp, line->offset.offset, SEEK_SET),
"%s: fseeko64 on %s failed while rereading depn lines: %s", VB_NAME, txt_file->name, strerror(errno));
- ASSERT (fread (BAFTtxt, line->line_len, 1, file) == 1,
+ ASSERT (fread (BAFTtxt, line->line_len, 1, fp) == 1,
"%s: fread of %u bytes on %s failed while rereading depn lines: %s", VB_NAME, line->line_len, txt_file->name, strerror(errno));
Ltxt += line->line_len;
}
}
- fclose (file);
+ fclose (fp);
if (flag.debug_gencomp)
iprintf ("%s: Reread %u gencomp lines from txt_file adler32=%u\n",
diff --git a/src/generic.c b/src/generic.c
index 8e381a60..649ff820 100644
--- a/src/generic.c
+++ b/src/generic.c
@@ -144,7 +144,7 @@ StrTextLong generic_get_magic (void)
s.s[len++] = '"';
s.s[len++] = ' ';
- str_to_hex ((bytes)magic, strlen(magic), &s.s[len], true);
+ str_to_hex_((bytes)magic, strlen(magic), &s.s[len], true);
return s;
}
diff --git a/src/genozip.c b/src/genozip.c
index b2c42a75..f65eaaf9 100644
--- a/src/genozip.c
+++ b/src/genozip.c
@@ -740,7 +740,7 @@ static void main_no_files (int argc)
{
// case: --register
if (flag.do_register) {
- license_register (false);
+ license_register();
threads_finalize();
}
@@ -765,7 +765,7 @@ static void main_no_files (int argc)
// genozip with no parameters and not registered yet - register now
else if (is_genozip && argc == 1 && isatty(0) && !license_is_registered())
- license_register (false);
+ license_register();
// otherwise: show help
else
diff --git a/src/genozip.h b/src/genozip.h
index b45b1cec..f288d1a4 100644
--- a/src/genozip.h
+++ b/src/genozip.h
@@ -246,7 +246,7 @@ typedef packed_enum { // 1 byte
CODEC_UNKNOWN=0, CODEC_NONE=1,
// internal source codecs
- CODEC_BGZF=20, CODEC_GZ=2, CODEC_BZ2=3,
+ CODEC_BGZF=20, CODEC_GZ=2, CODEC_GZIL=34, CODEC_BZ2=3,
// external source codecs (used by executing an external application)
CODEC_XZ=21, CODEC_BCF=22, CODEC_CRAM=24, CODEC_ZIP=25, CODEC_ORA=32,
@@ -271,7 +271,7 @@ typedef packed_enum { // 1 byte
CODEC_T0 = 29, // compress the T0:Z field (Ultima)
CODEC_OQ = 33, // compress the OQ:Z field (mostly generated by GATK BQSR)
- NUM_CODECS,
+ NUM_CODECS = 35,
} Codec;
// note: the numbering of the sections cannot be modified, for backward compatibility
@@ -348,6 +348,8 @@ typedef int ThreadId;
#define SQR(x) ((x)*(x))
#endif
+#define IN_RANGE(x,min,max) ((x) >= (min) && (x) <= (max))
+
#define MAXB64(x) ((1ULL<<(x))-1)
#define MAXB(x) ((uint32_t)MAXB64(x)) // eg: MAXB(3) == 0b111 == 7
diff --git a/src/license.h b/src/license.h
index a25ec54a..bc6a1e75 100644
--- a/src/license.h
+++ b/src/license.h
@@ -10,7 +10,7 @@
#include "sections.h"
-extern void license_register (bool);
+extern void license_register (void);
extern bool license_is_registered (void);
extern void license_set_filename (rom filename);
extern void license_load (void);
diff --git a/src/lookback.c b/src/lookback.c
index 625bbb35..af8b1ba1 100644
--- a/src/lookback.c
+++ b/src/lookback.c
@@ -137,7 +137,7 @@ uint32_t lookback_get_next (VBlockP vb, ContextP lb_ctx, ContextP ctx, WordIndex
if (*B(WordIndex, *buf, *iterator) == search_for)
lookback = (RR(*iterator - buf->newest_index + 1, lb_size));
- ASSERT (lookback >= 0 && lookback < lb_size, "Invalid lookback=%d", lookback);
+ ASSERT (IN_RANGE (lookback, 0, lb_size-1), "Invalid lookback=%d", lookback);
return lookback;
}
diff --git a/src/objdir.linux/secure/license.o b/src/objdir.linux/secure/license.o
index 2aa8b48d..05e5f19f 100644
Binary files a/src/objdir.linux/secure/license.o and b/src/objdir.linux/secure/license.o differ
diff --git a/src/objdir.osx-arm/secure/license.o b/src/objdir.osx-arm/secure/license.o
index 993124fc..312bf181 100644
Binary files a/src/objdir.osx-arm/secure/license.o and b/src/objdir.osx-arm/secure/license.o differ
diff --git a/src/objdir.osx-x86/secure/license.o b/src/objdir.osx-x86/secure/license.o
index 3a52f458..94d26dd5 100644
Binary files a/src/objdir.osx-x86/secure/license.o and b/src/objdir.osx-x86/secure/license.o differ
diff --git a/src/objdir.windows/secure/license.o b/src/objdir.windows/secure/license.o
index 2867d38d..a64e9df5 100644
Binary files a/src/objdir.windows/secure/license.o and b/src/objdir.windows/secure/license.o differ
diff --git a/src/profiler.c b/src/profiler.c
index 92ee20c6..e293003c 100644
--- a/src/profiler.c
+++ b/src/profiler.c
@@ -1,430 +1,432 @@
-// ------------------------------------------------------------------
-// profiler.c
-// Copyright (C) 2019-2024 Genozip Limited. Patent Pending.
-// Please see terms and conditions in the file LICENSE.txt
-//
-// WARNING: Genozip is proprietary, not open source software. Modifying the source code is strictly prohibited,
-// under penalties specified in the license.
-
-#include "profiler.h"
-#include "file.h"
-
-static ProfilerRec profile = {}; // data for this z_file
-static Mutex profile_mutex = {};
-static TimeSpecType profiler_timer; // wallclock
-
-void profiler_initialize (void)
-{
- mutex_bottleneck_analysis_init();
- mutex_initialize (profile_mutex);
-}
-
-void profiler_new_z_file (void)
-{
- memset (&profile, 0, sizeof (profile));
- clock_gettime (CLOCK_REALTIME, &profiler_timer); // initialze wallclock
-}
-
-void profiler_set_avg_compute_vbs (float avg_compute_vbs)
-{
- ASSERT0 (profile.num_txt_files >= 0 && profile.num_txt_files < MAX_NUM_TXT_FILES_IN_ZFILE, "too many txt files");
-
- profile.avg_compute_vbs[profile.num_txt_files++] = avg_compute_vbs;
-}
-
-
-StrTextSuperLong profiler_get_avg_compute_vbs (char sep)
-{
- StrTextSuperLong s = {};
- int s_len = 0;
- for (int i=0; i < profile.num_txt_files; i++)
- SNPRINTF (s, "%.1f%c", profile.avg_compute_vbs[i], sep);
-
- if (s_len) s.s[s_len-1] = 0; // remove final separator
-
- return s;
-}
-
-void profiler_add (ConstVBlockP vb)
-{
- mutex_lock (profile_mutex);
-
- if (Ltxt) {
- profile.num_vbs++;
- profile.max_vb_size_mb = MAX_(profile.max_vb_size_mb, segconf.vb_size >> 20);
- }
-
- int num_profiled = sizeof (profile.nanosecs) / sizeof (uint64_t) - MAX_DICTS +
- (IS_PIZ ? vb->num_contexts : 0);
-
- for (int i=0; i < num_profiled; i++)
- if (((uint64_t *)&vb->profile.count)[i]) {
- ((uint64_t *)&profile.nanosecs)[i] += ((uint64_t *)&vb->profile.nanosecs)[i];
- ((uint64_t *)&profile.count)[i] += ((uint64_t *)&vb->profile.count)[i];
- }
-
- // ZIP: add compressor data by zctx, while collected by vctx
- if (IS_ZIP)
- for (int v=num_profiled; v < num_profiled + vb->num_contexts; v++)
- if (((uint64_t *)&vb->profile.count)[v]) {
- ContextP zctx = ctx_get_zctx_from_vctx (CTX(v-num_profiled), false, true);
- if (!zctx) continue; // should never happen
-
- int z = zctx->did_i + num_profiled;
- ((uint64_t *)&profile.nanosecs)[z] += ((uint64_t *)&vb->profile.nanosecs)[v];
- ((uint64_t *)&profile.count)[z] += ((uint64_t *)&vb->profile.count)[v];
- }
-
- mutex_unlock (profile_mutex);
-}
-
-static inline uint32_t ms(uint64_t ns) { return (uint32_t)(ns / 1000000);}
-
-rom profiler_print_short (const ProfilerRec *p)
-{
- static char str[300]; // not thread safe
- snprintf (str, sizeof (str), "read: %s compute:%s write: %s", str_int_commas (ms(p->nanosecs.read)).s, str_int_commas (ms(p->nanosecs.compute)).s, str_int_commas (ms(p->nanosecs.write)).s);
- return str;
-}
-
-void profiler_add_evb_and_print_report (void)
-{
- profiler_add (evb);
-
- static rom space = " ";
-# define PRINT_(x, label, level) if (profile.nanosecs.x) \
- iprintf ("%.*s %s: %s (N=%s)\n", (level)*3, space, (label), \
- str_int_commas (ms(profile.nanosecs.x)).s, \
- str_int_commas (profile.count.x).s);
-
-# define PRINT(x, level) PRINT_(x, #x, (level))
-
- rom os = flag.is_windows ? "Windows"
- : flag.is_mac ? "MacOS"
- : flag.is_linux ? "Linux"
- : "Unknown OS";
-
- iprintf ("\n%s PROFILER:\n", IS_ZIP ? "ZIP" : "PIZ");
- iprintf ("OS=%s\n", os);
- iprintf ("Build=%s\n", flag.debug ? "Debug" : "Optimized");
-
- iprintf ("Wallclock: %s milliseconds\n", str_int_commas (ms (CHECK_TIMER)).s);
-
- if (command == PIZ) { // this is a uncompress operation
-
- iprint0 ("GENOUNZIP main thread (piz_one_txt_file):\n");
- PRINT (ref_load_stored_reference, 1);
- PRINT (ref_initialize_ranges, 2);
- PRINT (ref_read_one_range, 2);
- PRINT (ref_uncompress_one_range, 2);
- PRINT (piz_read_global_area, 1); // sometimes also includes ref_load_stored_reference, but usually not
- PRINT (dict_io_read_all_dictionaries, 2);
- PRINT (dict_io_build_word_lists, 3);
- PRINT (txtheader_piz_read_and_reconstruct, 1);
- PRINT (sam_header_inspect, 2);
- PRINT (digest_txt_header, 2);
- PRINT (vb_get_vb, 1);
- PRINT (piz_read_one_vb, 1);
- PRINT (read, 2);
- PRINT (bgzf_io_thread, 1);
- PRINT (bgzf_writer_thread, 1);
- PRINT (write, 1);
- PRINT (sam_sa_prim_finalize_ingest, 1);
- PRINT (sam_piz_deep_grab_deep_ents, 1);
- PRINT (sam_piz_deep_finalize_ents, 2);
- PRINT (piz_main_loop_idle, 1);
-
- iprintf ("GENOUNZIP compute threads: %s\n", str_int_commas (ms(profile.nanosecs.compute)).s);
- PRINT (zfile_uncompress_section, 1);
- PRINT (compressor_bz2, 2);
- PRINT (compressor_lzma, 2);
- PRINT (compressor_bsc, 2);
- PRINT (compressor_rans, 2);
- PRINT (compressor_arith, 2);
- PRINT (compressor_domq, 2);
- PRINT (compressor_normq, 2);
- PRINT (compressor_actg, 2);
- PRINT (compressor_pbwt, 2);
- PRINT (compressor_longr, 2);
- PRINT (compressor_homp, 2);
- PRINT (compressor_pacb, 2);
- PRINT (compressor_smux, 2);
- PRINT (compressor_t0, 2);
- PRINT (compressor_oq, 2);
-
- PRINT (reconstruct_vb, 1);
- for (Did did_i=0; did_i < z_file->num_contexts; did_i++)
- PRINT_(fields[did_i], ZCTX(did_i)->tag_name, 2);
-
- PRINT (sam_reconstruct_SEQ_vs_ref, 2);
- PRINT (reconstruct_SEQ_copy_sag_prim, 3);
- PRINT (sam_analyze_copied_SEQ, 3);
- PRINT (sam_bismark_piz_update_meth_call, 3);
- PRINT (aligner_reconstruct_seq, 2);
- PRINT (sam_piz_special_QUAL, 2);
- if (Z_DT(SAM) || Z_DT(BAM)) {
- PRINT (codec_longr_reconstruct,3);
- PRINT (codec_homp_reconstruct, 3);
- PRINT (codec_t0_reconstruct, 3);
- PRINT (codec_smux_reconstruct, 3);
- PRINT (codec_pacb_reconstruct, 3);
- PRINT (codec_domq_reconstruct, 3);
- PRINT (codec_domq_reconstruct_dom_run, 4);
- PRINT (codec_oq_reconstruct, 3);
- }
- PRINT (fastq_special_monochar_QUAL, 2);
- PRINT (sam_piz_sam2fastq_QUAL, 2);
- PRINT (sam_piz_sam2bam_QUAL, 2);
- PRINT (sam_cigar_special_CIGAR, 2);
-
- PRINT (sam_piz_con_item_cb, 2);
- PRINT (sam_piz_deep_add_qname, 3);
- PRINT (sam_piz_deep_add_seq, 3);
- PRINT (sam_piz_deep_add_qual, 3);
- PRINT (sam_piz_deep_compress, 4); // mostly under qual, a tiny bit of seq
-
- PRINT (fastq_special_set_deep, 2);
- PRINT (fastq_special_deep_copy_QNAME, 2);
- PRINT (fastq_special_deep_copy_SEQ, 2);
- PRINT (fastq_special_deep_copy_QUAL, 2);
-
- PRINT (sam_zip_prim_ingest_vb, 1);
- PRINT (digest, 1); // note: in SAM/BAM digest is done in the writer thread, otherwise its done in the compute thread. TODO: change level to 0 in case of SAM/BAM
- PRINT (piz_get_line_subfields, 2);
- PRINT (sam_load_groups_add_one_prim_vb, 1);
- if (Z_DT(FASTQ)) {
- PRINT (codec_longr_reconstruct, 3);
- PRINT (codec_domq_reconstruct, 2);
- PRINT (codec_domq_reconstruct_dom_run, 3);
- PRINT (codec_homp_reconstruct, 2);
- PRINT (codec_smux_reconstruct, 2);
- PRINT (codec_pacb_reconstruct, 2);
- }
-
- if (profile.nanosecs.bgzf_compute_thread) {
- iprintf ("GENOUNZIP BGZF threads: %s\n", str_int_commas (ms(profile.nanosecs.bgzf_compute_thread)).s);
- PRINT (bgzf_compute_thread, 1);
- PRINT (bgzf_compress_one_block, 2);
- }
- }
-
- else { // compress
- iprint0 ("GENOZIP main thread (zip_one_file):\n");
- PRINT (ref_load_stored_reference, 1);
- PRINT (ref_read_one_range, 2);
- PRINT (ref_uncompress_one_range, 2);
- PRINT (ref_load_digest, 2);
- PRINT (refhash_load, 1);
- PRINT (refhash_load_digest, 2);
- PRINT (refhash_read_one_vb, 2);
- PRINT (refhash_compress_digest, 2); // make-ref
- PRINT (refhash_uncompress_one_vb, 2);
- PRINT (cram_inspect_file, 1);
- PRINT (txtheader_zip_read_and_compress, 1);
- PRINT (txtfile_read_header, 2);
- PRINT (sam_header_inspect, 2);
- PRINT (sam_header_zip_inspect_SQ_lines, 3);
- PRINT (sam_header_add_contig, 4);
- PRINT (contigs_create_index, 4);
- PRINT (sam_header_zip_inspect_PG_lines, 3);
- PRINT (sam_header_zip_inspect_RG_lines, 3);
- PRINT (sam_header_zip_inspect_HD_line, 3);
- PRINT (ref_initialize_ranges, 2);
- PRINT (txtheader_compress, 2);
- PRINT (txtheader_compress_one_fragment, 3);
- PRINT (digest_txt_header, 2);
- PRINT (vb_get_vb, 1);
- PRINT (fastq_read_pair_1_data, 1);
- PRINT (piz_read_all_ctxs, 2);
- PRINT (txtfile_read_vblock, 1);
- PRINT (read, 2);
- PRINT (txtfile_read_block_zlib, 3);
- PRINT (txtfile_read_block_gz, 3);
- PRINT (txtfile_read_block_bz2, 3);
- PRINT (txtfile_read_block_bgzf, 3);
- PRINT (bgzf_read_block, 4);
- PRINT (txtfile_read_block_bgzf_uncompress, 4);
- PRINT (fastq_txtfile_have_enough_lines, 2);
- PRINT (txtfile_get_unconsumed_to_pass_to_next_vb, 2);
- PRINT (bgzf_copy_unconsumed_blocks, 2);
- PRINT (zriter_write, 1);
- PRINT (write_fg, 2);
- PRINT (write_bg, 2);
- PRINT (bgzf_io_thread, 1);
- PRINT (sam_sa_prim_finalize_ingest, 1);
- PRINT (zip_main_loop_idle, 1);
- PRINT (zip_free_undeeded_zctx_bufs_after_seg, 1);
- PRINT (generate_recon_plan, 1);
- PRINT (sam_zip_recon_plan_add_gc_lines, 2);
- PRINT (sam_zip_recon_plan_count_writers, 3);
- PRINT (recon_plan_compress, 2);
- PRINT (recon_plan_deltify, 3);
- PRINT (recon_plan_compress_one_fragment, 3);
- PRINT (zip_write_global_area, 1);
- PRINT (dict_io_compress_dictionaries, 2);
- PRINT (dict_io_assign_codecs, 3);
- PRINT (dict_io_compress_one_fragment, 3);
- PRINT (ref_compress_ref, 2);
- PRINT (ref_compress_one_range, 3);
- PRINT (refhash_calc_one_range, 4);
- PRINT (refhash_compress_refhash, 2);
- PRINT (refhash_compress_one_vb, 3);
- PRINT (refhash_compress_digest, 3);
- PRINT (ref_make_calculate_digest, 3);
- PRINT (ref_contigs_compress, 3);
- PRINT (ref_copy_compressed_sections_from_reference_file, 3);
- PRINT (random_access_finalize_entries, 2);
- PRINT (random_access_compress, 2);
- PRINT (ctx_compress_counts, 2);
- PRINT (zfile_compress_genozip_header, 2);
-
- iprintf ("GENOZIP compute threads %s\n", str_int_commas (ms(profile.nanosecs.compute)).s);
- PRINT (bgzf_uncompress_vb, 1);
- PRINT (ctx_clone, 1);
- PRINT (scan_index_qnames_preprocessing, 1);
- PRINT (zip_modify, 1);
- PRINT (vcf_zip_modify, 2);
- PRINT (vcf_optimize_QUAL, 3);
- PRINT (vcf_optimize_INFO, 3);
- PRINT (vcf_optimize_samples, 3);
- PRINT (vcf_convert_probabilites_to_phred, 4);
- PRINT (vcf_convert_likelihoods_to_phred, 4);
- PRINT (vcf_phred_optimize, 4);
- PRINT (optimize_float_3_sig_dig, 4);
- PRINT (seg_all_data_lines, 1);
- PRINT (seg_initialize, 2);
- PRINT (qname_seg, 2);
- PRINT (sam_cigar_seg, 2);
- PRINT (squank_seg, 3);
- PRINT (fastq_seg_get_lines, 2);
- PRINT (seg_get_next_line, 3);
- PRINT (seg_get_next_item, 3);
- PRINT (fastq_seg_deep, 2);
- PRINT (fastq_deep_seg_find_subseq, 3);
- PRINT (fastq_seg_deep_consume_unique_matching_ent, 3);
- PRINT (fastq_seg_SEQ, 2);
- PRINT (sam_seg_SEQ, 2);
- PRINT (sam_seg_SEQ_vs_ref, 3);
- PRINT (sam_seg_bisulfite_M, 4);
- PRINT (sam_seg_verify_saggy_line_SEQ, 3);
- PRINT (sam_analyze_copied_SEQ, 3);
- PRINT (aligner_seg_seq, 3);
- PRINT (aligner_best_match, 4);
- PRINT (aligner_first_layer, 5);
- PRINT (aligner_additional_layers, 5);
- PRINT (aligner_update_best, 5);
- PRINT (aligner_get_word_from_seq, 5);
- PRINT (aligner_seq_to_bitmap, 5);
- PRINT (fastq_seg_DESC, 2);
- PRINT (fastq_seg_saux, 2);
- PRINT (bam_seq_to_sam, 2);
- PRINT (fastq_seg_QUAL, 2);
- PRINT (sam_seg_QUAL, 2);
- PRINT (sam_seg_is_gc_line, 2);
- PRINT (sam_seg_MD_Z_analyze, 2);
- PRINT (sam_cigar_binary_to_textual, 2);
- PRINT (sam_seg_bsseeker2_XG_Z_analyze, 2);
- PRINT (sam_seg_sag_stuff, 2);
- PRINT (sam_seg_aux_all, 2);
- PRINT (sam_seg_SA_Z, 3);
- PRINT (sam_seg_AS_i, 3);
- PRINT (sam_seg_NM_i, 3);
- PRINT (sam_seg_BWA_XA_Z, 3);
- PRINT (sam_seg_BWA_XA_pos, 4);
- PRINT (sam_seg_BWA_XS_i, 3);
- PRINT (sam_seg_bismark_XM_Z, 3);
- PRINT (sam_seg_bsbolt_XB, 3);
- PRINT (sam_seg_TX_AN_Z, 3);
- PRINT (sam_seg_barcode_qual, 3);
- PRINT (sam_seg_CB_Z, 3);
- PRINT (sam_seg_CR_Z, 3);
- PRINT (sam_seg_RX_Z, 3);
- PRINT (sam_seg_BX_Z, 3);
- PRINT (sam_seg_QX_Z, 3);
- PRINT (sam_seg_BC_Z, 3);
- PRINT (sam_seg_gene_name_id, 3);
- PRINT (sam_seg_fx_Z, 3);
- PRINT (sam_seg_other_seq, 3);
- PRINT (sam_seg_GR_Z, 3);
- PRINT (sam_seg_GY_Z, 3);
- PRINT (vcf_seg_QUAL, 3)
- PRINT (sam_seg_ULTIMA_tp, 3);
- PRINT (vcf_seg_PROBE_A, 3);
- PRINT (random_access_merge_in_vb, 1);
- PRINT (gencomp_absorb_add_to_queue, 1);
- PRINT (gencomp_flush, 2);
- PRINT (gencomp_offload_DEPN_to_disk, 3);
- PRINT (gencomp_reread_lines_as_prescribed, 1);
- PRINT (bgzf_uncompress_one_prescribed_block, 2);
- PRINT (ctx_merge_in_vb_ctx, 1);
- PRINT (wait_for_merge, 2);
- PRINT (sam_deep_merge, 2);
- PRINT (zip_compress_ctxs, 1);
- PRINT (b250_zip_generate, 2);
- PRINT (zip_generate_local, 2);
- PRINT (codec_assign_best_codec, 2);
-
- PRINT (compressor_bz2, 2);
- PRINT (compressor_lzma, 2);
- PRINT (compressor_bsc, 2);
- PRINT (compressor_rans, 2);
- PRINT (compressor_arith, 2);
- PRINT (compressor_domq, 2);
- PRINT (compressor_normq, 2);
- PRINT (compressor_actg, 2);
- PRINT (compressor_pbwt, 2);
- PRINT (compressor_longr, 2);
- PRINT (compressor_homp, 2);
- PRINT (compressor_t0, 2);
- PRINT (compressor_pacb, 2);
- PRINT (compressor_smux, 2);
-
- for_zctx
- PRINT_(fields[zctx->did_i], zctx->tag_name, 2);
-
- PRINT (sam_zip_prim_ingest_vb, 1);
- PRINT (digest, 1);
- }
-
- // ZIP and PIZ compute thread
- iprint0 ("COMPUTE THREADS ADMIN\n");
- PRINT (buf_alloc_compute, 1);
- PRINT (buflist_add_buf, 2);
- PRINT (buf_destroy_do_do_compute, 1);
- PRINT (buf_free_compute, 1);
- PRINT (buflist_remove_buf, 2)
- PRINT (buf_overlay_do, 1);
-
- PRINT (file_open_z, 0);
- PRINT (file_close, 0);
- PRINT (buf_alloc_main, 0);
- PRINT (buf_destroy_do_do_main, 0);
- PRINT (buflist_test_overflows_do, 0);
- PRINT (buflist_sort, 0);
- PRINT (sections_create_index, 0);
- PRINT (buflist_find_buf, 0);
- PRINT (buf_low_level_free, 0);
- PRINT (vb_release_vb_do, 0);
- PRINT (vb_destroy_vb, 0);
- PRINT (dispatcher_recycle_vbs, 0);
- PRINT (refhash_generate_emoneg, 0);
- PRINT (tmp1, 0);
- PRINT (tmp2, 0);
- PRINT (tmp3, 0);
- PRINT (tmp4, 0);
- PRINT (tmp5, 0);
-
- if (profile.num_vbs) {
- iprint0 ("\nVblock stats:\n");
- iprintf (" Vblocks: %u\n", profile.num_vbs);
- iprintf (" Maximum vblock size: %u MB\n", profile.max_vb_size_mb);
- iprintf (" Average number of VBs in compute (per txt file): %s\n", profiler_get_avg_compute_vbs(',').s); // average during the lifetime of the ZIP/PIZ dispatcher, i.e. excluding global_area time etc
- iprintf (" Average read time: %u ms\n", ms(profile.nanosecs.read) / profile.num_vbs);
- iprintf (" Average compute time: %u ms\n", ms(profile.nanosecs.compute) / profile.num_vbs);
- iprintf (" Average write time: %u ms\n", ms(profile.nanosecs.write) / profile.num_vbs);
- }
-
- iprint0 ("\n\n");
-
- mutex_show_bottleneck_analsyis();
-}
+// ------------------------------------------------------------------
+// profiler.c
+// Copyright (C) 2019-2024 Genozip Limited. Patent Pending.
+// Please see terms and conditions in the file LICENSE.txt
+//
+// WARNING: Genozip is proprietary, not open source software. Modifying the source code is strictly prohibited,
+// under penalties specified in the license.
+
+#include "profiler.h"
+#include "file.h"
+
+static ProfilerRec profile = {}; // data for this z_file
+static Mutex profile_mutex = {};
+static TimeSpecType profiler_timer; // wallclock
+
+void profiler_initialize (void)
+{
+ mutex_bottleneck_analysis_init();
+ mutex_initialize (profile_mutex);
+}
+
+void profiler_new_z_file (void)
+{
+ memset (&profile, 0, sizeof (profile));
+    clock_gettime (CLOCK_REALTIME, &profiler_timer); // initialize wallclock
+}
+
+void profiler_set_avg_compute_vbs (float avg_compute_vbs)
+{
+ ASSERT0 (profile.num_txt_files >= 0 && profile.num_txt_files < MAX_NUM_TXT_FILES_IN_ZFILE, "too many txt files");
+
+ profile.avg_compute_vbs[profile.num_txt_files++] = avg_compute_vbs;
+}
+
+
+StrTextSuperLong profiler_get_avg_compute_vbs (char sep)
+{
+ StrTextSuperLong s = {};
+ int s_len = 0;
+ for (int i=0; i < profile.num_txt_files; i++)
+ SNPRINTF (s, "%.1f%c", profile.avg_compute_vbs[i], sep);
+
+ if (s_len) s.s[s_len-1] = 0; // remove final separator
+
+ return s;
+}
+
+void profiler_add (ConstVBlockP vb)
+{
+ mutex_lock (profile_mutex);
+
+ if (Ltxt) {
+ profile.num_vbs++;
+ profile.max_vb_size_mb = MAX_(profile.max_vb_size_mb, segconf.vb_size >> 20);
+ }
+
+ int num_profiled = sizeof (profile.nanosecs) / sizeof (uint64_t) - MAX_DICTS +
+ (IS_PIZ ? vb->num_contexts : 0);
+
+ for (int i=0; i < num_profiled; i++)
+ if (((uint64_t *)&vb->profile.count)[i]) {
+ ((uint64_t *)&profile.nanosecs)[i] += ((uint64_t *)&vb->profile.nanosecs)[i];
+ ((uint64_t *)&profile.count)[i] += ((uint64_t *)&vb->profile.count)[i];
+ }
+
+ // ZIP: add compressor data by zctx, while collected by vctx
+ if (IS_ZIP)
+ for (int v=num_profiled; v < num_profiled + vb->num_contexts; v++)
+ if (((uint64_t *)&vb->profile.count)[v]) {
+ ContextP zctx = ctx_get_zctx_from_vctx (CTX(v-num_profiled), false, true);
+ if (!zctx) continue; // should never happen
+
+ int z = zctx->did_i + num_profiled;
+ ((uint64_t *)&profile.nanosecs)[z] += ((uint64_t *)&vb->profile.nanosecs)[v];
+ ((uint64_t *)&profile.count)[z] += ((uint64_t *)&vb->profile.count)[v];
+ }
+
+ mutex_unlock (profile_mutex);
+}
+
+static inline uint32_t ms(uint64_t ns) { return (uint32_t)(ns / 1000000);}
+
+rom profiler_print_short (const ProfilerRec *p)
+{
+ static char str[300]; // not thread safe
+ snprintf (str, sizeof (str), "read: %s compute:%s write: %s", str_int_commas (ms(p->nanosecs.read)).s, str_int_commas (ms(p->nanosecs.compute)).s, str_int_commas (ms(p->nanosecs.write)).s);
+ return str;
+}
+
+void profiler_add_evb_and_print_report (void)
+{
+ profiler_add (evb);
+
+ static rom space = " ";
+# define PRINT_(x, label, level) if (profile.nanosecs.x) \
+ iprintf ("%.*s %s: %s (N=%s)\n", (level)*3, space, (label), \
+ str_int_commas (ms(profile.nanosecs.x)).s, \
+ str_int_commas (profile.count.x).s);
+
+# define PRINT(x, level) PRINT_(x, #x, (level))
+
+ rom os = flag.is_windows ? "Windows"
+ : flag.is_mac ? "MacOS"
+ : flag.is_linux ? "Linux"
+ : "Unknown OS";
+
+ iprintf ("\n%s PROFILER:\n", IS_ZIP ? "ZIP" : "PIZ");
+ iprintf ("OS=%s\n", os);
+ iprintf ("Build=%s\n", flag.debug ? "Debug" : "Optimized");
+
+ iprintf ("Wallclock: %s milliseconds\n", str_int_commas (ms (CHECK_TIMER)).s);
+
+    if (command == PIZ) { // this is an uncompress operation
+
+ iprint0 ("GENOUNZIP main thread (piz_one_txt_file):\n");
+ PRINT (ref_load_stored_reference, 1);
+ PRINT (ref_initialize_ranges, 2);
+ PRINT (ref_read_one_range, 2);
+ PRINT (ref_uncompress_one_range, 2);
+ PRINT (piz_read_global_area, 1); // sometimes also includes ref_load_stored_reference, but usually not
+ PRINT (dict_io_read_all_dictionaries, 2);
+ PRINT (dict_io_build_word_lists, 3);
+ PRINT (txtheader_piz_read_and_reconstruct, 1);
+ PRINT (sam_header_inspect, 2);
+ PRINT (digest_txt_header, 2);
+ PRINT (vb_get_vb, 1);
+ PRINT (piz_read_one_vb, 1);
+ PRINT (read, 2);
+ PRINT (bgzf_io_thread, 1);
+ PRINT (bgzf_writer_thread, 1);
+ PRINT (write, 1);
+ PRINT (sam_sa_prim_finalize_ingest, 1);
+ PRINT (sam_piz_deep_grab_deep_ents, 1);
+ PRINT (sam_piz_deep_finalize_ents, 2);
+ PRINT (piz_main_loop_idle, 1);
+
+ iprintf ("GENOUNZIP compute threads: %s\n", str_int_commas (ms(profile.nanosecs.compute)).s);
+ PRINT (zfile_uncompress_section, 1);
+ PRINT (compressor_bz2, 2);
+ PRINT (compressor_lzma, 2);
+ PRINT (compressor_bsc, 2);
+ PRINT (compressor_rans, 2);
+ PRINT (compressor_arith, 2);
+ PRINT (compressor_domq, 2);
+ PRINT (compressor_normq, 2);
+ PRINT (compressor_actg, 2);
+ PRINT (compressor_pbwt, 2);
+ PRINT (compressor_longr, 2);
+ PRINT (compressor_homp, 2);
+ PRINT (compressor_pacb, 2);
+ PRINT (compressor_smux, 2);
+ PRINT (compressor_t0, 2);
+ PRINT (compressor_oq, 2);
+
+ PRINT (reconstruct_vb, 1);
+ for (Did did_i=0; did_i < z_file->num_contexts; did_i++)
+ PRINT_(fields[did_i], ZCTX(did_i)->tag_name, 2);
+
+ PRINT (sam_reconstruct_SEQ_vs_ref, 2);
+ PRINT (reconstruct_SEQ_copy_sag_prim, 3);
+ PRINT (sam_analyze_copied_SEQ, 3);
+ PRINT (sam_bismark_piz_update_meth_call, 3);
+ PRINT (aligner_reconstruct_seq, 2);
+ PRINT (sam_piz_special_QUAL, 2);
+ if (Z_DT(SAM) || Z_DT(BAM)) {
+ PRINT (codec_longr_reconstruct,3);
+ PRINT (codec_homp_reconstruct, 3);
+ PRINT (codec_t0_reconstruct, 3);
+ PRINT (codec_smux_reconstruct, 3);
+ PRINT (codec_pacb_reconstruct, 3);
+ PRINT (codec_domq_reconstruct, 3);
+ PRINT (codec_domq_reconstruct_dom_run, 4);
+ PRINT (codec_oq_reconstruct, 3);
+ }
+ PRINT (fastq_special_monochar_QUAL, 2);
+ PRINT (sam_piz_sam2fastq_QUAL, 2);
+ PRINT (sam_piz_sam2bam_QUAL, 2);
+ PRINT (sam_cigar_special_CIGAR, 2);
+
+ PRINT (sam_piz_con_item_cb, 2);
+ PRINT (sam_piz_deep_add_qname, 3);
+ PRINT (sam_piz_deep_add_seq, 3);
+ PRINT (sam_piz_deep_add_qual, 3);
+ PRINT (sam_piz_deep_compress, 4); // mostly under qual, a tiny bit of seq
+
+ PRINT (fastq_special_set_deep, 2);
+ PRINT (fastq_special_deep_copy_QNAME, 2);
+ PRINT (fastq_special_deep_copy_SEQ, 2);
+ PRINT (fastq_special_deep_copy_QUAL, 2);
+
+ PRINT (sam_zip_prim_ingest_vb, 1);
+        PRINT (digest, 1); // note: in SAM/BAM digest is done in the writer thread, otherwise it's done in the compute thread. TODO: change level to 0 in case of SAM/BAM
+ PRINT (piz_get_line_subfields, 2);
+ PRINT (sam_load_groups_add_one_prim_vb, 1);
+ if (Z_DT(FASTQ)) {
+ PRINT (codec_longr_reconstruct, 3);
+ PRINT (codec_domq_reconstruct, 2);
+ PRINT (codec_domq_reconstruct_dom_run, 3);
+ PRINT (codec_homp_reconstruct, 2);
+ PRINT (codec_smux_reconstruct, 2);
+ PRINT (codec_pacb_reconstruct, 2);
+ }
+
+ if (profile.nanosecs.bgzf_compute_thread) {
+ iprintf ("GENOUNZIP BGZF threads: %s\n", str_int_commas (ms(profile.nanosecs.bgzf_compute_thread)).s);
+ PRINT (bgzf_compute_thread, 1);
+ PRINT (bgzf_compress_one_block, 2);
+ }
+ }
+
+ else { // compress
+ iprint0 ("GENOZIP main thread (zip_one_file):\n");
+ PRINT (ref_load_stored_reference, 1);
+ PRINT (ref_read_one_range, 2);
+ PRINT (ref_uncompress_one_range, 2);
+ PRINT (ref_load_digest, 2);
+ PRINT (refhash_load, 1);
+ PRINT (refhash_load_digest, 2);
+ PRINT (refhash_read_one_vb, 2);
+ PRINT (refhash_compress_digest, 2); // make-ref
+ PRINT (refhash_uncompress_one_vb, 2);
+ PRINT (cram_inspect_file, 1);
+ PRINT (txtheader_zip_read_and_compress, 1);
+ PRINT (txtfile_read_header, 2);
+ PRINT (sam_header_inspect, 2);
+ PRINT (sam_header_zip_inspect_SQ_lines, 3);
+ PRINT (sam_header_add_contig, 4);
+ PRINT (contigs_create_index, 4);
+ PRINT (sam_header_zip_inspect_PG_lines, 3);
+ PRINT (sam_header_zip_inspect_RG_lines, 3);
+ PRINT (sam_header_zip_inspect_HD_line, 3);
+ PRINT (ref_initialize_ranges, 2);
+ PRINT (txtheader_compress, 2);
+ PRINT (txtheader_compress_one_fragment, 3);
+ PRINT (digest_txt_header, 2);
+ PRINT (vb_get_vb, 1);
+ PRINT (fastq_read_pair_1_data, 1);
+ PRINT (piz_read_all_ctxs, 2);
+ PRINT (txtfile_read_vblock, 1);
+ PRINT (read, 2);
+ PRINT (txtfile_read_block_zlib, 3);
+ PRINT (txtfile_read_block_igzip, 3);
+ PRINT (igzip_uncompress_during_read, 4);
+ PRINT (txtfile_read_block_bz2, 3);
+ PRINT (txtfile_read_block_bgz, 3);
+ PRINT (bgzf_read_block, 4);
+            PRINT (gzil_read_block, 4);
+ PRINT (bgz_uncompress_during_read, 4);
+ PRINT (fastq_txtfile_have_enough_lines, 2);
+ PRINT (txtfile_get_unconsumed_callback, 2);
+ PRINT (bgz_copy_unconsumed_blocks, 2);
+ PRINT (zriter_write, 1);
+ PRINT (write_fg, 2);
+ PRINT (write_bg, 2);
+ PRINT (bgzf_io_thread, 1);
+ PRINT (sam_sa_prim_finalize_ingest, 1);
+ PRINT (zip_main_loop_idle, 1);
+ PRINT (zip_free_undeeded_zctx_bufs_after_seg, 1);
+ PRINT (generate_recon_plan, 1);
+ PRINT (sam_zip_recon_plan_add_gc_lines, 2);
+ PRINT (sam_zip_recon_plan_count_writers, 3);
+ PRINT (recon_plan_compress, 2);
+ PRINT (recon_plan_deltify, 3);
+ PRINT (recon_plan_compress_one_fragment, 3);
+ PRINT (zip_write_global_area, 1);
+ PRINT (dict_io_compress_dictionaries, 2);
+ PRINT (dict_io_assign_codecs, 3);
+ PRINT (dict_io_compress_one_fragment, 3);
+ PRINT (ref_compress_ref, 2);
+ PRINT (ref_compress_one_range, 3);
+ PRINT (refhash_calc_one_range, 4);
+ PRINT (refhash_compress_refhash, 2);
+ PRINT (refhash_compress_one_vb, 3);
+ PRINT (refhash_compress_digest, 3);
+ PRINT (ref_make_calculate_digest, 3);
+ PRINT (ref_contigs_compress, 3);
+ PRINT (ref_copy_compressed_sections_from_reference_file, 3);
+ PRINT (random_access_finalize_entries, 2);
+ PRINT (random_access_compress, 2);
+ PRINT (ctx_compress_counts, 2);
+ PRINT (zfile_compress_genozip_header, 2);
+
+ iprintf ("GENOZIP compute threads %s\n", str_int_commas (ms(profile.nanosecs.compute)).s);
+ PRINT (bgz_uncompress_vb, 1);
+ PRINT (ctx_clone, 1);
+ PRINT (scan_index_qnames_preprocessing, 1);
+ PRINT (zip_modify, 1);
+ PRINT (vcf_zip_modify, 2);
+ PRINT (vcf_optimize_QUAL, 3);
+ PRINT (vcf_optimize_INFO, 3);
+ PRINT (vcf_optimize_samples, 3);
+ PRINT (vcf_convert_probabilites_to_phred, 4);
+ PRINT (vcf_convert_likelihoods_to_phred, 4);
+ PRINT (vcf_phred_optimize, 4);
+ PRINT (optimize_float_3_sig_dig, 4);
+ PRINT (seg_all_data_lines, 1);
+ PRINT (seg_initialize, 2);
+ PRINT (qname_seg, 2);
+ PRINT (sam_cigar_seg, 2);
+ PRINT (squank_seg, 3);
+ PRINT (fastq_seg_get_lines, 2);
+ PRINT (seg_get_next_line, 3);
+ PRINT (seg_get_next_item, 3);
+ PRINT (fastq_seg_deep, 2);
+ PRINT (fastq_deep_seg_find_subseq, 3);
+ PRINT (fastq_seg_deep_consume_unique_matching_ent, 3);
+ PRINT (fastq_seg_SEQ, 2);
+ PRINT (sam_seg_SEQ, 2);
+ PRINT (sam_seg_SEQ_vs_ref, 3);
+ PRINT (sam_seg_bisulfite_M, 4);
+ PRINT (sam_seg_verify_saggy_line_SEQ, 3);
+ PRINT (sam_analyze_copied_SEQ, 3);
+ PRINT (aligner_seg_seq, 3);
+ PRINT (aligner_best_match, 4);
+ PRINT (aligner_first_layer, 5);
+ PRINT (aligner_additional_layers, 5);
+ PRINT (aligner_update_best, 5);
+ PRINT (aligner_get_word_from_seq, 5);
+ PRINT (aligner_seq_to_bitmap, 5);
+ PRINT (fastq_seg_DESC, 2);
+ PRINT (fastq_seg_saux, 2);
+ PRINT (bam_seq_to_sam, 2);
+ PRINT (fastq_seg_QUAL, 2);
+ PRINT (sam_seg_QUAL, 2);
+ PRINT (sam_seg_is_gc_line, 2);
+ PRINT (sam_seg_MD_Z_analyze, 2);
+ PRINT (sam_cigar_binary_to_textual, 2);
+ PRINT (sam_seg_bsseeker2_XG_Z_analyze, 2);
+ PRINT (sam_seg_sag_stuff, 2);
+ PRINT (sam_seg_aux_all, 2);
+ PRINT (sam_seg_SA_Z, 3);
+ PRINT (sam_seg_AS_i, 3);
+ PRINT (sam_seg_NM_i, 3);
+ PRINT (sam_seg_BWA_XA_Z, 3);
+ PRINT (sam_seg_BWA_XA_pos, 4);
+ PRINT (sam_seg_BWA_XS_i, 3);
+ PRINT (sam_seg_bismark_XM_Z, 3);
+ PRINT (sam_seg_bsbolt_XB, 3);
+ PRINT (sam_seg_TX_AN_Z, 3);
+ PRINT (sam_seg_barcode_qual, 3);
+ PRINT (sam_seg_CB_Z, 3);
+ PRINT (sam_seg_CR_Z, 3);
+ PRINT (sam_seg_RX_Z, 3);
+ PRINT (sam_seg_BX_Z, 3);
+ PRINT (sam_seg_QX_Z, 3);
+ PRINT (sam_seg_BC_Z, 3);
+ PRINT (sam_seg_gene_name_id, 3);
+ PRINT (sam_seg_fx_Z, 3);
+ PRINT (sam_seg_other_seq, 3);
+ PRINT (sam_seg_GR_Z, 3);
+ PRINT (sam_seg_GY_Z, 3);
+ PRINT (vcf_seg_QUAL, 3)
+ PRINT (sam_seg_ULTIMA_tp, 3);
+ PRINT (vcf_seg_PROBE_A, 3);
+ PRINT (random_access_merge_in_vb, 1);
+ PRINT (gencomp_absorb_add_to_queue, 1);
+ PRINT (gencomp_flush, 2);
+ PRINT (gencomp_offload_DEPN_to_disk, 3);
+ PRINT (gencomp_reread_lines_as_prescribed, 1);
+ PRINT (bgzf_uncompress_one_prescribed_block, 2);
+ PRINT (ctx_merge_in_vb_ctx, 1);
+ PRINT (wait_for_merge, 2);
+ PRINT (sam_deep_merge, 2);
+ PRINT (zip_compress_ctxs, 1);
+ PRINT (b250_zip_generate, 2);
+ PRINT (zip_generate_local, 2);
+ PRINT (codec_assign_best_codec, 2);
+
+ PRINT (compressor_bz2, 2);
+ PRINT (compressor_lzma, 2);
+ PRINT (compressor_bsc, 2);
+ PRINT (compressor_rans, 2);
+ PRINT (compressor_arith, 2);
+ PRINT (compressor_domq, 2);
+ PRINT (compressor_normq, 2);
+ PRINT (compressor_actg, 2);
+ PRINT (compressor_pbwt, 2);
+ PRINT (compressor_longr, 2);
+ PRINT (compressor_homp, 2);
+ PRINT (compressor_t0, 2);
+ PRINT (compressor_pacb, 2);
+ PRINT (compressor_smux, 2);
+
+ for_zctx
+ PRINT_(fields[zctx->did_i], zctx->tag_name, 2);
+
+ PRINT (sam_zip_prim_ingest_vb, 1);
+ PRINT (digest, 1);
+ }
+
+ // ZIP and PIZ compute thread
+ iprint0 ("COMPUTE THREADS ADMIN\n");
+ PRINT (buf_alloc_compute, 1);
+ PRINT (buflist_add_buf, 2);
+ PRINT (buf_destroy_do_do_compute, 1);
+ PRINT (buf_free_compute, 1);
+ PRINT (buflist_remove_buf, 2)
+ PRINT (buf_overlay_do, 1);
+
+ PRINT (file_open_z, 0);
+ PRINT (file_close, 0);
+ PRINT (buf_alloc_main, 0);
+ PRINT (buf_destroy_do_do_main, 0);
+ PRINT (buflist_test_overflows_do, 0);
+ PRINT (buflist_sort, 0);
+ PRINT (sections_create_index, 0);
+ PRINT (buflist_find_buf, 0);
+ PRINT (buf_low_level_free, 0);
+ PRINT (vb_release_vb_do, 0);
+ PRINT (vb_destroy_vb, 0);
+ PRINT (dispatcher_recycle_vbs, 0);
+ PRINT (refhash_generate_emoneg, 0);
+ PRINT (tmp1, 0);
+ PRINT (tmp2, 0);
+ PRINT (tmp3, 0);
+ PRINT (tmp4, 0);
+ PRINT (tmp5, 0);
+
+ if (profile.num_vbs) {
+ iprint0 ("\nVblock stats:\n");
+ iprintf (" Vblocks: %u\n", profile.num_vbs);
+ iprintf (" Maximum vblock size: %u MB\n", profile.max_vb_size_mb);
+ iprintf (" Average number of VBs in compute (per txt file): %s\n", profiler_get_avg_compute_vbs(',').s); // average during the lifetime of the ZIP/PIZ dispatcher, i.e. excluding global_area time etc
+ iprintf (" Average read time: %u ms\n", ms(profile.nanosecs.read) / profile.num_vbs);
+ iprintf (" Average compute time: %u ms\n", ms(profile.nanosecs.compute) / profile.num_vbs);
+ iprintf (" Average write time: %u ms\n", ms(profile.nanosecs.write) / profile.num_vbs);
+ }
+
+ iprint0 ("\n\n");
+
+ mutex_show_bottleneck_analsyis();
+}
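
The PRINT_/PRINT macro pair above drives the whole report: each profiled code path is a uint64_t nanosecond accumulator plus a call counter, PRINT stringifies the field name to use as the label, and the level argument indents it by 3 spaces. A minimal standalone sketch of the pattern (not the Genozip source; the timer names and values here are illustrative):

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

typedef struct { uint64_t read, compute, write; } U64PerTimer; // illustrative subset of timers

static U64PerTimer nanosecs = { .read = 1234567890, .compute = 9876543210ULL, .write = 456789012 };
static U64PerTimer counts   = { .read = 42,         .compute = 42,            .write = 7 };

static inline uint32_t ms (uint64_t ns) { return (uint32_t)(ns / 1000000); }

#define PRINT_(x, label, level) \
    if (nanosecs.x) printf ("%.*s %s: %u ms (N=%"PRIu64")\n", (level)*3, "         ", (label), ms (nanosecs.x), counts.x)
#define PRINT(x, level) PRINT_(x, #x, (level))

int main (void)
{
    PRINT (read, 1);    // "    read: 1234 ms (N=42)"
    PRINT (compute, 1); // "    compute: 9876 ms (N=42)"
    PRINT (write, 2);   // deeper indentation for nested timers
    return 0;
}
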
diff --git a/src/profiler.h b/src/profiler.h
index 8739fd2b..465a3431 100644
--- a/src/profiler.h
+++ b/src/profiler.h
@@ -18,7 +18,7 @@
file_open_z, file_close, buf_low_level_free, buflist_find_buf, buflist_sort, buflist_test_overflows_do,\
read, compute, compressor_bz2, compressor_lzma, compressor_bsc, \
write, write_fg, write_bg, zriter_write, piz_read_one_vb, vb_get_vb,\
- compressor_domq, compressor_actg, txtfile_read_block_bgzf_uncompress,\
+ compressor_domq, compressor_actg, bgz_uncompress_during_read, igzip_uncompress_during_read, \
piz_get_line_subfields, b250_zip_generate, zip_generate_local, zip_compress_ctxs, ctx_merge_in_vb_ctx, wait_for_merge,\
zfile_uncompress_section, codec_assign_best_codec, compressor_pbwt, compressor_longr, compressor_homp, compressor_t0, \
compressor_rans, compressor_arith, compressor_normq, compressor_pacb, compressor_smux, compressor_oq, \
@@ -27,10 +27,11 @@
reconstruct_vb, buf_alloc_main, buf_alloc_compute, buf_destroy_do_do_main, buf_destroy_do_do_compute, buf_overlay_do, \
buf_free_main, buf_free_compute, buflist_add_buf, buflist_remove_buf, \
dispatcher_recycle_vbs, sections_create_index, \
- txtfile_read_header, txtfile_read_vblock, txtfile_get_unconsumed_to_pass_to_next_vb, fastq_txtfile_have_enough_lines, \
- txtfile_read_block_bgzf, txtfile_read_block_zlib, txtfile_read_block_gz, txtfile_read_block_bz2, \
- bgzf_io_thread, bgzf_compute_thread, bgzf_writer_thread, bgzf_uncompress_vb, bgzf_copy_unconsumed_blocks, bgzf_read_block, \
+ txtfile_read_header, txtfile_read_vblock, txtfile_get_unconsumed_callback, fastq_txtfile_have_enough_lines, \
+ txtfile_read_block_bgz, txtfile_read_block_zlib, txtfile_read_block_igzip, txtfile_read_block_bz2, \
+ bgzf_io_thread, bgzf_compute_thread, bgzf_writer_thread, bgz_uncompress_vb, bgz_copy_unconsumed_blocks, bgzf_read_block, \
bgzf_compress_one_block, bgzf_uncompress_one_prescribed_block, \
+ gzil_read_block, \
zip_modify, vcf_zip_modify, vcf_optimize_samples, vcf_optimize_QUAL, vcf_optimize_INFO, vcf_convert_probabilites_to_phred, \
vcf_convert_likelihoods_to_phred, vcf_phred_optimize, optimize_float_3_sig_dig, \
seg_all_data_lines, seg_get_next_line, seg_get_next_item, seg_initialize,\
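
The hunks above edit a comma-separated list of timer names inside a macro; presumably (an assumption, since the full header is not shown here) the same list expands into uint64_t struct members, which is why profiler.c can compute the number of profiled fields as sizeof(profile.nanosecs)/sizeof(uint64_t). A sketch of that pattern with made-up names:

#include <stdint.h>

// the single list is the source of truth; adding e.g. gzil_read_block creates both slots
#define PROFILED_TIMERS read, compute, write, bgzf_read_block, gzil_read_block

typedef struct { uint64_t PROFILED_TIMERS; } U64PerTimer; // expands to: uint64_t read, compute, ...;

typedef struct {
    U64PerTimer nanosecs; // accumulated wall time per code path
    U64PerTimer count;    // number of times each path was timed
} ProfilerRecSketch;

_Static_assert (sizeof (U64PerTimer) == 5 * sizeof (uint64_t), "one uint64_t per listed timer");
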
diff --git a/src/progress.c b/src/progress.c
index 94db84e5..4173c72f 100644
--- a/src/progress.c
+++ b/src/progress.c
@@ -139,7 +139,8 @@ void progress_update (rom task, uint64_t sofar, uint64_t total, bool done)
if (!flag.debug_progress)
progress_update_status (NULL, "Finalizing...");
else {
- snprintf (progress_str, sizeof(progress_str), "Finalizing... %u%% task=%s sofar=%"PRIu64" total=%"PRIu64, (unsigned)percent, task, sofar, total);
+ snprintf (progress_str, sizeof(progress_str), "Finalizing... %u%% task=%s sofar=%.20s total=%.20s",
+ (unsigned)percent, task, str_int_commas(sofar).s, str_int_commas(total).s);
progress_update_status (NULL, progress_str);
}
}
@@ -154,8 +155,8 @@ void progress_update (rom task, uint64_t sofar, uint64_t total, bool done)
if (!flag.debug_progress)
snprintf (progress_str, sizeof(progress_str), "%u%% (%s)", (unsigned)percent, str_human_time (secs, false).s);
else
- snprintf (progress_str, sizeof(progress_str), "%u%% (%s) task=%s sofar=%"PRIu64" total=%"PRIu64" seconds_so_far=%d",
- (unsigned)percent, str_human_time (secs, false).s, task, sofar, total, seconds_so_far);
+ snprintf (progress_str, sizeof(progress_str), "%u%% (%s) task=%s sofar=%.20s total=%.20s seconds_so_far=%d",
+ (unsigned)percent, str_human_time (secs, false).s, task, str_int_commas(sofar).s, str_int_commas(total).s, seconds_so_far);
progress_update_status (NULL, progress_str);
}
diff --git a/src/qname.c b/src/qname.c
index e6528841..fa71c3aa 100644
--- a/src/qname.c
+++ b/src/qname.c
@@ -21,7 +21,7 @@ sSTRl(snip_redirect_to_QNAME2, 16);
static inline Did did_by_q (QType q)
{
- ASSERT (q >= 0 && q < NUM_QTYPES, "Invalid q=%d", q);
+ ASSERT (IN_RANGE (q, 0, NUM_QTYPES-1), "Invalid q=%d", q);
return (Did[]){ FASTQ_QNAME, FASTQ_QNAME2, FASTQ_LINE3 }[q]; // note: SAM and FASTQ have the same dids for QNAMEs
}
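
IN_RANGE is used throughout these hunks to replace open-coded bound checks; its definition is not part of this diff, but the conversions (e.g. q >= 0 && q < NUM_QTYPES becoming IN_RANGE (q, 0, NUM_QTYPES-1)) imply an inclusive-bounds macro along these lines (an assumption, not the actual definition):

// inclusive on both ends: IN_RANGE (x, lo, hi) <=> lo <= x && x <= hi
#define IN_RANGE(x, min_val, max_val) ((x) >= (min_val) && (x) <= (max_val))
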
diff --git a/src/recon_history.c b/src/recon_history.c
index 7630d63e..04d30e8f 100644
--- a/src/recon_history.c
+++ b/src/recon_history.c
@@ -116,9 +116,7 @@ void reconstruct_to_history (VBlockP vb, ContextP ctx)
rom lookup_type_name (LookupType lookup)
{
- static rom names[] = LOOKUP_TYPE_NAMES;
-
- return (lookup < 0 || lookup >= ARRAY_LEN(names)) ? "Invalid" : names[lookup];
+ return IN_RANGE (lookup, 0, ARRAY_LEN((rom[])LOOKUP_TYPE_NAMES)-1) ? (rom[])LOOKUP_TYPE_NAMES[lookup] : "Invalid LookupType";
}
void recon_history_get_historical_snip (VBlockP vb, ContextP ctx, LineIType buddy_line_i, pSTRp(snip))
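
The rewritten lookup_type_name above leans on a C99 idiom: casting the brace-list macro to (rom[]) creates a compound-literal array that can be indexed and measured with sizeof, so no separately declared static array is needed. A standalone sketch of the idiom with hypothetical names (rom is assumed to be a const char* typedef, as its usage elsewhere in the diff suggests):

#include <stdio.h>

typedef const char *rom;                        // assumption: matches Genozip's rom typedef
#define COLOR_NAMES { "Red", "Green", "Blue" }  // hypothetical brace-list macro
#define ARRAY_LEN(a) (sizeof (a) / sizeof ((a)[0]))

static rom color_name (int c)
{
    // (rom[])COLOR_NAMES is a compound literal; both [] and sizeof apply to it directly
    return (c >= 0 && c < (int)ARRAY_LEN ((rom[])COLOR_NAMES)) ? (rom[])COLOR_NAMES[c] : "InvalidColor";
}

int main (void)
{
    printf ("%s %s\n", color_name (1), color_name (7)); // prints: Green InvalidColor
    return 0;
}
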
diff --git a/src/reconstruct.h b/src/reconstruct.h
index d759e1fa..240dece9 100644
--- a/src/reconstruct.h
+++ b/src/reconstruct.h
@@ -45,8 +45,8 @@ extern void asspiz_text (VBlockP vb, FUNCLINE);
#define ABORT_PIZ0(string) ABORT_PIZ (string "%s", "")
// goes into ctx->history if not STORE_INT
-typedef packed_enum { LookupTxtData, LookupDict, LookupLocal, LookupPerLine } LookupType;
-#define LOOKUP_TYPE_NAMES { "LookupTxtData", "LookupDict", "LookupLocal", "LookupPerLine" }
+typedef packed_enum { LookupTxtData, LookupDict, LookupLocal, LookupPerLine } LookupType;
+#define LOOKUP_TYPE_NAMES { "LookupTxtData", "LookupDict", "LookupLocal", "LookupPerLine" }
extern rom lookup_type_name (LookupType lookup);
typedef struct __attribute__ ((packed)) { // 9 bytes
diff --git a/src/ref_cache.c b/src/ref_cache.c
index 7d042ccc..d9ed66f6 100644
--- a/src/ref_cache.c
+++ b/src/ref_cache.c
@@ -433,7 +433,5 @@ void ref_cache_detach (Reference ref)
rom cache_state_name (RefCacheState cs)
{
- rom names[] = CACHE_STATE_NAMES;
- if (cs >=0 && cs < NUM_CACHE_STATES) return names[cs];
- else return "INVALID_CACHE_STATS";
+ return IN_RANGE (cs, 0, NUM_CACHE_STATES-1) ? (rom[])CACHE_STATE_NAMES[cs] : "InvalidRefCacheState";
}
diff --git a/src/ref_contigs.c b/src/ref_contigs.c
index 20a1a29f..41c41804 100644
--- a/src/ref_contigs.c
+++ b/src/ref_contigs.c
@@ -268,7 +268,7 @@ static void ref_contigs_load_set_contig_names (Reference ref)
if (!contig[i].max_pos) continue;
WordIndex chrom_index = contig[i].ref_index;
- ASSERT (chrom_index >= 0 && chrom_index < chrom_len, "Expecting contig[%u].ref_index=%d to be in the range [0,%d]", i, chrom_index, (int)chrom_len-1);
+ ASSERT (IN_RANGE (chrom_index, 0, chrom_len-1), "Expecting contig[%u].ref_index=%d to be in the range [0,%d]", i, chrom_index, (int)chrom_len-1);
contig[i].char_index = chrom[chrom_index].char_index;
contig[i].snip_len = chrom[chrom_index].snip_len;
}
@@ -517,9 +517,13 @@ static ContigP ref_contig_search_by_gpos_v13 (const Reference ref, PosType64 gpo
return NULL; // not found
}
-static ContigP ref_contig_search_by_gpos (const Reference ref, PosType64 gpos, WordIndex first_ctg_i, WordIndex last_ctg_i)
+static ContigP ref_contig_search_by_gpos (const Reference ref, PosType64 gpos,
+ WordIndex first_ctg_i, WordIndex last_ctg_i,
+ bool next_contig_if_in_gap)
{
- if (first_ctg_i > last_ctg_i) return NULL; // gpos is after all contigs, or in the gaps between contigs
+ if (first_ctg_i > last_ctg_i)
+ return next_contig_if_in_gap ? B(Contig, ref->ctgs.contigs, first_ctg_i)
+ : NULL; // gpos is after all contigs, or in the gaps between contigs
WordIndex mid_ctg_i = (first_ctg_i + last_ctg_i) / 2;
ContigP rc = B(Contig, ref->ctgs.contigs, mid_ctg_i);
@@ -528,17 +532,18 @@ static ContigP ref_contig_search_by_gpos (const Reference ref, PosType64 gpos, W
return rc;
else if (gpos < rc->gpos)
- return ref_contig_search_by_gpos (ref, gpos, first_ctg_i, mid_ctg_i - 1);
+ return ref_contig_search_by_gpos (ref, gpos, first_ctg_i, mid_ctg_i - 1, next_contig_if_in_gap);
else
- return ref_contig_search_by_gpos (ref, gpos, mid_ctg_i + 1, last_ctg_i);
+ return ref_contig_search_by_gpos (ref, gpos, mid_ctg_i + 1, last_ctg_i, next_contig_if_in_gap);
}
WordIndex ref_contig_get_by_gpos (const Reference ref, PosType64 gpos,
int32_t seq_len, // if non-0 succeed only if range is entirely with the contig (may be positive or negative number)
- PosType32 *pos) // optional out, POS within the CHROM matching gpos
+ PosType32 *pos, // optional out, POS within the CHROM matching gpos
+ bool next_contig_if_in_gap)
{
- ContigP rc = VER(14) ? ref_contig_search_by_gpos (ref, gpos, 0, ref->ctgs.contigs.len32 - 1)
+ ContigP rc = VER(14) ? ref_contig_search_by_gpos (ref, gpos, 0, ref->ctgs.contigs.len32 - 1, next_contig_if_in_gap)
: ref_contig_search_by_gpos_v13 (ref, gpos);
if (!rc)
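
The new next_contig_if_in_gap argument above changes what the recursion returns when the search bottoms out between contigs: instead of NULL, the caller may ask for the contig that starts after the gap. A minimal standalone sketch of that behavior (illustrative types and names, not the Genozip structures):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

typedef struct { int64_t gpos, nbases; const char *name; } Ctg; // contigs sorted by start gpos

static const Ctg *ctg_by_gpos (const Ctg *ctgs, int first, int last, int64_t gpos, int next_if_in_gap)
{
    if (first > last)
        return next_if_in_gap ? &ctgs[first] : NULL; // gpos is in a gap (real code must also bound-check past the last contig)

    int mid = (first + last) / 2;
    if (gpos >= ctgs[mid].gpos && gpos < ctgs[mid].gpos + ctgs[mid].nbases) return &ctgs[mid];
    if (gpos < ctgs[mid].gpos) return ctg_by_gpos (ctgs, first, mid - 1, gpos, next_if_in_gap);
    return ctg_by_gpos (ctgs, mid + 1, last, gpos, next_if_in_gap);
}

int main (void)
{
    Ctg ctgs[] = { { 0, 100, "chr1" }, { 128, 50, "chr2" } }; // gap at gpos 100..127
    const Ctg *c = ctg_by_gpos (ctgs, 0, 1, 110, 1);          // 110 falls in the gap
    printf ("%s\n", c ? c->name : "(gap)");                   // prints "chr2"
    return 0;
}
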
diff --git a/src/ref_iupacs.c b/src/ref_iupacs.c
index 7d62850b..0267dde6 100644
--- a/src/ref_iupacs.c
+++ b/src/ref_iupacs.c
@@ -98,7 +98,7 @@ void ref_iupacs_load (Reference ref)
if (flag.show_ref_iupacs) {
PosType32 pos;
- WordIndex chrom_index = ref_contig_get_by_gpos (ref, iupacs[i].gpos, 0, &pos);
+ WordIndex chrom_index = ref_contig_get_by_gpos (ref, iupacs[i].gpos, 0, &pos, false);
iprintf ("IUPAC=%c\tCHROM=%s\tPOS=%u\tGPOS=%"PRIu64"\n",
iupacs[i].iupac, ctx_get_snip_by_word_index0 (ZCTX(FASTA_CONTIG), chrom_index), pos, iupacs[i].gpos);
}
diff --git a/src/reference.c b/src/reference.c
index 1256f949..1c260bda 100644
--- a/src/reference.c
+++ b/src/reference.c
@@ -220,7 +220,7 @@ static void ref_uncompact_ref (RangeP r, int64_t first_bit, int64_t last_bit, co
RangeP ref_get_range_by_chrom (Reference ref, WordIndex chrom, rom *chrom_name)
{
decl_zctx (CHROM);
- ASSERT (chrom >= 0 && chrom < zctx->word_list.len, "chrom=%d out of range - ctx->word_list.len=%u",
+ ASSERT (IN_RANGE (chrom, 0, zctx->word_list.len32-1), "chrom=%d out of range - ctx->word_list.len=%u",
chrom, zctx->word_list.len32);
if (chrom_name)
@@ -418,7 +418,7 @@ static void ref_uncompress_one_range (VBlockP vb)
uint64_t start = MAX_(sec_start_within_contig, 0);
uint64_t len = ref_sec_len - initial_flanking_len - final_flanking_len;
- ASSERT (len >= 0 && len <= ref_sec_len, "expecting ref_sec_len=%"PRIu64" >= initial_flanking_len=%"PRIu64" + final_flanking_len=%"PRIu64,
+ ASSERT (IN_RANGE (len, 0, ref_sec_len), "expecting ref_sec_len=%"PRIu64" >= initial_flanking_len=%"PRIu64" + final_flanking_len=%"PRIu64,
ref_sec_len, initial_flanking_len, final_flanking_len);
RefLock lock = ref_lock (vb->ref, start + r->gpos, len + 63);
diff --git a/src/reference.h b/src/reference.h
index a60c9eb9..decca318 100644
--- a/src/reference.h
+++ b/src/reference.h
@@ -111,7 +111,7 @@ extern void ref_contigs_load_contigs (Reference ref);
extern uint32_t ref_contigs_get_num_contigs (Reference ref);
extern PosType64 ref_contigs_get_genome_nbases (Reference ref);
-extern WordIndex ref_contig_get_by_gpos (const Reference ref, PosType64 gpos, int32_t seq_len, PosType32 *pos);
+extern WordIndex ref_contig_get_by_gpos (const Reference ref, PosType64 gpos, int32_t seq_len, PosType32 *pos, bool next_contig_if_in_gap);
// cache stuff
extern bool ref_cache_is_cached (Reference ref);
@@ -147,7 +147,7 @@ static inline void ref_assert_nucleotide_available (ConstRangeP range, PosType64
bool available;
switch (flag.reference) {
case REF_STORED : available = ref_is_nucleotide_set (range, pos); break;
- default : available = (pos >= range->first_pos && pos <= range->last_pos); break;
+ default : available = IN_RANGE (pos, range->first_pos, range->last_pos); break;
}
ASSERT (available, "reference is not set: chrom=%.*s pos=%"PRId64, (range)->chrom_name_len, (range)->chrom_name, (pos));
}
diff --git a/src/sam.h b/src/sam.h
index 6238d49e..ccd4c727 100644
--- a/src/sam.h
+++ b/src/sam.h
@@ -152,6 +152,7 @@
#pragma GENDICT SAM_SAG=DTYPE_FIELD=SAG // PRIM and DEPN: the sag from which to copy data
#pragma GENDICT SAM_SAALN=DTYPE_FIELD=SAALN // DEPN: sags: the alignment within sag which is this line (not needed for PRIM, as the aln_i is always 0)
#pragma GENDICT SAM_FQ_AUX=DTYPE_FIELD=FQAUX // used for consuming some AUX fields in case of translation to FASTQ (name is "MC_Z" for as up to 14.0.25 it was called SAM_MC_Z)
+#pragma GENDICT SAM_FQ_AUX_OLD=DTYPE_FIELD=MC_Z // 15.0.62: restores the MC_Z dict name, needed for back comp: the name of SAM_FQ_AUX was incorrectly changed in some 15.0.x version, so we now need to support both names for back comp...
// Standard AUX fields - section 1.1 here: https://samtools.github.io/hts-specs/SAMtags.pdf
#define SAM_FIRST_OPTIONAL_DID OPTION_AM_i
@@ -170,7 +171,7 @@
#pragma GENDICT OPTION_MQ_i=DTYPE_2=MQ:i // Mapping quality of the mate/next segment
#pragma GENDICT OPTION_NH_i=DTYPE_2=NH:i // Number of reported alignments that contain the query in the current record
#pragma GENDICT OPTION_IH_i=DTYPE_2=IH:i // Query hit total count. Novoalign: Number of stored alignments in SAM that contains the query in the current record. Only present if there is more than one alignment reported for the read (i.e. IH <= NH)
-#pragma GENDICT OPTION_HI_i=DTYPE_2=HI:i // Query hit index (a number [1,NH])
+#pragma GENDICT OPTION_HI_i=DTYPE_2=HI:i // Query hit index ∈[1,NH]
#pragma GENDICT OPTION_NM_i=DTYPE_2=NM:i // Edit distance to the reference
#pragma GENDICT OPTION_PQ_i=DTYPE_2=PQ:i // Phred likelihood of the template, conditional on the mapping locations of both/all segments being correct.
#pragma GENDICT OPTION_SM_i=DTYPE_2=SM:i // Template-independent mapping quality
diff --git a/src/sam_cigar.c b/src/sam_cigar.c
index 295d6d8d..4f4e1da9 100644
--- a/src/sam_cigar.c
+++ b/src/sam_cigar.c
@@ -665,7 +665,7 @@ static void sam_cigar_update_random_access (VBlockSAMP vb, ZipDataLineSAMP dl)
if (LN == -1) {}
- else if (last_pos >= 1 && last_pos <= LN)
+ else if (IN_RANGE (last_pos, 1, LN))
random_access_update_last_pos (VB, last_pos);
else // we circled back to the beginning for the chromosome - i.e. this VB RA is the entire chromosome
@@ -1000,6 +1000,7 @@ SPECIAL_RECONSTRUCTOR_DT (sam_piz_special_COPY_BUDDY_CIGAR)
// get CIGAR field value previously reconstructed in BAM **BINARY** format
STR(bam_cigar);
+    CTX(SAM_CIGAR)->empty_lookup_ok = true; // in case CIGAR is "*" (i.e. empty in BAM); this was incorrectly missing and was added in 15.0.62
sam_reconstruct_from_buddy_get_textual_snip (vb, CTX(SAM_CIGAR), bt, pSTRa(bam_cigar));
#ifndef GENOZIP_ALLOW_UNALIGNED_ACCESS
@@ -1041,8 +1042,8 @@ void sam_reconstruct_main_cigar_from_sag (VBlockSAMP vb, bool do_htos, ReconType
uint32_t uncomp_len = cigar_len;
void *success = rans_uncompress_to_4x16 (VB, comp, a->cigar.piz.comp_len,
B1ST8(vb->scratch), &uncomp_len);
- ASSPIZ (success && uncomp_len == cigar_len, "rans_uncompress_to_4x16 failed to decompress an SA Aln CIGAR data: grp_i=%u aln_i=%"PRIu64" success=%u comp_len=%u uncomp_len=%u expected_uncomp_len=%u cigar_index=%"PRIu64" comp[10]=%s",
- ZGRP_I(vb->sag), ZALN_I(a), !!success, (uint32_t)a->cigar.piz.comp_len, uncomp_len, cigar_len, (uint64_t)a->cigar.piz.index, str_hex10 (comp, a->cigar.piz.comp_len).s);
+ ASSPIZ (success && uncomp_len == cigar_len, "rans_uncompress_to_4x16 failed to decompress an SA Aln CIGAR data: grp_i=%u aln_i=%"PRIu64" success=%u comp_len=%u uncomp_len=%u expected_uncomp_len=%u cigar_index=%"PRIu64" comp[10]=%.10s",
+ ZGRP_I(vb->sag), ZALN_I(a), !!success, (uint32_t)a->cigar.piz.comp_len, uncomp_len, cigar_len, (uint64_t)a->cigar.piz.index, str_to_hex (comp, a->cigar.piz.comp_len).s);
}
// case: not compressed
@@ -1160,12 +1161,8 @@ bool cigar_is_same_signature (CigarSignature sig1, CigarSignature sig2)
return !memcmp (sig1.bytes, sig2.bytes, CIGAR_SIG_LEN);
}
-DisCigarSig cigar_display_signature (CigarSignature sig)
-{
- DisCigarSig dis;
-
- str_to_hex (sig.bytes, CIGAR_SIG_LEN, dis.s, false);
-
- return dis;
+StrText cigar_display_signature (CigarSignature sig)
+{
+ return str_to_hex (sig.bytes, CIGAR_SIG_LEN);
}
diff --git a/src/sam_deep.c b/src/sam_deep.c
index c02b3f8b..1a1d6341 100644
--- a/src/sam_deep.c
+++ b/src/sam_deep.c
@@ -89,8 +89,6 @@ static void sam_deep_zip_show_index_stats (void)
static void sam_deep_zip_display_reasons (void)
{
- static rom rsn_names[] = RSN_NAMES;
-
uint64_t total = z_file->num_lines;
iprintf ("%s Alignments breakdown by deepability:\n", dt_name (txt_file->data_type));
@@ -98,7 +96,7 @@ static void sam_deep_zip_display_reasons (void)
for (int i=0; i < NUM_DEEP_STATS_ZIP; i++)
if (z_file->deep_stats[i])
- iprintf ("%-13.13s: %"PRIu64" (%.1f%%)\n", rsn_names[i], z_file->deep_stats[i], 100.0 * (double)z_file->deep_stats[i] / (double)total);
+ iprintf ("%-13.13s: %"PRIu64" (%.1f%%)\n", (rom[])RSN_NAMES[i], z_file->deep_stats[i], 100.0 * (double)z_file->deep_stats[i] / (double)total);
}
// Called during zip_finalize of the SAM component for a Deep compression
@@ -619,14 +617,12 @@ static void sam_piz_deep_finalize_ents (void)
}
if (flag.show_deep) {
- static rom names[] = DEEP_ENT_NAMES;
-
uint64_t total = z_file->deep_stats[QNAME_BYTES] + z_file->deep_stats[SEQ_BYTES] + z_file->deep_stats[QUAL_BYTES];
iprint0 ("\ndeep_ents RAM consumption breakdown by field:\n");
iprintf ("Total: %s\n", str_size (total).s);
for (int i=0; i < NUM_DEEP_STATS_PIZ; i++)
- iprintf ("%-5.5s: %s (%.1f%%)\n", names[i], str_size (z_file->deep_stats[i]).s, 100.0 * (double)z_file->deep_stats[i] / (double)total);
+ iprintf ("%-5.5s: %s (%.1f%%)\n", (rom[])DEEP_ENT_NAMES[i], str_size (z_file->deep_stats[i]).s, 100.0 * (double)z_file->deep_stats[i] / (double)total);
}
// We need to set param only after deep_ents/index is finalized.
diff --git a/src/sam_fields.c b/src/sam_fields.c
index 5a4f661b..a6e561a5 100644
--- a/src/sam_fields.c
+++ b/src/sam_fields.c
@@ -370,7 +370,7 @@ static void sam_seg_SM_i (VBlockSAMP vb, ZipDataLineSAMP dl, int64_t SM, unsigne
{
decl_ctx (OPTION_SM_i);
- if (SM >= 0 && SM <= 255 &&
+ if (IN_RANGE (SM, 0, 255) &&
SM != 254 && // note: 254 is a valid, but highly improbable value - we use 254 for "copy from MAPQ" so a actual 254 is segged as an exception
!(SM && !dl->MAPQ)) { // we're expecting SM=0 if MAPQ=0
@@ -416,7 +416,7 @@ static void sam_seg_AM_i (VBlockSAMP vb, ZipDataLineSAMP dl, int64_t AM, unsigne
// note: currently we only support for this algorithm AM appearing after SM. Easily fixable if ever needed.
// AM is often one of 3 options: 0, =SM =MAPQ-SM. If SM=0 then AM is expected to be 0.
if (has(SM_i) &&
- AM >= 0 && AM <= 255 && // valid value
+ IN_RANGE (AM, 0, 255) && // valid value
AM != 253 && AM != 254) { // note: 253,254 are valid, but highly improbable values
int32_t SM;
@@ -667,7 +667,7 @@ static inline void sam_seg_AS_i (VBlockSAMP vb, ZipDataLineSAMP dl, int64_t as,
// in bowtie2-like data, we might be able to copy from mate
else if (segconf.is_bowtie2) {
- ASSERT (as >= MIN_AS_i && as <= MAX_AS_i, "%s: AS=%"PRId64" is ∉ [%d,%d]", LN_NAME, as, MIN_AS_i, MAX_AS_i);
+ ASSERT (IN_RANGE (as, MIN_AS_i, MAX_AS_i), "%s: AS=%"PRId64" is ∉ [%d,%d]", LN_NAME, as, MIN_AS_i, MAX_AS_i);
ZipDataLineSAMP mate_dl = DATA_LINE (vb->mate_line_i); // an invalid pointer if mate_line_i is -1
@@ -894,7 +894,7 @@ SPECIAL_RECONSTRUCTOR (sam_piz_special_DEMUX_MAPQ)
// Seg against mate if we have one, or else against MAPQ as it is often very similar
static inline void sam_seg_MQ_i (VBlockSAMP vb, ZipDataLineSAMP dl, int64_t mq, unsigned add_bytes)
{
- ASSERT (mq >=0 && mq <= 255, "%s: Invalid MQ:i=%"PRId64": expecting an integer [0,255]", LN_NAME, mq);
+ ASSERT (IN_RANGE (mq, 0, 255), "%s: Invalid MQ:i=%"PRId64": expecting an integer [0,255]", LN_NAME, mq);
dl->MQ = mq;
ContextP channel_ctx = seg_mux_get_channel_ctx (VB, OPTION_MQ_i, (MultiplexerP)&vb->mux_MQ, sam_has_mate);
@@ -912,7 +912,7 @@ static inline void sam_seg_MQ_i (VBlockSAMP vb, ZipDataLineSAMP dl, int64_t mq,
// PQ:i Phred likelihood of the template, conditional on the mapping locations of both/all segments being correct.
static inline void sam_seg_PQ_i (VBlockSAMP vb, ZipDataLineSAMP dl, int64_t pq, unsigned add_bytes)
{
- if (pq >= 0 && pq <= 65534) // dl->PQ is uint16_t
+ if (IN_RANGE (pq, 0, 65534)) // dl->PQ is uint16_t
dl->PQ = pq + 1; // +1, so that if pq is out of this range, leave dl as 0, which will mean "no valid PQ"
ContextP channel_ctx = seg_mux_get_channel_ctx (VB, OPTION_PQ_i, (MultiplexerP)&vb->mux_PQ, sam_has_mate);
@@ -977,7 +977,7 @@ void sam_seg_buddied_i_fields (VBlockSAMP vb, ZipDataLineSAMP dl, Did did_i,
// BAM spec permits values up to 0xffffffff, and SAM is unlimited, however for code covenience we limit
// values segged with this method to int32_t. If this is ever an issue, it can be solved.
- ASSERT (my_value >= -0x80000000LL && my_value <= 0x7fffffffLL, "%s: Value of %s is %"PRId64", outside the supported range by Genozip of [%d,%d]",
+ ASSERT (IN_RANGE (my_value, -0x80000000LL, 0x7fffffffLL), "%s: Value of %s is %"PRId64", outside the supported range by Genozip of [%d,%d]",
LN_NAME, ctx->tag_name, my_value, -0x80000000, 0x7fffffff);
#define by_mate (mux->special_code == SAM_SPECIAL_DEMUX_BY_MATE)
diff --git a/src/sam_piz.c b/src/sam_piz.c
index 98e4d491..87a1cf71 100644
--- a/src/sam_piz.c
+++ b/src/sam_piz.c
@@ -778,9 +778,11 @@ CONTAINER_FILTER_FUNC (sam_piz_filter)
// collect_coverage: rather than reconstructing optional, reconstruct SAM_FQ_AUX that just consumes MC:Z if it exists
else if (dict_id.num == _SAM_AUX) {
- if (flag.collect_coverage) { // filter_repeats is set in the AUX container since v14
- ASSISLOADED(CTX(SAM_FQ_AUX));
- reconstruct_from_ctx (vb, SAM_FQ_AUX, 0, false);
+ if (flag.collect_coverage) {
+ if (CTX(SAM_FQ_AUX )->is_loaded) reconstruct_from_ctx (vb, SAM_FQ_AUX, 0, false); // filter_repeats is set in the AUX container since v14
+ else if (CTX(SAM_FQ_AUX_OLD)->is_loaded) reconstruct_from_ctx (vb, SAM_FQ_AUX_OLD, 0, false);
+            else ABORT0 ("Neither SAM_FQ_AUX nor SAM_FQ_AUX_OLD is loaded");
+
return false; // don't reconstruct AUX
}
diff --git a/src/sam_private.h b/src/sam_private.h
index 36ca75bd..281778c8 100644
--- a/src/sam_private.h
+++ b/src/sam_private.h
@@ -610,8 +610,7 @@ extern void sam_reconstruct_SA_cigar_from_SA_Group (VBlockSAMP vb, SAAln *a);
extern CigarSignature cigar_sign (STRp(cigar));
extern bool cigar_is_same_signature (CigarSignature sig1, CigarSignature sig2) ;
-typedef struct { char s[CIGAR_SIG_LEN*2 + 1]; } DisCigarSig;
-extern DisCigarSig cigar_display_signature (CigarSignature sig);
+extern StrText cigar_display_signature (CigarSignature sig);
#define SA_CIGAR_DISPLAY_LEN 12
extern rom sam_piz_display_aln_cigar (const SAAln *a);
@@ -880,7 +879,7 @@ static inline char sam_seg_sam_type_to_bam_type (char type, int64_t n)
// i converts to one of 6: C,c,S,s,I,i
for (int i=0 ; i < 6; i++)
- if (n >= lt_min (test[i]) && n <= lt_max (test[i]))
+ if (IN_RANGE (n, lt_min (test[i]), lt_max (test[i])))
return lt_desc[test[i]].sam_type;
return 0; // number out of range
diff --git a/src/sam_qual.c b/src/sam_qual.c
index 13e74bf8..95c26b91 100644
--- a/src/sam_qual.c
+++ b/src/sam_qual.c
@@ -31,7 +31,7 @@ rom bam_qual_display (bytes qual, uint32_t l_seq) // caller should free memory
else {
char *str = MALLOC (l_seq*3 + 2);
- return str_to_hex (qual, l_seq, str, true);
+ return str_to_hex_ (qual, l_seq, str, true);
}
}
diff --git a/src/sam_sa.c b/src/sam_sa.c
index 667e1fd2..7317568c 100644
--- a/src/sam_sa.c
+++ b/src/sam_sa.c
@@ -216,7 +216,7 @@ void sam_seg_SA_Z (VBlockSAMP vb, ZipDataLineSAMP dl, STRp(sa), unsigned add_byt
// We already tested the SA to be good when we added this line to PRIM in sam_seg_prim_add_sag_SA
- ASSSEG (num_alns >= 2 && num_alns <= MAX_SA_NUM_ALNS, "%s: Not expecting a malformed SA field in PRIM. num_alns=%u SA:Z=\"%.*s\"",
+ ASSSEG (IN_RANGE (num_alns, 2, MAX_SA_NUM_ALNS), "%s: Not expecting a malformed SA field in PRIM. num_alns=%u SA:Z=\"%.*s\"",
LN_NAME, num_alns, STRf(sa));
// use SA.local to store number of alignments in this SA Group (inc. primary)
diff --git a/src/sam_sag_scan.c b/src/sam_sag_scan.c
index d475d900..fe6851a6 100644
--- a/src/sam_sag_scan.c
+++ b/src/sam_sag_scan.c
@@ -36,7 +36,7 @@ static rom scan_index_one_line (VBlockSAMP vb, rom alignment, uint32_t remaining
alignment_len = GET_UINT32 (alignment) + 4;
// a non-sensical block_size might indicate an false-positive identification of a BAM alignment in bam_unconsumed
- ASSERT (alignment_len >= sizeof (BAMAlignmentFixed) && alignment_len <= remaining_txt_len,
+ ASSERT (IN_RANGE (alignment_len, sizeof (BAMAlignmentFixed), remaining_txt_len),
"%s: alignment_len=%u is out of range - too small, or goes beyond end of txt data: remaining_txt_len=%u",
LN_NAME, alignment_len, remaining_txt_len);
@@ -160,8 +160,8 @@ static void scan_index_qnames_preprocessing (VBlockP vb)
START_TIMER;
// if the txt file is compressed with BGZF, we uncompress now, in the compute thread
- if (txt_file->codec == CODEC_BGZF)
- bgzf_uncompress_vb (vb); // some of the blocks might already have been decompressed while reading - we decompress the remaining
+ if (TXT_IS_BGZF)
+ bgz_uncompress_vb (vb, CODEC_BGZF); // some of the blocks might already have been decompressed while reading - we decompress the remaining
rom next = B1STtxt;
rom after = BAFTtxt;
diff --git a/src/sam_sag_zip.c b/src/sam_sag_zip.c
index ffa455ee..596a2a8f 100644
--- a/src/sam_sag_zip.c
+++ b/src/sam_sag_zip.c
@@ -1069,8 +1069,5 @@ void sam_stats_reallocate (void)
rom sag_type_name (SagType sagt)
{
- rom names[] = SAM_SAG_TYPE_NAMES;
-
- if (sagt < 0 || sagt >= NUM_SAG_TYPES) return "InvalidSagType";
- else return names[sagt];
+ return IN_RANGE (sagt, 0, NUM_SAG_TYPES-1) ? (rom[])SAM_SAG_TYPE_NAMES[sagt] : "InvalidSagType";
}
diff --git a/src/sam_seg.c b/src/sam_seg.c
index 937ed7f5..f1ee1e11 100644
--- a/src/sam_seg.c
+++ b/src/sam_seg.c
@@ -663,8 +663,8 @@ void sam_segconf_finalize (VBlockP vb_)
segconf.sam_cigar_len = 1 + ((segconf.sam_cigar_len - 1) / vb->lines.len32); // set to the average CIGAR len (rounded up)
segconf.est_sam_factor = (double)segconf.est_segconf_sam_size / (double)Ltxt;
- if (num_lines_at_max_len(vb) > vb->lines.len32 / 2 && // more than half the lines are at exactly maximal length
- (vb->lines.len32 > 100 || txt_file->is_eof) && // enough lines to be reasonably convinced that this is not by chance
+ if (num_lines_at_max_len(vb) > vb->lines.len32 / 2 && // more than half the lines are at exactly maximal length
+ (vb->lines.len32 > 100 || txt_file->no_more_blocks) && // enough lines to be reasonably convinced that this is not by chance
!segconf.is_long_reads) // TO DO: trimming long-read qual in FASTQ with --deep would mess up LONGR codec, we need to sort this out
segconf.sam_cropped_at = vb->longest_seq_len; // possibily the FASTQ reads were cropped to be all equal length
diff --git a/src/sections.c b/src/sections.c
index 2c928fbe..dcbef47f 100644
--- a/src/sections.c
+++ b/src/sections.c
@@ -175,7 +175,7 @@ bool sections_prev_sec2 (Section *sl_ent, // optional in/out. if NULL - search
{
Section sec = sl_ent ? *sl_ent : NULL;
- ASSERT (!sec || (sec >= B1ST(SectionEnt, z_file->section_list_buf) && sec <= BLST(SectionEnt, z_file->section_list_buf)),
+ ASSERT (!sec || IN_RANGE (sec, B1ST(SectionEnt, z_file->section_list_buf), BLST(SectionEnt, z_file->section_list_buf)),
"Invalid sec: st1=%s st2=%s", st_name (st1), st_name (st2));
while (!sec || sec >= B1ST (SectionEnt, z_file->section_list_buf)) {
@@ -458,7 +458,7 @@ void sections_list_memory_to_file_format (bool in_place) // in place, or to evb-
SectionEnt sec = *B(SectionEnt, z_file->section_list_buf, i); // copy before it gets overwritten
int64_t offset_delta = (int64_t)sec.offset - (int64_t)prev_sec.offset;
- ASSERT (offset_delta >=0LL && offset_delta <= 0xffffffffLL, // note: offset_delta is size of previous section
+ ASSERT (IN_RANGE (offset_delta, 0LL, 0xffffffffLL), // note: offset_delta is size of previous section
"section_i=%u size=%"PRId64" st=%s is too big", i-1, offset_delta, st_name ((fsec-1)->st));
int32_t vb_delta = INTERLACE(int32_t, (int32_t)sec.vblock_i - (int32_t)prev_sec.vblock_i);
diff --git a/src/seg.c b/src/seg.c
index 48e7ebc9..7784a1a2 100644
--- a/src/seg.c
+++ b/src/seg.c
@@ -1541,7 +1541,7 @@ void zip_modify (VBlockP vb)
void seg_all_data_lines (VBlockP vb)
{
START_TIMER;
-
+
// sanity (leave 64b to detect bugs)
ASSERT (vb->lines.len <= vb->txt_data.len, "%s: Expecting lines.len=%"PRIu64" < txt_data.len=%"PRIu64,
VB_NAME, vb->lines.len, vb->txt_data.len); // 64 bit test in case of memory corruption
@@ -1565,7 +1565,13 @@ void seg_all_data_lines (VBlockP vb)
if (debug_lines_ctx) debug_lines_ctx->ltype = LT_UINT32;
}
- DT_FUNC (vb, seg_initialize)(vb); // data-type specific initialization
+ DT_FUNC (vb, seg_initialize)(vb); // data-type specific initialization (SAM DEPN: re-read lines here)
+
+ if (flag_is_show_vblocks (ZIP_TASK_NAME))
+ iprintf ("SEG(id=%d) vb=%s Ltxt=%u %.*s\n", vb->id, VB_NAME, vb->txt_data.len32,
+ MIN_(64, Ltxt), cond_str (!DTP(is_binary), "txt_data[64]=", B1STtxt ? B1STtxt : "(null)"));
+
+ ASSERTNOTEMPTY (vb->txt_data); // after this print ^
// in segconf, seg_initialize might change the data_type and realloc the segconf vb (eg FASTA->FASTQ)
if (segconf.running) vb = vb_get_nonpool_vb (VB_ID_SEGCONF);
@@ -1621,8 +1627,8 @@ void seg_all_data_lines (VBlockP vb)
line = next_line;
// update line_bgzf_uoffset to next line
- if (txt_file->codec == CODEC_BGZF && vb->comp_i == COMP_MAIN)
- bgzf_zip_advance_index (vb, line_len);
+ if (TXT_IS_BGZF && vb->comp_i == COMP_MAIN)
+ bgz_zip_advance_index (vb, line_len);
// if our estimate number of lines was too small, increase it
if (vb->line_i == vb->lines.len32-1 && line - vb->txt_data.data != vb->txt_data.len)
diff --git a/src/segconf.c b/src/segconf.c
index 52cd62ee..ab4e0a97 100644
--- a/src/segconf.c
+++ b/src/segconf.c
@@ -139,6 +139,10 @@ static void segconf_set_vb_size (VBlockP vb, uint64_t curr_vb_size)
flag.vblock, MAX_VBLOCK_MEMORY);
segconf.vb_size = (uint64_t)mem_size_mb MB;
+
+ // we can't use GZIL for tiny VBs
+ if (TXT_IS_GZIL && segconf.vb_size < GZIL_MAX_BLOCK_SIZE)
+        txt_file->codec = CODEC_GZ; // leave source_codec=GZIL for stats
}
// case: developer option - a number of bytes eg "100000B"
@@ -364,12 +368,12 @@ static void segconf_show_has (void)
void segconf_calculate (void)
{
// check for components that don't need segconf
- if (segconf_no_calculate()) return;
+ if (segconf_no_calculate()) goto finalize;
if (TXT_DT(GNRIC) || // no need for a segconf test VB in generic files
flag.skip_segconf) { // for use in combination with --biopsy, to biopsy of a defective file
segconf_set_vb_size (NULL, segconf.vb_size);
- return;
+ goto finalize;
}
segconf.running = true;
@@ -382,12 +386,13 @@ void segconf_calculate (void)
for (int s = (txt_file->codec == CODEC_BZ2); s < ARRAY_LEN(vb_sizes) && !Ltxt; s++) {
segconf.vb_size = vb_sizes[s];
+ if (TXT_IS_GZIL) segconf.vb_size = ROUNDUP1M (segconf.vb_size);
txtfile_read_vblock (vb);
}
if (!Ltxt) {
// error unless this is a header-only file
- ASSERT (txt_file->header_size, "Failed to segconf. Possible reasons: cannot find a single valid line. eof=%s", TF(vb->is_eof));
+ ASSERT (txt_file->header_size, "Failed to segconf. Possible reasons: cannot find a single valid line. is_last_vb_in_txt_file=%s", TF(vb->is_last_vb_in_txt_file));
segconf_set_vb_size (vb, save_vb_size);
goto done; // cannot find a single line - vb_size set to default and other segconf fields remain default, or previous file's setting
@@ -426,7 +431,7 @@ void segconf_calculate (void)
// finalize flag.zip_txt_modified after finalizing optimizations
if (flag.optimize) {
segconf_finalize_optimize(); // filter fields to be optimized based on positive or negative flags
- segconf.zip_txt_modified = segconf_get_zip_txt_modified (false);
+ segconf.zip_txt_modified |= segconf_get_zip_txt_modified (false); // note: might be already marked as modified if truncated
}
// true if txt_file->num_lines need to be counted at zip_init_vb instead of zip_update_txt_counters,
@@ -464,8 +469,8 @@ void segconf_calculate (void)
buf_insert (evb, txt_file->unconsumed_txt, char, 0, txt_data_copy.data, txt_data_copy.len, "txt_file->unconsumed_txt");
buf_destroy (txt_data_copy);
- if (txt_file->codec == CODEC_BGZF)
- bgzf_return_segconf_blocks (vb); // return BGZF used by the segconf VB to the unconsumed BGZF blocks
+ if (TXT_IS_BGZF || TXT_IS_GZIL)
+ bgz_return_segconf_blocks (vb); // return BGZF/GZIL blocks used by the segconf VB to the unconsumed blocks
// in case of generated component data - undo
vb->gencomp_lines.len = 0;
@@ -487,6 +492,14 @@ void segconf_calculate (void)
// restore (used for --optimize-DESC / --add-line-numbers)
txt_file->num_lines = 0;
segconf.running = false;
+
+finalize: // code to execute even if segconf was skipped
+ flag.zip_uncompress_source_during_read =
+ flag.pair == PAIR_R2 || // if we're reading the 2nd paired file, fastq_txtfile_have_enough_lines needs the whole data
+ flag.make_reference || // unconsumed callback for make-reference needs to inspect the whole data
+ flag.biopsy ||
+ flag.zip_lines_counted_at_init_vb; // *_zip_init_vb needs to count lines
+
#undef vb
}
diff --git a/src/strings.c b/src/strings.c
index 8fd5248e..b13834b6 100644
--- a/src/strings.c
+++ b/src/strings.c
@@ -67,13 +67,12 @@ StrText char_to_printable (char c)
}
}
-// replaces \t, \n, \r, \b, \ with "\t" etc, replaces unprintables with '?'. caller should allocate out.
-// returns length (excluding \0). out should be allocated by caller to (in_len*2 + 1), out is null-terminated
+// replaces \t, \n, \r, \b, \\ with "\t" etc, and other unprintables with hex escapes like "\xfe". returns length (excluding \0). out should be allocated by caller to (in_len*4 + 1), out is null-terminated
uint32_t str_to_printable (STRp(in), char *out, int out_len)
{
char *start = out;
- for (uint32_t i=0; i < in_len && out_len > 3; i++) // 3 = 2 characters + nul
+ for (uint32_t i=0; i < in_len && out_len > 5; i++) // 5 = 4 characters + nul
switch (in[i]) {
case 32 ... '\\'-1: case '\\'+1 ... 126:
*out++ = in[i]; ; out_len -= 1; break;
@@ -83,7 +82,7 @@ uint32_t str_to_printable (STRp(in), char *out, int out_len)
case '\b' : *out++ = '\\'; *out++ = 'b' ; out_len -= 2; break;
case '\\' : *out++ = '\\'; *out++ = '\\'; out_len -= 2; break;
case 0 ... 7 : *out++ = '\\'; *out++ = '0' + in[i]; out_len -= 2; break;
- default : *out++ = '\\'; *out++ = '?' ; out_len -= 2;
+ default : *out++ = '\\'; *out++ = 'x' ; *out++ = NUM2HEXDIGIT(((uint8_t)in[i]) >> 4); *out++ = NUM2HEXDIGIT(in[i] & 0xf); out_len -= 4;
}
*out = 0;
@@ -290,7 +289,7 @@ bool str_get_int_range##func_num (rom str, uint32_t str_len, int64_t min_val, in
if (!str_get_int (str, str_len ? str_len : strlen (str), &value64)) return false; \
if (value) *value = (type)value64; \
\
- return value64 >= min_val && value64 <= max_val; \
+ return IN_RANGE (value64, min_val, max_val); \
}
str_get_int_range_type(8,uint8_t) // unsigned
str_get_int_range_type(16,uint16_t) // unsigned
@@ -374,7 +373,7 @@ str_get_int_range_allow_hex_bits(32) // unsigned
str_get_int_range_allow_hex_bits(64) // unsigned
// caller should allocate hex_str[data_len*2+1] (or *3 if with_dot). returns nul-terminated string.
-rom str_to_hex (bytes data, uint32_t data_len, char *hex_str, bool with_dot)
+rom str_to_hex_ (bytes data, uint32_t data_len, char *hex_str, bool with_dot)
{
char *s = hex_str;
@@ -390,14 +389,6 @@ rom str_to_hex (bytes data, uint32_t data_len, char *hex_str, bool with_dot)
return hex_str;
}
-StrText str_hex10 (bytes data, uint32_t data_len)
-{
- StrText s = {};
- str_to_hex (data, MIN_(10, data_len), s.s, false);
- return s;
-}
-
-
StrText str_int_commas (int64_t n)
{
StrText s = {};
@@ -1289,3 +1280,16 @@ bool str_is_utf8 (STRp(s))
return true;
}
+
+#ifdef __MINGW64__
+void *memmem (const void *haystack, size_t haystack_len, // same as in glibc
+ const void *needle, size_t needle_len)
+{
+ const void *after_haystack = (rom)haystack + haystack_len - needle_len + 1;
+ for (; haystack < after_haystack; haystack=(rom)haystack + 1)
+ if (!memcmp (haystack, needle, needle_len))
+ return (void *)haystack;
+
+ return NULL;
+}
+#endif
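
A hedged usage sketch of the revised str_to_printable: with hex escapes, the worst case is now 4 output characters per input byte, hence the in_len*4 + 1 sizing rule in the comment above:

    char in[3]  = { 'A', '\t', (char)0xfe };
    char out[3 * 4 + 1];                                     // in_len*4 + 1
    uint32_t len = str_to_printable (in, 3, out, sizeof out);
    // out now holds  A \t \xfe  (backslashes literal), and len == 7
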
diff --git a/src/strings.h b/src/strings.h
index be83d1e0..5ebeed16 100644
--- a/src/strings.h
+++ b/src/strings.h
@@ -11,7 +11,7 @@
#include "genozip.h"
#define IS_DIGIT(c) ((c)>='0' && (c)<='9')
-#define NUM2HEXDIGIT(n) ((n)<=9 ? '0' + (n) : 'A'+((n)-10)) // converts a number [0,15] to hex digit character
+#define NUM2HEXDIGIT(n) ((n)<=9 ? '0' + (n) : 'a'+((n)-10)) // converts a number [0,15] to a lowercase hex digit character
#define HEXDIGIT2NUM(c) (IS_DIGIT(c) ? ((c)-'0') : ((c)-'A'+10)) // converts an uppercase hex digit to a number [0,15]
#define IS_HEXDIGIT(c) (IS_DIGIT(c) || ((c)>='A' && (c)<='F') || ((c)>='a' && (c)<='f'))
#define IS_HEXDIGITlo(c) (IS_DIGIT(c) || ((c)>='a' && (c)<='f'))
@@ -214,8 +214,13 @@ extern StrTextLong str_int_s_(rom label, int64_t n);
extern StrTextLong str_str_s_(rom label, rom str);
#define cond_str(cond, label, str) ((cond) ? str_str_s_((label), (str)).s : "") /* note: str does not evaluate if cond is false! */\
-extern rom str_to_hex (bytes data, uint32_t data_len, char *hex_str, bool with_dot);
-extern StrText str_hex10 (bytes data, uint32_t data_len); // up to 10 bytes in hex (21 chars inc. \0)
+extern rom str_to_hex_(bytes data, uint32_t data_len, char *hex_str, bool with_dot);
+static inline StrText str_to_hex (bytes data, uint32_t data_len) // note: for data_len up to 39 (truncated if longer)
+{
+ StrText s;
+ str_to_hex_(data, MIN_(data_len, sizeof(s)/2 - 1), s.s, false);
+ return s;
+}
// string length of an integer. #include if using this.
static inline unsigned str_int_len (int64_t n_)
@@ -367,6 +372,10 @@ static inline char *strpcpy(char *restrict dst, const void *restrict src)
return mempcpy (dst, src, len);
}
+#ifdef __MINGW64__
+extern void *memmem (const void *haystack, size_t haystack_len, const void *needle, size_t needle_len);
+#endif
+
extern rom str_win_error_(uint32_t error);
extern rom str_win_error (void);
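
A hedged usage sketch of the new str_to_hex wrapper: it truncates to what fits in a StrText (the 39-byte limit noted above depends on sizeof(StrText)), so it is safe to call in log messages on data of any length:

    uint8_t key[4] = { 0x0d, 0x1f, 0xa2, 0xff };
    iprintf ("key=%s\n", str_to_hex (key, sizeof key).s);    // prints key=0d1fa2ff, assuming str_to_hex_ formats via the lowercase NUM2HEXDIGIT above
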
diff --git a/src/tar.c b/src/tar.c
index 7bff0230..7375cc1d 100644
--- a/src/tar.c
+++ b/src/tar.c
@@ -161,6 +161,19 @@ static void tar_copy_metadata_from_file (rom fn)
#endif
}
+static void tar_fwrite (const void *data, uint32_t size, rom object)
+{
+ if (!size) return; // nothing to do
+
+ ASSERTNOTNULL (tar_file);
+ ASSERTNOTNULL (data);
+
+ uint32_t bytes = fwrite (data, 1, size, tar_file);
+
+ ASSERT (bytes == size, "Error writing %s to %s on filesystem=%s - requested %u bytes but wrote only %u: (%u)%s",
+ object, tar_name, arch_get_filesystem_type (txt_file).s, size, bytes, errno, strerror (errno));
+}
+
// filenames that have a last component longer than 99 characters don't fit in POSIX tar. Instead, we use a GNU-specific extension
// of storing the filename as a pseudo-file in the tarball. See: https://itecnote.com/tecnote/r-what-exactly-is-the-gnu-tar-longlink-trick/
static void tar_write_gnu_long_filename (STRp(fn_in_tar)/* length includes \0 */)
@@ -185,15 +198,12 @@ static void tar_write_gnu_long_filename (STRp(fn_in_tar)/* length includes \0 */
for (unsigned i=0; i < 512; i++) checksum += ((bytes)&ll_hdr)[i];
snprintf (ll_hdr.checksum, sizeof (ll_hdr.checksum), "%06o", checksum);
- ASSERT (fwrite (&ll_hdr, 512, 1, tar_file) == 1, "failed to write LongLink header of %s to %s", fn_in_tar, tar_name);
- ASSERT (fwrite (STRa(fn_in_tar), 1, tar_file) == 1, "failed to write long filename of %s to %s", fn_in_tar, tar_name);
+ tar_fwrite (&ll_hdr, 512, "LongLink header");
+ tar_fwrite (STRa(fn_in_tar), "long filename");
// pad to full block
- if (fn_in_tar_len % 512) {
- char padding[512] = "";
- ASSERT (fwrite (padding, ROUNDUP512(fn_in_tar_len) - fn_in_tar_len, 1, tar_file) == 1,
- "failed to write long filename padding of %s to %s", fn_in_tar, tar_name);
- }
+ if (fn_in_tar_len % 512)
+ tar_fwrite ((char[512]){}, ROUNDUP512(fn_in_tar_len) - fn_in_tar_len, "long filename padding");
t_offset += 512 + ROUNDUP512(fn_in_tar_len);
}
@@ -243,7 +253,7 @@ FILE *tar_open_file (rom fn_on_disk, rom fn_in_tar)
iprintf ("tar_open_file: t_offset=%"PRIu64" ftell=%"PRIu64" data_start=%"PRIu64" %s\n",
t_offset, ftello64 (tar_file), t_offset + 512, fn);
- ASSERT (fwrite (&hdr, 512, 1, tar_file) == 1, "failed to write header of %s to %s", fn_in_tar, tar_name);
+ tar_fwrite (&hdr, 512, fn_in_tar);
t_offset += 512; // past tar header
return tar_file;
@@ -275,7 +285,7 @@ static void tar_add_hard_link (rom fn_on_disk, rom fn_in_tar_src, rom fn_in_tar_
if (flag.debug_tar)
iprintf ("tar_add_hard_link: t_offset=%"PRIu64" ftell=%"PRIu64" %s\n", t_offset, ftello64 (tar_file), hdr.name);
- ASSERT (fwrite (&hdr, 512, 1, tar_file) == 1, "failed to write header of %s to %s", fn_in_tar_dst, tar_name);
+ tar_fwrite (&hdr, 512, fn_in_tar_dst);
t_offset += 512; // past tar header
}
@@ -302,9 +312,7 @@ void tar_close_file (void **file)
int64_t padding_len = ROUNDUP512(tar_size) - tar_size;
if (padding_len) {
- char padding[512] = "";
- ASSERT (fwrite (padding, padding_len, 1, tar_file) == 1, "failed to write file padding to %s", tar_name);
-
+ tar_fwrite ((char[512]){}, padding_len, "file padding");
tar_size += padding_len;
}
@@ -326,7 +334,7 @@ void tar_close_file (void **file)
// update header
ASSERT (!fseeko64 (tar_file, t_offset-512, SEEK_SET), "fseek(%"PRId64") of %s failed (1): %s", t_offset-512, tar_name, strerror (errno));
- ASSERT (fwrite (&hdr, 512, 1, tar_file) == 1, "failed to write file header to %s", tar_name);
+ tar_fwrite (&hdr, 512, "file header");
ASSERT (!fseeko64 (tar_file, 0, SEEK_END), "fseek(END) of %s failed (2): %s", tar_name, strerror (errno));
// flush to finalize z_file within tar file, before deleting txt file, and also before spawning a process to test it
@@ -358,7 +366,7 @@ void tar_copy_file (rom fn_on_disk, rom fn_in_tar)
int64_t size;
int64_t bytes_copied = 0;
while ((size = fread (data, 1, BLOCK_SIZE, src_file))) {
- ASSERT (fwrite (data, 1, size, tar_file) == size, "failed to copy %s to %s - failed to write %"PRId64" bytes", fn_on_disk, tar_name, size);
+ tar_fwrite (data, size, fn_on_disk);
bytes_copied += size;
}
@@ -379,8 +387,7 @@ void tar_finalize (void)
iprintf ("tar_finalize EOF block: t_offset=%"PRIu64" ftell=%"PRIu64"\n", t_offset, ftello64 (tar_file));
// tar file format: two empty tar blocks as EOF
- char s[1024] = "";
- ASSERT (fwrite (s, 1024, 1, tar_file) == 1, "failed to EOF tar blocks to %s", tar_name);
+ tar_fwrite ((char[1024]){}, 1024, "EOF tar blocks");
FCLOSE (tar_file, "tar_file");
t_offset = 0;
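
The padding and EOF writes above pass a C99 compound literal such as (char[512]){} to tar_fwrite - an anonymous, zero-initialized buffer that exists only for the duration of the call. A minimal standalone sketch of the same pattern (hypothetical helper, not part of tar.c):

    #include <stdio.h>

    // write data followed by zero padding up to the next 512-byte boundary (error handling omitted)
    static void write_padded_to_512 (FILE *fp, const void *data, size_t len)
    {
        fwrite (data, 1, len, fp);
        if (len % 512)
            fwrite ((char[512]){}, 1, 512 - len % 512, fp);  // zeroed compound literal as scratch padding
    }
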
diff --git a/src/test.sh b/src/test.sh
index 154ed61e..4454e2b8 100644
--- a/src/test.sh
+++ b/src/test.sh
@@ -495,9 +495,22 @@ batch_special_algs()
test_header "two consecutive Is in CIGAR"
$genozip ${TESTDIR}/regression.two-consecutive-Is.sam -ft || exit 1
+ # bug was: no handling of GZIL blocks after move to igzip (defect 2024-03-01)
+ test_header "Illumina GZIL blocks"
+ $genozip -tf --no-bgzf ${TESTDIR}/regression.defect-2024-03-01.multi-gzip-break-between-reads.fq.gz || exit 1
+ $genozip -tf --no-bgzf ${TESTDIR}/regression.defect-2024-03-01.multi-gzip-break-within-read.fq.gz || exit 1
+
# bug was: CHECKSUM at the end of 1MB-gz block in R2 was not handled correctly, failing R2 zip.
test_header "CHECKSUM in Illumina-style gzip: R2 alignment to R1"
- $genozip -tf --pair -e $hs37d5 --truncate -B19 ${TESTDIR}/regression.defect-2024-06-21.R1.fq.gz ${TESTDIR}/regression.defect-2024-06-21.R2.fq.gz
+ $genozip -tf --pair -e $hs37d5 --truncate -B19 --no-bgzf ${TESTDIR}/regression.defect-2024-06-21.R1.gzil-broke-w-B19.fq.gz \
+ ${TESTDIR}/regression.defect-2024-06-21.R2.gzil-broke-w-B19.fq.gz || exit 1
+
+ # two special code paths for handling truncated GZIL files, depending on whether the garbage last word of the file is >1MB (detected during read) or <=1MB (detected during uncompress)
+ $genozip -tf --truncate ${TESTDIR}/special.gzil.truncated-last-word.gt.1MB.fastq.gz || exit 1
+ $genozip -tf --truncate ${TESTDIR}/special.gzil.truncated-last-word.eq.1MB.fastq.gz || exit 1
+
+ # bug was: MC copying CIGAR from mate, when both are "*" (=empty in BAM). Fixed in 15.0.62 in PIZ.
+ $genozip -tf ${TESTDIR}/regression.2024-06-26.MC-copy-from-mate-CIGAR.null-CIGAR.bam || exit 1
}
batch_qual_codecs()
@@ -1054,6 +1067,22 @@ batch_real_world_optimize()
cd -
}
+batch_gzil_fastq()
+{
+ batch_print_header
+
+ cd $TESTDIR
+ local files=( `ls -1 gzil.*.fq.gz` )
+
+ $genozip --show-filename --test --force ${files[*]} || exit 1
+
+ # expecting R2 to be decompressed by igzip, as it is faster than gzil for gz decompression while reading
+ $genozip gzil.R1.fq.gz gzil.R1.fq.gz -o $output -e $hs37d5 -tf -2 || exit 1
+
+ cd -
+}
+
+
# test genounzip with many files of different types in a single process
batch_real_world_genounzip_single_process() # $1 extra genozip argument
{
@@ -1542,12 +1571,12 @@ batch_external_ora()
{
batch_print_header
- # Commented out (uncomment if needed) because its very slow - 80 seconds.
+ return; # skipped because it's very slow - 80 seconds.
# Reason for slowness: test.fastq.ora is a abrupt subset of a larger file - orad doesn't like that.
- #
- # if `command -v orad >& /dev/null`; then
- # ORA_REF_PATH=$REFDIR $genozip --truncate -ft -e $hs37d5 $TESTDIR/test.fastq.ora || exit $?
- # fi
+
+ if `command -v orad >& /dev/null`; then
+ ORA_REF_PATH=$REFDIR $genozip --truncate -ft -e $hs37d5 $TESTDIR/test.fastq.ora || exit $?
+ fi
cleanup
}
@@ -1784,6 +1813,12 @@ update_latest()
pushd ../genozip-latest
git reset --hard
git pull
+
+ if [ -n "$is_mac" ]; then
+ chmod +x src/*.sh # reverted by git pull
+ fi
+
+ make -j clean
make -j
popd
}
@@ -2157,16 +2192,21 @@ SCRIPTSDIR=$BASEDIR/private/scripts
LICENSESDIR=$BASEDIR/private/licenses
OUTDIR=$TESTDIR/tmp
REFDIR=$BASEDIR/public
-if [ -n "$is_windows" ]; then
- LICFILE=$APPDATA/genozip/.genozip_license.v15
+if [ -n "$is_windows" ]; then
if [[ ! -v APPDATA ]]; then
export APPDATA="$BASEDIR/../AppData/Roaming"
fi
+
+ LICFILE=$APPDATA/genozip/.genozip_license.v15
else
LICFILE=$HOME/.genozip_license.v15
fi
+if [ -n "$is_mac" ]; then
+ chmod +x $BASEDIR/private/scripts/* $BASEDIR/private/utils/mac/* $BASEDIR/src/*.sh # reverted by git pull
+fi
+
output=${OUTDIR}/output.genozip
output2=${OUTDIR}/output2.genozip
recon=${OUTDIR}/recon.txt
@@ -2359,21 +2399,22 @@ case $GENOZIP_TEST in
56) batch_user_message_permissions ;;
57) batch_password_permissions ;;
58) batch_reference_backcomp ;;
-59) batch_real_world_backcomp 11.0.11 ;; # note: versions must match VERSIONS in test/Makefile
-60) batch_real_world_backcomp 12.0.42 ;;
-61) batch_real_world_backcomp 13.0.21 ;;
-62) batch_real_world_backcomp 14.0.33 ;;
-63) batch_real_world_backcomp latest ;;
-64) batch_basic basic.vcf latest ;;
-65) batch_basic basic.bam latest ;;
-66) batch_basic basic.sam latest ;;
-67) batch_basic basic.fq latest ;;
-68) batch_basic basic.fa latest ;;
-69) batch_basic basic.bed latest ;;
-70) batch_basic basic.gvf latest ;;
-71) batch_basic basic.gtf latest ;;
-72) batch_basic basic.me23 latest ;;
-73) batch_basic basic.generic latest ;;
+59) batch_gzil_fastq ;;
+60) batch_real_world_backcomp 11.0.11 ;; # note: versions must match VERSIONS in test/Makefile
+61) batch_real_world_backcomp 12.0.42 ;;
+62) batch_real_world_backcomp 13.0.21 ;;
+63) batch_real_world_backcomp 14.0.33 ;;
+64) batch_real_world_backcomp latest ;;
+65) batch_basic basic.vcf latest ;;
+66) batch_basic basic.bam latest ;;
+67) batch_basic basic.sam latest ;;
+68) batch_basic basic.fq latest ;;
+69) batch_basic basic.fa latest ;;
+70) batch_basic basic.bed latest ;;
+71) batch_basic basic.gvf latest ;;
+72) batch_basic basic.gtf latest ;;
+73) batch_basic basic.me23 latest ;;
+74) batch_basic basic.generic latest ;;
* ) break; # break out of loop
diff --git a/src/txtfile.c b/src/txtfile.c
index 2ad948aa..4c3a9958 100644
--- a/src/txtfile.c
+++ b/src/txtfile.c
@@ -26,6 +26,8 @@
#define MAX_TXT_HEADER_LEN ((uint64_t)0xffffffff) // maximum length of txt header - one issue with enlarging it is that we digest it in one go, and the digest module is 32 bit
+#define TXTFILE_READ_VB_PADDING 16 // we need this quantity of unused bytes at the end of vb.txt_data
+
// PIZ: dump bad vb to disk
StrTextLong txtfile_dump_vb (VBlockP vb, rom base_name)
{
@@ -41,31 +43,65 @@ StrTextLong txtfile_dump_vb (VBlockP vb, rom base_name)
return dump_filename;
}
+// returns the requested number of bytes, except at EOF where it could be less.
+uint32_t txtfile_fread (FileP file,
+ FILE *fp, // note: non-NULL if different from file->file (when re-reading)
+ void *addr, uint32_t size, int64_t *disk_so_far)
+{
+ ASSERTNOTNULL (addr);
+ if (!fp) fp = (FILE *)file->file;
+
+ uint32_t bytes = fread (addr, 1, size, fp);
+ if (disk_so_far) *disk_so_far += bytes;
+
+ ASSERT (bytes == size || !ferror (fp), "Error while reading %s codec=%s on filesystem=%s - requested %u bytes but read only %u: (%u)%s",
+ file->basename, codec_name (file->codec), arch_get_filesystem_type (file).s, size, bytes, errno, strerror (errno));
+
+ // note: since we have already ruled out errors, we know that bytes < size iff we reached EOF
+ return bytes;
+}
+
+void txtfile_fwrite (const void *data, uint32_t size)
+{
+ if (!size) return; // nothing to do
+
+ ASSERTNOTNULL (txt_file);
+ ASSERTNOTNULL (txt_file->file);
+ ASSERTNOTNULL (data);
+
+ uint32_t bytes = fwrite (data, 1, size, (FILE *)txt_file->file);
+
+ // if we're streaming our txt output to another process and that process has ended prematurely or
+ // otherwise closed the pipe, then exit quietly (note: sometimes the shell will kill us before we reach here)
+ if (bytes < size && errno == EPIPE) exit (EXIT_DOWNSTREAM_LOST);
+
+ // error if failed to write to file
+ ASSERT (bytes == size, "Error writing to %s on filesystem=%s - requested %u bytes but wrote only %u: (%u)%s",
+ txt_file->basename, arch_get_filesystem_type (txt_file).s, size, bytes, errno, strerror (errno));
+}
+
static inline uint32_t txtfile_read_block_plain (VBlockP vb, uint32_t max_bytes)
{
char *data = BAFTtxt;
int32_t bytes_read;
- // case: we have data passed to us from file_open_txt_read - handle it first
- if (!Ltxt && evb->scratch.len) {
- memcpy (data, evb->scratch.data, (bytes_read = evb->scratch.len));
- buf_free (evb->scratch);
+ // case: we have data passed to us from txtfile_discover_gz_codec - handle it first (possibly txt_data already contains data passed down from previous VBs)
+ if (txt_file->gz_data.len) {
+ bytes_read = MIN_(txt_file->gz_data.len32, max_bytes);
+ memcpy (BAFTtxt, B1STc (txt_file->gz_data), bytes_read);
+ buf_remove (txt_file->gz_data, char, 0, bytes_read);
}
// case: normal read
else {
- bytes_read = fread (data, 1, max_bytes, (FILE *)txt_file->file); // -1 if error in libc
- ASSERT (!ferror((FILE *)txt_file->file) && bytes_read >= 0, "Error reading PLAIN file %s on filesystem=%s: %s",
- txt_name, arch_get_txt_filesystem().s, strerror (errno));
-
- txt_file->disk_so_far += (int64_t)bytes_read;
+ bytes_read = txtfile_fread (txt_file, NULL, data, max_bytes, &txt_file->disk_so_far);
if (!bytes_read) {
// case external decompressor: inspect its stderr to make sure this is just an EOF and not an error
if (is_read_via_ext_decompressor (txt_file))
file_assert_ext_decompressor();
- txt_file->is_eof = true;
+ txt_file->no_more_blocks = true;
}
}
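
A hedged usage sketch of the txtfile_fread contract introduced above: read errors abort inside the helper, so at the call site a short return value can only mean end-of-file:

    char buf[4096];
    uint32_t bytes = txtfile_fread (txt_file, NULL, buf, sizeof buf, &txt_file->disk_so_far);
    if (bytes < sizeof buf) {
        // EOF reached: errors already aborted inside txtfile_fread, so a short read can only mean EOF
    }
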
@@ -109,9 +145,11 @@ rom isal_error (int ret)
}
}
-#define IGZIP_CHUNK (128 KB)
+// chunk size chosen to be equal to the default disk read-ahead buffer in Linux (eg /sys/block/sda/queue/read_ahead_kb)
+// so that decompression is parallelized with disk read-ahead buffer filling (a bigger buffer would cause the disk to be idle while we are still decompressing)
+#define IGZIP_CHUNK (128 KB)
-void txtfile_init_read_igzip (FileP file)
+static void txtfile_initialize_igzip (FileP file)
{
ASSERTNOTINUSE (file->igzip_state);
@@ -120,30 +158,155 @@ void txtfile_init_read_igzip (FileP file)
isal_inflate_init (state);
state->crc_flag = ISAL_GZIP;
+}
+
+void txtfile_discover_gz_codec (FileP file)
+{
+ buf_alloc (evb, &file->gz_data, 0, MAX_(IGZIP_CHUNK, GZIL_MAX_BLOCK_SIZE), char, 0, "gz_data");
+
+ // read the first potential BGZF block to test if this is GZ or BGZF
+ // note: we read even with --no-bgzf, to capture the data for z_file->gz_header
+ GzStatus status = bgzf_read_block (file, true);
+
+ // case: this is a BGZF block
+ // note: we keep the still-compressed data in file->gz_data for later consumption
+ if (!flag.no_bgzf && status == GZ_SUCCESS && file->gz_data.uncomp_len > 0) {
+ if (file->source_codec != CODEC_CRAM && file->source_codec != CODEC_BAM && file->source_codec != CODEC_BCF)
+ file->source_codec = CODEC_BGZF;
+
+ file->codec = CODEC_BGZF;
+ bgzf_initialize_discovery (file);
+ }
+
+ // for regular files, we already skipped 0-size files. This can happen with STDIN
+ else if (status == GZ_SUCCESS && file->gz_data.uncomp_len == 0) {
+ ASSINP (!flags_pipe_in_process_died(), // only works for Linux
+ "Pipe-in process %s (pid=%u) died without sending any data",
+ flags_pipe_in_process_name(), flags_pipe_in_pid());
+
+ ABORTINP ("No data exists in input file %s", file->name ? file->name : FILENAME_STDIN);
+ }
+
+ // case: this is non-BGZF GZIP format
+ else if (flag.no_bgzf || status == GZ_IS_GZIP_NOT_BGZF) {
+ // case: this is FASTQ (judged by the filename) that is GZIL
+ bool is_eof = false;
+ if (!flag.no_bgzf && file->data_type == DT_FASTQ &&
+ gzil_read_block (file, true, &is_eof) != GZ_IS_NOT_GZIL)
+
+ file->codec = file->source_codec = CODEC_GZIL;
+
+ // case: neither BGZF nor GZIL - treat as normal GZ
+ else
+ file->codec = file->source_codec = CODEC_GZ;
+ }
+
+ // case: this is not GZIP format at all. treat as a plain file, and keep the data read in file->gz_data
+ // for later consumption in txtfile_read_block_plain
+ else if (status == GZ_IS_NOT_GZIP) {
+
+ #define BZ2_MAGIC "BZh"
+ #define XZ_MAGIC (char[]){ 0xFD, '7', 'z', 'X', 'Z', 0 }
+ #define ZIP_MAGIC (char[]){ 0x50, 0x4b, 0x03, 0x04 }
+ #define ORA_MAGIC (char[]){ 0x49, 0x7c } // https://support-docs.illumina.com/SW/ORA_Format_Specification/Content/SW/ORA/ORAFormatSpecification.htm
+
+ // we have already opened the file, so it is not easy to re-open it with BZ2_bzopen, as that would require injecting the already-read data into the BZ2 buffers
+ if (str_isprefix_(STRb(file->gz_data), BZ2_MAGIC, 3))
+ ABORTINP0 ("The data seems to be in bz2 format. Please use --input to specify the type (eg: \"genozip --input sam.bz2\")");
+
+ else if (str_isprefix_(STRb(file->gz_data), XZ_MAGIC, 6)) {
+ if (file->redirected) ABORTINP0 ("Compressing piped-in data in xz format is not currently supported");
+ if (file->is_remote) ABORTINP0 ("The data seems to be in xz format. Please use --input to specify the type (eg: \"genozip --input sam.xz\")");
+ ABORTINP0 ("The data seems to be in xz format. Please use --input to specify the type (eg: \"genozip --input sam.xz\")");
+ }
- buf_alloc (evb, &file->igzip_data, 0, IGZIP_CHUNK, char, 0, "txt_file->igzip_data");
+ else if (str_isprefix_(STRb(file->gz_data), ZIP_MAGIC, 4)) {
+ if (file->redirected) ABORTINP0 ("Compressing piped-in data in zip format is not currently supported");
+ if (file->is_remote) ABORTINP0 ("The data seems to be in zip format. Please use --input to specify the type (eg: \"genozip --input generic.zip\")");
+ ABORTINP0 ("The data seems to be in zip format. Please use --input to specify the type (eg: \"genozip --input generic.zip\")");
+ }
+
+ else if (str_isprefix_(STRb(file->gz_data), ORA_MAGIC, 2)) {
+ if (file->redirected) ABORTINP0 ("Compressing piped-in data in ora format is not currently supported");
+ if (file->is_remote) ABORTINP0 ("The data seems to be in ora format. Please use --input to specify the type (eg: \"genozip --input fastq.ora\")");
+ ABORTINP0 ("The data seems to be in ora format. Please use --input to specify the type (eg: \"genozip --input fastq.ora\")");
+ }
+
+ file->codec = CODEC_NONE;
+ }
+
+ else
+ ABORT ("Invalid status=%u", status);
+
+ // if this is R2, we are going to uncompress in the main thread. IGZIP is a faster method for doing so
+ // than BGZF or GZIP, because it is better at parallelizing disk read-aheads and decompression. The only reason
+ // to keep BGZF is if we want to store BGZF isizes for exact reconstruction, which is only possible if we discovered
+ // the library. BGZF library discovery has not yet occurred for R2, so we take the R1 results as a proxy
+ // (if the proxying is wrong - either we will compress unnecessarily slowly with BGZF instead of IGZIP, or we will
+ // incorrectly compress with IGZIP and drop the BGZF isizes, preventing exact reconstruction - that's ok)
+ bool is_pair2 = flag.pair && ((flag.zip_comp_i == FQ_COMP_R2 && Z_DT(FASTQ)) || // note: flag.pair is not incremented yet; z_file only exists if this 2nd+ component so test that first
+ (flag.zip_comp_i == SAM_COMP_FQ01 && (Z_DT(BAM) || Z_DT(SAM))));
+ if ((is_pair2 && (file->codec == CODEC_GZIL || (z_file->comp_codec[flag.zip_comp_i-1] == CODEC_BGZF && z_file->comp_bgzf[flag.zip_comp_i-1].level == BGZF_COMP_LEVEL_UNKNOWN))) ||
+ // likewise for --make-reference: we uncompress by main thread, and we don't care about retaining BGZF isizes
+ (flag.make_reference && (file->codec == CODEC_GZIL || file->codec == CODEC_BGZF))) {
+
+ file->gunzip_method = CODEC_GZ;
+ }
+
+ else
+ file->gunzip_method = file->codec;
+
+ if (file->gunzip_method == CODEC_GZ)
+ txtfile_initialize_igzip (file);
}
-// runs in main thread, populates txt_data for vb
-static uint32_t txtfile_read_block_gz (VBlockP vb, uint32_t max_bytes)
+// ZIP main thread: called after txt and z are open, and txt codecs have been discovered.
+void txtfile_zip_finalize_codecs (void)
+{
+ ASSERTNOTNULL (z_file);
+ ASSERTNOTNULL (txt_file);
+
+ if (flag.zip_comp_i < MAX_NUM_COMPS) { // for stats
+ z_file->comp_codec[flag.zip_comp_i] = txt_file->codec;
+ z_file->comp_source_codec[flag.zip_comp_i] = txt_file->source_codec;
+ z_file->comp_gunzip_method[flag.zip_comp_i] = txt_file->gunzip_method;
+
+ // copy GZ header (but not if BGZF or GZIL): the data should still be in gz_data, left there by txtfile_discover_gz_codec
+ if (TXT_IS_GZ && txt_file->gz_data.len >= 12)
+ memcpy (z_file->gz_header[flag.zip_comp_i], B1ST8 (txt_file->gz_data), 12);
+ }
+
+ // note: for BGZF, we report in bgzf_finalize_discovery as we don't yet know the library/level here
+ if ((flag.show_gz || flag.show_bgzf) && txt_file->gunzip_method != CODEC_BGZF) {
+ iprintf ("%s: txt_codec=%s", txt_file->basename, txtfile_codec_name (z_file, flag.zip_comp_i).s);
+ if (flag.show_gz) { iprint0 ("\n"); exit_ok; };
+
+ iprintf (" gunzip_method=%s\n", codec_name (txt_file->gunzip_method));
+ }
+}
+
+// runs in main thread, reads and uncompresses GZ, and populates txt_data for vb
+static uint32_t txtfile_read_block_igzip (VBlockP vb, uint32_t max_bytes)
{
START_TIMER;
- ASSERTISALLOCED (txt_file->igzip_data);
+ ASSERTISALLOCED (txt_file->gz_data);
struct inflate_state *state = B1ST (struct inflate_state, txt_file->igzip_state);
- // top up igzip_data
- int32_t bytes_read = fread (BAFTc(txt_file->igzip_data), 1, IGZIP_CHUNK - txt_file->igzip_data.len32, (FILE *)txt_file->file);
+ // top up gz_data
+ int32_t bytes_read = (txt_file->gz_data.len32 < IGZIP_CHUNK)
+ ? txtfile_fread (txt_file, NULL, BAFTc(txt_file->gz_data), IGZIP_CHUNK - txt_file->gz_data.len32, &txt_file->disk_so_far)
+ : 0;
ASSERT (!ferror((FILE *)txt_file->file) && bytes_read >= 0, "Error reading GZ file %s on filesystem=%s: %s",
txt_name, arch_get_txt_filesystem().s, strerror (errno));
- txt_file->igzip_data.len32 += bytes_read; // yet-uncompressed data read from disk
+ txt_file->gz_data.len32 += bytes_read; // yet-uncompressed data read from disk
- txt_file->disk_so_far += bytes_read;
+ { START_TIMER
- state->next_in = B1ST8 (txt_file->igzip_data);
- state->avail_in = txt_file->igzip_data.len32;
+ state->next_in = B1ST8 (txt_file->gz_data);
+ state->avail_in = txt_file->gz_data.len32;
state->next_out = BAFT8 (vb->txt_data);
state->avail_out = max_bytes;
@@ -153,45 +316,40 @@ static uint32_t txtfile_read_block_gz (VBlockP vb, uint32_t max_bytes)
if (state->block_state == ISAL_CHECKSUM_CHECK) {
int ret = isal_inflate (state); // new gzip header in a file that has concatented gzip compressions
ASSERT (ret == ISAL_DECOMP_OK, "isal_inflate failed checksum: %s avail_in=%u avail_out=%u",
- isal_error (ret), txt_file->igzip_data.len32, max_bytes);
+ isal_error (ret), txt_file->gz_data.len32, max_bytes);
}
// case: happens in blocked-GZ: we decompressed an entire GZ-block and verified the
- // checksum, now we need to move on to the next GZ block
+ // checksum (either in isal_inflate below in the previous call to this function,
+ // or in isal_inflate above in this call). now we need to move on to the next GZ block.
if (state->block_state == ISAL_BLOCK_FINISH)
isal_inflate_reset (state);
int ret = isal_inflate (state); // new gzip header in a file that has concatented gzip compressions
ASSERT (ret == ISAL_DECOMP_OK || ret == ISAL_END_INPUT, "isal_inflate error: %s avail_in=%u avail_out=%u",
- isal_error (ret), txt_file->igzip_data.len32, max_bytes);
+ isal_error (ret), txt_file->gz_data.len32, max_bytes);
- uint32_t gz_data_consumed = BNUM (txt_file->igzip_data, state->next_in);
- buf_remove (txt_file->igzip_data, char, 0, gz_data_consumed);
+ COPY_TIMER(igzip_uncompress_during_read); }
- txt_file->is_eof = (!state->avail_in && feof ((FILE *)txt_file->file));
+ uint32_t gz_data_consumed = BNUM (txt_file->gz_data, state->next_in);
- Ltxt = BNUMtxt (state->next_out);
-
- // stats for the case of multiple concatenated gzip sections
- if (!txt_file->is_eof && state->block_state == ISAL_BLOCK_FINISH) {
- uint64_t after_gzip_section = txt_file->txt_data_so_far_single/*previous VBs*/ + Ltxt/*this VB*/;
- uint64_t gzip_section_size = after_gzip_section - txt_file->gzip_start_Ltxt;
+ // for stats: read and save the isize from the gzip footer (of the first 2 gzip blocks)
+ if (state->block_state == ISAL_BLOCK_FINISH && z_file && gz_data_consumed >= 4)
+ for (int i=0; i <= 1; i++)
+ if (!z_file->gz_isize[flag.zip_comp_i][i]) {
+ z_file->gz_isize[flag.zip_comp_i][i] = LTEN32 (GET_UINT32 (B8(txt_file->gz_data, gz_data_consumed - 4)));
+ break;
+ }
- if (!txt_file->gzip_start_Ltxt) {
- z_file->gzip_section_size[flag.zip_comp_i] = gzip_section_size;
- z_file->gzip_section_size_single_block[flag.zip_comp_i] = true;
- }
+ buf_remove (txt_file->gz_data, char, 0, gz_data_consumed);
- else if (gzip_section_size != z_file->gzip_section_size[flag.zip_comp_i])
- z_file->gzip_section_size[flag.zip_comp_i] = 0; // not equal size
+ inc_disk_gz_uncomp_or_trunc (txt_file, gz_data_consumed);
- else
- z_file->gzip_section_size_single_block[flag.zip_comp_i] = false; // more than one block has the same size
+ txt_file->no_more_blocks = (!state->avail_in && feof ((FILE *)txt_file->file));
- txt_file->gzip_start_Ltxt = after_gzip_section;
- }
+ Ltxt = BNUMtxt (state->next_out);
- COPY_TIMER (txtfile_read_block_gz);
+ COPY_TIMER (txtfile_read_block_igzip);
return max_bytes - state->avail_out; // uncompressed data length
}
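
Context for the isize capture above: per RFC 1952, every gzip member ends with an 8-byte footer of CRC32 followed by ISIZE (the uncompressed size mod 2^32), both little-endian, which is why the last 4 consumed bytes are read with LTEN32. A minimal standalone sketch (hypothetical helper, not genozip code):

    #include <stdint.h>
    #include <stddef.h>

    // extract ISIZE from the last 4 bytes of a complete gzip member held in memory
    static uint32_t gzip_member_isize (const uint8_t *member, size_t member_len)
    {
        const uint8_t *p = member + member_len - 4;          // footer: CRC32 then ISIZE
        return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |      // little-endian
               ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
    }
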
@@ -203,10 +361,10 @@ static inline uint32_t txtfile_read_block_bz2 (VBlockP vb, uint32_t max_bytes)
uint32_t bytes_read = BZ2_bzread ((BZFILE *)txt_file->file, BAFTtxt, max_bytes);
Ltxt += bytes_read;
- if (bytes_read)
+ if (bytes_read)
txt_file->disk_so_far = BZ2_consumed ((BZFILE *)txt_file->file);
else
- txt_file->is_eof = true;
+ txt_file->no_more_blocks = true;
COPY_TIMER (txtfile_read_block_bz2);
@@ -215,133 +373,120 @@ static inline uint32_t txtfile_read_block_bz2 (VBlockP vb, uint32_t max_bytes)
// BGZF: we read *compressed* data into vb->scratch - that will be decompressed now or later, depending on "uncompress".
// We read data with a *decompressed* size up to max_uncomp. vb->scratch always contains only full BGZF blocks
-static inline uint32_t txtfile_read_block_bgzf (VBlockP vb, int32_t max_uncomp /* must be signed */, bool uncompress)
+static inline uint32_t txtfile_read_block_bgz (VBlockP vb, int32_t max_uncomp /* must be signed */, bool uncompress)
{
START_TIMER;
- #define uncomp_len prm32[0] // we use vb->compress.param to hold the uncompressed length of the bgzf data in vb->compress
-
- uint32_t block_comp_len, block_uncomp_len, this_uncomp_len=0;
+ uint32_t this_uncomp_len=0;
if (uncompress)
vb->gzip_compressor = libdeflate_alloc_decompressor(vb, __FUNCLINE);
int64_t start_uncomp_len = vb->scratch.uncomp_len;
+ int32_t max_block_size = TXT_IS_BGZF ? BGZF_MAX_BLOCK_SIZE : GZIL_MAX_BLOCK_SIZE;
- while (vb->scratch.uncomp_len - start_uncomp_len < max_uncomp - BGZF_MAX_BLOCK_SIZE) {
-
- buf_alloc (vb, &vb->scratch, BGZF_MAX_BLOCK_SIZE, max_uncomp/4, char, 1.5, "scratch");
+ // scratch contains gz-compressed data; we use .uncomp_len to track its uncompressed length
+ buf_alloc (vb, &vb->scratch, 0, max_uncomp/2, char, 0, "scratch");
- // case: we have data passed to us from file_open_txt_read - handle it first
- if (!Ltxt && evb->scratch.len) {
- block_uncomp_len = evb->scratch.uncomp_len;
- block_comp_len = evb->scratch.len32;
-
- // if we're reading a VB (not the txt header) - copy the compressed data from evb to vb
- if (evb != vb) {
- buf_copy (vb, &vb->scratch, &evb->scratch, char,0,0,0);
- buf_free (evb->scratch);
- }
+ while (vb->scratch.uncomp_len - start_uncomp_len <= max_uncomp - max_block_size &&
+ !txt_file->no_more_blocks) {
+
+ bool is_eof = false; // only used for GZIL
+ GzStatus status = TXT_IS_BGZF ? bgzf_read_block (txt_file, false)
+ : gzil_read_block (txt_file, false, &is_eof);
+
+ uint32_t this_block_start = vb->scratch.len32;
+ buf_add_more (vb, &vb->scratch, txt_file->gz_data.data, txt_file->gz_data.comp_len, "scratch");
+
+ // check for corrupt data - at this point we've already confirmed the file is BGZF/GZIL, so not expecting a different block type
+ if (status != GZ_SUCCESS) {
+ // dump to file
+ char dump_fn[strlen(txt_name)+100];
+ snprintf (dump_fn, sizeof (dump_fn), "%s.vb-%u.bad-%s.bad-offset-0x%x",
+ txt_name, vb->vblock_i, codec_name (txt_file->codec), this_block_start);
+
+ buf_dump_to_file (dump_fn, &vb->scratch, 1, false, false, true, false);
- // add block to list
- buf_alloc (vb, &vb->bgzf_blocks, 1, 1.2 * max_uncomp / BGZF_MAX_BLOCK_SIZE, BgzfBlockZip, 2, "bgzf_blocks");
- BNXT (BgzfBlockZip, vb->bgzf_blocks) = (BgzfBlockZip)
- { .txt_index = 0,
- .compressed_index = 0,
- .txt_size = block_uncomp_len,
- .comp_size = block_comp_len,
- .is_decompressed = false };
+ ABORT ("%s: Invalid %s block: block_comp_len=%u. Entire data of this vblock dumped to %s, bad block stats at offset 0x%x",
+ VB_NAME, codec_name (txt_file->codec), txt_file->gz_data.comp_len, dump_fn, this_block_start);
+ }
- // note: this is the first BGZF block of the file, so vb->bgzf_i remains 0
+ // add block to list - including the EOF block (block_comp_len=BGZF_EOF_LEN block_uncomp_len=0)
+ if (txt_file->gz_data.comp_len/* note: this is 0 if truncated or EOF with no EOF block */) {
+ buf_alloc (vb, &vb->gz_blocks, 1, MAX_(1000, 1.2 * max_uncomp / max_block_size), GzBlockZip, 2, "gz_blocks");
+ BNXT (GzBlockZip, vb->gz_blocks) = (GzBlockZip)
+ { .txt_index = Ltxt, // after passed-down data and all previous blocks
+ .compressed_index = this_block_start,
+ .txt_size = txt_file->gz_data.uncomp_len,
+ .comp_size = txt_file->gz_data.comp_len,
+ .is_decompressed = !txt_file->gz_data.uncomp_len, // EOF block is always considered decompressed
+ .is_eof = is_eof };
+
+ // case EOF block: we are not going to decompress the block, so account for it here
+ if (!txt_file->gz_data.uncomp_len)
+ inc_disk_gz_uncomp_or_trunc (txt_file, txt_file->gz_data.comp_len);
}
- else {
- if (!vb->bgzf_blocks.len) // possibly bgzf_blocks already contains bgzf blocks unconsumed by the previous VB (see bgzf_zip_init_vb)
- vb->vb_bgzf_i = txt_file->bgzf_isizes.len; // first bgzf block number for this VB
+
+ // case EOF - happens in 2 cases: 1. EOF block (block_comp_len=BGZF_EOF_LEN) or 2. no EOF block (block_comp_len=0)
+ if (!txt_file->gz_data.uncomp_len) {
+ txt_file->no_more_blocks = true;
- block_uncomp_len = (uint32_t)bgzf_read_block (txt_file, BAFT8 (vb->scratch), &block_comp_len, HARD_FAIL);
-
- // check for corrupt data - at this point we've already confirm the file is BGZF so not expecting a different block
- if (block_uncomp_len == BGZF_BLOCK_GZIP_NOT_BGZIP || block_uncomp_len == BGZF_BLOCK_IS_NOT_GZIP) {
- // dump to file
- char dump_fn[strlen(txt_name)+100];
- snprintf (dump_fn, sizeof (dump_fn), "%s.vb-%u.bad-bgzf.bad-offset-0x%X", txt_name, vb->vblock_i, vb->scratch.len32);
- Buffer dump_buffer = vb->scratch; // a copy
- dump_buffer.len32 += block_comp_len; // compressed size
- buf_dump_to_file (dump_fn, &dump_buffer, 1, false, false, true, false);
-
- ABORT ("%s: Invalid BGZF block: block_comp_len=%u. Entire BGZF data of this vblock dumped to %s, bad block stats at offset 0x%X",
- VB_NAME, block_comp_len, dump_fn, vb->scratch.len32);
- }
+ if (flag.show_bgzf && txt_file->bgzf_flags.has_eof_block)
+ iprint0 ("IO vb=0 EOF\n");
+ }
- // add block to list - including the EOF block (block_comp_len=BGZF_EOF_LEN block_uncomp_len=0)
- if (block_comp_len && block_uncomp_len/* 0 if truncated */) {
- buf_alloc (vb, &vb->bgzf_blocks, 1, 1.2 * max_uncomp / BGZF_MAX_BLOCK_SIZE, BgzfBlockZip, 2, "bgzf_blocks");
- BNXT (BgzfBlockZip, vb->bgzf_blocks) = (BgzfBlockZip)
- { .txt_index = Ltxt, // after passed-down data and all previous blocks
- .compressed_index = vb->scratch.len32,
- .txt_size = block_uncomp_len,
- .comp_size = block_comp_len,
- .is_decompressed = !block_uncomp_len }; // EOF block is always considered decompressed
-
- vb->scratch.len32 += block_comp_len; // compressed size
- }
-
- // case EOF - happens in 2 cases: 1. EOF block (block_comp_len=BGZF_EOF_LEN) or 2. no EOF block (block_comp_len=0)
- if (!block_uncomp_len) {
- txt_file->is_eof = true;
- if (flag.show_bgzf && txt_file->bgzf_flags.has_eof_block)
- iprint0 ("IO vb=0 EOF\n");
- break;
+ else {
+ this_uncomp_len += txt_file->gz_data.uncomp_len; // total uncompressed length of data read by this function call
+ vb->scratch.uncomp_len += txt_file->gz_data.uncomp_len; // total uncompressed length of data in vb->scratch
+ Ltxt += txt_file->gz_data.uncomp_len; // total length of txt_data after adding decompressed vb->scratch (may also include pass-down data)
+
+ // we decompress one block a time in the loop so that the decompression is parallel with the disk reading into cache
+ if (uncompress) {
+ START_TIMER;
+ bgz_uncompress_one_block (vb, BLST (GzBlockZip, vb->gz_blocks), txt_file->codec);
+ COPY_TIMER(bgz_uncompress_during_read);
}
}
- this_uncomp_len += block_uncomp_len; // total uncompressed length of data read by this function call
- vb->scratch.uncomp_len += block_uncomp_len; // total uncompressed length of data in vb->compress
- Ltxt += block_uncomp_len; // total length of txt_data after adding decompressed vb->scratch (may also include pass-down data)
-
- // we decompress one block a time in the loop so that the decompression is parallel with the disk reading into cache
- if (uncompress) {
- START_TIMER;
- bgzf_uncompress_one_block (vb, BLST (BgzfBlockZip, vb->bgzf_blocks));
- COPY_TIMER(txtfile_read_block_bgzf_uncompress);
- }
+ buf_remove (txt_file->gz_data, uint8_t, 0, txt_file->gz_data.comp_len);
+ txt_file->gz_data.comp_len = txt_file->gz_data.uncomp_len = 0;
}
if (uncompress) {
- buf_free (evb->scratch);
+ buf_free (vb->scratch);
libdeflate_free_decompressor ((struct libdeflate_decompressor **)&vb->gzip_compressor, __FUNCLINE);
}
- COPY_TIMER (txtfile_read_block_bgzf);
+ COPY_TIMER (txtfile_read_block_bgz);
return this_uncomp_len;
-#undef param
}
// performs a single I/O read operation - returns number of bytes read
// data is placed in vb->txt_data, except if its BGZF and uncompress=false - compressed data is placed in vb->scratch
static uint32_t txtfile_read_block (VBlockP vb, uint32_t max_bytes,
- bool uncompress) // in BGZF, whether to uncompress the data. ignored if not BGZF
+ bool uncompress) // in BGZF/GZIL, whether to uncompress the data. ignored if not BGZF/GZIL
{
START_TIMER;
- if (txt_file->is_eof) return 0; // nothing more to read
+ if (txt_file->no_more_blocks) return 0; // nothing more to read
- uint32_t bytes_read=0;
+ uint32_t uncomp_len=0;
// BGZF note: we read *compressed* data into vb->scratch - that will be decompressed later. we read
// data with a *decompressed* size up to max_bytes. vb->scratch always contains only full BGZF blocks
switch (txt_file->codec) {
- case CODEC_NONE : bytes_read = txtfile_read_block_plain (vb, max_bytes); break;
- case CODEC_BGZF : bytes_read = txtfile_read_block_bgzf (vb, max_bytes, uncompress); break; // bytes_read is in uncompressed terms
- case CODEC_GZ : bytes_read = txtfile_read_block_gz (vb, max_bytes); break;
- case CODEC_BZ2 : bytes_read = txtfile_read_block_bz2 (vb, max_bytes); break;
+ case CODEC_NONE : uncomp_len = txtfile_read_block_plain (vb, max_bytes); break;
+ case CODEC_GZIL :
+ case CODEC_BGZF : uncomp_len = txtfile_read_block_bgz (vb, max_bytes, uncompress); break;
+ case CODEC_GZ : uncomp_len = txtfile_read_block_igzip (vb, max_bytes); break;
+ case CODEC_BZ2 : uncomp_len = txtfile_read_block_bz2 (vb, max_bytes); break;
default: ABORT ("txtfile_read_block: Invalid file type %s (codec=%s)", ft_name (txt_file->type), codec_name (txt_file->codec));
}
COPY_TIMER_EVB (read);
- return bytes_read;
+ return uncomp_len;
}
// iterator on a buffer containing newline-terminated lines
@@ -457,7 +602,7 @@ void txtfile_read_header (bool is_first_txt)
buf_copy (evb, &txt_file->unconsumed_txt, &evb->txt_data, char, header_len, 0, "txt_file->unconsumed_txt");
evb->txt_data.len = header_len; // trim to uncompressed length of txt header
- txt_file->header_size_bgzf = bgzf_copy_unconsumed_blocks (evb); // copy unconsumed or partially consumed bgzf_blocks to txt_file->unconsumed_bgzf_blocks
+ txt_file->header_size_bgzf = bgz_copy_unconsumed_blocks (evb); // copy unconsumed or partially consumed gz_blocks to txt_file->unconsumed_bgz_blocks
}
txt_file->txt_data_so_far_single = txt_file->header_size = header_len;
@@ -490,39 +635,63 @@ static uint32_t txtfile_get_unconsumed_to_pass_to_next_vb (VBlockP vb)
// case: the data is BGZF-compressed in vb->scratch, except for passed down data from prev VB
// uncompress one block at a time to see if its sufficient. usually, one block is enough
- if (txt_file->codec == CODEC_BGZF && vb->scratch.len) {
+ if ((TXT_IS_BGZF || TXT_IS_GZIL) && vb->scratch.len) {
vb->gzip_compressor = libdeflate_alloc_decompressor (vb, __FUNCLINE);
- for (int block_i=vb->bgzf_blocks.len32 - 1; block_i >= 0; block_i--) {
- BgzfBlockZip *bb = B(BgzfBlockZip, vb->bgzf_blocks, block_i);
- bgzf_uncompress_one_block (vb, bb);
+ for (int block_i=vb->gz_blocks.len32 - 1; block_i >= 0; block_i--) {
+ GzBlockZip *bb = B(GzBlockZip, vb->gz_blocks, block_i);
+
+ START_TIMER;
+ bgz_uncompress_one_block (vb, bb, txt_file->codec);
+ COPY_TIMER(bgz_uncompress_during_read);
+
+ // case: we dropped the bb: happens only when the final GZIL block is truncated, and this was not detected earlier in gzil_read_block.
+ if (!bb->is_decompressed) {
+ vb->gz_blocks.len32--;
+ Ltxt -= bb->txt_size;
+ segconf.zip_txt_modified = true;
+
+ WARN ("FYI: %s is truncated - its final GZIL block in incomplete. Dropping final %u bytes of the GZ data.", txt_name, bb->comp_size);
+ }
+
+ else {
+ START_TIMER;
+ int32_t last_i = Ltxt-1; // test from end of data
+ pass_to_next_vb_len = (DT_FUNC(txt_file, unconsumed)(vb, MAX_(bb->txt_index, 0), &last_i)); // note: bb->txt_index might be negative if part of this bb was consumed by the previous VB
+ COPY_TIMER (txtfile_get_unconsumed_callback);
- int32_t last_i = Ltxt-1; // test from end of data
- pass_to_next_vb_len = (DT_FUNC(txt_file, unconsumed)(vb, MAX_(bb->txt_index, 0), &last_i)); // note: bb->txt_index might be negative if part of this bb was consumed by the previous VB
- if (pass_to_next_vb_len >= 0) goto done; // we have the answer (callback returns -1 if it needs more data)
+ if (pass_to_next_vb_len >= 0) goto done; // we have the answer (callback returns -1 if it needs more data)
+ }
}
}
// test remaining txt_data including passed-down data from previous VB
+ {
+ START_TIMER;
int32_t last_i = Ltxt-1; // test from end of data
pass_to_next_vb_len = (DT_FUNC(vb, unconsumed)(vb, 0, &last_i));
+ COPY_TIMER (txtfile_get_unconsumed_callback);
+ }
+ // case: callback doesn't have enough data for even one line, but file has no more data
if (flag.truncate && pass_to_next_vb_len < 0 && !segconf.running) {
- WARN ("FYI: %s is truncated - its final %s in incomplete. Dropping this final %s.", txt_name, DTPT(line_name), DTPT(line_name));
+ WARN ("FYI: %s is truncated - its final %s in incomplete. Dropping this partial final %s of %u bytes.",
+ txt_name, DTPT(line_name), DTPT(line_name), Ltxt);
txt_file->last_truncated_line_len = Ltxt;
Ltxt = pass_to_next_vb_len = 0; // truncate last partial line
+ segconf.zip_txt_modified = true;
}
ASSERT (pass_to_next_vb_len >= 0 ||
segconf.running, // case: we're testing memory and this VB is too small for a single line - return and caller will try again with a larger VB
- "Reason: failed to find a full line %sin vb=%s data_type=%s txt_data.len=%u txt_file->codec=%s vb->is_eof=%s interleaved=%s.\n"
+ "Reason: failed to find a full line %sin vb=%s data_type=%s txt_data.len=%u txt_file->codec=%s is_last_vb_in_txt_file=%s interleaved=%s.\n"
"Known possible causes:\n"
- "- The file is %s %s.\n"
+ "- The file is %s %s. Tip: try running with --truncate\n"
"- The file is not a %s file.\n"
"VB dumped: %s\n",
DTPT(is_binary) ? "" : "(i.e. newline-terminated) ",
- VB_NAME, dt_name (txt_file->data_type), Ltxt, codec_name (txt_file->codec), TF(vb->is_eof), TF(segconf.is_interleaved),
+ VB_NAME, dt_name (txt_file->data_type), Ltxt, codec_name (txt_file->codec), TF(vb->is_last_vb_in_txt_file), TF(segconf.is_interleaved),
DTPT(is_binary) ? "truncated but not on the boundary of the" : "missing a newline on the last", DTPT(line_name),
TXT_DT(REF) ? "FASTA" : dt_name (txt_file->data_type),
txtfile_dump_vb (vb, txt_name).s);
@@ -531,14 +700,12 @@ static uint32_t txtfile_get_unconsumed_to_pass_to_next_vb (VBlockP vb)
if (vb->gzip_compressor)
libdeflate_free_decompressor ((struct libdeflate_decompressor **)&vb->gzip_compressor, __FUNCLINE);
- COPY_TIMER (txtfile_get_unconsumed_to_pass_to_next_vb);
return (uint32_t)pass_to_next_vb_len;
}
static bool seggable_size_is_modifiable (void)
{
- Codec c = txt_file->source_codec;
- return c==CODEC_GZ || c==CODEC_BGZF || c==CODEC_CRAM || c==CODEC_BZ2 || c==CODEC_BAM;
+ return !is_read_via_ext_decompressor (txt_file) && !TXT_IS_PLAIN;
}
// estimate the size of the txt_data of the file - i.e. the uncompressed data excluding the header -
// based on the observed or assumed compression ratio of the source compression so far
@@ -548,37 +715,36 @@ static void txtfile_set_seggable_size (void)
: flag.stdin_size ? flag.stdin_size // user-provided size
: 0; // our estimate will be 0
double source_comp_ratio=1;
- switch (txt_file->source_codec) {
- case CODEC_GZ: // for internal compressors, we use the observed source-compression ratio
- case CODEC_BGZF:
- case CODEC_BAM:
- case CODEC_BZ2: {
- if (txt_file->is_remote || txt_file->redirected)
- source_comp_ratio = 4;
- else {
- double plain_len = txt_file->txt_data_so_far_single + txt_file->unconsumed_txt.len;
- double gz_bz2_len = file_tell (txt_file, HARD_FAIL); // should always work for bz2 or gz. For BZ2 this includes up to 64K read from disk but still in its internal buffers
-
- // case: header is whole BGZF blocks - remove header from calculation to get a better estimate of the seggable compression ratio
- if (txt_file->header_size_bgzf) {
- plain_len -= txt_file->header_size;
- gz_bz2_len -= txt_file->header_size_bgzf;
- }
-
- source_comp_ratio = plain_len / gz_bz2_len;
+
+ if (!is_read_via_ext_decompressor (txt_file)) {
+ if (txt_file->is_remote || txt_file->redirected)
+ source_comp_ratio = 4;
+
+ else if (TXT_IS_PLAIN)
+ source_comp_ratio = 1;
+
+ else {
+ double plain_len = txt_file->txt_data_so_far_single + txt_file->unconsumed_txt.len; // all data that has been decompressed
+ double comp_len = TXT_IS_BZ2 ? file_tell (txt_file, HARD_FAIL)
+ : txt_file->disk_so_far - txt_file->gz_data.len; // data read from disk, excluding data still awaiting decompression
+
+ // case: header is whole BGZF blocks - remove header from calculation to get a better estimate of the seggable compression ratio
+ if (txt_file->header_size_bgzf) {
+ plain_len -= txt_file->header_size;
+ comp_len -= txt_file->header_size_bgzf;
}
- break;
+
+ source_comp_ratio = plain_len / MAX_(comp_len, 1);
}
+ }
- // external decompressors
+ // external decompressors
+ else switch (txt_file->source_codec) {
case CODEC_BCF: source_comp_ratio = 10; break; // note: .bcf files might be compressed or uncompressed - we have no way of knowing as "bcftools view" always serves them to us in plain VCF format. These ratios are assuming the bcf is compressed as it normally is.
case CODEC_XZ: source_comp_ratio = 15; break;
case CODEC_CRAM: source_comp_ratio = 25; break;
case CODEC_ORA: source_comp_ratio = 25; break;
case CODEC_ZIP: source_comp_ratio = 3; break;
-
- case CODEC_NONE: source_comp_ratio = 1; break;
-
default: ABORT ("unspecified txt_file->codec=%s (%u)", codec_name (txt_file->codec), txt_file->codec);
}
@@ -595,9 +761,10 @@ int64_t txtfile_get_seggable_size (void)
return txt_file->est_seggable_size;
}
-uint64_t txtfile_max_memory_per_vb (void)
+static uint32_t txt_data_alloc_size (uint32_t vb_size)
{
- return segconf.vb_size - TXTFILE_READ_VB_PADDING;
+ return vb_size +
+ TXTFILE_READ_VB_PADDING; // we need this quantity of unused bytes at the end of vb.txt_data
}
// ZIP main thread
@@ -607,47 +774,50 @@ void txtfile_read_vblock (VBlockP vb)
ASSERTNOTNULL (txt_file);
ASSERT_DT_FUNC (txt_file, unconsumed);
- ASSERT ((segconf.vb_size >= ABSOLUTE_MIN_VBLOCK_MEMORY && segconf.vb_size <= ABSOLUTE_MAX_VBLOCK_MEMORY) || segconf.running,
+ ASSERT (IN_RANGE (segconf.vb_size, ABSOLUTE_MIN_VBLOCK_MEMORY, ABSOLUTE_MAX_VBLOCK_MEMORY) || segconf.running,
"Invalid vb_size=%"PRIu64" comp_i(0-based)=%u", segconf.vb_size, z_file->num_txts_so_far-1);
- buf_alloc (vb, &vb->txt_data, 0, segconf.vb_size, char, 1, "txt_data");
+ if (txt_file->no_more_blocks && !txt_file->unconsumed_txt.len) return; // we're done
+
+ uint32_t my_vb_size = segconf.vb_size; // might grow to match a FASTQ R2 vb to its R1 pair
+
+ buf_alloc (vb, &vb->txt_data, 0, txt_data_alloc_size (my_vb_size), char, 1, "txt_data");
- // read data from the file until either 1. EOF is reached 2. end of block is reached
- uint64_t max_memory_per_vb = txtfile_max_memory_per_vb();
+ // read data from the file until either 1. EOF is reached 2. end of vb is reached
uint32_t pass_to_next_vb_len = 0;
// start with using the data passed down from the previous VB (note: copy & free and not move! so we can reuse txt_data next vb)
if (txt_file->unconsumed_txt.len) {
- uint64_t bytes_moved = MIN_(txt_file->unconsumed_txt.len, max_memory_per_vb);
+ uint64_t bytes_moved = MIN_(txt_file->unconsumed_txt.len, segconf.vb_size);
buf_copy (vb, &vb->txt_data, &txt_file->unconsumed_txt, char, 0, bytes_moved, "txt_data");
buf_remove (txt_file->unconsumed_txt, char, 0, bytes_moved);
}
- if (txt_file->codec == CODEC_BGZF)
- bgzf_zip_init_vb (vb);
+ bool is_bgz = TXT_IS_BGZF || TXT_IS_GZIL;
+
+ if (is_bgz) bgz_zip_init_vb (vb);
vb->comp_i = flag.zip_comp_i; // needed for VB_NAME
- bool always_uncompress = flag.pair == PAIR_R2 || // if we're reading the 2nd paired file, fastq_txtfile_have_enough_lines needs the whole data
- flag.make_reference || // unconsumed callback for make-reference needs to inspect the whole data
- segconf.running ||
- flag.zip_lines_counted_at_init_vb || // *_zip_init_vb needs to count lines
- flag.biopsy;
+ bool always_uncompress = flag.zip_uncompress_source_during_read || segconf.running;
- for (bool first=true; ; first=false) {
+ // case: compute thread should decompress
+ if (!always_uncompress && (TXT_IS_BGZF || TXT_IS_GZIL))
+ vb->txt_codec = txt_file->codec;
- bool is_bgzf = (txt_file->codec == CODEC_BGZF);
+ uint32_t max_block_size = TXT_IS_BGZF ? BGZF_MAX_BLOCK_SIZE : GZIL_MAX_BLOCK_SIZE;
- uint32_t bytes_requested = MIN_(max_memory_per_vb - Ltxt, 1 GB /* read() can't handle more */);
- bool no_read_expected = is_bgzf && (bytes_requested <= BGZF_MAX_BLOCK_SIZE); // in this case, txtfile_read_block is expected to return 0
+ for (bool first=true; ; first=false) {
+ uint32_t bytes_requested = MIN_(my_vb_size - Ltxt, 1 GB /* read() can't handle more */);
+ bool no_read_expected = is_bgz && (bytes_requested <= max_block_size); // in this case, txtfile_read_block is expected to return 0
- uint32_t len = (max_memory_per_vb > Ltxt) ? txtfile_read_block (vb, bytes_requested, always_uncompress) : 0;
+ uint32_t len = (my_vb_size > Ltxt) ? txtfile_read_block (vb, bytes_requested, always_uncompress) : 0;
- if (!len && first && !Ltxt) return;
+ if (!len && first && !Ltxt) goto done; // case: no data was read nor passed up from the prev vb (and hence there is also no data to pass down to the next vb)
- // when reading BGZF, we might be filled up even without completely filling max_memory_per_vb
+ // when reading BGZF, we might be filled up even without completely filling my_vb_size
// if there is room left for only a partial BGZF block (we can't read partial blocks)
- uint32_t filled_up = max_memory_per_vb - (is_bgzf ? (BGZF_MAX_BLOCK_SIZE - 1): 0);
+ uint32_t filled_up = my_vb_size - (is_bgz ? (max_block_size - 1) : 0);
if (len && Ltxt < filled_up) continue; // continue filling up txt_data...
@@ -658,13 +828,15 @@ void txtfile_read_vblock (VBlockP vb)
!fastq_txtfile_have_enough_lines (vb, &pass_to_next_vb_len, &my_lines, &pair_vb_i, &pair_num_lines, &pair_txt_data_len)) { // we don't yet have all the data we need
// note: the opposite case where R2 has more reads than R1 is caught in fastq_txtfile_have_enough_lines or zip_prepare_one_vb_for_dispatching
- ASSINP ((len || no_read_expected) && Ltxt, "Error: File %s has less FASTQ reads than its R1 mate (vb=%s has %u lines while its pair_vb_i=%d num_R1_VBs=%u has pair_txt_data_len=%u pair_num_lines=%u; vb=%s Ltxt=%u bytes_requested=%u bytes_read=%u eof=%s max_memory_per_vb=%"PRIu64" vb_size=%s src_codec=%s)",
- txt_name, VB_NAME, my_lines, pair_vb_i, sections_get_num_vbs (FQ_COMP_R1), pair_txt_data_len/*only set if flag.debug*/, pair_num_lines, VB_NAME, Ltxt, bytes_requested, len, TF(txt_file->is_eof), max_memory_per_vb, str_size (segconf.vb_size).s, src_codec_name (txt_file->source_codec, vb->comp_i).s);
+ ASSINP ((len || no_read_expected) && Ltxt, "Error: File %s has fewer FASTQ reads than its R1 mate (vb=%s has %u lines while its pair_vb_i=%d num_R1_VBs=%u has pair_txt_data_len=%u pair_num_lines=%u; vb=%s Ltxt=%u bytes_requested=%u bytes_read=%u no_more_blocks=%s my_vb_size=%u vb_size=%s src_codec=%s disk_so_far=%"PRIu64").%s",
+ txt_name, VB_NAME, my_lines, pair_vb_i, sections_get_num_vbs (FQ_COMP_R1), pair_txt_data_len/*only set if flag.debug*/, pair_num_lines, VB_NAME, Ltxt, bytes_requested, len, TF(txt_file->no_more_blocks), my_vb_size, str_size (segconf.vb_size).s, txtfile_codec_name (z_file, vb->comp_i).s, txt_file->disk_so_far,
+ (flag.truncate && (TXT_IS_BGZF || TXT_IS_GZIL || z_file->comp_codec[0] == CODEC_BGZF || z_file->comp_codec[0] == CODEC_GZIL)) ? " Tip: this might be due to --truncate. Try adding --no-bgzf" : "");
+
// if we need more lines - increase memory and keep on reading
- max_memory_per_vb += MAX_((is_bgzf ? 2 * BGZF_MAX_BLOCK_SIZE : 0), max_memory_per_vb / 16);
+ my_vb_size *= 1.1 * ((double)pair_num_lines / (double)my_lines);
- buf_alloc (vb, &vb->txt_data, 0, max_memory_per_vb, char, 1, "txt_data");
+ buf_alloc (vb, &vb->txt_data, 0, txt_data_alloc_size (my_vb_size), char, 1, "txt_data");
}
else
break;
@@ -682,28 +854,23 @@ void txtfile_read_vblock (VBlockP vb)
if (segconf.running && pass_to_next_vb_len == (uint32_t)-1) {
buf_copy (evb, &txt_file->unconsumed_txt, &vb->txt_data, char, 0, 0, "txt_file->unconsumed_txt");
buf_free (vb->txt_data);
- return;
+ goto done;
}
}
if (pass_to_next_vb_len) {
-
        // note: we might have some unconsumed data, pass it up to the next vb. possibly we still have unconsumed data (can happen if DVCF reject
- // data was passed down from the txt header, greater than max_memory_per_vb)
+ // data was passed down from the txt header, greater than my_vb_size)
buf_insert (evb, txt_file->unconsumed_txt, char, 0, Btxt (Ltxt - pass_to_next_vb_len), pass_to_next_vb_len, "txt_file->unconsumed_txt");
Ltxt -= pass_to_next_vb_len;
- // copy unconsumed or partially consumed bgzf_blocks to txt_file->unconsumed_bgzf_blocks
- if (txt_file->codec == CODEC_BGZF)
- bgzf_copy_unconsumed_blocks (vb);
-
- // if is possible we reached eof but still have pass_up_data - this happens eg in make-reference when a
- // VB takes only one contig from txt_data and pass up the rest - reset eof so that we come back here to process the rest
- txt_file->is_eof = false;
+ // copy unconsumed or partially consumed gz_blocks to txt_file->unconsumed_bgz_blocks
+ if (is_bgz)
+ bgz_copy_unconsumed_blocks (vb);
}
vb->vb_position_txt_file = txt_file->txt_data_so_far_single;
- vb->is_eof = txt_file->is_eof;
+ vb->is_last_vb_in_txt_file = txt_file->no_more_blocks && !txt_file->unconsumed_txt.len;
txt_file->txt_data_so_far_single += Ltxt;
zip_init_vb (vb);
@@ -716,6 +883,11 @@ void txtfile_read_vblock (VBlockP vb)
dispatcher_increment_progress ("read", txt_file->est_num_lines ? (Ltxt / MAX_(segconf.line_len,1)) : Ltxt);
}
+done:
+ if (flag_is_show_vblocks (ZIP_TASK_NAME))
+ iprintf ("VB_READ(id=%d) vb=%s Ltxt=%u vb_position_txt_file=%"PRIu64" unconsumed_txt.len=%u is_last_vb_in_txt_file=%s\n",
+ vb->id, VB_NAME, Ltxt, vb->vb_position_txt_file, txt_file->unconsumed_txt.len32, TF(vb->is_last_vb_in_txt_file));
+
COPY_TIMER (txtfile_read_vblock);
}
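
The growth step above (my_vb_size *= 1.1 * pair_num_lines / my_lines) replaces the old small-step retry (+my_vb_size/16 per round): when an R2 VB turns out to hold fewer reads than its R1 mate, the buffer is enlarged in proportion to the shortfall, plus a 10% margin, so a single additional read round is normally enough. A standalone sketch of that arithmetic with made-up numbers (not Genozip code):

    #include <stdint.h>
    #include <stdio.h>

    int main (void)
    {
        uint32_t my_vb_size     = 16 << 20;   // bytes read into this R2 VB so far
        uint32_t my_lines       = 400000;     // reads currently in the R2 VB
        uint32_t pair_num_lines = 500000;     // reads in the matching R1 VB

        // scale by the R1/R2 line ratio plus a 10% margin, so the next read
        // round is expected to cover the reads still missing vs. R1
        my_vb_size = (uint32_t)(my_vb_size * 1.1 * ((double)pair_num_lines / (double)my_lines));

        printf ("grown my_vb_size = %u bytes\n", my_vb_size); // 23068672 (= 22 MB) here
        return 0;
    }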
@@ -729,23 +901,32 @@ DataType txtfile_zip_get_file_dt (rom filename)
return file_get_data_type_of_input_file (ft);
}
-// outputs details on src_codec of a component, as stored in z_file
-StrText src_codec_name (Codec src_codec, CompIType comp_i) // COMP_NONE means report for current txt_file
+// outputs details on txt_file->codec of a component, as stored in z_file
+StrText txtfile_codec_name (FileP z_file/*obscures global*/, CompIType comp_i)
{
StrText s;
-
- if (src_codec == CODEC_BGZF ||
- (src_codec == CODEC_BAM && z_file->comp_codec[comp_i==CODEC_BGZF])) {
+
+ if (!IN_RANGE (comp_i, 0, MAX_NUM_COMPS-1))
+ snprintf (s.s, sizeof (s.s), "comp_i=%u out_of_range", comp_i);
+
+ else if (z_file->comp_codec[comp_i] == CODEC_BGZF) {
if (z_file->comp_bgzf[comp_i].level < BGZF_COMP_LEVEL_UNKNOWN)
snprintf (s.s, sizeof (s.s), "BGZF(%s[%d])", bgzf_library_name (z_file->comp_bgzf[comp_i].library, false), z_file->comp_bgzf[comp_i].level);
else
strcpy (s.s, "BGZF(unknown_lib)");
}
- else if (src_codec==CODEC_GZ && z_file->gzip_section_size[comp_i] && !z_file->gzip_section_size_single_block[comp_i])
- snprintf (s.s, sizeof (s), "GZ(%.50s)", str_size (z_file->gzip_section_size[comp_i]).s);
+
+ else if (z_file->comp_codec[comp_i]==CODEC_GZ) {
+ bool fextra = ((z_file->gz_header[comp_i][3] & 4) == 4); // FEXTRA is bit 2 of FLG
+
+ snprintf (s.s, sizeof (s), "GZ(%.24s%.20s%.20s)",
+ str_to_hex (z_file->gz_header[comp_i], fextra ? 12 : 10).s,
+ cond_str (z_file->gz_isize[comp_i][0], "-", str_size (z_file->gz_isize[comp_i][0]).s),
+ cond_str (z_file->gz_isize[comp_i][1], "-", str_size (z_file->gz_isize[comp_i][1]).s));
+ }
else
- strcpy (s.s, codec_name (src_codec));
+ strcpy (s.s, codec_name (z_file->comp_codec[comp_i]));
return s;
}
\ No newline at end of file
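
The FEXTRA test added in txtfile_codec_name above follows the gzip member-header layout of RFC 1952: bytes 0-9 are ID1, ID2, CM, FLG, MTIME (4 bytes), XFL and OS, and bit 2 (value 4) of FLG announces a 2-byte XLEN field immediately after - which is why 12 rather than 10 stored header bytes are hex-dumped when the bit is set. A minimal standalone restatement of that test (plain C, no Genozip types):

    #include <stdbool.h>
    #include <stdint.h>

    // hdr points at the first bytes of a .gz stream (RFC 1952):
    // hdr[0]=0x1f hdr[1]=0x8b (magic), hdr[2]=CM, hdr[3]=FLG,
    // hdr[4..7]=MTIME, hdr[8]=XFL, hdr[9]=OS
    static bool gz_has_fextra (const uint8_t *hdr)
    {
        return (hdr[3] & 4) == 4;   // FEXTRA set: bytes 10..11 hold XLEN of the extra field
    }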
diff --git a/src/txtfile.h b/src/txtfile.h
index 7e9d5e7d..58f560f6 100644
--- a/src/txtfile.h
+++ b/src/txtfile.h
@@ -11,12 +11,15 @@
#include "genozip.h"
#include "digest.h"
+extern uint32_t txtfile_fread (FileP file, FILE *fp, void *addr, uint32_t size, int64_t *disk_so_far);
+extern void txtfile_fwrite (const void *data, uint32_t size);
+
extern StrTextLong txtfile_dump_vb (VBlockP vb, rom base_name);
-extern StrText src_codec_name (Codec src_codec, CompIType comp_i);
+extern StrText txtfile_codec_name (FileP z_file, CompIType comp_i);
+extern void txtfile_zip_finalize_codecs (void);
extern void txtfile_read_header (bool is_first_txt);
-#define TXTFILE_READ_VB_PADDING 16 // txtfile_read_vblock ensure this quantity of bytes at the end of vb.txt_data are unused
extern uint64_t txtfile_max_memory_per_vb (void);
extern void txtfile_read_vblock (VBlockP vb);
extern int64_t txtfile_get_seggable_size (void);
@@ -25,7 +28,7 @@ typedef bool (*TxtIteratorCallback)(rom line, unsigned line_len, void *cb_param1
extern char *txtfile_foreach_line (BufferP txt_header, bool reverse, TxtIteratorCallback callback, void *cb_param1, void *cb_param2, unsigned cb_param3, int64_t *line_len);
// igzip
-extern void txtfile_init_read_igzip (FileP file);
+extern void txtfile_discover_gz_codec (FileP file);
extern rom isal_error (int ret);
// callbacks
diff --git a/src/txtheader.c b/src/txtheader.c
index 50c54be4..3ae85a9d 100644
--- a/src/txtheader.c
+++ b/src/txtheader.c
@@ -88,7 +88,7 @@ void txtheader_compress (BufferP txt_header,
txt_file->basename && !filename_has_ext (txt_file->basename, ".gz") && !filename_has_ext (txt_file->basename, ".bgz");
// In BGZF, we store the 3 least significant bytes of the file size, so check if the reconstructed BGZF file is likely the same
- if (txt_file->codec == CODEC_BGZF)
+ if (TXT_IS_BGZF)
bgzf_sign (txt_file->disk_size, section_header.codec_info);
filename_base (txt_file->name, false, FILENAME_STDIN, section_header.txt_filename, TXT_FILENAME_LEN);
@@ -359,7 +359,7 @@ void txtheader_piz_read_and_reconstruct (Section sec)
if (!flag.to_stdout && !flag.out_filename) FREE (filename); // file_open_z copies the names
// set BGZF info in txt_file - either that originates from SEC_BGZF, or constructed based on bgzf_flags
- if (needs_recon && txt_file->codec == CODEC_BGZF)
+ if (needs_recon && TXT_IS_BGZF)
bgzf_piz_set_txt_file_bgzf_info (bgzf_flags, header.codec_info);
// note: this is reset for each component:
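
bgzf_sign, called in the hunk above, keeps only the 3 least-significant bytes of the original file's size as a signature; on reconstruction the same 3 bytes are compared, so sizes agreeing modulo 2^24 make the rebuilt BGZF file "likely the same", as the comment puts it. A hypothetical sketch of that store-and-compare arithmetic (the real function writes into section_header.codec_info; the names below are invented):

    #include <stdbool.h>
    #include <stdint.h>

    // keep the 3 low-order bytes of the file size as a cheap signature
    static void size_sign (int64_t disk_size, uint8_t sig[3])
    {
        for (int i = 0; i < 3; i++)
            sig[i] = (disk_size >> (8 * i)) & 0xff;
    }

    // a reconstructed file whose size matches modulo 2^24 passes the check
    static bool size_sign_matches (int64_t recon_size, const uint8_t sig[3])
    {
        uint8_t r[3];
        size_sign (recon_size, r);
        return r[0] == sig[0] && r[1] == sig[1] && r[2] == sig[2];
    }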
diff --git a/src/url.c b/src/url.c
index fc1b26fa..1227fdc3 100644
--- a/src/url.c
+++ b/src/url.c
@@ -433,11 +433,11 @@ char *url_esc_non_valid_chars_(rom in, char *out/*malloced if NULL*/, bool esc_a
return out;
}
-UrlStr url_esc_non_valid_charsS (rom in) // for short strings - on stack
+StrTextLong url_esc_non_valid_charsS (rom in) // for short strings - on stack
{
- rom esc = url_esc_non_valid_chars_(in, NULL, false); // note: might be longer than UrlStr
+ rom esc = url_esc_non_valid_chars_(in, NULL, false); // note: might be longer than StrTextLong
- UrlStr out;
+ StrTextLong out;
int out_len = MIN_(sizeof (out.s)-1, strlen(esc)); // trim if needed - possibly resulting in an invalid URL!
memcpy (out.s, esc, out_len);
out.s[out_len] = 0;
diff --git a/src/url.h b/src/url.h
index 07494a46..568d02de 100644
--- a/src/url.h
+++ b/src/url.h
@@ -29,8 +29,6 @@ extern bool url_is_url (rom filename);
extern char *url_esc_non_valid_chars_(rom in, char *out, bool esc_all_or_none);
static inline char *url_esc_non_valid_chars (rom in) { return url_esc_non_valid_chars_ (in, NULL, false); } // on heap
+static inline char *url_esc_all_or_none (rom in) { return url_esc_non_valid_chars_ (in, NULL, true ); } // on heap
-static inline char *url_esc_all_or_none (rom in) { return url_esc_non_valid_chars_ (in, NULL, true); } // on heap
-
-typedef struct { char s[1024]; } UrlStr;
-extern UrlStr url_esc_non_valid_charsS (rom in); // for short strings - on stack
+extern StrTextLong url_esc_non_valid_charsS (rom in); // for short strings - on stack
diff --git a/src/vblock.h b/src/vblock.h
index b82208ba..8d7279f1 100644
--- a/src/vblock.h
+++ b/src/vblock.h
@@ -50,7 +50,8 @@ typedef struct {
\
VBIType vblock_i; /* VB 1-based sequential number in the dispatcher (or 0 if not in dispatcher) */\
CompIType comp_i; /* ZIP/PIZ: txt component within z_file that this VB belongs to */ \
- bool is_eof; /* encountered EOF when reading this VB data from file */ \
+ bool is_last_vb_in_txt_file; /* ZIP: this VB is the last VB in its txt_file (excluding gencomp VBs) */ \
+ Codec txt_codec; /* ZIP: if compute thread is expected to decompress scratch into txt_data, this is the codec. If not, CODEC_UNKNOWN. */ \
\
/* compute thread stuff */ \
ThreadId compute_thread_id; /* id of compute thread currently processing this VB */ \
@@ -75,7 +76,7 @@ typedef struct {
\
/* tracking execution */\
uint64_t vb_position_txt_file;/* ZIP/PIZ: position of this VB's data in the plain text file (without source compression): ZIP: as read before any ZIP-side modifications ; PIZ: as reconstructed with all modifications */\
- uint64_t vb_bgzf_i; /* ZIP: index into txt_file->bgzf_isizes of the first BGZF block of this VB */ \
+ uint64_t vb_bgz_i; /* ZIP: index into txt_file->bgzf_isizes of the first BGZF/GZIL block of this VB */ \
int32_t recon_size; /* ZIP: actual size of txt if this VB is reconstructed in PRIMARY coordinates (inc. as ##primary_only in --luft) */\
/* PIZ: expected reconstruction size in the coordinates of reconstruction */\
int32_t txt_size; /* ZIP: original size of of text data read from the file */ \
@@ -126,7 +127,7 @@ typedef struct {
\
/* bgzf - for handling bgzf-compressed files */ \
void *gzip_compressor; /* Handle into libdeflate compressor or decompressor, or zlib's z_stream. Pointer to codec_bufs[].data */ \
- Buffer bgzf_blocks; /* ZIP: an array of BgzfBlockZip tracking the decompression of bgzf blocks in scratch into txt_data. */\
+ Buffer gz_blocks; /* ZIP: an array of GzBlockZip tracking the decompression of bgzf/gzil blocks in scratch into txt_data. */\
/* PIZ: an array of BgzfBlockPiz */ \
\
/* random access, chrom, pos */ \
diff --git a/src/version.h b/src/version.h
index ac5fd109..d6038152 100644
--- a/src/version.h
+++ b/src/version.h
@@ -1,4 +1,4 @@
-#define GENOZIP_CODE_VERSION "15.0.61"
+#define GENOZIP_CODE_VERSION "15.0.62"
extern int code_version_major (void);
extern int code_version_minor (void);
diff --git a/src/writer.c b/src/writer.c
index 470968db..d2dfce40 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -1004,7 +1004,7 @@ bool writer_create_plan (void)
#define BGZF_FLUSH_THRESHOLD (32 MB)
#define PLAIN_FLUSH_THRESHOLD (4 MB)
-#define FLUSH_THRESHOLD ((txt_file->codec == CODEC_BGZF) ? BGZF_FLUSH_THRESHOLD : PLAIN_FLUSH_THRESHOLD)
+#define FLUSH_THRESHOLD (TXT_IS_BGZF ? BGZF_FLUSH_THRESHOLD : PLAIN_FLUSH_THRESHOLD)
static void writer_write (BufferP buf, uint64_t txt_data_len)
{
@@ -1012,7 +1012,7 @@ static void writer_write (BufferP buf, uint64_t txt_data_len)
if (!buf->len) return;
- file_write_txt (STRb(*buf));
+ txtfile_fwrite (STRb(*buf));
txt_file->disk_so_far += buf->len;
@@ -1236,7 +1236,7 @@ static void writer_main_loop (VBlockP wvb) // same as wvb global variable
threads_set_writer_thread();
// if we need to BGZF-compress, we will dispatch the compression workload to compute threads
- Dispatcher dispatcher = (!flag.no_writer && txt_file->codec == CODEC_BGZF && txt_file->bgzf_flags.library != BGZF_EXTERNAL_LIB) ?
+ Dispatcher dispatcher = (!flag.no_writer && TXT_IS_BGZF && txt_file->bgzf_flags.library != BGZF_EXTERNAL_LIB) ?
dispatcher_init ("bgzf", NULL, POOL_BGZF, writer_get_max_bgzf_threads(), 0, false, false, NULL, 0, NULL) : NULL;
// normally, we digest in the compute thread but in case gencomp lines can be inserted into the vb we digest here.
diff --git a/src/zfile.c b/src/zfile.c
index b0d04602..177bff32 100644
--- a/src/zfile.c
+++ b/src/zfile.c
@@ -1,1110 +1,1110 @@
-// ------------------------------------------------------------------
-// zfile.c
-// Copyright (C) 2019-2024 Genozip Limited. Patent Pending.
-// Please see terms and conditions in the file LICENSE.txt
-//
-// WARNING: Genozip is proprietary, not open source software. Modifying the source code is strictly prohibited,
-// under penalties specified in the license.
-
-#include
-#include
-#include
-#include
-#include
-#include "vblock.h"
-#include "zfile.h"
-#include "crypt.h"
-#include "context.h"
-#include "compressor.h"
-#include "piz.h"
-#include "zip.h"
-#include "license.h"
-#include "gencomp.h"
-#include "threads.h"
-#include "refhash.h"
-#include "seg.h"
-#include "dispatcher.h"
-#include "zriter.h"
-#include "b250.h"
-#include "libdeflate_1.19/libdeflate.h"
-
-static void zfile_show_b250_section (SectionHeaderUnionP header_p, ConstBufferP b250_data)
-{
- static Mutex show_b250_mutex = {}; // protect so compute thread's outputs don't get mix
-
- SectionHeaderCtxP header = header_p.ctx;
-
- if (!flag.show_b250 && dict_id_typeless (header->dict_id).num != flag.dict_id_show_one_b250.num) return;
-
- mutex_initialize (show_b250_mutex); // possible unlikely race condition on initializing - good enough for debugging purposes
- mutex_lock (show_b250_mutex);
-
- iprintf ("vb_i=%u %*.*s: ", BGEN32 (header->vblock_i), -DICT_ID_LEN-1, DICT_ID_LEN, dict_id_typeless (header->dict_id).id);
-
- bytes data = B1ST (const uint8_t, *b250_data);
- bytes after = BAFT (const uint8_t, *b250_data);
-
- while (data < after) {
- WordIndex word_index = b250_piz_decode (&data, true, header->b250_size, "zfile_show_b250_section");
- switch (word_index) {
- case WORD_INDEX_ONE_UP : iprint0 ("ONE_UP " ) ; break ;
- case WORD_INDEX_EMPTY : iprint0 ("EMPTY " ) ; break ;
- case WORD_INDEX_MISSING : iprint0 ("MISSING ") ; break ;
- default : iprintf ("%u ", word_index);
- }
- }
- iprint0 ("\n");
-
- mutex_unlock (show_b250_mutex);
-}
-
-// Write uncompressed, unencrypted section to ...[header|body].
-// Note: header includes encryption padding if it was encrypted
-static void zfile_dump_section (BufferP uncompressed_data, SectionHeaderP header, unsigned section_len, DictId dict_id)
-{
- char filename[100];
- VBIType vb_i = BGEN32 (header->vblock_i);
-
- // header
- snprintf (filename, sizeof(filename), "%s.%u.%s.header", st_name (header->section_type), vb_i, dis_dict_id (dict_id).s);
- file_put_data (filename, header, section_len, 0);
-
- // body
- if (uncompressed_data->len) {
- snprintf (filename, sizeof(filename),"%s.%u.%s.body", st_name (header->section_type), vb_i, dis_dict_id (dict_id).s);
- buf_dump_to_file (filename, uncompressed_data, 1, false, false, true, false);
- }
-}
-
-// uncompressed a block and adds a \0 at its end. Returns the length of the uncompressed block, without the \0.
-// when we get here, the header is already unencrypted zfile_one_section
-void zfile_uncompress_section (VBlockP vb,
- SectionHeaderUnionP header_p,
- BufferP uncompressed_data,
- rom uncompressed_data_buf_name, // a name if Buffer, NULL ok if buffer need not be realloced
- uint32_t expected_vb_i,
- SectionType expected_section_type)
-{
- START_TIMER;
- ASSERTNOTNULL (header_p.common);
-
- DictId dict_id = DICT_ID_NONE;
- uint8_t codec_param = 0;
-
- if (expected_section_type == SEC_DICT)
- dict_id = header_p.dict->dict_id;
- else if (expected_section_type == SEC_B250 || expected_section_type == SEC_LOCAL) {
- dict_id = header_p.ctx->dict_id;
- codec_param = header_p.ctx->param;
- }
- else if (expected_section_type == SEC_COUNTS)
- dict_id = header_p.counts->dict_id;
- else if (expected_section_type == SEC_SUBDICTS)
- dict_id = header_p.subdicts->dict_id;
-
- ContextP ctx = NULL;
- if (IS_DICTED_SEC (expected_section_type)) {
- ctx = ECTX(dict_id);
- if (ctx && !ctx->is_loaded && IS_PIZ) // note: never skip in ZIP (when an R2 VB uncompressed R1 sections)
- return; // section was skipped
- }
- else
- if (piz_is_skip_undicted_section (expected_section_type)) return; // undicted section was skipped
-
- SectionHeaderP header = header_p.common;
- uint32_t data_encrypted_len = BGEN32 (header->data_encrypted_len);
- uint32_t data_compressed_len = BGEN32 (header->data_compressed_len);
- uint32_t data_uncompressed_len = BGEN32 (header->data_uncompressed_len);
- uint32_t expected_z_digest = BGEN32 (header->z_digest);
- VBIType vblock_i = BGEN32 (header->vblock_i);
-
- // sanity checks
- ASSERT (header->section_type == expected_section_type, "expecting section type %s but seeing %s", st_name(expected_section_type), st_name(header->section_type));
-
- ASSERT (vblock_i == expected_vb_i || !expected_vb_i, // dictionaries are uncompressed by the main thread with pseduo_vb (vb_i=0)
- "bad vblock_i: header->vblock_i=%u but expecting it to be %u (section_type=%s dict_id=%s)",
- vblock_i, expected_vb_i, st_name (expected_section_type), dis_dict_id(dict_id).s);
-
- if (flag.show_uncompress)
- iprintf ("Uncompress: %s %-9s %-8s comp_len=%-7u uncomp_len=%u\n", VB_NAME,
- st_name (expected_section_type), dict_id.num ? dis_dict_id (dict_id).s : "", data_compressed_len, data_uncompressed_len);
-
- uint32_t compressed_offset = st_header_size (header->section_type);
- if (data_encrypted_len) compressed_offset = ROUNDUP16 (compressed_offset);
-
- uint32_t actual_z_digest = adler32 (1, (uint8_t*)header + compressed_offset, MAX_(data_compressed_len, data_encrypted_len));
-
- if (VER(15) && expected_z_digest != actual_z_digest) {
- sections_show_header (header_p.common, vb, 0, 'E');
- ABORT ("%s:%s: Section %s data failed digest verification: expected_z_digest=%u != actual_z_digest=%u",
- z_name, VB_NAME, st_name(header->section_type), expected_z_digest, actual_z_digest);
- }
-
- // decrypt data (in-place) if needed
- if (data_encrypted_len)
- crypt_do (vb, (uint8_t*)header + compressed_offset, data_encrypted_len, vblock_i, header->section_type, false);
-
- bool bad_compression = false;
-
- if (data_uncompressed_len > 0) { // FORMAT, for example, can be missing in a sample-less file
-
- if (uncompressed_data_buf_name) {
- buf_alloc (vb, uncompressed_data, 0, data_uncompressed_len + sizeof (uint64_t), char, 1.1, uncompressed_data_buf_name); // add a 64b word for safety in case this buffer will be converted to a bits later
- uncompressed_data->len = data_uncompressed_len;
- }
-
- comp_uncompress (vb, ctx, header->codec,
- header->section_type == SEC_LOCAL ? header->sub_codec : 0,
- codec_param,
- (char*)header + compressed_offset, data_compressed_len,
- uncompressed_data, data_uncompressed_len,
- dict_id.num ? dis_dict_id(dict_id).s : st_name(expected_section_type));
-
- //--verify-codec: verify that adler32 of the decompressed data is equal that of the original uncompressed data
- if (flag.verify_codec && uncompressed_data && data_uncompressed_len &&
- BGEN32 (header->magic) != GENOZIP_MAGIC &&
- header->uncomp_adler32 != adler32 (1, uncompressed_data->data, data_uncompressed_len)) {
-
- iprintf ("--verify-codec: BAD ADLER32 section decompressed incorrectly: codec=%s\n", codec_name(header->codec));
- sections_show_header (header, NULL, 0, 'R');
- bad_compression = true;
- }
- }
-
- if (flag.show_b250 && expected_section_type == SEC_B250)
- zfile_show_b250_section (header_p, uncompressed_data);
-
- if ((flag.dump_section && !strcmp (st_name (expected_section_type), flag.dump_section)) || bad_compression) {
- uint64_t save_len = uncompressed_data->len;
- uncompressed_data->len = data_uncompressed_len; // might be different, eg in the case of ref_hash
- zfile_dump_section (uncompressed_data, header, compressed_offset, dict_id);
- uncompressed_data->len = save_len; // restore
- }
-
- if (vb) COPY_TIMER (zfile_uncompress_section);
-}
-
-// uncompress into a specific offset in a pre-allocated buffer
-void zfile_uncompress_section_into_buf (VBlockP vb, SectionHeaderUnionP header_p, uint32_t expected_vb_i, SectionType expected_section_type,
- BufferP dst_buf,
- char *dst) // pointer into dst_buf.data
-{
- if (!header_p.common->data_uncompressed_len) return;
-
- ASSERT (dst >= B1STc(*dst_buf) && dst <= BLSTc(*dst_buf), "expecting dst=%p to be within dst_buf=%s", dst, buf_desc(dst_buf).s);
-
- Buffer copy = *dst_buf;
- copy.data = dst; // somewhat of a hack
-    zfile_uncompress_section (vb, header_p, &copy, NULL, expected_vb_i, expected_section_type); // NULL name prevents buf_alloc
-}
-
-uint32_t zfile_compress_b250_data (VBlockP vb, ContextP ctx)
-{
- struct FlagsCtx flags = ctx->flags; // make a copy
-
- if (VB_DT(FASTQ))
- flags.paired = (flag.pair == PAIR_R1 && fastq_zip_use_pair_identical (ctx->dict_id)) || // "paired" flag in R1 means: "In R2, reconstruct R1 data IFF R2 data is absent" (v15)
- (flag.pair == PAIR_R2 && fastq_zip_use_pair_assisted (ctx->dict_id, SEC_B250)); // "paired" flag in R2 means: "Reconstruction of R2 requires R2 data as well as R1 data"
-
- SectionHeaderCtx header = (SectionHeaderCtx) {
- .magic = BGEN32 (GENOZIP_MAGIC),
- .section_type = SEC_B250,
- .data_uncompressed_len = BGEN32 (ctx->b250.len32),
- .codec = ctx->bcodec == CODEC_UNKNOWN ? CODEC_RANS8 : ctx->bcodec,
- .vblock_i = BGEN32 (vb->vblock_i),
- .flags.ctx = flags,
- .dict_id = ctx->dict_id,
- .b250_size = ctx->b250_size,
- };
-
- ctx->b250_in_z = vb->z_data.len32;
-
- uint32_t compressed_size = comp_compress (vb, ctx, &vb->z_data, &header, ctx->b250.data, NO_CALLBACK, ctx->tag_name);
-
- ctx->b250_in_z_len = vb->z_data.len32 - ctx->b250_in_z;
-
- ctx_zip_z_data_exist (ctx);
-
- return compressed_size;
-}
-
-// returns compressed size
-uint32_t zfile_compress_local_data (VBlockP vb, ContextP ctx, uint32_t sample_size /* 0 means entire local buffer */)
-{
- struct FlagsCtx flags = ctx->flags; // make a copy
-
- if (VB_DT(FASTQ))
- flags.paired = (flag.pair == PAIR_R1 && fastq_zip_use_pair_identical (ctx->dict_id)) || // "paired" flag in R1 means: "Load R1 data in R2, if R2 data is absent" (v15)
- (flag.pair == PAIR_R2 && fastq_zip_use_pair_assisted (ctx->dict_id, SEC_LOCAL)); // "paired" flag in R2 means: "Reconstruction of R2 requires R2 data as well as R1 data"
-
- uint32_t uncompressed_len = ctx->local.len32 * lt_width(ctx);
-
- // case: we're just testing a small sample
- if (sample_size && uncompressed_len > sample_size)
- uncompressed_len = sample_size;
-
- SectionHeaderCtx header = (SectionHeaderCtx) {
- .magic = BGEN32 (GENOZIP_MAGIC),
- .section_type = SEC_LOCAL,
- .data_uncompressed_len = BGEN32 (uncompressed_len),
- .codec = ctx->lcodec == CODEC_UNKNOWN ? CODEC_RANS8 : ctx->lcodec, // if codec has not been decided yet, fall back on RANS8
- .sub_codec = ctx->lsubcodec_piz ? ctx->lsubcodec_piz : CODEC_UNKNOWN,
- .vblock_i = BGEN32 (vb->vblock_i),
- .flags.ctx = flags,
- .dict_id = ctx->dict_id,
- .ltype = ctx->ltype,
- .param = ctx->local_param ? ctx->local.prm8[0] : 0,
- };
-
- if (lt_max(ctx->ltype)) // integer ltype
- header.nothing_char = ctx->nothing_char ? ctx->nothing_char : 0xff; // note: nothing_char=0 is trasmitted as 0xff in SectionHeaderCtx, because 0 means "logic up to version 15.0.37"
-
- LocalGetLineCB *callback = zip_get_local_data_callback (vb->data_type, ctx);
-
- ctx->local_in_z = vb->z_data.len32;
-
- uint32_t compressed_size = comp_compress (vb, ctx, &vb->z_data, &header,
- callback ? NULL : ctx->local.data, callback, ctx->tag_name);
-
- ctx->local_in_z_len = vb->z_data.len32 - ctx->local_in_z;
-
- ctx_zip_z_data_exist (ctx);
-
- return compressed_size;
-}
-
-// compress section - two options for input data -
-// 1. contiguous data in section_data
-// 2. line by line data - by providing a callback + total_len
-void zfile_compress_section_data_ex (VBlockP vb,
- ContextP ctx, // NULL if not context data
- SectionType section_type,
- BufferP section_data, // option 1 - compress contiguous data
- LocalGetLineCB callback, uint32_t total_len, // option 2 - compress data one line at a time
- Codec codec, SectionFlags flags,
- rom name)
-{
- ASSERT (st_header_size (section_type) == sizeof (SectionHeader), "cannot use this for section_type=%s", st_name (section_type));
-
- SectionHeader header = {
- .magic = BGEN32 (GENOZIP_MAGIC),
- .section_type = section_type,
- .data_uncompressed_len = BGEN32 (section_data ? section_data->len : total_len),
- .codec = codec,
- .vblock_i = BGEN32 (vb->vblock_i),
- .flags = flags
- };
-
- if (flag.show_time) codec_show_time (vb, name ? name : st_name (section_type), NULL, codec);
-
- comp_compress (vb, ctx,
- // note: when called from codec_assign_best_codec we use z_data_test. this is because codec_assign_best_codec can be
- // called from within complex codecs for their subcodecs, and if we had used z_data, comp_compress could realloc it as it
- // is being populated by complex codec
- in_assign_codec ? &vb->z_data_test : &vb->z_data,
- &header,
- section_data ? section_data->data : NULL,
- callback, st_name (section_type));
-}
-
-typedef struct { uint64_t start, len; } RemovedSection;
-
-static DESCENDING_SORTER (sort_removed_sections, RemovedSection, start)
-
-// remove ctx and all other ctxs consolidated to it from z_data. akin of unscrambling an egg.
-void zfile_remove_ctx_group_from_z_data (VBlockP vb, Did remove_did_i)
-{
- unsigned num_rms=0;
- RemovedSection rm[vb->num_contexts * 2];
-
- // remove all contexts in the group
- CTX(remove_did_i)->st_did_i = remove_did_i; // so the loop catches it too
- for_ctx_that (ctx->st_did_i == remove_did_i) {
- if (ctx->b250_in_z_len)
- rm[num_rms++] = (RemovedSection){.start = ctx->b250_in_z, .len = ctx->b250_in_z_len };
-
- if (ctx->local_in_z_len)
- rm[num_rms++] = (RemovedSection){.start = ctx->local_in_z, .len = ctx->local_in_z_len};
-
- vb->recon_size -= ctx->txt_len; // it won't be reconstructed after all
-
- ctx_update_zctx_txt_len (vb, ctx, -(int64_t)ctx->txt_len); // substract txt_len added to zctx during merge
-
- buflist_free_ctx (vb, ctx);
- }
-
- // update VB Header (always first in z_data) with reduced recon_size (re-encrypting it if encrypting)
- uint64_t save = vb->z_data.len;
- vb->z_data.len = 0;
- zfile_compress_vb_header (vb);
- vb->z_data.len = save;
-
- // sort indices to the to-be-removed sections in reverse order
- qsort (rm, num_rms, sizeof(RemovedSection), sort_removed_sections);
-
- bool is_encrypted = has_password(); // we can't (easily) test magic if header is encrypted
-
- for (unsigned i=0; i < num_rms; i++) {
- ASSERT (is_encrypted || ((SectionHeader*)B8 (vb->z_data, rm[i].start))->magic == BGEN32(GENOZIP_MAGIC),
- "Data to be cut out start=%"PRIu64" len=%"PRIu64" is not on section boundary", rm[i].start, rm[i].len);
-
- buf_remove (vb->z_data, char, rm[i].start, rm[i].len);
- sections_remove_from_list (vb, rm[i].start, rm[i].len);
-
- ASSERT (is_encrypted || rm[i].start == vb->z_data.len || ((SectionHeader*)B8 (vb->z_data, rm[i].start))->magic == BGEN32(GENOZIP_MAGIC),
- "Data cut out is not exactly one section start=%"PRIu64" len=%"PRIu64, rm[i].start, rm[i].len);
- }
-}
-
-// reads exactly the length required, error otherwise.
-// return a pointer to the data read
-static void *zfile_read_from_disk (FileP file, VBlockP vb, BufferP buf, uint32_t len, SectionType st, DictId dict_id)
-{
- START_TIMER;
-
- ASSERT (len, "reading %s%s: len is 0", st_name (st), cond_str(dict_id.num, " dict_id=", dis_dict_id(dict_id).s));
- ASSERT (buf_has_space (buf, len), "reading %s: buf is out of space: len=%u but remaining space in buffer=%u (tip: run with --show-headers to see where it fails)",
- st_name (st), len, (uint32_t)(buf->size - buf->len));
-
- char *start = BAFTc (*buf);
- uint32_t bytes = fread (start, 1, len, Z_READ_FP(file));
- ASSERT (bytes == len, "reading %s%s read only %u bytes out of len=%u: %s",
- st_name (st), cond_str(dict_id.num, " dict_id=", dis_dict_id(dict_id).s), bytes, len, strerror(errno));
-
- buf->len += bytes;
-
- if (file->mode == READ) // mode==WRITE in case reading pair data in ZIP
- file->disk_so_far += bytes; // consumed by dispatcher_increment_progress
-
- COPY_TIMER (read);
-
- return start;
-}
-
-
-// read section header - called from the main thread.
-// returns offset of header within data, or SECTION_SKIPPED if section is skipped
-int32_t zfile_read_section_do (FileP file,
- VBlockP vb,
- uint32_t original_vb_i, // the vblock_i used for compressing. this is part of the encryption key. dictionaries are compressed by the compute thread/vb, but uncompressed by the main thread (vb=0)
- BufferP data, rom buf_name, // buffer to append
- SectionType expected_sec_type,
- Section sec, // NULL for no seeking
- FUNCLINE)
-{
- ASSERTMAINTHREAD;
-
- ASSERT (!sec || expected_sec_type == sec->st, "called from %s:%u: expected_sec_type=%s but encountered sec->st=%s. vb_i=%u",
- func, code_line, st_name (expected_sec_type), st_name(sec->st), vb->vblock_i);
-
- // skip if this section is not needed according to flags
- if (sec && file == z_file &&
- piz_is_skip_section ((vb ? vb->data_type : z_file->data_type), sec->st, (vb ? vb->comp_i : COMP_NONE), (IS_DICTED_SEC (sec->st) ? sec->dict_id : DICT_ID_NONE),
- sec->flags.flags,
- (vb && vb->preprocessing) ? SKIP_PURPOSE_PREPROC : SKIP_PURPOSE_RECON))
- return SECTION_SKIPPED;
-
- uint32_t header_size = st_header_size (expected_sec_type);
- uint32_t unencrypted_header_size = header_size;
-
- // note: for an encrypted file, while reading the reference, we don't yet know until getting the header whether it
- // will be an SEC_REF_IS_SET (encrypted) or SEC_REFERENCE (not encrypted if originating from external, encryptd if de-novo)
- bool is_encrypted = !Z_DT(REF) &&
- expected_sec_type != SEC_GENOZIP_HEADER &&
- crypt_get_encrypted_len (&header_size, NULL); // update header size if encrypted
-
- uint32_t header_offset = data->len;
- buf_alloc (vb, data, 0, header_offset + header_size, uint8_t, 2, buf_name);
- data->param = 1;
-
- // move the cursor to the section. file_seek is smart not to cause any overhead if no moving is needed
- if (sec) file_seek (file, sec->offset, SEEK_SET, READ, HARD_FAIL);
-
- SectionHeaderP header = zfile_read_from_disk (file, vb, data, header_size, expected_sec_type, IS_DICTED_SEC(sec->st) ? sec->dict_id : DICT_ID_NONE);
- uint32_t bytes_read = header_size;
-
- ASSERT (header, "called from %s:%u: Failed to read data from file %s while expecting section type %s: %s",
- func, code_line, z_name, st_name(expected_sec_type), strerror (errno));
-
- bool is_magical = BGEN32 (header->magic) == GENOZIP_MAGIC;
-
- // SEC_REFERENCE is never encrypted when originating from a reference file, it is encrypted (if the file is encrypted) if it originates from REF_INTERNAL
- if (is_encrypted && HEADER_IS(REFERENCE) && !header->data_encrypted_len) {
- is_encrypted = false;
- header_size = unencrypted_header_size;
- }
-
- // decrypt header (note: except for SEC_GENOZIP_HEADER - this header is never encrypted)
- if (is_encrypted) {
- ASSINP (BGEN32 (header->magic) != GENOZIP_MAGIC,
- "password provided, but file %s is not encrypted (sec_type=%s)", z_name, st_name (header->section_type));
-
- crypt_do (vb, (uint8_t*)header, header_size, original_vb_i, expected_sec_type, true);
-
- is_magical = BGEN32 (header->magic) == GENOZIP_MAGIC; // update after decryption
- }
-
- if (flag.show_headers) {
- sections_show_header (header, NULL, sec ? sec->offset : 0, 'R');
- if (is_genocat && (IS_DICTED_SEC (expected_sec_type) || expected_sec_type == SEC_REFERENCE || expected_sec_type == SEC_REF_IS_SET))
- return header_offset; // in genocat --show-header - we only show headers, nothing else
- }
-
- ASSERT (is_magical || flag.verify_codec, "called from %s:%u: corrupt data (magic is wrong) when attempting to read section=%s dict_id=%s of vblock_i=%u comp=%s in file %s",
- func, code_line, st_name (expected_sec_type), sec ? dis_dict_id (sec->dict_id).s : "(no sec)", vb->vblock_i, comp_name(vb->comp_i), z_name);
-
- uint32_t data_compressed_len = BGEN32 (header->data_compressed_len);
- uint32_t data_encrypted_len = BGEN32 (header->data_encrypted_len);
-
- uint32_t data_len = MAX_(data_compressed_len, data_encrypted_len);
-
- // in case where we already read part of the body (eg if is_encrypted was initially set and then unset) (remaining_data_len might be negative)
- int32_t remaining_data_len = (int32_t)data_len - (int32_t)(bytes_read - header_size);
-
- // check that we received the section type we expect,
- ASSERT (expected_sec_type == header->section_type,
- "called from %s:%u: Unexpected section type when reading %s: expecting %s, found %s sec(expecting)=(offset=%s, dict_id=%s)",
- func, code_line, z_name, st_name(expected_sec_type), st_name(header->section_type),
- sec ? str_int_commas (sec->offset).s : "N/A", sec ? dis_dict_id (sec->dict_id).s : "N/A");
-
- ASSERT (BGEN32 (header->vblock_i) == original_vb_i,
- "Requested to read %s with vb_i=%u, but actual section has vb_i=%u",
- st_name(expected_sec_type), original_vb_i, BGEN32 (header->vblock_i));
-
- // up to v14, we had compressed_offset instead of z_digest. Since we have it, we might as well use it
- // as an extra verification of the SectionHeader integrity
- ASSERT (VER(15) || BGEN32 (header->v14_compressed_offset) == header_size,
- "called from %s:%u: invalid header when reading %s - expecting compressed_offset to be %u but found %u. genozip_version=%u section_type=%s",
- func, code_line, z_name, header_size, BGEN32 (header->v14_compressed_offset), z_file->genozip_version/*set from footer*/, st_name(header->section_type));
-
- // allocate more memory for the rest of the header + data
- buf_alloc (vb, data, 0, header_offset + header_size + data_len, uint8_t, 2, "zfile_read_section");
- header = (SectionHeaderP)Bc(*data, header_offset); // update after realloc
-
- data->param = 2;
-
- // read section data
- if (remaining_data_len > 0)
- zfile_read_from_disk (file, vb, data, remaining_data_len, expected_sec_type, sections_get_dict_id (header));
-
- return header_offset;
-}
-
-// Read one section header - returns the header in vb->scratch - caller needs to free vb->scratch
-SectionHeaderUnion zfile_read_section_header_do (VBlockP vb, Section sec,
- SectionType expected_sec_type, // optional: if not SEC_NONE, also verifies section is of expected type
- FUNCLINE)
-{
- ASSERT (expected_sec_type == SEC_NONE || sec->st == expected_sec_type,
- "called from %s:%u: expecting sec.st=%s to be %s", func, code_line, st_name (sec->st), st_name (expected_sec_type));
-
- uint32_t header_size = st_header_size (sec->st);
- uint32_t unencrypted_header_size = header_size;
-
- file_seek (z_file, sec->offset, SEEK_SET, READ, HARD_FAIL);
-
- bool is_encrypted = (z_file->data_type != DT_REF) &&
- (sec->st != SEC_GENOZIP_HEADER) &&
- crypt_get_encrypted_len (&header_size, NULL); // update header size if encrypted
-
- SectionHeaderUnion header;
- uint32_t bytes = fread (&header, 1, header_size, Z_READ_FP(z_file));
-
- ASSERT (bytes == header_size, "called from %s:%u: Failed to read header of section type %s from file %s: %s (bytes=%u header_size=%u)",
- func, code_line, st_name(sec->st), z_name, strerror (errno), bytes, header_size);
-
- bool is_magical = BGEN32 (header.common.magic) == GENOZIP_MAGIC;
-
- // SEC_REFERENCE is never encrypted in references files, or if REF_EXT_STORE is used.
- // It is encrypted (if the file is encrypted) if REF_INTERNAL is used.
- if (is_encrypted && header.common.section_type == SEC_REFERENCE && !header.common.data_encrypted_len) {
- is_encrypted = false;
- header_size = unencrypted_header_size;
- }
-
- // decrypt header
- if (is_encrypted) {
- ASSERT (BGEN32 (header.common.magic) != GENOZIP_MAGIC,
- "called from %s:%u: password provided, but file %s is not encrypted (sec_type=%s)", func, code_line, z_name, st_name (header.common.section_type));
-
- crypt_do (vb, (uint8_t*)&header, header_size, sec->vblock_i, sec->st, true);
-
- is_magical = BGEN32 (header.common.magic) == GENOZIP_MAGIC; // update after decryption
- }
-
- ASSERT (is_magical, "called from %s:%u: corrupt data (magic is wrong) when attempting to read header of section %s in file %s",
- func, code_line, st_name (sec->st), z_name);
-
- ASSERT (expected_sec_type == SEC_NONE ||
- (BGEN32 (header.common.vblock_i) == sec->vblock_i && header.common.section_type == sec->st) ||
- (!VER(14) && sec->st == SEC_REF_HASH), // in V<=13, REF_HASH didn't have a vb_i in the section list
- "called from %s:%u: Requested to read %s with vb_i=%u, but actual section is %s with vb_i=%u",
- func, code_line, st_name(sec->st), sec->vblock_i, st_name(header.common.section_type), BGEN32 (header.common.vblock_i));
-
- return header;
-}
-
-// up to v14, we had no explicit "has_digest" flag - we calculate it here by searching for proof of digest.
-// since a digest might be 0 by chance, a 0 is not a proof of non-digest, however several 0s are strong enough evidence.
-static bool zfile_get_has_digest_up_to_v14 (SectionHeaderGenozipHeaderP header)
-{
- // proof: a file was compressed with --md5 (zip verifies --md5 conflicts)
- if (!header->flags.genozip_header.adler) return true;
-
- // proof: a file is up to v13 with digest_bound
- if (!VER(14) && !digest_is_zero (header->FASTQ_v13_digest_bound)) return true;
-
- // search for a non-0 digest in the first 3 TXT/VB headers
- Section sec = NULL;
- for (int i=0 ; i < 3 && sections_next_sec2 (&sec, SEC_TXT_HEADER, SEC_VB_HEADER); i++) {
- SectionHeaderUnion header = zfile_read_section_header (evb, sec, SEC_NONE);
-
- // proof: a TXT_HEADER has a digest of the txt_header (0 if file has no header) or
- // digest of the entire file.
- if (sec->st == SEC_TXT_HEADER &&
- (!digest_is_zero (header.txt_header.digest) || !digest_is_zero (header.txt_header.digest_header)))
- return true;
-
- // proof: a VB has a digest
- if (sec->st == SEC_VB_HEADER && !digest_is_zero (header.vb_header.digest)) return true;
- }
-
- return false; // no proof of digest
-}
-
-bool zfile_advance_to_next_header (uint64_t *offset, uint64_t *gap)
-{
- uint64_t start_offset = *offset;
- file_seek (z_file, start_offset, SEEK_SET, READ, HARD_FAIL);
-
- char data[128 KB + 4];
- while (1) {
- memset (data, 0, sizeof(data));
-
- uint32_t bytes;
- if (!(bytes = fread (data+4, 1, 128 KB, Z_READ_FP(z_file))))
- return false; // possibly 4 bytes of the Footer magic remaining
-
- // note: we accept a magic in the final 4 bytes of data - this could be a Footer. We
- // move those last 4 bytes to the next iteration
- for (int i=0; i < bytes; i++)
- if (BGEN32(GET_UINT32 (&data[i])) == GENOZIP_MAGIC) {
- *offset += i - 4;
- *gap = *offset - start_offset;
- return true;
- }
-
- *offset += 128 KB;
- memcpy (data, &data[128 KB], 4);
- }
-}
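
The loop in zfile_advance_to_next_header above scans 128 KB at a time and keeps the last 4 bytes of each chunk at the front of the next buffer, so a GENOZIP_MAGIC that straddles two reads is still found. A self-contained sketch of that carry-over scan (the 64 KB chunk size, helper name and return convention are illustrative, not the Genozip implementation):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define CHUNK (64 * 1024)

    // Return the file offset of the first big-endian 'magic' at or after the
    // current position of fp, or -1 if none is found before EOF.
    static int64_t scan_for_magic (FILE *fp, uint32_t magic)
    {
        uint8_t buf[4 + CHUNK];
        memset (buf, 0, 4);                              // fake carry for the first round
        int64_t window_start = (int64_t)ftell (fp) - 4;  // file offset of buf[0]

        size_t n;
        while ((n = fread (buf + 4, 1, CHUNK, fp)) > 0) {
            for (size_t i = 0; i + 4 <= n + 4; i++) {
                uint32_t w = ((uint32_t)buf[i]   << 24) | ((uint32_t)buf[i+1] << 16) |
                             ((uint32_t)buf[i+2] <<  8) |  (uint32_t)buf[i+3];
                if (w == magic && window_start + (int64_t)i >= 0)
                    return window_start + (int64_t)i;
            }
            memcpy (buf, buf + n, 4);                    // carry the last 4 bytes forward
            window_start += (int64_t)n;
        }
        return -1;
    }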
-
-// check if reference filename exists in the absolute or relative path
-static rom zfile_read_genozip_header_get_ref_filename (rom header_fn)
-{
- // if header_filename exists, use it
- if (file_exists (header_fn)) {
- char *fn = MALLOC (strlen (header_fn) + 1);
- strcpy (fn, header_fn);
- return fn;
- }
-
- // case absolute path and it doesn't exist
- if (header_fn[0] == '/' || header_fn[0] == '\\') return NULL;
-
- rom slash = strrchr (z_name, '/');
- if (!slash && flag.is_windows) slash = strrchr (z_name, '\\');
- if (!slash) return NULL; // chain file is in the current dir
-
- unsigned dirname_len = slash - z_name + 1; // including slash
- int fn_size = strlen (header_fn) + dirname_len + 1;
- char *fn = MALLOC (fn_size);
- snprintf (fn, fn_size, "%.*s%s", dirname_len, z_name, header_fn);
-
- if (file_exists (fn))
- return fn;
- else {
- FREE (fn);
- return NULL;
- }
-}
-
-static void zfile_read_genozip_header_set_reference (ConstSectionHeaderGenozipHeaderP header, rom ref_filename)
-{
- WARN ("Note: using the reference file %s. You can override this with --reference or $GENOZIP_REFERENCE", ref_filename);
- ref_set_reference (gref, ref_filename, REF_EXTERNAL, false);
-}
-
-// reference data when NOT reading a reference file
-static void zfile_read_genozip_header_handle_ref_info (ConstSectionHeaderGenozipHeaderP header)
-{
- ASSERT0 (!flag.reading_reference, "we should not be here");
-
- if (digest_is_zero (header->ref_genome_digest)) return; // no reference info in header - we're done
-
- z_file->ref_genome_digest = header->ref_genome_digest;
- memcpy (z_file->ref_filename_used_in_zip, header->ref_filename, REF_FILENAME_LEN);
-
- if (flag.show_reference) {
- if (flag.force)
- iprintf ("%s", header->ref_filename);
- else
- iprintf ("%s was compressed using the reference file:\nName: %s\nMD5: %s\n",
- z_name, header->ref_filename, digest_display (header->ref_genome_digest).s);
- if (is_genocat) exit_ok; // in genocat --show-reference, we only show the reference, not the data
- }
-
- if (!is_genols) { // note: we don't need the reference for genols
-
- rom gref_fn = ref_get_filename (gref);
-
- rom env = getenv ("GENOZIP_REFERENCE");
- int env_len = env ? strlen (env) : 0;
-
- if (env_len > 1 && (env[env_len-1] == '/' || env[env_len-1] == '\\'))
- env_len--; // remove trailing /
-
- // case: this file requires an external reference, but command line doesn't include --reference - attempt to use the
- // reference specified in the header.
- // Note: this code will be executed when zfile_read_genozip_header is called from main_genounzip.
- if (!flag.explicit_ref && !env && // reference NOT was specified on command line
- !Z_DT(REF) && // for reference files, this field is actual fasta_filename
- !(gref_fn && !strcmp (gref_fn, header->ref_filename))) { // ref_filename already set from a previous file with the same reference
-
- rom ref_filename = zfile_read_genozip_header_get_ref_filename (header->ref_filename);
-
- if (!flag.dont_load_ref_file && ref_filename && file_exists (ref_filename))
- zfile_read_genozip_header_set_reference (header, ref_filename);
- else
- ASSINP (flag.dont_load_ref_file, "Please use --reference to specify the path to the reference file. Original path was: %.*s",
- REF_FILENAME_LEN, header->ref_filename);
-
- FREE (ref_filename);
- }
-
- // case: reference directory provided in GENOZIP_REFERENCE
- else if (!flag.explicit_ref && !Z_DT(REF) && !flag.dont_load_ref_file &&
- env && file_is_dir (env)) {
-
- bool exists = false;
-
- if (header->ref_filename[0]) {
- // get basename of filename in header
- rom ref_basename = strrchr (header->ref_filename, '/');
- if (!ref_basename) ref_basename = strrchr (header->ref_filename, '\\');
- ref_basename = ref_basename ? (ref_basename + 1) : header->ref_filename;
-
- int new_filename_size = strlen (ref_basename) + env_len + 2;
- char new_filename[new_filename_size];
-
- snprintf (new_filename, new_filename_size, "%.*s/%s", STRf(env), ref_basename);
- exists = file_exists (new_filename);
-
- // case: use reference file in directory GENOZIP_REFERENCE and basename from header
- if (exists &&
- !(gref_fn && !strcmp (gref_fn, new_filename))) // reference not already loaded
- zfile_read_genozip_header_set_reference (header, new_filename);
- }
-
- // if reference not found in directory GENOZIP_REFERENCE, use full filename from header
- if (!exists) {
- rom ref_filename = zfile_read_genozip_header_get_ref_filename (header->ref_filename);
-
- if (!(ref_filename && gref_fn && !strcmp (gref_fn, ref_filename))) {
- if (ref_filename)
- zfile_read_genozip_header_set_reference (header, ref_filename);
- else
- ABORTINP ("Please use --reference to specify the path to the reference file. Original path was: %.*s",
- REF_FILENAME_LEN, header->ref_filename);
- }
- FREE (ref_filename);
- }
- }
- }
-}
-
-static uint64_t zfile_read_genozip_header_get_actual_offset (void)
-{
- uint32_t size = MIN_(z_file->disk_size, 16 MB);
- file_seek (z_file, z_file->disk_size - size, SEEK_SET, READ, HARD_FAIL);
-
- ASSERTNOTINUSE (evb->scratch);
- buf_alloc_exact_zero (evb, evb->scratch, size + 100, char, "scratch");
- evb->scratch.len -= 100; // extra allocated memory to ease the scan loop
-
- int ret = fread (evb->scratch.data, size, 1, Z_READ_FP(z_file));
- ASSERT (ret == 1, "Failed to read %u bytes from the end of %s", size, z_name);
-
- for_buf_back (uint8_t, p, evb->scratch)
- if (BGEN32(GET_UINT32(p)) == GENOZIP_MAGIC && ((SectionHeaderP)p)->section_type == SEC_GENOZIP_HEADER)
- return BNUM (evb->scratch, p) + (z_file->disk_size - size);
-
- ABORT ("Cannot locate the SEC_GENOZIP_HEADER in the final %u bytes of %s", size, z_name);
-}
-
-// gets offset to the beginning of the GENOZIP_HEADER section, and sets z_file->genozip_version
-uint64_t zfile_read_genozip_header_get_offset (bool as_is)
-{
- // read the footer from the end of the file
- if (z_file->disk_size < sizeof(SectionFooterGenozipHeader) ||
- !z_file->file ||
- !file_seek (z_file, -sizeof(SectionFooterGenozipHeader), SEEK_END, READ, SOFT_FAIL))
- return 0; // failed
-
- TEMP_FLAG(quiet, false);
-
- SectionFooterGenozipHeader footer;
- int ret = fread (&footer, sizeof (footer), 1, Z_READ_FP(z_file));
- ASSERTW (ret == 1, "Skipping empty file %s", z_name);
- if (!ret) return 0; // failed
-
- // case: there is no genozip header. this can happen if the file was truncated (eg because compression did not complete)
- RETURNW (BGEN32 (footer.magic) == GENOZIP_MAGIC, 0, "Error in %s: the file appears to be incomplete (it is missing the Footer).", z_name);
-
- uint64_t offset = flag.recover ? zfile_read_genozip_header_get_actual_offset() // get correct offset in case of corruption
- : BGEN64 (footer.genozip_header_offset);
-
- if (as_is) return offset;
-
- // read genozip_version directly, needed to determine the section header size
- RETURNW (file_seek (z_file, offset, SEEK_SET, READ, WARNING_FAIL), 0,
- "Error in %s: corrupt offset=%"PRIu64" in Footer (file_size=%"PRIu64")",
- z_name, offset, z_file->disk_size);
-
- SectionHeaderGenozipHeader top = {};
- RETURNW (fread (&top, 1, MIN_(sizeof (SectionHeaderGenozipHeader), z_file->disk_size - offset/*header was shorter in earlier verions*/),
- Z_READ_FP(z_file)), 0, "Error in %s: failed to read genozip header", z_name);
-
- RETURNW (BGEN32 (top.magic) == GENOZIP_MAGIC, 0, "Error in %s: offset=%"PRIu64" of the GENOZIP_HEADER section as it appears in the Footer appears to be wrong, or the GENOZIP_HEADER section has bad magic (file_size=%"PRIu64").%s",
- z_name, offset, z_file->disk_size, flag.debug_or_test ? " Try again with --recover." : "");
-
- RESTORE_FLAG(quiet);
-
- z_file->genozip_version = top.genozip_version;
- z_file->genozip_minor_ver = top.genozip_minor_ver; // 0 before 15.0.28
-
- z_file->data_type = BGEN16 (top.data_type);
- if (Z_DT(BCF)) { z_file->data_type = DT_VCF; z_file->source_codec = CODEC_BCF; } // Z_DT is always VCF, not BCF
- else if (Z_DT(CRAM)) { z_file->data_type = DT_SAM; z_file->source_codec = CODEC_CRAM; } // Z_DT is always SAM, not CRAM or BAM
-
- // check that file version is at most this executable version, except for reference file for which only major version is tested
- ASSINP (z_file->genozip_version < code_version_major() ||
- (z_file->genozip_version == code_version_major() && (z_file->genozip_minor_ver <= code_version_minor() || Z_DT(REF) || (is_genocat && flag.show_stats))),
- "Error: %s cannot be opened because it was compressed with genozip version %u.0.%u which is newer than the version running - %s.\n%s",
- z_name, z_file->genozip_version, z_file->genozip_minor_ver, GENOZIP_CODE_VERSION, genozip_update_msg());
-
- bool metadata_only = is_genocat && (flag.show_stats || flag.show_gheader || flag.show_headers || flag.show_aliases || flag.show_dict);
-
- #define MSG "Error: %s was compressed with version %u of genozip. It may be uncompressed with genozip versions %u to %u"
-
- // in version 6, we canceled backward compatability with v1-v5
- ASSINP (VER(6), MSG, z_name, z_file->genozip_version, z_file->genozip_version, 5);
-
- // in version 7, we canceled backward compatability with v6
- ASSINP (VER(7), MSG, z_name, z_file->genozip_version, 6, 6);
-
- // in version 8, we canceled backward compatability with v7
- ASSINP (VER(8), MSG, z_name, z_file->genozip_version, 7, 7);
-
- // in version 15, we canceled backward compatability with v8,9,10 (except reference files which continue to be supported back to v8, as they might be needed to decompress files of later versions)
- ASSINP (metadata_only || VER(11) || Z_DT(REF), MSG, z_name, z_file->genozip_version, z_file->genozip_version, 14);
-
- #undef MSG
- return offset;
-}
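
The function above locates the GENOZIP_HEADER section via a fixed-size footer written at the very end of the file: a magic plus the big-endian byte offset of the header section, so the lookup costs one seek to EOF minus sizeof(footer) and one seek to the recorded offset. A toy version of that end-of-file lookup (the 12-byte footer layout and helper below are assumptions for illustration, not Genozip's actual on-disk format):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    // Toy footer reader: last 12 bytes = 4-byte magic + 8-byte big-endian offset.
    static int64_t toy_header_offset (FILE *fp, const uint8_t magic[4])
    {
        uint8_t footer[12];
        if (fseek (fp, -12L, SEEK_END) || fread (footer, 1, 12, fp) != 12)
            return -1;                            // file too short or truncated

        if (memcmp (footer, magic, 4))
            return -1;                            // no footer: file looks incomplete

        int64_t off = 0;
        for (int i = 0; i < 8; i++)               // big-endian to host order
            off = (off << 8) | footer[4 + i];

        return off;                               // caller then seeks to this offset
    }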
-
-// returns false if file should be skipped
-bool zfile_read_genozip_header (SectionHeaderGenozipHeaderP out_header, FailType fail_type) // optional outs
-{
- ASSERTNOTNULL (z_file);
-
- if (z_file->section_list_buf.len) return true; // header already read
-
- SectionEnt sec = { .st = SEC_GENOZIP_HEADER,
- .offset = zfile_read_genozip_header_get_offset (false) };
-
- if (!sec.offset) {
- fail_type = HARD_FAIL;
- goto error;
- }
-
- zfile_read_section (z_file, evb, 0, &evb->z_data, "z_data", SEC_GENOZIP_HEADER, &sec);
-
- SectionHeaderGenozipHeaderP header = (SectionHeaderGenozipHeaderP)evb->z_data.data;
- if (out_header) *out_header = *header;
-
- DataType data_type = (DataType)(BGEN16 (header->data_type));
-
- // Note: BCF/CRAM files have DT_BCF/DT_CRAM in the GenozipHeader, but in the PIZ code we
- // expect data_type=VCF/SAM with z_file->source_codec set to CODEC_BCF/CODEC_CRAM.
- if (data_type == DT_BCF) data_type = DT_VCF;
- else if (data_type == DT_CRAM) data_type = DT_SAM;
-
- ASSERT ((unsigned)data_type < NUM_DATATYPES, "unrecognized data_type=%d. %s", data_type, genozip_update_msg());
-
- // case: we couldn't figure out z_file->data_type from the .genozip filename - set based on the data_type in the GenozipHeader
- if (Z_DT(NONE) || Z_DT(GNRIC)) {
- z_file->data_type = data_type;
- z_file->type = file_get_default_z_ft_of_data_type (data_type);
- }
-
- // case: we set z_file->data_type based on the .genozip filename - verify that it is correct
- else
- ASSINP (z_file->data_type == data_type, "%s - file extension indicates this is a %s file, but according to its contents it is a %s",
- z_name, z_dt_name(), dt_name (data_type));
-
- ASSINP (header->encryption_type != ENC_NONE || !has_password() || Z_DT(REF),
- "password provided, but file %s is not encrypted", z_name);
-
- ASSERT (VER(15) || BGEN32 (header->v14_compressed_offset) == st_header_size (SEC_GENOZIP_HEADER),
- "invalid genozip header of %s - expecting compressed_offset to be %u in genozip_version=%u but found %u",
- z_name, st_header_size (SEC_GENOZIP_HEADER), header->genozip_version, BGEN32 (header->v14_compressed_offset));
-
- // get & test password, if file is encrypted
- if (header->encryption_type != ENC_NONE) {
-
- if (!has_password()) crypt_prompt_for_password();
-
- crypt_do (evb, header->password_test, sizeof(header->password_test), 0, SEC_NONE, true); // decrypt password test
-
- ASSINP (!memcmp (header->password_test, PASSWORD_TEST, sizeof(header->password_test)),
- "password is wrong for file %s", z_name);
- }
-
- z_file->num_txt_files = VER(14) ? header->num_txt_files : BGEN32 (header->v13_num_components);
- if (z_file->num_txt_files < 2) flag.unbind = 0; // override user's prefix if file has only 1 component (bug 326)
-
- int dts = z_file->z_flags.dt_specific; // save in case its set already (eg dts_paired is set in sections_is_paired)
- z_file->z_flags = header->flags.genozip_header;
-
- if (IS_SRC_BCF) z_file->z_flags.txt_is_bin = true; // in files 15.0.58 or older this was not set
-
- z_file->z_flags.dt_specific |= dts;
- z_file->num_lines = BGEN64 (header->num_lines_bound);
- z_file->txt_data_so_far_bind = BGEN64 (header->recon_size);
-
- if (VER(14) && !flag.reading_reference)
- segconf.vb_size = (uint64_t)BGEN16 (header->vb_size) MB;
-
- if (VER(15) && !flag.reading_reference)
- segconf.zip_txt_modified = header->is_modified; // since 15.0.60
-
- if (flag.show_data_type) {
- iprintf ("%s\n", z_dt_name());
- exit_ok;
- }
-
- DT_FUNC (z_file, piz_genozip_header)(header); // data-type specific processing of the Genozip Header
-
- bool has_section_list = true;
- if (!z_file->section_list_buf.param) { // not already initialized in a previous call to this function
-
- has_section_list = license_piz_prepare_genozip_header (header, IS_LIST || (IS_SHOW_HEADERS && flag.force));
-
- if (has_section_list) {
- zfile_uncompress_section (evb, header, &z_file->section_list_buf, "z_file->section_list_buf", 0, SEC_GENOZIP_HEADER);
-
- sections_list_file_to_memory_format (header);
- }
-
- if (flag.show_gheader==1) {
- DO_ONCE sections_show_gheader (header);
- if (is_genocat) exit_ok; // in genocat, exit after showing the requested data
- }
-
- z_file->section_list_buf.param = 1;
- }
-
- if (!VER(15))
- z_file->z_flags.has_digest = zfile_get_has_digest_up_to_v14 (header); // overwrites v14_bgzf that is no longer used for PIZ
-
- // case: we are reading a file expected to be the reference file itself
- if (flag.reading_reference) {
- ASSINP (data_type == DT_REF, "Error: %s is not a reference file. To create a reference file, use 'genozip --make-reference '",
- ref_get_filename(gref));
-
- // note: in the reference file itself, header->ref_filename is the original fasta used to create this reference
- ref_set_ref_file_info (flag.reading_reference, header->genome_digest, header->flags.genozip_header.adler,
- header->fasta_filename, header->genozip_version);
-
- refhash_set_digest (header->refhash_digest);
-
- buf_free (evb->z_data);
- }
-
- // case: we are reading a file that is not expected to be a reference file
- else {
- // case: we are attempting to decompress a reference file - this is not supported
- ASSGOTO (data_type != DT_REF || (flag.genocat_no_reconstruct && is_genocat) || is_genols,
- "%s is a reference file - it cannot be decompressed - skipping it. Did you intend to use --reference?.", z_name);
-
- // handle reference file info
- flags_update_piz_no_ref_file();
-
- if (!flag.dont_load_ref_file && data_type != DT_REF)
- zfile_read_genozip_header_handle_ref_info (header);
-
- buf_free (evb->z_data); // free before ctx_piz_initialize_zctxs that might read aliases - header not valid after freeing
-
- // create all contexts for B250/LOCAL/DICT data in the z_file (or predefined) -
- // flags_update_piz_one_z_file and IS_SKIP functions may rely on Context.z_data_exists
- if (has_section_list)
- ctx_piz_initialize_zctxs();
- }
-
- return true;
-
-error:
- buf_free (evb->z_data);
- ASSERT (fail_type == SOFT_FAIL, "failed to read %s", z_name);
- return false;
-}
-
-// Update the first SEC_TXT_HEADER fragment of the current txt file.
-void zfile_update_txt_header_section_header (uint64_t offset_in_z_file)
-{
- // sanity check - we skip empty files, so data is expected
- ASSERT (txt_file->txt_data_so_far_single > 0, "Expecting txt_file->txt_data_so_far_single=%"PRId64" > 0", txt_file->txt_data_so_far_single);
-
- ASSERTNOTINUSE (evb->scratch);
- buf_alloc_exact_zero (evb, evb->scratch, sizeof (SectionHeaderTxtHeader) + AES_BLOCKLEN-1/*encryption padding*/, char, "scratch");
-
- SectionHeaderTxtHeaderP header = B1ST(SectionHeaderTxtHeader, evb->scratch);
- *header = z_file->txt_header_hdr;
-
- header->txt_data_size = BGEN64 (txt_file->txt_data_so_far_single);
- header->txt_num_lines = BGEN64 (txt_file->num_lines);
- header->max_lines_per_vb = BGEN32 (txt_file->max_lines_per_vb);
-
- // qname stuff
- for (QType q=0; q < NUM_QTYPES; q++)
- header->flav_prop[q] = segconf.flav_prop[q];
-
- if (flag.md5 && !segconf.zip_txt_modified && gencomp_comp_eligible_for_digest(NULL))
- header->digest = digest_snapshot (&z_file->digest_ctx, "file");
-
- if (flag.show_headers)
- sections_show_header ((SectionHeaderP)header, NULL, offset_in_z_file, 'W');
-
- evb->scratch.len = crypt_padded_len (sizeof (SectionHeaderTxtHeader));
-
- // encrypt if needed
- if (has_password()) {
- crypt_pad ((uint8_t *)header, evb->scratch.len, evb->scratch.len - sizeof (SectionHeaderTxtHeader));
- crypt_do (evb, (uint8_t *)header, evb->scratch.len, 1 /*was 0 up to 14.0.8*/, header->section_type, true);
- }
-
- zriter_write (&evb->scratch, NULL, offset_in_z_file, false); // note: cannot write in background with offset
-
- buf_free (evb->scratch);
-}
-
-// ZIP compute thread - called from zip_compress_one_vb()
-void zfile_compress_vb_header (VBlockP vb)
-{
-
- SectionHeaderVbHeader vb_header = {
- .magic = BGEN32 (GENOZIP_MAGIC),
- .section_type = SEC_VB_HEADER,
- .vblock_i = BGEN32 (vb->vblock_i),
- .codec = CODEC_NONE,
- .flags.vb_header = vb->flags,
- .recon_size = BGEN32 (vb->recon_size),
- .longest_line_len = BGEN32 (vb->longest_line_len),
- .longest_seq_len = BGEN32 (vb->longest_seq_len), // since v15 (non-0 for SAM, BAM, FASTQ)
- .digest = vb->digest,
- };
-
- DT_FUNC (vb, zip_set_vb_header_specific)(vb, &vb_header);
-
- // copy section header into z_data - to be eventually written to disk by the main thread. this section doesn't have data.
- comp_compress (vb, NULL, &vb->z_data, &vb_header, NULL, NO_CALLBACK, "SEC_VB_HEADER");
-}
-
-// ZIP only: called by the main thread in the sequential order of VBs: updating of the already compressed
-// variant data section (compressed by the compute thread in zfile_compress_vb_header) just before writing it to disk
-// note: this updates the z_data in memory (not on disk)
-void zfile_update_compressed_vb_header (VBlockP vb)
-{
- if (flag.biopsy) return; // we have no z_data in biopsy mode
-
- SectionHeaderVbHeaderP vb_header = (SectionHeaderVbHeaderP)vb->z_data.data;
- vb_header->z_data_bytes = BGEN32 (vb->z_data.len32);
-
- if (flag_is_show_vblocks (ZIP_TASK_NAME))
- iprintf ("UPDATE_VB_HEADER(id=%d) vb_i=%u comp_i=%u recon_size=%u genozip_size=%u n_lines=%u longest_line_len=%u\n",
- vb->id, vb->vblock_i, vb->comp_i,
- BGEN32 (vb_header->recon_size), BGEN32 (vb_header->z_data_bytes),
- vb->lines.len32, // just for debugging, not in VB header
- BGEN32 (vb_header->longest_line_len));
-
- // now we can finally encrypt the header - if needed
- if (has_password())
- crypt_do (vb, (uint8_t*)vb_header, ROUNDUP16(sizeof(SectionHeaderVbHeader)),
- BGEN32 (vb_header->vblock_i), vb_header->section_type, true);
-}
-
-// ZIP - main thread
-void zfile_output_processed_vb_ext (VBlockP vb, bool background)
-{
- ASSERTMAINTHREAD;
-
- zriter_write (&vb->z_data, &vb->section_list_buf, -1, background);
-
- if (vb->comp_i != COMP_NONE) z_file->disk_so_far_comp[vb->comp_i] += vb->z_data.len;
- vb->z_data.len = 0;
-
- ctx_update_stats (vb);
-
- if (flag.show_headers && buf_is_alloc (&vb->show_headers_buf))
- buf_print (&vb->show_headers_buf, false);
-}
-
-void zfile_output_processed_vb (VBlockP vb)
-{
- zfile_output_processed_vb_ext (vb, false);
-}
-
-// get file data type - by its name if possible, or if not, inspect the GenozipHeader
-DataType zfile_piz_get_file_dt (rom z_filename)
-{
- DataType dt = file_get_dt_by_z_filename (z_filename);
- FileP file = NULL;
-
- // case: we don't know yet what file type this is - we need to read the genozip header to determine
- if (dt == DT_NONE && z_filename) {
- if (!(file = file_open_z_read (z_filename)) || !file->file)
- goto done; // not a genozip file
-
- // read the footer from the end of the file
- if (!file_seek (file, -sizeof(SectionFooterGenozipHeader), SEEK_END, READ, WARNING_FAIL))
- goto done;
-
- SectionFooterGenozipHeader footer;
- int ret = fread (&footer, sizeof (footer), 1, Z_READ_FP(file));
- ASSERTW (ret == 1, "Skipping empty file %s", z_name);
- if (!ret) goto done; // empty file / cannot read
-
- // case: this is not a valid genozip v2+ file
- if (BGEN32 (footer.magic) != GENOZIP_MAGIC) goto done;
-
- // read genozip header
- uint64_t genozip_header_offset = BGEN64 (footer.genozip_header_offset);
- if (!file_seek (file, genozip_header_offset, SEEK_SET, READ, WARNING_FAIL))
- goto done;
-
- SectionHeaderGenozipHeader header;
- int bytes = fread ((char*)&header, 1, sizeof(SectionHeaderGenozipHeader), Z_READ_FP(file));
- if (bytes < sizeof(SectionHeaderGenozipHeader)) goto done;
-
- ASSERTW (BGEN32 (header.magic) == GENOZIP_MAGIC, "Error reading %s: corrupt data", z_name);
- if (BGEN32 (header.magic) != GENOZIP_MAGIC) goto done;
-
- dt = (DataType)BGEN16 (header.data_type);
- }
-
-done:
- file_close (&file);
- return dt;
-}
-
+// ------------------------------------------------------------------
+// zfile.c
+// Copyright (C) 2019-2024 Genozip Limited. Patent Pending.
+// Please see terms and conditions in the file LICENSE.txt
+//
+// WARNING: Genozip is proprietary, not open source software. Modifying the source code is strictly prohibited,
+// under penalties specified in the license.
+
+#include
+#include
+#include
+#include
+#include
+#include "vblock.h"
+#include "zfile.h"
+#include "crypt.h"
+#include "context.h"
+#include "compressor.h"
+#include "piz.h"
+#include "zip.h"
+#include "license.h"
+#include "gencomp.h"
+#include "threads.h"
+#include "refhash.h"
+#include "seg.h"
+#include "dispatcher.h"
+#include "zriter.h"
+#include "b250.h"
+#include "libdeflate_1.19/libdeflate.h"
+
+static void zfile_show_b250_section (SectionHeaderUnionP header_p, ConstBufferP b250_data)
+{
+    static Mutex show_b250_mutex = {}; // protect so that compute threads' outputs don't get mixed
+
+ SectionHeaderCtxP header = header_p.ctx;
+
+ if (!flag.show_b250 && dict_id_typeless (header->dict_id).num != flag.dict_id_show_one_b250.num) return;
+
+ mutex_initialize (show_b250_mutex); // possible unlikely race condition on initializing - good enough for debugging purposes
+ mutex_lock (show_b250_mutex);
+
+ iprintf ("vb_i=%u %*.*s: ", BGEN32 (header->vblock_i), -DICT_ID_LEN-1, DICT_ID_LEN, dict_id_typeless (header->dict_id).id);
+
+ bytes data = B1ST (const uint8_t, *b250_data);
+ bytes after = BAFT (const uint8_t, *b250_data);
+
+ while (data < after) {
+ WordIndex word_index = b250_piz_decode (&data, true, header->b250_size, "zfile_show_b250_section");
+ switch (word_index) {
+ case WORD_INDEX_ONE_UP : iprint0 ("ONE_UP " ) ; break ;
+ case WORD_INDEX_EMPTY : iprint0 ("EMPTY " ) ; break ;
+ case WORD_INDEX_MISSING : iprint0 ("MISSING ") ; break ;
+ default : iprintf ("%u ", word_index);
+ }
+ }
+ iprint0 ("\n");
+
+ mutex_unlock (show_b250_mutex);
+}
+
+// Write the uncompressed, unencrypted section to dump files named <section-type>.<vb_i>.<dict-id>.header and .body
+// Note: header includes encryption padding if it was encrypted
+static void zfile_dump_section (BufferP uncompressed_data, SectionHeaderP header, unsigned section_len, DictId dict_id)
+{
+ char filename[100];
+ VBIType vb_i = BGEN32 (header->vblock_i);
+
+ // header
+ snprintf (filename, sizeof(filename), "%s.%u.%s.header", st_name (header->section_type), vb_i, dis_dict_id (dict_id).s);
+ file_put_data (filename, header, section_len, 0);
+
+ // body
+ if (uncompressed_data->len) {
+ snprintf (filename, sizeof(filename),"%s.%u.%s.body", st_name (header->section_type), vb_i, dis_dict_id (dict_id).s);
+ buf_dump_to_file (filename, uncompressed_data, 1, false, false, true, false);
+ }
+}
+
+// uncompresses a section's body into uncompressed_data (unless the section was skipped).
+// when we get here, the section header has already been decrypted
+void zfile_uncompress_section (VBlockP vb,
+ SectionHeaderUnionP header_p,
+ BufferP uncompressed_data,
+ rom uncompressed_data_buf_name, // a name if Buffer, NULL ok if buffer need not be realloced
+ uint32_t expected_vb_i,
+ SectionType expected_section_type)
+{
+ START_TIMER;
+ ASSERTNOTNULL (header_p.common);
+
+ DictId dict_id = DICT_ID_NONE;
+ uint8_t codec_param = 0;
+
+ if (expected_section_type == SEC_DICT)
+ dict_id = header_p.dict->dict_id;
+ else if (expected_section_type == SEC_B250 || expected_section_type == SEC_LOCAL) {
+ dict_id = header_p.ctx->dict_id;
+ codec_param = header_p.ctx->param;
+ }
+ else if (expected_section_type == SEC_COUNTS)
+ dict_id = header_p.counts->dict_id;
+ else if (expected_section_type == SEC_SUBDICTS)
+ dict_id = header_p.subdicts->dict_id;
+
+ ContextP ctx = NULL;
+ if (IS_DICTED_SEC (expected_section_type)) {
+ ctx = ECTX(dict_id);
+ if (ctx && !ctx->is_loaded && IS_PIZ) // note: never skip in ZIP (when an R2 VB uncompressed R1 sections)
+ return; // section was skipped
+ }
+ else
+ if (piz_is_skip_undicted_section (expected_section_type)) return; // undicted section was skipped
+
+ SectionHeaderP header = header_p.common;
+ uint32_t data_encrypted_len = BGEN32 (header->data_encrypted_len);
+ uint32_t data_compressed_len = BGEN32 (header->data_compressed_len);
+ uint32_t data_uncompressed_len = BGEN32 (header->data_uncompressed_len);
+ uint32_t expected_z_digest = BGEN32 (header->z_digest);
+ VBIType vblock_i = BGEN32 (header->vblock_i);
+
+ // sanity checks
+ ASSERT (header->section_type == expected_section_type, "expecting section type %s but seeing %s", st_name(expected_section_type), st_name(header->section_type));
+
+    ASSERT (vblock_i == expected_vb_i || !expected_vb_i, // dictionaries are uncompressed by the main thread with pseudo_vb (vb_i=0)
+ "bad vblock_i: header->vblock_i=%u but expecting it to be %u (section_type=%s dict_id=%s)",
+ vblock_i, expected_vb_i, st_name (expected_section_type), dis_dict_id(dict_id).s);
+
+ if (flag.show_uncompress)
+ iprintf ("Uncompress: %s %-9s %-8s comp_len=%-7u uncomp_len=%u\n", VB_NAME,
+ st_name (expected_section_type), dict_id.num ? dis_dict_id (dict_id).s : "", data_compressed_len, data_uncompressed_len);
+
+ uint32_t compressed_offset = st_header_size (header->section_type);
+ if (data_encrypted_len) compressed_offset = ROUNDUP16 (compressed_offset);
+
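+    // since v15, the section header carries an Adler32 of the on-disk payload (compressed or encrypted), so corruption is detected before attempting decryption/decompression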
+ uint32_t actual_z_digest = adler32 (1, (uint8_t*)header + compressed_offset, MAX_(data_compressed_len, data_encrypted_len));
+
+ if (VER(15) && expected_z_digest != actual_z_digest) {
+ sections_show_header (header_p.common, vb, 0, 'E');
+ ABORT ("%s:%s: Section %s data failed digest verification: expected_z_digest=%u != actual_z_digest=%u",
+ z_name, VB_NAME, st_name(header->section_type), expected_z_digest, actual_z_digest);
+ }
+
+ // decrypt data (in-place) if needed
+ if (data_encrypted_len)
+ crypt_do (vb, (uint8_t*)header + compressed_offset, data_encrypted_len, vblock_i, header->section_type, false);
+
+ bool bad_compression = false;
+
+ if (data_uncompressed_len > 0) { // FORMAT, for example, can be missing in a sample-less file
+
+ if (uncompressed_data_buf_name) {
+ buf_alloc (vb, uncompressed_data, 0, data_uncompressed_len + sizeof (uint64_t), char, 1.1, uncompressed_data_buf_name); // add a 64b word for safety in case this buffer will be converted to a bits later
+ uncompressed_data->len = data_uncompressed_len;
+ }
+
+ comp_uncompress (vb, ctx, header->codec,
+ header->section_type == SEC_LOCAL ? header->sub_codec : 0,
+ codec_param,
+ (char*)header + compressed_offset, data_compressed_len,
+ uncompressed_data, data_uncompressed_len,
+ dict_id.num ? dis_dict_id(dict_id).s : st_name(expected_section_type));
+
+        //--verify-codec: verify that the adler32 of the decompressed data equals that of the original uncompressed data
+ if (flag.verify_codec && uncompressed_data && data_uncompressed_len &&
+ BGEN32 (header->magic) != GENOZIP_MAGIC &&
+ header->uncomp_adler32 != adler32 (1, uncompressed_data->data, data_uncompressed_len)) {
+
+ iprintf ("--verify-codec: BAD ADLER32 section decompressed incorrectly: codec=%s\n", codec_name(header->codec));
+ sections_show_header (header, NULL, 0, 'R');
+ bad_compression = true;
+ }
+ }
+
+ if (flag.show_b250 && expected_section_type == SEC_B250)
+ zfile_show_b250_section (header_p, uncompressed_data);
+
+ if ((flag.dump_section && !strcmp (st_name (expected_section_type), flag.dump_section)) || bad_compression) {
+ uint64_t save_len = uncompressed_data->len;
+ uncompressed_data->len = data_uncompressed_len; // might be different, eg in the case of ref_hash
+ zfile_dump_section (uncompressed_data, header, compressed_offset, dict_id);
+ uncompressed_data->len = save_len; // restore
+ }
+
+ if (vb) COPY_TIMER (zfile_uncompress_section);
+}
+
+// uncompress into a specific offset in a pre-allocated buffer
+void zfile_uncompress_section_into_buf (VBlockP vb, SectionHeaderUnionP header_p, uint32_t expected_vb_i, SectionType expected_section_type,
+ BufferP dst_buf,
+ char *dst) // pointer into dst_buf.data
+{
+ if (!header_p.common->data_uncompressed_len) return;
+
+ ASSERT (dst >= B1STc(*dst_buf) && dst <= BLSTc(*dst_buf), "expecting dst=%p to be within dst_buf=%s", dst, buf_desc(dst_buf).s);
+
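+    // shallow-copy dst_buf, but point the copy's data at dst, so the section is decompressed directly into the pre-allocated region within dst_buf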
+ Buffer copy = *dst_buf;
+ copy.data = dst; // somewhat of a hack
+    zfile_uncompress_section (vb, header_p, &copy, NULL, expected_vb_i, expected_section_type); // NULL name prevents buf_alloc
+}
+
+uint32_t zfile_compress_b250_data (VBlockP vb, ContextP ctx)
+{
+ struct FlagsCtx flags = ctx->flags; // make a copy
+
+ if (VB_DT(FASTQ))
+ flags.paired = (flag.pair == PAIR_R1 && fastq_zip_use_pair_identical (ctx->dict_id)) || // "paired" flag in R1 means: "In R2, reconstruct R1 data IFF R2 data is absent" (v15)
+ (flag.pair == PAIR_R2 && fastq_zip_use_pair_assisted (ctx->dict_id, SEC_B250)); // "paired" flag in R2 means: "Reconstruction of R2 requires R2 data as well as R1 data"
+
+ SectionHeaderCtx header = (SectionHeaderCtx) {
+ .magic = BGEN32 (GENOZIP_MAGIC),
+ .section_type = SEC_B250,
+ .data_uncompressed_len = BGEN32 (ctx->b250.len32),
+ .codec = ctx->bcodec == CODEC_UNKNOWN ? CODEC_RANS8 : ctx->bcodec,
+ .vblock_i = BGEN32 (vb->vblock_i),
+ .flags.ctx = flags,
+ .dict_id = ctx->dict_id,
+ .b250_size = ctx->b250_size,
+ };
+
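+    // record where this context's B250 section starts within vb->z_data, so it can be located later if it needs to be removed (see zfile_remove_ctx_group_from_z_data)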
+ ctx->b250_in_z = vb->z_data.len32;
+
+ uint32_t compressed_size = comp_compress (vb, ctx, &vb->z_data, &header, ctx->b250.data, NO_CALLBACK, ctx->tag_name);
+
+ ctx->b250_in_z_len = vb->z_data.len32 - ctx->b250_in_z;
+
+ ctx_zip_z_data_exist (ctx);
+
+ return compressed_size;
+}
+
+// returns compressed size
+uint32_t zfile_compress_local_data (VBlockP vb, ContextP ctx, uint32_t sample_size /* 0 means entire local buffer */)
+{
+ struct FlagsCtx flags = ctx->flags; // make a copy
+
+ if (VB_DT(FASTQ))
+ flags.paired = (flag.pair == PAIR_R1 && fastq_zip_use_pair_identical (ctx->dict_id)) || // "paired" flag in R1 means: "Load R1 data in R2, if R2 data is absent" (v15)
+ (flag.pair == PAIR_R2 && fastq_zip_use_pair_assisted (ctx->dict_id, SEC_LOCAL)); // "paired" flag in R2 means: "Reconstruction of R2 requires R2 data as well as R1 data"
+
+ uint32_t uncompressed_len = ctx->local.len32 * lt_width(ctx);
+
+ // case: we're just testing a small sample
+ if (sample_size && uncompressed_len > sample_size)
+ uncompressed_len = sample_size;
+
+ SectionHeaderCtx header = (SectionHeaderCtx) {
+ .magic = BGEN32 (GENOZIP_MAGIC),
+ .section_type = SEC_LOCAL,
+ .data_uncompressed_len = BGEN32 (uncompressed_len),
+ .codec = ctx->lcodec == CODEC_UNKNOWN ? CODEC_RANS8 : ctx->lcodec, // if codec has not been decided yet, fall back on RANS8
+ .sub_codec = ctx->lsubcodec_piz ? ctx->lsubcodec_piz : CODEC_UNKNOWN,
+ .vblock_i = BGEN32 (vb->vblock_i),
+ .flags.ctx = flags,
+ .dict_id = ctx->dict_id,
+ .ltype = ctx->ltype,
+ .param = ctx->local_param ? ctx->local.prm8[0] : 0,
+ };
+
+ if (lt_max(ctx->ltype)) // integer ltype
+        header.nothing_char = ctx->nothing_char ? ctx->nothing_char : 0xff; // note: nothing_char=0 is transmitted as 0xff in SectionHeaderCtx, because 0 means "logic up to version 15.0.37"
+
+ LocalGetLineCB *callback = zip_get_local_data_callback (vb->data_type, ctx);
+
+ ctx->local_in_z = vb->z_data.len32;
+
+ uint32_t compressed_size = comp_compress (vb, ctx, &vb->z_data, &header,
+ callback ? NULL : ctx->local.data, callback, ctx->tag_name);
+
+ ctx->local_in_z_len = vb->z_data.len32 - ctx->local_in_z;
+
+ ctx_zip_z_data_exist (ctx);
+
+ return compressed_size;
+}
+
+// compress section - two options for input data -
+// 1. contiguous data in section_data
+// 2. line by line data - by providing a callback + total_len
+void zfile_compress_section_data_ex (VBlockP vb,
+ ContextP ctx, // NULL if not context data
+ SectionType section_type,
+ BufferP section_data, // option 1 - compress contiguous data
+ LocalGetLineCB callback, uint32_t total_len, // option 2 - compress data one line at a time
+ Codec codec, SectionFlags flags,
+ rom name)
+{
+ ASSERT (st_header_size (section_type) == sizeof (SectionHeader), "cannot use this for section_type=%s", st_name (section_type));
+
+ SectionHeader header = {
+ .magic = BGEN32 (GENOZIP_MAGIC),
+ .section_type = section_type,
+ .data_uncompressed_len = BGEN32 (section_data ? section_data->len : total_len),
+ .codec = codec,
+ .vblock_i = BGEN32 (vb->vblock_i),
+ .flags = flags
+ };
+
+ if (flag.show_time) codec_show_time (vb, name ? name : st_name (section_type), NULL, codec);
+
+ comp_compress (vb, ctx,
+ // note: when called from codec_assign_best_codec we use z_data_test. this is because codec_assign_best_codec can be
+ // called from within complex codecs for their subcodecs, and if we had used z_data, comp_compress could realloc it as it
+                   // is being populated by the complex codec
+ in_assign_codec ? &vb->z_data_test : &vb->z_data,
+ &header,
+ section_data ? section_data->data : NULL,
+ callback, st_name (section_type));
+}
+
+typedef struct { uint64_t start, len; } RemovedSection;
+
+static DESCENDING_SORTER (sort_removed_sections, RemovedSection, start)
+
+// remove ctx and all other ctxs consolidated to it from z_data - akin to unscrambling an egg.
+void zfile_remove_ctx_group_from_z_data (VBlockP vb, Did remove_did_i)
+{
+ unsigned num_rms=0;
+ RemovedSection rm[vb->num_contexts * 2];
+
+ // remove all contexts in the group
+ CTX(remove_did_i)->st_did_i = remove_did_i; // so the loop catches it too
+ for_ctx_that (ctx->st_did_i == remove_did_i) {
+ if (ctx->b250_in_z_len)
+ rm[num_rms++] = (RemovedSection){.start = ctx->b250_in_z, .len = ctx->b250_in_z_len };
+
+ if (ctx->local_in_z_len)
+ rm[num_rms++] = (RemovedSection){.start = ctx->local_in_z, .len = ctx->local_in_z_len};
+
+ vb->recon_size -= ctx->txt_len; // it won't be reconstructed after all
+
+        ctx_update_zctx_txt_len (vb, ctx, -(int64_t)ctx->txt_len); // subtract txt_len added to zctx during merge
+
+ buflist_free_ctx (vb, ctx);
+ }
+
+ // update VB Header (always first in z_data) with reduced recon_size (re-encrypting it if encrypting)
+ uint64_t save = vb->z_data.len;
+ vb->z_data.len = 0;
+ zfile_compress_vb_header (vb);
+ vb->z_data.len = save;
+
+ // sort indices to the to-be-removed sections in reverse order
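+    // (removing from the highest offset downwards keeps the start offsets of the sections not yet removed valid)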
+ qsort (rm, num_rms, sizeof(RemovedSection), sort_removed_sections);
+
+ bool is_encrypted = has_password(); // we can't (easily) test magic if header is encrypted
+
+ for (unsigned i=0; i < num_rms; i++) {
+ ASSERT (is_encrypted || ((SectionHeader*)B8 (vb->z_data, rm[i].start))->magic == BGEN32(GENOZIP_MAGIC),
+ "Data to be cut out start=%"PRIu64" len=%"PRIu64" is not on section boundary", rm[i].start, rm[i].len);
+
+ buf_remove (vb->z_data, char, rm[i].start, rm[i].len);
+ sections_remove_from_list (vb, rm[i].start, rm[i].len);
+
+ ASSERT (is_encrypted || rm[i].start == vb->z_data.len || ((SectionHeader*)B8 (vb->z_data, rm[i].start))->magic == BGEN32(GENOZIP_MAGIC),
+ "Data cut out is not exactly one section start=%"PRIu64" len=%"PRIu64, rm[i].start, rm[i].len);
+ }
+}
+
+// reads exactly the length required, error otherwise.
+// return a pointer to the data read
+static void *zfile_read_from_disk (FileP file, VBlockP vb, BufferP buf, uint32_t len, SectionType st, DictId dict_id)
+{
+ START_TIMER;
+
+ ASSERT (len, "reading %s%s: len is 0", st_name (st), cond_str(dict_id.num, " dict_id=", dis_dict_id(dict_id).s));
+ ASSERT (buf_has_space (buf, len), "reading %s: buf is out of space: len=%u but remaining space in buffer=%u (tip: run with --show-headers to see where it fails)",
+ st_name (st), len, (uint32_t)(buf->size - buf->len));
+
+ char *start = BAFTc (*buf);
+ uint32_t bytes = fread (start, 1, len, Z_READ_FP(file));
+ ASSERT (bytes == len, "reading %s%s read only %u bytes out of len=%u: %s",
+ st_name (st), cond_str(dict_id.num, " dict_id=", dis_dict_id(dict_id).s), bytes, len, strerror(errno));
+
+ buf->len += bytes;
+
+ if (file->mode == READ) // mode==WRITE in case reading pair data in ZIP
+ file->disk_so_far += bytes; // consumed by dispatcher_increment_progress
+
+ COPY_TIMER (read);
+
+ return start;
+}
+
+
+// read section header - called from the main thread.
+// returns offset of header within data, or SECTION_SKIPPED if section is skipped
+int32_t zfile_read_section_do (FileP file,
+ VBlockP vb,
+ uint32_t original_vb_i, // the vblock_i used for compressing. this is part of the encryption key. dictionaries are compressed by the compute thread/vb, but uncompressed by the main thread (vb=0)
+ BufferP data, rom buf_name, // buffer to append
+ SectionType expected_sec_type,
+ Section sec, // NULL for no seeking
+ FUNCLINE)
+{
+ ASSERTMAINTHREAD;
+
+ ASSERT (!sec || expected_sec_type == sec->st, "called from %s:%u: expected_sec_type=%s but encountered sec->st=%s. vb_i=%u",
+ func, code_line, st_name (expected_sec_type), st_name(sec->st), vb->vblock_i);
+
+ // skip if this section is not needed according to flags
+ if (sec && file == z_file &&
+ piz_is_skip_section ((vb ? vb->data_type : z_file->data_type), sec->st, (vb ? vb->comp_i : COMP_NONE), (IS_DICTED_SEC (sec->st) ? sec->dict_id : DICT_ID_NONE),
+ sec->flags.flags,
+ (vb && vb->preprocessing) ? SKIP_PURPOSE_PREPROC : SKIP_PURPOSE_RECON))
+ return SECTION_SKIPPED;
+
+ uint32_t header_size = st_header_size (expected_sec_type);
+ uint32_t unencrypted_header_size = header_size;
+
+    // note: for an encrypted file, while reading the reference, we don't know until we read the header whether it
+    // will be a SEC_REF_IS_SET (encrypted) or a SEC_REFERENCE (not encrypted if originating from an external reference, encrypted if de-novo)
+ bool is_encrypted = !Z_DT(REF) &&
+ expected_sec_type != SEC_GENOZIP_HEADER &&
+ crypt_get_encrypted_len (&header_size, NULL); // update header size if encrypted
+
+ uint32_t header_offset = data->len;
+ buf_alloc (vb, data, 0, header_offset + header_size, uint8_t, 2, buf_name);
+ data->param = 1;
+
+ // move the cursor to the section. file_seek is smart not to cause any overhead if no moving is needed
+ if (sec) file_seek (file, sec->offset, SEEK_SET, READ, HARD_FAIL);
+
+    SectionHeaderP header = zfile_read_from_disk (file, vb, data, header_size, expected_sec_type, (sec && IS_DICTED_SEC(sec->st)) ? sec->dict_id : DICT_ID_NONE); // note: sec may be NULL (no seeking)
+ uint32_t bytes_read = header_size;
+
+ ASSERT (header, "called from %s:%u: Failed to read data from file %s while expecting section type %s: %s",
+ func, code_line, z_name, st_name(expected_sec_type), strerror (errno));
+
+ bool is_magical = BGEN32 (header->magic) == GENOZIP_MAGIC;
+
+    // SEC_REFERENCE is never encrypted when originating from a reference file; it is encrypted (if the file is encrypted) when it originates from REF_INTERNAL
+ if (is_encrypted && HEADER_IS(REFERENCE) && !header->data_encrypted_len) {
+ is_encrypted = false;
+ header_size = unencrypted_header_size;
+ }
+
+ // decrypt header (note: except for SEC_GENOZIP_HEADER - this header is never encrypted)
+ if (is_encrypted) {
+ ASSINP (BGEN32 (header->magic) != GENOZIP_MAGIC,
+ "password provided, but file %s is not encrypted (sec_type=%s)", z_name, st_name (header->section_type));
+
+ crypt_do (vb, (uint8_t*)header, header_size, original_vb_i, expected_sec_type, true);
+
+ is_magical = BGEN32 (header->magic) == GENOZIP_MAGIC; // update after decryption
+ }
+
+ if (flag.show_headers) {
+ sections_show_header (header, NULL, sec ? sec->offset : 0, 'R');
+ if (is_genocat && (IS_DICTED_SEC (expected_sec_type) || expected_sec_type == SEC_REFERENCE || expected_sec_type == SEC_REF_IS_SET))
+ return header_offset; // in genocat --show-header - we only show headers, nothing else
+ }
+
+ ASSERT (is_magical || flag.verify_codec, "called from %s:%u: corrupt data (magic is wrong) when attempting to read section=%s dict_id=%s of vblock_i=%u comp=%s in file %s",
+ func, code_line, st_name (expected_sec_type), sec ? dis_dict_id (sec->dict_id).s : "(no sec)", vb->vblock_i, comp_name(vb->comp_i), z_name);
+
+ uint32_t data_compressed_len = BGEN32 (header->data_compressed_len);
+ uint32_t data_encrypted_len = BGEN32 (header->data_encrypted_len);
+
+ uint32_t data_len = MAX_(data_compressed_len, data_encrypted_len);
+
+ // in case where we already read part of the body (eg if is_encrypted was initially set and then unset) (remaining_data_len might be negative)
+ int32_t remaining_data_len = (int32_t)data_len - (int32_t)(bytes_read - header_size);
+
+ // check that we received the section type we expect,
+ ASSERT (expected_sec_type == header->section_type,
+ "called from %s:%u: Unexpected section type when reading %s: expecting %s, found %s sec(expecting)=(offset=%s, dict_id=%s)",
+ func, code_line, z_name, st_name(expected_sec_type), st_name(header->section_type),
+ sec ? str_int_commas (sec->offset).s : "N/A", sec ? dis_dict_id (sec->dict_id).s : "N/A");
+
+ ASSERT (BGEN32 (header->vblock_i) == original_vb_i,
+ "Requested to read %s with vb_i=%u, but actual section has vb_i=%u",
+ st_name(expected_sec_type), original_vb_i, BGEN32 (header->vblock_i));
+
+ // up to v14, we had compressed_offset instead of z_digest. Since we have it, we might as well use it
+ // as an extra verification of the SectionHeader integrity
+ ASSERT (VER(15) || BGEN32 (header->v14_compressed_offset) == header_size,
+ "called from %s:%u: invalid header when reading %s - expecting compressed_offset to be %u but found %u. genozip_version=%u section_type=%s",
+ func, code_line, z_name, header_size, BGEN32 (header->v14_compressed_offset), z_file->genozip_version/*set from footer*/, st_name(header->section_type));
+
+ // allocate more memory for the rest of the header + data
+ buf_alloc (vb, data, 0, header_offset + header_size + data_len, uint8_t, 2, "zfile_read_section");
+ header = (SectionHeaderP)Bc(*data, header_offset); // update after realloc
+
+ data->param = 2;
+
+ // read section data
+ if (remaining_data_len > 0)
+ zfile_read_from_disk (file, vb, data, remaining_data_len, expected_sec_type, sections_get_dict_id (header));
+
+ return header_offset;
+}
+
+// Read one section header - returns the header in vb->scratch - caller needs to free vb->scratch
+SectionHeaderUnion zfile_read_section_header_do (VBlockP vb, Section sec,
+ SectionType expected_sec_type, // optional: if not SEC_NONE, also verifies section is of expected type
+ FUNCLINE)
+{
+ ASSERT (expected_sec_type == SEC_NONE || sec->st == expected_sec_type,
+ "called from %s:%u: expecting sec.st=%s to be %s", func, code_line, st_name (sec->st), st_name (expected_sec_type));
+
+ uint32_t header_size = st_header_size (sec->st);
+ uint32_t unencrypted_header_size = header_size;
+
+ file_seek (z_file, sec->offset, SEEK_SET, READ, HARD_FAIL);
+
+ bool is_encrypted = (z_file->data_type != DT_REF) &&
+ (sec->st != SEC_GENOZIP_HEADER) &&
+ crypt_get_encrypted_len (&header_size, NULL); // update header size if encrypted
+
+ SectionHeaderUnion header;
+ uint32_t bytes = fread (&header, 1, header_size, Z_READ_FP(z_file));
+
+ ASSERT (bytes == header_size, "called from %s:%u: Failed to read header of section type %s from file %s: %s (bytes=%u header_size=%u)",
+ func, code_line, st_name(sec->st), z_name, strerror (errno), bytes, header_size);
+
+ bool is_magical = BGEN32 (header.common.magic) == GENOZIP_MAGIC;
+
+    // SEC_REFERENCE is never encrypted in reference files, or if REF_EXT_STORE is used.
+ // It is encrypted (if the file is encrypted) if REF_INTERNAL is used.
+ if (is_encrypted && header.common.section_type == SEC_REFERENCE && !header.common.data_encrypted_len) {
+ is_encrypted = false;
+ header_size = unencrypted_header_size;
+ }
+
+ // decrypt header
+ if (is_encrypted) {
+ ASSERT (BGEN32 (header.common.magic) != GENOZIP_MAGIC,
+ "called from %s:%u: password provided, but file %s is not encrypted (sec_type=%s)", func, code_line, z_name, st_name (header.common.section_type));
+
+ crypt_do (vb, (uint8_t*)&header, header_size, sec->vblock_i, sec->st, true);
+
+ is_magical = BGEN32 (header.common.magic) == GENOZIP_MAGIC; // update after decryption
+ }
+
+ ASSERT (is_magical, "called from %s:%u: corrupt data (magic is wrong) when attempting to read header of section %s in file %s",
+ func, code_line, st_name (sec->st), z_name);
+
+ ASSERT (expected_sec_type == SEC_NONE ||
+ (BGEN32 (header.common.vblock_i) == sec->vblock_i && header.common.section_type == sec->st) ||
+ (!VER(14) && sec->st == SEC_REF_HASH), // in V<=13, REF_HASH didn't have a vb_i in the section list
+ "called from %s:%u: Requested to read %s with vb_i=%u, but actual section is %s with vb_i=%u",
+ func, code_line, st_name(sec->st), sec->vblock_i, st_name(header.common.section_type), BGEN32 (header.common.vblock_i));
+
+ return header;
+}
+
+// up to v14, we had no explicit "has_digest" flag - we calculate it here by searching for proof of digest.
+// since a digest might be 0 by chance, a single 0 is not proof of a missing digest; however, several 0s are strong enough evidence.
+static bool zfile_get_has_digest_up_to_v14 (SectionHeaderGenozipHeaderP header)
+{
+ // proof: a file was compressed with --md5 (zip verifies --md5 conflicts)
+ if (!header->flags.genozip_header.adler) return true;
+
+ // proof: a file is up to v13 with digest_bound
+ if (!VER(14) && !digest_is_zero (header->FASTQ_v13_digest_bound)) return true;
+
+ // search for a non-0 digest in the first 3 TXT/VB headers
+ Section sec = NULL;
+ for (int i=0 ; i < 3 && sections_next_sec2 (&sec, SEC_TXT_HEADER, SEC_VB_HEADER); i++) {
+ SectionHeaderUnion header = zfile_read_section_header (evb, sec, SEC_NONE);
+
+ // proof: a TXT_HEADER has a digest of the txt_header (0 if file has no header) or
+ // digest of the entire file.
+ if (sec->st == SEC_TXT_HEADER &&
+ (!digest_is_zero (header.txt_header.digest) || !digest_is_zero (header.txt_header.digest_header)))
+ return true;
+
+ // proof: a VB has a digest
+ if (sec->st == SEC_VB_HEADER && !digest_is_zero (header.vb_header.digest)) return true;
+ }
+
+ return false; // no proof of digest
+}
+
+bool zfile_advance_to_next_header (uint64_t *offset, uint64_t *gap)
+{
+ uint64_t start_offset = *offset;
+ file_seek (z_file, start_offset, SEEK_SET, READ, HARD_FAIL);
+
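+    // the first 4 bytes of data carry over the last 4 bytes of the previous chunk, so a magic number that straddles a chunk boundary is not missed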
+ char data[128 KB + 4];
+ while (1) {
+ memset (data, 0, sizeof(data));
+
+ uint32_t bytes;
+ if (!(bytes = fread (data+4, 1, 128 KB, Z_READ_FP(z_file))))
+ return false; // possibly 4 bytes of the Footer magic remaining
+
+ // note: we accept a magic in the final 4 bytes of data - this could be a Footer. We
+ // move those last 4 bytes to the next iteration
+ for (int i=0; i < bytes; i++)
+ if (BGEN32(GET_UINT32 (&data[i])) == GENOZIP_MAGIC) {
+ *offset += i - 4;
+ *gap = *offset - start_offset;
+ return true;
+ }
+
+ *offset += 128 KB;
+ memcpy (data, &data[128 KB], 4);
+ }
+}
+
+// check if reference filename exists in the absolute or relative path
+static rom zfile_read_genozip_header_get_ref_filename (rom header_fn)
+{
+ // if header_filename exists, use it
+ if (file_exists (header_fn)) {
+ char *fn = MALLOC (strlen (header_fn) + 1);
+ strcpy (fn, header_fn);
+ return fn;
+ }
+
+ // case absolute path and it doesn't exist
+ if (header_fn[0] == '/' || header_fn[0] == '\\') return NULL;
+
+ rom slash = strrchr (z_name, '/');
+ if (!slash && flag.is_windows) slash = strrchr (z_name, '\\');
+    if (!slash) return NULL; // z_file is in the current dir - no directory to prepend
+
+ unsigned dirname_len = slash - z_name + 1; // including slash
+ int fn_size = strlen (header_fn) + dirname_len + 1;
+ char *fn = MALLOC (fn_size);
+ snprintf (fn, fn_size, "%.*s%s", dirname_len, z_name, header_fn);
+
+ if (file_exists (fn))
+ return fn;
+ else {
+ FREE (fn);
+ return NULL;
+ }
+}
+
+static void zfile_read_genozip_header_set_reference (ConstSectionHeaderGenozipHeaderP header, rom ref_filename)
+{
+ WARN ("Note: using the reference file %s. You can override this with --reference or $GENOZIP_REFERENCE", ref_filename);
+ ref_set_reference (gref, ref_filename, REF_EXTERNAL, false);
+}
+
+// reference data when NOT reading a reference file
+static void zfile_read_genozip_header_handle_ref_info (ConstSectionHeaderGenozipHeaderP header)
+{
+ ASSERT0 (!flag.reading_reference, "we should not be here");
+
+ if (digest_is_zero (header->ref_genome_digest)) return; // no reference info in header - we're done
+
+ z_file->ref_genome_digest = header->ref_genome_digest;
+ memcpy (z_file->ref_filename_used_in_zip, header->ref_filename, REF_FILENAME_LEN);
+
+ if (flag.show_reference) {
+ if (flag.force)
+ iprintf ("%s", header->ref_filename);
+ else
+ iprintf ("%s was compressed using the reference file:\nName: %s\nMD5: %s\n",
+ z_name, header->ref_filename, digest_display (header->ref_genome_digest).s);
+ if (is_genocat) exit_ok; // in genocat --show-reference, we only show the reference, not the data
+ }
+
+ if (!is_genols) { // note: we don't need the reference for genols
+
+ rom gref_fn = ref_get_filename (gref);
+
+ rom env = getenv ("GENOZIP_REFERENCE");
+ int env_len = env ? strlen (env) : 0;
+
+ if (env_len > 1 && (env[env_len-1] == '/' || env[env_len-1] == '\\'))
+ env_len--; // remove trailing /
+
+ // case: this file requires an external reference, but command line doesn't include --reference - attempt to use the
+ // reference specified in the header.
+ // Note: this code will be executed when zfile_read_genozip_header is called from main_genounzip.
+        if (!flag.explicit_ref && !env &&                       // a reference was NOT specified on the command line
+            !Z_DT(REF) &&                                       // for reference files, this field is actually the fasta filename
+ !(gref_fn && !strcmp (gref_fn, header->ref_filename))) { // ref_filename already set from a previous file with the same reference
+
+ rom ref_filename = zfile_read_genozip_header_get_ref_filename (header->ref_filename);
+
+ if (!flag.dont_load_ref_file && ref_filename && file_exists (ref_filename))
+ zfile_read_genozip_header_set_reference (header, ref_filename);
+ else
+ ASSINP (flag.dont_load_ref_file, "Please use --reference to specify the path to the reference file. Original path was: %.*s",
+ REF_FILENAME_LEN, header->ref_filename);
+
+ FREE (ref_filename);
+ }
+
+ // case: reference directory provided in GENOZIP_REFERENCE
+ else if (!flag.explicit_ref && !Z_DT(REF) && !flag.dont_load_ref_file &&
+ env && file_is_dir (env)) {
+
+ bool exists = false;
+
+ if (header->ref_filename[0]) {
+ // get basename of filename in header
+ rom ref_basename = strrchr (header->ref_filename, '/');
+ if (!ref_basename) ref_basename = strrchr (header->ref_filename, '\\');
+ ref_basename = ref_basename ? (ref_basename + 1) : header->ref_filename;
+
+ int new_filename_size = strlen (ref_basename) + env_len + 2;
+ char new_filename[new_filename_size];
+
+ snprintf (new_filename, new_filename_size, "%.*s/%s", STRf(env), ref_basename);
+ exists = file_exists (new_filename);
+
+ // case: use reference file in directory GENOZIP_REFERENCE and basename from header
+ if (exists &&
+ !(gref_fn && !strcmp (gref_fn, new_filename))) // reference not already loaded
+ zfile_read_genozip_header_set_reference (header, new_filename);
+ }
+
+ // if reference not found in directory GENOZIP_REFERENCE, use full filename from header
+ if (!exists) {
+ rom ref_filename = zfile_read_genozip_header_get_ref_filename (header->ref_filename);
+
+ if (!(ref_filename && gref_fn && !strcmp (gref_fn, ref_filename))) {
+ if (ref_filename)
+ zfile_read_genozip_header_set_reference (header, ref_filename);
+ else
+ ABORTINP ("Please use --reference to specify the path to the reference file. Original path was: %.*s",
+ REF_FILENAME_LEN, header->ref_filename);
+ }
+ FREE (ref_filename);
+ }
+ }
+ }
+}
+
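+// used with --recover: scan the last part of the file (up to 16 MB) backwards for the SEC_GENOZIP_HEADER section, in case the offset recorded in the Footer is corrupt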
+static uint64_t zfile_read_genozip_header_get_actual_offset (void)
+{
+ uint32_t size = MIN_(z_file->disk_size, 16 MB);
+ file_seek (z_file, z_file->disk_size - size, SEEK_SET, READ, HARD_FAIL);
+
+ ASSERTNOTINUSE (evb->scratch);
+ buf_alloc_exact_zero (evb, evb->scratch, size + 100, char, "scratch");
+ evb->scratch.len -= 100; // extra allocated memory to ease the scan loop
+
+ int ret = fread (evb->scratch.data, size, 1, Z_READ_FP(z_file));
+ ASSERT (ret == 1, "Failed to read %u bytes from the end of %s", size, z_name);
+
+ for_buf_back (uint8_t, p, evb->scratch)
+ if (BGEN32(GET_UINT32(p)) == GENOZIP_MAGIC && ((SectionHeaderP)p)->section_type == SEC_GENOZIP_HEADER)
+ return BNUM (evb->scratch, p) + (z_file->disk_size - size);
+
+ ABORT ("Cannot locate the SEC_GENOZIP_HEADER in the final %u bytes of %s", size, z_name);
+}
+
+// gets offset to the beginning of the GENOZIP_HEADER section, and sets z_file->genozip_version
+uint64_t zfile_read_genozip_header_get_offset (bool as_is)
+{
+ // read the footer from the end of the file
+ if (z_file->disk_size < sizeof(SectionFooterGenozipHeader) ||
+ !z_file->file ||
+ !file_seek (z_file, -sizeof(SectionFooterGenozipHeader), SEEK_END, READ, SOFT_FAIL))
+ return 0; // failed
+
+ TEMP_FLAG(quiet, false);
+
+ SectionFooterGenozipHeader footer;
+ int ret = fread (&footer, sizeof (footer), 1, Z_READ_FP(z_file));
+ ASSERTW (ret == 1, "Skipping empty file %s", z_name);
+ if (!ret) return 0; // failed
+
+ // case: there is no genozip header. this can happen if the file was truncated (eg because compression did not complete)
+ RETURNW (BGEN32 (footer.magic) == GENOZIP_MAGIC, 0, "Error in %s: the file appears to be incomplete (it is missing the Footer).", z_name);
+
+ uint64_t offset = flag.recover ? zfile_read_genozip_header_get_actual_offset() // get correct offset in case of corruption
+ : BGEN64 (footer.genozip_header_offset);
+
+ if (as_is) return offset;
+
+ // read genozip_version directly, needed to determine the section header size
+ RETURNW (file_seek (z_file, offset, SEEK_SET, READ, WARNING_FAIL), 0,
+ "Error in %s: corrupt offset=%"PRIu64" in Footer (file_size=%"PRIu64")",
+ z_name, offset, z_file->disk_size);
+
+ SectionHeaderGenozipHeader top = {};
+    RETURNW (fread (&top, 1, MIN_(sizeof (SectionHeaderGenozipHeader), z_file->disk_size - offset/*header was shorter in earlier versions*/),
+ Z_READ_FP(z_file)), 0, "Error in %s: failed to read genozip header", z_name);
+
+ RETURNW (BGEN32 (top.magic) == GENOZIP_MAGIC, 0, "Error in %s: offset=%"PRIu64" of the GENOZIP_HEADER section as it appears in the Footer appears to be wrong, or the GENOZIP_HEADER section has bad magic (file_size=%"PRIu64").%s",
+ z_name, offset, z_file->disk_size, flag.debug_or_test ? " Try again with --recover." : "");
+
+ RESTORE_FLAG(quiet);
+
+ z_file->genozip_version = top.genozip_version;
+ z_file->genozip_minor_ver = top.genozip_minor_ver; // 0 before 15.0.28
+
+ z_file->data_type = BGEN16 (top.data_type);
+ if (Z_DT(BCF)) { z_file->data_type = DT_VCF; z_file->source_codec = CODEC_BCF; } // Z_DT is always VCF, not BCF
+ else if (Z_DT(CRAM)) { z_file->data_type = DT_SAM; z_file->source_codec = CODEC_CRAM; } // Z_DT is always SAM, not CRAM or BAM
+
+ // check that file version is at most this executable version, except for reference file for which only major version is tested
+ ASSINP (z_file->genozip_version < code_version_major() ||
+ (z_file->genozip_version == code_version_major() && (z_file->genozip_minor_ver <= code_version_minor() || Z_DT(REF) || (is_genocat && flag.show_stats))),
+ "Error: %s cannot be opened because it was compressed with genozip version %u.0.%u which is newer than the version running - %s.\n%s",
+ z_name, z_file->genozip_version, z_file->genozip_minor_ver, GENOZIP_CODE_VERSION, genozip_update_msg());
+
+ bool metadata_only = is_genocat && (flag.show_stats || flag.show_gheader || flag.show_headers || flag.show_aliases || flag.show_dict);
+
+ #define MSG "Error: %s was compressed with version %u of genozip. It may be uncompressed with genozip versions %u to %u"
+
+    // in version 6, we canceled backward compatibility with v1-v5
+ ASSINP (VER(6), MSG, z_name, z_file->genozip_version, z_file->genozip_version, 5);
+
+    // in version 7, we canceled backward compatibility with v6
+ ASSINP (VER(7), MSG, z_name, z_file->genozip_version, 6, 6);
+
+    // in version 8, we canceled backward compatibility with v7
+ ASSINP (VER(8), MSG, z_name, z_file->genozip_version, 7, 7);
+
+    // in version 15, we canceled backward compatibility with v8,9,10 (except reference files which continue to be supported back to v8, as they might be needed to decompress files of later versions)
+ ASSINP (metadata_only || VER(11) || Z_DT(REF), MSG, z_name, z_file->genozip_version, z_file->genozip_version, 14);
+
+ #undef MSG
+ return offset;
+}
+
+// returns false if file should be skipped
+bool zfile_read_genozip_header (SectionHeaderGenozipHeaderP out_header, FailType fail_type) // optional outs
+{
+ ASSERTNOTNULL (z_file);
+
+ if (z_file->section_list_buf.len) return true; // header already read
+
+ SectionEnt sec = { .st = SEC_GENOZIP_HEADER,
+ .offset = zfile_read_genozip_header_get_offset (false) };
+
+ if (!sec.offset) {
+ fail_type = HARD_FAIL;
+ goto error;
+ }
+
+ zfile_read_section (z_file, evb, 0, &evb->z_data, "z_data", SEC_GENOZIP_HEADER, &sec);
+
+ SectionHeaderGenozipHeaderP header = (SectionHeaderGenozipHeaderP)evb->z_data.data;
+ if (out_header) *out_header = *header;
+
+ DataType data_type = (DataType)(BGEN16 (header->data_type));
+
+ // Note: BCF/CRAM files have DT_BCF/DT_CRAM in the GenozipHeader, but in the PIZ code we
+ // expect data_type=VCF/SAM with z_file->source_codec set to CODEC_BCF/CODEC_CRAM.
+ if (data_type == DT_BCF) data_type = DT_VCF;
+ else if (data_type == DT_CRAM) data_type = DT_SAM;
+
+ ASSERT ((unsigned)data_type < NUM_DATATYPES, "unrecognized data_type=%d. %s", data_type, genozip_update_msg());
+
+ // case: we couldn't figure out z_file->data_type from the .genozip filename - set based on the data_type in the GenozipHeader
+ if (Z_DT(NONE) || Z_DT(GNRIC)) {
+ z_file->data_type = data_type;
+ z_file->type = file_get_default_z_ft_of_data_type (data_type);
+ }
+
+ // case: we set z_file->data_type based on the .genozip filename - verify that it is correct
+ else
+ ASSINP (z_file->data_type == data_type, "%s - file extension indicates this is a %s file, but according to its contents it is a %s",
+ z_name, z_dt_name(), dt_name (data_type));
+
+ ASSINP (header->encryption_type != ENC_NONE || !has_password() || Z_DT(REF),
+ "password provided, but file %s is not encrypted", z_name);
+
+ ASSERT (VER(15) || BGEN32 (header->v14_compressed_offset) == st_header_size (SEC_GENOZIP_HEADER),
+ "invalid genozip header of %s - expecting compressed_offset to be %u in genozip_version=%u but found %u",
+ z_name, st_header_size (SEC_GENOZIP_HEADER), header->genozip_version, BGEN32 (header->v14_compressed_offset));
+
+ // get & test password, if file is encrypted
+ if (header->encryption_type != ENC_NONE) {
+
+ if (!has_password()) crypt_prompt_for_password();
+
+ crypt_do (evb, header->password_test, sizeof(header->password_test), 0, SEC_NONE, true); // decrypt password test
+
+ ASSINP (!memcmp (header->password_test, PASSWORD_TEST, sizeof(header->password_test)),
+ "password is wrong for file %s", z_name);
+ }
+
+ z_file->num_txt_files = VER(14) ? header->num_txt_files : BGEN32 (header->v13_num_components);
+ if (z_file->num_txt_files < 2) flag.unbind = 0; // override user's prefix if file has only 1 component (bug 326)
+
+    int dts = z_file->z_flags.dt_specific; // save in case it's set already (eg dts_paired is set in sections_is_paired)
+ z_file->z_flags = header->flags.genozip_header;
+
+ if (IS_SRC_BCF) z_file->z_flags.txt_is_bin = true; // in files 15.0.58 or older this was not set
+
+ z_file->z_flags.dt_specific |= dts;
+ z_file->num_lines = BGEN64 (header->num_lines_bound);
+ z_file->txt_data_so_far_bind = BGEN64 (header->recon_size);
+
+ if (VER(14) && !flag.reading_reference)
+ segconf.vb_size = (uint64_t)BGEN16 (header->vb_size) MB;
+
+ if (VER(15) && !flag.reading_reference)
+ segconf.zip_txt_modified = header->is_modified; // since 15.0.60
+
+ if (flag.show_data_type) {
+ iprintf ("%s\n", z_dt_name());
+ exit_ok;
+ }
+
+ DT_FUNC (z_file, piz_genozip_header)(header); // data-type specific processing of the Genozip Header
+
+ bool has_section_list = true;
+ if (!z_file->section_list_buf.param) { // not already initialized in a previous call to this function
+
+ has_section_list = license_piz_prepare_genozip_header (header, IS_LIST || (IS_SHOW_HEADERS && flag.force));
+
+ if (has_section_list) {
+ zfile_uncompress_section (evb, header, &z_file->section_list_buf, "z_file->section_list_buf", 0, SEC_GENOZIP_HEADER);
+
+ sections_list_file_to_memory_format (header);
+ }
+
+ if (flag.show_gheader==1) {
+ DO_ONCE sections_show_gheader (header);
+ if (is_genocat) exit_ok; // in genocat, exit after showing the requested data
+ }
+
+ z_file->section_list_buf.param = 1;
+ }
+
+ if (!VER(15))
+ z_file->z_flags.has_digest = zfile_get_has_digest_up_to_v14 (header); // overwrites v14_bgzf that is no longer used for PIZ
+
+ // case: we are reading a file expected to be the reference file itself
+ if (flag.reading_reference) {
+ ASSINP (data_type == DT_REF, "Error: %s is not a reference file. To create a reference file, use 'genozip --make-reference '",
+ ref_get_filename(gref));
+
+ // note: in the reference file itself, header->ref_filename is the original fasta used to create this reference
+ ref_set_ref_file_info (flag.reading_reference, header->genome_digest, header->flags.genozip_header.adler,
+ header->fasta_filename, header->genozip_version);
+
+ refhash_set_digest (header->refhash_digest);
+
+ buf_free (evb->z_data);
+ }
+
+ // case: we are reading a file that is not expected to be a reference file
+ else {
+ // case: we are attempting to decompress a reference file - this is not supported
+ ASSGOTO (data_type != DT_REF || (flag.genocat_no_reconstruct && is_genocat) || is_genols,
+                 "%s is a reference file - it cannot be decompressed - skipping it. Did you intend to use --reference?", z_name);
+
+ // handle reference file info
+ flags_update_piz_no_ref_file();
+
+ if (!flag.dont_load_ref_file && data_type != DT_REF)
+ zfile_read_genozip_header_handle_ref_info (header);
+
+ buf_free (evb->z_data); // free before ctx_piz_initialize_zctxs that might read aliases - header not valid after freeing
+
+ // create all contexts for B250/LOCAL/DICT data in the z_file (or predefined) -
+ // flags_update_piz_one_z_file and IS_SKIP functions may rely on Context.z_data_exists
+ if (has_section_list)
+ ctx_piz_initialize_zctxs();
+ }
+
+ return true;
+
+error:
+ buf_free (evb->z_data);
+ ASSERT (fail_type == SOFT_FAIL, "failed to read %s", z_name);
+ return false;
+}
+
+// Update the first SEC_TXT_HEADER fragment of the current txt file.
+void zfile_update_txt_header_section_header (uint64_t offset_in_z_file)
+{
+ // sanity check - we skip empty files, so data is expected
+ ASSERT (txt_file->txt_data_so_far_single > 0, "Expecting txt_file->txt_data_so_far_single=%"PRId64" > 0", txt_file->txt_data_so_far_single);
+
+ ASSERTNOTINUSE (evb->scratch);
+ buf_alloc_exact_zero (evb, evb->scratch, sizeof (SectionHeaderTxtHeader) + AES_BLOCKLEN-1/*encryption padding*/, char, "scratch");
+
+ SectionHeaderTxtHeaderP header = B1ST(SectionHeaderTxtHeader, evb->scratch);
+ *header = z_file->txt_header_hdr;
+
+ header->txt_data_size = BGEN64 (txt_file->txt_data_so_far_single);
+ header->txt_num_lines = BGEN64 (txt_file->num_lines);
+ header->max_lines_per_vb = BGEN32 (txt_file->max_lines_per_vb);
+
+ // qname stuff
+ for (QType q=0; q < NUM_QTYPES; q++)
+ header->flav_prop[q] = segconf.flav_prop[q];
+
+ if (flag.md5 && !segconf.zip_txt_modified && gencomp_comp_eligible_for_digest(NULL))
+ header->digest = digest_snapshot (&z_file->digest_ctx, "file");
+
+ if (flag.show_headers)
+ sections_show_header ((SectionHeaderP)header, NULL, offset_in_z_file, 'W');
+
+ evb->scratch.len = crypt_padded_len (sizeof (SectionHeaderTxtHeader));
+
+ // encrypt if needed
+ if (has_password()) {
+ crypt_pad ((uint8_t *)header, evb->scratch.len, evb->scratch.len - sizeof (SectionHeaderTxtHeader));
+ crypt_do (evb, (uint8_t *)header, evb->scratch.len, 1 /*was 0 up to 14.0.8*/, header->section_type, true);
+ }
+
+ zriter_write (&evb->scratch, NULL, offset_in_z_file, false); // note: cannot write in background with offset
+
+ buf_free (evb->scratch);
+}
+
+// ZIP compute thread - called from zip_compress_one_vb()
+void zfile_compress_vb_header (VBlockP vb)
+{
+
+ SectionHeaderVbHeader vb_header = {
+ .magic = BGEN32 (GENOZIP_MAGIC),
+ .section_type = SEC_VB_HEADER,
+ .vblock_i = BGEN32 (vb->vblock_i),
+ .codec = CODEC_NONE,
+ .flags.vb_header = vb->flags,
+ .recon_size = BGEN32 (vb->recon_size),
+ .longest_line_len = BGEN32 (vb->longest_line_len),
+ .longest_seq_len = BGEN32 (vb->longest_seq_len), // since v15 (non-0 for SAM, BAM, FASTQ)
+ .digest = vb->digest,
+ };
+
+ DT_FUNC (vb, zip_set_vb_header_specific)(vb, &vb_header);
+
+ // copy section header into z_data - to be eventually written to disk by the main thread. this section doesn't have data.
+ comp_compress (vb, NULL, &vb->z_data, &vb_header, NULL, NO_CALLBACK, "SEC_VB_HEADER");
+}
+
+// ZIP only: called by the main thread in the sequential order of VBs: updating of the already compressed
+// variant data section (compressed by the compute thread in zfile_compress_vb_header) just before writing it to disk
+// note: this updates the z_data in memory (not on disk)
+void zfile_update_compressed_vb_header (VBlockP vb)
+{
+ if (flag.biopsy) return; // we have no z_data in biopsy mode
+
+ SectionHeaderVbHeaderP vb_header = (SectionHeaderVbHeaderP)vb->z_data.data;
+ vb_header->z_data_bytes = BGEN32 (vb->z_data.len32);
+
+ if (flag_is_show_vblocks (ZIP_TASK_NAME))
+ iprintf ("UPDATE_VB_HEADER(id=%d) vb=%s recon_size=%u genozip_size=%u n_lines=%u longest_line_len=%u\n",
+ vb->id, VB_NAME,
+ BGEN32 (vb_header->recon_size), BGEN32 (vb_header->z_data_bytes),
+ vb->lines.len32, // just for debugging, not in VB header
+ BGEN32 (vb_header->longest_line_len));
+
+ // now we can finally encrypt the header - if needed
+ if (has_password())
+ crypt_do (vb, (uint8_t*)vb_header, ROUNDUP16(sizeof(SectionHeaderVbHeader)),
+ BGEN32 (vb_header->vblock_i), vb_header->section_type, true);
+}
+
+// ZIP - main thread
+void zfile_output_processed_vb_ext (VBlockP vb, bool background)
+{
+ ASSERTMAINTHREAD;
+
+ zriter_write (&vb->z_data, &vb->section_list_buf, -1, background);
+
+ if (vb->comp_i != COMP_NONE) z_file->disk_so_far_comp[vb->comp_i] += vb->z_data.len;
+ vb->z_data.len = 0;
+
+ ctx_update_stats (vb);
+
+ if (flag.show_headers && buf_is_alloc (&vb->show_headers_buf))
+ buf_print (&vb->show_headers_buf, false);
+}
+
+void zfile_output_processed_vb (VBlockP vb)
+{
+ zfile_output_processed_vb_ext (vb, false);
+}
+
+// get file data type - by its name if possible, or if not, inspect the GenozipHeader
+DataType zfile_piz_get_file_dt (rom z_filename)
+{
+ DataType dt = file_get_dt_by_z_filename (z_filename);
+ FileP file = NULL;
+
+ // case: we don't know yet what file type this is - we need to read the genozip header to determine
+ if (dt == DT_NONE && z_filename) {
+ if (!(file = file_open_z_read (z_filename)) || !file->file)
+ goto done; // not a genozip file
+
+ // read the footer from the end of the file
+ if (!file_seek (file, -sizeof(SectionFooterGenozipHeader), SEEK_END, READ, WARNING_FAIL))
+ goto done;
+
+ SectionFooterGenozipHeader footer;
+ int ret = fread (&footer, sizeof (footer), 1, Z_READ_FP(file));
+ ASSERTW (ret == 1, "Skipping empty file %s", z_name);
+ if (!ret) goto done; // empty file / cannot read
+
+ // case: this is not a valid genozip v2+ file
+ if (BGEN32 (footer.magic) != GENOZIP_MAGIC) goto done;
+
+ // read genozip header
+ uint64_t genozip_header_offset = BGEN64 (footer.genozip_header_offset);
+ if (!file_seek (file, genozip_header_offset, SEEK_SET, READ, WARNING_FAIL))
+ goto done;
+
+ SectionHeaderGenozipHeader header;
+ int bytes = fread ((char*)&header, 1, sizeof(SectionHeaderGenozipHeader), Z_READ_FP(file));
+ if (bytes < sizeof(SectionHeaderGenozipHeader)) goto done;
+
+ ASSERTW (BGEN32 (header.magic) == GENOZIP_MAGIC, "Error reading %s: corrupt data", z_name);
+ if (BGEN32 (header.magic) != GENOZIP_MAGIC) goto done;
+
+ dt = (DataType)BGEN16 (header.data_type);
+ }
+
+done:
+ file_close (&file);
+ return dt;
+}
+
diff --git a/src/zip.c b/src/zip.c
index 034c5f89..0ec4944e 100644
--- a/src/zip.c
+++ b/src/zip.c
@@ -1,822 +1,838 @@
-// ------------------------------------------------------------------
-// zip.c
-// Copyright (C) 2019-2024 Genozip Limited. Patent Pending.
-// Please see terms and conditions in the file LICENSE.txt
-//
-// WARNING: Genozip is proprietary, not open source software. Modifying the source code is strictly prohibited,
-// under penalties specified in the license.
-
-#include
-#include
-#include
-#include "zfile.h"
-#include "dispatcher.h"
-#include "zip.h"
-#include "seg.h"
-#include "random_access.h"
-#include "refhash.h"
-#include "ref_iupacs.h"
-#include "progress.h"
-#include "stats.h"
-#include "compressor.h"
-#include "bgzf.h"
-#include "txtheader.h"
-#include "threads.h"
-#include "contigs.h"
-#include "chrom.h"
-#include "biopsy.h"
-#include "dict_io.h"
-#include "gencomp.h"
-#include "aliases.h"
-#include "arch.h"
-#include "user_message.h"
-#include "zriter.h"
-#include "b250.h"
-#include "zip_dyn_int.h"
-
-static void zip_display_compression_ratio (Digest md5)
-{
- float z_bytes = MAX_((float)z_file->disk_so_far, 1.0); // at least one, to avoid division by zero in case of a z_bytes=0 issue
- float plain_bytes = (float)z_file->txt_data_so_far_bind;
- float comp_bytes = is_read_via_ext_decompressor (txt_file)
- ? (float)txt_file->disk_size // 0 if via pipe or url, as we have no knowledge of file size
- : (float)txt_file->disk_so_far; // unlike disk_size, works also for piped-in files (but not CRAM, BCF, XZ, ZIP)
- float ratio_vs_plain = plain_bytes / z_bytes;
- float ratio_vs_comp = -1;
-
- if (flag.debug_progress)
- iprintf ("Ratio calculation: ratio_vs_plain=%f = plain_bytes=%"PRIu64" / z_bytes=%"PRIu64"\n",
- ratio_vs_plain, (uint64_t)plain_bytes, (uint64_t)z_bytes);
-
- // in bind mode, we don't show compression ratio for files except for the last one
- if (flag.bind) {
-
- static float comp_bytes_bind = 0;
- static FileType source_file_type = UNKNOWN_FILE_TYPE;
-
- // reset for every set of bound files (we might have multiple sets of --pair)
- if (z_file->num_txts_so_far == 1) {
- comp_bytes_bind=0;
- source_file_type = txt_file->type;
- }
-
- else if (source_file_type != txt_file->type) // heterogenous source file types
- source_file_type = UNKNOWN_FILE_TYPE;
-
- comp_bytes_bind += comp_bytes;
-
- if (z_file->z_closes_after_me) {
- ratio_vs_comp = comp_bytes_bind / z_bytes; // compression vs .gz/.bz2/.bcf/.xz... size
- if (flag.debug_progress)
- iprintf ("Ratio calculation: ratio_vs_comp=%f = comp_bytes_bind=%"PRIu64" / z_bytes=%"PRIu64"\n",
- ratio_vs_comp, (uint64_t)comp_bytes_bind, (uint64_t)z_bytes);
- }
- else
- progress_finalize_component_time ("Done", md5);
- }
- else {
- ratio_vs_comp = comp_bytes / z_bytes; // compression vs .gz/.bz2/.bcf/.xz... size
- if (flag.debug_progress)
- iprintf ("Ratio calculation: ratio_vs_comp=%f = comp_bytes=%"PRIu64" / z_bytes=%"PRIu64"\n",
- ratio_vs_comp, (uint64_t)comp_bytes, (uint64_t)z_bytes);
- }
-
- // in bound files, for the non-last components, we already printed "Done" above
- if (flag.bind && !z_file->z_closes_after_me) {}
-
- // when making a reference, we don't care about the compression
- else if (flag.make_reference || flag.zip_no_z_file)
- progress_finalize_component_time ("Done", md5);
-
- // Deep - to complicated to communicate compression vs FASTQ and BAM/SAM source files - show only vs compression
- else if (flag.deep)
- progress_finalize_component_time_ratio ("Deep", ratio_vs_comp, md5);
-
- // when compressing BAM report only ratio_vs_comp (compare to BGZF-compress BAM - we don't care about the underlying plain BAM)
- // Likewise, doesn't have a compression extension (eg .gz), even though it may actually be compressed eg .tbi (which is actually BGZF)
- else if (Z_DT(BAM) || (txt_file && file_get_codec_by_txt_ft (FAF ? DT_FASTA : txt_file->data_type, txt_file->type, false) == CODEC_NONE))
- progress_finalize_component_time_ratio (SRC_CODEC(CRAM)?"CRAM" : z_dt_name_faf(), ratio_vs_comp, md5);
-
- else if (ratio_vs_comp >= 0) {
- if (SRC_CODEC(NONE) || ratio_vs_comp < 1.05) // disk_so_far doesn't give us the true txt file size
- progress_finalize_component_time_ratio (z_dt_name_faf(), ratio_vs_plain, md5);
-
- else // source was compressed
- progress_finalize_component_time_ratio_better (z_dt_name_faf(), ratio_vs_plain, file_exts[txt_file->type], ratio_vs_comp, md5);
- }
-}
-
-static struct { DataType dt; const uint64_t dict_id_num; LocalGetLineCB *func; } callbacks[] = LOCAL_GET_LINE_CALLBACKS;
-
-LocalGetLineCB *zip_get_local_data_callback (DataType dt, ContextP ctx)
-{
- if (ctx && !ctx->no_callback)
- for (unsigned i=0; i < ARRAY_LEN(callbacks); i++)
- if (callbacks[i].dt == dt && callbacks[i].dict_id_num == ctx->dict_id.num)
- return callbacks[i].func;
-
- return NULL;
-}
-
-void zip_set_no_stons_if_callback (VBlockP vb)
-{
- for (unsigned i=0; i < ARRAY_LEN(callbacks); i++)
- if (callbacks[i].dt == vb->data_type) {
- ContextP ctx = ctx_get_ctx (vb, callbacks[i].dict_id_num);
-
- if (!ctx->no_callback) ctx->no_stons = true;
- }
-}
-
-// after segging - if any context appears to contain only singleton snips (eg a unique ID),
-// we move it to local instead of needlessly cluttering the global dictionary
-static void zip_handle_unique_words_ctxs (VBlockP vb)
-{
- START_TIMER;
-
- for_ctx {
- if (ctx->local.len || ctx->local_always || // local is not free to accept our singletons
- ctx->no_stons || // don't change to LT_SINGLETON if we were explicitly forbidden having singletons
- ctx->ltype == LT_SUPP || // local data might be created by codec (later)
- (VB_DT(VCF) && dict_id_is_vcf_format_sf (ctx->dict_id))) // this doesn't work for FORMAT fields
- continue;
-
- // reset ltype to LT_SINGLETON so that we can use local for singletons (either here or in ctx_commit_node - subject to conditions).
- // note: ltype was possibly assigned a different value in *_seg_initialize, but then local not utilized.
- ctx->ltype = LT_SINGLETON;
-
- if (!ctx->nodes.len || // no new words in this VB
- ctx->nodes.len != ctx->b250.count || // not all new words in this VB are singletons
- ctx->nodes.len < vb->lines.len / 5 || // don't bother if this is a rare field less than 20% of the lines
- !ctx_can_have_singletons (ctx) || // this context is not allowed to have singletons
- ctx->b250.count == 1) // only one word - better to handle with all_the_same rather than singleton
- continue;
-
- buf_free (ctx->local); // possibly local was allocated, but then not utilized
- buf_move (vb, ctx->local, CTX_TAG_LOCAL, ctx->dict);
- buf_free (ctx->nodes);
- buf_free (ctx->b250);
- }
-
- COPY_TIMER (zip_handle_unique_words_ctxs);
-}
-
-static bool zip_generate_local (VBlockP vb, ContextP ctx)
-{
- START_TIMER;
-
- ASSERT (ctx->dict_id.num, "tag_name=%s did_i=%u: ctx->dict_id=0 despite ctx->local containing data", ctx->tag_name, (unsigned)(ctx - vb->contexts));
-
- ctx->ltype = dyn_int_get_ltype (ctx);
-
- // case: local is LTEN (instead of native endianity) and machine is BGEN, so BGEN_*_buf ^ above did nothing.
- bool need_lten = (ctx->local_is_lten && !flag.is_lten);
-
- switch (ctx->ltype) {
- case LT_BITMAP :
- LTEN_bits ((BitsP)&ctx->local);
- ctx->local.prm8[0] = ((uint8_t)64 - (uint8_t)(ctx->local.nbits % 64)) % (uint8_t)64;
- ctx->local_param = true;
- break;
-
- case LT_UINT32 : case LT_hex32 : case LT_HEX32 : case LT_FLOAT32 :
- if (need_lten) LTEN_u32_buf (&ctx->local, NULL);
- else BGEN_u32_buf (&ctx->local, NULL);
- break;
-
- case LT_UINT16 : case LT_hex16 : case LT_HEX16 :
- if (need_lten) LTEN_u16_buf (&ctx->local, NULL);
- else BGEN_u16_buf (&ctx->local, NULL);
- break;
-
- case LT_UINT64 : case LT_hex64 : case LT_HEX64 : case LT_FLOAT64 :
- if (need_lten) LTEN_u64_buf (&ctx->local, NULL);
- else BGEN_u64_buf (&ctx->local, NULL);
- break;
-
- case LT_INT8 : interlace_d8_buf (&ctx->local, NULL);
- break;
-
- case LT_INT16 : if (need_lten) LTEN_interlace_d16_buf (&ctx->local, NULL);
- else BGEN_interlace_d16_buf (&ctx->local, NULL);
- break;
-
- case LT_INT32 : if (need_lten) LTEN_interlace_d32_buf (&ctx->local, NULL);
- else BGEN_interlace_d32_buf (&ctx->local, NULL);
- break;
-
- case LT_INT64 : if (need_lten) LTEN_interlace_d64_buf (&ctx->local, NULL);
- else BGEN_interlace_d64_buf (&ctx->local, NULL);
- break;
-
- default : break;
- }
-
- // transpose if needed AND local is rectangular
- if (ctx->dyn_transposed)
- dyn_int_transpose (vb, ctx);
-
- COPY_TIMER (zip_generate_local); // codec_assign measures its own time
-
- // in case we are using "pair identical", drop this section if it is an R2 section identical to its R1 counterpart
- if (is_fastq_pair_2 (vb) && fastq_zip_use_pair_identical (ctx->dict_id) &&
- buf_issame (&ctx->local, &ctx->localR1, lt_width(ctx))) {
-
- if (flag.debug_generate)
- iprintf ("%s: %s[%u].local dropped because it is an R2 section which is identical to its R1 counterpart\n",
- VB_NAME, ctx->tag_name, ctx->did_i);
-
- // note: careful not to set local.len=0 bc this is called before merge, and ctx_drop_all_the_same (that
- // is part of merge), relies on local.len to decide if it can drop b250 of pair-identical R2.b250 sections
- return false;
- }
-
- codec_assign_best_codec (vb, ctx, NULL, SEC_LOCAL);
-
- if (flag.debug_generate)
- iprintf ("%s: %s[%u].local ltype=%s len=%"PRIu64" codec=%s\n", VB_NAME, ctx->tag_name, ctx->did_i,
- lt_name (ctx->ltype), ctx->local.len, codec_name(ctx->lcodec));
-
- return true;
-}
-
-// generate & write b250 data for all contexts - do them in random order, to reduce the chance of multiple doing codec_assign_best_codec for the same context at the same time
-// VBs doing codec_assign_best_codec at the same, so that they can benefit from pre-assiged codecs
-void zip_compress_all_contexts_b250 (VBlockP vb)
-{
- START_TIMER;
- threads_log_by_vb (vb, "zip", "START COMPRESSING B250", 0);
-
- // arrays of all contexts in this VB
- ContextP ctxs[vb->num_contexts];
- for (Did did_i=0; did_i < vb->num_contexts; did_i++) ctxs[did_i] = CTX(did_i);
-
- // in each iteration, pick a context at random and remove it from the list
- for (unsigned i=0; i < vb->num_contexts; i++) {
-
- int ctx_i = global_max_threads > 1 ? ((clock()+1) * (vb->vblock_i+1)) % (vb->num_contexts - i) : 0; // force predictability with single thread
-
- ContextP ctx = ctxs[ctx_i];
- memmove (&ctxs[ctx_i], &ctxs[ctx_i+1], (vb->num_contexts - i - ctx_i - 1) * sizeof (ContextP));
-
- if (!ctx->b250.len || ctx->b250_compressed) continue;
-
- if (!b250_zip_generate (vb, ctx)) // generate the final b250 buffers from their intermediate form
- continue; // dropped
-
- if (dict_id_typeless (ctx->dict_id).num == flag.dump_one_b250_dict_id.num)
- ctx_dump_binary (vb, ctx, false);
-
- if (flag.show_time) codec_show_time (vb, "B250", ctx->tag_name, ctx->bcodec);
-
- if (HAS_DEBUG_SEG(ctx) || flag.show_compress)
- iprintf ("B250: %s: %s: b250.len=%"PRIu64" b250.count=%"PRIu64" nodes.len=%"PRIu64"\n",
- VB_NAME, ctx->tag_name, ctx->b250.len, ctx->b250.count, ctx->nodes.len);
-
- START_TIMER;
- zfile_compress_b250_data (vb, ctx);
- COPY_TIMER(fields[ctx->did_i]);
-
- ctx->b250_compressed = true;
- }
-
- COPY_TIMER (zip_compress_ctxs); // same profiler for b250 and local as we breakdown by ctx underneath it
-}
-
-// generate & write local data for all contexts - in random order, to reduce the chance of multiple doing codec_assign_best_codec for the same context at the same time
-static void zip_compress_all_contexts_local (VBlockP vb)
-{
- START_TIMER;
- threads_log_by_vb (vb, "zip", "START COMPRESSING LOCAL", 0);
-
- // first we handle local_dep=0 then local_dep=1 and finally local_dep=2
- for (int dep_level=DEP_L0 ; dep_level < NUM_LOCAL_DEPENDENCY_LEVELS; dep_level++) {
-
- // initialize list of contexts at this dependency level that need compression
- ContextP ctxs[vb->num_contexts];
- unsigned num_ctxs=0;
- for_ctx_that ((ctx->local.len || ctx->local_always) && ctx->local_dep == dep_level && !ctx->local_compressed)
- ctxs[num_ctxs++] = ctx;
-
- while (num_ctxs) {
- // pick a context at "random" and remove it from the list (not random if single thread)
- int ctx_i = global_max_threads > 1 ? (65531 * (vb->vblock_i+1)) % num_ctxs : 0;
- ContextP ctx = ctxs[ctx_i];
- memmove (&ctxs[ctx_i], &ctxs[ctx_i+1], (num_ctxs - (ctx_i+1)) * sizeof (ContextP));
- num_ctxs--;
-
- ctx->local_compressed = true; // so we don't compress it again
-
- if (!zip_generate_local (vb, ctx))
- continue; // section dropped
-
- if (dict_id_typeless (ctx->dict_id).num == flag.show_singletons_dict_id.num)
- dict_io_show_singletons (vb, ctx);
-
- if (dict_id_typeless (ctx->dict_id).num == flag.dump_one_local_dict_id.num)
- ctx_dump_binary (vb, ctx, true);
-
- if (flag.show_time) codec_show_time (vb, "LOCAL", ctx->tag_name, ctx->lcodec);
-
- if (HAS_DEBUG_SEG(ctx) || flag.show_compress)
- iprintf ("LOCAL: %s: L%u: %s: ltype=%s len=%"PRIu64" size=%"PRIu64" param=%"PRIu64"\n",
- VB_NAME, dep_level, ctx->tag_name, lt_name (ctx->ltype), ctx->local.len, ctx->local.len * lt_width(ctx), ctx->local.param);
-
- START_TIMER;
- zfile_compress_local_data (vb, ctx, 0);
- COPY_TIMER(fields[ctx->did_i]);
-
- if (!ctx->dict_merged) // note: if dict_merged, we are in the second call to this function, and local consists of singletons
- ctx->no_stons = true; // since we had data in local, we don't allow ctx_commit_node to move singletons to local
-
- }
- }
-
- COPY_TIMER (zip_compress_ctxs); // same profiler for b250 and local as we breakdown by ctx underneath it
-}
-
-void zip_init_vb (VBlockP vb)
-{
- if (DTPT(zip_init_vb)) DTPT(zip_init_vb)(vb); // data-type specific initialization of the VB
-}
-
-// called by main thread after VB has completed processing
-static void zip_update_txt_counters (VBlockP vb)
-{
- // note: in case of an FASTQ with qname optimization or VCF with add_line_numbers, we already updated this in *_zip_init_vb
- if (!flag.zip_lines_counted_at_init_vb)
- txt_file->num_lines += vb->lines.len; // lines in this txt file
-
- // counters of data AS IT APPEARS IN THE TXT FILE
- z_file->num_lines += vb->lines.len; // lines in all bound files in this z_file
-
- z_file->comp_num_lines[vb->comp_i] += vb->lines.len; // set also for DVCF rejects
-
- z_file->txt_data_so_far_single_0 += (int64_t)vb->txt_size; // length of data before any modifications
- z_file->txt_data_so_far_bind_0 += (int64_t)vb->txt_size;
-
- // counter of data FOR PROGRESS DISPLAY
- z_file->txt_data_so_far_single += (int64_t)vb->txt_size;
-
- // counter of data in DEFAULT RECONSTRUCTION
- z_file->txt_data_so_far_bind += vb->recon_size;
-
- // per-component data for stats
- z_file->txt_data_so_far_bind_0_comp[vb->comp_i] += (int64_t)vb->txt_size;
-
- // note: in case of SAM gencomp, MAIN, we add recon_size - assuming the discrepency vs txt_data.len
- // is only due to lines being deported to gencomp
- z_file->txt_data_so_far_bind_comp[vb->comp_i] +=
- (z_sam_gencomp && vb->comp_i==SAM_COMP_MAIN) ? vb->recon_size : Ltxt;
-
- z_file->num_components = MAX_(z_file->num_components, vb->comp_i+1);
-
- // Note: no data-type-specific code here, instead, put in *_zip_after_compute
-}
-
-// ZIP: free z_file buffers no longer needed before global sections
-static void zip_free_undeeded_zctx_bufs_after_seg (void)
-{
- START_TIMER;
-
- for_zctx {
- buf_destroy (zctx->ston_hash);
- buf_destroy (zctx->ston_ents);
- buf_destroy (zctx->global_hash);
- }
-
- buf_destroy (z_file->sag_grps);
- buf_destroy (z_file->sag_grps_index);
- buf_destroy (z_file->sag_alns);
- buf_destroy (z_file->sag_qnames);
- buf_destroy (z_file->sag_depn_index);
- buf_destroy (z_file->sag_cigars);
- buf_destroy (z_file->sag_seq);
- buf_destroy (z_file->sag_qual);
- buf_destroy (z_file->vb_start_deep_line);
- buf_destroy (z_file->deep_ents);
-
- buf_low_level_release_memory_back_to_kernel();
-
- COPY_TIMER_EVB (zip_free_undeeded_zctx_bufs_after_seg);
-}
-
-// write all the sections at the end of the file, after all VB stuff has been written
-static void zip_write_global_area (void)
-{
- START_TIMER;
-
- #define THREAD_DEBUG(x) threads_log_by_vb (evb, "main_thread:global_area", #x, 0);
-
- if (!flag.show_memory) // in show-mem, keep these, so we can report them.
- zip_free_undeeded_zctx_bufs_after_seg();
-
- codec_qual_show_stats();
-
- // if we're making a reference, we need the RA data to populate the reference section chrome/first/last_pos ahead of ref_compress_ref
- THREAD_DEBUG (finalize_random_access);
- random_access_finalize_entries (&z_file->ra_buf); // sort RA, update entries that don't yet have a chrom_index
-
- THREAD_DEBUG (compress_dictionaries);
- dict_io_compress_dictionaries();
-
- THREAD_DEBUG (compress_counts);
- ctx_compress_counts();
-
- THREAD_DEBUG (compress_subdicts);
- ctx_compress_subdicts();
-
- // store a mapping of the file's chroms to the reference's contigs, if they are any different
- // note: not needed in REF_EXT_STORE, as we convert the stored ref_contigs to use chrom_index of the file's CHROM
- if (IS_REF_EXTERNAL && DTFZ(chrom) != DID_NONE) {
- THREAD_DEBUG (compress_chrom_2ref);
- chrom_2ref_compress(gref);
- }
-
- // output reference, if needed
- bool store_ref = (flag.reference & REF_STORED) || flag.make_reference;
- if (store_ref) {
- THREAD_DEBUG (compress_ref);
- ref_compress_ref();
- }
-
- if (flag.make_reference) {
- THREAD_DEBUG (compress_iupacs);
- ref_iupacs_compress();
-
- THREAD_DEBUG (compress_refhash);
- refhash_compress_refhash();
- }
-
- // compress alias list, if this data_type has any aliases defined
- THREAD_DEBUG (compress_aliases);
- aliases_compress();
-
- if (!segconf.disable_random_acccess) {
- THREAD_DEBUG (compress_random_access);
- // if this data has random access (i.e. it has chrom and pos), compress all random access records into evb->z_data
- Codec codec = random_access_compress (&z_file->ra_buf, SEC_RANDOM_ACCESS, CODEC_UNKNOWN, flag.show_index ? RA_MSG_PRIM : NULL);
-
- if (store_ref)
- random_access_compress (ref_get_stored_ra (gref), SEC_REF_RAND_ACC, codec, flag.show_ref_index ? RA_MSG_REF : NULL);
- }
-
- THREAD_DEBUG (user_message);
- user_message_compress();
-
- THREAD_DEBUG (stats);
- stats_generate();
-
- // compress genozip header (including its payload sectionlist and footer) into evb->z_data
- zfile_compress_genozip_header();
-
- stats_finalize();
-
- if (DTPZ(zip_free_end_of_z)) DTPZ(zip_free_end_of_z)();
-
- COPY_TIMER_EVB (zip_write_global_area);
-}
-
-// entry point for ZIP compute thread
-static void zip_compress_one_vb (VBlockP vb)
-{
- START_TIMER;
-
- // we're just taking a biopsy of the txt data, so no need to actually compress.
- if (flag.biopsy &&
- !(segconf.sag_type && vb->comp_i == SAM_COMP_MAIN)) // except in MAIN of SAM/BAM gencomp - need to generate PRIM and DEPN VBs
- goto after_compress;
-
- // if the txt file is compressed with BGZF, we uncompress now, in the compute thread
- if (txt_file->codec == CODEC_BGZF && flag.pair != PAIR_R2)
- bgzf_uncompress_vb (vb); // some of the blocks might already have been decompressed while reading - we decompress the remaining
-
- vb->txt_size = Ltxt; // this doesn't change with --optimize.
-
- // clone global dictionaries while granted exclusive access to the global dictionaries
- ctx_clone (vb);
-
- // case we need to modify the data (--optimize etc): re-write VB before digest
- if (segconf.zip_txt_modified && DTP(zip_modify) &&
- !flag.make_reference && Ltxt && !vb_is_gencomp(vb))
- zip_modify (vb);
-
- vb->recon_size = Ltxt; // length after potentially modifying
-
- // calculate the digest contribution of this VB, and the digest snapshot of this VB
- if (zip_need_digest)
- digest_one_vb (vb, true, NULL); // serializes VBs in order if MD5
-
- // allocate memory for the final compressed data of this vb. allocate 1/8 of the
- // vb size on the (uncompressed) txt file - this is normally plenty. if not, we will realloc downstream
- buf_alloc (vb, &vb->z_data, 0, vb->txt_size / 8, char, CTX_GROWTH, "z_data");
-
- // split each line in this VB to its components
- threads_log_by_vb (vb, "zip", "START SEG", 0);
-
- seg_all_data_lines (vb);
-
- if (flag.biopsy) goto after_compress; // in case of MAIN VB of SAM/BAM gencomp: we end our biopsy journey here
-
- // identify dictionaries that contain only singleton words (eg a unique id) and move the data from dict to local
- zip_handle_unique_words_ctxs (vb);
-
- zfile_compress_vb_header (vb); // vblock header
-
- if (flag.show_codec) {
- DO_ONCE iprintf ("\n\nThe output of --show-codec-test: Testing a sample of up %u bytes on ctx.local of each context.\n"
- "Results in the format [codec bytes μsec] are in order of quality - the first was selected.\n", CODEC_ASSIGN_SAMPLE_SIZE);
- }
-
- bool need_compress = !flag.make_reference && !flag.seg_only;
-
- // while vb_i=1 is busy merging, other VBs can handle local
- if (vb->vblock_i != 1 && need_compress)
- zip_compress_all_contexts_local (vb); // not yet locals that consist of singletons transferred from dict to local in ctx_merge_in_vb_ctx (these will have len=0 at this point)
-
- dispatcher_increment_progress ("compress1", PROGRESS_UNIT/2); // 1/2 compression done
-
- threads_log_by_vb (vb, "zip", "START MERGE", 0);
-
- // for --make-reference we serialize merging by VB, so that contigs get their word_index in the order of the reference file
- if (flag.make_reference) serializer_lock (make_ref_merge_serializer, vb->vblock_i);
-
- // merge new words added in this vb into the z_file.contexts (zctx), ahead of b250_zip_generate().
- // writing indices based on the merged dictionaries. all this is done while locking a mutex for each zctx.
- // note: vb>=2 will block here, until vb=1 is completed
- ctx_merge_in_vb_ctx (vb);
-
- if (flag.make_reference) serializer_unlock (make_ref_merge_serializer);
-
- if (need_compress) {
- zip_compress_all_contexts_local (vb); // for vb=1 - all locals ; for vb>1 - locals which consist of singletons set in ctx_merge_in_vb_ctx (other locals were already compressed above)
- zip_compress_all_contexts_b250 (vb);
- }
-
- dispatcher_increment_progress ("compress2", PROGRESS_UNIT-(PROGRESS_UNIT/2)); // 1/2 compression done
-
- // merge in random access - IF it is used
- if (!segconf.disable_random_acccess)
- random_access_merge_in_vb (vb);
-
-after_compress:
- // examples: compress data-type specific sections ; absorb gencomp lines
- DT_FUNC (vb, zip_after_compress)(vb);
-
- // tell dispatcher this thread is done and can be joined.
- vb_set_is_processed (vb);
-
- COPY_TIMER (compute);
-}
-
-// data sent through dispatcher fan out functions - to do: make this an opaque struct
-static VBIType prev_file_first_vb_i=0, prev_file_last_vb_i=0; // used if we're binding files - the vblock_i will continue from one file to the next
-
-// main thread: returns true if successfully prepared a vb
-static void zip_prepare_one_vb_for_dispatching (VBlockP vb)
-{
- // if we're compressing the 2nd file in a fastq pair (with --pair) - look back at the z_file data
- // and copy the data we need for this vb. note: we need to do this before txtfile_read_vblock as
- // we need the num_lines of the pair VB
- bool R1_data_exhausted = false;
- if (flag.pair == PAIR_R2) {
- uint32_t pair_vb_i = prev_file_first_vb_i + (vb->vblock_i-1 - prev_file_last_vb_i);
-
- if (pair_vb_i <= prev_file_last_vb_i)
- fastq_read_pair_1_data (vb, pair_vb_i); // add the R1 sections z_data after the R2 sections
- else
- R1_data_exhausted = true; // R1 data is already exhausted. This is ok if R2 data is exhausted too.
- }
-
- // case: we have out-of-band txt_data waiting (for generated components) - compress this data first,
- // before reading more data from the txt_file
- if (gencomp_get_txt_data(vb))
- goto dispatch;
-
- else {
- txtfile_read_vblock (vb);
-
- // --head diagnostic option in ZIP cuts a certain number of first lines from vb=1, and discards other VBs
- if (vb->vblock_i != 1 && flag.has_head) {
- vb->dispatch = DATA_EXHAUSTED;
- return;
- }
-
- if (Ltxt)
- goto dispatch;
-
- else if (gencomp_am_i_expecting_more_txt_data()) // more data might be coming from MAIN VBs currently computing
- vb->dispatch = MORE_DATA_MIGHT_COME;
-
- else
- vb->dispatch = DATA_EXHAUSTED;
-
- // note: the opposite case where R2 has less reads than R1 is caught in txtfile_read_vblock
- ASSINP (!R1_data_exhausted || vb->dispatch == DATA_EXHAUSTED, // we are expecting that if our pair R1 data is exhausted, then our R2 data is exhausted too
- "Error: File %s has more FASTQ reads than its R1 mate (vb=%s)", txt_name, VB_NAME);
-
- // error if stdin is empty - can happen only when redirecting eg "cat empty-file|./genozip -" (we test for empty regular files in main_genozip)
- ASSINP0 (vb->vblock_i > 1 || txt_file->txt_data_so_far_single /* txt header data */,
- "Error: Cannot compress stdin data because its size is 0");
-
- if (flag.biopsy && vb->dispatch == DATA_EXHAUSTED)
- biopsy_data_is_exhausted();
-
- if (flag.debug_or_test) buflist_test_overflows(vb, __FUNCTION__);
-
- return;
- }
-
-dispatch:
- vb->dispatch = READY_TO_COMPUTE;
- txt_file->num_vbs_dispatched++;
-
- if (vb->comp_i == COMP_MAIN) // note: we only update the MAIN comp from here, gen comps are updated
- gencomp_a_main_vb_has_been_dispatched();
-}
-
-// called main thread, as VBs complete (might be out-of-order)
-static void zip_complete_processing_one_vb (VBlockP vb)
-{
- DT_FUNC (vb, zip_after_compute)(vb);
-
- // update z_data in memory (its not written to disk yet)
- zfile_update_compressed_vb_header (vb);
-
- txt_file->max_lines_per_vb = MAX_(txt_file->max_lines_per_vb, vb->lines.len);
-
- if (!flag.make_reference && !flag.seg_only)
- zfile_output_processed_vb_ext (vb, true);
-
- zip_update_txt_counters (vb);
-
- // destroy some buffers of "first generation" contexts (those that didn't clone any nodes)
- if (vb->vblock_i < 100) // don't bother checking for high vb_i
- for_ctx_that (ctx->nodes.len32 && !ctx->ol_nodes.len32) {
- buf_destroy (ctx->b250); // 1st generation likely to have excessive length due to being all-new 4B nodes
- buf_destroy (ctx->local_hash); // 1st generation allocated based on wild guess
- buf_destroy (ctx->nodes); // 1st generation likely to have a lot more new nodes (+dict) that subsequent generations
- buf_destroy (ctx->dict);
- }
-
- dispatcher_increment_progress ("z_write", PROGRESS_UNIT); // writing done.
-
- z_file->num_vbs = MAX_(z_file->num_vbs, vb->vblock_i); // note: VBs are written out of order, so this can increase by 0, 1, or more than 1
- txt_file->num_vbs++;
-}
-
-// this is the main dispatcher function. It first processes the txt header, then proceeds to read
-// a VB from the input file and send it off to a thread for computation. When the thread
-// completes, this function proceeds to write the output to the output file. It can dispatch
-// several threads in parallel.
-void zip_one_file (rom txt_basename,
- bool is_last_user_txt_file) // the last user-specified txt file in this execution
-{
- Dispatcher dispatcher = 0;
- dispatcher_start_wallclock();
- if (flag.show_time_comp_i == flag.zip_comp_i) profiler_initialize(); // re-start wallclock
-
- z_file->txt_data_so_far_single = 0;
- z_file->num_components = MAX_(z_file->num_components, flag.zip_comp_i+1); // may increase further with generated components (in zip_update_txt_counters())
- evb->z_data.len = 0;
- evb->z_next_header_i = 0;
-
- // we calculate digest for each component seperately, stored in SectionHeaderTxtHeader (always 0 for generated components, or if modified)
- if (gencomp_comp_eligible_for_digest(NULL)) // if generated component - keep digest to display in progress after the last component
- z_file->digest_ctx = DIGEST_CONTEXT_NONE;
-
- if (!flag.bind || flag.zip_comp_i == COMP_MAIN)
- prev_file_first_vb_i = prev_file_last_vb_i = 0; // reset if we're not binding
-
- // initalize pre-defined ctxs before segconf
- // note: generic_is_header_done as well as segconf may change the data type and re-initialize the contexts
- if (z_file->num_txts_so_far == 0) // first component of this z_file
- ctx_initialize_predefined_ctxs (z_file->contexts, txt_file->data_type, z_file->d2d_map, &z_file->num_contexts);
-
- segconf_zip_initialize(); // before txtheader
-
- uint32_t first_vb_i = prev_file_last_vb_i + 1;
-
- // read the txt header, assign the global variables, and write the compressed header to the GENOZIP file
- int64_t txt_header_offset = -1;
- int64_t txt_header_len = txtheader_zip_read_and_compress (&txt_header_offset, flag.zip_comp_i); // also increments z_file->num_txts_so_far
-
- bool success = (txt_header_len >= -1);
- if (!success) goto finish; // eg 2nd+ VCF file cannot bind, because of different sample names
-
- DT_FUNC (txt_file, zip_initialize)();
-
- segconf_calculate();
-
- DT_FUNC (txt_file, zip_after_segconf)();
-
- static uint64_t target_progress=0;
- if ((Z_DT(FASTQ) && flag.pair != PAIR_R2) || // note: if 2nd of a FASTQ file pair - we leave the target as it was in the first file as seggable_size is not calculated for the 2nd file
- (flag.deep && flag.zip_comp_i <= SAM_COMP_FQ00) ||
- (!flag.deep && !Z_DT(FASTQ))) {
-
- int64_t progress_unit = txt_file->est_num_lines ? txt_file->est_num_lines : txtfile_get_seggable_size();
-
- target_progress = progress_unit * (3 + segconf.zip_txt_modified) // read, (modify), seg, compress
- + (!flag.make_reference && !flag.seg_only && !flag.biopsy) * progress_unit; // write
- }
-
- if (flag.debug_progress)
- iprintf ("zip_comp_i=%u : target_progress=%"PRIu64"\n", flag.zip_comp_i, target_progress);
-
- dispatcher =
- dispatcher_fan_out_task (ZIP_TASK_NAME, txt_basename,
- target_progress, // target progress: 1 for each read, compute, write
- target_progress ? NULL : txt_file->is_remote ? "Downloading & compressing..." : "Compressing...",
- !flag.make_reference, // allow callbacks to zip_complete_processing_one_vb not in order of VBs (not allowed for make-reference as contigs need to be in consistent order)
- false, // not test mode
- flag.xthreads, prev_file_last_vb_i, 5000, false,
- zip_prepare_one_vb_for_dispatching,
- zip_compress_one_vb,
- zip_complete_processing_one_vb);
-
- // verify that entire file was read (if file size is known)
- ASSERT (txt_file->disk_so_far == txt_file->disk_size || !txt_file->disk_size ||
- flag.lines_last != NO_LINE || // only of a subset of the file was compressed, at user request
- is_read_via_ext_decompressor (txt_file) ||
- (txt_file->disk_so_far > txt_file->disk_size && flag.truncate), // case of compressing a file while it is still downloading - we will likely compress more than the disk_size bytes recorded upon file open
- "Failed to compress entire file: file size is %s, but only %s bytes were compressed",
- str_int_commas (txt_file->disk_size).s, str_int_commas (txt_file->disk_so_far).s);
-
- if (txt_file->codec == CODEC_BGZF)
- bgzf_finalize_discovery();
-
- zriter_wait_for_bg_writing(); // complete writing VBs before moving on
-
- dispatcher_calc_avg_compute_vbs (dispatcher);
-
- dispatcher_increment_progress ("txt_header", txt_file->est_num_lines ? 3 : (txt_header_len * 3)); // account for txt_header read, computed and written
-
- // go back and update some fields in the txt header's section header and genozip header
- if (txt_header_offset >= 0) // note: this will be -1 if we didn't write a SEC_TXT_HEADER section for any reason (e.g. SAM PRIM/DEPN, --make-reference...)
- zfile_update_txt_header_section_header (txt_header_offset);
-
- ASSERT0 (!flag.biopsy || biopsy_is_done(), "Biopsy request not complete - some VBs missing");
-
- // write the BGZF section containing BGZF block sizes, if this txt file is compressed with BGZF
- if (txt_file->codec == CODEC_BGZF)
- bgzf_compress_bgzf_section();
-
- // if this a non-bound file, or the last component of a bound file - write the genozip header, random access and dictionaries
-finish:
- z_file->txt_file_disk_sizes[flag.zip_comp_i] = txt_file->disk_size ? txt_file->disk_size // actual file size on disk, if we know it (we don't if its a remote or stdin file)
- : (int64_t)txt_file->disk_so_far + (txt_file->codec==CODEC_BGZF ? BGZF_EOF_LEN : 0); // data (plain, BGZF, GZ or BZ2) read from the file descriptor (we won't have correct src data here if reading through an external decompressor - but luckily txt_file->disk_size will capture that case)
- z_file->txt_file_disk_sizes_sum += z_file->txt_file_disk_sizes[flag.zip_comp_i];
-
- z_file->comp_codec[flag.zip_comp_i] = txt_file->codec;
- z_file->comp_source_codec[flag.zip_comp_i] = txt_file->source_codec;
-
- // (re-)index sections after adding this txt_file
- sections_create_index();
-
- // reconstruction plan (for VCF - for DVCF or --sort, for SAM - re-integrate supp/secondary alignments)
- if (!flag.seg_only && DTPZ(generate_recon_plan))
- DTPZ(generate_recon_plan)(); // should set z_file->z_closes_after_me if we need to close after this component after all
-
- if (z_file->z_closes_after_me && !flag.seg_only) { // note: for SAM, z_closes_after_me might be updated in sam_zip_generate_recon_plan
- // if we used the aligner with REF_EXT_STORE, we make sure all the CHROMs referenced are in the CHROM context, so
- // as SEC_REF_CONTIGS refers to them. We do this by seeing which contigs have any bit set in is_set.
- // note: in REF_EXTERNAL we don't use is_set, so we populate all contigs in zip_initialize
- // note: must be before zip_after_vbs() bc sam_zip_after_vbs() removes unused dict words (they are marked as used in ref_contigs_populate_aligned_chroms)
- if (flag.aligner_available && IS_REF_EXT_STORE) {
- THREAD_DEBUG (populate_aligned_chroms);
- ref_contigs_populate_aligned_chroms();
- }
-
- DT_FUNC (txt_file, zip_after_vbs)();
-
- zip_write_global_area();
- }
-
- zip_display_compression_ratio (digest_snapshot (&z_file->digest_ctx, NULL)); // Done for reference + final compression ratio calculation
-
- if (flag.md5 && flag.bind && z_file->z_closes_after_me &&
- ((flag.bind == BIND_FQ_PAIR && z_file->num_txts_so_far == 2) ||
- (flag.bind == BIND_SAM && z_file->num_txts_so_far == 3)))
- progress_concatenated_md5 (z_dt_name(), digest_snapshot (&z_file->digest_ctx, "file"));
-
- z_file->disk_size = z_file->disk_so_far;
-
- prev_file_first_vb_i = first_vb_i;
- dispatcher_finish (&dispatcher, &prev_file_last_vb_i,
- z_file->z_closes_after_me && !is_last_user_txt_file,
- flag.show_memory && z_file->z_closes_after_me && is_last_user_txt_file); // show memory
-
- if (!z_file->z_closes_after_me)
- ctx_reset_codec_commits();
-
- // no need to waste time freeing memory of the last file, the process termination will do that
- flag.let_OS_cleanup_on_exit = is_last_user_txt_file && z_file->z_closes_after_me && !arch_is_valgrind();
-
- DT_FUNC (txt_file, zip_finalize)(is_last_user_txt_file);
-
- segconf_free();
-
- if (flag.show_time_comp_i == flag.zip_comp_i)
- profiler_add_evb_and_print_report();
-}
+// ------------------------------------------------------------------
+// zip.c
+// Copyright (C) 2019-2024 Genozip Limited. Patent Pending.
+// Please see terms and conditions in the file LICENSE.txt
+//
+// WARNING: Genozip is proprietary, not open source software. Modifying the source code is strictly prohibited,
+// under penalties specified in the license.
+
+#include
+#include
+#include
+#include "zfile.h"
+#include "dispatcher.h"
+#include "zip.h"
+#include "seg.h"
+#include "random_access.h"
+#include "refhash.h"
+#include "ref_iupacs.h"
+#include "progress.h"
+#include "stats.h"
+#include "compressor.h"
+#include "bgzf.h"
+#include "txtheader.h"
+#include "threads.h"
+#include "contigs.h"
+#include "chrom.h"
+#include "biopsy.h"
+#include "dict_io.h"
+#include "gencomp.h"
+#include "aliases.h"
+#include "arch.h"
+#include "user_message.h"
+#include "zriter.h"
+#include "b250.h"
+#include "zip_dyn_int.h"
+
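+// show the final progress line for this component - elapsed time and, where applicable, the compression ratio vs the plain and/or source-compressed sizes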
+static void zip_display_compression_ratio (Digest md5)
+{
+ float z_bytes = MAX_((float)z_file->disk_so_far, 1.0); // at least one, to avoid division by zero in case of a z_bytes=0 issue
+ float plain_bytes = (float)z_file->txt_data_so_far_bind;
+ float comp_bytes = is_read_via_ext_decompressor (txt_file)
+ ? (float)txt_file->disk_size // 0 if via pipe or url, as we have no knowledge of file size
+ : (float)txt_file->disk_so_far; // unlike disk_size, works also for piped-in files (but not CRAM, BCF, XZ, ZIP)
+ float ratio_vs_plain = plain_bytes / z_bytes;
+ float ratio_vs_comp = -1;
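+    // note: ratio_vs_comp remains -1 ("not computed") for non-final components of a bound file - see the flag.bind case below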
+
+ if (flag.debug_progress)
+ iprintf ("Ratio calculation: ratio_vs_plain=%f = plain_bytes=%"PRIu64" / z_bytes=%"PRIu64"\n",
+ ratio_vs_plain, (uint64_t)plain_bytes, (uint64_t)z_bytes);
+
+ // in bind mode, we don't show compression ratio for files except for the last one
+ if (flag.bind) {
+
+ static float comp_bytes_bind = 0;
+ static FileType source_file_type = UNKNOWN_FILE_TYPE;
+
+ // reset for every set of bound files (we might have multiple sets of --pair)
+ if (z_file->num_txts_so_far == 1) {
+ comp_bytes_bind=0;
+ source_file_type = txt_file->type;
+ }
+
+    else if (source_file_type != txt_file->type) // heterogeneous source file types
+ source_file_type = UNKNOWN_FILE_TYPE;
+
+ comp_bytes_bind += comp_bytes;
+
+ if (z_file->z_closes_after_me) {
+ ratio_vs_comp = comp_bytes_bind / z_bytes; // compression vs .gz/.bz2/.bcf/.xz... size
+ if (flag.debug_progress)
+ iprintf ("Ratio calculation: ratio_vs_comp=%f = comp_bytes_bind=%"PRIu64" / z_bytes=%"PRIu64"\n",
+ ratio_vs_comp, (uint64_t)comp_bytes_bind, (uint64_t)z_bytes);
+ }
+ else
+ progress_finalize_component_time ("Done", md5);
+ }
+ else {
+ ratio_vs_comp = comp_bytes / z_bytes; // compression vs .gz/.bz2/.bcf/.xz... size
+ if (flag.debug_progress)
+ iprintf ("Ratio calculation: ratio_vs_comp=%f = comp_bytes=%"PRIu64" / z_bytes=%"PRIu64"\n",
+ ratio_vs_comp, (uint64_t)comp_bytes, (uint64_t)z_bytes);
+ }
+
+ // in bound files, for the non-last components, we already printed "Done" above
+ if (flag.bind && !z_file->z_closes_after_me) {}
+
+ // when making a reference, we don't care about the compression
+ else if (flag.make_reference || flag.zip_no_z_file)
+ progress_finalize_component_time ("Done", md5);
+
+    // Deep - too complicated to communicate compression vs the FASTQ and BAM/SAM source files - show only vs compression
+ else if (flag.deep)
+ progress_finalize_component_time_ratio ("Deep", ratio_vs_comp, md5);
+
+    // when compressing BAM, report only ratio_vs_comp (compared to the BGZF-compressed BAM - we don't care about the underlying plain BAM)
+    // Likewise, for files that don't have a compression extension (eg .gz), even though they may actually be compressed, eg .tbi (which is actually BGZF)
+ else if (Z_DT(BAM) || (txt_file && file_get_codec_by_txt_ft (FAF ? DT_FASTA : txt_file->data_type, txt_file->type, false) == CODEC_NONE))
+ progress_finalize_component_time_ratio (SRC_CODEC(CRAM)?"CRAM" : z_dt_name_faf(), ratio_vs_comp, md5);
+
+ else if (ratio_vs_comp >= 0) {
+ if (SRC_CODEC(NONE) || ratio_vs_comp < 1.05) // disk_so_far doesn't give us the true txt file size
+ progress_finalize_component_time_ratio (z_dt_name_faf(), ratio_vs_plain, md5);
+
+ else // source was compressed
+ progress_finalize_component_time_ratio_better (z_dt_name_faf(), ratio_vs_plain, file_exts[txt_file->type], ratio_vs_comp, md5);
+ }
+}
+
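+// table mapping (data type, dict_id) to a LocalGetLineCB callback that retrieves one line's data of that field (defined per data type in LOCAL_GET_LINE_CALLBACKS)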
+static struct { DataType dt; const uint64_t dict_id_num; LocalGetLineCB *func; } callbacks[] = LOCAL_GET_LINE_CALLBACKS;
+
+LocalGetLineCB *zip_get_local_data_callback (DataType dt, ContextP ctx)
+{
+ if (ctx && !ctx->no_callback)
+ for (unsigned i=0; i < ARRAY_LEN(callbacks); i++)
+ if (callbacks[i].dt == dt && callbacks[i].dict_id_num == ctx->dict_id.num)
+ return callbacks[i].func;
+
+ return NULL;
+}
+
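+// ZIP: set no_stons for contexts that have a local-data callback - their local buffer is populated via the callback, so it cannot also hold singletons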
+void zip_set_no_stons_if_callback (VBlockP vb)
+{
+ for (unsigned i=0; i < ARRAY_LEN(callbacks); i++)
+ if (callbacks[i].dt == vb->data_type) {
+ ContextP ctx = ctx_get_ctx (vb, callbacks[i].dict_id_num);
+
+ if (!ctx->no_callback) ctx->no_stons = true;
+ }
+}
+
+// after segging - if any context appears to contain only singleton snips (eg a unique ID),
+// we move it to local instead of needlessly cluttering the global dictionary
+static void zip_handle_unique_words_ctxs (VBlockP vb)
+{
+ START_TIMER;
+
+ for_ctx {
+ if (ctx->local.len || ctx->local_always || // local is not free to accept our singletons
+ ctx->no_stons || // don't change to LT_SINGLETON if we were explicitly forbidden having singletons
+ ctx->ltype == LT_SUPP || // local data might be created by codec (later)
+ (VB_DT(VCF) && dict_id_is_vcf_format_sf (ctx->dict_id))) // this doesn't work for FORMAT fields
+ continue;
+
+ // reset ltype to LT_SINGLETON so that we can use local for singletons (either here or in ctx_commit_node - subject to conditions).
+ // note: ltype was possibly assigned a different value in *_seg_initialize, but then local not utilized.
+ ctx->ltype = LT_SINGLETON;
+
+ if (!ctx->nodes.len || // no new words in this VB
+ ctx->nodes.len != ctx->b250.count || // not all new words in this VB are singletons
+ ctx->nodes.len < vb->lines.len / 5 || // don't bother if this is a rare field less than 20% of the lines
+ !ctx_can_have_singletons (ctx) || // this context is not allowed to have singletons
+ ctx->b250.count == 1) // only one word - better to handle with all_the_same rather than singleton
+ continue;
+
+ buf_free (ctx->local); // possibly local was allocated, but then not utilized
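+        // the dictionary data (all singletons) becomes this context's local data; the per-VB nodes and b250 are discarded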
+ buf_move (vb, ctx->local, CTX_TAG_LOCAL, ctx->dict);
+ buf_free (ctx->nodes);
+ buf_free (ctx->b250);
+ }
+
+ COPY_TIMER (zip_handle_unique_words_ctxs);
+}
+
+static bool zip_generate_local (VBlockP vb, ContextP ctx)
+{
+ START_TIMER;
+
+ ASSERT (ctx->dict_id.num, "tag_name=%s did_i=%u: ctx->dict_id=0 despite ctx->local containing data", ctx->tag_name, (unsigned)(ctx - vb->contexts));
+
+ ctx->ltype = dyn_int_get_ltype (ctx);
+
+    // case: local is LTEN (instead of native endianness) and machine is BGEN, so BGEN_*_buf ^ above did nothing.
+ bool need_lten = (ctx->local_is_lten && !flag.is_lten);
+
+ switch (ctx->ltype) {
+ case LT_BITMAP :
+ LTEN_bits ((BitsP)&ctx->local);
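+            // param records the number of unused (padding) bits in the last 64-bit word of the bitmap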
+ ctx->local.prm8[0] = ((uint8_t)64 - (uint8_t)(ctx->local.nbits % 64)) % (uint8_t)64;
+ ctx->local_param = true;
+ break;
+
+ case LT_UINT32 : case LT_hex32 : case LT_HEX32 : case LT_FLOAT32 :
+ if (need_lten) LTEN_u32_buf (&ctx->local, NULL);
+ else BGEN_u32_buf (&ctx->local, NULL);
+ break;
+
+ case LT_UINT16 : case LT_hex16 : case LT_HEX16 :
+ if (need_lten) LTEN_u16_buf (&ctx->local, NULL);
+ else BGEN_u16_buf (&ctx->local, NULL);
+ break;
+
+ case LT_UINT64 : case LT_hex64 : case LT_HEX64 : case LT_FLOAT64 :
+ if (need_lten) LTEN_u64_buf (&ctx->local, NULL);
+ else BGEN_u64_buf (&ctx->local, NULL);
+ break;
+
+ case LT_INT8 : interlace_d8_buf (&ctx->local, NULL);
+ break;
+
+ case LT_INT16 : if (need_lten) LTEN_interlace_d16_buf (&ctx->local, NULL);
+ else BGEN_interlace_d16_buf (&ctx->local, NULL);
+ break;
+
+ case LT_INT32 : if (need_lten) LTEN_interlace_d32_buf (&ctx->local, NULL);
+ else BGEN_interlace_d32_buf (&ctx->local, NULL);
+ break;
+
+ case LT_INT64 : if (need_lten) LTEN_interlace_d64_buf (&ctx->local, NULL);
+ else BGEN_interlace_d64_buf (&ctx->local, NULL);
+ break;
+
+ default : break;
+ }
+
+ // transpose if needed AND local is rectangular
+ if (ctx->dyn_transposed)
+ dyn_int_transpose (vb, ctx);
+
+ COPY_TIMER (zip_generate_local); // codec_assign measures its own time
+
+ // in case we are using "pair identical", drop this section if it is an R2 section identical to its R1 counterpart
+ if (is_fastq_pair_2 (vb) && fastq_zip_use_pair_identical (ctx->dict_id) &&
+ buf_issame (&ctx->local, &ctx->localR1, lt_width(ctx))) {
+
+ if (flag.debug_generate)
+ iprintf ("%s: %s[%u].local dropped because it is an R2 section which is identical to its R1 counterpart\n",
+ VB_NAME, ctx->tag_name, ctx->did_i);
+
+ // note: careful not to set local.len=0 bc this is called before merge, and ctx_drop_all_the_same (that
+ // is part of merge), relies on local.len to decide if it can drop b250 of pair-identical R2.b250 sections
+ return false;
+ }
+
+ codec_assign_best_codec (vb, ctx, NULL, SEC_LOCAL);
+
+ if (flag.debug_generate)
+ iprintf ("%s: %s[%u].local ltype=%s len=%"PRIu64" codec=%s\n", VB_NAME, ctx->tag_name, ctx->did_i,
+ lt_name (ctx->ltype), ctx->local.len, codec_name(ctx->lcodec));
+
+ return true;
+}
+
+// generate & write b250 data for all contexts - in random order, to reduce the chance of multiple VBs doing
+// codec_assign_best_codec for the same context at the same time, so that they can benefit from pre-assigned codecs
+void zip_compress_all_contexts_b250 (VBlockP vb)
+{
+ START_TIMER;
+ threads_log_by_vb (vb, "zip", "START COMPRESSING B250", 0);
+
+ // arrays of all contexts in this VB
+ ContextP ctxs[vb->num_contexts];
+ for (Did did_i=0; did_i < vb->num_contexts; did_i++) ctxs[did_i] = CTX(did_i);
+
+ // in each iteration, pick a context at random and remove it from the list
+ for (unsigned i=0; i < vb->num_contexts; i++) {
+
+ int ctx_i = global_max_threads > 1 ? ((clock()+1) * (vb->vblock_i+1)) % (vb->num_contexts - i) : 0; // force predictability with single thread
+
+ ContextP ctx = ctxs[ctx_i];
+ memmove (&ctxs[ctx_i], &ctxs[ctx_i+1], (vb->num_contexts - i - ctx_i - 1) * sizeof (ContextP));
+
+ if (!ctx->b250.len || ctx->b250_compressed) continue;
+
+ if (!b250_zip_generate (vb, ctx)) // generate the final b250 buffers from their intermediate form
+ continue; // dropped
+
+ if (dict_id_typeless (ctx->dict_id).num == flag.dump_one_b250_dict_id.num)
+ ctx_dump_binary (vb, ctx, false);
+
+ if (flag.show_time) codec_show_time (vb, "B250", ctx->tag_name, ctx->bcodec);
+
+ if (HAS_DEBUG_SEG(ctx) || flag.show_compress)
+ iprintf ("B250: %s: %s: b250.len=%"PRIu64" b250.count=%"PRIu64" nodes.len=%"PRIu64"\n",
+ VB_NAME, ctx->tag_name, ctx->b250.len, ctx->b250.count, ctx->nodes.len);
+
+ START_TIMER;
+ zfile_compress_b250_data (vb, ctx);
+ COPY_TIMER(fields[ctx->did_i]);
+
+ ctx->b250_compressed = true;
+ }
+
+ COPY_TIMER (zip_compress_ctxs); // same profiler for b250 and local as we breakdown by ctx underneath it
+}
+
+// generate & write local data for all contexts - in random order, to reduce the chance of multiple doing codec_assign_best_codec for the same context at the same time
+static void zip_compress_all_contexts_local (VBlockP vb)
+{
+ START_TIMER;
+ threads_log_by_vb (vb, "zip", "START COMPRESSING LOCAL", 0);
+
+ // first we handle local_dep=0 then local_dep=1 and finally local_dep=2
+ for (int dep_level=DEP_L0 ; dep_level < NUM_LOCAL_DEPENDENCY_LEVELS; dep_level++) {
+
+ // initialize list of contexts at this dependency level that need compression
+ ContextP ctxs[vb->num_contexts];
+ unsigned num_ctxs=0;
+ for_ctx_that ((ctx->local.len || ctx->local_always) && ctx->local_dep == dep_level && !ctx->local_compressed)
+ ctxs[num_ctxs++] = ctx;
+
+ while (num_ctxs) {
+ // pick a context at "random" and remove it from the list (not random if single thread)
+ int ctx_i = global_max_threads > 1 ? (65531 * (vb->vblock_i+1)) % num_ctxs : 0;
+ ContextP ctx = ctxs[ctx_i];
+ memmove (&ctxs[ctx_i], &ctxs[ctx_i+1], (num_ctxs - (ctx_i+1)) * sizeof (ContextP));
+ num_ctxs--;
+
+ ctx->local_compressed = true; // so we don't compress it again
+
+ if (!zip_generate_local (vb, ctx))
+ continue; // section dropped
+
+ if (dict_id_typeless (ctx->dict_id).num == flag.show_singletons_dict_id.num)
+ dict_io_show_singletons (vb, ctx);
+
+ if (dict_id_typeless (ctx->dict_id).num == flag.dump_one_local_dict_id.num)
+ ctx_dump_binary (vb, ctx, true);
+
+ if (flag.show_time) codec_show_time (vb, "LOCAL", ctx->tag_name, ctx->lcodec);
+
+ if (HAS_DEBUG_SEG(ctx) || flag.show_compress)
+ iprintf ("LOCAL: %s: L%u: %s: ltype=%s len=%"PRIu64" size=%"PRIu64" param=%"PRIu64"\n",
+ VB_NAME, dep_level, ctx->tag_name, lt_name (ctx->ltype), ctx->local.len, ctx->local.len * lt_width(ctx), ctx->local.param);
+
+ START_TIMER;
+ zfile_compress_local_data (vb, ctx, 0);
+ COPY_TIMER(fields[ctx->did_i]);
+
+ if (!ctx->dict_merged) // note: if dict_merged, we are in the second call to this function, and local consists of singletons
+ ctx->no_stons = true; // since we had data in local, we don't allow ctx_commit_node to move singletons to local
+
+ }
+ }
+
+ COPY_TIMER (zip_compress_ctxs); // same profiler for b250 and local as we breakdown by ctx underneath it
+}
+
+void zip_init_vb (VBlockP vb)
+{
+ if (DTPT(zip_init_vb)) DTPT(zip_init_vb)(vb); // data-type specific initialization of the VB
+}
+
+// called by main thread after VB has completed processing
+static void zip_update_txt_counters (VBlockP vb)
+{
+    // note: in case of a FASTQ with qname optimization or VCF with add_line_numbers, we already updated this in *_zip_init_vb
+ if (!flag.zip_lines_counted_at_init_vb)
+ txt_file->num_lines += vb->lines.len; // lines in this txt file
+
+ // counters of data AS IT APPEARS IN THE TXT FILE
+ z_file->num_lines += vb->lines.len; // lines in all bound files in this z_file
+
+ z_file->comp_num_lines[vb->comp_i] += vb->lines.len; // set also for DVCF rejects
+
+ z_file->txt_data_so_far_single_0 += (int64_t)vb->txt_size; // length of data before any modifications
+ z_file->txt_data_so_far_bind_0 += (int64_t)vb->txt_size;
+
+ // counter of data FOR PROGRESS DISPLAY
+ z_file->txt_data_so_far_single += (int64_t)vb->txt_size;
+
+ // counter of data in DEFAULT RECONSTRUCTION
+ z_file->txt_data_so_far_bind += vb->recon_size;
+
+ // per-component data for stats
+ z_file->txt_data_so_far_bind_0_comp[vb->comp_i] += (int64_t)vb->txt_size;
+
+    // note: in case of SAM gencomp MAIN, we add recon_size - assuming the discrepancy vs txt_data.len
+ // is only due to lines being deported to gencomp
+ z_file->txt_data_so_far_bind_comp[vb->comp_i] +=
+ (z_sam_gencomp && vb->comp_i==SAM_COMP_MAIN) ? vb->recon_size : Ltxt;
+
+ z_file->num_components = MAX_(z_file->num_components, vb->comp_i+1);
+
+ // Note: no data-type-specific code here, instead, put in *_zip_after_compute
+}
+
+// ZIP: free z_file buffers no longer needed before global sections
+static void zip_free_undeeded_zctx_bufs_after_seg (void)
+{
+ START_TIMER;
+
+ for_zctx {
+ buf_destroy (zctx->ston_hash);
+ buf_destroy (zctx->ston_ents);
+ buf_destroy (zctx->global_hash);
+ }
+
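+    // SAM/BAM gencomp (sag_*) and Deep buffers are no longer needed once all VBs have been segged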
+ buf_destroy (z_file->sag_grps);
+ buf_destroy (z_file->sag_grps_index);
+ buf_destroy (z_file->sag_alns);
+ buf_destroy (z_file->sag_qnames);
+ buf_destroy (z_file->sag_depn_index);
+ buf_destroy (z_file->sag_cigars);
+ buf_destroy (z_file->sag_seq);
+ buf_destroy (z_file->sag_qual);
+ buf_destroy (z_file->vb_start_deep_line);
+ buf_destroy (z_file->deep_ents);
+
+ buf_low_level_release_memory_back_to_kernel();
+
+ COPY_TIMER_EVB (zip_free_undeeded_zctx_bufs_after_seg);
+}
+
+// write all the sections at the end of the file, after all VB stuff has been written
+static void zip_write_global_area (void)
+{
+ START_TIMER;
+
+ #define THREAD_DEBUG(x) threads_log_by_vb (evb, "main_thread:global_area", #x, 0);
+
+ if (!flag.show_memory) // in show-mem, keep these, so we can report them.
+ zip_free_undeeded_zctx_bufs_after_seg();
+
+ codec_qual_show_stats();
+
+    // if we're making a reference, we need the RA data to populate the reference section chrom/first/last_pos ahead of ref_compress_ref
+ THREAD_DEBUG (finalize_random_access);
+ random_access_finalize_entries (&z_file->ra_buf); // sort RA, update entries that don't yet have a chrom_index
+
+ THREAD_DEBUG (compress_dictionaries);
+ dict_io_compress_dictionaries();
+
+ THREAD_DEBUG (compress_counts);
+ ctx_compress_counts();
+
+ THREAD_DEBUG (compress_subdicts);
+ ctx_compress_subdicts();
+
+ // store a mapping of the file's chroms to the reference's contigs, if they are any different
+ // note: not needed in REF_EXT_STORE, as we convert the stored ref_contigs to use chrom_index of the file's CHROM
+ if (IS_REF_EXTERNAL && DTFZ(chrom) != DID_NONE) {
+ THREAD_DEBUG (compress_chrom_2ref);
+ chrom_2ref_compress(gref);
+ }
+
+ // output reference, if needed
+ bool store_ref = (flag.reference & REF_STORED) || flag.make_reference;
+ if (store_ref) {
+ THREAD_DEBUG (compress_ref);
+ ref_compress_ref();
+ }
+
+ if (flag.make_reference) {
+ THREAD_DEBUG (compress_iupacs);
+ ref_iupacs_compress();
+
+ THREAD_DEBUG (compress_refhash);
+ refhash_compress_refhash();
+ }
+
+ // compress alias list, if this data_type has any aliases defined
+ THREAD_DEBUG (compress_aliases);
+ aliases_compress();
+
+ if (!segconf.disable_random_acccess) {
+ THREAD_DEBUG (compress_random_access);
+ // if this data has random access (i.e. it has chrom and pos), compress all random access records into evb->z_data
+ Codec codec = random_access_compress (&z_file->ra_buf, SEC_RANDOM_ACCESS, CODEC_UNKNOWN, flag.show_index ? RA_MSG_PRIM : NULL);
+
+ if (store_ref)
+ random_access_compress (ref_get_stored_ra (gref), SEC_REF_RAND_ACC, codec, flag.show_ref_index ? RA_MSG_REF : NULL);
+ }
+
+ THREAD_DEBUG (user_message);
+ user_message_compress();
+
+ THREAD_DEBUG (stats);
+ stats_generate();
+
+ // compress genozip header (including its payload sectionlist and footer) into evb->z_data
+ zfile_compress_genozip_header();
+
+ stats_finalize();
+
+ if (DTPZ(zip_free_end_of_z)) DTPZ(zip_free_end_of_z)();
+
+ COPY_TIMER_EVB (zip_write_global_area);
+}
+
+// entry point for ZIP compute thread
+static void zip_compress_one_vb (VBlockP vb)
+{
+ START_TIMER;
+
+ // we're just taking a biopsy of the txt data, so no need to actually compress.
+ if (flag.biopsy &&
+ !(segconf.sag_type && vb->comp_i == SAM_COMP_MAIN)) // except in MAIN of SAM/BAM gencomp - need to generate PRIM and DEPN VBs
+ goto after_compress;
+
+ // if the txt file is compressed with BGZF/GZIL, we uncompress now, in the compute thread
+ if (vb->txt_codec)
+ bgz_uncompress_vb (vb, vb->txt_codec); // some of the blocks might already have been decompressed while reading - we decompress the remaining
+
+ vb->txt_size = Ltxt; // this doesn't change with --optimize.
+
+ // clone global dictionaries while granted exclusive access to the global dictionaries
+ ctx_clone (vb);
+
+ // case we need to modify the data (--optimize etc): re-write VB before digest
+ if (segconf.zip_txt_modified && DTP(zip_modify) &&
+ !flag.make_reference && Ltxt && !vb_is_gencomp(vb))
+ zip_modify (vb);
+
+ vb->recon_size = Ltxt; // length after potentially modifying
+
+ // calculate the digest contribution of this VB, and the digest snapshot of this VB
+ if (zip_need_digest)
+ digest_one_vb (vb, true, NULL); // serializes VBs in order if MD5
+
+ // allocate memory for the final compressed data of this vb. allocate 1/8 of the
+    // vb's size in the (uncompressed) txt file - this is normally plenty; if not, we will realloc downstream
+ buf_alloc (vb, &vb->z_data, 0, vb->txt_size / 8, char, CTX_GROWTH, "z_data");
+
+ // split each line in this VB to its components
+ threads_log_by_vb (vb, "zip", "START SEG", 0);
+
+ seg_all_data_lines (vb);
+
+ if (flag.biopsy) goto after_compress; // in case of MAIN VB of SAM/BAM gencomp: we end our biopsy journey here
+
+ // identify dictionaries that contain only singleton words (eg a unique id) and move the data from dict to local
+ zip_handle_unique_words_ctxs (vb);
+
+ zfile_compress_vb_header (vb); // vblock header
+
+ if (flag.show_codec) {
+        DO_ONCE iprintf ("\n\nThe output of --show-codec-test: Testing a sample of up to %u bytes on ctx.local of each context.\n"
+ "Results in the format [codec bytes μsec] are in order of quality - the first was selected.\n", CODEC_ASSIGN_SAMPLE_SIZE);
+ }
+
+ bool need_compress = !flag.make_reference && !flag.seg_only;
+
+ // while vb_i=1 is busy merging, other VBs can handle local
+ if (vb->vblock_i != 1 && need_compress)
+ zip_compress_all_contexts_local (vb); // not yet locals that consist of singletons transferred from dict to local in ctx_merge_in_vb_ctx (these will have len=0 at this point)
+
+ dispatcher_increment_progress ("compress1", PROGRESS_UNIT/2); // 1/2 compression done
+
+ threads_log_by_vb (vb, "zip", "START MERGE", 0);
+
+ // for --make-reference we serialize merging by VB, so that contigs get their word_index in the order of the reference file
+ if (flag.make_reference) serializer_lock (make_ref_merge_serializer, vb->vblock_i);
+
+ // merge new words added in this vb into the z_file.contexts (zctx), ahead of b250_zip_generate().
+ // writing indices based on the merged dictionaries. all this is done while locking a mutex for each zctx.
+ // note: vb>=2 will block here, until vb=1 is completed
+ ctx_merge_in_vb_ctx (vb);
+
+ if (flag.make_reference) serializer_unlock (make_ref_merge_serializer);
+
+ if (need_compress) {
+ zip_compress_all_contexts_local (vb); // for vb=1 - all locals ; for vb>1 - locals which consist of singletons set in ctx_merge_in_vb_ctx (other locals were already compressed above)
+ zip_compress_all_contexts_b250 (vb);
+ }
+
+ dispatcher_increment_progress ("compress2", PROGRESS_UNIT-(PROGRESS_UNIT/2)); // 1/2 compression done
+
+ // merge in random access - IF it is used
+ if (!segconf.disable_random_acccess)
+ random_access_merge_in_vb (vb);
+
+after_compress:
+ // examples: compress data-type specific sections ; absorb gencomp lines
+ DT_FUNC (vb, zip_after_compress)(vb);
+
+ // tell dispatcher this thread is done and can be joined.
+ vb_set_is_processed (vb);
+
+ COPY_TIMER (compute);
+}
+
+// data sent through dispatcher fan out functions - to do: make this an opaque struct
+static VBIType prev_file_first_vb_i=0, prev_file_last_vb_i=0; // used if we're binding files - the vblock_i will continue from one file to the next
+
+// main thread: prepare one VB for dispatching - read its txt data and set vb->dispatch accordingly
+static void zip_prepare_one_vb_for_dispatching (VBlockP vb)
+{
+ // if we're compressing the 2nd file in a fastq pair (with --pair) - look back at the z_file data
+ // and copy the data we need for this vb. note: we need to do this before txtfile_read_vblock as
+ // we need the num_lines of the pair VB
+ bool R1_data_exhausted = false;
+ if (flag.pair == PAIR_R2) {
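+        // map this R2 VB to its R1 counterpart: the R1 VBs of the previous (paired) file were numbered prev_file_first_vb_i..prev_file_last_vb_i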
+ uint32_t pair_vb_i = prev_file_first_vb_i + (vb->vblock_i-1 - prev_file_last_vb_i);
+
+ if (pair_vb_i <= prev_file_last_vb_i)
+ fastq_read_pair_1_data (vb, pair_vb_i); // add the R1 sections z_data after the R2 sections
+ else
+ R1_data_exhausted = true; // R1 data is already exhausted. This is ok if R2 data is exhausted too.
+ }
+
+ // case: we have out-of-band txt_data waiting (for generated components) - compress this data first,
+ // before reading more data from the txt_file
+ if (gencomp_get_txt_data(vb))
+ goto dispatch;
+
+ else {
+ // --head diagnostic option in ZIP cuts a certain number of first lines from vb=1, and discards other VBs
+ if (vb->vblock_i != 1 && flag.has_head) {
+ vb->dispatch = DATA_EXHAUSTED;
+ return;
+ }
+
+ txtfile_read_vblock (vb);
+
+ if (Ltxt)
+ goto dispatch;
+
+ else if (gencomp_am_i_expecting_more_txt_data()) // more data might be coming from MAIN VBs currently computing
+ vb->dispatch = MORE_DATA_MIGHT_COME;
+
+ else
+ vb->dispatch = DATA_EXHAUSTED;
+
+        // note: the opposite case, where R2 has fewer reads than R1, is caught in txtfile_read_vblock
+ ASSINP (!R1_data_exhausted || vb->dispatch == DATA_EXHAUSTED, // we are expecting that if our pair R1 data is exhausted, then our R2 data is exhausted too
+ "Error: File %s has more FASTQ reads than its R1 mate (vb=%s)", txt_name, VB_NAME);
+
+ // error if stdin is empty - can happen only when redirecting eg "cat empty-file|./genozip -" (we test for empty regular files in main_genozip)
+ ASSINP0 (vb->vblock_i > 1 || txt_file->txt_data_so_far_single /* txt header data */,
+ "Error: Cannot compress stdin data because its size is 0");
+
+ if (flag.biopsy && vb->dispatch == DATA_EXHAUSTED)
+ biopsy_data_is_exhausted();
+
+ if (flag.debug_or_test) buflist_test_overflows(vb, __FUNCTION__);
+
+ return;
+ }
+
+dispatch:
+ vb->dispatch = READY_TO_COMPUTE;
+ txt_file->num_vbs_dispatched++;
+
+    if (vb->comp_i == COMP_MAIN) // note: we only update the MAIN comp from here; generated comps are updated elsewhere
+ gencomp_a_main_vb_has_been_dispatched();
+}
+
+// called main thread, as VBs complete (might be out-of-order)
+static void zip_complete_processing_one_vb (VBlockP vb)
+{
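+    // data-type-specific handling of the completed VB (runs on the main thread)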
+ DT_FUNC (vb, zip_after_compute)(vb);
+
+    // update z_data in memory (it's not written to disk yet)
+ zfile_update_compressed_vb_header (vb);
+
+ txt_file->max_lines_per_vb = MAX_(txt_file->max_lines_per_vb, vb->lines.len);
+
+ if (!flag.make_reference && !flag.seg_only)
+ zfile_output_processed_vb_ext (vb, true);
+
+ zip_update_txt_counters (vb);
+
+ // destroy some buffers of "first generation" contexts (those that didn't clone any nodes)
+ if (vb->vblock_i < 100) // don't bother checking for high vb_i
+ for_ctx_that (ctx->nodes.len32 && !ctx->ol_nodes.len32) {
+ buf_destroy (ctx->b250); // 1st generation likely to have excessive length due to being all-new 4B nodes
+ buf_destroy (ctx->local_hash); // 1st generation allocated based on wild guess
+            buf_destroy (ctx->nodes);      // 1st generation likely to have a lot more new nodes (+dict) than subsequent generations
+ buf_destroy (ctx->dict);
+ }
+
+ dispatcher_increment_progress ("z_write", PROGRESS_UNIT); // writing done.
+
+ z_file->num_vbs = MAX_(z_file->num_vbs, vb->vblock_i); // note: VBs are written out of order, so this can increase by 0, 1, or more than 1
+ txt_file->num_vbs++;
+}
+
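+// main thread: calculate the dispatcher's progress target for this component - one unit of estimated lines (or seggable bytes) per pipeline stage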
+uint64_t zip_get_target_progress (void)
+{
+ static uint64_t target_progress=0;
+ if ((Z_DT(FASTQ) && flag.pair != PAIR_R2) || // note: if 2nd of a FASTQ file pair - we leave the target as it was in the first file as seggable_size is not calculated for the 2nd file
+ (flag.deep && flag.zip_comp_i <= SAM_COMP_FQ00) ||
+ (!flag.deep && !Z_DT(FASTQ))) {
+
+ int64_t progress_unit = txt_file->est_num_lines ? txt_file->est_num_lines : txtfile_get_seggable_size();
+
+ target_progress = progress_unit * (3 + segconf.zip_txt_modified) // read, (modify), seg, compress
+ + (!flag.make_reference && !flag.seg_only && !flag.biopsy) * progress_unit; // write
+ }
+
+ DO_ONCE if (flag.debug_progress)
+ iprintf ("zip_comp_i=%u : target_progress=%s\n", flag.zip_comp_i, str_int_commas(target_progress).s);
+
+ return target_progress;
+}
+
+// this is the main dispatcher function. It first processes the txt header, then proceeds to read
+// a VB from the input file and send it off to a thread for computation. When the thread
+// completes, this function proceeds to write the output to the output file. It can dispatch
+// several threads in parallel.
+void zip_one_file (rom txt_basename,
+ bool is_last_user_txt_file) // the last user-specified txt file in this execution
+{
+ Dispatcher dispatcher = 0;
+ dispatcher_start_wallclock();
+ if (flag.show_time_comp_i == flag.zip_comp_i) profiler_initialize(); // re-start wallclock
+
+ z_file->txt_data_so_far_single = 0;
+ z_file->num_components = MAX_(z_file->num_components, flag.zip_comp_i+1); // may increase further with generated components (in zip_update_txt_counters())
+ evb->z_data.len = 0;
+ evb->z_next_header_i = 0;
+
+ txtfile_zip_finalize_codecs();
+
+    // we calculate the digest for each component separately, stored in SectionHeaderTxtHeader (always 0 for generated components, or if modified)
+ if (gencomp_comp_eligible_for_digest(NULL)) // if generated component - keep digest to display in progress after the last component
+ z_file->digest_ctx = DIGEST_CONTEXT_NONE;
+
+ if (!flag.bind || flag.zip_comp_i == COMP_MAIN)
+ prev_file_first_vb_i = prev_file_last_vb_i = 0; // reset if we're not binding
+
+    // initialize pre-defined ctxs before segconf
+ // note: generic_is_header_done as well as segconf may change the data type and re-initialize the contexts
+ if (z_file->num_txts_so_far == 0) // first component of this z_file
+ ctx_initialize_predefined_ctxs (z_file->contexts, txt_file->data_type, z_file->d2d_map, &z_file->num_contexts);
+
+ segconf_zip_initialize(); // before txtheader
+
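+    // VB numbering continues from the previous txt file when binding multiple files into one z_file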
+ uint32_t first_vb_i = prev_file_last_vb_i + 1;
+
+ // read the txt header, assign the global variables, and write the compressed header to the GENOZIP file
+ int64_t txt_header_offset = -1;
+ int64_t txt_header_len = txtheader_zip_read_and_compress (&txt_header_offset, flag.zip_comp_i); // also increments z_file->num_txts_so_far
+
+ bool success = (txt_header_len >= -1);
+ if (!success) goto finish; // eg 2nd+ VCF file cannot bind, because of different sample names
+
+ DT_FUNC (txt_file, zip_initialize)();
+
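+    // segconf: inspect a sample of the data to set per-file segging parameters (as noted above, this may also change the data type)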
+ segconf_calculate();
+
+ DT_FUNC (txt_file, zip_after_segconf)();
+
+ uint64_t target_progress = zip_get_target_progress();
+
+ dispatcher = dispatcher_fan_out_task (
+ ZIP_TASK_NAME, txt_basename,
+ target_progress, // target progress: 1 for each read, compute, write
+ target_progress ? NULL : txt_file->is_remote ? "Downloading & compressing..." : "Compressing...",
+        !flag.make_reference, // allow out-of-order callbacks to zip_complete_processing_one_vb (not allowed for make-reference, as contigs need to be in a consistent order)
+ false, // not test mode
+ flag.xthreads, prev_file_last_vb_i, 5000, false,
+ zip_prepare_one_vb_for_dispatching,
+ zip_compress_one_vb,
+ zip_complete_processing_one_vb);
+
+ // verify that entire file was read (with some exceptions)
+ bool appending = false;
+ ASSERT (txt_file->disk_so_far == txt_file->disk_size || // all good: entire file was read from disk
+ !txt_file->disk_size || // we don't know the disk size (e.g. redirected)
+            flag.has_head || // only a subset of the file was compressed, at user request
+ is_read_via_ext_decompressor (txt_file) || // we don't know how much was read from disk, because an external process did the reading
+ (!txt_file->is_remote && !txt_file->redirected && flag.truncate && (appending = (txt_file->disk_size != file_get_size (txt_file->name)))), // edge case: compressed a local file while it was being appended (e.g. while downloading)
+ "Failed to compress entire file: file size is %s, but only %s bytes were compressed",
+ str_int_commas (txt_file->disk_size).s, str_int_commas (txt_file->disk_so_far).s);
+
+ WARN_IF (appending, "%s was being appended by an external process while compression was in progress", txt_name);
+
+ // verify that the entire data is either decompressed or truncated away (doesn't work for external decompressors)
+ ASSERT (txt_file->disk_gz_uncomp_or_trunc == txt_file->disk_so_far || (!TXT_IS_BGZF && !TXT_IS_GZIL && !TXT_IS_GZ) || flag.has_head,
+ "Failed to process all source data: read %s bytes from disk, but decompressed %sonly %s bytes. txt_codec=%s",
+ str_int_commas (txt_file->disk_so_far).s, flag.truncate ? "or truncated " : "", str_int_commas (txt_file->disk_gz_uncomp_or_trunc).s,
+ txtfile_codec_name (z_file, flag.zip_comp_i).s);
+
+ if (TXT_IS_BGZF || TXT_IS_GZIL)
+ bgzf_finalize_discovery();
+
+ zriter_wait_for_bg_writing(); // complete writing VBs before moving on
+
+ dispatcher_calc_avg_compute_vbs (dispatcher);
+
+ dispatcher_increment_progress ("txt_header", txt_file->est_num_lines ? 3 : (txt_header_len * 3)); // account for txt_header read, computed and written
+
+ // go back and update some fields in the txt header's section header and genozip header
+ if (txt_header_offset >= 0) // note: this will be -1 if we didn't write a SEC_TXT_HEADER section for any reason (e.g. SAM PRIM/DEPN, --make-reference...)
+ zfile_update_txt_header_section_header (txt_header_offset);
+
+ ASSERT0 (!flag.biopsy || biopsy_is_done(), "Biopsy request not complete - some VBs missing");
+
+ // write the BGZF section containing BGZF block sizes, if this txt file is compressed with BGZF
+ if (TXT_IS_BGZF)
+ bgzf_compress_bgzf_section();
+
+    // if this is a non-bound file, or the last component of a bound file - write the genozip header, random access and dictionaries
+finish:
+    z_file->txt_file_disk_sizes[flag.zip_comp_i] = txt_file->disk_size ? txt_file->disk_size // actual file size on disk, if we know it (we don't if it's a remote or stdin file)
+ : (int64_t)txt_file->disk_so_far + (txt_file->codec==CODEC_BGZF ? BGZF_EOF_LEN : 0); // data (plain, BGZF, GZ or BZ2) read from the file descriptor (we won't have correct src data here if reading through an external decompressor - but luckily txt_file->disk_size will capture that case)
+ z_file->txt_file_disk_sizes_sum += z_file->txt_file_disk_sizes[flag.zip_comp_i];
+
+ // (re-)index sections after adding this txt_file
+ sections_create_index();
+
+ // reconstruction plan (for VCF - for DVCF or --sort, for SAM - re-integrate supp/secondary alignments)
+ if (!flag.seg_only && DTPZ(generate_recon_plan))
+ DTPZ(generate_recon_plan)(); // should set z_file->z_closes_after_me if we need to close after this component after all
+
+ if (z_file->z_closes_after_me && !flag.seg_only) { // note: for SAM, z_closes_after_me might be updated in sam_zip_generate_recon_plan
+        // if we used the aligner with REF_EXT_STORE, we make sure all the CHROMs referenced are in the CHROM context, so
+        // that SEC_REF_CONTIGS can refer to them. We do this by checking which contigs have any bit set in is_set.
+ // note: in REF_EXTERNAL we don't use is_set, so we populate all contigs in zip_initialize
+ // note: must be before zip_after_vbs() bc sam_zip_after_vbs() removes unused dict words (they are marked as used in ref_contigs_populate_aligned_chroms)
+ if (flag.aligner_available && IS_REF_EXT_STORE) {
+ THREAD_DEBUG (populate_aligned_chroms);
+ ref_contigs_populate_aligned_chroms();
+ }
+
+ DT_FUNC (txt_file, zip_after_vbs)();
+
+ zip_write_global_area();
+ }
+
+ zip_display_compression_ratio (digest_snapshot (&z_file->digest_ctx, NULL)); // Done for reference + final compression ratio calculation
+
+ if (flag.md5 && flag.bind && z_file->z_closes_after_me &&
+ ((flag.bind == BIND_FQ_PAIR && z_file->num_txts_so_far == 2) ||
+ (flag.bind == BIND_SAM && z_file->num_txts_so_far == 3)))
+ progress_concatenated_md5 (z_dt_name(), digest_snapshot (&z_file->digest_ctx, "file"));
+
+ z_file->disk_size = z_file->disk_so_far;
+
+ prev_file_first_vb_i = first_vb_i;
+ dispatcher_finish (&dispatcher, &prev_file_last_vb_i,
+ z_file->z_closes_after_me && !is_last_user_txt_file,
+ flag.show_memory && z_file->z_closes_after_me && is_last_user_txt_file); // show memory
+
+ if (!z_file->z_closes_after_me)
+ ctx_reset_codec_commits();
+
+ // no need to waste time freeing memory of the last file, the process termination will do that
+ flag.let_OS_cleanup_on_exit = is_last_user_txt_file && z_file->z_closes_after_me && !arch_is_valgrind();
+
+ DT_FUNC (txt_file, zip_finalize)(is_last_user_txt_file);
+
+ segconf_free();
+
+ if (flag.show_time_comp_i == flag.zip_comp_i)
+ profiler_add_evb_and_print_report();
+}
diff --git a/src/zip.h b/src/zip.h
index 1041d043..0a7338ac 100644
--- a/src/zip.h
+++ b/src/zip.h
@@ -14,6 +14,7 @@ extern void zip_one_file (rom vcf_basename, bool is_last_user_txt_file);
extern void zip_compress_all_contexts_b250 (VBlockP vb);
extern void zip_init_vb (VBlockP vb);
extern bool zip_is_input_exhausted (void);
+extern uint64_t zip_get_target_progress (void);
extern LocalGetLineCB *zip_get_local_data_callback (DataType dt, ContextP ctx);
extern void zip_set_no_stons_if_callback (VBlockP vb);