diff --git a/.vscode/launch.json b/.vscode/launch.json index e8b2bc28..78141441 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -16,10 +16,9 @@ // "program": "${workspaceFolder}/src/genounzip.exe", // "program": "${workspaceFolder}/src/genols-debug.exe", - "args" : ["-ft", "private/test/test.starling.vcf"], - + "args" : ["--echo", "-fX", "--truncate", "./private/test/gz.bgzf.truncated.fq.gz"], + "environment": [ - { "name": "GENOZIP_TEST", "value": "1", }, // needed for VER2 macro to work { "name": "GENOZIP_REFERENCE", "value": "c:\\Users\\divon\\genozip/public", }, ], diff --git a/.vscode/settings.json b/.vscode/settings.json index fdc9d9e8..51f78d19 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -40,7 +40,9 @@ "endianness.h": "c", "version.h": "c", "libgen.h": "c", - "compare": "c" + "compare": "c", + "progress.h": "c", + "pthread.h": "c" }, "cmake.sourceDirectory": "C:/Users/divon/genozip/src/onion", "cmake.configureOnOpen": false diff --git a/LICENSE.txt b/LICENSE.txt index 16869f69..d8882ddc 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -159,5 +159,5 @@ ABOVE STATED REMEDY FAILS OF ITS ESSENTIAL PURPOSE. END OF TERMS AND CONDITIONS -Genozip license version: 15.0.62 +Genozip license version: 15.0.63 diff --git a/RELEASE_NOTES.txt b/RELEASE_NOTES.txt index 25781ce4..f6f954b6 100644 --- a/RELEASE_NOTES.txt +++ b/RELEASE_NOTES.txt @@ -3,10 +3,15 @@ Note on versioning: - Minor version changes with bug fixes and minor feature updates - Some minor versions are skipped due to failed deployment pipelines -15.0.62 -- I/O optimizations for faster compression +15.0.63 18/7/2024 +- FASTQ: much faster compression of most MGI, most Element, and some Illumina FASTQs due to better scaling of CPU cores on machines with > 40 cores +- New option: --not-paired: used in combination with --deep to inform Genozip that the two FASTQ files provided are not paired-end. 
+- Bug fix: correct handling of BGZF-compressed files with a BGZF End-of-File block in their midst (instead of at their end): Until version 15.0.46 the file was compressed up to the BGZF EOF block, and the rest of the file was lost. From 15.0.48 to 15.0.62 Genozip errored in this situation. This edge case was discovered during development and has not been encountered so far in any real-world files. + +15.0.62 29/6/2024 +- Scaling to more cores thanks to improved method of handling disk I/O - Bug fixes -- New diagnostic options: --show-gz-uncomp, --generate-gzil +- New diagnostic options: --show-gz-uncomp, --generate-il1m - Removed bash autocomplete for genozip as it didn't work very well. If this was installed, it can be removed by manually editing ~/.bash_completion 15.0.61 22/6/2024 diff --git a/installers/LICENSE.html b/installers/LICENSE.html index c0399d72..1cd522a3 100644 --- a/installers/LICENSE.html +++ b/installers/LICENSE.html @@ -34,4 +34,4 @@ 10. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides Genozip on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Genozip and assume any risks associated with Your exercise of permissions under this License.

11. LIMITATION OF LIABILITY. TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, STRICT LIABILITY OR OTHER LEGAL OR EQUITABLE THEORY, SHALL LICENSOR OR DEVELOPER BE LIABLE FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER ARISING AS A RESULT OF THIS LICENSE OR OUT OF THE USE OR INABILITY TO USE GENOZIP (INCLUDING BUT NOT LIMITED TO DAMAGES FOR LOSS OF GOODWILL, WORK STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, FILE CORRUPTION, DATA LOSS, OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR LOSSES), EVEN IF LICENSOR OR DEVELOPER HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL LICENSOR'S OR DEVELOPER'S TOTAL LIABILITY TO LICENSEE FOR ALL DAMAGES (OTHER THAN AS MAY BE REQUIRED BY APPLICABLE LAW IN CASES INVOLVING PERSONAL INJURY) EXCEED THE AMOUNT OF $500 USD. THE FOREGOING LIMITATIONS WILL APPLY EVEN IF THE ABOVE STATED REMEDY FAILS OF ITS ESSENTIAL PURPOSE.

END OF TERMS AND CONDITIONS

-Genozip license version: 15.0.62

+Genozip license version: 15.0.63

diff --git a/installers/genozip-installer.exe b/installers/genozip-installer.exe index dee031fc..720488d5 100644 Binary files a/installers/genozip-installer.exe and b/installers/genozip-installer.exe differ diff --git a/installers/genozip-linux-x86_64.tar b/installers/genozip-linux-x86_64.tar index cbfdb47a..eacae736 100644 Binary files a/installers/genozip-linux-x86_64.tar and b/installers/genozip-linux-x86_64.tar differ diff --git a/installers/genozip-osx-arm.tar b/installers/genozip-osx-arm.tar index b5fe66b9..02cea589 100644 Binary files a/installers/genozip-osx-arm.tar and b/installers/genozip-osx-arm.tar differ diff --git a/installers/genozip-osx-x86.tar b/installers/genozip-osx-x86.tar index 0b0f0892..19bfabc8 100644 Binary files a/installers/genozip-osx-x86.tar and b/installers/genozip-osx-x86.tar differ diff --git a/src/Makefile b/src/Makefile index cb17f75e..4a9761b1 100644 --- a/src/Makefile +++ b/src/Makefile @@ -154,10 +154,10 @@ MY_SRCS = genozip.c genols.c context.c container.c strings.c stats.c arch.c tip. 
sam_seq.c sam_qual.c sam_sag_zip.c sam_sag_piz.c sam_sag_load.c sam_sag_ingest.c sam_sag_scan.c \ sam_bwa.c sam_bowtie2.c sam_bsseeker2.c sam_bsbolt.c sam_bismark.c sam_gem3.c sam_tmap.c sam_hisat2.c \ sam_blasr.c sam_dragen.c sam_minimap2.c sam_10xGenomics.c sam_biobambam.c sam_pos.c sam_deep.c \ - sam_star.c sam_abra2.c sam_optimize.c \ + sam_star.c sam_abra2.c sam_modify.c \ fastq.c fastq_desc.c fastq_seq.c fastq_qual.c fastq_deep.c fastq_saux.c deep.c \ fasta.c gff.c bed.c me23.c locs.c generic.c lookback.c compressor.c \ - buffer.c buf_struct.c buf_list.c random_access.c sections.c base64.c bgzf.c coverage.c txtheader.c \ + buffer.c buf_struct.c buf_list.c random_access.c sections.c base64.c mgzip.c coverage.c txtheader.c \ codec.c codec_bz2.c codec_lzma.c codec_acgt.c codec_domq.c codec_bsc.c codec_pacb.c \ codec_pbwt.c codec_none.c codec_htscodecs.c codec_longr.c codec_normq.c codec_homp.c codec_t0.c \ codec_smux.c codec_oq.c \ @@ -193,7 +193,7 @@ INCLUDES += dict_id_gen.h aes.h dispatcher.h profiler.h dict_id.h aliases.h txtf buffer.h buf_struct.h buf_list.h file.h context.h context_struct.h container.h seg.h text_license.h version.h compressor.h \ crypt.h genozip.h piz.h vblock.h zfile.h random_access.h regions.h reconstruct.h tar.h qname.h qname_flavors.h codec.h \ lookback.h tokenizer.h codec_longr_alg.c gencomp.h dict_io.h recon_plan_io.h tip.h deep.h filename.h stats.h multiplexer.h \ - reference.h ref_private.h refhash.h ref_iupacs.h aligner.h mutex.h bgzf.h coverage.h threads.h local_type.h \ + reference.h ref_private.h refhash.h ref_iupacs.h aligner.h mutex.h mgzip.h coverage.h threads.h local_type.h \ arch.h license.h file_types.h data_types.h base64.h txtheader.h writer.h zriter.h bases_filter.h genols.h contigs.h chrom.h \ vcf.h vcf_private.h sam.h sam_private.h me23.h fasta.h fasta_private.h gff.h bed.h locs.h generic.h \ fastq.h fastq_private.h user_message.h mac_compat.h b250.h zip_dyn_int.h qname_filter.h \ diff --git a/src/arch.c 
b/src/arch.c index 23cd6d8d..d23aec21 100644 --- a/src/arch.c +++ b/src/arch.c @@ -300,6 +300,7 @@ StrText arch_get_filesystem_type (FileP file) NAME (0x2011bab0, "exFAT"); // Filesystem for flash memory: https://en.wikipedia.org/wiki/ExFAT NAME (0x9123683e, "brtfs"); // Copy-on-write filesystem for Linux: https://docs.kernel.org/filesystems/btrfs.html NAME (0x794C7630, "OverlayFS");// A union-mount filesystem: https://en.wikipedia.org/wiki/OverlayFS + NAME (0xf15f, "eCryptfs"); // A cryptographic filesystem for Linux: https://www.ecryptfs.org/ default: snprintf (s.s, sizeof (s.s), "0x%lx", fs.f_type); } @@ -521,7 +522,7 @@ static bool arch_is_exec_in_path (rom exec) } bool wget_available (void) -{ +{ static thool installed = unknown; if (installed == unknown) // note: wget not used on Windows, bc I can't get it to output to stdout, and also earlier wget versions may be adding \r ... : https://stackoverflow.com/questions/8522983/wget-of-binary-file-piped-into-other-commands-on-windows-breaks-the-binary diff --git a/src/b250.c b/src/b250.c index 70136162..4ebd2e57 100644 --- a/src/b250.c +++ b/src/b250.c @@ -9,6 +9,7 @@ #include "b250.h" #include "context.h" #include "codec.h" +#include "file.h" // single-length encoding (up to 15.0.37) // Format on data in Context.b250: Each entry is either a single-byte special-code value 0xFA-0xFF, OR a 1, 2 or 4 big-endian integer. 
@@ -259,12 +260,22 @@ bool b250_zip_generate (VBlockP vb, ContextP ctx) ctx->b250.size -= shortened_by; // in case we are using "pair identical", drop this section if it is an R2 section identical to its R1 counterpart - if (is_fastq_pair_2 (vb) && fastq_zip_use_pair_identical (ctx->dict_id) && buf_issame (&ctx->b250, &ctx->b250R1, 1)) { - ctx->b250.len = 0; - - if (flag.debug_generate) iprintf ("%s: %s[%u].b250 dropped because it is an R2 section which is identical to its R1 counterpart\n", VB_NAME, ctx->tag_name, ctx->did_i); + if (is_fastq_pair_2 (vb) && fastq_zip_use_pair_identical (ctx->dict_id)) { + + if (buf_issame (&ctx->b250, &ctx->b250R1, 1)) { + ctx->b250.len = 0; + ret = false; + + if (flag.debug_generate) iprintf ("%s: %s[%u].b250 dropped because it is an R2 section which is identical to its R1 counterpart\n", VB_NAME, ctx->tag_name, ctx->did_i); + } - ret = false; + // if we know the flavor - verify that VBs are indeed paired by requiring that all QNAME components (except for QmNAME) are identical + else if (segconf.qname_flavor[QNAME1] && IN_RANGE(ctx->did_i, FASTQ_Q0NAME, FASTQ_QmNAME) && + // note: in deep the sections don't always match as a deeped read will have copy-from-deep, but if its mate is missing from SAM, it will be segged differently. + // since we don't error if deep, it is possible that fastq files that are not aligned paired-end will be segged as such if they are close enough + // so that their VBs get divvied up by txtfile_read_vblock in the same way, and paired VBs have the same number of lines. no harm. 
+ !flag.deep) + ABORTINP (NO_PAIR_FMT_PREFIX "%s %s.b250 is not identical to R1)", txt_name, VB_NAME, ctx->tag_name); } // xxx - print b250 if show_b250 diff --git a/src/bam_seg.c b/src/bam_seg.c index 13da9ead..1ddd6c64 100644 --- a/src/bam_seg.c +++ b/src/bam_seg.c @@ -65,14 +65,14 @@ static int32_t bam_unconsumed_scan_forwards (VBlockP vb) return aln_size - (i - txt_len); // we pass the data of the final, partial, alignment to the next VB } -static int32_t bam_unconsumed_scan_backwards (VBlockP vb, uint32_t first_i, int32_t *i) +static int32_t bam_unconsumed_scan_backwards (VBlockP vb, uint32_t first_i) { - *i = MIN_(*i, Ltxt - sizeof(BAMAlignmentFixed)); + int32_t last_i = Ltxt - sizeof(BAMAlignmentFixed); // find the first alignment in the data (going backwards) that is entirely in the data - // we identify and alignment by l_read_name and read_name - for (; *i >= (int32_t)first_i; (*i)--) { - const BAMAlignmentFixed *aln = (const BAMAlignmentFixed *)Btxt (*i); + for (; last_i >= (int32_t)first_i; (last_i)--) { + const BAMAlignmentFixed *aln = (const BAMAlignmentFixed *)Btxt (last_i); uint32_t block_size = LTEN32 (aln->block_size); if (block_size > 100000000) continue; // quick short-circuit - more than 100M for one alignment - clearly wrong @@ -81,7 +81,7 @@ static int32_t bam_unconsumed_scan_backwards (VBlockP vb, uint32_t first_i, int3 uint16_t n_cigar_op = LTEN16 (aln->n_cigar_op); // test to see block_size makes sense - if ((uint64_t)*i + (uint64_t)block_size + 4 > (uint64_t)vb->txt_data.len || // 64 bit arith to catch block_size=-1 that will overflow in 32b + if ((uint64_t)last_i + (uint64_t)block_size + 4 > (uint64_t)vb->txt_data.len || // 64 bit arith to catch block_size=-1 that will overflow in 32b block_size + 4 < sizeof (BAMAlignmentFixed) + 4*n_cigar_op + aln->l_read_name + l_seq + (l_seq+1)/2) continue; @@ -122,7 +122,7 @@ static int32_t bam_unconsumed_scan_backwards (VBlockP vb, uint32_t first_i, int3 // agree with our formula. 
see comment in bam_reg2bin // all tests passed - this is indeed an alignment - return Ltxt - (*i + LTEN32 (aln->block_size) + 4); // everything after this alignment is "unconsumed" + return Ltxt - (last_i + LTEN32 (aln->block_size) + 4); // everything after this alignment is "unconsumed" } return -1; // we can't find any alignment - need more data (lower first_i) @@ -130,9 +130,9 @@ static int32_t bam_unconsumed_scan_backwards (VBlockP vb, uint32_t first_i, int3 // returns the length of the data at the end of vb->txt_data that will not be consumed by this VB is to be passed to the next VB // if first_i > 0, we attempt to heuristically detect the start of a BAM alignment. -int32_t bam_unconsumed (VBlockP vb, uint32_t first_i, int32_t *i) +int32_t bam_unconsumed (VBlockP vb, uint32_t first_i) { - ASSERT (*i >= 0 && *i < Ltxt, "*i=%d is ∉ [0,%u]", *i, Ltxt); + ASSERTNOTZERO (Ltxt); int32_t result; @@ -142,7 +142,7 @@ int32_t bam_unconsumed (VBlockP vb, uint32_t first_i, int32_t *i) // stringent -either CIGAR needs to match seq_len, or qname needs to match flavor else - result = bam_unconsumed_scan_backwards (vb, first_i, i); + result = bam_unconsumed_scan_backwards (vb, first_i); return result; // if -1 - we will be called again with more data } @@ -218,7 +218,7 @@ void bam_seg_BIN (VBlockSAMP vb, ZipDataLineSAMP dl, uint16_t bin /* used only i static inline void bam_seg_ref_id (VBlockSAMP vb, ZipDataLineSAMP dl, Did did_i, int32_t ref_id, int32_t compare_to_ref_i) { - ASSERT (ref_id == -1 || (sam_hdr_contigs && IN_RANGE (ref_id, 0, sam_hdr_contigs->contigs.len32-1)), + ASSERT (ref_id == -1 || (sam_hdr_contigs && IN_RANGE (ref_id, 0, sam_hdr_contigs->contigs.len32)), "%s: encountered %s.ref_id=%d but header has only %u contigs%s", LN_NAME, CTX(did_i)->tag_name, ref_id, sam_hdr_contigs ? sam_hdr_contigs->contigs.len32 : 0, MP(LONGRANGER) ? ". This is a known longranger bug (samtools won't accept this file either)." 
: ""); @@ -288,16 +288,16 @@ void bam_get_one_aux (VBlockSAMP vb, int16_t idx, switch (*type) { // in case of an numeric type, we pass the value as a ValueType - case 'i': *value_len = 4; numeric->i = (int32_t)LTEN32 (GET_UINT32 (aux)); break; - case 'I': *value_len = 4; numeric->i = LTEN32 (GET_UINT32 (aux)); break; - case 'f': *value_len = 4; numeric->f32.f = LTEN32F (GET_FLOAT32 (aux)); break; // note: this DOES NOT result in the correct value in last_value.f - case 's': *value_len = 2; numeric->i = (int16_t)LTEN16 (GET_UINT16 (aux)); break; - case 'S': *value_len = 2; numeric->i = LTEN16 (GET_UINT16 (aux)); break; - case 'c': *value_len = 1; numeric->i = (int8_t)*aux; break; - case 'C': *value_len = 1; numeric->i = (uint8_t)*aux; break; + case 'i': *value_len = 4; numeric->i = (int32_t)GET_UINT32 (aux); break; + case 'I': *value_len = 4; numeric->i = GET_UINT32 (aux); break; + case 'f': *value_len = 4; numeric->f32.f = GET_FLOAT32 (aux); break; // note: this DOES NOT result in the correct value in last_value.f + case 's': *value_len = 2; numeric->i = (int16_t)GET_UINT16 (aux); break; + case 'S': *value_len = 2; numeric->i = GET_UINT16 (aux); break; + case 'c': *value_len = 1; numeric->i = (int8_t)*aux; break; + case 'C': *value_len = 1; numeric->i = (uint8_t)*aux; break; case 'Z': - case 'H': *value_len = vb->aux_lens[idx] - 4; *value = aux; break; // value_len excludes the terminating \0 - case 'A': *value_len = 1; *value = aux; break; + case 'H': *value_len = vb->aux_lens[idx] - 4; *value = aux; break; // value_len excludes the terminating \0 + case 'A': *value_len = 1; *value = aux; break; // in case of a numerical value we pass the data as is, in machine endianity case 'B': diff --git a/src/bgzf.c b/src/bgzf.c deleted file mode 100644 index 51305729..00000000 --- a/src/bgzf.c +++ /dev/null @@ -1,1288 +0,0 @@ -// ------------------------------------------------------------------ -// bgzf.c -// Copyright (C) 2020-2024 Genozip Limited. Patent Pending. 
-// Please see terms and conditions in the file LICENSE.txt -// -// WARNING: Genozip is proprietary, not open source software. Modifying the source code is strictly prohibited -// and subject to penalties specified in the license. - -#include - -#include "igzip/igzip_lib.h" -#include "libdeflate_1.19/libdeflate.h" -#include "libdeflate_1.7/libdeflate.h" -#include "zlib/zlib.h" -#include "bgzf.h" -#include "arch.h" -#include "file.h" -#include "zfile.h" -#include "zip.h" -#include "arch.h" -#include "txtfile.h" -#include "codec.h" -#include "threads.h" -#include "dispatcher.h" -#include "writer.h" -#include "gencomp.h" -#include "filename.h" -#include "strings.h" - -#define LIBDEFLATE_MAX_LEVEL 12 -#define ZLIB_MAX_LEVEL 9 - -// all data in Little Endian. Defined in https://datatracker.ietf.org/doc/html/rfc1952 and https://samtools.github.io/hts-specs/SAMv1.pdf -typedef struct __attribute__ ((packed, aligned(2))) BgzfHeader { // 18 bytes - uint8_t id1; // Gzip id - must be 31 (0x1f) - uint8_t id2; // Gzip id - must be 139 (0x8b) - uint8_t cm; // Compression Method - must be 8 - uint8_t flg; // Flags - must be 4 (FEXTRA) - uint32_t mtime; // Modification Time - uint8_t xfl; // eXtra Flags - uint8_t os; // Operating System - uint16_t xlen; // Size of extra fields - 6 if contain only BGZF (may be more) - uint8_t si1; // BGZF id - must be 66 (0x42) - uint8_t si2; // BGZF id - must be 67 (0x43) - uint16_t slen; // BGZF extra field length - must be 2 - uint16_t bsize; // BGZF extra field - (compressed block size -1) -} BgzfHeader; - -#define BGZF_HEADER_LEN ((int)sizeof(BgzfHeader)) - -typedef struct GzipFooter { - uint32_t crc32; // CRC32 of uncompressed data - uint32_t isize; // Input (i.e. 
uncompressed) Size -} GzipFooter; - -#define GZIP_FOOTER_LEN ((int)sizeof(GzipFooter)) - -typedef struct __attribute__ ((packed, aligned(2))) GzipHeader { // 10 bytes - uint8_t id1; // Gzip id - must be 31 (0x1f) - uint8_t id2; // Gzip id - must be 139 (0x8b) - uint8_t cm; // Compression Method - must be 8 - uint8_t flg; // Flags - must be 0 - uint32_t mtime; // Modification Time - must be 0 - uint8_t xfl; // eXtra Flags - must be 0 - uint8_t os; // Operating System - must be 3 -} GzipHeader; - -static FlagsBgzf bgzf_recompression_levels[1+MAX_FLAG_BGZF] = { - { .library = BGZF_LIBDEFLATE19, .level = 0, .has_eof_block = true }, // --bgzf=0 : BGZF blocks with no compression - { .library = BGZF_IGZIP, .level = 1, .has_eof_block = true }, // --bgzf=1 : note: this is IGZIP LVL0 - { .library = BGZF_IGZIP, .level = 2, .has_eof_block = true }, // --bgzf=2 : note: this is IGZIP LVL1 - { .library = BGZF_LIBDEFLATE19, .level = 1, .has_eof_block = true }, // --bgzf=3 - { .library = BGZF_LIBDEFLATE19, .level = 7, .has_eof_block = true }, // --bgzf=4 - { .library = BGZF_LIBDEFLATE19, .level = 9, .has_eof_block = true }, // --bgzf=5 -}; - -#define bgzf_no_recompression (FlagsBgzf){ .library = BGZF_NO_LIBRARY, .level = BGZF_NO_BGZF, .has_eof_block = false } - -rom gzstatus_name (GzStatus st) -{ - return IN_RANGE(st, 0, NUM_GZ_STATUSES-1) ? 
(rom[])GZSTATUS_NAMES[st] : "InvalidGzStatus"; -} - -// possible return values, see libdeflate_result in libdeflate.h -static rom libdeflate_error (int err) -{ - switch (err) { - case LIBDEFLATE_SUCCESS : return "SUCCESS"; - case LIBDEFLATE_BAD_DATA : return "BAD DATA"; - case LIBDEFLATE_SHORT_OUTPUT : return "SHORT OUTPUT"; - case LIBDEFLATE_INSUFFICIENT_SPACE : return "INSUFFICIENT SPACE"; - default : return "Undefined libdeflate error"; - } -} - -typedef struct { char s[100]; } BgzfBlockStr; -static BgzfBlockStr display_bb (GzBlockZip *bb) -{ - BgzfBlockStr s; - snprintf (s.s, sizeof (s.s), "{txt_index=%u txt_size=%u compressed_index=%u comp_size=%u is_decompressed=%u}", - bb->txt_index, bb->txt_size, bb->compressed_index, bb->comp_size, bb->is_decompressed); - return s; -} - -static void *bgzf_alloc (void *vb_, unsigned items, unsigned size, FUNCLINE) -{ - return codec_alloc_do ((VBlockP )vb_, (uint64_t)items * (uint64_t)size, 1, func, code_line); // all bzlib buffers are constant in size between subsequent compressions -} - -//-------------------------------------------------------------------- -// ZIP SIDE - library/level discovery -//-------------------------------------------------------------------- - -#define BGZF_DISCOVERY_MAX_TESTS 10 // maximum number of BGZF blocks to be tested - -void bgzf_initialize_discovery (FileP file) -{ - ASSERTNOTINUSE (file->bgzf_plausible_levels); - - if (file->codec == CODEC_BGZF) { - ARRAY_alloc (FlagsBgzf, ll, (LIBDEFLATE_MAX_LEVEL+1)+LIBDEFLATE_MAX_LEVEL+ZLIB_MAX_LEVEL, - false, file->bgzf_plausible_levels, evb, "txt_file->bgzf_plausible_levels"); - - int next=0; - for (int l=0; l <= LIBDEFLATE_MAX_LEVEL; l++) // level=0 only here, bc it would be the same in all libraries - ll[next++] = (FlagsBgzf){ .library = BGZF_LIBDEFLATE19, .level = l}; - - for (int l=1; l <= LIBDEFLATE_MAX_LEVEL; l++) - ll[next++] = (FlagsBgzf){ .library = BGZF_LIBDEFLATE7, .level = l}; - - for (int l=1; l <= ZLIB_MAX_LEVEL; l++) - ll[next++] = 
(FlagsBgzf){ .library = BGZF_ZLIB, .level = l}; - } - - else if (file->codec == CODEC_GZIL) { - // bug 1101: we don't yet know the plausible levels for GZIL - } - - else - ABORT ("Supported codec=%s for discovery", codec_name (file->codec)); -} - -// ZIP main thread -static void bgzf_discover_finalize_testing (BgzfLibraryType lib, BgzfLevel level) -{ - txt_file->bgzf_flags.library = lib; // assign field-by-field: careful not to modify bgzf_flags.has_eof_block - txt_file->bgzf_flags.level = level; - - if (flag.zip_comp_i < MAX_NUM_COMPS) // for stats - z_file->comp_bgzf[flag.zip_comp_i] = txt_file->bgzf_flags; -} - -// ZIP main thread -void bgzf_finalize_discovery (void) -{ - int n_levels = txt_file->bgzf_plausible_levels.len32; - - // case: there is no library/level combination for which we can decompress with bgzf=exact - if (n_levels == 0) { - bgzf_discover_finalize_testing (0, BGZF_COMP_LEVEL_UNKNOWN); // has BGZF, but cannot identify level - - if (flag.show_bgzf) - iprintf ("Discover:%s: is a BGZF file, generated by an unidentified library\n", txt_name); - } - - // case: one or more library/level combinations was verified with all test bgzf blocks (10 blocks, unless file is shorter) - else { - bgzf_discover_finalize_testing (B1ST(FlagsBgzf, txt_file->bgzf_plausible_levels)->library, B1ST(FlagsBgzf, txt_file->bgzf_plausible_levels)->level); - - if (flag.show_bgzf) - iprintf ("Discover: %s: %s %s level %u\n", txt_name, - (n_levels == 1) ? "Identified as BGZF generated with" : "Multiple plausible levels, arbitrarily selecting", - bgzf_library_name (txt_file->bgzf_flags.library, true), txt_file->bgzf_flags.level); - } - - if (flag.show_gz) { - iprintf ("%s: txt_codec=%s\n", txt_file->basename, txtfile_codec_name (z_file, flag.zip_comp_i).s); // same format as in txtfile_zip_finalize_codecs - exit_ok; - } -} - -// ZIP: test a BGZF block against all the remaining plausible levels, and eliminate those that don't match. 
-static void bgzf_discover_library_and_level (VBlockP vb, int test_block_i, STRp(comp), STRp(uncomp)) -{ - uint32_t header_len = TXT_IS_BGZF ? BGZF_HEADER_LEN : GZIL_HEADER_LEN; - if (comp_len <= header_len + GZIP_FOOTER_LEN) { - txt_file->bgzf_plausible_levels.len = 0; - - if (flag.show_bgzf || flag.show_gz) - iprintf ("%s: Block too small - could not identify compression library and level\n", txt_name); - - if (flag.show_gz) exit_ok; - - return; - } - - // ignore the header and footer of the block - comp += header_len; - comp_len -= header_len + GZIP_FOOTER_LEN; - - // compress with each of the remaining plausible levels - testing if the compression is identical to the actual - STRl (recomp, TXT_IS_BGZF ? BGZF_MAX_BLOCK_SIZE : GZIL_MAX_BLOCK_SIZE); - - for_buf (FlagsBgzf, ll, txt_file->bgzf_plausible_levels) { - - switch (ll->library) { - case BGZF_LIBDEFLATE19 : { - void *compressor = libdeflate_alloc_compressor (vb, ll->level, __FUNCLINE); - recomp_len = (uint32_t)libdeflate_deflate_compress (compressor, STRa(uncomp), recomp, BGZF_MAX_BLOCK_SIZE); - - libdeflate_free_compressor (compressor, __FUNCLINE); - break; - } - - case BGZF_LIBDEFLATE7 : { - void *compressor = libdeflate_alloc_compressor_1_7 (ll->level, vb); - recomp_len = (uint32_t)libdeflate_deflate_compress_1_7 (compressor, STRa(uncomp), recomp, BGZF_MAX_BLOCK_SIZE); - - libdeflate_free_compressor_1_7 (compressor); - break; - } - - case BGZF_ZLIB : { - z_stream strm = { .zalloc = bgzf_alloc, .zfree = codec_free_do, .opaque = vb }; - // deflateInit2 with the default zlib parameters, which is also the same as htslib does - ASSERT0 (deflateInit2 (&strm, ll->level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) == Z_OK, "deflateInit2 failed"); - - strm.next_in = (uint8_t *)uncomp; - strm.avail_in = uncomp_len; - strm.next_out = (uint8_t *)recomp; - strm.avail_out = sizeof (recomp); - ASSERT (deflate (&strm, Z_FINISH) == Z_STREAM_END, "deflate failed: msg=%s", strm.msg); - - recomp_len = sizeof (recomp) - 
strm.avail_out; - - ASSERT0 (deflateEnd (&strm) == Z_OK, "deflateEnd failed"); - break; - } - - default: ABORT ("Invalid library=%d", ll->library); - } - - bool plausible = str_issame (comp, recomp); - - if (flag.show_bgzf) - iprintf ("Discover[%d]: library %s level %u: size_in_file=%u size_in_test=%u plausible=%s\n", - test_block_i, bgzf_library_name (ll->library, true), ll->level, comp_len, recomp_len, YN(plausible)); - - if (!plausible) { - buf_remove (txt_file->bgzf_plausible_levels, FlagsBgzf, BNUM(txt_file->bgzf_plausible_levels, ll), 1); - ll--; fb_after--; // hack for_buf loop - } - } -} - -//-------------------------------------------------------------------- -// ZIP SIDE - decompress BGZF-compressed file and prepare BGZF section -//-------------------------------------------------------------------- - -static void bgzf_update_file_isizes (FileP file) -{ - // add isize to buffer that will be written to SEC_BGZF - if (file->gz_data.uncomp_len) { // don't store EOF block (bc isize=0 cannot be represented as (isize-1) ) - #define BGZF_INITIAL_ALLOC 16 // just of the sake of a bit of effeciency: 16 chosen carefully so 16*63000 < 1MB min vb_size but over segconf size - if (file->bgzf_isizes.len32 <= BGZF_INITIAL_ALLOC) { // entered thrice: when called from file_open_txt_read, segconf, and in first VB - buf_alloc (evb, &file->bgzf_isizes, 0, MAX_(BGZF_INITIAL_ALLOC, segconf.vb_size / 63000), uint16_t, 0, "txt_file->bgzf_isizes"); - buf_alloc (evb, &file->bgzf_starts, 0, MAX_(BGZF_INITIAL_ALLOC, segconf.vb_size / 63000), uint64_t, 0, "txt_file->bgzf_starts"); - } - - buf_append_one (file->bgzf_isizes, BGEN16 ((uint16_t)(file->gz_data.uncomp_len - 1))); // -1 to make the range 0..65535 - buf_append_one (file->bgzf_starts, file->disk_so_far - file->gz_data.len); // not BGEN bc not written to z_file. 
note: first block is read from file_open_txt_read before txt_file is assigned - } - else { - // if isize is 0, we're expecting an EOF block - ASSERT (str_issame_(file->gz_data.data, file->gz_data.comp_len, BGZF_EOF, BGZF_EOF_LEN), - "Corrupt BGZF block in %s offset=%"PRIu64" bgzf_block_size=%u: isize=0 but this is not an EOF block", - file->name, file->disk_so_far - file->gz_data.comp_len, file->gz_data.comp_len); - - ASSERT (file->disk_so_far == file->disk_size || // expected - !file->disk_size || is_read_via_ext_decompressor (file) || file->redirected || file->is_remote, // cases in which we can't reliably test this condition - "Corrupt BGZF file %s (size=%"PRIu64"): BGZF EOF block encountered at offset=%"PRIu64" length=%u, but this is not the end of the file", - file->name, file->disk_size, file->disk_so_far - file->gz_data.comp_len, file->gz_data.comp_len); - - if (txt_file) - txt_file->bgzf_flags.has_eof_block = true; - } -} - -void inc_disk_gz_uncomp_or_trunc_(FileP file, uint64_t inc, FUNCLINE) -{ - __atomic_add_fetch (&file->disk_gz_uncomp_or_trunc, inc, __ATOMIC_RELAXED); - - if (flag.show_gz_uncomp) - iprintf ("%s:%u: disk_gz_uncomp_or_trunc + %"PRIu64"\t= %"PRIu64"\n", func, code_line, inc, file->disk_gz_uncomp_or_trunc); -} - -// ZIP main thread: reads gzil data from disk into gz_data, and updates gz_data.comp_len/uncomp_len -// of the gzil block at the beginning of gz_data -// returns: discovering: GZ_SUCCESS, GZ_IS_NOT_GZIL -// otherwise: GZ_SUCCESS -GzStatus gzil_read_block (FileP file, // txt_file is not yet assigned when called from txtfile_discover_gz_codec - bool discovering, - bool *is_eof) // is the block indicated by comp_len/uncomp_len the final block in the file -{ - START_TIMER; - FILE *fp = (FILE *)file->file; - file->gz_data.comp_len = file->gz_data.uncomp_len = 0; // init - - // top up gz_data to GZIL_MAX_BLOCK_SIZE (or less if EOF) - // performance note: typical top-up is 200-250KB which is more than the block device read-ahead buffer 
of 128 KB, which means - // that fread will block on the second half. However, when using GZIL we are not de-compressing in the main thread - // (except for tiny amounts) so the disk is never idling and we are pushing up gzil data just as fast as the disk can read it. - // An R2 GZIL file (R2 always decompresses in the main thread) will be read through igzip for this reason - see txtfile_discover_gz_codec. - uint32_t bytes = txtfile_fread (file, fp, BAFT8(file->gz_data), GZIL_MAX_BLOCK_SIZE - file->gz_data.len32, &file->disk_so_far); - file->gz_data.len32 += bytes; - - // case: eof: no data in gz header - if (file->gz_data.len32 == 0) - return discovering ? GZ_IS_NOT_GZIL // empty file (not expected to ever happen - we already checked that the file is not empty) - : GZ_SUCCESS; // no more data in this file - that's ok - - // case: not GSIL header - else if (file->gz_data.len32 >= GZIL_HEADER_LEN && memcmp (B1ST8(file->gz_data), GZIL_HEADER, GZIL_HEADER_LEN)) { - if (discovering) - return GZ_IS_NOT_GZIL; - else - ABORT ("Encountered a GZIP block that unexpectedly is not GZIL in %s offset=%"PRIu64"\nSolution: use --no-bgzf", - file->basename, (uint64_t)ftello64 (fp) - bytes); - } - - // search for block size by beginning of next block. Because FASTQ is quick compressible, - // we expect the gz header of the next block to be in gz_data. note: we do this even if EOF, - // because gz_data might contain several gzil blocks. note: also NULL if data is too short. - uint8_t *isize_p = memmem (B1ST8(file->gz_data), file->gz_data.len32, - GZIL_ISIZE GZIL_HEADER, GZIL_ISIZE_LEN + GZIL_HEADER_LEN); - - // case: a non-last block was found. note: all blocks GZIL (except for the last) are expected to be 1 MB - if (isize_p && (file->gz_data.uncomp_len = LTEN32 (GET_UINT32(isize_p))) == 1 MB) { - file->gz_data.comp_len = BNUM (file->gz_data, isize_p + 4); - *is_eof = false; // we found the beginning of the next block, so this is definitly not the last block. 
this can happen even if feof(fp) tell us there nothing more to read in the file, but there are still more blocks in gz_data. - } - - // case: remaining data could be a final gzil block, we will know for sure when trying to decompress it - else if (file->gz_data.len32 >= GZIL_HEADER_LEN + GZIP_FOOTER_LEN && - (file->gz_data.uncomp_len = LTEN32 (GET_UINT32 (BAFT8(file->gz_data) - 4))) <= 1 MB) { - file->gz_data.comp_len = file->gz_data.len32; - *is_eof = true; - } - - // case: data in gz_data is does not contain a gzil block - either not gzil file is truncated - else { - if (discovering) - return GZ_IS_NOT_GZIL; - - // data is not GZIL somewhere in the middle of the file... - ASSERT (feof (fp), "Encountered a GZIP block that unexpectedly is not GZIL in %s offset=%"PRIu64"\nSolution: use --no-bgzf", - file->basename, (uint64_t)ftello64 ((FILE *)file->file) - file->gz_data.len); - - // case: final data in file is not a full GZIL block and truncation allowed: - // account and then ignore the data that will not be gz-decompressed - if (flag.truncate) { - WARN ("FYI: %s is truncated - its final GZIL block in incomplete. Dropping final %u bytes of the GZ data.", txt_name, file->gz_data.len32); - - inc_disk_gz_uncomp_or_trunc (file, file->gz_data.len); - file->gz_data.len32 = file->gz_data.uncomp_len = 0; - segconf.zip_txt_modified = true; - - *is_eof = true; - } - - else - ABORTINP ("%s is truncated mid-way through GZIL block. 
Tip: If this is expected, use --truncate to discard the final partial GZIL block", txt_name); - } - - if (!discovering && file->gz_data.uncomp_len) - file->bgzf_isizes.len++; // count GZIL blocks - - COPY_TIMER_EVB (gzil_read_block); - return GZ_SUCCESS; -} - -// ZIP: reads and validates a BGZF block -// returns: discoverying: GZ_SUCCESS, GZ_IS_NOT_GZIP, GZ_IS_GZIP_NOT_BGZF -// otherwise: GZ_SUCCESS, GZ_EOF_WITHOUT_EOF_BLOCK, GZ_TRUNCATED -static GzStatus bgzf_read_block_do (FileP file, // txt_file is not yet assigned when called from txtfile_discover_gz_codec - bool is_remote, bool discovering) -{ - FILE *fp = (FILE *)file->file; - file->gz_data.comp_len = file->gz_data.uncomp_len = 0; // init - - // top-up if needed - if (file->gz_data.len32 < BGZF_MAX_BLOCK_SIZE && !feof (fp)) { - uint32_t chunk_size = flag.zip_uncompress_source_during_read ? 150 KB // a bit more than default block-device read-ahead buffer for best parallelization between disk read-ahead and CPU decompression - : BGZF_MAX_CHUCK_SIZE; // bigger block is faster if we are prepared to yield the CPU when waiting for the disk - file->gz_data.len32 += txtfile_fread (file, fp, BAFT8(file->gz_data), chunk_size - file->gz_data.len32, &file->disk_so_far); - } - - BgzfHeader *h = B1ST (BgzfHeader, file->gz_data); - - // no data at all - if (file->gz_data.len32 == 0) - return discovering ? 
GZ_IS_NOT_GZIP // no data was read from this file at all - : GZ_EOF_WITHOUT_EOF_BLOCK; // EOF without an EOF block (we know this is a BGZF file bc this isn't the first block) - - // truncated mid-way through header - else if (file->gz_data.len32 < BGZF_HEADER_LEN) { - if (discovering) - return GZ_IS_NOT_GZIP; // file smaller than a gzip header - its not GZIP - - else if (flag.truncate) - return GZ_TRUNCATED; // truncated file - - else - ABORT ("file %s appears truncated - it ends with a partial gzip block header", file->basename); // less than the minimal gz block header size - } - - // case: this is not a GZ / BGZF block at all (see: https://tools.ietf.org/html/rfc1952) - else if (h->id1 != 31 || h->id2 != 139) { - if (discovering) - return GZ_IS_NOT_GZIP; - else - ABORT ("expecting %s to be compressed with gzip format, but it is not", file->basename); - } - - // case: this is GZIP block (by the magic) but it is NOT a valid BGZF block (see: https://samtools.github.io/hts-specs/SAMv1.pdf) - else if (memcmp (h, BGZF_PREFIX, BGZF_PREFIX_LEN)) { - if (discovering) - return GZ_IS_GZIP_NOT_BGZF; - else - ABORT ("Encountered a GZIP block that unexpectedly is not BGZF in %s offset=%"PRIu64"\nSolution: use --no-bgzf", - file->basename, (uint64_t)ftello64 (fp) - file->gz_data.len32); - } - - uint32_t body_size = (LTEN16 (h->bsize) + 1) - BGZF_HEADER_LEN; - - if (file->gz_data.len32 >= body_size) { - file->gz_data.comp_len = BGZF_HEADER_LEN + body_size; - - file->gz_data.uncomp_len = LTEN32 (GET_UINT32 (B8(file->gz_data, file->gz_data.comp_len-4))); // 0...65536 per spec: "isize / input size" = uncompressed data length - ASSERT (file->gz_data.uncomp_len <= 65536, "isize=%u ∉ [0,65536] in %s offset=%"PRIu64, file->gz_data.uncomp_len, file->basename, (uint64_t)ftello64 (fp) - file->gz_data.len32); - - return GZ_SUCCESS; - } - - // truncated first block: compress as non-GZIP - else if (discovering) - return GZ_IS_NOT_GZIP; - - else if (flag.truncate) - return GZ_TRUNCATED; - - 
else { // if failed, always error, even if in discovery (except if truncation allowed) - int save_errno = errno; // we want to report errno of fread, not ftell. - ABORT ("%s %s (ftell=%"PRId64" err=\"%s\" gz_data.len=%u but expecting=%u filesystem=%s). %s\n", - feof (fp) ? "Unexpected end of file while reading" : "Failed to read file", - file->basename, ftello64 (fp), - (is_remote && save_errno == ESPIPE) ? "Disconnected from remote host" : strerror (save_errno), - file->gz_data.len32, body_size, arch_get_txt_filesystem().s, - feof (fp) ? "Tip: If the file is expected to be truncated, you use --truncate to disregard the final partial BGZF block." : ""); - } -} - -// ZIP main thread: reads a BGZF block into gz_data -GzStatus bgzf_read_block (FileP file, bool discovering) -{ - START_TIMER; - - // with BGZF, gz_data is either empty or contains exactly 1 bgzf block - if (file->gz_data.comp_len) return GZ_SUCCESS; // we already have 1 block - - GzStatus ret = bgzf_read_block_do (file, file->is_remote, discovering); - - switch (ret) { - case GZ_SUCCESS: // successful read of a BGZF block - bgzf_update_file_isizes (file); - break; - - case GZ_IS_NOT_GZIP: - case GZ_IS_GZIP_NOT_BGZF: - ASSERT (discovering, "ret=%d expected only if discovering", ret); - break; // file->gz_data contains data that is not BGZF data - - case GZ_EOF_WITHOUT_EOF_BLOCK: // file ended without EOF block: that's fine - ASSERT0 (!discovering, "unexpected ret=GZ_EOF_WITHOUT_EOF_BLOCK unexpected when discovering"); - ret = GZ_SUCCESS; - break; // note: if file was not entirely read, we will detect that at the end of zip_one_file - - case GZ_TRUNCATED: // file ended mid-way through a BGZF block - ASSERT0 (!discovering, "unexpected ret=GZ_TRUNCATED unexpected when discovering"); - - // case: truncation allowed: account and then discard the data that will not be gz-decompressed - if (flag.truncate) { - WARN ("FYI: %s is truncated - its final BGZF block in incomplete. 
Dropping final %u bytes of the GZ data.", txt_name, file->gz_data.len32); - - inc_disk_gz_uncomp_or_trunc (file, file->gz_data.len); - file->gz_data.len32 = file->gz_data.comp_len = file->gz_data.uncomp_len = 0; // discard partial BGZF block - segconf.zip_txt_modified = true; - - ret = GZ_SUCCESS; - break; - } - - else - ABORTINP ("%s is truncated mid-way through BGZF block. Tip: If this is expected, use --truncate to discard the final partial BGZF block", txt_name); - - default: - ABORT ("Unexpected ret=%s", gzstatus_name (ret)); - } - - COPY_TIMER_EVB (bgzf_read_block); - return ret; -} - -// ZIP: BGZF section per component -void bgzf_compress_bgzf_section (void) -{ - // cases where we don't write the BGZF blocks section - if (!txt_file->bgzf_isizes.len || // this txt file is not compressed with BGZF - we don't need a BGZF section - txt_file->bgzf_flags.level == BGZF_COMP_LEVEL_UNKNOWN || // we don't know the level - so PIZ will reconstruct at default level - segconf.zip_txt_modified) // the file has changed and we can't reconstruct to the same blocks - return; - - // sanity check - int64_t total_isize = 0; - for_buf (uint16_t, isize_p, txt_file->bgzf_isizes) - total_isize += BGEN16 (*isize_p) + 1; // values 0-65535 correspond to isize 1-65536 - - ASSERT (total_isize == txt_file->txt_data_so_far_single + txt_file->last_truncated_line_len, "Expecting total_isize=%"PRId64" == txt_file->txt_data_so_far_single=%"PRId64, - total_isize, txt_file->txt_data_so_far_single); - - // get the best codec for the SEC_BGZF section - txt_file->bgzf_isizes.len *= sizeof (uint16_t); - Codec codec = codec_assign_best_codec (evb, NULL, &txt_file->bgzf_isizes, SEC_BGZF); - - evb->comp_i = flag.zip_comp_i; // this goes into SectionEntFileFormat.comp_i via sections_add_to_list - zfile_compress_section_data_ex (evb, NULL, SEC_BGZF, &txt_file->bgzf_isizes, NULL, 0, codec, (SectionFlags)txt_file->bgzf_flags, NULL); - txt_file->bgzf_isizes.len /= sizeof (uint16_t); // restore -} - -// 
uncompresses a BGZF block in vb->scratch referred to by bb, into its place in vb->txt_data as prescribed by bb -// might be called from main thread or compute threads -void bgz_uncompress_one_block (VBlockP vb, GzBlockZip *bb, Codec codec) -{ - if (bb->is_decompressed) return; // already decompressed (or a BGZF EOF block) - nothing to do - - ASSERT0 (vb->gzip_compressor, "vb->gzip_compressor=NULL"); - - int header_len = TXT_IS_BGZF ? BGZF_HEADER_LEN : GZIL_HEADER_LEN; - - uint8_t *h = B8(vb->scratch, bb->compressed_index); - - // verify that entire block is within vb->scratch - ASSERT (bb->compressed_index + header_len < vb->scratch.len && // we have at least the header - we can access bsize - bb->compressed_index + bb->comp_size <= vb->scratch.len, - "%s: %s block size goes past the end of in vb->scratch: bb=%s compressed_index=%u vb->scratch.len=%"PRIu64, - VB_NAME, codec_name (txt_file->codec), display_bb (bb).s, bb->compressed_index, vb->scratch.len); - - ASSERT (h[0]==31 && h[1]==139, "%s: invalid %s block in vb->scratch: compressed_index=%u", VB_NAME, codec_name (codec), bb->compressed_index); - - if (flag.show_bgzf) - iprintf ("UNCOMPRESS thread=%s %s i=%u comp_index=%u comp_len=%u txt_index=%u txt_len=%u eof=%s ", - threads_am_i_main_thread() ? "MAIN" : "COMPUTE", VB_NAME, - BNUM (vb->gz_blocks, bb), bb->compressed_index, bb->comp_size, bb->txt_index, bb->txt_size, TF(bb->is_eof)); - - enum libdeflate_result ret = - libdeflate_deflate_decompress (vb->gzip_compressor, - h + header_len, bb->comp_size - header_len - GZIP_FOOTER_LEN, // compressed - Btxt (bb->txt_index), bb->txt_size, NULL); // uncompressed - - // account for the case of decompression, and also the case bb is discarded due to a certain truncate situation (see below). 
- inc_disk_gz_uncomp_or_trunc (txt_file, bb->comp_size); - - // case: final GZIL block, which is truncated, but we have --truncate, and the garbage last word - // unluckily < 1MB so it went undetected as a legimiate block in gzil_read_block. we drop this block now. - if (ret != LIBDEFLATE_SUCCESS && vb->txt_codec == CODEC_GZIL && bb->is_eof) { - if (flag.truncate) - return; // with bb->is_decompressed=false - else { - ABORT ("Failed to decompress the final %s block of the file: %s. Tip: If it is expected that the file is truncated, use --truncate to ignore the defective final block.", - codec_name (vb->txt_codec), libdeflate_error(ret)); - } - } - - ASSERT (ret == LIBDEFLATE_SUCCESS, "libdeflate_deflate_decompress failed: %s", libdeflate_error(ret)); - - bb->is_decompressed = true; - - if (flag.show_bgzf) - #define C(i) ((bb->txt_index + i < Ltxt) ? char_to_printable (*Btxt (bb->txt_index + (i))).s : "") - iprintf ("txt_data[5]=%1s%1s%1s%1s%1s %s\n", C(0), C(1), C(2), C(3), C(4), bb->comp_size == BGZF_EOF_LEN ? 
"EOF" : ""); - #undef C - - // discover which gzip library and compression level were used (testing the first few BGZF blocks) - int test_block_i=0; - if (txt_file->bgzf_plausible_levels.len32 && - txt_file->bgzf_plausible_levels.count < BGZF_DISCOVERY_MAX_TESTS && // fail fast without atomic - (test_block_i = __atomic_fetch_add (&txt_file->bgzf_plausible_levels.count, 1, __ATOMIC_RELAXED)) < BGZF_DISCOVERY_MAX_TESTS) { // note: if multiple threads test in parallel, count might be incremented beyond BGZF_DISCOVERY_MAX_TESTS - that's ok - - bgzf_discover_library_and_level (vb, test_block_i, (rom)h, bb->comp_size, Btxt (bb->txt_index), bb->txt_size); - } - - // in case of --show_gz: report and exit (otherwise, we finalize in the main thread to avoid thread issues with updating txt_file->bgzf_flags) - if (flag.show_gz && (test_block_i == BGZF_DISCOVERY_MAX_TESTS-1 || !txt_file->bgzf_plausible_levels.len32)) - bgzf_finalize_discovery(); -} - -// ZIP: called from the compute thread: zip_compress_one_vb and main thread: txtfile_read_block_bgz -void bgz_uncompress_vb (VBlockP vb, Codec codec) -{ - START_TIMER; - - vb->gzip_compressor = libdeflate_alloc_decompressor(vb, __FUNCLINE); - - for_buf (GzBlockZip, bb, vb->gz_blocks) - bgz_uncompress_one_block (vb, bb, codec); - - libdeflate_free_decompressor ((struct libdeflate_decompressor **)&vb->gzip_compressor, __FUNCLINE); - - buf_free (vb->scratch); // now that we are finished decompressing we can free it - - if (flag.show_time) { - if (threads_am_i_main_thread ()) COPY_TIMER (bgzf_io_thread) - else COPY_TIMER (bgzf_compute_thread); - } - - COPY_TIMER (bgz_uncompress_vb); -} - -// ZIP: decompresses a prescribed BGZF block when re-reading DEPN lines -static inline void bgzf_uncompress_one_prescribed_block (VBlockP vb, STRp(bgzf_block), STRc(uncomp_block), uint64_t bb_i) -{ - START_TIMER; - - BgzfHeader *h = (BgzfHeader *)bgzf_block; - - if (flag.show_bgzf) - iprintf ("REREAD %s reread bb_i=%"PRIu64" comp_size=%u 
uncomp_size=%u ", - VB_NAME, bb_i, bgzf_block_len, uncomp_block_len); - - enum libdeflate_result ret = - libdeflate_deflate_decompress (vb->gzip_compressor, - h+1, bgzf_block_len - BGZF_HEADER_LEN - GZIP_FOOTER_LEN, // compressed - STRa(uncomp_block), NULL); // uncompressed - - ASSERT (ret == LIBDEFLATE_SUCCESS, "%s: libdeflate_deflate_decompress failed: %s. bgzf_block_len=%u uncomp_block_len=%u bb_i=%"PRIu64, - VB_NAME, libdeflate_error(ret), bgzf_block_len, uncomp_block_len, bb_i); - - if (flag.show_bgzf) - #define C(i) (i < uncomp_block_len ? char_to_printable (uncomp_block[i]).s : "") - iprintf ("txt_data[5]=%1s%1s%1s%1s%1s\n", C(0), C(1), C(2), C(3), C(4)); - #undef C - - COPY_TIMER (bgzf_uncompress_one_prescribed_block); -} - -// ZIP: re-reads and validates one BGZF block -static void bgzf_reread_one_prescribed_block (FILE *fp, uint64_t offset, qSTRp (bgzf_block)) -{ - ASSERT (!fseeko64 (fp, offset, SEEK_SET), - "fseeko64(%s, %"PRIu64") failed while rereading BGZF depn lines: %s", txt_name, offset, strerror(errno)); - - // read the header - uint32_t header_bytes = txtfile_fread (txt_file, fp, bgzf_block, BGZF_HEADER_LEN, NULL); - - // failed to read as prescribed - ASSERT (header_bytes == BGZF_HEADER_LEN && !memcmp (bgzf_block, BGZF_PREFIX, BGZF_PREFIX_LEN), - "failed to re-read a BGZF block header as perscribed BGZF: offset=%"PRIu64" bytes_read=%u header=%s", offset, header_bytes, str_to_hex ((bytes)bgzf_block, header_bytes).s); - - uint32_t body_size = (LTEN16 (((BgzfHeader*)bgzf_block)->bsize) + 1) - BGZF_HEADER_LEN; - uint32_t body_bytes = txtfile_fread (txt_file, fp, bgzf_block + BGZF_HEADER_LEN, body_size, NULL); - - ASSERT (body_bytes == body_size, "failed to re-read a BGZF block body as perscribed BGZF: offset=%"PRIu64" bytes_read=%u expected=%u", - offset, body_bytes, body_size); - - *bgzf_block_len = BGZF_HEADER_LEN + body_size; -} - -// ZIP: SAM/BAM: compute thread of a DEPN VB: actually re-reading data into txt_data according to 
// vb->reread_prescription
// For each RereadLine in vb->reread_prescription: re-reads from fp and decompresses the BGZF block(s) that
// contain the line, and appends the line's bytes at BAFTtxt, advancing Ltxt. A block is re-read only when its
// file offset differs from that of the block decompressed in the previous iteration.
void bgzf_reread_uncompress_vb_as_prescribed (VBlockP vb, FILE *fp)
{
    uint64_t last_offset = -1LL; // file offset of the block currently held in uncomp_block (initially: none)
    char uncomp_block[BGZF_MAX_BLOCK_SIZE];

    vb->gzip_compressor = libdeflate_alloc_decompressor(vb, __FUNCLINE);

    for_buf (RereadLine, line, vb->reread_prescription) {

        // a line might span 1 or more BGZF blocks
        while (line->line_len) {
            ASSERT (line->offset.bb_i < txt_file->bgzf_starts.len32, "Expecting bb_i=%"PRIu64" < bgzf_starts.len=%"PRIu64,
                    (uint64_t)line->offset.bb_i, txt_file->bgzf_starts.len);

            uint64_t offset = *B64 (txt_file->bgzf_starts, line->offset.bb_i);
            uint32_t isize  = BGEN16 (*B16 (txt_file->bgzf_isizes, line->offset.bb_i)) + 1; // maximum isize is 65536 (not 65535)

            if (offset != last_offset) {
                STRl (bgzf_block, BGZF_MAX_BLOCK_SIZE);

                bgzf_reread_one_prescribed_block (fp, offset, qSTRa(bgzf_block));
                bgzf_uncompress_one_prescribed_block (vb, STRa(bgzf_block), uncomp_block, isize, line->offset.bb_i);

                last_offset = offset;
            }

            // the part of the line that resides in this block
            // NOTE(review): assumes line->offset.uoffset <= isize, otherwise the subtraction wraps - confirm with the code that builds the prescription
            uint32_t subline_len = MIN_(line->line_len, isize - line->offset.uoffset);
            memcpy (BAFTtxt, &uncomp_block[line->offset.uoffset], subline_len);
            Ltxt += subline_len;

            // if this line continues to next BGZF block - it starts from the beginning of that block, its remainder is subline_len shorter
            line->line_len -= subline_len;
            line->offset.bb_i++;
            line->offset.uoffset = 0;
        }
    }

    libdeflate_free_decompressor ((struct libdeflate_decompressor **)&vb->gzip_compressor, __FUNCLINE);
}

// registers bgzf_alloc and codec_free_do as the memory allocator of the libdeflate 1.7 library
void bgzf_libdeflate_1_7_initialize (void)
{
    libdeflate_set_memory_allocator_1_7 (bgzf_alloc, codec_free_do);
}

// ZIP: called by Seg to set the bgzf index of the next line
// Adds line_len to vb->line_bgzf_uoffset and then advances vb->gz_blocks.current_bb_i past every block
// that the offset has moved beyond, leaving line_bgzf_uoffset as an offset within the current block.
void bgz_zip_advance_index (VBlockP vb, uint32_t line_len)
{
    if (!vb->gz_blocks.len) return; // no BGZF blocks in this VB - all data came from "unconsumed_txt"

    vb->line_bgzf_uoffset += line_len;

    // udpate current_bb_i and bgzf_offset (note: line_len might span multiple bgzf blocks)
    // NOTE(review): the loop has no explicit bound on bb - it relies on the offset never exceeding the total txt_size of the remaining blocks
    GzBlockZip *bb;
    for (bb = B(GzBlockZip, vb->gz_blocks, vb->gz_blocks.current_bb_i);
         vb->line_bgzf_uoffset && vb->line_bgzf_uoffset >= bb->txt_size; // note: careful to also terminate on the edge case that line_bgzf_uoffset==0 and in the final VB block bb->txt_size==0
         bb++)

        vb->line_bgzf_uoffset -= bb->txt_size; // index into the next BGZF block

    vb->gz_blocks.current_bb_i = BNUM(vb->gz_blocks, bb);
}

// ZIP: after reading data for a txt_header or VB, copy unconsumed gz_blocks to txt_file->unconsumed_bgz_blocks
// The first block might be partially consumed.
// returns: if the consumed data ends exactly on a block boundary - the sum of comp_size of the fully consumed blocks.
//          otherwise (a block is partially consumed, or no block is left unconsumed) - 0.
int64_t bgz_copy_unconsumed_blocks (VBlockP vb)
{
    START_TIMER;
    ASSERTISZERO (txt_file->unconsumed_bgz_blocks.len32);

    if (!vb->gz_blocks.len) return 0; // not a BGZF-compressed file

    int32_t consumed =                     // amount of data in vb->gz_blocks that does NOT need to be copied to next VB bc it was consumed by this VB or the previous one
        Ltxt +                             // amount of data consumed by this VB
        vb->gz_blocks.consumed_by_prev_vb; // amount of data in first BGZF block was conusmed by the previous VB

    ARRAY (GzBlockZip, bb, vb->gz_blocks);

    bool done = false;
    bool consumed_full_bgzf_blocks=false;
    int64_t compressed_size = 0;

    // note: the loop continues after the copy (done=true) so that consumed ends up as minus the number of unconsumed bytes
    for (uint32_t i=0; i < bb_len; i++) {
        // if some of the BGZF blocks are not consumed (the first of them might be partially consumed) - move the blocks
        // to unconsumed_bgz_blocks - to be moved to the next VB
        if (consumed - bb[i].txt_size < 0 && !done/*enter only once*/) {

            consumed_full_bgzf_blocks = (consumed == 0); // no partially-consumed block

            // block i might be partially consumed or not consumed at all, subsequent blocks are not consumed at all
            buf_append (evb, txt_file->unconsumed_bgz_blocks, GzBlockZip,
                        B(GzBlockZip, vb->gz_blocks, i), vb->gz_blocks.len32 - i, "txt_file->unconsumed_bgz_blocks");

            txt_file->unconsumed_bgz_blocks.consumed_by_prev_vb = consumed; // part of first BGZF block already consumed
            done = true;
        }
        else if (!done)
            compressed_size += bb[i].comp_size; // block i is fully consumed

        consumed -= bb[i].txt_size;
    }

    // sanity check
    ASSERT (-consumed == txt_file->unconsumed_txt.len32, "Expecting (-consumed)=%d == unconsumed_txt.len=%u", -consumed, txt_file->unconsumed_txt.len32);

    // update bb.txt_index for next VB
    // note: first bb.txt_data of the next VB is possibly negative if some of its data was consumed by the current VB
    int32_t txt_index = -txt_file->unconsumed_bgz_blocks.consumed_by_prev_vb;
    for_buf (GzBlockZip, bb, txt_file->unconsumed_bgz_blocks) {
        bb->txt_index = txt_index;
        txt_index += bb->txt_size;
    }

    COPY_TIMER (bgz_copy_unconsumed_blocks);
    return consumed_full_bgzf_blocks ? compressed_size : 0;
}

// return blocks used by the segconf VB to the unconsumed blocks
// (copies vb->gz_blocks and its consumed_by_prev_vb to txt_file->unconsumed_bgz_blocks)
void bgz_return_segconf_blocks (VBlockP vb)
{
    buf_copy (evb, &txt_file->unconsumed_bgz_blocks, &vb->gz_blocks, GzBlockZip, 0, 0, 0);
    txt_file->unconsumed_bgz_blocks.consumed_by_prev_vb = vb->gz_blocks.consumed_by_prev_vb;
}

// ZIP: before reading data for a VB, populate gz_blocks with some or all of the unconsumed blocks passed
// from the previous VB or txt_header
void bgz_zip_init_vb (VBlockP vb)
{
    vb->vb_bgz_i = txt_file->bgzf_isizes.len + txt_file->bgzf_flags.has_eof_block; // index of first bgzf block to be used by the VB

    if (!txt_file->unconsumed_bgz_blocks.len) return; // happens when either unconsumed_bytes=0 or not a BGZF-compressed file

    // data in the first BGZF block already consumed by previous VB or txt_header
    vb->gz_blocks.consumed_by_prev_vb = vb->line_bgzf_uoffset = txt_file->unconsumed_bgz_blocks.consumed_by_prev_vb;

    // copy all unconsumed BGZF blocks - we might not need all of them - the unconsumed ones will moved back in bgz_copy_unconsumed_blocks
    buf_copy (vb, &vb->gz_blocks, &txt_file->unconsumed_bgz_blocks, GzBlockZip, 0, 0, "gz_blocks");

    vb->vb_bgz_i -= txt_file->unconsumed_bgz_blocks.len32; // the VB's first block is the first of the unconsumed blocks

    txt_file->unconsumed_bgz_blocks.len32 = txt_file->unconsumed_bgz_blocks.consumed_by_prev_vb = 0;

    // sanity check
    int32_t available = -vb->gz_blocks.consumed_by_prev_vb; // possibly start negative
    for_buf (GzBlockZip, bb, vb->gz_blocks)
        available += bb->txt_size;

    ASSERT (available >= Ltxt, "%s blocks in txt_file->unconsumed_bgz_blocks cover only %d bytes, less than the needed unconsumed_bytes=%d",
            codec_name (txt_file->codec), available, Ltxt);
}

//-----------------------------------------------------
// PIZ SIDE - setting up BGZF for a particular txt file
//-----------------------------------------------------

static Buffer isizes = {}; // Will be grabbed into txt_file->bgzf_isizes.

// returns the flags for re-compressing at bgzf_level (an index into bgzf_recompression_levels), with an EOF block
static inline FlagsBgzf recompression_template (int bgzf_level)
{
    return (FlagsBgzf){ .has_eof_block = true,
                        .level         = bgzf_recompression_levels[bgzf_level].level, // a 4-bit bitfield
                        .library       = bgzf_recompression_levels[bgzf_level].library };
}

// PIZ: loads the isizes of the SEC_BGZF section of component comp_i into the static isizes buffer.
//   show_only - if true, isizes is destroyed right after the section is uncompressed, so nothing is retained
// returns: the BGZF flags from the section header, or - if the component has no SEC_BGZF section or the
//          level recorded in it is unknown - the flags for re-compressing at BGZF_DEFAULT_LEVEL.
static FlagsBgzf bgzf_load_isizes (CompIType comp_i, bool show_only)
{
    Section sec = sections_get_comp_bgzf_sec (comp_i);
    if (!sec) goto fallback; // this component doesn't contain a BGZF section

    int32_t offset = zfile_read_section (z_file, evb, 0, &evb->z_data, "z_data", SEC_BGZF, sec);

    SectionHeaderP header = (SectionHeaderP)Bc(evb->z_data, offset);

    // if we don't know the compression level (in older Genozip versions we wrote the SECTION_BGZF even
    // if level discover failed)
    if (header->flags.bgzf.level == BGZF_COMP_LEVEL_UNKNOWN)
        goto fallback;

    zfile_uncompress_section (evb, header, &isizes, "txt_file->bgzf_isizes", 0, SEC_BGZF);
    isizes.len /= 2; // from a length in bytes to a count of uint16_t entries

    if (show_only) buf_destroy (isizes);

    // convert to native endianity from big endian
    for_buf (uint16_t, isize_p, isizes)
        *isize_p = BGEN16 (*isize_p); // now it contains isize-1 - a value 0->65535 representing an isize 1->65536

    return header->flags.bgzf; // bgzf_isizes successfully loaded

fallback:
    return recompression_template (BGZF_DEFAULT_LEVEL);
}
- -// PIZ: called from main thread after reading txt_header's header -FlagsBgzf bgzf_piz_calculate_bgzf_flags (CompIType comp_i, Codec src_codec) -{ - #define C(cdc) (src_codec == CODEC_##cdc) - FlagsBgzf bgzf_flags; - - #define HAS_EXT(x) filename_has_ext (flag.out_filename, #x) - bool bgzf_implied_by_out_filename = flag.out_filename && (HAS_EXT(.gz) || HAS_EXT(.bgz) || HAS_EXT(.bam)); - bool no_bgzf_implied_by_out_filename = file_piz_get_dt_of_out_filename() == flag.out_dt && !(HAS_EXT(.gz) || HAS_EXT(.bgz) || HAS_EXT(.bam)); - bool isizes_loaded = false; - - // cases where there is no BGZF re-compression - if (flag.test || - OUT_DT(CRAM) || - (flag.bgzf == 0 && !OUT_DT(BAM) && !OUT_DT(BCF)) || // note: in BCF and BAM --bgzf=0 means BGZF blocks with no compression (as opposed to no BGZF at all) - (flag.bgzf == BGZF_BY_ZFILE && C(NONE) && flag.reconstruct_as_src)) // case: --bgzf=exact and source codec was CODEC_NONE - - bgzf_flags = bgzf_no_recompression; - - // case: reconstructing BCF: piz sends VCF to bcftools in CODEC_NONE, and bcftools compressed by the level given in bgzf_flags - else if (OUT_DT(BCF)) - bgzf_flags = (FlagsBgzf){ .library = BGZF_EXTERNAL_LIB, - .level = (flag.bgzf < 0) ? 
4 : (int[]){0, 2, 4, 6, 8, 9 }[flag.bgzf] }; // convert Genozip level 0-5 to bcftools level 0-9 - - // case: --bgzf=exact and source codec is other than CODEC_NONE - else if (flag.bgzf == BGZF_BY_ZFILE && !C(NONE)) { - bgzf_flags = bgzf_load_isizes (comp_i, false); - isizes_loaded = true; - } - - // case: --bgzf=0 to 5 - else if (flag.bgzf >= 0) { - bgzf_flags = recompression_template (flag.bgzf); // set to --bgzf command line value - - // if user specified --bgzf and --output - make sure output filename is .gz, .bam or .bcf - ASSINP (flag.force || !flag.out_filename || bgzf_implied_by_out_filename || HAS_EXT(.bcf) || bgzf_flags.level==0, - "using %s in combination with %s for outputting a %s file, requires the output filename to end with %s (override with --force)", - OT("output", "o"), OT("bgzf", "z"), dt_name(flag.out_dt), OUT_DT(BAM)?".bam" : OUT_DT(BCF)?".bcf" : ".gz"); - - ASSINP0 (!OUT_DT(BCF) || flag.bgzf != BGZF_BY_ZFILE, "cannot use --bgzf=exact when outputing a BCF file"); // because we have no control over bcftools' BGZF block generation - } - - // case: genocat to stdout without --bgzf: - no re-compression. - else if (is_genocat && !flag.out_filename) - bgzf_flags = OUT_DT(BAM) ? bgzf_recompression_levels[0] : bgzf_no_recompression; // file_open_txt_write interprets level=0 as CODEC_BGZF without compression for BAM, and CODEC_NONE for other types - - // case: genocat or genounzip out_filename and no --bgzf: - determine by file name (except BAM - bgzf regardless of filename) - else if (flag.out_filename) - bgzf_flags = (bgzf_implied_by_out_filename || OUT_DT(BAM) || (!no_bgzf_implied_by_out_filename && !C(NONE))) ? bgzf_recompression_levels[BGZF_DEFAULT_LEVEL] : bgzf_no_recompression; - - // case: genounzip without explicit filename, and no --bgzf: default compression or no compression - else - // note: for bz2, xz, and zip - we reconstruct as gz too. better choice than plain. - bgzf_flags = (C(BGZF) || C(BAM) || C(GZ) || C(BZ2) || C(XZ) || C(ZIP)) ? 
bgzf_recompression_levels[BGZF_DEFAULT_LEVEL] : bgzf_no_recompression; // note: similar logic to txtheader_get_txt_filename_from_section - - // case: user wants to see this section header, despite not needing BGZF data - if (!isizes_loaded && (flag.only_headers == SEC_BGZF+1 || flag.only_headers == SHOW_ALL_HEADERS)) - bgzf_load_isizes (comp_i, true); - - if (flag.show_bgzf) - iprintf ("comp_i=%u with src_codec=%s out_dt=%s: calculated bgzf_flags={%s, %d}\n", - comp_i, codec_name (src_codec), dt_name (flag.out_dt), bgzf_library_name (bgzf_flags.library, true), bgzf_flags.level); - - return bgzf_flags; - #undef C -} - -// PIZ main thread: update txt_file with BGZF info calculated earlier -void bgzf_piz_set_txt_file_bgzf_info (FlagsBgzf bgzf_flags, bytes codec_info) -{ - memcpy (txt_file->bgzf_signature, codec_info, 3); - - if (isizes.len) - buf_grab (evb, txt_file->bgzf_isizes, "txt_file->bgzf_isizes", isizes); - - txt_file->bgzf_flags = bgzf_flags; - - // sanity - ASSERT (txt_file->bgzf_flags.level >= 0 && txt_file->bgzf_flags.level <= BGZF_MAX_LEVEL, "txt_file->bgzf_flags.level=%u ∉ [0,%u]", - txt_file->bgzf_flags.level, BGZF_MAX_LEVEL); - - ASSERT (txt_file->bgzf_flags.library >= 0 && txt_file->bgzf_flags.library < NUM_BGZF_LIBRARIES, "txt_file->bgzf_flags.library=%u ∉ [0,%u]", - txt_file->bgzf_flags.level, NUM_BGZF_LIBRARIES-1); -} - -//----------------------------------------------------- -// PIZ SIDE - compressing txt_file with BGZF -//----------------------------------------------------- - -static void bgzf_alloc_compressor (VBlockP vb, FlagsBgzf bgzf_flags) -{ - ASSERT0 (!vb->gzip_compressor, "expecting vb->gzip_compressor=NULL"); - - switch (bgzf_flags.library) { - case BGZF_LIBDEFLATE19: - vb->gzip_compressor = libdeflate_alloc_compressor (vb, bgzf_flags.level, __FUNCLINE); - break; - - case BGZF_LIBDEFLATE7: - vb->gzip_compressor = libdeflate_alloc_compressor_1_7 (bgzf_flags.level, vb); - break; - - case BGZF_ZLIB: - vb->gzip_compressor = bgzf_alloc (vb, 
1, sizeof (z_stream), __FUNCLINE); - *(z_stream *)vb->gzip_compressor = (z_stream){ .zalloc = bgzf_alloc, .zfree = codec_free_do, .opaque = vb }; - break; - - case BGZF_IGZIP: - ASSERT (bgzf_flags.level==1 || bgzf_flags.level==2, "igzip: expecting bgzf_flags.level=%u ∈[1,2]", bgzf_flags.level); - - vb->gzip_compressor = bgzf_alloc (vb, 1, (int[]){ 1+ISAL_DEF_LVL0_DEFAULT, ISAL_DEF_LVL1_DEFAULT }[bgzf_flags.level-1], __FUNCLINE); // 1+ to avoid 0 - break; - - default: - ABORT ("Invalid bgzf_flags.library=%d", bgzf_flags.library); - } - - if (flag.show_bgzf) - iprintf ("%s: initialized compressor %s level %u%s\n", VB_NAME, - bgzf_library_name (bgzf_flags.library, true), bgzf_flags.level, - flag.bgzf == BGZF_BY_ZFILE ? " EXACT" : ""); -} - -static void bgzf_free_compressor (VBlockP vb, FlagsBgzf bgzf_flags) -{ - switch (bgzf_flags.library) { - case BGZF_LIBDEFLATE7 : - libdeflate_free_compressor_1_7 (vb->gzip_compressor); - break; - - case BGZF_LIBDEFLATE19 : - libdeflate_free_compressor (vb->gzip_compressor, __FUNCLINE); - break; - - case BGZF_IGZIP : - case BGZF_ZLIB : - codec_free (vb, vb->gzip_compressor); - break; - - default: - ABORT ("Invalid bgzf_flags.library=%d", bgzf_flags.library); - } - - vb->gzip_compressor = NULL; -} - -static void bgzf_compress_one_block (VBlockP vb, rom in, uint32_t isize, - int32_t block_i, int32_t txt_index, // for show_bgzf (both may be negative - indicating previous VB) - BufferP compressed) -{ - START_TIMER; - - ASSERT0 (vb->gzip_compressor, "vb->gzip_compressor=NULL"); - - #define BGZF_MAX_CDATA_SIZE (BGZF_MAX_BLOCK_SIZE - BGZF_HEADER_LEN - GZIP_FOOTER_LEN) - - buf_alloc (vb, compressed, BGZF_MAX_BLOCK_SIZE, 0, char, 1.2, "scratch"); - - BgzfHeader *header = (BgzfHeader *)BAFTc (*compressed); - buf_add (compressed, BGZF_EOF, BGZF_HEADER_LEN); // template of header - only bsize needs updating - - uint32_t comp_index = compressed->len; - int out_size; - - if (txt_file->bgzf_flags.library == BGZF_IGZIP) { - struct isal_zstream 
strm; - isal_deflate_stateless_init (&strm); - strm.gzip_flag = ISAL_DEFLATE; - strm.flush = NO_FLUSH; - strm.level = txt_file->bgzf_flags.level - 1; // note: level 1,2 in bgzf_flags corrsponds to IGZIP level 0,1 - strm.level_buf_size = (int[]){ ISAL_DEF_LVL0_DEFAULT, ISAL_DEF_LVL1_DEFAULT }[strm.level]; - strm.level_buf = vb->gzip_compressor; - strm.next_in = (uint8_t *)in; - strm.avail_in = isize; - strm.next_out = BAFT8 (*compressed); - strm.avail_out = BGZF_MAX_CDATA_SIZE + GZIP_FOOTER_LEN; - - int ret = isal_deflate_stateless (&strm); - ASSERT (ret == ISAL_DECOMP_OK, "%s: isal_deflate_stateless: %s. isize=%u", VB_NAME, isal_error (ret), isize); - - out_size = BGZF_MAX_CDATA_SIZE + GZIP_FOOTER_LEN - strm.avail_out; - } - - else if (txt_file->bgzf_flags.library == BGZF_LIBDEFLATE19) { // libdeflate 1.19 - - out_size = (int)libdeflate_deflate_compress (vb->gzip_compressor, in, isize, BAFTc (*compressed), BGZF_MAX_CDATA_SIZE); - - // in case the compressed data doesn't fit in one BGZF block, move to compressing at the maximum level. this can - // happen theoretically (maybe) if the original data was compressed with a higher level, and an uncompressible 64K block was - // "scratch" to just under 64K while in our compression level it is just over 64K. - if (!out_size) { - void *high_compressor = libdeflate_alloc_compressor (vb, LIBDEFLATE_MAX_LEVEL, __FUNCLINE); // libdefate's highest level - out_size = libdeflate_deflate_compress (high_compressor, in, isize, BAFTc (*compressed), BGZF_MAX_CDATA_SIZE); - libdeflate_free_compressor (high_compressor, __FUNCLINE); - } - } - - else if (txt_file->bgzf_flags.library == BGZF_LIBDEFLATE7) { // libdeflate 1.7 - - out_size = (int)libdeflate_deflate_compress_1_7 (vb->gzip_compressor, in, isize, BAFTc (*compressed), BGZF_MAX_CDATA_SIZE); - - // in case the compressed data doesn't fit in one BGZF block, move to compressing at the maximum level. 
this can - // happen theoretically (maybe) if the original data was compressed with a higher level, and an uncompressible 64K block was - // "scratch" to just under 64K while in our compression level it is just over 64K. - if (!out_size) { - void *high_compressor = libdeflate_alloc_compressor_1_7 (LIBDEFLATE_MAX_LEVEL, vb); // libdefate's highest level - out_size = libdeflate_deflate_compress_1_7 (vb->gzip_compressor, in, isize, BAFTc (*compressed), BGZF_MAX_CDATA_SIZE); - libdeflate_free_compressor_1_7 (high_compressor); - } - } - - else { // zlib - #define strm ((z_stream *)vb->gzip_compressor) - - ASSERT0 (deflateInit2 (vb->gzip_compressor, txt_file->bgzf_flags.level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) == Z_OK, - "deflateInit2 failed"); - - strm->next_in = (uint8_t *)in; - strm->avail_in = isize; - strm->next_out = BAFT8 (*compressed); - strm->avail_out = BGZF_MAX_CDATA_SIZE; - ASSERT (deflate (vb->gzip_compressor, Z_FINISH) == Z_STREAM_END, "deflate failed: msg=%s", strm->msg); - - out_size = BGZF_MAX_CDATA_SIZE - strm->avail_out; - - ASSERT0 (deflateEnd (vb->gzip_compressor) == Z_OK, "deflateEnd failed"); - #undef strm - } - - ASSERT (out_size, "cannot compress block with %u bytes into a BGZF block with %u bytes", isize, BGZF_MAX_BLOCK_SIZE); - compressed->len += out_size; - - header->bsize = LTEN16 ((uint16_t)(BGZF_HEADER_LEN + out_size + GZIP_FOOTER_LEN - 1)); - - GzipFooter footer = { .crc32 = LTEN32 (crc32 (0, in, isize)), - .isize = LTEN32 (isize) }; - buf_add (compressed, (rom)&footer, GZIP_FOOTER_LEN); - - if (flag.show_bgzf) - #define C(i) (i < isize ? char_to_printable (in[i]).s : "") - iprintf ("COMPRESS thread=%s %s i=%d compressed_index=%u size=%u txt_index=%d size=%u txt_data[5]=%1s%1s%1s%1s%1s %s\n", - threads_am_i_main_thread() ? "MAIN" : threads_am_i_writer_thread() ? 
"WRITER" : "COMPUTE", VB_NAME, block_i, - comp_index - (int)BGZF_HEADER_LEN, (unsigned)out_size + BGZF_HEADER_LEN + GZIP_FOOTER_LEN, txt_index, isize, C(0), C(1), C(2), C(3), C(4), - out_size == BGZF_EOF_LEN ? "EOF" : ""); - #undef C - - COPY_TIMER (bgzf_compress_one_block); -} - -// appends file data to wvb->z_data -void bgzf_write_finalize (void) -{ - // write EOF block if needed - if (txt_file->bgzf_flags.has_eof_block) { - buf_add_more (wvb, &wvb->z_data, BGZF_EOF, BGZF_EOF_LEN, "z_data"); - - if (flag.show_bgzf) iprintf ("%-7s vb=%u EOF\n", "IO", 0); - } - - // if we attempted to reconstruct the BGZF block to the original file's bgzf_isizes - warn if we were unlucky and failed - if (txt_file->bgzf_isizes.len) { - uint8_t signature[3]; - bgzf_sign (txt_file->disk_so_far + (txt_file->bgzf_flags.has_eof_block ? BGZF_EOF_LEN : 0), signature); - - // commented out because of an inconsistency: here we give an FYI in case we thought we could regenerate the - // blocks and failed, however we don't give a warning in the more common case that ZIP couldn't find the BGZF - // level in the first place - // ASSERTW (!memcmp (signature, txt_file->bgzf_signature, 3), - // "FYI: %s is recompressed with BGZF (.gz). However, it seems that the original file was compressed with a different compression library than genozip uses, resulting in a slightly different level of compression. Rest assured that the actual data is identical.", - // txt_name); - } -} - -void bgzf_sign (uint64_t disk_size, uint8_t *signature) -{ - signature[0] = (disk_size ) & 0xff; // LSB of size - signature[1] = (disk_size >> 8 ) & 0xff; - signature[2] = (disk_size >> 16) & 0xff; -} - -// Entry point of BGZF compression compute thread. -// bgzf-compress vb->txt_data into vb->z_data - using BGZF blocks as prescribed in vb->gz_blocks. 
-// Note: we hope to reconstruct the exact same byte-level BGZF blocks, as the original files, but that -// will only happen if the GZIP library (eg libdeflate), version and parameters are the same -static void bgzf_compress_vb (VBlockP vb) -{ - START_TIMER; - - if (flag.show_bgzf) - iprintf ("%s: start BZGF re-compression: bgzf_library=%s bgzf_level=%d\n", - VB_NAME, bgzf_library_name (txt_file->bgzf_flags.library, true), txt_file->bgzf_flags.level); - - ASSERTNOTEMPTY (vb->gz_blocks); - - buf_alloc (vb, &vb->z_data, 0, vb->gz_blocks.len32 * BGZF_MAX_BLOCK_SIZE/2, uint8_t, 1, "z_data"); // alloc based on estimated size - bgzf_alloc_compressor (vb, txt_file->bgzf_flags); - - for_buf2 (BgzfBlockPiz, block, i, vb->gz_blocks) { - - ASSERT (block->txt_index + block->txt_size <= Ltxt, - "block=%u out of range: expecting txt_index=%u txt_size=%u <= txt_data.len=%u", - i, block->txt_index, block->txt_size, Ltxt); - - bgzf_compress_one_block (vb, Btxt (block->txt_index), block->txt_size, i, block->txt_index, &vb->z_data); - } - - bgzf_free_compressor (vb, txt_file->bgzf_flags); - - vb_set_is_processed (vb); /* tell dispatcher this thread is done and can be joined. this operation needn't be atomic, but it likely is anyway */ - COPY_TIMER (bgzf_compute_thread); -} - -static inline uint32_t bgzf_next_isize (void) -{ - #define BGZF_CREATED_BLOCK_SIZE 65280 // same size as observed in htslib-created files - - return txt_file->bgzf_isizes.len ? 
*B16(txt_file->bgzf_isizes, txt_file->bgzf_isizes.next) + 1 // +1 bc the array values are (isize-1) - : BGZF_CREATED_BLOCK_SIZE; // case: no prescriped isizes -} - -// PIZ: calculate the BGZF blocks within this VB -static uint32_t bgzf_calculate_blocks_one_vb (VBlockP vb, bool is_last) -{ - uint32_t index = 0; - - while ((!txt_file->bgzf_isizes.len || txt_file->bgzf_isizes.next < txt_file->bgzf_isizes.len) && index < Ltxt) { - - uint32_t isize = bgzf_next_isize(); - - if (index + isize > Ltxt) { - if (is_last) isize = Ltxt - index; // last BGZF block might be shorter - else - break; // the data at the end of this VB doesn't fill a whole BGZF block - pass it down to next vb - } - - buf_alloc (vb, &vb->gz_blocks, 1, Ltxt / 63000, BgzfBlockPiz, 1.5, "gz_blocks"); - - BNXT (BgzfBlockPiz, vb->gz_blocks) = (BgzfBlockPiz){ .txt_index = index, .txt_size = isize }; - - index += isize; - txt_file->bgzf_isizes.next++; - } - - uint32_t remaining = Ltxt - index; - ASSERT0 (remaining < BGZF_MAX_BLOCK_SIZE, "bgzf_isizes exhausted prematurely"); // if we have 65536 or more remaining, there should have been more isizes - - return remaining; -} - -void bgzf_dispatch_compress (Dispatcher dispatcher, STRp (uncomp), CompIType comp_i, bool is_last) -{ - // uncompressed data to be dealt with by next call to this function (buffer belongs to writer thread) - static Buffer intercall_txt = {}; // belongs to wvb - buf_alloc (wvb, &intercall_txt, 0, BGZF_MAX_BLOCK_SIZE, char, 0, "intercall_txt"); - - // case: uncomp is not enough to fill a block, just store it to next call - if (!is_last && (uncomp_len + intercall_txt.len32 < bgzf_next_isize())) { - memcpy (BAFTc(intercall_txt), uncomp, uncomp_len); - intercall_txt.len32 += uncomp_len; - return; - } - - if (uncomp_len || intercall_txt.len) { // might be 0 if is_last, in some cases - - VBlockP vb = dispatcher_generate_next_vb (dispatcher, wvb->vblock_i, COMP_NONE); - vb->comp_i = comp_i; - - // build uncompressed data for this VB - some data 
left over from previous VB + data from wvb - buf_alloc_exact (vb, vb->txt_data, intercall_txt.len + uncomp_len, char, "txt_data"); - if (intercall_txt.len32) memcpy (B1STtxt, intercall_txt.data, intercall_txt.len32); - memcpy (Btxt (intercall_txt.len32), uncomp, uncomp_len); - - // calculate BGZF blocks - and trim data that doesn't fill a block - to be moved to next VB - if ((intercall_txt.len32 = bgzf_calculate_blocks_one_vb (vb, is_last))) { - Ltxt -= intercall_txt.len32; - memcpy (B1STc(intercall_txt), BAFTtxt, intercall_txt.len32); - } - - // BGZF-compress vb->txt_data in a separate thread - dispatcher_compute (dispatcher, bgzf_compress_vb); - } - - if (is_last) { - dispatcher_set_no_data_available (dispatcher, false, DATA_EXHAUSTED); - buf_destroy (intercall_txt); - } -} - -rom bgzf_library_name (BgzfLibraryType library, bool long_name) -{ - return (library < 0 || library >= NUM_ALL_BGZF_LIBRARIES) ? "INVALID_BGZF_LIBRARY" - : long_name ? (rom[])BGZF_LIB_NAMES_LONG[library] - : (rom[])BGZF_LIB_NAMES_SHRT[library]; -} - -// used by test/Makefile -void gzil_compress (void) -{ - void *compressor = libdeflate_alloc_compressor (evb, 5, __FUNCLINE); - - uint8_t *in = MALLOC (1 MB), *out = MALLOC (2 MB); - uint32_t in_len; - for (int i=0; (in_len = fread (in, 1, 1 MB, stdin)); i++) { - GzipFooter footer = { .crc32 = LTEN32 (crc32 (0, in, in_len)), - .isize = LTEN32 (in_len) }; - - uint32_t out_len = libdeflate_deflate_compress (compressor, in, in_len, out, 2 MB); - ASSERT (out_len, "deflate failed: in_len=%u block_i=%u", in_len, i); - - ASSERT0 (1 == fwrite (_S(GZIL_HEADER), 1, stdout), "fwrite failed #1"); - ASSERT (1 == fwrite (STRa(out), 1, stdout), "fwrite failed: #2 out_len=%u", out_len); - ASSERT0 (1 == fwrite (&footer, sizeof (footer), 1, stdout), "fwrite failed #3"); - } - - fflush (stdout); - exit (0); -} diff --git a/src/bgzf.h b/src/bgzf.h deleted file mode 100644 index 53984b2c..00000000 --- a/src/bgzf.h +++ /dev/null @@ -1,81 +0,0 @@ -// 
------------------------------------------------------------------ -// bgzf.h -// Copyright (C) 2020-2024 Genozip Limited. Patent Pending. -// Please see terms and conditions in the file LICENSE.txt -// -// WARNING: Genozip is proprietary, not open source software. Modifying the source code is strictly prohibited -// and subject to penalties specified in the license. - -#include "sections.h" - -#define BGZF_DEFAULT_LEVEL 2 // PIZ: used if --bgzf is not specified (it is actually faster than 1 if also writing to disk) - -#define BGZF_MAX_BLOCK_SIZE ((uint32_t)(64 KB)) // maximum block size of both compressed and uncompressed data of one block -#define BGZF_MAX_CHUCK_SIZE ((uint32_t)(1 MB)) // max amount we read from disk at a time - -// First 16 bytes of every BGZF block -#define BGZF_PREFIX_LEN 16 -#define BGZF_PREFIX "\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00" - -// BGZF EOF marker is simply an empty block, see https://samtools.github.io/hts-specs/SAMv1.pdf section 4.1.2 -#define BGZF_EOF_LEN 28 -#define BGZF_EOF BGZF_PREFIX "\x1b\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00" - -// maximum block size of uncompressed data, and we are going to assume that since GZIL is only -// for FASTQ, and FASTQ is quite compressible, (GZIL_MAX_BLOCK_SIZE-GZIL_HEADER_LEN) it is an upper limit on GZIL-compressed data -#define GZIL_MAX_BLOCK_SIZE ((uint32_t)(1 MB)) - -#define GZIL_HEADER "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x03" -#define GZIL_HEADER_LEN 10 - -#define GZIL_ISIZE "\x00\x00\x10\x00" // isize == 1MB in all blocks except the last -#define GZIL_ISIZE_LEN 4 - -typedef struct BgzfBlockPiz { - int32_t txt_index, txt_size; // index of uncompressed block within vb->txt_data. 
The first block index will be negative if there is passed-down unconsumed data -} BgzfBlockPiz; - -// ZIP side -typedef enum { GZ_SUCCESS, GZ_IS_GZIP_NOT_BGZF, GZ_IS_NOT_GZIL, GZ_IS_NOT_GZIP, GZ_EOF_WITHOUT_EOF_BLOCK, GZ_TRUNCATED, NUM_GZ_STATUSES } GzStatus; // file is truncated -#define GZSTATUS_NAMES { "SUCCESS", "IS_GZIP_NOT_BGZF", "IS_NOT_GZIL", "IS_NOT_GZIP", "EOF_WITHOUT_EOF_BLOCK", "TRUNCATED" } - -// data type of VBlock.gz_blocks and txt_file->unconsumed_bgz_blocks : details of BGZF/GZIL blocks. -typedef struct GzBlockZip { - int32_t txt_index; // index of uncompressed block within vb->txt_data. If there is passed-down data from previous VB/txt_header, then txt_index of the first block will be negative (see bgz_copy_unconsumed_blocks) - uint32_t txt_size : 30; - uint32_t is_decompressed : 1; // true if data has been GZ-decompressed by main thread - uint32_t is_eof : 1; // true if this is the last GZ-block in the file - uint32_t compressed_index, comp_size; // index within vb->scratch -} GzBlockZip; - -extern GzStatus bgzf_read_block (FileP file, bool discovering); -extern GzStatus gzil_read_block (FileP file, bool discovering, bool *is_eof); -extern void bgz_uncompress_vb (VBlockP vb, Codec codec); -extern void bgz_uncompress_one_block (VBlockP vb, GzBlockZip *bb, Codec codec); -extern void bgzf_reread_uncompress_vb_as_prescribed (VBlockP vb, FILE *file); -extern void bgzf_compress_bgzf_section (void); -extern void bgz_zip_advance_index (VBlockP vb, uint32_t line_len); -extern int64_t bgz_copy_unconsumed_blocks (VBlockP vb); -extern void bgz_zip_init_vb (VBlockP vb); -extern void bgzf_insert_back_segconf_blocks (VBlockP vb); -extern void bgz_return_segconf_blocks (VBlockP vb); - -extern void inc_disk_gz_uncomp_or_trunc_(FileP file, uint64_t inc, FUNCLINE); -#define inc_disk_gz_uncomp_or_trunc(file, inc) inc_disk_gz_uncomp_or_trunc_((file), (inc), __FUNCLINE) - -// library / level discovery -extern void bgzf_initialize_discovery (FileP file); -extern void 
bgzf_finalize_discovery (void); - -// PIZ side -extern FlagsBgzf bgzf_piz_calculate_bgzf_flags (CompIType comp_i, Codec src_codec); -extern void bgzf_piz_set_txt_file_bgzf_info (FlagsBgzf bgzf_flags, bytes codec_info); -extern void bgzf_dispatch_compress (Dispatcher dispatcher, STRp (uncomp), CompIType comp_i, bool is_last); -extern void bgzf_write_finalize (void); - -// misc -extern rom bgzf_library_name (BgzfLibraryType library, bool long_name); -extern rom gzstatus_name (GzStatus st); -extern void gzil_compress (void); -extern void bgzf_libdeflate_1_7_initialize (void); -extern void bgzf_sign (uint64_t disk_size, uint8_t *signature); diff --git a/src/buf_struct.c b/src/buf_struct.c index b1d2f7c0..2aab41bf 100644 --- a/src/buf_struct.c +++ b/src/buf_struct.c @@ -77,7 +77,7 @@ void buf_initialize() rom buf_type_name (ConstBufferP buf) { - if (IN_RANGE (buf->type, 0, BUF_NUM_TYPES-1)) + if (IN_RANGE (buf->type, 0, BUF_NUM_TYPES)) return (rom[])BUFTYPE_NAMES[buf->type]; else { @@ -213,7 +213,7 @@ void buf_alloc_do (VBlockP vb, BufferP buf, uint64_t requested_size, #define REQUEST_TOO_BIG_THREADSHOLD (3 GB) if (requested_size > REQUEST_TOO_BIG_THREADSHOLD && !buf->can_be_big) // use WARN instead of ASSERTW to have a place for breakpoint WARN ("Warning: buf_alloc called from %s:%u %s for \"%s\" requested %s. This is suspiciously high and might indicate a bug - please report to " EMAIL_SUPPORT ". z_dt=%s vb->vblock_i=%u buf=%s line_i=%d vb_size=%s RAM=%3.1lf GB txt_file->disk_size=%s", - func, code_line, version_str().s, name, str_size (requested_size).s, z_dt_name(), vb->vblock_i, buf_desc (buf).s, vb->line_i, str_size (segconf.vb_size).s, arch_get_physical_mem_size(), str_size (txt_file->disk_size).s); + func, code_line, version_str().s, name, str_size (requested_size).s, z_dt_name(), vb->vblock_i, buf_desc (buf).s, vb->line_i, str_size (segconf.vb_size).s, arch_get_physical_mem_size(), txt_file ? 
str_size (txt_file->disk_size).s : "N/A"); ASSERT (buf->type == BUF_REGULAR || buf->type == BUF_UNALLOCATED, "called from %s:%u: cannot buf_alloc a buffer of type %s. details: %s", func, code_line, buf_type_name (buf), buf_desc (buf).s); diff --git a/src/buffer.c b/src/buffer.c index e956e998..a3d0c137 100644 --- a/src/buffer.c +++ b/src/buffer.c @@ -1,386 +1,432 @@ -// ------------------------------------------------------------------ -// buffer.c -// Copyright (C) 2019-2024 Genozip Limited. Patent Pending. -// Please see terms and conditions in the files LICENSE.non-commercial.txt and LICENSE.commercial.txt - -#include "buffer.h" -#include "file.h" - -// writes a buffer to a file, return true if successful -// note: this is designed to run in any there, so it cannot create any buffers in evb -bool buf_dump_to_file (rom filename, ConstBufferP buf, unsigned buf_word_width, bool including_control_region, - bool no_dirs, bool verbose, bool do_gzip) -{ - RETURNW (buf->type == BUF_REGULAR, false, - "FYI: failed to dump buffer.type=%s name=%s while putting %s", - buf_type_name (buf), buf->name ? 
buf->name : "(null)", filename); - - int fn_len = strlen(filename); - char update_filename[fn_len + 10]; - strcpy (update_filename, filename); - - if (no_dirs) { - for (unsigned i=0; i < fn_len; i++) - if (filename[i] == '/' || (flag.is_windows && (filename[i] == '\\' || (filename[i] == ':' && i!=1)))) - update_filename[i] = '-'; - filename = update_filename; - } - - bool success; - if (including_control_region) { - ASSERT (*(uint64_t *)(buf->memory) == UNDERFLOW_TRAP, "dumping to %s: buffer has underflowed", filename); - ASSERT (*(uint64_t *)(buf->data + buf->size) == OVERFLOW_TRAP, "dumping to %s: buffer has overflowed", filename); - - success = file_put_data (update_filename, buf->memory, buf_mem_size (buf), 0); - } - else - success = file_put_data (update_filename, buf->data, buf->len * buf_word_width, 0); - - if (success && do_gzip) - file_gzip (update_filename); // updates filename if successful - - if (success && verbose) iprintf ("\nDumped file %s\n", update_filename); - - return success; -} - -// copy data - possibly within the same buffer -void buf_copy_do (VBlockP dst_vb, BufferP dst, ConstBufferP src, - uint64_t bytes_per_entry, // how many bytes are counted by a unit of .len - uint64_t src_start_entry, uint64_t max_entries, // if 0 copies the entire buffer - FUNCLINE, - rom dst_name) // dst buffer settings, or take from src if 0 -{ - ASSERTNOTNULL (src); - ASSERTNOTNULL (dst); - - ASSERT (src->data, "called from %s:%u: src->data is NULL", func, code_line); - - ASSERT (!max_entries || src_start_entry < src->len, - "buf_copy of %s called from %s:%u: src_start_entry=%"PRIu64" is larger than src->len=%"PRIu64, buf_desc(src).s, func, code_line, src_start_entry, src->len); - - uint64_t num_entries = max_entries ? MIN_(max_entries, src->len - src_start_entry) : src->len - src_start_entry; - if (!bytes_per_entry) bytes_per_entry=1; - - if (num_entries) { - buf_alloc_(dst_vb, dst, 0, num_entries * bytes_per_entry, 1, 1, dst_name ? 
dst_name : src->name, func, code_line); - - if (dst != src || src_start_entry >= num_entries) - memcpy (dst->data, &src->data[src_start_entry * bytes_per_entry], num_entries * bytes_per_entry); - else - memmove (dst->data, &src->data[src_start_entry * bytes_per_entry], num_entries * bytes_per_entry); // need memmove for overlapping areas - } - - dst->len = num_entries; -} - -// removes a section from the buffer -void buf_remove_do (BufferP buf, unsigned sizeof_item, uint64_t remove_start, uint64_t remove_len) -{ - if (!remove_len) return; - - ASSERT (remove_start + remove_len <= buf->len, "Out of range: remove_start=%"PRIu64" + remove_len=%"PRIu64" > buf->len=%"PRIu64, - remove_start, remove_len, buf->len); - - if (remove_len != buf->len) { // skip in common case of deleting entire buffer - uint64_t remove_start_byte = remove_start * sizeof_item; - uint64_t remove_bytes = remove_len * sizeof_item; - memmove (buf->data + remove_start_byte, - buf->data + remove_start_byte + remove_bytes, - (buf->len * sizeof_item) - (remove_start_byte + remove_bytes)); - } - - buf->len -= remove_len; -} - -void buf_add (BufferP buf, STRp(data)) -{ - if (!data_len) return; // don't test for space (and "data" pointer) if length is 0 - - ASSERT (buf_has_space (buf, data_len), - "buf_add: buffer %s is out of space: len=%u size=%u data_len=%u", - buf_desc (buf).s, (uint32_t)buf->len, (uint32_t)buf->size, data_len); - buf_add_do (buf, data, data_len); -} - -void buf_insert_do (VBlockP vb, BufferP buf, unsigned width, uint64_t insert_at, const void *new_data, uint64_t new_data_len, rom name, FUNCLINE) -{ - if (!new_data_len) return; - - buf_alloc_(vb ? 
vb : buf->vb, buf, new_data_len + (width==1)/*room for \0 or separator if char*/, 0, width, CTX_GROWTH, name, func, code_line); - - if (insert_at != buf->len) { - ASSERT (insert_at < buf->len, "called from %s:%u: expecting insert_at=%"PRIu64" <= buf->len=%"PRIu64" in buf=%s", - func, code_line, insert_at, buf->len, buf_desc(buf).s); - - memmove (&buf->data[(insert_at + new_data_len) * width], &buf->data[insert_at * width], (buf->len - insert_at) * width); - } - - memcpy (&buf->data[insert_at * width], new_data, new_data_len * width); - buf->len += new_data_len; - - ASSERT (BOVERFLOW(buf) == OVERFLOW_TRAP, "buffer overflow: %s", buf_desc(buf).s); -} - -void buf_append_string (VBlockP vb, BufferP buf, rom str) -{ - uint64_t len = strlen (str); - ASSERT (len < 10000000, "len=%"PRIu64" too long, looks like a bug", len); - - buf_add_more (vb, buf, str, len, buf->name ? buf->name : "string_buf"); // allocates one char extra - *BAFTc (*buf) = '\0'; // string terminator without increasing buf->len -} - -// swaps buffers' content without affecting buffer list -void buf_swap (BufferP buf1, BufferP buf2) -{ - ASSERT (buf1->vb == buf2->vb && - buf1->type == BUF_REGULAR && buf2->type == BUF_REGULAR && - !buf1->shared && !buf2->shared, - "buf1->vb != buf2->vb or not REGULAR or shared. buf1=%s buf2=%s", buf_desc (buf1).s, buf_desc (buf2).s); - - SWAP (buf1->memory, buf2->memory); - SWAP (buf1->data, buf2->data); - SWAP (buf1->len, buf2->len); - SWAP (buf1->param, buf2->param); - SWAPbits (buf1->size, buf2->size); -} - -void buf_print (BufferP buf, bool add_newline) -{ - for (uint64_t i=0; i < buf->len; i++) - fputc (buf->data[i], info_stream); // safer than printf %.*s ? - - iprint0 (add_newline ? 
"\n" : ""); -} - -//--------------------- -// Bits stuff -//--------------------- - -BitsP buf_alloc_bits_do (VBlockP vb, BufferP buf, uint64_t nbits, BitsInitType init_to, float grow_at_least_factor, rom name, FUNCLINE) -{ - ASSERT0 (buf->type == BUF_UNALLOCATED || buf->type == BUF_REGULAR, "buf needs to be BUF_UNALLOCATED or BUF_REGULAR"); - - uint64_t nwords = roundup_bits2words64 (nbits); - uint64_t old_nbits = buf->nbits; - - buf_alloc_(vb, buf, 0, nwords, sizeof(uint64_t), grow_at_least_factor, name, func, code_line); - - buf->nbits = nbits; - buf->nwords = nwords; - - bits_clear_excess_bits_in_top_word ((BitsP)buf); - - if (init_to == CLEAR) { - if (!old_nbits) bits_clear_all ((BitsP)buf); - else if (nbits > old_nbits) bits_clear_region ((BitsP)buf, old_nbits, nbits - old_nbits); - } - if (init_to == SET) { - if (!old_nbits) bits_set_all ((BitsP)buf); - else if (nbits > old_nbits) bits_set_region ((BitsP)buf, old_nbits, nbits - old_nbits); - } - - return (BitsP)buf; -} - -BitsP buf_overlay_bits_do (VBlockP vb, - BufferP top_buf, BufferP bottom_buf, - uint64_t start_byte_in_bottom_buf, - uint64_t nbits, - FUNCLINE, rom name) -{ - uint64_t nwords = roundup_bits2words64 (nbits); - - buf_overlay_do (evb, top_buf, bottom_buf, start_byte_in_bottom_buf, func, code_line, name); - - top_buf->nbits = nbits; - top_buf->nwords = nwords; - return (BitsP)top_buf; -} - -// convert a Buffer from a z_file section whose len is in char to a bits -Bits *buf_zfile_buf_to_bits (BufferP buf, uint64_t nbits) -{ - ASSERT (roundup_bits2bytes (nbits) <= buf->len, "nbits=%"PRId64" indicating a length of at least %"PRId64", but buf->len=%"PRId64, - nbits, roundup_bits2bytes (nbits), buf->len); - - Bits *bits = (BitsP)buf; - bits->nbits = nbits; - bits->nwords = roundup_bits2words64 (bits->nbits); - - ASSERT (roundup_bits2bytes64 (nbits) <= buf->size, "buffer to small: buf->size=%"PRId64" but bits has %"PRId64" words and hence requires %"PRId64" bytes", - (uint64_t)buf->size, 
bits->nwords, bits->nwords * sizeof(uint64_t)); - - LTEN_bits (bits); - - bits_clear_excess_bits_in_top_word (bits); - - return bits; -} - -void buf_add_bit (BufferP buf, int64_t new_bit) -{ - Bits *bar = (BitsP)buf; - - ASSERT (bar->nbits < buf->size * 8, "no room in Buffer %s to extend the bitmap", buf->name); - bar->nbits++; - if (bar->nbits % 64 == 1) { // starting a new word - bar->nwords++; - bar->words[bar->nwords-1] = new_bit; // LSb is as requested, other 63 bits are 0 - } - else - bits_assign (bar, bar->nbits-1, new_bit); -} - -uint64_t buf_extend_bits (BufferP buf, int64_t num_new_bits) -{ - Bits *bar = (BitsP)buf; - - ASSERT (bar->nbits + num_new_bits <= buf->size * 8, "Error in %s:%u: no room in Buffer %s to extend the bitmap: nbits=%"PRIu64", num_new_bits=%"PRId64", buf->size=%"PRIu64, - __FUNCLINE, buf->name, bar->nbits, num_new_bits, (uint64_t)buf->size); - - uint64_t next_bit = bar->nbits; - - bar->nbits += num_new_bits; - bar->nwords = roundup_bits2words64 (bar->nbits); - bits_clear_excess_bits_in_top_word (bar); - - return next_bit; -} - -//--------------------- -// Endianity stuff -//--------------------- - -void interlace_d8_buf (BufferP buf, LocalType *lt) { for_buf (int8_t, num, *buf) *num = (INTERLACE(int8_t, *num)); } -void BGEN_interlace_d16_buf (BufferP buf, LocalType *lt) { for_buf (int16_t, num, *buf) *num = BGEN16 (INTERLACE(int16_t, *num)); } -void BGEN_interlace_d32_buf (BufferP buf, LocalType *lt) { for_buf (int32_t, num, *buf) *num = BGEN32 (INTERLACE(int32_t, *num)); } -void BGEN_interlace_d64_buf (BufferP buf, LocalType *lt) { for_buf (int64_t, num, *buf) *num = BGEN64 (INTERLACE(int64_t, *num)); } -void LTEN_interlace_d16_buf (BufferP buf, LocalType *lt) { for_buf (int16_t, num, *buf) *num = LTEN16 (INTERLACE(int16_t, *num)); } -void LTEN_interlace_d32_buf (BufferP buf, LocalType *lt) { for_buf (int32_t, num, *buf) *num = LTEN32 (INTERLACE(int32_t, *num)); } -void LTEN_interlace_d64_buf (BufferP buf, LocalType *lt) { for_buf 
(int64_t, num, *buf) *num = LTEN64 (INTERLACE(int64_t, *num)); } - -void BGEN_u8_buf (BufferP buf, LocalType *lt) {} -void BGEN_u16_buf (BufferP buf, LocalType *lt) { if ( flag.is_lten) for_buf (uint16_t, num, *buf) *num = BGEN16 (*num); } -void BGEN_u32_buf (BufferP buf, LocalType *lt) { if ( flag.is_lten) for_buf (uint32_t, num, *buf) *num = BGEN32 (*num); } -void BGEN_u64_buf (BufferP buf, LocalType *lt) { if ( flag.is_lten) for_buf (uint64_t, num, *buf) *num = BGEN64 (*num); } -void LTEN_u16_buf (BufferP buf, LocalType *lt) { if (!flag.is_lten) for_buf (uint16_t, num, *buf) *num = LTEN16 (*num); } -void LTEN_u32_buf (BufferP buf, LocalType *lt) { if (!flag.is_lten) for_buf (uint32_t, num, *buf) *num = LTEN32 (*num); } -void LTEN_u64_buf (BufferP buf, LocalType *lt) { if (!flag.is_lten) for_buf (uint64_t, num, *buf) *num = LTEN64 (*num); } - -// number of columns is trasmitted in the count, except if this is a matrix of VCF samples, in which case param=0 and we take -// the number of columns to be the number of samples in the VCF header -static inline uint32_t BGEN_transpose_num_cols (ConstBufferP buf) -{ - uint32_t cols = buf->n_cols; // cols and rows in terms of the target non-transposed matrix (0 if vcf_num_samples) - - if (!cols) cols = vcf_header_get_num_samples(); - ASSERT0 (cols, "vcf_header_get_num_samples=0"); - - return cols; -} - -void BGEN_transpose_u8_buf (BufferP buf, LocalType *lt) -{ - if (!buf->len) return; - - uint32_t cols = BGEN_transpose_num_cols (buf); - uint32_t rows = buf->len / cols; - - buf_alloc (buf->vb, &buf->vb->scratch, 0, buf->len, uint8_t, 1, "scratch"); - ARRAY (uint8_t, target, buf->vb->scratch); - ARRAY (uint8_t, transposed, *buf); - - for (uint32_t c=0; c < cols; c++) - for (uint32_t r=0; r < rows; r++) - target[r * cols + c] = transposed[c * rows + r]; - - buf->vb->scratch.len = buf->len; - buf_copy (buf->vb, buf, &buf->vb->scratch, uint8_t, 0, 0, CTX_TAG_LOCAL); // copy and not move, so we can keep local's memory for next 
vb - - buf_free (buf->vb->scratch); - - if (lt) *lt = LT_UINT8; // no longer transposed -} - -void BGEN_transpose_u16_buf (BufferP buf, LocalType *lt) -{ - if (!buf->len) return; - - uint32_t cols = BGEN_transpose_num_cols (buf); - uint32_t rows = buf->len / cols; - - buf_alloc (buf->vb, &buf->vb->scratch, 0, buf->len, uint16_t, 1, "scratch"); - ARRAY (uint16_t, target, buf->vb->scratch); - ARRAY (uint16_t, transposed, *buf); - - for (uint32_t c=0; c < cols; c++) - for (uint32_t r=0; r < rows; r++) - target[r * cols + c] = BGEN16 (transposed[c * rows + r]); - - buf->vb->scratch.len = buf->len; - buf_copy (buf->vb, buf, &buf->vb->scratch, uint16_t, 0, 0, CTX_TAG_LOCAL); // copy and not move, so we can keep local's memory for next vb - - buf_free (buf->vb->scratch); - - *lt = LT_UINT16; // no longer transposed -} - -void BGEN_transpose_u32_buf (BufferP buf, LocalType *lt) -{ - if (!buf->len) return; - - uint32_t cols = BGEN_transpose_num_cols (buf); - uint32_t rows = buf->len / cols; - - buf_alloc (buf->vb, &buf->vb->scratch, 0, buf->len, uint32_t, 1, "scratch"); - ARRAY (uint32_t, target, buf->vb->scratch); - ARRAY (uint32_t, transposed, *buf); - - for (uint32_t c=0; c < cols; c++) - for (uint32_t r=0; r < rows; r++) - target[r * cols + c] = BGEN32 (transposed[c * rows + r]); - - buf->vb->scratch.len = buf->len; - buf_copy (buf->vb, buf, &buf->vb->scratch, uint32_t, 0, 0, CTX_TAG_LOCAL); // copy and not move, so we can keep local's memory for next vb - - buf_free (buf->vb->scratch); - - *lt = LT_UINT32; // no longer transposed -} - -void BGEN_deinterlace_d8_buf (BufferP buf, LocalType *lt) -{ - for (uint64_t i=0; i < buf->len; i++) { - uint8_t unum = *B8 (*buf, i); - *B(int8_t, *buf, i) = DEINTERLACE(int8_t,unum); - } -} - -void BGEN_deinterlace_d16_buf (BufferP buf, LocalType *lt) -{ - for (uint64_t i=0; i < buf->len; i++) { - uint16_t num_big_en = *B16 (*buf, i); - uint16_t unum = BGEN16 (num_big_en); - *B(int16_t, *buf, i) = DEINTERLACE(int16_t,unum); - } -} - 
-void BGEN_deinterlace_d32_buf (BufferP buf, LocalType *lt) -{ - for (uint64_t i=0; i < buf->len; i++) { - uint32_t num_big_en = *B32 ( *buf, i); - uint32_t unum = BGEN32 (num_big_en); - *B(int32_t, *buf, i) = DEINTERLACE(int32_t,unum); - } -} - -void BGEN_deinterlace_d64_buf (BufferP buf, LocalType *lt) -{ - for (uint64_t i=0; i < buf->len; i++) { - uint64_t num_big_en = *B64 (*buf, i); - uint64_t unum = BGEN64 (num_big_en); - *B(int64_t, *buf, i) = DEINTERLACE(int64_t,unum); - } -} +// ------------------------------------------------------------------ +// buffer.c +// Copyright (C) 2019-2024 Genozip Limited. Patent Pending. +// Please see terms and conditions in the files LICENSE.non-commercial.txt and LICENSE.commercial.txt + +#include "buffer.h" +#include "file.h" + +// writes a buffer to a file, return true if successful +// note: this is designed to run in any thread, so it cannot create any buffers in evb +bool buf_dump_to_file (rom filename, ConstBufferP buf, unsigned buf_word_width, bool including_control_region, + bool no_dirs, bool verbose, bool do_gzip) +{ + RETURNW (buf->type == BUF_REGULAR, false, + "FYI: failed to dump buffer.type=%s name=%s while putting %s", + buf_type_name (buf), buf->name ? 
buf->name : "(null)", filename); + + int fn_len = strlen(filename); + char update_filename[fn_len + 10]; + strcpy (update_filename, filename); + + if (no_dirs) { + for (unsigned i=0; i < fn_len; i++) + if (filename[i] == '/' || (flag.is_windows && (filename[i] == '\\' || (filename[i] == ':' && i!=1)))) + update_filename[i] = '-'; + filename = update_filename; + } + + bool success; + if (including_control_region) { + ASSERT (*(uint64_t *)(buf->memory) == UNDERFLOW_TRAP, "dumping to %s: buffer has underflowed", filename); + ASSERT (*(uint64_t *)(buf->data + buf->size) == OVERFLOW_TRAP, "dumping to %s: buffer has overflowed", filename); + + success = file_put_data (update_filename, buf->memory, buf_mem_size (buf), 0); + } + else + success = file_put_data (update_filename, buf->data, buf->len * buf_word_width, 0); + + if (success && do_gzip) + file_gzip (update_filename); // updates filename if successful + + if (success && verbose) iprintf ("\nDumped file %s\n", update_filename); + + return success; +} + +// copy data - possibly within the same buffer +void buf_copy_do (VBlockP dst_vb, BufferP dst, ConstBufferP src, + uint64_t bytes_per_entry, // how many bytes are counted by a unit of .len + uint64_t src_start_entry, uint64_t max_entries, // if 0 copies the entire buffer + FUNCLINE, + rom dst_name) // dst buffer settings, or take from src if 0 +{ + ASSERTNOTNULL (src); + ASSERTNOTNULL (dst); + + ASSERT (src->data, "called from %s:%u: src->data is NULL", func, code_line); + + ASSERT (!max_entries || src_start_entry < src->len, + "buf_copy of %s called from %s:%u: src_start_entry=%"PRIu64" is larger than src->len=%"PRIu64, buf_desc(src).s, func, code_line, src_start_entry, src->len); + + uint64_t num_entries = max_entries ? MIN_(max_entries, src->len - src_start_entry) : src->len - src_start_entry; + if (!bytes_per_entry) bytes_per_entry=1; + + if (num_entries) { + buf_alloc_(dst_vb, dst, 0, num_entries * bytes_per_entry, 1, 1, dst_name ? 
dst_name : src->name, func, code_line); + + if (dst != src || src_start_entry >= num_entries) + memcpy (dst->data, &src->data[src_start_entry * bytes_per_entry], num_entries * bytes_per_entry); + else + memmove (dst->data, &src->data[src_start_entry * bytes_per_entry], num_entries * bytes_per_entry); // need memmove for overlapping areas + } + + dst->len = num_entries; +} + +// removes a section from the buffer +void buf_remove_do (BufferP buf, unsigned sizeof_item, uint64_t remove_start, uint64_t remove_len) +{ + if (!remove_len) return; + + ASSERT (remove_start + remove_len <= buf->len, "Out of range: remove_start=%"PRIu64" + remove_len=%"PRIu64" > buf->len=%"PRIu64, + remove_start, remove_len, buf->len); + + if (remove_len != buf->len) { // skip in common case of deleting entire buffer + uint64_t remove_start_byte = remove_start * sizeof_item; + uint64_t remove_bytes = remove_len * sizeof_item; + memmove (buf->data + remove_start_byte, + buf->data + remove_start_byte + remove_bytes, + (buf->len * sizeof_item) - (remove_start_byte + remove_bytes)); + } + + buf->len -= remove_len; +} + +void buf_add (BufferP buf, STRp(data)) +{ + if (!data_len) return; // don't test for space (and "data" pointer) if length is 0 + + ASSERT (buf_has_space (buf, data_len), + "buf_add: buffer %s is out of space: len=%u size=%u data_len=%u", + buf_desc (buf).s, (uint32_t)buf->len, (uint32_t)buf->size, data_len); + buf_add_do (buf, data, data_len); +} + +void buf_insert_do (VBlockP vb, BufferP buf, unsigned width, uint64_t insert_at, const void *new_data, uint64_t new_data_len, rom name, FUNCLINE) +{ + if (!new_data_len) return; + + buf_alloc_(vb ? 
vb : buf->vb, buf, new_data_len + (width==1)/*room for \0 or separator if char*/, 0, width, CTX_GROWTH, name, func, code_line); + + if (insert_at != buf->len) { + ASSERT (insert_at < buf->len, "called from %s:%u: expecting insert_at=%"PRIu64" <= buf->len=%"PRIu64" in buf=%s", + func, code_line, insert_at, buf->len, buf_desc(buf).s); + + memmove (&buf->data[(insert_at + new_data_len) * width], &buf->data[insert_at * width], (buf->len - insert_at) * width); + } + + memcpy (&buf->data[insert_at * width], new_data, new_data_len * width); + buf->len += new_data_len; + + ASSERT (BOVERFLOW(buf) == OVERFLOW_TRAP, "buffer overflow: %s", buf_desc(buf).s); +} + +void buf_append_string (VBlockP vb, BufferP buf, rom str) +{ + uint64_t len = strlen (str); + ASSERT (len < 10000000, "len=%"PRIu64" too long, looks like a bug", len); + + buf_add_more (vb, buf, str, len, buf->name ? buf->name : "string_buf"); // allocates one char extra + *BAFTc (*buf) = '\0'; // string terminator without increasing buf->len +} + +// swaps buffers' content without affecting buffer list +void buf_swap (BufferP buf1, BufferP buf2) +{ + ASSERT (buf1->vb == buf2->vb && + buf1->type == BUF_REGULAR && buf2->type == BUF_REGULAR && + !buf1->shared && !buf2->shared, + "buf1->vb != buf2->vb or not REGULAR or shared. buf1=%s buf2=%s", buf_desc (buf1).s, buf_desc (buf2).s); + + SWAP (buf1->memory, buf2->memory); + SWAP (buf1->data, buf2->data); + SWAP (buf1->len, buf2->len); + SWAP (buf1->param, buf2->param); + SWAPbits (buf1->size, buf2->size); +} + +void buf_print (BufferP buf, bool add_newline) +{ + for (uint64_t i=0; i < buf->len; i++) + fputc (buf->data[i], info_stream); // safer than printf %.*s ? + + iprint0 (add_newline ? 
"\n" : ""); +} + +// iterator on a buffer containing newline-terminated lines +// false means continue iterating, true means stop +char *buf_foreach_line (BufferP buf, + bool reverse, // iterate backwards + TxtIteratorCallback callback, + void *cb_param1, void *cb_param2, unsigned cb_param3, // passed as-is to callback + int64_t *line_len) // out +{ + if (line_len) *line_len = 0; + + if (!buf->len) return NULL; + + char *firstbuf = buf->data; + char *afterbuf = BAFTc (*buf); + + char *first = !reverse ? firstbuf : 0; + char *after = !reverse ? 0 : afterbuf; + + while (1) { + + // get one line - searching forward or backwards + if (!reverse) { + for (after=first ; after < afterbuf && *after != '\n' ; after++); + after++; // skip newline + } + else { + for (first=after-2 /* skip final \n */; first >= firstbuf && *first != '\n'; first--); + first++; // after detected \n or at start of line + } + + if (!reverse && after > afterbuf) return NULL; // we don't call callback if after>afterbuf - beyond end of line + + if (callback (first, after - first, cb_param1, cb_param2, cb_param3)) { + if (line_len) *line_len = after - first; + return first; + } + + if (reverse && first == firstbuf) return NULL; // beginning of line - we called the cb + + if (!reverse) first=after; + else after=first; + } + + return 0; // never reaches here +} + +//--------------------- +// Bits stuff +//--------------------- + +BitsP buf_alloc_bits_do (VBlockP vb, BufferP buf, uint64_t nbits, BitsInitType init_to, float grow_at_least_factor, rom name, FUNCLINE) +{ + ASSERT0 (buf->type == BUF_UNALLOCATED || buf->type == BUF_REGULAR, "buf needs to be BUF_UNALLOCATED or BUF_REGULAR"); + + uint64_t nwords = roundup_bits2words64 (nbits); + uint64_t old_nbits = buf->nbits; + + buf_alloc_(vb, buf, 0, nwords, sizeof(uint64_t), grow_at_least_factor, name, func, code_line); + + buf->nbits = nbits; + buf->nwords = nwords; + + bits_clear_excess_bits_in_top_word ((BitsP)buf); + + if (init_to == CLEAR) { + if 
(!old_nbits) bits_clear_all ((BitsP)buf); + else if (nbits > old_nbits) bits_clear_region ((BitsP)buf, old_nbits, nbits - old_nbits); + } + if (init_to == SET) { + if (!old_nbits) bits_set_all ((BitsP)buf); + else if (nbits > old_nbits) bits_set_region ((BitsP)buf, old_nbits, nbits - old_nbits); + } + + return (BitsP)buf; +} + +BitsP buf_overlay_bits_do (VBlockP vb, + BufferP top_buf, BufferP bottom_buf, + uint64_t start_byte_in_bottom_buf, + uint64_t nbits, + FUNCLINE, rom name) +{ + uint64_t nwords = roundup_bits2words64 (nbits); + + buf_overlay_do (evb, top_buf, bottom_buf, start_byte_in_bottom_buf, func, code_line, name); + + top_buf->nbits = nbits; + top_buf->nwords = nwords; + return (BitsP)top_buf; +} + +// convert a Buffer from a z_file section whose len is in char to a bits +Bits *buf_zfile_buf_to_bits (BufferP buf, uint64_t nbits) +{ + ASSERT (roundup_bits2bytes (nbits) <= buf->len, "nbits=%"PRId64" indicating a length of at least %"PRId64", but buf->len=%"PRId64, + nbits, roundup_bits2bytes (nbits), buf->len); + + Bits *bits = (BitsP)buf; + bits->nbits = nbits; + bits->nwords = roundup_bits2words64 (bits->nbits); + + ASSERT (roundup_bits2bytes64 (nbits) <= buf->size, "buffer to small: buf->size=%"PRId64" but bits has %"PRId64" words and hence requires %"PRId64" bytes", + (uint64_t)buf->size, bits->nwords, bits->nwords * sizeof(uint64_t)); + + LTEN_bits (bits); + + bits_clear_excess_bits_in_top_word (bits); + + return bits; +} + +void buf_add_bit (BufferP buf, int64_t new_bit) +{ + Bits *bar = (BitsP)buf; + + ASSERT (bar->nbits < buf->size * 8, "no room in Buffer %s to extend the bitmap", buf->name); + bar->nbits++; + if (bar->nbits % 64 == 1) { // starting a new word + bar->nwords++; + bar->words[bar->nwords-1] = new_bit; // LSb is as requested, other 63 bits are 0 + } + else + bits_assign (bar, bar->nbits-1, new_bit); +} + +uint64_t buf_extend_bits (BufferP buf, int64_t num_new_bits) +{ + Bits *bar = (BitsP)buf; + + ASSERT (bar->nbits + num_new_bits <= 
buf->size * 8, "Error in %s:%u: no room in Buffer %s to extend the bitmap: nbits=%"PRIu64", num_new_bits=%"PRId64", buf->size=%"PRIu64, + __FUNCLINE, buf->name, bar->nbits, num_new_bits, (uint64_t)buf->size); + + uint64_t next_bit = bar->nbits; + + bar->nbits += num_new_bits; + bar->nwords = roundup_bits2words64 (bar->nbits); + bits_clear_excess_bits_in_top_word (bar); + + return next_bit; +} + +//--------------------- +// Endianity stuff +//--------------------- + +void interlace_d8_buf (BufferP buf, LocalType *lt) { for_buf (int8_t, num, *buf) *num = (INTERLACE(int8_t, *num)); } +void BGEN_interlace_d16_buf (BufferP buf, LocalType *lt) { for_buf (int16_t, num, *buf) *num = BGEN16 (INTERLACE(int16_t, *num)); } +void BGEN_interlace_d32_buf (BufferP buf, LocalType *lt) { for_buf (int32_t, num, *buf) *num = BGEN32 (INTERLACE(int32_t, *num)); } +void BGEN_interlace_d64_buf (BufferP buf, LocalType *lt) { for_buf (int64_t, num, *buf) *num = BGEN64 (INTERLACE(int64_t, *num)); } +void LTEN_interlace_d16_buf (BufferP buf, LocalType *lt) { for_buf (int16_t, num, *buf) *num = LTEN16 (INTERLACE(int16_t, *num)); } +void LTEN_interlace_d32_buf (BufferP buf, LocalType *lt) { for_buf (int32_t, num, *buf) *num = LTEN32 (INTERLACE(int32_t, *num)); } +void LTEN_interlace_d64_buf (BufferP buf, LocalType *lt) { for_buf (int64_t, num, *buf) *num = LTEN64 (INTERLACE(int64_t, *num)); } + +void BGEN_u8_buf (BufferP buf, LocalType *lt) {} +void BGEN_u16_buf (BufferP buf, LocalType *lt) { if ( flag.is_lten) for_buf (uint16_t, num, *buf) *num = BGEN16 (*num); } +void BGEN_u32_buf (BufferP buf, LocalType *lt) { if ( flag.is_lten) for_buf (uint32_t, num, *buf) *num = BGEN32 (*num); } +void BGEN_u64_buf (BufferP buf, LocalType *lt) { if ( flag.is_lten) for_buf (uint64_t, num, *buf) *num = BGEN64 (*num); } +void LTEN_u16_buf (BufferP buf, LocalType *lt) { if (!flag.is_lten) for_buf (uint16_t, num, *buf) *num = LTEN16 (*num); } +void LTEN_u32_buf (BufferP buf, LocalType *lt) { if (!flag.is_lten) 
for_buf (uint32_t, num, *buf) *num = LTEN32 (*num); } +void LTEN_u64_buf (BufferP buf, LocalType *lt) { if (!flag.is_lten) for_buf (uint64_t, num, *buf) *num = LTEN64 (*num); } + +// number of columns is trasmitted in the count, except if this is a matrix of VCF samples, in which case param=0 and we take +// the number of columns to be the number of samples in the VCF header +static inline uint32_t BGEN_transpose_num_cols (ConstBufferP buf) +{ + uint32_t cols = buf->n_cols; // cols and rows in terms of the target non-transposed matrix (0 if vcf_num_samples) + + if (!cols) cols = vcf_header_get_num_samples(); + ASSERT0 (cols, "vcf_header_get_num_samples=0"); + + return cols; +} + +void BGEN_transpose_u8_buf (BufferP buf, LocalType *lt) +{ + if (!buf->len) return; + + uint32_t cols = BGEN_transpose_num_cols (buf); + uint32_t rows = buf->len / cols; + + buf_alloc (buf->vb, &buf->vb->scratch, 0, buf->len, uint8_t, 1, "scratch"); + ARRAY (uint8_t, target, buf->vb->scratch); + ARRAY (uint8_t, transposed, *buf); + + for (uint32_t c=0; c < cols; c++) + for (uint32_t r=0; r < rows; r++) + target[r * cols + c] = transposed[c * rows + r]; + + buf->vb->scratch.len = buf->len; + buf_copy (buf->vb, buf, &buf->vb->scratch, uint8_t, 0, 0, CTX_TAG_LOCAL); // copy and not move, so we can keep local's memory for next vb + + buf_free (buf->vb->scratch); + + if (lt) *lt = LT_UINT8; // no longer transposed +} + +void BGEN_transpose_u16_buf (BufferP buf, LocalType *lt) +{ + if (!buf->len) return; + + uint32_t cols = BGEN_transpose_num_cols (buf); + uint32_t rows = buf->len / cols; + + buf_alloc (buf->vb, &buf->vb->scratch, 0, buf->len, uint16_t, 1, "scratch"); + ARRAY (uint16_t, target, buf->vb->scratch); + ARRAY (uint16_t, transposed, *buf); + + for (uint32_t c=0; c < cols; c++) + for (uint32_t r=0; r < rows; r++) + target[r * cols + c] = BGEN16 (transposed[c * rows + r]); + + buf->vb->scratch.len = buf->len; + buf_copy (buf->vb, buf, &buf->vb->scratch, uint16_t, 0, 0, CTX_TAG_LOCAL); 
// copy and not move, so we can keep local's memory for next vb + + buf_free (buf->vb->scratch); + + *lt = LT_UINT16; // no longer transposed +} + +void BGEN_transpose_u32_buf (BufferP buf, LocalType *lt) +{ + if (!buf->len) return; + + uint32_t cols = BGEN_transpose_num_cols (buf); + uint32_t rows = buf->len / cols; + + buf_alloc (buf->vb, &buf->vb->scratch, 0, buf->len, uint32_t, 1, "scratch"); + ARRAY (uint32_t, target, buf->vb->scratch); + ARRAY (uint32_t, transposed, *buf); + + for (uint32_t c=0; c < cols; c++) + for (uint32_t r=0; r < rows; r++) + target[r * cols + c] = BGEN32 (transposed[c * rows + r]); + + buf->vb->scratch.len = buf->len; + buf_copy (buf->vb, buf, &buf->vb->scratch, uint32_t, 0, 0, CTX_TAG_LOCAL); // copy and not move, so we can keep local's memory for next vb + + buf_free (buf->vb->scratch); + + *lt = LT_UINT32; // no longer transposed +} + +void BGEN_deinterlace_d8_buf (BufferP buf, LocalType *lt) +{ + for (uint64_t i=0; i < buf->len; i++) { + uint8_t unum = *B8 (*buf, i); + *B(int8_t, *buf, i) = DEINTERLACE(int8_t,unum); + } +} + +void BGEN_deinterlace_d16_buf (BufferP buf, LocalType *lt) +{ + for (uint64_t i=0; i < buf->len; i++) { + uint16_t num_big_en = *B16 (*buf, i); + uint16_t unum = BGEN16 (num_big_en); + *B(int16_t, *buf, i) = DEINTERLACE(int16_t,unum); + } +} + +void BGEN_deinterlace_d32_buf (BufferP buf, LocalType *lt) +{ + for (uint64_t i=0; i < buf->len; i++) { + uint32_t num_big_en = *B32 ( *buf, i); + uint32_t unum = BGEN32 (num_big_en); + *B(int32_t, *buf, i) = DEINTERLACE(int32_t,unum); + } +} + +void BGEN_deinterlace_d64_buf (BufferP buf, LocalType *lt) +{ + for (uint64_t i=0; i < buf->len; i++) { + uint64_t num_big_en = *B64 (*buf, i); + uint64_t unum = BGEN64 (num_big_en); + *B(int64_t, *buf, i) = DEINTERLACE(int64_t,unum); + } +} diff --git a/src/buffer.h b/src/buffer.h index 88c5a338..4337f176 100644 --- a/src/buffer.h +++ b/src/buffer.h @@ -168,8 +168,8 @@ extern void buf_add (BufferP buf, STRp(data)); #define 
buf_add_moreC(vb_, buf, literal_str, name) buf_add_more ((VBlockP)(vb_), (buf), literal_str, sizeof literal_str-1, (name)) #define buf_add_moreS(vb_, buf, str, name) buf_add_more ((VBlockP)(vb_), (buf), str, str##_len, (name)) -#define buf_add_buf(vb_,dst_buf,src_buf,type,name) ({ \ - buf_alloc ((vb_) ? (VBlockP)(vb_) : (dst_buf)->vb, (dst_buf), (src_buf)->len, 0, type, CTX_GROWTH, (name)); \ +#define buf_add_buf(dst_vb,dst_buf,src_buf,type,name) ({ \ + buf_alloc ((dst_vb) ? (VBlockP)(dst_vb) : (dst_buf)->vb, (dst_buf), (src_buf)->len, 0, type, CTX_GROWTH, (name)); \ memcpy (BAFT(type, *(dst_buf)), (src_buf)->data, (src_buf)->len * sizeof (type)); \ (dst_buf)->len += (src_buf)->len; }) @@ -226,6 +226,9 @@ extern void buf_copy_do (VBlockP dst_vb, BufferP dst, ConstBufferP src, uint64_t #define buf_copy(dst_vb,dst,src,type,src_start_entry,max_entries,dst_name) \ buf_copy_do ((VBlockP)(dst_vb),(dst),(src),sizeof(type),(src_start_entry),(max_entries),__FUNCLINE,(dst_name)) +typedef bool (*TxtIteratorCallback)(rom line, unsigned line_len, void *cb_param1, void *cb_param2, unsigned cb_param3); +extern char *buf_foreach_line (BufferP buf, bool reverse, TxtIteratorCallback callback, void *cb_param1, void *cb_param2, unsigned cb_param3, int64_t *line_len); + extern void buf_print (BufferP buf, bool add_newline); extern bool buf_dump_to_file (rom filename, ConstBufferP buf, unsigned buf_word_width, bool including_control_region, bool no_dirs, bool verbose, bool do_gzip); diff --git a/src/chrom.c b/src/chrom.c index 0425b25d..7b4b7c89 100644 --- a/src/chrom.c +++ b/src/chrom.c @@ -99,8 +99,8 @@ void chrom_2ref_load (Reference ref) WordIndex chrom_index = BGEN32 (ent->chrom_index); WordIndex ref_index = BGEN32 (ent->ref_index); - ASSERT (IN_RANGE(chrom_index, 0, zctx->word_list.len32-1), "chrom_index=%d ∉ [0,%d]", chrom_index, (int32_t)zctx->word_list.len-1); - ASSERT (!num_ref_contigs /* ref not loaded */ || IN_RANGE (ref_index, -1, num_ref_contigs-1), + ASSERT 
(IN_RANGE(chrom_index, 0, zctx->word_list.len32), "chrom_index=%d ∉ [0,%d]", chrom_index, (int32_t)zctx->word_list.len-1); + ASSERT (!num_ref_contigs /* ref not loaded */ || IN_RANGE (ref_index, -1, num_ref_contigs), "ref_index=%d ∉ [-1,%u] (chrom_index=%u i=%u len=%u)", ref_index, num_ref_contigs-1, chrom_index, i, evb->scratch.len32); @@ -135,7 +135,7 @@ WordIndex chrom_2ref_seg_get (Reference ref, ConstVBlockP vb, WordIndex chrom_in : (chrom_index < ctx->chrom2ref_map.len32) ? *B(WordIndex, ctx->chrom2ref_map, chrom_index - ol_len) // possibly WORD_INDEX_NONE, see chrom_seg_ex : WORD_INDEX_NONE; - ASSSEG (IN_RANGE (ref_index, WORD_INDEX_NONE, (WordIndex)ref_num_contigs (ref)-1), + ASSSEG (IN_RANGE (ref_index, WORD_INDEX_NONE, (WordIndex)ref_num_contigs (ref)), "ref_index=%d out of range: ref->ranges.len=%u, chrom_index=%d", ref_index, ref_num_contigs (ref), chrom_index); return ref_index; diff --git a/src/codec.c b/src/codec.c index 94fc0f2c..a622d728 100644 --- a/src/codec.c +++ b/src/codec.c @@ -14,7 +14,7 @@ #include "zfile.h" #include "zip.h" #include "profiler.h" -#include "bgzf.h" +#include "mgzip.h" // -------------------------------------- // memory functions that serve the codecs @@ -94,7 +94,7 @@ static uint32_t codec_est_size_default (Codec codec, uint64_t uncompressed_len) // returns 4-character codec name rom codec_name (Codec codec) { - return IN_RANGE (codec, 0, NUM_CODECS-1) ? codec_args[codec].name : "BAD!"; + return IN_RANGE (codec, 0, NUM_CODECS) ? 
codec_args[codec].name : "BAD!"; } void codec_initialize (void) diff --git a/src/codec.h b/src/codec.h index e7861e51..1ba11c56 100644 --- a/src/codec.h +++ b/src/codec.h @@ -97,7 +97,11 @@ typedef struct { { 0, "SMUX", "+", codec_smux_compress, USE_SUBCODEC, codec_smux_reconstruct, codec_trivial_size, }, \ { 0, "ORA", "+.ora", NA1, NA2, NA3, NA4 }, \ { 0, "OQ", "+", codec_oq_compress, USE_SUBCODEC, codec_oq_reconstruct, codec_RANB_est_size, }, \ - { 0, "GZIL", "+.gz", NA1, NA2, NA3, NA4 }, \ + { 0, "IL1M", "+.gz", NA1, NA2, NA3, NA4 }, \ + { 0, "MGZF", "+.gz", NA1, NA2, NA3, NA4 }, \ + { 0, "MGSP", "+.gz", NA1, NA2, NA3, NA4 }, \ + { 0, "EMFL", "+.gz", NA1, NA2, NA3, NA4 }, \ + { 0, "EMVL", "+.gz", NA1, NA2, NA3, NA4 }, \ } extern CodecArgs codec_args[NUM_CODECS]; diff --git a/src/codec_domq.c b/src/codec_domq.c index c5f48e62..e1c81596 100644 --- a/src/codec_domq.c +++ b/src/codec_domq.c @@ -155,7 +155,7 @@ static void codec_domq_calc_histogram (VBlockP vb, ContextP qual_ctx, ContextP d // validate bases and get line dom uint32_t max_score_count=0; for (int ascii_i=0; ascii_i < 256; ascii_i++) { - ASSERT (IN_RANGE (ascii_i, FIRST_Q, LAST_Q) || !line_ascii_histogram[ascii_i], + ASSERT (IN_RANGE (ascii_i, FIRST_Q, LAST_Q+1) || !line_ascii_histogram[ascii_i], "%s/%u: QUAL value=%u ∉ [%u, %u] for %s", VB_NAME, line_i, ascii_i, FIRST_Q, LAST_Q, qual_ctx->tag_name); diff --git a/src/codec_htscodecs.c b/src/codec_htscodecs.c index 1ca9b98d..cfceefad 100644 --- a/src/codec_htscodecs.c +++ b/src/codec_htscodecs.c @@ -41,7 +41,7 @@ static bool codec_hts_compress (VBlockP vb, ContextP ctx, rom uncompressed, // option 1 - compress contiguous data uint32_t *uncompressed_len, LocalGetLineCB get_line_cb, // option 2 - compress data one line at a time - STRe(compressed), // in/out + qSTRp(compressed), // in/out uint8_t *(*func)(VBlockP vb, uint8_t *in, unsigned in_size, uint8_t *out, unsigned *out_size, int order), int order, FailType soft_fail, rom name) { diff --git 
a/src/compressor.c b/src/compressor.c index 2d21899c..67769028 100644 --- a/src/compressor.c +++ b/src/compressor.c @@ -184,7 +184,7 @@ uint32_t comp_compress (VBlockP vb, // compress primary context of a complex codec, after codec code as prepared the data in ctx->local. The other contexts // of the complex codec are marked with DEP_L* and will be compressed in the normal ctx->local compression loop bool comp_compress_complex_codec (VBlockP vb, ContextP ctx, SectionHeaderP header, bool is_2nd_try, - uint32_t *uncompressed_len, STRe(compressed), rom name) + uint32_t *uncompressed_len, qSTRp(compressed), rom name) { if (!is_2nd_try) { Codec save_lcodec = ctx->lcodec; diff --git a/src/compressor.h b/src/compressor.h index 38297552..a7fc2fec 100644 --- a/src/compressor.h +++ b/src/compressor.h @@ -15,7 +15,7 @@ extern uint32_t comp_compress (VBlockP vb, ContextP ctx, BufferP z_data, Section LocalGetLineCB callback, rom name); // option 2 - compress data one line at a time extern bool comp_compress_complex_codec (VBlockP vb, ContextP ctx, SectionHeaderP header, bool is_2nd_try, - uint32_t *uncompressed_len, STRe (compressed), rom name); + uint32_t *uncompressed_len, qSTRp (compressed), rom name); extern void comp_uncompress (VBlockP vb, ContextP ctx, Codec codec, Codec sub_codec, uint8_t param, STRp(compressed_data), BufferP uncompressed_data, uint64_t uncompressed_len, rom name); diff --git a/src/container.c b/src/container.c index e19c0a35..12083788 100644 --- a/src/container.c +++ b/src/container.c @@ -73,10 +73,10 @@ WordIndex container_seg_do (VBlockP vb, ContextP ctx, ConstContainerP con, container_prepare_snip (con, STRa(prefixes), qSTRa(snip)); if (flag.show_containers) { - iprintf ("%s%sVB=%u Line=%d Ctx=%u:%s Repeats=%u RepSep=%u,%u Items=", + iprintf ("%s%s%s Ctx=%u:%s Repeats=%u RepSep=%u,%u Items=", vb->preprocessing ? "preproc " : "", vb->peek_stack_level ? 
"peeking " : "", - vb->vblock_i, vb->line_i, ctx->did_i, ctx->tag_name, con->repeats, con->repsep[0], con->repsep[1]); + LN_NAME, ctx->did_i, ctx->tag_name, con->repeats, con->repsep[0], con->repsep[1]); for_con (con) if (item->dict_id.num) iprintf ("%s(%u) ", dis_dict_id (item->dict_id).s, ctx->did_i); @@ -315,10 +315,10 @@ static inline void container_toplevel_filter (VBlockP vb, uint32_t rep_i, rom re vb->num_nondrop_lines++; if (show_non_item && vb->drop_curr_line) // show container reconstruction - iprintf ("%s%sVB=%u Line=%d dropped due to \"%s\"\n", + iprintf ("%s%s%s dropped due to \"%s\"\n", vb->preprocessing ? "preproc " : "", vb->peek_stack_level ? "peeking " : "", - vb->vblock_i, vb->line_i, vb->drop_curr_line); + LN_NAME, vb->drop_curr_line); } CONTAINER_FILTER_FUNC (default_piz_filter) @@ -472,10 +472,10 @@ ValueType container_reconstruct (VBlockP vb, ContextP ctx, ConstContainerP con, bool show_non_item = vb->show_containers && (!flag.dict_id_show_containers.num || dict_id_typeless (ctx->dict_id).num == flag.dict_id_show_containers.num); if (show_non_item) // show container reconstruction (note: before container_reconstruct_prefix which modifies prefixes) - iprintf ("%s%sVB=%u Line=%d Container(%s)=%s\n", + iprintf ("%s%s%s Container(%s)=%s\n", vb->preprocessing ? "preproc " : "", vb->peek_stack_level ? "peeking " : "", - vb->vblock_i, vb->line_i, dis_dict_id (ctx->dict_id).s, + LN_NAME, dis_dict_id (ctx->dict_id).s, container_to_json (con, prefixes_len ? prefixes-1 : NULL, prefixes_len ? 
prefixes_len+1 : 0).s); // +1 to add back initial CON_PX_SEP removed by container_retrieve // container wide prefix - it will be missing if Container has no prefixes, or empty if it has only items prefixes @@ -536,10 +536,10 @@ ValueType container_reconstruct (VBlockP vb, ContextP ctx, ConstContainerP con, unsigned num_preceding_seps = 0; if (show_non_item) // show container reconstruction - iprintf ("%s%sVB=%u Line=%d Repeat=%u LastRepeat=%u %s\n", + iprintf ("%s%s%s Repeat=%u LastRepeat=%u %s\n", vb->preprocessing ? "preproc " : "", vb->peek_stack_level ? "peeking " : "", - vb->vblock_i, vb->line_i, rep_i, con->repeats-1, ctx->tag_name); + VB_NAME, rep_i, con->repeats-1, ctx->tag_name); for (unsigned item_i=0; item_i < num_items; item_i++) { const ContainerItem *item = &con->items[item_i]; @@ -569,10 +569,10 @@ ValueType container_reconstruct (VBlockP vb, ContextP ctx, ConstContainerP con, } if (show_item) - iprintf ("%s%sVB=%u Line=%d Repeat=%u %s(%u)->%s(%u) trans_id=%u txt_data.len=%"PRIu64" (0x%04"PRIx64") reconstruct_prefix=%d reconstruct_value=%d : ", + iprintf ("%s%s%s Repeat=%u %s(%u)->%s(%u) trans_id=%u txt_data.len=%"PRIu64" (0x%04"PRIx64") reconstruct_prefix=%d reconstruct_value=%d : ", vb->preprocessing ? "preproc " : "", vb->peek_stack_level ? "peeking " : "", - vb->vblock_i, vb->line_i, rep_i, ctx->tag_name, ctx->did_i, item_ctx->tag_name, item_ctx->did_i, + LN_NAME, rep_i, ctx->tag_name, ctx->did_i, item_ctx->tag_name, item_ctx->did_i, trans_item ? 
item->translator : 0, vb->vb_position_txt_file + Ltxt, vb->vb_position_txt_file + Ltxt, reconstruct, reconstruct && !trans_nor); diff --git a/src/container.h b/src/container.h index 9682a26d..53414a74 100644 --- a/src/container.h +++ b/src/container.h @@ -124,7 +124,7 @@ extern bool curr_container_has (VBlockP vb, DictId item_dict_id); // a prefix for each item (may be empty) + CON_PX_SEP // a suffix for each repeat + CON_PX_SEP // empty prefixes of trailing items may be omitted -extern void container_prepare_snip (ConstContainerP con, STRp(prefixes), STRe (snip)); +extern void container_prepare_snip (ConstContainerP con, STRp(prefixes), qSTRp (snip)); extern WordIndex container_seg_do (VBlockP vb, ContextP ctx, ConstContainerP con, STRp(prefixes), unsigned add_bytes, bool *is_new); #define container_seg(vb, ctx, con, prefixes, prefixes_len, add_bytes) container_seg_do ((VBlockP)(vb), (ctx), (con), (prefixes), (prefixes_len), (add_bytes), NULL) #define container_seg_by_dict_id(vb,dict_id,con,add_bytes) container_seg (vb, ctx_get_ctx (vb, dict_id), con, NULL, 0, add_bytes) diff --git a/src/context_struct.h b/src/context_struct.h index 2b8e0068..38e92d6f 100644 --- a/src/context_struct.h +++ b/src/context_struct.h @@ -57,9 +57,9 @@ typedef struct Context { }; LocalType ltype; // LT_* - type of local data - included in the section header - LocalType pair_ltype; // LT_* - Used if this file is a PAIR_2 - type of local data of PAIR_1 + LocalType pair_ltype; // LT_* - Used if this file is a PAIR_R2 - type of local data of PAIR_R1 struct FlagsCtx flags; // flags to be included in section header - struct FlagsCtx pair_flags;// Used if this file is a PAIR_2 - contains ctx->flags of the PAIR_1 + struct FlagsCtx pair_flags;// Used if this file is a PAIR_R2 - contains ctx->flags of the PAIR_R1 struct FlagsDict dict_flags; // ZIP zctx ; PIZ: zctx+vctx . 
Tramsmiited via SectionFlags.dictinonary (v15) B250Size b250_size; // Size type of element in b250 data (PIZ and ZIP after generation) v14 B250Size pair_b250_size; @@ -81,7 +81,7 @@ typedef struct Context { // PIZ: .data contains the word indices (i.e. indices into word_list) in base-250 Buffer local; // ZIP/PIZ vctx: Data private to this VB that is not in the dictionary // ZIP zctx - only .len - number of fields of this type segged in the file (for stats) - Buffer b250R1; // ZIP/PIZ: used by PAIR_2 FASTQ VBs (inc. in Deep SAM), for paired contexts: PAIR_1 b250 data from corresponding VB (in PIZ: only if CTX_PAIR_LOAD) + Buffer b250R1; // ZIP/PIZ: used by PAIR_R2 FASTQ VBs (inc. in Deep SAM), for paired contexts: PAIR_R1 b250 data from corresponding VB (in PIZ: only if CTX_PAIR_LOAD) Buffer counts; // ZIP/PIZ: counts of snips (VB:uint32_t, z_file:uint64_t) // ZIP: counts.param is a context-specific global counter that gets accumulated in zctx during merge (e.g. OPTION_SA_CIGAR) @@ -128,7 +128,7 @@ typedef struct Context { Buffer ref2chrom_map; // ZIP: zctx: SAM/BAM/VCF: CHROM: reverse mapping from ref_index to chrom, created by ref_compress_ref Buffer con_len; // PIZ: vctx: use by contexts that might have containers: Array of uint16_t - length of item in cache // FASTQ - Buffer localR1; // ZIP/PIZ vctx: PAIR_2 FASTQ VBs (inc. in Deep SAM): for paired contexts: PAIR_1 local data from corresponding VB (in PIZ: only if fastq_use_pair_assisted). Note: contexts with containers are always no_stons, so they have no local - therefore no issue with union conflict. + Buffer localR1; // ZIP/PIZ vctx: PAIR_R2 FASTQ VBs (inc. in Deep SAM): for paired contexts: PAIR_R1 local data from corresponding VB (in PIZ: only if fastq_use_pair_assisted). Note: contexts with containers are always no_stons, so they have no local - therefore no issue with union conflict. 
// VCF Buffer format_contexts; // ZIP: vctx: VCF_SAMPLES: an array of format_mapper_buf.len of ContextPBlock Buffer insertion; // PIZ: vctx: INFO_SF: inserted INFO fields reconstructed after samples diff --git a/src/cram.c b/src/cram.c index c22f34b5..d966c485 100644 --- a/src/cram.c +++ b/src/cram.c @@ -207,7 +207,7 @@ static bool cram_inspect_file_definition_data (FileP file) // case: this is actually a GZ file (possibly BAM) if (c_len >= 2 && c[0] == 0x1f && c[1] == 0x8b) { - file->codec = file->source_codec = CODEC_GZ; + file->effective_codec = file->src_codec = CODEC_GZ; file->data_type = DT_GNRIC; // generic_is_header_done will figure out the true data type file->type = GNRIC_GZ; return false; @@ -215,7 +215,7 @@ static bool cram_inspect_file_definition_data (FileP file) // case: this is not CRAM, but not a GZ file (perhaps SAM) else if (c_len < 26 || memcmp (c, CRAM_MAGIC, STRLEN(CRAM_MAGIC))) { - file->codec = file->source_codec = CODEC_NONE; + file->effective_codec = file->src_codec = CODEC_NONE; file->data_type = DT_GNRIC; file->type = GNRIC; return false; diff --git a/src/data_types.c b/src/data_types.c index b9f23d01..e6e8aa53 100644 --- a/src/data_types.c +++ b/src/data_types.c @@ -65,10 +65,10 @@ const DtTranslation dt_get_translation (VBlockP vb) // vb=NULL relates to the tx } // if file is_txt_binary - return the equivalent textual type, or just the type if not -DataType dt_get_dt_for_genozip_header (DataType dt, Codec source_codec) +DataType dt_get_dt_for_genozip_header (DataType dt, Codec src_codec) { - return (dt == DT_VCF && source_codec == CODEC_BCF) ? DT_BCF // note: this goes into the GenozipHeader, but converted in piz to z_file->data_type=VCF & z_file->source_code=CODEC_BCF - : (dt == DT_BAM && source_codec == CODEC_CRAM) ? DT_CRAM // note: likewise + return (dt == DT_VCF && src_codec == CODEC_BCF) ? 
DT_BCF // note: this goes into the GenozipHeader, but converted in piz to z_file->data_type=VCF & z_file->source_code=CODEC_BCF + : (dt == DT_BAM && src_codec == CODEC_CRAM) ? DT_CRAM // note: likewise : (dt == DT_BAM) ? DT_SAM : dt; } @@ -86,11 +86,11 @@ rom dt_name_faf (DataType dt) rom z_dt_name (void) { - return IS_SRC_BAM ? "BAM" - : IS_SRC_CRAM ? "CRAM" - : IS_SRC_BCF ? "BCF" - : z_file ? dt_name (z_file->data_type) - : "ERR_NULL_Z_FILE"; + return (txt_file && IS_SRC_BAM) ? "BAM" + : (txt_file && IS_SRC_CRAM) ? "CRAM" + : (txt_file && IS_SRC_BCF) ? "BCF" + : z_file ? dt_name (z_file->data_type) + : ""; } rom z_dt_name_faf (void) diff --git a/src/data_types.h b/src/data_types.h index 078e20b1..37a168e8 100644 --- a/src/data_types.h +++ b/src/data_types.h @@ -66,7 +66,7 @@ typedef struct DataTypeProperties { #define HEADER_NEED_MORE -1 #define HEADER_DATA_TYPE_CHANGED -2 // when reading header, data type of txt/z_file is changed int32_t (*is_header_done) (bool is_eof); // ZIP: header length if header read is complete, HEADER_NEED_MORE if not complete yet + sets lines.len - int32_t (*unconsumed) (VBlockP, uint32_t first_i, int32_t *i); // called by main thread called by txtfile_get_unconsumed_to_pass_to_next_vb to get the length of unconsumed txt to pass to next vb. returns -1 if first_i is too high and it needs more data. + int32_t (*unconsumed) (VBlockP, uint32_t first_i); // called by main thread called by txtfile_get_unconsumed_to_pass_to_next_vb to get the length of unconsumed txt to pass to next vb. returns -1 if first_i is too high and it needs more data. bool (*inspect_txt_header) (VBlockP txt_header_vb, BufferP txt_header, struct FlagsTxtHeader txt_header_flags); // called by main thread to verify the txt header. returns false if this txt file should be skipped bool (*is_data_type) (STRp(header), bool *need_more); // ZIP: check for signature of the data type, to rescue generic files. 
If adding, also add to test.sh:test_redirected @@ -139,23 +139,23 @@ typedef struct DataTypeProperties { #define usz(type) ((unsigned)sizeof(type)) #define DATA_TYPE_PROPERTIES { \ -/* name is_bin \n-end use_ref txt_type bin_type sizeof_vb sizeof_zip_dataline txt_headr hdr_contigs 1st is_header_done unconsumed inspect_txt_header is_data_type zip_initialize zip_after_segconf zip_after_vbs, zip_finalize zip_free_end_of_z zip_init_vb zip_after_compute zip_dts_flag zip_set_vb_header_specific zip_set_txt_header_flags zip_modify seg_initialize seg_txt_line assseg_line seg_is_big seg_is_small seg_finalize segconf_finalize zip_custom_merge seg_modifies zip_after_compress generate_recon_plan stats_reallocate zip_genozip_header piz_genozip_header piz_after_global_area piz_preprocess piz_header_init piz_initialize piz_finalize piz_before_read piz_init_vb piz_recon_init piz_init_line piz_after_recon piz_process_recon piz_after_preproc piz_xtra_line_data is_skip_section reconstruct_seq container_filter container_cb con_item_cb num_special special num_trans translators line_name dtype_names */ \ - [DT_VCF] = { "VCF", false, true, true, DT_VCF, DT_NONE, vcf_vb_size, vcf_vb_zip_dl_size, HDR_MUST, VCF_CONTIG_FMT, '#', vcf_is_header_done, NULL, vcf_inspect_txt_header, is_vcf, vcf_zip_initialize, NULL, vcf_zip_after_vbs, vcf_zip_finalize, vcf_header_finalize, vcf_zip_init_vb, vcf_zip_after_compute, NULL, vcf_zip_set_vb_header_specific, vcf_zip_set_txt_header_flags, vcf_zip_modify, vcf_seg_initialize, vcf_seg_txt_line, NULL, vcf_seg_is_big, vcf_seg_is_small, vcf_seg_finalize, vcf_segconf_finalize, NULL, false, vcf_zip_after_compress, NULL, NULL, vcf_zip_genozip_header, vcf_piz_genozip_header, NULL, NULL, vcf_piz_header_init, NULL, vcf_piz_finalize, NULL, vcf_piz_init_vb, vcf_piz_recon_init, vcf_reset_line, NULL, NULL, NULL, NULL, vcf_piz_is_skip_section, NULL, vcf_piz_filter, vcf_piz_container_cb, vcf_piz_con_item_cb, NUM_VCF_SPECIAL, VCF_SPECIAL, 0, {}, "variant", { "FIELD", "INFO", 
"FORMAT", "BOTH" } }, \ - [DT_BCF] = { "BCF", true, true, true, DT_VCF, DT_BCF, vcf_vb_size, vcf_vb_zip_dl_size, HDR_MUST, VCF_CONTIG_FMT, '#', NULL, NULL, vcf_inspect_txt_header, NULL, vcf_zip_initialize, NULL, vcf_zip_after_vbs, vcf_zip_finalize, vcf_header_finalize, vcf_zip_init_vb, vcf_zip_after_compute, NULL, vcf_zip_set_vb_header_specific, vcf_zip_set_txt_header_flags, vcf_zip_modify, vcf_seg_initialize, vcf_seg_txt_line, NULL, vcf_seg_is_big, vcf_seg_is_small, vcf_seg_finalize, vcf_segconf_finalize, NULL, false, vcf_zip_after_compress, NULL, NULL, vcf_zip_genozip_header, vcf_piz_genozip_header, NULL, NULL, vcf_piz_header_init, NULL, vcf_piz_finalize, NULL, vcf_piz_init_vb, vcf_piz_recon_init, vcf_reset_line, NULL, NULL, NULL, NULL, vcf_piz_is_skip_section, NULL, vcf_piz_filter, vcf_piz_container_cb, vcf_piz_con_item_cb, NUM_VCF_SPECIAL, VCF_SPECIAL, 0, {}, "variant", { "FIELD", "INFO", "FORMAT", "BOTH" } }, \ - [DT_SAM] = { "SAM", false, true, true, DT_SAM, DT_BAM, sam_vb_size, sam_vb_zip_dl_size, HDR_OK_0, SAM_CONTIG_FMT, '@', NULL, NULL, sam_header_inspect, is_sam, sam_zip_initialize, sam_set_sag_type, sam_zip_after_vbs, sam_zip_finalize, sam_zip_free_end_of_z, sam_zip_init_vb, sam_zip_after_compute, sam_zip_dts_flag, sam_zip_set_vb_header_specific, NULL, sam_zip_modify, sam_seg_initialize, sam_seg_txt_line, NULL, sam_seg_is_big, sam_seg_is_small, sam_seg_finalize, sam_segconf_finalize, sam_deep_merge, false, sam_zip_after_compress, sam_zip_generate_recon_plan, sam_stats_reallocate, sam_zip_genozip_header, sam_piz_genozip_header, sam_piz_load_sags, sam_piz_dispatch_one_load_sag_vb, sam_piz_header_init, sam_piz_initialize, sam_piz_finalize, NULL, sam_piz_init_vb, sam_piz_recon_init, sam_reset_line, sam_piz_after_recon, sam_piz_process_recon, sam_piz_after_preproc, sam_piz_xtra_line_data, sam_piz_is_skip_section, sam_reconstruct_SEQ_vs_ref,sam_piz_filter, sam_piz_container_cb, sam_piz_con_item_cb, NUM_SAM_SPECIAL, SAM_SPECIAL, NUM_SAM_TRANS, SAM_TRANSLATORS, 
"alignment", { "FIELD", "QNAME", "OPTION" } }, \ - [DT_BAM] = { "BAM", true, false, true, DT_SAM, DT_BAM, sam_vb_size, sam_vb_zip_dl_size, HDR_MUST_0, SAM_CONTIG_FMT, -1, bam_is_header_done, bam_unconsumed, sam_header_inspect, is_bam, sam_zip_initialize, sam_set_sag_type, sam_zip_after_vbs, sam_zip_finalize, sam_zip_free_end_of_z, sam_zip_init_vb, sam_zip_after_compute, sam_zip_dts_flag, sam_zip_set_vb_header_specific, NULL, bam_zip_modify, bam_seg_initialize, bam_seg_txt_line, bam_assseg_line, sam_seg_is_big, sam_seg_is_small, sam_seg_finalize, sam_segconf_finalize, sam_deep_merge, true, sam_zip_after_compress, sam_zip_generate_recon_plan, sam_stats_reallocate, sam_zip_genozip_header, sam_piz_genozip_header, sam_piz_load_sags, sam_piz_dispatch_one_load_sag_vb, sam_piz_header_init, sam_piz_initialize, sam_piz_finalize, NULL, sam_piz_init_vb, sam_piz_recon_init, sam_reset_line, sam_piz_after_recon, sam_piz_process_recon, sam_piz_after_preproc, sam_piz_xtra_line_data, NULL, NULL, sam_piz_filter, sam_piz_container_cb, 0/*cb only in SAM*/, NUM_SAM_SPECIAL, SAM_SPECIAL, NUM_SAM_TRANS, SAM_TRANSLATORS, "alignment", { "FIELD", "DESC", "OPTION" } }, \ - [DT_CRAM] = { "CRAM", true, true, true, DT_SAM, DT_CRAM, sam_vb_size, sam_vb_zip_dl_size, HDR_MUST_0, SAM_CONTIG_FMT, -1, bam_is_header_done, bam_unconsumed, sam_header_inspect, is_cram, sam_zip_initialize, sam_set_sag_type, sam_zip_after_vbs, sam_zip_finalize, sam_zip_free_end_of_z, sam_zip_init_vb, sam_zip_after_compute, sam_zip_dts_flag, sam_zip_set_vb_header_specific, NULL, NULL, bam_seg_initialize, bam_seg_txt_line, bam_assseg_line, sam_seg_is_big, sam_seg_is_small, sam_seg_finalize, sam_segconf_finalize, sam_deep_merge, true, sam_zip_after_compress, sam_zip_generate_recon_plan, sam_stats_reallocate, sam_zip_genozip_header, sam_piz_genozip_header, sam_piz_load_sags, sam_piz_dispatch_one_load_sag_vb, sam_piz_header_init, sam_piz_initialize, sam_piz_finalize, NULL, sam_piz_init_vb, sam_piz_recon_init, sam_reset_line, 
sam_piz_after_recon, sam_piz_process_recon, sam_piz_after_preproc, sam_piz_xtra_line_data, NULL, NULL, sam_piz_filter, sam_piz_container_cb, 0/*cb only in SAM*/, NUM_SAM_SPECIAL, SAM_SPECIAL, NUM_SAM_TRANS, SAM_TRANSLATORS, "alignment", { "FIELD", "DESC", "OPTION" } }, \ - [DT_FASTQ] = { "FASTQ", false, true, true, DT_FASTQ, DT_NONE, fastq_vb_size, fastq_vb_zip_dl_size, HDR_NONE, NULL, -1, NULL, fastq_unconsumed, NULL, is_fastq, fastq_zip_initialize, NULL, NULL, fastq_zip_finalize, NULL, fastq_zip_init_vb, fastq_zip_after_compute, NULL, NULL, fastq_zip_set_txt_header_flags, fastq_zip_modify, fastq_seg_initialize, fastq_seg_txt_line, fastq_assseg_line, NULL, fastq_seg_is_small, fastq_seg_finalize, fastq_segconf_finalize, NULL, false, NULL, NULL, NULL, fastq_zip_genozip_header, fastq_piz_genozip_header, NULL, NULL, fastq_piz_header_init, fastq_piz_initialize, NULL, fastq_piz_before_read, fastq_piz_init_vb, NULL, fastq_reset_line, NULL, fastq_piz_process_recon, NULL, NULL, fastq_piz_is_skip_section, fastq_recon_aligned_SEQ, fastq_piz_filter, fastq_piz_container_cb, 0, NUM_FASTQ_SPECIAL, FASTQ_SPECIAL, 0, {}, "read", { "FIELD", "QNAME", "AUX" } }, \ - [DT_FASTA] = { "FASTA", false, false, false, DT_FASTA, DT_NONE, fasta_vb_size, fasta_vb_zip_dl_size, HDR_NONE, NULL, -1, NULL, fasta_unconsumed, NULL, is_fasta, fasta_zip_initialize, NULL, NULL, NULL, NULL, NULL, fasta_zip_after_compute, NULL, fasta_zip_set_vb_header_specific, NULL, NULL, fasta_seg_initialize, fasta_seg_txt_line, NULL, fasta_seg_is_big, fasta_seg_is_small, fasta_seg_finalize, fasta_segconf_finalize, NULL, false, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, fasta_piz_initialize, NULL, NULL, fasta_piz_init_vb, NULL, NULL, NULL, NULL, NULL, NULL, fasta_piz_is_skip_section, NULL, fasta_piz_filter, NULL, 0, NUM_FASTA_SPECIAL, FASTA_SPECIAL, 0, {}, "line", { "FIELD", "DESC", "ERROR!" 
} }, \ - [DT_REF] = { "REFERENCE", false, false, false, DT_REF, DT_NONE, fasta_vb_size, fasta_vb_zip_dl_size, HDR_NONE, NULL, -1, NULL, fasta_unconsumed, NULL, is_ref, ref_make_ref_init, NULL, NULL, ref_make_finalize, NULL, NULL, ref_make_after_compute, NULL, NULL, NULL, NULL, ref_make_seg_initialize,fasta_seg_txt_line, NULL, NULL, fasta_seg_is_small, fasta_seg_finalize, fasta_segconf_finalize, NULL, false, ref_make_create_range, NULL, NULL, ref_make_genozip_header, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0, 0, 0, {}, 0, {}, "line", { "FIELD", "DESC", "ERROR!" } }, \ - [DT_GFF] = { "GFF", false, true, false, DT_GFF, DT_NONE, gff_vb_size, 0, HDR_OK, NULL, '#', NULL, gff_unconsumed, gff_header_inspect, is_gff, gff_zip_initialize, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, gff_seg_initialize, gff_seg_txt_line, NULL, gff_seg_is_big, gff_seg_is_small, gff_seg_finalize, gff_segconf_finalize, NULL, false, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, gff_piz_init_vb, NULL, gff_reset_line, NULL, NULL, NULL, NULL, NULL, NULL, gff_piz_filter, gff_piz_container_cb, 0, NUM_GFF_SPECIAL, GFF_SPECIAL, 0, {}, "sequence", { "FIELD", "ATTRS", "ENST" } }, \ - [DT_ME23] = { "23ANDME", false, true, false, DT_ME23, DT_NONE, 0, 0, HDR_MUST, NULL, '#', NULL, NULL, me23_header_inspect, is_me23, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, me23_seg_initialize, me23_seg_txt_line, NULL, NULL, me23_seg_is_small, me23_seg_finalize, NULL, NULL, false, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0, 0, {}, NUM_ME23_TRANS, ME23_TRANSLATORS, "SNP", { "FIELD", "ERROR!", "ERROR!" 
} }, \ - [DT_GNRIC] = { "GENERIC", true, false, false, DT_NONE, DT_GNRIC, 0, 0, HDR_OK, NULL, -1, generic_is_header_done, generic_unconsumed, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, generic_seg_initialize, generic_seg_txt_line, generic_assseg_line, NULL, generic_seg_is_small, generic_seg_finalize, NULL, NULL, false, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0, NUM_GENERIC_SPECIAL, GENERIC_SPECIAL, 0, {}, "N/A", { "FIELD", "ERROR!", "ERROR!" } }, \ - [DT_LOCS] = { "LOCS", true, false, false, DT_NONE, DT_LOCS, 0, 0, HDR_MUST, NULL, -1, locs_is_header_done, locs_unconsumed, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, locs_seg_initialize, locs_seg_txt_line, NULL, NULL, locs_seg_is_small, locs_seg_finalize, NULL, NULL, false, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0, NUM_LOCS_SPECIAL, LOCS_SPECIAL, NUM_LOCS_TRANS, LOCS_TRANSLATORS, "cluster" , { "FIELD", "ERROR!", "ERROR!" } }, \ - [DT_BED] = { "BED", false, true, false, DT_BED, DT_NONE, 0, 0, HDR_OK, NULL, -1, bed_is_header_done, NULL, NULL, is_bed, bed_zip_initialize, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, bed_seg_initialize, bed_seg_txt_line, NULL, NULL, bed_seg_is_small, bed_seg_finalize, bed_segconf_finalize, NULL, false, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0, NUM_BED_SPECIAL, BED_SPECIAL, 0, {}, "line", { "FIELD", "ERROR!", "ERROR!" 
} }, \ +/* name is_bin \n-end use_ref txt_type bin_type sizeof_vb sizeof_zip_dataline txt_headr hdr_contigs 1st is_header_done unconsumed inspect_txt_header is_data_type zip_initialize zip_after_segconf zip_after_vbs, zip_finalize zip_free_end_of_z zip_init_vb zip_after_compute zip_dts_flag zip_set_vb_header_specific zip_set_txt_header_flags zip_modify seg_initialize seg_txt_line assseg_line seg_is_big seg_is_small seg_finalize segconf_finalize zip_custom_merge seg_modifies zip_after_compress generate_recon_plan stats_reallocate zip_genozip_header piz_genozip_header piz_after_global_area piz_preprocess piz_header_init piz_initialize piz_finalize piz_before_read piz_init_vb piz_recon_init piz_init_line piz_after_recon piz_process_recon piz_after_preproc piz_xtra_line_data is_skip_section reconstruct_seq container_filter container_cb con_item_cb num_special special num_trans translators line_name dtype_names */ \ + [DT_VCF] = { "VCF", false, true, true, DT_VCF, DT_NONE, vcf_vb_size, vcf_vb_zip_dl_size, HDR_MUST, VCF_CONTIG_FMT, '#', vcf_is_header_done, NULL, vcf_inspect_txt_header, is_vcf, vcf_zip_initialize, NULL, vcf_zip_after_vbs, vcf_zip_finalize, vcf_header_finalize, vcf_zip_init_vb, vcf_zip_after_compute, NULL, vcf_zip_set_vb_header_specific, vcf_zip_set_txt_header_flags, vcf_zip_modify, vcf_seg_initialize, vcf_seg_txt_line, NULL, vcf_seg_is_big, vcf_seg_is_small, vcf_seg_finalize, vcf_segconf_finalize, NULL, false, vcf_zip_after_compress, NULL, NULL, vcf_zip_genozip_header, vcf_piz_genozip_header, NULL, NULL, vcf_piz_header_init, NULL, vcf_piz_finalize, NULL, vcf_piz_init_vb, vcf_piz_recon_init, vcf_reset_line, NULL, NULL, NULL, NULL, vcf_piz_is_skip_section, NULL, vcf_piz_filter, vcf_piz_container_cb, vcf_piz_con_item_cb, NUM_VCF_SPECIAL, VCF_SPECIAL, 0, {}, "variant", { "FIELD", "INFO", "FORMAT", "BOTH" } }, \ + [DT_BCF] = { "BCF", true, true, true, DT_VCF, DT_BCF, vcf_vb_size, vcf_vb_zip_dl_size, HDR_MUST, VCF_CONTIG_FMT, '#', NULL, NULL, 
vcf_inspect_txt_header, NULL, vcf_zip_initialize, NULL, vcf_zip_after_vbs, vcf_zip_finalize, vcf_header_finalize, vcf_zip_init_vb, vcf_zip_after_compute, NULL, vcf_zip_set_vb_header_specific, vcf_zip_set_txt_header_flags, vcf_zip_modify, vcf_seg_initialize, vcf_seg_txt_line, NULL, vcf_seg_is_big, vcf_seg_is_small, vcf_seg_finalize, vcf_segconf_finalize, NULL, false, vcf_zip_after_compress, NULL, NULL, vcf_zip_genozip_header, vcf_piz_genozip_header, NULL, NULL, vcf_piz_header_init, NULL, vcf_piz_finalize, NULL, vcf_piz_init_vb, vcf_piz_recon_init, vcf_reset_line, NULL, NULL, NULL, NULL, vcf_piz_is_skip_section, NULL, vcf_piz_filter, vcf_piz_container_cb, vcf_piz_con_item_cb, NUM_VCF_SPECIAL, VCF_SPECIAL, 0, {}, "variant", { "FIELD", "INFO", "FORMAT", "BOTH" } }, \ + [DT_SAM] = { "SAM", false, true, true, DT_SAM, DT_BAM, sam_vb_size, sam_vb_zip_dl_size, HDR_OK_0, SAM_CONTIG_FMT, '@', NULL, NULL, sam_header_inspect, is_sam, sam_zip_initialize, sam_set_sag_type, sam_zip_after_vbs, sam_zip_finalize, sam_zip_free_end_of_z, sam_zip_init_vb, sam_zip_after_compute, sam_zip_dts_flag, sam_zip_set_vb_header_specific, NULL, sam_zip_modify, sam_seg_initialize, sam_seg_txt_line, NULL, sam_seg_is_big, sam_seg_is_small, sam_seg_finalize, sam_segconf_finalize, sam_deep_merge, false, sam_zip_after_compress, sam_zip_generate_recon_plan, sam_stats_reallocate, sam_zip_genozip_header, sam_piz_genozip_header, sam_piz_load_sags, sam_piz_dispatch_one_load_sag_vb, sam_piz_header_init, sam_piz_initialize, sam_piz_finalize, NULL, sam_piz_init_vb, sam_piz_recon_init, sam_reset_line, sam_piz_after_recon, sam_piz_process_recon, sam_piz_after_preproc, sam_piz_xtra_line_data, sam_piz_is_skip_section, sam_reconstruct_SEQ_vs_ref,sam_piz_filter, sam_piz_container_cb, sam_piz_con_item_cb, NUM_SAM_SPECIAL, SAM_SPECIAL, NUM_SAM_TRANS, SAM_TRANSLATORS, "alignment", { "FIELD", "QNAME", "OPTION" } }, \ + [DT_BAM] = { "BAM", true, false, true, DT_SAM, DT_BAM, sam_vb_size, sam_vb_zip_dl_size, HDR_MUST_0, 
SAM_CONTIG_FMT, -1, bam_is_header_done, bam_unconsumed, sam_header_inspect, is_bam, sam_zip_initialize, sam_set_sag_type, sam_zip_after_vbs, sam_zip_finalize, sam_zip_free_end_of_z, sam_zip_init_vb, sam_zip_after_compute, sam_zip_dts_flag, sam_zip_set_vb_header_specific, NULL, bam_zip_modify, bam_seg_initialize, bam_seg_txt_line, bam_assseg_line, sam_seg_is_big, sam_seg_is_small, sam_seg_finalize, sam_segconf_finalize, sam_deep_merge, true, sam_zip_after_compress, sam_zip_generate_recon_plan, sam_stats_reallocate, sam_zip_genozip_header, sam_piz_genozip_header, sam_piz_load_sags, sam_piz_dispatch_one_load_sag_vb, sam_piz_header_init, sam_piz_initialize, sam_piz_finalize, NULL, sam_piz_init_vb, sam_piz_recon_init, sam_reset_line, sam_piz_after_recon, sam_piz_process_recon, sam_piz_after_preproc, sam_piz_xtra_line_data, NULL, NULL, sam_piz_filter, sam_piz_container_cb, 0/*cb only in SAM*/, NUM_SAM_SPECIAL, SAM_SPECIAL, NUM_SAM_TRANS, SAM_TRANSLATORS, "alignment", { "FIELD", "DESC", "OPTION" } }, \ + [DT_CRAM] = { "CRAM", true, true, true, DT_SAM, DT_CRAM, sam_vb_size, sam_vb_zip_dl_size, HDR_MUST_0, SAM_CONTIG_FMT, -1, bam_is_header_done, bam_unconsumed, sam_header_inspect, is_cram, sam_zip_initialize, sam_set_sag_type, sam_zip_after_vbs, sam_zip_finalize, sam_zip_free_end_of_z, sam_zip_init_vb, sam_zip_after_compute, sam_zip_dts_flag, sam_zip_set_vb_header_specific, NULL, NULL, bam_seg_initialize, bam_seg_txt_line, bam_assseg_line, sam_seg_is_big, sam_seg_is_small, sam_seg_finalize, sam_segconf_finalize, sam_deep_merge, true, sam_zip_after_compress, sam_zip_generate_recon_plan, sam_stats_reallocate, sam_zip_genozip_header, sam_piz_genozip_header, sam_piz_load_sags, sam_piz_dispatch_one_load_sag_vb, sam_piz_header_init, sam_piz_initialize, sam_piz_finalize, NULL, sam_piz_init_vb, sam_piz_recon_init, sam_reset_line, sam_piz_after_recon, sam_piz_process_recon, sam_piz_after_preproc, sam_piz_xtra_line_data, NULL, NULL, sam_piz_filter, sam_piz_container_cb, 0/*cb only in 
SAM*/, NUM_SAM_SPECIAL, SAM_SPECIAL, NUM_SAM_TRANS, SAM_TRANSLATORS, "alignment", { "FIELD", "DESC", "OPTION" } }, \ + [DT_FASTQ] = { "FASTQ", false, true, true, DT_FASTQ, DT_NONE, fastq_vb_size, fastq_vb_zip_dl_size, HDR_NONE, NULL, -1, NULL, fastq_unconsumed, NULL, is_fastq, fastq_zip_initialize, fastq_zip_after_segconf, NULL, fastq_zip_finalize, NULL, fastq_zip_init_vb, fastq_zip_after_compute, NULL, NULL, fastq_zip_set_txt_header_flags, fastq_zip_modify, fastq_seg_initialize, fastq_seg_txt_line, fastq_assseg_line, NULL, fastq_seg_is_small, fastq_seg_finalize, fastq_segconf_finalize, NULL, false, NULL, NULL, NULL, fastq_zip_genozip_header, fastq_piz_genozip_header, NULL, NULL, fastq_piz_header_init, fastq_piz_initialize, NULL, fastq_piz_before_read, fastq_piz_init_vb, NULL, fastq_reset_line, NULL, fastq_piz_process_recon, NULL, NULL, fastq_piz_is_skip_section, fastq_recon_aligned_SEQ, fastq_piz_filter, fastq_piz_container_cb, 0, NUM_FASTQ_SPECIAL, FASTQ_SPECIAL, 0, {}, "read", { "FIELD", "QNAME", "AUX" } }, \ + [DT_FASTA] = { "FASTA", false, false, false, DT_FASTA, DT_NONE, fasta_vb_size, fasta_vb_zip_dl_size, HDR_NONE, NULL, -1, NULL, fasta_unconsumed, NULL, is_fasta, fasta_zip_initialize, NULL, NULL, NULL, NULL, NULL, fasta_zip_after_compute, NULL, fasta_zip_set_vb_header_specific, NULL, NULL, fasta_seg_initialize, fasta_seg_txt_line, NULL, fasta_seg_is_big, fasta_seg_is_small, fasta_seg_finalize, fasta_segconf_finalize, NULL, false, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, fasta_piz_initialize, NULL, NULL, fasta_piz_init_vb, NULL, NULL, NULL, NULL, NULL, NULL, fasta_piz_is_skip_section, NULL, fasta_piz_filter, NULL, 0, NUM_FASTA_SPECIAL, FASTA_SPECIAL, 0, {}, "line", { "FIELD", "DESC", "ERROR!" 
} }, \ + [DT_REF] = { "REFERENCE", false, false, false, DT_REF, DT_NONE, fasta_vb_size, fasta_vb_zip_dl_size, HDR_NONE, NULL, -1, NULL, fasta_unconsumed, NULL, is_ref, ref_make_ref_init, NULL, NULL, ref_make_finalize, NULL, NULL, ref_make_after_compute, NULL, NULL, NULL, NULL, ref_make_seg_initialize,fasta_seg_txt_line, NULL, NULL, fasta_seg_is_small, fasta_seg_finalize, fasta_segconf_finalize, NULL, false, ref_make_create_range, NULL, NULL, ref_make_genozip_header, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0, 0, 0, {}, 0, {}, "line", { "FIELD", "DESC", "ERROR!" } }, \ + [DT_GFF] = { "GFF", false, true, false, DT_GFF, DT_NONE, gff_vb_size, 0, HDR_OK, NULL, '#', NULL, gff_unconsumed, gff_header_inspect, is_gff, gff_zip_initialize, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, gff_seg_initialize, gff_seg_txt_line, NULL, gff_seg_is_big, gff_seg_is_small, gff_seg_finalize, gff_segconf_finalize, NULL, false, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, gff_piz_init_vb, NULL, gff_reset_line, NULL, NULL, NULL, NULL, NULL, NULL, gff_piz_filter, gff_piz_container_cb, 0, NUM_GFF_SPECIAL, GFF_SPECIAL, 0, {}, "sequence", { "FIELD", "ATTRS", "ENST" } }, \ + [DT_ME23] = { "23ANDME", false, true, false, DT_ME23, DT_NONE, 0, 0, HDR_MUST, NULL, '#', NULL, NULL, me23_header_inspect, is_me23, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, me23_seg_initialize, me23_seg_txt_line, NULL, NULL, me23_seg_is_small, me23_seg_finalize, NULL, NULL, false, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0, 0, {}, NUM_ME23_TRANS, ME23_TRANSLATORS, "SNP", { "FIELD", "ERROR!", "ERROR!" 
} }, \ + [DT_GNRIC] = { "GENERIC", true, false, false, DT_NONE, DT_GNRIC, 0, 0, HDR_OK, NULL, -1, generic_is_header_done, generic_unconsumed, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, generic_seg_initialize, generic_seg_txt_line, generic_assseg_line, NULL, generic_seg_is_small, generic_seg_finalize, NULL, NULL, false, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0, NUM_GENERIC_SPECIAL, GENERIC_SPECIAL, 0, {}, "N/A", { "FIELD", "ERROR!", "ERROR!" } }, \ + [DT_LOCS] = { "LOCS", true, false, false, DT_NONE, DT_LOCS, 0, 0, HDR_MUST, NULL, -1, locs_is_header_done, locs_unconsumed, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, locs_seg_initialize, locs_seg_txt_line, NULL, NULL, locs_seg_is_small, locs_seg_finalize, NULL, NULL, false, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0, NUM_LOCS_SPECIAL, LOCS_SPECIAL, NUM_LOCS_TRANS, LOCS_TRANSLATORS, "cluster" , { "FIELD", "ERROR!", "ERROR!" } }, \ + [DT_BED] = { "BED", false, true, false, DT_BED, DT_NONE, 0, 0, HDR_OK, NULL, -1, bed_is_header_done, NULL, NULL, is_bed, bed_zip_initialize, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, bed_seg_initialize, bed_seg_txt_line, NULL, NULL, bed_seg_is_small, bed_seg_finalize, bed_segconf_finalize, NULL, false, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0, NUM_BED_SPECIAL, BED_SPECIAL, 0, {}, "line", { "FIELD", "ERROR!", "ERROR!" 
} }, \ } #define DATA_TYPE_FUNCTIONS_DEFAULT /* only applicable to (some) functions */ \ - { "DEFAULT", false, false, false, DT_NONE, DT_NONE, def_vb_size, 0, 0, 0, 0, def_is_header_done, def_unconsumed, 0, NULL, 0, NULL, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL, 0, 0, textual_assseg_line, NULL, 0, 0, NULL, NULL, false, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, 0, NULL, 0, 0, 0, NULL, 0, NULL, NULL, NULL, 0, 0, default_piz_filter, 0, 0, 0, {}, 0, {}, "", { "FIELD", "DTYPE1", "DTYPE2" } } + { "DEFAULT", false, false, false, DT_NONE, DT_NONE, def_vb_size, 0, 0, 0, 0, def_is_header_done, def_unconsumed, 0, NULL, 0, NULL, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL, 0, 0, textual_assseg_line, NULL, 0, 0, NULL, NULL, false, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, 0, NULL, 0, 0, 0, NULL, 0, NULL, NULL, NULL, 0, 0, default_piz_filter, 0, 0, 0, {}, 0, {}, "", { "FIELD", "DTYPE1", "DTYPE2" } } extern DataTypeProperties dt_props[NUM_DATATYPES], dt_props_def; @@ -258,7 +258,7 @@ typedef struct DtTranslation { extern void dt_initialize (void); extern const DtTranslation dt_get_translation (VBlockP vb); -extern DataType dt_get_dt_for_genozip_header (DataType dt, Codec source_codec); +extern DataType dt_get_dt_for_genozip_header (DataType dt, Codec src_codec); extern rom dt_name (DataType data_type); extern rom dt_name_faf (DataType data_type); extern rom z_dt_name (void); diff --git a/src/dict_io.c b/src/dict_io.c index ac21b438..09f6de82 100644 --- a/src/dict_io.c +++ b/src/dict_io.c @@ -390,7 +390,8 @@ StrTextMegaLong str_snip_ex (STRp(snip), bool add_quote) { StrTextMegaLong s; int s_len=0; - + DataType dt = txt_file ? txt_file->data_type : z_file->data_type; + char op = (snip_len && snip[0] > 0 && snip[0] < 32) ? 
snip[0] : 0; int i=1; @@ -416,8 +417,8 @@ StrTextMegaLong str_snip_ex (STRp(snip), bool add_quote) case v13_SNIP_COPY_BUDDY : SNPRINTF0(s, "[BCOPY]"); break; case SNIP_DIFF : SNPRINTF0(s, "[DIFF]"); break; case SNIP_NUMERIC : SNPRINTF0(s, "[NUMERIC]"); break; - case SNIP_SPECIAL : if (z_file && special_names[z_file->data_type][snip[1]-32]) - SNPRINTF (s, "[%s_SPECIAL_%s]", dt_name(z_file->data_type), special_names[z_file->data_type][snip[1]-32]); + case SNIP_SPECIAL : if (z_file && special_names[dt][snip[1]-32]) + SNPRINTF (s, "[%s_SPECIAL_%s]", dt_name(dt), special_names[dt][snip[1]-32]); else SNPRINTF (s, "[SPECIAL-%u]", snip[1]-32); i++; @@ -425,7 +426,7 @@ StrTextMegaLong str_snip_ex (STRp(snip), bool add_quote) default : SNPRINTF (s, "\\x%x", (uint8_t)op); } - #define X(dt,sp) (op == SNIP_SPECIAL && z_file->data_type==DT_##dt && (snip[1] == dt##_SPECIAL_##sp)) + #define X(dtype,sp) (op == SNIP_SPECIAL && dt==DT_##dtype && (snip[1] == dtype##_SPECIAL_##sp)) #define X_SAM(sp) (op == SNIP_SPECIAL && (Z_DT(SAM) || Z_DT(BAM)) && (snip[1] == SAM_SPECIAL_##sp)) if (op == SNIP_OTHER_LOOKUP || op == SNIP_OTHER_DELTA || op == SNIP_COPY || op == SNIP_REDIRECTION || X(VCF,LEN_OF) || X(VCF,ARRAY_LEN_OF) || X(VCF,COPY_MATE) || (X(VCF,GQ) && snip_len > 8) || diff --git a/src/digest.c b/src/digest.c index b25456d3..822be3da 100644 --- a/src/digest.c +++ b/src/digest.c @@ -203,8 +203,8 @@ static void digest_piz_verify_one_vb (VBlockP vb) NOISYWARN ("reconstructed vblock=%s/%u (vb_line_i=0 -> txt_line_i(1-based)=%"PRId64" num_lines=%u), (%s=%s) differs from the %s file (%s=%s).\n%s", comp_name (vb->comp_i), vb->vblock_i, writer_get_txt_line_i (vb, 0), vb->lines.len32, - segconf.zip_txt_modified ? "modified" : "original", DIGEST_NAME, digest_display (piz_digest).s, + segconf.zip_txt_modified ? 
"modified" : "original", DIGEST_NAME, digest_display (vb->expected_digest).s, recon_size_warn); @@ -212,9 +212,11 @@ static void digest_piz_verify_one_vb (VBlockP vb) if (!__atomic_test_and_set (&txt_file->vb_digest_failed, __ATOMIC_RELAXED)) { // not WARN_ONCE because we might be genounzipping multiple files - we want to show this for every failed file (see also note in digest_piz_verify_one_txt_file) NOISYWARN ("Bad reconstructed vblock has been dumped to: %s.gz\n" "To see the same data in the original file:\n" - "genozip --biopsy %u -B%u %s%s", // note: segconf.vb_size is only available since v14. For older files, look it up with genocat --stats. - txtfile_dump_vb (vb, z_name).s, vb->vblock_i, (unsigned)(segconf.vb_size >> 20), - (txt_file && txt_file->name) ? filename_guess_original (txt_file) : IS_PIZ ? txtheader_get_txt_filename_from_section().s : "(uncalculable)", + "genozip --biopsy %u%s %s%s", // note: segconf.vb_size is only available since v14. For older files, look it up with genocat --stats. + txtfile_dump_vb (vb, z_name).s, vb->vblock_i, + cond_int (segconf.vb_size/*0 if IS_VB_SIZE_BY_MGZIP*/, " -B", (unsigned)(segconf.vb_size >> 20)), + + (txt_file && txt_file->name) ? filename_guess_original (txt_file) : IS_PIZ ? 
txtheader_get_txt_filename_from_section(vb->comp_i).s : "(uncalculable)", SUPPORT); if (flag.test) exit_on_error (false); // must be inside the atomic test, otherwise another thread will exit before we completed dumping diff --git a/src/dispatcher.c b/src/dispatcher.c index 5e54fbee..d4587d29 100644 --- a/src/dispatcher.c +++ b/src/dispatcher.c @@ -14,6 +14,7 @@ #include "segconf.h" #include "arch.h" #include "zip.h" +#include "txtheader.h" #define RR(x) ((x) % d->max_threads) @@ -39,7 +40,7 @@ typedef struct DispatcherData { uint32_t next_vb_i; uint32_t max_threads; enum { PROGRESS_PERCENT, PROGRESS_MESSAGE, PROGRESS_NONE } progress_type; - rom filename; + StrTextLong filename; uint64_t progress; // progress towards target_progress uint64_t target_progress; // progress reaches this, it is at 100% @@ -109,7 +110,7 @@ Dispatcher dispatcher_init (rom task_name, main_dispatcher = d; if (filename) - d->filename = filename; + strncpy (d->filename.s, filename, sizeof (StrTextLong)-1); ASSERT (max_threads <= global_max_threads, "expecting max_threads=%u <= global_max_threads=%u", max_threads, global_max_threads); @@ -142,15 +143,15 @@ void dispatcher_pause (Dispatcher d) } // PIZ: reinit dispatcher, used when splitting a genozip file to its components, using a single dispatcher object -void dispatcher_resume (Dispatcher d, uint32_t target_progress) +void dispatcher_resume (Dispatcher d, uint32_t target_progress, CompIType comp_i) { d->input_exhausted = false; - d->filename = txt_file->basename; + d->filename = txtheader_get_txt_filename_from_section (comp_i); d->progress = 0; d->target_progress = target_progress; if (d->paused) - progress_new_component (d->filename, "0\%", flag.test, start_time_initialized ? &start_time : NULL); + progress_new_component (d->filename.s, "0\%", flag.test, start_time_initialized ? 
&start_time : NULL); d->paused = false; } diff --git a/src/dispatcher.h b/src/dispatcher.h index 702fa7c7..17547435 100644 --- a/src/dispatcher.h +++ b/src/dispatcher.h @@ -21,7 +21,7 @@ extern Dispatcher dispatcher_init (rom task_name, rom preproc_task_name, VBlockP extern void dispatcher_start_wallclock (void); extern void dispatcher_allow_out_of_order (Dispatcher dispatcher); extern void dispatcher_pause (Dispatcher dispatcher); -extern void dispatcher_resume (Dispatcher dispatcher, uint32_t target_progress); +extern void dispatcher_resume (Dispatcher dispatcher, uint32_t target_progress, CompIType comp_i); extern void dispatcher_finish (Dispatcher *dispatcher, unsigned *last_vb_i, bool cleanup_after_me, bool show_memory); typedef void (*DispatcherFunc)(VBlockP); diff --git a/src/distribution.c b/src/distribution.c index 2534e5d0..9f9a6acb 100644 --- a/src/distribution.c +++ b/src/distribution.c @@ -7,8 +7,9 @@ // and subject to penalties specified in the license. #include "genozip.h" +#include "version.h" rom get_distribution (void) { - return DISTRIBUTION; + return version_is_devel() ? 
"devel" : DISTRIBUTION; } diff --git a/src/endianness.h b/src/endianness.h index eca82196..6a2a3c4c 100644 --- a/src/endianness.h +++ b/src/endianness.h @@ -144,3 +144,41 @@ #define BGEN32F(x) ({ union { float f; uint32_t i; } u = {.f = (x)}; u.i = BGEN32(u.i); u.f; }) #define LTEN64F(x) ({ union { double f; uint64_t i; } u = {.f = (x)}; u.i = LTEN64(u.i); u.f; }) #define BGEN64F(x) ({ union { double f; uint64_t i; } u = {.f = (x)}; u.i = BGEN64(u.i); u.f; }) + +// getting and putting unaligned Little Endian words +#ifdef GENOZIP_ALLOW_UNALIGNED_ACCESS + #define GET_UINT16(p) LTEN16 (*((uint16_t *)(p))) + #define GET_UINT32(p) LTEN32 (*((uint32_t *)(p))) + #define GET_UINT64(p) LTEN64 (*((uint64_t *)(p))) + #define GET_FLOAT32(p) LTEN32F(*((float *)(p))) + + #define GET_UINT32_(st_p, member) ((st_p)->member) + + #define PUT_UINT16(p,n) *((uint16_t *)(p)) = LTEN16(n) + #define PUT_UINT32(p,n) *((uint32_t *)(p)) = LTEN32(n) + + #define PUT_UINT16_(st_p, member, n) (st_p)->member = LTEN16(n) + #define PUT_UINT32_(st_p, member, n) (st_p)->member = LTEN32(n) +#else + // loading a Little Endian uint32_t from an unaligned memory location + #define GET_UINT16(p) ((uint16_t)((uint8_t*)(p))[0] | ((uint16_t)((uint8_t*)(p))[1] << 8)) + #define GET_UINT32(p) ((uint32_t)((uint8_t*)(p))[0] | ((uint32_t)((uint8_t*)(p))[1] << 8) | ((uint32_t)((uint8_t*)(p))[2] << 16) | ((uint32_t)((uint8_t*)(p))[3] << 24)) + #define GET_UINT64(p) ((uint64_t)((uint8_t*)(p))[0] | ((uint64_t)((uint8_t*)(p))[1] << 8) | ((uint64_t)((uint8_t*)(p))[2] << 16) | ((uint64_t)((uint8_t*)(p))[3] << 24) | ((uint64_t)((uint8_t*)(p))[4] << 32) | ((uint64_t)((uint8_t*)(p))[5] << 40) | ((uint64_t)((uint8_t*)(p))[6] << 48) | ((uint64_t)((uint8_t*)(p))[7] << 56))) + #define GET_FLOAT32(p) ({ union { uint32_t i; float f; } n= {.i = GET_UINT32(p)}; n.f; }) + + #define GET_UINT32_(st_p, member) ({ typeof(*st_p) dummy; bytes _P=(bytes)(st_p) + ((bytes)&dummy.member - (bytes)&dummy); ((uint32_t)_P[0] | ((uint32_t)_P[1] << 
8) | ((uint32_t)_P[2] << 16) | ((uint32_t)_P[3] << 24)); }) + + // storing a Little Endian integer in an unaligned memory location + #define PUT_UINT16(p,n) ({ uint16_t _N=(n); uint8_t *_P=(uint8_t *)(p); _P[0]=_N; _P[1]=_N>>8; }) + #define PUT_UINT32(p,n) ({ uint32_t _N=(n); uint8_t *_P=(uint8_t *)(p); _P[0]=_N; _P[1]=_N>>8; _P[2]=_N>>16; _P[3]=_N>>24; }) + + // storing as a struct member + #define PUT_UINT16_(st_p, member, n) ({ typeof(*st_p) dummy; PUT_UINT16 ((rom)(st_p) + ((rom)&dummy.member - (rom)&dummy), LTEN16(n)); }) + #define PUT_UINT32_(st_p, member, n) ({ typeof(*st_p) dummy; PUT_UINT32 ((rom)(st_p) + ((rom)&dummy.member - (rom)&dummy), LTEN32(n)); }) +#endif + +#define GET_UINT8(p) ((uint8_t)(((uint8_t*)(p))[0])) +#define GET_UINT24(p) ((uint32_t)(((uint8_t*)(p))[0] | (((uint8_t*)(p))[1] << 8))| (((uint8_t*)(p))[2] << 16)) + +#define PUT_UINT8(p,n) ({ ((uint8_t*)(p))[0] = (n); }) +#define PUT_UINT24(p,n) ({ uint32_t _N=(n); uint8_t *_P=(uint8_t *)(p); _P[0]=_N; _P[1]=_N>>8; _P[2]=_N>>16; }) diff --git a/src/fasta.c b/src/fasta.c index 6049f002..ee5b48e8 100644 --- a/src/fasta.c +++ b/src/fasta.c @@ -144,11 +144,12 @@ static inline int fasta_is_end_of_contig (VBlockP vb, uint32_t first_i, } // returns the length of the data at the end of vb->txt_data that will not be consumed by this VB is to be passed to the next VB -int32_t fasta_unconsumed (VBlockP vb, uint32_t first_i, int32_t *last_i) +int32_t fasta_unconsumed (VBlockP vb, uint32_t first_i) { - bool is_entire_vb = (first_i == 0 && *last_i == Ltxt-1); + ASSERTNOTZERO (Ltxt); - ASSERT (*last_i >= 0 && *last_i < Ltxt, "*last_i=%d is ∉ [0,%u]", *last_i, Ltxt); + int32_t last_i = Ltxt-1; + bool is_entire_vb = (first_i == 0); ARRAY (char, txt, vb->txt_data); @@ -170,7 +171,7 @@ int32_t fasta_unconsumed (VBlockP vb, uint32_t first_i, int32_t *last_i) } // we move the final partial line to the next vb (unless we are already moving more, due to a --make-reference) - for (int32_t i=*last_i; i >= 
(int32_t)first_i; i--) { + for (int32_t i=last_i; i >= (int32_t)first_i; i--) { if (txt[i] == '\n') { @@ -195,7 +196,7 @@ int32_t fasta_unconsumed (VBlockP vb, uint32_t first_i, int32_t *last_i) return 0; } - *last_i = i; + last_i = i; return Ltxt-1 - i; } } diff --git a/src/fasta.h b/src/fasta.h index 9dea5a07..ac0ee752 100644 --- a/src/fasta.h +++ b/src/fasta.h @@ -51,7 +51,7 @@ // ----------------------------------------------------------------------------------------------------------- // Txtfile stuff -extern int32_t fasta_unconsumed (VBlockP vb, uint32_t first_i, int32_t *i); +extern int32_t fasta_unconsumed (VBlockP vb, uint32_t first_i); extern bool is_fasta (STRp(header), bool *need_more); // ZIP Stuff diff --git a/src/fastq.c b/src/fastq.c index 47ffad28..c09edfbe 100644 --- a/src/fastq.c +++ b/src/fastq.c @@ -23,6 +23,7 @@ #include "zfile.h" #include "zriter.h" #include "qname_filter.h" +#include "mgzip.h" #define dict_id_is_fastq_qname_sf dict_id_is_type_1 #define dict_id_is_fastq_aux dict_id_is_type_2 @@ -59,10 +60,10 @@ bool is_fastq (STRp(header), bool *need_more) #define NUM_TEST_READS 3 str_split_by_lines (header, header_len, 4 * NUM_TEST_READS); - n_lines = (n_lines / 4) * 4; // round to whole reads + n_lines = ROUNDDOWN4 (n_lines); // round to whole reads if (!n_lines) { - *need_more = true; // we can't tell yet - need more data + if (need_more) *need_more = true; // we can't tell yet - need more data return false; } @@ -72,9 +73,20 @@ bool is_fastq (STRp(header), bool *need_more) return true; } -bool is_fastq_pair_2 (VBlockP vb) -{ - return VB_DT(FASTQ) && VB_FASTQ->pair_vb_i > 0; +VBIType fastq_get_R1_vb_i (VBlockP vb) { return VB_FASTQ->R1_vb_i; } +uint32_t fastq_get_R1_num_lines (VBlockP vb) { return VB_FASTQ->R1_num_lines; } +rom fastq_get_R1_last_qname (VBlockP vb) { return VB_FASTQ->R1_last_qname; } +bool is_fastq_pair_2 (VBlockP vb) { return VB_DT(FASTQ) && VB_FASTQ->R1_vb_i > 0; } + +uint32_t fastq_get_R1_txt_data_len (VBlockP vb) +{ + 
if (!VB_FASTQ->R1_vb_i) return 0; // no corresponding R1 VB (note: we don't error here, so txtfile_read_vblock can verify that indeed R2 has no more data) + + ASSERT (z_file->R1_txt_data_lens.len32 > VB_FASTQ->R1_vb_i - z_file->R1_first_vb_i, + "%s: expecting z_file->R1_txt_data_lens.len=%u > R1_vb_i=%u - R1_first_vb_i=%u", + VB_NAME, z_file->R1_txt_data_lens.len32, VB_FASTQ->R1_vb_i, z_file->R1_first_vb_i); + + return *B32(z_file->R1_txt_data_lens, VB_FASTQ->R1_vb_i - z_file->R1_first_vb_i); } // "pair assisted" is a type pairing in which R1 data is loaded to ctx->localR1/b250R1 and R2 consults with it in seg/recon. @@ -105,107 +117,145 @@ bool fastq_zip_use_pair_identical (DictId dict_id) // two reads are "interleaved" if their line1 is of identical length, and differs in one character static bool fastq_zip_is_interleaved (STRp(r1), STRp(r2)) { - if (r1_len != r2_len) return false; + if (segconf.qname_flavor[QNAME1]) { + r1_len = strcspn (r1, " \t\n\r"); // note: no need to nul-terminate - we always call this function when we know there are full lines, so at least a \n is there + qname_canonize (QNAME1, r1, &r1_len); - uint32_t count_diff=0, diff_i[3]={}; + r2_len = strcspn (r2, " \t\n\r"); + qname_canonize (QNAME1, r2, &r2_len); + + return str_issame (r1, r2); + } - // we allow up to two diffs, with the same values, e.g.: - // A00311:85:HYGWAVAXX:1:1101:3025:1000/1 1:N:0:CAACGAGAGC+GAATTGAGTG - // A00311:85:HYGWAVAXX:1:1101:3025:1000/2 2:N:0:CAACGAGAGC+GAATTGAGTG - for (uint32_t i=0; i < r1_len && count_diff < 3; i++) - if (r1[i] != r2[i]) - diff_i[count_diff++] = i; + else { + if (r1_len != r2_len) return false; + + uint32_t count_diff=0, diff_i[3]={}; + + // we allow up to two diffs, with the same values, e.g.: + // A00311:85:HYGWAVAXX:1:1101:3025:1000/1 1:N:0:CAACGAGAGC+GAATTGAGTG + // A00311:85:HYGWAVAXX:1:1101:3025:1000/2 2:N:0:CAACGAGAGC+GAATTGAGTG + for (uint32_t i=0; i < r1_len && count_diff < 3; i++) + if (r1[i] != r2[i]) + diff_i[count_diff++] = i; + + 
if (count_diff <= 2 && !segconf.interleaved_r1) { // segconf: first pair of lines + segconf.interleaved_r1 = r1[diff_i[0]]; + segconf.interleaved_r2 = r2[diff_i[0]]; + } - if (count_diff <= 2 && !segconf.interleaved_r1) { // segconf: first pair of lines - segconf.interleaved_r1 = r1[diff_i[0]]; - segconf.interleaved_r2 = r2[diff_i[0]]; + return count_diff <= 2 && + (count_diff < 1 || (segconf.interleaved_r1 == r1[diff_i[0]] && segconf.interleaved_r2 == r2[diff_i[0]])) && + (count_diff < 2 || (segconf.interleaved_r1 == r1[diff_i[1]] && segconf.interleaved_r2 == r2[diff_i[1]])); } - - return count_diff <= 2 && - (count_diff < 1 || (segconf.interleaved_r1 == r1[diff_i[0]] && segconf.interleaved_r2 == r2[diff_i[0]])) && - (count_diff < 2 || (segconf.interleaved_r1 == r1[diff_i[1]] && segconf.interleaved_r2 == r2[diff_i[1]])); } +static inline bool is_last_qname (VBlockFASTQP vb, STRp(qname), STRp(pair_qname)) +{ + qname_canonize (QNAME1, qSTRa(qname)); // changes qname_len, but not qname + + if (qname_len != pair_qname_len) return false; + + // compare qnames in reverse - fail faster + for (int i=qname_len-1; i >= 0; i--) + if (qname[i] != pair_qname[i]) + return false; + + if (flag.show_bgzf) + iprintf ("R2_SYNC_QNAME vb=%-7s R1_vb_i=%u qname=\"%.*s\"\n", VB_NAME, vb->R1_vb_i, STRf(qname)); + + return true; +} // returns the length of the data at the end of vb->txt_data that will not be consumed by this VB is to be passed to the next VB -int32_t fastq_unconsumed (VBlockP vb, - uint32_t first_i, // in/out the smallest index in txt_data for which txt_data is populated (the rest might still in uncompressed BGZF blocks) - int32_t *i_out) -{ - ASSERT (*i_out >= 0 && *i_out < Ltxt, "*i=%d is ∉ [0,%u]", *i_out, Ltxt); +int32_t fastq_unconsumed (VBlockP vb_, + uint32_t first_i) // the smallest index in txt_data for which txt_data is populated (the rest might still in uncompressed MGZIP blocks) +{ + VBlockFASTQP vb = (VBlockFASTQP)vb_; + ASSERTNOTZERO (Ltxt); + + // if entire 
R2 vb->txt_data doesn't have a counterpart in R2, truncate it if we are allowed + if (IS_R2 && !vb->R1_vb_i && flag.truncate) + return -2; + + // initialize new R2 VB + if (IS_R2 && !vb->R1_last_qname) { + ASSINP (vb->R1_vb_i, NO_PAIR_FMT_PREFIX "%s doesn't have a counterpart VB in R1)%s", txt_name, VB_NAME, NO_PAIR_FMT_SUFFIX); - rom nl[17]={}; // newline pointers: nl[0] is the first from the end - uint32_t l[17]={}; // lengths of segments excluding \n and \r: l[1] is the segment that starts at nl[1]+1 until nl[0]-1 (or nl[0]-2 if there is a \r). l[0] is not used. + ASSERT (z_file->R1_last_qname_index.len32 > VB_FASTQ->R1_vb_i - z_file->R1_first_vb_i, + "%s: z_file->R1_last_qname_index is missing the last_qname of R1_vb_i=%u (R1_first_vb_i=%u len=%u)", + VB_NAME, vb->R1_vb_i, z_file->R1_first_vb_i, z_file->R1_last_qname_index.len32); + + uint64_t index = *B64 (z_file->R1_last_qname_index, vb->R1_vb_i - z_file->R1_first_vb_i); // set in fastq_zip_after_compute + vb->R1_last_qname = Bc(z_file->R1_last_qname, index); // nul-terminated + vb->R1_last_qname_len = strlen (vb->R1_last_qname); + + vb->R2_lowest_read = vb->R2_highest_read = -1; + } // search backwards a suffient number of newlines (eg. for normal FASTQ: best case: \nD\nS\nT\nQ\n ; worst case: \nD1\nS1\nT1\nQ1\nD2\nS2\nT2\nq2 (q2 is partial Q2)) int n=0; int height = (FAF ? 2 : 4); // number of lines per read - int min_lines = height * (segconf.is_interleaved ? 2 : 1); // minimum lines needed for testing - int max_lines = min_lines * 2; // maximum lines needed for testing + int min_lines = height + (segconf.is_interleaved ? height : 0); // minimum lines needed for testing + int max_lines = min_lines + height + (segconf.is_interleaved ? 
height : 0); // maximum lines needed for testing (added lines in case of final partial read/interleaved-double-read that needs to be skipped) - for (rom c=Btxt (*i_out), first_c=Btxt (first_i) ; c >= first_c-1/*one beyond*/ && n <= max_lines; c--) - if (c == (first_c-1) || *c == '\n') { // we consider character before the start to also be a "virtual newline" - nl[n] = c; - if (n) l[n] = ((nl[n-1]) - (nl[n-1][-1] == '\r')) - (nl[n] + 1); + rom lines[9]={}; // newline pointers: nl[0] is the first from the end + uint32_t line_lens[9]={}; // lengths of segments excluding \n and \r + int line_1_modulo = -1; // value 0-3 means we know which n%4 an R2 read starts, otherwise -1. + uint32_t highest_read_in_this_call=0; + for (rom c = ((IS_R2 && vb->R2_lowest_read >= 0) ? Btxt(vb->R2_lowest_read-1) : BLSTtxt), one_before = Btxt (first_i)-1; + c >= one_before && (n <= max_lines || IS_R2); + c--) + + if (c == one_before || *c == '\n') { // we consider character before the start to also be a "virtual newline" + memmove (lines+1, lines, min_lines * sizeof(rom)); // [0] is always the current, i.e. the lowest in txt_data, line. + memmove (line_lens+1, line_lens, min_lines * sizeof(uint32_t)); + lines[0] = c+1; // first character after \n + line_lens[0] = n ? (lines[1] - lines[0] -1/*\n*/ - (lines[1][-2] == '\r')) : 0; // when n=0, it is the final, partial, line, so we considers its length to be 0. + // case: test for valid read after reading a sufficient number of lines if (n >= min_lines && - (FAF ? is_valid_read ((rom[]){ nl[n]+1, nl[n-1]+1 }, (uint32_t[]){ l[n], l[n-1] }) - : is_valid_read ((rom[]){ nl[n]+1, nl[n-1]+1, nl[n-2]+1, nl[n-3]+1 }, (uint32_t[]){ l[n], l[n-1], l[n-2], l[n-3]})) && - (segconf.is_interleaved ? 
fastq_zip_is_interleaved (nl[n]+1, l[n], nl[n-height]+1, l[n-height]) : true)) + (line_1_modulo == -1 || line_1_modulo == n % 4) && // R2: no need to call is_valid_read if we are not on a start of a read + is_valid_read (lines, line_lens) && + (segconf.is_interleaved ? is_valid_read (&lines[height], &line_lens[height]) : true) && + (segconf.is_interleaved ? fastq_zip_is_interleaved (STRi(line, 0), STRi(line, height)) : true)) { - *i_out = BNUMtxt (nl[n-min_lines]); // the final newline of this read (everything beyond is "unconsumed" and moved to the next VB) - return BLSTtxt - nl[n-min_lines]; // number of "unconsumed" characters remaining in txt_data after the last \n of this read + // case R2: starting with the last validated read, scan backwards until reaching (or not) the read we're looking for + if (IS_R2) { + uint32_t read_bnum = BNUMtxt (lines[0]); + + if (line_1_modulo == -1) { + highest_read_in_this_call = read_bnum; + line_1_modulo = n % 4; + } + + if (vb->R2_lowest_read == -1/*uninitialized*/ || vb->R2_lowest_read > read_bnum) + vb->R2_lowest_read = read_bnum; // this is the read with the lowest index in txt_data so far to be considered + + // case: in previous calls to this function, we already tested this read and all lower reads - we need to read more data + if (vb->R2_highest_read == read_bnum && vb->R2_lowest_read == 0) { + vb->R2_highest_read = highest_read_in_this_call; // the highest read we've considered + return -1; // all current txt_data has been considered and matching QNAME not found, read more data from disk please + } + + // case: the read does not have the same QNAME as the last read of R1 - continue searching backwards + if (!is_last_qname (vb, lines[0]+1/*skip @*/, strcspn (lines[0]+1, " \t\n\r"), STRa(vb->R1_last_qname))) + goto next_line; + } + + // everything after the last full read goes to the next VB + ASSERTNOTNULL (lines[min_lines]); + return BAFTtxt - lines[min_lines]; // number of "unconsumed" characters remaining in txt_data 
after the last line of this read } - - n++; - } - - ASSINP (n < max_lines, "%s: Examined %d textual lines at the end of the VB and could not find a valid read, it appears that this is not a valid %s file. Data examined:\n%.*s", - VB_NAME, max_lines-1, DT_NAME, (int)(nl[0] - nl[max_lines-1]), nl[max_lines-1] + 1); // 7 lines and their newlines - - // case: the data provided has less than 'max_lines' newlines, and within it we didn't find a read. need more data. - *i_out = (int32_t)first_i - 1; // next index to test - one before first_i - return -1; // more data please -} - -// called by txtfile_read_vblock when reading the 2nd file in a fastq pair - counts the number of fastq "lines" (each being 4 textual lines), -// comparing to the number of lines in the first file of the pair -// returns true if we have at least as much as needed, and sets unconsumed_len to the amount of excess characters read -// returns false is we don't yet have pair_1_num_lines lines - we need to read more -bool fastq_txtfile_have_enough_lines (VBlockP vb_, uint32_t *unconsumed_len, - uint32_t *my_lines, VBIType *pair_vb_i, uint32_t *pair_lines, uint32_t *pair_txt_data_len) // out - only set in case of failure -{ - START_TIMER; - - VBlockFASTQ *vb = (VBlockFASTQ *)vb_; - - // note: the opposite case where R2 has less reads than R1 is caught in txtfile_read_vblock. 
this case is also caught in zip_prepare_one_vb_for_dispatching - ASSINP (vb->pair_num_lines || - (flag.truncate && str_count_char (STRb(vb->txt_data), '\n') < 4), // we don't have any line either - the data we have is just yet-to-be-truncated partial final line - "Error: File %s has more FASTQ reads than its R1 mate (vb=%s txt_data.len=%u pair_vb_i=%u pair_num_lines=0)", - txt_name, VB_NAME, Ltxt, vb->pair_vb_i); - - rom next = B1STtxt; - rom after = BAFTtxt; - - uint32_t pair_num_txt_lines = vb->pair_num_lines * 4, line_i; - for (line_i=0; line_i < pair_num_txt_lines; line_i++) { - if (!(next = memchr (next, '\n', after - next))) { - *my_lines = line_i; - *pair_vb_i = vb->pair_vb_i; - *pair_lines = pair_num_txt_lines; - *pair_txt_data_len = vb->pair_txt_data_len; - return false; + + next_line: n++; } - next++; // skip newline - } - vb->lines.len32 = line_i / 4; - *unconsumed_len = after - next; + ASSINP (n < max_lines || IS_R2, "%s: Examined %d textual lines at the end of the VB and could not find a valid read, it appears that this is not a valid %s file. 
Last %u lines examined:\n[0]=\"%.*s\"\n[1]=\"%.*s\"\n[2]=\"%.*s\"\n[3]=\"%.*s\"\n[4]=\"%.*s\"\n", + VB_NAME, n, DT_NAME, MIN_(n, 5), STRfi(line,0), STRfi(line,1), STRfi(line,2), STRfi(line,3), STRfi(line,4)); - COPY_TIMER (fastq_txtfile_have_enough_lines); - return true; + return -1; // uncompress one more mgzip block please } void fastq_zip_set_txt_header_flags (struct FlagsTxtHeader *f) @@ -242,6 +292,23 @@ void fastq_zip_after_compute (VBlockP vb) if (!flag.deep && IS_REF_LOADED_ZIP) { DO_ONCE ref_verify_organism (vb); } + + // capture the last qname of each R1 VB, allowing the generation of the respective R2 VB in fastq_unconsumed + if (IS_R1) { + // note: we store qnames with indirection (index) because this function is called out-of-order + buf_alloc_zero (evb, &z_file->R1_last_qname_index, 0, vb->vblock_i - z_file->R1_first_vb_i + 1, uint64_t, CTX_GROWTH, NULL); // pre-allocated in fastq_zip_after_segconf + *B64(z_file->R1_last_qname_index, vb->vblock_i - z_file->R1_first_vb_i) = z_file->R1_last_qname.len; + z_file->R1_last_qname_index.len32 = MAX_(z_file->R1_last_qname_index.len32, vb->vblock_i - z_file->R1_first_vb_i + 1); + + STRlast (qname, FASTQ_QNAME); + qname_canonize (QNAME1, qname, &qname_len); // get canonical qname_len + + buf_add_more (evb, &z_file->R1_last_qname, qname, qname_len, NULL); // pre-allocated in fastq_zip_after_segconf + BNXTc (z_file->R1_last_qname) = 0; // nul + + if (flag.show_bgzf) + iprintf ("R1_LAST_QNAME vb=%-7s qname=\"%.*s\" num_lines=%u\n", VB_NAME, STRf(qname), vb->lines.len32); + } } // case of --optimize-DESC: generate the prefix of the read name from the txt file name @@ -301,10 +368,13 @@ void fastq_zip_initialize (void) seg_prepare_snip_other (SNIP_COPY, _FASTQ_QNAME, 0, 0, copy_qname_snip); } - // reset lcodec for STRAND and GPOS, as these may change between PAIR_1 and PAIR_2 files + // reset lcodec for STRAND and GPOS, as these may change between PAIR_R1 and PAIR_R2 files ZCTX(FASTQ_STRAND)->lcodec = CODEC_UNKNOWN; 
ZCTX(FASTQ_GPOS )->lcodec = CODEC_UNKNOWN; + if (IS_R1) + z_file->R1_first_vb_i = z_file->num_vbs + 1; // 1 for --pair, >1 for --deep + // with REF_EXTERNAL, we don't know which chroms are seen (bc unlike REF_EXT_STORE, we don't use is_set), so // we just copy all reference contigs. this are not needed for decompression, just for --coverage/--sex/--idxstats if (IS_REF_EXTERNAL && z_file->num_txts_so_far == 1) // single file, or first of pair (and never Deep) @@ -319,15 +389,6 @@ void fastq_zip_initialize (void) // called by main thread after each txt file compressing is done void fastq_zip_finalize (bool is_last_user_txt_file) { - // TO DO: bug 1044 - // double deep_pc = z_file->deep_stats[NDP_FQ_READS] ? (double)(z_file->deep_stats[NDP_DEEPABLE] + z_file->deep_stats[NDP_DEEPABLE_TRIM]) / (double)z_file->deep_stats[NDP_FQ_READS] : 0; - - // after compressing R1, if it turns out that almost all reads were deeped, no need to - // pair (this saves loading paired sections from z_file, and if BGZF - we can decompress in the compute thread) - // if (!is_last_user_txt_file && flag.deep && flag.pair && - // ((!flag.best && deep_pc > 0.99) || (flag.best && deep_pc > 0.998))) - // flag.pair = PAIR_DEEP_ONLY; - if (is_last_user_txt_file && flag.deep) fastq_deep_zip_finalize(); @@ -350,9 +411,14 @@ void fastq_seg_initialize (VBlockP vb_) // if no --pair, segconf to determine if file is interleaved segconf.is_interleaved = (flag.pair || flag.no_interleaved) ? 
no : unknown; + } - // this optimization is not dependent on segconf results, and is needed by segconf - so we initialize here - if (flag.optimize) segconf.optimize[FASTQ_QNAME] = !flag.deep; + // if this is an R2 VB that has been uncompressed in the compute thread, verify the number lines + if (IS_R2 && TXT_IS_IN_SYNC) { + uint32_t actual_num_lines = str_count_char (STRb(vb->txt_data), '\n') / 4; + ASSERT (actual_num_lines == vb->lines.len32/*set in seg_all_data_lines*/, + "expecting n_reads=%u in %s to match n_reads=%u in corresponding R1 vb=%u. effective_codec=%s. Please report this to "EMAIL_SUPPORT". Solution: use --no-bgzf.", + actual_num_lines, VB_NAME, vb->lines.len32, vb->R1_vb_i, codec_name (txt_file->effective_codec)); } vb->has_extra = segconf.has_extra; // VB-private copy @@ -391,7 +457,7 @@ void fastq_seg_initialize (VBlockP vb_) buf_alloc (vb, &gpos_ctx->local, 1, vb->lines.len, uint32_t, CTX_GROWTH, CTX_TAG_LOCAL); buf_alloc (vb, &strand_ctx->local, 0, roundup_bits2bytes64 (vb->lines.len), uint8_t, 0, CTX_TAG_LOCAL); - if (vb->pair_vb_i) + if (vb->R1_vb_i) buf_alloc (vb, &gpos_d_ctx->local, 1, vb->lines.len, int16_t, CTX_GROWTH, CTX_TAG_LOCAL); } } @@ -406,14 +472,14 @@ void fastq_seg_initialize (VBlockP vb_) // initialize QUAL to LT_BLOB, it might be changed later to LT_CODEC (eg domq, longr) ctx_set_ltype (VB, LT_BLOB, FASTQ_QUAL, DID_EOL); - if (flag.pair == PAIR_R1) - // cannot all_the_same with no b250 for PAIR_1 - SQBITMAP.b250 is tested in fastq_get_pair_1_gpos_strand + if (IS_R1) + // cannot all_the_same with no b250 for PAIR_R1 - SQBITMAP.b250 is tested in fastq_get_pair_1_gpos_strand // See defect 2023-02-11. 
We rely on this "no_drop_b250" in fastq_piz_get_r2_is_forward bitmap_ctx->no_drop_b250 = true; - else if (flag.pair == PAIR_R2) { - ASSERT (vb->lines.len32 == vb->pair_num_lines, "in vb=%s (PAIR_R2): pair_num_lines=%u but lines.len=%u", - VB_NAME, vb->pair_num_lines, vb->lines.len32); + else if (IS_R2) { + ASSINP (vb->lines.len32 == vb->R1_num_lines, NO_PAIR_FMT_PREFIX "in vb=%s: lines.len=%u but R1_num_lines=%u in its corresponding R1 vb_i=%u)%s", + txt_name, VB_NAME, vb->lines.len32, vb->R1_num_lines, vb->R1_vb_i, NO_PAIR_FMT_SUFFIX); // we're pair-2, decompress all of pair-1's contexts needed for pairing piz_uncompress_all_ctxs (VB); @@ -434,7 +500,7 @@ void fastq_seg_initialize (VBlockP vb_) ctx_consolidate_stats (VB, (segconf.desc_is_l3 ? FASTQ_LINE3 : FASTQ_QNAME), FASTQ_AUX, FASTQ_EXTRA, DID_EOL); } - if (flag.pair == PAIR_R2) + if (IS_R2) ctx_create_node (VB, FASTQ_SQBITMAP, (char[]){ SNIP_SPECIAL, FASTQ_SPECIAL_mate_lookup }, 2); // when pairing, we cannot have singletons, bc a singleton in R1, when appearing in R2 will not @@ -507,22 +573,48 @@ void fastq_segconf_finalize (VBlockP vb) qname_segconf_finalize (vb); - // set optimizations + // set optimizations (these might tell get canceled in segconf_finalize_optimize()) if (flag.optimize) { + segconf.optimize[FASTQ_QNAME] = !flag.deep; + segconf.optimize[FASTQ_QNAME2] = (!flag.deep && segconf.has_qname2); // note: we don't reset has_qname2 as fastq_zip_modify needs it to parse the read + segconf.optimize[FASTQ_LINE3] = (segconf.line3 != L3_EMPTY); + // optimize QUAL unless already binned (8 is the number of bins in Illimina: https://sapac.illumina.com/content/dam/illumina-marketing/documents/products/technotes/technote_understanding_quality_scores.pdf) - segconf.optimize[FASTQ_QUAL] = (segconf_get_num_qual_scores(QHT_QUAL) > 8); + segconf.optimize[FASTQ_QUAL] = (segconf_get_num_qual_scores(QHT_QUAL) > 8); + } +} + +// called after segconf inc. 
segconf_finalize_optimize() which might remove optimizations +void fastq_zip_after_segconf (void) +{ + if (IS_R1) { + double est_num_vbs = MAX_(1, (double)txtfile_get_seggable_size() / (double)segconf.vb_size * 1.1); - segconf.optimize[FASTQ_LINE3] = (segconf.line3 == L3_OPTIMIZED_AWAY); + // allocate memory to store txt_data.len32 of each R1 VB + buf_alloc (evb, &z_file->R1_txt_data_lens, 0, est_num_vbs, uint32_t, 0, "z_file->R1_txt_data_lens"); + + // allocate memory to store the last qname (canonized) of each R1 VB + uint32_t canonical_len = strlen (segconf.qname_line0[QNAME1].s); + qname_canonize (QNAME1, segconf.qname_line0[QNAME1].s, &canonical_len); + buf_alloc (evb, &z_file->R1_last_qname, 0, est_num_vbs * (1 + canonical_len), char, 0, "z_file->R1_last_qname"); + buf_alloc (evb, &z_file->R1_last_qname_index, 0, est_num_vbs, uint64_t, 0, "z_file->R1_last_qname_index"); + } - if (!flag.deep) { - segconf.optimize[FASTQ_QNAME] = true; - segconf.optimize[FASTQ_QNAME2] = segconf.has_qname2; // note: we don't reset has_qname2 as fastq_zip_modify needs it to parse the read - segconf.qname_flavor[QNAME2] = NULL; - ZCTX(FASTQ_QNAME2)->st_did_i = FASTQ_QNAME; // consolidate_stats doesn't work for QNAME2 because it is not merged if optimized - } + if (segconf.optimize[FASTQ_LINE3]) { + segconf.line3 = L3_EMPTY; - if (segconf.optimize[FASTQ_QNAME]) - fastq_get_optimized_qname_read_name(); + if (segconf.desc_is_l3) + segconf.seq_len_dict_id.num = 0; + } + + if (segconf.optimize[FASTQ_QNAME2]) { + segconf.qname_flavor[QNAME2] = NULL; + ZCTX(FASTQ_QNAME2)->st_did_i = FASTQ_QNAME; // consolidate_stats doesn't work for QNAME2 because it is not merged if optimized + } + + if (segconf.optimize[FASTQ_QNAME]) { + segconf.qname_flavor[QNAME1] = qname_get_optimize_qf(); + fastq_get_optimized_qname_read_name(); } } @@ -591,7 +683,7 @@ void fastq_seg_finalize (VBlockP vb) memmove (&prefixes[px_i], &prefixes[px_i+px_len], sizeof (prefixes)-(px_i+px_len)); \ prefixes_len -= px_len; }) 
- bool has_line3 = segconf.line3 != L3_EMPTY && segconf.line3 != L3_OPTIMIZED_AWAY; + bool has_line3 = segconf.line3 != L3_EMPTY; // whether the Description (QNAME2 + EXTRA + AUX) appears on line 1 or line 3 // note: if on both, the line 3 is just a copy snip from line 1 @@ -630,22 +722,18 @@ bool fastq_seg_is_small (ConstVBlockP vb, DictId dict_id) // ZIP/PIZ main thread: called ahead of zip or piz a pair 2 vb - to read data we need from the previous pair 1 file // returns true if successful, false if there isn't a vb with vb_i in the previous file -void fastq_read_pair_1_data (VBlockP vb_, VBIType pair_vb_i) +void fastq_read_R1_data (VBlockP vb_, VBIType R1_vb_i) { START_TIMER; + VBlockFASTQP vb = (VBlockFASTQP)vb_; if (flag.no_zriter) zriter_flush(); - VBlockFASTQP vb = (VBlockFASTQP)vb_; - - vb->pair_vb_i = pair_vb_i; + Section sec = sections_vb_header (R1_vb_i); - Section sec = sections_vb_header (pair_vb_i); - vb->pair_num_lines = sec->num_lines; + vb->R1_vb_i = R1_vb_i; + vb->R1_num_lines = sec->num_lines; - if (flag.debug) // use --debug to access - displays in errors in txtfile_read_vblock - vb->pair_txt_data_len = BGEN32 (zfile_read_section_header (vb, sec, SEC_VB_HEADER).vb_header.recon_size); - // read into ctx->pair the data we need from our pair: QNAME,QNAME2,LINE3 and its components, GPOS and STRAND buf_alloc (vb, &vb->z_section_headers, MAX_DICTS * 2, 0, uint32_t, 0, "z_section_headers"); // indices into vb->z_data of section headers @@ -654,13 +742,13 @@ void fastq_read_pair_1_data (VBlockP vb_, VBIType pair_vb_i) if (flag.no_zriter) file_seek (z_file, 0, SEEK_END, READ, HARD_FAIL); // restore - COPY_TIMER (fastq_read_pair_1_data); + COPY_TIMER (fastq_read_R1_data); } // main thread: after reading VB_HEADER and before reading local/b250 sections from z_file void fastq_piz_before_read (VBlockP vb) { - if (writer_am_i_pair_2 (vb->vblock_i, &VB_FASTQ->pair_vb_i)) { // sets pair_vb_i if R2, leaves it 0 if R1 + if (writer_am_i_pair_2 (vb->vblock_i, 
&VB_FASTQ->R1_vb_i)) { // sets R1_vb_i if R2, leaves it 0 if R1 // backward compatability: prior to V15, PIZ didn't rely on FlagCtx.paired, and it was not always // applied: SQBITMAP.b250 in v14 and GPOS.local (at least) in v14 incorrectly didn't set the flag @@ -675,8 +763,8 @@ void fastq_piz_before_read (VBlockP vb) bool fastq_piz_init_vb (VBlockP vb, ConstSectionHeaderVbHeaderP header) { // in case of this is a R2 of a paired fastq file, get the R1 data - if (vb && VB_FASTQ->pair_vb_i > 0) - fastq_read_pair_1_data (vb, VB_FASTQ->pair_vb_i); + if (vb && VB_FASTQ->R1_vb_i > 0) + fastq_read_R1_data (vb, VB_FASTQ->R1_vb_i); return true; } @@ -761,14 +849,11 @@ static rom fastq_seg_get_lines (VBlockFASTQP vb, rom line, int32_t remaining, // get LINE3 *qual = seg_get_next_line (VB, *line3, &remaining, line3_len, true, &has_13[2], "LINE3"); - // analyze Line3 (segconf) + // analyze Line3 (segconf). note: if flag.optimize, we will update in fastq_seg_finalize if (analyze) { if (*line3_len == 0) segconf.line3 = L3_EMPTY; - else if (flag.optimize) - segconf.line3 = L3_OPTIMIZED_AWAY; - else if (fastq_is_line3_copy_of_line1 (STRa(*qname), STRa(*line3), *desc_len)) segconf.line3 = L3_COPY_LINE1; @@ -874,9 +959,14 @@ rom fastq_zip_modify (VBlockP vb_, rom line_start, uint32_t remaining) // case: optimize -> empty line3 if (segconf.optimize[FASTQ_LINE3]) - CTX(FASTQ_LINE3)->txt_shrinkage += line3_len; - else + CTX(FASTQ_LINE3)->txt_shrinkage += line3_len + (segconf.desc_is_l3 ? 
(1 + desc_len) : 0); + else { next = mempcpy (next, line3, line3_len); + if (segconf.desc_is_l3 && desc_len) { + *next++ = ' '; + next = mempcpy (next, desc, desc_len); + } + } *next++ = '\n'; diff --git a/src/fastq.h b/src/fastq.h index 43abfda7..4219a279 100644 --- a/src/fastq.h +++ b/src/fastq.h @@ -14,7 +14,10 @@ // SAM and FASTQ share the same Dids and DictIds #define FASTQ_CONTIG SAM_RNAME #define FASTQ_QNAME SAM_QNAME +#define FASTQ_Q0NAME SAM_Q0NAME +#define FASTQ_QmNAME SAM_QmNAME #define FASTQ_QNAME2 SAM_QNAME2 +#define FASTQ_QmNAME2 SAM_QmNAME2 #define FASTQ_AUX SAM_AUX #define FASTQ_SQBITMAP SAM_SQBITMAP #define FASTQ_NONREF SAM_NONREF @@ -41,9 +44,11 @@ #define _FASTQ_QNAME _SAM_QNAME #define _FASTQ_Q0NAME _SAM_Q0NAME #define _FASTQ_Q1NAME _SAM_Q1NAME +#define _FASTQ_QmNAME _SAM_QmNAME #define _FASTQ_QNAME2 _SAM_QNAME2 #define _FASTQ_Q0NAME2 _SAM_Q0NAME2 #define _FASTQ_Q1NAME2 _SAM_Q1NAME2 +#define _FASTQ_QmNAME2 _SAM_QmNAME2 #define _FASTQ_AUX _SAM_AUX #define _FASTQ_SQBITMAP _SAM_SQBITMAP #define _FASTQ_NONREF _SAM_NONREF @@ -70,17 +75,25 @@ #define NUM_FASTQ_FIELDS NUM_SAM_FIELDS +#define NO_PAIR_FMT_PREFIX "--pair cannot be used because %s is not perfectly paired with its counterpart (read names differ or are not aligned) (technical: " +#define NO_PAIR_FMT_SUFFIX (flag.deep ? 
" Solution: add --not-paired" : "") + // Txtfile stuff -extern int32_t fastq_unconsumed (VBlockP vb, uint32_t first_i, int32_t *i); -extern bool fastq_txtfile_have_enough_lines (VBlockP vb, uint32_t *unconsumed_len, uint32_t *my_lines, VBIType *pair_vb_i, uint32_t *pair_lines, uint32_t *pair_txt_data_len); +extern int32_t fastq_unconsumed (VBlockP vb, uint32_t first_i); +extern bool fastq_txtfile_sync_to_R1_by_num_lines (VBlockP vb, uint32_t bytes_requested, uint32_t len, bool no_read_expected, uint32_t *my_vb_size); extern bool is_fastq (STRp(header), bool *need_more); extern bool is_fastq_pair_2 (VBlockP vb); +extern VBIType fastq_get_R1_vb_i (VBlockP vb); +extern uint32_t fastq_get_R1_num_lines (VBlockP vb); +extern uint32_t fastq_get_R1_txt_data_len (VBlockP vb); +extern rom fastq_get_R1_last_qname (VBlockP vb); extern void fastq_zip_set_txt_header_flags (struct FlagsTxtHeader *f); // ZIP Stuff extern void fastq_zip_initialize (void); extern rom fastq_zip_modify (VBlockP vb, rom line_start, uint32_t remaining); extern void fastq_segconf_set_r1_or_r2 (void); +extern void fastq_zip_after_segconf (void); extern void fastq_zip_finalize (bool is_last_user_txt_file); extern void fastq_zip_init_vb (VBlockP vb); extern void fastq_zip_after_compute (VBlockP vb); @@ -120,7 +133,7 @@ extern unsigned fastq_vb_zip_dl_size (void); extern void fastq_reset_line (VBlockP vb); // file pairing (--pair) stuff -extern void fastq_read_pair_1_data (VBlockP vb, VBIType pair_vb_i); +extern void fastq_read_R1_data (VBlockP vb, VBIType R1_vb_i); // FASTQ-specific fields in genozip header extern void fastq_zip_genozip_header (SectionHeaderGenozipHeaderP header); diff --git a/src/fastq_deep.c b/src/fastq_deep.c index a02ca0a8..1abe4eef 100644 --- a/src/fastq_deep.c +++ b/src/fastq_deep.c @@ -23,10 +23,10 @@ sSTRl(con_decanonize2_snip,96); void fastq_deep_zip_initialize (void) { DO_ONCE { - SmallContainer con1 = { .repeats = 1, .nitems_lo = 2, .items = { { .dict_id.num = _FASTQ_Q0NAME }, { 
.dict_id.num = _FASTQ_Q1NAME } }}; + SmallContainer con1 = { .repeats = 1, .nitems_lo = 2, .items = { { .dict_id.num = _FASTQ_Q0NAME }, { .dict_id.num = _FASTQ_QmNAME } }}; container_prepare_snip ((ContainerP)&con1, NULL, 0, qSTRa(con_decanonize1_snip)); - SmallContainer con2 = { .repeats = 1, .nitems_lo = 2, .items = { { .dict_id.num = _FASTQ_Q0NAME2 }, { .dict_id.num = _FASTQ_Q1NAME2 } }}; + SmallContainer con2 = { .repeats = 1, .nitems_lo = 2, .items = { { .dict_id.num = _FASTQ_Q0NAME2 }, { .dict_id.num = _FASTQ_QmNAME2 } }}; container_prepare_snip ((ContainerP)&con2, NULL, 0, qSTRa(con_decanonize2_snip)); } } @@ -41,7 +41,7 @@ void fastq_deep_seg_initialize (VBlockFASTQP vb) { ctx_set_dyn_int (VB, FASTQ_DEEP, DID_EOL); // this also sets STORE_INT. actually, we store a pointer into one of the Buffers in z_file->deep_ents, but we treat it as an int - if (flag.pair == PAIR_R1 || flag.pair == NOT_PAIRED) + if (IS_R1 || flag.pair == NOT_PAIRED) seg_by_did (VB, (char[]){ SNIP_SPECIAL, FASTQ_SPECIAL_set_deep, '0' }, 3, FASTQ_DEEP, 0); // all-the-same for FASTQ_DEEP else { // pair-2 @@ -238,7 +238,7 @@ static void fastq_deep_seg_segconf (VBlockFASTQP vb, STRp(qname), STRp(qname2), int num_qnames = 1 + (qname2_len > 0); - uint32_t qname_hash[2] = { deep_qname_hash (QNAME1, STRa(qname), NULL), + uint32_t qname_hash[2] = { deep_qname_hash (QNAME1, STRa(qname), NULL), ((num_qnames==2) ? 
deep_qname_hash (QNAME2, STRa(qname2), NULL) : 0) }; uint32_t seq_hash = deep_seq_hash (VB, STRa(seq), false); @@ -587,7 +587,7 @@ void fastq_seg_deep (VBlockFASTQP vb, ZipDataLineFASTQ *dl, STRp(qname), STRp(qn *deep_seq = *deep_qual = *deep_qname = false; // reset } - if (flag.pair == NOT_PAIRED || flag.pair == PAIR_R1) + if (flag.pair == NOT_PAIRED || IS_R1) dyn_int_append (VB, ctx, deep_value, 0); else { // PAIR_R2 @@ -657,7 +657,7 @@ void fastq_deep_seg_QNAME (VBlockFASTQP vb, Did did_i, STRp(qname), uint32_t unc else /*QNAME2*/ seg_by_did (VB, STRa(con_decanonize2_snip), did_i, 0); seg_by_did (VB, (char[]){ SNIP_SPECIAL, FASTQ_SPECIAL_deep_copy_QNAME }, 2, did_i + 1, add_bytes - uncanonical_suffix_len); - seg_by_did (VB, qname + qname_len - uncanonical_suffix_len, uncanonical_suffix_len, did_i + 2, uncanonical_suffix_len); + seg_by_did (VB, qname + qname_len - uncanonical_suffix_len, uncanonical_suffix_len, (did_i == FASTQ_QNAME) ? FASTQ_QmNAME : FASTQ_QmNAME2, uncanonical_suffix_len); } // case: entire qname is canonical - seg as "copy from deep" @@ -715,7 +715,7 @@ SPECIAL_RECONSTRUCTOR_DT (fastq_special_set_deep) ASSERTNOTEMPTY (z_file->vb_start_deep_line); uint64_t txt_deepable_line_i; - uint64_t pair_1_deep_value = vb->pair_vb_i ? fastq_get_pair_deep_value (vb, ctx) : 0; // consume whether or not used + uint64_t pair_1_deep_value = vb->R1_vb_i ? 
fastq_get_pair_deep_value (vb, ctx) : 0; // consume whether or not used if (snip[0] == '0') // no delta txt_deepable_line_i = reconstruct_from_local_int (VB, ctx, 0, RECON_OFF); @@ -889,7 +889,7 @@ SPECIAL_RECONSTRUCTOR_DT (fastq_special_deep_copy_SEQ) reconstruct_from_local_sequence (VB, CTX(FASTQ_NONREF), trim_len - vb->sam_seq_offset, reconstruct); // case we are pair-2: advance pair-1 SQBITMAP iterator, and if pair-1 is aligned - also its GPOS iterator - if (vb->pair_vb_i/*we are R2*/ && fastq_piz_R1_test_aligned (vb)) + if (vb->R1_vb_i/*we are R2*/ && fastq_piz_R1_test_aligned (vb)) CTX(FASTQ_GPOS)->localR1.next++; // gpos_ctx->localR1.next is an iterator for both gpos and strand COPY_TIMER (fastq_special_deep_copy_SEQ); diff --git a/src/fastq_desc.c b/src/fastq_desc.c index 70799908..76fa2295 100644 --- a/src/fastq_desc.c +++ b/src/fastq_desc.c @@ -17,6 +17,9 @@ void fastq_seg_QNAME (VBlockFASTQP vb, STRp(qname), uint32_t line1_len, bool dee else qname_seg (VB, QNAME1, STRa(qname), 1); // account for the '@' (segged as a toplevel container prefix) + + if (IS_R1) + set_last_txt (FASTQ_QNAME, qname); // used to populate z_file->R1_last_qname in fastq_zip_after_compute } bool fastq_is_line3_copy_of_line1 (STRp(qname), STRp(line3), uint32_t desc_len) @@ -30,7 +33,6 @@ void fastq_seg_LINE3 (VBlockFASTQP vb, STRp(qline3), STRp(qline1), STRp(desc)) { switch (segconf.line3) { case L3_EMPTY: // no segging - we will drop the line from top_level - case L3_OPTIMIZED_AWAY: ASSSEG (!qline3_len || segconf.optimize[FASTQ_QNAME], "Invalid FASTQ file format (#1): expecting middle line to be a \"+\", but it is \"+%.*s\"", STRf(qline3)); CTX(FASTQ_LINE3)->txt_len++; // account for the '+' (it is segged in the toplevel container) break; diff --git a/src/fastq_private.h b/src/fastq_private.h index d198cf92..53d81b15 100644 --- a/src/fastq_private.h +++ b/src/fastq_private.h @@ -36,12 +36,14 @@ typedef struct VBlockFASTQ { // current line uint32_t sam_seq_offset; // PIZ Deep: 
offset of start of SEQ / QUAL copied from SAM with the FASTQ SEQ / QUAL - // pairing stuff - used if we are the 2nd file in the pair - uint32_t pair_vb_i; // ZIP/PIZ: in R2: the equivalent vb_i in the R1 (vb_i >= 1), or 0 if this is R1 - uint32_t pair_num_lines; // R2: number of reads (FASTQ lines) in the equivalent vb in the R1 - uint32_t pair_txt_data_len; // ZIP R2: populated if flag.debug - - uint64_t first_line; // ZIP: used for optimizing QNAME + // data used for segging R2 + uint32_t R1_vb_i; // ZIP/PIZ R2: the equivalent vb_i in the R1 (vb_i >= 1) + STR (R1_last_qname); // ZIP R2: pointer into z_file->R1_last_qname: last reversed, nul-terminated canonical qname of the correspondining R1 VB + uint32_t R1_num_lines; // ZIP R2: number of reads (FASTQ lines) in the corresponding R1 vb + int32_t R2_lowest_read; // ZIP: in fastq_unconsumed: lowest/highest index in txt_data of reads that we tested against R1_last_qname + int32_t R2_highest_read; + + uint64_t first_line; // ZIP R2: used for optimizing QNAME bool has_extra; // ZIP: a VB-private copy of segconf.has_extra diff --git a/src/fastq_seq.c b/src/fastq_seq.c index 4d068284..ec43c1c5 100644 --- a/src/fastq_seq.c +++ b/src/fastq_seq.c @@ -50,7 +50,7 @@ void fastq_seg_SEQ (VBlockFASTQP vb, ZipDataLineFASTQ *dl, STRp(seq), bool deep) bool pair_is_forward = false; // case: R2 in paired file - if (vb->pair_vb_i) + if (vb->R1_vb_i) fastq_get_pair_1_gpos_strand (vb, &pair_gpos, &pair_is_forward); // advance iterators even if we don't need the pair data // case: R2 in interleaved file @@ -78,7 +78,7 @@ void fastq_seg_SEQ (VBlockFASTQP vb, ZipDataLineFASTQ *dl, STRp(seq), bool deep) // case: aligner - lookup from SQBITMAP MappingType aln_res; - if (aligner_ok && ((aln_res = aligner_seg_seq (VB, STRa(seq), (vb->pair_vb_i > 0), pair_gpos, pair_is_forward)))) { + if (aligner_ok && ((aln_res = aligner_seg_seq (VB, STRa(seq), (vb->R1_vb_i > 0), pair_gpos, pair_is_forward)))) { int32_t pseudo_seq_len = seq_len_by_qname (vb, 
seq_len) ? SEQ_LEN_BY_QNAME : seq_len; @@ -281,7 +281,7 @@ void fastq_recon_aligned_SEQ (VBlockP vb_, STRp(seq_len_str), ReconType reconstr VBlockFASTQP vb = (VBlockFASTQP )vb_; declare_seq_contexts; - if (vb->pair_vb_i) // R2 + if (vb->R1_vb_i) // R2 fastq_piz_R1_test_aligned (vb); // set r1_is_aligned // v14: perfect alignment is expressed by a negative seq_len @@ -302,7 +302,7 @@ void fastq_recon_aligned_SEQ (VBlockP vb_, STRp(seq_len_str), ReconType reconstr // normal reconstruction else - aligner_reconstruct_seq (vb_, vb->seq_len, vb->pair_vb_i > 0, perfect_alignment, reconstruct, NULL, NULL, NULL); + aligner_reconstruct_seq (vb_, vb->seq_len, vb->R1_vb_i > 0, perfect_alignment, reconstruct, NULL, NULL, NULL); } // PIZ: SEQ reconstruction - in case of unaligned sequence @@ -311,7 +311,7 @@ SPECIAL_RECONSTRUCTOR (fastq_special_unaligned_SEQ) declare_seq_contexts; // case we are pair-2: advance pair-1 SQBITMAP iterator, and if pair-1 is aligned - also its GPOS iterator - if (VB_FASTQ->pair_vb_i) // R2 + if (VB_FASTQ->R1_vb_i) // R2 if (fastq_piz_R1_test_aligned (VB_FASTQ) || !VER(14)) // up to v13, even non-aligned reads had a GPOS entry gpos_ctx->localR1.next++; // gpos_ctx->localR1.next is an iterator for both gpos and strand diff --git a/src/file.c b/src/file.c index 8d802402..fa34dd4c 100644 --- a/src/file.c +++ b/src/file.c @@ -23,7 +23,7 @@ #include "file.h" #include "url.h" #include "codec.h" -#include "bgzf.h" +#include "mgzip.h" #include "progress.h" #include "tar.h" #include "writer.h" @@ -356,12 +356,8 @@ static void file_initialize_txt_file_fields (FileP file) #define TXT_INIT(buf) ({ buf_set_promiscuous (&file->buf, "txt_file->" #buf); }) if (IS_ZIP) { - mutex_initialize (file->recon_plan_mutex); - // initialize evb "promiscuous" buffers - i.e. buffers that can be allocated by any thread // promiscuous buffers must be initialized by the main thread, and buffer.c does not verify their integrity. 
- TXT_INIT(line_info[0]); - TXT_INIT(line_info[1]); TXT_INIT(vb_info[0]); TXT_INIT(vb_info[1]); } @@ -373,7 +369,7 @@ static void file_initialize_txt_file_fields (FileP file) static void file_open_ext_decompessor (FileP file, rom exec_name, rom subcommand, Codec streamed_codec, bool name_if_not_remote, rom args[7]) { char reason[64]; // used for error message if stream_create fails - snprintf (reason, sizeof(reason), "To compress a %s file", codec_name (file->codec)); + snprintf (reason, sizeof(reason), "To compress a %s file", codec_name (file->src_codec)); input_decompressor = stream_create (0, DEFAULT_PIPE_SIZE, DEFAULT_PIPE_SIZE, 0, 0, @@ -387,7 +383,7 @@ static void file_open_ext_decompessor (FileP file, rom exec_name, rom subcommand file->file = stream_from_stream_stdout (input_decompressor); file->redirected = true; - file->codec = streamed_codec; // data received from input_decompressor is in this codec + file->effective_codec = streamed_codec; // data received from input_decompressor is in this codec } static void file_open_txt_read_bz2 (FileP file) @@ -423,7 +419,17 @@ static void file_open_txt_read_gz (FileP file) #endif } - txtfile_discover_gz_codec (file); // decide between CODEC_GZ, CODEC_BGZF or CODEC_GZIL + // case: discovery deferred to the end of segconf when we know segconf.tech + if (file->data_type == DT_FASTQ) { // note: even if --no-bgzf: so we can report correct src_codec in stats + file->effective_codec = file->src_codec = txtfile_is_gzip (file) ? CODEC_GZ : CODEC_NONE; // based on the first 3 bytes + file->discover_during_segconf = (file->effective_codec == CODEC_GZ); + txtfile_initialize_igzip (file); + } + + // run discovery now if not FASTQ. That's because other data types might have header + // which is read before segconf. luckily FASTQ doesn't. 
+ else + txtfile_discover_specific_gz (file); // decide between GZ, BGZF and NONE } FileP file_open_txt_read (rom filename) @@ -483,11 +489,12 @@ FileP file_open_txt_read (rom filename) if (file_open_txt_read_test_valid_dt (file)) goto fail; // skip this file // open the file, based on the codec (as guessed by file extension) - file->codec = file_get_codec_by_txt_ft (file->data_type, file->type, false); - file->source_codec = file_get_codec_by_txt_ft (file->data_type, file->type, true); + // file->codec = file_get_codec_by_txt_ft (file->data_type, file->type, false); + file->src_codec = file_get_codec_by_txt_ft (file->data_type, file->type, true); + file->effective_codec = file->src_codec; // initialize: can be changed if streaming or if gz variant - switch (file->codec) { - case CODEC_GZ: case CODEC_BGZF: case CODEC_NONE: gz: + switch (file->src_codec) { + case CODEC_GZ: case CODEC_BGZF: case CODEC_BAM: case CODEC_NONE: gz: file_open_txt_read_gz (file); break; @@ -499,8 +506,8 @@ FileP file_open_txt_read (rom filename) // note: in CRAM, we read the header in advance in possible, directly (without samtools), so we can handle the case // that the reference file is wrong. In samtools, if we read beyond the header with a wrong ref, samtools will hang. 
if (!file->is_remote && !file->redirected) { - cram_inspect_file (file); // if file is indeed CRAM, updates file->est_num_lines, file->header_size, and if not, updates file->data_type and file->codec/source_codec - if (file->codec == CODEC_GZ || file->codec == CODEC_NONE) goto gz; // actually, this is a GZ file (possibly BAM) + cram_inspect_file (file); // if file is indeed CRAM, updates file->est_num_lines, file->header_size, and if not, updates file->data_type and file->codec/src_codec + if (file->src_codec == CODEC_GZ || file->src_codec == CODEC_NONE) goto gz; // actually, this is a GZ file (possibly BAM) } StrTextSuperLong samtools_T_option = cram_get_samtools_option_T (gref); @@ -510,8 +517,12 @@ FileP file_open_txt_read (rom filename) "--threads=10", // in practice, samtools is able to consume ~4 cores file_samtools_no_PG() ? "--no-PG" : SKIP_ARG, // don't add a PG line to the header samtools_T_option.s[0] ? samtools_T_option.s : SKIP_ARG }); + + if (flag.no_bgzf) { + file->effective_codec = CODEC_GZ; + txtfile_initialize_igzip (file); + } - txtfile_discover_gz_codec (file); // also allocates gz_data break; } @@ -562,7 +573,7 @@ FileP file_open_txt_read (rom filename) return NULL; } -FileP file_open_txt_write (rom filename, DataType data_type, BgzfLevel bgzf_level) +FileP file_open_txt_write (rom filename, DataType data_type, MgzipLevel bgzf_level) { ASSERT (data_type > DT_NONE && data_type < NUM_DATATYPES ,"invalid data_type=%d", data_type); @@ -573,10 +584,10 @@ FileP file_open_txt_write (rom filename, DataType data_type, BgzfLevel bgzf_leve file->data_type = data_type; file->redirected = !filename; - file->codec = data_type == DT_CRAM ? CODEC_CRAM - : data_type == DT_BCF ? CODEC_BCF - : bgzf_level != BGZF_NO_BGZF ? CODEC_BGZF // see bgzf_piz_calculate_bgzf_flags - : /* BGZF_NO_BGZF */ CODEC_NONE; + file->effective_codec = data_type == DT_CRAM ? CODEC_CRAM + : data_type == DT_BCF ? CODEC_BCF + : bgzf_level != BGZF_NO_BGZF ? 
CODEC_BGZF // see mgzip_piz_calculate_mgzip_flags + : /* BGZF_NO_BGZF */ CODEC_NONE; if (!file->redirected) { // not stdout if (file_exists (filename) && @@ -599,7 +610,7 @@ FileP file_open_txt_write (rom filename, DataType data_type, BgzfLevel bgzf_leve if (flag.no_writer) return file; // open the file, based on the codec - switch (file->codec) { + switch (file->effective_codec) { case CODEC_BGZF : case CODEC_NONE : file->file = file->redirected ? fdopen (STDOUT_FILENO, "wb") : fopen (file->name, WRITE); break; @@ -812,7 +823,7 @@ FileP file_open_z_read (rom filename) } // opens z_file for read or write -FileP file_open_z_write (rom filename, FileMode mode, DataType data_type, Codec source_codec) +FileP file_open_z_write (rom filename, FileMode mode, DataType data_type, Codec src_codec) { START_TIMER; @@ -835,7 +846,7 @@ FileP file_open_z_write (rom filename, FileMode mode, DataType data_type, Codec file->type = file_get_type_force_dt (file->name, data_type); file->data_type = data_type; - file->source_codec = source_codec; + file->src_codec = src_codec; file->basename = filename_base (file->name, false, NULL, NULL, 0); @@ -892,7 +903,7 @@ FileP file_open_z_write (rom filename, FileMode mode, DataType data_type, Codec return file; } -// index file is it is a disk file of a type that can be indexed +// PIZ: index file is it is a disk file of a type that can be indexed static void file_index_txt (ConstFileP file) { ASSERTNOTNULL (file); @@ -904,20 +915,20 @@ static void file_index_txt (ConstFileP file) switch (file->data_type) { case DT_SAM: case DT_BAM: - RETURNW (file->codec == CODEC_BGZF,, "%s: output file needs to be a .sam.gz or .bam to be indexed", global_cmd); + RETURNW (file->effective_codec == CODEC_BGZF,, "%s: output file needs to be a .sam.gz or .bam to be indexed", global_cmd); indexing = stream_create (0, 0, 0, 0, 0, 0, 0, "to create an index", "samtools", "index", file->name, NULL); break; case DT_VCF: - RETURNW (file->codec == CODEC_BGZF,, "%s: output 
file needs to be a .vcf.gz or .bcf to be indexed", global_cmd); + RETURNW (file->effective_codec == CODEC_BGZF,, "%s: output file needs to be a .vcf.gz or .bcf to be indexed", global_cmd); RETURNW (vcf_header_get_has_fileformat(),, "%s: file needs to start with ##fileformat=VCF be indexed", global_cmd); indexing = stream_create (0, 0, 0, 0, 0, 0, 0, "to create an index", "bcftools", "index", file->name, NULL); break; case DT_FASTQ: case DT_FASTA: - RETURNW (file->codec == CODEC_BGZF || file->codec == CODEC_NONE,, - "%s: To be indexed, the output file cannot be compressed with %s", global_cmd, codec_name (file->codec)); + RETURNW (file->effective_codec == CODEC_BGZF || file->effective_codec == CODEC_NONE,, + "%s: To be indexed, the output file cannot be compressed with %s", global_cmd, codec_name (file->effective_codec)); indexing = stream_create (0, 0, 0, 0, 0, 0, 0, "to create an index", "samtools", "faidx", file->name, NULL); break; @@ -949,13 +960,13 @@ void file_close (FileP *file_p) if (file->file && file->supertype == TXT_FILE) { - if (file->mode == READ && file->codec == CODEC_BZ2) + if (file->mode == READ && file->effective_codec == CODEC_BZ2) BZ2_bzclose((BZFILE *)file->file); else if (file->mode == READ && is_read_via_ext_decompressor (file)) stream_close (&input_decompressor, STREAM_WAIT_FOR_PROCESS); - else if (file->mode == WRITE && is_written_via_ext_compressor (file->codec)) + else if (file->mode == WRITE && is_written_via_ext_compressor (file->effective_codec)) stream_close (&output_compressor, STREAM_WAIT_FOR_PROCESS); // if its stdout - just flush, don't close - we might need it for the next file @@ -986,7 +997,7 @@ void file_close (FileP *file_p) FCLOSE (file->file, file_printname (file)); FCLOSE (file->z_reread_file, file_printname (file)); } - serializer_destroy (file->digest_serializer); + serializer_destroy (file->digest_serializer); } // free resources if we are NOT near the end of the execution. 
If we are at the end of the execution @@ -1006,7 +1017,6 @@ void file_close (FileP *file_p) mutex_destroy (file->dicts_mutex); mutex_destroy (file->custom_merge_mutex); mutex_destroy (file->qname_huf_mutex); - mutex_destroy (file->recon_plan_mutex); FREE (file->name); FREE (file->basename); @@ -1154,30 +1164,6 @@ bool file_seek (FileP file, int64_t offset, return !ret; } -int64_t file_tell_do (FileP file, FailType soft_fail, rom func, unsigned line) -{ - ASSERTNOTNULL (file); - ASSERTNOTNULL (file->file); - - if (IS_ZIP && file->supertype == TXT_FILE && file->codec == CODEC_GZ) - return txt_file->disk_so_far; - - if (IS_ZIP && file->supertype == TXT_FILE && file->codec == CODEC_BZ2) - return BZ2_consumed ((BZFILE *)file->file); - - int64_t offset = ftello64 ((FILE *)file->file); - ASSERT (offset >= 0 || soft_fail, "called from %s:%u: ftello64 failed for %s (FILE*=%p remote=%s redirected=%s): %s", - func, line, file->name, file->file, TF(file->is_remote), TF(file->redirected), strerror (errno)); - - if (offset < 0) return -1; // soft fail - - // in in z_file that is being tarred, update the offset to the beginning of the file data in the tar file - if (file->supertype == Z_FILE) - offset -= tar_file_offset(); // 0 if not using tar - - return offset; -} - uint64_t file_get_size (rom filename) { struct stat64 st; diff --git a/src/file.h b/src/file.h index 19dd76d7..536d18d3 100644 --- a/src/file.h +++ b/src/file.h @@ -36,15 +36,17 @@ typedef struct File { bool is_in_tar; // z_file: file is embedded in tar file bool is_scanned; // TXT_FILE: sam_sag_by_flag_scan_for_depn has been performed for this file DataType data_type; - Codec source_codec; // TXT_FILE ZIP: codec of txt file before redirection (eg CRAM, XZ, ZIP...). Note: CODEC_BAM if BAM (with or without internal bgzf compression) + Codec src_codec; // TXT_FILE ZIP/PIZ: internal or external codec of txt file (eg CRAM, BAM, XZ, ZIP, BGZF, MGZF, NONE...). Passed in SectionHeaderTxtHeader.src_codec. 
// Z_FILE PIZ: set to CODEC_BCF or CODEC_CRAM iff GenozipHeader.data_type is DT_BCF/DT_CRAM - Codec codec; // TXT_FILE ZIP: internal decompression codec used with this file. If redirected - as read by txtfile (eg for cram files this is BGZF) - Codec gunzip_method; // TXT_FILE ZIP: if codec∈{GZ,BGZF,GZIL}, method used to decompress it (either the same as codec, or GZ) + Codec effective_codec; // TXT_FILE ZIP: method with which we actually uncompress txt_file: can be different than .codec, e.g. using GZ instead of a specialized gzip codec, or using BGZF/NONE when streaming + // TXT_FILE PIZ: reconstruction codec (possibile values: NONE, BGZF, BCF, CRAM) + uint32_t num_EOF_blocks; // TXT_FILE ZIP MGZIP: number of EOF blocks encountered // these relate to actual bytes on the disk int64_t disk_size; // ZIP: size of actual file on disk. 0 if not known (eg stdin or http stream). int64_t disk_so_far; // ZIP: Z/TXT_FILE: data actually read/write to/from "disk" (using fread/fwrite), (TXT_FILE: possibley bgzf/gz/bz2 compressed ; 0 if external compressor is used for reading). int64_t disk_gz_uncomp_or_trunc; // ZIP: TXT_FILE: gz-compressed data actually either decompressed or discarded due to truncate + int64_t gz_blocks_so_far; // ZIP: TXT_FILE: number of gz blocks read from disk int64_t est_seggable_size; // TXT_FILE ZIP, access via txtfile_get_seggable_size(). Estimated size of txt_data in file, i.e. excluding the header. 
It is exact for plain files, or based on test_vb if the file has source compression int64_t est_num_lines; // TXT_FILE ZIP, an alternative for progress bar - by lines instead of bytes (used for CRAM) @@ -122,18 +124,24 @@ typedef struct File { // TXT file: reading Buffer unconsumed_txt; // ZIP: excess uncompressed data read from the txt file - moved to the next VB: the final part of vb->txt_data that was not consumed - Buffer unconsumed_bgz_blocks; // ZIP TXT BGZF/GZIL: unconsumed or partially consumed bgzf/gzil blocks - moved to the next VB - Buffer gz_data; // ZIP TXT GZ: yet-unconsumed gz data read from disk. .comp_len/.uncomp_len refer to the first block in the buffer (in GZIL, but not BGZF, there might be additional data after the first block) - Buffer igzip_state; // ZIP TXT GZ (with igzip) - - // TXT file: BGZF stuff reading and writing compressed txt files - Buffer bgzf_isizes; // ZIP/PIZ: BGZF: uncompressed size of the BGZF blocks in which this txt file is compressed (in BGEN16). - // ZIP : GZIL: only .len is used to count GZIL blocks (as their isize is always 1MB except for the last block) - Buffer bgzf_starts; // ZIP: offset in txt_file of each BGZF block + Buffer unconsumed_mgzip_blocks; // ZIP TXT MGZIP codecs: unconsumed or partially consumed MGZIP blocks - moved to the next VB + Buffer gz_data; // ZIP TXT GZ: yet-unconsumed gz data read from disk. .comp_len/.uncomp_len refer to the first block in the buffer (in IL1M, but not BGZF, there might be additional data after the first block) + Buffer igzip_state; // ZIP TXT GZ (with igzip). 
+ uint64_t start_gz_block; // ZIP TXT GZ (with igzip): offset in file of start of a gz block + uint32_t mgsp_vb_isize; // ZIP TXT GZ (with MGSP): isize of each gz block in the VB (except for the last block that might be slightly more) + uint32_t num_mgsp_blocks_in_vb; // ZIP TXT GZ (with MGSP): number of gz blocks in this VB + uint32_t max_mgsp_blocks_in_vb; // ZIP TXT GZ (with MGSP): max gz blocks in a VB so far + uint32_t max_mgzip_isize; // ZIP TXT GZ: largest MGZIP gz block + bool discover_during_segconf; // ZIP TXT GZ: gz discovery during segconf instead of file_open: for FASTQ files + + // TXT file: MGZIP stuff reading and writing compressed txt files + Buffer mgzip_isizes; // ZIP/PIZ: MGZIP: uncompressed size of the MGZIP blocks in which this txt file is compressed + Buffer mgzip_starts; // ZIP: offset in txt_file of each BGZF block Buffer bgzf_plausible_levels; // ZIP: discovering library/level. .count = number of BGZF blocks tested so far - struct FlagsBgzf bgzf_flags; // corresponds to SectionHeader.flags in SEC_BGZF + struct FlagsMgzip mgzip_flags; // corresponds to SectionHeader.flags in SEC_MGZIP uint8_t bgzf_signature[3]; // PIZ: 3 LSB of size of source BGZF-compressed file, as passed in SectionHeaderTxtHeader.codec_info - + bool non_EOF_zero_block_found; // ZIP: file contains an isize=0 block that is not identical to the EOF block, therefore we won't be able to reconstruct exactly + // TXT FILE: accounting for truncation when --truncate-partial-last-line is used uint32_t last_truncated_line_len; // ZIP: bytes truncated due to incomplete final line. 
note that if file is BGZF, then this truncated data is contained in the final intact BGZF blocks, after already discarding the final incomplete BGZF block @@ -173,12 +181,9 @@ typedef struct File { int qnames_sampled; // Z_FILE: PIZ: Deep: Number of QNAMEs sampled for producing the huffman compressor // Reconstruction plan, for reconstructing in sorted order if --sort: [0] is primary coords, [1] is luft coords - Mutex recon_plan_mutex; // TXT_FILE ZIP: VCF: protect vb_info and line_info during merging of VB data Buffer vb_info[2]; // TXT_FILE ZIP: VCF: array of ZipVbInfo per VB, indexed by (vb_i-1), 0:PRIMARY, 1:LUFT // Z_FILE ZIP: SAM: array of SamGcVbInfo for: 0:PRIM 1:DEPN // Z_FILE PIZ: [0]: used by writer [1]: used to load SAM SA Groups - array of PlsgVbInfo - entry per PRIM vb - Buffer line_info[2]; // TXT_FILE ZIP: VCF: array of LineInfo per line or gapless range in txt_file - // SAM: array of uint32 - lengths of lines in PRIM/DEPN Buffer recon_plan; // TXT_FILE ZIP/PIZ: array of ReconPlanItem - order of reconstruction of ranges of lines, to achieve a sorted file. VCF: [0]=PRIM rendition [1]=LUFT rendition // Z_FILE PIZ: plan for entire z_file, txt_file.recon_plan is assigned a portion of this plan Buffer recon_plan_index; // TXT_FILE ZIP / Z_FILE PIZ: An array of BufWord, one for each VB: start and length of VB in recon_plan @@ -192,7 +197,11 @@ typedef struct File { struct timespec start_time; // Z_FILE: For stats: time z_file object was created in memory Mutex ctx_mutex[MAX_DICTS]; // Z_FILE ZIP: Context z_file (only) is protected by a mutex Mutex custom_merge_mutex; // Z_FILE: ZIP: used to merge deep, but in the future could be used for other custom merges - + Buffer R1_txt_data_lens; // Z_FILE: ZIP: FASTQ GZ: info regarding R1 VBs: txt_data.len32 of each VB + Buffer R1_last_qname_index; // Z_FILE: ZIP: FASTQ GZ: info regarding R1 VBs: last qname of each VB, canonical form, nul-separated: index into R1_last_qname. 
Note: only accessible in main thread (bc may realloc) + Buffer R1_last_qname; // Z_FILE: ZIP: FASTQ GZ: data of R1_last_qname + VBIType R1_first_vb_i; // Z_FILE: ZIP: first vb_i of R1. Always 1 for --pair, more for --deep. + // Information content stats CompIType num_components; // ZIP/PIZ z_file: number of components in this file (inc. generated components) uint32_t num_vbs; // ZIP: z_file/txt_file PIZ: txt_file: number of VBs processed z_file: total VBs in file @@ -222,12 +231,18 @@ typedef struct File { int64_t disk_so_far_comp[MAX_NUM_COMPS]; // Z_FILE ZIP: per-component size if z_file VB sections (note: global area sections, including SEC_DICT, are not accounted for) int64_t txt_data_so_far_bind_comp[MAX_NUM_COMPS]; // Z_FILE ZIP: per-component txt_size after modifications (due to --optimzie etc) int64_t txt_data_so_far_bind_0_comp[MAX_NUM_COMPS]; // Z_FILE ZIP: per-component txt_size before modifications - Codec comp_codec[MAX_NUM_COMPS]; // Z_FILE ZIP: codec used for every txt file component (i.e. excluding generated components) - Codec comp_source_codec[MAX_NUM_COMPS]; // Z_FILE ZIP: source codec used for every txt file component (i.e. excluding generated components) - Codec comp_gunzip_method[MAX_NUM_COMPS]; // Z_FILE ZIP: gunzip_method used for every txt file component with codec∈{GZ,BGZF,GZIL} - FlagsBgzf comp_bgzf[MAX_NUM_COMPS]; // Z_FILE ZIP BGZF: library and level of BGZF of each comp - uint64_t gz_isize[MAX_NUM_COMPS][2]; // Z_FILE ZIP GZ: isize(=uncomp_size) of the first two gzip block multi-gz-blocks (excluding BGZF and GZIL). - uint8_t gz_header[MAX_NUM_COMPS][12]; // Z_FILE ZIP GZ: first 12 bytes of the gz header (10 if FEXTRA=false) (excluding BGZF and GZIL). + Codec comp_src_codec[MAX_NUM_COMPS]; // Z_FILE ZIP: source codec used for every txt file component (i.e. 
excluding generated components) + Codec comp_eff_codec[MAX_NUM_COMPS]; // Z_FILE ZIP: effective_codec used for every txt file component with a GZIP codec + FlagsMgzip comp_bgzf[MAX_NUM_COMPS]; // Z_FILE ZIP BGZF: library and level of BGZF of each comp + uint64_t gz_isize[MAX_NUM_COMPS][2]; // Z_FILE ZIP GZ: isize(=uncomp_size) of the first two MGZIP blocks (excluding known MGZIP codecs). + uint32_t comp_num_EOF_blocks[MAX_NUM_COMPS]; // Z_FILE ZIP MGZIP: number of EOF blocks encountered in the component + + #define GZ_HEADER_LEN 100 + union { + uint8_t comp_gz_header[MAX_NUM_COMPS][GZ_HEADER_LEN]; // Z_FILE ZIP gzip codecs: first (usually all) bytes of the gz header + uint8_t gz_header[GZ_HEADER_LEN]; // TXT_FILE ZIP: first gz_header of file (copied later to comp_gz_header) + }; + uint32_t gz_header_len; // TXT_FILE ZIP: gz_header length of first gz block } File; #define z_has_gencomp (z_file && z_file->z_flags.has_gencomp) // ZIP/PIZ @@ -235,14 +250,12 @@ typedef struct File { // methods extern FileP file_open_z_read (rom filename); -extern FileP file_open_z_write (rom filename, FileMode mode, DataType data_type, Codec source_codec); +extern FileP file_open_z_write (rom filename, FileMode mode, DataType data_type, Codec src_codec); extern StrText file_get_z_run_time (FileP file); extern FileP file_open_txt_read (rom filename); -extern FileP file_open_txt_write (rom filename, DataType data_type, BgzfLevel bgzf_level); +extern FileP file_open_txt_write (rom filename, DataType data_type, MgzipLevel bgzf_level); extern void file_close (FileP *file_p); extern bool file_seek (FileP file, int64_t offset, int whence, rom mode, FailType soft_fail); // SEEK_SET, SEEK_CUR or SEEK_END -extern int64_t file_tell_do (FileP file, FailType soft_fail, FUNCLINE); -#define file_tell(file,soft_fail) file_tell_do ((file), (soft_fail), __FUNCLINE) extern FileType file_get_type (rom filename); extern DataType file_get_data_type_of_input_file (FileType ft); extern DataType 
file_piz_get_dt_of_out_filename (void); @@ -295,15 +308,9 @@ extern bool file_buf_locate (FileP file, ConstBufferP buf); // tests for compression types // --------------------------- -#define SRC_CODEC(x) (txt_file->source_codec == CODEC_##x) - -#define TXT_IS_PLAIN (txt_file->codec == CODEC_NONE) -#define TXT_IS_BGZF (txt_file->codec == CODEC_BGZF) -#define TXT_IS_GZIL (txt_file->codec == CODEC_GZIL) -#define TXT_IS_GZ (txt_file->codec == CODEC_GZ) -#define TXT_IS_BZ2 (txt_file->codec == CODEC_BZ2) +#define SRC_CODEC(x) (txt_file->src_codec == CODEC_##x) -#define SC(x) (file->source_codec == CODEC_##x) +#define SC(x) (file->src_codec == CODEC_##x) static inline bool is_read_via_ext_decompressor(ConstFileP file) { return SC(XZ)|| SC(ZIP) || SC(BCF)|| SC(CRAM) || SC(ORA); } #undef SC diff --git a/src/filename.c b/src/filename.c index a0c3e476..ee75291c 100644 --- a/src/filename.c +++ b/src/filename.c @@ -133,13 +133,13 @@ rom filename_guess_original (ConstFileP file) { if (!file) return "(is-NULL)"; - if (file->codec == CODEC_NONE) return file->name; + if (file->src_codec == CODEC_NONE) return file->name; unsigned len = strlen (file->name) + 10; char *org_name = MALLOC (len); strcpy (org_name, file->name); - rom ext = codec_args[file->codec].ext; + rom ext = codec_args[file->src_codec].ext; // remove existing extension if needed (eg when replacing .sam with .bam) if (ext[0] == '-') { @@ -268,7 +268,7 @@ PairType filename_is_fastq_pair (STRp(fn1), STRp(fn2)) } // its predicted to a pair if filenames are the same, except for '1'⇄'2' switch - if (mismatches == 1 && ((fn1[mm_i] == '1' && fn2[mm_i] == '2'))) return PAIR_R1; // fn1 is PAIR_1 + if (mismatches == 1 && ((fn1[mm_i] == '1' && fn2[mm_i] == '2'))) return PAIR_R1; // fn1 is PAIR_R1 if (mismatches == 1 && ((fn1[mm_i] == '2' && fn2[mm_i] == '1'))) return PAIR_R2; return NOT_PAIRED; } \ No newline at end of file diff --git a/src/flags.c b/src/flags.c index be666195..7a7a01ac 100644 --- a/src/flags.c +++ 
b/src/flags.c @@ -19,7 +19,7 @@ #include "regions.h" #include "crypt.h" #include "stream.h" -#include "bgzf.h" +#include "mgzip.h" #include "bases_filter.h" #include "license.h" #include "tar.h" @@ -27,6 +27,7 @@ #include "stats.h" #include "arch.h" #include "user_message.h" +#include "codec.h" // flags - factory default values (all others are 0) Flags flag = { @@ -381,7 +382,7 @@ void flags_init_from_command_line (int argc, char **argv) #define _zf {"fq", no_argument, PADDED(out_dt), DT_FASTQ } #define _zF {"FQ", no_argument, PADDED(out_dt), DT_FASTQ } #define _9 {"optimize", optional_argument, 0, '9', } // US spelling - #define _8 {"optimise", optional_argument, 0, '9', } // British spelling + #define _88 {"optimise", optional_argument, 0, '9', } // British spelling #define _m {"md5", no_argument, &flag.md5, 1 } #define _t {"test", no_argument, &flag.test, 1 } #define _Nt {"no-test", no_argument, &flag.no_test, 1 } @@ -389,8 +390,10 @@ void flags_init_from_command_line (int argc, char **argv) #define _bs {"best", no_argument, &flag.best, 1 } #define _lm {"low-memory", no_argument, &flag.low_memory, 1 } #define _al {"add-line-numbers", no_argument, &flag.add_line_numbers, 1 } + #define _as {"add-seq", no_argument, &flag.add_seq, 1 } #define _Sd {"secure-DP", no_argument, &flag.secure_DP, 1 } #define _pe {"pair", no_argument, PADDED(pair), PAIRED } + #define _Np {"not-paired", no_argument, &flag.not_paired, 1 } #define _DP {"deep", no_argument, &flag.deep, 1 } #define _St {"sendto", required_argument, 0, 151 } #define _um {"user-message", required_argument, 0, 152 } @@ -574,13 +577,13 @@ void flags_init_from_command_line (int argc, char **argv) #define _ts {"t_size", required_argument, 0, 143, } #define _lp {"license-prepare", required_argument, 0, 148, } #define _00 {0, 0, 0, 0 } - #define _gg {"generate-gzil", no_argument, 0, 153 } + #define _gg {"generate-il1m", no_argument, 0, 153 } typedef const struct option Option; - static Option genozip_lo[] = { _lg, _i, 
_I, _d, _f, _h, _x, _D, _L1, _L2, _q, _Q, _qq, _t, _Nt, _DL, _nb, _nz, _nc,_nu, _V, _z, _m, _th, _o, _p, _e, _E, _H1, _sL, _ss, _SS, _sd, _sT, _sb, _Sb, _lc, _lh, _lH, _s2, _s7, _S7, _S0, _S8, _S9, _sa, _st, _sm, _sh, _si, _Si, _Sh, _sr, _su, _so, _gz, _sv, _sn, _pn, _ai, _B, _xt, _dm, _dp, _dL, _dD, _dq, _dB, _dt, _dw, _dM, _dr, _dR, _dP, _dG, _dN, _dF, _DF, _dQ, _dH, _dO, _dC, _fQ, _fC, _fO, _fS, _fH, _fN, _dU, _dl, _dc, _dg, _dh,_dS, _bS, _9, _8, _pe, _fa, _bs, _lm, _nh, _rg, _hC, _rA, _rS, _me, _s5, _S5, _sM, _sA, _sB, _sP, _sc, _Sc, _AL, _sI, _cn, _s6, _oe, _al, _Lf, _dd, _T, _TT, _TL, _wM, _wm, _WM, _WB, _bi, _bl, _sk, _VV, _DV, _Ds, _DS, _sp, _Du, _DD, _DP, _SH, _Dd, _to, _ts, _hc, _dv, _TR, _NE, _lp, _Sd, _St, _um, _fP, _nF, _nI, _gg, _00 }; - static Option genounzip_lo[] = { _lg, _d, _f, _h, _x, _D, _L1, _L2, _q, _Q, _t, _DL, _nc, _V, _z, _m, _th, _u, _o, _p, _e, _sL, _ss, _SS, _sG, _sd, _sT, _sb, _lc, _lh, _lH, _s2, _s7, _S7, _S0, _S8, _S9, _sa, _st, _sm, _sh, _si, _Si, _Sh, _sr, _su, _sv, _sn, _pn, _ov, _xt, _dm, _dp, _dD, _dB, _dt, _dR, _dc, _lm, _sR, _pR, _hC, _rA, _rS, _me, _s5, _S5, _sM, _sA, _sB, _Sc, _AL, _sI, _cn, _cN, _s6, _oe, _dd, _T, _TT, _Dp, _Dh, _sp, _DD, _Dd, _to, _ts, _RC, _dv, _TR, _NE, _np, _00 }; - static Option genocat_lo[] = { _lg, _d, _f, _h, _D, _L1, _L2, _q, _Q, _nc, _V, _z, _zr, _zR, _zb, _zB, _zs, _zS, _zq, _zQ, _zf, _zF, _zc, _zC, _zv, _zV, _th, _o, _p, _e, _il, _r, _R, _Rg, _qf, _qF, _Qf, _QF, _SF, _s, _sf, _sq, _G, _1, _H0, _H1, _H2, _H3, _Gt, _So, _Io, _IU, _iu, _GT, _sL, _ss, _SS, _sG, _sd, _sT, _sb, _lc, _lh, _lH, _s2, _s7, _S7, _S0, _S8, _S9, _sa, _st, _sm, _sh, _si, _Si, _Sh, _sr, _su, _sv, _sn, _pn, _ov, _R1, _R2, _RX, _xt, _dm, _dp, _dD, _dB, _dt, _dR, _dc, _ds, _lm, _fs, _g, _gw, _n, _nt, _nH, _sR, _pR, _sC, _pC, _hC, _rA, _rI, _pI, _rS, _me, _s5, _S5, _sM, _sA, _sB, _Sc, _AL, _sI, _cn, _cN, _pg, _PG, _SX, _ix, _ct, _vl, _s6, _oe, _al, _dd, _T, _Dp, _Dh, _sp, _DD, _Dd, _DT, _RC, _dv, _TR, _NE, _np, _00 }; - static 
Option genols_lo[] = { _lg, _f, _h, _l, _L1, _L2, _q, _V, _p, _st, _sm, _dm, _dt, _sM, _b, _LC, _oe, _dd, _T, _sp, _DD, _dv, _NE, _00 }; + static Option genozip_lo[] = { _lg, _i, _I, _d, _f, _h, _x, _D, _L1, _L2, _q, _Q, _qq, _t, _Nt, _DL, _nb, _nz, _nc,_nu, _V, _z, _m, _th, _o, _p, _e, _E, _H1, _sL, _ss, _SS, _sd, _sT, _sb, _Sb, _lc, _lh, _lH, _s2, _s7, _S7, _S0, _S8, _S9, _sa, _st, _sm, _sh, _si, _Si, _Sh, _sr, _su, _so, _gz, _sv, _sn, _pn, _ai, _B, _xt, _dm, _dp, _dL, _dD, _dq, _dB, _dt, _dw, _dM, _dr, _dR, _dP, _dG, _dN, _dF, _DF, _dQ, _dH, _dO, _dC, _fQ, _fC, _fO, _fS, _fH, _fN, _dU, _dl, _dc, _dg, _dh,_dS, _bS, _9, _88, _pe, _Np, _fa, _bs, _lm, _nh, _rg, _hC, _rA, _rS, _me, _s5, _S5, _sM, _sA, _sB, _sP, _sc, _Sc, _AL, _sI, _cn, _s6, _oe, _al, _as, _Lf, _dd, _T, _TT, _TL, _wM, _wm, _WM, _WB, _bi, _bl, _sk, _VV, _DV, _Ds, _DS, _sp, _Du, _DD, _DP, _SH, _Dd, _to, _ts, _hc, _dv, _TR, _NE, _lp, _Sd, _St, _um, _fP, _nF, _nI, _gg, _00 }; + static Option genounzip_lo[] = { _lg, _d, _f, _h, _x, _D, _L1, _L2, _q, _Q, _t, _DL, _nc, _V, _z, _m, _th, _u, _o, _p, _e, _sL, _ss, _SS, _sG, _sd, _sT, _sb, _lc, _lh, _lH, _s2, _s7, _S7, _S0, _S8, _S9, _sa, _st, _sm, _sh, _si, _Si, _Sh, _sr, _su, _sv, _sn, _pn, _ov, _xt, _dm, _dp, _dD, _dB, _dt, _dR, _dc, _lm, _sR, _pR, _hC, _rA, _rS, _me, _s5, _S5, _sM, _sA, _sB, _Sc, _AL, _sI, _cn, _cN, _s6, _oe, _dd, _T, _TT, _Dp, _Dh, _sp, _DD, _Dd, _to, _ts, _RC, _dv, _TR, _NE, _np, _00 }; + static Option genocat_lo[] = { _lg, _d, _f, _h, _D, _L1, _L2, _q, _Q, _nc, _V, _z, _zr, _zR, _zb, _zB, _zs, _zS, _zq, _zQ, _zf, _zF, _zc, _zC, _zv, _zV, _th, _o, _p, _e, _il, _r, _R, _Rg, _qf, _qF, _Qf, _QF, _SF, _s, _sf, _sq, _G, _1, _H0, _H1, _H2, _H3, _Gt, _So, _Io, _IU, _iu, _GT, _sL, _ss, _SS, _sG, _sd, _sT, _sb, _lc, _lh, _lH, _s2, _s7, _S7, _S0, _S8, _S9, _sa, _st, _sm, _sh, _si, _Si, _Sh, _sr, _su, _sv, _sn, _pn, _ov, _R1, _R2, _RX, _xt, _dm, _dp, _dD, _dB, _dt, _dR, _dc, _ds, _lm, _fs, _g, _gw, _n, _nt, _nH, _sR, _pR, _sC, _pC, _hC, _rA, _rI, 
_pI, _rS, _me, _s5, _S5, _sM, _sA, _sB, _Sc, _AL, _sI, _cn, _cN, _pg, _PG, _SX, _ix, _ct, _vl, _s6, _oe, _al, _dd, _T, _Dp, _Dh, _sp, _DD, _Dd, _DT, _RC, _dv, _TR, _NE, _np, _00 }; + static Option genols_lo[] = { _lg, _f, _h, _l, _L1, _L2, _q, _V, _p, _st, _sm, _dm, _dt, _sM, _b, _LC, _oe, _dd, _T, _sp, _DD, _dv, _NE, _00 }; static Option *long_options[NUM_EXE_TYPES] = { genozip_lo, genounzip_lo, genocat_lo, genols_lo }; // same order as ExeType // include the option letter here for the short version (eg "-t") to work. ':' indicates an argument. @@ -709,7 +712,7 @@ void flags_init_from_command_line (int argc, char **argv) case 148 : license_prepare (optarg); break; case 151 : ASSINP (str_get_int_range64 (optarg, strlen (optarg), 1, 0xffffffff, &flag.sendto), "Expecting the value of --sendto=%s to a number", optarg); break; case 152 : user_message_init (optarg); break; - case 153 : gzil_compress(); // doesn't return + case 153 : il1m_compress(); // doesn't return case 0 : break; // a long option that doesn't have short version will land here - already handled so nothing to do default : // unrecognized option @@ -886,10 +889,8 @@ static void flags_test_conflicts (unsigned num_files /* optional */) void flags_zip_verify_dt_specific (DataType dt) { // SAM - FLAG_ONLY_FOR_DT(BAM, show_bam, "show_bam"); - - if (flag.show_bam || flag.analyze_ins) - flag.seg_only = flag.xthreads = flag.quiet = true; + FLAG_ONLY_FOR_DT(BAM, show_bam, "show-bam"); + FLAG_ONLY_FOR_DT(SAM, add_seq, "add-seq"); // FASTQ FLAG_ONLY_FOR_2DTs(SAM, FASTQ, pair, "pair"); @@ -1003,9 +1004,9 @@ static void flags_zip_verify_deep_rules (unsigned num_files, rom *filenames) if (sam_i != 0) SWAP (filenames[sam_i], filenames[0]); // case two FASTQs: unless already explicitly set with --pair, implicitly set pair if predicted by filenames - if (!flag.pair && n_dt[DT_FASTQ] == 2) { + if (!flag.pair && !flag.not_paired && n_dt[DT_FASTQ] == 2) { flag.pair = filename_is_fastq_pair (filenames[1], 
strlen(filenames[1]), filenames[2], strlen(filenames[2])); - if (flag.pair == PAIR_R2) + if (IS_R2) SWAP (filenames[1], filenames[2]); if (flag.pair) flag.pair = PAIRED; @@ -1145,6 +1146,12 @@ void flags_update_zip_one_file (void) ASSINP (!has_password(), "option --make-reference is incompatible with %s", OT("password", "p")); } + if (flag.vblock && TXT_IS_VB_SIZE_BY_MGZIP) { + WARN ("%s option is ignored, because %s is read using the efficient %s method. Tip: use --no-bgzf to override this.", + OT("vblock", "B"), txt_name, codec_name (txt_file->effective_codec)); + flag.vblock = 0; + } + z_file->z_flags.txt_is_bin = DTPT (is_binary); // this will go into SectionHeaderGenozipHeader and is determined by the component (eg in Deep it is determined by the SAM/BAM) DO_ONCE @@ -1159,7 +1166,7 @@ void flags_update_zip_one_file (void) ASSINP0 (!flag.test_i || flag.test || flag.no_test || flag.debug || flag.make_reference || flag.zip_no_z_file || zip_is_biopsy, "When running with GENOZIP_TEST one of: --test, --no-test, --debug, --make-reference must be set"); - flag.bind = flag.deep ? BIND_DEEP // one SAM/BAM (1-3 components) and one or two FASTQs + flag.bind = flag.deep ? BIND_DEEP // one SAM/BAM (1-3 components) and one or more FASTQs : (dt == DT_FASTQ && flag.pair) ? BIND_FQ_PAIR // FQ_COMP_R1 and FQ_COMP_R2 components : (dt == DT_SAM || dt == DT_BAM) ? BIND_SAM // SAM_COMP_MAIN component and possibly SAM_COMP_PRIM and/or SAM_COMP_DEPN. 
If no PRIM/DEPN lines exist, we will cancel it sam_zip_generate_recon_plan : BIND_NONE; @@ -1168,6 +1175,11 @@ void flags_update_zip_one_file (void) if (flag.has_biopsy_line && flag.bind != BIND_FQ_PAIR) flag.seg_only = true; + if (flag.show_bam || flag.analyze_ins) + flag.seg_only = flag.xthreads = flag.quiet = true; + + flag.skip_segconf |= flag.add_seq; // --add-seq implies --skip-segconf + flags_zip_verify_dt_specific (dt); } @@ -1224,10 +1236,10 @@ static void flags_piz_set_flags_for_deep (void) if (fastq) { flag.deep = flag.deep_fq_only = true; // note: in case of SAM/BAM-only reconstruction, we don't need deep. flag.out_dt = DT_FASTQ; // SAM component not written. Used in TRANSLATIONS to specifiy toplevel of TOP2NONE - - ASSINP (!flag.interleaved || z_file->num_components == 5, - "--interleaved can't be used because %s does not contain two FASTQs", z_name); - + + ASSINP (!flag.interleaved || z_file->num_components == 5, // also appears in flags_update_piz_one_z_file, but we need it here too + "--interleaved can't be used because %s contains %u FASTQ components, not two", z_name, z_file->num_components - 3); + // determined what will be outputted with --fq if (!flag.one_component && !flag.one_vb) { if (z_file->num_components == 4) // only one FQ file @@ -1236,7 +1248,7 @@ static void flags_piz_set_flags_for_deep (void) if (!flag.interleaved) flag.interleaved = INTERLEAVE_BOTH; } else - ABORTINP0 ("--fastq can't be used because file contains more than two FASTQ. Used --R1 or --R2 instead."); + ABORTINP0 ("--fastq can't be used because file contains more than two FASTQ components. 
Use --R instead."); } ASSINP0 (!flag.interleaved || z_file->num_components == 5, // exactly 2 FASTQ components @@ -1368,13 +1380,20 @@ void flags_update_piz_one_z_file (int z_file_i /* -1 if unknown - called form fi flag.sequential = 1; } - else if (Z_DT(FASTQ)) { - // --R1/--R2 and --interleaved is only possible for on FASTQ data compressed with --pair - ASSINP (!flag.one_component || flag.pair, - "--R%c is not supported for %s because it only works on FASTQ data that was compressed with --pair", '0'+flag.one_component, z_name); + else if (OUT_DT(FASTQ)) { // FASTQ or outputting a FASTQ component(s) of deep + int n_sam_comps = flag.deep ? 3 : 0; + + // --R1 --R2 and --R= only possible if we have enough components + ASSINP (flag.one_component <= z_file->num_components, + "--R=%u is not supported for %s because it contains only %u FASTQ components", + flag.one_component - n_sam_comps, z_name, z_file->num_components - n_sam_comps); + // --interleaved is only possible on FASTQ data compressed with --pair ASSINP (!flag.interleaved || flag.pair, - "--R%c is not supported for %s because it only works on FASTQ data that was compressed with --pair", '0'+flag.one_component, z_name); + "--interleaved is not supported for %s because it doesn't contain paired-end FASTQs", z_name); + + ASSINP (!flag.interleaved || z_file->num_components == 2 + n_sam_comps, + "--interleaved can't be used because %s contains %u FASTQ components, not two", z_name, z_file->num_components - n_sam_comps); // genocat paired FASTQ: if none of --interleaved, --R1 or --R2 are specified, INTERLEAVE_BOTH is the default if (is_genocat && !flag.interleaved && flag.pair && !flag.one_component && !flag.one_vb) @@ -1739,5 +1758,5 @@ void flags_finalize (void) rom pair_type_name (PairType p) { - return IN_RANGE (p, 0, 3) ? (rom[])PAIR_TYPE_NAMES[p] : "InvalidPairType"; + return IN_RANGE (p, 0, 4) ? 
(rom[])PAIR_TYPE_NAMES[p] : "InvalidPairType"; } \ No newline at end of file diff --git a/src/flags.h b/src/flags.h index 9288f895..11764569 100644 --- a/src/flags.h +++ b/src/flags.h @@ -41,6 +41,8 @@ typedef packed_enum { NOT_PAIRED, // ZIP and PIZ PAIRED, // PIZ: z_file is paired ; ZIP: --pair or --deep with paired FASTQ } PairType; #define PAIR_TYPE_NAMES { "NOT_PAIRED", "PAIR_R1", "PAIR_R2", "PAIRED" } +#define IS_R1 (flag.pair == PAIR_R1) +#define IS_R2 (flag.pair == PAIR_R2) // make a single-byte flag padded to 4 bytes, so we can easily assign to it in flags_init_from_command_line #ifdef __LITTLE_ENDIAN__ @@ -52,7 +54,7 @@ typedef packed_enum { NOT_PAIRED, // ZIP and PIZ typedef struct { // genozip options that affect the compressed file - int fast, best, low_memory, make_reference, multiseq, md5, secure_DP, + int fast, best, low_memory, make_reference, multiseq, md5, secure_DP, not_paired, deep; // deep is set with --deep in ZIP and from SectionHeaderGenozipHeader.flags.genozip_header.dts2_deep in PIZ rom vblock; int64_t sendto; @@ -62,7 +64,7 @@ typedef struct { int optimize_dict_ids_len; DictId *optimize_dict_ids; - int add_line_numbers; + int add_line_numbers, add_seq; int truncate; // allow truncated file - compress only available full lines. note: we don't consider this option data modifying as its used for debugging - digest is calculated only after truncation @@ -70,7 +72,7 @@ typedef struct { // piz options #define MAX_FLAG_BGZF 5 - int32_t bgzf; // PIZ: can be set by --bgzf, or by various other conditions. values 0-MAX_FLAG_BGZF indicate the level of libdeflate, BGZF_BY_ZFILE means use SEC_BGZF or default level if it is absent + int32_t bgzf; // PIZ: can be set by --bgzf, or by various other conditions. 
values 0-MAX_FLAG_BGZF indicate the level of libdeflate, BGZF_BY_ZFILE means use SEC_MGZIP or default level if it is absent PADDED_FLAG(DataType, out_dt); // used to indicate the desired dt of the output txt - consumed by file_open_z, and thereafter equal to txt_file->data_type diff --git a/src/gencomp.c b/src/gencomp.c index af042351..8a71bb52 100644 --- a/src/gencomp.c +++ b/src/gencomp.c @@ -12,7 +12,7 @@ #include "gencomp.h" #include "zip.h" #include "codec.h" -#include "bgzf.h" +#include "mgzip.h" #include "biopsy.h" #include "stream.h" #include "dispatcher.h" @@ -151,7 +151,7 @@ void gencomp_seg_add_line (VBlockP vb, CompIType comp_i, STRp(line)/*pointer int // If we're might re-read depn lines from the txt file, we store their coordinates in the txt file if (componentsP[comp_i].type == GCT_DEPN && depn_method == DEPN_REREAD) { if (TXT_IS_BGZF) { - uint64_t bb_i = vb->vb_bgz_i + vb->gz_blocks.current_bb_i; + uint64_t bb_i = vb->vb_mgzip_i + vb->gz_blocks.current_bb_i; ASSERT (bb_i <= MAX_BB_I, "%s: BGZF bb_i=%"PRIu64" exceeds maximum of %"PRIu64, VB_NAME, bb_i, MAX_BB_I); BLST (GencompLineIEntry, vb->gencomp_lines)->offset = (LineOffset){ .bb_i = bb_i, .uoffset = vb->line_bgzf_uoffset }; @@ -316,6 +316,8 @@ void gencomp_destroy (void) static uint32_t compress_depn_buf (BufferP comp_buf) { + START_TIMER; + compress_depn_vb = vb_initialize_nonpool_vb (VB_ID_COMPRESS_DEPN, DT_NONE, "compress_depn_buf"); uint32_t uncomp_len = depn.thread_data.len32; @@ -330,6 +332,7 @@ static uint32_t compress_depn_buf (BufferP comp_buf) vb_release_vb (&compress_depn_vb, "compress_depn_buf"); + COPY_TIMER_EVB (compress_depn_buf); return (comp_buf->len = comp_len + sizeof (uint32_t)); } @@ -339,8 +342,10 @@ static void *gencomp_do_offload (void *info_) uint32_t uncomp_len = depn.thread_data.len32; info->comp_len = compress_depn_buf (&depn.thread_data_comp); + START_TIMER; ASSERT (1 == fwrite (STRb(depn.thread_data_comp), 1, depn.fp), "Failed to write %"PRIu64" bytes to temporary 
file %s: %s", depn.thread_data_comp.len, depn.name, strerror (errno)); + COPY_TIMER_EVB (gencomp_do_offload_write); if (flag.debug_gencomp) iprintf ("Wrote to disk: buf=%u num_lines=%u uncomp_len=%u comp_len=%u uncomp_alder32=%u comp_adler32=%u\n", @@ -425,7 +430,7 @@ static bool gencomp_flush (CompIType comp_i, bool is_final_flush) // final flush // wait for previous DEPN compression / offload to finish if (gct == GCT_DEPN && depn.has_thread) { int err; - ASSERT (!(err = pthread_join (depn.thread, NULL)), "pthread_join failed: %s", strerror (err)); + ASSERT (!(err = PTHREAD_JOIN (depn.thread, "gencomp_compress_depn")), "pthread_join failed: %s", strerror (err)); depn.has_thread = false; } @@ -583,7 +588,8 @@ static ASCENDING_SORTER (preabsorb_queue_sorter, PreabsorbEntry, vblock_i) // called from compute_vb, with VBs in arbitrary order. Notes: // (1) We need to do it in the compute thread (rather than the main thread) so that zip_prepare_one_vb_for_dispatching, // running in the main thread, can busy-wait for all MAIN compute threads to complete before flushing the final txt_data. -// (2) We do it in VB order, as recon_plan creation expects the lines to be in the same order as in the MAIN component. +// (2) Despite this function being called in arbitrary VB order, we take care to absorb the VBs in order, as +// recon_plan creation expects the lines to be in the same order as in the MAIN component. 
void gencomp_absorb_vb_gencomp_lines (VBlockP vb) { mutex_lock (preabsorb_queue_mutex); // protects preabsorb_queue @@ -901,8 +907,8 @@ void gencomp_reread_lines_as_prescribed (VBlockP vb) stream_set_inheritability (fileno (fp), false); // Windows: allow file_remove in case of --replace if (flag_is_show_vblocks (ZIP_TASK_NAME)) - iprintf ("REREAD_DEPN(id=%d) vb=%s n_lines=%u codec=%s\n", - vb->id, VB_NAME, vb->reread_prescription.len32, codec_name (txt_file->codec)); + iprintf ("REREAD_DEPN(id=%d) vb=%s n_lines=%u effective_codec=%s\n", + vb->id, VB_NAME, vb->reread_prescription.len32, codec_name (txt_file->effective_codec)); if (TXT_IS_BGZF) bgzf_reread_uncompress_vb_as_prescribed (vb, fp); diff --git a/src/gencomp.h b/src/gencomp.h index 133df9e6..be8178ec 100644 --- a/src/gencomp.h +++ b/src/gencomp.h @@ -14,7 +14,7 @@ typedef union { // 64 bit struct { // used if file codec is BGZF - uint64_t bb_i : 48; // index into txt_file->bgzf_isizes if beginning of line + uint64_t bb_i : 48; // index into txt_file->mgzip_isizes if beginning of line uint64_t uoffset : 16; // index into uncompressed BGZF block of beginning of line }; uint64_t offset; // offset into txt_file of beginning of line - used if file codec is NONE diff --git a/src/generic.c b/src/generic.c index 649ff820..ea868954 100644 --- a/src/generic.c +++ b/src/generic.c @@ -17,7 +17,7 @@ static char magic[MAGIC_SIZE+1] = {}; // first bytes of the generic file static char ext[32] = {}; // nul-terminated txt filename extension // all data is always consumed -int32_t generic_unconsumed (VBlockP vb, uint32_t first_i, int32_t *i) +int32_t generic_unconsumed (VBlockP vb, uint32_t first_i) { return 0; } @@ -56,7 +56,8 @@ int32_t generic_is_header_done (bool is_eof) // recreate predefined contexts ctx_initialize_predefined_ctxs (z_file->contexts, new_dt, z_file->d2d_map, &z_file->num_contexts); - + + // note on FASTQ: effective_codec is currently GZ or BGZF. 
We will not try to re-discover in segconf as that is too complicated. return HEADER_DATA_TYPE_CHANGED; } diff --git a/src/generic.h b/src/generic.h index b712b8df..ca715d9f 100644 --- a/src/generic.h +++ b/src/generic.h @@ -14,7 +14,7 @@ #pragma GENDICT GNRIC_DATA=DTYPE_FIELD=DATA #pragma GENDICT GNRIC_TOPLEVEL=DTYPE_FIELD=TOPLEVEL -extern int32_t generic_unconsumed (VBlockP vb, uint32_t first_i, int32_t *i); +extern int32_t generic_unconsumed (VBlockP vb, uint32_t first_i); extern int32_t generic_is_header_done (bool is_eof); extern void generic_seg_initialize (VBlockP vb); extern rom generic_seg_txt_line (VBlockP vb, rom next_line, uint32_t remaining_txt_len, bool *has_13); diff --git a/src/genozip.c b/src/genozip.c index f65eaaf9..fc078dbb 100644 --- a/src/genozip.c +++ b/src/genozip.c @@ -192,7 +192,7 @@ static void main_print_help (bool explicit) static void main_print_version() { - iprintf ("version=%s distribution=%s\n", GENOZIP_CODE_VERSION, get_distribution()); + iprintf ("version=%s distribution=%s\n", code_version().s, get_distribution()); } static void main_genounzip (rom z_filename, rom txt_filename, int z_file_i, bool is_last_z_file) @@ -491,7 +491,7 @@ static void main_genozip (rom txt_filename, : (flag.pair && !flag.out_filename) ? filename_z_pair (txt_filename, next_txt_filename, false) // first file in a FASTQ pair : filename_z_normal (txt_file->name, txt_file->data_type, txt_file->type); - z_file = file_open_z_write (z_filename, flag.pair ? WRITEREAD : WRITE, txt_file->data_type, txt_file->source_codec); + z_file = file_open_z_write (z_filename, flag.pair ? WRITEREAD : WRITE, txt_file->data_type, txt_file->src_codec); FREE(z_filename); // file_open_z copies the name license_eval_notice(); @@ -511,7 +511,7 @@ static void main_genozip (rom txt_filename, int64_t t_offset = z_file->is_in_tar ? 
tar_file_offset() : 0; // if tar: offset of beginning of z_file in tar (after tar header block) if (flag.bind == BIND_FQ_PAIR || (flag.bind == BIND_DEEP && flag.pair && flag.zip_comp_i >= SAM_COMP_FQ00)) - flag.pair = (flag.pair==PAIR_R1) ? PAIR_R2 : PAIR_R1; + flag.pair = IS_R1 ? PAIR_R2 : PAIR_R1; zip_one_file (txt_file->basename, is_last_user_txt_file); diff --git a/src/genozip.h b/src/genozip.h index f288d1a4..20a0a5b3 100644 --- a/src/genozip.h +++ b/src/genozip.h @@ -240,17 +240,23 @@ extern FileP z_file, txt_file; // IMPORTANT: This is part of the genozip file format. Also update codec.h/codec_args // If making any changes, update arrays in // 1. CODEC_ARGS in codec.h -// 2. (for codecs that have a public file format, eg .zip) txtfile_set_seggable_size -// 3. codec_show_time +// 2. for codecs that have a public file format, eg .zip: txtfile_set_seggable_size +// 3. for codecs used to compress sections: codec_show_time typedef packed_enum { // 1 byte CODEC_UNKNOWN=0, CODEC_NONE=1, - // internal source codecs - CODEC_BGZF=20, CODEC_GZ=2, CODEC_GZIL=34, CODEC_BZ2=3, + // gzip codecs + CODEC_GZ=2, CODEC_BGZF=20, // General GZIP codecs + CODEC_IL1M=34, // Illumina GZIP codecs + CODEC_MGZF=35, CODEC_MGSP=36, // MGI GZIP codecs + CODEC_EMFL=37, CODEC_EMVL=38, // Element GZIP codecs + CODEC_BAM=23, // treated as a gzip codec, with effective_codec=BGZF + + // other internal source codecs + CODEC_BZ2=3, // external source codecs (used by executing an external application) CODEC_XZ=21, CODEC_BCF=22, CODEC_CRAM=24, CODEC_ZIP=25, CODEC_ORA=32, - CODEC_BAM=23, // in v8 BAM was a codec which was compressed using samtools as external compressor. Since v14 we use the codec name for displaying "BAM" in stats total line. 
// simple codecs used in genozip files CODEC_LZMA=4, CODEC_BSC=5, /*CODEC_BZ2=3,*/ @@ -271,7 +277,7 @@ typedef packed_enum { // 1 byte CODEC_T0 = 29, // compress the T0:Z field (Ultima) CODEC_OQ = 33, // compress the OQ:Z field (mostly generated by GATK BQSR) - NUM_CODECS = 35, + NUM_CODECS = 39, } Codec; // note: the numbering of the sections cannot be modified, for backward compatibility @@ -293,7 +299,7 @@ typedef packed_enum { // 1 byte SEC_LOCAL = 12, // Multiple sections per-VB SEC_CHROM2REF_MAP = 13, // Global section SEC_STATS = 14, // Global section - SEC_BGZF = 15, // Per-component section (optional): contains the uncompressed sizes of the source file bgzf block + SEC_MGZIP = 15, // Per-component section (optional): contains the uncompressed sizes of the source file mgzip blocks SEC_RECON_PLAN = 16, // Per-component section (optional): introduced v12 SEC_COUNTS = 17, // Global section: introduced v12 SEC_REF_IUPACS = 18, // Global section: introduced v12 @@ -329,7 +335,7 @@ typedef int ThreadId; #define VER(n) (z_file->genozip_version >= (n)) #define VER2(major,minor) (z_file->genozip_version > (major) || \ - (z_file->genozip_version == (major) && (z_file->genozip_minor_ver >= (minor) || (flag.test_i && z_file->genozip_minor_ver == (minor)-1)))) // when developing, version is still the target version minus 1 + (z_file->genozip_version == (major) && (z_file->genozip_minor_ver >= (minor)))) #define EXACT_VER(n) (z_file->genozip_version == (n)) #define KB *((uint64_t)1<<10) @@ -348,7 +354,8 @@ typedef int ThreadId; #define SQR(x) ((x)*(x)) #endif -#define IN_RANGE(x,min,max) ((x) >= (min) && (x) <= (max)) +#define IN_RANGE(x,min,after) ((x) >= (min) && (x) < (after)) // half-open [min,after) +#define IN_RANGX(x,min,max) ((x) >= (min) && (x) <= (max)) // closed [min,max] #define MAXB64(x) ((1ULL<<(x))-1) #define MAXB(x) ((uint32_t)MAXB64(x)) // eg: MAXB(3) == 0b111 == 7 @@ -383,44 +390,6 @@ typedef int ThreadId; #define SNPRINTF0(out/*StrText* */, str) \ ({ 
out##_len += snprintf (&out.s[out##_len], sizeof(out.s)-out##_len, (str)); out##_len = MIN_(out##_len, sizeof(out.s)); }) -// getting and putting unaligned words -#ifdef GENOZIP_ALLOW_UNALIGNED_ACCESS - #define GET_UINT16(p) *((uint16_t *)(p)) - #define GET_UINT32(p) *((uint32_t *)(p)) - #define GET_UINT64(p) *((uint64_t *)(p)) - #define GET_FLOAT32(p) *((float *)(p)) - - #define GET_UINT32_(st_p, member) ((st_p)->member) - - #define PUT_UINT16(p,n) *((uint16_t *)(p)) = (n) - #define PUT_UINT32(p,n) *((uint32_t *)(p)) = (n) - - #define PUT_UINT16_(st_p, member, n) (st_p)->member = (n) - #define PUT_UINT32_(st_p, member, n) (st_p)->member = (n) -#else - // loading a Little Endian uint32_t from an unaligned memory location - #define GET_UINT16(p) ((uint16_t)((uint8_t*)(p))[0] | ((uint16_t)((uint8_t*)(p))[1] << 8)) - #define GET_UINT32(p) ((uint32_t)((uint8_t*)(p))[0] | ((uint32_t)((uint8_t*)(p))[1] << 8) | ((uint32_t)((uint8_t*)(p))[2] << 16) | ((uint32_t)((uint8_t*)(p))[3] << 24)) - #define GET_UINT64(p) ((uint64_t)((uint8_t*)(p))[0] | ((uint64_t)((uint8_t*)(p))[1] << 8) | ((uint64_t)((uint8_t*)(p))[2] << 16) | ((uint64_t)((uint8_t*)(p))[3] << 24) | ((uint64_t)((uint8_t*)(p))[4] << 32) | ((uint64_t)((uint8_t*)(p))[5] << 40) | ((uint64_t)((uint8_t*)(p))[6] << 48) | ((uint64_t)((uint8_t*)(p))[7] << 56))) - #define GET_FLOAT32(p) ({ union { uint32_t i; float f; } n= {.i = GET_UINT32(p)}; n.f; }) - - #define GET_UINT32_(st_p, member) ({ typeof(*st_p) dummy; bytes _P=(bytes)(st_p) + ((bytes)&dummy.member - (bytes)&dummy); ((uint32_t)_P[0] | ((uint32_t)_P[1] << 8) | ((uint32_t)_P[2] << 16) | ((uint32_t)_P[3] << 24)); }) - - // storing a Little Endian integer in an unaligned memory location - #define PUT_UINT16(p,n) ({ uint16_t _N=(n); uint8_t *_P=(uint8_t *)(p); _P[0]=_N; _P[1]=_N>>8; }) - #define PUT_UINT32(p,n) ({ uint32_t _N=(n); uint8_t *_P=(uint8_t *)(p); _P[0]=_N; _P[1]=_N>>8; _P[2]=_N>>16; _P[3]=_N>>24; }) - - // storing as a struct member - #define 
PUT_UINT16_(st_p, member, n) ({ typeof(*st_p) dummy; PUT_UINT16 ((rom)(st_p) + ((rom)&dummy.member - (rom)&dummy), (n)); }) - #define PUT_UINT32_(st_p, member, n) ({ typeof(*st_p) dummy; PUT_UINT32 ((rom)(st_p) + ((rom)&dummy.member - (rom)&dummy), (n)); }) -#endif - -#define GET_UINT8(p) ((uint8_t)(((uint8_t*)(p))[0])) -#define GET_UINT24(p) ((uint32_t)(((uint8_t*)(p))[0] | (((uint8_t*)(p))[1] << 8))| (((uint8_t*)(p))[2] << 16)) - -#define PUT_UINT8(p,n) ({ ((uint8_t*)(p))[0] = (n); }) -#define PUT_UINT24(p,n) ({ uint32_t _N=(n); uint8_t *_P=(uint8_t *)(p); _P[0]=_N; _P[1]=_N>>8; _P[2]=_N>>16; }) - // used for qsort sort function - receives two integers of any type and returns -1/0/1 as required to sort in ascending order #define SORTER(func) int func (const void *a, const void *b) typedef SORTER ((*Sorter)); @@ -520,10 +489,10 @@ typedef SORTER ((*Sorter)); // Strings - function parameters #define STRp(x) rom x, uint32_t x##_len +#define STR8p(x) bytes x, uint32_t x##_len #define STRc(x) char *x, uint32_t x##_len // string is fixed-length, but editable #define pSTRp(x) rom *x, uint32_t *x##_len #define qSTRp(x) char *x, uint32_t *x##_len // function populates a string and updates its length -#define STRe(x) char *x, uint32_t *x##_len // string and its length are editable #define STRps(x) uint32_t n_##x##s, rom *x##s, const uint32_t *x##_lens // Strings - function arguments @@ -532,6 +501,7 @@ typedef SORTER ((*Sorter)); #define STRasi(x,i) (n_##x##s-(i)), &x##s[i], &x##_lens[i] // subset strating from item i #define STRd(x) x##_str, x##_len #define STRb(buf) (buf).data, (buf).len +#define STRb(buf) (buf).data, (buf).len #define STRi(x,i) x##s[i], x##_lens[i] #define qSTRi(x,i) x##s[i], &x##_lens[i] #define pSTRa(x) &x, &x##_len @@ -559,6 +529,7 @@ typedef SORTER ((*Sorter)); #define STRdec(x,n) ({ x -= (n); x##_len += (n); }) #define STRLEN(string_literal) ((unsigned)(sizeof string_literal - 1)) #define _S(x) x, STRLEN(x) +#define _8(x) (bytes)x, STRLEN(x) 
#define STRBw(buf,txtword) Bc ((buf), (txtword).index), (txtword).len // used with TxtWord #define FUNCLINE rom func, uint32_t code_line #define __FUNCLINE __FUNCTION__, __LINE__ @@ -632,7 +603,7 @@ typedef packed_enum { BGZF_LIBDEFLATE7=0, BGZF_ZLIB=1, BGZF_LIBDEFLATE19=2, BGZF BGZF_EXTERNAL_LIB, // level is sent to external compressor (used for BCF) BGZF_NO_LIBRARY, NUM_ALL_BGZF_LIBRARIES - } BgzfLibraryType; // constants for BGZF FlagsBgzf.library + } MgzipLibraryType; // constants for BGZF FlagsMgzip.library #define BGZF_LIB_NAMES_LONG { "libdeflate_1.7", "zlib", "libdeflate_1.19", "igzip", "invalid", "external", "no_bgzf" } #define BGZF_LIB_NAMES_SHRT { "libdef7", "zlib", "libdef19", "igzip", "invalid", "external", "no_bgzf" } @@ -642,7 +613,7 @@ typedef packed_enum { BGZF_NOT_INITIALIZED = -100, BGZF_MAX_LEVEL = 14, BGZF_COMP_LEVEL_UNKNOWN = 15 #define BGZF_NO_BGZF 15 // meaning if bgzf_library=BGZF_NO_LIBRARY -} BgzfLevel; +} MgzipLevel; #define COMPRESSOR_CALLBACK(func) \ void func (VBlockP vb, \ diff --git a/src/gff.c b/src/gff.c index c4df40d8..dce66646 100644 --- a/src/gff.c +++ b/src/gff.c @@ -109,18 +109,18 @@ bool gff_header_inspect (VBlockP txt_header_vb, BufferP txt_header, struct Flags } // search for last newline, and also search for embedded FASTA -int32_t gff_unconsumed (VBlockP vb, uint32_t first_i, int32_t *i) +int32_t gff_unconsumed (VBlockP vb, uint32_t first_i) { - ASSERT (*i >= 0 && *i < Ltxt, "*i=%d is ∉ [0,%u]", *i, Ltxt); + ASSERTNOTZERO (Ltxt); - int32_t final_i = *i; + int32_t last_i = Ltxt - 1; int32_t last_newline = -1; - for (int32_t j=first_i; j <= final_i; j++) + for (int32_t j=first_i; j <= last_i; j++) if (*Btxt (j) == '\n') { last_newline = j; - if (j < final_i && *Btxt (j+1) == '>') { + if (j < last_i && *Btxt (j+1) == '>') { if (!segconf.running) { // note: we don't run segconf on an embedded FASTA - we set the values here instead segconf.has_embedded_fasta = true; @@ -132,15 +132,11 @@ int32_t gff_unconsumed (VBlockP vb, 
uint32_t first_i, int32_t *i) } } - if (last_newline != -1) { - *i = last_newline; + if (last_newline != -1) return Ltxt-1 - last_newline; - } - else { // no newline found - *i = (int32_t)first_i - 1; + else // no newline found return -1; // cannot find \n in the data starting first_i - } } // called from seg_all_data_lines diff --git a/src/gff.h b/src/gff.h index fa95d721..b5236e8e 100644 --- a/src/gff.h +++ b/src/gff.h @@ -165,7 +165,7 @@ extern void gff_zip_initialize (void); extern bool is_gff (STRp(header), bool *need_more); extern bool gff_header_inspect (VBlockP txt_header_vb, BufferP txt_header, struct FlagsTxtHeader txt_header_flags); -extern int32_t gff_unconsumed (VBlockP vb, uint32_t first_i, int32_t *i); +extern int32_t gff_unconsumed (VBlockP vb, uint32_t first_i); extern rom gff_seg_txt_line (VBlockP vb_, rom field_start_line, uint32_t remaining_txt_len, bool *has_special_eol); extern void gff_seg_initialize (VBlockP vb_); extern void gff_segconf_finalize (VBlockP vb); diff --git a/src/hash.c b/src/hash.c index 547118e5..f94616e8 100644 --- a/src/hash.c +++ b/src/hash.c @@ -369,7 +369,7 @@ static inline WordIndex hash_global_add_node (ContextP zctx, uint32_t hash, uint if (zctx->nodes.len > HASH_OCC_WARNING * zctx->global_hash.len) { if (txt_file->redirected) - WARN_ONCE ("Unusually slow compression due to Genozip under-allocating resources because the input file is streaming through a pipe preventing it from knowing the file size. To overcome this, please use --input-size (value in bytes, can be approximate) to inform Genozip of the file size. ctx=%s hash_prime=%u snip=\"%s\"", + WARN_ONCE ("Unusually slow compression due to Genozip under-allocating resources because the input file is streaming through a pipe preventing it from knowing the file size. To overcome this, use --input-size (value in bytes, can be approximate) to inform Genozip of the file size. 
ctx=%s hash_prime=%u snip=\"%s\"", zctx->tag_name, zctx->global_hash.len32, str_snip); else WARN_ONCE ("Unexpected structure of file is causing unusually slow compression. ctx=%s hash_prime=%u snip=\"%s\"%s", diff --git a/src/locs.c b/src/locs.c index c501237a..36785417 100644 --- a/src/locs.c +++ b/src/locs.c @@ -23,7 +23,7 @@ int32_t locs_is_header_done (bool is_eof) } // returns the length of the data at the end of vb->txt_data that will not be consumed by this VB is to be passed to the next VB -int32_t locs_unconsumed (VBlockP vb, uint32_t first_i, int32_t *i /* in/out */) +int32_t locs_unconsumed (VBlockP vb, uint32_t first_i) { return Ltxt % 8; // a line is an 8-byte cluster of {float x, y;} } diff --git a/src/locs.h b/src/locs.h index d56c4fb1..42c820fe 100644 --- a/src/locs.h +++ b/src/locs.h @@ -17,7 +17,7 @@ #pragma GENDICT LOCS_DEBUG_LINES=DTYPE_FIELD=DBGLINES // used by --debug-lines extern int32_t locs_is_header_done (bool is_eof); -extern int32_t locs_unconsumed (VBlockP vb, uint32_t first_i, int32_t *i); +extern int32_t locs_unconsumed (VBlockP vb, uint32_t first_i); extern bool locs_seg_is_small (ConstVBlockP vb, DictId dict_id); extern void locs_seg_initialize (VBlockP vb); extern void locs_seg_finalize (VBlockP vb); diff --git a/src/lookback.c b/src/lookback.c index af8b1ba1..8b90dcf5 100644 --- a/src/lookback.c +++ b/src/lookback.c @@ -137,7 +137,7 @@ uint32_t lookback_get_next (VBlockP vb, ContextP lb_ctx, ContextP ctx, WordIndex if (*B(WordIndex, *buf, *iterator) == search_for) lookback = (RR(*iterator - buf->newest_index + 1, lb_size)); - ASSERT (IN_RANGE (lookback, 0, lb_size-1), "Invalid lookback=%d", lookback); + ASSERT (IN_RANGE (lookback, 0, lb_size), "Invalid lookback=%d", lookback); return lookback; } diff --git a/src/me23.c b/src/me23.c index 705d0909..1d474d05 100644 --- a/src/me23.c +++ b/src/me23.c @@ -162,7 +162,7 @@ TXTHEADER_TRANSLATOR (txtheader_me232vcf) char, 1, "txt_data"); // add genozip stuff - bufprintf (comp_vb, 
txtheader_buf, VCF_HEAD_3p1, GENOZIP_CODE_VERSION, GENOZIP_URL); + bufprintf (comp_vb, txtheader_buf, VCF_HEAD_3p1, code_version().s, GENOZIP_URL); buf_append_string (comp_vb, txtheader_buf, flags_command_line()); bufprint0 (comp_vb, txtheader_buf, "\"\n"); diff --git a/src/mgzip.c b/src/mgzip.c new file mode 100644 index 00000000..16f2f81f --- /dev/null +++ b/src/mgzip.c @@ -0,0 +1,1536 @@ +// ------------------------------------------------------------------ +// mgzip.c +// Copyright (C) 2020-2024 Genozip Limited. Patent Pending. +// Please see terms and conditions in the file LICENSE.txt +// +// WARNING: Genozip is proprietary, not open source software. Modifying the source code is strictly prohibited +// and subject to penalties specified in the license. + +#include +#include + +#include "igzip/igzip_lib.h" +#include "libdeflate_1.19/libdeflate.h" +#include "libdeflate_1.7/libdeflate.h" +#include "zlib/zlib.h" +#include "mgzip.h" +#include "arch.h" +#include "file.h" +#include "zfile.h" +#include "zip.h" +#include "arch.h" +#include "txtfile.h" +#include "codec.h" +#include "threads.h" +#include "dispatcher.h" +#include "writer.h" +#include "gencomp.h" +#include "filename.h" +#include "strings.h" + +#define LIBDEFLATE_MAX_LEVEL 12 +#define ZLIB_MAX_LEVEL 9 + +// all data in Little Endian. 
Defined in https://datatracker.ietf.org/doc/html/rfc1952 and https://samtools.github.io/hts-specs/SAMv1.pdf +typedef struct __attribute__ ((packed, aligned(2))) BgzfHeader { // 18 bytes + uint8_t id1; // Gzip id - must be 31 (0x1f) + uint8_t id2; // Gzip id - must be 139 (0x8b) + uint8_t cm; // Compression Method - must be 8 + uint8_t flg; // Flags - must be 4 (FEXTRA) + uint32_t mtime; // Modification Time + uint8_t xfl; // eXtra Flags + uint8_t os; // Operating System + uint16_t xlen; // Size of extra fields - 6 if it contains only BGZF (may be more) + uint8_t si1; // bsize field id - must be 66 (0x42) + uint8_t si2; // bsize field id - must be 67 (0x43) + uint16_t slen; // bsize field length - must be 2 + uint16_t bsize; // bsize field - (compressed block size -1) +} BgzfHeader; + +// see: https://docs.google.com/document/d/11yeGa1HzXi96D3VTeMwReW2BzG9hmspyKFxx-yRpLPo/edit +typedef struct __attribute__ ((packed)) MgzfHeader { // 29 bytes + uint8_t id1; // Gzip id - must be 31 (0x1f) + uint8_t id2; // Gzip id - must be 139 (0x8b) + uint8_t cm; // Compression Method - must be 8 + uint8_t flg; // Flags - must be 20 (0x14) (FEXTRA | FCOMMENT) + uint32_t mtime; // Modification Time - must be 0 + uint8_t xfl; // eXtra Flags - must be 0 + uint8_t os; // Operating System - must be 0xff + uint16_t xlen; // Size of extra fields - must be 8 + uint8_t si1; // bsize field id - must be (0x49) + uint8_t si2; // bsize field id - must be (0x47) + uint16_t slen; // bsize field extra field length - must be 4 + uint32_t bsize; // bsize field - compressed block size - header + body + char comment[9];// nul-terminated string in the format "C001R015" +} MgzfHeader; + +typedef struct GzipFooter { + uint32_t crc32; // CRC32 of uncompressed data + uint32_t isize; // Input (i.e. 
uncompressed) Size +} GzipFooter; + +#define GZIP_FOOTER_LEN ((int)sizeof(GzipFooter)) + +typedef struct __attribute__ ((packed, aligned(2))) GzipHeader { // 10 bytes + uint8_t id1; // Gzip id - must be 31 (0x1f) + uint8_t id2; // Gzip id - must be 139 (0x8b) + uint8_t cm; // Compression Method - must be 8 + uint8_t flg; // Flags - must be 0 + uint32_t mtime; // Modification Time - must be 0 + uint8_t xfl; // eXtra Flags - must be 0 + uint8_t os; // Operating System - must be 3 +} GzipHeader; + +static FlagsMgzip bgzf_recompression_levels[1+MAX_FLAG_BGZF] = { + { .library = BGZF_LIBDEFLATE19, .level = 0 }, // --bgzf=0 : BGZF blocks with no compression + { .library = BGZF_IGZIP, .level = 1 }, // --bgzf=1 : note: this is IGZIP LVL0 + { .library = BGZF_IGZIP, .level = 2 }, // --bgzf=2 : note: this is IGZIP LVL1 + { .library = BGZF_LIBDEFLATE19, .level = 1 }, // --bgzf=3 + { .library = BGZF_LIBDEFLATE19, .level = 7 }, // --bgzf=4 + { .library = BGZF_LIBDEFLATE19, .level = 9 }, // --bgzf=5 +}; + +static const uint8_t mgzip_header_len[NUM_CODECS] = MGZIP_HEADER_LEN_BY_CODEC; + +#define bgzf_no_recompression (FlagsMgzip){ .library = BGZF_NO_LIBRARY, .level = BGZF_NO_BGZF } + +rom gzstatus_name (GzStatus st) +{ + return IN_RANGE(st, 0, NUM_GZ_STATUSES) ? 
(rom[])GZSTATUS_NAMES[st] : "InvalidGzStatus"; +} + +// possible return values, see libdeflate_result in libdeflate.h +static rom libdeflate_error (int err) +{ + switch (err) { + case LIBDEFLATE_SUCCESS : return "SUCCESS"; + case LIBDEFLATE_BAD_DATA : return "BAD DATA"; + case LIBDEFLATE_SHORT_OUTPUT : return "SHORT OUTPUT"; + case LIBDEFLATE_INSUFFICIENT_SPACE : return "INSUFFICIENT SPACE"; + default : return "Undefined libdeflate error"; + } +} + +typedef struct { char s[100]; } BgzfBlockStr; +static BgzfBlockStr display_bb (GzBlockZip *bb) +{ + BgzfBlockStr s; + snprintf (s.s, sizeof (s.s), "{txt_index=%u txt_size=%u compressed_index=%u comp_size=%u is_uncompressed=%u}", + bb->txt_index, bb->txt_size, bb->compressed_index, bb->comp_size, bb->is_uncompressed); + return s; +} + +static void *bgzf_alloc (void *vb_, unsigned items, unsigned size, FUNCLINE) +{ + return codec_alloc_do ((VBlockP )vb_, (uint64_t)items * (uint64_t)size, 1, func, code_line); // all bzlib buffers are constant in size between subsequent compressions +} + +//-------------------------------------------------------------------- +// ZIP SIDE - library/level discovery +//-------------------------------------------------------------------- + +#define BGZF_DISCOVERY_MAX_TESTS 10 // maximum number of BGZF blocks to be tested + +void bgzf_initialize_discovery (FileP file) +{ + ASSERTNOTINUSE (file->bgzf_plausible_levels); + + // note: tested example files of MGZF, MGSP and IL1M and they don't match any of these libraries. 
+ if (IS_BGZF(file->effective_codec)) { + ARRAY_alloc (FlagsMgzip, ll, (LIBDEFLATE_MAX_LEVEL+1)+LIBDEFLATE_MAX_LEVEL+ZLIB_MAX_LEVEL, + false, file->bgzf_plausible_levels, evb, "txt_file->bgzf_plausible_levels"); + + int next=0; + for (int l=0; l <= LIBDEFLATE_MAX_LEVEL; l++) // level=0 only here, bc it would be the same in all libraries + ll[next++] = (FlagsMgzip){ .library = BGZF_LIBDEFLATE19, .level = l}; + + for (int l=1; l <= LIBDEFLATE_MAX_LEVEL; l++) + ll[next++] = (FlagsMgzip){ .library = BGZF_LIBDEFLATE7, .level = l}; + + for (int l=1; l <= ZLIB_MAX_LEVEL; l++) + ll[next++] = (FlagsMgzip){ .library = BGZF_ZLIB, .level = l}; + } + + else if (IS_MGZIP(file->effective_codec)) { + // bug 1101: we don't yet know the plausible levels for other MGZIP codecs + } + + else + ABORT ("Unsupported effective_codec=%s for discovery", codec_name (file->effective_codec)); +} + +// ZIP main thread +static void bgzf_discover_finalize_testing (MgzipLibraryType lib, MgzipLevel level) +{ + txt_file->mgzip_flags.library = lib; // assign field-by-field to not modify other fields + txt_file->mgzip_flags.level = level; + + if (flag.zip_comp_i < MAX_NUM_COMPS) // for stats + z_file->comp_bgzf[flag.zip_comp_i] = txt_file->mgzip_flags; +} + +// ZIP main thread +void bgzf_finalize_discovery (void) +{ + int n_levels = txt_file->bgzf_plausible_levels.len32; + + // case: there is no library/level combination for which we can decompress with bgzf=exact + if (n_levels == 0) { + bgzf_discover_finalize_testing (0, BGZF_COMP_LEVEL_UNKNOWN); // has BGZF, but cannot identify level + + if (flag.show_bgzf) + iprintf ("Discover:%s: is a %s file, generated by an unidentified library\n", txt_name, codec_name (txt_file->effective_codec)); + } + + // case: one or more library/level combinations was verified with all test bgzf blocks (10 blocks, unless file is shorter) + else { + bgzf_discover_finalize_testing (B1ST(FlagsMgzip, txt_file->bgzf_plausible_levels)->library, B1ST(FlagsMgzip, 
txt_file->bgzf_plausible_levels)->level); + + if (flag.show_bgzf) + iprintf ("Discover: %s is a %s file, %s %s level %u\n", txt_name, codec_name (txt_file->effective_codec), + (n_levels == 1) ? "identified as generated with" : "with multiple plausible levels, arbitrarily selecting", + bgzf_library_name (txt_file->mgzip_flags.library, true), txt_file->mgzip_flags.level); + } +} + +// ZIP: test a BGZF block against all the remaining plausible levels, and eliminate those that don't match. +static void bgzf_discover_library_and_level (VBlockP vb, int test_block_i, STRp(comp), STRp(uncomp)) +{ + uint32_t header_len = mgzip_header_len[txt_file->effective_codec]; + ASSERT (header_len, "%s_HEADER_LEN missing in mgzip_header_len", codec_name (txt_file->effective_codec)); + + if (comp_len <= header_len + GZIP_FOOTER_LEN) { + txt_file->bgzf_plausible_levels.len = 0; + + if (flag.show_bgzf || flag.show_gz) + iprintf ("%s: Block too small - could not identify compression library and level\n", txt_name); + + if (flag.show_gz) exit_ok; + + return; + } + + // ignore the header and footer of the block + comp += header_len; + comp_len -= header_len + GZIP_FOOTER_LEN; + + // compress with each of the remaining plausible levels - testing if the compression is identical to the actual + uint32_t recomp_size = uncomp_len * 1.1 + 64 KB; // guessing the max compressed size in the worst case scenario of very bad compression + char *recomp = MALLOC (recomp_size); + uint32_t recomp_len; + + for_buf (FlagsMgzip, ll, txt_file->bgzf_plausible_levels) { + + // for large test blocks, skip high compression levels which are not common anyway, as testing is too slow + if (comp_len > 100 KB && ll->level >= 8) + continue; + + switch (ll->library) { + case BGZF_LIBDEFLATE19 : { + void *compressor = libdeflate_alloc_compressor (vb, ll->level, __FUNCLINE); + recomp_len = (uint32_t)libdeflate_deflate_compress (compressor, STRa(uncomp), recomp, recomp_size); + + libdeflate_free_compressor (compressor, 
__FUNCLINE); + break; + } + + case BGZF_LIBDEFLATE7 : { + void *compressor = libdeflate_alloc_compressor_1_7 (ll->level, vb); + recomp_len = (uint32_t)libdeflate_deflate_compress_1_7 (compressor, STRa(uncomp), recomp, recomp_size); + + libdeflate_free_compressor_1_7 (compressor); + break; + } + + case BGZF_ZLIB : { + z_stream strm = { .zalloc = bgzf_alloc, .zfree = codec_free_do, .opaque = vb }; + // deflateInit2 with the default zlib parameters, which is also the same as htslib does + ASSERT0 (deflateInit2 (&strm, ll->level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) == Z_OK, "deflateInit2 failed"); + + strm.next_in = (uint8_t *)uncomp; + strm.avail_in = uncomp_len; + strm.next_out = (uint8_t *)recomp; + strm.avail_out = recomp_size; + ASSERT (deflate (&strm, Z_FINISH) == Z_STREAM_END, "zlib deflate failed: msg=%s", strm.msg); + + recomp_len = recomp_size - strm.avail_out; + + ASSERT0 (deflateEnd (&strm) == Z_OK, "deflateEnd failed"); + break; + } + + default: ABORT ("Invalid library=%d", ll->library); + } + + bool plausible = str_issame (comp, recomp); + + if (flag.show_bgzf) + iprintf ("Discover[%d]: library %s level %u: size_in_file=%u size_in_test=%u plausible=%s\n", + test_block_i, bgzf_library_name (ll->library, true), ll->level, comp_len, recomp_len, YN(plausible)); + + if (!plausible) { + buf_remove (txt_file->bgzf_plausible_levels, FlagsMgzip, BNUM(txt_file->bgzf_plausible_levels, ll), 1); + ll--; fb_after--; // hack for_buf loop + } + } + + FREE (recomp); +} + +//-------------------------------------------------------------------- +// ZIP SIDE - decompress MGZIP-compressed file and prepare BGZF section +//-------------------------------------------------------------------- + +uint32_t mgzip_get_max_block_size (void) +{ + uint32_t max_block_size = (uint32_t[NUM_CODECS])MAX_ISIZE_BY_CODEC[txt_file->effective_codec]; + if (!max_block_size) max_block_size = 1; + + return max_block_size; +} + +void inc_disk_gz_uncomp_or_trunc_(FileP file, uint64_t inc, FUNCLINE) 
+{
+    // keep the value returned by the atomic add: re-reading the field with a plain load afterwards is not
+    // synchronized with increments made by other threads, so the printed total might not match this increment
+    uint64_t new_total = __atomic_add_fetch (&file->disk_gz_uncomp_or_trunc, inc, __ATOMIC_RELAXED);
+
+    if (flag.show_gz_uncomp)
+        iprintf ("%s:%u: disk_gz_uncomp_or_trunc + %"PRIu64"\t= %"PRIu64"\n", func, code_line, inc, new_total);
+}
+
+// ZIP: checks that file->gz_data starts with a gzip block header that begins with the expected codec-specific prefix.
+// header_len: minimal number of bytes that must be available in gz_data
+// prefix:     the bytes that the header must start with
+// returns: GZ_SUCCESS                - header is present and matches prefix
+//          GZ_EOF_WITHOUT_EOF_BLOCK  - gz_data is empty and fp is at EOF
+//          GZ_TRUNCATED              - partial header at EOF, and --truncate was specified
+//          GZ_NOT_GZIP / GZ_IS_OTHER_FORMAT - only if discovering (otherwise these conditions abort)
+static GzStatus mgzip_block_verify_header (FileP file, bool discovering, int header_len, STR8p(prefix))
+{
+    FILE *fp = (FILE *)file->file;
+
+    // no data at all
+    if (file->gz_data.len32 == 0 && feof (fp)) {
+        ASSERT0 (!discovering, "unexpected end of file when discovering");
+        return GZ_EOF_WITHOUT_EOF_BLOCK; // end of file
+    }
+
+    // truncated mid-way through header
+    ARRAY(uint8_t, h, file->gz_data);
+    if (h_len < header_len) {
+        if (discovering)
+            return GZ_NOT_GZIP; // file smaller than a gzip header - it's not GZIP
+
+        else if (flag.truncate && feof (fp))
+            return GZ_TRUNCATED; // truncated file
+
+        else
+            ABORT ("%s file %s appears truncated - it ends with a partial gzip block header. offset=%"PRIu64". If you expect this file to be truncated, use --truncate",
+                   codec_name (file->effective_codec), file->basename, (uint64_t)ftello64 (fp) - h_len); // less than the minimal gz block header size
+    }
+
+    // case: this is not a GZIP block at all (see: https://tools.ietf.org/html/rfc1952)
+    else if (h[0] != 31 || h[1] != 139 || h[2] != 8) {
+        if (discovering)
+            return GZ_NOT_GZIP;
+        else
+            ABORT ("expecting %s file %s to be compressed with gzip format, but it is not. offset=%"PRIu64,
+                   codec_name (file->effective_codec), file->basename, (uint64_t)ftello64 (fp) - h_len);
+    }
+
+    // case: this is a GZIP block (by the magic) but it does not start with the prefix expected for this codec
+    // (for BGZF, see: https://samtools.github.io/hts-specs/SAMv1.pdf)
+    else if (memcmp (h, prefix, prefix_len)) {
+        if (discovering)
+            return GZ_IS_OTHER_FORMAT;
+        else
+            ABORT ("Encountered a GZIP block that unexpectedly is not %s in %s offset=%"PRIu64" found=%s expected=%s. Solution: use --no-bgzf",
+                   codec_name (file->effective_codec), file->basename, (uint64_t)ftello64 (fp) - h_len, display_gz_header (h, h_len, false).s, display_gz_header (STRa(prefix), false).s);
+    }
+
+    return GZ_SUCCESS;
+}
+
+// ZIP: appends the current gz block (as described by file->gz_data.comp_len / uncomp_len) to file->mgzip_isizes
+// (uncompressed size of the block) and file->mgzip_starts (disk_so_far minus gz_data.len at the time of the call)
+static void mgzip_update_file_isizes (FileP file)
+{
+    // changes since 15.0.63
+    // - previously, we didn't add EOF blocks to mgzip_isizes and instead set txt_file->mgzip_flags.has_eof_block, incorrectly assuming that an EOF block can only occur at the end of the file
+    // - previously, mgzip_isizes was an array of uint16_t whose elements were (file->gz_data.uncomp_len-1)
+
+    // add isize to buffer that will be written to SEC_MGZIP
+    // first block: allocate both buffers, estimating the number of blocks from the file size and the size of this block
+    if (!file->mgzip_isizes.len32) {
+        uint64_t est_n_blocks = (file->gz_data.comp_len > 10000) ? ((double)file->disk_size / (double)file->gz_data.comp_len * 1.1) : 0; // >10000 to avoid over-allocating due to a randomly small block
+        buf_alloc (evb, &file->mgzip_isizes, 0, MAX_(10000, est_n_blocks), uint32_t, 0, "txt_file->mgzip_isizes");
+        buf_alloc (evb, &file->mgzip_starts, 0, MAX_(10000, est_n_blocks), uint64_t, 0, "txt_file->mgzip_starts");
+    }
+
+    buf_append_one (file->mgzip_isizes, file->gz_data.uncomp_len);
+    buf_append_one (file->mgzip_starts, file->disk_so_far - file->gz_data.len);
+}
+
+// diagnostic output: reports that comp_len_truncated bytes of an incomplete final gz block were truncated
+static void mgzip_show_truncated (FileP file, uint32_t comp_len_truncated)
+{
+    iprintf ("TRUNCATED %s thread=%s comp_len_truncated=%u truncated incomplete final %s block\n",
+             codec_name (file->effective_codec), threads_am_i_main_thread() ? "MAIN" : "COMPUTE", comp_len_truncated, codec_name (file->effective_codec));
+}
+
+// IL1M: a block's isize is valid if it is exactly 1 MB, or smaller than 1 MB in case of the final (is_eof) block.
+// file and is_end_of_vb are not used.
+bool il1m_is_valid_isize (FileP file, uint32_t proposed_isize, bool is_eof, bool *is_end_of_vb)
+{
+    return proposed_isize == 1 MB || (is_eof && proposed_isize < 1 MB);
+}
+
+// EMFL: a block's isize is valid if it equals file->max_mgzip_isize, or is smaller than it in case of the final
+// (is_eof) block. Any isize is accepted as long as max_mgzip_isize is not set yet. is_end_of_vb is not used.
+bool emfl_is_valid_isize (FileP file, uint32_t proposed_isize, bool is_eof, bool *is_end_of_vb)
+{
+    return !file->max_mgzip_isize || // not set yet because this is the first block
+           proposed_isize == file->max_mgzip_isize || (is_eof && proposed_isize < file->max_mgzip_isize);
+}
+
+// EMVL: any isize below 512 MB is accepted. file, is_eof and is_end_of_vb are not used.
+bool emvl_is_valid_isize (FileP file, uint32_t proposed_isize, bool is_eof, bool *is_end_of_vb)
+{
+    return proposed_isize < 512 MB; // sanity check
+}
+
+bool mgsp_is_valid_isize (FileP file, uint32_t proposed_isize, bool is_eof, bool *is_end_of_vb)
+{
+    if (proposed_isize > 64 MB) return false; // sanity
+
+    // case: first gz block in this VB
+    if (!file->num_mgsp_blocks_in_vb)
+        file->mgsp_vb_isize = proposed_isize;
+
+    if (proposed_isize == file->mgsp_vb_isize || // gz block with same isize as first
+        proposed_isize == 0 || // EOF block is always accepted
+        proposed_isize - file->mgsp_vb_isize <= file->num_mgsp_blocks_in_vb) { // last block allowed to be the remainder of dividing the total isizes by the number of blocks: i.e. an integer from zero to the number of block minus one. 
+ + file->num_mgsp_blocks_in_vb++; + file->max_mgsp_blocks_in_vb = MAX_(file->max_mgsp_blocks_in_vb, file->num_mgsp_blocks_in_vb); + return true; + } + + // proposed block doesn't belong to this VB + else { + file->num_mgsp_blocks_in_vb = file->mgsp_vb_isize = 0; // initialize for next VB + *is_end_of_vb = true; + return false; + } +} + +// read a gz block of a codec that does not contain bsize in the gz header +// returns: discovering: GZ_SUCCESS, GZ_IS_OTHER_FORMAT +// otherwise: GZ_SUCCESS +GzStatus mgzip_read_block_no_bsize (FileP file, bool discovering, Codec codec) +{ + START_TIMER; + static NoBsizeCodecParams no_bsize_codec_params[NUM_CODECS] = (NoBsizeCodecParams[])NO_BSIZE_CODECS_PARAMS; + + NoBsizeCodecParams params = no_bsize_codec_params[codec]; + if (!params.gz_hdr) params.gz_hdr = file->gz_header; + + ASSERT (params.max_bsize, "NO_BSIZE_CODECS_PARAMS missing data for %s", codec_name (codec)); + + if (discovering && params.valid_3_blocks_isize) + params.max_bsize *= 3; // read 3 blocks - txtfile_discover_specific_gz will use this data to verify they all have the same isize + + FILE *fp = (FILE *)file->file; + file->gz_data.comp_len = file->gz_data.uncomp_len = 0; // init + + // top up gz_data to max_comp_size (or less if EOF) + txtfile_fread (file, fp, NULL, (int32_t)params.max_bsize - (int32_t)file->gz_data.len32, &file->disk_so_far); + + GzStatus status = mgzip_block_verify_header (file, discovering, params.gz_hdr_len, STRa(params.gz_hdr)); + if (status == GZ_EOF_WITHOUT_EOF_BLOCK) { + if (!discovering) file->no_more_blocks = true; + return GZ_SUCCESS; + } + + if (status != GZ_SUCCESS) return status; + + // search for block size by beginning of next block. note: we do this even if EOF, + // because gz_data might contain several gz blocks. note: also NULL if data is too short. 
+ uint8_t *next_blk = memmem (B8(file->gz_data, params.gz_hdr_len), file->gz_data.len32 - params.gz_hdr_len, params.gz_hdr, params.gz_hdr_len); + bool end_of_vb = false; + + // case: a block was found, and it is not the last block + if (next_blk && params.is_valid_isize (file, GET_UINT32 (next_blk - 4), false/* there IS a next block so not EOF*/, &end_of_vb)) { + + file->gz_data.uncomp_len = GET_UINT32 (next_blk - 4); + file->gz_data.comp_len = BNUM (file->gz_data, next_blk); + } + + // case: remaining data could be a final gz block, or could be a truncated block, + // we will know for sure when trying to uncompress it + else if (feof (fp) && file->gz_data.len32 >= params.gz_hdr_len + GZIP_FOOTER_LEN && + params.is_valid_isize (file, GET_UINT32 (BAFT8 (file->gz_data) - 4), true, &end_of_vb)) { + + file->gz_data.uncomp_len = GET_UINT32 (BAFT8(file->gz_data) - 4); + file->gz_data.comp_len = file->gz_data.len32; + if (!discovering) file->no_more_blocks = true; + } + + // case: end of group of gz blocks that are designated for the current VB + else if (end_of_vb) { + // gz_data: keep data comp_len, uncomp_len remain 0, keep len remains >0 : this means end-of-vb + } + + // case: data in gz_data is does not contain a gz block - either not the right codec file or is truncated + else { + if (discovering) + return GZ_IS_OTHER_FORMAT; + + // data is not IL1M somewhere in the middle of the file... + ASSERT (feof (fp), "Encountered a GZIP block that unexpectedly is not %s in %s offset=%"PRIu64"\nSolution: use --no-bgzf", + codec_name (codec), file->basename, (uint64_t)ftello64 ((FILE *)file->file) - file->gz_data.len); + + // case: final data in file is not a full gz block and truncation allowed: + // account and then ignore the data that will not be gz-decompressed + if (flag.truncate) { + WARN ("FYI: %s is truncated - its final %s block in incomplete. 
Dropping final %u bytes of the GZ data.", + txt_name, codec_name (codec), file->gz_data.len32); + + if (flag.show_bgzf) mgzip_show_truncated (file, file->gz_data.len32); + + inc_disk_gz_uncomp_or_trunc (file, file->gz_data.len); + file->gz_data.len32 = file->gz_data.uncomp_len = 0; + segconf.zip_txt_modified = true; + file->no_more_blocks = true; + } + + else + ABORTINP ("%s is truncated mid-way through %s block. Tip: If this is expected, use --truncate to discard the final partial %s block", + txt_name, codec_name (codec), codec_name (codec)); + } + + if (file->gz_data.comp_len && !discovering) + mgzip_update_file_isizes (file); + + COPY_TIMER_EVB (mgzip_read_block_no_bsize); + return GZ_SUCCESS; +} + +static GzStatus bgzf_mgzf_set_block_lens (FileP file, uint32_t bsize, bool discovering) +{ + FILE *fp = (FILE *)file->file; + + if (file->gz_data.len32 >= bsize) { + file->gz_data.comp_len = bsize; + file->gz_data.uncomp_len = GET_UINT32 (B8(file->gz_data, bsize-4)); + return GZ_SUCCESS; + } + + if (discovering) + return GZ_NOT_GZIP; + + if (flag.truncate && feof (fp)) + return GZ_TRUNCATED; // truncated file + + int save_errno = errno; // we want to report errno of fread, not ftell. + + ABORT ("%s %s (ftell=%"PRId64" err=\"%s\" gz_data.len=%u but expecting=%u filesystem=%s). %s\n", + feof (fp) ? "Unexpected end of file while reading" : "Failed to read file", + file->basename, ftello64 (fp), + (file->is_remote && save_errno == ESPIPE) ? "Disconnected from remote host" : strerror (save_errno), + file->gz_data.len32, bsize, arch_get_txt_filesystem().s, + feof (fp) ? "Tip: If the file is expected to be truncated, you use --truncate to disregard the final partial BGZF block." 
: ""); +} + +static void bgzf_mgzf_verify_eof_block (FileP file, STRp(eof_block)) +{ + // case: valid EOF block + if (str_issame_(B1STc(file->gz_data), file->gz_data.comp_len, eof_block, eof_block_len)) + file->num_EOF_blocks++; + + // case: an isize=0 block that is not our EOF block (gz header differs or minimal payload differs - its possible in gzip) + else { + file->non_EOF_zero_block_found = true; // we won't be able to reconstruct exactly, as PIZ reconstructs the EOF block if isize=0 + + if (flag.show_bgzf) + iprintf ("DETECTED non-EOF zero block - we can't reconstruct this file --exact-ly: %s\n", + str_to_hex (B1ST8(file->gz_data), file->gz_data.comp_len).s); + } +} + +static bool mgzf_get_bsize (FileP file, uint32_t *bsize) +{ + if (*bsize) + return true; // already set + + if (file->gz_data.len32 < MGZF_PREFIX_LEN + sizeof (uint32_t)) + return false; + + if (memcmp (B1ST8 (file->gz_data), MGZF_PREFIX, MGZF_PREFIX_LEN)) + return false; + + MgzfHeader *h = B1ST(MgzfHeader, file->gz_data); + *bsize = LTEN32 (h->bsize); + + ASSERT (*bsize <= 128 MB, "bsize=%u seems too big", *bsize); // sanity + + return true; +} + +// ZIP: reads and validates a BGZF block +// returns: discoverying: GZ_SUCCESS, GZ_NOT_GZIP, GZ_IS_OTHER_FORMAT +// otherwise: GZ_SUCCESS, GZ_EOF_WITHOUT_EOF_BLOCK, GZ_TRUNCATED +static GzStatus mgzf_read_block_do (FileP file, // txt_file is not yet assigned when called from txtfile_discover_specific_gz + bool discovering) +{ + #define MGZF_CHUCK_SIZE ((uint32_t)(16 MB)) // max amount we read from disk at a time + + FILE *fp = (FILE *)file->file; + file->gz_data.comp_len = file->gz_data.uncomp_len = 0; // init + uint32_t bsize = 0; + + // top-up if needed - in rare cases - twice (this happens very large block where comp_size>MGZF_CHUCK_SIZE and not known yet - first read of MGZF_CHUCK_SIZE includes the header) + for (int i=0; i < 2; i++) + if ((!mgzf_get_bsize (file, &bsize) || bsize > file->gz_data.len32) && !feof (fp)) { + int32_t chunk_size = 
MAX_(MGZF_CHUCK_SIZE, bsize); + + txtfile_fread (file, fp, NULL, chunk_size - (int32_t)file->gz_data.len32, &file->disk_so_far); + } + + GzStatus status = mgzip_block_verify_header (file, discovering, MGZF_HEADER_LEN, _8(MGZF_PREFIX)); + if (status != GZ_SUCCESS) return status; + + status = bgzf_mgzf_set_block_lens (file, bsize, discovering); + + if (status == GZ_SUCCESS) { + // verify comment, length=8, format "C001R015" + MgzfHeader *h = B1ST(MgzfHeader, file->gz_data); + #define c h->comment + bool is_valid_comment = strnlen (c, 9) == 8 && + c[0]=='C' && IS_DIGIT(c[1]) && IS_DIGIT(c[2]) && IS_DIGIT(c[3]) && + c[4]=='R' && IS_DIGIT(c[5]) && IS_DIGIT(c[6]) && IS_DIGIT(c[7]); + #undef c + if (discovering && !is_valid_comment) return GZ_IS_OTHER_FORMAT; + + ASSERT (h->bsize == MGZF_EOF_LEN || is_valid_comment, "Invalid MGZF comment: gz_header={ %s }. Solution: run with --no-bgzf", display_gz_header ((bytes)STRb(file->gz_data), false).s); + + // if this is an isize=0 block, verify that it is an EOF block, else we won't be able reconstruct exact + if (!file->gz_data.uncomp_len) + bgzf_mgzf_verify_eof_block (file, MGZF_EOF, MGZF_EOF_LEN); + } + + return status; +} + +// ZIP: reads and validates a BGZF block +// returns: discoverying: GZ_SUCCESS, GZ_NOT_GZIP, GZ_IS_OTHER_FORMAT +// otherwise: GZ_SUCCESS, GZ_EOF_WITHOUT_EOF_BLOCK, GZ_TRUNCATED +static GzStatus bgzf_read_block_do (FileP file, // txt_file is not yet assigned when called from txtfile_discover_specific_gz + bool discovering) +{ + FILE *fp = (FILE *)file->file; + file->gz_data.comp_len = file->gz_data.uncomp_len = 0; // init + + // top-up if needed + if (file->gz_data.len32 < BGZF_MAX_BLOCK_SIZE && !feof (fp)) { + int32_t chunk_size = flag.zip_uncompress_source_during_read ? 
150 KB // a bit more than the default block-device read-ahead buffer (128KB) for best parallelization between disk read-ahead and CPU decompression + : BGZF_MAX_CHUCK_SIZE; // bigger block is faster if we are prepared to yield the CPU when waiting for the disk + txtfile_fread (file, fp, NULL, chunk_size - (int32_t)file->gz_data.len32, &file->disk_so_far); + } + + uint32_t bsize = (uint32_t)LTEN16 (B1ST(BgzfHeader, file->gz_data)->bsize) + 1; + + GzStatus status = mgzip_block_verify_header (file, discovering, BGZF_HEADER_LEN, _8(BGZF_PREFIX)); + if (status != GZ_SUCCESS) return status; + + status = bgzf_mgzf_set_block_lens (file, bsize, discovering); + + if (status == GZ_SUCCESS) { + ASSERT (file->gz_data.uncomp_len <= 65536, "isize=%u ∉ [0,65536] in %s offset=%"PRIu64, file->gz_data.uncomp_len, file->basename, (uint64_t)ftello64 (fp) - file->gz_data.len32); + + // if this is an isize=0 block, verify that it is an EOF block, else we won't be able reconstruct exact + if (!file->gz_data.uncomp_len) + bgzf_mgzf_verify_eof_block (file, _S(BGZF_EOF)); + } + + return status; +} + +// ZIP main thread: reads a BGZF block into gz_data +GzStatus mgzip_read_block_with_bsize (FileP file, bool discovering, Codec codec) +{ + START_TIMER; + + // with BGZF, gz_data is either empty or contains exactly 1 bgzf block + if (file->gz_data.comp_len) return GZ_SUCCESS; // we already have 1 block + + GzStatus ret = (codec == CODEC_BGZF) ? 
bgzf_read_block_do (file, discovering) + : mgzf_read_block_do (file, discovering); + switch (ret) { + case GZ_SUCCESS: // successful read of a BGZF block + mgzip_update_file_isizes (file); + break; + + case GZ_NOT_GZIP: + case GZ_IS_OTHER_FORMAT: + ASSERT (discovering, "ret=%d expected only if discovering", ret); + break; // file->gz_data contains data that is not BGZF data + + case GZ_EOF_WITHOUT_EOF_BLOCK: // file ended without EOF block: that's fine + ret = GZ_SUCCESS; + break; // note: if file was not entirely read, we will detect that at the end of zip_one_file + + case GZ_TRUNCATED: // file ended mid-way through a BGZF block + ASSERT0 (!discovering, "GZ_TRUNCATED unexpected when discovering"); + + // case: truncation allowed: account and then discard the data that will not be gz-decompressed + if (flag.truncate) { + WARN ("FYI: %s is truncated - its final BGZF block in incomplete. Dropping final %u bytes of the GZ data.", txt_name, file->gz_data.len32); + + if (flag.show_bgzf) mgzip_show_truncated (file, file->gz_data.len32); + + inc_disk_gz_uncomp_or_trunc (file, file->gz_data.len); + file->gz_data.len32 = file->gz_data.comp_len = file->gz_data.uncomp_len = 0; // discard partial BGZF block + segconf.zip_txt_modified = true; + + ret = GZ_SUCCESS; + break; + } + + else + ABORTINP ("%s is truncated mid-way through BGZF block. 
Tip: If this is expected, use --truncate to discard the final partial BGZF block", txt_name); + + default: + ABORT ("Unexpected ret=%s", gzstatus_name (ret)); + } + + if (!discovering) + file->no_more_blocks = (file->gz_data.comp_len == file->gz_data.len32 && feof ((FILE*)file->file)); + + COPY_TIMER_EVB (mgzip_read_block_with_bsize); + return ret; +} + +// ZIP: BGZF section per txt_file component +void mgzip_compress_mgzip_section (void) +{ + // cases where we don't write the BGZF blocks section + if (!txt_file->mgzip_isizes.len || // this txt file is not compressed with MGZIP - we don't need a MGZIP section + txt_file->mgzip_flags.level == BGZF_COMP_LEVEL_UNKNOWN || // we don't know the level - so PIZ will reconstruct at default level + txt_file->non_EOF_zero_block_found || // we don't know how to reconstructed-exactly a zero block other than an EOF block, so no point storing isizes + segconf.zip_txt_modified) // the file has changed and we can't reconstruct to the same blocks + return; + + // sanity check + uint64_t total_isize=0; + for_buf (uint32_t, isize_p, txt_file->mgzip_isizes) + total_isize += *isize_p; + + ASSERT (total_isize == txt_file->txt_data_so_far_single + txt_file->last_truncated_line_len, + "Expecting total_isize=%"PRId64" == txt_data_so_far_single=%"PRId64, + total_isize, txt_file->txt_data_so_far_single); + + BGEN_u32_buf (&txt_file->mgzip_isizes, NULL); + txt_file->mgzip_isizes.len *= sizeof (uint32_t); + + Codec codec = codec_assign_best_codec (evb, NULL, &txt_file->mgzip_isizes, SEC_MGZIP); + + evb->comp_i = flag.zip_comp_i; // this goes into SectionEntFileFormat.comp_i via sections_add_to_list + zfile_compress_section_data_ex (evb, NULL, SEC_MGZIP, &txt_file->mgzip_isizes, NULL, 0, codec, (SectionFlags)txt_file->mgzip_flags, NULL); + txt_file->mgzip_isizes.len /= sizeof (uint32_t); // restore + + z_file->comp_num_EOF_blocks[flag.zip_comp_i] = txt_file->num_EOF_blocks; +} + +// uncompresses a BGZF block in vb->comp_txt_data referred to by 
bb, into its place in vb->txt_data as prescribed by bb +// might be called from main thread or compute threads +void mgzip_uncompress_one_block (VBlockP vb, GzBlockZip *bb, Codec codec) +{ + if (bb->is_uncompressed) return; // already decompressed (or an empty (e.g. EOF) block) - nothing to do + + ASSERT0 (vb->gzip_compressor, "vb->gzip_compressor=NULL"); + + int header_len = mgzip_header_len[codec]; + ASSERT (header_len, "%s_HEADER_LEN missing in mgzip_header_len", codec_name (codec)); + + uint8_t *h = B8(vb->comp_txt_data, bb->compressed_index); + + // verify that entire block is within vb->comp_txt_data + ASSERT (bb->compressed_index + header_len < vb->comp_txt_data.len && // we have at least the header - we can access bsize + bb->compressed_index + bb->comp_size <= vb->comp_txt_data.len, + "%s: %s block size goes past the end of in vb->comp_txt_data: bb=%s compressed_index=%u vb->comp_txt_data.len=%"PRIu64, + VB_NAME, codec_name (codec), display_bb (bb).s, bb->compressed_index, vb->comp_txt_data.len); + + ASSERT (h[0]==31 && h[1]==139 && h[2]==8, "%s: invalid %s block in vb->comp_txt_data: compressed_index=%u", VB_NAME, codec_name (codec), bb->compressed_index); + + // possibly grow txt_data: can happen data is MGZIP and its length exceeds vb_size due to last block going over + buf_alloc (vb, &vb->txt_data, 0/*don't use "more" bc Ltxt already incremented*/, bb->txt_index + bb->txt_size + TXTFILE_READ_VB_PADDING, char, 0, "txt_data"); + + enum libdeflate_result ret = + libdeflate_deflate_decompress (vb->gzip_compressor, + h + header_len, bb->comp_size - header_len - GZIP_FOOTER_LEN, // compressed + Btxt (bb->txt_index), bb->txt_size, NULL); // uncompressed + + // account for the case of decompression, and also the case bb is discarded due to a certain truncate situation (see below). 
+ inc_disk_gz_uncomp_or_trunc (txt_file, bb->comp_size); + + // case: final IL1M block, which is truncated, but we have --truncate, and the garbage last word + // unluckily < 1MB so it went undetected as a legimiate block in il1m_is_valid_isize. we drop this block now. + if (ret != LIBDEFLATE_SUCCESS && TXT_IS_IL1M && bb->is_eof) { + if (flag.truncate) { + // receive updates made by main thread to mgzip_isizes, mgzip_starts: no more are going to happen as we reached eof + __atomic_thread_fence (__ATOMIC_ACQ_REL); + + txt_file->mgzip_isizes.len--; // remove truncated block from isizes + txt_file->mgzip_starts.len--; + mgzip_show_truncated (txt_file, bb->comp_size); + return; // with bb->is_uncompressed=false + } + + else { + ABORT ("Failed to decompress the final %s block of the file: %s. Tip: If it is expected that the file is truncated, use --truncate to ignore the defective final block.", + codec_name (vb->txt_codec), libdeflate_error(ret)); + } + } + + ASSERT (ret == LIBDEFLATE_SUCCESS, "libdeflate_deflate_decompress failed: %s", libdeflate_error(ret)); + + bb->is_uncompressed = true; + + if (flag.show_bgzf) + iprintf ("UNCOMPRESS %s thread=%s%s i=%u comp_index=%u comp_len=%u txt_index=%u txt_len=%u eof=%s%s%s\n", + codec_name (codec), threads_am_i_main_thread() ? "MAIN" : "COMPUTE", + cond_str (vb->vblock_i, " vb=", VB_NAME), + BNUM (vb->gz_blocks, bb), bb->compressed_index, bb->comp_size, bb->txt_index, bb->txt_size, TF(bb->is_eof), + cond_str (bb->txt_size, " uncomp[5]=", str_to_printable_(Btxt(bb->txt_index), MIN_(5, Ltxt - bb->txt_index)).s), + bb->comp_size == BGZF_EOF_LEN ? 
" EOF" : ""); + + // discover which gzip library and compression level were used (testing the first few BGZF blocks) + int test_block_i=0; + if (txt_file->bgzf_plausible_levels.len32 && // only >0 in BGZF + txt_file->bgzf_plausible_levels.count < BGZF_DISCOVERY_MAX_TESTS && // fail fast without atomic + (test_block_i = __atomic_fetch_add (&txt_file->bgzf_plausible_levels.count, 1, __ATOMIC_RELAXED)) < BGZF_DISCOVERY_MAX_TESTS) { // note: if multiple threads test in parallel, count might be incremented beyond BGZF_DISCOVERY_MAX_TESTS - that's ok + + bgzf_discover_library_and_level (vb, test_block_i, (rom)h, bb->comp_size, Btxt (bb->txt_index), bb->txt_size); + } + + // in case of --show_gz: report and exit (otherwise, we finalize in the main thread to avoid thread issues with updating txt_file->mgzip_flags) + if (flag.show_gz && (test_block_i == BGZF_DISCOVERY_MAX_TESTS-1 || !txt_file->bgzf_plausible_levels.len32)) + bgzf_finalize_discovery(); +} + +// ZIP: called from the compute thread: zip_compress_one_vb and main thread: txtfile_read_block_mgzip +void mgzip_uncompress_vb (VBlockP vb, Codec codec) +{ + START_TIMER; + ASSERTNOTEMPTY (vb->gz_blocks); + + vb->gzip_compressor = libdeflate_alloc_decompressor(vb, __FUNCLINE); + + uint32_t total_vb_isizes = 0; + for_buf (GzBlockZip, bb, vb->gz_blocks) { + mgzip_uncompress_one_block (vb, bb, codec); + total_vb_isizes += bb->txt_size; + } + + // sanity - total_vb_isizes is at least the size of the vb (can be more, bc the first/last bb can be shared with previous/next vb) + ASSERT (total_vb_isizes >= Ltxt, "%s: Expecting total_vb_isizes=%u >= Ltxt=%u. 
codec=%s", + VB_NAME, total_vb_isizes, Ltxt, codec_name (txt_file->effective_codec)); + + libdeflate_free_decompressor ((struct libdeflate_decompressor **)&vb->gzip_compressor, __FUNCLINE); + + buf_free (vb->comp_txt_data); // now that we are finished decompressing we can free it + + if (flag.show_time) { + if (threads_am_i_main_thread ()) COPY_TIMER (bgzf_io_thread) + else COPY_TIMER (bgzf_compute_thread); + } + + COPY_TIMER (mgzip_uncompress_vb); +} + +// ZIP: decompresses a prescribed BGZF block when re-reading DEPN lines +static inline void bgzf_uncompress_one_prescribed_block (VBlockP vb, STRp(bgzf_block), STRc(uncomp_block), uint64_t bb_i) +{ + START_TIMER; + + BgzfHeader *h = (BgzfHeader *)bgzf_block; + + if (flag.show_bgzf) + iprintf ("REREAD %s reread bb_i=%"PRIu64" comp_size=%u uncomp_size=%u ", + VB_NAME, bb_i, bgzf_block_len, uncomp_block_len); + + enum libdeflate_result ret = + libdeflate_deflate_decompress (vb->gzip_compressor, + h+1, bgzf_block_len - BGZF_HEADER_LEN - GZIP_FOOTER_LEN, // compressed + STRa(uncomp_block), NULL); // uncompressed + + ASSERT (ret == LIBDEFLATE_SUCCESS, "%s: libdeflate_deflate_decompress failed: %s. bgzf_block_len=%u uncomp_block_len=%u bb_i=%"PRIu64, + VB_NAME, libdeflate_error(ret), bgzf_block_len, uncomp_block_len, bb_i); + + if (flag.show_bgzf) + #define C(i) (i < uncomp_block_len ? 
char_to_printable (uncomp_block[i]).s : "") + iprintf ("txt_data[5]=%1s%1s%1s%1s%1s\n", C(0), C(1), C(2), C(3), C(4)); + #undef C + + COPY_TIMER (bgzf_uncompress_one_prescribed_block); +} + +// ZIP: re-reads and validates one BGZF block +static void bgzf_reread_one_prescribed_block (FILE *fp, uint64_t offset, qSTRp (bgzf_block)) +{ + ASSERT (!fseeko64 (fp, offset, SEEK_SET), + "fseeko64(%s, %"PRIu64") failed while rereading BGZF depn lines: %s", txt_name, offset, strerror(errno)); + + // read the header + uint32_t header_bytes = txtfile_fread (txt_file, fp, bgzf_block, BGZF_HEADER_LEN, NULL); + + // failed to read as prescribed + ASSERT (header_bytes == BGZF_HEADER_LEN && !memcmp (bgzf_block, BGZF_PREFIX, STRLEN(BGZF_PREFIX)), + "failed to re-read a BGZF block header as perscribed BGZF: offset=%"PRIu64" bytes_read=%u header=%s", offset, header_bytes, str_to_hex ((bytes)bgzf_block, header_bytes).s); + + uint32_t body_size = (LTEN16 (((BgzfHeader*)bgzf_block)->bsize) + 1) - BGZF_HEADER_LEN; + uint32_t body_bytes = txtfile_fread (txt_file, fp, bgzf_block + BGZF_HEADER_LEN, body_size, NULL); + + ASSERT (body_bytes == body_size, "failed to re-read a BGZF block body as perscribed BGZF: offset=%"PRIu64" bytes_read=%u expected=%u", + offset, body_bytes, body_size); + + *bgzf_block_len = BGZF_HEADER_LEN + body_size; +} + +// ZIP: SAM/BAM: compute thread of a DEPN VB: actually re-reading data into txt_data according to vb->reread_prescription +void bgzf_reread_uncompress_vb_as_prescribed (VBlockP vb, FILE *fp) +{ + uint64_t last_offset = -1LL; + char uncomp_block[BGZF_MAX_BLOCK_SIZE]; + + vb->gzip_compressor = libdeflate_alloc_decompressor(vb, __FUNCLINE); + + for_buf (RereadLine, line, vb->reread_prescription) { + + // a line might span 1 or more BGZF blocks + while (line->line_len) { + ASSERT (line->offset.bb_i < txt_file->mgzip_starts.len32, "Expecting bb_i=%"PRIu64" < mgzip_starts.len=%"PRIu64, + (uint64_t)line->offset.bb_i, txt_file->mgzip_starts.len); + + uint64_t 
offset = *B64 (txt_file->mgzip_starts, line->offset.bb_i); + uint32_t isize = *B32 (txt_file->mgzip_isizes, line->offset.bb_i); + + if (offset != last_offset) { + STRl (bgzf_block, BGZF_MAX_BLOCK_SIZE); + + bgzf_reread_one_prescribed_block (fp, offset, qSTRa(bgzf_block)); + bgzf_uncompress_one_prescribed_block (vb, STRa(bgzf_block), uncomp_block, isize, line->offset.bb_i); + + last_offset = offset; + } + + uint32_t subline_len = MIN_(line->line_len, isize - line->offset.uoffset); + memcpy (BAFTtxt, &uncomp_block[line->offset.uoffset], subline_len); + Ltxt += subline_len; + + // if this line continues to next BGZF block - it starts from the beginning of that block, its remainder is subline_len shorter + line->line_len -= subline_len; + line->offset.bb_i++; + line->offset.uoffset = 0; + } + } + + libdeflate_free_decompressor ((struct libdeflate_decompressor **)&vb->gzip_compressor, __FUNCLINE); +} + +void bgzf_libdeflate_1_7_initialize (void) +{ + libdeflate_set_memory_allocator_1_7 (bgzf_alloc, codec_free_do); +} + +// ZIP: called by Seg to set the bgzf index of the next line +void mgzip_zip_advance_index (VBlockP vb, uint32_t line_len) +{ + if (!vb->gz_blocks.len) return; // no MGZIP blocks in this VB - all data came from "unconsumed_txt" + + vb->line_bgzf_uoffset += line_len; + + // udpate current_bb_i and bgzf_offset (note: line_len might span multiple bgzf blocks) + GzBlockZip *bb; + for (bb = B(GzBlockZip, vb->gz_blocks, vb->gz_blocks.current_bb_i); + vb->line_bgzf_uoffset && vb->line_bgzf_uoffset >= bb->txt_size; // note: careful to also terminate on the edge case that line_bgzf_uoffset==0 and in the final VB block bb->txt_size==0 + bb++) + + vb->line_bgzf_uoffset -= bb->txt_size; // index into the next BGZF block + + vb->gz_blocks.current_bb_i = BNUM(vb->gz_blocks, bb); +} + +// ZIP: after reading data for a txt_header or VB, copy unconsumed gz_blocks to txt_file->unconsumed_mgzip_blocks +// The first block might be partially consumed. 
+int64_t mgzip_copy_unconsumed_blocks (VBlockP vb) +{ + START_TIMER; + ASSERTISZERO (txt_file->unconsumed_mgzip_blocks.len32); + + if (!vb->gz_blocks.len) return 0; // this VB has no gz blocks (e.g. not a BGZF-compressed file) + + int32_t consumed = // amount of data in vb->gz_blocks that does NOT need to be copied to next VB bc it was consumed by this VB or the previous one + Ltxt + // amount of data consumed by this VB + vb->gz_blocks.consumed_by_prev_vb; // amount of data in first BGZF block that was consumed by the previous VB + + ARRAY (GzBlockZip, bb, vb->gz_blocks); + + bool done = false; + bool consumed_full_bgzf_blocks=false; + int64_t compressed_size = 0; + + for (uint32_t i=0; i < bb_len; i++) { + // if some of the BGZF blocks are not consumed (the first of them might be partially consumed) - move the blocks + // to unconsumed_mgzip_blocks - to be moved to the next VB + if (consumed - bb[i].txt_size < 0 && !done/*enter only once*/) { + + consumed_full_bgzf_blocks = (consumed == 0); // no partially-consumed block: only in this case is compressed_size returned to the caller (otherwise the function returns 0) + + // block i might be partially consumed or not consumed at all, subsequent blocks are not consumed at all + buf_append (evb, txt_file->unconsumed_mgzip_blocks, GzBlockZip, + B(GzBlockZip, vb->gz_blocks, i), vb->gz_blocks.len32 - i, "txt_file->unconsumed_mgzip_blocks"); + + txt_file->unconsumed_mgzip_blocks.consumed_by_prev_vb = consumed; // part of first BGZF block already consumed + done = true; + } + else if (!done) + compressed_size += bb[i].comp_size; // accumulates comp_size of the blocks that precede the first unconsumed block + + consumed -= bb[i].txt_size; // after the loop, -consumed is the amount of data in gz_blocks beyond what was consumed + } + + // sanity check + ASSERT (-consumed - (int32_t)txt_file->last_truncated_line_len == txt_file->unconsumed_txt.len32, "Expecting (-consumed)=%d - last_truncated_line_len=%u == unconsumed_txt.len=%u", + -consumed, txt_file->last_truncated_line_len, txt_file->unconsumed_txt.len32); + + // update bb.txt_index for next VB + // note: first bb.txt_index of the next VB is possibly negative if some of its data was consumed by the current VB + int32_t txt_index = 
-txt_file->unconsumed_mgzip_blocks.consumed_by_prev_vb; + for_buf (GzBlockZip, bb, txt_file->unconsumed_mgzip_blocks) { + bb->txt_index = txt_index; + txt_index += bb->txt_size; + } + + COPY_TIMER (mgzip_copy_unconsumed_blocks); + return consumed_full_bgzf_blocks ? compressed_size : 0; +} + +// return blocks used by the segconf VB to the unconsumed blocks: copies vb->gz_blocks and its consumed_by_prev_vb into txt_file->unconsumed_mgzip_blocks +void mgzip_return_segconf_blocks (VBlockP vb) +{ + buf_copy (evb, &txt_file->unconsumed_mgzip_blocks, &vb->gz_blocks, GzBlockZip, 0, 0, 0); + txt_file->unconsumed_mgzip_blocks.consumed_by_prev_vb = vb->gz_blocks.consumed_by_prev_vb; +} + +// ZIP: before reading data for a VB, populate gz_blocks with some or all of the unconsumed blocks passed +// from the previous VB or txt_header +void mgzip_zip_init_vb (VBlockP vb) +{ + vb->vb_mgzip_i = txt_file->mgzip_isizes.len; // index of first bgzf block to be used by the VB + + if (!txt_file->unconsumed_mgzip_blocks.len) return; // happens when either unconsumed_bytes=0 or not a BGZF-compressed file + + // data in the first BGZF block already consumed by previous VB or txt_header + vb->gz_blocks.consumed_by_prev_vb = vb->line_bgzf_uoffset = txt_file->unconsumed_mgzip_blocks.consumed_by_prev_vb; + + // copy all unconsumed BGZF blocks - we might not need all of them - the unconsumed ones will be moved back in mgzip_copy_unconsumed_blocks + buf_copy (vb, &vb->gz_blocks, &txt_file->unconsumed_mgzip_blocks, GzBlockZip, 0, 0, "gz_blocks"); + + vb->vb_mgzip_i -= txt_file->unconsumed_mgzip_blocks.len32; // step back by the number of inherited blocks - presumably they are already counted in mgzip_isizes (TODO confirm) + + txt_file->unconsumed_mgzip_blocks.len32 = txt_file->unconsumed_mgzip_blocks.consumed_by_prev_vb = 0; // the blocks now belong to this VB + + // sanity check: the inherited blocks, less the part consumed by the previous VB, must cover at least Ltxt bytes + int32_t available = -vb->gz_blocks.consumed_by_prev_vb; // possibly start negative + for_buf (GzBlockZip, bb, vb->gz_blocks) + available += bb->txt_size; + + ASSERT (available >= Ltxt, "%s blocks in txt_file->unconsumed_mgzip_blocks cover only %d bytes, less than the needed unconsumed_bytes=%d", + codec_name (txt_file->effective_codec), available, Ltxt); +} + 
+//----------------------------------------------------- +// PIZ SIDE - setting up BGZF for a particular txt file +//----------------------------------------------------- + +static Buffer isizes = {}; // Will be grabbed into txt_file->mgzip_isizes. + +static inline FlagsMgzip recompression_template (int bgzf_level) +{ + return (FlagsMgzip){ .level = bgzf_recompression_levels[bgzf_level].level, // a 4-bit bitfield + .library = bgzf_recompression_levels[bgzf_level].library }; +} + +// PIZ, after calling bgzf_load_isizes +static inline bool is_exact (void) +{ + return txt_file->mgzip_isizes.len > 0; +} + +static FlagsMgzip bgzf_load_isizes (CompIType comp_i, bool show_only) +{ + Section sec = sections_get_comp_bgzf_sec (comp_i); + if (!sec) ignore: { + ASSERTW0 (show_only, "FYI: --bgzf=exact ignored, because when compressing, genozip could not identify parameters of the .gz file"); + goto fallback; // this component doesn't contain a BGZF section + } + + int32_t offset = zfile_read_section (z_file, evb, 0, &evb->z_data, "z_data", SEC_MGZIP, sec); + + SectionHeaderP header = (SectionHeaderP)Bc(evb->z_data, offset); + + // if we don't know the compression level (in older Genozip versions we wrote the SEC_MGZIP even + // if level discovery failed) + if (header->flags.mgzip.level == BGZF_COMP_LEVEL_UNKNOWN) + goto ignore; + + zfile_uncompress_section (evb, header, &isizes, "txt_file->mgzip_isizes", 0, SEC_MGZIP); + + if (show_only) { + buf_destroy (isizes); + goto fallback; + } + + if (VER2(15,63)) { + isizes.len /= sizeof (uint32_t); + BGEN_u32_buf (&isizes, NULL); + } + + // up to 15.0.62 buffer was 16 bit, values were (isize-1), and EOF was indicated by header.has_eof_block + else { + isizes.len /= sizeof (uint16_t); + buf_alloc (evb, &isizes, 0, isizes.len + 1, uint32_t, 0, NULL); + + for_buf_tandem_back (uint16_t, isize16, isizes, uint32_t, isize32, isizes) + *isize32 = (uint32_t)BGEN16 (*isize16) + 1; + + if (header->flags.mgzip.OLD_has_eof_block) + BNXT32(isizes) = 
0; // append EOF block + } + + return header->flags.mgzip; // mgzip_isizes successfully loaded + +fallback: + return recompression_template (BGZF_DEFAULT_LEVEL); +} + +// PIZ: called from main thread after reading txt_header's header +FlagsMgzip mgzip_piz_calculate_mgzip_flags (CompIType comp_i, Codec src_codec) +{ + #define C(cdc) (src_codec == CODEC_##cdc) + FlagsMgzip mgzip_flags; + + #define HAS_EXT(x) filename_has_ext (flag.out_filename, #x) + bool bgzf_implied_by_out_filename = flag.out_filename && (HAS_EXT(.gz) || HAS_EXT(.bgz) || HAS_EXT(.bam)); + bool no_bgzf_implied_by_out_filename = file_piz_get_dt_of_out_filename() == flag.out_dt && !(HAS_EXT(.gz) || HAS_EXT(.bgz) || HAS_EXT(.bam)); + bool isizes_loaded = false; + + // cases where there is no BGZF re-compression + if (flag.test || + OUT_DT(CRAM) || + (flag.bgzf == 0 && !OUT_DT(BAM) && !OUT_DT(BCF)) || // note: in BCF and BAM --bgzf=0 means BGZF blocks with no compression (as opposed to no BGZF at all) + (flag.bgzf == BGZF_BY_ZFILE && C(NONE) && flag.reconstruct_as_src)) // case: --bgzf=exact and source codec was CODEC_NONE + + mgzip_flags = bgzf_no_recompression; + + // case: reconstructing BCF: piz sends VCF to bcftools in CODEC_NONE, and bcftools compressed by the level given in mgzip_flags + else if (OUT_DT(BCF)) + mgzip_flags = (FlagsMgzip){ .library = BGZF_EXTERNAL_LIB, + .level = (flag.bgzf < 0) ? 
4 : (int[]){0, 2, 4, 6, 8, 9 }[flag.bgzf] }; // convert Genozip level 0-5 to bcftools level 0-9 + + // case: --bgzf=exact and source codec is other than CODEC_NONE + else if (flag.bgzf == BGZF_BY_ZFILE && !C(NONE)) { + mgzip_flags = bgzf_load_isizes (comp_i, false); + isizes_loaded = true; + } + + // case: --bgzf=0 to 5 + else if (flag.bgzf >= 0) { + mgzip_flags = recompression_template (flag.bgzf); // set to --bgzf command line value + + // if user specified --bgzf and --output - make sure output filename is .gz, .bam or .bcf + ASSINP (flag.force || !flag.out_filename || bgzf_implied_by_out_filename || HAS_EXT(.bcf) || mgzip_flags.level==0, + "using %s in combination with %s for outputting a %s file, requires the output filename to end with %s (override with --force)", + OT("output", "o"), OT("bgzf", "z"), dt_name(flag.out_dt), OUT_DT(BAM)?".bam" : OUT_DT(BCF)?".bcf" : ".gz"); + + ASSINP0 (!OUT_DT(BCF) || flag.bgzf != BGZF_BY_ZFILE, "cannot use --bgzf=exact when outputing a BCF file"); // because we have no control over bcftools' BGZF block generation + } + + // case: genocat to stdout without --bgzf: - no re-compression. + else if (is_genocat && !flag.out_filename) + mgzip_flags = OUT_DT(BAM) ? bgzf_recompression_levels[0] : bgzf_no_recompression; // file_open_txt_write interprets level=0 as CODEC_BGZF without compression for BAM, and CODEC_NONE for other types + + // case: genocat or genounzip out_filename and no --bgzf: - determine by file name (except BAM - bgzf regardless of filename) + else if (flag.out_filename) + mgzip_flags = (bgzf_implied_by_out_filename || OUT_DT(BAM) || (!no_bgzf_implied_by_out_filename && !C(NONE))) ? bgzf_recompression_levels[BGZF_DEFAULT_LEVEL] : bgzf_no_recompression; + + // case: genounzip without explicit filename, and no --bgzf: default compression or no compression + else + // note: for bz2, xz, and zip - we reconstruct as gz too. better choice than plain. 
+ mgzip_flags = (IS_GZIP(src_codec) || C(BAM) || C(BZ2) || C(XZ) || C(ZIP)) ? bgzf_recompression_levels[BGZF_DEFAULT_LEVEL] : bgzf_no_recompression; // note: similar logic to txtheader_piz_get_filename + + // case: user wants to see this section header, despite not needing BGZF data + if (!isizes_loaded && (flag.only_headers == SEC_MGZIP+1 || flag.only_headers == SHOW_ALL_HEADERS)) + bgzf_load_isizes (comp_i, true); + + if (flag.show_bgzf) + iprintf ("comp_i=%u with src_codec=%s out_dt=%s: calculated mgzip_flags={%s, %d}\n", + comp_i, codec_name (src_codec), dt_name (flag.out_dt), bgzf_library_name (mgzip_flags.library, true), mgzip_flags.level); + + return mgzip_flags; + #undef C +} + +// PIZ main thread: update txt_file with BGZF info calculated earlier +// - the first 3 bytes of codec_info are copied to txt_file->bgzf_signature +// - if the static isizes buffer is non-empty, it is grabbed into txt_file->mgzip_isizes +// - mgzip_flags is stored in txt_file->mgzip_flags, after which its level and library are range-checked +void bgzf_piz_set_txt_file_bgzf_info (FlagsMgzip mgzip_flags, bytes codec_info) +{ + memcpy (txt_file->bgzf_signature, codec_info, 3); + + if (isizes.len) + buf_grab (evb, txt_file->mgzip_isizes, "txt_file->mgzip_isizes", isizes); + + txt_file->mgzip_flags = mgzip_flags; + + // sanity: level and library must be within their valid ranges + ASSERT (txt_file->mgzip_flags.level >= 0 && txt_file->mgzip_flags.level <= BGZF_MAX_LEVEL, "txt_file->mgzip_flags.level=%u ∉ [0,%u]", + txt_file->mgzip_flags.level, BGZF_MAX_LEVEL); + + // note: report the offending library value (not the level) when the library is out of range + ASSERT (txt_file->mgzip_flags.library >= 0 && txt_file->mgzip_flags.library < NUM_BGZF_LIBRARIES, "txt_file->mgzip_flags.library=%u ∉ [0,%u]", + txt_file->mgzip_flags.library, NUM_BGZF_LIBRARIES-1); +} + +//----------------------------------------------------- +// PIZ SIDE - compressing txt_file with BGZF +//----------------------------------------------------- + +static void bgzf_alloc_compressor (VBlockP vb, FlagsMgzip mgzip_flags) +{ + ASSERT0 (!vb->gzip_compressor, "expecting vb->gzip_compressor=NULL"); + + switch (mgzip_flags.library) { + case BGZF_LIBDEFLATE19: + vb->gzip_compressor = libdeflate_alloc_compressor (vb, mgzip_flags.level, __FUNCLINE); + break; + + case BGZF_LIBDEFLATE7: + vb->gzip_compressor = libdeflate_alloc_compressor_1_7 
(mgzip_flags.level, vb); + break; + + case BGZF_ZLIB: + vb->gzip_compressor = bgzf_alloc (vb, 1, sizeof (z_stream), __FUNCLINE); + *(z_stream *)vb->gzip_compressor = (z_stream){ .zalloc = bgzf_alloc, .zfree = codec_free_do, .opaque = vb }; + break; + + case BGZF_IGZIP: + ASSERT (mgzip_flags.level==1 || mgzip_flags.level==2, "igzip: expecting mgzip_flags.level=%u ∈[1,2]", mgzip_flags.level); + + vb->gzip_compressor = bgzf_alloc (vb, 1, (int[]){ 1+ISAL_DEF_LVL0_DEFAULT, ISAL_DEF_LVL1_DEFAULT }[mgzip_flags.level-1], __FUNCLINE); // 1+ to avoid 0 + break; + + default: + ABORT ("Invalid mgzip_flags.library=%d", mgzip_flags.library); + } +} + +static void bgzf_free_compressor (VBlockP vb, FlagsMgzip mgzip_flags) +{ + switch (mgzip_flags.library) { + case BGZF_LIBDEFLATE7 : + libdeflate_free_compressor_1_7 (vb->gzip_compressor); + break; + + case BGZF_LIBDEFLATE19 : + libdeflate_free_compressor (vb->gzip_compressor, __FUNCLINE); + break; + + case BGZF_IGZIP : + case BGZF_ZLIB : + codec_free (vb, vb->gzip_compressor); + break; + + default: + ABORT ("Invalid mgzip_flags.library=%d", mgzip_flags.library); + } + + vb->gzip_compressor = NULL; +} + +static void bgzf_show_compress (VBlockP vb, int32_t block_i, uint32_t comp_index, uint32_t comp_len, rom txt, uint32_t txt_index, uint32_t isize) +{ + iprintf ("COMPRESS thread=%s %s i=%d comp_txt_data.i=%u bsize=%u txt_data.i=%d isize=%u%s%s\n", + threads_am_i_main_thread() ? "MAIN" : threads_am_i_writer_thread() ? "WRITER" : "COMPUTE", VB_NAME, block_i, + comp_index, comp_len + BGZF_HEADER_LEN + GZIP_FOOTER_LEN, txt_index, isize, + cond_str (isize, " uncomp[5]=", str_to_printable_(txt, MIN_(isize, 5)).s), + comp_len == BGZF_EOF_LEN ? 
" EOF" : ""); +} + +static void bgzf_compress_one_block (VBlockP vb, rom in, uint32_t isize, + int32_t block_i, int32_t txt_index) // for show_bgzf (both may be negative - indicating previous VB) +{ + START_TIMER; + + ASSERT0 (vb->gzip_compressor, "vb->gzip_compressor=NULL"); + + #define BGZF_MAX_CDATA_SIZE (BGZF_MAX_BLOCK_SIZE - BGZF_HEADER_LEN - GZIP_FOOTER_LEN) + + buf_alloc (vb, &vb->comp_txt_data, BGZF_MAX_BLOCK_SIZE, 0, char, 1.2, "comp_txt_data"); + uint32_t comp_index = vb->comp_txt_data.len32, comp_len; + + // if this is a isize=0 block, we reconstruct it as an EOF block, not through the compressor that might generate a different isize=0 block + if (!isize) { + buf_add (&vb->comp_txt_data, _S(BGZF_EOF)); + comp_len = BGZF_EOF_LEN; + } + + else { // not EOF block + BgzfHeader *header = (BgzfHeader *)BAFTc (vb->comp_txt_data); + buf_add (&vb->comp_txt_data, BGZF_EOF, BGZF_HEADER_LEN); // template of header - only bsize needs updating + + if (txt_file->mgzip_flags.library == BGZF_IGZIP) { + struct isal_zstream strm; + isal_deflate_stateless_init (&strm); + strm.gzip_flag = ISAL_DEFLATE; + strm.flush = NO_FLUSH; + strm.level = txt_file->mgzip_flags.level - 1; // note: level 1,2 in mgzip_flags corrsponds to IGZIP level 0,1 + strm.level_buf_size = (int[]){ ISAL_DEF_LVL0_DEFAULT, ISAL_DEF_LVL1_DEFAULT }[strm.level]; + strm.level_buf = vb->gzip_compressor; + strm.next_in = (uint8_t *)in; + strm.avail_in = isize; + strm.next_out = BAFT8 (vb->comp_txt_data); + strm.avail_out = BGZF_MAX_CDATA_SIZE + GZIP_FOOTER_LEN; + + int ret = isal_deflate_stateless (&strm); + ASSERT (ret == ISAL_DECOMP_OK, "%s: isal_deflate_stateless: %s. 
isize=%u", VB_NAME, isal_error (ret), isize); + + comp_len = BGZF_MAX_CDATA_SIZE + GZIP_FOOTER_LEN - strm.avail_out; + } + + else if (txt_file->mgzip_flags.library == BGZF_LIBDEFLATE19) { // libdeflate 1.19 + + comp_len = (int)libdeflate_deflate_compress (vb->gzip_compressor, in, isize, BAFTc (vb->comp_txt_data), BGZF_MAX_CDATA_SIZE); + + // in case the compressed data doesn't fit in one BGZF block, move to compressing at the maximum level. this can + // happen theoretically (maybe) if the original data was compressed with a higher level, and an uncompressible 64K block was + // compressed to just under 64K while in our compression level it is just over 64K. + if (!comp_len) { + void *high_compressor = libdeflate_alloc_compressor (vb, LIBDEFLATE_MAX_LEVEL, __FUNCLINE); // libdefate's highest level + comp_len = libdeflate_deflate_compress (high_compressor, in, isize, BAFTc (vb->comp_txt_data), BGZF_MAX_CDATA_SIZE); + libdeflate_free_compressor (high_compressor, __FUNCLINE); + } + } + + else if (txt_file->mgzip_flags.library == BGZF_LIBDEFLATE7) { // libdeflate 1.7 + + comp_len = (int)libdeflate_deflate_compress_1_7 (vb->gzip_compressor, in, isize, BAFTc (vb->comp_txt_data), BGZF_MAX_CDATA_SIZE); + + // see comment in BGZF_LIBDEFLATE19 above + if (!comp_len) { + void *high_compressor = libdeflate_alloc_compressor_1_7 (LIBDEFLATE_MAX_LEVEL, vb); // libdefate's highest level + comp_len = libdeflate_deflate_compress_1_7 (high_compressor, in, isize, BAFTc (vb->comp_txt_data), BGZF_MAX_CDATA_SIZE); + libdeflate_free_compressor_1_7 (high_compressor); + } + } + + else { // zlib + #define strm ((z_stream *)vb->gzip_compressor) + + ASSERT0 (deflateInit2 (vb->gzip_compressor, txt_file->mgzip_flags.level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) == Z_OK, + "deflateInit2 failed"); + + strm->next_in = (uint8_t *)in; + strm->avail_in = isize; + strm->next_out = BAFT8 (vb->comp_txt_data); + strm->avail_out = BGZF_MAX_CDATA_SIZE; + ASSERT (deflate (vb->gzip_compressor, Z_FINISH) == 
Z_STREAM_END, "deflate failed: msg=%s", strm->msg); + + comp_len = BGZF_MAX_CDATA_SIZE - strm->avail_out; + + ASSERT0 (deflateEnd (vb->gzip_compressor) == Z_OK, "deflateEnd failed"); + #undef strm + } + + ASSERT (comp_len, "cannot compress block with %u bytes into a BGZF block with %u bytes", isize, BGZF_MAX_BLOCK_SIZE); + vb->comp_txt_data.len32 += comp_len; + + header->bsize = LTEN16 ((uint16_t)(BGZF_HEADER_LEN + comp_len + GZIP_FOOTER_LEN - 1)); + + GzipFooter footer = { .crc32 = LTEN32 (crc32 (0, in, isize)), + .isize = LTEN32 (isize) }; + buf_add (&vb->comp_txt_data, (rom)&footer, GZIP_FOOTER_LEN); + } + + if (flag.show_bgzf) + bgzf_show_compress (vb, block_i, comp_index, comp_len, in, txt_index, isize); + + COPY_TIMER (bgzf_compress_one_block); +} + +// appends file data to wvb->comp_txt_data +// - exact mode (is_exact()): nothing is appended; the size-based signature of the written file is compared to the source file's signature, and a warning is issued on mismatch +// - otherwise: a BGZF EOF block is appended to wvb->comp_txt_data +void bgzf_write_finalize (void) +{ + // if we attempted to reconstruct the BGZF block to the original file's mgzip_isizes - warn if we were unlucky and failed + // note: EOF block(s) were already added according to mgzip_isizes + if (is_exact()) { + uint8_t signature[3]; + bgzf_sign (txt_file->disk_so_far, signature); + + bool verified = !memcmp (signature, txt_file->bgzf_signature, 3); + + // verify that we were successful in recompressing --exact-ly + ASSERTW (verified, "FYI: %s is recompressed with %s (.gz). However, it seems that the original file was compressed with a different compression library than genozip uses, resulting in a slightly different level of compression. 
Rest assured that the actual data is identical.", + txt_name, codec_name (txt_file->effective_codec)); + + if (flag.show_bgzf) { + #define INT_SIGN(s) ((uint32_t)s[0] | ((uint32_t)s[1] << 8) | ((uint32_t)s[2] << 16)) + if (verified) iprintf ("VERIFY recompression SUCCEEDED: (file_size=%"PRIu64" %% 16777216) = %u, same as source file\n", txt_file->disk_so_far, INT_SIGN (signature)); + else iprintf ("VERIFY recompression FAILED: (file_size=%"PRIu64" %% 16777216) = %u but source was file %u\n", txt_file->disk_so_far, INT_SIGN (signature), INT_SIGN (txt_file->bgzf_signature)); // signatures differ - this is the failure report + } + } + + // add EOF block when not reconstructing --exact-ly + else { + if (flag.show_bgzf) + bgzf_show_compress (wvb, 0, wvb->comp_txt_data.len32, BGZF_EOF_LEN, NULL, 0, 0); + + buf_add_more (wvb, &wvb->comp_txt_data, BGZF_EOF, BGZF_EOF_LEN, "comp_txt_data"); + } +} + +// signature = the 3 least-significant bytes of disk_size, least-significant byte first +void bgzf_sign (uint64_t disk_size, uint8_t *signature) +{ + signature[0] = (disk_size ) & 0xff; // LSB of size + signature[1] = (disk_size >> 8 ) & 0xff; + signature[2] = (disk_size >> 16) & 0xff; +} + +// Entry point of BGZF compression compute thread. +// bgzf-compress vb->txt_data into vb->comp_txt_data - using BGZF blocks as prescribed in vb->gz_blocks. +// Note: we hope to reconstruct the exact same byte-level BGZF blocks, as the original files, but that +// will only happen if the GZIP library (eg libdeflate), version and parameters are the same +static void bgzf_compress_vb (VBlockP vb) +{ + START_TIMER; + + if (flag.show_bgzf) + iprintf ("COMPRESS thread=%s %s initialized <%sexact> re-compression with %s(%s[%u])\n", + threads_am_i_main_thread() ? "MAIN" : threads_am_i_writer_thread() ? "WRITER" : "COMPUTE", VB_NAME, + is_exact() ? 
"" : "non-", + codec_name (txt_file->effective_codec), bgzf_library_name (txt_file->mgzip_flags.library, true), txt_file->mgzip_flags.level); + + ASSERTNOTEMPTY (vb->gz_blocks); + + buf_alloc (vb, &vb->comp_txt_data, 0, vb->gz_blocks.len32 * BGZF_MAX_BLOCK_SIZE/2, uint8_t, 1, "comp_txt_data"); // alloc based on estimated size + bgzf_alloc_compressor (vb, txt_file->mgzip_flags); + + for_buf2 (BgzfBlockPiz, block, i, vb->gz_blocks) { + ASSERT (block->txt_index + block->txt_size <= Ltxt, + "block=%u out of range: expecting txt_index=%u txt_size=%u <= txt_data.len=%u", + i, block->txt_index, block->txt_size, Ltxt); + + bgzf_compress_one_block (vb, Btxt (block->txt_index), block->txt_size, i, block->txt_index); + } + + bgzf_free_compressor (vb, txt_file->mgzip_flags); + + vb_set_is_processed (vb); /* tell dispatcher this thread is done and can be joined. this operation needn't be atomic, but it likely is anyway */ + COPY_TIMER (bgzf_compute_thread); +} + +#define BGZF_CREATED_BLOCK_SIZE 65280 // same size as observed in htslib-created files + +// PIZ: calculate the BGZF blocks within this VB +static uint32_t bgzf_calculate_blocks_one_vb (VBlockP vb, bool is_last) +{ + // create our own equal-isize blocks + if (!is_exact()) { + buf_alloc_exact (vb, vb->gz_blocks, ceill ((double)Ltxt / BGZF_CREATED_BLOCK_SIZE), BgzfBlockPiz, "gz_blocks"); + + for_buf2 (BgzfBlockPiz, blk, i, vb->gz_blocks) + *blk = (BgzfBlockPiz){ .txt_index = i * BGZF_CREATED_BLOCK_SIZE, .txt_size = BGZF_CREATED_BLOCK_SIZE }; + + BLST(BgzfBlockPiz, vb->gz_blocks)->txt_size -= (BGZF_CREATED_BLOCK_SIZE * vb->gz_blocks.len - Ltxt); // remove excessive length from last block + + return 0; // our gz_blocks perfectly cover the VB - no data remaining + } + + // reconstruct --exact-ly based on mgzip_isizes + else { + ARRAY (uint32_t, isizes, txt_file->mgzip_isizes); + uint32_t i, index=0; + for (i=txt_file->mgzip_isizes.next; i < txt_file->mgzip_isizes.len; i++) + if (index + isizes[i] <= Ltxt/*<= to include EOF 
in preceding VB */) + index += isizes[i]; + else + break; + + buf_alloc_exact (vb, vb->gz_blocks, i - txt_file->mgzip_isizes.next, BgzfBlockPiz, "gz_blocks"); + + index = 0; + for_buf (BgzfBlockPiz, blk, vb->gz_blocks) { + *blk = (BgzfBlockPiz){ .txt_index = index, .txt_size = isizes[txt_file->mgzip_isizes.next++] }; + index += blk->txt_size; + } + + int32_t remaining = Ltxt - index; + ASSERT (IN_RANGE(remaining, 0, BGZF_MAX_BLOCK_SIZE), "mgzip_isizes exhausted prematurely: remaining=%d", remaining); // if we have 65536 or more remaining, there should have been more isizes + + return remaining; + } +} + +// PIZ +void bgzf_dispatch_compress (Dispatcher dispatcher, STRp (uncomp), CompIType comp_i, bool is_last) +{ + // uncompressed data to be dealt with by next call to this function (buffer belongs to writer thread) + static Buffer intercall_txt = {}; // belongs to wvb + buf_alloc (wvb, &intercall_txt, 0, BGZF_MAX_BLOCK_SIZE, char, 1.5, "intercall_txt"); + + uint32_t next_isize = txt_file->mgzip_isizes.len ? 
*B32(txt_file->mgzip_isizes, txt_file->mgzip_isizes.next) + : BGZF_CREATED_BLOCK_SIZE; + + // case: uncomp is not enough to fill a block, just store it to next call + if (!is_last && (uncomp_len + intercall_txt.len32 < next_isize)) { + memcpy (BAFTc(intercall_txt), uncomp, uncomp_len); + intercall_txt.len32 += uncomp_len; + return; + } + + if (uncomp_len || intercall_txt.len) { // might be 0 if is_last, in some cases + + VBlockP vb = dispatcher_generate_next_vb (dispatcher, wvb->vblock_i, COMP_NONE); + vb->comp_i = comp_i; + + // build uncompressed data for this VB - some data left over from previous VB + data from wvb + buf_alloc_exact (vb, vb->txt_data, intercall_txt.len + uncomp_len, char, "txt_data"); + if (intercall_txt.len32) memcpy (B1STtxt, intercall_txt.data, intercall_txt.len32); + memcpy (Btxt (intercall_txt.len32), uncomp, uncomp_len); + + // calculate BGZF blocks - and trim data that doesn't fill a block - to be moved to next VB + if ((intercall_txt.len32 = bgzf_calculate_blocks_one_vb (vb, is_last))) { + Ltxt -= intercall_txt.len32; + memcpy (B1STc(intercall_txt), BAFTtxt, intercall_txt.len32); + } + + // BGZF-compress vb->txt_data in a separate thread + dispatcher_compute (dispatcher, bgzf_compress_vb); + } + + if (is_last) { + dispatcher_set_no_data_available (dispatcher, false, DATA_EXHAUSTED); + buf_destroy (intercall_txt); + } +} + +rom bgzf_library_name (MgzipLibraryType library, bool long_name) +{ + return (library < 0 || library >= NUM_ALL_BGZF_LIBRARIES) ? "INVALID_BGZF_LIBRARY" + : long_name ? 
(rom[])BGZF_LIB_NAMES_LONG[library] + : (rom[])BGZF_LIB_NAMES_SHRT[library]; +} + +// used by test/Makefile +void il1m_compress (void) +{ + void *compressor = libdeflate_alloc_compressor (evb, 5, __FUNCLINE); + + uint8_t *in = MALLOC (1 MB), *out = MALLOC (2 MB); + uint32_t in_len; + for (int i=0; (in_len = fread (in, 1, 1 MB, stdin)); i++) { + GzipFooter footer = { .crc32 = LTEN32 (crc32 (0, in, in_len)), + .isize = LTEN32 (in_len) }; + + uint32_t out_len = libdeflate_deflate_compress (compressor, in, in_len, out, 2 MB); + ASSERT (out_len, "deflate failed: in_len=%u block_i=%u", in_len, i); + + ASSERT0 (1 == fwrite (_S(IL1M_HEADER), 1, stdout), "fwrite failed #1"); + ASSERT (1 == fwrite (STRa(out), 1, stdout), "fwrite failed: #2 out_len=%u", out_len); + ASSERT0 (1 == fwrite (&footer, sizeof (footer), 1, stdout), "fwrite failed #3"); + } + + fflush (stdout); + exit (0); +} diff --git a/src/mgzip.h b/src/mgzip.h new file mode 100644 index 00000000..3173351f --- /dev/null +++ b/src/mgzip.h @@ -0,0 +1,166 @@ +// ------------------------------------------------------------------ +// mgzip.h +// Copyright (C) 2020-2024 Genozip Limited. Patent Pending. +// Please see terms and conditions in the file LICENSE.txt +// +// WARNING: Genozip is proprietary, not open source software. Modifying the source code is strictly prohibited +// and subject to penalties specified in the license. 
+ +#include "sections.h" + +#define TXT_IS_PLAIN (txt_file->effective_codec == CODEC_NONE) +#define TXT_IS_BGZF (txt_file->effective_codec == CODEC_BGZF) +#define TXT_IS_IL1M (txt_file->effective_codec == CODEC_IL1M) +#define TXT_IS_MGZF (txt_file->effective_codec == CODEC_MGZF) +#define TXT_IS_MGSP (txt_file->effective_codec == CODEC_MGSP) +#define TXT_IS_EMFL (txt_file->effective_codec == CODEC_EMFL) +#define TXT_IS_EMVL (txt_file->effective_codec == CODEC_EMVL) +#define TXT_IS_GZ (txt_file->effective_codec == CODEC_GZ) +#define TXT_IS_BZ2 (txt_file->effective_codec == CODEC_BZ2) + +#define IS_BGZF(codec) ((codec)==CODEC_BGZF) +#define IS_MGZF(codec) ((codec)==CODEC_MGZF) +#define IS_MGSP(codec) ((codec)==CODEC_MGSP) +#define IS_IL1M(codec) ((codec)==CODEC_IL1M) +#define IS_EMFL(codec) ((codec)==CODEC_EMFL) +#define IS_EMVL(codec) ((codec)==CODEC_EMVL) +#define IS_GZ(codec) ((codec)==CODEC_GZ) +#define IS_BZ2(codec) ((codec)==CODEC_BZ2) +#define IS_NONE(codec) ((codec)==CODEC_NONE) +#define IS_MGZIP(codec) (IS_BGZF(codec) || IS_MGZF(codec) || IS_MGSP(codec) || IS_IL1M(codec) || IS_EMFL(codec) || IS_EMVL(codec)) // multi-block gzip +#define IS_GZIP(codec) (IS_MGZIP(codec) || IS_GZ(codec)) + +// note on MGSP: "gz block" in the comments below means, for MGSP, a group of gz blocks. +#define IS_IN_SYNC(codec) (IS_MGZF(codec) || IS_MGSP(codec) || IS_EMVL(codec) || IS_EMFL(codec)) // codecs in which R1 and R2 gz blocks are guaranteed to contain whole, and precisely matching reads. Therefore, R2 gz-decompression can delegated to compute threads without further checks. +#define IS_VB_SIZE_BY_BLOCK(codec) (IS_MGZF(codec) || IS_EMVL(codec)) // codecs that are 1. variable-length 2. reads are never split between blocks 3. 
we use one VB per gz block +#define IS_VB_SIZE_BY_MGZIP(codec) (IS_VB_SIZE_BY_BLOCK(codec) || IS_MGSP(codec)) // like IS_VB_SIZE_BY_BLOCK, but VB can be a group of gz blocks +#define GZ_HEADER_HAS_BSIZE(codec) (IS_BGZF(codec) || IS_MGZF(codec)) // gz header contains bsize +#define IS_EXACTABLE(codec) (IS_BGZF(codec)) // codecs for which we can discover the library level and can reconstruct exactly + +#define TXT_IS_MGZIP IS_MGZIP(txt_file->effective_codec) +#define TXT_IS_GZIP IS_GZIP (txt_file->effective_codec) +#define TXT_IS_VB_SIZE_BY_BLOCK IS_VB_SIZE_BY_BLOCK(txt_file->effective_codec) +#define TXT_IS_VB_SIZE_BY_MGZIP IS_VB_SIZE_BY_MGZIP(txt_file->effective_codec) +#define TXT_IS_IN_SYNC IS_IN_SYNC(txt_file->effective_codec) +#define TXT_GZ_HEADER_HAS_BSIZE GZ_HEADER_HAS_BSIZE(txt_file->effective_codec) + +#define BGZF_DEFAULT_LEVEL 2 // PIZ: used if --bgzf is not specified (it is actually faster than 1 if also writing to disk) +#define BGZF_MAX_BLOCK_SIZE ((uint32_t)(64 KB)) // maximum block size of both compressed and uncompressed data of one block +#define BGZF_MAX_CHUCK_SIZE ((uint32_t)(1 MB)) // max amount we read from disk at a time +#define BGZF_PREFIX "\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00" // First 16 bytes of every BGZF block +#define BGZF_HEADER_LEN 18 +#define BGZF_EOF BGZF_PREFIX "\x1b\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00" // BGZF EOF marker is simply an empty block (note: there are multiple encodings for empty blocks, this is a specific one of them), see https://samtools.github.io/hts-specs/SAMv1.pdf section 4.1.2 +#define BGZF_EOF_LEN 28 + +#define IL1M_HEADER "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x03" +#define IL1M_ISIZE "\x00\x00\x10\x00" // isize == 1MB in all blocks except the last + +// MGI: a 32-bit version of BGZF +#define MGZF_PREFIX_LEN 16 +#define MGZF_PREFIX "\x1f\x8b\x08\x14\x00\x00\x00\x00\x00\xff\x08\x00\x49\x47\x04\x00" +#define MGZF_EOF_LEN 31 +#define MGZF_EOF MGZF_PREFIX
"\x1f\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00" +#define MGZF_HEADER_LEN 29 + +// MGI: constant isize for all gz blocks that go into a particular VB (last block in group might be slightly bigger) +#define MGSP_HEADER IL1M_HEADER +#define MGSP_EOF_LEN 20 +#define MGSP_EOF MGSP_HEADER "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00" + +// Element: constant isize of all gz blocks in file (last block might be smaller) +#define EMFL_HEADER_LEN 10 + +#define EMVL_HEADER "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\xff" +#define EMVL_HEADER_LEN 10 +#define EMVL_FIRST_BLOCK EMVL_HEADER "\x01\x00\x00\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00" // EMVL files begin with this empty block and have no EOF block + +// fixed length of gz header of each MGZIP codec +#define MGZIP_HEADER_LEN_BY_CODEC { \ + [CODEC_BGZF] = BGZF_HEADER_LEN, \ + [CODEC_MGZF] = MGZF_HEADER_LEN, \ + [CODEC_EMFL] = EMFL_HEADER_LEN, \ + [CODEC_IL1M] = STRLEN(IL1M_HEADER), \ + [CODEC_MGSP] = STRLEN(MGSP_HEADER), \ + [CODEC_EMVL] = STRLEN(EMVL_HEADER), \ +} + +// for capped-isize codecs: VB will be vb_size, and full or partial gz blocks +#define MAX_ISIZE_BY_CODEC { \ + [CODEC_BGZF] = 64 KB, \ + [CODEC_IL1M] = 1 MB, \ + [CODEC_EMFL] = txt_file->max_mgzip_isize /* determined during discovery */ \ +} + +typedef bool (*IsValidSize)(FileP, uint32_t proposed_isize, bool is_eof, bool *is_end_of_vb); + +// "no_bsize" codecs +typedef struct { + IsValidSize is_valid_isize; // isize validation function + bool valid_3_blocks_isize; // the first 3 gz blocks are expected to have the same isize (used for discovery) + uint32_t max_bsize; // an upper limit we set on compressed size (bsize) of a block based on observation (after discovery) + bytes gz_hdr; // fixed gz header + uint32_t gz_hdr_len; +} NoBsizeCodecParams; + +#define NO_BSIZE_CODECS_PARAMS { \ + [CODEC_MGSP] = { mgsp_is_valid_isize, true, 4 MB, (bytes)MGSP_HEADER, STRLEN(MGSP_HEADER) }, \ + [CODEC_EMVL] = { emvl_is_valid_isize, false, 32 MB, (bytes)EMVL_HEADER,
STRLEN(EMVL_HEADER) }, \ + [CODEC_IL1M] = { il1m_is_valid_isize, true, 1 MB, (bytes)IL1M_HEADER, STRLEN(IL1M_HEADER) }, \ + [CODEC_EMFL] = { emfl_is_valid_isize, true, 4 MB, NULL/*run time*/, EMFL_HEADER_LEN }, \ +} + +typedef struct BgzfBlockPiz { + int32_t txt_index, txt_size; // index of uncompressed block within vb->txt_data. The first block index will be negative if there is passed-down unconsumed data +} BgzfBlockPiz; + +// ZIP side +typedef enum { GZ_SUCCESS, GZ_IS_OTHER_FORMAT, GZ_MORE_DATA, GZ_NOT_GZIP, GZ_EOF_WITHOUT_EOF_BLOCK, GZ_TRUNCATED, NUM_GZ_STATUSES } GzStatus; // file is truncated +#define GZSTATUS_NAMES { "SUCCESS", "IS_OTHER_FORMAT", "MORE_DATA", "NOT_GZIP", "EOF_WITHOUT_EOF_BLOCK", "TRUNCATED", } // one name per GzStatus value, in enum order (a missing comma here silently merges two names and shifts the rest) + +// data type of VBlock.gz_blocks and txt_file->unconsumed_mgzip_blocks : details of MGZIP blocks. +typedef struct GzBlockZip { + int32_t txt_index; // index of uncompressed block within vb->txt_data. If there is passed-down data from previous VB/txt_header, then txt_index of the first block will be negative (see mgzip_copy_unconsumed_blocks) + uint32_t txt_size : 30; + uint32_t is_uncompressed : 1; // true if data has been GZ-decompressed by main thread + uint32_t is_eof : 1; // true if this is the last GZ-block in the file + uint32_t compressed_index, comp_size; // index within vb->scratch +} GzBlockZip; + +extern GzStatus mgzip_read_block_with_bsize (FileP file, bool discovering, Codec codec); +extern GzStatus mgzip_read_block_no_bsize (FileP file, bool discovering, Codec codec); +extern void mgzip_uncompress_vb (VBlockP vb, Codec codec); +extern void mgzip_uncompress_one_block (VBlockP vb, GzBlockZip *bb, Codec codec); +extern void bgzf_reread_uncompress_vb_as_prescribed (VBlockP vb, FILE *file); +extern void mgzip_compress_mgzip_section (void); +extern void mgzip_zip_advance_index (VBlockP vb, uint32_t line_len); +extern int64_t mgzip_copy_unconsumed_blocks (VBlockP vb); +extern void mgzip_zip_init_vb (VBlockP vb); +extern void
bgzf_insert_back_segconf_blocks (VBlockP vb); +extern void mgzip_return_segconf_blocks (VBlockP vb); +extern uint32_t mgzip_get_max_block_size (void); + +extern void inc_disk_gz_uncomp_or_trunc_(FileP file, uint64_t inc, FUNCLINE); +#define inc_disk_gz_uncomp_or_trunc(file, inc) inc_disk_gz_uncomp_or_trunc_((file), (inc), __FUNCLINE) + +// codec size validators +extern bool il1m_is_valid_isize (FileP file, uint32_t proposed_isize, bool is_eof, bool *is_end_of_vb); +extern bool mgsp_is_valid_isize (FileP file, uint32_t proposed_isize, bool is_eof, bool *is_end_of_vb); +extern bool emfl_is_valid_isize (FileP file, uint32_t proposed_isize, bool is_eof, bool *is_end_of_vb); +extern bool emvl_is_valid_isize (FileP file, uint32_t proposed_isize, bool is_eof, bool *is_end_of_vb); + +// library / level discovery +extern void bgzf_initialize_discovery (FileP file); +extern void bgzf_finalize_discovery (void); + +// PIZ side +extern FlagsMgzip mgzip_piz_calculate_mgzip_flags (CompIType comp_i, Codec src_codec); +extern void bgzf_piz_set_txt_file_bgzf_info (FlagsMgzip mgzip_flags, bytes codec_info); +extern void bgzf_dispatch_compress (Dispatcher dispatcher, STRp (uncomp), CompIType comp_i, bool is_last); +extern void bgzf_write_finalize (void); + +// misc +extern rom bgzf_library_name (MgzipLibraryType library, bool long_name); +extern rom gzstatus_name (GzStatus st); +extern void il1m_compress (void); +extern void bgzf_libdeflate_1_7_initialize (void); +extern void bgzf_sign (uint64_t disk_size, uint8_t *signature); diff --git a/src/mutex.c b/src/mutex.c index 8f61c243..b22e9685 100644 --- a/src/mutex.c +++ b/src/mutex.c @@ -167,7 +167,7 @@ void mutex_show_bottleneck_analsyis (void) qsort (lp, MAX_CODE_LINE+1, sizeof(LockPoint), mutex_sort_by_accumulator); iprint0 ("Bottleneck analysis - Time waiting on locks:\n" - "Millisec Mutex LockPoint\n"); + "Millisec Mutex / Join LockPoint\n"); for (int i=0; i <= MAX_CODE_LINE; i++) { if (!lp[i].accumulator) break; // done, since its 
sorted @@ -186,4 +186,20 @@ void mutex_who_is_locked (void) my_lp.mutex_name, (my_lp.func ? my_lp.func : ""), my_lp.code_line, cond_int (my_lp.lock_count > 1, "num_locks_from_different_objects=", my_lp.lock_count)); } -} \ No newline at end of file +} + +// call so that join time will be reported by the profiler (in --show-time): accumulates the time elapsed since profiler_timer into the lock point of the calling code line +void thread_join_lock_point (rom thread_name, TimeSpecType profiler_timer, FUNCLINE) +{ + ASSERT (code_line <= MAX_CODE_LINE, "pthreads_join at %s:%u: cannot lock a mutex in a code_line > %u", func, code_line, MAX_CODE_LINE); // must precede any access to lp[code_line], which has only MAX_CODE_LINE+1 entries + if (!lp[code_line].mutex_name) { // first lock at this lockpoint + lp[code_line] = (LockPoint){ .mutex_name = thread_name, .func = func, .code_line = code_line }; + } + + else + if (lp[code_line].func != func) + WARN_ONCE ("FYI: Two calls to mutex_lock/pthreads_join exist on the same code_line: %s @ %s:%u and %s @ %s:%u - --show-time will show their combined time. To solve, add an empty line to shift the code line number of one of them", + lp[code_line].mutex_name, lp[code_line].func, lp[code_line].code_line, thread_name, func, code_line); + + __atomic_add_fetch (&lp[code_line].accumulator, CHECK_TIMER, __ATOMIC_RELAXED); +} diff --git a/src/mutex.h b/src/mutex.h index 6a70a3aa..edd329fa 100644 --- a/src/mutex.h +++ b/src/mutex.h @@ -18,6 +18,7 @@ #endif #endif #include "buffer.h" +#include "profiler.h" // for TimeSpecType // ----------- // mutex stuff @@ -116,3 +117,16 @@ extern void serializer_lock_do (SerializerP ser, VBIType vb_i, FUNCLINE); ASSERT (!ret, "pthread_spin_lock failed: %s", strerror (ret)); }) #endif +// -------------------------------------------- +// support for pthread_join bottleneck analysis +// -------------------------------------------- + +extern void thread_join_lock_point (rom thread_name, TimeSpecType profiler_timer, FUNCLINE); + +#define PTHREAD_JOIN(thread, thread_entry_point) ({ \ + START_TIMER; \ + int err = pthread_join ((thread), NULL); \ + if (flag.show_time_comp_i != COMP_NONE) \ +
thread_join_lock_point ((thread_entry_point), profiler_timer, __FUNCLINE); \ + err; \ +}) diff --git a/src/objdir.linux/secure/license.o b/src/objdir.linux/secure/license.o index 05e5f19f..5dbcfd3e 100644 Binary files a/src/objdir.linux/secure/license.o and b/src/objdir.linux/secure/license.o differ diff --git a/src/objdir.osx-arm/secure/license.o b/src/objdir.osx-arm/secure/license.o index 312bf181..805049c2 100644 Binary files a/src/objdir.osx-arm/secure/license.o and b/src/objdir.osx-arm/secure/license.o differ diff --git a/src/objdir.osx-x86/secure/license.o b/src/objdir.osx-x86/secure/license.o index 94d26dd5..27cc2b82 100644 Binary files a/src/objdir.osx-x86/secure/license.o and b/src/objdir.osx-x86/secure/license.o differ diff --git a/src/objdir.windows/secure/license.o b/src/objdir.windows/secure/license.o index a64e9df5..55040b05 100644 Binary files a/src/objdir.windows/secure/license.o and b/src/objdir.windows/secure/license.o differ diff --git a/src/piz.c b/src/piz.c index 0a5797ec..0669a7f5 100644 --- a/src/piz.c +++ b/src/piz.c @@ -1,904 +1,904 @@ -// ------------------------------------------------------------------ -// piz.c -// Copyright (C) 2019-2024 Genozip Limited. Patent Pending. -// Please see terms and conditions in the file LICENSE.txt -// -// WARNING: Genozip is proprietary, not open source software. Modifying the source code is strictly prohibited, -// under penalties specified in the license. 
- -#include "profiler.h" -#include "zfile.h" -#include "dispatcher.h" -#include "piz.h" -#include "random_access.h" -#include "regions.h" -#include "ref_iupacs.h" -#include "refhash.h" -#include "progress.h" -#include "profiler.h" -#include "stats.h" -#include "reconstruct.h" -#include "coverage.h" -#include "writer.h" -#include "threads.h" -#include "endianness.h" -#include "chrom.h" -#include "txtheader.h" -#include "base64.h" -#include "dict_io.h" -#include "user_message.h" - -TRANSLATOR_FUNC (piz_obsolete_translator) -{ - return 0; -} - -// output coordinates of current line (for error printing) - very carefully as we are in an error condition - we can't assume anything -PizDisCoords piz_dis_coords (VBlockP vb) -{ - PizDisCoords out = {}; - if (DTF(chrom) == DID_NONE || !ctx_has_value (vb, DTF(chrom))) return out; - - ContextP chrom_ctx = CTX(DTF(chrom)); - WordIndex chrom = chrom_ctx->last_value.i; - if (chrom < 0 || chrom >= chrom_ctx->word_list.len) return out; // not a valid chrom value - - STR(chrom_str); - ctx_get_snip_by_word_index (chrom_ctx, chrom, chrom_str); - if (strlen (chrom_str) > sizeof(out.s)-20) return out; - - int out_len = snprintf (out.s, sizeof (out.s), " CHROM=\"%.64s\"(%d)", str_to_printable_(STRa(chrom_str)).s, chrom); // with leading space - - if (DTF(pos) == DID_NONE || !ctx_has_value (vb, DTF(pos))) return out; - - snprintf (&out.s[out_len], sizeof (out.s)-out_len, " POS=%"PRId64, CTX(DTF(pos))->last_value.i); - return out; -} - -// output a data-type-specific id of the line (for ASSPIZ) - very carefully as we are in an error condition - we can't assume anything -PizDisQname piz_dis_qname (VBlockP vb) -{ - PizDisQname out = {}; - - if (DTF(qname) != DID_NONE && ctx_encountered_in_line (vb, DTF(qname)) && !vb->preprocessing) { - ContextP ctx = CTX(DTF(qname)); - snprintf (out.s, sizeof (out.s), " %.10s=\"%.*s\"", ctx->tag_name, MIN_(80, ctx->last_txt.len), last_txtx(vb, ctx)); - } - - return out; -} - -void asspiz_text (VBlockP vb, 
FUNCLINE) -{ - StrTextSuperLong s; - int s_len = 0; - - for (int i=0; i < vb->con_stack_len; i++) - SNPRINTF (s, "%s[%u]->", CTX(vb->con_stack[i].did_i)->tag_name, vb->con_stack[i].repeat); - - SNPRINTF (s, "%s", (vb->curr_item != DID_NONE ? CTX(vb->curr_item)->tag_name : "N/A")); - - progress_newline(); - fprintf (stderr, "%s %s: Error in %s:%u line_in_file(1-based)=%"PRId64"%s %s%s stack=%s %s: ", - str_time().s, LN_NAME, func, code_line, - writer_get_txt_line_i ((VBlockP)(vb), vb->line_i), - cond_int (Z_DT(VCF), " sample_i=", vb->sample_i), - piz_dis_coords((VBlockP)(vb)).s, piz_dis_qname((VBlockP)(vb)).s, s.s, version_str().s); -} - -bool piz_grep_match (rom start, rom after) -{ - bool found = false; - SAFE_NUL (after); - - if (!flag.grepw) { - found = !!strstr (start, flag.grep); - goto done; - } - - // case: --grepw - grep whole word - rom s = start; - while (s <= after - flag.grep_len) { - if (!(s = strstr (s, flag.grep))) break; - - char before = (s == start ? ' ' : s[-1]); - char after = s[flag.grep_len]; - - if (!IS_ALPHANUMERIC(before) && before != '_' && - !IS_ALPHANUMERIC(after) && after != '_') { - - found = true; - break; - } - - s += flag.grep_len; - } - -done: - SAFE_RESTORE; - return found; -} - -bool piz_default_skip_section (SectionType st, DictId dict_id) -{ - // --show-dict=DICT - read only the one dictionary - if (ST(DICT) && flag.show_one_dict && is_genocat && !dict_id_is_show (dict_id)) return true; // skip - - // B250, LOCAL, COUNT sections - bool skip = is_genocat && dict_id.num - && dict_id.num != DTFZ(predefined)[CHROM].dict_id.num - && ( - - // sometimes we don't need dictionaries. but we always load CHROM. 
- (flag.genocat_no_dicts && dict_id_typeless (dict_id).num != flag.show_one_counts.num) - - // if show_counts - we only need the requested section and CHROM (note: not true for dump_one_b250_dict_id, - // as we need to reconstruct to dump it) - || (flag.show_one_counts.num && dict_id_typeless (dict_id).num != flag.show_one_counts.num) - - // if --counts, we filter here - TOPLEVEL only - unless there's a skip_section function which will do the filtering - || (flag.count && !DTPZ(is_skip_section) && dict_id.num != DTFZ(toplevel).num) - ); - - skip |= flag.dont_load_ref_file && (ST(REFERENCE) || st == SEC_REF_HASH || ST(REF_IS_SET)); - - if (skip && is_genocat && dict_id.num && (dict_id.num == flag.show_singletons_dict_id.num || dict_id.num == flag.dump_one_local_dict_id.num)) - skip = false; - - return skip; -} - -static inline void piz_adjust_one_local (ContextP ctx, BufferP local_buf, LocalType *ltype, uint8_t param, bool uncompress_to_pair) -{ - const LocalTypeDesc *ltd = <_desc[*ltype]; - - ASSERT (local_buf->len % ltd->width == 0, "%s.local has %u bytes - but expecting the number of bytes to be a multiple of %u since ltype=%s", - ctx->tag_name, local_buf->len32, ltd->width, ltd->name); - - local_buf->len /= ltd->width; - - // note: in ZIP, if we're loading R1.local to localR1 just to verify identicality, then we should - // keep it in "file format" for zip_generate_local(), not "native fomrat" - if (!uncompress_to_pair || IS_PIZ || !fastq_zip_use_pair_identical (ctx->dict_id)) { - - if (*ltype == LT_BITMAP) { - local_buf->nbits = local_buf->len * 64 - param ; - LTEN_bits ((BitsP)local_buf); - } - - else if (*ltype >= LT_UINT8_TR && *ltype <= LT_UINT64_TR) - local_buf->n_cols = param; // 0 means vcf_num_samples - - if (ltd->file_to_native) - ltd->file_to_native (local_buf, ltype); // BGEN, transpose etc - updates ltype in case of Transpose, after untransposing - } -} - -// PIZ compute thread: decompress all contexts (in pair-2 of paired FASTQ: z_data contains 
contexts of both pairs) -// ZIP compute thread in FASTQ: decompress pair_1 contexts when compressing pair_2 -void piz_uncompress_all_ctxs (VBlockP vb) -{ - bool vb_is_pair_2 = is_fastq_pair_2(vb); // is this VB a pair-2 FASTQ VB (either in a FASTQ or SAM z_file) - - for_buf (uint32_t, header_offset, vb->z_section_headers) { - SectionHeaderCtxP header = (SectionHeaderCtxP)Bc(vb->z_data, *header_offset); - - bool is_local = HEADER_IS(LOCAL); - bool is_b250 = HEADER_IS(B250); - if (!is_b250 && !is_local) continue; - - ContextP ctx = ctx_get_ctx (vb, header->dict_id); // gets the context (creating it if it doesn't already exist) - - // back comp: bug observed with E2:Z in v11.0.10: OPTION_E2_Z has LOCAL section despite being an alias to SAM_E2_Z - if (ctx->is_ctx_alias && !VER(12)) { - ctx = CTX(ctx->did_i); - header->dict_id = ctx->dict_id; - } - - bool is_pair_section = vb_is_pair_2 && (BGEN32 (header->vblock_i) != vb->vblock_i); // is this a section of R1 read into an R2 vb - bool uncompress_to_pair = is_pair_section && (!header->flags.ctx.paired/*not pair-identical*/ || IS_ZIP); // ZIP: always; PIZ: if pair-assisted - - ASSERT (is_b250 || header->ltype < NUM_LTYPES, "in vb=%u ctx=%s.%s: ltype=%u >= NUM_LTYPES=%u. This can possibly be solved by upgrading Genozip to the latest version", - vb->vblock_i, ctx->tag_name, is_local ? "local" : "b250", header->ltype, NUM_LTYPES); - - // PIZ only: load normal section, or a pair-identical section of from the R1 VB - if (!uncompress_to_pair) { - - // case: buffer has already been decompressed - either during pre-processing, or - // R2 section was decompressed so no need for the pair-identical R1 section - if ((is_local && ctx->local_uncompressed) || (is_b250 && ctx->b250_uncompressed)) - continue; - - if (is_local) { - ctx->lcodec = header->codec; - ctx->ltype = header->ltype; - ctx->nothing_char = !lt_max(ctx->ltype) ? 0 // nothing char is only relevant for integer ltypes - : header->nothing_char == 0xff ? 
0 // no nothing char - : header->nothing_char == 0 ? 1 // use hard-coded logic up to (0 always, and only, appears in files up to 15.0.37) - : header->nothing_char; - } - - else { // b250 - // old logic - not clear if/why it is needed but not removed to avoid break back comp: - if (!VER(15) && !ctx->ltype) - ctx->ltype = header->ltype; - - ctx->iterator = (SnipIterator){ .next_b250 = B1ST8 (ctx->b250), .prev_word_index = WORD_INDEX_NONE }; - ctx->b250_size = header->b250_size; // note: for files<=v13, this was always 0, ie B250_BYTES_4 - } - } - - // A pair section (but only pair-assisted in PIZ) - else { - if (is_b250) { - ctx->pair_b250_iter = (SnipIterator){ .next_b250 = B1ST8 (ctx->b250R1), .prev_word_index = WORD_INDEX_NONE }; - ctx->pair_b250_size = header->b250_size; - } - else - ctx->pair_ltype = header->ltype; - } - - BufferP target_buf = uncompress_to_pair ? (is_local ? &ctx->localR1 : &ctx->b250R1) - : (is_local ? &ctx->local : &ctx->b250); - - rom target_buf_name = uncompress_to_pair ? (is_local ? "contexts->localR1" : "contexts->b250R1") - : (is_local ? 
CTX_TAG_LOCAL : CTX_TAG_B250 ); - - START_TIMER; - - zfile_uncompress_section (vb, header, target_buf, target_buf_name, BGEN32 (header->vblock_i), header->section_type); - - if (is_local && dict_id_typeless (ctx->dict_id).num == flag.show_singletons_dict_id.num && !is_pair_section) - dict_io_show_singletons (vb, ctx); - - if (is_local && dict_id_typeless (ctx->dict_id).num == flag.dump_one_local_dict_id.num && !is_pair_section) - ctx_dump_binary (vb, ctx, true); - - if (!is_local && dict_id_typeless (ctx->dict_id).num == flag.dump_one_b250_dict_id.num && !is_pair_section) - ctx_dump_binary (vb, ctx, false); - - // BGEN32, transpose, fix len - if (is_local && uncompress_to_pair) - piz_adjust_one_local (ctx, &ctx->localR1, &ctx->pair_ltype, header->param, true); - - else if (is_local && !uncompress_to_pair) - piz_adjust_one_local (ctx, &ctx->local, &ctx->ltype, header->param, false); - - if (is_local && !is_pair_section) - ctx->local_uncompressed = true; - - else if (!is_local && !is_pair_section) // b250 - ctx->b250_uncompressed = true; - - if (!uncompress_to_pair/*added this condition in v15*/ && - ((VER(14) && ctx->ltype != LT_BITMAP) || // starting v14: assign to all except LT_BITMAP (in which param is used to determine nbits) - (!VER(14) && header->flags.ctx.v13_copy_local_param))) // up to v13: copy if v13_copy_local_param is set - target_buf->prm8[0] = header->param; - - if (flag.debug_read_ctxs) - iprintf ("%c Uncompressed %s: %s[%u].len=%u into %s\n", sections_read_prefix (is_pair_section || vb->preprocessing), - VB_NAME, ctx->tag_name, ctx->did_i, target_buf->len32, target_buf_name); - } - - if (IS_PIZ) { - // initialize history buffer (eg for SAM buddy) - for_ctx_that (ctx->flags.store_per_line || ctx->flags.spl_custom) - switch (ctx->flags.store) { - // we zero the history, bc when seg compares to a dl value for a field that didn't exist, - // it sees 0. It might seg against that 0. So we need history to be 0 too. 
- case STORE_INT : buf_alloc_exact_zero (vb, ctx->history, vb->lines.len, int64_t, "history"); break; - case STORE_FLOAT : buf_alloc_exact_zero (vb, ctx->history, vb->lines.len, double, "history"); break; - case STORE_INDEX : buf_alloc_exact_zero (vb, ctx->history, vb->lines.len, WordIndex, "history"); break; - default : buf_alloc_exact_zero (vb, ctx->history, vb->lines.len, HistoryWord, "history"); break; - } - - // prepare context index - for_ctx - vb->ctx_index[ctx->did_i] = (ContextIndex){ .did_i = ctx->did_i, .dict_id = ctx->dict_id }; - - qsort (vb->ctx_index, vb->num_contexts, sizeof (ContextIndex), sort_by_dict_id); - - vb->has_ctx_index = true; - } - - if (flag.debug_or_test) buflist_test_overflows(vb, __FUNCTION__); -} - -// PIZ compute thread entry point -static void piz_reconstruct_one_vb (VBlockP vb) -{ - START_TIMER; - - ASSERTNOTNULL (vb); - ASSERT (vb->vblock_i, "vb->vblock_i is 0: vb->compute_thread_id=%d pthread=%"PRIu64, - vb->compute_thread_id, (uint64_t)pthread_self()); - - ASSERT (!flag.reference || ref_is_loaded (gref) || flag.dont_load_ref_file, - "%s: reference is not loaded correctly", VB_NAME); - - ASSERT (vb->recon_size >= 0, "Invalid vb->recon_size=%d", vb->recon_size); - - // note: txt_data is fully allocated in advance and cannot be extended mid-reconstruction (container_reconstruct and possibly others rely on this) - #define OVERFLOW_SIZE (1 MB) // allow some overflow space as sometimes we reconstruct unaccounted for data: 1. container templates 2. 
reconstruct_peek and others - - buf_alloc (vb, &vb->txt_data, 0, - vb->recon_size * vb->translation.factor/*see TRANSLATIONS*/ + OVERFLOW_SIZE, - char, 1.1, "txt_data"); - - piz_uncompress_all_ctxs (vb); - - DT_FUNC (vb, piz_recon_init)(vb); - - // reconstruct from top level snip - Did top_level_did_i = ctx_get_existing_did_i (vb, vb->translation.toplevel); - reconstruct_from_ctx (vb, top_level_did_i, 0, true); - - ASSERT (!vb->con_stack_len, "%s: Expecting container stack to be empty, but con_stack_len=%u", VB_NAME, vb->con_stack_len); - - // calculate the digest contribution of this VB, and the digest snapshot of this VB - // note: if we have generated components from which lines might be inserted into the VB - we verify in writer instead - // note: for Deep with gencomp - the SAM components are verified in writer, while the FASTQ components are verified here. - if (piz_need_digest && (!z_has_gencomp || VB_DT(FASTQ)) && !(flag.deep_fq_only && !VB_DT(FASTQ))) - digest_one_vb (vb, true, NULL); // LOOKING FOR A DEADLOCK BUG? CHECK HERE - - if (DTP(piz_after_recon)) DTP(piz_after_recon)(vb); - - vb_set_is_processed (vb); /* tell dispatcher this thread is done and can be joined. this operation needn't be atomic, but it likely is anyway */ - - if (flag.debug_or_test) buflist_test_overflows(vb, __FUNCTION__); - - COPY_TIMER (compute); -} - -static void piz_initialize_ctx_flags_from_vb_1 (VBlockP vb) -{ - // ctx.flags defaults to vb_i=1 flags, overridden if a b250 or local section is read. this will not be overridden if all_the_same, i.e. no b250/local sections. 
- // note: we use section_list_save and not section_list_buf, because the latter might not contain vb=1, if removed by writer_create_plan - Section vb_1_first_sec = B(SectionEnt, z_file->section_list_save, z_file->section_list_save.prm32[0]); - Section vb_1_last_sec = B(SectionEnt, z_file->section_list_save, z_file->section_list_save.prm32[1]); - - for (Section sec = vb_1_first_sec+1; sec <= vb_1_last_sec; sec++) { - ContextP ctx = ECTX (sec->dict_id); // will exist if it has a dict (all_the_same sections always have a dict) - if (ctx) { - ctx->flags = sec->flags.ctx; - ctx->flags.paired = false; // flags.paired is VB-specific and is not inherited - } - } -} - -void piz_read_all_ctxs (VBlockP vb, Section *sec/* VB_HEADER section */, bool is_pair_data) -{ - START_TIMER; - - for ((*sec)++; (*sec)->st == SEC_B250 || (*sec)->st == SEC_LOCAL; (*sec)++) { - ASSERT (is_pair_data || vb->vblock_i == (*sec)->vblock_i, "expecting vb->vblock_i=%u == sec->vblock_i=%u", - vb->vblock_i, (*sec)->vblock_i); // sanity - - // create a context even if section is skipped, for containers to work (skipping a section should be mirrored in a container filter) - ContextP zctx = ctx_get_existing_zctx ((*sec)->dict_id); - ContextP vctx = CTX(zctx->did_i); // in PIZ z and vb contexts always have same did_i. This is also true for ZIP of R2, bc context was created by R1 and overlayed on this R2 VB. - bool is_local = ((*sec)->st == SEC_LOCAL); - - // don't assert for <=v11 due to bug (see comment in piz_uncompress_all_ctxs) - ASSERT (!zctx->is_ctx_alias || !VER(12), "Found a %s section of %s, this is unexpected because %s is an alias (of %s)", - st_name((*sec)->st), zctx->tag_name, zctx->tag_name, ZCTX(zctx->did_i)->tag_name); - - // if we're a FASTQ R2 VB loading R1 data, decide if we need to load this section - bool pair_assisted=false, pair_identical=false, skip_R1=false; - if (is_pair_data) { - // note: pair_assisted available since early versions. 
when loading old versions, flags are fixed in sections_list_file_to_memory_format to be consistent with current version. - pair_assisted = (IS_ZIP && fastq_zip_use_pair_assisted ((*sec)->dict_id, (*sec)->st)) || - (IS_PIZ && is_local && vctx->pair_assist_type == SEC_LOCAL) || // R2 section indicated that recon requires assistence of R1 data - (IS_PIZ && !is_local && vctx->pair_assist_type == SEC_B250); - - // note: pair_identical was introduced in v15 - pair_identical = (IS_ZIP && fastq_zip_use_pair_identical ((*sec)->dict_id)) || - (IS_PIZ && is_local && !vctx->local_in_z && ((*sec)->flags.ctx.paired)) || // R2 section is missing and R1 is willing to take its stead - (IS_PIZ && !is_local && !vctx->b250_in_z && ((*sec)->flags.ctx.paired)); - - if (!pair_assisted && !pair_identical) skip_R1 = true; // this R1 section is not needed by R2 - - ASSERT (!pair_assisted || !pair_identical, "%s: %s.%s is invalidly both pair_assisted and pair_identical", - VB_NAME, vctx->tag_name, is_local ? "local" : "b250"); - } - - uint32_t section_start = vb->z_data.len32; - int32_t offset = skip_R1 ? SECTION_SKIPPED - : zfile_read_section (z_file, vb, (*sec)->vblock_i, &vb->z_data, "z_data", (*sec)->st, *sec); // returns 0 if section is skipped - - bool section_read = (offset != SECTION_SKIPPED); // section could be skipped either bc of skip_R1 or piz_is_skip_section() called from zfile_read_section - - if (section_read) { - BNXT32 (vb->z_section_headers) = section_start; - - if (!is_pair_data) { - if (is_local) vctx->local_in_z = true; - else vctx->b250_in_z = true; - - // note: |= (instead of =) to overcome bug in ZIP --pair in some versions <= 13 (lost track of which): - // local sections with junk data created in addition to the expected b250: if both b250 and local indicate pair_assist, we take the b250. 
- if ((*sec)->flags.ctx.paired && is_fastq_pair_2(vb)) - vctx->pair_assist_type = (*sec)->st; // set when reading R2 sections, consumed when reading pair R1 sections - } - - if (pair_assisted || (pair_identical && IS_ZIP)) - vctx->pair_flags = (*sec)->flags.ctx; - else - vctx->flags = (*sec)->flags.ctx; // override flags inherited from vb=1 and possibly the other B250/LOCAL section - } - - // note: vctx->is_loaded possibly already true if it has a dictionary - set in ctx_overlay_dictionaries_to_vb - and now sets to false if section is skipped - if (IS_PIZ && !pair_assisted && !skip_R1) - vctx->is_loaded = section_read; - - if (flag.debug_read_ctxs) { - if (section_read) - sections_show_header ((SectionHeaderP)Bc (vb->z_data, section_start), NULL, (*sec)->offset, sections_read_prefix (is_pair_data || vb->preprocessing)); - else - iprintf ("%c Skipped loading %s/%u %s.%s\n", sections_read_prefix (is_pair_data || vb->preprocessing), - comp_name((*sec)->comp_i), vb->vblock_i, zctx->tag_name, st_name ((*sec)->st)); - } - } - - if (flag.debug_or_test) buflist_test_overflows(vb, __FUNCTION__); - - COPY_TIMER (piz_read_all_ctxs); -} - -// Called by PIZ main thread: read all the sections at the end of the file, before starting to process VBs -DataType piz_read_global_area (Reference ref) -{ - START_TIMER; - - bool success = zfile_read_genozip_header (0, SOFT_FAIL); // already read if normal file, but not if auxilliary file - - if (flag.show_stats) { - stats_read_and_display(); - if (is_genocat) return DT_NONE; - } - - user_message_display(); - - if (!success) return DT_NONE; - - if (flags_writer_counts()) goto done; - - // check if the genozip file includes a reference - bool has_ref_sections = !!sections_last_sec (SEC_REFERENCE, SOFT_FAIL); - - ASSERTW (!has_ref_sections || !IS_REF_EXTERNAL || flag.reading_reference, - "FYI: ignoring reference file %s because %s was not compressed with --reference", ref_get_filename (ref), z_name); - - if (!flag.reading_reference && 
has_ref_sections) { - ref_destroy_reference (ref); // destroy an old reference, if one is loaded - flag.reference = REF_STORED; // possibly override REF_EXTERNAL (it will be restored for the next file in ) - } - - // read all dictionaries - CHROM/RNAME is needed for regions_make_chregs(). - // Note: some dictionaries are skipped based on skip() and all flag logic should implemented there - dict_io_read_all_dictionaries(); - - if (!flag.header_only) { - // mapping of the file's chroms to the reference chroms (for files originally compressed with REF_EXTERNAL/EXT_STORE and have alternative chroms) - chrom_2ref_load (ref); - - ref_contigs_load_contigs (ref); // note: in case of REF_EXTERNAL, reference is already pre-loaded - } - - // if the user wants to see only the header, we can skip regions and random access - if (!flag.header_only) { - - ctx_read_all_counts(); // read all SEC_COUNTS sections - - ctx_read_all_subdicts(); // read all SEC_SUBDICTS sections - - // update chrom node indices using the CHROM dictionary, for the user-specified regions (in case -r/-R were specified) - if (flag.regions) - regions_make_chregs (ZCTX(DTFZ(chrom))); - - // if the regions are negative, transform them to the positive complement instead - regions_transform_negative_to_positive_complement(); - - // if this is a stored reference we load the reference random access that will determined which reference sections - // should be read & uncompressed in case of --regions. - // note: in case of a data file with stored reference - SEC_REF_RAND_ACC will contain the random access of the reference - // and SEC_RANDOM_ACCESS will contain the random access of the data. In case of a .ref.genozip file, both sections exist - // and are identical. It made the coding easier and their size is negligible. - random_access_load_ra_section (SEC_RANDOM_ACCESS, DTFZ(chrom), &z_file->ra_buf, "z_file->ra_buf", - !flag.show_index ? 
NULL : RA_MSG_PRIM); - - random_access_load_ra_section (SEC_REF_RAND_ACC, CHROM, ref_get_stored_ra (ref), "ref_stored_ra", - flag.show_ref_index && !flag.reading_reference ? RA_MSG_REF : NULL); - - if (IS_REF_CHROM2REF && !flag.reading_reference && !flag.genocat_no_reconstruct) - // xxx is this actually used? - chrom_index_by_name (CHROM); // create alphabetically sorted index for user file (not reference) chrom word list - - // case: reading reference file - if (flag.reading_reference) { - - // when reading the reference for genocat --coverage/idxstats, don't need the actual REF sections - if (is_genocat && (flag.show_coverage || flag.idxstats)) - goto done; - - bool ref_loaded_from_disk = !flag.dont_load_ref_file && ref_load_stored_reference (ref); - - // load the IUPACs list of the reference (rare non-ACGT "bases") - ref_iupacs_load (ref); - - // load the refhash, if we are compressing FASTA or FASTQ, or if user requested to see it - if ( (primary_command == ZIP && flag.aligner_available) || - (flag.show_ref_hash && is_genocat) || - ref_cache_is_populating (ref)) - refhash_load (ref); - - // exit now if all we wanted was just to see the reference (we've already shown it) - if ((flag.show_reference || flag.show_is_set || flag.show_ref_hash) && is_genocat) exit_ok; - - if (ref_loaded_from_disk) - progress_finalize_component ("Done"); - } - - // case: non-reference file has stored reference sections - else if (has_ref_sections) { - if (!flag.dont_load_ref_file) { - ref_load_stored_reference (gref); - - // exit now if all we wanted was just to see the reference (we've already shown it) - if ((flag.show_reference || flag.show_is_set || flag.show_ref_hash) && is_genocat) exit_ok; - } - } - } - -done: - COPY_TIMER_EVB (piz_read_global_area); - - return z_file->data_type; -} - -// main thread -bool piz_read_one_vb (VBlockP vb, bool for_reconstruction) -{ - START_TIMER; - - Section sec = sections_vb_header (vb->vblock_i); - - int32_t vb_header_offset = zfile_read_section 
(z_file, vb, vb->vblock_i, &vb->z_data, "z_data", SEC_VB_HEADER, sec); - ASSERT0 (vb_header_offset >= 0, "Unexpectedly VB_HEADER section was skipped"); - - SectionHeaderVbHeader header = *(SectionHeaderVbHeaderP)Bc (vb->z_data, vb_header_offset); // copy of header as it will be overwritten in piz_read_all_ctxs - - // any of these might be overridden by callback - vb->flags = header.flags.vb_header; - vb->recon_size = BGEN32 (header.recon_size); - vb->longest_line_len = BGEN32 (header.longest_line_len); - vb->longest_seq_len = VER(15) ? BGEN32 (header.longest_seq_len) : 0; - vb->expected_digest = header.digest; - vb->chrom_node_index = WORD_INDEX_NONE; - vb->lines.len = VER(14) ? sec->num_lines : BGEN32 (header.v13_top_level_repeats); - vb->comp_i = sec->comp_i; - vb->show_containers = (flag.show_containers == SHOW_CONTAINERS_ALL_VBs || flag.show_containers == vb->vblock_i); // a per-VB value bc in SAM Load-Prim VBs =false vs normal VBs have the flag value (set in sam_piz_dispatch_one_load_sag_vb) - - if (txt_file) { // sometimes we don't have a txtfile, eg when genocat is used with some flags that emit other data, no the file - vb->vb_position_txt_file = txt_file->txt_data_so_far_single_0; // position in original txt file (before any ZIP or PIZ modifications) - txt_file->num_lines += vb->lines.len; // source file lines - } - - // in case of unbind, the vblock_i in the 2nd+ component will be different than that assigned by the dispatcher - // because the dispatcher is re-initialized for every txt component - if (flag.unbind) vb->vblock_i = BGEN32 (header.vblock_i); - - if (flag_is_show_vblocks (PIZ_TASK_NAME)) - iprintf ("READING(id=%d) vb=%s num_lines=%u recon_size=%u genozip_size=%u longest_line_len=%u\n", - vb->id, VB_NAME, vb->lines.len32, vb->recon_size, BGEN32 (header.z_data_bytes), vb->longest_line_len); - - ctx_overlay_dictionaries_to_vb (VB); // overlay all dictionaries to the vb - - buf_alloc (vb, &vb->z_section_headers, MAX_DICTS * 2, 0, uint32_t, 0, 
"z_section_headers"); // room for section headers - - BNXT32 (vb->z_section_headers) = vb_header_offset; // vb_header_offset is always 0 for VB header - - piz_initialize_ctx_flags_from_vb_1 (vb); - - DT_FUNC (vb, piz_before_read)(vb); - - // read all b250 and local of all fields and subfields - piz_read_all_ctxs (vb, &sec, false); - - bool ok_to_compute = DT_FUNC_OPTIONAL (vb, piz_init_vb, true)(vb, &header); - - vb->translation = dt_get_translation (vb); // must be after piz_init_vb, as in VCF we set vb->vb_chords there, needed for dt_get_translation - - if (txt_file) - txt_file->txt_data_so_far_single_0 += BGEN32 (header.recon_size); // cumulative expected recon size without piz-side modifications - - if (ok_to_compute && for_reconstruction && flag.collect_coverage) - coverage_initialize (vb); - - COPY_TIMER (piz_read_one_vb); - - return ok_to_compute; -} - -static void piz_handover_or_discard_vb (Dispatcher dispatcher, VBlockP *vb) -{ - bool is_handed_over = false; - - if ((*vb)->preprocessing) - DT_FUNC (z_file, piz_after_preproc)(*vb); - - else if (!flag.no_writer_thread) // note: in SAM with gencomp - writer does the digest calculation - is_handed_over = writer_handover_data (vb); - - if (!is_handed_over && !(*vb)->preprocessing && - (!flag.one_component || writer_does_vb_need_write ((*vb)->vblock_i))) - txt_file->txt_data_so_far_single += (*vb)->txt_data.len; // note: if writing (or SAM with gencomp), this is done in writer_flush_vb, caputring the processing in writer too - - dispatcher_recycle_vbs (dispatcher, !is_handed_over); // don't release VB if handed over - it will be released in writer_release_vb when writing is completed -} - -// returns false if VB was dispatched, and true if vb was skipped -static void piz_dispatch_one_vb (Dispatcher dispatcher, Section sec) -{ - VBlockP next_vb = dispatcher_generate_next_vb (dispatcher, sec->vblock_i, sec->comp_i); - - // read one VB's data from z_file - ReconType reconstruct = piz_read_one_vb (next_vb, true) && 
// read even if no_reconstruct - !flag.genocat_no_reconstruct; - - if (reconstruct) { - if (flag_is_show_vblocks (PIZ_TASK_NAME)) - iprintf ("BEFORE_COMPUTE(id=%d) vb=%s/%u num_running_compute_threads(before)=%u\n", - next_vb->id, comp_name(next_vb->comp_i), next_vb->vblock_i, dispatcher_get_num_running_compute_threads(dispatcher)); - - dispatcher_compute (dispatcher, piz_reconstruct_one_vb); - dispatcher_increment_progress ("read", 1); // done reading - } - - // case: we won't proceed to uncompressing, reconstructing we're done reading - just handover - // an empty VB as it appears in the recon plan, and writer might be already blocking on waiting for it - else { - dispatcher_increment_progress ("all_no_reconstruct", 3); // done reading, skipped reconstructing and writing - dispatcher_abandon_next_vb (dispatcher); // just moves the to processed_vb so dispatcher_recycle_vbs can recycle it - piz_handover_or_discard_vb (dispatcher, &next_vb); - } -} - -// main thread: usually called in order of VBs, but out-of-order if --test with no writer -static void piz_handle_reconstructed_vb (Dispatcher dispatcher, VBlockP vb, uint64_t *num_nondrop_lines) -{ - // verify that files are the same size, unless we intended to modify the data - ASSERTW (Ltxt == vb->recon_size || flag.piz_txt_modified || (flag.deep_fq_only && !VB_DT(FASTQ)), - "Warning: vblock_i=%s/%u (num_lines=%u) had %s bytes in the original %s file but %s bytes in the reconstructed file (diff=%d)", - comp_name (vb->comp_i), vb->vblock_i, vb->lines.len32, str_int_commas (vb->recon_size).s, dt_name (txt_file->data_type), - str_int_commas (Ltxt).s, - (int32_t)Ltxt - (int32_t)vb->recon_size); - - *num_nondrop_lines += vb->num_nondrop_lines; - if (flag.count == CNT_VBs) - iprintf ("vb=%s lines=%u nondropped_lines=%u txt_data.len=%u\n", - VB_NAME, vb->lines.len32, vb->num_nondrop_lines, Ltxt); - - DT_FUNC (vb, piz_process_recon)(vb); - - z_file->txt_data_so_far_single += vb->recon_size; - - piz_handover_or_discard_vb 
(dispatcher, &vb); -} - -// dispatcher of PIZ of main (i.e. not auxilliary) files -static Dispatcher main_dispatcher = NULL; -void piz_set_main_dispatcher (Dispatcher dispatcher) -{ - main_dispatcher = dispatcher; -} - -// allow out of order joining of VBs (to reverse non-allowing set in dispatcher_init) -void piz_allow_out_of_order (void) -{ - ASSERTNOTNULL (main_dispatcher); - dispatcher_allow_out_of_order (main_dispatcher); -} - -static uint64_t piz_target_progress (CompIType comp_i) -{ - if (comp_i == COMP_MAIN && Z_DT(SAM)) - return 3 * sections_get_num_vbs_(SAM_COMP_MAIN, SAM_COMP_DEPN) + sections_get_num_vbs (SAM_COMP_PRIM); // VBs pre-processed - - else if (Z_DT(FASTQ) && flag.interleaved) - return 3 * sections_get_num_vbs_(FQ_COMP_R1, FQ_COMP_R2); - - else if (Z_DT(SAM) && flag.deep && flag.interleaved) - return 3 * sections_get_num_vbs_(SAM_COMP_FQ00, SAM_COMP_FQ01); - - else { - if (comp_i == COMP_NONE) comp_i = COMP_MAIN; - return 3 * sections_get_num_vbs_(comp_i, comp_i); - } -} - -Dispatcher piz_z_file_initialize (void) -{ - // read all non-VB non-TxtHeader sections - DataType data_type = piz_read_global_area (gref); - if (data_type == DT_NONE || flag.reading_reference) - return NULL; // no components in this file (as is always the case with reference files) - - if (flag.genocat_global_area_only) return NULL; - - if (!flag_loading_auxiliary && DTPZ(piz_after_global_area)) // must be before writer_create_plan messes up the section list - DTPZ(piz_after_global_area)(); - - if (!writer_create_plan()) - return NULL; // --count, and it was reported already - - if (flag.test || flag.md5) - ASSINP0 (dt_get_translation(NULL).is_src_dt, "Error: --test or --md5 cannot be used when converting a file to another format"); - - // note: if --unbind, we will recalculate the target progress in dispatcher_resume() - Dispatcher dispatcher = dispatcher_init (flag.reading_reference ? 
"piz-ref" : PIZ_TASK_NAME, // also referred to in dispatcher_recycle_vbs() - PREPROCESSING_TASK_NAME, - POOL_MAIN, - flag.xthreads ? 1 : global_max_threads, 0, - flag.test && flag.no_writer_thread, // out-of-order if --test with no writer thread (note: SAM gencomp always have writer thread to do digest). - flag.test, - flag.out_filename ? flag.out_filename : txtheader_get_txt_filename_from_section().s, - piz_target_progress (COMP_MAIN), - 0); - - return dispatcher; -} - -// main thread: called once per txt_file created: i.e. once, except if unbinding a paired FASTQ, or a Deep file. -// returns true if piz completed, false if piz aborted by piz_initialize -bool piz_one_txt_file (Dispatcher dispatcher, bool is_first_z_file, bool is_last_z_file, - CompIType first_comp_i, CompIType last_comp_i, // COMP_NONE unless flag.unbind - bool allow_skip_cleaning) -{ - dispatcher_start_wallclock(); - - recon_stack_initialize(); - - if (DTPZ(piz_initialize) && !DTPZ(piz_initialize)(first_comp_i)) - return false; // abort PIZ if piz_initialize says so - - bool header_only_file = true; // initialize - true until we encounter a VB header - uint64_t num_nondrop_lines = 0; - - // note: may be NULL if txt_header was removed by writer, eg when loading auxillary files - Section txt_header_sec = (first_comp_i != COMP_NONE) ? 
sections_get_comp_txt_header_sec (first_comp_i) : NULL; - - Section sec = sections_one_before (txt_header_sec); - - // traverse section list as re-arranged by writer_create_plan - while (!dispatcher_is_done (dispatcher)) { - - bool achieved_something = false; - - // we're pre-processing data (SAM: loading sag) - if (flag.preprocessing && dispatcher_has_free_thread (dispatcher) && !vb_pool_is_full (POOL_MAIN)) - achieved_something = DTPZ(piz_preprocess)(dispatcher); - - // In input is not exhausted, and a compute thread is available - read a vblock and dispatch it - else if (!dispatcher_is_input_exhausted (dispatcher) && dispatcher_has_free_thread (dispatcher) && !vb_pool_is_full (POOL_MAIN)) { - achieved_something = true; - - bool found_header = sections_next_sec2 (&sec, SEC_TXT_HEADER, SEC_VB_HEADER); - bool is_sec_in_comp = (first_comp_i==COMP_NONE || (sec->comp_i >= first_comp_i && sec->comp_i <= last_comp_i)); - - // case SEC_TXT_HEADER - if (found_header && sec->st == SEC_TXT_HEADER && is_sec_in_comp) { - if (sec->vblock_i >= 2) continue; // fragments >= 2 were already handled together with the first fragment - - // note: also starts writer, and if unbinding, also opens the txt file and hands data over to the writer - txtheader_piz_read_and_reconstruct (sec); - - // case --unbind: unpausing after previous txt_file pause (requires txt file to be open) - uint64_t target_progress = piz_target_progress (first_comp_i ? 
first_comp_i : COMP_MAIN); - dispatcher_resume (dispatcher, target_progress); - } - - // case SEC_VB_HEADER - else if (found_header && sec->st == SEC_VB_HEADER && is_sec_in_comp) { - - if (!writer_does_vb_need_recon (sec->vblock_i)) { - dispatcher_increment_progress ("vb_no_recon", 3); // skipped reading, reconstructing, writing - - if (flag_is_show_vblocks (PIZ_TASK_NAME)) - iprintf ("SKIPPED vb=%s/%u\n", comp_name (sec->comp_i), sec->vblock_i); - continue; - } - - piz_dispatch_one_vb (dispatcher, sec); - header_only_file = false; - } - - // case: we're done with this txt_file (either no header bc EOF, or TXT_HEADER belongs to the next txt_file when unbinding) - else { - if (flag_is_show_vblocks (PIZ_TASK_NAME)) - iprintf ("INPUT EXHAUSTED - no more SEC_VB_HEADER or SEC_TXT_HEADER for txt_file_i=%u\n", z_file->num_txts_so_far); - - dispatcher_set_no_data_available (dispatcher, false, DATA_EXHAUSTED); - - if (header_only_file) - dispatcher_recycle_vbs (dispatcher, true); // note: this is normally done in piz_handover_or_discard_vb - } - } - - // if the next thread (usually in order or VBs in recon_plan, but out-of-order if --test with no writer) is ready, handle the reconstructed VB - VBlockP vb = dispatcher_get_processed_vb (dispatcher, NULL, false); // non-blocking - if (vb) { - if (flag_is_show_vblocks (PIZ_TASK_NAME)) - iprintf ("AFTER_COMPUTE(task=piz id=%d) vb=%s/%u num_running_compute_threads(after)=%u\n", - vb->id, comp_name(vb->comp_i), vb->vblock_i, dispatcher_get_num_running_compute_threads(dispatcher)); - - dispatcher_increment_progress ("preproc_or_recon", 1 + (!vb->preprocessing && flag.no_writer_thread)); // done preprocessing or reconstructing (+1 if skipping writing) - if (!vb->preprocessing) - piz_handle_reconstructed_vb (dispatcher, vb, &num_nondrop_lines); - else - piz_handover_or_discard_vb (dispatcher, &vb); - } - - if (!achieved_something) { - START_TIMER; - usleep (30000); // nothing for us to do right now - wait 30ms - COPY_TIMER_EVB 
(piz_main_loop_idle); - } - } - - // make sure memory writes by compute threads are visible to the main thread - __atomic_thread_fence (__ATOMIC_ACQUIRE); - - if (flag_is_show_vblocks (PIZ_TASK_NAME)) - iprintf ("DISPATCHER is done for txt_file_i=%u: %s\n", z_file->num_txts_so_far, txt_file ? txt_file->name : "(no filename)"); - - dispatcher_calc_avg_compute_vbs (dispatcher); - - z_file->num_txts_so_far++; - - // finish writing the txt_file (note: the writer thread also calculates digest in SAM/BAM with PRIM/DEPN) - writer_finish_writing (z_file->num_txts_so_far == z_file->num_txt_files || is_genocat); - - // verify amount of data written (if writing) or reconstructed (if --test) sizes adds up as expected - ASSINP (txt_file->txt_data_so_far_single/*accumulated when reconstructing/writing*/ == - txt_file->txt_data_so_far_single_0/*accummulated from section headers */ || flag.piz_txt_modified, - "Data integrity error: Size of original file (without source compression) was %"PRIu64", but reconstructed file is %"PRIu64, - txt_file->txt_data_so_far_single_0, txt_file->txt_data_so_far_single); - - // verifies reconstructed file against MD5 or Adler2 and/or codec_args (if bgzf) - if (piz_need_digest) - digest_piz_verify_one_txt_file (z_file->num_txts_so_far - 1); - - progress_finalize_component_time ("Done", DIGEST_NONE); - - // --sex and --coverage - output results - if (txt_file && !flag_loading_auxiliary) { - if (flag.show_coverage) coverage_show_coverage(); - if (flag.idxstats) coverage_show_idxstats(); - if (flag.count == CNT_TOTAL) iprintf ("%"PRIu64"\n", num_nondrop_lines); - } - - if (is_genocat || (z_file->num_txts_so_far == z_file->num_txt_files)) // genocat always produces exactly one txt file - dispatcher_finish (&dispatcher, NULL, !is_last_z_file || flag.test, - flag.show_memory && is_last_z_file); - else - dispatcher_pause (dispatcher); // we're unbinding and still have more txt_files - - if (txt_file) - DT_FUNC (txt_file, piz_finalize) (is_last_z_file); - - 
if (flag_is_show_vblocks (PIZ_TASK_NAME)) - iprintf ("Finished PIZ of %s\n", txt_file ? txt_file->name : "(no filename)"); - - // if we're loading an aux file for ZIP - destroy VBs as contexts are unions of ZIP and PIZ - if (primary_command == ZIP && flag_loading_auxiliary) - vb_destroy_pool_vbs (POOL_MAIN, true); - - file_close (&txt_file); - - if (flag.show_time && ((flag.show_time_comp_i >= first_comp_i && flag.show_time_comp_i <= last_comp_i) || - (first_comp_i == COMP_NONE && flag.show_time_comp_i != COMP_ALL))) - profiler_add_evb_and_print_report(); - - return true; -} +// ------------------------------------------------------------------ +// piz.c +// Copyright (C) 2019-2024 Genozip Limited. Patent Pending. +// Please see terms and conditions in the file LICENSE.txt +// +// WARNING: Genozip is proprietary, not open source software. Modifying the source code is strictly prohibited, +// under penalties specified in the license. + +#include "profiler.h" +#include "zfile.h" +#include "dispatcher.h" +#include "piz.h" +#include "random_access.h" +#include "regions.h" +#include "ref_iupacs.h" +#include "refhash.h" +#include "progress.h" +#include "profiler.h" +#include "stats.h" +#include "reconstruct.h" +#include "coverage.h" +#include "writer.h" +#include "threads.h" +#include "endianness.h" +#include "chrom.h" +#include "txtheader.h" +#include "base64.h" +#include "dict_io.h" +#include "user_message.h" + +TRANSLATOR_FUNC (piz_obsolete_translator) +{ + return 0; +} + +// output coordinates of current line (for error printing) - very carefully as we are in an error condition - we can't assume anything +PizDisCoords piz_dis_coords (VBlockP vb) +{ + PizDisCoords out = {}; + if (DTF(chrom) == DID_NONE || !ctx_has_value (vb, DTF(chrom))) return out; + + ContextP chrom_ctx = CTX(DTF(chrom)); + WordIndex chrom = chrom_ctx->last_value.i; + if (chrom < 0 || chrom >= chrom_ctx->word_list.len) return out; // not a valid chrom value + + STR(chrom_str); + 
ctx_get_snip_by_word_index (chrom_ctx, chrom, chrom_str); + if (strlen (chrom_str) > sizeof(out.s)-20) return out; + + int out_len = snprintf (out.s, sizeof (out.s), " CHROM=\"%.64s\"(%d)", str_to_printable_(STRa(chrom_str)).s, chrom); // with leading space + + if (DTF(pos) == DID_NONE || !ctx_has_value (vb, DTF(pos))) return out; + + snprintf (&out.s[out_len], sizeof (out.s)-out_len, " POS=%"PRId64, CTX(DTF(pos))->last_value.i); + return out; +} + +// output a data-type-specific id of the line (for ASSPIZ) - very carefully as we are in an error condition - we can't assume anything +PizDisQname piz_dis_qname (VBlockP vb) +{ + PizDisQname out = {}; + + if (DTF(qname) != DID_NONE && ctx_encountered_in_line (vb, DTF(qname)) && !vb->preprocessing) { + ContextP ctx = CTX(DTF(qname)); + snprintf (out.s, sizeof (out.s), " %.10s=\"%.*s\"", ctx->tag_name, MIN_(80, ctx->last_txt.len), last_txtx(vb, ctx)); + } + + return out; +} + +void asspiz_text (VBlockP vb, FUNCLINE) +{ + StrTextSuperLong s; + int s_len = 0; + + for (int i=0; i < vb->con_stack_len; i++) + SNPRINTF (s, "%s[%u]->", CTX(vb->con_stack[i].did_i)->tag_name, vb->con_stack[i].repeat); + + SNPRINTF (s, "%s", (vb->curr_item != DID_NONE ? CTX(vb->curr_item)->tag_name : "N/A")); + + progress_newline(); + fprintf (stderr, "%s %s: Error in %s:%u line_in_file(1-based)=%"PRId64"%s %s%s stack=%s %s: ", + str_time().s, LN_NAME, func, code_line, + writer_get_txt_line_i ((VBlockP)(vb), vb->line_i), + cond_int (Z_DT(VCF), " sample_i=", vb->sample_i), + piz_dis_coords((VBlockP)(vb)).s, piz_dis_qname((VBlockP)(vb)).s, s.s, version_str().s); +} + +bool piz_grep_match (rom start, rom after) +{ + bool found = false; + SAFE_NUL (after); + + if (!flag.grepw) { + found = !!strstr (start, flag.grep); + goto done; + } + + // case: --grepw - grep whole word + rom s = start; + while (s <= after - flag.grep_len) { + if (!(s = strstr (s, flag.grep))) break; + + char before = (s == start ? 
' ' : s[-1]); + char after = s[flag.grep_len]; + + if (!IS_ALPHANUMERIC(before) && before != '_' && + !IS_ALPHANUMERIC(after) && after != '_') { + + found = true; + break; + } + + s += flag.grep_len; + } + +done: + SAFE_RESTORE; + return found; +} + +bool piz_default_skip_section (SectionType st, DictId dict_id) +{ + // --show-dict=DICT - read only the one dictionary + if (ST(DICT) && flag.show_one_dict && is_genocat && !dict_id_is_show (dict_id)) return true; // skip + + // B250, LOCAL, COUNT sections + bool skip = is_genocat && dict_id.num + && dict_id.num != DTFZ(predefined)[CHROM].dict_id.num + && ( + + // sometimes we don't need dictionaries. but we always load CHROM. + (flag.genocat_no_dicts && dict_id_typeless (dict_id).num != flag.show_one_counts.num) + + // if show_counts - we only need the requested section and CHROM (note: not true for dump_one_b250_dict_id, + // as we need to reconstruct to dump it) + || (flag.show_one_counts.num && dict_id_typeless (dict_id).num != flag.show_one_counts.num) + + // if --counts, we filter here - TOPLEVEL only - unless there's a skip_section function which will do the filtering + || (flag.count && !DTPZ(is_skip_section) && dict_id.num != DTFZ(toplevel).num) + ); + + skip |= flag.dont_load_ref_file && (ST(REFERENCE) || st == SEC_REF_HASH || ST(REF_IS_SET)); + + if (skip && is_genocat && dict_id.num && (dict_id.num == flag.show_singletons_dict_id.num || dict_id.num == flag.dump_one_local_dict_id.num)) + skip = false; + + return skip; +} + +static inline void piz_adjust_one_local (ContextP ctx, BufferP local_buf, LocalType *ltype, uint8_t param, bool uncompress_to_pair) +{ + const LocalTypeDesc *ltd = <_desc[*ltype]; + + ASSERT (local_buf->len % ltd->width == 0, "%s.local has %u bytes - but expecting the number of bytes to be a multiple of %u since ltype=%s", + ctx->tag_name, local_buf->len32, ltd->width, ltd->name); + + local_buf->len /= ltd->width; + + // note: in ZIP, if we're loading R1.local to localR1 just to verify 
identicality, then we should + // keep it in "file format" for zip_generate_local(), not "native fomrat" + if (!uncompress_to_pair || IS_PIZ || !fastq_zip_use_pair_identical (ctx->dict_id)) { + + if (*ltype == LT_BITMAP) { + local_buf->nbits = local_buf->len * 64 - param ; + LTEN_bits ((BitsP)local_buf); + } + + else if (*ltype >= LT_UINT8_TR && *ltype <= LT_UINT64_TR) + local_buf->n_cols = param; // 0 means vcf_num_samples + + if (ltd->file_to_native) + ltd->file_to_native (local_buf, ltype); // BGEN, transpose etc - updates ltype in case of Transpose, after untransposing + } +} + +// PIZ compute thread: decompress all contexts (in pair-2 of paired FASTQ: z_data contains contexts of both pairs) +// ZIP compute thread in FASTQ: decompress pair_1 contexts when compressing pair_2 +void piz_uncompress_all_ctxs (VBlockP vb) +{ + bool vb_is_pair_2 = is_fastq_pair_2(vb); // is this VB a pair-2 FASTQ VB (either in a FASTQ or SAM z_file) + + for_buf (uint32_t, header_offset, vb->z_section_headers) { + SectionHeaderCtxP header = (SectionHeaderCtxP)Bc(vb->z_data, *header_offset); + + bool is_local = HEADER_IS(LOCAL); + bool is_b250 = HEADER_IS(B250); + if (!is_b250 && !is_local) continue; + + ContextP ctx = ctx_get_ctx (vb, header->dict_id); // gets the context (creating it if it doesn't already exist) + + // back comp: bug observed with E2:Z in v11.0.10: OPTION_E2_Z has LOCAL section despite being an alias to SAM_E2_Z + if (ctx->is_ctx_alias && !VER(12)) { + ctx = CTX(ctx->did_i); + header->dict_id = ctx->dict_id; + } + + bool is_pair_section = vb_is_pair_2 && (BGEN32 (header->vblock_i) != vb->vblock_i); // is this a section of R1 read into an R2 vb + bool uncompress_to_pair = is_pair_section && (!header->flags.ctx.paired/*not pair-identical*/ || IS_ZIP); // ZIP: always; PIZ: if pair-assisted + + ASSERT (is_b250 || header->ltype < NUM_LTYPES, "in vb=%u ctx=%s.%s: ltype=%u >= NUM_LTYPES=%u. 
This can possibly be solved by upgrading Genozip to the latest version", + vb->vblock_i, ctx->tag_name, is_local ? "local" : "b250", header->ltype, NUM_LTYPES); + + // PIZ only: load normal section, or a pair-identical section of from the R1 VB + if (!uncompress_to_pair) { + + // case: buffer has already been decompressed - either during pre-processing, or + // R2 section was decompressed so no need for the pair-identical R1 section + if ((is_local && ctx->local_uncompressed) || (is_b250 && ctx->b250_uncompressed)) + continue; + + if (is_local) { + ctx->lcodec = header->codec; + ctx->ltype = header->ltype; + ctx->nothing_char = !lt_max(ctx->ltype) ? 0 // nothing char is only relevant for integer ltypes + : header->nothing_char == 0xff ? 0 // no nothing char + : header->nothing_char == 0 ? 1 // use hard-coded logic up to (0 always, and only, appears in files up to 15.0.37) + : header->nothing_char; + } + + else { // b250 + // old logic - not clear if/why it is needed but not removed to avoid break back comp: + if (!VER(15) && !ctx->ltype) + ctx->ltype = header->ltype; + + ctx->iterator = (SnipIterator){ .next_b250 = B1ST8 (ctx->b250), .prev_word_index = WORD_INDEX_NONE }; + ctx->b250_size = header->b250_size; // note: for files<=v13, this was always 0, ie B250_BYTES_4 + } + } + + // A pair section (but only pair-assisted in PIZ) + else { + if (is_b250) { + ctx->pair_b250_iter = (SnipIterator){ .next_b250 = B1ST8 (ctx->b250R1), .prev_word_index = WORD_INDEX_NONE }; + ctx->pair_b250_size = header->b250_size; + } + else + ctx->pair_ltype = header->ltype; + } + + BufferP target_buf = uncompress_to_pair ? (is_local ? &ctx->localR1 : &ctx->b250R1) + : (is_local ? &ctx->local : &ctx->b250); + + rom target_buf_name = uncompress_to_pair ? (is_local ? "contexts->localR1" : "contexts->b250R1") + : (is_local ? 
CTX_TAG_LOCAL : CTX_TAG_B250 ); + + START_TIMER; + + zfile_uncompress_section (vb, header, target_buf, target_buf_name, BGEN32 (header->vblock_i), header->section_type); + + if (is_local && dict_id_typeless (ctx->dict_id).num == flag.show_singletons_dict_id.num && !is_pair_section) + dict_io_show_singletons (vb, ctx); + + if (is_local && dict_id_typeless (ctx->dict_id).num == flag.dump_one_local_dict_id.num && !is_pair_section) + ctx_dump_binary (vb, ctx, true); + + if (!is_local && dict_id_typeless (ctx->dict_id).num == flag.dump_one_b250_dict_id.num && !is_pair_section) + ctx_dump_binary (vb, ctx, false); + + // BGEN32, transpose, fix len + if (is_local && uncompress_to_pair) + piz_adjust_one_local (ctx, &ctx->localR1, &ctx->pair_ltype, header->param, true); + + else if (is_local && !uncompress_to_pair) + piz_adjust_one_local (ctx, &ctx->local, &ctx->ltype, header->param, false); + + if (is_local && !is_pair_section) + ctx->local_uncompressed = true; + + else if (!is_local && !is_pair_section) // b250 + ctx->b250_uncompressed = true; + + if (!uncompress_to_pair/*added this condition in v15*/ && + ((VER(14) && ctx->ltype != LT_BITMAP) || // starting v14: assign to all except LT_BITMAP (in which param is used to determine nbits) + (!VER(14) && header->flags.ctx.v13_copy_local_param))) // up to v13: copy if v13_copy_local_param is set + target_buf->prm8[0] = header->param; + + if (flag.debug_read_ctxs) + iprintf ("%c Uncompressed %s: %s[%u].len=%u into %s\n", sections_read_prefix (is_pair_section || vb->preprocessing), + VB_NAME, ctx->tag_name, ctx->did_i, target_buf->len32, target_buf_name); + } + + if (IS_PIZ) { + // initialize history buffer (eg for SAM buddy) + for_ctx_that (ctx->flags.store_per_line || ctx->flags.spl_custom) + switch (ctx->flags.store) { + // we zero the history, bc when seg compares to a dl value for a field that didn't exist, + // it sees 0. It might seg against that 0. So we need history to be 0 too. 
+ case STORE_INT : buf_alloc_exact_zero (vb, ctx->history, vb->lines.len, int64_t, "history"); break; + case STORE_FLOAT : buf_alloc_exact_zero (vb, ctx->history, vb->lines.len, double, "history"); break; + case STORE_INDEX : buf_alloc_exact_zero (vb, ctx->history, vb->lines.len, WordIndex, "history"); break; + default : buf_alloc_exact_zero (vb, ctx->history, vb->lines.len, HistoryWord, "history"); break; + } + + // prepare context index + for_ctx + vb->ctx_index[ctx->did_i] = (ContextIndex){ .did_i = ctx->did_i, .dict_id = ctx->dict_id }; + + qsort (vb->ctx_index, vb->num_contexts, sizeof (ContextIndex), sort_by_dict_id); + + vb->has_ctx_index = true; + } + + if (flag.debug_or_test) buflist_test_overflows(vb, __FUNCTION__); +} + +// PIZ compute thread entry point +static void piz_reconstruct_one_vb (VBlockP vb) +{ + START_TIMER; + + ASSERTNOTNULL (vb); + ASSERT (vb->vblock_i, "vb->vblock_i is 0: vb->compute_thread_id=%d pthread=%"PRIu64, + vb->compute_thread_id, (uint64_t)pthread_self()); + + ASSERT (!flag.reference || ref_is_loaded (gref) || flag.dont_load_ref_file, + "%s: reference is not loaded correctly", VB_NAME); + + ASSERT (vb->recon_size >= 0, "Invalid vb->recon_size=%d", vb->recon_size); + + // note: txt_data is fully allocated in advance and cannot be extended mid-reconstruction (container_reconstruct and possibly others rely on this) + #define OVERFLOW_SIZE (1 MB) // allow some overflow space as sometimes we reconstruct unaccounted for data: 1. container templates 2. 
reconstruct_peek and others + + buf_alloc (vb, &vb->txt_data, 0, + vb->recon_size * vb->translation.factor/*see TRANSLATIONS*/ + OVERFLOW_SIZE, + char, 1.1, "txt_data"); + + piz_uncompress_all_ctxs (vb); + + DT_FUNC (vb, piz_recon_init)(vb); + + // reconstruct from top level snip + Did top_level_did_i = ctx_get_existing_did_i (vb, vb->translation.toplevel); + reconstruct_from_ctx (vb, top_level_did_i, 0, true); + + ASSERT (!vb->con_stack_len, "%s: Expecting container stack to be empty, but con_stack_len=%u", VB_NAME, vb->con_stack_len); + + // calculate the digest contribution of this VB, and the digest snapshot of this VB + // note: if we have generated components from which lines might be inserted into the VB - we verify in writer instead + // note: for Deep with gencomp - the SAM components are verified in writer, while the FASTQ components are verified here. + if (piz_need_digest && (!z_has_gencomp || VB_DT(FASTQ)) && !(flag.deep_fq_only && !VB_DT(FASTQ))) + digest_one_vb (vb, true, NULL); // LOOKING FOR A DEADLOCK BUG? CHECK HERE + + if (DTP(piz_after_recon)) DTP(piz_after_recon)(vb); + + vb_set_is_processed (vb); /* tell dispatcher this thread is done and can be joined. this operation needn't be atomic, but it likely is anyway */ + + if (flag.debug_or_test) buflist_test_overflows(vb, __FUNCTION__); + + COPY_TIMER (compute); +} + +static void piz_initialize_ctx_flags_from_vb_1 (VBlockP vb) +{ + // ctx.flags defaults to vb_i=1 flags, overridden if a b250 or local section is read. this will not be overridden if all_the_same, i.e. no b250/local sections. 
+ // note: we use section_list_save and not section_list_buf, because the latter might not contain vb=1, if removed by writer_create_plan + Section vb_1_first_sec = B(SectionEnt, z_file->section_list_save, z_file->section_list_save.prm32[0]); + Section vb_1_last_sec = B(SectionEnt, z_file->section_list_save, z_file->section_list_save.prm32[1]); + + for (Section sec = vb_1_first_sec+1; sec <= vb_1_last_sec; sec++) { + ContextP ctx = ECTX (sec->dict_id); // will exist if it has a dict (all_the_same sections always have a dict) + if (ctx) { + ctx->flags = sec->flags.ctx; + ctx->flags.paired = false; // flags.paired is VB-specific and is not inherited + } + } +} + +void piz_read_all_ctxs (VBlockP vb, Section *sec/* VB_HEADER section */, bool is_pair_data) +{ + START_TIMER; + + for ((*sec)++; (*sec)->st == SEC_B250 || (*sec)->st == SEC_LOCAL; (*sec)++) { + ASSERT (is_pair_data || vb->vblock_i == (*sec)->vblock_i, "expecting vb->vblock_i=%u == sec->vblock_i=%u", + vb->vblock_i, (*sec)->vblock_i); // sanity + + // create a context even if section is skipped, for containers to work (skipping a section should be mirrored in a container filter) + ContextP zctx = ctx_get_existing_zctx ((*sec)->dict_id); + ContextP vctx = CTX(zctx->did_i); // in PIZ z and vb contexts always have same did_i. This is also true for ZIP of R2, bc context was created by R1 and overlayed on this R2 VB. + bool is_local = ((*sec)->st == SEC_LOCAL); + + // don't assert for <=v11 due to bug (see comment in piz_uncompress_all_ctxs) + ASSERT (!zctx->is_ctx_alias || !VER(12), "Found a %s section of %s, this is unexpected because %s is an alias (of %s)", + st_name((*sec)->st), zctx->tag_name, zctx->tag_name, ZCTX(zctx->did_i)->tag_name); + + // if we're a FASTQ R2 VB loading R1 data, decide if we need to load this section + bool pair_assisted=false, pair_identical=false, skip_R1=false; + if (is_pair_data) { + // note: pair_assisted available since early versions. 
when loading old versions, flags are fixed in sections_list_file_to_memory_format to be consistent with current version. + pair_assisted = (IS_ZIP && fastq_zip_use_pair_assisted ((*sec)->dict_id, (*sec)->st)) || + (IS_PIZ && is_local && vctx->pair_assist_type == SEC_LOCAL) || // R2 section indicated that recon requires assistance of R1 data + (IS_PIZ && !is_local && vctx->pair_assist_type == SEC_B250); + + // note: pair_identical was introduced in v15 + pair_identical = (IS_ZIP && fastq_zip_use_pair_identical ((*sec)->dict_id)) || + (IS_PIZ && is_local && !vctx->local_in_z && ((*sec)->flags.ctx.paired)) || // R2 section is missing and R1 is willing to take its place + (IS_PIZ && !is_local && !vctx->b250_in_z && ((*sec)->flags.ctx.paired)); + + if (!pair_assisted && !pair_identical) skip_R1 = true; // this R1 section is not needed by R2 + + ASSERT (!pair_assisted || !pair_identical, "%s: %s.%s is invalidly both pair_assisted and pair_identical", + VB_NAME, vctx->tag_name, is_local ? "local" : "b250"); + } + + uint32_t section_start = vb->z_data.len32; + int32_t offset = skip_R1 ? SECTION_SKIPPED + : zfile_read_section (z_file, vb, (*sec)->vblock_i, &vb->z_data, "z_data", (*sec)->st, *sec); // returns 0 if section is skipped + + bool section_read = (offset != SECTION_SKIPPED); // section could be skipped either bc of skip_R1 or piz_is_skip_section() called from zfile_read_section + + if (section_read) { + BNXT32 (vb->z_section_headers) = section_start; + + if (!is_pair_data) { + if (is_local) vctx->local_in_z = true; + else vctx->b250_in_z = true; + + // note: |= (instead of =) to overcome bug in ZIP --pair in some versions <= 13 (lost track of which): + // local sections with junk data created in addition to the expected b250: if both b250 and local indicate pair_assist, we take the b250. NOTE(review): the assignment that follows uses '=' rather than '|=' - confirm this note is still current. 
+ if ((*sec)->flags.ctx.paired && is_fastq_pair_2(vb)) + vctx->pair_assist_type = (*sec)->st; // set when reading R2 sections, consumed when reading pair R1 sections + } + + if (pair_assisted || (pair_identical && IS_ZIP)) + vctx->pair_flags = (*sec)->flags.ctx; + else + vctx->flags = (*sec)->flags.ctx; // override flags inherited from vb=1 and possibly the other B250/LOCAL section + } + + // note: vctx->is_loaded possibly already true if it has a dictionary - set in ctx_overlay_dictionaries_to_vb - and now sets to false if section is skipped + if (IS_PIZ && !pair_assisted && !skip_R1) + vctx->is_loaded = section_read; + + if (flag.debug_read_ctxs) { + if (section_read) + sections_show_header ((SectionHeaderP)Bc (vb->z_data, section_start), NULL, (*sec)->offset, sections_read_prefix (is_pair_data || vb->preprocessing)); + else + iprintf ("%c Skipped loading %s/%u %s.%s\n", sections_read_prefix (is_pair_data || vb->preprocessing), + comp_name((*sec)->comp_i), vb->vblock_i, zctx->tag_name, st_name ((*sec)->st)); + } + } + + if (flag.debug_or_test) buflist_test_overflows(vb, __FUNCTION__); + + COPY_TIMER (piz_read_all_ctxs); +} + +// Called by PIZ main thread: read all the sections at the end of the file, before starting to process VBs +DataType piz_read_global_area (Reference ref) +{ + START_TIMER; + + bool success = zfile_read_genozip_header (0, SOFT_FAIL); // already read if normal file, but not if auxilliary file + + if (flag.show_stats) { + stats_read_and_display(); + if (is_genocat) return DT_NONE; + } + + user_message_display(); + + if (!success) return DT_NONE; + + if (flags_writer_counts()) goto done; + + // check if the genozip file includes a reference + bool has_ref_sections = !!sections_last_sec (SEC_REFERENCE, SOFT_FAIL); + + ASSERTW (!has_ref_sections || !IS_REF_EXTERNAL || flag.reading_reference, + "FYI: ignoring reference file %s because %s was not compressed with --reference", ref_get_filename (ref), z_name); + + if (!flag.reading_reference && 
has_ref_sections) { + ref_destroy_reference (ref); // destroy an old reference, if one is loaded + flag.reference = REF_STORED; // possibly override REF_EXTERNAL (it will be restored for the next file in ) + } + + // read all dictionaries - CHROM/RNAME is needed for regions_make_chregs(). + // Note: some dictionaries are skipped based on skip() and all flag logic should implemented there + dict_io_read_all_dictionaries(); + + if (!flag.header_only) { + // mapping of the file's chroms to the reference chroms (for files originally compressed with REF_EXTERNAL/EXT_STORE and have alternative chroms) + chrom_2ref_load (ref); + + ref_contigs_load_contigs (ref); // note: in case of REF_EXTERNAL, reference is already pre-loaded + } + + // if the user wants to see only the header, we can skip regions and random access + if (!flag.header_only) { + + ctx_read_all_counts(); // read all SEC_COUNTS sections + + ctx_read_all_subdicts(); // read all SEC_SUBDICTS sections + + // update chrom node indices using the CHROM dictionary, for the user-specified regions (in case -r/-R were specified) + if (flag.regions) + regions_make_chregs (ZCTX(DTFZ(chrom))); + + // if the regions are negative, transform them to the positive complement instead + regions_transform_negative_to_positive_complement(); + + // if this is a stored reference we load the reference random access that will determined which reference sections + // should be read & uncompressed in case of --regions. + // note: in case of a data file with stored reference - SEC_REF_RAND_ACC will contain the random access of the reference + // and SEC_RANDOM_ACCESS will contain the random access of the data. In case of a .ref.genozip file, both sections exist + // and are identical. It made the coding easier and their size is negligible. + random_access_load_ra_section (SEC_RANDOM_ACCESS, DTFZ(chrom), &z_file->ra_buf, "z_file->ra_buf", + !flag.show_index ? 
NULL : RA_MSG_PRIM); + + random_access_load_ra_section (SEC_REF_RAND_ACC, CHROM, ref_get_stored_ra (ref), "ref_stored_ra", + flag.show_ref_index && !flag.reading_reference ? RA_MSG_REF : NULL); + + if (IS_REF_CHROM2REF && !flag.reading_reference && !flag.genocat_no_reconstruct) + // xxx is this actually used? + chrom_index_by_name (CHROM); // create alphabetically sorted index for user file (not reference) chrom word list + + // case: reading reference file + if (flag.reading_reference) { + + // when reading the reference for genocat --coverage/idxstats, don't need the actual REF sections + if (is_genocat && (flag.show_coverage || flag.idxstats)) + goto done; + + bool ref_loaded_from_disk = !flag.dont_load_ref_file && ref_load_stored_reference (ref); + + // load the IUPACs list of the reference (rare non-ACGT "bases") + ref_iupacs_load (ref); + + // load the refhash, if we are compressing FASTA or FASTQ, or if user requested to see it + if ( (primary_command == ZIP && flag.aligner_available) || + (flag.show_ref_hash && is_genocat) || + ref_cache_is_populating (ref)) + refhash_load (ref); + + // exit now if all we wanted was just to see the reference (we've already shown it) + if ((flag.show_reference || flag.show_is_set || flag.show_ref_hash) && is_genocat) exit_ok; + + if (ref_loaded_from_disk) + progress_finalize_component ("Done"); + } + + // case: non-reference file has stored reference sections + else if (has_ref_sections) { + if (!flag.dont_load_ref_file) { + ref_load_stored_reference (gref); + + // exit now if all we wanted was just to see the reference (we've already shown it) + if ((flag.show_reference || flag.show_is_set || flag.show_ref_hash) && is_genocat) exit_ok; + } + } + } + +done: + COPY_TIMER_EVB (piz_read_global_area); + + return z_file->data_type; +} + +// main thread +bool piz_read_one_vb (VBlockP vb, bool for_reconstruction) +{ + START_TIMER; + + Section sec = sections_vb_header (vb->vblock_i); + + int32_t vb_header_offset = zfile_read_section 
(z_file, vb, vb->vblock_i, &vb->z_data, "z_data", SEC_VB_HEADER, sec); + ASSERT0 (vb_header_offset >= 0, "Unexpectedly VB_HEADER section was skipped"); + + SectionHeaderVbHeader header = *(SectionHeaderVbHeaderP)Bc (vb->z_data, vb_header_offset); // copy of header as it will be overwritten in piz_read_all_ctxs + + // any of these might be overridden by callback + vb->flags = header.flags.vb_header; + vb->recon_size = BGEN32 (header.recon_size); + vb->longest_line_len = BGEN32 (header.longest_line_len); + vb->longest_seq_len = VER(15) ? BGEN32 (header.longest_seq_len) : 0; + vb->expected_digest = header.digest; + vb->chrom_node_index = WORD_INDEX_NONE; + vb->lines.len = VER(14) ? sec->num_lines : BGEN32 (header.v13_top_level_repeats); + vb->comp_i = sec->comp_i; + vb->show_containers = (flag.show_containers == SHOW_CONTAINERS_ALL_VBs || flag.show_containers == vb->vblock_i); // a per-VB value bc in SAM Load-Prim VBs =false vs normal VBs have the flag value (set in sam_piz_dispatch_one_load_sag_vb) + + if (txt_file) { // sometimes we don't have a txtfile, eg when genocat is used with some flags that emit other data, no the file + vb->vb_position_txt_file = txt_file->txt_data_so_far_single_0; // position in original txt file (before any ZIP or PIZ modifications) + txt_file->num_lines += vb->lines.len; // source file lines + } + + // in case of unbind, the vblock_i in the 2nd+ component will be different than that assigned by the dispatcher + // because the dispatcher is re-initialized for every txt component + if (flag.unbind) vb->vblock_i = BGEN32 (header.vblock_i); + + if (flag_is_show_vblocks (PIZ_TASK_NAME)) + iprintf ("READING(id=%d) vb=%s num_lines=%u recon_size=%u genozip_size=%u longest_line_len=%u\n", + vb->id, VB_NAME, vb->lines.len32, vb->recon_size, BGEN32 (header.z_data_bytes), vb->longest_line_len); + + ctx_overlay_dictionaries_to_vb (VB); // overlay all dictionaries to the vb + + buf_alloc (vb, &vb->z_section_headers, MAX_DICTS * 2, 0, uint32_t, 0, 
"z_section_headers"); // room for section headers + + BNXT32 (vb->z_section_headers) = vb_header_offset; // vb_header_offset is always 0 for VB header + + piz_initialize_ctx_flags_from_vb_1 (vb); + + DT_FUNC (vb, piz_before_read)(vb); + + // read all b250 and local of all fields and subfields + piz_read_all_ctxs (vb, &sec, false); + + bool ok_to_compute = DT_FUNC_OPTIONAL (vb, piz_init_vb, true)(vb, &header); + + vb->translation = dt_get_translation (vb); // must be after piz_init_vb, as in VCF we set vb->vb_chords there, needed for dt_get_translation + + if (txt_file) + txt_file->txt_data_so_far_single_0 += BGEN32 (header.recon_size); // cumulative expected recon size without piz-side modifications + + if (ok_to_compute && for_reconstruction && flag.collect_coverage) + coverage_initialize (vb); + + COPY_TIMER (piz_read_one_vb); + + return ok_to_compute; +} + +static void piz_handover_or_discard_vb (Dispatcher dispatcher, VBlockP *vb) +{ + bool is_handed_over = false; + + if ((*vb)->preprocessing) + DT_FUNC (z_file, piz_after_preproc)(*vb); + + else if (!flag.no_writer_thread) // note: in SAM with gencomp - writer does the digest calculation + is_handed_over = writer_handover_data (vb); + + if (!is_handed_over && !(*vb)->preprocessing && + (!flag.one_component || writer_does_vb_need_write ((*vb)->vblock_i))) + txt_file->txt_data_so_far_single += (*vb)->txt_data.len; // note: if writing (or SAM with gencomp), this is done in writer_flush_vb, capturing the processing in writer too. NOTE(review): txt_file is dereferenced here without the NULL check that piz_read_one_vb applies - confirm it is always set on this path + + dispatcher_recycle_vbs (dispatcher, !is_handed_over); // don't release VB if handed over - it will be released in writer_release_vb when writing is completed +} + +// reads one VB's data from z_file and either dispatches it for reconstruction or, if it is not to be reconstructed, abandons it and hands it over / discards it (no return value) +static void piz_dispatch_one_vb (Dispatcher dispatcher, Section sec) +{ + VBlockP next_vb = dispatcher_generate_next_vb (dispatcher, sec->vblock_i, sec->comp_i); + + // read one VB's data from z_file + ReconType reconstruct = piz_read_one_vb (next_vb, true) && 
// read even if no_reconstruct + !flag.genocat_no_reconstruct; + + if (reconstruct) { + if (flag_is_show_vblocks (PIZ_TASK_NAME)) + iprintf ("BEFORE_COMPUTE(id=%d) vb=%s/%u num_running_compute_threads(before)=%u\n", + next_vb->id, comp_name(next_vb->comp_i), next_vb->vblock_i, dispatcher_get_num_running_compute_threads(dispatcher)); + + dispatcher_compute (dispatcher, piz_reconstruct_one_vb); + dispatcher_increment_progress ("read", 1); // done reading + } + + // case: we won't proceed to uncompressing and reconstructing - we're done after reading: just hand over + // an empty VB as it appears in the recon plan, and writer might be already blocking on waiting for it + else { + dispatcher_increment_progress ("all_no_reconstruct", 3); // done reading, skipped reconstructing and writing + dispatcher_abandon_next_vb (dispatcher); // just moves the VB to processed_vb so dispatcher_recycle_vbs can recycle it + piz_handover_or_discard_vb (dispatcher, &next_vb); + } +} + +// main thread: usually called in order of VBs, but out-of-order if --test with no writer +static void piz_handle_reconstructed_vb (Dispatcher dispatcher, VBlockP vb, uint64_t *num_nondrop_lines) +{ + // verify that files are the same size, unless we intended to modify the data + ASSERTW (Ltxt == vb->recon_size || flag.piz_txt_modified || (flag.deep_fq_only && !VB_DT(FASTQ)), + "Warning: vblock_i=%s/%u (num_lines=%u) had %s bytes in the original %s file but %s bytes in the reconstructed file (diff=%d)", + comp_name (vb->comp_i), vb->vblock_i, vb->lines.len32, str_int_commas (vb->recon_size).s, dt_name (txt_file->data_type), + str_int_commas (Ltxt).s, + (int32_t)Ltxt - (int32_t)vb->recon_size); + + *num_nondrop_lines += vb->num_nondrop_lines; + if (flag.count == CNT_VBs) + iprintf ("vb=%s lines=%u nondropped_lines=%u txt_data.len=%u\n", + VB_NAME, vb->lines.len32, vb->num_nondrop_lines, Ltxt); + + DT_FUNC (vb, piz_process_recon)(vb); + + z_file->txt_data_so_far_single += vb->recon_size; + + piz_handover_or_discard_vb 
(dispatcher, &vb); +} + +// dispatcher of PIZ of main (i.e. not auxiliary) files +static Dispatcher main_dispatcher = NULL; +void piz_set_main_dispatcher (Dispatcher dispatcher) +{ + main_dispatcher = dispatcher; +} + +// allow out of order joining of VBs (to reverse non-allowing set in dispatcher_init) +void piz_allow_out_of_order (void) +{ + ASSERTNOTNULL (main_dispatcher); + dispatcher_allow_out_of_order (main_dispatcher); +} + +static uint64_t piz_target_progress (CompIType comp_i) +{ + if (comp_i == COMP_MAIN && Z_DT(SAM)) + return 3 * sections_get_num_vbs_(SAM_COMP_MAIN, SAM_COMP_DEPN) + sections_get_num_vbs (SAM_COMP_PRIM); // 3 progress units per VB (reading, reconstructing, writing) + 1 for each PRIM VB that is pre-processed + + else if (Z_DT(FASTQ) && flag.interleaved) + return 3 * sections_get_num_vbs_(FQ_COMP_R1, FQ_COMP_R2); + + else if (Z_DT(SAM) && flag.deep && flag.interleaved) + return 3 * sections_get_num_vbs_(SAM_COMP_FQ00, SAM_COMP_FQ01); + + else { + if (comp_i == COMP_NONE) comp_i = COMP_MAIN; + return 3 * sections_get_num_vbs_(comp_i, comp_i); + } +} + +Dispatcher piz_z_file_initialize (void) +{ + // read all non-VB non-TxtHeader sections + DataType data_type = piz_read_global_area (gref); + if (data_type == DT_NONE || flag.reading_reference) + return NULL; // no components in this file (as is always the case with reference files) + + if (flag.genocat_global_area_only) return NULL; + + if (!flag_loading_auxiliary && DTPZ(piz_after_global_area)) // must be before writer_create_plan messes up the section list + DTPZ(piz_after_global_area)(); + + if (!writer_create_plan()) + return NULL; // --count, and it was reported already + + if (flag.test || flag.md5) + ASSINP0 (dt_get_translation(NULL).is_src_dt, "Error: --test or --md5 cannot be used when converting a file to another format"); + + // note: if --unbind, we will recalculate the target progress in dispatcher_resume() + Dispatcher dispatcher = dispatcher_init (flag.reading_reference ? 
"piz-ref" : PIZ_TASK_NAME, // also referred to in dispatcher_recycle_vbs() + PREPROCESSING_TASK_NAME, + POOL_MAIN, + flag.xthreads ? 1 : global_max_threads, 0, + flag.test && flag.no_writer_thread, // out-of-order if --test with no writer thread (note: SAM gencomp always have writer thread to do digest). + flag.test, + flag.out_filename ? flag.out_filename : txtheader_get_txt_filename_from_section(COMP_MAIN).s, + piz_target_progress (COMP_MAIN), + 0); + + return dispatcher; +} + +// main thread: called once per txt_file created: i.e. once, except if unbinding a paired FASTQ, or a Deep file. +// returns true if piz completed, false if piz aborted by piz_initialize +bool piz_one_txt_file (Dispatcher dispatcher, bool is_first_z_file, bool is_last_z_file, + CompIType first_comp_i, CompIType last_comp_i, // COMP_NONE unless flag.unbind + bool allow_skip_cleaning) +{ + dispatcher_start_wallclock(); + + recon_stack_initialize(); + + if (DTPZ(piz_initialize) && !DTPZ(piz_initialize)(first_comp_i)) + return false; // abort PIZ if piz_initialize says so + + bool header_only_file = true; // initialize - true until we encounter a VB header + uint64_t num_nondrop_lines = 0; + + // note: may be NULL if txt_header was removed by writer, eg when loading auxillary files + Section txt_header_sec = (first_comp_i != COMP_NONE) ? 
sections_get_comp_txt_header_sec (first_comp_i) : NULL; + + Section sec = sections_one_before (txt_header_sec); + + // traverse section list as re-arranged by writer_create_plan + while (!dispatcher_is_done (dispatcher)) { + + bool achieved_something = false; + + // we're pre-processing data (SAM: loading sag) + if (flag.preprocessing && dispatcher_has_free_thread (dispatcher) && !vb_pool_is_full (POOL_MAIN)) + achieved_something = DTPZ(piz_preprocess)(dispatcher); + + // In input is not exhausted, and a compute thread is available - read a vblock and dispatch it + else if (!dispatcher_is_input_exhausted (dispatcher) && dispatcher_has_free_thread (dispatcher) && !vb_pool_is_full (POOL_MAIN)) { + achieved_something = true; + + bool found_header = sections_next_sec2 (&sec, SEC_TXT_HEADER, SEC_VB_HEADER); + bool is_sec_in_comp = (first_comp_i==COMP_NONE || (sec->comp_i >= first_comp_i && sec->comp_i <= last_comp_i)); + + // case SEC_TXT_HEADER + if (found_header && sec->st == SEC_TXT_HEADER && is_sec_in_comp) { + if (sec->vblock_i >= 2) continue; // fragments >= 2 were already handled together with the first fragment + + // note: also starts writer, and if unbinding, also opens the txt file and hands data over to the writer + txtheader_piz_read_and_reconstruct (sec); + + // case --unbind: unpausing after previous txt_file pause (requires txt file to be open) + uint64_t target_progress = piz_target_progress (first_comp_i ? 
first_comp_i : COMP_MAIN); + dispatcher_resume (dispatcher, target_progress, sec->comp_i); + } + + // case SEC_VB_HEADER + else if (found_header && sec->st == SEC_VB_HEADER && is_sec_in_comp) { + + if (!writer_does_vb_need_recon (sec->vblock_i)) { + dispatcher_increment_progress ("vb_no_recon", 3); // skipped reading, reconstructing, writing + + if (flag_is_show_vblocks (PIZ_TASK_NAME)) + iprintf ("SKIPPED vb=%s/%u\n", comp_name (sec->comp_i), sec->vblock_i); + continue; + } + + piz_dispatch_one_vb (dispatcher, sec); + header_only_file = false; + } + + // case: we're done with this txt_file (either no header bc EOF, or TXT_HEADER belongs to the next txt_file when unbinding) + else { + if (flag_is_show_vblocks (PIZ_TASK_NAME)) + iprintf ("INPUT EXHAUSTED - no more SEC_VB_HEADER or SEC_TXT_HEADER for txt_file_i=%u\n", z_file->num_txts_so_far); + + dispatcher_set_no_data_available (dispatcher, false, DATA_EXHAUSTED); + + if (header_only_file) + dispatcher_recycle_vbs (dispatcher, true); // note: this is normally done in piz_handover_or_discard_vb + } + } + + // if the next thread (usually in order or VBs in recon_plan, but out-of-order if --test with no writer) is ready, handle the reconstructed VB + VBlockP vb = dispatcher_get_processed_vb (dispatcher, NULL, false); // non-blocking + if (vb) { + if (flag_is_show_vblocks (PIZ_TASK_NAME)) + iprintf ("AFTER_COMPUTE(task=piz id=%d) vb=%s/%u num_running_compute_threads(after)=%u\n", + vb->id, comp_name(vb->comp_i), vb->vblock_i, dispatcher_get_num_running_compute_threads(dispatcher)); + + dispatcher_increment_progress ("preproc_or_recon", 1 + (!vb->preprocessing && flag.no_writer_thread)); // done preprocessing or reconstructing (+1 if skipping writing) + if (!vb->preprocessing) + piz_handle_reconstructed_vb (dispatcher, vb, &num_nondrop_lines); + else + piz_handover_or_discard_vb (dispatcher, &vb); + } + + if (!achieved_something) { + START_TIMER; + usleep (30000); // nothing for us to do right now - wait 30ms + 
COPY_TIMER_EVB (piz_main_loop_idle); + } + } + + // make sure memory writes by compute threads are visible to the main thread + __atomic_thread_fence (__ATOMIC_ACQUIRE); + + if (flag_is_show_vblocks (PIZ_TASK_NAME)) + iprintf ("DISPATCHER is done for txt_file_i=%u: %s\n", z_file->num_txts_so_far, txt_file ? txt_file->name : "(no filename)"); + + dispatcher_calc_avg_compute_vbs (dispatcher); + + z_file->num_txts_so_far++; + + // finish writing the txt_file (note: the writer thread also calculates digest in SAM/BAM with PRIM/DEPN) + writer_finish_writing (z_file->num_txts_so_far == z_file->num_txt_files || is_genocat); + + // verify that the amount of data written (if writing) or reconstructed (if --test) adds up as expected. NOTE(review): txt_file is dereferenced here unconditionally although the surrounding code treats it as possibly NULL - confirm + ASSINP (txt_file->txt_data_so_far_single/*accumulated when reconstructing/writing*/ == + txt_file->txt_data_so_far_single_0/*accumulated from section headers */ || flag.piz_txt_modified, + "Data integrity error: Size of original file (without source compression) was %"PRIu64", but reconstructed file is %"PRIu64, + txt_file->txt_data_so_far_single_0, txt_file->txt_data_so_far_single); + + // verifies reconstructed file against MD5 or Adler2 and/or codec_args (if bgzf) + if (piz_need_digest) + digest_piz_verify_one_txt_file (z_file->num_txts_so_far - 1); + + progress_finalize_component_time ("Done", DIGEST_NONE); + + // --sex and --coverage - output results + if (txt_file && !flag_loading_auxiliary) { + if (flag.show_coverage) coverage_show_coverage(); + if (flag.idxstats) coverage_show_idxstats(); + if (flag.count == CNT_TOTAL) iprintf ("%"PRIu64"\n", num_nondrop_lines); + } + + if (is_genocat || (z_file->num_txts_so_far == z_file->num_txt_files)) // genocat always produces exactly one txt file + dispatcher_finish (&dispatcher, NULL, !is_last_z_file || flag.test, + flag.show_memory && is_last_z_file); + else + dispatcher_pause (dispatcher); // we're unbinding and still have more txt_files + + if (txt_file) + DT_FUNC (txt_file, piz_finalize) 
(is_last_z_file); + + if (flag_is_show_vblocks (PIZ_TASK_NAME)) + iprintf ("Finished PIZ of %s\n", txt_file ? txt_file->name : "(no filename)"); + + // if we're loading an aux file for ZIP - destroy VBs as contexts are unions of ZIP and PIZ + if (primary_command == ZIP && flag_loading_auxiliary) + vb_destroy_pool_vbs (POOL_MAIN, true); + + file_close (&txt_file); + + if (flag.show_time && ((flag.show_time_comp_i >= first_comp_i && flag.show_time_comp_i <= last_comp_i) || + (first_comp_i == COMP_NONE && flag.show_time_comp_i != COMP_ALL))) + profiler_add_evb_and_print_report(); + + return true; +} diff --git a/src/profiler.c b/src/profiler.c index e293003c..7a314951 100644 --- a/src/profiler.c +++ b/src/profiler.c @@ -220,6 +220,7 @@ void profiler_add_evb_and_print_report (void) PRINT (refhash_compress_digest, 2); // make-ref PRINT (refhash_uncompress_one_vb, 2); PRINT (cram_inspect_file, 1); + PRINT (txtfile_discover_specific_gz, 1); PRINT (txtheader_zip_read_and_compress, 1); PRINT (txtfile_read_header, 2); PRINT (sam_header_inspect, 2); @@ -234,7 +235,7 @@ void profiler_add_evb_and_print_report (void) PRINT (txtheader_compress_one_fragment, 3); PRINT (digest_txt_header, 2); PRINT (vb_get_vb, 1); - PRINT (fastq_read_pair_1_data, 1); + PRINT (fastq_read_R1_data, 1); PRINT (piz_read_all_ctxs, 2); PRINT (txtfile_read_vblock, 1); PRINT (read, 2); @@ -242,17 +243,18 @@ void profiler_add_evb_and_print_report (void) PRINT (txtfile_read_block_igzip, 3); PRINT (igzip_uncompress_during_read, 4); PRINT (txtfile_read_block_bz2, 3); - PRINT (txtfile_read_block_bgz, 3); - PRINT (bgzf_read_block, 4); - PRINT (gzil_read_block, 4) - PRINT (bgz_uncompress_during_read, 4); - PRINT (fastq_txtfile_have_enough_lines, 2); + PRINT (txtfile_read_block_mgzip, 3); + PRINT (mgzip_read_block_with_bsize, 4); + PRINT (mgzip_read_block_no_bsize, 4); + PRINT (mgzip_uncompress_during_read, 4); + PRINT (fastq_txtfile_sync_to_R1_by_num_lines, 2); PRINT (txtfile_get_unconsumed_callback, 2); - PRINT 
(bgz_copy_unconsumed_blocks, 2); + PRINT (mgzip_copy_unconsumed_blocks, 2); PRINT (zriter_write, 1); PRINT (write_fg, 2); PRINT (write_bg, 2); PRINT (bgzf_io_thread, 1); + PRINT (sam_zip_gc_after_compute_main, 1); PRINT (sam_sa_prim_finalize_ingest, 1); PRINT (zip_main_loop_idle, 1); PRINT (zip_free_undeeded_zctx_bufs_after_seg, 1); @@ -281,7 +283,7 @@ void profiler_add_evb_and_print_report (void) PRINT (zfile_compress_genozip_header, 2); iprintf ("GENOZIP compute threads %s\n", str_int_commas (ms(profile.nanosecs.compute)).s); - PRINT (bgz_uncompress_vb, 1); + PRINT (mgzip_uncompress_vb, 1); PRINT (ctx_clone, 1); PRINT (scan_index_qnames_preprocessing, 1); PRINT (zip_modify, 1); @@ -356,6 +358,8 @@ void profiler_add_evb_and_print_report (void) PRINT (gencomp_absorb_add_to_queue, 1); PRINT (gencomp_flush, 2); PRINT (gencomp_offload_DEPN_to_disk, 3); + PRINT (compress_depn_buf, 4); + PRINT (gencomp_do_offload_write, 4); PRINT (gencomp_reread_lines_as_prescribed, 1); PRINT (bgzf_uncompress_one_prescribed_block, 2); PRINT (ctx_merge_in_vb_ctx, 1); diff --git a/src/profiler.h b/src/profiler.h index 465a3431..3549b749 100644 --- a/src/profiler.h +++ b/src/profiler.h @@ -18,7 +18,7 @@ file_open_z, file_close, buf_low_level_free, buflist_find_buf, buflist_sort, buflist_test_overflows_do,\ read, compute, compressor_bz2, compressor_lzma, compressor_bsc, \ write, write_fg, write_bg, zriter_write, piz_read_one_vb, vb_get_vb,\ - compressor_domq, compressor_actg, bgz_uncompress_during_read, igzip_uncompress_during_read, \ + compressor_domq, compressor_actg, mgzip_uncompress_during_read, igzip_uncompress_during_read, \ piz_get_line_subfields, b250_zip_generate, zip_generate_local, zip_compress_ctxs, ctx_merge_in_vb_ctx, wait_for_merge,\ zfile_uncompress_section, codec_assign_best_codec, compressor_pbwt, compressor_longr, compressor_homp, compressor_t0, \ compressor_rans, compressor_arith, compressor_normq, compressor_pacb, compressor_smux, compressor_oq, \ @@ -27,11 +27,11 @@ 
reconstruct_vb, buf_alloc_main, buf_alloc_compute, buf_destroy_do_do_main, buf_destroy_do_do_compute, buf_overlay_do, \ buf_free_main, buf_free_compute, buflist_add_buf, buflist_remove_buf, \ dispatcher_recycle_vbs, sections_create_index, \ - txtfile_read_header, txtfile_read_vblock, txtfile_get_unconsumed_callback, fastq_txtfile_have_enough_lines, \ - txtfile_read_block_bgz, txtfile_read_block_zlib, txtfile_read_block_igzip, txtfile_read_block_bz2, \ - bgzf_io_thread, bgzf_compute_thread, bgzf_writer_thread, bgz_uncompress_vb, bgz_copy_unconsumed_blocks, bgzf_read_block, \ + txtfile_discover_specific_gz, txtfile_read_header, txtfile_read_vblock, txtfile_get_unconsumed_callback, fastq_txtfile_sync_to_R1_by_num_lines, \ + txtfile_read_block_mgzip, txtfile_read_block_zlib, txtfile_read_block_igzip, txtfile_read_block_bz2, \ + bgzf_io_thread, bgzf_compute_thread, bgzf_writer_thread, mgzip_uncompress_vb, mgzip_copy_unconsumed_blocks, mgzip_read_block_with_bsize, \ bgzf_compress_one_block, bgzf_uncompress_one_prescribed_block, \ - gzil_read_block, \ + mgzip_read_block_no_bsize, \ zip_modify, vcf_zip_modify, vcf_optimize_samples, vcf_optimize_QUAL, vcf_optimize_INFO, vcf_convert_probabilites_to_phred, \ vcf_convert_likelihoods_to_phred, vcf_phred_optimize, optimize_float_3_sig_dig, \ seg_all_data_lines, seg_get_next_line, seg_get_next_item, seg_initialize,\ @@ -48,7 +48,7 @@ sam_deep_merge, sam_piz_con_item_cb, sam_piz_deep_compress, sam_piz_deep_add_qname, sam_piz_deep_add_seq, sam_piz_deep_add_qual,\ sam_piz_deep_finalize_ents, sam_piz_deep_grab_deep_ents, \ scan_index_qnames_preprocessing, sam_piz_sam2fastq_QUAL, sam_piz_sam2bam_QUAL,\ - fastq_read_pair_1_data, piz_read_all_ctxs, fastq_seg_get_lines, fastq_seg_SEQ, fastq_seg_QUAL, \ + fastq_read_R1_data, piz_read_all_ctxs, fastq_seg_get_lines, fastq_seg_SEQ, fastq_seg_QUAL, \ fastq_seg_deep, fastq_deep_seg_find_subseq, fastq_seg_DESC, fastq_seg_saux, fastq_seg_deep_consume_unique_matching_ent,\ ref_initialize_ranges,\ 
fastq_special_set_deep, fastq_special_deep_copy_QNAME, fastq_special_deep_copy_SEQ, fastq_special_deep_copy_QUAL, fastq_special_monochar_QUAL, \ @@ -70,7 +70,8 @@ random_access_finalize_entries, random_access_compress, ctx_compress_counts, zfile_compress_genozip_header,\ ref_compress_ref, ref_compress_one_range, ref_copy_compressed_sections_from_reference_file,\ piz_main_loop_idle, zip_main_loop_idle, zip_free_undeeded_zctx_bufs_after_seg, \ - gencomp_absorb_add_to_queue, gencomp_flush, gencomp_offload_DEPN_to_disk, gencomp_reread_lines_as_prescribed, \ + gencomp_absorb_add_to_queue, gencomp_flush, gencomp_offload_DEPN_to_disk, gencomp_reread_lines_as_prescribed, gencomp_do_offload_write, \ + compress_depn_buf, sam_zip_gc_after_compute_main, \ tmp1, tmp2, tmp3, tmp4, tmp5, \ fields[MAX_DICTS]/* ZIP: compression time (all ctxs); PIZ: recon time (fields only). must be last for profiler_add. */ \ diff --git a/src/progress.c b/src/progress.c index 4173c72f..1886a684 100644 --- a/src/progress.c +++ b/src/progress.c @@ -21,6 +21,7 @@ static float last_percent=0; static int last_seconds_so_far=-1; static rom component_name=NULL; static unsigned last_len=0; // so we know how many characters to erase on next update +static uint32_t last_secs_remaining=0xffff0000; static StrText progress_ellapsed_time (void) { @@ -81,7 +82,7 @@ void progress_new_component (rom new_component_name, test_mode = new_test_mode; component_name = new_component_name; - + if (!flag.quiet) { if (test_mode) { @@ -113,6 +114,7 @@ void progress_new_component (rom new_component_name, } } + last_secs_remaining = 0xffff0000; progress_update_status (prefix.s, message ? message : ""); } @@ -147,18 +149,20 @@ void progress_update (rom task, uint64_t sofar, uint64_t total, bool done) // case: we're making progress... 
show % and time remaining else if (!done && percent && (last_seconds_so_far < seconds_so_far)) { + // time remaining + uint32_t secs_remaining = (100.0 - percent) * ((double)seconds_so_far / (double)percent); - if (!done) { - // time remaining - unsigned secs = (100.0 - percent) * ((double)seconds_so_far / (double)percent); + if (!done && (percent != last_percent || secs_remaining <= last_secs_remaining || secs_remaining >= last_secs_remaining+15)) { // timer doesn't go up unless estimate changed by a good fews seconds if (!flag.debug_progress) - snprintf (progress_str, sizeof(progress_str), "%u%% (%s)", (unsigned)percent, str_human_time (secs, false).s); + snprintf (progress_str, sizeof(progress_str), "%u%% (%s)", (unsigned)percent, str_human_time (secs_remaining, false).s); else snprintf (progress_str, sizeof(progress_str), "%u%% (%s) task=%s sofar=%.20s total=%.20s seconds_so_far=%d", - (unsigned)percent, str_human_time (secs, false).s, task, str_int_commas(sofar).s, str_int_commas(total).s, seconds_so_far); + (unsigned)percent, str_human_time (secs_remaining, false).s, task, str_int_commas(sofar).s, str_int_commas(total).s, seconds_so_far); progress_update_status (NULL, progress_str); + + last_secs_remaining = secs_remaining; } } diff --git a/src/qname.c b/src/qname.c index fa71c3aa..5e3dd115 100644 --- a/src/qname.c +++ b/src/qname.c @@ -19,9 +19,16 @@ sSTRl(copy_qname, 16); sSTRl(snip_redirect_to_QNAME2, 16); +QnameFlavor qname_get_optimize_qf (void) +{ + bool is_mated = segconf.qname_flavor[QNAME1] && qf_is_mated(QNAME1); + + return &qf[NUM_QFs-1 - is_mated]; // relying on Genozip-opt being last +} + static inline Did did_by_q (QType q) { - ASSERT (IN_RANGE (q, 0, NUM_QTYPES-1), "Invalid q=%d", q); + ASSERT (IN_RANGE (q, 0, NUM_QTYPES), "Invalid q=%d", q); return (Did[]){ FASTQ_QNAME, FASTQ_QNAME2, FASTQ_LINE3 }[q]; // note: SAM and FASTQ have the same dids for QNAMEs } @@ -402,7 +409,7 @@ void qname_segconf_finalize (VBlockP vb) segconf.sorted_by_qname[q] = 
VB_DT(FASTQ) || segconf.is_collated || segconf.sam_is_unmapped || segconf.qname_flavor[q]->sam_qname_sorted || (!segconf.is_sorted && !segconf.is_paired); - + // if only consensus reads exist, change tech to unknown if (segconf.tech == TECH_CONS) segconf.tech = TECH_UNKNOWN; @@ -465,11 +472,9 @@ QnameTestResult qname_test_flavor (STRp(qname), QType q, QnameFlavor qfs, bool q return QTR_SUCCESS; // yes, qname is of this flavor } -// called for the first line in segconf.running +// called for the first line in segconf.running or from txtfile_discover_analyze_txt void qname_segconf_discover_flavor (VBlockP vb, QType q, STRp(qname)) { - ASSERTNOTZERO (segconf.running); - static rom reasons[] = QTR_NAME; segconf.qname_flavor[q] = NULL; // unknown @@ -516,13 +521,11 @@ void qname_segconf_discover_flavor (VBlockP vb, QType q, STRp(qname)) iprintf ("%.*s is not %s flavor \"%s\". Reason: %s\n", STRf(qname), qtype_name(q), qfs->name, reasons[reason]); } - if (flag.debug_qname && !segconf.qname_flavor[q]) - iprintf ("%.*s - flavor is NOT DISCOVERED - for %s\n", STRf(qname), qtype_name(q)); + if (!segconf.qname_flavor[q]) { + segconf.flav_prop[q].is_tokenized = true; // since 15.0.63 - // when optimizing qname with --optimize_DESC - capture the correct TECH ^ but set flavor to Genozip-opt - if (q == QNAME1 && segconf.optimize[FASTQ_QNAME]) { - bool is_mated = segconf.qname_flavor[QNAME1] && segconf.qname_flavor[QNAME1]->is_mated; - segconf.qname_flavor[QNAME1] = &qf[NUM_QFs-1 - is_mated]; // relying on Genozip-opt being last + if (flag.debug_qname) + iprintf ("%.*s - flavor is NOT DISCOVERED - for %s\n", STRf(qname), qtype_name(q)); } // set up dict id alias. 
need to do explicitly, because not predefined @@ -736,19 +739,30 @@ bool qname_seg (VBlockP vb, QType q, STRp (qname), unsigned add_additional_bytes // reduces qname to its canonical form: possibly reduces qname_len to make a qname more likely compareble between SAM/BAM and FASTQ void qname_canonize (QType q, rom qname, uint32_t *qname_len) { - QnameFlavorProp *f = &segconf.flav_prop[q]; // all 0 if no flavor - - // mated: "HSQ1004:134:C0D8DACXX:3:1101:1318:114841/2" ⟶ "HSQ1004:134:C0D8DACXX:3:1101:1318:114841" - // SRA2: "ERR2708427.177.1" ⟶ "ERR2708427.177" - if (f->is_mated && *qname_len > 2) - *qname_len -= 2; - - // eg: "NOVID_3053_FC625AGAAXX:6:1:1069:11483:0,84" ⟶ "NOVID_3053_FC625AGAAXX:6:1:1069:11483" - // note: it is possible that QNAME has both the mate and the cnn removed - if (f->cnn) { - static char cnn_to_char[NUM_CNN] = CNN_TO_CHAR; - rom cut = memrchr (qname + 1, cnn_to_char[f->cnn], *qname_len); // +1 so at least one character survives - if (cut) *qname_len = cut - qname; + QnameFlavorProp *f = &segconf.flav_prop[q]; + + if (!f->is_tokenized) { + // mated: "HSQ1004:134:C0D8DACXX:3:1101:1318:114841/2" ⟶ "HSQ1004:134:C0D8DACXX:3:1101:1318:114841" + // SRA2: "ERR2708427.177.1" ⟶ "ERR2708427.177" + if (f->is_mated && *qname_len > 2) + *qname_len -= 2; + + // eg: "NOVID_3053_FC625AGAAXX:6:1:1069:11483:0,84" ⟶ "NOVID_3053_FC625AGAAXX:6:1:1069:11483" + // note: it is possible that QNAME has both the mate and the cnn removed + if (f->cnn) { + static char cnn_to_char[NUM_CNN] = CNN_TO_CHAR; + rom cut = memrchr (qname + 1, cnn_to_char[f->cnn], *qname_len); // +1 so at least one character survives + if (cut) *qname_len = cut - qname; + } + } + else { // not a recognized flavor - try our best (added 15.0.63) + SAFE_NUL(&qname[*qname_len]); + *qname_len = strcspn (qname, " \t\n\r"); + SAFE_RESTORE; + + if (*qname_len > 2 && qname[*qname_len-2] == '/' && + (qname[*qname_len-1] == '1' || qname[*qname_len-1] == '2')) + *qname_len -= 2; } } @@ -756,7 +770,7 @@ 
uint32_t qname_calc_hash (QType q, STRp(qname), thool is_last, bool canonical, uint32_t *uncanonical_suffix_len) // optional out { uint32_t save_qame_len = qname_len; - if (canonical) qname_canonize (q, qSTRa(qname)); + if (canonical) qname_canonize (q, qSTRa(qname)); // might shorten qname_len, does not modify the qname string if (uncanonical_suffix_len) *uncanonical_suffix_len = save_qame_len - qname_len; @@ -780,7 +794,9 @@ rom segconf_qf_name (QType q) else if (q == QSAM2) return segconf.deep_sam_qname_flavor[1] ? segconf.deep_sam_qname_flavor[1]->name : "N/A"; else - return segconf.qname_flavor[q] ? segconf.qname_flavor[q]->name : "N/A"; + return segconf.qname_flavor[q] ? segconf.qname_flavor[q]->name + : flag.skip_segconf ? "" + : "N/A"; } DictIdAlias qname_get_alias (QType q) diff --git a/src/qname.h b/src/qname.h index 02a8cb89..fc37f9e4 100644 --- a/src/qname.h +++ b/src/qname.h @@ -48,6 +48,7 @@ extern const char sep_with_space[], sep_without_space[]; extern bool qname_seg (VBlockP vb, QType q, STRp(qname), unsigned add_additional_bytes); extern void qname_segconf_discover_flavor (VBlockP vb, QType q, STRp(qname)); extern QType qname_sam_get_qtype (STRp(qname)); +extern QnameFlavor qname_get_optimize_qf (void); extern void qname_zip_initialize (void); extern void qname_seg_initialize (VBlockP vb, QType q, Did st_did_i); diff --git a/src/recon_history.c b/src/recon_history.c index 04d30e8f..04ee67dc 100644 --- a/src/recon_history.c +++ b/src/recon_history.c @@ -116,7 +116,7 @@ void reconstruct_to_history (VBlockP vb, ContextP ctx) rom lookup_type_name (LookupType lookup) { - return IN_RANGE (lookup, 0, ARRAY_LEN((rom[])LOOKUP_TYPE_NAMES)-1) ? (rom[])LOOKUP_TYPE_NAMES[lookup] : "Invalid LookupType"; + return IN_RANGE (lookup, 0, ARRAY_LEN((rom[])LOOKUP_TYPE_NAMES)) ? 
(rom[])LOOKUP_TYPE_NAMES[lookup] : "Invalid LookupType"; } void recon_history_get_historical_snip (VBlockP vb, ContextP ctx, LineIType buddy_line_i, pSTRp(snip)) diff --git a/src/recon_plan_io.c b/src/recon_plan_io.c index 57683ad0..f560dcb2 100644 --- a/src/recon_plan_io.c +++ b/src/recon_plan_io.c @@ -61,7 +61,7 @@ void recon_plan_show (FileP file, uint32_t conc_writing_vbs, uint32_t vblock_mb) } // ------------------------------------------------------------------------------- -// convert ReconPlanItem.start_line between absolute line numbers and deltas +// convert ReconPlanItem.start_line: absolute-line-numbers ⇔ deltas // ------------------------------------------------------------------------------- // ZIP main thread @@ -71,6 +71,7 @@ static void recon_plan_deltify (void) ARRAY (ReconPlanItem, plan, txt_file->recon_plan); + // get max_vb_i in recon_plan VBIType max_vb_i = 0; // Note: this might be more than z_file->num_vbs, as in SAM recon_plan is compressed before PRIM/DEPN components are segged for (uint32_t i=0; i < plan_len; i++) if (plan[i].vb_i > max_vb_i) max_vb_i = plan[i].vb_i; diff --git a/src/ref_cache.c b/src/ref_cache.c index d9ed66f6..156e33df 100644 --- a/src/ref_cache.c +++ b/src/ref_cache.c @@ -433,5 +433,5 @@ void ref_cache_detach (Reference ref) rom cache_state_name (RefCacheState cs) { - return IN_RANGE (cs, 0, NUM_CACHE_STATES-1) ? (rom[])CACHE_STATE_NAMES[cs] : "InvalidRefCacheState"; + return IN_RANGE (cs, 0, NUM_CACHE_STATES) ? 
(rom[])CACHE_STATE_NAMES[cs] : "InvalidRefCacheState"; } diff --git a/src/ref_contigs.c b/src/ref_contigs.c index 41c41804..32aab91b 100644 --- a/src/ref_contigs.c +++ b/src/ref_contigs.c @@ -268,7 +268,7 @@ static void ref_contigs_load_set_contig_names (Reference ref) if (!contig[i].max_pos) continue; WordIndex chrom_index = contig[i].ref_index; - ASSERT (IN_RANGE (chrom_index, 0, chrom_len-1), "Expecting contig[%u].ref_index=%d to be in the range [0,%d]", i, chrom_index, (int)chrom_len-1); + ASSERT (IN_RANGE (chrom_index, 0, chrom_len), "Expecting contig[%u].ref_index=%d to be in the range [0,%d]", i, chrom_index, (int)chrom_len-1); contig[i].char_index = chrom[chrom_index].char_index; contig[i].snip_len = chrom[chrom_index].snip_len; } diff --git a/src/reference.c b/src/reference.c index 1c260bda..cb6ce900 100644 --- a/src/reference.c +++ b/src/reference.c @@ -220,7 +220,7 @@ static void ref_uncompact_ref (RangeP r, int64_t first_bit, int64_t last_bit, co RangeP ref_get_range_by_chrom (Reference ref, WordIndex chrom, rom *chrom_name) { decl_zctx (CHROM); - ASSERT (IN_RANGE (chrom, 0, zctx->word_list.len32-1), "chrom=%d out of range - ctx->word_list.len=%u", + ASSERT (IN_RANGE (chrom, 0, zctx->word_list.len32), "chrom=%d out of range - ctx->word_list.len=%u", chrom, zctx->word_list.len32); if (chrom_name) @@ -418,7 +418,7 @@ static void ref_uncompress_one_range (VBlockP vb) uint64_t start = MAX_(sec_start_within_contig, 0); uint64_t len = ref_sec_len - initial_flanking_len - final_flanking_len; - ASSERT (IN_RANGE (len, 0, ref_sec_len), "expecting ref_sec_len=%"PRIu64" >= initial_flanking_len=%"PRIu64" + final_flanking_len=%"PRIu64, + ASSERT (IN_RANGX (len, 0, ref_sec_len), "expecting ref_sec_len=%"PRIu64" >= initial_flanking_len=%"PRIu64" + final_flanking_len=%"PRIu64, ref_sec_len, initial_flanking_len, final_flanking_len); RefLock lock = ref_lock (vb->ref, start + r->gpos, len + 63); @@ -1096,6 +1096,8 @@ void ref_set_reference (Reference ref, rom filename, 
ReferenceType ref_type, boo { if (!is_explicit && ref->filename) return; // already set explicitly + ASSINP (!filename || filename[0] != '-', "expecting a the name a reference file after --reference, but found \"%s\"", filename); // catch common error of a command line option instead of a ref filename + rom env = getenv ("GENOZIP_REFERENCE"); unsigned filename_len; StrTextLong alt_name; diff --git a/src/reference.h b/src/reference.h index decca318..0bce5bbc 100644 --- a/src/reference.h +++ b/src/reference.h @@ -147,7 +147,7 @@ static inline void ref_assert_nucleotide_available (ConstRangeP range, PosType64 bool available; switch (flag.reference) { case REF_STORED : available = ref_is_nucleotide_set (range, pos); break; - default : available = IN_RANGE (pos, range->first_pos, range->last_pos); break; + default : available = IN_RANGX (pos, range->first_pos, range->last_pos); break; } ASSERT (available, "reference is not set: chrom=%.*s pos=%"PRId64, (range)->chrom_name_len, (range)->chrom_name, (pos)); } diff --git a/src/sam.h b/src/sam.h index ccd4c727..062c2e2e 100644 --- a/src/sam.h +++ b/src/sam.h @@ -36,7 +36,6 @@ #pragma GENDICT SAM_QENAME=DTYPE_1=QENAME // if adding more Q*NAMEs - add to fastq.h too, and update MAX_QNAME_ITEMS #pragma GENDICT SAM_QmNAME=DTYPE_1=QmNAME // QmNAME reserved for mate number (always the last dict_id in the container) -// Fields prefixed with "FASTQ_" are not used in SAM, but are here so that the did's are the same for SAM and FASTQ #pragma GENDICT SAM_QNAME2=DTYPE_FIELD=QNAME2 #pragma GENDICT SAM_Q0NAME2=DTYPE_1=q0NAME #pragma GENDICT SAM_Q1NAME2=DTYPE_1=q1NAME @@ -55,6 +54,7 @@ #pragma GENDICT SAM_QeNAME2=DTYPE_1=qENAME #pragma GENDICT SAM_QmNAME2=DTYPE_1=qmNAME +// Fields prefixed with "FASTQ_" are not used in SAM, but are here so that the did's are the same for SAM and FASTQ #pragma GENDICT FASTQ_EXTRA=DTYPE_1=DESC #pragma GENDICT SAM_AUX=DTYPE_FIELD=AUX @@ -750,7 +750,7 @@ extern bool sam_is_last_flags_rev_comp (VBlockP vb); // 
BAM Stuff extern void bam_seg_initialize (VBlockP vb); extern int32_t bam_is_header_done (bool is_eof); -extern int32_t bam_unconsumed (VBlockP vb, uint32_t first_i, int32_t *i); +extern int32_t bam_unconsumed (VBlockP vb, uint32_t first_i); extern void bam_read_vblock (VBlockP vb); extern void bam_seg_initialize (VBlockP vb); extern rom bam_seg_txt_line (VBlockP vb_, rom field_start_line, uint32_t remaining_txt_len, bool *has_special_eol); @@ -924,5 +924,5 @@ typedef enum { SAM_COMP_NONE=255, SAM_COMP_MAIN=0, SAM_COMP_PRIM=1, SAM_COMP_DEP // source file. works for ZIP/PIZ. In PIZ: Source, NOT the data type reconstructed. #define IS_SRC_BAM (command==ZIP ? IS_BAM_ZIP : IS_SRC_BAM_PIZ) -#define IS_SRC_CRAM (z_file->source_codec == CODEC_CRAM) -#define IS_SRC_BCF (z_file->source_codec == CODEC_BCF) +#define IS_SRC_CRAM (z_file->src_codec == CODEC_CRAM) +#define IS_SRC_BCF (z_file->src_codec == CODEC_BCF) diff --git a/src/sam_cigar.c b/src/sam_cigar.c index 4f4e1da9..414f247c 100644 --- a/src/sam_cigar.c +++ b/src/sam_cigar.c @@ -665,7 +665,7 @@ static void sam_cigar_update_random_access (VBlockSAMP vb, ZipDataLineSAMP dl) if (LN == -1) {} - else if (IN_RANGE (last_pos, 1, LN)) + else if (IN_RANGX (last_pos, 1, LN)) random_access_update_last_pos (VB, last_pos); else // we circled back to the beginning for the chromosome - i.e. this VB RA is the entire chromosome @@ -951,8 +951,8 @@ SPECIAL_RECONSTRUCTOR_DT (sam_cigar_special_CIGAR) // now we have the info needed to reconstruct bin, l_read_name, n_cigar_op and l_seq BAMAlignmentFixed *alignment = (BAMAlignmentFixed *)Btxt (vb->line_start); alignment->l_read_name = BAFTtxt - &alignment->read_name[0]; - PUT_UINT16_(alignment, n_cigar_op, LTEN16 (vb->binary_cigar.len)); - PUT_UINT32_(alignment, l_seq, (snip[0] == '-') ? 0 : LTEN32 (vb->seq_len)); + PUT_UINT16_(alignment, n_cigar_op, vb->binary_cigar.len); + PUT_UINT32_(alignment, l_seq, (snip[0] == '-') ? 
0 : vb->seq_len); LTEN_u32_buf (&vb->binary_cigar, NULL); RECONSTRUCT (vb->binary_cigar.data, vb->binary_cigar.len * sizeof (BamCigarOp)); @@ -967,7 +967,7 @@ SPECIAL_RECONSTRUCTOR_DT (sam_cigar_special_CIGAR) PosType32 last_pos = last_flags.unmapped ? pos : (pos + vb->ref_consumed - 1); uint16_t bin = bam_reg2bin (pos, last_pos); // zero-based, half-closed half-open [start,end) - PUT_UINT16_(alignment, bin, LTEN16 (bin)); // override the -1 previously set by the translator + PUT_UINT16_(alignment, bin, bin); // override the -1 previously set by the translator } } diff --git a/src/sam_fields.c b/src/sam_fields.c index a6e561a5..1a0c9c67 100644 --- a/src/sam_fields.c +++ b/src/sam_fields.c @@ -370,7 +370,7 @@ static void sam_seg_SM_i (VBlockSAMP vb, ZipDataLineSAMP dl, int64_t SM, unsigne { decl_ctx (OPTION_SM_i); - if (IN_RANGE (SM, 0, 255) && + if (IN_RANGX (SM, 0, 255) && SM != 254 && // note: 254 is a valid, but highly improbable value - we use 254 for "copy from MAPQ" so a actual 254 is segged as an exception !(SM && !dl->MAPQ)) { // we're expecting SM=0 if MAPQ=0 @@ -416,7 +416,7 @@ static void sam_seg_AM_i (VBlockSAMP vb, ZipDataLineSAMP dl, int64_t AM, unsigne // note: currently we only support for this algorithm AM appearing after SM. Easily fixable if ever needed. // AM is often one of 3 options: 0, =SM =MAPQ-SM. If SM=0 then AM is expected to be 0. 
if (has(SM_i) && - IN_RANGE (AM, 0, 255) && // valid value + IN_RANGX (AM, 0, 255) && // valid value AM != 253 && AM != 254) { // note: 253,254 are valid, but highly improbable values int32_t SM; @@ -667,7 +667,7 @@ static inline void sam_seg_AS_i (VBlockSAMP vb, ZipDataLineSAMP dl, int64_t as, // in bowtie2-like data, we might be able to copy from mate else if (segconf.is_bowtie2) { - ASSERT (IN_RANGE (as, MIN_AS_i, MAX_AS_i), "%s: AS=%"PRId64" is ∉ [%d,%d]", LN_NAME, as, MIN_AS_i, MAX_AS_i); + ASSERT (IN_RANGX (as, MIN_AS_i, MAX_AS_i), "%s: AS=%"PRId64" is ∉ [%d,%d]", LN_NAME, as, MIN_AS_i, MAX_AS_i); ZipDataLineSAMP mate_dl = DATA_LINE (vb->mate_line_i); // an invalid pointer if mate_line_i is -1 @@ -894,7 +894,7 @@ SPECIAL_RECONSTRUCTOR (sam_piz_special_DEMUX_MAPQ) // Seg against mate if we have one, or else against MAPQ as it is often very similar static inline void sam_seg_MQ_i (VBlockSAMP vb, ZipDataLineSAMP dl, int64_t mq, unsigned add_bytes) { - ASSERT (IN_RANGE (mq, 0, 255), "%s: Invalid MQ:i=%"PRId64": expecting an integer [0,255]", LN_NAME, mq); + ASSERT (IN_RANGX (mq, 0, 255), "%s: Invalid MQ:i=%"PRId64": expecting an integer [0,255]", LN_NAME, mq); dl->MQ = mq; ContextP channel_ctx = seg_mux_get_channel_ctx (VB, OPTION_MQ_i, (MultiplexerP)&vb->mux_MQ, sam_has_mate); @@ -912,7 +912,7 @@ static inline void sam_seg_MQ_i (VBlockSAMP vb, ZipDataLineSAMP dl, int64_t mq, // PQ:i Phred likelihood of the template, conditional on the mapping locations of both/all segments being correct. 
static inline void sam_seg_PQ_i (VBlockSAMP vb, ZipDataLineSAMP dl, int64_t pq, unsigned add_bytes) { - if (IN_RANGE (pq, 0, 65534)) // dl->PQ is uint16_t + if (IN_RANGX (pq, 0, 65534)) // dl->PQ is uint16_t dl->PQ = pq + 1; // +1, so that if pq is out of this range, leave dl as 0, which will mean "no valid PQ" ContextP channel_ctx = seg_mux_get_channel_ctx (VB, OPTION_PQ_i, (MultiplexerP)&vb->mux_PQ, sam_has_mate); @@ -977,7 +977,7 @@ void sam_seg_buddied_i_fields (VBlockSAMP vb, ZipDataLineSAMP dl, Did did_i, // BAM spec permits values up to 0xffffffff, and SAM is unlimited, however for code covenience we limit // values segged with this method to int32_t. If this is ever an issue, it can be solved. - ASSERT (IN_RANGE (my_value, -0x80000000LL, 0x7fffffffLL), "%s: Value of %s is %"PRId64", outside the supported range by Genozip of [%d,%d]", + ASSERT (IN_RANGX (my_value, -0x80000000LL, 0x7fffffffLL), "%s: Value of %s is %"PRId64", outside the supported range by Genozip of [%d,%d]", LN_NAME, ctx->tag_name, my_value, -0x80000000, 0x7fffffff); #define by_mate (mux->special_code == SAM_SPECIAL_DEMUX_BY_MATE) diff --git a/src/sam_header.c b/src/sam_header.c index a3ce4d3e..f9f524a6 100644 --- a/src/sam_header.c +++ b/src/sam_header.c @@ -594,7 +594,7 @@ int32_t bam_is_header_done (bool is_eof) if (is_sam (STRb(evb->txt_data), NULL)) { txt_file->data_type = z_file->data_type = DT_SAM; - txt_file->source_codec = txt_file->codec; // not CODEC_BAM anymore + txt_file->src_codec = txt_file->effective_codec; // not CODEC_BAM anymore z_file->z_flags.txt_is_bin = false; return HEADER_DATA_TYPE_CHANGED; } @@ -636,7 +636,7 @@ static inline void sam_header_add_PG (BufferP txtheader_buf) // the command line length is unbound, careful not to put it in a bufprintf bufprintf (txtheader_buf->vb, txtheader_buf, "@PG\tID:genozip-%u\tPN:genozip\tDS:%s\tVN:%s\tCL:", - getpid(), GENOZIP_URL, GENOZIP_CODE_VERSION); + getpid(), GENOZIP_URL, code_version().s); buf_append_string 
(txtheader_buf->vb, txtheader_buf, flags_command_line()); buf_append_string (txtheader_buf->vb, txtheader_buf, "\n"); } diff --git a/src/sam_optimize.c b/src/sam_modify.c similarity index 90% rename from src/sam_optimize.c rename to src/sam_modify.c index a16fcef2..8d555a30 100644 --- a/src/sam_optimize.c +++ b/src/sam_modify.c @@ -8,6 +8,34 @@ #include "sam_private.h" +static rom sam_add_seq (VBlockSAMP vb, rom line_start, uint32_t remaining) +{ + ASSERT0 (!flag.optimize, "combining --optimize and --add-seq is not yet supported"); + + bool has_13; + rom next_line = line_start; + str_split_by_tab (next_line, remaining, MAX_FIELDS + AUX, &has_13, false, true); // also advances next_line to next line + + ASSSEG (n_flds >= 10, "%s: Bad SAM file: alignment expected to have at least 10 fields (lacking SEQ), but found only %u", LN_NAME, n_flds); + + buf_alloc (vb, &vb->optimized_line, 0, (next_line - line_start) + (fld_lens[SEQ] + 1), char, 0, "optimized_line"); // x2+1000 is plenty for types of modifications we have so far. + + // the start of the line: until TLEN inc. the following \t + char *next = mempcpy (B1STc(vb->optimized_line), line_start, flds[SEQ] - line_start); // initialize to exact copy of fields 1-10 + + // generate SEQ + for (int i=0; i < fld_lens[SEQ]; i++) // note: since there is no SEQ field, [SEQ] contains the QUAL data + *next++ = 'A'; + + CTX(SAM_SQBITMAP)->txt_shrinkage -= fld_lens[SEQ] + 1; + + // the end of the line: from QUAL inc. 
the preceding \t + next = mempcpy (next, flds[SEQ] - 1/*\t*/, next_line - (flds[SEQ] - 1) + has_13); + + vb->optimized_line.len32 = BNUM(vb->optimized_line, next); + return next_line; +} + void sam_segconf_finalize_optimizations (void) { // optimize QUAL and other tags containing base qualities unless already binned (8 is the number of bins in Illimina: https://sapac.illumina.com/content/dam/illumina-marketing/documents/products/technotes/technote_understanding_quality_scores.pdf) @@ -137,13 +165,16 @@ static inline void optimize_float (const void *in, void *out) // non-aligned add // TO DO: it would be better to round-up if the first dropped bit is 1, but it is not easy to do // as the exponent might change too. If we do so, we can make-out on more bit and achieve equivalent accuracy f32 &= 0b11111111111111111110000000000000; - PUT_UINT32 (out, LTEN32 (f32)); // back to BAM's Little Endian + PUT_UINT32 (out, f32); // back to BAM's Little Endian } rom sam_zip_modify (VBlockP vb_, rom line_start, uint32_t remaining) { VBlockSAMP vb = (VBlockSAMP)vb_; + if (flag.add_seq) + return sam_add_seq (vb, line_start, remaining); + bool has_13; rom next_line = line_start; str_split_by_tab (next_line, remaining, MAX_FIELDS + AUX, &has_13, false, true); // also advances next_line to next line diff --git a/src/sam_piz.c b/src/sam_piz.c index 87a1cf71..2bcffde8 100644 --- a/src/sam_piz.c +++ b/src/sam_piz.c @@ -664,7 +664,7 @@ CONTAINER_CALLBACK (sam_piz_container_cb) // case SAM to BAM translation: set alignment.block_size (was in sam_piz_sam2bam_AUX until v11) if (dict_id.num == _SAM_TOP2BAM) { BAMAlignmentFixed *alignment = (BAMAlignmentFixed *)Btxt (vb->line_start); - PUT_UINT32_(alignment, block_size, LTEN32 (Ltxt - vb->line_start - sizeof (uint32_t))); // block_size doesn't include the block_size field itself + PUT_UINT32_(alignment, block_size, Ltxt - vb->line_start - sizeof (uint32_t)); // block_size doesn't include the block_size field itself } // --FLAG diff --git 
a/src/sam_private.h b/src/sam_private.h index 281778c8..a913e707 100644 --- a/src/sam_private.h +++ b/src/sam_private.h @@ -879,7 +879,7 @@ static inline char sam_seg_sam_type_to_bam_type (char type, int64_t n) // i converts to one of 6: C,c,S,s,I,i for (int i=0 ; i < 6; i++) - if (IN_RANGE (n, lt_min (test[i]), lt_max (test[i]))) + if (IN_RANGX (n, lt_min (test[i]), lt_max (test[i]))) return lt_desc[test[i]].sam_type; return 0; // number out of range diff --git a/src/sam_sa.c b/src/sam_sa.c index 7317568c..bad819ba 100644 --- a/src/sam_sa.c +++ b/src/sam_sa.c @@ -216,7 +216,7 @@ void sam_seg_SA_Z (VBlockSAMP vb, ZipDataLineSAMP dl, STRp(sa), unsigned add_byt // We already tested the SA to be good when we added this line to PRIM in sam_seg_prim_add_sag_SA - ASSSEG (IN_RANGE (num_alns, 2, MAX_SA_NUM_ALNS), "%s: Not expecting a malformed SA field in PRIM. num_alns=%u SA:Z=\"%.*s\"", + ASSSEG (IN_RANGX (num_alns, 2, MAX_SA_NUM_ALNS), "%s: Not expecting a malformed SA field in PRIM. num_alns=%u SA:Z=\"%.*s\"", LN_NAME, num_alns, STRf(sa)); // use SA.local to store number of alignments in this SA Group (inc. 
primary) diff --git a/src/sam_sag_load.c b/src/sam_sag_load.c index 6059d5a4..e0ec5c3c 100644 --- a/src/sam_sag_load.c +++ b/src/sam_sag_load.c @@ -154,9 +154,9 @@ static inline void sam_load_groups_add_qname (VBlockSAMP vb, PlsgVbInfo *plsg, S reconstruct_from_ctx (vb, SAM_BUDDY, 0, RECON_OFF); // set buddy (false = don't consume QNAME) - vb_grps[vb->line_i].qname = plsg->qname_start + Ltxt; + vb_grps[vb->line_i].qname = plsg->qname_start + vb->txt_data.len; // 64 bit arithmetic reconstruct_from_ctx (vb, SAM_QNAME, 0, RECON_ON); // reconstructs into vb->txt_data, sets vb->buddy_line_i if SNIP_COPY_BUDDY - vb_grps[vb->line_i].qname_len = Ltxt - (vb_grps[vb->line_i].qname - plsg->qname_start); // 64 bit arithmetic + vb_grps[vb->line_i].qname_len = vb->txt_data.len - (vb_grps[vb->line_i].qname - plsg->qname_start); // 64 bit arithmetic // if seq_len is carried by a QNAME item, set the last value here (needed for sam_cigar_special_CIGAR) if (seq_len_ctx) vb_grps[vb->line_i].seq_len = seq_len_ctx->last_value.i; diff --git a/src/sam_sag_piz.c b/src/sam_sag_piz.c index 2571f718..569bce56 100644 --- a/src/sam_sag_piz.c +++ b/src/sam_sag_piz.c @@ -203,7 +203,7 @@ SPECIAL_RECONSTRUCTOR_DT (sam_piz_special_PRIM_QNAME) RECONSTRUCT (GRP_QNAME(vb->sag), vb->sag->qname_len); CTX(SAM_QNAME)->last_txt = (TxtWord){ .index = last_txt_index, - .len = Ltxt - last_txt_index }; // 15.0.16 - need by sam_ultima_bi_prediction + .len = Ltxt - last_txt_index }; // 15.0.16 - needed by sam_ultima_bi_prediction // if seq_len is carried by a QNAME item, set the last value here - it was already reconstructed during sag loading. 
if (segconf.seq_len_dict_id.num) diff --git a/src/sam_sag_scan.c b/src/sam_sag_scan.c index fe6851a6..090a3ee3 100644 --- a/src/sam_sag_scan.c +++ b/src/sam_sag_scan.c @@ -17,7 +17,7 @@ #include "txtfile.h" #include "qname.h" #include "progress.h" -#include "bgzf.h" +#include "mgzip.h" static VBlockP real_evb; static VBlockP scan_vb; @@ -36,7 +36,7 @@ static rom scan_index_one_line (VBlockSAMP vb, rom alignment, uint32_t remaining alignment_len = GET_UINT32 (alignment) + 4; // a non-sensical block_size might indicate an false-positive identification of a BAM alignment in bam_unconsumed - ASSERT (IN_RANGE (alignment_len, sizeof (BAMAlignmentFixed), remaining_txt_len), + ASSERT (IN_RANGX (alignment_len, sizeof (BAMAlignmentFixed), remaining_txt_len), "%s: alignment_len=%u is out of range - too small, or goes beyond end of txt data: remaining_txt_len=%u", LN_NAME, alignment_len, remaining_txt_len); @@ -161,7 +161,7 @@ static void scan_index_qnames_preprocessing (VBlockP vb) // if the txt file is compressed with BGZF, we uncompress now, in the compute thread if (TXT_IS_BGZF) - bgz_uncompress_vb (vb, CODEC_BGZF); // some of the blocks might already have been decompressed while reading - we decompress the remaining + mgzip_uncompress_vb (vb, CODEC_BGZF); // some of the blocks might already have been decompressed while reading - we decompress the remaining rom next = B1STtxt; rom after = BAFTtxt; diff --git a/src/sam_sag_zip.c b/src/sam_sag_zip.c index 596a2a8f..26e1c6fd 100644 --- a/src/sam_sag_zip.c +++ b/src/sam_sag_zip.c @@ -763,6 +763,8 @@ void sam_seg_against_sa_group_int (VBlockSAMP vb, ContextP ctx, int64_t paramete // called in the main thread after_compute - VBs might be out of order void sam_zip_gc_after_compute_main (VBlockSAMP vb) { + START_TIMER; + BufferP recon_plan = &txt_file->recon_plan; BufferP recon_plan_index = &txt_file->recon_plan_index; ARRAY (GencompLineIEntry, gc_lines, vb->gencomp_lines); @@ -788,8 +790,6 @@ void sam_zip_gc_after_compute_main 
(VBlockSAMP vb) else { recon_plan->param = vb->vblock_i; // VB being processed - so its visible in buf_alloc error messages buf_alloc (evb, recon_plan, gc_lines_len * 2 + 2, 100000, ReconPlanItem, 2, "txt_file->recon_plan"); - buf_alloc (evb, &txt_file->line_info[0], gc_lines_len, 100000, uint32_t, 2, "txt_file->line_info"); - buf_alloc (evb, &txt_file->line_info[1], gc_lines_len, 100000, uint32_t, 2, "txt_file->line_info"); uint32_t normal_line_i=0; @@ -812,9 +812,6 @@ void sam_zip_gc_after_compute_main (VBlockSAMP vb) // vb_i within the gencomp components and start_line will be updated later as we don't know them yet BNXT (ReconPlanItem, *recon_plan) = (ReconPlanItem){ .vb_i = IS_PRIM(&gc_line) ? SAM_GC_UPDATE_PRIM : SAM_GC_UPDATE_DEPN }; - - // store line lengths, to be used later to calculate vb_info - BNXT32 (txt_file->line_info[gc_line.comp_i-1]) = gc_line.line_len; } // insert final normal lines @@ -835,6 +832,8 @@ void sam_zip_gc_after_compute_main (VBlockSAMP vb) *B(BufWord, *recon_plan_index, vb->vblock_i) = (BufWord){ .index = recon_plan_vb_start, .len = recon_plan->len - recon_plan_vb_start }; + + COPY_TIMER (sam_zip_gc_after_compute_main); } //------------------- @@ -1069,5 +1068,5 @@ void sam_stats_reallocate (void) rom sag_type_name (SagType sagt) { - return IN_RANGE (sagt, 0, NUM_SAG_TYPES-1) ? (rom[])SAM_SAG_TYPE_NAMES[sagt] : "InvalidSagType"; + return IN_RANGE (sagt, 0, NUM_SAG_TYPES) ? 
(rom[])SAM_SAG_TYPE_NAMES[sagt] : "InvalidSagType"; } diff --git a/src/sam_star.c b/src/sam_star.c index 440a1ae3..09d8a6c8 100644 --- a/src/sam_star.c +++ b/src/sam_star.c @@ -99,9 +99,9 @@ SPECIAL_RECONSTRUCTOR_DT (sam_piz_special_jI) for_buf (BamCigarOp, op, vb->binary_cigar) { if (op->op == BC_N) { if (is_bam) { - PUT_UINT32 (next, LTEN32 (pos)); + PUT_UINT32 (next, pos); next += sizeof (uint32_t); - PUT_UINT32 (next, LTEN32 (pos + op->n - 1)); + PUT_UINT32 (next, pos + op->n - 1); next += sizeof (uint32_t); } @@ -121,7 +121,7 @@ SPECIAL_RECONSTRUCTOR_DT (sam_piz_special_jI) if (!count_N) { // no intron if (is_bam) { - PUT_UINT32 (next, LTEN32 ((uint32_t)-1)); + PUT_UINT32 (next, (uint32_t)-1); next += sizeof (uint32_t); } @@ -130,7 +130,7 @@ SPECIAL_RECONSTRUCTOR_DT (sam_piz_special_jI) } if (is_bam) { - PUT_UINT32 (bam_array_len_p, LTEN32 (count_N ? count_N * 2 : 1)); + PUT_UINT32 (bam_array_len_p, count_N ? count_N * 2 : 1); vb->txt_data.len32 = BNUM (vb->txt_data, next); } diff --git a/src/sam_tlen.c b/src/sam_tlen.c index c4935743..1e476248 100644 --- a/src/sam_tlen.c +++ b/src/sam_tlen.c @@ -190,6 +190,6 @@ SPECIAL_RECONSTRUCTOR (sam_piz_special_COPY_MATE_TLEN_old) TRANSLATOR_FUNC (sam_piz_sam2bam_TLEN) { BAMAlignmentFixed *alignment = (BAMAlignmentFixed *)Btxt (vb->line_start); - PUT_UINT32_(alignment, tlen, LTEN32 ((int32_t)ctx->last_value.i)); + PUT_UINT32_(alignment, tlen, (int32_t)ctx->last_value.i); return 0; } diff --git a/src/sections.c b/src/sections.c index dcbef47f..e46ff07a 100644 --- a/src/sections.c +++ b/src/sections.c @@ -18,7 +18,7 @@ #include "piz.h" #include "endianness.h" #include "codec.h" -#include "bgzf.h" +#include "mgzip.h" #include "threads.h" #include "license.h" @@ -175,7 +175,7 @@ bool sections_prev_sec2 (Section *sl_ent, // optional in/out. if NULL - search { Section sec = sl_ent ? 
*sl_ent : NULL; - ASSERT (!sec || IN_RANGE (sec, B1ST(SectionEnt, z_file->section_list_buf), BLST(SectionEnt, z_file->section_list_buf)), + ASSERT (!sec || IN_RANGX (sec, B1ST(SectionEnt, z_file->section_list_buf), BLST(SectionEnt, z_file->section_list_buf)), "Invalid sec: st1=%s st2=%s", st_name (st1), st_name (st2)); while (!sec || sec >= B1ST (SectionEnt, z_file->section_list_buf)) { @@ -261,7 +261,7 @@ void sections_create_index (void) comp->txt_header_sec_i = sec_i; break; - case SEC_BGZF : + case SEC_MGZIP : comp->bgzf_sec_i = sec_i; break; @@ -341,7 +341,7 @@ void sections_new_list_add_txt_header (BufferP new_list, CompIType comp_i) BNXT (SectionEntModifiable, *new_list) = *sec; } -// PIZ: If any of the components has a SEC_BGZF add it +// PIZ: If any of the components has a SEC_MGZIP add it void sections_new_list_add_bgzf (BufferP new_list) { for_buf2 (SectionsCompIndexEnt, comp, comp_i, z_file->comp_sections_index) @@ -354,7 +354,7 @@ void sections_new_list_add_global_sections (BufferP new_list) // get first section that's not TXT_HEADER/BGZF/VB_HEADER/LOCAL/B250/COUNT/DICT Section sec = NULL; for_buf_back (SectionEnt, s, z_file->section_list_buf) - if (IS_DICTED_SEC(s->st) || s->st == SEC_VB_HEADER || s->st == SEC_TXT_HEADER || s->st == SEC_BGZF) { + if (IS_DICTED_SEC(s->st) || s->st == SEC_VB_HEADER || s->st == SEC_TXT_HEADER || s->st == SEC_MGZIP) { sec = s; break; } @@ -458,7 +458,7 @@ void sections_list_memory_to_file_format (bool in_place) // in place, or to evb- SectionEnt sec = *B(SectionEnt, z_file->section_list_buf, i); // copy before it gets overwritten int64_t offset_delta = (int64_t)sec.offset - (int64_t)prev_sec.offset; - ASSERT (IN_RANGE (offset_delta, 0LL, 0xffffffffLL), // note: offset_delta is size of previous section + ASSERT (IN_RANGX (offset_delta, 0LL, 0xffffffffLL), // note: offset_delta is size of previous section "section_i=%u size=%"PRId64" st=%s is too big", i-1, offset_delta, st_name ((fsec-1)->st)); int32_t vb_delta = 
INTERLACE(int32_t, (int32_t)sec.vblock_i - (int32_t)prev_sec.vblock_i); @@ -938,8 +938,8 @@ static FlagStr sections_dis_flags (SectionFlags f, SectionType st, DataType dt) } break; - case SEC_BGZF: - snprintf (str.s, sizeof (str.s), "library=%s level=%u has_eof=%u", bgzf_library_name(f.bgzf.library, false), f.bgzf.level, f.bgzf.has_eof_block); + case SEC_MGZIP: + snprintf (str.s, sizeof (str.s), "library=%s level=%u OLD_has_eof=%u", bgzf_library_name(f.mgzip.library, false), f.mgzip.level, f.mgzip.OLD_has_eof_block); break; case SEC_LOCAL: @@ -1070,7 +1070,7 @@ void sections_show_header (ConstSectionHeaderP header, VBlockP vb /* optional if case SEC_TXT_HEADER: { SectionHeaderTxtHeaderP h = (SectionHeaderTxtHeaderP)header; - if (VER(15)) + if (!VER(15)) snprintf (str, sizeof (str), "\n%stxt_data_size=%"PRIu64" txt_header_size=%"PRIu64" lines=%"PRIu64" max_lines_per_vb=%u digest=%s digest_header=%s\n" "%ssrc_codec=%s (args=0x%02X.%02X.%02X) %s txt_filename=\"%.*s\"\n", SEC_TAB, BGEN64 (h->txt_data_size), v12 ? BGEN64 (h->txt_header_size) : 0, BGEN64 (h->txt_num_lines), BGEN32 (h->max_lines_per_vb), @@ -1079,14 +1079,14 @@ void sections_show_header (ConstSectionHeaderP header, VBlockP vb /* optional if sections_dis_flags (f, st, dt).s, TXT_FILENAME_LEN, h->txt_filename); else snprintf (str, sizeof (str), "\n%stxt_data_size=%"PRIu64" txt_header_size=%"PRIu64" lines=%"PRIu64" max_lines_per_vb=%u digest=%s digest_header=%s\n" - "%ssrc_codec=%s (args=0x%02X.%02X.%02X) %s txt_filename=\"%.*s\" flav_prop=(id,has_seq_len,is_mated,cnn)=[[%u,%u,%u],[%u,%u,%u],[%u,%u,%u]]\n", + "%ssrc_codec=%s (args=0x%02X.%02X.%02X) %s txt_filename=\"%.*s\" flav_prop=(id,has_seq_len,is_mated,cnn,tokenized)=[[%u,%u,'%s',%u],[%u,%u,'%s',%u],[%u,%u,'%s',%u]]\n", SEC_TAB, BGEN64 (h->txt_data_size), v12 ? 
BGEN64 (h->txt_header_size) : 0, BGEN64 (h->txt_num_lines), BGEN32 (h->max_lines_per_vb), digest_display (h->digest).s, digest_display (h->digest_header).s, SEC_TAB, codec_name (h->src_codec), h->codec_info[0], h->codec_info[1], h->codec_info[2], sections_dis_flags (f, st, dt).s, TXT_FILENAME_LEN, h->txt_filename, - h->flav_prop[0].has_seq_len, h->flav_prop[0].is_mated, h->flav_prop[0].cnn, - h->flav_prop[1].has_seq_len, h->flav_prop[1].is_mated, h->flav_prop[1].cnn, - h->flav_prop[2].has_seq_len, h->flav_prop[2].is_mated, h->flav_prop[2].cnn); + h->flav_prop[0].has_seq_len, h->flav_prop[0].is_mated, char_to_printable((char[])CHAR_TO_CNN[h->flav_prop[0].cnn]).s, h->flav_prop[0].is_tokenized, + h->flav_prop[1].has_seq_len, h->flav_prop[1].is_mated, char_to_printable((char[])CHAR_TO_CNN[h->flav_prop[1].cnn]).s, h->flav_prop[1].is_tokenized, + h->flav_prop[2].has_seq_len, h->flav_prop[2].is_mated, char_to_printable((char[])CHAR_TO_CNN[h->flav_prop[2].cnn]).s, h->flav_prop[2].is_tokenized); break; } @@ -1152,7 +1152,7 @@ void sections_show_header (ConstSectionHeaderP header, VBlockP vb /* optional if break; } - case SEC_BGZF: + case SEC_MGZIP: case SEC_RANDOM_ACCESS: { snprintf (str, sizeof (str), "%s%s\n", SEC_TAB, sections_dis_flags (f, st, dt).s); break; @@ -1303,7 +1303,7 @@ void sections_show_section_list (DataType dt) // optional - take data from z_dat BNUM(z_file->section_list_buf, s), st_name(s->st), comp_name (s->comp_i), s->vblock_i, s->offset, s->size, s->num_lines, sections_dis_flags (s->flags, s->st, dt).s); - else if (IS_FRAG_SEC(s->st) || s->st == SEC_BGZF) + else if (IS_FRAG_SEC(s->st) || s->st == SEC_MGZIP) iprintf ("%5u %-20.20s\t\t\tvb=%s/%-4u offset=%-8"PRIu64" size=%-6u %s\n", BNUM(z_file->section_list_buf, s), st_name(s->st), comp_name_ex (s->comp_i, s->st).s, s->vblock_i, s->offset, s->size, sections_dis_flags (s->flags, s->st, dt).s); diff --git a/src/sections.h b/src/sections.h index 7493d455..cf3e730e 100644 --- a/src/sections.h +++ 
b/src/sections.h @@ -28,7 +28,7 @@ {"SEC_LOCAL", sizeof (SectionHeaderCtx) }, \ {"SEC_CHROM2REF_MAP", sizeof (SectionHeader) }, \ {"SEC_STATS", sizeof (SectionHeader) }, \ - {"SEC_BGZF", sizeof (SectionHeader) }, \ + {"SEC_MGZIP", sizeof (SectionHeader) }, \ {"SEC_RECON_PLAN", sizeof (SectionHeaderReconPlan) }, \ {"SEC_COUNTS", sizeof (SectionHeaderCounts) }, \ {"SEC_REF_IUPACS", sizeof (SectionHeader) }, \ @@ -109,11 +109,11 @@ typedef union SectionFlags { } gff; } vb_header; - struct FlagsBgzf { - uint8_t has_eof_block : 1; + struct FlagsMgzip { + uint8_t OLD_has_eof_block: 1; // used up to 15.0.62 uint8_t level : 4; // 0-12 for libdeflate or 0-9 for zlib level: 15 means unknown - BgzfLibraryType library : 3; // ignored if level=15 (introduced 9.0.16) - } bgzf; + MgzipLibraryType library : 3; // ignored if level=15 (introduced 9.0.16) + } mgzip; struct FlagsCtx { StoreType store : 2; // after reconstruction of a snip, store it in ctx.last_value @@ -162,7 +162,7 @@ typedef union SectionFlags { } SectionFlags __attribute__((__transparent_union__)); -typedef struct FlagsBgzf FlagsBgzf; +typedef struct FlagsMgzip FlagsMgzip; #define SECTION_FLAGS_NONE ((SectionFlags){ .flags = 0 }) @@ -316,7 +316,8 @@ typedef struct { // 2 bytes uint8_t is_consensus : 1; // qname flavor is a "consensus read" flavor (15.0.26) (15.0.0 to 15.0.14 : is_mated: qname ends with /1 or /2 (never used in PIZ)) uint8_t is_mated : 1; // qname's final two characters are separator + mate (1 or 2), eg "/1" or "|2" or ".1" (previously called "has_R") uint8_t cnn : 3; // QnameCNN: terminate before the last character that is this, to canonoize - uint8_t unused_bits : 2; + uint8_t is_tokenized : 1; // unrecognized flavor: qname is tokenized (15.0.63) + uint8_t unused_bits : 1; } QnameFlavorProp; // The text file header section appears once in the file (or multiple times in case of bound file), and includes the txt file header @@ -325,7 +326,7 @@ typedef struct { uint64_t txt_data_size; // number of 
bytes in the original txt file (without source compression, potentially after ZIP modifications eg --optimize) uint64_t txt_num_lines; // number of data (non-header) lines in the original txt file. Concat mode: entire file for first SectionHeaderTxtHeader, and only for that txt if not first uint32_t max_lines_per_vb; // upper bound on how many data lines a VB can have in this file - Codec src_codec; // codec of original txt file (none, bgzf, gz, bz2...) + Codec src_codec; // codec of original txt file (none, bgzf, gz, bz2, cram...) uint8_t codec_info[3]; // codec specific info: for CODEC_BGZF, these are the LSB, 2nd-LSB, 3rd-LSB of the source BGZF-compressed file size Digest digest; // digest of original single txt file. Up to 15.0.59: 0 if modified or DVCF. v14: only if md5, not alder32 (adler32 digest, starting v14, is stored per VB) (bug in v14: this field is garbage instead of 0 for FASTQ_COMP_FQR2 if adler32) Digest digest_header; // MD5 or Adler32 of header. Up to 15.0.59: 0 if txt was modified by zip. 
@@ -644,5 +645,5 @@ extern StrText comp_name_(CompIType comp_i); #define IS_DICTED_SEC(st) ((st)==SEC_B250 || (st)==SEC_LOCAL || (st)==SEC_DICT || (st)==SEC_COUNTS || (st) == SEC_SUBDICTS) #define IS_VB_SEC(st) ((st)==SEC_VB_HEADER || (st)==SEC_B250 || (st)==SEC_LOCAL) -#define IS_COMP_SEC(st) (IS_VB_SEC(st) || (st)==SEC_TXT_HEADER || (st)==SEC_BGZF || (st)==SEC_RECON_PLAN) +#define IS_COMP_SEC(st) (IS_VB_SEC(st) || (st)==SEC_TXT_HEADER || (st)==SEC_MGZIP || (st)==SEC_RECON_PLAN) #define IS_FRAG_SEC(st) ((st)==SEC_DICT || (st)==SEC_TXT_HEADER || (st)==SEC_RECON_PLAN || (st)==SEC_REFERENCE || (st)==SEC_REF_IS_SET || (st)==SEC_REF_HASH) // global sections fragmented with a dispatcher, and hence use vb_i diff --git a/src/seg.c b/src/seg.c index 7784a1a2..9161b4cf 100644 --- a/src/seg.c +++ b/src/seg.c @@ -16,7 +16,7 @@ #include "codec.h" #include "zip.h" #include "stats.h" -#include "bgzf.h" +#include "mgzip.h" #include "dispatcher.h" #include "b250.h" #include "zip_dyn_int.h" @@ -1446,7 +1446,7 @@ void zip_modify (VBlockP vb) VB_NAME, vb->lines.len, vb->txt_data.len); // 64 bit test in case of memory corruption // set estimated number of lines - vb->lines.len32 = vb->lines.len32 ? vb->lines.len32 // already set? don't change (set in certain cases, e.g. 2nd pair FASTQ, bcl_unconsumed, vcf_zip_add_line_numbers_init_vb...) + vb->lines.len32 = IS_R2 ? fastq_get_R1_num_lines (vb) : segconf.line_len ? MAX_(1, Ltxt / segconf.line_len) : 1; // eg DT_GNRIC @@ -1471,7 +1471,7 @@ void zip_modify (VBlockP vb) vb->line_start = BNUMtxt (line); - // Call optimizer to shorten line (note: line can be shortened, but not extended!) 
+ // Call data-specific modifier rom next_line = DTP(zip_modify) (vb, line, remaining_txt_len); if (flag.biopsy_line.line_i == vb->line_i && flag.biopsy_line.vb_i == vb->vblock_i && !DTP(seg_modifies)) { @@ -1480,7 +1480,7 @@ void zip_modify (VBlockP vb) } // case: first time for which we can't update in-place: move over to new buffer - if (in_place && optimized + vb->optimized_line.len > next_line) { + if (in_place && (optimized + vb->optimized_line.len > next_line)) { buf_alloc (vb, &vb->optimized_txt_data, 0, vb->txt_data.len * 1.05, char, 0, "optimized_line"); memcpy (B1STc(vb->optimized_txt_data), B1STtxt, BNUMtxt(optimized)); @@ -1538,22 +1538,24 @@ void zip_modify (VBlockP vb) } // split each lines in this VB to its components -void seg_all_data_lines (VBlockP vb) +uint32_t seg_all_data_lines (VBlockP vb) { START_TIMER; - + ASSERTNOTNULL (vb); + // sanity (leave 64b to detect bugs) ASSERT (vb->lines.len <= vb->txt_data.len, "%s: Expecting lines.len=%"PRIu64" < txt_data.len=%"PRIu64, VB_NAME, vb->lines.len, vb->txt_data.len); // 64 bit test in case of memory corruption // note: empty VB is possible, for example empty SAM generated component // note: if re-reading, data is not loaded yet (it will be in *_seg_initialize) - ASSERT (!Ltxt || vb->reread_prescription.len || *BLSTtxt == '\n' || !DTP(vb_end_nl), "%s: %s txt_data unexpectedly doesn't end with a newline. Last 10 chars: \"%10s\"", - VB_NAME, dt_name(vb->data_type), Btxt (Ltxt - MIN_(10,Ltxt))); + ASSERT (!Ltxt || vb->reread_prescription.len || *BLSTtxt == '\n' || !DTP(vb_end_nl), "%s: %s txt_data unexpectedly doesn't end with a newline. Ltxt=%u Last 10 chars: \"%.10s\". If you expect this file to be truncated, use --truncate.", + VB_NAME, dt_name(vb->data_type), Ltxt, Btxt (Ltxt - MIN_(10,Ltxt))); // set estimated number of lines - vb->lines.len32 = vb->lines.len32 ? vb->lines.len32 // already set? don't change (eg 2nd pair FASTQ, bcl_unconsumed) + vb->lines.len32 = vb->lines.len32 ? 
vb->lines.len32 // already set in zip_modify : segconf.running ? 10 // low number of avoid memory overallocation for PacBio arrays etc + : IS_R2 ? str_count_char (STRb(vb->txt_data), '\n') / 4 // fastq_seg_initialize verifies that it is the same as R1 : segconf.line_len ? MAX_(1, Ltxt / segconf.line_len) : 1; // eg DT_GNRIC @@ -1567,15 +1569,15 @@ void seg_all_data_lines (VBlockP vb) DT_FUNC (vb, seg_initialize)(vb); // data-type specific initialization (SAM DEPN: re-read lines here) + // in segconf, seg_initialize might change the data_type and realloc the segconf vb (eg FASTA->FASTQ) + if (segconf.running) vb = vb_get_nonpool_vb (VB_ID_SEGCONF); + if (flag_is_show_vblocks (ZIP_TASK_NAME)) - iprintf ("SEG(id=%d) vb=%s Ltxt=%u %.*s\n", vb->id, VB_NAME, vb->txt_data.len32, - MIN_(64, Ltxt), cond_str (!DTP(is_binary), "txt_data[64]=", B1STtxt ? B1STtxt : "(null)")); + iprintf ("SEG(id=%d) vb=%s Ltxt=%u %.*s%s\n", vb->id, VB_NAME, vb->txt_data.len32, + MIN_(64, Ltxt), cond_str (!DTP(is_binary), "txt_data[64]=\"", B1STtxt ? B1STtxt : "(null)"), DTP(is_binary) ? "" : "\""); ASSERTNOTEMPTY (vb->txt_data); // after this print ^ - // in segconf, seg_initialize might change the data_type and realloc the segconf vb (eg FASTA->FASTQ) - if (segconf.running) vb = vb_get_nonpool_vb (VB_ID_SEGCONF); - // if local is going to be compressed using a callback, we can't have singletons go to local // note: called after seg_initialize, to allow for setting of ctx->no_callback where needed zip_set_no_stons_if_callback (vb); @@ -1588,15 +1590,17 @@ void seg_all_data_lines (VBlockP vb) bool hash_hints_set_1_3 = false, hash_hints_set_2_3 = false; int64_t progress = 0; int64_t n_lines_processed=0; // number of lines sent to segging. some of them might have been sent to gencomp. 
+ uint32_t remaining_txt_len=0; for (vb->line_i=0; vb->line_i < vb->lines.len32; vb->line_i++, n_lines_processed++) { if (!segconf.running) seg_increment_progress (vb, BNUMtxt(line), &progress, "seg"); + + remaining_txt_len = BREMAINS (vb->txt_data, line); - uint32_t remaining_txt_len = BREMAINS (vb->txt_data, line); - - if (!remaining_txt_len) { // we're done + if (!remaining_txt_len || // we're done + (segconf.running && vb->line_i == MAX_SEGCONF_LINES)) { // segconf: limit lines (even if VB is large e.g. due to reading full MGZIP block) vb->lines.len32 = vb->line_i; // update to actual number of lines break; } @@ -1628,7 +1632,7 @@ void seg_all_data_lines (VBlockP vb) // update line_bgzf_uoffset to next line if (TXT_IS_BGZF && vb->comp_i == COMP_MAIN) - bgz_zip_advance_index (vb, line_len); + mgzip_zip_advance_index (vb, line_len); // if our estimate number of lines was too small, increase it if (vb->line_i == vb->lines.len32-1 && line - vb->txt_data.data != vb->txt_data.len) @@ -1667,4 +1671,5 @@ void seg_all_data_lines (VBlockP vb) if (flag.debug_or_test) buflist_test_overflows(vb, __FUNCTION__); COPY_TIMER (seg_all_data_lines); + return remaining_txt_len; } diff --git a/src/seg.h b/src/seg.h index dc3ad6d5..b92f01fa 100644 --- a/src/seg.h +++ b/src/seg.h @@ -18,7 +18,7 @@ typedef enum { ERR_SEG_NO_ERROR=0, ERR_SEG_OUT_OF_RANGE, ERR_SEG_NOT_INTEGER } SegError; extern void zip_modify (VBlockP vb); -extern void seg_all_data_lines (VBlockP vb); +extern uint32_t seg_all_data_lines (VBlockP vb); typedef enum { GN_FORBIDEN, GN_SEP, GN_IGNORE } GetNextAllow; extern rom seg_get_next_item (VBlockP vb, rom str, int *str_len, GetNextAllow newline, GetNextAllow tab, GetNextAllow space, diff --git a/src/segconf.c b/src/segconf.c index ab4e0a97..bae5090b 100644 --- a/src/segconf.c +++ b/src/segconf.c @@ -16,7 +16,7 @@ #include "strings.h" #include "codec.h" #include "arch.h" -#include "bgzf.h" +#include "mgzip.h" #include "tip.h" #include "zfile.h" #include "zip_dyn_int.h" 
@@ -123,7 +123,7 @@ static void segconf_set_vb_size (VBlockP vb, uint64_t curr_vb_size) segconf.vb_size = curr_vb_size; if (segconf.vb_size) { - // already set from previous components of this z_file - do nothing (in particular, FASTQ PAIR_2 must have the same vb_size as PAIR_1) + // already set from previous components of this z_file - do nothing (in particular, FASTQ PAIR_R2 must have the same vb_size as PAIR_R1) // note: for 2nd+ components, we may set other aspects of segconf, but not vb_size } @@ -139,10 +139,6 @@ static void segconf_set_vb_size (VBlockP vb, uint64_t curr_vb_size) flag.vblock, MAX_VBLOCK_MEMORY); segconf.vb_size = (uint64_t)mem_size_mb MB; - - // we can't use GZIL for tiny VBs - if (TXT_IS_GZIL && segconf.vb_size < GZIL_MAX_BLOCK_SIZE) - txt_file->codec = CODEC_GZ; // leave source_code=GZIL for stats } // case: developer option - a number of bytes eg "100000B" @@ -163,15 +159,12 @@ static void segconf_set_vb_size (VBlockP vb, uint64_t curr_vb_size) else if (TXT_DT(GNRIC)) segconf.vb_size = VBLOCK_MEMORY_GENERIC; - // if we failed to calculate an estimated size or file is very small - use default - // else if (txtfile_get_seggable_size() < VBLOCK_MEMORY_GENERIC) - // segconf.vb_size = VBLOCK_MEMORY_GENERIC; - else { // count number of contexts used - for_ctx_that (ctx->b250.len32 || ctx->local.len32) - num_used_contexts++; - + if (vb) // NULL if --skip-segconf + for_ctx_that (ctx->b250.len32 || ctx->local.len32) + num_used_contexts++; + uint32_t vcf_samples = TXT_DT(VCF) ? 
vcf_header_get_num_samples() : 0; // formula - 1MB for each contexts, 128K for each VCF sample @@ -260,12 +253,6 @@ bool segconf_is_long_reads(void) flag.debug_LONG; } -static bool segconf_no_calculate (void) -{ - return (Z_DT(FASTQ) && flag.pair == PAIR_R2) || // FASTQ: no recalculating for 2nd pair - ((Z_DT(SAM) || Z_DT(BAM)) && flag.deep && flag.zip_comp_i >= SAM_COMP_FQ01); // --deep: no recalculating for second (or more) FASTQ file -} - static bool segconf_get_zip_txt_modified (bool provisional) { ASSERTNOTNULL (z_file); @@ -285,14 +272,21 @@ static bool segconf_get_zip_txt_modified (bool provisional) // note: this flag is also set when the file header indicates that it's a Luft file. See vcf_header_get_dual_coords(). return (has_optimize && DTPZ(zip_modify)) || (flag.add_line_numbers && Z_DT(VCF)) + || (flag.add_seq && Z_DT(SAM)) || flag.has_head // --head diagnostic option to compress only a few lines of VB=1 || flag.has_biopsy_line; } +static bool segconf_skip_segconf (void) +{ + return (Z_DT(FASTQ) && flag.pair == PAIR_R2) || // FASTQ: no recalculating for 2nd pair + ((Z_DT(SAM) || Z_DT(BAM)) && flag.deep && flag.zip_comp_i >= SAM_COMP_FQ01); // --deep: no recalculating for second (or more) FASTQ file +} + // ZIP only: after opening z_file, before reading txt_header and opening txt_file void segconf_zip_initialize (void) { - if (segconf_no_calculate()) return; + if (segconf_skip_segconf()) return; // case: everything but first FASTQ in Deep if (!((Z_DT(SAM) || Z_DT(BAM)) && flag.deep && flag.zip_comp_i == SAM_COMP_FQ00)) @@ -364,14 +358,29 @@ static void segconf_show_has (void) exit_ok; } +// decide between GZIP codecs. if this is GNRIC that changed to FASTQ, we are running discovery again. 
+static void segconf_discover_fastq_gz (void) +{ + buf_free (txt_file->igzip_state); + + txtfile_discover_specific_gz (txt_file); // sets txt_file->codec and txt_file->effective_codec + + // discard segconf uncompressed data - the discovered codec will uncompress txt_file->gz_data again + txt_file->discover_during_segconf = false; + buf_free (txt_file->unconsumed_txt); +} + // ZIP: Seg a small sample of data of the beginning of the data, to pre-calculate several Seg configuration parameters void segconf_calculate (void) { - // check for components that don't need segconf - if (segconf_no_calculate()) goto finalize; - - if (TXT_DT(GNRIC) || // no need for a segconf test VB in generic files - flag.skip_segconf) { // for use in combination with --biopsy, to biopsy of a defective file + // no need to re-calculate segconf if this is R2. We just re-calculate the codecs. + if (segconf_skip_segconf()) { + if (txt_file->discover_during_segconf) + segconf_discover_fastq_gz(); + goto finalize; + } + + if (flag.skip_segconf) { // for use in combination with --biopsy, to biopsy of a defective file. also implied by --add-seq. 
segconf_set_vb_size (NULL, segconf.vb_size); goto finalize; } @@ -381,14 +390,13 @@ void segconf_calculate (void) segconf_vb = vb_initialize_nonpool_vb (VB_ID_SEGCONF, txt_file->data_type, "segconf"); #define vb segconf_vb - // note: in case of BZ2, needs to be big enough to overcome the block nature of BZ2 (64K block -> 200-800K text) to get a reasonable size estimate - uint32_t vb_sizes[] = { 300000, 1500000, 5000000 }; - - for (int s = (txt_file->codec == CODEC_BZ2); s < ARRAY_LEN(vb_sizes) && !Ltxt; s++) { - segconf.vb_size = vb_sizes[s]; - if (TXT_IS_GZIL) segconf.vb_size = ROUNDUP1M (segconf.vb_size); - txtfile_read_vblock (vb); - } + SAVE_FLAGS; + flag.show_alleles = flag.show_digest = flag.show_hash = flag.show_reference = false; + flag.show_vblocks = NULL; + flag.pair = NOT_PAIRED; + + segconf.vb_size = 256 KB; // actual VB size might end up bigger, if this is not enough for a single line + txtfile_read_vblock (vb); if (!Ltxt) { // error unless this is a header-only file @@ -404,13 +412,8 @@ void segconf_calculate (void) // segment this VB ctx_clone (vb); - - SAVE_FLAGS; - flag.show_alleles = flag.show_digest = flag.show_hash = flag.show_reference = false; - flag.quiet = true; - flag.show_vblocks = NULL; - - seg_all_data_lines (vb); + + uint32_t remaining_txt_len = seg_all_data_lines (vb); // in segconf, seg_initialize might change the data_type and realloc the segconf vb (eg FASTA->FASTQ) vb = vb_get_nonpool_vb (VB_ID_SEGCONF); @@ -452,7 +455,7 @@ void segconf_calculate (void) segconf_set_vb_size (vb, save_vb_size); - segconf.line_len = (vb->lines.len32 ? ((double)Ltxt / (double)vb->lines.len32) : 500) + 0.999; // get average line length (rounded up ; arbitrary 500 if the segconf data ended up not having any lines) + segconf.line_len = (vb->lines.len32 ? 
((double)(Ltxt - remaining_txt_len) / (double)vb->lines.len32) : 500) + 0.999; // get average line length (rounded up ; arbitrary 500 if the segconf data ended up not having any lines) // limitations: only pre-defined field for (Did did_i=0; did_i < DTF(num_fields); did_i++) { @@ -465,12 +468,23 @@ void segconf_calculate (void) segconf.local_per_line[did_i] = ((float)ctx->local.len32 / (float)vb->lines.len32) * (float)lt_desc[dyn_int_get_ltype(ctx)].width; } + if (txt_file->discover_during_segconf) { + segconf_discover_fastq_gz(); + + // note: this warning won't trigger for EMVL bc first gz block is isize=0, and segconf doesn't reach the end of the gz block + WARN_IF (flag.vblock && TXT_IS_VB_SIZE_BY_MGZIP && segconf.vb_size < txt_file->max_mgzip_isize, + "For performance, vblocks might be larger than requested with %s. To override this, use --no-bgzf", OT("vblock", "B")); + } + // return the data to txt_file->unconsumed_txt - squeeze it in before the passed-up data - buf_insert (evb, txt_file->unconsumed_txt, char, 0, txt_data_copy.data, txt_data_copy.len, "txt_file->unconsumed_txt"); - buf_destroy (txt_data_copy); + else { + buf_insert (evb, txt_file->unconsumed_txt, char, 0, txt_data_copy.data, txt_data_copy.len, "txt_file->unconsumed_txt"); - if (TXT_IS_BGZF || TXT_IS_GZIL) - bgz_return_segconf_blocks (vb); // return BGZF used by the segconf VB to the unconsumed BGZF blocks + if (TXT_IS_BGZF) // non-FASTQ or GENERIC-cum-FASTQ + mgzip_return_segconf_blocks (vb); // return BGZF used by the segconf VB to the unconsumed BGZF blocks + } + + buf_destroy (txt_data_copy); // in case of generated component data - undo vb->gencomp_lines.len = 0; @@ -495,7 +509,6 @@ void segconf_calculate (void) finalize: // code to execute even if segconf was skipped flag.zip_uncompress_source_during_read = - flag.pair == PAIR_R2 || // if we're reading the 2nd paired file, fastq_txtfile_have_enough_lines needs the whole data flag.make_reference || // unconsumed callback for 
make-reference needs to inspect the whole data flag.biopsy || flag.zip_lines_counted_at_init_vb; // *_zip_init_vb needs to count lines diff --git a/src/segconf.h b/src/segconf.h index d78664d4..d9da524f 100644 --- a/src/segconf.h +++ b/src/segconf.h @@ -22,6 +22,8 @@ #define ABSOLUTE_MIN_VBLOCK_MEMORY ((uint64_t)1000) // in Bytes #define ABSOLUTE_MAX_VBLOCK_MEMORY ((uint64_t)MAX_VBLOCK_MEMORY MB) +#define MAX_SEGCONF_LINES 1000 // max lines tested in segconf (even if VB is large e.g. due to reading full MGZIP block) + typedef packed_enum { TECH_NONE=-1, TECH_ANY=-2, TECH_CONS=-3, TECH_UNKNOWN=0, TECH_ILLUM, TECH_PACBIO, TECH_NANOPORE, TECH_454, TECH_MGI, TECH_IONTORR, TECH_HELICOS, TECH_NCBI, TECH_ULTIMA, TECH_SINGLR, TECH_ELEMENT, TECH_ONSO, NUM_TECHS } SeqTech; #define TECH_NAME { "Unknown_tech", "Illumina", "PacBio", "Oxford_Nanopore", "454", "MGI_Tech", "IonTorrent", "Helicos", "NCBI", "Ultima", "Singular", "Element", "Onso" } #define TECH(x) (segconf.tech == TECH_##x) @@ -39,7 +41,7 @@ typedef packed_enum { VCF_QUAL_DEFAULT, VCF_QUAL_local, VCF_QUAL_by_RGQ, VCF_QUA typedef packed_enum { VCF_INFO_DEFAULT, VCF_INFO_by_RGQ, VCF_INFO_by_FILTER } VcfInfoMethod; -typedef packed_enum { L3_UNKNOWN, L3_EMPTY, L3_COPY_LINE1, L3_NCBI, L3_OPTIMIZED_AWAY, NUM_L3s } FastqLine3Type; +typedef packed_enum { L3_UNKNOWN, L3_EMPTY, L3_COPY_LINE1, L3_NCBI, NUM_L3s } FastqLine3Type; typedef packed_enum { INFO_VT_UNKNOWN, INFO_VT_VAGrENT, INFO_VT_1KG, INFO_VT_CALLMOM } InfoVTType; // part of the file format: values go into the snip of VCF_SPECIAL_VT @@ -84,6 +86,7 @@ typedef struct { // Seg parameters - general uint64_t vb_size; // ZIP/PIZ: compression VBlock size in bytes (PIZ: passed in SectionHeaderGenozipHeader.vb_size) bool running; // currently in segconf_calculate() + uint32_t gz_comp_size; // size of segconf data in source gz compression, used if discover_during_segconf int has[MAX_DICTS]; // for select did_i's, counts the numner of times this field was encountered during 
segconf.running bool optimize[MAX_DICTS]; // true if --optimize indicates that this field should be optimized bool zip_txt_modified; // ZIP/PIZ: txt data is/was modified during Seg (e.g. by --optimize, --add-line-numbers). Before segconf: true if data *might* be modifed. After segconf: true iff data is modified. diff --git a/src/stats.c b/src/stats.c index 6383d2de..913e5404 100644 --- a/src/stats.c +++ b/src/stats.c @@ -21,6 +21,7 @@ #include "crypt.h" #include "tar.h" #include "contigs.h" +#include "mgzip.h" #define SHORT_HEADER "NAME GENOZIP % TXT % RATIO\n" @@ -262,13 +263,15 @@ static void stats_output_file_metadata (void) uint32_t num_used_ctxs=0; for_zctx_that (zctx->nodes.len || zctx->txt_len) num_used_ctxs++; - #define REPORT_VBs ({ \ - bufprintf (evb, &stats, "%ss: %s Contexts: %u Vblocks: %u x %s Sections: %u\n", \ - DTPZ (line_name), str_int_commas (z_file->num_lines).s, num_used_ctxs, \ - z_file->num_vbs, (segconf.vb_size % (1 MB) != 0) ? str_int_commas (segconf.vb_size).s : str_size (segconf.vb_size).s, z_file->section_list_buf.len32); }) + #define REPORT_VBs ({ \ + bufprintf (evb, &stats, "%ss: %s Contexts: %u Vblocks: %u x %s Sections: %u\n", \ + DTPZ (line_name), str_int_commas (z_file->num_lines).s, num_used_ctxs, \ + z_file->num_vbs, TXT_IS_VB_SIZE_BY_MGZIP ? "(var-length)" \ + : (segconf.vb_size % (1 MB) != 0) ? 
str_int_commas (segconf.vb_size).s \ + : str_size (segconf.vb_size).s, z_file->section_list_buf.len32); }) - #define REPORT_QNAME \ - FEATURE (z_file->num_lines, "Read name style: %s%s", "Qname=%s%s", \ + #define REPORT_QNAME \ + FEATURE (z_file->num_lines, "Read name style: %s%s", "Qname=%s%s", \ segconf_qf_name (0), cond_str(segconf.qname_flavor[1], "+", segconf_qf_name(1))) // no space surrounding the '+' as expected by batch_qname_flavors switch (z_file->data_type) { @@ -519,7 +522,7 @@ static void stats_output_file_metadata (void) bufprintf (evb, &stats, "Fields optimized: %s\n", segconf_get_optimizations().s); bufprintf (evb, &stats, "Genozip version: %s %s\nDate compressed: %s\n", - GENOZIP_CODE_VERSION, get_distribution(), str_time().s); + code_version().s, get_distribution(), str_time().s); bufprint0 (evb, &stats, "Command line: "); buf_append_string (evb, &stats, flags_command_line()); // careful not to use bufprintf with command_line as it can exceed the maximum length in bufprintf @@ -631,10 +634,10 @@ static void stats_output_stats (StatsByLine *s, unsigned num_stats, float src_co if (src_comp_ratio != 1 && flag.show_stats_comp_i == COMP_NONE) { // display codec name if same source codec for all components, or "DISK_SIZE" if not - rom source_code_name = codec_name (z_file->comp_source_codec[0]); + rom source_code_name = codec_name (z_file->comp_src_codec[0]); for (CompIType comp_i=1; comp_i < z_file->num_components; comp_i++) - if (z_file->comp_source_codec[comp_i] != z_file->comp_source_codec[0]) + if (z_file->comp_src_codec[comp_i] != z_file->comp_src_codec[0]) source_code_name = "DISK_SIZE"; bufprintf (evb, &stats, @@ -730,8 +733,8 @@ void stats_generate (void) // specific section, or COMP_NONE if for the entire f s->txt_len = ST(TXT_HEADER) ? z_file->header_size : 0; // note: excluding generated headers for DVCF s->type = (ST(REFERENCE) || ST(REF_IS_SET) || ST(REF_CONTIGS) || ST(CHROM2REF_MAP) || ST(REF_IUPACS)) ? 
"SEQUENCE" - : (ST(RANDOM_ACCESS) || ST(REF_RAND_ACC)) ? "RandomAccessIndex" - : "Other"; // note: some contexts appear as "Other" in --stats, but in --STATS their parent is themself, not "Other" + : (ST(RANDOM_ACCESS) || ST(REF_RAND_ACC)) ? "RandomAccessIndex" + : "Other"; // note: some contexts appear as "Other" in --stats, but in --STATS their parent is themself, not "Other" s->my_did_i = s->st_did_i = DID_NONE; s->did_i.s[0] = s->words.s[0] = s->hash.s[0] = s->uncomp_dict.s[0] = s->comp_dict.s[0] = '-'; s->pc_of_txt = txt_size ? 100.0 * (float)s->txt_len / (float)txt_size : 0; @@ -831,11 +834,10 @@ void stats_generate (void) // specific section, or COMP_NONE if for the entire f ST_NAME (SEC_REF_CONTIGS), ST_NAME (SEC_CHROM2REF_MAP), ST_NAME (SEC_REF_IUPACS)); - stats_consolidate_non_ctx (sbl, sbl_buf.len32, "Other", 22 + (DTPZ(txt_header_required) == HDR_NONE), "E1L", "E2L", "EOL", - "SAMPLES", "AUX", TOPLEVEL, "ToPLUFT", "TOP2BAM", "TOP2FQ", "TOP2NONE", "TOP2FQEX", "TOP2VCF", "TOP2HASH", - "LINEMETA", "CONTIG", "COORDS", "SAG", "SAALN", + stats_consolidate_non_ctx (sbl, sbl_buf.len32, "Other", 17 + (DTPZ(txt_header_required) == HDR_NONE), "E1L", "E2L", "EOL", + "SAMPLES", "AUX", TOPLEVEL, "TOP2BAM", "TOP2NONE", "TOP2VCF", "LINEMETA", "CONTIG", "SAG", "SAALN", ST_NAME (SEC_DICT_ID_ALIASES), ST_NAME (SEC_RECON_PLAN), - ST_NAME (SEC_VB_HEADER), ST_NAME (SEC_BGZF), ST_NAME(SEC_TXT_HEADER)/*must be last*/); + ST_NAME (SEC_VB_HEADER), ST_NAME (SEC_MGZIP), ST_NAME(SEC_TXT_HEADER)/*must be last*/); stats_consolidate_non_ctx (sbl, sbl_buf.len32, "RandomAccessIndex", 2, ST_NAME (SEC_RANDOM_ACCESS), ST_NAME (SEC_REF_RAND_ACC)); diff --git a/src/strings.c b/src/strings.c index b13834b6..0d32eed7 100644 --- a/src/strings.c +++ b/src/strings.c @@ -59,6 +59,9 @@ StrText char_to_printable (char c) case '\t' : return (StrText) { .s = "\\t" }; case '\n' : return (StrText) { .s = "\\n" }; case '\r' : return (StrText) { .s = "\\r" }; + case '\b' : return (StrText) { .s = "\\b" }; 
+ case 0 ... 7 : return (StrText) { .s[0] = '\\', .s[1] = '0'+c }; + default : { // unprintable - output eg \xf StrText p = {}; snprintf (p.s, sizeof(p.s), "\\x%x", (uint8_t)c); @@ -82,7 +85,10 @@ uint32_t str_to_printable (STRp(in), char *out, int out_len) case '\b' : *out++ = '\\'; *out++ = 'b' ; out_len -= 2; break; case '\\' : *out++ = '\\'; *out++ = '\\'; out_len -= 2; break; case 0 ... 7 : *out++ = '\\'; *out++ = '0' + in[i]; out_len -= 2; break; - default : *out++ = '\\'; *out++ = 'x' ; *out++ = NUM2HEXDIGIT(in[i] >> 4), *out++ = NUM2HEXDIGIT(in[i] & 0xf); out_len -= 4; + default : *out++ = '\\'; *out++ = 'x' ; + *out++ = NUM2HEXDIGIT((uint8_t)in[i] >> 4); + *out++ = NUM2HEXDIGIT((uint8_t)in[i] & 0xf); + out_len -= 4; } *out = 0; @@ -289,7 +295,7 @@ bool str_get_int_range##func_num (rom str, uint32_t str_len, int64_t min_val, in if (!str_get_int (str, str_len ? str_len : strlen (str), &value64)) return false; \ if (value) *value = (type)value64; \ \ - return IN_RANGE (value64, min_val, max_val); \ + return IN_RANGX (value64, min_val, max_val); \ } str_get_int_range_type(8,uint8_t) // unsigned str_get_int_range_type(16,uint16_t) // unsigned @@ -937,7 +943,7 @@ uint32_t str_remove_whitespace (STRp(in), bool also_uppercase, char *out) } // in-place removal of flanking whitespace from a null-terminated string -void str_trim (STRe(str)) +void str_trim (qSTRp(str)) { // remove leading whitespace int i=0; for (; i < *str_len; i++) @@ -1198,8 +1204,10 @@ void *memrchr (const void *s, int c/*interpreted as unsigned char*/, size_t n) } #endif -char *memchr2 (rom p, char ch1, char ch2, uint32_t count) +char *memchr2 (const void *p_, char ch1, char ch2, uint32_t count) { + rom p = (rom)p_; + for (rom after = p + count; p < after; p++) if (*p == ch1 || *p == ch2) return (char *)p; diff --git a/src/strings.h b/src/strings.h index 5ebeed16..220f4df4 100644 --- a/src/strings.h +++ b/src/strings.h @@ -284,7 +284,7 @@ extern StrText str_human_time (unsigned secs, bool 
compact); #define FLOAT_FORMAT_LEN 12 extern bool str_get_float (STRp(float_str), double *value, char format[FLOAT_FORMAT_LEN], uint32_t *format_len); -extern bool str_scientific_to_decimal (STRp(float_str), STRe(modified), double *value); +extern bool str_scientific_to_decimal (STRp(float_str), qSTRp(modified), double *value); extern uint32_t str_split_do (STRp(str), uint32_t max_items, char sep, rom *items, uint32_t *item_lens, bool exactly, rom enforce_msg); @@ -336,7 +336,7 @@ extern void str_nul_separate_do (STRps(item)); #define str_nul_separate(name) str_nul_separate_do (n_##name##s, name##s, name##_lens) extern uint32_t str_remove_whitespace (STRp(in), bool also_uppercase, char *out); -extern void str_trim (STRe(str)); +extern void str_trim (qSTRp(str)); extern rom type_name (uint32_t item, rom const *name, // the address in which a pointer to name is found, if item is in range @@ -351,7 +351,7 @@ extern void str_query_user (rom query, STRc(response), bool allow_empty, Respons typedef enum { QDEF_NONE, QDEF_NO, QDEF_YES } DefAnswerType; extern bool str_query_user_yn (rom query, DefAnswerType def_answer); -extern char *memchr2 (rom p, char ch1, char ch2, uint32_t count); +extern char *memchr2 (const void *p, char ch1, char ch2, uint32_t count); // implementing memrchr, as it doesn't exist in Windows libc (msvcrt.dll) or Darwin (at least clang) #if defined _WIN32 || defined __APPLE__ diff --git a/src/test.sh b/src/test.sh index 4454e2b8..3a8e0976 100644 --- a/src/test.sh +++ b/src/test.sh @@ -24,6 +24,7 @@ cleanup() unset GENOZIP_REFERENCE } +# compares two files using internal MD5, allowing each file to be gz-compressed or not cmp_2_files() { if [ ! -f $1 ] ; then echo "File $1 not found while in cmp_2_files()"; exit 1; fi @@ -37,6 +38,20 @@ cmp_2_files() fi } +# compares two files using external MD5, requiring that they have the same gz compression +cmp_2_files_exact() +{ + if [ ! -f $1 ] ; then echo "File $1 not found while in cmp_2_files()"; exit 1; fi + if [ ! 
-f $1 ] ; then echo "File $1 not found while in cmp_2_files()"; exit 1; fi + if [ ! -f $2 ] ; then echo "File $2 not found while in cmp_2_files()"; exit 1; fi + + if [[ "`$md5 $1 | cut -d' ' -f1`" != "`$md5 $2 | cut -d' ' -f1`" ]] ; then + echo "MD5 comparison FAILED: $1 $2" + echo `$md5 "$1"` + echo `$md5 "$2"` + exit 1 + fi +} + verify_failure() # $1=exe $2=$? { if (( $2 == 0 )); then echo "Error: expecting $1 to fail but it succeeded"; exit 1; fi @@ -391,6 +406,7 @@ verify_bgzf() # $1 file that we wish to inspect $2 expected result (0 not-bgzf 1 batch_bgzf() { batch_print_header + local files=(basic-bgzf.bam basic-bgzf-6.sam.gz basic-bgzf-9.sam.gz basic-bgzf-6-no-eof.sam.gz basic-1bgzp_block.bam) #local files=() local file @@ -483,34 +499,22 @@ batch_special_algs() test_header "CSQ has an AF field" $genozip ${TESTDIR}/regression.CSQ_has_AF.vcf -ft || exit 1 - # bug was: compression failed, bc failed to identify a read in the VB when there is only one, with part of it in unconsumed_txt from previous VB, and part in a BGZF block (fixed 15.0.56) - test_header "vb=2 is single read" - $genozip -B32 ${TESTDIR}/regression.vb2-is-single-read.fq.gz -ft || exit 1 - - # bug was: compression failed, bc igzip would not decompress a 128K GZ chunk into the small vb_sizes[0] of first segconf attempt, and segconf gave up instead of attempting vb_sizes[1] - test_header "segconf vb_sizes[0] too small, try with vb_size[1]" - $genozip ${TESTDIR}/regression.no_small_segconf_vb_size.fq.gz -ft --truncate || exit 1 - # bug was: piz failed if two consecutive Is in CIGAR (ignoring non-seq-consuming ops like N) (defect 2024-06-16) test_header "two consecutive Is in CIGAR" $genozip ${TESTDIR}/regression.two-consecutive-Is.sam -ft || exit 1 - # bug was: no handling of GZIL blocks after move to igzip (defect 2024-03-01) - test_header "Illumina GZIL blocks" - $genozip -tf --no-bgzf ${TESTDIR}/regression.defect-2024-03-01.multi-gzip-break-between-reads.fq.gz || exit 1 - $genozip -tf --no-bgzf 
${TESTDIR}/regression.defect-2024-03-01.multi-gzip-break-within-read.fq.gz || exit 1 - - # bug was: CHECKSUM at the end of 1MB-gz block in R2 was not handled correctly, failing R2 zip. - test_header "CHECKSUM in Illumina-style gzip: R2 alignment to R1" - $genozip -tf --pair -e $hs37d5 --truncate -B19 --no-bgzf ${TESTDIR}/regression.defect-2024-06-21.R1.gzil-broke-w-B19.fq.gz \ - ${TESTDIR}/regression.defect-2024-06-21.R2.gzil-broke-w-B19.fq.gz || exit 1 - - # two special code paths for handling truncated GZIL files, depending if the garbage last word of the file >1MB (detected during read) or <=1MB (detected during uncompress) - $genozip -tf --truncate ${TESTDIR}/special.gzil.truncated-last-word.gt.1MB.fastq.gz || exit 1 - $genozip -tf --truncate ${TESTDIR}/special.gzil.truncated-last-word.eq.1MB.fastq.gz || exit 1 - # bug was: MC copying CIGAR from mate, when both are "*" (=empty in BAM). Fixed in 15.0.62 in PIZ. + test_header "MC copying CIGAR from mate, when both are null-CIGARs" $genozip -tf ${TESTDIR}/regression.2024-06-26.MC-copy-from-mate-CIGAR.null-CIGAR.bam || exit 1 + + # txt data is 393k - over segconf's first VB size of 300K (test in plain, bgzf, gz; test SAM and FASTQ as they have different code paths related to discovery of gz codec) + test_header "Single read/alignment longer than segconf's first VB" + $genozip -tf ${TESTDIR}/special.force-2nd-segconf-vb_size.plain.sam || exit 1 + $genozip -tf ${TESTDIR}/special.force-2nd-segconf-vb_size.bgzf.sam.gz || exit 1 + $genozip -tf ${TESTDIR}/special.force-2nd-segconf-vb_size.gz.sam.gz || exit 1 + $genozip -tf ${TESTDIR}/special.force-2nd-segconf-vb_size.plain.fq || exit 1 + $genozip -tf ${TESTDIR}/special.force-2nd-segconf-vb_size.bgzf.fq.gz || exit 1 + $genozip -tf ${TESTDIR}/special.force-2nd-segconf-vb_size.gz.fq.gz || exit 1 } batch_qual_codecs() @@ -1067,19 +1071,135 @@ batch_real_world_optimize() cd - } -batch_gzil_fastq() +test_effective_codec_pair() # $1=R1_file $2=R2_file 
# Compresses a pair of FASTQ files and verifies the effective codec chosen for each of them.
# $1=R1_file $2=R2_file (both relative to $TESTDIR) $3=expected_effective_codecs (e.g. "BGZF GZ ")
test_effective_codec_pair() # $1=R1_file $2=R2_file $3=expected_effective_codecs
{
    test_header "Test effective codes of paired $3 files"
    local effective_codecs=$OUTDIR/effective_codecs.txt

    $genozip "$TESTDIR/$1" "$TESTDIR/$2" -2fX -o $output -e $hs37d5 --show-bgzf | grep effective_codec | cut -d= -f3 | cut -d" " -f1 | tr "\n" " " > $effective_codecs || exit 1 # not in sub-shell so we can catch a genozip error
    (( ${PIPESTATUS[0]} == 0 )) || exit 1

    echo -n "effective_codecs=" `cat $effective_codecs`

    if [[ "`cat $effective_codecs`" != "$3" ]]; then
        echo "expected_effective_codecs=\"$3\" but effective_codecs=\"`cat $effective_codecs`\""
        exit 1
    fi

    # test genounzip too for good measure
    $genounzip -t $output || exit 1
}

# MGZIP codecs: codec discovery, reconstruction, truncated files, paired files and regression tests
batch_mgzip_fastq()
{
    batch_print_header

    # note: the 2nd component of each file name is the codec we expect to discover (e.g. gz.il1m.* -> IL1M)
    local files=( gz.bgzf.truncated.fq.gz gz.il1m.illumina.truncated.fq.gz gz.mgzf.mgi.R1.fq.gz \
                  gz.mgsp.mgi.R1.fq.gz gz.emfl.element.fq.gz gz.emvl.element.R1.fq.gz )
    local recon=$OUTDIR/recon.fq
    local truncated=$OUTDIR/truncated.fq
    local discovered_codec=$OUTDIR/discovered_codec.txt
    local f

    for f in ${files[@]}; do
        test_header "Test codec discovery $f"
        local expected_codec=`echo $f | cut -d. -f2`

        $genozip --show-gz $TESTDIR/$f -X | grep src_codec= |cut -d= -f2 | cut -d" " -f1 > $discovered_codec # not subshell so we can catch a genozip error
        (( ${PIPESTATUS[0]} == 0 )) || exit 1

        # string comparison: must be [[ ]] - inside (( )) the codec names would be evaluated as
        # (unset) variable names, i.e. 0 != 0, and the check could never fail
        if [[ "${expected_codec^^}" != "`cat $discovered_codec`" ]]; then
            echo "$f: Bad codec discovery for file: expected_codec=\"${expected_codec^^}\" discovered_codec=\"`cat $discovered_codec`\""
            exit 1
        fi
        echo "$f is compressed with effective_codec=`cat $discovered_codec`"

        test_header "$f: Test reconstruction"
        if (( `echo $f | grep truncated | wc -l` == 0 )); then
            $genozip -fX $TESTDIR/$f || exit 1
            $genounzip $TESTDIR/${f/.gz/.genozip} -fo $recon || exit 1
            cmp_2_files $TESTDIR/$f $recon
        else
            $genozip -fX --truncate $TESTDIR/$f || exit 1 # full gz blocks, but last block has a partial read
            $genounzip $TESTDIR/${f/.gz/.genozip} -fo $recon || exit 1
            # expected reconstruction: only whole reads (a FASTQ read is 4 lines)
            $zcat $TESTDIR/$f | head -$(( `$zcat $TESTDIR/$f | wc -l` / 4 * 4 )) > $truncated
            cmp_2_files $truncated $recon
        fi

        test_header "$f: --truncated: last mgzip block is truncated"

        # chop off the last 10 bytes of the compressed file
        # (wc -c rather than parsing "ls -l", whose column spacing is not stable)
        local len=$(( `wc -c < $TESTDIR/$f` ))
        local trunc_f=$OUTDIR/truncated.$f
        head -c $(( len - 10 )) $TESTDIR/$f > $trunc_f

        $genozip -ft --truncate $trunc_f || exit 1
    done

    test_header "drastically different VB sizes in pair: short VBs first"
    $genozip -ft $TESTDIR/special.human2-R2.short-seqs.fq.gz $TESTDIR/test.human2-R1.fq.gz -2 -e $hs37d5 || exit 1

    test_header "drastically different VB sizes in pair: long VBs first"
    $genozip -ft $TESTDIR/test.human2-R1.fq.gz $TESTDIR/special.human2-R2.short-seqs.fq.gz -2 -e $hs37d5 || exit 1

    # two special code paths for handling truncated GZIL files, depending if the garbage last word of the file >1MB (detected during read) or <=1MB (detected during uncompress)
    test_header "truncated IL1M file: fake isize in last word > 1MB"
    $genozip -tf --truncate ${TESTDIR}/special.il1m.truncated-last-word.gt.1MB.fastq.gz || exit 1

    test_header "truncated IL1M file: fake isize in last word <= 1MB"
    $genozip -tf --truncate ${TESTDIR}/special.il1m.truncated-last-word.eq.1MB.fastq.gz || exit 1

    local files=( gz.bgzf.truncated.fq.gz gz.il1m.illumina.truncated.fq.gz )
    for f in ${files[@]}; do
        test_header "$f: --truncated: Full mgzip blocks, but last read of last block is truncated"
        $genozip -ft --truncate $TESTDIR/$f || exit 1
    done

    # check that pair effective codecs are as intended
    test_effective_codec_pair il1m.human2-R1.fq.gz     il1m.human2-R2.fq.gz     "IL1M IL1M "
    test_effective_codec_pair test.human2-R1.fq.gz     test.human2-R2.fq.gz     "BGZF BGZF "
    test_effective_codec_pair test.human2-R1.fq.gz     il1m.human2-R2.fq.gz     "BGZF GZ "
    test_effective_codec_pair gz.mgsp.mgi.R1.fq.gz     gz.mgsp.mgi.R2.fq.gz     "MGSP MGSP "
    test_effective_codec_pair gz.mgzf.mgi.R1.fq.gz     gz.mgzf.mgi.R2.fq.gz     "MGZF MGZF "
    test_effective_codec_pair gz.emvl.element.R1.fq.gz gz.emvl.element.R2.fq.gz "EMVL EMVL "
    test_effective_codec_pair gz.mgsp.mgi.R1.fq.gz     gz.bgzf.mgi.R2.fq.gz     "MGSP GZ "
    test_effective_codec_pair gz.bgzf.mgi.R2.fq.gz     gz.mgsp.mgi.R1.fq.gz     "BGZF GZ "

    # TODO: mix an IN_SYNC codec (MGSP) with a non-in-sync (BGZF).
    # note: an unfinished "$zcat $(TESTDIR)" stood here: $(TESTDIR) is a command substitution of a
    # non-existent command, leaving $zcat with no arguments and blocking on stdin - removed until the test is written.

    # regression tests

    # bug was: compression failed, bc failed to identify a read in the VB when there is only one, with part of it in unconsumed_txt from previous VB, and part in a BGZF block (fixed 15.0.56)
    test_header "regression: vb=2 is single read"
    $genozip -B32 ${TESTDIR}/regression.vb2-is-single-read.fq.gz -ft || exit 1

    # bug was: compression failed, bc igzip would not decompress a isize > segconf.vb_sizes[0], and segconf gave up instead of attempting vb_sizes[1]
    test_header "regression: segconf vb_sizes[0] too small, try with vb_size[1]"
    $genozip ${TESTDIR}/regression.no_small_segconf_vb_size.fq.gz -ft --no-bgzf --truncate || exit 1

    # bug was: igzip not handling of IL1M blocks after move to igzip (defect 2024-03-01)
    test_header "regression: ILIM with igzip: multi-gzip-break-between-reads"
    $genozip -tf --no-bgzf ${TESTDIR}/regression.defect-2024-03-01.multi-gzip-break-between-reads.fq.gz || exit 1

    test_header "regression: ILIM with igzip: multi-gzip-break-within-read"
    $genozip -tf --no-bgzf ${TESTDIR}/regression.defect-2024-03-01.multi-gzip-break-within-read.fq.gz || exit 1

    # bug was: CHECKSUM at the end of 1MB-gz block in R2 was not handled correctly, failing R2 zip.
    # (R1 is BGZF file containing the same number of reads as the truncated R2 IL1M file)
    test_header "regression: ILIM with igzip: R2 alignment to R1 due to CHECKSUM"
    $genozip -tf --pair -e $hs37d5 --truncate -B19 --no-bgzf ${TESTDIR}/regression.defect-2024-06-21.R1.il1m-broke-w-B19.fq.gz \
                                                             ${TESTDIR}/regression.defect-2024-06-21.R2.il1m-broke-w-B19.fq.gz || exit 1

    $genozip -tf --pair -e $hs37d5 --truncate -B100000B ${TESTDIR}/regression.defect-2024-06-21.R1.il1m-broke-w-B19.fq.gz \
                                                        ${TESTDIR}/regression.defect-2024-06-21.R2.il1m-broke-w-B19.fq.gz || exit 1

    # bug was: non-terminal BGZF EOF blocks were not handled correctly
    test_header "regression: Non-terminal BGZF EOF block"
    local file=${TESTDIR}/regression.defect-2024-07-03.midfile-bgzf-eof.vcf.gz
    $genozip -tf $file -o $output || exit 1
    $genounzip $output -fo ${OUTDIR}/recon.vcf || exit 1
    cmp_2_files $file ${OUTDIR}/recon.vcf
}
existing genozip files -# # with a reference -# local files37=( test.IonXpress.sam.gz \ -# test.human.fq.gz test.human2.bam test.pacbio.clr.bam \ -# test.human2-R1.fq.bz2 test.pacbio.ccs.10k.bam \ -# test.NA12878.chr22.1x.bam test.NA12878-R1.100k.fq \ -# test.human2.filtered.snp.vcf test.solexa-headerless.sam ) + # with a reference + local files37=( test.IonXpress.sam.gz \ + test.human.fq.gz test.human2.bam test.pacbio.clr.bam \ + test.human2-R1.fq.bz2 test.pacbio.ccs.10k.bam \ + test.NA12878.chr22.1x.bam test.NA12878-R1.100k.fq \ + test.human2.filtered.snp.vcf test.solexa-headerless.sam ) -# local files38=( test.1KG-38.vcf.gz test.human-collated-headerless.sam test.cigar-no-seq-qual.bam) + local files38=( test.1KG-38.vcf.gz test.human-collated-headerless.sam test.cigar-no-seq-qual.bam) -# local filesT2T1_1=( test.nanopore.t2t_v1_1.bam ) + local filesT2T1_1=( test.nanopore.t2t_v1_1.bam ) -# local total=$(( ${#files37[@]} + ${#files38[@]} + ${#filesT2T1_1[@]} )) + local total=$(( ${#files37[@]} + ${#files38[@]} + ${#filesT2T1_1[@]} )) -# local i=0 -# for f in ${files37[@]}; do -# i=$(( i + 1 )) -# test_header "$f - backward compatability with prod (with reference) - 37 ($i/$total)" -# $genozip_latest $TESTDIR/$f -mf -e $hs37d5 -o $output || exit 1 -# $genounzip -t $output || exit 1 -# done + local i=0 + for f in ${files37[@]}; do + i=$(( i + 1 )) + test_header "$f - backward compatability with prod (with reference) - 37 ($i/$total)" + $genozip_latest $TESTDIR/$f -mf -e $hs37d5 -o $output || exit 1 + $genounzip -t $output || exit 1 + done -# for f in ${files38[@]}; do -# i=$(( i + 1 )) -# test_header "$f - backward compatability with prod (with reference) - 38 ($i/$total)" -# $genozip_latest $TESTDIR/$f -mf -e $GRCh38 -o $output || exit 1 -# $genounzip -t $output || exit 1 -# done + for f in ${files38[@]}; do + i=$(( i + 1 )) + test_header "$f - backward compatability with prod (with reference) - 38 ($i/$total)" + $genozip_latest $TESTDIR/$f -mf -e $GRCh38 -o $output 
|| exit 1 + $genounzip -t $output || exit 1 + done -# for f in ${filesT2T1_1[@]}; do -# i=$(( i + 1 )) -# test_header "$f - backward compatability with prod (with reference) - T2T ($i/$total)" -# $genozip_latest $TESTDIR/$f -mf -e $T2T1_1 -o $output || exit 1 -# $genounzip -t $output || exit 1 -# done -# } + for f in ${filesT2T1_1[@]}; do + i=$(( i + 1 )) + test_header "$f - backward compatability with prod (with reference) - T2T ($i/$total)" + $genozip_latest $TESTDIR/$f -mf -e $T2T1_1 -o $output || exit 1 + $genounzip -t $output || exit 1 + done +} batch_real_world_backcomp() { @@ -1272,10 +1392,15 @@ batch_real_world_backcomp() local i=0 for f in ${files[@]}; do i=$(( i + 1 )) - echo "$f - backcomp with $1 ($i/${#files[@]} batch_id=${GENOZIP_TEST})" + test_header "$f - backcomp with $1 ($i/${#files[@]} batch_id=${GENOZIP_TEST})" $genounzip -t $f -e $TESTDIR/$1/hs37d5.ref.genozip || exit 1 done + test_header "backcomp $1 genounzip --bgzf=exact" + local file=test.human2.bam + $genounzip $TESTDIR/$1/$file.genozip -fo $OUTDIR/$file --bgzf=exact || exit 1 + cmp_2_files_exact $TESTDIR/$file $OUTDIR/$file + cleanup } @@ -1303,8 +1428,8 @@ batch_real_world_small_vbs() for f in ${files[@]}; do rm -f $TESTDIR/${f}.genozip; done # test --pair and --deep with small VBs - $genozip --vblock=100000B -2tfe $GRCh38 $TESTDIR/test.human2-R1.fq.gz $TESTDIR/test.human2-R2.fq.gz $TESTDIR/deep.human2-38.R1.fq.gz $TESTDIR/deep.human2-38.R2.fq.gz --force-gencomp || exit 1 # 2 pairs - $genozip --vblock=100000B -3tfe $GRCh38 $TESTDIR/deep.human2-38.R1.fq.gz $TESTDIR/deep.human2-38.R2.fq.gz $TESTDIR/deep.human2-38.sam || exit 1 + $genozip --vblock=100000B -2tfe $GRCh38 $TESTDIR/test.human2-R1.fq.gz $TESTDIR/test.human2-R2.fq.gz || exit 1 # 2 pairs + $genozip --vblock=100000B -3tfe $GRCh38 $TESTDIR/deep.human2-38.R1.fq.gz $TESTDIR/deep.human2-38.R2.fq.gz $TESTDIR/deep.human2-38.sam --not-paired || exit 1 $genozip --vblock=100000B -3tfe $GRCh38 $TESTDIR/deep.bismark.sra2.one.fq.gz 
$TESTDIR/deep.bismark.sra2.two.fq.gz $TESTDIR/deep.bismark.sra2.bam || exit 1 } @@ -1341,7 +1466,7 @@ get_sam_type() # $1 = filename local bhead="`head -c26 $OUTDIR/txt | tail -c3`" if [[ "$bhead" == "BAM" ]]; then echo "BAM_Z0"; return; fi # BGZF with non-compressed blocks - local zhead="`zcat < $1 2>/dev/null | head -c3`" # zcat of a file called txt doesn't work on mac, hence input redirection + local zhead="`$zcat < $1 2>/dev/null | head -c3`" # zcat of a file called txt doesn't work on mac, hence input redirection if [[ "$zhead" == "BAM" ]]; then echo "BAM"; return; fi if [[ "$zhead" == "$first_chars" ]]; then echo "SAM_GZ"; return; fi } @@ -1363,7 +1488,7 @@ batch_sam_bam_cram_output() cp $src $name.bam samtools view $src -OCRAM -o $name.cram >& /dev/null || exit 1 - # tests flags_piz_set_out_dt and bgzf_piz_calculate_bgzf_flags + # tests flags_piz_set_out_dt and mgzip_piz_calculate_mgzip_flags for file in $name.sam $name.sam.gz $name.bam $name.cram; do local z=$file.genozip @@ -1467,7 +1592,7 @@ get_vcf_type() # $1 = filename local head="`head -c3 $1`" if [[ "$head" == "$first_chars" ]]; then echo "VCF"; return; fi - local zhead="`zcat < $1 2>/dev/null | head -c3`" # zcat of a file called txt doesn't work on mac, hence input redirection + local zhead="`$zcat < $1 2>/dev/null | head -c3`" # zcat of a file called txt doesn't work on mac, hence input redirection if [[ "$zhead" == "BCF" ]]; then echo "BCF"; return; fi if [[ "$zhead" == "$first_chars" ]]; then echo "VCF_GZ"; return; fi } @@ -1488,7 +1613,7 @@ batch_vcf_bcf_output() bcftools view -h $src -Oz1 -o $name.vcf.gz >& /dev/null || exit 1 bcftools view -h $src -Ob -o $name.bcf >& /dev/null || exit 1 - # tests flags_piz_set_out_dt and bgzf_piz_calculate_bgzf_flags + # tests flags_piz_set_out_dt and mgzip_piz_calculate_mgzip_flags for file in $name.vcf $name.vcf.gz $name.bcf ; do local z=$file.genozip @@ -2065,9 +2190,9 @@ batch_deep() # note: use --debug-deep for detailed tracking # btest contains a variety 
of scenarios test_header basic-deep local T=$TESTDIR/basic-deep - $genozip $T.sam $T.R1.fq $T.R2.fq -fe $GRCh38 -3t -o $output --best || exit 1 # --best causes aligner use on unmapped alignments - $genozip $T.sam $T.R1.fq $T.R2.fq -fe $GRCh38 -3t -o $output --no-gencomp || exit 1 # --no-gencomp causes in-VB segging against saggy - $genozip $T.sam $T.R1.fq $T.R2.fq -fe $GRCh38 -3t -o $output --md5 || exit 1 # --md5 uses a differt code path for verifying digest + $genozip $T.sam $T.R1.fq $T.R2.fq -tfe $GRCh38 --deep -o $output --best || exit 1 # --best causes aligner use on unmapped alignments + $genozip $T.sam $T.R1.fq $T.R2.fq -tfe $GRCh38 --deep -o $output --no-gencomp || exit 1 # --no-gencomp causes in-VB segging against saggy + $genozip $T.sam $T.R1.fq $T.R2.fq -tfe $GRCh38 --deep -o $output --md5 || exit 1 # --md5 uses a differt code path for verifying digest test_count_genocat_lines "" "--R1" 24 test_count_genocat_lines "" "--R2" 24 @@ -2077,8 +2202,8 @@ batch_deep() # note: use --debug-deep for detailed tracking test_header deep.human2-38 local T=$TESTDIR/deep.human2-38 - $genozip $T.sam $T.R1.fq.gz $T.R2.fq.gz -fe $GRCh38 -o $output -3t --best || exit 1 - $genozip $T.sam $T.R1.fq.gz $T.R2.fq.gz -fe $GRCh38 -o $output -3t --no-gencomp || exit 1 + $genozip $T.sam $T.R1.fq.gz $T.R2.fq.gz -fe $GRCh38 -o $output -3t --best --not-paired || exit 1 + $genozip $T.sam $T.R1.fq.gz $T.R2.fq.gz -fe $GRCh38 -o $output -3t --no-gencomp --not-paired || exit 1 # bismark (bisulfite), SRA2, non-matching FASTQ filenames test_header deep.bismark.sra2 @@ -2104,7 +2229,7 @@ batch_deep() # note: use --debug-deep for detailed tracking # SAM and FQ qname flavor is different - but comparable after canonization test_header deep.canonize-qname local T=$TESTDIR/deep.canonize-qname - $genozip $T.2.fq $T.1.fq $T.sam -fe $GRCh38 -o $output -3t || exit 1 + $genozip $T.2.fq $T.1.fq $T.sam -tfe $GRCh38 -o $output --deep || exit 1 # SAM sequences may be shorter than in FASTQ due to trimming 
test_header deep.trimmed @@ -2125,12 +2250,12 @@ batch_deep() # note: use --debug-deep for detailed tracking # FASTQ with SAUX test_header deep.illum.saux local T=$TESTDIR/deep.illum.saux - $genozip $T.R1.fq $T.R2.fq $T.sam -fe $hs37d5 -o $output -3t || exit 1 + $genozip $T.R1.fq $T.R2.fq $T.sam -tfe $hs37d5 -o $output --deep || exit 1 # qual scores corresponding to 'N' bases are replaced by DRAGEN test_header deep.rewrite-N-qual local T=$TESTDIR/deep.rewrite-N-qual - $genozip $T.R1.fq $T.R2.fq $T.sam -fe $hs37d5 -o $output -3t || exit 1 + $genozip $T.R1.fq $T.R2.fq $T.sam -fte $hs37d5 -o $output --deep || exit 1 if (( `$genozip $T.R1.fq $T.R2.fq $T.sam -fe $hs37d5 -o $output -3X --show-deep | grep "n_full_mch=(24,0)" | wc -l` != 1 )); then echo "expecting 24 full matches (including QUAL matches)" @@ -2141,7 +2266,7 @@ batch_deep() # note: use --debug-deep for detailed tracking cleanup_cache test_header "deep.qtype=QNAME2 - different FASTQ and SAM qname flavors" local T="$TESTDIR/deep.qtype=QNAME2" - $genozip $T.1.fq $T.2.fq $T.sam -fE $hg19 -3t -o $output || exit 1 + $genozip $T.1.fq $T.2.fq $T.sam -tfE $hg19 --deep -o $output || exit 1 test_header "deep.trimmed-deep_no_qual - encrypted" local T=$TESTDIR/deep.trimmed-deep_no_qual @@ -2317,10 +2442,12 @@ for exe in ${exes[@]}; do fi done -if `command -v md5 >& /dev/null`; then - md5="md5 -q" # mac +if [ -n "$is_mac" ]; then + md5="md5 -q" + zcat="gzip -dc" else md5=md5sum + zcat=zcat fi mkdir $OUTDIR >& /dev/null @@ -2354,52 +2481,52 @@ case $GENOZIP_TEST in 11) batch_basic basic.generic ;; 12) batch_precompressed ;; 13) batch_bgzf ;; -14) batch_subdirs ;; -15) batch_special_algs ;; -16) batch_qual_codecs ;; -17) batch_sam_bam_translations ;; -18) batch_23andMe_translations ;; -19) batch_genocat_tests ;; -20) batch_grep_count_lines ;; -21) batch_bam_subsetting ;; -22) batch_backward_compatability ;; -23) batch_single_thread ;; -24) batch_copy_ref_section ;; -25) batch_iupac ;; -26) batch_genols ;; -27) 
batch_tar_files_from ;; -28) batch_gencomp_depn_methods ;; -29) batch_deep ;; -30) batch_real_world_small_vbs ;; -31) batch_real_world_1_adler32 ;; -32) batch_real_world_genounzip_single_process ;; -33) batch_real_world_genounzip_compare_file ;; -34) batch_real_world_1_adler32 "--best -f" ;; -35) batch_real_world_1_adler32 "--fast --force-gencomp" ;; -36) batch_real_world_optimize ;; -37) batch_real_world_with_ref_md5 ;; -38) batch_real_world_with_ref_md5 "--best --no-cache --force-gencomp" ;; -39) batch_multiseq ;; -40) batch_sam_bam_cram_output ;; -41) batch_vcf_bcf_output ;; -42) batch_external_unzip ;; -43) batch_external_ora ;; -44) batch_reference_fastq ;; -45) batch_reference_fasta_as_fastq ;; -46) batch_reference_sam ;; -47) batch_reference_vcf ;; -48) batch_many_small_files ;; -49) batch_make_reference ;; -50) batch_headerless_wrong_ref ;; -51) batch_replace ;; -52) batch_coverage_idxstats ;; -53) batch_qname_flavors ;; -54) batch_piz_no_license ;; -55) batch_sendto ;; -56) batch_user_message_permissions ;; -57) batch_password_permissions ;; -58) batch_reference_backcomp ;; -59) batch_gzil_fastq ;; +14) batch_mgzip_fastq ;; +15) batch_subdirs ;; +16) batch_special_algs ;; +17) batch_qual_codecs ;; +18) batch_sam_bam_translations ;; +19) batch_23andMe_translations ;; +20) batch_genocat_tests ;; +21) batch_grep_count_lines ;; +22) batch_bam_subsetting ;; +23) batch_backward_compatability ;; +24) batch_single_thread ;; +25) batch_copy_ref_section ;; +26) batch_iupac ;; +27) batch_genols ;; +28) batch_tar_files_from ;; +29) batch_gencomp_depn_methods ;; +30) batch_deep ;; +31) batch_real_world_small_vbs ;; +32) batch_real_world_1_adler32 ;; +33) batch_real_world_genounzip_single_process ;; +34) batch_real_world_genounzip_compare_file ;; +35) batch_real_world_1_adler32 "--best -f" ;; +36) batch_real_world_1_adler32 "--fast --force-gencomp" ;; +37) batch_real_world_optimize ;; +38) batch_real_world_with_ref_md5 ;; +39) batch_real_world_with_ref_md5 "--best 
--no-cache --force-gencomp" ;; +40) batch_multiseq ;; +41) batch_sam_bam_cram_output ;; +42) batch_vcf_bcf_output ;; +43) batch_external_unzip ;; +44) batch_external_ora ;; +45) batch_reference_fastq ;; +46) batch_reference_fasta_as_fastq ;; +47) batch_reference_sam ;; +48) batch_reference_vcf ;; +49) batch_many_small_files ;; +50) batch_make_reference ;; +51) batch_headerless_wrong_ref ;; +52) batch_replace ;; +53) batch_coverage_idxstats ;; +54) batch_qname_flavors ;; +55) batch_piz_no_license ;; +56) batch_sendto ;; +57) batch_user_message_permissions ;; +58) batch_password_permissions ;; +59) batch_reference_backcomp ;; 60) batch_real_world_backcomp 11.0.11 ;; # note: versions must match VERSIONS in test/Makefile 61) batch_real_world_backcomp 12.0.42 ;; 62) batch_real_world_backcomp 13.0.21 ;; diff --git a/src/threads.c b/src/threads.c index 6ea175ea..4a06ac67 100644 --- a/src/threads.c +++ b/src/threads.c @@ -528,7 +528,7 @@ void threads_join_do (ThreadId *thread_id, rom expected_task, rom expected_task2 } // wait for thread to complete (no wait if it completed already) - pthread_join (ent.pthread, NULL); + PTHREAD_JOIN (ent.pthread, ent.task_name); // wait for data from this thread to arrive __atomic_thread_fence (__ATOMIC_ACQUIRE); diff --git a/src/txtfile.c b/src/txtfile.c index 4c3a9958..813bc3e7 100644 --- a/src/txtfile.c +++ b/src/txtfile.c @@ -14,7 +14,7 @@ #include "txtfile.h" #include "file.h" #include "codec.h" -#include "bgzf.h" +#include "mgzip.h" #include "biopsy.h" #include "zip.h" #include "arch.h" @@ -26,8 +26,6 @@ #define MAX_TXT_HEADER_LEN ((uint64_t)0xffffffff) // maximum length of txt header - one issue with enlarging it is that we digest it in one go, and the digest module is 32 bit -#define TXTFILE_READ_VB_PADDING 16 // we need this quantity of unused bytes at the end of vb.txt_data - // PIZ: dump bad vb to disk StrTextLong txtfile_dump_vb (VBlockP vb, rom base_name) { @@ -45,17 +43,29 @@ StrTextLong txtfile_dump_vb (VBlockP vb, rom 
base_name) // returns the requested number of bytes, except if eof in which case it could be less. uint32_t txtfile_fread (FileP file, - FILE *fp, // note: non-NULL if different from file->file (when re-reading) - void *addr, uint32_t size, int64_t *disk_so_far) + FILE *fp, // note: non-NULL if different from file->file (when re-reading) + void *addr, // NULL means append to gz_data (reallocing if needed) + int32_t size, int64_t *disk_so_far) { - ASSERTNOTNULL (addr); + if (size <= 0) return 0; + if (!fp) fp = (FILE *)file->file; + bool is_gz_data = (addr == NULL); + + if (is_gz_data) { + buf_alloc (evb, &file->gz_data, size, 0, uint8_t, 1.1, "txt_file->gz_data"); + addr = BAFT8 (file->gz_data); + } uint32_t bytes = fread (addr, 1, size, fp); if (disk_so_far) *disk_so_far += bytes; - ASSERT (bytes == size || !ferror (fp), "Error while reading %s codec=%s on filesystem=%s - requested %u bytes but read only %u: (%u)%s", - file->basename, codec_name (file->codec), arch_get_filesystem_type (file).s, size, bytes, errno, strerror (errno)); + int save_errno = errno; + ASSERT (bytes == size || !ferror (fp) || + "Error while reading %s codec=%s on filesystem=%s - requested %u bytes but read only %u: (%u)%s", + file->basename, codec_name (file->effective_codec), arch_get_filesystem_type (file).s, size, bytes, save_errno, strerror (save_errno)); + + if (is_gz_data) file->gz_data.len32 += bytes; // note: since we now took care of errors, we know that it feof iff bytes < size return bytes; @@ -85,7 +95,7 @@ static inline uint32_t txtfile_read_block_plain (VBlockP vb, uint32_t max_bytes) char *data = BAFTtxt; int32_t bytes_read; - // case: we have data passed to us from txtfile_discover_gz_codec - handle it first (possibly txt_data already contains data passed down from previous VBs) + // case: we have data passed to us from txtfile_discover_specific_gz - handle it first (possibly txt_data already contains data passed down from previous VBs) if (txt_file->gz_data.len) { 
bytes_read = MIN_(txt_file->gz_data.len32, max_bytes); memcpy (BAFTtxt, B1STc (txt_file->gz_data), bytes_read); @@ -149,7 +159,163 @@ rom isal_error (int ret) // so that decompression is parallelized with disk read-ahead buffer filling (a bigger buffer would cause the disk to be idle while we are still decompressing) #define IGZIP_CHUNK (128 KB) -static void txtfile_initialize_igzip (FileP file) +static StrText display_gz_flags (uint32_t flg) +{ + StrText s={}; + uint32_t s_len=0; + + rom names[] = { "TEXT", "HCRC", "XTRA", "NAME", "CMNT", "RES1", "RES2", "RES3" }; + + for (int i=0; i < 8; i++) + if ((flg >> i) & 1) + SNPRINTF (s, "%s|", names[i]); + if (s_len) + s.s[s_len-1] = 0; // remove final '|' + else + s.s[0] = '0'; + + return s; +} + +StrTextLong display_gz_header (STR8p(h), bool obscure_fname) +{ + StrTextLong s = {}; + uint32_t s_len = 0; + #define ADVANCE_h(n) ({ h += (n); h_len -= (n); }) + + if (h_len < 3) goto fail; + if (memcmp (h, BGZF_PREFIX, 3)) + return (StrTextLong){ "" }; + + if (h_len < 10) goto fail; + uint8_t flg = h[3]; + time_t mtime = GET_UINT32(&h[4]); + uint8_t xfl = h[8]; + uint8_t os = h[9]; + + struct tm *tm_info = localtime (&mtime); + char time_str[32]; + if (IN_RANGX (mtime, 1, time(NULL))) + strftime (time_str, sizeof(time_str), "%Y-%m-%d_%H:%M:%S", tm_info); + + // in Ultima, mtime holds the total_txt_size (R1+R2) + else if (TECH(ULTIMA) && !IN_RANGX(tm_info->tm_year, 20, 50)) // note: tech not set in initial call, but set later (e.g. when reporting to stats) + snprintf (time_str, sizeof(time_str), "ULTIMA=%u", (uint32_t)mtime); + + else + snprintf (time_str, sizeof(time_str), "MTIME=%u", (uint32_t)mtime); + + #define OS(x,s) os==x?s : + // note: for brevity, we don't display ID and CM: we verify above that they are BGZF_PREFIX. 
+ SNPRINTF (s, "{ FLG=%s %s XFL=%u OS=%s", display_gz_flags(flg).s, time_str, xfl, + OS(3,"Unix") OS(7,"Mac") OS(11,"Windows") OS(255,"Unknown") str_int_s(h[9]).s); + ADVANCE_h(10); + + if (IS_FLAG (flg, 4)) { // FEXTRA + if (h_len < 2) goto fail; + uint16_t xlen = GET_UINT16(h); + ADVANCE_h(2); + SNPRINTF (s, " XLEN=%u X=[", xlen); + + if (h_len < xlen) goto fail; + + while (xlen) { + if (h_len < 4) goto fail; + uint32_t f_len = GET_UINT16(&h[2]); + if (h_len < 4 + f_len) goto fail; + + uint16_t id = GET_UINT16 (h); + if (xlen == 6 && id == 0x4342 && f_len == 2) + SNPRINTF (s, " { BGZF BSIZE-1=%u }", GET_UINT16(&h[4])); + + else if (xlen == 8 && id == 0x4749 && f_len == 4) + SNPRINTF (s, " { MGZF BSIZE=%u }", GET_UINT32(&h[4])); + + else + SNPRINTF (s, " { ID=%02X%02X LEN=%u F=%s }", h[0], h[1], f_len, str_to_hex (&h[4], f_len).s); + + ADVANCE_h(f_len + 4); + xlen -= f_len + 4; + } + + SNPRINTF0 (s, " ]"); + } + + if (IS_FLAG(flg, 8)) { // FNAME + int name_len = strnlen ((rom)h, 1 KB); + if (h_len < name_len+1 || name_len == 1 KB) goto fail; // somewhat safety + + if (obscure_fname) + SNPRINTF0 (s, " NAME="); + else + SNPRINTF (s, " NAME=\"%s\"", h); + ADVANCE_h (name_len+1); + } + + if (IS_FLAG(flg, 16)) { // FCOMMENT + int comment_len = strnlen ((rom)h, 1 KB); + if (h_len < comment_len+1 || comment_len == 1 KB) goto fail; // somewhat safety + + SNPRINTF (s, " CMNT=\"%s\"", h); + ADVANCE_h (comment_len+1); + } + + if (IS_FLAG(flg, 1)) { // FHCRC + if (h_len < 2) goto fail; + SNPRINTF (s, " HCRC=%u", GET_UINT16(h)); + ADVANCE_h(2); + } + + SNPRINTF0 (s, " }"); + + if (flg == 8/*name*/ && mtime && xfl == 0 && os == 3) SNPRINTF0 (s, " (looks like gzip or pigz)"); + + return s; + +fail: + return (StrTextLong){ "" }; +} + +uint32_t gzip_header_length (FileP file) +{ + ARRAY (char, h, file->gz_data); + if (h_len < 10) return 0; + + if (h[0] != '\x1f' || h[1] != '\x8b' || h[2] != '\x08') return 0; // not gzip + + uint8_t flg = h[3]; + uint32_t xlen=0, name_len=0, 
comment_len=0, hcrc_len = 0; + + if (IS_FLAG (flg, 4)) { // FEXTRA + if (h_len < 12) return 0; + xlen = 2 + GET_UINT16(h+10); + if (h_len < 10 + xlen) return 0; + } + + if (IS_FLAG (flg, 8)) { // FNAME + int offset = 10 + xlen; + name_len = strnlen (h + offset, h_len - offset - 1); + if (h[offset + name_len]) return 0; // expecting \0 + name_len++; // inc. the \0 + } + + if (IS_FLAG (flg, 16)) { // FCOMMENT + int offset = 10 + xlen + name_len; + comment_len = strnlen (h + offset, h_len - offset - 1); + if (h[offset + comment_len]) return 0; // expecting \0 + comment_len++; // inc. the \0 + } + + if (IS_FLAG(flg, 1)) { // FHCRC + int offset = 10 + xlen + name_len + comment_len; + if (h_len < offset + 2) return 0; + hcrc_len = 2; + } + + return 10 + xlen + name_len + comment_len + hcrc_len; +} + +void txtfile_initialize_igzip (FileP file) { ASSERTNOTINUSE (file->igzip_state); @@ -160,22 +326,57 @@ static void txtfile_initialize_igzip (FileP file) state->crc_flag = ISAL_GZIP; } -void txtfile_discover_gz_codec (FileP file) +bool txtfile_is_gzip (FileP file) { - buf_alloc (evb, &file->gz_data, 0, MAX_(IGZIP_CHUNK, GZIL_MAX_BLOCK_SIZE), char, 0, "gz_data"); + txtfile_fread (file, NULL, NULL, 3, &file->disk_so_far); + ARRAY (char, h, file->gz_data); + return h[0] == '\x1f' && h[1] == '\x8b' && h[2] == '\x08'; +} + +// check if the first 3 gz blocks have the same isize. 
+static bool txtfile_segconf_discover_constant_isize (STR8p(header)) +{ + #define MAX_CONSTANT_SIZE_HEADER_LEN 255 // must be large enough for all the constant-size codecs we recognize + if (MAX_CONSTANT_SIZE_HEADER_LEN > 255) return false; + + // isize of the first gz block followed by the header + uint8_t signature[4 + header_len]; + PUT_UINT32 (signature, txt_file->gz_data.uncomp_len); + memcpy (signature + 4, header, header_len); + + // find 3 more instances of "isize header" in the data with same isize + uint8_t *next_isize = B8(txt_file->gz_data, txt_file->gz_data.comp_len - 4); + for (int i=0 ; i < 3; i++) { + if (!(next_isize = memmem (next_isize, BAFT8(txt_file->gz_data) - next_isize, signature, 4 + header_len))) + return false; + + next_isize += sizeof (signature); + } + + return true; // there are at least 4 gz blocks, and the first 3 have the same isize +} + +// run when open a txt file (before reading the txt_header), except for FASTQ for which it is +// run at the end of segconf after segconf.tech is known. 
+void txtfile_discover_specific_gz (FileP file) +{ + START_TIMER; + bool keep_src_codec = (file->src_codec == CODEC_CRAM || file->src_codec == CODEC_BAM || file->src_codec == CODEC_BCF); + // read the first potential BGZF block to test if this is GZ or BGZF - // note: we read if --no-bgzf to capture the data for z_file->gz_header - GzStatus status = bgzf_read_block (file, true); + // note: we read if even if --no-bgzf to capture the data for z_file->gz_header + GzStatus status = mgzip_read_block_with_bsize (file, true, CODEC_BGZF); + + // copy GZ header: data should be in gz_data txtfile_discover_specific_gz + file->gz_header_len = gzip_header_length (file); + memcpy (file->gz_header, B1ST8 (file->gz_data), MIN_(GZ_HEADER_LEN, file->gz_header_len)); // case: this is a BGZF block - // note: we keep the still-compressed data in vb->scratch for later consumption + // note: we keep the still-compressed data in vb->comp_txt_data for later consumption if (!flag.no_bgzf && status == GZ_SUCCESS && file->gz_data.uncomp_len > 0) { - if (file->source_codec != CODEC_CRAM && file->source_codec != CODEC_BAM && file->source_codec != CODEC_BCF) - file->source_codec = CODEC_BGZF; - - file->codec = CODEC_BGZF; - bgzf_initialize_discovery (file); + if (!keep_src_codec) file->src_codec = CODEC_BGZF; + file->effective_codec = CODEC_BGZF; } // for regulars files, we already skipped 0 size files. This can happen in STDIN @@ -187,24 +388,45 @@ void txtfile_discover_gz_codec (FileP file) ABORTINP ("No data exists in input file %s", file->name ? 
file->name : FILENAME_STDIN); } - // case: this is non-BGZF GZIP format - else if (flag.no_bgzf || status == GZ_IS_GZIP_NOT_BGZF) { - // case: this is FASTQ (judged by the filename) that is GZIL - bool is_eof = false; - if (!flag.no_bgzf && file->data_type == DT_FASTQ && - gzil_read_block (file, true, &is_eof) != GZ_IS_NOT_GZIL) - - file->codec = file->source_codec = CODEC_GZIL; + // case: this is non-BGZF GZIP format: test for one of the FASTQ codec + else if (status == GZ_IS_OTHER_FORMAT && file->data_type == DT_FASTQ) { + #define SET_CODEC(codec) ({ file->src_codec = CODEC_##codec; \ + if (!flag.no_bgzf) file->effective_codec = CODEC_##codec; }) + + if (TECH(ILLUM) && mgzip_read_block_no_bsize (file, true, CODEC_IL1M) == GZ_SUCCESS && + txtfile_segconf_discover_constant_isize (_8(IL1M_HEADER))) + SET_CODEC(IL1M); + + else if (TECH(MGI) && mgzip_read_block_no_bsize (file, true, CODEC_MGSP) == GZ_SUCCESS && + txtfile_segconf_discover_constant_isize (_8(MGSP_HEADER))) { + SET_CODEC(MGSP); + file->num_mgsp_blocks_in_vb = 0; // reset + } + + else if (TECH(MGI) && mgzip_read_block_with_bsize (file, true, CODEC_MGZF) == GZ_SUCCESS) + SET_CODEC(MGZF); + + else if (TECH(ELEMENT) && mgzip_read_block_no_bsize (file, true, CODEC_EMFL) == GZ_SUCCESS && + txtfile_segconf_discover_constant_isize (STRa(file->gz_header))) + SET_CODEC(EMFL); + + else if (TECH(ELEMENT) && mgzip_read_block_no_bsize (file, true, CODEC_EMVL) == GZ_SUCCESS && + str_issame_(B1STc(file->gz_data), file->gz_data.comp_len, _S(EMVL_FIRST_BLOCK))) // first block is an empty block + SET_CODEC(EMVL); - // case: neither BGZF or GZIL - treat as normal GZ else - file->codec = file->source_codec = CODEC_GZ; + goto generic_gz; + #undef SET_CODEC } - // case: this is not GZIP format at all. 
treat as a plain file, and put the data read in vb->scratch - // for later consumption is txtfile_read_block_plain - else if (status == GZ_IS_NOT_GZIP) { + else if (status != GZ_NOT_GZIP) generic_gz: { + if (!keep_src_codec)file->src_codec = CODEC_GZ; + file->effective_codec = CODEC_GZ; + } + // case: this is not GZIP format at all. treat as a plain file, and put the data read in vb->comp_txt_data + // for later consumption is txtfile_read_block_plain + else { #define BZ2_MAGIC "BZh" #define XZ_MAGIC (char[]){ 0xFD, '7', 'z', 'X', 'Z', 0 } #define ZIP_MAGIC (char[]){ 0x50, 0x4b, 0x03, 0x04 } @@ -232,32 +454,29 @@ void txtfile_discover_gz_codec (FileP file) ABORTINP0 ("The data seems to be in ora format. Please use --input to specify the type (eg: \"genozip --input fastq.ora\")"); } - file->codec = CODEC_NONE; + if (!keep_src_codec) file->src_codec = CODEC_NONE; + file->effective_codec = CODEC_NONE; } - else - ABORT ("Invalid status=%u", status); + if (flag.make_reference && IS_MGZIP(file->src_codec)) + file->effective_codec = CODEC_GZ; + + // case: R2 codec differ than R1's - just use GZ (decompressing in main thread). This is a little + // blunt as most codecs will interact well in most cases (e.g. if vb_size is large enough), but there + // are many edge cases that need to be handled, and this is not a common use case worth dealing with. + if (IS_R2 && file->effective_codec != z_file->comp_eff_codec[flag.zip_comp_i-1]) + file->effective_codec = CODEC_GZ; - // if this is R2 we are going to uncompress in the main thread. IGZIP is a faster method for doing so - // than BGZF or GZIP, bc it is better at parallelizing disk read-aheads and decompression. The only reason - // to keep BGZF is if we want to store BGZF isizes for exact reconstruction, which is only possible if we discovered - // the library. 
BGZF library discovery has not yet occurred for R2, so we take the R1 results as proxy - // (if the proxying is wrong - either we will compress unneccsary slowly with BGZF instead of IGZIP, or we will - // incorrectly compress with IGZIP and drop the BGZF isizes preventing exact reconstruction - that's ok) - bool is_pair2 = flag.pair && ((flag.zip_comp_i == FQ_COMP_R2 && Z_DT(FASTQ)) || // note: flag.pair is not incremented yet; z_file only exists if this 2nd+ component so test that first - (flag.zip_comp_i == SAM_COMP_FQ01 && (Z_DT(BAM) || Z_DT(SAM)))); - if ((is_pair2 && (file->codec == CODEC_GZIL || (z_file->comp_codec[flag.zip_comp_i-1] == CODEC_BGZF && z_file->comp_bgzf[flag.zip_comp_i-1].level == BGZF_COMP_LEVEL_UNKNOWN))) || - // likewise for --make-reference: we uncompress by main thread, and we don't care about retaining BGZF isizes - (flag.make_reference && (file->codec == CODEC_GZIL || file->codec == CODEC_BGZF))) { + if (file->effective_codec == CODEC_GZ) + txtfile_initialize_igzip (file); - file->gunzip_method = CODEC_GZ; - } + else if (IS_MGZIP(file->effective_codec)) { + file->max_mgzip_isize = file->gz_data.uncomp_len; // note: will be 0 for EMVL, bc first block is 0 - else - file->gunzip_method = file->codec; + bgzf_initialize_discovery (file); // to discover library and level + } - if (file->gunzip_method == CODEC_GZ) - txtfile_initialize_igzip (file); + COPY_TIMER_EVB (txtfile_discover_specific_gz); } // ZIP main thread: called after txt and z are open, and txt codecs have been discovered. 
@@ -267,46 +486,42 @@ void txtfile_zip_finalize_codecs (void) ASSERTNOTNULL (txt_file); if (flag.zip_comp_i < MAX_NUM_COMPS) { // for stats - z_file->comp_codec[flag.zip_comp_i] = txt_file->codec; - z_file->comp_source_codec[flag.zip_comp_i] = txt_file->source_codec; - z_file->comp_gunzip_method[flag.zip_comp_i] = txt_file->gunzip_method; - - // copy GZ header (but not if BGZF, GZIL): data should be in gz_data txtfile_discover_gz_codec - if (TXT_IS_GZ && txt_file->gz_data.len >= 12) - memcpy (z_file->gz_header, B1ST8 (txt_file->gz_data), 12); + z_file->comp_src_codec[flag.zip_comp_i] = txt_file->src_codec; + z_file->comp_eff_codec[flag.zip_comp_i] = txt_file->effective_codec; } - // note: for BGZF, we report in bgzf_finalize_discovery as we don't yet know the library/level here - if ((flag.show_gz || flag.show_bgzf) && txt_file->gunzip_method != CODEC_BGZF) { - iprintf ("%s: txt_codec=%s", txt_file->basename, txtfile_codec_name (z_file, flag.zip_comp_i).s); + memcpy (z_file->comp_gz_header[flag.zip_comp_i], txt_file->gz_header, GZ_HEADER_LEN); + + if (flag.show_gz || flag.show_bgzf) { + iprintf ("%s: src_codec=%s effective_codec=%s gz_header=%s", txt_file->basename, // same format as in txtfile_zip_finalize_codecs + codec_name (txt_file->src_codec), codec_name (txt_file->effective_codec), + display_gz_header (z_file->comp_gz_header[flag.zip_comp_i], GZ_HEADER_LEN, false).s); if (flag.show_gz) { iprint0 ("\n"); exit_ok; }; - iprintf (" gunzip_method=%s\n", codec_name (txt_file->gunzip_method)); + iprintf (" effective_codec=%s\n", codec_name (txt_file->effective_codec)); } } // runs in main thread, reads and uncompressed GZ, and populates txt_data for vb -static uint32_t txtfile_read_block_igzip (VBlockP vb, uint32_t max_bytes) +static uint32_t txtfile_read_block_igzip (VBlockP vb, uint32_t max_bytes, bool *is_data_read) { START_TIMER; ASSERTISALLOCED (txt_file->gz_data); struct inflate_state *state = B1ST (struct inflate_state, txt_file->igzip_state); + sSTRl 
(last_gz_header, GZ_HEADER_LEN)*0; + + // save in case of realloc in txtfile_fread + uint32_t next_in_before = state->next_in ? BNUM(txt_file->gz_data, state->next_in) : 0; // top up gz_data - int32_t bytes_read = (txt_file->gz_data.len32 < IGZIP_CHUNK) - ? txtfile_fread (txt_file, NULL, BAFTc(txt_file->gz_data), IGZIP_CHUNK - txt_file->gz_data.len32, &txt_file->disk_so_far) - : 0; + if (txt_file->gz_data.len32 < IGZIP_CHUNK) + txtfile_fread (txt_file, NULL, NULL, (int32_t)IGZIP_CHUNK - (int32_t)txt_file->gz_data.len32, &txt_file->disk_so_far); - ASSERT (!ferror((FILE *)txt_file->file) && bytes_read >= 0, "Error reading GZ file %s on filesystem=%s: %s", - txt_name, arch_get_txt_filesystem().s, strerror (errno)); - - txt_file->gz_data.len32 += bytes_read; // yet-uncompressed data read from disk - { START_TIMER - state->next_in = B1ST8 (txt_file->gz_data); - state->avail_in = txt_file->gz_data.len32; + state->next_in = B8(txt_file->gz_data, next_in_before); + state->avail_in = BAFT8(txt_file->gz_data) - state->next_in; state->next_out = BAFT8 (vb->txt_data); state->avail_out = max_bytes; @@ -325,6 +540,11 @@ static uint32_t txtfile_read_block_igzip (VBlockP vb, uint32_t max_bytes) if (state->block_state == ISAL_BLOCK_FINISH) isal_inflate_reset (state); + if (state->block_state == ISAL_BLOCK_NEW_HDR && flag.show_bgzf) { + last_gz_header_len = MIN_(GZ_HEADER_LEN, state->avail_in); + memcpy (last_gz_header, state->next_in, last_gz_header_len); + } + int ret = isal_inflate (state); // new gzip header in a file that has concatented gzip compressions ASSERT (ret == ISAL_DECOMP_OK || ret == ISAL_END_INPUT, "isal_inflate error: %s avail_in=%u avail_out=%u", isal_error (ret), txt_file->gz_data.len32, max_bytes); @@ -332,20 +552,46 @@ static uint32_t txtfile_read_block_igzip (VBlockP vb, uint32_t max_bytes) COPY_TIMER(igzip_uncompress_during_read); } uint32_t gz_data_consumed = BNUM (txt_file->gz_data, state->next_in); + + if (state->block_state == ISAL_BLOCK_FINISH && z_file 
&& gz_data_consumed >= 4) { + uint32_t isize = GET_UINT32 (B8(txt_file->gz_data, gz_data_consumed - 4)); // note: this is actually (isize % 2^32) bc gzip isize field is 32bit - // for stats: read and save the isize from the gzip footer (of the first 2 gzip blocks) - if (state->block_state == ISAL_BLOCK_FINISH && z_file && gz_data_consumed >= 4) + // for stats: save the isize from the gzip footer (of the first 2 gzip blocks) for (int i=0; i <= 1; i++) if (!z_file->gz_isize[flag.zip_comp_i][i]) { - z_file->gz_isize[flag.zip_comp_i][i] = LTEN32 (GET_UINT32 (B8(txt_file->gz_data, gz_data_consumed - 4))); + z_file->gz_isize[flag.zip_comp_i][i] = isize; break; } - buf_remove (txt_file->gz_data, char, 0, gz_data_consumed); + uint64_t decompressed_so_far = txt_file->disk_so_far - txt_file->gz_data.len + BNUM (txt_file->gz_data, state->next_in); + + if (flag.show_bgzf) { + uint64_t bsize = decompressed_so_far - txt_file->start_gz_block; + iprintf ("UNCOMPRESS GZ thread=MAIN vb=%s block_i=%"PRIu64" bsize=%"PRIu64" isize=%u%s gz_header=%s\n", + VB_NAME, txt_file->gz_blocks_so_far, bsize, + isize, (bsize > 500 MB && (double) isize / (double) bsize < 0.98) ? 
"(overflow)" : "", // note: this doesn't catch all overflows, for example it won't catch: bsize=1GB and isize=3GB where the true isize is 7GB + display_gz_header ((bytes)STRa(last_gz_header), false).s); + } - inc_disk_gz_uncomp_or_trunc (txt_file, gz_data_consumed); + txt_file->gz_blocks_so_far++; + txt_file->start_gz_block = decompressed_so_far; + txt_file->max_mgzip_isize = MAX_(txt_file->max_mgzip_isize, isize); + } - txt_file->no_more_blocks = (!state->avail_in && feof ((FILE *)txt_file->file)); + *is_data_read = BNUM(txt_file->gz_data, state->next_in) > next_in_before; + + // case FASTQ: don't remove segconf data as we will re-uncompress it after effective_codec discovery + if (!txt_file->discover_during_segconf) { + buf_remove (txt_file->gz_data, char, 0, gz_data_consumed); + state->next_in = NULL; + + inc_disk_gz_uncomp_or_trunc (txt_file, gz_data_consumed); + + txt_file->no_more_blocks = (!state->avail_in && feof ((FILE *)txt_file->file)); + } + + else + segconf.gz_comp_size += gz_data_consumed; // segconf, for use of txtfile_set_seggable_size Ltxt = BNUMtxt (state->next_out); @@ -371,170 +617,130 @@ static inline uint32_t txtfile_read_block_bz2 (VBlockP vb, uint32_t max_bytes) return bytes_read; } -// BGZF: we read *compressed* data into vb->scratch - that will be decompressed now or later, depending on "uncompress". -// We read data with a *decompressed* size up to max_uncomp. 
vb->scratch always contains only full BGZF blocks -static inline uint32_t txtfile_read_block_bgz (VBlockP vb, int32_t max_uncomp /* must be signed */, bool uncompress) +static noreturn void txtfile_dump_comp_txt_data (VBlockP vb, uint32_t this_block_start) +{ + char dump_fn[strlen(txt_name)+100]; + snprintf (dump_fn, sizeof (dump_fn), "%s.vb-%u.bad-%s.bad-offset-0x%x", + txt_name, vb->vblock_i, codec_name (txt_file->effective_codec), this_block_start); + + buf_dump_to_file (dump_fn, &vb->comp_txt_data, 1, false, false, true, false); + + ABORT ("%s: Invalid %s block: block_comp_len=%u. Entire data of this vblock dumped to %s, bad block stats at offset 0x%x", + VB_NAME, codec_name (txt_file->effective_codec), txt_file->gz_data.comp_len, dump_fn, this_block_start); +} + +// Multi-block GZIP (MGZIP): we read gz data into vb->comp_txt_data - that will be decompressed now or later, depending on "uncompress". +// We read data with a *decompressed* size up to max_uncomp. vb->comp_txt_data always contains only full MGZIP blocks +static inline uint32_t txtfile_read_block_mgzip (VBlockP vb, + int32_t requested_bytes, // 0 means read exactly one mgzip block + bool uncompress, + bool *is_data_read) { START_TIMER; uint32_t this_uncomp_len=0; + *is_data_read = false; // initialize if (uncompress) vb->gzip_compressor = libdeflate_alloc_decompressor(vb, __FUNCLINE); - int64_t start_uncomp_len = vb->scratch.uncomp_len; - int32_t max_block_size = TXT_IS_BGZF ? 
BGZF_MAX_BLOCK_SIZE : GZIL_MAX_BLOCK_SIZE; + int64_t start_uncomp_len = vb->comp_txt_data.uncomp_len; + int32_t max_block_size = mgzip_get_max_block_size(); - // scratch contains gz-compressed data; we use .uncomp_len to track its uncompress length - buf_alloc (vb, &vb->scratch, 0, max_uncomp/2, char, 0, "scratch"); + // comp_txt_data contains gz-compressed data; we use .uncomp_len to track its uncompress length + buf_alloc (vb, &vb->comp_txt_data, 0, requested_bytes/2, char, 0, "scratch"); - while (vb->scratch.uncomp_len - start_uncomp_len <= max_uncomp - max_block_size && - !txt_file->no_more_blocks) { + while ( ( (requested_bytes && vb->comp_txt_data.uncomp_len - start_uncomp_len <= requested_bytes - max_block_size) + || (!requested_bytes && !vb->comp_txt_data.uncomp_len) + || TXT_IS_MGSP) // MGSP: loop until broken + && ( !txt_file->no_more_blocks) ) { - bool is_eof = false; // only used for GZIL - GzStatus status = TXT_IS_BGZF ? bgzf_read_block (txt_file, false) - : gzil_read_block (txt_file, false, &is_eof); - - uint32_t this_block_start = vb->scratch.len32; - buf_add_more (vb, &vb->scratch, txt_file->gz_data.data, txt_file->gz_data.comp_len, "scratch"); - - // check for corrupt data - at this point we've already confirm the file is BGZF so not expecting a different block - if (status != GZ_SUCCESS) { - // dump to file - char dump_fn[strlen(txt_name)+100]; - snprintf (dump_fn, sizeof (dump_fn), "%s.vb-%u.bad-%s.bad-offset-0x%x", - txt_name, vb->vblock_i, codec_name (txt_file->codec), this_block_start); - - buf_dump_to_file (dump_fn, &vb->scratch, 1, false, false, true, false); + GzStatus status = (TXT_GZ_HEADER_HAS_BSIZE ? mgzip_read_block_with_bsize : mgzip_read_block_no_bsize) (txt_file, false, txt_file->effective_codec); - ABORT ("%s: Invalid %s block: block_comp_len=%u. 
Entire data of this vblock dumped to %s, bad block stats at offset 0x%x", - VB_NAME, codec_name (txt_file->codec), txt_file->gz_data.comp_len, dump_fn, this_block_start); - } + if (TXT_IS_MGSP && !txt_file->num_mgsp_blocks_in_vb) + break; // MGSP: we have reached the end of the VB and hence mgsp_is_valid_isize has set num_mgsp_blocks_in_vb to 0 + + uint32_t this_block_start = vb->comp_txt_data.len32; + buf_add_more (vb, &vb->comp_txt_data, txt_file->gz_data.data, txt_file->gz_data.comp_len, "comp_txt_data"); + + // check for corrupt data - at this point we've already confirm the file's codec so not expecting it to change + if (status != GZ_SUCCESS) + txtfile_dump_comp_txt_data (vb, this_block_start); // add block to list - including the EOF block (block_comp_len=BGZF_EOF_LEN block_uncomp_len=0) - if (txt_file->gz_data.comp_len/* note: if is 0 if truncated or EOF with no EOF block */) { - buf_alloc (vb, &vb->gz_blocks, 1, MAX_(1000, 1.2 * max_uncomp / max_block_size), GzBlockZip, 2, "gz_blocks"); + if (txt_file->gz_data.comp_len/* note: it is 0 if truncated or EOF with no EOF block */) { + buf_alloc (vb, &vb->gz_blocks, 1, MAX_(1000, txt_file->max_mgzip_isize ? 
1.2 * requested_bytes / txt_file->max_mgzip_isize : 0), GzBlockZip, 2, "gz_blocks"); + BNXT (GzBlockZip, vb->gz_blocks) = (GzBlockZip) { .txt_index = Ltxt, // after passed-down data and all previous blocks .compressed_index = this_block_start, .txt_size = txt_file->gz_data.uncomp_len, .comp_size = txt_file->gz_data.comp_len, - .is_decompressed = !txt_file->gz_data.uncomp_len, // EOF block is always considered decompressed - .is_eof = is_eof }; + .is_uncompressed = !txt_file->gz_data.uncomp_len, // and isize=0 block is always considered uncompressed + .is_eof = txt_file->no_more_blocks }; + + *is_data_read = true; + + if (flag.show_bgzf) { + GzBlockZip *bb = BLST (GzBlockZip, vb->gz_blocks); + iprintf ("READ %s thread=MAIN%s block_i=%"PRIu64" bb_i=%u comp_index=%u comp_len=%u txt_index=%u txt_len=%u eof=%s%s\n", + codec_name (txt_file->effective_codec), + cond_str (vb->vblock_i, " vb=", VB_NAME), + txt_file->gz_blocks_so_far, + BNUM (vb->gz_blocks, bb), bb->compressed_index, bb->comp_size, bb->txt_index, bb->txt_size, TF(bb->is_eof), + str_issame_(Bc(vb->comp_txt_data, this_block_start), txt_file->gz_data.comp_len, _S(BGZF_EOF)) ? " BGZF_EOF " : ""); + } + txt_file->gz_blocks_so_far++; // counts blocks in entire file (note: bb_i counts within VB) - // case EOF block: are not going to decompress the block, so account for it here + // case empty block (eg EOF block or EMVL start block): we are not going to decompress the block, so account for it here if (!txt_file->gz_data.uncomp_len) inc_disk_gz_uncomp_or_trunc (txt_file, txt_file->gz_data.comp_len); - } - - // case EOF - happens in 2 cases: 1. EOF block (block_comp_len=BGZF_EOF_LEN) or 2. 
no EOF block (block_comp_len=0) - if (!txt_file->gz_data.uncomp_len) { - txt_file->no_more_blocks = true; - if (flag.show_bgzf && txt_file->bgzf_flags.has_eof_block) - iprint0 ("IO vb=0 EOF\n"); - } + txt_file->max_mgzip_isize = MAX_(txt_file->max_mgzip_isize, txt_file->gz_data.uncomp_len); - else { - this_uncomp_len += txt_file->gz_data.uncomp_len; // total uncompressed length of data read by this function call - vb->scratch.uncomp_len += txt_file->gz_data.uncomp_len; // total uncompressed length of data in vb->compress - Ltxt += txt_file->gz_data.uncomp_len; // total length of txt_data after adding decompressed vb->scratch (may also include pass-down data) + this_uncomp_len += txt_file->gz_data.uncomp_len; // total uncompressed length of data read by this function call + vb->comp_txt_data.uncomp_len += txt_file->gz_data.uncomp_len; // total uncompressed length of data in vb->compress + Ltxt += txt_file->gz_data.uncomp_len; // total length of txt_data after adding decompressed vb->comp_txt_data (may also include pass-down data) - // we decompress one block a time in the loop so that the decompression is parallel with the disk reading into cache + // we uncompress one block a time in the loop so that the decompression is parallel with the disk read-ahead into cache if (uncompress) { START_TIMER; - bgz_uncompress_one_block (vb, BLST (GzBlockZip, vb->gz_blocks), txt_file->codec); - COPY_TIMER(bgz_uncompress_during_read); + mgzip_uncompress_one_block (vb, BLST (GzBlockZip, vb->gz_blocks), txt_file->effective_codec); + COPY_TIMER(mgzip_uncompress_during_read); } + + // remove the first MGZIP block from the gz_data + buf_remove (txt_file->gz_data, uint8_t, 0, txt_file->gz_data.comp_len); + txt_file->gz_data.comp_len = txt_file->gz_data.uncomp_len = 0; // note: these refer to the first block, gz_data.len might still be >0 + + // case: exactly one gz block per VB (except initial empty blocks) + if (TXT_IS_VB_SIZE_BY_BLOCK && vb->comp_txt_data.uncomp_len) + break; } + + // 
case: comp_len=0 but len>0 : no more data for this VB (end of group of blocks): data in file->gz_data is for the next VB + else if (txt_file->gz_data.len) + break; - buf_remove (txt_file->gz_data, uint8_t, 0, txt_file->gz_data.comp_len); - txt_file->gz_data.comp_len = txt_file->gz_data.uncomp_len = 0; + // case: no more data in the file + else { + // previous block is eof. note: we miss it if previous block was in the previous VB + if (vb->gz_blocks.len) + BLST (GzBlockZip, vb->gz_blocks)->is_eof = true; + + txt_file->no_more_blocks = true; // EOF without EOF block + } } if (uncompress) { - buf_free (vb->scratch); + buf_free (vb->comp_txt_data); libdeflate_free_decompressor ((struct libdeflate_decompressor **)&vb->gzip_compressor, __FUNCLINE); } - COPY_TIMER (txtfile_read_block_bgz); + COPY_TIMER (txtfile_read_block_mgzip); return this_uncomp_len; } -// performs a single I/O read operation - returns number of bytes read -// data is placed in vb->txt_data, except if its BGZF and uncompress=false - compressed data is placed in vb->scratch -static uint32_t txtfile_read_block (VBlockP vb, uint32_t max_bytes, - bool uncompress) // in BGZF/GZIL, whether to uncompress the data. ignored if not BGZF/GZIL -{ - START_TIMER; - - if (txt_file->no_more_blocks) return 0; // nothing more to read - - uint32_t uncomp_len=0; - - // BGZF note: we read *compressed* data into vb->scratch - that will be decompressed later. we read - // data with a *decompressed* size up to max_bytes. 
vb->scratch always contains only full BGZF blocks - - switch (txt_file->codec) { - case CODEC_NONE : uncomp_len = txtfile_read_block_plain (vb, max_bytes); break; - case CODEC_GZIL : - case CODEC_BGZF : uncomp_len = txtfile_read_block_bgz (vb, max_bytes, uncompress); break; - case CODEC_GZ : uncomp_len = txtfile_read_block_igzip (vb, max_bytes); break; - case CODEC_BZ2 : uncomp_len = txtfile_read_block_bz2 (vb, max_bytes); break; - - default: ABORT ("txtfile_read_block: Invalid file type %s (codec=%s)", ft_name (txt_file->type), codec_name (txt_file->codec)); - } - - COPY_TIMER_EVB (read); - return uncomp_len; -} - -// iterator on a buffer containing newline-terminated lines -// false means continue iterating, true means stop -char *txtfile_foreach_line (BufferP txt_header, - bool reverse, // iterate backwards - TxtIteratorCallback callback, - void *cb_param1, void *cb_param2, unsigned cb_param3, // passed as-is to callback - int64_t *line_len) // out -{ - if (line_len) *line_len = 0; - - if (!txt_header->len) return NULL; - - char *firstbuf = txt_header->data; - char *afterbuf = BAFTc (*txt_header); - - char *first = !reverse ? firstbuf : 0; - char *after = !reverse ? 
0 : afterbuf; - - while (1) { - - // get one line - searching forward or backwards - if (!reverse) { - for (after=first ; after < afterbuf && *after != '\n' ; after++); - after++; // skip newline - } - else { - for (first=after-2 /* skip final \n */; first >= firstbuf && *first != '\n'; first--); - first++; // after detected \n or at start of line - } - - if (!reverse && after > afterbuf) return NULL; // we don't call callback if after>afterbuf - beyond end of line - - if (callback (first, after - first, cb_param1, cb_param2, cb_param3)) { - if (line_len) *line_len = after - first; - return first; - } - - if (reverse && first == firstbuf) return NULL; // beginning of line - we called the cb - - if (!reverse) first=after; - else after=first; - } - - return 0; // never reaches here -} - // default callback from DataTypeProperties.is_header_done: // returns header length if header read is complete + sets lines.len, 0 if complete but not existant, -1 not complete yet int32_t def_is_header_done (bool is_eof) @@ -565,6 +771,8 @@ int32_t def_is_header_done (bool is_eof) return -1; // not end of header yet } +static uint32_t txtfile_read_block (VBlockP vb, uint32_t bytes_requested, bool uncompress, bool *is_data_read); + // ZIP main thread: reads txt header into evb->txt_data void txtfile_read_header (bool is_first_txt) { @@ -574,12 +782,13 @@ void txtfile_read_header (bool is_first_txt) int32_t header_len; uint32_t bytes_read=1 /* non-zero */; + bool is_data_read = true; // read data from the file until either 1. EOF is reached 2. end of txt header is reached #define HEADER_BLOCK (256 KB) // we have no idea how big the header will be... 
read this much at a time while ((header_len = (DT_FUNC (txt_file, is_header_done)(bytes_read==0))) < 0) { // we might have data here from txtfile_test_data - if (!bytes_read) { + if (!is_data_read) { if (flags_pipe_in_process_died()) // only works for Linux ABORTINP ("Pipe-in process %s (pid=%u) died before the %s header was fully read; only %"PRIu64" bytes were read", flags_pipe_in_process_name(), flags_pipe_in_pid(), dt_name(txt_file->data_type), evb->txt_data.len); @@ -594,7 +803,7 @@ void txtfile_read_header (bool is_first_txt) buf_alloc (evb, &evb->txt_data, HEADER_BLOCK, 0, char, 2, "txt_data"); if (header_len != HEADER_DATA_TYPE_CHANGED) // note: if HEADER_DATA_TYPE_CHANGED - no need to read more data - we just process the same data again, with a different data type - bytes_read = txtfile_read_block (evb, HEADER_BLOCK, true); + bytes_read = txtfile_read_block (evb, HEADER_BLOCK, true, &is_data_read); } // the excess data is for the next vb to read @@ -602,7 +811,7 @@ void txtfile_read_header (bool is_first_txt) buf_copy (evb, &txt_file->unconsumed_txt, &evb->txt_data, char, header_len, 0, "txt_file->unconsumed_txt"); evb->txt_data.len = header_len; // trim to uncompressed length of txt header - txt_file->header_size_bgzf = bgz_copy_unconsumed_blocks (evb); // copy unconsumed or partially consumed gz_blocks to txt_file->unconsumed_bgz_blocks + txt_file->header_size_bgzf = mgzip_copy_unconsumed_blocks (evb); // copy unconsumed or partially consumed gz_blocks to txt_file->unconsumed_mgzip_blocks } txt_file->txt_data_so_far_single = txt_file->header_size = header_len; @@ -613,94 +822,147 @@ void txtfile_read_header (bool is_first_txt) } // default "unconsumed" function file formats where we need to read whole \n-ending lines. 
returns the unconsumed data length -int32_t def_unconsumed (VBlockP vb, uint32_t first_i, int32_t *i) +int32_t def_unconsumed (VBlockP vb, uint32_t first_i) { - ASSERT (*i >= 0 && *i < Ltxt, "*i=%d ∉ [0,%u]", *i, Ltxt); + ASSERTNOTZERO (Ltxt); - int32_t j; for (j=*i; j >= (int32_t)first_i; j--) // use j - automatic var - for speed - if (*Btxt (j) == '\n') { - *i = j; + int32_t j; for (j=Ltxt-1; j >= (int32_t)first_i; j--) // use j - automatic var - for speed + if (*Btxt (j) == '\n') return Ltxt -1 - j; - } - *i = j; return -1; // cannot find \n in the data starting first_i } -static uint32_t txtfile_get_unconsumed_to_pass_to_next_vb (VBlockP vb) +static void txt_file_truncate_final_bytes (VBlockP vb, int32_t *n_bytes) { - START_TIMER; + bool is_R2_missing_R1 = (*n_bytes == -2); - int32_t pass_to_next_vb_len; + // case: VB consists of a single truncated line, or it is a R2 VB without an R1 counterpart + if (*n_bytes < 0) *n_bytes = Ltxt; // entire VB - // case: the data is BGZF-compressed in vb->scratch, except for passed down data from prev VB - // uncompress one block at a time to see if its sufficient. usually, one block is enough - if ((TXT_IS_BGZF || TXT_IS_GZIL) && vb->scratch.len) { + if (!is_R2_missing_R1) + WARN ("FYI: file is truncated - its final %s in incomplete. Dropping this partial final %s of %u bytes.", + DTPT(line_name), DTPT(line_name), *n_bytes); + else + WARN ("FYI: this is an R2 file that is longer than its R1 counterpart: dropping %u bytes. (vb=%s)", *n_bytes, VB_NAME); + + txt_file->last_truncated_line_len = *n_bytes; + Ltxt -= *n_bytes; + *n_bytes = 0; // truncate last partial line + segconf.zip_txt_modified = true; +} +static bool txtfile_get_unconsumed_to_pass_to_next_vb (VBlockP vb, bool *R2_vb_truncated_away) +{ + START_TIMER; + + int32_t final_unconsumed_len = 0; + + // case: the data is multiblock-gzip-compressed in vb->comp_txt_data, except for passed down data from prev VB + // uncompress one block at a time to see if its sufficient. 
usually, one block is enough + if (TXT_IS_MGZIP && vb->comp_txt_data.len) { vb->gzip_compressor = libdeflate_alloc_decompressor (vb, __FUNCLINE); - for (int block_i=vb->gz_blocks.len32 - 1; block_i >= 0; block_i--) { - GzBlockZip *bb = B(GzBlockZip, vb->gz_blocks, block_i); - + for_buf_back (GzBlockZip, bb, vb->gz_blocks) { START_TIMER; - bgz_uncompress_one_block (vb, bb, txt_file->codec); - COPY_TIMER(bgz_uncompress_during_read); + mgzip_uncompress_one_block (vb, bb, txt_file->effective_codec); + COPY_TIMER(mgzip_uncompress_during_read); - // case: we dropped the bb: happens only for the final block in GZIL is truncated, and it was not detected earlier in gzil_read_block. - if (!bb->is_decompressed) { + // case: we dropped the bb: happens only for the final block in IL1M is truncated, and it was not detected earlier in il1m_is_valid_isize. + if (!bb->is_uncompressed) { vb->gz_blocks.len32--; Ltxt -= bb->txt_size; segconf.zip_txt_modified = true; - WARN ("FYI: %s is truncated - its final GZIL block in incomplete. Dropping final %u bytes of the GZ data.", txt_name, bb->comp_size); + WARN ("FYI: %s is truncated - its final %s block in incomplete. 
Dropping final %u bytes of the GZ data.", + txt_name, codec_name (txt_file->effective_codec), bb->comp_size); } else { START_TIMER; - int32_t last_i = Ltxt-1; // test from end of data - pass_to_next_vb_len = (DT_FUNC(txt_file, unconsumed)(vb, MAX_(bb->txt_index, 0), &last_i)); // note: bb->txt_index might be negative if part of this bb was consumed by the previous VB + final_unconsumed_len = (DT_FUNC(txt_file, unconsumed)(vb, MAX_(bb->txt_index, 0))); // note: bb->txt_index might be negative if part of this bb was consumed by the previous VB COPY_TIMER (txtfile_get_unconsumed_callback); - if (pass_to_next_vb_len >= 0) goto done; // we have the answer (callback returns -1 if it needs more data) + if (final_unconsumed_len >= 0) { + if (final_unconsumed_len && flag.truncate && txt_file->no_more_blocks) + txt_file_truncate_final_bytes (vb, &final_unconsumed_len); + + goto done; // we have the answer (callback returns -1 if it needs more data) + } } } } - // test remaining txt_data including passed-down data from previous VB + // case: full line not detected in all gz-compressed data: test remaining txt_data including passed-down data from previous VB + // case: codec is not BGZIP (i.e. it is NONE, GZ or BZ2) and hence already fully uncompressed { START_TIMER; - int32_t last_i = Ltxt-1; // test from end of data - pass_to_next_vb_len = (DT_FUNC(vb, unconsumed)(vb, 0, &last_i)); - COPY_TIMER (txtfile_get_unconsumed_callback); - } - - // case: callback doesn't have enough data for even one line, but file has no more data - if (flag.truncate && pass_to_next_vb_len < 0 && !segconf.running) { - WARN ("FYI: %s is truncated - its final %s in incomplete. 
Dropping this partial final %s of %u bytes.", - txt_name, DTPT(line_name), DTPT(line_name), Ltxt); - txt_file->last_truncated_line_len = Ltxt; - Ltxt = pass_to_next_vb_len = 0; // truncate last partial line - segconf.zip_txt_modified = true; - } - - ASSERT (pass_to_next_vb_len >= 0 || - segconf.running, // case: we're testing memory and this VB is too small for a single line - return and caller will try again with a larger VB - "Reason: failed to find a full line %sin vb=%s data_type=%s txt_data.len=%u txt_file->codec=%s is_last_vb_in_txt_file=%s interleaved=%s.\n" - "Known possible causes:\n" - "- The file is %s %s. Tip: try running with --truncate\n" - "- The file is not a %s file.\n" - "VB dumped: %s\n", - DTPT(is_binary) ? "" : "(i.e. newline-terminated) ", - VB_NAME, dt_name (txt_file->data_type), Ltxt, codec_name (txt_file->codec), TF(vb->is_last_vb_in_txt_file), TF(segconf.is_interleaved), - DTPT(is_binary) ? "truncated but not on the boundary of the" : "missing a newline on the last", DTPT(line_name), - TXT_DT(REF) ? 
"FASTA" : dt_name (txt_file->data_type), - txtfile_dump_vb (vb, txt_name).s); + final_unconsumed_len = (DT_FUNC(vb, unconsumed)(vb, 0)); + COPY_TIMER (txtfile_get_unconsumed_callback) + } + + // case: truncate entire VB (requested by fastq_unconsumed in case this R2 VB doesn't have an R1 counterpart and we are allowed to truncate) + if (final_unconsumed_len == -2) { + txt_file_truncate_final_bytes (vb, &final_unconsumed_len); + *R2_vb_truncated_away = true; + } + + // case: there is not even one full line of text + else if (final_unconsumed_len == -1) { + + // case: segconf - segconf will try again, increasing the vb_size + if (segconf.running) {} + + // case: R2 doesn't have enough data to sync with R1 (confirmed by counting lines) get more data + else if (IS_R2) { + uint32_t r2_num_lines = str_count_char (B1STtxt, Ltxt, '\n') / 4; + + // error if we did in fact read the same amount of lines as R1, uncompress everything, and still didn't find the QNAME + ASSINP (r2_num_lines < fastq_get_R1_num_lines (vb), + NO_PAIR_FMT_PREFIX "read name \"%s\" is missing in %s. codec=%s vb_i=%u R2_num_lines=%u R1_vb_i=%u R1_num_lines=%u)%s", + txt_name, fastq_get_R1_last_qname(vb), txt_name, codec_name (txt_file->effective_codec), + vb->vblock_i, (uint32_t)r2_num_lines, fastq_get_R1_vb_i (vb), fastq_get_R1_num_lines (vb), NO_PAIR_FMT_SUFFIX); + + // error if there isn't enough data in the file + ASSINP (!txt_file->no_more_blocks, + NO_PAIR_FMT_PREFIX "read name \"%s\" is missing in %s, because the file appears shorter than its R1. 
codec=%s vb_i=%u R2_num_lines=%u R1_vb_i=%u R1_num_lines=%u)%s", + txt_name, fastq_get_R1_last_qname(vb), txt_name, codec_name (txt_file->effective_codec), + vb->vblock_i, (uint32_t)r2_num_lines, fastq_get_R1_vb_i (vb), fastq_get_R1_num_lines (vb), NO_PAIR_FMT_SUFFIX); + } + + // case: file is truncated + else if (flag.truncate) + txt_file_truncate_final_bytes (vb, &final_unconsumed_len); + + // case: file is truncated, but user didn't specify --truncate + else + ABORT ("Reason: failed to find a full line %sin vb=%s data_type=%s txt_data.len=%u txt_file->effective_codec=%s is_last_vb_in_txt_file=%s interleaved=%s.\n" + "Known possible causes:\n" + "- The file is %s %s. Tip: try running with --truncate\n" + "- The file is not a %s file.\n" + "VB dumped: %s\n", + DTPT(is_binary) ? "" : "(i.e. newline-terminated) ", + VB_NAME, dt_name (txt_file->data_type), Ltxt, codec_name (txt_file->effective_codec), TF(vb->is_last_vb_in_txt_file), TF(segconf.is_interleaved), + DTPT(is_binary) ? "truncated but not on the boundary of the" : "missing a newline on the last", DTPT(line_name), + TXT_DT(REF) ? 
"FASTA" : dt_name (txt_file->data_type), + txtfile_dump_vb (vb, txt_name).s); + } done: if (vb->gzip_compressor) libdeflate_free_decompressor ((struct libdeflate_decompressor **)&vb->gzip_compressor, __FUNCLINE); - return (uint32_t)pass_to_next_vb_len; + // pass any unconsumed data at the end of txt_data to the next vb + if (final_unconsumed_len > 0) { + if (final_unconsumed_len < 0) final_unconsumed_len = Ltxt; // entire VB is unconsumed + + ASSERT (final_unconsumed_len <= Ltxt, "expecting final_unconsumed_len=%d <= Ltxt=%u", final_unconsumed_len, Ltxt); + + buf_insert (evb, txt_file->unconsumed_txt, char, 0, Btxt (Ltxt - final_unconsumed_len), final_unconsumed_len, "txt_file->unconsumed_txt"); + Ltxt -= final_unconsumed_len; + } + + return final_unconsumed_len >= 0; // false means more data is needed } static bool seggable_size_is_modifiable (void) @@ -709,7 +971,7 @@ static bool seggable_size_is_modifiable (void) } // estimate the size of the txt_data of the file - i.e. the uncompressed data excluding the header - // based on the observed or assumed compression ratio of the source compression so far -static void txtfile_set_seggable_size (void) +void txtfile_set_seggable_size (void) { uint64_t disk_size = txt_file->disk_size ? txt_file->disk_size : flag.stdin_size ? flag.stdin_size // user-provided size @@ -717,16 +979,14 @@ static void txtfile_set_seggable_size (void) double source_comp_ratio=1; if (!is_read_via_ext_decompressor (txt_file)) { - if (txt_file->is_remote || txt_file->redirected) - source_comp_ratio = 4; - - else if (TXT_IS_PLAIN) + if (TXT_IS_PLAIN) source_comp_ratio = 1; else { double plain_len = txt_file->txt_data_so_far_single + txt_file->unconsumed_txt.len; // all data that has been decompressed - double comp_len = TXT_IS_BZ2 ? file_tell (txt_file, HARD_FAIL) - : txt_file->disk_so_far - txt_file->gz_data.len; // data read from disk, excluding data still awaiting decompression + double comp_len = TXT_IS_BZ2 ? 
BZ2_consumed ((BZFILE *)txt_file->file) + : txt_file->discover_during_segconf ? segconf.gz_comp_size + : txt_file->disk_so_far - txt_file->gz_data.len; // data read from disk, excluding data still awaiting decompression // case: header is whole BGZF blocks - remove header from calculation to get a better estimate of the seggable compression ratio if (txt_file->header_size_bgzf) { @@ -734,18 +994,18 @@ static void txtfile_set_seggable_size (void) comp_len -= txt_file->header_size_bgzf; } - source_comp_ratio = plain_len / MAX_(comp_len, 1); + source_comp_ratio = plain_len / MAX_(comp_len, 1.0); } } // external decompressors - else switch (txt_file->source_codec) { + else switch (txt_file->src_codec) { case CODEC_BCF: source_comp_ratio = 10; break; // note: .bcf files might be compressed or uncompressed - we have no way of knowing as "bcftools view" always serves them to us in plain VCF format. These ratios are assuming the bcf is compressed as it normally is. case CODEC_XZ: source_comp_ratio = 15; break; case CODEC_CRAM: source_comp_ratio = 25; break; case CODEC_ORA: source_comp_ratio = 25; break; case CODEC_ZIP: source_comp_ratio = 3; break; - default: ABORT ("unspecified txt_file->codec=%s (%u)", codec_name (txt_file->codec), txt_file->codec); + default: ABORT ("unspecified txt_file->src_codec=%s (%u)", codec_name (txt_file->src_codec), txt_file->src_codec); } txt_file->est_seggable_size = MAX_(0.0, (double)disk_size * source_comp_ratio - (double)txt_file->header_size); @@ -761,10 +1021,46 @@ int64_t txtfile_get_seggable_size (void) return txt_file->est_seggable_size; } -static uint32_t txt_data_alloc_size (uint32_t vb_size) +uint32_t txt_data_alloc_size (uint32_t vb_size) { - return vb_size + - TXTFILE_READ_VB_PADDING; // we need this quantity of unused bytes at the end of vb.txt_data + return TXT_IS_MGSP ? MAX_(24, txt_file->max_mgsp_blocks_in_vb) * (txt_file->max_mgzip_isize + 1/*1 + for last gz block extra length*/) + TXTFILE_READ_VB_PADDING + : vb_size ? 
vb_size + txt_file->max_mgzip_isize + TXTFILE_READ_VB_PADDING + : 0; +} + +// performs a single I/O read operation - returns number of bytes read +// data is placed in vb->txt_data, except if its BGZF and uncompress=false - compressed data is placed in vb->comp_txt_data +static uint32_t txtfile_read_block (VBlockP vb, uint32_t bytes_requested, + bool uncompress, // MGZIP codecs: whether to uncompress the data. ignored if not MGZIP + bool *is_data_read) // out: true if read any data, including an isize=0 gz block +{ + START_TIMER; + + if (txt_file->no_more_blocks) return 0; // nothing more to read + + uint32_t uncomp_len = 0; + + if (IS_MGZIP(txt_file->effective_codec)) + uncomp_len = txtfile_read_block_mgzip (vb, bytes_requested, uncompress, is_data_read); // note: will possibly read more bytes than requested if last mgzip block goes over + + else if (IS_GZ(txt_file->effective_codec)) + uncomp_len = txtfile_read_block_igzip (vb, bytes_requested, is_data_read); + + else if (IS_NONE(txt_file->effective_codec)) { + uncomp_len = txtfile_read_block_plain (vb, bytes_requested); + *is_data_read = !!uncomp_len; + } + + else if (IS_BZ2(txt_file->effective_codec)) { + uncomp_len = txtfile_read_block_bz2 (vb, bytes_requested); + *is_data_read = !!uncomp_len; + } + + else + ABORT ("unsupported codec %s", codec_name (txt_file->effective_codec)); + + COPY_TIMER_EVB (read); + return uncomp_len; } // ZIP main thread @@ -774,17 +1070,24 @@ void txtfile_read_vblock (VBlockP vb) ASSERTNOTNULL (txt_file); ASSERT_DT_FUNC (txt_file, unconsumed); - ASSERT (IN_RANGE (segconf.vb_size, ABSOLUTE_MIN_VBLOCK_MEMORY, ABSOLUTE_MAX_VBLOCK_MEMORY) || segconf.running, + ASSERT (IN_RANGX (segconf.vb_size, ABSOLUTE_MIN_VBLOCK_MEMORY, ABSOLUTE_MAX_VBLOCK_MEMORY) || segconf.running, "Invalid vb_size=%"PRIu64" comp_i(0-based)=%u", segconf.vb_size, z_file->num_txts_so_far-1); if (txt_file->no_more_blocks && !txt_file->unconsumed_txt.len) return; // we're done - - uint32_t my_vb_size = segconf.vb_size; // 
might grow to match a FASTQ R2 vb to its R1 pair - buf_alloc (vb, &vb->txt_data, 0, txt_data_alloc_size (my_vb_size), char, 1, "txt_data"); + bool is_mgzip = TXT_IS_MGZIP; + + bool always_uncompress = flag.zip_uncompress_source_during_read || // segconf tells us to uncompress the data + segconf.running || // segconf doesn't have a compute thread, and doesn't attempt to uncompress txt_data + !is_mgzip; // GZ, BZ2 and NONE always return uncompressed data anyway (note: segconf is always one of these too) + + vb->comp_i = flag.zip_comp_i; // needed for VB_NAME + + // Note: VB might grow 1. if 0 (for large variable length MGZIP blocks) and 2. to match a FASTQ R2 vb to its R1 pair + uint32_t my_vb_size = IS_R2 ? MAX_(fastq_get_R1_txt_data_len (vb), segconf.vb_size) : segconf.vb_size; // note: if no correspoding VB we go ahead and try to read data anyway, to make sure there is none + ASSERTNOTZERO (my_vb_size); - // read data from the file until either 1. EOF is reached 2. end of vb is reached - uint32_t pass_to_next_vb_len = 0; + buf_alloc (vb, &vb->txt_data, 0, txt_data_alloc_size (my_vb_size), char, 1.05, "txt_data"); // start with using the data passed down from the previous VB (note: copy & free and not move! so we can reuse txt_data next vb) if (txt_file->unconsumed_txt.len) { @@ -793,81 +1096,97 @@ void txtfile_read_vblock (VBlockP vb) buf_remove (txt_file->unconsumed_txt, char, 0, bytes_moved); } - bool is_bgz = TXT_IS_BGZF || TXT_IS_GZIL; - - if (is_bgz) bgz_zip_init_vb (vb); + if (is_mgzip) mgzip_zip_init_vb (vb); - vb->comp_i = flag.zip_comp_i; // needed for VB_NAME + // max_block_size exists for fixed-block-size codecs: VB data read will be <= this size + uint32_t max_block_size = mgzip_get_max_block_size(); - bool always_uncompress = flag.zip_uncompress_source_during_read || segconf.running; + ASSERT (my_vb_size >= max_block_size, "vblock=%s < max_block_size=%u bytes, in codec=%s. 
This is not supported.%s", + str_size(my_vb_size).s, max_block_size, codec_name(txt_file->effective_codec), + segconf.running ? "" : " Use --no-bgzf to switch codec or use --vblock set specificy a larger size"); - // case: compute thread should decompress - if (!always_uncompress && (TXT_IS_BGZF || TXT_IS_GZIL)) - vb->txt_codec = txt_file->codec; + while (1) { + uint32_t bytes_requested = IS_R2 ? ((double)my_vb_size * 1.03 + (max_block_size - 1)) // add 3% vs R1 (VB might be slightly bigger if reads on average are a bit longer) and round up to the next full block + : MIN_(my_vb_size, 1 GB /* read() can't handle more */); + bytes_requested -= MIN_(Ltxt, bytes_requested); // reduce data read, if we already have some from unconsumed_txt or previous iterations - uint32_t max_block_size = TXT_IS_BGZF ? BGZF_MAX_BLOCK_SIZE : GZIL_MAX_BLOCK_SIZE; + buf_alloc (vb, &vb->txt_data, bytes_requested, 0, char, 1.05, NULL); - for (bool first=true; ; first=false) { - uint32_t bytes_requested = MIN_(my_vb_size - Ltxt, 1 GB /* read() can't handle more */); - bool no_read_expected = is_bgz && (bytes_requested <= max_block_size); // in this case, txtfile_read_block is expected to return 0 + bool is_data_read = false; - uint32_t len = (my_vb_size > Ltxt) ? txtfile_read_block (vb, bytes_requested, always_uncompress) : 0; + if (bytes_requested) + txtfile_read_block (vb, bytes_requested, always_uncompress, &is_data_read); - if (!len && first && !Ltxt) goto done; // case: no data read nor pass up from prev vb (and hence also no data to pass down to next vb) - - // when reading BGZF, we might be filled up even without completely filling my_vb_size - // if there is room left for only a partial BGZF block (we can't read partial blocks) - uint32_t filled_up = my_vb_size - (is_bgz ? (max_block_size - 1) : 0); - - if (len && Ltxt < filled_up) continue; // continue filling up txt_data... 
- - // case: this is the 2nd file of a fastq pair - make sure it has at least as many fastq "lines" as the first file - uint32_t my_lines, pair_num_lines, pair_txt_data_len; - VBIType pair_vb_i; - if (flag.pair == PAIR_R2 && // we are reading the second file of a fastq file pair (with --pair) - !fastq_txtfile_have_enough_lines (vb, &pass_to_next_vb_len, &my_lines, &pair_vb_i, &pair_num_lines, &pair_txt_data_len)) { // we don't yet have all the data we need - - // note: the opposite case where R2 has more reads than R1 is caught in fastq_txtfile_have_enough_lines or zip_prepare_one_vb_for_dispatching - ASSINP ((len || no_read_expected) && Ltxt, "Error: File %s has less FASTQ reads than its R1 mate (vb=%s has %u lines while its pair_vb_i=%d num_R1_VBs=%u has pair_txt_data_len=%u pair_num_lines=%u; vb=%s Ltxt=%u bytes_requested=%u bytes_read=%u no_more_blocks=%s my_vb_size=%u vb_size=%s src_codec=%s disk_so_far=%"PRIu64").%s", - txt_name, VB_NAME, my_lines, pair_vb_i, sections_get_num_vbs (FQ_COMP_R1), pair_txt_data_len/*only set if flag.debug*/, pair_num_lines, VB_NAME, Ltxt, bytes_requested, len, TF(txt_file->no_more_blocks), my_vb_size, str_size (segconf.vb_size).s, txtfile_codec_name (z_file, vb->comp_i).s, txt_file->disk_so_far, - (flag.truncate && (TXT_IS_BGZF || TXT_IS_GZIL || z_file->comp_codec[0] == CODEC_BGZF || z_file->comp_codec[0] == CODEC_GZIL)) ? " Tip: this might due to --truncate. Try adding --no-bgzf" : ""); - + // with an MGZIP codec, we might be filled up even without completely filling my_vb_size + // if there is room left for only a partial MGZIP block (we can't read partial blocks) + uint32_t filled_up = my_vb_size - (is_mgzip ? 
(max_block_size - 1) : 0); - // if we need more lines - increase memory and keep on reading - my_vb_size *= 1.1 * ((double)pair_num_lines / (double)my_lines); - - buf_alloc (vb, &vb->txt_data, 0, txt_data_alloc_size (my_vb_size), char, 1, "txt_data"); - } - else + // case: one VB per one block (or group of blocks) + if (TXT_IS_VB_SIZE_BY_MGZIP) break; - } - if (always_uncompress) buf_free (vb->scratch); // tested by txtfile_get_unconsumed_to_pass_to_next_vb + // check if we're done: if can't read any more data, or if the VB appears full + // note: is_data_read can be true even if len=0: when reading an isize=0 gz block + else if (is_data_read && Ltxt < filled_up) + continue; // continue reading - // callback to decide what part of txt_data to pass up to the next VB (usually partial lines, but sometimes more) - // note: even if we haven't read any new data (everything was passed down), we still might data to pass up - eg - // in FASTA with make-reference if we have a lots of small contigs, each VB will take one contig and pass up the remaining - if (!pass_to_next_vb_len && Ltxt) { - pass_to_next_vb_len = txtfile_get_unconsumed_to_pass_to_next_vb (vb); + // case no more data read for R2 (for a subsequent read after previous txtfile_get_unconsumed_to_pass_to_next_vb couldn't find a matching QNAME) + else if (!is_data_read && !Ltxt && IS_R2 && fastq_get_R1_vb_i(vb)) + ABORTINP ("--pair: file %s has less reads than its R1 counterpart. 
For example, read \"%s\" is missing for vb=%s R1_vb_1=%u", + txt_name, fastq_get_R1_last_qname (vb), VB_NAME, fastq_get_R1_vb_i (vb)); - // case: return if we're testing memory, and there is not even one line of text - if (segconf.running && pass_to_next_vb_len == (uint32_t)-1) { - buf_copy (evb, &txt_file->unconsumed_txt, &vb->txt_data, char, 0, 0, "txt_file->unconsumed_txt"); - buf_free (vb->txt_data); - goto done; + // case: no data read nor pass up from prev vb (and hence also no data to pass down to next vb) + else if (!is_data_read && !Ltxt) + goto done; // cancel this VB - we have no more data + + // callback to decide what part of txt_data to pass up to the next VB (usually partial lines, but sometimes more) + // note: even if we haven't read any new data (everything was passed down), we still might data to pass up - eg + // in FASTA with make-reference if we have a lots of small contigs, each VB will take one contig and pass up the remaining + if (!vb->gz_blocks.len || + !txt_file->no_more_blocks || // usually, there's no need to check at the end of the file as there is no next VB + flag.truncate || // user informs us that there might be a partial line to be truncates, so proceed even if EOF + IS_R2) { // In R2 we need to search for matching R1 QNAME + + bool R2_vb_truncated_away=false; + + // case where we don't need to check for unconusmed and/or sync with R1 + if (TXT_IS_IN_SYNC && !IS_R2 && !flag.truncate && txt_file->effective_codec == z_file->comp_eff_codec[flag.zip_comp_i-1]) + break; + + // case: we found the unconsumed text and (if needed) synced our R2 vb to its R1 counterpart. all good. 
+ else if (txtfile_get_unconsumed_to_pass_to_next_vb (vb, &R2_vb_truncated_away)) { + // case: this is R2 with --truncate, this VB consisted of data that entirely goes beyond R1 - entire VB was eliminated + if (R2_vb_truncated_away && !txt_file->no_more_blocks) + continue; // continue to read more data and elimitate it + else + break; + } + + // case 1: very long single line in segconf + // case 2: cannot find matching QNAME in R2 + else { + my_vb_size *= 1.25; + ASSERT (my_vb_size <= ABSOLUTE_MAX_VBLOCK_MEMORY, "%s: VBlock too big, > %s, when trying to grow vb", VB_NAME, str_size(ABSOLUTE_MAX_VBLOCK_MEMORY).s); + continue; + } } - } - if (pass_to_next_vb_len) { - // note: we might some unconsumed data, pass it up to the next vb. possibly we still have unconsumed data (can happen if DVCF reject - // data was passed down from the txt header, greater than my_vb_size) - buf_insert (evb, txt_file->unconsumed_txt, char, 0, Btxt (Ltxt - pass_to_next_vb_len), pass_to_next_vb_len, "txt_file->unconsumed_txt"); - Ltxt -= pass_to_next_vb_len; + // case: we read the entire file, and this VB has some data + else if (!is_data_read && Ltxt) + break; + }; - // copy unconsumed or partially consumed gz_blocks to txt_file->unconsumed_bgz_blocks - if (is_bgz) - bgz_copy_unconsumed_blocks (vb); - } + // case: compute thread should uncompress + if (is_mgzip && !always_uncompress) + vb->txt_codec = txt_file->effective_codec; + + // copy unconsumed or partially consumed gz_blocks to txt_file->unconsumed_mgzip_blocks. + // note: this might happen even if final_unconsumed_len=0: in case we inherited gz_blocks from the + // previous VB (passed to us in mgzip_zip_init_vb), and we didn't use all of them because we took only + // part of the txt_file->unconsumed_txt. This can happen, for example, if segconf + // generated lots of unconsumed_txt and unconsumed_mgzip_blocks, but segconf.vb_size is smaller for vb>1. 
+ if (is_mgzip) + mgzip_copy_unconsumed_blocks (vb); vb->vb_position_txt_file = txt_file->txt_data_so_far_single; vb->is_last_vb_in_txt_file = txt_file->no_more_blocks && !txt_file->unconsumed_txt.len; @@ -879,6 +1198,10 @@ void txtfile_read_vblock (VBlockP vb) txtfile_set_seggable_size(); if (!segconf.running) { + // case: R1, and codec might or might not have mgzip blocks in sync with R2: store the info R2 will need for deciding this + if (IS_R1 && Ltxt) + buf_append_one (z_file->R1_txt_data_lens, vb->txt_data.len32); + biopsy_take (vb); dispatcher_increment_progress ("read", txt_file->est_num_lines ? (Ltxt / MAX_(segconf.line_len,1)) : Ltxt); } @@ -888,6 +1211,9 @@ void txtfile_read_vblock (VBlockP vb) iprintf ("VB_READ(id=%d) vb=%s Ltxt=%u vb_position_txt_file=%"PRIu64" unconsumed_txt.len=%u is_last_vb_in_txt_file=%s\n", vb->id, VB_NAME, Ltxt, vb->vb_position_txt_file, txt_file->unconsumed_txt.len32, TF(vb->is_last_vb_in_txt_file)); + if (always_uncompress) + buf_free (vb->comp_txt_data); + COPY_TIMER (txtfile_read_vblock); } @@ -902,31 +1228,28 @@ DataType txtfile_zip_get_file_dt (rom filename) } // outputs details on txt_file->codec of a component, as stored in z_file -StrText txtfile_codec_name (FileP z_file/*obscures global*/, CompIType comp_i) +StrTextLong txtfile_codec_name (FileP z_file/*obscures global*/, CompIType comp_i, bool obscure_fname) { - StrText s; + StrTextLong s; - if (!IN_RANGE (comp_i, 0, MAX_NUM_COMPS-1)) + if (!IN_RANGE (comp_i, 0, MAX_NUM_COMPS)) snprintf (s.s, sizeof (s.s), "comp_i=%u out_of_range", comp_i); - else if (z_file->comp_codec[comp_i] == CODEC_BGZF) { + else if (z_file->comp_src_codec[comp_i] == CODEC_BGZF) { if (z_file->comp_bgzf[comp_i].level < BGZF_COMP_LEVEL_UNKNOWN) snprintf (s.s, sizeof (s.s), "BGZF(%s[%d])", bgzf_library_name (z_file->comp_bgzf[comp_i].library, false), z_file->comp_bgzf[comp_i].level); else strcpy (s.s, "BGZF(unknown_lib)"); } - else if (z_file->comp_codec[comp_i]==CODEC_GZ) { - bool fextra = 
((z_file->gz_header[comp_i][3] & 4) == 4); // FEXTRA is bit 2 of FLG - - snprintf (s.s, sizeof (s), "GZ(%.24s%.20s%.20s)", - str_to_hex (z_file->gz_header[comp_i], fextra ? 12 : 10).s, + else if (z_file->comp_src_codec[comp_i]==CODEC_GZ) + snprintf (s.s, sizeof (s), "GZ(%.800s%.12s%.12s)", + display_gz_header (z_file->comp_gz_header[comp_i], GZ_HEADER_LEN, obscure_fname).s, cond_str (z_file->gz_isize[comp_i][0], "-", str_size (z_file->gz_isize[comp_i][0]).s), cond_str (z_file->gz_isize[comp_i][1], "-", str_size (z_file->gz_isize[comp_i][1]).s)); - } else - strcpy (s.s, codec_name (z_file->comp_codec[comp_i])); + strcpy (s.s, codec_name (z_file->comp_src_codec[comp_i])); return s; } \ No newline at end of file diff --git a/src/txtfile.h b/src/txtfile.h index 58f560f6..8ba92e2c 100644 --- a/src/txtfile.h +++ b/src/txtfile.h @@ -11,28 +11,28 @@ #include "genozip.h" #include "digest.h" -extern uint32_t txtfile_fread (FileP file, FILE *fp, void *addr, uint32_t size, int64_t *disk_so_far); -extern void txtfile_fwrite (const void *data, uint32_t size); +#define TXTFILE_READ_VB_PADDING 16 // we need this quantity of unused bytes at the end of vb.txt_data +extern uint32_t txtfile_fread (FileP file, FILE *fp, void *addr, int32_t size, int64_t *disk_so_far); +extern void txtfile_fwrite (const void *data, uint32_t size); +extern void txtfile_initialize_igzip (FileP file); extern StrTextLong txtfile_dump_vb (VBlockP vb, rom base_name); -extern StrText txtfile_codec_name (FileP z_file, CompIType comp_i); +extern StrTextLong txtfile_codec_name (FileP z_file, CompIType comp_i, bool obscure_fname); extern void txtfile_zip_finalize_codecs (void); - extern void txtfile_read_header (bool is_first_txt); - -extern uint64_t txtfile_max_memory_per_vb (void); +extern uint32_t txt_data_alloc_size (uint32_t vb_size) ; extern void txtfile_read_vblock (VBlockP vb); +extern void txtfile_set_seggable_size (void); extern int64_t txtfile_get_seggable_size (void); - -typedef bool 
(*TxtIteratorCallback)(rom line, unsigned line_len, void *cb_param1, void *cb_param2, unsigned cb_param3); -extern char *txtfile_foreach_line (BufferP txt_header, bool reverse, TxtIteratorCallback callback, void *cb_param1, void *cb_param2, unsigned cb_param3, int64_t *line_len); - -// igzip -extern void txtfile_discover_gz_codec (FileP file); +extern bool txtfile_is_gzip (FileP file); +extern void txtfile_discover_specific_gz (FileP file); extern rom isal_error (int ret); // callbacks -extern int32_t def_unconsumed (VBlockP vb, uint32_t first_i, int32_t *i); +extern int32_t def_unconsumed (VBlockP vb, uint32_t first_i); extern int32_t def_is_header_done (bool is_eof); extern DataType txtfile_zip_get_file_dt (rom filename); + +// misc +extern StrTextLong display_gz_header (STR8p(h), bool obscure_fname); \ No newline at end of file diff --git a/src/txtheader.c b/src/txtheader.c index 3ae85a9d..d3b8b9a6 100644 --- a/src/txtheader.c +++ b/src/txtheader.c @@ -10,7 +10,7 @@ #include "filename.h" #include "txtfile.h" #include "zfile.h" -#include "bgzf.h" +#include "mgzip.h" #include "writer.h" #include "contigs.h" #include "piz.h" @@ -74,7 +74,7 @@ void txtheader_compress (BufferP txt_header, .magic = BGEN32 (GENOZIP_MAGIC), .section_type = SEC_TXT_HEADER, .codec = (codec == CODEC_UNKNOWN) ? CODEC_NONE : codec, - .src_codec = txt_file->source_codec, + .src_codec = txt_file->src_codec, .digest_header = header_digest, .txt_header_size = BGEN64 (unmodified_txt_header_len), // length before zip-side modifications }; @@ -84,7 +84,7 @@ void txtheader_compress (BufferP txt_header, // true if filename is compressed with gz/bgzf but does not have a .gz/.bgz extension (e.g. 
usually true for BAM files) section_header.flags.txt_header.no_gz_ext = - (SRC_CODEC(GZ) || SRC_CODEC(BGZF) || SRC_CODEC(BAM) || SRC_CODEC(CRAM)/*reconstructed as BAM*/) && + (TXT_IS_GZIP || SRC_CODEC(BAM) || SRC_CODEC(CRAM)/*reconstructed as BAM*/) && txt_file->basename && !filename_has_ext (txt_file->basename, ".gz") && !filename_has_ext (txt_file->basename, ".bgz"); // In BGZF, we store the 3 least significant bytes of the file size, so check if the reconstructed BGZF file is likely the same @@ -214,10 +214,10 @@ static void txtheader_uncompress_one_vb (VBlockP vb) // case 1: outputing a single file - generate txt_filename based on the z_file's name // case 2: unbinding a genozip into multiple txt files - generate txt_filename of a component file from the // component name in SEC_TXT_HEADER -static rom txtheader_piz_get_filename_do (rom orig_name, rom prefix, bool is_orig_name_genozip, bool has_bgzf, bool no_gz_ext) +static rom txtheader_piz_get_filename_do (rom orig_name, rom prefix, rom out_dirname, bool is_orig_name_genozip, bool has_gz_ext) { unsigned fn_len = strlen (orig_name); - unsigned dn_len = flag.out_dirname ? strlen (flag.out_dirname) : 0; + unsigned dn_len = out_dirname ? strlen (out_dirname) : 0; unsigned px_len = prefix ? strlen (prefix) : 0; unsigned genozip_ext_len = is_orig_name_genozip ? 
STRLEN(GENOZIP_EXT) : 0; int txt_filename_size = fn_len + dn_len + px_len + 10; @@ -247,53 +247,51 @@ static rom txtheader_piz_get_filename_do (rom orig_name, rom prefix, bool is_ori fn_len = strlen (orig_name); } - // cases in which BGZF compression does not result in a ".gz" file name extension: - // if original gz/bgzf-compressed file did not have a .gz/.bgz extension (since 15.0.23) or when outputing BAM or BCF - no_gz_ext = no_gz_ext // original file had no .gz extension despite being GZ/BGZF compressed - || OUT_DT(BCF) || OUT_DT(BAM) || OUT_DT(CRAM) // data types for which we never add .gz - || !has_bgzf; // bgzf logic decided not to compress this file - snprintf ((char *)txt_filename, txt_filename_size, "%s%s%s%.*s%s%s", prefix ? prefix : "", - (dn_len ? flag.out_dirname : ""), (dn_len ? "/" : ""), + (dn_len ? out_dirname : ""), (dn_len ? "/" : ""), fn_len - genozip_ext_len - old_ext_removed_len, orig_name, old_ext_removed_len ? file_plain_ext_by_dt (flag.out_dt) : "", // add translated extension if needed - no_gz_ext ? "" : ".gz"); // add .gz if needed + has_gz_ext ? ".gz" : ""); // add .gz if needed if (dn_len) FREE (orig_name); // allocated by filename_base return txt_filename; } -rom txtheader_piz_get_filename (SectionHeaderTxtHeader *header, bool has_gz_ext) +// returns filename which caller is responsible to free (but returns NULL if to stdout) +static rom txtheader_piz_get_filename (const SectionHeaderTxtHeader *header, rom out_dirname/*optional*/) { - return flag.to_stdout ? NULL - : flag.out_filename ? flag.out_filename - : flag.unbind ? txtheader_piz_get_filename_do (header->txt_filename, flag.unbind, false, has_gz_ext, header->flags.txt_header.no_gz_ext) - : txtheader_piz_get_filename_do (z_name, "", true, has_gz_ext, header->flags.txt_header.no_gz_ext); // use genozip filename as a base regardless of original name + // note: for bz2, xz, and zip - we reconstruct as gz too. better choice than plain. 
+ #define C(cdc) (header->src_codec == CODEC_##cdc) + bool has_gzip = (flag.bgzf == BGZF_NOT_INITIALIZED) ? (IS_GZIP(header->src_codec) || C(BZ2) || C(XZ) || C(ZIP)) // note: similar logic to in mgzip_piz_calculate_mgzip_flags + : (flag.bgzf != 0); + #undef C + + bool has_gz_ext = has_gzip && + !header->flags.txt_header.no_gz_ext && // source was gzip-compressed but lacked a .gz extension - we keep it that way + !OUT_DT(BCF) && !OUT_DT(BAM) && !OUT_DT(CRAM); // data types for which we never add .gz + + rom filename = flag.to_stdout ? NULL + : flag.out_filename ? ({ char *fn=MALLOC(strlen(flag.out_filename)+1); strcpy (fn, flag.out_filename); fn; }) + : flag.unbind ? txtheader_piz_get_filename_do (header->txt_filename, flag.unbind, out_dirname, false, has_gz_ext) + : txtheader_piz_get_filename_do (z_name, "", out_dirname, true, has_gz_ext); // use genozip filename as a base regardless of original name + + return filename; } -// get filename, even if txt_file has might not been open. -StrTextLong txtheader_get_txt_filename_from_section (void) +// PIZ: get filename (without the directory name), even if txt_file has not been opened. +StrTextLong txtheader_get_txt_filename_from_section (CompIType comp_i) { Section sec; if (flag.one_component && !flag.deep) sec = sections_get_comp_txt_header_sec (flag.one_component - 1); else - sec = sections_get_comp_txt_header_sec (COMP_MAIN); + sec = sections_get_comp_txt_header_sec (comp_i); SectionHeaderTxtHeader header = zfile_read_section_header (evb, sec, SEC_TXT_HEADER).txt_header; - // note: for bz2, xz, and zip - we reconstruct as gz too. better choice than plain. - #define C(cdc) (header.src_codec == CODEC_##cdc) - bool has_bgzf = (flag.bgzf == BGZF_NOT_INITIALIZED) ? 
((C(BGZF) || C(GZ) || C(BZ2) || C(XZ) || C(ZIP))) // note: similar logic to in bgzf_piz_calculate_bgzf_flags - : (flag.bgzf != 0); - #undef C - - TEMP_FLAG(out_dirname, NULL); // only basename in progress string - rom filename = txtheader_piz_get_filename (&header, has_bgzf); - - RESTORE_FLAG(out_dirname); + rom filename = txtheader_piz_get_filename (&header, NULL); StrTextLong name = {}; if (filename) { @@ -336,7 +334,7 @@ void txtheader_piz_read_and_reconstruct (Section sec) // read actual txt header data only if we need to reconstruct the header bool needs_recon = writer_does_txtheader_need_recon (sec->comp_i); - FlagsBgzf bgzf_flags = {}; + FlagsMgzip mgzip_flags = {}; rom filename = NULL; sum_fragment_len = 0; @@ -346,21 +344,21 @@ void txtheader_piz_read_and_reconstruct (Section sec) txtheader_uncompress_one_vb, NO_CALLBACK); - bgzf_flags = bgzf_piz_calculate_bgzf_flags (sec->comp_i, header.src_codec); + mgzip_flags = mgzip_piz_calculate_mgzip_flags (sec->comp_i, header.src_codec); - filename = txtheader_piz_get_filename (&header, (bgzf_flags.library != BGZF_NO_LIBRARY)); + filename = txtheader_piz_get_filename (&header, flag.out_dirname); } // note: when reading an auxiliary file or no_writer - we still create txt_file (but don't actually open the physical file) // note: if there are several components contributing to a single txt_file (e.g. SAM w/gencomp) - we only open it once if (!txt_file) - txt_file = file_open_txt_write (filename, flag_loading_auxiliary ? z_file->data_type : flag.out_dt, bgzf_flags.level); + txt_file = file_open_txt_write (filename, flag_loading_auxiliary ? 
z_file->data_type : flag.out_dt, mgzip_flags.level); - if (!flag.to_stdout && !flag.out_filename) FREE (filename); // file_open_z copies the names + FREE (filename); // file_open_z copies the names - // set BGZF info in txt_file - either that originates from SEC_BGZF, or constructed based on bgzf_flags - if (needs_recon && TXT_IS_BGZF) - bgzf_piz_set_txt_file_bgzf_info (bgzf_flags, header.codec_info); + // If we are reconstructing to a MGZIP codec: set recompression info + if (needs_recon && TXT_IS_MGZIP) + bgzf_piz_set_txt_file_bgzf_info (mgzip_flags, header.codec_info); // note: this is reset for each component: // since v14 it is used for the commulative component-scope MD5 used for both VBs and txt file verification @@ -377,7 +375,8 @@ void txtheader_piz_read_and_reconstruct (Section sec) txt_file->num_vbs = sections_count_sections_until (SEC_VB_HEADER, sec, SEC_TXT_HEADER); txt_file->txt_data_size = BGEN64 (header.txt_data_size); txt_file->txt_data_so_far_single_0 = sum_fragment_len; - + txt_file->src_codec = header.src_codec; + if (VER(15)) for (QType q=0; q < NUM_QTYPES; q++) segconf.flav_prop[q] = header.flav_prop[q]; diff --git a/src/txtheader.h b/src/txtheader.h index 048b99b7..55178ed6 100644 --- a/src/txtheader.h +++ b/src/txtheader.h @@ -20,5 +20,5 @@ extern int64_t txtheader_zip_read_and_compress (int64_t *txt_header_offset, Comp //---------- extern void txtheader_piz_read_and_reconstruct (Section txt_header_sec); -extern StrTextLong txtheader_get_txt_filename_from_section (void); +extern StrTextLong txtheader_get_txt_filename_from_section (CompIType comp_i); diff --git a/src/url.c b/src/url.c index 1227fdc3..eb7a950a 100644 --- a/src/url.c +++ b/src/url.c @@ -50,7 +50,10 @@ static StreamP url_open (StreamP parent_stream, rom url, bool head_only) bool is_file = str_case_compare (str5, "file:", NULL); // wget is better than curl in flakey connections - if (!is_file && wget_available()) // note: wget doesn't support file:// and also not supported for 
Windows (see wget_available) + if (!is_file && // wget doesn't support file:// + !(head_only && curl_available()) // wget --spider doesn't follow redirects, so for header_only, we prefer curl if its available + && wget_available()) // note: wget is not supported for Windows (see wget_available) + return stream_create (parent_stream, head_only ? 0 : DEFAULT_PIPE_SIZE, // in wget, header arrives in error channel and data channel is empty DEFAULT_PIPE_SIZE, 0, 0, 0, 0, @@ -64,7 +67,7 @@ static StreamP url_open (StreamP parent_stream, rom url, bool head_only) else if (curl_available()) return stream_create (parent_stream, DEFAULT_PIPE_SIZE, 0, 0, 0, 0, 0, - "To compress files from a URL", "curl", "--silent", + "To compress files from a URL", "curl", "--silent", "--location", flag.is_windows ? "--ssl-no-revoke" : SKIP_ARG, head_only ? "--head" : SKIP_ARG, url, NULL); diff --git a/src/vblock.c b/src/vblock.c index 605b0ab9..9157e8e2 100644 --- a/src/vblock.c +++ b/src/vblock.c @@ -1,464 +1,464 @@ -// ------------------------------------------------------------------ -// vblock.c -// Copyright (C) 2019-2024 Genozip Limited. Patent Pending. -// Please see terms and conditions in the file LICENSE.txt -// -// WARNING: Genozip is proprietary, not open source software. Modifying the source code is strictly prohibited, -// under penalties specified in the license. - -// vb stands for VBlock - it started its life as VBlockVCF when genozip could only compress VCFs, but now -// it means a block of lines from the text file. 
- -#include "vblock.h" -#include "file.h" -#include "digest.h" -#include "bgzf.h" -#include "threads.h" -#include "writer.h" - -// pool of VBs allocated based on number of threads -static VBlockPoolP pools[NUM_POOL_TYPES] = {}; - -VBlockP evb = NULL; // outside a pool - -VBlockPool *vb_get_pool (VBlockPoolType type, FailType soft_fail) -{ - ASSERT (pools[type] || soft_fail, "VB Pool type=%u is not allocated", type); - return pools[type]; -} - -VBlockP vb_get_from_pool (VBlockPoolP pool, VBID vb_id) -{ - ASSERTNOTNULL (pool); - - if (!(vb_id == VB_ID_EVB || (vb_id >= 0 && vb_id < pool->num_vbs))) - return NULL; // soft fail on invalid vb_id - - return (vb_id == VB_ID_EVB) ? evb : pool->vb[vb_id]; -} - -static VBlockP nonpool_vbs[NUM_NONPOOL_VBs] = {}; - -static inline bool is_in_use (VBlockP vb) -{ - return __atomic_load_n (&vb->in_use, __ATOMIC_ACQUIRE); -} - -static void set_in_use (VBlockP vb, bool in_use) -{ - __atomic_store_n (&vb->in_use, in_use, __ATOMIC_RELEASE); -} - -void vb_release_vb_do (VBlockP *vb_p, rom task_name, rom func) -{ - START_TIMER; - - VBlockP vb = *vb_p; - if (!vb) return; // nothing to release - - ASSERT (is_in_use (vb) || vb==evb, "Cannot release VB because it is not in_use (called from %s): vb->id=%d vb->vblock_id=%u", - func, vb->id, vb->vblock_i); - - threads_log_by_vb (vb, vb->compute_task ? vb->compute_task : func, "RELEASING VB", 0); - - if (flag.show_time_comp_i == vb->comp_i || flag.show_time_comp_i == COMP_ALL) - profiler_add (vb); - - if (vb->id != VB_ID_EVB) // cannot test evb, see comment in buflist_test_overflows_do - buflist_test_overflows(vb, func); - - // verify that gzip_compressor was released after use - ASSERT (!vb->gzip_compressor, "vb=%s: expecting gzip_compressor=NULL", VB_NAME); - - // release all buffers in vb->buffer_list, and zero the space between these buffers - buflist_free_vb (vb); - - // this release can be run by either the main or writer thread. 
we make sure to update in_use as the very - // last change, and do so atomically - - // case: this VB is from the pool (i.e. not evb) - int32_t num_in_use = -1; - if (vb != evb) { - // Logic: num_in_use is always AT LEAST sum(vb)->in_use. i.e. pessimistic. (it can be mometarily less between these two updates) - set_in_use (vb, false); // released the VB back into the pool - it may now be reused - if (vb->id >= 0) - num_in_use = __atomic_sub_fetch (&pools[vb->pool]->num_in_use, 1, __ATOMIC_ACQ_REL); // atomic to prevent concurrent update by writer thread and main thread (must be after update of in_use) - *vb_p = NULL; - } - - if (flag_is_show_vblocks (task_name)) - iprintf (vb->id >= 0 ? "VB_RELEASE(task=%s id=%d) vb=%s caller=%s in_use=%d/%d\n" - : "VB_RELEASE(task=%s id=%d) vb=%s caller=%s\n", - task_name, vb->id, VB_NAME, func, num_in_use, (vb->id >= 0 ? pools[vb->pool]->num_vbs : -1)); - - buflist_compact (vb); - - if (vb->id >= 0) COPY_TIMER_EVB (vb_release_vb_do); -} - - -void vb_destroy_vb_do (VBlockP *vb_p, rom func) -{ - ASSERTMAINTHREAD; - START_TIMER; - - VBlockP vb = *vb_p; - if (!vb) return; - - if (flag_is_show_vblocks (vb->compute_task)) - iprintf ("VB_DESTROY(id=%d) vb_i=%d caller=%s\n", vb->id, vb->vblock_i, func); - - bool is_evb = vb->id == VB_ID_EVB; - - buflist_destroy_vb_bufs (vb, false); - - // case: this is a nonpool VB - for (int i=0; i < NUM_NONPOOL_VBs; i++) - if (nonpool_vbs[i] == vb) - nonpool_vbs[i] = NULL; - - FREE (*vb_p); - - if (!is_evb) COPY_TIMER_EVB (vb_destroy_vb); -} - -// return all VBlocks memory and unused evb memory to libc and optionally to the kernel -void vb_dehoard_memory (bool release_to_kernel) -{ - vb_destroy_pool_vbs (POOL_MAIN, false); - buflist_destroy_vb_bufs (evb, true); // destroys all unused buffers - - if (release_to_kernel) - buf_low_level_release_memory_back_to_kernel(); -} - -void vb_create_pool (VBlockPoolType type, rom name) -{ - // only main-thread dispatcher can create a pool. 
other dispatcher (eg writer's bgzf compression) can must existing pool - uint32_t num_vbs = (type == POOL_MAIN) ? MAX_(1, global_max_threads) + // compute thread VBs - (IS_PIZ ? 2 : 0) + // txt header VB and wvb (for PIZ) - (IS_PIZ ? z_file->max_conc_writing_vbs : 0) // thread-less VBs handed over to the writer thread - : writer_get_max_bgzf_threads(); - if (flag_is_show_vblocks (NULL)) - iprintf ("CREATING_VB_POOL: type=%s global_max_threads=%u max_conc_writing_vbs=%u num_vbs=%u\n", - name, global_max_threads, z_file->max_conc_writing_vbs, num_vbs); - - uint32_t size = sizeof (VBlockPool) + num_vbs * sizeof (VBlockP); - - if (!pools[type]) - // allocation includes array of pointers (initialized to NULL) - pools[type] = (VBlockPool *)CALLOC (size); // note we can't use Buffer yet, because we don't have VBs yet... - - // case: old pool is too small - realloc it (the pool contains only pointers to VBs, so the VBs themselves are not realloced) - else if (pools[type]->num_vbs < num_vbs) { - REALLOC (&pools[type], size, "VBlockPool"); - memset (&pools[type]->vb[pools[type]->num_vbs], 0, (num_vbs - pools[type]->num_vbs) * sizeof (VBlockP)); // initialize new entries - } - - pools[type]->name = name; - pools[type]->size = size; - pools[type]->num_vbs = MAX_(num_vbs, pools[type]->num_vbs); -} - -VBlockP vb_initialize_nonpool_vb (VBID vb_id, DataType dt, rom task) -{ - VBlockP vb = CALLOC (get_vb_size (dt)); - vb->data_type = DT_NONE; - vb->id = vb_id; - vb->compute_task = task; - vb->data_type = dt; - vb->data_type_alloced = dt; - vb->comp_i = COMP_NONE; - vb->pool = NO_POOL; - init_dict_id_to_did_map (vb->d2d_map); - - if (!vb->buffer_list.vb) { - vb->buffer_list.name = "buffer_list"; - buf_init_lock (&vb->buffer_list); - buflist_add_buf (vb, &vb->buffer_list, __FUNCLINE); - vb->buffer_list.vb = vb; // indication buffer was added to buffer list - } - - nonpool_vbs[NUM_NONPOOL_VBs + vb_id] = vb; // vb_id is a negative integer - set_in_use (vb, true); - - return vb; -} - 
-VBlockP vb_get_nonpool_vb (VBID vb_id) -{ - return nonpool_vbs[NUM_NONPOOL_VBs + vb_id]; // may be NULL -} - -// used to change segconf VB data_type is seg_initiatlize (FASTA->FASTQ) -void vb_change_datatype_nonpool_vb (VBlockP *vb_p, DataType new_dt) -{ - REALLOC (vb_p, get_vb_size (new_dt), "VBlock"); - VBlockP vb = *vb_p; - - vb->data_type = vb->data_type_alloced = new_dt; - init_dict_id_to_did_map (vb->d2d_map); - - buflist_update_vb_addr_change (vb, vb->buffer_list.vb); - - nonpool_vbs[NUM_NONPOOL_VBs + vb->id] = vb; -} - -static VBlockP vb_update_data_type (VBlockP vb, DataType dt, DataType alloc_dt, uint64_t sizeof_vb) -{ - if (z_file && vb->data_type == dt) return vb; - - // the new data type has a private section in its VB, that is different that the one of alloc_dt - realloc private section - if (vb->data_type_alloced != alloc_dt && alloc_dt != DT_NONE) { - - // destroy private part of previous data_type. we also destroy all contexts as new data type is going - // to allocate different contexts with a different memory usage profile (eg b250 vs local) for each - if (vb->data_type_alloced != DT_NONE) - buflist_destroy_private_and_context_vb_bufs (vb); - - buflist_compact (vb); // remove buffer_list entries marked for removal - - // calloc private part of new data_type - VBlockP old_vb = vb; - REALLOC (&vb, sizeof_vb, "VBlock"); - - // initialize private part of new data_type, keeping common part intact - memset ((char*)vb + sizeof (VBlock), 0, sizeof_vb - sizeof (VBlock)); - - // update buf->vb in all buffers of this VB to new VB address - if (vb != old_vb) buflist_update_vb_addr_change (vb, old_vb); - vb->data_type_alloced = alloc_dt; - } - - vb->data_type = dt; - return vb; -} - -// allocate an unused vb from the pool. 
seperate pools for zip and unzip -VBlockP vb_get_vb (VBlockPoolType type, rom task_name, VBIType vblock_i, CompIType comp_i) -{ - START_TIMER; - - VBlockPool *pool = vb_get_pool (type, HARD_FAIL); - -#ifdef DEBUG - // if GFF VB becauses larger than FASTA, then we need to change the dt assignment conditions below - ASSERT0 (get_vb_size (DT_FASTA) > get_vb_size (DT_GFF), "Failed assumption that FASTA has larger VB than GFF"); -#endif - - DataType dt = (type == POOL_BGZF) ? DT_NONE - : (flag.deep && flag.zip_comp_i >= SAM_COMP_FQ00) ? DT_FASTQ - : (IS_ZIP && segconf.has_embedded_fasta) ? DT_FASTA // GFF3 with embedded FASTA (allocate a FASTA VB as it larger than GFF and can accommodate both) - : (IS_ZIP && txt_file) ? txt_file->data_type - : (IS_PIZ && z_file && Z_DT(GFF)) ? DT_FASTA // vb is allocated before reading the VB_HEADER, so we don't yet know if it is a embdedded fasta VB. To be safe, we allocate enough memory for a VBlockFASTA (which is larger), so we can change the dt in gff_piz_init_vb() without needing to realloc - : (IS_PIZ && z_file && flag.deep && comp_i != COMP_NONE && comp_i >= SAM_COMP_FQ00) ? DT_FASTQ - : (IS_PIZ && z_file) ? z_file->data_type - : DT_NONE; - - uint64_t sizeof_vb = get_vb_size (dt); - - DataType alloc_dt = sizeof_vb == sizeof (VBlock) ? DT_NONE - : (dt == DT_REF && IS_PIZ) ? DT_NONE - : dt == DT_BAM ? DT_SAM - : dt == DT_BCF ? 
DT_VCF - : dt; - - if (type == POOL_MAIN && IS_PIZ && z_file && Z_DT(GFF)) - dt = DT_GFF; // return GFF dt to its true dt after getting the size, otherwise it won't work - - // circle around until a VB becomes available (busy wait) - VBID vb_id; for (vb_id=0; ; vb_id = (vb_id+1) % pool->num_vbs) { - if (!pool->vb[vb_id]) { // VB is not allocated - allocate it - pool->vb[vb_id] = CALLOC (sizeof_vb); - pool->num_allocated_vbs++; - pool->vb[vb_id]->data_type_alloced = alloc_dt; - break; - } - - else if (!is_in_use (pool->vb[vb_id])) { - pool->vb[vb_id] = vb_update_data_type (pool->vb[vb_id], dt, alloc_dt, sizeof_vb); // possibly realloc - break; - } - - // case: we've checked all the VBs and none is available - wait a bit and check again - // in PIZ, this happens when a lot VBs are handed over to the writer thread which has not processed them yet. - // for example, if writer is blocking on write(), waiting for a pipe counterpart to read. - // it will be released when the writer thread completes one VB. - if (vb_id == pool->num_vbs-1) usleep (50000); // 50 ms - } - - VBlockP vb = pool->vb[vb_id]; - - // Logic: num_in_use is always AT LEAST sum(vb)->in_use. i.e. pessimistic. 
(it can be mometarily more between these two updates) - uint32_t num_in_use = __atomic_add_fetch (&pool->num_in_use, 1, __ATOMIC_ACQ_REL); // atomic to prevent concurrent update by writer thread and main thread (must be before update of in_use) - set_in_use (vb, true); - - // initialize VB fields that need to be a value other than 0 - vb->id = vb_id; - vb->data_type = dt; - vb->vblock_i = vblock_i; - vb->comp_i = comp_i; - vb->compute_thread_id = THREAD_ID_NONE; - vb->compute_task = task_name; - vb->pool = type; - init_dict_id_to_did_map (vb->d2d_map); - - if (!vb->buffer_list.vb) { - vb->buffer_list.name = "buffer_list"; - buf_init_lock (&vb->buffer_list); - buflist_add_buf (vb, &vb->buffer_list, __FUNCLINE); - vb->buffer_list.vb = vb; // indication buffer was added to buffer list - } - - if (flag_is_show_vblocks (task_name)) - iprintf ("VB_GET_VB(task=%s id=%u) vb_i=%s/%d num_in_use=%u/%u\n", - task_name, vb->id, comp_name (vb->comp_i), vb->vblock_i, num_in_use, pool->num_vbs); - - threads_log_by_vb (vb, task_name, "GET VB", 0); - - if (flag.debug_memory) - iprintf ("vb_get_vb: got vb_i=%d id=%d task=%s dt=%s address=[%p - %p]\n", - vblock_i, vb_id, task_name, dt_name(alloc_dt), vb, (char*)vb + sizeof_vb); - - COPY_TIMER_EVB (vb_get_vb); - return vb; -} - -uint32_t vb_pool_get_num_in_use (VBlockPoolType type, VBID *id/*optional out*/) -{ - VBlockPool *pool = vb_get_pool (type, HARD_FAIL); - int num_in_use = __atomic_load_n (&pool->num_in_use, __ATOMIC_ACQUIRE); // atomic, bc for POOL_MAIN, writer thread might update concurrently. 
- - if (id) { - *id = VB_ID_NONE; - if (num_in_use) { - for (VBID vb_id=0; vb_id < pool->num_vbs; vb_id++) - if (pool->vb[vb_id] && pool->vb[vb_id]->in_use) - return *id = vb_id; - - if (*id == VB_ID_NONE) // all lost use while we were checking - num_in_use = 0; - } - } - - return num_in_use; -} - -// Note: num_in_use is always AT LEAST sum(vb)->in_use (it can be mometarily less than sum(vb)->in_use as they are getting updated) -// therefore, this function may return true when pool is actually no longer full. -bool vb_pool_is_full (VBlockPoolType type) -{ - return vb_pool_get_num_in_use (type, NULL) == vb_get_pool(type, HARD_FAIL)->num_vbs; -} - -// Note: As in vb_pool_is_full, if the function returns false (not empty) the pool might in fact already be empty -bool vb_pool_is_empty (VBlockPoolType type) -{ - return vb_pool_get_num_in_use (type, NULL) == 0; -} - -bool vb_is_valid (VBlockP vb) -{ - if (vb->pool != NO_POOL) - for (VBID vb_id=0; vb_id < pools[vb->pool]->num_vbs; vb_id++) - if (vb == pools[vb->pool]->vb[vb_id]) return true; - - for (int i=0; i < NUM_NONPOOL_VBs; i++) - if (nonpool_vbs[i] == vb) return true; - - return false; -} - -// frees memory of all VBs, except for non-pool VBs (evb, segconf, writer,...) 
-void vb_destroy_pool_vbs (VBlockPoolType type, bool destroy_pool) -{ - if (!pools[type]) return; - - for (VBID vb_id=0; vb_id < pools[type]->num_vbs; vb_id++) - vb_destroy_vb (&pools[type]->vb[vb_id]); - - if (destroy_pool) - FREE (pools[type]); -} - -StrText err_vb_pos (void *vb) -{ - StrText s; - - snprintf (s.s, sizeof (s), "vb i=%u position in %s file=%"PRIu64, - (VB)->vblock_i, dt_name (txt_file->data_type), (VB)->vb_position_txt_file); - return s; -} - -unsigned def_vb_size (DataType dt) { return sizeof (VBlock); } - -void vb_set_is_processed (VBlockP vb) -{ - __atomic_store_n (&vb->is_processed, (bool)true, __ATOMIC_RELEASE); -} - -bool vb_is_processed (VBlockP vb) -{ - return __atomic_load_n (&vb->is_processed, __ATOMIC_ACQUIRE); -} - -bool vb_buf_locate (VBlockP vb, ConstBufferP buf) -{ - if (!vb) return false; - - unsigned sizeof_vb = get_vb_size (vb->data_type_alloced); - - return vb && is_p_in_range (buf, vb, sizeof_vb); -} - -rom textual_assseg_line (VBlockP vb) -{ - if (vb->line_start >= Ltxt) return "Invalid line_start"; - - char *nl = memchr (Btxt(vb->line_start), '\n', Ltxt - vb->line_start); - if (!nl) nl = BAFTtxt; // possibly overwriting txt_data's overflow fence - - *nl = 0; // terminate string - return Btxt(vb->line_start); -} - -static void vb_deferred_q_reorder (VBlockP vb, Did did_i, int q_index, int depth) -{ - ASSERT (depth < 10, "Cyclic seg order requirements, ctx=%s", CTX(did_i)->tag_name); - - for (int i=0; i < q_index; i++) - if (vb->deferred_q[i].seg_after_did_i == did_i) { - // move element i to after element q_index - // example start: [A, B, C, D] (B.seg_after_did_i=D). 
result: [A, C, D, B] - DeferredField df = vb->deferred_q[i]; - memmove (&vb->deferred_q[i], &vb->deferred_q[i+1], (q_index - i) * sizeof (DeferredField)); - vb->deferred_q[q_index] = df; - - // now, move any element that needs to be after i (B in the example) - vb_deferred_q_reorder (vb, df.did_i, q_index, depth+1); - } -} - -void vb_add_to_deferred_q (VBlockP vb, ContextP ctx, DeferredSeg seg, int16_t idx, - Did seg_after_did_i) // optional (DID_NONE if not) - ctx cannot be segged before seg_after_did_i (= another context that might be in the deferred queue) -{ - ASSERT (vb->deferred_q_len+1 < DEFERRED_Q_SZ, "%s: deferred queue is full (deferred_q_len=%u) when adding %s", VB_NAME, vb->deferred_q_len, ctx->tag_name); - ASSERT (idx >= 0, "Invalid idx=%d when adding %s", idx, ctx->tag_name); - - vb->deferred_q[vb->deferred_q_len++] = (DeferredField){ .did_i=ctx->did_i, .seg=seg, .idx=idx, .seg_after_did_i = seg_after_did_i }; - - // change order of segging if needed - vb_deferred_q_reorder (vb, ctx->did_i, vb->deferred_q_len-1, 1); -} - -void vb_display_deferred_q (VBlockP vb, rom func) -{ - if (!vb->deferred_q_len) return; - - iprintf ("%s: %s Deferred seg queue: ", func, LN_NAME); - - for (int i=0; i < vb->deferred_q_len; i++) - iprintf ("%s ", CTX(vb->deferred_q[i].did_i)->tag_name); - - iprint0 ("\n"); -} +// ------------------------------------------------------------------ +// vblock.c +// Copyright (C) 2019-2024 Genozip Limited. Patent Pending. +// Please see terms and conditions in the file LICENSE.txt +// +// WARNING: Genozip is proprietary, not open source software. Modifying the source code is strictly prohibited, +// under penalties specified in the license. + +// vb stands for VBlock - it started its life as VBlockVCF when genozip could only compress VCFs, but now +// it means a block of lines from the text file. 
+ +#include "vblock.h" +#include "file.h" +#include "digest.h" +#include "mgzip.h" +#include "threads.h" +#include "writer.h" + +// pool of VBs allocated based on number of threads +static VBlockPoolP pools[NUM_POOL_TYPES] = {}; + +VBlockP evb = NULL; // outside a pool + +VBlockPool *vb_get_pool (VBlockPoolType type, FailType soft_fail) +{ + ASSERT (pools[type] || soft_fail, "VB Pool type=%u is not allocated", type); + return pools[type]; +} + +VBlockP vb_get_from_pool (VBlockPoolP pool, VBID vb_id) +{ + ASSERTNOTNULL (pool); + + if (!(vb_id == VB_ID_EVB || (vb_id >= 0 && vb_id < pool->num_vbs))) + return NULL; // soft fail on invalid vb_id + + return (vb_id == VB_ID_EVB) ? evb : pool->vb[vb_id]; +} + +static VBlockP nonpool_vbs[NUM_NONPOOL_VBs] = {}; + +static inline bool is_in_use (VBlockP vb) +{ + return __atomic_load_n (&vb->in_use, __ATOMIC_ACQUIRE); +} + +static void set_in_use (VBlockP vb, bool in_use) +{ + __atomic_store_n (&vb->in_use, in_use, __ATOMIC_RELEASE); +} + +void vb_release_vb_do (VBlockP *vb_p, rom task_name, rom func) +{ + START_TIMER; + + VBlockP vb = *vb_p; + if (!vb) return; // nothing to release + + ASSERT (is_in_use (vb) || vb==evb, "Cannot release VB because it is not in_use (called from %s): vb->id=%d vb->vblock_id=%u", + func, vb->id, vb->vblock_i); + + threads_log_by_vb (vb, vb->compute_task ? vb->compute_task : func, "RELEASING VB", 0); + + if (flag.show_time_comp_i == vb->comp_i || flag.show_time_comp_i == COMP_ALL) + profiler_add (vb); + + if (vb->id != VB_ID_EVB) // cannot test evb, see comment in buflist_test_overflows_do + buflist_test_overflows(vb, func); + + // verify that gzip_compressor was released after use + ASSERT (!vb->gzip_compressor, "vb=%s: expecting gzip_compressor=NULL", VB_NAME); + + // release all buffers in vb->buffer_list, and zero the space between these buffers + buflist_free_vb (vb); + + // this release can be run by either the main or writer thread. 
we make sure to update in_use as the very + // last change, and do so atomically + + // case: this VB is from the pool (i.e. not evb) + int32_t num_in_use = -1; + if (vb != evb) { + // Logic: num_in_use is always AT LEAST sum(vb)->in_use. i.e. pessimistic. (it can be mometarily less between these two updates) + set_in_use (vb, false); // released the VB back into the pool - it may now be reused + if (vb->id >= 0) + num_in_use = __atomic_sub_fetch (&pools[vb->pool]->num_in_use, 1, __ATOMIC_ACQ_REL); // atomic to prevent concurrent update by writer thread and main thread (must be after update of in_use) + *vb_p = NULL; + } + + if (flag_is_show_vblocks (task_name)) + iprintf (vb->id >= 0 ? "VB_RELEASE(task=%s id=%d) vb=%s caller=%s in_use=%d/%d\n" + : "VB_RELEASE(task=%s id=%d) vb=%s caller=%s\n", + task_name, vb->id, VB_NAME, func, num_in_use, (vb->id >= 0 ? pools[vb->pool]->num_vbs : -1)); + + buflist_compact (vb); + + if (vb->id >= 0) COPY_TIMER_EVB (vb_release_vb_do); +} + + +void vb_destroy_vb_do (VBlockP *vb_p, rom func) +{ + ASSERTMAINTHREAD; + START_TIMER; + + VBlockP vb = *vb_p; + if (!vb) return; + + if (flag_is_show_vblocks (vb->compute_task)) + iprintf ("VB_DESTROY(id=%d) vb_i=%d caller=%s\n", vb->id, vb->vblock_i, func); + + bool is_evb = vb->id == VB_ID_EVB; + + buflist_destroy_vb_bufs (vb, false); + + // case: this is a nonpool VB + for (int i=0; i < NUM_NONPOOL_VBs; i++) + if (nonpool_vbs[i] == vb) + nonpool_vbs[i] = NULL; + + FREE (*vb_p); + + if (!is_evb) COPY_TIMER_EVB (vb_destroy_vb); +} + +// return all VBlocks memory and unused evb memory to libc and optionally to the kernel +void vb_dehoard_memory (bool release_to_kernel) +{ + vb_destroy_pool_vbs (POOL_MAIN, false); + buflist_destroy_vb_bufs (evb, true); // destroys all unused buffers + + if (release_to_kernel) + buf_low_level_release_memory_back_to_kernel(); +} + +void vb_create_pool (VBlockPoolType type, rom name) +{ + // only main-thread dispatcher can create a pool. 
other dispatcher (eg writer's bgzf compression) can must existing pool + uint32_t num_vbs = (type == POOL_MAIN) ? MAX_(1, global_max_threads) + // compute thread VBs + (IS_PIZ ? 2 : 0) + // txt header VB and wvb (for PIZ) + (IS_PIZ ? z_file->max_conc_writing_vbs : 0) // thread-less VBs handed over to the writer thread + : writer_get_max_bgzf_threads(); + if (flag_is_show_vblocks (NULL)) + iprintf ("CREATING_VB_POOL: type=%s global_max_threads=%u max_conc_writing_vbs=%u num_vbs=%u\n", + name, global_max_threads, z_file->max_conc_writing_vbs, num_vbs); + + uint32_t size = sizeof (VBlockPool) + num_vbs * sizeof (VBlockP); + + if (!pools[type]) + // allocation includes array of pointers (initialized to NULL) + pools[type] = (VBlockPool *)CALLOC (size); // note we can't use Buffer yet, because we don't have VBs yet... + + // case: old pool is too small - realloc it (the pool contains only pointers to VBs, so the VBs themselves are not realloced) + else if (pools[type]->num_vbs < num_vbs) { + REALLOC (&pools[type], size, "VBlockPool"); + memset (&pools[type]->vb[pools[type]->num_vbs], 0, (num_vbs - pools[type]->num_vbs) * sizeof (VBlockP)); // initialize new entries + } + + pools[type]->name = name; + pools[type]->size = size; + pools[type]->num_vbs = MAX_(num_vbs, pools[type]->num_vbs); +} + +VBlockP vb_initialize_nonpool_vb (VBID vb_id, DataType dt, rom task) +{ + VBlockP vb = CALLOC (get_vb_size (dt)); + vb->data_type = DT_NONE; + vb->id = vb_id; + vb->compute_task = task; + vb->data_type = dt; + vb->data_type_alloced = dt; + vb->comp_i = COMP_NONE; + vb->pool = NO_POOL; + init_dict_id_to_did_map (vb->d2d_map); + + if (!vb->buffer_list.vb) { + vb->buffer_list.name = "buffer_list"; + buf_init_lock (&vb->buffer_list); + buflist_add_buf (vb, &vb->buffer_list, __FUNCLINE); + vb->buffer_list.vb = vb; // indication buffer was added to buffer list + } + + nonpool_vbs[NUM_NONPOOL_VBs + vb_id] = vb; // vb_id is a negative integer + set_in_use (vb, true); + + return vb; +} + 
+VBlockP vb_get_nonpool_vb (VBID vb_id) +{ + return nonpool_vbs[NUM_NONPOOL_VBs + vb_id]; // may be NULL +} + +// used to change segconf VB data_type is seg_initiatlize (FASTA->FASTQ) +void vb_change_datatype_nonpool_vb (VBlockP *vb_p, DataType new_dt) +{ + REALLOC (vb_p, get_vb_size (new_dt), "VBlock"); + VBlockP vb = *vb_p; + + vb->data_type = vb->data_type_alloced = new_dt; + init_dict_id_to_did_map (vb->d2d_map); + + buflist_update_vb_addr_change (vb, vb->buffer_list.vb); + + nonpool_vbs[NUM_NONPOOL_VBs + vb->id] = vb; +} + +static VBlockP vb_update_data_type (VBlockP vb, DataType dt, DataType alloc_dt, uint64_t sizeof_vb) +{ + if (z_file && vb->data_type == dt) return vb; + + // the new data type has a private section in its VB, that is different that the one of alloc_dt - realloc private section + if (vb->data_type_alloced != alloc_dt && alloc_dt != DT_NONE) { + + // destroy private part of previous data_type. we also destroy all contexts as new data type is going + // to allocate different contexts with a different memory usage profile (eg b250 vs local) for each + if (vb->data_type_alloced != DT_NONE) + buflist_destroy_private_and_context_vb_bufs (vb); + + buflist_compact (vb); // remove buffer_list entries marked for removal + + // calloc private part of new data_type + VBlockP old_vb = vb; + REALLOC (&vb, sizeof_vb, "VBlock"); + + // initialize private part of new data_type, keeping common part intact + memset ((char*)vb + sizeof (VBlock), 0, sizeof_vb - sizeof (VBlock)); + + // update buf->vb in all buffers of this VB to new VB address + if (vb != old_vb) buflist_update_vb_addr_change (vb, old_vb); + vb->data_type_alloced = alloc_dt; + } + + vb->data_type = dt; + return vb; +} + +// allocate an unused vb from the pool. 
seperate pools for zip and unzip +VBlockP vb_get_vb (VBlockPoolType type, rom task_name, VBIType vblock_i, CompIType comp_i) +{ + START_TIMER; + + VBlockPool *pool = vb_get_pool (type, HARD_FAIL); + +#ifdef DEBUG + // if GFF VB becauses larger than FASTA, then we need to change the dt assignment conditions below + ASSERT0 (get_vb_size (DT_FASTA) > get_vb_size (DT_GFF), "Failed assumption that FASTA has larger VB than GFF"); +#endif + + DataType dt = (type == POOL_BGZF) ? DT_NONE + : (flag.deep && flag.zip_comp_i >= SAM_COMP_FQ00) ? DT_FASTQ + : (IS_ZIP && segconf.has_embedded_fasta) ? DT_FASTA // GFF3 with embedded FASTA (allocate a FASTA VB as it larger than GFF and can accommodate both) + : (IS_ZIP && txt_file) ? txt_file->data_type + : (IS_PIZ && z_file && Z_DT(GFF)) ? DT_FASTA // vb is allocated before reading the VB_HEADER, so we don't yet know if it is a embdedded fasta VB. To be safe, we allocate enough memory for a VBlockFASTA (which is larger), so we can change the dt in gff_piz_init_vb() without needing to realloc + : (IS_PIZ && z_file && flag.deep && comp_i != COMP_NONE && comp_i >= SAM_COMP_FQ00) ? DT_FASTQ + : (IS_PIZ && z_file) ? z_file->data_type + : DT_NONE; + + uint64_t sizeof_vb = get_vb_size (dt); + + DataType alloc_dt = sizeof_vb == sizeof (VBlock) ? DT_NONE + : (dt == DT_REF && IS_PIZ) ? DT_NONE + : dt == DT_BAM ? DT_SAM + : dt == DT_BCF ? 
DT_VCF + : dt; + + if (type == POOL_MAIN && IS_PIZ && z_file && Z_DT(GFF)) + dt = DT_GFF; // return GFF dt to its true dt after getting the size, otherwise it won't work + + // circle around until a VB becomes available (busy wait) + VBID vb_id; for (vb_id=0; ; vb_id = (vb_id+1) % pool->num_vbs) { + if (!pool->vb[vb_id]) { // VB is not allocated - allocate it + pool->vb[vb_id] = CALLOC (sizeof_vb); + pool->num_allocated_vbs++; + pool->vb[vb_id]->data_type_alloced = alloc_dt; + break; + } + + else if (!is_in_use (pool->vb[vb_id])) { + pool->vb[vb_id] = vb_update_data_type (pool->vb[vb_id], dt, alloc_dt, sizeof_vb); // possibly realloc + break; + } + + // case: we've checked all the VBs and none is available - wait a bit and check again + // in PIZ, this happens when a lot VBs are handed over to the writer thread which has not processed them yet. + // for example, if writer is blocking on write(), waiting for a pipe counterpart to read. + // it will be released when the writer thread completes one VB. + if (vb_id == pool->num_vbs-1) usleep (50000); // 50 ms + } + + VBlockP vb = pool->vb[vb_id]; + + // Logic: num_in_use is always AT LEAST sum(vb)->in_use. i.e. pessimistic. 
(it can be mometarily more between these two updates) + uint32_t num_in_use = __atomic_add_fetch (&pool->num_in_use, 1, __ATOMIC_ACQ_REL); // atomic to prevent concurrent update by writer thread and main thread (must be before update of in_use) + set_in_use (vb, true); + + // initialize VB fields that need to be a value other than 0 + vb->id = vb_id; + vb->data_type = dt; + vb->vblock_i = vblock_i; + vb->comp_i = comp_i; + vb->compute_thread_id = THREAD_ID_NONE; + vb->compute_task = task_name; + vb->pool = type; + init_dict_id_to_did_map (vb->d2d_map); + + if (!vb->buffer_list.vb) { + vb->buffer_list.name = "buffer_list"; + buf_init_lock (&vb->buffer_list); + buflist_add_buf (vb, &vb->buffer_list, __FUNCLINE); + vb->buffer_list.vb = vb; // indication buffer was added to buffer list + } + + if (flag_is_show_vblocks (task_name)) + iprintf ("VB_GET_VB(task=%s id=%u) vb_i=%s/%d num_in_use=%u/%u\n", + task_name, vb->id, comp_name (vb->comp_i), vb->vblock_i, num_in_use, pool->num_vbs); + + threads_log_by_vb (vb, task_name, "GET VB", 0); + + if (flag.debug_memory) + iprintf ("vb_get_vb: got vb_i=%d id=%d task=%s dt=%s address=[%p - %p]\n", + vblock_i, vb_id, task_name, dt_name(alloc_dt), vb, (char*)vb + sizeof_vb); + + COPY_TIMER_EVB (vb_get_vb); + return vb; +} + +uint32_t vb_pool_get_num_in_use (VBlockPoolType type, VBID *id/*optional out*/) +{ + VBlockPool *pool = vb_get_pool (type, HARD_FAIL); + int num_in_use = __atomic_load_n (&pool->num_in_use, __ATOMIC_ACQUIRE); // atomic, bc for POOL_MAIN, writer thread might update concurrently. 
+ + if (id) { + *id = VB_ID_NONE; + if (num_in_use) { + for (VBID vb_id=0; vb_id < pool->num_vbs; vb_id++) + if (pool->vb[vb_id] && pool->vb[vb_id]->in_use) + return *id = vb_id; + + if (*id == VB_ID_NONE) // all lost use while we were checking + num_in_use = 0; + } + } + + return num_in_use; +} + +// Note: num_in_use is always AT LEAST sum(vb)->in_use (it can be mometarily less than sum(vb)->in_use as they are getting updated) +// therefore, this function may return true when pool is actually no longer full. +bool vb_pool_is_full (VBlockPoolType type) +{ + return vb_pool_get_num_in_use (type, NULL) == vb_get_pool(type, HARD_FAIL)->num_vbs; +} + +// Note: As in vb_pool_is_full, if the function returns false (not empty) the pool might in fact already be empty +bool vb_pool_is_empty (VBlockPoolType type) +{ + return vb_pool_get_num_in_use (type, NULL) == 0; +} + +bool vb_is_valid (VBlockP vb) +{ + if (vb->pool != NO_POOL) + for (VBID vb_id=0; vb_id < pools[vb->pool]->num_vbs; vb_id++) + if (vb == pools[vb->pool]->vb[vb_id]) return true; + + for (int i=0; i < NUM_NONPOOL_VBs; i++) + if (nonpool_vbs[i] == vb) return true; + + return false; +} + +// frees memory of all VBs, except for non-pool VBs (evb, segconf, writer,...) 
+void vb_destroy_pool_vbs (VBlockPoolType type, bool destroy_pool) +{ + if (!pools[type]) return; + + for (VBID vb_id=0; vb_id < pools[type]->num_vbs; vb_id++) + vb_destroy_vb (&pools[type]->vb[vb_id]); + + if (destroy_pool) + FREE (pools[type]); +} + +StrText err_vb_pos (void *vb) +{ + StrText s; + + snprintf (s.s, sizeof (s), "vb i=%u position in %s file=%"PRIu64, + (VB)->vblock_i, dt_name (txt_file->data_type), (VB)->vb_position_txt_file); + return s; +} + +unsigned def_vb_size (DataType dt) { return sizeof (VBlock); } + +void vb_set_is_processed (VBlockP vb) +{ + __atomic_store_n (&vb->is_processed, (bool)true, __ATOMIC_RELEASE); +} + +bool vb_is_processed (VBlockP vb) +{ + return __atomic_load_n (&vb->is_processed, __ATOMIC_ACQUIRE); +} + +bool vb_buf_locate (VBlockP vb, ConstBufferP buf) +{ + if (!vb) return false; + + unsigned sizeof_vb = get_vb_size (vb->data_type_alloced); + + return vb && is_p_in_range (buf, vb, sizeof_vb); +} + +rom textual_assseg_line (VBlockP vb) +{ + if (vb->line_start >= Ltxt) return "Invalid line_start"; + + char *nl = memchr (Btxt(vb->line_start), '\n', Ltxt - vb->line_start); + if (!nl) nl = BAFTtxt; // possibly overwriting txt_data's overflow fence + + *nl = 0; // terminate string + return Btxt(vb->line_start); +} + +static void vb_deferred_q_reorder (VBlockP vb, Did did_i, int q_index, int depth) +{ + ASSERT (depth < 10, "Cyclic seg order requirements, ctx=%s", CTX(did_i)->tag_name); + + for (int i=0; i < q_index; i++) + if (vb->deferred_q[i].seg_after_did_i == did_i) { + // move element i to after element q_index + // example start: [A, B, C, D] (B.seg_after_did_i=D). 
result: [A, C, D, B] + DeferredField df = vb->deferred_q[i]; + memmove (&vb->deferred_q[i], &vb->deferred_q[i+1], (q_index - i) * sizeof (DeferredField)); + vb->deferred_q[q_index] = df; + + // now, move any element that needs to be after i (B in the example) + vb_deferred_q_reorder (vb, df.did_i, q_index, depth+1); + } +} + +void vb_add_to_deferred_q (VBlockP vb, ContextP ctx, DeferredSeg seg, int16_t idx, + Did seg_after_did_i) // optional (DID_NONE if not) - ctx cannot be segged before seg_after_did_i (= another context that might be in the deferred queue) +{ + ASSERT (vb->deferred_q_len+1 < DEFERRED_Q_SZ, "%s: deferred queue is full (deferred_q_len=%u) when adding %s", VB_NAME, vb->deferred_q_len, ctx->tag_name); + ASSERT (idx >= 0, "Invalid idx=%d when adding %s", idx, ctx->tag_name); + + vb->deferred_q[vb->deferred_q_len++] = (DeferredField){ .did_i=ctx->did_i, .seg=seg, .idx=idx, .seg_after_did_i = seg_after_did_i }; + + // change order of segging if needed + vb_deferred_q_reorder (vb, ctx->did_i, vb->deferred_q_len-1, 1); +} + +void vb_display_deferred_q (VBlockP vb, rom func) +{ + if (!vb->deferred_q_len) return; + + iprintf ("%s: %s Deferred seg queue: ", func, LN_NAME); + + for (int i=0; i < vb->deferred_q_len; i++) + iprintf ("%s ", CTX(vb->deferred_q[i].did_i)->tag_name); + + iprint0 ("\n"); +} diff --git a/src/vblock.h b/src/vblock.h index 8d7279f1..7aa3af82 100644 --- a/src/vblock.h +++ b/src/vblock.h @@ -76,7 +76,7 @@ typedef struct { \ /* tracking execution */\ uint64_t vb_position_txt_file;/* ZIP/PIZ: position of this VB's data in the plain text file (without source compression): ZIP: as read before any ZIP-side modifications ; PIZ: as reconstructed with all modifications */\ - uint64_t vb_bgz_i; /* ZIP: index into txt_file->bgzf_isizes of the first BGZF/GZIL block of this VB */ \ + uint64_t vb_mgzip_i; /* ZIP: index into txt_file->mgzip_isizes of the first MGZIP block of this VB */ \ int32_t recon_size; /* ZIP: actual size of txt if this VB is 
reconstructed in PRIMARY coordinates (inc. as ##primary_only in --luft) */\ /* PIZ: expected reconstruction size in the coordinates of reconstruction */\ int32_t txt_size; /* ZIP: original size of of text data read from the file */ \ @@ -127,7 +127,7 @@ typedef struct { \ /* bgzf - for handling bgzf-compressed files */ \ void *gzip_compressor; /* Handle into libdeflate compressor or decompressor, or zlib's z_stream. Pointer to codec_bufs[].data */ \ - Buffer gz_blocks; /* ZIP: an array of GzBlockZip tracking the decompression of bgzf/gzil blocks in scratch into txt_data. */\ + Buffer gz_blocks; /* ZIP: an array of GzBlockZip tracking the decompression of bgzf/il1m blocks in scratch into txt_data. */\ /* PIZ: an array of BgzfBlockPiz */ \ \ /* random access, chrom, pos */ \ @@ -157,7 +157,8 @@ typedef struct { Buffer reread_prescription; /* ZIP SAM/BAM DEPN: list of lines to be re-read at seg initialize */\ Buffer optimized_txt_data; /* ZIP: --optimized: txt_data being re-written, if it cannot be re-written in place */ \ }; \ - Buffer txt_data; /* ZIP: txt_data as read from disk - either the txt header (in evb) or the VB data lines PIZ: reconstructed data */\ + Buffer txt_data; /* ZIP: txt_data as read from disk and uncompressed - either the txt header (in evb) or the VB data lines PIZ: reconstructed data */\ + Buffer comp_txt_data; /* ZIP/PIZ: source-compressed data as read/written from/to disk */ \ Buffer z_section_headers; /* PIZ and Pair-1 reading in ZIP-Fastq: an array of unsigned offsets of section headers within z_data */\ Buffer scratch; /* helper buffer: used by many functions. before usage, assert that its free, and buf_free after. 
*/\ int16_t z_next_header_i; /* next header of this VB to be encrypted or decrypted */\ diff --git a/src/vcf_header.c b/src/vcf_header.c index 8a69bcac..4b46d586 100644 --- a/src/vcf_header.c +++ b/src/vcf_header.c @@ -116,7 +116,7 @@ static bool vcf_header_get_last_line_cb (STRp(line), void *unused1, void *unused static unsigned vcf_header_get_last_line (BufferP txt_header, char **line_p) { int64_t line_len; - char *line = txtfile_foreach_line (txt_header, true, vcf_header_get_last_line_cb, 0, 0, 0, &line_len); + char *line = buf_foreach_line (txt_header, true, vcf_header_get_last_line_cb, 0, 0, 0, &line_len); if (line_p) *line_p = line; return is_field_name_line (STRa(line)) ? (unsigned)line_len : 0; @@ -279,14 +279,23 @@ static bool vcf_header_zip_parse_one_line (STRp(line), void *unused1, void *unus vcf_header_zip_create_ctx (STRa(line), is_info); // keywords that designate programs - #define ISPROG(s) \ + #define ISPROG(s) \ else if (LINEIS (s)) stats_add_one_program (&line[STRLEN(s)], line_len-STRLEN(s)-1) // without the \n + #define ISGATK(s,sep) /* start after ## and terminate string at sep */\ + else if (LINEIS (s)) ({ \ + char str[100]; \ + snprintf (str, 100, "GATK.%.*s", MIN_(30, (int)strcspn (&line[STRLEN(s)], sep " \t\n\r")), &line[STRLEN(s)]); \ + stats_add_one_program (str, strlen (str)); \ + }) + // identify programs ISPROG("##source="); ISPROG("##imputation="); ISPROG("##phasing="); ISPROG("##annotator="); + ISGATK("##GATKCommandLine.", "="); // e.g. 
##GATKCommandLine.UnifiedGenotyper=name, txt_header, true)) return false; // samples are different than a previous concatented file // populate contigs, identify programs, identify float fields for optimization - txtfile_foreach_line (txt_header, false, vcf_header_zip_parse_one_line, 0, 0, 0, 0); + buf_foreach_line (txt_header, false, vcf_header_zip_parse_one_line, 0, 0, 0, 0); SAFE_NULB (*txt_header); #define IF_IN_SOURCE(signature, segcf) if (stats_is_in_programs (signature)) segconf.segcf = true @@ -324,6 +333,7 @@ static bool vcf_inspect_txt_header_zip (BufferP txt_header) IF_IN_SOURCE ("freeBayes", vcf_is_freebayes); IF_IN_HEADER ("GenotypeGVCFs", vcf_is_gatk_gvcf, "GenotypeGVCFs"); IF_IN_HEADER ("CombineGVCFs", vcf_is_gatk_gvcf, "CombineGVCFs"); + IF_IN_HEADER ("EMIT_ALL_SITES", vcf_is_gvcf, ""); // GATK UnifiedGenotyper GVCF (obsolete tool, no RGQ, so set as vcf_is_gvcf, not vcf_is_gatk_gvcf) if (segconf.vcf_is_gatk_gvcf) segconf.vcf_is_gvcf = true; if (segconf.vcf_is_isaac) IF_IN_HEADER ("gvcf", vcf_is_gvcf, ""); IF_IN_HEADER ("beagle", vcf_is_beagle, "beagle"); diff --git a/src/version.c b/src/version.c index 3e3db9f1..392160e3 100644 --- a/src/version.c +++ b/src/version.c @@ -24,6 +24,28 @@ static pthread_t thread_id; static char redirect_url[128]; static StreamP github_stream = NULL; +// we are running in development - consider version to be next minor version +bool version_is_devel (void) +{ + rom exe = arch_get_executable().s; + int exe_len = strlen (exe); + + static rom devel_paths[] = { + #ifdef _WIN32 + "C:\\Users\\divon\\genozip\\", + #elif defined __linux__ + "/mnt/c/Users/divon/", + #elif defined __APPLE__ + "/Users/divon/genozip/" + #endif + }; + + for (int i=0; i < ARRAY_LEN(devel_paths); i++) + if (str_isprefix_(STRa(exe), devel_paths[i], strlen(devel_paths[i]))) return true; + + return false; +} + // version of the genozip executable running int code_version_major (void) { @@ -31,9 +53,17 @@ int code_version_major (void) } int 
code_version_minor (void) +{ + return atoi (strrchr (GENOZIP_CODE_VERSION, '.') + 1) + + version_is_devel(); // In development - we are running one minor version higher than GENOZIP_CODE_VERSION: needed so VER2() works in PIZ +} + +StrText code_version (void) { - rom ver = GENOZIP_CODE_VERSION; - return atoi (strrchr (ver, '.') + 1); + StrText s; + snprintf (s.s, sizeof(s), "%u.0.%u", code_version_major(), code_version_minor()); + + return s; } StrText version_str (void) @@ -41,11 +71,11 @@ StrText version_str (void) StrText s={}; if (IS_ZIP || !z_file) - snprintf (s.s, sizeof (s.s), "version=%s", GENOZIP_CODE_VERSION); + snprintf (s.s, sizeof (s.s), "version=%.16s", code_version().s); else if (!VER2(15,28)) - snprintf (s.s, sizeof (s.s), "code_version=%s file_version=%u", GENOZIP_CODE_VERSION, z_file->genozip_version); + snprintf (s.s, sizeof (s.s), "code_version=%.16s file_version=%u", code_version().s, z_file->genozip_version); else - snprintf (s.s, sizeof (s.s), "code_version=%s file_version=%u.0.%u", GENOZIP_CODE_VERSION, z_file->genozip_version, z_file->genozip_minor_ver); + snprintf (s.s, sizeof (s.s), "code_version=%.16s file_version=%u.0.%u", code_version().s, z_file->genozip_version, z_file->genozip_minor_ver); return s; } @@ -71,11 +101,10 @@ static void *version_background_test_for_newer_do (void *unused) if (flag.debug_upgrade) iprintf ("\ndebug-upgrade: latest_version=%s\n", latest_version); - char copy[100]; - strcpy (copy, GENOZIP_CODE_VERSION); // bc str_split_ints doesn't work on string literals + StrText code_ver = code_version(); str_split_ints (latest_version, strlen (latest_version), 3, '.', latest, true); - str_split_ints (copy, strlen (copy), 3, '.', this, true); + str_split_ints (code_ver.s, strlen (code_ver.s), 3, '.', this, true); if (!flag.debug_latest && n_latests == 3 && n_thiss == 3 && latests[0] == thiss[0] && latests[2] <= thiss[2]) latest_version = NULL; // same or newer than current version @@ -237,7 +266,7 @@ void 
version_print_notice_if_has_newer (void) // case: Genozip finished its work while thread is still running - kill it if (__atomic_load_n (&thread_running, __ATOMIC_ACQUIRE)) { pthread_cancel (thread_id); - pthread_join (thread_id, NULL); // wait for cancelation to complete + PTHREAD_JOIN (thread_id, "version_background_test_for_newer_do"); // wait for cancelation to complete if (flag.debug_upgrade) iprint0 ("debug-upgrade: upgrade thread canceled\n"); @@ -254,7 +283,7 @@ void version_print_notice_if_has_newer (void) iprint0 ("debug-upgrade: upgrade thread completed\n"); iprintf ("\nA newer & better version of Genozip is available - version %s. You are currently running version %s\n", - latest_version, GENOZIP_CODE_VERSION); + latest_version, code_version().s); if (is_info_stream_terminal) { #ifdef _WIN32 diff --git a/src/version.h b/src/version.h index d6038152..95f471d1 100644 --- a/src/version.h +++ b/src/version.h @@ -1,7 +1,9 @@ -#define GENOZIP_CODE_VERSION "15.0.62" +#define GENOZIP_CODE_VERSION "15.0.63" extern int code_version_major (void); extern int code_version_minor (void); +extern bool version_is_devel (void); +extern StrText code_version (void); extern StrText version_str (void); extern void version_background_test_for_newer (void); extern void version_print_notice_if_has_newer (void); diff --git a/src/writer.c b/src/writer.c index d2dfce40..3254316f 100644 --- a/src/writer.c +++ b/src/writer.c @@ -18,7 +18,7 @@ #include "random_access.h" #include "writer.h" #include "zfile.h" -#include "bgzf.h" +#include "mgzip.h" #include "mutex.h" #include "codec.h" #include "endianness.h" @@ -308,9 +308,9 @@ static void writer_init_vb_info (void) ASSERT (v->pair_vb_i >= 1 && v->pair_vb_i < vb_info.len, "v->pair_vb_i=%d ∉ [1,%d]", v->pair_vb_i, vb_info.len32-1); // verify that this VB and its pair have the same number of lines (test when initiazing the second one) - uint32_t pair_num_lines = VBINFO(v->pair_vb_i)->num_lines; - ASSERT (v->num_lines == pair_num_lines, 
"vb=%u has %u lines, but vb=%u, its pair, has %u lines", - vb_i, v->num_lines, v->pair_vb_i, pair_num_lines); + uint32_t R1_num_lines = VBINFO(v->pair_vb_i)->num_lines; + ASSERT (v->num_lines == R1_num_lines, "vb=%u has %u lines, but vb=%u, its pair, has %u lines", + vb_i, v->num_lines, v->pair_vb_i, R1_num_lines); } // conditions this VB should not be read or reconstructed @@ -701,7 +701,7 @@ static void writer_update_section_list (void) else if (Z_DT(FASTQ) && flag.one_component == FQ_COMP_R2+1) // flag.one_component is comp_i+1 writer_add_entire_component_section_list (new_list, FQ_COMP_R1); - // add SEC_BGZF of all components that have it + // add SEC_MGZIP of all components that have it sections_new_list_add_bgzf (new_list); // copy all global sections from current section list to new one @@ -1032,7 +1032,7 @@ static bool writer_output_one_processed_bgzf (Dispatcher dispatcher, bool blocki VBlockP bgzf_vb = dispatcher_get_processed_vb (dispatcher, NULL, blocking); if (bgzf_vb) { - writer_write (&bgzf_vb->z_data, bgzf_vb->txt_data.len); + writer_write (&bgzf_vb->comp_txt_data, bgzf_vb->txt_data.len); dispatcher_recycle_vbs (dispatcher, true); // also release VB } @@ -1094,8 +1094,8 @@ static void writer_write_line_range (VbInfo *v, uint32_t start_line, uint32_t nu ASSERTNOTNULL (v); ASSERTNOTNULL (v->vb); - ASSERT (!v->vb->scratch.len, "expecting vb=%s/%u data to be BGZF-compressed by writer at flush, but it is already compressed by reconstructor: txt_file->codec=%s compressed.len=%"PRIu64" txt_data.len=%"PRIu64, - comp_name (v->vb->comp_i), v->vb->vblock_i, codec_name (txt_file->codec), v->vb->scratch.len, v->vb->txt_data.len); + ASSERT (!v->vb->scratch.len, "expecting vb=%s/%u data to be BGZF-compressed by writer at flush, but it is already compressed by reconstructor: txt_file->effective_codec=%s compressed.len=%"PRIu64" txt_data.len=%"PRIu64, + comp_name (v->vb->comp_i), v->vb->vblock_i, codec_name (txt_file->effective_codec), v->vb->scratch.len, 
v->vb->txt_data.len); if (!v->vb->txt_data.len) return; // no data in the VB @@ -1236,7 +1236,7 @@ static void writer_main_loop (VBlockP wvb) // same as wvb global variable threads_set_writer_thread(); // if we need to BGZF-compress, we will dispatch the compression workload to compute threads - Dispatcher dispatcher = (!flag.no_writer && TXT_IS_BGZF && txt_file->bgzf_flags.library != BGZF_EXTERNAL_LIB) ? + Dispatcher dispatcher = (!flag.no_writer && TXT_IS_BGZF && txt_file->mgzip_flags.library != BGZF_EXTERNAL_LIB) ? dispatcher_init ("bgzf", NULL, POOL_BGZF, writer_get_max_bgzf_threads(), 0, false, false, NULL, 0, NULL) : NULL; // normally, we digest in the compute thread but in case gencomp lines can be inserted into the vb we digest here. @@ -1336,8 +1336,8 @@ static void writer_main_loop (VBlockP wvb) // same as wvb global variable while (!vb_pool_is_empty (POOL_BGZF)) writer_output_one_processed_bgzf (dispatcher, true); - bgzf_write_finalize(); // write final data to wvb->z_data - writer_write (&wvb->z_data, 0); + bgzf_write_finalize(); // write final data to wvb->comp_txt_data + writer_write (&wvb->comp_txt_data, 0); dispatcher_finish (&dispatcher, NULL, false, false); } diff --git a/src/zfile.c b/src/zfile.c index 177bff32..c8f840a2 100644 --- a/src/zfile.c +++ b/src/zfile.c @@ -203,8 +203,8 @@ uint32_t zfile_compress_b250_data (VBlockP vb, ContextP ctx) struct FlagsCtx flags = ctx->flags; // make a copy if (VB_DT(FASTQ)) - flags.paired = (flag.pair == PAIR_R1 && fastq_zip_use_pair_identical (ctx->dict_id)) || // "paired" flag in R1 means: "In R2, reconstruct R1 data IFF R2 data is absent" (v15) - (flag.pair == PAIR_R2 && fastq_zip_use_pair_assisted (ctx->dict_id, SEC_B250)); // "paired" flag in R2 means: "Reconstruction of R2 requires R2 data as well as R1 data" + flags.paired = (IS_R1 && fastq_zip_use_pair_identical (ctx->dict_id)) || // "paired" flag in R1 means: "In R2, reconstruct R1 data IFF R2 data is absent" (v15) + (IS_R2 && 
fastq_zip_use_pair_assisted (ctx->dict_id, SEC_B250)); // "paired" flag in R2 means: "Reconstruction of R2 requires R2 data as well as R1 data" SectionHeaderCtx header = (SectionHeaderCtx) { .magic = BGEN32 (GENOZIP_MAGIC), @@ -234,8 +234,8 @@ uint32_t zfile_compress_local_data (VBlockP vb, ContextP ctx, uint32_t sample_si struct FlagsCtx flags = ctx->flags; // make a copy if (VB_DT(FASTQ)) - flags.paired = (flag.pair == PAIR_R1 && fastq_zip_use_pair_identical (ctx->dict_id)) || // "paired" flag in R1 means: "Load R1 data in R2, if R2 data is absent" (v15) - (flag.pair == PAIR_R2 && fastq_zip_use_pair_assisted (ctx->dict_id, SEC_LOCAL)); // "paired" flag in R2 means: "Reconstruction of R2 requires R2 data as well as R1 data" + flags.paired = (IS_R1 && fastq_zip_use_pair_identical (ctx->dict_id)) || // "paired" flag in R1 means: "Load R1 data in R2, if R2 data is absent" (v15) + (IS_R2 && fastq_zip_use_pair_assisted (ctx->dict_id, SEC_LOCAL)); // "paired" flag in R2 means: "Reconstruction of R2 requires R2 data as well as R1 data" uint32_t uncompressed_len = ctx->local.len32 * lt_width(ctx); @@ -784,14 +784,14 @@ uint64_t zfile_read_genozip_header_get_offset (bool as_is) z_file->genozip_minor_ver = top.genozip_minor_ver; // 0 before 15.0.28 z_file->data_type = BGEN16 (top.data_type); - if (Z_DT(BCF)) { z_file->data_type = DT_VCF; z_file->source_codec = CODEC_BCF; } // Z_DT is always VCF, not BCF - else if (Z_DT(CRAM)) { z_file->data_type = DT_SAM; z_file->source_codec = CODEC_CRAM; } // Z_DT is always SAM, not CRAM or BAM + if (Z_DT(BCF)) { z_file->data_type = DT_VCF; z_file->src_codec = CODEC_BCF; } // Z_DT is always VCF, not BCF + else if (Z_DT(CRAM)) { z_file->data_type = DT_SAM; z_file->src_codec = CODEC_CRAM; } // Z_DT is always SAM, not CRAM or BAM // check that file version is at most this executable version, except for reference file for which only major version is tested ASSINP (z_file->genozip_version < code_version_major() || (z_file->genozip_version == 
code_version_major() && (z_file->genozip_minor_ver <= code_version_minor() || Z_DT(REF) || (is_genocat && flag.show_stats))), "Error: %s cannot be opened because it was compressed with genozip version %u.0.%u which is newer than the version running - %s.\n%s", - z_name, z_file->genozip_version, z_file->genozip_minor_ver, GENOZIP_CODE_VERSION, genozip_update_msg()); + z_name, z_file->genozip_version, z_file->genozip_minor_ver, code_version().s, genozip_update_msg()); bool metadata_only = is_genocat && (flag.show_stats || flag.show_gheader || flag.show_headers || flag.show_aliases || flag.show_dict); @@ -836,7 +836,7 @@ bool zfile_read_genozip_header (SectionHeaderGenozipHeaderP out_header, FailType DataType data_type = (DataType)(BGEN16 (header->data_type)); // Note: BCF/CRAM files have DT_BCF/DT_CRAM in the GenozipHeader, but in the PIZ code we - // expect data_type=VCF/SAM with z_file->source_codec set to CODEC_BCF/CODEC_CRAM. + // expect data_type=VCF/SAM with z_file->src_codec set to CODEC_BCF/CODEC_CRAM. 
if (data_type == DT_BCF) data_type = DT_VCF; else if (data_type == DT_CRAM) data_type = DT_SAM; diff --git a/src/zip.c b/src/zip.c index 0ec4944e..fb3262e1 100644 --- a/src/zip.c +++ b/src/zip.c @@ -19,7 +19,7 @@ #include "progress.h" #include "stats.h" #include "compressor.h" -#include "bgzf.h" +#include "mgzip.h" #include "txtheader.h" #include "threads.h" #include "contigs.h" @@ -487,9 +487,9 @@ static void zip_compress_one_vb (VBlockP vb) !(segconf.sag_type && vb->comp_i == SAM_COMP_MAIN)) // except in MAIN of SAM/BAM gencomp - need to generate PRIM and DEPN VBs goto after_compress; - // if the txt file is compressed with BGZF/GZIL, we uncompress now, in the compute thread + // if the txt file is compressed with a MGZIP codec, we (usually) uncompress now, in the compute thread if (vb->txt_codec) - bgz_uncompress_vb (vb, vb->txt_codec); // some of the blocks might already have been decompressed while reading - we decompress the remaining + mgzip_uncompress_vb (vb, vb->txt_codec); // some of the blocks might already have been uncompressed while reading - we uncompress the remaining vb->txt_size = Ltxt; // this doesn't change with --optimize. @@ -579,11 +579,11 @@ static void zip_prepare_one_vb_for_dispatching (VBlockP vb) // and copy the data we need for this vb. note: we need to do this before txtfile_read_vblock as // we need the num_lines of the pair VB bool R1_data_exhausted = false; - if (flag.pair == PAIR_R2) { - uint32_t pair_vb_i = prev_file_first_vb_i + (vb->vblock_i-1 - prev_file_last_vb_i); + if (IS_R2) { + uint32_t R1_vb_i = prev_file_first_vb_i + (vb->vblock_i-1 - prev_file_last_vb_i); - if (pair_vb_i <= prev_file_last_vb_i) - fastq_read_pair_1_data (vb, pair_vb_i); // add the R1 sections z_data after the R2 sections + if (R1_vb_i <= prev_file_last_vb_i) + fastq_read_R1_data (vb, R1_vb_i); // add the R1 sections z_data after the R2 sections else R1_data_exhausted = true; // R1 data is already exhausted. This is ok if R2 data is exhausted too. 
} @@ -668,7 +668,7 @@ static void zip_complete_processing_one_vb (VBlockP vb) uint64_t zip_get_target_progress (void) { static uint64_t target_progress=0; - if ((Z_DT(FASTQ) && flag.pair != PAIR_R2) || // note: if 2nd of a FASTQ file pair - we leave the target as it was in the first file as seggable_size is not calculated for the 2nd file + if ((Z_DT(FASTQ) && !IS_R2) || // note: if 2nd of a FASTQ file pair - we leave the target as it was in the first file as seggable_size is not calculated for the 2nd file (flag.deep && flag.zip_comp_i <= SAM_COMP_FQ00) || (!flag.deep && !Z_DT(FASTQ))) { @@ -699,9 +699,7 @@ void zip_one_file (rom txt_basename, z_file->num_components = MAX_(z_file->num_components, flag.zip_comp_i+1); // may increase further with generated components (in zip_update_txt_counters()) evb->z_data.len = 0; evb->z_next_header_i = 0; - - txtfile_zip_finalize_codecs(); - + // we calculate digest for each component seperately, stored in SectionHeaderTxtHeader (always 0 for generated components, or if modified) if (gencomp_comp_eligible_for_digest(NULL)) // if generated component - keep digest to display in progress after the last component z_file->digest_ctx = DIGEST_CONTEXT_NONE; @@ -731,12 +729,17 @@ void zip_one_file (rom txt_basename, DT_FUNC (txt_file, zip_after_segconf)(); - uint64_t target_progress = zip_get_target_progress(); + txtfile_zip_finalize_codecs(); + + uint64_t target_progress = zip_get_target_progress(); // estimate based on segconf data dispatcher = dispatcher_fan_out_task ( ZIP_TASK_NAME, txt_basename, target_progress, // target progress: 1 for each read, compute, write - target_progress ? NULL : txt_file->is_remote ? "Downloading & compressing..." : "Compressing...", + target_progress ? NULL + : txt_file->is_remote ? "Downloading & compressing..." + : flag.skip_segconf ? "Compressing (skipped segconf)..." 
// no progress data if segconf was skipped + : "Compressing...", !flag.make_reference, // allow callbacks to zip_complete_processing_one_vb not in order of VBs (not allowed for make-reference as contigs need to be in consistent order) false, // not test mode flag.xthreads, prev_file_last_vb_i, 5000, false, @@ -756,15 +759,14 @@ void zip_one_file (rom txt_basename, WARN_IF (appending, "%s was being appended by an external process while compression was in progress", txt_name); + bgzf_finalize_discovery(); + // verify that the entire data is either decompressed or truncated away (doesn't work for external decompressors) - ASSERT (txt_file->disk_gz_uncomp_or_trunc == txt_file->disk_so_far || (!TXT_IS_BGZF && !TXT_IS_GZIL && !TXT_IS_GZ) || flag.has_head, - "Failed to process all source data: read %s bytes from disk, but decompressed %sonly %s bytes. txt_codec=%s", + ASSERT (txt_file->disk_gz_uncomp_or_trunc == txt_file->disk_so_far || !TXT_IS_GZIP || flag.has_head, + "Failed to process all source data: read %s bytes from disk, but decompressed %sonly %s bytes. src_codec=%s", str_int_commas (txt_file->disk_so_far).s, flag.truncate ? 
"or truncated " : "", str_int_commas (txt_file->disk_gz_uncomp_or_trunc).s, - txtfile_codec_name (z_file, flag.zip_comp_i).s); + txtfile_codec_name (z_file, flag.zip_comp_i, false).s); - if (TXT_IS_BGZF || TXT_IS_GZIL) - bgzf_finalize_discovery(); - zriter_wait_for_bg_writing(); // complete writing VBs before moving on dispatcher_calc_avg_compute_vbs (dispatcher); @@ -777,14 +779,14 @@ void zip_one_file (rom txt_basename, ASSERT0 (!flag.biopsy || biopsy_is_done(), "Biopsy request not complete - some VBs missing"); - // write the BGZF section containing BGZF block sizes, if this txt file is compressed with BGZF - if (TXT_IS_BGZF) - bgzf_compress_bgzf_section(); + // write the MGZIP section containing MGZIP block sizes, if this txt file is compressed with a MGZIP codec + if (TXT_IS_MGZIP) + mgzip_compress_mgzip_section(); // if this a non-bound file, or the last component of a bound file - write the genozip header, random access and dictionaries finish: z_file->txt_file_disk_sizes[flag.zip_comp_i] = txt_file->disk_size ? txt_file->disk_size // actual file size on disk, if we know it (we don't if its a remote or stdin file) - : (int64_t)txt_file->disk_so_far + (txt_file->codec==CODEC_BGZF ? 
BGZF_EOF_LEN : 0); // data (plain, BGZF, GZ or BZ2) read from the file descriptor (we won't have correct src data here if reading through an external decompressor - but luckily txt_file->disk_size will capture that case) + : (int64_t)txt_file->disk_so_far; z_file->txt_file_disk_sizes_sum += z_file->txt_file_disk_sizes[flag.zip_comp_i]; // (re-)index sections after adding this txt_file diff --git a/src/zriter.c b/src/zriter.c index d7c98ab3..da1d0e5b 100644 --- a/src/zriter.c +++ b/src/zriter.c @@ -16,6 +16,7 @@ #include "threads.h" #include "zriter.h" #include "arch.h" +#include "tar.h" static uint32_t zriter_thread_num = 0; // for --show-threads @@ -26,6 +27,18 @@ typedef struct { bool completed; } ZriterThread, *ZriterThreadP; +static int64_t zriter_tell (void) +{ + int64_t offset = ftello64 ((FILE *)z_file->file); + ASSERT (offset >= 0 , "ftello64 failed for %s (FILE*=%p remote=%s redirected=%s): %s", + z_file->name, z_file->file, TF(z_file->is_remote), TF(z_file->redirected), strerror (errno)); + + // in a z_file that is being tarred, update the offset to the beginning of the file data in the tar file + offset -= tar_file_offset(); // 0 if not using tar + + return offset; +} + void zriter_flush (void) { if (flag.zip_no_z_file) return; @@ -40,7 +53,7 @@ void zriter_wait_for_bg_writing (void) for_buf (ZriterThreadP, zt_p, z_file->zriter_threads) { if (flag.show_threads) iprintf ("zriter: JOINING: thread_id=%"PRIu64"\n", (uint64_t)(*zt_p)->thread_id); - pthread_join ((*zt_p)->thread_id, NULL); // blocking + PTHREAD_JOIN ((*zt_p)->thread_id, "zriter_thread_entry"); // blocking if (flag.show_threads) iprintf ("zriter: JOINED: thread_id=%"PRIu64"\n", (uint64_t)(*zt_p)->thread_id); buf_destroy ((*zt_p)->data); @@ -74,7 +87,7 @@ static void *zriter_thread_entry (void *zt_) z_file->disk_so_far += zt->data.len; // length of GENOZIP data writen to disk (protected by zriter_mutex) // sanity - uint64_t actual_disk_so_far = file_tell (z_file, false); + uint64_t 
actual_disk_so_far = zriter_tell(); ASSERT (actual_disk_so_far == z_file->disk_so_far, "Expecting actual_disk_so_far=%"PRIu64" == z_file->disk_so_far=%"PRIu64, actual_disk_so_far, z_file->disk_so_far); @@ -99,7 +112,7 @@ static void zriter_write_background (BufferP data, BufferP section_list) // free resources if (flag.show_threads) iprintf ("zriter: JOINING: thread_id=%"PRIu64"\n", (uint64_t)(*zt_p)->thread_id); - pthread_join ((*zt_p)->thread_id, NULL); + PTHREAD_JOIN ((*zt_p)->thread_id, "zriter_thread_entry"); if (flag.show_threads) iprintf ("zriter: JOINED: thread_id=%"PRIu64"\n", (uint64_t)(*zt_p)->thread_id); buf_destroy ((*zt_p)->data); @@ -165,7 +178,7 @@ static void zriter_write_foreground (BufferP data, BufferP section_list, int64_t z_file->disk_so_far += data->len; // length of GENOZIP data writen to disk (pr) // sanity - uint64_t actual_disk_so_far = file_tell (z_file, false); + uint64_t actual_disk_so_far = zriter_tell(); ASSERT (actual_disk_so_far == z_file->disk_so_far, "Expecting actual_disk_so_far=%"PRIu64" == z_file->disk_so_far=%"PRIu64, actual_disk_so_far, z_file->disk_so_far); }