diff --git a/.gitignore b/.gitignore index 8e5cd9d..1de2a68 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ fastqz +fapack +fapacks diff --git a/Makefile b/Makefile index cc5d45c..d1cf0a3 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,7 @@ -default: fastqz +default: fastqz fapack fapacks fastqz: fastqz15.cpp libzpaq.3.pod libzpaq.cpp libzpaq.h g++ -O3 -msse2 -s -lpthread fastqz15.cpp libzpaq.cpp -o $@ + +clean: + - rm -f fastqz fapack fapacks diff --git a/fapack.cpp b/fapack.cpp new file mode 100644 index 0000000..487e03c --- /dev/null +++ b/fapack.cpp @@ -0,0 +1,71 @@ +/* fapack.cpp - pack FASTA 4 bases per byte + + Copyright (C) 2012, Matt Mahoney, Dell Inc. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. 
+ +This program produces packed DNA sequences from FASTA files. +The output may be used as a reference genome for the program fastqz. + +To use: fapack output *.fa + +where *.fa is a list of FASTA input files. For each file, +input lines starting with ">" are ignored. For all other lines, +the letters A,C,G,T are packed MSB first with A=00,C=01,G=10,T=11. +All other characters are ignored. The last partial byte is discarded. + +To compile: g++ -O3 fapack.cpp -o fapack +*/ + +#include +#include + +int main(int argc, char** argv) { + if (argc<3) + printf("To pack FASTA files: fapack output *.fa\n"), exit(1); + FILE *out=fopen(argv[1], "wb"); + int b=1, c; + for (int i=2; i') dna=false; + else if (c==10) dna=true; + if (dna) { + if (c=='A') b=b*4; + if (c=='C') b=b*4+1; + if (c=='G') b=b*4+2; + if (c=='T') b=b*4+3; + if (b>=256) putc(b&255, out), b=1; + } + } + if (in) fclose(in); + } + fclose(out); + return 0; +} + + diff --git a/fapacks.cpp b/fapacks.cpp new file mode 100644 index 0000000..1a1e09a --- /dev/null +++ b/fapacks.cpp @@ -0,0 +1,65 @@ +/* fapacks.cpp - pack FASTA 4 bases per byte + includes lowercase a,c,g,t + + Copyright (C) 2012, Matt Mahoney, Dell Inc. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + +This program produces packed DNA sequences from FASTA files. +The output may be used as a reference genome for the program fastqz. +*/ + +#include +#include +#include + +int main(int argc, char** argv) { + if (argc<3) + printf("To pack FASTA files: fapack output *.fa\n"), exit(1); + FILE *out=fopen(argv[1], "wb"); + int b=1, c; + for (int i=2; i') dna=false; + else if (c==10) dna=true; + if (islower(c)) c=toupper(c); + if (dna) { + if (c=='A') b=b*4; + if (c=='C') b=b*4+1; + if (c=='G') b=b*4+2; + if (c=='T') b=b*4+3; + if (b>=256) putc(b&255, out), b=1; + } + } + if (in) fclose(in); + } + fclose(out); + return 0; +} + + diff --git a/fastqz15.cpp b/fastqz15.cpp index 2261556..4808f6e 100644 --- a/fastqz15.cpp +++ b/fastqz15.cpp @@ -1,960 +1,960 @@ -/* fastqz v1.5 - Sanger FASTQ compressor - - Copyright (C) 2012, Matt Mahoney, Dell Inc. - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - POSSIBILITY OF SUCH DAMAGE. - -TO COMPILE - -g++ -O3 -msse2 -s -lpthread fastqz.cpp libzpaq.cpp -o fastqz - -You need libzpaq.cpp and libzpaq.h from either -https://sourceforge.net/projects/zpaq/ or -http://mattmahoney.net/zpaq/ -libzpaq is public domain. - -Also, to use in Windows you need to install Pthreads-Win32 from -http://sourceware.org/pthreads-win32/ -In particular you need pthread.h to compile and pthreadGC2.dll -in your PATH to run. Pthreads-Win32 is licensed under LGPL. - -libzpaq uses Just-In-Time (JIT) optimization of ZPAQL code on -x86 32 or 64 bit processors. To run on other processors, compile -with -DNOJIT to disable this feature. It will still work but run slower. - - -USAGE - -fx is a compressor for Sanger FASTQ files. It has two compression modes, -fast and slow. - -Usage: fastqz command input output [reference] -Commands: - c - compress input to output.fx?.zpaq (3 files, ? = {h,b,q}) - d - decompress input.fx?.zpaq to output - e - encode input to output.fx? without zpaq compression (faster) - f - decode input.fx? to output - cQ, eQ - quantize quality values to 35 plus a multiple of Q, rounding - down. Default is c1 or e1. - -Commands c and e compress. c compresses smaller but e compresses faster. 
-The corresponding decompression commands are d and f respectively. -You need 1.5 GB memory to compress with c or decompress with d. -They also both produce temporary files taking the same space as the -output of command e. The e and f commands don't use significant -memory and don't produce temporary files. - -Using a quantization like c2 or e4 is lossy but improves compression -when exact quality values are not needed. Values are rounded down. - -Compression produces 3 files. Command e produces files named -output.fxh, output.fxb, output.fxq. Command c produces files named -output.fxh.zpaq, output.fxb.zpaq, output.fxq.zpaq -When decompressing, omit the .fx? or .fx?.zpaq extension -on the input file names. The extensions will be assumed. - -Input for compression is expected to be a Sanger FASTQ file. -The file consists of "reads" from a DNA sequencing machine. Each -read has the following format: - - @header - ACGTN (base calls, length n) - + - !..I# (quality scores, length n, ASCII 34..73 for A,C,G,T, 33 for N) - -Maximum line length is 4095. Lines must be terminated by LF -(ASCII 10) only (no CR). All base and quality lines must have -the same length (read length = n) throughout the file. -Files not in this format are rejected. - -If [reference] is present, then it is the file name of a reference -genome. The same reference must be present for decompression. -The reference genome consists of a sequence of bases packed 4 -per byte in MSB to LSB order with ACGT=0..3. You can use the -program fapack to convert FASTA files into this format. -The reference genome cannot be bigger than 1 GB (2^32 bases). -You need 1.5 GB memory to encode and 1 GB to decode. -A fourth file will be produced: output.fxa.zpaq or output.fxa -containing compressed alignments. - - -COMPRESSION FORMAT - -Command "c" and "e" both split the input into 3 or 4 parts and -compress them as described below. Command "c" further compresses -each of the 3 or 4 files using a different ZPAQ model. 
- -Headers (.fxh) are coded in the form (j,k,len,xxx...,0) which means -go to column j-1 (first column is 0) in the previous header and -add k-1 to the decimal number ending there. If k=1, then skip -this step. Then copy the first len characters of the modified previous -header, then output xxx, and finally a linefeed (ASCII 10). Save this -output, minus the linefeed. - -The first 2 bytes of the .fxh file encodes the read length, n, -MSB first (e.g. 0,100 if all base and quality lines have length 100). - -Base calls (.fxb) are encoded first by deleting all N's. These can be -restored because their location is indicated by a quality score -of 33. Then the remaining bases are encoded in self terminating -base 4 with A=1, T=2, C=3, G=4 allowing 3 or 4 bases per byte. -For example, "TACT" is coded as 2*64 + 1*16 + 3*4 + 2*1 = 158. - -If a reference is given, then a list of matches are stored in a .fxa -file. The format is: - - (m1+1+128*dir,m2+1,m3+1,m4+1,p3,p2,p1,p0) to encode a match - (0) to encode no match - -where p3..p0 is a 32 bit pointer (MSB first) -into the reference genome after expanding to 1 base per element -(with 0..3=ACGT) and padding the ends with 16384 zeros (or A). -'dir' is 0 for a match in the forward direction or 1 for a -match in the reverse direction starting at the same point but -exchanging A with T and C with G. m1..m4 are the locations of -the first 4 diferences between the base sequence (after deleting -N's) and the reference, in the range 0..len-1 where len is the -length of the sequence with N's deleted. Thus, the bytes are -coded in the range 1..len, with bit 7 of the first byte set if -the match is reversed. The mismatches are in ascending order. -If there are less than 4 mismatches, then the remaining bytes -are coded as len+1. Thus, only reads up to 126 can be fully -matched. - -If a match is present, then only the corresponding mismatched bases, -plus any bases after m4 (except N), are written to the .fxb file. 
-If the first byte is 0, then there is no match and the entire -base string is written (except N). - -Quality scores are decoded as follows: q=1..72 decode as q+32 -(33..104). q=73..136 decode as a pair (q-73)%8+64, (q-73)/8+64, -both in the range 64..71. q=137..200 decode as the triple -(q-137)%4+68, (q-137)/4%4+68, (q-137)/16+68 in the range 68..71. -q=201..255 decodes as 71 repeated q-200 (1..55) times. q=0 -decodes by setting all remaining values to 35 and terminating -the sequence. The coding takes advantage of the high frequency -of q at or just below 71 that group early in the sequence, and -of sequences that end in runs of 35. - -Command "c" further compresses the output.fx? files -to output.fx?.zpaq files as defined by the ZPAQ level 2 standard -which can be found at http://mattmahoney.net/zpaq/ or -https://sourceforge.net/projects/zpaq/ - -ZPAQ is a configurable compression format based on the PAQ context -mixing algorithm with bit-wise prediction and arithmetic coding. -Context models are described in ZPAQL byte code, which is saved to -the compressed file and can be read by a generic ZPAQ decompressor. -Thus, a FASTQ file compressed with "fastqz c" could be decompressed -first with zpaq and then with "fastq f" as opposed to decompressing -with "fastq d". - -ZPAQL byte code describes an array of components and code to compute -contexts. Each component takes a context and possibly the predictions -of earlier components and outputs a new probability that the next -bit will be a 1. The output of the last component is used to arithmetic -encode or decode the next bit. After encoding or decoding, the bit -is used to update the models to reduce their prediction errors. - -Whole-byte contexts are computed on byte boundaries by code running on -a ZPAQL virtual machine. This program is executed once after modeling -each byte with that byte as input. The output is saved in an array -of 32-bit values which is available as input to the array of components. 
-These values are combined with the previously coded bits of the current -byte to form a complete context. - -A ZPAQ model is described by a config file. In this program, the -compiled byte code is fed to the model during compression, or read -from the compressed file header during decompression. The source code -for each model is given below, followed by an explanation of the code. -The command "zpaq -mfx? l" will generate the byte code used in this -program from the sources below named "fx?.cfg" (where ? is h,b,q,a). - -A config file has 3 sections: - - COMP - describes the array of modeling components. - HCOMP - ZPAQL code to compute contexts. - POST/PCOMP - ZPAQL code for post-processing. - -Post-processing is not used, so each file ends with POST 0 END. -Modeled bits are output directly. - -A ZPAQL virtual machine has 32-bit registers A,B,C,D, an array -of bytes M, an array of 32 bit unsigned integers H, a condition flag F, -and a 16 bit program counter. H is the context output to the model. -A is the input byte and accumulator for arithmetic and logical operations. -B and C are pointers into M. D points to H. *B, *C, *D refer -to the elements pointed to, modulo the array sizes. The sizes are -given by the first 2 parameters after COMP. 
- - -HEADER MODELING - -(fxh.cfg model to compress headers) -comp 3 8 0 0 5 (H has size 2^3, M has size 2^8) - 0 cm 20 128 (direct 20-bit context model with max count 128*4) - 1 cm 22 128 - 2 icm 18 (indirect context model with 2^(18+6) bit histories) - 3 icm 19 - 4 mix 13 0 4 24 255 (13 bit context, mix 0..0+4-1, rate 24, mask 255) -hcomp - *c=a c++ a== 0 if c=0 endif (save input in buffer M pointed to by C) - d=0 *d=0 b=c a=c hashd (context H[0] is a hash of column number) - a=*b hashd (combined with the byte above, saved in M) - b-- a=*b hashd (combined with the byte to the left (order 1)) - a=*d d++ *d=a b-- a=*b hashd (context H[1] as above but order 2) - a=*d d++ *d=a b-- a=*b hashd (context H[2] as above but order 3) - a=*d d++ *d=a b-- a=*b hashd (context H[3] as above put order 4) - d++ a=c a<<= 8 *d=a (context H[5] for mixer is just the column number) - halt -post 0 end (no post-processing) - -The headers are compressed using a mixture of 4 context models. -The first two are direct (CM: context -> bit prediction) -and 3 and 4 are indirect (ICM: context -> bit history -> prediction). -The context for the first model is the column number, the byte -above and the byte to the left. The next 3 add 1 to 3 -more bytes to the left as context, respectively. The four -bit predictions are mixed by weighted averaging in the logistic -domain (log p/(1-p)) and the weights adapted to reduce prediction -errors. The mixer weight vector is selected by a context consisting -of the column number and the previously coded bits of the -current byte. The resulting bits are arithmetic coded. - -In the code above, *C=A saves the input byte in M. C++ advances -to the next byte, which was saved from the previous line. -"A== 0 IF C=0 ENDIF" tests if the input is 0, marking the end of a -header line, and if so, resets the pointer C to the beginning of -the buffer. - -The next 3 lines set the context for component 0, pointed to by D. -HASHD computes the hash *D=(*D+A+512)*773. 
- -The next 3 lines set the contexts for components 1 through 3 by -copying the previous context hash and combining it with the next -byte back in the history buffer maintained in M and pointed to -by *B. - -The last line uses the low 5 bits of the column number (in C) -as part of the 13 bit context to the mixer. The low 8 bits are -left as zeros so that during modeling the bits from the partial -byte can be added. - - -BASE CALL MODELING - -(fxb.cfg model to compress base calls) -comp 3 3 0 0 7 (hh hm ph pm n) - 0 cm 9 255 (2 KB) - 1 cm 18 255 (1 MB) - 2 cm 25 255 (128 MB) - 3 icm 22 (256 MB) - 4 isse 23 3 (512 MB) - 5 match 26 28 (256 MB hash table, 256 MB buffer) - 6 mix 8 0 6 12 255 (order 0 mix of 0..0+6-1, rate 12, mask 255) -hcomp - c++ *c=a b=c a=0 (save in rotating buffer M) - d= 1 hash *d=a - b-- d++ hash *d=a - b-- d++ hash *d=a - b-- d++ hash *d=a - b-- d++ hash *d=a - halt -post - 0 -end - -Base calls are modeled using an order 0..5 mix. Orders 0, 1, and 2 -are direct, slow adapting (rate = error/count up to 255*4) context models. -Order 3 is indirect. Order 4 is indirect and chained to the order 3 -output, i.e. order 3 prediction is mixed with a constant 1 in the -logistic domain by a pair of adaptive weights selected by the -bit history indexed by the order 4 context hash. The order 5 -context is a match model which looks up the previous occurrence -of the context hash and predicts whatever bit followed. The -mixer context is bytewise order 0. - -The HASH instruction computes A=(A+*B+512)*773. 
- - -QUALITY MODELING - -(fxq.cfg model used to compress quality scores) -comp 2 12 0 0 4 - 0 cm 22 128 - 1 cm 22 128 - 2 cm 22 128 - 3 mix 14 0 3 12 255 -hcomp - c++ *c=a (store input in M pointed to by C) - a== 0 if c=0 endif (reset M at newline) - d=0 b=c hash *d=a a=c a>>= 3 hashd - d++ a=0 b-- hash *d=a - b-- a=*b a>>= 5 hashd - d++ *d=0 b-- a=*b hashd - b-- a=*b a>>= 4 hashd - d++ a=*c a>>= 3 *d=0 hashd - a=c a> 3 if a>>= 5 a+= 4 endif hashd - halt -post 0 end - -Quality scores use a mix of 3 direct context models. The first -uses the previous byte and the column number excluding the -low 3 bits as the context hash. The second model uses the second byte -and the high 3 bits of the third byte back as the context hash. -The third model uses the 4'th byte and the high 4 bits of -the 5'th byte back as context hash. The mixer uses a 14 bit -context consisting of the current partial byte and the column -number with the high 5 bits dropped for column numbers above 3. - - -ALIGNMENT MODELING - -(fxa.cfg to model reference matches) -comp 0 0 0 0 1 - 0 cm 20 255 -hcomp - c++ b=a - a== 0 if a=c a== 1 if c=0 endif endif - a=c a> 7 if c=0 endif - a< 6 if - a=b a>>= 2 a<<= 5 a+=c - else - a=c - endif - a<<= 9 *d=a - halt -post 0 end - -Reference matches (if present) use a stationary order 0 model with -the parse state (0..7) as context. States 0..3 expect a mismatch -byte and 4..7 expect a pointer byte. States 0..5 also use -the previous byte as context with the low 2 bits discarded. - -The ZPAQ archives are each saved as a single segment in a single block -without a locator tag, filename, comment, or checksum. No post-processing -is used. The ZPAQL code used for each of the 4 files is as follows: - -Each of the 3 or 4 ZPAQ models is compressed or decompressed in parallel -in separate threads from or to temporary files, which are deleted -when done. - -c: input -> output.fx? -> output.fx?.zpaq (delete output.fx?) -d: input.fx?.zpaq -> input.fx? -> output (delete input.fx?) 
-e: input -> output.fx? -f: input.fx? -> output - -*/ - -#include -#include -#include -#include -#include -#include -#include -#include "libzpaq.h" -using std::string; - -const int N=4096; // max FASTQ line length - -// print error message and exit (may be called by libzpaq) -void libzpaq::error(const char* msg) { - fprintf(stderr, "fastqz error: %s\n", msg); - exit(1); -} -using libzpaq::error; - -// I/O for libzpaq -struct File: public libzpaq::Reader, public libzpaq::Writer { - FILE* f; - int get() {return getc(f);} - void put(int c) {putc(c, f);} - int read(char* buf, int n) {return fread(buf, 1, n, f);} - void write(const char* buf, int n) {fwrite(buf, 1, n, f);} -}; - -// Thread argument -struct Job { - int id; // model 0..2 - string input, output; // filenames -}; - -// Thread to compress job.input to job.output using model job.id -void* compress(void *arg) { - Job& job=*(Job*)arg; - printf("compressing %s\n", job.input.c_str()); - - // Models for fxh, fxb, fxq files - // Byte codes generated by "zpaq -mfx? 
l" using fx?.cfg above - static char hcomp[4][76]={ - {64,0,3,8,0,0,5,2,20,-128,2,22,-128,3,18,3, - 19,7,13,0,4,24,-1,0,104,17,-33,0,47,1,20,28, - 52,74,66,60,68,60,10,68,60,70,25,112,10,68,60,70, - 25,112,10,68,60,70,25,112,10,68,60,25,66,-49,8,112, - 56,0}, - {55,0,3,3,0,0,7,2,9,-1,2,18,-1,2,25,-1, // fxb - 3,22,8,23,3,4,26,28,7,8,0,6,12,-1,0,17, - 104,74,4,95,1,59,112,10,25,59,112,10,25,59,112,10, - 25,59,112,10,25,59,112,56,0}, - {74,0,2,12,0,0,4,2,22,-128,2,22,-128,2,22,-128, // fxq - 7,14,0,3,12,-1,0,17,104,-33,0,47,1,20,28,74, - 59,112,66,-41,3,60,25,4,10,59,112,10,68,-41,5,60, - 25,52,10,68,60,10,68,-41,4,60,25,69,-41,3,52,60, - 66,-17,3,47,4,-41,5,-121,4,60,56,0}, - {45,0,0,0,0,0,1,2,20,-1,0,17,72,-33,0,47, - 6,66,-33,1,47,1,20,66,-17,7,47,1,20,-25,6,47, - 8,65,-41,2,-49,5,-126,63,1,66,-49,9,112,56,0}}; - - // Compress input to output, then delete input - libzpaq::Compressor co; - File in, out; - in.f=fopen(job.input.c_str(), "rb"); - if (!in.f) perror(job.input.c_str()), exit(1); - out.f=fopen(job.output.c_str(), "wb"); - if (!out.f) perror(job.output.c_str()), exit(1); - co.setInput(&in); - co.setOutput(&out); - co.startBlock(hcomp[job.id]); - co.startSegment(); - co.postProcess(); - co.compress(); - co.endSegment(); - co.endBlock(); - fclose(out.f); - fclose(in.f); - remove(job.input.c_str()); - printf("compressed %s\n", job.output.c_str()); - return 0; -} - -// Thread to decompress job.input to job.output -void* decompress(void *arg) { - Job& job=*(Job*)arg; - printf("decompressing %s\n", job.input.c_str()); - File in, out; - in.f=fopen(job.input.c_str(), "rb"); - if (!in.f) perror(job.input.c_str()), exit(1); - out.f=fopen(job.output.c_str(), "wb"); - if (!out.f) perror(job.output.c_str()), exit(1); - libzpaq::decompress(&in, &out); - fclose(out.f); - fclose(in.f); - printf("decompressed %s\n", job.output.c_str()); - return 0; -} - -// hash 64 bits to 32 bits -unsigned int hash(unsigned long long hl) { - return (hl*12345679123456789ull)>>32; -} - -// 
Return the positions of the first 4 mismatches between bbuf[0..len-1] -// and ref[h/4...] (incrementing by dir=(+1,-1)), packed LSB first. -// If there are less than 4 mismatches, use len. -int rmatch(libzpaq::Array& ref, unsigned int h, - unsigned char* bbuf, int len, int dir) { - int i, j, score=0; - if (len>126) len=126; - for (i=j=0; i>(6-h%4*2))&3)!=(dir>0?bbuf[i]:3-bbuf[i])) - score+=i<<(j++*8); - for (; j<4; ++j) - score+=len<<(j*8); - return score; -} - -// read reference file into ref -void readref(libzpaq::Array& ref, const char* filename) { - FILE* in=fopen(filename, "rb"); - if (!in) perror(filename), exit(1); - fseek(in, 0, SEEK_END); - int rlen=ftell(in); - if (rlen<0 || rlen>=(1<<30)) - error("reference must be smaller than 1 GB"); - rewind(in); - ref.resize(rlen+N*2); // pad extra N bytes at each end - if (int(fread(&ref[N], 1, rlen, in))!=rlen) error("ref read error"); - printf("%s: length=%d bytes\n", filename, rlen); - fclose(in); -} - -int main(int argc, char** argv) { - - // Start timer - clock_t start=clock(); - - // Check command line: {c|d|e|f} input output - if (argc<4) { - printf("fastqz v1.5 FASTQ compressor\n" - "(C) 2012, Dell Inc. Written by Matt Mahoney. Compiled %s.\n" - "Licensed under BSD 2 clause license\n" - "\n" - "Usage: fastqz command input output [reference]\n" - "Commands\n" - " c[Q] - compress input to output.fx?.zpaq (? = {h,b,q})\n" - " d - decompress input.fx?.zpaq to output\n" - " e[Q] - encode (fast) input to output.fx? (? = {h,b,q})\n" - " f - fast decode input.fx? to output\n" - "Use Q to quantize quality values to steps of size Q for better but\n" - "lossy compression. 
Default is c1 or e1 (lossless).\n" - "Use fapacks to create a reference genome from FASTA files\n", - __DATE__); - exit(1); - } - - const char cmd=argv[1][0]; // c,d,e,f - int quality=atoi(argv[1]+1); - if (quality<1) quality=1; - const int isref=argc>4; // 1 if a reference file supplied - const int BUCKET=8; // index bucket size - libzpaq::Array ref; // copy of packed reference genome - libzpaq::Array index; // hash table index to ref - - // Encode - if (cmd=='e' || cmd=='c') { - - // Read reference file - if (isref) { - readref(ref, argv[4]); // read into ref - - // Create an index. Divide ref into groups of 32 bases (8 bytes) - // and compute a 32 bit hash, h. Use the low 27 bits as a hash index - // and high 5 bits as a hash checksum. Store the checksum and a - // 27 bit pointer into ref packed into index[h]. - if (cmd=='c' || cmd=='e') { - index.resize((1<<27)+BUCKET); - int collisions=0; - for (int i=N; i<=int(ref.size())-N-8; i+=8) { - unsigned long long hl=0; - for (int j=0; j<8; ++j) hl=hl<<8|ref[i+j]; - unsigned int h=hash(hl); - unsigned int hi=h&0x7ffffff; - int j; - for (j=0; j>3); - } - printf("indexed %s: %d of %d collisions\n", - argv[4], collisions, ref.size()/8); - } - } - - // read input files - FILE *in, *out[4]; // fastq, fxh, fxb, fxq, fxa - int n, i, j, k, len, c; - in=fopen(argv[2], "rb"); - if (!in) perror(argv[2]), exit(1); - for (i=0; i<3+isref; ++i) { - string fn=string(argv[3])+".fx"+"hbqa"[i]; - out[i]=fopen(fn.c_str(), "wb"); - if (!out[i]) perror(fn.c_str()), exit(1); - } - - // Save read length, n - for (i=j=n=0; (c=getc(in))!=EOF && !n; ++i) { - if (c==10 && j) n=i-j-1; - else if (c==10) j=i; - } - if (n<1 || n>=N) error("read length must be 1..4095"); - printf("encoding %s -> %s read length %d\n", - argv[2], argv[3], n); - rewind(in); - putc(n>>8, out[0]); - putc(n&255, out[0]); - - // encode - int base=0; // packed bases in base 4 - unsigned char hbuf[N]={0}; // previous header - unsigned char bbuf[N]={0}; // one sequence - int 
matches[N+3]={0}; - int match_sum=0, base_sum=0; - int line=0; - bool ismatch=false; - for (line=0; 1; ++line) { - - // encode header as (j+1,k+1,len+1,xxx,0) meaning - // add k to hbuf[..j], then len bytes match, followed by xxx,10. - for (i=j=k=len=0; (c=getc(in))!=EOF && c!=10; ++i) { - if (i>=N) error("Line too long\n"); - if (c!=hbuf[i] && isdigit(c) && isdigit(hbuf[i]) && j<254 - && i<254 && i==len && (!j || j==i)) { - int d=k*10+c-hbuf[i]; - if (d>0 && d<254) hbuf[i]=c, k=d, j=i+1; - } - if (c==hbuf[i] && i==len && len<254) ++len; - hbuf[i]=c; - } - if (c==EOF) { - if (i) error("unexpected EOF in header"); - break; // done - } - putc(j+(j==0), out[0]); - putc(k+1, out[0]); - putc(len+1, out[0]); - for (j=len; j=31) { - unsigned int h=hash(hl); - unsigned int hi=h&0x7ffffff; - for (k=0; kbm) bm=m, bptr=ptr; - } - } - } - } - - // search for complementary matches - hl=0; - for (j=len-1; j>=0; --j) { - hl=hl*4+3-bbuf[j]; - if (j<=len-32) { - unsigned int h=hash(hl); - unsigned int hi=h&0x7ffffff; - for (k=0; kbm) bm=m, bptr=ptr, bdir=-1; - } - } - } - } - ++matches[bm>>24&127]; - match_sum+=(bm>>24)&127; - match_sum-=(bm^bm<<8)>0xffffff; - match_sum-=(bm<<8^bm<<16)>0xffffff; - match_sum-=(bm<<16^bm<<24)>0xffffff; - base_sum+=len; - - // write mismatch locations and pointer to reference genome - ismatch=(bm>>23)>=len; - if (!ismatch) - putc(0, out[3]); - else { - putc(1+bm+128*(bdir<0), out[3]); - putc(1+(bm>>8), out[3]); - putc(1+(bm>>16), out[3]); - putc(1+(bm>>24), out[3]); - putc(bptr>>24, out[3]); - putc(bptr>>16, out[3]); - putc(bptr>>8, out[3]); - putc(bptr, out[3]); - } - } - - // write the bases - for (i=0; i=(bm>>24&255) || i==(bm>>16&255) || i==(bm>>8&255) - || i==(bm&255)) { - j="\x01\x03\x04\x02"[bbuf[i]]; // ACGT -> ATCG - if (base*4+j>255) putc(base, out[1]), base=0; - base=base*4+j; - } - } - - // verify empty second header "+\n" - if (getc(in)!='+') error("expected +"); - if (getc(in)!=10) error("expected newline after +"); - - // encode quality 
scores - // c=33..104 -> c-32 - // j,c=64..71 -> 73+(j-64)+8*(c-64) - // k,j,c=68..71 -> 137+(k-68)+4*(j-68)+16*(c-68) - // 35...,10 -> 0 - // 71... -> 200+len - len=0; // pending output bytes - j=k=0; // last 2 bytes - for (i=0; (c=getc(in))!=EOF; ++i, k=j, j=c) { - if (c!=10 && (c<33 || c>104)) - error("expected quality score in 33..104"); - if (quality>1 && c>35) c-=(c-35)%quality; - if (c==35 && (len==0 || j==35)) ++len; - else if (len==0 && c>=64 && c<=71) ++len; - else if (len==1 && c>=68 && c<=71 && j>=68 && j<=71) ++len; - else if (len>=2 && len<55 && k==71 && j==71 && c==71) ++len; - else if (c==10 && (len==0 || j==35)) break; - else { // must write pending output - ++len; // c is pending - while (len>1 && j==35) - putc(3, out[2]), --len; - if (len>3 && j==71 && k==71) - putc(199+len, out[2]), len=1; - if (len==3) { - if (c>=68 && c<=71) - putc(137+(k-68)+4*(j-68)+16*(c-68), out[2]), len=0; - else - putc(73+(k-64)+8*(j-64), out[2]), len=1; - } - if (len==2) { - if (c>=64 && c<=71) putc(73+(j-64)+8*(c-64), out[2]), len=0; - else putc(j-32, out[2]), len=1; - } - if (len==1) { - if (c==10) break; - if (c!=35 && (c<64 || c>71)) putc(c-32, out[2]), len=0; - } - } - } - putc(0, out[2]); - if (i!=n) error("wrong number of quality scores"); - } - putc(base, out[1]); - for (i=2+isref; i>=0; --i) fclose(out[i]); - fclose(in); - index.resize(0); - ref.resize(0); - - // print match statistics - if (base_sum>0) { - printf("matches[0..%d+2]=", n); - for (i=0; i<=n+2; ++i) { - printf("%d ", matches[i]); - if (i%10==0) printf("\n"); - } - printf("\nMatched %d of %d bases (%1.2f%%)\n", - match_sum, base_sum, match_sum*100.0/base_sum); - } - - // compress each temporary file to .zpaq in a separate thread - if (cmd=='c') { - pthread_t tid[4]; - pthread_attr_t attr; // thread joinable attribute - pthread_attr_init(&attr); - pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - Job job[4]; - for (i=0; i<3+isref; ++i) { - job[i].id=i; - 
job[i].input=string(argv[3])+".fx"+"hbqa"[i]; - job[i].output=job[i].input+".zpaq"; - pthread_create(&tid[i], &attr, compress, (void*)&job[i]); - } - - // wait until all jobs are done - for (i=0; i<3+isref; ++i) { - void* status; - pthread_join(tid[i], &status); - } - } - } - - // decode - else if (cmd=='d' || cmd=='f') { - - // decompress .zpaq - Job job[4]; - if (cmd=='d') { - pthread_t tid[4]; - pthread_attr_t attr; // thread joinable attribute - pthread_attr_init(&attr); - pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - for (int i=0; i<3+isref; ++i) { - job[i].id=i; - job[i].output=string(argv[2])+".fx"+"hbqa"[i]; - job[i].input=job[i].output+".zpaq"; - pthread_create(&tid[i], &attr, decompress, (void*)&job[i]); - } - - // wait until all threads are done - for (int i=0; i<3+isref; ++i) { - void* status; - pthread_join(tid[i], &status); - } - } - - // read reference - if (isref) readref(ref, argv[4]); - - // open files - FILE *in[4], *out; // fxh, fxb, fxq, fxa, fastq - int i, j, k, c, n; - for (i=0; i<3+isref; ++i) { - string fn=string(argv[2])+".fx"+"hbqa"[i]; - in[i]=fopen(fn.c_str(), "rb"); - if (!in[i]) perror(fn.c_str()), exit(1); - } - out=fopen(argv[3], "wb"); - if (!out) perror(argv[3]), exit(1); - - // get read length, n - n=getc(in[0]); - n=n*256+getc(in[0]); - printf("decoding %s -> %s read length %d\n", - argv[2], argv[3], n); - if (n<1 || n>=N) error("bad read length"); - - // decode - int base=0; - unsigned char hbuf[N]={0}, qbuf[N]={0}; - while (1) { - - // decode header - j=getc(in[0])-1; // index of last digit of number to adjust - if (j==EOF-1) break; - k=getc(in[0])-1; // amount to add - i=getc(in[0])-1; // number of matched bytes after adjustment - if (j<0 || k<0 || i<0) error("bad header"); - for (; i=0; --j, k/=10) { - int d=k%10; - hbuf[j]+=d, k-=d; - if (hbuf[j]>'9') hbuf[j]-=10, k+=10; - } - for (j=0; j pad with 35 and end - // c=1..72 -> c+32 - // c=73..136 -> (c-73)%8+64, (c-73)/8+64 - // c=137..200 -> (c-137)%4+68, 
(c-137)/4%4+68, (c-137)%16+68 - // c=201..255 -> 71 repeated c-200 times - for (i=0;;) { - c=getc(in[2]); - if (c==EOF) error("unexpected end of .fxq"); - if (i>n) error("missing .fxq terminator"); - if (c==0) { // end of line - for (; i=201 && i+c-200<=n) - while (c-->200) qbuf[i++]=71; - else if (c>=137 && c<=200 && i>2)&3)+68; - qbuf[i++]=((c>>4)&3)+68; - } - else if (c>=73 && c<=136 && i>3)&7)+64; - } - else if (c>=1 && c<=72 && i=128) miss1-=128, bdir=-1; - else bdir=1; - --miss1; - miss2=getc(in[3])-1; - miss3=getc(in[3])-1; - miss4=getc(in[3])-1; - bptr=getc(in[3]); - bptr=bptr*256+getc(in[3]); - bptr=bptr*256+getc(in[3]); - bptr=bptr*256+getc(in[3]); - } - } - - // decode bases - for (i=k=0; i=ref.size()) error(".fxa pointer out of bounds"); - j=(ref[bptr/4]>>(6-bptr%4*2))&3; - bptr+=bdir; - if (bdir<0) j=3-j; - putc("ACGT"[j], out); - ++k; - } - else { - while (base==0) { - base=getc(in[1]); - if (base==EOF) error("unexpected end of .fxb"); - } - if (base>84) j=(base-21)>>6, base-=j*64; - else if (base>20) j=(base-5)>>4, base-=j*16; - else if (base>4) j=(base-1)>>2, base-=j*4; - else j=base, base=0; - putc(" ATCG"[j], out); - ++k; - bptr+=bdir; - } - } - putc(10, out); - - // write empty second header - putc('+', out); - putc(10, out); - - // write quality scores - for (i=0; i=0; --i) fclose(in[i]); - - // delete temporary files - if (cmd=='d') - for (int i=0; i<3+isref; ++i) - remove(job[i].output.c_str()); - - // show results - printf("decoded %s\n", argv[3]); - } - printf("%1.2f seconds\n", double(clock()-start)/CLOCKS_PER_SEC); - return 0; -} +/* fastqz v1.5 - Sanger FASTQ compressor + + Copyright (C) 2012, Matt Mahoney, Dell Inc. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ + Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + +TO COMPILE + +g++ -O3 -msse2 -s -lpthread fastqz.cpp libzpaq.cpp -o fastqz + +You need libzpaq.cpp and libzpaq.h from either +https://sourceforge.net/projects/zpaq/ or +http://mattmahoney.net/zpaq/ +libzpaq is public domain. + +Also, to use in Windows you need to install Pthreads-Win32 from +http://sourceware.org/pthreads-win32/ +In particular you need pthread.h to compile and pthreadGC2.dll +in your PATH to run. Pthreads-Win32 is licensed under LGPL. + +libzpaq uses Just-In-Time (JIT) optimization of ZPAQL code on +x86 32 or 64 bit processors. To run on other processors, compile +with -DNOJIT to disable this feature. It will still work but run slower. + + +USAGE + +fx is a compressor for Sanger FASTQ files. It has two compression modes, +fast and slow. + +Usage: fastqz command input output [reference] +Commands: + c - compress input to output.fx?.zpaq (3 files, ? = {h,b,q}) + d - decompress input.fx?.zpaq to output + e - encode input to output.fx? 
without zpaq compression (faster) + f - decode input.fx? to output + cQ, eQ - quantize quality values to 35 plus a multiple of Q, rounding + down. Default is c1 or e1. + +Commands c and e compress. c compresses smaller but e compresses faster. +The corresponding decompression commands are d and f respectively. +You need 1.5 GB memory to compress with c or decompress with d. +They also both produce temporary files taking the same space as the +output of command e. The e and f commands don't use significant +memory and don't produce temporary files. + +Using a quantization like c2 or e4 is lossy but improves compression +when exact quality values are not needed. Values are rounded down. + +Compression produces 3 files. Command e produces files named +output.fxh, output.fxb, output.fxq. Command c produces files named +output.fxh.zpaq, output.fxb.zpaq, output.fxq.zpaq +When decompressing, omit the .fx? or .fx?.zpaq extension +on the input file names. The extensions will be assumed. + +Input for compression is expected to be a Sanger FASTQ file. +The file consists of "reads" from a DNA sequencing machine. Each +read has the following format: + + @header + ACGTN (base calls, length n) + + + !..I# (quality scores, length n, ASCII 34..73 for A,C,G,T, 33 for N) + +Maximum line length is 4095. Lines must be terminated by LF +(ASCII 10) only (no CR). All base and quality lines must have +the same length (read length = n) throughout the file. +Files not in this format are rejected. + +If [reference] is present, then it is the file name of a reference +genome. The same reference must be present for decompression. +The reference genome consists of a sequence of bases packed 4 +per byte in MSB to LSB order with ACGT=0..3. You can use the +program fapack to convert FASTA files into this format. +The reference genome cannot be bigger than 1 GB (2^32 bases). +You need 1.5 GB memory to encode and 1 GB to decode. 
+A fourth file will be produced: output.fxa.zpaq or output.fxa +containing compressed alignments. + + +COMPRESSION FORMAT + +Command "c" and "e" both split the input into 3 or 4 parts and +compress them as described below. Command "c" further compresses +each of the 3 or 4 files using a different ZPAQ model. + +Headers (.fxh) are coded in the form (j,k,len,xxx...,0) which means +go to column j-1 (first column is 0) in the previous header and +add k-1 to the decimal number ending there. If k=1, then skip +this step. Then copy the first len-1 characters of the modified previous +header, then output xxx, and finally a linefeed (ASCII 10). Save this +output, minus the linefeed. + +The first 2 bytes of the .fxh file encode the read length, n, +MSB first (e.g. 0,100 if all base and quality lines have length 100). + +Base calls (.fxb) are encoded first by deleting all N's. These can be +restored because their location is indicated by a quality score +of 33. Then the remaining bases are encoded in self terminating +base 4 with A=1, T=2, C=3, G=4 allowing 3 or 4 bases per byte. +For example, "TACT" is coded as 2*64 + 1*16 + 3*4 + 2*1 = 158. + +If a reference is given, then a list of matches is stored in a .fxa +file. The format is: + + (m1+1+128*dir,m2+1,m3+1,m4+1,p3,p2,p1,p0) to encode a match + (0) to encode no match + +where p3..p0 is a 32 bit pointer (MSB first) +into the reference genome after expanding to 1 base per element +(with 0..3=ACGT) and padding the ends with 16384 zeros (or A). +'dir' is 0 for a match in the forward direction or 1 for a +match in the reverse direction starting at the same point but +exchanging A with T and C with G. m1..m4 are the locations of +the first 4 differences between the base sequence (after deleting +N's) and the reference, in the range 0..len-1 where len is the +length of the sequence with N's deleted. Thus, the bytes are +coded in the range 1..len, with bit 7 of the first byte set if +the match is reversed. 
The mismatches are in ascending order. +If there are fewer than 4 mismatches, then the remaining bytes +are coded as len+1. Thus, only reads up to length 126 can be fully +matched. + +If a match is present, then only the corresponding mismatched bases, +plus any bases after m4 (except N), are written to the .fxb file. +If the first byte is 0, then there is no match and the entire +base string is written (except N). + +Quality scores are decoded as follows: q=1..72 decode as q+32 +(33..104). q=73..136 decode as a pair (q-73)%8+64, (q-73)/8+64, +both in the range 64..71. q=137..200 decode as the triple +(q-137)%4+68, (q-137)/4%4+68, (q-137)/16+68 in the range 68..71. +q=201..255 decodes as 71 repeated q-200 (1..55) times. q=0 +decodes by setting all remaining values to 35 and terminating +the sequence. The coding takes advantage of the high frequency +of q at or just below 71 that group early in the sequence, and +of sequences that end in runs of 35. + +Command "c" further compresses the output.fx? files +to output.fx?.zpaq files as defined by the ZPAQ level 2 standard +which can be found at http://mattmahoney.net/zpaq/ or +https://sourceforge.net/projects/zpaq/ + +ZPAQ is a configurable compression format based on the PAQ context +mixing algorithm with bit-wise prediction and arithmetic coding. +Context models are described in ZPAQL byte code, which is saved to +the compressed file and can be read by a generic ZPAQ decompressor. +Thus, a FASTQ file compressed with "fastqz c" could be decompressed +first with zpaq and then with "fastqz f" as opposed to decompressing +with "fastqz d". + +ZPAQL byte code describes an array of components and code to compute +contexts. Each component takes a context and possibly the predictions +of earlier components and outputs a new probability that the next +bit will be a 1. The output of the last component is used to arithmetic +encode or decode the next bit. 
After encoding or decoding, the bit +is used to update the models to reduce their prediction errors. + +Whole-byte contexts are computed on byte boundaries by code running on +a ZPAQL virtual machine. This program is executed once after modeling +each byte with that byte as input. The output is saved in an array +of 32-bit values which is available as input to the array of components. +These values are combined with the previously coded bits of the current +byte to form a complete context. + +A ZPAQ model is described by a config file. In this program, the +compiled byte code is fed to the model during compression, or read +from the compressed file header during decompression. The source code +for each model is given below, followed by an explanation of the code. +The command "zpaq -mfx? l" will generate the byte code used in this +program from the sources below named "fx?.cfg" (where ? is h,b,q,a). + +A config file has 3 sections: + + COMP - describes the array of modeling components. + HCOMP - ZPAQL code to compute contexts. + POST/PCOMP - ZPAQL code for post-processing. + +Post-processing is not used, so each file ends with POST 0 END. +Modeled bits are output directly. + +A ZPAQL virtual machine has 32-bit registers A,B,C,D, an array +of bytes M, an array of 32 bit unsigned integers H, a condition flag F, +and a 16 bit program counter. H is the context output to the model. +A is the input byte and accumulator for arithmetic and logical operations. +B and C are pointers into M. D points to H. *B, *C, *D refer +to the elements pointed to, modulo the array sizes. The sizes are +given by the first 2 parameters after COMP. 
+ + +HEADER MODELING + +(fxh.cfg model to compress headers) +comp 3 8 0 0 5 (H has size 2^3, M has size 2^8) + 0 cm 20 128 (direct 20-bit context model with max count 128*4) + 1 cm 22 128 + 2 icm 18 (indirect context model with 2^(18+6) bit histories) + 3 icm 19 + 4 mix 13 0 4 24 255 (13 bit context, mix 0..0+4-1, rate 24, mask 255) +hcomp + *c=a c++ a== 0 if c=0 endif (save input in buffer M pointed to by C) + d=0 *d=0 b=c a=c hashd (context H[0] is a hash of column number) + a=*b hashd (combined with the byte above, saved in M) + b-- a=*b hashd (combined with the byte to the left (order 1)) + a=*d d++ *d=a b-- a=*b hashd (context H[1] as above but order 2) + a=*d d++ *d=a b-- a=*b hashd (context H[2] as above but order 3) + a=*d d++ *d=a b-- a=*b hashd (context H[3] as above but order 4) + d++ a=c a<<= 8 *d=a (context H[4] for mixer is just the column number) + halt +post 0 end (no post-processing) + +The headers are compressed using a mixture of 4 context models. +The first two are direct (CM: context -> bit prediction) +and 3 and 4 are indirect (ICM: context -> bit history -> prediction). +The context for the first model is the column number, the byte +above and the byte to the left. The next 3 add 1 to 3 +more bytes to the left as context, respectively. The four +bit predictions are mixed by weighted averaging in the logistic +domain (log p/(1-p)) and the weights adapted to reduce prediction +errors. The mixer weight vector is selected by a context consisting +of the column number and the previously coded bits of the +current byte. The resulting bits are arithmetic coded. + +In the code above, *C=A saves the input byte in M. C++ advances +to the next byte, which was saved from the previous line. +"A== 0 IF C=0 ENDIF" tests if the input is 0, marking the end of a +header line, and if so, resets the pointer C to the beginning of +the buffer. + +The next 3 lines set the context for component 0, pointed to by D. +HASHD computes the hash *D=(*D+A+512)*773. 
+ +The next 3 lines set the contexts for components 1 through 3 by +copying the previous context hash and combining it with the next +byte back in the history buffer maintained in M and pointed to +by *B. + +The last line uses the low 5 bits of the column number (in C) +as part of the 13 bit context to the mixer. The low 8 bits are +left as zeros so that during modeling the bits from the partial +byte can be added. + + +BASE CALL MODELING + +(fxb.cfg model to compress base calls) +comp 3 3 0 0 7 (hh hm ph pm n) + 0 cm 9 255 (2 KB) + 1 cm 18 255 (1 MB) + 2 cm 25 255 (128 MB) + 3 icm 22 (256 MB) + 4 isse 23 3 (512 MB) + 5 match 26 28 (256 MB hash table, 256 MB buffer) + 6 mix 8 0 6 12 255 (order 0 mix of 0..0+6-1, rate 12, mask 255) +hcomp + c++ *c=a b=c a=0 (save in rotating buffer M) + d= 1 hash *d=a + b-- d++ hash *d=a + b-- d++ hash *d=a + b-- d++ hash *d=a + b-- d++ hash *d=a + halt +post + 0 +end + +Base calls are modeled using an order 0..5 mix. Orders 0, 1, and 2 +are direct, slow adapting (rate = error/count up to 255*4) context models. +Order 3 is indirect. Order 4 is indirect and chained to the order 3 +output, i.e. order 3 prediction is mixed with a constant 1 in the +logistic domain by a pair of adaptive weights selected by the +bit history indexed by the order 4 context hash. The order 5 +context is a match model which looks up the previous occurrence +of the context hash and predicts whatever bit followed. The +mixer context is bytewise order 0. + +The HASH instruction computes A=(A+*B+512)*773. 
+ + +QUALITY MODELING + +(fxq.cfg model used to compress quality scores) +comp 2 12 0 0 4 + 0 cm 22 128 + 1 cm 22 128 + 2 cm 22 128 + 3 mix 14 0 3 12 255 +hcomp + c++ *c=a (store input in M pointed to by C) + a== 0 if c=0 endif (reset M at newline) + d=0 b=c hash *d=a a=c a>>= 3 hashd + d++ a=0 b-- hash *d=a + b-- a=*b a>>= 5 hashd + d++ *d=0 b-- a=*b hashd + b-- a=*b a>>= 4 hashd + d++ a=*c a>>= 3 *d=0 hashd + a=c a> 3 if a>>= 5 a+= 4 endif hashd + halt +post 0 end + +Quality scores use a mix of 3 direct context models. The first +uses the previous byte and the column number excluding the +low 3 bits as the context hash. The second model uses the second byte +and the high 3 bits of the third byte back as the context hash. +The third model uses the 4'th byte and the high 4 bits of +the 5'th byte back as context hash. The mixer uses a 14 bit +context consisting of the current partial byte and the column +number with the high 5 bits dropped for column numbers above 3. + + +ALIGNMENT MODELING + +(fxa.cfg to model reference matches) +comp 0 0 0 0 1 + 0 cm 20 255 +hcomp + c++ b=a + a== 0 if a=c a== 1 if c=0 endif endif + a=c a> 7 if c=0 endif + a< 6 if + a=b a>>= 2 a<<= 5 a+=c + else + a=c + endif + a<<= 9 *d=a + halt +post 0 end + +Reference matches (if present) use a stationary order 0 model with +the parse state (0..7) as context. States 0..3 expect a mismatch +byte and 4..7 expect a pointer byte. States 0..5 also use +the previous byte as context with the low 2 bits discarded. + +The ZPAQ archives are each saved as a single segment in a single block +without a locator tag, filename, comment, or checksum. No post-processing +is used. The ZPAQL code used for each of the 4 files is given in the +sections above. + +Each of the 3 or 4 ZPAQ models is compressed or decompressed in parallel +in separate threads from or to temporary files, which are deleted +when done. + +c: input -> output.fx? -> output.fx?.zpaq (delete output.fx?) +d: input.fx?.zpaq -> input.fx? -> output (delete input.fx?) 
+e: input -> output.fx? +f: input.fx? -> output + +*/ + +#include +#include +#include +#include +#include +#include +#include +#include "libzpaq.h" +using std::string; + +const int N=4096; // max FASTQ line length + +// print error message and exit (may be called by libzpaq) +void libzpaq::error(const char* msg) { + fprintf(stderr, "fastqz error: %s\n", msg); + exit(1); +} +using libzpaq::error; + +// I/O for libzpaq +struct File: public libzpaq::Reader, public libzpaq::Writer { + FILE* f; + int get() {return getc(f);} + void put(int c) {putc(c, f);} + int read(char* buf, int n) {return fread(buf, 1, n, f);} + void write(const char* buf, int n) {fwrite(buf, 1, n, f);} +}; + +// Thread argument +struct Job { + int id; // model 0..2 + string input, output; // filenames +}; + +// Thread to compress job.input to job.output using model job.id +void* compress(void *arg) { + Job& job=*(Job*)arg; + printf("compressing %s\n", job.input.c_str()); + + // Models for fxh, fxb, fxq files + // Byte codes generated by "zpaq -mfx? 
l" using fx?.cfg above + static char hcomp[4][76]={ + {64,0,3,8,0,0,5,2,20,-128,2,22,-128,3,18,3, + 19,7,13,0,4,24,-1,0,104,17,-33,0,47,1,20,28, + 52,74,66,60,68,60,10,68,60,70,25,112,10,68,60,70, + 25,112,10,68,60,70,25,112,10,68,60,25,66,-49,8,112, + 56,0}, + {55,0,3,3,0,0,7,2,9,-1,2,18,-1,2,25,-1, // fxb + 3,22,8,23,3,4,26,28,7,8,0,6,12,-1,0,17, + 104,74,4,95,1,59,112,10,25,59,112,10,25,59,112,10, + 25,59,112,10,25,59,112,56,0}, + {74,0,2,12,0,0,4,2,22,-128,2,22,-128,2,22,-128, // fxq + 7,14,0,3,12,-1,0,17,104,-33,0,47,1,20,28,74, + 59,112,66,-41,3,60,25,4,10,59,112,10,68,-41,5,60, + 25,52,10,68,60,10,68,-41,4,60,25,69,-41,3,52,60, + 66,-17,3,47,4,-41,5,-121,4,60,56,0}, + {45,0,0,0,0,0,1,2,20,-1,0,17,72,-33,0,47, + 6,66,-33,1,47,1,20,66,-17,7,47,1,20,-25,6,47, + 8,65,-41,2,-49,5,-126,63,1,66,-49,9,112,56,0}}; + + // Compress input to output, then delete input + libzpaq::Compressor co; + File in, out; + in.f=fopen(job.input.c_str(), "rb"); + if (!in.f) perror(job.input.c_str()), exit(1); + out.f=fopen(job.output.c_str(), "wb"); + if (!out.f) perror(job.output.c_str()), exit(1); + co.setInput(&in); + co.setOutput(&out); + co.startBlock(hcomp[job.id]); + co.startSegment(); + co.postProcess(); + co.compress(); + co.endSegment(); + co.endBlock(); + fclose(out.f); + fclose(in.f); + remove(job.input.c_str()); + printf("compressed %s\n", job.output.c_str()); + return 0; +} + +// Thread to decompress job.input to job.output +void* decompress(void *arg) { + Job& job=*(Job*)arg; + printf("decompressing %s\n", job.input.c_str()); + File in, out; + in.f=fopen(job.input.c_str(), "rb"); + if (!in.f) perror(job.input.c_str()), exit(1); + out.f=fopen(job.output.c_str(), "wb"); + if (!out.f) perror(job.output.c_str()), exit(1); + libzpaq::decompress(&in, &out); + fclose(out.f); + fclose(in.f); + printf("decompressed %s\n", job.output.c_str()); + return 0; +} + +// hash 64 bits to 32 bits +unsigned int hash(unsigned long long hl) { + return (hl*12345679123456789ull)>>32; +} + +// 
Return the positions of the first 4 mismatches between bbuf[0..len-1] +// and ref[h/4...] (incrementing by dir=(+1,-1)), packed LSB first. +// If there are less than 4 mismatches, use len. +int rmatch(libzpaq::Array& ref, unsigned int h, + unsigned char* bbuf, int len, int dir) { + int i, j, score=0; + if (len>126) len=126; + for (i=j=0; i>(6-h%4*2))&3)!=(dir>0?bbuf[i]:3-bbuf[i])) + score+=i<<(j++*8); + for (; j<4; ++j) + score+=len<<(j*8); + return score; +} + +// read reference file into ref +void readref(libzpaq::Array& ref, const char* filename) { + FILE* in=fopen(filename, "rb"); + if (!in) perror(filename), exit(1); + fseek(in, 0, SEEK_END); + int rlen=ftell(in); + if (rlen<0 || rlen>=(1<<30)) + error("reference must be smaller than 1 GB"); + rewind(in); + ref.resize(rlen+N*2); // pad extra N bytes at each end + if (int(fread(&ref[N], 1, rlen, in))!=rlen) error("ref read error"); + printf("%s: length=%d bytes\n", filename, rlen); + fclose(in); +} + +int main(int argc, char** argv) { + + // Start timer + clock_t start=clock(); + + // Check command line: {c|d|e|f} input output + if (argc<4) { + printf("fastqz v1.5 FASTQ compressor\n" + "(C) 2012, Dell Inc. Written by Matt Mahoney. Compiled %s.\n" + "Licensed under BSD 2 clause license\n" + "\n" + "Usage: fastqz command input output [reference]\n" + "Commands\n" + " c[Q] - compress input to output.fx?.zpaq (? = {h,b,q})\n" + " d - decompress input.fx?.zpaq to output\n" + " e[Q] - encode (fast) input to output.fx? (? = {h,b,q})\n" + " f - fast decode input.fx? to output\n" + "Use Q to quantize quality values to steps of size Q for better but\n" + "lossy compression. 
Default is c1 or e1 (lossless).\n" + "Use fapacks to create a reference genome from FASTA files\n", + __DATE__); + exit(1); + } + + const char cmd=argv[1][0]; // c,d,e,f + int quality=atoi(argv[1]+1); + if (quality<1) quality=1; + const int isref=argc>4; // 1 if a reference file supplied + const int BUCKET=8; // index bucket size + libzpaq::Array ref; // copy of packed reference genome + libzpaq::Array index; // hash table index to ref + + // Encode + if (cmd=='e' || cmd=='c') { + + // Read reference file + if (isref) { + readref(ref, argv[4]); // read into ref + + // Create an index. Divide ref into groups of 32 bases (8 bytes) + // and compute a 32 bit hash, h. Use the low 27 bits as a hash index + // and high 5 bits as a hash checksum. Store the checksum and a + // 27 bit pointer into ref packed into index[h]. + if (cmd=='c' || cmd=='e') { + index.resize((1<<27)+BUCKET); + int collisions=0; + for (int i=N; i<=int(ref.size())-N-8; i+=8) { + unsigned long long hl=0; + for (int j=0; j<8; ++j) hl=hl<<8|ref[i+j]; + unsigned int h=hash(hl); + unsigned int hi=h&0x7ffffff; + int j; + for (j=0; j>3); + } + printf("indexed %s: %d of %d collisions\n", + argv[4], collisions, ref.size()/8); + } + } + + // read input files + FILE *in, *out[4]; // fastq, fxh, fxb, fxq, fxa + int n, i, j, k, len, c; + in=fopen(argv[2], "rb"); + if (!in) perror(argv[2]), exit(1); + for (i=0; i<3+isref; ++i) { + string fn=string(argv[3])+".fx"+"hbqa"[i]; + out[i]=fopen(fn.c_str(), "wb"); + if (!out[i]) perror(fn.c_str()), exit(1); + } + + // Save read length, n + for (i=j=n=0; (c=getc(in))!=EOF && !n; ++i) { + if (c==10 && j) n=i-j-1; + else if (c==10) j=i; + } + if (n<1 || n>=N) error("read length must be 1..4095"); + printf("encoding %s -> %s read length %d\n", + argv[2], argv[3], n); + rewind(in); + putc(n>>8, out[0]); + putc(n&255, out[0]); + + // encode + int base=0; // packed bases in base 4 + unsigned char hbuf[N]={0}; // previous header + unsigned char bbuf[N]={0}; // one sequence + int 
matches[N+3]={0}; + int match_sum=0, base_sum=0; + int line=0; + bool ismatch=false; + for (line=0; 1; ++line) { + + // encode header as (j+1,k+1,len+1,xxx,0) meaning + // add k to hbuf[..j], then len bytes match, followed by xxx,10. + for (i=j=k=len=0; (c=getc(in))!=EOF && c!=10; ++i) { + if (i>=N) error("Line too long\n"); + if (c!=hbuf[i] && isdigit(c) && isdigit(hbuf[i]) && j<254 + && i<254 && i==len && (!j || j==i)) { + int d=k*10+c-hbuf[i]; + if (d>0 && d<254) hbuf[i]=c, k=d, j=i+1; + } + if (c==hbuf[i] && i==len && len<254) ++len; + hbuf[i]=c; + } + if (c==EOF) { + if (i) error("unexpected EOF in header"); + break; // done + } + putc(j+(j==0), out[0]); + putc(k+1, out[0]); + putc(len+1, out[0]); + for (j=len; j=31) { + unsigned int h=hash(hl); + unsigned int hi=h&0x7ffffff; + for (k=0; kbm) bm=m, bptr=ptr; + } + } + } + } + + // search for complementary matches + hl=0; + for (j=len-1; j>=0; --j) { + hl=hl*4+3-bbuf[j]; + if (j<=len-32) { + unsigned int h=hash(hl); + unsigned int hi=h&0x7ffffff; + for (k=0; kbm) bm=m, bptr=ptr, bdir=-1; + } + } + } + } + ++matches[bm>>24&127]; + match_sum+=(bm>>24)&127; + match_sum-=(bm^bm<<8)>0xffffff; + match_sum-=(bm<<8^bm<<16)>0xffffff; + match_sum-=(bm<<16^bm<<24)>0xffffff; + base_sum+=len; + + // write mismatch locations and pointer to reference genome + ismatch=(bm>>23)>=len; + if (!ismatch) + putc(0, out[3]); + else { + putc(1+bm+128*(bdir<0), out[3]); + putc(1+(bm>>8), out[3]); + putc(1+(bm>>16), out[3]); + putc(1+(bm>>24), out[3]); + putc(bptr>>24, out[3]); + putc(bptr>>16, out[3]); + putc(bptr>>8, out[3]); + putc(bptr, out[3]); + } + } + + // write the bases + for (i=0; i=(bm>>24&255) || i==(bm>>16&255) || i==(bm>>8&255) + || i==(bm&255)) { + j="\x01\x03\x04\x02"[bbuf[i]]; // ACGT -> ATCG + if (base*4+j>255) putc(base, out[1]), base=0; + base=base*4+j; + } + } + + // verify empty second header "+\n" + if (getc(in)!='+') error("expected +"); + if (getc(in)!=10) error("expected newline after +"); + + // encode quality 
scores + // c=33..104 -> c-32 + // j,c=64..71 -> 73+(j-64)+8*(c-64) + // k,j,c=68..71 -> 137+(k-68)+4*(j-68)+16*(c-68) + // 35...,10 -> 0 + // 71... -> 200+len + len=0; // pending output bytes + j=k=0; // last 2 bytes + for (i=0; (c=getc(in))!=EOF; ++i, k=j, j=c) { + if (c!=10 && (c<33 || c>104)) + error("expected quality score in 33..104"); + if (quality>1 && c>35) c-=(c-35)%quality; + if (c==35 && (len==0 || j==35)) ++len; + else if (len==0 && c>=64 && c<=71) ++len; + else if (len==1 && c>=68 && c<=71 && j>=68 && j<=71) ++len; + else if (len>=2 && len<55 && k==71 && j==71 && c==71) ++len; + else if (c==10 && (len==0 || j==35)) break; + else { // must write pending output + ++len; // c is pending + while (len>1 && j==35) + putc(3, out[2]), --len; + if (len>3 && j==71 && k==71) + putc(199+len, out[2]), len=1; + if (len==3) { + if (c>=68 && c<=71) + putc(137+(k-68)+4*(j-68)+16*(c-68), out[2]), len=0; + else + putc(73+(k-64)+8*(j-64), out[2]), len=1; + } + if (len==2) { + if (c>=64 && c<=71) putc(73+(j-64)+8*(c-64), out[2]), len=0; + else putc(j-32, out[2]), len=1; + } + if (len==1) { + if (c==10) break; + if (c!=35 && (c<64 || c>71)) putc(c-32, out[2]), len=0; + } + } + } + putc(0, out[2]); + if (i!=n) error("wrong number of quality scores"); + } + putc(base, out[1]); + for (i=2+isref; i>=0; --i) fclose(out[i]); + fclose(in); + index.resize(0); + ref.resize(0); + + // print match statistics + if (base_sum>0) { + printf("matches[0..%d+2]=", n); + for (i=0; i<=n+2; ++i) { + printf("%d ", matches[i]); + if (i%10==0) printf("\n"); + } + printf("\nMatched %d of %d bases (%1.2f%%)\n", + match_sum, base_sum, match_sum*100.0/base_sum); + } + + // compress each temporary file to .zpaq in a separate thread + if (cmd=='c') { + pthread_t tid[4]; + pthread_attr_t attr; // thread joinable attribute + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + Job job[4]; + for (i=0; i<3+isref; ++i) { + job[i].id=i; + 
job[i].input=string(argv[3])+".fx"+"hbqa"[i]; + job[i].output=job[i].input+".zpaq"; + pthread_create(&tid[i], &attr, compress, (void*)&job[i]); + } + + // wait until all jobs are done + for (i=0; i<3+isref; ++i) { + void* status; + pthread_join(tid[i], &status); + } + } + } + + // decode + else if (cmd=='d' || cmd=='f') { + + // decompress .zpaq + Job job[4]; + if (cmd=='d') { + pthread_t tid[4]; + pthread_attr_t attr; // thread joinable attribute + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + for (int i=0; i<3+isref; ++i) { + job[i].id=i; + job[i].output=string(argv[2])+".fx"+"hbqa"[i]; + job[i].input=job[i].output+".zpaq"; + pthread_create(&tid[i], &attr, decompress, (void*)&job[i]); + } + + // wait until all threads are done + for (int i=0; i<3+isref; ++i) { + void* status; + pthread_join(tid[i], &status); + } + } + + // read reference + if (isref) readref(ref, argv[4]); + + // open files + FILE *in[4], *out; // fxh, fxb, fxq, fxa, fastq + int i, j, k, c, n; + for (i=0; i<3+isref; ++i) { + string fn=string(argv[2])+".fx"+"hbqa"[i]; + in[i]=fopen(fn.c_str(), "rb"); + if (!in[i]) perror(fn.c_str()), exit(1); + } + out=fopen(argv[3], "wb"); + if (!out) perror(argv[3]), exit(1); + + // get read length, n + n=getc(in[0]); + n=n*256+getc(in[0]); + printf("decoding %s -> %s read length %d\n", + argv[2], argv[3], n); + if (n<1 || n>=N) error("bad read length"); + + // decode + int base=0; + unsigned char hbuf[N]={0}, qbuf[N]={0}; + while (1) { + + // decode header + j=getc(in[0])-1; // index of last digit of number to adjust + if (j==EOF-1) break; + k=getc(in[0])-1; // amount to add + i=getc(in[0])-1; // number of matched bytes after adjustment + if (j<0 || k<0 || i<0) error("bad header"); + for (; i=0; --j, k/=10) { + int d=k%10; + hbuf[j]+=d, k-=d; + if (hbuf[j]>'9') hbuf[j]-=10, k+=10; + } + for (j=0; j pad with 35 and end + // c=1..72 -> c+32 + // c=73..136 -> (c-73)%8+64, (c-73)/8+64 + // c=137..200 -> (c-137)%4+68, 
(c-137)/4%4+68, (c-137)%16+68 + // c=201..255 -> 71 repeated c-200 times + for (i=0;;) { + c=getc(in[2]); + if (c==EOF) error("unexpected end of .fxq"); + if (i>n) error("missing .fxq terminator"); + if (c==0) { // end of line + for (; i=201 && i+c-200<=n) + while (c-->200) qbuf[i++]=71; + else if (c>=137 && c<=200 && i>2)&3)+68; + qbuf[i++]=((c>>4)&3)+68; + } + else if (c>=73 && c<=136 && i>3)&7)+64; + } + else if (c>=1 && c<=72 && i=128) miss1-=128, bdir=-1; + else bdir=1; + --miss1; + miss2=getc(in[3])-1; + miss3=getc(in[3])-1; + miss4=getc(in[3])-1; + bptr=getc(in[3]); + bptr=bptr*256+getc(in[3]); + bptr=bptr*256+getc(in[3]); + bptr=bptr*256+getc(in[3]); + } + } + + // decode bases + for (i=k=0; i=ref.size()) error(".fxa pointer out of bounds"); + j=(ref[bptr/4]>>(6-bptr%4*2))&3; + bptr+=bdir; + if (bdir<0) j=3-j; + putc("ACGT"[j], out); + ++k; + } + else { + while (base==0) { + base=getc(in[1]); + if (base==EOF) error("unexpected end of .fxb"); + } + if (base>84) j=(base-21)>>6, base-=j*64; + else if (base>20) j=(base-5)>>4, base-=j*16; + else if (base>4) j=(base-1)>>2, base-=j*4; + else j=base, base=0; + putc(" ATCG"[j], out); + ++k; + bptr+=bdir; + } + } + putc(10, out); + + // write empty second header + putc('+', out); + putc(10, out); + + // write quality scores + for (i=0; i=0; --i) fclose(in[i]); + + // delete temporary files + if (cmd=='d') + for (int i=0; i<3+isref; ++i) + remove(job[i].output.c_str()); + + // show results + printf("decoded %s\n", argv[3]); + } + printf("%1.2f seconds\n", double(clock()-start)/CLOCKS_PER_SEC); + return 0; +} diff --git a/libzpaq.3.pod b/libzpaq.3.pod index 5726755..3ea1b95 100644 --- a/libzpaq.3.pod +++ b/libzpaq.3.pod @@ -1,737 +1,737 @@ -# Documentation for libzpaq -# -# Copyright (C) 2012, Dell Inc. Written by Matt Mahoney. 
-# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so without restriction. -# This Software is provided "as is" without warranty. -# -# To create man page: pod2man libzpaq.3.pod > libzpaq.3 -# To create HTML documentation: pod2html libzpaq.3.pod > libzpaq.html - -=pod - -=head1 NAME - -libzpaq - ZPAQ compression API - -=head1 SYNOPSIS - - #include "libzpaq.h" - - namespace libzpaq { - - extern void error(const char* msg); - - class Reader { - public: - virtual int get() = 0; - virtual int read(char* buf, int n); // optional - virtual ~Reader() {} - }; - - class Writer { - public: - virtual void put(int c) = 0; - virtual void write(const char* buf, int n); // optional - virtual ~Writer() {} - }; - - class SHA1 { - public: - SHA1(); - void put(int c); - double size() const; - uint64_t usize() const - const char* result(); - }; - - class Compressor { - public: - Compressor(); - void setOutput(Writer* out); - void writeTag(); - void startBlock(int level); - void startBlock(const char* hcomp); - void startSegment(const char* filename = 0, - const char* comment = 0); - void setInput(Reader* i); - void postProcess(const char* pcomp = 0, int length = 0); - bool compress(int n = -1); - void endSegment(const char* sha1string = 0); - void endBlock(); - }; - - class Decompresser { - public: - Decompresser(); - void setInput(Reader* in); - bool findBlock(double* memptr = 0); - void hcomp(Writer* out); - bool findFilename(Writer* = 0); - void readComment(Writer* = 0); - void setOutput(Writer* out); - void setSHA1(SHA1* sha1ptr); - bool decompress(int n = -1); - bool pcomp(Writer* out); - void readSegmentEnd(char* sha1string 
= 0); - }; - - void compress(Reader* in, Writer* out, int level); - - void decompress(Reader* in, Writer* out); - } - -=head1 DESCRIPTION - -I is a C++ API for compressing or decompressing -files or objects in memory comforming to the ZPAQ level 1 and 2 standards -(see I). This document describes version 5.00 -of the software. The software may be used without -restriction under a modified MIT license. - -ZPAQ provides a high level of data compression in a streaming -(single pass) self-describing format that supports single or multiple -named objects (such as archives) with optional integrity checking. - -The library provides 3 default compression levels but supports -custom algorithms. The performance of the default levels is -shown in the table below for the 14 file Calgary corpus as -a tar file. Compression and decompression times are in seconds -on a 2 GHz T3200 running on one of two cores. Memory required -to compress or decompress is in MB. Some popular formats -are shown for comparison. - - Program Format Size Time (C, D) Memory - ----------- ------ --------- ----------- ------ - Uncompresed .tar 3,152,896 - compress .tar.Z 1,319,521 1.6 0.2 .1 MB - gzip -9 .tar.gz 1,022,810 0.7 0.1 .1 MB - bzip2 -9 .tar.bz2 860,097 0.6 0.4 5 MB - 7zip .tar.7z 824,573 1.5 0.1 195 MB - zpaq 1 (fast) .tar.zpaq 806,959 2 2 38 MB - zpaq 2 (mid) .tar.zpaq 699,191 8 8 112 MB - zpaq 3 (max) .tar.zpaq 644,190 20 20 246 MB - -A ZPAQ stream consists of one or more blocks, possibly mixed with -other data, that can be decompressed independently in any order. -Each block consists of one or more segments that must be decompressed -in order from the beginning of the block. Each block header contains -a description of the decompression algorithm. Each segment consists -of an optional filename string, an optional comment string, -self delimiting compressed data, and an optional SHA-1 checksum. 
-If ZPAQ blocks are mixed with other data, they must be -preceded by an identifying 13 byte tag which does not otherwise -appear in that data. - -ZPAQ compression is based on the PAQ context mixing model. -An array of components predict the probability of the next bit -of input, either independently or depending on the predictions -of earlier components. The final prediction is arithmetic coded. -Each component inputs a context computed from earlier input -by a program written in ZPAQL byte code which runs on a virtual -machine. Both the component array description and the ZPAQL -code are encoded in a string called HCOMP in each block header. -Data can also be stored uncompressed. - -A block may optionally specify a post-processor, a program -(also in ZPAQL) which takes the decoded data as input and -outputs the decompressed output. This program, if present, -is encoded as a string called PCOMP which is compressed -in the first segment prior to the compressed data. The first -decoded byte from the first segment is a flag indicating -whether a PCOMP string is present. The user is responsible -for correctly pre-processing the data so that post-processing -restores the original data. - -=head2 API Organization - -The I API consists of 2 files. - -=over - -=item libzpaq.h - -Header file to include in your application. - -=item libzpaq.cpp - -Source code file to link to your application. - -=back - -An application would have the line C<#include "libzpaq.h"> and -link to libzpaq.cpp. -The API provides two classes, C and C -which write or read respectively each of the syntactic elements -of a ZPAQ stream. The two functions C and -C provide simple interfaces for the most common -uses. In either case, the user must create classes derived -from the abstract base classes C and C and -define methods C and C which the code -will use to read and write bytes. The user must also define -a callback error handler. 
- -By default, libzpaq(3) uses just-in-time (JIT) acceleration -by translating ZPAQL code to x86-32 or x86-64 internally -and executing it. This feature can be disabled by compiling -with -DNOJIT. If enabled, it requires an x86 processor -capable of executing SSE2 instructions. SSE2 is supported -by most Intel processors since 2001 and AMD since 2003. - -Run time checks (assertions) can be enabled with -DDEBUG -for debugging purposes. - -All of the API code is contained in the namespace C. - -=head2 Callback Functions - -The following three functions must be defined by the user. - -=over - -=item C - -This function must be defined by the user to handle errors -from libzpaq. The library will call the function with -an English language message passed to C. Errors may -result from bad input during decompression, out of memory, -or illegal arguments or calling sequences to libzpaq -functions. Errors should be considered unrecoverable. - -=item C - -The user must create a class derived from Reader with an -implementation for C that reads one byte of input -and returns its value in the range 0...255, or returns -EOF (-1) at end of input. Objects of the derived type -would then be passed to functions that require a C. - -=item C - -The user must create a class derived from Writer with -an implemenation of C which is expected to take -a byte value C in the range 0...255 and write it to -output. Objects of the derived type -would then be passed to functions that require a C. - -=back - -The following two functions are optional. Defining them -can improve performance slightly. - -=over - -=item C - -If defined, this function should input up to C bytes into -the array C and return the number actually read, in -the range 0..n. A return value of 0 indicates end of input. -If C is not defined, then the default implementation -will call C n times. - -=item C - -If defined, this function should output the elements C -through C in order. 
If not defined, then the default -implementation will call C n times. - -=back - -=head2 Simple Compression - -In the remainder of this document, all classes and -functions are assumed to be in namespace C. - -=over - -=item C - -C compresses from C to C until C -returns EOF. It writes a single segment in a single block -with empty filename, comment, and checksum fields. C -must be 1, 2, or 3, to select models I, I, or -I respectively. Higher modes compress smaller but -take longer to compress and subsequently decompress. - -=item C - -C decompresses any valid ZPAQ stream from -C to C until C returns EOF. Any -non-ZPAQ data in the input is ignored. Any ZPAQ blocks -following non-ZPAQ must be preceded by a marker tag -to be recognized. Each block is decoded according to the -instructions in the block header. The contents of the -filename, comment, and checksum fields are ignored. -Data with bad checksums will be decoded anyway. If there -is more than one segment, then all of the output -data will be concatenated. - -=back - -=head2 class SHA1 - -The SHA1 class is used to compute SHA-1 checksums for compression -and verify them for decompression. It is believed to be -computationally infeasible to find two different strings -with the same hash value. Its member functions -are as follows: - -=over - -=item C - -The constructor creates a new SHA1 object representing the -hash of an empty string. - -=item C - -Appends one byte c (0...255) to the string whose hash is represented. - -=item C - -Returns the length (so far) of the string whose hash is represented. -The largest possible value returned is -2^61 - 1 = 2305843009213693951.0, but values larger than 2^53 = -9007199254740992.0 -will not be exact on systems using IEEE 64 bit floating point -representation of type C. The initial value is 0.0. - -=item C - -Returns the length (so far) as a 64 bit unsigned integer. - -=item C - -Computes the 20 byte SHA-1 hash and resets the string back -to a size of 0.0. 
The returned pointer points to an array -inside the SHA1 object whose -contents remain unchanged until the next call to C. - -=back - -=head2 class Compressor - -The C class has member functions to write -each of the syntactic elements of a ZPAQ stream and to specify -their values. It will compress using either built-in or -user supplied models. - -=over - -=item C - -The constructor creates a Compression object. No input source, -output destination, or compression model is specified. - -=item C - -Specifies a destination for output. Must be specified before -calling any function that writes data. - -=item C - -Writes a 13 byte marker tag which can be used to identify -the start of a block following non-ZPAQ data. - -=item C - -Writes a block header and specifies a compression model. -If linked with F, then C must be 1, 2, or 3 -to specify I, I, or I respectively. Higher numbers -compress smaller but more slowly. These models are compatible -with both the ZPAQ level 1 and 2 standards. - -=item C - -Writes a block header and specifies the HCOMP portion of the -compression model. The first two bytes of the string should -encode the length of the rest of the string as a 16 bit unsigned -number with the least significant bit first. The meaning of the -rest of the string is defined in the ZPAQ level 2 standard. -If the number of components (C) is 0, then the block -is saved in ZPAQ level 2 format, which cannot be read by -older ZPAQ level 1 decoders. Otherwise the block is saved in -ZPAQ level 1 format, which is compatible with all decoders. - -=item C - -Writes a segment header. C and -C are NUL terminated strings. If specified, then their -values are stored. Normally, C would be a file name -when compressing to an archive or omitted otherwise. If a file -is split among segments, then by convention only the first segment -is named. C is normally the uncompressed size as a decimal -number which is displayed when listing the contents of an archive. 
-Omitting it does not affect decompression. - -=item C - -Specifies the optional PCOMP string used for post-processing. -It must be called from within the first segment -of each block prior to compressing any data, but not from within -any other segment. -If C is 0 or no argument is passed, then the decompresser -will not post-process the data. The effect is to compress a -0 byte to indicate to the decompresser that no PCOMP string -is present. - -If C is not 0, then I bytes of the string I -are passed. If I is 0 or omitted, then -the first two bytes must encode -the length of the rest of the string as a 16 bit unsigned number -with the least significant byte first. The format of the remainder -of the string is described in the ZPAQ level 2 standard. -The effect is to compress a 1 byte -to indicate the presence of PCOMP, followed by the two length -bytes and the string as passed. For example, either -C or C -would compress the 5 bytes 1, 2, 0, 5, 8. -The user is responsible for pre-processing the input -prior to compression so that PCOMP restores the original data. - -=item C - -Specifies the input source for compression. It must be set -prior to the first call to C. - -=item C - -Compress n bytes of data, or until EOF is input, whichever comes -first. If n < 0 or omitted, then compress until EOF. -Returns true if there is more input available, or false if EOF -was read. - -=item C - -Stop compressing and write the end of a segment. If -C is specified, it should be a 20 byte string -as returned by C on the input data for -this segment I pre-processing. - -=item C - -Finish writing the current block. 
- -=back - -In order to create a valid ZPAQ stream, the components must -be written in the following order: - - for each block do { - if any non-ZPAQ data then { - write non-ZPAQ data - writeTag() - } - startBlock() - for each segment do { - startSegment() - if first segment in block then { - postProcess() - } - while (compress(n)) ; - endSegment() - } - endBlock() - } - -=head2 class Decompresser - -The class Decompresser has member functions to read each of the -syntactic elements of a ZPAQ stream. - -=over - -=item C - -The constructor creates a Decompresser object. No input source or -output destination is specified. - -=item C - -Specifies where the ZPAQ stream will be read from. Must be called -before any function that reads the stream. - -=item C - -Scan the input to find the start of the next block. If a block -does not start immediately, then the block must be preceded by -a marker tag (written with C) or it will -not be found. If C is not 0, then write the approximate -memory requirement (in bytes) to decompress to C<*memptr>). The -memory will be allocated by the first call to C. -It returns true if a block is found, or false if it reads to EOF -without finding a block. - -=item C - -Write the HCOMP string of the current block to C. -It will be in a format suitable -for passing to C. The first 2 bytes will -encode the length of the rest of the string as a 16 bit unsigned -integer with the least significant byte first. The format of the -remainder of the string is described in the ZPAQ level 1 -specification. - -=item C - -Find the start of the next segment. If another segment is found -within the current block then return true. If the end of the block -is found first, then return false. If a segment is found, the -filename field is not empty, and C -is not 0, then write the filename (without a terminating NUL byte) -to C. - -=item C - -Read or skip past the comment field following the filename field -in the segment header. 
If C is not 0 and the comment field is -not empty, then write the comment -(without a terminating NUL byte) to C. - -=item C - -Specify the destination for decompression. It must be set before -any data can be decompressed. - -=item C - -Specify the address of a SHA1 object for computing the checksum -of the decompressed data (after post-processing). As each byte C -is output, it is also passed to Cput(c)>. In order to -compute the correct checksum, the SHA1 object should be in its -initial state, either newly created, or by calling C, -before the first call to C. When the end of the segment -is reached, the value returned by Cresult()> should match -the stored checksum, if any. - -=item C - -Decode n bytes or until the end of segment, whichever comes -first. Return false if the end of segment is reached first. If -n < 0 or not specified, then decompress to the end of segment -and return false. C is the number of bytes prior to post-processing. -If the data is post-processed, then the size of the output may -be different. - -=item C - -Write the PCOMP string, if any, for the current block to C. -If there is no PCOMP string (no post-processor) then return false. -Otherwise write the string to C in a format suitable for -passing to C and return true. If written, -then the first 2 bytes will encode the length of the rest of the -string as a 16 bit unsigned integer with the least significant -bit first. The format of the rest of the string is descibed in -the ZPAQ level 1 standard. - -C is only valid after the first call to C -in the current block. To read the PCOMP string without decompressing any -data, then call C first. It is not necessary to -call C in this case. - -=item C - -Skip any compressed data in the current segment that has not yet -been decompressed and advance to the end of the segment. -Then if C is not 0 then write into -the 21 byte array that it points to. If a checksum is present, -then write a 1 into C and write the stored checksum -in C. 
Otherwise write a 0 in C. - -Note that it is not permitted to call decompress() if any compressed -data has been skipped in any earlier segments in the same block. - -=back - -A valid sequence of calls is as follows: - - while (findBlock()) { - while (findFilename()) { - readComment(); - if first segment in block then { (optional) - decompress(0) - pcomp() - } - while (decompress(n)) ; (optional) - readSegmentEnd(); - } - } - -=head1 EXAMPLES - -The following program F -lists the contents of a ZPAQ archive -read from standard input. - - #include - #include - #include "libzpaq.h" - - // Implement Reader and Writer interfaces for file I/O - class File: public libzpaq::Reader, public libzpaq::Writer { - FILE* f; - public: - File(FILE* f_): f(f_) {} - int get() {return getc(f);} - void put(int c) {putc(c, f);} - int read(char* buf, int n) {return fread(buf, 1, n, f);} - void write(const char* buf, int n) {fwrite(buf, 1, n, f);} - }; - - // Implement error handler - namespace libzpaq { - void error(const char* msg) { - fprintf(stderr, "Error: %s\n", msg); - exit(1); - } - } - - // List the contents of an archive. For each block, show - // the memory required to decompress. For each segment, - // show the filename and comment. - void list(FILE* input, FILE* output) { - libzpaq::Decompresser d; - File in(input), out(output); - double memory; - d.setInput(&in); - for (int block=1; d.findBlock(&memory); ++block) { - printf("Block %d needs %1.0f MB\n", block, memory/1e6); - while (d.findFilename(&out)) { // print filename - printf("\t"); - d.readComment(&out); // print comment - printf("\n"); - d.readSegmentEnd(); // skip compressed data - } - } - } - - int main() { - list(stdin, stdout); - return 0; - } - -The program could be compiled as follows: - - g++ listzpaq.cpp libzpaq.cpp - -The following code compresses a list of files into one block -written to stdout. Each file is compressed to a separate -segment. 
For each segment, the filename, comment, and SHA-1 -checksum are stored. The comment, as conventional, is the -file size as a decimal string. - - // Compress one file to one segment - void compress_file(libzpaq::Compressor& c, - const char* filename, - bool first_segment) { - - // Open input file - FILE* f; - f=fopen(filename, "rb"); - if (!f) return; - - // Compute SHA-1 checksum and file size - libzpaq::SHA1 sha1; - int ch; - while ((ch=getc(f))!=EOF) - sha1.put(ch); - - // Write file size as a comment. - // The size can have at most 19 digits. - char comment[20]; - sprintf(comment, "%1.0f", sha1.size()); - - // Compress segment - rewind(f); - File in(f); - c.startSegment(filename, comment); - if (first_segment) - c.postProcess(); - c.setInput(&in); - c.compress(); - c.endSegment(sha1.result()); - - // Close input file - fclose(f); - } - - // Compress a list of argc files in argv[0...argc-1] into one - // ZPAQ block to stdout at level 2. - void compress_list(int argc, char** argv) { - libzpaq::Compressor c; - File out(stdout); - c.setOutput(&out); - c.startBlock(2); - for (int i=0; i and C can -be passed an argument n to display progress every n bytes, -for example: - - for (int i=1; d.decompress(1000000); ++i) - fprintf(stderr, "Decompressed %d MB\n", i); - -To compress or decompress to and from objects in memory, derive -appropriate classes from C and C. For example, it is -possible to compress or decompress to a C using -the following class. - - struct String: public libzpaq::Writer { - std::string s; - void put(int c) {s+=char(c);} - }; - -This class is also useful for reading the filename and comment -fields during decompression as follows: - - String filename, comment; - while (d.findFilename(&filename)) { - d.readComment(&comment); - // ... - -=head1 AVAILABILITY - -I, I, and the ZPAQ level 1 and 2 specifications are -available from L. - -=head1 SEE ALSO - -C -C - -=cut - - +# Documentation for libzpaq +# +# Copyright (C) 2012, Dell Inc. 
Written by Matt Mahoney. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so without restriction. +# This Software is provided "as is" without warranty. +# +# To create man page: pod2man libzpaq.3.pod > libzpaq.3 +# To create HTML documentation: pod2html libzpaq.3.pod > libzpaq.html + +=pod + +=head1 NAME + +libzpaq - ZPAQ compression API + +=head1 SYNOPSIS + + #include "libzpaq.h" + + namespace libzpaq { + + extern void error(const char* msg); + + class Reader { + public: + virtual int get() = 0; + virtual int read(char* buf, int n); // optional + virtual ~Reader() {} + }; + + class Writer { + public: + virtual void put(int c) = 0; + virtual void write(const char* buf, int n); // optional + virtual ~Writer() {} + }; + + class SHA1 { + public: + SHA1(); + void put(int c); + double size() const; + uint64_t usize() const; + const char* result(); + }; + + class Compressor { + public: + Compressor(); + void setOutput(Writer* out); + void writeTag(); + void startBlock(int level); + void startBlock(const char* hcomp); + void startSegment(const char* filename = 0, + const char* comment = 0); + void setInput(Reader* i); + void postProcess(const char* pcomp = 0, int length = 0); + bool compress(int n = -1); + void endSegment(const char* sha1string = 0); + void endBlock(); + }; + + class Decompresser { + public: + Decompresser(); + void setInput(Reader* in); + bool findBlock(double* memptr = 0); + void hcomp(Writer* out); + bool findFilename(Writer* = 0); + void readComment(Writer* = 0); + void setOutput(Writer* out); + void setSHA1(SHA1* sha1ptr); + bool decompress(int n = -1); + bool pcomp(Writer* out); + void
readSegmentEnd(char* sha1string = 0); + }; + + void compress(Reader* in, Writer* out, int level); + + void decompress(Reader* in, Writer* out); + } + +=head1 DESCRIPTION + +I is a C++ API for compressing or decompressing +files or objects in memory conforming to the ZPAQ level 1 and 2 standards +(see I). This document describes version 5.00 +of the software. The software may be used without +restriction under a modified MIT license. + +ZPAQ provides a high level of data compression in a streaming +(single pass) self-describing format that supports single or multiple +named objects (such as archives) with optional integrity checking. + +The library provides 3 default compression levels but supports +custom algorithms. The performance of the default levels is +shown in the table below for the 14 file Calgary corpus as +a tar file. Compression and decompression times are in seconds +on a 2 GHz T3200 running on one of two cores. Memory required +to compress or decompress is in MB. Some popular formats +are shown for comparison. + + Program Format Size Time (C, D) Memory + ----------- ------ --------- ----------- ------ + Uncompressed .tar 3,152,896 + compress .tar.Z 1,319,521 1.6 0.2 .1 MB + gzip -9 .tar.gz 1,022,810 0.7 0.1 .1 MB + bzip2 -9 .tar.bz2 860,097 0.6 0.4 5 MB + 7zip .tar.7z 824,573 1.5 0.1 195 MB + zpaq 1 (fast) .tar.zpaq 806,959 2 2 38 MB + zpaq 2 (mid) .tar.zpaq 699,191 8 8 112 MB + zpaq 3 (max) .tar.zpaq 644,190 20 20 246 MB + +A ZPAQ stream consists of one or more blocks, possibly mixed with +other data, that can be decompressed independently in any order. +Each block consists of one or more segments that must be decompressed +in order from the beginning of the block. Each block header contains +a description of the decompression algorithm. Each segment consists +of an optional filename string, an optional comment string, +self delimiting compressed data, and an optional SHA-1 checksum.
+If ZPAQ blocks are mixed with other data, they must be +preceded by an identifying 13 byte tag which does not otherwise +appear in that data. + +ZPAQ compression is based on the PAQ context mixing model. +An array of components predict the probability of the next bit +of input, either independently or depending on the predictions +of earlier components. The final prediction is arithmetic coded. +Each component inputs a context computed from earlier input +by a program written in ZPAQL byte code which runs on a virtual +machine. Both the component array description and the ZPAQL +code are encoded in a string called HCOMP in each block header. +Data can also be stored uncompressed. + +A block may optionally specify a post-processor, a program +(also in ZPAQL) which takes the decoded data as input and +outputs the decompressed output. This program, if present, +is encoded as a string called PCOMP which is compressed +in the first segment prior to the compressed data. The first +decoded byte from the first segment is a flag indicating +whether a PCOMP string is present. The user is responsible +for correctly pre-processing the data so that post-processing +restores the original data. + +=head2 API Organization + +The I API consists of 2 files. + +=over + +=item libzpaq.h + +Header file to include in your application. + +=item libzpaq.cpp + +Source code file to link to your application. + +=back + +An application would have the line C<#include "libzpaq.h"> and +link to libzpaq.cpp. +The API provides two classes, C and C +which write or read respectively each of the syntactic elements +of a ZPAQ stream. The two functions C and +C provide simple interfaces for the most common +uses. In either case, the user must create classes derived +from the abstract base classes C and C and +define methods C and C which the code +will use to read and write bytes. The user must also define +a callback error handler. 
+ +By default, libzpaq(3) uses just-in-time (JIT) acceleration +by translating ZPAQL code to x86-32 or x86-64 internally +and executing it. This feature can be disabled by compiling +with -DNOJIT. If enabled, it requires an x86 processor +capable of executing SSE2 instructions. SSE2 is supported +by most Intel processors since 2001 and AMD since 2003. + +Run time checks (assertions) can be enabled with -DDEBUG +for debugging purposes. + +All of the API code is contained in the namespace C. + +=head2 Callback Functions + +The following three functions must be defined by the user. + +=over + +=item C + +This function must be defined by the user to handle errors +from libzpaq. The library will call the function with +an English language message passed to C. Errors may +result from bad input during decompression, out of memory, +or illegal arguments or calling sequences to libzpaq +functions. Errors should be considered unrecoverable. + +=item C + +The user must create a class derived from Reader with an +implementation for C that reads one byte of input +and returns its value in the range 0...255, or returns +EOF (-1) at end of input. Objects of the derived type +would then be passed to functions that require a C. + +=item C + +The user must create a class derived from Writer with +an implementation of C which is expected to take +a byte value C in the range 0...255 and write it to +output. Objects of the derived type +would then be passed to functions that require a C. + +=back + +The following two functions are optional. Defining them +can improve performance slightly. + +=over + +=item C + +If defined, this function should input up to C bytes into +the array C and return the number actually read, in +the range 0..n. A return value of 0 indicates end of input. +If C is not defined, then the default implementation +will call C n times. + +=item C + +If defined, this function should output the elements C +through C in order.
If not defined, then the default +implementation will call C n times. + +=back + +=head2 Simple Compression + +In the remainder of this document, all classes and +functions are assumed to be in namespace C. + +=over + +=item C + +C compresses from C to C until C +returns EOF. It writes a single segment in a single block +with empty filename, comment, and checksum fields. C +must be 1, 2, or 3, to select models I, I, or +I respectively. Higher modes compress smaller but +take longer to compress and subsequently decompress. + +=item C + +C decompresses any valid ZPAQ stream from +C to C until C returns EOF. Any +non-ZPAQ data in the input is ignored. Any ZPAQ blocks +following non-ZPAQ must be preceded by a marker tag +to be recognized. Each block is decoded according to the +instructions in the block header. The contents of the +filename, comment, and checksum fields are ignored. +Data with bad checksums will be decoded anyway. If there +is more than one segment, then all of the output +data will be concatenated. + +=back + +=head2 class SHA1 + +The SHA1 class is used to compute SHA-1 checksums for compression +and verify them for decompression. It is believed to be +computationally infeasible to find two different strings +with the same hash value. Its member functions +are as follows: + +=over + +=item C + +The constructor creates a new SHA1 object representing the +hash of an empty string. + +=item C + +Appends one byte c (0...255) to the string whose hash is represented. + +=item C + +Returns the length (so far) of the string whose hash is represented. +The largest possible value returned is +2^61 - 1 = 2305843009213693951.0, but values larger than 2^53 = +9007199254740992.0 +will not be exact on systems using IEEE 64 bit floating point +representation of type C. The initial value is 0.0. + +=item C + +Returns the length (so far) as a 64 bit unsigned integer. + +=item C + +Computes the 20 byte SHA-1 hash and resets the string back +to a size of 0.0. 
The returned pointer points to an array +inside the SHA1 object whose +contents remain unchanged until the next call to C. + +=back + +=head2 class Compressor + +The C class has member functions to write +each of the syntactic elements of a ZPAQ stream and to specify +their values. It will compress using either built-in or +user supplied models. + +=over + +=item C + +The constructor creates a Compressor object. No input source, +output destination, or compression model is specified. + +=item C + +Specifies a destination for output. Must be specified before +calling any function that writes data. + +=item C + +Writes a 13 byte marker tag which can be used to identify +the start of a block following non-ZPAQ data. + +=item C + +Writes a block header and specifies a compression model. +If linked with F, then C must be 1, 2, or 3 +to specify I, I, or I respectively. Higher numbers +compress smaller but more slowly. These models are compatible +with both the ZPAQ level 1 and 2 standards. + +=item C + +Writes a block header and specifies the HCOMP portion of the +compression model. The first two bytes of the string should +encode the length of the rest of the string as a 16 bit unsigned +number with the least significant byte first. The meaning of the +rest of the string is defined in the ZPAQ level 2 standard. +If the number of components (C) is 0, then the block +is saved in ZPAQ level 2 format, which cannot be read by +older ZPAQ level 1 decoders. Otherwise the block is saved in +ZPAQ level 1 format, which is compatible with all decoders. + +=item C + +Writes a segment header. C and +C are NUL terminated strings. If specified, then their +values are stored. Normally, C would be a file name +when compressing to an archive or omitted otherwise. If a file +is split among segments, then by convention only the first segment +is named. C is normally the uncompressed size as a decimal +number which is displayed when listing the contents of an archive.
+Omitting it does not affect decompression. + +=item C + +Specifies the optional PCOMP string used for post-processing. +It must be called from within the first segment +of each block prior to compressing any data, but not from within +any other segment. +If C is 0 or no argument is passed, then the decompresser +will not post-process the data. The effect is to compress a +0 byte to indicate to the decompresser that no PCOMP string +is present. + +If C is not 0, then I bytes of the string I +are passed. If I is 0 or omitted, then +the first two bytes must encode +the length of the rest of the string as a 16 bit unsigned number +with the least significant byte first. The format of the remainder +of the string is described in the ZPAQ level 2 standard. +The effect is to compress a 1 byte +to indicate the presence of PCOMP, followed by the two length +bytes and the string as passed. For example, either +C or C +would compress the 5 bytes 1, 2, 0, 5, 8. +The user is responsible for pre-processing the input +prior to compression so that PCOMP restores the original data. + +=item C + +Specifies the input source for compression. It must be set +prior to the first call to C. + +=item C + +Compress n bytes of data, or until EOF is input, whichever comes +first. If n < 0 or omitted, then compress until EOF. +Returns true if there is more input available, or false if EOF +was read. + +=item C + +Stop compressing and write the end of a segment. If +C is specified, it should be a 20 byte string +as returned by C on the input data for +this segment I pre-processing. + +=item C + +Finish writing the current block. 
+ +=back + +In order to create a valid ZPAQ stream, the components must +be written in the following order: + + for each block do { + if any non-ZPAQ data then { + write non-ZPAQ data + writeTag() + } + startBlock() + for each segment do { + startSegment() + if first segment in block then { + postProcess() + } + while (compress(n)) ; + endSegment() + } + endBlock() + } + +=head2 class Decompresser + +The class Decompresser has member functions to read each of the +syntactic elements of a ZPAQ stream. + +=over + +=item C + +The constructor creates a Decompresser object. No input source or +output destination is specified. + +=item C + +Specifies where the ZPAQ stream will be read from. Must be called +before any function that reads the stream. + +=item C + +Scan the input to find the start of the next block. If a block +does not start immediately, then the block must be preceded by +a marker tag (written with C) or it will +not be found. If C is not 0, then write the approximate +memory requirement (in bytes) to decompress to C<*memptr>. The +memory will be allocated by the first call to C. +It returns true if a block is found, or false if it reads to EOF +without finding a block. + +=item C + +Write the HCOMP string of the current block to C. +It will be in a format suitable +for passing to C. The first 2 bytes will +encode the length of the rest of the string as a 16 bit unsigned +integer with the least significant byte first. The format of the +remainder of the string is described in the ZPAQ level 1 +specification. + +=item C + +Find the start of the next segment. If another segment is found +within the current block then return true. If the end of the block +is found first, then return false. If a segment is found, the +filename field is not empty, and C +is not 0, then write the filename (without a terminating NUL byte) +to C. + +=item C + +Read or skip past the comment field following the filename field +in the segment header.
If C is not 0 and the comment field is +not empty, then write the comment +(without a terminating NUL byte) to C. + +=item C + +Specify the destination for decompression. It must be set before +any data can be decompressed. + +=item C + +Specify the address of a SHA1 object for computing the checksum +of the decompressed data (after post-processing). As each byte C +is output, it is also passed to Cput(c)>. In order to +compute the correct checksum, the SHA1 object should be in its +initial state, either newly created, or by calling C, +before the first call to C. When the end of the segment +is reached, the value returned by Cresult()> should match +the stored checksum, if any. + +=item C + +Decode n bytes or until the end of segment, whichever comes +first. Return false if the end of segment is reached first. If +n < 0 or not specified, then decompress to the end of segment +and return false. C is the number of bytes prior to post-processing. +If the data is post-processed, then the size of the output may +be different. + +=item C + +Write the PCOMP string, if any, for the current block to C. +If there is no PCOMP string (no post-processor) then return false. +Otherwise write the string to C in a format suitable for +passing to C and return true. If written, +then the first 2 bytes will encode the length of the rest of the +string as a 16 bit unsigned integer with the least significant +byte first. The format of the rest of the string is described in +the ZPAQ level 1 standard. + +C is only valid after the first call to C +in the current block. To read the PCOMP string without decompressing any +data, call C first. It is not necessary to +call C in this case. + +=item C + +Skip any compressed data in the current segment that has not yet +been decompressed and advance to the end of the segment. +Then if C is not 0 then write into +the 21 byte array that it points to. If a checksum is present, +then write a 1 into C and write the stored checksum +in C.
Otherwise write a 0 in C. + +Note that it is not permitted to call decompress() if any compressed +data has been skipped in any earlier segments in the same block. + +=back + +A valid sequence of calls is as follows: + + while (findBlock()) { + while (findFilename()) { + readComment(); + if first segment in block then { (optional) + decompress(0) + pcomp() + } + while (decompress(n)) ; (optional) + readSegmentEnd(); + } + } + +=head1 EXAMPLES + +The following program F +lists the contents of a ZPAQ archive +read from standard input. + + #include <stdio.h> + #include <stdlib.h> + #include "libzpaq.h" + + // Implement Reader and Writer interfaces for file I/O + class File: public libzpaq::Reader, public libzpaq::Writer { + FILE* f; + public: + File(FILE* f_): f(f_) {} + int get() {return getc(f);} + void put(int c) {putc(c, f);} + int read(char* buf, int n) {return fread(buf, 1, n, f);} + void write(const char* buf, int n) {fwrite(buf, 1, n, f);} + }; + + // Implement error handler + namespace libzpaq { + void error(const char* msg) { + fprintf(stderr, "Error: %s\n", msg); + exit(1); + } + } + + // List the contents of an archive. For each block, show + // the memory required to decompress. For each segment, + // show the filename and comment. + void list(FILE* input, FILE* output) { + libzpaq::Decompresser d; + File in(input), out(output); + double memory; + d.setInput(&in); + for (int block=1; d.findBlock(&memory); ++block) { + printf("Block %d needs %1.0f MB\n", block, memory/1e6); + while (d.findFilename(&out)) { // print filename + printf("\t"); + d.readComment(&out); // print comment + printf("\n"); + d.readSegmentEnd(); // skip compressed data + } + } + } + + int main() { + list(stdin, stdout); + return 0; + } + +The program could be compiled as follows: + + g++ listzpaq.cpp libzpaq.cpp + +The following code compresses a list of files into one block +written to stdout. Each file is compressed to a separate +segment.
For each segment, the filename, comment, and SHA-1 +checksum are stored. The comment, as conventional, is the +file size as a decimal string. + + // Compress one file to one segment + void compress_file(libzpaq::Compressor& c, + const char* filename, + bool first_segment) { + + // Open input file + FILE* f; + f=fopen(filename, "rb"); + if (!f) return; + + // Compute SHA-1 checksum and file size + libzpaq::SHA1 sha1; + int ch; + while ((ch=getc(f))!=EOF) + sha1.put(ch); + + // Write file size as a comment. + // The size can have at most 19 digits. + char comment[20]; + sprintf(comment, "%1.0f", sha1.size()); + + // Compress segment + rewind(f); + File in(f); + c.startSegment(filename, comment); + if (first_segment) + c.postProcess(); + c.setInput(&in); + c.compress(); + c.endSegment(sha1.result()); + + // Close input file + fclose(f); + } + + // Compress a list of argc files in argv[0...argc-1] into one + // ZPAQ block to stdout at level 2. + void compress_list(int argc, char** argv) { + libzpaq::Compressor c; + File out(stdout); + c.setOutput(&out); + c.startBlock(2); + for (int i=0; i and C can +be passed an argument n to display progress every n bytes, +for example: + + for (int i=1; d.decompress(1000000); ++i) + fprintf(stderr, "Decompressed %d MB\n", i); + +To compress or decompress to and from objects in memory, derive +appropriate classes from C and C. For example, it is +possible to compress or decompress to a C using +the following class. + + struct String: public libzpaq::Writer { + std::string s; + void put(int c) {s+=char(c);} + }; + +This class is also useful for reading the filename and comment +fields during decompression as follows: + + String filename, comment; + while (d.findFilename(&filename)) { + d.readComment(&comment); + // ... + +=head1 AVAILABILITY + +I, I, and the ZPAQ level 1 and 2 specifications are +available from L. 
+ +=head1 SEE ALSO + +C +C + +=cut + + diff --git a/libzpaq.cpp b/libzpaq.cpp index f0c35d5..084a7aa 100644 --- a/libzpaq.cpp +++ b/libzpaq.cpp @@ -1,3181 +1,3181 @@ -/* libzpaq.cpp - Part of LIBZPAQ Version 5.01 - - Copyright (C) 2011, Dell Inc. Written by Matt Mahoney. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so without restriction. - This Software is provided "as is" without warranty. - -LIBZPAQ is a C++ library for compression and decompression of data -conforming to the ZPAQ level 2 standard. See http://mattmahoney.net/zpaq/ -*/ - -#include "libzpaq.h" -#include -#include -#include - -#ifndef NOJIT -#ifdef unix -#include -#else -#include -#endif -#endif - -namespace libzpaq { - -// Standard library redirections -void* calloc(size_t a, size_t b) {return ::calloc(a, b);} -void free(void* p) {::free(p);} -int memcmp(const void* d, const void* s, size_t n) { - return ::memcmp(d, s, n);} -void* memset(void* d, int c, size_t n) {return ::memset(d, c, n);} -double log(double x) {return ::log(x);} -double exp(double x) {return ::exp(x);} -double pow(double x, double y) {return ::pow(x, y);} - -// Read 16 bit little-endian number -int toU16(const char* p) { - return (p[0]&255)+256*(p[1]&255); -} - -// Default read() and write() -int Reader::read(char* buf, int n) { - int i=0, c; - while (i=0) - buf[i++]=c; - return i; -} - -void Writer::write(const char* buf, int n) { - for (int i=0; i 0 bytes of executable memory and update -// p to point to it and newsize = n. Free any previously -// allocated memory first. If newsize is 0 then free only. -// Call error in case of failure. 
If NOJIT, ignore newsize -// and set p=0, n=0 without allocating memory. -void allocx(U8* &p, int &n, int newsize) { -#ifdef NOJIT - p=0; - n=0; -#else - if (p || n) { - if (p) -#ifdef unix - munmap(p, n); -#else // Windows - VirtualFree(p, 0, MEM_RELEASE); -#endif - p=0; - n=0; - } - if (newsize>0) { -#ifdef unix - p=(U8*)mmap(0, newsize, PROT_READ|PROT_WRITE|PROT_EXEC, - MAP_PRIVATE|MAP_ANON, -1, 0); - if ((void*)p==MAP_FAILED) p=0; -#else - p=(U8*)VirtualAlloc(0, newsize, MEM_RESERVE|MEM_COMMIT, - PAGE_EXECUTE_READWRITE); -#endif - if (p) - n=newsize; - else { - n=0; - error("allocx failed"); - } - } -#endif -} - -//////////////////////////// SHA1 //////////////////////////// - -// SHA1 code, see http://en.wikipedia.org/wiki/SHA-1 - -// Start a new hash -void SHA1::init() { - len0=len1=0; - h[0]=0x67452301; - h[1]=0xEFCDAB89; - h[2]=0x98BADCFE; - h[3]=0x10325476; - h[4]=0xC3D2E1F0; -} - -// Return old result and start a new hash -const char* SHA1::result() { - - // pad and append length - const U32 s1=len1, s0=len0; - put(0x80); - while ((len0&511)!=448) - put(0); - put(s1>>24); - put(s1>>16); - put(s1>>8); - put(s1); - put(s0>>24); - put(s0>>16); - put(s0>>8); - put(s0); - - // copy h to hbuf - for (int i=0; i<5; ++i) { - hbuf[4*i]=h[i]>>24; - hbuf[4*i+1]=h[i]>>16; - hbuf[4*i+2]=h[i]>>8; - hbuf[4*i+3]=h[i]; - } - - // return hash prior to clearing state - init(); - return hbuf; -} - -// Hash 1 block of 64 bytes -void SHA1::process() { - for (int i=16; i<80; ++i) { - w[i]=w[i-3]^w[i-8]^w[i-14]^w[i-16]; - w[i]=w[i]<<1|w[i]>>31; - } - U32 a=h[0]; - U32 b=h[1]; - U32 c=h[2]; - U32 d=h[3]; - U32 e=h[4]; - const U32 k1=0x5A827999, k2=0x6ED9EBA1, k3=0x8F1BBCDC, k4=0xCA62C1D6; -#define f1(a,b,c,d,e,i) e+=(a<<5|a>>27)+((b&c)|(~b&d))+k1+w[i]; b=b<<30|b>>2; -#define f5(i) f1(a,b,c,d,e,i) f1(e,a,b,c,d,i+1) f1(d,e,a,b,c,i+2) \ - f1(c,d,e,a,b,i+3) f1(b,c,d,e,a,i+4) - f5(0) f5(5) f5(10) f5(15) -#undef f1 -#define f1(a,b,c,d,e,i) e+=(a<<5|a>>27)+(b^c^d)+k2+w[i]; b=b<<30|b>>2; 
- f5(20) f5(25) f5(30) f5(35) -#undef f1 -#define f1(a,b,c,d,e,i) e+=(a<<5|a>>27)+((b&c)|(b&d)|(c&d))+k3+w[i]; b=b<<30|b>>2; - f5(40) f5(45) f5(50) f5(55) -#undef f1 -#define f1(a,b,c,d,e,i) e+=(a<<5|a>>27)+(b^c^d)+k4+w[i]; b=b<<30|b>>2; - f5(60) f5(65) f5(70) f5(75) -#undef f1 -#undef f5 - h[0]+=a; - h[1]+=b; - h[2]+=c; - h[3]+=d; - h[4]+=e; -} - -//////////////////////////// Component /////////////////////// - -// A Component is a context model, indirect context model, match model, -// fixed weight mixer, adaptive 2 input mixer without or with current -// partial byte as context, adaptive m input mixer (without or with), -// or SSE (without or with). - -const int compsize[256]={0,2,3,2,3,4,6,6,3,5}; - -void Component::init() { - limit=cxt=a=b=c=0; - cm.resize(0); - ht.resize(0); - a16.resize(0); -} - -////////////////////////// StateTable ////////////////////////// - -// How many states with count of n0 zeros, n1 ones (0...2) -int StateTable::num_states(int n0, int n1) { - const int B=6; - const int bound[B]={20,48,15,8,6,5}; // n0 -> max n1, n1 -> max n0 - if (n0=B || n0>bound[n1]) return 0; - return 1+(n1>0 && n0+n1<=17); -} - -// New value of count n0 if 1 is observed (and vice versa) -void StateTable::discount(int& n0) { - n0=(n0>=1)+(n0>=2)+(n0>=3)+(n0>=4)+(n0>=5)+(n0>=7)+(n0>=8); -} - -// compute next n0,n1 (0 to N) given input y (0 or 1) -void StateTable::next_state(int& n0, int& n1, int y) { - if (n0 20,0 - // 48,1,0 -> 48,1 - // 15,2,0 -> 8,1 - // 8,3,0 -> 6,2 - // 8,3,1 -> 5,3 - // 6,4,0 -> 5,3 - // 5,5,0 -> 5,4 - // 5,5,1 -> 4,5 - while (!num_states(n0, n1)) { - if (n1<2) --n0; - else { - n0=(n0*(n1-1)+(n1/2))/n1; - --n1; - } - } - } -} - -// Initialize next state table ns[state*4] -> next if 0, next if 1, n0, n1 -StateTable::StateTable() { - - // Assign states by increasing priority - const int N=50; - U8 t[N][N][2]={{{0}}}; // (n0,n1,y) -> state number - int state=0; - for (int i=0; i=0 && n<=2); - if (n) { - t[n0][n1][0]=state; - 
t[n0][n1][1]=state+n-1; - state+=n; - } - } - } - - // Generate next state table - memset(ns, 0, sizeof(ns)); - for (int n0=0; n0=0 && s<256); - int s0=n0, s1=n1; - next_state(s0, s1, 0); - assert(s0>=0 && s0=0 && s1=0 && s0=0 && s1=7); - assert(hbegin>=cend); - assert(hend>=hbegin); - assert(out2); - if (!pp) { // if not a postprocessor then write COMP - for (int i=0; iput(header[i]); - } - else { // write PCOMP size only - out2->put((hend-hbegin)&255); - out2->put((hend-hbegin)>>8); - } - for (int i=hbegin; iput(header[i]); - return true; -} - -// Read header from in2 -int ZPAQL::read(Reader* in2) { - - // Get header size and allocate - int hsize=in2->get(); - hsize+=in2->get()*256; - header.resize(hsize+300); - cend=hbegin=hend=0; - header[cend++]=hsize&255; - header[cend++]=hsize>>8; - while (cend<7) header[cend++]=in2->get(); // hh hm ph pm n - - // Read COMP - int n=header[cend-1]; - for (int i=0; iget(); // component type - if (type==-1) error("unexpected end of file"); - header[cend++]=type; // component type - int size=compsize[type]; - if (size<1) error("Invalid component type"); - if (cend+size>header.isize()-8) error("COMP list too big"); - for (int j=1; jget(); - } - if ((header[cend++]=in2->get())!=0) error("missing COMP END"); - - // Insert a guard gap and read HCOMP - hbegin=hend=cend+128; - while (hendget(); - if (op==-1) error("unexpected end of file"); - header[hend++]=op; - } - if ((header[hend++]=in2->get())!=0) error("missing HCOMP END"); - assert(cend>=7 && cendhbegin && hend6); - assert(output==0); - assert(sha1==0); - init(header[2], header[3]); // hh, hm -} - -// Initialize machine state as PCOMP -void ZPAQL::initp() { - assert(header.isize()>6); - init(header[4], header[5]); // ph, pm -} - -// Flush pending output -void ZPAQL::flush() { - if (output) output->write(&outbuf[0], bufptr); - if (sha1) for (int i=0; iput(U8(outbuf[i])); - bufptr=0; -} - -// Return memory requirement in bytes -double ZPAQL::memory() { - double 
mem=pow(2.0,header[2]+2)+pow(2.0,header[3]) // hh hm - +pow(2.0,header[4]+2)+pow(2.0,header[5]) // ph pm - +header.size(); - int cp=7; // start of comp list - for (int i=0; i0); - assert(cend>=7); - assert(hbegin>=cend+128); - assert(hend>=hbegin); - assert(hend0); - h.resize(1, hbits); - m.resize(1, mbits); - r.resize(256); - a=b=c=d=pc=f=0; -} - -// Run program on input by interpreting header -void ZPAQL::run0(U32 input) { - assert(cend>6); - assert(hbegin>=cend+128); - assert(hend>=hbegin); - assert(hend0); - assert(h.size()>0); - assert(header[0]+256*header[1]==cend+hend-hbegin-2); - pc=hbegin; - a=input; - while (execute()) ; -} - -// Execute one instruction, return 0 after HALT else 1 -int ZPAQL::execute() { - switch(header[pc++]) { - case 0: err(); break; // ERROR - case 1: ++a; break; // A++ - case 2: --a; break; // A-- - case 3: a = ~a; break; // A! - case 4: a = 0; break; // A=0 - case 7: a = r[header[pc++]]; break; // A=R N - case 8: swap(b); break; // B<>A - case 9: ++b; break; // B++ - case 10: --b; break; // B-- - case 11: b = ~b; break; // B! - case 12: b = 0; break; // B=0 - case 15: b = r[header[pc++]]; break; // B=R N - case 16: swap(c); break; // C<>A - case 17: ++c; break; // C++ - case 18: --c; break; // C-- - case 19: c = ~c; break; // C! - case 20: c = 0; break; // C=0 - case 23: c = r[header[pc++]]; break; // C=R N - case 24: swap(d); break; // D<>A - case 25: ++d; break; // D++ - case 26: --d; break; // D-- - case 27: d = ~d; break; // D! - case 28: d = 0; break; // D=0 - case 31: d = r[header[pc++]]; break; // D=R N - case 32: swap(m(b)); break; // *B<>A - case 33: ++m(b); break; // *B++ - case 34: --m(b); break; // *B-- - case 35: m(b) = ~m(b); break; // *B! - case 36: m(b) = 0; break; // *B=0 - case 39: if (f) pc+=((header[pc]+128)&255)-127; else ++pc; break; // JT N - case 40: swap(m(c)); break; // *C<>A - case 41: ++m(c); break; // *C++ - case 42: --m(c); break; // *C-- - case 43: m(c) = ~m(c); break; // *C! 
- case 44: m(c) = 0; break; // *C=0 - case 47: if (!f) pc+=((header[pc]+128)&255)-127; else ++pc; break; // JF N - case 48: swap(h(d)); break; // *D<>A - case 49: ++h(d); break; // *D++ - case 50: --h(d); break; // *D-- - case 51: h(d) = ~h(d); break; // *D! - case 52: h(d) = 0; break; // *D=0 - case 55: r[header[pc++]] = a; break; // R=A N - case 56: return 0 ; // HALT - case 57: outc(a&255); break; // OUT - case 59: a = (a+m(b)+512)*773; break; // HASH - case 60: h(d) = (h(d)+a+512)*773; break; // HASHD - case 63: pc+=((header[pc]+128)&255)-127; break; // JMP N - case 64: a = a; break; // A=A - case 65: a = b; break; // A=B - case 66: a = c; break; // A=C - case 67: a = d; break; // A=D - case 68: a = m(b); break; // A=*B - case 69: a = m(c); break; // A=*C - case 70: a = h(d); break; // A=*D - case 71: a = header[pc++]; break; // A= N - case 72: b = a; break; // B=A - case 73: b = b; break; // B=B - case 74: b = c; break; // B=C - case 75: b = d; break; // B=D - case 76: b = m(b); break; // B=*B - case 77: b = m(c); break; // B=*C - case 78: b = h(d); break; // B=*D - case 79: b = header[pc++]; break; // B= N - case 80: c = a; break; // C=A - case 81: c = b; break; // C=B - case 82: c = c; break; // C=C - case 83: c = d; break; // C=D - case 84: c = m(b); break; // C=*B - case 85: c = m(c); break; // C=*C - case 86: c = h(d); break; // C=*D - case 87: c = header[pc++]; break; // C= N - case 88: d = a; break; // D=A - case 89: d = b; break; // D=B - case 90: d = c; break; // D=C - case 91: d = d; break; // D=D - case 92: d = m(b); break; // D=*B - case 93: d = m(c); break; // D=*C - case 94: d = h(d); break; // D=*D - case 95: d = header[pc++]; break; // D= N - case 96: m(b) = a; break; // *B=A - case 97: m(b) = b; break; // *B=B - case 98: m(b) = c; break; // *B=C - case 99: m(b) = d; break; // *B=D - case 100: m(b) = m(b); break; // *B=*B - case 101: m(b) = m(c); break; // *B=*C - case 102: m(b) = h(d); break; // *B=*D - case 103: m(b) = header[pc++]; break; // 
*B= N - case 104: m(c) = a; break; // *C=A - case 105: m(c) = b; break; // *C=B - case 106: m(c) = c; break; // *C=C - case 107: m(c) = d; break; // *C=D - case 108: m(c) = m(b); break; // *C=*B - case 109: m(c) = m(c); break; // *C=*C - case 110: m(c) = h(d); break; // *C=*D - case 111: m(c) = header[pc++]; break; // *C= N - case 112: h(d) = a; break; // *D=A - case 113: h(d) = b; break; // *D=B - case 114: h(d) = c; break; // *D=C - case 115: h(d) = d; break; // *D=D - case 116: h(d) = m(b); break; // *D=*B - case 117: h(d) = m(c); break; // *D=*C - case 118: h(d) = h(d); break; // *D=*D - case 119: h(d) = header[pc++]; break; // *D= N - case 128: a += a; break; // A+=A - case 129: a += b; break; // A+=B - case 130: a += c; break; // A+=C - case 131: a += d; break; // A+=D - case 132: a += m(b); break; // A+=*B - case 133: a += m(c); break; // A+=*C - case 134: a += h(d); break; // A+=*D - case 135: a += header[pc++]; break; // A+= N - case 136: a -= a; break; // A-=A - case 137: a -= b; break; // A-=B - case 138: a -= c; break; // A-=C - case 139: a -= d; break; // A-=D - case 140: a -= m(b); break; // A-=*B - case 141: a -= m(c); break; // A-=*C - case 142: a -= h(d); break; // A-=*D - case 143: a -= header[pc++]; break; // A-= N - case 144: a *= a; break; // A*=A - case 145: a *= b; break; // A*=B - case 146: a *= c; break; // A*=C - case 147: a *= d; break; // A*=D - case 148: a *= m(b); break; // A*=*B - case 149: a *= m(c); break; // A*=*C - case 150: a *= h(d); break; // A*=*D - case 151: a *= header[pc++]; break; // A*= N - case 152: div(a); break; // A/=A - case 153: div(b); break; // A/=B - case 154: div(c); break; // A/=C - case 155: div(d); break; // A/=D - case 156: div(m(b)); break; // A/=*B - case 157: div(m(c)); break; // A/=*C - case 158: div(h(d)); break; // A/=*D - case 159: div(header[pc++]); break; // A/= N - case 160: mod(a); break; // A%=A - case 161: mod(b); break; // A%=B - case 162: mod(c); break; // A%=C - case 163: mod(d); break; // 
A%=D - case 164: mod(m(b)); break; // A%=*B - case 165: mod(m(c)); break; // A%=*C - case 166: mod(h(d)); break; // A%=*D - case 167: mod(header[pc++]); break; // A%= N - case 168: a &= a; break; // A&=A - case 169: a &= b; break; // A&=B - case 170: a &= c; break; // A&=C - case 171: a &= d; break; // A&=D - case 172: a &= m(b); break; // A&=*B - case 173: a &= m(c); break; // A&=*C - case 174: a &= h(d); break; // A&=*D - case 175: a &= header[pc++]; break; // A&= N - case 176: a &= ~ a; break; // A&~A - case 177: a &= ~ b; break; // A&~B - case 178: a &= ~ c; break; // A&~C - case 179: a &= ~ d; break; // A&~D - case 180: a &= ~ m(b); break; // A&~*B - case 181: a &= ~ m(c); break; // A&~*C - case 182: a &= ~ h(d); break; // A&~*D - case 183: a &= ~ header[pc++]; break; // A&~ N - case 184: a |= a; break; // A|=A - case 185: a |= b; break; // A|=B - case 186: a |= c; break; // A|=C - case 187: a |= d; break; // A|=D - case 188: a |= m(b); break; // A|=*B - case 189: a |= m(c); break; // A|=*C - case 190: a |= h(d); break; // A|=*D - case 191: a |= header[pc++]; break; // A|= N - case 192: a ^= a; break; // A^=A - case 193: a ^= b; break; // A^=B - case 194: a ^= c; break; // A^=C - case 195: a ^= d; break; // A^=D - case 196: a ^= m(b); break; // A^=*B - case 197: a ^= m(c); break; // A^=*C - case 198: a ^= h(d); break; // A^=*D - case 199: a ^= header[pc++]; break; // A^= N - case 200: a <<= (a&31); break; // A<<=A - case 201: a <<= (b&31); break; // A<<=B - case 202: a <<= (c&31); break; // A<<=C - case 203: a <<= (d&31); break; // A<<=D - case 204: a <<= (m(b)&31); break; // A<<=*B - case 205: a <<= (m(c)&31); break; // A<<=*C - case 206: a <<= (h(d)&31); break; // A<<=*D - case 207: a <<= (header[pc++]&31); break; // A<<= N - case 208: a >>= (a&31); break; // A>>=A - case 209: a >>= (b&31); break; // A>>=B - case 210: a >>= (c&31); break; // A>>=C - case 211: a >>= (d&31); break; // A>>=D - case 212: a >>= (m(b)&31); break; // A>>=*B - case 213: a >>= 
(m(c)&31); break; // A>>=*C - case 214: a >>= (h(d)&31); break; // A>>=*D - case 215: a >>= (header[pc++]&31); break; // A>>= N - case 216: f = (a == a); break; // A==A - case 217: f = (a == b); break; // A==B - case 218: f = (a == c); break; // A==C - case 219: f = (a == d); break; // A==D - case 220: f = (a == U32(m(b))); break; // A==*B - case 221: f = (a == U32(m(c))); break; // A==*C - case 222: f = (a == h(d)); break; // A==*D - case 223: f = (a == U32(header[pc++])); break; // A== N - case 224: f = (a < a); break; // A a); break; // A>A - case 233: f = (a > b); break; // A>B - case 234: f = (a > c); break; // A>C - case 235: f = (a > d); break; // A>D - case 236: f = (a > U32(m(b))); break; // A>*B - case 237: f = (a > U32(m(c))); break; // A>*C - case 238: f = (a > h(d)); break; // A>*D - case 239: f = (a > U32(header[pc++])); break; // A> N - case 255: if((pc=hbegin+header[pc]+256*header[pc+1])>=hend)err();break;//LJ - default: err(); - } - return 1; -} - -// Print illegal instruction error message and exit -void ZPAQL::err() { - error("ZPAQL execution error"); -} - -///////////////////////// Predictor ///////////////////////// - -// Initailize model-independent tables -Predictor::Predictor(ZPAQL& zr): - c8(1), hmap4(1), z(zr) { - assert(sizeof(U8)==1); - assert(sizeof(U16)==2); - assert(sizeof(U32)==4); - assert(sizeof(U64)==8); - assert(sizeof(short)==2); - assert(sizeof(int)==4); - - // Initialize tables - dt2k[0]=0; - for (int i=1; i<256; ++i) - dt2k[i]=2048/i; - for (int i=0; i<1024; ++i) - dt[i]=(1<<17)/(i*2+3)*2; - for (int i=0; i<32768; ++i) - stretcht[i]=int(log((i+0.5)/(32767.5-i))*64+0.5+100000)-100000; - for (int i=0; i<4096; ++i) - squasht[i]=int(32768.0/(1+exp((i-2048)*(-1.0/64)))); - - // Verify floating point math for squash() and stretch() - U32 sqsum=0, stsum=0; - for (int i=32767; i>=0; --i) - stsum=stsum*3+stretch(i); - for (int i=4095; i>=0; --i) - sqsum=sqsum*3+squash(i-2048); - assert(stsum==3887533746u); - 
assert(sqsum==2278286169u); - - pcode=0; - pcode_size=0; -} - -Predictor::~Predictor() { - allocx(pcode, pcode_size, 0); // free executable memory -} - -// Initialize the predictor with a new model in z -void Predictor::init() { - - // Clear old JIT code if any - allocx(pcode, pcode_size, 0); - - // Initialize context hash function - z.inith(); - - // Initialize predictions - for (int i=0; i<256; ++i) h[i]=p[i]=0; - - // Initialize components - for (int i=0; i<256; ++i) // clear old model - comp[i].init(); - int n=z.header[6]; // hsize[0..1] hh hm ph pm n (comp)[n] END 0[128] (hcomp) END - const U8* cp=&z.header[7]; // start of component list - for (int i=0; i&z.header[0] && cp<&z.header[z.header.isize()-8]); - Component& cr=comp[i]; - switch(cp[0]) { - case CONS: // c - p[i]=(cp[1]-128)*4; - break; - case CM: // sizebits limit - if (cp[1]>32) error("max size for CM is 32"); - cr.cm.resize(1, cp[1]); // packed CM (22 bits) + CMCOUNT (10 bits) - cr.limit=cp[2]*4; - for (size_t j=0; j26) error("max size for ICM is 26"); - cr.limit=1023; - cr.cm.resize(256); - cr.ht.resize(64, cp[1]); - for (size_t j=0; j32 || cp[2]>32) error("max size for MATCH is 32 32"); - cr.cm.resize(1, cp[1]); // index - cr.ht.resize(1, cp[2]); // buf - cr.ht(0)=1; - break; - case AVG: // j k wt - if (cp[1]>=i) error("AVG j >= i"); - if (cp[2]>=i) error("AVG k >= i"); - break; - case MIX2: // sizebits j k rate mask - if (cp[1]>32) error("max size for MIX2 is 32"); - if (cp[3]>=i) error("MIX2 k >= i"); - if (cp[2]>=i) error("MIX2 j >= i"); - cr.c=(size_t(1)<32) error("max size for MIX is 32"); - if (cp[2]>=i) error("MIX j >= i"); - if (cp[3]<1 || cp[3]>i-cp[2]) error("MIX m not in 1..i-j"); - int m=cp[3]; // number of inputs - assert(m>=1); - cr.c=(size_t(1)<32) error("max size for ISSE is 32"); - if (cp[2]>=i) error("ISSE j >= i"); - cr.ht.resize(64, cp[1]); - cr.cm.resize(512); - for (int j=0; j<256; ++j) { - cr.cm[j*2]=1<<15; - cr.cm[j*2+1]=clamp512k(stretch(st.cminit(j)>>8)<<10); - } - break; 
- case SSE: // sizebits j start limit - if (cp[1]>32) error("max size for SSE is 32"); - if (cp[2]>=i) error("SSE j >= i"); - if (cp[3]>cp[4]*4) error("SSE start > limit*4"); - cr.cm.resize(32, cp[1]); - cr.limit=cp[4]*4; - for (size_t j=0; j0); - cp+=compsize[*cp]; - assert(cp>=&z.header[7] && cp<&z.header[z.cend]); - } -} - -// Return next bit prediction using interpreted COMP code -int Predictor::predict0() { - assert(c8>=1 && c8<=255); - - // Predict next bit - int n=z.header[6]; - assert(n>0 && n<=255); - const U8* cp=&z.header[7]; - assert(cp[-1]==n); - for (int i=0; i&z.header[0] && cp<&z.header[z.header.isize()-8]); - Component& cr=comp[i]; - switch(cp[0]) { - case CONS: // c - break; - case CM: // sizebits limit - cr.cxt=h[i]^hmap4; - p[i]=stretch(cr.cm(cr.cxt)>>17); - break; - case ICM: // sizebits - assert((hmap4&15)>0); - if (c8==1 || (c8&0xf0)==16) cr.c=find(cr.ht, cp[1]+2, h[i]+16*c8); - cr.cxt=cr.ht[cr.c+(hmap4&15)]; - p[i]=stretch(cr.cm(cr.cxt)>>8); - break; - case MATCH: // sizebits bufbits: a=len, b=offset, c=bit, cxt=bitpos, - // ht=buf, limit=pos - assert(cr.cm.size()==(size_t(1)<>(7-cr.cxt))&1; // predicted bit - p[i]=stretch(dt2k[cr.a]*(cr.c*-2+1)&32767); - } - break; - case AVG: // j k wt - p[i]=(p[cp[1]]*cp[3]+p[cp[2]]*(256-cp[3]))>>8; - break; - case MIX2: { // sizebits j k rate mask - // c=size cm=wt[size] cxt=input - cr.cxt=((h[i]+(c8&cp[5]))&(cr.c-1)); - assert(cr.cxt=0 && w<65536); - p[i]=(w*p[cp[2]]+(65536-w)*p[cp[3]])>>16; - assert(p[i]>=-2048 && p[i]<2048); - } - break; - case MIX: { // sizebits j m rate mask - // c=size cm=wt[size][m] cxt=index of wt in cm - int m=cp[3]; - assert(m>=1 && m<=i); - cr.cxt=h[i]+(c8&cp[5]); - cr.cxt=(cr.cxt&(cr.c-1))*m; // pointer to row of weights - assert(cr.cxt<=cr.cm.size()-m); - int* wt=(int*)&cr.cm[cr.cxt]; - p[i]=0; - for (int j=0; j>8)*p[cp[2]+j]; - p[i]=clamp2k(p[i]>>8); - } - break; - case ISSE: { // sizebits j -- c=hi, cxt=bh - assert((hmap4&15)>0); - if (c8==1 || (c8&0xf0)==16) - 
cr.c=find(cr.ht, cp[1]+2, h[i]+16*c8); - cr.cxt=cr.ht[cr.c+(hmap4&15)]; // bit history - int *wt=(int*)&cr.cm[cr.cxt*2]; - p[i]=clamp2k((wt[0]*p[cp[2]]+wt[1]*64)>>16); - } - break; - case SSE: { // sizebits j start limit - cr.cxt=(h[i]+c8)*32; - int pq=p[cp[2]]+992; - if (pq<0) pq=0; - if (pq>1983) pq=1983; - int wt=pq&63; - pq>>=6; - assert(pq>=0 && pq<=30); - cr.cxt+=pq; - p[i]=stretch(((cr.cm(cr.cxt)>>10)*(64-wt)+(cr.cm(cr.cxt+1)>>10)*wt)>>13); - cr.cxt+=wt>>5; - } - break; - default: - error("component predict not implemented"); - } - cp+=compsize[cp[0]]; - assert(cp<&z.header[z.cend]); - assert(p[i]>=-2048 && p[i]<2048); - } - assert(cp[0]==NONE); - return squash(p[n-1]); -} - -// Update model with decoded bit y (0...1) -void Predictor::update0(int y) { - assert(y==0 || y==1); - assert(c8>=1 && c8<=255); - assert(hmap4>=1 && hmap4<=511); - - // Update components - const U8* cp=&z.header[7]; - int n=z.header[6]; - assert(n>=1 && n<=255); - assert(cp[-1]==n); - for (int i=0; i>8))>>2; - } - break; - case MATCH: // sizebits bufbits: - // a=len, b=offset, c=bit, cm=index, cxt=bitpos - // ht=buf, limit=pos - { - assert(cr.a<=255); - assert(cr.c==0 || cr.c==1); - assert(cr.cxt<8); - assert(cr.cm.size()==(size_t(1)<>5; - int w=cr.a16[cr.cxt]; - w+=(err*(p[cp[2]]-p[cp[3]])+(1<<12))>>13; - if (w<0) w=0; - if (w>65535) w=65535; - cr.a16[cr.cxt]=w; - } - break; - case MIX: { // sizebits j m rate mask - // cm=wt[size][m], cxt=input - int m=cp[3]; - assert(m>0 && m<=i); - assert(cr.cm.size()==m*cr.c); - assert(cr.cxt+m<=cr.cm.size()); - int err=(y*32767-squash(p[i]))*cp[4]>>4; - int* wt=(int*)&cr.cm[cr.cxt]; - for (int j=0; j>13)); - } - break; - case ISSE: { // sizebits j -- c=hi, cxt=bh - assert(cr.cxt==cr.ht[cr.c+(hmap4&15)]); - int err=y*32767-squash(p[i]); - int *wt=(int*)&cr.cm[cr.cxt*2]; - wt[0]=clamp512k(wt[0]+((err*p[cp[2]]+(1<<12))>>13)); - wt[1]=clamp512k(wt[1]+((err+16)>>5)); - cr.ht[cr.c+(hmap4&15)]=st.next(cr.cxt, y); - } - break; - case SSE: // sizebits j 
start limit - train(cr, y); - break; - default: - assert(0); - } - cp+=compsize[cp[0]]; - assert(cp>=&z.header[7] && cp<&z.header[z.cend] - && cp<&z.header[z.header.isize()-8]); - } - assert(cp[0]==NONE); - - // Save bit y in c8, hmap4 - c8+=c8+y; - if (c8>=256) { - z.run(c8-256); - hmap4=1; - c8=1; - for (int i=0; i=16 && c8<32) - hmap4=(hmap4&0xf)<<5|y<<4|1; - else - hmap4=(hmap4&0x1f0)|(((hmap4&0xf)*2+y)&0xf); -} - -// Find cxt row in hash table ht. ht has rows of 16 indexed by the -// low sizebits of cxt with element 0 having the next higher 8 bits for -// collision detection. If not found after 3 adjacent tries, replace the -// row with lowest element 1 as priority. Return index of row. -size_t Predictor::find(Array& ht, int sizebits, U32 cxt) { - assert(ht.size()==size_t(16)<>sizebits&255; - size_t h0=(cxt*16)&(ht.size()-16); - if (ht[h0]==chk) return h0; - size_t h1=h0^16; - if (ht[h1]==chk) return h1; - size_t h2=h0^32; - if (ht[h2]==chk) return h2; - if (ht[h0+1]<=ht[h1+1] && ht[h0+1]<=ht[h2+1]) - return memset(&ht[h0], 0, 16), ht[h0]=chk, h0; - else if (ht[h1+1]get(); - if (c<0) error("unexpected end of input"); - curr=curr<<8|c; - } - } - U32 n=buf.size(); - if (n>curr) n=curr; - high=in->read(&buf[0], n); - curr-=high; - low=0; -} - -// Return next bit of decoded input, which has 16 bit probability p of being 1 -int Decoder::decode(int p) { - assert(p>=0 && p<65536); - assert(high>low && low>0); - if (currhigh) error("archive corrupted"); - assert(curr>=low && curr<=high); - U32 mid=low+U32(((high-low)*U64(U32(p)))>>16); // split range - assert(high>mid && mid>=low); - int y=curr<=mid; - if (y) high=mid; else low=mid+1; // pick half - while ((high^low)<0x1000000) { // shift out identical leading bytes - high=high<<8|255; - low=low<<8; - low+=(low==0); - int c=in->get(); - if (c<0) error("unexpected end of file"); - curr=curr<<8|c; - } - return y; -} - -// Decompress 1 byte or -1 at end of input -int Decoder::decompress() { - if (pr.isModeled()) { // n>0 
components? - if (curr==0) { // segment initialization - for (int i=0; i<4; ++i) - curr=curr<<8|in->get(); - } - if (decode(0)) { - if (curr!=0) error("decoding end of stream"); - return -1; - } - else { - int c=1; - while (c<256) { // get 8 bits - int p=pr.predict()*2+1; - c+=c+decode(p); - pr.update(c&1); - } - return c-256; - } - } - else { - if (low==high) loadbuf(); - if (low==high) return -1; - return buf[low++]&255; - } -} - -// Find end of compressed data and return next byte -int Decoder::skip() { - int c=-1; - if (pr.isModeled()) { - while (curr==0) // at start? - curr=in->get(); - while (curr && (c=in->get())>=0) // find 4 zeros - curr=curr<<8|c; - while ((c=in->get())==0) ; // might be more than 4 - return c; - } - else { - if (curr==0) // at start? - for (int i=0; i<4 && (c=in->get())>=0; ++i) curr=curr<<8|c; - while (curr>0) { - U32 n=BUFSIZE; - if (n>curr) n=curr; - U32 n1=in->read(&buf[0], n); - curr-=n1; - if (n1!=n) return -1; - if (curr==0) - for (int i=0; i<4 && (c=in->get())>=0; ++i) curr=curr<<8|c; - } - if (c>=0) c=in->get(); - return c; - } -} - -////////////////////// PostProcessor ////////////////////// - -// Copy ph, pm from block header -void PostProcessor::init(int h, int m) { - state=hsize=0; - ph=h; - pm=m; - z.clear(); -} - -// (PASS=0 | PROG=1 psize[0..1] pcomp[0..psize-1]) data... 
EOB=-1 -// Return state: 1=PASS, 2..4=loading PROG, 5=PROG loaded -int PostProcessor::write(int c) { - assert(c>=-1 && c<=255); - switch (state) { - case 0: // initial state - if (c<0) error("Unexpected EOS"); - state=c+1; // 1=PASS, 2=PROG - if (state>2) error("unknown post processing type"); - if (state==1) z.clear(); - break; - case 1: // PASS - z.outc(c); - break; - case 2: // PROG - if (c<0) error("Unexpected EOS"); - hsize=c; // low byte of size - state=3; - break; - case 3: // PROG psize[0] - if (c<0) error("Unexpected EOS"); - hsize+=c*256; // high byte of psize - z.header.resize(hsize+300); - z.cend=8; - z.hbegin=z.hend=z.cend+128; - z.header[4]=ph; - z.header[5]=pm; - state=4; - break; - case 4: // PROG psize[0..1] pcomp[0...] - if (c<0) error("Unexpected EOS"); - assert(z.hend>8; - z.initp(); - state=5; - } - break; - case 5: // PROG ... data - z.run(c); - if (c<0) z.flush(); - break; - } - return state; -} - -/////////////////////// Decompresser ///////////////////// - -// Find the start of a block and return true if found. Set memptr -// to memory used. -bool Decompresser::findBlock(double* memptr) { - assert(state==BLOCK); - - // Find start of block - U32 h1=0x3D49B113, h2=0x29EB7F93, h3=0x2614BE13, h4=0x3828EB13; - // Rolling hashes initialized to hash of first 13 bytes - int c; - while ((c=dec.in->get())!=-1) { - h1=h1*12+c; - h2=h2*20+c; - h3=h3*28+c; - h4=h4*44+c; - if (h1==0xB16B88F1 && h2==0xFF5376F1 && h3==0x72AC5BF1 && h4==0x2F909AF1) - break; // hash of 16 byte string - } - if (c==-1) return false; - - // Read header - if ((c=dec.in->get())!=1 && c!=2) error("unsupported ZPAQ level"); - if (dec.in->get()!=1) error("unsupported ZPAQL type"); - z.read(dec.in); - if (c==1 && z.header.isize()>6 && z.header[6]==0) - error("ZPAQ level 1 requires at least 1 component"); - if (memptr) *memptr=z.memory(); - state=FILENAME; - decode_state=FIRSTSEG; - return true; -} - -// Read the start of a segment (1) or end of block code (255). 
-// If a segment is found, write the filename and return true, else false. -bool Decompresser::findFilename(Writer* filename) { - assert(state==FILENAME); - int c=dec.in->get(); - if (c==1) { // segment found - while (true) { - c=dec.in->get(); - if (c==-1) error("unexpected EOF"); - if (c==0) { - state=COMMENT; - return true; - } - if (filename) filename->put(c); - } - } - else if (c==255) { // end of block found - state=BLOCK; - return false; - } - else - error("missing segment or end of block"); - return false; -} - -// Read the comment from the segment header -void Decompresser::readComment(Writer* comment) { - assert(state==COMMENT); - state=DATA; - while (true) { - int c=dec.in->get(); - if (c==-1) error("unexpected EOF"); - if (c==0) break; - if (comment) comment->put(c); - } - if (dec.in->get()!=0) error("missing reserved byte"); -} - -// Decompress n bytes, or all if n < 0. Return false if done -bool Decompresser::decompress(int n) { - assert(state==DATA); - assert(decode_state!=SKIP); - - // Initialize models to start decompressing block - if (decode_state==FIRSTSEG) { - dec.init(); - assert(z.header.size()>5); - pp.init(z.header[4], z.header[5]); - decode_state=SEG; - } - - // Decompress and load PCOMP into postprocessor - while ((pp.getState()&3)!=1) - pp.write(dec.decompress()); - - // Decompress n bytes, or all if n < 0 - while (n) { - int c=dec.decompress(); - pp.write(c); - if (c==-1) { - state=SEGEND; - return false; - } - if (n>0) --n; - } - return true; -} - -// Read end of block. If a SHA1 checksum is present, write 1 and the -// 20 byte checksum into sha1string, else write 0 in first byte. -// If sha1string is 0 then discard it. 
-void Decompresser::readSegmentEnd(char* sha1string) { - assert(state==DATA || state==SEGEND); - - // Skip remaining data if any and get next byte - int c=0; - if (state==DATA) { - c=dec.skip(); - decode_state=SKIP; - } - else if (state==SEGEND) - c=dec.in->get(); - state=FILENAME; - - // Read checksum - if (c==254) { - if (sha1string) sha1string[0]=0; // no checksum - } - else if (c==253) { - if (sha1string) sha1string[0]=1; - for (int i=1; i<=20; ++i) { - c=dec.in->get(); - if (sha1string) sha1string[i]=c; - } - } - else - error("missing end of segment marker"); -} - -/////////////////////////// decompress() ///////////////////// - -void decompress(Reader* in, Writer* out) { - Decompresser d; - d.setInput(in); - d.setOutput(out); - while (d.findBlock()) { // don't calculate memory - while (d.findFilename()) { // discard filename - d.readComment(); // discard comment - d.decompress(); // to end of segment - d.readSegmentEnd(); // discard sha1string - } - } -} - -////////////////////// Encoder //////////////////// - -// Initialize for start of block -void Encoder::init() { - low=1; - high=0xFFFFFFFF; - pr.init(); - if (!pr.isModeled()) low=0, buf.resize(1<<16); -} - -// compress bit y having probability p/64K -void Encoder::encode(int y, int p) { - assert(out); - assert(p>=0 && p<65536); - assert(y==0 || y==1); - assert(high>low && low>0); - U32 mid=low+U32(((high-low)*U64(U32(p)))>>16); // split range - assert(high>mid && mid>=low); - if (y) high=mid; else low=mid+1; // pick half - while ((high^low)<0x1000000) { // write identical leading bytes - out->put(high>>24); // same as low>>24 - high=high<<8|255; - low=low<<8; - low+=(low==0); // so we don't code 4 0 bytes in a row - } -} - -// compress byte c (0..255 or -1=EOS) -void Encoder::compress(int c) { - assert(out); - if (pr.isModeled()) { - if (c==-1) - encode(1, 0); - else { - assert(c>=0 && c<=255); - encode(0, 0); - for (int i=7; i>=0; --i) { - int p=pr.predict()*2+1; - assert(p>0 && p<65536); - int y=c>>i&1; 
- encode(y, p); - pr.update(y); - } - } - } - else { - if (c<0 || low==buf.size()) { - out->put((low>>24)&255); - out->put((low>>16)&255); - out->put((low>>8)&255); - out->put(low&255); - out->write(&buf[0], low); - low=0; - } - if (c>=0) buf[low++]=c; - } -} - -///////////////////// Compressor ////////////////////// - -// Write 13 byte start tag -// "\x37\x6B\x53\x74\xA0\x31\x83\xD3\x8C\xB2\x28\xB0\xD3" -void Compressor::writeTag() { - assert(state==INIT); - enc.out->put(0x37); - enc.out->put(0x6b); - enc.out->put(0x53); - enc.out->put(0x74); - enc.out->put(0xa0); - enc.out->put(0x31); - enc.out->put(0x83); - enc.out->put(0xd3); - enc.out->put(0x8c); - enc.out->put(0xb2); - enc.out->put(0x28); - enc.out->put(0xb0); - enc.out->put(0xd3); -} - -void Compressor::startBlock(int level) { - - // Model 1 - min.cfg - static const char models[]={ - 26,0,1,2,0,0,2,3,16,8,19,0,0,96,4,28, - 59,10,59,112,25,10,59,10,59,112,56,0, - - // Model 2 - mid.cfg - 69,0,3,3,0,0,8,3,5,8,13,0,8,17,1,8, - 18,2,8,18,3,8,19,4,4,22,24,7,16,0,7,24, - -1,0,17,104,74,4,95,1,59,112,10,25,59,112,10,25, - 59,112,10,25,59,112,10,25,59,112,10,25,59,10,59,112, - 25,69,-49,8,112,56,0, - - // Model 3 - max.cfg - -60,0,5,9,0,0,22,1,-96,3,5,8,13,1,8,16, - 2,8,18,3,8,19,4,8,19,5,8,20,6,4,22,24, - 3,17,8,19,9,3,13,3,13,3,13,3,14,7,16,0, - 15,24,-1,7,8,0,16,10,-1,6,0,15,16,24,0,9, - 8,17,32,-1,6,8,17,18,16,-1,9,16,19,32,-1,6, - 0,19,20,16,0,0,17,104,74,4,95,2,59,112,10,25, - 59,112,10,25,59,112,10,25,59,112,10,25,59,112,10,25, - 59,10,59,112,10,25,59,112,10,25,69,-73,32,-17,64,47, - 14,-25,91,47,10,25,60,26,48,-122,-105,20,112,63,9,70, - -33,0,39,3,25,112,26,52,25,25,74,10,4,59,112,25, - 10,4,59,112,25,10,4,59,112,25,65,-113,-44,72,4,59, - 112,8,-113,-40,8,68,-81,60,60,25,69,-49,9,112,25,25, - 25,25,25,112,56,0, - - 0,0}; // 0,0 = end of list - - if (level<1) error("compression level must be at least 1"); - const char* p=models; - int i; - for (i=1; iput('z'); - enc.out->put('P'); - enc.out->put('Q'); - 
enc.out->put(1+(len>6 && hcomp[6]==0)); // level 1 or 2 - enc.out->put(1); - for (int i=0; iput(hcomp[i]); - MemoryReader m(hcomp); - z.read(&m); - state=BLOCK1; -} - -// Write a segment header -void Compressor::startSegment(const char* filename, const char* comment) { - assert(state==BLOCK1 || state==BLOCK2); - enc.out->put(1); - while (filename && *filename) - enc.out->put(*filename++); - enc.out->put(0); - while (comment && *comment) - enc.out->put(*comment++); - enc.out->put(0); - enc.out->put(0); - if (state==BLOCK1) state=SEG1; - if (state==BLOCK2) state=SEG2; -} - -// Initialize encoding and write pcomp to first segment -// If len is 0 then length is encoded in pcomp[0..1] -void Compressor::postProcess(const char* pcomp, int len) { - assert(state==SEG1); - enc.init(); - if (pcomp) { - enc.compress(1); - if (len<=0) { - len=toU16(pcomp); - pcomp+=2; - } - enc.compress(len&255); - enc.compress((len>>8)&255); - for (int i=0; iget())>=0) { - enc.compress(ch); - if (n>0) --n; - } - return ch>=0; -} - -// End segment, write sha1string if present -void Compressor::endSegment(const char* sha1string) { - assert(state==SEG2); - enc.compress(-1); - enc.out->put(0); - enc.out->put(0); - enc.out->put(0); - enc.out->put(0); - if (sha1string) { - enc.out->put(253); - for (int i=0; i<20; ++i) - enc.out->put(sha1string[i]); - } - else - enc.out->put(254); - state=BLOCK2; -} - -// End block -void Compressor::endBlock() { - assert(state==BLOCK2); - enc.out->put(255); - state=INIT; -} - -/////////////////////////// compress() /////////////////////// - -void compress(Reader* in, Writer* out, int level) { - assert(level>=1); - Compressor c; - c.setInput(in); - c.setOutput(out); - c.startBlock(level); - c.startSegment(); - c.postProcess(); - c.compress(); - c.endSegment(); - c.endBlock(); -} - -//////////////////////// ZPAQL::assemble() //////////////////// - -#ifndef NOJIT -/* -assemble(); - -Assembles the ZPAQL code in hcomp[0..hlen-1] and stores x86-32 or x86-64 -code in 
rcode[0..rcode_size-1]. Execution begins at rcode[0]. It will not -write beyond the end of rcode, but in any case it returns the number of -bytes that would have been written. It returns 0 in case of error. - -The assembled code implements run() and returns 1 if successful or -0 if the ZPAQL code executes an invalid instruction or jumps out of -bounds. - -A ZPAQL virtual machine has the following state. All values are -unsigned and initially 0: - - a, b, c, d: 32 bit registers (pointed to by their respective parameters) - f: 1 bit flag register (pointed to) - r[0..255]: 32 bit registers - m[0..msize-1]: 8 bit registers, where msize is a power of 2 - h[0..hsize-1]: 32 bit registers, where hsize is a power of 2 - out: pointer to a Writer - sha1: pointer to a SHA1 - -Generally a ZPAQL machine is used to compute contexts which are -placed in h. A second machine might post-process, and write its -output to out and sha1. In either case, a machine is called with -its input in a, representing a single byte (0..255) or -(for a postprocessor) EOF (0xffffffff). Execution returs after a -ZPAQL halt instruction. - -ZPAQL instructions are 1 byte unless the last 3 bits are 1. -In this case, a second operand byte follows. Opcode 255 is -the only 3 byte instruction. They are organized: - - 00dddxxx = unary opcode xxx on destination ddd (ddd < 111) - 00111xxx = special instruction xxx - 01dddsss = assignment: ddd = sss (ddd < 111) - 1xxxxsss = operation sxxx from sss to a - -The meaning of sss and ddd are as follows: - - 000 = a (accumulator) - 001 = b - 010 = c - 011 = d - 100 = *b (means m[b mod msize]) - 101 = *c (means m[c mod msize]) - 110 = *d (means h[d mod hsize]) - 111 = n (constant 0..255 in second byte of instruction) - -For example, 01001110 assigns *d to b. 
The other instructions xxx -are as follows: - -Group 00dddxxx where ddd < 111 and xxx is: - 000 = ddd<>a, swap with a (except 00000000 is an error, and swap - with *b or *c leaves the high bits of a unchanged) - 001 = ddd++, increment - 010 = ddd--, decrement - 011 = ddd!, not (invert all bits) - 100 = ddd=0, clear (set all bits of ddd to 0) - 101 = not used (error) - 110 = not used - 111 = ddd=r n, assign from r[n] to ddd, n=0..255 in next opcode byte -Except: - 00100111 = jt n, jump if f is true (n = -128..127, relative to next opcode) - 00101111 = jf n, jump if f is false (n = -128..127) - 00110111 = r=a n, assign r[n] = a (n = 0..255) - -Group 00111xxx where xxx is: - 000 = halt (return) - 001 = output a - 010 = not used - 011 = hash: a = (a + *b + 512) * 773 - 100 = hashd: *d = (*d + a + 512) * 773 - 101 = not used - 110 = not used - 111 = unconditional jump (n = -128 to 127, relative to next opcode) - -Group 1xxxxsss where xxxx is: - 0000 = a += sss (add, subtract, multiply, divide sss to a) - 0001 = a -= sss - 0010 = a *= sss - 0011 = a /= sss (unsigned, except set a = 0 if sss is 0) - 0100 = a %= sss (remainder, except set a = 0 if sss is 0) - 0101 = a &= sss (bitwise AND) - 0110 = a &= ~sss (bitwise AND with complement of sss) - 0111 = a |= sss (bitwise OR) - 1000 = a ^= sss (bitwise XOR) - 1001 = a <<= (sss % 32) (left shift by low 5 bits of sss) - 1010 = a >>= (sss % 32) (unsigned, zero bits shifted in) - 1011 = a == sss (compare, set f = true if equal or false otherwise) - 1100 = a < sss (unsigned compare, result in f) - 1101 = a > sss (unsigned compare) - 1110 = not used - 1111 = not used except 11111111 is a 3 byte jump to the absolute address - in the next 2 bytes in little-endian (LSB first) order. - -assemble() translates ZPAQL to 32 bit x86 code to be executed by run(). 
-Registers are mapped as follows: - - eax = source sss from *b, *c, *d or sometimes n - ecx = pointer to destination *b, *c, *d, or spare - edx = a - ebx = f (1 for true, 0 for false) - esp = stack pointer - ebp = d - esi = b - edi = c - -run() saves non-volatile registers (ebp, esi, edi, ebx) on the stack, -loads a, b, c, d, f, and executes the translated instructions. -A halt instruction saves a, b, c, d, f, pops the saved registers -and returns. Invalid instructions or jumps outside of the range -of the ZPAQL code call libzpaq::error(). - -In 64 bit mode, the following additional registers are used: - - r12 = h - r14 = r - r15 = m - -*/ - -// Called by out -static void flush1(ZPAQL* z) { - z->flush(); -} - -// return true if op is an undefined ZPAQL instruction -static bool iserr(int op) { - return op==0 || (op>=120 && op<=127) || (op>=240 && op<=254) - || op==58 || (op<64 && (op%8==5 || op%8==6)); -} - -// Write k bytes of x to rcode[o++] MSB first -static void put(U8* rcode, int n, int& o, U32 x, int k) { - while (k-->0) { - if (o>(k*8))&255; - ++o; - } -} - -// Write 4 bytes of x to rcode[o++] LSB first -static void put4lsb(U8* rcode, int n, int& o, U32 x) { - for (int k=0; k<4; ++k) { - if (o>(k*8))&255; - ++o; - } -} - -// Write a 1-4 byte x86 opcode without or with an 4 byte operand -// to rcode[o...] 
-#define put1(x) put(rcode, rcode_size, o, (x), 1) -#define put2(x) put(rcode, rcode_size, o, (x), 2) -#define put3(x) put(rcode, rcode_size, o, (x), 3) -#define put4(x) put(rcode, rcode_size, o, (x), 4) -#define put5(x,y) put4(x), put1(y) -#define put6(x,y) put4(x), put2(y) -#define put4r(x) put4lsb(rcode, rcode_size, o, x) -#define puta(x) t=U32(size_t(x)), put4r(t) -#define put1a(x,y) put1(x), puta(y) -#define put2a(x,y) put2(x), puta(y) -#define put3a(x,y) put3(x), puta(y) -#define put4a(x,y) put4(x), puta(y) -#define put5a(x,y,z) put4(x), put1(y), puta(z) -#define put2l(x,y) put2(x), t=U32(size_t(y)), put4r(t), \ - t=U32(size_t(y)>>(S*4)), put4r(t) - -// Assemble ZPAQL in in the HCOMP section of header to rcode, -// but do not write beyond rcode_size. Return the number of -// bytes output or that would have been output. -// Execution starts at rcode[0] and returns 1 if successful or 0 -// in case of a ZPAQL execution error. -int ZPAQL::assemble() { - - // x86? (not foolproof) - const int S=sizeof(char*); // 4 = x86, 8 = x86-64 - U32 t=0x12345678; - if (*(char*)&t!=0x78 || (S!=4 && S!=8)) - error("JIT supported only for x86-32 and x86-64"); - - const U8* hcomp=&header[hbegin]; - const int hlen=hend-hbegin+1; - const int msize=m.size(); - const int hsize=h.size(); - const int regcode[8]={2,6,7,5}; // a,b,c,d.. -> edx,esi,edi,ebp,eax.. 
- Array it(hlen); // hcomp -> rcode locations - int done=0; // number of instructions assembled (0..hlen) - int o=5; // rcode output index, reserve space for jmp - - // Code for the halt instruction (restore registers and return) - const int halt=o; - if (S==8) { - put2l(0x48b9, &a); // mov rcx, a - put2(0x8911); // mov [rcx], edx - put2l(0x48b9, &b); // mov rcx, b - put2(0x8931); // mov [rcx], esi - put2l(0x48b9, &c); // mov rcx, c - put2(0x8939); // mov [rcx], edi - put2l(0x48b9, &d); // mov rcx, d - put2(0x8929); // mov [rcx], ebp - put2l(0x48b9, &f); // mov rcx, f - put2(0x8919); // mov [rcx], ebx - put4(0x4883c438); // add rsp, 56 - put2(0x415f); // pop r15 - put2(0x415e); // pop r14 - put2(0x415d); // pop r13 - put2(0x415c); // pop r12 - } - else { - put2a(0x8915, &a); // mov [a], edx - put2a(0x8935, &b); // mov [b], esi - put2a(0x893d, &c); // mov [c], edi - put2a(0x892d, &d); // mov [d], ebp - put2a(0x891d, &f); // mov [f], ebx - put3(0x83c43c); // add esp, 60 - } - put1(0x5d); // pop ebp - put1(0x5b); // pop ebx - put1(0x5f); // pop edi - put1(0x5e); // pop esi - put1(0xc3); // ret - - // Code for the out instruction. - // Store a=edx at outbuf[bufptr++]. If full, call flush1(). 
- const int outlabel=o; - if (S==8) { - put2l(0x48b8, &outbuf[0]);// mov rax, outbuf.p - put2l(0x49ba, &bufptr); // mov r10, &bufptr - put3(0x418b0a); // mov ecx, [r10] - put3(0x891408); // mov [rax+rcx], edx - put2(0xffc1); // inc ecx - put3(0x41890a); // mov [r10], ecx - put2a(0x81f9, outbuf.size()); // cmp ecx, outbuf.size() - put2(0x7401); // jz L1 - put1(0xc3); // ret - put4(0x4883ec30); // L1: sub esp, 48 ; call flush1(this) - put4(0x48893c24); // mov [rsp], rdi - put5(0x48897424,8); // mov [rsp+8], rsi - put5(0x48895424,16); // mov [rsp+16], rdx - put5(0x48894c24,24); // mov [rsp+24], rcx -#ifdef unix - put2l(0x48bf, this); // mov rdi, this -#else // Windows - put2l(0x48b9, this); // mov rcx, this -#endif - put2l(0x49bb, &flush1); // mov r11, &flush1 - put3(0x41ffd3); // call r11 - put5(0x488b4c24,24); // mov rcx, [rsp+24] - put5(0x488b5424,16); // mov rdx, [rsp+16] - put5(0x488b7424,8); // mov rsi, [rsp+8] - put4(0x488b3c24); // mov rdi, [rsp] - put4(0x4883c430); // add esp, 48 - put1(0xc3); // ret - } - else { - put1a(0xb8, &outbuf[0]); // mov eax, outbuf.p - put2a(0x8b0d, &bufptr); // mov ecx, [bufptr] - put3(0x891408); // mov [eax+ecx], edx - put2(0xffc1); // inc ecx - put2a(0x890d, &bufptr); // mov [bufptr], ecx - put2a(0x81f9, outbuf.size()); // cmp ecx, outbuf.size() - put2(0x7401); // jz L1 - put1(0xc3); // ret - put3(0x83ec08); // L1: sub esp, 8 - put4(0x89542404); // mov [esp+4], edx - put3a(0xc70424, this); // mov [esp], this - put1a(0xb8, &flush1); // mov eax, &flush1 - put2(0xffd0); // call eax - put4(0x8b542404); // mov edx, [esp+4] - put3(0x83c408); // add esp, 8 - put1(0xc3); // ret - } - - // Set it[i]=1 for each ZPAQL instruction reachable from the previous - // instruction + 2 if reachable by a jump (or 3 if both). 
- it[0]=2; - assert(hlen>0 && hcomp[hlen-1]==0); // ends with error - do { - done=0; - const int NONE=0x80000000; - for (int i=0; i>24);// jt,jf,jmp - if (op==63) next1=NONE; // jmp - if ((next2<0 || next2>=hlen) && next2!=NONE) next2=hlen-1; // error - if (next1!=NONE && !(it[next1]&1)) it[next1]|=1, ++done; - if (next2!=NONE && !(it[next2]&2)) it[next2]|=2, ++done; - } - } - } while (done>0); - - // Set it[i] bits 2-3 to 4, 8, or 12 if a comparison - // (<, >, == respectively) does not need to save the result in f, - // or if a conditional jump (jt, jf) does not need to read f. - // This is true if a comparison is followed directly by a jt/jf, - // the jt/jf is not a jump target, the byte before is not a jump - // target (for a 2 byte comparison), and for the comparison instruction - // if both paths after the jt/jf lead to another comparison or error - // before another jt/jf. At most hlen steps are traced because after - // that it must be an infinite loop. - for (int i=0; i=216 && op1<240 && (op2==39 || op2==47) - && it[i2]==1 && (i2==i+1 || it[i+1]==0)) { - int code=(op1-208)/8*4; // 4,8,12 is ==,<,> - it[i2]+=code; // OK to test CF, ZF instead of f - for (int j=0; j<2 && code; ++j) { // trace each path from i2 - int k=i2+2; // branch not taken - if (j==1) k=i2+2+(hcomp[i2+1]<<24>>24); // branch taken - for (int l=0; l=hlen) break; // out of bounds, pass - const int op=hcomp[k]; - if (op==39 || op==47) code=0; // jt,jf, fail - else if (op>=216 && op<240) break; // ==,<,>, pass - else if (iserr(op)) break; // error, pass - else if (op==255) k=hcomp[k+1]+256*hcomp[k+2]; // lj - else if (op==63) k=k+2+(hcomp[k+1]<<24>>24); // jmp - else if (op==56) k=0; // halt - else k=k+1+(op%8==7); // ordinary instruction - } - } - it[i]+=code; // if > 0 then OK to not save flags in f (bl) - } - } - - // Start of run(): Save x86 and load ZPAQL registers - const int start=o; - assert(start>=16); - put1(0x56); // push esi/rsi - put1(0x57); // push edi/rdi - put1(0x53); // push 
ebx/rbx - put1(0x55); // push ebp/rbp - if (S==8) { - put2(0x4154); // push r12 - put2(0x4155); // push r13 - put2(0x4156); // push r14 - put2(0x4157); // push r15 - put4(0x4883ec38); // sub rsp, 56 - put2l(0x48b8, &a); // mov rax, a - put2(0x8b10); // mov edx, [rax] - put2l(0x48b8, &b); // mov rax, b - put2(0x8b30); // mov esi, [rax] - put2l(0x48b8, &c); // mov rax, c - put2(0x8b38); // mov edi, [rax] - put2l(0x48b8, &d); // mov rax, d - put2(0x8b28); // mov ebp, [rax] - put2l(0x48b8, &f); // mov rax, f - put2(0x8b18); // mov ebx, [rax] - put2l(0x49bc, &h[0]); // mov r12, h - put2l(0x49bd, &outbuf[0]); // mov r13, outbuf.p - put2l(0x49be, &r[0]); // mov r14, r - put2l(0x49bf, &m[0]); // mov r15, m - } - else { - put3(0x83ec3c); // sub esp, 60 - put2a(0x8b15, &a); // mov edx, [a] - put2a(0x8b35, &b); // mov esi, [b] - put2a(0x8b3d, &c); // mov edi, [c] - put2a(0x8b2d, &d); // mov ebp, [d] - put2a(0x8b1d, &f); // mov ebx, [f] - } - - // Assemble in multiple passes until every byte of hcomp has a translation - for (int istart=0; istarti); - assert(i>=0 && i=16) { - if (i>istart) { - int a=code-o; - if (a>-120 && a<120) - put2(0xeb00+((a-2)&255)); // jmp short o - else - put1a(0xe9, a-5); // jmp near o - } - break; - } - - // Else assemble the instruction at hcode[i] to rcode[o] - else { - assert(i>=0 && i0 && it[i]<16); - assert(o>=16); - it[i]=o; - ++done; - const int op=hcomp[i]; - const int arg=hcomp[i+1]+((op==255)?256*hcomp[i+2]:0); - const int ddd=op/8%8; - const int sss=op%8; - - // error instruction: return 0 - if (iserr(op)) { - put2(0x31c0); // xor eax, eax - put1a(0xe9, halt-o-4); // jmp near halt - continue; - } - - // Load source *b, *c, *d, or hash (*b) into eax except: - // {a,b,c,d}=*d, a{+,-,*,&,|,^,=,==,>,>}=*d: load address to eax - // {a,b,c,d}={*b,*c}: load source into ddd - if (op==59 || (op>=64 && op<240 && op%8>=4 && op%8<7)) { - put2(0x89c0+8*regcode[sss-3+(op==59)]); // mov eax, {esi,edi,ebp} - const int sz=(sss==6?hsize:msize)-1; - if 
(sz>=128) put1a(0x25, sz); // and eax, dword msize-1 - else put3(0x83e000+sz); // and eax, byte msize-1 - const int move=(op>=64 && op<112); // = or else ddd is eax - if (sss<6) { // ddd={a,b,c,d,*b,*c} - if (S==8) put5(0x410fb604+8*move*regcode[ddd],0x07); - // movzx ddd, byte [r15+rax] - else put3a(0x0fb680+8*move*regcode[ddd], &m[0]); - // movzx ddd, byte [m+eax] - } - else if ((0x06587000>>(op/8))&1) {// {*b,*c,*d,a/,a%,a&~,a<<,a>>}=*d - if (S==8) put4(0x418b0484); // mov eax, [r12+rax*4] - else put3a(0x8b0485, &h[0]); // mov eax, [h+eax*4] - } - } - - // Load destination address *b, *c, *d or hashd (*d) into ecx - if ((op>=32 && op<56 && op%8<5) || (op>=96 && op<120) || op==60) { - put2(0x89c1+8*regcode[op/8%8-3-(op==60)]);// mov ecx,{esi,edi,ebp} - const int sz=(ddd==6||op==60?hsize:msize)-1; - if (sz>=128) put2a(0x81e1, sz); // and ecx, dword sz - else put3(0x83e100+sz); // and ecx, byte sz - if (op/8%8==6 || op==60) { // *d - if (S==8) put4(0x498d0c8c); // lea rcx, [r12+rcx*4] - else put3a(0x8d0c8d, &h[0]); // lea ecx, [ecx*4+h] - } - else { // *b, *c - if (S==8) put4(0x498d0c0f); // lea rcx, [r15+rcx] - else put2a(0x8d89, &m[0]); // lea ecx, [ecx+h] - } - } - - // Translate by opcode - switch((op/8)&31) { - case 0: // ddd = a - case 1: // ddd = b - case 2: // ddd = c - case 3: // ddd = d - switch(sss) { - case 0: // ddd<>a (swap) - put2(0x87d0+regcode[ddd]); // xchg edx, ddd - break; - case 1: // ddd++ - put2(0xffc0+regcode[ddd]); // inc ddd - break; - case 2: // ddd-- - put2(0xffc8+regcode[ddd]); // dec ddd - break; - case 3: // ddd! 
- put2(0xf7d0+regcode[ddd]); // not ddd - break; - case 4: // ddd=0 - put2(0x31c0+9*regcode[ddd]); // xor ddd,ddd - break; - case 7: // ddd=r n - if (S==8) - put3a(0x418b86+8*regcode[ddd], arg*4); // mov ddd, [r14+n*4] - else - put2a(0x8b05+8*regcode[ddd], (&r[arg]));//mov ddd, [r+n] - break; - } - break; - case 4: // ddd = *b - case 5: // ddd = *c - switch(sss) { - case 0: // ddd<>a (swap) - put2(0x8611); // xchg dl, [ecx] - break; - case 1: // ddd++ - put2(0xfe01); // inc byte [ecx] - break; - case 2: // ddd-- - put2(0xfe09); // dec byte [ecx] - break; - case 3: // ddd! - put2(0xf611); // not byte [ecx] - break; - case 4: // ddd=0 - put2(0x31c0); // xor eax, eax - put2(0x8801); // mov [ecx], al - break; - case 7: // jt, jf - { - assert(code>=0 && code<16); - const int jtab[2][4]={{5,4,2,7},{4,5,3,6}}; - // jnz,je,jb,ja, jz,jne,jae,jbe - if (code<4) put2(0x84db); // test bl, bl - if (arg>=128 && arg-257-i>=0 && o-it[arg-257-i]<120) - put2(0x7000+256*jtab[op==47][code/4]); // jx short 0 - else - put2a(0x0f80+jtab[op==47][code/4], 0); // jx near 0 - break; - } - } - break; - case 6: // ddd = *d - switch(sss) { - case 0: // ddd<>a (swap) - put2(0x8711); // xchg edx, [ecx] - break; - case 1: // ddd++ - put2(0xff01); // inc dword [ecx] - break; - case 2: // ddd-- - put2(0xff09); // dec dword [ecx] - break; - case 3: // ddd! 
- put2(0xf711); // not dword [ecx] - break; - case 4: // ddd=0 - put2(0x31c0); // xor eax, eax - put2(0x8901); // mov [ecx], eax - break; - case 7: // ddd=r n - if (S==8) - put3a(0x418996, arg*4); // mov [r14+n*4], edx - else - put2a(0x8915, &r[arg]); // mov [r+n], edx - break; - } - break; - case 7: // special - switch(op) { - case 56: // halt - put1a(0xb8, 1); // mov eax, 1 - put1a(0xe9, halt-o-4); // jmp near halt - break; - case 57: // out - put1a(0xe8, outlabel-o-4);// call outlabel - break; - case 59: // hash: a = (a + *b + 512) * 773 - put3a(0x8d8410, 512); // lea edx, [eax+edx+512] - put2a(0x69d0, 773); // imul edx, eax, 773 - break; - case 60: // hashd: *d = (*d + a + 512) * 773 - put2(0x8b01); // mov eax, [ecx] - put3a(0x8d8410, 512); // lea eax, [eax+edx+512] - put2a(0x69c0, 773); // imul eax, eax, 773 - put2(0x8901); // mov [ecx], eax - break; - case 63: // jmp - put1a(0xe9, 0); // jmp near 0 (fill in target later) - break; - } - break; - case 8: // a= - case 9: // b= - case 10: // c= - case 11: // d= - if (sss==7) // n - put1a(0xb8+regcode[ddd], arg); // mov ddd, n - else if (sss==6) { // *d - if (S==8) - put4(0x418b0484+(regcode[ddd]<<11)); // mov ddd, [r12+rax*4] - else - put3a(0x8b0485+(regcode[ddd]<<11),&h[0]);// mov ddd, [h+eax*4] - } - else if (sss<4) // a, b, c, d - put2(0x89c0+regcode[ddd]+8*regcode[sss]);// mov ddd,sss - break; - case 12: // *b= - case 13: // *c= - if (sss==7) put3(0xc60100+arg); // mov byte [ecx], n - else if (sss==0) put2(0x8811); // mov byte [ecx], dl - else { - if (sss<4) put2(0x89c0+8*regcode[sss]);// mov eax, sss - put2(0x8801); // mov byte [ecx], al - } - break; - case 14: // *d= - if (sss<7) put2(0x8901+8*regcode[sss]); // mov [ecx], sss - else put2a(0xc701, arg); // mov dword [ecx], n - break; - case 15: break; // not used - case 16: // a+= - if (sss==6) { - if (S==8) put4(0x41031484); // add edx, [r12+rax*4] - else put3a(0x031485, &h[0]); // add edx, [h+eax*4] - } - else if (sss<7) put2(0x01c2+8*regcode[sss]);// add 
edx, sss - else if (arg>128) put2a(0x81c2, arg); // add edx, n - else put3(0x83c200+arg); // add edx, byte n - break; - case 17: // a-= - if (sss==6) { - if (S==8) put4(0x412b1484); // sub edx, [r12+rax*4] - else put3a(0x2b1485, &h[0]); // sub edx, [h+eax*4] - } - else if (sss<7) put2(0x29c2+8*regcode[sss]);// sub edx, sss - else if (arg>=128) put2a(0x81ea, arg); // sub edx, n - else put3(0x83ea00+arg); // sub edx, byte n - break; - case 18: // a*= - if (sss==6) { - if (S==8) put5(0x410faf14,0x84); // imul edx, [r12+rax*4] - else put4a(0x0faf1485, &h[0]); // imul edx, [h+eax*4] - } - else if (sss<7) put3(0x0fafd0+regcode[sss]);// imul edx, sss - else if (arg>=128) put2a(0x69d2, arg); // imul edx, n - else put3(0x6bd200+arg); // imul edx, byte n - break; - case 19: // a/= - case 20: // a%= - if (sss<7) put2(0x89c1+8*regcode[sss]); // mov ecx, sss - else put1a(0xb9, arg); // mov ecx, n - put2(0x85c9); // test ecx, ecx - put3(0x0f44d1); // cmovz edx, ecx - put2(0x7408-2*(op/8==20)); // jz (over rest) - put2(0x89d0); // mov eax, edx - put2(0x31d2); // xor edx, edx - put2(0xf7f1); // div ecx - if (op/8==19) put2(0x89c2); // mov edx, eax - break; - case 21: // a&= - if (sss==6) { - if (S==8) put4(0x41231484); // and edx, [r12+rax*4] - else put3a(0x231485, &h[0]); // and edx, [h+eax*4] - } - else if (sss<7) put2(0x21c2+8*regcode[sss]);// and edx, sss - else if (arg>=128) put2a(0x81e2, arg); // and edx, n - else put3(0x83e200+arg); // and edx, byte n - break; - case 22: // a&~ - if (sss==7) { - if (arg<128) put3(0x83e200+(~arg&255));// and edx, byte ~n - else put2a(0x81e2, ~arg); // and edx, ~n - } - else { - if (sss<4) put2(0x89c0+8*regcode[sss]);// mov eax, sss - put2(0xf7d0); // not eax - put2(0x21c2); // and edx, eax - } - break; - case 23: // a|= - if (sss==6) { - if (S==8) put4(0x410b1484); // or edx, [r12+rax*4] - else put3a(0x0b1485, &h[0]); // or edx, [h+eax*4] - } - else if (sss<7) put2(0x09c2+8*regcode[sss]);// or edx, sss - else if (arg>=128) put2a(0x81ca, 
arg); // or edx, n - else put3(0x83ca00+arg); // or edx, byte n - break; - case 24: // a^= - if (sss==6) { - if (S==8) put4(0x41331484); // xor edx, [r12+rax*4] - else put3a(0x331485, &h[0]); // xor edx, [h+eax*4] - } - else if (sss<7) put2(0x31c2+8*regcode[sss]);// xor edx, sss - else if (arg>=128) put2a(0x81f2, arg); // xor edx, byte n - else put3(0x83f200+arg); // xor edx, n - break; - case 25: // a<<= - case 26: // a>>= - if (sss==7) // sss = n - put3(0xc1e200+8*256*(op/8==26)+arg); // shl/shr n - else { - put2(0x89c1+8*regcode[sss]); // mov ecx, sss - put2(0xd3e2+8*(op/8==26)); // shl/shr edx, cl - } - break; - case 27: // a== - case 28: // a< - case 29: // a> - if (sss==6) { - if (S==8) put4(0x413b1484); // cmp edx, [r12+rax*4] - else put3a(0x3b1485, &h[0]); // cmp edx, [h+eax*4] - } - else if (sss==7) // sss = n - put2a(0x81fa, arg); // cmp edx, dword n - else - put2(0x39c2+8*regcode[sss]); // cmp edx, sss - if (code<4) { - if (op/8==27) put3(0x0f94c3); // setz bl - if (op/8==28) put3(0x0f92c3); // setc bl - if (op/8==29) put3(0x0f97c3); // seta bl - } - break; - case 30: // not used - case 31: // 255 = lj - if (op==255) put1a(0xe9, 0); // jmp near - break; - } - } - } - } - - // Finish first pass - const int rsize=o; - if (o>rcode_size) return rsize; - - // Fill in jump addresses (second pass) - for (int i=0; i=128) target-=256; - target+=i+2; - } - if (target<0 || target>=hlen) target=hlen-1; // runtime ZPAQL error - o=it[i]; - assert(o>=16 && o skip test - assert(o>=16 && o=0x72 && op<0x78) || op==0xeb) { // jx, jmp short - --target; - if (target<-128 || target>127) - error("Cannot code x86 short jump"); - assert(o=0x82 && op<0x88) || op==0xe9) // jx, jmp near - { - target-=4; - puta(target); - } - else assert(false); // not a x86 jump - } - } - - // Jump to start - o=0; - put1a(0xe9, start-5); // jmp near start - return rsize; -} - -//////////////////////// Predictor::assemble_p() ///////////////////// - -// Assemble the ZPAQL code in the HCOMP section 
of z.header to pcomp and -// return the number of bytes of x86 or x86-64 code written, or that would -// be written if pcomp were large enough. The code for predict() begins -// at pr.pcomp[0] and update() at pr.pcomp[5], both as jmp instructions. - -// The assembled code is equivalent to int predict(Predictor*) -// and void update(Predictor*, int y); The Preditor address is placed in -// edi/rdi. The update bit y is placed in ebp/rbp. - -int Predictor::assemble_p() { - Predictor& pr=*this; - U8* rcode=pr.pcode; // x86 output array - int rcode_size=pcode_size; // output size - int o=0; // output index in pcode - const int S=sizeof(char*); // 4 or 8 - U8* hcomp=&pr.z.header[0]; // The code to translate -#define off(x) ((char*)&(pr.x)-(char*)&pr) -#define offc(x) ((char*)&(pr.comp[i].x)-(char*)&pr) - - // test for little-endian (probably x86) - U32 t=0x12345678; - if (*(char*)&t!=0x78 || (S!=4 && S!=8)) - error("JIT supported only for x86-32 and x86-64"); - - // Initialize for predict(). Put predictor address in edi/rdi - put1a(0xe9, 5); // jmp predict - put1a(0, 0x90909000); // reserve space for jmp update - put1(0x53); // push ebx/rbx - put1(0x55); // push ebp/rbp - put1(0x56); // push esi/rsi - put1(0x57); // push edi/rdi - if (S==4) - put4(0x8b7c2414); // mov edi,[esp+0x14] ; pr - else { -#ifndef unix - put3(0x4889cf); // mov rdi, rcx (1st arg in Win64) -#endif - } - - // Code predict() for each component - const int n=hcomp[6]; // number of components - U8* cp=hcomp+7; - for (int i=0; i=pr.z.cend) error("comp too big"); - if (cp[0]<1 || cp[0]>9) error("invalid component"); - assert(compsize[cp[0]]>0 && compsize[cp[0]]<8); - switch (cp[0]) { - - case CONS: // c - break; - - case CM: // sizebits limit - // Component& cr=comp[i]; - // cr.cxt=h[i]^hmap4; - // p[i]=stretch(cr.cm(cr.cxt)>>17); - - put2a(0x8b87, off(h[i])); // mov eax, [edi+&h[i]] - put2a(0x3387, off(hmap4)); // xor eax, [edi+&hmap4] - put1a(0x25, (1<rsi) - put2a(0x8bb7, offc(cm)); // mov esi, 
[edi+&cm] - put3(0x8b0486); // mov eax, [esi+eax*4] - put3(0xc1e811); // shr eax, 17 - put4a(0x0fbf8447, off(stretcht)); // movsx eax,word[edi+eax*2+..] - put2a(0x8987, off(p[i])); // mov [edi+&p[i]], eax - break; - - case ISSE: // sizebits j -- c=hi, cxt=bh - // assert((hmap4&15)>0); - // if (c8==1 || (c8&0xf0)==16) - // cr.c=find(cr.ht, cp[1]+2, h[i]+16*c8); - // cr.cxt=cr.ht[cr.c+(hmap4&15)]; // bit history - // int *wt=(int*)&cr.cm[cr.cxt*2]; - // p[i]=clamp2k((wt[0]*p[cp[2]]+wt[1]*64)>>16); - - case ICM: // sizebits - // assert((hmap4&15)>0); - // if (c8==1 || (c8&0xf0)==16) cr.c=find(cr.ht, cp[1]+2, h[i]+16*c8); - // cr.cxt=cr.ht[cr.c+(hmap4&15)]; - // p[i]=stretch(cr.cm(cr.cxt)>>8); - // - // Find cxt row in hash table ht. ht has rows of 16 indexed by the low - // sizebits of cxt with element 0 having the next higher 8 bits for - // collision detection. If not found after 3 adjacent tries, replace - // row with lowest element 1 as priority. Return index of row. - // - // size_t Predictor::find(Array& ht, int sizebits, U32 cxt) { - // assert(ht.size()==size_t(16)<>sizebits&255; - // size_t h0=(cxt*16)&(ht.size()-16); - // if (ht[h0]==chk) return h0; - // size_t h1=h0^16; - // if (ht[h1]==chk) return h1; - // size_t h2=h0^32; - // if (ht[h2]==chk) return h2; - // if (ht[h0+1]<=ht[h1+1] && ht[h0+1]<=ht[h2+1]) - // return memset(&ht[h0], 0, 16), ht[h0]=chk, h0; - // else if (ht[h1+1]>(7-cr.cxt))&1; // predicted bit - // p[i]=stretch(dt2k[cr.a]*(cr.c*-2+1)&32767); - // } - - if (S==8) put1(0x48); // rex.w - put2a(0x8bb7, offc(ht)); // mov esi, [edi+&ht] - - // If match length (a) is 0 then p[i]=0 - put2a(0x8b87, offc(a)); // mov eax, [edi+&a] - put2(0x85c0); // test eax, eax - put2(0x7449); // jz L2 ; p[i]=0 - - // Else put predicted bit in c - put1a(0xb9, 7); // mov ecx, 7 - put2a(0x2b8f, offc(cxt)); // sub ecx, [edi+&cxt] - put2a(0x8b87, offc(limit)); // mov eax, [edi+&limit] - put2a(0x2b87, offc(b)); // sub eax, [edi+&b] - put1a(0x25, (1<>8; - - put2a(0x8b87, 
off(p[cp[1]])); // mov eax, [edi+&p[j]] - put2a(0x2b87, off(p[cp[2]])); // sub eax, [edi+&p[k]] - put2a(0x69c0, cp[3]); // imul eax, wt - put3(0xc1f808); // sar eax, 8 - put2a(0x0387, off(p[cp[2]])); // add eax, [edi+&p[k]] - put2a(0x8987, off(p[i])); // mov [edi+&p[i]], eax - break; - - case MIX2: // sizebits j k rate mask - // c=size cm=wt[size] cxt=input - // cr.cxt=((h[i]+(c8&cp[5]))&(cr.c-1)); - // assert(cr.cxt=0 && w<65536); - // p[i]=(w*p[cp[2]]+(65536-w)*p[cp[3]])>>16; - // assert(p[i]>=-2048 && p[i]<2048); - - put2(0x8b07); // mov eax, [edi] ; c8 - put1a(0x25, cp[5]); // and eax, mask - put2a(0x0387, off(h[i])); // add eax, [edi+&h[i]] - put1a(0x25, (1<=1 && m<=i); - // cr.cxt=h[i]+(c8&cp[5]); - // cr.cxt=(cr.cxt&(cr.c-1))*m; // pointer to row of weights - // assert(cr.cxt<=cr.cm.size()-m); - // int* wt=(int*)&cr.cm[cr.cxt]; - // p[i]=0; - // for (int j=0; j>8)*p[cp[2]+j]; - // p[i]=clamp2k(p[i]>>8); - - put2(0x8b07); // mov eax, [edi] ; c8 - put1a(0x25, cp[5]); // and eax, mask - put2a(0x0387, off(h[i])); // add eax, [edi+&h[i]] - put1a(0x25, (1<3) put4a(0xf30f6f96, k*4+16);//movdqu xmm2, [esi+k*4+16] - put5(0x660f72e1,0x08); // psrad xmm1, 8 - if (tail>3) put5(0x660f72e2,0x08); // psrad xmm2, 8 - put4(0x660f6bca); // packssdw xmm1, xmm2 - put4a(0xf30f6f9f, off(p[cp[2]+k])); // movdqu xmm3, [edi+&p[j+k]] - if (tail>3) - put4a(0xf30f6fa7,off(p[cp[2]+k+4]));//movdqu xmm4, [edi+&p[j+k+4]] - put4(0x660f6bdc); // packssdw, xmm3, xmm4 - if (tail>0 && tail<8) { // last loop, mask extra weights - put4(0x660f76ed); // pcmpeqd xmm5, xmm5 ; -1 - put5(0x660f73dd, 16-tail*2); // psrldq xmm5, 16-tail*2 - put4(0x660fdbcd); // pand xmm1, xmm5 - } - if (k==0) { // first loop, initialize sum in xmm0 - put4(0xf30f6fc1); // movdqu xmm0, xmm1 - put4(0x660ff5c3); // pmaddwd xmm0, xmm3 - } - else { // accumulate sum in xmm0 - put4(0xf30f6fd1); // movdqu xmm2, xmm1 - put4(0x660ff5d3); // pmaddwd xmm2, xmm3 - put4(0x660ffec2); // paddd, xmm0, xmm2 - } - } - - // Add up the 4 
elements of xmm0 = p[i] in the first element - put4(0xf30f6fc8); // movdqu xmm1, xmm0 - put5(0x660f73d9,0x08); // psrldq xmm1, 8 - put4(0x660ffec1); // paddd xmm0, xmm1 - put4(0xf30f6fc8); // movdqu xmm1, xmm0 - put5(0x660f73d9,0x04); // psrldq xmm1, 4 - put4(0x660ffec1); // paddd xmm0, xmm1 - put4(0x660f7ec0); // movd eax, xmm0 ; p[i] - put3(0xc1f808); // sar eax, 8 - put1a(0xb9, 2047); // mov ecx, 2047 ; clamp2k - put2(0x39c8); // cmp eax, ecx - put3(0x0f4fc1); // cmovg eax, ecx - put2(0xf7d1); // not ecx ; -2048 - put2(0x39c8); // cmp eax, ecx - put3(0x0f4cc1); // cmovl eax, ecx - put2a(0x8987, off(p[i])); // mov [edi+&p[i]], eax - break; - - case SSE: // sizebits j start limit - // cr.cxt=(h[i]+c8)*32; - // int pq=p[cp[2]]+992; - // if (pq<0) pq=0; - // if (pq>1983) pq=1983; - // int wt=pq&63; - // pq>>=6; - // assert(pq>=0 && pq<=30); - // cr.cxt+=pq; - // p[i]=stretch(((cr.cm(cr.cxt)>>10)*(64-wt) // p0 - // +(cr.cm(cr.cxt+1)>>10)*wt)>>13); // p1 - // // p = p0*(64-wt)+p1*wt = (p1-p0)*wt + p0*64 - // cr.cxt+=wt>>5; - - put2a(0x8b8f, off(h[i])); // mov ecx, [edi+&h[i]] - put2(0x030f); // add ecx, [edi] ; c0 - put2a(0x81e1, (1<>5 - put2a(0x898f, offc(cxt)); // mov [edi+cxt], ecx ; cxt saved - put3(0xc1e80a); // shr eax, 10 ; p0 = cm[cxt]>>10 - put3(0xc1eb0a); // shr ebx, 10 ; p1 = cm[cxt+1]>>10 - put2(0x29c3); // sub ebx, eax, ; p1-p0 - put3(0x0fafda); // imul ebx, edx ; (p1-p0)*wt - put3(0xc1e006); // shr eax, 6 - put2(0x01d8); // add eax, ebx ; p in 0..2^28-1 - put3(0xc1e80d); // shr eax, 13 ; p in 0..32767 - put4a(0x0fbf8447, off(stretcht)); // movsx eax, word [edi+eax*2+...] - put2a(0x8987, off(p[i])); // mov [edi+&p[i]], eax - break; - - default: - error("invalid ZPAQ component"); - } - } - - // return squash(p[n-1]) - put2a(0x8b87, off(p[n-1])); // mov eax, [edi+...] - put1a(0x05, 0x800); // add eax, 2048 - put4a(0x0fbf8447, off(squasht[0])); // movsx eax, word [edi+eax*2+...] 
- put1(0x5f); // pop edi - put1(0x5e); // pop esi - put1(0x5d); // pop ebp - put1(0x5b); // pop ebx - put1(0xc3); // ret - - // Initialize for update() Put predictor address in edi/rdi - // and bit y=0..1 in ebp - int save_o=o; - o=5; - put1a(0xe9, save_o-10); // jmp update - o=save_o; - put1(0x53); // push ebx/rbx - put1(0x55); // push ebp/rbp - put1(0x56); // push esi/rsi - put1(0x57); // push edi/rdi - if (S==4) { - put4(0x8b7c2414); // mov edi,[esp+0x14] ; (1st arg = pr) - put4(0x8b6c2418); // mov ebp,[esp+0x18] ; (2nd arg = y) - } - else { -#ifdef unix // (1st arg already in rdi) - put3(0x4889f5); // mov rbp, rsi (2nd arg in Linux-64) -#else - put3(0x4889cf); // mov rdi, rcx (1st arg in Win64) - put3(0x4889d5); // mov rbp, rdx (2nd arg) -#endif - } - - // Code update() for each component - cp=hcomp+7; - for (int i=0; i=1 && cp[0]<=9); - assert(compsize[cp[0]]>0 && compsize[cp[0]]<8); - switch (cp[0]) { - - case CONS: // c - break; - - case SSE: // sizebits j start limit - case CM: // sizebits limit - // train(cr, y); - // - // reduce prediction error in cr.cm - // void train(Component& cr, int y) { - // assert(y==0 || y==1); - // U32& pn=cr.cm(cr.cxt); - // U32 count=pn&0x3ff; - // int error=y*32767-(cr.cm(cr.cxt)>>17); - // pn+=(error*dt[count]&-1024)+(countrsi) - put2a(0x8bb7, offc(cm)); // mov esi,[edi+cm] ; cm - put2a(0x8b87, offc(cxt)); // mov eax,[edi+cxt] ; cxt - put1a(0x25, pr.comp[i].cm.size()-1); // and eax, size-1 - if (S==8) put1(0x48); // rex.w - put3(0x8d3486); // lea esi,[esi+eax*4] ; &cm[cxt] - put2(0x8b06); // mov eax,[esi] ; cm[cxt] - put2(0x89c2); // mov edx, eax ; cm[cxt] - put3(0xc1e811); // shr eax, 17 ; cm[cxt]>>17 - put2(0x89e9); // mov ecx, ebp ; y - put3(0xc1e10f); // shl ecx, 15 ; y*32768 - put2(0x29e9); // sub ecx, ebp ; y*32767 - put2(0x29c1); // sub ecx, eax ; error - put2a(0x81e2, 0x3ff); // and edx, 1023 ; count - put3a(0x8b8497, off(dt)); // mov eax,[edi+edx*4+dt] ; dt[count] - put3(0x0fafc8); // imul ecx, eax ; error*dt[count] 
- put2a(0x81e1, 0xfffffc00); // and ecx, -1024 - put2a(0x81fa, cp[2+2*(cp[0]==SSE)]*4); // cmp edx, limit*4 - put2(0x110e); // adc [esi], ecx ; pn+=... - break; - - case ICM: // sizebits: cxt=bh, ht[c][0..15]=bh row - // cr.ht[cr.c+(hmap4&15)]=st.next(cr.ht[cr.c+(hmap4&15)], y); - // U32& pn=cr.cm(cr.cxt); - // pn+=int(y*32767-(pn>>8))>>2; - - case ISSE: // sizebits j -- c=hi, cxt=bh - // assert(cr.cxt==cr.ht[cr.c+(hmap4&15)]); - // int err=y*32767-squash(p[i]); - // int *wt=(int*)&cr.cm[cr.cxt*2]; - // wt[0]=clamp512k(wt[0]+((err*p[cp[2]]+(1<<12))>>13)); - // wt[1]=clamp512k(wt[1]+((err+16)>>5)); - // cr.ht[cr.c+(hmap4&15)]=st.next(cr.cxt, y); - - // update bit history bh to next(bh,y=ebp) in ht[c+(hmap4&15)] - put3(0x8b4700+off(hmap4)); // mov eax, [edi+&hmap4] - put3(0x83e00f); // and eax, 15 - put2a(0x0387, offc(c)); // add eax [edi+&c] ; cxt - if (S==8) put1(0x48); // rex.w - put2a(0x8bb7, offc(ht)); // mov esi, [edi+&ht] - put4(0x0fb61406); // movzx edx, byte [esi+eax] ; bh - put4(0x8d5c9500); // lea ebx, [ebp+edx*4] ; index to st - put4a(0x0fb69c1f, off(st)); // movzx ebx,byte[edi+ebx+st]; next bh - put3(0x881c06); // mov [esi+eax], bl ; save next bh - if (S==8) put1(0x48); // rex.w - put2a(0x8bb7, offc(cm)); // mov esi, [edi+&cm] - - // ICM: update cm[cxt=edx=bit history] to reduce prediction error - // esi = &cm - if (cp[0]==ICM) { - if (S==8) put1(0x48); // rex.w - put3(0x8d3496); // lea esi, [esi+edx*4] ; &cm[bh] - put2(0x8b06); // mov eax, [esi] ; pn - put3(0xc1e808); // shr eax, 8 ; pn>>8 - put2(0x89e9); // mov ecx, ebp ; y - put3(0xc1e10f); // shl ecx, 15 - put2(0x29e9); // sub ecx, ebp ; y*32767 - put2(0x29c1); // sub ecx, eax - put3(0xc1f902); // sar ecx, 2 - put2(0x010e); // add [esi], ecx - } - - // ISSE: update weights. edx=cxt=bit history (0..255), esi=cm[512] - else { - put2a(0x8b87, off(p[i])); // mov eax, [edi+&p[i]] - put1a(0x05, 2048); // add eax, 2048 - put4a(0x0fb78447, off(squasht)); // movzx eax, word [edi+eax*2+..] 
- put2(0x89e9); // mov ecx, ebp ; y - put3(0xc1e10f); // shl ecx, 15 - put2(0x29e9); // sub ecx, ebp ; y*32767 - put2(0x29c1); // sub ecx, eax ; err - put2a(0x8b87, off(p[cp[2]]));// mov eax, [edi+&p[j]] - put3(0x0fafc1); // imul eax, ecx - put1a(0x05, (1<<12)); // add eax, 4096 - put3(0xc1f80d); // sar eax, 13 - put3(0x0304d6); // add eax, [esi+edx*8] ; wt[0] - put1a(0xbb, (1<<19)-1); // mov ebx, 524287 - put2(0x39d8); // cmp eax, ebx - put3(0x0f4fc3); // cmovg eax, ebx - put2(0xf7d3); // not ebx ; -524288 - put2(0x39d8); // cmp eax, ebx - put3(0x0f4cc3); // cmovl eax, ebx - put3(0x8904d6); // mov [esi+edx*8], eax - put3(0x83c110); // add ecx, 16 ; err - put3(0xc1f905); // sar ecx, 5 - put4(0x034cd604); // add ecx, [esi+edx*8+4] ; wt[1] - put1a(0xb8, (1<<19)-1); // mov eax, 524287 - put2(0x39c1); // cmp ecx, eax - put3(0x0f4fc8); // cmovg ecx, eax - put2(0xf7d0); // not eax ; -524288 - put2(0x39c1); // cmp ecx, eax - put3(0x0f4cc8); // cmovl ecx, eax - put4(0x894cd604); // mov [esi+edx*8+4], ecx - } - break; - - case MATCH: // sizebits bufbits: - // a=len, b=offset, c=bit, cm=index, cxt=bitpos - // ht=buf, limit=pos - // assert(cr.a<=255); - // assert(cr.c==0 || cr.c==1); - // assert(cr.cxt<8); - // assert(cr.cm.size()==(size_t(1)<>5; - // int w=cr.a16[cr.cxt]; - // w+=(err*(p[cp[2]]-p[cp[3]])+(1<<12))>>13; - // if (w<0) w=0; - // if (w>65535) w=65535; - // cr.a16[cr.cxt]=w; - - // set ecx=err - put2a(0x8b87, off(p[i])); // mov eax, [edi+&p[i]] - put1a(0x05, 2048); // add eax, 2048 - put4a(0x0fb78447, off(squasht));//movzx eax, word [edi+eax*2+&squasht] - put2(0x89e9); // mov ecx, ebp ; y - put3(0xc1e10f); // shl ecx, 15 - put2(0x29e9); // sub ecx, ebp ; y*32767 - put2(0x29c1); // sub ecx, eax - put2a(0x69c9, cp[4]); // imul ecx, rate - put3(0xc1f905); // sar ecx, 5 ; err - - // Update w - put2a(0x8b87, offc(cxt)); // mov eax, [edi+&cxt] - if (S==8) put1(0x48); // rex.w - put2a(0x8bb7, offc(a16)); // mov esi, [edi+&a16] - if (S==8) put1(0x48); // rex.w - 
put3(0x8d3446); // lea esi, [esi+eax*2] ; &w - put2a(0x8b87, off(p[cp[2]])); // mov eax, [edi+&p[j]] - put2a(0x2b87, off(p[cp[3]])); // sub eax, [edi+&p[k]] ; p[j]-p[k] - put3(0x0fafc1); // imul eax, ecx ; * err - put1a(0x05, 1<<12); // add eax, 4096 - put3(0xc1f80d); // sar eax, 13 - put3(0x0fb716); // movzx edx, word [esi] ; w - put2(0x01d0); // add eax, edx - put1a(0xba, 0xffff); // mov edx, 65535 - put2(0x39d0); // cmp eax, edx - put3(0x0f4fc2); // cmovg eax, edx - put2(0x31d2); // xor edx, edx - put2(0x39d0); // cmp eax, edx - put3(0x0f4cc2); // cmovl eax, edx - put3(0x668906); // mov word [esi], ax - break; - - case MIX: // sizebits j m rate mask - // cm=wt[size][m], cxt=input - // int m=cp[3]; - // assert(m>0 && m<=i); - // assert(cr.cm.size()==m*cr.c); - // assert(cr.cxt+m<=cr.cm.size()); - // int err=(y*32767-squash(p[i]))*cp[4]>>4; - // int* wt=(int*)&cr.cm[cr.cxt]; - // for (int j=0; j>13)); - - // set ecx=err - put2a(0x8b87, off(p[i])); // mov eax, [edi+&p[i]] - put1a(0x05, 2048); // add eax, 2048 - put4a(0x0fb78447, off(squasht));//movzx eax, word [edi+eax*2+&squasht] - put2(0x89e9); // mov ecx, ebp ; y - put3(0xc1e10f); // shl ecx, 15 - put2(0x29e9); // sub ecx, ebp ; y*32767 - put2(0x29c1); // sub ecx, eax - put2a(0x69c9, cp[4]); // imul ecx, rate - put3(0xc1f904); // sar ecx, 4 ; err - - // set esi=wt - put2a(0x8b87, offc(cxt)); // mov eax, [edi+&cxt] ; cxt - if (S==8) put1(0x48); // rex.w - put2a(0x8bb7, offc(cm)); // mov esi, [edi+&cm] - if (S==8) put1(0x48); // rex.w - put3(0x8d3486); // lea esi, [esi+eax*4] ; wt - - for (int k=0; k=256) { - z.run(c8-256); - hmap4=1; - c8=1; - for (int i=0; i=16 && c8<32) - hmap4=(hmap4&0xf)<<5|y<<4|1; - else - hmap4=(hmap4&0x1f0)|(((hmap4&0xf)*2+y)&0xf); -#endif -} - -// Execute the ZPAQL code with input byte or -1 for EOF. -// Use JIT code at rcode if available, or else create it. 
-void ZPAQL::run(U32 input) { -#ifdef NOJIT - run0(input); -#else - if (!rcode) { - int n=assemble(); - allocx(rcode, rcode_size, n); - if (!rcode || n<10 || rcode_size<10 || n!=assemble()) - error("run JIT failed"); - } - a=input; - if (!((int(*)())(&rcode[0]))()) - libzpaq::error("Bad ZPAQL opcode"); -#endif -} - -} // end namespace libzpaq +/* libzpaq.cpp - Part of LIBZPAQ Version 5.01 + + Copyright (C) 2011, Dell Inc. Written by Matt Mahoney. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so without restriction. + This Software is provided "as is" without warranty. + +LIBZPAQ is a C++ library for compression and decompression of data +conforming to the ZPAQ level 2 standard. 
See http://mattmahoney.net/zpaq/ +*/ + +#include "libzpaq.h" +#include +#include +#include + +#ifndef NOJIT +#ifdef unix +#include +#else +#include +#endif +#endif + +namespace libzpaq { + +// Standard library redirections +void* calloc(size_t a, size_t b) {return ::calloc(a, b);} +void free(void* p) {::free(p);} +int memcmp(const void* d, const void* s, size_t n) { + return ::memcmp(d, s, n);} +void* memset(void* d, int c, size_t n) {return ::memset(d, c, n);} +double log(double x) {return ::log(x);} +double exp(double x) {return ::exp(x);} +double pow(double x, double y) {return ::pow(x, y);} + +// Read 16 bit little-endian number +int toU16(const char* p) { + return (p[0]&255)+256*(p[1]&255); +} + +// Default read() and write() +int Reader::read(char* buf, int n) { + int i=0, c; + while (i=0) + buf[i++]=c; + return i; +} + +void Writer::write(const char* buf, int n) { + for (int i=0; i 0 bytes of executable memory and update +// p to point to it and newsize = n. Free any previously +// allocated memory first. If newsize is 0 then free only. +// Call error in case of failure. If NOJIT, ignore newsize +// and set p=0, n=0 without allocating memory. 
+void allocx(U8* &p, int &n, int newsize) { +#ifdef NOJIT + p=0; + n=0; +#else + if (p || n) { + if (p) +#ifdef unix + munmap(p, n); +#else // Windows + VirtualFree(p, 0, MEM_RELEASE); +#endif + p=0; + n=0; + } + if (newsize>0) { +#ifdef unix + p=(U8*)mmap(0, newsize, PROT_READ|PROT_WRITE|PROT_EXEC, + MAP_PRIVATE|MAP_ANON, -1, 0); + if ((void*)p==MAP_FAILED) p=0; +#else + p=(U8*)VirtualAlloc(0, newsize, MEM_RESERVE|MEM_COMMIT, + PAGE_EXECUTE_READWRITE); +#endif + if (p) + n=newsize; + else { + n=0; + error("allocx failed"); + } + } +#endif +} + +//////////////////////////// SHA1 //////////////////////////// + +// SHA1 code, see http://en.wikipedia.org/wiki/SHA-1 + +// Start a new hash +void SHA1::init() { + len0=len1=0; + h[0]=0x67452301; + h[1]=0xEFCDAB89; + h[2]=0x98BADCFE; + h[3]=0x10325476; + h[4]=0xC3D2E1F0; +} + +// Return old result and start a new hash +const char* SHA1::result() { + + // pad and append length + const U32 s1=len1, s0=len0; + put(0x80); + while ((len0&511)!=448) + put(0); + put(s1>>24); + put(s1>>16); + put(s1>>8); + put(s1); + put(s0>>24); + put(s0>>16); + put(s0>>8); + put(s0); + + // copy h to hbuf + for (int i=0; i<5; ++i) { + hbuf[4*i]=h[i]>>24; + hbuf[4*i+1]=h[i]>>16; + hbuf[4*i+2]=h[i]>>8; + hbuf[4*i+3]=h[i]; + } + + // return hash prior to clearing state + init(); + return hbuf; +} + +// Hash 1 block of 64 bytes +void SHA1::process() { + for (int i=16; i<80; ++i) { + w[i]=w[i-3]^w[i-8]^w[i-14]^w[i-16]; + w[i]=w[i]<<1|w[i]>>31; + } + U32 a=h[0]; + U32 b=h[1]; + U32 c=h[2]; + U32 d=h[3]; + U32 e=h[4]; + const U32 k1=0x5A827999, k2=0x6ED9EBA1, k3=0x8F1BBCDC, k4=0xCA62C1D6; +#define f1(a,b,c,d,e,i) e+=(a<<5|a>>27)+((b&c)|(~b&d))+k1+w[i]; b=b<<30|b>>2; +#define f5(i) f1(a,b,c,d,e,i) f1(e,a,b,c,d,i+1) f1(d,e,a,b,c,i+2) \ + f1(c,d,e,a,b,i+3) f1(b,c,d,e,a,i+4) + f5(0) f5(5) f5(10) f5(15) +#undef f1 +#define f1(a,b,c,d,e,i) e+=(a<<5|a>>27)+(b^c^d)+k2+w[i]; b=b<<30|b>>2; + f5(20) f5(25) f5(30) f5(35) +#undef f1 +#define f1(a,b,c,d,e,i) 
e+=(a<<5|a>>27)+((b&c)|(b&d)|(c&d))+k3+w[i]; b=b<<30|b>>2; + f5(40) f5(45) f5(50) f5(55) +#undef f1 +#define f1(a,b,c,d,e,i) e+=(a<<5|a>>27)+(b^c^d)+k4+w[i]; b=b<<30|b>>2; + f5(60) f5(65) f5(70) f5(75) +#undef f1 +#undef f5 + h[0]+=a; + h[1]+=b; + h[2]+=c; + h[3]+=d; + h[4]+=e; +} + +//////////////////////////// Component /////////////////////// + +// A Component is a context model, indirect context model, match model, +// fixed weight mixer, adaptive 2 input mixer without or with current +// partial byte as context, adaptive m input mixer (without or with), +// or SSE (without or with). + +const int compsize[256]={0,2,3,2,3,4,6,6,3,5}; + +void Component::init() { + limit=cxt=a=b=c=0; + cm.resize(0); + ht.resize(0); + a16.resize(0); +} + +////////////////////////// StateTable ////////////////////////// + +// How many states with count of n0 zeros, n1 ones (0...2) +int StateTable::num_states(int n0, int n1) { + const int B=6; + const int bound[B]={20,48,15,8,6,5}; // n0 -> max n1, n1 -> max n0 + if (n0=B || n0>bound[n1]) return 0; + return 1+(n1>0 && n0+n1<=17); +} + +// New value of count n0 if 1 is observed (and vice versa) +void StateTable::discount(int& n0) { + n0=(n0>=1)+(n0>=2)+(n0>=3)+(n0>=4)+(n0>=5)+(n0>=7)+(n0>=8); +} + +// compute next n0,n1 (0 to N) given input y (0 or 1) +void StateTable::next_state(int& n0, int& n1, int y) { + if (n0 20,0 + // 48,1,0 -> 48,1 + // 15,2,0 -> 8,1 + // 8,3,0 -> 6,2 + // 8,3,1 -> 5,3 + // 6,4,0 -> 5,3 + // 5,5,0 -> 5,4 + // 5,5,1 -> 4,5 + while (!num_states(n0, n1)) { + if (n1<2) --n0; + else { + n0=(n0*(n1-1)+(n1/2))/n1; + --n1; + } + } + } +} + +// Initialize next state table ns[state*4] -> next if 0, next if 1, n0, n1 +StateTable::StateTable() { + + // Assign states by increasing priority + const int N=50; + U8 t[N][N][2]={{{0}}}; // (n0,n1,y) -> state number + int state=0; + for (int i=0; i=0 && n<=2); + if (n) { + t[n0][n1][0]=state; + t[n0][n1][1]=state+n-1; + state+=n; + } + } + } + + // Generate next state table + 
memset(ns, 0, sizeof(ns)); + for (int n0=0; n0=0 && s<256); + int s0=n0, s1=n1; + next_state(s0, s1, 0); + assert(s0>=0 && s0=0 && s1=0 && s0=0 && s1=7); + assert(hbegin>=cend); + assert(hend>=hbegin); + assert(out2); + if (!pp) { // if not a postprocessor then write COMP + for (int i=0; iput(header[i]); + } + else { // write PCOMP size only + out2->put((hend-hbegin)&255); + out2->put((hend-hbegin)>>8); + } + for (int i=hbegin; iput(header[i]); + return true; +} + +// Read header from in2 +int ZPAQL::read(Reader* in2) { + + // Get header size and allocate + int hsize=in2->get(); + hsize+=in2->get()*256; + header.resize(hsize+300); + cend=hbegin=hend=0; + header[cend++]=hsize&255; + header[cend++]=hsize>>8; + while (cend<7) header[cend++]=in2->get(); // hh hm ph pm n + + // Read COMP + int n=header[cend-1]; + for (int i=0; iget(); // component type + if (type==-1) error("unexpected end of file"); + header[cend++]=type; // component type + int size=compsize[type]; + if (size<1) error("Invalid component type"); + if (cend+size>header.isize()-8) error("COMP list too big"); + for (int j=1; jget(); + } + if ((header[cend++]=in2->get())!=0) error("missing COMP END"); + + // Insert a guard gap and read HCOMP + hbegin=hend=cend+128; + while (hendget(); + if (op==-1) error("unexpected end of file"); + header[hend++]=op; + } + if ((header[hend++]=in2->get())!=0) error("missing HCOMP END"); + assert(cend>=7 && cendhbegin && hend6); + assert(output==0); + assert(sha1==0); + init(header[2], header[3]); // hh, hm +} + +// Initialize machine state as PCOMP +void ZPAQL::initp() { + assert(header.isize()>6); + init(header[4], header[5]); // ph, pm +} + +// Flush pending output +void ZPAQL::flush() { + if (output) output->write(&outbuf[0], bufptr); + if (sha1) for (int i=0; iput(U8(outbuf[i])); + bufptr=0; +} + +// Return memory requirement in bytes +double ZPAQL::memory() { + double mem=pow(2.0,header[2]+2)+pow(2.0,header[3]) // hh hm + +pow(2.0,header[4]+2)+pow(2.0,header[5]) // ph 
pm + +header.size(); + int cp=7; // start of comp list + for (int i=0; i0); + assert(cend>=7); + assert(hbegin>=cend+128); + assert(hend>=hbegin); + assert(hend0); + h.resize(1, hbits); + m.resize(1, mbits); + r.resize(256); + a=b=c=d=pc=f=0; +} + +// Run program on input by interpreting header +void ZPAQL::run0(U32 input) { + assert(cend>6); + assert(hbegin>=cend+128); + assert(hend>=hbegin); + assert(hend0); + assert(h.size()>0); + assert(header[0]+256*header[1]==cend+hend-hbegin-2); + pc=hbegin; + a=input; + while (execute()) ; +} + +// Execute one instruction, return 0 after HALT else 1 +int ZPAQL::execute() { + switch(header[pc++]) { + case 0: err(); break; // ERROR + case 1: ++a; break; // A++ + case 2: --a; break; // A-- + case 3: a = ~a; break; // A! + case 4: a = 0; break; // A=0 + case 7: a = r[header[pc++]]; break; // A=R N + case 8: swap(b); break; // B<>A + case 9: ++b; break; // B++ + case 10: --b; break; // B-- + case 11: b = ~b; break; // B! + case 12: b = 0; break; // B=0 + case 15: b = r[header[pc++]]; break; // B=R N + case 16: swap(c); break; // C<>A + case 17: ++c; break; // C++ + case 18: --c; break; // C-- + case 19: c = ~c; break; // C! + case 20: c = 0; break; // C=0 + case 23: c = r[header[pc++]]; break; // C=R N + case 24: swap(d); break; // D<>A + case 25: ++d; break; // D++ + case 26: --d; break; // D-- + case 27: d = ~d; break; // D! + case 28: d = 0; break; // D=0 + case 31: d = r[header[pc++]]; break; // D=R N + case 32: swap(m(b)); break; // *B<>A + case 33: ++m(b); break; // *B++ + case 34: --m(b); break; // *B-- + case 35: m(b) = ~m(b); break; // *B! + case 36: m(b) = 0; break; // *B=0 + case 39: if (f) pc+=((header[pc]+128)&255)-127; else ++pc; break; // JT N + case 40: swap(m(c)); break; // *C<>A + case 41: ++m(c); break; // *C++ + case 42: --m(c); break; // *C-- + case 43: m(c) = ~m(c); break; // *C! 
+ case 44: m(c) = 0; break; // *C=0 + case 47: if (!f) pc+=((header[pc]+128)&255)-127; else ++pc; break; // JF N + case 48: swap(h(d)); break; // *D<>A + case 49: ++h(d); break; // *D++ + case 50: --h(d); break; // *D-- + case 51: h(d) = ~h(d); break; // *D! + case 52: h(d) = 0; break; // *D=0 + case 55: r[header[pc++]] = a; break; // R=A N + case 56: return 0 ; // HALT + case 57: outc(a&255); break; // OUT + case 59: a = (a+m(b)+512)*773; break; // HASH + case 60: h(d) = (h(d)+a+512)*773; break; // HASHD + case 63: pc+=((header[pc]+128)&255)-127; break; // JMP N + case 64: a = a; break; // A=A + case 65: a = b; break; // A=B + case 66: a = c; break; // A=C + case 67: a = d; break; // A=D + case 68: a = m(b); break; // A=*B + case 69: a = m(c); break; // A=*C + case 70: a = h(d); break; // A=*D + case 71: a = header[pc++]; break; // A= N + case 72: b = a; break; // B=A + case 73: b = b; break; // B=B + case 74: b = c; break; // B=C + case 75: b = d; break; // B=D + case 76: b = m(b); break; // B=*B + case 77: b = m(c); break; // B=*C + case 78: b = h(d); break; // B=*D + case 79: b = header[pc++]; break; // B= N + case 80: c = a; break; // C=A + case 81: c = b; break; // C=B + case 82: c = c; break; // C=C + case 83: c = d; break; // C=D + case 84: c = m(b); break; // C=*B + case 85: c = m(c); break; // C=*C + case 86: c = h(d); break; // C=*D + case 87: c = header[pc++]; break; // C= N + case 88: d = a; break; // D=A + case 89: d = b; break; // D=B + case 90: d = c; break; // D=C + case 91: d = d; break; // D=D + case 92: d = m(b); break; // D=*B + case 93: d = m(c); break; // D=*C + case 94: d = h(d); break; // D=*D + case 95: d = header[pc++]; break; // D= N + case 96: m(b) = a; break; // *B=A + case 97: m(b) = b; break; // *B=B + case 98: m(b) = c; break; // *B=C + case 99: m(b) = d; break; // *B=D + case 100: m(b) = m(b); break; // *B=*B + case 101: m(b) = m(c); break; // *B=*C + case 102: m(b) = h(d); break; // *B=*D + case 103: m(b) = header[pc++]; break; // 
*B= N + case 104: m(c) = a; break; // *C=A + case 105: m(c) = b; break; // *C=B + case 106: m(c) = c; break; // *C=C + case 107: m(c) = d; break; // *C=D + case 108: m(c) = m(b); break; // *C=*B + case 109: m(c) = m(c); break; // *C=*C + case 110: m(c) = h(d); break; // *C=*D + case 111: m(c) = header[pc++]; break; // *C= N + case 112: h(d) = a; break; // *D=A + case 113: h(d) = b; break; // *D=B + case 114: h(d) = c; break; // *D=C + case 115: h(d) = d; break; // *D=D + case 116: h(d) = m(b); break; // *D=*B + case 117: h(d) = m(c); break; // *D=*C + case 118: h(d) = h(d); break; // *D=*D + case 119: h(d) = header[pc++]; break; // *D= N + case 128: a += a; break; // A+=A + case 129: a += b; break; // A+=B + case 130: a += c; break; // A+=C + case 131: a += d; break; // A+=D + case 132: a += m(b); break; // A+=*B + case 133: a += m(c); break; // A+=*C + case 134: a += h(d); break; // A+=*D + case 135: a += header[pc++]; break; // A+= N + case 136: a -= a; break; // A-=A + case 137: a -= b; break; // A-=B + case 138: a -= c; break; // A-=C + case 139: a -= d; break; // A-=D + case 140: a -= m(b); break; // A-=*B + case 141: a -= m(c); break; // A-=*C + case 142: a -= h(d); break; // A-=*D + case 143: a -= header[pc++]; break; // A-= N + case 144: a *= a; break; // A*=A + case 145: a *= b; break; // A*=B + case 146: a *= c; break; // A*=C + case 147: a *= d; break; // A*=D + case 148: a *= m(b); break; // A*=*B + case 149: a *= m(c); break; // A*=*C + case 150: a *= h(d); break; // A*=*D + case 151: a *= header[pc++]; break; // A*= N + case 152: div(a); break; // A/=A + case 153: div(b); break; // A/=B + case 154: div(c); break; // A/=C + case 155: div(d); break; // A/=D + case 156: div(m(b)); break; // A/=*B + case 157: div(m(c)); break; // A/=*C + case 158: div(h(d)); break; // A/=*D + case 159: div(header[pc++]); break; // A/= N + case 160: mod(a); break; // A%=A + case 161: mod(b); break; // A%=B + case 162: mod(c); break; // A%=C + case 163: mod(d); break; // 
A%=D + case 164: mod(m(b)); break; // A%=*B + case 165: mod(m(c)); break; // A%=*C + case 166: mod(h(d)); break; // A%=*D + case 167: mod(header[pc++]); break; // A%= N + case 168: a &= a; break; // A&=A + case 169: a &= b; break; // A&=B + case 170: a &= c; break; // A&=C + case 171: a &= d; break; // A&=D + case 172: a &= m(b); break; // A&=*B + case 173: a &= m(c); break; // A&=*C + case 174: a &= h(d); break; // A&=*D + case 175: a &= header[pc++]; break; // A&= N + case 176: a &= ~ a; break; // A&~A + case 177: a &= ~ b; break; // A&~B + case 178: a &= ~ c; break; // A&~C + case 179: a &= ~ d; break; // A&~D + case 180: a &= ~ m(b); break; // A&~*B + case 181: a &= ~ m(c); break; // A&~*C + case 182: a &= ~ h(d); break; // A&~*D + case 183: a &= ~ header[pc++]; break; // A&~ N + case 184: a |= a; break; // A|=A + case 185: a |= b; break; // A|=B + case 186: a |= c; break; // A|=C + case 187: a |= d; break; // A|=D + case 188: a |= m(b); break; // A|=*B + case 189: a |= m(c); break; // A|=*C + case 190: a |= h(d); break; // A|=*D + case 191: a |= header[pc++]; break; // A|= N + case 192: a ^= a; break; // A^=A + case 193: a ^= b; break; // A^=B + case 194: a ^= c; break; // A^=C + case 195: a ^= d; break; // A^=D + case 196: a ^= m(b); break; // A^=*B + case 197: a ^= m(c); break; // A^=*C + case 198: a ^= h(d); break; // A^=*D + case 199: a ^= header[pc++]; break; // A^= N + case 200: a <<= (a&31); break; // A<<=A + case 201: a <<= (b&31); break; // A<<=B + case 202: a <<= (c&31); break; // A<<=C + case 203: a <<= (d&31); break; // A<<=D + case 204: a <<= (m(b)&31); break; // A<<=*B + case 205: a <<= (m(c)&31); break; // A<<=*C + case 206: a <<= (h(d)&31); break; // A<<=*D + case 207: a <<= (header[pc++]&31); break; // A<<= N + case 208: a >>= (a&31); break; // A>>=A + case 209: a >>= (b&31); break; // A>>=B + case 210: a >>= (c&31); break; // A>>=C + case 211: a >>= (d&31); break; // A>>=D + case 212: a >>= (m(b)&31); break; // A>>=*B + case 213: a >>= 
(m(c)&31); break; // A>>=*C + case 214: a >>= (h(d)&31); break; // A>>=*D + case 215: a >>= (header[pc++]&31); break; // A>>= N + case 216: f = (a == a); break; // A==A + case 217: f = (a == b); break; // A==B + case 218: f = (a == c); break; // A==C + case 219: f = (a == d); break; // A==D + case 220: f = (a == U32(m(b))); break; // A==*B + case 221: f = (a == U32(m(c))); break; // A==*C + case 222: f = (a == h(d)); break; // A==*D + case 223: f = (a == U32(header[pc++])); break; // A== N + case 224: f = (a < a); break; // A a); break; // A>A + case 233: f = (a > b); break; // A>B + case 234: f = (a > c); break; // A>C + case 235: f = (a > d); break; // A>D + case 236: f = (a > U32(m(b))); break; // A>*B + case 237: f = (a > U32(m(c))); break; // A>*C + case 238: f = (a > h(d)); break; // A>*D + case 239: f = (a > U32(header[pc++])); break; // A> N + case 255: if((pc=hbegin+header[pc]+256*header[pc+1])>=hend)err();break;//LJ + default: err(); + } + return 1; +} + +// Print illegal instruction error message and exit +void ZPAQL::err() { + error("ZPAQL execution error"); +} + +///////////////////////// Predictor ///////////////////////// + +// Initailize model-independent tables +Predictor::Predictor(ZPAQL& zr): + c8(1), hmap4(1), z(zr) { + assert(sizeof(U8)==1); + assert(sizeof(U16)==2); + assert(sizeof(U32)==4); + assert(sizeof(U64)==8); + assert(sizeof(short)==2); + assert(sizeof(int)==4); + + // Initialize tables + dt2k[0]=0; + for (int i=1; i<256; ++i) + dt2k[i]=2048/i; + for (int i=0; i<1024; ++i) + dt[i]=(1<<17)/(i*2+3)*2; + for (int i=0; i<32768; ++i) + stretcht[i]=int(log((i+0.5)/(32767.5-i))*64+0.5+100000)-100000; + for (int i=0; i<4096; ++i) + squasht[i]=int(32768.0/(1+exp((i-2048)*(-1.0/64)))); + + // Verify floating point math for squash() and stretch() + U32 sqsum=0, stsum=0; + for (int i=32767; i>=0; --i) + stsum=stsum*3+stretch(i); + for (int i=4095; i>=0; --i) + sqsum=sqsum*3+squash(i-2048); + assert(stsum==3887533746u); + 
assert(sqsum==2278286169u); + + pcode=0; + pcode_size=0; +} + +Predictor::~Predictor() { + allocx(pcode, pcode_size, 0); // free executable memory +} + +// Initialize the predictor with a new model in z +void Predictor::init() { + + // Clear old JIT code if any + allocx(pcode, pcode_size, 0); + + // Initialize context hash function + z.inith(); + + // Initialize predictions + for (int i=0; i<256; ++i) h[i]=p[i]=0; + + // Initialize components + for (int i=0; i<256; ++i) // clear old model + comp[i].init(); + int n=z.header[6]; // hsize[0..1] hh hm ph pm n (comp)[n] END 0[128] (hcomp) END + const U8* cp=&z.header[7]; // start of component list + for (int i=0; i&z.header[0] && cp<&z.header[z.header.isize()-8]); + Component& cr=comp[i]; + switch(cp[0]) { + case CONS: // c + p[i]=(cp[1]-128)*4; + break; + case CM: // sizebits limit + if (cp[1]>32) error("max size for CM is 32"); + cr.cm.resize(1, cp[1]); // packed CM (22 bits) + CMCOUNT (10 bits) + cr.limit=cp[2]*4; + for (size_t j=0; j26) error("max size for ICM is 26"); + cr.limit=1023; + cr.cm.resize(256); + cr.ht.resize(64, cp[1]); + for (size_t j=0; j32 || cp[2]>32) error("max size for MATCH is 32 32"); + cr.cm.resize(1, cp[1]); // index + cr.ht.resize(1, cp[2]); // buf + cr.ht(0)=1; + break; + case AVG: // j k wt + if (cp[1]>=i) error("AVG j >= i"); + if (cp[2]>=i) error("AVG k >= i"); + break; + case MIX2: // sizebits j k rate mask + if (cp[1]>32) error("max size for MIX2 is 32"); + if (cp[3]>=i) error("MIX2 k >= i"); + if (cp[2]>=i) error("MIX2 j >= i"); + cr.c=(size_t(1)<32) error("max size for MIX is 32"); + if (cp[2]>=i) error("MIX j >= i"); + if (cp[3]<1 || cp[3]>i-cp[2]) error("MIX m not in 1..i-j"); + int m=cp[3]; // number of inputs + assert(m>=1); + cr.c=(size_t(1)<32) error("max size for ISSE is 32"); + if (cp[2]>=i) error("ISSE j >= i"); + cr.ht.resize(64, cp[1]); + cr.cm.resize(512); + for (int j=0; j<256; ++j) { + cr.cm[j*2]=1<<15; + cr.cm[j*2+1]=clamp512k(stretch(st.cminit(j)>>8)<<10); + } + break; 
+ case SSE: // sizebits j start limit + if (cp[1]>32) error("max size for SSE is 32"); + if (cp[2]>=i) error("SSE j >= i"); + if (cp[3]>cp[4]*4) error("SSE start > limit*4"); + cr.cm.resize(32, cp[1]); + cr.limit=cp[4]*4; + for (size_t j=0; j0); + cp+=compsize[*cp]; + assert(cp>=&z.header[7] && cp<&z.header[z.cend]); + } +} + +// Return next bit prediction using interpreted COMP code +int Predictor::predict0() { + assert(c8>=1 && c8<=255); + + // Predict next bit + int n=z.header[6]; + assert(n>0 && n<=255); + const U8* cp=&z.header[7]; + assert(cp[-1]==n); + for (int i=0; i&z.header[0] && cp<&z.header[z.header.isize()-8]); + Component& cr=comp[i]; + switch(cp[0]) { + case CONS: // c + break; + case CM: // sizebits limit + cr.cxt=h[i]^hmap4; + p[i]=stretch(cr.cm(cr.cxt)>>17); + break; + case ICM: // sizebits + assert((hmap4&15)>0); + if (c8==1 || (c8&0xf0)==16) cr.c=find(cr.ht, cp[1]+2, h[i]+16*c8); + cr.cxt=cr.ht[cr.c+(hmap4&15)]; + p[i]=stretch(cr.cm(cr.cxt)>>8); + break; + case MATCH: // sizebits bufbits: a=len, b=offset, c=bit, cxt=bitpos, + // ht=buf, limit=pos + assert(cr.cm.size()==(size_t(1)<>(7-cr.cxt))&1; // predicted bit + p[i]=stretch(dt2k[cr.a]*(cr.c*-2+1)&32767); + } + break; + case AVG: // j k wt + p[i]=(p[cp[1]]*cp[3]+p[cp[2]]*(256-cp[3]))>>8; + break; + case MIX2: { // sizebits j k rate mask + // c=size cm=wt[size] cxt=input + cr.cxt=((h[i]+(c8&cp[5]))&(cr.c-1)); + assert(cr.cxt=0 && w<65536); + p[i]=(w*p[cp[2]]+(65536-w)*p[cp[3]])>>16; + assert(p[i]>=-2048 && p[i]<2048); + } + break; + case MIX: { // sizebits j m rate mask + // c=size cm=wt[size][m] cxt=index of wt in cm + int m=cp[3]; + assert(m>=1 && m<=i); + cr.cxt=h[i]+(c8&cp[5]); + cr.cxt=(cr.cxt&(cr.c-1))*m; // pointer to row of weights + assert(cr.cxt<=cr.cm.size()-m); + int* wt=(int*)&cr.cm[cr.cxt]; + p[i]=0; + for (int j=0; j>8)*p[cp[2]+j]; + p[i]=clamp2k(p[i]>>8); + } + break; + case ISSE: { // sizebits j -- c=hi, cxt=bh + assert((hmap4&15)>0); + if (c8==1 || (c8&0xf0)==16) + 
cr.c=find(cr.ht, cp[1]+2, h[i]+16*c8); + cr.cxt=cr.ht[cr.c+(hmap4&15)]; // bit history + int *wt=(int*)&cr.cm[cr.cxt*2]; + p[i]=clamp2k((wt[0]*p[cp[2]]+wt[1]*64)>>16); + } + break; + case SSE: { // sizebits j start limit + cr.cxt=(h[i]+c8)*32; + int pq=p[cp[2]]+992; + if (pq<0) pq=0; + if (pq>1983) pq=1983; + int wt=pq&63; + pq>>=6; + assert(pq>=0 && pq<=30); + cr.cxt+=pq; + p[i]=stretch(((cr.cm(cr.cxt)>>10)*(64-wt)+(cr.cm(cr.cxt+1)>>10)*wt)>>13); + cr.cxt+=wt>>5; + } + break; + default: + error("component predict not implemented"); + } + cp+=compsize[cp[0]]; + assert(cp<&z.header[z.cend]); + assert(p[i]>=-2048 && p[i]<2048); + } + assert(cp[0]==NONE); + return squash(p[n-1]); +} + +// Update model with decoded bit y (0...1) +void Predictor::update0(int y) { + assert(y==0 || y==1); + assert(c8>=1 && c8<=255); + assert(hmap4>=1 && hmap4<=511); + + // Update components + const U8* cp=&z.header[7]; + int n=z.header[6]; + assert(n>=1 && n<=255); + assert(cp[-1]==n); + for (int i=0; i>8))>>2; + } + break; + case MATCH: // sizebits bufbits: + // a=len, b=offset, c=bit, cm=index, cxt=bitpos + // ht=buf, limit=pos + { + assert(cr.a<=255); + assert(cr.c==0 || cr.c==1); + assert(cr.cxt<8); + assert(cr.cm.size()==(size_t(1)<>5; + int w=cr.a16[cr.cxt]; + w+=(err*(p[cp[2]]-p[cp[3]])+(1<<12))>>13; + if (w<0) w=0; + if (w>65535) w=65535; + cr.a16[cr.cxt]=w; + } + break; + case MIX: { // sizebits j m rate mask + // cm=wt[size][m], cxt=input + int m=cp[3]; + assert(m>0 && m<=i); + assert(cr.cm.size()==m*cr.c); + assert(cr.cxt+m<=cr.cm.size()); + int err=(y*32767-squash(p[i]))*cp[4]>>4; + int* wt=(int*)&cr.cm[cr.cxt]; + for (int j=0; j>13)); + } + break; + case ISSE: { // sizebits j -- c=hi, cxt=bh + assert(cr.cxt==cr.ht[cr.c+(hmap4&15)]); + int err=y*32767-squash(p[i]); + int *wt=(int*)&cr.cm[cr.cxt*2]; + wt[0]=clamp512k(wt[0]+((err*p[cp[2]]+(1<<12))>>13)); + wt[1]=clamp512k(wt[1]+((err+16)>>5)); + cr.ht[cr.c+(hmap4&15)]=st.next(cr.cxt, y); + } + break; + case SSE: // sizebits j 
start limit + train(cr, y); + break; + default: + assert(0); + } + cp+=compsize[cp[0]]; + assert(cp>=&z.header[7] && cp<&z.header[z.cend] + && cp<&z.header[z.header.isize()-8]); + } + assert(cp[0]==NONE); + + // Save bit y in c8, hmap4 + c8+=c8+y; + if (c8>=256) { + z.run(c8-256); + hmap4=1; + c8=1; + for (int i=0; i=16 && c8<32) + hmap4=(hmap4&0xf)<<5|y<<4|1; + else + hmap4=(hmap4&0x1f0)|(((hmap4&0xf)*2+y)&0xf); +} + +// Find cxt row in hash table ht. ht has rows of 16 indexed by the +// low sizebits of cxt with element 0 having the next higher 8 bits for +// collision detection. If not found after 3 adjacent tries, replace the +// row with lowest element 1 as priority. Return index of row. +size_t Predictor::find(Array& ht, int sizebits, U32 cxt) { + assert(ht.size()==size_t(16)<>sizebits&255; + size_t h0=(cxt*16)&(ht.size()-16); + if (ht[h0]==chk) return h0; + size_t h1=h0^16; + if (ht[h1]==chk) return h1; + size_t h2=h0^32; + if (ht[h2]==chk) return h2; + if (ht[h0+1]<=ht[h1+1] && ht[h0+1]<=ht[h2+1]) + return memset(&ht[h0], 0, 16), ht[h0]=chk, h0; + else if (ht[h1+1]get(); + if (c<0) error("unexpected end of input"); + curr=curr<<8|c; + } + } + U32 n=buf.size(); + if (n>curr) n=curr; + high=in->read(&buf[0], n); + curr-=high; + low=0; +} + +// Return next bit of decoded input, which has 16 bit probability p of being 1 +int Decoder::decode(int p) { + assert(p>=0 && p<65536); + assert(high>low && low>0); + if (currhigh) error("archive corrupted"); + assert(curr>=low && curr<=high); + U32 mid=low+U32(((high-low)*U64(U32(p)))>>16); // split range + assert(high>mid && mid>=low); + int y=curr<=mid; + if (y) high=mid; else low=mid+1; // pick half + while ((high^low)<0x1000000) { // shift out identical leading bytes + high=high<<8|255; + low=low<<8; + low+=(low==0); + int c=in->get(); + if (c<0) error("unexpected end of file"); + curr=curr<<8|c; + } + return y; +} + +// Decompress 1 byte or -1 at end of input +int Decoder::decompress() { + if (pr.isModeled()) { // n>0 
components? + if (curr==0) { // segment initialization + for (int i=0; i<4; ++i) + curr=curr<<8|in->get(); + } + if (decode(0)) { + if (curr!=0) error("decoding end of stream"); + return -1; + } + else { + int c=1; + while (c<256) { // get 8 bits + int p=pr.predict()*2+1; + c+=c+decode(p); + pr.update(c&1); + } + return c-256; + } + } + else { + if (low==high) loadbuf(); + if (low==high) return -1; + return buf[low++]&255; + } +} + +// Find end of compressed data and return next byte +int Decoder::skip() { + int c=-1; + if (pr.isModeled()) { + while (curr==0) // at start? + curr=in->get(); + while (curr && (c=in->get())>=0) // find 4 zeros + curr=curr<<8|c; + while ((c=in->get())==0) ; // might be more than 4 + return c; + } + else { + if (curr==0) // at start? + for (int i=0; i<4 && (c=in->get())>=0; ++i) curr=curr<<8|c; + while (curr>0) { + U32 n=BUFSIZE; + if (n>curr) n=curr; + U32 n1=in->read(&buf[0], n); + curr-=n1; + if (n1!=n) return -1; + if (curr==0) + for (int i=0; i<4 && (c=in->get())>=0; ++i) curr=curr<<8|c; + } + if (c>=0) c=in->get(); + return c; + } +} + +////////////////////// PostProcessor ////////////////////// + +// Copy ph, pm from block header +void PostProcessor::init(int h, int m) { + state=hsize=0; + ph=h; + pm=m; + z.clear(); +} + +// (PASS=0 | PROG=1 psize[0..1] pcomp[0..psize-1]) data... 
EOB=-1 +// Return state: 1=PASS, 2..4=loading PROG, 5=PROG loaded +int PostProcessor::write(int c) { + assert(c>=-1 && c<=255); + switch (state) { + case 0: // initial state + if (c<0) error("Unexpected EOS"); + state=c+1; // 1=PASS, 2=PROG + if (state>2) error("unknown post processing type"); + if (state==1) z.clear(); + break; + case 1: // PASS + z.outc(c); + break; + case 2: // PROG + if (c<0) error("Unexpected EOS"); + hsize=c; // low byte of size + state=3; + break; + case 3: // PROG psize[0] + if (c<0) error("Unexpected EOS"); + hsize+=c*256; // high byte of psize + z.header.resize(hsize+300); + z.cend=8; + z.hbegin=z.hend=z.cend+128; + z.header[4]=ph; + z.header[5]=pm; + state=4; + break; + case 4: // PROG psize[0..1] pcomp[0...] + if (c<0) error("Unexpected EOS"); + assert(z.hend>8; + z.initp(); + state=5; + } + break; + case 5: // PROG ... data + z.run(c); + if (c<0) z.flush(); + break; + } + return state; +} + +/////////////////////// Decompresser ///////////////////// + +// Find the start of a block and return true if found. Set memptr +// to memory used. +bool Decompresser::findBlock(double* memptr) { + assert(state==BLOCK); + + // Find start of block + U32 h1=0x3D49B113, h2=0x29EB7F93, h3=0x2614BE13, h4=0x3828EB13; + // Rolling hashes initialized to hash of first 13 bytes + int c; + while ((c=dec.in->get())!=-1) { + h1=h1*12+c; + h2=h2*20+c; + h3=h3*28+c; + h4=h4*44+c; + if (h1==0xB16B88F1 && h2==0xFF5376F1 && h3==0x72AC5BF1 && h4==0x2F909AF1) + break; // hash of 16 byte string + } + if (c==-1) return false; + + // Read header + if ((c=dec.in->get())!=1 && c!=2) error("unsupported ZPAQ level"); + if (dec.in->get()!=1) error("unsupported ZPAQL type"); + z.read(dec.in); + if (c==1 && z.header.isize()>6 && z.header[6]==0) + error("ZPAQ level 1 requires at least 1 component"); + if (memptr) *memptr=z.memory(); + state=FILENAME; + decode_state=FIRSTSEG; + return true; +} + +// Read the start of a segment (1) or end of block code (255). 
+// If a segment is found, write the filename and return true, else false. +bool Decompresser::findFilename(Writer* filename) { + assert(state==FILENAME); + int c=dec.in->get(); + if (c==1) { // segment found + while (true) { + c=dec.in->get(); + if (c==-1) error("unexpected EOF"); + if (c==0) { + state=COMMENT; + return true; + } + if (filename) filename->put(c); + } + } + else if (c==255) { // end of block found + state=BLOCK; + return false; + } + else + error("missing segment or end of block"); + return false; +} + +// Read the comment from the segment header +void Decompresser::readComment(Writer* comment) { + assert(state==COMMENT); + state=DATA; + while (true) { + int c=dec.in->get(); + if (c==-1) error("unexpected EOF"); + if (c==0) break; + if (comment) comment->put(c); + } + if (dec.in->get()!=0) error("missing reserved byte"); +} + +// Decompress n bytes, or all if n < 0. Return false if done +bool Decompresser::decompress(int n) { + assert(state==DATA); + assert(decode_state!=SKIP); + + // Initialize models to start decompressing block + if (decode_state==FIRSTSEG) { + dec.init(); + assert(z.header.size()>5); + pp.init(z.header[4], z.header[5]); + decode_state=SEG; + } + + // Decompress and load PCOMP into postprocessor + while ((pp.getState()&3)!=1) + pp.write(dec.decompress()); + + // Decompress n bytes, or all if n < 0 + while (n) { + int c=dec.decompress(); + pp.write(c); + if (c==-1) { + state=SEGEND; + return false; + } + if (n>0) --n; + } + return true; +} + +// Read end of block. If a SHA1 checksum is present, write 1 and the +// 20 byte checksum into sha1string, else write 0 in first byte. +// If sha1string is 0 then discard it. 
+void Decompresser::readSegmentEnd(char* sha1string) { + assert(state==DATA || state==SEGEND); + + // Skip remaining data if any and get next byte + int c=0; + if (state==DATA) { + c=dec.skip(); + decode_state=SKIP; + } + else if (state==SEGEND) + c=dec.in->get(); + state=FILENAME; + + // Read checksum + if (c==254) { + if (sha1string) sha1string[0]=0; // no checksum + } + else if (c==253) { + if (sha1string) sha1string[0]=1; + for (int i=1; i<=20; ++i) { + c=dec.in->get(); + if (sha1string) sha1string[i]=c; + } + } + else + error("missing end of segment marker"); +} + +/////////////////////////// decompress() ///////////////////// + +void decompress(Reader* in, Writer* out) { + Decompresser d; + d.setInput(in); + d.setOutput(out); + while (d.findBlock()) { // don't calculate memory + while (d.findFilename()) { // discard filename + d.readComment(); // discard comment + d.decompress(); // to end of segment + d.readSegmentEnd(); // discard sha1string + } + } +} + +////////////////////// Encoder //////////////////// + +// Initialize for start of block +void Encoder::init() { + low=1; + high=0xFFFFFFFF; + pr.init(); + if (!pr.isModeled()) low=0, buf.resize(1<<16); +} + +// compress bit y having probability p/64K +void Encoder::encode(int y, int p) { + assert(out); + assert(p>=0 && p<65536); + assert(y==0 || y==1); + assert(high>low && low>0); + U32 mid=low+U32(((high-low)*U64(U32(p)))>>16); // split range + assert(high>mid && mid>=low); + if (y) high=mid; else low=mid+1; // pick half + while ((high^low)<0x1000000) { // write identical leading bytes + out->put(high>>24); // same as low>>24 + high=high<<8|255; + low=low<<8; + low+=(low==0); // so we don't code 4 0 bytes in a row + } +} + +// compress byte c (0..255 or -1=EOS) +void Encoder::compress(int c) { + assert(out); + if (pr.isModeled()) { + if (c==-1) + encode(1, 0); + else { + assert(c>=0 && c<=255); + encode(0, 0); + for (int i=7; i>=0; --i) { + int p=pr.predict()*2+1; + assert(p>0 && p<65536); + int y=c>>i&1; 
+ encode(y, p); + pr.update(y); + } + } + } + else { + if (c<0 || low==buf.size()) { + out->put((low>>24)&255); + out->put((low>>16)&255); + out->put((low>>8)&255); + out->put(low&255); + out->write(&buf[0], low); + low=0; + } + if (c>=0) buf[low++]=c; + } +} + +///////////////////// Compressor ////////////////////// + +// Write 13 byte start tag +// "\x37\x6B\x53\x74\xA0\x31\x83\xD3\x8C\xB2\x28\xB0\xD3" +void Compressor::writeTag() { + assert(state==INIT); + enc.out->put(0x37); + enc.out->put(0x6b); + enc.out->put(0x53); + enc.out->put(0x74); + enc.out->put(0xa0); + enc.out->put(0x31); + enc.out->put(0x83); + enc.out->put(0xd3); + enc.out->put(0x8c); + enc.out->put(0xb2); + enc.out->put(0x28); + enc.out->put(0xb0); + enc.out->put(0xd3); +} + +void Compressor::startBlock(int level) { + + // Model 1 - min.cfg + static const char models[]={ + 26,0,1,2,0,0,2,3,16,8,19,0,0,96,4,28, + 59,10,59,112,25,10,59,10,59,112,56,0, + + // Model 2 - mid.cfg + 69,0,3,3,0,0,8,3,5,8,13,0,8,17,1,8, + 18,2,8,18,3,8,19,4,4,22,24,7,16,0,7,24, + -1,0,17,104,74,4,95,1,59,112,10,25,59,112,10,25, + 59,112,10,25,59,112,10,25,59,112,10,25,59,10,59,112, + 25,69,-49,8,112,56,0, + + // Model 3 - max.cfg + -60,0,5,9,0,0,22,1,-96,3,5,8,13,1,8,16, + 2,8,18,3,8,19,4,8,19,5,8,20,6,4,22,24, + 3,17,8,19,9,3,13,3,13,3,13,3,14,7,16,0, + 15,24,-1,7,8,0,16,10,-1,6,0,15,16,24,0,9, + 8,17,32,-1,6,8,17,18,16,-1,9,16,19,32,-1,6, + 0,19,20,16,0,0,17,104,74,4,95,2,59,112,10,25, + 59,112,10,25,59,112,10,25,59,112,10,25,59,112,10,25, + 59,10,59,112,10,25,59,112,10,25,69,-73,32,-17,64,47, + 14,-25,91,47,10,25,60,26,48,-122,-105,20,112,63,9,70, + -33,0,39,3,25,112,26,52,25,25,74,10,4,59,112,25, + 10,4,59,112,25,10,4,59,112,25,65,-113,-44,72,4,59, + 112,8,-113,-40,8,68,-81,60,60,25,69,-49,9,112,25,25, + 25,25,25,112,56,0, + + 0,0}; // 0,0 = end of list + + if (level<1) error("compression level must be at least 1"); + const char* p=models; + int i; + for (i=1; iput('z'); + enc.out->put('P'); + enc.out->put('Q'); + 
enc.out->put(1+(len>6 && hcomp[6]==0)); // level 1 or 2 + enc.out->put(1); + for (int i=0; iput(hcomp[i]); + MemoryReader m(hcomp); + z.read(&m); + state=BLOCK1; +} + +// Write a segment header +void Compressor::startSegment(const char* filename, const char* comment) { + assert(state==BLOCK1 || state==BLOCK2); + enc.out->put(1); + while (filename && *filename) + enc.out->put(*filename++); + enc.out->put(0); + while (comment && *comment) + enc.out->put(*comment++); + enc.out->put(0); + enc.out->put(0); + if (state==BLOCK1) state=SEG1; + if (state==BLOCK2) state=SEG2; +} + +// Initialize encoding and write pcomp to first segment +// If len is 0 then length is encoded in pcomp[0..1] +void Compressor::postProcess(const char* pcomp, int len) { + assert(state==SEG1); + enc.init(); + if (pcomp) { + enc.compress(1); + if (len<=0) { + len=toU16(pcomp); + pcomp+=2; + } + enc.compress(len&255); + enc.compress((len>>8)&255); + for (int i=0; iget())>=0) { + enc.compress(ch); + if (n>0) --n; + } + return ch>=0; +} + +// End segment, write sha1string if present +void Compressor::endSegment(const char* sha1string) { + assert(state==SEG2); + enc.compress(-1); + enc.out->put(0); + enc.out->put(0); + enc.out->put(0); + enc.out->put(0); + if (sha1string) { + enc.out->put(253); + for (int i=0; i<20; ++i) + enc.out->put(sha1string[i]); + } + else + enc.out->put(254); + state=BLOCK2; +} + +// End block +void Compressor::endBlock() { + assert(state==BLOCK2); + enc.out->put(255); + state=INIT; +} + +/////////////////////////// compress() /////////////////////// + +void compress(Reader* in, Writer* out, int level) { + assert(level>=1); + Compressor c; + c.setInput(in); + c.setOutput(out); + c.startBlock(level); + c.startSegment(); + c.postProcess(); + c.compress(); + c.endSegment(); + c.endBlock(); +} + +//////////////////////// ZPAQL::assemble() //////////////////// + +#ifndef NOJIT +/* +assemble(); + +Assembles the ZPAQL code in hcomp[0..hlen-1] and stores x86-32 or x86-64 +code in 
rcode[0..rcode_size-1]. Execution begins at rcode[0]. It will not +write beyond the end of rcode, but in any case it returns the number of +bytes that would have been written. It returns 0 in case of error. + +The assembled code implements run() and returns 1 if successful or +0 if the ZPAQL code executes an invalid instruction or jumps out of +bounds. + +A ZPAQL virtual machine has the following state. All values are +unsigned and initially 0: + + a, b, c, d: 32 bit registers (pointed to by their respective parameters) + f: 1 bit flag register (pointed to) + r[0..255]: 32 bit registers + m[0..msize-1]: 8 bit registers, where msize is a power of 2 + h[0..hsize-1]: 32 bit registers, where hsize is a power of 2 + out: pointer to a Writer + sha1: pointer to a SHA1 + +Generally a ZPAQL machine is used to compute contexts which are +placed in h. A second machine might post-process, and write its +output to out and sha1. In either case, a machine is called with +its input in a, representing a single byte (0..255) or +(for a postprocessor) EOF (0xffffffff). Execution returns after a +ZPAQL halt instruction. + +ZPAQL instructions are 1 byte unless the last 3 bits are 1. +In this case, a second operand byte follows. Opcode 255 is +the only 3 byte instruction. They are organized: + + 00dddxxx = unary opcode xxx on destination ddd (ddd < 111) + 00111xxx = special instruction xxx + 01dddsss = assignment: ddd = sss (ddd < 111) + 1xxxxsss = operation xxxx from sss to a + +The meanings of sss and ddd are as follows: + + 000 = a (accumulator) + 001 = b + 010 = c + 011 = d + 100 = *b (means m[b mod msize]) + 101 = *c (means m[c mod msize]) + 110 = *d (means h[d mod hsize]) + 111 = n (constant 0..255 in second byte of instruction) + +For example, 01001110 assigns *d to b. 
The other instructions xxx +are as follows: + +Group 00dddxxx where ddd < 111 and xxx is: + 000 = ddd<>a, swap with a (except 00000000 is an error, and swap + with *b or *c leaves the high bits of a unchanged) + 001 = ddd++, increment + 010 = ddd--, decrement + 011 = ddd!, not (invert all bits) + 100 = ddd=0, clear (set all bits of ddd to 0) + 101 = not used (error) + 110 = not used + 111 = ddd=r n, assign from r[n] to ddd, n=0..255 in next opcode byte +Except: + 00100111 = jt n, jump if f is true (n = -128..127, relative to next opcode) + 00101111 = jf n, jump if f is false (n = -128..127) + 00110111 = r=a n, assign r[n] = a (n = 0..255) + +Group 00111xxx where xxx is: + 000 = halt (return) + 001 = output a + 010 = not used + 011 = hash: a = (a + *b + 512) * 773 + 100 = hashd: *d = (*d + a + 512) * 773 + 101 = not used + 110 = not used + 111 = unconditional jump (n = -128 to 127, relative to next opcode) + +Group 1xxxxsss where xxxx is: + 0000 = a += sss (add, subtract, multiply, divide sss to a) + 0001 = a -= sss + 0010 = a *= sss + 0011 = a /= sss (unsigned, except set a = 0 if sss is 0) + 0100 = a %= sss (remainder, except set a = 0 if sss is 0) + 0101 = a &= sss (bitwise AND) + 0110 = a &= ~sss (bitwise AND with complement of sss) + 0111 = a |= sss (bitwise OR) + 1000 = a ^= sss (bitwise XOR) + 1001 = a <<= (sss % 32) (left shift by low 5 bits of sss) + 1010 = a >>= (sss % 32) (unsigned, zero bits shifted in) + 1011 = a == sss (compare, set f = true if equal or false otherwise) + 1100 = a < sss (unsigned compare, result in f) + 1101 = a > sss (unsigned compare) + 1110 = not used + 1111 = not used except 11111111 is a 3 byte jump to the absolute address + in the next 2 bytes in little-endian (LSB first) order. + +assemble() translates ZPAQL to 32 bit x86 code to be executed by run(). 
+Registers are mapped as follows: + + eax = source sss from *b, *c, *d or sometimes n + ecx = pointer to destination *b, *c, *d, or spare + edx = a + ebx = f (1 for true, 0 for false) + esp = stack pointer + ebp = d + esi = b + edi = c + +run() saves non-volatile registers (ebp, esi, edi, ebx) on the stack, +loads a, b, c, d, f, and executes the translated instructions. +A halt instruction saves a, b, c, d, f, pops the saved registers +and returns. Invalid instructions or jumps outside of the range +of the ZPAQL code call libzpaq::error(). + +In 64 bit mode, the following additional registers are used: + + r12 = h + r14 = r + r15 = m + +*/ + +// Called by out +static void flush1(ZPAQL* z) { + z->flush(); +} + +// return true if op is an undefined ZPAQL instruction +static bool iserr(int op) { + return op==0 || (op>=120 && op<=127) || (op>=240 && op<=254) + || op==58 || (op<64 && (op%8==5 || op%8==6)); +} + +// Write k bytes of x to rcode[o++] MSB first +static void put(U8* rcode, int n, int& o, U32 x, int k) { + while (k-->0) { + if (o>(k*8))&255; + ++o; + } +} + +// Write 4 bytes of x to rcode[o++] LSB first +static void put4lsb(U8* rcode, int n, int& o, U32 x) { + for (int k=0; k<4; ++k) { + if (o>(k*8))&255; + ++o; + } +} + +// Write a 1-4 byte x86 opcode without or with an 4 byte operand +// to rcode[o...] 
+#define put1(x) put(rcode, rcode_size, o, (x), 1) +#define put2(x) put(rcode, rcode_size, o, (x), 2) +#define put3(x) put(rcode, rcode_size, o, (x), 3) +#define put4(x) put(rcode, rcode_size, o, (x), 4) +#define put5(x,y) put4(x), put1(y) +#define put6(x,y) put4(x), put2(y) +#define put4r(x) put4lsb(rcode, rcode_size, o, x) +#define puta(x) t=U32(size_t(x)), put4r(t) +#define put1a(x,y) put1(x), puta(y) +#define put2a(x,y) put2(x), puta(y) +#define put3a(x,y) put3(x), puta(y) +#define put4a(x,y) put4(x), puta(y) +#define put5a(x,y,z) put4(x), put1(y), puta(z) +#define put2l(x,y) put2(x), t=U32(size_t(y)), put4r(t), \ + t=U32(size_t(y)>>(S*4)), put4r(t) + +// Assemble ZPAQL in in the HCOMP section of header to rcode, +// but do not write beyond rcode_size. Return the number of +// bytes output or that would have been output. +// Execution starts at rcode[0] and returns 1 if successful or 0 +// in case of a ZPAQL execution error. +int ZPAQL::assemble() { + + // x86? (not foolproof) + const int S=sizeof(char*); // 4 = x86, 8 = x86-64 + U32 t=0x12345678; + if (*(char*)&t!=0x78 || (S!=4 && S!=8)) + error("JIT supported only for x86-32 and x86-64"); + + const U8* hcomp=&header[hbegin]; + const int hlen=hend-hbegin+1; + const int msize=m.size(); + const int hsize=h.size(); + const int regcode[8]={2,6,7,5}; // a,b,c,d.. -> edx,esi,edi,ebp,eax.. 
+ Array it(hlen); // hcomp -> rcode locations + int done=0; // number of instructions assembled (0..hlen) + int o=5; // rcode output index, reserve space for jmp + + // Code for the halt instruction (restore registers and return) + const int halt=o; + if (S==8) { + put2l(0x48b9, &a); // mov rcx, a + put2(0x8911); // mov [rcx], edx + put2l(0x48b9, &b); // mov rcx, b + put2(0x8931); // mov [rcx], esi + put2l(0x48b9, &c); // mov rcx, c + put2(0x8939); // mov [rcx], edi + put2l(0x48b9, &d); // mov rcx, d + put2(0x8929); // mov [rcx], ebp + put2l(0x48b9, &f); // mov rcx, f + put2(0x8919); // mov [rcx], ebx + put4(0x4883c438); // add rsp, 56 + put2(0x415f); // pop r15 + put2(0x415e); // pop r14 + put2(0x415d); // pop r13 + put2(0x415c); // pop r12 + } + else { + put2a(0x8915, &a); // mov [a], edx + put2a(0x8935, &b); // mov [b], esi + put2a(0x893d, &c); // mov [c], edi + put2a(0x892d, &d); // mov [d], ebp + put2a(0x891d, &f); // mov [f], ebx + put3(0x83c43c); // add esp, 60 + } + put1(0x5d); // pop ebp + put1(0x5b); // pop ebx + put1(0x5f); // pop edi + put1(0x5e); // pop esi + put1(0xc3); // ret + + // Code for the out instruction. + // Store a=edx at outbuf[bufptr++]. If full, call flush1(). 
+ const int outlabel=o; + if (S==8) { + put2l(0x48b8, &outbuf[0]);// mov rax, outbuf.p + put2l(0x49ba, &bufptr); // mov r10, &bufptr + put3(0x418b0a); // mov ecx, [r10] + put3(0x891408); // mov [rax+rcx], edx + put2(0xffc1); // inc ecx + put3(0x41890a); // mov [r10], ecx + put2a(0x81f9, outbuf.size()); // cmp ecx, outbuf.size() + put2(0x7401); // jz L1 + put1(0xc3); // ret + put4(0x4883ec30); // L1: sub esp, 48 ; call flush1(this) + put4(0x48893c24); // mov [rsp], rdi + put5(0x48897424,8); // mov [rsp+8], rsi + put5(0x48895424,16); // mov [rsp+16], rdx + put5(0x48894c24,24); // mov [rsp+24], rcx +#ifdef unix + put2l(0x48bf, this); // mov rdi, this +#else // Windows + put2l(0x48b9, this); // mov rcx, this +#endif + put2l(0x49bb, &flush1); // mov r11, &flush1 + put3(0x41ffd3); // call r11 + put5(0x488b4c24,24); // mov rcx, [rsp+24] + put5(0x488b5424,16); // mov rdx, [rsp+16] + put5(0x488b7424,8); // mov rsi, [rsp+8] + put4(0x488b3c24); // mov rdi, [rsp] + put4(0x4883c430); // add esp, 48 + put1(0xc3); // ret + } + else { + put1a(0xb8, &outbuf[0]); // mov eax, outbuf.p + put2a(0x8b0d, &bufptr); // mov ecx, [bufptr] + put3(0x891408); // mov [eax+ecx], edx + put2(0xffc1); // inc ecx + put2a(0x890d, &bufptr); // mov [bufptr], ecx + put2a(0x81f9, outbuf.size()); // cmp ecx, outbuf.size() + put2(0x7401); // jz L1 + put1(0xc3); // ret + put3(0x83ec08); // L1: sub esp, 8 + put4(0x89542404); // mov [esp+4], edx + put3a(0xc70424, this); // mov [esp], this + put1a(0xb8, &flush1); // mov eax, &flush1 + put2(0xffd0); // call eax + put4(0x8b542404); // mov edx, [esp+4] + put3(0x83c408); // add esp, 8 + put1(0xc3); // ret + } + + // Set it[i]=1 for each ZPAQL instruction reachable from the previous + // instruction + 2 if reachable by a jump (or 3 if both). 
+ it[0]=2; + assert(hlen>0 && hcomp[hlen-1]==0); // ends with error + do { + done=0; + const int NONE=0x80000000; + for (int i=0; i>24);// jt,jf,jmp + if (op==63) next1=NONE; // jmp + if ((next2<0 || next2>=hlen) && next2!=NONE) next2=hlen-1; // error + if (next1!=NONE && !(it[next1]&1)) it[next1]|=1, ++done; + if (next2!=NONE && !(it[next2]&2)) it[next2]|=2, ++done; + } + } + } while (done>0); + + // Set it[i] bits 2-3 to 4, 8, or 12 if a comparison + // (<, >, == respectively) does not need to save the result in f, + // or if a conditional jump (jt, jf) does not need to read f. + // This is true if a comparison is followed directly by a jt/jf, + // the jt/jf is not a jump target, the byte before is not a jump + // target (for a 2 byte comparison), and for the comparison instruction + // if both paths after the jt/jf lead to another comparison or error + // before another jt/jf. At most hlen steps are traced because after + // that it must be an infinite loop. + for (int i=0; i=216 && op1<240 && (op2==39 || op2==47) + && it[i2]==1 && (i2==i+1 || it[i+1]==0)) { + int code=(op1-208)/8*4; // 4,8,12 is ==,<,> + it[i2]+=code; // OK to test CF, ZF instead of f + for (int j=0; j<2 && code; ++j) { // trace each path from i2 + int k=i2+2; // branch not taken + if (j==1) k=i2+2+(hcomp[i2+1]<<24>>24); // branch taken + for (int l=0; l=hlen) break; // out of bounds, pass + const int op=hcomp[k]; + if (op==39 || op==47) code=0; // jt,jf, fail + else if (op>=216 && op<240) break; // ==,<,>, pass + else if (iserr(op)) break; // error, pass + else if (op==255) k=hcomp[k+1]+256*hcomp[k+2]; // lj + else if (op==63) k=k+2+(hcomp[k+1]<<24>>24); // jmp + else if (op==56) k=0; // halt + else k=k+1+(op%8==7); // ordinary instruction + } + } + it[i]+=code; // if > 0 then OK to not save flags in f (bl) + } + } + + // Start of run(): Save x86 and load ZPAQL registers + const int start=o; + assert(start>=16); + put1(0x56); // push esi/rsi + put1(0x57); // push edi/rdi + put1(0x53); // push 
ebx/rbx + put1(0x55); // push ebp/rbp + if (S==8) { + put2(0x4154); // push r12 + put2(0x4155); // push r13 + put2(0x4156); // push r14 + put2(0x4157); // push r15 + put4(0x4883ec38); // sub rsp, 56 + put2l(0x48b8, &a); // mov rax, a + put2(0x8b10); // mov edx, [rax] + put2l(0x48b8, &b); // mov rax, b + put2(0x8b30); // mov esi, [rax] + put2l(0x48b8, &c); // mov rax, c + put2(0x8b38); // mov edi, [rax] + put2l(0x48b8, &d); // mov rax, d + put2(0x8b28); // mov ebp, [rax] + put2l(0x48b8, &f); // mov rax, f + put2(0x8b18); // mov ebx, [rax] + put2l(0x49bc, &h[0]); // mov r12, h + put2l(0x49bd, &outbuf[0]); // mov r13, outbuf.p + put2l(0x49be, &r[0]); // mov r14, r + put2l(0x49bf, &m[0]); // mov r15, m + } + else { + put3(0x83ec3c); // sub esp, 60 + put2a(0x8b15, &a); // mov edx, [a] + put2a(0x8b35, &b); // mov esi, [b] + put2a(0x8b3d, &c); // mov edi, [c] + put2a(0x8b2d, &d); // mov ebp, [d] + put2a(0x8b1d, &f); // mov ebx, [f] + } + + // Assemble in multiple passes until every byte of hcomp has a translation + for (int istart=0; istarti); + assert(i>=0 && i=16) { + if (i>istart) { + int a=code-o; + if (a>-120 && a<120) + put2(0xeb00+((a-2)&255)); // jmp short o + else + put1a(0xe9, a-5); // jmp near o + } + break; + } + + // Else assemble the instruction at hcode[i] to rcode[o] + else { + assert(i>=0 && i0 && it[i]<16); + assert(o>=16); + it[i]=o; + ++done; + const int op=hcomp[i]; + const int arg=hcomp[i+1]+((op==255)?256*hcomp[i+2]:0); + const int ddd=op/8%8; + const int sss=op%8; + + // error instruction: return 0 + if (iserr(op)) { + put2(0x31c0); // xor eax, eax + put1a(0xe9, halt-o-4); // jmp near halt + continue; + } + + // Load source *b, *c, *d, or hash (*b) into eax except: + // {a,b,c,d}=*d, a{+,-,*,&,|,^,=,==,>,>}=*d: load address to eax + // {a,b,c,d}={*b,*c}: load source into ddd + if (op==59 || (op>=64 && op<240 && op%8>=4 && op%8<7)) { + put2(0x89c0+8*regcode[sss-3+(op==59)]); // mov eax, {esi,edi,ebp} + const int sz=(sss==6?hsize:msize)-1; + if 
(sz>=128) put1a(0x25, sz); // and eax, dword msize-1 + else put3(0x83e000+sz); // and eax, byte msize-1 + const int move=(op>=64 && op<112); // = or else ddd is eax + if (sss<6) { // ddd={a,b,c,d,*b,*c} + if (S==8) put5(0x410fb604+8*move*regcode[ddd],0x07); + // movzx ddd, byte [r15+rax] + else put3a(0x0fb680+8*move*regcode[ddd], &m[0]); + // movzx ddd, byte [m+eax] + } + else if ((0x06587000>>(op/8))&1) {// {*b,*c,*d,a/,a%,a&~,a<<,a>>}=*d + if (S==8) put4(0x418b0484); // mov eax, [r12+rax*4] + else put3a(0x8b0485, &h[0]); // mov eax, [h+eax*4] + } + } + + // Load destination address *b, *c, *d or hashd (*d) into ecx + if ((op>=32 && op<56 && op%8<5) || (op>=96 && op<120) || op==60) { + put2(0x89c1+8*regcode[op/8%8-3-(op==60)]);// mov ecx,{esi,edi,ebp} + const int sz=(ddd==6||op==60?hsize:msize)-1; + if (sz>=128) put2a(0x81e1, sz); // and ecx, dword sz + else put3(0x83e100+sz); // and ecx, byte sz + if (op/8%8==6 || op==60) { // *d + if (S==8) put4(0x498d0c8c); // lea rcx, [r12+rcx*4] + else put3a(0x8d0c8d, &h[0]); // lea ecx, [ecx*4+h] + } + else { // *b, *c + if (S==8) put4(0x498d0c0f); // lea rcx, [r15+rcx] + else put2a(0x8d89, &m[0]); // lea ecx, [ecx+h] + } + } + + // Translate by opcode + switch((op/8)&31) { + case 0: // ddd = a + case 1: // ddd = b + case 2: // ddd = c + case 3: // ddd = d + switch(sss) { + case 0: // ddd<>a (swap) + put2(0x87d0+regcode[ddd]); // xchg edx, ddd + break; + case 1: // ddd++ + put2(0xffc0+regcode[ddd]); // inc ddd + break; + case 2: // ddd-- + put2(0xffc8+regcode[ddd]); // dec ddd + break; + case 3: // ddd! 
+ put2(0xf7d0+regcode[ddd]); // not ddd + break; + case 4: // ddd=0 + put2(0x31c0+9*regcode[ddd]); // xor ddd,ddd + break; + case 7: // ddd=r n + if (S==8) + put3a(0x418b86+8*regcode[ddd], arg*4); // mov ddd, [r14+n*4] + else + put2a(0x8b05+8*regcode[ddd], (&r[arg]));//mov ddd, [r+n] + break; + } + break; + case 4: // ddd = *b + case 5: // ddd = *c + switch(sss) { + case 0: // ddd<>a (swap) + put2(0x8611); // xchg dl, [ecx] + break; + case 1: // ddd++ + put2(0xfe01); // inc byte [ecx] + break; + case 2: // ddd-- + put2(0xfe09); // dec byte [ecx] + break; + case 3: // ddd! + put2(0xf611); // not byte [ecx] + break; + case 4: // ddd=0 + put2(0x31c0); // xor eax, eax + put2(0x8801); // mov [ecx], al + break; + case 7: // jt, jf + { + assert(code>=0 && code<16); + const int jtab[2][4]={{5,4,2,7},{4,5,3,6}}; + // jnz,je,jb,ja, jz,jne,jae,jbe + if (code<4) put2(0x84db); // test bl, bl + if (arg>=128 && arg-257-i>=0 && o-it[arg-257-i]<120) + put2(0x7000+256*jtab[op==47][code/4]); // jx short 0 + else + put2a(0x0f80+jtab[op==47][code/4], 0); // jx near 0 + break; + } + } + break; + case 6: // ddd = *d + switch(sss) { + case 0: // ddd<>a (swap) + put2(0x8711); // xchg edx, [ecx] + break; + case 1: // ddd++ + put2(0xff01); // inc dword [ecx] + break; + case 2: // ddd-- + put2(0xff09); // dec dword [ecx] + break; + case 3: // ddd! 
+ put2(0xf711); // not dword [ecx] + break; + case 4: // ddd=0 + put2(0x31c0); // xor eax, eax + put2(0x8901); // mov [ecx], eax + break; + case 7: // ddd=r n + if (S==8) + put3a(0x418996, arg*4); // mov [r14+n*4], edx + else + put2a(0x8915, &r[arg]); // mov [r+n], edx + break; + } + break; + case 7: // special + switch(op) { + case 56: // halt + put1a(0xb8, 1); // mov eax, 1 + put1a(0xe9, halt-o-4); // jmp near halt + break; + case 57: // out + put1a(0xe8, outlabel-o-4);// call outlabel + break; + case 59: // hash: a = (a + *b + 512) * 773 + put3a(0x8d8410, 512); // lea edx, [eax+edx+512] + put2a(0x69d0, 773); // imul edx, eax, 773 + break; + case 60: // hashd: *d = (*d + a + 512) * 773 + put2(0x8b01); // mov eax, [ecx] + put3a(0x8d8410, 512); // lea eax, [eax+edx+512] + put2a(0x69c0, 773); // imul eax, eax, 773 + put2(0x8901); // mov [ecx], eax + break; + case 63: // jmp + put1a(0xe9, 0); // jmp near 0 (fill in target later) + break; + } + break; + case 8: // a= + case 9: // b= + case 10: // c= + case 11: // d= + if (sss==7) // n + put1a(0xb8+regcode[ddd], arg); // mov ddd, n + else if (sss==6) { // *d + if (S==8) + put4(0x418b0484+(regcode[ddd]<<11)); // mov ddd, [r12+rax*4] + else + put3a(0x8b0485+(regcode[ddd]<<11),&h[0]);// mov ddd, [h+eax*4] + } + else if (sss<4) // a, b, c, d + put2(0x89c0+regcode[ddd]+8*regcode[sss]);// mov ddd,sss + break; + case 12: // *b= + case 13: // *c= + if (sss==7) put3(0xc60100+arg); // mov byte [ecx], n + else if (sss==0) put2(0x8811); // mov byte [ecx], dl + else { + if (sss<4) put2(0x89c0+8*regcode[sss]);// mov eax, sss + put2(0x8801); // mov byte [ecx], al + } + break; + case 14: // *d= + if (sss<7) put2(0x8901+8*regcode[sss]); // mov [ecx], sss + else put2a(0xc701, arg); // mov dword [ecx], n + break; + case 15: break; // not used + case 16: // a+= + if (sss==6) { + if (S==8) put4(0x41031484); // add edx, [r12+rax*4] + else put3a(0x031485, &h[0]); // add edx, [h+eax*4] + } + else if (sss<7) put2(0x01c2+8*regcode[sss]);// add 
edx, sss + else if (arg>128) put2a(0x81c2, arg); // add edx, n + else put3(0x83c200+arg); // add edx, byte n + break; + case 17: // a-= + if (sss==6) { + if (S==8) put4(0x412b1484); // sub edx, [r12+rax*4] + else put3a(0x2b1485, &h[0]); // sub edx, [h+eax*4] + } + else if (sss<7) put2(0x29c2+8*regcode[sss]);// sub edx, sss + else if (arg>=128) put2a(0x81ea, arg); // sub edx, n + else put3(0x83ea00+arg); // sub edx, byte n + break; + case 18: // a*= + if (sss==6) { + if (S==8) put5(0x410faf14,0x84); // imul edx, [r12+rax*4] + else put4a(0x0faf1485, &h[0]); // imul edx, [h+eax*4] + } + else if (sss<7) put3(0x0fafd0+regcode[sss]);// imul edx, sss + else if (arg>=128) put2a(0x69d2, arg); // imul edx, n + else put3(0x6bd200+arg); // imul edx, byte n + break; + case 19: // a/= + case 20: // a%= + if (sss<7) put2(0x89c1+8*regcode[sss]); // mov ecx, sss + else put1a(0xb9, arg); // mov ecx, n + put2(0x85c9); // test ecx, ecx + put3(0x0f44d1); // cmovz edx, ecx + put2(0x7408-2*(op/8==20)); // jz (over rest) + put2(0x89d0); // mov eax, edx + put2(0x31d2); // xor edx, edx + put2(0xf7f1); // div ecx + if (op/8==19) put2(0x89c2); // mov edx, eax + break; + case 21: // a&= + if (sss==6) { + if (S==8) put4(0x41231484); // and edx, [r12+rax*4] + else put3a(0x231485, &h[0]); // and edx, [h+eax*4] + } + else if (sss<7) put2(0x21c2+8*regcode[sss]);// and edx, sss + else if (arg>=128) put2a(0x81e2, arg); // and edx, n + else put3(0x83e200+arg); // and edx, byte n + break; + case 22: // a&~ + if (sss==7) { + if (arg<128) put3(0x83e200+(~arg&255));// and edx, byte ~n + else put2a(0x81e2, ~arg); // and edx, ~n + } + else { + if (sss<4) put2(0x89c0+8*regcode[sss]);// mov eax, sss + put2(0xf7d0); // not eax + put2(0x21c2); // and edx, eax + } + break; + case 23: // a|= + if (sss==6) { + if (S==8) put4(0x410b1484); // or edx, [r12+rax*4] + else put3a(0x0b1485, &h[0]); // or edx, [h+eax*4] + } + else if (sss<7) put2(0x09c2+8*regcode[sss]);// or edx, sss + else if (arg>=128) put2a(0x81ca, 
arg); // or edx, n + else put3(0x83ca00+arg); // or edx, byte n + break; + case 24: // a^= + if (sss==6) { + if (S==8) put4(0x41331484); // xor edx, [r12+rax*4] + else put3a(0x331485, &h[0]); // xor edx, [h+eax*4] + } + else if (sss<7) put2(0x31c2+8*regcode[sss]);// xor edx, sss + else if (arg>=128) put2a(0x81f2, arg); // xor edx, byte n + else put3(0x83f200+arg); // xor edx, n + break; + case 25: // a<<= + case 26: // a>>= + if (sss==7) // sss = n + put3(0xc1e200+8*256*(op/8==26)+arg); // shl/shr n + else { + put2(0x89c1+8*regcode[sss]); // mov ecx, sss + put2(0xd3e2+8*(op/8==26)); // shl/shr edx, cl + } + break; + case 27: // a== + case 28: // a< + case 29: // a> + if (sss==6) { + if (S==8) put4(0x413b1484); // cmp edx, [r12+rax*4] + else put3a(0x3b1485, &h[0]); // cmp edx, [h+eax*4] + } + else if (sss==7) // sss = n + put2a(0x81fa, arg); // cmp edx, dword n + else + put2(0x39c2+8*regcode[sss]); // cmp edx, sss + if (code<4) { + if (op/8==27) put3(0x0f94c3); // setz bl + if (op/8==28) put3(0x0f92c3); // setc bl + if (op/8==29) put3(0x0f97c3); // seta bl + } + break; + case 30: // not used + case 31: // 255 = lj + if (op==255) put1a(0xe9, 0); // jmp near + break; + } + } + } + } + + // Finish first pass + const int rsize=o; + if (o>rcode_size) return rsize; + + // Fill in jump addresses (second pass) + for (int i=0; i=128) target-=256; + target+=i+2; + } + if (target<0 || target>=hlen) target=hlen-1; // runtime ZPAQL error + o=it[i]; + assert(o>=16 && o skip test + assert(o>=16 && o=0x72 && op<0x78) || op==0xeb) { // jx, jmp short + --target; + if (target<-128 || target>127) + error("Cannot code x86 short jump"); + assert(o=0x82 && op<0x88) || op==0xe9) // jx, jmp near + { + target-=4; + puta(target); + } + else assert(false); // not a x86 jump + } + } + + // Jump to start + o=0; + put1a(0xe9, start-5); // jmp near start + return rsize; +} + +//////////////////////// Predictor::assemble_p() ///////////////////// + +// Assemble the ZPAQL code in the HCOMP section 
of z.header to pcomp and +// return the number of bytes of x86 or x86-64 code written, or that would +// be written if pcomp were large enough. The code for predict() begins +// at pr.pcomp[0] and update() at pr.pcomp[5], both as jmp instructions. + +// The assembled code is equivalent to int predict(Predictor*) +// and void update(Predictor*, int y); The Preditor address is placed in +// edi/rdi. The update bit y is placed in ebp/rbp. + +int Predictor::assemble_p() { + Predictor& pr=*this; + U8* rcode=pr.pcode; // x86 output array + int rcode_size=pcode_size; // output size + int o=0; // output index in pcode + const int S=sizeof(char*); // 4 or 8 + U8* hcomp=&pr.z.header[0]; // The code to translate +#define off(x) ((char*)&(pr.x)-(char*)&pr) +#define offc(x) ((char*)&(pr.comp[i].x)-(char*)&pr) + + // test for little-endian (probably x86) + U32 t=0x12345678; + if (*(char*)&t!=0x78 || (S!=4 && S!=8)) + error("JIT supported only for x86-32 and x86-64"); + + // Initialize for predict(). Put predictor address in edi/rdi + put1a(0xe9, 5); // jmp predict + put1a(0, 0x90909000); // reserve space for jmp update + put1(0x53); // push ebx/rbx + put1(0x55); // push ebp/rbp + put1(0x56); // push esi/rsi + put1(0x57); // push edi/rdi + if (S==4) + put4(0x8b7c2414); // mov edi,[esp+0x14] ; pr + else { +#ifndef unix + put3(0x4889cf); // mov rdi, rcx (1st arg in Win64) +#endif + } + + // Code predict() for each component + const int n=hcomp[6]; // number of components + U8* cp=hcomp+7; + for (int i=0; i=pr.z.cend) error("comp too big"); + if (cp[0]<1 || cp[0]>9) error("invalid component"); + assert(compsize[cp[0]]>0 && compsize[cp[0]]<8); + switch (cp[0]) { + + case CONS: // c + break; + + case CM: // sizebits limit + // Component& cr=comp[i]; + // cr.cxt=h[i]^hmap4; + // p[i]=stretch(cr.cm(cr.cxt)>>17); + + put2a(0x8b87, off(h[i])); // mov eax, [edi+&h[i]] + put2a(0x3387, off(hmap4)); // xor eax, [edi+&hmap4] + put1a(0x25, (1<rsi) + put2a(0x8bb7, offc(cm)); // mov esi, 
[edi+&cm] + put3(0x8b0486); // mov eax, [esi+eax*4] + put3(0xc1e811); // shr eax, 17 + put4a(0x0fbf8447, off(stretcht)); // movsx eax,word[edi+eax*2+..] + put2a(0x8987, off(p[i])); // mov [edi+&p[i]], eax + break; + + case ISSE: // sizebits j -- c=hi, cxt=bh + // assert((hmap4&15)>0); + // if (c8==1 || (c8&0xf0)==16) + // cr.c=find(cr.ht, cp[1]+2, h[i]+16*c8); + // cr.cxt=cr.ht[cr.c+(hmap4&15)]; // bit history + // int *wt=(int*)&cr.cm[cr.cxt*2]; + // p[i]=clamp2k((wt[0]*p[cp[2]]+wt[1]*64)>>16); + + case ICM: // sizebits + // assert((hmap4&15)>0); + // if (c8==1 || (c8&0xf0)==16) cr.c=find(cr.ht, cp[1]+2, h[i]+16*c8); + // cr.cxt=cr.ht[cr.c+(hmap4&15)]; + // p[i]=stretch(cr.cm(cr.cxt)>>8); + // + // Find cxt row in hash table ht. ht has rows of 16 indexed by the low + // sizebits of cxt with element 0 having the next higher 8 bits for + // collision detection. If not found after 3 adjacent tries, replace + // row with lowest element 1 as priority. Return index of row. + // + // size_t Predictor::find(Array& ht, int sizebits, U32 cxt) { + // assert(ht.size()==size_t(16)<>sizebits&255; + // size_t h0=(cxt*16)&(ht.size()-16); + // if (ht[h0]==chk) return h0; + // size_t h1=h0^16; + // if (ht[h1]==chk) return h1; + // size_t h2=h0^32; + // if (ht[h2]==chk) return h2; + // if (ht[h0+1]<=ht[h1+1] && ht[h0+1]<=ht[h2+1]) + // return memset(&ht[h0], 0, 16), ht[h0]=chk, h0; + // else if (ht[h1+1]>(7-cr.cxt))&1; // predicted bit + // p[i]=stretch(dt2k[cr.a]*(cr.c*-2+1)&32767); + // } + + if (S==8) put1(0x48); // rex.w + put2a(0x8bb7, offc(ht)); // mov esi, [edi+&ht] + + // If match length (a) is 0 then p[i]=0 + put2a(0x8b87, offc(a)); // mov eax, [edi+&a] + put2(0x85c0); // test eax, eax + put2(0x7449); // jz L2 ; p[i]=0 + + // Else put predicted bit in c + put1a(0xb9, 7); // mov ecx, 7 + put2a(0x2b8f, offc(cxt)); // sub ecx, [edi+&cxt] + put2a(0x8b87, offc(limit)); // mov eax, [edi+&limit] + put2a(0x2b87, offc(b)); // sub eax, [edi+&b] + put1a(0x25, (1<>8; + + put2a(0x8b87, 
off(p[cp[1]])); // mov eax, [edi+&p[j]] + put2a(0x2b87, off(p[cp[2]])); // sub eax, [edi+&p[k]] + put2a(0x69c0, cp[3]); // imul eax, wt + put3(0xc1f808); // sar eax, 8 + put2a(0x0387, off(p[cp[2]])); // add eax, [edi+&p[k]] + put2a(0x8987, off(p[i])); // mov [edi+&p[i]], eax + break; + + case MIX2: // sizebits j k rate mask + // c=size cm=wt[size] cxt=input + // cr.cxt=((h[i]+(c8&cp[5]))&(cr.c-1)); + // assert(cr.cxt=0 && w<65536); + // p[i]=(w*p[cp[2]]+(65536-w)*p[cp[3]])>>16; + // assert(p[i]>=-2048 && p[i]<2048); + + put2(0x8b07); // mov eax, [edi] ; c8 + put1a(0x25, cp[5]); // and eax, mask + put2a(0x0387, off(h[i])); // add eax, [edi+&h[i]] + put1a(0x25, (1<=1 && m<=i); + // cr.cxt=h[i]+(c8&cp[5]); + // cr.cxt=(cr.cxt&(cr.c-1))*m; // pointer to row of weights + // assert(cr.cxt<=cr.cm.size()-m); + // int* wt=(int*)&cr.cm[cr.cxt]; + // p[i]=0; + // for (int j=0; j>8)*p[cp[2]+j]; + // p[i]=clamp2k(p[i]>>8); + + put2(0x8b07); // mov eax, [edi] ; c8 + put1a(0x25, cp[5]); // and eax, mask + put2a(0x0387, off(h[i])); // add eax, [edi+&h[i]] + put1a(0x25, (1<3) put4a(0xf30f6f96, k*4+16);//movdqu xmm2, [esi+k*4+16] + put5(0x660f72e1,0x08); // psrad xmm1, 8 + if (tail>3) put5(0x660f72e2,0x08); // psrad xmm2, 8 + put4(0x660f6bca); // packssdw xmm1, xmm2 + put4a(0xf30f6f9f, off(p[cp[2]+k])); // movdqu xmm3, [edi+&p[j+k]] + if (tail>3) + put4a(0xf30f6fa7,off(p[cp[2]+k+4]));//movdqu xmm4, [edi+&p[j+k+4]] + put4(0x660f6bdc); // packssdw, xmm3, xmm4 + if (tail>0 && tail<8) { // last loop, mask extra weights + put4(0x660f76ed); // pcmpeqd xmm5, xmm5 ; -1 + put5(0x660f73dd, 16-tail*2); // psrldq xmm5, 16-tail*2 + put4(0x660fdbcd); // pand xmm1, xmm5 + } + if (k==0) { // first loop, initialize sum in xmm0 + put4(0xf30f6fc1); // movdqu xmm0, xmm1 + put4(0x660ff5c3); // pmaddwd xmm0, xmm3 + } + else { // accumulate sum in xmm0 + put4(0xf30f6fd1); // movdqu xmm2, xmm1 + put4(0x660ff5d3); // pmaddwd xmm2, xmm3 + put4(0x660ffec2); // paddd, xmm0, xmm2 + } + } + + // Add up the 4 
elements of xmm0 = p[i] in the first element + put4(0xf30f6fc8); // movdqu xmm1, xmm0 + put5(0x660f73d9,0x08); // psrldq xmm1, 8 + put4(0x660ffec1); // paddd xmm0, xmm1 + put4(0xf30f6fc8); // movdqu xmm1, xmm0 + put5(0x660f73d9,0x04); // psrldq xmm1, 4 + put4(0x660ffec1); // paddd xmm0, xmm1 + put4(0x660f7ec0); // movd eax, xmm0 ; p[i] + put3(0xc1f808); // sar eax, 8 + put1a(0xb9, 2047); // mov ecx, 2047 ; clamp2k + put2(0x39c8); // cmp eax, ecx + put3(0x0f4fc1); // cmovg eax, ecx + put2(0xf7d1); // not ecx ; -2048 + put2(0x39c8); // cmp eax, ecx + put3(0x0f4cc1); // cmovl eax, ecx + put2a(0x8987, off(p[i])); // mov [edi+&p[i]], eax + break; + + case SSE: // sizebits j start limit + // cr.cxt=(h[i]+c8)*32; + // int pq=p[cp[2]]+992; + // if (pq<0) pq=0; + // if (pq>1983) pq=1983; + // int wt=pq&63; + // pq>>=6; + // assert(pq>=0 && pq<=30); + // cr.cxt+=pq; + // p[i]=stretch(((cr.cm(cr.cxt)>>10)*(64-wt) // p0 + // +(cr.cm(cr.cxt+1)>>10)*wt)>>13); // p1 + // // p = p0*(64-wt)+p1*wt = (p1-p0)*wt + p0*64 + // cr.cxt+=wt>>5; + + put2a(0x8b8f, off(h[i])); // mov ecx, [edi+&h[i]] + put2(0x030f); // add ecx, [edi] ; c0 + put2a(0x81e1, (1<>5 + put2a(0x898f, offc(cxt)); // mov [edi+cxt], ecx ; cxt saved + put3(0xc1e80a); // shr eax, 10 ; p0 = cm[cxt]>>10 + put3(0xc1eb0a); // shr ebx, 10 ; p1 = cm[cxt+1]>>10 + put2(0x29c3); // sub ebx, eax, ; p1-p0 + put3(0x0fafda); // imul ebx, edx ; (p1-p0)*wt + put3(0xc1e006); // shr eax, 6 + put2(0x01d8); // add eax, ebx ; p in 0..2^28-1 + put3(0xc1e80d); // shr eax, 13 ; p in 0..32767 + put4a(0x0fbf8447, off(stretcht)); // movsx eax, word [edi+eax*2+...] + put2a(0x8987, off(p[i])); // mov [edi+&p[i]], eax + break; + + default: + error("invalid ZPAQ component"); + } + } + + // return squash(p[n-1]) + put2a(0x8b87, off(p[n-1])); // mov eax, [edi+...] + put1a(0x05, 0x800); // add eax, 2048 + put4a(0x0fbf8447, off(squasht[0])); // movsx eax, word [edi+eax*2+...] 
+ put1(0x5f); // pop edi + put1(0x5e); // pop esi + put1(0x5d); // pop ebp + put1(0x5b); // pop ebx + put1(0xc3); // ret + + // Initialize for update() Put predictor address in edi/rdi + // and bit y=0..1 in ebp + int save_o=o; + o=5; + put1a(0xe9, save_o-10); // jmp update + o=save_o; + put1(0x53); // push ebx/rbx + put1(0x55); // push ebp/rbp + put1(0x56); // push esi/rsi + put1(0x57); // push edi/rdi + if (S==4) { + put4(0x8b7c2414); // mov edi,[esp+0x14] ; (1st arg = pr) + put4(0x8b6c2418); // mov ebp,[esp+0x18] ; (2nd arg = y) + } + else { +#ifdef unix // (1st arg already in rdi) + put3(0x4889f5); // mov rbp, rsi (2nd arg in Linux-64) +#else + put3(0x4889cf); // mov rdi, rcx (1st arg in Win64) + put3(0x4889d5); // mov rbp, rdx (2nd arg) +#endif + } + + // Code update() for each component + cp=hcomp+7; + for (int i=0; i=1 && cp[0]<=9); + assert(compsize[cp[0]]>0 && compsize[cp[0]]<8); + switch (cp[0]) { + + case CONS: // c + break; + + case SSE: // sizebits j start limit + case CM: // sizebits limit + // train(cr, y); + // + // reduce prediction error in cr.cm + // void train(Component& cr, int y) { + // assert(y==0 || y==1); + // U32& pn=cr.cm(cr.cxt); + // U32 count=pn&0x3ff; + // int error=y*32767-(cr.cm(cr.cxt)>>17); + // pn+=(error*dt[count]&-1024)+(countrsi) + put2a(0x8bb7, offc(cm)); // mov esi,[edi+cm] ; cm + put2a(0x8b87, offc(cxt)); // mov eax,[edi+cxt] ; cxt + put1a(0x25, pr.comp[i].cm.size()-1); // and eax, size-1 + if (S==8) put1(0x48); // rex.w + put3(0x8d3486); // lea esi,[esi+eax*4] ; &cm[cxt] + put2(0x8b06); // mov eax,[esi] ; cm[cxt] + put2(0x89c2); // mov edx, eax ; cm[cxt] + put3(0xc1e811); // shr eax, 17 ; cm[cxt]>>17 + put2(0x89e9); // mov ecx, ebp ; y + put3(0xc1e10f); // shl ecx, 15 ; y*32768 + put2(0x29e9); // sub ecx, ebp ; y*32767 + put2(0x29c1); // sub ecx, eax ; error + put2a(0x81e2, 0x3ff); // and edx, 1023 ; count + put3a(0x8b8497, off(dt)); // mov eax,[edi+edx*4+dt] ; dt[count] + put3(0x0fafc8); // imul ecx, eax ; error*dt[count] 
+ put2a(0x81e1, 0xfffffc00); // and ecx, -1024 + put2a(0x81fa, cp[2+2*(cp[0]==SSE)]*4); // cmp edx, limit*4 + put2(0x110e); // adc [esi], ecx ; pn+=... + break; + + case ICM: // sizebits: cxt=bh, ht[c][0..15]=bh row + // cr.ht[cr.c+(hmap4&15)]=st.next(cr.ht[cr.c+(hmap4&15)], y); + // U32& pn=cr.cm(cr.cxt); + // pn+=int(y*32767-(pn>>8))>>2; + + case ISSE: // sizebits j -- c=hi, cxt=bh + // assert(cr.cxt==cr.ht[cr.c+(hmap4&15)]); + // int err=y*32767-squash(p[i]); + // int *wt=(int*)&cr.cm[cr.cxt*2]; + // wt[0]=clamp512k(wt[0]+((err*p[cp[2]]+(1<<12))>>13)); + // wt[1]=clamp512k(wt[1]+((err+16)>>5)); + // cr.ht[cr.c+(hmap4&15)]=st.next(cr.cxt, y); + + // update bit history bh to next(bh,y=ebp) in ht[c+(hmap4&15)] + put3(0x8b4700+off(hmap4)); // mov eax, [edi+&hmap4] + put3(0x83e00f); // and eax, 15 + put2a(0x0387, offc(c)); // add eax [edi+&c] ; cxt + if (S==8) put1(0x48); // rex.w + put2a(0x8bb7, offc(ht)); // mov esi, [edi+&ht] + put4(0x0fb61406); // movzx edx, byte [esi+eax] ; bh + put4(0x8d5c9500); // lea ebx, [ebp+edx*4] ; index to st + put4a(0x0fb69c1f, off(st)); // movzx ebx,byte[edi+ebx+st]; next bh + put3(0x881c06); // mov [esi+eax], bl ; save next bh + if (S==8) put1(0x48); // rex.w + put2a(0x8bb7, offc(cm)); // mov esi, [edi+&cm] + + // ICM: update cm[cxt=edx=bit history] to reduce prediction error + // esi = &cm + if (cp[0]==ICM) { + if (S==8) put1(0x48); // rex.w + put3(0x8d3496); // lea esi, [esi+edx*4] ; &cm[bh] + put2(0x8b06); // mov eax, [esi] ; pn + put3(0xc1e808); // shr eax, 8 ; pn>>8 + put2(0x89e9); // mov ecx, ebp ; y + put3(0xc1e10f); // shl ecx, 15 + put2(0x29e9); // sub ecx, ebp ; y*32767 + put2(0x29c1); // sub ecx, eax + put3(0xc1f902); // sar ecx, 2 + put2(0x010e); // add [esi], ecx + } + + // ISSE: update weights. edx=cxt=bit history (0..255), esi=cm[512] + else { + put2a(0x8b87, off(p[i])); // mov eax, [edi+&p[i]] + put1a(0x05, 2048); // add eax, 2048 + put4a(0x0fb78447, off(squasht)); // movzx eax, word [edi+eax*2+..] 
+ put2(0x89e9); // mov ecx, ebp ; y + put3(0xc1e10f); // shl ecx, 15 + put2(0x29e9); // sub ecx, ebp ; y*32767 + put2(0x29c1); // sub ecx, eax ; err + put2a(0x8b87, off(p[cp[2]]));// mov eax, [edi+&p[j]] + put3(0x0fafc1); // imul eax, ecx + put1a(0x05, (1<<12)); // add eax, 4096 + put3(0xc1f80d); // sar eax, 13 + put3(0x0304d6); // add eax, [esi+edx*8] ; wt[0] + put1a(0xbb, (1<<19)-1); // mov ebx, 524287 + put2(0x39d8); // cmp eax, ebx + put3(0x0f4fc3); // cmovg eax, ebx + put2(0xf7d3); // not ebx ; -524288 + put2(0x39d8); // cmp eax, ebx + put3(0x0f4cc3); // cmovl eax, ebx + put3(0x8904d6); // mov [esi+edx*8], eax + put3(0x83c110); // add ecx, 16 ; err + put3(0xc1f905); // sar ecx, 5 + put4(0x034cd604); // add ecx, [esi+edx*8+4] ; wt[1] + put1a(0xb8, (1<<19)-1); // mov eax, 524287 + put2(0x39c1); // cmp ecx, eax + put3(0x0f4fc8); // cmovg ecx, eax + put2(0xf7d0); // not eax ; -524288 + put2(0x39c1); // cmp ecx, eax + put3(0x0f4cc8); // cmovl ecx, eax + put4(0x894cd604); // mov [esi+edx*8+4], ecx + } + break; + + case MATCH: // sizebits bufbits: + // a=len, b=offset, c=bit, cm=index, cxt=bitpos + // ht=buf, limit=pos + // assert(cr.a<=255); + // assert(cr.c==0 || cr.c==1); + // assert(cr.cxt<8); + // assert(cr.cm.size()==(size_t(1)<>5; + // int w=cr.a16[cr.cxt]; + // w+=(err*(p[cp[2]]-p[cp[3]])+(1<<12))>>13; + // if (w<0) w=0; + // if (w>65535) w=65535; + // cr.a16[cr.cxt]=w; + + // set ecx=err + put2a(0x8b87, off(p[i])); // mov eax, [edi+&p[i]] + put1a(0x05, 2048); // add eax, 2048 + put4a(0x0fb78447, off(squasht));//movzx eax, word [edi+eax*2+&squasht] + put2(0x89e9); // mov ecx, ebp ; y + put3(0xc1e10f); // shl ecx, 15 + put2(0x29e9); // sub ecx, ebp ; y*32767 + put2(0x29c1); // sub ecx, eax + put2a(0x69c9, cp[4]); // imul ecx, rate + put3(0xc1f905); // sar ecx, 5 ; err + + // Update w + put2a(0x8b87, offc(cxt)); // mov eax, [edi+&cxt] + if (S==8) put1(0x48); // rex.w + put2a(0x8bb7, offc(a16)); // mov esi, [edi+&a16] + if (S==8) put1(0x48); // rex.w + 
put3(0x8d3446); // lea esi, [esi+eax*2] ; &w + put2a(0x8b87, off(p[cp[2]])); // mov eax, [edi+&p[j]] + put2a(0x2b87, off(p[cp[3]])); // sub eax, [edi+&p[k]] ; p[j]-p[k] + put3(0x0fafc1); // imul eax, ecx ; * err + put1a(0x05, 1<<12); // add eax, 4096 + put3(0xc1f80d); // sar eax, 13 + put3(0x0fb716); // movzx edx, word [esi] ; w + put2(0x01d0); // add eax, edx + put1a(0xba, 0xffff); // mov edx, 65535 + put2(0x39d0); // cmp eax, edx + put3(0x0f4fc2); // cmovg eax, edx + put2(0x31d2); // xor edx, edx + put2(0x39d0); // cmp eax, edx + put3(0x0f4cc2); // cmovl eax, edx + put3(0x668906); // mov word [esi], ax + break; + + case MIX: // sizebits j m rate mask + // cm=wt[size][m], cxt=input + // int m=cp[3]; + // assert(m>0 && m<=i); + // assert(cr.cm.size()==m*cr.c); + // assert(cr.cxt+m<=cr.cm.size()); + // int err=(y*32767-squash(p[i]))*cp[4]>>4; + // int* wt=(int*)&cr.cm[cr.cxt]; + // for (int j=0; j>13)); + + // set ecx=err + put2a(0x8b87, off(p[i])); // mov eax, [edi+&p[i]] + put1a(0x05, 2048); // add eax, 2048 + put4a(0x0fb78447, off(squasht));//movzx eax, word [edi+eax*2+&squasht] + put2(0x89e9); // mov ecx, ebp ; y + put3(0xc1e10f); // shl ecx, 15 + put2(0x29e9); // sub ecx, ebp ; y*32767 + put2(0x29c1); // sub ecx, eax + put2a(0x69c9, cp[4]); // imul ecx, rate + put3(0xc1f904); // sar ecx, 4 ; err + + // set esi=wt + put2a(0x8b87, offc(cxt)); // mov eax, [edi+&cxt] ; cxt + if (S==8) put1(0x48); // rex.w + put2a(0x8bb7, offc(cm)); // mov esi, [edi+&cm] + if (S==8) put1(0x48); // rex.w + put3(0x8d3486); // lea esi, [esi+eax*4] ; wt + + for (int k=0; k=256) { + z.run(c8-256); + hmap4=1; + c8=1; + for (int i=0; i=16 && c8<32) + hmap4=(hmap4&0xf)<<5|y<<4|1; + else + hmap4=(hmap4&0x1f0)|(((hmap4&0xf)*2+y)&0xf); +#endif +} + +// Execute the ZPAQL code with input byte or -1 for EOF. +// Use JIT code at rcode if available, or else create it. 
+void ZPAQL::run(U32 input) { +#ifdef NOJIT + run0(input); +#else + if (!rcode) { + int n=assemble(); + allocx(rcode, rcode_size, n); + if (!rcode || n<10 || rcode_size<10 || n!=assemble()) + error("run JIT failed"); + } + a=input; + if (!((int(*)())(&rcode[0]))()) + libzpaq::error("Bad ZPAQL opcode"); +#endif +} + +} // end namespace libzpaq diff --git a/libzpaq.h b/libzpaq.h index e7879b4..be67318 100644 --- a/libzpaq.h +++ b/libzpaq.h @@ -1,441 +1,441 @@ -/* libzpaq.h - LIBZPAQ Version 5.00. - - Copyright (C) 2011, Dell Inc. Written by Matt Mahoney. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so without restriction. - This Software is provided "as is" without warranty. - -LIBZPAQ is a C++ library for compression and decompression of data -conforming to the ZPAQ level 2 standard. See http://mattmahoney.net/zpaq/ - -By default, LIBZPAQ uses JIT (just in time) acceleration. This only -works on x86-32 and x86-64 processors that support the SSE2 instruction -set. To disable JIT, compile with -DNOJIT. To enable run time checks, -compile with -DDEBUG. Both options will decrease speed. - -The decompression code, when compiled with -DDEBUG and -DNOJIT, -comprises the reference decoder for the ZPAQ level 2 standard. 
-*/ - -#ifndef LIBZPAQ_H -#define LIBZPAQ_H - -#ifndef DEBUG -#define NDEBUG 1 -#endif -#include -#include -#include - -namespace libzpaq { - -// 1, 2, 4, 8 byte unsigned integers -typedef uint8_t U8; -typedef uint16_t U16; -typedef uint32_t U32; -typedef uint64_t U64; - -// Standard library prototypes redirected to libzpaq.cpp -void* calloc(size_t, size_t); -void free(void*); - -// Callback for error handling -extern void error(const char* msg); - -// Virtual base classes for input and output -// get() and put() must be overridden to read or write 1 byte. -// read() and write() may be overridden to read or write n bytes more -// efficiently than calling get() or put() n times. -class Reader { -public: - virtual int get() = 0; // should return 0..255, or -1 at EOF - virtual int read(char* buf, int n); // read to buf[n], return no. read - virtual ~Reader() {} -}; - -class Writer { -public: - virtual void put(int c) = 0; // should output low 8 bits of c - virtual void write(const char* buf, int n); // write buf[n] - virtual ~Writer() {} -}; - -// Read 16 bit little-endian number -int toU16(const char* p); - -// An Array of T is cleared and aligned on a 64 byte address -// with no constructors called. No copy or assignment. 
-// Array a(n, ex=0); - creates n< -class Array { - T *data; // user location of [0] on a 64 byte boundary - size_t n; // user size - int offset; // distance back in bytes to start of actual allocation - void operator=(const Array&); // no assignment - Array(const Array&); // no copy -public: - Array(size_t sz=0, int ex=0): data(0), n(0), offset(0) { - resize(sz, ex);} // [0..sz-1] = 0 - void resize(size_t sz, int ex=0); // change size, erase content to zeros - ~Array() {resize(0);} // free memory - size_t size() const {return n;} // get size - int isize() const {return int(n);} // get size as an int - T& operator[](size_t i) {assert(n>0 && i0 && (n&(n-1))==0); return data[i&(n-1)];} -}; - -// Change size to sz< -void Array::resize(size_t sz, int ex) { - assert(size_t(-1)>0); // unsigned type? - while (ex>0) { - if (sz>sz*2) error("Array too big"); - sz*=2, --ex; - } - if (n>0) { - assert(offset>0 && offset<=64); - assert((char*)data-offset); - free((char*)data-offset); - } - n=0; - if (sz==0) return; - n=sz; - const size_t nb=128+n*sizeof(T); // test for overflow - if (nb<=128 || (nb-128)/sizeof(T)!=n) error("Array too big"); - data=(T*)calloc(nb, 1); - if (!data) error("Out of memory"); - offset=64-(((char*)data-(char*)0)&63); - assert(offset>0 && offset<=64); - data=(T*)((char*)data+offset); -} - -//////////////////////////// SHA1 //////////////////////////// - -// For computing SHA-1 checksums -class SHA1 { -public: - void put(int c) { // hash 1 byte - U32& r=w[len0>>5&15]; - r=(r<<8)|(c&255); - if (!(len0+=8)) ++len1; - if ((len0&511)==0) process(); - } - double size() const {return len0/8+len1*536870912.0;} // size in bytes - uint64_t usize() const {return len0/8+(U64(len1)<<29);} // size in bytes - const char* result(); // get hash and reset - SHA1() {init();} -private: - void init(); // reset, but don't clear hbuf - U32 len0, len1; // length in bits (low, high) - U32 h[5]; // hash state - U32 w[80]; // input buffer - char hbuf[20]; // result - void 
process(); // hash 1 block -}; - -//////////////////////////// ZPAQL /////////////////////////// - -// Symbolic constants, instruction size, and names -typedef enum {NONE,CONS,CM,ICM,MATCH,AVG,MIX2,MIX,ISSE,SSE} CompType; -extern const int compsize[256]; - -// A ZPAQL machine COMP+HCOMP or PCOMP. -class ZPAQL { -public: - ZPAQL(); - ~ZPAQL(); - void clear(); // Free memory, erase program, reset machine state - void inith(); // Initialize as HCOMP to run - void initp(); // Initialize as PCOMP to run - double memory(); // Return memory requirement in bytes - void run(U32 input); // Execute with input - int read(Reader* in2); // Read header - bool write(Writer* out2, bool pp); // If pp write PCOMP else HCOMP header - int step(U32 input, int mode); // Trace execution (defined externally) - - Writer* output; // Destination for OUT instruction, or 0 to suppress - SHA1* sha1; // Points to checksum computer - U32 H(int i) {return h(i);} // get element of h - - void flush(); // write outbuf[0..bufptr-1] to output and sha1 - void outc(int c) { // output byte c (0..255) or -1 at EOS - if (c<0 || (outbuf[bufptr]=c, ++bufptr==outbuf.isize())) flush(); - } - - // ZPAQ1 block header - Array header; // hsize[2] hh hm ph pm n COMP (guard) HCOMP (guard) - int cend; // COMP in header[7...cend-1] - int hbegin, hend; // HCOMP/PCOMP in header[hbegin...hend-1] - -private: - // Machine state for executing HCOMP - Array m; // memory array M for HCOMP - Array h; // hash array H for HCOMP - Array r; // 256 element register array - Array outbuf; // output buffer - int bufptr; // number of bytes in outbuf - U32 a, b, c, d; // machine registers - int f; // condition flag - int pc; // program counter - int rcode_size; // length of rcode - U8* rcode; // JIT code for run() - - // Support code - int assemble(); // put JIT code in rcode - void init(int hbits, int mbits); // initialize H and M sizes - int execute(); // execute 1 instruction, return 0 after HALT, else 1 - void run0(U32 input); // 
default run() when select==0 - void div(U32 x) {if (x) a/=x; else a=0;} - void mod(U32 x) {if (x) a%=x; else a=0;} - void swap(U32& x) {a^=x; x^=a; a^=x;} - void swap(U8& x) {a^=x; x^=a; a^=x;} - void err(); // exit with run time error -}; - -///////////////////////// Component ////////////////////////// - -// A Component is a context model, indirect context model, match model, -// fixed weight mixer, adaptive 2 input mixer without or with current -// partial byte as context, adaptive m input mixer (without or with), -// or SSE (without or with). - -struct Component { - size_t limit; // max count for cm - size_t cxt; // saved context - size_t a, b, c; // multi-purpose variables - Array cm; // cm[cxt] -> p in bits 31..10, n in 9..0; MATCH index - Array ht; // ICM/ISSE hash table[0..size1][0..15] and MATCH buf - Array a16; // MIX weights - void init(); // initialize to all 0 - Component() {init();} -}; - -////////////////////////// StateTable //////////////////////// - -// Next state table generator -class StateTable { - enum {N=64}; // sizes of b, t - int num_states(int n0, int n1); // compute t[n0][n1][1] - void discount(int& n0); // set new value of n0 after 1 or n1 after 0 - void next_state(int& n0, int& n1, int y); // new (n0,n1) after bit y -public: - U8 ns[1024]; // state*4 -> next state if 0, if 1, n0, n1 - int next(int state, int y) { // next state for bit y - assert(state>=0 && state<256); - assert(y>=0 && y<4); - return ns[state*4+y]; - } - int cminit(int state) { // initial probability of 1 * 2^23 - assert(state>=0 && state<256); - return ((ns[state*4+3]*2+1)<<22)/(ns[state*4+2]+ns[state*4+3]+1); - } - StateTable(); -}; - -///////////////////////// Predictor ////////////////////////// - -// A predictor guesses the next bit -class Predictor { -public: - Predictor(ZPAQL&); - ~Predictor(); - void init(); // build model - int predict(); // probability that next bit is a 1 (0..4095) - void update(int y); // train on bit y (0..1) - int stat(int); // Defined 
externally - bool isModeled() { // n>0 components? - assert(z.header.isize()>6); - return z.header[6]!=0; - } -private: - - // Predictor state - int c8; // last 0...7 bits. - int hmap4; // c8 split into nibbles - int p[256]; // predictions - U32 h[256]; // unrolled copy of z.h - ZPAQL& z; // VM to compute context hashes, includes H, n - Component comp[256]; // the model, includes P - - // Modeling support functions - int predict0(); // default - void update0(int y); // default - int dt2k[256]; // division table for match: dt2k[i] = 2^12/i - int dt[1024]; // division table for cm: dt[i] = 2^16/(i+1.5) - U16 squasht[4096]; // squash() lookup table - short stretcht[32768];// stretch() lookup table - StateTable st; // next, cminit functions - U8* pcode; // JIT code for predict() and update() - int pcode_size; // length of pcode - - // reduce prediction error in cr.cm - void train(Component& cr, int y) { - assert(y==0 || y==1); - U32& pn=cr.cm(cr.cxt); - U32 count=pn&0x3ff; - int error=y*32767-(cr.cm(cr.cxt)>>17); - pn+=(error*dt[count]&-1024)+(count floor(32768/(1+exp(-x/64))) - int squash(int x) { - assert(x>=-2048 && x<=2047); - return squasht[x+2048]; - } - - // x -> round(64*log((x+0.5)/(32767.5-x))), approx inverse of squash - int stretch(int x) { - assert(x>=0 && x<=32767); - return stretcht[x]; - } - - // bound x to a 12 bit signed int - int clamp2k(int x) { - if (x<-2048) return -2048; - else if (x>2047) return 2047; - else return x; - } - - // bound x to a 20 bit signed int - int clamp512k(int x) { - if (x<-(1<<19)) return -(1<<19); - else if (x>=(1<<19)) return (1<<19)-1; - else return x; - } - - // Get cxt in ht, creating a new row if needed - size_t find(Array& ht, int sizebits, U32 cxt); - - // Put JIT code in pcode - int assemble_p(); -}; - -//////////////////////////// Decoder ///////////////////////// - -// Decoder decompresses using an arithmetic code -class Decoder { -public: - Reader* in; // destination - Decoder(ZPAQL& z); - int decompress(); // 
return a byte or EOF - int skip(); // skip to the end of the segment, return next byte - void init(); // initialize at start of block - int stat(int x) {return pr.stat(x);} -private: - U32 low, high; // range - U32 curr; // last 4 bytes of archive - Predictor pr; // to get p - enum {BUFSIZE=1<<16}; - Array buf; // input buffer of size BUFSIZE bytes - // of unmodeled data. buf[low..high-1] is input with curr - // remaining in sub-block. - int decode(int p); // return decoded bit (0..1) with prob. p (0..65535) - void loadbuf(); // read unmodeled data into buf to EOS -}; - -/////////////////////////// PostProcessor //////////////////// - -class PostProcessor { - int state; // input parse state: 0=INIT, 1=PASS, 2..4=loading, 5=POST - int hsize; // header size - int ph, pm; // sizes of H and M in z -public: - ZPAQL z; // holds PCOMP - PostProcessor(): state(0), hsize(0), ph(0), pm(0) {} - void init(int h, int m); // ph, pm sizes of H and M - int write(int c); // Input a byte, return state - int getState() const {return state;} - void setOutput(Writer* out) {z.output=out;} - void setSHA1(SHA1* sha1ptr) {z.sha1=sha1ptr;} -}; - -//////////////////////// Decompresser //////////////////////// - -// For decompression and listing archive contents -class Decompresser { -public: - Decompresser(): z(), dec(z), pp(), state(BLOCK), decode_state(FIRSTSEG) {} - void setInput(Reader* in) {dec.in=in;} - bool findBlock(double* memptr = 0); - void hcomp(Writer* out2) {z.write(out2, false);} - bool findFilename(Writer* = 0); - void readComment(Writer* = 0); - void setOutput(Writer* out) {pp.setOutput(out);} - void setSHA1(SHA1* sha1ptr) {pp.setSHA1(sha1ptr);} - bool decompress(int n = -1); // n bytes, -1=all, return true until done - bool pcomp(Writer* out2) {return pp.z.write(out2, true);} - void readSegmentEnd(char* sha1string = 0); - int stat(int x) {return dec.stat(x);} -private: - ZPAQL z; - Decoder dec; - PostProcessor pp; - enum {BLOCK, FILENAME, COMMENT, DATA, SEGEND} state; // 
expected next - enum {FIRSTSEG, SEG, SKIP} decode_state; // which segment in block? -}; - -/////////////////////////// decompress() ///////////////////// - -void decompress(Reader* in, Writer* out); - -////////////////////////////////////////////////////////////// -////////////////////////////////////////////////////////////// - -// Code following this point is not a part of the ZPAQ level 2 standard. - -//////////////////////////// Encoder ///////////////////////// - -// Encoder compresses using an arithmetic code -class Encoder { -public: - Encoder(ZPAQL& z, int size=0): - out(0), low(1), high(0xFFFFFFFF), pr(z) {} - void init(); - void compress(int c); // c is 0..255 or EOF - int stat(int x) {return pr.stat(x);} - Writer* out; // destination -private: - U32 low, high; // range - Predictor pr; // to get p - Array buf; // unmodeled input - void encode(int y, int p); // encode bit y (0..1) with prob. p (0..65535) -}; - -//////////////////////// Compressor ////////////////////////// - -class Compressor { -public: - Compressor(): enc(z), in(0), state(INIT) {} - void setOutput(Writer* out) {enc.out=out;} - void writeTag(); - void startBlock(int level); // level=1,2,3 - void startBlock(const char* hcomp); - void startSegment(const char* filename = 0, const char* comment = 0); - void setInput(Reader* i) {in=i;} - void postProcess(const char* pcomp = 0, int len = 0); - bool compress(int n = -1); // n bytes, -1=all, return true until done - void endSegment(const char* sha1string = 0); - void endBlock(); - int stat(int x) {return enc.stat(x);} -private: - ZPAQL z; - Encoder enc; - Reader* in; - enum {INIT, BLOCK1, SEG1, BLOCK2, SEG2} state; -}; - -/////////////////////////// compress() /////////////////////// - -void compress(Reader* in, Writer* out, int level); - -} // namespace libzpaq - -#endif // LIBZPAQ_H +/* libzpaq.h - LIBZPAQ Version 5.00. + + Copyright (C) 2011, Dell Inc. Written by Matt Mahoney. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so without restriction. + This Software is provided "as is" without warranty. + +LIBZPAQ is a C++ library for compression and decompression of data +conforming to the ZPAQ level 2 standard. See http://mattmahoney.net/zpaq/ + +By default, LIBZPAQ uses JIT (just in time) acceleration. This only +works on x86-32 and x86-64 processors that support the SSE2 instruction +set. To disable JIT, compile with -DNOJIT. To enable run time checks, +compile with -DDEBUG. Both options will decrease speed. + +The decompression code, when compiled with -DDEBUG and -DNOJIT, +comprises the reference decoder for the ZPAQ level 2 standard. +*/ + +#ifndef LIBZPAQ_H +#define LIBZPAQ_H + +#ifndef DEBUG +#define NDEBUG 1 +#endif +#include +#include +#include + +namespace libzpaq { + +// 1, 2, 4, 8 byte unsigned integers +typedef uint8_t U8; +typedef uint16_t U16; +typedef uint32_t U32; +typedef uint64_t U64; + +// Standard library prototypes redirected to libzpaq.cpp +void* calloc(size_t, size_t); +void free(void*); + +// Callback for error handling +extern void error(const char* msg); + +// Virtual base classes for input and output +// get() and put() must be overridden to read or write 1 byte. +// read() and write() may be overridden to read or write n bytes more +// efficiently than calling get() or put() n times. +class Reader { +public: + virtual int get() = 0; // should return 0..255, or -1 at EOF + virtual int read(char* buf, int n); // read to buf[n], return no. 
read + virtual ~Reader() {} +}; + +class Writer { +public: + virtual void put(int c) = 0; // should output low 8 bits of c + virtual void write(const char* buf, int n); // write buf[n] + virtual ~Writer() {} +}; + +// Read 16 bit little-endian number +int toU16(const char* p); + +// An Array of T is cleared and aligned on a 64 byte address +// with no constructors called. No copy or assignment. +// Array a(n, ex=0); - creates n< +class Array { + T *data; // user location of [0] on a 64 byte boundary + size_t n; // user size + int offset; // distance back in bytes to start of actual allocation + void operator=(const Array&); // no assignment + Array(const Array&); // no copy +public: + Array(size_t sz=0, int ex=0): data(0), n(0), offset(0) { + resize(sz, ex);} // [0..sz-1] = 0 + void resize(size_t sz, int ex=0); // change size, erase content to zeros + ~Array() {resize(0);} // free memory + size_t size() const {return n;} // get size + int isize() const {return int(n);} // get size as an int + T& operator[](size_t i) {assert(n>0 && i0 && (n&(n-1))==0); return data[i&(n-1)];} +}; + +// Change size to sz< +void Array::resize(size_t sz, int ex) { + assert(size_t(-1)>0); // unsigned type? 
+ while (ex>0) { + if (sz>sz*2) error("Array too big"); + sz*=2, --ex; + } + if (n>0) { + assert(offset>0 && offset<=64); + assert((char*)data-offset); + free((char*)data-offset); + } + n=0; + if (sz==0) return; + n=sz; + const size_t nb=128+n*sizeof(T); // test for overflow + if (nb<=128 || (nb-128)/sizeof(T)!=n) error("Array too big"); + data=(T*)calloc(nb, 1); + if (!data) error("Out of memory"); + offset=64-(((char*)data-(char*)0)&63); + assert(offset>0 && offset<=64); + data=(T*)((char*)data+offset); +} + +//////////////////////////// SHA1 //////////////////////////// + +// For computing SHA-1 checksums +class SHA1 { +public: + void put(int c) { // hash 1 byte + U32& r=w[len0>>5&15]; + r=(r<<8)|(c&255); + if (!(len0+=8)) ++len1; + if ((len0&511)==0) process(); + } + double size() const {return len0/8+len1*536870912.0;} // size in bytes + uint64_t usize() const {return len0/8+(U64(len1)<<29);} // size in bytes + const char* result(); // get hash and reset + SHA1() {init();} +private: + void init(); // reset, but don't clear hbuf + U32 len0, len1; // length in bits (low, high) + U32 h[5]; // hash state + U32 w[80]; // input buffer + char hbuf[20]; // result + void process(); // hash 1 block +}; + +//////////////////////////// ZPAQL /////////////////////////// + +// Symbolic constants, instruction size, and names +typedef enum {NONE,CONS,CM,ICM,MATCH,AVG,MIX2,MIX,ISSE,SSE} CompType; +extern const int compsize[256]; + +// A ZPAQL machine COMP+HCOMP or PCOMP. 
+class ZPAQL { +public: + ZPAQL(); + ~ZPAQL(); + void clear(); // Free memory, erase program, reset machine state + void inith(); // Initialize as HCOMP to run + void initp(); // Initialize as PCOMP to run + double memory(); // Return memory requirement in bytes + void run(U32 input); // Execute with input + int read(Reader* in2); // Read header + bool write(Writer* out2, bool pp); // If pp write PCOMP else HCOMP header + int step(U32 input, int mode); // Trace execution (defined externally) + + Writer* output; // Destination for OUT instruction, or 0 to suppress + SHA1* sha1; // Points to checksum computer + U32 H(int i) {return h(i);} // get element of h + + void flush(); // write outbuf[0..bufptr-1] to output and sha1 + void outc(int c) { // output byte c (0..255) or -1 at EOS + if (c<0 || (outbuf[bufptr]=c, ++bufptr==outbuf.isize())) flush(); + } + + // ZPAQ1 block header + Array header; // hsize[2] hh hm ph pm n COMP (guard) HCOMP (guard) + int cend; // COMP in header[7...cend-1] + int hbegin, hend; // HCOMP/PCOMP in header[hbegin...hend-1] + +private: + // Machine state for executing HCOMP + Array m; // memory array M for HCOMP + Array h; // hash array H for HCOMP + Array r; // 256 element register array + Array outbuf; // output buffer + int bufptr; // number of bytes in outbuf + U32 a, b, c, d; // machine registers + int f; // condition flag + int pc; // program counter + int rcode_size; // length of rcode + U8* rcode; // JIT code for run() + + // Support code + int assemble(); // put JIT code in rcode + void init(int hbits, int mbits); // initialize H and M sizes + int execute(); // execute 1 instruction, return 0 after HALT, else 1 + void run0(U32 input); // default run() when select==0 + void div(U32 x) {if (x) a/=x; else a=0;} + void mod(U32 x) {if (x) a%=x; else a=0;} + void swap(U32& x) {a^=x; x^=a; a^=x;} + void swap(U8& x) {a^=x; x^=a; a^=x;} + void err(); // exit with run time error +}; + +///////////////////////// Component 
////////////////////////// + +// A Component is a context model, indirect context model, match model, +// fixed weight mixer, adaptive 2 input mixer without or with current +// partial byte as context, adaptive m input mixer (without or with), +// or SSE (without or with). + +struct Component { + size_t limit; // max count for cm + size_t cxt; // saved context + size_t a, b, c; // multi-purpose variables + Array cm; // cm[cxt] -> p in bits 31..10, n in 9..0; MATCH index + Array ht; // ICM/ISSE hash table[0..size1][0..15] and MATCH buf + Array a16; // MIX weights + void init(); // initialize to all 0 + Component() {init();} +}; + +////////////////////////// StateTable //////////////////////// + +// Next state table generator +class StateTable { + enum {N=64}; // sizes of b, t + int num_states(int n0, int n1); // compute t[n0][n1][1] + void discount(int& n0); // set new value of n0 after 1 or n1 after 0 + void next_state(int& n0, int& n1, int y); // new (n0,n1) after bit y +public: + U8 ns[1024]; // state*4 -> next state if 0, if 1, n0, n1 + int next(int state, int y) { // next state for bit y + assert(state>=0 && state<256); + assert(y>=0 && y<4); + return ns[state*4+y]; + } + int cminit(int state) { // initial probability of 1 * 2^23 + assert(state>=0 && state<256); + return ((ns[state*4+3]*2+1)<<22)/(ns[state*4+2]+ns[state*4+3]+1); + } + StateTable(); +}; + +///////////////////////// Predictor ////////////////////////// + +// A predictor guesses the next bit +class Predictor { +public: + Predictor(ZPAQL&); + ~Predictor(); + void init(); // build model + int predict(); // probability that next bit is a 1 (0..4095) + void update(int y); // train on bit y (0..1) + int stat(int); // Defined externally + bool isModeled() { // n>0 components? + assert(z.header.isize()>6); + return z.header[6]!=0; + } +private: + + // Predictor state + int c8; // last 0...7 bits. 
+ int hmap4; // c8 split into nibbles + int p[256]; // predictions + U32 h[256]; // unrolled copy of z.h + ZPAQL& z; // VM to compute context hashes, includes H, n + Component comp[256]; // the model, includes P + + // Modeling support functions + int predict0(); // default + void update0(int y); // default + int dt2k[256]; // division table for match: dt2k[i] = 2^12/i + int dt[1024]; // division table for cm: dt[i] = 2^16/(i+1.5) + U16 squasht[4096]; // squash() lookup table + short stretcht[32768];// stretch() lookup table + StateTable st; // next, cminit functions + U8* pcode; // JIT code for predict() and update() + int pcode_size; // length of pcode + + // reduce prediction error in cr.cm + void train(Component& cr, int y) { + assert(y==0 || y==1); + U32& pn=cr.cm(cr.cxt); + U32 count=pn&0x3ff; + int error=y*32767-(cr.cm(cr.cxt)>>17); + pn+=(error*dt[count]&-1024)+(count floor(32768/(1+exp(-x/64))) + int squash(int x) { + assert(x>=-2048 && x<=2047); + return squasht[x+2048]; + } + + // x -> round(64*log((x+0.5)/(32767.5-x))), approx inverse of squash + int stretch(int x) { + assert(x>=0 && x<=32767); + return stretcht[x]; + } + + // bound x to a 12 bit signed int + int clamp2k(int x) { + if (x<-2048) return -2048; + else if (x>2047) return 2047; + else return x; + } + + // bound x to a 20 bit signed int + int clamp512k(int x) { + if (x<-(1<<19)) return -(1<<19); + else if (x>=(1<<19)) return (1<<19)-1; + else return x; + } + + // Get cxt in ht, creating a new row if needed + size_t find(Array& ht, int sizebits, U32 cxt); + + // Put JIT code in pcode + int assemble_p(); +}; + +//////////////////////////// Decoder ///////////////////////// + +// Decoder decompresses using an arithmetic code +class Decoder { +public: + Reader* in; // destination + Decoder(ZPAQL& z); + int decompress(); // return a byte or EOF + int skip(); // skip to the end of the segment, return next byte + void init(); // initialize at start of block + int stat(int x) {return pr.stat(x);} 
+private: + U32 low, high; // range + U32 curr; // last 4 bytes of archive + Predictor pr; // to get p + enum {BUFSIZE=1<<16}; + Array buf; // input buffer of size BUFSIZE bytes + // of unmodeled data. buf[low..high-1] is input with curr + // remaining in sub-block. + int decode(int p); // return decoded bit (0..1) with prob. p (0..65535) + void loadbuf(); // read unmodeled data into buf to EOS +}; + +/////////////////////////// PostProcessor //////////////////// + +class PostProcessor { + int state; // input parse state: 0=INIT, 1=PASS, 2..4=loading, 5=POST + int hsize; // header size + int ph, pm; // sizes of H and M in z +public: + ZPAQL z; // holds PCOMP + PostProcessor(): state(0), hsize(0), ph(0), pm(0) {} + void init(int h, int m); // ph, pm sizes of H and M + int write(int c); // Input a byte, return state + int getState() const {return state;} + void setOutput(Writer* out) {z.output=out;} + void setSHA1(SHA1* sha1ptr) {z.sha1=sha1ptr;} +}; + +//////////////////////// Decompresser //////////////////////// + +// For decompression and listing archive contents +class Decompresser { +public: + Decompresser(): z(), dec(z), pp(), state(BLOCK), decode_state(FIRSTSEG) {} + void setInput(Reader* in) {dec.in=in;} + bool findBlock(double* memptr = 0); + void hcomp(Writer* out2) {z.write(out2, false);} + bool findFilename(Writer* = 0); + void readComment(Writer* = 0); + void setOutput(Writer* out) {pp.setOutput(out);} + void setSHA1(SHA1* sha1ptr) {pp.setSHA1(sha1ptr);} + bool decompress(int n = -1); // n bytes, -1=all, return true until done + bool pcomp(Writer* out2) {return pp.z.write(out2, true);} + void readSegmentEnd(char* sha1string = 0); + int stat(int x) {return dec.stat(x);} +private: + ZPAQL z; + Decoder dec; + PostProcessor pp; + enum {BLOCK, FILENAME, COMMENT, DATA, SEGEND} state; // expected next + enum {FIRSTSEG, SEG, SKIP} decode_state; // which segment in block? 
+}; + +/////////////////////////// decompress() ///////////////////// + +void decompress(Reader* in, Writer* out); + +////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////// + +// Code following this point is not a part of the ZPAQ level 2 standard. + +//////////////////////////// Encoder ///////////////////////// + +// Encoder compresses using an arithmetic code +class Encoder { +public: + Encoder(ZPAQL& z, int size=0): + out(0), low(1), high(0xFFFFFFFF), pr(z) {} + void init(); + void compress(int c); // c is 0..255 or EOF + int stat(int x) {return pr.stat(x);} + Writer* out; // destination +private: + U32 low, high; // range + Predictor pr; // to get p + Array buf; // unmodeled input + void encode(int y, int p); // encode bit y (0..1) with prob. p (0..65535) +}; + +//////////////////////// Compressor ////////////////////////// + +class Compressor { +public: + Compressor(): enc(z), in(0), state(INIT) {} + void setOutput(Writer* out) {enc.out=out;} + void writeTag(); + void startBlock(int level); // level=1,2,3 + void startBlock(const char* hcomp); + void startSegment(const char* filename = 0, const char* comment = 0); + void setInput(Reader* i) {in=i;} + void postProcess(const char* pcomp = 0, int len = 0); + bool compress(int n = -1); // n bytes, -1=all, return true until done + void endSegment(const char* sha1string = 0); + void endBlock(); + int stat(int x) {return enc.stat(x);} +private: + ZPAQL z; + Encoder enc; + Reader* in; + enum {INIT, BLOCK1, SEG1, BLOCK2, SEG2} state; +}; + +/////////////////////////// compress() /////////////////////// + +void compress(Reader* in, Writer* out, int level); + +} // namespace libzpaq + +#endif // LIBZPAQ_H diff --git a/readme.txt b/readme.txt index 42f2d87..2bdf98e 100644 --- a/readme.txt +++ b/readme.txt @@ -1,84 +1,84 @@ -fastqz15.cpp is the source code for the latest version -of the FASTQ compressor. It compresses the common Sanger -variant. 
FASTQ is output by DNA sequencing machines. - -fapack.cpp is a program to pack FASTA files into a format -suitable for input to fastqz as a reference genome for -better compression. -It packs 4 bases per byte and discards all but A,C,G,T. - -fapacks.cpp works the same except that it does not ignore -lowercase a,c,g,t. Lowercase is used in hg19 to indicate -repeats. Generally it produces a larger reference but -gives better compression. - -Other fastqz*.cpp are older versions. You don't need them. - -Usage: fastqz {c[Q]|d|e[Q]|f} input output [reference] - -Command c compresses input to output.fx?.zpaq (3 or 4 files) -Command d decompresses input.fx?.zpaq to output -Command e encodes input to output.fx? -Command f decodes input.fx? to output - -Commands c and d are slow, require 1.5 GB memory, use 3 or -4 cores, but get very good compression. Commands e and f are much -faster, use little memory, and only one thread, but compression -ratio is not as good. - -Commands cQ or eQ quantize the quality scores for lossy but -better compression. The default is c1 or e1, which is lossless. -Quality scores in the range 33..73 are rounded down to 35 plus -a multiple of Q. - -You can supply a reference genome to improve compression. -If you use this, the same reference is needed to decompress. -It also increases the memory requirement to 1.2 GB for the -e command and 0.5 GB for the f command. c and d still need -1.5 GB. - -You can prepare the reference genome from FASTA files like: - - fapacks hg19s *.fa - -to produce the file hg19s. Then compress: - - fastqz c in.fastq arc hg19s - -To decompress: - - fastqz d arc out.fastq hg19s - -There are 4 compressed files: - - arc.fxh.zpaq - compressed headers - arc.fxb.zpaq - compressed base calls - arc.fxq.zpaq - compressed quality scores - arc.fxa.zpaq - compressed alignments if a reference is used. - -Commands e and f work the same way except the compressed -files do not have a .zpaq extension. 
If no reference is -used, then no .fxa or .fxa.zpaq file is produced or expected. - -fastqz only works on the Sanger FASTQ variant. It assumes -that quality scores are Phred+33 (range ASCII 33 to 73). -Base calls must be A,C,G,T,N only. N must have a quality -score of 0, and all others 1 or higher. Maximum line length -is 4095. Lines must be terminated by linefeeds only (no -carriage returns). If a reference is used, it must be -smaller than 1 GB packed (4 billion bases). - -To compile fastqz you will need the latest version of -libzpaq from https://sourceforge.net/projects/zpaq/ -or http://mattmahoney.net/zpaq/ -These programs will work in either Windows or Linux. -In Windows, you will also need Pthreads-Win32 from -http://sourceware.org/pthreads-win32/ to compile or run. -To compile (no Makefile, sorry): - - g++ -O3 -msse2 -s -lpthread fastqz.cpp libzpaq.cpp -o fastqz - g++ -O3 -s fapack.cpp -o fapack - -fastqz* and fapack* are written by Matt Mahoney, Dell Inc. -All are BSD-2 licensed. But note that libzpaq -is public domain and Pthreads-Win32 is LGPL. +fastqz15.cpp is the source code for the latest version +of the FASTQ compressor. It compresses the common Sanger +variant. FASTQ is output by DNA sequencing machines. + +fapack.cpp is a program to pack FASTA files into a format +suitable for input to fastqz as a reference genome for +better compression. +It packs 4 bases per byte and discards all but A,C,G,T. + +fapacks.cpp works the same except that it does not ignore +lowercase a,c,g,t. Lowercase is used in hg19 to indicate +repeats. Generally it produces a larger reference but +gives better compression. + +Other fastqz*.cpp are older versions. You don't need them. + +Usage: fastqz {c[Q]|d|e[Q]|f} input output [reference] + +Command c compresses input to output.fx?.zpaq (3 or 4 files) +Command d decompresses input.fx?.zpaq to output +Command e encodes input to output.fx? +Command f decodes input.fx? 
to output + +Commands c and d are slow, require 1.5 GB memory, use 3 or +4 cores, but get very good compression. Commands e and f are much +faster, use little memory, and only one thread, but compression +ratio is not as good. + +Commands cQ or eQ quantize the quality scores for lossy but +better compression. The default is c1 or e1, which is lossless. +Quality scores in the range 33..73 are rounded down to 35 plus +a multiple of Q. + +You can supply a reference genome to improve compression. +If you use this, the same reference is needed to decompress. +It also increases the memory requirement to 1.2 GB for the +e command and 0.5 GB for the f command. c and d still need +1.5 GB. + +You can prepare the reference genome from FASTA files like: + + fapacks hg19s *.fa + +to produce the file hg19s. Then compress: + + fastqz c in.fastq arc hg19s + +To decompress: + + fastqz d arc out.fastq hg19s + +There are 4 compressed files: + + arc.fxh.zpaq - compressed headers + arc.fxb.zpaq - compressed base calls + arc.fxq.zpaq - compressed quality scores + arc.fxa.zpaq - compressed alignments if a reference is used. + +Commands e and f work the same way except the compressed +files do not have a .zpaq extension. If no reference is +used, then no .fxa or .fxa.zpaq file is produced or expected. + +fastqz only works on the Sanger FASTQ variant. It assumes +that quality scores are Phred+33 (range ASCII 33 to 73). +Base calls must be A,C,G,T,N only. N must have a quality +score of 0, and all others 1 or higher. Maximum line length +is 4095. Lines must be terminated by linefeeds only (no +carriage returns). If a reference is used, it must be +smaller than 1 GB packed (4 billion bases). + +To compile fastqz you will need the latest version of +libzpaq from https://sourceforge.net/projects/zpaq/ +or http://mattmahoney.net/zpaq/ +These programs will work in either Windows or Linux. 
+In Windows, you will also need Pthreads-Win32 from +http://sourceware.org/pthreads-win32/ to compile or run. +To compile, run make, or build by hand: + + g++ -O3 -msse2 -s -lpthread fastqz15.cpp libzpaq.cpp -o fastqz + g++ -O3 -s fapack.cpp -o fapack + +fastqz* and fapack* are written by Matt Mahoney, Dell Inc. +All are BSD-2 licensed. But note that libzpaq +is public domain and Pthreads-Win32 is LGPL.