diff --git a/.gitignore b/.gitignore
index 8e5cd9d..1de2a68 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,3 @@
 fastqz
+fapack
+fapacks
diff --git a/Makefile b/Makefile
index cc5d45c..d1cf0a3 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,7 @@
-default: fastqz
+default: fastqz fapack fapacks
 
 fastqz: fastqz15.cpp libzpaq.3.pod libzpaq.cpp libzpaq.h
 	g++ -O3 -msse2 -s -lpthread fastqz15.cpp libzpaq.cpp -o $@
+
+clean:
+	- rm -f fastqz fapack fapacks
diff --git a/fapack.cpp b/fapack.cpp
new file mode 100644
index 0000000..487e03c
--- /dev/null
+++ b/fapack.cpp
@@ -0,0 +1,71 @@
+/* fapack.cpp - pack FASTA 4 bases per byte
+
+  Copyright (C) 2012, Matt Mahoney, Dell Inc.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+  Redistributions of source code must retain the above copyright notice,
+  this list of conditions and the following disclaimer.
+
+  Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
+This program produces packed DNA sequences from FASTA files.
+The output may be used as a reference genome for the program fastqz.
+
+To use: fapack output *.fa
+
+where *.fa is a list of FASTA input files. For each file,
+input lines starting with ">" are ignored. For all other lines,
+the letters A,C,G,T are packed MSB first with A=00,C=01,G=10,T=11.
+All other characters are ignored. The last partial byte is discarded.
+
+To compile: g++ -O3 fapack.cpp -o fapack
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+
+int main(int argc, char** argv) {  // pack A,C,G,T from argv[2..] into argv[1]
+  if (argc<3) printf("To pack FASTA files: fapack output *.fa\n"), exit(1);
+  FILE *out=fopen(argv[1], "wb");
+  if (!out) perror(argv[1]), exit(1);  // nothing can be packed without output
+  int b=1, c;  // b: bases shifted in below a leading 1 marker; c: input byte
+  for (int i=2; i<argc; ++i) {
+    printf("%s\n", argv[i]);
+    FILE *in=fopen(argv[i], "rb");
+    if (!in) {perror(argv[i]); continue;}  // say why, then skip this input
+    bool dna=true;  // false from a '>' until the next linefeed
+    while ((c=getc(in))!=EOF) {
+      if (c=='>') dna=false;
+      else if (c==10) dna=true;
+      if (dna) {
+        if (c=='A') b=b*4;
+        if (c=='C') b=b*4+1;
+        if (c=='G') b=b*4+2;
+        if (c=='T') b=b*4+3;
+        if (b>=256) putc(b&255, out), b=1;  // marker reached bit 8: 4 bases
+      }
+    }
+    fclose(in);
+  }
+  if (fclose(out)) perror(argv[1]), exit(1);  // flush failed: output is bad
+  return 0;
+}
+
+
diff --git a/fapacks.cpp b/fapacks.cpp
new file mode 100644
index 0000000..1a1e09a
--- /dev/null
+++ b/fapacks.cpp
@@ -0,0 +1,65 @@
+/* fapacks.cpp - pack FASTA 4 bases per byte
+   like fapack, but also packs lowercase a,c,g,t as A,C,G,T
+
+  Copyright (C) 2012, Matt Mahoney, Dell Inc.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+  Redistributions of source code must retain the above copyright notice,
+  this list of conditions and the following disclaimer.
+
+  Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
+This program produces packed DNA sequences from FASTA files.
+The output may be used as a reference genome for the program fastqz.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+
+int main(int argc, char** argv) {  // pack ACGT/acgt from argv[2..] into argv[1]
+  if (argc<3) printf("To pack FASTA files: fapacks output *.fa\n"), exit(1);
+  FILE *out=fopen(argv[1], "wb");
+  if (!out) perror(argv[1]), exit(1);  // nothing can be packed without output
+  int b=1, c;  // b: bases shifted in below a leading 1 marker; c: input byte
+  for (int i=2; i<argc; ++i) {
+    printf("%s\n", argv[i]);
+    FILE *in=fopen(argv[i], "rb");
+    if (!in) {perror(argv[i]); continue;}  // say why, then skip this input
+    bool dna=true;  // false from a '>' until the next linefeed
+    while ((c=getc(in))!=EOF) {
+      if (c=='>') dna=false;
+      else if (c==10) dna=true;
+      if (islower(c)) c=toupper(c);  // so a,c,g,t pack the same as A,C,G,T
+      if (dna) {
+        if (c=='A') b=b*4;
+        if (c=='C') b=b*4+1;
+        if (c=='G') b=b*4+2;
+        if (c=='T') b=b*4+3;
+        if (b>=256) putc(b&255, out), b=1;  // marker reached bit 8: 4 bases
+      }
+    }
+    fclose(in);
+  }
+  if (fclose(out)) perror(argv[1]), exit(1);  // flush failed: output is bad
+  return 0;
+}
+
+
diff --git a/fastqz15.cpp b/fastqz15.cpp
index 2261556..4808f6e 100644
--- a/fastqz15.cpp
+++ b/fastqz15.cpp
@@ -1,960 +1,960 @@
-/* fastqz v1.5 - Sanger FASTQ compressor
-
-  Copyright (C) 2012, Matt Mahoney, Dell Inc.
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions are met:
-
-  Redistributions of source code must retain the above copyright notice,
-  this list of conditions and the following disclaimer.
-
-  Redistributions in binary form must reproduce the above copyright notice,
-  this list of conditions and the following disclaimer in the documentation
-  and/or other materials provided with the distribution.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-  POSSIBILITY OF SUCH DAMAGE.
-
-TO COMPILE
-
-g++ -O3 -msse2 -s -lpthread fastqz.cpp libzpaq.cpp -o fastqz
-
-You need libzpaq.cpp and libzpaq.h from either
-https://sourceforge.net/projects/zpaq/ or
-http://mattmahoney.net/zpaq/
-libzpaq is public domain.
-
-Also, to use in Windows you need to install Pthreads-Win32 from
-http://sourceware.org/pthreads-win32/
-In particular you need pthread.h to compile and pthreadGC2.dll
-in your PATH to run. Pthreads-Win32 is licensed under LGPL.
-
-libzpaq uses Just-In-Time (JIT) optimization of ZPAQL code on
-x86 32 or 64 bit processors. To run on other processors, compile
-with -DNOJIT to disable this feature. It will still work but run slower.
-
-
-USAGE
-
-fx is a compressor for Sanger FASTQ files. It has two compression modes,
-fast and slow.
-
-Usage: fastqz command input output [reference]
-Commands:
-  c - compress input to output.fx?.zpaq (3 files, ? = {h,b,q})
-  d - decompress input.fx?.zpaq to output
-  e - encode input to output.fx? without zpaq compression (faster)
-  f - decode input.fx? to output
-  cQ, eQ - quantize quality values to 35 plus a multiple of Q, rounding
-           down. Default is c1 or e1.
-
-Commands c and e compress. c compresses smaller but e compresses faster.
-The corresponding decompression commands are d and f respectively.
-You need 1.5 GB memory to compress with c or decompress with d.
-They also both produce temporary files taking the same space as the
-output of command e. The e and f commands don't use significant
-memory and don't produce temporary files.
-
-Using a quantization like c2 or e4 is lossy but improves compression
-when exact quality values are not needed. Values are rounded down.
-
-Compression produces 3 files. Command e produces files named
-output.fxh, output.fxb, output.fxq. Command c produces files named
-output.fxh.zpaq, output.fxb.zpaq, output.fxq.zpaq
-When decompressing, omit the .fx? or .fx?.zpaq extension
-on the input file names. The extensions will be assumed.
-
-Input for compression is expected to be a Sanger FASTQ file.
-The file consists of "reads" from a DNA sequencing machine. Each
-read has the following format:
-
-  @header
-  ACGTN    (base calls, length n)
-  +
-  !..I#    (quality scores, length n, ASCII 34..73 for A,C,G,T, 33 for N)
-
-Maximum line length is 4095. Lines must be terminated by LF
-(ASCII 10) only (no CR). All base and quality lines must have
-the same length (read length = n) throughout the file.
-Files not in this format are rejected.
-
-If [reference] is present, then it is the file name of a reference
-genome. The same reference must be present for decompression.
-The reference genome consists of a sequence of bases packed 4
-per byte in MSB to LSB order with ACGT=0..3. You can use the
-program fapack to convert FASTA files into this format.
-The reference genome cannot be bigger than 1 GB (2^32 bases).
-You need 1.5 GB memory to encode and 1 GB to decode.
-A fourth file will be produced: output.fxa.zpaq or output.fxa
-containing compressed alignments.
-
-
-COMPRESSION FORMAT
-
-Command "c" and "e" both split the input into 3 or 4 parts and
-compress them as described below. Command "c" further compresses
-each of the 3 or 4 files using a different ZPAQ model.
-
-Headers (.fxh) are coded in the form (j,k,len,xxx...,0) which means
-go to column j-1 (first column is 0) in the previous header and
-add k-1 to the decimal number ending there. If k=1, then skip
-this step. Then copy the first len characters of the modified previous
-header, then output xxx, and finally a linefeed (ASCII 10). Save this
-output, minus the linefeed.
-
-The first 2 bytes of the .fxh file encodes the read length, n,
-MSB first (e.g. 0,100 if all base and quality lines have length 100).
-
-Base calls (.fxb) are encoded first by deleting all N's. These can be
-restored because their location is indicated by a quality score
-of 33. Then the remaining bases are encoded in self terminating
-base 4 with A=1, T=2, C=3, G=4 allowing 3 or 4 bases per byte.
-For example, "TACT" is coded as 2*64 + 1*16 + 3*4 + 2*1 = 158.
-
-If a reference is given, then a list of matches are stored in a .fxa
-file. The format is:
-
-  (m1+1+128*dir,m2+1,m3+1,m4+1,p3,p2,p1,p0)  to encode a match
-  (0)                                        to encode no match
-
-where p3..p0 is a 32 bit pointer (MSB first)
-into the reference genome after expanding to 1 base per element
-(with 0..3=ACGT) and padding the ends with 16384 zeros (or A).
-'dir' is 0 for a match in the forward direction or 1 for a
-match in the reverse direction starting at the same point but
-exchanging A with T and C with G. m1..m4 are the locations of
-the first 4 diferences between the base sequence (after deleting
-N's) and the reference, in the range 0..len-1 where len is the
-length of the sequence with N's deleted. Thus, the bytes are
-coded in the range 1..len, with bit 7 of the first byte set if
-the match is reversed. The mismatches are in ascending order.
-If there are less than 4 mismatches, then the remaining bytes
-are coded as len+1. Thus, only reads up to 126 can be fully
-matched.
-
-If a match is present, then only the corresponding mismatched bases,
-plus any bases after m4 (except N), are written to the .fxb file.
-If the first byte is 0, then there is no match and the entire
-base string is written (except N).
-
-Quality scores are decoded as follows: q=1..72 decode as q+32
-(33..104). q=73..136 decode as a pair (q-73)%8+64, (q-73)/8+64,
-both in the range 64..71. q=137..200 decode as the triple
-(q-137)%4+68, (q-137)/4%4+68, (q-137)/16+68 in the range 68..71.
-q=201..255 decodes as 71 repeated q-200 (1..55) times. q=0
-decodes by setting all remaining values to 35 and terminating
-the sequence. The coding takes advantage of the high frequency
-of q at or just below 71 that group early in the sequence, and
-of sequences that end in runs of 35.
-
-Command "c" further compresses the output.fx? files
-to output.fx?.zpaq files as defined by the ZPAQ level 2 standard
-which can be found at http://mattmahoney.net/zpaq/ or
-https://sourceforge.net/projects/zpaq/
-
-ZPAQ is a configurable compression format based on the PAQ context
-mixing algorithm with bit-wise prediction and arithmetic coding.
-Context models are described in ZPAQL byte code, which is saved to
-the compressed file and can be read by a generic ZPAQ decompressor.
-Thus, a FASTQ file compressed with "fastqz c" could be decompressed
-first with zpaq and then with "fastq f" as opposed to decompressing
-with "fastq d".
-
-ZPAQL byte code describes an array of components and code to compute
-contexts. Each component takes a context and possibly the predictions
-of earlier components and outputs a new probability that the next
-bit will be a 1. The output of the last component is used to arithmetic
-encode or decode the next bit. After encoding or decoding, the bit
-is used to update the models to reduce their prediction errors.
-
-Whole-byte contexts are computed on byte boundaries by code running on
-a ZPAQL virtual machine. This program is executed once after modeling
-each byte with that byte as input. The output is saved in an array
-of 32-bit values which is available as input to the array of components.
-These values are combined with the previously coded bits of the current
-byte to form a complete context.
-
-A ZPAQ model is described by a config file. In this program, the
-compiled byte code is fed to the model during compression, or read
-from the compressed file header during decompression. The source code
-for each model is given below, followed by an explanation of the code.
-The command "zpaq -mfx? l" will generate the byte code used in this
-program from the sources below named "fx?.cfg" (where ? is h,b,q,a).
-
-A config file has 3 sections:
-
-  COMP - describes the array of modeling components.
-  HCOMP - ZPAQL code to compute contexts.
-  POST/PCOMP - ZPAQL code for post-processing.
-
-Post-processing is not used, so each file ends with POST 0 END.
-Modeled bits are output directly.
-
-A ZPAQL virtual machine has 32-bit registers A,B,C,D, an array
-of bytes M, an array of 32 bit unsigned integers H, a condition flag F,
-and a 16 bit program counter. H is the context output to the model.
-A is the input byte and accumulator for arithmetic and logical operations.
-B and C are pointers into M. D points to H. *B, *C, *D refer
-to the elements pointed to, modulo the array sizes. The sizes are
-given by the first 2 parameters after COMP.
-
-
-HEADER MODELING
-
-(fxh.cfg model to compress headers)
-comp 3 8 0 0 5 (H has size 2^3, M has size 2^8)
-  0 cm 20 128  (direct 20-bit context model with max count 128*4)
-  1 cm 22 128
-  2 icm 18     (indirect context model with 2^(18+6) bit histories)
-  3 icm 19
-  4 mix 13 0 4 24 255 (13 bit context, mix 0..0+4-1, rate 24, mask 255)
-hcomp
-  *c=a c++ a== 0 if c=0 endif (save input in buffer M pointed to by C)
-  d=0 *d=0 b=c a=c hashd (context H[0] is a hash of column number)
-    a=*b hashd (combined with the byte above, saved in M)
-    b-- a=*b hashd (combined with the byte to the left (order 1))
-  a=*d d++ *d=a b-- a=*b hashd (context H[1] as above but order 2)
-  a=*d d++ *d=a b-- a=*b hashd (context H[2] as above but order 3)
-  a=*d d++ *d=a b-- a=*b hashd (context H[3] as above put order 4)
-  d++ a=c a<<= 8 *d=a (context H[5] for mixer is just the column number)
-  halt
-post 0 end (no post-processing)
-
-The headers are compressed using a mixture of 4 context models.
-The first two are direct (CM: context -> bit prediction)
-and 3 and 4 are indirect (ICM: context -> bit history -> prediction).
-The context for the first model is the column number, the byte
-above and the byte to the left. The next 3 add 1 to 3
-more bytes to the left as context, respectively. The four
-bit predictions are mixed by weighted averaging in the logistic
-domain (log p/(1-p)) and the weights adapted to reduce prediction
-errors. The mixer weight vector is selected by a context consisting
-of the column number and the previously coded bits of the
-current byte. The resulting bits are arithmetic coded.
-
-In the code above, *C=A saves the input byte in M. C++ advances
-to the next byte, which was saved from the previous line.
-"A== 0 IF C=0 ENDIF" tests if the input is 0, marking the end of a
-header line, and if so, resets the pointer C to the beginning of
-the buffer.
-
-The next 3 lines set the context for component 0, pointed to by D.
-HASHD computes the hash *D=(*D+A+512)*773.
-
-The next 3 lines set the contexts for components 1 through 3 by
-copying the previous context hash and combining it with the next
-byte back in the history buffer maintained in M and pointed to
-by *B.
-
-The last line uses the low 5 bits of the column number (in C)
-as part of the 13 bit context to the mixer. The low 8 bits are
-left as zeros so that during modeling the bits from the partial
-byte can be added.
-
-
-BASE CALL MODELING
-
-(fxb.cfg model to compress base calls)
-comp 3 3 0 0 7 (hh hm ph pm n)
-  0 cm 9 255 (2 KB)
-  1 cm 18 255 (1 MB)
-  2 cm 25 255 (128 MB)
-  3 icm 22 (256 MB)
-  4 isse 23 3 (512 MB)
-  5 match 26 28 (256 MB hash table, 256 MB buffer)
-  6 mix 8 0 6 12 255 (order 0 mix of 0..0+6-1, rate 12, mask 255)
-hcomp
-  c++ *c=a b=c a=0 (save in rotating buffer M)
-  d= 1 hash *d=a
-  b-- d++ hash *d=a
-  b-- d++ hash *d=a
-  b-- d++ hash *d=a
-  b-- d++ hash *d=a
-  halt
-post
-  0
-end
-
-Base calls are modeled using an order 0..5 mix. Orders 0, 1, and 2
-are direct, slow adapting (rate = error/count up to 255*4) context models.
-Order 3 is indirect. Order 4 is indirect and chained to the order 3
-output, i.e. order 3 prediction is mixed with a constant 1 in the
-logistic domain by a pair of adaptive weights selected by the
-bit history indexed by the order 4 context hash. The order 5
-context is a match model which looks up the previous occurrence
-of the context hash and predicts whatever bit followed. The
-mixer context is bytewise order 0.
-
-The HASH instruction computes A=(A+*B+512)*773.
-
-
-QUALITY MODELING
-
-(fxq.cfg model used to compress quality scores)
-comp 2 12 0 0 4
-  0 cm 22 128
-  1 cm 22 128
-  2 cm 22 128
-  3 mix 14 0 3 12 255
-hcomp
-  c++ *c=a (store input in M pointed to by C)
-  a== 0 if c=0 endif (reset M at newline)
-  d=0 b=c hash *d=a a=c a>>= 3 hashd
-  d++ a=0 b-- hash *d=a
-    b-- a=*b a>>= 5 hashd
-  d++ *d=0 b-- a=*b hashd
-    b-- a=*b a>>= 4 hashd
-  d++ a=*c a>>= 3 *d=0 hashd
-    a=c a> 3 if a>>= 5 a+= 4 endif hashd
-  halt
-post 0 end
-
-Quality scores use a mix of 3 direct context models. The first
-uses the previous byte and the column number excluding the
-low 3 bits as the context hash. The second model uses the second byte
-and the high 3 bits of the third byte back as the context hash.
-The third model uses the 4'th byte and the high 4 bits of
-the 5'th byte back as context hash. The mixer uses a 14 bit
-context consisting of the current partial byte and the column
-number with the high 5 bits dropped for column numbers above 3.
-
-
-ALIGNMENT MODELING
-
-(fxa.cfg to model reference matches)
-comp 0 0 0 0 1
-  0 cm 20 255
-hcomp
-  c++ b=a
-  a== 0 if a=c a== 1 if c=0 endif endif
-  a=c a> 7 if c=0 endif
-  a< 6 if
-    a=b a>>= 2 a<<= 5 a+=c
-  else
-    a=c
-  endif
-  a<<= 9 *d=a
-  halt
-post 0 end
-
-Reference matches (if present) use a stationary order 0 model with
-the parse state (0..7) as context. States 0..3 expect a mismatch
-byte and 4..7 expect a pointer byte. States 0..5 also use
-the previous byte as context with the low 2 bits discarded.
-
-The ZPAQ archives are each saved as a single segment in a single block
-without a locator tag, filename, comment, or checksum. No post-processing
-is used. The ZPAQL code used for each of the 4 files is as follows:
-
-Each of the 3 or 4 ZPAQ models is compressed or decompressed in parallel
-in separate threads from or to temporary files, which are deleted
-when done.
-
-c: input -> output.fx? -> output.fx?.zpaq  (delete output.fx?)
-d: input.fx?.zpaq -> input.fx? -> output   (delete input.fx?)
-e: input -> output.fx?
-f: input.fx? -> output
-
-*/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <ctype.h>
-#include <string>
-#include <vector>
-#include <time.h>
-#include <pthread.h>
-#include "libzpaq.h"
-using std::string;
-
-const int N=4096; // max FASTQ line length
-
-// print error message and exit (may be called by libzpaq)
-void libzpaq::error(const char* msg) {
-  fprintf(stderr, "fastqz error: %s\n", msg);
-  exit(1);
-}
-using libzpaq::error;
-
-// I/O for libzpaq
-struct File: public libzpaq::Reader, public libzpaq::Writer {
-  FILE* f;
-  int get() {return getc(f);}
-  void put(int c) {putc(c, f);}
-  int read(char* buf, int n) {return fread(buf, 1, n, f);}
-  void write(const char* buf, int n) {fwrite(buf, 1, n, f);}
-};
-
-// Thread argument
-struct Job {
-  int id;  // model 0..2
-  string input, output;  // filenames
-};
-
-// Thread to compress job.input to job.output using model job.id
-void* compress(void *arg) {
-  Job& job=*(Job*)arg;
-  printf("compressing %s\n", job.input.c_str());
-
-  // Models for fxh, fxb, fxq files
-  // Byte codes generated by "zpaq -mfx? l" using fx?.cfg above
-  static char hcomp[4][76]={
-  {64,0,3,8,0,0,5,2,20,-128,2,22,-128,3,18,3,
-  19,7,13,0,4,24,-1,0,104,17,-33,0,47,1,20,28,
-  52,74,66,60,68,60,10,68,60,70,25,112,10,68,60,70,
-  25,112,10,68,60,70,25,112,10,68,60,25,66,-49,8,112,
-  56,0},
-  {55,0,3,3,0,0,7,2,9,-1,2,18,-1,2,25,-1,  // fxb
-  3,22,8,23,3,4,26,28,7,8,0,6,12,-1,0,17,
-  104,74,4,95,1,59,112,10,25,59,112,10,25,59,112,10,
-  25,59,112,10,25,59,112,56,0},
-  {74,0,2,12,0,0,4,2,22,-128,2,22,-128,2,22,-128,  // fxq
-  7,14,0,3,12,-1,0,17,104,-33,0,47,1,20,28,74,
-  59,112,66,-41,3,60,25,4,10,59,112,10,68,-41,5,60,
-  25,52,10,68,60,10,68,-41,4,60,25,69,-41,3,52,60,
-  66,-17,3,47,4,-41,5,-121,4,60,56,0},
-  {45,0,0,0,0,0,1,2,20,-1,0,17,72,-33,0,47,
-  6,66,-33,1,47,1,20,66,-17,7,47,1,20,-25,6,47,
-  8,65,-41,2,-49,5,-126,63,1,66,-49,9,112,56,0}};
-
-  // Compress input to output, then delete input
-  libzpaq::Compressor co;
-  File in, out;
-  in.f=fopen(job.input.c_str(), "rb");
-  if (!in.f) perror(job.input.c_str()), exit(1);
-  out.f=fopen(job.output.c_str(), "wb");
-  if (!out.f) perror(job.output.c_str()), exit(1);
-  co.setInput(&in);
-  co.setOutput(&out);
-  co.startBlock(hcomp[job.id]);
-  co.startSegment();
-  co.postProcess();
-  co.compress();
-  co.endSegment();
-  co.endBlock();
-  fclose(out.f);
-  fclose(in.f);
-  remove(job.input.c_str());
-  printf("compressed %s\n", job.output.c_str());
-  return 0;
-}
-
-// Thread to decompress job.input to job.output
-void* decompress(void *arg) {
-  Job& job=*(Job*)arg;
-  printf("decompressing %s\n", job.input.c_str());
-  File in, out;
-  in.f=fopen(job.input.c_str(), "rb");
-  if (!in.f) perror(job.input.c_str()), exit(1);
-  out.f=fopen(job.output.c_str(), "wb");
-  if (!out.f) perror(job.output.c_str()), exit(1);
-  libzpaq::decompress(&in, &out);
-  fclose(out.f);
-  fclose(in.f);
-  printf("decompressed %s\n", job.output.c_str());
-  return 0;
-}
-
-// hash 64 bits to 32 bits
-unsigned int hash(unsigned long long hl) {
-  return (hl*12345679123456789ull)>>32;
-}
-
-// Return the positions of the first 4 mismatches between bbuf[0..len-1]
-// and ref[h/4...] (incrementing by dir=(+1,-1)), packed LSB first.
-// If there are less than 4 mismatches, use len.
-int rmatch(libzpaq::Array<unsigned char>& ref, unsigned int h,
-          unsigned char* bbuf, int len, int dir) {
-  int i, j, score=0;
-  if (len>126) len=126;
-  for (i=j=0; i<len && j<4; h+=dir, ++i)
-    if (((ref[h/4]>>(6-h%4*2))&3)!=(dir>0?bbuf[i]:3-bbuf[i]))
-      score+=i<<(j++*8);
-  for (; j<4; ++j)
-    score+=len<<(j*8);
-  return score;
-}
-
-// read reference file into ref
-void readref(libzpaq::Array<unsigned char>& ref, const char* filename) {
-  FILE* in=fopen(filename, "rb");
-  if (!in) perror(filename), exit(1);
-  fseek(in, 0, SEEK_END);
-  int rlen=ftell(in);
-  if (rlen<0 || rlen>=(1<<30))
-    error("reference must be smaller than 1 GB");
-  rewind(in);
-  ref.resize(rlen+N*2);  // pad extra N bytes at each end
-  if (int(fread(&ref[N], 1, rlen, in))!=rlen) error("ref read error");
-  printf("%s: length=%d bytes\n", filename, rlen);
-  fclose(in);
-}
-
-int main(int argc, char** argv) {
-
-  // Start timer
-  clock_t start=clock();
-
-  // Check command line: {c|d|e|f} input output
-  if (argc<4) {
-    printf("fastqz v1.5 FASTQ compressor\n"
-    "(C) 2012, Dell Inc. Written by Matt Mahoney. Compiled %s.\n"
-    "Licensed under BSD 2 clause license\n"
-    "\n"
-    "Usage: fastqz command input output [reference]\n"
-    "Commands\n"
-    "  c[Q] - compress input to output.fx?.zpaq (? = {h,b,q})\n"
-    "  d    - decompress input.fx?.zpaq to output\n"
-    "  e[Q] - encode (fast) input to output.fx? (? = {h,b,q})\n"
-    "  f    - fast decode input.fx? to output\n"
-    "Use Q to quantize quality values to steps of size Q for better but\n"
-    "lossy compression. Default is c1 or e1 (lossless).\n"
-    "Use fapacks to create a reference genome from FASTA files\n",
-    __DATE__);
-    exit(1);
-  }
-
-  const char cmd=argv[1][0]; // c,d,e,f
-  int quality=atoi(argv[1]+1);
-  if (quality<1) quality=1;
-  const int isref=argc>4;    // 1 if a reference file supplied
-  const int BUCKET=8;        // index bucket size
-  libzpaq::Array<unsigned char> ref;  // copy of packed reference genome
-  libzpaq::Array<unsigned int> index; // hash table index to ref
-
-  // Encode
-  if (cmd=='e' || cmd=='c') {
-
-    // Read reference file
-    if (isref) {
-      readref(ref, argv[4]);  // read into ref
-
-      // Create an index. Divide ref into groups of 32 bases (8 bytes)
-      // and compute a 32 bit hash, h. Use the low 27 bits as a hash index
-      // and high 5 bits as a hash checksum. Store the checksum and a
-      // 27 bit pointer into ref packed into index[h].
-      if (cmd=='c' || cmd=='e') {
-        index.resize((1<<27)+BUCKET);
-        int collisions=0;
-        for (int i=N; i<=int(ref.size())-N-8; i+=8) {
-          unsigned long long hl=0;
-          for (int j=0; j<8; ++j) hl=hl<<8|ref[i+j];
-          unsigned int h=hash(hl);
-          unsigned int hi=h&0x7ffffff;
-          int j;
-          for (j=0; j<BUCKET && index[hi+j]; ++j);
-          if (j==BUCKET) ++collisions;
-          else index[hi+j]=(h&0xf8000000)+(i>>3);
-        }
-        printf("indexed %s: %d of %d collisions\n",
-          argv[4], collisions, ref.size()/8);
-      }
-    }
-
-    // read input files
-    FILE *in, *out[4];  // fastq, fxh, fxb, fxq, fxa
-    int n, i, j, k, len, c;
-    in=fopen(argv[2], "rb");
-    if (!in) perror(argv[2]), exit(1);
-    for (i=0; i<3+isref; ++i) {
-      string fn=string(argv[3])+".fx"+"hbqa"[i];
-      out[i]=fopen(fn.c_str(), "wb");
-      if (!out[i]) perror(fn.c_str()), exit(1);
-    }
-
-    // Save read length, n
-    for (i=j=n=0; (c=getc(in))!=EOF && !n; ++i) {
-      if (c==10 && j) n=i-j-1;
-      else if (c==10) j=i;
-    }
-    if (n<1 || n>=N) error("read length must be 1..4095");
-    printf("encoding %s -> %s read length %d\n",
-      argv[2], argv[3], n);
-    rewind(in);
-    putc(n>>8, out[0]);
-    putc(n&255, out[0]);
-
-    // encode
-    int base=0;  // packed bases in base 4
-    unsigned char hbuf[N]={0};  // previous header
-    unsigned char bbuf[N]={0};  // one sequence
-    int matches[N+3]={0};
-    int match_sum=0, base_sum=0;
-    int line=0;
-    bool ismatch=false;
-    for (line=0; 1; ++line) {
-
-      // encode header as (j+1,k+1,len+1,xxx,0) meaning
-      // add k to hbuf[..j], then len bytes match, followed by xxx,10.
-      for (i=j=k=len=0; (c=getc(in))!=EOF && c!=10; ++i) {
-        if (i>=N) error("Line too long\n");
-        if (c!=hbuf[i] && isdigit(c) && isdigit(hbuf[i]) && j<254
-            && i<254 && i==len && (!j || j==i)) {
-          int d=k*10+c-hbuf[i];
-          if (d>0 && d<254) hbuf[i]=c, k=d, j=i+1;
-        }
-        if (c==hbuf[i] && i==len && len<254) ++len;
-        hbuf[i]=c;
-      }
-      if (c==EOF) {
-        if (i) error("unexpected EOF in header");
-        break;  // done
-      }
-      putc(j+(j==0), out[0]);
-      putc(k+1, out[0]);
-      putc(len+1, out[0]);
-      for (j=len; j<i; ++j) putc(hbuf[j], out[0]);
-      putc(0, out[0]);
-
-      // read base calls into bbuf coded as ACGT=0..3
-      for (i=0, len=0; (c=getc(in))!=EOF && c!=10; ++i) {
-        if (c==EOF) error("unexpected EOF");
-        if (c!='N') {
-          j=(c=='A')+(c=='C')*2+(c=='G')*3+(c=='T')*4;
-          if (!j) error("expected base A,C,G,T,N");
-          bbuf[len++]=j-1;
-        }
-      }
-      if (i!=n) error("wrong number of base calls");
-
-      // Search for matches in the reference genome
-      int bm=0;  // best match length
-      if (isref) {
-        unsigned long long hl=0;
-        unsigned int bptr=0;  // best match index
-        int bdir=1;  // best match direction, -1 if reversed
-
-        // search in the forward direction
-        for (j=0; j<len; ++j) {
-          hl=hl*4+bbuf[j];
-          if (j>=31) {
-            unsigned int h=hash(hl);
-            unsigned int hi=h&0x7ffffff;
-            for (k=0; k<BUCKET && index[hi+k]; ++k) {
-              int m=0;
-              if ((index[hi+k]^h)<0x8000000) {
-                unsigned int ptr=(index[hi+k]&0x7ffffff)*32+31-j;
-                ++matches[n+1];
-                m=rmatch(ref, ptr, bbuf, len, 1);
-                if (m>bm) bm=m, bptr=ptr;
-              }
-            }
-          }
-        }
-
-        // search for complementary matches
-        hl=0;
-        for (j=len-1; j>=0; --j) {
-          hl=hl*4+3-bbuf[j];
-          if (j<=len-32) {
-            unsigned int h=hash(hl);
-            unsigned int hi=h&0x7ffffff;
-            for (k=0; k<BUCKET && index[hi+k]; ++k) {
-              int m=0;
-              if ((index[hi+k]^h)<0x8000000) {
-                unsigned int ptr=(index[hi+k]&0x7ffffff)*32+31+j;
-                ++matches[n+2];
-                m=rmatch(ref, ptr, bbuf, len, -1);
-                if (m>bm) bm=m, bptr=ptr, bdir=-1;
-              }
-            }
-          }
-        }
-        ++matches[bm>>24&127];
-        match_sum+=(bm>>24)&127;
-        match_sum-=(bm^bm<<8)>0xffffff;
-        match_sum-=(bm<<8^bm<<16)>0xffffff;
-        match_sum-=(bm<<16^bm<<24)>0xffffff;
-        base_sum+=len;
-
-        // write mismatch locations and pointer to reference genome
-        ismatch=(bm>>23)>=len;
-        if (!ismatch)
-          putc(0, out[3]);
-        else {
-          putc(1+bm+128*(bdir<0), out[3]);
-          putc(1+(bm>>8), out[3]);
-          putc(1+(bm>>16), out[3]);
-          putc(1+(bm>>24), out[3]);
-          putc(bptr>>24, out[3]);
-          putc(bptr>>16, out[3]);
-          putc(bptr>>8, out[3]);
-          putc(bptr, out[3]);
-        }
-      }
-
-      // write the bases
-      for (i=0; i<len; ++i) {
-        if (!ismatch || i>=(bm>>24&255) || i==(bm>>16&255) || i==(bm>>8&255)
-            || i==(bm&255)) {
-          j="\x01\x03\x04\x02"[bbuf[i]];  // ACGT -> ATCG
-          if (base*4+j>255) putc(base, out[1]), base=0;
-          base=base*4+j;
-        }
-      }
-
-      // verify empty second header "+\n"
-      if (getc(in)!='+') error("expected +");
-      if (getc(in)!=10) error("expected newline after +");
-
-      // encode quality scores
-      // c=33..104 -> c-32
-      // j,c=64..71 -> 73+(j-64)+8*(c-64)
-      // k,j,c=68..71 -> 137+(k-68)+4*(j-68)+16*(c-68)
-      // 35...,10 -> 0
-      // 71... -> 200+len
-      len=0; // pending output bytes
-      j=k=0; // last 2 bytes
-      for (i=0; (c=getc(in))!=EOF; ++i, k=j, j=c) {
-        if (c!=10 && (c<33 || c>104))
-          error("expected quality score in 33..104");
-        if (quality>1 && c>35) c-=(c-35)%quality;
-        if (c==35 && (len==0 || j==35)) ++len;
-        else if (len==0 && c>=64 && c<=71) ++len;
-        else if (len==1 && c>=68 && c<=71 && j>=68 && j<=71) ++len;
-        else if (len>=2 && len<55 && k==71 && j==71 && c==71) ++len;
-        else if (c==10 && (len==0 || j==35)) break;
-        else {  // must write pending output
-          ++len;  // c is pending
-          while (len>1 && j==35)
-            putc(3, out[2]), --len;
-          if (len>3 && j==71 && k==71)
-            putc(199+len, out[2]), len=1;
-          if (len==3) {
-            if (c>=68 && c<=71)
-              putc(137+(k-68)+4*(j-68)+16*(c-68), out[2]), len=0;
-            else
-              putc(73+(k-64)+8*(j-64), out[2]), len=1;
-          }
-          if (len==2) {
-            if (c>=64 && c<=71) putc(73+(j-64)+8*(c-64), out[2]), len=0;
-            else putc(j-32, out[2]), len=1;
-          }
-          if (len==1) {
-            if (c==10) break;
-            if (c!=35 && (c<64 || c>71)) putc(c-32, out[2]), len=0;
-          }
-        }
-      }
-      putc(0, out[2]);
-      if (i!=n) error("wrong number of quality scores");
-    }
-    putc(base, out[1]);
-    for (i=2+isref; i>=0; --i) fclose(out[i]);
-    fclose(in);
-    index.resize(0);
-    ref.resize(0);
-
-    // print match statistics
-    if (base_sum>0) {
-      printf("matches[0..%d+2]=", n);
-      for (i=0; i<=n+2; ++i) {
-        printf("%d ", matches[i]);
-        if (i%10==0) printf("\n");
-      }
-      printf("\nMatched %d of %d bases (%1.2f%%)\n",
-        match_sum, base_sum, match_sum*100.0/base_sum);
-    }
-
-    // compress each temporary file to .zpaq in a separate thread
-    if (cmd=='c') {
-      pthread_t tid[4];
-      pthread_attr_t attr; // thread joinable attribute
-      pthread_attr_init(&attr);
-      pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
-      Job job[4];
-      for (i=0; i<3+isref; ++i) {
-        job[i].id=i;
-        job[i].input=string(argv[3])+".fx"+"hbqa"[i];
-        job[i].output=job[i].input+".zpaq";
-        pthread_create(&tid[i], &attr, compress, (void*)&job[i]);
-      }
-
-      // wait until all jobs are done
-      for (i=0; i<3+isref; ++i) {
-        void* status;
-        pthread_join(tid[i], &status);
-      }
-    }
-  }
-
-  // decode
-  else if (cmd=='d' || cmd=='f') {
-
-    // decompress .zpaq
-    Job job[4];
-    if (cmd=='d') {
-      pthread_t tid[4];
-      pthread_attr_t attr; // thread joinable attribute
-      pthread_attr_init(&attr);
-      pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
-      for (int i=0; i<3+isref; ++i) {
-        job[i].id=i;
-        job[i].output=string(argv[2])+".fx"+"hbqa"[i];
-        job[i].input=job[i].output+".zpaq";
-        pthread_create(&tid[i], &attr, decompress, (void*)&job[i]);
-      }
-
-      // wait until all threads are done
-      for (int i=0; i<3+isref; ++i) {
-        void* status;
-        pthread_join(tid[i], &status);
-      }
-    }
-
-    // read reference
-    if (isref) readref(ref, argv[4]);
-
-    // open  files
-    FILE *in[4], *out;  // fxh, fxb, fxq, fxa, fastq
-    int i, j, k, c, n;
-    for (i=0; i<3+isref; ++i) {
-      string fn=string(argv[2])+".fx"+"hbqa"[i];
-      in[i]=fopen(fn.c_str(), "rb");
-      if (!in[i]) perror(fn.c_str()), exit(1);
-    }
-    out=fopen(argv[3], "wb");
-    if (!out) perror(argv[3]), exit(1);
-
-    // get read length, n
-    n=getc(in[0]);
-    n=n*256+getc(in[0]);
-    printf("decoding %s -> %s read length %d\n",
-      argv[2], argv[3], n);
-    if (n<1 || n>=N) error("bad read length");
-
-    // decode
-    int base=0;
-    unsigned char hbuf[N]={0}, qbuf[N]={0};
-    while (1) {
-
-      // decode header
-      j=getc(in[0])-1;  // index of last digit of number to adjust
-      if (j==EOF-1) break;
-      k=getc(in[0])-1;  // amount to add
-      i=getc(in[0])-1;  // number of matched bytes after adjustment
-      if (j<0 || k<0 || i<0) error("bad header");
-      for (; i<N && (c=getc(in[0]))!=EOF && c; ++i) hbuf[i]=c;
-      for (; k && j>=0; --j, k/=10) {
-        int d=k%10;
-        hbuf[j]+=d, k-=d;
-        if (hbuf[j]>'9') hbuf[j]-=10, k+=10;
-      }
-      for (j=0; j<i; ++j) putc(hbuf[j], out);
-      putc(10, out);
-
-      // read quality scores and save in qbuf[0..n-1]
-      // 0 -> pad with 35 and end
-      // c=1..72 -> c+32
-      // c=73..136 -> (c-73)%8+64, (c-73)/8+64
-      // c=137..200 -> (c-137)%4+68, (c-137)/4%4+68, (c-137)%16+68
-      // c=201..255 -> 71 repeated c-200 times
-      for (i=0;;) {
-        c=getc(in[2]);
-        if (c==EOF) error("unexpected end of .fxq");
-        if (i>n) error("missing .fxq terminator");
-        if (c==0) { // end of line
-          for (; i<n; ++i) qbuf[i]=35;
-          break;
-        }
-        else if (c>=201 && i+c-200<=n)
-          while (c-->200) qbuf[i++]=71;
-        else if (c>=137 && c<=200 && i<n-2) {
-          c-=137;
-          qbuf[i++]=(c&3)+68;
-          qbuf[i++]=((c>>2)&3)+68;
-          qbuf[i++]=((c>>4)&3)+68;
-        }
-        else if (c>=73 && c<=136 && i<n-1) {
-          c-=73;
-          qbuf[i++]=(c&7)+64;
-          qbuf[i++]=((c>>3)&7)+64;
-        }
-        else if (c>=1 && c<=72 && i<n) {
-          qbuf[i++]=c+32;
-        }
-        else error (".fxq code overflow");
-      }
-      if (i!=n) error("incorrect .fxq read length");
-
-      // decode match to reference
-      unsigned int bptr=0;  // pointer to match in ref
-      int bdir=0;  // read direction
-      int miss1=0, miss2=0, miss3=0, miss4=0;  // mismatches, ascending order
-      if (isref) {
-        miss1=getc(in[3]);
-        if (miss1==EOF) error("unexpcted EOF in .fxa");
-        if (miss1) {
-          if (miss1>=128) miss1-=128, bdir=-1;
-          else bdir=1;
-          --miss1;
-          miss2=getc(in[3])-1;
-          miss3=getc(in[3])-1;
-          miss4=getc(in[3])-1;
-          bptr=getc(in[3]);
-          bptr=bptr*256+getc(in[3]);
-          bptr=bptr*256+getc(in[3]);
-          bptr=bptr*256+getc(in[3]);
-        }
-      }
-
-      // decode bases
-      for (i=k=0; i<n; ++i) {
-        if (qbuf[i]==33)
-          putc('N', out);
-        else if (bdir && k!=miss1 && k!=miss2 && k!=miss3 && k<miss4) {
-          if (bptr/4>=ref.size()) error(".fxa pointer out of bounds");
-          j=(ref[bptr/4]>>(6-bptr%4*2))&3;
-          bptr+=bdir;
-          if (bdir<0) j=3-j;
-          putc("ACGT"[j], out);
-          ++k;
-        }
-        else {
-          while (base==0) {
-            base=getc(in[1]);
-            if (base==EOF) error("unexpected end of .fxb");
-          }
-          if (base>84) j=(base-21)>>6, base-=j*64;
-          else if (base>20) j=(base-5)>>4, base-=j*16;
-          else if (base>4) j=(base-1)>>2, base-=j*4;
-          else j=base, base=0;
-          putc(" ATCG"[j], out);
-          ++k;
-          bptr+=bdir;
-        }
-      }
-      putc(10, out);
-
-      // write empty second header
-      putc('+', out);
-      putc(10, out);
-
-      // write quality scores
-      for (i=0; i<n; ++i) putc(qbuf[i], out);
-      putc(10, out);
-    }
-    fclose(out);
-    for (i=2+isref; i>=0; --i) fclose(in[i]);
-
-    // delete temporary files
-    if (cmd=='d')
-      for (int i=0; i<3+isref; ++i)
-        remove(job[i].output.c_str());
-
-    // show results
-    printf("decoded %s\n", argv[3]);
-  }
-  printf("%1.2f seconds\n", double(clock()-start)/CLOCKS_PER_SEC);
-  return 0;
-}
+/* fastqz v1.5 - Sanger FASTQ compressor
+
+  Copyright (C) 2012, Matt Mahoney, Dell Inc.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+  Redistributions of source code must retain the above copyright notice,
+  this list of conditions and the following disclaimer.
+
+  Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
+TO COMPILE
+
+g++ -O3 -msse2 -s -lpthread fastqz.cpp libzpaq.cpp -o fastqz
+
+You need libzpaq.cpp and libzpaq.h from either
+https://sourceforge.net/projects/zpaq/ or
+http://mattmahoney.net/zpaq/
+libzpaq is public domain.
+
+Also, to use in Windows you need to install Pthreads-Win32 from
+http://sourceware.org/pthreads-win32/
+In particular you need pthread.h to compile and pthreadGC2.dll
+in your PATH to run. Pthreads-Win32 is licensed under LGPL.
+
+libzpaq uses Just-In-Time (JIT) optimization of ZPAQL code on
+x86 32 or 64 bit processors. To run on other processors, compile
+with -DNOJIT to disable this feature. It will still work but run slower.
+
+
+USAGE
+
+fastqz is a compressor for Sanger FASTQ files. It has two compression modes,
+fast and slow.
+
+Usage: fastqz command input output [reference]
+Commands:
+  c - compress input to output.fx?.zpaq (3 files, ? = {h,b,q})
+  d - decompress input.fx?.zpaq to output
+  e - encode input to output.fx? without zpaq compression (faster)
+  f - decode input.fx? to output
+  cQ, eQ - quantize quality values to 35 plus a multiple of Q, rounding
+           down. Default is c1 or e1.
+
+Commands c and e compress. c compresses smaller but e compresses faster.
+The corresponding decompression commands are d and f respectively.
+You need 1.5 GB memory to compress with c or decompress with d.
+They also both produce temporary files taking the same space as the
+output of command e. The e and f commands don't use significant
+memory and don't produce temporary files.
+
+Using a quantization like c2 or e4 is lossy but improves compression
+when exact quality values are not needed. Values are rounded down.
+
+Compression produces 3 files. Command e produces files named
+output.fxh, output.fxb, output.fxq. Command c produces files named
+output.fxh.zpaq, output.fxb.zpaq, output.fxq.zpaq
+When decompressing, omit the .fx? or .fx?.zpaq extension
+on the input file names. The extensions will be assumed.
+
+Input for compression is expected to be a Sanger FASTQ file.
+The file consists of "reads" from a DNA sequencing machine. Each
+read has the following format:
+
+  @header
+  ACGTN    (base calls, length n)
+  +
+  !..I#    (quality scores, length n, ASCII 34..73 for A,C,G,T, 33 for N)
+
+Maximum line length is 4095. Lines must be terminated by LF
+(ASCII 10) only (no CR). All base and quality lines must have
+the same length (read length = n) throughout the file.
+Files not in this format are rejected.
+
+If [reference] is present, then it is the file name of a reference
+genome. The same reference must be present for decompression.
+The reference genome consists of a sequence of bases packed 4
+per byte in MSB to LSB order with ACGT=0..3. You can use the
+program fapack to convert FASTA files into this format.
+The reference genome cannot be bigger than 1 GB (2^32 bases).
+You need 1.5 GB memory to encode and 1 GB to decode.
+A fourth file will be produced: output.fxa.zpaq or output.fxa
+containing compressed alignments.
+
+
+COMPRESSION FORMAT
+
+Command "c" and "e" both split the input into 3 or 4 parts and
+compress them as described below. Command "c" further compresses
+each of the 3 or 4 files using a different ZPAQ model.
+
+Headers (.fxh) are coded in the form (j,k,len,xxx...,0) which means
+go to column j-1 (first column is 0) in the previous header and
+add k-1 to the decimal number ending there. If k=1, then skip
+this step. Then copy the first len characters of the modified previous
+header, then output xxx, and finally a linefeed (ASCII 10). Save this
+output, minus the linefeed.
+
+The first 2 bytes of the .fxh file encodes the read length, n,
+MSB first (e.g. 0,100 if all base and quality lines have length 100).
+
+Base calls (.fxb) are encoded first by deleting all N's. These can be
+restored because their location is indicated by a quality score
+of 33. Then the remaining bases are encoded in self terminating
+base 4 with A=1, T=2, C=3, G=4 allowing 3 or 4 bases per byte.
+For example, "TACT" is coded as 2*64 + 1*16 + 3*4 + 2*1 = 158.
+
+If a reference is given, then a list of matches are stored in a .fxa
+file. The format is:
+
+  (m1+1+128*dir,m2+1,m3+1,m4+1,p3,p2,p1,p0)  to encode a match
+  (0)                                        to encode no match
+
+where p3..p0 is a 32 bit pointer (MSB first)
+into the reference genome after expanding to 1 base per element
+(with 0..3=ACGT) and padding the ends with 16384 zeros (or A).
+'dir' is 0 for a match in the forward direction or 1 for a
+match in the reverse direction starting at the same point but
+exchanging A with T and C with G. m1..m4 are the locations of
+the first 4 differences between the base sequence (after deleting
+N's) and the reference, in the range 0..len-1 where len is the
+length of the sequence with N's deleted. Thus, the bytes are
+coded in the range 1..len, with bit 7 of the first byte set if
+the match is reversed. The mismatches are in ascending order.
+If there are fewer than 4 mismatches, then the remaining bytes
+are coded as len+1. Thus, only reads up to 126 can be fully
+matched.
+
+If a match is present, then only the corresponding mismatched bases,
+plus any bases after m4 (except N), are written to the .fxb file.
+If the first byte is 0, then there is no match and the entire
+base string is written (except N).
+
+Quality scores are decoded as follows: q=1..72 decode as q+32
+(33..104). q=73..136 decode as a pair (q-73)%8+64, (q-73)/8+64,
+both in the range 64..71. q=137..200 decode as the triple
+(q-137)%4+68, (q-137)/4%4+68, (q-137)/16+68 in the range 68..71.
+q=201..255 decodes as 71 repeated q-200 (1..55) times. q=0
+decodes by setting all remaining values to 35 and terminating
+the sequence. The coding takes advantage of the high frequency
+of q at or just below 71 that group early in the sequence, and
+of sequences that end in runs of 35.
+
+Command "c" further compresses the output.fx? files
+to output.fx?.zpaq files as defined by the ZPAQ level 2 standard
+which can be found at http://mattmahoney.net/zpaq/ or
+https://sourceforge.net/projects/zpaq/
+
+ZPAQ is a configurable compression format based on the PAQ context
+mixing algorithm with bit-wise prediction and arithmetic coding.
+Context models are described in ZPAQL byte code, which is saved to
+the compressed file and can be read by a generic ZPAQ decompressor.
+Thus, a FASTQ file compressed with "fastqz c" could be decompressed
+first with zpaq and then with "fastqz f" as opposed to decompressing
+with "fastqz d".
+
+ZPAQL byte code describes an array of components and code to compute
+contexts. Each component takes a context and possibly the predictions
+of earlier components and outputs a new probability that the next
+bit will be a 1. The output of the last component is used to arithmetic
+encode or decode the next bit. After encoding or decoding, the bit
+is used to update the models to reduce their prediction errors.
+
+Whole-byte contexts are computed on byte boundaries by code running on
+a ZPAQL virtual machine. This program is executed once after modeling
+each byte with that byte as input. The output is saved in an array
+of 32-bit values which is available as input to the array of components.
+These values are combined with the previously coded bits of the current
+byte to form a complete context.
+
+A ZPAQ model is described by a config file. In this program, the
+compiled byte code is fed to the model during compression, or read
+from the compressed file header during decompression. The source code
+for each model is given below, followed by an explanation of the code.
+The command "zpaq -mfx? l" will generate the byte code used in this
+program from the sources below named "fx?.cfg" (where ? is h,b,q,a).
+
+A config file has 3 sections:
+
+  COMP - describes the array of modeling components.
+  HCOMP - ZPAQL code to compute contexts.
+  POST/PCOMP - ZPAQL code for post-processing.
+
+Post-processing is not used, so each file ends with POST 0 END.
+Modeled bits are output directly.
+
+A ZPAQL virtual machine has 32-bit registers A,B,C,D, an array
+of bytes M, an array of 32 bit unsigned integers H, a condition flag F,
+and a 16 bit program counter. H is the context output to the model.
+A is the input byte and accumulator for arithmetic and logical operations.
+B and C are pointers into M. D points to H. *B, *C, *D refer
+to the elements pointed to, modulo the array sizes. The sizes are
+given by the first 2 parameters after COMP.
+
+
+HEADER MODELING
+
+(fxh.cfg model to compress headers)
+comp 3 8 0 0 5 (H has size 2^3, M has size 2^8)
+  0 cm 20 128  (direct 20-bit context model with max count 128*4)
+  1 cm 22 128
+  2 icm 18     (indirect context model with 2^(18+6) bit histories)
+  3 icm 19
+  4 mix 13 0 4 24 255 (13 bit context, mix 0..0+4-1, rate 24, mask 255)
+hcomp
+  *c=a c++ a== 0 if c=0 endif (save input in buffer M pointed to by C)
+  d=0 *d=0 b=c a=c hashd (context H[0] is a hash of column number)
+    a=*b hashd (combined with the byte above, saved in M)
+    b-- a=*b hashd (combined with the byte to the left (order 1))
+  a=*d d++ *d=a b-- a=*b hashd (context H[1] as above but order 2)
+  a=*d d++ *d=a b-- a=*b hashd (context H[2] as above but order 3)
+  a=*d d++ *d=a b-- a=*b hashd (context H[3] as above but order 4)
+  d++ a=c a<<= 8 *d=a (context H[4] for mixer is just the column number)
+  halt
+post 0 end (no post-processing)
+
+The headers are compressed using a mixture of 4 context models.
+The first two are direct (CM: context -> bit prediction)
+and 3 and 4 are indirect (ICM: context -> bit history -> prediction).
+The context for the first model is the column number, the byte
+above and the byte to the left. The next 3 add 1 to 3
+more bytes to the left as context, respectively. The four
+bit predictions are mixed by weighted averaging in the logistic
+domain (log p/(1-p)) and the weights adapted to reduce prediction
+errors. The mixer weight vector is selected by a context consisting
+of the column number and the previously coded bits of the
+current byte. The resulting bits are arithmetic coded.
+
+In the code above, *C=A saves the input byte in M. C++ advances
+to the next byte, which was saved from the previous line.
+"A== 0 IF C=0 ENDIF" tests if the input is 0, marking the end of a
+header line, and if so, resets the pointer C to the beginning of
+the buffer.
+
+The next 3 lines set the context for component 0, pointed to by D.
+HASHD computes the hash *D=(*D+A+512)*773.
+
+The next 3 lines set the contexts for components 1 through 3 by
+copying the previous context hash and combining it with the next
+byte back in the history buffer maintained in M and pointed to
+by *B.
+
+The last line uses the low 5 bits of the column number (in C)
+as part of the 13 bit context to the mixer. The low 8 bits are
+left as zeros so that during modeling the bits from the partial
+byte can be added.
+
+
+BASE CALL MODELING
+
+(fxb.cfg model to compress base calls)
+comp 3 3 0 0 7 (hh hm ph pm n)
+  0 cm 9 255 (2 KB)
+  1 cm 18 255 (1 MB)
+  2 cm 25 255 (128 MB)
+  3 icm 22 (256 MB)
+  4 isse 23 3 (512 MB)
+  5 match 26 28 (256 MB hash table, 256 MB buffer)
+  6 mix 8 0 6 12 255 (order 0 mix of 0..0+6-1, rate 12, mask 255)
+hcomp
+  c++ *c=a b=c a=0 (save in rotating buffer M)
+  d= 1 hash *d=a
+  b-- d++ hash *d=a
+  b-- d++ hash *d=a
+  b-- d++ hash *d=a
+  b-- d++ hash *d=a
+  halt
+post
+  0
+end
+
+Base calls are modeled using an order 0..5 mix. Orders 0, 1, and 2
+are direct, slow adapting (rate = error/count up to 255*4) context models.
+Order 3 is indirect. Order 4 is indirect and chained to the order 3
+output, i.e. order 3 prediction is mixed with a constant 1 in the
+logistic domain by a pair of adaptive weights selected by the
+bit history indexed by the order 4 context hash. The order 5
+context is a match model which looks up the previous occurrence
+of the context hash and predicts whatever bit followed. The
+mixer context is bytewise order 0.
+
+The HASH instruction computes A=(A+*B+512)*773.
+
+
+QUALITY MODELING
+
+(fxq.cfg model used to compress quality scores)
+comp 2 12 0 0 4
+  0 cm 22 128
+  1 cm 22 128
+  2 cm 22 128
+  3 mix 14 0 3 12 255
+hcomp
+  c++ *c=a (store input in M pointed to by C)
+  a== 0 if c=0 endif (reset M at newline)
+  d=0 b=c hash *d=a a=c a>>= 3 hashd
+  d++ a=0 b-- hash *d=a
+    b-- a=*b a>>= 5 hashd
+  d++ *d=0 b-- a=*b hashd
+    b-- a=*b a>>= 4 hashd
+  d++ a=*c a>>= 3 *d=0 hashd
+    a=c a> 3 if a>>= 5 a+= 4 endif hashd
+  halt
+post 0 end
+
+Quality scores use a mix of 3 direct context models. The first
+uses the previous byte and the column number excluding the
+low 3 bits as the context hash. The second model uses the second byte
+and the high 3 bits of the third byte back as the context hash.
+The third model uses the 4'th byte and the high 4 bits of
+the 5'th byte back as context hash. The mixer uses a 14 bit
+context consisting of the current partial byte and the column
+number with the high 5 bits dropped for column numbers above 3.
+
+
+ALIGNMENT MODELING
+
+(fxa.cfg to model reference matches)
+comp 0 0 0 0 1
+  0 cm 20 255
+hcomp
+  c++ b=a
+  a== 0 if a=c a== 1 if c=0 endif endif
+  a=c a> 7 if c=0 endif
+  a< 6 if
+    a=b a>>= 2 a<<= 5 a+=c
+  else
+    a=c
+  endif
+  a<<= 9 *d=a
+  halt
+post 0 end
+
+Reference matches (if present) use a stationary order 0 model with
+the parse state (0..7) as context. States 0..3 expect a mismatch
+byte and 4..7 expect a pointer byte. States 0..5 also use
+the previous byte as context with the low 2 bits discarded.
+
+The ZPAQ archives are each saved as a single segment in a single block
+without a locator tag, filename, comment, or checksum. No post-processing
+is used. The ZPAQL code used for each of the 4 files is given in the sections above.
+
+Each of the 3 or 4 ZPAQ models is compressed or decompressed in parallel
+in separate threads from or to temporary files, which are deleted
+when done.
+
+c: input -> output.fx? -> output.fx?.zpaq  (delete output.fx?)
+d: input.fx?.zpaq -> input.fx? -> output   (delete input.fx?)
+e: input -> output.fx?
+f: input.fx? -> output
+
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string>
+#include <vector>
+#include <time.h>
+#include <pthread.h>
+#include "libzpaq.h"
+using std::string;
+
+const int N=4096; // max FASTQ line length
+
+// Print "fastqz error: <msg>" to stderr and exit with status 1; does not return (may be called by libzpaq)
+void libzpaq::error(const char* msg) {
+  fprintf(stderr, "fastqz error: %s\n", msg);
+  exit(1);  // every reported error is fatal to the program
+}
+using libzpaq::error;
+
+// stdio adapter for libzpaq: wraps one FILE* as both a libzpaq::Reader and a libzpaq::Writer
+struct File: public libzpaq::Reader, public libzpaq::Writer {
+  FILE* f;  // underlying stream; assigned and closed by the user of File (nothing here opens or closes it)
+  int get() {return getc(f);}  // next byte from f, or EOF
+  void put(int c) {putc(c, f);}  // write one byte; the putc result is not checked
+  int read(char* buf, int n) {return fread(buf, 1, n, f);}  // read up to n bytes into buf; returns the count read
+  void write(const char* buf, int n) {fwrite(buf, 1, n, f);}  // write n bytes from buf; the fwrite result is not checked
+};
+
+// Thread argument passed (as void*) to compress() and decompress()
+struct Job {
+  int id;  // model 0..3 (fxh, fxb, fxq, fxa); selects the hcomp row in compress()
+  string input, output;  // filenames
+};
+
+// Thread to compress job.input to job.output using model job.id; removes job.input when done
+void* compress(void *arg) {
+  Job& job=*(Job*)arg;  // arg is interpreted as a pointer to a Job
+  printf("compressing %s\n", job.input.c_str());
+
+  // Models for fxh, fxb, fxq, fxa files, one row each in that order (row index = job.id)
+  // Byte codes generated by "zpaq -mfx? l" using fx?.cfg above
+  static char hcomp[4][76]={  // 76 = length of the longest row (fxq)
+  {64,0,3,8,0,0,5,2,20,-128,2,22,-128,3,18,3,  // fxh
+  19,7,13,0,4,24,-1,0,104,17,-33,0,47,1,20,28,
+  52,74,66,60,68,60,10,68,60,70,25,112,10,68,60,70,
+  25,112,10,68,60,70,25,112,10,68,60,25,66,-49,8,112,
+  56,0},
+  {55,0,3,3,0,0,7,2,9,-1,2,18,-1,2,25,-1,  // fxb
+  3,22,8,23,3,4,26,28,7,8,0,6,12,-1,0,17,
+  104,74,4,95,1,59,112,10,25,59,112,10,25,59,112,10,
+  25,59,112,10,25,59,112,56,0},
+  {74,0,2,12,0,0,4,2,22,-128,2,22,-128,2,22,-128,  // fxq
+  7,14,0,3,12,-1,0,17,104,-33,0,47,1,20,28,74,
+  59,112,66,-41,3,60,25,4,10,59,112,10,68,-41,5,60,
+  25,52,10,68,60,10,68,-41,4,60,25,69,-41,3,52,60,
+  66,-17,3,47,4,-41,5,-121,4,60,56,0},
+  {45,0,0,0,0,0,1,2,20,-1,0,17,72,-33,0,47,  // fxa
+  6,66,-33,1,47,1,20,66,-17,7,47,1,20,-25,6,47,
+  8,65,-41,2,-49,5,-126,63,1,66,-49,9,112,56,0}};
+
+  // Compress input to output as one block holding one segment, then delete input
+  libzpaq::Compressor co;
+  File in, out;
+  in.f=fopen(job.input.c_str(), "rb");
+  if (!in.f) perror(job.input.c_str()), exit(1);  // open failure ends the whole program, not just this thread
+  out.f=fopen(job.output.c_str(), "wb");
+  if (!out.f) perror(job.output.c_str()), exit(1);
+  co.setInput(&in);
+  co.setOutput(&out);
+  co.startBlock(hcomp[job.id]);  // block is started with the byte code of the selected model
+  co.startSegment();  // called with no arguments (no filename or comment supplied)
+  co.postProcess();  // called with no arguments (the models above end with "post 0 end")
+  co.compress();  // NOTE(review): presumably consumes all of the input in this single call - confirm in libzpaq.h
+  co.endSegment();  // called with no arguments (no checksum supplied)
+  co.endBlock();
+  fclose(out.f);
+  fclose(in.f);
+  remove(job.input.c_str());  // the uncompressed .fx? file is only a temporary
+  printf("compressed %s\n", job.output.c_str());
+  return 0;  // always a null result
+}
+
+// Thread to decompress job.input to job.output; job.id is not used and job.input is not removed
+void* decompress(void *arg) {
+  Job& job=*(Job*)arg;  // arg is interpreted as a pointer to a Job
+  printf("decompressing %s\n", job.input.c_str());
+  File in, out;
+  in.f=fopen(job.input.c_str(), "rb");
+  if (!in.f) perror(job.input.c_str()), exit(1);  // open failure ends the whole program, not just this thread
+  out.f=fopen(job.output.c_str(), "wb");
+  if (!out.f) perror(job.output.c_str()), exit(1);
+  libzpaq::decompress(&in, &out);  // no model is passed: per the notes above it is read from the compressed file header
+  fclose(out.f);
+  fclose(in.f);
+  printf("decompressed %s\n", job.output.c_str());
+  return 0;  // always a null result
+}
+
+// hash 64 bits to 32 bits: multiply by a fixed constant (wrapping at 64 bits) and keep the upper half
+unsigned int hash(unsigned long long hl) {
+  return (hl*12345679123456789ull)>>32;  // bits 32..63 of the product
+}
+
+// Return the positions of the first 4 mismatches between bbuf[0..len-1] (bases coded 0..3)
+// and ref read from base index h (byte h/4, 2 bits per base, first base in the high bits),
+// stepping h by dir (+1 or -1). Positions are packed one per byte, LSB first.
+int rmatch(libzpaq::Array<unsigned char>& ref, unsigned int h,
+          unsigned char* bbuf, int len, int dir) {
+  int i, j, score=0;  // i = position in bbuf, j = number of mismatches found so far
+  if (len>126) len=126;  // at most 126 bases are compared; the clamped len also fills unused slots below
+  for (i=j=0; i<len && j<4; h+=dir, ++i)  // stop at the 4th mismatch or at the end of the read
+    if (((ref[h/4]>>(6-h%4*2))&3)!=(dir>0?bbuf[i]:3-bbuf[i]))  // for dir<=0 compare against the complement 3-b
+      score+=i<<(j++*8);  // record position i in byte j
+  for (; j<4; ++j)
+    score+=len<<(j*8);  // fewer than 4 mismatches: the remaining bytes hold len
+  return score;
+}
+
+// read reference file into ref
+void readref(libzpaq::Array<unsigned char>& ref, const char* filename) {
+  FILE* in=fopen(filename, "rb");
+  if (!in) perror(filename), exit(1);
+  fseek(in, 0, SEEK_END);
+  int rlen=ftell(in);
+  if (rlen<0 || rlen>=(1<<30))
+    error("reference must be smaller than 1 GB");
+  rewind(in);
+  ref.resize(rlen+N*2);  // pad extra N bytes at each end
+  if (int(fread(&ref[N], 1, rlen, in))!=rlen) error("ref read error");
+  printf("%s: length=%d bytes\n", filename, rlen);
+  fclose(in);
+}
+
+int main(int argc, char** argv) {
+
+  // Start timer
+  clock_t start=clock();
+
+  // Check command line: {c|d|e|f} input output
+  if (argc<4) {
+    printf("fastqz v1.5 FASTQ compressor\n"
+    "(C) 2012, Dell Inc. Written by Matt Mahoney. Compiled %s.\n"
+    "Licensed under BSD 2 clause license\n"
+    "\n"
+    "Usage: fastqz command input output [reference]\n"
+    "Commands\n"
+    "  c[Q] - compress input to output.fx?.zpaq (? = {h,b,q})\n"
+    "  d    - decompress input.fx?.zpaq to output\n"
+    "  e[Q] - encode (fast) input to output.fx? (? = {h,b,q})\n"
+    "  f    - fast decode input.fx? to output\n"
+    "Use Q to quantize quality values to steps of size Q for better but\n"
+    "lossy compression. Default is c1 or e1 (lossless).\n"
+    "Use fapacks to create a reference genome from FASTA files\n",
+    __DATE__);
+    exit(1);
+  }
+
+  const char cmd=argv[1][0]; // c,d,e,f
+  int quality=atoi(argv[1]+1);
+  if (quality<1) quality=1;
+  const int isref=argc>4;    // 1 if a reference file supplied
+  const int BUCKET=8;        // index bucket size
+  libzpaq::Array<unsigned char> ref;  // copy of packed reference genome
+  libzpaq::Array<unsigned int> index; // hash table index to ref
+
+  // Encode
+  if (cmd=='e' || cmd=='c') {
+
+    // Read reference file
+    if (isref) {
+      readref(ref, argv[4]);  // read into ref
+
+      // Create an index. Divide ref into groups of 32 bases (8 bytes)
+      // and compute a 32 bit hash, h. Use the low 27 bits as a hash index
+      // and high 5 bits as a hash checksum. Store the checksum and a
+      // 27 bit pointer into ref packed into index[h].
+      if (cmd=='c' || cmd=='e') {
+        index.resize((1<<27)+BUCKET);
+        int collisions=0;
+        for (int i=N; i<=int(ref.size())-N-8; i+=8) {
+          unsigned long long hl=0;
+          for (int j=0; j<8; ++j) hl=hl<<8|ref[i+j];
+          unsigned int h=hash(hl);
+          unsigned int hi=h&0x7ffffff;
+          int j;
+          for (j=0; j<BUCKET && index[hi+j]; ++j);
+          if (j==BUCKET) ++collisions;
+          else index[hi+j]=(h&0xf8000000)+(i>>3);
+        }
+        printf("indexed %s: %d of %d collisions\n",
+          argv[4], collisions, ref.size()/8);
+      }
+    }
+
+    // read input files
+    FILE *in, *out[4];  // fastq, fxh, fxb, fxq, fxa
+    int n, i, j, k, len, c;
+    in=fopen(argv[2], "rb");
+    if (!in) perror(argv[2]), exit(1);
+    for (i=0; i<3+isref; ++i) {
+      string fn=string(argv[3])+".fx"+"hbqa"[i];
+      out[i]=fopen(fn.c_str(), "wb");
+      if (!out[i]) perror(fn.c_str()), exit(1);
+    }
+
+    // Save read length, n
+    for (i=j=n=0; (c=getc(in))!=EOF && !n; ++i) {
+      if (c==10 && j) n=i-j-1;
+      else if (c==10) j=i;
+    }
+    if (n<1 || n>=N) error("read length must be 1..4095");
+    printf("encoding %s -> %s read length %d\n",
+      argv[2], argv[3], n);
+    rewind(in);
+    putc(n>>8, out[0]);
+    putc(n&255, out[0]);
+
+    // encode
+    int base=0;  // packed bases in base 4
+    unsigned char hbuf[N]={0};  // previous header
+    unsigned char bbuf[N]={0};  // one sequence
+    int matches[N+3]={0};
+    int match_sum=0, base_sum=0;
+    int line=0;
+    bool ismatch=false;
+    for (line=0; 1; ++line) {
+
+      // encode header as (j+1,k+1,len+1,xxx,0) meaning
+      // add k to hbuf[..j], then len bytes match, followed by xxx,10.
+      for (i=j=k=len=0; (c=getc(in))!=EOF && c!=10; ++i) {
+        if (i>=N) error("Line too long\n");
+        if (c!=hbuf[i] && isdigit(c) && isdigit(hbuf[i]) && j<254
+            && i<254 && i==len && (!j || j==i)) {
+          int d=k*10+c-hbuf[i];
+          if (d>0 && d<254) hbuf[i]=c, k=d, j=i+1;
+        }
+        if (c==hbuf[i] && i==len && len<254) ++len;
+        hbuf[i]=c;
+      }
+      if (c==EOF) {
+        if (i) error("unexpected EOF in header");
+        break;  // done
+      }
+      putc(j+(j==0), out[0]);
+      putc(k+1, out[0]);
+      putc(len+1, out[0]);
+      for (j=len; j<i; ++j) putc(hbuf[j], out[0]);
+      putc(0, out[0]);
+
+      // read base calls into bbuf coded as ACGT=0..3
+      for (i=0, len=0; (c=getc(in))!=EOF && c!=10; ++i) {
+        if (c==EOF) error("unexpected EOF");
+        if (c!='N') {
+          j=(c=='A')+(c=='C')*2+(c=='G')*3+(c=='T')*4;
+          if (!j) error("expected base A,C,G,T,N");
+          bbuf[len++]=j-1;
+        }
+      }
+      if (i!=n) error("wrong number of base calls");
+
+      // Search for matches in the reference genome
+      int bm=0;  // best match length
+      if (isref) {
+        unsigned long long hl=0;
+        unsigned int bptr=0;  // best match index
+        int bdir=1;  // best match direction, -1 if reversed
+
+        // search in the forward direction
+        for (j=0; j<len; ++j) {
+          hl=hl*4+bbuf[j];
+          if (j>=31) {
+            unsigned int h=hash(hl);
+            unsigned int hi=h&0x7ffffff;
+            for (k=0; k<BUCKET && index[hi+k]; ++k) {
+              int m=0;
+              if ((index[hi+k]^h)<0x8000000) {
+                unsigned int ptr=(index[hi+k]&0x7ffffff)*32+31-j;
+                ++matches[n+1];
+                m=rmatch(ref, ptr, bbuf, len, 1);
+                if (m>bm) bm=m, bptr=ptr;
+              }
+            }
+          }
+        }
+
+        // search for complementary matches
+        hl=0;
+        for (j=len-1; j>=0; --j) {
+          hl=hl*4+3-bbuf[j];
+          if (j<=len-32) {
+            unsigned int h=hash(hl);
+            unsigned int hi=h&0x7ffffff;
+            for (k=0; k<BUCKET && index[hi+k]; ++k) {
+              int m=0;
+              if ((index[hi+k]^h)<0x8000000) {
+                unsigned int ptr=(index[hi+k]&0x7ffffff)*32+31+j;
+                ++matches[n+2];
+                m=rmatch(ref, ptr, bbuf, len, -1);
+                if (m>bm) bm=m, bptr=ptr, bdir=-1;
+              }
+            }
+          }
+        }
+        ++matches[bm>>24&127];
+        match_sum+=(bm>>24)&127;
+        match_sum-=(bm^bm<<8)>0xffffff;
+        match_sum-=(bm<<8^bm<<16)>0xffffff;
+        match_sum-=(bm<<16^bm<<24)>0xffffff;
+        base_sum+=len;
+
+        // write mismatch locations and pointer to reference genome
+        ismatch=(bm>>23)>=len;
+        if (!ismatch)
+          putc(0, out[3]);
+        else {
+          putc(1+bm+128*(bdir<0), out[3]);
+          putc(1+(bm>>8), out[3]);
+          putc(1+(bm>>16), out[3]);
+          putc(1+(bm>>24), out[3]);
+          putc(bptr>>24, out[3]);
+          putc(bptr>>16, out[3]);
+          putc(bptr>>8, out[3]);
+          putc(bptr, out[3]);
+        }
+      }
+
+      // write the bases
+      for (i=0; i<len; ++i) {
+        if (!ismatch || i>=(bm>>24&255) || i==(bm>>16&255) || i==(bm>>8&255)
+            || i==(bm&255)) {
+          j="\x01\x03\x04\x02"[bbuf[i]];  // ACGT -> ATCG
+          if (base*4+j>255) putc(base, out[1]), base=0;
+          base=base*4+j;
+        }
+      }
+
+      // verify empty second header "+\n"
+      if (getc(in)!='+') error("expected +");
+      if (getc(in)!=10) error("expected newline after +");
+
+      // encode quality scores
+      // c=33..104 -> c-32
+      // j,c=64..71 -> 73+(j-64)+8*(c-64)
+      // k,j,c=68..71 -> 137+(k-68)+4*(j-68)+16*(c-68)
+      // 35...,10 -> 0
+      // 71... -> 200+len
+      len=0; // pending output bytes
+      j=k=0; // last 2 bytes
+      for (i=0; (c=getc(in))!=EOF; ++i, k=j, j=c) {
+        if (c!=10 && (c<33 || c>104))
+          error("expected quality score in 33..104");
+        if (quality>1 && c>35) c-=(c-35)%quality;
+        if (c==35 && (len==0 || j==35)) ++len;
+        else if (len==0 && c>=64 && c<=71) ++len;
+        else if (len==1 && c>=68 && c<=71 && j>=68 && j<=71) ++len;
+        else if (len>=2 && len<55 && k==71 && j==71 && c==71) ++len;
+        else if (c==10 && (len==0 || j==35)) break;
+        else {  // must write pending output
+          ++len;  // c is pending
+          while (len>1 && j==35)
+            putc(3, out[2]), --len;
+          if (len>3 && j==71 && k==71)
+            putc(199+len, out[2]), len=1;
+          if (len==3) {
+            if (c>=68 && c<=71)
+              putc(137+(k-68)+4*(j-68)+16*(c-68), out[2]), len=0;
+            else
+              putc(73+(k-64)+8*(j-64), out[2]), len=1;
+          }
+          if (len==2) {
+            if (c>=64 && c<=71) putc(73+(j-64)+8*(c-64), out[2]), len=0;
+            else putc(j-32, out[2]), len=1;
+          }
+          if (len==1) {
+            if (c==10) break;
+            if (c!=35 && (c<64 || c>71)) putc(c-32, out[2]), len=0;
+          }
+        }
+      }
+      putc(0, out[2]);
+      if (i!=n) error("wrong number of quality scores");
+    }
+    putc(base, out[1]);
+    for (i=2+isref; i>=0; --i) fclose(out[i]);
+    fclose(in);
+    index.resize(0);
+    ref.resize(0);
+
+    // print match statistics
+    if (base_sum>0) {
+      printf("matches[0..%d+2]=", n);
+      for (i=0; i<=n+2; ++i) {
+        printf("%d ", matches[i]);
+        if (i%10==0) printf("\n");
+      }
+      printf("\nMatched %d of %d bases (%1.2f%%)\n",
+        match_sum, base_sum, match_sum*100.0/base_sum);
+    }
+
+    // compress each temporary file to .zpaq in a separate thread
+    if (cmd=='c') {
+      pthread_t tid[4];
+      pthread_attr_t attr; // thread joinable attribute
+      pthread_attr_init(&attr);
+      pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+      Job job[4];
+      for (i=0; i<3+isref; ++i) {
+        job[i].id=i;
+        job[i].input=string(argv[3])+".fx"+"hbqa"[i];
+        job[i].output=job[i].input+".zpaq";
+        pthread_create(&tid[i], &attr, compress, (void*)&job[i]);
+      }
+
+      // wait until all jobs are done
+      for (i=0; i<3+isref; ++i) {
+        void* status;
+        pthread_join(tid[i], &status);
+      }
+    }
+  }
+
+  // decode
+  else if (cmd=='d' || cmd=='f') {
+
+    // decompress .zpaq
+    Job job[4];
+    if (cmd=='d') {
+      pthread_t tid[4];
+      pthread_attr_t attr; // thread joinable attribute
+      pthread_attr_init(&attr);
+      pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+      for (int i=0; i<3+isref; ++i) {
+        job[i].id=i;
+        job[i].output=string(argv[2])+".fx"+"hbqa"[i];
+        job[i].input=job[i].output+".zpaq";
+        pthread_create(&tid[i], &attr, decompress, (void*)&job[i]);
+      }
+
+      // wait until all threads are done
+      for (int i=0; i<3+isref; ++i) {
+        void* status;
+        pthread_join(tid[i], &status);
+      }
+    }
+
+    // read reference
+    if (isref) readref(ref, argv[4]);
+
+    // open  files
+    FILE *in[4], *out;  // fxh, fxb, fxq, fxa, fastq
+    int i, j, k, c, n;
+    for (i=0; i<3+isref; ++i) {
+      string fn=string(argv[2])+".fx"+"hbqa"[i];
+      in[i]=fopen(fn.c_str(), "rb");
+      if (!in[i]) perror(fn.c_str()), exit(1);
+    }
+    out=fopen(argv[3], "wb");
+    if (!out) perror(argv[3]), exit(1);
+
+    // get read length, n
+    n=getc(in[0]);
+    n=n*256+getc(in[0]);
+    printf("decoding %s -> %s read length %d\n",
+      argv[2], argv[3], n);
+    if (n<1 || n>=N) error("bad read length");
+
+    // decode
+    int base=0;
+    unsigned char hbuf[N]={0}, qbuf[N]={0};
+    while (1) {
+
+      // decode header
+      j=getc(in[0])-1;  // index of last digit of number to adjust
+      if (j==EOF-1) break;
+      k=getc(in[0])-1;  // amount to add
+      i=getc(in[0])-1;  // number of matched bytes after adjustment
+      if (j<0 || k<0 || i<0) error("bad header");
+      for (; i<N && (c=getc(in[0]))!=EOF && c; ++i) hbuf[i]=c;
+      for (; k && j>=0; --j, k/=10) {
+        int d=k%10;
+        hbuf[j]+=d, k-=d;
+        if (hbuf[j]>'9') hbuf[j]-=10, k+=10;
+      }
+      for (j=0; j<i; ++j) putc(hbuf[j], out);
+      putc(10, out);
+
+      // read quality scores and save in qbuf[0..n-1]
+      // 0 -> pad with 35 and end
+      // c=1..72 -> c+32
+      // c=73..136 -> (c-73)%8+64, (c-73)/8+64
+      // c=137..200 -> (c-137)%4+68, (c-137)/4%4+68, (c-137)/16+68
+      // c=201..255 -> 71 repeated c-200 times
+      for (i=0;;) {
+        c=getc(in[2]);
+        if (c==EOF) error("unexpected end of .fxq");
+        if (i>n) error("missing .fxq terminator");
+        if (c==0) { // end of line
+          for (; i<n; ++i) qbuf[i]=35;
+          break;
+        }
+        else if (c>=201 && i+c-200<=n)
+          while (c-->200) qbuf[i++]=71;
+        else if (c>=137 && c<=200 && i<n-2) {
+          c-=137;
+          qbuf[i++]=(c&3)+68;
+          qbuf[i++]=((c>>2)&3)+68;
+          qbuf[i++]=((c>>4)&3)+68;
+        }
+        else if (c>=73 && c<=136 && i<n-1) {
+          c-=73;
+          qbuf[i++]=(c&7)+64;
+          qbuf[i++]=((c>>3)&7)+64;
+        }
+        else if (c>=1 && c<=72 && i<n) {
+          qbuf[i++]=c+32;
+        }
+        else error (".fxq code overflow");
+      }
+      if (i!=n) error("incorrect .fxq read length");
+
+      // decode match to reference
+      unsigned int bptr=0;  // pointer to match in ref
+      int bdir=0;  // read direction
+      int miss1=0, miss2=0, miss3=0, miss4=0;  // mismatches, ascending order
+      if (isref) {
+        miss1=getc(in[3]);
+        if (miss1==EOF) error("unexpcted EOF in .fxa");
+        if (miss1) {
+          if (miss1>=128) miss1-=128, bdir=-1;
+          else bdir=1;
+          --miss1;
+          miss2=getc(in[3])-1;
+          miss3=getc(in[3])-1;
+          miss4=getc(in[3])-1;
+          bptr=getc(in[3]);
+          bptr=bptr*256+getc(in[3]);
+          bptr=bptr*256+getc(in[3]);
+          bptr=bptr*256+getc(in[3]);
+        }
+      }
+
+      // decode bases
+      for (i=k=0; i<n; ++i) {
+        if (qbuf[i]==33)
+          putc('N', out);
+        else if (bdir && k!=miss1 && k!=miss2 && k!=miss3 && k<miss4) {
+          if (bptr/4>=ref.size()) error(".fxa pointer out of bounds");
+          j=(ref[bptr/4]>>(6-bptr%4*2))&3;
+          bptr+=bdir;
+          if (bdir<0) j=3-j;
+          putc("ACGT"[j], out);
+          ++k;
+        }
+        else {
+          while (base==0) {
+            base=getc(in[1]);
+            if (base==EOF) error("unexpected end of .fxb");
+          }
+          if (base>84) j=(base-21)>>6, base-=j*64;
+          else if (base>20) j=(base-5)>>4, base-=j*16;
+          else if (base>4) j=(base-1)>>2, base-=j*4;
+          else j=base, base=0;
+          putc(" ATCG"[j], out);
+          ++k;
+          bptr+=bdir;
+        }
+      }
+      putc(10, out);
+
+      // write empty second header
+      putc('+', out);
+      putc(10, out);
+
+      // write quality scores
+      for (i=0; i<n; ++i) putc(qbuf[i], out);
+      putc(10, out);
+    }
+    fclose(out);
+    for (i=2+isref; i>=0; --i) fclose(in[i]);
+
+    // delete temporary files
+    if (cmd=='d')
+      for (int i=0; i<3+isref; ++i)
+        remove(job[i].output.c_str());
+
+    // show results
+    printf("decoded %s\n", argv[3]);
+  }
+  printf("%1.2f seconds\n", double(clock()-start)/CLOCKS_PER_SEC);
+  return 0;
+}
diff --git a/libzpaq.3.pod b/libzpaq.3.pod
index 5726755..3ea1b95 100644
--- a/libzpaq.3.pod
+++ b/libzpaq.3.pod
@@ -1,737 +1,737 @@
-#  Documentation for libzpaq
-#
-#  Copyright (C) 2012, Dell Inc. Written by Matt Mahoney.
-#
-#  Permission is hereby granted, free of charge, to any person obtaining a copy
-#  of this software and associated documentation files (the "Software"), to deal
-#  in the Software without restriction, including without limitation the rights
-#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-#  copies of the Software, and to permit persons to whom the Software is
-#  furnished to do so without restriction.
-#  This Software is provided "as is" without warranty.
-#
-#  To create man page: pod2man libzpaq.3.pod > libzpaq.3
-#  To create HTML documentation: pod2html libzpaq.3.pod > libzpaq.html
-
-=pod
-
-=head1 NAME
-
-libzpaq - ZPAQ compression API
-
-=head1 SYNOPSIS
-
-    #include "libzpaq.h"
-
-    namespace libzpaq {
-
-    extern void error(const char* msg);
-
-    class Reader {
-    public:
-        virtual int get() = 0;
-        virtual int read(char* buf, int n); // optional
-        virtual ~Reader() {}
-    };
-
-    class Writer {
-    public:
-        virtual void put(int c) = 0;
-        virtual void write(const char* buf, int n); // optional
-        virtual ~Writer() {}
-    };
-
-    class SHA1 {
-    public:
-        SHA1();
-        void put(int c);
-        double size() const;
-        uint64_t usize() const
-        const char* result();
-    };
-
-    class Compressor {
-    public:
-        Compressor();
-        void setOutput(Writer* out);
-        void writeTag();
-        void startBlock(int level);
-        void startBlock(const char* hcomp);
-        void startSegment(const char* filename = 0,
-                          const char* comment = 0);
-        void setInput(Reader* i);
-        void postProcess(const char* pcomp = 0, int length = 0);
-        bool compress(int n = -1);
-        void endSegment(const char* sha1string = 0);
-        void endBlock();
-    };
-
-    class Decompresser {
-    public:
-        Decompresser();
-        void setInput(Reader* in);
-        bool findBlock(double* memptr = 0);
-        void hcomp(Writer* out);
-        bool findFilename(Writer* = 0);
-        void readComment(Writer* = 0);
-        void setOutput(Writer* out);
-        void setSHA1(SHA1* sha1ptr);
-        bool decompress(int n = -1);
-        bool pcomp(Writer* out);
-        void readSegmentEnd(char* sha1string = 0);
-    };
-
-    void compress(Reader* in, Writer* out, int level);
-
-    void decompress(Reader* in, Writer* out);
-    }
-
-=head1 DESCRIPTION
-
-I<libzpaq> is a C++ API for compressing or decompressing
-files or objects in memory comforming to the ZPAQ level 1 and 2 standards
-(see I<availability>). This document describes version 5.00
-of the software. The software may be used without
-restriction under a modified MIT license.
-
-ZPAQ provides a high level of data compression in a streaming
-(single pass) self-describing format that supports single or multiple
-named objects (such as archives) with optional integrity checking.
-
-The library provides 3 default compression levels but supports
-custom algorithms. The performance of the default levels is
-shown in the table below for the 14 file Calgary corpus as
-a tar file. Compression and decompression times are in seconds
-on a 2 GHz T3200 running on one of two cores. Memory required
-to compress or decompress is in MB. Some popular formats
-are shown for comparison.
-
-    Program       Format       Size     Time (C, D)   Memory
-    -----------   ------     ---------  -----------   ------
-    Uncompresed   .tar       3,152,896
-    compress      .tar.Z     1,319,521    1.6   0.2    .1 MB
-    gzip -9       .tar.gz    1,022,810    0.7   0.1    .1 MB
-    bzip2 -9      .tar.bz2     860,097    0.6   0.4     5 MB
-    7zip          .tar.7z      824,573    1.5   0.1   195 MB
-    zpaq 1 (fast) .tar.zpaq    806,959    2     2      38 MB
-    zpaq 2 (mid)  .tar.zpaq    699,191    8     8     112 MB
-    zpaq 3 (max)  .tar.zpaq    644,190   20    20     246 MB
-
-A ZPAQ stream consists of one or more blocks, possibly mixed with
-other data, that can be decompressed independently in any order.
-Each block consists of one or more segments that must be decompressed
-in order from the beginning of the block. Each block header contains
-a description of the decompression algorithm. Each segment consists
-of an optional filename string, an optional comment string,
-self delimiting compressed data, and an optional SHA-1 checksum.
-If ZPAQ blocks are mixed with other data, they must be
-preceded by an identifying 13 byte tag which does not otherwise
-appear in that data.
-
-ZPAQ compression is based on the PAQ context mixing model.
-An array of components predict the probability of the next bit
-of input, either independently or depending on the predictions
-of earlier components. The final prediction is arithmetic coded.
-Each component inputs a context computed from earlier input
-by a program written in ZPAQL byte code which runs on a virtual
-machine. Both the component array description and the ZPAQL
-code are encoded in a string called HCOMP in each block header.
-Data can also be stored uncompressed.
-
-A block may optionally specify a post-processor, a program
-(also in ZPAQL) which takes the decoded data as input and
-outputs the decompressed output. This program, if present,
-is encoded as a string called PCOMP which is compressed
-in the first segment prior to the compressed data. The first
-decoded byte from the first segment is a flag indicating
-whether a PCOMP string is present. The user is responsible
-for correctly pre-processing the data so that post-processing
-restores the original data.
-
-=head2 API Organization
-
-The I<libzpaq> API consists of 2 files.
-
-=over
-
-=item libzpaq.h
-
-Header file to include in your application.
-
-=item libzpaq.cpp
-
-Source code file to link to your application.
-
-=back
-
-An application would have the line C<#include "libzpaq.h"> and
-link to libzpaq.cpp.
-The API provides two classes, C<Compressor> and C<Decompresser>
-which write or read respectively each of the syntactic elements
-of a ZPAQ stream. The two functions C<compress()> and
-C<decompress()> provide simple interfaces for the most common
-uses. In either case, the user must create classes derived
-from the abstract base classes C<Reader> and C<Writer> and
-define methods C<get()> and C<put()> which the code
-will use to read and write bytes. The user must also define
-a callback error handler.
-
-By default, libzpaq(3) uses just-in-time (JIT) acceleration
-by translating ZPAQL code to x86-32 or x86-64 internally
-and executing it. This feature can be disabled by compiling
-with -DNOJIT. If enabled, it requires an x86 processor
-capable of executing SSE2 instructions. SSE2 is supported
-by most Intel processors since 2001 and AMD since 2003.
-
-Run time checks (assertions) can be enabled with -DDEBUG
-for debugging purposes.
-
-All of the API code is contained in the namespace C<libzpaq>.
-
-=head2 Callback Functions
-
-The following three functions must be defined by the user.
-
-=over
-
-=item C<extern void libzpaq::error(const char* msg);>
-
-This function must be defined by the user to handle errors
-from libzpaq. The library will call the function with
-an English language message passed to C<msg>. Errors may
-result from bad input during decompression, out of memory,
-or illegal arguments or calling sequences to libzpaq
-functions. Errors should be considered unrecoverable.
-
-=item C<int libzpaq::Reader::get() = 0;>
-
-The user must create a class derived from Reader with an
-implementation for C<get()> that reads one byte of input
-and returns its value in the range 0...255, or returns
-EOF (-1) at end of input. Objects of the derived type
-would then be passed to functions that require a C<Reader>.
-
-=item C<void libzpaq::Writer::put(int c) = 0;>
-
-The user must create a class derived from Writer with
-an implemenation of C<put()> which is expected to take
-a byte value C<c> in the range 0...255 and write it to
-output. Objects of the derived type
-would then be passed to functions that require a C<Writer>.
-
-=back
-
-The following two functions are optional. Defining them
-can improve performance slightly.
-
-=over
-
-=item C<virtual int read(char* buf, int n);>
-
-If defined, this function should input up to C<n> bytes into
-the array C<buf> and return the number actually read, in
-the range 0..n. A return value of 0 indicates end of input.
-If C<read()> is not defined, then the default implementation
-will call C<get()> n times.
-
-=item C<virtual void write(const char* buf, int n);>
-
-If defined, this function should output the elements C<buf[0]>
-through C<buf[n-1]> in order. If not defined, then the default
-implementation will call C<put()> n times.
-
-=back
-
-=head2 Simple Compression
-
-In the remainder of this document, all classes and
-functions are assumed to be in namespace C<libzpaq>.
-
-=over
-
-=item C<void compress(Reader* in, Writer* out, int mode);>
-
-C<compress()> compresses from C<in> to C<out> until C<get()>
-returns EOF. It writes a single segment in a single block
-with empty filename, comment, and checksum fields. C<mode>
-must be 1, 2, or 3, to select models I<fast>, I<mid>, or
-I<max> respectively. Higher modes compress smaller but
-take longer to compress and subsequently decompress.
-
-=item C<void decompress(Reader* in, Writer* out);>
-
-C<decompress()> decompresses any valid ZPAQ stream from
-C<in> to C<out> until C<get()> returns EOF. Any
-non-ZPAQ data in the input is ignored. Any ZPAQ blocks
-following non-ZPAQ must be preceded by a marker tag
-to be recognized. Each block is decoded according to the
-instructions in the block header. The contents of the
-filename, comment, and checksum fields are ignored.
-Data with bad checksums will be decoded anyway. If there
-is more than one segment, then all of the output
-data will be concatenated.
-
-=back
-
-=head2 class SHA1
-
-The SHA1 class is used to compute SHA-1 checksums for compression
-and verify them for decompression. It is believed to be
-computationally infeasible to find two different strings
-with the same hash value. Its member functions
-are as follows:
-
-=over
-
-=item C<SHA1();>
-
-The constructor creates a new SHA1 object representing the
-hash of an empty string.
-
-=item C<void put(int c);>
-
-Appends one byte c (0...255) to the string whose hash is represented.
-
-=item C<double size() const;>
-
-Returns the length (so far) of the string whose hash is represented.
-The largest possible value returned is
-2^61 - 1 = 2305843009213693951.0, but values larger than 2^53 =
-9007199254740992.0
-will not be exact on systems using IEEE 64 bit floating point
-representation of type C<double>. The initial value is 0.0.
-
-=item C<int64_t usize() const;>
-
-Returns the length (so far) as a 64 bit unsigned integer.
-
-=item C<const char* result();>
-
-Computes the 20 byte SHA-1 hash and resets the string back
-to a size of 0.0. The returned pointer points to an array
-inside the SHA1 object whose
-contents remain unchanged until the next call to C<result()>.
-
-=back
-
-=head2 class Compressor
-
-The C<Compressor> class has member functions to write
-each of the syntactic elements of a ZPAQ stream and to specify
-their values. It will compress using either built-in or
-user supplied models.
-
-=over
-
-=item C<Compressor();>
-
-The constructor creates a Compression object. No input source,
-output destination, or compression model is specified.
-
-=item C<void setOutput(Writer* out);>
-
-Specifies a destination for output. Must be specified before
-calling any function that writes data.
-
-=item C<void writeTag();>
-
-Writes a 13 byte marker tag which can be used to identify
-the start of a block following non-ZPAQ data.
-
-=item C<void startBlock(int level);>
-
-Writes a block header and specifies a compression model.
-If linked with F<libzpaqo.cpp>, then C<level> must be 1, 2, or 3
-to specify I<fast>, I<mid>, or I<max> respectively. Higher numbers
-compress smaller but more slowly. These models are compatible
-with both the ZPAQ level 1 and 2 standards.
-
-=item C<void startBlock(const char* hcomp);>
-
-Writes a block header and specifies the HCOMP portion of the
-compression model. The first two bytes of the string should
-encode the length of the rest of the string as a 16 bit unsigned
-number with the least significant bit first. The meaning of the
-rest of the string is defined in the ZPAQ level 2 standard.
-If the number of components (C<hcomp[8]>) is 0, then the block
-is saved in ZPAQ level 2 format, which cannot be read by
-older ZPAQ level 1 decoders. Otherwise the block is saved in
-ZPAQ level 1 format, which is compatible with all decoders.
-
-=item C<void startSegment(const char* filename = 0, const char* comment = 0);>
-
-Writes a segment header. C<filename> and
-C<comment> are NUL terminated strings. If specified, then their
-values are stored. Normally, C<filename> would be a file name
-when compressing to an archive or omitted otherwise. If a file
-is split among segments, then by convention only the first segment
-is named. C<comment> is normally the uncompressed size as a decimal
-number which is displayed when listing the contents of an archive.
-Omitting it does not affect decompression.
-
-=item C<void postProcess(const char* pcomp = 0, int length = 0);>
-
-Specifies the optional PCOMP string used for post-processing.
-It must be called from within the first segment
-of each block prior to compressing any data, but not from within
-any other segment.
-If C<pcomp> is 0 or no argument is passed, then the decompresser
-will not post-process the data. The effect is to compress a
-0 byte to indicate to the decompresser that no PCOMP string
-is present.
-
-If C<pcomp> is not 0, then I<length> bytes of the string I<pcomp>
-are passed. If I<length> is 0 or omitted, then
-the first two bytes must encode
-the length of the rest of the string as a 16 bit unsigned number
-with the least significant byte first. The format of the remainder
-of the string is described in the ZPAQ level 2 standard.
-The effect is to compress a 1 byte
-to indicate the presence of PCOMP, followed by the two length
-bytes and the string as passed. For example, either
-C<pcomp("\x02\x00\x05\x08")> or C<pcomp("\x05\x08", 2)>
-would compress the 5 bytes 1, 2, 0, 5, 8.
-The user is responsible for pre-processing the input
-prior to compression so that PCOMP restores the original data.
-
-=item C<void setInput(Reader* in);>
-
-Specifies the input source for compression. It must be set
-prior to the first call to C<compress()>.
-
-=item C<bool compress(int n = -1);>
-
-Compress n bytes of data, or until EOF is input, whichever comes
-first. If n < 0 or omitted, then compress until EOF.
-Returns true if there is more input available, or false if EOF
-was read.
-
-=item C<void endSegment(const char* sha1string = 0);>
-
-Stop compressing and write the end of a segment. If
-C<sha1string> is specified, it should be a 20 byte string
-as returned by C<SHA1::result()> on the input data for
-this segment I<before> pre-processing.
-
-=item C<void endBlock();>
-
-Finish writing the current block.
-
-=back
-
-In order to create a valid ZPAQ stream, the components must
-be written in the following order:
-
-    for each block do {
-        if any non-ZPAQ data then {
-            write non-ZPAQ data
-            writeTag()
-        }
-        startBlock()
-        for each segment do {
-            startSegment()
-            if first segment in block then {
-                postProcess()
-            }
-            while (compress(n)) ;
-            endSegment()
-        }
-        endBlock()
-    }
-
-=head2 class Decompresser
-
-The class Decompresser has member functions to read each of the
-syntactic elements of a ZPAQ stream.
-
-=over
-
-=item C<Decompresser()>
-
-The constructor creates a Decompresser object. No input source or
-output destination is specified.
-
-=item C<void setInput(Reader* in);>
-
-Specifies where the ZPAQ stream will be read from. Must be called
-before any function that reads the stream.
-
-=item C<bool findBlock(double* memptr = 0);>
-
-Scan the input to find the start of the next block. If a block
-does not start immediately, then the block must be preceded by
-a marker tag (written with C<Compressor::writeTag()>) or it will
-not be found. If C<memptr> is not 0, then write the approximate
-memory requirement (in bytes) to decompress to C<*memptr>). The
-memory will be allocated by the first call to C<decompress()>.
-It returns true if a block is found, or false if it reads to EOF
-without finding a block.
-
-=item C<void hcomp(Writer* out);>
-
-Write the HCOMP string of the current block to C<out>.
-It will be in a format suitable
-for passing to C<Compressor::startBlock()>. The first 2 bytes will
-encode the length of the rest of the string as a 16 bit unsigned
-integer with the least significant byte first. The format of the
-remainder of the string is described in the ZPAQ level 1
-specification.
-
-=item C<bool findFilename(Writer* out = 0);>
-
-Find the start of the next segment. If another segment is found
-within the current block then return true. If the end of the block
-is found first, then return false. If a segment is found, the
-filename field is not empty, and C<out>
-is not 0, then write the filename (without a terminating NUL byte)
-to C<out>.
-
-=item C<void readComment(Writer* out = 0);>
-
-Read or skip past the comment field following the filename field
-in the segment header. If C<out> is not 0 and the comment field is
-not empty, then write the comment
-(without a terminating NUL byte) to C<out>.
-
-=item C<void setOutput(Writer* out);>
-
-Specify the destination for decompression. It must be set before
-any data can be decompressed.
-
-=item C<void setSHA1(SHA1* sha1ptr);>
-
-Specify the address of a SHA1 object for computing the checksum
-of the decompressed data (after post-processing). As each byte C<c>
-is output, it is also passed to C<sha1ptr-E<gt>put(c)>. In order to
-compute the correct checksum, the SHA1 object should be in its
-initial state, either newly created, or by calling C<SHA1::result()>,
-before the first call to C<decompress()>. When the end of the segment
-is reached, the value returned by C<sha1ptr-E<gt>result()> should match
-the stored checksum, if any.
-
-=item C<bool decompress(int n = -1);>
-
-Decode n bytes or until the end of segment, whichever comes
-first. Return false if the end of segment is reached first. If
-n < 0 or not specified, then decompress to the end of segment
-and return false. C<n> is the number of bytes prior to post-processing.
-If the data is post-processed, then the size of the output may
-be different.
-
-=item C<bool pcomp(Writer* out);>
-
-Write the PCOMP string, if any, for the current block to C<out>.
-If there is no PCOMP string (no post-processor) then return false.
-Otherwise write the string to C<out> in a format suitable for
-passing to C<Compressor::postProcess()> and return true. If written,
-then the first 2 bytes will encode the length of the rest of the
-string as a 16 bit unsigned integer with the least significant
-bit first. The format of the rest of the string is descibed in
-the ZPAQ level 1 standard.
-
-C<pcomp()> is only valid after the first call to C<decompress()>
-in the current block. To read the PCOMP string without decompressing any
-data, then call C<decompress(0)> first. It is not necessary to
-call C<setOutput()> in this case.
-
-=item C<void readSegmentEnd(char* sha1string = 0);>
-
-Skip any compressed data in the current segment that has not yet
-been decompressed and advance to the end of the segment.
-Then if C<sha1string> is not 0 then write into
-the 21 byte array that it points to. If a checksum is present,
-then write a 1 into C<sha1string[0]> and write the stored checksum
-in C<sha1string[1...20]>. Otherwise write a 0 in C<sha1string[0]>.
-
-Note that it is not permitted to call decompress() if any compressed
-data has been skipped in any earlier segments in the same block.
-
-=back
-
-A valid sequence of calls is as follows:
-
-    while (findBlock()) {
-        while (findFilename()) {
-            readComment();
-            if first segment in block then { (optional)
-                decompress(0)
-                pcomp()
-            }
-            while (decompress(n)) ; (optional)
-            readSegmentEnd();
-        }
-    }
-
-=head1 EXAMPLES
-
-The following program F<listzpaq.cpp>
-lists the contents of a ZPAQ archive
-read from standard input.
-
-    #include <stdio.h>
-    #include <stdlib.h>
-    #include "libzpaq.h"
-
-    // Implement Reader and Writer interfaces for file I/O
-    class File: public libzpaq::Reader, public libzpaq::Writer {
-        FILE* f;
-    public:
-        File(FILE* f_): f(f_) {}
-        int get() {return getc(f);}
-        void put(int c) {putc(c, f);}
-        int read(char* buf, int n) {return fread(buf, 1, n, f);}
-        void write(const char* buf, int n) {fwrite(buf, 1, n, f);}
-    };
-
-    // Implement error handler
-    namespace libzpaq {
-        void error(const char* msg) {
-            fprintf(stderr, "Error: %s\n", msg);
-            exit(1);
-        }
-    }
-
-    // List the contents of an archive. For each block, show
-    // the memory required to decompress. For each segment,
-    // show the filename and comment.
-    void list(FILE* input, FILE* output) {
-        libzpaq::Decompresser d;
-        File in(input), out(output);
-        double memory;
-        d.setInput(&in);
-        for (int block=1; d.findBlock(&memory); ++block) {
-            printf("Block %d needs %1.0f MB\n", block, memory/1e6);
-            while (d.findFilename(&out)) {  // print filename
-                printf("\t");
-                d.readComment(&out);  // print comment
-                printf("\n");
-                d.readSegmentEnd();  // skip compressed data
-            }
-        }
-    }
-
-    int main() {
-        list(stdin, stdout);
-        return 0;
-    }
-
-The program could be compiled as follows:
-
-    g++ listzpaq.cpp libzpaq.cpp
-
-The following code compresses a list of files into one block
-written to stdout. Each file is compressed to a separate
-segment. For each segment, the filename, comment, and SHA-1
-checksum are stored. The comment, as conventional, is the
-file size as a decimal string.
-
-    // Compress one file to one segment
-    void compress_file(libzpaq::Compressor& c,
-                       const char* filename,
-                       bool first_segment) {
-
-        // Open input file
-        FILE* f;
-        f=fopen(filename, "rb");
-        if (!f) return;
-
-        // Compute SHA-1 checksum and file size
-        libzpaq::SHA1 sha1;
-        int ch;
-        while ((ch=getc(f))!=EOF)
-            sha1.put(ch);
-
-        // Write file size as a comment.
-        // The size can have at most 19 digits.
-        char comment[20];
-        sprintf(comment, "%1.0f", sha1.size());
-
-        // Compress segment
-        rewind(f);
-        File in(f);
-        c.startSegment(filename, comment);
-        if (first_segment)
-            c.postProcess();
-        c.setInput(&in);
-        c.compress();
-        c.endSegment(sha1.result());
-
-        // Close input file
-        fclose(f);
-    }
-
-    // Compress a list of argc files in argv[0...argc-1] into one
-    // ZPAQ block to stdout at level 2.
-    void compress_list(int argc, char** argv) {
-        libzpaq::Compressor c;
-        File out(stdout);
-        c.setOutput(&out);
-        c.startBlock(2);
-        for (int i=0; i<argc; ++i)
-            compress_file(c, argv[i], i==0);
-        c.endBlock();
-    }
-
-The following function decompresses from stdin to stdout.
-Filenames and comments are ignored, but checksums are verified
-if present.
-
-    void decompress() {
-        libzpaq::Decompresser d;
-        File in(stdin), out(stdout);
-        d.setInput(&in);
-        while (d.findBlock()) {
-            while (d.findFilename()) {
-                d.readComment();
-                libzpaq::SHA1 sha1;
-                d.setSHA1(&sha1);
-                d.setOutput(&out);
-                d.decompress();
-                char sha1string[21];
-                d.readSegmentEnd(sha1string);
-                const char* sha1result = sha1.result();
-                if (sha1string[0]==1
-                         && memcmp(sha1string+1, sha1result, 20))
-                    libzpaq::error("checksum verify error");
-            }
-        }
-    }
-
-C<Compressor::compress()> and C<Decompresser::decompress()> can
-be passed an argument n to display progress every n bytes,
-for example:
-
-    for (int i=1; d.decompress(1000000); ++i)
-        fprintf(stderr, "Decompressed %d MB\n", i);
-
-To compress or decompress to and from objects in memory, derive
-appropriate classes from C<Reader> and C<Writer>. For example, it is
-possible to compress or decompress to a C<std::string> using
-the following class.
-
-    struct String: public libzpaq::Writer {
-        std::string s;
-        void put(int c) {s+=char(c);}
-    };
-
-This class is also useful for reading the filename and comment
-fields during decompression as follows:
-
-    String filename, comment;
-    while (d.findFilename(&filename)) {
-        d.readComment(&comment);
-        // ...
-
-=head1 AVAILABILITY
-
-I<libzpaq>, I<zpaq>, and the ZPAQ level 1 and 2 specifications are
-available from L<http://mattmahoney.net/zpaq/>.
-
-=head1 SEE ALSO
-
-C<zpaq(1)>
-C<sha1(1SSL)>
-
-=cut
-
-
+#  Documentation for libzpaq
+#
+#  Copyright (C) 2012, Dell Inc. Written by Matt Mahoney.
+#
+#  Permission is hereby granted, free of charge, to any person obtaining a copy
+#  of this software and associated documentation files (the "Software"), to deal
+#  in the Software without restriction, including without limitation the rights
+#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#  copies of the Software, and to permit persons to whom the Software is
+#  furnished to do so without restriction.
+#  This Software is provided "as is" without warranty.
+#
+#  To create man page: pod2man libzpaq.3.pod > libzpaq.3
+#  To create HTML documentation: pod2html libzpaq.3.pod > libzpaq.html
+
+=pod
+
+=head1 NAME
+
+libzpaq - ZPAQ compression API
+
+=head1 SYNOPSIS
+
+    #include "libzpaq.h"
+
+    namespace libzpaq {
+
+    extern void error(const char* msg);
+
+    class Reader {
+    public:
+        virtual int get() = 0;
+        virtual int read(char* buf, int n); // optional
+        virtual ~Reader() {}
+    };
+
+    class Writer {
+    public:
+        virtual void put(int c) = 0;
+        virtual void write(const char* buf, int n); // optional
+        virtual ~Writer() {}
+    };
+
+    class SHA1 {
+    public:
+        SHA1();
+        void put(int c);
+        double size() const;
+        uint64_t usize() const;
+        const char* result();
+    };
+
+    class Compressor {
+    public:
+        Compressor();
+        void setOutput(Writer* out);
+        void writeTag();
+        void startBlock(int level);
+        void startBlock(const char* hcomp);
+        void startSegment(const char* filename = 0,
+                          const char* comment = 0);
+        void setInput(Reader* i);
+        void postProcess(const char* pcomp = 0, int length = 0);
+        bool compress(int n = -1);
+        void endSegment(const char* sha1string = 0);
+        void endBlock();
+    };
+
+    class Decompresser {
+    public:
+        Decompresser();
+        void setInput(Reader* in);
+        bool findBlock(double* memptr = 0);
+        void hcomp(Writer* out);
+        bool findFilename(Writer* = 0);
+        void readComment(Writer* = 0);
+        void setOutput(Writer* out);
+        void setSHA1(SHA1* sha1ptr);
+        bool decompress(int n = -1);
+        bool pcomp(Writer* out);
+        void readSegmentEnd(char* sha1string = 0);
+    };
+
+    void compress(Reader* in, Writer* out, int level);
+
+    void decompress(Reader* in, Writer* out);
+    }
+
+=head1 DESCRIPTION
+
+I<libzpaq> is a C++ API for compressing or decompressing
+files or objects in memory conforming to the ZPAQ level 1 and 2 standards
+(see I<availability>). This document describes version 5.00
+of the software. The software may be used without
+restriction under a modified MIT license.
+
+ZPAQ provides a high level of data compression in a streaming
+(single pass) self-describing format that supports single or multiple
+named objects (such as archives) with optional integrity checking.
+
+The library provides 3 default compression levels but supports
+custom algorithms. The performance of the default levels is
+shown in the table below for the 14 file Calgary corpus as
+a tar file. Compression and decompression times are in seconds
+on a 2 GHz T3200 running on one of two cores. Memory required
+to compress or decompress is in MB. Some popular formats
+are shown for comparison.
+
+    Program       Format       Size     Time (C, D)   Memory
+    -----------   ------     ---------  -----------   ------
+    Uncompressed  .tar       3,152,896
+    compress      .tar.Z     1,319,521    1.6   0.2    .1 MB
+    gzip -9       .tar.gz    1,022,810    0.7   0.1    .1 MB
+    bzip2 -9      .tar.bz2     860,097    0.6   0.4     5 MB
+    7zip          .tar.7z      824,573    1.5   0.1   195 MB
+    zpaq 1 (fast) .tar.zpaq    806,959    2     2      38 MB
+    zpaq 2 (mid)  .tar.zpaq    699,191    8     8     112 MB
+    zpaq 3 (max)  .tar.zpaq    644,190   20    20     246 MB
+
+A ZPAQ stream consists of one or more blocks, possibly mixed with
+other data, that can be decompressed independently in any order.
+Each block consists of one or more segments that must be decompressed
+in order from the beginning of the block. Each block header contains
+a description of the decompression algorithm. Each segment consists
+of an optional filename string, an optional comment string,
+self delimiting compressed data, and an optional SHA-1 checksum.
+If ZPAQ blocks are mixed with other data, they must be
+preceded by an identifying 13 byte tag which does not otherwise
+appear in that data.
+
+ZPAQ compression is based on the PAQ context mixing model.
+An array of components predict the probability of the next bit
+of input, either independently or depending on the predictions
+of earlier components. The final prediction is arithmetic coded.
+Each component inputs a context computed from earlier input
+by a program written in ZPAQL byte code which runs on a virtual
+machine. Both the component array description and the ZPAQL
+code are encoded in a string called HCOMP in each block header.
+Data can also be stored uncompressed.
+
+A block may optionally specify a post-processor, a program
+(also in ZPAQL) which takes the decoded data as input and
+outputs the decompressed output. This program, if present,
+is encoded as a string called PCOMP which is compressed
+in the first segment prior to the compressed data. The first
+decoded byte from the first segment is a flag indicating
+whether a PCOMP string is present. The user is responsible
+for correctly pre-processing the data so that post-processing
+restores the original data.
+
+=head2 API Organization
+
+The I<libzpaq> API consists of 2 files.
+
+=over
+
+=item libzpaq.h
+
+Header file to include in your application.
+
+=item libzpaq.cpp
+
+Source code file to link to your application.
+
+=back
+
+An application would have the line C<#include "libzpaq.h"> and
+link to libzpaq.cpp.
+The API provides two classes, C<Compressor> and C<Decompresser>
+which write or read respectively each of the syntactic elements
+of a ZPAQ stream. The two functions C<compress()> and
+C<decompress()> provide simple interfaces for the most common
+uses. In either case, the user must create classes derived
+from the abstract base classes C<Reader> and C<Writer> and
+define methods C<get()> and C<put()> which the code
+will use to read and write bytes. The user must also define
+a callback error handler.
+
+By default, libzpaq(3) uses just-in-time (JIT) acceleration
+by translating ZPAQL code to x86-32 or x86-64 internally
+and executing it. This feature can be disabled by compiling
+with -DNOJIT. If enabled, it requires an x86 processor
+capable of executing SSE2 instructions. SSE2 is supported
+by most Intel processors since 2001 and AMD since 2003.
+
+Run time checks (assertions) can be enabled with -DDEBUG
+for debugging purposes.
+
+All of the API code is contained in the namespace C<libzpaq>.
+
+=head2 Callback Functions
+
+The following three functions must be defined by the user.
+
+=over
+
+=item C<extern void libzpaq::error(const char* msg);>
+
+This function must be defined by the user to handle errors
+from libzpaq. The library will call the function with
+an English language message passed to C<msg>. Errors may
+result from bad input during decompression, out of memory,
+or illegal arguments or calling sequences to libzpaq
+functions. Errors should be considered unrecoverable.
+
+=item C<int libzpaq::Reader::get() = 0;>
+
+The user must create a class derived from Reader with an
+implementation for C<get()> that reads one byte of input
+and returns its value in the range 0...255, or returns
+EOF (-1) at end of input. Objects of the derived type
+would then be passed to functions that require a C<Reader>.
+
+=item C<void libzpaq::Writer::put(int c) = 0;>
+
+The user must create a class derived from Writer with
+an implementation of C<put()> which is expected to take
+a byte value C<c> in the range 0...255 and write it to
+output. Objects of the derived type
+would then be passed to functions that require a C<Writer>.
+
+=back
+
+The following two functions are optional. Defining them
+can improve performance slightly.
+
+=over
+
+=item C<virtual int read(char* buf, int n);>
+
+If defined, this function should input up to C<n> bytes into
+the array C<buf> and return the number actually read, in
+the range 0..n. A return value of 0 indicates end of input.
+If C<read()> is not defined, then the default implementation
+will call C<get()> n times.
+
+=item C<virtual void write(const char* buf, int n);>
+
+If defined, this function should output the elements C<buf[0]>
+through C<buf[n-1]> in order. If not defined, then the default
+implementation will call C<put()> n times.
+
+=back
+
+=head2 Simple Compression
+
+In the remainder of this document, all classes and
+functions are assumed to be in namespace C<libzpaq>.
+
+=over
+
+=item C<void compress(Reader* in, Writer* out, int mode);>
+
+C<compress()> compresses from C<in> to C<out> until C<get()>
+returns EOF. It writes a single segment in a single block
+with empty filename, comment, and checksum fields. C<mode>
+must be 1, 2, or 3, to select models I<fast>, I<mid>, or
+I<max> respectively. Higher modes compress smaller but
+take longer to compress and subsequently decompress.
+
+=item C<void decompress(Reader* in, Writer* out);>
+
+C<decompress()> decompresses any valid ZPAQ stream from
+C<in> to C<out> until C<get()> returns EOF. Any
+non-ZPAQ data in the input is ignored. Any ZPAQ blocks
+following non-ZPAQ data must be preceded by a marker tag
+to be recognized. Each block is decoded according to the
+instructions in the block header. The contents of the
+filename, comment, and checksum fields are ignored.
+Data with bad checksums will be decoded anyway. If there
+is more than one segment, then all of the output
+data will be concatenated.
+
+=back
+
+=head2 class SHA1
+
+The SHA1 class is used to compute SHA-1 checksums for compression
+and verify them for decompression. It is believed to be
+computationally infeasible to find two different strings
+with the same hash value. Its member functions
+are as follows:
+
+=over
+
+=item C<SHA1();>
+
+The constructor creates a new SHA1 object representing the
+hash of an empty string.
+
+=item C<void put(int c);>
+
+Appends one byte c (0...255) to the string whose hash is represented.
+
+=item C<double size() const;>
+
+Returns the length (so far) of the string whose hash is represented.
+The largest possible value returned is
+2^61 - 1 = 2305843009213693951.0, but values larger than 2^53 =
+9007199254740992.0
+will not be exact on systems using IEEE 64 bit floating point
+representation of type C<double>. The initial value is 0.0.
+
+=item C<uint64_t usize() const;>
+
+Returns the length (so far) as a 64 bit unsigned integer.
+
+=item C<const char* result();>
+
+Computes the 20 byte SHA-1 hash and resets the string back
+to a size of 0.0. The returned pointer points to an array
+inside the SHA1 object whose
+contents remain unchanged until the next call to C<result()>.
+
+=back
+
+=head2 class Compressor
+
+The C<Compressor> class has member functions to write
+each of the syntactic elements of a ZPAQ stream and to specify
+their values. It will compress using either built-in or
+user supplied models.
+
+=over
+
+=item C<Compressor();>
+
+The constructor creates a Compressor object. No input source,
+output destination, or compression model is specified.
+
+=item C<void setOutput(Writer* out);>
+
+Specifies a destination for output. Must be specified before
+calling any function that writes data.
+
+=item C<void writeTag();>
+
+Writes a 13 byte marker tag which can be used to identify
+the start of a block following non-ZPAQ data.
+
+=item C<void startBlock(int level);>
+
+Writes a block header and specifies a compression model.
+If linked with F<libzpaqo.cpp>, then C<level> must be 1, 2, or 3
+to specify I<fast>, I<mid>, or I<max> respectively. Higher numbers
+compress smaller but more slowly. These models are compatible
+with both the ZPAQ level 1 and 2 standards.
+
+=item C<void startBlock(const char* hcomp);>
+
+Writes a block header and specifies the HCOMP portion of the
+compression model. The first two bytes of the string should
+encode the length of the rest of the string as a 16 bit unsigned
+number with the least significant byte first. The meaning of the
+rest of the string is defined in the ZPAQ level 2 standard.
+If the number of components (C<hcomp[8]>) is 0, then the block
+is saved in ZPAQ level 2 format, which cannot be read by
+older ZPAQ level 1 decoders. Otherwise the block is saved in
+ZPAQ level 1 format, which is compatible with all decoders.
+
+=item C<void startSegment(const char* filename = 0, const char* comment = 0);>
+
+Writes a segment header. C<filename> and
+C<comment> are NUL terminated strings. If specified, then their
+values are stored. Normally, C<filename> would be a file name
+when compressing to an archive or omitted otherwise. If a file
+is split among segments, then by convention only the first segment
+is named. C<comment> is normally the uncompressed size as a decimal
+number which is displayed when listing the contents of an archive.
+Omitting it does not affect decompression.
+
+=item C<void postProcess(const char* pcomp = 0, int length = 0);>
+
+Specifies the optional PCOMP string used for post-processing.
+It must be called from within the first segment
+of each block prior to compressing any data, but not from within
+any other segment.
+If C<pcomp> is 0 or no argument is passed, then the decompresser
+will not post-process the data. The effect is to compress a
+0 byte to indicate to the decompresser that no PCOMP string
+is present.
+
+If C<pcomp> is not 0, then I<length> bytes of the string I<pcomp>
+are passed. If I<length> is 0 or omitted, then
+the first two bytes must encode
+the length of the rest of the string as a 16 bit unsigned number
+with the least significant byte first. The format of the remainder
+of the string is described in the ZPAQ level 2 standard.
+The effect is to compress a 1 byte
+to indicate the presence of PCOMP, followed by the two length
+bytes and the string as passed. For example, either
+C<postProcess("\x02\x00\x05\x08")> or C<postProcess("\x05\x08", 2)>
+would compress the 5 bytes 1, 2, 0, 5, 8.
+The user is responsible for pre-processing the input
+prior to compression so that PCOMP restores the original data.
+
+=item C<void setInput(Reader* in);>
+
+Specifies the input source for compression. It must be set
+prior to the first call to C<compress()>.
+
+=item C<bool compress(int n = -1);>
+
+Compress n bytes of data, or until EOF is input, whichever comes
+first. If n < 0 or omitted, then compress until EOF.
+Returns true if there is more input available, or false if EOF
+was read.
+
+=item C<void endSegment(const char* sha1string = 0);>
+
+Stop compressing and write the end of a segment. If
+C<sha1string> is specified, it should be a 20 byte string
+as returned by C<SHA1::result()> on the input data for
+this segment I<before> pre-processing.
+
+=item C<void endBlock();>
+
+Finish writing the current block.
+
+=back
+
+In order to create a valid ZPAQ stream, the components must
+be written in the following order:
+
+    for each block do {
+        if any non-ZPAQ data then {
+            write non-ZPAQ data
+            writeTag()
+        }
+        startBlock()
+        for each segment do {
+            startSegment()
+            if first segment in block then {
+                postProcess()
+            }
+            while (compress(n)) ;
+            endSegment()
+        }
+        endBlock()
+    }
+
+=head2 class Decompresser
+
+The class Decompresser has member functions to read each of the
+syntactic elements of a ZPAQ stream.
+
+=over
+
+=item C<Decompresser()>
+
+The constructor creates a Decompresser object. No input source or
+output destination is specified.
+
+=item C<void setInput(Reader* in);>
+
+Specifies where the ZPAQ stream will be read from. Must be called
+before any function that reads the stream.
+
+=item C<bool findBlock(double* memptr = 0);>
+
+Scan the input to find the start of the next block. If a block
+does not start immediately, then the block must be preceded by
+a marker tag (written with C<Compressor::writeTag()>) or it will
+not be found. If C<memptr> is not 0, then write the approximate
+memory requirement (in bytes) to decompress to C<*memptr>. The
+memory will be allocated by the first call to C<decompress()>.
+It returns true if a block is found, or false if it reads to EOF
+without finding a block.
+
+=item C<void hcomp(Writer* out);>
+
+Write the HCOMP string of the current block to C<out>.
+It will be in a format suitable
+for passing to C<Compressor::startBlock()>. The first 2 bytes will
+encode the length of the rest of the string as a 16 bit unsigned
+integer with the least significant byte first. The format of the
+remainder of the string is described in the ZPAQ level 1
+specification.
+
+=item C<bool findFilename(Writer* out = 0);>
+
+Find the start of the next segment. If another segment is found
+within the current block then return true. If the end of the block
+is found first, then return false. If a segment is found, the
+filename field is not empty, and C<out>
+is not 0, then write the filename (without a terminating NUL byte)
+to C<out>.
+
+=item C<void readComment(Writer* out = 0);>
+
+Read or skip past the comment field following the filename field
+in the segment header. If C<out> is not 0 and the comment field is
+not empty, then write the comment
+(without a terminating NUL byte) to C<out>.
+
+=item C<void setOutput(Writer* out);>
+
+Specify the destination for decompression. It must be set before
+any data can be decompressed.
+
+=item C<void setSHA1(SHA1* sha1ptr);>
+
+Specify the address of a SHA1 object for computing the checksum
+of the decompressed data (after post-processing). As each byte C<c>
+is output, it is also passed to C<sha1ptr-E<gt>put(c)>. In order to
+compute the correct checksum, the SHA1 object should be in its
+initial state, either newly created or reset by calling C<SHA1::result()>,
+before the first call to C<decompress()>. When the end of the segment
+is reached, the value returned by C<sha1ptr-E<gt>result()> should match
+the stored checksum, if any.
+
+=item C<bool decompress(int n = -1);>
+
+Decode n bytes or until the end of segment, whichever comes
+first. Return false if the end of segment is reached first. If
+n < 0 or not specified, then decompress to the end of segment
+and return false. C<n> is the number of bytes prior to post-processing.
+If the data is post-processed, then the size of the output may
+be different.
+
+=item C<bool pcomp(Writer* out);>
+
+Write the PCOMP string, if any, for the current block to C<out>.
+If there is no PCOMP string (no post-processor) then return false.
+Otherwise write the string to C<out> in a format suitable for
+passing to C<Compressor::postProcess()> and return true. If written,
+then the first 2 bytes will encode the length of the rest of the
+string as a 16 bit unsigned integer with the least significant
+byte first. The format of the rest of the string is described in
+the ZPAQ level 1 standard.
+
+C<pcomp()> is only valid after the first call to C<decompress()>
+in the current block. To read the PCOMP string without decompressing any
+data, call C<decompress(0)> first. It is not necessary to
+call C<setOutput()> in this case.
+
+=item C<void readSegmentEnd(char* sha1string = 0);>
+
+Skip any compressed data in the current segment that has not yet
+been decompressed and advance to the end of the segment.
+Then if C<sha1string> is not 0 then write into
+the 21 byte array that it points to. If a checksum is present,
+then write a 1 into C<sha1string[0]> and write the stored checksum
+in C<sha1string[1...20]>. Otherwise write a 0 in C<sha1string[0]>.
+
+Note that it is not permitted to call decompress() if any compressed
+data has been skipped in any earlier segments in the same block.
+
+=back
+
+A valid sequence of calls is as follows:
+
+    while (findBlock()) {
+        while (findFilename()) {
+            readComment();
+            if first segment in block then { (optional)
+                decompress(0)
+                pcomp()
+            }
+            while (decompress(n)) ; (optional)
+            readSegmentEnd();
+        }
+    }
+
+=head1 EXAMPLES
+
+The following program F<listzpaq.cpp>
+lists the contents of a ZPAQ archive
+read from standard input.
+
+    #include <stdio.h>
+    #include <stdlib.h>
+    #include "libzpaq.h"
+
+    // Implement Reader and Writer interfaces for file I/O
+    class File: public libzpaq::Reader, public libzpaq::Writer {
+        FILE* f;
+    public:
+        File(FILE* f_): f(f_) {}
+        int get() {return getc(f);}
+        void put(int c) {putc(c, f);}
+        int read(char* buf, int n) {return fread(buf, 1, n, f);}
+        void write(const char* buf, int n) {fwrite(buf, 1, n, f);}
+    };
+
+    // Implement error handler
+    namespace libzpaq {
+        void error(const char* msg) {
+            fprintf(stderr, "Error: %s\n", msg);
+            exit(1);
+        }
+    }
+
+    // List the contents of an archive. For each block, show
+    // the memory required to decompress. For each segment,
+    // show the filename and comment.
+    void list(FILE* input, FILE* output) {
+        libzpaq::Decompresser d;
+        File in(input), out(output);
+        double memory;
+        d.setInput(&in);
+        for (int block=1; d.findBlock(&memory); ++block) {
+            printf("Block %d needs %1.0f MB\n", block, memory/1e6);
+            while (d.findFilename(&out)) {  // print filename
+                printf("\t");
+                d.readComment(&out);  // print comment
+                printf("\n");
+                d.readSegmentEnd();  // skip compressed data
+            }
+        }
+    }
+
+    int main() {
+        list(stdin, stdout);
+        return 0;
+    }
+
+The program could be compiled as follows:
+
+    g++ listzpaq.cpp libzpaq.cpp
+
+The following code compresses a list of files into one block
+written to stdout. Each file is compressed to a separate
+segment. For each segment, the filename, comment, and SHA-1
+checksum are stored. The comment, as conventional, is the
+file size as a decimal string.
+
+    // Compress one file to one segment
+    void compress_file(libzpaq::Compressor& c,
+                       const char* filename,
+                       bool first_segment) {
+
+        // Open input file
+        FILE* f;
+        f=fopen(filename, "rb");
+        if (!f) return;
+
+        // Compute SHA-1 checksum and file size
+        libzpaq::SHA1 sha1;
+        int ch;
+        while ((ch=getc(f))!=EOF)
+            sha1.put(ch);
+
+        // Write file size as a comment.
+        // The size can have at most 19 digits.
+        char comment[20];
+        sprintf(comment, "%1.0f", sha1.size());
+
+        // Compress segment
+        rewind(f);
+        File in(f);
+        c.startSegment(filename, comment);
+        if (first_segment)
+            c.postProcess();
+        c.setInput(&in);
+        c.compress();
+        c.endSegment(sha1.result());
+
+        // Close input file
+        fclose(f);
+    }
+
+    // Compress a list of argc files in argv[0...argc-1] into one
+    // ZPAQ block to stdout at level 2.
+    void compress_list(int argc, char** argv) {
+        libzpaq::Compressor c;
+        File out(stdout);
+        c.setOutput(&out);
+        c.startBlock(2);
+        for (int i=0; i<argc; ++i)
+            compress_file(c, argv[i], i==0);
+        c.endBlock();
+    }
+
+The following function decompresses from stdin to stdout.
+Filenames and comments are ignored, but checksums are verified
+if present.
+
+    void decompress() {
+        libzpaq::Decompresser d;
+        File in(stdin), out(stdout);
+        d.setInput(&in);
+        while (d.findBlock()) {
+            while (d.findFilename()) {
+                d.readComment();
+                libzpaq::SHA1 sha1;
+                d.setSHA1(&sha1);
+                d.setOutput(&out);
+                d.decompress();
+                char sha1string[21];
+                d.readSegmentEnd(sha1string);
+                const char* sha1result = sha1.result();
+                if (sha1string[0]==1
+                         && memcmp(sha1string+1, sha1result, 20))
+                    libzpaq::error("checksum verify error");
+            }
+        }
+    }
+
+C<Compressor::compress()> and C<Decompresser::decompress()> can
+be passed an argument n to display progress every n bytes,
+for example:
+
+    for (int i=1; d.decompress(1000000); ++i)
+        fprintf(stderr, "Decompressed %d MB\n", i);
+
+To compress or decompress to and from objects in memory, derive
+appropriate classes from C<Reader> and C<Writer>. For example, it is
+possible to compress or decompress to a C<std::string> using
+the following class.
+
+    struct String: public libzpaq::Writer {
+        std::string s;
+        void put(int c) {s+=char(c);}
+    };
+
+This class is also useful for reading the filename and comment
+fields during decompression as follows:
+
+    String filename, comment;
+    while (d.findFilename(&filename)) {
+        d.readComment(&comment);
+        // ...
+
+=head1 AVAILABILITY
+
+I<libzpaq>, I<zpaq>, and the ZPAQ level 1 and 2 specifications are
+available from L<http://mattmahoney.net/zpaq/>.
+
+=head1 SEE ALSO
+
+C<zpaq(1)>
+C<sha1(1SSL)>
+
+=cut
+
+
diff --git a/libzpaq.cpp b/libzpaq.cpp
index f0c35d5..084a7aa 100644
--- a/libzpaq.cpp
+++ b/libzpaq.cpp
@@ -1,3181 +1,3181 @@
-/* libzpaq.cpp - Part of LIBZPAQ Version 5.01
-
-  Copyright (C) 2011, Dell Inc. Written by Matt Mahoney.
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so without restriction.
-  This Software is provided "as is" without warranty.
-
-LIBZPAQ is a C++ library for compression and decompression of data
-conforming to the ZPAQ level 2 standard. See http://mattmahoney.net/zpaq/
-*/
-
-#include "libzpaq.h"
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-
-#ifndef NOJIT
-#ifdef unix
-#include <sys/mman.h>
-#else
-#include <windows.h>
-#endif
-#endif
-
-namespace libzpaq {
-
-// Standard library redirections
-void* calloc(size_t a, size_t b) {return ::calloc(a, b);}
-void free(void* p) {::free(p);}
-int memcmp(const void* d, const void* s, size_t n) {
-  return ::memcmp(d, s, n);}
-void* memset(void* d, int c, size_t n) {return ::memset(d, c, n);}
-double log(double x) {return ::log(x);}
-double exp(double x) {return ::exp(x);}
-double pow(double x, double y) {return ::pow(x, y);}
-
-// Read 16 bit little-endian number
-int toU16(const char* p) {
-  return (p[0]&255)+256*(p[1]&255);
-}
-
-// Default read() and write()
-int Reader::read(char* buf, int n) {
-  int i=0, c;
-  while (i<n && (c=get())>=0)
-    buf[i++]=c;
-  return i;
-}
-
-void Writer::write(const char* buf, int n) {
-  for (int i=0; i<n; ++i)
-    put(U8(buf[i]));
-}
-
-///////////////////////// allocx //////////////////////
-
-// Allocate newsize > 0 bytes of executable memory and update
-// p to point to it and newsize = n. Free any previously
-// allocated memory first. If newsize is 0 then free only.
-// Call error in case of failure. If NOJIT, ignore newsize
-// and set p=0, n=0 without allocating memory.
-void allocx(U8* &p, int &n, int newsize) {
-#ifdef NOJIT
-  p=0;
-  n=0;
-#else
-  if (p || n) {
-    if (p)
-#ifdef unix
-      munmap(p, n);
-#else // Windows
-      VirtualFree(p, 0, MEM_RELEASE);
-#endif
-    p=0;
-    n=0;
-  }
-  if (newsize>0) {
-#ifdef unix
-    p=(U8*)mmap(0, newsize, PROT_READ|PROT_WRITE|PROT_EXEC,
-                MAP_PRIVATE|MAP_ANON, -1, 0);
-    if ((void*)p==MAP_FAILED) p=0;
-#else
-    p=(U8*)VirtualAlloc(0, newsize, MEM_RESERVE|MEM_COMMIT,
-                        PAGE_EXECUTE_READWRITE);
-#endif
-    if (p)
-      n=newsize;
-    else {
-      n=0;
-      error("allocx failed");
-    }
-  }
-#endif
-}
-
-//////////////////////////// SHA1 ////////////////////////////
-
-// SHA1 code, see http://en.wikipedia.org/wiki/SHA-1
-
-// Start a new hash
-void SHA1::init() {
-  len0=len1=0;
-  h[0]=0x67452301;
-  h[1]=0xEFCDAB89;
-  h[2]=0x98BADCFE;
-  h[3]=0x10325476;
-  h[4]=0xC3D2E1F0;
-}
-
-// Return old result and start a new hash
-const char* SHA1::result() {
-
-  // pad and append length
-  const U32 s1=len1, s0=len0;
-  put(0x80);
-  while ((len0&511)!=448)
-    put(0);
-  put(s1>>24);
-  put(s1>>16);
-  put(s1>>8);
-  put(s1);
-  put(s0>>24);
-  put(s0>>16);
-  put(s0>>8);
-  put(s0);
-
-  // copy h to hbuf
-  for (int i=0; i<5; ++i) {
-    hbuf[4*i]=h[i]>>24;
-    hbuf[4*i+1]=h[i]>>16;
-    hbuf[4*i+2]=h[i]>>8;
-    hbuf[4*i+3]=h[i];
-  }
-
-  // return hash prior to clearing state
-  init();
-  return hbuf;
-}
-
-// Hash 1 block of 64 bytes
-void SHA1::process() {
-  for (int i=16; i<80; ++i) {
-    w[i]=w[i-3]^w[i-8]^w[i-14]^w[i-16];
-    w[i]=w[i]<<1|w[i]>>31;
-  }
-  U32 a=h[0];
-  U32 b=h[1];
-  U32 c=h[2];
-  U32 d=h[3];
-  U32 e=h[4];
-  const U32 k1=0x5A827999, k2=0x6ED9EBA1, k3=0x8F1BBCDC, k4=0xCA62C1D6;
-#define f1(a,b,c,d,e,i) e+=(a<<5|a>>27)+((b&c)|(~b&d))+k1+w[i]; b=b<<30|b>>2;
-#define f5(i) f1(a,b,c,d,e,i) f1(e,a,b,c,d,i+1) f1(d,e,a,b,c,i+2) \
-              f1(c,d,e,a,b,i+3) f1(b,c,d,e,a,i+4)
-  f5(0) f5(5) f5(10) f5(15)
-#undef f1
-#define f1(a,b,c,d,e,i) e+=(a<<5|a>>27)+(b^c^d)+k2+w[i]; b=b<<30|b>>2;
-  f5(20) f5(25) f5(30) f5(35)
-#undef f1
-#define f1(a,b,c,d,e,i) e+=(a<<5|a>>27)+((b&c)|(b&d)|(c&d))+k3+w[i]; b=b<<30|b>>2;
-  f5(40) f5(45) f5(50) f5(55)
-#undef f1
-#define f1(a,b,c,d,e,i) e+=(a<<5|a>>27)+(b^c^d)+k4+w[i]; b=b<<30|b>>2;
-  f5(60) f5(65) f5(70) f5(75)
-#undef f1
-#undef f5
-  h[0]+=a;
-  h[1]+=b;
-  h[2]+=c;
-  h[3]+=d;
-  h[4]+=e;
-}
-
-//////////////////////////// Component ///////////////////////
-
-// A Component is a context model, indirect context model, match model,
-// fixed weight mixer, adaptive 2 input mixer without or with current
-// partial byte as context, adaptive m input mixer (without or with),
-// or SSE (without or with).
-
-const int compsize[256]={0,2,3,2,3,4,6,6,3,5};
-
-void Component::init() {
-  limit=cxt=a=b=c=0;
-  cm.resize(0);
-  ht.resize(0);
-  a16.resize(0);
-}
-
-////////////////////////// StateTable //////////////////////////
-
-// How many states with count of n0 zeros, n1 ones (0...2)
-int StateTable::num_states(int n0, int n1) {
-  const int B=6;
-  const int bound[B]={20,48,15,8,6,5}; // n0 -> max n1, n1 -> max n0
-  if (n0<n1) return num_states(n1, n0);
-  if (n0<0 || n1<0 || n1>=B || n0>bound[n1]) return 0;
-  return 1+(n1>0 && n0+n1<=17);
-}
-
-// New value of count n0 if 1 is observed (and vice versa)
-void StateTable::discount(int& n0) {
-  n0=(n0>=1)+(n0>=2)+(n0>=3)+(n0>=4)+(n0>=5)+(n0>=7)+(n0>=8);
-}
-
-// compute next n0,n1 (0 to N) given input y (0 or 1)
-void StateTable::next_state(int& n0, int& n1, int y) {
-  if (n0<n1)
-    next_state(n1, n0, 1-y);
-  else {
-    if (y) {
-      ++n1;
-      discount(n0);
-    }
-    else {
-      ++n0;
-      discount(n1);
-    }
-    // 20,0,0 -> 20,0
-    // 48,1,0 -> 48,1
-    // 15,2,0 -> 8,1
-    //  8,3,0 -> 6,2
-    //  8,3,1 -> 5,3
-    //  6,4,0 -> 5,3
-    //  5,5,0 -> 5,4
-    //  5,5,1 -> 4,5
-    while (!num_states(n0, n1)) {
-      if (n1<2) --n0;
-      else {
-        n0=(n0*(n1-1)+(n1/2))/n1;
-        --n1;
-      }
-    }
-  }
-}
-
-// Initialize next state table ns[state*4] -> next if 0, next if 1, n0, n1
-StateTable::StateTable() {
-
-  // Assign states by increasing priority
-  const int N=50;
-  U8 t[N][N][2]={{{0}}}; // (n0,n1,y) -> state number
-  int state=0;
-  for (int i=0; i<N; ++i) {
-    for (int n1=0; n1<=i; ++n1) {
-      int n0=i-n1;
-      int n=num_states(n0, n1);
-      assert(n>=0 && n<=2);
-      if (n) {
-        t[n0][n1][0]=state;
-        t[n0][n1][1]=state+n-1;
-        state+=n;
-      }
-    }
-  }
-       
-  // Generate next state table
-  memset(ns, 0, sizeof(ns));
-  for (int n0=0; n0<N; ++n0) {
-    for (int n1=0; n1<N; ++n1) {
-      for (int y=0; y<num_states(n0, n1); ++y) {
-        int s=t[n0][n1][y];
-        assert(s>=0 && s<256);
-        int s0=n0, s1=n1;
-        next_state(s0, s1, 0);
-        assert(s0>=0 && s0<N && s1>=0 && s1<N);
-        ns[s*4+0]=t[s0][s1][0];
-        s0=n0, s1=n1;
-        next_state(s0, s1, 1);
-        assert(s0>=0 && s0<N && s1>=0 && s1<N);
-        ns[s*4+1]=t[s0][s1][1];
-        ns[s*4+2]=n0;
-        ns[s*4+3]=n1;
-      }
-    }
-  }
-}
-
-/////////////////////////// ZPAQL //////////////////////////
-
-// Write header to out2, return true if HCOMP/PCOMP section is present.
-// If pp is true, then write only the postprocessor code.
-bool ZPAQL::write(Writer* out2, bool pp) {
-  if (header.size()<=6) return false;
-  assert(header[0]+256*header[1]==cend-2+hend-hbegin);
-  assert(cend>=7);
-  assert(hbegin>=cend);
-  assert(hend>=hbegin);
-  assert(out2);
-  if (!pp) {  // if not a postprocessor then write COMP
-    for (int i=0; i<cend; ++i)
-      out2->put(header[i]);
-  }
-  else {  // write PCOMP size only
-    out2->put((hend-hbegin)&255);
-    out2->put((hend-hbegin)>>8);
-  }
-  for (int i=hbegin; i<hend; ++i)
-    out2->put(header[i]);
-  return true;
-}
-
-// Read header from in2
-int ZPAQL::read(Reader* in2) {
-
-  // Get header size and allocate
-  int hsize=in2->get();
-  hsize+=in2->get()*256;
-  header.resize(hsize+300);
-  cend=hbegin=hend=0;
-  header[cend++]=hsize&255;
-  header[cend++]=hsize>>8;
-  while (cend<7) header[cend++]=in2->get(); // hh hm ph pm n
-
-  // Read COMP
-  int n=header[cend-1];
-  for (int i=0; i<n; ++i) {
-    int type=in2->get();  // component type
-    if (type==-1) error("unexpected end of file");
-    header[cend++]=type;  // component type
-    int size=compsize[type];
-    if (size<1) error("Invalid component type");
-    if (cend+size>header.isize()-8) error("COMP list too big");
-    for (int j=1; j<size; ++j)
-      header[cend++]=in2->get();
-  }
-  if ((header[cend++]=in2->get())!=0) error("missing COMP END");
-
-  // Insert a guard gap and read HCOMP
-  hbegin=hend=cend+128;
-  while (hend<hsize+129) {
-    assert(hend<header.isize()-8);
-    int op=in2->get();
-    if (op==-1) error("unexpected end of file");
-    header[hend++]=op;
-  }
-  if ((header[hend++]=in2->get())!=0) error("missing HCOMP END");
-  assert(cend>=7 && cend<header.isize());
-  assert(hbegin==cend+128 && hbegin<header.isize());
-  assert(hend>hbegin && hend<header.isize());
-  assert(hsize==header[0]+256*header[1]);
-  assert(hsize==cend-2+hend-hbegin);
-  allocx(rcode, rcode_size, 0);  // clear JIT code
-  return cend+hend-hbegin;
-}
-
-// Free memory, but preserve output, sha1 pointers
-void ZPAQL::clear() {
-  cend=hbegin=hend=0;  // COMP and HCOMP locations
-  a=b=c=d=f=pc=0;      // machine state
-  header.resize(0);
-  h.resize(0);
-  m.resize(0);
-  r.resize(0);
-  allocx(rcode, rcode_size, 0);
-}
-
-// Constructor
-ZPAQL::ZPAQL() {
-  output=0;
-  sha1=0;
-  rcode=0;
-  rcode_size=0;
-  clear();
-  outbuf.resize(1<<14);
-  bufptr=0;
-}
-
-ZPAQL::~ZPAQL() {
-  allocx(rcode, rcode_size, 0);
-}
-
-// Initialize machine state as HCOMP
-void ZPAQL::inith() {
-  assert(header.isize()>6);
-  assert(output==0);
-  assert(sha1==0);
-  init(header[2], header[3]); // hh, hm
-}
-
-// Initialize machine state as PCOMP
-void ZPAQL::initp() {
-  assert(header.isize()>6);
-  init(header[4], header[5]); // ph, pm
-}
-
-// Flush pending output
-void ZPAQL::flush() {
-  if (output) output->write(&outbuf[0], bufptr);
-  if (sha1) for (int i=0; i<bufptr; ++i) sha1->put(U8(outbuf[i]));
-  bufptr=0;
-}
-
-// Return memory requirement in bytes
-double ZPAQL::memory() {
-  double mem=pow(2.0,header[2]+2)+pow(2.0,header[3])  // hh hm
-            +pow(2.0,header[4]+2)+pow(2.0,header[5])  // ph pm
-            +header.size();
-  int cp=7;  // start of comp list
-  for (int i=0; i<header[6]; ++i) {  // n
-    assert(cp<cend);
-    double size=pow(2.0, header[cp+1]); // sizebits
-    switch(header[cp]) {
-      case CM: mem+=4*size; break;
-      case ICM: mem+=64*size+1024; break;
-      case MATCH: mem+=4*size+pow(2.0, header[cp+2]); break; // bufbits
-      case MIX2: mem+=2*size; break;
-      case MIX: mem+=4*size*header[cp+3]; break; // m
-      case ISSE: mem+=64*size+2048; break;
-      case SSE: mem+=128*size; break;
-    }
-    cp+=compsize[header[cp]];
-  }
-  return mem;
-}
-
-// Initialize machine state to run a program.
-void ZPAQL::init(int hbits, int mbits) {
-  assert(header.isize()>0);
-  assert(cend>=7);
-  assert(hbegin>=cend+128);
-  assert(hend>=hbegin);
-  assert(hend<header.isize()-130);
-  assert(header[0]+256*header[1]==cend-2+hend-hbegin);
-  assert(bufptr==0);
-  assert(outbuf.isize()>0);
-  h.resize(1, hbits);
-  m.resize(1, mbits);
-  r.resize(256);
-  a=b=c=d=pc=f=0;
-}
-
-// Run program on input by interpreting header
-void ZPAQL::run0(U32 input) {
-  assert(cend>6);
-  assert(hbegin>=cend+128);
-  assert(hend>=hbegin);
-  assert(hend<header.isize()-130);
-  assert(m.size()>0);
-  assert(h.size()>0);
-  assert(header[0]+256*header[1]==cend+hend-hbegin-2);
-  pc=hbegin;
-  a=input;
-  while (execute()) ;
-}
-
-// Execute one instruction, return 0 after HALT else 1
-int ZPAQL::execute() {
-  switch(header[pc++]) {
-    case 0: err(); break; // ERROR
-    case 1: ++a; break; // A++
-    case 2: --a; break; // A--
-    case 3: a = ~a; break; // A!
-    case 4: a = 0; break; // A=0
-    case 7: a = r[header[pc++]]; break; // A=R N
-    case 8: swap(b); break; // B<>A
-    case 9: ++b; break; // B++
-    case 10: --b; break; // B--
-    case 11: b = ~b; break; // B!
-    case 12: b = 0; break; // B=0
-    case 15: b = r[header[pc++]]; break; // B=R N
-    case 16: swap(c); break; // C<>A
-    case 17: ++c; break; // C++
-    case 18: --c; break; // C--
-    case 19: c = ~c; break; // C!
-    case 20: c = 0; break; // C=0
-    case 23: c = r[header[pc++]]; break; // C=R N
-    case 24: swap(d); break; // D<>A
-    case 25: ++d; break; // D++
-    case 26: --d; break; // D--
-    case 27: d = ~d; break; // D!
-    case 28: d = 0; break; // D=0
-    case 31: d = r[header[pc++]]; break; // D=R N
-    case 32: swap(m(b)); break; // *B<>A
-    case 33: ++m(b); break; // *B++
-    case 34: --m(b); break; // *B--
-    case 35: m(b) = ~m(b); break; // *B!
-    case 36: m(b) = 0; break; // *B=0
-    case 39: if (f) pc+=((header[pc]+128)&255)-127; else ++pc; break; // JT N
-    case 40: swap(m(c)); break; // *C<>A
-    case 41: ++m(c); break; // *C++
-    case 42: --m(c); break; // *C--
-    case 43: m(c) = ~m(c); break; // *C!
-    case 44: m(c) = 0; break; // *C=0
-    case 47: if (!f) pc+=((header[pc]+128)&255)-127; else ++pc; break; // JF N
-    case 48: swap(h(d)); break; // *D<>A
-    case 49: ++h(d); break; // *D++
-    case 50: --h(d); break; // *D--
-    case 51: h(d) = ~h(d); break; // *D!
-    case 52: h(d) = 0; break; // *D=0
-    case 55: r[header[pc++]] = a; break; // R=A N
-    case 56: return 0  ; // HALT
-    case 57: outc(a&255); break; // OUT
-    case 59: a = (a+m(b)+512)*773; break; // HASH
-    case 60: h(d) = (h(d)+a+512)*773; break; // HASHD
-    case 63: pc+=((header[pc]+128)&255)-127; break; // JMP N
-    case 64: a = a; break; // A=A
-    case 65: a = b; break; // A=B
-    case 66: a = c; break; // A=C
-    case 67: a = d; break; // A=D
-    case 68: a = m(b); break; // A=*B
-    case 69: a = m(c); break; // A=*C
-    case 70: a = h(d); break; // A=*D
-    case 71: a = header[pc++]; break; // A= N
-    case 72: b = a; break; // B=A
-    case 73: b = b; break; // B=B
-    case 74: b = c; break; // B=C
-    case 75: b = d; break; // B=D
-    case 76: b = m(b); break; // B=*B
-    case 77: b = m(c); break; // B=*C
-    case 78: b = h(d); break; // B=*D
-    case 79: b = header[pc++]; break; // B= N
-    case 80: c = a; break; // C=A
-    case 81: c = b; break; // C=B
-    case 82: c = c; break; // C=C
-    case 83: c = d; break; // C=D
-    case 84: c = m(b); break; // C=*B
-    case 85: c = m(c); break; // C=*C
-    case 86: c = h(d); break; // C=*D
-    case 87: c = header[pc++]; break; // C= N
-    case 88: d = a; break; // D=A
-    case 89: d = b; break; // D=B
-    case 90: d = c; break; // D=C
-    case 91: d = d; break; // D=D
-    case 92: d = m(b); break; // D=*B
-    case 93: d = m(c); break; // D=*C
-    case 94: d = h(d); break; // D=*D
-    case 95: d = header[pc++]; break; // D= N
-    case 96: m(b) = a; break; // *B=A
-    case 97: m(b) = b; break; // *B=B
-    case 98: m(b) = c; break; // *B=C
-    case 99: m(b) = d; break; // *B=D
-    case 100: m(b) = m(b); break; // *B=*B
-    case 101: m(b) = m(c); break; // *B=*C
-    case 102: m(b) = h(d); break; // *B=*D
-    case 103: m(b) = header[pc++]; break; // *B= N
-    case 104: m(c) = a; break; // *C=A
-    case 105: m(c) = b; break; // *C=B
-    case 106: m(c) = c; break; // *C=C
-    case 107: m(c) = d; break; // *C=D
-    case 108: m(c) = m(b); break; // *C=*B
-    case 109: m(c) = m(c); break; // *C=*C
-    case 110: m(c) = h(d); break; // *C=*D
-    case 111: m(c) = header[pc++]; break; // *C= N
-    case 112: h(d) = a; break; // *D=A
-    case 113: h(d) = b; break; // *D=B
-    case 114: h(d) = c; break; // *D=C
-    case 115: h(d) = d; break; // *D=D
-    case 116: h(d) = m(b); break; // *D=*B
-    case 117: h(d) = m(c); break; // *D=*C
-    case 118: h(d) = h(d); break; // *D=*D
-    case 119: h(d) = header[pc++]; break; // *D= N
-    case 128: a += a; break; // A+=A
-    case 129: a += b; break; // A+=B
-    case 130: a += c; break; // A+=C
-    case 131: a += d; break; // A+=D
-    case 132: a += m(b); break; // A+=*B
-    case 133: a += m(c); break; // A+=*C
-    case 134: a += h(d); break; // A+=*D
-    case 135: a += header[pc++]; break; // A+= N
-    case 136: a -= a; break; // A-=A
-    case 137: a -= b; break; // A-=B
-    case 138: a -= c; break; // A-=C
-    case 139: a -= d; break; // A-=D
-    case 140: a -= m(b); break; // A-=*B
-    case 141: a -= m(c); break; // A-=*C
-    case 142: a -= h(d); break; // A-=*D
-    case 143: a -= header[pc++]; break; // A-= N
-    case 144: a *= a; break; // A*=A
-    case 145: a *= b; break; // A*=B
-    case 146: a *= c; break; // A*=C
-    case 147: a *= d; break; // A*=D
-    case 148: a *= m(b); break; // A*=*B
-    case 149: a *= m(c); break; // A*=*C
-    case 150: a *= h(d); break; // A*=*D
-    case 151: a *= header[pc++]; break; // A*= N
-    case 152: div(a); break; // A/=A
-    case 153: div(b); break; // A/=B
-    case 154: div(c); break; // A/=C
-    case 155: div(d); break; // A/=D
-    case 156: div(m(b)); break; // A/=*B
-    case 157: div(m(c)); break; // A/=*C
-    case 158: div(h(d)); break; // A/=*D
-    case 159: div(header[pc++]); break; // A/= N
-    case 160: mod(a); break; // A%=A
-    case 161: mod(b); break; // A%=B
-    case 162: mod(c); break; // A%=C
-    case 163: mod(d); break; // A%=D
-    case 164: mod(m(b)); break; // A%=*B
-    case 165: mod(m(c)); break; // A%=*C
-    case 166: mod(h(d)); break; // A%=*D
-    case 167: mod(header[pc++]); break; // A%= N
-    case 168: a &= a; break; // A&=A
-    case 169: a &= b; break; // A&=B
-    case 170: a &= c; break; // A&=C
-    case 171: a &= d; break; // A&=D
-    case 172: a &= m(b); break; // A&=*B
-    case 173: a &= m(c); break; // A&=*C
-    case 174: a &= h(d); break; // A&=*D
-    case 175: a &= header[pc++]; break; // A&= N
-    case 176: a &= ~ a; break; // A&~A
-    case 177: a &= ~ b; break; // A&~B
-    case 178: a &= ~ c; break; // A&~C
-    case 179: a &= ~ d; break; // A&~D
-    case 180: a &= ~ m(b); break; // A&~*B
-    case 181: a &= ~ m(c); break; // A&~*C
-    case 182: a &= ~ h(d); break; // A&~*D
-    case 183: a &= ~ header[pc++]; break; // A&~ N
-    case 184: a |= a; break; // A|=A
-    case 185: a |= b; break; // A|=B
-    case 186: a |= c; break; // A|=C
-    case 187: a |= d; break; // A|=D
-    case 188: a |= m(b); break; // A|=*B
-    case 189: a |= m(c); break; // A|=*C
-    case 190: a |= h(d); break; // A|=*D
-    case 191: a |= header[pc++]; break; // A|= N
-    case 192: a ^= a; break; // A^=A
-    case 193: a ^= b; break; // A^=B
-    case 194: a ^= c; break; // A^=C
-    case 195: a ^= d; break; // A^=D
-    case 196: a ^= m(b); break; // A^=*B
-    case 197: a ^= m(c); break; // A^=*C
-    case 198: a ^= h(d); break; // A^=*D
-    case 199: a ^= header[pc++]; break; // A^= N
-    case 200: a <<= (a&31); break; // A<<=A
-    case 201: a <<= (b&31); break; // A<<=B
-    case 202: a <<= (c&31); break; // A<<=C
-    case 203: a <<= (d&31); break; // A<<=D
-    case 204: a <<= (m(b)&31); break; // A<<=*B
-    case 205: a <<= (m(c)&31); break; // A<<=*C
-    case 206: a <<= (h(d)&31); break; // A<<=*D
-    case 207: a <<= (header[pc++]&31); break; // A<<= N
-    case 208: a >>= (a&31); break; // A>>=A
-    case 209: a >>= (b&31); break; // A>>=B
-    case 210: a >>= (c&31); break; // A>>=C
-    case 211: a >>= (d&31); break; // A>>=D
-    case 212: a >>= (m(b)&31); break; // A>>=*B
-    case 213: a >>= (m(c)&31); break; // A>>=*C
-    case 214: a >>= (h(d)&31); break; // A>>=*D
-    case 215: a >>= (header[pc++]&31); break; // A>>= N
-    case 216: f = (a == a); break; // A==A
-    case 217: f = (a == b); break; // A==B
-    case 218: f = (a == c); break; // A==C
-    case 219: f = (a == d); break; // A==D
-    case 220: f = (a == U32(m(b))); break; // A==*B
-    case 221: f = (a == U32(m(c))); break; // A==*C
-    case 222: f = (a == h(d)); break; // A==*D
-    case 223: f = (a == U32(header[pc++])); break; // A== N
-    case 224: f = (a < a); break; // A<A
-    case 225: f = (a < b); break; // A<B
-    case 226: f = (a < c); break; // A<C
-    case 227: f = (a < d); break; // A<D
-    case 228: f = (a < U32(m(b))); break; // A<*B
-    case 229: f = (a < U32(m(c))); break; // A<*C
-    case 230: f = (a < h(d)); break; // A<*D
-    case 231: f = (a < U32(header[pc++])); break; // A< N
-    case 232: f = (a > a); break; // A>A
-    case 233: f = (a > b); break; // A>B
-    case 234: f = (a > c); break; // A>C
-    case 235: f = (a > d); break; // A>D
-    case 236: f = (a > U32(m(b))); break; // A>*B
-    case 237: f = (a > U32(m(c))); break; // A>*C
-    case 238: f = (a > h(d)); break; // A>*D
-    case 239: f = (a > U32(header[pc++])); break; // A> N
-    case 255: if((pc=hbegin+header[pc]+256*header[pc+1])>=hend)err();break;//LJ
-    default: err();
-  }
-  return 1;
-}
-
-// Print illegal instruction error message and exit
-void ZPAQL::err() {
-  error("ZPAQL execution error");
-}
-
-///////////////////////// Predictor /////////////////////////
-
-// Initailize model-independent tables
-Predictor::Predictor(ZPAQL& zr):
-    c8(1), hmap4(1), z(zr) {
-  assert(sizeof(U8)==1);
-  assert(sizeof(U16)==2);
-  assert(sizeof(U32)==4);
-  assert(sizeof(U64)==8);
-  assert(sizeof(short)==2);
-  assert(sizeof(int)==4);
-
-  // Initialize tables
-  dt2k[0]=0;
-  for (int i=1; i<256; ++i)
-    dt2k[i]=2048/i;
-  for (int i=0; i<1024; ++i)
-    dt[i]=(1<<17)/(i*2+3)*2;
-  for (int i=0; i<32768; ++i)
-    stretcht[i]=int(log((i+0.5)/(32767.5-i))*64+0.5+100000)-100000;
-  for (int i=0; i<4096; ++i)
-    squasht[i]=int(32768.0/(1+exp((i-2048)*(-1.0/64))));
-
-  // Verify floating point math for squash() and stretch()
-  U32 sqsum=0, stsum=0;
-  for (int i=32767; i>=0; --i)
-    stsum=stsum*3+stretch(i);
-  for (int i=4095; i>=0; --i)
-    sqsum=sqsum*3+squash(i-2048);
-  assert(stsum==3887533746u);
-  assert(sqsum==2278286169u);
-
-  pcode=0;
-  pcode_size=0;
-}
-
-Predictor::~Predictor() {
-  allocx(pcode, pcode_size, 0);  // free executable memory
-}
-
-// Initialize the predictor with a new model in z
-void Predictor::init() {
-
-  // Clear old JIT code if any
-  allocx(pcode, pcode_size, 0);
-
-  // Initialize context hash function
-  z.inith();
-
-  // Initialize predictions
-  for (int i=0; i<256; ++i) h[i]=p[i]=0;
-
-  // Initialize components
-  for (int i=0; i<256; ++i)  // clear old model
-    comp[i].init();
-  int n=z.header[6]; // hsize[0..1] hh hm ph pm n (comp)[n] END 0[128] (hcomp) END
-  const U8* cp=&z.header[7];  // start of component list
-  for (int i=0; i<n; ++i) {
-    assert(cp<&z.header[z.cend]);
-    assert(cp>&z.header[0] && cp<&z.header[z.header.isize()-8]);
-    Component& cr=comp[i];
-    switch(cp[0]) {
-      case CONS:  // c
-        p[i]=(cp[1]-128)*4;
-        break;
-      case CM: // sizebits limit
-        if (cp[1]>32) error("max size for CM is 32");
-        cr.cm.resize(1, cp[1]);  // packed CM (22 bits) + CMCOUNT (10 bits)
-        cr.limit=cp[2]*4;
-        for (size_t j=0; j<cr.cm.size(); ++j)
-          cr.cm[j]=0x80000000;
-        break;
-      case ICM: // sizebits
-        if (cp[1]>26) error("max size for ICM is 26");
-        cr.limit=1023;
-        cr.cm.resize(256);
-        cr.ht.resize(64, cp[1]);
-        for (size_t j=0; j<cr.cm.size(); ++j)
-          cr.cm[j]=st.cminit(j);
-        break;
-      case MATCH:  // sizebits
-        if (cp[1]>32 || cp[2]>32) error("max size for MATCH is 32 32");
-        cr.cm.resize(1, cp[1]);  // index
-        cr.ht.resize(1, cp[2]);  // buf
-        cr.ht(0)=1;
-        break;
-      case AVG: // j k wt
-        if (cp[1]>=i) error("AVG j >= i");
-        if (cp[2]>=i) error("AVG k >= i");
-        break;
-      case MIX2:  // sizebits j k rate mask
-        if (cp[1]>32) error("max size for MIX2 is 32");
-        if (cp[3]>=i) error("MIX2 k >= i");
-        if (cp[2]>=i) error("MIX2 j >= i");
-        cr.c=(size_t(1)<<cp[1]); // size (number of contexts)
-        cr.a16.resize(1, cp[1]);  // wt[size][m]
-        for (size_t j=0; j<cr.a16.size(); ++j)
-          cr.a16[j]=32768;
-        break;
-      case MIX: {  // sizebits j m rate mask
-        if (cp[1]>32) error("max size for MIX is 32");
-        if (cp[2]>=i) error("MIX j >= i");
-        if (cp[3]<1 || cp[3]>i-cp[2]) error("MIX m not in 1..i-j");
-        int m=cp[3];  // number of inputs
-        assert(m>=1);
-        cr.c=(size_t(1)<<cp[1]); // size (number of contexts)
-        cr.cm.resize(m, cp[1]);  // wt[size][m]
-        for (size_t j=0; j<cr.cm.size(); ++j)
-          cr.cm[j]=65536/m;
-        break;
-      }
-      case ISSE:  // sizebits j
-        if (cp[1]>32) error("max size for ISSE is 32");
-        if (cp[2]>=i) error("ISSE j >= i");
-        cr.ht.resize(64, cp[1]);
-        cr.cm.resize(512);
-        for (int j=0; j<256; ++j) {
-          cr.cm[j*2]=1<<15;
-          cr.cm[j*2+1]=clamp512k(stretch(st.cminit(j)>>8)<<10);
-        }
-        break;
-      case SSE: // sizebits j start limit
-        if (cp[1]>32) error("max size for SSE is 32");
-        if (cp[2]>=i) error("SSE j >= i");
-        if (cp[3]>cp[4]*4) error("SSE start > limit*4");
-        cr.cm.resize(32, cp[1]);
-        cr.limit=cp[4]*4;
-        for (size_t j=0; j<cr.cm.size(); ++j)
-          cr.cm[j]=squash((j&31)*64-992)<<17|cp[3];
-        break;
-      default: error("unknown component type");
-    }
-    assert(compsize[*cp]>0);
-    cp+=compsize[*cp];
-    assert(cp>=&z.header[7] && cp<&z.header[z.cend]);
-  }
-}
-
-// Return next bit prediction using interpreted COMP code
-int Predictor::predict0() {
-  assert(c8>=1 && c8<=255);
-
-  // Predict next bit
-  int n=z.header[6];
-  assert(n>0 && n<=255);
-  const U8* cp=&z.header[7];
-  assert(cp[-1]==n);
-  for (int i=0; i<n; ++i) {
-    assert(cp>&z.header[0] && cp<&z.header[z.header.isize()-8]);
-    Component& cr=comp[i];
-    switch(cp[0]) {
-      case CONS:  // c
-        break;
-      case CM:  // sizebits limit
-        cr.cxt=h[i]^hmap4;
-        p[i]=stretch(cr.cm(cr.cxt)>>17);
-        break;
-      case ICM: // sizebits
-        assert((hmap4&15)>0);
-        if (c8==1 || (c8&0xf0)==16) cr.c=find(cr.ht, cp[1]+2, h[i]+16*c8);
-        cr.cxt=cr.ht[cr.c+(hmap4&15)];
-        p[i]=stretch(cr.cm(cr.cxt)>>8);
-        break;
-      case MATCH: // sizebits bufbits: a=len, b=offset, c=bit, cxt=bitpos,
-                  //                   ht=buf, limit=pos
-        assert(cr.cm.size()==(size_t(1)<<cp[1]));
-        assert(cr.ht.size()==(size_t(1)<<cp[2]));
-        assert(cr.a<=255);
-        assert(cr.c==0 || cr.c==1);
-        assert(cr.cxt<8);
-        assert(cr.limit<cr.ht.size());
-        if (cr.a==0) p[i]=0;
-        else {
-          cr.c=(cr.ht(cr.limit-cr.b)>>(7-cr.cxt))&1; // predicted bit
-          p[i]=stretch(dt2k[cr.a]*(cr.c*-2+1)&32767);
-        }
-        break;
-      case AVG: // j k wt
-        p[i]=(p[cp[1]]*cp[3]+p[cp[2]]*(256-cp[3]))>>8;
-        break;
-      case MIX2: { // sizebits j k rate mask
-                   // c=size cm=wt[size] cxt=input
-        cr.cxt=((h[i]+(c8&cp[5]))&(cr.c-1));
-        assert(cr.cxt<cr.a16.size());
-        int w=cr.a16[cr.cxt];
-        assert(w>=0 && w<65536);
-        p[i]=(w*p[cp[2]]+(65536-w)*p[cp[3]])>>16;
-        assert(p[i]>=-2048 && p[i]<2048);
-      }
-        break;
-      case MIX: {  // sizebits j m rate mask
-                   // c=size cm=wt[size][m] cxt=index of wt in cm
-        int m=cp[3];
-        assert(m>=1 && m<=i);
-        cr.cxt=h[i]+(c8&cp[5]);
-        cr.cxt=(cr.cxt&(cr.c-1))*m; // pointer to row of weights
-        assert(cr.cxt<=cr.cm.size()-m);
-        int* wt=(int*)&cr.cm[cr.cxt];
-        p[i]=0;
-        for (int j=0; j<m; ++j)
-          p[i]+=(wt[j]>>8)*p[cp[2]+j];
-        p[i]=clamp2k(p[i]>>8);
-      }
-        break;
-      case ISSE: { // sizebits j -- c=hi, cxt=bh
-        assert((hmap4&15)>0);
-        if (c8==1 || (c8&0xf0)==16)
-          cr.c=find(cr.ht, cp[1]+2, h[i]+16*c8);
-        cr.cxt=cr.ht[cr.c+(hmap4&15)];  // bit history
-        int *wt=(int*)&cr.cm[cr.cxt*2];
-        p[i]=clamp2k((wt[0]*p[cp[2]]+wt[1]*64)>>16);
-      }
-        break;
-      case SSE: { // sizebits j start limit
-        cr.cxt=(h[i]+c8)*32;
-        int pq=p[cp[2]]+992;
-        if (pq<0) pq=0;
-        if (pq>1983) pq=1983;
-        int wt=pq&63;
-        pq>>=6;
-        assert(pq>=0 && pq<=30);
-        cr.cxt+=pq;
-        p[i]=stretch(((cr.cm(cr.cxt)>>10)*(64-wt)+(cr.cm(cr.cxt+1)>>10)*wt)>>13);
-        cr.cxt+=wt>>5;
-      }
-        break;
-      default:
-        error("component predict not implemented");
-    }
-    cp+=compsize[cp[0]];
-    assert(cp<&z.header[z.cend]);
-    assert(p[i]>=-2048 && p[i]<2048);
-  }
-  assert(cp[0]==NONE);
-  return squash(p[n-1]);
-}
-
-// Update model with decoded bit y (0...1)
-void Predictor::update0(int y) {
-  assert(y==0 || y==1);
-  assert(c8>=1 && c8<=255);
-  assert(hmap4>=1 && hmap4<=511);
-
-  // Update components
-  const U8* cp=&z.header[7];
-  int n=z.header[6];
-  assert(n>=1 && n<=255);
-  assert(cp[-1]==n);
-  for (int i=0; i<n; ++i) {
-    Component& cr=comp[i];
-    switch(cp[0]) {
-      case CONS:  // c
-        break;
-      case CM:  // sizebits limit
-        train(cr, y);
-        break;
-      case ICM: { // sizebits: cxt=ht[b]=bh, ht[c][0..15]=bh row, cxt=bh
-        cr.ht[cr.c+(hmap4&15)]=st.next(cr.ht[cr.c+(hmap4&15)], y);
-        U32& pn=cr.cm(cr.cxt);
-        pn+=int(y*32767-(pn>>8))>>2;
-      }
-        break;
-      case MATCH: // sizebits bufbits:
-                  //   a=len, b=offset, c=bit, cm=index, cxt=bitpos
-                  //   ht=buf, limit=pos
-      {
-        assert(cr.a<=255);
-        assert(cr.c==0 || cr.c==1);
-        assert(cr.cxt<8);
-        assert(cr.cm.size()==(size_t(1)<<cp[1]));
-        assert(cr.ht.size()==(size_t(1)<<cp[2]));
-        assert(cr.limit<cr.ht.size());
-        if (int(cr.c)!=y) cr.a=0;  // mismatch?
-        cr.ht(cr.limit)+=cr.ht(cr.limit)+y;
-        if (++cr.cxt==8) {
-          cr.cxt=0;
-          ++cr.limit;
-          cr.limit&=(1<<cp[2])-1;
-          if (cr.a==0) {  // look for a match
-            cr.b=cr.limit-cr.cm(h[i]);
-            if (cr.b&(cr.ht.size()-1))
-              while (cr.a<255
-                     && cr.ht(cr.limit-cr.a-1)==cr.ht(cr.limit-cr.a-cr.b-1))
-                ++cr.a;
-          }
-          else cr.a+=cr.a<255;
-          cr.cm(h[i])=cr.limit;
-        }
-      }
-        break;
-      case AVG:  // j k wt
-        break;
-      case MIX2: { // sizebits j k rate mask
-                   // cm=wt[size], cxt=input
-        assert(cr.a16.size()==cr.c);
-        assert(cr.cxt<cr.a16.size());
-        int err=(y*32767-squash(p[i]))*cp[4]>>5;
-        int w=cr.a16[cr.cxt];
-        w+=(err*(p[cp[2]]-p[cp[3]])+(1<<12))>>13;
-        if (w<0) w=0;
-        if (w>65535) w=65535;
-        cr.a16[cr.cxt]=w;
-      }
-        break;
-      case MIX: {   // sizebits j m rate mask
-                    // cm=wt[size][m], cxt=input
-        int m=cp[3];
-        assert(m>0 && m<=i);
-        assert(cr.cm.size()==m*cr.c);
-        assert(cr.cxt+m<=cr.cm.size());
-        int err=(y*32767-squash(p[i]))*cp[4]>>4;
-        int* wt=(int*)&cr.cm[cr.cxt];
-        for (int j=0; j<m; ++j)
-          wt[j]=clamp512k(wt[j]+((err*p[cp[2]+j]+(1<<12))>>13));
-      }
-        break;
-      case ISSE: { // sizebits j  -- c=hi, cxt=bh
-        assert(cr.cxt==cr.ht[cr.c+(hmap4&15)]);
-        int err=y*32767-squash(p[i]);
-        int *wt=(int*)&cr.cm[cr.cxt*2];
-        wt[0]=clamp512k(wt[0]+((err*p[cp[2]]+(1<<12))>>13));
-        wt[1]=clamp512k(wt[1]+((err+16)>>5));
-        cr.ht[cr.c+(hmap4&15)]=st.next(cr.cxt, y);
-      }
-        break;
-      case SSE:  // sizebits j start limit
-        train(cr, y);
-        break;
-      default:
-        assert(0);
-    }
-    cp+=compsize[cp[0]];
-    assert(cp>=&z.header[7] && cp<&z.header[z.cend] 
-           && cp<&z.header[z.header.isize()-8]);
-  }
-  assert(cp[0]==NONE);
-
-  // Save bit y in c8, hmap4
-  c8+=c8+y;
-  if (c8>=256) {
-    z.run(c8-256);
-    hmap4=1;
-    c8=1;
-    for (int i=0; i<n; ++i) h[i]=z.H(i);
-  }
-  else if (c8>=16 && c8<32)
-    hmap4=(hmap4&0xf)<<5|y<<4|1;
-  else
-    hmap4=(hmap4&0x1f0)|(((hmap4&0xf)*2+y)&0xf);
-}
-
-// Find cxt row in hash table ht. ht has rows of 16 indexed by the
-// low sizebits of cxt with element 0 having the next higher 8 bits for
-// collision detection. If not found after 3 adjacent tries, replace the
-// row with lowest element 1 as priority. Return index of row.
-size_t Predictor::find(Array<U8>& ht, int sizebits, U32 cxt) {
-  assert(ht.size()==size_t(16)<<sizebits);
-  int chk=cxt>>sizebits&255;
-  size_t h0=(cxt*16)&(ht.size()-16);
-  if (ht[h0]==chk) return h0;
-  size_t h1=h0^16;
-  if (ht[h1]==chk) return h1;
-  size_t h2=h0^32;
-  if (ht[h2]==chk) return h2;
-  if (ht[h0+1]<=ht[h1+1] && ht[h0+1]<=ht[h2+1])
-    return memset(&ht[h0], 0, 16), ht[h0]=chk, h0;
-  else if (ht[h1+1]<ht[h2+1])
-    return memset(&ht[h1], 0, 16), ht[h1]=chk, h1;
-  else
-    return memset(&ht[h2], 0, 16), ht[h2]=chk, h2;
-}
-
-/////////////////////// Decoder ///////////////////////
-
-Decoder::Decoder(ZPAQL& z):
-    in(0), low(1), high(0xFFFFFFFF), curr(0), pr(z), buf(BUFSIZE) {
-}
-
-void Decoder::init() {
-  pr.init();
-  if (pr.isModeled()) low=1, high=0xFFFFFFFF, curr=0;
-  else low=high=curr=0;
-}
-
-// Read un-modeled input into buf[low=0..high-1]
-// with curr remaining in subblock to read.
-void Decoder::loadbuf() {
-  assert(!pr.isModeled());
-  assert(low==high);
-  if (curr==0) {
-    for (int i=0; i<4; ++i) {
-      int c=in->get();
-      if (c<0) error("unexpected end of input");
-      curr=curr<<8|c;
-    }
-  }
-  U32 n=buf.size();
-  if (n>curr) n=curr;
-  high=in->read(&buf[0], n);
-  curr-=high;
-  low=0;
-}
-
-// Return next bit of decoded input, which has 16 bit probability p of being 1
-int Decoder::decode(int p) {
-  assert(p>=0 && p<65536);
-  assert(high>low && low>0);
-  if (curr<low || curr>high) error("archive corrupted");
-  assert(curr>=low && curr<=high);
-  U32 mid=low+U32(((high-low)*U64(U32(p)))>>16);  // split range
-  assert(high>mid && mid>=low);
-  int y=curr<=mid;
-  if (y) high=mid; else low=mid+1; // pick half
-  while ((high^low)<0x1000000) { // shift out identical leading bytes
-    high=high<<8|255;
-    low=low<<8;
-    low+=(low==0);
-    int c=in->get();
-    if (c<0) error("unexpected end of file");
-    curr=curr<<8|c;
-  }
-  return y;
-}
-
-// Decompress 1 byte or -1 at end of input
-int Decoder::decompress() {
-  if (pr.isModeled()) {  // n>0 components?
-    if (curr==0) {  // segment initialization
-      for (int i=0; i<4; ++i)
-        curr=curr<<8|in->get();
-    }
-    if (decode(0)) {
-      if (curr!=0) error("decoding end of stream");
-      return -1;
-    }
-    else {
-      int c=1;
-      while (c<256) {  // get 8 bits
-        int p=pr.predict()*2+1;
-        c+=c+decode(p);
-        pr.update(c&1);
-      }
-      return c-256;
-    }
-  }
-  else {
-    if (low==high) loadbuf();
-    if (low==high) return -1;
-    return buf[low++]&255;
-  }
-}
-
-// Find end of compressed data and return next byte
-int Decoder::skip() {
-  int c=-1;
-  if (pr.isModeled()) {
-    while (curr==0)  // at start?
-      curr=in->get();
-    while (curr && (c=in->get())>=0)  // find 4 zeros
-      curr=curr<<8|c;
-    while ((c=in->get())==0) ;  // might be more than 4
-    return c;
-  }
-  else {
-    if (curr==0)  // at start?
-      for (int i=0; i<4 && (c=in->get())>=0; ++i) curr=curr<<8|c;
-    while (curr>0) {
-      U32 n=BUFSIZE;
-      if (n>curr) n=curr;
-      U32 n1=in->read(&buf[0], n);
-      curr-=n1;
-      if (n1!=n) return -1;
-      if (curr==0)
-        for (int i=0; i<4 && (c=in->get())>=0; ++i) curr=curr<<8|c;
-    }
-    if (c>=0) c=in->get();
-    return c;
-  }
-}
-
-////////////////////// PostProcessor //////////////////////
-
-// Copy ph, pm from block header
-void PostProcessor::init(int h, int m) {
-  state=hsize=0;
-  ph=h;
-  pm=m;
-  z.clear();
-}
-
-// (PASS=0 | PROG=1 psize[0..1] pcomp[0..psize-1]) data... EOB=-1
-// Return state: 1=PASS, 2..4=loading PROG, 5=PROG loaded
-int PostProcessor::write(int c) {
-  assert(c>=-1 && c<=255);
-  switch (state) {
-    case 0:  // initial state
-      if (c<0) error("Unexpected EOS");
-      state=c+1;  // 1=PASS, 2=PROG
-      if (state>2) error("unknown post processing type");
-      if (state==1) z.clear();
-      break;
-    case 1:  // PASS
-      z.outc(c);
-      break;
-    case 2: // PROG
-      if (c<0) error("Unexpected EOS");
-      hsize=c;  // low byte of size
-      state=3;
-      break;
-    case 3:  // PROG psize[0]
-      if (c<0) error("Unexpected EOS");
-      hsize+=c*256;  // high byte of psize
-      z.header.resize(hsize+300);
-      z.cend=8;
-      z.hbegin=z.hend=z.cend+128;
-      z.header[4]=ph;
-      z.header[5]=pm;
-      state=4;
-      break;
-    case 4:  // PROG psize[0..1] pcomp[0...]
-      if (c<0) error("Unexpected EOS");
-      assert(z.hend<z.header.isize());
-      z.header[z.hend++]=c;  // one byte of pcomp
-      if (z.hend-z.hbegin==hsize) {  // last byte of pcomp?
-        hsize=z.cend-2+z.hend-z.hbegin;
-        z.header[0]=hsize&255;  // header size with empty COMP
-        z.header[1]=hsize>>8;
-        z.initp();
-        state=5;
-      }
-      break;
-    case 5:  // PROG ... data
-      z.run(c);
-      if (c<0) z.flush();
-      break;
-  }
-  return state;
-}
-
-/////////////////////// Decompresser /////////////////////
-
-// Find the start of a block and return true if found. Set memptr
-// to memory used.
-bool Decompresser::findBlock(double* memptr) {
-  assert(state==BLOCK);
-
-  // Find start of block
-  U32 h1=0x3D49B113, h2=0x29EB7F93, h3=0x2614BE13, h4=0x3828EB13;
-  // Rolling hashes initialized to hash of first 13 bytes
-  int c;
-  while ((c=dec.in->get())!=-1) {
-    h1=h1*12+c;
-    h2=h2*20+c;
-    h3=h3*28+c;
-    h4=h4*44+c;
-    if (h1==0xB16B88F1 && h2==0xFF5376F1 && h3==0x72AC5BF1 && h4==0x2F909AF1)
-      break;  // hash of 16 byte string
-  }
-  if (c==-1) return false;
-
-  // Read header
-  if ((c=dec.in->get())!=1 && c!=2) error("unsupported ZPAQ level");
-  if (dec.in->get()!=1) error("unsupported ZPAQL type");
-  z.read(dec.in);
-  if (c==1 && z.header.isize()>6 && z.header[6]==0)
-    error("ZPAQ level 1 requires at least 1 component");
-  if (memptr) *memptr=z.memory();
-  state=FILENAME;
-  decode_state=FIRSTSEG;
-  return true;
-}
-
-// Read the start of a segment (1) or end of block code (255).
-// If a segment is found, write the filename and return true, else false.
-bool Decompresser::findFilename(Writer* filename) {
-  assert(state==FILENAME);
-  int c=dec.in->get();
-  if (c==1) {  // segment found
-    while (true) {
-      c=dec.in->get();
-      if (c==-1) error("unexpected EOF");
-      if (c==0) {
-        state=COMMENT;
-        return true;
-      }
-      if (filename) filename->put(c);
-    }
-  }
-  else if (c==255) {  // end of block found
-    state=BLOCK;
-    return false;
-  }
-  else
-    error("missing segment or end of block");
-  return false;
-}
-
-// Read the comment from the segment header
-void Decompresser::readComment(Writer* comment) {
-  assert(state==COMMENT);
-  state=DATA;
-  while (true) {
-    int c=dec.in->get();
-    if (c==-1) error("unexpected EOF");
-    if (c==0) break;
-    if (comment) comment->put(c);
-  }
-  if (dec.in->get()!=0) error("missing reserved byte");
-}
-
-// Decompress n bytes, or all if n < 0. Return false if done
-bool Decompresser::decompress(int n) {
-  assert(state==DATA);
-  assert(decode_state!=SKIP);
-
-  // Initialize models to start decompressing block
-  if (decode_state==FIRSTSEG) {
-    dec.init();
-    assert(z.header.size()>5);
-    pp.init(z.header[4], z.header[5]);
-    decode_state=SEG;
-  }
-
-  // Decompress and load PCOMP into postprocessor
-  while ((pp.getState()&3)!=1)
-    pp.write(dec.decompress());
-
-  // Decompress n bytes, or all if n < 0
-  while (n) {
-    int c=dec.decompress();
-    pp.write(c);
-    if (c==-1) {
-      state=SEGEND;
-      return false;
-    }
-    if (n>0) --n;
-  }
-  return true;
-}
-
-// Read end of block. If a SHA1 checksum is present, write 1 and the
-// 20 byte checksum into sha1string, else write 0 in first byte.
-// If sha1string is 0 then discard it.
-void Decompresser::readSegmentEnd(char* sha1string) {
-  assert(state==DATA || state==SEGEND);
-
-  // Skip remaining data if any and get next byte
-  int c=0;
-  if (state==DATA) {
-    c=dec.skip();
-    decode_state=SKIP;
-  }
-  else if (state==SEGEND)
-    c=dec.in->get();
-  state=FILENAME;
-
-  // Read checksum
-  if (c==254) {
-    if (sha1string) sha1string[0]=0;  // no checksum
-  }
-  else if (c==253) {
-    if (sha1string) sha1string[0]=1;
-    for (int i=1; i<=20; ++i) {
-      c=dec.in->get();
-      if (sha1string) sha1string[i]=c;
-    }
-  }
-  else
-    error("missing end of segment marker");
-}
-
-/////////////////////////// decompress() /////////////////////
-
-void decompress(Reader* in, Writer* out) {
-  Decompresser d;
-  d.setInput(in);
-  d.setOutput(out);
-  while (d.findBlock()) {       // don't calculate memory
-    while (d.findFilename()) {  // discard filename
-      d.readComment();          // discard comment
-      d.decompress();           // to end of segment
-      d.readSegmentEnd();       // discard sha1string
-    }
-  }
-}
-
-////////////////////// Encoder ////////////////////
-
-// Initialize for start of block
-void Encoder::init() {
-  low=1;
-  high=0xFFFFFFFF;
-  pr.init();
-  if (!pr.isModeled()) low=0, buf.resize(1<<16);
-}
-
-// compress bit y having probability p/64K
-void Encoder::encode(int y, int p) {
-  assert(out);
-  assert(p>=0 && p<65536);
-  assert(y==0 || y==1);
-  assert(high>low && low>0);
-  U32 mid=low+U32(((high-low)*U64(U32(p)))>>16);  // split range
-  assert(high>mid && mid>=low);
-  if (y) high=mid; else low=mid+1; // pick half
-  while ((high^low)<0x1000000) { // write identical leading bytes
-    out->put(high>>24);  // same as low>>24
-    high=high<<8|255;
-    low=low<<8;
-    low+=(low==0); // so we don't code 4 0 bytes in a row
-  }
-}
-
-// compress byte c (0..255 or -1=EOS)
-void Encoder::compress(int c) {
-  assert(out);
-  if (pr.isModeled()) {
-    if (c==-1)
-      encode(1, 0);
-    else {
-      assert(c>=0 && c<=255);
-      encode(0, 0);
-      for (int i=7; i>=0; --i) {
-        int p=pr.predict()*2+1;
-        assert(p>0 && p<65536);
-        int y=c>>i&1;
-        encode(y, p);
-        pr.update(y);
-      }
-    }
-  }
-  else {
-    if (c<0 || low==buf.size()) {
-      out->put((low>>24)&255);
-      out->put((low>>16)&255);
-      out->put((low>>8)&255);
-      out->put(low&255);
-      out->write(&buf[0], low);
-      low=0;
-    }
-    if (c>=0) buf[low++]=c;
-  }
-}
-
-///////////////////// Compressor //////////////////////
-
-// Write 13 byte start tag
-// "\x37\x6B\x53\x74\xA0\x31\x83\xD3\x8C\xB2\x28\xB0\xD3"
-void Compressor::writeTag() {
-  assert(state==INIT);
-  enc.out->put(0x37);
-  enc.out->put(0x6b);
-  enc.out->put(0x53);
-  enc.out->put(0x74);
-  enc.out->put(0xa0);
-  enc.out->put(0x31);
-  enc.out->put(0x83);
-  enc.out->put(0xd3);
-  enc.out->put(0x8c);
-  enc.out->put(0xb2);
-  enc.out->put(0x28);
-  enc.out->put(0xb0);
-  enc.out->put(0xd3);
-}
-
-void Compressor::startBlock(int level) {
-
-  // Model 1 - min.cfg
-  static const char models[]={
-  26,0,1,2,0,0,2,3,16,8,19,0,0,96,4,28,
-  59,10,59,112,25,10,59,10,59,112,56,0,
-
-  // Model 2 - mid.cfg
-  69,0,3,3,0,0,8,3,5,8,13,0,8,17,1,8,
-  18,2,8,18,3,8,19,4,4,22,24,7,16,0,7,24,
-  -1,0,17,104,74,4,95,1,59,112,10,25,59,112,10,25,
-  59,112,10,25,59,112,10,25,59,112,10,25,59,10,59,112,
-  25,69,-49,8,112,56,0,
-
-  // Model 3 - max.cfg
-  -60,0,5,9,0,0,22,1,-96,3,5,8,13,1,8,16,
-  2,8,18,3,8,19,4,8,19,5,8,20,6,4,22,24,
-  3,17,8,19,9,3,13,3,13,3,13,3,14,7,16,0,
-  15,24,-1,7,8,0,16,10,-1,6,0,15,16,24,0,9,
-  8,17,32,-1,6,8,17,18,16,-1,9,16,19,32,-1,6,
-  0,19,20,16,0,0,17,104,74,4,95,2,59,112,10,25,
-  59,112,10,25,59,112,10,25,59,112,10,25,59,112,10,25,
-  59,10,59,112,10,25,59,112,10,25,69,-73,32,-17,64,47,
-  14,-25,91,47,10,25,60,26,48,-122,-105,20,112,63,9,70,
-  -33,0,39,3,25,112,26,52,25,25,74,10,4,59,112,25,
-  10,4,59,112,25,10,4,59,112,25,65,-113,-44,72,4,59,
-  112,8,-113,-40,8,68,-81,60,60,25,69,-49,9,112,25,25,
-  25,25,25,112,56,0,
-
-  0,0}; // 0,0 = end of list
-
-  if (level<1) error("compression level must be at least 1");
-  const char* p=models;
-  int i;
-  for (i=1; i<level && toU16(p); ++i)
-    p+=toU16(p)+2;
-  if (toU16(p)<1) error("compression level too high");
-  startBlock(p);
-}
-
-// Memory reader
-class MemoryReader: public Reader {
-  const char* p;
-public:
-  MemoryReader(const char* p_): p(p_) {}
-  int get() {return *p++&255;}
-};
-
-// Write a block header
-void Compressor::startBlock(const char* hcomp) {
-  assert(state==INIT);
-  assert(hcomp);
-  int len=toU16(hcomp)+2;
-  enc.out->put('z');
-  enc.out->put('P');
-  enc.out->put('Q');
-  enc.out->put(1+(len>6 && hcomp[6]==0));  // level 1 or 2
-  enc.out->put(1);
-  for (int i=0; i<len; ++i)  // write compression model hcomp
-    enc.out->put(hcomp[i]);
-  MemoryReader m(hcomp);
-  z.read(&m);
-  state=BLOCK1;
-}
-
-// Write a segment header
-void Compressor::startSegment(const char* filename, const char* comment) {
-  assert(state==BLOCK1 || state==BLOCK2);
-  enc.out->put(1);
-  while (filename && *filename)
-    enc.out->put(*filename++);
-  enc.out->put(0);
-  while (comment && *comment)
-    enc.out->put(*comment++);
-  enc.out->put(0);
-  enc.out->put(0);
-  if (state==BLOCK1) state=SEG1;
-  if (state==BLOCK2) state=SEG2;
-}
-
-// Initialize encoding and write pcomp to first segment
-// If len is 0 then length is encoded in pcomp[0..1]
-void Compressor::postProcess(const char* pcomp, int len) {
-  assert(state==SEG1);
-  enc.init();
-  if (pcomp) {
-    enc.compress(1);
-    if (len<=0) {
-      len=toU16(pcomp);
-      pcomp+=2;
-    }
-    enc.compress(len&255);
-    enc.compress((len>>8)&255);
-    for (int i=0; i<len; ++i)
-      enc.compress(pcomp[i]&255);
-  }
-  else
-    enc.compress(0);
-  state=SEG2;
-}
-
-// Compress n bytes, or to EOF if n <= 0
-bool Compressor::compress(int n) {
-  assert(state==SEG2);
-  int ch=0;
-  while (n && (ch=in->get())>=0) {
-    enc.compress(ch);
-    if (n>0) --n;
-  }
-  return ch>=0;
-}
-
-// End segment, write sha1string if present
-void Compressor::endSegment(const char* sha1string) {
-  assert(state==SEG2);
-  enc.compress(-1);
-  enc.out->put(0);
-  enc.out->put(0);
-  enc.out->put(0);
-  enc.out->put(0);
-  if (sha1string) {
-    enc.out->put(253);
-    for (int i=0; i<20; ++i)
-      enc.out->put(sha1string[i]);
-  }
-  else
-    enc.out->put(254);
-  state=BLOCK2;
-}
-
-// End block
-void Compressor::endBlock() {
-  assert(state==BLOCK2);
-  enc.out->put(255);
-  state=INIT;
-}
-
-/////////////////////////// compress() ///////////////////////
-
-void compress(Reader* in, Writer* out, int level) {
-  assert(level>=1);
-  Compressor c;
-  c.setInput(in);
-  c.setOutput(out);
-  c.startBlock(level);
-  c.startSegment();
-  c.postProcess();
-  c.compress();
-  c.endSegment();
-  c.endBlock();
-}
-
-//////////////////////// ZPAQL::assemble() ////////////////////
-
-#ifndef NOJIT
-/*
-assemble();
-
-Assembles the ZPAQL code in hcomp[0..hlen-1] and stores x86-32 or x86-64
-code in rcode[0..rcode_size-1]. Execution begins at rcode[0]. It will not
-write beyond the end of rcode, but in any case it returns the number of
-bytes that would have been written. It returns 0 in case of error.
-
-The assembled code implements run() and returns 1 if successful or
-0 if the ZPAQL code executes an invalid instruction or jumps out of
-bounds.
-
-A ZPAQL virtual machine has the following state. All values are
-unsigned and initially 0:
-
-  a, b, c, d: 32 bit registers (pointed to by their respective parameters)
-  f: 1 bit flag register (pointed to)
-  r[0..255]: 32 bit registers
-  m[0..msize-1]: 8 bit registers, where msize is a power of 2
-  h[0..hsize-1]: 32 bit registers, where hsize is a power of 2
-  out: pointer to a Writer
-  sha1: pointer to a SHA1
-
-Generally a ZPAQL machine is used to compute contexts which are
-placed in h. A second machine might post-process, and write its
-output to out and sha1. In either case, a machine is called with
-its input in a, representing a single byte (0..255) or
-(for a postprocessor) EOF (0xffffffff). Execution returs after a
-ZPAQL halt instruction.
-
-ZPAQL instructions are 1 byte unless the last 3 bits are 1.
-In this case, a second operand byte follows. Opcode 255 is
-the only 3 byte instruction. They are organized:
-
-  00dddxxx = unary opcode xxx on destination ddd (ddd < 111)
-  00111xxx = special instruction xxx
-  01dddsss = assignment: ddd = sss (ddd < 111)
-  1xxxxsss = operation sxxx from sss to a
-
-The meaning of sss and ddd are as follows:
-
-  000 = a   (accumulator)
-  001 = b
-  010 = c
-  011 = d
-  100 = *b  (means m[b mod msize])
-  101 = *c  (means m[c mod msize])
-  110 = *d  (means h[d mod hsize])
-  111 = n   (constant 0..255 in second byte of instruction)
-
-For example, 01001110 assigns *d to b. The other instructions xxx
-are as follows:
-
-Group 00dddxxx where ddd < 111 and xxx is:
-  000 = ddd<>a, swap with a (except 00000000 is an error, and swap
-        with *b or *c leaves the high bits of a unchanged)
-  001 = ddd++, increment
-  010 = ddd--, decrement
-  011 = ddd!, not (invert all bits)
-  100 = ddd=0, clear (set all bits of ddd to 0)
-  101 = not used (error)
-  110 = not used
-  111 = ddd=r n, assign from r[n] to ddd, n=0..255 in next opcode byte
-Except:
-  00100111 = jt n, jump if f is true (n = -128..127, relative to next opcode)
-  00101111 = jf n, jump if f is false (n = -128..127)
-  00110111 = r=a n, assign r[n] = a (n = 0..255)
-
-Group 00111xxx where xxx is:
-  000 = halt (return)
-  001 = output a
-  010 = not used
-  011 = hash: a = (a + *b + 512) * 773
-  100 = hashd: *d = (*d + a + 512) * 773
-  101 = not used
-  110 = not used
-  111 = unconditional jump (n = -128 to 127, relative to next opcode)
-  
-Group 1xxxxsss where xxxx is:
-  0000 = a += sss (add, subtract, multiply, divide sss to a)
-  0001 = a -= sss
-  0010 = a *= sss
-  0011 = a /= sss (unsigned, except set a = 0 if sss is 0)
-  0100 = a %= sss (remainder, except set a = 0 if sss is 0)
-  0101 = a &= sss (bitwise AND)
-  0110 = a &= ~sss (bitwise AND with complement of sss)
-  0111 = a |= sss (bitwise OR)
-  1000 = a ^= sss (bitwise XOR)
-  1001 = a <<= (sss % 32) (left shift by low 5 bits of sss)
-  1010 = a >>= (sss % 32) (unsigned, zero bits shifted in)
-  1011 = a == sss (compare, set f = true if equal or false otherwise)
-  1100 = a < sss (unsigned compare, result in f)
-  1101 = a > sss (unsigned compare)
-  1110 = not used
-  1111 = not used except 11111111 is a 3 byte jump to the absolute address
-         in the next 2 bytes in little-endian (LSB first) order.
-
-assemble() translates ZPAQL to 32 bit x86 code to be executed by run().
-Registers are mapped as follows:
-
-  eax = source sss from *b, *c, *d or sometimes n
-  ecx = pointer to destination *b, *c, *d, or spare
-  edx = a
-  ebx = f (1 for true, 0 for false)
-  esp = stack pointer
-  ebp = d
-  esi = b
-  edi = c
-
-run() saves non-volatile registers (ebp, esi, edi, ebx) on the stack,
-loads a, b, c, d, f, and executes the translated instructions.
-A halt instruction saves a, b, c, d, f, pops the saved registers
-and returns. Invalid instructions or jumps outside of the range
-of the ZPAQL code call libzpaq::error().
-
-In 64 bit mode, the following additional registers are used:
-
-  r12 = h
-  r14 = r
-  r15 = m
-
-*/
-
-// Called by out
-static void flush1(ZPAQL* z) {
-  z->flush();
-}
-
-// return true if op is an undefined ZPAQL instruction
-static bool iserr(int op) {
-  return op==0 || (op>=120 && op<=127) || (op>=240 && op<=254)
-    || op==58 || (op<64 && (op%8==5 || op%8==6));
-}
-
-// Write k bytes of x to rcode[o++] MSB first
-static void put(U8* rcode, int n, int& o, U32 x, int k) {
-  while (k-->0) {
-    if (o<n) rcode[o]=(x>>(k*8))&255;
-    ++o;
-  }
-}
-
-// Write 4 bytes of x to rcode[o++] LSB first
-static void put4lsb(U8* rcode, int n, int& o, U32 x) {
-  for (int k=0; k<4; ++k) {
-    if (o<n) rcode[o]=(x>>(k*8))&255;
-    ++o;
-  }
-}
-
-// Write a 1-4 byte x86 opcode without or with an 4 byte operand
-// to rcode[o...]
-#define put1(x) put(rcode, rcode_size, o, (x), 1)
-#define put2(x) put(rcode, rcode_size, o, (x), 2)
-#define put3(x) put(rcode, rcode_size, o, (x), 3)
-#define put4(x) put(rcode, rcode_size, o, (x), 4)
-#define put5(x,y) put4(x), put1(y)
-#define put6(x,y) put4(x), put2(y)
-#define put4r(x) put4lsb(rcode, rcode_size, o, x)
-#define puta(x) t=U32(size_t(x)), put4r(t)
-#define put1a(x,y) put1(x), puta(y)
-#define put2a(x,y) put2(x), puta(y)
-#define put3a(x,y) put3(x), puta(y)
-#define put4a(x,y) put4(x), puta(y)
-#define put5a(x,y,z) put4(x), put1(y), puta(z)
-#define put2l(x,y) put2(x), t=U32(size_t(y)), put4r(t), \
-  t=U32(size_t(y)>>(S*4)), put4r(t)
-
-// Assemble ZPAQL in in the HCOMP section of header to rcode,
-// but do not write beyond rcode_size. Return the number of
-// bytes output or that would have been output.
-// Execution starts at rcode[0] and returns 1 if successful or 0
-// in case of a ZPAQL execution error.
-int ZPAQL::assemble() {
-
-  // x86? (not foolproof)
-  const int S=sizeof(char*);      // 4 = x86, 8 = x86-64
-  U32 t=0x12345678;
-  if (*(char*)&t!=0x78 || (S!=4 && S!=8))
-    error("JIT supported only for x86-32 and x86-64");
-
-  const U8* hcomp=&header[hbegin];
-  const int hlen=hend-hbegin+1;
-  const int msize=m.size();
-  const int hsize=h.size();
-  const int regcode[8]={2,6,7,5}; // a,b,c,d.. -> edx,esi,edi,ebp,eax..
-  Array<int> it(hlen);            // hcomp -> rcode locations
-  int done=0;  // number of instructions assembled (0..hlen)
-  int o=5;  // rcode output index, reserve space for jmp
-
-  // Code for the halt instruction (restore registers and return)
-  const int halt=o;
-  if (S==8) {
-    put2l(0x48b9, &a);        // mov rcx, a
-    put2(0x8911);             // mov [rcx], edx
-    put2l(0x48b9, &b);        // mov rcx, b
-    put2(0x8931);             // mov [rcx], esi
-    put2l(0x48b9, &c);        // mov rcx, c
-    put2(0x8939);             // mov [rcx], edi
-    put2l(0x48b9, &d);        // mov rcx, d
-    put2(0x8929);             // mov [rcx], ebp
-    put2l(0x48b9, &f);        // mov rcx, f
-    put2(0x8919);             // mov [rcx], ebx
-    put4(0x4883c438);         // add rsp, 56
-    put2(0x415f);             // pop r15
-    put2(0x415e);             // pop r14
-    put2(0x415d);             // pop r13
-    put2(0x415c);             // pop r12
-  }
-  else {
-    put2a(0x8915, &a);        // mov [a], edx
-    put2a(0x8935, &b);        // mov [b], esi
-    put2a(0x893d, &c);        // mov [c], edi
-    put2a(0x892d, &d);        // mov [d], ebp
-    put2a(0x891d, &f);        // mov [f], ebx
-    put3(0x83c43c);           // add esp, 60
-  }
-  put1(0x5d);                 // pop ebp
-  put1(0x5b);                 // pop ebx
-  put1(0x5f);                 // pop edi
-  put1(0x5e);                 // pop esi
-  put1(0xc3);                 // ret
-
-  // Code for the out instruction.
-  // Store a=edx at outbuf[bufptr++]. If full, call flush1().
-  const int outlabel=o;
-  if (S==8) {
-    put2l(0x48b8, &outbuf[0]);// mov rax, outbuf.p
-    put2l(0x49ba, &bufptr);   // mov r10, &bufptr
-    put3(0x418b0a);           // mov ecx, [r10]
-    put3(0x891408);           // mov [rax+rcx], edx
-    put2(0xffc1);             // inc ecx
-    put3(0x41890a);           // mov [r10], ecx
-    put2a(0x81f9, outbuf.size());  // cmp ecx, outbuf.size()
-    put2(0x7401);             // jz L1
-    put1(0xc3);               // ret
-    put4(0x4883ec30);         // L1: sub esp, 48  ; call flush1(this)
-    put4(0x48893c24);         // mov [rsp], rdi
-    put5(0x48897424,8);       // mov [rsp+8], rsi
-    put5(0x48895424,16);      // mov [rsp+16], rdx
-    put5(0x48894c24,24);      // mov [rsp+24], rcx
-#ifdef unix
-    put2l(0x48bf, this);      // mov rdi, this
-#else  // Windows
-    put2l(0x48b9, this);      // mov rcx, this
-#endif
-    put2l(0x49bb, &flush1);   // mov r11, &flush1
-    put3(0x41ffd3);           // call r11
-    put5(0x488b4c24,24);      // mov rcx, [rsp+24]
-    put5(0x488b5424,16);      // mov rdx, [rsp+16]
-    put5(0x488b7424,8);       // mov rsi, [rsp+8]
-    put4(0x488b3c24);         // mov rdi, [rsp]
-    put4(0x4883c430);         // add esp, 48
-    put1(0xc3);               // ret
-  }
-  else {
-    put1a(0xb8, &outbuf[0]);  // mov eax, outbuf.p
-    put2a(0x8b0d, &bufptr);   // mov ecx, [bufptr]
-    put3(0x891408);           // mov [eax+ecx], edx
-    put2(0xffc1);             // inc ecx
-    put2a(0x890d, &bufptr);   // mov [bufptr], ecx
-    put2a(0x81f9, outbuf.size());  // cmp ecx, outbuf.size()
-    put2(0x7401);             // jz L1
-    put1(0xc3);               // ret
-    put3(0x83ec08);           // L1: sub esp, 8
-    put4(0x89542404);         // mov [esp+4], edx
-    put3a(0xc70424, this);    // mov [esp], this
-    put1a(0xb8, &flush1);     // mov eax, &flush1
-    put2(0xffd0);             // call eax
-    put4(0x8b542404);         // mov edx, [esp+4]
-    put3(0x83c408);           // add esp, 8
-    put1(0xc3);               // ret
-  }
-
-  // Set it[i]=1 for each ZPAQL instruction reachable from the previous
-  // instruction + 2 if reachable by a jump (or 3 if both).
-  it[0]=2;
-  assert(hlen>0 && hcomp[hlen-1]==0);  // ends with error
-  do {
-    done=0;
-    const int NONE=0x80000000;
-    for (int i=0; i<hlen; ++i) {
-      int op=hcomp[i];
-      if (it[i]) {
-        int next1=i+1+(op%8==7), next2=NONE; // next and jump targets
-        if (iserr(op)) next1=NONE;  // error
-        if (op==56) next1=NONE, next2=0;  // halt
-        if (op==255) next1=NONE, next2=hcomp[i+1]+256*hcomp[i+2]; // lj
-        if (op==39||op==47||op==63)next2=i+2+(hcomp[i+1]<<24>>24);// jt,jf,jmp
-        if (op==63) next1=NONE;  // jmp
-        if ((next2<0 || next2>=hlen) && next2!=NONE) next2=hlen-1; // error
-        if (next1!=NONE && !(it[next1]&1)) it[next1]|=1, ++done;
-        if (next2!=NONE && !(it[next2]&2)) it[next2]|=2, ++done;
-      }
-    }
-  } while (done>0);
-
-  // Set it[i] bits 2-3 to 4, 8, or 12 if a comparison
-  //  (<, >, == respectively) does not need to save the result in f,
-  // or if a conditional jump (jt, jf) does not need to read f.
-  // This is true if a comparison is followed directly by a jt/jf,
-  // the jt/jf is not a jump target, the byte before is not a jump
-  // target (for a 2 byte comparison), and for the comparison instruction
-  // if both paths after the jt/jf lead to another comparison or error
-  // before another jt/jf. At most hlen steps are traced because after
-  // that it must be an infinite loop.
-  for (int i=0; i<hlen; ++i) {
-    const int op1=hcomp[i]; // 216..239 = comparison
-    const int i2=i+1+(op1%8==7);  // address of next instruction
-    const int op2=hcomp[i2];  // 39,47 = jt,jf
-    if (it[i] && op1>=216 && op1<240 && (op2==39 || op2==47)
-        && it[i2]==1 && (i2==i+1 || it[i+1]==0)) {
-      int code=(op1-208)/8*4; // 4,8,12 is ==,<,>
-      it[i2]+=code;  // OK to test CF, ZF instead of f
-      for (int j=0; j<2 && code; ++j) {  // trace each path from i2
-        int k=i2+2; // branch not taken
-        if (j==1) k=i2+2+(hcomp[i2+1]<<24>>24);  // branch taken
-        for (int l=0; l<hlen && code; ++l) {  // trace at most hlen steps
-          if (k<0 || k>=hlen) break;  // out of bounds, pass
-          const int op=hcomp[k];
-          if (op==39 || op==47) code=0;  // jt,jf, fail
-          else if (op>=216 && op<240) break;  // ==,<,>, pass
-          else if (iserr(op)) break;  // error, pass
-          else if (op==255) k=hcomp[k+1]+256*hcomp[k+2]; // lj
-          else if (op==63) k=k+2+(hcomp[k+1]<<24>>24);  // jmp
-          else if (op==56) k=0;  // halt
-          else k=k+1+(op%8==7);  // ordinary instruction
-        }
-      }
-      it[i]+=code;  // if > 0 then OK to not save flags in f (bl)
-    }
-  }
-
-  // Start of run(): Save x86 and load ZPAQL registers
-  const int start=o;
-  assert(start>=16);
-  put1(0x56);          // push esi/rsi
-  put1(0x57);          // push edi/rdi
-  put1(0x53);          // push ebx/rbx
-  put1(0x55);          // push ebp/rbp
-  if (S==8) {
-    put2(0x4154);      // push r12
-    put2(0x4155);      // push r13
-    put2(0x4156);      // push r14
-    put2(0x4157);      // push r15
-    put4(0x4883ec38);  // sub rsp, 56
-    put2l(0x48b8, &a); // mov rax, a
-    put2(0x8b10);      // mov edx, [rax]
-    put2l(0x48b8, &b); // mov rax, b
-    put2(0x8b30);      // mov esi, [rax]
-    put2l(0x48b8, &c); // mov rax, c
-    put2(0x8b38);      // mov edi, [rax]
-    put2l(0x48b8, &d); // mov rax, d
-    put2(0x8b28);      // mov ebp, [rax]
-    put2l(0x48b8, &f); // mov rax, f
-    put2(0x8b18);      // mov ebx, [rax]
-    put2l(0x49bc, &h[0]);   // mov r12, h
-    put2l(0x49bd, &outbuf[0]); // mov r13, outbuf.p
-    put2l(0x49be, &r[0]);   // mov r14, r
-    put2l(0x49bf, &m[0]);   // mov r15, m
-  }
-  else {
-    put3(0x83ec3c);    // sub esp, 60
-    put2a(0x8b15, &a); // mov edx, [a]
-    put2a(0x8b35, &b); // mov esi, [b]
-    put2a(0x8b3d, &c); // mov edi, [c]
-    put2a(0x8b2d, &d); // mov ebp, [d]
-    put2a(0x8b1d, &f); // mov ebx, [f]
-  }
-
-  // Assemble in multiple passes until every byte of hcomp has a translation
-  for (int istart=0; istart<hlen; ++istart) {
-    for (int i=istart; i<hlen&&it[i]; i=i+1+(hcomp[i]%8==7)+(hcomp[i]==255)) {
-      const int code=it[i];
-
-      // If already assembled, then assemble a jump to it
-      U32 t;
-      assert(it.isize()>i);
-      assert(i>=0 && i<hlen);
-      if (code>=16) {
-        if (i>istart) {
-          int a=code-o;
-          if (a>-120 && a<120)
-            put2(0xeb00+((a-2)&255)); // jmp short o
-          else
-            put1a(0xe9, a-5);  // jmp near o
-        }
-        break;
-      }
-
-      // Else assemble the instruction at hcode[i] to rcode[o]
-      else {
-        assert(i>=0 && i<it.isize());
-        assert(it[i]>0 && it[i]<16);
-        assert(o>=16);
-        it[i]=o;
-        ++done;
-        const int op=hcomp[i];
-        const int arg=hcomp[i+1]+((op==255)?256*hcomp[i+2]:0);
-        const int ddd=op/8%8;
-        const int sss=op%8;
-
-        // error instruction: return 0
-        if (iserr(op)) {
-          put2(0x31c0);           // xor eax, eax
-          put1a(0xe9, halt-o-4);  // jmp near halt
-          continue;
-        }
-
-        // Load source *b, *c, *d, or hash (*b) into eax except:
-        // {a,b,c,d}=*d, a{+,-,*,&,|,^,=,==,>,>}=*d: load address to eax
-        // {a,b,c,d}={*b,*c}: load source into ddd
-        if (op==59 || (op>=64 && op<240 && op%8>=4 && op%8<7)) {
-          put2(0x89c0+8*regcode[sss-3+(op==59)]);  // mov eax, {esi,edi,ebp}
-          const int sz=(sss==6?hsize:msize)-1;
-          if (sz>=128) put1a(0x25, sz);            // and eax, dword msize-1
-          else put3(0x83e000+sz);                  // and eax, byte msize-1
-          const int move=(op>=64 && op<112); // = or else ddd is eax
-          if (sss<6) { // ddd={a,b,c,d,*b,*c}
-            if (S==8) put5(0x410fb604+8*move*regcode[ddd],0x07);
-                                                   // movzx ddd, byte [r15+rax]
-            else put3a(0x0fb680+8*move*regcode[ddd], &m[0]);
-                                                   // movzx ddd, byte [m+eax]
-          }
-          else if ((0x06587000>>(op/8))&1) {// {*b,*c,*d,a/,a%,a&~,a<<,a>>}=*d
-            if (S==8) put4(0x418b0484);            // mov eax, [r12+rax*4]
-            else put3a(0x8b0485, &h[0]);           // mov eax, [h+eax*4]
-          }
-        }
-
-        // Load destination address *b, *c, *d or hashd (*d) into ecx
-        if ((op>=32 && op<56 && op%8<5) || (op>=96 && op<120) || op==60) {
-          put2(0x89c1+8*regcode[op/8%8-3-(op==60)]);// mov ecx,{esi,edi,ebp}
-          const int sz=(ddd==6||op==60?hsize:msize)-1;
-          if (sz>=128) put2a(0x81e1, sz);   // and ecx, dword sz
-          else put3(0x83e100+sz);           // and ecx, byte sz
-          if (op/8%8==6 || op==60) { // *d
-            if (S==8) put4(0x498d0c8c);     // lea rcx, [r12+rcx*4]
-            else put3a(0x8d0c8d, &h[0]);    // lea ecx, [ecx*4+h]
-          }
-          else { // *b, *c
-            if (S==8) put4(0x498d0c0f);     // lea rcx, [r15+rcx]
-            else put2a(0x8d89, &m[0]);      // lea ecx, [ecx+h]
-          }
-        }
-
-        // Translate by opcode
-        switch((op/8)&31) {
-          case 0:  // ddd = a
-          case 1:  // ddd = b
-          case 2:  // ddd = c
-          case 3:  // ddd = d
-            switch(sss) {
-              case 0:  // ddd<>a (swap)
-                put2(0x87d0+regcode[ddd]);   // xchg edx, ddd
-                break;
-              case 1:  // ddd++
-                put2(0xffc0+regcode[ddd]);   // inc ddd
-                break;
-              case 2:  // ddd--
-                put2(0xffc8+regcode[ddd]);   // dec ddd
-                break;
-              case 3:  // ddd!
-                put2(0xf7d0+regcode[ddd]);   // not ddd
-                break;
-              case 4:  // ddd=0
-                put2(0x31c0+9*regcode[ddd]); // xor ddd,ddd
-                break;
-              case 7:  // ddd=r n
-                if (S==8)
-                  put3a(0x418b86+8*regcode[ddd], arg*4); // mov ddd, [r14+n*4]
-                else
-                  put2a(0x8b05+8*regcode[ddd], (&r[arg]));//mov ddd, [r+n]
-                break;
-            }
-            break;
-          case 4:  // ddd = *b
-          case 5:  // ddd = *c
-            switch(sss) {
-              case 0:  // ddd<>a (swap)
-                put2(0x8611);                // xchg dl, [ecx]
-                break;
-              case 1:  // ddd++
-                put2(0xfe01);                // inc byte [ecx]
-                break;
-              case 2:  // ddd--
-                put2(0xfe09);                // dec byte [ecx]
-                break;
-              case 3:  // ddd!
-                put2(0xf611);                // not byte [ecx]
-                break;
-              case 4:  // ddd=0
-                put2(0x31c0);                // xor eax, eax
-                put2(0x8801);                // mov [ecx], al
-                break;
-              case 7:  // jt, jf
-              {
-                assert(code>=0 && code<16);
-                const int jtab[2][4]={{5,4,2,7},{4,5,3,6}};
-                               // jnz,je,jb,ja, jz,jne,jae,jbe
-                if (code<4) put2(0x84db);    // test bl, bl
-                if (arg>=128 && arg-257-i>=0 && o-it[arg-257-i]<120)
-                  put2(0x7000+256*jtab[op==47][code/4]); // jx short 0
-                else
-                  put2a(0x0f80+jtab[op==47][code/4], 0); // jx near 0
-                break;
-              }
-            }
-            break;
-          case 6:  // ddd = *d
-            switch(sss) {
-              case 0:  // ddd<>a (swap)
-                put2(0x8711);             // xchg edx, [ecx]
-                break;
-              case 1:  // ddd++
-                put2(0xff01);             // inc dword [ecx]
-                break;
-              case 2:  // ddd--
-                put2(0xff09);             // dec dword [ecx]
-                break;
-              case 3:  // ddd!
-                put2(0xf711);             // not dword [ecx]
-                break;
-              case 4:  // ddd=0
-                put2(0x31c0);             // xor eax, eax
-                put2(0x8901);             // mov [ecx], eax
-                break;
-              case 7:  // ddd=r n
-                if (S==8)
-                  put3a(0x418996, arg*4); // mov [r14+n*4], edx
-                else
-                  put2a(0x8915, &r[arg]); // mov [r+n], edx
-                break;
-            }
-            break;
-          case 7:  // special
-            switch(op) {
-              case 56: // halt
-                put1a(0xb8, 1);           // mov eax, 1
-                put1a(0xe9, halt-o-4);    // jmp near halt
-                break;
-              case 57:  // out
-                put1a(0xe8, outlabel-o-4);// call outlabel
-                break;
-              case 59:  // hash: a = (a + *b + 512) * 773
-                put3a(0x8d8410, 512);     // lea edx, [eax+edx+512]
-                put2a(0x69d0, 773);       // imul edx, eax, 773
-                break;
-              case 60:  // hashd: *d = (*d + a + 512) * 773
-                put2(0x8b01);             // mov eax, [ecx]
-                put3a(0x8d8410, 512);     // lea eax, [eax+edx+512]
-                put2a(0x69c0, 773);       // imul eax, eax, 773
-                put2(0x8901);             // mov [ecx], eax
-                break;
-              case 63:  // jmp
-                put1a(0xe9, 0);           // jmp near 0 (fill in target later)
-                break;
-            }
-            break;
-          case 8:   // a=
-          case 9:   // b=
-          case 10:  // c=
-          case 11:  // d=
-            if (sss==7)  // n
-              put1a(0xb8+regcode[ddd], arg);         // mov ddd, n
-            else if (sss==6) { // *d
-              if (S==8)
-                put4(0x418b0484+(regcode[ddd]<<11)); // mov ddd, [r12+rax*4]
-              else
-                put3a(0x8b0485+(regcode[ddd]<<11),&h[0]);// mov ddd, [h+eax*4]
-            }
-            else if (sss<4) // a, b, c, d
-              put2(0x89c0+regcode[ddd]+8*regcode[sss]);// mov ddd,sss
-            break;
-          case 12:  // *b=
-          case 13:  // *c=
-            if (sss==7) put3(0xc60100+arg);          // mov byte [ecx], n
-            else if (sss==0) put2(0x8811);           // mov byte [ecx], dl
-            else {
-              if (sss<4) put2(0x89c0+8*regcode[sss]);// mov eax, sss
-              put2(0x8801);                          // mov byte [ecx], al
-            }
-            break;
-          case 14:  // *d=
-            if (sss<7) put2(0x8901+8*regcode[sss]);  // mov [ecx], sss
-            else put2a(0xc701, arg);                 // mov dword [ecx], n
-            break;
-          case 15: break; // not used
-          case 16:  // a+=
-            if (sss==6) {
-              if (S==8) put4(0x41031484);            // add edx, [r12+rax*4]
-              else put3a(0x031485, &h[0]);           // add edx, [h+eax*4]
-            }
-            else if (sss<7) put2(0x01c2+8*regcode[sss]);// add edx, sss
-            else if (arg>128) put2a(0x81c2, arg);    // add edx, n
-            else put3(0x83c200+arg);                 // add edx, byte n
-            break;
-          case 17:  // a-=
-            if (sss==6) {
-              if (S==8) put4(0x412b1484);            // sub edx, [r12+rax*4]
-              else put3a(0x2b1485, &h[0]);           // sub edx, [h+eax*4]
-            }
-            else if (sss<7) put2(0x29c2+8*regcode[sss]);// sub edx, sss
-            else if (arg>=128) put2a(0x81ea, arg);   // sub edx, n
-            else put3(0x83ea00+arg);                 // sub edx, byte n
-            break;
-          case 18:  // a*=
-            if (sss==6) {
-              if (S==8) put5(0x410faf14,0x84);       // imul edx, [r12+rax*4]
-              else put4a(0x0faf1485, &h[0]);         // imul edx, [h+eax*4]
-            }
-            else if (sss<7) put3(0x0fafd0+regcode[sss]);// imul edx, sss
-            else if (arg>=128) put2a(0x69d2, arg);   // imul edx, n
-            else put3(0x6bd200+arg);                 // imul edx, byte n
-            break;
-          case 19:  // a/=
-          case 20:  // a%=
-            if (sss<7) put2(0x89c1+8*regcode[sss]);  // mov ecx, sss
-            else put1a(0xb9, arg);                   // mov ecx, n
-            put2(0x85c9);                            // test ecx, ecx
-            put3(0x0f44d1);                          // cmovz edx, ecx
-            put2(0x7408-2*(op/8==20));               // jz (over rest)
-            put2(0x89d0);                            // mov eax, edx
-            put2(0x31d2);                            // xor edx, edx
-            put2(0xf7f1);                            // div ecx
-            if (op/8==19) put2(0x89c2);              // mov edx, eax
-            break;
-          case 21:  // a&=
-            if (sss==6) {
-              if (S==8) put4(0x41231484);            // and edx, [r12+rax*4]
-              else put3a(0x231485, &h[0]);           // and edx, [h+eax*4]
-            }
-            else if (sss<7) put2(0x21c2+8*regcode[sss]);// and edx, sss
-            else if (arg>=128) put2a(0x81e2, arg);   // and edx, n
-            else put3(0x83e200+arg);                 // and edx, byte n
-            break;
-          case 22:  // a&~
-            if (sss==7) {
-              if (arg<128) put3(0x83e200+(~arg&255));// and edx, byte ~n
-              else put2a(0x81e2, ~arg);              // and edx, ~n
-            }
-            else {
-              if (sss<4) put2(0x89c0+8*regcode[sss]);// mov eax, sss
-              put2(0xf7d0);                          // not eax
-              put2(0x21c2);                          // and edx, eax
-            }
-            break;
-          case 23:  // a|=
-            if (sss==6) {
-              if (S==8) put4(0x410b1484);            // or edx, [r12+rax*4]
-              else put3a(0x0b1485, &h[0]);           // or edx, [h+eax*4]
-            }
-            else if (sss<7) put2(0x09c2+8*regcode[sss]);// or edx, sss
-            else if (arg>=128) put2a(0x81ca, arg);   // or edx, n
-            else put3(0x83ca00+arg);                 // or edx, byte n
-            break;
-          case 24:  // a^=
-            if (sss==6) {
-              if (S==8) put4(0x41331484);            // xor edx, [r12+rax*4]
-              else put3a(0x331485, &h[0]);           // xor edx, [h+eax*4]
-            }
-            else if (sss<7) put2(0x31c2+8*regcode[sss]);// xor edx, sss
-            else if (arg>=128) put2a(0x81f2, arg);   // xor edx, byte n
-            else put3(0x83f200+arg);                 // xor edx, n
-            break;
-          case 25:  // a<<=
-          case 26:  // a>>=
-            if (sss==7)  // sss = n
-              put3(0xc1e200+8*256*(op/8==26)+arg);   // shl/shr n
-            else {
-              put2(0x89c1+8*regcode[sss]);           // mov ecx, sss
-              put2(0xd3e2+8*(op/8==26));             // shl/shr edx, cl
-            }
-            break;
-          case 27:  // a==
-          case 28:  // a<
-          case 29:  // a>
-            if (sss==6) {
-              if (S==8) put4(0x413b1484);            // cmp edx, [r12+rax*4]
-              else put3a(0x3b1485, &h[0]);           // cmp edx, [h+eax*4]
-            }
-            else if (sss==7)  // sss = n
-              put2a(0x81fa, arg);                    // cmp edx, dword n
-            else
-              put2(0x39c2+8*regcode[sss]);           // cmp edx, sss
-            if (code<4) {
-              if (op/8==27) put3(0x0f94c3);          // setz bl
-              if (op/8==28) put3(0x0f92c3);          // setc bl
-              if (op/8==29) put3(0x0f97c3);          // seta bl
-            }
-            break;
-          case 30:  // not used
-          case 31:  // 255 = lj
-            if (op==255) put1a(0xe9, 0);             // jmp near
-            break;
-        }
-      }
-    }
-  }
-
-  // Finish first pass
-  const int rsize=o;
-  if (o>rcode_size) return rsize;
-
-  // Fill in jump addresses (second pass)
-  for (int i=0; i<hlen; ++i) {
-    if (it[i]<16) continue;
-    int op=hcomp[i];
-    if (op==39 || op==47 || op==63 || op==255) {  // jt, jf, jmp, lj
-      int target=hcomp[i+1];
-      if (op==255) target+=hcomp[i+2]*256;  // lj
-      else {
-        if (target>=128) target-=256;
-        target+=i+2;
-      }
-      if (target<0 || target>=hlen) target=hlen-1;  // runtime ZPAQL error
-      o=it[i];
-      assert(o>=16 && o<rcode_size);
-      if ((op==39 || op==47) && rcode[o]==0x84) o+=2;  // jt, jf -> skip test
-      assert(o>=16 && o<rcode_size);
-      if (rcode[o]==0x0f) ++o;  // first byte of jz near, jnz near
-      assert(o<rcode_size);
-      op=rcode[o++];  // x86 opcode
-      target=it[target]-o;
-      if ((op>=0x72 && op<0x78) || op==0xeb) {  // jx, jmp short
-        --target;
-        if (target<-128 || target>127)
-          error("Cannot code x86 short jump");
-        assert(o<rcode_size);
-        rcode[o]=target&255;
-      }
-      else if ((op>=0x82 && op<0x88) || op==0xe9) // jx, jmp near
-      {
-        target-=4;
-        puta(target);
-      }
-      else assert(false);  // not a x86 jump
-    }
-  }
-
-  // Jump to start
-  o=0;
-  put1a(0xe9, start-5);  // jmp near start
-  return rsize;
-}
-
-//////////////////////// Predictor::assemble_p() /////////////////////
-
-// Assemble the ZPAQL code in the HCOMP section of z.header to pcode and
-// return the number of bytes of x86 or x86-64 code written, or that would
-// be written if pcode were large enough. The code for predict() begins
-// at pr.pcode[0] and update() at pr.pcode[5], both as jmp instructions.
-
-// The assembled code is equivalent to int predict(Predictor*)
-// and void update(Predictor*, int y). The Predictor address is placed in
-// edi/rdi. The update bit y is placed in ebp/rbp.
-
-int Predictor::assemble_p() {
-  Predictor& pr=*this;
-  U8* rcode=pr.pcode;         // x86 output array
-  int rcode_size=pcode_size;  // output size
-  int o=0;                    // output index in pcode
-  const int S=sizeof(char*);  // 4 or 8
-  U8* hcomp=&pr.z.header[0];  // The code to translate
-#define off(x)  ((char*)&(pr.x)-(char*)&pr)
-#define offc(x) ((char*)&(pr.comp[i].x)-(char*)&pr)
-
-  // test for little-endian (probably x86)
-  U32 t=0x12345678;
-  if (*(char*)&t!=0x78 || (S!=4 && S!=8))
-    error("JIT supported only for x86-32 and x86-64");
-
-  // Initialize for predict(). Put predictor address in edi/rdi
-  put1a(0xe9, 5);             // jmp predict
-  put1a(0, 0x90909000);       // reserve space for jmp update
-  put1(0x53);                 // push ebx/rbx
-  put1(0x55);                 // push ebp/rbp
-  put1(0x56);                 // push esi/rsi
-  put1(0x57);                 // push edi/rdi
-  if (S==4)
-    put4(0x8b7c2414);         // mov edi,[esp+0x14] ; pr
-  else {
-#ifndef unix
-    put3(0x4889cf);           // mov rdi, rcx (1st arg in Win64)
-#endif
-  }
-
-  // Code predict() for each component
-  const int n=hcomp[6];  // number of components
-  U8* cp=hcomp+7;
-  for (int i=0; i<n; ++i, cp+=compsize[cp[0]]) {
-    if (cp-hcomp>=pr.z.cend) error("comp too big");
-    if (cp[0]<1 || cp[0]>9) error("invalid component");
-    assert(compsize[cp[0]]>0 && compsize[cp[0]]<8);
-    switch (cp[0]) {
-
-      case CONS:  // c
-        break;
-
-      case CM:  // sizebits limit
-        // Component& cr=comp[i];
-        // cr.cxt=h[i]^hmap4;
-        // p[i]=stretch(cr.cm(cr.cxt)>>17);
-
-        put2a(0x8b87, off(h[i]));              // mov eax, [edi+&h[i]]
-        put2a(0x3387, off(hmap4));             // xor eax, [edi+&hmap4]
-        put1a(0x25, (1<<cp[1])-1);             // and eax, size-1
-        put2a(0x8987, offc(cxt));              // mov [edi+cxt], eax
-        if (S==8) put1(0x48);                  // rex.w (esi->rsi)
-        put2a(0x8bb7, offc(cm));               // mov esi, [edi+&cm]
-        put3(0x8b0486);                        // mov eax, [esi+eax*4]
-        put3(0xc1e811);                        // shr eax, 17
-        put4a(0x0fbf8447, off(stretcht));      // movsx eax,word[edi+eax*2+..]
-        put2a(0x8987, off(p[i]));              // mov [edi+&p[i]], eax
-        break;
-
-      case ISSE:  // sizebits j -- c=hi, cxt=bh
-        // assert((hmap4&15)>0);
-        // if (c8==1 || (c8&0xf0)==16)
-        //   cr.c=find(cr.ht, cp[1]+2, h[i]+16*c8);
-        // cr.cxt=cr.ht[cr.c+(hmap4&15)];  // bit history
-        // int *wt=(int*)&cr.cm[cr.cxt*2];
-        // p[i]=clamp2k((wt[0]*p[cp[2]]+wt[1]*64)>>16);
-
-      case ICM: // sizebits
-        // assert((hmap4&15)>0);
-        // if (c8==1 || (c8&0xf0)==16) cr.c=find(cr.ht, cp[1]+2, h[i]+16*c8);
-        // cr.cxt=cr.ht[cr.c+(hmap4&15)];
-        // p[i]=stretch(cr.cm(cr.cxt)>>8);
-        //
-        // Find cxt row in hash table ht. ht has rows of 16 indexed by the low
-        // sizebits of cxt with element 0 having the next higher 8 bits for
-        // collision detection. If not found after 3 adjacent tries, replace
-        // row with lowest element 1 as priority. Return index of row.
-        //
-        // size_t Predictor::find(Array<U8>& ht, int sizebits, U32 cxt) {
-        //  assert(ht.size()==size_t(16)<<sizebits);
-        //  int chk=cxt>>sizebits&255;
-        //  size_t h0=(cxt*16)&(ht.size()-16);
-        //  if (ht[h0]==chk) return h0;
-        //  size_t h1=h0^16;
-        //  if (ht[h1]==chk) return h1;
-        //  size_t h2=h0^32;
-        //  if (ht[h2]==chk) return h2;
-        //  if (ht[h0+1]<=ht[h1+1] && ht[h0+1]<=ht[h2+1])
-        //    return memset(&ht[h0], 0, 16), ht[h0]=chk, h0;
-        //  else if (ht[h1+1]<ht[h2+1])
-        //    return memset(&ht[h1], 0, 16), ht[h1]=chk, h1;
-        //  else
-        //    return memset(&ht[h2], 0, 16), ht[h2]=chk, h2;
-        // }
-
-        if (S==8) put1(0x48);                  // rex.w
-        put2a(0x8bb7, offc(ht));               // mov esi, [edi+&ht]
-        put2(0x8b07);                          // mov eax, edi ; c8
-        put2(0x89c1);                          // mov ecx, eax ; c8
-        put3(0x83f801);                        // cmp eax, 1
-        put2(0x740a);                          // je L1
-        put1a(0x25, 240);                      // and eax, 0xf0
-        put3(0x83f810);                        // cmp eax, 16
-        put2(0x7576);                          // jne L2 ; skip find()
-           // L1: ; find cxt in ht, return index in eax
-        put3(0xc1e104);                        // shl ecx, 4
-        put2a(0x038f, off(h[i]));              // add [edi+&h[i]]
-        put2(0x89c8);                          // mov eax, ecx ; cxt
-        put3(0xc1e902+cp[1]);                  // shr ecx, sizebits+2
-        put2a(0x81e1, 255);                    // and eax, 255 ; chk
-        put3(0xc1e004);                        // shl eax, 4
-        put1a(0x25, (64<<cp[1])-16);           // and eax, ht.size()-16 = h0
-        put3(0x3a0c06);                        // cmp cl, [esi+eax] ; ht[h0]
-        put2(0x744d);                          // je L3 ; match h0
-        put3(0x83f010);                        // xor eax, 16 ; h1
-        put3(0x3a0c06);                        // cmp cl, [esi+eax]
-        put2(0x7445);                          // je L3 ; match h1
-        put3(0x83f030);                        // xor eax, 48 ; h2
-        put3(0x3a0c06);                        // cmp cl, [esi+eax]
-        put2(0x743d);                          // je L3 ; match h2
-          // No checksum match, so replace the lowest priority among h0,h1,h2
-        put3(0x83f021);                        // xor eax, 33 ; h0+1
-        put3(0x8a1c06);                        // mov bl, [esi+eax] ; ht[h0+1]
-        put2(0x89c2);                          // mov edx, eax ; h0+1
-        put3(0x83f220);                        // xor edx, 32  ; h2+1
-        put3(0x3a1c16);                        // cmp bl, [esi+edx]
-        put2(0x7708);                          // ja L4 ; test h1 vs h2
-        put3(0x83f230);                        // xor edx, 48  ; h1+1
-        put3(0x3a1c16);                        // cmp bl, [esi+edx]
-        put2(0x7611);                          // jbe L7 ; replace h0
-          // L4: ; h0 is not lowest, so replace h1 or h2
-        put3(0x83f010);                        // xor eax, 16 ; h1+1
-        put3(0x8a1c06);                        // mov bl, [esi+eax]
-        put3(0x83f030);                        // xor eax, 48 ; h2+1
-        put3(0x3a1c06);                        // cmp bl, [esi+eax]
-        put2(0x7303);                          // jae L7
-        put3(0x83f030);                        // xor eax, 48 ; h1+1
-          // L7: ; replace row pointed to by eax = h0,h1,h2
-        put3(0x83f001);                        // xor eax, 1
-        put3(0x890c06);                        // mov [esi+eax], ecx ; chk
-        put2(0x31c9);                          // xor ecx, ecx
-        put4(0x894c0604);                      // mov [esi+eax+4], ecx
-        put4(0x894c0608);                      // mov [esi+eax+8], ecx
-        put4(0x894c060c);                      // mov [esi+eax+12], ecx
-          // L3: ; save nibble context (in eax) in c
-        put2a(0x8987, offc(c));                // mov [edi+c], eax
-        put2(0xeb06);                          // jmp L8
-          // L2: ; get nibble context
-        put2a(0x8b87, offc(c));                // mov eax, [edi+c]
-          // L8: ; nibble context is in eax
-        put2a(0x8b97, off(hmap4));             // mov edx, [edi+&hmap4]
-        put3(0x83e20f);                        // and edx, 15  ; hmap4
-        put2(0x01d0);                          // add eax, edx ; c+(hmap4&15)
-        put4(0x0fb61406);                      // movzx edx, byte [esi+eax]
-        put2a(0x8997, offc(cxt));              // mov [edi+&cxt], edx ; cxt=bh
-        if (S==8) put1(0x48);                  // rex.w
-        put2a(0x8bb7, offc(cm));               // mov esi, [edi+&cm] ; cm
-
-        // esi points to cm[256] (ICM) or cm[512] (ISSE) with 23 bit
-        // prediction (ICM) or a pair of 20 bit signed weights (ISSE).
-        // cxt = bit history bh (0..255) is in edx.
-        if (cp[0]==ICM) {
-          put3(0x8b0496);                      // mov eax, [esi+edx*4];cm[bh]
-          put3(0xc1e808);                      // shr eax, 8
-          put4a(0x0fbf8447, off(stretcht));    // movsx eax,word[edi+eax*2+..]
-        }
-        else {  // ISSE
-          put2a(0x8b87, off(p[cp[2]]));        // mov eax, [edi+&p[j]]
-          put4(0x0faf04d6);                    // imul eax, [esi+edx*8] ;wt[0]
-          put4(0x8b4cd604);                    // mov ecx, [esi+edx*8+4];wt[1]
-          put3(0xc1e106);                      // shl ecx, 6
-          put2(0x01c8);                        // add eax, ecx
-          put3(0xc1f810);                      // sar eax, 16
-          put1a(0xb9, 2047);                   // mov ecx, 2047
-          put2(0x39c8);                        // cmp eax, ecx
-          put3(0x0f4fc1);                      // cmovg eax, ecx
-          put1a(0xb9, -2048);                  // mov ecx, -2048
-          put2(0x39c8);                        // cmp eax, ecx
-          put3(0x0f4cc1);                      // cmovl eax, ecx
-
-        }
-        put2a(0x8987, off(p[i]));              // mov [edi+&p[i]], eax
-        break;
-
-      case MATCH: // sizebits bufbits: a=len, b=offset, c=bit, cxt=bitpos,
-                  //                   ht=buf, limit=pos
-        // assert(cr.cm.size()==(size_t(1)<<cp[1]));
-        // assert(cr.ht.size()==(size_t(1)<<cp[2]));
-        // assert(cr.a<=255);
-        // assert(cr.c==0 || cr.c==1);
-        // assert(cr.cxt<8);
-        // assert(cr.limit<cr.ht.size());
-        // if (cr.a==0) p[i]=0;
-        // else {
-        //   cr.c=(cr.ht(cr.limit-cr.b)>>(7-cr.cxt))&1; // predicted bit
-        //   p[i]=stretch(dt2k[cr.a]*(cr.c*-2+1)&32767);
-        // }
-
-        if (S==8) put1(0x48);          // rex.w
-        put2a(0x8bb7, offc(ht));       // mov esi, [edi+&ht]
-
-        // If match length (a) is 0 then p[i]=0
-        put2a(0x8b87, offc(a));        // mov eax, [edi+&a]
-        put2(0x85c0);                  // test eax, eax
-        put2(0x7449);                  // jz L2 ; p[i]=0
-
-        // Else put predicted bit in c
-        put1a(0xb9, 7);                // mov ecx, 7
-        put2a(0x2b8f, offc(cxt));      // sub ecx, [edi+&cxt]
-        put2a(0x8b87, offc(limit));    // mov eax, [edi+&limit]
-        put2a(0x2b87, offc(b));        // sub eax, [edi+&b]
-        put1a(0x25, (1<<cp[2])-1);     // and eax, ht.size()-1
-        put4(0x0fb60406);              // movzx eax, byte [esi+eax]
-        put2(0xd3e8);                  // shr eax, cl
-        put3(0x83e001);                // and eax, 1  ; predicted bit
-        put2a(0x8987, offc(c));        // mov [edi+&c], eax ; c
-
-        // p[i]=stretch(dt2k[cr.a]*(cr.c*-2+1)&32767);
-        put2a(0x8b87, offc(a));        // mov eax, [edi+&a]
-        put3a(0x8b8487, off(dt2k));    // mov eax, [edi+eax*4+&dt2k] ; weight
-        put2(0x7402);                  // jz L1 ; z if c==0
-        put2(0xf7d8);                  // neg eax
-        put1a(0x25, 0x7fff);           // L1: and eax, 32767
-        put4a(0x0fbf8447, off(stretcht)); //movsx eax, word [edi+eax*2+...]
-        put2a(0x8987, off(p[i]));      // L2: mov [edi+&p[i]], eax
-        break;
-
-      case AVG: // j k wt
-        // p[i]=(p[cp[1]]*cp[3]+p[cp[2]]*(256-cp[3]))>>8;
-
-        put2a(0x8b87, off(p[cp[1]]));  // mov eax, [edi+&p[j]]
-        put2a(0x2b87, off(p[cp[2]]));  // sub eax, [edi+&p[k]]
-        put2a(0x69c0, cp[3]);          // imul eax, wt
-        put3(0xc1f808);                // sar eax, 8
-        put2a(0x0387, off(p[cp[2]]));  // add eax, [edi+&p[k]]
-        put2a(0x8987, off(p[i]));      // mov [edi+&p[i]], eax
-        break;
-
-      case MIX2:   // sizebits j k rate mask
-                   // c=size cm=wt[size] cxt=input
-        // cr.cxt=((h[i]+(c8&cp[5]))&(cr.c-1));
-        // assert(cr.cxt<cr.a16.size());
-        // int w=cr.a16[cr.cxt];
-        // assert(w>=0 && w<65536);
-        // p[i]=(w*p[cp[2]]+(65536-w)*p[cp[3]])>>16;
-        // assert(p[i]>=-2048 && p[i]<2048);
-
-        put2(0x8b07);                  // mov eax, [edi] ; c8
-        put1a(0x25, cp[5]);            // and eax, mask
-        put2a(0x0387, off(h[i]));      // add eax, [edi+&h[i]]
-        put1a(0x25, (1<<cp[1])-1);     // and eax, size-1
-        put2a(0x8987, offc(cxt));      // mov [edi+&cxt], eax ; cxt
-        if (S==8) put1(0x48);          // rex.w
-        put2a(0x8bb7, offc(a16));      // mov esi, [edi+&a16]
-        put4(0x0fb70446);              // movzx eax, word [edi+eax*2] ; w
-        put2a(0x8b8f, off(p[cp[2]]));  // mov ecx, [edi+&p[j]]
-        put2a(0x8b97, off(p[cp[3]]));  // mov edx, [edi+&p[k]]
-        put2(0x29d1);                  // sub ecx, edx
-        put3(0x0fafc8);                // imul ecx, eax
-        put3(0xc1e210);                // shl edx, 16
-        put2(0x01d1);                  // add ecx, edx
-        put3(0xc1f910);                // sar ecx, 16
-        put2a(0x898f, off(p[i]));      // mov [edi+&p[i]]
-        break;
-
-      case MIX:    // sizebits j m rate mask
-                   // c=size cm=wt[size][m] cxt=index of wt in cm
-        // int m=cp[3];
-        // assert(m>=1 && m<=i);
-        // cr.cxt=h[i]+(c8&cp[5]);
-        // cr.cxt=(cr.cxt&(cr.c-1))*m; // pointer to row of weights
-        // assert(cr.cxt<=cr.cm.size()-m);
-        // int* wt=(int*)&cr.cm[cr.cxt];
-        // p[i]=0;
-        // for (int j=0; j<m; ++j)
-        //   p[i]+=(wt[j]>>8)*p[cp[2]+j];
-        // p[i]=clamp2k(p[i]>>8);
-
-        put2(0x8b07);                          // mov eax, [edi] ; c8
-        put1a(0x25, cp[5]);                    // and eax, mask
-        put2a(0x0387, off(h[i]));              // add eax, [edi+&h[i]]
-        put1a(0x25, (1<<cp[1])-1);             // and eax, size-1
-        put2a(0x69c0, cp[3]);                  // imul eax, m
-        put2a(0x8987, offc(cxt));              // mov [edi+&cxt], eax ; cxt
-        if (S==8) put1(0x48);                  // rex.w
-        put2a(0x8bb7, offc(cm));               // mov esi, [edi+&cm]
-        if (S==8) put1(0x48);                  // rex.w
-        put3(0x8d3486);                        // lea esi, [esi+eax*4] ; wt
-
-        // Unroll summation loop: esi=wt[0..m-1]
-        for (int k=0; k<cp[3]; k+=8) {
-          const int tail=cp[3]-k;  // number of elements remaining
-
-          // pack 8 elements of wt in xmm1, 8 elements of p in xmm3
-          put4a(0xf30f6f8e, k*4);              // movdqu xmm1, [esi+k*4]
-          if (tail>3) put4a(0xf30f6f96, k*4+16);//movdqu xmm2, [esi+k*4+16]
-          put5(0x660f72e1,0x08);               // psrad xmm1, 8
-          if (tail>3) put5(0x660f72e2,0x08);   // psrad xmm2, 8
-          put4(0x660f6bca);                    // packssdw xmm1, xmm2
-          put4a(0xf30f6f9f, off(p[cp[2]+k]));  // movdqu xmm3, [edi+&p[j+k]]
-          if (tail>3)
-            put4a(0xf30f6fa7,off(p[cp[2]+k+4]));//movdqu xmm4, [edi+&p[j+k+4]]
-          put4(0x660f6bdc);                    // packssdw, xmm3, xmm4
-          if (tail>0 && tail<8) {  // last loop, mask extra weights
-            put4(0x660f76ed);                  // pcmpeqd xmm5, xmm5 ; -1
-            put5(0x660f73dd, 16-tail*2);       // psrldq xmm5, 16-tail*2
-            put4(0x660fdbcd);                  // pand xmm1, xmm5
-          }
-          if (k==0) {  // first loop, initialize sum in xmm0
-            put4(0xf30f6fc1);                  // movdqu xmm0, xmm1
-            put4(0x660ff5c3);                  // pmaddwd xmm0, xmm3
-          }
-          else {  // accumulate sum in xmm0
-            put4(0xf30f6fd1);                  // movdqu xmm2, xmm1
-            put4(0x660ff5d3);                  // pmaddwd xmm2, xmm3
-            put4(0x660ffec2);                  // paddd, xmm0, xmm2
-          }
-        }
-
-        // Add up the 4 elements of xmm0 = p[i] in the first element
-        put4(0xf30f6fc8);                      // movdqu xmm1, xmm0
-        put5(0x660f73d9,0x08);                 // psrldq xmm1, 8
-        put4(0x660ffec1);                      // paddd xmm0, xmm1
-        put4(0xf30f6fc8);                      // movdqu xmm1, xmm0
-        put5(0x660f73d9,0x04);                 // psrldq xmm1, 4
-        put4(0x660ffec1);                      // paddd xmm0, xmm1
-        put4(0x660f7ec0);                      // movd eax, xmm0 ; p[i]
-        put3(0xc1f808);                        // sar eax, 8
-        put1a(0xb9, 2047);                     // mov ecx, 2047 ; clamp2k
-        put2(0x39c8);                          // cmp eax, ecx
-        put3(0x0f4fc1);                        // cmovg eax, ecx
-        put2(0xf7d1);                          // not ecx ; -2048
-        put2(0x39c8);                          // cmp eax, ecx
-        put3(0x0f4cc1);                        // cmovl eax, ecx
-        put2a(0x8987, off(p[i]));              // mov [edi+&p[i]], eax
-        break;
-
-      case SSE:  // sizebits j start limit
-        // cr.cxt=(h[i]+c8)*32;
-        // int pq=p[cp[2]]+992;
-        // if (pq<0) pq=0;
-        // if (pq>1983) pq=1983;
-        // int wt=pq&63;
-        // pq>>=6;
-        // assert(pq>=0 && pq<=30);
-        // cr.cxt+=pq;
-        // p[i]=stretch(((cr.cm(cr.cxt)>>10)*(64-wt)       // p0
-        //               +(cr.cm(cr.cxt+1)>>10)*wt)>>13);  // p1
-        // // p = p0*(64-wt)+p1*wt = (p1-p0)*wt + p0*64
-        // cr.cxt+=wt>>5;
-
-        put2a(0x8b8f, off(h[i]));      // mov ecx, [edi+&h[i]]
-        put2(0x030f);                  // add ecx, [edi]  ; c0
-        put2a(0x81e1, (1<<cp[1])-1);   // and ecx, size-1
-        put3(0xc1e105);                // shl ecx, 5  ; cxt in 0..size*32-32
-        put2a(0x8b87, off(p[cp[2]]));  // mov eax, [edi+&p[j]] ; pq
-        put1a(0x05, 992);              // add eax, 992
-        put2(0x31d2);                  // xor edx, edx ; 0
-        put2(0x39d0);                  // cmp eax, edx
-        put3(0x0f4cc2);                // cmovl eax, edx
-        put1a(0xba, 1983);             // mov edx, 1983
-        put2(0x39d0);                  // cmp eax, edx
-        put3(0x0f4fc2);                // cmovg eax, edx ; pq in 0..1983
-        put2(0x89c2);                  // mov edx, eax
-        put3(0x83e23f);                // and edx, 63  ; wt in 0..63
-        put3(0xc1e806);                // shr eax, 6   ; pq in 0..30
-        put2(0x01c1);                  // add ecx, eax ; cxt in 0..size*32-2
-        if (S==8) put1(0x48);          // rex.w
-        put2a(0x8bb7, offc(cm));       // mov esi, [edi+cm]
-        put3(0x8b048e);                // mov eax, [esi+ecx*4] ; cm[cxt]
-        put4(0x8b5c8e04);              // mov ebx, [esi+ecx*4+4] ; cm[cxt+1]
-        put3(0x83fa20);                // cmp edx, 32  ; wt
-        put3(0x83d9ff);                // sbb ecx, -1  ; cxt+=wt>>5
-        put2a(0x898f, offc(cxt));      // mov [edi+cxt], ecx  ; cxt saved
-        put3(0xc1e80a);                // shr eax, 10 ; p0 = cm[cxt]>>10
-        put3(0xc1eb0a);                // shr ebx, 10 ; p1 = cm[cxt+1]>>10
-        put2(0x29c3);                  // sub ebx, eax, ; p1-p0
-        put3(0x0fafda);                // imul ebx, edx ; (p1-p0)*wt
-        put3(0xc1e006);                // shr eax, 6
-        put2(0x01d8);                  // add eax, ebx ; p in 0..2^28-1
-        put3(0xc1e80d);                // shr eax, 13  ; p in 0..32767
-        put4a(0x0fbf8447, off(stretcht));  // movsx eax, word [edi+eax*2+...]
-        put2a(0x8987, off(p[i]));      // mov [edi+&p[i]], eax
-        break;
-
-      default:
-        error("invalid ZPAQ component");
-    }
-  }
-
-  // return squash(p[n-1])
-  put2a(0x8b87, off(p[n-1]));          // mov eax, [edi+...]
-  put1a(0x05, 0x800);                  // add eax, 2048
-  put4a(0x0fbf8447, off(squasht[0]));  // movsx eax, word [edi+eax*2+...]
-  put1(0x5f);                          // pop edi
-  put1(0x5e);                          // pop esi
-  put1(0x5d);                          // pop ebp
-  put1(0x5b);                          // pop ebx
-  put1(0xc3);                          // ret
-
-  // Initialize for update() Put predictor address in edi/rdi
-  // and bit y=0..1 in ebp
-  int save_o=o;
-  o=5;
-  put1a(0xe9, save_o-10);      // jmp update
-  o=save_o;
-  put1(0x53);                  // push ebx/rbx
-  put1(0x55);                  // push ebp/rbp
-  put1(0x56);                  // push esi/rsi
-  put1(0x57);                  // push edi/rdi
-  if (S==4) {
-    put4(0x8b7c2414);          // mov edi,[esp+0x14] ; (1st arg = pr)
-    put4(0x8b6c2418);          // mov ebp,[esp+0x18] ; (2nd arg = y)
-  }
-  else {
-#ifdef unix                    // (1st arg already in rdi)
-    put3(0x4889f5);            // mov rbp, rsi (2nd arg in Linux-64)
-#else
-    put3(0x4889cf);            // mov rdi, rcx (1st arg in Win64)
-    put3(0x4889d5);            // mov rbp, rdx (2nd arg)
-#endif
-  }
-
-  // Code update() for each component
-  cp=hcomp+7;
-  for (int i=0; i<n; ++i, cp+=compsize[cp[0]]) {
-    assert(cp-hcomp<pr.z.cend);
-    assert (cp[0]>=1 && cp[0]<=9);
-    assert(compsize[cp[0]]>0 && compsize[cp[0]]<8);
-    switch (cp[0]) {
-
-      case CONS:  // c
-        break;
-
-      case SSE:  // sizebits j start limit
-      case CM:   // sizebits limit
-        // train(cr, y);
-        //
-        // reduce prediction error in cr.cm
-        // void train(Component& cr, int y) {
-        //   assert(y==0 || y==1);
-        //   U32& pn=cr.cm(cr.cxt);
-        //   U32 count=pn&0x3ff;
-        //   int error=y*32767-(cr.cm(cr.cxt)>>17);
-        //   pn+=(error*dt[count]&-1024)+(count<cr.limit);
-
-        if (S==8) put1(0x48);          // rex.w (esi->rsi)
-        put2a(0x8bb7, offc(cm));       // mov esi,[edi+cm]  ; cm
-        put2a(0x8b87, offc(cxt));      // mov eax,[edi+cxt] ; cxt
-        put1a(0x25, pr.comp[i].cm.size()-1);  // and eax, size-1
-        if (S==8) put1(0x48);          // rex.w
-        put3(0x8d3486);                // lea esi,[esi+eax*4] ; &cm[cxt]
-        put2(0x8b06);                  // mov eax,[esi] ; cm[cxt]
-        put2(0x89c2);                  // mov edx, eax  ; cm[cxt]
-        put3(0xc1e811);                // shr eax, 17   ; cm[cxt]>>17
-        put2(0x89e9);                  // mov ecx, ebp  ; y
-        put3(0xc1e10f);                // shl ecx, 15   ; y*32768
-        put2(0x29e9);                  // sub ecx, ebp  ; y*32767
-        put2(0x29c1);                  // sub ecx, eax  ; error
-        put2a(0x81e2, 0x3ff);          // and edx, 1023 ; count
-        put3a(0x8b8497, off(dt));      // mov eax,[edi+edx*4+dt] ; dt[count]
-        put3(0x0fafc8);                // imul ecx, eax ; error*dt[count]
-        put2a(0x81e1, 0xfffffc00);     // and ecx, -1024
-        put2a(0x81fa, cp[2+2*(cp[0]==SSE)]*4); // cmp edx, limit*4
-        put2(0x110e);                  // adc [esi], ecx ; pn+=...
-        break;
-
-      case ICM:   // sizebits: cxt=bh, ht[c][0..15]=bh row
-        // cr.ht[cr.c+(hmap4&15)]=st.next(cr.ht[cr.c+(hmap4&15)], y);
-        // U32& pn=cr.cm(cr.cxt);
-        // pn+=int(y*32767-(pn>>8))>>2;
-
-      case ISSE:  // sizebits j  -- c=hi, cxt=bh
-        // assert(cr.cxt==cr.ht[cr.c+(hmap4&15)]);
-        // int err=y*32767-squash(p[i]);
-        // int *wt=(int*)&cr.cm[cr.cxt*2];
-        // wt[0]=clamp512k(wt[0]+((err*p[cp[2]]+(1<<12))>>13));
-        // wt[1]=clamp512k(wt[1]+((err+16)>>5));
-        // cr.ht[cr.c+(hmap4&15)]=st.next(cr.cxt, y);
-
-        // update bit history bh to next(bh,y=ebp) in ht[c+(hmap4&15)]
-        put3(0x8b4700+off(hmap4));     // mov eax, [edi+&hmap4]
-        put3(0x83e00f);                // and eax, 15
-        put2a(0x0387, offc(c));        // add eax [edi+&c] ; cxt
-        if (S==8) put1(0x48);          // rex.w
-        put2a(0x8bb7, offc(ht));       // mov esi, [edi+&ht]
-        put4(0x0fb61406);              // movzx edx, byte [esi+eax] ; bh
-        put4(0x8d5c9500);              // lea ebx, [ebp+edx*4] ; index to st
-        put4a(0x0fb69c1f, off(st));    // movzx ebx,byte[edi+ebx+st]; next bh
-        put3(0x881c06);                // mov [esi+eax], bl ; save next bh
-        if (S==8) put1(0x48);          // rex.w
-        put2a(0x8bb7, offc(cm));       // mov esi, [edi+&cm]
-
-        // ICM: update cm[cxt=edx=bit history] to reduce prediction error
-        // esi = &cm
-        if (cp[0]==ICM) {
-          if (S==8) put1(0x48);        // rex.w
-          put3(0x8d3496);              // lea esi, [esi+edx*4] ; &cm[bh]
-          put2(0x8b06);                // mov eax, [esi] ; pn
-          put3(0xc1e808);              // shr eax, 8 ; pn>>8
-          put2(0x89e9);                // mov ecx, ebp ; y
-          put3(0xc1e10f);              // shl ecx, 15
-          put2(0x29e9);                // sub ecx, ebp ; y*32767
-          put2(0x29c1);                // sub ecx, eax
-          put3(0xc1f902);              // sar ecx, 2
-          put2(0x010e);                // add [esi], ecx
-        }
-
-        // ISSE: update weights. edx=cxt=bit history (0..255), esi=cm[512]
-        else {
-          put2a(0x8b87, off(p[i]));    // mov eax, [edi+&p[i]]
-          put1a(0x05, 2048);           // add eax, 2048
-          put4a(0x0fb78447, off(squasht)); // movzx eax, word [edi+eax*2+..]
-          put2(0x89e9);                // mov ecx, ebp ; y
-          put3(0xc1e10f);              // shl ecx, 15
-          put2(0x29e9);                // sub ecx, ebp ; y*32767
-          put2(0x29c1);                // sub ecx, eax ; err
-          put2a(0x8b87, off(p[cp[2]]));// mov eax, [edi+&p[j]]
-          put3(0x0fafc1);              // imul eax, ecx
-          put1a(0x05, (1<<12));        // add eax, 4096
-          put3(0xc1f80d);              // sar eax, 13
-          put3(0x0304d6);              // add eax, [esi+edx*8] ; wt[0]
-          put1a(0xbb, (1<<19)-1);      // mov ebx, 524287
-          put2(0x39d8);                // cmp eax, ebx
-          put3(0x0f4fc3);              // cmovg eax, ebx
-          put2(0xf7d3);                // not ebx ; -524288
-          put2(0x39d8);                // cmp eax, ebx
-          put3(0x0f4cc3);              // cmovl eax, ebx
-          put3(0x8904d6);              // mov [esi+edx*8], eax
-          put3(0x83c110);              // add ecx, 16 ; err
-          put3(0xc1f905);              // sar ecx, 5
-          put4(0x034cd604);            // add ecx, [esi+edx*8+4] ; wt[1]
-          put1a(0xb8, (1<<19)-1);      // mov eax, 524287
-          put2(0x39c1);                // cmp ecx, eax
-          put3(0x0f4fc8);              // cmovg ecx, eax
-          put2(0xf7d0);                // not eax ; -524288
-          put2(0x39c1);                // cmp ecx, eax
-          put3(0x0f4cc8);              // cmovl ecx, eax
-          put4(0x894cd604);            // mov [esi+edx*8+4], ecx
-        }
-        break;
-
-      case MATCH: // sizebits bufbits:
-                  //   a=len, b=offset, c=bit, cm=index, cxt=bitpos
-                  //   ht=buf, limit=pos
-        // assert(cr.a<=255);
-        // assert(cr.c==0 || cr.c==1);
-        // assert(cr.cxt<8);
-        // assert(cr.cm.size()==(size_t(1)<<cp[1]));
-        // assert(cr.ht.size()==(size_t(1)<<cp[2]));
-        // if (int(cr.c)!=y) cr.a=0;  // mismatch?
-        // cr.ht(cr.limit)+=cr.ht(cr.limit)+y;
-        // if (++cr.cxt==8) {
-        //   cr.cxt=0;
-        //   ++cr.limit;
-        //   cr.limit&=(1<<cp[2])-1;
-        //   if (cr.a==0) {  // look for a match
-        //     cr.b=cr.limit-cr.cm(h[i]);
-        //     if (cr.b&(cr.ht.size()-1))
-        //       while (cr.a<255
-        //              && cr.ht(cr.limit-cr.a-1)==cr.ht(cr.limit-cr.a-cr.b-1))
-        //         ++cr.a;
-        //   }
-        //   else cr.a+=cr.a<255;
-        //   cr.cm(h[i])=cr.limit;
-        // }
-
-        // Set pointers ebx=&cm, esi=&ht
-        if (S==8) put1(0x48);          // rex.w
-        put2a(0x8bb7, offc(ht));       // mov esi, [edi+&ht]
-        if (S==8) put1(0x48);          // rex.w
-        put2a(0x8b9f, offc(cm));       // mov ebx, [edi+&cm]
-
-        // if (c!=y) a=0;
-        put2a(0x8b87, offc(c));        // mov eax, [edi+&c]
-        put2(0x39e8);                  // cmp eax, ebp ; y
-        put2(0x7408);                  // jz L1
-        put2(0x31c0);                  // xor eax, eax
-        put2a(0x8987, offc(a));        // mov [edi+&a], eax
-
-        // ht(limit)+=ht(limit)+y  (1E)
-        put2a(0x8b87, offc(limit));    // mov eax, [edi+&limit]
-        put4(0x0fb60c06);              // movzx, ecx, byte [esi+eax]
-        put2(0x01c9);                  // add ecx, ecx
-        put2(0x01e9);                  // add ecx, ebp
-        put3(0x880c06);                // mov [esi+eax], cl
-
-        // if (++cxt==8)
-        put2a(0x8b87, offc(cxt));      // mov eax, [edi+&cxt]
-        put2(0xffc0);                  // inc eax
-        put3(0x83e007);                // and eax,byte +0x7
-        put2a(0x8987, offc(cxt));      // mov [edi+&cxt],eax
-        put2a(0x0f85, 0x9b);           // jnz L8
-
-        // ++limit;
-        // limit&=bufsize-1;
-        put2a(0x8b87, offc(limit));    // mov eax,[edi+&limit]
-        put2(0xffc0);                  // inc eax
-        put1a(0x25, (1<<cp[2])-1);     // and eax, bufsize-1
-        put2a(0x8987, offc(limit));    // mov [edi+&limit],eax
-
-        // if (a==0)
-        put2a(0x8b87, offc(a));        // mov eax, [edi+&a]
-        put2(0x85c0);                  // test eax,eax
-        put2(0x755c);                  // jnz L6
-
-        //   b=limit-cm(h[i])
-        put2a(0x8b8f, off(h[i]));      // mov ecx,[edi+h[i]]
-        put2a(0x81e1, (1<<cp[1])-1);   // and ecx, size-1
-        put2a(0x8b87, offc(limit));    // mov eax,[edi-&limit]
-        put3(0x2b048b);                // sub eax,[ebx+ecx*4]
-        put2a(0x8987, offc(b));        // mov [edi+&b],eax
-
-        //   if (b&(bufsize-1))
-        put1a(0xa9, (1<<cp[2])-1);     // test eax, bufsize-1
-        put2(0x7448);                  // jz L7
-
-        //      while (a<255 && ht(limit-a-1)==ht(limit-a-b-1)) ++a;
-        put1(0x53);                    // push ebx
-        put2a(0x8b9f, offc(limit));    // mov ebx,[edi+&limit]
-        put2(0x89da);                  // mov edx,ebx
-        put2(0x29c3);                  // sub ebx,eax  ; limit-b
-        put2(0x31c9);                  // xor ecx,ecx  ; a=0
-        put2a(0x81f9, 0xff);           // L2: cmp ecx,0xff ; while
-        put2(0x741c);                  // jz L3 ; break
-        put2(0xffca);                  // dec edx
-        put2(0xffcb);                  // dec ebx
-        put2a(0x81e2, (1<<cp[2])-1);   // and edx, bufsize-1
-        put2a(0x81e3, (1<<cp[2])-1);   // and ebx, bufsize-1
-        put3(0x8a0416);                // mov al,[esi+edx]
-        put3(0x3a041e);                // cmp al,[esi+ebx]
-        put2(0x7504);                  // jnz L3 ; break
-        put2(0xffc1);                  // inc ecx
-        put2(0xebdc);                  // jmp short L2 ; end while
-        put1(0x5b);                    // L3: pop ebx
-        put2a(0x898f, offc(a));        // mov [edi+&a],ecx
-        put2(0xeb0e);                  // jmp short L7
-
-        // a+=(a<255)
-        put1a(0x3d, 0xff);             // L6: cmp eax, 0xff ; a
-        put3(0x83d000);                // adc eax, 0
-        put2a(0x8987, offc(a));        // mov [edi+&a],eax
-
-        // cm(h[i])=limit
-        put2a(0x8b87, off(h[i]));      // L7: mov eax,[edi+&h[i]]
-        put1a(0x25, (1<<cp[1])-1);     // and eax, size-1
-        put2a(0x8b8f, offc(limit));    // mov ecx,[edi+&limit]
-        put3(0x890c83);                // mov [ebx+eax*4],ecx
-                                       // L8:
-        break;
-
-      case AVG:  // j k wt
-        break;
-
-      case MIX2: // sizebits j k rate mask
-                 // cm=wt[size], cxt=input
-        // assert(cr.a16.size()==cr.c);
-        // assert(cr.cxt<cr.a16.size());
-        // int err=(y*32767-squash(p[i]))*cp[4]>>5;
-        // int w=cr.a16[cr.cxt];
-        // w+=(err*(p[cp[2]]-p[cp[3]])+(1<<12))>>13;
-        // if (w<0) w=0;
-        // if (w>65535) w=65535;
-        // cr.a16[cr.cxt]=w;
-
-        // set ecx=err
-        put2a(0x8b87, off(p[i]));      // mov eax, [edi+&p[i]]
-        put1a(0x05, 2048);             // add eax, 2048
-        put4a(0x0fb78447, off(squasht));//movzx eax, word [edi+eax*2+&squasht]
-        put2(0x89e9);                  // mov ecx, ebp ; y
-        put3(0xc1e10f);                // shl ecx, 15
-        put2(0x29e9);                  // sub ecx, ebp ; y*32767
-        put2(0x29c1);                  // sub ecx, eax
-        put2a(0x69c9, cp[4]);          // imul ecx, rate
-        put3(0xc1f905);                // sar ecx, 5  ; err
-
-        // Update w
-        put2a(0x8b87, offc(cxt));      // mov eax, [edi+&cxt]
-        if (S==8) put1(0x48);          // rex.w
-        put2a(0x8bb7, offc(a16));      // mov esi, [edi+&a16]
-        if (S==8) put1(0x48);          // rex.w
-        put3(0x8d3446);                // lea esi, [esi+eax*2] ; &w
-        put2a(0x8b87, off(p[cp[2]]));  // mov eax, [edi+&p[j]]
-        put2a(0x2b87, off(p[cp[3]]));  // sub eax, [edi+&p[k]] ; p[j]-p[k]
-        put3(0x0fafc1);                // imul eax, ecx  ; * err
-        put1a(0x05, 1<<12);            // add eax, 4096
-        put3(0xc1f80d);                // sar eax, 13
-        put3(0x0fb716);                // movzx edx, word [esi] ; w
-        put2(0x01d0);                  // add eax, edx
-        put1a(0xba, 0xffff);           // mov edx, 65535
-        put2(0x39d0);                  // cmp eax, edx
-        put3(0x0f4fc2);                // cmovg eax, edx
-        put2(0x31d2);                  // xor edx, edx
-        put2(0x39d0);                  // cmp eax, edx
-        put3(0x0f4cc2);                // cmovl eax, edx
-        put3(0x668906);                // mov word [esi], ax
-        break;
-
-      case MIX: // sizebits j m rate mask
-                // cm=wt[size][m], cxt=input
-        // int m=cp[3];
-        // assert(m>0 && m<=i);
-        // assert(cr.cm.size()==m*cr.c);
-        // assert(cr.cxt+m<=cr.cm.size());
-        // int err=(y*32767-squash(p[i]))*cp[4]>>4;
-        // int* wt=(int*)&cr.cm[cr.cxt];
-        // for (int j=0; j<m; ++j)
-        //   wt[j]=clamp512k(wt[j]+((err*p[cp[2]+j]+(1<<12))>>13));
-
-        // set ecx=err
-        put2a(0x8b87, off(p[i]));      // mov eax, [edi+&p[i]]
-        put1a(0x05, 2048);             // add eax, 2048
-        put4a(0x0fb78447, off(squasht));//movzx eax, word [edi+eax*2+&squasht]
-        put2(0x89e9);                  // mov ecx, ebp ; y
-        put3(0xc1e10f);                // shl ecx, 15
-        put2(0x29e9);                  // sub ecx, ebp ; y*32767
-        put2(0x29c1);                  // sub ecx, eax
-        put2a(0x69c9, cp[4]);          // imul ecx, rate
-        put3(0xc1f904);                // sar ecx, 4  ; err
-
-        // set esi=wt
-        put2a(0x8b87, offc(cxt));      // mov eax, [edi+&cxt] ; cxt
-        if (S==8) put1(0x48);          // rex.w
-        put2a(0x8bb7, offc(cm));       // mov esi, [edi+&cm]
-        if (S==8) put1(0x48);          // rex.w
-        put3(0x8d3486);                // lea esi, [esi+eax*4] ; wt
-
-        for (int k=0; k<cp[3]; ++k) {
-          put2a(0x8b87,off(p[cp[2]+k]));//mov eax, [edi+&p[cp[2]+k]
-          put3(0x0fafc1);              // imul eax, ecx
-          put1a(0x05, 1<<12);          // add eax, 1<<12
-          put3(0xc1f80d);              // sar eax, 13
-          put2(0x0306);                // add eax, [esi]
-          put1a(0xba, (1<<19)-1);      // mov edx, (1<<19)-1
-          put2(0x39d0);                // cmp eax, edx
-          put3(0x0f4fc2);              // cmovg eax, edx
-          put2(0xf7d2);                // not edx
-          put2(0x39d0);                // cmp eax, edx
-          put3(0x0f4cc2);              // cmovl eax, edx
-          put2(0x8906);                // mov [esi], eax
-          if (k<cp[3]-1) {
-            if (S==8) put1(0x48);      // rex.w
-            put3(0x83c604);            // add esi, 4
-          }
-        }
-        break;
-
-      default:
-        error("invalid ZPAQ component");
-    }
-  }
-
-  // return from update()
-  put1(0x5f);                 // pop edi
-  put1(0x5e);                 // pop esi
-  put1(0x5d);                 // pop ebp
-  put1(0x5b);                 // pop ebx
-  put1(0xc3);                 // ret
-
-  return o;
-}
-
-#endif // ifndef NOJIT
-
-// Return a prediction of the next bit in range 0..32767
-// Use JIT code starting at pcode[0] if available, or else create it.
-int Predictor::predict() {
-#ifdef NOJIT
-  return predict0();
-#else
-  if (!pcode) {
-    int n=assemble_p();
-    allocx(pcode, pcode_size, n);
-    if (!pcode || n!=assemble_p() || n<10 || pcode_size<10)
-      error("predictor JIT failed");
-  }
-  assert(pcode && pcode[0]);
-  return ((int(*)(Predictor*))&pcode[0])(this);
-#endif
-}
-
-// Update the model with bit y = 0..1
-// Use the JIT code starting at pcode[5].
-void Predictor::update(int y) {
-#ifdef NOJIT
-  update0(y);
-#else
-  assert(pcode && pcode[5]);
-  ((void(*)(Predictor*, int))&pcode[5])(this, y);
-
-  // Save bit y in c8, hmap4 (not implemented in JIT)
-  c8+=c8+y;
-  if (c8>=256) {
-    z.run(c8-256);
-    hmap4=1;
-    c8=1;
-    for (int i=0; i<z.header[6]; ++i) h[i]=z.H(i);
-  }
-  else if (c8>=16 && c8<32)
-    hmap4=(hmap4&0xf)<<5|y<<4|1;
-  else
-    hmap4=(hmap4&0x1f0)|(((hmap4&0xf)*2+y)&0xf);
-#endif
-}
-
-// Execute the ZPAQL code with input byte or -1 for EOF.
-// Use JIT code at rcode if available, or else create it.
-void ZPAQL::run(U32 input) {
-#ifdef NOJIT
-  run0(input);
-#else
-  if (!rcode) {
-    int n=assemble();
-    allocx(rcode, rcode_size, n);
-    if (!rcode || n<10 || rcode_size<10 || n!=assemble())
-      error("run JIT failed");
-  }
-  a=input;
-  if (!((int(*)())(&rcode[0]))())
-    libzpaq::error("Bad ZPAQL opcode");
-#endif
-}
-
-}  // end namespace libzpaq
+/* libzpaq.cpp - Part of LIBZPAQ Version 5.01
+
+  Copyright (C) 2011, Dell Inc. Written by Matt Mahoney.
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so without restriction.
+  This Software is provided "as is" without warranty.
+
+LIBZPAQ is a C++ library for compression and decompression of data
+conforming to the ZPAQ level 2 standard. See http://mattmahoney.net/zpaq/
+*/
+
+#include "libzpaq.h"
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#ifndef NOJIT
+#ifdef unix
+#include <sys/mman.h>
+#else
+#include <windows.h>
+#endif
+#endif
+
+namespace libzpaq {
+
+// Standard library redirections: libzpaq-scope names forwarding to global C
+void* calloc(size_t a, size_t b) {return ::calloc(a, b);}
+void free(void* p) {::free(p);}
+int memcmp(const void* d, const void* s, size_t n) {
+  return ::memcmp(d, s, n);}
+void* memset(void* d, int c, size_t n) {return ::memset(d, c, n);}
+double log(double x) {return ::log(x);}
+double exp(double x) {return ::exp(x);}
+double pow(double x, double y) {return ::pow(x, y);}
+
+// Decode an unsigned 16 bit little-endian number from p[0..1]
+int toU16(const char* p) {
+  return (p[0]&255)|((p[1]&255)<<8);
+}
+
+// Default read() and write(): byte-at-a-time fallbacks built on get()/put()
+int Reader::read(char* buf, int n) {
+  int count=0;  // bytes stored so far; also the return value
+  for (int ch; count<n && (ch=get())>=0; ++count)  // stop when get()<0
+    buf[count]=ch;
+  return count;
+}
+
+void Writer::write(const char* buf, int n) {
+  int k=0;  // emit buf[0..n-1] one byte at a time through put()
+  while (k<n) put(U8(buf[k++]));
+}
+
+///////////////////////// allocx //////////////////////
+
+// Allocate newsize > 0 bytes of executable memory, point p at it
+// and set n = newsize. Any block previously held in p (of size n)
+// is released first. If newsize is 0 then free only, leaving p=0, n=0.
+// Calls error() if the allocation fails. If NOJIT, ignore newsize
+// and set p=0, n=0 without allocating memory.
+void allocx(U8* &p, int &n, int newsize) {
+#ifdef NOJIT
+  p=0;
+  n=0;
+#else
+  if (p || n) {  // release whatever the caller currently holds
+    if (p)
+#ifdef unix
+      munmap(p, n);
+#else // Windows
+      VirtualFree(p, 0, MEM_RELEASE);
+#endif
+    p=0;
+    n=0;
+  }
+  if (newsize>0) {
+#ifdef unix
+    p=(U8*)mmap(0, newsize, PROT_READ|PROT_WRITE|PROT_EXEC,
+                MAP_PRIVATE|MAP_ANON, -1, 0);
+    if ((void*)p==MAP_FAILED) p=0;  // normalize failure to a null pointer
+#else
+    p=(U8*)VirtualAlloc(0, newsize, MEM_RESERVE|MEM_COMMIT,
+                        PAGE_EXECUTE_READWRITE);
+#endif
+    if (p)
+      n=newsize;
+    else {
+      n=0;  // leave p=0, n=0 before reporting the failure
+      error("allocx failed");
+    }
+  }
+#endif
+}
+
+//////////////////////////// SHA1 ////////////////////////////
+
+// SHA1 code, see http://en.wikipedia.org/wiki/SHA-1
+
+// Start a new hash: zero the length counters and load the initial state
+void SHA1::init() {
+  static const U32 iv[5]={   // SHA-1 initial values for h[0..4]
+    0x67452301, 0xEFCDAB89, 0x98BADCFE,
+    0x10325476, 0xC3D2E1F0};
+  for (int i=0; i<5; ++i)
+    h[i]=iv[i];
+  len0=len1=0;
+}
+
+// Finish the current hash, return its 20 bytes in hbuf, and start a new hash
+const char* SHA1::result() {
+  // Snapshot the length counters first: the padding below goes through put()
+  const U32 hiLen=len1, loLen=len0;
+
+  // Pad with 0x80 then zeros until len0 is 448 mod 512
+  put(0x80);
+  while ((len0&511)!=448)
+    put(0);
+
+  // Append the saved length, most significant byte first (len1 then len0)
+  for (int shift=24; shift>=0; shift-=8)
+    put(hiLen>>shift);
+  for (int shift=24; shift>=0; shift-=8)
+    put(loLen>>shift);
+
+  // Serialize h[0..4] into hbuf, most significant byte first
+  for (int word=0; word<5; ++word) {
+    char* dst=hbuf+4*word;
+    dst[0]=h[word]>>24;
+    dst[1]=h[word]>>16;
+    dst[2]=h[word]>>8;
+    dst[3]=h[word];
+  }
+
+  // Reset the state; hbuf still holds the hash just computed
+  init();
+  return hbuf;
+}
+
+// Hash 1 block of 64 bytes taken from w[0..15], folding it into h[0..4]
+void SHA1::process() {
+  for (int i=16; i<80; ++i) {  // extend the 16 input words to 80
+    w[i]=w[i-3]^w[i-8]^w[i-14]^w[i-16];
+    w[i]=w[i]<<1|w[i]>>31;  // rotate left by 1 (assuming 32 bit w[])
+  }
+  U32 a=h[0];  // working copies of the state
+  U32 b=h[1];
+  U32 c=h[2];
+  U32 d=h[3];
+  U32 e=h[4];
+  const U32 k1=0x5A827999, k2=0x6ED9EBA1, k3=0x8F1BBCDC, k4=0xCA62C1D6;
+#define f1(a,b,c,d,e,i) e+=(a<<5|a>>27)+((b&c)|(~b&d))+k1+w[i]; b=b<<30|b>>2;
+#define f5(i) f1(a,b,c,d,e,i) f1(e,a,b,c,d,i+1) f1(d,e,a,b,c,i+2) \
+              f1(c,d,e,a,b,i+3) f1(b,c,d,e,a,i+4)
+  f5(0) f5(5) f5(10) f5(15)  // steps 0..19: select function, constant k1
+#undef f1
+#define f1(a,b,c,d,e,i) e+=(a<<5|a>>27)+(b^c^d)+k2+w[i]; b=b<<30|b>>2;
+  f5(20) f5(25) f5(30) f5(35)  // steps 20..39: xor function, constant k2
+#undef f1
+#define f1(a,b,c,d,e,i) e+=(a<<5|a>>27)+((b&c)|(b&d)|(c&d))+k3+w[i]; b=b<<30|b>>2;
+  f5(40) f5(45) f5(50) f5(55)  // steps 40..59: majority function, constant k3
+#undef f1
+#define f1(a,b,c,d,e,i) e+=(a<<5|a>>27)+(b^c^d)+k4+w[i]; b=b<<30|b>>2;
+  f5(60) f5(65) f5(70) f5(75)  // steps 60..79: xor function, constant k4
+#undef f1
+#undef f5
+  h[0]+=a;  // add the working values back into the state
+  h[1]+=b;
+  h[2]+=c;
+  h[3]+=d;
+  h[4]+=e;
+}
+
+//////////////////////////// Component ///////////////////////
+
+// A Component is a context model, indirect context model, match model,
+// fixed weight mixer, adaptive 2 input mixer without or with current
+// partial byte as context, adaptive m input mixer (without or with),
+// or SSE (without or with).
+
+const int compsize[256]={0,2,3,2,3,4,6,6,3,5};  // bytes per COMP entry, indexed by type byte; 0 = invalid type
+
+void Component::init() {
+  cm.resize(0);   // empty all three arrays
+  ht.resize(0);
+  a16.resize(0);
+  limit=0; cxt=0; a=0; b=0; c=0;  // zero the scalar state
+}
+
+////////////////////////// StateTable //////////////////////////
+
+// Number of states (0, 1 or 2) for a history with n0 zeros and n1 ones
+int StateTable::num_states(int n0, int n1) {
+  static const int bound[6]={20,48,15,8,6,5};  // smaller count -> max larger
+  const int hi=n0<n1 ? n1 : n0, lo=n0<n1 ? n0 : n1;  // symmetric in n0,n1
+  if (lo<0 || lo>=6 || hi>bound[lo]) return 0;  // pair is not representable
+  const bool twoStates=lo>0 && hi+lo<=17;
+  return twoStates ? 2 : 1;
+}
+
+// Replace count n0 when the opposite bit is seen: <=5 kept, 6->5, 7->6, >=8->7
+void StateTable::discount(int& n0) {
+  n0 = n0<1 ? 0 : n0<6 ? n0 : n0<8 ? n0-1 : 7;
+}
+
+// Update the counts n0,n1 in place for an observed bit y (0 or 1)
+void StateTable::next_state(int& n0, int& n1, int y) {
+  if (n0<n1)  // mirror case: swap the roles of the counts and invert y
+    next_state(n1, n0, 1-y);
+  else {
+    if (y) {  // saw a 1: increment n1 and discount the opposing count
+      ++n1;
+      discount(n0);
+    }
+    else {  // saw a 0: increment n0 and discount the opposing count
+      ++n0;
+      discount(n1);
+    }
+    // 20,0,0 -> 20,0
+    // 48,1,0 -> 48,1
+    // 15,2,0 -> 8,1
+    //  8,3,0 -> 6,2
+    //  8,3,1 -> 5,3
+    //  6,4,0 -> 5,3
+    //  5,5,0 -> 5,4
+    //  5,5,1 -> 4,5
+    while (!num_states(n0, n1)) {  // reduce until the pair has a state
+      if (n1<2) --n0;
+      else {
+        n0=(n0*(n1-1)+(n1/2))/n1;  // scale n0 by (n1-1)/n1, rounded
+        --n1;
+      }
+    }
+  }
+}
+
+// Initialize next state table ns[state*4] -> next if 0, next if 1, n0, n1
+StateTable::StateTable() {
+
+  // Assign states by increasing priority
+  const int N=50;
+  U8 t[N][N][2]={{{0}}}; // (n0,n1,y) -> state number
+  int state=0;
+  for (int i=0; i<N; ++i) {  // i = n0+n1, so states are numbered by total count
+    for (int n1=0; n1<=i; ++n1) {
+      int n0=i-n1;
+      int n=num_states(n0, n1);
+      assert(n>=0 && n<=2);
+      if (n) {
+        t[n0][n1][0]=state;
+        t[n0][n1][1]=state+n-1;  // same number as [0] when the pair has one state
+        state+=n;
+      }
+    }
+  }
+       
+  // Generate next state table
+  memset(ns, 0, sizeof(ns));
+  for (int n0=0; n0<N; ++n0) {
+    for (int n1=0; n1<N; ++n1) {
+      for (int y=0; y<num_states(n0, n1); ++y) {  // each state of this pair
+        int s=t[n0][n1][y];
+        assert(s>=0 && s<256);
+        int s0=n0, s1=n1;
+        next_state(s0, s1, 0);  // successor counts after a 0 bit
+        assert(s0>=0 && s0<N && s1>=0 && s1<N);
+        ns[s*4+0]=t[s0][s1][0];
+        s0=n0, s1=n1;
+        next_state(s0, s1, 1);  // successor counts after a 1 bit
+        assert(s0>=0 && s0<N && s1>=0 && s1<N);
+        ns[s*4+1]=t[s0][s1][1];
+        ns[s*4+2]=n0;
+        ns[s*4+3]=n1;
+      }
+    }
+  }
+}
+
+/////////////////////////// ZPAQL //////////////////////////
+
+// Write the header to out2; return true if an HCOMP/PCOMP section is present
+// (false, writing nothing, when header has 6 bytes or fewer).
+bool ZPAQL::write(Writer* out2, bool pp) {
+  if (header.size()<=6) return false;
+  assert(header[0]+256*header[1]==cend-2+hend-hbegin);
+  assert(cend>=7);
+  assert(hbegin>=cend);
+  assert(hend>=hbegin);
+  assert(out2);
+  if (pp) {  // postprocessor: only the PCOMP size, low byte first
+    out2->put((hend-hbegin)&255);
+    out2->put((hend-hbegin)>>8);
+  }
+  else {  // otherwise copy header[0..cend-1] (the COMP part) as is
+    for (int k=0; k<cend; ++k)
+      out2->put(header[k]);
+  }
+  for (int pos=hbegin; pos<hend; ++pos)  // then header[hbegin..hend-1]
+    out2->put(header[pos]);
+  return true;
+}
+
+// Read header from in2; returns cend+hend-hbegin, the number of bytes read
+int ZPAQL::read(Reader* in2) {
+
+  // Get header size (2 bytes, low first) and allocate with 300 bytes of slack
+  int hsize=in2->get();
+  hsize+=in2->get()*256;
+  header.resize(hsize+300);
+  cend=hbegin=hend=0;
+  header[cend++]=hsize&255;
+  header[cend++]=hsize>>8;
+  while (cend<7) header[cend++]=in2->get(); // hh hm ph pm n
+
+  // Read COMP: n entries of a type byte plus compsize[type]-1 parameters
+  int n=header[cend-1];
+  for (int i=0; i<n; ++i) {
+    int type=in2->get();  // component type
+    if (type==-1) error("unexpected end of file");
+    header[cend++]=type;  // component type
+    int size=compsize[type];
+    if (size<1) error("Invalid component type");
+    if (cend+size>header.isize()-8) error("COMP list too big");
+    for (int j=1; j<size; ++j)
+      header[cend++]=in2->get();
+  }
+  if ((header[cend++]=in2->get())!=0) error("missing COMP END");
+
+  // Insert a guard gap and read HCOMP. A COMP section longer than hsize
+  // allows would put hend past the buffer, so reject it before storing.
+  hbegin=hend=cend+128;
+  if (hend>hsize+129) error("missing HCOMP");
+  while (hend<hsize+129) {
+    assert(hend<header.isize()-8);
+    int op=in2->get();
+    if (op==-1) error("unexpected end of file");
+    header[hend++]=op;
+  }
+  if ((header[hend++]=in2->get())!=0) error("missing HCOMP END");
+  assert(cend>=7 && cend<header.isize());
+  assert(hbegin==cend+128 && hbegin<header.isize());
+  assert(hend>hbegin && hend<header.isize());
+  assert(hsize==header[0]+256*header[1] && hsize==cend-2+hend-hbegin);
+  allocx(rcode, rcode_size, 0);  // clear JIT code
+  return cend+hend-hbegin;
+}
+
+// Reset to the empty state: section offsets and registers are zeroed, the
+// header, h, m and r arrays are resized to 0 and allocx() is called with
+// size 0 on rcode. The output and sha1 pointers are left as they are.
+void ZPAQL::clear() {
+
+  // COMP and HCOMP locations
+  cend=0;
+  hbegin=0;
+  hend=0;
+
+  // machine state
+  pc=0;
+  f=0;
+  d=0;
+  c=0;
+  b=0;
+  a=0;
+
+  // storage
+  header.resize(0);
+  h.resize(0);
+  m.resize(0);
+  r.resize(0);
+  allocx(rcode, rcode_size, 0);
+}
+
+// Construct an empty machine: no output or sha1 destination, no JIT code,
+// cleared state, and a 16384 byte output buffer with nothing pending.
+ZPAQL::ZPAQL() {
+  output=0;
+  sha1=0;
+
+  // rcode and rcode_size must be set before clear() passes them to allocx()
+  rcode=0;
+  rcode_size=0;
+  clear();
+
+  bufptr=0;
+  outbuf.resize(1<<14);
+}
+
+ZPAQL::~ZPAQL() {
+  // size 0 request, the same call clear() uses for rcode
+  allocx(rcode, rcode_size, 0);
+}
+
+// Set up the machine for HCOMP, sizing H and M from header[2] and header[3].
+// Expects that no output or sha1 destination is set.
+void ZPAQL::inith() {
+  assert(header.isize()>6);
+  assert(output==0);
+  assert(sha1==0);
+  const int hh=header[2];
+  const int hm=header[3];
+  init(hh, hm);
+}
+
+// Set up the machine for PCOMP, sizing H and M from header[4] and header[5].
+void ZPAQL::initp() {
+  assert(header.isize()>6);
+  const int ph=header[4];
+  const int pm=header[5];
+  init(ph, pm);
+}
+
+// Hand the bufptr bytes pending in outbuf to output and to sha1, whichever
+// of the two are set, then mark the buffer empty.
+void ZPAQL::flush() {
+  if (output) {
+    output->write(&outbuf[0], bufptr);
+  }
+  if (sha1) {
+    int k=0;
+    while (k<bufptr) {
+      sha1->put(U8(outbuf[k]));
+      ++k;
+    }
+  }
+  bufptr=0;
+}
+
+// Estimate, in bytes, the memory implied by the current header: the H and M
+// arrays for both programs, the header itself, and a per-component term.
+double ZPAQL::memory() {
+  double mem=pow(2.0,header[2]+2);  // hh
+  mem+=pow(2.0,header[3]);          // hm
+  mem+=pow(2.0,header[4]+2);        // ph
+  mem+=pow(2.0,header[5]);          // pm
+  mem+=header.size();
+
+  int cp=7;  // start of comp list
+  int i=0;
+  while (i<header[6]) {  // n
+    assert(cp<cend);
+    const int type=header[cp];
+    const double size=pow(2.0, header[cp+1]); // sizebits
+    switch(type) {
+      case CM:
+        mem+=4*size;
+        break;
+      case ICM:
+        mem+=64*size+1024;
+        break;
+      case MATCH:
+        mem+=4*size+pow(2.0, header[cp+2]); // bufbits
+        break;
+      case MIX2:
+        mem+=2*size;
+        break;
+      case MIX:
+        mem+=4*size*header[cp+3]; // m
+        break;
+      case ISSE:
+        mem+=64*size+2048;
+        break;
+      case SSE:
+        mem+=128*size;
+        break;
+    }
+    cp+=compsize[type];
+    ++i;
+  }
+  return mem;
+}
+
+// Prepare to run a program: resize H with hbits and M with mbits, give R
+// 256 elements, and zero all registers, the flag and the program counter.
+void ZPAQL::init(int hbits, int mbits) {
+  assert(header.isize()>0);
+  assert(cend>=7);
+  assert(hbegin>=cend+128);
+  assert(hend>=hbegin);
+  assert(hend<header.isize()-130);
+  assert(header[0]+256*header[1]==cend-2+hend-hbegin);
+  assert(bufptr==0);
+  assert(outbuf.isize()>0);
+
+  // storage
+  h.resize(1, hbits);
+  m.resize(1, mbits);
+  r.resize(256);
+
+  // machine state
+  f=0;
+  pc=0;
+  d=0;
+  c=0;
+  b=0;
+  a=0;
+}
+
+// Interpret the program in header: load input into A, start at hbegin and
+// keep calling execute() until it returns 0.
+void ZPAQL::run0(U32 input) {
+  assert(cend>6);
+  assert(hbegin>=cend+128);
+  assert(hend>=hbegin);
+  assert(hend<header.isize()-130);
+  assert(m.size()>0);
+  assert(h.size()>0);
+  assert(header[0]+256*header[1]==cend+hend-hbegin-2);
+  a=input;
+  pc=hbegin;
+  for (;;) {
+    if (!execute()) break;
+  }
+}
+
+// Execute the single instruction at header[pc]. Returns 0 after HALT, else 1.
+int ZPAQL::execute() {  // in the opcode comments A-D,F,R are a-d,f,r; *B,*C are m(b),m(c); *D is h(d)
+  switch(header[pc++]) {  // fetch opcode; an operand N is fetched with a further header[pc++]
+    case 0: err(); break; // ERROR: opcode 0 always calls err()
+    case 1: ++a; break; // A++
+    case 2: --a; break; // A--
+    case 3: a = ~a; break; // A!
+    case 4: a = 0; break; // A=0
+    case 7: a = r[header[pc++]]; break; // A=R N
+    case 8: swap(b); break; // B<>A
+    case 9: ++b; break; // B++
+    case 10: --b; break; // B--
+    case 11: b = ~b; break; // B!
+    case 12: b = 0; break; // B=0
+    case 15: b = r[header[pc++]]; break; // B=R N
+    case 16: swap(c); break; // C<>A
+    case 17: ++c; break; // C++
+    case 18: --c; break; // C--
+    case 19: c = ~c; break; // C!
+    case 20: c = 0; break; // C=0
+    case 23: c = r[header[pc++]]; break; // C=R N
+    case 24: swap(d); break; // D<>A
+    case 25: ++d; break; // D++
+    case 26: --d; break; // D--
+    case 27: d = ~d; break; // D!
+    case 28: d = 0; break; // D=0
+    case 31: d = r[header[pc++]]; break; // D=R N
+    case 32: swap(m(b)); break; // *B<>A
+    case 33: ++m(b); break; // *B++
+    case 34: --m(b); break; // *B--
+    case 35: m(b) = ~m(b); break; // *B!
+    case 36: m(b) = 0; break; // *B=0
+    case 39: if (f) pc+=((header[pc]+128)&255)-127; else ++pc; break; // JT N: if F, jump by signed 8 bit N from the next instruction
+    case 40: swap(m(c)); break; // *C<>A
+    case 41: ++m(c); break; // *C++
+    case 42: --m(c); break; // *C--
+    case 43: m(c) = ~m(c); break; // *C!
+    case 44: m(c) = 0; break; // *C=0
+    case 47: if (!f) pc+=((header[pc]+128)&255)-127; else ++pc; break; // JF N: as JT N but taken when F is 0
+    case 48: swap(h(d)); break; // *D<>A
+    case 49: ++h(d); break; // *D++
+    case 50: --h(d); break; // *D--
+    case 51: h(d) = ~h(d); break; // *D!
+    case 52: h(d) = 0; break; // *D=0
+    case 55: r[header[pc++]] = a; break; // R=A N
+    case 56: return 0  ; // HALT: the only path that returns 0
+    case 57: outc(a&255); break; // OUT: passes the low 8 bits of A to outc()
+    case 59: a = (a+m(b)+512)*773; break; // HASH
+    case 60: h(d) = (h(d)+a+512)*773; break; // HASHD
+    case 63: pc+=((header[pc]+128)&255)-127; break; // JMP N: unconditional form of JT N
+    case 64: a = a; break; // A=A
+    case 65: a = b; break; // A=B
+    case 66: a = c; break; // A=C
+    case 67: a = d; break; // A=D
+    case 68: a = m(b); break; // A=*B
+    case 69: a = m(c); break; // A=*C
+    case 70: a = h(d); break; // A=*D
+    case 71: a = header[pc++]; break; // A= N
+    case 72: b = a; break; // B=A
+    case 73: b = b; break; // B=B
+    case 74: b = c; break; // B=C
+    case 75: b = d; break; // B=D
+    case 76: b = m(b); break; // B=*B
+    case 77: b = m(c); break; // B=*C
+    case 78: b = h(d); break; // B=*D
+    case 79: b = header[pc++]; break; // B= N
+    case 80: c = a; break; // C=A
+    case 81: c = b; break; // C=B
+    case 82: c = c; break; // C=C
+    case 83: c = d; break; // C=D
+    case 84: c = m(b); break; // C=*B
+    case 85: c = m(c); break; // C=*C
+    case 86: c = h(d); break; // C=*D
+    case 87: c = header[pc++]; break; // C= N
+    case 88: d = a; break; // D=A
+    case 89: d = b; break; // D=B
+    case 90: d = c; break; // D=C
+    case 91: d = d; break; // D=D
+    case 92: d = m(b); break; // D=*B
+    case 93: d = m(c); break; // D=*C
+    case 94: d = h(d); break; // D=*D
+    case 95: d = header[pc++]; break; // D= N
+    case 96: m(b) = a; break; // *B=A
+    case 97: m(b) = b; break; // *B=B
+    case 98: m(b) = c; break; // *B=C
+    case 99: m(b) = d; break; // *B=D
+    case 100: m(b) = m(b); break; // *B=*B
+    case 101: m(b) = m(c); break; // *B=*C
+    case 102: m(b) = h(d); break; // *B=*D
+    case 103: m(b) = header[pc++]; break; // *B= N
+    case 104: m(c) = a; break; // *C=A
+    case 105: m(c) = b; break; // *C=B
+    case 106: m(c) = c; break; // *C=C
+    case 107: m(c) = d; break; // *C=D
+    case 108: m(c) = m(b); break; // *C=*B
+    case 109: m(c) = m(c); break; // *C=*C
+    case 110: m(c) = h(d); break; // *C=*D
+    case 111: m(c) = header[pc++]; break; // *C= N
+    case 112: h(d) = a; break; // *D=A
+    case 113: h(d) = b; break; // *D=B
+    case 114: h(d) = c; break; // *D=C
+    case 115: h(d) = d; break; // *D=D
+    case 116: h(d) = m(b); break; // *D=*B
+    case 117: h(d) = m(c); break; // *D=*C
+    case 118: h(d) = h(d); break; // *D=*D
+    case 119: h(d) = header[pc++]; break; // *D= N
+    case 128: a += a; break; // A+=A
+    case 129: a += b; break; // A+=B
+    case 130: a += c; break; // A+=C
+    case 131: a += d; break; // A+=D
+    case 132: a += m(b); break; // A+=*B
+    case 133: a += m(c); break; // A+=*C
+    case 134: a += h(d); break; // A+=*D
+    case 135: a += header[pc++]; break; // A+= N
+    case 136: a -= a; break; // A-=A
+    case 137: a -= b; break; // A-=B
+    case 138: a -= c; break; // A-=C
+    case 139: a -= d; break; // A-=D
+    case 140: a -= m(b); break; // A-=*B
+    case 141: a -= m(c); break; // A-=*C
+    case 142: a -= h(d); break; // A-=*D
+    case 143: a -= header[pc++]; break; // A-= N
+    case 144: a *= a; break; // A*=A
+    case 145: a *= b; break; // A*=B
+    case 146: a *= c; break; // A*=C
+    case 147: a *= d; break; // A*=D
+    case 148: a *= m(b); break; // A*=*B
+    case 149: a *= m(c); break; // A*=*C
+    case 150: a *= h(d); break; // A*=*D
+    case 151: a *= header[pc++]; break; // A*= N
+    case 152: div(a); break; // A/=A
+    case 153: div(b); break; // A/=B
+    case 154: div(c); break; // A/=C
+    case 155: div(d); break; // A/=D
+    case 156: div(m(b)); break; // A/=*B
+    case 157: div(m(c)); break; // A/=*C
+    case 158: div(h(d)); break; // A/=*D
+    case 159: div(header[pc++]); break; // A/= N
+    case 160: mod(a); break; // A%=A
+    case 161: mod(b); break; // A%=B
+    case 162: mod(c); break; // A%=C
+    case 163: mod(d); break; // A%=D
+    case 164: mod(m(b)); break; // A%=*B
+    case 165: mod(m(c)); break; // A%=*C
+    case 166: mod(h(d)); break; // A%=*D
+    case 167: mod(header[pc++]); break; // A%= N
+    case 168: a &= a; break; // A&=A
+    case 169: a &= b; break; // A&=B
+    case 170: a &= c; break; // A&=C
+    case 171: a &= d; break; // A&=D
+    case 172: a &= m(b); break; // A&=*B
+    case 173: a &= m(c); break; // A&=*C
+    case 174: a &= h(d); break; // A&=*D
+    case 175: a &= header[pc++]; break; // A&= N
+    case 176: a &= ~ a; break; // A&~A
+    case 177: a &= ~ b; break; // A&~B
+    case 178: a &= ~ c; break; // A&~C
+    case 179: a &= ~ d; break; // A&~D
+    case 180: a &= ~ m(b); break; // A&~*B
+    case 181: a &= ~ m(c); break; // A&~*C
+    case 182: a &= ~ h(d); break; // A&~*D
+    case 183: a &= ~ header[pc++]; break; // A&~ N
+    case 184: a |= a; break; // A|=A
+    case 185: a |= b; break; // A|=B
+    case 186: a |= c; break; // A|=C
+    case 187: a |= d; break; // A|=D
+    case 188: a |= m(b); break; // A|=*B
+    case 189: a |= m(c); break; // A|=*C
+    case 190: a |= h(d); break; // A|=*D
+    case 191: a |= header[pc++]; break; // A|= N
+    case 192: a ^= a; break; // A^=A
+    case 193: a ^= b; break; // A^=B
+    case 194: a ^= c; break; // A^=C
+    case 195: a ^= d; break; // A^=D
+    case 196: a ^= m(b); break; // A^=*B
+    case 197: a ^= m(c); break; // A^=*C
+    case 198: a ^= h(d); break; // A^=*D
+    case 199: a ^= header[pc++]; break; // A^= N
+    case 200: a <<= (a&31); break; // A<<=A (all shift counts are masked to 0..31)
+    case 201: a <<= (b&31); break; // A<<=B
+    case 202: a <<= (c&31); break; // A<<=C
+    case 203: a <<= (d&31); break; // A<<=D
+    case 204: a <<= (m(b)&31); break; // A<<=*B
+    case 205: a <<= (m(c)&31); break; // A<<=*C
+    case 206: a <<= (h(d)&31); break; // A<<=*D
+    case 207: a <<= (header[pc++]&31); break; // A<<= N
+    case 208: a >>= (a&31); break; // A>>=A
+    case 209: a >>= (b&31); break; // A>>=B
+    case 210: a >>= (c&31); break; // A>>=C
+    case 211: a >>= (d&31); break; // A>>=D
+    case 212: a >>= (m(b)&31); break; // A>>=*B
+    case 213: a >>= (m(c)&31); break; // A>>=*C
+    case 214: a >>= (h(d)&31); break; // A>>=*D
+    case 215: a >>= (header[pc++]&31); break; // A>>= N
+    case 216: f = (a == a); break; // A==A (comparisons store their result in F)
+    case 217: f = (a == b); break; // A==B
+    case 218: f = (a == c); break; // A==C
+    case 219: f = (a == d); break; // A==D
+    case 220: f = (a == U32(m(b))); break; // A==*B
+    case 221: f = (a == U32(m(c))); break; // A==*C
+    case 222: f = (a == h(d)); break; // A==*D
+    case 223: f = (a == U32(header[pc++])); break; // A== N
+    case 224: f = (a < a); break; // A<A
+    case 225: f = (a < b); break; // A<B
+    case 226: f = (a < c); break; // A<C
+    case 227: f = (a < d); break; // A<D
+    case 228: f = (a < U32(m(b))); break; // A<*B
+    case 229: f = (a < U32(m(c))); break; // A<*C
+    case 230: f = (a < h(d)); break; // A<*D
+    case 231: f = (a < U32(header[pc++])); break; // A< N
+    case 232: f = (a > a); break; // A>A
+    case 233: f = (a > b); break; // A>B
+    case 234: f = (a > c); break; // A>C
+    case 235: f = (a > d); break; // A>D
+    case 236: f = (a > U32(m(b))); break; // A>*B
+    case 237: f = (a > U32(m(c))); break; // A>*C
+    case 238: f = (a > h(d)); break; // A>*D
+    case 239: f = (a > U32(header[pc++])); break; // A> N
+    case 255: if((pc=hbegin+header[pc]+256*header[pc+1])>=hend)err();break;//LJ: pc=hbegin+16 bit operand (low byte first); err() if that is >=hend
+    default: err(); // every opcode value without a case above
+  }
+  return 1; // reached by every instruction except HALT
+}
+
+// Print illegal instruction error message and exit
+void ZPAQL::err() {
+  error("ZPAQL execution error");
+}
+
+///////////////////////// Predictor /////////////////////////
+
+// Initialize the model-independent lookup tables (dt2k, dt, stretcht,
+// squasht) and, in debug builds, check them against known checksums.
+Predictor::Predictor(ZPAQL& zr):
+    c8(1), hmap4(1), z(zr) {
+  assert(sizeof(U8)==1);
+  assert(sizeof(U16)==2);
+  assert(sizeof(U32)==4);
+  assert(sizeof(U64)==8);
+  assert(sizeof(short)==2);
+  assert(sizeof(int)==4);
+
+  // No JIT code yet
+  pcode=0;
+  pcode_size=0;
+
+  // Reciprocal tables
+  dt2k[0]=0;
+  for (int k=1; k<256; ++k) {
+    dt2k[k]=2048/k;
+  }
+  for (int k=0; k<1024; ++k) {
+    dt[k]=(1<<17)/(k*2+3)*2;
+  }
+
+  // stretch and squash tables
+  for (int k=0; k<32768; ++k) {
+    stretcht[k]=int(log((k+0.5)/(32767.5-k))*64+0.5+100000)-100000;
+  }
+  for (int k=0; k<4096; ++k) {
+    squasht[k]=int(32768.0/(1+exp((k-2048)*(-1.0/64))));
+  }
+
+  // Verify floating point math for squash() and stretch()
+  U32 stsum=0;
+  int k=32767;
+  while (k>=0) {
+    stsum=stsum*3+stretch(k);
+    --k;
+  }
+  U32 sqsum=0;
+  k=4095;
+  while (k>=0) {
+    sqsum=sqsum*3+squash(k-2048);
+    --k;
+  }
+  assert(stsum==3887533746u);
+  assert(sqsum==2278286169u);
+}
+
+Predictor::~Predictor()
+{
+  // free executable memory
+  allocx(pcode, pcode_size, 0);
+}
+
+// Initialize the predictor for the model in z.header: reset state, validate each component's arguments and size its tables
+void Predictor::init() {
+
+  // Clear old JIT code if any
+  allocx(pcode, pcode_size, 0);
+
+  // Initialize context hash function
+  z.inith();
+
+  // Initialize predictions
+  for (int i=0; i<256; ++i) h[i]=p[i]=0;
+
+  // Initialize components
+  for (int i=0; i<256; ++i)  // clear old model
+    comp[i].init();
+  int n=z.header[6]; // hsize[0..1] hh hm ph pm n (comp)[n] END 0[128] (hcomp) END
+  const U8* cp=&z.header[7];  // start of component list
+  for (int i=0; i<n; ++i) {  // cp[0] is the type of component i, cp[1..] are its arguments
+    assert(cp<&z.header[z.cend]);
+    assert(cp>&z.header[0] && cp<&z.header[z.header.isize()-8]);
+    Component& cr=comp[i];
+    switch(cp[0]) {
+      case CONS:  // c
+        p[i]=(cp[1]-128)*4;  // set once here; the CONS case of predict0() leaves p[i] alone
+        break;
+      case CM: // sizebits limit
+        if (cp[1]>32) error("max size for CM is 32");
+        cr.cm.resize(1, cp[1]);  // packed CM (22 bits) + CMCOUNT (10 bits)
+        cr.limit=cp[2]*4;
+        for (size_t j=0; j<cr.cm.size(); ++j)
+          cr.cm[j]=0x80000000;  // predict0() uses cm>>17, which is 16384 for this value
+        break;
+      case ICM: // sizebits
+        if (cp[1]>26) error("max size for ICM is 26");
+        cr.limit=1023;
+        cr.cm.resize(256);  // one entry per value of the byte read from ht
+        cr.ht.resize(64, cp[1]);
+        for (size_t j=0; j<cr.cm.size(); ++j)
+          cr.cm[j]=st.cminit(j);
+        break;
+      case MATCH:  // sizebits
+        if (cp[1]>32 || cp[2]>32) error("max size for MATCH is 32 32");
+        cr.cm.resize(1, cp[1]);  // index
+        cr.ht.resize(1, cp[2]);  // buf
+        cr.ht(0)=1;
+        break;
+      case AVG: // j k wt
+        if (cp[1]>=i) error("AVG j >= i");  // inputs must be earlier components
+        if (cp[2]>=i) error("AVG k >= i");
+        break;
+      case MIX2:  // sizebits j k rate mask
+        if (cp[1]>32) error("max size for MIX2 is 32");
+        if (cp[3]>=i) error("MIX2 k >= i");
+        if (cp[2]>=i) error("MIX2 j >= i");
+        cr.c=(size_t(1)<<cp[1]); // size (number of contexts)
+        cr.a16.resize(1, cp[1]);  // wt[size][m]
+        for (size_t j=0; j<cr.a16.size(); ++j)
+          cr.a16[j]=32768;  // half of 65536: predict0() then weights both inputs equally
+        break;
+      case MIX: {  // sizebits j m rate mask
+        if (cp[1]>32) error("max size for MIX is 32");
+        if (cp[2]>=i) error("MIX j >= i");
+        if (cp[3]<1 || cp[3]>i-cp[2]) error("MIX m not in 1..i-j");
+        int m=cp[3];  // number of inputs
+        assert(m>=1);
+        cr.c=(size_t(1)<<cp[1]); // size (number of contexts)
+        cr.cm.resize(m, cp[1]);  // wt[size][m]
+        for (size_t j=0; j<cr.cm.size(); ++j)
+          cr.cm[j]=65536/m;  // every weight starts at 65536/m
+        break;
+      }
+      case ISSE:  // sizebits j
+        if (cp[1]>32) error("max size for ISSE is 32");
+        if (cp[2]>=i) error("ISSE j >= i");
+        cr.ht.resize(64, cp[1]);
+        cr.cm.resize(512);  // a pair of weights for each of 256 values of j below
+        for (int j=0; j<256; ++j) {
+          cr.cm[j*2]=1<<15;
+          cr.cm[j*2+1]=clamp512k(stretch(st.cminit(j)>>8)<<10);
+        }
+        break;
+      case SSE: // sizebits j start limit
+        if (cp[1]>32) error("max size for SSE is 32");
+        if (cp[2]>=i) error("SSE j >= i");
+        if (cp[3]>cp[4]*4) error("SSE start > limit*4");
+        cr.cm.resize(32, cp[1]);  // 32 entries per context, selected by j&31 below
+        cr.limit=cp[4]*4;
+        for (size_t j=0; j<cr.cm.size(); ++j)
+          cr.cm[j]=squash((j&31)*64-992)<<17|cp[3];  // squashed value in the high bits, start (cp[3]) in the low bits
+        break;
+      default: error("unknown component type");
+    }
+    assert(compsize[*cp]>0);
+    cp+=compsize[*cp];  // step to the next component
+    assert(cp>=&z.header[7] && cp<&z.header[z.cend]);
+  }
+}
+
+// Return next bit prediction using interpreted COMP code: squash() of the last component's p[]
+int Predictor::predict0() {
+  assert(c8>=1 && c8<=255);
+
+  // Predict next bit
+  int n=z.header[6];  // number of components
+  assert(n>0 && n<=255);
+  const U8* cp=&z.header[7];  // first component: type byte followed by its arguments
+  assert(cp[-1]==n);
+  for (int i=0; i<n; ++i) {  // component i leaves its prediction in p[i]
+    assert(cp>&z.header[0] && cp<&z.header[z.header.isize()-8]);
+    Component& cr=comp[i];
+    switch(cp[0]) {
+      case CONS:  // c
+        break;  // nothing to compute here
+      case CM:  // sizebits limit
+        cr.cxt=h[i]^hmap4;
+        p[i]=stretch(cr.cm(cr.cxt)>>17);
+        break;
+      case ICM: // sizebits
+        assert((hmap4&15)>0);
+        if (c8==1 || (c8&0xf0)==16) cr.c=find(cr.ht, cp[1]+2, h[i]+16*c8);  // new row lookup when c8 is 1 or in 16..31
+        cr.cxt=cr.ht[cr.c+(hmap4&15)];
+        p[i]=stretch(cr.cm(cr.cxt)>>8);
+        break;
+      case MATCH: // sizebits bufbits: a=len, b=offset, c=bit, cxt=bitpos,
+                  //                   ht=buf, limit=pos
+        assert(cr.cm.size()==(size_t(1)<<cp[1]));
+        assert(cr.ht.size()==(size_t(1)<<cp[2]));
+        assert(cr.a<=255);
+        assert(cr.c==0 || cr.c==1);
+        assert(cr.cxt<8);
+        assert(cr.limit<cr.ht.size());
+        if (cr.a==0) p[i]=0;  // a (len) is 0: predict 0 in the stretched domain
+        else {
+          cr.c=(cr.ht(cr.limit-cr.b)>>(7-cr.cxt))&1; // predicted bit
+          p[i]=stretch(dt2k[cr.a]*(cr.c*-2+1)&32767);
+        }
+        break;
+      case AVG: // j k wt
+        p[i]=(p[cp[1]]*cp[3]+p[cp[2]]*(256-cp[3]))>>8;  // weights cp[3] and 256-cp[3]
+        break;
+      case MIX2: { // sizebits j k rate mask
+                   // c=size cm=wt[size] cxt=input
+        cr.cxt=((h[i]+(c8&cp[5]))&(cr.c-1));
+        assert(cr.cxt<cr.a16.size());
+        int w=cr.a16[cr.cxt];
+        assert(w>=0 && w<65536);
+        p[i]=(w*p[cp[2]]+(65536-w)*p[cp[3]])>>16;  // weights w and 65536-w
+        assert(p[i]>=-2048 && p[i]<2048);
+      }
+        break;
+      case MIX: {  // sizebits j m rate mask
+                   // c=size cm=wt[size][m] cxt=index of wt in cm
+        int m=cp[3];
+        assert(m>=1 && m<=i);
+        cr.cxt=h[i]+(c8&cp[5]);
+        cr.cxt=(cr.cxt&(cr.c-1))*m; // pointer to row of weights
+        assert(cr.cxt<=cr.cm.size()-m);
+        int* wt=(int*)&cr.cm[cr.cxt];
+        p[i]=0;
+        for (int j=0; j<m; ++j)  // inputs are the m consecutive predictions starting at p[cp[2]]
+          p[i]+=(wt[j]>>8)*p[cp[2]+j];
+        p[i]=clamp2k(p[i]>>8);
+      }
+        break;
+      case ISSE: { // sizebits j -- c=hi, cxt=bh
+        assert((hmap4&15)>0);
+        if (c8==1 || (c8&0xf0)==16)
+          cr.c=find(cr.ht, cp[1]+2, h[i]+16*c8);
+        cr.cxt=cr.ht[cr.c+(hmap4&15)];  // bit history
+        int *wt=(int*)&cr.cm[cr.cxt*2];  // the weight pair for this bit history
+        p[i]=clamp2k((wt[0]*p[cp[2]]+wt[1]*64)>>16);
+      }
+        break;
+      case SSE: { // sizebits j start limit
+        cr.cxt=(h[i]+c8)*32;
+        int pq=p[cp[2]]+992;
+        if (pq<0) pq=0;
+        if (pq>1983) pq=1983;
+        int wt=pq&63;  // interpolation weight between entries pq and pq+1
+        pq>>=6;
+        assert(pq>=0 && pq<=30);
+        cr.cxt+=pq;
+        p[i]=stretch(((cr.cm(cr.cxt)>>10)*(64-wt)+(cr.cm(cr.cxt+1)>>10)*wt)>>13);
+        cr.cxt+=wt>>5;  // leaves cxt on pq+1 instead of pq when wt>=32
+      }
+        break;
+      default:
+        error("component predict not implemented");
+    }
+    cp+=compsize[cp[0]];  // step to the next component
+    assert(cp<&z.header[z.cend]);
+    assert(p[i]>=-2048 && p[i]<2048);
+  }
+  assert(cp[0]==NONE);
+  return squash(p[n-1]);
+}
+
+// Update model with decoded bit y (0...1), then shift y into c8 and hmap4
+void Predictor::update0(int y) {
+  assert(y==0 || y==1);
+  assert(c8>=1 && c8<=255);
+  assert(hmap4>=1 && hmap4<=511);
+
+  // Update components
+  const U8* cp=&z.header[7];  // first component: type byte followed by its arguments
+  int n=z.header[6];  // number of components
+  assert(n>=1 && n<=255);
+  assert(cp[-1]==n);
+  for (int i=0; i<n; ++i) {
+    Component& cr=comp[i];
+    switch(cp[0]) {
+      case CONS:  // c
+        break;
+      case CM:  // sizebits limit
+        train(cr, y);
+        break;
+      case ICM: { // sizebits: cxt=ht[b]=bh, ht[c][0..15]=bh row, cxt=bh
+        cr.ht[cr.c+(hmap4&15)]=st.next(cr.ht[cr.c+(hmap4&15)], y);
+        U32& pn=cr.cm(cr.cxt);
+        pn+=int(y*32767-(pn>>8))>>2;
+      }
+        break;
+      case MATCH: // sizebits bufbits:
+                  //   a=len, b=offset, c=bit, cm=index, cxt=bitpos
+                  //   ht=buf, limit=pos
+      {
+        assert(cr.a<=255);
+        assert(cr.c==0 || cr.c==1);
+        assert(cr.cxt<8);
+        assert(cr.cm.size()==(size_t(1)<<cp[1]));
+        assert(cr.ht.size()==(size_t(1)<<cp[2]));
+        assert(cr.limit<cr.ht.size());
+        if (int(cr.c)!=y) cr.a=0;  // mismatch?
+        cr.ht(cr.limit)+=cr.ht(cr.limit)+y;  // doubles the buffer entry and adds y, i.e. shifts y in
+        if (++cr.cxt==8) {  // eighth bit: move on to the next buffer position
+          cr.cxt=0;
+          ++cr.limit;
+          cr.limit&=(1<<cp[2])-1;
+          if (cr.a==0) {  // look for a match
+            cr.b=cr.limit-cr.cm(h[i]);
+            if (cr.b&(cr.ht.size()-1))
+              while (cr.a<255
+                     && cr.ht(cr.limit-cr.a-1)==cr.ht(cr.limit-cr.a-cr.b-1))
+                ++cr.a;
+          }
+          else cr.a+=cr.a<255;  // len grows by 1 but stops at 255
+          cr.cm(h[i])=cr.limit;
+        }
+      }
+        break;
+      case AVG:  // j k wt
+        break;
+      case MIX2: { // sizebits j k rate mask
+                   // cm=wt[size], cxt=input
+        assert(cr.a16.size()==cr.c);
+        assert(cr.cxt<cr.a16.size());
+        int err=(y*32767-squash(p[i]))*cp[4]>>5;
+        int w=cr.a16[cr.cxt];
+        w+=(err*(p[cp[2]]-p[cp[3]])+(1<<12))>>13;
+        if (w<0) w=0;
+        if (w>65535) w=65535;
+        cr.a16[cr.cxt]=w;
+      }
+        break;
+      case MIX: {   // sizebits j m rate mask
+                    // cm=wt[size][m], cxt=input
+        int m=cp[3];
+        assert(m>0 && m<=i);
+        assert(cr.cm.size()==m*cr.c);
+        assert(cr.cxt+m<=cr.cm.size());
+        int err=(y*32767-squash(p[i]))*cp[4]>>4;
+        int* wt=(int*)&cr.cm[cr.cxt];
+        for (int j=0; j<m; ++j)
+          wt[j]=clamp512k(wt[j]+((err*p[cp[2]+j]+(1<<12))>>13));
+      }
+        break;
+      case ISSE: { // sizebits j  -- c=hi, cxt=bh
+        assert(cr.cxt==cr.ht[cr.c+(hmap4&15)]);
+        int err=y*32767-squash(p[i]);
+        int *wt=(int*)&cr.cm[cr.cxt*2];
+        wt[0]=clamp512k(wt[0]+((err*p[cp[2]]+(1<<12))>>13));
+        wt[1]=clamp512k(wt[1]+((err+16)>>5));
+        cr.ht[cr.c+(hmap4&15)]=st.next(cr.cxt, y);
+      }
+        break;
+      case SSE:  // sizebits j start limit
+        train(cr, y);
+        break;
+      default:
+        assert(0);
+    }
+    cp+=compsize[cp[0]];  // step to the next component
+    assert(cp>=&z.header[7] && cp<&z.header[z.cend] 
+           && cp<&z.header[z.header.isize()-8]);
+  }
+  assert(cp[0]==NONE);
+
+  // Save bit y in c8, hmap4
+  c8+=c8+y;  // c8 = c8*2+y
+  if (c8>=256) {  // eighth bit: c8-256 is the finished byte
+    z.run(c8-256);
+    hmap4=1;
+    c8=1;
+    for (int i=0; i<n; ++i) h[i]=z.H(i);  // reload the per-component values from z
+  }
+  else if (c8>=16 && c8<32)  // exactly 4 bits of the byte seen so far
+    hmap4=(hmap4&0xf)<<5|y<<4|1;
+  else
+    hmap4=(hmap4&0x1f0)|(((hmap4&0xf)*2+y)&0xf);  // shift y into the low 4 bits only
+}
+
+// Locate the 16 byte row for cxt in hash table ht and return its index.
+// Byte 0 of a row holds an 8 bit check value taken from cxt above the low
+// sizebits bits. Three candidate rows are probed (h0, h0^16, h0^32). If
+// none carries the check value, the candidate whose byte 1 is lowest is
+// zeroed, stamped with the check value and returned.
+size_t Predictor::find(Array<U8>& ht, int sizebits, U32 cxt) {
+  assert(ht.size()==size_t(16)<<sizebits);
+  const int chk=(cxt>>sizebits)&255;
+  const size_t h0=(cxt*16)&(ht.size()-16);
+  const size_t h1=h0^16;
+  const size_t h2=h0^32;
+
+  // Probe in order, stopping at the first hit
+  if (ht[h0]==chk) return h0;
+  if (ht[h1]==chk) return h1;
+  if (ht[h2]==chk) return h2;
+
+  // Miss: choose the row to replace (ties favor h0, then h2 over h1)
+  size_t victim=h2;
+  if (ht[h0+1]<=ht[h1+1] && ht[h0+1]<=ht[h2+1])
+    victim=h0;
+  else if (ht[h1+1]<ht[h2+1])
+    victim=h1;
+  memset(&ht[victim], 0, 16);
+  ht[victim]=chk;
+  return victim;
+}
+
+/////////////////////// Decoder ///////////////////////
+
+// Start with no input attached, the range [1, 0xFFFFFFFF] and a
+// BUFSIZE byte buffer; the predictor is bound to z.
+Decoder::Decoder(ZPAQL& z):
+    in(0),
+    low(1),
+    high(0xFFFFFFFF),
+    curr(0),
+    pr(z),
+    buf(BUFSIZE)
+{
+}
+
+// Reset for a new block: initialize the predictor, then set the range to
+// [1, 0xFFFFFFFF] when modeled, or zero low, high and curr when not.
+void Decoder::init() {
+  pr.init();
+  curr=0;
+  if (pr.isModeled()) {
+    low=1;
+    high=0xFFFFFFFF;
+  }
+  else {
+    low=0;
+    high=0;
+  }
+}
+
+// Refill buf with un-modeled input. When curr is 0 a 4 byte size is read
+// into it first, high byte first. Then up to min(buf.size(), curr) bytes
+// are read; on return the data is in buf[low=0..high-1] and curr has been
+// reduced by the number of bytes read.
+void Decoder::loadbuf() {
+  assert(!pr.isModeled());
+  assert(low==high);
+  if (curr==0) {
+    int left=4;
+    while (left>0) {
+      const int c=in->get();
+      if (c<0) error("unexpected end of input");
+      curr=curr<<8|c;
+      --left;
+    }
+  }
+  U32 want=buf.size();
+  if (curr<want) want=curr;
+  high=in->read(&buf[0], want);
+  curr-=high;
+  low=0;
+}
+
+// Decode and return one bit, where p (0..65535) is the 16 bit probability
+// that the bit is 1. Calls error() if curr lies outside [low, high] or if
+// input ends while the range is being renormalized.
+int Decoder::decode(int p) {
+  assert(p>=0 && p<65536);
+  assert(high>low && low>0);
+  if (!(curr>=low && curr<=high)) error("archive corrupted");
+  assert(curr>=low && curr<=high);
+
+  // Split the range in proportion to p
+  const U32 range=high-low;
+  const U32 mid=low+U32((range*U64(U32(p)))>>16);
+  assert(high>mid && mid>=low);
+
+  // Keep the half that contains curr
+  const int y=curr<=mid;
+  if (y)
+    high=mid;
+  else
+    low=mid+1;
+
+  // Shift out leading bytes that low and high have in common
+  for (;;) {
+    if ((high^low)>=0x1000000) break;
+    high=high<<8|255;
+    low=low<<8;
+    if (low==0) low=1;
+    const int c=in->get();
+    if (c<0) error("unexpected end of file");
+    curr=curr<<8|c;
+  }
+  return y;
+}
+
+// Return the next decompressed byte (0..255), or -1 at end of input.
+int Decoder::decompress() {
+
+  // Un-modeled data is served straight from buf
+  if (!pr.isModeled()) {
+    if (low==high) loadbuf();
+    if (low==high) return -1;
+    return buf[low++]&255;
+  }
+
+  // Modeled (n>0 components): curr==0 marks segment initialization
+  if (curr==0) {
+    int left=4;
+    while (left-->0)
+      curr=curr<<8|in->get();
+  }
+
+  // A bit decoded with probability 0 flags end of stream
+  if (decode(0)) {
+    if (curr!=0) error("decoding end of stream");
+    return -1;
+  }
+
+  // Get 8 bits, most significant first, below a leading 1
+  int c=1;
+  do {
+    const int p=pr.predict()*2+1;
+    const int bit=decode(p);
+    c=c+c+bit;
+    pr.update(bit);
+  } while (c<256);
+  return c-256;
+}
+
+// Skip the rest of the compressed data and return the first byte after
+// it, or -1 if the input ends first.
+int Decoder::skip() {
+  int c=-1;
+  if (!pr.isModeled()) {
+
+    // Un-modeled: each sub-block is a 4 byte size followed by that many
+    // bytes; a size of 0 ends the data.
+    if (curr==0) {  // at start?
+      for (int i=0; i<4; ++i) {
+        if ((c=in->get())<0) break;
+        curr=curr<<8|c;
+      }
+    }
+    while (curr>0) {
+      U32 want=BUFSIZE;
+      if (curr<want) want=curr;
+      const U32 got=in->read(&buf[0], want);
+      curr-=got;
+      if (got!=want) return -1;
+      if (curr==0) {
+        for (int i=0; i<4; ++i) {
+          if ((c=in->get())<0) break;
+          curr=curr<<8|c;
+        }
+      }
+    }
+    if (c>=0) c=in->get();
+    return c;
+  }
+
+  // Modeled
+  while (curr==0)  // at start?
+    curr=in->get();
+  while (curr) {  // find 4 zeros
+    c=in->get();
+    if (c<0) break;
+    curr=curr<<8|c;
+  }
+  do {  // might be more than 4
+    c=in->get();
+  } while (c==0);
+  return c;
+}
+
+////////////////////// PostProcessor //////////////////////
+
+// Return to the initial state, remembering h and m as ph and pm, and
+// clear the embedded ZPAQL machine.
+void PostProcessor::init(int h, int m) {
+  ph=h;
+  pm=m;
+  hsize=0;
+  state=0;
+  z.clear();
+}
+
+// Feed one decoded byte c (0..255), or -1 for end of segment, to the
+// postprocessor. The expected stream is
+//   (PASS=0 | PROG=1 psize[0..1] pcomp[0..psize-1]) data... EOB=-1
+// Return state: 1=PASS, 2..4=loading PROG, 5=PROG loaded.
+// Calls error() on an unknown type byte, on end of segment before the
+// program is fully loaded, and on a PROG whose declared size is 0.
+int PostProcessor::write(int c) {
+  assert(c>=-1 && c<=255);
+  switch (state) {
+    case 0:  // initial state
+      if (c<0) error("Unexpected EOS");
+      state=c+1;  // 1=PASS, 2=PROG
+      if (state>2) error("unknown post processing type");
+      if (state==1) z.clear();
+      break;
+    case 1:  // PASS
+      z.outc(c);
+      break;
+    case 2: // PROG
+      if (c<0) error("Unexpected EOS");
+      hsize=c;  // low byte of size
+      state=3;
+      break;
+    case 3:  // PROG psize[0]
+      if (c<0) error("Unexpected EOS");
+      hsize+=c*256;  // high byte of psize
+
+      // With a size of 0 the "last byte" test in state 4 could never be
+      // true (it is made after the first byte is stored), so every later
+      // byte would be appended to z.header without limit.
+      if (hsize<1) error("Empty PCOMP");
+      z.header.resize(hsize+300);
+      z.cend=8;
+      z.hbegin=z.hend=z.cend+128;
+      z.header[4]=ph;
+      z.header[5]=pm;
+      state=4;
+      break;
+    case 4:  // PROG psize[0..1] pcomp[0...]
+      if (c<0) error("Unexpected EOS");
+      assert(z.hend<z.header.isize());
+      z.header[z.hend++]=c;  // one byte of pcomp
+      if (z.hend-z.hbegin==hsize) {  // last byte of pcomp?
+        hsize=z.cend-2+z.hend-z.hbegin;
+        z.header[0]=hsize&255;  // header size with empty COMP
+        z.header[1]=hsize>>8;
+        z.initp();
+        state=5;
+      }
+      break;
+    case 5:  // PROG ... data
+      z.run(c);
+      if (c<0) z.flush();
+      break;
+  }
+  return state;
+}
+
+/////////////////////// Decompresser /////////////////////
+
+// Scan the input for the start of a block and read its header. Returns
+// false if the input ends before a block is found. If memptr is not 0,
+// *memptr receives z.memory() for the block just read.
+bool Decompresser::findBlock(double* memptr) {
+  assert(state==BLOCK);
+
+  // Find start of block with four rolling hashes, which are
+  // initialized to the hash of the first 13 bytes
+  U32 h1=0x3D49B113, h2=0x29EB7F93, h3=0x2614BE13, h4=0x3828EB13;
+  for (;;) {
+    const int c=dec.in->get();
+    if (c==-1) return false;
+    h1=h1*12+c;
+    h2=h2*20+c;
+    h3=h3*28+c;
+    h4=h4*44+c;
+
+    // hash of 16 byte string
+    if (h1==0xB16B88F1 && h2==0xFF5376F1 && h3==0x72AC5BF1 && h4==0x2F909AF1)
+      break;
+  }
+
+  // Read header
+  const int level=dec.in->get();
+  if (level!=1 && level!=2) error("unsupported ZPAQ level");
+  const int type=dec.in->get();
+  if (type!=1) error("unsupported ZPAQL type");
+  z.read(dec.in);
+  if (level==1 && z.header.isize()>6 && z.header[6]==0)
+    error("ZPAQ level 1 requires at least 1 component");
+  if (memptr) *memptr=z.memory();
+  state=FILENAME;
+  decode_state=FIRSTSEG;
+  return true;
+}
+
+// Read the byte that begins a segment (1) or ends the block (255).
+// For a segment, copy the 0 terminated filename to filename (if not 0)
+// and return true. At end of block return false. Anything else, or input
+// ending inside the filename, is reported through error().
+bool Decompresser::findFilename(Writer* filename) {
+  assert(state==FILENAME);
+  int c=dec.in->get();
+  switch (c) {
+    case 1:  // segment found
+      for (;;) {
+        c=dec.in->get();
+        if (c==-1) error("unexpected EOF");
+        if (c==0) {
+          state=COMMENT;
+          return true;
+        }
+        if (filename) filename->put(c);
+      }
+    case 255:  // end of block found
+      state=BLOCK;
+      return false;
+    default:
+      error("missing segment or end of block");
+  }
+  return false;
+}
+
+// Read the 0 terminated comment of the segment header, copying it to
+// comment if not 0, then check that the reserved byte after it is 0.
+void Decompresser::readComment(Writer* comment) {
+  assert(state==COMMENT);
+  state=DATA;
+  for (;;) {
+    const int c=dec.in->get();
+    if (c==-1) error("unexpected EOF");
+    if (c==0) break;
+    if (comment) comment->put(c);
+  }
+  const int reserved=dec.in->get();
+  if (reserved!=0) error("missing reserved byte");
+}
+
+// Decompress up to n bytes of the current segment, or the whole segment
+// if n < 0. Returns false once the end of the segment has been reached,
+// true if the segment may hold more data.
+bool Decompresser::decompress(int n) {
+  assert(state==DATA);
+  assert(decode_state!=SKIP);
+
+  // First segment of a block: initialize the models
+  if (decode_state==FIRSTSEG) {
+    dec.init();
+    assert(z.header.size()>5);
+    pp.init(z.header[4], z.header[5]);
+    decode_state=SEG;
+  }
+
+  // Decompress and load PCOMP into postprocessor
+  for (;;) {
+    if ((pp.getState()&3)==1) break;
+    pp.write(dec.decompress());
+  }
+
+  // Decompress n bytes, or all if n < 0
+  for (; n!=0; ) {
+    const int c=dec.decompress();
+    pp.write(c);
+    if (c==-1) {
+      state=SEGEND;
+      return false;
+    }
+    if (n>0) --n;
+  }
+  return true;
+}
+
+// Read the end of the current segment. If a SHA1 checksum is present,
+// write 1 followed by the 20 byte checksum into sha1string[0..20],
+// else write 0 into sha1string[0]. If sha1string is 0 then discard it.
+// Calls error() if the end of segment marker is missing or if the input
+// ends inside the checksum.
+void Decompresser::readSegmentEnd(char* sha1string) {
+  assert(state==DATA || state==SEGEND);
+
+  // Skip remaining data if any and get next byte
+  int c=0;
+  if (state==DATA) {
+    c=dec.skip();
+    decode_state=SKIP;
+  }
+  else if (state==SEGEND)
+    c=dec.in->get();
+  state=FILENAME;
+
+  // Read checksum
+  if (c==254) {
+    if (sha1string) sha1string[0]=0;  // no checksum
+  }
+  else if (c==253) {
+    if (sha1string) sha1string[0]=1;
+    for (int i=1; i<=20; ++i) {
+      c=dec.in->get();
+      // A truncated checksum must not be stored as if it were valid
+      if (c==-1) error("unexpected EOF");
+      if (sha1string) sha1string[i]=c;
+    }
+  }
+  else
+    error("missing end of segment marker");
+}
+
+/////////////////////////// decompress() /////////////////////
+
+// Decompress every segment of every block found on in, sending the
+// output to out. Filenames, comments and checksums are read using the
+// default arguments of the Decompresser methods.
+void decompress(Reader* in, Writer* out) {
+  Decompresser unpacker;
+  unpacker.setInput(in);
+  unpacker.setOutput(out);
+  for (;;) {
+    if (!unpacker.findBlock()) break;  // no more blocks
+    while (unpacker.findFilename()) {  // once per segment in the block
+      unpacker.readComment();
+      unpacker.decompress();           // to end of segment
+      unpacker.readSegmentEnd();
+    }
+  }
+}
+
+////////////////////// Encoder ////////////////////
+
+// Prepare to encode a new block: reset the coding range and the
+// predictor. If the predictor is not modeled, low is reset to 0 instead
+// of 1 and buf is sized to 64K.
+void Encoder::init() {
+  low=1;
+  high=0xFFFFFFFF;
+  pr.init();
+  if (pr.isModeled()) return;
+
+  // Not modeled: compress() uses low as the count of bytes held in buf
+  low=0;
+  buf.resize(1<<16);
+}
+
+// Arithmetic code bit y (0 or 1), where p/64K is the probability that
+// y is 1 (0 <= p < 65536). The current range low..high is split at mid
+// in proportion to p: y=1 keeps low..mid and y=0 keeps mid+1..high.
+// Leading bytes on which low and high agree are then written to out.
+void Encoder::encode(int y, int p) {
+  assert(out);
+  assert(p>=0 && p<65536);
+  assert(y==0 || y==1);
+  assert(high>low && low>0);
+  U32 mid=low+U32(((high-low)*U64(U32(p)))>>16);  // split range
+  assert(high>mid && mid>=low);
+  if (y) high=mid; else low=mid+1; // pick half
+  while ((high^low)<0x1000000) { // write identical leading bytes
+    out->put(high>>24);  // same as low>>24
+    high=high<<8|255;  // shift 1 bits into high
+    low=low<<8;  // shift 0 bits into low
+    low+=(low==0); // so we don't code 4 0 bytes in a row
+  }
+}
+
+// Encode one symbol c: a byte 0..255, or -1 for end of segment (EOS).
+// With a modeled predictor a byte is coded as a 0 flag bit followed by
+// its 8 bits, MSB first, and EOS as a 1 flag bit. Otherwise bytes are
+// collected in buf; when buf is full or on EOS, a 4 byte length (MSB
+// first) and the collected bytes are written to out.
+void Encoder::compress(int c) {
+  assert(out);
+
+  // Not modeled: low is the number of bytes waiting in buf
+  if (!pr.isModeled()) {
+    const bool flush=(c<0 || low==buf.size());
+    if (flush) {
+      for (int shift=24; shift>=0; shift-=8)
+        out->put((low>>shift)&255);
+      out->write(&buf[0], low);
+      low=0;
+    }
+    if (c>=0) buf[low++]=c;
+    return;
+  }
+
+  // Modeled: EOS is just the flag bit
+  if (c==-1) {
+    encode(1, 0);
+    return;
+  }
+
+  // Modeled: flag bit, then predict, code and learn each bit of c
+  assert(c>=0 && c<=255);
+  encode(0, 0);
+  for (int bit=7; bit>=0; --bit) {
+    const int p=pr.predict()*2+1;
+    assert(p>0 && p<65536);
+    const int y=(c>>bit)&1;
+    encode(y, p);
+    pr.update(y);
+  }
+}
+
+///////////////////// Compressor //////////////////////
+
+// Write the 13 byte start tag
+// "\x37\x6B\x53\x74\xA0\x31\x83\xD3\x8C\xB2\x28\xB0\xD3"
+// to the output. Only valid in the INIT state.
+void Compressor::writeTag() {
+  assert(state==INIT);
+  static const int tag[13]={
+    0x37, 0x6b, 0x53, 0x74, 0xa0, 0x31, 0x83,
+    0xd3, 0x8c, 0xb2, 0x28, 0xb0, 0xd3};
+  for (int i=0; i<13; ++i)
+    enc.out->put(tag[i]);
+}
+
+// Start a block using the built-in model selected by level, where 1 is
+// the first model in the table below. Calls error() if level is less
+// than 1 or greater than the number of models in the table.
+void Compressor::startBlock(int level) {
+
+  // Each entry is a complete model: a 2 byte length (read with toU16())
+  // followed by that many bytes. A length of 0 ends the list.
+  // Model 1 - min.cfg
+  static const char models[]={
+  26,0,1,2,0,0,2,3,16,8,19,0,0,96,4,28,
+  59,10,59,112,25,10,59,10,59,112,56,0,
+
+  // Model 2 - mid.cfg
+  69,0,3,3,0,0,8,3,5,8,13,0,8,17,1,8,
+  18,2,8,18,3,8,19,4,4,22,24,7,16,0,7,24,
+  -1,0,17,104,74,4,95,1,59,112,10,25,59,112,10,25,
+  59,112,10,25,59,112,10,25,59,112,10,25,59,10,59,112,
+  25,69,-49,8,112,56,0,
+
+  // Model 3 - max.cfg
+  -60,0,5,9,0,0,22,1,-96,3,5,8,13,1,8,16,
+  2,8,18,3,8,19,4,8,19,5,8,20,6,4,22,24,
+  3,17,8,19,9,3,13,3,13,3,13,3,14,7,16,0,
+  15,24,-1,7,8,0,16,10,-1,6,0,15,16,24,0,9,
+  8,17,32,-1,6,8,17,18,16,-1,9,16,19,32,-1,6,
+  0,19,20,16,0,0,17,104,74,4,95,2,59,112,10,25,
+  59,112,10,25,59,112,10,25,59,112,10,25,59,112,10,25,
+  59,10,59,112,10,25,59,112,10,25,69,-73,32,-17,64,47,
+  14,-25,91,47,10,25,60,26,48,-122,-105,20,112,63,9,70,
+  -33,0,39,3,25,112,26,52,25,25,74,10,4,59,112,25,
+  10,4,59,112,25,10,4,59,112,25,65,-113,-44,72,4,59,
+  112,8,-113,-40,8,68,-81,60,60,25,69,-49,9,112,25,25,
+  25,25,25,112,56,0,
+
+  0,0}; // 0,0 = end of list
+
+  if (level<1) error("compression level must be at least 1");
+  const char* p=models;
+  int i;
+  // Skip level-1 models, stopping early at the 0 length terminator
+  for (i=1; i<level && toU16(p); ++i)
+    p+=toU16(p)+2;  // +2 steps over the length field itself
+  if (toU16(p)<1) error("compression level too high");  // hit end of list
+  startBlock(p);  // write the header for the selected model
+}
+
+// Reader over a caller supplied byte buffer. get() returns successive
+// bytes as values 0..255. No length is kept, so get() never reports
+// end of input and the caller must not read beyond the buffer.
+class MemoryReader: public Reader {
+  const char* next;  // next byte to return
+public:
+  MemoryReader(const char* p_): next(p_) {}
+  int get() {
+    const int c=*next&255;
+    ++next;
+    return c;
+  }
+};
+
+// Write a block header for the model in hcomp and load the same model
+// into z. hcomp begins with a 2 byte length (read with toU16()) that
+// does not count the length field itself.
+void Compressor::startBlock(const char* hcomp) {
+  assert(state==INIT);
+  assert(hcomp);
+  const int total=toU16(hcomp)+2;  // model bytes plus the length field
+
+  // Block signature, then the level and type bytes
+  enc.out->put('z');
+  enc.out->put('P');
+  enc.out->put('Q');
+  const bool level2=(total>6 && hcomp[6]==0);  // byte 6 is 0: level 2
+  enc.out->put(level2 ? 2 : 1);
+  enc.out->put(1);
+
+  // The compression model itself, length field included
+  for (const char* q=hcomp; q!=hcomp+total; ++q)
+    enc.out->put(*q);
+
+  // Parse the same bytes into z
+  MemoryReader m(hcomp);
+  z.read(&m);
+  state=BLOCK1;
+}
+
+// Write a segment header: a 1 byte, the 0 terminated filename, the
+// 0 terminated comment and one more 0 byte. A null filename or comment
+// is written as an empty string.
+void Compressor::startSegment(const char* filename, const char* comment) {
+  assert(state==BLOCK1 || state==BLOCK2);
+  enc.out->put(1);
+  if (filename)
+    for (const char* s=filename; *s; ++s) enc.out->put(*s);
+  enc.out->put(0);
+  if (comment)
+    for (const char* s=comment; *s; ++s) enc.out->put(*s);
+  enc.out->put(0);
+  enc.out->put(0);  // the decompresser checks this as the reserved byte
+
+  // The first segment of a block enters SEG1, later ones enter SEG2
+  if (state==BLOCK1) state=SEG1;
+  if (state==BLOCK2) state=SEG2;
+}
+
+// Initialize encoding for the first segment of a block and encode the
+// postprocessor selection. If pcomp is 0, a single 0 byte is encoded.
+// Otherwise a 1 byte, the 2 byte length (LSB first) and len bytes of
+// pcomp are encoded. If len <= 0, the length is taken from pcomp[0..1]
+// (read with toU16()) and the bytes start at pcomp[2].
+void Compressor::postProcess(const char* pcomp, int len) {
+  assert(state==SEG1);
+  enc.init();
+  if (!pcomp) {
+    enc.compress(0);
+    state=SEG2;
+    return;
+  }
+
+  enc.compress(1);
+  const char* code=pcomp;
+  int size=len;
+  if (size<=0) {  // length is stored in front of the code
+    size=toU16(code);
+    code+=2;
+  }
+  enc.compress(size&255);
+  enc.compress((size>>8)&255);
+  for (const char* end=code+size; code!=end; ++code)
+    enc.compress(*code&255);
+  state=SEG2;
+}
+
+// Read up to n bytes from in and compress them, or read to EOF if
+// n < 0 (n == 0 reads nothing). Return false if EOF was reached,
+// otherwise true.
+bool Compressor::compress(int n) {
+  assert(state==SEG2);
+  int ch=0;
+  for (int left=n; left!=0; ) {
+    ch=in->get();
+    if (ch<0) break;  // EOF
+    enc.compress(ch);
+    if (left>0) --left;  // a negative count never runs out
+  }
+  return ch>=0;
+}
+
+// End the current segment: encode end of segment, write 4 zero bytes,
+// then either 253 followed by the 20 bytes at sha1string, or 254 if
+// sha1string is 0.
+void Compressor::endSegment(const char* sha1string) {
+  assert(state==SEG2);
+  enc.compress(-1);
+  for (int pad=0; pad<4; ++pad)
+    enc.out->put(0);
+  if (!sha1string)
+    enc.out->put(254);  // no checksum
+  else {
+    enc.out->put(253);  // checksum follows
+    for (const char* s=sha1string; s!=sha1string+20; ++s)
+      enc.out->put(*s);
+  }
+  state=BLOCK2;
+}
+
+// Write the end of block marker (255) and return to the INIT state.
+void Compressor::endBlock() {
+  assert(state==BLOCK2);
+  const int endOfBlock=255;
+  enc.out->put(endOfBlock);
+  state=INIT;
+}
+
+/////////////////////////// compress() ///////////////////////
+
+// Compress in to out as a single block containing a single segment,
+// using the built-in model selected by level (at least 1). The segment
+// header, postprocessor, byte count and checksum all use the default
+// arguments of the Compressor methods.
+void compress(Reader* in, Writer* out, int level) {
+  assert(level>=1);
+  Compressor packer;
+  packer.setInput(in);
+  packer.setOutput(out);
+
+  packer.startBlock(level);  // block header for the chosen model
+  packer.startSegment();
+  packer.postProcess();
+  packer.compress();
+  packer.endSegment();
+  packer.endBlock();
+}
+
+//////////////////////// ZPAQL::assemble() ////////////////////
+
+#ifndef NOJIT
+/*
+assemble();
+
+Assembles the ZPAQL code in hcomp[0..hlen-1] and stores x86-32 or x86-64
+code in rcode[0..rcode_size-1]. Execution begins at rcode[0]. It will not
+write beyond the end of rcode, but in any case it returns the number of
+bytes that would have been written. It returns 0 in case of error.
+
+The assembled code implements run() and returns 1 if successful or
+0 if the ZPAQL code executes an invalid instruction or jumps out of
+bounds.
+
+A ZPAQL virtual machine has the following state. All values are
+unsigned and initially 0:
+
+  a, b, c, d: 32 bit registers (pointed to by their respective parameters)
+  f: 1 bit flag register (pointed to)
+  r[0..255]: 32 bit registers
+  m[0..msize-1]: 8 bit registers, where msize is a power of 2
+  h[0..hsize-1]: 32 bit registers, where hsize is a power of 2
+  out: pointer to a Writer
+  sha1: pointer to a SHA1
+
+Generally a ZPAQL machine is used to compute contexts which are
+placed in h. A second machine might post-process, and write its
+output to out and sha1. In either case, a machine is called with
+its input in a, representing a single byte (0..255) or
+(for a postprocessor) EOF (0xffffffff). Execution returns after a
+ZPAQL halt instruction.
+
+ZPAQL instructions are 1 byte unless the last 3 bits are 1.
+In this case, a second operand byte follows. Opcode 255 is
+the only 3 byte instruction. They are organized:
+
+  00dddxxx = unary opcode xxx on destination ddd (ddd < 111)
+  00111xxx = special instruction xxx
+  01dddsss = assignment: ddd = sss (ddd < 111)
+  1xxxxsss = operation sxxx from sss to a
+
+The meaning of sss and ddd are as follows:
+
+  000 = a   (accumulator)
+  001 = b
+  010 = c
+  011 = d
+  100 = *b  (means m[b mod msize])
+  101 = *c  (means m[c mod msize])
+  110 = *d  (means h[d mod hsize])
+  111 = n   (constant 0..255 in second byte of instruction)
+
+For example, 01001110 assigns *d to b. The other instructions xxx
+are as follows:
+
+Group 00dddxxx where ddd < 111 and xxx is:
+  000 = ddd<>a, swap with a (except 00000000 is an error, and swap
+        with *b or *c leaves the high bits of a unchanged)
+  001 = ddd++, increment
+  010 = ddd--, decrement
+  011 = ddd!, not (invert all bits)
+  100 = ddd=0, clear (set all bits of ddd to 0)
+  101 = not used (error)
+  110 = not used
+  111 = ddd=r n, assign from r[n] to ddd, n=0..255 in next opcode byte
+Except:
+  00100111 = jt n, jump if f is true (n = -128..127, relative to next opcode)
+  00101111 = jf n, jump if f is false (n = -128..127)
+  00110111 = r=a n, assign r[n] = a (n = 0..255)
+
+Group 00111xxx where xxx is:
+  000 = halt (return)
+  001 = output a
+  010 = not used
+  011 = hash: a = (a + *b + 512) * 773
+  100 = hashd: *d = (*d + a + 512) * 773
+  101 = not used
+  110 = not used
+  111 = unconditional jump (n = -128 to 127, relative to next opcode)
+  
+Group 1xxxxsss where xxxx is:
+  0000 = a += sss (add, subtract, multiply, divide sss to a)
+  0001 = a -= sss
+  0010 = a *= sss
+  0011 = a /= sss (unsigned, except set a = 0 if sss is 0)
+  0100 = a %= sss (remainder, except set a = 0 if sss is 0)
+  0101 = a &= sss (bitwise AND)
+  0110 = a &= ~sss (bitwise AND with complement of sss)
+  0111 = a |= sss (bitwise OR)
+  1000 = a ^= sss (bitwise XOR)
+  1001 = a <<= (sss % 32) (left shift by low 5 bits of sss)
+  1010 = a >>= (sss % 32) (unsigned, zero bits shifted in)
+  1011 = a == sss (compare, set f = true if equal or false otherwise)
+  1100 = a < sss (unsigned compare, result in f)
+  1101 = a > sss (unsigned compare)
+  1110 = not used
+  1111 = not used except 11111111 is a 3 byte jump to the absolute address
+         in the next 2 bytes in little-endian (LSB first) order.
+
+assemble() translates ZPAQL to 32 bit x86 code to be executed by run().
+Registers are mapped as follows:
+
+  eax = source sss from *b, *c, *d or sometimes n
+  ecx = pointer to destination *b, *c, *d, or spare
+  edx = a
+  ebx = f (1 for true, 0 for false)
+  esp = stack pointer
+  ebp = d
+  esi = b
+  edi = c
+
+run() saves non-volatile registers (ebp, esi, edi, ebx) on the stack,
+loads a, b, c, d, f, and executes the translated instructions.
+A halt instruction saves a, b, c, d, f, pops the saved registers
+and returns. Invalid instructions or jumps outside of the range
+of the ZPAQL code call libzpaq::error().
+
+In 64 bit mode, the following additional registers are used:
+
+  r12 = h
+  r14 = r
+  r15 = m
+
+*/
+
+// Called by out. A free function wrapper around z->flush(): the code
+// that assemble() generates for the out instruction loads the address
+// of this function and calls it when the output buffer is full.
+static void flush1(ZPAQL* z) {
+  z->flush();
+}
+
+// return true if op is an undefined ZPAQL instruction
+static bool iserr(int op) {
+  return op==0 || (op>=120 && op<=127) || (op>=240 && op<=254)
+    || op==58 || (op<64 && (op%8==5 || op%8==6));
+}
+
+// Write the low k bytes of x to rcode starting at rcode[o], MSB first.
+// Bytes at index n or beyond are not stored, but o is still advanced
+// by k so the caller learns how much space would have been needed.
+static void put(U8* rcode, int n, int& o, U32 x, int k) {
+  for (int shift=(k-1)*8; shift>=0; shift-=8, ++o)
+    if (o<n) rcode[o]=(x>>shift)&255;
+}
+
+// Write the 4 bytes of x to rcode starting at rcode[o], LSB first.
+// Bytes at index n or beyond are not stored, but o always advances by 4.
+static void put4lsb(U8* rcode, int n, int& o, U32 x) {
+  for (int shift=0; shift<32; shift+=8) {
+    if (o<n) rcode[o]=(x>>shift)&255;
+    ++o;
+  }
+}
+
+// Write a 1-4 byte x86 opcode without or with a 4 byte operand
+// to rcode[o...]. These macros expand to comma expressions that use the
+// names rcode, rcode_size, o and t (and S for put2l) from the scope in
+// which they are expanded.
+// put1..put4: write the low 1..4 bytes of x, MSB first
+#define put1(x) put(rcode, rcode_size, o, (x), 1)
+#define put2(x) put(rcode, rcode_size, o, (x), 2)
+#define put3(x) put(rcode, rcode_size, o, (x), 3)
+#define put4(x) put(rcode, rcode_size, o, (x), 4)
+// put5, put6: 4 bytes of x followed by 1 or 2 bytes of y
+#define put5(x,y) put4(x), put1(y)
+#define put6(x,y) put4(x), put2(y)
+// put4r: write x as 4 bytes, LSB first
+#define put4r(x) put4lsb(rcode, rcode_size, o, x)
+// puta: write the low 32 bits of an address or integer x, LSB first
+#define puta(x) t=U32(size_t(x)), put4r(t)
+// put1a..put4a: 1..4 byte opcode x followed by the 4 byte operand y
+#define put1a(x,y) put1(x), puta(y)
+#define put2a(x,y) put2(x), puta(y)
+#define put3a(x,y) put3(x), puta(y)
+#define put4a(x,y) put4(x), puta(y)
+// put5a: 4 bytes of x, 1 byte of y, then the 4 byte operand z
+#define put5a(x,y,z) put4(x), put1(y), puta(z)
+// put2l: 2 byte opcode x followed by y as 8 bytes, LSB first: the low
+// 32 bits, then y shifted right by S*4 bits (32 when S is 8)
+#define put2l(x,y) put2(x), t=U32(size_t(y)), put4r(t), \
+  t=U32(size_t(y)>>(S*4)), put4r(t)
+
+// Assemble ZPAQL in the HCOMP section of header to rcode,
+// but do not write beyond rcode_size. Return the number of
+// bytes output or that would have been output.
+// Execution starts at rcode[0] and returns 1 if successful or 0
+// in case of a ZPAQL execution error.
+int ZPAQL::assemble() {
+
+  // x86? (not foolproof)
+  const int S=sizeof(char*);      // 4 = x86, 8 = x86-64
+  U32 t=0x12345678;
+  if (*(char*)&t!=0x78 || (S!=4 && S!=8))
+    error("JIT supported only for x86-32 and x86-64");
+
+  const U8* hcomp=&header[hbegin];
+  const int hlen=hend-hbegin+1;
+  const int msize=m.size();
+  const int hsize=h.size();
+  const int regcode[8]={2,6,7,5}; // a,b,c,d.. -> edx,esi,edi,ebp,eax..
+  Array<int> it(hlen);            // hcomp -> rcode locations
+  int done=0;  // number of instructions assembled (0..hlen)
+  int o=5;  // rcode output index, reserve space for jmp
+
+  // Code for the halt instruction (restore registers and return)
+  const int halt=o;
+  if (S==8) {
+    put2l(0x48b9, &a);        // mov rcx, a
+    put2(0x8911);             // mov [rcx], edx
+    put2l(0x48b9, &b);        // mov rcx, b
+    put2(0x8931);             // mov [rcx], esi
+    put2l(0x48b9, &c);        // mov rcx, c
+    put2(0x8939);             // mov [rcx], edi
+    put2l(0x48b9, &d);        // mov rcx, d
+    put2(0x8929);             // mov [rcx], ebp
+    put2l(0x48b9, &f);        // mov rcx, f
+    put2(0x8919);             // mov [rcx], ebx
+    put4(0x4883c438);         // add rsp, 56
+    put2(0x415f);             // pop r15
+    put2(0x415e);             // pop r14
+    put2(0x415d);             // pop r13
+    put2(0x415c);             // pop r12
+  }
+  else {
+    put2a(0x8915, &a);        // mov [a], edx
+    put2a(0x8935, &b);        // mov [b], esi
+    put2a(0x893d, &c);        // mov [c], edi
+    put2a(0x892d, &d);        // mov [d], ebp
+    put2a(0x891d, &f);        // mov [f], ebx
+    put3(0x83c43c);           // add esp, 60
+  }
+  put1(0x5d);                 // pop ebp
+  put1(0x5b);                 // pop ebx
+  put1(0x5f);                 // pop edi
+  put1(0x5e);                 // pop esi
+  put1(0xc3);                 // ret
+
+  // Code for the out instruction.
+  // Store a=edx at outbuf[bufptr++]. If full, call flush1().
+  const int outlabel=o;
+  if (S==8) {
+    put2l(0x48b8, &outbuf[0]);// mov rax, outbuf.p
+    put2l(0x49ba, &bufptr);   // mov r10, &bufptr
+    put3(0x418b0a);           // mov ecx, [r10]
+    put3(0x891408);           // mov [rax+rcx], edx
+    put2(0xffc1);             // inc ecx
+    put3(0x41890a);           // mov [r10], ecx
+    put2a(0x81f9, outbuf.size());  // cmp ecx, outbuf.size()
+    put2(0x7401);             // jz L1
+    put1(0xc3);               // ret
+    put4(0x4883ec30);         // L1: sub esp, 48  ; call flush1(this)
+    put4(0x48893c24);         // mov [rsp], rdi
+    put5(0x48897424,8);       // mov [rsp+8], rsi
+    put5(0x48895424,16);      // mov [rsp+16], rdx
+    put5(0x48894c24,24);      // mov [rsp+24], rcx
+#ifdef unix
+    put2l(0x48bf, this);      // mov rdi, this
+#else  // Windows
+    put2l(0x48b9, this);      // mov rcx, this
+#endif
+    put2l(0x49bb, &flush1);   // mov r11, &flush1
+    put3(0x41ffd3);           // call r11
+    put5(0x488b4c24,24);      // mov rcx, [rsp+24]
+    put5(0x488b5424,16);      // mov rdx, [rsp+16]
+    put5(0x488b7424,8);       // mov rsi, [rsp+8]
+    put4(0x488b3c24);         // mov rdi, [rsp]
+    put4(0x4883c430);         // add esp, 48
+    put1(0xc3);               // ret
+  }
+  else {
+    put1a(0xb8, &outbuf[0]);  // mov eax, outbuf.p
+    put2a(0x8b0d, &bufptr);   // mov ecx, [bufptr]
+    put3(0x891408);           // mov [eax+ecx], edx
+    put2(0xffc1);             // inc ecx
+    put2a(0x890d, &bufptr);   // mov [bufptr], ecx
+    put2a(0x81f9, outbuf.size());  // cmp ecx, outbuf.size()
+    put2(0x7401);             // jz L1
+    put1(0xc3);               // ret
+    put3(0x83ec08);           // L1: sub esp, 8
+    put4(0x89542404);         // mov [esp+4], edx
+    put3a(0xc70424, this);    // mov [esp], this
+    put1a(0xb8, &flush1);     // mov eax, &flush1
+    put2(0xffd0);             // call eax
+    put4(0x8b542404);         // mov edx, [esp+4]
+    put3(0x83c408);           // add esp, 8
+    put1(0xc3);               // ret
+  }
+
+  // Set it[i]=1 for each ZPAQL instruction reachable from the previous
+  // instruction + 2 if reachable by a jump (or 3 if both).
+  it[0]=2;
+  assert(hlen>0 && hcomp[hlen-1]==0);  // ends with error
+  do {
+    done=0;
+    const int NONE=0x80000000;
+    for (int i=0; i<hlen; ++i) {
+      int op=hcomp[i];
+      if (it[i]) {
+        int next1=i+1+(op%8==7), next2=NONE; // next and jump targets
+        if (iserr(op)) next1=NONE;  // error
+        if (op==56) next1=NONE, next2=0;  // halt
+        if (op==255) next1=NONE, next2=hcomp[i+1]+256*hcomp[i+2]; // lj
+        if (op==39||op==47||op==63)next2=i+2+(hcomp[i+1]<<24>>24);// jt,jf,jmp
+        if (op==63) next1=NONE;  // jmp
+        if ((next2<0 || next2>=hlen) && next2!=NONE) next2=hlen-1; // error
+        if (next1!=NONE && !(it[next1]&1)) it[next1]|=1, ++done;
+        if (next2!=NONE && !(it[next2]&2)) it[next2]|=2, ++done;
+      }
+    }
+  } while (done>0);
+
+  // Set it[i] bits 2-3 to 4, 8, or 12 if a comparison
+  //  (<, >, == respectively) does not need to save the result in f,
+  // or if a conditional jump (jt, jf) does not need to read f.
+  // This is true if a comparison is followed directly by a jt/jf,
+  // the jt/jf is not a jump target, the byte before is not a jump
+  // target (for a 2 byte comparison), and for the comparison instruction
+  // if both paths after the jt/jf lead to another comparison or error
+  // before another jt/jf. At most hlen steps are traced because after
+  // that it must be an infinite loop.
+  for (int i=0; i<hlen; ++i) {
+    const int op1=hcomp[i]; // 216..239 = comparison
+    const int i2=i+1+(op1%8==7);  // address of next instruction
+    const int op2=hcomp[i2];  // 39,47 = jt,jf
+    if (it[i] && op1>=216 && op1<240 && (op2==39 || op2==47)
+        && it[i2]==1 && (i2==i+1 || it[i+1]==0)) {
+      int code=(op1-208)/8*4; // 4,8,12 is ==,<,>
+      it[i2]+=code;  // OK to test CF, ZF instead of f
+      for (int j=0; j<2 && code; ++j) {  // trace each path from i2
+        int k=i2+2; // branch not taken
+        if (j==1) k=i2+2+(hcomp[i2+1]<<24>>24);  // branch taken
+        for (int l=0; l<hlen && code; ++l) {  // trace at most hlen steps
+          if (k<0 || k>=hlen) break;  // out of bounds, pass
+          const int op=hcomp[k];
+          if (op==39 || op==47) code=0;  // jt,jf, fail
+          else if (op>=216 && op<240) break;  // ==,<,>, pass
+          else if (iserr(op)) break;  // error, pass
+          else if (op==255) k=hcomp[k+1]+256*hcomp[k+2]; // lj
+          else if (op==63) k=k+2+(hcomp[k+1]<<24>>24);  // jmp
+          else if (op==56) k=0;  // halt
+          else k=k+1+(op%8==7);  // ordinary instruction
+        }
+      }
+      it[i]+=code;  // if > 0 then OK to not save flags in f (bl)
+    }
+  }
+
+  // Start of run(): Save x86 and load ZPAQL registers
+  const int start=o;
+  assert(start>=16);
+  put1(0x56);          // push esi/rsi
+  put1(0x57);          // push edi/rdi
+  put1(0x53);          // push ebx/rbx
+  put1(0x55);          // push ebp/rbp
+  if (S==8) {
+    put2(0x4154);      // push r12
+    put2(0x4155);      // push r13
+    put2(0x4156);      // push r14
+    put2(0x4157);      // push r15
+    put4(0x4883ec38);  // sub rsp, 56
+    put2l(0x48b8, &a); // mov rax, a
+    put2(0x8b10);      // mov edx, [rax]
+    put2l(0x48b8, &b); // mov rax, b
+    put2(0x8b30);      // mov esi, [rax]
+    put2l(0x48b8, &c); // mov rax, c
+    put2(0x8b38);      // mov edi, [rax]
+    put2l(0x48b8, &d); // mov rax, d
+    put2(0x8b28);      // mov ebp, [rax]
+    put2l(0x48b8, &f); // mov rax, f
+    put2(0x8b18);      // mov ebx, [rax]
+    put2l(0x49bc, &h[0]);   // mov r12, h
+    put2l(0x49bd, &outbuf[0]); // mov r13, outbuf.p
+    put2l(0x49be, &r[0]);   // mov r14, r
+    put2l(0x49bf, &m[0]);   // mov r15, m
+  }
+  else {
+    put3(0x83ec3c);    // sub esp, 60
+    put2a(0x8b15, &a); // mov edx, [a]
+    put2a(0x8b35, &b); // mov esi, [b]
+    put2a(0x8b3d, &c); // mov edi, [c]
+    put2a(0x8b2d, &d); // mov ebp, [d]
+    put2a(0x8b1d, &f); // mov ebx, [f]
+  }
+
+  // Assemble in multiple passes until every byte of hcomp has a translation
+  for (int istart=0; istart<hlen; ++istart) {
+    for (int i=istart; i<hlen&&it[i]; i=i+1+(hcomp[i]%8==7)+(hcomp[i]==255)) {
+      const int code=it[i];
+
+      // If already assembled, then assemble a jump to it
+      U32 t;
+      assert(it.isize()>i);
+      assert(i>=0 && i<hlen);
+      if (code>=16) {
+        if (i>istart) {
+          int a=code-o;
+          if (a>-120 && a<120)
+            put2(0xeb00+((a-2)&255)); // jmp short o
+          else
+            put1a(0xe9, a-5);  // jmp near o
+        }
+        break;
+      }
+
+      // Else assemble the instruction at hcode[i] to rcode[o]
+      else {
+        assert(i>=0 && i<it.isize());
+        assert(it[i]>0 && it[i]<16);
+        assert(o>=16);
+        it[i]=o;
+        ++done;
+        const int op=hcomp[i];
+        const int arg=hcomp[i+1]+((op==255)?256*hcomp[i+2]:0);
+        const int ddd=op/8%8;
+        const int sss=op%8;
+
+        // error instruction: return 0
+        if (iserr(op)) {
+          put2(0x31c0);           // xor eax, eax
+          put1a(0xe9, halt-o-4);  // jmp near halt
+          continue;
+        }
+
+        // Load source *b, *c, *d, or hash (*b) into eax except:
+        // {a,b,c,d}=*d, a{+,-,*,&,|,^,=,==,>,>}=*d: load address to eax
+        // {a,b,c,d}={*b,*c}: load source into ddd
+        if (op==59 || (op>=64 && op<240 && op%8>=4 && op%8<7)) {
+          put2(0x89c0+8*regcode[sss-3+(op==59)]);  // mov eax, {esi,edi,ebp}
+          const int sz=(sss==6?hsize:msize)-1;
+          if (sz>=128) put1a(0x25, sz);            // and eax, dword msize-1
+          else put3(0x83e000+sz);                  // and eax, byte msize-1
+          const int move=(op>=64 && op<112); // = or else ddd is eax
+          if (sss<6) { // ddd={a,b,c,d,*b,*c}
+            if (S==8) put5(0x410fb604+8*move*regcode[ddd],0x07);
+                                                   // movzx ddd, byte [r15+rax]
+            else put3a(0x0fb680+8*move*regcode[ddd], &m[0]);
+                                                   // movzx ddd, byte [m+eax]
+          }
+          else if ((0x06587000>>(op/8))&1) {// {*b,*c,*d,a/,a%,a&~,a<<,a>>}=*d
+            if (S==8) put4(0x418b0484);            // mov eax, [r12+rax*4]
+            else put3a(0x8b0485, &h[0]);           // mov eax, [h+eax*4]
+          }
+        }
+
+        // Load destination address *b, *c, *d or hashd (*d) into ecx
+        if ((op>=32 && op<56 && op%8<5) || (op>=96 && op<120) || op==60) {
+          put2(0x89c1+8*regcode[op/8%8-3-(op==60)]);// mov ecx,{esi,edi,ebp}
+          const int sz=(ddd==6||op==60?hsize:msize)-1;
+          if (sz>=128) put2a(0x81e1, sz);   // and ecx, dword sz
+          else put3(0x83e100+sz);           // and ecx, byte sz
+          if (op/8%8==6 || op==60) { // *d
+            if (S==8) put4(0x498d0c8c);     // lea rcx, [r12+rcx*4]
+            else put3a(0x8d0c8d, &h[0]);    // lea ecx, [ecx*4+h]
+          }
+          else { // *b, *c
+            if (S==8) put4(0x498d0c0f);     // lea rcx, [r15+rcx]
+            else put2a(0x8d89, &m[0]);      // lea ecx, [ecx+h]
+          }
+        }
+
+        // Translate by opcode
+        switch((op/8)&31) {
+          case 0:  // ddd = a
+          case 1:  // ddd = b
+          case 2:  // ddd = c
+          case 3:  // ddd = d
+            switch(sss) {
+              case 0:  // ddd<>a (swap)
+                put2(0x87d0+regcode[ddd]);   // xchg edx, ddd
+                break;
+              case 1:  // ddd++
+                put2(0xffc0+regcode[ddd]);   // inc ddd
+                break;
+              case 2:  // ddd--
+                put2(0xffc8+regcode[ddd]);   // dec ddd
+                break;
+              case 3:  // ddd!
+                put2(0xf7d0+regcode[ddd]);   // not ddd
+                break;
+              case 4:  // ddd=0
+                put2(0x31c0+9*regcode[ddd]); // xor ddd,ddd
+                break;
+              case 7:  // ddd=r n
+                if (S==8)
+                  put3a(0x418b86+8*regcode[ddd], arg*4); // mov ddd, [r14+n*4]
+                else
+                  put2a(0x8b05+8*regcode[ddd], (&r[arg]));//mov ddd, [r+n]
+                break;
+            }
+            break;
+          case 4:  // ddd = *b
+          case 5:  // ddd = *c
+            switch(sss) {
+              case 0:  // ddd<>a (swap)
+                put2(0x8611);                // xchg dl, [ecx]
+                break;
+              case 1:  // ddd++
+                put2(0xfe01);                // inc byte [ecx]
+                break;
+              case 2:  // ddd--
+                put2(0xfe09);                // dec byte [ecx]
+                break;
+              case 3:  // ddd!
+                put2(0xf611);                // not byte [ecx]
+                break;
+              case 4:  // ddd=0
+                put2(0x31c0);                // xor eax, eax
+                put2(0x8801);                // mov [ecx], al
+                break;
+              case 7:  // jt, jf
+              {
+                assert(code>=0 && code<16);
+                const int jtab[2][4]={{5,4,2,7},{4,5,3,6}};
+                               // jnz,je,jb,ja, jz,jne,jae,jbe
+                if (code<4) put2(0x84db);    // test bl, bl
+                if (arg>=128 && arg-257-i>=0 && o-it[arg-257-i]<120)
+                  put2(0x7000+256*jtab[op==47][code/4]); // jx short 0
+                else
+                  put2a(0x0f80+jtab[op==47][code/4], 0); // jx near 0
+                break;
+              }
+            }
+            break;
+          case 6:  // ddd = *d
+            switch(sss) {
+              case 0:  // ddd<>a (swap)
+                put2(0x8711);             // xchg edx, [ecx]
+                break;
+              case 1:  // ddd++
+                put2(0xff01);             // inc dword [ecx]
+                break;
+              case 2:  // ddd--
+                put2(0xff09);             // dec dword [ecx]
+                break;
+              case 3:  // ddd!
+                put2(0xf711);             // not dword [ecx]
+                break;
+              case 4:  // ddd=0
+                put2(0x31c0);             // xor eax, eax
+                put2(0x8901);             // mov [ecx], eax
+                break;
+              case 7:  // ddd=r n
+                if (S==8)
+                  put3a(0x418996, arg*4); // mov [r14+n*4], edx
+                else
+                  put2a(0x8915, &r[arg]); // mov [r+n], edx
+                break;
+            }
+            break;
+          case 7:  // special
+            switch(op) {
+              case 56: // halt
+                put1a(0xb8, 1);           // mov eax, 1
+                put1a(0xe9, halt-o-4);    // jmp near halt
+                break;
+              case 57:  // out
+                put1a(0xe8, outlabel-o-4);// call outlabel
+                break;
+              case 59:  // hash: a = (a + *b + 512) * 773
+                put3a(0x8d8410, 512);     // lea edx, [eax+edx+512]
+                put2a(0x69d0, 773);       // imul edx, eax, 773
+                break;
+              case 60:  // hashd: *d = (*d + a + 512) * 773
+                put2(0x8b01);             // mov eax, [ecx]
+                put3a(0x8d8410, 512);     // lea eax, [eax+edx+512]
+                put2a(0x69c0, 773);       // imul eax, eax, 773
+                put2(0x8901);             // mov [ecx], eax
+                break;
+              case 63:  // jmp
+                put1a(0xe9, 0);           // jmp near 0 (fill in target later)
+                break;
+            }
+            break;
+          case 8:   // a=
+          case 9:   // b=
+          case 10:  // c=
+          case 11:  // d=
+            if (sss==7)  // n
+              put1a(0xb8+regcode[ddd], arg);         // mov ddd, n
+            else if (sss==6) { // *d
+              if (S==8)
+                put4(0x418b0484+(regcode[ddd]<<11)); // mov ddd, [r12+rax*4]
+              else
+                put3a(0x8b0485+(regcode[ddd]<<11),&h[0]);// mov ddd, [h+eax*4]
+            }
+            else if (sss<4) // a, b, c, d
+              put2(0x89c0+regcode[ddd]+8*regcode[sss]);// mov ddd,sss
+            break;
+          case 12:  // *b=
+          case 13:  // *c=
+            if (sss==7) put3(0xc60100+arg);          // mov byte [ecx], n
+            else if (sss==0) put2(0x8811);           // mov byte [ecx], dl
+            else {
+              if (sss<4) put2(0x89c0+8*regcode[sss]);// mov eax, sss
+              put2(0x8801);                          // mov byte [ecx], al
+            }
+            break;
+          case 14:  // *d=
+            if (sss<7) put2(0x8901+8*regcode[sss]);  // mov [ecx], sss
+            else put2a(0xc701, arg);                 // mov dword [ecx], n
+            break;
+          case 15: break; // not used
+          case 16:  // a+=
+            if (sss==6) {
+              if (S==8) put4(0x41031484);            // add edx, [r12+rax*4]
+              else put3a(0x031485, &h[0]);           // add edx, [h+eax*4]
+            }
+            else if (sss<7) put2(0x01c2+8*regcode[sss]);// add edx, sss
+            else if (arg>128) put2a(0x81c2, arg);    // add edx, n
+            else put3(0x83c200+arg);                 // add edx, byte n
+            break;
+          case 17:  // a-=
+            if (sss==6) {
+              if (S==8) put4(0x412b1484);            // sub edx, [r12+rax*4]
+              else put3a(0x2b1485, &h[0]);           // sub edx, [h+eax*4]
+            }
+            else if (sss<7) put2(0x29c2+8*regcode[sss]);// sub edx, sss
+            else if (arg>=128) put2a(0x81ea, arg);   // sub edx, n
+            else put3(0x83ea00+arg);                 // sub edx, byte n
+            break;
+          case 18:  // a*=
+            if (sss==6) {
+              if (S==8) put5(0x410faf14,0x84);       // imul edx, [r12+rax*4]
+              else put4a(0x0faf1485, &h[0]);         // imul edx, [h+eax*4]
+            }
+            else if (sss<7) put3(0x0fafd0+regcode[sss]);// imul edx, sss
+            else if (arg>=128) put2a(0x69d2, arg);   // imul edx, n
+            else put3(0x6bd200+arg);                 // imul edx, byte n
+            break;
+          case 19:  // a/=
+          case 20:  // a%=
+            if (sss<7) put2(0x89c1+8*regcode[sss]);  // mov ecx, sss
+            else put1a(0xb9, arg);                   // mov ecx, n
+            put2(0x85c9);                            // test ecx, ecx
+            put3(0x0f44d1);                          // cmovz edx, ecx
+            put2(0x7408-2*(op/8==20));               // jz (over rest)
+            put2(0x89d0);                            // mov eax, edx
+            put2(0x31d2);                            // xor edx, edx
+            put2(0xf7f1);                            // div ecx
+            if (op/8==19) put2(0x89c2);              // mov edx, eax
+            break;
+          case 21:  // a&=
+            if (sss==6) {
+              if (S==8) put4(0x41231484);            // and edx, [r12+rax*4]
+              else put3a(0x231485, &h[0]);           // and edx, [h+eax*4]
+            }
+            else if (sss<7) put2(0x21c2+8*regcode[sss]);// and edx, sss
+            else if (arg>=128) put2a(0x81e2, arg);   // and edx, n
+            else put3(0x83e200+arg);                 // and edx, byte n
+            break;
+          case 22:  // a&~
+            if (sss==7) {
+              if (arg<128) put3(0x83e200+(~arg&255));// and edx, byte ~n
+              else put2a(0x81e2, ~arg);              // and edx, ~n
+            }
+            else {
+              if (sss<4) put2(0x89c0+8*regcode[sss]);// mov eax, sss
+              put2(0xf7d0);                          // not eax
+              put2(0x21c2);                          // and edx, eax
+            }
+            break;
+          case 23:  // a|=
+            if (sss==6) {
+              if (S==8) put4(0x410b1484);            // or edx, [r12+rax*4]
+              else put3a(0x0b1485, &h[0]);           // or edx, [h+eax*4]
+            }
+            else if (sss<7) put2(0x09c2+8*regcode[sss]);// or edx, sss
+            else if (arg>=128) put2a(0x81ca, arg);   // or edx, n
+            else put3(0x83ca00+arg);                 // or edx, byte n
+            break;
+          case 24:  // a^=
+            if (sss==6) {
+              if (S==8) put4(0x41331484);            // xor edx, [r12+rax*4]
+              else put3a(0x331485, &h[0]);           // xor edx, [h+eax*4]
+            }
+            else if (sss<7) put2(0x31c2+8*regcode[sss]);// xor edx, sss
+            else if (arg>=128) put2a(0x81f2, arg);   // xor edx, byte n
+            else put3(0x83f200+arg);                 // xor edx, n
+            break;
+          case 25:  // a<<=
+          case 26:  // a>>=
+            if (sss==7)  // sss = n
+              put3(0xc1e200+8*256*(op/8==26)+arg);   // shl/shr n
+            else {
+              put2(0x89c1+8*regcode[sss]);           // mov ecx, sss
+              put2(0xd3e2+8*(op/8==26));             // shl/shr edx, cl
+            }
+            break;
+          case 27:  // a==
+          case 28:  // a<
+          case 29:  // a>
+            if (sss==6) {
+              if (S==8) put4(0x413b1484);            // cmp edx, [r12+rax*4]
+              else put3a(0x3b1485, &h[0]);           // cmp edx, [h+eax*4]
+            }
+            else if (sss==7)  // sss = n
+              put2a(0x81fa, arg);                    // cmp edx, dword n
+            else
+              put2(0x39c2+8*regcode[sss]);           // cmp edx, sss
+            if (code<4) {
+              if (op/8==27) put3(0x0f94c3);          // setz bl
+              if (op/8==28) put3(0x0f92c3);          // setc bl
+              if (op/8==29) put3(0x0f97c3);          // seta bl
+            }
+            break;
+          case 30:  // not used
+          case 31:  // 255 = lj
+            if (op==255) put1a(0xe9, 0);             // jmp near
+            break;
+        }
+      }
+    }
+  }
+
+  // Finish first pass
+  const int rsize=o;
+  if (o>rcode_size) return rsize;
+
+  // Fill in jump addresses (second pass)
+  for (int i=0; i<hlen; ++i) {
+    if (it[i]<16) continue;
+    int op=hcomp[i];
+    if (op==39 || op==47 || op==63 || op==255) {  // jt, jf, jmp, lj
+      int target=hcomp[i+1];
+      if (op==255) target+=hcomp[i+2]*256;  // lj
+      else {
+        if (target>=128) target-=256;
+        target+=i+2;
+      }
+      if (target<0 || target>=hlen) target=hlen-1;  // runtime ZPAQL error
+      o=it[i];
+      assert(o>=16 && o<rcode_size);
+      if ((op==39 || op==47) && rcode[o]==0x84) o+=2;  // jt, jf -> skip test
+      assert(o>=16 && o<rcode_size);
+      if (rcode[o]==0x0f) ++o;  // first byte of jz near, jnz near
+      assert(o<rcode_size);
+      op=rcode[o++];  // x86 opcode
+      target=it[target]-o;
+      if ((op>=0x72 && op<0x78) || op==0xeb) {  // jx, jmp short
+        --target;
+        if (target<-128 || target>127)
+          error("Cannot code x86 short jump");
+        assert(o<rcode_size);
+        rcode[o]=target&255;
+      }
+      else if ((op>=0x82 && op<0x88) || op==0xe9) // jx, jmp near
+      {
+        target-=4;
+        puta(target);
+      }
+      else assert(false);  // not a x86 jump
+    }
+  }
+
+  // Jump to start
+  o=0;
+  put1a(0xe9, start-5);  // jmp near start
+  return rsize;
+}
+
+//////////////////////// Predictor::assemble_p() /////////////////////
+
+// Assemble the ZPAQL code in the HCOMP section of z.header to pcode and
+// return the number of bytes of x86 or x86-64 code written, or that would
+// be written if pcode were large enough. The code for predict() begins
+// at pr.pcode[0] and update() at pr.pcode[5], both as jmp instructions.
+
+// The assembled code is equivalent to int predict(Predictor*)
+// and void update(Predictor*, int y). The Predictor address is placed in
+// edi/rdi. The update bit y is placed in ebp/rbp.
+
+int Predictor::assemble_p() {
+  Predictor& pr=*this;
+  U8* rcode=pr.pcode;         // x86 output array
+  int rcode_size=pcode_size;  // output size
+  int o=0;                    // output index in pcode
+  const int S=sizeof(char*);  // 4 or 8
+  U8* hcomp=&pr.z.header[0];  // The code to translate
+#define off(x)  ((char*)&(pr.x)-(char*)&pr)
+#define offc(x) ((char*)&(pr.comp[i].x)-(char*)&pr)
+
+  // test for little-endian (probably x86)
+  U32 t=0x12345678;
+  if (*(char*)&t!=0x78 || (S!=4 && S!=8))
+    error("JIT supported only for x86-32 and x86-64");
+
+  // Initialize for predict(). Put predictor address in edi/rdi
+  put1a(0xe9, 5);             // jmp predict
+  put1a(0, 0x90909000);       // reserve space for jmp update
+  put1(0x53);                 // push ebx/rbx
+  put1(0x55);                 // push ebp/rbp
+  put1(0x56);                 // push esi/rsi
+  put1(0x57);                 // push edi/rdi
+  if (S==4)
+    put4(0x8b7c2414);         // mov edi,[esp+0x14] ; pr
+  else {
+#ifndef unix
+    put3(0x4889cf);           // mov rdi, rcx (1st arg in Win64)
+#endif
+  }
+
+  // Code predict() for each component
+  const int n=hcomp[6];  // number of components
+  U8* cp=hcomp+7;
+  for (int i=0; i<n; ++i, cp+=compsize[cp[0]]) {
+    if (cp-hcomp>=pr.z.cend) error("comp too big");
+    if (cp[0]<1 || cp[0]>9) error("invalid component");
+    assert(compsize[cp[0]]>0 && compsize[cp[0]]<8);
+    switch (cp[0]) {
+
+      case CONS:  // c
+        break;
+
+      case CM:  // sizebits limit
+        // Component& cr=comp[i];
+        // cr.cxt=h[i]^hmap4;
+        // p[i]=stretch(cr.cm(cr.cxt)>>17);
+
+        put2a(0x8b87, off(h[i]));              // mov eax, [edi+&h[i]]
+        put2a(0x3387, off(hmap4));             // xor eax, [edi+&hmap4]
+        put1a(0x25, (1<<cp[1])-1);             // and eax, size-1
+        put2a(0x8987, offc(cxt));              // mov [edi+cxt], eax
+        if (S==8) put1(0x48);                  // rex.w (esi->rsi)
+        put2a(0x8bb7, offc(cm));               // mov esi, [edi+&cm]
+        put3(0x8b0486);                        // mov eax, [esi+eax*4]
+        put3(0xc1e811);                        // shr eax, 17
+        put4a(0x0fbf8447, off(stretcht));      // movsx eax,word[edi+eax*2+..]
+        put2a(0x8987, off(p[i]));              // mov [edi+&p[i]], eax
+        break;
+
+      case ISSE:  // sizebits j -- c=hi, cxt=bh
+        // assert((hmap4&15)>0);
+        // if (c8==1 || (c8&0xf0)==16)
+        //   cr.c=find(cr.ht, cp[1]+2, h[i]+16*c8);
+        // cr.cxt=cr.ht[cr.c+(hmap4&15)];  // bit history
+        // int *wt=(int*)&cr.cm[cr.cxt*2];
+        // p[i]=clamp2k((wt[0]*p[cp[2]]+wt[1]*64)>>16);
+
+      case ICM: // sizebits
+        // assert((hmap4&15)>0);
+        // if (c8==1 || (c8&0xf0)==16) cr.c=find(cr.ht, cp[1]+2, h[i]+16*c8);
+        // cr.cxt=cr.ht[cr.c+(hmap4&15)];
+        // p[i]=stretch(cr.cm(cr.cxt)>>8);
+        //
+        // Find cxt row in hash table ht. ht has rows of 16 indexed by the low
+        // sizebits of cxt with element 0 having the next higher 8 bits for
+        // collision detection. If not found after 3 adjacent tries, replace
+        // row with lowest element 1 as priority. Return index of row.
+        //
+        // size_t Predictor::find(Array<U8>& ht, int sizebits, U32 cxt) {
+        //  assert(ht.size()==size_t(16)<<sizebits);
+        //  int chk=cxt>>sizebits&255;
+        //  size_t h0=(cxt*16)&(ht.size()-16);
+        //  if (ht[h0]==chk) return h0;
+        //  size_t h1=h0^16;
+        //  if (ht[h1]==chk) return h1;
+        //  size_t h2=h0^32;
+        //  if (ht[h2]==chk) return h2;
+        //  if (ht[h0+1]<=ht[h1+1] && ht[h0+1]<=ht[h2+1])
+        //    return memset(&ht[h0], 0, 16), ht[h0]=chk, h0;
+        //  else if (ht[h1+1]<ht[h2+1])
+        //    return memset(&ht[h1], 0, 16), ht[h1]=chk, h1;
+        //  else
+        //    return memset(&ht[h2], 0, 16), ht[h2]=chk, h2;
+        // }
+
+        if (S==8) put1(0x48);                  // rex.w
+        put2a(0x8bb7, offc(ht));               // mov esi, [edi+&ht]
+        put2(0x8b07);                          // mov eax, edi ; c8
+        put2(0x89c1);                          // mov ecx, eax ; c8
+        put3(0x83f801);                        // cmp eax, 1
+        put2(0x740a);                          // je L1
+        put1a(0x25, 240);                      // and eax, 0xf0
+        put3(0x83f810);                        // cmp eax, 16
+        put2(0x7576);                          // jne L2 ; skip find()
+           // L1: ; find cxt in ht, return index in eax
+        put3(0xc1e104);                        // shl ecx, 4
+        put2a(0x038f, off(h[i]));              // add [edi+&h[i]]
+        put2(0x89c8);                          // mov eax, ecx ; cxt
+        put3(0xc1e902+cp[1]);                  // shr ecx, sizebits+2
+        put2a(0x81e1, 255);                    // and eax, 255 ; chk
+        put3(0xc1e004);                        // shl eax, 4
+        put1a(0x25, (64<<cp[1])-16);           // and eax, ht.size()-16 = h0
+        put3(0x3a0c06);                        // cmp cl, [esi+eax] ; ht[h0]
+        put2(0x744d);                          // je L3 ; match h0
+        put3(0x83f010);                        // xor eax, 16 ; h1
+        put3(0x3a0c06);                        // cmp cl, [esi+eax]
+        put2(0x7445);                          // je L3 ; match h1
+        put3(0x83f030);                        // xor eax, 48 ; h2
+        put3(0x3a0c06);                        // cmp cl, [esi+eax]
+        put2(0x743d);                          // je L3 ; match h2
+          // No checksum match, so replace the lowest priority among h0,h1,h2
+        put3(0x83f021);                        // xor eax, 33 ; h0+1
+        put3(0x8a1c06);                        // mov bl, [esi+eax] ; ht[h0+1]
+        put2(0x89c2);                          // mov edx, eax ; h0+1
+        put3(0x83f220);                        // xor edx, 32  ; h2+1
+        put3(0x3a1c16);                        // cmp bl, [esi+edx]
+        put2(0x7708);                          // ja L4 ; test h1 vs h2
+        put3(0x83f230);                        // xor edx, 48  ; h1+1
+        put3(0x3a1c16);                        // cmp bl, [esi+edx]
+        put2(0x7611);                          // jbe L7 ; replace h0
+          // L4: ; h0 is not lowest, so replace h1 or h2
+        put3(0x83f010);                        // xor eax, 16 ; h1+1
+        put3(0x8a1c06);                        // mov bl, [esi+eax]
+        put3(0x83f030);                        // xor eax, 48 ; h2+1
+        put3(0x3a1c06);                        // cmp bl, [esi+eax]
+        put2(0x7303);                          // jae L7
+        put3(0x83f030);                        // xor eax, 48 ; h1+1
+          // L7: ; replace row pointed to by eax = h0,h1,h2
+        put3(0x83f001);                        // xor eax, 1
+        put3(0x890c06);                        // mov [esi+eax], ecx ; chk
+        put2(0x31c9);                          // xor ecx, ecx
+        put4(0x894c0604);                      // mov [esi+eax+4], ecx
+        put4(0x894c0608);                      // mov [esi+eax+8], ecx
+        put4(0x894c060c);                      // mov [esi+eax+12], ecx
+          // L3: ; save nibble context (in eax) in c
+        put2a(0x8987, offc(c));                // mov [edi+c], eax
+        put2(0xeb06);                          // jmp L8
+          // L2: ; get nibble context
+        put2a(0x8b87, offc(c));                // mov eax, [edi+c]
+          // L8: ; nibble context is in eax
+        put2a(0x8b97, off(hmap4));             // mov edx, [edi+&hmap4]
+        put3(0x83e20f);                        // and edx, 15  ; hmap4
+        put2(0x01d0);                          // add eax, edx ; c+(hmap4&15)
+        put4(0x0fb61406);                      // movzx edx, byte [esi+eax]
+        put2a(0x8997, offc(cxt));              // mov [edi+&cxt], edx ; cxt=bh
+        if (S==8) put1(0x48);                  // rex.w
+        put2a(0x8bb7, offc(cm));               // mov esi, [edi+&cm] ; cm
+
+        // esi points to cm[256] (ICM) or cm[512] (ISSE) with 23 bit
+        // prediction (ICM) or a pair of 20 bit signed weights (ISSE).
+        // cxt = bit history bh (0..255) is in edx.
+        if (cp[0]==ICM) {
+          put3(0x8b0496);                      // mov eax, [esi+edx*4];cm[bh]
+          put3(0xc1e808);                      // shr eax, 8
+          put4a(0x0fbf8447, off(stretcht));    // movsx eax,word[edi+eax*2+..]
+        }
+        else {  // ISSE
+          put2a(0x8b87, off(p[cp[2]]));        // mov eax, [edi+&p[j]]
+          put4(0x0faf04d6);                    // imul eax, [esi+edx*8] ;wt[0]
+          put4(0x8b4cd604);                    // mov ecx, [esi+edx*8+4];wt[1]
+          put3(0xc1e106);                      // shl ecx, 6
+          put2(0x01c8);                        // add eax, ecx
+          put3(0xc1f810);                      // sar eax, 16
+          put1a(0xb9, 2047);                   // mov ecx, 2047
+          put2(0x39c8);                        // cmp eax, ecx
+          put3(0x0f4fc1);                      // cmovg eax, ecx
+          put1a(0xb9, -2048);                  // mov ecx, -2048
+          put2(0x39c8);                        // cmp eax, ecx
+          put3(0x0f4cc1);                      // cmovl eax, ecx
+
+        }
+        put2a(0x8987, off(p[i]));              // mov [edi+&p[i]], eax
+        break;
+
+      case MATCH: // sizebits bufbits: a=len, b=offset, c=bit, cxt=bitpos,
+                  //                   ht=buf, limit=pos
+        // assert(cr.cm.size()==(size_t(1)<<cp[1]));
+        // assert(cr.ht.size()==(size_t(1)<<cp[2]));
+        // assert(cr.a<=255);
+        // assert(cr.c==0 || cr.c==1);
+        // assert(cr.cxt<8);
+        // assert(cr.limit<cr.ht.size());
+        // if (cr.a==0) p[i]=0;
+        // else {
+        //   cr.c=(cr.ht(cr.limit-cr.b)>>(7-cr.cxt))&1; // predicted bit
+        //   p[i]=stretch(dt2k[cr.a]*(cr.c*-2+1)&32767);
+        // }
+
+        if (S==8) put1(0x48);          // rex.w
+        put2a(0x8bb7, offc(ht));       // mov esi, [edi+&ht]
+
+        // If match length (a) is 0 then p[i]=0
+        put2a(0x8b87, offc(a));        // mov eax, [edi+&a]
+        put2(0x85c0);                  // test eax, eax
+        put2(0x7449);                  // jz L2 ; p[i]=0
+
+        // Else put predicted bit in c
+        put1a(0xb9, 7);                // mov ecx, 7
+        put2a(0x2b8f, offc(cxt));      // sub ecx, [edi+&cxt]
+        put2a(0x8b87, offc(limit));    // mov eax, [edi+&limit]
+        put2a(0x2b87, offc(b));        // sub eax, [edi+&b]
+        put1a(0x25, (1<<cp[2])-1);     // and eax, ht.size()-1
+        put4(0x0fb60406);              // movzx eax, byte [esi+eax]
+        put2(0xd3e8);                  // shr eax, cl
+        put3(0x83e001);                // and eax, 1  ; predicted bit
+        put2a(0x8987, offc(c));        // mov [edi+&c], eax ; c
+
+        // p[i]=stretch(dt2k[cr.a]*(cr.c*-2+1)&32767);
+        put2a(0x8b87, offc(a));        // mov eax, [edi+&a]
+        put3a(0x8b8487, off(dt2k));    // mov eax, [edi+eax*4+&dt2k] ; weight
+        put2(0x7402);                  // jz L1 ; z if c==0
+        put2(0xf7d8);                  // neg eax
+        put1a(0x25, 0x7fff);           // L1: and eax, 32767
+        put4a(0x0fbf8447, off(stretcht)); //movsx eax, word [edi+eax*2+...]
+        put2a(0x8987, off(p[i]));      // L2: mov [edi+&p[i]], eax
+        break;
+
+      case AVG: // j k wt
+        // p[i]=(p[cp[1]]*cp[3]+p[cp[2]]*(256-cp[3]))>>8;
+
+        put2a(0x8b87, off(p[cp[1]]));  // mov eax, [edi+&p[j]]
+        put2a(0x2b87, off(p[cp[2]]));  // sub eax, [edi+&p[k]]
+        put2a(0x69c0, cp[3]);          // imul eax, wt
+        put3(0xc1f808);                // sar eax, 8
+        put2a(0x0387, off(p[cp[2]]));  // add eax, [edi+&p[k]]
+        put2a(0x8987, off(p[i]));      // mov [edi+&p[i]], eax
+        break;
+
+      case MIX2:   // sizebits j k rate mask
+                   // c=size cm=wt[size] cxt=input
+        // cr.cxt=((h[i]+(c8&cp[5]))&(cr.c-1));
+        // assert(cr.cxt<cr.a16.size());
+        // int w=cr.a16[cr.cxt];
+        // assert(w>=0 && w<65536);
+        // p[i]=(w*p[cp[2]]+(65536-w)*p[cp[3]])>>16;
+        // assert(p[i]>=-2048 && p[i]<2048);
+
+        put2(0x8b07);                  // mov eax, [edi] ; c8
+        put1a(0x25, cp[5]);            // and eax, mask
+        put2a(0x0387, off(h[i]));      // add eax, [edi+&h[i]]
+        put1a(0x25, (1<<cp[1])-1);     // and eax, size-1
+        put2a(0x8987, offc(cxt));      // mov [edi+&cxt], eax ; cxt
+        if (S==8) put1(0x48);          // rex.w
+        put2a(0x8bb7, offc(a16));      // mov esi, [edi+&a16]
+        put4(0x0fb70446);              // movzx eax, word [edi+eax*2] ; w
+        put2a(0x8b8f, off(p[cp[2]]));  // mov ecx, [edi+&p[j]]
+        put2a(0x8b97, off(p[cp[3]]));  // mov edx, [edi+&p[k]]
+        put2(0x29d1);                  // sub ecx, edx
+        put3(0x0fafc8);                // imul ecx, eax
+        put3(0xc1e210);                // shl edx, 16
+        put2(0x01d1);                  // add ecx, edx
+        put3(0xc1f910);                // sar ecx, 16
+        put2a(0x898f, off(p[i]));      // mov [edi+&p[i]]
+        break;
+
+      case MIX:    // sizebits j m rate mask
+                   // c=size cm=wt[size][m] cxt=index of wt in cm
+        // int m=cp[3];
+        // assert(m>=1 && m<=i);
+        // cr.cxt=h[i]+(c8&cp[5]);
+        // cr.cxt=(cr.cxt&(cr.c-1))*m; // pointer to row of weights
+        // assert(cr.cxt<=cr.cm.size()-m);
+        // int* wt=(int*)&cr.cm[cr.cxt];
+        // p[i]=0;
+        // for (int j=0; j<m; ++j)
+        //   p[i]+=(wt[j]>>8)*p[cp[2]+j];
+        // p[i]=clamp2k(p[i]>>8);
+
+        put2(0x8b07);                          // mov eax, [edi] ; c8
+        put1a(0x25, cp[5]);                    // and eax, mask
+        put2a(0x0387, off(h[i]));              // add eax, [edi+&h[i]]
+        put1a(0x25, (1<<cp[1])-1);             // and eax, size-1
+        put2a(0x69c0, cp[3]);                  // imul eax, m
+        put2a(0x8987, offc(cxt));              // mov [edi+&cxt], eax ; cxt
+        if (S==8) put1(0x48);                  // rex.w
+        put2a(0x8bb7, offc(cm));               // mov esi, [edi+&cm]
+        if (S==8) put1(0x48);                  // rex.w
+        put3(0x8d3486);                        // lea esi, [esi+eax*4] ; wt
+
+        // Unroll summation loop: esi=wt[0..m-1]
+        for (int k=0; k<cp[3]; k+=8) {
+          const int tail=cp[3]-k;  // number of elements remaining
+
+          // pack 8 elements of wt in xmm1, 8 elements of p in xmm3
+          put4a(0xf30f6f8e, k*4);              // movdqu xmm1, [esi+k*4]
+          if (tail>3) put4a(0xf30f6f96, k*4+16);//movdqu xmm2, [esi+k*4+16]
+          put5(0x660f72e1,0x08);               // psrad xmm1, 8
+          if (tail>3) put5(0x660f72e2,0x08);   // psrad xmm2, 8
+          put4(0x660f6bca);                    // packssdw xmm1, xmm2
+          put4a(0xf30f6f9f, off(p[cp[2]+k]));  // movdqu xmm3, [edi+&p[j+k]]
+          if (tail>3)
+            put4a(0xf30f6fa7,off(p[cp[2]+k+4]));//movdqu xmm4, [edi+&p[j+k+4]]
+          put4(0x660f6bdc);                    // packssdw, xmm3, xmm4
+          if (tail>0 && tail<8) {  // last loop, mask extra weights
+            put4(0x660f76ed);                  // pcmpeqd xmm5, xmm5 ; -1
+            put5(0x660f73dd, 16-tail*2);       // psrldq xmm5, 16-tail*2
+            put4(0x660fdbcd);                  // pand xmm1, xmm5
+          }
+          if (k==0) {  // first loop, initialize sum in xmm0
+            put4(0xf30f6fc1);                  // movdqu xmm0, xmm1
+            put4(0x660ff5c3);                  // pmaddwd xmm0, xmm3
+          }
+          else {  // accumulate sum in xmm0
+            put4(0xf30f6fd1);                  // movdqu xmm2, xmm1
+            put4(0x660ff5d3);                  // pmaddwd xmm2, xmm3
+            put4(0x660ffec2);                  // paddd, xmm0, xmm2
+          }
+        }
+
+        // Add up the 4 elements of xmm0 = p[i] in the first element
+        put4(0xf30f6fc8);                      // movdqu xmm1, xmm0
+        put5(0x660f73d9,0x08);                 // psrldq xmm1, 8
+        put4(0x660ffec1);                      // paddd xmm0, xmm1
+        put4(0xf30f6fc8);                      // movdqu xmm1, xmm0
+        put5(0x660f73d9,0x04);                 // psrldq xmm1, 4
+        put4(0x660ffec1);                      // paddd xmm0, xmm1
+        put4(0x660f7ec0);                      // movd eax, xmm0 ; p[i]
+        put3(0xc1f808);                        // sar eax, 8
+        put1a(0xb9, 2047);                     // mov ecx, 2047 ; clamp2k
+        put2(0x39c8);                          // cmp eax, ecx
+        put3(0x0f4fc1);                        // cmovg eax, ecx
+        put2(0xf7d1);                          // not ecx ; -2048
+        put2(0x39c8);                          // cmp eax, ecx
+        put3(0x0f4cc1);                        // cmovl eax, ecx
+        put2a(0x8987, off(p[i]));              // mov [edi+&p[i]], eax
+        break;
+
+      case SSE:  // sizebits j start limit
+        // cr.cxt=(h[i]+c8)*32;
+        // int pq=p[cp[2]]+992;
+        // if (pq<0) pq=0;
+        // if (pq>1983) pq=1983;
+        // int wt=pq&63;
+        // pq>>=6;
+        // assert(pq>=0 && pq<=30);
+        // cr.cxt+=pq;
+        // p[i]=stretch(((cr.cm(cr.cxt)>>10)*(64-wt)       // p0
+        //               +(cr.cm(cr.cxt+1)>>10)*wt)>>13);  // p1
+        // // p = p0*(64-wt)+p1*wt = (p1-p0)*wt + p0*64
+        // cr.cxt+=wt>>5;
+
+        put2a(0x8b8f, off(h[i]));      // mov ecx, [edi+&h[i]]
+        put2(0x030f);                  // add ecx, [edi]  ; c0
+        put2a(0x81e1, (1<<cp[1])-1);   // and ecx, size-1
+        put3(0xc1e105);                // shl ecx, 5  ; cxt in 0..size*32-32
+        put2a(0x8b87, off(p[cp[2]]));  // mov eax, [edi+&p[j]] ; pq
+        put1a(0x05, 992);              // add eax, 992
+        put2(0x31d2);                  // xor edx, edx ; 0
+        put2(0x39d0);                  // cmp eax, edx
+        put3(0x0f4cc2);                // cmovl eax, edx
+        put1a(0xba, 1983);             // mov edx, 1983
+        put2(0x39d0);                  // cmp eax, edx
+        put3(0x0f4fc2);                // cmovg eax, edx ; pq in 0..1983
+        put2(0x89c2);                  // mov edx, eax
+        put3(0x83e23f);                // and edx, 63  ; wt in 0..63
+        put3(0xc1e806);                // shr eax, 6   ; pq in 0..30
+        put2(0x01c1);                  // add ecx, eax ; cxt in 0..size*32-2
+        if (S==8) put1(0x48);          // rex.w
+        put2a(0x8bb7, offc(cm));       // mov esi, [edi+cm]
+        put3(0x8b048e);                // mov eax, [esi+ecx*4] ; cm[cxt]
+        put4(0x8b5c8e04);              // mov ebx, [esi+ecx*4+4] ; cm[cxt+1]
+        put3(0x83fa20);                // cmp edx, 32  ; wt
+        put3(0x83d9ff);                // sbb ecx, -1  ; cxt+=wt>>5
+        put2a(0x898f, offc(cxt));      // mov [edi+cxt], ecx  ; cxt saved
+        put3(0xc1e80a);                // shr eax, 10 ; p0 = cm[cxt]>>10
+        put3(0xc1eb0a);                // shr ebx, 10 ; p1 = cm[cxt+1]>>10
+        put2(0x29c3);                  // sub ebx, eax, ; p1-p0
+        put3(0x0fafda);                // imul ebx, edx ; (p1-p0)*wt
+        put3(0xc1e006);                // shr eax, 6
+        put2(0x01d8);                  // add eax, ebx ; p in 0..2^28-1
+        put3(0xc1e80d);                // shr eax, 13  ; p in 0..32767
+        put4a(0x0fbf8447, off(stretcht));  // movsx eax, word [edi+eax*2+...]
+        put2a(0x8987, off(p[i]));      // mov [edi+&p[i]], eax
+        break;
+
+      default:
+        error("invalid ZPAQ component");
+    }
+  }
+
+  // return squash(p[n-1])
+  put2a(0x8b87, off(p[n-1]));          // mov eax, [edi+...]
+  put1a(0x05, 0x800);                  // add eax, 2048
+  put4a(0x0fbf8447, off(squasht[0]));  // movsx eax, word [edi+eax*2+...]
+  put1(0x5f);                          // pop edi
+  put1(0x5e);                          // pop esi
+  put1(0x5d);                          // pop ebp
+  put1(0x5b);                          // pop ebx
+  put1(0xc3);                          // ret
+
+  // Initialize for update() Put predictor address in edi/rdi
+  // and bit y=0..1 in ebp
+  int save_o=o;
+  o=5;
+  put1a(0xe9, save_o-10);      // jmp update
+  o=save_o;
+  put1(0x53);                  // push ebx/rbx
+  put1(0x55);                  // push ebp/rbp
+  put1(0x56);                  // push esi/rsi
+  put1(0x57);                  // push edi/rdi
+  if (S==4) {
+    put4(0x8b7c2414);          // mov edi,[esp+0x14] ; (1st arg = pr)
+    put4(0x8b6c2418);          // mov ebp,[esp+0x18] ; (2nd arg = y)
+  }
+  else {
+#ifdef unix                    // (1st arg already in rdi)
+    put3(0x4889f5);            // mov rbp, rsi (2nd arg in Linux-64)
+#else
+    put3(0x4889cf);            // mov rdi, rcx (1st arg in Win64)
+    put3(0x4889d5);            // mov rbp, rdx (2nd arg)
+#endif
+  }
+
+  // Code update() for each component
+  cp=hcomp+7;
+  for (int i=0; i<n; ++i, cp+=compsize[cp[0]]) {
+    assert(cp-hcomp<pr.z.cend);
+    assert (cp[0]>=1 && cp[0]<=9);
+    assert(compsize[cp[0]]>0 && compsize[cp[0]]<8);
+    switch (cp[0]) {
+
+      case CONS:  // c
+        break;
+
+      case SSE:  // sizebits j start limit
+      case CM:   // sizebits limit
+        // train(cr, y);
+        //
+        // reduce prediction error in cr.cm
+        // void train(Component& cr, int y) {
+        //   assert(y==0 || y==1);
+        //   U32& pn=cr.cm(cr.cxt);
+        //   U32 count=pn&0x3ff;
+        //   int error=y*32767-(cr.cm(cr.cxt)>>17);
+        //   pn+=(error*dt[count]&-1024)+(count<cr.limit);
+
+        if (S==8) put1(0x48);          // rex.w (esi->rsi)
+        put2a(0x8bb7, offc(cm));       // mov esi,[edi+cm]  ; cm
+        put2a(0x8b87, offc(cxt));      // mov eax,[edi+cxt] ; cxt
+        put1a(0x25, pr.comp[i].cm.size()-1);  // and eax, size-1
+        if (S==8) put1(0x48);          // rex.w
+        put3(0x8d3486);                // lea esi,[esi+eax*4] ; &cm[cxt]
+        put2(0x8b06);                  // mov eax,[esi] ; cm[cxt]
+        put2(0x89c2);                  // mov edx, eax  ; cm[cxt]
+        put3(0xc1e811);                // shr eax, 17   ; cm[cxt]>>17
+        put2(0x89e9);                  // mov ecx, ebp  ; y
+        put3(0xc1e10f);                // shl ecx, 15   ; y*32768
+        put2(0x29e9);                  // sub ecx, ebp  ; y*32767
+        put2(0x29c1);                  // sub ecx, eax  ; error
+        put2a(0x81e2, 0x3ff);          // and edx, 1023 ; count
+        put3a(0x8b8497, off(dt));      // mov eax,[edi+edx*4+dt] ; dt[count]
+        put3(0x0fafc8);                // imul ecx, eax ; error*dt[count]
+        put2a(0x81e1, 0xfffffc00);     // and ecx, -1024
+        put2a(0x81fa, cp[2+2*(cp[0]==SSE)]*4); // cmp edx, limit*4
+        put2(0x110e);                  // adc [esi], ecx ; pn+=...
+        break;
+
+      case ICM:   // sizebits: cxt=bh, ht[c][0..15]=bh row
+        // cr.ht[cr.c+(hmap4&15)]=st.next(cr.ht[cr.c+(hmap4&15)], y);
+        // U32& pn=cr.cm(cr.cxt);
+        // pn+=int(y*32767-(pn>>8))>>2;
+
+      case ISSE:  // sizebits j  -- c=hi, cxt=bh
+        // assert(cr.cxt==cr.ht[cr.c+(hmap4&15)]);
+        // int err=y*32767-squash(p[i]);
+        // int *wt=(int*)&cr.cm[cr.cxt*2];
+        // wt[0]=clamp512k(wt[0]+((err*p[cp[2]]+(1<<12))>>13));
+        // wt[1]=clamp512k(wt[1]+((err+16)>>5));
+        // cr.ht[cr.c+(hmap4&15)]=st.next(cr.cxt, y);
+
+        // update bit history bh to next(bh,y=ebp) in ht[c+(hmap4&15)]
+        put3(0x8b4700+off(hmap4));     // mov eax, [edi+&hmap4]
+        put3(0x83e00f);                // and eax, 15
+        put2a(0x0387, offc(c));        // add eax [edi+&c] ; cxt
+        if (S==8) put1(0x48);          // rex.w
+        put2a(0x8bb7, offc(ht));       // mov esi, [edi+&ht]
+        put4(0x0fb61406);              // movzx edx, byte [esi+eax] ; bh
+        put4(0x8d5c9500);              // lea ebx, [ebp+edx*4] ; index to st
+        put4a(0x0fb69c1f, off(st));    // movzx ebx,byte[edi+ebx+st]; next bh
+        put3(0x881c06);                // mov [esi+eax], bl ; save next bh
+        if (S==8) put1(0x48);          // rex.w
+        put2a(0x8bb7, offc(cm));       // mov esi, [edi+&cm]
+
+        // ICM: update cm[cxt=edx=bit history] to reduce prediction error
+        // esi = &cm
+        if (cp[0]==ICM) {
+          if (S==8) put1(0x48);        // rex.w
+          put3(0x8d3496);              // lea esi, [esi+edx*4] ; &cm[bh]
+          put2(0x8b06);                // mov eax, [esi] ; pn
+          put3(0xc1e808);              // shr eax, 8 ; pn>>8
+          put2(0x89e9);                // mov ecx, ebp ; y
+          put3(0xc1e10f);              // shl ecx, 15
+          put2(0x29e9);                // sub ecx, ebp ; y*32767
+          put2(0x29c1);                // sub ecx, eax
+          put3(0xc1f902);              // sar ecx, 2
+          put2(0x010e);                // add [esi], ecx
+        }
+
+        // ISSE: update weights. edx=cxt=bit history (0..255), esi=cm[512]
+        else {
+          put2a(0x8b87, off(p[i]));    // mov eax, [edi+&p[i]]
+          put1a(0x05, 2048);           // add eax, 2048
+          put4a(0x0fb78447, off(squasht)); // movzx eax, word [edi+eax*2+..]
+          put2(0x89e9);                // mov ecx, ebp ; y
+          put3(0xc1e10f);              // shl ecx, 15
+          put2(0x29e9);                // sub ecx, ebp ; y*32767
+          put2(0x29c1);                // sub ecx, eax ; err
+          put2a(0x8b87, off(p[cp[2]]));// mov eax, [edi+&p[j]]
+          put3(0x0fafc1);              // imul eax, ecx
+          put1a(0x05, (1<<12));        // add eax, 4096
+          put3(0xc1f80d);              // sar eax, 13
+          put3(0x0304d6);              // add eax, [esi+edx*8] ; wt[0]
+          put1a(0xbb, (1<<19)-1);      // mov ebx, 524287
+          put2(0x39d8);                // cmp eax, ebx
+          put3(0x0f4fc3);              // cmovg eax, ebx
+          put2(0xf7d3);                // not ebx ; -524288
+          put2(0x39d8);                // cmp eax, ebx
+          put3(0x0f4cc3);              // cmovl eax, ebx
+          put3(0x8904d6);              // mov [esi+edx*8], eax
+          put3(0x83c110);              // add ecx, 16 ; err
+          put3(0xc1f905);              // sar ecx, 5
+          put4(0x034cd604);            // add ecx, [esi+edx*8+4] ; wt[1]
+          put1a(0xb8, (1<<19)-1);      // mov eax, 524287
+          put2(0x39c1);                // cmp ecx, eax
+          put3(0x0f4fc8);              // cmovg ecx, eax
+          put2(0xf7d0);                // not eax ; -524288
+          put2(0x39c1);                // cmp ecx, eax
+          put3(0x0f4cc8);              // cmovl ecx, eax
+          put4(0x894cd604);            // mov [esi+edx*8+4], ecx
+        }
+        break;
+
+      case MATCH: // sizebits bufbits:
+                  //   a=len, b=offset, c=bit, cm=index, cxt=bitpos
+                  //   ht=buf, limit=pos
+        // assert(cr.a<=255);
+        // assert(cr.c==0 || cr.c==1);
+        // assert(cr.cxt<8);
+        // assert(cr.cm.size()==(size_t(1)<<cp[1]));
+        // assert(cr.ht.size()==(size_t(1)<<cp[2]));
+        // if (int(cr.c)!=y) cr.a=0;  // mismatch?
+        // cr.ht(cr.limit)+=cr.ht(cr.limit)+y;
+        // if (++cr.cxt==8) {
+        //   cr.cxt=0;
+        //   ++cr.limit;
+        //   cr.limit&=(1<<cp[2])-1;
+        //   if (cr.a==0) {  // look for a match
+        //     cr.b=cr.limit-cr.cm(h[i]);
+        //     if (cr.b&(cr.ht.size()-1))
+        //       while (cr.a<255
+        //              && cr.ht(cr.limit-cr.a-1)==cr.ht(cr.limit-cr.a-cr.b-1))
+        //         ++cr.a;
+        //   }
+        //   else cr.a+=cr.a<255;
+        //   cr.cm(h[i])=cr.limit;
+        // }
+
+        // Set pointers ebx=&cm, esi=&ht
+        if (S==8) put1(0x48);          // rex.w
+        put2a(0x8bb7, offc(ht));       // mov esi, [edi+&ht]
+        if (S==8) put1(0x48);          // rex.w
+        put2a(0x8b9f, offc(cm));       // mov ebx, [edi+&cm]
+
+        // if (c!=y) a=0;
+        put2a(0x8b87, offc(c));        // mov eax, [edi+&c]
+        put2(0x39e8);                  // cmp eax, ebp ; y
+        put2(0x7408);                  // jz L1
+        put2(0x31c0);                  // xor eax, eax
+        put2a(0x8987, offc(a));        // mov [edi+&a], eax
+
+        // ht(limit)+=ht(limit)+y  (1E)
+        put2a(0x8b87, offc(limit));    // mov eax, [edi+&limit]
+        put4(0x0fb60c06);              // movzx, ecx, byte [esi+eax]
+        put2(0x01c9);                  // add ecx, ecx
+        put2(0x01e9);                  // add ecx, ebp
+        put3(0x880c06);                // mov [esi+eax], cl
+
+        // if (++cxt==8)
+        put2a(0x8b87, offc(cxt));      // mov eax, [edi+&cxt]
+        put2(0xffc0);                  // inc eax
+        put3(0x83e007);                // and eax,byte +0x7
+        put2a(0x8987, offc(cxt));      // mov [edi+&cxt],eax
+        put2a(0x0f85, 0x9b);           // jnz L8
+
+        // ++limit;
+        // limit&=bufsize-1;
+        put2a(0x8b87, offc(limit));    // mov eax,[edi+&limit]
+        put2(0xffc0);                  // inc eax
+        put1a(0x25, (1<<cp[2])-1);     // and eax, bufsize-1
+        put2a(0x8987, offc(limit));    // mov [edi+&limit],eax
+
+        // if (a==0)
+        put2a(0x8b87, offc(a));        // mov eax, [edi+&a]
+        put2(0x85c0);                  // test eax,eax
+        put2(0x755c);                  // jnz L6
+
+        //   b=limit-cm(h[i])
+        put2a(0x8b8f, off(h[i]));      // mov ecx,[edi+h[i]]
+        put2a(0x81e1, (1<<cp[1])-1);   // and ecx, size-1
+        put2a(0x8b87, offc(limit));    // mov eax,[edi-&limit]
+        put3(0x2b048b);                // sub eax,[ebx+ecx*4]
+        put2a(0x8987, offc(b));        // mov [edi+&b],eax
+
+        //   if (b&(bufsize-1))
+        put1a(0xa9, (1<<cp[2])-1);     // test eax, bufsize-1
+        put2(0x7448);                  // jz L7
+
+        //      while (a<255 && ht(limit-a-1)==ht(limit-a-b-1)) ++a;
+        put1(0x53);                    // push ebx
+        put2a(0x8b9f, offc(limit));    // mov ebx,[edi+&limit]
+        put2(0x89da);                  // mov edx,ebx
+        put2(0x29c3);                  // sub ebx,eax  ; limit-b
+        put2(0x31c9);                  // xor ecx,ecx  ; a=0
+        put2a(0x81f9, 0xff);           // L2: cmp ecx,0xff ; while
+        put2(0x741c);                  // jz L3 ; break
+        put2(0xffca);                  // dec edx
+        put2(0xffcb);                  // dec ebx
+        put2a(0x81e2, (1<<cp[2])-1);   // and edx, bufsize-1
+        put2a(0x81e3, (1<<cp[2])-1);   // and ebx, bufsize-1
+        put3(0x8a0416);                // mov al,[esi+edx]
+        put3(0x3a041e);                // cmp al,[esi+ebx]
+        put2(0x7504);                  // jnz L3 ; break
+        put2(0xffc1);                  // inc ecx
+        put2(0xebdc);                  // jmp short L2 ; end while
+        put1(0x5b);                    // L3: pop ebx
+        put2a(0x898f, offc(a));        // mov [edi+&a],ecx
+        put2(0xeb0e);                  // jmp short L7
+
+        // a+=(a<255)
+        put1a(0x3d, 0xff);             // L6: cmp eax, 0xff ; a
+        put3(0x83d000);                // adc eax, 0
+        put2a(0x8987, offc(a));        // mov [edi+&a],eax
+
+        // cm(h[i])=limit
+        put2a(0x8b87, off(h[i]));      // L7: mov eax,[edi+&h[i]]
+        put1a(0x25, (1<<cp[1])-1);     // and eax, size-1
+        put2a(0x8b8f, offc(limit));    // mov ecx,[edi+&limit]
+        put3(0x890c83);                // mov [ebx+eax*4],ecx
+                                       // L8:
+        break;
+
+      case AVG:  // j k wt
+        break;
+
+      case MIX2: // sizebits j k rate mask
+                 // cm=wt[size], cxt=input
+        // assert(cr.a16.size()==cr.c);
+        // assert(cr.cxt<cr.a16.size());
+        // int err=(y*32767-squash(p[i]))*cp[4]>>5;
+        // int w=cr.a16[cr.cxt];
+        // w+=(err*(p[cp[2]]-p[cp[3]])+(1<<12))>>13;
+        // if (w<0) w=0;
+        // if (w>65535) w=65535;
+        // cr.a16[cr.cxt]=w;
+
+        // set ecx=err
+        put2a(0x8b87, off(p[i]));      // mov eax, [edi+&p[i]]
+        put1a(0x05, 2048);             // add eax, 2048
+        put4a(0x0fb78447, off(squasht));//movzx eax, word [edi+eax*2+&squasht]
+        put2(0x89e9);                  // mov ecx, ebp ; y
+        put3(0xc1e10f);                // shl ecx, 15
+        put2(0x29e9);                  // sub ecx, ebp ; y*32767
+        put2(0x29c1);                  // sub ecx, eax
+        put2a(0x69c9, cp[4]);          // imul ecx, rate
+        put3(0xc1f905);                // sar ecx, 5  ; err
+
+        // Update w
+        put2a(0x8b87, offc(cxt));      // mov eax, [edi+&cxt]
+        if (S==8) put1(0x48);          // rex.w
+        put2a(0x8bb7, offc(a16));      // mov esi, [edi+&a16]
+        if (S==8) put1(0x48);          // rex.w
+        put3(0x8d3446);                // lea esi, [esi+eax*2] ; &w
+        put2a(0x8b87, off(p[cp[2]]));  // mov eax, [edi+&p[j]]
+        put2a(0x2b87, off(p[cp[3]]));  // sub eax, [edi+&p[k]] ; p[j]-p[k]
+        put3(0x0fafc1);                // imul eax, ecx  ; * err
+        put1a(0x05, 1<<12);            // add eax, 4096
+        put3(0xc1f80d);                // sar eax, 13
+        put3(0x0fb716);                // movzx edx, word [esi] ; w
+        put2(0x01d0);                  // add eax, edx
+        put1a(0xba, 0xffff);           // mov edx, 65535
+        put2(0x39d0);                  // cmp eax, edx
+        put3(0x0f4fc2);                // cmovg eax, edx
+        put2(0x31d2);                  // xor edx, edx
+        put2(0x39d0);                  // cmp eax, edx
+        put3(0x0f4cc2);                // cmovl eax, edx
+        put3(0x668906);                // mov word [esi], ax
+        break;
+
+      case MIX: // sizebits j m rate mask
+                // cm=wt[size][m], cxt=input
+        // int m=cp[3];
+        // assert(m>0 && m<=i);
+        // assert(cr.cm.size()==m*cr.c);
+        // assert(cr.cxt+m<=cr.cm.size());
+        // int err=(y*32767-squash(p[i]))*cp[4]>>4;
+        // int* wt=(int*)&cr.cm[cr.cxt];
+        // for (int j=0; j<m; ++j)
+        //   wt[j]=clamp512k(wt[j]+((err*p[cp[2]+j]+(1<<12))>>13));
+
+        // set ecx=err
+        put2a(0x8b87, off(p[i]));      // mov eax, [edi+&p[i]]
+        put1a(0x05, 2048);             // add eax, 2048
+        put4a(0x0fb78447, off(squasht));//movzx eax, word [edi+eax*2+&squasht]
+        put2(0x89e9);                  // mov ecx, ebp ; y
+        put3(0xc1e10f);                // shl ecx, 15
+        put2(0x29e9);                  // sub ecx, ebp ; y*32767
+        put2(0x29c1);                  // sub ecx, eax
+        put2a(0x69c9, cp[4]);          // imul ecx, rate
+        put3(0xc1f904);                // sar ecx, 4  ; err
+
+        // set esi=wt
+        put2a(0x8b87, offc(cxt));      // mov eax, [edi+&cxt] ; cxt
+        if (S==8) put1(0x48);          // rex.w
+        put2a(0x8bb7, offc(cm));       // mov esi, [edi+&cm]
+        if (S==8) put1(0x48);          // rex.w
+        put3(0x8d3486);                // lea esi, [esi+eax*4] ; wt
+
+        for (int k=0; k<cp[3]; ++k) {
+          put2a(0x8b87,off(p[cp[2]+k]));//mov eax, [edi+&p[cp[2]+k]
+          put3(0x0fafc1);              // imul eax, ecx
+          put1a(0x05, 1<<12);          // add eax, 1<<12
+          put3(0xc1f80d);              // sar eax, 13
+          put2(0x0306);                // add eax, [esi]
+          put1a(0xba, (1<<19)-1);      // mov edx, (1<<19)-1
+          put2(0x39d0);                // cmp eax, edx
+          put3(0x0f4fc2);              // cmovg eax, edx
+          put2(0xf7d2);                // not edx
+          put2(0x39d0);                // cmp eax, edx
+          put3(0x0f4cc2);              // cmovl eax, edx
+          put2(0x8906);                // mov [esi], eax
+          if (k<cp[3]-1) {
+            if (S==8) put1(0x48);      // rex.w
+            put3(0x83c604);            // add esi, 4
+          }
+        }
+        break;
+
+      default:
+        error("invalid ZPAQ component");
+    }
+  }
+
+  // return from update()
+  put1(0x5f);                 // pop edi
+  put1(0x5e);                 // pop esi
+  put1(0x5d);                 // pop ebp
+  put1(0x5b);                 // pop ebx
+  put1(0xc3);                 // ret
+
+  return o;
+}
+
+#endif // ifndef NOJIT
+
+// Return a prediction of the next bit in range 0..32767.
+// Use JIT code starting at pcode[0] if available, or else create it first.
+int Predictor::predict() {
+#ifdef NOJIT
+  return predict0();  // built with NOJIT: delegate to predict0()
+#else
+  if (!pcode) {  // no JIT code yet, so build it on this call
+    int n=assemble_p();  // NOTE(review): presumably a sizing pass - confirm
+    allocx(pcode, pcode_size, n);  // expected to set pcode and pcode_size
+    if (!pcode || n!=assemble_p() || n<10 || pcode_size<10)  // 2nd pass must return n
+      error("predictor JIT failed");  // no buffer, size mismatch, or size < 10
+  }
+  assert(pcode && pcode[0]);
+  return ((int(*)(Predictor*))&pcode[0])(this);  // call pcode as int f(Predictor*)
+#endif
+}
+
+// Update the model with bit y = 0..1.
+// Use the JIT code starting at pcode[5]; pcode is not created here.
+void Predictor::update(int y) {
+#ifdef NOJIT
+  update0(y);  // built with NOJIT: delegate to update0()
+#else
+  assert(pcode && pcode[5]);  // only asserted: pcode must already exist
+  ((void(*)(Predictor*, int))&pcode[5])(this, y);  // call as void f(Predictor*, int)
+
+  // Save bit y in c8, hmap4 (not implemented in JIT)
+  c8+=c8+y;  // c8=c8*2+y
+  if (c8>=256) {  // bit 8 reached: c8-256 is passed to z as the finished byte
+    z.run(c8-256);
+    hmap4=1;
+    c8=1;
+    for (int i=0; i<z.header[6]; ++i) h[i]=z.H(i);  // reload h[] from z.H()
+  }
+  else if (c8>=16 && c8<32)  // c8 is 1xxxx: 4 bits added since c8 was set to 1
+    hmap4=(hmap4&0xf)<<5|y<<4|1;  // low nibble to bits 5..8, y to bit 4, bit 0 set
+  else
+    hmap4=(hmap4&0x1f0)|(((hmap4&0xf)*2+y)&0xf);  // shift y into the low nibble
+#endif
+}
+
+// Execute the ZPAQL code with input byte or -1 for EOF.
+// Use JIT code at rcode if available, or else create it first.
+void ZPAQL::run(U32 input) {
+#ifdef NOJIT
+  run0(input);  // built with NOJIT: delegate to run0()
+#else
+  if (!rcode) {  // no JIT code yet, so build it on this call
+    int n=assemble();  // NOTE(review): presumably a sizing pass - confirm
+    allocx(rcode, rcode_size, n);  // expected to set rcode and rcode_size
+    if (!rcode || n<10 || rcode_size<10 || n!=assemble())  // 2nd pass must return n
+      error("run JIT failed");
+  }
+  a=input;  // input is stored in a before the JIT code is entered
+  if (!((int(*)())(&rcode[0]))())  // call rcode as int f(); 0 is reported as an error
+    libzpaq::error("Bad ZPAQL opcode");
+#endif
+}
+
+}  // end namespace libzpaq
diff --git a/libzpaq.h b/libzpaq.h
index e7879b4..be67318 100644
--- a/libzpaq.h
+++ b/libzpaq.h
@@ -1,441 +1,441 @@
-/* libzpaq.h - LIBZPAQ Version 5.00.
-
-  Copyright (C) 2011, Dell Inc. Written by Matt Mahoney.
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so without restriction.
-  This Software is provided "as is" without warranty.
-
-LIBZPAQ is a C++ library for compression and decompression of data
-conforming to the ZPAQ level 2 standard. See http://mattmahoney.net/zpaq/
-
-By default, LIBZPAQ uses JIT (just in time) acceleration. This only
-works on x86-32 and x86-64 processors that support the SSE2 instruction
-set. To disable JIT, compile with -DNOJIT. To enable run time checks,
-compile with -DDEBUG. Both options will decrease speed.
-
-The decompression code, when compiled with -DDEBUG and -DNOJIT,
-comprises the reference decoder for the ZPAQ level 2 standard.
-*/
-
-#ifndef LIBZPAQ_H
-#define LIBZPAQ_H
-
-#ifndef DEBUG
-#define NDEBUG 1
-#endif
-#include <assert.h>
-#include <stddef.h>
-#include <stdint.h>
-
-namespace libzpaq {
-
-// 1, 2, 4, 8 byte unsigned integers
-typedef uint8_t U8;
-typedef uint16_t U16;
-typedef uint32_t U32;
-typedef uint64_t U64;
-
-// Standard library prototypes redirected to libzpaq.cpp
-void* calloc(size_t, size_t);
-void free(void*);
-
-// Callback for error handling
-extern void error(const char* msg);
-
-// Virtual base classes for input and output
-// get() and put() must be overridden to read or write 1 byte.
-// read() and write() may be overridden to read or write n bytes more
-// efficiently than calling get() or put() n times.
-class Reader {
-public:
-  virtual int get() = 0;  // should return 0..255, or -1 at EOF
-  virtual int read(char* buf, int n); // read to buf[n], return no. read
-  virtual ~Reader() {}
-};
-
-class Writer {
-public:
-  virtual void put(int c) = 0;  // should output low 8 bits of c
-  virtual void write(const char* buf, int n);  // write buf[n]
-  virtual ~Writer() {}
-};
-
-// Read 16 bit little-endian number
-int toU16(const char* p);
-
-// An Array of T is cleared and aligned on a 64 byte address
-//   with no constructors called. No copy or assignment.
-// Array<T> a(n, ex=0);  - creates n<<ex elements of type T
-// a[i] - index
-// a(i) - index mod n, n must be a power of 2
-// a.size() - gets n
-template <typename T>
-class Array {
-  T *data;     // user location of [0] on a 64 byte boundary
-  size_t n;    // user size
-  int offset;  // distance back in bytes to start of actual allocation
-  void operator=(const Array&);  // no assignment
-  Array(const Array&);  // no copy
-public:
-  Array(size_t sz=0, int ex=0): data(0), n(0), offset(0) {
-    resize(sz, ex);} // [0..sz-1] = 0
-  void resize(size_t sz, int ex=0); // change size, erase content to zeros
-  ~Array() {resize(0);}  // free memory
-  size_t size() const {return n;}  // get size
-  int isize() const {return int(n);}  // get size as an int
-  T& operator[](size_t i) {assert(n>0 && i<n); return data[i];}
-  T& operator()(size_t i) {assert(n>0 && (n&(n-1))==0); return data[i&(n-1)];}
-};
-
-// Change size to sz<<ex elements of 0
-template<typename T>
-void Array<T>::resize(size_t sz, int ex) {
-  assert(size_t(-1)>0);  // unsigned type?
-  while (ex>0) {
-    if (sz>sz*2) error("Array too big");
-    sz*=2, --ex;
-  }
-  if (n>0) {
-    assert(offset>0 && offset<=64);
-    assert((char*)data-offset);
-    free((char*)data-offset);
-  }
-  n=0;
-  if (sz==0) return;
-  n=sz;
-  const size_t nb=128+n*sizeof(T);  // test for overflow
-  if (nb<=128 || (nb-128)/sizeof(T)!=n) error("Array too big");
-  data=(T*)calloc(nb, 1);
-  if (!data) error("Out of memory");
-  offset=64-(((char*)data-(char*)0)&63);
-  assert(offset>0 && offset<=64);
-  data=(T*)((char*)data+offset);
-}
-
-//////////////////////////// SHA1 ////////////////////////////
-
-// For computing SHA-1 checksums
-class SHA1 {
-public:
-  void put(int c) {  // hash 1 byte
-    U32& r=w[len0>>5&15];
-    r=(r<<8)|(c&255);
-    if (!(len0+=8)) ++len1;
-    if ((len0&511)==0) process();
-  }
-  double size() const {return len0/8+len1*536870912.0;} // size in bytes
-  uint64_t usize() const {return len0/8+(U64(len1)<<29);} // size in bytes
-  const char* result();  // get hash and reset
-  SHA1() {init();}
-private:
-  void init();      // reset, but don't clear hbuf
-  U32 len0, len1;   // length in bits (low, high)
-  U32 h[5];         // hash state
-  U32 w[80];        // input buffer
-  char hbuf[20];    // result
-  void process();   // hash 1 block
-};
-
-//////////////////////////// ZPAQL ///////////////////////////
-
-// Symbolic constants, instruction size, and names
-typedef enum {NONE,CONS,CM,ICM,MATCH,AVG,MIX2,MIX,ISSE,SSE} CompType;
-extern const int compsize[256];
-
-// A ZPAQL machine COMP+HCOMP or PCOMP.
-class ZPAQL {
-public:
-  ZPAQL();
-  ~ZPAQL();
-  void clear();           // Free memory, erase program, reset machine state
-  void inith();           // Initialize as HCOMP to run
-  void initp();           // Initialize as PCOMP to run
-  double memory();        // Return memory requirement in bytes
-  void run(U32 input);    // Execute with input
-  int read(Reader* in2);  // Read header
-  bool write(Writer* out2, bool pp); // If pp write PCOMP else HCOMP header
-  int step(U32 input, int mode);  // Trace execution (defined externally)
-
-  Writer* output;         // Destination for OUT instruction, or 0 to suppress
-  SHA1* sha1;             // Points to checksum computer
-  U32 H(int i) {return h(i);}  // get element of h
-
-  void flush();           // write outbuf[0..bufptr-1] to output and sha1
-  void outc(int c) {      // output byte c (0..255) or -1 at EOS
-    if (c<0 || (outbuf[bufptr]=c, ++bufptr==outbuf.isize())) flush();
-  }
-
-  // ZPAQ1 block header
-  Array<U8> header;   // hsize[2] hh hm ph pm n COMP (guard) HCOMP (guard)
-  int cend;           // COMP in header[7...cend-1]
-  int hbegin, hend;   // HCOMP/PCOMP in header[hbegin...hend-1]
-
-private:
-  // Machine state for executing HCOMP
-  Array<U8> m;        // memory array M for HCOMP
-  Array<U32> h;       // hash array H for HCOMP
-  Array<U32> r;       // 256 element register array
-  Array<char> outbuf; // output buffer
-  int bufptr;         // number of bytes in outbuf
-  U32 a, b, c, d;     // machine registers
-  int f;              // condition flag
-  int pc;             // program counter
-  int rcode_size;     // length of rcode
-  U8* rcode;          // JIT code for run()
-
-  // Support code
-  int assemble();  // put JIT code in rcode
-  void init(int hbits, int mbits);  // initialize H and M sizes
-  int execute();  // execute 1 instruction, return 0 after HALT, else 1
-  void run0(U32 input);  // default run() when select==0
-  void div(U32 x) {if (x) a/=x; else a=0;}
-  void mod(U32 x) {if (x) a%=x; else a=0;}
-  void swap(U32& x) {a^=x; x^=a; a^=x;}
-  void swap(U8& x)  {a^=x; x^=a; a^=x;}
-  void err();  // exit with run time error
-};
-
-///////////////////////// Component //////////////////////////
-
-// A Component is a context model, indirect context model, match model,
-// fixed weight mixer, adaptive 2 input mixer without or with current
-// partial byte as context, adaptive m input mixer (without or with),
-// or SSE (without or with).
-
-struct Component {
-  size_t limit;   // max count for cm
-  size_t cxt;     // saved context
-  size_t a, b, c; // multi-purpose variables
-  Array<U32> cm;  // cm[cxt] -> p in bits 31..10, n in 9..0; MATCH index
-  Array<U8> ht;   // ICM/ISSE hash table[0..size1][0..15] and MATCH buf
-  Array<U16> a16; // MIX weights
-  void init();    // initialize to all 0
-  Component() {init();}
-};
-
-////////////////////////// StateTable ////////////////////////
-
-// Next state table generator
-class StateTable {
-  enum {N=64}; // sizes of b, t
-  int num_states(int n0, int n1);  // compute t[n0][n1][1]
-  void discount(int& n0);  // set new value of n0 after 1 or n1 after 0
-  void next_state(int& n0, int& n1, int y);  // new (n0,n1) after bit y
-public:
-  U8 ns[1024]; // state*4 -> next state if 0, if 1, n0, n1
-  int next(int state, int y) {  // next state for bit y
-    assert(state>=0 && state<256);
-    assert(y>=0 && y<4);
-    return ns[state*4+y];
-  }
-  int cminit(int state) {  // initial probability of 1 * 2^23
-    assert(state>=0 && state<256);
-    return ((ns[state*4+3]*2+1)<<22)/(ns[state*4+2]+ns[state*4+3]+1);
-  }
-  StateTable();
-};
-
-///////////////////////// Predictor //////////////////////////
-
-// A predictor guesses the next bit
-class Predictor {
-public:
-  Predictor(ZPAQL&);
-  ~Predictor();
-  void init();          // build model
-  int predict();        // probability that next bit is a 1 (0..4095)
-  void update(int y);   // train on bit y (0..1)
-  int stat(int);        // Defined externally
-  bool isModeled() {    // n>0 components?
-    assert(z.header.isize()>6);
-    return z.header[6]!=0;
-  }
-private:
-
-  // Predictor state
-  int c8;               // last 0...7 bits.
-  int hmap4;            // c8 split into nibbles
-  int p[256];           // predictions
-  U32 h[256];           // unrolled copy of z.h
-  ZPAQL& z;             // VM to compute context hashes, includes H, n
-  Component comp[256];  // the model, includes P
-
-  // Modeling support functions
-  int predict0();       // default
-  void update0(int y);  // default
-  int dt2k[256];        // division table for match: dt2k[i] = 2^12/i
-  int dt[1024];         // division table for cm: dt[i] = 2^16/(i+1.5)
-  U16 squasht[4096];    // squash() lookup table
-  short stretcht[32768];// stretch() lookup table
-  StateTable st;        // next, cminit functions
-  U8* pcode;            // JIT code for predict() and update()
-  int pcode_size;       // length of pcode
-
-  // reduce prediction error in cr.cm
-  void train(Component& cr, int y) {
-    assert(y==0 || y==1);
-    U32& pn=cr.cm(cr.cxt);
-    U32 count=pn&0x3ff;
-    int error=y*32767-(cr.cm(cr.cxt)>>17);
-    pn+=(error*dt[count]&-1024)+(count<cr.limit);
-  }
-
-  // x -> floor(32768/(1+exp(-x/64)))
-  int squash(int x) {
-    assert(x>=-2048 && x<=2047);
-    return squasht[x+2048];
-  }
-
-  // x -> round(64*log((x+0.5)/(32767.5-x))), approx inverse of squash
-  int stretch(int x) {
-    assert(x>=0 && x<=32767);
-    return stretcht[x];
-  }
-
-  // bound x to a 12 bit signed int
-  int clamp2k(int x) {
-    if (x<-2048) return -2048;
-    else if (x>2047) return 2047;
-    else return x;
-  }
-
-  // bound x to a 20 bit signed int
-  int clamp512k(int x) {
-    if (x<-(1<<19)) return -(1<<19);
-    else if (x>=(1<<19)) return (1<<19)-1;
-    else return x;
-  }
-
-  // Get cxt in ht, creating a new row if needed
-  size_t find(Array<U8>& ht, int sizebits, U32 cxt);
-
-  // Put JIT code in pcode
-  int assemble_p();
-};
-
-//////////////////////////// Decoder /////////////////////////
-
-// Decoder decompresses using an arithmetic code
-class Decoder {
-public:
-  Reader* in;        // destination
-  Decoder(ZPAQL& z);
-  int decompress();  // return a byte or EOF
-  int skip();        // skip to the end of the segment, return next byte
-  void init();       // initialize at start of block
-  int stat(int x) {return pr.stat(x);}
-private:
-  U32 low, high;     // range
-  U32 curr;          // last 4 bytes of archive
-  Predictor pr;      // to get p
-  enum {BUFSIZE=1<<16};
-  Array<char> buf;   // input buffer of size BUFSIZE bytes
-    // of unmodeled data. buf[low..high-1] is input with curr
-    // remaining in sub-block.
-  int decode(int p); // return decoded bit (0..1) with prob. p (0..65535)
-  void loadbuf();    // read unmodeled data into buf to EOS
-};
-
-/////////////////////////// PostProcessor ////////////////////
-
-class PostProcessor {
-  int state;   // input parse state: 0=INIT, 1=PASS, 2..4=loading, 5=POST
-  int hsize;   // header size
-  int ph, pm;  // sizes of H and M in z
-public:
-  ZPAQL z;     // holds PCOMP
-  PostProcessor(): state(0), hsize(0), ph(0), pm(0) {}
-  void init(int h, int m);  // ph, pm sizes of H and M
-  int write(int c);  // Input a byte, return state
-  int getState() const {return state;}
-  void setOutput(Writer* out) {z.output=out;}
-  void setSHA1(SHA1* sha1ptr) {z.sha1=sha1ptr;}
-};
-
-//////////////////////// Decompresser ////////////////////////
-
-// For decompression and listing archive contents
-class Decompresser {
-public:
-  Decompresser(): z(), dec(z), pp(), state(BLOCK), decode_state(FIRSTSEG) {}
-  void setInput(Reader* in) {dec.in=in;}
-  bool findBlock(double* memptr = 0);
-  void hcomp(Writer* out2) {z.write(out2, false);}
-  bool findFilename(Writer* = 0);
-  void readComment(Writer* = 0);
-  void setOutput(Writer* out) {pp.setOutput(out);}
-  void setSHA1(SHA1* sha1ptr) {pp.setSHA1(sha1ptr);}
-  bool decompress(int n = -1);  // n bytes, -1=all, return true until done
-  bool pcomp(Writer* out2) {return pp.z.write(out2, true);}
-  void readSegmentEnd(char* sha1string = 0);
-  int stat(int x) {return dec.stat(x);}
-private:
-  ZPAQL z;
-  Decoder dec;
-  PostProcessor pp;
-  enum {BLOCK, FILENAME, COMMENT, DATA, SEGEND} state;  // expected next
-  enum {FIRSTSEG, SEG, SKIP} decode_state;  // which segment in block?
-};
-
-/////////////////////////// decompress() /////////////////////
-
-void decompress(Reader* in, Writer* out);
-
-//////////////////////////////////////////////////////////////
-//////////////////////////////////////////////////////////////
-
-// Code following this point is not a part of the ZPAQ level 2 standard.
-
-//////////////////////////// Encoder /////////////////////////
-
-// Encoder compresses using an arithmetic code
-class Encoder {
-public:
-  Encoder(ZPAQL& z, int size=0):
-    out(0), low(1), high(0xFFFFFFFF), pr(z) {}
-  void init();
-  void compress(int c);  // c is 0..255 or EOF
-  int stat(int x) {return pr.stat(x);}
-  Writer* out;  // destination
-private:
-  U32 low, high; // range
-  Predictor pr;  // to get p
-  Array<char> buf; // unmodeled input
-  void encode(int y, int p); // encode bit y (0..1) with prob. p (0..65535)
-};
-
-//////////////////////// Compressor //////////////////////////
-
-class Compressor {
-public:
-  Compressor(): enc(z), in(0), state(INIT) {}
-  void setOutput(Writer* out) {enc.out=out;}
-  void writeTag();
-  void startBlock(int level);  // level=1,2,3
-  void startBlock(const char* hcomp);
-  void startSegment(const char* filename = 0, const char* comment = 0);
-  void setInput(Reader* i) {in=i;}
-  void postProcess(const char* pcomp = 0, int len = 0);
-  bool compress(int n = -1);  // n bytes, -1=all, return true until done
-  void endSegment(const char* sha1string = 0);
-  void endBlock();
-  int stat(int x) {return enc.stat(x);}
-private:
-  ZPAQL z;
-  Encoder enc;
-  Reader* in;
-  enum {INIT, BLOCK1, SEG1, BLOCK2, SEG2} state;
-};
-
-/////////////////////////// compress() ///////////////////////
-
-void compress(Reader* in, Writer* out, int level);
-
-}  // namespace libzpaq
-
-#endif  // LIBZPAQ_H
+/* libzpaq.h - LIBZPAQ Version 5.00.
+
+  Copyright (C) 2011, Dell Inc. Written by Matt Mahoney.
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so without restriction.
+  This Software is provided "as is" without warranty.
+
+LIBZPAQ is a C++ library for compression and decompression of data
+conforming to the ZPAQ level 2 standard. See http://mattmahoney.net/zpaq/
+
+By default, LIBZPAQ uses JIT (just in time) acceleration. This only
+works on x86-32 and x86-64 processors that support the SSE2 instruction
+set. To disable JIT, compile with -DNOJIT. To enable run time checks,
+compile with -DDEBUG. Both options will decrease speed.
+
+The decompression code, when compiled with -DDEBUG and -DNOJIT,
+comprises the reference decoder for the ZPAQ level 2 standard.
+*/
+
+#ifndef LIBZPAQ_H
+#define LIBZPAQ_H
+
+#ifndef DEBUG
+#define NDEBUG 1
+#endif
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+
+namespace libzpaq {
+
+// 1, 2, 4, 8 byte unsigned integers (aliases for the <stdint.h> types)
+typedef uint8_t U8;
+typedef uint16_t U16;
+typedef uint32_t U32;
+typedef uint64_t U64;
+
+// Standard library prototypes redirected to libzpaq.cpp
+void* calloc(size_t, size_t);  // declared in namespace libzpaq, so these are
+void free(void*);              // libzpaq::calloc and libzpaq::free
+
+// Callback for error handling; msg describes the failure
+extern void error(const char* msg);
+
+// Virtual base classes for input (Reader) and output (Writer).
+// get() and put() must be overridden to read or write 1 byte.
+// read() and write() may be overridden to read or write n bytes more
+// efficiently than calling get() or put() n times.
+class Reader {
+public:
+  virtual int get() = 0;  // pure virtual; should return 0..255, or -1 at EOF
+  virtual int read(char* buf, int n); // read into buf[0..n-1], return no. read
+  virtual ~Reader() {}  // virtual: safe to destroy through a Reader*
+};
+
+class Writer {
+public:
+  virtual void put(int c) = 0;  // pure virtual; should output low 8 bits of c
+  virtual void write(const char* buf, int n);  // write buf[0..n-1]
+  virtual ~Writer() {}  // virtual: safe to destroy through a Writer*
+};
+
+// Read a 16 bit little-endian number from p (declared here, defined elsewhere)
+int toU16(const char* p);
+
+// An Array of T is cleared and aligned on a 64 byte address
+//   with no constructors called. No copy or assignment.
+// Array<T> a(n, ex=0);  - creates n<<ex elements of type T
+// a[i] - index (bounds are checked by assert only)
+// a(i) - index mod n, n must be a power of 2 (checked by assert only)
+// a.size() - gets n
+template <typename T>
+class Array {
+  T *data;     // user location of [0] on a 64 byte boundary
+  size_t n;    // user size in elements; resize() frees only when n>0
+  int offset;  // distance back in bytes to start of actual allocation
+  void operator=(const Array&);  // no assignment (private)
+  Array(const Array&);  // no copy (private)
+public:
+  Array(size_t sz=0, int ex=0): data(0), n(0), offset(0) {
+    resize(sz, ex);} // start empty, then resize to sz<<ex zeroed elements
+  void resize(size_t sz, int ex=0); // change size, erase content to zeros
+  ~Array() {resize(0);}  // free memory
+  size_t size() const {return n;}  // get size
+  int isize() const {return int(n);}  // get size as an int (n converted to int)
+  T& operator[](size_t i) {assert(n>0 && i<n); return data[i];}
+  T& operator()(size_t i) {assert(n>0 && (n&(n-1))==0); return data[i&(n-1)];}
+};
+
+// Change size to sz<<ex elements of 0. Old contents are freed, not copied.
+template<typename T>
+void Array<T>::resize(size_t sz, int ex) {
+  assert(size_t(-1)>0);  // unsigned type?
+  while (ex>0) {  // double sz ex times so that wraparound can be detected
+    if (sz>sz*2) error("Array too big");
+    sz*=2, --ex;
+  }
+  if (n>0) {  // release the old block; data-offset is what calloc returned
+    assert(offset>0 && offset<=64);
+    assert((char*)data-offset);
+    free((char*)data-offset);
+  }
+  n=0, data=0, offset=0;  // empty state: no stale pointer if error() throws
+  if (sz==0) return;
+  const size_t nb=128+sz*sizeof(T);  // 128 spare bytes; test for overflow
+  if (nb<=128 || (nb-128)/sizeof(T)!=sz) error("Array too big");
+  char* const raw=(char*)calloc(nb, 1);  // zero filled; n is still 0 here
+  if (!raw) error("Out of memory");
+  offset=64-int(uintptr_t(raw)&63);  // 1..64 bytes up to a 64 byte boundary
+  assert(offset>0 && offset<=64);
+  data=(T*)(raw+offset);
+  n=sz;  // set the size last, once data is valid, so ~Array() never frees twice
+}
+
+//////////////////////////// SHA1 ////////////////////////////
+
+// For computing SHA-1 checksums one byte at a time
+class SHA1 {
+public:
+  void put(int c) {  // hash 1 byte (the low 8 bits of c)
+    U32& r=w[len0>>5&15];  // word of w[0..15] that the current bit count falls in
+    r=(r<<8)|(c&255);  // shift the new byte in at the low end
+    if (!(len0+=8)) ++len1;  // 8 more bits; carry into len1 when len0 becomes 0
+    if ((len0&511)==0) process();  // a multiple of 512 bits is buffered: hash it
+  }
+  double size() const {return len0/8+len1*536870912.0;} // size in bytes (536870912 = 2^29)
+  uint64_t usize() const {return len0/8+(U64(len1)<<29);} // size in bytes, as an integer
+  const char* result();  // get hash and reset
+  SHA1() {init();}
+private:
+  void init();      // reset, but don't clear hbuf
+  U32 len0, len1;   // length in bits (low, high)
+  U32 h[5];         // hash state
+  U32 w[80];        // input buffer
+  char hbuf[20];    // result
+  void process();   // hash 1 block
+};
+
+//////////////////////////// ZPAQL ///////////////////////////
+
+// Symbolic constants (NONE=0 .. SSE=9), instruction size, and names
+typedef enum {NONE,CONS,CM,ICM,MATCH,AVG,MIX2,MIX,ISSE,SSE} CompType;
+extern const int compsize[256];  // presumably the size for each type code; defined elsewhere
+
+// A ZPAQL machine COMP+HCOMP or PCOMP: holds the block header and the VM state.
+class ZPAQL {
+public:
+  ZPAQL();
+  ~ZPAQL();
+  void clear();           // Free memory, erase program, reset machine state
+  void inith();           // Initialize as HCOMP to run
+  void initp();           // Initialize as PCOMP to run
+  double memory();        // Return memory requirement in bytes
+  void run(U32 input);    // Execute with input
+  int read(Reader* in2);  // Read header
+  bool write(Writer* out2, bool pp); // If pp write PCOMP else HCOMP header
+  int step(U32 input, int mode);  // Trace execution (defined externally)
+
+  Writer* output;         // Destination for OUT instruction, or 0 to suppress
+  SHA1* sha1;             // Points to checksum computer
+  U32 H(int i) {return h(i);}  // get element of h (index is taken mod h.size())
+
+  void flush();           // write outbuf[0..bufptr-1] to output and sha1
+  void outc(int c) {      // output byte c (0..255) or -1 at EOS
+    if (c<0 || (outbuf[bufptr]=c, ++bufptr==outbuf.isize())) flush();  // EOS or buffer full
+  }
+
+  // ZPAQ1 block header
+  Array<U8> header;   // hsize[2] hh hm ph pm n COMP (guard) HCOMP (guard)
+  int cend;           // COMP in header[7...cend-1]
+  int hbegin, hend;   // HCOMP/PCOMP in header[hbegin...hend-1]
+
+private:
+  // Machine state for executing HCOMP
+  Array<U8> m;        // memory array M for HCOMP
+  Array<U32> h;       // hash array H for HCOMP
+  Array<U32> r;       // 256 element register array
+  Array<char> outbuf; // output buffer
+  int bufptr;         // number of bytes in outbuf
+  U32 a, b, c, d;     // machine registers
+  int f;              // condition flag
+  int pc;             // program counter
+  int rcode_size;     // length of rcode
+  U8* rcode;          // JIT code for run()
+
+  // Support code
+  int assemble();  // put JIT code in rcode
+  void init(int hbits, int mbits);  // initialize H and M sizes
+  int execute();  // execute 1 instruction, return 0 after HALT, else 1
+  void run0(U32 input);  // default run() when select==0
+  void div(U32 x) {if (x) a/=x; else a=0;}  // a/=x, but a=0 when x is 0
+  void mod(U32 x) {if (x) a%=x; else a=0;}  // a%=x, but a=0 when x is 0
+  void swap(U32& x) {a^=x; x^=a; a^=x;}  // exchange a and x by XOR
+  void swap(U8& x)  {a^=x; x^=a; a^=x;}  // same; x is only 8 bits wide
+  void err();  // exit with run time error
+};
+
+///////////////////////// Component //////////////////////////
+
+// A Component is a context model, indirect context model, match model,
+// fixed weight mixer, adaptive 2 input mixer without or with current
+// partial byte as context, adaptive m input mixer (without or with),
+// or SSE (without or with). One struct holds the state for any of these.
+
+struct Component {
+  size_t limit;   // max count for cm
+  size_t cxt;     // saved context
+  size_t a, b, c; // multi-purpose variables
+  Array<U32> cm;  // cm[cxt] -> p in bits 31..10, n in 9..0; MATCH index
+  Array<U8> ht;   // ICM/ISSE hash table[0..size1][0..15] and MATCH buf
+  Array<U16> a16; // MIX weights
+  void init();    // initialize to all 0
+  Component() {init();}  // a new Component starts out initialized
+};
+
+////////////////////////// StateTable ////////////////////////
+
+// Next state table generator; the constructor is declared here, defined elsewhere
+class StateTable {
+  enum {N=64}; // sizes of b, t
+  int num_states(int n0, int n1);  // compute t[n0][n1][1]
+  void discount(int& n0);  // set new value of n0 after 1 or n1 after 0
+  void next_state(int& n0, int& n1, int y);  // new (n0,n1) after bit y
+public:
+  U8 ns[1024]; // state*4 -> next state if 0, if 1, n0, n1
+  int next(int state, int y) {  // next state for bit y
+    assert(state>=0 && state<256);
+    assert(y>=0 && y<4);  // y=2 or 3 reads the n0 or n1 column instead
+    return ns[state*4+y];
+  }
+  int cminit(int state) {  // initial probability of 1 * 2^23
+    assert(state>=0 && state<256);
+    return ((ns[state*4+3]*2+1)<<22)/(ns[state*4+2]+ns[state*4+3]+1);  // ((2*n1+1)<<22)/(n0+n1+1)
+  }
+  StateTable();
+};
+
+///////////////////////// Predictor //////////////////////////
+
+// A predictor guesses the next bit using the components described by z's header
+class Predictor {
+public:
+  Predictor(ZPAQL&);
+  ~Predictor();
+  void init();          // build model
+  int predict();        // probability that next bit is a 1 (0..4095)
+  void update(int y);   // train on bit y (0..1)
+  int stat(int);        // Defined externally
+  bool isModeled() {    // n>0 components?
+    assert(z.header.isize()>6);
+    return z.header[6]!=0;  // header[6] is n, the component count
+  }
+private:
+
+  // Predictor state
+  int c8;               // last 0...7 bits.
+  int hmap4;            // c8 split into nibbles
+  int p[256];           // predictions
+  U32 h[256];           // unrolled copy of z.h
+  ZPAQL& z;             // VM to compute context hashes, includes H, n
+  Component comp[256];  // the model, includes P
+
+  // Modeling support functions
+  int predict0();       // default
+  void update0(int y);  // default
+  int dt2k[256];        // division table for match: dt2k[i] = 2^12/i
+  int dt[1024];         // division table for cm: dt[i] = 2^16/(i+1.5)
+  U16 squasht[4096];    // squash() lookup table
+  short stretcht[32768];// stretch() lookup table
+  StateTable st;        // next, cminit functions
+  U8* pcode;            // JIT code for predict() and update()
+  int pcode_size;       // length of pcode
+
+  // reduce prediction error in cr.cm at the saved context cr.cxt
+  void train(Component& cr, int y) {
+    assert(y==0 || y==1);
+    U32& pn=cr.cm(cr.cxt);  // packed entry: p in bits 31..10, count in 9..0
+    U32 count=pn&0x3ff;  // low 10 bits
+    int error=y*32767-(cr.cm(cr.cxt)>>17);  // target minus top 15 bits of the entry
+    pn+=(error*dt[count]&-1024)+(count<cr.limit);  // &-1024 spares the count; count grows up to limit
+  }
+
+  // x -> floor(32768/(1+exp(-x/64))), by table lookup
+  int squash(int x) {
+    assert(x>=-2048 && x<=2047);
+    return squasht[x+2048];  // +2048 maps x to a table index 0..4095
+  }
+
+  // x -> round(64*log((x+0.5)/(32767.5-x))), approx inverse of squash
+  int stretch(int x) {
+    assert(x>=0 && x<=32767);
+    return stretcht[x];
+  }
+
+  // bound x to a 12 bit signed int (-2048..2047)
+  int clamp2k(int x) {
+    if (x<-2048) return -2048;
+    else if (x>2047) return 2047;
+    else return x;
+  }
+
+  // bound x to a 20 bit signed int (-2^19..2^19-1)
+  int clamp512k(int x) {
+    if (x<-(1<<19)) return -(1<<19);
+    else if (x>=(1<<19)) return (1<<19)-1;
+    else return x;
+  }
+
+  // Get cxt in ht, creating a new row if needed
+  size_t find(Array<U8>& ht, int sizebits, U32 cxt);
+
+  // Put JIT code in pcode
+  int assemble_p();
+};
+
+//////////////////////////// Decoder /////////////////////////
+
+// Decoder decompresses using an arithmetic code, reading its input from in
+class Decoder {
+public:
+  Reader* in;        // source of input (assigned by Decompresser::setInput)
+  Decoder(ZPAQL& z);
+  int decompress();  // return a byte or EOF
+  int skip();        // skip to the end of the segment, return next byte
+  void init();       // initialize at start of block
+  int stat(int x) {return pr.stat(x);}  // forwards to the predictor's stat()
+private:
+  U32 low, high;     // range
+  U32 curr;          // last 4 bytes of archive
+  Predictor pr;      // to get p
+  enum {BUFSIZE=1<<16};
+  Array<char> buf;   // input buffer of size BUFSIZE bytes
+    // of unmodeled data. buf[low..high-1] is input with curr
+    // remaining in sub-block.
+  int decode(int p); // return decoded bit (0..1) with prob. p (0..65535)
+  void loadbuf();    // read unmodeled data into buf to EOS
+};
+
+/////////////////////////// PostProcessor ////////////////////
+
+class PostProcessor {  // fed bytes through write(); owns the ZPAQL z that holds PCOMP
+  int state;   // input parse state: 0=INIT, 1=PASS, 2..4=loading, 5=POST
+  int hsize;   // header size
+  int ph, pm;  // sizes of H and M in z
+public:
+  ZPAQL z;     // holds PCOMP
+  PostProcessor(): state(0), hsize(0), ph(0), pm(0) {}  // starts in state 0 (INIT)
+  void init(int h, int m);  // ph, pm sizes of H and M
+  int write(int c);  // Input a byte, return state
+  int getState() const {return state;}  // current parse state, see above
+  void setOutput(Writer* out) {z.output=out;}  // where z's OUT instruction writes
+  void setSHA1(SHA1* sha1ptr) {z.sha1=sha1ptr;}  // checksum computer used by z
+};
+
+//////////////////////// Decompresser ////////////////////////
+
+// For decompression and listing archive contents
+class Decompresser {
+public:
+  Decompresser(): z(), dec(z), pp(), state(BLOCK), decode_state(FIRSTSEG) {}
+  void setInput(Reader* in) {dec.in=in;}  // the decoder reads from in
+  bool findBlock(double* memptr = 0);
+  void hcomp(Writer* out2) {z.write(out2, false);}  // write the HCOMP header to out2
+  bool findFilename(Writer* = 0);
+  void readComment(Writer* = 0);
+  void setOutput(Writer* out) {pp.setOutput(out);}  // output goes through the post-processor
+  void setSHA1(SHA1* sha1ptr) {pp.setSHA1(sha1ptr);}  // checksum computer for the output
+  bool decompress(int n = -1);  // n bytes, -1=all, return true until done
+  bool pcomp(Writer* out2) {return pp.z.write(out2, true);}  // write the PCOMP header to out2
+  void readSegmentEnd(char* sha1string = 0);
+  int stat(int x) {return dec.stat(x);}  // forwards to the decoder's stat()
+private:
+  ZPAQL z;  // declared before dec, which is constructed from it
+  Decoder dec;
+  PostProcessor pp;
+  enum {BLOCK, FILENAME, COMMENT, DATA, SEGEND} state;  // expected next
+  enum {FIRSTSEG, SEG, SKIP} decode_state;  // which segment in block?
+};
+
+/////////////////////////// decompress() /////////////////////
+
+void decompress(Reader* in, Writer* out);  // presumably decompresses all of in to out; defined elsewhere
+
+//////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////
+
+// Code following this point is not a part of the ZPAQ level 2 standard.
+
+//////////////////////////// Encoder /////////////////////////
+
+// Encoder compresses using an arithmetic code, writing to out
+class Encoder {
+public:
+  Encoder(ZPAQL& z, int size=0):  // NOTE: size is not used by this constructor
+    out(0), low(1), high(0xFFFFFFFF), pr(z) {}
+  void init();
+  void compress(int c);  // c is 0..255 or EOF
+  int stat(int x) {return pr.stat(x);}  // forwards to the predictor's stat()
+  Writer* out;  // destination
+private:
+  U32 low, high; // range
+  Predictor pr;  // to get p
+  Array<char> buf; // unmodeled input
+  void encode(int y, int p); // encode bit y (0..1) with prob. p (0..65535)
+};
+
+//////////////////////// Compressor //////////////////////////
+
+class Compressor {  // compression front end: owns a ZPAQL and an Encoder built on it
+public:
+  Compressor(): enc(z), in(0), state(INIT) {}
+  void setOutput(Writer* out) {enc.out=out;}  // the encoder writes to out
+  void writeTag();
+  void startBlock(int level);  // level=1,2,3
+  void startBlock(const char* hcomp);
+  void startSegment(const char* filename = 0, const char* comment = 0);
+  void setInput(Reader* i) {in=i;}  // only stores the pointer
+  void postProcess(const char* pcomp = 0, int len = 0);
+  bool compress(int n = -1);  // n bytes, -1=all, return true until done
+  void endSegment(const char* sha1string = 0);
+  void endBlock();
+  int stat(int x) {return enc.stat(x);}  // forwards to the encoder's stat()
+private:
+  ZPAQL z;  // declared before enc, which is constructed from it
+  Encoder enc;
+  Reader* in;
+  enum {INIT, BLOCK1, SEG1, BLOCK2, SEG2} state;
+};
+
+/////////////////////////// compress() ///////////////////////
+
+void compress(Reader* in, Writer* out, int level);  // presumably compresses all of in to out; defined elsewhere
+
+}  // namespace libzpaq
+
+#endif  // LIBZPAQ_H
diff --git a/readme.txt b/readme.txt
index 42f2d87..2bdf98e 100644
--- a/readme.txt
+++ b/readme.txt
@@ -1,84 +1,84 @@
-fastqz15.cpp is the source code for the latest version
-of the FASTQ compressor. It compresses the common Sanger
-variant. FASTQ is output by DNA sequencing machines.
-
-fapack.cpp is a program to pack FASTA files into a format
-suitable for input to fastqz as a reference genome for
-better compression.
-It packs 4 bases per byte and discards all but A,C,G,T.
-
-fapacks.cpp works the same except that it does not ignore
-lowercase a,c,g,t. Lowercase is used in hg19 to indicate
-repeats. Generally it produces a larger reference but
-gives better compression.
-
-Other fastqz*.cpp are older versions. You don't need them.
-
-Usage: fastqz {c[Q]|d|e[Q]|f} input output [reference]
-
-Command c compresses input to output.fx?.zpaq (3 or 4 files)
-Command d decompresses input.fx?.zpaq to output
-Command e encodes input to output.fx?
-Command f decodes input.fx? to output
-
-Commands c and d are slow, require 1.5 GB memory, use 3 or
-4 cores, but get very good compression. Commands e and f are much
-faster, use little memory, and only one thread, but compression
-ratio is not as good.
-
-Commands cQ or eQ quantize the quality scores for lossy but
-better compression. The default is c1 or e1, which is lossless.
-Quality scores in the range 33..73 are rounded down to 35 plus
-a multiple of Q.
-
-You can supply a reference genome to improve compression.
-If you use this, the same reference is needed to decompress.
-It also increases the memory requirement to 1.2 GB for the
-e command and 0.5 GB for the f command. c and d still need
-1.5 GB.
-
-You can prepare the reference genome from FASTA files like:
-
-  fapacks hg19s *.fa
-
-to produce the file hg19s. Then compress:
-
-  fastqz c in.fastq arc hg19s
-
-To decompress:
-
-  fastqz d arc out.fastq hg19s
-
-There are 4 compressed files:
-
-  arc.fxh.zpaq - compressed headers
-  arc.fxb.zpaq - compressed base calls
-  arc.fxq.zpaq - compressed quality scores
-  arc.fxa.zpaq - compressed alignments if a reference is used.
-
-Commands e and f work the same way except the compressed
-files do not have a .zpaq extension. If no reference is
-used, then no .fxa or .fxa.zpaq file is produced or expected.
-
-fastqz only works on the Sanger FASTQ variant. It assumes
-that quality scores are Phred+33 (range ASCII 33 to 73).
-Base calls must be A,C,G,T,N only. N must have a quality
-score of 0, and all others 1 or higher. Maximum line length
-is 4095. Lines must be terminated by linefeeds only (no
-carriage returns). If a reference is used, it must be
-smaller than 1 GB packed (4 billion bases).
-
-To compile fastqz you will need the latest version of
-libzpaq from https://sourceforge.net/projects/zpaq/
-or http://mattmahoney.net/zpaq/
-These programs will work in either Windows or Linux.
-In Windows, you will also need Pthreads-Win32 from
-http://sourceware.org/pthreads-win32/ to compile or run.
-To compile (no Makefile, sorry):
-
-  g++ -O3 -msse2 -s -lpthread fastqz.cpp libzpaq.cpp -o fastqz
-  g++ -O3 -s fapack.cpp -o fapack
-
-fastqz* and fapack* are written by Matt Mahoney, Dell Inc.
-All are BSD-2 licensed. But note that libzpaq
-is public domain and Pthreads-Win32 is LGPL.
+fastqz15.cpp is the source code for the latest version
+of the FASTQ compressor. It compresses the common Sanger
+variant. FASTQ is output by DNA sequencing machines.
+
+fapack.cpp is a program to pack FASTA files into a format
+suitable for input to fastqz as a reference genome for
+better compression.
+It packs 4 bases per byte and discards all but A,C,G,T.
+
+fapacks.cpp works the same except that it does not ignore
+lowercase a,c,g,t. Lowercase is used in hg19 to indicate
+repeats. Generally it produces a larger reference but
+gives better compression.
+
+Other fastqz*.cpp are older versions. You don't need them.
+
+Usage: fastqz {c[Q]|d|e[Q]|f} input output [reference]
+
+Command c compresses input to output.fx?.zpaq (3 or 4 files)
+Command d decompresses input.fx?.zpaq to output
+Command e encodes input to output.fx?
+Command f decodes input.fx? to output
+
+Commands c and d are slow, require 1.5 GB memory, use 3 or
+4 cores, but get very good compression. Commands e and f are much
+faster, use little memory, and only one thread, but compression
+ratio is not as good.
+
+Commands cQ or eQ quantize the quality scores for lossy but
+better compression. The default is c1 or e1, which is lossless.
+Quality scores in the range 33..73 are rounded down to 35 plus
+a multiple of Q.
+
+You can supply a reference genome to improve compression.
+If you use this, the same reference is needed to decompress.
+It also increases the memory requirement to 1.2 GB for the
+e command and 0.5 GB for the f command. c and d still need
+1.5 GB.
+
+You can prepare the reference genome from FASTA files like:
+
+  fapacks hg19s *.fa
+
+to produce the file hg19s. Then compress:
+
+  fastqz c in.fastq arc hg19s
+
+To decompress:
+
+  fastqz d arc out.fastq hg19s
+
+There are 4 compressed files:
+
+  arc.fxh.zpaq - compressed headers
+  arc.fxb.zpaq - compressed base calls
+  arc.fxq.zpaq - compressed quality scores
+  arc.fxa.zpaq - compressed alignments if a reference is used.
+
+Commands e and f work the same way except the compressed
+files do not have a .zpaq extension. If no reference is
+used, then no .fxa or .fxa.zpaq file is produced or expected.
+
+fastqz only works on the Sanger FASTQ variant. It assumes
+that quality scores are Phred+33 (range ASCII 33 to 73).
+Base calls must be A,C,G,T,N only. N must have a quality
+score of 0, and all others 1 or higher. Maximum line length
+is 4095. Lines must be terminated by linefeeds only (no
+carriage returns). If a reference is used, it must be
+smaller than 1 GB packed (4 billion bases).
+
+To compile fastqz you will need the latest version of
+libzpaq from https://sourceforge.net/projects/zpaq/
+or http://mattmahoney.net/zpaq/
+These programs will work in either Windows or Linux.
+In Windows, you will also need Pthreads-Win32 from
+http://sourceware.org/pthreads-win32/ to compile or run.
+To compile, run make, or compile by hand:
+
+  g++ -O3 -msse2 -s -lpthread fastqz15.cpp libzpaq.cpp -o fastqz
+  g++ -O3 -s fapack.cpp -o fapack
+
+fastqz* and fapack* are written by Matt Mahoney, Dell Inc.
+All are BSD-2 licensed. But note that libzpaq
+is public domain and Pthreads-Win32 is LGPL.