diff --git a/renderdoc/3rdparty/stb/stb_image.h b/renderdoc/3rdparty/stb/stb_image.h
index e42c6cc372..5e807a0a6e 100644
--- a/renderdoc/3rdparty/stb/stb_image.h
+++ b/renderdoc/3rdparty/stb/stb_image.h
@@ -1,5 +1,5 @@
-/* stb_image - v2.12 - public domain image loader - http://nothings.org/stb_image.h
-                                     no warranty implied; use at your own risk
+/* stb_image - v2.28 - public domain image loader - http://nothings.org/stb
+                                  no warranty implied; use at your own risk
 
    Do this:
       #define STB_IMAGE_IMPLEMENTATION
@@ -21,7 +21,7 @@
           avoid problematic images and only need the trivial interface
 
       JPEG baseline & progressive (12 bpc/arithmetic not supported, same as stock IJG lib)
-      PNG 1/2/4/8-bit-per-channel (16 bpc not supported)
+      PNG 1/2/4/8/16-bit-per-channel
 
       TGA (not sure what subset, if a subset)
       BMP non-1bpp, non-RLE
@@ -42,136 +42,35 @@
    Full documentation under "DOCUMENTATION" below.
 
 
-   Revision 2.00 release notes:
-
-      - Progressive JPEG is now supported.
-
-      - PPM and PGM binary formats are now supported, thanks to Ken Miller.
-
-      - x86 platforms now make use of SSE2 SIMD instructions for
-        JPEG decoding, and ARM platforms can use NEON SIMD if requested.
-        This work was done by Fabian "ryg" Giesen. SSE2 is used by
-        default, but NEON must be enabled explicitly; see docs.
-
-        With other JPEG optimizations included in this version, we see
-        2x speedup on a JPEG on an x86 machine, and a 1.5x speedup
-        on a JPEG on an ARM machine, relative to previous versions of this
-        library. The same results will not obtain for all JPGs and for all
-        x86/ARM machines. (Note that progressive JPEGs are significantly
-        slower to decode than regular JPEGs.) This doesn't mean that this
-        is the fastest JPEG decoder in the land; rather, it brings it
-        closer to parity with standard libraries. If you want the fastest
-        decode, look elsewhere. (See "Philosophy" section of docs below.)
-
-        See final bullet items below for more info on SIMD.
-
-      - Added STBI_MALLOC, STBI_REALLOC, and STBI_FREE macros for replacing
-        the memory allocator. Unlike other STBI libraries, these macros don't
-        support a context parameter, so if you need to pass a context in to
-        the allocator, you'll have to store it in a global or a thread-local
-        variable.
-
-      - Split existing STBI_NO_HDR flag into two flags, STBI_NO_HDR and
-        STBI_NO_LINEAR.
-            STBI_NO_HDR:     suppress implementation of .hdr reader format
-            STBI_NO_LINEAR:  suppress high-dynamic-range light-linear float API
-
-      - You can suppress implementation of any of the decoders to reduce
-        your code footprint by #defining one or more of the following
-        symbols before creating the implementation.
-
-            STBI_NO_JPEG
-            STBI_NO_PNG
-            STBI_NO_BMP
-            STBI_NO_PSD
-            STBI_NO_TGA
-            STBI_NO_GIF
-            STBI_NO_HDR
-            STBI_NO_PIC
-            STBI_NO_PNM   (.ppm and .pgm)
-
-      - You can request *only* certain decoders and suppress all other ones
-        (this will be more forward-compatible, as addition of new decoders
-        doesn't require you to disable them explicitly):
-
-            STBI_ONLY_JPEG
-            STBI_ONLY_PNG
-            STBI_ONLY_BMP
-            STBI_ONLY_PSD
-            STBI_ONLY_TGA
-            STBI_ONLY_GIF
-            STBI_ONLY_HDR
-            STBI_ONLY_PIC
-            STBI_ONLY_PNM   (.ppm and .pgm)
-
-         Note that you can define multiples of these, and you will get all
-         of them ("only x" and "only y" is interpreted to mean "only x&y").
-
-       - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still
-         want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB
-
-      - Compilation of all SIMD code can be suppressed with
-            #define STBI_NO_SIMD
-        It should not be necessary to disable SIMD unless you have issues
-        compiling (e.g. using an x86 compiler which doesn't support SSE
-        intrinsics or that doesn't support the method used to detect
-        SSE2 support at run-time), and even those can be reported as
-        bugs so I can refine the built-in compile-time checking to be
-        smarter.
-
-      - The old STBI_SIMD system which allowed installing a user-defined
-        IDCT etc. has been removed. If you need this, don't upgrade. My
-        assumption is that almost nobody was doing this, and those who
-        were will find the built-in SIMD more satisfactory anyway.
-
-      - RGB values computed for JPEG images are slightly different from
-        previous versions of stb_image. (This is due to using less
-        integer precision in SIMD.) The C code has been adjusted so
-        that the same RGB values will be computed regardless of whether
-        SIMD support is available, so your app should always produce
-        consistent results. But these results are slightly different from
-        previous versions. (Specifically, about 3% of available YCbCr values
-        will compute different RGB results from pre-1.49 versions by +-1;
-        most of the deviating values are one smaller in the G channel.)
-
-      - If you must produce consistent results with previous versions of
-        stb_image, #define STBI_JPEG_OLD and you will get the same results
-        you used to; however, you will not get the SIMD speedups for
-        the YCbCr-to-RGB conversion step (although you should still see
-        significant JPEG speedup from the other changes).
-
-        Please note that STBI_JPEG_OLD is a temporary feature; it will be
-        removed in future versions of the library. It is only intended for
-        near-term back-compatibility use.
-
-
-   Latest revision history:
+LICENSE
+
+  See end of file for license information.
+
+RECENT REVISION HISTORY:
+
+      2.28  (2023-01-29) many error fixes, security errors, just tons of stuff
+      2.27  (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes
+      2.26  (2020-07-13) many minor fixes
+      2.25  (2020-02-02) fix warnings
+      2.24  (2020-02-02) fix warnings; thread-local failure_reason and flip_vertically
+      2.23  (2019-08-11) fix clang static analysis warning
+      2.22  (2019-03-04) gif fixes, fix warnings
+      2.21  (2019-02-25) fix typo in comment
+      2.20  (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs
+      2.19  (2018-02-11) fix warning
+      2.18  (2018-01-30) fix warnings
+      2.17  (2018-01-29) bugfix, 1-bit BMP, 16-bitness query, fix warnings
+      2.16  (2017-07-23) all functions have 16-bit variants; optimizations; bugfixes
+      2.15  (2017-03-18) fix png-1,2,4; all Imagenet JPGs; no runtime SSE detection on GCC
+      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
+      2.13  (2016-12-04) experimental 16-bit API, only for PNG so far; fixes
       2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
       2.11  (2016-04-02) 16-bit PNGS; enable SSE2 in non-gcc x64
                          RGB-format JPEG; remove white matting in PSD;
-                         allocate large structures on the stack; 
+                         allocate large structures on the stack;
                          correct channel count for PNG & BMP
       2.10  (2016-01-22) avoid warning introduced in 2.09
       2.09  (2016-01-16) 16-bit TGA; comments in PNM files; STBI_REALLOC_SIZED
-      2.08  (2015-09-13) fix to 2.07 cleanup, reading RGB PSD as RGBA
-      2.07  (2015-09-13) partial animated GIF support
-                         limited 16-bit PSD support
-                         minor bugs, code cleanup, and compiler warnings
-      2.06  (2015-04-19) fix bug where PSD returns wrong '*comp' value
-      2.05  (2015-04-19) fix bug in progressive JPEG handling, fix warning
-      2.04  (2015-04-15) try to re-enable SIMD on MinGW 64-bit
-      2.03  (2015-04-12) additional corruption checking
-                         stbi_set_flip_vertically_on_load
-                         fix NEON support; fix mingw support
-      2.02  (2015-01-19) fix incorrect assert, fix warning
-      2.01  (2015-01-17) fix various warnings
-      2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG
-      2.00  (2014-12-25) optimize JPEG, including x86 SSE2 & ARM NEON SIMD
-                         progressive JPEG
-                         PGM/PPM support
-                         STBI_MALLOC,STBI_REALLOC,STBI_FREE
-                         STBI_NO_*, STBI_ONLY_*
-                         GIF bugfix
 
    See end of file for full revision history.
 
@@ -186,34 +85,43 @@
     Tom Seddon (pic)                       Omar Cornut (1/2/4-bit PNG)
     Thatcher Ulrich (psd)                  Nicolas Guillemot (vertical flip)
     Ken Miller (pgm, ppm)                  Richard Mitton (16-bit PSD)
-    urraka@github (animated gif)           Junggon Kim (PNM comments)
-                                           Daniel Gibson (16-bit TGA)
-
- Optimizations & bugfixes
-    Fabian "ryg" Giesen
-    Arseny Kapoulkine
+    github:urraka (animated gif)           Junggon Kim (PNM comments)
+    Christopher Forseth (animated gif)     Daniel Gibson (16-bit TGA)
+                                           socks-the-fox (16-bit PNG)
+                                           Jeremy Sawicki (handle all ImageNet JPGs)
+ Optimizations & bugfixes                  Mikhail Morozov (1-bit BMP)
+    Fabian "ryg" Giesen                    Anael Seghezzi (is-16-bit query)
+    Arseny Kapoulkine                      Simon Breuss (16-bit PNM)
+    John-Mark Allen
+    Carmelo J Fdez-Aguera
 
  Bug & warning fixes
-    Marc LeBlanc            David Woo          Guillaume George   Martins Mozeiko
-    Christpher Lloyd        Martin Golini      Jerry Jansson      Joseph Thomson
-    Dave Moore              Roy Eltham         Hayaki Saito       Phil Jordan
-    Won Chun                Luke Graham        Johan Duparc       Nathan Reed
-    the Horde3D community   Thomas Ruf         Ronny Chevalier    Nick Verigakis
-    Janez Zemva             John Bartholomew   Michal Cichon      svdijk@github
-    Jonathan Blow           Ken Hamada         Tero Hanninen      Baldur Karlsson
-    Laurent Gomila          Cort Stratton      Sergio Gonzalez    romigrou@github
-    Aruelien Pocheville     Thibault Reuille   Cass Everitt       Matthew Gregan
-    Ryamond Barbiero        Paul Du Bois       Engin Manap        snagar@github
-    Michaelangel007@github  Oriol Ferrer Mesia socks-the-fox
-    Blazej Dariusz Roszkowski
-
-
-LICENSE
-
-This software is dual-licensed to the public domain and under the following
-license: you are granted a perpetual, irrevocable license to copy, modify,
-publish, and distribute this file as you see fit.
-
+    Marc LeBlanc            David Woo          Guillaume George     Martins Mozeiko
+    Christpher Lloyd        Jerry Jansson      Joseph Thomson       Blazej Dariusz Roszkowski
+    Phil Jordan                                Dave Moore           Roy Eltham
+    Hayaki Saito            Nathan Reed        Won Chun
+    Luke Graham             Johan Duparc       Nick Verigakis       the Horde3D community
+    Thomas Ruf              Ronny Chevalier                         github:rlyeh
+    Janez Zemva             John Bartholomew   Michal Cichon        github:romigrou
+    Jonathan Blow           Ken Hamada         Tero Hanninen        github:svdijk
+    Eugene Golushkov        Laurent Gomila     Cort Stratton        github:snagar
+    Aruelien Pocheville     Sergio Gonzalez    Thibault Reuille     github:Zelex
+    Cass Everitt            Ryamond Barbiero                        github:grim210
+    Paul Du Bois            Engin Manap        Aldo Culquicondor    github:sammyhw
+    Philipp Wiesemann       Dale Weiler        Oriol Ferrer Mesia   github:phprus
+    Josh Tobin              Neil Bickford      Matthew Gregan       github:poppolopoppo
+    Julian Raschke          Gregory Mullen     Christian Floisand   github:darealshinji
+    Baldur Karlsson         Kevin Schmidt      JR Smith             github:Michaelangel007
+                            Brad Weinberger    Matvey Cherevko      github:mosra
+    Luca Sas                Alexander Veselov  Zack Middleton       [reserved]
+    Ryan C. Gordon          [reserved]                              [reserved]
+                     DO NOT ADD YOUR NAME HERE
+
+                     Jacko Dirks
+
+  To add your name to the credits, pick a random blank space in the middle and fill it.
+  80% of merge conflicts on stb PRs are due to people adding their name at the end
+  of the credits.
 */
 
 #ifndef STBI_INCLUDE_STB_IMAGE_H
@@ -222,10 +130,8 @@ publish, and distribute this file as you see fit.
 // DOCUMENTATION
 //
 // Limitations:
-//    - no 16-bit-per-channel PNG
 //    - no 12-bit-per-channel JPEG
 //    - no JPEGs with arithmetic coding
-//    - no 1-bit BMP
 //    - GIF always returns *comp=4
 //
 // Basic usage (see HDR discussion below for HDR usage):
@@ -235,13 +141,13 @@ publish, and distribute this file as you see fit.
 //    // ... x = width, y = height, n = # 8-bit components per pixel ...
 //    // ... replace '0' with '1'..'4' to force that many components per pixel
 //    // ... but 'n' will always be the number that it would have been if you said 0
-//    stbi_image_free(data)
+//    stbi_image_free(data);
 //
 // Standard parameters:
-//    int *x       -- outputs image width in pixels
-//    int *y       -- outputs image height in pixels
-//    int *comp    -- outputs # of image components in image file
-//    int req_comp -- if non-zero, # of image components requested in result
+//    int *x                 -- outputs image width in pixels
+//    int *y                 -- outputs image height in pixels
+//    int *channels_in_file  -- outputs # of image components in image file
+//    int desired_channels   -- if non-zero, # of image components requested in result
 //
 // The return value from an image loader is an 'unsigned char *' which points
 // to the pixel data, or NULL on an allocation failure or if the image is
@@ -249,11 +155,12 @@ publish, and distribute this file as you see fit.
 // with each pixel consisting of N interleaved 8-bit components; the first
 // pixel pointed to is top-left-most in the image. There is no padding between
 // image scanlines or between pixels, regardless of format. The number of
-// components N is 'req_comp' if req_comp is non-zero, or *comp otherwise.
-// If req_comp is non-zero, *comp has the number of components that _would_
-// have been output otherwise. E.g. if you set req_comp to 4, you will always
-// get RGBA output, but you can check *comp to see if it's trivially opaque
-// because e.g. there were only 3 channels in the source image.
+// components N is 'desired_channels' if desired_channels is non-zero, or
+// *channels_in_file otherwise. If desired_channels is non-zero,
+// *channels_in_file has the number of components that _would_ have been
+// output otherwise. E.g. if you set desired_channels to 4, you will always
+// get RGBA output, but you can check *channels_in_file to see if it's trivially
+// opaque because e.g. there were only 3 channels in the source image.
 //
 // An output image with N components has the following components interleaved
 // in this order in each pixel:
@@ -265,14 +172,50 @@ publish, and distribute this file as you see fit.
 //       4           red, green, blue, alpha
 //
 // If image loading fails for any reason, the return value will be NULL,
-// and *x, *y, *comp will be unchanged. The function stbi_failure_reason()
-// can be queried for an extremely brief, end-user unfriendly explanation
-// of why the load failed. Define STBI_NO_FAILURE_STRINGS to avoid
-// compiling these strings at all, and STBI_FAILURE_USERMSG to get slightly
+// and *x, *y, *channels_in_file will be unchanged. The function
+// stbi_failure_reason() can be queried for an extremely brief, end-user
+// unfriendly explanation of why the load failed. Define STBI_NO_FAILURE_STRINGS
+// to avoid compiling these strings at all, and STBI_FAILURE_USERMSG to get slightly
 // more user-friendly ones.
 //
 // Paletted PNG, BMP, GIF, and PIC images are automatically depalettized.
 //
+// To query the width, height and component count of an image without having to
+// decode the full file, you can use the stbi_info family of functions:
+//
+//   int x,y,n,ok;
+//   ok = stbi_info(filename, &x, &y, &n);
+//   // returns ok=1 and sets x, y, n if image is a supported format,
+//   // 0 otherwise.
+//
+// Note that stb_image pervasively uses ints in its public API for sizes,
+// including sizes of memory buffers. This is now part of the API and thus
+// hard to change without causing breakage. As a result, the various image
+// loaders all have certain limits on image size; these differ somewhat
+// by format but generally boil down to either just under 2GB or just under
+// 1GB. When the decoded image would be larger than this, stb_image decoding
+// will fail.
+//
+// Additionally, stb_image will reject image files that have any of their
+// dimensions set to a larger value than the configurable STBI_MAX_DIMENSIONS,
+// which defaults to 2**24 = 16777216 pixels. Due to the above memory limit,
+// the only way to have an image with such dimensions load correctly
+// is for it to have a rather extreme aspect ratio. Either way, the
+// assumption here is that such larger images are likely to be malformed
+// or malicious. If you do need to load an image with individual dimensions
+// larger than that, and it still fits in the overall size limit, you can
+// #define STBI_MAX_DIMENSIONS on your own to be something larger.
+//
+// ===========================================================================
+//
+// UNICODE:
+//
+//   If compiling for Windows and you wish to use Unicode filenames, compile
+//   with
+//       #define STBI_WINDOWS_UTF8
+//   and pass utf8-encoded filenames. Call stbi_convert_wchar_to_utf8 to convert
+//   Windows wchar_t filenames to utf8.
+//
 // ===========================================================================
 //
 // Philosophy
@@ -285,15 +228,15 @@ publish, and distribute this file as you see fit.
 //
 // Sometimes I let "good performance" creep up in priority over "easy to maintain",
 // and for best performance I may provide less-easy-to-use APIs that give higher
-// performance, in addition to the easy to use ones. Nevertheless, it's important
+// performance, in addition to the easy-to-use ones. Nevertheless, it's important
 // to keep in mind that from the standpoint of you, a client of this library,
-// all you care about is #1 and #3, and stb libraries do not emphasize #3 above all.
+// all you care about is #1 and #3, and stb libraries DO NOT emphasize #3 above all.
 //
 // Some secondary priorities arise directly from the first two, some of which
-// make more explicit reasons why performance can't be emphasized.
+// provide more explicit reasons why performance can't be emphasized.
 //
 //    - Portable ("ease of use")
-//    - Small footprint ("easy to maintain")
+//    - Small source code footprint ("easy to maintain")
 //    - No dependencies ("ease of use")
 //
 // ===========================================================================
@@ -325,13 +268,6 @@ publish, and distribute this file as you see fit.
 // (at least this is true for iOS and Android). Therefore, the NEON support is
 // toggled by a build flag: define STBI_NEON to get NEON loops.
 //
-// The output of the JPEG decoder is slightly different from versions where
-// SIMD support was introduced (that is, for versions before 1.49). The
-// difference is only +-1 in the 8-bit RGB channels, and only on a small
-// fraction of pixels. You can force the pre-1.49 behavior by defining
-// STBI_JPEG_OLD, but this will disable some of the SIMD decoding path
-// and hence cost some performance.
-//
 // If for some reason you do not want to use any of SIMD code, or if
 // you have issues compiling it, you can disable it entirely by
 // defining STBI_NO_SIMD.
@@ -340,11 +276,10 @@ publish, and distribute this file as you see fit.
 //
 // HDR image support   (disable by defining STBI_NO_HDR)
 //
-// stb_image now supports loading HDR images in general, and currently
-// the Radiance .HDR file format, although the support is provided
-// generically. You can still load any file through the existing interface;
-// if you attempt to load an HDR file, it will be automatically remapped to
-// LDR, assuming gamma 2.2 and an arbitrary scale factor defaulting to 1;
+// stb_image supports loading HDR images in general, and currently the Radiance
+// .HDR file format specifically. You can still load any file through the existing
+// interface; if you attempt to load an HDR file, it will be automatically remapped
+// to LDR, assuming gamma 2.2 and an arbitrary scale factor defaulting to 1;
 // both of these constants can be reconfigured through this interface:
 //
 //     stbi_hdr_to_ldr_gamma(2.2f);
@@ -376,18 +311,59 @@ publish, and distribute this file as you see fit.
 //
 // iPhone PNG support:
 //
-// By default we convert iphone-formatted PNGs back to RGB, even though
-// they are internally encoded differently. You can disable this conversion
-// by by calling stbi_convert_iphone_png_to_rgb(0), in which case
-// you will always just get the native iphone "format" through (which
-// is BGR stored in RGB).
+// We optionally support converting iPhone-formatted PNGs (which store
+// premultiplied BGRA) back to RGB, even though they're internally encoded
+// differently. To enable this conversion, call
+// stbi_convert_iphone_png_to_rgb(1).
 //
 // Call stbi_set_unpremultiply_on_load(1) as well to force a divide per
 // pixel to remove any premultiplied alpha *only* if the image file explicitly
 // says there's premultiplied data (currently only happens in iPhone images,
 // and only if iPhone convert-to-rgb processing is on).
 //
-
+// ===========================================================================
+//
+// ADDITIONAL CONFIGURATION
+//
+//  - You can suppress implementation of any of the decoders to reduce
+//    your code footprint by #defining one or more of the following
+//    symbols before creating the implementation.
+//
+//        STBI_NO_JPEG
+//        STBI_NO_PNG
+//        STBI_NO_BMP
+//        STBI_NO_PSD
+//        STBI_NO_TGA
+//        STBI_NO_GIF
+//        STBI_NO_HDR
+//        STBI_NO_PIC
+//        STBI_NO_PNM   (.ppm and .pgm)
+//
+//  - You can request *only* certain decoders and suppress all other ones
+//    (this will be more forward-compatible, as addition of new decoders
+//    doesn't require you to disable them explicitly):
+//
+//        STBI_ONLY_JPEG
+//        STBI_ONLY_PNG
+//        STBI_ONLY_BMP
+//        STBI_ONLY_PSD
+//        STBI_ONLY_TGA
+//        STBI_ONLY_GIF
+//        STBI_ONLY_HDR
+//        STBI_ONLY_PIC
+//        STBI_ONLY_PNM   (.ppm and .pgm)
+//
+//   - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still
+//     want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB
+//
+//  - If you define STBI_MAX_DIMENSIONS, stb_image will reject images greater
+//    than that size (in either width or height) without further processing.
+//    This is to let programs in the wild set an upper bound to prevent
+//    denial-of-service attacks on untrusted data, as one could generate a
+//    valid image of gigantic dimensions and force stb_image to allocate a
+//    huge block of memory and spend disproportionate time decoding it. By
+//    default this is set to (1 << 24), which is 16777216, but that's still
+//    very big.
 
 #ifndef STBI_NO_STDIO
 #include <stdio.h>
@@ -397,7 +373,7 @@ publish, and distribute this file as you see fit.
 
 enum
 {
-   STBI_default = 0, // only used for req_comp
+   STBI_default = 0, // only used for desired_channels
 
    STBI_grey       = 1,
    STBI_grey_alpha = 2,
@@ -405,17 +381,21 @@ enum
    STBI_rgb_alpha  = 4
 };
 
+#include <stdlib.h>
 typedef unsigned char stbi_uc;
+typedef unsigned short stbi_us;
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+#ifndef STBIDEF
 #ifdef STB_IMAGE_STATIC
 #define STBIDEF static
 #else
 #define STBIDEF extern
 #endif
+#endif
 
 //////////////////////////////////////////////////////////////////////////////
 //
@@ -433,22 +413,52 @@ typedef struct
    int      (*eof)   (void *user);                       // returns nonzero if we are at end of file/data
 } stbi_io_callbacks;
 
-STBIDEF stbi_uc *stbi_load               (char              const *filename,           int *x, int *y, int *comp, int req_comp);
-STBIDEF stbi_uc *stbi_load_from_memory   (stbi_uc           const *buffer, int len   , int *x, int *y, int *comp, int req_comp);
-STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk  , void *user, int *x, int *y, int *comp, int req_comp);
+////////////////////////////////////
+//
+// 8-bits-per-channel interface
+//
+
+STBIDEF stbi_uc *stbi_load_from_memory   (stbi_uc           const *buffer, int len   , int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk  , void *user, int *x, int *y, int *channels_in_file, int desired_channels);
 
 #ifndef STBI_NO_STDIO
-STBIDEF stbi_uc *stbi_load_from_file  (FILE *f,                  int *x, int *y, int *comp, int req_comp);
+STBIDEF stbi_uc *stbi_load            (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_file  (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
 // for stbi_load_from_file, file pointer is left pointing immediately after image
 #endif
 
+#ifndef STBI_NO_GIF
+STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
+#endif
+
+#ifdef STBI_WINDOWS_UTF8
+STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
+#endif
+
+////////////////////////////////////
+//
+// 16-bits-per-channel interface
+//
+
+STBIDEF stbi_us *stbi_load_16_from_memory   (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels);
+
+#ifndef STBI_NO_STDIO
+STBIDEF stbi_us *stbi_load_16          (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_us *stbi_load_from_file_16(FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
+#endif
+
+////////////////////////////////////
+//
+// float-per-channel interface
+//
 #ifndef STBI_NO_LINEAR
-   STBIDEF float *stbi_loadf                 (char const *filename,           int *x, int *y, int *comp, int req_comp);
-   STBIDEF float *stbi_loadf_from_memory     (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp);
-   STBIDEF float *stbi_loadf_from_callbacks  (stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp);
+   STBIDEF float *stbi_loadf_from_memory     (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
+   STBIDEF float *stbi_loadf_from_callbacks  (stbi_io_callbacks const *clbk, void *user, int *x, int *y,  int *channels_in_file, int desired_channels);
 
    #ifndef STBI_NO_STDIO
-   STBIDEF float *stbi_loadf_from_file  (FILE *f,                int *x, int *y, int *comp, int req_comp);
+   STBIDEF float *stbi_loadf            (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+   STBIDEF float *stbi_loadf_from_file  (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
    #endif
 #endif
 
@@ -472,7 +482,7 @@ STBIDEF int      stbi_is_hdr_from_file(FILE *f);
 
 
 // get a VERY brief reason for failure
-// NOT THREADSAFE
+// on most compilers (and ALL modern mainstream compilers) this is threadsafe
 STBIDEF const char *stbi_failure_reason  (void);
 
 // free the loaded image -- this is just free()
@@ -481,11 +491,14 @@ STBIDEF void     stbi_image_free      (void *retval_from_stbi_load);
 // get image dimensions & components without fully decoding
 STBIDEF int      stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp);
 STBIDEF int      stbi_info_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp);
+STBIDEF int      stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len);
+STBIDEF int      stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *clbk, void *user);
 
 #ifndef STBI_NO_STDIO
-STBIDEF int      stbi_info            (char const *filename,     int *x, int *y, int *comp);
-STBIDEF int      stbi_info_from_file  (FILE *f,                  int *x, int *y, int *comp);
-
+STBIDEF int      stbi_info               (char const *filename,     int *x, int *y, int *comp);
+STBIDEF int      stbi_info_from_file     (FILE *f,                  int *x, int *y, int *comp);
+STBIDEF int      stbi_is_16_bit          (char const *filename);
+STBIDEF int      stbi_is_16_bit_from_file(FILE *f);
 #endif
 
 
@@ -502,6 +515,13 @@ STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert);
 // flip the image vertically, so the first pixel in the output array is the bottom left
 STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip);
 
+// as above, but only applies to images loaded on the thread that calls the function
+// this function is only available if your compiler supports thread-local variables;
+// calling it will fail to link if your compiler doesn't
+STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply);
+STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert);
+STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip);
+
 // ZLIB client - used by PNG, available for other purposes
 
 STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen);
@@ -566,9 +586,10 @@ STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const ch
 #include <stddef.h> // ptrdiff_t on osx
 #include <stdlib.h>
 #include <string.h>
+#include <limits.h>
 
 #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
-#include <math.h>  // ldexp
+#include <math.h>  // ldexp, pow
 #endif
 
 #ifndef STBI_NO_STDIO
@@ -580,6 +601,12 @@ STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const ch
 #define STBI_ASSERT(x) assert(x)
 #endif
 
+#ifdef __cplusplus
+#define STBI_EXTERN extern "C"
+#else
+#define STBI_EXTERN extern
+#endif
+
 
 #ifndef _MSC_VER
    #ifdef __cplusplus
@@ -591,8 +618,25 @@ STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const ch
    #define stbi_inline __forceinline
 #endif
 
+#ifndef STBI_NO_THREAD_LOCALS
+   #if defined(__cplusplus) &&  __cplusplus >= 201103L
+      #define STBI_THREAD_LOCAL       thread_local
+   #elif defined(__GNUC__) && __GNUC__ < 5
+      #define STBI_THREAD_LOCAL       __thread
+   #elif defined(_MSC_VER)
+      #define STBI_THREAD_LOCAL       __declspec(thread)
+   #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)
+      #define STBI_THREAD_LOCAL       _Thread_local
+   #endif
 
-#ifdef _MSC_VER
+   #ifndef STBI_THREAD_LOCAL
+      #if defined(__GNUC__)
+        #define STBI_THREAD_LOCAL       __thread
+      #endif
+   #endif
+#endif
+
+#if defined(_MSC_VER) || defined(__SYMBIAN32__)
 typedef unsigned short stbi__uint16;
 typedef   signed short stbi__int16;
 typedef unsigned int   stbi__uint32;
@@ -621,7 +665,7 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
 #ifdef STBI_HAS_LROTL
    #define stbi_lrot(x,y)  _lrotl(x,y)
 #else
-   #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (32 - (y))))
+   #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (-(y) & 31)))
 #endif
 
 #if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED))
@@ -649,12 +693,14 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
 #define STBI__X86_TARGET
 #endif
 
-#if defined(__GNUC__) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET)) && !defined(__SSE2__) && !defined(STBI_NO_SIMD)
-// NOTE: not clear do we actually need this for the 64-bit path?
+#if defined(__GNUC__) && defined(STBI__X86_TARGET) && !defined(__SSE2__) && !defined(STBI_NO_SIMD)
 // gcc doesn't support sse2 intrinsics unless you compile with -msse2,
-// (but compiling with -msse2 allows the compiler to use SSE2 everywhere;
-// this is just broken and gcc are jerks for not fixing it properly
-// http://www.virtualdub.org/blog/pivot/entry.php?id=363 )
+// which in turn means it gets to use SSE2 everywhere. This is unfortunate,
+// but previous attempts to provide the SSE2 functions with runtime
+// detection caused numerous issues. The way architecture extensions are
+// exposed in GCC/Clang is, sadly, not really suited for one-file libs.
+// New behavior: if compiled with -msse2, we use SSE2 without any
+// detection; if not, we don't use it at all.
 #define STBI_NO_SIMD
 #endif
 
@@ -702,28 +748,27 @@ static int stbi__cpuid3(void)
 
 #define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
 
-static int stbi__sse2_available()
+#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
+static int stbi__sse2_available(void)
 {
    int info3 = stbi__cpuid3();
    return ((info3 >> 26) & 1) != 0;
 }
+#endif
+
 #else // assume GCC-style if not VC++
 #define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
 
-static int stbi__sse2_available()
+#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
+static int stbi__sse2_available(void)
 {
-#if defined(STBI__X64_TARGET)
-   // on x64, SSE2 can be assumed to be available.
+   // If we're even attempting to compile this on GCC/Clang, that means
+   // -msse2 is on, which means the compiler is allowed to use SSE2
+   // instructions at will, and so are we.
    return 1;
-#else
-   // __builtin_cpu_supports is buggy on GCC 5 and above, causing problems if
-   // referenced in a shared object, giving missing __cpu_model hidden symbol errors.
-   // To get around that, just assume that SSE2 is not available on x86.
-   //
-   // See https://github.com/nothings/stb/issues/280 for more information.
-   return 0;
-#endif
 }
+#endif
+
 #endif
 #endif
 
@@ -734,14 +779,21 @@ static int stbi__sse2_available()
 
 #ifdef STBI_NEON
 #include <arm_neon.h>
-// assume GCC or Clang on ARM targets
+#ifdef _MSC_VER
+#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
+#else
 #define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
 #endif
+#endif
 
 #ifndef STBI_SIMD_ALIGN
 #define STBI_SIMD_ALIGN(type, name) type name
 #endif
 
+#ifndef STBI_MAX_DIMENSIONS
+#define STBI_MAX_DIMENSIONS (1 << 24)
+#endif
+
 ///////////////////////////////////////////////
 //
 //  stbi__context struct and start_xxx functions
@@ -759,6 +811,7 @@ typedef struct
    int read_from_callbacks;
    int buflen;
    stbi_uc buffer_start[128];
+   int callback_already_read;
 
    stbi_uc *img_buffer, *img_buffer_end;
    stbi_uc *img_buffer_original, *img_buffer_original_end;
@@ -772,6 +825,7 @@ static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len)
 {
    s->io.read = NULL;
    s->read_from_callbacks = 0;
+   s->callback_already_read = 0;
    s->img_buffer = s->img_buffer_original = (stbi_uc *) buffer;
    s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *) buffer+len;
 }
@@ -783,7 +837,8 @@ static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, void *
    s->io_user_data = user;
    s->buflen = sizeof(s->buffer_start);
    s->read_from_callbacks = 1;
-   s->img_buffer_original = s->buffer_start;
+   s->callback_already_read = 0;
+   s->img_buffer = s->img_buffer_original = s->buffer_start;
    stbi__refill_buffer(s);
    s->img_buffer_original_end = s->img_buffer_end;
 }
@@ -797,12 +852,17 @@ static int stbi__stdio_read(void *user, char *data, int size)
 
 static void stbi__stdio_skip(void *user, int n)
 {
+   int ch;
    fseek((FILE*) user, n, SEEK_CUR);
+   ch = fgetc((FILE*) user);  /* have to read a byte to reset feof()'s flag */
+   if (ch != EOF) {
+      ungetc(ch, (FILE *) user);  /* push byte back onto stream if valid. */
+   }
 }
 
 static int stbi__stdio_eof(void *user)
 {
-   return feof((FILE*) user);
+   return feof((FILE*) user) || ferror((FILE *) user);
 }
 
 static stbi_io_callbacks stbi__stdio_callbacks =
@@ -830,79 +890,197 @@ static void stbi__rewind(stbi__context *s)
    s->img_buffer_end = s->img_buffer_original_end;
 }
 
+enum
+{
+   STBI_ORDER_RGB,
+   STBI_ORDER_BGR
+};
+
+typedef struct
+{
+   int bits_per_channel;
+   int num_channels;
+   int channel_order;
+} stbi__result_info;
+
 #ifndef STBI_NO_JPEG
 static int      stbi__jpeg_test(stbi__context *s);
-static stbi_uc *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_PNG
 static int      stbi__png_test(stbi__context *s);
-static stbi_uc *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__png_info(stbi__context *s, int *x, int *y, int *comp);
+static int      stbi__png_is16(stbi__context *s);
 #endif
 
 #ifndef STBI_NO_BMP
 static int      stbi__bmp_test(stbi__context *s);
-static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_TGA
 static int      stbi__tga_test(stbi__context *s);
-static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__tga_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_PSD
 static int      stbi__psd_test(stbi__context *s);
-static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc);
 static int      stbi__psd_info(stbi__context *s, int *x, int *y, int *comp);
+static int      stbi__psd_is16(stbi__context *s);
 #endif
 
 #ifndef STBI_NO_HDR
 static int      stbi__hdr_test(stbi__context *s);
-static float   *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static float   *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_PIC
 static int      stbi__pic_test(stbi__context *s);
-static stbi_uc *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__pic_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_GIF
 static int      stbi__gif_test(stbi__context *s);
-static stbi_uc *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static void    *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
 static int      stbi__gif_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_PNM
 static int      stbi__pnm_test(stbi__context *s);
-static stbi_uc *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp);
+static int      stbi__pnm_is16(stbi__context *s);
 #endif
 
-// this is not threadsafe
-static const char *stbi__g_failure_reason;
+static
+#ifdef STBI_THREAD_LOCAL
+STBI_THREAD_LOCAL
+#endif
+const char *stbi__g_failure_reason;
 
 STBIDEF const char *stbi_failure_reason(void)
 {
    return stbi__g_failure_reason;
 }
 
+#ifndef STBI_NO_FAILURE_STRINGS
 static int stbi__err(const char *str)
 {
    stbi__g_failure_reason = str;
    return 0;
 }
+#endif
 
 static void *stbi__malloc(size_t size)
 {
     return STBI_MALLOC(size);
 }
 
+// stb_image uses ints pervasively, including for offset calculations.
+// therefore the largest decoded image size we can support with the
+// current code, even on 64-bit targets, is INT_MAX. this is not a
+// significant limitation for the intended use case.
+//
+// we do, however, need to make sure our size calculations don't
+// overflow. hence a few helper functions for size calculations that
+// multiply integers together, making sure that they're non-negative
+// and no overflow occurs.
+
+// return 1 if the sum is valid, 0 on overflow.
+// negative terms are considered invalid.
+static int stbi__addsizes_valid(int a, int b)
+{
+   if (b < 0) return 0;
+   // now 0 <= b <= INT_MAX, hence also
+   // 0 <= INT_MAX - b <= INTMAX.
+   // And "a + b <= INT_MAX" (which might overflow) is the
+   // same as a <= INT_MAX - b (no overflow)
+   return a <= INT_MAX - b;
+}
+
+// returns 1 if the product is valid, 0 on overflow.
+// negative factors are considered invalid.
+static int stbi__mul2sizes_valid(int a, int b)
+{
+   if (a < 0 || b < 0) return 0;
+   if (b == 0) return 1; // mul-by-0 is always safe
+   // portable way to check for no overflows in a*b
+   return a <= INT_MAX/b;
+}
+
+#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR)
+// returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow
+static int stbi__mad2sizes_valid(int a, int b, int add)
+{
+   return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a*b, add);
+}
+#endif
+
+// returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow
+static int stbi__mad3sizes_valid(int a, int b, int c, int add)
+{
+   return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) &&
+      stbi__addsizes_valid(a*b*c, add);
+}
+
+// returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
+static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add)
+{
+   return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) &&
+      stbi__mul2sizes_valid(a*b*c, d) && stbi__addsizes_valid(a*b*c*d, add);
+}
+#endif
+
+#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR)
+// mallocs with size overflow checking
+static void *stbi__malloc_mad2(int a, int b, int add)
+{
+   if (!stbi__mad2sizes_valid(a, b, add)) return NULL;
+   return stbi__malloc(a*b + add);
+}
+#endif
+
+static void *stbi__malloc_mad3(int a, int b, int c, int add)
+{
+   if (!stbi__mad3sizes_valid(a, b, c, add)) return NULL;
+   return stbi__malloc(a*b*c + add);
+}
+
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
+static void *stbi__malloc_mad4(int a, int b, int c, int d, int add)
+{
+   if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL;
+   return stbi__malloc(a*b*c*d + add);
+}
+#endif
+
+// returns 1 if the sum of two signed ints is valid (between -2^31 and 2^31-1 inclusive), 0 on overflow.
+static int stbi__addints_valid(int a, int b)
+{
+   if ((a >= 0) != (b >= 0)) return 1; // a and b have different signs, so no overflow
+   if (a < 0 && b < 0) return a >= INT_MIN - b; // same as a + b >= INT_MIN; INT_MIN - b cannot overflow since b < 0.
+   return a <= INT_MAX - b;
+}
+
+// returns 1 if the product of two signed shorts is valid, 0 on overflow.
+static int stbi__mul2shorts_valid(short a, short b)
+{
+   if (b == 0 || b == -1) return 1; // multiplication by 0 is always 0; check for -1 so SHRT_MIN/b doesn't overflow
+   if ((a >= 0) == (b >= 0)) return a <= SHRT_MAX/b; // product is positive, so similar to mul2sizes_valid
+   if (b < 0) return a <= SHRT_MIN / b; // same as a * b >= SHRT_MIN
+   return a >= SHRT_MIN / b;
+}
+
 // stbi__err - error
 // stbi__errpf - error returning pointer to float
 // stbi__errpuc - error returning pointer to unsigned char
@@ -931,40 +1109,69 @@ static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp);
 static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp);
 #endif
 
-static int stbi__vertically_flip_on_load = 0;
+static int stbi__vertically_flip_on_load_global = 0;
 
 STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip)
 {
-    stbi__vertically_flip_on_load = flag_true_if_should_flip;
+   stbi__vertically_flip_on_load_global = flag_true_if_should_flip;
 }
 
-static unsigned char *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+#ifndef STBI_THREAD_LOCAL
+#define stbi__vertically_flip_on_load  stbi__vertically_flip_on_load_global
+#else
+static STBI_THREAD_LOCAL int stbi__vertically_flip_on_load_local, stbi__vertically_flip_on_load_set;
+
+STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip)
 {
-   #ifndef STBI_NO_JPEG
-   if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp);
-   #endif
+   stbi__vertically_flip_on_load_local = flag_true_if_should_flip;
+   stbi__vertically_flip_on_load_set = 1;
+}
+
+#define stbi__vertically_flip_on_load  (stbi__vertically_flip_on_load_set       \
+                                         ? stbi__vertically_flip_on_load_local  \
+                                         : stbi__vertically_flip_on_load_global)
+#endif // STBI_THREAD_LOCAL
+
+static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
+{
+   memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields
+   ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed
+   ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order
+   ri->num_channels = 0;
+
+   // test the formats with a very explicit header first (at least a FOURCC
+   // or distinctive magic number first)
    #ifndef STBI_NO_PNG
-   if (stbi__png_test(s))  return stbi__png_load(s,x,y,comp,req_comp);
+   if (stbi__png_test(s))  return stbi__png_load(s,x,y,comp,req_comp, ri);
    #endif
    #ifndef STBI_NO_BMP
-   if (stbi__bmp_test(s))  return stbi__bmp_load(s,x,y,comp,req_comp);
+   if (stbi__bmp_test(s))  return stbi__bmp_load(s,x,y,comp,req_comp, ri);
    #endif
    #ifndef STBI_NO_GIF
-   if (stbi__gif_test(s))  return stbi__gif_load(s,x,y,comp,req_comp);
+   if (stbi__gif_test(s))  return stbi__gif_load(s,x,y,comp,req_comp, ri);
    #endif
    #ifndef STBI_NO_PSD
-   if (stbi__psd_test(s))  return stbi__psd_load(s,x,y,comp,req_comp);
+   if (stbi__psd_test(s))  return stbi__psd_load(s,x,y,comp,req_comp, ri, bpc);
+   #else
+   STBI_NOTUSED(bpc);
    #endif
    #ifndef STBI_NO_PIC
-   if (stbi__pic_test(s))  return stbi__pic_load(s,x,y,comp,req_comp);
+   if (stbi__pic_test(s))  return stbi__pic_load(s,x,y,comp,req_comp, ri);
+   #endif
+
+   // then the formats that can end up attempting to load with just 1 or 2
+   // bytes matching expectations; these are prone to false positives, so
+   // try them later
+   #ifndef STBI_NO_JPEG
+   if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri);
    #endif
    #ifndef STBI_NO_PNM
-   if (stbi__pnm_test(s))  return stbi__pnm_load(s,x,y,comp,req_comp);
+   if (stbi__pnm_test(s))  return stbi__pnm_load(s,x,y,comp,req_comp, ri);
    #endif
 
    #ifndef STBI_NO_HDR
    if (stbi__hdr_test(s)) {
-      float *hdr = stbi__hdr_load(s, x,y,comp,req_comp);
+      float *hdr = stbi__hdr_load(s, x,y,comp,req_comp, ri);
       return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp);
    }
    #endif
@@ -972,66 +1179,179 @@ static unsigned char *stbi__load_main(stbi__context *s, int *x, int *y, int *com
    #ifndef STBI_NO_TGA
    // test tga last because it's a crappy test!
    if (stbi__tga_test(s))
-      return stbi__tga_load(s,x,y,comp,req_comp);
+      return stbi__tga_load(s,x,y,comp,req_comp, ri);
    #endif
 
    return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt");
 }
 
-static unsigned char *stbi__load_flip(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static stbi_uc *stbi__convert_16_to_8(stbi__uint16 *orig, int w, int h, int channels)
 {
-   unsigned char *result = stbi__load_main(s, x, y, comp, req_comp);
+   int i;
+   int img_len = w * h * channels;
+   stbi_uc *reduced;
 
-   if (stbi__vertically_flip_on_load && result != NULL) {
-      int w = *x, h = *y;
-      int depth = req_comp ? req_comp : *comp;
-      int row,col,z;
-      stbi_uc temp;
-
-      // @OPTIMIZE: use a bigger temp buffer and memcpy multiple pixels at once
-      for (row = 0; row < (h>>1); row++) {
-         for (col = 0; col < w; col++) {
-            for (z = 0; z < depth; z++) {
-               temp = result[(row * w + col) * depth + z];
-               result[(row * w + col) * depth + z] = result[((h - row - 1) * w + col) * depth + z];
-               result[((h - row - 1) * w + col) * depth + z] = temp;
-            }
-         }
+   reduced = (stbi_uc *) stbi__malloc(img_len);
+   if (reduced == NULL) return stbi__errpuc("outofmem", "Out of memory");
+
+   for (i = 0; i < img_len; ++i)
+      reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling
+
+   STBI_FREE(orig);
+   return reduced;
+}
+
+static stbi__uint16 *stbi__convert_8_to_16(stbi_uc *orig, int w, int h, int channels)
+{
+   int i;
+   int img_len = w * h * channels;
+   stbi__uint16 *enlarged;
+
+   enlarged = (stbi__uint16 *) stbi__malloc(img_len*2);
+   if (enlarged == NULL) return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory");
+
+   for (i = 0; i < img_len; ++i)
+      enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff
+
+   STBI_FREE(orig);
+   return enlarged;
+}
+
+static void stbi__vertical_flip(void *image, int w, int h, int bytes_per_pixel)
+{
+   int row;
+   size_t bytes_per_row = (size_t)w * bytes_per_pixel;
+   stbi_uc temp[2048];
+   stbi_uc *bytes = (stbi_uc *)image;
+
+   for (row = 0; row < (h>>1); row++) {
+      stbi_uc *row0 = bytes + row*bytes_per_row;
+      stbi_uc *row1 = bytes + (h - row - 1)*bytes_per_row;
+      // swap row0 with row1
+      size_t bytes_left = bytes_per_row;
+      while (bytes_left) {
+         size_t bytes_copy = (bytes_left < sizeof(temp)) ? bytes_left : sizeof(temp);
+         memcpy(temp, row0, bytes_copy);
+         memcpy(row0, row1, bytes_copy);
+         memcpy(row1, temp, bytes_copy);
+         row0 += bytes_copy;
+         row1 += bytes_copy;
+         bytes_left -= bytes_copy;
       }
    }
+}
 
-   return result;
+#ifndef STBI_NO_GIF
+static void stbi__vertical_flip_slices(void *image, int w, int h, int z, int bytes_per_pixel)
+{
+   int slice;
+   int slice_size = w * h * bytes_per_pixel;
+
+   stbi_uc *bytes = (stbi_uc *)image;
+   for (slice = 0; slice < z; ++slice) {
+      stbi__vertical_flip(bytes, w, h, bytes_per_pixel);
+      bytes += slice_size;
+   }
 }
+#endif
 
-#ifndef STBI_NO_HDR
+static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__result_info ri;
+   void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8);
+
+   if (result == NULL)
+      return NULL;
+
+   // it is the responsibility of the loaders to make sure we get either 8 or 16 bit.
+   STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16);
+
+   if (ri.bits_per_channel != 8) {
+      result = stbi__convert_16_to_8((stbi__uint16 *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
+      ri.bits_per_channel = 8;
+   }
+
+   // @TODO: move stbi__convert_format to here
+
+   if (stbi__vertically_flip_on_load) {
+      int channels = req_comp ? req_comp : *comp;
+      stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc));
+   }
+
+   return (unsigned char *) result;
+}
+
+static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__result_info ri;
+   void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16);
+
+   if (result == NULL)
+      return NULL;
+
+   // it is the responsibility of the loaders to make sure we get either 8 or 16 bit.
+   STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16);
+
+   if (ri.bits_per_channel != 16) {
+      result = stbi__convert_8_to_16((stbi_uc *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
+      ri.bits_per_channel = 16;
+   }
+
+   // @TODO: move stbi__convert_format16 to here
+   // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision
+
+   if (stbi__vertically_flip_on_load) {
+      int channels = req_comp ? req_comp : *comp;
+      stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16));
+   }
+
+   return (stbi__uint16 *) result;
+}
+
+#if !defined(STBI_NO_HDR) && !defined(STBI_NO_LINEAR)
 static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, int req_comp)
 {
    if (stbi__vertically_flip_on_load && result != NULL) {
-      int w = *x, h = *y;
-      int depth = req_comp ? req_comp : *comp;
-      int row,col,z;
-      float temp;
-
-      // @OPTIMIZE: use a bigger temp buffer and memcpy multiple pixels at once
-      for (row = 0; row < (h>>1); row++) {
-         for (col = 0; col < w; col++) {
-            for (z = 0; z < depth; z++) {
-               temp = result[(row * w + col) * depth + z];
-               result[(row * w + col) * depth + z] = result[((h - row - 1) * w + col) * depth + z];
-               result[((h - row - 1) * w + col) * depth + z] = temp;
-            }
-         }
-      }
+      int channels = req_comp ? req_comp : *comp;
+      stbi__vertical_flip(result, *x, *y, channels * sizeof(float));
    }
 }
 #endif
 
 #ifndef STBI_NO_STDIO
 
+#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
+STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide);
+STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default);
+#endif
+
+#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
+STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input)
+{
+	return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
+}
+#endif
+
 static FILE *stbi__fopen(char const *filename, char const *mode)
 {
    FILE *f;
+#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
+   wchar_t wMode[64];
+   wchar_t wFilename[1024];
+	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
+      return 0;
+
+	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
+      return 0;
+
 #if defined(_MSC_VER) && _MSC_VER >= 1400
+	if (0 != _wfopen_s(&f, wFilename, wMode))
+		f = 0;
+#else
+   f = _wfopen(wFilename, wMode);
+#endif
+
+#elif defined(_MSC_VER) && _MSC_VER >= 1400
    if (0 != fopen_s(&f, filename, mode))
       f=0;
 #else
@@ -1056,28 +1376,83 @@ STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req
    unsigned char *result;
    stbi__context s;
    stbi__start_file(&s,f);
-   result = stbi__load_flip(&s,x,y,comp,req_comp);
+   result = stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
    if (result) {
       // need to 'unget' all the characters in the IO buffer
       fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR);
    }
    return result;
 }
+
+STBIDEF stbi__uint16 *stbi_load_from_file_16(FILE *f, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__uint16 *result;
+   stbi__context s;
+   stbi__start_file(&s,f);
+   result = stbi__load_and_postprocess_16bit(&s,x,y,comp,req_comp);
+   if (result) {
+      // need to 'unget' all the characters in the IO buffer
+      fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR);
+   }
+   return result;
+}
+
+STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y, int *comp, int req_comp)
+{
+   FILE *f = stbi__fopen(filename, "rb");
+   stbi__uint16 *result;
+   if (!f) return (stbi_us *) stbi__errpuc("can't fopen", "Unable to open file");
+   result = stbi_load_from_file_16(f,x,y,comp,req_comp);
+   fclose(f);
+   return result;
+}
+
+
 #endif //!STBI_NO_STDIO
 
+STBIDEF stbi_us *stbi_load_16_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels)
+{
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels);
+}
+
+STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels)
+{
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user);
+   return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels);
+}
+
 STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
 {
    stbi__context s;
    stbi__start_mem(&s,buffer,len);
-   return stbi__load_flip(&s,x,y,comp,req_comp);
+   return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
 }
 
 STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
 {
    stbi__context s;
    stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
-   return stbi__load_flip(&s,x,y,comp,req_comp);
+   return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
+}
+
+#ifndef STBI_NO_GIF
+STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
+{
+   unsigned char *result;
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+
+   result = (unsigned char*) stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp);
+   if (stbi__vertically_flip_on_load) {
+      stbi__vertical_flip_slices( result, *x, *y, *z, *comp );
+   }
+
+   return result;
 }
+#endif
 
 #ifndef STBI_NO_LINEAR
 static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
@@ -1085,13 +1460,14 @@ static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int
    unsigned char *data;
    #ifndef STBI_NO_HDR
    if (stbi__hdr_test(s)) {
-      float *hdr_data = stbi__hdr_load(s,x,y,comp,req_comp);
+      stbi__result_info ri;
+      float *hdr_data = stbi__hdr_load(s,x,y,comp,req_comp, &ri);
       if (hdr_data)
          stbi__float_postprocess(hdr_data,x,y,comp,req_comp);
       return hdr_data;
    }
    #endif
-   data = stbi__load_flip(s, x, y, comp, req_comp);
+   data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp);
    if (data)
       return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp);
    return stbi__errpf("unknown image type", "Image not of any known type, or corrupt");
@@ -1161,12 +1537,16 @@ STBIDEF int      stbi_is_hdr          (char const *filename)
    return result;
 }
 
-STBIDEF int      stbi_is_hdr_from_file(FILE *f)
+STBIDEF int stbi_is_hdr_from_file(FILE *f)
 {
    #ifndef STBI_NO_HDR
+   long pos = ftell(f);
+   int res;
    stbi__context s;
    stbi__start_file(&s,f);
-   return stbi__hdr_test(&s);
+   res = stbi__hdr_test(&s);
+   fseek(f, pos, SEEK_SET);
+   return res;
    #else
    STBI_NOTUSED(f);
    return 0;
@@ -1215,6 +1595,7 @@ enum
 static void stbi__refill_buffer(stbi__context *s)
 {
    int n = (s->io.read)(s->io_user_data,(char*)s->buffer_start,s->buflen);
+   s->callback_already_read += (int) (s->img_buffer - s->img_buffer_original);
    if (n == 0) {
       // at end of file, treat same as if from memory, but need to handle case
       // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file
@@ -1239,6 +1620,9 @@ stbi_inline static stbi_uc stbi__get8(stbi__context *s)
    return 0;
 }
 
+#if defined(STBI_NO_JPEG) && defined(STBI_NO_HDR) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
+// nothing
+#else
 stbi_inline static int stbi__at_eof(stbi__context *s)
 {
    if (s->io.read) {
@@ -1250,9 +1634,14 @@ stbi_inline static int stbi__at_eof(stbi__context *s)
 
    return s->img_buffer >= s->img_buffer_end;
 }
+#endif
 
+#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC)
+// nothing
+#else
 static void stbi__skip(stbi__context *s, int n)
 {
+   if (n == 0) return;  // already there!
    if (n < 0) {
       s->img_buffer = s->img_buffer_end;
       return;
@@ -1267,7 +1656,11 @@ static void stbi__skip(stbi__context *s, int n)
    }
    s->img_buffer += n;
 }
+#endif
 
+#if defined(STBI_NO_PNG) && defined(STBI_NO_TGA) && defined(STBI_NO_HDR) && defined(STBI_NO_PNM)
+// nothing
+#else
 static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n)
 {
    if (s->io.read) {
@@ -1291,18 +1684,27 @@ static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n)
    } else
       return 0;
 }
+#endif
 
+#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC)
+// nothing
+#else
 static int stbi__get16be(stbi__context *s)
 {
    int z = stbi__get8(s);
    return (z << 8) + stbi__get8(s);
 }
+#endif
 
+#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC)
+// nothing
+#else
 static stbi__uint32 stbi__get32be(stbi__context *s)
 {
    stbi__uint32 z = stbi__get16be(s);
    return (z << 16) + stbi__get16be(s);
 }
+#endif
 
 #if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF)
 // nothing
@@ -1318,13 +1720,16 @@ static int stbi__get16le(stbi__context *s)
 static stbi__uint32 stbi__get32le(stbi__context *s)
 {
    stbi__uint32 z = stbi__get16le(s);
-   return z + (stbi__get16le(s) << 16);
+   z += (stbi__uint32)stbi__get16le(s) << 16;
+   return z;
 }
 #endif
 
 #define STBI__BYTECAST(x)  ((stbi_uc) ((x) & 255))  // truncate int to byte without warnings
 
-
+#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
+// nothing
+#else
 //////////////////////////////////////////////////////////////////////////////
 //
 //  generic converter from built-in img_n to req_comp
@@ -1340,7 +1745,11 @@ static stbi_uc stbi__compute_y(int r, int g, int b)
 {
    return (stbi_uc) (((r*77) + (g*150) +  (29*b)) >> 8);
 }
+#endif
 
+#if defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
+// nothing
+#else
 static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int req_comp, unsigned int x, unsigned int y)
 {
    int i,j;
@@ -1349,7 +1758,7 @@ static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int r
    if (req_comp == img_n) return data;
    STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
 
-   good = (unsigned char *) stbi__malloc(req_comp * x * y);
+   good = (unsigned char *) stbi__malloc_mad3(req_comp, x, y, 0);
    if (good == NULL) {
       STBI_FREE(data);
       return stbi__errpuc("outofmem", "Out of memory");
@@ -1359,37 +1768,97 @@ static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int r
       unsigned char *src  = data + j * x * img_n   ;
       unsigned char *dest = good + j * x * req_comp;
 
-      #define COMBO(a,b)  ((a)*8+(b))
-      #define CASE(a,b)   case COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+      #define STBI__COMBO(a,b)  ((a)*8+(b))
+      #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
       // convert source image with img_n components to one with req_comp components;
       // avoid switch per pixel, so use switch per scanline and massive macros
-      switch (COMBO(img_n, req_comp)) {
-         CASE(1,2) dest[0]=src[0], dest[1]=255; break;
-         CASE(1,3) dest[0]=dest[1]=dest[2]=src[0]; break;
-         CASE(1,4) dest[0]=dest[1]=dest[2]=src[0], dest[3]=255; break;
-         CASE(2,1) dest[0]=src[0]; break;
-         CASE(2,3) dest[0]=dest[1]=dest[2]=src[0]; break;
-         CASE(2,4) dest[0]=dest[1]=dest[2]=src[0], dest[3]=src[1]; break;
-         CASE(3,4) dest[0]=src[0],dest[1]=src[1],dest[2]=src[2],dest[3]=255; break;
-         CASE(3,1) dest[0]=stbi__compute_y(src[0],src[1],src[2]); break;
-         CASE(3,2) dest[0]=stbi__compute_y(src[0],src[1],src[2]), dest[1] = 255; break;
-         CASE(4,1) dest[0]=stbi__compute_y(src[0],src[1],src[2]); break;
-         CASE(4,2) dest[0]=stbi__compute_y(src[0],src[1],src[2]), dest[1] = src[3]; break;
-         CASE(4,3) dest[0]=src[0],dest[1]=src[1],dest[2]=src[2]; break;
-         default: STBI_ASSERT(0);
+      switch (STBI__COMBO(img_n, req_comp)) {
+         STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=255;                                     } break;
+         STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
+         STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=255;                     } break;
+         STBI__CASE(2,1) { dest[0]=src[0];                                                  } break;
+         STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
+         STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1];                  } break;
+         STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=255;        } break;
+         STBI__CASE(3,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(3,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = 255;    } break;
+         STBI__CASE(4,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(4,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = src[3]; } break;
+         STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];                    } break;
+         default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return stbi__errpuc("unsupported", "Unsupported format conversion");
       }
-      #undef CASE
+      #undef STBI__CASE
    }
 
    STBI_FREE(data);
    return good;
 }
+#endif
+
+#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD)
+// nothing
+#else
+static stbi__uint16 stbi__compute_y_16(int r, int g, int b)
+{
+   return (stbi__uint16) (((r*77) + (g*150) +  (29*b)) >> 8);
+}
+#endif
+
+#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD)
+// nothing
+#else
+static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int req_comp, unsigned int x, unsigned int y)
+{
+   int i,j;
+   stbi__uint16 *good;
+
+   if (req_comp == img_n) return data;
+   STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
+
+   good = (stbi__uint16 *) stbi__malloc(req_comp * x * y * 2);
+   if (good == NULL) {
+      STBI_FREE(data);
+      return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory");
+   }
+
+   for (j=0; j < (int) y; ++j) {
+      stbi__uint16 *src  = data + j * x * img_n   ;
+      stbi__uint16 *dest = good + j * x * req_comp;
+
+      #define STBI__COMBO(a,b)  ((a)*8+(b))
+      #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+      // convert source image with img_n components to one with req_comp components;
+      // avoid switch per pixel, so use switch per scanline and massive macros
+      switch (STBI__COMBO(img_n, req_comp)) {
+         STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=0xffff;                                     } break;
+         STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
+         STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=0xffff;                     } break;
+         STBI__CASE(2,1) { dest[0]=src[0];                                                     } break;
+         STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
+         STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1];                     } break;
+         STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=0xffff;        } break;
+         STBI__CASE(3,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(3,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = 0xffff; } break;
+         STBI__CASE(4,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(4,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = src[3]; } break;
+         STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];                       } break;
+         default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return (stbi__uint16*) stbi__errpuc("unsupported", "Unsupported format conversion");
+      }
+      #undef STBI__CASE
+   }
+
+   STBI_FREE(data);
+   return good;
+}
+#endif
 
 #ifndef STBI_NO_LINEAR
 static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
 {
    int i,k,n;
-   float *output = (float *) stbi__malloc(x * y * comp * sizeof(float));
+   float *output;
+   if (!data) return NULL;
+   output = (float *) stbi__malloc_mad4(x, y, comp, sizeof(float), 0);
    if (output == NULL) { STBI_FREE(data); return stbi__errpf("outofmem", "Out of memory"); }
    // compute number of non-alpha components
    if (comp & 1) n = comp; else n = comp-1;
@@ -1397,7 +1866,11 @@ static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
       for (k=0; k < n; ++k) {
          output[i*comp + k] = (float) (pow(data[i*comp+k]/255.0f, stbi__l2h_gamma) * stbi__l2h_scale);
       }
-      if (k < comp) output[i*comp + k] = data[i*comp+k]/255.0f;
+   }
+   if (n < comp) {
+      for (i=0; i < x*y; ++i) {
+         output[i*comp + n] = data[i*comp + n]/255.0f;
+      }
    }
    STBI_FREE(data);
    return output;
@@ -1409,7 +1882,9 @@ static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
 static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp)
 {
    int i,k,n;
-   stbi_uc *output = (stbi_uc *) stbi__malloc(x * y * comp);
+   stbi_uc *output;
+   if (!data) return NULL;
+   output = (stbi_uc *) stbi__malloc_mad3(x, y, comp, 0);
    if (output == NULL) { STBI_FREE(data); return stbi__errpuc("outofmem", "Out of memory"); }
    // compute number of non-alpha components
    if (comp & 1) n = comp; else n = comp-1;
@@ -1474,7 +1949,7 @@ typedef struct
    stbi__context *s;
    stbi__huffman huff_dc[4];
    stbi__huffman huff_ac[4];
-   stbi_uc dequant[4][64];
+   stbi__uint16 dequant[4][64];
    stbi__int16 fast_ac[4][1 << FAST_BITS];
 
 // sizes for components, interleaved MCUs
@@ -1510,6 +1985,8 @@ typedef struct
    int            succ_high;
    int            succ_low;
    int            eob_run;
+   int            jfif;
+   int            app14_color_transform; // Adobe APP14 tag
    int            rgb;
 
    int scan_n, order[4];
@@ -1523,11 +2000,15 @@ typedef struct
 
 static int stbi__build_huffman(stbi__huffman *h, int *count)
 {
-   int i,j,k=0,code;
+   int i,j,k=0;
+   unsigned int code;
    // build size list for each symbol (from JPEG spec)
-   for (i=0; i < 16; ++i)
-      for (j=0; j < count[i]; ++j)
+   for (i=0; i < 16; ++i) {
+      for (j=0; j < count[i]; ++j) {
          h->size[k++] = (stbi_uc) (i+1);
+         if(k >= 257) return stbi__err("bad size list","Corrupt JPEG");
+      }
+   }
    h->size[k] = 0;
 
    // compute actual symbols (from jpeg spec)
@@ -1539,7 +2020,7 @@ static int stbi__build_huffman(stbi__huffman *h, int *count)
       if (h->size[k] == j) {
          while (h->size[k] == j)
             h->code[k++] = (stbi__uint16) (code++);
-         if (code-1 >= (1 << j)) return stbi__err("bad code lengths","Corrupt JPEG");
+         if (code-1 >= (1u << j)) return stbi__err("bad code lengths","Corrupt JPEG");
       }
       // compute largest code + 1 for this size, preshifted as needed later
       h->maxcode[j] = code << (16-j);
@@ -1580,10 +2061,10 @@ static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h)
             // magnitude code followed by receive_extend code
             int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
             int m = 1 << (magbits - 1);
-            if (k < m) k += (-1 << magbits) + 1;
+            if (k < m) k += (~0U << magbits) + 1;
             // if the result is small enough, we can fit it in fast_ac table
             if (k >= -128 && k <= 127)
-               fast_ac[i] = (stbi__int16) ((k << 8) + (run << 4) + (len + magbits));
+               fast_ac[i] = (stbi__int16) ((k * 256) + (run * 16) + (len + magbits));
          }
       }
    }
@@ -1592,9 +2073,10 @@ static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h)
 static void stbi__grow_buffer_unsafe(stbi__jpeg *j)
 {
    do {
-      int b = j->nomore ? 0 : stbi__get8(j->s);
+      unsigned int b = j->nomore ? 0 : stbi__get8(j->s);
       if (b == 0xff) {
          int c = stbi__get8(j->s);
+         while (c == 0xff) c = stbi__get8(j->s); // consume fill bytes
          if (c != 0) {
             j->marker = (unsigned char) c;
             j->nomore = 1;
@@ -1607,7 +2089,7 @@ static void stbi__grow_buffer_unsafe(stbi__jpeg *j)
 }
 
 // (1 << n) - 1
-static stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
+static const stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
 
 // decode a jpeg huffman value from the bitstream
 stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
@@ -1651,6 +2133,8 @@ stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
 
    // convert the huffman code to the symbol id
    c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k];
+   if(c < 0 || c >= 256) // symbol id out of bounds!
+       return -1;
    STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]);
 
    // convert the id to a symbol
@@ -1660,7 +2144,7 @@ stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
 }
 
 // bias[n] = (-1<<n) + 1
-static int const stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
+static const int stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
 
 // combined JPEG 'receive' and JPEG 'extend', since baseline
 // always extends everything it receives.
@@ -1669,14 +2153,14 @@ stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n)
    unsigned int k;
    int sgn;
    if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s intead of continuing
 
-   sgn = (stbi__int32)j->code_buffer >> 31; // sign bit is always in MSB
+   sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear (positive), 1 if MSB set (negative)
    k = stbi_lrot(j->code_buffer, n);
-   STBI_ASSERT(n >= 0 && n < (int) (sizeof(stbi__bmask)/sizeof(*stbi__bmask)));
    j->code_buffer = k & ~stbi__bmask[n];
    k &= stbi__bmask[n];
    j->code_bits -= n;
-   return k + (stbi__jbias[n] & ~sgn);
+   return k + (stbi__jbias[n] & (sgn - 1));
 }
 
 // get some unsigned bits
@@ -1684,6 +2168,7 @@ stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n)
 {
    unsigned int k;
    if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s intead of continuing
    k = stbi_lrot(j->code_buffer, n);
    j->code_buffer = k & ~stbi__bmask[n];
    k &= stbi__bmask[n];
@@ -1695,6 +2180,7 @@ stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j)
 {
    unsigned int k;
    if (j->code_bits < 1) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < 1) return 0; // ran out of bits from stream, return 0s intead of continuing
    k = j->code_buffer;
    j->code_buffer <<= 1;
    --j->code_bits;
@@ -1703,7 +2189,7 @@ stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j)
 
 // given a value that's at position X in the zigzag stream,
 // where does it appear in the 8x8 matrix coded as row-major?
-static stbi_uc stbi__jpeg_dezigzag[64+15] =
+static const stbi_uc stbi__jpeg_dezigzag[64+15] =
 {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
@@ -1719,21 +2205,23 @@ static stbi_uc stbi__jpeg_dezigzag[64+15] =
 };
 
 // decode one 64-entry block--
-static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi_uc *dequant)
+static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi__uint16 *dequant)
 {
    int diff,dc,k;
    int t;
 
    if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
    t = stbi__jpeg_huff_decode(j, hdc);
-   if (t < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+   if (t < 0 || t > 15) return stbi__err("bad huffman code","Corrupt JPEG");
 
    // 0 all the ac values now so we can do it 32-bits at a time
    memset(data,0,64*sizeof(data[0]));
 
    diff = t ? stbi__extend_receive(j, t) : 0;
+   if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta","Corrupt JPEG");
    dc = j->img_comp[b].dc_pred + diff;
    j->img_comp[b].dc_pred = dc;
+   if (!stbi__mul2shorts_valid(dc, dequant[0])) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
    data[0] = (short) (dc * dequant[0]);
 
    // decode AC components, see JPEG spec
@@ -1747,6 +2235,7 @@ static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman
       if (r) { // fast-AC path
          k += (r >> 4) & 15; // run
          s = r & 15; // combined length
+         if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available");
          j->code_buffer <<= s;
          j->code_bits -= s;
          // decode into unzigzag'd location
@@ -1783,11 +2272,14 @@ static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__
       // first scan for DC coefficient, must be first
       memset(data,0,64*sizeof(data[0])); // 0 all the ac values now
       t = stbi__jpeg_huff_decode(j, hdc);
+      if (t < 0 || t > 15) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
       diff = t ? stbi__extend_receive(j, t) : 0;
 
+      if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta", "Corrupt JPEG");
       dc = j->img_comp[b].dc_pred + diff;
       j->img_comp[b].dc_pred = dc;
-      data[0] = (short) (dc << j->succ_low);
+      if (!stbi__mul2shorts_valid(dc, 1 << j->succ_low)) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+      data[0] = (short) (dc * (1 << j->succ_low));
    } else {
       // refinement scan for DC coefficient
       if (stbi__jpeg_get_bit(j))
@@ -1821,10 +2313,11 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__
          if (r) { // fast-AC path
             k += (r >> 4) & 15; // run
             s = r & 15; // combined length
+            if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available");
             j->code_buffer <<= s;
             j->code_bits -= s;
             zig = stbi__jpeg_dezigzag[k++];
-            data[zig] = (short) ((r >> 8) << shift);
+            data[zig] = (short) ((r >> 8) * (1 << shift));
          } else {
             int rs = stbi__jpeg_huff_decode(j, hac);
             if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
@@ -1842,7 +2335,7 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__
             } else {
                k += r;
                zig = stbi__jpeg_dezigzag[k++];
-               data[zig] = (short) (stbi__extend_receive(j,s) << shift);
+               data[zig] = (short) (stbi__extend_receive(j,s) * (1 << shift));
             }
          }
       } while (k <= j->spec_end);
@@ -1929,7 +2422,7 @@ stbi_inline static stbi_uc stbi__clamp(int x)
 }
 
 #define stbi__f2f(x)  ((int) (((x) * 4096 + 0.5)))
-#define stbi__fsh(x)  ((x) << 12)
+#define stbi__fsh(x)  ((x) * 4096)
 
 // derived from jidctint -- DCT_ISLOW
 #define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
@@ -1984,7 +2477,7 @@ static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64])
          //    (1|2|3|4|5|6|7)==0          0     seconds
          //    all separate               -0.047 seconds
          //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
-         int dcterm = d[0] << 2;
+         int dcterm = d[0]*4;
          v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
       } else {
          STBI__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56])
@@ -2428,7 +2921,7 @@ static stbi_uc stbi__get_marker(stbi__jpeg *j)
    x = stbi__get8(j->s);
    if (x != 0xff) return STBI__MARKER_none;
    while (x == 0xff)
-      x = stbi__get8(j->s);
+      x = stbi__get8(j->s); // consume repeated 0xff fill bytes
    return x;
 }
 
@@ -2443,7 +2936,7 @@ static void stbi__jpeg_reset(stbi__jpeg *j)
    j->code_bits = 0;
    j->code_buffer = 0;
    j->nomore = 0;
-   j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = 0;
+   j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = j->img_comp[3].dc_pred = 0;
    j->marker = STBI__MARKER_none;
    j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
    j->eob_run = 0;
@@ -2575,7 +3068,7 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg *z)
    }
 }
 
-static void stbi__jpeg_dequantize(short *data, stbi_uc *dequant)
+static void stbi__jpeg_dequantize(short *data, stbi__uint16 *dequant)
 {
    int i;
    for (i=0; i < 64; ++i)
@@ -2617,13 +3110,14 @@ static int stbi__process_marker(stbi__jpeg *z, int m)
          L = stbi__get16be(z->s)-2;
          while (L > 0) {
             int q = stbi__get8(z->s);
-            int p = q >> 4;
+            int p = q >> 4, sixteen = (p != 0);
             int t = q & 15,i;
-            if (p != 0) return stbi__err("bad DQT type","Corrupt JPEG");
+            if (p != 0 && p != 1) return stbi__err("bad DQT type","Corrupt JPEG");
             if (t > 3) return stbi__err("bad DQT table","Corrupt JPEG");
+
             for (i=0; i < 64; ++i)
-               z->dequant[t][stbi__jpeg_dezigzag[i]] = stbi__get8(z->s);
-            L -= 65;
+               z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s));
+            L -= (sixteen ? 129 : 65);
          }
          return L==0;
 
@@ -2640,6 +3134,7 @@ static int stbi__process_marker(stbi__jpeg *z, int m)
                sizes[i] = stbi__get8(z->s);
                n += sizes[i];
             }
+            if(n > 256) return stbi__err("bad DHT header","Corrupt JPEG"); // Loop over i < n would write past end of values!
             L -= 17;
             if (tc == 0) {
                if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0;
@@ -2656,12 +3151,50 @@ static int stbi__process_marker(stbi__jpeg *z, int m)
          }
          return L==0;
    }
+
    // check for comment block or APP blocks
    if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
-      stbi__skip(z->s, stbi__get16be(z->s)-2);
+      L = stbi__get16be(z->s);
+      if (L < 2) {
+         if (m == 0xFE)
+            return stbi__err("bad COM len","Corrupt JPEG");
+         else
+            return stbi__err("bad APP len","Corrupt JPEG");
+      }
+      L -= 2;
+
+      if (m == 0xE0 && L >= 5) { // JFIF APP0 segment
+         static const unsigned char tag[5] = {'J','F','I','F','\0'};
+         int ok = 1;
+         int i;
+         for (i=0; i < 5; ++i)
+            if (stbi__get8(z->s) != tag[i])
+               ok = 0;
+         L -= 5;
+         if (ok)
+            z->jfif = 1;
+      } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment
+         static const unsigned char tag[6] = {'A','d','o','b','e','\0'};
+         int ok = 1;
+         int i;
+         for (i=0; i < 6; ++i)
+            if (stbi__get8(z->s) != tag[i])
+               ok = 0;
+         L -= 6;
+         if (ok) {
+            stbi__get8(z->s); // version
+            stbi__get16be(z->s); // flags0
+            stbi__get16be(z->s); // flags1
+            z->app14_color_transform = stbi__get8(z->s); // color transform
+            L -= 6;
+         }
+      }
+
+      stbi__skip(z->s, L);
       return 1;
    }
-   return 0;
+
+   return stbi__err("unknown marker","Corrupt JPEG");
 }
 
 // after we see SOS
@@ -2704,6 +3237,28 @@ static int stbi__process_scan_header(stbi__jpeg *z)
    return 1;
 }
 
+static int stbi__free_jpeg_components(stbi__jpeg *z, int ncomp, int why)
+{
+   int i;
+   for (i=0; i < ncomp; ++i) {
+      if (z->img_comp[i].raw_data) {
+         STBI_FREE(z->img_comp[i].raw_data);
+         z->img_comp[i].raw_data = NULL;
+         z->img_comp[i].data = NULL;
+      }
+      if (z->img_comp[i].raw_coeff) {
+         STBI_FREE(z->img_comp[i].raw_coeff);
+         z->img_comp[i].raw_coeff = 0;
+         z->img_comp[i].coeff = 0;
+      }
+      if (z->img_comp[i].linebuf) {
+         STBI_FREE(z->img_comp[i].linebuf);
+         z->img_comp[i].linebuf = NULL;
+      }
+   }
+   return why;
+}
+
 static int stbi__process_frame_header(stbi__jpeg *z, int scan)
 {
    stbi__context *s = z->s;
@@ -2712,8 +3267,10 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
    p  = stbi__get8(s);            if (p != 8) return stbi__err("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline
    s->img_y = stbi__get16be(s);   if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
    s->img_x = stbi__get16be(s);   if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires
+   if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
+   if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
    c = stbi__get8(s);
-   if (c != 3 && c != 1) return stbi__err("bad component count","Corrupt JPEG");    // JFIF requires
+   if (c != 3 && c != 1 && c != 4) return stbi__err("bad component count","Corrupt JPEG");
    s->img_n = c;
    for (i=0; i < c; ++i) {
       z->img_comp[i].data = NULL;
@@ -2724,15 +3281,10 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
 
    z->rgb = 0;
    for (i=0; i < s->img_n; ++i) {
-      static unsigned char rgb[3] = { 'R', 'G', 'B' };
+      static const unsigned char rgb[3] = { 'R', 'G', 'B' };
       z->img_comp[i].id = stbi__get8(s);
-      if (z->img_comp[i].id != i+1)   // JFIF requires
-         if (z->img_comp[i].id != i) {  // some version of jpegtran outputs non-JFIF-compliant files!
-            // somethings output this (see http://fileformats.archiveteam.org/wiki/JPEG#Color_format)
-            if (z->img_comp[i].id != rgb[i])
-               return stbi__err("bad component ID","Corrupt JPEG");
-            ++z->rgb;
-         }
+      if (s->img_n == 3 && z->img_comp[i].id == rgb[i])
+         ++z->rgb;
       q = stbi__get8(s);
       z->img_comp[i].h = (q >> 4);  if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H","Corrupt JPEG");
       z->img_comp[i].v = q & 15;    if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V","Corrupt JPEG");
@@ -2741,18 +3293,26 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
 
    if (scan != STBI__SCAN_load) return 1;
 
-   if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode");
+   if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return stbi__err("too large", "Image too large to decode");
 
    for (i=0; i < s->img_n; ++i) {
       if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h;
       if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v;
    }
 
+   // check that plane subsampling factors are integer ratios; our resamplers can't deal with fractional ratios
+   // and I've never seen a non-corrupted JPEG file actually use them
+   for (i=0; i < s->img_n; ++i) {
+      if (h_max % z->img_comp[i].h != 0) return stbi__err("bad H","Corrupt JPEG");
+      if (v_max % z->img_comp[i].v != 0) return stbi__err("bad V","Corrupt JPEG");
+   }
+
    // compute interleaved mcu info
    z->img_h_max = h_max;
    z->img_v_max = v_max;
    z->img_mcu_w = h_max * 8;
    z->img_mcu_h = v_max * 8;
+   // these sizes can't be more than 17 bits
    z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w;
    z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;
 
@@ -2764,28 +3324,27 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
       // the bogus oversized data from using interleaved MCUs and their
       // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
       // discard the extra data until colorspace conversion
+      //
+      // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier)
+      // so these muls can't overflow with 32-bit ints (which we require)
       z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
       z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
-      z->img_comp[i].raw_data = stbi__malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15);
-
-      if (z->img_comp[i].raw_data == NULL) {
-         for(--i; i >= 0; --i) {
-            STBI_FREE(z->img_comp[i].raw_data);
-            z->img_comp[i].raw_data = NULL;
-         }
-         return stbi__err("outofmem", "Out of memory");
-      }
+      z->img_comp[i].coeff = 0;
+      z->img_comp[i].raw_coeff = 0;
+      z->img_comp[i].linebuf = NULL;
+      z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15);
+      if (z->img_comp[i].raw_data == NULL)
+         return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory"));
       // align blocks for idct using mmx/sse
       z->img_comp[i].data = (stbi_uc*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
-      z->img_comp[i].linebuf = NULL;
       if (z->progressive) {
-         z->img_comp[i].coeff_w = (z->img_comp[i].w2 + 7) >> 3;
-         z->img_comp[i].coeff_h = (z->img_comp[i].h2 + 7) >> 3;
-         z->img_comp[i].raw_coeff = STBI_MALLOC(z->img_comp[i].coeff_w * z->img_comp[i].coeff_h * 64 * sizeof(short) + 15);
+         // w2, h2 are multiples of 8 (see above)
+         z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8;
+         z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8;
+         z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15);
+         if (z->img_comp[i].raw_coeff == NULL)
+            return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory"));
          z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
-      } else {
-         z->img_comp[i].coeff = 0;
-         z->img_comp[i].raw_coeff = 0;
       }
    }
 
@@ -2804,6 +3363,8 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
 static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan)
 {
    int m;
+   z->jfif = 0;
+   z->app14_color_transform = -1; // valid values are 0,1,2
    z->marker = STBI__MARKER_none; // initialize cached marker to empty
    m = stbi__get_marker(z);
    if (!stbi__SOI(m)) return stbi__err("no SOI","Corrupt JPEG");
@@ -2823,6 +3384,28 @@ static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan)
    return 1;
 }
 
+static int stbi__skip_jpeg_junk_at_end(stbi__jpeg *j)
+{
+   // some JPEGs have junk at end, skip over it but if we find what looks
+   // like a valid marker, resume there
+   while (!stbi__at_eof(j->s)) {
+      int x = stbi__get8(j->s);
+      while (x == 255) { // might be a marker
+         if (stbi__at_eof(j->s)) return STBI__MARKER_none;
+         x = stbi__get8(j->s);
+         if (x != 0x00 && x != 0xff) {
+            // not a stuffed zero or lead-in to another marker, looks
+            // like an actual marker, return it
+            return x;
+         }
+         // stuffed zero has x=0 now which ends the loop, meaning we go
+         // back to regular scan loop.
+         // repeated 0xff keeps trying to read the next byte of the marker.
+      }
+   }
+   return STBI__MARKER_none;
+}
+
 // decode image to YCbCr format
 static int stbi__decode_jpeg_image(stbi__jpeg *j)
 {
@@ -2839,22 +3422,22 @@ static int stbi__decode_jpeg_image(stbi__jpeg *j)
          if (!stbi__process_scan_header(j)) return 0;
          if (!stbi__parse_entropy_coded_data(j)) return 0;
          if (j->marker == STBI__MARKER_none ) {
-            // handle 0s at the end of image data from IP Kamera 9060
-            while (!stbi__at_eof(j->s)) {
-               int x = stbi__get8(j->s);
-               if (x == 255) {
-                  j->marker = stbi__get8(j->s);
-                  break;
-               } else if (x != 0) {
-                  return stbi__err("junk before marker", "Corrupt JPEG");
-               }
-            }
+         j->marker = stbi__skip_jpeg_junk_at_end(j);
             // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0
          }
+         m = stbi__get_marker(j);
+         if (STBI__RESTART(m))
+            m = stbi__get_marker(j);
+      } else if (stbi__DNL(m)) {
+         int Ld = stbi__get16be(j->s);
+         stbi__uint32 NL = stbi__get16be(j->s);
+         if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG");
+         if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG");
+         m = stbi__get_marker(j);
       } else {
-         if (!stbi__process_marker(j, m)) return 0;
+         if (!stbi__process_marker(j, m)) return 1;
+         m = stbi__get_marker(j);
       }
-      m = stbi__get_marker(j);
    }
    if (j->progressive)
       stbi__jpeg_finish(j);
@@ -3069,38 +3652,9 @@ static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_
    return out;
 }
 
-#ifdef STBI_JPEG_OLD
-// this is the same YCbCr-to-RGB calculation that stb_image has used
-// historically before the algorithm changes in 1.49
-#define float2fixed(x)  ((int) ((x) * 65536 + 0.5))
-static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step)
-{
-   int i;
-   for (i=0; i < count; ++i) {
-      int y_fixed = (y[i] << 16) + 32768; // rounding
-      int r,g,b;
-      int cr = pcr[i] - 128;
-      int cb = pcb[i] - 128;
-      r = y_fixed + cr*float2fixed(1.40200f);
-      g = y_fixed - cr*float2fixed(0.71414f) - cb*float2fixed(0.34414f);
-      b = y_fixed                            + cb*float2fixed(1.77200f);
-      r >>= 16;
-      g >>= 16;
-      b >>= 16;
-      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
-      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
-      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
-      out[0] = (stbi_uc)r;
-      out[1] = (stbi_uc)g;
-      out[2] = (stbi_uc)b;
-      out[3] = 255;
-      out += step;
-   }
-}
-#else
 // this is a reduced-precision calculation of YCbCr-to-RGB introduced
 // to make sure the code produces the same results in both SIMD and scalar
-#define float2fixed(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
+#define stbi__float2fixed(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
 static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step)
 {
    int i;
@@ -3109,9 +3663,9 @@ static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc
       int r,g,b;
       int cr = pcr[i] - 128;
       int cb = pcb[i] - 128;
-      r = y_fixed +  cr* float2fixed(1.40200f);
-      g = y_fixed + (cr*-float2fixed(0.71414f)) + ((cb*-float2fixed(0.34414f)) & 0xffff0000);
-      b = y_fixed                               +   cb* float2fixed(1.77200f);
+      r = y_fixed +  cr* stbi__float2fixed(1.40200f);
+      g = y_fixed + (cr*-stbi__float2fixed(0.71414f)) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
+      b = y_fixed                                     +   cb* stbi__float2fixed(1.77200f);
       r >>= 20;
       g >>= 20;
       b >>= 20;
@@ -3125,7 +3679,6 @@ static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc
       out += step;
    }
 }
-#endif
 
 #if defined(STBI_SSE2) || defined(STBI_NEON)
 static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step)
@@ -3244,9 +3797,9 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc cons
       int r,g,b;
       int cr = pcr[i] - 128;
       int cb = pcb[i] - 128;
-      r = y_fixed + cr* float2fixed(1.40200f);
-      g = y_fixed + cr*-float2fixed(0.71414f) + ((cb*-float2fixed(0.34414f)) & 0xffff0000);
-      b = y_fixed                             +   cb* float2fixed(1.77200f);
+      r = y_fixed + cr* stbi__float2fixed(1.40200f);
+      g = y_fixed + cr*-stbi__float2fixed(0.71414f) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
+      b = y_fixed                                   +   cb* stbi__float2fixed(1.77200f);
       r >>= 20;
       g >>= 20;
       b >>= 20;
@@ -3272,18 +3825,14 @@ static void stbi__setup_jpeg(stbi__jpeg *j)
 #ifdef STBI_SSE2
    if (stbi__sse2_available()) {
       j->idct_block_kernel = stbi__idct_simd;
-      #ifndef STBI_JPEG_OLD
       j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
-      #endif
       j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
    }
 #endif
 
 #ifdef STBI_NEON
    j->idct_block_kernel = stbi__idct_simd;
-   #ifndef STBI_JPEG_OLD
    j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
-   #endif
    j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
 #endif
 }
@@ -3291,23 +3840,7 @@ static void stbi__setup_jpeg(stbi__jpeg *j)
 // clean up the temporary component buffers
 static void stbi__cleanup_jpeg(stbi__jpeg *j)
 {
-   int i;
-   for (i=0; i < j->s->img_n; ++i) {
-      if (j->img_comp[i].raw_data) {
-         STBI_FREE(j->img_comp[i].raw_data);
-         j->img_comp[i].raw_data = NULL;
-         j->img_comp[i].data = NULL;
-      }
-      if (j->img_comp[i].raw_coeff) {
-         STBI_FREE(j->img_comp[i].raw_coeff);
-         j->img_comp[i].raw_coeff = 0;
-         j->img_comp[i].coeff = 0;
-      }
-      if (j->img_comp[i].linebuf) {
-         STBI_FREE(j->img_comp[i].linebuf);
-         j->img_comp[i].linebuf = NULL;
-      }
-   }
+   stbi__free_jpeg_components(j, j->s->img_n, 0);
 }
 
 typedef struct
@@ -3320,9 +3853,16 @@ typedef struct
    int ypos;    // which pre-expansion row we're on
 } stbi__resample;
 
+// fast 0..255 * 0..255 => 0..255 rounded multiplication
+static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y)
+{
+   unsigned int t = x*y + 128;
+   return (stbi_uc) ((t + (t >>8)) >> 8);
+}
+
 static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp, int req_comp)
 {
-   int n, decode_n;
+   int n, decode_n, is_rgb;
    z->s->img_n = 0; // make stbi__cleanup_jpeg safe
 
    // validate req_comp
@@ -3332,19 +3872,25 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp
    if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; }
 
    // determine actual number of components to generate
-   n = req_comp ? req_comp : z->s->img_n;
+   n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1;
+
+   is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif));
 
-   if (z->s->img_n == 3 && n < 3)
+   if (z->s->img_n == 3 && n < 3 && !is_rgb)
       decode_n = 1;
    else
       decode_n = z->s->img_n;
 
+   // nothing to do if no components requested; check this now to avoid
+   // accessing uninitialized coutput[0] later
+   if (decode_n <= 0) { stbi__cleanup_jpeg(z); return NULL; }
+
    // resample and color-convert
    {
       int k;
       unsigned int i,j;
       stbi_uc *output;
-      stbi_uc *coutput[4];
+      stbi_uc *coutput[4] = { NULL, NULL, NULL, NULL };
 
       stbi__resample res_comp[4];
 
@@ -3371,7 +3917,7 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp
       }
 
       // can't error after this so, this is safe
-      output = (stbi_uc *) stbi__malloc(n * z->s->img_x * z->s->img_y + 1);
+      output = (stbi_uc *) stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1);
       if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
 
       // now go ahead and resample
@@ -3394,7 +3940,7 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp
          if (n >= 3) {
             stbi_uc *y = coutput[0];
             if (z->s->img_n == 3) {
-               if (z->rgb == 3) {
+               if (is_rgb) {
                   for (i=0; i < z->s->img_x; ++i) {
                      out[0] = y[i];
                      out[1] = coutput[1][i];
@@ -3405,6 +3951,28 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp
                } else {
                   z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
                }
+            } else if (z->s->img_n == 4) {
+               if (z->app14_color_transform == 0) { // CMYK
+                  for (i=0; i < z->s->img_x; ++i) {
+                     stbi_uc m = coutput[3][i];
+                     out[0] = stbi__blinn_8x8(coutput[0][i], m);
+                     out[1] = stbi__blinn_8x8(coutput[1][i], m);
+                     out[2] = stbi__blinn_8x8(coutput[2][i], m);
+                     out[3] = 255;
+                     out += n;
+                  }
+               } else if (z->app14_color_transform == 2) { // YCCK
+                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+                  for (i=0; i < z->s->img_x; ++i) {
+                     stbi_uc m = coutput[3][i];
+                     out[0] = stbi__blinn_8x8(255 - out[0], m);
+                     out[1] = stbi__blinn_8x8(255 - out[1], m);
+                     out[2] = stbi__blinn_8x8(255 - out[2], m);
+                     out += n;
+                  }
+               } else { // YCbCr + alpha?  Ignore the fourth channel for now
+                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+               }
             } else
                for (i=0; i < z->s->img_x; ++i) {
                   out[0] = out[1] = out[2] = y[i];
@@ -3412,25 +3980,56 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp
                   out += n;
                }
          } else {
-            stbi_uc *y = coutput[0];
-            if (n == 1)
-               for (i=0; i < z->s->img_x; ++i) out[i] = y[i];
-            else
-               for (i=0; i < z->s->img_x; ++i) *out++ = y[i], *out++ = 255;
+            if (is_rgb) {
+               if (n == 1)
+                  for (i=0; i < z->s->img_x; ++i)
+                     *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
+               else {
+                  for (i=0; i < z->s->img_x; ++i, out += 2) {
+                     out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
+                     out[1] = 255;
+                  }
+               }
+            } else if (z->s->img_n == 4 && z->app14_color_transform == 0) {
+               for (i=0; i < z->s->img_x; ++i) {
+                  stbi_uc m = coutput[3][i];
+                  stbi_uc r = stbi__blinn_8x8(coutput[0][i], m);
+                  stbi_uc g = stbi__blinn_8x8(coutput[1][i], m);
+                  stbi_uc b = stbi__blinn_8x8(coutput[2][i], m);
+                  out[0] = stbi__compute_y(r, g, b);
+                  out[1] = 255;
+                  out += n;
+               }
+            } else if (z->s->img_n == 4 && z->app14_color_transform == 2) {
+               for (i=0; i < z->s->img_x; ++i) {
+                  out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]);
+                  out[1] = 255;
+                  out += n;
+               }
+            } else {
+               stbi_uc *y = coutput[0];
+               if (n == 1)
+                  for (i=0; i < z->s->img_x; ++i) out[i] = y[i];
+               else
+                  for (i=0; i < z->s->img_x; ++i) { *out++ = y[i]; *out++ = 255; }
+            }
          }
       }
       stbi__cleanup_jpeg(z);
       *out_x = z->s->img_x;
       *out_y = z->s->img_y;
-      if (comp) *comp  = z->s->img_n; // report original components, not output
+      if (comp) *comp = z->s->img_n >= 3 ? 3 : 1; // report original components, not output
       return output;
    }
 }
 
-static unsigned char *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 {
    unsigned char* result;
    stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg));
+   if (!j) return stbi__errpuc("outofmem", "Out of memory");
+   memset(j, 0, sizeof(stbi__jpeg));
+   STBI_NOTUSED(ri);
    j->s = s;
    stbi__setup_jpeg(j);
    result = load_jpeg_image(j, x,y,comp,req_comp);
@@ -3441,11 +4040,14 @@ static unsigned char *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *com
 static int stbi__jpeg_test(stbi__context *s)
 {
    int r;
-   stbi__jpeg j;
-   j.s = s;
-   stbi__setup_jpeg(&j);
-   r = stbi__decode_jpeg_header(&j, STBI__SCAN_type);
+   stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg));
+   if (!j) return stbi__err("outofmem", "Out of memory");
+   memset(j, 0, sizeof(stbi__jpeg));
+   j->s = s;
+   stbi__setup_jpeg(j);
+   r = stbi__decode_jpeg_header(j, STBI__SCAN_type);
    stbi__rewind(s);
+   STBI_FREE(j);
    return r;
 }
 
@@ -3457,7 +4059,7 @@ static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp)
    }
    if (x) *x = j->s->img_x;
    if (y) *y = j->s->img_y;
-   if (comp) *comp = j->s->img_n;
+   if (comp) *comp = j->s->img_n >= 3 ? 3 : 1;
    return 1;
 }
 
@@ -3465,6 +4067,8 @@ static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
 {
    int result;
    stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg)));
+   if (!j) return stbi__err("outofmem", "Out of memory");
+   memset(j, 0, sizeof(stbi__jpeg));
    j->s = s;
    result = stbi__jpeg_info_raw(j, x, y, comp);
    STBI_FREE(j);
@@ -3484,6 +4088,7 @@ static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
 // fast-way is faster to check than jpeg huffman, but slow way is slower
 #define STBI__ZFAST_BITS  9 // accelerate all cases in default tables
 #define STBI__ZFAST_MASK  ((1 << STBI__ZFAST_BITS) - 1)
+#define STBI__ZNSYMS 288 // number of symbols in literal/length alphabet
 
 // zlib-style huffman encoding
 // (jpegs packs from left, zlib from right, so can't share code)
@@ -3493,8 +4098,8 @@ typedef struct
    stbi__uint16 firstcode[16];
    int maxcode[17];
    stbi__uint16 firstsymbol[16];
-   stbi_uc  size[288];
-   stbi__uint16 value[288];
+   stbi_uc  size[STBI__ZNSYMS];
+   stbi__uint16 value[STBI__ZNSYMS];
 } stbi__zhuffman;
 
 stbi_inline static int stbi__bitreverse16(int n)
@@ -3514,7 +4119,7 @@ stbi_inline static int stbi__bit_reverse(int v, int bits)
    return stbi__bitreverse16(v) >> (16-bits);
 }
 
-static int stbi__zbuild_huffman(stbi__zhuffman *z, stbi_uc *sizelist, int num)
+static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist, int num)
 {
    int i,k=0;
    int code, next_code[16], sizes[17];
@@ -3581,16 +4186,23 @@ typedef struct
    stbi__zhuffman z_length, z_distance;
 } stbi__zbuf;
 
+stbi_inline static int stbi__zeof(stbi__zbuf *z)
+{
+   return (z->zbuffer >= z->zbuffer_end);
+}
+
 stbi_inline static stbi_uc stbi__zget8(stbi__zbuf *z)
 {
-   if (z->zbuffer >= z->zbuffer_end) return 0;
-   return *z->zbuffer++;
+   return stbi__zeof(z) ? 0 : *z->zbuffer++;
 }
 
 static void stbi__fill_bits(stbi__zbuf *z)
 {
    do {
-      STBI_ASSERT(z->code_buffer < (1U << z->num_bits));
+      if (z->code_buffer >= (1U << z->num_bits)) {
+        z->zbuffer = z->zbuffer_end;  /* treat this as EOF so we fail. */
+        return;
+      }
       z->code_buffer |= (unsigned int) stbi__zget8(z) << z->num_bits;
       z->num_bits += 8;
    } while (z->num_bits <= 24);
@@ -3615,10 +4227,11 @@ static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z)
    for (s=STBI__ZFAST_BITS+1; ; ++s)
       if (k < z->maxcode[s])
          break;
-   if (s == 16) return -1; // invalid code!
+   if (s >= 16) return -1; // invalid code!
    // code size is s, so:
    b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s];
-   STBI_ASSERT(z->size[b] == s);
+   if (b >= STBI__ZNSYMS) return -1; // some data was corrupt somewhere!
+   if (z->size[b] != s) return -1;  // was originally an assert, but report failure instead.
    a->code_buffer >>= s;
    a->num_bits -= s;
    return z->value[b];
@@ -3627,7 +4240,12 @@ static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z)
 stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z)
 {
    int b,s;
-   if (a->num_bits < 16) stbi__fill_bits(a);
+   if (a->num_bits < 16) {
+      if (stbi__zeof(a)) {
+         return -1;   /* report error for unexpected end of data. */
+      }
+      stbi__fill_bits(a);
+   }
    b = z->fast[a->code_buffer & STBI__ZFAST_MASK];
    if (b) {
       s = b >> 9;
@@ -3641,13 +4259,16 @@ stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z)
 static int stbi__zexpand(stbi__zbuf *z, char *zout, int n)  // need to make room for n bytes
 {
    char *q;
-   int cur, limit, old_limit;
+   unsigned int cur, limit, old_limit;
    z->zout = zout;
    if (!z->z_expandable) return stbi__err("output buffer limit","Corrupt PNG");
-   cur   = (int) (z->zout     - z->zout_start);
-   limit = old_limit = (int) (z->zout_end - z->zout_start);
-   while (cur + n > limit)
+   cur   = (unsigned int) (z->zout - z->zout_start);
+   limit = old_limit = (unsigned) (z->zout_end - z->zout_start);
+   if (UINT_MAX - cur < (unsigned) n) return stbi__err("outofmem", "Out of memory");
+   while (cur + n > limit) {
+      if(limit > UINT_MAX / 2) return stbi__err("outofmem", "Out of memory");
       limit *= 2;
+   }
    q = (char *) STBI_REALLOC_SIZED(z->zout_start, old_limit, limit);
    STBI_NOTUSED(old_limit);
    if (q == NULL) return stbi__err("outofmem", "Out of memory");
@@ -3657,18 +4278,18 @@ static int stbi__zexpand(stbi__zbuf *z, char *zout, int n)  // need to make room
    return 1;
 }
 
-static int stbi__zlength_base[31] = {
+static const int stbi__zlength_base[31] = {
    3,4,5,6,7,8,9,10,11,13,
    15,17,19,23,27,31,35,43,51,59,
    67,83,99,115,131,163,195,227,258,0,0 };
 
-static int stbi__zlength_extra[31]=
+static const int stbi__zlength_extra[31]=
 { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 };
 
-static int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
+static const int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
 257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0};
 
-static int stbi__zdist_extra[32] =
+static const int stbi__zdist_extra[32] =
 { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13};
 
 static int stbi__parse_huffman_block(stbi__zbuf *a)
@@ -3690,11 +4311,12 @@ static int stbi__parse_huffman_block(stbi__zbuf *a)
             a->zout = zout;
             return 1;
          }
+         if (z >= 286) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, length codes 286 and 287 must not appear in compressed data
          z -= 257;
          len = stbi__zlength_base[z];
          if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]);
          z = stbi__zhuffman_decode(a, &a->z_distance);
-         if (z < 0) return stbi__err("bad huffman code","Corrupt PNG");
+         if (z < 0 || z >= 30) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, distance codes 30 and 31 must not appear in compressed data
          dist = stbi__zdist_base[z];
          if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]);
          if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG");
@@ -3715,7 +4337,7 @@ static int stbi__parse_huffman_block(stbi__zbuf *a)
 
 static int stbi__compute_huffman_codes(stbi__zbuf *a)
 {
-   static stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 };
+   static const stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 };
    stbi__zhuffman z_codelength;
    stbi_uc lencodes[286+32+137];//padding for maximum single op
    stbi_uc codelength_sizes[19];
@@ -3724,6 +4346,7 @@ static int stbi__compute_huffman_codes(stbi__zbuf *a)
    int hlit  = stbi__zreceive(a,5) + 257;
    int hdist = stbi__zreceive(a,5) + 1;
    int hclen = stbi__zreceive(a,4) + 4;
+   int ntot  = hlit + hdist;
 
    memset(codelength_sizes, 0, sizeof(codelength_sizes));
    for (i=0; i < hclen; ++i) {
@@ -3733,27 +4356,30 @@ static int stbi__compute_huffman_codes(stbi__zbuf *a)
    if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0;
 
    n = 0;
-   while (n < hlit + hdist) {
+   while (n < ntot) {
       int c = stbi__zhuffman_decode(a, &z_codelength);
       if (c < 0 || c >= 19) return stbi__err("bad codelengths", "Corrupt PNG");
       if (c < 16)
          lencodes[n++] = (stbi_uc) c;
-      else if (c == 16) {
-         c = stbi__zreceive(a,2)+3;
-         memset(lencodes+n, lencodes[n-1], c);
-         n += c;
-      } else if (c == 17) {
-         c = stbi__zreceive(a,3)+3;
-         memset(lencodes+n, 0, c);
-         n += c;
-      } else {
-         STBI_ASSERT(c == 18);
-         c = stbi__zreceive(a,7)+11;
-         memset(lencodes+n, 0, c);
+      else {
+         stbi_uc fill = 0;
+         if (c == 16) {
+            c = stbi__zreceive(a,2)+3;
+            if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG");
+            fill = lencodes[n-1];
+         } else if (c == 17) {
+            c = stbi__zreceive(a,3)+3;
+         } else if (c == 18) {
+            c = stbi__zreceive(a,7)+11;
+         } else {
+            return stbi__err("bad codelengths", "Corrupt PNG");
+         }
+         if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG");
+         memset(lencodes+n, fill, c);
          n += c;
       }
    }
-   if (n != hlit+hdist) return stbi__err("bad codelengths","Corrupt PNG");
+   if (n != ntot) return stbi__err("bad codelengths","Corrupt PNG");
    if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0;
    if (!stbi__zbuild_huffman(&a->z_distance, lencodes+hlit, hdist)) return 0;
    return 1;
@@ -3772,7 +4398,7 @@ static int stbi__parse_uncompressed_block(stbi__zbuf *a)
       a->code_buffer >>= 8;
       a->num_bits -= 8;
    }
-   STBI_ASSERT(a->num_bits == 0);
+   if (a->num_bits < 0) return stbi__err("zlib corrupt","Corrupt PNG");
    // now fill header the normal way
    while (k < 4)
       header[k++] = stbi__zget8(a);
@@ -3794,6 +4420,7 @@ static int stbi__parse_zlib_header(stbi__zbuf *a)
    int cm    = cmf & 15;
    /* int cinfo = cmf >> 4; */
    int flg   = stbi__zget8(a);
+   if (stbi__zeof(a)) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec
    if ((cmf*256+flg) % 31 != 0) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec
    if (flg & 32) return stbi__err("no preset dict","Corrupt PNG"); // preset dictionary not allowed in png
    if (cm != 8) return stbi__err("bad compression","Corrupt PNG"); // DEFLATE required for png
@@ -3801,9 +4428,24 @@ static int stbi__parse_zlib_header(stbi__zbuf *a)
    return 1;
 }
 
-// @TODO: should statically initialize these for optimal thread safety
-static stbi_uc stbi__zdefault_length[288], stbi__zdefault_distance[32];
-static void stbi__init_zdefaults(void)
+static const stbi_uc stbi__zdefault_length[STBI__ZNSYMS] =
+{
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8
+};
+static const stbi_uc stbi__zdefault_distance[32] =
+{
+   5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
+};
+/*
+Init algorithm:
 {
    int i;   // use <= to match clearly with spec
    for (i=0; i <= 143; ++i)     stbi__zdefault_length[i]   = 8;
@@ -3813,6 +4455,7 @@ static void stbi__init_zdefaults(void)
 
    for (i=0; i <=  31; ++i)     stbi__zdefault_distance[i] = 5;
 }
+*/
 
 static int stbi__parse_zlib(stbi__zbuf *a, int parse_header)
 {
@@ -3831,8 +4474,7 @@ static int stbi__parse_zlib(stbi__zbuf *a, int parse_header)
       } else {
          if (type == 1) {
             // use fixed code lengths
-            if (!stbi__zdefault_distance[31]) stbi__init_zdefaults();
-            if (!stbi__zbuild_huffman(&a->z_length  , stbi__zdefault_length  , 288)) return 0;
+            if (!stbi__zbuild_huffman(&a->z_length  , stbi__zdefault_length  , STBI__ZNSYMS)) return 0;
             if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance,  32)) return 0;
          } else {
             if (!stbi__compute_huffman_codes(a)) return 0;
@@ -3956,7 +4598,7 @@ static stbi__pngchunk stbi__get_chunk_header(stbi__context *s)
 
 static int stbi__check_png_header(stbi__context *s)
 {
-   static stbi_uc png_sig[8] = { 137,80,78,71,13,10,26,10 };
+   static const stbi_uc png_sig[8] = { 137,80,78,71,13,10,26,10 };
    int i;
    for (i=0; i < 8; ++i)
       if (stbi__get8(s) != png_sig[i]) return stbi__err("bad png sig","Not a PNG");
@@ -4002,7 +4644,7 @@ static int stbi__paeth(int a, int b, int c)
    return c;
 }
 
-static stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 };
+static const stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 };
 
 // create the png data from post-deflated data
 static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color)
@@ -4019,31 +4661,33 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
    int width = x;
 
    STBI_ASSERT(out_n == s->img_n || out_n == s->img_n+1);
-   a->out = (stbi_uc *) stbi__malloc(x * y * output_bytes); // extra bytes to write off the end into
+   a->out = (stbi_uc *) stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into
    if (!a->out) return stbi__err("outofmem", "Out of memory");
 
+   if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) return stbi__err("too large", "Corrupt PNG");
    img_width_bytes = (((img_n * x * depth) + 7) >> 3);
    img_len = (img_width_bytes + 1) * y;
-   if (s->img_x == x && s->img_y == y) {
-      if (raw_len != img_len) return stbi__err("not enough pixels","Corrupt PNG");
-   } else { // interlaced:
-      if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG");
-   }
+
+   // we used to check for exact match between raw_len and img_len on non-interlaced PNGs,
+   // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros),
+   // so just check for raw_len < img_len always.
+   if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG");
 
    for (j=0; j < y; ++j) {
       stbi_uc *cur = a->out + stride*j;
-      stbi_uc *prior = cur - stride;
+      stbi_uc *prior;
       int filter = *raw++;
 
       if (filter > 4)
          return stbi__err("invalid filter","Corrupt PNG");
 
       if (depth < 8) {
-         STBI_ASSERT(img_width_bytes <= x);
+         if (img_width_bytes > x) return stbi__err("invalid width","Corrupt PNG");
          cur += x*out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place
          filter_bytes = 1;
          width = img_width_bytes;
       }
+      prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above
 
       // if first row, use special filter that doesn't sample previous row
       if (j == 0) filter = first_row_filter[filter];
@@ -4084,37 +4728,37 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
       // this is a little gross, so that we don't switch per-pixel or per-component
       if (depth < 8 || img_n == out_n) {
          int nk = (width - 1)*filter_bytes;
-         #define CASE(f) \
+         #define STBI__CASE(f) \
              case f:     \
                 for (k=0; k < nk; ++k)
          switch (filter) {
             // "none" filter turns into a memcpy here; make that explicit.
             case STBI__F_none:         memcpy(cur, raw, nk); break;
-            CASE(STBI__F_sub)          cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); break;
-            CASE(STBI__F_up)           cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break;
-            CASE(STBI__F_avg)          cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); break;
-            CASE(STBI__F_paeth)        cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); break;
-            CASE(STBI__F_avg_first)    cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); break;
-            CASE(STBI__F_paeth_first)  cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],0,0)); break;
+            STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); } break;
+            STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
+            STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); } break;
+            STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); } break;
+            STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); } break;
+            STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],0,0)); } break;
          }
-         #undef CASE
+         #undef STBI__CASE
          raw += nk;
       } else {
          STBI_ASSERT(img_n+1 == out_n);
-         #define CASE(f) \
+         #define STBI__CASE(f) \
              case f:     \
                 for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \
                    for (k=0; k < filter_bytes; ++k)
          switch (filter) {
-            CASE(STBI__F_none)         cur[k] = raw[k]; break;
-            CASE(STBI__F_sub)          cur[k] = STBI__BYTECAST(raw[k] + cur[k- output_bytes]); break;
-            CASE(STBI__F_up)           cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break;
-            CASE(STBI__F_avg)          cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k- output_bytes])>>1)); break;
-            CASE(STBI__F_paeth)        cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],prior[k],prior[k- output_bytes])); break;
-            CASE(STBI__F_avg_first)    cur[k] = STBI__BYTECAST(raw[k] + (cur[k- output_bytes] >> 1)); break;
-            CASE(STBI__F_paeth_first)  cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],0,0)); break;
+            STBI__CASE(STBI__F_none)         { cur[k] = raw[k]; } break;
+            STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k- output_bytes]); } break;
+            STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
+            STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k- output_bytes])>>1)); } break;
+            STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],prior[k],prior[k- output_bytes])); } break;
+            STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k- output_bytes] >> 1)); } break;
+            STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],0,0)); } break;
          }
-         #undef CASE
+         #undef STBI__CASE
 
          // the loop above sets the high byte of the pixels' alpha, but for
          // 16 bit png files we also need the low byte set. we'll do that here.
@@ -4217,13 +4861,16 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
 
 static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint32 image_data_len, int out_n, int depth, int color, int interlaced)
 {
+   int bytes = (depth == 16 ? 2 : 1);
+   int out_bytes = out_n * bytes;
    stbi_uc *final;
    int p;
    if (!interlaced)
       return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color);
 
    // de-interlacing
-   final = (stbi_uc *) stbi__malloc(a->s->img_x * a->s->img_y * out_n);
+   final = (stbi_uc *) stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
+   if (!final) return stbi__err("outofmem", "Out of memory");
    for (p=0; p < 7; ++p) {
       int xorig[] = { 0,4,0,2,0,1,0 };
       int yorig[] = { 0,0,4,0,2,0,1 };
@@ -4243,8 +4890,8 @@ static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint3
             for (i=0; i < x; ++i) {
                int out_y = j*yspc[p]+yorig[p];
                int out_x = i*xspc[p]+xorig[p];
-               memcpy(final + out_y*a->s->img_x*out_n + out_x*out_n,
-                      a->out + (j*x+i)*out_n, out_n);
+               memcpy(final + out_y*a->s->img_x*out_bytes + out_x*out_bytes,
+                      a->out + (j*x+i)*out_bytes, out_bytes);
             }
          }
          STBI_FREE(a->out);
@@ -4312,7 +4959,7 @@ static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int
    stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y;
    stbi_uc *p, *temp_out, *orig = a->out;
 
-   p = (stbi_uc *) stbi__malloc(pixel_count * pal_img_n);
+   p = (stbi_uc *) stbi__malloc_mad2(pixel_count, pal_img_n, 0);
    if (p == NULL) return stbi__err("outofmem", "Out of memory");
 
    // between here and free(out) below, exitting would leak
@@ -4344,39 +4991,46 @@ static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int
    return 1;
 }
 
-static int stbi__reduce_png(stbi__png *p)
-{
-   int i;
-   int img_len = p->s->img_x * p->s->img_y * p->s->img_out_n;
-   stbi_uc *reduced;
-   stbi__uint16 *orig = (stbi__uint16*)p->out;
-
-   if (p->depth != 16) return 1; // don't need to do anything if not 16-bit data
+static int stbi__unpremultiply_on_load_global = 0;
+static int stbi__de_iphone_flag_global = 0;
 
-   reduced = (stbi_uc *)stbi__malloc(img_len);
-   if (p == NULL) return stbi__err("outofmem", "Out of memory");
-
-   for (i = 0; i < img_len; ++i) reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is a decent approx of 16->8 bit scaling
-
-   p->out = reduced;
-   STBI_FREE(orig);
+STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply)
+{
+   stbi__unpremultiply_on_load_global = flag_true_if_should_unpremultiply;
+}
 
-   return 1;
+STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert)
+{
+   stbi__de_iphone_flag_global = flag_true_if_should_convert;
 }
 
-static int stbi__unpremultiply_on_load = 0;
-static int stbi__de_iphone_flag = 0;
+#ifndef STBI_THREAD_LOCAL
+#define stbi__unpremultiply_on_load  stbi__unpremultiply_on_load_global
+#define stbi__de_iphone_flag  stbi__de_iphone_flag_global
+#else
+static STBI_THREAD_LOCAL int stbi__unpremultiply_on_load_local, stbi__unpremultiply_on_load_set;
+static STBI_THREAD_LOCAL int stbi__de_iphone_flag_local, stbi__de_iphone_flag_set;
 
-STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply)
+STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply)
 {
-   stbi__unpremultiply_on_load = flag_true_if_should_unpremultiply;
+   stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply;
+   stbi__unpremultiply_on_load_set = 1;
 }
 
-STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert)
+STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert)
 {
-   stbi__de_iphone_flag = flag_true_if_should_convert;
+   stbi__de_iphone_flag_local = flag_true_if_should_convert;
+   stbi__de_iphone_flag_set = 1;
 }
 
+#define stbi__unpremultiply_on_load  (stbi__unpremultiply_on_load_set           \
+                                       ? stbi__unpremultiply_on_load_local      \
+                                       : stbi__unpremultiply_on_load_global)
+#define stbi__de_iphone_flag  (stbi__de_iphone_flag_set                         \
+                                ? stbi__de_iphone_flag_local                    \
+                                : stbi__de_iphone_flag_global)
+#endif // STBI_THREAD_LOCAL
+
 static void stbi__de_iphone(stbi__png *z)
 {
    stbi__context *s = z->s;
@@ -4398,9 +5052,10 @@ static void stbi__de_iphone(stbi__png *z)
             stbi_uc a = p[3];
             stbi_uc t = p[0];
             if (a) {
-               p[0] = p[2] * 255 / a;
-               p[1] = p[1] * 255 / a;
-               p[2] =  t   * 255 / a;
+               stbi_uc half = a / 2;
+               p[0] = (p[2] * 255 + half) / a;
+               p[1] = (p[1] * 255 + half) / a;
+               p[2] = ( t   * 255 + half) / a;
             } else {
                p[0] = p[2];
                p[2] = t;
@@ -4419,12 +5074,12 @@ static void stbi__de_iphone(stbi__png *z)
    }
 }
 
-#define STBI__PNG_TYPE(a,b,c,d)  (((a) << 24) + ((b) << 16) + ((c) << 8) + (d))
+#define STBI__PNG_TYPE(a,b,c,d)  (((unsigned) (a) << 24) + ((unsigned) (b) << 16) + ((unsigned) (c) << 8) + (unsigned) (d))
 
 static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
 {
    stbi_uc palette[1024], pal_img_n=0;
-   stbi_uc has_trans=0, tc[3];
+   stbi_uc has_trans=0, tc[3]={0};
    stbi__uint16 tc16[3];
    stbi__uint32 ioff=0, idata_limit=0, i, pal_len=0;
    int first=1,k,interlace=0, color=0, is_iphone=0;
@@ -4450,11 +5105,13 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
             if (!first) return stbi__err("multiple IHDR","Corrupt PNG");
             first = 0;
             if (c.length != 13) return stbi__err("bad IHDR len","Corrupt PNG");
-            s->img_x = stbi__get32be(s); if (s->img_x > (1 << 24)) return stbi__err("too large","Very large image (corrupt?)");
-            s->img_y = stbi__get32be(s); if (s->img_y > (1 << 24)) return stbi__err("too large","Very large image (corrupt?)");
+            s->img_x = stbi__get32be(s);
+            s->img_y = stbi__get32be(s);
+            if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
+            if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
             z->depth = stbi__get8(s);  if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16)  return stbi__err("1/2/4/8/16-bit only","PNG not supported: 1/2/4/8/16-bit only");
             color = stbi__get8(s);  if (color > 6)         return stbi__err("bad ctype","Corrupt PNG");
-			if (color == 3 && z->depth == 16)                  return stbi__err("bad ctype","Corrupt PNG");
+            if (color == 3 && z->depth == 16)                  return stbi__err("bad ctype","Corrupt PNG");
             if (color == 3) pal_img_n = 3; else if (color & 1) return stbi__err("bad ctype","Corrupt PNG");
             comp  = stbi__get8(s);  if (comp) return stbi__err("bad comp method","Corrupt PNG");
             filter= stbi__get8(s);  if (filter) return stbi__err("bad filter method","Corrupt PNG");
@@ -4463,14 +5120,13 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
             if (!pal_img_n) {
                s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
                if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode");
-               if (scan == STBI__SCAN_header) return 1;
             } else {
                // if paletted, then pal_n is our final components, and
                // img_n is # components to decompress/filter.
                s->img_n = 1;
                if ((1 << 30) / s->img_x / 4 < s->img_y) return stbi__err("too large","Corrupt PNG");
-               // if SCAN_header, have to scan to see if we have a tRNS
             }
+            // even with SCAN_header, have to scan to see if we have a tRNS
             break;
          }
 
@@ -4502,6 +5158,8 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
                if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG");
                if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG");
                has_trans = 1;
+               // non-paletted with tRNS = constant alpha. if header-scanning, we can stop now.
+               if (scan == STBI__SCAN_header) { ++s->img_n; return 1; }
                if (z->depth == 16) {
                   for (k = 0; k < s->img_n; ++k) tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
                } else {
@@ -4514,7 +5172,13 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
          case STBI__PNG_TYPE('I','D','A','T'): {
             if (first) return stbi__err("first not IHDR", "Corrupt PNG");
             if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG");
-            if (scan == STBI__SCAN_header) { s->img_n = pal_img_n; return 1; }
+            if (scan == STBI__SCAN_header) {
+               // header scan definitely stops at first IDAT
+               if (pal_img_n)
+                  s->img_n = pal_img_n;
+               return 1;
+            }
+            if (c.length > (1u << 30)) return stbi__err("IDAT size limit", "IDAT section larger than 2^30 bytes");
             if ((int)(ioff + c.length) < (int)ioff) return 0;
             if (ioff + c.length > idata_limit) {
                stbi__uint32 idata_limit_old = idata_limit;
@@ -4563,8 +5227,13 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
                if (req_comp >= 3) s->img_out_n = req_comp;
                if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n))
                   return 0;
+            } else if (has_trans) {
+               // non-paletted image with tRNS -> source image has (constant) alpha
+               ++s->img_n;
             }
             STBI_FREE(z->expanded); z->expanded = NULL;
+            // end of PNG chunk, read and skip CRC
+            stbi__get32be(s);
             return 1;
          }
 
@@ -4590,20 +5259,24 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
    }
 }
 
-static unsigned char *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp)
+static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp, stbi__result_info *ri)
 {
-   unsigned char *result=NULL;
+   void *result=NULL;
    if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
    if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) {
-      if (p->depth == 16) {
-         if (!stbi__reduce_png(p)) {
-            return result;
-         }
-      }
+      if (p->depth <= 8)
+         ri->bits_per_channel = 8;
+      else if (p->depth == 16)
+         ri->bits_per_channel = 16;
+      else
+         return stbi__errpuc("bad bits_per_channel", "PNG not supported: unsupported color depth");
       result = p->out;
       p->out = NULL;
       if (req_comp && req_comp != p->s->img_out_n) {
-         result = stbi__convert_format(result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
+         if (ri->bits_per_channel == 8)
+            result = stbi__convert_format((unsigned char *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
+         else
+            result = stbi__convert_format16((stbi__uint16 *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
          p->s->img_out_n = req_comp;
          if (result == NULL) return result;
       }
@@ -4618,11 +5291,11 @@ static unsigned char *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req
    return result;
 }
 
-static unsigned char *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 {
    stbi__png p;
    p.s = s;
-   return stbi__do_png(&p, x,y,comp,req_comp);
+   return stbi__do_png(&p, x,y,comp,req_comp, ri);
 }
 
 static int stbi__png_test(stbi__context *s)
@@ -4651,6 +5324,19 @@ static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp)
    p.s = s;
    return stbi__png_info_raw(&p, x, y, comp);
 }
+
+static int stbi__png_is16(stbi__context *s)
+{
+   stbi__png p;
+   p.s = s;
+   if (!stbi__png_info_raw(&p, NULL, NULL, NULL))
+	   return 0;
+   if (p.depth != 16) {
+      stbi__rewind(p.s);
+      return 0;
+   }
+   return 1;
+}
 #endif
 
 // Microsoft/Windows BMP image
@@ -4684,11 +5370,11 @@ static int stbi__high_bit(unsigned int z)
 {
    int n=0;
    if (z == 0) return -1;
-   if (z >= 0x10000) n += 16, z >>= 16;
-   if (z >= 0x00100) n +=  8, z >>=  8;
-   if (z >= 0x00010) n +=  4, z >>=  4;
-   if (z >= 0x00004) n +=  2, z >>=  2;
-   if (z >= 0x00002) n +=  1, z >>=  1;
+   if (z >= 0x10000) { n += 16; z >>= 16; }
+   if (z >= 0x00100) { n +=  8; z >>=  8; }
+   if (z >= 0x00010) { n +=  4; z >>=  4; }
+   if (z >= 0x00004) { n +=  2; z >>=  2; }
+   if (z >= 0x00002) { n +=  1;/* >>=  1;*/ }
    return n;
 }
 
@@ -4702,29 +5388,62 @@ static int stbi__bitcount(unsigned int a)
    return a & 0xff;
 }
 
-static int stbi__shiftsigned(int v, int shift, int bits)
-{
-   int result;
-   int z=0;
-
-   if (shift < 0) v <<= -shift;
-   else v >>= shift;
-   result = v;
-
-   z = bits;
-   while (z < 8) {
-      result += v >> z;
-      z += bits;
-   }
-   return result;
+// extract an arbitrarily-aligned N-bit value (N=bits)
+// from v, and then make it 8-bits long and fractionally
+// extend it to full full range.
+static int stbi__shiftsigned(unsigned int v, int shift, int bits)
+{
+   static unsigned int mul_table[9] = {
+      0,
+      0xff/*0b11111111*/, 0x55/*0b01010101*/, 0x49/*0b01001001*/, 0x11/*0b00010001*/,
+      0x21/*0b00100001*/, 0x41/*0b01000001*/, 0x81/*0b10000001*/, 0x01/*0b00000001*/,
+   };
+   static unsigned int shift_table[9] = {
+      0, 0,0,1,0,2,4,6,0,
+   };
+   if (shift < 0)
+      v <<= -shift;
+   else
+      v >>= shift;
+   STBI_ASSERT(v < 256);
+   v >>= (8-bits);
+   STBI_ASSERT(bits >= 0 && bits <= 8);
+   return (int) ((unsigned) v * mul_table[bits]) >> shift_table[bits];
 }
 
 typedef struct
 {
    int bpp, offset, hsz;
    unsigned int mr,mg,mb,ma, all_a;
+   int extra_read;
 } stbi__bmp_data;
 
+static int stbi__bmp_set_mask_defaults(stbi__bmp_data *info, int compress)
+{
+   // BI_BITFIELDS specifies masks explicitly, don't override
+   if (compress == 3)
+      return 1;
+
+   if (compress == 0) {
+      if (info->bpp == 16) {
+         info->mr = 31u << 10;
+         info->mg = 31u <<  5;
+         info->mb = 31u <<  0;
+      } else if (info->bpp == 32) {
+         info->mr = 0xffu << 16;
+         info->mg = 0xffu <<  8;
+         info->mb = 0xffu <<  0;
+         info->ma = 0xffu << 24;
+         info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0
+      } else {
+         // otherwise, use defaults, which is all-0
+         info->mr = info->mg = info->mb = info->ma = 0;
+      }
+      return 1;
+   }
+   return 0; // error
+}
+
 static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
 {
    int hsz;
@@ -4735,7 +5454,10 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
    info->offset = stbi__get32le(s);
    info->hsz = hsz = stbi__get32le(s);
    info->mr = info->mg = info->mb = info->ma = 0;
-   
+   info->extra_read = 14;
+
+   if (info->offset < 0) return stbi__errpuc("bad BMP", "bad BMP");
+
    if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not supported: unknown");
    if (hsz == 12) {
       s->img_x = stbi__get16le(s);
@@ -4746,10 +5468,11 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
    }
    if (stbi__get16le(s) != 1) return stbi__errpuc("bad BMP", "bad BMP");
    info->bpp = stbi__get16le(s);
-   if (info->bpp == 1) return stbi__errpuc("monochrome", "BMP type not supported: 1-bit");
    if (hsz != 12) {
       int compress = stbi__get32le(s);
       if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE");
+      if (compress >= 4) return stbi__errpuc("BMP JPEG/PNG", "BMP type not supported: unsupported compression"); // this includes PNG/JPEG modes
+      if (compress == 3 && info->bpp != 16 && info->bpp != 32) return stbi__errpuc("bad BMP", "bad BMP"); // bitfields requires 16 or 32 bits/pixel
       stbi__get32le(s); // discard sizeof
       stbi__get32le(s); // discard hres
       stbi__get32le(s); // discard vres
@@ -4764,21 +5487,12 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
          }
          if (info->bpp == 16 || info->bpp == 32) {
             if (compress == 0) {
-               if (info->bpp == 32) {
-                  info->mr = 0xffu << 16;
-                  info->mg = 0xffu <<  8;
-                  info->mb = 0xffu <<  0;
-                  info->ma = 0xffu << 24;
-                  info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0
-               } else {
-                  info->mr = 31u << 10;
-                  info->mg = 31u <<  5;
-                  info->mb = 31u <<  0;
-               }
+               stbi__bmp_set_mask_defaults(info, compress);
             } else if (compress == 3) {
                info->mr = stbi__get32le(s);
                info->mg = stbi__get32le(s);
                info->mb = stbi__get32le(s);
+               info->extra_read += 12;
                // not documented, but generated by photoshop and handled by mspaint
                if (info->mr == info->mg && info->mg == info->mb) {
                   // ?!?!?
@@ -4788,6 +5502,7 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
                return stbi__errpuc("bad BMP", "bad BMP");
          }
       } else {
+         // V4/V5 header
          int i;
          if (hsz != 108 && hsz != 124)
             return stbi__errpuc("bad BMP", "bad BMP");
@@ -4795,6 +5510,8 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
          info->mg = stbi__get32le(s);
          info->mb = stbi__get32le(s);
          info->ma = stbi__get32le(s);
+         if (compress != 3) // override mr/mg/mb unless in BI_BITFIELDS mode, as per docs
+            stbi__bmp_set_mask_defaults(info, compress);
          stbi__get32le(s); // discard color space
          for (i=0; i < 12; ++i)
             stbi__get32le(s); // discard color space parameters
@@ -4806,11 +5523,11 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
          }
       }
    }
-   return (void *)(size_t) 1;
+   return (void *) 1;
 }
 
 
-static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 {
    stbi_uc *out;
    unsigned int mr=0,mg=0,mb=0,ma=0, all_a;
@@ -4818,14 +5535,18 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
    int psize=0,i,j,width;
    int flip_vertically, pad, target;
    stbi__bmp_data info;
+   STBI_NOTUSED(ri);
 
-   info.all_a = 255;   
+   info.all_a = 255;
    if (stbi__bmp_parse_header(s, &info) == NULL)
       return NULL; // error code already set
 
    flip_vertically = ((int) s->img_y) > 0;
    s->img_y = abs((int) s->img_y);
 
+   if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+
    mr = info.mr;
    mg = info.mg;
    mb = info.mb;
@@ -4834,19 +5555,45 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
 
    if (info.hsz == 12) {
       if (info.bpp < 24)
-         psize = (info.offset - 14 - 24) / 3;
+         psize = (info.offset - info.extra_read - 24) / 3;
    } else {
       if (info.bpp < 16)
-         psize = (info.offset - 14 - info.hsz) >> 2;
+         psize = (info.offset - info.extra_read - info.hsz) >> 2;
+   }
+   if (psize == 0) {
+      // accept some number of extra bytes after the header, but if the offset points either to before
+      // the header ends or implies a large amount of extra data, reject the file as malformed
+      int bytes_read_so_far = s->callback_already_read + (int)(s->img_buffer - s->img_buffer_original);
+      int header_limit = 1024; // max we actually read is below 256 bytes currently.
+      int extra_data_limit = 256*4; // what ordinarily goes here is a palette; 256 entries*4 bytes is its max size.
+      if (bytes_read_so_far <= 0 || bytes_read_so_far > header_limit) {
+         return stbi__errpuc("bad header", "Corrupt BMP");
+      }
+      // we established that bytes_read_so_far is positive and sensible.
+      // the first half of this test rejects offsets that are either too small positives, or
+      // negative, and guarantees that info.offset >= bytes_read_so_far > 0. this in turn
+      // ensures the number computed in the second half of the test can't overflow.
+      if (info.offset < bytes_read_so_far || info.offset - bytes_read_so_far > extra_data_limit) {
+         return stbi__errpuc("bad offset", "Corrupt BMP");
+      } else {
+         stbi__skip(s, info.offset - bytes_read_so_far);
+      }
    }
 
-   s->img_n = ma ? 4 : 3;
+   if (info.bpp == 24 && ma == 0xff000000)
+      s->img_n = 3;
+   else
+      s->img_n = ma ? 4 : 3;
    if (req_comp && req_comp >= 3) // we can directly decode 3 or 4
       target = req_comp;
    else
       target = s->img_n; // if they want monochrome, we'll post-convert
 
-   out = (stbi_uc *) stbi__malloc(target * s->img_x * s->img_y);
+   // sanity-check size
+   if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0))
+      return stbi__errpuc("too large", "Corrupt BMP");
+
+   out = (stbi_uc *) stbi__malloc_mad3(target, s->img_x, s->img_y, 0);
    if (!out) return stbi__errpuc("outofmem", "Out of memory");
    if (info.bpp < 16) {
       int z=0;
@@ -4858,36 +5605,56 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
          if (info.hsz != 12) stbi__get8(s);
          pal[i][3] = 255;
       }
-      stbi__skip(s, info.offset - 14 - info.hsz - psize * (info.hsz == 12 ? 3 : 4));
-      if (info.bpp == 4) width = (s->img_x + 1) >> 1;
+      stbi__skip(s, info.offset - info.extra_read - info.hsz - psize * (info.hsz == 12 ? 3 : 4));
+      if (info.bpp == 1) width = (s->img_x + 7) >> 3;
+      else if (info.bpp == 4) width = (s->img_x + 1) >> 1;
       else if (info.bpp == 8) width = s->img_x;
       else { STBI_FREE(out); return stbi__errpuc("bad bpp", "Corrupt BMP"); }
       pad = (-width)&3;
-      for (j=0; j < (int) s->img_y; ++j) {
-         for (i=0; i < (int) s->img_x; i += 2) {
-            int v=stbi__get8(s),v2=0;
-            if (info.bpp == 4) {
-               v2 = v & 15;
-               v >>= 4;
+      if (info.bpp == 1) {
+         for (j=0; j < (int) s->img_y; ++j) {
+            int bit_offset = 7, v = stbi__get8(s);
+            for (i=0; i < (int) s->img_x; ++i) {
+               int color = (v>>bit_offset)&0x1;
+               out[z++] = pal[color][0];
+               out[z++] = pal[color][1];
+               out[z++] = pal[color][2];
+               if (target == 4) out[z++] = 255;
+               if (i+1 == (int) s->img_x) break;
+               if((--bit_offset) < 0) {
+                  bit_offset = 7;
+                  v = stbi__get8(s);
+               }
             }
-            out[z++] = pal[v][0];
-            out[z++] = pal[v][1];
-            out[z++] = pal[v][2];
-            if (target == 4) out[z++] = 255;
-            if (i+1 == (int) s->img_x) break;
-            v = (info.bpp == 8) ? stbi__get8(s) : v2;
-            out[z++] = pal[v][0];
-            out[z++] = pal[v][1];
-            out[z++] = pal[v][2];
-            if (target == 4) out[z++] = 255;
+            stbi__skip(s, pad);
+         }
+      } else {
+         for (j=0; j < (int) s->img_y; ++j) {
+            for (i=0; i < (int) s->img_x; i += 2) {
+               int v=stbi__get8(s),v2=0;
+               if (info.bpp == 4) {
+                  v2 = v & 15;
+                  v >>= 4;
+               }
+               out[z++] = pal[v][0];
+               out[z++] = pal[v][1];
+               out[z++] = pal[v][2];
+               if (target == 4) out[z++] = 255;
+               if (i+1 == (int) s->img_x) break;
+               v = (info.bpp == 8) ? stbi__get8(s) : v2;
+               out[z++] = pal[v][0];
+               out[z++] = pal[v][1];
+               out[z++] = pal[v][2];
+               if (target == 4) out[z++] = 255;
+            }
+            stbi__skip(s, pad);
          }
-         stbi__skip(s, pad);
       }
    } else {
       int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0;
       int z = 0;
       int easy=0;
-      stbi__skip(s, info.offset - 14 - info.hsz);
+      stbi__skip(s, info.offset - info.extra_read - info.hsz);
       if (info.bpp == 24) width = 3 * s->img_x;
       else if (info.bpp == 16) width = 2*s->img_x;
       else /* bpp = 32 and pad = 0 */ width=0;
@@ -4905,6 +5672,7 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
          gshift = stbi__high_bit(mg)-7; gcount = stbi__bitcount(mg);
          bshift = stbi__high_bit(mb)-7; bcount = stbi__bitcount(mb);
          ashift = stbi__high_bit(ma)-7; acount = stbi__bitcount(ma);
+         if (rcount > 8 || gcount > 8 || bcount > 8 || acount > 8) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); }
       }
       for (j=0; j < (int) s->img_y; ++j) {
          if (easy) {
@@ -4922,7 +5690,7 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
             int bpp = info.bpp;
             for (i=0; i < (int) s->img_x; ++i) {
                stbi__uint32 v = (bpp == 16 ? (stbi__uint32) stbi__get16le(s) : stbi__get32le(s));
-               int a;
+               unsigned int a;
                out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount));
                out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount));
                out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount));
@@ -4934,7 +5702,7 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
          stbi__skip(s, pad);
       }
    }
-   
+
    // if alpha channel is all 0s, replace with all 255s
    if (target == 4 && all_a == 0)
       for (i=4*s->img_x*s->img_y-1; i >= 0; i -= 4)
@@ -4946,7 +5714,7 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
          stbi_uc *p1 = out +      j     *s->img_x*target;
          stbi_uc *p2 = out + (s->img_y-1-j)*s->img_x*target;
          for (i=0; i < (int) s->img_x*target; ++i) {
-            t = p1[i], p1[i] = p2[i], p2[i] = t;
+            t = p1[i]; p1[i] = p2[i]; p2[i] = t;
          }
       }
    }
@@ -4970,14 +5738,14 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
 static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16)
 {
    // only RGB or RGBA (incl. 16bit) or grey allowed
-   if(is_rgb16) *is_rgb16 = 0;
+   if (is_rgb16) *is_rgb16 = 0;
    switch(bits_per_pixel) {
       case 8:  return STBI_grey;
       case 16: if(is_grey) return STBI_grey_alpha;
-            // else: fall-through
+               // fallthrough
       case 15: if(is_rgb16) *is_rgb16 = 1;
-            return STBI_rgb;
-      case 24: // fall-through
+               return STBI_rgb;
+      case 24: // fallthrough
       case 32: return bits_per_pixel/8;
       default: return 0;
    }
@@ -5080,7 +5848,7 @@ static int stbi__tga_test(stbi__context *s)
 }
 
 // read 16bit value and convert to 24bit RGB
-void stbi__tga_read_rgb16(stbi__context *s, stbi_uc* out)
+static void stbi__tga_read_rgb16(stbi__context *s, stbi_uc* out)
 {
    stbi__uint16 px = (stbi__uint16)stbi__get16le(s);
    stbi__uint16 fiveBitMask = 31;
@@ -5089,9 +5857,9 @@ void stbi__tga_read_rgb16(stbi__context *s, stbi_uc* out)
    int g = (px >> 5) & fiveBitMask;
    int b = px & fiveBitMask;
    // Note that this saves the data in RGB(A) order, so it doesn't need to be swapped later
-   out[0] = (stbi_uc)(r * 255)/31;
-   out[1] = (stbi_uc)(g * 255)/31;
-   out[2] = (stbi_uc)(b * 255)/31;
+   out[0] = (stbi_uc)((r * 255)/31);
+   out[1] = (stbi_uc)((g * 255)/31);
+   out[2] = (stbi_uc)((b * 255)/31);
 
    // some people claim that the most significant bit might be used for alpha
    // (possibly if an alpha-bit is set in the "image descriptor byte")
@@ -5099,7 +5867,7 @@ void stbi__tga_read_rgb16(stbi__context *s, stbi_uc* out)
    // so let's treat all 15 and 16bit TGAs as RGB with no alpha.
 }
 
-static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 {
    //   read in the TGA header stuff
    int tga_offset = stbi__get8(s);
@@ -5121,10 +5889,16 @@ static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int
    unsigned char *tga_data;
    unsigned char *tga_palette = NULL;
    int i, j;
-   unsigned char raw_data[4];
+   unsigned char raw_data[4] = {0};
    int RLE_count = 0;
    int RLE_repeating = 0;
    int read_next_pixel = 1;
+   STBI_NOTUSED(ri);
+   STBI_NOTUSED(tga_x_origin); // @TODO
+   STBI_NOTUSED(tga_y_origin); // @TODO
+
+   if (tga_height > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (tga_width > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
 
    //   do a tiny bit of precessing
    if ( tga_image_type >= 8 )
@@ -5146,7 +5920,10 @@ static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int
    *y = tga_height;
    if (comp) *comp = tga_comp;
 
-   tga_data = (unsigned char*)stbi__malloc( (size_t)tga_width * tga_height * tga_comp );
+   if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0))
+      return stbi__errpuc("too large", "Corrupt TGA");
+
+   tga_data = (unsigned char*)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0);
    if (!tga_data) return stbi__errpuc("outofmem", "Out of memory");
 
    // skip to the data's starting position (offset usually = 0)
@@ -5162,10 +5939,15 @@ static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int
       //   do I need to load a palette?
       if ( tga_indexed)
       {
+         if (tga_palette_len == 0) {  /* you have to have at least one entry! */
+            STBI_FREE(tga_data);
+            return stbi__errpuc("bad palette", "Corrupt TGA");
+         }
+
          //   any data to skip? (offset usually = 0)
          stbi__skip(s, tga_palette_start );
          //   load the palette
-         tga_palette = (unsigned char*)stbi__malloc( tga_palette_len * tga_comp );
+         tga_palette = (unsigned char*)stbi__malloc_mad2(tga_palette_len, tga_comp, 0);
          if (!tga_palette) {
             STBI_FREE(tga_data);
             return stbi__errpuc("outofmem", "Out of memory");
@@ -5285,6 +6067,7 @@ static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int
    //   Microsoft's C compilers happy... [8^(
    tga_palette_start = tga_palette_len = tga_palette_bits =
          tga_x_origin = tga_y_origin = 0;
+   STBI_NOTUSED(tga_palette_start);
    //   OK, done
    return tga_data;
 }
@@ -5301,14 +6084,53 @@ static int stbi__psd_test(stbi__context *s)
    return r;
 }
 
-static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static int stbi__psd_decode_rle(stbi__context *s, stbi_uc *p, int pixelCount)
 {
-   int   pixelCount;
+   int count, nleft, len;
+
+   count = 0;
+   while ((nleft = pixelCount - count) > 0) {
+      len = stbi__get8(s);
+      if (len == 128) {
+         // No-op.
+      } else if (len < 128) {
+         // Copy next len+1 bytes literally.
+         len++;
+         if (len > nleft) return 0; // corrupt data
+         count += len;
+         while (len) {
+            *p = stbi__get8(s);
+            p += 4;
+            len--;
+         }
+      } else if (len > 128) {
+         stbi_uc   val;
+         // Next -len+1 bytes in the dest are replicated from next source byte.
+         // (Interpret len as a negative 8-bit int.)
+         len = 257 - len;
+         if (len > nleft) return 0; // corrupt data
+         val = stbi__get8(s);
+         count += len;
+         while (len) {
+            *p = val;
+            p += 4;
+            len--;
+         }
+      }
+   }
+
+   return 1;
+}
+
+static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
+{
+   int pixelCount;
    int channelCount, compression;
-   int channel, i, count, len;
+   int channel, i;
    int bitdepth;
    int w,h;
    stbi_uc *out;
+   STBI_NOTUSED(ri);
 
    // Check identifier
    if (stbi__get32be(s) != 0x38425053)   // "8BPS"
@@ -5330,6 +6152,9 @@ static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int
    h = stbi__get32be(s);
    w = stbi__get32be(s);
 
+   if (h > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (w > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+
    // Make sure the depth is 8 bits.
    bitdepth = stbi__get16be(s);
    if (bitdepth != 8 && bitdepth != 16)
@@ -5365,8 +6190,18 @@ static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int
    if (compression > 1)
       return stbi__errpuc("bad compression", "PSD has an unknown compression format");
 
+   // Check size
+   if (!stbi__mad3sizes_valid(4, w, h, 0))
+      return stbi__errpuc("too large", "Corrupt PSD");
+
    // Create the destination image.
-   out = (stbi_uc *) stbi__malloc(4 * w*h);
+
+   if (!compression && bitdepth == 16 && bpc == 16) {
+      out = (stbi_uc *) stbi__malloc_mad3(8, w, h, 0);
+      ri->bits_per_channel = 16;
+   } else
+      out = (stbi_uc *) stbi__malloc(4 * w*h);
+
    if (!out) return stbi__errpuc("outofmem", "Out of memory");
    pixelCount = w*h;
 
@@ -5383,7 +6218,7 @@ static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int
       //     Else if n is 128, noop.
       // Endloop
 
-      // The RLE-compressed data is preceeded by a 2-byte data count for each row in the data,
+      // The RLE-compressed data is preceded by a 2-byte data count for each row in the data,
       // which we're going to just skip.
       stbi__skip(s, h * channelCount * 2 );
 
@@ -5398,82 +6233,86 @@ static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int
                *p = (channel == 3 ? 255 : 0);
          } else {
             // Read the RLE data.
-            count = 0;
-            while (count < pixelCount) {
-               len = stbi__get8(s);
-               if (len == 128) {
-                  // No-op.
-               } else if (len < 128) {
-                  // Copy next len+1 bytes literally.
-                  len++;
-                  count += len;
-                  while (len) {
-                     *p = stbi__get8(s);
-                     p += 4;
-                     len--;
-                  }
-               } else if (len > 128) {
-                  stbi_uc   val;
-                  // Next -len+1 bytes in the dest are replicated from next source byte.
-                  // (Interpret len as a negative 8-bit int.)
-                  len ^= 0x0FF;
-                  len += 2;
-                  val = stbi__get8(s);
-                  count += len;
-                  while (len) {
-                     *p = val;
-                     p += 4;
-                     len--;
-                  }
-               }
+            if (!stbi__psd_decode_rle(s, p, pixelCount)) {
+               STBI_FREE(out);
+               return stbi__errpuc("corrupt", "bad RLE data");
             }
          }
       }
 
    } else {
       // We're at the raw image data.  It's each channel in order (Red, Green, Blue, Alpha, ...)
-      // where each channel consists of an 8-bit value for each pixel in the image.
+      // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image.
 
       // Read the data by channel.
       for (channel = 0; channel < 4; channel++) {
-         stbi_uc *p;
-
-         p = out + channel;
          if (channel >= channelCount) {
             // Fill this channel with default data.
-            stbi_uc val = channel == 3 ? 255 : 0;
-            for (i = 0; i < pixelCount; i++, p += 4)
-               *p = val;
-         } else {
-            // Read the data.
-            if (bitdepth == 16) {
-               for (i = 0; i < pixelCount; i++, p += 4)
-                  *p = (stbi_uc) (stbi__get16be(s) >> 8);
+            if (bitdepth == 16 && bpc == 16) {
+               stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
+               stbi__uint16 val = channel == 3 ? 65535 : 0;
+               for (i = 0; i < pixelCount; i++, q += 4)
+                  *q = val;
             } else {
+               stbi_uc *p = out+channel;
+               stbi_uc val = channel == 3 ? 255 : 0;
                for (i = 0; i < pixelCount; i++, p += 4)
-                  *p = stbi__get8(s);
+                  *p = val;
+            }
+         } else {
+            if (ri->bits_per_channel == 16) {    // output bpc
+               stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
+               for (i = 0; i < pixelCount; i++, q += 4)
+                  *q = (stbi__uint16) stbi__get16be(s);
+            } else {
+               stbi_uc *p = out+channel;
+               if (bitdepth == 16) {  // input bpc
+                  for (i = 0; i < pixelCount; i++, p += 4)
+                     *p = (stbi_uc) (stbi__get16be(s) >> 8);
+               } else {
+                  for (i = 0; i < pixelCount; i++, p += 4)
+                     *p = stbi__get8(s);
+               }
             }
          }
       }
    }
 
+   // remove weird white matte from PSD
    if (channelCount >= 4) {
-      for (i=0; i < w*h; ++i) {
-         unsigned char *pixel = out + 4*i;
-         if (pixel[3] != 0 && pixel[3] != 255) {
-            // remove weird white matte from PSD
-            float a = pixel[3] / 255.0f;
-            float ra = 1.0f / a;
-            float inv_a = 255.0f * (1 - ra);
-            pixel[0] = (unsigned char) (pixel[0]*ra + inv_a);
-            pixel[1] = (unsigned char) (pixel[1]*ra + inv_a);
-            pixel[2] = (unsigned char) (pixel[2]*ra + inv_a);
+      if (ri->bits_per_channel == 16) {
+         for (i=0; i < w*h; ++i) {
+            stbi__uint16 *pixel = (stbi__uint16 *) out + 4*i;
+            if (pixel[3] != 0 && pixel[3] != 65535) {
+               float a = pixel[3] / 65535.0f;
+               float ra = 1.0f / a;
+               float inv_a = 65535.0f * (1 - ra);
+               pixel[0] = (stbi__uint16) (pixel[0]*ra + inv_a);
+               pixel[1] = (stbi__uint16) (pixel[1]*ra + inv_a);
+               pixel[2] = (stbi__uint16) (pixel[2]*ra + inv_a);
+            }
+         }
+      } else {
+         for (i=0; i < w*h; ++i) {
+            unsigned char *pixel = out + 4*i;
+            if (pixel[3] != 0 && pixel[3] != 255) {
+               float a = pixel[3] / 255.0f;
+               float ra = 1.0f / a;
+               float inv_a = 255.0f * (1 - ra);
+               pixel[0] = (unsigned char) (pixel[0]*ra + inv_a);
+               pixel[1] = (unsigned char) (pixel[1]*ra + inv_a);
+               pixel[2] = (unsigned char) (pixel[2]*ra + inv_a);
+            }
          }
       }
    }
 
+   // convert to desired output format
    if (req_comp && req_comp != 4) {
-      out = stbi__convert_format(out, 4, req_comp, w, h);
+      if (ri->bits_per_channel == 16)
+         out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, 4, req_comp, w, h);
+      else
+         out = stbi__convert_format(out, 4, req_comp, w, h);
       if (out == NULL) return out; // stbi__convert_format frees input on failure
    }
 
@@ -5657,25 +6496,33 @@ static stbi_uc *stbi__pic_load_core(stbi__context *s,int width,int height,int *c
    return result;
 }
 
-static stbi_uc *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp)
+static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp, stbi__result_info *ri)
 {
    stbi_uc *result;
-   int i, x,y;
+   int i, x,y, internal_comp;
+   STBI_NOTUSED(ri);
+
+   if (!comp) comp = &internal_comp;
 
    for (i=0; i<92; ++i)
       stbi__get8(s);
 
    x = stbi__get16be(s);
    y = stbi__get16be(s);
+
+   if (y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+
    if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (pic header)");
-   if ((1 << 28) / x < y) return stbi__errpuc("too large", "Image too large to decode");
+   if (!stbi__mad3sizes_valid(x, y, 4, 0)) return stbi__errpuc("too large", "PIC image too large to decode");
 
    stbi__get32be(s); //skip `ratio'
    stbi__get16be(s); //skip `fields'
    stbi__get16be(s); //skip `pad'
 
    // intermediate buffer is RGBA
-   result = (stbi_uc *) stbi__malloc(x*y*4);
+   result = (stbi_uc *) stbi__malloc_mad3(x, y, 4, 0);
+   if (!result) return stbi__errpuc("outofmem", "Out of memory");
    memset(result, 0xff, x*y*4);
 
    if (!stbi__pic_load_core(s,x,y,comp, result)) {
@@ -5712,11 +6559,13 @@ typedef struct
 typedef struct
 {
    int w,h;
-   stbi_uc *out, *old_out;             // output buffer (always 4 components)
-   int flags, bgindex, ratio, transparent, eflags, delay;
+   stbi_uc *out;                 // output buffer (always 4 components)
+   stbi_uc *background;          // The current "background" as far as a gif is concerned
+   stbi_uc *history;
+   int flags, bgindex, ratio, transparent, eflags;
    stbi_uc  pal[256][4];
    stbi_uc lpal[256][4];
-   stbi__gif_lzw codes[4096];
+   stbi__gif_lzw codes[8192];
    stbi_uc *color_table;
    int parse, step;
    int lflags;
@@ -5724,6 +6573,7 @@ typedef struct
    int max_x, max_y;
    int cur_x, cur_y;
    int line_size;
+   int delay;
 } stbi__gif;
 
 static int stbi__gif_test_raw(stbi__context *s)
@@ -5772,6 +6622,9 @@ static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_in
    g->ratio = stbi__get8(s);
    g->transparent = -1;
 
+   if (g->w > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
+   if (g->h > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
+
    if (comp != 0) *comp = 4;  // can't actually tell whether it's 3 or 4 until we parse the comments
 
    if (is_info) return 1;
@@ -5785,6 +6638,7 @@ static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_in
 static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp)
 {
    stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif));
+   if (!g) return stbi__err("outofmem", "Out of memory");
    if (!stbi__gif_header(s, g, comp, 1)) {
       STBI_FREE(g);
       stbi__rewind( s );
@@ -5799,6 +6653,7 @@ static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp)
 static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code)
 {
    stbi_uc *p, *c;
+   int idx;
 
    // recurse to decode the prefixes, since the linked-list is backwards,
    // and working backwards through an interleaved image would be nasty
@@ -5807,10 +6662,12 @@ static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code)
 
    if (g->cur_y >= g->max_y) return;
 
-   p = &g->out[g->cur_x + g->cur_y];
-   c = &g->color_table[g->codes[code].suffix * 4];
+   idx = g->cur_x + g->cur_y;
+   p = &g->out[idx];
+   g->history[idx / 4] = 1;
 
-   if (c[3] >= 128) {
+   c = &g->color_table[g->codes[code].suffix * 4];
+   if (c[3] > 128) { // don't render transparent pixels;
       p[0] = c[2];
       p[1] = c[1];
       p[2] = c[0];
@@ -5884,11 +6741,16 @@ static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
                stbi__skip(s,len);
             return g->out;
          } else if (code <= avail) {
-            if (first) return stbi__errpuc("no clear code", "Corrupt GIF");
+            if (first) {
+               return stbi__errpuc("no clear code", "Corrupt GIF");
+            }
 
             if (oldcode >= 0) {
                p = &g->codes[avail++];
-               if (avail > 4096)        return stbi__errpuc("too many codes", "Corrupt GIF");
+               if (avail > 8192) {
+                  return stbi__errpuc("too many codes", "Corrupt GIF");
+               }
+
                p->prefix = (stbi__int16) oldcode;
                p->first = g->codes[oldcode].first;
                p->suffix = (code == avail) ? p->first : g->codes[code].first;
@@ -5910,59 +6772,77 @@ static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
    }
 }
 
-static void stbi__fill_gif_background(stbi__gif *g, int x0, int y0, int x1, int y1)
-{
-   int x, y;
-   stbi_uc *c = g->pal[g->bgindex];
-   for (y = y0; y < y1; y += 4 * g->w) {
-      for (x = x0; x < x1; x += 4) {
-         stbi_uc *p  = &g->out[y + x];
-         p[0] = c[2];
-         p[1] = c[1];
-         p[2] = c[0];
-         p[3] = 0;
-      }
-   }
-}
-
 // this function is designed to support animated gifs, although stb_image doesn't support it
-static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp)
+// two back is the image from two frames ago, used for a very specific disposal format
+static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp, stbi_uc *two_back)
 {
-   int i;
-   stbi_uc *prev_out = 0;
+   int dispose;
+   int first_frame;
+   int pi;
+   int pcount;
+   STBI_NOTUSED(req_comp);
 
-   if (g->out == 0 && !stbi__gif_header(s, g, comp,0))
-      return 0; // stbi__g_failure_reason set by stbi__gif_header
+   // on first frame, any non-written pixels get the background colour (non-transparent)
+   first_frame = 0;
+   if (g->out == 0) {
+      if (!stbi__gif_header(s, g, comp,0)) return 0; // stbi__g_failure_reason set by stbi__gif_header
+      if (!stbi__mad3sizes_valid(4, g->w, g->h, 0))
+         return stbi__errpuc("too large", "GIF image is too large");
+      pcount = g->w * g->h;
+      g->out = (stbi_uc *) stbi__malloc(4 * pcount);
+      g->background = (stbi_uc *) stbi__malloc(4 * pcount);
+      g->history = (stbi_uc *) stbi__malloc(pcount);
+      if (!g->out || !g->background || !g->history)
+         return stbi__errpuc("outofmem", "Out of memory");
+
+      // image is treated as "transparent" at the start - ie, nothing overwrites the current background;
+      // background colour is only used for pixels that are not rendered first frame, after that "background"
+      // color refers to the color that was there the previous frame.
+      memset(g->out, 0x00, 4 * pcount);
+      memset(g->background, 0x00, 4 * pcount); // state of the background (starts transparent)
+      memset(g->history, 0x00, pcount);        // pixels that were affected previous frame
+      first_frame = 1;
+   } else {
+      // second frame - how do we dispose of the previous one?
+      dispose = (g->eflags & 0x1C) >> 2;
+      pcount = g->w * g->h;
 
-   prev_out = g->out;
-   g->out = (stbi_uc *) stbi__malloc(4 * g->w * g->h);
-   if (g->out == 0) return stbi__errpuc("outofmem", "Out of memory");
+      if ((dispose == 3) && (two_back == 0)) {
+         dispose = 2; // if I don't have an image to revert back to, default to the old background
+      }
 
-   switch ((g->eflags & 0x1C) >> 2) {
-      case 0: // unspecified (also always used on 1st frame)
-         stbi__fill_gif_background(g, 0, 0, 4 * g->w, 4 * g->w * g->h);
-         break;
-      case 1: // do not dispose
-         if (prev_out) memcpy(g->out, prev_out, 4 * g->w * g->h);
-         g->old_out = prev_out;
-         break;
-      case 2: // dispose to background
-         if (prev_out) memcpy(g->out, prev_out, 4 * g->w * g->h);
-         stbi__fill_gif_background(g, g->start_x, g->start_y, g->max_x, g->max_y);
-         break;
-      case 3: // dispose to previous
-         if (g->old_out) {
-            for (i = g->start_y; i < g->max_y; i += 4 * g->w)
-               memcpy(&g->out[i + g->start_x], &g->old_out[i + g->start_x], g->max_x - g->start_x);
+      if (dispose == 3) { // use previous graphic
+         for (pi = 0; pi < pcount; ++pi) {
+            if (g->history[pi]) {
+               memcpy( &g->out[pi * 4], &two_back[pi * 4], 4 );
+            }
          }
-         break;
+      } else if (dispose == 2) {
+         // restore what was changed last frame to background before that frame;
+         for (pi = 0; pi < pcount; ++pi) {
+            if (g->history[pi]) {
+               memcpy( &g->out[pi * 4], &g->background[pi * 4], 4 );
+            }
+         }
+      } else {
+         // This is a non-disposal case eithe way, so just
+         // leave the pixels as is, and they will become the new background
+         // 1: do not dispose
+         // 0:  not specified.
+      }
+
+      // background is what out is after the undoing of the previou frame;
+      memcpy( g->background, g->out, 4 * g->w * g->h );
    }
 
+   // clear my history;
+   memset( g->history, 0x00, g->w * g->h );        // pixels that were affected previous frame
+
    for (;;) {
-      switch (stbi__get8(s)) {
+      int tag = stbi__get8(s);
+      switch (tag) {
          case 0x2C: /* Image Descriptor */
          {
-            int prev_trans = -1;
             stbi__int32 x, y, w, h;
             stbi_uc *o;
 
@@ -5981,6 +6861,13 @@ static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, i
             g->cur_x   = g->start_x;
             g->cur_y   = g->start_y;
 
+            // if the width of the specified rectangle is 0, that means
+            // we may not see *any* pixels or the image is malformed;
+            // to make sure this is caught, move the current y down to
+            // max_y (which is what out_gif_code checks).
+            if (w == 0)
+               g->cur_y = g->max_y;
+
             g->lflags = stbi__get8(s);
 
             if (g->lflags & 0x40) {
@@ -5995,19 +6882,24 @@ static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, i
                stbi__gif_parse_colortable(s,g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1);
                g->color_table = (stbi_uc *) g->lpal;
             } else if (g->flags & 0x80) {
-               if (g->transparent >= 0 && (g->eflags & 0x01)) {
-                  prev_trans = g->pal[g->transparent][3];
-                  g->pal[g->transparent][3] = 0;
-               }
                g->color_table = (stbi_uc *) g->pal;
             } else
                return stbi__errpuc("missing color table", "Corrupt GIF");
 
             o = stbi__process_gif_raster(s, g);
-            if (o == NULL) return NULL;
-
-            if (prev_trans != -1)
-               g->pal[g->transparent][3] = (stbi_uc) prev_trans;
+            if (!o) return NULL;
+
+            // if this was the first frame,
+            pcount = g->w * g->h;
+            if (first_frame && (g->bgindex > 0)) {
+               // if first frame, any pixel not drawn to gets the background color
+               for (pi = 0; pi < pcount; ++pi) {
+                  if (g->history[pi] == 0) {
+                     g->pal[g->bgindex][3] = 255; // just in case it was made transparent, undo that; It will be reset next frame if need be;
+                     memcpy( &g->out[pi * 4], &g->pal[g->bgindex], 4 );
+                  }
+               }
+            }
 
             return o;
          }
@@ -6015,19 +6907,35 @@ static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, i
          case 0x21: // Comment Extension.
          {
             int len;
-            if (stbi__get8(s) == 0xF9) { // Graphic Control Extension.
+            int ext = stbi__get8(s);
+            if (ext == 0xF9) { // Graphic Control Extension.
                len = stbi__get8(s);
                if (len == 4) {
                   g->eflags = stbi__get8(s);
-                  g->delay = stbi__get16le(s);
-                  g->transparent = stbi__get8(s);
+                  g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths.
+
+                  // unset old transparent
+                  if (g->transparent >= 0) {
+                     g->pal[g->transparent][3] = 255;
+                  }
+                  if (g->eflags & 0x01) {
+                     g->transparent = stbi__get8(s);
+                     if (g->transparent >= 0) {
+                        g->pal[g->transparent][3] = 0;
+                     }
+                  } else {
+                     // don't need transparent
+                     stbi__skip(s, 1);
+                     g->transparent = -1;
+                  }
                } else {
                   stbi__skip(s, len);
                   break;
                }
             }
-            while ((len = stbi__get8(s)) != 0)
+            while ((len = stbi__get8(s)) != 0) {
                stbi__skip(s, len);
+            }
             break;
          }
 
@@ -6038,27 +6946,130 @@ static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, i
             return stbi__errpuc("unknown code", "Corrupt GIF");
       }
    }
+}
 
-   STBI_NOTUSED(req_comp);
+static void *stbi__load_gif_main_outofmem(stbi__gif *g, stbi_uc *out, int **delays)
+{
+   STBI_FREE(g->out);
+   STBI_FREE(g->history);
+   STBI_FREE(g->background);
+
+   if (out) STBI_FREE(out);
+   if (delays && *delays) STBI_FREE(*delays);
+   return stbi__errpuc("outofmem", "Out of memory");
+}
+
+static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
+{
+   if (stbi__gif_test(s)) {
+      int layers = 0;
+      stbi_uc *u = 0;
+      stbi_uc *out = 0;
+      stbi_uc *two_back = 0;
+      stbi__gif g;
+      int stride;
+      int out_size = 0;
+      int delays_size = 0;
+
+      STBI_NOTUSED(out_size);
+      STBI_NOTUSED(delays_size);
+
+      memset(&g, 0, sizeof(g));
+      if (delays) {
+         *delays = 0;
+      }
+
+      do {
+         u = stbi__gif_load_next(s, &g, comp, req_comp, two_back);
+         if (u == (stbi_uc *) s) u = 0;  // end of animated gif marker
+
+         if (u) {
+            *x = g.w;
+            *y = g.h;
+            ++layers;
+            stride = g.w * g.h * 4;
+
+            if (out) {
+               void *tmp = (stbi_uc*) STBI_REALLOC_SIZED( out, out_size, layers * stride );
+               if (!tmp)
+                  return stbi__load_gif_main_outofmem(&g, out, delays);
+               else {
+                   out = (stbi_uc*) tmp;
+                   out_size = layers * stride;
+               }
+
+               if (delays) {
+                  int *new_delays = (int*) STBI_REALLOC_SIZED( *delays, delays_size, sizeof(int) * layers );
+                  if (!new_delays)
+                     return stbi__load_gif_main_outofmem(&g, out, delays);
+                  *delays = new_delays;
+                  delays_size = layers * sizeof(int);
+               }
+            } else {
+               out = (stbi_uc*)stbi__malloc( layers * stride );
+               if (!out)
+                  return stbi__load_gif_main_outofmem(&g, out, delays);
+               out_size = layers * stride;
+               if (delays) {
+                  *delays = (int*) stbi__malloc( layers * sizeof(int) );
+                  if (!*delays)
+                     return stbi__load_gif_main_outofmem(&g, out, delays);
+                  delays_size = layers * sizeof(int);
+               }
+            }
+            memcpy( out + ((layers - 1) * stride), u, stride );
+            if (layers >= 2) {
+               two_back = out - 2 * stride;
+            }
+
+            if (delays) {
+               (*delays)[layers - 1U] = g.delay;
+            }
+         }
+      } while (u != 0);
+
+      // free temp buffer;
+      STBI_FREE(g.out);
+      STBI_FREE(g.history);
+      STBI_FREE(g.background);
+
+      // do the final conversion after loading everything;
+      if (req_comp && req_comp != 4)
+         out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h);
+
+      *z = layers;
+      return out;
+   } else {
+      return stbi__errpuc("not GIF", "Image was not as a gif type.");
+   }
 }
 
-static stbi_uc *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 {
    stbi_uc *u = 0;
-   stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif));
-   memset(g, 0, sizeof(*g));
+   stbi__gif g;
+   memset(&g, 0, sizeof(g));
+   STBI_NOTUSED(ri);
 
-   u = stbi__gif_load_next(s, g, comp, req_comp);
+   u = stbi__gif_load_next(s, &g, comp, req_comp, 0);
    if (u == (stbi_uc *) s) u = 0;  // end of animated gif marker
    if (u) {
-      *x = g->w;
-      *y = g->h;
+      *x = g.w;
+      *y = g.h;
+
+      // moved conversion to after successful load so that the same
+      // can be done for multiple frames.
       if (req_comp && req_comp != 4)
-         u = stbi__convert_format(u, 4, req_comp, g->w, g->h);
+         u = stbi__convert_format(u, 4, req_comp, g.w, g.h);
+   } else if (g.out) {
+      // if there was an error and we allocated an image buffer, free it!
+      STBI_FREE(g.out);
    }
-   else if (g->out)
-      STBI_FREE(g->out);
-   STBI_FREE(g);
+
+   // free buffers needed for multiple frame loading;
+   STBI_FREE(g.history);
+   STBI_FREE(g.background);
+
    return u;
 }
 
@@ -6072,20 +7083,24 @@ static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp)
 // Radiance RGBE HDR loader
 // originally by Nicolas Schulz
 #ifndef STBI_NO_HDR
-static int stbi__hdr_test_core(stbi__context *s)
+static int stbi__hdr_test_core(stbi__context *s, const char *signature)
 {
-   const char *signature = "#?RADIANCE\n";
    int i;
    for (i=0; signature[i]; ++i)
       if (stbi__get8(s) != signature[i])
-         return 0;
+          return 0;
+   stbi__rewind(s);
    return 1;
 }
 
 static int stbi__hdr_test(stbi__context* s)
 {
-   int r = stbi__hdr_test_core(s);
+   int r = stbi__hdr_test_core(s, "#?RADIANCE\n");
    stbi__rewind(s);
+   if(!r) {
+       r = stbi__hdr_test_core(s, "#?RGBE\n");
+       stbi__rewind(s);
+   }
    return r;
 }
 
@@ -6139,7 +7154,7 @@ static void stbi__hdr_convert(float *output, stbi_uc *input, int req_comp)
    }
 }
 
-static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 {
    char buffer[STBI__HDR_BUFLEN];
    char *token;
@@ -6150,10 +7165,12 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re
    int len;
    unsigned char count, value;
    int i, j, k, c1,c2, z;
-
+   const char *headerToken;
+   STBI_NOTUSED(ri);
 
    // Check identifier
-   if (strcmp(stbi__hdr_gettoken(s,buffer), "#?RADIANCE") != 0)
+   headerToken = stbi__hdr_gettoken(s,buffer);
+   if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0)
       return stbi__errpf("not HDR", "Corrupt HDR image");
 
    // Parse header
@@ -6176,14 +7193,22 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re
    token += 3;
    width = (int) strtol(token, NULL, 10);
 
+   if (height > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)");
+   if (width > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)");
+
    *x = width;
    *y = height;
 
    if (comp) *comp = 3;
    if (req_comp == 0) req_comp = 3;
 
+   if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0))
+      return stbi__errpf("too large", "HDR image is too large");
+
    // Read data
-   hdr_data = (float *) stbi__malloc(height * width * req_comp * sizeof(float));
+   hdr_data = (float *) stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0);
+   if (!hdr_data)
+      return stbi__errpf("outofmem", "Out of memory");
 
    // Load image data
    // image data is stored as some number of sca
@@ -6222,20 +7247,29 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re
          len <<= 8;
          len |= stbi__get8(s);
          if (len != width) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); }
-         if (scanline == NULL) scanline = (stbi_uc *) stbi__malloc(width * 4);
+         if (scanline == NULL) {
+            scanline = (stbi_uc *) stbi__malloc_mad2(width, 4, 0);
+            if (!scanline) {
+               STBI_FREE(hdr_data);
+               return stbi__errpf("outofmem", "Out of memory");
+            }
+         }
 
          for (k = 0; k < 4; ++k) {
+            int nleft;
             i = 0;
-            while (i < width) {
+            while ((nleft = width - i) > 0) {
                count = stbi__get8(s);
                if (count > 128) {
                   // Run
                   value = stbi__get8(s);
                   count -= 128;
+                  if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
                   for (z = 0; z < count; ++z)
                      scanline[i++ * 4 + k] = value;
                } else {
                   // Dump
+                  if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
                   for (z = 0; z < count; ++z)
                      scanline[i++ * 4 + k] = stbi__get8(s);
                }
@@ -6244,7 +7278,8 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re
          for (i=0; i < width; ++i)
             stbi__hdr_convert(hdr_data+(j*width + i)*req_comp, scanline + i*4, req_comp);
       }
-      STBI_FREE(scanline);
+      if (scanline)
+         STBI_FREE(scanline);
    }
 
    return hdr_data;
@@ -6255,6 +7290,11 @@ static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp)
    char buffer[STBI__HDR_BUFLEN];
    char *token;
    int valid = 0;
+   int dummy;
+
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
 
    if (stbi__hdr_test(s) == 0) {
        stbi__rewind( s );
@@ -6296,14 +7336,20 @@ static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp)
    void *p;
    stbi__bmp_data info;
 
-   info.all_a = 255;   
+   info.all_a = 255;
    p = stbi__bmp_parse_header(s, &info);
-   stbi__rewind( s );
-   if (p == NULL)
+   if (p == NULL) {
+      stbi__rewind( s );
       return 0;
-   *x = s->img_x;
-   *y = s->img_y;
-   *comp = info.ma ? 4 : 3;
+   }
+   if (x) *x = s->img_x;
+   if (y) *y = s->img_y;
+   if (comp) {
+      if (info.bpp == 24 && info.ma == 0xff000000)
+         *comp = 3;
+      else
+         *comp = info.ma ? 4 : 3;
+   }
    return 1;
 }
 #endif
@@ -6311,7 +7357,10 @@ static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp)
 #ifndef STBI_NO_PSD
 static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
 {
-   int channelCount;
+   int channelCount, dummy, depth;
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
    if (stbi__get32be(s) != 0x38425053) {
        stbi__rewind( s );
        return 0;
@@ -6328,7 +7377,8 @@ static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
    }
    *y = stbi__get32be(s);
    *x = stbi__get32be(s);
-   if (stbi__get16be(s) != 8) {
+   depth = stbi__get16be(s);
+   if (depth != 8 && depth != 16) {
        stbi__rewind( s );
        return 0;
    }
@@ -6339,14 +7389,45 @@ static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
    *comp = 4;
    return 1;
 }
+
+static int stbi__psd_is16(stbi__context *s)
+{
+   int channelCount, depth;
+   if (stbi__get32be(s) != 0x38425053) {
+       stbi__rewind( s );
+       return 0;
+   }
+   if (stbi__get16be(s) != 1) {
+       stbi__rewind( s );
+       return 0;
+   }
+   stbi__skip(s, 6);
+   channelCount = stbi__get16be(s);
+   if (channelCount < 0 || channelCount > 16) {
+       stbi__rewind( s );
+       return 0;
+   }
+   STBI_NOTUSED(stbi__get32be(s));
+   STBI_NOTUSED(stbi__get32be(s));
+   depth = stbi__get16be(s);
+   if (depth != 16) {
+       stbi__rewind( s );
+       return 0;
+   }
+   return 1;
+}
 #endif
 
 #ifndef STBI_NO_PIC
 static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp)
 {
-   int act_comp=0,num_packets=0,chained;
+   int act_comp=0,num_packets=0,chained,dummy;
    stbi__pic_packet packets[10];
 
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
+
    if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) {
       stbi__rewind(s);
       return 0;
@@ -6406,7 +7487,6 @@ static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp)
 // Known limitations:
 //    Does not support comments in the header section
 //    Does not support ASCII image data (formats P2 and P3)
-//    Does not support 16-bit-per-channel
 
 #ifndef STBI_NO_PNM
 
@@ -6422,21 +7502,38 @@ static int      stbi__pnm_test(stbi__context *s)
    return 1;
 }
 
-static stbi_uc *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 {
    stbi_uc *out;
-   if (!stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n))
+   STBI_NOTUSED(ri);
+
+   ri->bits_per_channel = stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n);
+   if (ri->bits_per_channel == 0)
       return 0;
+
+   if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+
    *x = s->img_x;
    *y = s->img_y;
-   *comp = s->img_n;
+   if (comp) *comp = s->img_n;
 
-   out = (stbi_uc *) stbi__malloc(s->img_n * s->img_x * s->img_y);
+   if (!stbi__mad4sizes_valid(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0))
+      return stbi__errpuc("too large", "PNM too large");
+
+   out = (stbi_uc *) stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0);
    if (!out) return stbi__errpuc("outofmem", "Out of memory");
-   stbi__getn(s, out, s->img_n * s->img_x * s->img_y);
+   if (!stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8))) {
+      STBI_FREE(out);
+      return stbi__errpuc("bad PNM", "PNM file truncated");
+   }
 
    if (req_comp && req_comp != s->img_n) {
-      out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
+      if (ri->bits_per_channel == 16) {
+         out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, s->img_n, req_comp, s->img_x, s->img_y);
+      } else {
+         out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
+      }
       if (out == NULL) return out; // stbi__convert_format frees input on failure
    }
    return out;
@@ -6473,6 +7570,8 @@ static int      stbi__pnm_getinteger(stbi__context *s, char *c)
    while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) {
       value = value*10 + (*c - '0');
       *c = (char) stbi__get8(s);
+      if((value > 214748364) || (value == 214748364 && *c > '7'))
+          return stbi__err("integer parse overflow", "Parsing an integer in the PPM header overflowed a 32-bit int");
    }
 
    return value;
@@ -6480,16 +7579,20 @@ static int      stbi__pnm_getinteger(stbi__context *s, char *c)
 
 static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
 {
-   int maxv;
+   int maxv, dummy;
    char c, p, t;
 
-   stbi__rewind( s );
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
+
+   stbi__rewind(s);
 
    // Get identifier
    p = (char) stbi__get8(s);
    t = (char) stbi__get8(s);
    if (p != 'P' || (t != '5' && t != '6')) {
-       stbi__rewind( s );
+       stbi__rewind(s);
        return 0;
    }
 
@@ -6499,17 +7602,29 @@ static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
    stbi__pnm_skip_whitespace(s, &c);
 
    *x = stbi__pnm_getinteger(s, &c); // read width
+   if(*x == 0)
+       return stbi__err("invalid width", "PPM image header had zero or overflowing width");
    stbi__pnm_skip_whitespace(s, &c);
 
    *y = stbi__pnm_getinteger(s, &c); // read height
+   if (*y == 0)
+       return stbi__err("invalid width", "PPM image header had zero or overflowing width");
    stbi__pnm_skip_whitespace(s, &c);
 
    maxv = stbi__pnm_getinteger(s, &c);  // read max value
-
-   if (maxv > 255)
-      return stbi__err("max value > 255", "PPM image not 8-bit");
+   if (maxv > 65535)
+      return stbi__err("max value > 65535", "PPM image supports only 8-bit and 16-bit images");
+   else if (maxv > 255)
+      return 16;
    else
-      return 1;
+      return 8;
+}
+
+static int stbi__pnm_is16(stbi__context *s)
+{
+   if (stbi__pnm_info(s, NULL, NULL, NULL) == 16)
+	   return 1;
+   return 0;
 }
 #endif
 
@@ -6555,6 +7670,22 @@ static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp)
    return stbi__err("unknown image type", "Image not of any known type, or corrupt");
 }
 
+static int stbi__is_16_main(stbi__context *s)
+{
+   #ifndef STBI_NO_PNG
+   if (stbi__png_is16(s))  return 1;
+   #endif
+
+   #ifndef STBI_NO_PSD
+   if (stbi__psd_is16(s))  return 1;
+   #endif
+
+   #ifndef STBI_NO_PNM
+   if (stbi__pnm_is16(s))  return 1;
+   #endif
+   return 0;
+}
+
 #ifndef STBI_NO_STDIO
 STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp)
 {
@@ -6576,6 +7707,27 @@ STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp)
    fseek(f,pos,SEEK_SET);
    return r;
 }
+
+STBIDEF int stbi_is_16_bit(char const *filename)
+{
+    FILE *f = stbi__fopen(filename, "rb");
+    int result;
+    if (!f) return stbi__err("can't fopen", "Unable to open file");
+    result = stbi_is_16_bit_from_file(f);
+    fclose(f);
+    return result;
+}
+
+STBIDEF int stbi_is_16_bit_from_file(FILE *f)
+{
+   int r;
+   stbi__context s;
+   long pos = ftell(f);
+   stbi__start_file(&s, f);
+   r = stbi__is_16_main(&s);
+   fseek(f,pos,SEEK_SET);
+   return r;
+}
 #endif // !STBI_NO_STDIO
 
 STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp)
@@ -6592,10 +7744,44 @@ STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int
    return stbi__info_main(&s,x,y,comp);
 }
 
+STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len)
+{
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__is_16_main(&s);
+}
+
+STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user)
+{
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user);
+   return stbi__is_16_main(&s);
+}
+
 #endif // STB_IMAGE_IMPLEMENTATION
 
 /*
    revision history:
+      2.20  (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs
+      2.19  (2018-02-11) fix warning
+      2.18  (2018-01-30) fix warnings
+      2.17  (2018-01-29) change sbti__shiftsigned to avoid clang -O2 bug
+                         1-bit BMP
+                         *_is_16_bit api
+                         avoid warnings
+      2.16  (2017-07-23) all functions have 16-bit variants;
+                         STBI_NO_STDIO works again;
+                         compilation fixes;
+                         fix rounding in unpremultiply;
+                         optimize vertical flip;
+                         disable raw_len validation;
+                         documentation fixes
+      2.15  (2017-03-18) fix png-1,2,4 bug; now all Imagenet JPGs decode;
+                         warning fixes; disable run-time SSE detection on gcc;
+                         uniform handling of optional "return" values;
+                         thread-safe initialization of zlib tables
+      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
+      2.13  (2016-11-29) add 16-bit API, only supported for PNG right now
       2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
       2.11  (2016-04-02) allocate large structures on the stack
                          remove white matting for transparent PSD
@@ -6756,3 +7942,46 @@ STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int
       0.50  (2006-11-19)
               first released version
 */
+
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/renderdoc/3rdparty/stb/stb_image_resize.h b/renderdoc/3rdparty/stb/stb_image_resize.h
deleted file mode 100644
index 4cabe54089..0000000000
--- a/renderdoc/3rdparty/stb/stb_image_resize.h
+++ /dev/null
@@ -1,2578 +0,0 @@
-/* stb_image_resize - v0.91 - public domain image resizing
-   by Jorge L Rodriguez (@VinoBS) - 2014
-   http://github.com/nothings/stb
-
-   Written with emphasis on usability, portability, and efficiency. (No
-   SIMD or threads, so it be easily outperformed by libs that use those.)
-   Only scaling and translation is supported, no rotations or shears.
-   Easy API downsamples w/Mitchell filter, upsamples w/cubic interpolation.
-
-   COMPILING & LINKING
-      In one C/C++ file that #includes this file, do this:
-         #define STB_IMAGE_RESIZE_IMPLEMENTATION
-      before the #include. That will create the implementation in that file.
-
-   QUICKSTART
-      stbir_resize_uint8(      input_pixels , in_w , in_h , 0,
-                               output_pixels, out_w, out_h, 0, num_channels)
-      stbir_resize_float(...)
-      stbir_resize_uint8_srgb( input_pixels , in_w , in_h , 0,
-                               output_pixels, out_w, out_h, 0,
-                               num_channels , alpha_chan  , 0)
-      stbir_resize_uint8_srgb_edgemode(
-                               input_pixels , in_w , in_h , 0, 
-                               output_pixels, out_w, out_h, 0, 
-                               num_channels , alpha_chan  , 0, STBIR_EDGE_CLAMP)
-                                                            // WRAP/REFLECT/ZERO
-
-   FULL API
-      See the "header file" section of the source for API documentation.
-
-   ADDITIONAL DOCUMENTATION
-
-      SRGB & FLOATING POINT REPRESENTATION
-         The sRGB functions presume IEEE floating point. If you do not have
-         IEEE floating point, define STBIR_NON_IEEE_FLOAT. This will use
-         a slower implementation.
-
-      MEMORY ALLOCATION
-         The resize functions here perform a single memory allocation using
-         malloc. To control the memory allocation, before the #include that
-         triggers the implementation, do:
-
-            #define STBIR_MALLOC(size,context) ...
-            #define STBIR_FREE(ptr,context)   ...
-
-         Each resize function makes exactly one call to malloc/free, so to use
-         temp memory, store the temp memory in the context and return that.
-
-      ASSERT
-         Define STBIR_ASSERT(boolval) to override assert() and not use assert.h
-
-      OPTIMIZATION
-         Define STBIR_SATURATE_INT to compute clamp values in-range using
-         integer operations instead of float operations. This may be faster
-         on some platforms.
-
-      DEFAULT FILTERS
-         For functions which don't provide explicit control over what filters
-         to use, you can change the compile-time defaults with
-
-            #define STBIR_DEFAULT_FILTER_UPSAMPLE     STBIR_FILTER_something
-            #define STBIR_DEFAULT_FILTER_DOWNSAMPLE   STBIR_FILTER_something
-
-         See stbir_filter in the header-file section for the list of filters.
-
-      NEW FILTERS
-         A number of 1D filter kernels are used. For a list of
-         supported filters see the stbir_filter enum. To add a new filter,
-         write a filter function and add it to stbir__filter_info_table.
-
-      PROGRESS
-         For interactive use with slow resize operations, you can install
-         a progress-report callback:
-
-            #define STBIR_PROGRESS_REPORT(val)   some_func(val)
-
-         The parameter val is a float which goes from 0 to 1 as progress is made.
-
-         For example:
-
-            static void my_progress_report(float progress);
-            #define STBIR_PROGRESS_REPORT(val) my_progress_report(val)
-
-            #define STB_IMAGE_RESIZE_IMPLEMENTATION
-            #include "stb_image_resize.h"
-
-            static void my_progress_report(float progress)
-            {
-               printf("Progress: %f%%\n", progress*100);
-            }
-
-      MAX CHANNELS
-         If your image has more than 64 channels, define STBIR_MAX_CHANNELS
-         to the max you'll have.
-
-      ALPHA CHANNEL
-         Most of the resizing functions provide the ability to control how
-         the alpha channel of an image is processed. The important things
-         to know about this:
-
-         1. The best mathematically-behaved version of alpha to use is
-         called "premultiplied alpha", in which the other color channels
-         have had the alpha value multiplied in. If you use premultiplied
-         alpha, linear filtering (such as image resampling done by this
-         library, or performed in texture units on GPUs) does the "right
-         thing". While premultiplied alpha is standard in the movie CGI
-         industry, it is still uncommon in the videogame/real-time world.
-
-         If you linearly filter non-premultiplied alpha, strange effects
-         occur. (For example, the average of 1% opaque bright green
-         and 99% opaque black produces 50% transparent dark green when
-         non-premultiplied, whereas premultiplied it produces 50%
-         transparent near-black. The former introduces green energy
-         that doesn't exist in the source image.)
-
-         2. Artists should not edit premultiplied-alpha images; artists
-         want non-premultiplied alpha images. Thus, art tools generally output
-         non-premultiplied alpha images.
-
-         3. You will get best results in most cases by converting images
-         to premultiplied alpha before processing them mathematically.
-
-         4. If you pass the flag STBIR_FLAG_ALPHA_PREMULTIPLIED, the
-         resizer does not do anything special for the alpha channel;
-         it is resampled identically to other channels. This produces
-         the correct results for premultiplied-alpha images, but produces
-         less-than-ideal results for non-premultiplied-alpha images.
-
-         5. If you do not pass the flag STBIR_FLAG_ALPHA_PREMULTIPLIED,
-         then the resizer weights the contribution of input pixels
-         based on their alpha values, or, equivalently, it multiplies
-         the alpha value into the color channels, resamples, then divides
-         by the resultant alpha value. Input pixels which have alpha=0 do
-         not contribute at all to output pixels unless _all_ of the input
-         pixels affecting that output pixel have alpha=0, in which case
-         the result for that pixel is the same as it would be without
-         STBIR_FLAG_ALPHA_PREMULTIPLIED. However, this is only true for
-         input images in integer formats. For input images in float format,
-         input pixels with alpha=0 have no effect, and output pixels
-         which have alpha=0 will be 0 in all channels. (For float images,
-         you can manually achieve the same result by adding a tiny epsilon
-         value to the alpha channel of every image, and then subtracting
-         or clamping it at the end.)
-
-         6. You can suppress the behavior described in #5 and make
-         all-0-alpha pixels have 0 in all channels by #defining
-         STBIR_NO_ALPHA_EPSILON.
-
-         7. You can separately control whether the alpha channel is
-         interpreted as linear or affected by the colorspace. By default
-         it is linear; you almost never want to apply the colorspace.
-         (For example, graphics hardware does not apply sRGB conversion
-         to the alpha channel.)
-
-   ADDITIONAL CONTRIBUTORS
-      Sean Barrett: API design, optimizations
-         
-   REVISIONS
-      0.91 (2016-04-02) fix warnings; fix handling of subpixel regions
-      0.90 (2014-09-17) first released version
-
-   LICENSE
-
-     This software is dual-licensed to the public domain and under the following
-     license: you are granted a perpetual, irrevocable license to copy, modify,
-     publish, and distribute this file as you see fit.
-
-   TODO
-      Don't decode all of the image data when only processing a partial tile
-      Don't use full-width decode buffers when only processing a partial tile
-      When processing wide images, break processing into tiles so data fits in L1 cache
-      Installable filters?
-      Resize that respects alpha test coverage
-         (Reference code: FloatImage::alphaTestCoverage and FloatImage::scaleAlphaToCoverage:
-         https://code.google.com/p/nvidia-texture-tools/source/browse/trunk/src/nvimage/FloatImage.cpp )
-*/
-
-#ifndef STBIR_INCLUDE_STB_IMAGE_RESIZE_H
-#define STBIR_INCLUDE_STB_IMAGE_RESIZE_H
-
-#ifdef _MSC_VER
-typedef unsigned char  stbir_uint8;
-typedef unsigned short stbir_uint16;
-typedef unsigned int   stbir_uint32;
-#else
-#include <stdint.h>
-typedef uint8_t  stbir_uint8;
-typedef uint16_t stbir_uint16;
-typedef uint32_t stbir_uint32;
-#endif
-
-#ifdef STB_IMAGE_RESIZE_STATIC
-#define STBIRDEF static
-#else
-#ifdef __cplusplus
-#define STBIRDEF extern "C"
-#else
-#define STBIRDEF extern
-#endif
-#endif
-
-
-//////////////////////////////////////////////////////////////////////////////
-//
-// Easy-to-use API:
-//
-//     * "input pixels" points to an array of image data with 'num_channels' channels (e.g. RGB=3, RGBA=4)
-//     * input_w is input image width (x-axis), input_h is input image height (y-axis)
-//     * stride is the offset between successive rows of image data in memory, in bytes. you can
-//       specify 0 to mean packed continuously in memory
-//     * alpha channel is treated identically to other channels.
-//     * colorspace is linear or sRGB as specified by function name
-//     * returned result is 1 for success or 0 in case of an error.
-//       #define STBIR_ASSERT() to trigger an assert on parameter validation errors.
-//     * Memory required grows approximately linearly with input and output size, but with
-//       discontinuities at input_w == output_w and input_h == output_h.
-//     * These functions use a "default" resampling filter defined at compile time. To change the filter,
-//       you can change the compile-time defaults by #defining STBIR_DEFAULT_FILTER_UPSAMPLE
-//       and STBIR_DEFAULT_FILTER_DOWNSAMPLE, or you can use the medium-complexity API.
-
-STBIRDEF int stbir_resize_uint8(     const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
-                                           unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
-                                     int num_channels);
-
-STBIRDEF int stbir_resize_float(     const float *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
-                                           float *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
-                                     int num_channels);
-
-
-// The following functions interpret image data as gamma-corrected sRGB. 
-// Specify STBIR_ALPHA_CHANNEL_NONE if you have no alpha channel,
-// or otherwise provide the index of the alpha channel. Flags value
-// of 0 will probably do the right thing if you're not sure what
-// the flags mean.
-
-#define STBIR_ALPHA_CHANNEL_NONE       -1
-
-// Set this flag if your texture has premultiplied alpha. Otherwise, stbir will
-// use alpha-weighted resampling (effectively premultiplying, resampling,
-// then unpremultiplying).
-#define STBIR_FLAG_ALPHA_PREMULTIPLIED    (1 << 0)
-// The specified alpha channel should be handled as gamma-corrected value even
-// when doing sRGB operations.
-#define STBIR_FLAG_ALPHA_USES_COLORSPACE  (1 << 1)
-
-STBIRDEF int stbir_resize_uint8_srgb(const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
-                                           unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
-                                     int num_channels, int alpha_channel, int flags);
-
-
-typedef enum
-{
-    STBIR_EDGE_CLAMP   = 1,
-    STBIR_EDGE_REFLECT = 2,
-    STBIR_EDGE_WRAP    = 3,
-    STBIR_EDGE_ZERO    = 4,
-} stbir_edge;
-
-// This function adds the ability to specify how requests to sample off the edge of the image are handled.
-STBIRDEF int stbir_resize_uint8_srgb_edgemode(const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
-                                                    unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
-                                              int num_channels, int alpha_channel, int flags,
-                                              stbir_edge edge_wrap_mode);
-
-//////////////////////////////////////////////////////////////////////////////
-//
-// Medium-complexity API
-//
-// This extends the easy-to-use API as follows:
-//
-//     * Alpha-channel can be processed separately
-//       * If alpha_channel is not STBIR_ALPHA_CHANNEL_NONE
-//         * Alpha channel will not be gamma corrected (unless flags&STBIR_FLAG_GAMMA_CORRECT)
-//         * Filters will be weighted by alpha channel (unless flags&STBIR_FLAG_ALPHA_PREMULTIPLIED)
-//     * Filter can be selected explicitly
-//     * uint16 image type
-//     * sRGB colorspace available for all types
-//     * context parameter for passing to STBIR_MALLOC
-
-typedef enum
-{
-    STBIR_FILTER_DEFAULT      = 0,  // use same filter type that easy-to-use API chooses
-    STBIR_FILTER_BOX          = 1,  // A trapezoid w/1-pixel wide ramps, same result as box for integer scale ratios
-    STBIR_FILTER_TRIANGLE     = 2,  // On upsampling, produces same results as bilinear texture filtering
-    STBIR_FILTER_CUBICBSPLINE = 3,  // The cubic b-spline (aka Mitchell-Netrevalli with B=1,C=0), gaussian-esque
-    STBIR_FILTER_CATMULLROM   = 4,  // An interpolating cubic spline
-    STBIR_FILTER_MITCHELL     = 5,  // Mitchell-Netrevalli filter with B=1/3, C=1/3
-} stbir_filter;
-
-typedef enum
-{
-    STBIR_COLORSPACE_LINEAR,
-    STBIR_COLORSPACE_SRGB,
-
-    STBIR_MAX_COLORSPACES,
-} stbir_colorspace;
-
-// The following functions are all identical except for the type of the image data
-
-STBIRDEF int stbir_resize_uint8_generic( const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
-                                               unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
-                                         int num_channels, int alpha_channel, int flags,
-                                         stbir_edge edge_wrap_mode, stbir_filter filter, stbir_colorspace space, 
-                                         void *alloc_context);
-
-STBIRDEF int stbir_resize_uint16_generic(const stbir_uint16 *input_pixels  , int input_w , int input_h , int input_stride_in_bytes,
-                                               stbir_uint16 *output_pixels , int output_w, int output_h, int output_stride_in_bytes,
-                                         int num_channels, int alpha_channel, int flags,
-                                         stbir_edge edge_wrap_mode, stbir_filter filter, stbir_colorspace space, 
-                                         void *alloc_context);
-
-STBIRDEF int stbir_resize_float_generic( const float *input_pixels         , int input_w , int input_h , int input_stride_in_bytes,
-                                               float *output_pixels        , int output_w, int output_h, int output_stride_in_bytes,
-                                         int num_channels, int alpha_channel, int flags,
-                                         stbir_edge edge_wrap_mode, stbir_filter filter, stbir_colorspace space, 
-                                         void *alloc_context);
-
-
-
-//////////////////////////////////////////////////////////////////////////////
-//
-// Full-complexity API
-//
-// This extends the medium API as follows:
-//
-//       * uint32 image type
-//     * not typesafe
-//     * separate filter types for each axis
-//     * separate edge modes for each axis
-//     * can specify scale explicitly for subpixel correctness
-//     * can specify image source tile using texture coordinates
-
-typedef enum
-{
-    STBIR_TYPE_UINT8 ,
-    STBIR_TYPE_UINT16,
-    STBIR_TYPE_UINT32,
-    STBIR_TYPE_FLOAT ,
-
-    STBIR_MAX_TYPES
-} stbir_datatype;
-
-STBIRDEF int stbir_resize(         const void *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
-                                         void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
-                                   stbir_datatype datatype,
-                                   int num_channels, int alpha_channel, int flags,
-                                   stbir_edge edge_mode_horizontal, stbir_edge edge_mode_vertical, 
-                                   stbir_filter filter_horizontal,  stbir_filter filter_vertical,
-                                   stbir_colorspace space, void *alloc_context);
-
-STBIRDEF int stbir_resize_subpixel(const void *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
-                                         void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
-                                   stbir_datatype datatype,
-                                   int num_channels, int alpha_channel, int flags,
-                                   stbir_edge edge_mode_horizontal, stbir_edge edge_mode_vertical, 
-                                   stbir_filter filter_horizontal,  stbir_filter filter_vertical,
-                                   stbir_colorspace space, void *alloc_context,
-                                   float x_scale, float y_scale,
-                                   float x_offset, float y_offset);
-
-STBIRDEF int stbir_resize_region(  const void *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
-                                         void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
-                                   stbir_datatype datatype,
-                                   int num_channels, int alpha_channel, int flags,
-                                   stbir_edge edge_mode_horizontal, stbir_edge edge_mode_vertical, 
-                                   stbir_filter filter_horizontal,  stbir_filter filter_vertical,
-                                   stbir_colorspace space, void *alloc_context,
-                                   float s0, float t0, float s1, float t1);
-// (s0, t0) & (s1, t1) are the top-left and bottom right corner (uv addressing style: [0, 1]x[0, 1]) of a region of the input image to use.
-
-//
-//
-////   end header file   /////////////////////////////////////////////////////
-#endif // STBIR_INCLUDE_STB_IMAGE_RESIZE_H
-
-
-
-
-
-#ifdef STB_IMAGE_RESIZE_IMPLEMENTATION
-
-#ifndef STBIR_ASSERT
-#include <assert.h>
-#define STBIR_ASSERT(x) assert(x)
-#endif
-
-// For memset
-#include <string.h>
-
-#include <math.h>
-
-#ifndef STBIR_MALLOC
-#include <stdlib.h>
-#define STBIR_MALLOC(size,c) malloc(size)
-#define STBIR_FREE(ptr,c)    free(ptr)
-#endif
-
-#ifndef _MSC_VER
-#ifdef __cplusplus
-#define stbir__inline inline
-#else
-#define stbir__inline
-#endif
-#else
-#define stbir__inline __forceinline
-#endif
-
-
-// should produce compiler error if size is wrong
-typedef unsigned char stbir__validate_uint32[sizeof(stbir_uint32) == 4 ? 1 : -1];
-
-#ifdef _MSC_VER
-#define STBIR__NOTUSED(v)  (void)(v)
-#else
-#define STBIR__NOTUSED(v)  (void)sizeof(v)
-#endif
-
-#define STBIR__ARRAY_SIZE(a) (sizeof((a))/sizeof((a)[0]))
-
-#ifndef STBIR_DEFAULT_FILTER_UPSAMPLE
-#define STBIR_DEFAULT_FILTER_UPSAMPLE    STBIR_FILTER_CATMULLROM
-#endif
-
-#ifndef STBIR_DEFAULT_FILTER_DOWNSAMPLE
-#define STBIR_DEFAULT_FILTER_DOWNSAMPLE  STBIR_FILTER_MITCHELL
-#endif
-
-#ifndef STBIR_PROGRESS_REPORT
-#define STBIR_PROGRESS_REPORT(float_0_to_1)
-#endif
-
-#ifndef STBIR_MAX_CHANNELS
-#define STBIR_MAX_CHANNELS 64
-#endif
-
-#if STBIR_MAX_CHANNELS > 65536
-#error "Too many channels; STBIR_MAX_CHANNELS must be no more than 65536."
-// because we store the indices in 16-bit variables
-#endif
-
-// This value is added to alpha just before premultiplication to avoid
-// zeroing out color values. It is equivalent to 2^-80. If you don't want
-// that behavior (it may interfere if you have floating point images with
-// very small alpha values) then you can define STBIR_NO_ALPHA_EPSILON to
-// disable it.
-#ifndef STBIR_ALPHA_EPSILON
-#define STBIR_ALPHA_EPSILON ((float)1 / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20))
-#endif
-
-
-
-#ifdef _MSC_VER
-#define STBIR__UNUSED_PARAM(v)  (void)(v)
-#else
-#define STBIR__UNUSED_PARAM(v)  (void)sizeof(v)
-#endif
-
-// must match stbir_datatype
-static unsigned char stbir__type_size[] = {
-    1, // STBIR_TYPE_UINT8
-    2, // STBIR_TYPE_UINT16
-    4, // STBIR_TYPE_UINT32
-    4, // STBIR_TYPE_FLOAT
-};
-
-// Kernel function centered at 0
-typedef float (stbir__kernel_fn)(float x, float scale);
-typedef float (stbir__support_fn)(float scale);
-
-typedef struct
-{
-    stbir__kernel_fn* kernel;
-    stbir__support_fn* support;
-} stbir__filter_info;
-
-// When upsampling, the contributors are which source pixels contribute.
-// When downsampling, the contributors are which destination pixels are contributed to.
-typedef struct
-{
-    int n0; // First contributing pixel
-    int n1; // Last contributing pixel
-} stbir__contributors;
-
-typedef struct
-{
-    const void* input_data;
-    int input_w;
-    int input_h;
-    int input_stride_bytes;
-
-    void* output_data;
-    int output_w;
-    int output_h;
-    int output_stride_bytes;
-
-    float s0, t0, s1, t1;
-
-    float horizontal_shift; // Units: output pixels
-    float vertical_shift;   // Units: output pixels
-    float horizontal_scale;
-    float vertical_scale;
-
-    int channels;
-    int alpha_channel;
-    stbir_uint32 flags;
-    stbir_datatype type;
-    stbir_filter horizontal_filter;
-    stbir_filter vertical_filter;
-    stbir_edge edge_horizontal;
-    stbir_edge edge_vertical;
-    stbir_colorspace colorspace;
-
-    stbir__contributors* horizontal_contributors;
-    float* horizontal_coefficients;
-
-    stbir__contributors* vertical_contributors;
-    float* vertical_coefficients;
-
-    int decode_buffer_pixels;
-    float* decode_buffer;
-
-    float* horizontal_buffer;
-
-    // cache these because ceil/floor are inexplicably showing up in profile
-    int horizontal_coefficient_width;
-    int vertical_coefficient_width;
-    int horizontal_filter_pixel_width;
-    int vertical_filter_pixel_width;
-    int horizontal_filter_pixel_margin;
-    int vertical_filter_pixel_margin;
-    int horizontal_num_contributors;
-    int vertical_num_contributors;
-
-    int ring_buffer_length_bytes; // The length of an individual entry in the ring buffer. The total number of ring buffers is stbir__get_filter_pixel_width(filter)
-    int ring_buffer_first_scanline;
-    int ring_buffer_last_scanline;
-    int ring_buffer_begin_index;
-    float* ring_buffer;
-
-    float* encode_buffer; // A temporary buffer to store floats so we don't lose precision while we do multiply-adds.
-
-    int horizontal_contributors_size;
-    int horizontal_coefficients_size;
-    int vertical_contributors_size;
-    int vertical_coefficients_size;
-    int decode_buffer_size;
-    int horizontal_buffer_size;
-    int ring_buffer_size;
-    int encode_buffer_size;
-} stbir__info;
-
-static stbir__inline int stbir__min(int a, int b)
-{
-    return a < b ? a : b;
-}
-
-static stbir__inline int stbir__max(int a, int b)
-{
-    return a > b ? a : b;
-}
-
-static stbir__inline float stbir__saturate(float x)
-{
-    if (x < 0)
-        return 0;
-
-    if (x > 1)
-        return 1;
-
-    return x;
-}
-
-#ifdef STBIR_SATURATE_INT
-static stbir__inline stbir_uint8 stbir__saturate8(int x)
-{
-    if ((unsigned int) x <= 255)
-        return x;
-
-    if (x < 0)
-        return 0;
-
-    return 255;
-}
-
-static stbir__inline stbir_uint16 stbir__saturate16(int x)
-{
-    if ((unsigned int) x <= 65535)
-        return x;
-
-    if (x < 0)
-        return 0;
-
-    return 65535;
-}
-#endif
-
-static float stbir__srgb_uchar_to_linear_float[256] = {
-    0.000000f, 0.000304f, 0.000607f, 0.000911f, 0.001214f, 0.001518f, 0.001821f, 0.002125f, 0.002428f, 0.002732f, 0.003035f,
-    0.003347f, 0.003677f, 0.004025f, 0.004391f, 0.004777f, 0.005182f, 0.005605f, 0.006049f, 0.006512f, 0.006995f, 0.007499f,
-    0.008023f, 0.008568f, 0.009134f, 0.009721f, 0.010330f, 0.010960f, 0.011612f, 0.012286f, 0.012983f, 0.013702f, 0.014444f,
-    0.015209f, 0.015996f, 0.016807f, 0.017642f, 0.018500f, 0.019382f, 0.020289f, 0.021219f, 0.022174f, 0.023153f, 0.024158f,
-    0.025187f, 0.026241f, 0.027321f, 0.028426f, 0.029557f, 0.030713f, 0.031896f, 0.033105f, 0.034340f, 0.035601f, 0.036889f,
-    0.038204f, 0.039546f, 0.040915f, 0.042311f, 0.043735f, 0.045186f, 0.046665f, 0.048172f, 0.049707f, 0.051269f, 0.052861f,
-    0.054480f, 0.056128f, 0.057805f, 0.059511f, 0.061246f, 0.063010f, 0.064803f, 0.066626f, 0.068478f, 0.070360f, 0.072272f,
-    0.074214f, 0.076185f, 0.078187f, 0.080220f, 0.082283f, 0.084376f, 0.086500f, 0.088656f, 0.090842f, 0.093059f, 0.095307f,
-    0.097587f, 0.099899f, 0.102242f, 0.104616f, 0.107023f, 0.109462f, 0.111932f, 0.114435f, 0.116971f, 0.119538f, 0.122139f,
-    0.124772f, 0.127438f, 0.130136f, 0.132868f, 0.135633f, 0.138432f, 0.141263f, 0.144128f, 0.147027f, 0.149960f, 0.152926f,
-    0.155926f, 0.158961f, 0.162029f, 0.165132f, 0.168269f, 0.171441f, 0.174647f, 0.177888f, 0.181164f, 0.184475f, 0.187821f,
-    0.191202f, 0.194618f, 0.198069f, 0.201556f, 0.205079f, 0.208637f, 0.212231f, 0.215861f, 0.219526f, 0.223228f, 0.226966f,
-    0.230740f, 0.234551f, 0.238398f, 0.242281f, 0.246201f, 0.250158f, 0.254152f, 0.258183f, 0.262251f, 0.266356f, 0.270498f,
-    0.274677f, 0.278894f, 0.283149f, 0.287441f, 0.291771f, 0.296138f, 0.300544f, 0.304987f, 0.309469f, 0.313989f, 0.318547f,
-    0.323143f, 0.327778f, 0.332452f, 0.337164f, 0.341914f, 0.346704f, 0.351533f, 0.356400f, 0.361307f, 0.366253f, 0.371238f,
-    0.376262f, 0.381326f, 0.386430f, 0.391573f, 0.396755f, 0.401978f, 0.407240f, 0.412543f, 0.417885f, 0.423268f, 0.428691f,
-    0.434154f, 0.439657f, 0.445201f, 0.450786f, 0.456411f, 0.462077f, 0.467784f, 0.473532f, 0.479320f, 0.485150f, 0.491021f,
-    0.496933f, 0.502887f, 0.508881f, 0.514918f, 0.520996f, 0.527115f, 0.533276f, 0.539480f, 0.545725f, 0.552011f, 0.558340f,
-    0.564712f, 0.571125f, 0.577581f, 0.584078f, 0.590619f, 0.597202f, 0.603827f, 0.610496f, 0.617207f, 0.623960f, 0.630757f,
-    0.637597f, 0.644480f, 0.651406f, 0.658375f, 0.665387f, 0.672443f, 0.679543f, 0.686685f, 0.693872f, 0.701102f, 0.708376f,
-    0.715694f, 0.723055f, 0.730461f, 0.737911f, 0.745404f, 0.752942f, 0.760525f, 0.768151f, 0.775822f, 0.783538f, 0.791298f,
-    0.799103f, 0.806952f, 0.814847f, 0.822786f, 0.830770f, 0.838799f, 0.846873f, 0.854993f, 0.863157f, 0.871367f, 0.879622f,
-    0.887923f, 0.896269f, 0.904661f, 0.913099f, 0.921582f, 0.930111f, 0.938686f, 0.947307f, 0.955974f, 0.964686f, 0.973445f,
-    0.982251f, 0.991102f, 1.0f
-};
-
-static float stbir__srgb_to_linear(float f)
-{
-    if (f <= 0.04045f)
-        return f / 12.92f;
-    else
-        return (float)pow((f + 0.055f) / 1.055f, 2.4f);
-}
-
-static float stbir__linear_to_srgb(float f)
-{
-    if (f <= 0.0031308f)
-        return f * 12.92f;
-    else
-        return 1.055f * (float)pow(f, 1 / 2.4f) - 0.055f;
-}
-
-#ifndef STBIR_NON_IEEE_FLOAT
-// From https://gist.github.com/rygorous/2203834
-
-typedef union
-{
-    stbir_uint32 u;
-    float f;
-} stbir__FP32;
-
-static const stbir_uint32 fp32_to_srgb8_tab4[104] = {
-    0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d, 0x008d000d, 0x0094000d, 0x009a000d, 0x00a1000d,
-    0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a, 0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a,
-    0x010e0033, 0x01280033, 0x01410033, 0x015b0033, 0x01750033, 0x018f0033, 0x01a80033, 0x01c20033,
-    0x01dc0067, 0x020f0067, 0x02430067, 0x02760067, 0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067,
-    0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce, 0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5,
-    0x06970158, 0x07420142, 0x07e30130, 0x087b0120, 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
-    0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180, 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
-    0x11070264, 0x1238023e, 0x1357021d, 0x14660201, 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
-    0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad, 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
-    0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392, 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
-    0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5, 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
-    0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
-    0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
-};
- 
-static stbir_uint8 stbir__linear_to_srgb_uchar(float in)
-{
-    static const stbir__FP32 almostone = { 0x3f7fffff }; // 1-eps
-    static const stbir__FP32 minval = { (127-13) << 23 };
-    stbir_uint32 tab,bias,scale,t;
-    stbir__FP32 f;
- 
-    // Clamp to [2^(-13), 1-eps]; these two values map to 0 and 1, respectively.
-    // The tests are carefully written so that NaNs map to 0, same as in the reference
-    // implementation.
-    if (!(in > minval.f)) // written this way to catch NaNs
-        in = minval.f;
-    if (in > almostone.f)
-        in = almostone.f;
- 
-    // Do the table lookup and unpack bias, scale
-    f.f = in;
-    tab = fp32_to_srgb8_tab4[(f.u - minval.u) >> 20];
-    bias = (tab >> 16) << 9;
-    scale = tab & 0xffff;
- 
-    // Grab next-highest mantissa bits and perform linear interpolation
-    t = (f.u >> 12) & 0xff;
-    return (unsigned char) ((bias + scale*t) >> 16);
-}
-
-#else
-// sRGB transition values, scaled by 1<<28
-static int stbir__srgb_offset_to_linear_scaled[256] =
-{
-            0,     40738,    122216,    203693,    285170,    366648,    448125,    529603,
-       611080,    692557,    774035,    855852,    942009,   1033024,   1128971,   1229926,
-      1335959,   1447142,   1563542,   1685229,   1812268,   1944725,   2082664,   2226148,
-      2375238,   2529996,   2690481,   2856753,   3028870,   3206888,   3390865,   3580856,
-      3776916,   3979100,   4187460,   4402049,   4622919,   4850123,   5083710,   5323731,
-      5570236,   5823273,   6082892,   6349140,   6622065,   6901714,   7188133,   7481369,
-      7781466,   8088471,   8402427,   8723380,   9051372,   9386448,   9728650,  10078021,
-     10434603,  10798439,  11169569,  11548036,  11933879,  12327139,  12727857,  13136073,
-     13551826,  13975156,  14406100,  14844697,  15290987,  15745007,  16206795,  16676389,
-     17153826,  17639142,  18132374,  18633560,  19142734,  19659934,  20185196,  20718552,
-     21260042,  21809696,  22367554,  22933648,  23508010,  24090680,  24681686,  25281066,
-     25888850,  26505076,  27129772,  27762974,  28404716,  29055026,  29713942,  30381490,
-     31057708,  31742624,  32436272,  33138682,  33849884,  34569912,  35298800,  36036568,
-     36783260,  37538896,  38303512,  39077136,  39859796,  40651528,  41452360,  42262316,
-     43081432,  43909732,  44747252,  45594016,  46450052,  47315392,  48190064,  49074096,
-     49967516,  50870356,  51782636,  52704392,  53635648,  54576432,  55526772,  56486700,
-     57456236,  58435408,  59424248,  60422780,  61431036,  62449032,  63476804,  64514376,
-     65561776,  66619028,  67686160,  68763192,  69850160,  70947088,  72053992,  73170912,
-     74297864,  75434880,  76581976,  77739184,  78906536,  80084040,  81271736,  82469648,
-     83677792,  84896192,  86124888,  87363888,  88613232,  89872928,  91143016,  92423512,
-     93714432,  95015816,  96327688,  97650056,  98982952, 100326408, 101680440, 103045072,
-    104420320, 105806224, 107202800, 108610064, 110028048, 111456776, 112896264, 114346544,
-    115807632, 117279552, 118762328, 120255976, 121760536, 123276016, 124802440, 126339832,
-    127888216, 129447616, 131018048, 132599544, 134192112, 135795792, 137410592, 139036528,
-    140673648, 142321952, 143981456, 145652208, 147334208, 149027488, 150732064, 152447968,
-    154175200, 155913792, 157663776, 159425168, 161197984, 162982240, 164777968, 166585184,
-    168403904, 170234160, 172075968, 173929344, 175794320, 177670896, 179559120, 181458992,
-    183370528, 185293776, 187228736, 189175424, 191133888, 193104112, 195086128, 197079968,
-    199085648, 201103184, 203132592, 205173888, 207227120, 209292272, 211369392, 213458480,
-    215559568, 217672656, 219797792, 221934976, 224084240, 226245600, 228419056, 230604656,
-    232802400, 235012320, 237234432, 239468736, 241715280, 243974080, 246245120, 248528464,
-    250824112, 253132064, 255452368, 257785040, 260130080, 262487520, 264857376, 267239664,
-};
-
-static stbir_uint8 stbir__linear_to_srgb_uchar(float f)
-{
-    int x = (int) (f * (1 << 28)); // has headroom so you don't need to clamp
-    int v = 0;
-    int i;
-
-    // Refine the guess with a short binary search.
-    i = v + 128; if (x >= stbir__srgb_offset_to_linear_scaled[i]) v = i;
-    i = v +  64; if (x >= stbir__srgb_offset_to_linear_scaled[i]) v = i;
-    i = v +  32; if (x >= stbir__srgb_offset_to_linear_scaled[i]) v = i;
-    i = v +  16; if (x >= stbir__srgb_offset_to_linear_scaled[i]) v = i;
-    i = v +   8; if (x >= stbir__srgb_offset_to_linear_scaled[i]) v = i;
-    i = v +   4; if (x >= stbir__srgb_offset_to_linear_scaled[i]) v = i;
-    i = v +   2; if (x >= stbir__srgb_offset_to_linear_scaled[i]) v = i;
-    i = v +   1; if (x >= stbir__srgb_offset_to_linear_scaled[i]) v = i;
-
-    return (stbir_uint8) v;
-}
-#endif
-
-static float stbir__filter_trapezoid(float x, float scale)
-{
-    float halfscale = scale / 2;
-    float t = 0.5f + halfscale;
-    STBIR_ASSERT(scale <= 1);
-
-    x = (float)fabs(x);
-
-    if (x >= t)
-        return 0;
-    else
-    {
-        float r = 0.5f - halfscale;
-        if (x <= r)
-            return 1;
-        else
-            return (t - x) / scale;
-    }
-}
-
-static float stbir__support_trapezoid(float scale)
-{
-    STBIR_ASSERT(scale <= 1);
-    return 0.5f + scale / 2;
-}
-
-static float stbir__filter_triangle(float x, float s)
-{
-    STBIR__UNUSED_PARAM(s);
-
-    x = (float)fabs(x);
-
-    if (x <= 1.0f)
-        return 1 - x;
-    else
-        return 0;
-}
-
-static float stbir__filter_cubic(float x, float s)
-{
-    STBIR__UNUSED_PARAM(s);
-
-    x = (float)fabs(x);
-
-    if (x < 1.0f)
-        return (4 + x*x*(3*x - 6))/6;
-    else if (x < 2.0f)
-        return (8 + x*(-12 + x*(6 - x)))/6;
-
-    return (0.0f);
-}
-
-static float stbir__filter_catmullrom(float x, float s)
-{
-    STBIR__UNUSED_PARAM(s);
-
-    x = (float)fabs(x);
-
-    if (x < 1.0f)
-        return 1 - x*x*(2.5f - 1.5f*x);
-    else if (x < 2.0f)
-        return 2 - x*(4 + x*(0.5f*x - 2.5f));
-
-    return (0.0f);
-}
-
-static float stbir__filter_mitchell(float x, float s)
-{
-    STBIR__UNUSED_PARAM(s);
-
-    x = (float)fabs(x);
-
-    if (x < 1.0f)
-        return (16 + x*x*(21 * x - 36))/18;
-    else if (x < 2.0f)
-        return (32 + x*(-60 + x*(36 - 7*x)))/18;
-
-    return (0.0f);
-}
-
-static float stbir__support_zero(float s)
-{
-    STBIR__UNUSED_PARAM(s);
-    return 0;
-}
-
-static float stbir__support_one(float s)
-{
-    STBIR__UNUSED_PARAM(s);
-    return 1;
-}
-
-static float stbir__support_two(float s)
-{
-    STBIR__UNUSED_PARAM(s);
-    return 2;
-}
-
-static stbir__filter_info stbir__filter_info_table[] = {
-        { NULL,                     stbir__support_zero },
-        { stbir__filter_trapezoid,  stbir__support_trapezoid },
-        { stbir__filter_triangle,   stbir__support_one },
-        { stbir__filter_cubic,      stbir__support_two },
-        { stbir__filter_catmullrom, stbir__support_two },
-        { stbir__filter_mitchell,   stbir__support_two },
-};
-
-stbir__inline static int stbir__use_upsampling(float ratio)
-{
-    return ratio > 1;
-}
-
-stbir__inline static int stbir__use_width_upsampling(stbir__info* stbir_info)
-{
-    return stbir__use_upsampling(stbir_info->horizontal_scale);
-}
-
-stbir__inline static int stbir__use_height_upsampling(stbir__info* stbir_info)
-{
-    return stbir__use_upsampling(stbir_info->vertical_scale);
-}
-
-// This is the maximum number of input samples that can affect an output sample
-// with the given filter
-static int stbir__get_filter_pixel_width(stbir_filter filter, float scale)
-{
-    STBIR_ASSERT(filter != 0);
-    STBIR_ASSERT(filter < STBIR__ARRAY_SIZE(stbir__filter_info_table));
-
-    if (stbir__use_upsampling(scale))
-        return (int)ceil(stbir__filter_info_table[filter].support(1/scale) * 2);
-    else
-        return (int)ceil(stbir__filter_info_table[filter].support(scale) * 2 / scale);
-}
-
-// This is how much to expand buffers to account for filters seeking outside
-// the image boundaries.
-static int stbir__get_filter_pixel_margin(stbir_filter filter, float scale)
-{
-    return stbir__get_filter_pixel_width(filter, scale) / 2;
-}
-
-static int stbir__get_coefficient_width(stbir_filter filter, float scale)
-{
-    if (stbir__use_upsampling(scale))
-        return (int)ceil(stbir__filter_info_table[filter].support(1 / scale) * 2);
-    else
-        return (int)ceil(stbir__filter_info_table[filter].support(scale) * 2);
-}
-
-static int stbir__get_contributors(float scale, stbir_filter filter, int input_size, int output_size)
-{
-    if (stbir__use_upsampling(scale))
-        return output_size;
-    else
-        return (input_size + stbir__get_filter_pixel_margin(filter, scale) * 2);
-}
-
-static int stbir__get_total_horizontal_coefficients(stbir__info* info)
-{
-    return info->horizontal_num_contributors
-         * stbir__get_coefficient_width      (info->horizontal_filter, info->horizontal_scale);
-}
-
-static int stbir__get_total_vertical_coefficients(stbir__info* info)
-{
-    return info->vertical_num_contributors
-         * stbir__get_coefficient_width      (info->vertical_filter, info->vertical_scale);
-}
-
-static stbir__contributors* stbir__get_contributor(stbir__contributors* contributors, int n)
-{
-    return &contributors[n];
-}
-
-// For perf reasons this code is duplicated in stbir__resample_horizontal_upsample/downsample,
-// if you change it here change it there too.
-static float* stbir__get_coefficient(float* coefficients, stbir_filter filter, float scale, int n, int c)
-{
-    int width = stbir__get_coefficient_width(filter, scale);
-    return &coefficients[width*n + c];
-}
-
-static int stbir__edge_wrap_slow(stbir_edge edge, int n, int max)
-{
-    switch (edge)
-    {
-    case STBIR_EDGE_ZERO:
-        return 0; // we'll decode the wrong pixel here, and then overwrite with 0s later
-
-    case STBIR_EDGE_CLAMP:
-        if (n < 0)
-            return 0;
-
-        if (n >= max)
-            return max - 1;
-
-        return n; // NOTREACHED
-
-    case STBIR_EDGE_REFLECT:
-    {
-        if (n < 0)
-        {
-            if (n < max)
-                return -n;
-            else
-                return max - 1;
-        }
-
-        if (n >= max)
-        {
-            int max2 = max * 2;
-            if (n >= max2)
-                return 0;
-            else
-                return max2 - n - 1;
-        }
-
-        return n; // NOTREACHED
-    }
-
-    case STBIR_EDGE_WRAP:
-        if (n >= 0)
-            return (n % max);
-        else
-        {
-            int m = (-n) % max;
-
-            if (m != 0)
-                m = max - m;
-
-            return (m);
-        }
-        return n;  // NOTREACHED
-
-    default:
-        STBIR_ASSERT(!"Unimplemented edge type");
-        return 0;
-    }
-}
-
-stbir__inline static int stbir__edge_wrap(stbir_edge edge, int n, int max)
-{
-    // avoid per-pixel switch
-    if (n >= 0 && n < max)
-        return n;
-    return stbir__edge_wrap_slow(edge, n, max);
-}
-
-// What input pixels contribute to this output pixel?
-static void stbir__calculate_sample_range_upsample(int n, float out_filter_radius, float scale_ratio, float out_shift, int* in_first_pixel, int* in_last_pixel, float* in_center_of_out)
-{
-    float out_pixel_center = (float)n + 0.5f;
-    float out_pixel_influence_lowerbound = out_pixel_center - out_filter_radius;
-    float out_pixel_influence_upperbound = out_pixel_center + out_filter_radius;
-
-    float in_pixel_influence_lowerbound = (out_pixel_influence_lowerbound + out_shift) / scale_ratio;
-    float in_pixel_influence_upperbound = (out_pixel_influence_upperbound + out_shift) / scale_ratio;
-
-    *in_center_of_out = (out_pixel_center + out_shift) / scale_ratio;
-    *in_first_pixel = (int)(floor(in_pixel_influence_lowerbound + 0.5));
-    *in_last_pixel = (int)(floor(in_pixel_influence_upperbound - 0.5));
-}
-
-// What output pixels does this input pixel contribute to?
-static void stbir__calculate_sample_range_downsample(int n, float in_pixels_radius, float scale_ratio, float out_shift, int* out_first_pixel, int* out_last_pixel, float* out_center_of_in)
-{
-    float in_pixel_center = (float)n + 0.5f;
-    float in_pixel_influence_lowerbound = in_pixel_center - in_pixels_radius;
-    float in_pixel_influence_upperbound = in_pixel_center + in_pixels_radius;
-
-    float out_pixel_influence_lowerbound = in_pixel_influence_lowerbound * scale_ratio - out_shift;
-    float out_pixel_influence_upperbound = in_pixel_influence_upperbound * scale_ratio - out_shift;
-
-    *out_center_of_in = in_pixel_center * scale_ratio - out_shift;
-    *out_first_pixel = (int)(floor(out_pixel_influence_lowerbound + 0.5));
-    *out_last_pixel = (int)(floor(out_pixel_influence_upperbound - 0.5));
-}
-
-static void stbir__calculate_coefficients_upsample(stbir__info* stbir_info, stbir_filter filter, float scale, int in_first_pixel, int in_last_pixel, float in_center_of_out, stbir__contributors* contributor, float* coefficient_group)
-{
-    int i;
-    float total_filter = 0;
-    float filter_scale;
-
-    STBIR_ASSERT(in_last_pixel - in_first_pixel <= (int)ceil(stbir__filter_info_table[filter].support(1/scale) * 2)); // Taken directly from stbir__get_coefficient_width() which we can't call because we don't know if we're horizontal or vertical.
-
-    contributor->n0 = in_first_pixel;
-    contributor->n1 = in_last_pixel;
-
-    STBIR_ASSERT(contributor->n1 >= contributor->n0);
-
-    for (i = 0; i <= in_last_pixel - in_first_pixel; i++)
-    {
-        float in_pixel_center = (float)(i + in_first_pixel) + 0.5f;
-        coefficient_group[i] = stbir__filter_info_table[filter].kernel(in_center_of_out - in_pixel_center, 1 / scale);
-
-        // If the coefficient is zero, skip it. (Don't do the <0 check here, we want the influence of those outside pixels.)
-        if (i == 0 && !coefficient_group[i])
-        {
-            contributor->n0 = ++in_first_pixel;
-            i--;
-            continue;
-        }
-
-        total_filter += coefficient_group[i];
-    }
-
-    STBIR_ASSERT(stbir__filter_info_table[filter].kernel((float)(in_last_pixel + 1) + 0.5f - in_center_of_out, 1/scale) == 0);
-
-    STBIR_ASSERT(total_filter > 0.9);
-    STBIR_ASSERT(total_filter < 1.1f); // Make sure it's not way off.
-
-    // Make sure the sum of all coefficients is 1.
-    filter_scale = 1 / total_filter;
-
-    for (i = 0; i <= in_last_pixel - in_first_pixel; i++)
-        coefficient_group[i] *= filter_scale;
-
-    for (i = in_last_pixel - in_first_pixel; i >= 0; i--)
-    {
-        if (coefficient_group[i])
-            break;
-
-        // This line has no weight. We can skip it.
-        contributor->n1 = contributor->n0 + i - 1;
-    }
-}
-
-static void stbir__calculate_coefficients_downsample(stbir__info* stbir_info, stbir_filter filter, float scale_ratio, int out_first_pixel, int out_last_pixel, float out_center_of_in, stbir__contributors* contributor, float* coefficient_group)
-{
-    int i;
-
-     STBIR_ASSERT(out_last_pixel - out_first_pixel <= (int)ceil(stbir__filter_info_table[filter].support(scale_ratio) * 2)); // Taken directly from stbir__get_coefficient_width() which we can't call because we don't know if we're horizontal or vertical.
-
-    contributor->n0 = out_first_pixel;
-    contributor->n1 = out_last_pixel;
-
-    STBIR_ASSERT(contributor->n1 >= contributor->n0);
-
-    for (i = 0; i <= out_last_pixel - out_first_pixel; i++)
-    {
-        float out_pixel_center = (float)(i + out_first_pixel) + 0.5f;
-        float x = out_pixel_center - out_center_of_in;
-        coefficient_group[i] = stbir__filter_info_table[filter].kernel(x, scale_ratio) * scale_ratio;
-    }
-
-    STBIR_ASSERT(stbir__filter_info_table[filter].kernel((float)(out_last_pixel + 1) + 0.5f - out_center_of_in, scale_ratio) == 0);
-
-    for (i = out_last_pixel - out_first_pixel; i >= 0; i--)
-    {
-        if (coefficient_group[i])
-            break;
-
-        // This line has no weight. We can skip it.
-        contributor->n1 = contributor->n0 + i - 1;
-    }
-}
-
-static void stbir__normalize_downsample_coefficients(stbir__info* stbir_info, stbir__contributors* contributors, float* coefficients, stbir_filter filter, float scale_ratio, float shift, int input_size, int output_size)
-{
-    int num_contributors = stbir__get_contributors(scale_ratio, filter, input_size, output_size);
-    int num_coefficients = stbir__get_coefficient_width(filter, scale_ratio);
-    int i, j;
-    int skip;
-
-    for (i = 0; i < output_size; i++)
-    {
-        float scale;
-        float total = 0;
-
-        for (j = 0; j < num_contributors; j++)
-        {
-            if (i >= contributors[j].n0 && i <= contributors[j].n1)
-            {
-                float coefficient = *stbir__get_coefficient(coefficients, filter, scale_ratio, j, i - contributors[j].n0);
-                total += coefficient;
-            }
-            else if (i < contributors[j].n0)
-                break;
-        }
-
-        STBIR_ASSERT(total > 0.9f);
-        STBIR_ASSERT(total < 1.1f);
-
-        scale = 1 / total;
-
-        for (j = 0; j < num_contributors; j++)
-        {
-            if (i >= contributors[j].n0 && i <= contributors[j].n1)
-                *stbir__get_coefficient(coefficients, filter, scale_ratio, j, i - contributors[j].n0) *= scale;
-            else if (i < contributors[j].n0)
-                break;
-        }
-    }
-
-    // Optimize: Skip zero coefficients and contributions outside of image bounds.
-    // Do this after normalizing because normalization depends on the n0/n1 values.
-    for (j = 0; j < num_contributors; j++)
-    {
-        int range, max, width;
-
-        skip = 0;
-        while (*stbir__get_coefficient(coefficients, filter, scale_ratio, j, skip) == 0)
-            skip++;
-
-        contributors[j].n0 += skip;
-
-        while (contributors[j].n0 < 0)
-        {
-            contributors[j].n0++;
-            skip++;
-        }
-
-        range = contributors[j].n1 - contributors[j].n0 + 1;
-        max = stbir__min(num_coefficients, range);
-
-        width = stbir__get_coefficient_width(filter, scale_ratio);
-        for (i = 0; i < max; i++)
-        {
-            if (i + skip >= width)
-                break;
-
-            *stbir__get_coefficient(coefficients, filter, scale_ratio, j, i) = *stbir__get_coefficient(coefficients, filter, scale_ratio, j, i + skip);
-        }
-
-        continue;
-    }
-
-    // Using min to avoid writing into invalid pixels.
-    for (i = 0; i < num_contributors; i++)
-        contributors[i].n1 = stbir__min(contributors[i].n1, output_size - 1);
-}
-
-// Each scan line uses the same kernel values so we should calculate the kernel
-// values once and then we can use them for every scan line.
-static void stbir__calculate_filters(stbir__info* stbir_info, stbir__contributors* contributors, float* coefficients, stbir_filter filter, float scale_ratio, float shift, int input_size, int output_size)
-{
-    int n;
-    int total_contributors = stbir__get_contributors(scale_ratio, filter, input_size, output_size);
-
-    if (stbir__use_upsampling(scale_ratio))
-    {
-        float out_pixels_radius = stbir__filter_info_table[filter].support(1 / scale_ratio) * scale_ratio;
-
-        // Looping through out pixels
-        for (n = 0; n < total_contributors; n++)
-        {
-            float in_center_of_out; // Center of the current out pixel in the in pixel space
-            int in_first_pixel, in_last_pixel;
-
-            stbir__calculate_sample_range_upsample(n, out_pixels_radius, scale_ratio, shift, &in_first_pixel, &in_last_pixel, &in_center_of_out);
-
-            stbir__calculate_coefficients_upsample(stbir_info, filter, scale_ratio, in_first_pixel, in_last_pixel, in_center_of_out, stbir__get_contributor(contributors, n), stbir__get_coefficient(coefficients, filter, scale_ratio, n, 0));
-        }
-    }
-    else
-    {
-        float in_pixels_radius = stbir__filter_info_table[filter].support(scale_ratio) / scale_ratio;
-
-        // Looping through in pixels
-        for (n = 0; n < total_contributors; n++)
-        {
-            float out_center_of_in; // Center of the current out pixel in the in pixel space
-            int out_first_pixel, out_last_pixel;
-            int n_adjusted = n - stbir__get_filter_pixel_margin(filter, scale_ratio);
-
-            stbir__calculate_sample_range_downsample(n_adjusted, in_pixels_radius, scale_ratio, shift, &out_first_pixel, &out_last_pixel, &out_center_of_in);
-
-            stbir__calculate_coefficients_downsample(stbir_info, filter, scale_ratio, out_first_pixel, out_last_pixel, out_center_of_in, stbir__get_contributor(contributors, n), stbir__get_coefficient(coefficients, filter, scale_ratio, n, 0));
-        }
-
-        stbir__normalize_downsample_coefficients(stbir_info, contributors, coefficients, filter, scale_ratio, shift, input_size, output_size);
-    }
-}
-
-static float* stbir__get_decode_buffer(stbir__info* stbir_info)
-{
-    // The 0 index of the decode buffer starts after the margin. This makes
-    // it okay to use negative indexes on the decode buffer.
-    return &stbir_info->decode_buffer[stbir_info->horizontal_filter_pixel_margin * stbir_info->channels];
-}
-
-#define STBIR__DECODE(type, colorspace) ((type) * (STBIR_MAX_COLORSPACES) + (colorspace))
-
-static void stbir__decode_scanline(stbir__info* stbir_info, int n)
-{
-    int c;
-    int channels = stbir_info->channels;
-    int alpha_channel = stbir_info->alpha_channel;
-    int type = stbir_info->type;
-    int colorspace = stbir_info->colorspace;
-    int input_w = stbir_info->input_w;
-    int input_stride_bytes = stbir_info->input_stride_bytes;
-    float* decode_buffer = stbir__get_decode_buffer(stbir_info);
-    stbir_edge edge_horizontal = stbir_info->edge_horizontal;
-    stbir_edge edge_vertical = stbir_info->edge_vertical;
-    int in_buffer_row_offset = stbir__edge_wrap(edge_vertical, n, stbir_info->input_h) * input_stride_bytes;
-    const void* input_data = (char *) stbir_info->input_data + in_buffer_row_offset;
-    int max_x = input_w + stbir_info->horizontal_filter_pixel_margin;
-    int decode = STBIR__DECODE(type, colorspace);
-
-    int x = -stbir_info->horizontal_filter_pixel_margin;
-
-    // special handling for STBIR_EDGE_ZERO because it needs to return an item that doesn't appear in the input,
-    // and we want to avoid paying overhead on every pixel if not STBIR_EDGE_ZERO
-    if (edge_vertical == STBIR_EDGE_ZERO && (n < 0 || n >= stbir_info->input_h))
-    {
-        for (; x < max_x; x++)
-            for (c = 0; c < channels; c++)
-                decode_buffer[x*channels + c] = 0;
-        return;
-    }
-
-    switch (decode)
-    {
-    case STBIR__DECODE(STBIR_TYPE_UINT8, STBIR_COLORSPACE_LINEAR):
-        for (; x < max_x; x++)
-        {
-            int decode_pixel_index = x * channels;
-            int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels;
-            for (c = 0; c < channels; c++)
-                decode_buffer[decode_pixel_index + c] = ((float)((const unsigned char*)input_data)[input_pixel_index + c]) / 255;
-        }
-        break;
-
-    case STBIR__DECODE(STBIR_TYPE_UINT8, STBIR_COLORSPACE_SRGB):
-        for (; x < max_x; x++)
-        {
-            int decode_pixel_index = x * channels;
-            int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels;
-            for (c = 0; c < channels; c++)
-                decode_buffer[decode_pixel_index + c] = stbir__srgb_uchar_to_linear_float[((const unsigned char*)input_data)[input_pixel_index + c]];
-
-            if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE))
-                decode_buffer[decode_pixel_index + alpha_channel] = ((float)((const unsigned char*)input_data)[input_pixel_index + alpha_channel]) / 255;
-        }
-        break;
-
-    case STBIR__DECODE(STBIR_TYPE_UINT16, STBIR_COLORSPACE_LINEAR):
-        for (; x < max_x; x++)
-        {
-            int decode_pixel_index = x * channels;
-            int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels;
-            for (c = 0; c < channels; c++)
-                decode_buffer[decode_pixel_index + c] = ((float)((const unsigned short*)input_data)[input_pixel_index + c]) / 65535;
-        }
-        break;
-
-    case STBIR__DECODE(STBIR_TYPE_UINT16, STBIR_COLORSPACE_SRGB):
-        for (; x < max_x; x++)
-        {
-            int decode_pixel_index = x * channels;
-            int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels;
-            for (c = 0; c < channels; c++)
-                decode_buffer[decode_pixel_index + c] = stbir__srgb_to_linear(((float)((const unsigned short*)input_data)[input_pixel_index + c]) / 65535);
-
-            if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE))
-                decode_buffer[decode_pixel_index + alpha_channel] = ((float)((const unsigned short*)input_data)[input_pixel_index + alpha_channel]) / 65535;
-        }
-        break;
-
-    case STBIR__DECODE(STBIR_TYPE_UINT32, STBIR_COLORSPACE_LINEAR):
-        for (; x < max_x; x++)
-        {
-            int decode_pixel_index = x * channels;
-            int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels;
-            for (c = 0; c < channels; c++)
-                decode_buffer[decode_pixel_index + c] = (float)(((double)((const unsigned int*)input_data)[input_pixel_index + c]) / 4294967295);
-        }
-        break;
-
-    case STBIR__DECODE(STBIR_TYPE_UINT32, STBIR_COLORSPACE_SRGB):
-        for (; x < max_x; x++)
-        {
-            int decode_pixel_index = x * channels;
-            int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels;
-            for (c = 0; c < channels; c++)
-                decode_buffer[decode_pixel_index + c] = stbir__srgb_to_linear((float)(((double)((const unsigned int*)input_data)[input_pixel_index + c]) / 4294967295));
-
-            if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE))
-                decode_buffer[decode_pixel_index + alpha_channel] = (float)(((double)((const unsigned int*)input_data)[input_pixel_index + alpha_channel]) / 4294967295);
-        }
-        break;
-
-    case STBIR__DECODE(STBIR_TYPE_FLOAT, STBIR_COLORSPACE_LINEAR):
-        for (; x < max_x; x++)
-        {
-            int decode_pixel_index = x * channels;
-            int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels;
-            for (c = 0; c < channels; c++)
-                decode_buffer[decode_pixel_index + c] = ((const float*)input_data)[input_pixel_index + c];
-        }
-        break;
-
-    case STBIR__DECODE(STBIR_TYPE_FLOAT, STBIR_COLORSPACE_SRGB):
-        for (; x < max_x; x++)
-        {
-            int decode_pixel_index = x * channels;
-            int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels;
-            for (c = 0; c < channels; c++)
-                decode_buffer[decode_pixel_index + c] = stbir__srgb_to_linear(((const float*)input_data)[input_pixel_index + c]);
-
-            if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE))
-                decode_buffer[decode_pixel_index + alpha_channel] = ((const float*)input_data)[input_pixel_index + alpha_channel];
-        }
-
-        break;
-
-    default:
-        STBIR_ASSERT(!"Unknown type/colorspace/channels combination.");
-        break;
-    }
-
-    if (!(stbir_info->flags & STBIR_FLAG_ALPHA_PREMULTIPLIED))
-    {
-        for (x = -stbir_info->horizontal_filter_pixel_margin; x < max_x; x++)
-        {
-            int decode_pixel_index = x * channels;
-
-            // If the alpha value is 0 it will clobber the color values. Make sure it's not.
-            float alpha = decode_buffer[decode_pixel_index + alpha_channel];
-#ifndef STBIR_NO_ALPHA_EPSILON
-            if (stbir_info->type != STBIR_TYPE_FLOAT) {
-                alpha += STBIR_ALPHA_EPSILON;
-                decode_buffer[decode_pixel_index + alpha_channel] = alpha;
-            }
-#endif
-            for (c = 0; c < channels; c++)
-            {
-                if (c == alpha_channel)
-                    continue;
-
-                decode_buffer[decode_pixel_index + c] *= alpha;
-            }
-        }
-    }
-
-    if (edge_horizontal == STBIR_EDGE_ZERO)
-    {
-        for (x = -stbir_info->horizontal_filter_pixel_margin; x < 0; x++)
-        {
-            for (c = 0; c < channels; c++)
-                decode_buffer[x*channels + c] = 0;
-        }
-        for (x = input_w; x < max_x; x++)
-        {
-            for (c = 0; c < channels; c++)
-                decode_buffer[x*channels + c] = 0;
-        }
-    }
-}
-
-static float* stbir__get_ring_buffer_entry(float* ring_buffer, int index, int ring_buffer_length)
-{
-    return &ring_buffer[index * ring_buffer_length];
-}
-
-static float* stbir__add_empty_ring_buffer_entry(stbir__info* stbir_info, int n)
-{
-    int ring_buffer_index;
-    float* ring_buffer;
-
-    if (stbir_info->ring_buffer_begin_index < 0)
-    {
-        ring_buffer_index = stbir_info->ring_buffer_begin_index = 0;
-        stbir_info->ring_buffer_first_scanline = n;
-    }
-    else
-    {
-        ring_buffer_index = (stbir_info->ring_buffer_begin_index + (stbir_info->ring_buffer_last_scanline - stbir_info->ring_buffer_first_scanline) + 1) % stbir_info->vertical_filter_pixel_width;
-        STBIR_ASSERT(ring_buffer_index != stbir_info->ring_buffer_begin_index);
-    }
-
-    ring_buffer = stbir__get_ring_buffer_entry(stbir_info->ring_buffer, ring_buffer_index, stbir_info->ring_buffer_length_bytes / sizeof(float));
-    memset(ring_buffer, 0, stbir_info->ring_buffer_length_bytes);
-
-    stbir_info->ring_buffer_last_scanline = n;
-
-    return ring_buffer;
-}
-
-
-static void stbir__resample_horizontal_upsample(stbir__info* stbir_info, int n, float* output_buffer)
-{
-    int x, k;
-    int output_w = stbir_info->output_w;
-    int kernel_pixel_width = stbir_info->horizontal_filter_pixel_width;
-    int channels = stbir_info->channels;
-    float* decode_buffer = stbir__get_decode_buffer(stbir_info);
-    stbir__contributors* horizontal_contributors = stbir_info->horizontal_contributors;
-    float* horizontal_coefficients = stbir_info->horizontal_coefficients;
-    int coefficient_width = stbir_info->horizontal_coefficient_width;
-
-    for (x = 0; x < output_w; x++)
-    {
-        int n0 = horizontal_contributors[x].n0;
-        int n1 = horizontal_contributors[x].n1;
-
-        int out_pixel_index = x * channels;
-        int coefficient_group = coefficient_width * x;
-        int coefficient_counter = 0;
-
-        STBIR_ASSERT(n1 >= n0);
-        STBIR_ASSERT(n0 >= -stbir_info->horizontal_filter_pixel_margin);
-        STBIR_ASSERT(n1 >= -stbir_info->horizontal_filter_pixel_margin);
-        STBIR_ASSERT(n0 < stbir_info->input_w + stbir_info->horizontal_filter_pixel_margin);
-        STBIR_ASSERT(n1 < stbir_info->input_w + stbir_info->horizontal_filter_pixel_margin);
-
-        switch (channels) {
-            case 1:
-                for (k = n0; k <= n1; k++)
-                {
-                    int in_pixel_index = k * 1;
-                    float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
-                    STBIR_ASSERT(coefficient != 0);
-                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
-                }
-                break;
-            case 2:
-                for (k = n0; k <= n1; k++)
-                {
-                    int in_pixel_index = k * 2;
-                    float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
-                    STBIR_ASSERT(coefficient != 0);
-                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
-                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
-                }
-                break;
-            case 3:
-                for (k = n0; k <= n1; k++)
-                {
-                    int in_pixel_index = k * 3;
-                    float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
-                    STBIR_ASSERT(coefficient != 0);
-                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
-                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
-                    output_buffer[out_pixel_index + 2] += decode_buffer[in_pixel_index + 2] * coefficient;
-                }
-                break;
-            case 4:
-                for (k = n0; k <= n1; k++)
-                {
-                    int in_pixel_index = k * 4;
-                    float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
-                    STBIR_ASSERT(coefficient != 0);
-                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
-                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
-                    output_buffer[out_pixel_index + 2] += decode_buffer[in_pixel_index + 2] * coefficient;
-                    output_buffer[out_pixel_index + 3] += decode_buffer[in_pixel_index + 3] * coefficient;
-                }
-                break;
-            default:
-                for (k = n0; k <= n1; k++)
-                {
-                    int in_pixel_index = k * channels;
-                    float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
-                    int c;
-                    STBIR_ASSERT(coefficient != 0);
-                    for (c = 0; c < channels; c++)
-                        output_buffer[out_pixel_index + c] += decode_buffer[in_pixel_index + c] * coefficient;
-                }
-                break;
-        }
-    }
-}
-
-static void stbir__resample_horizontal_downsample(stbir__info* stbir_info, int n, float* output_buffer)
-{
-    int x, k;
-    int input_w = stbir_info->input_w;
-    int output_w = stbir_info->output_w;
-    int kernel_pixel_width = stbir_info->horizontal_filter_pixel_width;
-    int channels = stbir_info->channels;
-    float* decode_buffer = stbir__get_decode_buffer(stbir_info);
-    stbir__contributors* horizontal_contributors = stbir_info->horizontal_contributors;
-    float* horizontal_coefficients = stbir_info->horizontal_coefficients;
-    int coefficient_width = stbir_info->horizontal_coefficient_width;
-    int filter_pixel_margin = stbir_info->horizontal_filter_pixel_margin;
-    int max_x = input_w + filter_pixel_margin * 2;
-
-    STBIR_ASSERT(!stbir__use_width_upsampling(stbir_info));
-
-    switch (channels) {
-        case 1:
-            for (x = 0; x < max_x; x++)
-            {
-                int n0 = horizontal_contributors[x].n0;
-                int n1 = horizontal_contributors[x].n1;
-
-                int in_x = x - filter_pixel_margin;
-                int in_pixel_index = in_x * 1;
-                int max_n = n1;
-                int coefficient_group = coefficient_width * x;
-
-                for (k = n0; k <= max_n; k++)
-                {
-                    int out_pixel_index = k * 1;
-                    float coefficient = horizontal_coefficients[coefficient_group + k - n0];
-                    STBIR_ASSERT(coefficient != 0);
-                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
-                }
-            }
-            break;
-
-        case 2:
-            for (x = 0; x < max_x; x++)
-            {
-                int n0 = horizontal_contributors[x].n0;
-                int n1 = horizontal_contributors[x].n1;
-
-                int in_x = x - filter_pixel_margin;
-                int in_pixel_index = in_x * 2;
-                int max_n = n1;
-                int coefficient_group = coefficient_width * x;
-
-                for (k = n0; k <= max_n; k++)
-                {
-                    int out_pixel_index = k * 2;
-                    float coefficient = horizontal_coefficients[coefficient_group + k - n0];
-                    STBIR_ASSERT(coefficient != 0);
-                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
-                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
-                }
-            }
-            break;
-
-        case 3:
-            for (x = 0; x < max_x; x++)
-            {
-                int n0 = horizontal_contributors[x].n0;
-                int n1 = horizontal_contributors[x].n1;
-
-                int in_x = x - filter_pixel_margin;
-                int in_pixel_index = in_x * 3;
-                int max_n = n1;
-                int coefficient_group = coefficient_width * x;
-
-                for (k = n0; k <= max_n; k++)
-                {
-                    int out_pixel_index = k * 3;
-                    float coefficient = horizontal_coefficients[coefficient_group + k - n0];
-                    STBIR_ASSERT(coefficient != 0);
-                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
-                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
-                    output_buffer[out_pixel_index + 2] += decode_buffer[in_pixel_index + 2] * coefficient;
-                }
-            }
-            break;
-
-        case 4:
-            for (x = 0; x < max_x; x++)
-            {
-                int n0 = horizontal_contributors[x].n0;
-                int n1 = horizontal_contributors[x].n1;
-
-                int in_x = x - filter_pixel_margin;
-                int in_pixel_index = in_x * 4;
-                int max_n = n1;
-                int coefficient_group = coefficient_width * x;
-
-                for (k = n0; k <= max_n; k++)
-                {
-                    int out_pixel_index = k * 4;
-                    float coefficient = horizontal_coefficients[coefficient_group + k - n0];
-                    STBIR_ASSERT(coefficient != 0);
-                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
-                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
-                    output_buffer[out_pixel_index + 2] += decode_buffer[in_pixel_index + 2] * coefficient;
-                    output_buffer[out_pixel_index + 3] += decode_buffer[in_pixel_index + 3] * coefficient;
-                }
-            }
-            break;
-
-        default:
-            for (x = 0; x < max_x; x++)
-            {
-                int n0 = horizontal_contributors[x].n0;
-                int n1 = horizontal_contributors[x].n1;
-
-                int in_x = x - filter_pixel_margin;
-                int in_pixel_index = in_x * channels;
-                int max_n = n1;
-                int coefficient_group = coefficient_width * x;
-
-                for (k = n0; k <= max_n; k++)
-                {
-                    int c;
-                    int out_pixel_index = k * channels;
-                    float coefficient = horizontal_coefficients[coefficient_group + k - n0];
-                    STBIR_ASSERT(coefficient != 0);
-                    for (c = 0; c < channels; c++)
-                        output_buffer[out_pixel_index + c] += decode_buffer[in_pixel_index + c] * coefficient;
-                }
-            }
-            break;
-    }
-}
-
-static void stbir__decode_and_resample_upsample(stbir__info* stbir_info, int n)
-{
-    // Decode the nth scanline from the source image into the decode buffer.
-    stbir__decode_scanline(stbir_info, n);
-
-    // Now resample it into the ring buffer.
-    if (stbir__use_width_upsampling(stbir_info))
-        stbir__resample_horizontal_upsample(stbir_info, n, stbir__add_empty_ring_buffer_entry(stbir_info, n));
-    else
-        stbir__resample_horizontal_downsample(stbir_info, n, stbir__add_empty_ring_buffer_entry(stbir_info, n));
-
-    // Now it's sitting in the ring buffer ready to be used as source for the vertical sampling.
-}
-
-static void stbir__decode_and_resample_downsample(stbir__info* stbir_info, int n)
-{
-    // Decode the nth scanline from the source image into the decode buffer.
-    stbir__decode_scanline(stbir_info, n);
-
-    memset(stbir_info->horizontal_buffer, 0, stbir_info->output_w * stbir_info->channels * sizeof(float));
-
-    // Now resample it into the horizontal buffer.
-    if (stbir__use_width_upsampling(stbir_info))
-        stbir__resample_horizontal_upsample(stbir_info, n, stbir_info->horizontal_buffer);
-    else
-        stbir__resample_horizontal_downsample(stbir_info, n, stbir_info->horizontal_buffer);
-
-    // Now it's sitting in the horizontal buffer ready to be distributed into the ring buffers.
-}
-
-// Get the specified scan line from the ring buffer.
-static float* stbir__get_ring_buffer_scanline(int get_scanline, float* ring_buffer, int begin_index, int first_scanline, int ring_buffer_size, int ring_buffer_length)
-{
-    int ring_buffer_index = (begin_index + (get_scanline - first_scanline)) % ring_buffer_size;
-    return stbir__get_ring_buffer_entry(ring_buffer, ring_buffer_index, ring_buffer_length);
-}
-
-
-static void stbir__encode_scanline(stbir__info* stbir_info, int num_pixels, void *output_buffer, float *encode_buffer, int channels, int alpha_channel, int decode)
-{
-    int x;
-    int n;
-    int num_nonalpha;
-    stbir_uint16 nonalpha[STBIR_MAX_CHANNELS];
-
-    if (!(stbir_info->flags&STBIR_FLAG_ALPHA_PREMULTIPLIED))
-    {
-        for (x=0; x < num_pixels; ++x)
-        {
-            int pixel_index = x*channels;
-
-            float alpha = encode_buffer[pixel_index + alpha_channel];
-            float reciprocal_alpha = alpha ? 1.0f / alpha : 0;
-
-            // unrolling this produced a 1% slowdown upscaling a large RGBA linear-space image on my machine - stb
-            for (n = 0; n < channels; n++)
-                if (n != alpha_channel)
-                    encode_buffer[pixel_index + n] *= reciprocal_alpha;
-
-            // We added in a small epsilon to prevent the color channel from being deleted with zero alpha.
-            // Because we only add it for integer types, it will automatically be discarded on integer
-            // conversion, so we don't need to subtract it back out (which would be problematic for
-            // numeric precision reasons).
-        }
-    }
-
-    // build a table of all channels that need colorspace correction, so
-    // we don't perform colorspace correction on channels that don't need it.
-    for (x=0, num_nonalpha=0; x < channels; ++x)
-        if (x != alpha_channel || (stbir_info->flags & STBIR_FLAG_ALPHA_USES_COLORSPACE))
-            nonalpha[num_nonalpha++] = x;
-
-    #define STBIR__ROUND_INT(f)    ((int)          ((f)+0.5))
-    #define STBIR__ROUND_UINT(f)   ((stbir_uint32) ((f)+0.5))
-
-    #ifdef STBIR__SATURATE_INT
-    #define STBIR__ENCODE_LINEAR8(f)   stbir__saturate8 (STBIR__ROUND_INT((f) * 255  ))
-    #define STBIR__ENCODE_LINEAR16(f)  stbir__saturate16(STBIR__ROUND_INT((f) * 65535))
-    #else
-    #define STBIR__ENCODE_LINEAR8(f)   (unsigned char ) STBIR__ROUND_INT(stbir__saturate(f) * 255  )
-    #define STBIR__ENCODE_LINEAR16(f)  (unsigned short) STBIR__ROUND_INT(stbir__saturate(f) * 65535)
-    #endif
-
-    switch (decode)
-    {
-        case STBIR__DECODE(STBIR_TYPE_UINT8, STBIR_COLORSPACE_LINEAR):
-            for (x=0; x < num_pixels; ++x)
-            {
-                int pixel_index = x*channels;
-
-                for (n = 0; n < channels; n++)
-                {
-                    int index = pixel_index + n;
-                    ((unsigned char*)output_buffer)[index] = STBIR__ENCODE_LINEAR8(encode_buffer[index]);
-                }
-            }
-            break;
-
-        case STBIR__DECODE(STBIR_TYPE_UINT8, STBIR_COLORSPACE_SRGB):
-            for (x=0; x < num_pixels; ++x)
-            {
-                int pixel_index = x*channels;
-
-                for (n = 0; n < num_nonalpha; n++)
-                {
-                    int index = pixel_index + nonalpha[n];
-                    ((unsigned char*)output_buffer)[index] = stbir__linear_to_srgb_uchar(encode_buffer[index]);
-                }
-
-                if (!(stbir_info->flags & STBIR_FLAG_ALPHA_USES_COLORSPACE))
-                    ((unsigned char *)output_buffer)[pixel_index + alpha_channel] = STBIR__ENCODE_LINEAR8(encode_buffer[pixel_index+alpha_channel]);
-            }
-            break;
-
-        case STBIR__DECODE(STBIR_TYPE_UINT16, STBIR_COLORSPACE_LINEAR):
-            for (x=0; x < num_pixels; ++x)
-            {
-                int pixel_index = x*channels;
-
-                for (n = 0; n < channels; n++)
-                {
-                    int index = pixel_index + n;
-                    ((unsigned short*)output_buffer)[index] = STBIR__ENCODE_LINEAR16(encode_buffer[index]);
-                }
-            }
-            break;
-
-        case STBIR__DECODE(STBIR_TYPE_UINT16, STBIR_COLORSPACE_SRGB):
-            for (x=0; x < num_pixels; ++x)
-            {
-                int pixel_index = x*channels;
-
-                for (n = 0; n < num_nonalpha; n++)
-                {
-                    int index = pixel_index + nonalpha[n];
-                    ((unsigned short*)output_buffer)[index] = (unsigned short)STBIR__ROUND_INT(stbir__linear_to_srgb(stbir__saturate(encode_buffer[index])) * 65535);
-                }
-
-                if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE))
-                    ((unsigned short*)output_buffer)[pixel_index + alpha_channel] = STBIR__ENCODE_LINEAR16(encode_buffer[pixel_index + alpha_channel]);
-            }
-
-            break;
-
-        case STBIR__DECODE(STBIR_TYPE_UINT32, STBIR_COLORSPACE_LINEAR):
-            for (x=0; x < num_pixels; ++x)
-            {
-                int pixel_index = x*channels;
-
-                for (n = 0; n < channels; n++)
-                {
-                    int index = pixel_index + n;
-                    ((unsigned int*)output_buffer)[index] = (unsigned int)STBIR__ROUND_UINT(((double)stbir__saturate(encode_buffer[index])) * 4294967295);
-                }
-            }
-            break;
-
-        case STBIR__DECODE(STBIR_TYPE_UINT32, STBIR_COLORSPACE_SRGB):
-            for (x=0; x < num_pixels; ++x)
-            {
-                int pixel_index = x*channels;
-
-                for (n = 0; n < num_nonalpha; n++)
-                {
-                    int index = pixel_index + nonalpha[n];
-                    ((unsigned int*)output_buffer)[index] = (unsigned int)STBIR__ROUND_UINT(((double)stbir__linear_to_srgb(stbir__saturate(encode_buffer[index]))) * 4294967295);
-                }
-
-                if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE))
-                    ((unsigned int*)output_buffer)[pixel_index + alpha_channel] = (unsigned int)STBIR__ROUND_INT(((double)stbir__saturate(encode_buffer[pixel_index + alpha_channel])) * 4294967295);
-            }
-            break;
-
-        case STBIR__DECODE(STBIR_TYPE_FLOAT, STBIR_COLORSPACE_LINEAR):
-            for (x=0; x < num_pixels; ++x)
-            {
-                int pixel_index = x*channels;
-
-                for (n = 0; n < channels; n++)
-                {
-                    int index = pixel_index + n;
-                    ((float*)output_buffer)[index] = encode_buffer[index];
-                }
-            }
-            break;
-
-        case STBIR__DECODE(STBIR_TYPE_FLOAT, STBIR_COLORSPACE_SRGB):
-            for (x=0; x < num_pixels; ++x)
-            {
-                int pixel_index = x*channels;
-
-                for (n = 0; n < num_nonalpha; n++)
-                {
-                    int index = pixel_index + nonalpha[n];
-                    ((float*)output_buffer)[index] = stbir__linear_to_srgb(encode_buffer[index]);
-                }
-
-                if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE))
-                    ((float*)output_buffer)[pixel_index + alpha_channel] = encode_buffer[pixel_index + alpha_channel];
-            }
-            break;
-
-        default:
-            STBIR_ASSERT(!"Unknown type/colorspace/channels combination.");
-            break;
-    }
-}
-
-static void stbir__resample_vertical_upsample(stbir__info* stbir_info, int n, int in_first_scanline, int in_last_scanline, float in_center_of_out)
-{
-    int x, k;
-    int output_w = stbir_info->output_w;
-    stbir__contributors* vertical_contributors = stbir_info->vertical_contributors;
-    float* vertical_coefficients = stbir_info->vertical_coefficients;
-    int channels = stbir_info->channels;
-    int alpha_channel = stbir_info->alpha_channel;
-    int type = stbir_info->type;
-    int colorspace = stbir_info->colorspace;
-    int kernel_pixel_width = stbir_info->vertical_filter_pixel_width;
-    void* output_data = stbir_info->output_data;
-    float* encode_buffer = stbir_info->encode_buffer;
-    int decode = STBIR__DECODE(type, colorspace);
-    int coefficient_width = stbir_info->vertical_coefficient_width;
-    int coefficient_counter;
-    int contributor = n;
-
-    float* ring_buffer = stbir_info->ring_buffer;
-    int ring_buffer_begin_index = stbir_info->ring_buffer_begin_index;
-    int ring_buffer_first_scanline = stbir_info->ring_buffer_first_scanline;
-    int ring_buffer_last_scanline = stbir_info->ring_buffer_last_scanline;
-    int ring_buffer_length = stbir_info->ring_buffer_length_bytes/sizeof(float);
-
-    int n0,n1, output_row_start;
-    int coefficient_group = coefficient_width * contributor;
-
-    n0 = vertical_contributors[contributor].n0;
-    n1 = vertical_contributors[contributor].n1;
-
-    output_row_start = n * stbir_info->output_stride_bytes;
-
-    STBIR_ASSERT(stbir__use_height_upsampling(stbir_info));
-
-    memset(encode_buffer, 0, output_w * sizeof(float) * channels);
-
-    // I tried reblocking this for better cache usage of encode_buffer
-    // (using x_outer, k, x_inner), but it lost speed. -- stb
-
-    coefficient_counter = 0;
-    switch (channels) {
-        case 1:
-            for (k = n0; k <= n1; k++)
-            {
-                int coefficient_index = coefficient_counter++;
-                float* ring_buffer_entry = stbir__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, kernel_pixel_width, ring_buffer_length);
-                float coefficient = vertical_coefficients[coefficient_group + coefficient_index];
-                for (x = 0; x < output_w; ++x)
-                {
-                    int in_pixel_index = x * 1;
-                    encode_buffer[in_pixel_index + 0] += ring_buffer_entry[in_pixel_index + 0] * coefficient;
-                }
-            }
-            break;
-        case 2:
-            for (k = n0; k <= n1; k++)
-            {
-                int coefficient_index = coefficient_counter++;
-                float* ring_buffer_entry = stbir__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, kernel_pixel_width, ring_buffer_length);
-                float coefficient = vertical_coefficients[coefficient_group + coefficient_index];
-                for (x = 0; x < output_w; ++x)
-                {
-                    int in_pixel_index = x * 2;
-                    encode_buffer[in_pixel_index + 0] += ring_buffer_entry[in_pixel_index + 0] * coefficient;
-                    encode_buffer[in_pixel_index + 1] += ring_buffer_entry[in_pixel_index + 1] * coefficient;
-                }
-            }
-            break;
-        case 3:
-            for (k = n0; k <= n1; k++)
-            {
-                int coefficient_index = coefficient_counter++;
-                float* ring_buffer_entry = stbir__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, kernel_pixel_width, ring_buffer_length);
-                float coefficient = vertical_coefficients[coefficient_group + coefficient_index];
-                for (x = 0; x < output_w; ++x)
-                {
-                    int in_pixel_index = x * 3;
-                    encode_buffer[in_pixel_index + 0] += ring_buffer_entry[in_pixel_index + 0] * coefficient;
-                    encode_buffer[in_pixel_index + 1] += ring_buffer_entry[in_pixel_index + 1] * coefficient;
-                    encode_buffer[in_pixel_index + 2] += ring_buffer_entry[in_pixel_index + 2] * coefficient;
-                }
-            }
-            break;
-        case 4:
-            for (k = n0; k <= n1; k++)
-            {
-                int coefficient_index = coefficient_counter++;
-                float* ring_buffer_entry = stbir__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, kernel_pixel_width, ring_buffer_length);
-                float coefficient = vertical_coefficients[coefficient_group + coefficient_index];
-                for (x = 0; x < output_w; ++x)
-                {
-                    int in_pixel_index = x * 4;
-                    encode_buffer[in_pixel_index + 0] += ring_buffer_entry[in_pixel_index + 0] * coefficient;
-                    encode_buffer[in_pixel_index + 1] += ring_buffer_entry[in_pixel_index + 1] * coefficient;
-                    encode_buffer[in_pixel_index + 2] += ring_buffer_entry[in_pixel_index + 2] * coefficient;
-                    encode_buffer[in_pixel_index + 3] += ring_buffer_entry[in_pixel_index + 3] * coefficient;
-                }
-            }
-            break;
-        default:
-            for (k = n0; k <= n1; k++)
-            {
-                int coefficient_index = coefficient_counter++;
-                float* ring_buffer_entry = stbir__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, kernel_pixel_width, ring_buffer_length);
-                float coefficient = vertical_coefficients[coefficient_group + coefficient_index];
-                for (x = 0; x < output_w; ++x)
-                {
-                    int in_pixel_index = x * channels;
-                    int c;
-                    for (c = 0; c < channels; c++)
-                        encode_buffer[in_pixel_index + c] += ring_buffer_entry[in_pixel_index + c] * coefficient;
-                }
-            }
-            break;
-    }
-    stbir__encode_scanline(stbir_info, output_w, (char *) output_data + output_row_start, encode_buffer, channels, alpha_channel, decode);
-}
-
-static void stbir__resample_vertical_downsample(stbir__info* stbir_info, int n, int in_first_scanline, int in_last_scanline, float in_center_of_out)
-{
-    int x, k;
-    int output_w = stbir_info->output_w;
-    int output_h = stbir_info->output_h;
-    stbir__contributors* vertical_contributors = stbir_info->vertical_contributors;
-    float* vertical_coefficients = stbir_info->vertical_coefficients;
-    int channels = stbir_info->channels;
-    int kernel_pixel_width = stbir_info->vertical_filter_pixel_width;
-    void* output_data = stbir_info->output_data;
-    float* horizontal_buffer = stbir_info->horizontal_buffer;
-    int coefficient_width = stbir_info->vertical_coefficient_width;
-    int contributor = n + stbir_info->vertical_filter_pixel_margin;
-
-    float* ring_buffer = stbir_info->ring_buffer;
-    int ring_buffer_begin_index = stbir_info->ring_buffer_begin_index;
-    int ring_buffer_first_scanline = stbir_info->ring_buffer_first_scanline;
-    int ring_buffer_last_scanline = stbir_info->ring_buffer_last_scanline;
-    int ring_buffer_length = stbir_info->ring_buffer_length_bytes/sizeof(float);
-    int n0,n1;
-
-    n0 = vertical_contributors[contributor].n0;
-    n1 = vertical_contributors[contributor].n1;
-
-    STBIR_ASSERT(!stbir__use_height_upsampling(stbir_info));
-
-    for (k = n0; k <= n1; k++)
-    {
-        int coefficient_index = k - n0;
-        int coefficient_group = coefficient_width * contributor;
-        float coefficient = vertical_coefficients[coefficient_group + coefficient_index];
-
-        float* ring_buffer_entry = stbir__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, kernel_pixel_width, ring_buffer_length);
-
-        switch (channels) {
-            case 1:
-                for (x = 0; x < output_w; x++)
-                {
-                    int in_pixel_index = x * 1;
-                    ring_buffer_entry[in_pixel_index + 0] += horizontal_buffer[in_pixel_index + 0] * coefficient;
-                }
-                break;
-            case 2:
-                for (x = 0; x < output_w; x++)
-                {
-                    int in_pixel_index = x * 2;
-                    ring_buffer_entry[in_pixel_index + 0] += horizontal_buffer[in_pixel_index + 0] * coefficient;
-                    ring_buffer_entry[in_pixel_index + 1] += horizontal_buffer[in_pixel_index + 1] * coefficient;
-                }
-                break;
-            case 3:
-                for (x = 0; x < output_w; x++)
-                {
-                    int in_pixel_index = x * 3;
-                    ring_buffer_entry[in_pixel_index + 0] += horizontal_buffer[in_pixel_index + 0] * coefficient;
-                    ring_buffer_entry[in_pixel_index + 1] += horizontal_buffer[in_pixel_index + 1] * coefficient;
-                    ring_buffer_entry[in_pixel_index + 2] += horizontal_buffer[in_pixel_index + 2] * coefficient;
-                }
-                break;
-            case 4:
-                for (x = 0; x < output_w; x++)
-                {
-                    int in_pixel_index = x * 4;
-                    ring_buffer_entry[in_pixel_index + 0] += horizontal_buffer[in_pixel_index + 0] * coefficient;
-                    ring_buffer_entry[in_pixel_index + 1] += horizontal_buffer[in_pixel_index + 1] * coefficient;
-                    ring_buffer_entry[in_pixel_index + 2] += horizontal_buffer[in_pixel_index + 2] * coefficient;
-                    ring_buffer_entry[in_pixel_index + 3] += horizontal_buffer[in_pixel_index + 3] * coefficient;
-                }
-                break;
-            default:
-                for (x = 0; x < output_w; x++)
-                {
-                    int in_pixel_index = x * channels;
-
-                    int c;
-                    for (c = 0; c < channels; c++)
-                        ring_buffer_entry[in_pixel_index + c] += horizontal_buffer[in_pixel_index + c] * coefficient;
-                }
-                break;
-        }
-    }
-}
-
-static void stbir__buffer_loop_upsample(stbir__info* stbir_info)
-{
-    int y;
-    float scale_ratio = stbir_info->vertical_scale;
-    float out_scanlines_radius = stbir__filter_info_table[stbir_info->vertical_filter].support(1/scale_ratio) * scale_ratio;
-
-    STBIR_ASSERT(stbir__use_height_upsampling(stbir_info));
-
-    for (y = 0; y < stbir_info->output_h; y++)
-    {
-        float in_center_of_out = 0; // Center of the current out scanline in the in scanline space
-        int in_first_scanline = 0, in_last_scanline = 0;
-
-        stbir__calculate_sample_range_upsample(y, out_scanlines_radius, scale_ratio, stbir_info->vertical_shift, &in_first_scanline, &in_last_scanline, &in_center_of_out);
-
-        STBIR_ASSERT(in_last_scanline - in_first_scanline <= stbir_info->vertical_filter_pixel_width);
-
-        if (stbir_info->ring_buffer_begin_index >= 0)
-        {
-            // Get rid of whatever we don't need anymore.
-            while (in_first_scanline > stbir_info->ring_buffer_first_scanline)
-            {
-                if (stbir_info->ring_buffer_first_scanline == stbir_info->ring_buffer_last_scanline)
-                {
-                    // We just popped the last scanline off the ring buffer.
-                    // Reset it to the empty state.
-                    stbir_info->ring_buffer_begin_index = -1;
-                    stbir_info->ring_buffer_first_scanline = 0;
-                    stbir_info->ring_buffer_last_scanline = 0;
-                    break;
-                }
-                else
-                {
-                    stbir_info->ring_buffer_first_scanline++;
-                    stbir_info->ring_buffer_begin_index = (stbir_info->ring_buffer_begin_index + 1) % stbir_info->vertical_filter_pixel_width;
-                }
-            }
-        }
-
-        // Load in new ones.
-        if (stbir_info->ring_buffer_begin_index < 0)
-            stbir__decode_and_resample_upsample(stbir_info, in_first_scanline);
-
-        while (in_last_scanline > stbir_info->ring_buffer_last_scanline)
-            stbir__decode_and_resample_upsample(stbir_info, stbir_info->ring_buffer_last_scanline + 1);
-
-        // Now all buffers should be ready to write a row of vertical sampling.
-        stbir__resample_vertical_upsample(stbir_info, y, in_first_scanline, in_last_scanline, in_center_of_out);
-
-        STBIR_PROGRESS_REPORT((float)y / stbir_info->output_h);
-    }
-}
-
-static void stbir__empty_ring_buffer(stbir__info* stbir_info, int first_necessary_scanline)
-{
-    int output_stride_bytes = stbir_info->output_stride_bytes;
-    int channels = stbir_info->channels;
-    int alpha_channel = stbir_info->alpha_channel;
-    int type = stbir_info->type;
-    int colorspace = stbir_info->colorspace;
-    int output_w = stbir_info->output_w;
-    void* output_data = stbir_info->output_data;
-    int decode = STBIR__DECODE(type, colorspace);
-
-    float* ring_buffer = stbir_info->ring_buffer;
-    int ring_buffer_length = stbir_info->ring_buffer_length_bytes/sizeof(float);
-
-    if (stbir_info->ring_buffer_begin_index >= 0)
-    {
-        // Get rid of whatever we don't need anymore.
-        while (first_necessary_scanline > stbir_info->ring_buffer_first_scanline)
-        {
-            if (stbir_info->ring_buffer_first_scanline >= 0 && stbir_info->ring_buffer_first_scanline < stbir_info->output_h)
-            {
-                int output_row_start = stbir_info->ring_buffer_first_scanline * output_stride_bytes;
-                float* ring_buffer_entry = stbir__get_ring_buffer_entry(ring_buffer, stbir_info->ring_buffer_begin_index, ring_buffer_length);
-                stbir__encode_scanline(stbir_info, output_w, (char *) output_data + output_row_start, ring_buffer_entry, channels, alpha_channel, decode);
-                STBIR_PROGRESS_REPORT((float)stbir_info->ring_buffer_first_scanline / stbir_info->output_h);
-            }
-
-            if (stbir_info->ring_buffer_first_scanline == stbir_info->ring_buffer_last_scanline)
-            {
-                // We just popped the last scanline off the ring buffer.
-                // Reset it to the empty state.
-                stbir_info->ring_buffer_begin_index = -1;
-                stbir_info->ring_buffer_first_scanline = 0;
-                stbir_info->ring_buffer_last_scanline = 0;
-                break;
-            }
-            else
-            {
-                stbir_info->ring_buffer_first_scanline++;
-                stbir_info->ring_buffer_begin_index = (stbir_info->ring_buffer_begin_index + 1) % stbir_info->vertical_filter_pixel_width;
-            }
-        }
-    }
-}
-
-static void stbir__buffer_loop_downsample(stbir__info* stbir_info)
-{
-    int y;
-    float scale_ratio = stbir_info->vertical_scale;
-    int output_h = stbir_info->output_h;
-    float in_pixels_radius = stbir__filter_info_table[stbir_info->vertical_filter].support(scale_ratio) / scale_ratio;
-    int pixel_margin = stbir_info->vertical_filter_pixel_margin;
-    int max_y = stbir_info->input_h + pixel_margin;
-
-    STBIR_ASSERT(!stbir__use_height_upsampling(stbir_info));
-
-    for (y = -pixel_margin; y < max_y; y++)
-    {
-        float out_center_of_in; // Center of the current out scanline in the in scanline space
-        int out_first_scanline, out_last_scanline;
-
-        stbir__calculate_sample_range_downsample(y, in_pixels_radius, scale_ratio, stbir_info->vertical_shift, &out_first_scanline, &out_last_scanline, &out_center_of_in);
-
-        STBIR_ASSERT(out_last_scanline - out_first_scanline <= stbir_info->vertical_filter_pixel_width);
-
-        if (out_last_scanline < 0 || out_first_scanline >= output_h)
-            continue;
-
-        stbir__empty_ring_buffer(stbir_info, out_first_scanline);
-
-        stbir__decode_and_resample_downsample(stbir_info, y);
-
-        // Load in new ones.
-        if (stbir_info->ring_buffer_begin_index < 0)
-            stbir__add_empty_ring_buffer_entry(stbir_info, out_first_scanline);
-
-        while (out_last_scanline > stbir_info->ring_buffer_last_scanline)
-            stbir__add_empty_ring_buffer_entry(stbir_info, stbir_info->ring_buffer_last_scanline + 1);
-
-        // Now the horizontal buffer is ready to write to all ring buffer rows.
-        stbir__resample_vertical_downsample(stbir_info, y, out_first_scanline, out_last_scanline, out_center_of_in);
-    }
-
-    stbir__empty_ring_buffer(stbir_info, stbir_info->output_h);
-}
-
-static void stbir__setup(stbir__info *info, int input_w, int input_h, int output_w, int output_h, int channels)
-{
-    info->input_w = input_w;
-    info->input_h = input_h;
-    info->output_w = output_w;
-    info->output_h = output_h;
-    info->channels = channels;
-}
-
-static void stbir__calculate_transform(stbir__info *info, float s0, float t0, float s1, float t1, float *transform)
-{
-    info->s0 = s0;
-    info->t0 = t0;
-    info->s1 = s1;
-    info->t1 = t1;
-
-    if (transform)
-    {
-        info->horizontal_scale = transform[0];
-        info->vertical_scale   = transform[1];
-        info->horizontal_shift = transform[2];
-        info->vertical_shift   = transform[3];
-    }
-    else
-    {
-        info->horizontal_scale = ((float)info->output_w / info->input_w) / (s1 - s0);
-        info->vertical_scale = ((float)info->output_h / info->input_h) / (t1 - t0);
-
-        info->horizontal_shift = s0 * info->output_w / (s1 - s0);
-        info->vertical_shift = t0 * info->output_h / (t1 - t0);
-    }
-}
-
-static void stbir__choose_filter(stbir__info *info, stbir_filter h_filter, stbir_filter v_filter)
-{
-    if (h_filter == 0)
-        h_filter = stbir__use_upsampling(info->horizontal_scale) ? STBIR_DEFAULT_FILTER_UPSAMPLE : STBIR_DEFAULT_FILTER_DOWNSAMPLE;
-    if (v_filter == 0)
-        v_filter = stbir__use_upsampling(info->vertical_scale)   ? STBIR_DEFAULT_FILTER_UPSAMPLE : STBIR_DEFAULT_FILTER_DOWNSAMPLE;
-    info->horizontal_filter = h_filter;
-    info->vertical_filter = v_filter;
-}
-
-static stbir_uint32 stbir__calculate_memory(stbir__info *info)
-{
-    int pixel_margin = stbir__get_filter_pixel_margin(info->horizontal_filter, info->horizontal_scale);
-    int filter_height = stbir__get_filter_pixel_width(info->vertical_filter, info->vertical_scale);
-
-    info->horizontal_num_contributors = stbir__get_contributors(info->horizontal_scale, info->horizontal_filter, info->input_w, info->output_w);
-    info->vertical_num_contributors   = stbir__get_contributors(info->vertical_scale  , info->vertical_filter  , info->input_h, info->output_h);
-
-    info->horizontal_contributors_size = info->horizontal_num_contributors * sizeof(stbir__contributors);
-    info->horizontal_coefficients_size = stbir__get_total_horizontal_coefficients(info) * sizeof(float);
-    info->vertical_contributors_size = info->vertical_num_contributors * sizeof(stbir__contributors);
-    info->vertical_coefficients_size = stbir__get_total_vertical_coefficients(info) * sizeof(float);
-    info->decode_buffer_size = (info->input_w + pixel_margin * 2) * info->channels * sizeof(float);
-    info->horizontal_buffer_size = info->output_w * info->channels * sizeof(float);
-    info->ring_buffer_size = info->output_w * info->channels * filter_height * sizeof(float);
-    info->encode_buffer_size = info->output_w * info->channels * sizeof(float);
-
-    STBIR_ASSERT(info->horizontal_filter != 0);
-    STBIR_ASSERT(info->horizontal_filter < STBIR__ARRAY_SIZE(stbir__filter_info_table)); // this now happens too late
-    STBIR_ASSERT(info->vertical_filter != 0);
-    STBIR_ASSERT(info->vertical_filter < STBIR__ARRAY_SIZE(stbir__filter_info_table)); // this now happens too late
-
-    if (stbir__use_height_upsampling(info))
-        // The horizontal buffer is for when we're downsampling the height and we
-        // can't output the result of sampling the decode buffer directly into the
-        // ring buffers.
-        info->horizontal_buffer_size = 0;
-    else
-        // The encode buffer is to retain precision in the height upsampling method
-        // and isn't used when height downsampling.
-        info->encode_buffer_size = 0;
-
-    return info->horizontal_contributors_size + info->horizontal_coefficients_size
-        + info->vertical_contributors_size + info->vertical_coefficients_size
-        + info->decode_buffer_size + info->horizontal_buffer_size
-        + info->ring_buffer_size + info->encode_buffer_size;
-}
-
-static int stbir__resize_allocated(stbir__info *info,
-    const void* input_data, int input_stride_in_bytes,
-    void* output_data, int output_stride_in_bytes,
-    int alpha_channel, stbir_uint32 flags, stbir_datatype type,
-    stbir_edge edge_horizontal, stbir_edge edge_vertical, stbir_colorspace colorspace,
-    void* tempmem, size_t tempmem_size_in_bytes)
-{
-    size_t memory_required = stbir__calculate_memory(info);
-
-    int width_stride_input = input_stride_in_bytes ? input_stride_in_bytes : info->channels * info->input_w * stbir__type_size[type];
-    int width_stride_output = output_stride_in_bytes ? output_stride_in_bytes : info->channels * info->output_w * stbir__type_size[type];
-
-#ifdef STBIR_DEBUG_OVERWRITE_TEST
-#define OVERWRITE_ARRAY_SIZE 8
-    unsigned char overwrite_output_before_pre[OVERWRITE_ARRAY_SIZE];
-    unsigned char overwrite_tempmem_before_pre[OVERWRITE_ARRAY_SIZE];
-    unsigned char overwrite_output_after_pre[OVERWRITE_ARRAY_SIZE];
-    unsigned char overwrite_tempmem_after_pre[OVERWRITE_ARRAY_SIZE];
-
-    size_t begin_forbidden = width_stride_output * (info->output_h - 1) + info->output_w * info->channels * stbir__type_size[type];
-    memcpy(overwrite_output_before_pre, &((unsigned char*)output_data)[-OVERWRITE_ARRAY_SIZE], OVERWRITE_ARRAY_SIZE);
-    memcpy(overwrite_output_after_pre, &((unsigned char*)output_data)[begin_forbidden], OVERWRITE_ARRAY_SIZE);
-    memcpy(overwrite_tempmem_before_pre, &((unsigned char*)tempmem)[-OVERWRITE_ARRAY_SIZE], OVERWRITE_ARRAY_SIZE);
-    memcpy(overwrite_tempmem_after_pre, &((unsigned char*)tempmem)[tempmem_size_in_bytes], OVERWRITE_ARRAY_SIZE);
-#endif
-
-    STBIR_ASSERT(info->channels >= 0);
-    STBIR_ASSERT(info->channels <= STBIR_MAX_CHANNELS);
-
-    if (info->channels < 0 || info->channels > STBIR_MAX_CHANNELS)
-        return 0;
-
-    STBIR_ASSERT(info->horizontal_filter < STBIR__ARRAY_SIZE(stbir__filter_info_table));
-    STBIR_ASSERT(info->vertical_filter < STBIR__ARRAY_SIZE(stbir__filter_info_table));
-
-    if (info->horizontal_filter >= STBIR__ARRAY_SIZE(stbir__filter_info_table))
-        return 0;
-    if (info->vertical_filter >= STBIR__ARRAY_SIZE(stbir__filter_info_table))
-        return 0;
-
-    if (alpha_channel < 0)
-        flags |= STBIR_FLAG_ALPHA_USES_COLORSPACE | STBIR_FLAG_ALPHA_PREMULTIPLIED;
-
-    if (!(flags&STBIR_FLAG_ALPHA_USES_COLORSPACE) || !(flags&STBIR_FLAG_ALPHA_PREMULTIPLIED))
-        STBIR_ASSERT(alpha_channel >= 0 && alpha_channel < info->channels);
-
-    if (alpha_channel >= info->channels)
-        return 0;
-
-    STBIR_ASSERT(tempmem);
-
-    if (!tempmem)
-        return 0;
-
-    STBIR_ASSERT(tempmem_size_in_bytes >= memory_required);
-
-    if (tempmem_size_in_bytes < memory_required)
-        return 0;
-
-    memset(tempmem, 0, tempmem_size_in_bytes);
-
-    info->input_data = input_data;
-    info->input_stride_bytes = width_stride_input;
-
-    info->output_data = output_data;
-    info->output_stride_bytes = width_stride_output;
-
-    info->alpha_channel = alpha_channel;
-    info->flags = flags;
-    info->type = type;
-    info->edge_horizontal = edge_horizontal;
-    info->edge_vertical = edge_vertical;
-    info->colorspace = colorspace;
-
-    info->horizontal_coefficient_width   = stbir__get_coefficient_width  (info->horizontal_filter, info->horizontal_scale);
-    info->vertical_coefficient_width     = stbir__get_coefficient_width  (info->vertical_filter  , info->vertical_scale  );
-    info->horizontal_filter_pixel_width  = stbir__get_filter_pixel_width (info->horizontal_filter, info->horizontal_scale);
-    info->vertical_filter_pixel_width    = stbir__get_filter_pixel_width (info->vertical_filter  , info->vertical_scale  );
-    info->horizontal_filter_pixel_margin = stbir__get_filter_pixel_margin(info->horizontal_filter, info->horizontal_scale);
-    info->vertical_filter_pixel_margin   = stbir__get_filter_pixel_margin(info->vertical_filter  , info->vertical_scale  );
-
-    info->ring_buffer_length_bytes = info->output_w * info->channels * sizeof(float);
-    info->decode_buffer_pixels = info->input_w + info->horizontal_filter_pixel_margin * 2;
-
-#define STBIR__NEXT_MEMPTR(current, newtype) (newtype*)(((unsigned char*)current) + current##_size)
-
-    info->horizontal_contributors = (stbir__contributors *) tempmem;
-    info->horizontal_coefficients = STBIR__NEXT_MEMPTR(info->horizontal_contributors, float);
-    info->vertical_contributors = STBIR__NEXT_MEMPTR(info->horizontal_coefficients, stbir__contributors);
-    info->vertical_coefficients = STBIR__NEXT_MEMPTR(info->vertical_contributors, float);
-    info->decode_buffer = STBIR__NEXT_MEMPTR(info->vertical_coefficients, float);
-
-    if (stbir__use_height_upsampling(info))
-    {
-        info->horizontal_buffer = NULL;
-        info->ring_buffer = STBIR__NEXT_MEMPTR(info->decode_buffer, float);
-        info->encode_buffer = STBIR__NEXT_MEMPTR(info->ring_buffer, float);
-
-        STBIR_ASSERT((size_t)STBIR__NEXT_MEMPTR(info->encode_buffer, unsigned char) == (size_t)tempmem + tempmem_size_in_bytes);
-    }
-    else
-    {
-        info->horizontal_buffer = STBIR__NEXT_MEMPTR(info->decode_buffer, float);
-        info->ring_buffer = STBIR__NEXT_MEMPTR(info->horizontal_buffer, float);
-        info->encode_buffer = NULL;
-
-        STBIR_ASSERT((size_t)STBIR__NEXT_MEMPTR(info->ring_buffer, unsigned char) == (size_t)tempmem + tempmem_size_in_bytes);
-    }
-
-#undef STBIR__NEXT_MEMPTR
-
-    // This signals that the ring buffer is empty
-    info->ring_buffer_begin_index = -1;
-
-    stbir__calculate_filters(info, info->horizontal_contributors, info->horizontal_coefficients, info->horizontal_filter, info->horizontal_scale, info->horizontal_shift, info->input_w, info->output_w);
-    stbir__calculate_filters(info, info->vertical_contributors, info->vertical_coefficients, info->vertical_filter, info->vertical_scale, info->vertical_shift, info->input_h, info->output_h);
-
-    STBIR_PROGRESS_REPORT(0);
-
-    if (stbir__use_height_upsampling(info))
-        stbir__buffer_loop_upsample(info);
-    else
-        stbir__buffer_loop_downsample(info);
-
-    STBIR_PROGRESS_REPORT(1);
-
-#ifdef STBIR_DEBUG_OVERWRITE_TEST
-    STBIR_ASSERT(memcmp(overwrite_output_before_pre, &((unsigned char*)output_data)[-OVERWRITE_ARRAY_SIZE], OVERWRITE_ARRAY_SIZE) == 0);
-    STBIR_ASSERT(memcmp(overwrite_output_after_pre, &((unsigned char*)output_data)[begin_forbidden], OVERWRITE_ARRAY_SIZE) == 0);
-    STBIR_ASSERT(memcmp(overwrite_tempmem_before_pre, &((unsigned char*)tempmem)[-OVERWRITE_ARRAY_SIZE], OVERWRITE_ARRAY_SIZE) == 0);
-    STBIR_ASSERT(memcmp(overwrite_tempmem_after_pre, &((unsigned char*)tempmem)[tempmem_size_in_bytes], OVERWRITE_ARRAY_SIZE) == 0);
-#endif
-
-    return 1;
-}
-
-
-static int stbir__resize_arbitrary(
-    void *alloc_context,
-    const void* input_data, int input_w, int input_h, int input_stride_in_bytes,
-    void* output_data, int output_w, int output_h, int output_stride_in_bytes,
-    float s0, float t0, float s1, float t1, float *transform,
-    int channels, int alpha_channel, stbir_uint32 flags, stbir_datatype type,
-    stbir_filter h_filter, stbir_filter v_filter,
-    stbir_edge edge_horizontal, stbir_edge edge_vertical, stbir_colorspace colorspace)
-{
-    stbir__info info;
-    int result;
-    size_t memory_required;
-    void* extra_memory;
-
-    stbir__setup(&info, input_w, input_h, output_w, output_h, channels);
-    stbir__calculate_transform(&info, s0,t0,s1,t1,transform);
-    stbir__choose_filter(&info, h_filter, v_filter);
-    memory_required = stbir__calculate_memory(&info);
-    extra_memory = STBIR_MALLOC(memory_required, alloc_context);
-
-    if (!extra_memory)
-        return 0;
-
-    result = stbir__resize_allocated(&info, input_data, input_stride_in_bytes,
-                                            output_data, output_stride_in_bytes, 
-                                            alpha_channel, flags, type,
-                                            edge_horizontal, edge_vertical,
-                                            colorspace, extra_memory, memory_required);
-
-    STBIR_FREE(extra_memory, alloc_context);
-
-    return result;
-}
-
-STBIRDEF int stbir_resize_uint8(     const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
-                                           unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
-                                     int num_channels)
-{
-    return stbir__resize_arbitrary(NULL, input_pixels, input_w, input_h, input_stride_in_bytes,
-        output_pixels, output_w, output_h, output_stride_in_bytes,
-        0,0,1,1,NULL,num_channels,-1,0, STBIR_TYPE_UINT8, STBIR_FILTER_DEFAULT, STBIR_FILTER_DEFAULT,
-        STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP, STBIR_COLORSPACE_LINEAR);
-}
-
-STBIRDEF int stbir_resize_float(     const float *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
-                                           float *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
-                                     int num_channels)
-{
-    return stbir__resize_arbitrary(NULL, input_pixels, input_w, input_h, input_stride_in_bytes,
-        output_pixels, output_w, output_h, output_stride_in_bytes,
-        0,0,1,1,NULL,num_channels,-1,0, STBIR_TYPE_FLOAT, STBIR_FILTER_DEFAULT, STBIR_FILTER_DEFAULT,
-        STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP, STBIR_COLORSPACE_LINEAR);
-}
-
-STBIRDEF int stbir_resize_uint8_srgb(const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
-                                           unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
-                                     int num_channels, int alpha_channel, int flags)
-{
-    return stbir__resize_arbitrary(NULL, input_pixels, input_w, input_h, input_stride_in_bytes,
-        output_pixels, output_w, output_h, output_stride_in_bytes,
-        0,0,1,1,NULL,num_channels,alpha_channel,flags, STBIR_TYPE_UINT8, STBIR_FILTER_DEFAULT, STBIR_FILTER_DEFAULT,
-        STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP, STBIR_COLORSPACE_SRGB);
-}
-
-STBIRDEF int stbir_resize_uint8_srgb_edgemode(const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
-                                                    unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
-                                              int num_channels, int alpha_channel, int flags,
-                                              stbir_edge edge_wrap_mode)
-{
-    return stbir__resize_arbitrary(NULL, input_pixels, input_w, input_h, input_stride_in_bytes,
-        output_pixels, output_w, output_h, output_stride_in_bytes,
-        0,0,1,1,NULL,num_channels,alpha_channel,flags, STBIR_TYPE_UINT8, STBIR_FILTER_DEFAULT, STBIR_FILTER_DEFAULT,
-        edge_wrap_mode, edge_wrap_mode, STBIR_COLORSPACE_SRGB);
-}
-
-STBIRDEF int stbir_resize_uint8_generic( const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
-                                               unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
-                                         int num_channels, int alpha_channel, int flags,
-                                         stbir_edge edge_wrap_mode, stbir_filter filter, stbir_colorspace space, 
-                                         void *alloc_context)
-{
-    return stbir__resize_arbitrary(alloc_context, input_pixels, input_w, input_h, input_stride_in_bytes,
-        output_pixels, output_w, output_h, output_stride_in_bytes,
-        0,0,1,1,NULL,num_channels,alpha_channel,flags, STBIR_TYPE_UINT8, filter, filter,
-        edge_wrap_mode, edge_wrap_mode, space);
-}
-
-STBIRDEF int stbir_resize_uint16_generic(const stbir_uint16 *input_pixels  , int input_w , int input_h , int input_stride_in_bytes,
-                                               stbir_uint16 *output_pixels , int output_w, int output_h, int output_stride_in_bytes,
-                                         int num_channels, int alpha_channel, int flags,
-                                         stbir_edge edge_wrap_mode, stbir_filter filter, stbir_colorspace space, 
-                                         void *alloc_context)
-{
-    return stbir__resize_arbitrary(alloc_context, input_pixels, input_w, input_h, input_stride_in_bytes,
-        output_pixels, output_w, output_h, output_stride_in_bytes,
-        0,0,1,1,NULL,num_channels,alpha_channel,flags, STBIR_TYPE_UINT16, filter, filter,
-        edge_wrap_mode, edge_wrap_mode, space);
-}
-
-
-STBIRDEF int stbir_resize_float_generic( const float *input_pixels         , int input_w , int input_h , int input_stride_in_bytes,
-                                               float *output_pixels        , int output_w, int output_h, int output_stride_in_bytes,
-                                         int num_channels, int alpha_channel, int flags,
-                                         stbir_edge edge_wrap_mode, stbir_filter filter, stbir_colorspace space, 
-                                         void *alloc_context)
-{
-    return stbir__resize_arbitrary(alloc_context, input_pixels, input_w, input_h, input_stride_in_bytes,
-        output_pixels, output_w, output_h, output_stride_in_bytes,
-        0,0,1,1,NULL,num_channels,alpha_channel,flags, STBIR_TYPE_FLOAT, filter, filter,
-        edge_wrap_mode, edge_wrap_mode, space);
-}
-
-
-STBIRDEF int stbir_resize(         const void *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
-                                         void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
-                                   stbir_datatype datatype,
-                                   int num_channels, int alpha_channel, int flags,
-                                   stbir_edge edge_mode_horizontal, stbir_edge edge_mode_vertical, 
-                                   stbir_filter filter_horizontal,  stbir_filter filter_vertical,
-                                   stbir_colorspace space, void *alloc_context)
-{
-    return stbir__resize_arbitrary(alloc_context, input_pixels, input_w, input_h, input_stride_in_bytes,
-        output_pixels, output_w, output_h, output_stride_in_bytes,
-        0,0,1,1,NULL,num_channels,alpha_channel,flags, datatype, filter_horizontal, filter_vertical,
-        edge_mode_horizontal, edge_mode_vertical, space);
-}
-
-
-STBIRDEF int stbir_resize_subpixel(const void *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
-                                         void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
-                                   stbir_datatype datatype,
-                                   int num_channels, int alpha_channel, int flags,
-                                   stbir_edge edge_mode_horizontal, stbir_edge edge_mode_vertical, 
-                                   stbir_filter filter_horizontal,  stbir_filter filter_vertical,
-                                   stbir_colorspace space, void *alloc_context,
-                                   float x_scale, float y_scale,
-                                   float x_offset, float y_offset)
-{
-    float transform[4];
-    transform[0] = x_scale;
-    transform[1] = y_scale;
-    transform[2] = x_offset;
-    transform[3] = y_offset;
-    return stbir__resize_arbitrary(alloc_context, input_pixels, input_w, input_h, input_stride_in_bytes,
-        output_pixels, output_w, output_h, output_stride_in_bytes,
-        0,0,1,1,transform,num_channels,alpha_channel,flags, datatype, filter_horizontal, filter_vertical,
-        edge_mode_horizontal, edge_mode_vertical, space);
-}
-
-STBIRDEF int stbir_resize_region(  const void *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
-                                         void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
-                                   stbir_datatype datatype,
-                                   int num_channels, int alpha_channel, int flags,
-                                   stbir_edge edge_mode_horizontal, stbir_edge edge_mode_vertical, 
-                                   stbir_filter filter_horizontal,  stbir_filter filter_vertical,
-                                   stbir_colorspace space, void *alloc_context,
-                                   float s0, float t0, float s1, float t1)
-{
-    return stbir__resize_arbitrary(alloc_context, input_pixels, input_w, input_h, input_stride_in_bytes,
-        output_pixels, output_w, output_h, output_stride_in_bytes,
-        s0,t0,s1,t1,NULL,num_channels,alpha_channel,flags, datatype, filter_horizontal, filter_vertical,
-        edge_mode_horizontal, edge_mode_vertical, space);
-}
-
-#endif // STB_IMAGE_RESIZE_IMPLEMENTATION
diff --git a/renderdoc/3rdparty/stb/stb_image_resize2.h b/renderdoc/3rdparty/stb/stb_image_resize2.h
new file mode 100644
index 0000000000..e0c4282463
--- /dev/null
+++ b/renderdoc/3rdparty/stb/stb_image_resize2.h
@@ -0,0 +1,10303 @@
+/* stb_image_resize2 - v2.01 - public domain image resizing
+   
+   by Jeff Roberts (v2) and Jorge L Rodriguez 
+   http://github.com/nothings/stb
+
+   Can be threaded with the extended API. SSE2, AVX, Neon and WASM SIMD support. Only 
+   scaling and translation is supported, no rotations or shears.
+
+   COMPILING & LINKING
+      In one C/C++ file that #includes this file, do this:
+         #define STB_IMAGE_RESIZE_IMPLEMENTATION
+      before the #include. That will create the implementation in that file.
+
+   PORTING FROM VERSION 1
+
+      The API has changed. You can continue to use the old version of stb_image_resize.h,
+      which is available in the "deprecated/" directory.
+
+      If you're using the old simple-to-use API, porting is straightforward.
+      (For more advanced APIs, read the documentation.)
+
+        stbir_resize_uint8():
+          - call `stbir_resize_uint8_linear`, cast channel count to `stbir_pixel_layout`
+
+        stbir_resize_float():
+          - call `stbir_resize_float_linear`, cast channel count to `stbir_pixel_layout`
+
+        stbir_resize_uint8_srgb():
+          - function name is unchanged
+          - cast channel count to `stbir_pixel_layout`
+          - above is sufficient unless your image has alpha and it's not RGBA/BGRA
+            - in that case, follow the below instructions for stbir_resize_uint8_srgb_edgemode
+
+        stbir_resize_uint8_srgb_edgemode()
+          - switch to the "medium complexity" API
+          - stbir_resize(), very similar API but a few more parameters:
+            - pixel_layout: cast channel count to `stbir_pixel_layout`
+            - data_type:    STBIR_TYPE_UINT8_SRGB
+            - edge:         unchanged (STBIR_EDGE_WRAP, etc.)
+            - filter:       STBIR_FILTER_DEFAULT
+          - which channel is alpha is specified in stbir_pixel_layout, see enum for details
+
+   EASY API CALLS:
+     Easy API downsamples w/Mitchell filter, upsamples w/cubic interpolation, clamps to edge.
+
+     stbir_resize_uint8_srgb( input_pixels,  input_w,  input_h,  input_stride_in_bytes,
+                              output_pixels, output_w, output_h, output_stride_in_bytes,
+                              pixel_layout_enum )
+
+     stbir_resize_uint8_linear( input_pixels,  input_w,  input_h,  input_stride_in_bytes,
+                                output_pixels, output_w, output_h, output_stride_in_bytes,
+                                pixel_layout_enum )
+
+     stbir_resize_float_linear( input_pixels,  input_w,  input_h,  input_stride_in_bytes,
+                                output_pixels, output_w, output_h, output_stride_in_bytes,
+                                pixel_layout_enum )
+
+     If you pass NULL or zero for the output_pixels, we will allocate the output buffer
+     for you and return it from the function (free with free() or STBIR_FREE).
+     As a special case, XX_stride_in_bytes of 0 means packed continuously in memory.
+
+   API LEVELS
+      There are three levels of API - easy-to-use, medium-complexity and extended-complexity.
+
+      See the "header file" section of the source for API documentation.
+
+   ADDITIONAL DOCUMENTATION
+
+      MEMORY ALLOCATION
+         By default, we use malloc and free for memory allocation.  To override the 
+         memory allocation, before the implementation #include, add a:
+
+            #define STBIR_MALLOC(size,user_data) ...
+            #define STBIR_FREE(ptr,user_data)   ...
+
+         Each resize makes exactly one call to malloc/free (unless you use the 
+         extended API where you can do one allocation for many resizes). Under
+         address sanitizer, we do separate allocations to find overread/writes.
+
+      PERFORMANCE
+         This library was written with an emphasis on performance. When testing
+         stb_image_resize with RGBA, the fastest mode is STBIR_4CHANNEL with 
+         STBIR_TYPE_UINT8 pixels and CLAMPed edges (which is what many other resize
+         libs do by default). Also, make sure SIMD is turned on of course (default 
+         for 64-bit targets). Avoid WRAP edge mode if you want the fastest speed.
+
+         This library also comes with profiling built-in. If you define STBIR_PROFILE,
+         you can use the advanced API and get low-level profiling information by 
+         calling stbir_resize_extended_profile_info() or stbir_resize_split_profile_info()
+         after a resize.
+
+      SIMD
+         Most of the routines have optimized SSE2, AVX, NEON and WASM versions. 
+
+         On Microsoft compilers, we automatically turn on SIMD for 64-bit x64 and 
+         ARM; for 32-bit x86 and ARM, you select SIMD mode by defining STBIR_SSE2 or 
+         STBIR_NEON. For AVX and AVX2, we auto-select it by detecting the /arch:AVX
+         or /arch:AVX2 switches. You can also always manually turn SSE2, AVX or AVX2 
+         support on by defining STBIR_SSE2, STBIR_AVX or STBIR_AVX2.
+
+         On Linux, SSE2 and Neon is on by default for 64-bit x64 or ARM64. For 32-bit,
+         we select x86 SIMD mode by whether you have -msse2, -mavx or -mavx2 enabled
+         on the command line. For 32-bit ARM, you must pass -mfpu=neon-vfpv4 for both
+         clang and GCC, but GCC also requires an additional -mfp16-format=ieee to 
+         automatically enable NEON.
+
+         On x86 platforms, you can also define STBIR_FP16C to turn on FP16C instructions
+         for converting back and forth to half-floats. This is autoselected when we
+         are using AVX2. Clang and GCC also require the -mf16c switch. ARM always uses 
+         the built-in half float hardware NEON instructions. 
+
+         You can also tell us to use multiply-add instructions with STBIR_USE_FMA. 
+         Because x86 doesn't always have fma, we turn it off by default to maintain
+         determinism across all platforms. If you don't care about non-FMA determinism
+         and are willing to restrict yourself to more recent x86 CPUs (around the AVX 
+         timeframe), then fma will give you around a 15% speedup.
+
+         You can force off SIMD in all cases by defining STBIR_NO_SIMD. You can turn
+         off AVX or AVX2 specifically with STBIR_NO_AVX or STBIR_NO_AVX2. AVX is 10%
+         to 40% faster, and AVX2 is generally another 12%.
+        
+      ALPHA CHANNEL
+         Most of the resizing functions provide the ability to control how the alpha 
+         channel of an image is processed.
+
+         When alpha represents transparency, it is important that when combining
+         colors with filtering, the pixels should not be treated equally; they
+         should use a weighted average based on their alpha values. For example,
+         if a pixel is 1% opaque bright green and another pixel is 99% opaque
+         black and you average them, the average will be 50% opaque, but the
+         unweighted average and will be a middling green color, while the weighted
+         average will be nearly black. This means the unweighted version introduced
+         green energy that didn't exist in the source image.
+
+         (If you want to know why this makes sense, you can work out the math for
+         the following: consider what happens if you alpha composite a source image
+         over a fixed color and then average the output, vs. if you average the
+         source image pixels and then composite that over the same fixed color.
+         Only the weighted average produces the same result as the ground truth
+         composite-then-average result.)
+
+         Therefore, it is in general best to "alpha weight" the pixels when applying
+         filters to them. This essentially means multiplying the colors by the alpha
+         values before combining them, and then dividing by the alpha value at the
+         end.
+
+         The computer graphics industry introduced a technique called "premultiplied
+         alpha" or "associated alpha" in which image colors are stored in image files
+         already multiplied by their alpha. This saves some math when compositing,
+         and also avoids the need to divide by the alpha at the end (which is quite
+         inefficient). However, while premultiplied alpha is common in the movie CGI
+         industry, it is not commonplace in other industries like videogames, and most
+         consumer file formats are generally expected to contain not-premultiplied
+         colors. For example, Photoshop saves PNG files "unpremultiplied", and web
+         browsers like Chrome and Firefox expect PNG images to be unpremultiplied.
+
+         Note that there are three possibilities that might describe your image
+         and resize expectation:
+
+             1. images are not premultiplied, alpha weighting is desired
+             2. images are not premultiplied, alpha weighting is not desired
+             3. images are premultiplied
+
+         Both case #2 and case #3 require the exact same math: no alpha weighting
+         should be applied or removed. Only case 1 requires extra math operations;
+         the other two cases can be handled identically.
+
+         stb_image_resize expects case #1 by default, applying alpha weighting to
+         images, expecting the input images to be unpremultiplied. This is what the
+         COLOR+ALPHA buffer types tell the resizer to do. 
+
+         When you use the pixel layouts STBIR_RGBA, STBIR_BGRA, STBIR_ARGB, 
+         STBIR_ABGR, STBIR_RX, or STBIR_XR you are telling us that the pixels are 
+         non-premultiplied. In these cases, the resizer will alpha weight the colors 
+         (effectively creating the premultiplied image), do the filtering, and then 
+         convert back to non-premult on exit.
+
+         When you use the pixel layouts STBIR_RGBA_PM, STBIR_RGBA_PM, STBIR_RGBA_PM,
+         STBIR_RGBA_PM, STBIR_RX_PM or STBIR_XR_PM, you are telling that the pixels 
+         ARE premultiplied. In this case, the resizer doesn't have to do the 
+         premultipling - it can filter directly on the input. This about twice as 
+         fast as the non-premultiplied case, so it's the right option if your data is 
+         already setup correctly.
+
+         When you use the pixel layout STBIR_4CHANNEL or STBIR_2CHANNEL, you are 
+         telling us that there is no channel that represents transparency; it may be 
+         RGB and some unrelated fourth channel that has been stored in the alpha 
+         channel, but it is actually not alpha. No special processing will be 
+         performed. 
+
+         The difference between the generic 4 or 2 channel layouts, and the 
+         specialized _PM versions is with the _PM versions you are telling us that
+         the data *is* alpha, just don't premultiply it. That's important when
+         using SRGB pixel formats, we need to know where the alpha is, because
+         it is converted linearly (rather than with the SRGB converters).
+   
+         Because alpha weighting produces the same effect as premultiplying, you
+         even have the option with non-premultiplied inputs to let the resizer
+         produce a premultiplied output. Because the intially computed alpha-weighted
+         output image is effectively premultiplied, this is actually more performant
+         than the normal path which un-premultiplies the output image as a final step.
+
+         Finally, when converting both in and out of non-premulitplied space (for
+         example, when using STBIR_RGBA), we go to somewhat heroic measures to 
+         ensure that areas with zero alpha value pixels get something reasonable 
+         in the RGB values. If you don't care about the RGB values of zero alpha 
+         pixels, you can call the stbir_set_non_pm_alpha_speed_over_quality() 
+         function - this runs a premultiplied resize about 25% faster. That said,
+         when you really care about speed, using premultiplied pixels for both in
+         and out (STBIR_RGBA_PM, etc) much faster than both of these premultiplied
+         options.
+
+      PIXEL LAYOUT CONVERSION
+         The resizer can convert from some pixel layouts to others. When using the
+         stbir_set_pixel_layouts(), you can, for example, specify STBIR_RGBA
+         on input, and STBIR_ARGB on output, and it will re-organize the channels
+         during the resize. Currently, you can only convert between two pixel
+         layouts with the same number of channels.
+
+      DETERMINISM
+         We commit to being deterministic (from x64 to ARM to scalar to SIMD, etc). 
+         This requires compiling with fast-math off (using at least /fp:precise). 
+         Also, you must turn off fp-contracting (which turns mult+adds into fmas)!
+         We attempt to do this with pragmas, but with Clang, you usually want to add 
+         -ffp-contract=off to the command line as well.
+
+         For 32-bit x86, you must use SSE and SSE2 codegen for determinism. That is, 
+         if the scalar x87 unit gets used at all, we immediately lose determinism. 
+         On Microsoft Visual Studio 2008 and earlier, from what we can tell there is
+         no way to be deterministic in 32-bit x86 (some x87 always leaks in, even 
+         with fp:strict). On 32-bit x86 GCC, determinism requires both -msse2 and 
+         -fpmath=sse.
+
+         Note that we will not be deterministic with float data containing NaNs -
+         the NaNs will propagate differently on different SIMD and platforms. 
+
+         If you turn on STBIR_USE_FMA, then we will be deterministic with other 
+         fma targets, but we will differ from non-fma targets (this is unavoidable, 
+         because a fma isn't simply an add with a mult - it also introduces a 
+         rounding difference compared to non-fma instruction sequences. 
+
+      FLOAT PIXEL FORMAT RANGE
+         Any range of values can be used for the non-alpha float data that you pass 
+         in (0 to 1, -1 to 1, whatever). However, if you are inputting float values 
+         but *outputting* bytes or shorts, you must use a range of 0 to 1 so that we 
+         scale back properly. The alpha channel must also be 0 to 1 for any format 
+         that does premultiplication prior to resizing. 
+
+         Note also that with float output, using filters with negative lobes, the 
+         output filtered values might go slightly out of range. You can define 
+         STBIR_FLOAT_LOW_CLAMP and/or STBIR_FLOAT_HIGH_CLAMP to specify the range 
+         to clamp to on output, if that's important. 
+
+      MAX/MIN SCALE FACTORS
+         The input pixel resolutions are in integers, and we do the internal pointer
+         resolution in size_t sized integers. However, the scale ratio from input
+         resolution to output resolution is calculated in float form. This means
+         the effective possible scale ratio is limited to 24 bits (or 16 million
+         to 1). As you get close to the size of the float resolution (again, 16
+         million pixels wide or high), you might start seeing float inaccuracy
+         issues in general in the pipeline. If you have to do extreme resizes,
+         you can usually do this is multiple stages (using float intermediate
+         buffers).
+
+      FLIPPED IMAGES
+         Stride is just the delta from one scanline to the next. This means you can 
+         use a negative stride to handle inverted images (point to the final 
+         scanline and use a negative stride). You can invert the input or output,
+         using negative strides.
+
+      DEFAULT FILTERS
+         For functions which don't provide explicit control over what filters to 
+         use, you can change the compile-time defaults with:
+
+            #define STBIR_DEFAULT_FILTER_UPSAMPLE     STBIR_FILTER_something
+            #define STBIR_DEFAULT_FILTER_DOWNSAMPLE   STBIR_FILTER_something
+
+         See stbir_filter in the header-file section for the list of filters.
+
+      NEW FILTERS
+         A number of 1D filter kernels are supplied. For a list of supported 
+         filters, see the stbir_filter enum. You can install your own filters by 
+         using the stbir_set_filter_callbacks function.
+
+      PROGRESS
+         For interactive use with slow resize operations, you can use the the 
+         scanline callbacks in the extended API. It would have to be a *very* large 
+         image resample to need progress though - we're very fast.
+
+      CEIL and FLOOR
+         In scalar mode, the only functions we use from math.h are ceilf and floorf, 
+         but if you have your own versions, you can define the STBIR_CEILF(v) and 
+         STBIR_FLOORF(v) macros and we'll use them instead. In SIMD, we just use
+         our own versions.
+
+      ASSERT
+         Define STBIR_ASSERT(boolval) to override assert() and not use assert.h
+
+      FUTURE TODOS
+        *  For polyphase integral filters, we just memcpy the coeffs to dupe
+           them, but we should indirect and use the same coeff memory.
+        *  Add pixel layout conversions for sensible different channel counts
+           (maybe, 1->3/4, 3->4, 4->1, 3->1).
+         * For SIMD encode and decode scanline routines, do any pre-aligning
+           for bad input/output buffer alignments and pitch?
+         * For very wide scanlines, we should we do vertical strips to stay within
+           L2 cache. Maybe do chunks of 1K pixels at a time. There would be 
+           some pixel reconversion, but probably dwarfed by things falling out
+           of cache. Probably also something possible with alternating between
+           scattering and gathering at high resize scales?
+         * Rewrite the coefficient generator to do many at once.
+         * AVX-512 vertical kernels - worried about downclocking here.
+         * Convert the reincludes to macros when we know they aren't changing.
+         * Experiment with pivoting the horizontal and always using the
+           vertical filters (which are faster, but perhaps not enough to overcome
+           the pivot cost and the extra memory touches). Need to buffer the whole
+           image so have to balance memory use.
+         * Most of our code is internally function pointers, should we compile
+           all the SIMD stuff always and dynamically dispatch? 
+
+   CONTRIBUTORS
+      Jeff Roberts: 2.0 implementation, optimizations, SIMD
+      Martins Mozeiko: NEON simd, WASM simd, clang and GCC whisperer.
+      Fabian Giesen: half float and srgb converters
+      Sean Barrett: API design, optimizations
+      Jorge L Rodriguez: Original 1.0 implementation
+      Aras Pranckevicius: bugfixes for 1.0
+      Nathan Reed: warning fixes for 1.0
+
+   REVISIONS
+      2.00 (2022-02-20) mostly new source: new api, optimizations, simd, vertical-first, etc 
+                       (2x-5x faster without simd, 4x-12x faster with simd)
+                       (in some cases, 20x to 40x faster - resizing to very small for example)
+      0.96 (2019-03-04) fixed warnings
+      0.95 (2017-07-23) fixed warnings
+      0.94 (2017-03-18) fixed warnings
+      0.93 (2017-03-03) fixed bug with certain combinations of heights
+      0.92 (2017-01-02) fix integer overflow on large (>2GB) images
+      0.91 (2016-04-02) fix warnings; fix handling of subpixel regions
+      0.90 (2014-09-17) first released version
+
+   LICENSE
+     See end of file for license information.
+*/
+
+#if !defined(STB_IMAGE_RESIZE_DO_HORIZONTALS) && !defined(STB_IMAGE_RESIZE_DO_VERTICALS) && !defined(STB_IMAGE_RESIZE_DO_CODERS)   // for internal re-includes
+
+#ifndef STBIR_INCLUDE_STB_IMAGE_RESIZE2_H
+#define STBIR_INCLUDE_STB_IMAGE_RESIZE2_H
+
+#include <stddef.h>
+#ifdef _MSC_VER
+typedef unsigned char    stbir_uint8;
+typedef unsigned short   stbir_uint16;
+typedef unsigned int     stbir_uint32;
+typedef unsigned __int64 stbir_uint64;
+#else
+#include <stdint.h>
+typedef uint8_t  stbir_uint8;
+typedef uint16_t stbir_uint16;
+typedef uint32_t stbir_uint32;
+typedef uint64_t stbir_uint64;
+#endif
+
+#ifdef _M_IX86_FP
+#if ( _M_IX86_FP >= 1 )
+#ifndef STBIR_SSE
+#define STBIR_SSE
+#endif
+#endif
+#endif 
+
+#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(_M_AMD64) || defined(__SSE2__) || defined(STBIR_SSE) || defined(STBIR_SSE2)
+  #ifndef STBIR_SSE2
+    #define STBIR_SSE2
+  #endif
+  #if defined(__AVX__) || defined(STBIR_AVX2)
+    #ifndef STBIR_AVX
+      #ifndef STBIR_NO_AVX
+        #define STBIR_AVX
+      #endif
+    #endif
+  #endif
+  #if defined(__AVX2__) || defined(STBIR_AVX2)
+    #ifndef STBIR_NO_AVX2
+      #ifndef STBIR_AVX2  
+        #define STBIR_AVX2
+      #endif
+      #if defined( _MSC_VER ) && !defined(__clang__)
+        #ifndef STBIR_FP16C  // FP16C instructions are on all AVX2 cpus, so we can autoselect it here on microsoft - clang needs -m16c
+          #define STBIR_FP16C
+        #endif
+      #endif
+    #endif
+  #endif
+  #ifdef __F16C__
+    #ifndef STBIR_FP16C  // turn on FP16C instructions if the define is set (for clang and gcc)
+      #define STBIR_FP16C
+    #endif
+  #endif
+#endif
+
+#if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || defined(_M_ARM) || (__ARM_NEON_FP & 4) != 0 &&  __ARM_FP16_FORMAT_IEEE != 0
+#ifndef STBIR_NEON
+#define STBIR_NEON
+#endif
+#endif
+
+#if defined(_M_ARM)
+#ifdef STBIR_USE_FMA
+#undef STBIR_USE_FMA // no FMA for 32-bit arm on MSVC 
+#endif
+#endif
+
+#if defined(__wasm__) && defined(__wasm_simd128__)
+#ifndef STBIR_WASM
+#define STBIR_WASM
+#endif
+#endif
+
+#ifndef STBIRDEF
+#ifdef STB_IMAGE_RESIZE_STATIC
+#define STBIRDEF static
+#else
+#ifdef __cplusplus
+#define STBIRDEF extern "C"
+#else
+#define STBIRDEF extern
+#endif
+#endif
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+////   start "header file" ///////////////////////////////////////////////////
+//
+// Easy-to-use API:
+//
+//     * stride is the offset between successive rows of image data 
+//        in memory, in bytes. specify 0 for packed continuously in memory
+//     * colorspace is linear or sRGB as specified by function name
+//     * Uses the default filters
+//     * Uses edge mode clamped
+//     * returned result is 1 for success or 0 in case of an error.
+
+
+// stbir_pixel_layout specifies:
+//   number of channels
+//   order of channels
+//   whether color is premultiplied by alpha
+// for back compatibility, you can cast the old channel count to an stbir_pixel_layout
+typedef enum 
+{
+  STBIR_BGR      = 0,               // 3-chan, with order specified (for channel flipping)
+  STBIR_1CHANNEL = 1,              
+  STBIR_2CHANNEL = 2,
+  STBIR_RGB      = 3,               // 3-chan, with order specified (for channel flipping) 
+  STBIR_RGBA     = 4,               // alpha formats, alpha is NOT premultiplied into color channels
+
+  STBIR_4CHANNEL = 5,
+  STBIR_BGRA = 6,
+  STBIR_ARGB = 7,
+  STBIR_ABGR = 8,
+  STBIR_RA   = 9,
+  STBIR_AR   = 10,
+
+  STBIR_RGBA_PM = 11,               // alpha formats, alpha is premultiplied into color channels
+  STBIR_BGRA_PM = 12,
+  STBIR_ARGB_PM = 13,
+  STBIR_ABGR_PM = 14,
+  STBIR_RA_PM   = 15,
+  STBIR_AR_PM   = 16,
+} stbir_pixel_layout;
+
+//===============================================================
+//  Simple-complexity API
+//
+//    If output_pixels is NULL (0), then we will allocate the buffer and return it to you.
+//--------------------------------
+
+STBIRDEF unsigned char * stbir_resize_uint8_srgb( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
+                                                        unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                                        stbir_pixel_layout pixel_type );
+
+STBIRDEF unsigned char * stbir_resize_uint8_linear( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
+                                                          unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                                          stbir_pixel_layout pixel_type );
+
+STBIRDEF float * stbir_resize_float_linear( const float *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
+                                                  float *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                                  stbir_pixel_layout pixel_type );
+//===============================================================
+
+//===============================================================
+// Medium-complexity API
+//
+// This extends the easy-to-use API as follows:
+//
+//     * Can specify the datatype - U8, U8_SRGB, U16, FLOAT, HALF_FLOAT
+//     * Edge wrap can selected explicitly
+//     * Filter can be selected explicitly
+//--------------------------------
+
+typedef enum
+{
+  STBIR_EDGE_CLAMP   = 0,
+  STBIR_EDGE_REFLECT = 1,
+  STBIR_EDGE_WRAP    = 2,  // this edge mode is slower and uses more memory
+  STBIR_EDGE_ZERO    = 3,
+} stbir_edge;
+
+typedef enum
+{
+  STBIR_FILTER_DEFAULT      = 0,  // use same filter type that easy-to-use API chooses
+  STBIR_FILTER_BOX          = 1,  // A trapezoid w/1-pixel wide ramps, same result as box for integer scale ratios
+  STBIR_FILTER_TRIANGLE     = 2,  // On upsampling, produces same results as bilinear texture filtering
+  STBIR_FILTER_CUBICBSPLINE = 3,  // The cubic b-spline (aka Mitchell-Netrevalli with B=1,C=0), gaussian-esque
+  STBIR_FILTER_CATMULLROM   = 4,  // An interpolating cubic spline
+  STBIR_FILTER_MITCHELL     = 5,  // Mitchell-Netrevalli filter with B=1/3, C=1/3
+  STBIR_FILTER_POINT_SAMPLE = 6,  // Simple point sampling
+  STBIR_FILTER_OTHER        = 7,  // User callback specified
+} stbir_filter;
+
+typedef enum
+{
+  STBIR_TYPE_UINT8            = 0,
+  STBIR_TYPE_UINT8_SRGB       = 1,
+  STBIR_TYPE_UINT8_SRGB_ALPHA = 2,  // alpha channel, when present, should also be SRGB (this is very unusual)
+  STBIR_TYPE_UINT16           = 3,
+  STBIR_TYPE_FLOAT            = 4,
+  STBIR_TYPE_HALF_FLOAT       = 5
+} stbir_datatype;
+
+// medium api
+STBIRDEF void *  stbir_resize( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
+                                     void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                               stbir_pixel_layout pixel_layout, stbir_datatype data_type,
+                               stbir_edge edge, stbir_filter filter );
+//===============================================================
+
+
+
+//===============================================================
+// Extended-complexity API
+//
+// This API exposes all resize functionality.
+//
+//     * Separate filter types for each axis
+//     * Separate edge modes for each axis
+//     * Separate input and output data types
+//     * Can specify regions with subpixel correctness
+//     * Can specify alpha flags
+//     * Can specify a memory callback 
+//     * Can specify a callback data type for pixel input and output 
+//     * Can be threaded for a single resize
+//     * Can be used to resize many frames without recalculating the sampler info
+//
+//  Use this API as follows:
+//     1) Call the stbir_resize_init function on a local STBIR_RESIZE structure
+//     2) Call any of the stbir_set functions
+//     3) Optionally call stbir_build_samplers() if you are going to resample multiple times
+//        with the same input and output dimensions (like resizing video frames)
+//     4) Resample by calling stbir_resize_extended().
+//     5) Call stbir_free_samplers() if you called stbir_build_samplers()
+//--------------------------------
+
+
+// Types:
+
+// INPUT CALLBACK: this callback is used for input scanlines
+typedef void const * stbir_input_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context );
+
+// OUTPUT CALLBACK: this callback is used for output scanlines
+typedef void stbir_output_callback( void const * output_ptr, int num_pixels, int y, void * context );
+
+// callbacks for user installed filters
+typedef float stbir__kernel_callback( float x, float scale, void * user_data ); // centered at zero
+typedef float stbir__support_callback( float scale, void * user_data );
+
+// internal structure with precomputed scaling
+typedef struct stbir__info stbir__info; 
+
+typedef struct STBIR_RESIZE  // use the stbir_resize_init and stbir_override functions to set these values for future compatibility
+{
+  void * user_data;
+  void const * input_pixels;
+  int input_w, input_h;
+  double input_s0, input_t0, input_s1, input_t1;
+  stbir_input_callback * input_cb;
+  void * output_pixels;
+  int output_w, output_h;
+  int output_subx, output_suby, output_subw, output_subh;
+  stbir_output_callback * output_cb;
+  int input_stride_in_bytes;
+  int output_stride_in_bytes;
+  int splits;
+  int fast_alpha;
+  int needs_rebuild;
+  int called_alloc;
+  stbir_pixel_layout input_pixel_layout_public;
+  stbir_pixel_layout output_pixel_layout_public;
+  stbir_datatype input_data_type;
+  stbir_datatype output_data_type;
+  stbir_filter horizontal_filter, vertical_filter;
+  stbir_edge horizontal_edge, vertical_edge;
+  stbir__kernel_callback * horizontal_filter_kernel; stbir__support_callback * horizontal_filter_support;
+  stbir__kernel_callback * vertical_filter_kernel; stbir__support_callback * vertical_filter_support;
+  stbir__info * samplers;      
+} STBIR_RESIZE;
+
+// extended complexity api
+
+
+// First off, you must ALWAYS call stbir_resize_init on your resize structure before any of the other calls!
+STBIRDEF void stbir_resize_init( STBIR_RESIZE * resize,
+                                 const void *input_pixels,  int input_w,  int input_h, int input_stride_in_bytes, // stride can be zero
+                                       void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, // stride can be zero
+                                 stbir_pixel_layout pixel_layout, stbir_datatype data_type );
+
+//===============================================================
+// You can update these parameters any time after resize_init and there is no cost
+//--------------------------------
+
+STBIRDEF void stbir_set_datatypes( STBIR_RESIZE * resize, stbir_datatype input_type, stbir_datatype output_type );         
+STBIRDEF void stbir_set_pixel_callbacks( STBIR_RESIZE * resize, stbir_input_callback * input_cb, stbir_output_callback * output_cb );   // no callbacks by default
+STBIRDEF void stbir_set_user_data( STBIR_RESIZE * resize, void * user_data );                                               // pass back STBIR_RESIZE* by default
+STBIRDEF void stbir_set_buffer_ptrs( STBIR_RESIZE * resize, const void * input_pixels, int input_stride_in_bytes, void * output_pixels, int output_stride_in_bytes );
+
+//===============================================================
+
+
+//===============================================================
+// If you call any of these functions, you will trigger a sampler rebuild!
+//--------------------------------
+
+STBIRDEF int stbir_set_pixel_layouts( STBIR_RESIZE * resize, stbir_pixel_layout input_pixel_layout, stbir_pixel_layout output_pixel_layout );  // sets new buffer layouts
+STBIRDEF int stbir_set_edgemodes( STBIR_RESIZE * resize, stbir_edge horizontal_edge, stbir_edge vertical_edge );       // CLAMP by default
+
+STBIRDEF int stbir_set_filters( STBIR_RESIZE * resize, stbir_filter horizontal_filter, stbir_filter vertical_filter ); // STBIR_DEFAULT_FILTER_UPSAMPLE/DOWNSAMPLE by default
+STBIRDEF int stbir_set_filter_callbacks( STBIR_RESIZE * resize, stbir__kernel_callback * horizontal_filter, stbir__support_callback * horizontal_support, stbir__kernel_callback * vertical_filter, stbir__support_callback * vertical_support ); 
+
+STBIRDEF int stbir_set_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh );        // sets both sub-regions (full regions by default)
+STBIRDEF int stbir_set_input_subrect( STBIR_RESIZE * resize, double s0, double t0, double s1, double t1 );    // sets input sub-region (full region by default)
+STBIRDEF int stbir_set_output_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh ); // sets output sub-region (full region by default)
+
+// when inputting AND outputting non-premultiplied alpha pixels, we use a slower but higher quality technique
+//   that fills the zero alpha pixel's RGB values with something plausible.  If you don't care about areas of
+//   zero alpha, you can call this function to get about a 25% speed improvement for STBIR_RGBA to STBIR_RGBA
+//   types of resizes.
+STBIRDEF int stbir_set_non_pm_alpha_speed_over_quality( STBIR_RESIZE * resize, int non_pma_alpha_speed_over_quality );
+//===============================================================
+
+
+//===============================================================
+// You can call build_samplers to prebuild all the internal data we need to resample.
+//   Then, if you call resize_extended many times with the same resize, you only pay the
+//   cost once.
+// If you do call build_samplers, you MUST call free_samplers eventually.
+//--------------------------------
+
+// This builds the samplers and does one allocation
+STBIRDEF int stbir_build_samplers( STBIR_RESIZE * resize ); 
+
+// You MUST call this, if you call stbir_build_samplers or stbir_build_samplers_with_splits
+STBIRDEF void stbir_free_samplers( STBIR_RESIZE * resize );
+//===============================================================
+
+
+// And this is the main function to perform the resize synchronously on one thread.
+STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize );
+
+
+//===============================================================
+// Use these functions for multithreading.
+//   1) You call stbir_build_samplers_with_splits first on the main thread
+//   2) Then stbir_resize_with_split on each thread
+//   3) stbir_free_samplers when done on the main thread
+//--------------------------------
+
+// This will build samplers for threading.
+//   You can pass in the number of threads you'd like to use (try_splits).
+//   It returns the number of splits (threads) that you can call it with.
+///  It might be less if the image resize can't be split up that many ways.
+
+STBIRDEF int stbir_build_samplers_with_splits( STBIR_RESIZE * resize, int try_splits );             
+
+// This function does a split of the resizing (you call this fuction for each
+// split, on multiple threads). A split is a piece of the output resize pixel space.
+
+// Note that you MUST call stbir_build_samplers_with_splits before stbir_resize_extended_split!
+
+// Usually, you will always call stbir_resize_split with split_start as the thread_index
+//   and "1" for the split_count.
+// But, if you have a weird situation where you MIGHT want 8 threads, but sometimes
+//   only 4 threads, you can use 0,2,4,6 for the split_start's and use "2" for the 
+//   split_count each time to turn in into a 4 thread resize. (This is unusual).
+
+STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start, int split_count );         
+//===============================================================
+
+
+//===============================================================
+// Pixel Callbacks info:
+//--------------------------------
+
+//   The input callback is super flexible - it calls you with the input address
+//   (based on the stride and base pointer), it gives you an optional_output
+//   pointer that you can fill, or you can just return your own pointer into
+//   your own data. 
+//
+//   You can also do conversion from non-supported data types if necessary - in 
+//   this case, you ignore the input_ptr and just use the x and y parameters to 
+//   calculate your own input_ptr based on the size of each non-supported pixel.
+//   (Something like the third example below.)
+//
+//   You can also install just an input or just an output callback by setting the
+//   callback that you don't want to zero.
+//
+//     First example, progress: (getting a callback that you can monitor the progress):
+//        void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
+//        {
+//           percentage_done = y / input_height;
+//           return input_ptr;  // use buffer from call
+//        }
+//
+//     Next example, copying: (copy from some other buffer or stream):  
+//        void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
+//        {
+//           CopyOrStreamData( optional_output, other_data_src, num_pixels * pixel_width_in_bytes );
+//           return optional_output;  // return the optional buffer that we filled
+//        }
+//
+//     Third example, input another buffer without copying: (zero-copy from other buffer):  
+//        void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
+//        {
+//           void * pixels = ( (char*) other_image_base ) + ( y * other_image_stride ) + ( x * other_pixel_width_in_bytes );
+//           return pixels;       // return pointer to your data without copying
+//        }
+//
+//
+//   The output callback is considerably simpler - it just calls you so that you can dump
+//   out each scanline. You could even directly copy out to disk if you have a simple format
+//   like TGA or BMP. You can also convert to other output types here if you want.
+//
+//   Simple example:
+//        void const * my_output( void * output_ptr, int num_pixels, int y, void * context )
+//        {
+//           percentage_done = y / output_height;
+//           fwrite( output_ptr, pixel_width_in_bytes, num_pixels, output_file );
+//        }
+//===============================================================
+
+
+
+
+//===============================================================
+// optional built-in profiling API
+//--------------------------------
+
+#ifdef STBIR_PROFILE
+
+typedef struct STBIR_PROFILE_INFO 
+{
+  stbir_uint64 total_clocks;
+
+  // how many clocks spent (of total_clocks) in the various resize routines, along with a string description
+  //    there are "resize_count" number of zones
+  stbir_uint64 clocks[ 8 ];
+  char const ** descriptions;
+  
+  // count of clocks and descriptions
+  stbir_uint32 count;
+} STBIR_PROFILE_INFO;
+
+// use after calling stbir_resize_extended (or stbir_build_samplers or stbir_build_samplers_with_splits)
+STBIRDEF void stbir_resize_build_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize );
+
+// use after calling stbir_resize_extended
+STBIRDEF void stbir_resize_extended_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize );
+
+// use after calling stbir_resize_extended_split
+STBIRDEF void stbir_resize_split_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize, int split_start, int split_num );
+
+//===============================================================
+
+#endif
+
+
+////   end header file   /////////////////////////////////////////////////////
+#endif // STBIR_INCLUDE_STB_IMAGE_RESIZE2_H
+
+#if defined(STB_IMAGE_RESIZE_IMPLEMENTATION) || defined(STB_IMAGE_RESIZE2_IMPLEMENTATION)
+
+#ifndef STBIR_ASSERT
+#include <assert.h>
+#define STBIR_ASSERT(x) assert(x)
+#endif
+
+#ifndef STBIR_MALLOC
+#include <stdlib.h>
+#define STBIR_MALLOC(size,user_data) ((void)(user_data), malloc(size))
+#define STBIR_FREE(ptr,user_data)    ((void)(user_data), free(ptr))
+// (we used the comma operator to evaluate user_data, to avoid "unused parameter" warnings)
+#endif
+
+#ifdef _MSC_VER
+
+#define stbir__inline __forceinline
+
+#else
+
+#define stbir__inline __inline__
+
+// Clang address sanitizer
+#if defined(__has_feature)
+  #if __has_feature(address_sanitizer) || __has_feature(memory_sanitizer)
+    #ifndef STBIR__SEPARATE_ALLOCATIONS
+      #define STBIR__SEPARATE_ALLOCATIONS
+    #endif
+  #endif
+#endif
+
+#endif
+
+// GCC and MSVC
+#if defined(__SANITIZE_ADDRESS__)
+  #ifndef STBIR__SEPARATE_ALLOCATIONS
+    #define STBIR__SEPARATE_ALLOCATIONS
+  #endif
+#endif
+
+// Always turn off automatic FMA use - use STBIR_USE_FMA if you want.
+// Otherwise, this is a determinism disaster.
+#ifndef STBIR_DONT_CHANGE_FP_CONTRACT  // override in case you don't want this behavior
+#if defined(_MSC_VER) && !defined(__clang__)
+#if _MSC_VER > 1200
+#pragma fp_contract(off)
+#endif
+#elif defined(__GNUC__) &&  !defined(__clang__)
+#pragma GCC optimize("fp-contract=off")
+#else
+#pragma STDC FP_CONTRACT OFF
+#endif
+#endif
+
+#ifdef _MSC_VER
+#define STBIR__UNUSED(v)  (void)(v)
+#else
+#define STBIR__UNUSED(v)  (void)sizeof(v)
+#endif
+
+#define STBIR__ARRAY_SIZE(a) (sizeof((a))/sizeof((a)[0]))
+
+
+#ifndef STBIR_DEFAULT_FILTER_UPSAMPLE
+#define STBIR_DEFAULT_FILTER_UPSAMPLE    STBIR_FILTER_CATMULLROM
+#endif
+
+#ifndef STBIR_DEFAULT_FILTER_DOWNSAMPLE
+#define STBIR_DEFAULT_FILTER_DOWNSAMPLE  STBIR_FILTER_MITCHELL
+#endif
+
+
+#ifndef STBIR__HEADER_FILENAME
+#define STBIR__HEADER_FILENAME "stb_image_resize2.h"
+#endif
+
+// the internal pixel layout enums are in a different order, so we can easily do range comparisons of types
+//   the public pixel layout is ordered in a way that if you cast num_channels (1-4) to the enum, you get something sensible 
+typedef enum 
+{
+  STBIRI_1CHANNEL = 0,
+  STBIRI_2CHANNEL = 1,
+  STBIRI_RGB      = 2,
+  STBIRI_BGR      = 3,
+  STBIRI_4CHANNEL = 4,
+  
+  STBIRI_RGBA = 5,
+  STBIRI_BGRA = 6,
+  STBIRI_ARGB = 7,
+  STBIRI_ABGR = 8,
+  STBIRI_RA   = 9,
+  STBIRI_AR   = 10,
+
+  STBIRI_RGBA_PM = 11,
+  STBIRI_BGRA_PM = 12,
+  STBIRI_ARGB_PM = 13,
+  STBIRI_ABGR_PM = 14,
+  STBIRI_RA_PM   = 15,
+  STBIRI_AR_PM   = 16,
+} stbir_internal_pixel_layout;
+
+// define the public pixel layouts to not compile inside the implementation (to avoid accidental use)
+#define STBIR_BGR bad_dont_use_in_implementation
+#define STBIR_1CHANNEL STBIR_BGR
+#define STBIR_2CHANNEL STBIR_BGR
+#define STBIR_RGB STBIR_BGR
+#define STBIR_RGBA STBIR_BGR
+#define STBIR_4CHANNEL STBIR_BGR
+#define STBIR_BGRA STBIR_BGR
+#define STBIR_ARGB STBIR_BGR
+#define STBIR_ABGR STBIR_BGR
+#define STBIR_RA STBIR_BGR
+#define STBIR_AR STBIR_BGR
+#define STBIR_RGBA_PM STBIR_BGR
+#define STBIR_BGRA_PM STBIR_BGR
+#define STBIR_ARGB_PM STBIR_BGR
+#define STBIR_ABGR_PM STBIR_BGR
+#define STBIR_RA_PM STBIR_BGR
+#define STBIR_AR_PM STBIR_BGR
+
+// must match stbir_datatype
+static unsigned char stbir__type_size[] = {
+  1,1,1,2,4,2 // STBIR_TYPE_UINT8,STBIR_TYPE_UINT8_SRGB,STBIR_TYPE_UINT8_SRGB_ALPHA,STBIR_TYPE_UINT16,STBIR_TYPE_FLOAT,STBIR_TYPE_HALF_FLOAT
+};
+
+// When gathering, the contributors are which source pixels contribute.
+// When scattering, the contributors are which destination pixels are contributed to.
+typedef struct
+{
+  int n0; // First contributing pixel
+  int n1; // Last contributing pixel
+} stbir__contributors;
+
+typedef struct
+{
+  int lowest;    // First sample index for whole filter
+  int highest;   // Last sample index for whole filter
+  int widest;    // widest single set of samples for an output
+} stbir__filter_extent_info;
+
+typedef struct
+{
+  int n0; // First pixel of decode buffer to write to
+  int n1; // Last pixel of decode that will be written to
+  int pixel_offset_for_input;  // Pixel offset into input_scanline
+} stbir__span;
+
+typedef struct stbir__scale_info
+{
+  int input_full_size;
+  int output_sub_size;
+  float scale;
+  float inv_scale;
+  float pixel_shift; // starting shift in output pixel space (in pixels)
+  int scale_is_rational;
+  stbir_uint32 scale_numerator, scale_denominator;
+} stbir__scale_info;
+
+typedef struct
+{
+  stbir__contributors * contributors;
+  float* coefficients;
+  stbir__contributors * gather_prescatter_contributors;
+  float * gather_prescatter_coefficients;
+  stbir__scale_info scale_info;
+  float support;
+  stbir_filter filter_enum;
+  stbir__kernel_callback * filter_kernel;
+  stbir__support_callback * filter_support;
+  stbir_edge edge;
+  int coefficient_width;
+  int filter_pixel_width;
+  int filter_pixel_margin;
+  int num_contributors;
+  int contributors_size;
+  int coefficients_size;
+  stbir__filter_extent_info extent_info;
+  int is_gather;  // 0 = scatter, 1 = gather with scale >= 1, 2 = gather with scale < 1
+  int gather_prescatter_num_contributors;
+  int gather_prescatter_coefficient_width;
+  int gather_prescatter_contributors_size;
+  int gather_prescatter_coefficients_size;
+} stbir__sampler;
+
+typedef struct
+{
+  stbir__contributors conservative;
+  int edge_sizes[2];    // this can be less than filter_pixel_margin, if the filter and scaling falls off
+  stbir__span spans[2]; // can be two spans, if doing input subrect with clamp mode WRAP
+} stbir__extents;
+
+typedef struct 
+{
+#ifdef STBIR_PROFILE
+  union
+  {
+    struct { stbir_uint64 total, looping, vertical, horizontal, decode, encode, alpha, unalpha; } named;
+    stbir_uint64 array[8];
+  } profile;
+  stbir_uint64 * current_zone_excluded_ptr;
+#endif
+  float* decode_buffer;
+
+  int ring_buffer_first_scanline;
+  int ring_buffer_last_scanline;
+  int ring_buffer_begin_index;    // first_scanline is at this index in the ring buffer
+  int start_output_y, end_output_y;
+  int start_input_y, end_input_y;  // used in scatter only
+
+  #ifdef STBIR__SEPARATE_ALLOCATIONS
+    float** ring_buffers; // one pointer for each ring buffer
+  #else
+    float* ring_buffer;  // one big buffer that we index into
+  #endif
+
+  float* vertical_buffer;
+
+  char no_cache_straddle[64];
+} stbir__per_split_info;
+
+typedef void stbir__decode_pixels_func( float * decode, int width_times_channels, void const * input );
+typedef void stbir__alpha_weight_func( float * decode_buffer, int width_times_channels );
+typedef void stbir__horizontal_gather_channels_func( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, 
+  stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width );
+typedef void stbir__alpha_unweight_func(float * encode_buffer, int width_times_channels );
+typedef void stbir__encode_pixels_func( void * output, int width_times_channels, float const * encode );
+
+struct stbir__info
+{
+#ifdef STBIR_PROFILE
+  union
+  {
+    struct { stbir_uint64 total, build, alloc, horizontal, vertical, cleanup, pivot; } named;
+    stbir_uint64 array[7];
+  } profile;
+  stbir_uint64 * current_zone_excluded_ptr;
+#endif
+  stbir__sampler horizontal;
+  stbir__sampler vertical;
+
+  void const * input_data;
+  void * output_data;
+
+  int input_stride_bytes;
+  int output_stride_bytes;
+  int ring_buffer_length_bytes;   // The length of an individual entry in the ring buffer. The total number of ring buffers is stbir__get_filter_pixel_width(filter)
+  int ring_buffer_num_entries;    // Total number of entries in the ring buffer.
+
+  stbir_datatype input_type;
+  stbir_datatype output_type;
+
+  stbir_input_callback * in_pixels_cb;
+  void * user_data;
+  stbir_output_callback * out_pixels_cb;
+
+  stbir__extents scanline_extents;
+
+  void * alloced_mem;
+  stbir__per_split_info * split_info;  // by default 1, but there will be N of these allocated based on the thread init you did
+
+  stbir__decode_pixels_func * decode_pixels;
+  stbir__alpha_weight_func * alpha_weight;
+  stbir__horizontal_gather_channels_func * horizontal_gather_channels;
+  stbir__alpha_unweight_func * alpha_unweight;
+  stbir__encode_pixels_func * encode_pixels;
+  
+  int alloced_total;
+  int splits; // count of splits
+  
+  stbir_internal_pixel_layout input_pixel_layout_internal;
+  stbir_internal_pixel_layout output_pixel_layout_internal;
+
+  int input_color_and_type;
+  int offset_x, offset_y; // offset within output_data
+  int vertical_first;
+  int channels;
+  int effective_channels; // same as channels, except on RGBA/ARGB (7), or XA/AX (3)
+  int alloc_ring_buffer_num_entries;    // Number of entries in the ring buffer that will be allocated
+};
+
+
+#define stbir__max_uint8_as_float             255.0f
+#define stbir__max_uint16_as_float            65535.0f
+#define stbir__max_uint8_as_float_inverted    (1.0f/255.0f)
+#define stbir__max_uint16_as_float_inverted   (1.0f/65535.0f)
+#define stbir__small_float ((float)1 / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20))
+
+// min/max friendly
+#define STBIR_CLAMP(x, xmin, xmax) do { \
+  if ( (x) < (xmin) ) (x) = (xmin);     \
+  if ( (x) > (xmax) ) (x) = (xmax);     \
+} while (0)
+
+static stbir__inline int stbir__min(int a, int b)
+{
+  return a < b ? a : b;
+}
+
+static stbir__inline int stbir__max(int a, int b)
+{
+  return a > b ? a : b;
+}
+
+static float stbir__srgb_uchar_to_linear_float[256] = {
+  0.000000f, 0.000304f, 0.000607f, 0.000911f, 0.001214f, 0.001518f, 0.001821f, 0.002125f, 0.002428f, 0.002732f, 0.003035f,
+  0.003347f, 0.003677f, 0.004025f, 0.004391f, 0.004777f, 0.005182f, 0.005605f, 0.006049f, 0.006512f, 0.006995f, 0.007499f,
+  0.008023f, 0.008568f, 0.009134f, 0.009721f, 0.010330f, 0.010960f, 0.011612f, 0.012286f, 0.012983f, 0.013702f, 0.014444f,
+  0.015209f, 0.015996f, 0.016807f, 0.017642f, 0.018500f, 0.019382f, 0.020289f, 0.021219f, 0.022174f, 0.023153f, 0.024158f,
+  0.025187f, 0.026241f, 0.027321f, 0.028426f, 0.029557f, 0.030713f, 0.031896f, 0.033105f, 0.034340f, 0.035601f, 0.036889f,
+  0.038204f, 0.039546f, 0.040915f, 0.042311f, 0.043735f, 0.045186f, 0.046665f, 0.048172f, 0.049707f, 0.051269f, 0.052861f,
+  0.054480f, 0.056128f, 0.057805f, 0.059511f, 0.061246f, 0.063010f, 0.064803f, 0.066626f, 0.068478f, 0.070360f, 0.072272f,
+  0.074214f, 0.076185f, 0.078187f, 0.080220f, 0.082283f, 0.084376f, 0.086500f, 0.088656f, 0.090842f, 0.093059f, 0.095307f,
+  0.097587f, 0.099899f, 0.102242f, 0.104616f, 0.107023f, 0.109462f, 0.111932f, 0.114435f, 0.116971f, 0.119538f, 0.122139f,
+  0.124772f, 0.127438f, 0.130136f, 0.132868f, 0.135633f, 0.138432f, 0.141263f, 0.144128f, 0.147027f, 0.149960f, 0.152926f,
+  0.155926f, 0.158961f, 0.162029f, 0.165132f, 0.168269f, 0.171441f, 0.174647f, 0.177888f, 0.181164f, 0.184475f, 0.187821f,
+  0.191202f, 0.194618f, 0.198069f, 0.201556f, 0.205079f, 0.208637f, 0.212231f, 0.215861f, 0.219526f, 0.223228f, 0.226966f,
+  0.230740f, 0.234551f, 0.238398f, 0.242281f, 0.246201f, 0.250158f, 0.254152f, 0.258183f, 0.262251f, 0.266356f, 0.270498f,
+  0.274677f, 0.278894f, 0.283149f, 0.287441f, 0.291771f, 0.296138f, 0.300544f, 0.304987f, 0.309469f, 0.313989f, 0.318547f,
+  0.323143f, 0.327778f, 0.332452f, 0.337164f, 0.341914f, 0.346704f, 0.351533f, 0.356400f, 0.361307f, 0.366253f, 0.371238f,
+  0.376262f, 0.381326f, 0.386430f, 0.391573f, 0.396755f, 0.401978f, 0.407240f, 0.412543f, 0.417885f, 0.423268f, 0.428691f,
+  0.434154f, 0.439657f, 0.445201f, 0.450786f, 0.456411f, 0.462077f, 0.467784f, 0.473532f, 0.479320f, 0.485150f, 0.491021f,
+  0.496933f, 0.502887f, 0.508881f, 0.514918f, 0.520996f, 0.527115f, 0.533276f, 0.539480f, 0.545725f, 0.552011f, 0.558340f,
+  0.564712f, 0.571125f, 0.577581f, 0.584078f, 0.590619f, 0.597202f, 0.603827f, 0.610496f, 0.617207f, 0.623960f, 0.630757f,
+  0.637597f, 0.644480f, 0.651406f, 0.658375f, 0.665387f, 0.672443f, 0.679543f, 0.686685f, 0.693872f, 0.701102f, 0.708376f,
+  0.715694f, 0.723055f, 0.730461f, 0.737911f, 0.745404f, 0.752942f, 0.760525f, 0.768151f, 0.775822f, 0.783538f, 0.791298f,
+  0.799103f, 0.806952f, 0.814847f, 0.822786f, 0.830770f, 0.838799f, 0.846873f, 0.854993f, 0.863157f, 0.871367f, 0.879622f,
+  0.887923f, 0.896269f, 0.904661f, 0.913099f, 0.921582f, 0.930111f, 0.938686f, 0.947307f, 0.955974f, 0.964686f, 0.973445f,
+  0.982251f, 0.991102f, 1.0f
+};
+
+typedef union
+{
+  unsigned int u;
+  float f;
+} stbir__FP32;
+
+// From https://gist.github.com/rygorous/2203834
+
+static const stbir_uint32 fp32_to_srgb8_tab4[104] = {
+  0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d, 0x008d000d, 0x0094000d, 0x009a000d, 0x00a1000d,
+  0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a, 0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a,
+  0x010e0033, 0x01280033, 0x01410033, 0x015b0033, 0x01750033, 0x018f0033, 0x01a80033, 0x01c20033,
+  0x01dc0067, 0x020f0067, 0x02430067, 0x02760067, 0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067,
+  0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce, 0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5,
+  0x06970158, 0x07420142, 0x07e30130, 0x087b0120, 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
+  0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180, 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
+  0x11070264, 0x1238023e, 0x1357021d, 0x14660201, 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
+  0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad, 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
+  0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392, 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
+  0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5, 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
+  0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
+  0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
+};
+ 
+static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
+{
+  static const stbir__FP32 almostone = { 0x3f7fffff }; // 1-eps
+  static const stbir__FP32 minval = { (127-13) << 23 };
+  stbir_uint32 tab,bias,scale,t;
+  stbir__FP32 f;
+
+  // Clamp to [2^(-13), 1-eps]; these two values map to 0 and 1, respectively.
+  // The tests are carefully written so that NaNs map to 0, same as in the reference
+  // implementation.
+  if (!(in > minval.f)) // written this way to catch NaNs
+      return 0;
+  if (in > almostone.f)
+      return 255;
+
+  // Do the table lookup and unpack bias, scale
+  f.f = in;
+  tab = fp32_to_srgb8_tab4[(f.u - minval.u) >> 20];
+  bias = (tab >> 16) << 9;
+  scale = tab & 0xffff;
+
+  // Grab next-highest mantissa bits and perform linear interpolation
+  t = (f.u >> 12) & 0xff;
+  return (unsigned char) ((bias + scale*t) >> 16);
+}
+
+#ifndef STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT
+#define STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT 32 // when downsampling and <= 32 scanlines of buffering, use gather. gather used down to 1/8th scaling for 25% win.
+#endif
+
+// restrict pointers for the output pointers
+#if defined( _MSC_VER ) && !defined(__clang__)
+  #define STBIR_STREAMOUT_PTR( star ) star __restrict
+  #define STBIR_NO_UNROLL( ptr ) __assume(ptr) // this oddly keeps msvc from unrolling a loop
+#elif defined(  __clang__ )
+  #define STBIR_STREAMOUT_PTR( star ) star __restrict__
+  #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr))
+#elif defined(  __GNUC__ )
+  #define STBIR_STREAMOUT_PTR( star ) star __restrict__
+  #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr))
+#else
+  #define STBIR_STREAMOUT_PTR( star ) star
+  #define STBIR_NO_UNROLL( ptr )
+#endif
+
+#ifdef STBIR_NO_SIMD // force simd off for whatever reason
+
+// force simd off overrides everything else, so clear it all
+
+#ifdef STBIR_SSE2
+#undef STBIR_SSE2
+#endif
+
+#ifdef STBIR_AVX
+#undef STBIR_AVX
+#endif
+
+#ifdef STBIR_NEON
+#undef STBIR_NEON
+#endif
+
+#ifdef STBIR_AVX2
+#undef STBIR_AVX2
+#endif
+
+#ifdef STBIR_FP16C
+#undef STBIR_FP16C
+#endif
+
+#ifdef STBIR_WASM
+#undef STBIR_WASM
+#endif
+
+#ifdef STBIR_SIMD
+#undef STBIR_SIMD
+#endif
+
+#else // STBIR_SIMD
+
+#ifdef STBIR_SSE2
+  #include <emmintrin.h>
+  
+  #define stbir__simdf __m128
+  #define stbir__simdi __m128i
+
+  #define stbir_simdi_castf( reg ) _mm_castps_si128(reg)
+  #define stbir_simdf_casti( reg ) _mm_castsi128_ps(reg)
+
+  #define stbir__simdf_load( reg, ptr ) (reg) = _mm_loadu_ps( (float const*)(ptr) )
+  #define stbir__simdi_load( reg, ptr ) (reg) = _mm_loadu_si128 ( (stbir__simdi const*)(ptr) )
+  #define stbir__simdf_load1( out, ptr ) (out) = _mm_load_ss( (float const*)(ptr) )  // top values can be random (not denormal or nan for perf)
+  #define stbir__simdi_load1( out, ptr ) (out) = _mm_castps_si128( _mm_load_ss( (float const*)(ptr) ))
+  #define stbir__simdf_load1z( out, ptr ) (out) = _mm_load_ss( (float const*)(ptr) )  // top values must be zero
+  #define stbir__simdf_frep4( fvar ) _mm_set_ps1( fvar )
+  #define stbir__simdf_load1frep4( out, fvar ) (out) = _mm_set_ps1( fvar )
+  #define stbir__simdf_load2( out, ptr ) (out) = _mm_castsi128_ps( _mm_loadl_epi64( (__m128i*)(ptr)) ) // top values can be random (not denormal or nan for perf)
+  #define stbir__simdf_load2z( out, ptr ) (out) = _mm_castsi128_ps( _mm_loadl_epi64( (__m128i*)(ptr)) ) // top values must be zero
+  #define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = _mm_castpd_ps(_mm_loadh_pd( _mm_castps_pd(reg), (double*)(ptr) ))
+
+  #define stbir__simdf_zeroP() _mm_setzero_ps()
+  #define stbir__simdf_zero( reg ) (reg) = _mm_setzero_ps()
+
+  #define stbir__simdf_store( ptr, reg )  _mm_storeu_ps( (float*)(ptr), reg )
+  #define stbir__simdf_store1( ptr, reg ) _mm_store_ss( (float*)(ptr), reg )
+  #define stbir__simdf_store2( ptr, reg ) _mm_storel_epi64( (__m128i*)(ptr), _mm_castps_si128(reg) )
+  #define stbir__simdf_store2h( ptr, reg ) _mm_storeh_pd( (double*)(ptr), _mm_castps_pd(reg) )
+
+  #define stbir__simdi_store( ptr, reg )  _mm_storeu_si128( (__m128i*)(ptr), reg )
+  #define stbir__simdi_store1( ptr, reg ) _mm_store_ss( (float*)(ptr), _mm_castsi128_ps(reg) )
+  #define stbir__simdi_store2( ptr, reg ) _mm_storel_epi64( (__m128i*)(ptr), (reg) )
+
+  #define stbir__prefetch( ptr ) _mm_prefetch((char*)(ptr), _MM_HINT_T0 )
+ 
+  #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \
+  { \
+    stbir__simdi zero = _mm_setzero_si128(); \
+    out2 = _mm_unpacklo_epi8( ireg, zero ); \
+    out3 = _mm_unpackhi_epi8( ireg, zero ); \
+    out0 = _mm_unpacklo_epi16( out2, zero ); \
+    out1 = _mm_unpackhi_epi16( out2, zero ); \
+    out2 = _mm_unpacklo_epi16( out3, zero ); \
+    out3 = _mm_unpackhi_epi16( out3, zero ); \
+  }
+
+#define stbir__simdi_expand_u8_to_1u32(out,ireg) \
+  { \
+    stbir__simdi zero = _mm_setzero_si128(); \
+    out = _mm_unpacklo_epi8( ireg, zero ); \
+    out = _mm_unpacklo_epi16( out, zero ); \
+  }
+
+  #define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \
+  { \
+    stbir__simdi zero = _mm_setzero_si128(); \
+    out0 = _mm_unpacklo_epi16( ireg, zero ); \
+    out1 = _mm_unpackhi_epi16( ireg, zero ); \
+  }
+
+  #define stbir__simdf_convert_float_to_i32( i, f ) (i) = _mm_cvttps_epi32(f)
+  #define stbir__simdf_convert_float_to_int( f ) _mm_cvtt_ss2si(f)
+  #define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)_mm_cvtsi128_si32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(f,STBIR__CONSTF(STBIR_max_uint8_as_float)),_mm_setzero_ps()))))
+  #define stbir__simdf_convert_float_to_short( f ) ((unsigned short)_mm_cvtsi128_si32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(f,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps()))))
+
+  #define stbir__simdi_to_int( i ) _mm_cvtsi128_si32(i) 
+  #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = _mm_cvtepi32_ps( ireg )
+  #define stbir__simdf_add( out, reg0, reg1 ) (out) = _mm_add_ps( reg0, reg1 )
+  #define stbir__simdf_mult( out, reg0, reg1 ) (out) = _mm_mul_ps( reg0, reg1 )
+  #define stbir__simdf_mult_mem( out, reg, ptr ) (out) = _mm_mul_ps( reg, _mm_loadu_ps( (float const*)(ptr) ) )
+  #define stbir__simdf_mult1_mem( out, reg, ptr ) (out) = _mm_mul_ss( reg, _mm_load_ss( (float const*)(ptr) ) )
+  #define stbir__simdf_add_mem( out, reg, ptr ) (out) = _mm_add_ps( reg, _mm_loadu_ps( (float const*)(ptr) ) )
+  #define stbir__simdf_add1_mem( out, reg, ptr ) (out) = _mm_add_ss( reg, _mm_load_ss( (float const*)(ptr) ) )
+
+  #ifdef STBIR_USE_FMA           // not on by default to maintain bit identical simd to non-simd
+  #include <immintrin.h>
+  #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = _mm_fmadd_ps( mul1, mul2, add )
+  #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = _mm_fmadd_ss( mul1, mul2, add )
+  #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = _mm_fmadd_ps( mul, _mm_loadu_ps( (float const*)(ptr) ), add )
+  #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = _mm_fmadd_ss( mul, _mm_load_ss( (float const*)(ptr) ), add )
+  #else
+  #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = _mm_add_ps( add, _mm_mul_ps( mul1, mul2 ) )
+  #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = _mm_add_ss( add, _mm_mul_ss( mul1, mul2 ) )
+  #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = _mm_add_ps( add, _mm_mul_ps( mul, _mm_loadu_ps( (float const*)(ptr) ) ) )
+  #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = _mm_add_ss( add, _mm_mul_ss( mul, _mm_load_ss( (float const*)(ptr) ) ) )
+  #endif
+
+  #define stbir__simdf_add1( out, reg0, reg1 ) (out) = _mm_add_ss( reg0, reg1 )
+  #define stbir__simdf_mult1( out, reg0, reg1 ) (out) = _mm_mul_ss( reg0, reg1 )
+
+  #define stbir__simdf_and( out, reg0, reg1 ) (out) = _mm_and_ps( reg0, reg1 )
+  #define stbir__simdf_or( out, reg0, reg1 ) (out) = _mm_or_ps( reg0, reg1 )
+
+  #define stbir__simdf_min( out, reg0, reg1 ) (out) = _mm_min_ps( reg0, reg1 )
+  #define stbir__simdf_max( out, reg0, reg1 ) (out) = _mm_max_ps( reg0, reg1 )
+  #define stbir__simdf_min1( out, reg0, reg1 ) (out) = _mm_min_ss( reg0, reg1 )
+  #define stbir__simdf_max1( out, reg0, reg1 ) (out) = _mm_max_ss( reg0, reg1 )
+
+  #define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_shuffle_ps( reg1,reg0, (0<<0) + (1<<2) + (2<<4) + (3<<6) )), (3<<0) + (0<<2) + (1<<4) + (2<<6) ) )
+  #define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_shuffle_ps( reg1,reg0, (0<<0) + (1<<2) + (2<<4) + (3<<6) )), (2<<0) + (3<<2) + (0<<4) + (1<<6) ) )
+
+  static const stbir__simdf STBIR_zeroones = { 0.0f,1.0f,0.0f,1.0f };
+  static const stbir__simdf STBIR_onezeros = { 1.0f,0.0f,1.0f,0.0f };
+  #define stbir__simdf_aaa1( out, alp, ones ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_movehl_ps( ones, alp ) ), (1<<0) + (1<<2) + (1<<4) + (2<<6) ) )
+  #define stbir__simdf_1aaa( out, alp, ones ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_movelh_ps( ones, alp ) ), (0<<0) + (2<<2) + (2<<4) + (2<<6) ) )
+  #define stbir__simdf_a1a1( out, alp, ones) (out) = _mm_or_ps( _mm_castsi128_ps( _mm_srli_epi64( _mm_castps_si128(alp), 32 ) ), STBIR_zeroones )
+  #define stbir__simdf_1a1a( out, alp, ones) (out) = _mm_or_ps( _mm_castsi128_ps( _mm_slli_epi64( _mm_castps_si128(alp), 32 ) ), STBIR_onezeros )
+
+  #define stbir__simdf_swiz( reg, one, two, three, four ) _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( reg ), (one<<0) + (two<<2) + (three<<4) + (four<<6) ) )
+
+  #define stbir__simdi_and( out, reg0, reg1 ) (out) = _mm_and_si128( reg0, reg1 )
+  #define stbir__simdi_or( out, reg0, reg1 ) (out) = _mm_or_si128( reg0, reg1 )
+  #define stbir__simdi_16madd( out, reg0, reg1 ) (out) = _mm_madd_epi16( reg0, reg1 )
+
+  #define stbir__simdf_pack_to_8bytes(out,aa,bb) \
+  { \
+    stbir__simdf af,bf; \
+    stbir__simdi a,b; \
+    af = _mm_min_ps( aa, STBIR_max_uint8_as_float ); \
+    bf = _mm_min_ps( bb, STBIR_max_uint8_as_float ); \
+    af = _mm_max_ps( af, _mm_setzero_ps() ); \
+    bf = _mm_max_ps( bf, _mm_setzero_ps() ); \
+    a = _mm_cvttps_epi32( af ); \
+    b = _mm_cvttps_epi32( bf ); \
+    a = _mm_packs_epi32( a, b ); \
+    out = _mm_packus_epi16( a, a ); \
+  }
+
+  #define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \
+      stbir__simdf_load( o0, (ptr) );    \
+      stbir__simdf_load( o1, (ptr)+4 );  \
+      stbir__simdf_load( o2, (ptr)+8 );  \
+      stbir__simdf_load( o3, (ptr)+12 ); \
+      {                                  \
+        __m128 tmp0, tmp1, tmp2, tmp3;   \
+        tmp0 = _mm_unpacklo_ps(o0, o1);  \
+        tmp2 = _mm_unpacklo_ps(o2, o3);  \
+        tmp1 = _mm_unpackhi_ps(o0, o1);  \
+        tmp3 = _mm_unpackhi_ps(o2, o3);  \
+        o0 = _mm_movelh_ps(tmp0, tmp2);  \
+        o1 = _mm_movehl_ps(tmp2, tmp0);  \
+        o2 = _mm_movelh_ps(tmp1, tmp3);  \
+        o3 = _mm_movehl_ps(tmp3, tmp1);  \
+      }
+
+  #define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \
+      r0 = _mm_packs_epi32( r0, r1 ); \
+      r2 = _mm_packs_epi32( r2, r3 ); \
+      r1 = _mm_unpacklo_epi16( r0, r2 ); \
+      r3 = _mm_unpackhi_epi16( r0, r2 ); \
+      r0 = _mm_unpacklo_epi16( r1, r3 ); \
+      r2 = _mm_unpackhi_epi16( r1, r3 ); \
+      r0 = _mm_packus_epi16( r0, r2 ); \
+      stbir__simdi_store( ptr, r0 ); \
+
+  #define stbir__simdi_32shr( out, reg, imm ) out = _mm_srli_epi32( reg, imm )
+
+  #if defined(_MSC_VER) && !defined(__clang__)
+    // msvc inits with 8 bytes
+    #define STBIR__CONST_32_TO_8( v ) (char)(unsigned char)((v)&255),(char)(unsigned char)(((v)>>8)&255),(char)(unsigned char)(((v)>>16)&255),(char)(unsigned char)(((v)>>24)&255)
+    #define STBIR__CONST_4_32i( v ) STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v )
+    #define STBIR__CONST_4d_32i( v0, v1, v2, v3 ) STBIR__CONST_32_TO_8( v0 ), STBIR__CONST_32_TO_8( v1 ), STBIR__CONST_32_TO_8( v2 ), STBIR__CONST_32_TO_8( v3 )
+  #else
+    // everything else inits with long long's
+    #define STBIR__CONST_4_32i( v ) (long long)((((stbir_uint64)(stbir_uint32)(v))<<32)|((stbir_uint64)(stbir_uint32)(v))),(long long)((((stbir_uint64)(stbir_uint32)(v))<<32)|((stbir_uint64)(stbir_uint32)(v)))
+    #define STBIR__CONST_4d_32i( v0, v1, v2, v3 ) (long long)((((stbir_uint64)(stbir_uint32)(v1))<<32)|((stbir_uint64)(stbir_uint32)(v0))),(long long)((((stbir_uint64)(stbir_uint32)(v3))<<32)|((stbir_uint64)(stbir_uint32)(v2)))
+  #endif
+
+  #define STBIR__SIMDF_CONST(var, x) stbir__simdf var = { x, x, x, x }
+  #define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { STBIR__CONST_4_32i(x) }
+  #define STBIR__CONSTF(var) (var)
+  #define STBIR__CONSTI(var) (var)
+
+  #if defined(STBIR_AVX) || defined(__SSE4_1__)
+    #include <smmintrin.h>
+    #define stbir__simdf_pack_to_8words(out,reg0,reg1) out = _mm_packus_epi32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg0,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())), _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg1,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())))
+  #else
+    STBIR__SIMDI_CONST(stbir__s32_32768, 32768);
+    STBIR__SIMDI_CONST(stbir__s16_32768, ((32768<<16)|32768));
+
+    #define stbir__simdf_pack_to_8words(out,reg0,reg1) \
+      { \
+        stbir__simdi tmp0,tmp1; \
+        tmp0 = _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg0,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())); \
+        tmp1 = _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg1,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())); \
+        tmp0 = _mm_sub_epi32( tmp0, stbir__s32_32768 ); \
+        tmp1 = _mm_sub_epi32( tmp1, stbir__s32_32768 ); \
+        out = _mm_packs_epi32( tmp0, tmp1 ); \
+        out = _mm_sub_epi16( out, stbir__s16_32768 ); \
+      }
+
+  #endif
+
+  #define STBIR_SIMD
+
+  // if we detect AVX, set the simd8 defines
+  #ifdef STBIR_AVX
+    #include <immintrin.h>
+    #define STBIR_SIMD8
+    #define stbir__simdf8 __m256
+    #define stbir__simdi8 __m256i
+    #define stbir__simdf8_load( out, ptr ) (out) = _mm256_loadu_ps( (float const *)(ptr) )
+    #define stbir__simdi8_load( out, ptr ) (out) = _mm256_loadu_si256( (__m256i const *)(ptr) )
+    #define stbir__simdf8_mult( out, a, b ) (out) = _mm256_mul_ps( (a), (b) )
+    #define stbir__simdf8_store( ptr, out ) _mm256_storeu_ps( (float*)(ptr), out )
+    #define stbir__simdi8_store( ptr, reg )  _mm256_storeu_si256( (__m256i*)(ptr), reg )
+    #define stbir__simdf8_frep8( fval ) _mm256_set1_ps( fval )
+
+    #define stbir__simdf8_min( out, reg0, reg1 ) (out) = _mm256_min_ps( reg0, reg1 )
+    #define stbir__simdf8_max( out, reg0, reg1 ) (out) = _mm256_max_ps( reg0, reg1 )
+
+    #define stbir__simdf8_add4halves( out, bot4, top8 ) (out) = _mm_add_ps( bot4, _mm256_extractf128_ps( top8, 1 ) )
+    #define stbir__simdf8_mult_mem( out, reg, ptr ) (out) = _mm256_mul_ps( reg, _mm256_loadu_ps( (float const*)(ptr) ) )
+    #define stbir__simdf8_add_mem( out, reg, ptr ) (out) = _mm256_add_ps( reg, _mm256_loadu_ps( (float const*)(ptr) ) )
+    #define stbir__simdf8_add( out, a, b ) (out) = _mm256_add_ps( a, b )
+    #define stbir__simdf8_load1b( out, ptr ) (out) = _mm256_broadcast_ss( ptr )
+    #define stbir__simdf_load1rep4( out, ptr ) (out) = _mm_broadcast_ss( ptr )  // avx load instruction
+
+    #define stbir__simdi8_convert_i32_to_float(out, ireg) (out) = _mm256_cvtepi32_ps( ireg )
+    #define stbir__simdf8_convert_float_to_i32( i, f ) (i) = _mm256_cvttps_epi32(f)
+  
+    #define stbir__simdf8_bot4s( out, a, b ) (out) = _mm256_permute2f128_ps(a,b, (0<<0)+(2<<4) )
+    #define stbir__simdf8_top4s( out, a, b ) (out) = _mm256_permute2f128_ps(a,b, (1<<0)+(3<<4) )
+    
+    #define stbir__simdf8_gettop4( reg ) _mm256_extractf128_ps(reg,1)
+
+    #ifdef STBIR_AVX2
+
+    #define stbir__simdi8_expand_u8_to_u32(out0,out1,ireg) \
+    { \
+      stbir__simdi8 a, zero  =_mm256_setzero_si256();\
+      a = _mm256_permute4x64_epi64( _mm256_unpacklo_epi8( _mm256_permute4x64_epi64(_mm256_castsi128_si256(ireg),(0<<0)+(2<<2)+(1<<4)+(3<<6)), zero ),(0<<0)+(2<<2)+(1<<4)+(3<<6)); \
+      out0 = _mm256_unpacklo_epi16( a, zero ); \
+      out1 = _mm256_unpackhi_epi16( a, zero ); \
+    }
+
+    #define stbir__simdf8_pack_to_16bytes(out,aa,bb) \
+    { \
+      stbir__simdi8 t; \
+      stbir__simdf8 af,bf; \
+      stbir__simdi8 a,b; \
+      af = _mm256_min_ps( aa, STBIR_max_uint8_as_floatX ); \
+      bf = _mm256_min_ps( bb, STBIR_max_uint8_as_floatX ); \
+      af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
+      bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
+      a = _mm256_cvttps_epi32( af ); \
+      b = _mm256_cvttps_epi32( bf ); \
+      t = _mm256_permute4x64_epi64( _mm256_packs_epi32( a, b ), (0<<0)+(2<<2)+(1<<4)+(3<<6) ); \
+      out = _mm256_castsi256_si128( _mm256_permute4x64_epi64( _mm256_packus_epi16( t, t ), (0<<0)+(2<<2)+(1<<4)+(3<<6) ) ); \
+    }
+
+    #define stbir__simdi8_expand_u16_to_u32(out,ireg) out = _mm256_unpacklo_epi16( _mm256_permute4x64_epi64(_mm256_castsi128_si256(ireg),(0<<0)+(2<<2)+(1<<4)+(3<<6)), _mm256_setzero_si256() ); 
+  
+    #define stbir__simdf8_pack_to_16words(out,aa,bb) \
+      { \
+        stbir__simdf8 af,bf; \
+        stbir__simdi8 a,b; \
+        af = _mm256_min_ps( aa, STBIR_max_uint16_as_floatX ); \
+        bf = _mm256_min_ps( bb, STBIR_max_uint16_as_floatX ); \
+        af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
+        bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
+        a = _mm256_cvttps_epi32( af ); \
+        b = _mm256_cvttps_epi32( bf ); \
+        (out) = _mm256_permute4x64_epi64( _mm256_packus_epi32(a, b), (0<<0)+(2<<2)+(1<<4)+(3<<6) ); \
+      }
+
+    #else
+
+    #define stbir__simdi8_expand_u8_to_u32(out0,out1,ireg) \
+    { \
+      stbir__simdi a,zero = _mm_setzero_si128(); \
+      a = _mm_unpacklo_epi8( ireg, zero ); \
+      out0 = _mm256_setr_m128i( _mm_unpacklo_epi16( a, zero ), _mm_unpackhi_epi16( a, zero ) ); \
+      a = _mm_unpackhi_epi8( ireg, zero ); \
+      out1 = _mm256_setr_m128i( _mm_unpacklo_epi16( a, zero ), _mm_unpackhi_epi16( a, zero ) ); \
+    }
+  
+    #define stbir__simdf8_pack_to_16bytes(out,aa,bb) \
+    { \
+      stbir__simdi t; \
+      stbir__simdf8 af,bf; \
+      stbir__simdi8 a,b; \
+      af = _mm256_min_ps( aa, STBIR_max_uint8_as_floatX ); \
+      bf = _mm256_min_ps( bb, STBIR_max_uint8_as_floatX ); \
+      af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
+      bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
+      a = _mm256_cvttps_epi32( af ); \
+      b = _mm256_cvttps_epi32( bf ); \
+      out = _mm_packs_epi32( _mm256_castsi256_si128(a), _mm256_extractf128_si256( a, 1 ) ); \
+      out = _mm_packus_epi16( out, out ); \
+      t = _mm_packs_epi32( _mm256_castsi256_si128(b), _mm256_extractf128_si256( b, 1 ) ); \
+      t = _mm_packus_epi16( t, t ); \
+      out = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps(out), _mm_castsi128_ps(t), (0<<0)+(1<<2)+(0<<4)+(1<<6) ) ); \
+    }
+  
+    #define stbir__simdi8_expand_u16_to_u32(out,ireg) \
+    { \
+      stbir__simdi a,b,zero = _mm_setzero_si128(); \
+      a = _mm_unpacklo_epi16( ireg, zero ); \
+      b = _mm_unpackhi_epi16( ireg, zero ); \
+      out = _mm256_insertf128_si256( _mm256_castsi128_si256( a ), b, 1 ); \
+    }
+
+    #define stbir__simdf8_pack_to_16words(out,aa,bb) \
+      { \
+        stbir__simdi t0,t1; \
+        stbir__simdf8 af,bf; \
+        stbir__simdi8 a,b; \
+        af = _mm256_min_ps( aa, STBIR_max_uint16_as_floatX ); \
+        bf = _mm256_min_ps( bb, STBIR_max_uint16_as_floatX ); \
+        af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
+        bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
+        a = _mm256_cvttps_epi32( af ); \
+        b = _mm256_cvttps_epi32( bf ); \
+        t0 = _mm_packus_epi32( _mm256_castsi256_si128(a), _mm256_extractf128_si256( a, 1 ) ); \
+        t1 = _mm_packus_epi32( _mm256_castsi256_si128(b), _mm256_extractf128_si256( b, 1 ) ); \
+        out = _mm256_setr_m128i( t0, t1 ); \
+      }
+
+    #endif
+
+    static __m256i stbir_00001111 = { STBIR__CONST_4d_32i( 0, 0, 0, 0 ), STBIR__CONST_4d_32i( 1, 1, 1, 1 ) };
+    #define stbir__simdf8_0123to00001111( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_00001111 )
+
+    static __m256i stbir_22223333 = { STBIR__CONST_4d_32i( 2, 2, 2, 2 ), STBIR__CONST_4d_32i( 3, 3, 3, 3 ) };
+    #define stbir__simdf8_0123to22223333( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_22223333 )
+
+    #define stbir__simdf8_0123to2222( out, in ) (out) = stbir__simdf_swiz(_mm256_castps256_ps128(in), 2,2,2,2 )
+
+    #define stbir__simdf8_load2( out, ptr ) (out) = _mm256_castsi256_ps(_mm256_castsi128_si256( _mm_loadl_epi64( (__m128i*)(ptr)) )) // top values can be random (not denormal or nan for perf)
+    #define stbir__simdf8_load4b( out, ptr ) (out) = _mm256_broadcast_ps( (__m128 const *)(ptr) )
+
+    static __m256i stbir_00112233 = { STBIR__CONST_4d_32i( 0, 0, 1, 1 ), STBIR__CONST_4d_32i( 2, 2, 3, 3 ) };
+    #define stbir__simdf8_0123to00112233( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_00112233 )
+    #define stbir__simdf8_add4( out, a8, b ) (out) = _mm256_add_ps( a8,  _mm256_castps128_ps256( b ) )
+
+    static __m256i stbir_load6 = { STBIR__CONST_4_32i( 0x80000000 ), STBIR__CONST_4d_32i(  0x80000000,  0x80000000, 0, 0 ) };
+    #define stbir__simdf8_load6z( out, ptr ) (out) = _mm256_maskload_ps( ptr, stbir_load6 )
+
+    #define stbir__simdf8_0123to00000000( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (0<<0)+(0<<2)+(0<<4)+(0<<6) )
+    #define stbir__simdf8_0123to11111111( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (1<<0)+(1<<2)+(1<<4)+(1<<6) )
+    #define stbir__simdf8_0123to22222222( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (2<<0)+(2<<2)+(2<<4)+(2<<6) )
+    #define stbir__simdf8_0123to33333333( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (3<<0)+(3<<2)+(3<<4)+(3<<6) )
+    #define stbir__simdf8_0123to21032103( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (2<<0)+(1<<2)+(0<<4)+(3<<6) )
+    #define stbir__simdf8_0123to32103210( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (3<<0)+(2<<2)+(1<<4)+(0<<6) )
+    #define stbir__simdf8_0123to12301230( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (1<<0)+(2<<2)+(3<<4)+(0<<6) )
+    #define stbir__simdf8_0123to10321032( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (1<<0)+(0<<2)+(3<<4)+(2<<6) )
+    #define stbir__simdf8_0123to30123012( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (3<<0)+(0<<2)+(1<<4)+(2<<6) )
+
+    #define stbir__simdf8_0123to11331133( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (1<<0)+(1<<2)+(3<<4)+(3<<6) )
+    #define stbir__simdf8_0123to00220022( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (0<<0)+(0<<2)+(2<<4)+(2<<6) )
+
+    #define stbir__simdf8_aaa1( out, alp, ones ) (out) = _mm256_blend_ps( alp, ones, (1<<0)+(1<<1)+(1<<2)+(0<<3)+(1<<4)+(1<<5)+(1<<6)+(0<<7)); (out)=_mm256_shuffle_ps( out,out, (3<<0) + (3<<2) + (3<<4) + (0<<6) )
+    #define stbir__simdf8_1aaa( out, alp, ones ) (out) = _mm256_blend_ps( alp, ones, (0<<0)+(1<<1)+(1<<2)+(1<<3)+(0<<4)+(1<<5)+(1<<6)+(1<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (0<<4) + (0<<6) )
+    #define stbir__simdf8_a1a1( out, alp, ones) (out) = _mm256_blend_ps( alp, ones, (1<<0)+(0<<1)+(1<<2)+(0<<3)+(1<<4)+(0<<5)+(1<<6)+(0<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (3<<4) + (2<<6) )
+    #define stbir__simdf8_1a1a( out, alp, ones) (out) = _mm256_blend_ps( alp, ones, (0<<0)+(1<<1)+(0<<2)+(1<<3)+(0<<4)+(1<<5)+(0<<6)+(1<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (3<<4) + (2<<6) )
+
+    #define stbir__simdf8_zero( reg ) (reg) = _mm256_setzero_ps()
+
+    #ifdef STBIR_USE_FMA           // not on by default to maintain bit identical simd to non-simd
+    #define stbir__simdf8_madd( out, add, mul1, mul2 ) (out) = _mm256_fmadd_ps( mul1, mul2, add )
+    #define stbir__simdf8_madd_mem( out, add, mul, ptr ) (out) = _mm256_fmadd_ps( mul, _mm256_loadu_ps( (float const*)(ptr) ), add )
+    #define stbir__simdf8_madd_mem4( out, add, mul, ptr ) (out) = _mm256_fmadd_ps( _mm256_castps128_ps256( mul ), _mm256_castps128_ps256( _mm_loadu_ps( (float const*)(ptr) ) ), add )
+    #else
+    #define stbir__simdf8_madd( out, add, mul1, mul2 ) (out) = _mm256_add_ps( add, _mm256_mul_ps( mul1, mul2 ) )
+    #define stbir__simdf8_madd_mem( out, add, mul, ptr ) (out) = _mm256_add_ps( add, _mm256_mul_ps( mul, _mm256_loadu_ps( (float const*)(ptr) ) ) )
+    #define stbir__simdf8_madd_mem4( out, add, mul, ptr ) (out) = _mm256_add_ps( add, _mm256_castps128_ps256( _mm_mul_ps( mul, _mm_loadu_ps( (float const*)(ptr) ) ) ) )
+    #endif
+    #define stbir__if_simdf8_cast_to_simdf4( val ) _mm256_castps256_ps128( val )
+
+  #endif
+
+  #ifdef STBIR_FLOORF
+  #undef STBIR_FLOORF
+  #endif
+  #define STBIR_FLOORF stbir_simd_floorf
+  static stbir__inline float stbir_simd_floorf(float x)  // martins floorf
+  {
+    #if defined(STBIR_AVX) || defined(__SSE4_1__) || defined(STBIR_SSE41)
+    __m128 t = _mm_set_ss(x);
+    return _mm_cvtss_f32( _mm_floor_ss(t, t) );
+    #else
+    __m128 f = _mm_set_ss(x);
+    __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f));
+    __m128 r = _mm_add_ss(t, _mm_and_ps(_mm_cmplt_ss(f, t), _mm_set_ss(-1.0f)));
+    return _mm_cvtss_f32(r);
+    #endif
+  }
+
+  #ifdef STBIR_CEILF
+  #undef STBIR_CEILF
+  #endif
+  #define STBIR_CEILF stbir_simd_ceilf
+  static stbir__inline float stbir_simd_ceilf(float x)  // martins ceilf
+  {
+    #if defined(STBIR_AVX) || defined(__SSE4_1__) || defined(STBIR_SSE41)
+    __m128 t = _mm_set_ss(x);
+    return _mm_cvtss_f32( _mm_ceil_ss(t, t) );
+    #else
+    __m128 f = _mm_set_ss(x);
+    __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f));
+    __m128 r = _mm_add_ss(t, _mm_and_ps(_mm_cmplt_ss(t, f), _mm_set_ss(1.0f)));
+    return _mm_cvtss_f32(r);
+    #endif
+  }
+
+#elif defined(STBIR_NEON)
+  
+  #include <arm_neon.h>
+
+  #define stbir__simdf float32x4_t
+  #define stbir__simdi uint32x4_t
+
+  #define stbir_simdi_castf( reg ) vreinterpretq_u32_f32(reg)
+  #define stbir_simdf_casti( reg ) vreinterpretq_f32_u32(reg)
+
+  #define stbir__simdf_load( reg, ptr ) (reg) = vld1q_f32( (float const*)(ptr) )
+  #define stbir__simdi_load( reg, ptr ) (reg) = vld1q_u32( (uint32_t const*)(ptr) )
+  #define stbir__simdf_load1( out, ptr ) (out) = vld1q_dup_f32( (float const*)(ptr) ) // top values can be random (not denormal or nan for perf)
+  #define stbir__simdi_load1( out, ptr ) (out) = vld1q_dup_u32( (uint32_t const*)(ptr) )
+  #define stbir__simdf_load1z( out, ptr ) (out) = vld1q_lane_f32( (float const*)(ptr), vdupq_n_f32(0), 0 ) // top values must be zero
+  #define stbir__simdf_frep4( fvar ) vdupq_n_f32( fvar )
+  #define stbir__simdf_load1frep4( out, fvar ) (out) = vdupq_n_f32( fvar )
+  #define stbir__simdf_load2( out, ptr ) (out) = vcombine_f32( vld1_f32( (float const*)(ptr) ), vcreate_f32(0) ) // top values can be random (not denormal or nan for perf)
+  #define stbir__simdf_load2z( out, ptr ) (out) = vcombine_f32( vld1_f32( (float const*)(ptr) ), vcreate_f32(0) )  // top values must be zero
+  #define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = vcombine_f32( vget_low_f32(reg), vld1_f32( (float const*)(ptr) ) )
+
+  #define stbir__simdf_zeroP() vdupq_n_f32(0)
+  #define stbir__simdf_zero( reg ) (reg) = vdupq_n_f32(0)
+
+  #define stbir__simdf_store( ptr, reg )  vst1q_f32( (float*)(ptr), reg )
+  #define stbir__simdf_store1( ptr, reg ) vst1q_lane_f32( (float*)(ptr), reg, 0)
+  #define stbir__simdf_store2( ptr, reg ) vst1_f32( (float*)(ptr), vget_low_f32(reg) )
+  #define stbir__simdf_store2h( ptr, reg ) vst1_f32( (float*)(ptr), vget_high_f32(reg) )
+
+  #define stbir__simdi_store( ptr, reg )  vst1q_u32( (uint32_t*)(ptr), reg )
+  #define stbir__simdi_store1( ptr, reg ) vst1q_lane_u32( (uint32_t*)(ptr), reg, 0 )
+  #define stbir__simdi_store2( ptr, reg ) vst1_u32( (uint32_t*)(ptr), vget_low_u32(reg) )
+
+  #define stbir__prefetch( ptr )
+
+  #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \
+  { \
+    uint16x8_t l = vmovl_u8( vget_low_u8 ( vreinterpretq_u8_u32(ireg) ) ); \
+    uint16x8_t h = vmovl_u8( vget_high_u8( vreinterpretq_u8_u32(ireg) ) ); \
+    out0 = vmovl_u16( vget_low_u16 ( l ) ); \
+    out1 = vmovl_u16( vget_high_u16( l ) ); \
+    out2 = vmovl_u16( vget_low_u16 ( h ) ); \
+    out3 = vmovl_u16( vget_high_u16( h ) ); \
+  }
+
+  #define stbir__simdi_expand_u8_to_1u32(out,ireg) \
+  { \
+    uint16x8_t tmp = vmovl_u8( vget_low_u8( vreinterpretq_u8_u32(ireg) ) ); \
+    out = vmovl_u16( vget_low_u16( tmp ) ); \
+  }
+
+  #define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \
+  { \
+    uint16x8_t tmp = vreinterpretq_u16_u32(ireg); \
+    out0 = vmovl_u16( vget_low_u16 ( tmp ) ); \
+    out1 = vmovl_u16( vget_high_u16( tmp ) ); \
+  }
+
+  #define stbir__simdf_convert_float_to_i32( i, f ) (i) = vreinterpretq_u32_s32( vcvtq_s32_f32(f) )
+  #define stbir__simdf_convert_float_to_int( f ) vgetq_lane_s32(vcvtq_s32_f32(f), 0)
+  #define stbir__simdi_to_int( i ) (int)vgetq_lane_u32(i, 0) 
+  #define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)vgetq_lane_s32(vcvtq_s32_f32(vmaxq_f32(vminq_f32(f,STBIR__CONSTF(STBIR_max_uint8_as_float)),vdupq_n_f32(0))), 0))
+  #define stbir__simdf_convert_float_to_short( f ) ((unsigned short)vgetq_lane_s32(vcvtq_s32_f32(vmaxq_f32(vminq_f32(f,STBIR__CONSTF(STBIR_max_uint16_as_float)),vdupq_n_f32(0))), 0))
+  #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = vcvtq_f32_s32( vreinterpretq_s32_u32(ireg) )
+  #define stbir__simdf_add( out, reg0, reg1 ) (out) = vaddq_f32( reg0, reg1 )
+  #define stbir__simdf_mult( out, reg0, reg1 ) (out) = vmulq_f32( reg0, reg1 )
+  #define stbir__simdf_mult_mem( out, reg, ptr ) (out) = vmulq_f32( reg, vld1q_f32( (float const*)(ptr) ) )
+  #define stbir__simdf_mult1_mem( out, reg, ptr ) (out) = vmulq_f32( reg, vld1q_dup_f32( (float const*)(ptr) ) )
+  #define stbir__simdf_add_mem( out, reg, ptr ) (out) = vaddq_f32( reg, vld1q_f32( (float const*)(ptr) ) )
+  #define stbir__simdf_add1_mem( out, reg, ptr ) (out) = vaddq_f32( reg, vld1q_dup_f32( (float const*)(ptr) ) )
+
+  #ifdef STBIR_USE_FMA           // not on by default to maintain bit identical simd to non-simd (and also x64 no madd to arm madd)
+  #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = vfmaq_f32( add, mul1, mul2 )
+  #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = vfmaq_f32( add, mul1, mul2 )
+  #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = vfmaq_f32( add, mul, vld1q_f32( (float const*)(ptr) ) )
+  #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = vfmaq_f32( add, mul, vld1q_dup_f32( (float const*)(ptr) ) )
+  #else
+  #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = vaddq_f32( add, vmulq_f32( mul1, mul2 ) )
+  #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = vaddq_f32( add, vmulq_f32( mul1, mul2 ) )
+  #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = vaddq_f32( add, vmulq_f32( mul, vld1q_f32( (float const*)(ptr) ) ) )
+  #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = vaddq_f32( add, vmulq_f32( mul, vld1q_dup_f32( (float const*)(ptr) ) ) )
+  #endif
+
+  #define stbir__simdf_add1( out, reg0, reg1 ) (out) = vaddq_f32( reg0, reg1 )
+  #define stbir__simdf_mult1( out, reg0, reg1 ) (out) = vmulq_f32( reg0, reg1 )
+
+  #define stbir__simdf_and( out, reg0, reg1 ) (out) = vreinterpretq_f32_u32( vandq_u32( vreinterpretq_u32_f32(reg0), vreinterpretq_u32_f32(reg1) ) )
+  #define stbir__simdf_or( out, reg0, reg1 ) (out) = vreinterpretq_f32_u32( vorrq_u32( vreinterpretq_u32_f32(reg0), vreinterpretq_u32_f32(reg1) ) )
+
+  #define stbir__simdf_min( out, reg0, reg1 ) (out) = vminq_f32( reg0, reg1 )
+  #define stbir__simdf_max( out, reg0, reg1 ) (out) = vmaxq_f32( reg0, reg1 )
+  #define stbir__simdf_min1( out, reg0, reg1 ) (out) = vminq_f32( reg0, reg1 )
+  #define stbir__simdf_max1( out, reg0, reg1 ) (out) = vmaxq_f32( reg0, reg1 )
+
+  #define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out) = vextq_f32( reg0, reg1, 3 )
+  #define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out) = vextq_f32( reg0, reg1, 2 )
+
+  #define stbir__simdf_a1a1( out, alp, ones ) (out) = vzipq_f32(vuzpq_f32(alp, alp).val[1], ones).val[0]
+  #define stbir__simdf_1a1a( out, alp, ones ) (out) = vzipq_f32(ones, vuzpq_f32(alp, alp).val[0]).val[0]
+
+  #if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ )
+
+    #define stbir__simdf_aaa1( out, alp, ones ) (out) = vcopyq_laneq_f32(vdupq_n_f32(vgetq_lane_f32(alp, 3)), 3, ones, 3)
+    #define stbir__simdf_1aaa( out, alp, ones ) (out) = vcopyq_laneq_f32(vdupq_n_f32(vgetq_lane_f32(alp, 0)), 0, ones, 0)
+
+    #if defined( _MSC_VER ) && !defined(__clang__)
+      #define stbir_make16(a,b,c,d) vcombine_u8( \
+        vcreate_u8( (4*a+0) | ((4*a+1)<<8) | ((4*a+2)<<16) | ((4*a+3)<<24) | \
+          ((stbir_uint64)(4*b+0)<<32) | ((stbir_uint64)(4*b+1)<<40) | ((stbir_uint64)(4*b+2)<<48) | ((stbir_uint64)(4*b+3)<<56)), \
+        vcreate_u8( (4*c+0) | ((4*c+1)<<8) | ((4*c+2)<<16) | ((4*c+3)<<24) | \
+          ((stbir_uint64)(4*d+0)<<32) | ((stbir_uint64)(4*d+1)<<40) | ((stbir_uint64)(4*d+2)<<48) | ((stbir_uint64)(4*d+3)<<56) ) )
+    #else
+      #define stbir_make16(a,b,c,d) (uint8x16_t){4*a+0,4*a+1,4*a+2,4*a+3,4*b+0,4*b+1,4*b+2,4*b+3,4*c+0,4*c+1,4*c+2,4*c+3,4*d+0,4*d+1,4*d+2,4*d+3}
+    #endif
+
+    #define stbir__simdf_swiz( reg, one, two, three, four ) vreinterpretq_f32_u8( vqtbl1q_u8( vreinterpretq_u8_f32(reg), stbir_make16(one, two, three, four) ) )
+  
+    #define stbir__simdi_16madd( out, reg0, reg1 ) \
+    { \
+      int16x8_t r0 = vreinterpretq_s16_u32(reg0); \
+      int16x8_t r1 = vreinterpretq_s16_u32(reg1); \
+      int32x4_t tmp0 = vmull_s16( vget_low_s16(r0), vget_low_s16(r1) ); \
+      int32x4_t tmp1 = vmull_s16( vget_high_s16(r0), vget_high_s16(r1) ); \
+      (out) = vreinterpretq_u32_s32( vpaddq_s32(tmp0, tmp1) ); \
+    }
+
+  #else
+
+    #define stbir__simdf_aaa1( out, alp, ones ) (out) = vsetq_lane_f32(1.0f, vdupq_n_f32(vgetq_lane_f32(alp, 3)), 3)
+    #define stbir__simdf_1aaa( out, alp, ones ) (out) = vsetq_lane_f32(1.0f, vdupq_n_f32(vgetq_lane_f32(alp, 0)), 0)
+
+    #if defined( _MSC_VER ) && !defined(__clang__)
+      static stbir__inline uint8x8x2_t stbir_make8x2(float32x4_t reg)
+      {
+        uint8x8x2_t r = { { vget_low_u8(vreinterpretq_u8_f32(reg)), vget_high_u8(vreinterpretq_u8_f32(reg)) } };
+        return r;
+      }
+      #define stbir_make8(a,b) vcreate_u8( \
+        (4*a+0) | ((4*a+1)<<8) | ((4*a+2)<<16) | ((4*a+3)<<24) | \
+        ((stbir_uint64)(4*b+0)<<32) | ((stbir_uint64)(4*b+1)<<40) | ((stbir_uint64)(4*b+2)<<48) | ((stbir_uint64)(4*b+3)<<56) )
+    #else
+      #define stbir_make8x2(reg) (uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_f32(reg)), vget_high_u8(vreinterpretq_u8_f32(reg)) } }
+      #define stbir_make8(a,b) (uint8x8_t){4*a+0,4*a+1,4*a+2,4*a+3,4*b+0,4*b+1,4*b+2,4*b+3}
+    #endif
+
+    #define stbir__simdf_swiz( reg, one, two, three, four ) vreinterpretq_f32_u8( vcombine_u8( \
+        vtbl2_u8( stbir_make8x2( reg ), stbir_make8( one, two ) ), \
+        vtbl2_u8( stbir_make8x2( reg ), stbir_make8( three, four ) ) ) )
+
+    #define stbir__simdi_16madd( out, reg0, reg1 ) \
+    { \
+      int16x8_t r0 = vreinterpretq_s16_u32(reg0); \
+      int16x8_t r1 = vreinterpretq_s16_u32(reg1); \
+      int32x4_t tmp0 = vmull_s16( vget_low_s16(r0), vget_low_s16(r1) ); \
+      int32x4_t tmp1 = vmull_s16( vget_high_s16(r0), vget_high_s16(r1) ); \
+      int32x2_t out0 = vpadd_s32( vget_low_s32(tmp0), vget_high_s32(tmp0) ); \
+      int32x2_t out1 = vpadd_s32( vget_low_s32(tmp1), vget_high_s32(tmp1) ); \
+      (out) = vreinterpretq_u32_s32( vcombine_s32(out0, out1) ); \
+    }
+
+  #endif
+
+  #define stbir__simdi_and( out, reg0, reg1 ) (out) = vandq_u32( reg0, reg1 )
+  #define stbir__simdi_or( out, reg0, reg1 ) (out) = vorrq_u32( reg0, reg1 )
+
+  #define stbir__simdf_pack_to_8bytes(out,aa,bb) \
+  { \
+    float32x4_t af = vmaxq_f32( vminq_f32(aa,STBIR__CONSTF(STBIR_max_uint8_as_float) ), vdupq_n_f32(0) ); \
+    float32x4_t bf = vmaxq_f32( vminq_f32(bb,STBIR__CONSTF(STBIR_max_uint8_as_float) ), vdupq_n_f32(0) ); \
+    int16x4_t ai = vqmovn_s32( vcvtq_s32_f32( af ) ); \
+    int16x4_t bi = vqmovn_s32( vcvtq_s32_f32( bf ) ); \
+    uint8x8_t out8 = vqmovun_s16( vcombine_s16(ai, bi) ); \
+    out = vreinterpretq_u32_u8( vcombine_u8(out8, out8) ); \
+  }
+
+  #define stbir__simdf_pack_to_8words(out,aa,bb) \
+  { \
+    float32x4_t af = vmaxq_f32( vminq_f32(aa,STBIR__CONSTF(STBIR_max_uint16_as_float) ), vdupq_n_f32(0) ); \
+    float32x4_t bf = vmaxq_f32( vminq_f32(bb,STBIR__CONSTF(STBIR_max_uint16_as_float) ), vdupq_n_f32(0) ); \
+    int32x4_t ai = vcvtq_s32_f32( af ); \
+    int32x4_t bi = vcvtq_s32_f32( bf ); \
+    out = vreinterpretq_u32_u16( vcombine_u16(vqmovun_s32(ai), vqmovun_s32(bi)) ); \
+  }
+
+  #define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \
+  { \
+    int16x4x2_t tmp0 = vzip_s16( vqmovn_s32(vreinterpretq_s32_u32(r0)), vqmovn_s32(vreinterpretq_s32_u32(r2)) ); \
+    int16x4x2_t tmp1 = vzip_s16( vqmovn_s32(vreinterpretq_s32_u32(r1)), vqmovn_s32(vreinterpretq_s32_u32(r3)) ); \
+    uint8x8x2_t out = \
+    { { \
+      vqmovun_s16( vcombine_s16(tmp0.val[0], tmp0.val[1]) ), \
+      vqmovun_s16( vcombine_s16(tmp1.val[0], tmp1.val[1]) ), \
+    } }; \
+    vst2_u8(ptr, out); \
+  }
+
+  #define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \
+  { \
+    float32x4x4_t tmp = vld4q_f32(ptr); \
+    o0 = tmp.val[0]; \
+    o1 = tmp.val[1]; \
+    o2 = tmp.val[2]; \
+    o3 = tmp.val[3]; \
+  }
+
+  #define stbir__simdi_32shr( out, reg, imm ) out = vshrq_n_u32( reg, imm )
+
+  #if defined( _MSC_VER ) && !defined(__clang__)
+    #define STBIR__SIMDF_CONST(var, x) __declspec(align(8)) float var[] = { x, x, x, x }
+    #define STBIR__SIMDI_CONST(var, x) __declspec(align(8)) uint32_t var[] = { x, x, x, x }
+    #define STBIR__CONSTF(var) (*(const float32x4_t*)var)
+    #define STBIR__CONSTI(var) (*(const uint32x4_t*)var)
+  #else
+    #define STBIR__SIMDF_CONST(var, x) stbir__simdf var = { x, x, x, x }
+    #define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { x, x, x, x }
+    #define STBIR__CONSTF(var) (var)
+    #define STBIR__CONSTI(var) (var)
+  #endif
+
+  #ifdef STBIR_FLOORF
+  #undef STBIR_FLOORF
+  #endif
+  #define STBIR_FLOORF stbir_simd_floorf
+  static stbir__inline float stbir_simd_floorf(float x)
+  {
+    #if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ )
+    return vget_lane_f32( vrndm_f32( vdup_n_f32(x) ), 0);
+    #else
+    float32x2_t f = vdup_n_f32(x);
+    float32x2_t t = vcvt_f32_s32(vcvt_s32_f32(f));
+    uint32x2_t a = vclt_f32(f, t);
+    uint32x2_t b = vreinterpret_u32_f32(vdup_n_f32(-1.0f));
+    float32x2_t r = vadd_f32(t, vreinterpret_f32_u32(vand_u32(a, b)));
+    return vget_lane_f32(r, 0);
+    #endif
+  }
+
+  #ifdef STBIR_CEILF
+  #undef STBIR_CEILF
+  #endif
+  #define STBIR_CEILF stbir_simd_ceilf
+  static stbir__inline float stbir_simd_ceilf(float x)
+  {
+    #if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ )
+    return vget_lane_f32( vrndp_f32( vdup_n_f32(x) ), 0);
+    #else
+    float32x2_t f = vdup_n_f32(x);
+    float32x2_t t = vcvt_f32_s32(vcvt_s32_f32(f));
+    uint32x2_t a = vclt_f32(t, f);
+    uint32x2_t b = vreinterpret_u32_f32(vdup_n_f32(1.0f));
+    float32x2_t r = vadd_f32(t, vreinterpret_f32_u32(vand_u32(a, b)));
+    return vget_lane_f32(r, 0);
+    #endif
+  }
+
+  #define STBIR_SIMD
+
+#elif defined(STBIR_WASM)
+
+  #include <wasm_simd128.h>
+
+  #define stbir__simdf v128_t
+  #define stbir__simdi v128_t
+
+  #define stbir_simdi_castf( reg ) (reg)
+  #define stbir_simdf_casti( reg ) (reg)
+
+  #define stbir__simdf_load( reg, ptr )             (reg) = wasm_v128_load( (void const*)(ptr) )
+  #define stbir__simdi_load( reg, ptr )             (reg) = wasm_v128_load( (void const*)(ptr) )
+  #define stbir__simdf_load1( out, ptr )            (out) = wasm_v128_load32_splat( (void const*)(ptr) ) // top values can be random (not denormal or nan for perf)
+  #define stbir__simdi_load1( out, ptr )            (out) = wasm_v128_load32_splat( (void const*)(ptr) )
+  #define stbir__simdf_load1z( out, ptr )           (out) = wasm_v128_load32_zero( (void const*)(ptr) ) // top values must be zero
+  #define stbir__simdf_frep4( fvar )                wasm_f32x4_splat( fvar )
+  #define stbir__simdf_load1frep4( out, fvar )      (out) = wasm_f32x4_splat( fvar )
+  #define stbir__simdf_load2( out, ptr )            (out) = wasm_v128_load64_splat( (void const*)(ptr) ) // top values can be random (not denormal or nan for perf)
+  #define stbir__simdf_load2z( out, ptr )           (out) = wasm_v128_load64_zero( (void const*)(ptr) ) // top values must be zero
+  #define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = wasm_v128_load64_lane( (void const*)(ptr), reg, 1 )
+
+  #define stbir__simdf_zeroP() wasm_f32x4_const_splat(0)
+  #define stbir__simdf_zero( reg ) (reg) = wasm_f32x4_const_splat(0)
+
+  #define stbir__simdf_store( ptr, reg )   wasm_v128_store( (void*)(ptr), reg )
+  #define stbir__simdf_store1( ptr, reg )  wasm_v128_store32_lane( (void*)(ptr), reg, 0 )
+  #define stbir__simdf_store2( ptr, reg )  wasm_v128_store64_lane( (void*)(ptr), reg, 0 )
+  #define stbir__simdf_store2h( ptr, reg ) wasm_v128_store64_lane( (void*)(ptr), reg, 1 )
+
+  #define stbir__simdi_store( ptr, reg )  wasm_v128_store( (void*)(ptr), reg )
+  #define stbir__simdi_store1( ptr, reg ) wasm_v128_store32_lane( (void*)(ptr), reg, 0 )
+  #define stbir__simdi_store2( ptr, reg ) wasm_v128_store64_lane( (void*)(ptr), reg, 0 )
+
+  #define stbir__prefetch( ptr )
+
+  #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \
+  { \
+    v128_t l = wasm_u16x8_extend_low_u8x16 ( ireg ); \
+    v128_t h = wasm_u16x8_extend_high_u8x16( ireg ); \
+    out0 = wasm_u32x4_extend_low_u16x8 ( l ); \
+    out1 = wasm_u32x4_extend_high_u16x8( l ); \
+    out2 = wasm_u32x4_extend_low_u16x8 ( h ); \
+    out3 = wasm_u32x4_extend_high_u16x8( h ); \
+  }
+
+  #define stbir__simdi_expand_u8_to_1u32(out,ireg) \
+  { \
+    v128_t tmp = wasm_u16x8_extend_low_u8x16(ireg); \
+    out = wasm_u32x4_extend_low_u16x8(tmp); \
+  }
+
+  #define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \
+  { \
+    out0 = wasm_u32x4_extend_low_u16x8 ( ireg ); \
+    out1 = wasm_u32x4_extend_high_u16x8( ireg ); \
+  }
+
+  #define stbir__simdf_convert_float_to_i32( i, f )    (i) = wasm_i32x4_trunc_sat_f32x4(f)
+  #define stbir__simdf_convert_float_to_int( f )       wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(f), 0)
+  #define stbir__simdi_to_int( i )                     wasm_i32x4_extract_lane(i, 0) 
+  #define stbir__simdf_convert_float_to_uint8( f )     ((unsigned char)wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_max(wasm_f32x4_min(f,STBIR_max_uint8_as_float),wasm_f32x4_const_splat(0))), 0))
+  #define stbir__simdf_convert_float_to_short( f )     ((unsigned short)wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_max(wasm_f32x4_min(f,STBIR_max_uint16_as_float),wasm_f32x4_const_splat(0))), 0))
+  #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = wasm_f32x4_convert_i32x4(ireg)
+  #define stbir__simdf_add( out, reg0, reg1 )          (out) = wasm_f32x4_add( reg0, reg1 )
+  #define stbir__simdf_mult( out, reg0, reg1 )         (out) = wasm_f32x4_mul( reg0, reg1 )
+  #define stbir__simdf_mult_mem( out, reg, ptr )       (out) = wasm_f32x4_mul( reg, wasm_v128_load( (void const*)(ptr) ) )
+  #define stbir__simdf_mult1_mem( out, reg, ptr )      (out) = wasm_f32x4_mul( reg, wasm_v128_load32_splat( (void const*)(ptr) ) )
+  #define stbir__simdf_add_mem( out, reg, ptr )        (out) = wasm_f32x4_add( reg, wasm_v128_load( (void const*)(ptr) ) )
+  #define stbir__simdf_add1_mem( out, reg, ptr )       (out) = wasm_f32x4_add( reg, wasm_v128_load32_splat( (void const*)(ptr) ) )
+
+  #define stbir__simdf_madd( out, add, mul1, mul2 )    (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul1, mul2 ) )
+  #define stbir__simdf_madd1( out, add, mul1, mul2 )   (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul1, mul2 ) )
+  #define stbir__simdf_madd_mem( out, add, mul, ptr )  (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul, wasm_v128_load( (void const*)(ptr) ) ) )
+  #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul, wasm_v128_load32_splat( (void const*)(ptr) ) ) )
+
+  #define stbir__simdf_add1( out, reg0, reg1 )  (out) = wasm_f32x4_add( reg0, reg1 )
+  #define stbir__simdf_mult1( out, reg0, reg1 ) (out) = wasm_f32x4_mul( reg0, reg1 )
+
+  #define stbir__simdf_and( out, reg0, reg1 ) (out) = wasm_v128_and( reg0, reg1 )
+  #define stbir__simdf_or( out, reg0, reg1 )  (out) = wasm_v128_or( reg0, reg1 )
+
+  #define stbir__simdf_min( out, reg0, reg1 ) (out) = wasm_f32x4_min( reg0, reg1 )
+  #define stbir__simdf_max( out, reg0, reg1 ) (out) = wasm_f32x4_max( reg0, reg1 )
+  #define stbir__simdf_min1( out, reg0, reg1 ) (out) = wasm_f32x4_min( reg0, reg1 )
+  #define stbir__simdf_max1( out, reg0, reg1 ) (out) = wasm_f32x4_max( reg0, reg1 )
+
+  #define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out) = wasm_i32x4_shuffle( reg0, reg1, 3, 4, 5, -1 )
+  #define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out) = wasm_i32x4_shuffle( reg0, reg1, 2, 3, 4, -1 )
+
+  #define stbir__simdf_aaa1(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 3, 3, 3, 4)
+  #define stbir__simdf_1aaa(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 4, 0, 0, 0)
+  #define stbir__simdf_a1a1(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 1, 4, 3, 4)
+  #define stbir__simdf_1a1a(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 4, 0, 4, 2)
+
+  #define stbir__simdf_swiz( reg, one, two, three, four ) wasm_i32x4_shuffle(reg, reg, one, two, three, four)
+
+  #define stbir__simdi_and( out, reg0, reg1 )    (out) = wasm_v128_and( reg0, reg1 )
+  #define stbir__simdi_or( out, reg0, reg1 )     (out) = wasm_v128_or( reg0, reg1 )
+  #define stbir__simdi_16madd( out, reg0, reg1 ) (out) = wasm_i32x4_dot_i16x8( reg0, reg1 )
+
+  #define stbir__simdf_pack_to_8bytes(out,aa,bb) \
+  { \
+    v128_t af = wasm_f32x4_max( wasm_f32x4_min(aa, STBIR_max_uint8_as_float), wasm_f32x4_const_splat(0) ); \
+    v128_t bf = wasm_f32x4_max( wasm_f32x4_min(bb, STBIR_max_uint8_as_float), wasm_f32x4_const_splat(0) ); \
+    v128_t ai = wasm_i32x4_trunc_sat_f32x4( af ); \
+    v128_t bi = wasm_i32x4_trunc_sat_f32x4( bf ); \
+    v128_t out16 = wasm_i16x8_narrow_i32x4( ai, bi ); \
+    out = wasm_u8x16_narrow_i16x8( out16, out16 ); \
+  }
+
+  #define stbir__simdf_pack_to_8words(out,aa,bb) \
+  { \
+    v128_t af = wasm_f32x4_max( wasm_f32x4_min(aa, STBIR_max_uint16_as_float), wasm_f32x4_const_splat(0)); \
+    v128_t bf = wasm_f32x4_max( wasm_f32x4_min(bb, STBIR_max_uint16_as_float), wasm_f32x4_const_splat(0)); \
+    v128_t ai = wasm_i32x4_trunc_sat_f32x4( af ); \
+    v128_t bi = wasm_i32x4_trunc_sat_f32x4( bf ); \
+    out = wasm_u16x8_narrow_i32x4( ai, bi ); \
+  }
+
+  #define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \
+  { \
+    v128_t tmp0 = wasm_i16x8_narrow_i32x4(r0, r1); \
+    v128_t tmp1 = wasm_i16x8_narrow_i32x4(r2, r3); \
+    v128_t tmp = wasm_u8x16_narrow_i16x8(tmp0, tmp1); \
+    tmp = wasm_i8x16_shuffle(tmp, tmp, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); \
+    wasm_v128_store( (void*)(ptr), tmp); \
+  }
+
+  #define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \
+  { \
+    v128_t t0 = wasm_v128_load( ptr    ); \
+    v128_t t1 = wasm_v128_load( ptr+4  ); \
+    v128_t t2 = wasm_v128_load( ptr+8  ); \
+    v128_t t3 = wasm_v128_load( ptr+12 ); \
+    v128_t s0 = wasm_i32x4_shuffle(t0, t1, 0, 4, 2, 6); \
+    v128_t s1 = wasm_i32x4_shuffle(t0, t1, 1, 5, 3, 7); \
+    v128_t s2 = wasm_i32x4_shuffle(t2, t3, 0, 4, 2, 6); \
+    v128_t s3 = wasm_i32x4_shuffle(t2, t3, 1, 5, 3, 7); \
+    o0 = wasm_i32x4_shuffle(s0, s2, 0, 1, 4, 5); \
+    o1 = wasm_i32x4_shuffle(s1, s3, 0, 1, 4, 5); \
+    o2 = wasm_i32x4_shuffle(s0, s2, 2, 3, 6, 7); \
+    o3 = wasm_i32x4_shuffle(s1, s3, 2, 3, 6, 7); \
+  }
+
+  #define stbir__simdi_32shr( out, reg, imm ) out = wasm_u32x4_shr( reg, imm )
+
+  typedef float stbir__f32x4 __attribute__((__vector_size__(16), __aligned__(16)));
+  #define STBIR__SIMDF_CONST(var, x) stbir__simdf var = (v128_t)(stbir__f32x4){ x, x, x, x }
+  #define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { x, x, x, x }
+  #define STBIR__CONSTF(var) (var)
+  #define STBIR__CONSTI(var) (var)
+
+  #ifdef STBIR_FLOORF
+  #undef STBIR_FLOORF
+  #endif
+  #define STBIR_FLOORF stbir_simd_floorf
+  static stbir__inline float stbir_simd_floorf(float x)
+  {
+    return wasm_f32x4_extract_lane( wasm_f32x4_floor( wasm_f32x4_splat(x) ), 0);
+  }
+
+  #ifdef STBIR_CEILF
+  #undef STBIR_CEILF
+  #endif
+  #define STBIR_CEILF stbir_simd_ceilf
+  static stbir__inline float stbir_simd_ceilf(float x)
+  {
+    return wasm_f32x4_extract_lane( wasm_f32x4_ceil( wasm_f32x4_splat(x) ), 0);
+  }
+
+  #define STBIR_SIMD
+
+#endif  // SSE2/NEON/WASM
+
+#endif // NO SIMD
+
+#ifdef STBIR_SIMD8
+  #define stbir__simdfX stbir__simdf8
+  #define stbir__simdiX stbir__simdi8
+  #define stbir__simdfX_load stbir__simdf8_load
+  #define stbir__simdiX_load stbir__simdi8_load
+  #define stbir__simdfX_mult stbir__simdf8_mult
+  #define stbir__simdfX_add_mem stbir__simdf8_add_mem
+  #define stbir__simdfX_madd_mem stbir__simdf8_madd_mem
+  #define stbir__simdfX_store stbir__simdf8_store
+  #define stbir__simdiX_store stbir__simdi8_store
+  #define stbir__simdf_frepX  stbir__simdf8_frep8
+  #define stbir__simdfX_madd stbir__simdf8_madd
+  #define stbir__simdfX_min stbir__simdf8_min
+  #define stbir__simdfX_max stbir__simdf8_max
+  #define stbir__simdfX_aaa1 stbir__simdf8_aaa1
+  #define stbir__simdfX_1aaa stbir__simdf8_1aaa
+  #define stbir__simdfX_a1a1 stbir__simdf8_a1a1
+  #define stbir__simdfX_1a1a stbir__simdf8_1a1a
+  #define stbir__simdfX_convert_float_to_i32 stbir__simdf8_convert_float_to_i32
+  #define stbir__simdfX_pack_to_words stbir__simdf8_pack_to_16words
+  #define stbir__simdfX_zero stbir__simdf8_zero
+  #define STBIR_onesX STBIR_ones8
+  #define STBIR_max_uint8_as_floatX STBIR_max_uint8_as_float8
+  #define STBIR_max_uint16_as_floatX STBIR_max_uint16_as_float8
+  #define STBIR_simd_point5X STBIR_simd_point58
+  #define stbir__simdfX_float_count 8
+  #define stbir__simdfX_0123to1230 stbir__simdf8_0123to12301230
+  #define stbir__simdfX_0123to2103 stbir__simdf8_0123to21032103
+  static const stbir__simdf8 STBIR_max_uint16_as_float_inverted8 = { stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted };
+  static const stbir__simdf8 STBIR_max_uint8_as_float_inverted8 = { stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted };
+  static const stbir__simdf8 STBIR_ones8 = { 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 };
+  static const stbir__simdf8 STBIR_simd_point58 = { 0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5 };
+  static const stbir__simdf8 STBIR_max_uint8_as_float8 = { stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float, stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float };
+  static const stbir__simdf8 STBIR_max_uint16_as_float8 = { stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float, stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float };
+#else
+  #define stbir__simdfX stbir__simdf
+  #define stbir__simdiX stbir__simdi
+  #define stbir__simdfX_load stbir__simdf_load
+  #define stbir__simdiX_load stbir__simdi_load
+  #define stbir__simdfX_mult stbir__simdf_mult
+  #define stbir__simdfX_add_mem stbir__simdf_add_mem
+  #define stbir__simdfX_madd_mem stbir__simdf_madd_mem
+  #define stbir__simdfX_store stbir__simdf_store
+  #define stbir__simdiX_store stbir__simdi_store
+  #define stbir__simdf_frepX  stbir__simdf_frep4
+  #define stbir__simdfX_madd stbir__simdf_madd
+  #define stbir__simdfX_min stbir__simdf_min
+  #define stbir__simdfX_max stbir__simdf_max
+  #define stbir__simdfX_aaa1 stbir__simdf_aaa1
+  #define stbir__simdfX_1aaa stbir__simdf_1aaa
+  #define stbir__simdfX_a1a1 stbir__simdf_a1a1
+  #define stbir__simdfX_1a1a stbir__simdf_1a1a
+  #define stbir__simdfX_convert_float_to_i32 stbir__simdf_convert_float_to_i32
+  #define stbir__simdfX_pack_to_words stbir__simdf_pack_to_8words
+  #define stbir__simdfX_zero stbir__simdf_zero
+  #define STBIR_onesX STBIR__CONSTF(STBIR_ones)
+  #define STBIR_simd_point5X STBIR__CONSTF(STBIR_simd_point5)
+  #define STBIR_max_uint8_as_floatX STBIR__CONSTF(STBIR_max_uint8_as_float)
+  #define STBIR_max_uint16_as_floatX STBIR__CONSTF(STBIR_max_uint16_as_float)
+  #define stbir__simdfX_float_count 4
+  #define stbir__if_simdf8_cast_to_simdf4( val ) ( val )
+  #define stbir__simdfX_0123to1230 stbir__simdf_0123to1230
+  #define stbir__simdfX_0123to2103 stbir__simdf_0123to2103
+#endif
+
+
+#if defined(STBIR_NEON) && !defined(_M_ARM)
+
+  #if defined( _MSC_VER ) && !defined(__clang__)
+  typedef __int16 stbir__FP16;
+  #else
+  typedef float16_t stbir__FP16;
+  #endif
+
+#else // no NEON, or 32-bit ARM for MSVC
+
+  typedef union stbir__FP16
+  {
+    unsigned short u;
+  } stbir__FP16;
+
+#endif
+
+#if !defined(STBIR_NEON) && !defined(STBIR_FP16C) || defined(STBIR_NEON) && defined(_M_ARM)
+
+  // Fabian's half float routines, see: https://gist.github.com/rygorous/2156668
+
+  static stbir__inline float stbir__half_to_float( stbir__FP16 h )
+  {
+    static const stbir__FP32 magic = { (254 - 15) << 23 };
+    static const stbir__FP32 was_infnan = { (127 + 16) << 23 };
+    stbir__FP32 o;
+
+    o.u = (h.u & 0x7fff) << 13;     // exponent/mantissa bits
+    o.f *= magic.f;                 // exponent adjust
+    if (o.f >= was_infnan.f)        // make sure Inf/NaN survive
+      o.u |= 255 << 23;
+    o.u |= (h.u & 0x8000) << 16;    // sign bit
+    return o.f;
+  }
+
+  static stbir__inline stbir__FP16 stbir__float_to_half(float val)
+  {
+    stbir__FP32 f32infty = { 255 << 23 };
+    stbir__FP32 f16max   = { (127 + 16) << 23 };
+    stbir__FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
+    unsigned int sign_mask = 0x80000000u;
+    stbir__FP16 o = { 0 };
+    stbir__FP32 f;
+    unsigned int sign; 
+
+    f.f = val;
+    sign = f.u & sign_mask;
+    f.u ^= sign;
+
+    if (f.u >= f16max.u) // result is Inf or NaN (all exponent bits set)
+      o.u = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
+    else // (De)normalized number or zero
+    {
+      if (f.u < (113 << 23)) // resulting FP16 is subnormal or zero
+      {
+        // use a magic value to align our 10 mantissa bits at the bottom of
+        // the float. as long as FP addition is round-to-nearest-even this
+        // just works.
+        f.f += denorm_magic.f;
+        // and one integer subtract of the bias later, we have our final float!
+        o.u = (unsigned short) ( f.u - denorm_magic.u );
+      }
+      else
+      {
+        unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
+        // update exponent, rounding bias part 1
+        f.u = f.u + ((15u - 127) << 23) + 0xfff;
+        // rounding bias part 2
+        f.u += mant_odd;
+        // take the bits!
+        o.u = (unsigned short) ( f.u >> 13 );
+      }
+    }
+
+    o.u |= sign >> 16;
+    return o;
+  }
+
+#endif
+
+
+#if defined(STBIR_FP16C)
+
+  #include <immintrin.h>
+
+  static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
+  {
+    _mm256_storeu_ps( (float*)output, _mm256_cvtph_ps( _mm_loadu_si128( (__m128i const* )input ) ) );
+  }
+
+  static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
+  {
+    _mm_storeu_si128( (__m128i*)output, _mm256_cvtps_ph( _mm256_loadu_ps( input ), 0 ) );
+  }
+
+  static stbir__inline float stbir__half_to_float( stbir__FP16 h )
+  {
+    return _mm_cvtss_f32( _mm_cvtph_ps( _mm_cvtsi32_si128( (int)h.u ) ) );
+  }
+
+  static stbir__inline stbir__FP16 stbir__float_to_half( float f )
+  {
+    stbir__FP16 h;
+    h.u = (unsigned short) _mm_cvtsi128_si32( _mm_cvtps_ph( _mm_set_ss( f ), 0 ) );
+    return h;
+  }
+
+#elif defined(STBIR_SSE2)
+
+  // Fabian's half float routines, see: https://gist.github.com/rygorous/2156668
+  stbir__inline static void stbir__half_to_float_SIMD(float * output, void const * input)
+  {
+    static const STBIR__SIMDI_CONST(mask_nosign,      0x7fff);
+    static const STBIR__SIMDI_CONST(smallest_normal,  0x0400);
+    static const STBIR__SIMDI_CONST(infinity,         0x7c00);
+    static const STBIR__SIMDI_CONST(expadjust_normal, (127 - 15) << 23);
+    static const STBIR__SIMDI_CONST(magic_denorm,     113 << 23);
+
+    __m128i i = _mm_loadu_si128 ( (__m128i const*)(input) );
+    __m128i h = _mm_unpacklo_epi16 ( i, _mm_setzero_si128() );
+    __m128i mnosign     = STBIR__CONSTI(mask_nosign);
+    __m128i eadjust     = STBIR__CONSTI(expadjust_normal);
+    __m128i smallest    = STBIR__CONSTI(smallest_normal);
+    __m128i infty       = STBIR__CONSTI(infinity);
+    __m128i expmant     = _mm_and_si128(mnosign, h);
+    __m128i justsign    = _mm_xor_si128(h, expmant);
+    __m128i b_notinfnan = _mm_cmpgt_epi32(infty, expmant);
+    __m128i b_isdenorm  = _mm_cmpgt_epi32(smallest, expmant);
+    __m128i shifted     = _mm_slli_epi32(expmant, 13);
+    __m128i adj_infnan  = _mm_andnot_si128(b_notinfnan, eadjust);
+    __m128i adjusted    = _mm_add_epi32(eadjust, shifted);
+    __m128i den1        = _mm_add_epi32(shifted, STBIR__CONSTI(magic_denorm));
+    __m128i adjusted2   = _mm_add_epi32(adjusted, adj_infnan);
+    __m128  den2        = _mm_sub_ps(_mm_castsi128_ps(den1), *(const __m128 *)&magic_denorm);
+    __m128  adjusted3   = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm));
+    __m128  adjusted4   = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2));
+    __m128  adjusted5   = _mm_or_ps(adjusted3, adjusted4);
+    __m128i sign        = _mm_slli_epi32(justsign, 16);
+    __m128  final       = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign));
+    stbir__simdf_store( output + 0,  final );
+
+    h = _mm_unpackhi_epi16 ( i, _mm_setzero_si128() );
+    expmant     = _mm_and_si128(mnosign, h);
+    justsign    = _mm_xor_si128(h, expmant);
+    b_notinfnan = _mm_cmpgt_epi32(infty, expmant);
+    b_isdenorm  = _mm_cmpgt_epi32(smallest, expmant);
+    shifted     = _mm_slli_epi32(expmant, 13);
+    adj_infnan  = _mm_andnot_si128(b_notinfnan, eadjust);
+    adjusted    = _mm_add_epi32(eadjust, shifted);
+    den1        = _mm_add_epi32(shifted, STBIR__CONSTI(magic_denorm));
+    adjusted2   = _mm_add_epi32(adjusted, adj_infnan);
+    den2        = _mm_sub_ps(_mm_castsi128_ps(den1), *(const __m128 *)&magic_denorm);
+    adjusted3   = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm));
+    adjusted4   = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2));
+    adjusted5   = _mm_or_ps(adjusted3, adjusted4);
+    sign        = _mm_slli_epi32(justsign, 16);
+    final       = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign));
+    stbir__simdf_store( output + 4,  final );
+
+    // ~38 SSE2 ops for 8 values
+  }
+
+  // Fabian's round-to-nearest-even float to half
+  // ~48 SSE2 ops for 8 output
+  stbir__inline static void stbir__float_to_half_SIMD(void * output, float const * input)
+  {
+    static const STBIR__SIMDI_CONST(mask_sign,      0x80000000u);
+    static const STBIR__SIMDI_CONST(c_f16max,       (127 + 16) << 23); // all FP32 values >=this round to +inf
+    static const STBIR__SIMDI_CONST(c_nanbit,        0x200);
+    static const STBIR__SIMDI_CONST(c_infty_as_fp16, 0x7c00);
+    static const STBIR__SIMDI_CONST(c_min_normal,    (127 - 14) << 23); // smallest FP32 that yields a normalized FP16
+    static const STBIR__SIMDI_CONST(c_subnorm_magic, ((127 - 15) + (23 - 10) + 1) << 23);
+    static const STBIR__SIMDI_CONST(c_normal_bias,    0xfff - ((127 - 15) << 23)); // adjust exponent and add mantissa rounding
+
+    __m128  f           =  _mm_loadu_ps(input);
+    __m128  msign       = _mm_castsi128_ps(STBIR__CONSTI(mask_sign));
+    __m128  justsign    = _mm_and_ps(msign, f);
+    __m128  absf        = _mm_xor_ps(f, justsign);
+    __m128i absf_int    = _mm_castps_si128(absf); // the cast is "free" (extra bypass latency, but no thruput hit)
+    __m128i f16max      = STBIR__CONSTI(c_f16max);
+    __m128  b_isnan     = _mm_cmpunord_ps(absf, absf); // is this a NaN?
+    __m128i b_isregular = _mm_cmpgt_epi32(f16max, absf_int); // (sub)normalized or special?
+    __m128i nanbit      = _mm_and_si128(_mm_castps_si128(b_isnan), STBIR__CONSTI(c_nanbit));
+    __m128i inf_or_nan  = _mm_or_si128(nanbit, STBIR__CONSTI(c_infty_as_fp16)); // output for specials
+
+    __m128i min_normal  = STBIR__CONSTI(c_min_normal);
+    __m128i b_issub     = _mm_cmpgt_epi32(min_normal, absf_int);
+
+    // "result is subnormal" path
+    __m128  subnorm1    = _mm_add_ps(absf, _mm_castsi128_ps(STBIR__CONSTI(c_subnorm_magic))); // magic value to round output mantissa
+    __m128i subnorm2    = _mm_sub_epi32(_mm_castps_si128(subnorm1), STBIR__CONSTI(c_subnorm_magic)); // subtract out bias
+
+    // "result is normal" path
+    __m128i mantoddbit  = _mm_slli_epi32(absf_int, 31 - 13); // shift bit 13 (mantissa LSB) to sign
+    __m128i mantodd     = _mm_srai_epi32(mantoddbit, 31); // -1 if FP16 mantissa odd, else 0
+
+    __m128i round1      = _mm_add_epi32(absf_int, STBIR__CONSTI(c_normal_bias));
+    __m128i round2      = _mm_sub_epi32(round1, mantodd); // if mantissa LSB odd, bias towards rounding up (RTNE)
+    __m128i normal      = _mm_srli_epi32(round2, 13); // rounded result
+
+    // combine the two non-specials
+    __m128i nonspecial  = _mm_or_si128(_mm_and_si128(subnorm2, b_issub), _mm_andnot_si128(b_issub, normal));
+
+    // merge in specials as well
+    __m128i joined      = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular), _mm_andnot_si128(b_isregular, inf_or_nan));
+
+    __m128i sign_shift  = _mm_srai_epi32(_mm_castps_si128(justsign), 16);
+    __m128i final2, final= _mm_or_si128(joined, sign_shift);
+
+    f           =  _mm_loadu_ps(input+4);
+    justsign    = _mm_and_ps(msign, f);
+    absf        = _mm_xor_ps(f, justsign);
+    absf_int    = _mm_castps_si128(absf); // the cast is "free" (extra bypass latency, but no thruput hit)
+    b_isnan     = _mm_cmpunord_ps(absf, absf); // is this a NaN?
+    b_isregular = _mm_cmpgt_epi32(f16max, absf_int); // (sub)normalized or special?
+    nanbit      = _mm_and_si128(_mm_castps_si128(b_isnan), c_nanbit);
+    inf_or_nan  = _mm_or_si128(nanbit, STBIR__CONSTI(c_infty_as_fp16)); // output for specials
+
+    b_issub     = _mm_cmpgt_epi32(min_normal, absf_int);
+
+    // "result is subnormal" path
+    subnorm1    = _mm_add_ps(absf, _mm_castsi128_ps(STBIR__CONSTI(c_subnorm_magic))); // magic value to round output mantissa
+    subnorm2    = _mm_sub_epi32(_mm_castps_si128(subnorm1), STBIR__CONSTI(c_subnorm_magic)); // subtract out bias
+
+    // "result is normal" path
+    mantoddbit  = _mm_slli_epi32(absf_int, 31 - 13); // shift bit 13 (mantissa LSB) to sign
+    mantodd     = _mm_srai_epi32(mantoddbit, 31); // -1 if FP16 mantissa odd, else 0
+
+    round1      = _mm_add_epi32(absf_int, STBIR__CONSTI(c_normal_bias));
+    round2      = _mm_sub_epi32(round1, mantodd); // if mantissa LSB odd, bias towards rounding up (RTNE)
+    normal      = _mm_srli_epi32(round2, 13); // rounded result
+
+    // combine the two non-specials
+    nonspecial  = _mm_or_si128(_mm_and_si128(subnorm2, b_issub), _mm_andnot_si128(b_issub, normal));
+
+    // merge in specials as well
+    joined      = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular), _mm_andnot_si128(b_isregular, inf_or_nan));
+
+    sign_shift  = _mm_srai_epi32(_mm_castps_si128(justsign), 16);
+    final2      = _mm_or_si128(joined, sign_shift);
+    final       = _mm_packs_epi32(final, final2);
+    stbir__simdi_store( output,final );
+  }
+
+#elif defined(STBIR_WASM) || (defined(STBIR_NEON) && defined(_MSC_VER) && defined(_M_ARM)) // WASM or 32-bit ARM on MSVC/clang
+
+  static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
+  {
+    for (int i=0; i<8; i++)
+    {
+      output[i] = stbir__half_to_float(input[i]);
+    }
+  }
+
+  static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
+  {
+    for (int i=0; i<8; i++)
+    {
+      output[i] = stbir__float_to_half(input[i]);
+    }
+  }
+
+#elif defined(STBIR_NEON) && defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) // 64-bit ARM on MSVC (not clang)
+
+  static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
+  {
+    float16x4_t in0 = vld1_f16(input + 0);
+    float16x4_t in1 = vld1_f16(input + 4);
+    vst1q_f32(output + 0, vcvt_f32_f16(in0));
+    vst1q_f32(output + 4, vcvt_f32_f16(in1));
+  }
+
+  static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
+  {
+    float16x4_t out0 = vcvt_f16_f32(vld1q_f32(input + 0));
+    float16x4_t out1 = vcvt_f16_f32(vld1q_f32(input + 4));
+    vst1_f16(output+0, out0);
+    vst1_f16(output+4, out1);
+  }
+
+  static stbir__inline float stbir__half_to_float( stbir__FP16 h )
+  {
+    return vgetq_lane_f32(vcvt_f32_f16(vld1_dup_f16(&h)), 0);
+  }
+
+  static stbir__inline stbir__FP16 stbir__float_to_half( float f )
+  {
+    return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0).n16_u16[0];
+  }
+
+#elif defined(STBIR_NEON) // 64-bit ARM
+
+  static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
+  {
+    float16x8_t in = vld1q_f16(input);
+    vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(in)));
+    vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(in)));
+  }
+
+  static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
+  {
+    float16x4_t out0 = vcvt_f16_f32(vld1q_f32(input + 0));
+    float16x4_t out1 = vcvt_f16_f32(vld1q_f32(input + 4));
+    vst1q_f16(output, vcombine_f16(out0, out1));
+  }
+
+  static stbir__inline float stbir__half_to_float( stbir__FP16 h )
+  {
+    return vgetq_lane_f32(vcvt_f32_f16(vdup_n_f16(h)), 0);
+  }
+
+  static stbir__inline stbir__FP16 stbir__float_to_half( float f )
+  {
+    return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0);
+  }
+
+#endif
+
+
+#ifdef STBIR_SIMD
+
+#define stbir__simdf_0123to3333( out, reg ) (out) = stbir__simdf_swiz( reg, 3,3,3,3 )
+#define stbir__simdf_0123to2222( out, reg ) (out) = stbir__simdf_swiz( reg, 2,2,2,2 )
+#define stbir__simdf_0123to1111( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,1,1 )
+#define stbir__simdf_0123to0000( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,0 )
+#define stbir__simdf_0123to0003( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,3 )
+#define stbir__simdf_0123to0001( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,1 )
+#define stbir__simdf_0123to1122( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,2,2 )
+#define stbir__simdf_0123to2333( out, reg ) (out) = stbir__simdf_swiz( reg, 2,3,3,3 )
+#define stbir__simdf_0123to0023( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,2,3 )
+#define stbir__simdf_0123to1230( out, reg ) (out) = stbir__simdf_swiz( reg, 1,2,3,0 )
+#define stbir__simdf_0123to2103( out, reg ) (out) = stbir__simdf_swiz( reg, 2,1,0,3 )
+#define stbir__simdf_0123to3210( out, reg ) (out) = stbir__simdf_swiz( reg, 3,2,1,0 )
+#define stbir__simdf_0123to2301( out, reg ) (out) = stbir__simdf_swiz( reg, 2,3,0,1 )
+#define stbir__simdf_0123to3012( out, reg ) (out) = stbir__simdf_swiz( reg, 3,0,1,2 )
+#define stbir__simdf_0123to0011( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,1,1 )
+#define stbir__simdf_0123to1100( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,0,0 )
+#define stbir__simdf_0123to2233( out, reg ) (out) = stbir__simdf_swiz( reg, 2,2,3,3 ) 
+#define stbir__simdf_0123to1133( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,3,3 ) 
+#define stbir__simdf_0123to0022( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,2,2 ) 
+#define stbir__simdf_0123to1032( out, reg ) (out) = stbir__simdf_swiz( reg, 1,0,3,2 ) 
+
+typedef union stbir__simdi_u32
+{
+  stbir_uint32 m128i_u32[4];
+  int m128i_i32[4];
+  stbir__simdi m128i_i128;
+} stbir__simdi_u32;
+
+static const int STBIR_mask[9] = { 0,0,0,-1,-1,-1,0,0,0 };
+
+static const STBIR__SIMDF_CONST(STBIR_max_uint8_as_float,           stbir__max_uint8_as_float);
+static const STBIR__SIMDF_CONST(STBIR_max_uint16_as_float,          stbir__max_uint16_as_float);
+static const STBIR__SIMDF_CONST(STBIR_max_uint8_as_float_inverted,  stbir__max_uint8_as_float_inverted);
+static const STBIR__SIMDF_CONST(STBIR_max_uint16_as_float_inverted, stbir__max_uint16_as_float_inverted);
+
+static const STBIR__SIMDF_CONST(STBIR_simd_point5,   0.5f);
+static const STBIR__SIMDF_CONST(STBIR_ones,          1.0f);
+static const STBIR__SIMDI_CONST(STBIR_almost_zero,   (127 - 13) << 23);
+static const STBIR__SIMDI_CONST(STBIR_almost_one,    0x3f7fffff);
+static const STBIR__SIMDI_CONST(STBIR_mastissa_mask, 0xff);
+static const STBIR__SIMDI_CONST(STBIR_topscale,      0x02000000);
+
+//   Basically, in simd mode, we unroll the proper amount, and we don't want
+//   the non-simd remnant loops to be unroll because they only run a few times
+//   Adding this switch saves about 5K on clang which is Captain Unroll the 3rd.
+#define STBIR_SIMD_STREAMOUT_PTR( star )  STBIR_STREAMOUT_PTR( star )
+#define STBIR_SIMD_NO_UNROLL(ptr) STBIR_NO_UNROLL(ptr)
+
+#ifdef STBIR_MEMCPY
+#undef STBIR_MEMCPY
+#define STBIR_MEMCPY stbir_simd_memcpy
+#endif
+
+// override normal use of memcpy with much simpler copy (faster and smaller with our sized copies)
+static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes ) 
+{
+  char STBIR_SIMD_STREAMOUT_PTR (*) d = (char*) dest;
+  char STBIR_SIMD_STREAMOUT_PTR( * ) d_end = ((char*) dest) + bytes;
+  ptrdiff_t ofs_to_src = (char*)src - (char*)dest;
+
+  // check overlaps
+  STBIR_ASSERT( ( ( d >= ( (char*)src) + bytes ) ) || ( ( d + bytes ) <= (char*)src ) );
+
+  if ( bytes < (16*stbir__simdfX_float_count) )
+  {
+    if ( bytes < 16 )
+    {
+      if ( bytes ) 
+      {
+        do
+        {
+          STBIR_SIMD_NO_UNROLL(d);
+          d[ 0 ] = d[ ofs_to_src ];
+          ++d;
+        } while ( d < d_end );
+      }
+    }
+    else
+    {
+      stbir__simdf x;
+      // do one unaligned to get us aligned for the stream out below
+      stbir__simdf_load( x, ( d + ofs_to_src ) );
+      stbir__simdf_store( d, x );
+      d = (char*)( ( ( (ptrdiff_t)d ) + 16 ) & ~15 );
+
+      for(;;)
+      {
+        STBIR_SIMD_NO_UNROLL(d);
+
+        if ( d > ( d_end - 16 ) )
+        {
+          if ( d == d_end )
+            return;
+          d = d_end - 16;
+        }
+
+        stbir__simdf_load( x, ( d + ofs_to_src ) );
+        stbir__simdf_store( d, x );
+        d += 16;
+      }
+    }
+  }
+  else
+  {
+    stbir__simdfX x0,x1,x2,x3;
+
+    // do one unaligned to get us aligned for the stream out below
+    stbir__simdfX_load( x0, ( d + ofs_to_src ) +  0*stbir__simdfX_float_count );
+    stbir__simdfX_load( x1, ( d + ofs_to_src ) +  4*stbir__simdfX_float_count );
+    stbir__simdfX_load( x2, ( d + ofs_to_src ) +  8*stbir__simdfX_float_count );
+    stbir__simdfX_load( x3, ( d + ofs_to_src ) + 12*stbir__simdfX_float_count );
+    stbir__simdfX_store( d +  0*stbir__simdfX_float_count, x0 );
+    stbir__simdfX_store( d +  4*stbir__simdfX_float_count, x1 );
+    stbir__simdfX_store( d +  8*stbir__simdfX_float_count, x2 );
+    stbir__simdfX_store( d + 12*stbir__simdfX_float_count, x3 );
+    d = (char*)( ( ( (ptrdiff_t)d ) + (16*stbir__simdfX_float_count) ) & ~((16*stbir__simdfX_float_count)-1) );
+
+    for(;;)
+    {
+      STBIR_SIMD_NO_UNROLL(d);
+  
+      if ( d > ( d_end - (16*stbir__simdfX_float_count) ) )
+      {
+        if ( d == d_end )
+          return;
+        d = d_end - (16*stbir__simdfX_float_count);
+      }
+
+      stbir__simdfX_load( x0, ( d + ofs_to_src ) +  0*stbir__simdfX_float_count );
+      stbir__simdfX_load( x1, ( d + ofs_to_src ) +  4*stbir__simdfX_float_count );
+      stbir__simdfX_load( x2, ( d + ofs_to_src ) +  8*stbir__simdfX_float_count );
+      stbir__simdfX_load( x3, ( d + ofs_to_src ) + 12*stbir__simdfX_float_count );
+      stbir__simdfX_store( d +  0*stbir__simdfX_float_count, x0 );
+      stbir__simdfX_store( d +  4*stbir__simdfX_float_count, x1 );
+      stbir__simdfX_store( d +  8*stbir__simdfX_float_count, x2 );
+      stbir__simdfX_store( d + 12*stbir__simdfX_float_count, x3 );
+      d += (16*stbir__simdfX_float_count);
+    }
+  }
+}
+
+// memcpy that is specically intentionally overlapping (src is smaller then dest, so can be
+//   a normal forward copy, bytes is divisible by 4 and bytes is greater than or equal to
+//   the diff between dest and src)
+static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes ) 
+{
+  char STBIR_SIMD_STREAMOUT_PTR (*) sd = (char*) src;
+  char STBIR_SIMD_STREAMOUT_PTR( * ) s_end = ((char*) src) + bytes;
+  ptrdiff_t ofs_to_dest = (char*)dest - (char*)src;
+
+  if ( ofs_to_dest >= 16 ) // is the overlap more than 16 away?
+  {
+    char STBIR_SIMD_STREAMOUT_PTR( * ) s_end16 = ((char*) src) + (bytes&~15);
+    do
+    {
+      stbir__simdf x;
+      STBIR_SIMD_NO_UNROLL(sd);
+      stbir__simdf_load( x, sd );
+      stbir__simdf_store(  ( sd + ofs_to_dest ), x );
+      sd += 16;
+    } while ( sd < s_end16 );
+
+    if ( sd == s_end )
+      return;
+  }
+
+  do
+  {
+    STBIR_SIMD_NO_UNROLL(sd);
+    *(int*)( sd + ofs_to_dest ) = *(int*) sd; 
+    sd += 4;
+  } while ( sd < s_end );
+}
+
+#else // no SSE2
+
+// when in scalar mode, we let unrolling happen, so this macro just does the __restrict
+#define STBIR_SIMD_STREAMOUT_PTR( star ) STBIR_STREAMOUT_PTR( star )
+#define STBIR_SIMD_NO_UNROLL(ptr) 
+
+#endif // SSE2
+
+
+#ifdef STBIR_PROFILE
+
+#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(__SSE2__) || defined(STBIR_SSE) || defined( _M_IX86_FP ) || defined(__i386) || defined( __i386__ ) || defined( _M_IX86 ) || defined( _X86_ )
+
+#ifdef _MSC_VER
+
+  STBIRDEF stbir_uint64 __rdtsc();
+  #define STBIR_PROFILE_FUNC() __rdtsc()
+
+#else // non msvc
+
+  static stbir__inline stbir_uint64 STBIR_PROFILE_FUNC() 
+  {
+    stbir_uint32 lo, hi;
+    asm volatile ("rdtsc" : "=a" (lo), "=d" (hi) );
+    return ( ( (stbir_uint64) hi ) << 32 ) | ( (stbir_uint64) lo );
+  }
+
+#endif  // msvc
+
+#elif defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || defined(__ARM_NEON__) 
+
+#if defined( _MSC_VER ) && !defined(__clang__)
+
+  #define STBIR_PROFILE_FUNC() _ReadStatusReg(ARM64_CNTVCT)
+
+#else
+
+  static stbir__inline stbir_uint64 STBIR_PROFILE_FUNC()
+  {
+    stbir_uint64 tsc;
+    asm volatile("mrs %0, cntvct_el0" : "=r" (tsc));
+    return tsc;
+  }
+
+#endif
+
+#else // x64, arm
+
+#error Unknown platform for profiling.
+
+#endif  //x64 and   
+
+
+#define STBIR_ONLY_PROFILE_GET_SPLIT_INFO ,stbir__per_split_info * split_info
+#define STBIR_ONLY_PROFILE_SET_SPLIT_INFO ,split_info
+
+#define STBIR_ONLY_PROFILE_BUILD_GET_INFO ,stbir__info * profile_info
+#define STBIR_ONLY_PROFILE_BUILD_SET_INFO ,profile_info
+
+// super light-weight micro profiler
+#define STBIR_PROFILE_START_ll( info, wh ) { stbir_uint64 wh##thiszonetime = STBIR_PROFILE_FUNC(); stbir_uint64 * wh##save_parent_excluded_ptr = info->current_zone_excluded_ptr; stbir_uint64 wh##current_zone_excluded = 0; info->current_zone_excluded_ptr = &wh##current_zone_excluded; 
+#define STBIR_PROFILE_END_ll( info, wh ) wh##thiszonetime = STBIR_PROFILE_FUNC() - wh##thiszonetime; info->profile.named.wh += wh##thiszonetime - wh##current_zone_excluded; *wh##save_parent_excluded_ptr += wh##thiszonetime; info->current_zone_excluded_ptr = wh##save_parent_excluded_ptr; }
+#define STBIR_PROFILE_FIRST_START_ll( info, wh ) { int i; info->current_zone_excluded_ptr = &info->profile.named.total; for(i=0;i<STBIR__ARRAY_SIZE(info->profile.array);i++) info->profile.array[i]=0; } STBIR_PROFILE_START_ll( info, wh );
+#define STBIR_PROFILE_CLEAR_EXTRAS_ll( info, num ) { int extra; for(extra=1;extra<(num);extra++) { int i; for(i=0;i<STBIR__ARRAY_SIZE((info)->profile.array);i++) (info)[extra].profile.array[i]=0; } }
+
+// for thread data
+#define STBIR_PROFILE_START( wh ) STBIR_PROFILE_START_ll( split_info, wh )
+#define STBIR_PROFILE_END( wh ) STBIR_PROFILE_END_ll( split_info, wh )
+#define STBIR_PROFILE_FIRST_START( wh ) STBIR_PROFILE_FIRST_START_ll( split_info, wh )
+#define STBIR_PROFILE_CLEAR_EXTRAS() STBIR_PROFILE_CLEAR_EXTRAS_ll( split_info, split_count )
+
+// for build data
+#define STBIR_PROFILE_BUILD_START( wh ) STBIR_PROFILE_START_ll( profile_info, wh )
+#define STBIR_PROFILE_BUILD_END( wh ) STBIR_PROFILE_END_ll( profile_info, wh )
+#define STBIR_PROFILE_BUILD_FIRST_START( wh ) STBIR_PROFILE_FIRST_START_ll( profile_info, wh )
+#define STBIR_PROFILE_BUILD_CLEAR( info ) { int i; for(i=0;i<STBIR__ARRAY_SIZE(info->profile.array);i++) info->profile.array[i]=0; }
+
+#else  // no profile
+
+#define STBIR_ONLY_PROFILE_GET_SPLIT_INFO
+#define STBIR_ONLY_PROFILE_SET_SPLIT_INFO
+
+#define STBIR_ONLY_PROFILE_BUILD_GET_INFO
+#define STBIR_ONLY_PROFILE_BUILD_SET_INFO
+
+#define STBIR_PROFILE_START( wh )
+#define STBIR_PROFILE_END( wh )
+#define STBIR_PROFILE_FIRST_START( wh )
+#define STBIR_PROFILE_CLEAR_EXTRAS( )
+
+#define STBIR_PROFILE_BUILD_START( wh ) 
+#define STBIR_PROFILE_BUILD_END( wh ) 
+#define STBIR_PROFILE_BUILD_FIRST_START( wh )
+#define STBIR_PROFILE_BUILD_CLEAR( info )
+
+#endif  // stbir_profile
+
+#ifndef STBIR_CEILF
+#include <math.h>
+#if _MSC_VER <= 1200 // support VC6 for Sean
+#define STBIR_CEILF(x) ((float)ceil((float)(x)))
+#define STBIR_FLOORF(x) ((float)floor((float)(x)))
+#else
+#define STBIR_CEILF(x) ceilf(x)
+#define STBIR_FLOORF(x) floorf(x)
+#endif
+#endif
+
+#ifndef STBIR_MEMCPY
+// For memcpy
+#include <string.h>
+#define STBIR_MEMCPY( dest, src, len ) memcpy( dest, src, len )
+#endif
+
+#ifndef STBIR_SIMD
+
+// memcpy that is specically intentionally overlapping (src is smaller then dest, so can be
+//   a normal forward copy, bytes is divisible by 4 and bytes is greater than or equal to
+//   the diff between dest and src)
+static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes ) 
+{
+  char STBIR_SIMD_STREAMOUT_PTR (*) sd = (char*) src;
+  char STBIR_SIMD_STREAMOUT_PTR( * ) s_end = ((char*) src) + bytes;
+  ptrdiff_t ofs_to_dest = (char*)dest - (char*)src;
+
+  if ( ofs_to_dest >= 8 ) // is the overlap more than 8 away?
+  {
+    char STBIR_SIMD_STREAMOUT_PTR( * ) s_end8 = ((char*) src) + (bytes&~7);
+    do
+    {
+      STBIR_NO_UNROLL(sd);
+      *(stbir_uint64*)( sd + ofs_to_dest ) = *(stbir_uint64*) sd; 
+      sd += 8;
+    } while ( sd < s_end8 );
+
+    if ( sd == s_end )
+      return;
+  }
+
+  do
+  {
+    STBIR_NO_UNROLL(sd);
+    *(int*)( sd + ofs_to_dest ) = *(int*) sd; 
+    sd += 4;
+  } while ( sd < s_end );
+}
+
+#endif
+
+static float stbir__filter_trapezoid(float x, float scale, void * user_data)
+{
+  float halfscale = scale / 2;
+  float t = 0.5f + halfscale;
+  STBIR_ASSERT(scale <= 1);
+  STBIR__UNUSED(user_data);
+
+  if ( x < 0.0f ) x = -x;
+
+  if (x >= t)
+    return 0.0f;
+  else
+  {
+    float r = 0.5f - halfscale;
+    if (x <= r)
+      return 1.0f;
+    else
+      return (t - x) / scale;
+  }
+}
+
+static float stbir__support_trapezoid(float scale, void * user_data)
+{
+  STBIR__UNUSED(user_data);
+  return 0.5f + scale / 2.0f;
+}
+
+static float stbir__filter_triangle(float x, float s, void * user_data)
+{
+  STBIR__UNUSED(s);
+  STBIR__UNUSED(user_data);
+
+  if ( x < 0.0f ) x = -x;
+
+  if (x <= 1.0f)
+    return 1.0f - x;
+  else
+    return 0.0f;
+}
+
+static float stbir__filter_point(float x, float s, void * user_data)
+{
+  STBIR__UNUSED(x);
+  STBIR__UNUSED(s);
+  STBIR__UNUSED(user_data);
+
+  return 1.0f;
+}
+
+static float stbir__filter_cubic(float x, float s, void * user_data)
+{
+  STBIR__UNUSED(s);
+  STBIR__UNUSED(user_data);
+
+  if ( x < 0.0f ) x = -x;
+
+  if (x < 1.0f)
+    return (4.0f + x*x*(3.0f*x - 6.0f))/6.0f;
+  else if (x < 2.0f)
+    return (8.0f + x*(-12.0f + x*(6.0f - x)))/6.0f;
+
+  return (0.0f);
+}
+
+static float stbir__filter_catmullrom(float x, float s, void * user_data)
+{
+  STBIR__UNUSED(s);
+  STBIR__UNUSED(user_data);
+
+  if ( x < 0.0f ) x = -x;
+
+  if (x < 1.0f)
+    return 1.0f - x*x*(2.5f - 1.5f*x);
+  else if (x < 2.0f)
+    return 2.0f - x*(4.0f + x*(0.5f*x - 2.5f));
+
+  return (0.0f);
+}
+
+static float stbir__filter_mitchell(float x, float s, void * user_data)
+{
+  STBIR__UNUSED(s);
+  STBIR__UNUSED(user_data);
+
+  if ( x < 0.0f ) x = -x;
+
+  if (x < 1.0f)
+    return (16.0f + x*x*(21.0f * x - 36.0f))/18.0f;
+  else if (x < 2.0f)
+    return (32.0f + x*(-60.0f + x*(36.0f - 7.0f*x)))/18.0f;
+
+  return (0.0f);
+}
+
+static float stbir__support_zero(float s, void * user_data)
+{
+  STBIR__UNUSED(s);
+  STBIR__UNUSED(user_data);
+  return 0;
+}
+
+static float stbir__support_zeropoint5(float s, void * user_data)
+{
+  STBIR__UNUSED(s);
+  STBIR__UNUSED(user_data);
+  return 0.5f;
+}
+
+static float stbir__support_one(float s, void * user_data)
+{
+  STBIR__UNUSED(s);
+  STBIR__UNUSED(user_data);
+  return 1;
+}
+
+static float stbir__support_two(float s, void * user_data) 
+{
+  STBIR__UNUSED(s);
+  STBIR__UNUSED(user_data);
+  return 2;
+}
+
+// This is the maximum number of input samples that can affect an output sample
+// with the given filter from the output pixel's perspective
+static int stbir__get_filter_pixel_width(stbir__support_callback * support, float scale, void * user_data)
+{
+  STBIR_ASSERT(support != 0);
+
+  if ( scale >= ( 1.0f-stbir__small_float ) ) // upscale
+    return (int)STBIR_CEILF(support(1.0f/scale,user_data) * 2.0f);
+  else
+    return (int)STBIR_CEILF(support(scale,user_data) * 2.0f / scale);
+}
+
+// this is how many coefficents per run of the filter (which is different 
+//   from the filter_pixel_width depending on if we are scattering or gathering)
+static int stbir__get_coefficient_width(stbir__sampler * samp, int is_gather, void * user_data)
+{
+  float scale = samp->scale_info.scale;
+  stbir__support_callback * support = samp->filter_support;
+
+  switch( is_gather )
+  {
+    case 1:
+      return (int)STBIR_CEILF(support(1.0f / scale, user_data) * 2.0f);
+    case 2:
+      return (int)STBIR_CEILF(support(scale, user_data) * 2.0f / scale);
+    case 0:
+      return (int)STBIR_CEILF(support(scale, user_data) * 2.0f);
+    default:
+      STBIR_ASSERT( (is_gather >= 0 ) && (is_gather <= 2 ) );
+      return 0;
+  }
+}
+
+static int stbir__get_contributors(stbir__sampler * samp, int is_gather)  
+{
+  if (is_gather)
+      return samp->scale_info.output_sub_size;
+  else
+      return (samp->scale_info.input_full_size + samp->filter_pixel_margin * 2);
+}
+
+static int stbir__edge_zero_full( int n, int max )
+{
+  STBIR__UNUSED(n);
+  STBIR__UNUSED(max);
+  return 0; // NOTREACHED
+}
+
+static int stbir__edge_clamp_full( int n, int max )
+{
+  if (n < 0)
+    return 0;
+
+  if (n >= max)
+    return max - 1;
+
+  return n; // NOTREACHED
+}
+
+static int stbir__edge_reflect_full( int n, int max )
+{
+  if (n < 0)
+  {
+    if (n > -max)    
+      return -n;
+    else
+      return max - 1;
+  }
+
+  if (n >= max)
+  {
+    int max2 = max * 2;
+    if (n >= max2)
+      return 0;
+    else
+      return max2 - n - 1;
+  }
+
+  return n; // NOTREACHED
+}
+
+static int stbir__edge_wrap_full( int n, int max )
+{
+  if (n >= 0)
+    return (n % max);
+  else
+  {
+    int m = (-n) % max;
+
+    if (m != 0)
+      m = max - m;
+
+    return (m);
+  }
+}
+
+typedef int stbir__edge_wrap_func( int n, int max );
+static stbir__edge_wrap_func * stbir__edge_wrap_slow[] =
+{
+  stbir__edge_clamp_full,    // STBIR_EDGE_CLAMP
+  stbir__edge_reflect_full,  // STBIR_EDGE_REFLECT
+  stbir__edge_wrap_full,     // STBIR_EDGE_WRAP
+  stbir__edge_zero_full,     // STBIR_EDGE_ZERO
+};
+
+stbir__inline static int stbir__edge_wrap(stbir_edge edge, int n, int max)
+{
+  // avoid per-pixel switch
+  if (n >= 0 && n < max)
+      return n;
+  return stbir__edge_wrap_slow[edge]( n, max );
+}
+
+#define STBIR__MERGE_RUNS_PIXEL_THRESHOLD 16
+
+// get information on the extents of a sampler
+static void stbir__get_extents( stbir__sampler * samp, stbir__extents * scanline_extents )
+{
+  int j, stop;
+  int left_margin, right_margin;
+  int min_n = 0x7fffffff, max_n = -0x7fffffff;
+  int min_left = 0x7fffffff, max_left = -0x7fffffff;
+  int min_right = 0x7fffffff, max_right = -0x7fffffff;
+  stbir_edge edge = samp->edge;
+  stbir__contributors* contributors = samp->contributors;
+  int output_sub_size = samp->scale_info.output_sub_size;
+  int input_full_size = samp->scale_info.input_full_size;
+  int filter_pixel_margin = samp->filter_pixel_margin;
+
+  STBIR_ASSERT( samp->is_gather );
+
+  stop = output_sub_size;
+  for (j = 0; j < stop; j++ )
+  {
+    STBIR_ASSERT( contributors[j].n1 >= contributors[j].n0 );
+    if ( contributors[j].n0 < min_n )
+    {
+      min_n = contributors[j].n0;
+      stop = j + filter_pixel_margin;  // if we find a new min, only scan another filter width
+      if ( stop > output_sub_size ) stop = output_sub_size;
+    }
+  }
+
+  stop = 0;
+  for (j = output_sub_size - 1; j >= stop; j-- )
+  {
+    STBIR_ASSERT( contributors[j].n1 >= contributors[j].n0 );
+    if ( contributors[j].n1 > max_n )
+    {
+      max_n = contributors[j].n1;
+      stop = j - filter_pixel_margin;  // if we find a new max, only scan another filter width
+      if (stop<0) stop = 0;
+    }
+  }
+
+  STBIR_ASSERT( scanline_extents->conservative.n0 <= min_n );
+  STBIR_ASSERT( scanline_extents->conservative.n1 >= max_n );
+
+  // now calculate how much into the margins we really read
+  left_margin = 0;
+  if ( min_n < 0 )
+  {
+    left_margin = -min_n;
+    min_n = 0;
+  }
+  
+  right_margin = 0;
+  if ( max_n >= input_full_size )
+  {
+    right_margin = max_n - input_full_size + 1;
+    max_n = input_full_size - 1;
+  }
+
+  // index 1 is margin pixel extents (how many pixels we hang over the edge)
+  scanline_extents->edge_sizes[0] = left_margin;
+  scanline_extents->edge_sizes[1] = right_margin;
+
+  // index 2 is pixels read from the input
+  scanline_extents->spans[0].n0 = min_n;
+  scanline_extents->spans[0].n1 = max_n;
+  scanline_extents->spans[0].pixel_offset_for_input = min_n;
+
+  // default to no other input range
+  scanline_extents->spans[1].n0 = 0;
+  scanline_extents->spans[1].n1 = -1;
+  scanline_extents->spans[1].pixel_offset_for_input = 0;
+
+  // don't have to do edge calc for zero clamp
+  if ( edge == STBIR_EDGE_ZERO )
+    return;
+  
+  // convert margin pixels to the pixels within the input (min and max)
+  for( j = -left_margin ; j < 0 ; j++ )
+  {
+      int p = stbir__edge_wrap( edge, j, input_full_size );
+      if ( p < min_left )
+        min_left = p;
+      if ( p > max_left )
+        max_left = p;
+  }
+
+  for( j = input_full_size ; j < (input_full_size + right_margin) ; j++ )
+  {
+      int p = stbir__edge_wrap( edge, j, input_full_size );
+      if ( p < min_right )
+        min_right = p;
+      if ( p > max_right )
+        max_right = p;
+  }
+
+  // merge the left margin pixel region if it connects within 4 pixels of main pixel region
+  if ( min_left != 0x7fffffff )
+  {
+    if ( ( ( min_left <= min_n ) && ( ( max_left  + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= min_n ) ) ||
+         ( ( min_n <= min_left ) && ( ( max_n  + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= max_left ) ) )
+    {
+      scanline_extents->spans[0].n0 = min_n = stbir__min( min_n, min_left );
+      scanline_extents->spans[0].n1 = max_n = stbir__max( max_n, max_left );
+      scanline_extents->spans[0].pixel_offset_for_input = min_n;
+      left_margin = 0;
+    }
+  }
+
+  // merge the right margin pixel region if it connects within 4 pixels of main pixel region
+  if ( min_right != 0x7fffffff )
+  {
+    if ( ( ( min_right <= min_n ) && ( ( max_right  + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= min_n ) ) ||
+         ( ( min_n <= min_right ) && ( ( max_n  + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= max_right ) ) )
+    {
+      scanline_extents->spans[0].n0 = min_n = stbir__min( min_n, min_right );
+      scanline_extents->spans[0].n1 = max_n = stbir__max( max_n, max_right );
+      scanline_extents->spans[0].pixel_offset_for_input = min_n;
+      right_margin = 0;
+    }
+  }
+
+  STBIR_ASSERT( scanline_extents->conservative.n0 <= min_n );
+  STBIR_ASSERT( scanline_extents->conservative.n1 >= max_n );
+
+  // you get two ranges when you have the WRAP edge mode and you are doing just the a piece of the resize
+  //   so you need to get a second run of pixels from the opposite side of the scanline (which you
+  //   wouldn't need except for WRAP)
+
+
+  // if we can't merge the min_left range, add it as a second range
+  if ( ( left_margin ) && ( min_left != 0x7fffffff ) )
+  {
+    stbir__span * newspan = scanline_extents->spans + 1;
+    STBIR_ASSERT( right_margin == 0 );
+    if ( min_left < scanline_extents->spans[0].n0 )
+    {
+      scanline_extents->spans[1].pixel_offset_for_input = scanline_extents->spans[0].n0;
+      scanline_extents->spans[1].n0 = scanline_extents->spans[0].n0;
+      scanline_extents->spans[1].n1 = scanline_extents->spans[0].n1;
+      --newspan;
+    }
+    newspan->pixel_offset_for_input = min_left;
+    newspan->n0 = -left_margin;
+    newspan->n1 = ( max_left - min_left ) - left_margin;
+    scanline_extents->edge_sizes[0] = 0;  // don't need to copy the left margin, since we are directly decoding into the margin
+    return;
+  }
+
+  // if we can't merge the min_left range, add it as a second range
+  if ( ( right_margin ) && ( min_right != 0x7fffffff ) )
+  {
+    stbir__span * newspan = scanline_extents->spans + 1;
+    if ( min_right < scanline_extents->spans[0].n0 )
+    {
+      scanline_extents->spans[1].pixel_offset_for_input = scanline_extents->spans[0].n0;
+      scanline_extents->spans[1].n0 = scanline_extents->spans[0].n0;
+      scanline_extents->spans[1].n1 = scanline_extents->spans[0].n1;
+      --newspan;
+    }
+    newspan->pixel_offset_for_input = min_right;
+    newspan->n0 = scanline_extents->spans[1].n1 + 1;
+    newspan->n1 = scanline_extents->spans[1].n1 + 1 + ( max_right - min_right );
+    scanline_extents->edge_sizes[1] = 0;  // don't need to copy the right margin, since we are directly decoding into the margin
+    return;
+  }
+}
+
+static void stbir__calculate_in_pixel_range( int * first_pixel, int * last_pixel, float out_pixel_center, float out_filter_radius, float inv_scale, float out_shift, int input_size, stbir_edge edge )
+{
+  int first, last;
+  float out_pixel_influence_lowerbound = out_pixel_center - out_filter_radius;
+  float out_pixel_influence_upperbound = out_pixel_center + out_filter_radius;
+
+  float in_pixel_influence_lowerbound = (out_pixel_influence_lowerbound + out_shift) * inv_scale; 
+  float in_pixel_influence_upperbound = (out_pixel_influence_upperbound + out_shift) * inv_scale; 
+
+  first = (int)(STBIR_FLOORF(in_pixel_influence_lowerbound + 0.5f));
+  last = (int)(STBIR_FLOORF(in_pixel_influence_upperbound - 0.5f));
+
+  if ( edge == STBIR_EDGE_WRAP )
+  {
+    if ( first <= -input_size )
+      first = -(input_size-1);
+    if ( last >= (input_size*2))
+      last = (input_size*2) - 1;
+  }
+  
+  *first_pixel = first;
+  *last_pixel = last;
+}
+
+static void stbir__calculate_coefficients_for_gather_upsample( float out_filter_radius, stbir__kernel_callback * kernel, stbir__scale_info * scale_info, int num_contributors, stbir__contributors* contributors, float* coefficient_group, int coefficient_width, stbir_edge edge, void * user_data )
+{
+  int n, end;
+  float inv_scale = scale_info->inv_scale;
+  float out_shift = scale_info->pixel_shift;
+  int input_size  = scale_info->input_full_size;
+  int numerator = scale_info->scale_numerator;
+  int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < num_contributors ) );
+
+  // Looping through out pixels
+  end = num_contributors; if ( polyphase ) end = numerator;
+  for (n = 0; n < end; n++)
+  {
+    int i;
+    int last_non_zero;
+    float out_pixel_center = (float)n + 0.5f;
+    float in_center_of_out = (out_pixel_center + out_shift) * inv_scale;  
+
+    int in_first_pixel, in_last_pixel;
+    
+    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, out_pixel_center, out_filter_radius, inv_scale, out_shift, input_size, edge );
+
+    last_non_zero = -1;
+    for (i = 0; i <= in_last_pixel - in_first_pixel; i++)
+    {
+      float in_pixel_center = (float)(i + in_first_pixel) + 0.5f;
+      float coeff = kernel(in_center_of_out - in_pixel_center, inv_scale, user_data);
+
+      // kill denormals
+      if ( ( ( coeff < stbir__small_float ) && ( coeff > -stbir__small_float ) ) )
+      {
+        if ( i == 0 )  // if we're at the front, just eat zero contributors
+        { 
+          STBIR_ASSERT ( ( in_last_pixel - in_first_pixel ) != 0 ); // there should be at least one contrib
+          ++in_first_pixel;
+          i--;
+          continue;
+        }
+        coeff = 0;  // make sure is fully zero (should keep denormals away)
+      }
+      else
+        last_non_zero = i;
+      
+      coefficient_group[i] = coeff;
+    }
+    
+    in_last_pixel = last_non_zero+in_first_pixel; // kills trailing zeros
+    contributors->n0 = in_first_pixel;
+    contributors->n1 = in_last_pixel;
+
+    STBIR_ASSERT(contributors->n1 >= contributors->n0);
+
+    ++contributors;
+    coefficient_group += coefficient_width;
+  }
+}
+
+static void stbir__insert_coeff( stbir__contributors * contribs, float * coeffs, int new_pixel, float new_coeff )
+{
+  if ( new_pixel <= contribs->n1 )  // before the end
+  {
+    if ( new_pixel < contribs->n0 ) // before the front?
+    {
+      int j, o = contribs->n0 - new_pixel;
+      for ( j = contribs->n1 - contribs->n0 ; j <= 0 ; j-- )
+        coeffs[ j + o ] = coeffs[ j ];
+      for ( j = 1 ; j < o ; j-- )
+        coeffs[ j ] = coeffs[ 0 ];
+      coeffs[ 0 ] = new_coeff;
+      contribs->n0 = new_pixel;
+    }
+    else
+    {
+      coeffs[ new_pixel - contribs->n0 ] += new_coeff;
+    }
+  }
+  else
+  {
+    int j, e = new_pixel - contribs->n0;
+    for( j = ( contribs->n1 - contribs->n0 ) + 1 ; j < e ; j++ ) // clear in-betweens coeffs if there are any
+      coeffs[j] = 0;
+
+    coeffs[ e ] = new_coeff;
+    contribs->n1 = new_pixel;
+  }
+}
+
+static void stbir__calculate_out_pixel_range( int * first_pixel, int * last_pixel, float in_pixel_center, float in_pixels_radius, float scale, float out_shift, int out_size )
+{
+  float in_pixel_influence_lowerbound = in_pixel_center - in_pixels_radius;
+  float in_pixel_influence_upperbound = in_pixel_center + in_pixels_radius;
+  float out_pixel_influence_lowerbound = in_pixel_influence_lowerbound * scale - out_shift;
+  float out_pixel_influence_upperbound = in_pixel_influence_upperbound * scale - out_shift;
+  int out_first_pixel = (int)(STBIR_FLOORF(out_pixel_influence_lowerbound + 0.5f));
+  int out_last_pixel = (int)(STBIR_FLOORF(out_pixel_influence_upperbound - 0.5f));
+
+  if ( out_first_pixel < 0 )
+    out_first_pixel = 0;
+  if ( out_last_pixel >= out_size )
+    out_last_pixel = out_size - 1;
+  *first_pixel = out_first_pixel;
+  *last_pixel = out_last_pixel;
+}
+
+static void stbir__calculate_coefficients_for_gather_downsample( int start, int end, float in_pixels_radius, stbir__kernel_callback * kernel, stbir__scale_info * scale_info, int coefficient_width, int num_contributors, stbir__contributors * contributors, float * coefficient_group, void * user_data )
+{
+  int in_pixel;
+  int i;
+  int first_out_inited = -1;
+  float scale = scale_info->scale;
+  float out_shift = scale_info->pixel_shift;
+  int out_size = scale_info->output_sub_size;
+  int numerator = scale_info->scale_numerator;
+  int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < out_size ) );
+
+  STBIR__UNUSED(num_contributors);
+
+  // Loop through the input pixels
+  for (in_pixel = start; in_pixel < end; in_pixel++)
+  {
+    float in_pixel_center = (float)in_pixel + 0.5f;
+    float out_center_of_in = in_pixel_center * scale - out_shift;
+    int out_first_pixel, out_last_pixel;
+
+    stbir__calculate_out_pixel_range( &out_first_pixel, &out_last_pixel, in_pixel_center, in_pixels_radius, scale, out_shift, out_size );
+
+    if ( out_first_pixel > out_last_pixel )
+      continue;
+
+    // clamp or exit if we are using polyphase filtering, and the limit is up
+    if ( polyphase )
+    {
+      // when polyphase, you only have to do coeffs up to the numerator count
+      if ( out_first_pixel == numerator )
+        break;
+
+      // don't do any extra work, clamp last pixel at numerator too
+      if ( out_last_pixel >= numerator )
+        out_last_pixel = numerator - 1;
+    }
+
+    for (i = 0; i <= out_last_pixel - out_first_pixel; i++)
+    {
+      float out_pixel_center = (float)(i + out_first_pixel) + 0.5f;
+      float x = out_pixel_center - out_center_of_in;
+      float coeff = kernel(x, scale, user_data) * scale;
+
+      // kill the coeff if it's too small (avoid denormals)
+      if ( ( ( coeff < stbir__small_float ) && ( coeff > -stbir__small_float ) ) )
+        coeff = 0.0f;
+
+      {
+        int out = i + out_first_pixel;
+        float * coeffs = coefficient_group + out * coefficient_width;
+        stbir__contributors * contribs = contributors + out;
+
+        // is this the first time this output pixel has been seen?  Init it.
+        if ( out > first_out_inited ) 
+        {
+          STBIR_ASSERT( out == ( first_out_inited + 1 ) ); // ensure we have only advanced one at time
+          first_out_inited = out;
+          contribs->n0 = in_pixel;
+          contribs->n1 = in_pixel;
+          coeffs[0]  = coeff;
+        }
+        else 
+        {
+          // insert on end (always in order)
+          if ( coeffs[0] == 0.0f )  // if the first coefficent is zero, then zap it for this coeffs
+          {
+            STBIR_ASSERT( ( in_pixel - contribs->n0 ) == 1 ); // ensure that when we zap, we're at the 2nd pos
+            contribs->n0 = in_pixel;
+          }
+          contribs->n1 = in_pixel;
+          STBIR_ASSERT( ( in_pixel - contribs->n0 ) < coefficient_width );
+          coeffs[in_pixel - contribs->n0]  = coeff;
+        }
+      }
+    }
+  }
+}
+
+static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter_extent_info* filter_info, stbir__scale_info * scale_info, int num_contributors, stbir__contributors* contributors, float * coefficient_group, int coefficient_width )
+{
+  int input_size = scale_info->input_full_size;
+  int input_last_n1 = input_size - 1; 
+  int n, end;
+  int lowest = 0x7fffffff;
+  int highest = -0x7fffffff;
+  int widest = -1;
+  int numerator = scale_info->scale_numerator;
+  int denominator = scale_info->scale_denominator;
+  int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < num_contributors ) );
+  float * coeffs;
+  stbir__contributors * contribs;
+
+  // weight all the coeffs for each sample
+  coeffs = coefficient_group;
+  contribs = contributors;
+  end = num_contributors; if ( polyphase ) end = numerator;
+  for (n = 0; n < end; n++)
+  {
+    int i;
+    float filter_scale, total_filter = 0;
+    int e;
+
+    // add all contribs
+    e = contribs->n1 - contribs->n0;
+    for( i = 0 ; i <= e ; i++ )
+    {
+      total_filter += coeffs[i];
+      STBIR_ASSERT( ( coeffs[i] >= -2.0f ) && ( coeffs[i] <= 2.0f )  ); // check for wonky weights
+    }
+
+    // rescale
+    if ( ( total_filter < stbir__small_float ) && ( total_filter > -stbir__small_float ) )
+    {
+      // all coeffs are extremely small, just zero it
+      contribs->n1 = contribs->n0;
+      coeffs[0] = 0.0f;
+    }
+    else
+    {
+      // if the total isn't 1.0, rescale everything
+      if ( ( total_filter < (1.0f-stbir__small_float) ) || ( total_filter > (1.0f+stbir__small_float) ) )
+      {
+        filter_scale = 1.0f / total_filter;
+        // scale them all
+        for (i = 0; i <= e; i++)
+          coeffs[i] *= filter_scale;
+      }
+    }
+    ++contribs;
+    coeffs += coefficient_width;
+  }
+
+  // if we have a rational for the scale, we can exploit the polyphaseness to not calculate
+  //   most of the coefficients, so we copy them here
+  if ( polyphase )
+  {
+    stbir__contributors * prev_contribs = contributors;
+    stbir__contributors * cur_contribs = contributors + numerator;
+
+    for( n = numerator ; n < num_contributors ; n++ )
+    {
+      cur_contribs->n0 = prev_contribs->n0 + denominator;
+      cur_contribs->n1 = prev_contribs->n1 + denominator;
+      ++cur_contribs;
+      ++prev_contribs;
+    }
+    stbir_overlapping_memcpy( coefficient_group + numerator * coefficient_width, coefficient_group, ( num_contributors - numerator ) * coefficient_width * sizeof( coeffs[ 0 ] ) );
+  }
+
+  coeffs = coefficient_group;
+  contribs = contributors;
+  for (n = 0; n < num_contributors; n++)
+  {
+    int i;
+
+    // in zero edge mode, just remove out of bounds contribs completely (since their weights are accounted for now)
+    if ( edge == STBIR_EDGE_ZERO )
+    {
+      // shrink the right side if necessary
+      if ( contribs->n1 > input_last_n1 )
+        contribs->n1 = input_last_n1;
+
+      // shrink the left side
+      if ( contribs->n0 < 0 )
+      {
+        int j, left, skips = 0;
+
+        skips = -contribs->n0;
+        contribs->n0 = 0;
+
+        // now move down the weights
+        left = contribs->n1 - contribs->n0 + 1;
+        if ( left > 0 )
+        {
+          for( j = 0 ; j < left ; j++ )
+            coeffs[ j ] = coeffs[ j + skips ];
+        }
+      }
+    }
+    else if ( ( edge == STBIR_EDGE_CLAMP ) || ( edge == STBIR_EDGE_REFLECT ) )
+    {
+      // for clamp and reflect, calculate the true inbounds position (based on edge type) and just add that to the existing weight
+      
+      // right hand side first
+      if ( contribs->n1 > input_last_n1 )
+      {
+        int start = contribs->n0;
+        int endi = contribs->n1;
+        contribs->n1 = input_last_n1;  
+        for( i = input_size; i <= endi; i++ )
+          stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), coeffs[i-start] );
+      }
+
+      // now check left hand edge
+      if ( contribs->n0 < 0 )
+      {
+        int save_n0;
+        float save_n0_coeff;
+        float * c = coeffs - ( contribs->n0 + 1 );
+        
+        // reinsert the coeffs with it reflected or clamped (insert accumulates, if the coeffs exist)
+        for( i = -1 ; i > contribs->n0 ; i-- ) 
+          stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), *c-- );
+        save_n0 = contribs->n0;
+        save_n0_coeff = c[0]; // save it, since we didn't do the final one (i==n0), because there might be too many coeffs to hold (before we resize)!
+
+        // now slide all the coeffs down (since we have accumulated them in the positive contribs) and reset the first contrib
+        contribs->n0 = 0;  
+        for(i = 0 ; i <= contribs->n1 ; i++ )
+          coeffs[i] = coeffs[i-save_n0];
+        
+        // now that we have shrunk down the contribs, we insert the first one safely
+        stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( save_n0, input_size ), save_n0_coeff );
+      }
+    }
+
+    if ( contribs->n0 <= contribs->n1 )
+    {
+      int diff = contribs->n1 - contribs->n0 + 1;
+      while ( diff && ( coeffs[ diff-1 ] == 0.0f ) )
+        --diff;
+      contribs->n1 = contribs->n0 + diff - 1;
+
+      if ( contribs->n0 <= contribs->n1 )
+      {
+        if ( contribs->n0 < lowest )
+          lowest = contribs->n0;
+        if ( contribs->n1 > highest )
+          highest = contribs->n1;
+        if ( diff > widest )
+          widest = diff;
+      }
+
+      // re-zero out unused coefficients (if any)
+      for( i = diff ; i < coefficient_width ; i++ )
+        coeffs[i] = 0.0f;
+    }
+
+    ++contribs;
+    coeffs += coefficient_width;
+  }
+  filter_info->lowest = lowest;
+  filter_info->highest = highest;
+  filter_info->widest = widest;
+}
+
+static int stbir__pack_coefficients( int num_contributors, stbir__contributors* contributors, float * coefficents, int coefficient_width, int widest, int row_width )
+{
+  #define STBIR_MOVE_1( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint32*)(dest))[0] = ((stbir_uint32*)(src))[0]; }
+  #define STBIR_MOVE_2( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; }
+  #ifdef STBIR_SIMD
+  #define STBIR_MOVE_4( dest, src ) { stbir__simdf t; STBIR_NO_UNROLL(dest); stbir__simdf_load( t, src ); stbir__simdf_store( dest, t ); }
+  #else
+  #define STBIR_MOVE_4( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; ((stbir_uint64*)(dest))[1] = ((stbir_uint64*)(src))[1]; }
+  #endif
+  if ( coefficient_width != widest )
+  {
+    float * pc = coefficents;
+    float * coeffs = coefficents;
+    float * pc_end = coefficents + num_contributors * widest;
+    switch( widest )
+    {
+      case 1:
+        do {
+          STBIR_MOVE_1( pc, coeffs );
+          ++pc;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 2:
+        do {
+          STBIR_MOVE_2( pc, coeffs );
+          pc += 2;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 3:
+        do {
+          STBIR_MOVE_2( pc, coeffs );
+          STBIR_MOVE_1( pc+2, coeffs+2 );
+          pc += 3;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 4:
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          pc += 4;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 5:
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          STBIR_MOVE_1( pc+4, coeffs+4 );
+          pc += 5;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 6:
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          STBIR_MOVE_2( pc+4, coeffs+4 );
+          pc += 6;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 7:
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          STBIR_MOVE_2( pc+4, coeffs+4 );
+          STBIR_MOVE_1( pc+6, coeffs+6 );
+          pc += 7;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 8:
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          STBIR_MOVE_4( pc+4, coeffs+4 );
+          pc += 8;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 9:
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          STBIR_MOVE_4( pc+4, coeffs+4 );
+          STBIR_MOVE_1( pc+8, coeffs+8 );
+          pc += 9;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 10:
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          STBIR_MOVE_4( pc+4, coeffs+4 );
+          STBIR_MOVE_2( pc+8, coeffs+8 );
+          pc += 10;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 11:
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          STBIR_MOVE_4( pc+4, coeffs+4 );
+          STBIR_MOVE_2( pc+8, coeffs+8 );
+          STBIR_MOVE_1( pc+10, coeffs+10 );
+          pc += 11;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 12:
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          STBIR_MOVE_4( pc+4, coeffs+4 );
+          STBIR_MOVE_4( pc+8, coeffs+8 );
+          pc += 12;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      default:
+        do {
+          float * copy_end = pc + widest - 4;
+          float * c = coeffs;
+          do {
+            STBIR_NO_UNROLL( pc );
+            STBIR_MOVE_4( pc, c );
+            pc += 4;
+            c += 4;
+          } while ( pc <= copy_end );
+          copy_end += 4;
+          while ( pc < copy_end )
+          {
+            STBIR_MOVE_1( pc, c );
+            ++pc; ++c;
+          }
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+    }
+  }
+
+  // some horizontal routines read one float off the end (which is then masked off), so put in a sentinal so we don't read an snan or denormal
+  coefficents[ widest * num_contributors ] = 8888.0f;
+
+  // the minimum we might read for unrolled filters widths is 12. So, we need to
+  //   make sure we never read outside the decode buffer, by possibly moving 
+  //   the sample area back into the scanline, and putting zeros weights first.
+  // we start on the right edge and check until we're well past the possible
+  //   clip area (2*widest).
+  {
+    stbir__contributors * contribs = contributors + num_contributors - 1;
+    float * coeffs = coefficents + widest * ( num_contributors - 1 );
+
+    // go until no chance of clipping (this is usually less than 8 lops)
+    while ( ( ( contribs->n0 + widest*2 ) >= row_width ) && ( contribs >= contributors ) )
+    {
+      // might we clip??
+      if ( ( contribs->n0 + widest ) > row_width )
+      {
+        int stop_range = widest;
+      
+        // if range is larger than 12, it will be handled by generic loops that can terminate on the exact length
+        //   of this contrib n1, instead of a fixed widest amount - so calculate this
+        if ( widest > 12 )
+        {
+          int mod;
+
+          // how far will be read in the n_coeff loop (which depends on the widest count mod4);
+          mod = widest & 3;
+          stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod; 
+
+          // the n_coeff loops do a minimum amount of coeffs, so factor that in!
+          if ( stop_range < ( 8 + mod ) ) stop_range = 8 + mod;
+        }
+
+        // now see if we still clip with the refined range
+        if ( ( contribs->n0 + stop_range ) > row_width )
+        {
+          int new_n0 = row_width - stop_range;
+          int num = contribs->n1 - contribs->n0 + 1;
+          int backup = contribs->n0 - new_n0;
+          float * from_co = coeffs + num - 1;
+          float * to_co = from_co + backup;
+
+          STBIR_ASSERT( ( new_n0 >= 0 ) && ( new_n0 < contribs->n0 ) );
+
+          // move the coeffs over
+          while( num )
+          {
+            *to_co-- = *from_co--;
+            --num;
+          }
+          // zero new positions
+          while ( to_co >= coeffs )
+            *to_co-- = 0;
+          // set new start point
+          contribs->n0 = new_n0;
+          if ( widest > 12 )
+          {
+            int mod;
+
+            // how far will be read in the n_coeff loop (which depends on the widest count mod4);
+            mod = widest & 3;
+            stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod; 
+
+            // the n_coeff loops do a minimum amount of coeffs, so factor that in!
+            if ( stop_range < ( 8 + mod ) ) stop_range = 8 + mod;
+          }
+        }
+      }
+      --contribs;
+      coeffs -= widest;
+    }
+  }
+
+  return widest;
+  #undef STBIR_MOVE_1
+  #undef STBIR_MOVE_2
+  #undef STBIR_MOVE_4
+}
+
+static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * other_axis_for_pivot, void * user_data STBIR_ONLY_PROFILE_BUILD_GET_INFO )
+{
+  int n;
+  float scale = samp->scale_info.scale;
+  stbir__kernel_callback * kernel = samp->filter_kernel;
+  stbir__support_callback * support = samp->filter_support;
+  float inv_scale = samp->scale_info.inv_scale;
+  int input_full_size = samp->scale_info.input_full_size;
+  int gather_num_contributors = samp->num_contributors;
+  stbir__contributors* gather_contributors = samp->contributors;
+  float * gather_coeffs = samp->coefficients; 
+  int gather_coefficient_width = samp->coefficient_width;
+
+  switch ( samp->is_gather )
+  {
+    case 1: // gather upsample
+    {
+      float out_pixels_radius = support(inv_scale,user_data) * scale;
+
+      stbir__calculate_coefficients_for_gather_upsample( out_pixels_radius, kernel, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width, samp->edge, user_data );
+
+      STBIR_PROFILE_BUILD_START( cleanup );
+      stbir__cleanup_gathered_coefficients( samp->edge, &samp->extent_info, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width );
+      STBIR_PROFILE_BUILD_END( cleanup );
+    }
+    break;
+
+    case 0: // scatter downsample (only on vertical)
+    case 2: // gather downsample  
+    {
+      float in_pixels_radius = support(scale,user_data) * inv_scale;
+      int filter_pixel_margin = samp->filter_pixel_margin;
+      int input_end = input_full_size + filter_pixel_margin;
+      
+      // if this is a scatter, we do a downsample gather to get the coeffs, and then pivot after
+      if ( !samp->is_gather )
+      {
+        // check if we are using the same gather downsample on the horizontal as this vertical, 
+        //   if so, then we don't have to generate them, we can just pivot from the horizontal.
+        if ( other_axis_for_pivot )
+        {
+          gather_contributors = other_axis_for_pivot->contributors;
+          gather_coeffs = other_axis_for_pivot->coefficients;
+          gather_coefficient_width = other_axis_for_pivot->coefficient_width;
+          gather_num_contributors = other_axis_for_pivot->num_contributors;
+          samp->extent_info.lowest = other_axis_for_pivot->extent_info.lowest;
+          samp->extent_info.highest = other_axis_for_pivot->extent_info.highest;
+          samp->extent_info.widest = other_axis_for_pivot->extent_info.widest;
+          goto jump_right_to_pivot;
+        }
+
+        gather_contributors = samp->gather_prescatter_contributors;
+        gather_coeffs = samp->gather_prescatter_coefficients;
+        gather_coefficient_width = samp->gather_prescatter_coefficient_width;
+        gather_num_contributors = samp->gather_prescatter_num_contributors;
+      }
+
+      stbir__calculate_coefficients_for_gather_downsample( -filter_pixel_margin, input_end, in_pixels_radius, kernel, &samp->scale_info, gather_coefficient_width, gather_num_contributors, gather_contributors, gather_coeffs, user_data );
+
+      STBIR_PROFILE_BUILD_START( cleanup );
+      stbir__cleanup_gathered_coefficients( samp->edge, &samp->extent_info, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width );
+      STBIR_PROFILE_BUILD_END( cleanup );
+
+      if ( !samp->is_gather )
+      {
+        // if this is a scatter (vertical only), then we need to pivot the coeffs
+        stbir__contributors * scatter_contributors;
+        int highest_set;
+
+        jump_right_to_pivot:
+
+        STBIR_PROFILE_BUILD_START( pivot );
+
+        highest_set = (-filter_pixel_margin) - 1;
+        for (n = 0; n < gather_num_contributors; n++)
+        {
+          int k;
+          int gn0 = gather_contributors->n0, gn1 = gather_contributors->n1;
+          int scatter_coefficient_width = samp->coefficient_width;
+          float * scatter_coeffs = samp->coefficients + ( gn0 + filter_pixel_margin ) * scatter_coefficient_width;
+          float * g_coeffs = gather_coeffs;
+          scatter_contributors = samp->contributors + ( gn0 + filter_pixel_margin );
+          
+          for (k = gn0 ; k <= gn1 ; k++ )
+          {
+            float gc = *g_coeffs++;
+            if ( ( k > highest_set ) || ( scatter_contributors->n0 > scatter_contributors->n1 ) )
+            {
+              {
+                // if we are skipping over several contributors, we need to clear the skipped ones
+                stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1);
+                while ( clear_contributors < scatter_contributors )
+                {
+                  clear_contributors->n0 = 0; 
+                  clear_contributors->n1 = -1;
+                  ++clear_contributors;
+                }
+              }
+              scatter_contributors->n0 = n;
+              scatter_contributors->n1 = n;
+              scatter_coeffs[0]  = gc;
+              highest_set = k;
+            }
+            else
+            {
+              stbir__insert_coeff( scatter_contributors, scatter_coeffs, n, gc );
+            }
+            ++scatter_contributors;
+            scatter_coeffs += scatter_coefficient_width;
+          }
+
+          ++gather_contributors;
+          gather_coeffs += gather_coefficient_width;
+        }
+
+        // now clear any unset contribs
+        {
+          stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1);
+          stbir__contributors * end_contributors = samp->contributors + samp->num_contributors;
+          while ( clear_contributors < end_contributors )
+          {
+            clear_contributors->n0 = 0;
+            clear_contributors->n1 = -1;
+            ++clear_contributors;
+          }
+        }
+
+        STBIR_PROFILE_BUILD_END( pivot );
+      }
+    }
+    break;
+  }
+}
+
+
+//========================================================================================================
+// scanline decoders and encoders
+
+#define stbir__coder_min_num 1
+#define STB_IMAGE_RESIZE_DO_CODERS
+#include STBIR__HEADER_FILENAME
+
+#define stbir__decode_suffix BGRA
+#define stbir__decode_swizzle
+#define stbir__decode_order0  2 
+#define stbir__decode_order1  1
+#define stbir__decode_order2  0
+#define stbir__decode_order3  3
+#define stbir__encode_order0  2 
+#define stbir__encode_order1  1
+#define stbir__encode_order2  0
+#define stbir__encode_order3  3
+#define stbir__coder_min_num 4
+#define STB_IMAGE_RESIZE_DO_CODERS
+#include STBIR__HEADER_FILENAME
+
+#define stbir__decode_suffix ARGB
+#define stbir__decode_swizzle
+#define stbir__decode_order0  1 
+#define stbir__decode_order1  2
+#define stbir__decode_order2  3
+#define stbir__decode_order3  0
+#define stbir__encode_order0  3 
+#define stbir__encode_order1  0
+#define stbir__encode_order2  1
+#define stbir__encode_order3  2
+#define stbir__coder_min_num 4
+#define STB_IMAGE_RESIZE_DO_CODERS
+#include STBIR__HEADER_FILENAME
+
+#define stbir__decode_suffix ABGR
+#define stbir__decode_swizzle
+#define stbir__decode_order0  3 
+#define stbir__decode_order1  2
+#define stbir__decode_order2  1
+#define stbir__decode_order3  0
+#define stbir__encode_order0  3 
+#define stbir__encode_order1  2
+#define stbir__encode_order2  1
+#define stbir__encode_order3  0
+#define stbir__coder_min_num 4
+#define STB_IMAGE_RESIZE_DO_CODERS
+#include STBIR__HEADER_FILENAME
+
+#define stbir__decode_suffix AR
+#define stbir__decode_swizzle
+#define stbir__decode_order0  1 
+#define stbir__decode_order1  0 
+#define stbir__decode_order2  3
+#define stbir__decode_order3  2
+#define stbir__encode_order0  1 
+#define stbir__encode_order1  0 
+#define stbir__encode_order2  3
+#define stbir__encode_order3  2
+#define stbir__coder_min_num 2
+#define STB_IMAGE_RESIZE_DO_CODERS
+#include STBIR__HEADER_FILENAME
+
+
+// fancy alpha means we expand to keep both premultipied and non-premultiplied color channels
+static void stbir__fancy_alpha_weight_4ch( float * out_buffer, int width_times_channels )
+{
+  float STBIR_STREAMOUT_PTR(*) out = out_buffer;
+  float const * end_decode = out_buffer + ( width_times_channels / 4 ) * 7;  // decode buffer aligned to end of out_buffer
+  float STBIR_STREAMOUT_PTR(*) decode = (float*)end_decode - width_times_channels;
+
+  // fancy alpha is stored internally as R G B A Rpm Gpm Bpm
+
+  #ifdef STBIR_SIMD
+  
+  #ifdef STBIR_SIMD8
+  decode += 16;
+  while ( decode <= end_decode )
+  {
+    stbir__simdf8 d0,d1,a0,a1,p0,p1;
+    STBIR_NO_UNROLL(decode);
+    stbir__simdf8_load( d0, decode-16 );
+    stbir__simdf8_load( d1, decode-16+8 );
+    stbir__simdf8_0123to33333333( a0, d0 );
+    stbir__simdf8_0123to33333333( a1, d1 );
+    stbir__simdf8_mult( p0, a0, d0 );
+    stbir__simdf8_mult( p1, a1, d1 );
+    stbir__simdf8_bot4s( a0, d0, p0 );
+    stbir__simdf8_bot4s( a1, d1, p1 );
+    stbir__simdf8_top4s( d0, d0, p0 );
+    stbir__simdf8_top4s( d1, d1, p1 );
+    stbir__simdf8_store ( out, a0 );
+    stbir__simdf8_store ( out+7, d0 );
+    stbir__simdf8_store ( out+14, a1 );
+    stbir__simdf8_store ( out+21, d1 );
+    decode += 16;
+    out += 28;
+  }
+  decode -= 16;
+  #else  
+  decode += 8;
+  while ( decode <= end_decode )
+  {
+    stbir__simdf d0,a0,d1,a1,p0,p1;
+    STBIR_NO_UNROLL(decode);
+    stbir__simdf_load( d0, decode-8 );
+    stbir__simdf_load( d1, decode-8+4 );
+    stbir__simdf_0123to3333( a0, d0 );
+    stbir__simdf_0123to3333( a1, d1 );
+    stbir__simdf_mult( p0, a0, d0 );
+    stbir__simdf_mult( p1, a1, d1 );
+    stbir__simdf_store ( out, d0 );
+    stbir__simdf_store ( out+4, p0 );
+    stbir__simdf_store ( out+7, d1 );
+    stbir__simdf_store ( out+7+4, p1 );
+    decode += 8;
+    out += 14;
+  }
+  decode -= 8;
+  #endif
+
+  // might be one last odd pixel
+  #ifdef STBIR_SIMD8
+  while ( decode < end_decode )
+  #else
+  if ( decode < end_decode )
+  #endif
+  {
+    stbir__simdf d,a,p;
+    stbir__simdf_load( d, decode );
+    stbir__simdf_0123to3333( a, d );
+    stbir__simdf_mult( p, a, d );
+    stbir__simdf_store ( out, d );
+    stbir__simdf_store ( out+4, p );
+    decode += 4;
+    out += 7;
+  }
+
+  #else
+
+  while( decode < end_decode )
+  {
+    float r = decode[0], g = decode[1], b = decode[2], alpha = decode[3];
+    out[0] = r;
+    out[1] = g;
+    out[2] = b;
+    out[3] = alpha;
+    out[4] = r * alpha;
+    out[5] = g * alpha;
+    out[6] = b * alpha;
+    out += 7;
+    decode += 4;
+  }
+
+  #endif
+}
+
+static void stbir__fancy_alpha_weight_2ch( float * out_buffer, int width_times_channels )
+{
+  float STBIR_STREAMOUT_PTR(*) out = out_buffer;
+  float const * end_decode = out_buffer + ( width_times_channels / 2 ) * 3;
+  float STBIR_STREAMOUT_PTR(*) decode = (float*)end_decode - width_times_channels;
+
+  //  for fancy alpha, turns into: [X A Xpm][X A Xpm],etc
+
+  #ifdef STBIR_SIMD
+
+  decode += 8;
+  if ( decode <= end_decode )
+  {
+    do {
+      #ifdef STBIR_SIMD8
+      stbir__simdf8 d0,a0,p0;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdf8_load( d0, decode-8 );
+      stbir__simdf8_0123to11331133( p0, d0 );
+      stbir__simdf8_0123to00220022( a0, d0 );
+      stbir__simdf8_mult( p0, p0, a0 );
+ 
+      stbir__simdf_store2( out, stbir__if_simdf8_cast_to_simdf4( d0 ) );
+      stbir__simdf_store( out+2, stbir__if_simdf8_cast_to_simdf4( p0 ) );
+      stbir__simdf_store2h( out+3, stbir__if_simdf8_cast_to_simdf4( d0 ) );
+      
+      stbir__simdf_store2( out+6, stbir__simdf8_gettop4( d0 ) );
+      stbir__simdf_store( out+8, stbir__simdf8_gettop4( p0 ) );
+      stbir__simdf_store2h( out+9, stbir__simdf8_gettop4( d0 ) );
+      #else
+      stbir__simdf d0,a0,d1,a1,p0,p1;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdf_load( d0, decode-8 );
+      stbir__simdf_load( d1, decode-8+4 );
+      stbir__simdf_0123to1133( p0, d0 );
+      stbir__simdf_0123to1133( p1, d1 );
+      stbir__simdf_0123to0022( a0, d0 );
+      stbir__simdf_0123to0022( a1, d1 );
+      stbir__simdf_mult( p0, p0, a0 );
+      stbir__simdf_mult( p1, p1, a1 );
+
+      stbir__simdf_store2( out, d0 );
+      stbir__simdf_store( out+2, p0 );
+      stbir__simdf_store2h( out+3, d0 );
+
+      stbir__simdf_store2( out+6, d1 );
+      stbir__simdf_store( out+8, p1 );
+      stbir__simdf_store2h( out+9, d1 );
+      #endif
+      decode += 8;
+      out += 12;
+    } while ( decode <= end_decode );
+  }
+  decode -= 8;
+  #endif
+
+  while( decode < end_decode )
+  {
+    float x = decode[0], y = decode[1];
+    STBIR_SIMD_NO_UNROLL(decode);
+    out[0] = x;
+    out[1] = y;
+    out[2] = x * y;
+    out += 3;
+    decode += 2;
+  }
+}
+
+static void stbir__fancy_alpha_unweight_4ch( float * encode_buffer, int width_times_channels )
+{
+  float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
+  float STBIR_SIMD_STREAMOUT_PTR(*) input = encode_buffer;
+  float const * end_output = encode_buffer + width_times_channels;
+
+  // fancy RGBA is stored internally as R G B A Rpm Gpm Bpm
+
+  do {
+    float alpha = input[3];
+#ifdef STBIR_SIMD
+    stbir__simdf i,ia;
+    STBIR_SIMD_NO_UNROLL(encode);
+    if ( alpha < stbir__small_float )
+    {
+      stbir__simdf_load( i, input );
+      stbir__simdf_store( encode, i );
+    }
+    else
+    {
+      stbir__simdf_load1frep4( ia, 1.0f / alpha );
+      stbir__simdf_load( i, input+4 );
+      stbir__simdf_mult( i, i, ia );
+      stbir__simdf_store( encode, i );
+      encode[3] = alpha;
+    }
+#else
+    if ( alpha < stbir__small_float )
+    {
+      encode[0] = input[0];
+      encode[1] = input[1];
+      encode[2] = input[2];
+    }
+    else
+    {
+      float ialpha = 1.0f / alpha;
+      encode[0] = input[4] * ialpha;
+      encode[1] = input[5] * ialpha;
+      encode[2] = input[6] * ialpha;
+    }
+    encode[3] = alpha;
+#endif
+
+    input += 7;
+    encode += 4;
+  } while ( encode < end_output );
+}
+
+//  format: [X A Xpm][X A Xpm] etc
+static void stbir__fancy_alpha_unweight_2ch( float * encode_buffer, int width_times_channels )
+{
+  float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
+  float STBIR_SIMD_STREAMOUT_PTR(*) input = encode_buffer;
+  float const * end_output = encode_buffer + width_times_channels;
+
+  do {
+    float alpha = input[1];
+    encode[0] = input[0];
+    if ( alpha >= stbir__small_float )
+      encode[0] = input[2] / alpha;
+    encode[1] = alpha;
+
+    input += 3;
+    encode += 2;
+  } while ( encode < end_output );
+}
+
+static void stbir__simple_alpha_weight_4ch( float * decode_buffer, int width_times_channels )
+{
+  float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
+  float const * end_decode = decode_buffer + width_times_channels;
+
+  #ifdef STBIR_SIMD
+  {
+    decode += 2 * stbir__simdfX_float_count;
+    while ( decode <= end_decode )
+    {
+      stbir__simdfX d0,a0,d1,a1;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdfX_load( d0, decode-2*stbir__simdfX_float_count );
+      stbir__simdfX_load( d1, decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count );
+      stbir__simdfX_aaa1( a0, d0, STBIR_onesX );
+      stbir__simdfX_aaa1( a1, d1, STBIR_onesX );
+      stbir__simdfX_mult( d0, d0, a0 );
+      stbir__simdfX_mult( d1, d1, a1 );
+      stbir__simdfX_store ( decode-2*stbir__simdfX_float_count, d0 );
+      stbir__simdfX_store ( decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count, d1 );
+      decode += 2 * stbir__simdfX_float_count;
+    }
+    decode -= 2 * stbir__simdfX_float_count;
+
+    // few last pixels remnants
+    #ifdef STBIR_SIMD8
+    while ( decode < end_decode )
+    #else
+    if ( decode < end_decode )
+    #endif
+    {
+      stbir__simdf d,a;
+      stbir__simdf_load( d, decode );
+      stbir__simdf_aaa1( a, d, STBIR__CONSTF(STBIR_ones) );
+      stbir__simdf_mult( d, d, a );
+      stbir__simdf_store ( decode, d );
+      decode += 4;
+    }
+  }
+
+  #else
+
+  while( decode < end_decode )
+  {
+    float alpha = decode[3];
+    decode[0] *= alpha;
+    decode[1] *= alpha;
+    decode[2] *= alpha;
+    decode += 4;
+  }
+
+  #endif
+}
+
+static void stbir__simple_alpha_weight_2ch( float * decode_buffer, int width_times_channels )
+{
+  float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
+  float const * end_decode = decode_buffer + width_times_channels;
+
+  #ifdef STBIR_SIMD
+  decode += 2 * stbir__simdfX_float_count;
+  while ( decode <= end_decode )
+  {
+    stbir__simdfX d0,a0,d1,a1;
+    STBIR_NO_UNROLL(decode);
+    stbir__simdfX_load( d0, decode-2*stbir__simdfX_float_count );
+    stbir__simdfX_load( d1, decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count );
+    stbir__simdfX_a1a1( a0, d0, STBIR_onesX );
+    stbir__simdfX_a1a1( a1, d1, STBIR_onesX );
+    stbir__simdfX_mult( d0, d0, a0 );
+    stbir__simdfX_mult( d1, d1, a1 );
+    stbir__simdfX_store ( decode-2*stbir__simdfX_float_count, d0 );
+    stbir__simdfX_store ( decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count, d1 );
+    decode += 2 * stbir__simdfX_float_count;
+  }
+  decode -= 2 * stbir__simdfX_float_count;
+  #endif
+
+  while( decode < end_decode )
+  {
+    float alpha = decode[1];
+    STBIR_SIMD_NO_UNROLL(decode);
+    decode[0] *= alpha;
+    decode += 2;
+  }
+}
+
+static void stbir__simple_alpha_unweight_4ch( float * encode_buffer, int width_times_channels )
+{
+  float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
+  float const * end_output = encode_buffer + width_times_channels;
+
+  do {
+    float alpha = encode[3];
+
+#ifdef STBIR_SIMD
+    stbir__simdf i,ia;
+    STBIR_SIMD_NO_UNROLL(encode);
+    if ( alpha >= stbir__small_float )
+    {
+      stbir__simdf_load1frep4( ia, 1.0f / alpha );
+      stbir__simdf_load( i, encode );
+      stbir__simdf_mult( i, i, ia );
+      stbir__simdf_store( encode, i );
+      encode[3] = alpha;
+    }
+#else
+    if ( alpha >= stbir__small_float )
+    {
+      float ialpha = 1.0f / alpha;
+      encode[0] *= ialpha;
+      encode[1] *= ialpha;
+      encode[2] *= ialpha;
+    }
+#endif
+    encode += 4;
+  } while ( encode < end_output );
+}
+
+static void stbir__simple_alpha_unweight_2ch( float * encode_buffer, int width_times_channels )
+{
+  float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
+  float const * end_output = encode_buffer + width_times_channels;
+
+  do {
+    float alpha = encode[1];
+    if ( alpha >= stbir__small_float )
+      encode[0] /= alpha;
+    encode += 2;
+  } while ( encode < end_output );
+}
+
+
+// only used in RGB->BGR or BGR->RGB
+static void stbir__simple_flip_3ch( float * decode_buffer, int width_times_channels )
+{
+  float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
+  float const * end_decode = decode_buffer + width_times_channels;
+
+  decode += 12;
+  while( decode <= end_decode )
+  {
+    float t0,t1,t2,t3;
+    STBIR_NO_UNROLL(decode);
+    t0 = decode[0]; t1 = decode[3]; t2 = decode[6]; t3 = decode[9];
+    decode[0] = decode[2]; decode[3] = decode[5]; decode[6] = decode[8]; decode[9] = decode[11];
+    decode[2] = t0; decode[5] = t1; decode[8] = t2; decode[11] = t3;
+    decode += 12;
+  }
+  decode -= 12;
+
+  while( decode < end_decode )
+  {
+    float t = decode[0];
+    STBIR_NO_UNROLL(decode);
+    decode[0] = decode[2];
+    decode[2] = t;
+    decode += 3;
+  }
+}
+
+
+
+static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float * output_buffer STBIR_ONLY_PROFILE_GET_SPLIT_INFO )
+{
+  int channels = stbir_info->channels;
+  int effective_channels = stbir_info->effective_channels;
+  int input_sample_in_bytes = stbir__type_size[stbir_info->input_type] * channels;
+  stbir_edge edge_horizontal = stbir_info->horizontal.edge;
+  stbir_edge edge_vertical = stbir_info->vertical.edge;
+  int row = stbir__edge_wrap(edge_vertical, n, stbir_info->vertical.scale_info.input_full_size);
+  const void* input_plane_data = ( (char *) stbir_info->input_data ) + (ptrdiff_t)row * (ptrdiff_t) stbir_info->input_stride_bytes;
+  stbir__span const * spans = stbir_info->scanline_extents.spans;
+  float* full_decode_buffer = output_buffer - stbir_info->scanline_extents.conservative.n0 * effective_channels;
+
+  // if we are on edge_zero, and we get in here with an out of bounds n, then the calculate filters has failed
+  STBIR_ASSERT( !(edge_vertical == STBIR_EDGE_ZERO && (n < 0 || n >= stbir_info->vertical.scale_info.input_full_size)) );
+
+  do 
+  {
+    float * decode_buffer;
+    void const * input_data;
+    float * end_decode;
+    int width_times_channels;
+    int width;
+
+    if ( spans->n1 < spans->n0 )    
+      break;
+
+    width = spans->n1 + 1 - spans->n0;
+    decode_buffer = full_decode_buffer + spans->n0 * effective_channels;
+    end_decode = full_decode_buffer + ( spans->n1 + 1 ) * effective_channels;
+    width_times_channels = width * channels;
+
+    // read directly out of input plane by default
+    input_data = ( (char*)input_plane_data ) + spans->pixel_offset_for_input * input_sample_in_bytes;
+
+    // if we have an input callback, call it to get the input data
+    if ( stbir_info->in_pixels_cb )
+    {
+      // call the callback with a temp buffer (that they can choose to use or not).  the temp is just right aligned memory in the decode_buffer itself
+      input_data = stbir_info->in_pixels_cb( ( (char*) end_decode ) - ( width * input_sample_in_bytes ), input_plane_data, width, spans->pixel_offset_for_input, row, stbir_info->user_data );
+    }
+    
+    STBIR_PROFILE_START( decode );
+    // convert the pixels info the float decode_buffer, (we index from end_decode, so that when channels<effective_channels, we are right justified in the buffer)
+    stbir_info->decode_pixels( (float*)end_decode - width_times_channels, width_times_channels, input_data );
+    STBIR_PROFILE_END( decode );
+
+    if (stbir_info->alpha_weight)
+    {
+      STBIR_PROFILE_START( alpha );
+      stbir_info->alpha_weight( decode_buffer, width_times_channels );
+      STBIR_PROFILE_END( alpha );
+    }
+
+    ++spans;
+  } while ( spans <= ( &stbir_info->scanline_extents.spans[1] ) );
+
+  // handle the edge_wrap filter (all other types are handled back out at the calculate_filter stage)
+  // basically the idea here is that if we have the whole scanline in memory, we don't redecode the
+  //   wrapped edge pixels, and instead just memcpy them from the scanline into the edge positions
+  if ( ( edge_horizontal == STBIR_EDGE_WRAP ) && ( stbir_info->scanline_extents.edge_sizes[0] | stbir_info->scanline_extents.edge_sizes[1] ) )
+  {
+    // this code only runs if we're in edge_wrap, and we're doing the entire scanline
+    int e, start_x[2];
+    int input_full_size = stbir_info->horizontal.scale_info.input_full_size;
+    
+    start_x[0] = -stbir_info->scanline_extents.edge_sizes[0];  // left edge start x
+    start_x[1] =  input_full_size;                             // right edge
+
+    for( e = 0; e < 2 ; e++ )
+    {
+      // do each margin
+      int margin = stbir_info->scanline_extents.edge_sizes[e];
+      if ( margin )
+      {
+        int x = start_x[e];
+        float * marg = full_decode_buffer + x * effective_channels;
+        float const * src = full_decode_buffer + stbir__edge_wrap(edge_horizontal, x, input_full_size) * effective_channels;
+        STBIR_MEMCPY( marg, src, margin * effective_channels * sizeof(float) );
+      }
+    }
+  }
+}
+
+
+//=================
+// Do 1 channel horizontal routines
+
+#ifdef STBIR_SIMD
+
+#define stbir__1_coeff_only()          \
+    stbir__simdf tot,c;                \
+    STBIR_SIMD_NO_UNROLL(decode);      \
+    stbir__simdf_load1( c, hc );       \
+    stbir__simdf_mult1_mem( tot, c, decode ); 
+
+#define stbir__2_coeff_only()          \
+    stbir__simdf tot,c,d;              \
+    STBIR_SIMD_NO_UNROLL(decode);      \
+    stbir__simdf_load2z( c, hc );      \
+    stbir__simdf_load2( d, decode );   \
+    stbir__simdf_mult( tot, c, d );    \
+    stbir__simdf_0123to1230( c, tot ); \
+    stbir__simdf_add1( tot, tot, c );          
+
+#define stbir__3_coeff_only()                  \
+    stbir__simdf tot,c,t;                      \
+    STBIR_SIMD_NO_UNROLL(decode);              \
+    stbir__simdf_load( c, hc );                \
+    stbir__simdf_mult_mem( tot, c, decode );   \
+    stbir__simdf_0123to1230( c, tot );         \
+    stbir__simdf_0123to2301( t, tot );         \
+    stbir__simdf_add1( tot, tot, c );          \
+    stbir__simdf_add1( tot, tot, t );    
+
+#define stbir__store_output_tiny()                \
+    stbir__simdf_store1( output, tot );           \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 1;
+
+#define stbir__4_coeff_start()                 \
+    stbir__simdf tot,c;                        \
+    STBIR_SIMD_NO_UNROLL(decode);              \
+    stbir__simdf_load( c, hc );                \
+    stbir__simdf_mult_mem( tot, c, decode );   \
+
+#define stbir__4_coeff_continue_from_4( ofs )  \
+    STBIR_SIMD_NO_UNROLL(decode);              \
+    stbir__simdf_load( c, hc + (ofs) );        \
+    stbir__simdf_madd_mem( tot, tot, c, decode+(ofs) ); 
+
+#define stbir__1_coeff_remnant( ofs )          \
+    { stbir__simdf d;                          \
+    stbir__simdf_load1z( c, hc + (ofs) );      \
+    stbir__simdf_load1( d, decode + (ofs) );   \
+    stbir__simdf_madd( tot, tot, d, c ); }
+
+#define stbir__2_coeff_remnant( ofs )          \
+    { stbir__simdf d;                          \
+    stbir__simdf_load2z( c, hc+(ofs) );        \
+    stbir__simdf_load2( d, decode+(ofs) );     \
+    stbir__simdf_madd( tot, tot, d, c ); }   
+
+#define stbir__3_coeff_setup()                 \
+    stbir__simdf mask;                         \
+    stbir__simdf_load( mask, STBIR_mask + 3 );
+
+#define stbir__3_coeff_remnant( ofs )                  \
+    stbir__simdf_load( c, hc+(ofs) );                  \
+    stbir__simdf_and( c, c, mask );                    \
+    stbir__simdf_madd_mem( tot, tot, c, decode+(ofs) );
+
+#define stbir__store_output()                     \
+    stbir__simdf_0123to2301( c, tot );            \
+    stbir__simdf_add( tot, tot, c );              \
+    stbir__simdf_0123to1230( c, tot );            \
+    stbir__simdf_add1( tot, tot, c );             \
+    stbir__simdf_store1( output, tot );           \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 1;
+
+#else
+
+#define stbir__1_coeff_only()  \
+    float tot;                 \
+    tot = decode[0]*hc[0];     
+
+#define stbir__2_coeff_only()  \
+    float tot;                 \
+    tot = decode[0] * hc[0];   \
+    tot += decode[1] * hc[1];    
+
+#define stbir__3_coeff_only()  \
+    float tot;                 \
+    tot = decode[0] * hc[0];   \
+    tot += decode[1] * hc[1];  \
+    tot += decode[2] * hc[2];    
+
+#define stbir__store_output_tiny()                \
+    output[0] = tot;                              \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 1;
+
+#define stbir__4_coeff_start()  \
+    float tot0,tot1,tot2,tot3;  \
+    tot0 = decode[0] * hc[0];   \
+    tot1 = decode[1] * hc[1];   \
+    tot2 = decode[2] * hc[2];   \
+    tot3 = decode[3] * hc[3];     
+
+#define stbir__4_coeff_continue_from_4( ofs )  \
+    tot0 += decode[0+(ofs)] * hc[0+(ofs)];     \
+    tot1 += decode[1+(ofs)] * hc[1+(ofs)];     \
+    tot2 += decode[2+(ofs)] * hc[2+(ofs)];     \
+    tot3 += decode[3+(ofs)] * hc[3+(ofs)];     
+
+#define stbir__1_coeff_remnant( ofs )        \
+    tot0 += decode[0+(ofs)] * hc[0+(ofs)];   
+
+#define stbir__2_coeff_remnant( ofs )        \
+    tot0 += decode[0+(ofs)] * hc[0+(ofs)];   \
+    tot1 += decode[1+(ofs)] * hc[1+(ofs)];   \
+
+#define stbir__3_coeff_remnant( ofs )        \
+    tot0 += decode[0+(ofs)] * hc[0+(ofs)];   \
+    tot1 += decode[1+(ofs)] * hc[1+(ofs)];   \
+    tot2 += decode[2+(ofs)] * hc[2+(ofs)];   
+
+#define stbir__store_output()                     \
+    output[0] = (tot0+tot2)+(tot1+tot3);          \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 1;
+
+#endif  
+
+#define STBIR__horizontal_channels 1
+#define STB_IMAGE_RESIZE_DO_HORIZONTALS
+#include STBIR__HEADER_FILENAME
+
+
+//=================
+// Do 2 channel horizontal routines
+
+#ifdef STBIR_SIMD
+
+#define stbir__1_coeff_only()         \
+    stbir__simdf tot,c,d;             \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    stbir__simdf_load1z( c, hc );     \
+    stbir__simdf_0123to0011( c, c );  \
+    stbir__simdf_load2( d, decode );  \
+    stbir__simdf_mult( tot, d, c ); 
+
+#define stbir__2_coeff_only()         \
+    stbir__simdf tot,c;               \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    stbir__simdf_load2( c, hc );      \
+    stbir__simdf_0123to0011( c, c );  \
+    stbir__simdf_mult_mem( tot, c, decode ); 
+
+#define stbir__3_coeff_only()                \
+    stbir__simdf tot,c,cs,d;                 \
+    STBIR_SIMD_NO_UNROLL(decode);            \
+    stbir__simdf_load( cs, hc );             \
+    stbir__simdf_0123to0011( c, cs );        \
+    stbir__simdf_mult_mem( tot, c, decode ); \
+    stbir__simdf_0123to2222( c, cs );        \
+    stbir__simdf_load2z( d, decode+4 );      \
+    stbir__simdf_madd( tot, tot, d, c );   
+
+#define stbir__store_output_tiny()                \
+    stbir__simdf_0123to2301( c, tot );            \
+    stbir__simdf_add( tot, tot, c );              \
+    stbir__simdf_store2( output, tot );           \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 2;
+
+#ifdef STBIR_SIMD8
+
+#define stbir__4_coeff_start()                    \
+    stbir__simdf8 tot0,c,cs;                      \
+    STBIR_SIMD_NO_UNROLL(decode);                 \
+    stbir__simdf8_load4b( cs, hc );               \
+    stbir__simdf8_0123to00112233( c, cs );        \
+    stbir__simdf8_mult_mem( tot0, c, decode );
+
+#define stbir__4_coeff_continue_from_4( ofs )        \
+    STBIR_SIMD_NO_UNROLL(decode);                    \
+    stbir__simdf8_load4b( cs, hc + (ofs) );          \
+    stbir__simdf8_0123to00112233( c, cs );           \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*2 ); 
+
+#define stbir__1_coeff_remnant( ofs )                \
+    { stbir__simdf t;                                \
+    stbir__simdf_load1z( t, hc + (ofs) );            \
+    stbir__simdf_0123to0011( t, t );                 \
+    stbir__simdf_mult_mem( t, t, decode+(ofs)*2 );   \
+    stbir__simdf8_add4( tot0, tot0, t ); }
+
+#define stbir__2_coeff_remnant( ofs )                \
+    { stbir__simdf t;                                \
+    stbir__simdf_load2( t, hc + (ofs) );             \
+    stbir__simdf_0123to0011( t, t );                 \
+    stbir__simdf_mult_mem( t, t, decode+(ofs)*2 );   \
+    stbir__simdf8_add4( tot0, tot0, t ); }
+
+#define stbir__3_coeff_remnant( ofs )                \
+    { stbir__simdf8 d;                               \
+    stbir__simdf8_load4b( cs, hc + (ofs) );          \
+    stbir__simdf8_0123to00112233( c, cs );           \
+    stbir__simdf8_load6z( d, decode+(ofs)*2 );       \
+    stbir__simdf8_madd( tot0, tot0, c, d ); }               
+
+#define stbir__store_output()                     \
+    { stbir__simdf t,c;                           \
+    stbir__simdf8_add4halves( t, stbir__if_simdf8_cast_to_simdf4(tot0), tot0 );    \
+    stbir__simdf_0123to2301( c, t );              \
+    stbir__simdf_add( t, t, c );                  \
+    stbir__simdf_store2( output, t );             \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 2; }
+
+#else
+
+#define stbir__4_coeff_start()                   \
+    stbir__simdf tot0,tot1,c,cs;                 \
+    STBIR_SIMD_NO_UNROLL(decode);                \
+    stbir__simdf_load( cs, hc );                 \
+    stbir__simdf_0123to0011( c, cs );            \
+    stbir__simdf_mult_mem( tot0, c, decode );    \
+    stbir__simdf_0123to2233( c, cs );            \
+    stbir__simdf_mult_mem( tot1, c, decode+4 );   
+
+#define stbir__4_coeff_continue_from_4( ofs )                \
+    STBIR_SIMD_NO_UNROLL(decode);                            \
+    stbir__simdf_load( cs, hc + (ofs) );                     \
+    stbir__simdf_0123to0011( c, cs );                        \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 );  \
+    stbir__simdf_0123to2233( c, cs );                        \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*2+4 );   
+
+#define stbir__1_coeff_remnant( ofs )            \
+    { stbir__simdf d;                            \
+    stbir__simdf_load1z( cs, hc + (ofs) );       \
+    stbir__simdf_0123to0011( c, cs );            \
+    stbir__simdf_load2( d, decode + (ofs) * 2 ); \
+    stbir__simdf_madd( tot0, tot0, d, c ); }
+
+#define stbir__2_coeff_remnant( ofs )                      \
+    stbir__simdf_load2( cs, hc + (ofs) );                  \
+    stbir__simdf_0123to0011( c, cs );                      \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 );       
+
+#define stbir__3_coeff_remnant( ofs )                       \
+    { stbir__simdf d;                                       \
+    stbir__simdf_load( cs, hc + (ofs) );                    \
+    stbir__simdf_0123to0011( c, cs );                       \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 ); \
+    stbir__simdf_0123to2222( c, cs );                       \
+    stbir__simdf_load2z( d, decode + (ofs) * 2 + 4 );       \
+    stbir__simdf_madd( tot1, tot1, d, c ); }  
+
+#define stbir__store_output()                     \
+    stbir__simdf_add( tot0, tot0, tot1 );         \
+    stbir__simdf_0123to2301( c, tot0 );           \
+    stbir__simdf_add( tot0, tot0, c );            \
+    stbir__simdf_store2( output, tot0 );          \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 2;
+
+#endif
+
+#else
+
+#define stbir__1_coeff_only()  \
+    float tota,totb,c;         \
+    c = hc[0];                 \
+    tota = decode[0]*c;        \
+    totb = decode[1]*c;     
+
+#define stbir__2_coeff_only()  \
+    float tota,totb,c;         \
+    c = hc[0];                 \
+    tota = decode[0]*c;        \
+    totb = decode[1]*c;        \
+    c = hc[1];                 \
+    tota += decode[2]*c;       \
+    totb += decode[3]*c;     
+
+// this weird order of add matches the simd
+#define stbir__3_coeff_only()  \
+    float tota,totb,c;         \
+    c = hc[0];                 \
+    tota = decode[0]*c;        \
+    totb = decode[1]*c;        \
+    c = hc[2];                 \
+    tota += decode[4]*c;       \
+    totb += decode[5]*c;       \
+    c = hc[1];                 \
+    tota += decode[2]*c;       \
+    totb += decode[3]*c;     
+
+#define stbir__store_output_tiny()                \
+    output[0] = tota;                             \
+    output[1] = totb;                             \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 2;
+
+#define stbir__4_coeff_start()      \
+    float tota0,tota1,tota2,tota3,totb0,totb1,totb2,totb3,c;  \
+    c = hc[0];                      \
+    tota0 = decode[0]*c;            \
+    totb0 = decode[1]*c;            \
+    c = hc[1];                      \
+    tota1 = decode[2]*c;            \
+    totb1 = decode[3]*c;            \
+    c = hc[2];                      \
+    tota2 = decode[4]*c;            \
+    totb2 = decode[5]*c;            \
+    c = hc[3];                      \
+    tota3 = decode[6]*c;            \
+    totb3 = decode[7]*c;     
+
+#define stbir__4_coeff_continue_from_4( ofs )  \
+    c = hc[0+(ofs)];                           \
+    tota0 += decode[0+(ofs)*2]*c;              \
+    totb0 += decode[1+(ofs)*2]*c;              \
+    c = hc[1+(ofs)];                           \
+    tota1 += decode[2+(ofs)*2]*c;              \
+    totb1 += decode[3+(ofs)*2]*c;              \
+    c = hc[2+(ofs)];                           \
+    tota2 += decode[4+(ofs)*2]*c;              \
+    totb2 += decode[5+(ofs)*2]*c;              \
+    c = hc[3+(ofs)];                           \
+    tota3 += decode[6+(ofs)*2]*c;              \
+    totb3 += decode[7+(ofs)*2]*c;     
+
+#define stbir__1_coeff_remnant( ofs )  \
+    c = hc[0+(ofs)];                   \
+    tota0 += decode[0+(ofs)*2] * c;    \
+    totb0 += decode[1+(ofs)*2] * c;   
+
+#define stbir__2_coeff_remnant( ofs )  \
+    c = hc[0+(ofs)];                   \
+    tota0 += decode[0+(ofs)*2] * c;    \
+    totb0 += decode[1+(ofs)*2] * c;    \
+    c = hc[1+(ofs)];                   \
+    tota1 += decode[2+(ofs)*2] * c;    \
+    totb1 += decode[3+(ofs)*2] * c;   
+
+#define stbir__3_coeff_remnant( ofs )  \
+    c = hc[0+(ofs)];                   \
+    tota0 += decode[0+(ofs)*2] * c;    \
+    totb0 += decode[1+(ofs)*2] * c;    \
+    c = hc[1+(ofs)];                   \
+    tota1 += decode[2+(ofs)*2] * c;    \
+    totb1 += decode[3+(ofs)*2] * c;    \
+    c = hc[2+(ofs)];                   \
+    tota2 += decode[4+(ofs)*2] * c;    \
+    totb2 += decode[5+(ofs)*2] * c;    
+
+#define stbir__store_output()                     \
+    output[0] = (tota0+tota2)+(tota1+tota3);      \
+    output[1] = (totb0+totb2)+(totb1+totb3);      \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 2;
+
+#endif  
+
+#define STBIR__horizontal_channels 2
+#define STB_IMAGE_RESIZE_DO_HORIZONTALS
+#include STBIR__HEADER_FILENAME
+
+
+//=================
+// Do 3 channel horizontal routines
+
+#ifdef STBIR_SIMD
+
+#define stbir__1_coeff_only()         \
+    stbir__simdf tot,c,d;             \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    stbir__simdf_load1z( c, hc );     \
+    stbir__simdf_0123to0001( c, c );  \
+    stbir__simdf_load( d, decode );   \
+    stbir__simdf_mult( tot, d, c ); 
+
+#define stbir__2_coeff_only()         \
+    stbir__simdf tot,c,cs,d;          \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    stbir__simdf_load2( cs, hc );     \
+    stbir__simdf_0123to0000( c, cs ); \
+    stbir__simdf_load( d, decode );   \
+    stbir__simdf_mult( tot, d, c );   \
+    stbir__simdf_0123to1111( c, cs ); \
+    stbir__simdf_load( d, decode+3 ); \
+    stbir__simdf_madd( tot, tot, d, c ); 
+
+#define stbir__3_coeff_only()            \
+    stbir__simdf tot,c,d,cs;             \
+    STBIR_SIMD_NO_UNROLL(decode);        \
+    stbir__simdf_load( cs, hc );         \
+    stbir__simdf_0123to0000( c, cs );    \
+    stbir__simdf_load( d, decode );      \
+    stbir__simdf_mult( tot, d, c );      \
+    stbir__simdf_0123to1111( c, cs );    \
+    stbir__simdf_load( d, decode+3 );    \
+    stbir__simdf_madd( tot, tot, d, c ); \
+    stbir__simdf_0123to2222( c, cs );    \
+    stbir__simdf_load( d, decode+6 );    \
+    stbir__simdf_madd( tot, tot, d, c ); 
+
+#define stbir__store_output_tiny()                \
+    stbir__simdf_store2( output, tot );           \
+    stbir__simdf_0123to2301( tot, tot );          \
+    stbir__simdf_store1( output+2, tot );         \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 3;
+
+#ifdef STBIR_SIMD8
+
+// we're loading from the XXXYYY decode by -1 to get the XXXYYY into different halves of the AVX reg fyi
+#define stbir__4_coeff_start()                     \
+    stbir__simdf8 tot0,tot1,c,cs; stbir__simdf t;  \
+    STBIR_SIMD_NO_UNROLL(decode);                  \
+    stbir__simdf8_load4b( cs, hc );                \
+    stbir__simdf8_0123to00001111( c, cs );         \
+    stbir__simdf8_mult_mem( tot0, c, decode - 1 ); \
+    stbir__simdf8_0123to22223333( c, cs );         \
+    stbir__simdf8_mult_mem( tot1, c, decode+6 - 1 );    
+
+#define stbir__4_coeff_continue_from_4( ofs )      \
+    STBIR_SIMD_NO_UNROLL(decode);                  \
+    stbir__simdf8_load4b( cs, hc + (ofs) );        \
+    stbir__simdf8_0123to00001111( c, cs );         \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 ); \
+    stbir__simdf8_0123to22223333( c, cs );         \
+    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*3 + 6 - 1 );    
+
+#define stbir__1_coeff_remnant( ofs )                          \
+    STBIR_SIMD_NO_UNROLL(decode);                              \
+    stbir__simdf_load1rep4( t, hc + (ofs) );                   \
+    stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*3 - 1 ); 
+
+#define stbir__2_coeff_remnant( ofs )                          \
+    STBIR_SIMD_NO_UNROLL(decode);                              \
+    stbir__simdf8_load4b( cs, hc + (ofs) - 2 );                \
+    stbir__simdf8_0123to22223333( c, cs );                     \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 );   
+ 
+ #define stbir__3_coeff_remnant( ofs )                           \
+    STBIR_SIMD_NO_UNROLL(decode);                                \
+    stbir__simdf8_load4b( cs, hc + (ofs) );                      \
+    stbir__simdf8_0123to00001111( c, cs );                       \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 ); \
+    stbir__simdf8_0123to2222( t, cs );                           \
+    stbir__simdf8_madd_mem4( tot1, tot1, t, decode+(ofs)*3 + 6 - 1 ); 
+
+#define stbir__store_output()                       \
+    stbir__simdf8_add( tot0, tot0, tot1 );          \
+    stbir__simdf_0123to1230( t, stbir__if_simdf8_cast_to_simdf4( tot0 ) ); \
+    stbir__simdf8_add4halves( t, t, tot0 );         \
+    horizontal_coefficients += coefficient_width;   \
+    ++horizontal_contributors;                      \
+    output += 3;                                    \
+    if ( output < output_end )                      \
+    {                                               \
+      stbir__simdf_store( output-3, t );            \
+      continue;                                     \
+    }                                               \
+    { stbir__simdf tt; stbir__simdf_0123to2301( tt, t ); \
+    stbir__simdf_store2( output-3, t );             \
+    stbir__simdf_store1( output+2-3, tt ); }        \
+    break;
+
+
+#else
+
+#define stbir__4_coeff_start()                  \
+    stbir__simdf tot0,tot1,tot2,c,cs;           \
+    STBIR_SIMD_NO_UNROLL(decode);               \
+    stbir__simdf_load( cs, hc );                \
+    stbir__simdf_0123to0001( c, cs );           \
+    stbir__simdf_mult_mem( tot0, c, decode );   \
+    stbir__simdf_0123to1122( c, cs );           \
+    stbir__simdf_mult_mem( tot1, c, decode+4 ); \
+    stbir__simdf_0123to2333( c, cs );           \
+    stbir__simdf_mult_mem( tot2, c, decode+8 ); 
+
+#define stbir__4_coeff_continue_from_4( ofs )                 \
+    STBIR_SIMD_NO_UNROLL(decode);                             \
+    stbir__simdf_load( cs, hc + (ofs) );                      \
+    stbir__simdf_0123to0001( c, cs );                         \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 );   \
+    stbir__simdf_0123to1122( c, cs );                         \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*3+4 ); \
+    stbir__simdf_0123to2333( c, cs );                         \
+    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*3+8 );   
+
+#define stbir__1_coeff_remnant( ofs )         \
+    STBIR_SIMD_NO_UNROLL(decode);             \
+    stbir__simdf_load1z( c, hc + (ofs) );     \
+    stbir__simdf_0123to0001( c, c );          \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 );   
+
+#define stbir__2_coeff_remnant( ofs )                       \
+    { stbir__simdf d;                                       \
+    STBIR_SIMD_NO_UNROLL(decode);                           \
+    stbir__simdf_load2z( cs, hc + (ofs) );                  \
+    stbir__simdf_0123to0001( c, cs );                       \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 ); \
+    stbir__simdf_0123to1122( c, cs );                       \
+    stbir__simdf_load2z( d, decode+(ofs)*3+4 );             \
+    stbir__simdf_madd( tot1, tot1, c, d ); }                 
+
+#define stbir__3_coeff_remnant( ofs )                         \
+    { stbir__simdf d;                                         \
+    STBIR_SIMD_NO_UNROLL(decode);                             \
+    stbir__simdf_load( cs, hc + (ofs) );                      \
+    stbir__simdf_0123to0001( c, cs );                         \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 );   \
+    stbir__simdf_0123to1122( c, cs );                         \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*3+4 ); \
+    stbir__simdf_0123to2222( c, cs );                         \
+    stbir__simdf_load1z( d, decode+(ofs)*3+8 );               \
+    stbir__simdf_madd( tot2, tot2, c, d );  }                
+
+#define stbir__store_output()                       \
+    stbir__simdf_0123ABCDto3ABx( c, tot0, tot1 );   \
+    stbir__simdf_0123ABCDto23Ax( cs, tot1, tot2 );  \
+    stbir__simdf_0123to1230( tot2, tot2 );          \
+    stbir__simdf_add( tot0, tot0, cs );             \
+    stbir__simdf_add( c, c, tot2 );                 \
+    stbir__simdf_add( tot0, tot0, c );              \
+    horizontal_coefficients += coefficient_width;   \
+    ++horizontal_contributors;                      \
+    output += 3;                                    \
+    if ( output < output_end )                      \
+    {                                               \
+      stbir__simdf_store( output-3, tot0 );         \
+      continue;                                     \
+    }                                               \
+    stbir__simdf_0123to2301( tot1, tot0 );          \
+    stbir__simdf_store2( output-3, tot0 );          \
+    stbir__simdf_store1( output+2-3, tot1 );        \
+    break;
+
+#endif
+
+#else
+
+#define stbir__1_coeff_only()  \
+    float tot0, tot1, tot2, c; \
+    c = hc[0];                 \
+    tot0 = decode[0]*c;        \
+    tot1 = decode[1]*c;        \
+    tot2 = decode[2]*c;              
+
+#define stbir__2_coeff_only()  \
+    float tot0, tot1, tot2, c; \
+    c = hc[0];                 \
+    tot0 = decode[0]*c;        \
+    tot1 = decode[1]*c;        \
+    tot2 = decode[2]*c;        \
+    c = hc[1];                 \
+    tot0 += decode[3]*c;       \
+    tot1 += decode[4]*c;       \
+    tot2 += decode[5]*c;              
+
+#define stbir__3_coeff_only()  \
+    float tot0, tot1, tot2, c; \
+    c = hc[0];                 \
+    tot0 = decode[0]*c;        \
+    tot1 = decode[1]*c;        \
+    tot2 = decode[2]*c;        \
+    c = hc[1];                 \
+    tot0 += decode[3]*c;       \
+    tot1 += decode[4]*c;       \
+    tot2 += decode[5]*c;       \
+    c = hc[2];                 \
+    tot0 += decode[6]*c;       \
+    tot1 += decode[7]*c;       \
+    tot2 += decode[8]*c;              
+
+#define stbir__store_output_tiny()                \
+    output[0] = tot0;                             \
+    output[1] = tot1;                             \
+    output[2] = tot2;                             \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 3;
+
+#define stbir__4_coeff_start()      \
+    float tota0,tota1,tota2,totb0,totb1,totb2,totc0,totc1,totc2,totd0,totd1,totd2,c;  \
+    c = hc[0];                      \
+    tota0 = decode[0]*c;            \
+    tota1 = decode[1]*c;            \
+    tota2 = decode[2]*c;            \
+    c = hc[1];                      \
+    totb0 = decode[3]*c;            \
+    totb1 = decode[4]*c;            \
+    totb2 = decode[5]*c;            \
+    c = hc[2];                      \
+    totc0 = decode[6]*c;            \
+    totc1 = decode[7]*c;            \
+    totc2 = decode[8]*c;            \
+    c = hc[3];                      \
+    totd0 = decode[9]*c;            \
+    totd1 = decode[10]*c;           \
+    totd2 = decode[11]*c;            
+
+#define stbir__4_coeff_continue_from_4( ofs )  \
+    c = hc[0+(ofs)];                           \
+    tota0 += decode[0+(ofs)*3]*c;              \
+    tota1 += decode[1+(ofs)*3]*c;              \
+    tota2 += decode[2+(ofs)*3]*c;              \
+    c = hc[1+(ofs)];                           \
+    totb0 += decode[3+(ofs)*3]*c;              \
+    totb1 += decode[4+(ofs)*3]*c;              \
+    totb2 += decode[5+(ofs)*3]*c;              \
+    c = hc[2+(ofs)];                           \
+    totc0 += decode[6+(ofs)*3]*c;              \
+    totc1 += decode[7+(ofs)*3]*c;              \
+    totc2 += decode[8+(ofs)*3]*c;              \
+    c = hc[3+(ofs)];                           \
+    totd0 += decode[9+(ofs)*3]*c;              \
+    totd1 += decode[10+(ofs)*3]*c;             \
+    totd2 += decode[11+(ofs)*3]*c;              
+
+#define stbir__1_coeff_remnant( ofs )  \
+    c = hc[0+(ofs)];                   \
+    tota0 += decode[0+(ofs)*3]*c;      \
+    tota1 += decode[1+(ofs)*3]*c;      \
+    tota2 += decode[2+(ofs)*3]*c;
+
+#define stbir__2_coeff_remnant( ofs )  \
+    c = hc[0+(ofs)];                   \
+    tota0 += decode[0+(ofs)*3]*c;      \
+    tota1 += decode[1+(ofs)*3]*c;      \
+    tota2 += decode[2+(ofs)*3]*c;      \
+    c = hc[1+(ofs)];                   \
+    totb0 += decode[3+(ofs)*3]*c;      \
+    totb1 += decode[4+(ofs)*3]*c;      \
+    totb2 += decode[5+(ofs)*3]*c;      \
+
+#define stbir__3_coeff_remnant( ofs )  \
+    c = hc[0+(ofs)];                   \
+    tota0 += decode[0+(ofs)*3]*c;      \
+    tota1 += decode[1+(ofs)*3]*c;      \
+    tota2 += decode[2+(ofs)*3]*c;      \
+    c = hc[1+(ofs)];                   \
+    totb0 += decode[3+(ofs)*3]*c;      \
+    totb1 += decode[4+(ofs)*3]*c;      \
+    totb2 += decode[5+(ofs)*3]*c;      \
+    c = hc[2+(ofs)];                   \
+    totc0 += decode[6+(ofs)*3]*c;      \
+    totc1 += decode[7+(ofs)*3]*c;      \
+    totc2 += decode[8+(ofs)*3]*c;              
+
+#define stbir__store_output()                     \
+    output[0] = (tota0+totc0)+(totb0+totd0);      \
+    output[1] = (tota1+totc1)+(totb1+totd1);      \
+    output[2] = (tota2+totc2)+(totb2+totd2);      \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 3;
+
+#endif  
+
+#define STBIR__horizontal_channels 3
+#define STB_IMAGE_RESIZE_DO_HORIZONTALS
+#include STBIR__HEADER_FILENAME
+
+//=================
+// Do 4 channel horizontal routines
+
+#ifdef STBIR_SIMD
+
+#define stbir__1_coeff_only()             \
+    stbir__simdf tot,c;                   \
+    STBIR_SIMD_NO_UNROLL(decode);         \
+    stbir__simdf_load1( c, hc );          \
+    stbir__simdf_0123to0000( c, c );      \
+    stbir__simdf_mult_mem( tot, c, decode ); 
+
+#define stbir__2_coeff_only()                       \
+    stbir__simdf tot,c,cs;                          \
+    STBIR_SIMD_NO_UNROLL(decode);                   \
+    stbir__simdf_load2( cs, hc );                   \
+    stbir__simdf_0123to0000( c, cs );               \
+    stbir__simdf_mult_mem( tot, c, decode );        \
+    stbir__simdf_0123to1111( c, cs );               \
+    stbir__simdf_madd_mem( tot, tot, c, decode+4 ); 
+
+#define stbir__3_coeff_only()                       \
+    stbir__simdf tot,c,cs;                          \
+    STBIR_SIMD_NO_UNROLL(decode);                   \
+    stbir__simdf_load( cs, hc );                    \
+    stbir__simdf_0123to0000( c, cs );               \
+    stbir__simdf_mult_mem( tot, c, decode );        \
+    stbir__simdf_0123to1111( c, cs );               \
+    stbir__simdf_madd_mem( tot, tot, c, decode+4 ); \
+    stbir__simdf_0123to2222( c, cs );               \
+    stbir__simdf_madd_mem( tot, tot, c, decode+8 ); 
+
+#define stbir__store_output_tiny()                \
+    stbir__simdf_store( output, tot );            \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 4;
+
+#ifdef STBIR_SIMD8
+
+#define stbir__4_coeff_start()                     \
+    stbir__simdf8 tot0,c,cs; stbir__simdf t;  \
+    STBIR_SIMD_NO_UNROLL(decode);                  \
+    stbir__simdf8_load4b( cs, hc );                \
+    stbir__simdf8_0123to00001111( c, cs );         \
+    stbir__simdf8_mult_mem( tot0, c, decode );     \
+    stbir__simdf8_0123to22223333( c, cs );         \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+8 );    
+
+#define stbir__4_coeff_continue_from_4( ofs )                  \
+    STBIR_SIMD_NO_UNROLL(decode);                              \
+    stbir__simdf8_load4b( cs, hc + (ofs) );                    \
+    stbir__simdf8_0123to00001111( c, cs );                     \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 );   \
+    stbir__simdf8_0123to22223333( c, cs );                     \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 );    
+
+#define stbir__1_coeff_remnant( ofs )                          \
+    STBIR_SIMD_NO_UNROLL(decode);                              \
+    stbir__simdf_load1rep4( t, hc + (ofs) );                   \
+    stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4 ); 
+
+#define stbir__2_coeff_remnant( ofs )                          \
+    STBIR_SIMD_NO_UNROLL(decode);                              \
+    stbir__simdf8_load4b( cs, hc + (ofs) - 2 );                \
+    stbir__simdf8_0123to22223333( c, cs );                     \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 );   
+ 
+ #define stbir__3_coeff_remnant( ofs )                         \
+    STBIR_SIMD_NO_UNROLL(decode);                              \
+    stbir__simdf8_load4b( cs, hc + (ofs) );                    \
+    stbir__simdf8_0123to00001111( c, cs );                     \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 );   \
+    stbir__simdf8_0123to2222( t, cs );                         \
+    stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4+8 ); 
+
+#define stbir__store_output()                      \
+    stbir__simdf8_add4halves( t, stbir__if_simdf8_cast_to_simdf4(tot0), tot0 );     \
+    stbir__simdf_store( output, t );               \
+    horizontal_coefficients += coefficient_width;  \
+    ++horizontal_contributors;                     \
+    output += 4;
+
+#else    
+
+#define stbir__4_coeff_start()                        \
+    stbir__simdf tot0,tot1,c,cs;                      \
+    STBIR_SIMD_NO_UNROLL(decode);                     \
+    stbir__simdf_load( cs, hc );                      \
+    stbir__simdf_0123to0000( c, cs );                 \
+    stbir__simdf_mult_mem( tot0, c, decode );         \
+    stbir__simdf_0123to1111( c, cs );                 \
+    stbir__simdf_mult_mem( tot1, c, decode+4 );       \
+    stbir__simdf_0123to2222( c, cs );                 \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+8 ); \
+    stbir__simdf_0123to3333( c, cs );                 \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+12 ); 
+
+#define stbir__4_coeff_continue_from_4( ofs )                  \
+    STBIR_SIMD_NO_UNROLL(decode);                              \
+    stbir__simdf_load( cs, hc + (ofs) );                       \
+    stbir__simdf_0123to0000( c, cs );                          \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 );    \
+    stbir__simdf_0123to1111( c, cs );                          \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 );  \
+    stbir__simdf_0123to2222( c, cs );                          \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 );  \
+    stbir__simdf_0123to3333( c, cs );                          \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+12 ); 
+
+#define stbir__1_coeff_remnant( ofs )                       \
+    STBIR_SIMD_NO_UNROLL(decode);                           \
+    stbir__simdf_load1( c, hc + (ofs) );                    \
+    stbir__simdf_0123to0000( c, c );                        \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); 
+
+#define stbir__2_coeff_remnant( ofs )                         \
+    STBIR_SIMD_NO_UNROLL(decode);                             \
+    stbir__simdf_load2( cs, hc + (ofs) );                     \
+    stbir__simdf_0123to0000( c, cs );                         \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 );   \
+    stbir__simdf_0123to1111( c, cs );                         \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 ); 
+  
+#define stbir__3_coeff_remnant( ofs )                          \
+    STBIR_SIMD_NO_UNROLL(decode);                              \
+    stbir__simdf_load( cs, hc + (ofs) );                       \
+    stbir__simdf_0123to0000( c, cs );                          \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 );    \
+    stbir__simdf_0123to1111( c, cs );                          \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 );  \
+    stbir__simdf_0123to2222( c, cs );                          \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 );
+
+#define stbir__store_output()                     \
+    stbir__simdf_add( tot0, tot0, tot1 );         \
+    stbir__simdf_store( output, tot0 );           \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 4;
+
+#endif
+
+#else
+
+#define stbir__1_coeff_only()         \
+    float p0,p1,p2,p3,c;              \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    c = hc[0];                        \
+    p0 = decode[0] * c;               \
+    p1 = decode[1] * c;               \
+    p2 = decode[2] * c;               \
+    p3 = decode[3] * c;
+
+#define stbir__2_coeff_only()         \
+    float p0,p1,p2,p3,c;              \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    c = hc[0];                        \
+    p0 = decode[0] * c;               \
+    p1 = decode[1] * c;               \
+    p2 = decode[2] * c;               \
+    p3 = decode[3] * c;               \
+    c = hc[1];                        \
+    p0 += decode[4] * c;              \
+    p1 += decode[5] * c;              \
+    p2 += decode[6] * c;              \
+    p3 += decode[7] * c;
+
+#define stbir__3_coeff_only()         \
+    float p0,p1,p2,p3,c;              \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    c = hc[0];                        \
+    p0 = decode[0] * c;               \
+    p1 = decode[1] * c;               \
+    p2 = decode[2] * c;               \
+    p3 = decode[3] * c;               \
+    c = hc[1];                        \
+    p0 += decode[4] * c;              \
+    p1 += decode[5] * c;              \
+    p2 += decode[6] * c;              \
+    p3 += decode[7] * c;              \
+    c = hc[2];                        \
+    p0 += decode[8] * c;              \
+    p1 += decode[9] * c;              \
+    p2 += decode[10] * c;             \
+    p3 += decode[11] * c;
+
+#define stbir__store_output_tiny()                \
+    output[0] = p0;                               \
+    output[1] = p1;                               \
+    output[2] = p2;                               \
+    output[3] = p3;                               \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 4;
+
+#define stbir__4_coeff_start()        \
+    float x0,x1,x2,x3,y0,y1,y2,y3,c;  \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    c = hc[0];                        \
+    x0 = decode[0] * c;               \
+    x1 = decode[1] * c;               \
+    x2 = decode[2] * c;               \
+    x3 = decode[3] * c;               \
+    c = hc[1];                        \
+    y0 = decode[4] * c;               \
+    y1 = decode[5] * c;               \
+    y2 = decode[6] * c;               \
+    y3 = decode[7] * c;               \
+    c = hc[2];                        \
+    x0 += decode[8] * c;              \
+    x1 += decode[9] * c;              \
+    x2 += decode[10] * c;             \
+    x3 += decode[11] * c;             \
+    c = hc[3];                        \
+    y0 += decode[12] * c;             \
+    y1 += decode[13] * c;             \
+    y2 += decode[14] * c;             \
+    y3 += decode[15] * c;
+
+#define stbir__4_coeff_continue_from_4( ofs ) \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    c = hc[0+(ofs)];                  \
+    x0 += decode[0+(ofs)*4] * c;      \
+    x1 += decode[1+(ofs)*4] * c;      \
+    x2 += decode[2+(ofs)*4] * c;      \
+    x3 += decode[3+(ofs)*4] * c;      \
+    c = hc[1+(ofs)];                  \
+    y0 += decode[4+(ofs)*4] * c;      \
+    y1 += decode[5+(ofs)*4] * c;      \
+    y2 += decode[6+(ofs)*4] * c;      \
+    y3 += decode[7+(ofs)*4] * c;      \
+    c = hc[2+(ofs)];                  \
+    x0 += decode[8+(ofs)*4] * c;      \
+    x1 += decode[9+(ofs)*4] * c;      \
+    x2 += decode[10+(ofs)*4] * c;     \
+    x3 += decode[11+(ofs)*4] * c;     \
+    c = hc[3+(ofs)];                  \
+    y0 += decode[12+(ofs)*4] * c;     \
+    y1 += decode[13+(ofs)*4] * c;     \
+    y2 += decode[14+(ofs)*4] * c;     \
+    y3 += decode[15+(ofs)*4] * c;
+
+#define stbir__1_coeff_remnant( ofs ) \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    c = hc[0+(ofs)];                  \
+    x0 += decode[0+(ofs)*4] * c;      \
+    x1 += decode[1+(ofs)*4] * c;      \
+    x2 += decode[2+(ofs)*4] * c;      \
+    x3 += decode[3+(ofs)*4] * c;      
+
+#define stbir__2_coeff_remnant( ofs ) \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    c = hc[0+(ofs)];                  \
+    x0 += decode[0+(ofs)*4] * c;      \
+    x1 += decode[1+(ofs)*4] * c;      \
+    x2 += decode[2+(ofs)*4] * c;      \
+    x3 += decode[3+(ofs)*4] * c;      \
+    c = hc[1+(ofs)];                  \
+    y0 += decode[4+(ofs)*4] * c;      \
+    y1 += decode[5+(ofs)*4] * c;      \
+    y2 += decode[6+(ofs)*4] * c;      \
+    y3 += decode[7+(ofs)*4] * c;    
+  
+#define stbir__3_coeff_remnant( ofs ) \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    c = hc[0+(ofs)];                  \
+    x0 += decode[0+(ofs)*4] * c;      \
+    x1 += decode[1+(ofs)*4] * c;      \
+    x2 += decode[2+(ofs)*4] * c;      \
+    x3 += decode[3+(ofs)*4] * c;      \
+    c = hc[1+(ofs)];                  \
+    y0 += decode[4+(ofs)*4] * c;      \
+    y1 += decode[5+(ofs)*4] * c;      \
+    y2 += decode[6+(ofs)*4] * c;      \
+    y3 += decode[7+(ofs)*4] * c;      \
+    c = hc[2+(ofs)];                  \
+    x0 += decode[8+(ofs)*4] * c;      \
+    x1 += decode[9+(ofs)*4] * c;      \
+    x2 += decode[10+(ofs)*4] * c;     \
+    x3 += decode[11+(ofs)*4] * c;     
+
+#define stbir__store_output()                     \
+    output[0] = x0 + y0;                          \
+    output[1] = x1 + y1;                          \
+    output[2] = x2 + y2;                          \
+    output[3] = x3 + y3;                          \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 4;
+
+#endif  
+
+#define STBIR__horizontal_channels 4
+#define STB_IMAGE_RESIZE_DO_HORIZONTALS
+#include STBIR__HEADER_FILENAME
+
+
+
+//=================
+// Do 7 channel horizontal routines
+
+#ifdef STBIR_SIMD
+
+#define stbir__1_coeff_only()                   \
+    stbir__simdf tot0,tot1,c;                   \
+    STBIR_SIMD_NO_UNROLL(decode);               \
+    stbir__simdf_load1( c, hc );                \
+    stbir__simdf_0123to0000( c, c );            \
+    stbir__simdf_mult_mem( tot0, c, decode );   \
+    stbir__simdf_mult_mem( tot1, c, decode+3 ); 
+
+#define stbir__2_coeff_only()                         \
+    stbir__simdf tot0,tot1,c,cs;                      \
+    STBIR_SIMD_NO_UNROLL(decode);                     \
+    stbir__simdf_load2( cs, hc );                     \
+    stbir__simdf_0123to0000( c, cs );                 \
+    stbir__simdf_mult_mem( tot0, c, decode );         \
+    stbir__simdf_mult_mem( tot1, c, decode+3 );       \
+    stbir__simdf_0123to1111( c, cs );                 \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+7 ); \
+    stbir__simdf_madd_mem( tot1, tot1, c,decode+10 ); 
+
+#define stbir__3_coeff_only()                           \
+    stbir__simdf tot0,tot1,c,cs;                        \
+    STBIR_SIMD_NO_UNROLL(decode);                       \
+    stbir__simdf_load( cs, hc );                        \
+    stbir__simdf_0123to0000( c, cs );                   \
+    stbir__simdf_mult_mem( tot0, c, decode );           \
+    stbir__simdf_mult_mem( tot1, c, decode+3 );         \
+    stbir__simdf_0123to1111( c, cs );                   \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+7 );   \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+10 );  \
+    stbir__simdf_0123to2222( c, cs );                   \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+14 );  \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+17 );  
+
+#define stbir__store_output_tiny()                \
+    stbir__simdf_store( output+3, tot1 );         \
+    stbir__simdf_store( output, tot0 );           \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 7;
+
+#ifdef STBIR_SIMD8
+
+#define stbir__4_coeff_start()                     \
+    stbir__simdf8 tot0,tot1,c,cs;                  \
+    STBIR_SIMD_NO_UNROLL(decode);                  \
+    stbir__simdf8_load4b( cs, hc );                \
+    stbir__simdf8_0123to00000000( c, cs );         \
+    stbir__simdf8_mult_mem( tot0, c, decode );     \
+    stbir__simdf8_0123to11111111( c, cs );         \
+    stbir__simdf8_mult_mem( tot1, c, decode+7 );   \
+    stbir__simdf8_0123to22222222( c, cs );         \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+14 );  \
+    stbir__simdf8_0123to33333333( c, cs );         \
+    stbir__simdf8_madd_mem( tot1, tot1, c, decode+21 );  
+
+#define stbir__4_coeff_continue_from_4( ofs )                   \
+    STBIR_SIMD_NO_UNROLL(decode);                               \
+    stbir__simdf8_load4b( cs, hc + (ofs) );                     \
+    stbir__simdf8_0123to00000000( c, cs );                      \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );    \
+    stbir__simdf8_0123to11111111( c, cs );                      \
+    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 );  \
+    stbir__simdf8_0123to22222222( c, cs );                      \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); \
+    stbir__simdf8_0123to33333333( c, cs );                      \
+    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+21 ); 
+
+#define stbir__1_coeff_remnant( ofs )                           \
+    STBIR_SIMD_NO_UNROLL(decode);                               \
+    stbir__simdf8_load1b( c, hc + (ofs) );                      \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );    
+
+#define stbir__2_coeff_remnant( ofs )                           \
+    STBIR_SIMD_NO_UNROLL(decode);                               \
+    stbir__simdf8_load1b( c, hc + (ofs) );                      \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );    \
+    stbir__simdf8_load1b( c, hc + (ofs)+1 );                    \
+    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 );   
+
+#define stbir__3_coeff_remnant( ofs )                           \
+    STBIR_SIMD_NO_UNROLL(decode);                               \
+    stbir__simdf8_load4b( cs, hc + (ofs) );                     \
+    stbir__simdf8_0123to00000000( c, cs );                      \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );    \
+    stbir__simdf8_0123to11111111( c, cs );                      \
+    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 );  \
+    stbir__simdf8_0123to22222222( c, cs );                      \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); 
+
+#define stbir__store_output()                     \
+    stbir__simdf8_add( tot0, tot0, tot1 );        \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 7;                                  \
+    if ( output < output_end )                    \
+    {                                             \
+      stbir__simdf8_store( output-7, tot0 );      \
+      continue;                                   \
+    }                                             \
+    stbir__simdf_store( output-7+3, stbir__simdf_swiz(stbir__simdf8_gettop4(tot0),0,0,1,2) ); \
+    stbir__simdf_store( output-7, stbir__if_simdf8_cast_to_simdf4(tot0) );           \
+    break;
+
+#else
+
+#define stbir__4_coeff_start()                    \
+    stbir__simdf tot0,tot1,tot2,tot3,c,cs;        \
+    STBIR_SIMD_NO_UNROLL(decode);                 \
+    stbir__simdf_load( cs, hc );                  \
+    stbir__simdf_0123to0000( c, cs );             \
+    stbir__simdf_mult_mem( tot0, c, decode );     \
+    stbir__simdf_mult_mem( tot1, c, decode+3 );   \
+    stbir__simdf_0123to1111( c, cs );             \
+    stbir__simdf_mult_mem( tot2, c, decode+7 );   \
+    stbir__simdf_mult_mem( tot3, c, decode+10 );  \
+    stbir__simdf_0123to2222( c, cs );             \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+14 );  \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+17 );  \
+    stbir__simdf_0123to3333( c, cs );                   \
+    stbir__simdf_madd_mem( tot2, tot2, c, decode+21 );  \
+    stbir__simdf_madd_mem( tot3, tot3, c, decode+24 );         
+
+#define stbir__4_coeff_continue_from_4( ofs )                   \
+    STBIR_SIMD_NO_UNROLL(decode);                               \
+    stbir__simdf_load( cs, hc + (ofs) );                        \
+    stbir__simdf_0123to0000( c, cs );                           \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 );     \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 );   \
+    stbir__simdf_0123to1111( c, cs );                           \
+    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 );   \
+    stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 );  \
+    stbir__simdf_0123to2222( c, cs );                           \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 );  \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+17 );  \
+    stbir__simdf_0123to3333( c, cs );                           \
+    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+21 );  \
+    stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+24 );   
+
+#define stbir__1_coeff_remnant( ofs )                           \
+    STBIR_SIMD_NO_UNROLL(decode);                               \
+    stbir__simdf_load1( c, hc + (ofs) );                        \
+    stbir__simdf_0123to0000( c, c );                            \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 );     \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 );   \
+
+#define stbir__2_coeff_remnant( ofs )                           \
+    STBIR_SIMD_NO_UNROLL(decode);                               \
+    stbir__simdf_load2( cs, hc + (ofs) );                       \
+    stbir__simdf_0123to0000( c, cs );                           \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 );     \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 );   \
+    stbir__simdf_0123to1111( c, cs );                           \
+    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 );   \
+    stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 );  
+  
+#define stbir__3_coeff_remnant( ofs )                           \
+    STBIR_SIMD_NO_UNROLL(decode);                               \
+    stbir__simdf_load( cs, hc + (ofs) );                        \
+    stbir__simdf_0123to0000( c, cs );                           \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 );     \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 );   \
+    stbir__simdf_0123to1111( c, cs );                           \
+    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 );   \
+    stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 );  \
+    stbir__simdf_0123to2222( c, cs );                           \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 );  \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+17 );  
+
+#define stbir__store_output()                     \
+    stbir__simdf_add( tot0, tot0, tot2 );         \
+    stbir__simdf_add( tot1, tot1, tot3 );         \
+    stbir__simdf_store( output+3, tot1 );         \
+    stbir__simdf_store( output, tot0 );           \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 7;
+
+#endif
+
+#else
+
+#define stbir__1_coeff_only()        \
+    float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \
+    c = hc[0];                       \
+    tot0 = decode[0]*c;              \
+    tot1 = decode[1]*c;              \
+    tot2 = decode[2]*c;              \
+    tot3 = decode[3]*c;              \
+    tot4 = decode[4]*c;              \
+    tot5 = decode[5]*c;              \
+    tot6 = decode[6]*c;              
+
+#define stbir__2_coeff_only()        \
+    float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \
+    c = hc[0];                       \
+    tot0 = decode[0]*c;              \
+    tot1 = decode[1]*c;              \
+    tot2 = decode[2]*c;              \
+    tot3 = decode[3]*c;              \
+    tot4 = decode[4]*c;              \
+    tot5 = decode[5]*c;              \
+    tot6 = decode[6]*c;              \
+    c = hc[1];                       \
+    tot0 += decode[7]*c;             \
+    tot1 += decode[8]*c;             \
+    tot2 += decode[9]*c;             \
+    tot3 += decode[10]*c;            \
+    tot4 += decode[11]*c;            \
+    tot5 += decode[12]*c;            \
+    tot6 += decode[13]*c;            \
+
+#define stbir__3_coeff_only()        \
+    float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \
+    c = hc[0];                       \
+    tot0 = decode[0]*c;              \
+    tot1 = decode[1]*c;              \
+    tot2 = decode[2]*c;              \
+    tot3 = decode[3]*c;              \
+    tot4 = decode[4]*c;              \
+    tot5 = decode[5]*c;              \
+    tot6 = decode[6]*c;              \
+    c = hc[1];                       \
+    tot0 += decode[7]*c;             \
+    tot1 += decode[8]*c;             \
+    tot2 += decode[9]*c;             \
+    tot3 += decode[10]*c;            \
+    tot4 += decode[11]*c;            \
+    tot5 += decode[12]*c;            \
+    tot6 += decode[13]*c;            \
+    c = hc[2];                       \
+    tot0 += decode[14]*c;            \
+    tot1 += decode[15]*c;            \
+    tot2 += decode[16]*c;            \
+    tot3 += decode[17]*c;            \
+    tot4 += decode[18]*c;            \
+    tot5 += decode[19]*c;            \
+    tot6 += decode[20]*c;            \
+
+#define stbir__store_output_tiny()                \
+    output[0] = tot0;                             \
+    output[1] = tot1;                             \
+    output[2] = tot2;                             \
+    output[3] = tot3;                             \
+    output[4] = tot4;                             \
+    output[5] = tot5;                             \
+    output[6] = tot6;                             \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 7;
+
+#define stbir__4_coeff_start()    \
+    float x0,x1,x2,x3,x4,x5,x6,y0,y1,y2,y3,y4,y5,y6,c; \
+    STBIR_SIMD_NO_UNROLL(decode); \
+    c = hc[0];                    \
+    x0 = decode[0] * c;           \
+    x1 = decode[1] * c;           \
+    x2 = decode[2] * c;           \
+    x3 = decode[3] * c;           \
+    x4 = decode[4] * c;           \
+    x5 = decode[5] * c;           \
+    x6 = decode[6] * c;           \
+    c = hc[1];                    \
+    y0 = decode[7] * c;           \
+    y1 = decode[8] * c;           \
+    y2 = decode[9] * c;           \
+    y3 = decode[10] * c;          \
+    y4 = decode[11] * c;          \
+    y5 = decode[12] * c;          \
+    y6 = decode[13] * c;          \
+    c = hc[2];                    \
+    x0 += decode[14] * c;         \
+    x1 += decode[15] * c;         \
+    x2 += decode[16] * c;         \
+    x3 += decode[17] * c;         \
+    x4 += decode[18] * c;         \
+    x5 += decode[19] * c;         \
+    x6 += decode[20] * c;         \
+    c = hc[3];                    \
+    y0 += decode[21] * c;         \
+    y1 += decode[22] * c;         \
+    y2 += decode[23] * c;         \
+    y3 += decode[24] * c;         \
+    y4 += decode[25] * c;         \
+    y5 += decode[26] * c;         \
+    y6 += decode[27] * c; 
+
+#define stbir__4_coeff_continue_from_4( ofs ) \
+    STBIR_SIMD_NO_UNROLL(decode);  \
+    c = hc[0+(ofs)];               \
+    x0 += decode[0+(ofs)*7] * c;   \
+    x1 += decode[1+(ofs)*7] * c;   \
+    x2 += decode[2+(ofs)*7] * c;   \
+    x3 += decode[3+(ofs)*7] * c;   \
+    x4 += decode[4+(ofs)*7] * c;   \
+    x5 += decode[5+(ofs)*7] * c;   \
+    x6 += decode[6+(ofs)*7] * c;   \
+    c = hc[1+(ofs)];               \
+    y0 += decode[7+(ofs)*7] * c;   \
+    y1 += decode[8+(ofs)*7] * c;   \
+    y2 += decode[9+(ofs)*7] * c;   \
+    y3 += decode[10+(ofs)*7] * c;  \
+    y4 += decode[11+(ofs)*7] * c;  \
+    y5 += decode[12+(ofs)*7] * c;  \
+    y6 += decode[13+(ofs)*7] * c;  \
+    c = hc[2+(ofs)];               \
+    x0 += decode[14+(ofs)*7] * c;  \
+    x1 += decode[15+(ofs)*7] * c;  \
+    x2 += decode[16+(ofs)*7] * c;  \
+    x3 += decode[17+(ofs)*7] * c;  \
+    x4 += decode[18+(ofs)*7] * c;  \
+    x5 += decode[19+(ofs)*7] * c;  \
+    x6 += decode[20+(ofs)*7] * c;  \
+    c = hc[3+(ofs)];               \
+    y0 += decode[21+(ofs)*7] * c;  \
+    y1 += decode[22+(ofs)*7] * c;  \
+    y2 += decode[23+(ofs)*7] * c;  \
+    y3 += decode[24+(ofs)*7] * c;  \
+    y4 += decode[25+(ofs)*7] * c;  \
+    y5 += decode[26+(ofs)*7] * c;  \
+    y6 += decode[27+(ofs)*7] * c; 
+
+#define stbir__1_coeff_remnant( ofs ) \
+    STBIR_SIMD_NO_UNROLL(decode);  \
+    c = hc[0+(ofs)];               \
+    x0 += decode[0+(ofs)*7] * c;   \
+    x1 += decode[1+(ofs)*7] * c;   \
+    x2 += decode[2+(ofs)*7] * c;   \
+    x3 += decode[3+(ofs)*7] * c;   \
+    x4 += decode[4+(ofs)*7] * c;   \
+    x5 += decode[5+(ofs)*7] * c;   \
+    x6 += decode[6+(ofs)*7] * c;   \
+
+#define stbir__2_coeff_remnant( ofs ) \
+    STBIR_SIMD_NO_UNROLL(decode);  \
+    c = hc[0+(ofs)];               \
+    x0 += decode[0+(ofs)*7] * c;   \
+    x1 += decode[1+(ofs)*7] * c;   \
+    x2 += decode[2+(ofs)*7] * c;   \
+    x3 += decode[3+(ofs)*7] * c;   \
+    x4 += decode[4+(ofs)*7] * c;   \
+    x5 += decode[5+(ofs)*7] * c;   \
+    x6 += decode[6+(ofs)*7] * c;   \
+    c = hc[1+(ofs)];               \
+    y0 += decode[7+(ofs)*7] * c;   \
+    y1 += decode[8+(ofs)*7] * c;   \
+    y2 += decode[9+(ofs)*7] * c;   \
+    y3 += decode[10+(ofs)*7] * c;  \
+    y4 += decode[11+(ofs)*7] * c;  \
+    y5 += decode[12+(ofs)*7] * c;  \
+    y6 += decode[13+(ofs)*7] * c;  \
+  
+#define stbir__3_coeff_remnant( ofs ) \
+    STBIR_SIMD_NO_UNROLL(decode);  \
+    c = hc[0+(ofs)];               \
+    x0 += decode[0+(ofs)*7] * c;   \
+    x1 += decode[1+(ofs)*7] * c;   \
+    x2 += decode[2+(ofs)*7] * c;   \
+    x3 += decode[3+(ofs)*7] * c;   \
+    x4 += decode[4+(ofs)*7] * c;   \
+    x5 += decode[5+(ofs)*7] * c;   \
+    x6 += decode[6+(ofs)*7] * c;   \
+    c = hc[1+(ofs)];               \
+    y0 += decode[7+(ofs)*7] * c;   \
+    y1 += decode[8+(ofs)*7] * c;   \
+    y2 += decode[9+(ofs)*7] * c;   \
+    y3 += decode[10+(ofs)*7] * c;  \
+    y4 += decode[11+(ofs)*7] * c;  \
+    y5 += decode[12+(ofs)*7] * c;  \
+    y6 += decode[13+(ofs)*7] * c;  \
+    c = hc[2+(ofs)];               \
+    x0 += decode[14+(ofs)*7] * c;  \
+    x1 += decode[15+(ofs)*7] * c;  \
+    x2 += decode[16+(ofs)*7] * c;  \
+    x3 += decode[17+(ofs)*7] * c;  \
+    x4 += decode[18+(ofs)*7] * c;  \
+    x5 += decode[19+(ofs)*7] * c;  \
+    x6 += decode[20+(ofs)*7] * c;  \
+
+#define stbir__store_output()                     \
+    output[0] = x0 + y0;                          \
+    output[1] = x1 + y1;                          \
+    output[2] = x2 + y2;                          \
+    output[3] = x3 + y3;                          \
+    output[4] = x4 + y4;                          \
+    output[5] = x5 + y5;                          \
+    output[6] = x6 + y6;                          \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 7;
+
+#endif  
+
+#define STBIR__horizontal_channels 7
+#define STB_IMAGE_RESIZE_DO_HORIZONTALS
+#include STBIR__HEADER_FILENAME
+
+
+// include all of the vertical resamplers (both scatter and gather versions)
+
+#define STBIR__vertical_channels 1
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 1
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 2
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 2
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 3
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 3
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 4
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 4
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 5
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 5
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 6
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 6
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 7
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 7
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 8
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 8
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#include STBIR__HEADER_FILENAME
+
+typedef void STBIR_VERTICAL_GATHERFUNC( float * output, float const * coeffs, float const ** inputs, float const * input0_end );
+
+static STBIR_VERTICAL_GATHERFUNC * stbir__vertical_gathers[ 8 ] =
+{
+  stbir__vertical_gather_with_1_coeffs,stbir__vertical_gather_with_2_coeffs,stbir__vertical_gather_with_3_coeffs,stbir__vertical_gather_with_4_coeffs,stbir__vertical_gather_with_5_coeffs,stbir__vertical_gather_with_6_coeffs,stbir__vertical_gather_with_7_coeffs,stbir__vertical_gather_with_8_coeffs
+};
+
+static STBIR_VERTICAL_GATHERFUNC * stbir__vertical_gathers_continues[ 8 ] =
+{
+  stbir__vertical_gather_with_1_coeffs_cont,stbir__vertical_gather_with_2_coeffs_cont,stbir__vertical_gather_with_3_coeffs_cont,stbir__vertical_gather_with_4_coeffs_cont,stbir__vertical_gather_with_5_coeffs_cont,stbir__vertical_gather_with_6_coeffs_cont,stbir__vertical_gather_with_7_coeffs_cont,stbir__vertical_gather_with_8_coeffs_cont
+};
+
+typedef void STBIR_VERTICAL_SCATTERFUNC( float ** outputs, float const * coeffs, float const * input, float const * input_end );
+
+static STBIR_VERTICAL_SCATTERFUNC * stbir__vertical_scatter_sets[ 8 ] =
+{
+  stbir__vertical_scatter_with_1_coeffs,stbir__vertical_scatter_with_2_coeffs,stbir__vertical_scatter_with_3_coeffs,stbir__vertical_scatter_with_4_coeffs,stbir__vertical_scatter_with_5_coeffs,stbir__vertical_scatter_with_6_coeffs,stbir__vertical_scatter_with_7_coeffs,stbir__vertical_scatter_with_8_coeffs
+};
+
+static STBIR_VERTICAL_SCATTERFUNC * stbir__vertical_scatter_blends[ 8 ] =
+{
+  stbir__vertical_scatter_with_1_coeffs_cont,stbir__vertical_scatter_with_2_coeffs_cont,stbir__vertical_scatter_with_3_coeffs_cont,stbir__vertical_scatter_with_4_coeffs_cont,stbir__vertical_scatter_with_5_coeffs_cont,stbir__vertical_scatter_with_6_coeffs_cont,stbir__vertical_scatter_with_7_coeffs_cont,stbir__vertical_scatter_with_8_coeffs_cont
+};
+
+
+static void stbir__encode_scanline( stbir__info const * stbir_info, void *output_buffer_data, float * encode_buffer, int row  STBIR_ONLY_PROFILE_GET_SPLIT_INFO )
+{
+  int num_pixels = stbir_info->horizontal.scale_info.output_sub_size;
+  int channels = stbir_info->channels;
+  int width_times_channels = num_pixels * channels;
+  void * output_buffer;
+
+  // un-alpha weight if we need to
+  if ( stbir_info->alpha_unweight )
+  {
+    STBIR_PROFILE_START( unalpha );
+    stbir_info->alpha_unweight( encode_buffer, width_times_channels );
+    STBIR_PROFILE_END( unalpha );
+  }
+
+  // write directly into output by default
+  output_buffer = output_buffer_data;
+
+  // if we have an output callback, we first convert the decode buffer in place (and then hand that to the callback)
+  if ( stbir_info->out_pixels_cb )
+    output_buffer = encode_buffer;
+  
+  STBIR_PROFILE_START( encode );
+  // convert into the output buffer
+  stbir_info->encode_pixels( output_buffer, width_times_channels, encode_buffer );
+  STBIR_PROFILE_END( encode );
+
+  // if we have an output callback, call it to send the data
+  if ( stbir_info->out_pixels_cb )
+    stbir_info->out_pixels_cb( output_buffer_data, num_pixels, row, stbir_info->user_data );
+}
+
+
+// Get the ring buffer pointer for an index
+static float* stbir__get_ring_buffer_entry(stbir__info const * stbir_info, stbir__per_split_info const * split_info, int index )
+{
+  STBIR_ASSERT( index < stbir_info->ring_buffer_num_entries );
+
+  #ifdef STBIR__SEPARATE_ALLOCATIONS
+    return split_info->ring_buffers[ index ];
+  #else
+    return (float*) ( ( (char*) split_info->ring_buffer ) + ( index * stbir_info->ring_buffer_length_bytes ) );
+  #endif
+}
+
+// Get the specified scan line from the ring buffer
+static float* stbir__get_ring_buffer_scanline(stbir__info const * stbir_info, stbir__per_split_info const * split_info, int get_scanline)
+{
+  int ring_buffer_index = (split_info->ring_buffer_begin_index + (get_scanline - split_info->ring_buffer_first_scanline)) % stbir_info->ring_buffer_num_entries;
+  return stbir__get_ring_buffer_entry( stbir_info, split_info, ring_buffer_index );
+}
+
+static void stbir__resample_horizontal_gather(stbir__info const * stbir_info, float* output_buffer, float const * input_buffer STBIR_ONLY_PROFILE_GET_SPLIT_INFO )
+{
+  float const * decode_buffer = input_buffer - ( stbir_info->scanline_extents.conservative.n0 * stbir_info->effective_channels );
+
+  STBIR_PROFILE_START( horizontal );
+  if ( ( stbir_info->horizontal.filter_enum == STBIR_FILTER_POINT_SAMPLE ) && ( stbir_info->horizontal.scale_info.scale == 1.0f ) )
+    STBIR_MEMCPY( output_buffer, input_buffer, stbir_info->horizontal.scale_info.output_sub_size * sizeof( float ) * stbir_info->effective_channels );
+  else
+    stbir_info->horizontal_gather_channels( output_buffer, stbir_info->horizontal.scale_info.output_sub_size, decode_buffer, stbir_info->horizontal.contributors, stbir_info->horizontal.coefficients, stbir_info->horizontal.coefficient_width );
+  STBIR_PROFILE_END( horizontal );
+}
+
+static void stbir__resample_vertical_gather(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n, int contrib_n0, int contrib_n1, float const * vertical_coefficients )
+{
+  float* encode_buffer = split_info->vertical_buffer;
+  float* decode_buffer = split_info->decode_buffer;
+  int vertical_first = stbir_info->vertical_first;
+  int width = (vertical_first) ? ( stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1 ) : stbir_info->horizontal.scale_info.output_sub_size;
+  int width_times_channels = stbir_info->effective_channels * width;
+
+  STBIR_ASSERT( stbir_info->vertical.is_gather );
+
+  // loop over the contributing scanlines and scale into the buffer
+  STBIR_PROFILE_START( vertical );
+  {
+    int k = 0, total = contrib_n1 - contrib_n0 + 1;
+    STBIR_ASSERT( total > 0 );
+    do {
+      float const * inputs[8];
+      int i, cnt = total; if ( cnt > 8 ) cnt = 8;
+      for( i = 0 ; i < cnt ; i++ )
+        inputs[ i ] = stbir__get_ring_buffer_scanline(stbir_info, split_info, k+i+contrib_n0 );
+
+      // call the N scanlines at a time function (up to 8 scanlines of blending at once)
+      ((k==0)?stbir__vertical_gathers:stbir__vertical_gathers_continues)[cnt-1]( (vertical_first) ? decode_buffer : encode_buffer, vertical_coefficients + k, inputs, inputs[0] + width_times_channels );
+      k += cnt;
+      total -= cnt;
+    } while ( total );
+  }
+  STBIR_PROFILE_END( vertical );
+
+  if ( vertical_first )
+  {
+    // Now resample the gathered vertical data in the horizontal axis into the encode buffer
+    stbir__resample_horizontal_gather(stbir_info, encode_buffer, decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+  }
+
+  stbir__encode_scanline( stbir_info, ( (char *) stbir_info->output_data ) + ((ptrdiff_t)n * (ptrdiff_t)stbir_info->output_stride_bytes), 
+                          encode_buffer, n  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+}
+
+static void stbir__decode_and_resample_for_vertical_gather_loop(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n)
+{
+  int ring_buffer_index;
+  float* ring_buffer;
+
+  // Decode the nth scanline from the source image into the decode buffer.
+  stbir__decode_scanline( stbir_info, n, split_info->decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+
+  // update new end scanline
+  split_info->ring_buffer_last_scanline = n;
+
+  // get ring buffer 
+  ring_buffer_index = (split_info->ring_buffer_begin_index + (split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline)) % stbir_info->ring_buffer_num_entries;
+  ring_buffer = stbir__get_ring_buffer_entry(stbir_info, split_info, ring_buffer_index);
+
+  // Now resample it into the ring buffer.
+  stbir__resample_horizontal_gather( stbir_info, ring_buffer, split_info->decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+
+  // Now it's sitting in the ring buffer ready to be used as source for the vertical sampling.
+}
+
+static void stbir__vertical_gather_loop( stbir__info const * stbir_info, stbir__per_split_info* split_info, int split_count )
+{
+  int y, start_output_y, end_output_y;
+  stbir__contributors* vertical_contributors = stbir_info->vertical.contributors;
+  float const * vertical_coefficients = stbir_info->vertical.coefficients;
+
+  STBIR_ASSERT( stbir_info->vertical.is_gather );
+
+  start_output_y = split_info->start_output_y;
+  end_output_y = split_info[split_count-1].end_output_y;
+
+  vertical_contributors += start_output_y;
+  vertical_coefficients += start_output_y * stbir_info->vertical.coefficient_width;
+
+  // initialize the ring buffer for gathering
+  split_info->ring_buffer_begin_index = 0;
+  split_info->ring_buffer_first_scanline = stbir_info->vertical.extent_info.lowest;  
+  split_info->ring_buffer_last_scanline = split_info->ring_buffer_first_scanline - 1; // means "empty"
+
+  for (y = start_output_y; y < end_output_y; y++)
+  {
+    int in_first_scanline, in_last_scanline;
+
+    in_first_scanline = vertical_contributors->n0;
+    in_last_scanline = vertical_contributors->n1;
+
+    // make sure the indexing hasn't broken
+    STBIR_ASSERT( in_first_scanline >= split_info->ring_buffer_first_scanline );
+
+    // Load in new scanlines
+    while (in_last_scanline > split_info->ring_buffer_last_scanline)
+    {
+      STBIR_ASSERT( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) <= stbir_info->ring_buffer_num_entries );
+
+      // make sure there was room in the ring buffer when we add new scanlines
+      if ( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) == stbir_info->ring_buffer_num_entries )
+      {
+        split_info->ring_buffer_first_scanline++;
+        split_info->ring_buffer_begin_index++;
+      }
+      
+      if ( stbir_info->vertical_first )
+      {
+        float * ring_buffer = stbir__get_ring_buffer_scanline( stbir_info, split_info, ++split_info->ring_buffer_last_scanline );
+        // Decode the nth scanline from the source image into the decode buffer.
+        stbir__decode_scanline( stbir_info, split_info->ring_buffer_last_scanline, ring_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); 
+      }
+      else
+      {
+        stbir__decode_and_resample_for_vertical_gather_loop(stbir_info, split_info, split_info->ring_buffer_last_scanline + 1);
+      }
+    }
+
+    // Now all buffers should be ready to write a row of vertical sampling, so do it.
+    stbir__resample_vertical_gather(stbir_info, split_info, y, in_first_scanline, in_last_scanline, vertical_coefficients );
+
+    ++vertical_contributors;
+    vertical_coefficients += stbir_info->vertical.coefficient_width;
+  }
+}
+
+#define STBIR__FLOAT_EMPTY_MARKER 3.0e+38F
+#define STBIR__FLOAT_BUFFER_IS_EMPTY(ptr) ((ptr)[0]==STBIR__FLOAT_EMPTY_MARKER)
+
+static void stbir__encode_first_scanline_from_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info)
+{
+  // evict a scanline out into the output buffer
+  float* ring_buffer_entry = stbir__get_ring_buffer_entry(stbir_info, split_info, split_info->ring_buffer_begin_index );
+  
+  // dump the scanline out
+  stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (ptrdiff_t)split_info->ring_buffer_first_scanline * (ptrdiff_t)stbir_info->output_stride_bytes ), ring_buffer_entry, split_info->ring_buffer_first_scanline  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+  
+  // mark it as empty
+  ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER;
+
+  // advance the first scanline
+  split_info->ring_buffer_first_scanline++;
+  if ( ++split_info->ring_buffer_begin_index == stbir_info->ring_buffer_num_entries )
+    split_info->ring_buffer_begin_index = 0;
+}
+
+static void stbir__horizontal_resample_and_encode_first_scanline_from_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info)
+{
+  // evict a scanline out into the output buffer
+
+  float* ring_buffer_entry = stbir__get_ring_buffer_entry(stbir_info, split_info, split_info->ring_buffer_begin_index );
+
+  // Now resample it into the buffer.
+  stbir__resample_horizontal_gather( stbir_info, split_info->vertical_buffer, ring_buffer_entry  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+  
+  // dump the scanline out
+  stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (ptrdiff_t)split_info->ring_buffer_first_scanline * (ptrdiff_t)stbir_info->output_stride_bytes ), split_info->vertical_buffer, split_info->ring_buffer_first_scanline  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+  
+  // mark it as empty
+  ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER;
+
+  // advance the first scanline
+  split_info->ring_buffer_first_scanline++;
+  if ( ++split_info->ring_buffer_begin_index == stbir_info->ring_buffer_num_entries )
+    split_info->ring_buffer_begin_index = 0;
+}
+
+static void stbir__resample_vertical_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n0, int n1, float const * vertical_coefficients, float const * vertical_buffer, float const * vertical_buffer_end )
+{
+  STBIR_ASSERT( !stbir_info->vertical.is_gather );
+
+  STBIR_PROFILE_START( vertical );
+  {
+    int k = 0, total = n1 - n0 + 1;
+    STBIR_ASSERT( total > 0 );
+    do {
+      float * outputs[8];
+      int i, n = total; if ( n > 8 ) n = 8;
+      for( i = 0 ; i < n ; i++ )
+      {
+        outputs[ i ] = stbir__get_ring_buffer_scanline(stbir_info, split_info, k+i+n0 );
+        if ( ( i ) && ( STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[i] ) != STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[0] ) ) ) // make sure runs are of the same type
+        {
+          n = i;
+          break;
+        }
+      }
+      // call the scatter to N scanlines at a time function (up to 8 scanlines of scattering at once)
+      ((STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[0] ))?stbir__vertical_scatter_sets:stbir__vertical_scatter_blends)[n-1]( outputs, vertical_coefficients + k, vertical_buffer, vertical_buffer_end );
+      k += n;
+      total -= n;
+    } while ( total );
+  }
+
+  STBIR_PROFILE_END( vertical );
+}
+
+typedef void stbir__handle_scanline_for_scatter_func(stbir__info const * stbir_info, stbir__per_split_info* split_info); 
+
+static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir__per_split_info* split_info, int split_count )
+{
+  int y, start_output_y, end_output_y, start_input_y, end_input_y;
+  stbir__contributors* vertical_contributors = stbir_info->vertical.contributors;
+  float const * vertical_coefficients = stbir_info->vertical.coefficients;
+  stbir__handle_scanline_for_scatter_func * handle_scanline_for_scatter;
+  void * scanline_scatter_buffer;
+  void * scanline_scatter_buffer_end;
+  int on_first_input_y, last_input_y;
+
+  STBIR_ASSERT( !stbir_info->vertical.is_gather );
+
+  start_output_y = split_info->start_output_y;
+  end_output_y = split_info[split_count-1].end_output_y;  // may do multiple split counts
+
+  start_input_y = split_info->start_input_y;
+  end_input_y = split_info[split_count-1].end_input_y;
+
+  // adjust for starting offset start_input_y
+  y = start_input_y + stbir_info->vertical.filter_pixel_margin; 
+  vertical_contributors += y ;
+  vertical_coefficients += stbir_info->vertical.coefficient_width * y;
+
+  if ( stbir_info->vertical_first )
+  {
+    handle_scanline_for_scatter = stbir__horizontal_resample_and_encode_first_scanline_from_scatter;
+    scanline_scatter_buffer = split_info->decode_buffer;
+    scanline_scatter_buffer_end = ( (char*) scanline_scatter_buffer ) + sizeof( float ) * stbir_info->effective_channels * (stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1);
+  }
+  else
+  {
+    handle_scanline_for_scatter = stbir__encode_first_scanline_from_scatter;
+    scanline_scatter_buffer = split_info->vertical_buffer;
+    scanline_scatter_buffer_end = ( (char*) scanline_scatter_buffer ) + sizeof( float ) * stbir_info->effective_channels * stbir_info->horizontal.scale_info.output_sub_size;
+  }
+
+  // initialize the ring buffer for scattering
+  split_info->ring_buffer_first_scanline = start_output_y;
+  split_info->ring_buffer_last_scanline = -1;
+  split_info->ring_buffer_begin_index = -1;
+
+  // mark all the buffers as empty to start
+  for( y = 0 ; y < stbir_info->ring_buffer_num_entries ; y++ )
+    stbir__get_ring_buffer_entry( stbir_info, split_info, y )[0] = STBIR__FLOAT_EMPTY_MARKER; // only used on scatter
+
+  // do the loop in input space
+  on_first_input_y = 1; last_input_y = start_input_y;
+  for (y = start_input_y ; y < end_input_y; y++)
+  {
+    int out_first_scanline, out_last_scanline;
+
+    out_first_scanline = vertical_contributors->n0;
+    out_last_scanline = vertical_contributors->n1;
+
+    STBIR_ASSERT(out_last_scanline - out_first_scanline + 1 <= stbir_info->ring_buffer_num_entries);
+
+    if ( ( out_last_scanline >= out_first_scanline ) && ( ( ( out_first_scanline >= start_output_y ) && ( out_first_scanline < end_output_y ) ) || ( ( out_last_scanline >= start_output_y ) && ( out_last_scanline < end_output_y ) ) ) )
+    {
+      float const * vc = vertical_coefficients;
+
+      // keep track of the range actually seen for the next resize
+      last_input_y = y;
+      if ( ( on_first_input_y ) && ( y > start_input_y ) )
+        split_info->start_input_y = y;
+      on_first_input_y = 0;
+
+      // clip the region 
+      if ( out_first_scanline < start_output_y )
+      {
+        vc += start_output_y - out_first_scanline;
+        out_first_scanline = start_output_y;
+      }
+
+      if ( out_last_scanline >= end_output_y )
+        out_last_scanline = end_output_y - 1;
+
+      // if very first scanline, init the index
+      if (split_info->ring_buffer_begin_index < 0)
+        split_info->ring_buffer_begin_index = out_first_scanline - start_output_y;
+      
+      STBIR_ASSERT( split_info->ring_buffer_begin_index <= out_first_scanline );
+
+      // Decode the nth scanline from the source image into the decode buffer.
+      stbir__decode_scanline( stbir_info, y, split_info->decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); 
+
+      // When horizontal first, we resample horizontally into the vertical buffer before we scatter it out
+      if ( !stbir_info->vertical_first )
+        stbir__resample_horizontal_gather( stbir_info, split_info->vertical_buffer, split_info->decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+
+      // Now it's sitting in the buffer ready to be distributed into the ring buffers.
+
+      // evict from the ringbuffer, if we need are full
+      if ( ( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) == stbir_info->ring_buffer_num_entries ) &&
+           ( out_last_scanline > split_info->ring_buffer_last_scanline ) )
+        handle_scanline_for_scatter( stbir_info, split_info );
+    
+      // Now the horizontal buffer is ready to write to all ring buffer rows, so do it.
+      stbir__resample_vertical_scatter(stbir_info, split_info, out_first_scanline, out_last_scanline, vc, (float*)scanline_scatter_buffer, (float*)scanline_scatter_buffer_end );
+
+      // update the end of the buffer
+      if ( out_last_scanline > split_info->ring_buffer_last_scanline )
+        split_info->ring_buffer_last_scanline = out_last_scanline;
+    }
+    ++vertical_contributors;
+    vertical_coefficients += stbir_info->vertical.coefficient_width;
+  }
+
+  // now evict the scanlines that are left over in the ring buffer
+  while ( split_info->ring_buffer_first_scanline < end_output_y )
+    handle_scanline_for_scatter(stbir_info, split_info);
+
+  // update the end_input_y if we do multiple resizes with the same data
+  ++last_input_y;
+  for( y = 0 ; y < split_count; y++ )
+    if ( split_info[y].end_input_y > last_input_y )
+      split_info[y].end_input_y = last_input_y;
+}
+
+
+static stbir__kernel_callback * stbir__builtin_kernels[] =   { 0, stbir__filter_trapezoid,  stbir__filter_triangle, stbir__filter_cubic, stbir__filter_catmullrom, stbir__filter_mitchell, stbir__filter_point };
+static stbir__support_callback * stbir__builtin_supports[] = { 0, stbir__support_trapezoid, stbir__support_one,     stbir__support_two,  stbir__support_two,       stbir__support_two,     stbir__support_zeropoint5 };
+
+static void stbir__set_sampler(stbir__sampler * samp, stbir_filter filter, stbir__kernel_callback * kernel, stbir__support_callback * support, stbir_edge edge, stbir__scale_info * scale_info, int always_gather, void * user_data )
+{
+  // set filter
+  if (filter == 0)
+  {
+    filter = STBIR_DEFAULT_FILTER_DOWNSAMPLE; // default to downsample
+    if (scale_info->scale >= ( 1.0f - stbir__small_float ) )
+    {
+      if ( (scale_info->scale <= ( 1.0f + stbir__small_float ) ) && ( STBIR_CEILF(scale_info->pixel_shift) == scale_info->pixel_shift ) )
+        filter = STBIR_FILTER_POINT_SAMPLE;  
+      else
+        filter = STBIR_DEFAULT_FILTER_UPSAMPLE;
+    }
+  }
+  samp->filter_enum = filter;
+
+  STBIR_ASSERT(samp->filter_enum != 0);
+  STBIR_ASSERT((unsigned)samp->filter_enum < STBIR_FILTER_OTHER); 
+  samp->filter_kernel = stbir__builtin_kernels[ filter ];
+  samp->filter_support = stbir__builtin_supports[ filter ];
+
+  if ( kernel && support )
+  {
+    samp->filter_kernel = kernel;
+    samp->filter_support = support;
+    samp->filter_enum = STBIR_FILTER_OTHER;
+  }
+
+  samp->edge = edge;
+  samp->filter_pixel_width  = stbir__get_filter_pixel_width (samp->filter_support, scale_info->scale, user_data );
+  // Gather is always better, but in extreme downsamples, you have to most or all of the data in memory
+  //    For horizontal, we always have all the pixels, so we always use gather here (always_gather==1).
+  //    For vertical, we use gather if scaling up (which means we will have samp->filter_pixel_width
+  //    scanlines in memory at once).
+  samp->is_gather = 0;
+  if ( scale_info->scale >= ( 1.0f - stbir__small_float ) )
+    samp->is_gather = 1;
+  else if ( ( always_gather ) || ( samp->filter_pixel_width <= STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT ) )
+    samp->is_gather = 2;
+
+  // pre calculate stuff based on the above
+  samp->coefficient_width = stbir__get_coefficient_width(samp, samp->is_gather, user_data);
+
+  if ( edge == STBIR_EDGE_WRAP )
+    if ( samp->filter_pixel_width > ( scale_info->input_full_size * 2 ) )  // this can only happen when shrinking to a single pixel
+      samp->filter_pixel_width = scale_info->input_full_size * 2;
+
+  // This is how much to expand buffers to account for filters seeking outside
+  // the image boundaries.
+  samp->filter_pixel_margin = samp->filter_pixel_width / 2;
+
+  samp->num_contributors = stbir__get_contributors(samp, samp->is_gather);
+  samp->contributors_size = samp->num_contributors * sizeof(stbir__contributors);
+  samp->coefficients_size = samp->num_contributors * samp->coefficient_width * sizeof(float) + sizeof(float); // extra sizeof(float) is padding
+
+  samp->gather_prescatter_contributors = 0;
+  samp->gather_prescatter_coefficients = 0;
+  if ( samp->is_gather == 0 )
+  {
+    samp->gather_prescatter_coefficient_width = samp->filter_pixel_width;
+    samp->gather_prescatter_num_contributors  = stbir__get_contributors(samp, 2);
+    samp->gather_prescatter_contributors_size = samp->gather_prescatter_num_contributors * sizeof(stbir__contributors);
+    samp->gather_prescatter_coefficients_size = samp->gather_prescatter_num_contributors * samp->gather_prescatter_coefficient_width * sizeof(float);
+  }
+}
+
+static void stbir__get_conservative_extents( stbir__sampler * samp, stbir__contributors * range, void * user_data )
+{
+  float scale = samp->scale_info.scale;
+  float out_shift = samp->scale_info.pixel_shift;
+  stbir__support_callback * support = samp->filter_support;
+  int input_full_size = samp->scale_info.input_full_size;
+  stbir_edge edge = samp->edge;
+  float inv_scale = samp->scale_info.inv_scale;
+
+  STBIR_ASSERT( samp->is_gather != 0 );
+
+  if ( samp->is_gather == 1 )
+  {
+    int in_first_pixel, in_last_pixel;
+    float out_filter_radius = support(inv_scale, user_data) * scale;
+
+    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, 0.5, out_filter_radius, inv_scale, out_shift, input_full_size, edge );
+    range->n0 = in_first_pixel;
+    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, ( (float)(samp->scale_info.output_sub_size-1) ) + 0.5f, out_filter_radius, inv_scale, out_shift, input_full_size, edge );
+    range->n1 = in_last_pixel;
+  }
+  else if ( samp->is_gather == 2 ) // downsample gather, refine
+  {
+    float in_pixels_radius = support(scale, user_data) * inv_scale;
+    int filter_pixel_margin = samp->filter_pixel_margin;
+    int output_sub_size = samp->scale_info.output_sub_size;
+    int input_end;
+    int n;
+    int in_first_pixel, in_last_pixel;
+
+    // get a conservative area of the input range
+    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, 0, 0, inv_scale, out_shift, input_full_size, edge );
+    range->n0 = in_first_pixel;
+    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, (float)output_sub_size, 0, inv_scale, out_shift, input_full_size, edge );
+    range->n1 = in_last_pixel;
+     
+    // now go through the margin to the start of area to find bottom 
+    n = range->n0 + 1;
+    input_end = -filter_pixel_margin;
+    while( n >= input_end )
+    {
+      int out_first_pixel, out_last_pixel;
+      stbir__calculate_out_pixel_range( &out_first_pixel, &out_last_pixel, ((float)n)+0.5f, in_pixels_radius, scale, out_shift, output_sub_size );
+      if ( out_first_pixel > out_last_pixel )
+        break;
+
+      if ( ( out_first_pixel < output_sub_size ) || ( out_last_pixel >= 0 ) )
+        range->n0 = n;
+      --n;
+    }
+
+    // now go through the end of the area through the margin to find top 
+    n = range->n1 - 1;
+    input_end = n + 1 + filter_pixel_margin;
+    while( n <= input_end )
+    {
+      int out_first_pixel, out_last_pixel;
+      stbir__calculate_out_pixel_range( &out_first_pixel, &out_last_pixel, ((float)n)+0.5f, in_pixels_radius, scale, out_shift, output_sub_size );
+      if ( out_first_pixel > out_last_pixel )
+        break;
+      if ( ( out_first_pixel < output_sub_size ) || ( out_last_pixel >= 0 ) )
+        range->n1 = n;
+      ++n;
+    }
+  }
+
+  if ( samp->edge == STBIR_EDGE_WRAP )
+  {
+    // if we are wrapping, and we are very close to the image size (so the edges might merge), just use the scanline up to the edge
+    if ( ( range->n0 > 0 ) && ( range->n1 >= input_full_size ) )
+    {
+      int marg = range->n1 - input_full_size + 1;
+      if ( ( marg + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= range->n0 )
+        range->n0 = 0;
+    }
+    if ( ( range->n0 < 0 ) && ( range->n1 < (input_full_size-1) ) )
+    {
+      int marg = -range->n0;
+      if ( ( input_full_size - marg - STBIR__MERGE_RUNS_PIXEL_THRESHOLD - 1 ) <= range->n1 )
+        range->n1 = input_full_size - 1;
+    }
+  }
+  else
+  {
+    // for non-edge-wrap modes, we never read over the edge, so clamp
+    if ( range->n0 < 0 )
+      range->n0 = 0;
+    if ( range->n1 >= input_full_size )
+      range->n1 = input_full_size - 1;
+  }
+}
+
+static void stbir__get_split_info( stbir__per_split_info* split_info, int splits, int output_height, int vertical_pixel_margin, int input_full_height )
+{
+  int i, cur;
+  int left = output_height;
+
+  cur = 0;
+  for( i = 0 ; i < splits ; i++ )
+  {
+    int each; 
+    split_info[i].start_output_y = cur;
+    each = left / ( splits - i );
+    split_info[i].end_output_y = cur + each;
+    cur += each;
+    left -= each;
+
+    // scatter range (updated to minimum as you run it)
+    split_info[i].start_input_y = -vertical_pixel_margin;
+    split_info[i].end_input_y = input_full_height + vertical_pixel_margin;
+  }
+}
+
+static void stbir__free_internal_mem( stbir__info *info )
+{
+  #define STBIR__FREE_AND_CLEAR( ptr ) { if ( ptr ) { void * p = (ptr); (ptr) = 0; STBIR_FREE( p, info->user_data); } }
+  
+  if ( info )
+  {
+  #ifndef STBIR__SEPARATE_ALLOCATIONS
+    STBIR__FREE_AND_CLEAR( info->alloced_mem );
+  #else
+    int i,j;
+
+    if ( ( info->vertical.gather_prescatter_contributors ) && ( (void*)info->vertical.gather_prescatter_contributors != (void*)info->split_info[0].decode_buffer ) )
+    {
+      STBIR__FREE_AND_CLEAR( info->vertical.gather_prescatter_coefficients );
+      STBIR__FREE_AND_CLEAR( info->vertical.gather_prescatter_contributors );
+    }
+    for( i = 0 ; i < info->splits ; i++ )
+    {
+      for( j = 0 ; j < info->alloc_ring_buffer_num_entries ; j++ )
+      {
+        #ifdef STBIR_SIMD8
+        if ( info->effective_channels == 3 ) 
+          --info->split_info[i].ring_buffers[j]; // avx in 3 channel mode needs one float at the start of the buffer
+        #endif  
+        STBIR__FREE_AND_CLEAR( info->split_info[i].ring_buffers[j] );
+      }
+
+      #ifdef STBIR_SIMD8
+      if ( info->effective_channels == 3 ) 
+        --info->split_info[i].decode_buffer; // avx in 3 channel mode needs one float at the start of the buffer
+      #endif  
+      STBIR__FREE_AND_CLEAR( info->split_info[i].decode_buffer );
+      STBIR__FREE_AND_CLEAR( info->split_info[i].ring_buffers );
+      STBIR__FREE_AND_CLEAR( info->split_info[i].vertical_buffer );
+    }
+    STBIR__FREE_AND_CLEAR( info->split_info );
+    if ( info->vertical.coefficients != info->horizontal.coefficients )
+    {
+      STBIR__FREE_AND_CLEAR( info->vertical.coefficients );
+      STBIR__FREE_AND_CLEAR( info->vertical.contributors );
+    }
+    STBIR__FREE_AND_CLEAR( info->horizontal.coefficients );
+    STBIR__FREE_AND_CLEAR( info->horizontal.contributors );
+    STBIR__FREE_AND_CLEAR( info->alloced_mem );
+    STBIR__FREE_AND_CLEAR( info );
+  #endif
+  }
+  
+  #undef STBIR__FREE_AND_CLEAR
+}
+
+static int stbir__get_max_split( int splits, int height )
+{
+  int i;
+  int max = 0;
+
+  for( i = 0 ; i < splits ; i++ )
+  {
+    int each = height / ( splits - i );
+    if ( each > max ) 
+      max = each;
+    height -= each;
+  }
+  return max;
+}
+
+static stbir__horizontal_gather_channels_func ** stbir__horizontal_gather_n_coeffs_funcs[8] = 
+{ 
+  0, stbir__horizontal_gather_1_channels_with_n_coeffs_funcs, stbir__horizontal_gather_2_channels_with_n_coeffs_funcs, stbir__horizontal_gather_3_channels_with_n_coeffs_funcs, stbir__horizontal_gather_4_channels_with_n_coeffs_funcs, 0,0, stbir__horizontal_gather_7_channels_with_n_coeffs_funcs
+};
+
+static stbir__horizontal_gather_channels_func ** stbir__horizontal_gather_channels_funcs[8] = 
+{ 
+  0, stbir__horizontal_gather_1_channels_funcs, stbir__horizontal_gather_2_channels_funcs, stbir__horizontal_gather_3_channels_funcs, stbir__horizontal_gather_4_channels_funcs, 0,0, stbir__horizontal_gather_7_channels_funcs
+};
+
+// there are six resize classifications: 0 == vertical scatter, 1 == vertical gather < 1x scale, 2 == vertical gather 1x-2x scale, 4 == vertical gather < 3x scale, 4 == vertical gather > 3x scale, 5 == <=4 pixel height, 6 == <=4 pixel wide column
+#define STBIR_RESIZE_CLASSIFICATIONS 8
+
+static float stbir__compute_weights[5][STBIR_RESIZE_CLASSIFICATIONS][4]=  // 5 = 0=1chan, 1=2chan, 2=3chan, 3=4chan, 4=7chan
+{
+  {
+    { 1.00000f, 1.00000f, 0.31250f, 1.00000f },
+    { 0.56250f, 0.59375f, 0.00000f, 0.96875f },
+    { 1.00000f, 0.06250f, 0.00000f, 1.00000f },
+    { 0.00000f, 0.09375f, 1.00000f, 1.00000f },
+    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },
+    { 0.03125f, 0.12500f, 1.00000f, 1.00000f },
+    { 0.06250f, 0.12500f, 0.00000f, 1.00000f },
+    { 0.00000f, 1.00000f, 0.00000f, 0.03125f },
+  }, {
+    { 0.00000f, 0.84375f, 0.00000f, 0.03125f },
+    { 0.09375f, 0.93750f, 0.00000f, 0.78125f },
+    { 0.87500f, 0.21875f, 0.00000f, 0.96875f },
+    { 0.09375f, 0.09375f, 1.00000f, 1.00000f },
+    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },
+    { 0.03125f, 0.12500f, 1.00000f, 1.00000f },
+    { 0.06250f, 0.12500f, 0.00000f, 1.00000f },
+    { 0.00000f, 1.00000f, 0.00000f, 0.53125f },
+  }, {
+    { 0.00000f, 0.53125f, 0.00000f, 0.03125f },
+    { 0.06250f, 0.96875f, 0.00000f, 0.53125f },
+    { 0.87500f, 0.18750f, 0.00000f, 0.93750f },
+    { 0.00000f, 0.09375f, 1.00000f, 1.00000f },
+    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },
+    { 0.03125f, 0.12500f, 1.00000f, 1.00000f },
+    { 0.06250f, 0.12500f, 0.00000f, 1.00000f },
+    { 0.00000f, 1.00000f, 0.00000f, 0.56250f },
+  }, {
+    { 0.00000f, 0.50000f, 0.00000f, 0.71875f },
+    { 0.06250f, 0.84375f, 0.00000f, 0.87500f },
+    { 1.00000f, 0.50000f, 0.50000f, 0.96875f },
+    { 1.00000f, 0.09375f, 0.31250f, 0.50000f },
+    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },
+    { 1.00000f, 0.03125f, 0.03125f, 0.53125f },
+    { 0.18750f, 0.12500f, 0.00000f, 1.00000f },
+    { 0.00000f, 1.00000f, 0.03125f, 0.18750f },
+  }, {
+    { 0.00000f, 0.59375f, 0.00000f, 0.96875f },
+    { 0.06250f, 0.81250f, 0.06250f, 0.59375f },
+    { 0.75000f, 0.43750f, 0.12500f, 0.96875f },
+    { 0.87500f, 0.06250f, 0.18750f, 0.43750f },
+    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },
+    { 0.15625f, 0.12500f, 1.00000f, 1.00000f },
+    { 0.06250f, 0.12500f, 0.00000f, 1.00000f },
+    { 0.00000f, 1.00000f, 0.03125f, 0.34375f },
+  }
+};
+
+// structure that allow us to query and override info for training the costs
+typedef struct STBIR__V_FIRST_INFO
+{
+  double v_cost, h_cost;
+  int control_v_first; // 0 = no control, 1 = force hori, 2 = force vert
+  int v_first;
+  int v_resize_classification;
+  int is_gather;
+} STBIR__V_FIRST_INFO;
+
+#ifdef STBIR__V_FIRST_INFO_BUFFER
+static STBIR__V_FIRST_INFO STBIR__V_FIRST_INFO_BUFFER = {0};
+#define STBIR__V_FIRST_INFO_POINTER &STBIR__V_FIRST_INFO_BUFFER
+#else
+#define STBIR__V_FIRST_INFO_POINTER 0
+#endif
+
+// Figure out whether to scale along the horizontal or vertical first.
+//   This only *super* important when you are scaling by a massively 
+//   different amount in the vertical vs the horizontal (for example, if 
+//   you are scaling by 2x in the width, and 0.5x in the height, then you 
+//   want to do the vertical scale first, because it's around 3x faster 
+//   in that order.
+//
+//   In more normal circumstances, this makes a 20-40% differences, so 
+//     it's good to get right, but not critical. The normal way that you
+//     decide which direction goes first is just figuring out which 
+//     direction does more multiplies. But with modern CPUs with their 
+//     fancy caches and SIMD and high IPC abilities, so there's just a lot
+//     more that goes into it. 
+//
+//   My handwavy sort of solution is to have an app that does a whole 
+//     bunch of timing for both vertical and horizontal first modes,
+//     and then another app that can read lots of these timing files
+//     and try to search for the best weights to use. Dotimings.c
+//     is the app that does a bunch of timings, and vf_train.c is the
+//     app that solves for the best weights (and shows how well it 
+//     does currently).
+
+static int stbir__should_do_vertical_first( float weights_table[STBIR_RESIZE_CLASSIFICATIONS][4], int horizontal_filter_pixel_width, float horizontal_scale, int horizontal_output_size, int vertical_filter_pixel_width, float vertical_scale, int vertical_output_size, int is_gather, STBIR__V_FIRST_INFO * info )    
+{
+  double v_cost, h_cost;
+  float * weights;
+  int vertical_first;
+  int v_classification;
+
+  // categorize the resize into buckets
+  if ( ( vertical_output_size <= 4 ) || ( horizontal_output_size <= 4 ) )
+    v_classification = ( vertical_output_size < horizontal_output_size ) ? 6 : 7;
+  else if ( vertical_scale <= 1.0f )
+    v_classification = ( is_gather ) ? 1 : 0;
+  else if ( vertical_scale <= 2.0f) 
+    v_classification = 2;
+  else if ( vertical_scale <= 3.0f) 
+    v_classification = 3;
+  else if ( vertical_scale <= 4.0f) 
+    v_classification = 5;
+  else 
+    v_classification = 6;
+  
+  // use the right weights
+  weights = weights_table[ v_classification ];
+
+  // this is the costs when you don't take into account modern CPUs with high ipc and simd and caches - wish we had a better estimate
+  h_cost = (float)horizontal_filter_pixel_width * weights[0] + horizontal_scale * (float)vertical_filter_pixel_width * weights[1];
+  v_cost = (float)vertical_filter_pixel_width  * weights[2] + vertical_scale * (float)horizontal_filter_pixel_width * weights[3];
+
+  // use computation estimate to decide vertical first or not
+  vertical_first = ( v_cost <= h_cost ) ? 1 : 0;
+
+  // save these, if requested
+  if ( info )
+  {
+    info->h_cost = h_cost;
+    info->v_cost = v_cost;
+    info->v_resize_classification = v_classification;
+    info->v_first = vertical_first;
+    info->is_gather = is_gather;
+  }
+
+  // and this allows us to override everything for testing (see dotiming.c) 
+  if ( ( info ) && ( info->control_v_first ) ) 
+    vertical_first = ( info->control_v_first == 2 ) ? 1 : 0;
+  
+  return vertical_first;
+}
+
+// layout lookups - must match stbir_internal_pixel_layout
+static unsigned char stbir__pixel_channels[] = {
+  1,2,3,3,4,   // 1ch, 2ch, rgb, bgr, 4ch
+  4,4,4,4,2,2, // RGBA,BGRA,ARGB,ABGR,RA,AR
+  4,4,4,4,2,2, // RGBA_PM,BGRA_PM,ARGB_PM,ABGR_PM,RA_PM,AR_PM
+};
+
+// the internal pixel layout enums are in a different order, so we can easily do range comparisons of types
+//   the public pixel layout is ordered in a way that if you cast num_channels (1-4) to the enum, you get something sensible 
+static stbir_internal_pixel_layout stbir__pixel_layout_convert_public_to_internal[] = {
+  STBIRI_BGR, STBIRI_1CHANNEL, STBIRI_2CHANNEL, STBIRI_RGB, STBIRI_RGBA, 
+  STBIRI_4CHANNEL, STBIRI_BGRA, STBIRI_ARGB, STBIRI_ABGR, STBIRI_RA, STBIRI_AR,
+  STBIRI_RGBA_PM, STBIRI_BGRA_PM, STBIRI_ARGB_PM, STBIRI_ABGR_PM, STBIRI_RA_PM, STBIRI_AR_PM,
+};
+
+static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sampler * horizontal, stbir__sampler * vertical, stbir__contributors * conservative, stbir_pixel_layout input_pixel_layout_public, stbir_pixel_layout output_pixel_layout_public, int splits, int new_x, int new_y, int fast_alpha, void * user_data STBIR_ONLY_PROFILE_BUILD_GET_INFO )
+{
+  static char stbir_channel_count_index[8]={ 9,0,1,2, 3,9,9,4 };
+
+  stbir__info * info = 0;
+  void * alloced = 0;
+  int alloced_total = 0;
+  int vertical_first;
+  int decode_buffer_size, ring_buffer_length_bytes, ring_buffer_size, vertical_buffer_size, alloc_ring_buffer_num_entries;
+
+  int alpha_weighting_type = 0; // 0=none, 1=simple, 2=fancy
+  int conservative_split_output_size = stbir__get_max_split( splits, vertical->scale_info.output_sub_size ); 
+  stbir_internal_pixel_layout input_pixel_layout = stbir__pixel_layout_convert_public_to_internal[ input_pixel_layout_public ];  
+  stbir_internal_pixel_layout output_pixel_layout = stbir__pixel_layout_convert_public_to_internal[ output_pixel_layout_public ];
+  int channels = stbir__pixel_channels[ input_pixel_layout ];      
+  int effective_channels = channels;
+  
+  // first figure out what type of alpha weighting to use (if any)
+  if ( ( horizontal->filter_enum != STBIR_FILTER_POINT_SAMPLE ) || ( vertical->filter_enum != STBIR_FILTER_POINT_SAMPLE ) ) // no alpha weighting on point sampling
+  {
+    if ( ( input_pixel_layout >= STBIRI_RGBA ) && ( input_pixel_layout <= STBIRI_AR ) && ( output_pixel_layout >= STBIRI_RGBA ) && ( output_pixel_layout <= STBIRI_AR ) )
+    {
+      if ( fast_alpha )
+      {
+        alpha_weighting_type = 4;
+      }
+      else
+      {
+        static int fancy_alpha_effective_cnts[6] = { 7, 7, 7, 7, 3, 3 };
+        alpha_weighting_type = 2;
+        effective_channels = fancy_alpha_effective_cnts[ input_pixel_layout - STBIRI_RGBA ];
+      }
+    }
+    else if ( ( input_pixel_layout >= STBIRI_RGBA_PM ) && ( input_pixel_layout <= STBIRI_AR_PM ) && ( output_pixel_layout >= STBIRI_RGBA ) && ( output_pixel_layout <= STBIRI_AR ) )
+    {
+      // input premult, output non-premult
+      alpha_weighting_type = 3;
+    }
+    else if ( ( input_pixel_layout >= STBIRI_RGBA ) && ( input_pixel_layout <= STBIRI_AR ) && ( output_pixel_layout >= STBIRI_RGBA_PM ) && ( output_pixel_layout <= STBIRI_AR_PM ) )
+    {
+      // input non-premult, output premult
+      alpha_weighting_type = 1;
+    }
+  }
+
+  // channel in and out count must match currently
+  if ( channels != stbir__pixel_channels[ output_pixel_layout ] )
+    return 0;
+
+  // get vertical first
+  vertical_first = stbir__should_do_vertical_first( stbir__compute_weights[ (int)stbir_channel_count_index[ effective_channels ] ], horizontal->filter_pixel_width, horizontal->scale_info.scale, horizontal->scale_info.output_sub_size, vertical->filter_pixel_width, vertical->scale_info.scale, vertical->scale_info.output_sub_size, vertical->is_gather, STBIR__V_FIRST_INFO_POINTER );
+
+  // sometimes read one float off in some of the unrolled loops (with a weight of zero coeff, so it doesn't have an effect)
+  decode_buffer_size = ( conservative->n1 - conservative->n0 + 1 ) * effective_channels * sizeof(float) + sizeof(float); // extra float for padding
+  
+#if defined( STBIR__SEPARATE_ALLOCATIONS ) && defined(STBIR_SIMD8)
+  if ( effective_channels == 3 )
+    decode_buffer_size += sizeof(float); // avx in 3 channel mode needs one float at the start of the buffer (only with separate allocations)
+#endif  
+
+  ring_buffer_length_bytes = horizontal->scale_info.output_sub_size * effective_channels * sizeof(float) + sizeof(float); // extra float for padding
+
+  // if we do vertical first, the ring buffer holds a whole decoded line
+  if ( vertical_first )
+    ring_buffer_length_bytes = ( decode_buffer_size + 15 ) & ~15;
+
+  if ( ( ring_buffer_length_bytes & 4095 ) == 0 ) ring_buffer_length_bytes += 64*3; // avoid 4k alias
+
+  // One extra entry because floating point precision problems sometimes cause an extra to be necessary.
+  alloc_ring_buffer_num_entries = vertical->filter_pixel_width + 1;
+
+  // we never need more ring buffer entries than the scanlines we're outputting when in scatter mode
+  if ( ( !vertical->is_gather ) && ( alloc_ring_buffer_num_entries > conservative_split_output_size ) )
+    alloc_ring_buffer_num_entries = conservative_split_output_size;
+
+  ring_buffer_size = alloc_ring_buffer_num_entries * ring_buffer_length_bytes;
+
+  // The vertical buffer is used differently, depending on whether we are scattering
+  //   the vertical scanlines, or gathering them.
+  //   If scattering, it's used at the temp buffer to accumulate each output.
+  //   If gathering, it's just the output buffer.
+  vertical_buffer_size = horizontal->scale_info.output_sub_size * effective_channels * sizeof(float) + sizeof(float);  // extra float for padding
+
+  // we make two passes through this loop, 1st to add everything up, 2nd to allocate and init
+  for(;;)
+  {
+    int i;
+    void * advance_mem = alloced;
+    int copy_horizontal = 0;
+    stbir__sampler * possibly_use_horizontal_for_pivot = 0;
+
+#ifdef STBIR__SEPARATE_ALLOCATIONS
+    #define STBIR__NEXT_PTR( ptr, size, ntype ) if ( alloced ) { void * p = STBIR_MALLOC( size, user_data); if ( p == 0 ) { stbir__free_internal_mem( info ); return 0; } (ptr) = (ntype*)p; }
+#else
+    #define STBIR__NEXT_PTR( ptr, size, ntype ) advance_mem = (void*) ( ( ((size_t)advance_mem) + 15 ) & ~15 ); if ( alloced ) ptr = (ntype*)advance_mem; advance_mem = ((char*)advance_mem) + (size);
+#endif
+
+    STBIR__NEXT_PTR( info, sizeof( stbir__info ), stbir__info );      
+
+    STBIR__NEXT_PTR( info->split_info, sizeof( stbir__per_split_info ) * splits, stbir__per_split_info );      
+
+    if ( info )
+    {
+      static stbir__alpha_weight_func * fancy_alpha_weights[6]  =    { stbir__fancy_alpha_weight_4ch,   stbir__fancy_alpha_weight_4ch,   stbir__fancy_alpha_weight_4ch,   stbir__fancy_alpha_weight_4ch,   stbir__fancy_alpha_weight_2ch,   stbir__fancy_alpha_weight_2ch };
+      static stbir__alpha_unweight_func * fancy_alpha_unweights[6] = { stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_2ch, stbir__fancy_alpha_unweight_2ch };
+      static stbir__alpha_weight_func * simple_alpha_weights[6] = { stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_2ch, stbir__simple_alpha_weight_2ch };
+      static stbir__alpha_unweight_func * simple_alpha_unweights[6] = { stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_2ch, stbir__simple_alpha_unweight_2ch };
+
+      // initialize info fields
+      info->alloced_mem = alloced;
+      info->alloced_total = alloced_total;
+
+      info->channels = channels;
+      info->effective_channels = effective_channels;
+  
+      info->offset_x = new_x;
+      info->offset_y = new_y;
+      info->alloc_ring_buffer_num_entries = alloc_ring_buffer_num_entries;
+      info->ring_buffer_num_entries = 0;  
+      info->ring_buffer_length_bytes = ring_buffer_length_bytes;
+      info->splits = splits;
+      info->vertical_first = vertical_first;
+
+      info->input_pixel_layout_internal = input_pixel_layout;  
+      info->output_pixel_layout_internal = output_pixel_layout;
+
+      // setup alpha weight functions
+      info->alpha_weight = 0;
+      info->alpha_unweight = 0;
+    
+      // handle alpha weighting functions and overrides
+      if ( alpha_weighting_type == 2 )
+      {
+        // high quality alpha multiplying on the way in, dividing on the way out
+        info->alpha_weight = fancy_alpha_weights[ input_pixel_layout - STBIRI_RGBA ];  
+        info->alpha_unweight = fancy_alpha_unweights[ output_pixel_layout - STBIRI_RGBA ];
+      }
+      else if ( alpha_weighting_type == 4 )
+      {
+        // fast alpha multiplying on the way in, dividing on the way out
+        info->alpha_weight = simple_alpha_weights[ input_pixel_layout - STBIRI_RGBA ];  
+        info->alpha_unweight = simple_alpha_unweights[ output_pixel_layout - STBIRI_RGBA ];
+      }
+      else if ( alpha_weighting_type == 1 )
+      {
+        // fast alpha on the way in, leave in premultiplied form on way out
+        info->alpha_weight = simple_alpha_weights[ input_pixel_layout - STBIRI_RGBA ]; 
+      }
+      else if ( alpha_weighting_type == 3 )
+      {
+        // incoming is premultiplied, fast alpha dividing on the way out - non-premultiplied output
+        info->alpha_unweight = simple_alpha_unweights[ output_pixel_layout - STBIRI_RGBA ];
+      }
+
+      // handle 3-chan color flipping, using the alpha weight path
+      if ( ( ( input_pixel_layout == STBIRI_RGB ) && ( output_pixel_layout == STBIRI_BGR ) ) ||
+           ( ( input_pixel_layout == STBIRI_BGR ) && ( output_pixel_layout == STBIRI_RGB ) ) )
+      {
+        // do the flipping on the smaller of the two ends
+        if ( horizontal->scale_info.scale < 1.0f )
+          info->alpha_unweight = stbir__simple_flip_3ch;
+        else
+          info->alpha_weight = stbir__simple_flip_3ch;
+      }
+
+    }        
+
+    // get all the per-split buffers
+    for( i = 0 ; i < splits ; i++ )
+    {
+      STBIR__NEXT_PTR( info->split_info[i].decode_buffer, decode_buffer_size, float );
+
+#ifdef STBIR__SEPARATE_ALLOCATIONS
+
+      #ifdef STBIR_SIMD8
+      if ( ( info ) && ( effective_channels == 3 ) )
+        ++info->split_info[i].decode_buffer; // avx in 3 channel mode needs one float at the start of the buffer
+      #endif  
+
+      STBIR__NEXT_PTR( info->split_info[i].ring_buffers, alloc_ring_buffer_num_entries * sizeof(float*), float* );
+      {
+        int j;
+        for( j = 0 ; j < alloc_ring_buffer_num_entries ; j++ )
+        {
+          STBIR__NEXT_PTR( info->split_info[i].ring_buffers[j], ring_buffer_length_bytes, float );
+          #ifdef STBIR_SIMD8
+          if ( ( info ) && ( effective_channels == 3 ) )
+            ++info->split_info[i].ring_buffers[j]; // avx in 3 channel mode needs one float at the start of the buffer
+          #endif  
+        }
+      }
+#else
+      STBIR__NEXT_PTR( info->split_info[i].ring_buffer, ring_buffer_size, float );
+#endif
+      STBIR__NEXT_PTR( info->split_info[i].vertical_buffer, vertical_buffer_size, float );
+    }
+
+    // alloc memory for to-be-pivoted coeffs (if necessary)
+    if ( vertical->is_gather == 0 )
+    {
+      int both;
+      int temp_mem_amt;
+
+      // when in vertical scatter mode, we first build the coefficients in gather mode, and then pivot after,
+      //   that means we need two buffers, so we try to use the decode buffer and ring buffer for this. if that
+      //   is too small, we just allocate extra memory to use as this temp.
+
+      both = vertical->gather_prescatter_contributors_size + vertical->gather_prescatter_coefficients_size;
+
+#ifdef STBIR__SEPARATE_ALLOCATIONS
+      temp_mem_amt = decode_buffer_size;
+#else
+      temp_mem_amt = ( decode_buffer_size + ring_buffer_size + vertical_buffer_size ) * splits;
+#endif
+      if ( temp_mem_amt >= both )
+      {
+        if ( info ) 
+        { 
+          vertical->gather_prescatter_contributors = (stbir__contributors*)info->split_info[0].decode_buffer; 
+          vertical->gather_prescatter_coefficients = (float*) ( ( (char*)info->split_info[0].decode_buffer ) + vertical->gather_prescatter_contributors_size ); 
+        }
+      }
+      else
+      {
+        // ring+decode memory is too small, so allocate temp memory
+        STBIR__NEXT_PTR( vertical->gather_prescatter_contributors, vertical->gather_prescatter_contributors_size, stbir__contributors );
+        STBIR__NEXT_PTR( vertical->gather_prescatter_coefficients, vertical->gather_prescatter_coefficients_size, float );
+      }
+    }
+
+    STBIR__NEXT_PTR( horizontal->contributors, horizontal->contributors_size, stbir__contributors );
+    STBIR__NEXT_PTR( horizontal->coefficients, horizontal->coefficients_size, float );
+
+    // are the two filters identical?? (happens a lot with mipmap generation)
+    if ( ( horizontal->filter_kernel == vertical->filter_kernel ) && ( horizontal->filter_support == vertical->filter_support ) && ( horizontal->edge == vertical->edge ) && ( horizontal->scale_info.output_sub_size == vertical->scale_info.output_sub_size ) )
+    {
+      float diff_scale = horizontal->scale_info.scale - vertical->scale_info.scale;
+      float diff_shift = horizontal->scale_info.pixel_shift - vertical->scale_info.pixel_shift;
+      if ( diff_scale < 0.0f ) diff_scale = -diff_scale;
+      if ( diff_shift < 0.0f ) diff_shift = -diff_shift;
+      if ( ( diff_scale <= stbir__small_float ) && ( diff_shift <= stbir__small_float ) )
+      {
+        if ( horizontal->is_gather == vertical->is_gather ) 
+        {
+          copy_horizontal = 1;
+          goto no_vert_alloc;
+        }
+        // everything matches, but vertical is scatter, horizontal is gather, use horizontal coeffs for vertical pivot coeffs
+        possibly_use_horizontal_for_pivot = horizontal;
+      }
+    }
+
+    STBIR__NEXT_PTR( vertical->contributors, vertical->contributors_size, stbir__contributors );
+    STBIR__NEXT_PTR( vertical->coefficients, vertical->coefficients_size, float );
+
+   no_vert_alloc:
+
+    if ( info )
+    {
+      STBIR_PROFILE_BUILD_START( horizontal );
+
+      stbir__calculate_filters( horizontal, 0, user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO );
+
+      // setup the horizontal gather functions
+      // start with defaulting to the n_coeffs functions (specialized on channels and remnant leftover)
+      info->horizontal_gather_channels = stbir__horizontal_gather_n_coeffs_funcs[ effective_channels ][ horizontal->extent_info.widest & 3 ];
+      // but if the number of coeffs <= 12, use another set of special cases. <=12 coeffs is any enlarging resize, or shrinking resize down to about 1/3 size
+      if ( horizontal->extent_info.widest <= 12 )
+        info->horizontal_gather_channels = stbir__horizontal_gather_channels_funcs[ effective_channels ][ horizontal->extent_info.widest - 1 ];
+      
+      info->scanline_extents.conservative.n0 = conservative->n0;
+      info->scanline_extents.conservative.n1 = conservative->n1;
+      
+      // get exact extents
+      stbir__get_extents( horizontal, &info->scanline_extents );
+
+      // pack the horizontal coeffs
+      horizontal->coefficient_width = stbir__pack_coefficients(horizontal->num_contributors, horizontal->contributors, horizontal->coefficients, horizontal->coefficient_width, horizontal->extent_info.widest, info->scanline_extents.conservative.n1 + 1 );
+      
+      STBIR_MEMCPY( &info->horizontal, horizontal, sizeof( stbir__sampler ) );
+
+      STBIR_PROFILE_BUILD_END( horizontal );
+
+      if ( copy_horizontal )
+      {
+        STBIR_MEMCPY( &info->vertical, horizontal, sizeof( stbir__sampler ) );
+      }
+      else
+      {
+        STBIR_PROFILE_BUILD_START( vertical );
+
+        stbir__calculate_filters( vertical, possibly_use_horizontal_for_pivot, user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO );
+        STBIR_MEMCPY( &info->vertical, vertical, sizeof( stbir__sampler ) );
+
+        STBIR_PROFILE_BUILD_END( vertical );
+      }
+
+      // setup the vertical split ranges
+      stbir__get_split_info( info->split_info, info->splits, info->vertical.scale_info.output_sub_size, info->vertical.filter_pixel_margin, info->vertical.scale_info.input_full_size );
+
+      // now we know precisely how many entries we need
+      info->ring_buffer_num_entries = info->vertical.extent_info.widest;
+
+      // we never need more ring buffer entries than the scanlines we're outputting
+      if ( ( !info->vertical.is_gather ) && ( info->ring_buffer_num_entries > conservative_split_output_size ) )
+        info->ring_buffer_num_entries = conservative_split_output_size;
+      STBIR_ASSERT( info->ring_buffer_num_entries <= info->alloc_ring_buffer_num_entries );
+
+      // a few of the horizontal gather functions read one dword past the end (but mask it out), so put in a normal value so no snans or denormals accidentally sneak in
+      for( i = 0 ; i < splits ; i++ )
+      {
+        int width, ofs;
+        
+        // find the right most span
+        if ( info->scanline_extents.spans[0].n1 > info->scanline_extents.spans[1].n1 )
+          width = info->scanline_extents.spans[0].n1 - info->scanline_extents.spans[0].n0;
+        else
+          width = info->scanline_extents.spans[1].n1 - info->scanline_extents.spans[1].n0;
+        
+        // this calc finds the exact end of the decoded scanline for all filter modes.
+        //   usually this is just the width * effective channels.  But we have to account 
+        //   for the area to the left of the scanline for wrap filtering and alignment, this 
+        //   is stored as a negative value in info->scanline_extents.conservative.n0. Next,
+        //   we need to skip the exact size of the right hand size filter area (again for
+        //   wrap mode), this is in info->scanline_extents.edge_sizes[1]).
+        ofs = ( width + 1 - info->scanline_extents.conservative.n0 + info->scanline_extents.edge_sizes[1] ) * effective_channels;
+        
+        // place a known, but numerically valid value in the decode buffer
+        info->split_info[i].decode_buffer[ ofs ] = 9999.0f;
+
+        // if vertical filtering first, place a known, but numerically valid value in the all
+        //   of the ring buffer accumulators
+        if ( vertical_first )
+        {
+          int j;  
+          for( j = 0; j < info->ring_buffer_num_entries ; j++ )
+          {
+            stbir__get_ring_buffer_entry( info, info->split_info + i, j )[ ofs ] = 9999.0f;
+          }
+        }
+      }
+    }
+
+    #undef STBIR__NEXT_PTR
+
+
+    // is this the first time through loop?
+    if ( info == 0 )
+    {
+      alloced_total = (int) ( 15 + (size_t)advance_mem );
+      alloced = STBIR_MALLOC( alloced_total, user_data );
+      if ( alloced == 0 )
+        return 0;
+    }
+    else
+      return info;  // success
+  }
+}
+
+static int stbir__perform_resize( stbir__info const * info, int split_start, int split_count ) 
+{
+  stbir__per_split_info * split_info = info->split_info + split_start;
+
+  STBIR_PROFILE_CLEAR_EXTRAS();
+
+  STBIR_PROFILE_FIRST_START( looping );
+  if (info->vertical.is_gather)
+    stbir__vertical_gather_loop( info, split_info, split_count );
+  else
+    stbir__vertical_scatter_loop( info, split_info, split_count );
+  STBIR_PROFILE_END( looping );
+
+  return 1;
+}
+
+static void stbir__update_info_from_resize( stbir__info * info, STBIR_RESIZE * resize )
+{
+  static stbir__decode_pixels_func * decode_simple[STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
+  {
+    /* 1ch-4ch */ stbir__decode_uint8_srgb, stbir__decode_uint8_srgb, 0, stbir__decode_float_linear, stbir__decode_half_float_linear, 
+  };
+
+  static stbir__decode_pixels_func * decode_alphas[STBIRI_AR-STBIRI_RGBA+1][STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
+  {
+    { /* RGBA */ stbir__decode_uint8_srgb4_linearalpha,      stbir__decode_uint8_srgb,      0, stbir__decode_float_linear,      stbir__decode_half_float_linear },
+    { /* BGRA */ stbir__decode_uint8_srgb4_linearalpha_BGRA, stbir__decode_uint8_srgb_BGRA, 0, stbir__decode_float_linear_BGRA, stbir__decode_half_float_linear_BGRA },
+    { /* ARGB */ stbir__decode_uint8_srgb4_linearalpha_ARGB, stbir__decode_uint8_srgb_ARGB, 0, stbir__decode_float_linear_ARGB, stbir__decode_half_float_linear_ARGB },
+    { /* ABGR */ stbir__decode_uint8_srgb4_linearalpha_ABGR, stbir__decode_uint8_srgb_ABGR, 0, stbir__decode_float_linear_ABGR, stbir__decode_half_float_linear_ABGR },
+    { /* RA   */ stbir__decode_uint8_srgb2_linearalpha,      stbir__decode_uint8_srgb,      0, stbir__decode_float_linear,      stbir__decode_half_float_linear },
+    { /* AR   */ stbir__decode_uint8_srgb2_linearalpha_AR,   stbir__decode_uint8_srgb_AR,   0, stbir__decode_float_linear_AR,   stbir__decode_half_float_linear_AR },
+  };
+
+  static stbir__decode_pixels_func * decode_simple_scaled_or_not[2][2]=
+  {
+    { stbir__decode_uint8_linear_scaled,  stbir__decode_uint8_linear }, { stbir__decode_uint16_linear_scaled, stbir__decode_uint16_linear },
+  };
+
+  static stbir__decode_pixels_func * decode_alphas_scaled_or_not[STBIRI_AR-STBIRI_RGBA+1][2][2]=
+  {
+    { /* RGBA */ { stbir__decode_uint8_linear_scaled,       stbir__decode_uint8_linear },      { stbir__decode_uint16_linear_scaled,      stbir__decode_uint16_linear } },
+    { /* BGRA */ { stbir__decode_uint8_linear_scaled_BGRA,  stbir__decode_uint8_linear_BGRA }, { stbir__decode_uint16_linear_scaled_BGRA, stbir__decode_uint16_linear_BGRA } },
+    { /* ARGB */ { stbir__decode_uint8_linear_scaled_ARGB,  stbir__decode_uint8_linear_ARGB }, { stbir__decode_uint16_linear_scaled_ARGB, stbir__decode_uint16_linear_ARGB } },
+    { /* ABGR */ { stbir__decode_uint8_linear_scaled_ABGR,  stbir__decode_uint8_linear_ABGR }, { stbir__decode_uint16_linear_scaled_ABGR, stbir__decode_uint16_linear_ABGR } },
+    { /* RA   */ { stbir__decode_uint8_linear_scaled,       stbir__decode_uint8_linear },      { stbir__decode_uint16_linear_scaled,      stbir__decode_uint16_linear } },
+    { /* AR   */ { stbir__decode_uint8_linear_scaled_AR,    stbir__decode_uint8_linear_AR },   { stbir__decode_uint16_linear_scaled_AR,   stbir__decode_uint16_linear_AR } }
+  };
+
+  static stbir__encode_pixels_func * encode_simple[STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
+  {
+    /* 1ch-4ch */ stbir__encode_uint8_srgb, stbir__encode_uint8_srgb, 0, stbir__encode_float_linear, stbir__encode_half_float_linear,
+  };
+
+  static stbir__encode_pixels_func * encode_alphas[STBIRI_AR-STBIRI_RGBA+1][STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
+  {
+    { /* RGBA */ stbir__encode_uint8_srgb4_linearalpha,      stbir__encode_uint8_srgb,      0, stbir__encode_float_linear,      stbir__encode_half_float_linear },
+    { /* BGRA */ stbir__encode_uint8_srgb4_linearalpha_BGRA, stbir__encode_uint8_srgb_BGRA, 0, stbir__encode_float_linear_BGRA, stbir__encode_half_float_linear_BGRA },
+    { /* ARGB */ stbir__encode_uint8_srgb4_linearalpha_ARGB, stbir__encode_uint8_srgb_ARGB, 0, stbir__encode_float_linear_ARGB, stbir__encode_half_float_linear_ARGB },
+    { /* ABGR */ stbir__encode_uint8_srgb4_linearalpha_ABGR, stbir__encode_uint8_srgb_ABGR, 0, stbir__encode_float_linear_ABGR, stbir__encode_half_float_linear_ABGR },
+    { /* RA   */ stbir__encode_uint8_srgb2_linearalpha,      stbir__encode_uint8_srgb,      0, stbir__encode_float_linear,      stbir__encode_half_float_linear },
+    { /* AR   */ stbir__encode_uint8_srgb2_linearalpha_AR,   stbir__encode_uint8_srgb_AR,   0, stbir__encode_float_linear_AR,   stbir__encode_half_float_linear_AR }
+  };
+
+  static stbir__encode_pixels_func * encode_simple_scaled_or_not[2][2]=
+  {
+    { stbir__encode_uint8_linear_scaled,  stbir__encode_uint8_linear }, { stbir__encode_uint16_linear_scaled, stbir__encode_uint16_linear },
+  };
+
+  static stbir__encode_pixels_func * encode_alphas_scaled_or_not[STBIRI_AR-STBIRI_RGBA+1][2][2]=
+  {
+    { /* RGBA */ { stbir__encode_uint8_linear_scaled,       stbir__encode_uint8_linear },       { stbir__encode_uint16_linear_scaled,      stbir__encode_uint16_linear } },
+    { /* BGRA */ { stbir__encode_uint8_linear_scaled_BGRA,  stbir__encode_uint8_linear_BGRA },  { stbir__encode_uint16_linear_scaled_BGRA, stbir__encode_uint16_linear_BGRA } },
+    { /* ARGB */ { stbir__encode_uint8_linear_scaled_ARGB,  stbir__encode_uint8_linear_ARGB },  { stbir__encode_uint16_linear_scaled_ARGB, stbir__encode_uint16_linear_ARGB } },
+    { /* ABGR */ { stbir__encode_uint8_linear_scaled_ABGR,  stbir__encode_uint8_linear_ABGR },  { stbir__encode_uint16_linear_scaled_ABGR, stbir__encode_uint16_linear_ABGR } },
+    { /* RA   */ { stbir__encode_uint8_linear_scaled,       stbir__encode_uint8_linear },       { stbir__encode_uint16_linear_scaled,      stbir__encode_uint16_linear } },
+    { /* AR   */ { stbir__encode_uint8_linear_scaled_AR,    stbir__encode_uint8_linear_AR },    { stbir__encode_uint16_linear_scaled_AR,   stbir__encode_uint16_linear_AR } }
+  };
+
+  stbir__decode_pixels_func * decode_pixels = 0;
+  stbir__encode_pixels_func * encode_pixels = 0;
+  stbir_datatype input_type, output_type;
+
+  input_type = resize->input_data_type;
+  output_type = resize->output_data_type; 
+  info->input_data = resize->input_pixels;
+  info->input_stride_bytes = resize->input_stride_in_bytes;
+  info->output_stride_bytes = resize->output_stride_in_bytes;
+
+  // if we're completely point sampling, then we can turn off SRGB
+  if ( ( info->horizontal.filter_enum == STBIR_FILTER_POINT_SAMPLE ) && ( info->vertical.filter_enum == STBIR_FILTER_POINT_SAMPLE ) )
+  {
+    if ( ( ( input_type  == STBIR_TYPE_UINT8_SRGB ) || ( input_type  == STBIR_TYPE_UINT8_SRGB_ALPHA ) ) && 
+         ( ( output_type == STBIR_TYPE_UINT8_SRGB ) || ( output_type == STBIR_TYPE_UINT8_SRGB_ALPHA ) ) )
+    {
+      input_type = STBIR_TYPE_UINT8;
+      output_type = STBIR_TYPE_UINT8;
+    }
+  }
+
+  // recalc the output and input strides  
+  if ( info->input_stride_bytes == 0 )
+    info->input_stride_bytes = info->channels * info->horizontal.scale_info.input_full_size * stbir__type_size[input_type];
+
+  if ( info->output_stride_bytes == 0 )
+    info->output_stride_bytes = info->channels * info->horizontal.scale_info.output_sub_size * stbir__type_size[output_type];
+
+  // calc offset
+  info->output_data = ( (char*) resize->output_pixels ) + ( (ptrdiff_t) info->offset_y * (ptrdiff_t) resize->output_stride_in_bytes ) + ( info->offset_x * info->channels * stbir__type_size[output_type] );
+
+  info->in_pixels_cb = resize->input_cb;
+  info->user_data = resize->user_data;
+  info->out_pixels_cb = resize->output_cb;
+
+  // setup the input format converters
+  if ( ( input_type == STBIR_TYPE_UINT8 ) || ( input_type == STBIR_TYPE_UINT16 ) )
+  {
+    int non_scaled = 0;
+
+    // check if we can run unscaled - 0-255.0/0-65535.0 instead of 0-1.0 (which is a tiny bit faster when doing linear 8->8 or 16->16)
+    if ( ( !info->alpha_weight ) && ( !info->alpha_unweight )  ) // don't short circuit when alpha weighting (get everything to 0-1.0 as usual)
+      if ( ( ( input_type == STBIR_TYPE_UINT8 ) && ( output_type == STBIR_TYPE_UINT8 ) ) || ( ( input_type == STBIR_TYPE_UINT16 ) && ( output_type == STBIR_TYPE_UINT16 ) ) )
+        non_scaled = 1;
+
+    if ( info->input_pixel_layout_internal <= STBIRI_4CHANNEL )
+      decode_pixels = decode_simple_scaled_or_not[ input_type == STBIR_TYPE_UINT16 ][ non_scaled ];
+    else
+      decode_pixels = decode_alphas_scaled_or_not[ ( info->input_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ input_type == STBIR_TYPE_UINT16 ][ non_scaled ];
+  }
+  else
+  {
+    if ( info->input_pixel_layout_internal <= STBIRI_4CHANNEL )
+      decode_pixels = decode_simple[ input_type - STBIR_TYPE_UINT8_SRGB ];
+    else
+      decode_pixels = decode_alphas[ ( info->input_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ input_type - STBIR_TYPE_UINT8_SRGB ];
+  }
+
+  // setup the output format converters
+  if ( ( output_type == STBIR_TYPE_UINT8 ) || ( output_type == STBIR_TYPE_UINT16 ) )
+  {
+    int non_scaled = 0;
+    
+    // check if we can run unscaled - 0-255.0/0-65535.0 instead of 0-1.0 (which is a tiny bit faster when doing linear 8->8 or 16->16)
+    if ( ( !info->alpha_weight ) && ( !info->alpha_unweight ) ) // don't short circuit when alpha weighting (get everything to 0-1.0 as usual)
+      if ( ( ( input_type == STBIR_TYPE_UINT8 ) && ( output_type == STBIR_TYPE_UINT8 ) ) || ( ( input_type == STBIR_TYPE_UINT16 ) && ( output_type == STBIR_TYPE_UINT16 ) ) )
+        non_scaled = 1;
+
+    if ( info->output_pixel_layout_internal <= STBIRI_4CHANNEL )
+      encode_pixels = encode_simple_scaled_or_not[ output_type == STBIR_TYPE_UINT16 ][ non_scaled ];
+    else
+      encode_pixels = encode_alphas_scaled_or_not[ ( info->output_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ output_type == STBIR_TYPE_UINT16 ][ non_scaled ];
+  }
+  else
+  {
+    if ( info->output_pixel_layout_internal <= STBIRI_4CHANNEL )
+      encode_pixels = encode_simple[ output_type - STBIR_TYPE_UINT8_SRGB ];
+    else
+      encode_pixels = encode_alphas[ ( info->output_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ output_type - STBIR_TYPE_UINT8_SRGB ];
+  }
+
+  info->input_type = input_type;
+  info->output_type = output_type; 
+  info->decode_pixels = decode_pixels;
+  info->encode_pixels = encode_pixels; 
+}
+
+static void stbir__clip( int * outx, int * outsubw, int outw, double * u0, double * u1 )
+{
+  double per, adj;
+  int over;
+  
+  // do left/top edge
+  if ( *outx < 0 )
+  {
+    per = ( (double)*outx ) / ( (double)*outsubw ); // is negative
+    adj = per * ( *u1 - *u0 );
+    *u0 -= adj; // increases u0
+    *outx = 0;
+  }
+
+  // do right/bot edge
+  over = outw - ( *outx + *outsubw );
+  if ( over < 0 )
+  {
+    per = ( (double)over ) / ( (double)*outsubw ); // is negative
+    adj = per * ( *u1 - *u0 );
+    *u1 += adj; // decrease u1
+    *outsubw = outw - *outx;
+  }
+}    
+
+// converts a double to a rational that has less than one float bit of error (returns 0 if unable to do so)
+static int stbir__double_to_rational(double f, stbir_uint32 limit, stbir_uint32 *numer, stbir_uint32 *denom, int limit_denom ) // limit_denom (1) or limit numer (0)
+{
+  double err;
+  stbir_uint64 top, bot;
+  stbir_uint64 numer_last = 0;
+  stbir_uint64 denom_last = 1;
+  stbir_uint64 numer_estimate = 1;
+  stbir_uint64 denom_estimate = 0;
+
+  // scale to past float error range
+  top = (stbir_uint64)( f * (double)(1 << 25) );
+  bot = 1 << 25;
+
+  // keep refining, but usually stops in a few loops - usually 5 for bad cases
+  for(;;)  
+  {
+    stbir_uint64 est, temp;
+
+    // hit limit, break out and do best full range estimate
+    if ( ( ( limit_denom ) ? denom_estimate : numer_estimate ) >= limit )
+      break;
+
+    // is the current error less than 1 bit of a float? if so, we're done
+    if ( denom_estimate )
+    {
+      err = ( (double)numer_estimate / (double)denom_estimate ) - f;
+      if ( err < 0.0 ) err = -err;
+      if ( err < ( 1.0 / (double)(1<<24) ) )
+      {
+        // yup, found it
+        *numer = (stbir_uint32) numer_estimate;
+        *denom = (stbir_uint32) denom_estimate;
+        return 1;
+      }
+    }
+
+    // no more refinement bits left? break out and do full range estimate
+    if ( bot == 0 )
+      break;
+
+    // gcd the estimate bits
+    est = top / bot;
+    temp = top % bot;
+    top = bot;
+    bot = temp;
+
+    // move remainders
+    temp = est * denom_estimate + denom_last; 
+    denom_last = denom_estimate; 
+    denom_estimate = temp;
+
+    // move remainders
+    temp = est * numer_estimate + numer_last; 
+    numer_last = numer_estimate; 
+    numer_estimate = temp;
+  }
+
+  // we didn't fine anything good enough for float, use a full range estimate
+  if ( limit_denom )
+  {
+    numer_estimate= (stbir_uint64)( f * (double)limit + 0.5 );
+    denom_estimate = limit;
+  }
+  else
+  {
+    numer_estimate = limit;
+    denom_estimate = (stbir_uint64)( ( (double)limit / f ) + 0.5 );
+  }
+
+  *numer = (stbir_uint32) numer_estimate;
+  *denom = (stbir_uint32) denom_estimate;
+
+  err = ( denom_estimate ) ? ( ( (double)(stbir_uint32)numer_estimate / (double)(stbir_uint32)denom_estimate ) - f ) : 1.0;
+  if ( err < 0.0 ) err = -err;
+  return ( err < ( 1.0 / (double)(1<<24) ) ) ? 1 : 0;
+}
+
+static int stbir__calculate_region_transform( stbir__scale_info * scale_info, int output_full_range, int * output_offset, int output_sub_range, int input_full_range, double input_s0, double input_s1 )
+{
+  double output_range, input_range, output_s, input_s, ratio, scale;
+
+  input_s = input_s1 - input_s0;
+
+  // null area
+  if ( ( output_full_range == 0 ) || ( input_full_range == 0 ) ||
+       ( output_sub_range == 0 ) || ( input_s <= stbir__small_float ) )
+    return 0;
+
+  // are either of the ranges completely out of bounds?
+  if ( ( *output_offset >= output_full_range ) || ( ( *output_offset + output_sub_range ) <= 0 ) || ( input_s0 >= (1.0f-stbir__small_float) ) || ( input_s1 <= stbir__small_float ) )
+    return 0;
+
+  output_range = (double)output_full_range;
+  input_range = (double)input_full_range;
+
+  output_s = ( (double)output_sub_range) / output_range;
+
+  // figure out the scaling to use 
+  ratio = output_s / input_s; 
+
+  // save scale before clipping
+  scale = ( output_range / input_range ) * ratio; 
+  scale_info->scale = (float)scale;
+  scale_info->inv_scale = (float)( 1.0 / scale );
+
+  // clip output area to left/right output edges (and adjust input area)
+  stbir__clip( output_offset, &output_sub_range, output_full_range, &input_s0, &input_s1 );
+
+  // recalc input area
+  input_s = input_s1 - input_s0;
+
+  // after clipping do we have zero input area?
+  if ( input_s <= stbir__small_float ) 
+    return 0;
+
+  // calculate and store the starting source offsets in output pixel space 
+  scale_info->pixel_shift = (float) ( input_s0 * ratio * output_range ); 
+
+  scale_info->scale_is_rational = stbir__double_to_rational( scale, ( scale <= 1.0 ) ? output_full_range : input_full_range, &scale_info->scale_numerator, &scale_info->scale_denominator, ( scale >= 1.0 ) );
+
+  scale_info->input_full_size = input_full_range;
+  scale_info->output_sub_size = output_sub_range;
+
+  return 1;
+}
+
+
+static void stbir__init_and_set_layout( STBIR_RESIZE * resize, stbir_pixel_layout pixel_layout, stbir_datatype data_type )
+{
+  resize->input_cb = 0;
+  resize->output_cb = 0;
+  resize->user_data = resize;
+  resize->samplers = 0;
+  resize->needs_rebuild = 1;
+  resize->called_alloc = 0;
+  resize->horizontal_filter = STBIR_FILTER_DEFAULT;
+  resize->horizontal_filter_kernel = 0; resize->horizontal_filter_support = 0;
+  resize->vertical_filter = STBIR_FILTER_DEFAULT;
+  resize->vertical_filter_kernel = 0; resize->vertical_filter_support = 0;
+  resize->horizontal_edge = STBIR_EDGE_CLAMP;
+  resize->vertical_edge = STBIR_EDGE_CLAMP;
+  resize->input_s0 = 0; resize->input_t0 = 0; resize->input_s1 = 1; resize->input_t1 = 1;
+  resize->output_subx = 0; resize->output_suby = 0; resize->output_subw = resize->output_w; resize->output_subh = resize->output_h;
+  resize->input_data_type = data_type;
+  resize->output_data_type = data_type;
+  resize->input_pixel_layout_public = pixel_layout;
+  resize->output_pixel_layout_public = pixel_layout;
+}
+
+STBIRDEF void stbir_resize_init( STBIR_RESIZE * resize, 
+                                 const void *input_pixels,  int input_w,  int input_h, int input_stride_in_bytes, // stride can be zero
+                                       void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, // stride can be zero
+                                 stbir_pixel_layout pixel_layout, stbir_datatype data_type )
+{
+  resize->input_pixels = input_pixels;
+  resize->input_w = input_w;
+  resize->input_h = input_h;
+  resize->input_stride_in_bytes = input_stride_in_bytes;
+  resize->output_pixels = output_pixels;
+  resize->output_w = output_w;
+  resize->output_h = output_h;
+  resize->output_stride_in_bytes = output_stride_in_bytes;
+  resize->fast_alpha = 0;
+
+  stbir__init_and_set_layout( resize, pixel_layout, data_type );
+}
+
+// You can update parameters any time after resize_init
+STBIRDEF void stbir_set_datatypes( STBIR_RESIZE * resize, stbir_datatype input_type, stbir_datatype output_type )  // by default, datatype from resize_init
+{
+  resize->input_data_type = input_type;
+  resize->output_data_type = output_type;
+}
+
+STBIRDEF void stbir_set_pixel_callbacks( STBIR_RESIZE * resize, stbir_input_callback * input_cb, stbir_output_callback * output_cb )   // no callbacks by default
+{
+  resize->input_cb = input_cb;
+  resize->output_cb = output_cb;
+}
+
+STBIRDEF void stbir_set_user_data( STBIR_RESIZE * resize, void * user_data )                                     // pass back STBIR_RESIZE* by default
+{
+  resize->user_data = user_data;
+}
+
+STBIRDEF void stbir_set_buffer_ptrs( STBIR_RESIZE * resize, const void * input_pixels, int input_stride_in_bytes, void * output_pixels, int output_stride_in_bytes )
+{
+  resize->input_pixels = input_pixels;
+  resize->input_stride_in_bytes = input_stride_in_bytes;
+  resize->output_pixels = output_pixels;
+  resize->output_stride_in_bytes = output_stride_in_bytes;
+}
+
+
+STBIRDEF int stbir_set_edgemodes( STBIR_RESIZE * resize, stbir_edge horizontal_edge, stbir_edge vertical_edge )       // CLAMP by default
+{
+  resize->horizontal_edge = horizontal_edge;
+  resize->vertical_edge = vertical_edge;
+  resize->needs_rebuild = 1;
+  return 1;
+}
+
+STBIRDEF int stbir_set_filters( STBIR_RESIZE * resize, stbir_filter horizontal_filter, stbir_filter vertical_filter ) // STBIR_DEFAULT_FILTER_UPSAMPLE/DOWNSAMPLE by default
+{
+  resize->horizontal_filter = horizontal_filter;
+  resize->vertical_filter = vertical_filter;
+  resize->needs_rebuild = 1;
+  return 1;
+}
+
+STBIRDEF int stbir_set_filter_callbacks( STBIR_RESIZE * resize, stbir__kernel_callback * horizontal_filter, stbir__support_callback * horizontal_support, stbir__kernel_callback * vertical_filter, stbir__support_callback * vertical_support )
+{
+  resize->horizontal_filter_kernel = horizontal_filter; resize->horizontal_filter_support = horizontal_support;
+  resize->vertical_filter_kernel = vertical_filter; resize->vertical_filter_support = vertical_support;
+  resize->needs_rebuild = 1;
+  return 1;
+}
+
+STBIRDEF int stbir_set_pixel_layouts( STBIR_RESIZE * resize, stbir_pixel_layout input_pixel_layout, stbir_pixel_layout output_pixel_layout )   // sets new pixel layouts
+{
+  resize->input_pixel_layout_public = input_pixel_layout;
+  resize->output_pixel_layout_public = output_pixel_layout;
+  resize->needs_rebuild = 1;
+  return 1;
+}
+
+
+STBIRDEF int stbir_set_non_pm_alpha_speed_over_quality( STBIR_RESIZE * resize, int non_pma_alpha_speed_over_quality )   // sets alpha speed
+{
+  resize->fast_alpha = non_pma_alpha_speed_over_quality;
+  resize->needs_rebuild = 1;
+  return 1;
+}
+
+STBIRDEF int stbir_set_input_subrect( STBIR_RESIZE * resize, double s0, double t0, double s1, double t1 )                 // sets input region (full region by default)
+{
+  resize->input_s0 = s0;
+  resize->input_t0 = t0;
+  resize->input_s1 = s1;
+  resize->input_t1 = t1;
+  resize->needs_rebuild = 1;
+
+  // are we inbounds?
+  if ( ( s1 < stbir__small_float ) || ( (s1-s0) < stbir__small_float ) ||
+       ( t1 < stbir__small_float ) || ( (t1-t0) < stbir__small_float ) ||
+       ( s0 > (1.0f-stbir__small_float) ) ||
+       ( t0 > (1.0f-stbir__small_float) ) )
+    return 0;
+
+  return 1;
+}
+
+STBIRDEF int stbir_set_output_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh )          // sets input region (full region by default)
+{
+  resize->output_subx = subx;
+  resize->output_suby = suby;
+  resize->output_subw = subw;
+  resize->output_subh = subh;
+  resize->needs_rebuild = 1;
+
+  // are we inbounds?
+  if ( ( subx >= resize->output_w ) || ( ( subx + subw ) <= 0 ) || ( suby >= resize->output_h ) || ( ( suby + subh ) <= 0 ) || ( subw == 0 ) || ( subh == 0 ) )
+    return 0;
+
+  return 1;
+}
+
+STBIRDEF int stbir_set_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh )                 // sets both regions (full regions by default)
+{
+  double s0, t0, s1, t1;
+
+  s0 = ( (double)subx ) / ( (double)resize->output_w );
+  t0 = ( (double)suby ) / ( (double)resize->output_h );
+  s1 = ( (double)(subx+subw) ) / ( (double)resize->output_w );
+  t1 = ( (double)(suby+subh) ) / ( (double)resize->output_h );
+
+  resize->input_s0 = s0;
+  resize->input_t0 = t0;
+  resize->input_s1 = s1;
+  resize->input_t1 = t1;
+  resize->output_subx = subx;
+  resize->output_suby = suby;
+  resize->output_subw = subw;
+  resize->output_subh = subh;
+  resize->needs_rebuild = 1;
+
+  // are we inbounds?
+  if ( ( subx >= resize->output_w ) || ( ( subx + subw ) <= 0 ) || ( suby >= resize->output_h ) || ( ( suby + subh ) <= 0 ) || ( subw == 0 ) || ( subh == 0 ) )
+    return 0;
+
+  return 1;
+}
+
+static int stbir__perform_build( STBIR_RESIZE * resize, int splits ) 
+{
+  stbir__contributors conservative = { 0, 0 };
+  stbir__sampler horizontal, vertical;
+  int new_output_subx, new_output_suby;
+  stbir__info * out_info;
+  #ifdef STBIR_PROFILE
+  stbir__info profile_infod;  // used to contain building profile info before everything is allocated
+  stbir__info * profile_info = &profile_infod;
+  #endif
+
+  // have we already built the samplers?
+  if ( resize->samplers )
+    return 0;
+  
+  #define STBIR_RETURN_ERROR_AND_ASSERT( exp )  STBIR_ASSERT( !(exp) ); if (exp) return 0;
+  STBIR_RETURN_ERROR_AND_ASSERT( (unsigned)resize->horizontal_filter >= STBIR_FILTER_OTHER)
+  STBIR_RETURN_ERROR_AND_ASSERT( (unsigned)resize->vertical_filter >= STBIR_FILTER_OTHER)
+  #undef STBIR_RETURN_ERROR_AND_ASSERT
+
+  if ( splits <= 0 ) 
+    return 0;
+
+  STBIR_PROFILE_BUILD_FIRST_START( build );
+
+  new_output_subx = resize->output_subx;
+  new_output_suby = resize->output_suby;
+
+  // do horizontal clip and scale calcs
+  if ( !stbir__calculate_region_transform( &horizontal.scale_info, resize->output_w, &new_output_subx, resize->output_subw, resize->input_w, resize->input_s0, resize->input_s1 ) )
+    return 0;
+
+  // do vertical clip and scale calcs
+  if ( !stbir__calculate_region_transform( &vertical.scale_info, resize->output_h, &new_output_suby, resize->output_subh, resize->input_h, resize->input_t0, resize->input_t1 ) )
+    return 0;
+
+  // if nothing to do, just return
+  if ( ( horizontal.scale_info.output_sub_size == 0 ) || ( vertical.scale_info.output_sub_size == 0 ) )
+    return 0;
+
+  stbir__set_sampler(&horizontal, resize->horizontal_filter, resize->horizontal_filter_kernel, resize->horizontal_filter_support, resize->horizontal_edge, &horizontal.scale_info, 1, resize->user_data );
+  stbir__get_conservative_extents( &horizontal, &conservative, resize->user_data );
+  stbir__set_sampler(&vertical, resize->vertical_filter, resize->horizontal_filter_kernel, resize->vertical_filter_support, resize->vertical_edge, &vertical.scale_info, 0, resize->user_data );
+
+  if ( ( vertical.scale_info.output_sub_size / splits ) < 4 ) // each split should be a minimum of 4 scanlines (handwavey choice)
+  {
+    splits = vertical.scale_info.output_sub_size / 4;
+    if ( splits == 0 ) splits = 1;
+  }
+
+  STBIR_PROFILE_BUILD_START( alloc );
+  out_info = stbir__alloc_internal_mem_and_build_samplers( &horizontal, &vertical, &conservative, resize->input_pixel_layout_public, resize->output_pixel_layout_public, splits, new_output_subx, new_output_suby, resize->fast_alpha, resize->user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO );
+  STBIR_PROFILE_BUILD_END( alloc );
+  STBIR_PROFILE_BUILD_END( build );
+ 
+  if ( out_info )
+  {
+    resize->splits = splits;
+    resize->samplers = out_info;
+    resize->needs_rebuild = 0;
+    #ifdef STBIR_PROFILE
+      STBIR_MEMCPY( &out_info->profile, &profile_infod.profile, sizeof( out_info->profile ) );
+    #endif
+    return splits;
+  }
+
+  return 0;
+}
+
+void stbir_free_samplers( STBIR_RESIZE * resize )
+{
+  if ( resize->samplers )
+  {
+    stbir__free_internal_mem( resize->samplers );
+    resize->samplers = 0;
+    resize->called_alloc = 0;
+  }
+}
+
+STBIRDEF int stbir_build_samplers_with_splits( STBIR_RESIZE * resize, int splits )
+{
+  if ( ( resize->samplers == 0 ) || ( resize->needs_rebuild ) )
+  {
+    if ( resize->samplers )
+      stbir_free_samplers( resize );
+
+    resize->called_alloc = 1;
+    return stbir__perform_build( resize, splits );
+  }
+
+  STBIR_PROFILE_BUILD_CLEAR( resize->samplers );
+  
+  return 1;
+}
+
+STBIRDEF int stbir_build_samplers( STBIR_RESIZE * resize )
+{
+  return stbir_build_samplers_with_splits( resize, 1 );
+}
+
+STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize )
+{
+  int result;
+  
+  if ( ( resize->samplers == 0 ) || ( resize->needs_rebuild ) )
+  {
+    int alloc_state = resize->called_alloc;  // remember allocated state
+
+    if ( resize->samplers )
+    {
+      stbir__free_internal_mem( resize->samplers );
+      resize->samplers = 0;
+    }
+
+    if ( !stbir_build_samplers( resize ) )
+      return 0;
+    
+    resize->called_alloc = alloc_state;
+
+    // if build_samplers succeeded (above), but there are no samplers set, then 
+    //   the area to stretch into was zero pixels, so don't do anything and return
+    //   success
+    if ( resize->samplers == 0 )
+      return 1;
+  }
+  else
+  {
+    // didn't build anything - clear it
+    STBIR_PROFILE_BUILD_CLEAR( resize->samplers );
+  }
+
+
+  // update anything that can be changed without recalcing samplers
+  stbir__update_info_from_resize( resize->samplers, resize );
+
+  // do resize
+  result = stbir__perform_resize( resize->samplers, 0, resize->splits );
+
+  // if we alloced, then free
+  if ( !resize->called_alloc )
+  {
+    stbir_free_samplers( resize );
+    resize->samplers = 0;
+  } 
+
+  return result;
+}
+
+STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start, int split_count )
+{
+  STBIR_ASSERT( resize->samplers );
+
+  // if we're just doing the whole thing, call full
+  if ( ( split_start == -1 ) || ( ( split_start == 0 ) && ( split_count == resize->splits ) ) )
+    return stbir_resize_extended( resize );
+
+  // you **must** build samplers first when using split resize
+  if ( ( resize->samplers == 0 ) || ( resize->needs_rebuild ) )
+    return 0; 
+    
+  if ( ( split_start >= resize->splits ) || ( split_start < 0 ) || ( ( split_start + split_count ) > resize->splits ) || ( split_count <= 0 ) )
+    return 0;
+  
+  // update anything that can be changed without recalcing samplers
+  stbir__update_info_from_resize( resize->samplers, resize );
+ 
+  // do resize
+  return stbir__perform_resize( resize->samplers, split_start, split_count );
+}
+
+static int stbir__check_output_stuff( void ** ret_ptr, int * ret_pitch, void * output_pixels, int type_size, int output_w, int output_h, int output_stride_in_bytes, stbir_internal_pixel_layout pixel_layout )
+{
+  size_t size;
+  int pitch;
+  void * ptr;
+
+  pitch = output_w * type_size * stbir__pixel_channels[ pixel_layout ];
+  if ( pitch == 0 )
+    return 0;
+
+  if ( output_stride_in_bytes == 0 )
+    output_stride_in_bytes = pitch;
+
+  if ( output_stride_in_bytes < pitch )
+    return 0;
+
+  size = output_stride_in_bytes * output_h;
+  if ( size == 0 )
+    return 0;
+
+  *ret_ptr = 0;
+  *ret_pitch = output_stride_in_bytes;
+
+  if ( output_pixels == 0 )
+  {
+    ptr = STBIR_MALLOC( size, 0 );
+    if ( ptr == 0 )
+      return 0;
+
+    *ret_ptr = ptr;
+    *ret_pitch = pitch;
+  }
+
+  return 1;  
+}
+
+
+STBIRDEF unsigned char * stbir_resize_uint8_linear( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
+                                                          unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                                          stbir_pixel_layout pixel_layout )
+{
+  STBIR_RESIZE resize;
+  unsigned char * optr;
+  int opitch;
+
+  if ( !stbir__check_output_stuff( (void**)&optr, &opitch, output_pixels, sizeof( unsigned char ), output_w, output_h, output_stride_in_bytes, stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ) )
+    return 0;
+
+  stbir_resize_init( &resize, 
+                     input_pixels,  input_w,  input_h,  input_stride_in_bytes, 
+                     (optr) ? optr : output_pixels, output_w, output_h, opitch, 
+                     pixel_layout, STBIR_TYPE_UINT8 );
+
+  if ( !stbir_resize_extended( &resize ) )
+  {
+    if ( optr )
+      STBIR_FREE( optr, 0 );
+    return 0;
+  }
+
+  return (optr) ? optr : output_pixels;
+}
+
+STBIRDEF unsigned char * stbir_resize_uint8_srgb( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
+                                                        unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                                        stbir_pixel_layout pixel_layout )
+{
+  STBIR_RESIZE resize;
+  unsigned char * optr;
+  int opitch;
+
+  if ( !stbir__check_output_stuff( (void**)&optr, &opitch, output_pixels, sizeof( unsigned char ), output_w, output_h, output_stride_in_bytes, stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ) )
+    return 0;
+
+  stbir_resize_init( &resize, 
+                     input_pixels,  input_w,  input_h,  input_stride_in_bytes, 
+                     (optr) ? optr : output_pixels, output_w, output_h, opitch, 
+                     pixel_layout, STBIR_TYPE_UINT8_SRGB );
+
+  if ( !stbir_resize_extended( &resize ) )
+  {
+    if ( optr )
+      STBIR_FREE( optr, 0 );
+    return 0;
+  }
+
+  return (optr) ? optr : output_pixels;
+}
+
+
+STBIRDEF float * stbir_resize_float_linear( const float *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
+                                                  float *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                                  stbir_pixel_layout pixel_layout )
+{
+  STBIR_RESIZE resize;
+  float * optr;
+  int opitch;
+
+  if ( !stbir__check_output_stuff( (void**)&optr, &opitch, output_pixels, sizeof( float ), output_w, output_h, output_stride_in_bytes, stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ) )
+    return 0;
+
+  stbir_resize_init( &resize, 
+                     input_pixels,  input_w,  input_h,  input_stride_in_bytes, 
+                     (optr) ? optr : output_pixels, output_w, output_h, opitch, 
+                     pixel_layout, STBIR_TYPE_FLOAT );
+
+  if ( !stbir_resize_extended( &resize ) )
+  {
+    if ( optr )
+      STBIR_FREE( optr, 0 );
+    return 0;
+  }
+
+  return (optr) ? optr : output_pixels;
+}
+
+
+STBIRDEF void * stbir_resize( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
+                                    void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                              stbir_pixel_layout pixel_layout, stbir_datatype data_type, 
+                              stbir_edge edge, stbir_filter filter )
+{
+  STBIR_RESIZE resize;
+  float * optr;
+  int opitch;
+
+  if ( !stbir__check_output_stuff( (void**)&optr, &opitch, output_pixels, stbir__type_size[data_type], output_w, output_h, output_stride_in_bytes, stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ) )
+    return 0;
+
+  stbir_resize_init( &resize, 
+                     input_pixels,  input_w,  input_h,  input_stride_in_bytes, 
+                     (optr) ? optr : output_pixels, output_w, output_h, output_stride_in_bytes, 
+                     pixel_layout, data_type );
+
+  resize.horizontal_edge = edge;
+  resize.vertical_edge = edge;
+  resize.horizontal_filter = filter;
+  resize.vertical_filter = filter;
+
+  if ( !stbir_resize_extended( &resize ) )
+  {
+    if ( optr )
+      STBIR_FREE( optr, 0 );
+    return 0;
+  }
+
+  return (optr) ? optr : output_pixels;
+}
+
+#ifdef STBIR_PROFILE
+
+STBIRDEF void stbir_resize_build_profile_info( STBIR_PROFILE_INFO * info, STBIR_RESIZE const * resize )
+{
+  static char const * bdescriptions[6] = { "Building", "Allocating", "Horizontal sampler", "Vertical sampler", "Coefficient cleanup", "Coefficient piovot" } ;
+  stbir__info* samp = resize->samplers;
+  int i;
+
+  typedef int testa[ (STBIR__ARRAY_SIZE( bdescriptions ) == (STBIR__ARRAY_SIZE( samp->profile.array )-1) )?1:-1];
+  typedef int testb[ (sizeof( samp->profile.array ) == (sizeof(samp->profile.named)) )?1:-1];
+  typedef int testc[ (sizeof( info->clocks ) >= (sizeof(samp->profile.named)) )?1:-1];
+
+  for( i = 0 ; i < STBIR__ARRAY_SIZE( bdescriptions ) ; i++)
+    info->clocks[i] = samp->profile.array[i+1];
+
+  info->total_clocks = samp->profile.named.total;
+  info->descriptions = bdescriptions;
+  info->count = STBIR__ARRAY_SIZE( bdescriptions );
+}
+
+STBIRDEF void stbir_resize_split_profile_info( STBIR_PROFILE_INFO * info, STBIR_RESIZE const * resize, int split_start, int split_count )
+{
+  static char const * descriptions[7] = { "Looping", "Vertical sampling", "Horizontal sampling", "Scanline input", "Scanline output", "Alpha weighting", "Alpha unweighting" };
+  stbir__per_split_info * split_info;
+  int s, i;
+
+  typedef int testa[ (STBIR__ARRAY_SIZE( descriptions ) == (STBIR__ARRAY_SIZE( split_info->profile.array )-1) )?1:-1];
+  typedef int testb[ (sizeof( split_info->profile.array ) == (sizeof(split_info->profile.named)) )?1:-1];
+  typedef int testc[ (sizeof( info->clocks ) >= (sizeof(split_info->profile.named)) )?1:-1];
+
+  if ( split_start == -1 )
+  {
+    split_start = 0;
+    split_count = resize->samplers->splits;
+  }
+
+  if ( ( split_start >= resize->splits ) || ( split_start < 0 ) || ( ( split_start + split_count ) > resize->splits ) || ( split_count <= 0 ) )
+  {
+    info->total_clocks = 0;
+    info->descriptions = 0;
+    info->count = 0;
+    return;
+  }
+
+  split_info = resize->samplers->split_info + split_start;
+
+  // sum up the profile from all the splits
+  for( i = 0 ; i < STBIR__ARRAY_SIZE( descriptions ) ; i++ )
+  {
+    stbir_uint64 sum = 0;
+    for( s = 0 ; s < split_count ; s++ )
+      sum += split_info[s].profile.array[i+1];
+    info->clocks[i] = sum;
+  }
+
+  info->total_clocks = split_info->profile.named.total;
+  info->descriptions = descriptions;
+  info->count = STBIR__ARRAY_SIZE( descriptions );
+}
+
+STBIRDEF void stbir_resize_extended_profile_info( STBIR_PROFILE_INFO * info, STBIR_RESIZE const * resize )
+{
+  stbir_resize_split_profile_info( info, resize, -1, 0 );
+}
+
+#endif // STBIR_PROFILE
+
+#undef STBIR_BGR
+#undef STBIR_1CHANNEL
+#undef STBIR_2CHANNEL
+#undef STBIR_RGB
+#undef STBIR_RGBA
+#undef STBIR_4CHANNEL
+#undef STBIR_BGRA
+#undef STBIR_ARGB
+#undef STBIR_ABGR
+#undef STBIR_RA
+#undef STBIR_AR
+#undef STBIR_RGBA_PM
+#undef STBIR_BGRA_PM
+#undef STBIR_ARGB_PM
+#undef STBIR_ABGR_PM
+#undef STBIR_RA_PM
+#undef STBIR_AR_PM
+
+#endif // STB_IMAGE_RESIZE_IMPLEMENTATION
+
+#else  // STB_IMAGE_RESIZE_HORIZONTALS&STB_IMAGE_RESIZE_DO_VERTICALS
+
+// we reinclude the header file to define all the horizontal functions
+//   specializing each function for the number of coeffs is 20-40% faster *OVERALL* 
+
+// by including the header file again this way, we can still debug the functions
+
+#define STBIR_strs_join2( start, mid, end ) start##mid##end
+#define STBIR_strs_join1( start, mid, end ) STBIR_strs_join2( start, mid, end )
+
+#define STBIR_strs_join24( start, mid1, mid2, end ) start##mid1##mid2##end
+#define STBIR_strs_join14( start, mid1, mid2, end ) STBIR_strs_join24( start, mid1, mid2, end )
+
+#ifdef STB_IMAGE_RESIZE_DO_CODERS
+
+#ifdef stbir__decode_suffix
+#define STBIR__CODER_NAME( name ) STBIR_strs_join1( name, _, stbir__decode_suffix )
+#else
+#define STBIR__CODER_NAME( name ) name
+#endif
+
+#ifdef stbir__decode_swizzle
+#define stbir__decode_simdf8_flip(reg) STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( stbir__simdf8_0123to,stbir__decode_order0,stbir__decode_order1),stbir__decode_order2,stbir__decode_order3),stbir__decode_order0,stbir__decode_order1),stbir__decode_order2,stbir__decode_order3)(reg, reg)
+#define stbir__decode_simdf4_flip(reg) STBIR_strs_join1( STBIR_strs_join1( stbir__simdf_0123to,stbir__decode_order0,stbir__decode_order1),stbir__decode_order2,stbir__decode_order3)(reg, reg)
+#define stbir__encode_simdf8_unflip(reg) STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( stbir__simdf8_0123to,stbir__encode_order0,stbir__encode_order1),stbir__encode_order2,stbir__encode_order3),stbir__encode_order0,stbir__encode_order1),stbir__encode_order2,stbir__encode_order3)(reg, reg)
+#define stbir__encode_simdf4_unflip(reg) STBIR_strs_join1( STBIR_strs_join1( stbir__simdf_0123to,stbir__encode_order0,stbir__encode_order1),stbir__encode_order2,stbir__encode_order3)(reg, reg)
+#else
+#define stbir__decode_order0 0
+#define stbir__decode_order1 1
+#define stbir__decode_order2 2
+#define stbir__decode_order3 3
+#define stbir__encode_order0 0
+#define stbir__encode_order1 1
+#define stbir__encode_order2 2
+#define stbir__encode_order3 3
+#define stbir__decode_simdf8_flip(reg)
+#define stbir__decode_simdf4_flip(reg) 
+#define stbir__encode_simdf8_unflip(reg)
+#define stbir__encode_simdf4_unflip(reg) 
+#endif
+
+#ifdef STBIR_SIMD8
+#define stbir__encode_simdfX_unflip  stbir__encode_simdf8_unflip
+#else
+#define stbir__encode_simdfX_unflip  stbir__encode_simdf4_unflip
+#endif 
+
+static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * decodep, int width_times_channels, void const * inputp )
+{
+  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
+  float * decode_end = (float*) decode + width_times_channels;
+  unsigned char const * input = (unsigned char const*)inputp;
+
+  #ifdef STBIR_SIMD
+  unsigned char const * end_input_m16 = input + width_times_channels - 16;
+  if ( width_times_channels >= 16 )
+  {
+    decode_end -= 16;
+    for(;;)
+    {
+      #ifdef STBIR_SIMD8
+      stbir__simdi i; stbir__simdi8 o0,o1;
+      stbir__simdf8 of0, of1;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdi_load( i, input );
+      stbir__simdi8_expand_u8_to_u32( o0, o1, i );
+      stbir__simdi8_convert_i32_to_float( of0, o0 );
+      stbir__simdi8_convert_i32_to_float( of1, o1 );
+      stbir__simdf8_mult( of0, of0, STBIR_max_uint8_as_float_inverted8);
+      stbir__simdf8_mult( of1, of1, STBIR_max_uint8_as_float_inverted8);
+      stbir__decode_simdf8_flip( of0 );
+      stbir__decode_simdf8_flip( of1 );
+      stbir__simdf8_store( decode + 0, of0 );
+      stbir__simdf8_store( decode + 8, of1 );
+      #else
+      stbir__simdi i, o0, o1, o2, o3;
+      stbir__simdf of0, of1, of2, of3;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdi_load( i, input );
+      stbir__simdi_expand_u8_to_u32( o0,o1,o2,o3,i);
+      stbir__simdi_convert_i32_to_float( of0, o0 );
+      stbir__simdi_convert_i32_to_float( of1, o1 );
+      stbir__simdi_convert_i32_to_float( of2, o2 );
+      stbir__simdi_convert_i32_to_float( of3, o3 );
+      stbir__simdf_mult( of0, of0, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) );
+      stbir__simdf_mult( of1, of1, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) );
+      stbir__simdf_mult( of2, of2, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) );
+      stbir__simdf_mult( of3, of3, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) );
+      stbir__decode_simdf4_flip( of0 );
+      stbir__decode_simdf4_flip( of1 );
+      stbir__decode_simdf4_flip( of2 );
+      stbir__decode_simdf4_flip( of3 );
+      stbir__simdf_store( decode + 0,  of0 );
+      stbir__simdf_store( decode + 4,  of1 );
+      stbir__simdf_store( decode + 8,  of2 );
+      stbir__simdf_store( decode + 12, of3 );
+      #endif
+      decode += 16;
+      input += 16;
+      if ( decode <= decode_end ) 
+        continue;
+      if ( decode == ( decode_end + 16 ) )
+        break;
+      decode = decode_end; // backup and do last couple
+      input = end_input_m16;
+    }
+    return;
+  }
+  #endif
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  decode += 4;
+  while( decode <= decode_end )
+  {
+    STBIR_SIMD_NO_UNROLL(decode);
+    decode[0-4] = ((float)(input[stbir__decode_order0])) * stbir__max_uint8_as_float_inverted;
+    decode[1-4] = ((float)(input[stbir__decode_order1])) * stbir__max_uint8_as_float_inverted;
+    decode[2-4] = ((float)(input[stbir__decode_order2])) * stbir__max_uint8_as_float_inverted;
+    decode[3-4] = ((float)(input[stbir__decode_order3])) * stbir__max_uint8_as_float_inverted;
+    decode += 4;
+    input += 4;
+  }
+  decode -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( decode < decode_end )
+  {
+    STBIR_NO_UNROLL(decode);
+    decode[0] = ((float)(input[stbir__decode_order0])) * stbir__max_uint8_as_float_inverted;
+    #if stbir__coder_min_num >= 2
+    decode[1] = ((float)(input[stbir__decode_order1])) * stbir__max_uint8_as_float_inverted;
+    #endif
+    #if stbir__coder_min_num >= 3
+    decode[2] = ((float)(input[stbir__decode_order2])) * stbir__max_uint8_as_float_inverted;
+    #endif
+    decode += stbir__coder_min_num;
+    input += stbir__coder_min_num;
+  }
+  #endif
+}
+
+static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outputp, int width_times_channels, float const * encode )
+{
+  unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char *) outputp;
+  unsigned char * end_output = ( (unsigned char *) output ) + width_times_channels;
+
+  #ifdef STBIR_SIMD
+  if ( width_times_channels >= stbir__simdfX_float_count*2 )
+  {
+    float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
+    end_output -= stbir__simdfX_float_count*2;
+    for(;;)
+    {
+      stbir__simdfX e0, e1;
+      stbir__simdi i;
+      STBIR_SIMD_NO_UNROLL(encode);
+      stbir__simdfX_madd_mem( e0, STBIR_simd_point5X, STBIR_max_uint8_as_floatX, encode );
+      stbir__simdfX_madd_mem( e1, STBIR_simd_point5X, STBIR_max_uint8_as_floatX, encode+stbir__simdfX_float_count );
+      stbir__encode_simdfX_unflip( e0 );
+      stbir__encode_simdfX_unflip( e1 );
+      #ifdef STBIR_SIMD8
+      stbir__simdf8_pack_to_16bytes( i, e0, e1 ); 
+      stbir__simdi_store( output, i );
+      #else
+      stbir__simdf_pack_to_8bytes( i, e0, e1 ); 
+      stbir__simdi_store2( output, i );
+      #endif
+      encode += stbir__simdfX_float_count*2;
+      output += stbir__simdfX_float_count*2;
+      if ( output <= end_output ) 
+        continue;
+      if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
+        break;
+      output = end_output; // backup and do last couple
+      encode = end_encode_m8;
+    }
+    return;
+  }
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while( output <= end_output )
+  {
+    stbir__simdf e0;
+    stbir__simdi i0;
+    STBIR_NO_UNROLL(encode);
+    stbir__simdf_load( e0, encode );
+    stbir__simdf_madd( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), e0 );
+    stbir__encode_simdf4_unflip( e0 );
+    stbir__simdf_pack_to_8bytes( i0, e0, e0 );  // only use first 4
+    *(int*)(output-4) = stbir__simdi_to_int( i0 );
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( output < end_output )
+  {
+    stbir__simdf e0; 
+    STBIR_NO_UNROLL(encode);
+    stbir__simdf_madd1_mem( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), encode+stbir__encode_order0 ); output[0] = stbir__simdf_convert_float_to_uint8( e0 );
+    #if stbir__coder_min_num >= 2
+    stbir__simdf_madd1_mem( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), encode+stbir__encode_order1 ); output[1] = stbir__simdf_convert_float_to_uint8( e0 );
+    #endif
+    #if stbir__coder_min_num >= 3
+    stbir__simdf_madd1_mem( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), encode+stbir__encode_order2 ); output[2] = stbir__simdf_convert_float_to_uint8( e0 );
+    #endif
+    output += stbir__coder_min_num;
+    encode += stbir__coder_min_num;
+  }
+  #endif
+  
+  #else
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while( output <= end_output )
+  {
+    float f;
+    f = encode[stbir__encode_order0] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[0-4] = (unsigned char)f;
+    f = encode[stbir__encode_order1] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[1-4] = (unsigned char)f;
+    f = encode[stbir__encode_order2] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[2-4] = (unsigned char)f;
+    f = encode[stbir__encode_order3] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[3-4] = (unsigned char)f;
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( output < end_output )
+  {
+    float f;
+    STBIR_NO_UNROLL(encode);
+    f = encode[stbir__encode_order0] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[0] = (unsigned char)f;
+    #if stbir__coder_min_num >= 2
+    f = encode[stbir__encode_order1] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[1] = (unsigned char)f;
+    #endif
+    #if stbir__coder_min_num >= 3
+    f = encode[stbir__encode_order2] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[2] = (unsigned char)f;
+    #endif
+    output += stbir__coder_min_num;
+    encode += stbir__coder_min_num;
+  }
+  #endif
+  #endif
+}
+
+static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int width_times_channels, void const * inputp )
+{
+  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
+  float * decode_end = (float*) decode + width_times_channels;
+  unsigned char const * input = (unsigned char const*)inputp;
+
+  #ifdef STBIR_SIMD
+  unsigned char const * end_input_m16 = input + width_times_channels - 16;
+  if ( width_times_channels >= 16 )
+  {
+    decode_end -= 16;
+    for(;;)
+    {
+      #ifdef STBIR_SIMD8
+      stbir__simdi i; stbir__simdi8 o0,o1;
+      stbir__simdf8 of0, of1;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdi_load( i, input );
+      stbir__simdi8_expand_u8_to_u32( o0, o1, i );
+      stbir__simdi8_convert_i32_to_float( of0, o0 );
+      stbir__simdi8_convert_i32_to_float( of1, o1 );
+      stbir__decode_simdf8_flip( of0 );
+      stbir__decode_simdf8_flip( of1 );
+      stbir__simdf8_store( decode + 0, of0 );
+      stbir__simdf8_store( decode + 8, of1 );
+      #else
+      stbir__simdi i, o0, o1, o2, o3;
+      stbir__simdf of0, of1, of2, of3;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdi_load( i, input );
+      stbir__simdi_expand_u8_to_u32( o0,o1,o2,o3,i);
+      stbir__simdi_convert_i32_to_float( of0, o0 );
+      stbir__simdi_convert_i32_to_float( of1, o1 );
+      stbir__simdi_convert_i32_to_float( of2, o2 );
+      stbir__simdi_convert_i32_to_float( of3, o3 );
+      stbir__decode_simdf4_flip( of0 );
+      stbir__decode_simdf4_flip( of1 );
+      stbir__decode_simdf4_flip( of2 );
+      stbir__decode_simdf4_flip( of3 );
+      stbir__simdf_store( decode + 0,  of0 );
+      stbir__simdf_store( decode + 4,  of1 );
+      stbir__simdf_store( decode + 8,  of2 );
+      stbir__simdf_store( decode + 12, of3 );
+#endif
+      decode += 16;
+      input += 16;
+      if ( decode <= decode_end ) 
+        continue;
+      if ( decode == ( decode_end + 16 ) )
+        break;
+      decode = decode_end; // backup and do last couple
+      input = end_input_m16;
+    }
+    return;
+  }
+  #endif
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  decode += 4;
+  while( decode <= decode_end )
+  {
+    STBIR_SIMD_NO_UNROLL(decode);
+    decode[0-4] = ((float)(input[stbir__decode_order0]));
+    decode[1-4] = ((float)(input[stbir__decode_order1]));
+    decode[2-4] = ((float)(input[stbir__decode_order2]));
+    decode[3-4] = ((float)(input[stbir__decode_order3]));
+    decode += 4;
+    input += 4;
+  }
+  decode -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( decode < decode_end )
+  {
+    STBIR_NO_UNROLL(decode);
+    decode[0] = ((float)(input[stbir__decode_order0]));
+    #if stbir__coder_min_num >= 2
+    decode[1] = ((float)(input[stbir__decode_order1]));
+    #endif
+    #if stbir__coder_min_num >= 3
+    decode[2] = ((float)(input[stbir__decode_order2]));
+    #endif
+    decode += stbir__coder_min_num;
+    input += stbir__coder_min_num;
+  }
+  #endif
+}
+
+static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int width_times_channels, float const * encode )
+{
+  unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char *) outputp;
+  unsigned char * end_output = ( (unsigned char *) output ) + width_times_channels;
+
+  #ifdef STBIR_SIMD
+  if ( width_times_channels >= stbir__simdfX_float_count*2 )
+  {
+    float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
+    end_output -= stbir__simdfX_float_count*2;
+    for(;;)
+    {
+      stbir__simdfX e0, e1;
+      stbir__simdi i;
+      STBIR_SIMD_NO_UNROLL(encode);
+      stbir__simdfX_add_mem( e0, STBIR_simd_point5X, encode );
+      stbir__simdfX_add_mem( e1, STBIR_simd_point5X, encode+stbir__simdfX_float_count );
+      stbir__encode_simdfX_unflip( e0 );
+      stbir__encode_simdfX_unflip( e1 );
+      #ifdef STBIR_SIMD8
+      stbir__simdf8_pack_to_16bytes( i, e0, e1 ); 
+      stbir__simdi_store( output, i );
+      #else
+      stbir__simdf_pack_to_8bytes( i, e0, e1 ); 
+      stbir__simdi_store2( output, i );
+      #endif
+      encode += stbir__simdfX_float_count*2;
+      output += stbir__simdfX_float_count*2;
+      if ( output <= end_output ) 
+        continue;
+      if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
+        break;
+      output = end_output; // backup and do last couple
+      encode = end_encode_m8;
+    }
+    return;
+  }
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while( output <= end_output )
+  {
+    stbir__simdf e0;
+    stbir__simdi i0;
+    STBIR_NO_UNROLL(encode);
+    stbir__simdf_load( e0, encode );
+    stbir__simdf_add( e0, STBIR__CONSTF(STBIR_simd_point5), e0 );
+    stbir__encode_simdf4_unflip( e0 );
+    stbir__simdf_pack_to_8bytes( i0, e0, e0 );  // only use first 4
+    *(int*)(output-4) = stbir__simdi_to_int( i0 );
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  #else
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while( output <= end_output )
+  {
+    float f;
+    f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 255); output[0-4] = (unsigned char)f;
+    f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 255); output[1-4] = (unsigned char)f;
+    f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 255); output[2-4] = (unsigned char)f;
+    f = encode[stbir__encode_order3] + 0.5f; STBIR_CLAMP(f, 0, 255); output[3-4] = (unsigned char)f;
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( output < end_output )
+  {
+    float f;
+    STBIR_NO_UNROLL(encode);
+    f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 255); output[0] = (unsigned char)f;
+    #if stbir__coder_min_num >= 2
+    f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 255); output[1] = (unsigned char)f;
+    #endif
+    #if stbir__coder_min_num >= 3
+    f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 255); output[2] = (unsigned char)f;
+    #endif
+    output += stbir__coder_min_num;
+    encode += stbir__coder_min_num;
+  }
+  #endif
+}
+
+static void STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int width_times_channels, void const * inputp )
+{
+  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
+  float const * decode_end = (float*) decode + width_times_channels;
+  unsigned char const * input = (unsigned char const *)inputp;
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  decode += 4;
+  while( decode <= decode_end )
+  {
+    decode[0-4] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order0 ] ];
+    decode[1-4] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order1 ] ];
+    decode[2-4] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order2 ] ];
+    decode[3-4] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order3 ] ];
+    decode += 4;
+    input += 4;
+  }
+  decode -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( decode < decode_end )
+  {
+    STBIR_NO_UNROLL(decode);
+    decode[0] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order0 ] ];
+    #if stbir__coder_min_num >= 2
+    decode[1] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order1 ] ];
+    #endif
+    #if stbir__coder_min_num >= 3
+    decode[2] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order2 ] ];
+    #endif
+    decode += stbir__coder_min_num;
+    input += stbir__coder_min_num;
+  }
+  #endif
+}
+
+#define stbir__min_max_shift20( i, f ) \
+    stbir__simdf_max( f, f, stbir_simdf_casti(STBIR__CONSTI( STBIR_almost_zero )) ); \
+    stbir__simdf_min( f, f, stbir_simdf_casti(STBIR__CONSTI( STBIR_almost_one  )) ); \
+    stbir__simdi_32shr( i, stbir_simdi_castf( f ), 20 ); 
+
+#define stbir__scale_and_convert( i, f ) \
+    stbir__simdf_madd( f, STBIR__CONSTF( STBIR_simd_point5 ), STBIR__CONSTF( STBIR_max_uint8_as_float ), f ); \
+    stbir__simdf_max( f, f, stbir__simdf_zeroP() ); \
+    stbir__simdf_min( f, f, STBIR__CONSTF( STBIR_max_uint8_as_float ) ); \
+    stbir__simdf_convert_float_to_i32( i, f );
+
+#define stbir__linear_to_srgb_finish( i, f ) \
+{ \
+    stbir__simdi temp;  \
+    stbir__simdi_32shr( temp, stbir_simdi_castf( f ), 12 ) ; \
+    stbir__simdi_and( temp, temp, STBIR__CONSTI(STBIR_mastissa_mask) ); \
+    stbir__simdi_or( temp, temp, STBIR__CONSTI(STBIR_topscale) ); \
+    stbir__simdi_16madd( i, i, temp ); \
+    stbir__simdi_32shr( i, i, 16 ); \
+}
+
+#define stbir__simdi_table_lookup2( v0,v1, table ) \
+{ \
+  stbir__simdi_u32 temp0,temp1; \
+  temp0.m128i_i128 = v0; \
+  temp1.m128i_i128 = v1; \
+  temp0.m128i_u32[0] = table[temp0.m128i_i32[0]]; temp0.m128i_u32[1] = table[temp0.m128i_i32[1]]; temp0.m128i_u32[2] = table[temp0.m128i_i32[2]]; temp0.m128i_u32[3] = table[temp0.m128i_i32[3]]; \
+  temp1.m128i_u32[0] = table[temp1.m128i_i32[0]]; temp1.m128i_u32[1] = table[temp1.m128i_i32[1]]; temp1.m128i_u32[2] = table[temp1.m128i_i32[2]]; temp1.m128i_u32[3] = table[temp1.m128i_i32[3]]; \
+  v0 = temp0.m128i_i128; \
+  v1 = temp1.m128i_i128; \
+} 
+
+#define stbir__simdi_table_lookup3( v0,v1,v2, table ) \
+{ \
+  stbir__simdi_u32 temp0,temp1,temp2; \
+  temp0.m128i_i128 = v0; \
+  temp1.m128i_i128 = v1; \
+  temp2.m128i_i128 = v2; \
+  temp0.m128i_u32[0] = table[temp0.m128i_i32[0]]; temp0.m128i_u32[1] = table[temp0.m128i_i32[1]]; temp0.m128i_u32[2] = table[temp0.m128i_i32[2]]; temp0.m128i_u32[3] = table[temp0.m128i_i32[3]]; \
+  temp1.m128i_u32[0] = table[temp1.m128i_i32[0]]; temp1.m128i_u32[1] = table[temp1.m128i_i32[1]]; temp1.m128i_u32[2] = table[temp1.m128i_i32[2]]; temp1.m128i_u32[3] = table[temp1.m128i_i32[3]]; \
+  temp2.m128i_u32[0] = table[temp2.m128i_i32[0]]; temp2.m128i_u32[1] = table[temp2.m128i_i32[1]]; temp2.m128i_u32[2] = table[temp2.m128i_i32[2]]; temp2.m128i_u32[3] = table[temp2.m128i_i32[3]]; \
+  v0 = temp0.m128i_i128; \
+  v1 = temp1.m128i_i128; \
+  v2 = temp2.m128i_i128; \
+}
+
+#define stbir__simdi_table_lookup4( v0,v1,v2,v3, table ) \
+{ \
+  stbir__simdi_u32 temp0,temp1,temp2,temp3; \
+  temp0.m128i_i128 = v0; \
+  temp1.m128i_i128 = v1; \
+  temp2.m128i_i128 = v2; \
+  temp3.m128i_i128 = v3; \
+  temp0.m128i_u32[0] = table[temp0.m128i_i32[0]]; temp0.m128i_u32[1] = table[temp0.m128i_i32[1]]; temp0.m128i_u32[2] = table[temp0.m128i_i32[2]]; temp0.m128i_u32[3] = table[temp0.m128i_i32[3]]; \
+  temp1.m128i_u32[0] = table[temp1.m128i_i32[0]]; temp1.m128i_u32[1] = table[temp1.m128i_i32[1]]; temp1.m128i_u32[2] = table[temp1.m128i_i32[2]]; temp1.m128i_u32[3] = table[temp1.m128i_i32[3]]; \
+  temp2.m128i_u32[0] = table[temp2.m128i_i32[0]]; temp2.m128i_u32[1] = table[temp2.m128i_i32[1]]; temp2.m128i_u32[2] = table[temp2.m128i_i32[2]]; temp2.m128i_u32[3] = table[temp2.m128i_i32[3]]; \
+  temp3.m128i_u32[0] = table[temp3.m128i_i32[0]]; temp3.m128i_u32[1] = table[temp3.m128i_i32[1]]; temp3.m128i_u32[2] = table[temp3.m128i_i32[2]]; temp3.m128i_u32[3] = table[temp3.m128i_i32[3]]; \
+  v0 = temp0.m128i_i128; \
+  v1 = temp1.m128i_i128; \
+  v2 = temp2.m128i_i128; \
+  v3 = temp3.m128i_i128; \
+} 
+
+static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int width_times_channels, float const * encode )
+{
+  unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char*) outputp;
+  unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;
+
+  #ifdef STBIR_SIMD
+  stbir_uint32 const * to_srgb = fp32_to_srgb8_tab4 - (127-13)*8;
+
+  if ( width_times_channels >= 16 )
+  {
+    float const * end_encode_m16 = encode + width_times_channels - 16;
+    end_output -= 16;
+    for(;;)
+    {
+      stbir__simdf f0, f1, f2, f3;
+      stbir__simdi i0, i1, i2, i3; 
+      STBIR_SIMD_NO_UNROLL(encode);
+
+      stbir__simdf_load4_transposed( f0, f1, f2, f3, encode );
+
+      stbir__min_max_shift20( i0, f0 );
+      stbir__min_max_shift20( i1, f1 );
+      stbir__min_max_shift20( i2, f2 );
+      stbir__min_max_shift20( i3, f3 );
+      
+      stbir__simdi_table_lookup4( i0, i1, i2, i3, to_srgb );
+     
+      stbir__linear_to_srgb_finish( i0, f0 );
+      stbir__linear_to_srgb_finish( i1, f1 );
+      stbir__linear_to_srgb_finish( i2, f2 );
+      stbir__linear_to_srgb_finish( i3, f3 );
+
+      stbir__interleave_pack_and_store_16_u8( output,  STBIR_strs_join1(i, ,stbir__encode_order0), STBIR_strs_join1(i, ,stbir__encode_order1), STBIR_strs_join1(i, ,stbir__encode_order2), STBIR_strs_join1(i, ,stbir__encode_order3) );
+
+      encode += 16;
+      output += 16;
+      if ( output <= end_output ) 
+        continue;
+      if ( output == ( end_output + 16 ) )
+        break;
+      output = end_output; // backup and do last couple
+      encode = end_encode_m16;
+    }
+    return;
+  }
+  #endif
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while ( output <= end_output )
+  {
+    STBIR_SIMD_NO_UNROLL(encode);
+
+    output[0-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order0] );
+    output[1-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order1] );
+    output[2-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order2] );
+    output[3-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order3] );
+
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( output < end_output ) 
+  {
+    STBIR_NO_UNROLL(encode);
+    output[0] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order0] );
+    #if stbir__coder_min_num >= 2
+    output[1] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order1] );
+    #endif
+    #if stbir__coder_min_num >= 3
+    output[2] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order2] );
+    #endif
+    output += stbir__coder_min_num;
+    encode += stbir__coder_min_num;
+  }
+  #endif
+}
+
+#if ( stbir__coder_min_num == 4 ) || ( ( stbir__coder_min_num == 1 ) && ( !defined(stbir__decode_swizzle) ) )
+
+static void STBIR__CODER_NAME(stbir__decode_uint8_srgb4_linearalpha)( float * decodep, int width_times_channels, void const * inputp )
+{
+  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
+  float const * decode_end = (float*) decode + width_times_channels;
+  unsigned char const * input = (unsigned char const *)inputp;
+  do {
+    decode[0] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0] ];
+    decode[1] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order1] ];
+    decode[2] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order2] ];
+    decode[3] = ( (float) input[stbir__decode_order3] ) * stbir__max_uint8_as_float_inverted;
+    input += 4;
+    decode += 4;
+  } while( decode < decode_end );
+}
+
+
+static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * outputp, int width_times_channels, float const * encode )
+{
+  unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char*) outputp;
+  unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;
+
+  #ifdef STBIR_SIMD
+  stbir_uint32 const * to_srgb = fp32_to_srgb8_tab4 - (127-13)*8;
+
+  if ( width_times_channels >= 16 )
+  {
+    float const * end_encode_m16 = encode + width_times_channels - 16;
+    end_output -= 16;
+    for(;;)
+    {
+      stbir__simdf f0, f1, f2, f3;
+      stbir__simdi i0, i1, i2, i3;
+
+      STBIR_SIMD_NO_UNROLL(encode);
+      stbir__simdf_load4_transposed( f0, f1, f2, f3, encode );
+
+      stbir__min_max_shift20( i0, f0 );
+      stbir__min_max_shift20( i1, f1 );
+      stbir__min_max_shift20( i2, f2 );
+      stbir__scale_and_convert( i3, f3 ); 
+      
+      stbir__simdi_table_lookup3( i0, i1, i2, to_srgb );
+     
+      stbir__linear_to_srgb_finish( i0, f0 );
+      stbir__linear_to_srgb_finish( i1, f1 );
+      stbir__linear_to_srgb_finish( i2, f2 );
+
+      stbir__interleave_pack_and_store_16_u8( output,  STBIR_strs_join1(i, ,stbir__encode_order0), STBIR_strs_join1(i, ,stbir__encode_order1), STBIR_strs_join1(i, ,stbir__encode_order2), STBIR_strs_join1(i, ,stbir__encode_order3) );
+
+      output += 16;
+      encode += 16;
+
+      if ( output <= end_output ) 
+        continue;
+      if ( output == ( end_output + 16 ) )
+        break;
+      output = end_output; // backup and do last couple
+      encode = end_encode_m16;
+    }
+    return;
+  }
+  #endif
+
+  do {
+    float f;
+    STBIR_SIMD_NO_UNROLL(encode);                                        
+
+    output[stbir__decode_order0] = stbir__linear_to_srgb_uchar( encode[0] );
+    output[stbir__decode_order1] = stbir__linear_to_srgb_uchar( encode[1] );
+    output[stbir__decode_order2] = stbir__linear_to_srgb_uchar( encode[2] );
+
+    f = encode[3] * stbir__max_uint8_as_float + 0.5f;
+    STBIR_CLAMP(f, 0, 255);
+    output[stbir__decode_order3] = (unsigned char) f;
+
+    output += 4;
+    encode += 4;
+  } while( output < end_output );
+}
+
+#endif
+
+#if ( stbir__coder_min_num == 2 ) || ( ( stbir__coder_min_num == 1 ) && ( !defined(stbir__decode_swizzle) ) )
+
+static void STBIR__CODER_NAME(stbir__decode_uint8_srgb2_linearalpha)( float * decodep, int width_times_channels, void const * inputp )
+{
+  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
+  float const * decode_end = (float*) decode + width_times_channels;
+  unsigned char const * input = (unsigned char const *)inputp;
+  decode += 4;
+  while( decode <= decode_end )
+  {
+    decode[0-4] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0] ];
+    decode[1-4] = ( (float) input[stbir__decode_order1] ) * stbir__max_uint8_as_float_inverted;
+    decode[2-4] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0+2] ];
+    decode[3-4] = ( (float) input[stbir__decode_order1+2] ) * stbir__max_uint8_as_float_inverted;
+    input += 4;
+    decode += 4;
+  }
+  decode -= 4;
+  if( decode < decode_end ) 
+  {
+    decode[0] = stbir__srgb_uchar_to_linear_float[ stbir__decode_order0 ];
+    decode[1] = ( (float) input[stbir__decode_order1] ) * stbir__max_uint8_as_float_inverted;
+  }
+}
+
+static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * outputp, int width_times_channels, float const * encode )
+{
+  unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char*) outputp;
+  unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;
+
+  #ifdef STBIR_SIMD
+  stbir_uint32 const * to_srgb = fp32_to_srgb8_tab4 - (127-13)*8;
+
+  if ( width_times_channels >= 16 )
+  {
+    float const * end_encode_m16 = encode + width_times_channels - 16;
+    end_output -= 16;
+    for(;;)
+    {
+      stbir__simdf f0, f1, f2, f3;
+      stbir__simdi i0, i1, i2, i3; 
+
+      STBIR_SIMD_NO_UNROLL(encode);
+      stbir__simdf_load4_transposed( f0, f1, f2, f3, encode );
+
+      stbir__min_max_shift20( i0, f0 );
+      stbir__scale_and_convert( i1, f1 );
+      stbir__min_max_shift20( i2, f2 );
+      stbir__scale_and_convert( i3, f3 );
+      
+      stbir__simdi_table_lookup2( i0, i2, to_srgb );
+     
+      stbir__linear_to_srgb_finish( i0, f0 );
+      stbir__linear_to_srgb_finish( i2, f2 );
+
+      stbir__interleave_pack_and_store_16_u8( output,  STBIR_strs_join1(i, ,stbir__encode_order0), STBIR_strs_join1(i, ,stbir__encode_order1), STBIR_strs_join1(i, ,stbir__encode_order2), STBIR_strs_join1(i, ,stbir__encode_order3) );
+
+      output += 16;
+      encode += 16;
+      if ( output <= end_output ) 
+        continue;
+      if ( output == ( end_output + 16 ) )
+        break;
+      output = end_output; // backup and do last couple
+      encode = end_encode_m16;
+    }
+    return;
+  }
+  #endif
+
+  do {
+    float f;
+    STBIR_SIMD_NO_UNROLL(encode);
+
+    output[stbir__decode_order0] = stbir__linear_to_srgb_uchar( encode[0] );
+
+    f = encode[1] * stbir__max_uint8_as_float + 0.5f;
+    STBIR_CLAMP(f, 0, 255);
+    output[stbir__decode_order1] = (unsigned char) f;
+
+    output += 2;
+    encode += 2;
+  } while( output < end_output );
+}
+
+#endif
+
+static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decodep, int width_times_channels, void const * inputp )
+{
+  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
+  float * decode_end = (float*) decode + width_times_channels;
+  unsigned short const * input = (unsigned short const *)inputp;
+
+  #ifdef STBIR_SIMD
+  unsigned short const * end_input_m8 = input + width_times_channels - 8;
+  if ( width_times_channels >= 8 )
+  {
+    decode_end -= 8;
+    for(;;)
+    {
+      #ifdef STBIR_SIMD8
+      stbir__simdi i; stbir__simdi8 o;
+      stbir__simdf8 of;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdi_load( i, input );
+      stbir__simdi8_expand_u16_to_u32( o, i );
+      stbir__simdi8_convert_i32_to_float( of, o );
+      stbir__simdf8_mult( of, of, STBIR_max_uint16_as_float_inverted8);
+      stbir__decode_simdf8_flip( of );
+      stbir__simdf8_store( decode + 0, of );
+      #else
+      stbir__simdi i, o0, o1;
+      stbir__simdf of0, of1;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdi_load( i, input );
+      stbir__simdi_expand_u16_to_u32( o0,o1,i );
+      stbir__simdi_convert_i32_to_float( of0, o0 );
+      stbir__simdi_convert_i32_to_float( of1, o1 );
+      stbir__simdf_mult( of0, of0, STBIR__CONSTF(STBIR_max_uint16_as_float_inverted) );
+      stbir__simdf_mult( of1, of1, STBIR__CONSTF(STBIR_max_uint16_as_float_inverted));
+      stbir__decode_simdf4_flip( of0 );
+      stbir__decode_simdf4_flip( of1 );
+      stbir__simdf_store( decode + 0,  of0 );
+      stbir__simdf_store( decode + 4,  of1 );
+      #endif
+      decode += 8;  
+      input += 8;
+      if ( decode <= decode_end ) 
+        continue;
+      if ( decode == ( decode_end + 8 ) )
+        break;
+      decode = decode_end; // backup and do last couple
+      input = end_input_m8;
+    }
+    return;
+  }
+  #endif
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  decode += 4;
+  while( decode <= decode_end )
+  {
+    STBIR_SIMD_NO_UNROLL(decode);
+    decode[0-4] = ((float)(input[stbir__decode_order0])) * stbir__max_uint16_as_float_inverted;
+    decode[1-4] = ((float)(input[stbir__decode_order1])) * stbir__max_uint16_as_float_inverted;
+    decode[2-4] = ((float)(input[stbir__decode_order2])) * stbir__max_uint16_as_float_inverted;
+    decode[3-4] = ((float)(input[stbir__decode_order3])) * stbir__max_uint16_as_float_inverted;
+    decode += 4;
+    input += 4;
+  }
+  decode -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( decode < decode_end )
+  {
+    STBIR_NO_UNROLL(decode);
+    decode[0] = ((float)(input[stbir__decode_order0])) * stbir__max_uint16_as_float_inverted;
+    #if stbir__coder_min_num >= 2
+    decode[1] = ((float)(input[stbir__decode_order1])) * stbir__max_uint16_as_float_inverted;
+    #endif
+    #if stbir__coder_min_num >= 3
+    decode[2] = ((float)(input[stbir__decode_order2])) * stbir__max_uint16_as_float_inverted;
+    #endif
+    decode += stbir__coder_min_num;
+    input += stbir__coder_min_num;
+  }
+  #endif
+}
+
+
+static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * outputp, int width_times_channels, float const * encode )
+{
+  unsigned short STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned short*) outputp;
+  unsigned short * end_output = ( (unsigned short*) output ) + width_times_channels;
+
+  #ifdef STBIR_SIMD
+  {
+    if ( width_times_channels >= stbir__simdfX_float_count*2 )
+    {
+      float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
+      end_output -= stbir__simdfX_float_count*2;
+      for(;;)
+      {
+        stbir__simdfX e0, e1;
+        stbir__simdiX i;
+        STBIR_SIMD_NO_UNROLL(encode);
+        stbir__simdfX_madd_mem( e0, STBIR_simd_point5X, STBIR_max_uint16_as_floatX, encode );
+        stbir__simdfX_madd_mem( e1, STBIR_simd_point5X, STBIR_max_uint16_as_floatX, encode+stbir__simdfX_float_count );
+        stbir__encode_simdfX_unflip( e0 );
+        stbir__encode_simdfX_unflip( e1 );
+        stbir__simdfX_pack_to_words( i, e0, e1 );
+        stbir__simdiX_store( output, i );
+        encode += stbir__simdfX_float_count*2;
+        output += stbir__simdfX_float_count*2;
+        if ( output <= end_output ) 
+          continue;
+        if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
+          break;
+        output = end_output;     // backup and do last couple
+        encode = end_encode_m8;
+      }
+      return;
+    }
+  }
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while( output <= end_output )
+  {
+    stbir__simdf e;
+    stbir__simdi i;
+    STBIR_NO_UNROLL(encode);
+    stbir__simdf_load( e, encode );
+    stbir__simdf_madd( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), e );
+    stbir__encode_simdf4_unflip( e );
+    stbir__simdf_pack_to_8words( i, e, e );  // only use first 4
+    stbir__simdi_store2( output-4, i );
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( output < end_output )
+  {
+    stbir__simdf e;
+    STBIR_NO_UNROLL(encode);
+    stbir__simdf_madd1_mem( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), encode+stbir__encode_order0 ); output[0] = stbir__simdf_convert_float_to_short( e );
+    #if stbir__coder_min_num >= 2
+    stbir__simdf_madd1_mem( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), encode+stbir__encode_order1 ); output[1] = stbir__simdf_convert_float_to_short( e );
+    #endif
+    #if stbir__coder_min_num >= 3
+    stbir__simdf_madd1_mem( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), encode+stbir__encode_order2 ); output[2] = stbir__simdf_convert_float_to_short( e );
+    #endif
+    output += stbir__coder_min_num;
+    encode += stbir__coder_min_num;
+  }
+  #endif
+  
+  #else
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while( output <= end_output )
+  {
+    float f;
+    STBIR_SIMD_NO_UNROLL(encode);
+    f = encode[stbir__encode_order0] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0-4] = (unsigned short)f;
+    f = encode[stbir__encode_order1] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1-4] = (unsigned short)f;
+    f = encode[stbir__encode_order2] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2-4] = (unsigned short)f;
+    f = encode[stbir__encode_order3] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[3-4] = (unsigned short)f;
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( output < end_output )
+  {
+    float f;
+    STBIR_NO_UNROLL(encode);
+    f = encode[stbir__encode_order0] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0] = (unsigned short)f;
+    #if stbir__coder_min_num >= 2
+    f = encode[stbir__encode_order1] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1] = (unsigned short)f;
+    #endif
+    #if stbir__coder_min_num >= 3
+    f = encode[stbir__encode_order2] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2] = (unsigned short)f;
+    #endif
+    output += stbir__coder_min_num;
+    encode += stbir__coder_min_num;
+  }
+  #endif
+  #endif
+}
+
+static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int width_times_channels, void const * inputp )
+{
+  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
+  float * decode_end = (float*) decode + width_times_channels;
+  unsigned short const * input = (unsigned short const *)inputp;
+
+  #ifdef STBIR_SIMD
+  unsigned short const * end_input_m8 = input + width_times_channels - 8;
+  if ( width_times_channels >= 8 )
+  {
+    decode_end -= 8;
+    for(;;)
+    {
+      #ifdef STBIR_SIMD8
+      stbir__simdi i; stbir__simdi8 o;
+      stbir__simdf8 of;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdi_load( i, input );
+      stbir__simdi8_expand_u16_to_u32( o, i );
+      stbir__simdi8_convert_i32_to_float( of, o );
+      stbir__decode_simdf8_flip( of );
+      stbir__simdf8_store( decode + 0, of );
+      #else
+      stbir__simdi i, o0, o1;
+      stbir__simdf of0, of1;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdi_load( i, input );
+      stbir__simdi_expand_u16_to_u32( o0, o1, i );
+      stbir__simdi_convert_i32_to_float( of0, o0 );
+      stbir__simdi_convert_i32_to_float( of1, o1 );
+      stbir__decode_simdf4_flip( of0 );
+      stbir__decode_simdf4_flip( of1 );
+      stbir__simdf_store( decode + 0,  of0 );
+      stbir__simdf_store( decode + 4,  of1 );
+      #endif
+      decode += 8;
+      input += 8;
+      if ( decode <= decode_end ) 
+        continue;
+      if ( decode == ( decode_end + 8 ) )
+        break;
+      decode = decode_end; // backup and do last couple
+      input = end_input_m8;
+    }
+    return;
+  }
+  #endif
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  decode += 4;
+  while( decode <= decode_end )
+  {
+    STBIR_SIMD_NO_UNROLL(decode);
+    decode[0-4] = ((float)(input[stbir__decode_order0]));
+    decode[1-4] = ((float)(input[stbir__decode_order1]));
+    decode[2-4] = ((float)(input[stbir__decode_order2]));
+    decode[3-4] = ((float)(input[stbir__decode_order3]));
+    decode += 4;
+    input += 4;
+  }
+  decode -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( decode < decode_end )
+  {
+    STBIR_NO_UNROLL(decode);
+    decode[0] = ((float)(input[stbir__decode_order0]));
+    #if stbir__coder_min_num >= 2
+    decode[1] = ((float)(input[stbir__decode_order1]));
+    #endif
+    #if stbir__coder_min_num >= 3
+    decode[2] = ((float)(input[stbir__decode_order2]));
+    #endif
+    decode += stbir__coder_min_num;
+    input += stbir__coder_min_num;
+  }
+  #endif
+}
+
+static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int width_times_channels, float const * encode )
+{
+  unsigned short STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned short*) outputp;
+  unsigned short * end_output = ( (unsigned short*) output ) + width_times_channels;
+
+  #ifdef STBIR_SIMD
+  {
+    if ( width_times_channels >= stbir__simdfX_float_count*2 )
+    {
+      float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
+      end_output -= stbir__simdfX_float_count*2;
+      for(;;)
+      {
+        stbir__simdfX e0, e1;
+        stbir__simdiX i;
+        STBIR_SIMD_NO_UNROLL(encode);
+        stbir__simdfX_add_mem( e0, STBIR_simd_point5X, encode );
+        stbir__simdfX_add_mem( e1, STBIR_simd_point5X, encode+stbir__simdfX_float_count );
+        stbir__encode_simdfX_unflip( e0 );
+        stbir__encode_simdfX_unflip( e1 );
+        stbir__simdfX_pack_to_words( i, e0, e1 );
+        stbir__simdiX_store( output, i );
+        encode += stbir__simdfX_float_count*2;
+        output += stbir__simdfX_float_count*2;
+        if ( output <= end_output ) 
+          continue;
+        if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
+          break;
+        output = end_output; // backup and do last couple
+        encode = end_encode_m8;
+      }
+      return;
+    }
+  }
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while( output <= end_output )
+  {
+    stbir__simdf e;
+    stbir__simdi i;
+    STBIR_NO_UNROLL(encode);
+    stbir__simdf_load( e, encode );
+    stbir__simdf_add( e, STBIR__CONSTF(STBIR_simd_point5), e );
+    stbir__encode_simdf4_unflip( e );
+    stbir__simdf_pack_to_8words( i, e, e );  // only use first 4
+    stbir__simdi_store2( output-4, i );
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  #else
+
+  // try to do blocks of 4 when you can
+  #if  stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while( output <= end_output )
+  {
+    float f;
+    STBIR_SIMD_NO_UNROLL(encode);
+    f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0-4] = (unsigned short)f;
+    f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1-4] = (unsigned short)f;
+    f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2-4] = (unsigned short)f;
+    f = encode[stbir__encode_order3] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[3-4] = (unsigned short)f;
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( output < end_output )
+  {
+    float f;
+    STBIR_NO_UNROLL(encode);
+    f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0] = (unsigned short)f;
+    #if stbir__coder_min_num >= 2
+    f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1] = (unsigned short)f;
+    #endif
+    #if stbir__coder_min_num >= 3
+    f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2] = (unsigned short)f;
+    #endif
+    output += stbir__coder_min_num;
+    encode += stbir__coder_min_num;
+  }
+  #endif
+}
+
+static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep, int width_times_channels, void const * inputp )
+{
+  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
+  float * decode_end = (float*) decode + width_times_channels;
+  stbir__FP16 const * input = (stbir__FP16 const *)inputp;
+
+  #ifdef STBIR_SIMD
+  if ( width_times_channels >= 8 )
+  {
+    stbir__FP16 const * end_input_m8 = input + width_times_channels - 8;
+    decode_end -= 8;
+    for(;;)
+    {
+      STBIR_NO_UNROLL(decode);
+
+      stbir__half_to_float_SIMD( decode, input );
+      #ifdef stbir__decode_swizzle
+      #ifdef STBIR_SIMD8
+      {
+        stbir__simdf8 of;
+        stbir__simdf8_load( of, decode );
+        stbir__decode_simdf8_flip( of );
+        stbir__simdf8_store( decode, of );
+      }
+      #else
+      {
+        stbir__simdf of0,of1;
+        stbir__simdf_load( of0, decode );
+        stbir__simdf_load( of1, decode+4 );
+        stbir__decode_simdf4_flip( of0 );
+        stbir__decode_simdf4_flip( of1 );
+        stbir__simdf_store( decode, of0 );
+        stbir__simdf_store( decode+4, of1 );
+      }
+      #endif
+      #endif
+      decode += 8;
+      input += 8;
+      if ( decode <= decode_end ) 
+        continue;
+      if ( decode == ( decode_end + 8 ) )
+        break;
+      decode = decode_end; // backup and do last couple
+      input = end_input_m8;
+    }
+    return;
+  }
+  #endif
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  decode += 4;
+  while( decode <= decode_end )
+  {
+    STBIR_SIMD_NO_UNROLL(decode);
+    decode[0-4] = stbir__half_to_float(input[stbir__decode_order0]);
+    decode[1-4] = stbir__half_to_float(input[stbir__decode_order1]);
+    decode[2-4] = stbir__half_to_float(input[stbir__decode_order2]);
+    decode[3-4] = stbir__half_to_float(input[stbir__decode_order3]);
+    decode += 4;
+    input += 4;
+  }
+  decode -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( decode < decode_end )
+  {
+    STBIR_NO_UNROLL(decode);
+    decode[0] = stbir__half_to_float(input[stbir__decode_order0]);
+    #if stbir__coder_min_num >= 2
+    decode[1] = stbir__half_to_float(input[stbir__decode_order1]);
+    #endif
+    #if stbir__coder_min_num >= 3
+    decode[2] = stbir__half_to_float(input[stbir__decode_order2]);
+    #endif
+    decode += stbir__coder_min_num;
+    input += stbir__coder_min_num;
+  }
+  #endif
+}
+
+static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp, int width_times_channels, float const * encode )
+{
+  stbir__FP16 STBIR_SIMD_STREAMOUT_PTR( * ) output = (stbir__FP16*) outputp;
+  stbir__FP16 * end_output = ( (stbir__FP16*) output ) + width_times_channels;
+
+  #ifdef STBIR_SIMD
+  if ( width_times_channels >= 8 )
+  {
+    float const * end_encode_m8 = encode + width_times_channels - 8;
+    end_output -= 8;
+    for(;;)
+    {
+      STBIR_SIMD_NO_UNROLL(encode);
+      #ifdef stbir__decode_swizzle
+      #ifdef STBIR_SIMD8
+      {
+        stbir__simdf8 of;
+        stbir__simdf8_load( of, encode );
+        stbir__encode_simdf8_unflip( of );
+        stbir__float_to_half_SIMD( output, (float*)&of );
+      }
+      #else
+      {
+        stbir__simdf of[2];
+        stbir__simdf_load( of[0], encode );
+        stbir__simdf_load( of[1], encode+4 );
+        stbir__encode_simdf4_unflip( of[0] );
+        stbir__encode_simdf4_unflip( of[1] );
+        stbir__float_to_half_SIMD( output, (float*)of );
+      }
+      #endif
+      #else
+      stbir__float_to_half_SIMD( output, encode );
+      #endif
+      encode += 8;
+      output += 8;
+      if ( output <= end_output ) 
+        continue;
+      if ( output == ( end_output + 8 ) )
+        break;
+      output = end_output; // backup and do last couple
+      encode = end_encode_m8;
+    }
+    return;
+  }
+  #endif
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while( output <= end_output )
+  {
+    STBIR_SIMD_NO_UNROLL(output);
+    output[0-4] = stbir__float_to_half(encode[stbir__encode_order0]);
+    output[1-4] = stbir__float_to_half(encode[stbir__encode_order1]);
+    output[2-4] = stbir__float_to_half(encode[stbir__encode_order2]);
+    output[3-4] = stbir__float_to_half(encode[stbir__encode_order3]);
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( output < end_output )
+  {
+    STBIR_NO_UNROLL(output);
+    output[0] = stbir__float_to_half(encode[stbir__encode_order0]);
+    #if stbir__coder_min_num >= 2
+    output[1] = stbir__float_to_half(encode[stbir__encode_order1]);
+    #endif
+    #if stbir__coder_min_num >= 3
+    output[2] = stbir__float_to_half(encode[stbir__encode_order2]);
+    #endif
+    output += stbir__coder_min_num;
+    encode += stbir__coder_min_num;
+  }
+  #endif
+}
+
+static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int width_times_channels, void const * inputp )
+{
+  #ifdef stbir__decode_swizzle
+  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
+  float * decode_end = (float*) decode + width_times_channels;
+  float const * input = (float const *)inputp;
+
+  #ifdef STBIR_SIMD
+  if ( width_times_channels >= 16 )
+  {
+    float const * end_input_m16 = input + width_times_channels - 16;
+    decode_end -= 16;
+    for(;;)
+    {
+      STBIR_NO_UNROLL(decode);
+      #ifdef stbir__decode_swizzle
+      #ifdef STBIR_SIMD8
+      {
+        stbir__simdf8 of0,of1;
+        stbir__simdf8_load( of0, input );
+        stbir__simdf8_load( of1, input+8 );
+        stbir__decode_simdf8_flip( of0 );
+        stbir__decode_simdf8_flip( of1 );
+        stbir__simdf8_store( decode, of0 );
+        stbir__simdf8_store( decode+8, of1 );
+      }
+      #else
+      {
+        stbir__simdf of0,of1,of2,of3;
+        stbir__simdf_load( of0, input );
+        stbir__simdf_load( of1, input+4 );
+        stbir__simdf_load( of2, input+8 );
+        stbir__simdf_load( of3, input+12 );
+        stbir__decode_simdf4_flip( of0 );
+        stbir__decode_simdf4_flip( of1 );
+        stbir__decode_simdf4_flip( of2 );
+        stbir__decode_simdf4_flip( of3 );
+        stbir__simdf_store( decode, of0 );
+        stbir__simdf_store( decode+4, of1 );
+        stbir__simdf_store( decode+8, of2 );
+        stbir__simdf_store( decode+12, of3 );
+      }
+      #endif
+      #endif
+      decode += 16;
+      input += 16;
+      if ( decode <= decode_end ) 
+        continue;
+      if ( decode == ( decode_end + 16 ) )
+        break;
+      decode = decode_end; // backup and do last couple
+      input = end_input_m16;
+    }
+    return;
+  }
+  #endif
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  decode += 4;
+  while( decode <= decode_end )
+  {
+    STBIR_SIMD_NO_UNROLL(decode);
+    decode[0-4] = input[stbir__decode_order0];
+    decode[1-4] = input[stbir__decode_order1];
+    decode[2-4] = input[stbir__decode_order2];
+    decode[3-4] = input[stbir__decode_order3];
+    decode += 4;
+    input += 4;
+  }
+  decode -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( decode < decode_end )
+  {
+    STBIR_NO_UNROLL(decode);
+    decode[0] = input[stbir__decode_order0];
+    #if stbir__coder_min_num >= 2
+    decode[1] = input[stbir__decode_order1];
+    #endif
+    #if stbir__coder_min_num >= 3
+    decode[2] = input[stbir__decode_order2];
+    #endif
+    decode += stbir__coder_min_num;
+    input += stbir__coder_min_num;
+  }
+  #endif
+
+  #else
+  
+  if ( (void*)decodep != inputp )
+    STBIR_MEMCPY( decodep, inputp, width_times_channels * sizeof( float ) );
+  
+  #endif
+}
+
+static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int width_times_channels, float const * encode )
+{
+  #if !defined( STBIR_FLOAT_HIGH_CLAMP ) && !defined(STBIR_FLOAT_LO_CLAMP) && !defined(stbir__decode_swizzle)
+
+  if ( (void*)outputp != (void*) encode )
+    STBIR_MEMCPY( outputp, encode, width_times_channels * sizeof( float ) );
+
+  #else
+
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = (float*) outputp;
+  float * end_output = ( (float*) output ) + width_times_channels;
+
+  #ifdef STBIR_FLOAT_HIGH_CLAMP
+  #define stbir_scalar_hi_clamp( v ) if ( v > STBIR_FLOAT_HIGH_CLAMP ) v = STBIR_FLOAT_HIGH_CLAMP;
+  #else
+  #define stbir_scalar_hi_clamp( v )
+  #endif
+  #ifdef STBIR_FLOAT_LOW_CLAMP
+  #define stbir_scalar_lo_clamp( v ) if ( v < STBIR_FLOAT_LOW_CLAMP ) v = STBIR_FLOAT_LOW_CLAMP;
+  #else
+  #define stbir_scalar_lo_clamp( v )
+  #endif
+
+  #ifdef STBIR_SIMD
+
+  #ifdef STBIR_FLOAT_HIGH_CLAMP
+  const stbir__simdfX high_clamp = stbir__simdf_frepX(STBIR_FLOAT_HIGH_CLAMP);
+  #endif
+  #ifdef STBIR_FLOAT_LOW_CLAMP
+  const stbir__simdfX low_clamp = stbir__simdf_frepX(STBIR_FLOAT_LOW_CLAMP);
+  #endif
+
+  if ( width_times_channels >= ( stbir__simdfX_float_count * 2 ) )
+  {
+    float const * end_encode_m8 = encode + width_times_channels - ( stbir__simdfX_float_count * 2 );
+    end_output -= ( stbir__simdfX_float_count * 2 );
+    for(;;)
+    {
+      stbir__simdfX e0, e1;
+      STBIR_SIMD_NO_UNROLL(encode);
+      stbir__simdfX_load( e0, encode );
+      stbir__simdfX_load( e1, encode+stbir__simdfX_float_count );
+#ifdef STBIR_FLOAT_HIGH_CLAMP
+      stbir__simdfX_min( e0, e0, high_clamp );
+      stbir__simdfX_min( e1, e1, high_clamp );
+#endif      
+#ifdef STBIR_FLOAT_LOW_CLAMP
+      stbir__simdfX_max( e0, e0, low_clamp );
+      stbir__simdfX_max( e1, e1, low_clamp );
+#endif      
+      stbir__encode_simdfX_unflip( e0 );
+      stbir__encode_simdfX_unflip( e1 );
+      stbir__simdfX_store( output, e0 );
+      stbir__simdfX_store( output+stbir__simdfX_float_count, e1 );
+      encode += stbir__simdfX_float_count * 2;
+      output += stbir__simdfX_float_count * 2;
+      if ( output < end_output ) 
+        continue;
+      if ( output == ( end_output + ( stbir__simdfX_float_count * 2 ) ) )
+        break;
+      output = end_output; // backup and do last couple
+      encode = end_encode_m8;
+    }
+    return;
+  }
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while( output <= end_output )
+  {
+    stbir__simdf e0;
+    STBIR_NO_UNROLL(encode);
+    stbir__simdf_load( e0, encode );
+#ifdef STBIR_FLOAT_HIGH_CLAMP
+    stbir__simdf_min( e0, e0, high_clamp );
+#endif      
+#ifdef STBIR_FLOAT_LOW_CLAMP
+    stbir__simdf_max( e0, e0, low_clamp );
+#endif      
+    stbir__encode_simdf4_unflip( e0 );
+    stbir__simdf_store( output-4, e0 );
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  #else
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while( output <= end_output )
+  {
+    float e;
+    STBIR_SIMD_NO_UNROLL(encode);
+    e = encode[ stbir__encode_order0 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[0-4] = e;
+    e = encode[ stbir__encode_order1 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[1-4] = e;
+    e = encode[ stbir__encode_order2 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[2-4] = e;
+    e = encode[ stbir__encode_order3 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[3-4] = e;
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+
+  #endif
+
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( output < end_output )
+  {
+    float e;
+    STBIR_NO_UNROLL(encode);
+    e = encode[ stbir__encode_order0 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[0] = e;
+    #if stbir__coder_min_num >= 2
+    e = encode[ stbir__encode_order1 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[1] = e;
+    #endif
+    #if stbir__coder_min_num >= 3
+    e = encode[ stbir__encode_order2 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[2] = e;
+    #endif
+    output += stbir__coder_min_num;
+    encode += stbir__coder_min_num;
+  }
+  #endif
+  
+  #endif
+}
+
+#undef stbir__decode_suffix 
+#undef stbir__decode_simdf8_flip
+#undef stbir__decode_simdf4_flip
+#undef stbir__decode_order0 
+#undef stbir__decode_order1
+#undef stbir__decode_order2
+#undef stbir__decode_order3
+#undef stbir__encode_order0 
+#undef stbir__encode_order1
+#undef stbir__encode_order2
+#undef stbir__encode_order3
+#undef stbir__encode_simdf8_unflip
+#undef stbir__encode_simdf4_unflip
+#undef stbir__encode_simdfX_unflip
+#undef STBIR__CODER_NAME
+#undef stbir__coder_min_num
+#undef stbir__decode_swizzle
+#undef stbir_scalar_hi_clamp
+#undef stbir_scalar_lo_clamp
+#undef STB_IMAGE_RESIZE_DO_CODERS
+
+#elif defined( STB_IMAGE_RESIZE_DO_VERTICALS)
+
+#ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#define STBIR_chans( start, end ) STBIR_strs_join14(start,STBIR__vertical_channels,end,_cont)
+#else
+#define STBIR_chans( start, end ) STBIR_strs_join1(start,STBIR__vertical_channels,end)
+#endif
+
+#if STBIR__vertical_channels >= 1
+#define stbIF0( code ) code
+#else
+#define stbIF0( code )
+#endif
+#if STBIR__vertical_channels >= 2
+#define stbIF1( code ) code
+#else
+#define stbIF1( code )
+#endif
+#if STBIR__vertical_channels >= 3
+#define stbIF2( code ) code
+#else
+#define stbIF2( code )
+#endif
+#if STBIR__vertical_channels >= 4
+#define stbIF3( code ) code
+#else
+#define stbIF3( code )
+#endif
+#if STBIR__vertical_channels >= 5
+#define stbIF4( code ) code
+#else
+#define stbIF4( code )
+#endif
+#if STBIR__vertical_channels >= 6
+#define stbIF5( code ) code
+#else
+#define stbIF5( code )
+#endif
+#if STBIR__vertical_channels >= 7
+#define stbIF6( code ) code
+#else
+#define stbIF6( code )
+#endif
+#if STBIR__vertical_channels >= 8
+#define stbIF7( code ) code
+#else
+#define stbIF7( code )
+#endif
+
+static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** outputs, float const * vertical_coefficients, float const * input, float const * input_end )
+{
+  stbIF0( float STBIR_SIMD_STREAMOUT_PTR( * ) output0 = outputs[0]; float c0s = vertical_coefficients[0]; )
+  stbIF1( float STBIR_SIMD_STREAMOUT_PTR( * ) output1 = outputs[1]; float c1s = vertical_coefficients[1]; )
+  stbIF2( float STBIR_SIMD_STREAMOUT_PTR( * ) output2 = outputs[2]; float c2s = vertical_coefficients[2]; )
+  stbIF3( float STBIR_SIMD_STREAMOUT_PTR( * ) output3 = outputs[3]; float c3s = vertical_coefficients[3]; )
+  stbIF4( float STBIR_SIMD_STREAMOUT_PTR( * ) output4 = outputs[4]; float c4s = vertical_coefficients[4]; )
+  stbIF5( float STBIR_SIMD_STREAMOUT_PTR( * ) output5 = outputs[5]; float c5s = vertical_coefficients[5]; )
+  stbIF6( float STBIR_SIMD_STREAMOUT_PTR( * ) output6 = outputs[6]; float c6s = vertical_coefficients[6]; )
+  stbIF7( float STBIR_SIMD_STREAMOUT_PTR( * ) output7 = outputs[7]; float c7s = vertical_coefficients[7]; )
+
+  #ifdef STBIR_SIMD
+  {
+    stbIF0(stbir__simdfX c0 = stbir__simdf_frepX( c0s ); )
+    stbIF1(stbir__simdfX c1 = stbir__simdf_frepX( c1s ); )
+    stbIF2(stbir__simdfX c2 = stbir__simdf_frepX( c2s ); )
+    stbIF3(stbir__simdfX c3 = stbir__simdf_frepX( c3s ); )
+    stbIF4(stbir__simdfX c4 = stbir__simdf_frepX( c4s ); )
+    stbIF5(stbir__simdfX c5 = stbir__simdf_frepX( c5s ); )
+    stbIF6(stbir__simdfX c6 = stbir__simdf_frepX( c6s ); )
+    stbIF7(stbir__simdfX c7 = stbir__simdf_frepX( c7s ); )
+    while ( ( (char*)input_end - (char*) input ) >= (16*stbir__simdfX_float_count) ) 
+    {
+      stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3;
+      STBIR_SIMD_NO_UNROLL(output0);
+
+      stbir__simdfX_load( r0, input );               stbir__simdfX_load( r1, input+stbir__simdfX_float_count );     stbir__simdfX_load( r2, input+(2*stbir__simdfX_float_count) );      stbir__simdfX_load( r3, input+(3*stbir__simdfX_float_count) );
+
+      #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+      stbIF0( stbir__simdfX_load( o0, output0 );     stbir__simdfX_load( o1, output0+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output0+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output0+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c0 );  stbir__simdfX_madd( o1, o1, r1, c0 );  stbir__simdfX_madd( o2, o2, r2, c0 );   stbir__simdfX_madd( o3, o3, r3, c0 );           
+              stbir__simdfX_store( output0, o0 );    stbir__simdfX_store( output0+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output0+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output0+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF1( stbir__simdfX_load( o0, output1 );     stbir__simdfX_load( o1, output1+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output1+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output1+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c1 );  stbir__simdfX_madd( o1, o1, r1, c1 );  stbir__simdfX_madd( o2, o2, r2, c1 );   stbir__simdfX_madd( o3, o3, r3, c1 );             
+              stbir__simdfX_store( output1, o0 );    stbir__simdfX_store( output1+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output1+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output1+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF2( stbir__simdfX_load( o0, output2 );     stbir__simdfX_load( o1, output2+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output2+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output2+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c2 );  stbir__simdfX_madd( o1, o1, r1, c2 );  stbir__simdfX_madd( o2, o2, r2, c2 );   stbir__simdfX_madd( o3, o3, r3, c2 );             
+              stbir__simdfX_store( output2, o0 );    stbir__simdfX_store( output2+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output2+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output2+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF3( stbir__simdfX_load( o0, output3 );     stbir__simdfX_load( o1, output3+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output3+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output3+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c3 );  stbir__simdfX_madd( o1, o1, r1, c3 );  stbir__simdfX_madd( o2, o2, r2, c3 );   stbir__simdfX_madd( o3, o3, r3, c3 );             
+              stbir__simdfX_store( output3, o0 );    stbir__simdfX_store( output3+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output3+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output3+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF4( stbir__simdfX_load( o0, output4 );     stbir__simdfX_load( o1, output4+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output4+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output4+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c4 );  stbir__simdfX_madd( o1, o1, r1, c4 );  stbir__simdfX_madd( o2, o2, r2, c4 );   stbir__simdfX_madd( o3, o3, r3, c4 );             
+              stbir__simdfX_store( output4, o0 );    stbir__simdfX_store( output4+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output4+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output4+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF5( stbir__simdfX_load( o0, output5 );     stbir__simdfX_load( o1, output5+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output5+(2*stbir__simdfX_float_count));    stbir__simdfX_load( o3, output5+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c5 );  stbir__simdfX_madd( o1, o1, r1, c5 );  stbir__simdfX_madd( o2, o2, r2, c5 );   stbir__simdfX_madd( o3, o3, r3, c5 );             
+              stbir__simdfX_store( output5, o0 );    stbir__simdfX_store( output5+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output5+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output5+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF6( stbir__simdfX_load( o0, output6 );     stbir__simdfX_load( o1, output6+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output6+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output6+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c6 );  stbir__simdfX_madd( o1, o1, r1, c6 );  stbir__simdfX_madd( o2, o2, r2, c6 );   stbir__simdfX_madd( o3, o3, r3, c6 );             
+              stbir__simdfX_store( output6, o0 );    stbir__simdfX_store( output6+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output6+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output6+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF7( stbir__simdfX_load( o0, output7 );     stbir__simdfX_load( o1, output7+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output7+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output7+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c7 );  stbir__simdfX_madd( o1, o1, r1, c7 );  stbir__simdfX_madd( o2, o2, r2, c7 );   stbir__simdfX_madd( o3, o3, r3, c7 );             
+              stbir__simdfX_store( output7, o0 );    stbir__simdfX_store( output7+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output7+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output7+(3*stbir__simdfX_float_count), o3 ); )
+      #else
+      stbIF0( stbir__simdfX_mult( o0, r0, c0 );      stbir__simdfX_mult( o1, r1, c0 );      stbir__simdfX_mult( o2, r2, c0 );       stbir__simdfX_mult( o3, r3, c0 );  
+              stbir__simdfX_store( output0, o0 );    stbir__simdfX_store( output0+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output0+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output0+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF1( stbir__simdfX_mult( o0, r0, c1 );      stbir__simdfX_mult( o1, r1, c1 );      stbir__simdfX_mult( o2, r2, c1 );       stbir__simdfX_mult( o3, r3, c1 );  
+              stbir__simdfX_store( output1, o0 );    stbir__simdfX_store( output1+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output1+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output1+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF2( stbir__simdfX_mult( o0, r0, c2 );      stbir__simdfX_mult( o1, r1, c2 );      stbir__simdfX_mult( o2, r2, c2 );       stbir__simdfX_mult( o3, r3, c2 );  
+              stbir__simdfX_store( output2, o0 );    stbir__simdfX_store( output2+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output2+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output2+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF3( stbir__simdfX_mult( o0, r0, c3 );      stbir__simdfX_mult( o1, r1, c3 );      stbir__simdfX_mult( o2, r2, c3 );       stbir__simdfX_mult( o3, r3, c3 );  
+              stbir__simdfX_store( output3, o0 );    stbir__simdfX_store( output3+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output3+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output3+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF4( stbir__simdfX_mult( o0, r0, c4 );      stbir__simdfX_mult( o1, r1, c4 );      stbir__simdfX_mult( o2, r2, c4 );       stbir__simdfX_mult( o3, r3, c4 );  
+              stbir__simdfX_store( output4, o0 );    stbir__simdfX_store( output4+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output4+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output4+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF5( stbir__simdfX_mult( o0, r0, c5 );      stbir__simdfX_mult( o1, r1, c5 );      stbir__simdfX_mult( o2, r2, c5 );       stbir__simdfX_mult( o3, r3, c5 );  
+              stbir__simdfX_store( output5, o0 );    stbir__simdfX_store( output5+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output5+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output5+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF6( stbir__simdfX_mult( o0, r0, c6 );      stbir__simdfX_mult( o1, r1, c6 );      stbir__simdfX_mult( o2, r2, c6 );       stbir__simdfX_mult( o3, r3, c6 );  
+              stbir__simdfX_store( output6, o0 );    stbir__simdfX_store( output6+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output6+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output6+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF7( stbir__simdfX_mult( o0, r0, c7 );      stbir__simdfX_mult( o1, r1, c7 );      stbir__simdfX_mult( o2, r2, c7 );       stbir__simdfX_mult( o3, r3, c7 );  
+              stbir__simdfX_store( output7, o0 );    stbir__simdfX_store( output7+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output7+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output7+(3*stbir__simdfX_float_count), o3 ); )
+      #endif
+
+      input += (4*stbir__simdfX_float_count);
+      stbIF0( output0 += (4*stbir__simdfX_float_count); ) stbIF1( output1 += (4*stbir__simdfX_float_count); ) stbIF2( output2 += (4*stbir__simdfX_float_count); ) stbIF3( output3 += (4*stbir__simdfX_float_count); ) stbIF4( output4 += (4*stbir__simdfX_float_count); ) stbIF5( output5 += (4*stbir__simdfX_float_count); ) stbIF6( output6 += (4*stbir__simdfX_float_count); ) stbIF7( output7 += (4*stbir__simdfX_float_count); )
+    }
+    while ( ( (char*)input_end - (char*) input ) >= 16 ) 
+    {
+      stbir__simdf o0, r0;
+      STBIR_SIMD_NO_UNROLL(output0);
+
+      stbir__simdf_load( r0, input );
+
+      #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+      stbIF0( stbir__simdf_load( o0, output0 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) );  stbir__simdf_store( output0, o0 ); )
+      stbIF1( stbir__simdf_load( o0, output1 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c1 ) );  stbir__simdf_store( output1, o0 ); )
+      stbIF2( stbir__simdf_load( o0, output2 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c2 ) );  stbir__simdf_store( output2, o0 ); )
+      stbIF3( stbir__simdf_load( o0, output3 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c3 ) );  stbir__simdf_store( output3, o0 ); )
+      stbIF4( stbir__simdf_load( o0, output4 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c4 ) );  stbir__simdf_store( output4, o0 ); )
+      stbIF5( stbir__simdf_load( o0, output5 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c5 ) );  stbir__simdf_store( output5, o0 ); )
+      stbIF6( stbir__simdf_load( o0, output6 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c6 ) );  stbir__simdf_store( output6, o0 ); )
+      stbIF7( stbir__simdf_load( o0, output7 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c7 ) );  stbir__simdf_store( output7, o0 ); )
+      #else
+      stbIF0( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) );   stbir__simdf_store( output0, o0 ); )
+      stbIF1( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c1 ) );   stbir__simdf_store( output1, o0 ); )
+      stbIF2( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c2 ) );   stbir__simdf_store( output2, o0 ); )
+      stbIF3( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c3 ) );   stbir__simdf_store( output3, o0 ); )
+      stbIF4( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c4 ) );   stbir__simdf_store( output4, o0 ); )
+      stbIF5( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c5 ) );   stbir__simdf_store( output5, o0 ); )
+      stbIF6( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c6 ) );   stbir__simdf_store( output6, o0 ); )
+      stbIF7( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c7 ) );   stbir__simdf_store( output7, o0 ); )
+      #endif
+      
+      input += 4;
+      stbIF0( output0 += 4; ) stbIF1( output1 += 4; ) stbIF2( output2 += 4; ) stbIF3( output3 += 4; ) stbIF4( output4 += 4; ) stbIF5( output5 += 4; ) stbIF6( output6 += 4; ) stbIF7( output7 += 4; )
+    }
+  }
+  #else
+  while ( ( (char*)input_end - (char*) input ) >= 16 ) 
+  {
+    float r0, r1, r2, r3;
+    STBIR_NO_UNROLL(input);
+
+    r0 = input[0], r1 = input[1], r2 = input[2], r3 = input[3];
+
+    #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+    stbIF0( output0[0] += ( r0 * c0s ); output0[1] += ( r1 * c0s ); output0[2] += ( r2 * c0s ); output0[3] += ( r3 * c0s ); )
+    stbIF1( output1[0] += ( r0 * c1s ); output1[1] += ( r1 * c1s ); output1[2] += ( r2 * c1s ); output1[3] += ( r3 * c1s ); )
+    stbIF2( output2[0] += ( r0 * c2s ); output2[1] += ( r1 * c2s ); output2[2] += ( r2 * c2s ); output2[3] += ( r3 * c2s ); )
+    stbIF3( output3[0] += ( r0 * c3s ); output3[1] += ( r1 * c3s ); output3[2] += ( r2 * c3s ); output3[3] += ( r3 * c3s ); )
+    stbIF4( output4[0] += ( r0 * c4s ); output4[1] += ( r1 * c4s ); output4[2] += ( r2 * c4s ); output4[3] += ( r3 * c4s ); )
+    stbIF5( output5[0] += ( r0 * c5s ); output5[1] += ( r1 * c5s ); output5[2] += ( r2 * c5s ); output5[3] += ( r3 * c5s ); )
+    stbIF6( output6[0] += ( r0 * c6s ); output6[1] += ( r1 * c6s ); output6[2] += ( r2 * c6s ); output6[3] += ( r3 * c6s ); )
+    stbIF7( output7[0] += ( r0 * c7s ); output7[1] += ( r1 * c7s ); output7[2] += ( r2 * c7s ); output7[3] += ( r3 * c7s ); )
+    #else
+    stbIF0( output0[0]  = ( r0 * c0s ); output0[1]  = ( r1 * c0s ); output0[2]  = ( r2 * c0s ); output0[3]  = ( r3 * c0s ); )
+    stbIF1( output1[0]  = ( r0 * c1s ); output1[1]  = ( r1 * c1s ); output1[2]  = ( r2 * c1s ); output1[3]  = ( r3 * c1s ); )
+    stbIF2( output2[0]  = ( r0 * c2s ); output2[1]  = ( r1 * c2s ); output2[2]  = ( r2 * c2s ); output2[3]  = ( r3 * c2s ); )
+    stbIF3( output3[0]  = ( r0 * c3s ); output3[1]  = ( r1 * c3s ); output3[2]  = ( r2 * c3s ); output3[3]  = ( r3 * c3s ); )
+    stbIF4( output4[0]  = ( r0 * c4s ); output4[1]  = ( r1 * c4s ); output4[2]  = ( r2 * c4s ); output4[3]  = ( r3 * c4s ); )
+    stbIF5( output5[0]  = ( r0 * c5s ); output5[1]  = ( r1 * c5s ); output5[2]  = ( r2 * c5s ); output5[3]  = ( r3 * c5s ); )
+    stbIF6( output6[0]  = ( r0 * c6s ); output6[1]  = ( r1 * c6s ); output6[2]  = ( r2 * c6s ); output6[3]  = ( r3 * c6s ); )
+    stbIF7( output7[0]  = ( r0 * c7s ); output7[1]  = ( r1 * c7s ); output7[2]  = ( r2 * c7s ); output7[3]  = ( r3 * c7s ); )
+    #endif
+
+    input += 4;
+    stbIF0( output0 += 4; ) stbIF1( output1 += 4; ) stbIF2( output2 += 4; ) stbIF3( output3 += 4; ) stbIF4( output4 += 4; ) stbIF5( output5 += 4; ) stbIF6( output6 += 4; ) stbIF7( output7 += 4; )
+  }
+  #endif
+  while ( input < input_end ) 
+  {
+    float r = input[0];
+    STBIR_NO_UNROLL(output0);
+
+    #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+    stbIF0( output0[0] += ( r * c0s ); )
+    stbIF1( output1[0] += ( r * c1s ); )
+    stbIF2( output2[0] += ( r * c2s ); )
+    stbIF3( output3[0] += ( r * c3s ); )
+    stbIF4( output4[0] += ( r * c4s ); )
+    stbIF5( output5[0] += ( r * c5s ); )
+    stbIF6( output6[0] += ( r * c6s ); )
+    stbIF7( output7[0] += ( r * c7s ); )
+    #else
+    stbIF0( output0[0]  = ( r * c0s ); )
+    stbIF1( output1[0]  = ( r * c1s ); )
+    stbIF2( output2[0]  = ( r * c2s ); )
+    stbIF3( output3[0]  = ( r * c3s ); )
+    stbIF4( output4[0]  = ( r * c4s ); )
+    stbIF5( output5[0]  = ( r * c5s ); )
+    stbIF6( output6[0]  = ( r * c6s ); )
+    stbIF7( output7[0]  = ( r * c7s ); )
+    #endif
+
+    ++input;
+    stbIF0( ++output0; ) stbIF1( ++output1; ) stbIF2( ++output2; ) stbIF3( ++output3; ) stbIF4( ++output4; ) stbIF5( ++output5; ) stbIF6( ++output6; ) stbIF7( ++output7; )
+  }
+}
+
+static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp, float const * vertical_coefficients, float const ** inputs, float const * input0_end )
+{
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = outputp;
+
+  stbIF0( float const * input0 = inputs[0]; float c0s = vertical_coefficients[0]; )
+  stbIF1( float const * input1 = inputs[1]; float c1s = vertical_coefficients[1]; )
+  stbIF2( float const * input2 = inputs[2]; float c2s = vertical_coefficients[2]; )
+  stbIF3( float const * input3 = inputs[3]; float c3s = vertical_coefficients[3]; )
+  stbIF4( float const * input4 = inputs[4]; float c4s = vertical_coefficients[4]; )
+  stbIF5( float const * input5 = inputs[5]; float c5s = vertical_coefficients[5]; )
+  stbIF6( float const * input6 = inputs[6]; float c6s = vertical_coefficients[6]; )
+  stbIF7( float const * input7 = inputs[7]; float c7s = vertical_coefficients[7]; )
+
+#if ( STBIR__vertical_channels == 1 ) && !defined(STB_IMAGE_RESIZE_VERTICAL_CONTINUE)
+  // check single channel one weight
+  if ( ( c0s >= (1.0f-0.000001f) ) && ( c0s <= (1.0f+0.000001f) ) )
+  {
+    STBIR_MEMCPY( output, input0, (char*)input0_end - (char*)input0 );
+    return;
+  }
+#endif  
+
+  #ifdef STBIR_SIMD
+  {
+    stbIF0(stbir__simdfX c0 = stbir__simdf_frepX( c0s ); )
+    stbIF1(stbir__simdfX c1 = stbir__simdf_frepX( c1s ); )
+    stbIF2(stbir__simdfX c2 = stbir__simdf_frepX( c2s ); )
+    stbIF3(stbir__simdfX c3 = stbir__simdf_frepX( c3s ); )
+    stbIF4(stbir__simdfX c4 = stbir__simdf_frepX( c4s ); )
+    stbIF5(stbir__simdfX c5 = stbir__simdf_frepX( c5s ); )
+    stbIF6(stbir__simdfX c6 = stbir__simdf_frepX( c6s ); )
+    stbIF7(stbir__simdfX c7 = stbir__simdf_frepX( c7s ); )
+    
+    while ( ( (char*)input0_end - (char*) input0 ) >= (16*stbir__simdfX_float_count) ) 
+    {
+      stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3;
+      STBIR_SIMD_NO_UNROLL(output);
+
+      // prefetch four loop iterations ahead (doesn't affect much for small resizes, but helps with big ones)
+      stbIF0( stbir__prefetch( input0 + (16*stbir__simdfX_float_count) ); ) 
+      stbIF1( stbir__prefetch( input1 + (16*stbir__simdfX_float_count) ); )
+      stbIF2( stbir__prefetch( input2 + (16*stbir__simdfX_float_count) ); )
+      stbIF3( stbir__prefetch( input3 + (16*stbir__simdfX_float_count) ); )
+      stbIF4( stbir__prefetch( input4 + (16*stbir__simdfX_float_count) ); )
+      stbIF5( stbir__prefetch( input5 + (16*stbir__simdfX_float_count) ); )
+      stbIF6( stbir__prefetch( input6 + (16*stbir__simdfX_float_count) ); )
+      stbIF7( stbir__prefetch( input7 + (16*stbir__simdfX_float_count) ); )
+
+      #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+      stbIF0( stbir__simdfX_load( o0, output );      stbir__simdfX_load( o1, output+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( o3, output+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_load( r0, input0 );      stbir__simdfX_load( r1, input0+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input0+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input0+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c0 );  stbir__simdfX_madd( o1, o1, r1, c0 );                         stbir__simdfX_madd( o2, o2, r2, c0 );                             stbir__simdfX_madd( o3, o3, r3, c0 ); )
+      #else
+      stbIF0( stbir__simdfX_load( r0, input0 );      stbir__simdfX_load( r1, input0+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input0+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input0+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_mult( o0, r0, c0 );      stbir__simdfX_mult( o1, r1, c0 );                             stbir__simdfX_mult( o2, r2, c0 );                                 stbir__simdfX_mult( o3, r3, c0 );  )
+      #endif
+
+      stbIF1( stbir__simdfX_load( r0, input1 );      stbir__simdfX_load( r1, input1+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input1+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input1+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c1 );  stbir__simdfX_madd( o1, o1, r1, c1 );                         stbir__simdfX_madd( o2, o2, r2, c1 );                             stbir__simdfX_madd( o3, o3, r3, c1 ); )
+      stbIF2( stbir__simdfX_load( r0, input2 );      stbir__simdfX_load( r1, input2+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input2+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input2+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c2 );  stbir__simdfX_madd( o1, o1, r1, c2 );                         stbir__simdfX_madd( o2, o2, r2, c2 );                             stbir__simdfX_madd( o3, o3, r3, c2 ); )
+      stbIF3( stbir__simdfX_load( r0, input3 );      stbir__simdfX_load( r1, input3+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input3+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input3+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c3 );  stbir__simdfX_madd( o1, o1, r1, c3 );                         stbir__simdfX_madd( o2, o2, r2, c3 );                             stbir__simdfX_madd( o3, o3, r3, c3 ); )
+      stbIF4( stbir__simdfX_load( r0, input4 );      stbir__simdfX_load( r1, input4+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input4+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input4+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c4 );  stbir__simdfX_madd( o1, o1, r1, c4 );                         stbir__simdfX_madd( o2, o2, r2, c4 );                             stbir__simdfX_madd( o3, o3, r3, c4 ); )
+      stbIF5( stbir__simdfX_load( r0, input5 );      stbir__simdfX_load( r1, input5+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input5+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input5+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c5 );  stbir__simdfX_madd( o1, o1, r1, c5 );                         stbir__simdfX_madd( o2, o2, r2, c5 );                             stbir__simdfX_madd( o3, o3, r3, c5 ); )
+      stbIF6( stbir__simdfX_load( r0, input6 );      stbir__simdfX_load( r1, input6+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input6+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input6+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c6 );  stbir__simdfX_madd( o1, o1, r1, c6 );                         stbir__simdfX_madd( o2, o2, r2, c6 );                             stbir__simdfX_madd( o3, o3, r3, c6 ); )
+      stbIF7( stbir__simdfX_load( r0, input7 );      stbir__simdfX_load( r1, input7+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input7+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input7+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c7 );  stbir__simdfX_madd( o1, o1, r1, c7 );                         stbir__simdfX_madd( o2, o2, r2, c7 );                             stbir__simdfX_madd( o3, o3, r3, c7 ); )
+
+      stbir__simdfX_store( output, o0 );             stbir__simdfX_store( output+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output+(2*stbir__simdfX_float_count), o2 );  stbir__simdfX_store( output+(3*stbir__simdfX_float_count), o3 );
+      output += (4*stbir__simdfX_float_count);
+      stbIF0( input0 += (4*stbir__simdfX_float_count); ) stbIF1( input1 += (4*stbir__simdfX_float_count); ) stbIF2( input2 += (4*stbir__simdfX_float_count); ) stbIF3( input3 += (4*stbir__simdfX_float_count); ) stbIF4( input4 += (4*stbir__simdfX_float_count); ) stbIF5( input5 += (4*stbir__simdfX_float_count); ) stbIF6( input6 += (4*stbir__simdfX_float_count); ) stbIF7( input7 += (4*stbir__simdfX_float_count); )
+    }
+
+    while ( ( (char*)input0_end - (char*) input0 ) >= 16 ) 
+    {
+      stbir__simdf o0, r0;
+      STBIR_SIMD_NO_UNROLL(output);
+
+      #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+      stbIF0( stbir__simdf_load( o0, output );   stbir__simdf_load( r0, input0 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) ); )
+      #else
+      stbIF0( stbir__simdf_load( r0, input0 );  stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) ); )
+      #endif
+      stbIF1( stbir__simdf_load( r0, input1 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c1 ) ); )
+      stbIF2( stbir__simdf_load( r0, input2 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c2 ) ); )
+      stbIF3( stbir__simdf_load( r0, input3 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c3 ) ); )
+      stbIF4( stbir__simdf_load( r0, input4 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c4 ) ); )
+      stbIF5( stbir__simdf_load( r0, input5 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c5 ) ); )
+      stbIF6( stbir__simdf_load( r0, input6 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c6 ) ); )
+      stbIF7( stbir__simdf_load( r0, input7 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c7 ) ); )
+
+      stbir__simdf_store( output, o0 );
+      output += 4;
+      stbIF0( input0 += 4; ) stbIF1( input1 += 4; ) stbIF2( input2 += 4; ) stbIF3( input3 += 4; ) stbIF4( input4 += 4; ) stbIF5( input5 += 4; ) stbIF6( input6 += 4; ) stbIF7( input7 += 4; )
+    }
+  }
+  #else
+  while ( ( (char*)input0_end - (char*) input0 ) >= 16 ) 
+  {
+    float o0, o1, o2, o3;
+    STBIR_NO_UNROLL(output);
+    #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+    stbIF0( o0 = output[0] + input0[0] * c0s; o1 = output[1] + input0[1] * c0s; o2 = output[2] + input0[2] * c0s; o3 = output[3] + input0[3] * c0s; )
+    #else
+    stbIF0( o0  = input0[0] * c0s; o1  = input0[1] * c0s; o2  = input0[2] * c0s; o3  = input0[3] * c0s; )
+    #endif
+    stbIF1( o0 += input1[0] * c1s; o1 += input1[1] * c1s; o2 += input1[2] * c1s; o3 += input1[3] * c1s; )
+    stbIF2( o0 += input2[0] * c2s; o1 += input2[1] * c2s; o2 += input2[2] * c2s; o3 += input2[3] * c2s; )
+    stbIF3( o0 += input3[0] * c3s; o1 += input3[1] * c3s; o2 += input3[2] * c3s; o3 += input3[3] * c3s; )
+    stbIF4( o0 += input4[0] * c4s; o1 += input4[1] * c4s; o2 += input4[2] * c4s; o3 += input4[3] * c4s; )
+    stbIF5( o0 += input5[0] * c5s; o1 += input5[1] * c5s; o2 += input5[2] * c5s; o3 += input5[3] * c5s; )
+    stbIF6( o0 += input6[0] * c6s; o1 += input6[1] * c6s; o2 += input6[2] * c6s; o3 += input6[3] * c6s; )
+    stbIF7( o0 += input7[0] * c7s; o1 += input7[1] * c7s; o2 += input7[2] * c7s; o3 += input7[3] * c7s; )
+    output[0] = o0; output[1] = o1; output[2] = o2; output[3] = o3;
+    output += 4;
+    stbIF0( input0 += 4; ) stbIF1( input1 += 4; ) stbIF2( input2 += 4; ) stbIF3( input3 += 4; ) stbIF4( input4 += 4; ) stbIF5( input5 += 4; ) stbIF6( input6 += 4; ) stbIF7( input7 += 4; )
+  }
+  #endif
+  while ( input0 < input0_end ) 
+  {
+    float o0;
+    STBIR_NO_UNROLL(output);
+    #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+    stbIF0( o0 = output[0] + input0[0] * c0s; )
+    #else
+    stbIF0( o0  = input0[0] * c0s; )
+    #endif
+    stbIF1( o0 += input1[0] * c1s; )
+    stbIF2( o0 += input2[0] * c2s; )
+    stbIF3( o0 += input3[0] * c3s; )
+    stbIF4( o0 += input4[0] * c4s; )
+    stbIF5( o0 += input5[0] * c5s; )
+    stbIF6( o0 += input6[0] * c6s; )
+    stbIF7( o0 += input7[0] * c7s; )
+    output[0] = o0; 
+    ++output;
+    stbIF0( ++input0; ) stbIF1( ++input1; ) stbIF2( ++input2; ) stbIF3( ++input3; ) stbIF4( ++input4; ) stbIF5( ++input5; ) stbIF6( ++input6; ) stbIF7( ++input7; )
+  }
+}
+
+#undef stbIF0
+#undef stbIF1
+#undef stbIF2
+#undef stbIF3
+#undef stbIF4
+#undef stbIF5
+#undef stbIF6
+#undef stbIF7
+#undef STB_IMAGE_RESIZE_DO_VERTICALS
+#undef STBIR__vertical_channels
+#undef STB_IMAGE_RESIZE_DO_HORIZONTALS
+#undef STBIR_strs_join24
+#undef STBIR_strs_join14
+#undef STBIR_chans
+#ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#undef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#endif
+
+#else // !STB_IMAGE_RESIZE_DO_VERTICALS
+
+#define STBIR_chans( start, end ) STBIR_strs_join1(start,STBIR__horizontal_channels,end)
+
+#ifndef stbir__2_coeff_only
+#define stbir__2_coeff_only()             \
+    stbir__1_coeff_only();                \
+    stbir__1_coeff_remnant(1);            
+#endif
+
+#ifndef stbir__2_coeff_remnant
+#define stbir__2_coeff_remnant( ofs )     \
+    stbir__1_coeff_remnant(ofs);          \
+    stbir__1_coeff_remnant((ofs)+1);      
+#endif
+    
+#ifndef stbir__3_coeff_only
+#define stbir__3_coeff_only()             \
+    stbir__2_coeff_only();                \
+    stbir__1_coeff_remnant(2);            
+#endif
+    
+#ifndef stbir__3_coeff_remnant
+#define stbir__3_coeff_remnant( ofs )     \
+    stbir__2_coeff_remnant(ofs);          \
+    stbir__1_coeff_remnant((ofs)+2);      
+#endif
+
+#ifndef stbir__3_coeff_setup
+#define stbir__3_coeff_setup()
+#endif
+
+#ifndef stbir__4_coeff_start
+#define stbir__4_coeff_start()            \
+    stbir__2_coeff_only();                \
+    stbir__2_coeff_remnant(2);            
+#endif
+    
+#ifndef stbir__4_coeff_continue_from_4
+#define stbir__4_coeff_continue_from_4( ofs )     \
+    stbir__2_coeff_remnant(ofs);                  \
+    stbir__2_coeff_remnant((ofs)+2);      
+#endif
+
+#ifndef stbir__store_output_tiny
+#define stbir__store_output_tiny stbir__store_output
+#endif
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_1_coeff)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * hc = horizontal_coefficients;
+    stbir__1_coeff_only();
+    stbir__store_output_tiny();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_2_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * hc = horizontal_coefficients;
+    stbir__2_coeff_only();
+    stbir__store_output_tiny();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_3_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * hc = horizontal_coefficients;
+    stbir__3_coeff_only();
+    stbir__store_output_tiny();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_4_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * hc = horizontal_coefficients;
+    stbir__4_coeff_start();
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_5_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * hc = horizontal_coefficients;
+    stbir__4_coeff_start();
+    stbir__1_coeff_remnant(4);
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_6_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * hc = horizontal_coefficients;
+    stbir__4_coeff_start();
+    stbir__2_coeff_remnant(4);
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_7_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  stbir__3_coeff_setup();
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * hc = horizontal_coefficients;
+  
+    stbir__4_coeff_start();
+    stbir__3_coeff_remnant(4);
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_8_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * hc = horizontal_coefficients;
+    stbir__4_coeff_start();
+    stbir__4_coeff_continue_from_4(4);
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_9_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * hc = horizontal_coefficients;
+    stbir__4_coeff_start();
+    stbir__4_coeff_continue_from_4(4);
+    stbir__1_coeff_remnant(8);
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_10_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * hc = horizontal_coefficients;
+    stbir__4_coeff_start();
+    stbir__4_coeff_continue_from_4(4);
+    stbir__2_coeff_remnant(8);
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_11_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  stbir__3_coeff_setup();
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * hc = horizontal_coefficients;
+    stbir__4_coeff_start();
+    stbir__4_coeff_continue_from_4(4);
+    stbir__3_coeff_remnant(8);
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_12_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * hc = horizontal_coefficients;
+    stbir__4_coeff_start();
+    stbir__4_coeff_continue_from_4(4);
+    stbir__4_coeff_continue_from_4(8);
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod0 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 4 + 3 ) >> 2; 
+    float const * hc = horizontal_coefficients;
+
+    stbir__4_coeff_start();
+    do {
+      hc += 4;
+      decode += STBIR__horizontal_channels * 4;
+      stbir__4_coeff_continue_from_4( 0 );
+      --n;
+    } while ( n > 0 );
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod1 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 5 + 3 ) >> 2; 
+    float const * hc = horizontal_coefficients;
+
+    stbir__4_coeff_start();
+    do {
+      hc += 4;
+      decode += STBIR__horizontal_channels * 4;
+      stbir__4_coeff_continue_from_4( 0 );
+      --n;
+    } while ( n > 0 );
+    stbir__1_coeff_remnant( 4 ); 
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod2 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 6 + 3 ) >> 2; 
+    float const * hc = horizontal_coefficients;
+
+    stbir__4_coeff_start();
+    do {
+      hc += 4;
+      decode += STBIR__horizontal_channels * 4;
+      stbir__4_coeff_continue_from_4( 0 );
+      --n;
+    } while ( n > 0 );
+    stbir__2_coeff_remnant( 4 ); 
+
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod3 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  stbir__3_coeff_setup();
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 7 + 3 ) >> 2; 
+    float const * hc = horizontal_coefficients;
+
+    stbir__4_coeff_start();
+    do {
+      hc += 4;
+      decode += STBIR__horizontal_channels * 4;
+      stbir__4_coeff_continue_from_4( 0 );
+      --n;
+    } while ( n > 0 );
+    stbir__3_coeff_remnant( 4 ); 
+
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static stbir__horizontal_gather_channels_func * STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_funcs)[4]=
+{
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod0),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod1),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod2),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod3),  
+};
+
+static stbir__horizontal_gather_channels_func * STBIR_chans(stbir__horizontal_gather_,_channels_funcs)[12]=
+{
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_1_coeff),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_2_coeffs),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_3_coeffs),
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_4_coeffs),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_5_coeffs),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_6_coeffs),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_7_coeffs),
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_8_coeffs),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_9_coeffs),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_10_coeffs),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_11_coeffs),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_12_coeffs),  
+};
+
+#undef STBIR__horizontal_channels
+#undef STB_IMAGE_RESIZE_DO_HORIZONTALS
+#undef stbir__1_coeff_only
+#undef stbir__1_coeff_remnant
+#undef stbir__2_coeff_only
+#undef stbir__2_coeff_remnant
+#undef stbir__3_coeff_only
+#undef stbir__3_coeff_remnant
+#undef stbir__3_coeff_setup
+#undef stbir__4_coeff_start
+#undef stbir__4_coeff_continue_from_4
+#undef stbir__store_output
+#undef stbir__store_output_tiny
+#undef STBIR_chans
+
+#endif  // HORIZONALS
+
+#undef STBIR_strs_join2
+#undef STBIR_strs_join1
+
+#endif // STB_IMAGE_RESIZE_DO_HORIZONTALS/VERTICALS/CODERS
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of 
+this software and associated documentation files (the "Software"), to deal in 
+the Software without restriction, including without limitation the rights to 
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 
+of the Software, and to permit persons to whom the Software is furnished to do 
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all 
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this 
+software, either in source code form or as a compiled binary, for any purpose, 
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this 
+software dedicate any and all copyright interest in the software to the public 
+domain. We make this dedication for the benefit of the public at large and to 
+the detriment of our heirs and successors. We intend this dedication to be an 
+overt act of relinquishment in perpetuity of all present and future rights to 
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/renderdoc/3rdparty/stb/stb_image_write.h b/renderdoc/3rdparty/stb/stb_image_write.h
index 4319c0de1d..e4b32ed1bc 100644
--- a/renderdoc/3rdparty/stb/stb_image_write.h
+++ b/renderdoc/3rdparty/stb/stb_image_write.h
@@ -1,5 +1,5 @@
-/* stb_image_write - v1.02 - public domain - http://nothings.org/stb/stb_image_write.h
-   writes out PNG/BMP/TGA images to C stdio - Sean Barrett 2010-2015
+/* stb_image_write - v1.16 - public domain - http://nothings.org/stb
+   writes out PNG/BMP/TGA/JPEG/HDR images to C stdio - Sean Barrett 2010-2015
                                      no warranty implied; use at your own risk
 
    Before #including,
@@ -12,41 +12,64 @@
 
 ABOUT:
 
-   This header file is a library for writing images to C stdio. It could be
-   adapted to write to memory or a general streaming interface; let me know.
+   This header file is a library for writing images to C stdio or a callback.
 
    The PNG output is not optimal; it is 20-50% larger than the file
-   written by a decent optimizing implementation. This library is designed
-   for source code compactness and simplicity, not optimal image file size
-   or run-time performance.
+   written by a decent optimizing implementation; though providing a custom
+   zlib compress function (see STBIW_ZLIB_COMPRESS) can mitigate that.
+   This library is designed for source code compactness and simplicity,
+   not optimal image file size or run-time performance.
 
 BUILDING:
 
    You can #define STBIW_ASSERT(x) before the #include to avoid using assert.h.
    You can #define STBIW_MALLOC(), STBIW_REALLOC(), and STBIW_FREE() to replace
    malloc,realloc,free.
-   You can define STBIW_MEMMOVE() to replace memmove()
+   You can #define STBIW_MEMMOVE() to replace memmove()
+   You can #define STBIW_ZLIB_COMPRESS to use a custom zlib-style compress function
+   for PNG compression (instead of the builtin one), it must have the following signature:
+   unsigned char * my_compress(unsigned char *data, int data_len, int *out_len, int quality);
+   The returned data will be freed with STBIW_FREE() (free() by default),
+   so it must be heap allocated with STBIW_MALLOC() (malloc() by default),
+
+UNICODE:
+
+   If compiling for Windows and you wish to use Unicode filenames, compile
+   with
+       #define STBIW_WINDOWS_UTF8
+   and pass utf8-encoded filenames. Call stbiw_convert_wchar_to_utf8 to convert
+   Windows wchar_t filenames to utf8.
 
 USAGE:
 
-   There are four functions, one for each image file format:
+   There are five functions, one for each image file format:
 
      int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes);
      int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
      int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
+     int stbi_write_jpg(char const *filename, int w, int h, int comp, const void *data, int quality);
      int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
 
-   There are also four equivalent functions that use an arbitrary write function. You are
+     void stbi_flip_vertically_on_write(int flag); // flag is non-zero to flip data vertically
+
+   There are also five equivalent functions that use an arbitrary write function. You are
    expected to open/close your file-equivalent before and after calling these:
 
      int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data, int stride_in_bytes);
      int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
      int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
      int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
+     int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality);
 
    where the callback is:
       void stbi_write_func(void *context, void *data, int size);
 
+   You can configure it with these global variables:
+      int stbi_write_tga_with_rle;             // defaults to true; set to 0 to disable RLE
+      int stbi_write_png_compression_level;    // defaults to 8; set to higher for more compression
+      int stbi_write_force_png_filter;         // defaults to -1; set to 0..5 to force a filter mode
+
+
    You can define STBI_WRITE_NO_STDIO to disable the file variant of these
    functions, so the library will not use stdio.h at all. However, this will
    also disable HDR writing, because it requires stdio for formatted output.
@@ -73,6 +96,9 @@
    writer, both because it is in BGR order and because it may have padding
    at the end of the line.)
 
+   PNG allows you to set the deflate compression level by setting the global
+   variable 'stbi_write_png_compression_level' (it defaults to 8).
+
    HDR expects linear float data. Since the format is always 32-bit rgb(e)
    data, alpha (if provided) is discarded, and for monochrome data it is
    replicated across all three channels.
@@ -80,20 +106,23 @@
    TGA supports RLE or non-RLE compressed data. To use non-RLE-compressed
    data, set the global variable 'stbi_write_tga_with_rle' to 0.
 
+   JPEG does ignore alpha channels in input data; quality is between 1 and 100.
+   Higher quality looks better but results in a bigger image.
+   JPEG baseline (no JPEG progressive).
+
 CREDITS:
 
-   PNG/BMP/TGA
-      Sean Barrett
-   HDR
-      Baldur Karlsson
-   TGA monochrome:
-      Jean-Sebastien Guay
-   misc enhancements:
-      Tim Kelsey
-   TGA RLE
-      Alan Hickman
-   initial file IO callback implementation
-      Emmanuel Julien
+
+   Sean Barrett           -    PNG/BMP/TGA
+   Baldur Karlsson        -    HDR
+   Jean-Sebastien Guay    -    TGA monochrome
+   Tim Kelsey             -    misc enhancements
+   Alan Hickman           -    TGA RLE
+   Emmanuel Julien        -    initial file IO callback implementation
+   Jon Olick              -    original jo_jpeg.cpp code
+   Daniel Gibson          -    integrate JPEG, allow external zlib
+   Aarni Koskela          -    allow choosing PNG filter
+
    bugfixes:
       github:Chribba
       Guillaume Chereau
@@ -103,27 +132,44 @@
       Jonas Karlsson
       Filip Wasil
       Thatcher Ulrich
-      
+      github:poppolopoppo
+      Patrick Boettcher
+      github:xeekworx
+      Cap Petschulat
+      Simon Rodriguez
+      Ivan Tikhonov
+      github:ignotion
+      Adam Schackart
+      Andrew Kensler
+
 LICENSE
 
-This software is dual-licensed to the public domain and under the following
-license: you are granted a perpetual, irrevocable license to copy, modify,
-publish, and distribute this file as you see fit.
+  See end of file for license information.
 
 */
 
 #ifndef INCLUDE_STB_IMAGE_WRITE_H
 #define INCLUDE_STB_IMAGE_WRITE_H
 
-#ifdef __cplusplus
-extern "C" {
-#endif
+#include <stdlib.h>
 
+// if STB_IMAGE_WRITE_STATIC causes problems, try defining STBIWDEF to 'inline' or 'static inline'
+#ifndef STBIWDEF
 #ifdef STB_IMAGE_WRITE_STATIC
-#define STBIWDEF static
+#define STBIWDEF  static
+#else
+#ifdef __cplusplus
+#define STBIWDEF  extern "C"
 #else
-#define STBIWDEF extern
-extern int stbi_write_tga_with_rle;
+#define STBIWDEF  extern
+#endif
+#endif
+#endif
+
+#ifndef STB_IMAGE_WRITE_STATIC  // C++ forbids static forward declarations
+STBIWDEF int stbi_write_tga_with_rle;
+STBIWDEF int stbi_write_png_compression_level;
+STBIWDEF int stbi_write_force_png_filter;
 #endif
 
 #ifndef STBI_WRITE_NO_STDIO
@@ -131,6 +177,11 @@ STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp, const
 STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp, const void  *data);
 STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const void  *data);
 STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
+STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void  *data, int quality);
+
+#ifdef STBIW_WINDOWS_UTF8
+STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
+#endif
 #endif
 
 typedef void stbi_write_func(void *context, void *data, int size);
@@ -139,10 +190,9 @@ STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int w,
 STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
 STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
 STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
+STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void  *data, int quality);
 
-#ifdef __cplusplus
-}
-#endif
+STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
 
 #endif//INCLUDE_STB_IMAGE_WRITE_H
 
@@ -197,10 +247,29 @@ STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w,
 
 #define STBIW_UCHAR(x) (unsigned char) ((x) & 0xff)
 
+#ifdef STB_IMAGE_WRITE_STATIC
+static int stbi_write_png_compression_level = 8;
+static int stbi_write_tga_with_rle = 1;
+static int stbi_write_force_png_filter = -1;
+#else
+int stbi_write_png_compression_level = 8;
+int stbi_write_tga_with_rle = 1;
+int stbi_write_force_png_filter = -1;
+#endif
+
+static int stbi__flip_vertically_on_write = 0;
+
+STBIWDEF void stbi_flip_vertically_on_write(int flag)
+{
+   stbi__flip_vertically_on_write = flag;
+}
+
 typedef struct
 {
    stbi_write_func *func;
    void *context;
+   unsigned char buffer[64];
+   int buf_used;
 } stbi__write_context;
 
 // initialize a callback-based context
@@ -217,9 +286,52 @@ static void stbi__stdio_write(void *context, void *data, int size)
    fwrite(data,1,size,(FILE*) context);
 }
 
+#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
+#ifdef __cplusplus
+#define STBIW_EXTERN extern "C"
+#else
+#define STBIW_EXTERN extern
+#endif
+STBIW_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide);
+STBIW_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default);
+
+STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input)
+{
+   return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
+}
+#endif
+
+static FILE *stbiw__fopen(char const *filename, char const *mode)
+{
+   FILE *f;
+#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
+   wchar_t wMode[64];
+   wchar_t wFilename[1024];
+   if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
+      return 0;
+
+   if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
+      return 0;
+
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+   if (0 != _wfopen_s(&f, wFilename, wMode))
+      f = 0;
+#else
+   f = _wfopen(wFilename, wMode);
+#endif
+
+#elif defined(_MSC_VER) && _MSC_VER >= 1400
+   if (0 != fopen_s(&f, filename, mode))
+      f=0;
+#else
+   f = fopen(filename, mode);
+#endif
+   return f;
+}
+
 static int stbi__start_write_file(stbi__write_context *s, const char *filename)
 {
-   FILE *f = fopen(filename, "wb");
+   FILE *f = stbiw__fopen(filename, "wb");
    stbi__start_write_callbacks(s, stbi__stdio_write, (void *) f);
    return f != NULL;
 }
@@ -234,12 +346,6 @@ static void stbi__end_write_file(stbi__write_context *s)
 typedef unsigned int stbiw_uint32;
 typedef int stb_image_write_test[sizeof(stbiw_uint32)==4 ? 1 : -1];
 
-#ifdef STB_IMAGE_WRITE_STATIC
-static int stbi_write_tga_with_rle = 1;
-#else
-int stbi_write_tga_with_rle = 1;
-#endif
-
 static void stbiw__writefv(stbi__write_context *s, const char *fmt, va_list v)
 {
    while (*fmt) {
@@ -277,11 +383,36 @@ static void stbiw__writef(stbi__write_context *s, const char *fmt, ...)
    va_end(v);
 }
 
+static void stbiw__write_flush(stbi__write_context *s)
+{
+   if (s->buf_used) {
+      s->func(s->context, &s->buffer, s->buf_used);
+      s->buf_used = 0;
+   }
+}
+
+static void stbiw__putc(stbi__write_context *s, unsigned char c)
+{
+   s->func(s->context, &c, 1);
+}
+
+static void stbiw__write1(stbi__write_context *s, unsigned char a)
+{
+   if ((size_t)s->buf_used + 1 > sizeof(s->buffer))
+      stbiw__write_flush(s);
+   s->buffer[s->buf_used++] = a;
+}
+
 static void stbiw__write3(stbi__write_context *s, unsigned char a, unsigned char b, unsigned char c)
 {
-   unsigned char arr[3];
-   arr[0] = a, arr[1] = b, arr[2] = c;
-   s->func(s->context, arr, 3);
+   int n;
+   if ((size_t)s->buf_used + 3 > sizeof(s->buffer))
+      stbiw__write_flush(s);
+   n = s->buf_used;
+   s->buf_used = n+3;
+   s->buffer[n+0] = a;
+   s->buffer[n+1] = b;
+   s->buffer[n+2] = c;
 }
 
 static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, int write_alpha, int expand_mono, unsigned char *d)
@@ -290,17 +421,15 @@ static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, in
    int k;
 
    if (write_alpha < 0)
-      s->func(s->context, &d[comp - 1], 1);
+      stbiw__write1(s, d[comp - 1]);
 
    switch (comp) {
+      case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as 1-channel case
       case 1:
-         s->func(s->context,d,1);
-         break;
-      case 2:
          if (expand_mono)
             stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp
          else
-            s->func(s->context, d, 1);  // monochrome TGA
+            stbiw__write1(s, d[0]);  // monochrome TGA
          break;
       case 4:
          if (!write_alpha) {
@@ -316,7 +445,7 @@ static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, in
          break;
    }
    if (write_alpha > 0)
-      s->func(s->context, &d[comp - 1], 1);
+      stbiw__write1(s, d[comp - 1]);
 }
 
 static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, void *data, int write_alpha, int scanline_pad, int expand_mono)
@@ -327,16 +456,21 @@ static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, i
    if (y <= 0)
       return;
 
-   if (vdir < 0)
-      j_end = -1, j = y-1;
-   else
-      j_end =  y, j = 0;
+   if (stbi__flip_vertically_on_write)
+      vdir *= -1;
+
+   if (vdir < 0) {
+      j_end = -1; j = y-1;
+   } else {
+      j_end =  y; j = 0;
+   }
 
    for (; j != j_end; j += vdir) {
       for (i=0; i < x; ++i) {
          unsigned char *d = (unsigned char *) data + (j*x+i)*comp;
          stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d);
       }
+      stbiw__write_flush(s);
       s->func(s->context, &zero, scanline_pad);
    }
 }
@@ -357,16 +491,27 @@ static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x,
 
 static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, const void *data)
 {
-   int pad = (-x*3) & 3;
-   return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad,
-           "11 4 22 4" "4 44 22 444444",
-           'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40,  // file header
-            40, x,y, 1,24, 0,0,0,0,0,0);             // bitmap header
+   if (comp != 4) {
+      // write RGB bitmap
+      int pad = (-x*3) & 3;
+      return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad,
+              "11 4 22 4" "4 44 22 444444",
+              'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40,  // file header
+               40, x,y, 1,24, 0,0,0,0,0,0);             // bitmap header
+   } else {
+      // RGBA bitmaps need a v4 header
+      // use BI_BITFIELDS mode with 32bpp and alpha mask
+      // (straight BI_RGB with alpha mask doesn't work in most readers)
+      return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *)data,1,0,
+         "11 4 22 4" "4 44 22 444444 4444 4 444 444 444 444",
+         'B', 'M', 14+108+x*y*4, 0, 0, 14+108, // file header
+         108, x,y, 1,32, 3,0,0,0,0,0, 0xff0000,0xff00,0xff,0xff000000u, 0, 0,0,0, 0,0,0, 0,0,0, 0,0,0); // bitmap V4 header
+   }
 }
 
 STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
 {
-   stbi__write_context s;
+   stbi__write_context s = { 0 };
    stbi__start_write_callbacks(&s, func, context);
    return stbi_write_bmp_core(&s, x, y, comp, data);
 }
@@ -374,7 +519,7 @@ STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x,
 #ifndef STBI_WRITE_NO_STDIO
 STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data)
 {
-   stbi__write_context s;
+   stbi__write_context s = { 0 };
    if (stbi__start_write_file(&s,filename)) {
       int r = stbi_write_bmp_core(&s, x, y, comp, data);
       stbi__end_write_file(&s);
@@ -398,11 +543,21 @@ static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, v
          "111 221 2222 11", 0, 0, format, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8);
    } else {
       int i,j,k;
+      int jend, jdir;
 
       stbiw__writef(s, "111 221 2222 11", 0,0,format+8, 0,0,0, 0,0,x,y, (colorbytes + has_alpha) * 8, has_alpha * 8);
 
-      for (j = y - 1; j >= 0; --j) {
-          unsigned char *row = (unsigned char *) data + j * x * comp;
+      if (stbi__flip_vertically_on_write) {
+         j = 0;
+         jend = y;
+         jdir = 1;
+      } else {
+         j = y-1;
+         jend = -1;
+         jdir = -1;
+      }
+      for (; j != jend; j += jdir) {
+         unsigned char *row = (unsigned char *) data + j * x * comp;
          int len;
 
          for (i = 0; i < x; i += len) {
@@ -437,32 +592,33 @@ static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, v
 
             if (diff) {
                unsigned char header = STBIW_UCHAR(len - 1);
-               s->func(s->context, &header, 1);
+               stbiw__write1(s, header);
                for (k = 0; k < len; ++k) {
                   stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp);
                }
             } else {
                unsigned char header = STBIW_UCHAR(len - 129);
-               s->func(s->context, &header, 1);
+               stbiw__write1(s, header);
                stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin);
             }
          }
       }
+      stbiw__write_flush(s);
    }
    return 1;
 }
 
-int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
+STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
 {
-   stbi__write_context s;
+   stbi__write_context s = { 0 };
    stbi__start_write_callbacks(&s, func, context);
    return stbi_write_tga_core(&s, x, y, comp, (void *) data);
 }
 
 #ifndef STBI_WRITE_NO_STDIO
-int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data)
+STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data)
 {
-   stbi__write_context s;
+   stbi__write_context s = { 0 };
    if (stbi__start_write_file(&s,filename)) {
       int r = stbi_write_tga_core(&s, x, y, comp, (void *) data);
       stbi__end_write_file(&s);
@@ -475,11 +631,12 @@ int stbi_write_tga(char const *filename, int x, int y, int comp, const void *dat
 // *************************************************************************************************
 // Radiance RGBE HDR writer
 // by Baldur Karlsson
-#ifndef STBI_WRITE_NO_STDIO
 
 #define stbiw__max(a, b)  ((a) > (b) ? (a) : (b))
 
-void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
+#ifndef STBI_WRITE_NO_STDIO
+
+static void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
 {
    int exponent;
    float maxcomp = stbiw__max(linear[0], stbiw__max(linear[1], linear[2]));
@@ -496,7 +653,7 @@ void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
    }
 }
 
-void stbiw__write_run_data(stbi__write_context *s, int length, unsigned char databyte)
+static void stbiw__write_run_data(stbi__write_context *s, int length, unsigned char databyte)
 {
    unsigned char lengthbyte = STBIW_UCHAR(length+128);
    STBIW_ASSERT(length+128 <= 255);
@@ -504,7 +661,7 @@ void stbiw__write_run_data(stbi__write_context *s, int length, unsigned char dat
    s->func(s->context, &databyte, 1);
 }
 
-void stbiw__write_dump_data(stbi__write_context *s, int length, unsigned char *data)
+static void stbiw__write_dump_data(stbi__write_context *s, int length, unsigned char *data)
 {
    unsigned char lengthbyte = STBIW_UCHAR(length);
    STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code
@@ -512,7 +669,7 @@ void stbiw__write_dump_data(stbi__write_context *s, int length, unsigned char *d
    s->func(s->context, data, length);
 }
 
-void stbiw__write_hdr_scanline(stbi__write_context *s, int width, int ncomp, unsigned char *scratch, float *scanline)
+static void stbiw__write_hdr_scanline(stbi__write_context *s, int width, int ncomp, unsigned char *scratch, float *scanline)
 {
    unsigned char scanlineheader[4] = { 2, 2, 0, 0 };
    unsigned char rgbe[4];
@@ -613,26 +770,30 @@ static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, f
       char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n";
       s->func(s->context, header, sizeof(header)-1);
 
+#ifdef __STDC_LIB_EXT1__
+      len = sprintf_s(buffer, sizeof(buffer), "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
+#else
       len = sprintf(buffer, "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
+#endif
       s->func(s->context, buffer, len);
 
       for(i=0; i < y; i++)
-         stbiw__write_hdr_scanline(s, x, comp, scratch, data + comp*i*x);
+         stbiw__write_hdr_scanline(s, x, comp, scratch, data + comp*x*(stbi__flip_vertically_on_write ? y-1-i : i));
       STBIW_FREE(scratch);
       return 1;
    }
 }
 
-int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const float *data)
+STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const float *data)
 {
-   stbi__write_context s;
+   stbi__write_context s = { 0 };
    stbi__start_write_callbacks(&s, func, context);
    return stbi_write_hdr_core(&s, x, y, comp, (float *) data);
 }
 
-int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data)
+STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data)
 {
-   stbi__write_context s;
+   stbi__write_context s = { 0 };
    if (stbi__start_write_file(&s,filename)) {
       int r = stbi_write_hdr_core(&s, x, y, comp, (float *) data);
       stbi__end_write_file(&s);
@@ -648,8 +809,9 @@ int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *da
 // PNG writer
 //
 
+#ifndef STBIW_ZLIB_COMPRESS
 // stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size()
-#define stbiw__sbraw(a) ((int *) (a) - 2)
+#define stbiw__sbraw(a) ((int *) (void *) (a) - 2)
 #define stbiw__sbm(a)   stbiw__sbraw(a)[0]
 #define stbiw__sbn(a)   stbiw__sbraw(a)[1]
 
@@ -728,8 +890,14 @@ static unsigned int stbiw__zhash(unsigned char *data)
 
 #define stbiw__ZHASH   16384
 
-unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_len, int quality)
+#endif // STBIW_ZLIB_COMPRESS
+
+STBIWDEF unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_len, int quality)
 {
+#ifdef STBIW_ZLIB_COMPRESS
+   // user provided a zlib compress implementation, use that
+   return STBIW_ZLIB_COMPRESS(data, data_len, out_len, quality);
+#else // use builtin
    static unsigned short lengthc[] = { 3,4,5,6,7,8,9,10,11,13,15,17,19,23,27,31,35,43,51,59,67,83,99,115,131,163,195,227,258, 259 };
    static unsigned char  lengtheb[]= { 0,0,0,0,0,0,0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,  4,  5,  5,  5,  5,  0 };
    static unsigned short distc[]   = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577, 32768 };
@@ -737,7 +905,9 @@ unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_l
    unsigned int bitbuf=0;
    int i,j, bitcount=0;
    unsigned char *out = NULL;
-   unsigned char ***hash_table = (unsigned char***) STBIW_MALLOC(stbiw__ZHASH * sizeof(char**));
+   unsigned char ***hash_table = (unsigned char***) STBIW_MALLOC(stbiw__ZHASH * sizeof(unsigned char**));
+   if (hash_table == NULL)
+      return NULL;
    if (quality < 5) quality = 5;
 
    stbiw__sbpush(out, 0x78);   // DEFLATE 32K window
@@ -758,7 +928,7 @@ unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_l
       for (j=0; j < n; ++j) {
          if (hlist[j]-data > i-32768) { // if entry lies within window
             int d = stbiw__zlib_countm(hlist[j], data+i, data_len-i);
-            if (d >= best) best=d,bestloc=hlist[j];
+            if (d >= best) { best=d; bestloc=hlist[j]; }
          }
       }
       // when hash table entry is too long, delete half the entries
@@ -811,14 +981,31 @@ unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_l
       (void) stbiw__sbfree(hash_table[i]);
    STBIW_FREE(hash_table);
 
+   // store uncompressed instead if compression was worse
+   if (stbiw__sbn(out) > data_len + 2 + ((data_len+32766)/32767)*5) {
+      stbiw__sbn(out) = 2;  // truncate to DEFLATE 32K window and FLEVEL = 1
+      for (j = 0; j < data_len;) {
+         int blocklen = data_len - j;
+         if (blocklen > 32767) blocklen = 32767;
+         stbiw__sbpush(out, data_len - j == blocklen); // BFINAL = ?, BTYPE = 0 -- no compression
+         stbiw__sbpush(out, STBIW_UCHAR(blocklen)); // LEN
+         stbiw__sbpush(out, STBIW_UCHAR(blocklen >> 8));
+         stbiw__sbpush(out, STBIW_UCHAR(~blocklen)); // NLEN
+         stbiw__sbpush(out, STBIW_UCHAR(~blocklen >> 8));
+         memcpy(out+stbiw__sbn(out), data+j, blocklen);
+         stbiw__sbn(out) += blocklen;
+         j += blocklen;
+      }
+   }
+
    {
       // compute adler32 on input
       unsigned int s1=1, s2=0;
       int blocklen = (int) (data_len % 5552);
       j=0;
       while (j < data_len) {
-         for (i=0; i < blocklen; ++i) s1 += data[j+i], s2 += s1;
-         s1 %= 65521, s2 %= 65521;
+         for (i=0; i < blocklen; ++i) { s1 += data[j+i]; s2 += s1; }
+         s1 %= 65521; s2 %= 65521;
          j += blocklen;
          blocklen = 5552;
       }
@@ -831,10 +1018,14 @@ unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_l
    // make returned pointer freeable
    STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len);
    return (unsigned char *) stbiw__sbraw(out);
+#endif // STBIW_ZLIB_COMPRESS
 }
 
 static unsigned int stbiw__crc32(unsigned char *buffer, int len)
 {
+#ifdef STBIW_CRC32
+    return STBIW_CRC32(buffer, len);
+#else
    static unsigned int crc_table[256] =
    {
       0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3,
@@ -876,6 +1067,7 @@ static unsigned int stbiw__crc32(unsigned char *buffer, int len)
    for (i=0; i < len; ++i)
       crc = (crc >> 8) ^ crc_table[buffer[i] ^ (crc & 0xff)];
    return ~crc;
+#endif
 }
 
 #define stbiw__wpng4(o,a,b,c,d) ((o)[0]=STBIW_UCHAR(a),(o)[1]=STBIW_UCHAR(b),(o)[2]=STBIW_UCHAR(c),(o)[3]=STBIW_UCHAR(d),(o)+=4)
@@ -896,61 +1088,92 @@ static unsigned char stbiw__paeth(int a, int b, int c)
    return STBIW_UCHAR(c);
 }
 
-unsigned char *stbi_write_png_to_mem(unsigned char *pixels, int stride_bytes, int x, int y, int n, int *out_len)
+// @OPTIMIZE: provide an option that always forces left-predict or paeth predict
+static void stbiw__encode_png_line(unsigned char *pixels, int stride_bytes, int width, int height, int y, int n, int filter_type, signed char *line_buffer)
+{
+   static int mapping[] = { 0,1,2,3,4 };
+   static int firstmap[] = { 0,1,0,5,6 };
+   int *mymap = (y != 0) ? mapping : firstmap;
+   int i;
+   int type = mymap[filter_type];
+   unsigned char *z = pixels + stride_bytes * (stbi__flip_vertically_on_write ? height-1-y : y);
+   int signed_stride = stbi__flip_vertically_on_write ? -stride_bytes : stride_bytes;
+
+   if (type==0) {
+      memcpy(line_buffer, z, width*n);
+      return;
+   }
+
+   // first loop isn't optimized since it's just one pixel
+   for (i = 0; i < n; ++i) {
+      switch (type) {
+         case 1: line_buffer[i] = z[i]; break;
+         case 2: line_buffer[i] = z[i] - z[i-signed_stride]; break;
+         case 3: line_buffer[i] = z[i] - (z[i-signed_stride]>>1); break;
+         case 4: line_buffer[i] = (signed char) (z[i] - stbiw__paeth(0,z[i-signed_stride],0)); break;
+         case 5: line_buffer[i] = z[i]; break;
+         case 6: line_buffer[i] = z[i]; break;
+      }
+   }
+   switch (type) {
+      case 1: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-n]; break;
+      case 2: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-signed_stride]; break;
+      case 3: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - ((z[i-n] + z[i-signed_stride])>>1); break;
+      case 4: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], z[i-signed_stride], z[i-signed_stride-n]); break;
+      case 5: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - (z[i-n]>>1); break;
+      case 6: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], 0,0); break;
+   }
+}
+
+STBIWDEF unsigned char *stbi_write_png_to_mem(const unsigned char *pixels, int stride_bytes, int x, int y, int n, int *out_len)
 {
+   int force_filter = stbi_write_force_png_filter;
    int ctype[5] = { -1, 0, 4, 2, 6 };
    unsigned char sig[8] = { 137,80,78,71,13,10,26,10 };
    unsigned char *out,*o, *filt, *zlib;
    signed char *line_buffer;
-   int i,j,k,p,zlen;
+   int j,zlen;
 
    if (stride_bytes == 0)
       stride_bytes = x * n;
 
+   if (force_filter >= 5) {
+      force_filter = -1;
+   }
+
    filt = (unsigned char *) STBIW_MALLOC((x*n+1) * y); if (!filt) return 0;
    line_buffer = (signed char *) STBIW_MALLOC(x * n); if (!line_buffer) { STBIW_FREE(filt); return 0; }
    for (j=0; j < y; ++j) {
-      static int mapping[] = { 0,1,2,3,4 };
-      static int firstmap[] = { 0,1,0,5,6 };
-      int *mymap = j ? mapping : firstmap;
-      int best = 0, bestval = 0x7fffffff;
-      for (p=0; p < 2; ++p) {
-         for (k= p?best:0; k < 5; ++k) {
-            int type = mymap[k],est=0;
-            unsigned char *z = pixels + stride_bytes*j;
-            for (i=0; i < n; ++i)
-               switch (type) {
-                  case 0: line_buffer[i] = z[i]; break;
-                  case 1: line_buffer[i] = z[i]; break;
-                  case 2: line_buffer[i] = z[i] - z[i-stride_bytes]; break;
-                  case 3: line_buffer[i] = z[i] - (z[i-stride_bytes]>>1); break;
-                  case 4: line_buffer[i] = (signed char) (z[i] - stbiw__paeth(0,z[i-stride_bytes],0)); break;
-                  case 5: line_buffer[i] = z[i]; break;
-                  case 6: line_buffer[i] = z[i]; break;
-               }
-            for (i=n; i < x*n; ++i) {
-               switch (type) {
-                  case 0: line_buffer[i] = z[i]; break;
-                  case 1: line_buffer[i] = z[i] - z[i-n]; break;
-                  case 2: line_buffer[i] = z[i] - z[i-stride_bytes]; break;
-                  case 3: line_buffer[i] = z[i] - ((z[i-n] + z[i-stride_bytes])>>1); break;
-                  case 4: line_buffer[i] = z[i] - stbiw__paeth(z[i-n], z[i-stride_bytes], z[i-stride_bytes-n]); break;
-                  case 5: line_buffer[i] = z[i] - (z[i-n]>>1); break;
-                  case 6: line_buffer[i] = z[i] - stbiw__paeth(z[i-n], 0,0); break;
-               }
-            }
-            if (p) break;
-            for (i=0; i < x*n; ++i)
+      int filter_type;
+      if (force_filter > -1) {
+         filter_type = force_filter;
+         stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, force_filter, line_buffer);
+      } else { // Estimate the best filter by running through all of them:
+         int best_filter = 0, best_filter_val = 0x7fffffff, est, i;
+         for (filter_type = 0; filter_type < 5; filter_type++) {
+            stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, filter_type, line_buffer);
+
+            // Estimate the entropy of the line using this filter; the less, the better.
+            est = 0;
+            for (i = 0; i < x*n; ++i) {
                est += abs((signed char) line_buffer[i]);
-            if (est < bestval) { bestval = est; best = k; }
+            }
+            if (est < best_filter_val) {
+               best_filter_val = est;
+               best_filter = filter_type;
+            }
+         }
+         if (filter_type != best_filter) {  // If the last iteration already got us the best filter, don't redo it
+            stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, best_filter, line_buffer);
+            filter_type = best_filter;
          }
       }
-      // when we get here, best contains the filter type, and line_buffer contains the data
-      filt[j*(x*n+1)] = (unsigned char) best;
+      // when we get here, filter_type contains the filter type, and line_buffer contains the data
+      filt[j*(x*n+1)] = (unsigned char) filter_type;
       STBIW_MEMMOVE(filt+j*(x*n+1)+1, line_buffer, x*n);
    }
    STBIW_FREE(line_buffer);
-   zlib = stbi_zlib_compress(filt, y*( x*n+1), &zlen, 8); // increase 8 to get smaller but use more memory
+   zlib = stbi_zlib_compress(filt, y*( x*n+1), &zlen, stbi_write_png_compression_level);
    STBIW_FREE(filt);
    if (!zlib) return 0;
 
@@ -993,9 +1216,10 @@ STBIWDEF int stbi_write_png(char const *filename, int x, int y, int comp, const
 {
    FILE *f;
    int len;
-   unsigned char *png = stbi_write_png_to_mem((unsigned char *) data, stride_bytes, x, y, comp, &len);
+   unsigned char *png = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &len);
    if (png == NULL) return 0;
-   f = fopen(filename, "wb");
+
+   f = stbiw__fopen(filename, "wb");
    if (!f) { STBIW_FREE(png); return 0; }
    fwrite(png, 1, len, f);
    fclose(f);
@@ -1007,16 +1231,426 @@ STBIWDEF int stbi_write_png(char const *filename, int x, int y, int comp, const
 STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int stride_bytes)
 {
    int len;
-   unsigned char *png = stbi_write_png_to_mem((unsigned char *) data, stride_bytes, x, y, comp, &len);
+   unsigned char *png = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &len);
    if (png == NULL) return 0;
    func(context, png, len);
    STBIW_FREE(png);
    return 1;
 }
 
+
+/* ***************************************************************************
+ *
+ * JPEG writer
+ *
+ * This is based on Jon Olick's jo_jpeg.cpp:
+ * public domain Simple, Minimalistic JPEG writer - http://www.jonolick.com/code.html
+ */
+
+static const unsigned char stbiw__jpg_ZigZag[] = { 0,1,5,6,14,15,27,28,2,4,7,13,16,26,29,42,3,8,12,17,25,30,41,43,9,11,18,
+      24,31,40,44,53,10,19,23,32,39,45,52,54,20,22,33,38,46,51,55,60,21,34,37,47,50,56,59,61,35,36,48,49,57,58,62,63 };
+
+static void stbiw__jpg_writeBits(stbi__write_context *s, int *bitBufP, int *bitCntP, const unsigned short *bs) {
+   int bitBuf = *bitBufP, bitCnt = *bitCntP;
+   bitCnt += bs[1];
+   bitBuf |= bs[0] << (24 - bitCnt);
+   while(bitCnt >= 8) {
+      unsigned char c = (bitBuf >> 16) & 255;
+      stbiw__putc(s, c);
+      if(c == 255) {
+         stbiw__putc(s, 0);
+      }
+      bitBuf <<= 8;
+      bitCnt -= 8;
+   }
+   *bitBufP = bitBuf;
+   *bitCntP = bitCnt;
+}
+
+static void stbiw__jpg_DCT(float *d0p, float *d1p, float *d2p, float *d3p, float *d4p, float *d5p, float *d6p, float *d7p) {
+   float d0 = *d0p, d1 = *d1p, d2 = *d2p, d3 = *d3p, d4 = *d4p, d5 = *d5p, d6 = *d6p, d7 = *d7p;
+   float z1, z2, z3, z4, z5, z11, z13;
+
+   float tmp0 = d0 + d7;
+   float tmp7 = d0 - d7;
+   float tmp1 = d1 + d6;
+   float tmp6 = d1 - d6;
+   float tmp2 = d2 + d5;
+   float tmp5 = d2 - d5;
+   float tmp3 = d3 + d4;
+   float tmp4 = d3 - d4;
+
+   // Even part
+   float tmp10 = tmp0 + tmp3;   // phase 2
+   float tmp13 = tmp0 - tmp3;
+   float tmp11 = tmp1 + tmp2;
+   float tmp12 = tmp1 - tmp2;
+
+   d0 = tmp10 + tmp11;       // phase 3
+   d4 = tmp10 - tmp11;
+
+   z1 = (tmp12 + tmp13) * 0.707106781f; // c4
+   d2 = tmp13 + z1;       // phase 5
+   d6 = tmp13 - z1;
+
+   // Odd part
+   tmp10 = tmp4 + tmp5;       // phase 2
+   tmp11 = tmp5 + tmp6;
+   tmp12 = tmp6 + tmp7;
+
+   // The rotator is modified from fig 4-8 to avoid extra negations.
+   z5 = (tmp10 - tmp12) * 0.382683433f; // c6
+   z2 = tmp10 * 0.541196100f + z5; // c2-c6
+   z4 = tmp12 * 1.306562965f + z5; // c2+c6
+   z3 = tmp11 * 0.707106781f; // c4
+
+   z11 = tmp7 + z3;      // phase 5
+   z13 = tmp7 - z3;
+
+   *d5p = z13 + z2;         // phase 6
+   *d3p = z13 - z2;
+   *d1p = z11 + z4;
+   *d7p = z11 - z4;
+
+   *d0p = d0;  *d2p = d2;  *d4p = d4;  *d6p = d6;
+}
+
+static void stbiw__jpg_calcBits(int val, unsigned short bits[2]) {
+   int tmp1 = val < 0 ? -val : val;
+   val = val < 0 ? val-1 : val;
+   bits[1] = 1;
+   while(tmp1 >>= 1) {
+      ++bits[1];
+   }
+   bits[0] = val & ((1<<bits[1])-1);
+}
+
+static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf, int *bitCnt, float *CDU, int du_stride, float *fdtbl, int DC, const unsigned short HTDC[256][2], const unsigned short HTAC[256][2]) {
+   const unsigned short EOB[2] = { HTAC[0x00][0], HTAC[0x00][1] };
+   const unsigned short M16zeroes[2] = { HTAC[0xF0][0], HTAC[0xF0][1] };
+   int dataOff, i, j, n, diff, end0pos, x, y;
+   int DU[64];
+
+   // DCT rows
+   for(dataOff=0, n=du_stride*8; dataOff<n; dataOff+=du_stride) {
+      stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+1], &CDU[dataOff+2], &CDU[dataOff+3], &CDU[dataOff+4], &CDU[dataOff+5], &CDU[dataOff+6], &CDU[dataOff+7]);
+   }
+   // DCT columns
+   for(dataOff=0; dataOff<8; ++dataOff) {
+      stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+du_stride], &CDU[dataOff+du_stride*2], &CDU[dataOff+du_stride*3], &CDU[dataOff+du_stride*4],
+                     &CDU[dataOff+du_stride*5], &CDU[dataOff+du_stride*6], &CDU[dataOff+du_stride*7]);
+   }
+   // Quantize/descale/zigzag the coefficients
+   for(y = 0, j=0; y < 8; ++y) {
+      for(x = 0; x < 8; ++x,++j) {
+         float v;
+         i = y*du_stride+x;
+         v = CDU[i]*fdtbl[j];
+         // DU[stbiw__jpg_ZigZag[j]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f));
+         // ceilf() and floorf() are C99, not C89, but I /think/ they're not needed here anyway?
+         DU[stbiw__jpg_ZigZag[j]] = (int)(v < 0 ? v - 0.5f : v + 0.5f);
+      }
+   }
+
+   // Encode DC
+   diff = DU[0] - DC;
+   if (diff == 0) {
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[0]);
+   } else {
+      unsigned short bits[2];
+      stbiw__jpg_calcBits(diff, bits);
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[bits[1]]);
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
+   }
+   // Encode ACs
+   end0pos = 63;
+   for(; (end0pos>0)&&(DU[end0pos]==0); --end0pos) {
+   }
+   // end0pos = first element in reverse order !=0
+   if(end0pos == 0) {
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
+      return DU[0];
+   }
+   for(i = 1; i <= end0pos; ++i) {
+      int startpos = i;
+      int nrzeroes;
+      unsigned short bits[2];
+      for (; DU[i]==0 && i<=end0pos; ++i) {
+      }
+      nrzeroes = i-startpos;
+      if ( nrzeroes >= 16 ) {
+         int lng = nrzeroes>>4;
+         int nrmarker;
+         for (nrmarker=1; nrmarker <= lng; ++nrmarker)
+            stbiw__jpg_writeBits(s, bitBuf, bitCnt, M16zeroes);
+         nrzeroes &= 15;
+      }
+      stbiw__jpg_calcBits(DU[i], bits);
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTAC[(nrzeroes<<4)+bits[1]]);
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
+   }
+   if(end0pos != 63) {
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
+   }
+   return DU[0];
+}
+
+static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, int comp, const void* data, int quality) {
+   // Constants that don't pollute global namespace
+   static const unsigned char std_dc_luminance_nrcodes[] = {0,0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0};
+   static const unsigned char std_dc_luminance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
+   static const unsigned char std_ac_luminance_nrcodes[] = {0,0,2,1,3,3,2,4,3,5,5,4,4,0,0,1,0x7d};
+   static const unsigned char std_ac_luminance_values[] = {
+      0x01,0x02,0x03,0x00,0x04,0x11,0x05,0x12,0x21,0x31,0x41,0x06,0x13,0x51,0x61,0x07,0x22,0x71,0x14,0x32,0x81,0x91,0xa1,0x08,
+      0x23,0x42,0xb1,0xc1,0x15,0x52,0xd1,0xf0,0x24,0x33,0x62,0x72,0x82,0x09,0x0a,0x16,0x17,0x18,0x19,0x1a,0x25,0x26,0x27,0x28,
+      0x29,0x2a,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,0x59,
+      0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x83,0x84,0x85,0x86,0x87,0x88,0x89,
+      0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,0xb5,0xb6,
+      0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xe1,0xe2,
+      0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
+   };
+   static const unsigned char std_dc_chrominance_nrcodes[] = {0,0,3,1,1,1,1,1,1,1,1,1,0,0,0,0,0};
+   static const unsigned char std_dc_chrominance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
+   static const unsigned char std_ac_chrominance_nrcodes[] = {0,0,2,1,2,4,4,3,4,7,5,4,4,0,1,2,0x77};
+   static const unsigned char std_ac_chrominance_values[] = {
+      0x00,0x01,0x02,0x03,0x11,0x04,0x05,0x21,0x31,0x06,0x12,0x41,0x51,0x07,0x61,0x71,0x13,0x22,0x32,0x81,0x08,0x14,0x42,0x91,
+      0xa1,0xb1,0xc1,0x09,0x23,0x33,0x52,0xf0,0x15,0x62,0x72,0xd1,0x0a,0x16,0x24,0x34,0xe1,0x25,0xf1,0x17,0x18,0x19,0x1a,0x26,
+      0x27,0x28,0x29,0x2a,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,
+      0x59,0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x82,0x83,0x84,0x85,0x86,0x87,
+      0x88,0x89,0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,
+      0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,
+      0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
+   };
+   // Huffman tables
+   static const unsigned short YDC_HT[256][2] = { {0,2},{2,3},{3,3},{4,3},{5,3},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9}};
+   static const unsigned short UVDC_HT[256][2] = { {0,2},{1,2},{2,2},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9},{1022,10},{2046,11}};
+   static const unsigned short YAC_HT[256][2] = {
+      {10,4},{0,2},{1,2},{4,3},{11,4},{26,5},{120,7},{248,8},{1014,10},{65410,16},{65411,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {12,4},{27,5},{121,7},{502,9},{2038,11},{65412,16},{65413,16},{65414,16},{65415,16},{65416,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {28,5},{249,8},{1015,10},{4084,12},{65417,16},{65418,16},{65419,16},{65420,16},{65421,16},{65422,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {58,6},{503,9},{4085,12},{65423,16},{65424,16},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {59,6},{1016,10},{65430,16},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {122,7},{2039,11},{65438,16},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {123,7},{4086,12},{65446,16},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {250,8},{4087,12},{65454,16},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {504,9},{32704,15},{65462,16},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {505,9},{65470,16},{65471,16},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {506,9},{65479,16},{65480,16},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {1017,10},{65488,16},{65489,16},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {1018,10},{65497,16},{65498,16},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {2040,11},{65506,16},{65507,16},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {65515,16},{65516,16},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {2041,11},{65525,16},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
+   };
+   static const unsigned short UVAC_HT[256][2] = {
+      {0,2},{1,2},{4,3},{10,4},{24,5},{25,5},{56,6},{120,7},{500,9},{1014,10},{4084,12},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {11,4},{57,6},{246,8},{501,9},{2038,11},{4085,12},{65416,16},{65417,16},{65418,16},{65419,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {26,5},{247,8},{1015,10},{4086,12},{32706,15},{65420,16},{65421,16},{65422,16},{65423,16},{65424,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {27,5},{248,8},{1016,10},{4087,12},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{65430,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {58,6},{502,9},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{65438,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {59,6},{1017,10},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{65446,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {121,7},{2039,11},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{65454,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {122,7},{2040,11},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{65462,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {249,8},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{65470,16},{65471,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {503,9},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{65479,16},{65480,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {504,9},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{65488,16},{65489,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {505,9},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{65497,16},{65498,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {506,9},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{65506,16},{65507,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {2041,11},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{65515,16},{65516,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {16352,14},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{65525,16},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {1018,10},{32707,15},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
+   };
+   static const int YQT[] = {16,11,10,16,24,40,51,61,12,12,14,19,26,58,60,55,14,13,16,24,40,57,69,56,14,17,22,29,51,87,80,62,18,22,
+                             37,56,68,109,103,77,24,35,55,64,81,104,113,92,49,64,78,87,103,121,120,101,72,92,95,98,112,100,103,99};
+   static const int UVQT[] = {17,18,24,47,99,99,99,99,18,21,26,66,99,99,99,99,24,26,56,99,99,99,99,99,47,66,99,99,99,99,99,99,
+                              99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99};
+   static const float aasf[] = { 1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f,
+                                 1.0f * 2.828427125f, 0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f };
+
+   int row, col, i, k, subsample;
+   float fdtbl_Y[64], fdtbl_UV[64];
+   unsigned char YTable[64], UVTable[64];
+
+   if(!data || !width || !height || comp > 4 || comp < 1) {
+      return 0;
+   }
+
+   quality = quality ? quality : 90;
+   subsample = quality <= 90 ? 1 : 0;
+   quality = quality < 1 ? 1 : quality > 100 ? 100 : quality;
+   quality = quality < 50 ? 5000 / quality : 200 - quality * 2;
+
+   for(i = 0; i < 64; ++i) {
+      int uvti, yti = (YQT[i]*quality+50)/100;
+      YTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (yti < 1 ? 1 : yti > 255 ? 255 : yti);
+      uvti = (UVQT[i]*quality+50)/100;
+      UVTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (uvti < 1 ? 1 : uvti > 255 ? 255 : uvti);
+   }
+
+   for(row = 0, k = 0; row < 8; ++row) {
+      for(col = 0; col < 8; ++col, ++k) {
+         fdtbl_Y[k]  = 1 / (YTable [stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
+         fdtbl_UV[k] = 1 / (UVTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
+      }
+   }
+
+   // Write Headers
+   {
+      static const unsigned char head0[] = { 0xFF,0xD8,0xFF,0xE0,0,0x10,'J','F','I','F',0,1,1,0,0,1,0,1,0,0,0xFF,0xDB,0,0x84,0 };
+      static const unsigned char head2[] = { 0xFF,0xDA,0,0xC,3,1,0,2,0x11,3,0x11,0,0x3F,0 };
+      const unsigned char head1[] = { 0xFF,0xC0,0,0x11,8,(unsigned char)(height>>8),STBIW_UCHAR(height),(unsigned char)(width>>8),STBIW_UCHAR(width),
+                                      3,1,(unsigned char)(subsample?0x22:0x11),0,2,0x11,1,3,0x11,1,0xFF,0xC4,0x01,0xA2,0 };
+      s->func(s->context, (void*)head0, sizeof(head0));
+      s->func(s->context, (void*)YTable, sizeof(YTable));
+      stbiw__putc(s, 1);
+      s->func(s->context, UVTable, sizeof(UVTable));
+      s->func(s->context, (void*)head1, sizeof(head1));
+      s->func(s->context, (void*)(std_dc_luminance_nrcodes+1), sizeof(std_dc_luminance_nrcodes)-1);
+      s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values));
+      stbiw__putc(s, 0x10); // HTYACinfo
+      s->func(s->context, (void*)(std_ac_luminance_nrcodes+1), sizeof(std_ac_luminance_nrcodes)-1);
+      s->func(s->context, (void*)std_ac_luminance_values, sizeof(std_ac_luminance_values));
+      stbiw__putc(s, 1); // HTUDCinfo
+      s->func(s->context, (void*)(std_dc_chrominance_nrcodes+1), sizeof(std_dc_chrominance_nrcodes)-1);
+      s->func(s->context, (void*)std_dc_chrominance_values, sizeof(std_dc_chrominance_values));
+      stbiw__putc(s, 0x11); // HTUACinfo
+      s->func(s->context, (void*)(std_ac_chrominance_nrcodes+1), sizeof(std_ac_chrominance_nrcodes)-1);
+      s->func(s->context, (void*)std_ac_chrominance_values, sizeof(std_ac_chrominance_values));
+      s->func(s->context, (void*)head2, sizeof(head2));
+   }
+
+   // Encode 8x8 macroblocks
+   {
+      static const unsigned short fillBits[] = {0x7F, 7};
+      int DCY=0, DCU=0, DCV=0;
+      int bitBuf=0, bitCnt=0;
+      // comp == 2 is grey+alpha (alpha is ignored)
+      int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0;
+      const unsigned char *dataR = (const unsigned char *)data;
+      const unsigned char *dataG = dataR + ofsG;
+      const unsigned char *dataB = dataR + ofsB;
+      int x, y, pos;
+      if(subsample) {
+         for(y = 0; y < height; y += 16) {
+            for(x = 0; x < width; x += 16) {
+               float Y[256], U[256], V[256];
+               for(row = y, pos = 0; row < y+16; ++row) {
+                  // row >= height => use last input row
+                  int clamped_row = (row < height) ? row : height - 1;
+                  int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp;
+                  for(col = x; col < x+16; ++col, ++pos) {
+                     // if col >= width => use pixel from last input column
+                     int p = base_p + ((col < width) ? col : (width-1))*comp;
+                     float r = dataR[p], g = dataG[p], b = dataB[p];
+                     Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128;
+                     U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b;
+                     V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b;
+                  }
+               }
+               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+0,   16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
+               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+8,   16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
+               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+128, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
+               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+136, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
+
+               // subsample U,V
+               {
+                  float subU[64], subV[64];
+                  int yy, xx;
+                  for(yy = 0, pos = 0; yy < 8; ++yy) {
+                     for(xx = 0; xx < 8; ++xx, ++pos) {
+                        int j = yy*32+xx*2;
+                        subU[pos] = (U[j+0] + U[j+1] + U[j+16] + U[j+17]) * 0.25f;
+                        subV[pos] = (V[j+0] + V[j+1] + V[j+16] + V[j+17]) * 0.25f;
+                     }
+                  }
+                  DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subU, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
+                  DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subV, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
+               }
+            }
+         }
+      } else {
+         for(y = 0; y < height; y += 8) {
+            for(x = 0; x < width; x += 8) {
+               float Y[64], U[64], V[64];
+               for(row = y, pos = 0; row < y+8; ++row) {
+                  // row >= height => use last input row
+                  int clamped_row = (row < height) ? row : height - 1;
+                  int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp;
+                  for(col = x; col < x+8; ++col, ++pos) {
+                     // if col >= width => use pixel from last input column
+                     int p = base_p + ((col < width) ? col : (width-1))*comp;
+                     float r = dataR[p], g = dataG[p], b = dataB[p];
+                     Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128;
+                     U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b;
+                     V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b;
+                  }
+               }
+
+               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y, 8, fdtbl_Y,  DCY, YDC_HT, YAC_HT);
+               DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, U, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
+               DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, V, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
+            }
+         }
+      }
+
+      // Do the bit alignment of the EOI marker
+      stbiw__jpg_writeBits(s, &bitBuf, &bitCnt, fillBits);
+   }
+
+   // EOI
+   stbiw__putc(s, 0xFF);
+   stbiw__putc(s, 0xD9);
+
+   return 1;
+}
+
+STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality)
+{
+   stbi__write_context s = { 0 };
+   stbi__start_write_callbacks(&s, func, context);
+   return stbi_write_jpg_core(&s, x, y, comp, (void *) data, quality);
+}
+
+
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality)
+{
+   stbi__write_context s = { 0 };
+   if (stbi__start_write_file(&s,filename)) {
+      int r = stbi_write_jpg_core(&s, x, y, comp, data, quality);
+      stbi__end_write_file(&s);
+      return r;
+   } else
+      return 0;
+}
+#endif
+
 #endif // STB_IMAGE_WRITE_IMPLEMENTATION
 
 /* Revision history
+      1.16  (2021-07-11)
+             make Deflate code emit uncompressed blocks when it would otherwise expand
+             support writing BMPs with alpha channel
+      1.15  (2020-07-13) unknown
+      1.14  (2020-02-02) updated JPEG writer to downsample chroma channels
+      1.13
+      1.12
+      1.11  (2019-08-11)
+
+      1.10  (2019-02-07)
+             support utf8 filenames in Windows; fix warnings and platform ifdefs
+      1.09  (2018-02-11)
+             fix typo in zlib quality API, improve STB_I_W_STATIC in C++
+      1.08  (2018-01-29)
+             add stbi__flip_vertically_on_write, external zlib, zlib quality, choose PNG filter
+      1.07  (2017-07-24)
+             doc fix
+      1.06 (2017-07-23)
+             writing JPEG (using Jon Olick's code)
+      1.05   ???
+      1.04 (2017-03-03)
+             monochrome BMP expansion
+      1.03   ???
       1.02 (2016-04-02)
              avoid allocating large structures on the stack
       1.01 (2016-01-16)
@@ -1035,7 +1669,7 @@ STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int x,
              add HDR output
              fix monochrome BMP
       0.95 (2014-08-17)
-		       add monochrome TGA output
+             add monochrome TGA output
       0.94 (2014-05-31)
              rename private functions to avoid conflicts with stb_image.h
       0.93 (2014-05-27)
@@ -1046,3 +1680,45 @@ STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int x,
              first public release
       0.90   first internal release
 */
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/renderdoc/3rdparty/stb/stb_impl.c b/renderdoc/3rdparty/stb/stb_impl.c
index 9c763667e6..f22957d06a 100644
--- a/renderdoc/3rdparty/stb/stb_impl.c
+++ b/renderdoc/3rdparty/stb/stb_impl.c
@@ -20,5 +20,5 @@
 
 #include "stb_image.h"
 #include "stb_image_write.h"
-#include "stb_image_resize.h"
+#include "stb_image_resize2.h"
 #include "stb_truetype.h"
diff --git a/renderdoc/3rdparty/stb/stb_truetype.h b/renderdoc/3rdparty/stb/stb_truetype.h
index d360d60920..bbf2284b16 100644
--- a/renderdoc/3rdparty/stb/stb_truetype.h
+++ b/renderdoc/3rdparty/stb/stb_truetype.h
@@ -1,11 +1,21 @@
-// stb_truetype.h - v1.11 - public domain
-// authored from 2009-2015 by Sean Barrett / RAD Game Tools
+// stb_truetype.h - v1.26 - public domain
+// authored from 2009-2021 by Sean Barrett / RAD Game Tools
+//
+// =======================================================================
+//
+//    NO SECURITY GUARANTEE -- DO NOT USE THIS ON UNTRUSTED FONT FILES
+//
+// This library does no range checking of the offsets found in the file,
+// meaning an attacker can use it to read arbitrary memory.
+//
+// =======================================================================
 //
 //   This library processes TrueType files:
 //        parse files
 //        extract glyph metrics
 //        extract glyph shapes
 //        render glyphs to one-channel bitmaps with antialiasing (box filter)
+//        render glyphs to one-channel SDF bitmaps (signed-distance field/function)
 //
 //   Todo:
 //        non-MS cmaps
@@ -20,37 +30,49 @@
 //
 //   Mikko Mononen: compound shape support, more cmap formats
 //   Tor Andersson: kerning, subpixel rendering
+//   Dougall Johnson: OpenType / Type 2 font handling
+//   Daniel Ribeiro Maciel: basic GPOS-based kerning
 //
 //   Misc other:
 //       Ryan Gordon
 //       Simon Glass
+//       github:IntellectualKitty
+//       Imanol Celaya
+//       Daniel Ribeiro Maciel
 //
 //   Bug/warning reports/fixes:
-//       "Zer" on mollyrocket (with fix)
-//       Cass Everitt
-//       stoiko (Haemimont Games)
-//       Brian Hook 
-//       Walter van Niftrik
-//       David Gow
-//       David Given
-//       Ivan-Assen Ivanov
-//       Anthony Pesch
-//       Johan Duparc
-//       Hou Qiming
-//       Fabian "ryg" Giesen
-//       Martins Mozeiko
-//       Cap Petschulat
-//       Omar Cornut
-//       github:aloucks
-//       Peter LaValle
-//       Sergey Popov
-//       Giumo X. Clanjor
-//       Higor Euripedes
-//       Thomas Fields
-//       Derek Vinyard
+//       "Zer" on mollyrocket       Fabian "ryg" Giesen   github:NiLuJe
+//       Cass Everitt               Martins Mozeiko       github:aloucks
+//       stoiko (Haemimont Games)   Cap Petschulat        github:oyvindjam
+//       Brian Hook                 Omar Cornut           github:vassvik
+//       Walter van Niftrik         Ryan Griege
+//       David Gow                  Peter LaValle
+//       David Given                Sergey Popov
+//       Ivan-Assen Ivanov          Giumo X. Clanjor
+//       Anthony Pesch              Higor Euripedes
+//       Johan Duparc               Thomas Fields
+//       Hou Qiming                 Derek Vinyard
+//       Rob Loach                  Cort Stratton
+//       Kenney Phillis Jr.         Brian Costabile
+//       Ken Voskuil (kaesve)
 //
 // VERSION HISTORY
 //
+//   1.26 (2021-08-28) fix broken rasterizer
+//   1.25 (2021-07-11) many fixes
+//   1.24 (2020-02-05) fix warning
+//   1.23 (2020-02-02) query SVG data for glyphs; query whole kerning table (but only kern not GPOS)
+//   1.22 (2019-08-11) minimize missing-glyph duplication; fix kerning if both 'GPOS' and 'kern' are defined
+//   1.21 (2019-02-25) fix warning
+//   1.20 (2019-02-07) PackFontRange skips missing codepoints; GetScaleFontVMetrics()
+//   1.19 (2018-02-11) GPOS kerning, STBTT_fmod
+//   1.18 (2018-01-29) add missing function
+//   1.17 (2017-07-23) make more arguments const; doc fix
+//   1.16 (2017-07-12) SDF support
+//   1.15 (2017-03-03) make more arguments const
+//   1.14 (2017-01-16) num-fonts-in-TTC function
+//   1.13 (2017-01-02) support OpenType fonts, certain Apple fonts
+//   1.12 (2016-10-25) suppress warnings about casting away const with -Wcast-qual
 //   1.11 (2016-04-02) fix unused-variable warning
 //   1.10 (2016-04-02) user-defined fabs(); rare memory leak; remove duplicate typedef
 //   1.09 (2016-01-16) warning fix; avoid crash on outofmem; use allocation userdata properly
@@ -60,24 +82,16 @@
 //                     fix stbtt_GetFontOFfsetForIndex (never worked for non-0 input?);
 //                     fixed an assert() bug in the new rasterizer
 //                     replace assert() with STBTT_assert() in new rasterizer
-//   1.06 (2015-07-14) performance improvements (~35% faster on x86 and x64 on test machine)
-//                     also more precise AA rasterizer, except if shapes overlap
-//                     remove need for STBTT_sort
-//   1.05 (2015-04-15) fix misplaced definitions for STBTT_STATIC
-//   1.04 (2015-04-15) typo in example
-//   1.03 (2015-04-12) STBTT_STATIC, fix memory leak in new packing, various fixes
 //
 //   Full history can be found at the end of this file.
 //
 // LICENSE
 //
-//   This software is dual-licensed to the public domain and under the following
-//   license: you are granted a perpetual, irrevocable license to copy, modify,
-//   publish, and distribute this file as you see fit.
+//   See end of file for license information.
 //
 // USAGE
 //
-//   Include this file in whatever places neeed to refer to it. In ONE C/C++
+//   Include this file in whatever places need to refer to it. In ONE C/C++
 //   file, write:
 //      #define STB_TRUETYPE_IMPLEMENTATION
 //   before the #include of this file. This expands out the actual
@@ -93,14 +107,15 @@
 //   Improved 3D API (more shippable):
 //           #include "stb_rect_pack.h"           -- optional, but you really want it
 //           stbtt_PackBegin()
-//           stbtt_PackSetOversample()            -- for improved quality on small fonts
+//           stbtt_PackSetOversampling()          -- for improved quality on small fonts
 //           stbtt_PackFontRanges()               -- pack and renders
 //           stbtt_PackEnd()
 //           stbtt_GetPackedQuad()
 //
 //   "Load" a font file from a memory buffer (you have to keep the buffer loaded)
 //           stbtt_InitFont()
-//           stbtt_GetFontOffsetForIndex()        -- use for TTC font collections
+//           stbtt_GetFontOffsetForIndex()        -- indexing for TTC font collections
+//           stbtt_GetNumberOfFonts()             -- number of fonts for TTC font collections
 //
 //   Render a unicode codepoint to a bitmap
 //           stbtt_GetCodepointBitmap()           -- allocates and returns a bitmap
@@ -110,6 +125,7 @@
 //   Character advance/positioning
 //           stbtt_GetCodepointHMetrics()
 //           stbtt_GetFontVMetrics()
+//           stbtt_GetFontVMetricsOS2()
 //           stbtt_GetCodepointKernAdvance()
 //
 //   Starting with version 1.06, the rasterizer was replaced with a new,
@@ -165,7 +181,7 @@
 //         measurement for describing font size, defined as 72 points per inch.
 //         stb_truetype provides a point API for compatibility. However, true
 //         "per inch" conventions don't make much sense on computer displays
-//         since they different monitors have different number of pixels per
+//         since different monitors have different number of pixels per
 //         inch. For example, Windows traditionally uses a convention that
 //         there are 96 pixels per inch, thus making 'inch' measurements have
 //         nothing to do with inches, and thus effectively defining a point to
@@ -175,6 +191,39 @@
 //         for non-commercial fonts, thus making fonts scaled in points
 //         according to the TrueType spec incoherently sized in practice.
 //
+// DETAILED USAGE:
+//
+//  Scale:
+//    Select how high you want the font to be, in points or pixels.
+//    Call ScaleForPixelHeight or ScaleForMappingEmToPixels to compute
+//    a scale factor SF that will be used by all other functions.
+//
+//  Baseline:
+//    You need to select a y-coordinate that is the baseline of where
+//    your text will appear. Call GetFontBoundingBox to get the baseline-relative
+//    bounding box for all characters. SF*-y0 will be the distance in pixels
+//    that the worst-case character could extend above the baseline, so if
+//    you want the top edge of characters to appear at the top of the
+//    screen where y=0, then you would set the baseline to SF*-y0.
+//
+//  Current point:
+//    Set the current point where the first character will appear. The
+//    first character could extend left of the current point; this is font
+//    dependent. You can either choose a current point that is the leftmost
+//    point and hope, or add some padding, or check the bounding box or
+//    left-side-bearing of the first character to be displayed and set
+//    the current point based on that.
+//
+//  Displaying a character:
+//    Compute the bounding box of the character. It will contain signed values
+//    relative to <current_point, baseline>. I.e. if it returns x0,y0,x1,y1,
+//    then the character should be displayed in the rectangle from
+//    <current_point+SF*x0, baseline+SF*y0> to <current_point+SF*x1,baseline+SF*y1).
+//
+//  Advancing for the next character:
+//    Call GlyphHMetrics, and compute 'current_point += SF * advance'.
+//
+//
 // ADVANCED USAGE
 //
 //   Quality:
@@ -209,19 +258,6 @@
 //   recommend it.
 //
 //
-// SOURCE STATISTICS (based on v0.6c, 2050 LOC)
-//
-//   Documentation & header file        520 LOC  \___ 660 LOC documentation
-//   Sample code                        140 LOC  /
-//   Truetype parsing                   620 LOC  ---- 620 LOC TrueType
-//   Software rasterization             240 LOC  \                           .
-//   Curve tesselation                  120 LOC   \__ 550 LOC Bitmap creation
-//   Bitmap management                  100 LOC   /
-//   Baked bitmap interface              70 LOC  /
-//   Font name matching & access        150 LOC  ---- 150 
-//   C runtime library abstraction       60 LOC  ----  60
-//
-//
 // PERFORMANCE MEASUREMENTS FOR 1.06:
 //
 //                      32-bit     64-bit
@@ -236,8 +272,8 @@
 ////  SAMPLE PROGRAMS
 ////
 //
-//  Incomplete text-in-3d-api example, which draws quads properly aligned to be lossless
-//
+//  Incomplete text-in-3d-api example, which draws quads properly aligned to be lossless.
+//  See "tests/truetype_demo_win32.c" for a complete version.
 #if 0
 #define STB_TRUETYPE_IMPLEMENTATION  // force following include to generate implementation
 #include "stb_truetype.h"
@@ -263,6 +299,8 @@ void my_stbtt_initfont(void)
 void my_stbtt_print(float x, float y, char *text)
 {
    // assume orthographic projection with units = screen pixels, origin at top left
+   glEnable(GL_BLEND);
+   glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
    glEnable(GL_TEXTURE_2D);
    glBindTexture(GL_TEXTURE_2D, ftex);
    glBegin(GL_QUADS);
@@ -270,10 +308,10 @@ void my_stbtt_print(float x, float y, char *text)
       if (*text >= 32 && *text < 128) {
          stbtt_aligned_quad q;
          stbtt_GetBakedQuad(cdata, 512,512, *text-32, &x,&y,&q,1);//1=opengl & d3d10+,0=d3d9
-         glTexCoord2f(q.s0,q.t1); glVertex2f(q.x0,q.y0);
-         glTexCoord2f(q.s1,q.t1); glVertex2f(q.x1,q.y0);
-         glTexCoord2f(q.s1,q.t0); glVertex2f(q.x1,q.y1);
-         glTexCoord2f(q.s0,q.t0); glVertex2f(q.x0,q.y1);
+         glTexCoord2f(q.s0,q.t0); glVertex2f(q.x0,q.y0);
+         glTexCoord2f(q.s1,q.t0); glVertex2f(q.x1,q.y0);
+         glTexCoord2f(q.s1,q.t1); glVertex2f(q.x1,q.y1);
+         glTexCoord2f(q.s0,q.t1); glVertex2f(q.x0,q.y1);
       }
       ++text;
    }
@@ -311,7 +349,7 @@ int main(int argc, char **argv)
    }
    return 0;
 }
-#endif 
+#endif
 //
 // Output:
 //
@@ -325,9 +363,9 @@ int main(int argc, char **argv)
 //  :@@.  M@M
 //   @@@o@@@@
 //   :M@@V:@@.
-//  
+//
 //////////////////////////////////////////////////////////////////////////////
-// 
+//
 // Complete program: print "Hello World!" banner, with bugs
 //
 #if 0
@@ -381,7 +419,8 @@ int main(int arg, char **argv)
 ////   INTEGRATION WITH YOUR CODEBASE
 ////
 ////   The following sections allow you to supply alternate definitions
-////   of C library functions used by stb_truetype.
+////   of C library functions used by stb_truetype, e.g. if you don't
+////   link with the C runtime library.
 
 #ifdef STB_TRUETYPE_IMPLEMENTATION
    // #define your own (u)stbtt_int8/16/32 before including to override this
@@ -397,7 +436,7 @@ int main(int arg, char **argv)
    typedef char stbtt__check_size32[sizeof(stbtt_int32)==4 ? 1 : -1];
    typedef char stbtt__check_size16[sizeof(stbtt_int16)==2 ? 1 : -1];
 
-   // #define your own STBTT_ifloor/STBTT_iceil() to avoid math.h
+   // e.g. #define your own STBTT_ifloor/STBTT_iceil() to avoid math.h
    #ifndef STBTT_ifloor
    #include <math.h>
    #define STBTT_ifloor(x)   ((int) floor(x))
@@ -407,6 +446,18 @@ int main(int arg, char **argv)
    #ifndef STBTT_sqrt
    #include <math.h>
    #define STBTT_sqrt(x)      sqrt(x)
+   #define STBTT_pow(x,y)     pow(x,y)
+   #endif
+
+   #ifndef STBTT_fmod
+   #include <math.h>
+   #define STBTT_fmod(x,y)    fmod(x,y)
+   #endif
+
+   #ifndef STBTT_cos
+   #include <math.h>
+   #define STBTT_cos(x)       cos(x)
+   #define STBTT_acos(x)      acos(x)
    #endif
 
    #ifndef STBTT_fabs
@@ -432,7 +483,7 @@ int main(int arg, char **argv)
    #endif
 
    #ifndef STBTT_memcpy
-   #include <memory.h>
+   #include <string.h>
    #define STBTT_memcpy       memcpy
    #define STBTT_memset       memset
    #endif
@@ -458,6 +509,14 @@ int main(int arg, char **argv)
 extern "C" {
 #endif
 
+// private structure
+typedef struct
+{
+   unsigned char *data;
+   int cursor;
+   int size;
+} stbtt__buf;
+
 //////////////////////////////////////////////////////////////////////////////
 //
 // TEXTURE BAKING API
@@ -487,7 +546,7 @@ typedef struct
    float x1,y1,s1,t1; // bottom-right
 } stbtt_aligned_quad;
 
-STBTT_DEF void stbtt_GetBakedQuad(stbtt_bakedchar *chardata, int pw, int ph,  // same data as above
+STBTT_DEF void stbtt_GetBakedQuad(const stbtt_bakedchar *chardata, int pw, int ph,  // same data as above
                                int char_index,             // character to display
                                float *xpos, float *ypos,   // pointers to current position in screen pixel space
                                stbtt_aligned_quad *q,      // output: quad to draw
@@ -502,6 +561,8 @@ STBTT_DEF void stbtt_GetBakedQuad(stbtt_bakedchar *chardata, int pw, int ph,  //
 //
 // It's inefficient; you might want to c&p it and optimize it.
 
+STBTT_DEF void stbtt_GetScaledFontVMetrics(const unsigned char *fontdata, int index, float size, float *ascent, float *descent, float *lineGap);
+// Query the font vertical metrics without having to create a font first.
 
 
 //////////////////////////////////////////////////////////////////////////////
@@ -527,7 +588,7 @@ typedef struct stbrp_rect stbrp_rect;
 STBTT_DEF int  stbtt_PackBegin(stbtt_pack_context *spc, unsigned char *pixels, int width, int height, int stride_in_bytes, int padding, void *alloc_context);
 // Initializes a packing context stored in the passed-in stbtt_pack_context.
 // Future calls using this context will pack characters into the bitmap passed
-// in here: a 1-channel bitmap that is weight x height. stride_in_bytes is
+// in here: a 1-channel bitmap that is width * height. stride_in_bytes is
 // the distance from one row to the next (or 0 to mean they are packed tightly
 // together). "padding" is the amount of padding to leave between each
 // character (normally you want '1' for bitmaps you'll use as textures with
@@ -540,7 +601,7 @@ STBTT_DEF void stbtt_PackEnd  (stbtt_pack_context *spc);
 
 #define STBTT_POINT_SIZE(x)   (-(x))
 
-STBTT_DEF int  stbtt_PackFontRange(stbtt_pack_context *spc, unsigned char *fontdata, int font_index, float font_size,
+STBTT_DEF int  stbtt_PackFontRange(stbtt_pack_context *spc, const unsigned char *fontdata, int font_index, float font_size,
                                 int first_unicode_char_in_range, int num_chars_in_range, stbtt_packedchar *chardata_for_range);
 // Creates character bitmaps from the font_index'th font found in fontdata (use
 // font_index=0 if you don't know what that is). It creates num_chars_in_range
@@ -565,7 +626,7 @@ typedef struct
    unsigned char h_oversample, v_oversample; // don't set these, they're used internally
 } stbtt_pack_range;
 
-STBTT_DEF int  stbtt_PackFontRanges(stbtt_pack_context *spc, unsigned char *fontdata, int font_index, stbtt_pack_range *ranges, int num_ranges);
+STBTT_DEF int  stbtt_PackFontRanges(stbtt_pack_context *spc, const unsigned char *fontdata, int font_index, stbtt_pack_range *ranges, int num_ranges);
 // Creates character bitmaps from multiple ranges of characters stored in
 // ranges. This will usually create a better-packed bitmap than multiple
 // calls to stbtt_PackFontRange. Note that you can call this multiple
@@ -587,19 +648,25 @@ STBTT_DEF void stbtt_PackSetOversampling(stbtt_pack_context *spc, unsigned int h
 // To use with PackFontRangesGather etc., you must set it before calls
 // call to PackFontRangesGatherRects.
 
-STBTT_DEF void stbtt_GetPackedQuad(stbtt_packedchar *chardata, int pw, int ph,  // same data as above
+STBTT_DEF void stbtt_PackSetSkipMissingCodepoints(stbtt_pack_context *spc, int skip);
+// If skip != 0, this tells stb_truetype to skip any codepoints for which
+// there is no corresponding glyph. If skip=0, which is the default, then
+// codepoints without a glyph recived the font's "missing character" glyph,
+// typically an empty box by convention.
+
+STBTT_DEF void stbtt_GetPackedQuad(const stbtt_packedchar *chardata, int pw, int ph,  // same data as above
                                int char_index,             // character to display
                                float *xpos, float *ypos,   // pointers to current position in screen pixel space
                                stbtt_aligned_quad *q,      // output: quad to draw
                                int align_to_integer);
 
-STBTT_DEF int  stbtt_PackFontRangesGatherRects(stbtt_pack_context *spc, stbtt_fontinfo *info, stbtt_pack_range *ranges, int num_ranges, stbrp_rect *rects);
+STBTT_DEF int  stbtt_PackFontRangesGatherRects(stbtt_pack_context *spc, const stbtt_fontinfo *info, stbtt_pack_range *ranges, int num_ranges, stbrp_rect *rects);
 STBTT_DEF void stbtt_PackFontRangesPackRects(stbtt_pack_context *spc, stbrp_rect *rects, int num_rects);
-STBTT_DEF int  stbtt_PackFontRangesRenderIntoRects(stbtt_pack_context *spc, stbtt_fontinfo *info, stbtt_pack_range *ranges, int num_ranges, stbrp_rect *rects);
+STBTT_DEF int  stbtt_PackFontRangesRenderIntoRects(stbtt_pack_context *spc, const stbtt_fontinfo *info, stbtt_pack_range *ranges, int num_ranges, stbrp_rect *rects);
 // Calling these functions in sequence is roughly equivalent to calling
 // stbtt_PackFontRanges(). If you more control over the packing of multiple
 // fonts, or if you want to pack custom data into a font texture, take a look
-// at the source to of stbtt_PackFontRanges() and create a custom version 
+// at the source to of stbtt_PackFontRanges() and create a custom version
 // using these functions, e.g. call GatherRects multiple times,
 // building up a single array of rects, then call PackRects once,
 // then call RenderIntoRects repeatedly. This may result in a
@@ -615,6 +682,7 @@ struct stbtt_pack_context {
    int   height;
    int   stride_in_bytes;
    int   padding;
+   int   skip_missing;
    unsigned int   h_oversample, v_oversample;
    unsigned char *pixels;
    void  *nodes;
@@ -626,16 +694,21 @@ struct stbtt_pack_context {
 //
 //
 
+STBTT_DEF int stbtt_GetNumberOfFonts(const unsigned char *data);
+// This function will determine the number of fonts in a font file.  TrueType
+// collection (.ttc) files may contain multiple fonts, while TrueType font
+// (.ttf) files only contain one font. The number of fonts can be used for
+// indexing with the previous function where the index is between zero and one
+// less than the total fonts. If an error occurs, -1 is returned.
+
 STBTT_DEF int stbtt_GetFontOffsetForIndex(const unsigned char *data, int index);
 // Each .ttf/.ttc file may have more than one font. Each font has a sequential
 // index number starting from 0. Call this function to get the font offset for
 // a given index; it returns -1 if the index is out of range. A regular .ttf
 // file will only define one font and it always be at offset 0, so it will
-// return '0' for index 0, and -1 for all other indices. You can just skip
-// this step if you know it's that kind of font.
+// return '0' for index 0, and -1 for all other indices.
 
-
-// The following structure is defined publically so you can declare one on
+// The following structure is defined publicly so you can declare one on
 // the stack or as a global or etc, but you should treat it as opaque.
 struct stbtt_fontinfo
 {
@@ -645,9 +718,16 @@ struct stbtt_fontinfo
 
    int numGlyphs;                     // number of glyphs, needed for range checking
 
-   int loca,head,glyf,hhea,hmtx,kern; // table locations as offset from start of .ttf
+   int loca,head,glyf,hhea,hmtx,kern,gpos,svg; // table locations as offset from start of .ttf
    int index_map;                     // a cmap mapping for our chosen character encoding
    int indexToLocFormat;              // format needed to map from glyph index to glyph
+
+   stbtt__buf cff;                    // cff font data
+   stbtt__buf charstrings;            // the charstring index
+   stbtt__buf gsubrs;                 // global charstring subroutines index
+   stbtt__buf subrs;                  // private charstring subroutines index
+   stbtt__buf fontdicts;              // array of font dicts
+   stbtt__buf fdselect;               // map from glyph to fontdict
 };
 
 STBTT_DEF int stbtt_InitFont(stbtt_fontinfo *info, const unsigned char *data, int offset);
@@ -667,6 +747,7 @@ STBTT_DEF int stbtt_FindGlyphIndex(const stbtt_fontinfo *info, int unicode_codep
 // and you want a speed-up, call this function with the character you're
 // going to process, then use glyph-based functions instead of the
 // codepoint-based functions.
+// Returns 0 if the character codepoint is not defined in the font.
 
 
 //////////////////////////////////////////////////////////////////////////////
@@ -695,6 +776,12 @@ STBTT_DEF void stbtt_GetFontVMetrics(const stbtt_fontinfo *info, int *ascent, in
 //   these are expressed in unscaled coordinates, so you must multiply by
 //   the scale factor for a given size
 
+STBTT_DEF int  stbtt_GetFontVMetricsOS2(const stbtt_fontinfo *info, int *typoAscent, int *typoDescent, int *typoLineGap);
+// analogous to GetFontVMetrics, but returns the "typographic" values from the OS/2
+// table (specific to MS/Windows TTF files).
+//
+// Returns 1 on success (table present), 0 on failure.
+
 STBTT_DEF void stbtt_GetFontBoundingBox(const stbtt_fontinfo *info, int *x0, int *y0, int *x1, int *y1);
 // the bounding box around all possible characters
 
@@ -714,6 +801,18 @@ STBTT_DEF int  stbtt_GetGlyphKernAdvance(const stbtt_fontinfo *info, int glyph1,
 STBTT_DEF int  stbtt_GetGlyphBox(const stbtt_fontinfo *info, int glyph_index, int *x0, int *y0, int *x1, int *y1);
 // as above, but takes one or more glyph indices for greater efficiency
 
+typedef struct stbtt_kerningentry
+{
+   int glyph1; // use stbtt_FindGlyphIndex
+   int glyph2;
+   int advance;
+} stbtt_kerningentry;
+
+STBTT_DEF int  stbtt_GetKerningTableLength(const stbtt_fontinfo *info);
+STBTT_DEF int  stbtt_GetKerningTable(const stbtt_fontinfo *info, stbtt_kerningentry* table, int table_length);
+// Retrieves a complete list of all of the kerning pairs provided by the font
+// stbtt_GetKerningTable never writes more than table_length entries and returns how many entries it did write.
+// The table will be sorted by (a.glyph1 == b.glyph1)?(a.glyph2 < b.glyph2):(a.glyph1 < b.glyph1)
 
 //////////////////////////////////////////////////////////////////////////////
 //
@@ -725,7 +824,8 @@ STBTT_DEF int  stbtt_GetGlyphBox(const stbtt_fontinfo *info, int glyph_index, in
    enum {
       STBTT_vmove=1,
       STBTT_vline,
-      STBTT_vcurve
+      STBTT_vcurve,
+      STBTT_vcubic
    };
 #endif
 
@@ -734,7 +834,7 @@ STBTT_DEF int  stbtt_GetGlyphBox(const stbtt_fontinfo *info, int glyph_index, in
    #define stbtt_vertex_type short // can't use stbtt_int16 because that's not visible in the header file
    typedef struct
    {
-      stbtt_vertex_type x,y,cx,cy;
+      stbtt_vertex_type x,y,cx,cy,cx1,cy1;
       unsigned char type,padding;
    } stbtt_vertex;
 #endif
@@ -747,7 +847,7 @@ STBTT_DEF int stbtt_GetGlyphShape(const stbtt_fontinfo *info, int glyph_index, s
 // returns # of vertices and fills *vertices with the pointer to them
 //   these are expressed in "unscaled" coordinates
 //
-// The shape is a series of countours. Each one starts with
+// The shape is a series of contours. Each one starts with
 // a STBTT_moveto, then consists of a series of mixed
 // STBTT_lineto and STBTT_curveto segments. A lineto
 // draws a line from previous endpoint to its x,y; a curveto
@@ -757,6 +857,12 @@ STBTT_DEF int stbtt_GetGlyphShape(const stbtt_fontinfo *info, int glyph_index, s
 STBTT_DEF void stbtt_FreeShape(const stbtt_fontinfo *info, stbtt_vertex *vertices);
 // frees the data allocated above
 
+STBTT_DEF unsigned char *stbtt_FindSVGDoc(const stbtt_fontinfo *info, int gl);
+STBTT_DEF int stbtt_GetCodepointSVG(const stbtt_fontinfo *info, int unicode_codepoint, const char **svg);
+STBTT_DEF int stbtt_GetGlyphSVG(const stbtt_fontinfo *info, int gl, const char **svg);
+// fills svg with the character's SVG data.
+// returns data size or 0 if SVG not found.
+
 //////////////////////////////////////////////////////////////////////////////
 //
 // BITMAP RENDERING
@@ -788,6 +894,10 @@ STBTT_DEF void stbtt_MakeCodepointBitmapSubpixel(const stbtt_fontinfo *info, uns
 // same as stbtt_MakeCodepointBitmap, but you can specify a subpixel
 // shift for the character
 
+STBTT_DEF void stbtt_MakeCodepointBitmapSubpixelPrefilter(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, float shift_x, float shift_y, int oversample_x, int oversample_y, float *sub_x, float *sub_y, int codepoint);
+// same as stbtt_MakeCodepointBitmapSubpixel, but prefiltering
+// is performed (see stbtt_PackSetOversampling)
+
 STBTT_DEF void stbtt_GetCodepointBitmapBox(const stbtt_fontinfo *font, int codepoint, float scale_x, float scale_y, int *ix0, int *iy0, int *ix1, int *iy1);
 // get the bbox of the bitmap centered around the glyph origin; so the
 // bitmap width is ix1-ix0, height is iy1-iy0, and location to place
@@ -805,6 +915,7 @@ STBTT_DEF unsigned char *stbtt_GetGlyphBitmap(const stbtt_fontinfo *info, float
 STBTT_DEF unsigned char *stbtt_GetGlyphBitmapSubpixel(const stbtt_fontinfo *info, float scale_x, float scale_y, float shift_x, float shift_y, int glyph, int *width, int *height, int *xoff, int *yoff);
 STBTT_DEF void stbtt_MakeGlyphBitmap(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, int glyph);
 STBTT_DEF void stbtt_MakeGlyphBitmapSubpixel(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, float shift_x, float shift_y, int glyph);
+STBTT_DEF void stbtt_MakeGlyphBitmapSubpixelPrefilter(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, float shift_x, float shift_y, int oversample_x, int oversample_y, float *sub_x, float *sub_y, int glyph);
 STBTT_DEF void stbtt_GetGlyphBitmapBox(const stbtt_fontinfo *font, int glyph, float scale_x, float scale_y, int *ix0, int *iy0, int *ix1, int *iy1);
 STBTT_DEF void stbtt_GetGlyphBitmapBoxSubpixel(const stbtt_fontinfo *font, int glyph, float scale_x, float scale_y,float shift_x, float shift_y, int *ix0, int *iy0, int *ix1, int *iy1);
 
@@ -827,6 +938,64 @@ STBTT_DEF void stbtt_Rasterize(stbtt__bitmap *result,        // 1-channel bitmap
                                int invert,                   // if non-zero, vertically flip shape
                                void *userdata);              // context for to STBTT_MALLOC
 
+//////////////////////////////////////////////////////////////////////////////
+//
+// Signed Distance Function (or Field) rendering
+
+STBTT_DEF void stbtt_FreeSDF(unsigned char *bitmap, void *userdata);
+// frees the SDF bitmap allocated below
+
+STBTT_DEF unsigned char * stbtt_GetGlyphSDF(const stbtt_fontinfo *info, float scale, int glyph, int padding, unsigned char onedge_value, float pixel_dist_scale, int *width, int *height, int *xoff, int *yoff);
+STBTT_DEF unsigned char * stbtt_GetCodepointSDF(const stbtt_fontinfo *info, float scale, int codepoint, int padding, unsigned char onedge_value, float pixel_dist_scale, int *width, int *height, int *xoff, int *yoff);
+// These functions compute a discretized SDF field for a single character, suitable for storing
+// in a single-channel texture, sampling with bilinear filtering, and testing against
+// larger than some threshold to produce scalable fonts.
+//        info              --  the font
+//        scale             --  controls the size of the resulting SDF bitmap, same as it would be creating a regular bitmap
+//        glyph/codepoint   --  the character to generate the SDF for
+//        padding           --  extra "pixels" around the character which are filled with the distance to the character (not 0),
+//                                 which allows effects like bit outlines
+//        onedge_value      --  value 0-255 to test the SDF against to reconstruct the character (i.e. the isocontour of the character)
+//        pixel_dist_scale  --  what value the SDF should increase by when moving one SDF "pixel" away from the edge (on the 0..255 scale)
+//                                 if positive, > onedge_value is inside; if negative, < onedge_value is inside
+//        width,height      --  output height & width of the SDF bitmap (including padding)
+//        xoff,yoff         --  output origin of the character
+//        return value      --  a 2D array of bytes 0..255, width*height in size
+//
+// pixel_dist_scale & onedge_value are a scale & bias that allows you to make
+// optimal use of the limited 0..255 for your application, trading off precision
+// and special effects. SDF values outside the range 0..255 are clamped to 0..255.
+//
+// Example:
+//      scale = stbtt_ScaleForPixelHeight(22)
+//      padding = 5
+//      onedge_value = 180
+//      pixel_dist_scale = 180/5.0 = 36.0
+//
+//      This will create an SDF bitmap in which the character is about 22 pixels
+//      high but the whole bitmap is about 22+5+5=32 pixels high. To produce a filled
+//      shape, sample the SDF at each pixel and fill the pixel if the SDF value
+//      is greater than or equal to 180/255. (You'll actually want to antialias,
+//      which is beyond the scope of this example.) Additionally, you can compute
+//      offset outlines (e.g. to stroke the character border inside & outside,
+//      or only outside). For example, to fill outside the character up to 3 SDF
+//      pixels, you would compare against (180-36.0*3)/255 = 72/255. The above
+//      choice of variables maps a range from 5 pixels outside the shape to
+//      2 pixels inside the shape to 0..255; this is intended primarily for apply
+//      outside effects only (the interior range is needed to allow proper
+//      antialiasing of the font at *smaller* sizes)
+//
+// The function computes the SDF analytically at each SDF pixel, not by e.g.
+// building a higher-res bitmap and approximating it. In theory the quality
+// should be as high as possible for an SDF of this size & representation, but
+// unclear if this is true in practice (perhaps building a higher-res bitmap
+// and computing from that can allow drop-out prevention).
+//
+// The algorithm has not been optimized at all, so expect it to be slow
+// if computing lots of characters or very large sizes.
+
+
+
 //////////////////////////////////////////////////////////////////////////////
 //
 // Finding the right font...
@@ -956,6 +1125,152 @@ typedef int stbtt__test_oversample_pow2[(STBTT_MAX_OVERSAMPLE & (STBTT_MAX_OVERS
 #define STBTT__NOTUSED(v)  (void)sizeof(v)
 #endif
 
+//////////////////////////////////////////////////////////////////////////
+//
+// stbtt__buf helpers to parse data from file
+//
+
+static stbtt_uint8 stbtt__buf_get8(stbtt__buf *b)
+{
+   if (b->cursor >= b->size)
+      return 0;
+   return b->data[b->cursor++];
+}
+
+static stbtt_uint8 stbtt__buf_peek8(stbtt__buf *b)
+{
+   if (b->cursor >= b->size)
+      return 0;
+   return b->data[b->cursor];
+}
+
+static void stbtt__buf_seek(stbtt__buf *b, int o)
+{
+   STBTT_assert(!(o > b->size || o < 0));
+   b->cursor = (o > b->size || o < 0) ? b->size : o;
+}
+
+static void stbtt__buf_skip(stbtt__buf *b, int o)
+{
+   stbtt__buf_seek(b, b->cursor + o);
+}
+
+static stbtt_uint32 stbtt__buf_get(stbtt__buf *b, int n)
+{
+   stbtt_uint32 v = 0;
+   int i;
+   STBTT_assert(n >= 1 && n <= 4);
+   for (i = 0; i < n; i++)
+      v = (v << 8) | stbtt__buf_get8(b);
+   return v;
+}
+
+static stbtt__buf stbtt__new_buf(const void *p, size_t size)
+{
+   stbtt__buf r;
+   STBTT_assert(size < 0x40000000);
+   r.data = (stbtt_uint8*) p;
+   r.size = (int) size;
+   r.cursor = 0;
+   return r;
+}
+
+#define stbtt__buf_get16(b)  stbtt__buf_get((b), 2)
+#define stbtt__buf_get32(b)  stbtt__buf_get((b), 4)
+
+static stbtt__buf stbtt__buf_range(const stbtt__buf *b, int o, int s)
+{
+   stbtt__buf r = stbtt__new_buf(NULL, 0);
+   if (o < 0 || s < 0 || o > b->size || s > b->size - o) return r;
+   r.data = b->data + o;
+   r.size = s;
+   return r;
+}
+
+static stbtt__buf stbtt__cff_get_index(stbtt__buf *b)
+{
+   int count, start, offsize;
+   start = b->cursor;
+   count = stbtt__buf_get16(b);
+   if (count) {
+      offsize = stbtt__buf_get8(b);
+      STBTT_assert(offsize >= 1 && offsize <= 4);
+      stbtt__buf_skip(b, offsize * count);
+      stbtt__buf_skip(b, stbtt__buf_get(b, offsize) - 1);
+   }
+   return stbtt__buf_range(b, start, b->cursor - start);
+}
+
+static stbtt_uint32 stbtt__cff_int(stbtt__buf *b)
+{
+   int b0 = stbtt__buf_get8(b);
+   if (b0 >= 32 && b0 <= 246)       return b0 - 139;
+   else if (b0 >= 247 && b0 <= 250) return (b0 - 247)*256 + stbtt__buf_get8(b) + 108;
+   else if (b0 >= 251 && b0 <= 254) return -(b0 - 251)*256 - stbtt__buf_get8(b) - 108;
+   else if (b0 == 28)               return stbtt__buf_get16(b);
+   else if (b0 == 29)               return stbtt__buf_get32(b);
+   STBTT_assert(0);
+   return 0;
+}
+
+static void stbtt__cff_skip_operand(stbtt__buf *b) {
+   int v, b0 = stbtt__buf_peek8(b);
+   STBTT_assert(b0 >= 28);
+   if (b0 == 30) {
+      stbtt__buf_skip(b, 1);
+      while (b->cursor < b->size) {
+         v = stbtt__buf_get8(b);
+         if ((v & 0xF) == 0xF || (v >> 4) == 0xF)
+            break;
+      }
+   } else {
+      stbtt__cff_int(b);
+   }
+}
+
+static stbtt__buf stbtt__dict_get(stbtt__buf *b, int key)
+{
+   stbtt__buf_seek(b, 0);
+   while (b->cursor < b->size) {
+      int start = b->cursor, end, op;
+      while (stbtt__buf_peek8(b) >= 28)
+         stbtt__cff_skip_operand(b);
+      end = b->cursor;
+      op = stbtt__buf_get8(b);
+      if (op == 12)  op = stbtt__buf_get8(b) | 0x100;
+      if (op == key) return stbtt__buf_range(b, start, end-start);
+   }
+   return stbtt__buf_range(b, 0, 0);
+}
+
+static void stbtt__dict_get_ints(stbtt__buf *b, int key, int outcount, stbtt_uint32 *out)
+{
+   int i;
+   stbtt__buf operands = stbtt__dict_get(b, key);
+   for (i = 0; i < outcount && operands.cursor < operands.size; i++)
+      out[i] = stbtt__cff_int(&operands);
+}
+
+static int stbtt__cff_index_count(stbtt__buf *b)
+{
+   stbtt__buf_seek(b, 0);
+   return stbtt__buf_get16(b);
+}
+
+static stbtt__buf stbtt__cff_index_get(stbtt__buf b, int i)
+{
+   int count, offsize, start, end;
+   stbtt__buf_seek(&b, 0);
+   count = stbtt__buf_get16(&b);
+   offsize = stbtt__buf_get8(&b);
+   STBTT_assert(i >= 0 && i < count);
+   STBTT_assert(offsize >= 1 && offsize <= 4);
+   stbtt__buf_skip(&b, i*offsize);
+   start = stbtt__buf_get(&b, offsize);
+   end = stbtt__buf_get(&b, offsize);
+   return stbtt__buf_range(&b, 2+(count+1)*offsize+start, end - start);
+}
+
 //////////////////////////////////////////////////////////////////////////
 //
 // accessors to parse data from file
@@ -968,32 +1283,22 @@ typedef int stbtt__test_oversample_pow2[(STBTT_MAX_OVERSAMPLE & (STBTT_MAX_OVERS
 #define ttCHAR(p)     (* (stbtt_int8 *) (p))
 #define ttFixed(p)    ttLONG(p)
 
-#if defined(STB_TRUETYPE_BIGENDIAN) && !defined(ALLOW_UNALIGNED_TRUETYPE)
-
-   #define ttUSHORT(p)   (* (stbtt_uint16 *) (p))
-   #define ttSHORT(p)    (* (stbtt_int16 *) (p))
-   #define ttULONG(p)    (* (stbtt_uint32 *) (p))
-   #define ttLONG(p)     (* (stbtt_int32 *) (p))
-
-#else
-
-   static stbtt_uint16 ttUSHORT(const stbtt_uint8 *p) { return p[0]*256 + p[1]; }
-   static stbtt_int16 ttSHORT(const stbtt_uint8 *p)   { return p[0]*256 + p[1]; }
-   static stbtt_uint32 ttULONG(const stbtt_uint8 *p)  { return (p[0]<<24) + (p[1]<<16) + (p[2]<<8) + p[3]; }
-   static stbtt_int32 ttLONG(const stbtt_uint8 *p)    { return (p[0]<<24) + (p[1]<<16) + (p[2]<<8) + p[3]; }
-
-#endif
+static stbtt_uint16 ttUSHORT(stbtt_uint8 *p) { return p[0]*256 + p[1]; }
+static stbtt_int16 ttSHORT(stbtt_uint8 *p)   { return p[0]*256 + p[1]; }
+static stbtt_uint32 ttULONG(stbtt_uint8 *p)  { return (p[0]<<24) + (p[1]<<16) + (p[2]<<8) + p[3]; }
+static stbtt_int32 ttLONG(stbtt_uint8 *p)    { return (p[0]<<24) + (p[1]<<16) + (p[2]<<8) + p[3]; }
 
 #define stbtt_tag4(p,c0,c1,c2,c3) ((p)[0] == (c0) && (p)[1] == (c1) && (p)[2] == (c2) && (p)[3] == (c3))
 #define stbtt_tag(p,str)           stbtt_tag4(p,str[0],str[1],str[2],str[3])
 
-static int stbtt__isfont(const stbtt_uint8 *font)
+static int stbtt__isfont(stbtt_uint8 *font)
 {
    // check the version number
    if (stbtt_tag4(font, '1',0,0,0))  return 1; // TrueType 1
    if (stbtt_tag(font, "typ1"))   return 1; // TrueType with type 1 font -- we don't support this!
    if (stbtt_tag(font, "OTTO"))   return 1; // OpenType with CFF
    if (stbtt_tag4(font, 0,1,0,0)) return 1; // OpenType 1.0
+   if (stbtt_tag(font, "true"))   return 1; // Apple specification for TrueType fonts
    return 0;
 }
 
@@ -1011,7 +1316,7 @@ static stbtt_uint32 stbtt__find_table(stbtt_uint8 *data, stbtt_uint32 fontstart,
    return 0;
 }
 
-STBTT_DEF int stbtt_GetFontOffsetForIndex(const unsigned char *font_collection, int index)
+static int stbtt_GetFontOffsetForIndex_internal(unsigned char *font_collection, int index)
 {
    // if it's just a font, there's only one valid index
    if (stbtt__isfont(font_collection))
@@ -1030,14 +1335,59 @@ STBTT_DEF int stbtt_GetFontOffsetForIndex(const unsigned char *font_collection,
    return -1;
 }
 
-STBTT_DEF int stbtt_InitFont(stbtt_fontinfo *info, const unsigned char *data2, int fontstart)
+static int stbtt_GetNumberOfFonts_internal(unsigned char *font_collection)
+{
+   // if it's just a font, there's only one valid font
+   if (stbtt__isfont(font_collection))
+      return 1;
+
+   // check if it's a TTC
+   if (stbtt_tag(font_collection, "ttcf")) {
+      // version 1?
+      if (ttULONG(font_collection+4) == 0x00010000 || ttULONG(font_collection+4) == 0x00020000) {
+         return ttLONG(font_collection+8);
+      }
+   }
+   return 0;
+}
+
+static stbtt__buf stbtt__get_subrs(stbtt__buf cff, stbtt__buf fontdict)
+{
+   stbtt_uint32 subrsoff = 0, private_loc[2] = { 0, 0 };
+   stbtt__buf pdict;
+   stbtt__dict_get_ints(&fontdict, 18, 2, private_loc);
+   if (!private_loc[1] || !private_loc[0]) return stbtt__new_buf(NULL, 0);
+   pdict = stbtt__buf_range(&cff, private_loc[1], private_loc[0]);
+   stbtt__dict_get_ints(&pdict, 19, 1, &subrsoff);
+   if (!subrsoff) return stbtt__new_buf(NULL, 0);
+   stbtt__buf_seek(&cff, private_loc[1]+subrsoff);
+   return stbtt__cff_get_index(&cff);
+}
+
+// since most people won't use this, find this table the first time it's needed
+static int stbtt__get_svg(stbtt_fontinfo *info)
+{
+   stbtt_uint32 t;
+   if (info->svg < 0) {
+      t = stbtt__find_table(info->data, info->fontstart, "SVG ");
+      if (t) {
+         stbtt_uint32 offset = ttULONG(info->data + t + 2);
+         info->svg = t + offset;
+      } else {
+         info->svg = 0;
+      }
+   }
+   return info->svg;
+}
+
+static int stbtt_InitFont_internal(stbtt_fontinfo *info, unsigned char *data, int fontstart)
 {
-   stbtt_uint8 *data = (stbtt_uint8 *) data2;
    stbtt_uint32 cmap, t;
    stbtt_int32 i,numTables;
 
    info->data = data;
    info->fontstart = fontstart;
+   info->cff = stbtt__new_buf(NULL, 0);
 
    cmap = stbtt__find_table(data, fontstart, "cmap");       // required
    info->loca = stbtt__find_table(data, fontstart, "loca"); // required
@@ -1046,8 +1396,62 @@ STBTT_DEF int stbtt_InitFont(stbtt_fontinfo *info, const unsigned char *data2, i
    info->hhea = stbtt__find_table(data, fontstart, "hhea"); // required
    info->hmtx = stbtt__find_table(data, fontstart, "hmtx"); // required
    info->kern = stbtt__find_table(data, fontstart, "kern"); // not required
-   if (!cmap || !info->loca || !info->head || !info->glyf || !info->hhea || !info->hmtx)
+   info->gpos = stbtt__find_table(data, fontstart, "GPOS"); // not required
+
+   if (!cmap || !info->head || !info->hhea || !info->hmtx)
       return 0;
+   if (info->glyf) {
+      // required for truetype
+      if (!info->loca) return 0;
+   } else {
+      // initialization for CFF / Type2 fonts (OTF)
+      stbtt__buf b, topdict, topdictidx;
+      stbtt_uint32 cstype = 2, charstrings = 0, fdarrayoff = 0, fdselectoff = 0;
+      stbtt_uint32 cff;
+
+      cff = stbtt__find_table(data, fontstart, "CFF ");
+      if (!cff) return 0;
+
+      info->fontdicts = stbtt__new_buf(NULL, 0);
+      info->fdselect = stbtt__new_buf(NULL, 0);
+
+      // @TODO this should use size from table (not 512MB)
+      info->cff = stbtt__new_buf(data+cff, 512*1024*1024);
+      b = info->cff;
+
+      // read the header
+      stbtt__buf_skip(&b, 2);
+      stbtt__buf_seek(&b, stbtt__buf_get8(&b)); // hdrsize
+
+      // @TODO the name INDEX could list multiple fonts,
+      // but we just use the first one.
+      stbtt__cff_get_index(&b);  // name INDEX
+      topdictidx = stbtt__cff_get_index(&b);
+      topdict = stbtt__cff_index_get(topdictidx, 0);
+      stbtt__cff_get_index(&b);  // string INDEX
+      info->gsubrs = stbtt__cff_get_index(&b);
+
+      stbtt__dict_get_ints(&topdict, 17, 1, &charstrings);
+      stbtt__dict_get_ints(&topdict, 0x100 | 6, 1, &cstype);
+      stbtt__dict_get_ints(&topdict, 0x100 | 36, 1, &fdarrayoff);
+      stbtt__dict_get_ints(&topdict, 0x100 | 37, 1, &fdselectoff);
+      info->subrs = stbtt__get_subrs(b, topdict);
+
+      // we only support Type 2 charstrings
+      if (cstype != 2) return 0;
+      if (charstrings == 0) return 0;
+
+      if (fdarrayoff) {
+         // looks like a CID font
+         if (!fdselectoff) return 0;
+         stbtt__buf_seek(&b, fdarrayoff);
+         info->fontdicts = stbtt__cff_get_index(&b);
+         info->fdselect = stbtt__buf_range(&b, fdselectoff, b.size-fdselectoff);
+      }
+
+      stbtt__buf_seek(&b, charstrings);
+      info->charstrings = stbtt__cff_get_index(&b);
+   }
 
    t = stbtt__find_table(data, fontstart, "maxp");
    if (t)
@@ -1055,6 +1459,8 @@ STBTT_DEF int stbtt_InitFont(stbtt_fontinfo *info, const unsigned char *data2, i
    else
       info->numGlyphs = 0xffff;
 
+   info->svg = -1;
+
    // find a cmap encoding table we understand *now* to avoid searching
    // later. (todo: could make this installable)
    // the same regardless of glyph.
@@ -1138,12 +1544,12 @@ STBTT_DEF int stbtt_FindGlyphIndex(const stbtt_fontinfo *info, int unicode_codep
       search += 2;
 
       {
-         stbtt_uint16 offset, start;
+         stbtt_uint16 offset, start, last;
          stbtt_uint16 item = (stbtt_uint16) ((search - endCount) >> 1);
 
-         STBTT_assert(unicode_codepoint <= ttUSHORT(data + endCount + 2*item));
          start = ttUSHORT(data + index_map + 14 + segcount*2 + 2 + 2*item);
-         if (unicode_codepoint < start)
+         last = ttUSHORT(data + endCount + 2*item);
+         if (unicode_codepoint < start || unicode_codepoint > last)
             return 0;
 
          offset = ttUSHORT(data + index_map + 14 + segcount*6 + 2 + 2*item);
@@ -1198,6 +1604,8 @@ static int stbtt__GetGlyfOffset(const stbtt_fontinfo *info, int glyph_index)
 {
    int g1,g2;
 
+   STBTT_assert(!info->cff.size);
+
    if (glyph_index >= info->numGlyphs) return -1; // glyph index out of range
    if (info->indexToLocFormat >= 2)    return -1; // unknown index->glyph map format
 
@@ -1212,15 +1620,21 @@ static int stbtt__GetGlyfOffset(const stbtt_fontinfo *info, int glyph_index)
    return g1==g2 ? -1 : g1; // if length is 0, return -1
 }
 
+static int stbtt__GetGlyphInfoT2(const stbtt_fontinfo *info, int glyph_index, int *x0, int *y0, int *x1, int *y1);
+
 STBTT_DEF int stbtt_GetGlyphBox(const stbtt_fontinfo *info, int glyph_index, int *x0, int *y0, int *x1, int *y1)
 {
-   int g = stbtt__GetGlyfOffset(info, glyph_index);
-   if (g < 0) return 0;
+   if (info->cff.size) {
+      stbtt__GetGlyphInfoT2(info, glyph_index, x0, y0, x1, y1);
+   } else {
+      int g = stbtt__GetGlyfOffset(info, glyph_index);
+      if (g < 0) return 0;
 
-   if (x0) *x0 = ttSHORT(info->data + g + 2);
-   if (y0) *y0 = ttSHORT(info->data + g + 4);
-   if (x1) *x1 = ttSHORT(info->data + g + 6);
-   if (y1) *y1 = ttSHORT(info->data + g + 8);
+      if (x0) *x0 = ttSHORT(info->data + g + 2);
+      if (y0) *y0 = ttSHORT(info->data + g + 4);
+      if (x1) *x1 = ttSHORT(info->data + g + 6);
+      if (y1) *y1 = ttSHORT(info->data + g + 8);
+   }
    return 1;
 }
 
@@ -1232,7 +1646,10 @@ STBTT_DEF int stbtt_GetCodepointBox(const stbtt_fontinfo *info, int codepoint, i
 STBTT_DEF int stbtt_IsGlyphEmpty(const stbtt_fontinfo *info, int glyph_index)
 {
    stbtt_int16 numberOfContours;
-   int g = stbtt__GetGlyfOffset(info, glyph_index);
+   int g;
+   if (info->cff.size)
+      return stbtt__GetGlyphInfoT2(info, glyph_index, NULL, NULL, NULL, NULL) == 0;
+   g = stbtt__GetGlyfOffset(info, glyph_index);
    if (g < 0) return 1;
    numberOfContours = ttSHORT(info->data + g);
    return numberOfContours == 0;
@@ -1254,7 +1671,7 @@ static int stbtt__close_shape(stbtt_vertex *vertices, int num_vertices, int was_
    return num_vertices;
 }
 
-STBTT_DEF int stbtt_GetGlyphShape(const stbtt_fontinfo *info, int glyph_index, stbtt_vertex **pvertices)
+static int stbtt__GetGlyphShapeTT(const stbtt_fontinfo *info, int glyph_index, stbtt_vertex **pvertices)
 {
    stbtt_int16 numberOfContours;
    stbtt_uint8 *endPtsOfContours;
@@ -1350,7 +1767,7 @@ STBTT_DEF int stbtt_GetGlyphShape(const stbtt_fontinfo *info, int glyph_index, s
             if (i != 0)
                num_vertices = stbtt__close_shape(vertices, num_vertices, was_off, start_off, sx,sy,scx,scy,cx,cy);
 
-            // now start the new one               
+            // now start the new one
             start_off = !(flags & 1);
             if (start_off) {
                // if we start off with an off-curve point, then when we need to find a point on the curve
@@ -1392,7 +1809,7 @@ STBTT_DEF int stbtt_GetGlyphShape(const stbtt_fontinfo *info, int glyph_index, s
          }
       }
       num_vertices = stbtt__close_shape(vertices, num_vertices, was_off, start_off, sx,sy,scx,scy,cx,cy);
-   } else if (numberOfContours == -1) {
+   } else if (numberOfContours < 0) {
       // Compound shapes.
       int more = 1;
       stbtt_uint8 *comp = data + g + 10;
@@ -1403,7 +1820,7 @@ STBTT_DEF int stbtt_GetGlyphShape(const stbtt_fontinfo *info, int glyph_index, s
          int comp_num_verts = 0, i;
          stbtt_vertex *comp_verts = 0, *tmp = 0;
          float mtx[6] = {1,0,0,1,0,0}, m, n;
-         
+
          flags = ttSHORT(comp); comp+=2;
          gidx = ttSHORT(comp); comp+=2;
 
@@ -1433,7 +1850,7 @@ STBTT_DEF int stbtt_GetGlyphShape(const stbtt_fontinfo *info, int glyph_index, s
             mtx[2] = ttSHORT(comp)/16384.0f; comp+=2;
             mtx[3] = ttSHORT(comp)/16384.0f; comp+=2;
          }
-         
+
          // Find transformation scales.
          m = (float) STBTT_sqrt(mtx[0]*mtx[0] + mtx[1]*mtx[1]);
          n = (float) STBTT_sqrt(mtx[2]*mtx[2] + mtx[3]*mtx[3]);
@@ -1459,7 +1876,7 @@ STBTT_DEF int stbtt_GetGlyphShape(const stbtt_fontinfo *info, int glyph_index, s
                if (comp_verts) STBTT_free(comp_verts, info->userdata);
                return 0;
             }
-            if (num_vertices > 0) STBTT_memcpy(tmp, vertices, num_vertices*sizeof(stbtt_vertex));
+            if (num_vertices > 0 && vertices) STBTT_memcpy(tmp, vertices, num_vertices*sizeof(stbtt_vertex));
             STBTT_memcpy(tmp+num_vertices, comp_verts, comp_num_verts*sizeof(stbtt_vertex));
             if (vertices) STBTT_free(vertices, info->userdata);
             vertices = tmp;
@@ -1469,30 +1886,477 @@ STBTT_DEF int stbtt_GetGlyphShape(const stbtt_fontinfo *info, int glyph_index, s
          // More components ?
          more = flags & (1<<5);
       }
-   } else if (numberOfContours < 0) {
-      // @TODO other compound variations?
-      STBTT_assert(0);
    } else {
       // numberOfCounters == 0, do nothing
    }
 
-   *pvertices = vertices;
-   return num_vertices;
-}
+   *pvertices = vertices;
+   return num_vertices;
+}
+
+typedef struct
+{
+   int bounds;
+   int started;
+   float first_x, first_y;
+   float x, y;
+   stbtt_int32 min_x, max_x, min_y, max_y;
+
+   stbtt_vertex *pvertices;
+   int num_vertices;
+} stbtt__csctx;
+
+#define STBTT__CSCTX_INIT(bounds) {bounds,0, 0,0, 0,0, 0,0,0,0, NULL, 0}
+
+static void stbtt__track_vertex(stbtt__csctx *c, stbtt_int32 x, stbtt_int32 y)
+{
+   if (x > c->max_x || !c->started) c->max_x = x;
+   if (y > c->max_y || !c->started) c->max_y = y;
+   if (x < c->min_x || !c->started) c->min_x = x;
+   if (y < c->min_y || !c->started) c->min_y = y;
+   c->started = 1;
+}
+
+static void stbtt__csctx_v(stbtt__csctx *c, stbtt_uint8 type, stbtt_int32 x, stbtt_int32 y, stbtt_int32 cx, stbtt_int32 cy, stbtt_int32 cx1, stbtt_int32 cy1)
+{
+   if (c->bounds) {
+      stbtt__track_vertex(c, x, y);
+      if (type == STBTT_vcubic) {
+         stbtt__track_vertex(c, cx, cy);
+         stbtt__track_vertex(c, cx1, cy1);
+      }
+   } else {
+      stbtt_setvertex(&c->pvertices[c->num_vertices], type, x, y, cx, cy);
+      c->pvertices[c->num_vertices].cx1 = (stbtt_int16) cx1;
+      c->pvertices[c->num_vertices].cy1 = (stbtt_int16) cy1;
+   }
+   c->num_vertices++;
+}
+
+static void stbtt__csctx_close_shape(stbtt__csctx *ctx)
+{
+   if (ctx->first_x != ctx->x || ctx->first_y != ctx->y)
+      stbtt__csctx_v(ctx, STBTT_vline, (int)ctx->first_x, (int)ctx->first_y, 0, 0, 0, 0);
+}
+
+static void stbtt__csctx_rmove_to(stbtt__csctx *ctx, float dx, float dy)
+{
+   stbtt__csctx_close_shape(ctx);
+   ctx->first_x = ctx->x = ctx->x + dx;
+   ctx->first_y = ctx->y = ctx->y + dy;
+   stbtt__csctx_v(ctx, STBTT_vmove, (int)ctx->x, (int)ctx->y, 0, 0, 0, 0);
+}
+
+static void stbtt__csctx_rline_to(stbtt__csctx *ctx, float dx, float dy)
+{
+   ctx->x += dx;
+   ctx->y += dy;
+   stbtt__csctx_v(ctx, STBTT_vline, (int)ctx->x, (int)ctx->y, 0, 0, 0, 0);
+}
+
+static void stbtt__csctx_rccurve_to(stbtt__csctx *ctx, float dx1, float dy1, float dx2, float dy2, float dx3, float dy3)
+{
+   float cx1 = ctx->x + dx1;
+   float cy1 = ctx->y + dy1;
+   float cx2 = cx1 + dx2;
+   float cy2 = cy1 + dy2;
+   ctx->x = cx2 + dx3;
+   ctx->y = cy2 + dy3;
+   stbtt__csctx_v(ctx, STBTT_vcubic, (int)ctx->x, (int)ctx->y, (int)cx1, (int)cy1, (int)cx2, (int)cy2);
+}
+
+static stbtt__buf stbtt__get_subr(stbtt__buf idx, int n)
+{
+   int count = stbtt__cff_index_count(&idx);
+   int bias = 107;
+   if (count >= 33900)
+      bias = 32768;
+   else if (count >= 1240)
+      bias = 1131;
+   n += bias;
+   if (n < 0 || n >= count)
+      return stbtt__new_buf(NULL, 0);
+   return stbtt__cff_index_get(idx, n);
+}
+
+static stbtt__buf stbtt__cid_get_glyph_subrs(const stbtt_fontinfo *info, int glyph_index)
+{
+   stbtt__buf fdselect = info->fdselect;
+   int nranges, start, end, v, fmt, fdselector = -1, i;
+
+   stbtt__buf_seek(&fdselect, 0);
+   fmt = stbtt__buf_get8(&fdselect);
+   if (fmt == 0) {
+      // untested
+      stbtt__buf_skip(&fdselect, glyph_index);
+      fdselector = stbtt__buf_get8(&fdselect);
+   } else if (fmt == 3) {
+      nranges = stbtt__buf_get16(&fdselect);
+      start = stbtt__buf_get16(&fdselect);
+      for (i = 0; i < nranges; i++) {
+         v = stbtt__buf_get8(&fdselect);
+         end = stbtt__buf_get16(&fdselect);
+         if (glyph_index >= start && glyph_index < end) {
+            fdselector = v;
+            break;
+         }
+         start = end;
+      }
+   }
+   if (fdselector == -1) stbtt__new_buf(NULL, 0);
+   return stbtt__get_subrs(info->cff, stbtt__cff_index_get(info->fontdicts, fdselector));
+}
+
+static int stbtt__run_charstring(const stbtt_fontinfo *info, int glyph_index, stbtt__csctx *c)
+{
+   int in_header = 1, maskbits = 0, subr_stack_height = 0, sp = 0, v, i, b0;
+   int has_subrs = 0, clear_stack;
+   float s[48];
+   stbtt__buf subr_stack[10], subrs = info->subrs, b;
+   float f;
+
+#define STBTT__CSERR(s) (0)
+
+   // this currently ignores the initial width value, which isn't needed if we have hmtx
+   b = stbtt__cff_index_get(info->charstrings, glyph_index);
+   while (b.cursor < b.size) {
+      i = 0;
+      clear_stack = 1;
+      b0 = stbtt__buf_get8(&b);
+      switch (b0) {
+      // @TODO implement hinting
+      case 0x13: // hintmask
+      case 0x14: // cntrmask
+         if (in_header)
+            maskbits += (sp / 2); // implicit "vstem"
+         in_header = 0;
+         stbtt__buf_skip(&b, (maskbits + 7) / 8);
+         break;
+
+      case 0x01: // hstem
+      case 0x03: // vstem
+      case 0x12: // hstemhm
+      case 0x17: // vstemhm
+         maskbits += (sp / 2);
+         break;
+
+      case 0x15: // rmoveto
+         in_header = 0;
+         if (sp < 2) return STBTT__CSERR("rmoveto stack");
+         stbtt__csctx_rmove_to(c, s[sp-2], s[sp-1]);
+         break;
+      case 0x04: // vmoveto
+         in_header = 0;
+         if (sp < 1) return STBTT__CSERR("vmoveto stack");
+         stbtt__csctx_rmove_to(c, 0, s[sp-1]);
+         break;
+      case 0x16: // hmoveto
+         in_header = 0;
+         if (sp < 1) return STBTT__CSERR("hmoveto stack");
+         stbtt__csctx_rmove_to(c, s[sp-1], 0);
+         break;
+
+      case 0x05: // rlineto
+         if (sp < 2) return STBTT__CSERR("rlineto stack");
+         for (; i + 1 < sp; i += 2)
+            stbtt__csctx_rline_to(c, s[i], s[i+1]);
+         break;
+
+      // hlineto/vlineto and vhcurveto/hvcurveto alternate horizontal and vertical
+      // starting from a different place.
+
+      case 0x07: // vlineto
+         if (sp < 1) return STBTT__CSERR("vlineto stack");
+         goto vlineto;
+      case 0x06: // hlineto
+         if (sp < 1) return STBTT__CSERR("hlineto stack");
+         for (;;) {
+            if (i >= sp) break;
+            stbtt__csctx_rline_to(c, s[i], 0);
+            i++;
+      vlineto:
+            if (i >= sp) break;
+            stbtt__csctx_rline_to(c, 0, s[i]);
+            i++;
+         }
+         break;
+
+      case 0x1F: // hvcurveto
+         if (sp < 4) return STBTT__CSERR("hvcurveto stack");
+         goto hvcurveto;
+      case 0x1E: // vhcurveto
+         if (sp < 4) return STBTT__CSERR("vhcurveto stack");
+         for (;;) {
+            if (i + 3 >= sp) break;
+            stbtt__csctx_rccurve_to(c, 0, s[i], s[i+1], s[i+2], s[i+3], (sp - i == 5) ? s[i + 4] : 0.0f);
+            i += 4;
+      hvcurveto:
+            if (i + 3 >= sp) break;
+            stbtt__csctx_rccurve_to(c, s[i], 0, s[i+1], s[i+2], (sp - i == 5) ? s[i+4] : 0.0f, s[i+3]);
+            i += 4;
+         }
+         break;
+
+      case 0x08: // rrcurveto
+         if (sp < 6) return STBTT__CSERR("rcurveline stack");
+         for (; i + 5 < sp; i += 6)
+            stbtt__csctx_rccurve_to(c, s[i], s[i+1], s[i+2], s[i+3], s[i+4], s[i+5]);
+         break;
+
+      case 0x18: // rcurveline
+         if (sp < 8) return STBTT__CSERR("rcurveline stack");
+         for (; i + 5 < sp - 2; i += 6)
+            stbtt__csctx_rccurve_to(c, s[i], s[i+1], s[i+2], s[i+3], s[i+4], s[i+5]);
+         if (i + 1 >= sp) return STBTT__CSERR("rcurveline stack");
+         stbtt__csctx_rline_to(c, s[i], s[i+1]);
+         break;
+
+      case 0x19: // rlinecurve
+         if (sp < 8) return STBTT__CSERR("rlinecurve stack");
+         for (; i + 1 < sp - 6; i += 2)
+            stbtt__csctx_rline_to(c, s[i], s[i+1]);
+         if (i + 5 >= sp) return STBTT__CSERR("rlinecurve stack");
+         stbtt__csctx_rccurve_to(c, s[i], s[i+1], s[i+2], s[i+3], s[i+4], s[i+5]);
+         break;
+
+      case 0x1A: // vvcurveto
+      case 0x1B: // hhcurveto
+         if (sp < 4) return STBTT__CSERR("(vv|hh)curveto stack");
+         f = 0.0;
+         if (sp & 1) { f = s[i]; i++; }
+         for (; i + 3 < sp; i += 4) {
+            if (b0 == 0x1B)
+               stbtt__csctx_rccurve_to(c, s[i], f, s[i+1], s[i+2], s[i+3], 0.0);
+            else
+               stbtt__csctx_rccurve_to(c, f, s[i], s[i+1], s[i+2], 0.0, s[i+3]);
+            f = 0.0;
+         }
+         break;
+
+      case 0x0A: // callsubr
+         if (!has_subrs) {
+            if (info->fdselect.size)
+               subrs = stbtt__cid_get_glyph_subrs(info, glyph_index);
+            has_subrs = 1;
+         }
+         // FALLTHROUGH
+      case 0x1D: // callgsubr
+         if (sp < 1) return STBTT__CSERR("call(g|)subr stack");
+         v = (int) s[--sp];
+         if (subr_stack_height >= 10) return STBTT__CSERR("recursion limit");
+         subr_stack[subr_stack_height++] = b;
+         b = stbtt__get_subr(b0 == 0x0A ? subrs : info->gsubrs, v);
+         if (b.size == 0) return STBTT__CSERR("subr not found");
+         b.cursor = 0;
+         clear_stack = 0;
+         break;
+
+      case 0x0B: // return
+         if (subr_stack_height <= 0) return STBTT__CSERR("return outside subr");
+         b = subr_stack[--subr_stack_height];
+         clear_stack = 0;
+         break;
+
+      case 0x0E: // endchar
+         stbtt__csctx_close_shape(c);
+         return 1;
+
+      case 0x0C: { // two-byte escape
+         float dx1, dx2, dx3, dx4, dx5, dx6, dy1, dy2, dy3, dy4, dy5, dy6;
+         float dx, dy;
+         int b1 = stbtt__buf_get8(&b);
+         switch (b1) {
+         // @TODO These "flex" implementations ignore the flex-depth and resolution,
+         // and always draw beziers.
+         case 0x22: // hflex
+            if (sp < 7) return STBTT__CSERR("hflex stack");
+            dx1 = s[0];
+            dx2 = s[1];
+            dy2 = s[2];
+            dx3 = s[3];
+            dx4 = s[4];
+            dx5 = s[5];
+            dx6 = s[6];
+            stbtt__csctx_rccurve_to(c, dx1, 0, dx2, dy2, dx3, 0);
+            stbtt__csctx_rccurve_to(c, dx4, 0, dx5, -dy2, dx6, 0);
+            break;
+
+         case 0x23: // flex
+            if (sp < 13) return STBTT__CSERR("flex stack");
+            dx1 = s[0];
+            dy1 = s[1];
+            dx2 = s[2];
+            dy2 = s[3];
+            dx3 = s[4];
+            dy3 = s[5];
+            dx4 = s[6];
+            dy4 = s[7];
+            dx5 = s[8];
+            dy5 = s[9];
+            dx6 = s[10];
+            dy6 = s[11];
+            //fd is s[12]
+            stbtt__csctx_rccurve_to(c, dx1, dy1, dx2, dy2, dx3, dy3);
+            stbtt__csctx_rccurve_to(c, dx4, dy4, dx5, dy5, dx6, dy6);
+            break;
+
+         case 0x24: // hflex1
+            if (sp < 9) return STBTT__CSERR("hflex1 stack");
+            dx1 = s[0];
+            dy1 = s[1];
+            dx2 = s[2];
+            dy2 = s[3];
+            dx3 = s[4];
+            dx4 = s[5];
+            dx5 = s[6];
+            dy5 = s[7];
+            dx6 = s[8];
+            stbtt__csctx_rccurve_to(c, dx1, dy1, dx2, dy2, dx3, 0);
+            stbtt__csctx_rccurve_to(c, dx4, 0, dx5, dy5, dx6, -(dy1+dy2+dy5));
+            break;
+
+         case 0x25: // flex1
+            if (sp < 11) return STBTT__CSERR("flex1 stack");
+            dx1 = s[0];
+            dy1 = s[1];
+            dx2 = s[2];
+            dy2 = s[3];
+            dx3 = s[4];
+            dy3 = s[5];
+            dx4 = s[6];
+            dy4 = s[7];
+            dx5 = s[8];
+            dy5 = s[9];
+            dx6 = dy6 = s[10];
+            dx = dx1+dx2+dx3+dx4+dx5;
+            dy = dy1+dy2+dy3+dy4+dy5;
+            if (STBTT_fabs(dx) > STBTT_fabs(dy))
+               dy6 = -dy;
+            else
+               dx6 = -dx;
+            stbtt__csctx_rccurve_to(c, dx1, dy1, dx2, dy2, dx3, dy3);
+            stbtt__csctx_rccurve_to(c, dx4, dy4, dx5, dy5, dx6, dy6);
+            break;
+
+         default:
+            return STBTT__CSERR("unimplemented");
+         }
+      } break;
+
+      default:
+         if (b0 != 255 && b0 != 28 && b0 < 32)
+            return STBTT__CSERR("reserved operator");
+
+         // push immediate
+         if (b0 == 255) {
+            f = (float)(stbtt_int32)stbtt__buf_get32(&b) / 0x10000;
+         } else {
+            stbtt__buf_skip(&b, -1);
+            f = (float)(stbtt_int16)stbtt__cff_int(&b);
+         }
+         if (sp >= 48) return STBTT__CSERR("push stack overflow");
+         s[sp++] = f;
+         clear_stack = 0;
+         break;
+      }
+      if (clear_stack) sp = 0;
+   }
+   return STBTT__CSERR("no endchar");
+
+#undef STBTT__CSERR
+}
+
+static int stbtt__GetGlyphShapeT2(const stbtt_fontinfo *info, int glyph_index, stbtt_vertex **pvertices)
+{
+   // runs the charstring twice, once to count and once to output (to avoid realloc)
+   stbtt__csctx count_ctx = STBTT__CSCTX_INIT(1);
+   stbtt__csctx output_ctx = STBTT__CSCTX_INIT(0);
+   if (stbtt__run_charstring(info, glyph_index, &count_ctx)) {
+      *pvertices = (stbtt_vertex*)STBTT_malloc(count_ctx.num_vertices*sizeof(stbtt_vertex), info->userdata);
+      output_ctx.pvertices = *pvertices;
+      if (stbtt__run_charstring(info, glyph_index, &output_ctx)) {
+         STBTT_assert(output_ctx.num_vertices == count_ctx.num_vertices);
+         return output_ctx.num_vertices;
+      }
+   }
+   *pvertices = NULL;
+   return 0;
+}
+
+static int stbtt__GetGlyphInfoT2(const stbtt_fontinfo *info, int glyph_index, int *x0, int *y0, int *x1, int *y1)
+{
+   stbtt__csctx c = STBTT__CSCTX_INIT(1);
+   int r = stbtt__run_charstring(info, glyph_index, &c);
+   if (x0)  *x0 = r ? c.min_x : 0;
+   if (y0)  *y0 = r ? c.min_y : 0;
+   if (x1)  *x1 = r ? c.max_x : 0;
+   if (y1)  *y1 = r ? c.max_y : 0;
+   return r ? c.num_vertices : 0;
+}
+
+STBTT_DEF int stbtt_GetGlyphShape(const stbtt_fontinfo *info, int glyph_index, stbtt_vertex **pvertices)
+{
+   if (!info->cff.size)
+      return stbtt__GetGlyphShapeTT(info, glyph_index, pvertices);
+   else
+      return stbtt__GetGlyphShapeT2(info, glyph_index, pvertices);
+}
+
+STBTT_DEF void stbtt_GetGlyphHMetrics(const stbtt_fontinfo *info, int glyph_index, int *advanceWidth, int *leftSideBearing)
+{
+   stbtt_uint16 numOfLongHorMetrics = ttUSHORT(info->data+info->hhea + 34);
+   if (glyph_index < numOfLongHorMetrics) {
+      if (advanceWidth)     *advanceWidth    = ttSHORT(info->data + info->hmtx + 4*glyph_index);
+      if (leftSideBearing)  *leftSideBearing = ttSHORT(info->data + info->hmtx + 4*glyph_index + 2);
+   } else {
+      if (advanceWidth)     *advanceWidth    = ttSHORT(info->data + info->hmtx + 4*(numOfLongHorMetrics-1));
+      if (leftSideBearing)  *leftSideBearing = ttSHORT(info->data + info->hmtx + 4*numOfLongHorMetrics + 2*(glyph_index - numOfLongHorMetrics));
+   }
+}
+
+STBTT_DEF int  stbtt_GetKerningTableLength(const stbtt_fontinfo *info)
+{
+   stbtt_uint8 *data = info->data + info->kern;
+
+   // we only look at the first table. it must be 'horizontal' and format 0.
+   if (!info->kern)
+      return 0;
+   if (ttUSHORT(data+2) < 1) // number of tables, need at least 1
+      return 0;
+   if (ttUSHORT(data+8) != 1) // horizontal flag must be set in format
+      return 0;
+
+   return ttUSHORT(data+10);
+}
+
+STBTT_DEF int stbtt_GetKerningTable(const stbtt_fontinfo *info, stbtt_kerningentry* table, int table_length)
+{
+   stbtt_uint8 *data = info->data + info->kern;
+   int k, length;
+
+   // we only look at the first table. it must be 'horizontal' and format 0.
+   if (!info->kern)
+      return 0;
+   if (ttUSHORT(data+2) < 1) // number of tables, need at least 1
+      return 0;
+   if (ttUSHORT(data+8) != 1) // horizontal flag must be set in format
+      return 0;
+
+   length = ttUSHORT(data+10);
+   if (table_length < length)
+      length = table_length;
 
-STBTT_DEF void stbtt_GetGlyphHMetrics(const stbtt_fontinfo *info, int glyph_index, int *advanceWidth, int *leftSideBearing)
-{
-   stbtt_uint16 numOfLongHorMetrics = ttUSHORT(info->data+info->hhea + 34);
-   if (glyph_index < numOfLongHorMetrics) {
-      if (advanceWidth)     *advanceWidth    = ttSHORT(info->data + info->hmtx + 4*glyph_index);
-      if (leftSideBearing)  *leftSideBearing = ttSHORT(info->data + info->hmtx + 4*glyph_index + 2);
-   } else {
-      if (advanceWidth)     *advanceWidth    = ttSHORT(info->data + info->hmtx + 4*(numOfLongHorMetrics-1));
-      if (leftSideBearing)  *leftSideBearing = ttSHORT(info->data + info->hmtx + 4*numOfLongHorMetrics + 2*(glyph_index - numOfLongHorMetrics));
+   for (k = 0; k < length; k++)
+   {
+      table[k].glyph1 = ttUSHORT(data+18+(k*6));
+      table[k].glyph2 = ttUSHORT(data+20+(k*6));
+      table[k].advance = ttSHORT(data+22+(k*6));
    }
+
+   return length;
 }
 
-STBTT_DEF int  stbtt_GetGlyphKernAdvance(const stbtt_fontinfo *info, int glyph1, int glyph2)
+static int stbtt__GetGlyphKernInfoAdvance(const stbtt_fontinfo *info, int glyph1, int glyph2)
 {
    stbtt_uint8 *data = info->data + info->kern;
    stbtt_uint32 needle, straw;
@@ -1522,9 +2386,242 @@ STBTT_DEF int  stbtt_GetGlyphKernAdvance(const stbtt_fontinfo *info, int glyph1,
    return 0;
 }
 
+static stbtt_int32 stbtt__GetCoverageIndex(stbtt_uint8 *coverageTable, int glyph)
+{
+   stbtt_uint16 coverageFormat = ttUSHORT(coverageTable);
+   switch (coverageFormat) {
+      case 1: {
+         stbtt_uint16 glyphCount = ttUSHORT(coverageTable + 2);
+
+         // Binary search.
+         stbtt_int32 l=0, r=glyphCount-1, m;
+         int straw, needle=glyph;
+         while (l <= r) {
+            stbtt_uint8 *glyphArray = coverageTable + 4;
+            stbtt_uint16 glyphID;
+            m = (l + r) >> 1;
+            glyphID = ttUSHORT(glyphArray + 2 * m);
+            straw = glyphID;
+            if (needle < straw)
+               r = m - 1;
+            else if (needle > straw)
+               l = m + 1;
+            else {
+               return m;
+            }
+         }
+         break;
+      }
+
+      case 2: {
+         stbtt_uint16 rangeCount = ttUSHORT(coverageTable + 2);
+         stbtt_uint8 *rangeArray = coverageTable + 4;
+
+         // Binary search.
+         stbtt_int32 l=0, r=rangeCount-1, m;
+         int strawStart, strawEnd, needle=glyph;
+         while (l <= r) {
+            stbtt_uint8 *rangeRecord;
+            m = (l + r) >> 1;
+            rangeRecord = rangeArray + 6 * m;
+            strawStart = ttUSHORT(rangeRecord);
+            strawEnd = ttUSHORT(rangeRecord + 2);
+            if (needle < strawStart)
+               r = m - 1;
+            else if (needle > strawEnd)
+               l = m + 1;
+            else {
+               stbtt_uint16 startCoverageIndex = ttUSHORT(rangeRecord + 4);
+               return startCoverageIndex + glyph - strawStart;
+            }
+         }
+         break;
+      }
+
+      default: return -1; // unsupported
+   }
+
+   return -1;
+}
+
+static stbtt_int32  stbtt__GetGlyphClass(stbtt_uint8 *classDefTable, int glyph)
+{
+   stbtt_uint16 classDefFormat = ttUSHORT(classDefTable);
+   switch (classDefFormat)
+   {
+      case 1: {
+         stbtt_uint16 startGlyphID = ttUSHORT(classDefTable + 2);
+         stbtt_uint16 glyphCount = ttUSHORT(classDefTable + 4);
+         stbtt_uint8 *classDef1ValueArray = classDefTable + 6;
+
+         if (glyph >= startGlyphID && glyph < startGlyphID + glyphCount)
+            return (stbtt_int32)ttUSHORT(classDef1ValueArray + 2 * (glyph - startGlyphID));
+         break;
+      }
+
+      case 2: {
+         stbtt_uint16 classRangeCount = ttUSHORT(classDefTable + 2);
+         stbtt_uint8 *classRangeRecords = classDefTable + 4;
+
+         // Binary search.
+         stbtt_int32 l=0, r=classRangeCount-1, m;
+         int strawStart, strawEnd, needle=glyph;
+         while (l <= r) {
+            stbtt_uint8 *classRangeRecord;
+            m = (l + r) >> 1;
+            classRangeRecord = classRangeRecords + 6 * m;
+            strawStart = ttUSHORT(classRangeRecord);
+            strawEnd = ttUSHORT(classRangeRecord + 2);
+            if (needle < strawStart)
+               r = m - 1;
+            else if (needle > strawEnd)
+               l = m + 1;
+            else
+               return (stbtt_int32)ttUSHORT(classRangeRecord + 4);
+         }
+         break;
+      }
+
+      default:
+         return -1; // Unsupported definition type, return an error.
+   }
+
+   // "All glyphs not assigned to a class fall into class 0". (OpenType spec)
+   return 0;
+}
+
+// Define to STBTT_assert(x) if you want to break on unimplemented formats.
+#define STBTT_GPOS_TODO_assert(x)
+
+static stbtt_int32 stbtt__GetGlyphGPOSInfoAdvance(const stbtt_fontinfo *info, int glyph1, int glyph2)
+{
+   stbtt_uint16 lookupListOffset;
+   stbtt_uint8 *lookupList;
+   stbtt_uint16 lookupCount;
+   stbtt_uint8 *data;
+   stbtt_int32 i, sti;
+
+   if (!info->gpos) return 0;
+
+   data = info->data + info->gpos;
+
+   if (ttUSHORT(data+0) != 1) return 0; // Major version 1
+   if (ttUSHORT(data+2) != 0) return 0; // Minor version 0
+
+   lookupListOffset = ttUSHORT(data+8);
+   lookupList = data + lookupListOffset;
+   lookupCount = ttUSHORT(lookupList);
+
+   for (i=0; i<lookupCount; ++i) {
+      stbtt_uint16 lookupOffset = ttUSHORT(lookupList + 2 + 2 * i);
+      stbtt_uint8 *lookupTable = lookupList + lookupOffset;
+
+      stbtt_uint16 lookupType = ttUSHORT(lookupTable);
+      stbtt_uint16 subTableCount = ttUSHORT(lookupTable + 4);
+      stbtt_uint8 *subTableOffsets = lookupTable + 6;
+      if (lookupType != 2) // Pair Adjustment Positioning Subtable
+         continue;
+
+      for (sti=0; sti<subTableCount; sti++) {
+         stbtt_uint16 subtableOffset = ttUSHORT(subTableOffsets + 2 * sti);
+         stbtt_uint8 *table = lookupTable + subtableOffset;
+         stbtt_uint16 posFormat = ttUSHORT(table);
+         stbtt_uint16 coverageOffset = ttUSHORT(table + 2);
+         stbtt_int32 coverageIndex = stbtt__GetCoverageIndex(table + coverageOffset, glyph1);
+         if (coverageIndex == -1) continue;
+
+         switch (posFormat) {
+            case 1: {
+               stbtt_int32 l, r, m;
+               int straw, needle;
+               stbtt_uint16 valueFormat1 = ttUSHORT(table + 4);
+               stbtt_uint16 valueFormat2 = ttUSHORT(table + 6);
+               if (valueFormat1 == 4 && valueFormat2 == 0) { // Support more formats?
+                  stbtt_int32 valueRecordPairSizeInBytes = 2;
+                  stbtt_uint16 pairSetCount = ttUSHORT(table + 8);
+                  stbtt_uint16 pairPosOffset = ttUSHORT(table + 10 + 2 * coverageIndex);
+                  stbtt_uint8 *pairValueTable = table + pairPosOffset;
+                  stbtt_uint16 pairValueCount = ttUSHORT(pairValueTable);
+                  stbtt_uint8 *pairValueArray = pairValueTable + 2;
+
+                  if (coverageIndex >= pairSetCount) return 0;
+
+                  needle=glyph2;
+                  r=pairValueCount-1;
+                  l=0;
+
+                  // Binary search.
+                  while (l <= r) {
+                     stbtt_uint16 secondGlyph;
+                     stbtt_uint8 *pairValue;
+                     m = (l + r) >> 1;
+                     pairValue = pairValueArray + (2 + valueRecordPairSizeInBytes) * m;
+                     secondGlyph = ttUSHORT(pairValue);
+                     straw = secondGlyph;
+                     if (needle < straw)
+                        r = m - 1;
+                     else if (needle > straw)
+                        l = m + 1;
+                     else {
+                        stbtt_int16 xAdvance = ttSHORT(pairValue + 2);
+                        return xAdvance;
+                     }
+                  }
+               } else
+                  return 0;
+               break;
+            }
+
+            case 2: {
+               stbtt_uint16 valueFormat1 = ttUSHORT(table + 4);
+               stbtt_uint16 valueFormat2 = ttUSHORT(table + 6);
+               if (valueFormat1 == 4 && valueFormat2 == 0) { // Support more formats?
+                  stbtt_uint16 classDef1Offset = ttUSHORT(table + 8);
+                  stbtt_uint16 classDef2Offset = ttUSHORT(table + 10);
+                  int glyph1class = stbtt__GetGlyphClass(table + classDef1Offset, glyph1);
+                  int glyph2class = stbtt__GetGlyphClass(table + classDef2Offset, glyph2);
+
+                  stbtt_uint16 class1Count = ttUSHORT(table + 12);
+                  stbtt_uint16 class2Count = ttUSHORT(table + 14);
+                  stbtt_uint8 *class1Records, *class2Records;
+                  stbtt_int16 xAdvance;
+
+                  if (glyph1class < 0 || glyph1class >= class1Count) return 0; // malformed
+                  if (glyph2class < 0 || glyph2class >= class2Count) return 0; // malformed
+
+                  class1Records = table + 16;
+                  class2Records = class1Records + 2 * (glyph1class * class2Count);
+                  xAdvance = ttSHORT(class2Records + 2 * glyph2class);
+                  return xAdvance;
+               } else
+                  return 0;
+               break;
+            }
+
+            default:
+               return 0; // Unsupported position format
+         }
+      }
+   }
+
+   return 0;
+}
+
+STBTT_DEF int  stbtt_GetGlyphKernAdvance(const stbtt_fontinfo *info, int g1, int g2)
+{
+   int xAdvance = 0;
+
+   if (info->gpos)
+      xAdvance += stbtt__GetGlyphGPOSInfoAdvance(info, g1, g2);
+   else if (info->kern)
+      xAdvance += stbtt__GetGlyphKernInfoAdvance(info, g1, g2);
+
+   return xAdvance;
+}
+
 STBTT_DEF int  stbtt_GetCodepointKernAdvance(const stbtt_fontinfo *info, int ch1, int ch2)
 {
-   if (!info->kern) // if no kerning table, don't waste time looking up both codepoint->glyphs
+   if (!info->kern && !info->gpos) // if no kerning table, don't waste time looking up both codepoint->glyphs
       return 0;
    return stbtt_GetGlyphKernAdvance(info, stbtt_FindGlyphIndex(info,ch1), stbtt_FindGlyphIndex(info,ch2));
 }
@@ -1541,6 +2638,17 @@ STBTT_DEF void stbtt_GetFontVMetrics(const stbtt_fontinfo *info, int *ascent, in
    if (lineGap) *lineGap = ttSHORT(info->data+info->hhea + 8);
 }
 
+STBTT_DEF int  stbtt_GetFontVMetricsOS2(const stbtt_fontinfo *info, int *typoAscent, int *typoDescent, int *typoLineGap)
+{
+   int tab = stbtt__find_table(info->data, info->fontstart, "OS/2");
+   if (!tab)
+      return 0;
+   if (typoAscent ) *typoAscent  = ttSHORT(info->data+tab + 68);
+   if (typoDescent) *typoDescent = ttSHORT(info->data+tab + 70);
+   if (typoLineGap) *typoLineGap = ttSHORT(info->data+tab + 72);
+   return 1;
+}
+
 STBTT_DEF void stbtt_GetFontBoundingBox(const stbtt_fontinfo *info, int *x0, int *y0, int *x1, int *y1)
 {
    *x0 = ttSHORT(info->data + info->head + 36);
@@ -1566,6 +2674,45 @@ STBTT_DEF void stbtt_FreeShape(const stbtt_fontinfo *info, stbtt_vertex *v)
    STBTT_free(v, info->userdata);
 }
 
+STBTT_DEF stbtt_uint8 *stbtt_FindSVGDoc(const stbtt_fontinfo *info, int gl)
+{
+   int i;
+   stbtt_uint8 *data = info->data;
+   stbtt_uint8 *svg_doc_list = data + stbtt__get_svg((stbtt_fontinfo *) info);
+
+   int numEntries = ttUSHORT(svg_doc_list);
+   stbtt_uint8 *svg_docs = svg_doc_list + 2;
+
+   for(i=0; i<numEntries; i++) {
+      stbtt_uint8 *svg_doc = svg_docs + (12 * i);
+      if ((gl >= ttUSHORT(svg_doc)) && (gl <= ttUSHORT(svg_doc + 2)))
+         return svg_doc;
+   }
+   return 0;
+}
+
+STBTT_DEF int stbtt_GetGlyphSVG(const stbtt_fontinfo *info, int gl, const char **svg)
+{
+   stbtt_uint8 *data = info->data;
+   stbtt_uint8 *svg_doc;
+
+   if (info->svg == 0)
+      return 0;
+
+   svg_doc = stbtt_FindSVGDoc(info, gl);
+   if (svg_doc != NULL) {
+      *svg = (char *) data + info->svg + ttULONG(svg_doc + 4);
+      return ttULONG(svg_doc + 8);
+   } else {
+      return 0;
+   }
+}
+
+STBTT_DEF int stbtt_GetCodepointSVG(const stbtt_fontinfo *info, int unicode_codepoint, const char **svg)
+{
+   return stbtt_GetGlyphSVG(info, stbtt_FindGlyphIndex(info, unicode_codepoint), svg);
+}
+
 //////////////////////////////////////////////////////////////////////////////
 //
 // antialiasing software rasterizer
@@ -1637,7 +2784,7 @@ static void *stbtt__hheap_alloc(stbtt__hheap *hh, size_t size, void *userdata)
          hh->num_remaining_in_head_chunk = count;
       }
       --hh->num_remaining_in_head_chunk;
-      return (char *) (hh->head) + size * hh->num_remaining_in_head_chunk;
+      return (char *) (hh->head) + sizeof(stbtt__hheap_chunk) + size * hh->num_remaining_in_head_chunk;
    }
 }
 
@@ -1691,7 +2838,7 @@ static stbtt__active_edge *stbtt__new_active(stbtt__hheap *hh, stbtt__edge *e, i
    float dxdy = (e->x1 - e->x0) / (e->y1 - e->y0);
    STBTT_assert(z != NULL);
    if (!z) return z;
-   
+
    // round dx down to avoid overshooting
    if (dxdy < 0)
       z->dx = -STBTT_ifloor(STBTT_FIX * -dxdy);
@@ -1769,7 +2916,7 @@ static void stbtt__fill_active_edges(unsigned char *scanline, int len, stbtt__ac
             }
          }
       }
-      
+
       e = e->next;
    }
 }
@@ -1915,6 +3062,23 @@ static void stbtt__handle_clipped_edge(float *scanline, int x, stbtt__active_edg
    }
 }
 
+static float stbtt__sized_trapezoid_area(float height, float top_width, float bottom_width)
+{
+   STBTT_assert(top_width >= 0);
+   STBTT_assert(bottom_width >= 0);
+   return (top_width + bottom_width) / 2.0f * height;
+}
+
+static float stbtt__position_trapezoid_area(float height, float tx0, float tx1, float bx0, float bx1)
+{
+   return stbtt__sized_trapezoid_area(height, tx1 - tx0, bx1 - bx0);
+}
+
+static float stbtt__sized_triangle_area(float height, float width)
+{
+   return height * width / 2;
+}
+
 static void stbtt__fill_active_edges_new(float *scanline, float *scanline_fill, int len, stbtt__active_edge *e, float y_top)
 {
    float y_bottom = y_top+1;
@@ -1969,13 +3133,13 @@ static void stbtt__fill_active_edges_new(float *scanline, float *scanline_fill,
                float height;
                // simple case, only spans one pixel
                int x = (int) x_top;
-               height = sy1 - sy0;
+               height = (sy1 - sy0) * e->direction;
                STBTT_assert(x >= 0 && x < len);
-               scanline[x] += e->direction * (1-((x_top - x) + (x_bottom-x))/2)  * height;
-               scanline_fill[x] += e->direction * height; // everything right of this pixel is filled
+               scanline[x]      += stbtt__position_trapezoid_area(height, x_top, x+1.0f, x_bottom, x+1.0f);
+               scanline_fill[x] += height; // everything right of this pixel is filled
             } else {
                int x,x1,x2;
-               float y_crossing, step, sign, area;
+               float y_crossing, y_final, step, sign, area;
                // covers 2+ pixels
                if (x_top > x_bottom) {
                   // flip scanline vertically; signed area is the same
@@ -1988,29 +3152,79 @@ static void stbtt__fill_active_edges_new(float *scanline, float *scanline_fill,
                   dy = -dy;
                   t = x0, x0 = xb, xb = t;
                }
+               STBTT_assert(dy >= 0);
+               STBTT_assert(dx >= 0);
 
                x1 = (int) x_top;
                x2 = (int) x_bottom;
                // compute intersection with y axis at x1+1
-               y_crossing = (x1+1 - x0) * dy + y_top;
+               y_crossing = y_top + dy * (x1+1 - x0);
+
+               // compute intersection with y axis at x2
+               y_final = y_top + dy * (x2 - x0);
+
+               //           x1    x_top                            x2    x_bottom
+               //     y_top  +------|-----+------------+------------+--------|---+------------+
+               //            |            |            |            |            |            |
+               //            |            |            |            |            |            |
+               //       sy0  |      Txxxxx|............|............|............|............|
+               // y_crossing |            *xxxxx.......|............|............|............|
+               //            |            |     xxxxx..|............|............|............|
+               //            |            |     /-   xx*xxxx........|............|............|
+               //            |            | dy <       |    xxxxxx..|............|............|
+               //   y_final  |            |     \-     |          xx*xxx.........|............|
+               //       sy1  |            |            |            |   xxxxxB...|............|
+               //            |            |            |            |            |            |
+               //            |            |            |            |            |            |
+               //  y_bottom  +------------+------------+------------+------------+------------+
+               //
+               // goal is to measure the area covered by '.' in each pixel
+
+               // if x2 is right at the right edge of x1, y_crossing can blow up, github #1057
+               // @TODO: maybe test against sy1 rather than y_bottom?
+               if (y_crossing > y_bottom)
+                  y_crossing = y_bottom;
 
                sign = e->direction;
-               // area of the rectangle covered from y0..y_crossing
+
+               // area of the rectangle covered from sy0..y_crossing
                area = sign * (y_crossing-sy0);
-               // area of the triangle (x_top,y0), (x+1,y0), (x+1,y_crossing)
-               scanline[x1] += area * (1-((x_top - x1)+(x1+1-x1))/2);
 
-               step = sign * dy;
+               // area of the triangle (x_top,sy0), (x1+1,sy0), (x1+1,y_crossing)
+               scanline[x1] += stbtt__sized_triangle_area(area, x1+1 - x_top);
+
+               // check if final y_crossing is blown up; no test case for this
+               if (y_final > y_bottom) {
+                  y_final = y_bottom;
+                  dy = (y_final - y_crossing ) / (x2 - (x1+1)); // if denom=0, y_final = y_crossing, so y_final <= y_bottom
+               }
+
+               // in second pixel, area covered by line segment found in first pixel
+               // is always a rectangle 1 wide * the height of that line segment; this
+               // is exactly what the variable 'area' stores. it also gets a contribution
+               // from the line segment within it. the THIRD pixel will get the first
+               // pixel's rectangle contribution, the second pixel's rectangle contribution,
+               // and its own contribution. the 'own contribution' is the same in every pixel except
+               // the leftmost and rightmost, a trapezoid that slides down in each pixel.
+               // the second pixel's contribution to the third pixel will be the
+               // rectangle 1 wide times the height change in the second pixel, which is dy.
+
+               step = sign * dy * 1; // dy is dy/dx, change in y for every 1 change in x,
+               // which multiplied by 1-pixel-width is how much pixel area changes for each step in x
+               // so the area advances by 'step' every time
+
                for (x = x1+1; x < x2; ++x) {
-                  scanline[x] += area + step/2;
+                  scanline[x] += area + step/2; // area of trapezoid is 1*step/2
                   area += step;
                }
-               y_crossing += dy * (x2 - (x1+1));
-
-               STBTT_assert(STBTT_fabs(area) <= 1.01f);
+               STBTT_assert(STBTT_fabs(area) <= 1.01f); // accumulated error from area += step unless we round step down
+               STBTT_assert(sy1 > y_final-0.01f);
 
-               scanline[x2] += area + sign * (1-((x2-x2)+(x_bottom-x2))/2) * (sy1-y_crossing);
+               // area covered in the last pixel is the rectangle from all the pixels to the left,
+               // plus the trapezoid filled by the line segment in this pixel all the way to the right edge
+               scanline[x2] += area + sign * stbtt__position_trapezoid_area(sy1-y_final, (float) x2, x2+1.0f, x_bottom, x2+1.0f);
 
+               // the rest of the line is filled based on the total height of the line segment in this pixel
                scanline_fill[x2] += sign * (sy1-sy0);
             }
          } else {
@@ -2018,6 +3232,9 @@ static void stbtt__fill_active_edges_new(float *scanline, float *scanline_fill,
             // clipping logic. since this does not match the intended use
             // of this library, we use a different, very slow brute
             // force implementation
+            // note though that this does happen some of the time because
+            // x_top and x_bottom can be extrapolated at the top & bottom of
+            // the shape and actually lie outside the bounding box
             int x;
             for (x=0; x < len; ++x) {
                // cases:
@@ -2033,19 +3250,18 @@ static void stbtt__fill_active_edges_new(float *scanline, float *scanline_fill,
                // from the other y segment, and it might ignored as an empty segment. to avoid
                // that, we need to explicitly produce segments based on x positions.
 
-               // rename variables to clear pairs
+               // rename variables to clearly-defined pairs
                float y0 = y_top;
                float x1 = (float) (x);
                float x2 = (float) (x+1);
                float x3 = xb;
                float y3 = y_bottom;
-               float y1,y2;
 
                // x = e->x + e->dx * (y-y_top)
                // (y-y_top) = (x - e->x) / e->dx
                // y = (x - e->x) / e->dx + y_top
-               y1 = (x - x0) / dx + y_top;
-               y2 = (x+1 - x0) / dx + y_top;
+               float y1 = (x - x0) / dx + y_top;
+               float y2 = (x+1 - x0) / dx + y_top;
 
                if (x0 < x1 && x3 > x2) {         // three segments descending down-right
                   stbtt__handle_clipped_edge(scanline,x,e, x0,y0, x1,y1);
@@ -2125,7 +3341,13 @@ static void stbtt__rasterize_sorted_edges(stbtt__bitmap *result, stbtt__edge *e,
          if (e->y0 != e->y1) {
             stbtt__active_edge *z = stbtt__new_active(&hh, e, off_x, scan_y_top, userdata);
             if (z != NULL) {
-               STBTT_assert(z->ey >= scan_y_top);
+               if (j == 0 && off_y != 0) {
+                  if (z->ey < scan_y_top) {
+                     // this can happen due to subpixel positioning and some kind of fp rounding error i think
+                     z->ey = scan_y_top;
+                  }
+               }
+               STBTT_assert(z->ey >= scan_y_top); // if we get really unlucky a tiny bit of an edge can be out of bounds
                // insert at front
                z->next = active;
                active = z;
@@ -2194,7 +3416,7 @@ static void stbtt__sort_edges_ins_sort(stbtt__edge *p, int n)
 
 static void stbtt__sort_edges_quicksort(stbtt__edge *p, int n)
 {
-   /* threshhold for transitioning to insertion sort */
+   /* threshold for transitioning to insertion sort */
    while (n > 12) {
       stbtt__edge t;
       int c01,c12,c,m,i,j;
@@ -2329,7 +3551,7 @@ static void stbtt__add_point(stbtt__point *points, int n, float x, float y)
    points[n].y = y;
 }
 
-// tesselate until threshhold p is happy... @TODO warped to compensate for non-linear stretching
+// tessellate until threshold p is happy... @TODO warped to compensate for non-linear stretching
 static int stbtt__tesselate_curve(stbtt__point *points, int *num_points, float x0, float y0, float x1, float y1, float x2, float y2, float objspace_flatness_squared, int n)
 {
    // midpoint
@@ -2350,6 +3572,48 @@ static int stbtt__tesselate_curve(stbtt__point *points, int *num_points, float x
    return 1;
 }
 
+static void stbtt__tesselate_cubic(stbtt__point *points, int *num_points, float x0, float y0, float x1, float y1, float x2, float y2, float x3, float y3, float objspace_flatness_squared, int n)
+{
+   // @TODO this "flatness" calculation is just made-up nonsense that seems to work well enough
+   float dx0 = x1-x0;
+   float dy0 = y1-y0;
+   float dx1 = x2-x1;
+   float dy1 = y2-y1;
+   float dx2 = x3-x2;
+   float dy2 = y3-y2;
+   float dx = x3-x0;
+   float dy = y3-y0;
+   float longlen = (float) (STBTT_sqrt(dx0*dx0+dy0*dy0)+STBTT_sqrt(dx1*dx1+dy1*dy1)+STBTT_sqrt(dx2*dx2+dy2*dy2));
+   float shortlen = (float) STBTT_sqrt(dx*dx+dy*dy);
+   float flatness_squared = longlen*longlen-shortlen*shortlen;
+
+   if (n > 16) // 65536 segments on one curve better be enough!
+      return;
+
+   if (flatness_squared > objspace_flatness_squared) {
+      float x01 = (x0+x1)/2;
+      float y01 = (y0+y1)/2;
+      float x12 = (x1+x2)/2;
+      float y12 = (y1+y2)/2;
+      float x23 = (x2+x3)/2;
+      float y23 = (y2+y3)/2;
+
+      float xa = (x01+x12)/2;
+      float ya = (y01+y12)/2;
+      float xb = (x12+x23)/2;
+      float yb = (y12+y23)/2;
+
+      float mx = (xa+xb)/2;
+      float my = (ya+yb)/2;
+
+      stbtt__tesselate_cubic(points, num_points, x0,y0, x01,y01, xa,ya, mx,my, objspace_flatness_squared,n+1);
+      stbtt__tesselate_cubic(points, num_points, mx,my, xb,yb, x23,y23, x3,y3, objspace_flatness_squared,n+1);
+   } else {
+      stbtt__add_point(points, *num_points,x3,y3);
+      *num_points = *num_points+1;
+   }
+}
+
 // returns number of contours
 static stbtt__point *stbtt_FlattenCurves(stbtt_vertex *vertices, int num_verts, float objspace_flatness, int **contour_lengths, int *num_contours, void *userdata)
 {
@@ -2406,6 +3670,14 @@ static stbtt__point *stbtt_FlattenCurves(stbtt_vertex *vertices, int num_verts,
                                         objspace_flatness_squared, 0);
                x = vertices[i].x, y = vertices[i].y;
                break;
+            case STBTT_vcubic:
+               stbtt__tesselate_cubic(points, &num_points, x,y,
+                                        vertices[i].cx, vertices[i].cy,
+                                        vertices[i].cx1, vertices[i].cy1,
+                                        vertices[i].x,  vertices[i].y,
+                                        objspace_flatness_squared, 0);
+               x = vertices[i].x, y = vertices[i].y;
+               break;
          }
       }
       (*contour_lengths)[n] = num_points - start;
@@ -2422,8 +3694,9 @@ static stbtt__point *stbtt_FlattenCurves(stbtt_vertex *vertices, int num_verts,
 
 STBTT_DEF void stbtt_Rasterize(stbtt__bitmap *result, float flatness_in_pixels, stbtt_vertex *vertices, int num_verts, float scale_x, float scale_y, float shift_x, float shift_y, int x_off, int y_off, int invert, void *userdata)
 {
-   float scale = scale_x > scale_y ? scale_y : scale_x;
-   int winding_count, *winding_lengths;
+   float scale            = scale_x > scale_y ? scale_y : scale_x;
+   int winding_count      = 0;
+   int *winding_lengths   = NULL;
    stbtt__point *windings = stbtt_FlattenCurves(vertices, num_verts, flatness_in_pixels / scale, &winding_lengths, &winding_count, userdata);
    if (windings) {
       stbtt__rasterize(result, windings, winding_lengths, winding_count, scale_x, scale_y, shift_x, shift_y, x_off, y_off, invert, userdata);
@@ -2441,7 +3714,7 @@ STBTT_DEF unsigned char *stbtt_GetGlyphBitmapSubpixel(const stbtt_fontinfo *info
 {
    int ix0,iy0,ix1,iy1;
    stbtt__bitmap gbm;
-   stbtt_vertex *vertices;   
+   stbtt_vertex *vertices;
    int num_verts = stbtt_GetGlyphShape(info, glyph, &vertices);
 
    if (scale_x == 0) scale_x = scale_y;
@@ -2464,7 +3737,7 @@ STBTT_DEF unsigned char *stbtt_GetGlyphBitmapSubpixel(const stbtt_fontinfo *info
    if (height) *height = gbm.h;
    if (xoff  ) *xoff   = ix0;
    if (yoff  ) *yoff   = iy0;
-   
+
    if (gbm.w && gbm.h) {
       gbm.pixels = (unsigned char *) STBTT_malloc(gbm.w * gbm.h, info->userdata);
       if (gbm.pixels) {
@@ -2475,7 +3748,7 @@ STBTT_DEF unsigned char *stbtt_GetGlyphBitmapSubpixel(const stbtt_fontinfo *info
    }
    STBTT_free(vertices, info->userdata);
    return gbm.pixels;
-}   
+}
 
 STBTT_DEF unsigned char *stbtt_GetGlyphBitmap(const stbtt_fontinfo *info, float scale_x, float scale_y, int glyph, int *width, int *height, int *xoff, int *yoff)
 {
@@ -2487,7 +3760,7 @@ STBTT_DEF void stbtt_MakeGlyphBitmapSubpixel(const stbtt_fontinfo *info, unsigne
    int ix0,iy0;
    stbtt_vertex *vertices;
    int num_verts = stbtt_GetGlyphShape(info, glyph, &vertices);
-   stbtt__bitmap gbm;   
+   stbtt__bitmap gbm;
 
    stbtt_GetGlyphBitmapBoxSubpixel(info, glyph, scale_x, scale_y, shift_x, shift_y, &ix0,&iy0,0,0);
    gbm.pixels = output;
@@ -2509,7 +3782,12 @@ STBTT_DEF void stbtt_MakeGlyphBitmap(const stbtt_fontinfo *info, unsigned char *
 STBTT_DEF unsigned char *stbtt_GetCodepointBitmapSubpixel(const stbtt_fontinfo *info, float scale_x, float scale_y, float shift_x, float shift_y, int codepoint, int *width, int *height, int *xoff, int *yoff)
 {
    return stbtt_GetGlyphBitmapSubpixel(info, scale_x, scale_y,shift_x,shift_y, stbtt_FindGlyphIndex(info,codepoint), width,height,xoff,yoff);
-}   
+}
+
+STBTT_DEF void stbtt_MakeCodepointBitmapSubpixelPrefilter(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, float shift_x, float shift_y, int oversample_x, int oversample_y, float *sub_x, float *sub_y, int codepoint)
+{
+   stbtt_MakeGlyphBitmapSubpixelPrefilter(info, output, out_w, out_h, out_stride, scale_x, scale_y, shift_x, shift_y, oversample_x, oversample_y, sub_x, sub_y, stbtt_FindGlyphIndex(info,codepoint));
+}
 
 STBTT_DEF void stbtt_MakeCodepointBitmapSubpixel(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, float shift_x, float shift_y, int codepoint)
 {
@@ -2519,7 +3797,7 @@ STBTT_DEF void stbtt_MakeCodepointBitmapSubpixel(const stbtt_fontinfo *info, uns
 STBTT_DEF unsigned char *stbtt_GetCodepointBitmap(const stbtt_fontinfo *info, float scale_x, float scale_y, int codepoint, int *width, int *height, int *xoff, int *yoff)
 {
    return stbtt_GetCodepointBitmapSubpixel(info, scale_x, scale_y, 0.0f,0.0f, codepoint, width,height,xoff,yoff);
-}   
+}
 
 STBTT_DEF void stbtt_MakeCodepointBitmap(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, int codepoint)
 {
@@ -2532,7 +3810,7 @@ STBTT_DEF void stbtt_MakeCodepointBitmap(const stbtt_fontinfo *info, unsigned ch
 //
 // This is SUPER-CRAPPY packing to keep source code small
 
-STBTT_DEF int stbtt_BakeFontBitmap(const unsigned char *data, int offset,  // font location (use offset=0 for plain .ttf)
+static int stbtt_BakeFontBitmap_internal(unsigned char *data, int offset,  // font location (use offset=0 for plain .ttf)
                                 float pixel_height,                     // height of font in pixels
                                 unsigned char *pixels, int pw, int ph,  // bitmap to be filled in
                                 int first_char, int num_chars,          // characters to bake
@@ -2578,11 +3856,11 @@ STBTT_DEF int stbtt_BakeFontBitmap(const unsigned char *data, int offset,  // fo
    return bottom_y;
 }
 
-STBTT_DEF void stbtt_GetBakedQuad(stbtt_bakedchar *chardata, int pw, int ph, int char_index, float *xpos, float *ypos, stbtt_aligned_quad *q, int opengl_fillrule)
+STBTT_DEF void stbtt_GetBakedQuad(const stbtt_bakedchar *chardata, int pw, int ph, int char_index, float *xpos, float *ypos, stbtt_aligned_quad *q, int opengl_fillrule)
 {
    float d3d_bias = opengl_fillrule ? 0 : -0.5f;
    float ipw = 1.0f / pw, iph = 1.0f / ph;
-   stbtt_bakedchar *b = chardata + char_index;
+   const stbtt_bakedchar *b = chardata + char_index;
    int round_x = STBTT_ifloor((*xpos + b->xoff) + 0.5f);
    int round_y = STBTT_ifloor((*ypos + b->yoff) + 0.5f);
 
@@ -2644,7 +3922,7 @@ static void stbrp_init_target(stbrp_context *con, int pw, int ph, stbrp_node *no
    con->y = 0;
    con->bottom_y = 0;
    STBTT__NOTUSED(nodes);
-   STBTT__NOTUSED(num_nodes);   
+   STBTT__NOTUSED(num_nodes);
 }
 
 static void stbrp_pack_rects(stbrp_context *con, stbrp_rect *rects, int num_rects)
@@ -2698,6 +3976,7 @@ STBTT_DEF int stbtt_PackBegin(stbtt_pack_context *spc, unsigned char *pixels, in
    spc->stride_in_bytes = stride_in_bytes != 0 ? stride_in_bytes : pw;
    spc->h_oversample = 1;
    spc->v_oversample = 1;
+   spc->skip_missing = 0;
 
    stbrp_init_target(context, pw-padding, ph-padding, nodes, num_nodes);
 
@@ -2723,6 +4002,11 @@ STBTT_DEF void stbtt_PackSetOversampling(stbtt_pack_context *spc, unsigned int h
       spc->v_oversample = v_oversample;
 }
 
+STBTT_DEF void stbtt_PackSetSkipMissingCodepoints(stbtt_pack_context *spc, int skip)
+{
+   spc->skip_missing = skip;
+}
+
 #define STBTT__OVER_MASK  (STBTT_MAX_OVERSAMPLE-1)
 
 static void stbtt__h_prefilter(unsigned char *pixels, int w, int h, int stride_in_bytes, unsigned int kernel_width)
@@ -2862,9 +4146,10 @@ static float stbtt__oversample_shift(int oversample)
 }
 
 // rects array must be big enough to accommodate all characters in the given ranges
-STBTT_DEF int stbtt_PackFontRangesGatherRects(stbtt_pack_context *spc, stbtt_fontinfo *info, stbtt_pack_range *ranges, int num_ranges, stbrp_rect *rects)
+STBTT_DEF int stbtt_PackFontRangesGatherRects(stbtt_pack_context *spc, const stbtt_fontinfo *info, stbtt_pack_range *ranges, int num_ranges, stbrp_rect *rects)
 {
    int i,j,k;
+   int missing_glyph_added = 0;
 
    k=0;
    for (i=0; i < num_ranges; ++i) {
@@ -2876,13 +4161,19 @@ STBTT_DEF int stbtt_PackFontRangesGatherRects(stbtt_pack_context *spc, stbtt_fon
          int x0,y0,x1,y1;
          int codepoint = ranges[i].array_of_unicode_codepoints == NULL ? ranges[i].first_unicode_codepoint_in_range + j : ranges[i].array_of_unicode_codepoints[j];
          int glyph = stbtt_FindGlyphIndex(info, codepoint);
-         stbtt_GetGlyphBitmapBoxSubpixel(info,glyph,
-                                         scale * spc->h_oversample,
-                                         scale * spc->v_oversample,
-                                         0,0,
-                                         &x0,&y0,&x1,&y1);
-         rects[k].w = (stbrp_coord) (x1-x0 + spc->padding + spc->h_oversample-1);
-         rects[k].h = (stbrp_coord) (y1-y0 + spc->padding + spc->v_oversample-1);
+         if (glyph == 0 && (spc->skip_missing || missing_glyph_added)) {
+            rects[k].w = rects[k].h = 0;
+         } else {
+            stbtt_GetGlyphBitmapBoxSubpixel(info,glyph,
+                                            scale * spc->h_oversample,
+                                            scale * spc->v_oversample,
+                                            0,0,
+                                            &x0,&y0,&x1,&y1);
+            rects[k].w = (stbrp_coord) (x1-x0 + spc->padding + spc->h_oversample-1);
+            rects[k].h = (stbrp_coord) (y1-y0 + spc->padding + spc->v_oversample-1);
+            if (glyph == 0)
+               missing_glyph_added = 1;
+         }
          ++k;
       }
    }
@@ -2890,10 +4181,33 @@ STBTT_DEF int stbtt_PackFontRangesGatherRects(stbtt_pack_context *spc, stbtt_fon
    return k;
 }
 
+STBTT_DEF void stbtt_MakeGlyphBitmapSubpixelPrefilter(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, float shift_x, float shift_y, int prefilter_x, int prefilter_y, float *sub_x, float *sub_y, int glyph)
+{
+   stbtt_MakeGlyphBitmapSubpixel(info,
+                                 output,
+                                 out_w - (prefilter_x - 1),
+                                 out_h - (prefilter_y - 1),
+                                 out_stride,
+                                 scale_x,
+                                 scale_y,
+                                 shift_x,
+                                 shift_y,
+                                 glyph);
+
+   if (prefilter_x > 1)
+      stbtt__h_prefilter(output, out_w, out_h, out_stride, prefilter_x);
+
+   if (prefilter_y > 1)
+      stbtt__v_prefilter(output, out_w, out_h, out_stride, prefilter_y);
+
+   *sub_x = stbtt__oversample_shift(prefilter_x);
+   *sub_y = stbtt__oversample_shift(prefilter_y);
+}
+
 // rects array must be big enough to accommodate all characters in the given ranges
-STBTT_DEF int stbtt_PackFontRangesRenderIntoRects(stbtt_pack_context *spc, stbtt_fontinfo *info, stbtt_pack_range *ranges, int num_ranges, stbrp_rect *rects)
+STBTT_DEF int stbtt_PackFontRangesRenderIntoRects(stbtt_pack_context *spc, const stbtt_fontinfo *info, stbtt_pack_range *ranges, int num_ranges, stbrp_rect *rects)
 {
-   int i,j,k, return_value = 1;
+   int i,j,k, missing_glyph = -1, return_value = 1;
 
    // save current values
    int old_h_over = spc->h_oversample;
@@ -2912,7 +4226,7 @@ STBTT_DEF int stbtt_PackFontRangesRenderIntoRects(stbtt_pack_context *spc, stbtt
       sub_y = stbtt__oversample_shift(spc->v_oversample);
       for (j=0; j < ranges[i].num_chars; ++j) {
          stbrp_rect *r = &rects[k];
-         if (r->was_packed) {
+         if (r->was_packed && r->w != 0 && r->h != 0) {
             stbtt_packedchar *bc = &ranges[i].chardata_for_range[j];
             int advance, lsb, x0,y0,x1,y1;
             int codepoint = ranges[i].array_of_unicode_codepoints == NULL ? ranges[i].first_unicode_codepoint_in_range + j : ranges[i].array_of_unicode_codepoints[j];
@@ -2958,6 +4272,13 @@ STBTT_DEF int stbtt_PackFontRangesRenderIntoRects(stbtt_pack_context *spc, stbtt
             bc->yoff     =       (float)  y0 * recip_v + sub_y;
             bc->xoff2    =                (x0 + r->w) * recip_h + sub_x;
             bc->yoff2    =                (y0 + r->h) * recip_v + sub_y;
+
+            if (glyph == 0)
+               missing_glyph = j;
+         } else if (spc->skip_missing) {
+            return_value = 0;
+         } else if (r->was_packed && r->w == 0 && r->h == 0 && missing_glyph >= 0) {
+            ranges[i].chardata_for_range[j] = ranges[i].chardata_for_range[missing_glyph];
          } else {
             return_value = 0; // if any fail, report failure
          }
@@ -2978,7 +4299,7 @@ STBTT_DEF void stbtt_PackFontRangesPackRects(stbtt_pack_context *spc, stbrp_rect
    stbrp_pack_rects((stbrp_context *) spc->pack_info, rects, num_rects);
 }
 
-STBTT_DEF int stbtt_PackFontRanges(stbtt_pack_context *spc, unsigned char *fontdata, int font_index, stbtt_pack_range *ranges, int num_ranges)
+STBTT_DEF int stbtt_PackFontRanges(stbtt_pack_context *spc, const unsigned char *fontdata, int font_index, stbtt_pack_range *ranges, int num_ranges)
 {
    stbtt_fontinfo info;
    int i,j,n, return_value = 1;
@@ -2996,7 +4317,7 @@ STBTT_DEF int stbtt_PackFontRanges(stbtt_pack_context *spc, unsigned char *fontd
    n = 0;
    for (i=0; i < num_ranges; ++i)
       n += ranges[i].num_chars;
-         
+
    rects = (stbrp_rect *) STBTT_malloc(sizeof(*rects) * n, spc->user_allocator_context);
    if (rects == NULL)
       return 0;
@@ -3007,14 +4328,14 @@ STBTT_DEF int stbtt_PackFontRanges(stbtt_pack_context *spc, unsigned char *fontd
    n = stbtt_PackFontRangesGatherRects(spc, &info, ranges, num_ranges, rects);
 
    stbtt_PackFontRangesPackRects(spc, rects, n);
-  
+
    return_value = stbtt_PackFontRangesRenderIntoRects(spc, &info, ranges, num_ranges, rects);
 
    STBTT_free(rects, spc->user_allocator_context);
    return return_value;
 }
 
-STBTT_DEF int stbtt_PackFontRange(stbtt_pack_context *spc, unsigned char *fontdata, int font_index, float font_size,
+STBTT_DEF int stbtt_PackFontRange(stbtt_pack_context *spc, const unsigned char *fontdata, int font_index, float font_size,
             int first_unicode_codepoint_in_range, int num_chars_in_range, stbtt_packedchar *chardata_for_range)
 {
    stbtt_pack_range range;
@@ -3026,10 +4347,23 @@ STBTT_DEF int stbtt_PackFontRange(stbtt_pack_context *spc, unsigned char *fontda
    return stbtt_PackFontRanges(spc, fontdata, font_index, &range, 1);
 }
 
-STBTT_DEF void stbtt_GetPackedQuad(stbtt_packedchar *chardata, int pw, int ph, int char_index, float *xpos, float *ypos, stbtt_aligned_quad *q, int align_to_integer)
+STBTT_DEF void stbtt_GetScaledFontVMetrics(const unsigned char *fontdata, int index, float size, float *ascent, float *descent, float *lineGap)
+{
+   int i_ascent, i_descent, i_lineGap;
+   float scale;
+   stbtt_fontinfo info;
+   stbtt_InitFont(&info, fontdata, stbtt_GetFontOffsetForIndex(fontdata, index));
+   scale = size > 0 ? stbtt_ScaleForPixelHeight(&info, size) : stbtt_ScaleForMappingEmToPixels(&info, -size);
+   stbtt_GetFontVMetrics(&info, &i_ascent, &i_descent, &i_lineGap);
+   *ascent  = (float) i_ascent  * scale;
+   *descent = (float) i_descent * scale;
+   *lineGap = (float) i_lineGap * scale;
+}
+
+STBTT_DEF void stbtt_GetPackedQuad(const stbtt_packedchar *chardata, int pw, int ph, int char_index, float *xpos, float *ypos, stbtt_aligned_quad *q, int align_to_integer)
 {
    float ipw = 1.0f / pw, iph = 1.0f / ph;
-   stbtt_packedchar *b = chardata + char_index;
+   const stbtt_packedchar *b = chardata + char_index;
 
    if (align_to_integer) {
       float x = (float) STBTT_ifloor((*xpos + b->xoff) + 0.5f);
@@ -3053,6 +4387,385 @@ STBTT_DEF void stbtt_GetPackedQuad(stbtt_packedchar *chardata, int pw, int ph, i
    *xpos += b->xadvance;
 }
 
+//////////////////////////////////////////////////////////////////////////////
+//
+// sdf computation
+//
+
+#define STBTT_min(a,b)  ((a) < (b) ? (a) : (b))
+#define STBTT_max(a,b)  ((a) < (b) ? (b) : (a))
+
+static int stbtt__ray_intersect_bezier(float orig[2], float ray[2], float q0[2], float q1[2], float q2[2], float hits[2][2])
+{
+   float q0perp = q0[1]*ray[0] - q0[0]*ray[1];
+   float q1perp = q1[1]*ray[0] - q1[0]*ray[1];
+   float q2perp = q2[1]*ray[0] - q2[0]*ray[1];
+   float roperp = orig[1]*ray[0] - orig[0]*ray[1];
+
+   float a = q0perp - 2*q1perp + q2perp;
+   float b = q1perp - q0perp;
+   float c = q0perp - roperp;
+
+   float s0 = 0., s1 = 0.;
+   int num_s = 0;
+
+   if (a != 0.0) {
+      float discr = b*b - a*c;
+      if (discr > 0.0) {
+         float rcpna = -1 / a;
+         float d = (float) STBTT_sqrt(discr);
+         s0 = (b+d) * rcpna;
+         s1 = (b-d) * rcpna;
+         if (s0 >= 0.0 && s0 <= 1.0)
+            num_s = 1;
+         if (d > 0.0 && s1 >= 0.0 && s1 <= 1.0) {
+            if (num_s == 0) s0 = s1;
+            ++num_s;
+         }
+      }
+   } else {
+      // 2*b*s + c = 0
+      // s = -c / (2*b)
+      s0 = c / (-2 * b);
+      if (s0 >= 0.0 && s0 <= 1.0)
+         num_s = 1;
+   }
+
+   if (num_s == 0)
+      return 0;
+   else {
+      float rcp_len2 = 1 / (ray[0]*ray[0] + ray[1]*ray[1]);
+      float rayn_x = ray[0] * rcp_len2, rayn_y = ray[1] * rcp_len2;
+
+      float q0d =   q0[0]*rayn_x +   q0[1]*rayn_y;
+      float q1d =   q1[0]*rayn_x +   q1[1]*rayn_y;
+      float q2d =   q2[0]*rayn_x +   q2[1]*rayn_y;
+      float rod = orig[0]*rayn_x + orig[1]*rayn_y;
+
+      float q10d = q1d - q0d;
+      float q20d = q2d - q0d;
+      float q0rd = q0d - rod;
+
+      hits[0][0] = q0rd + s0*(2.0f - 2.0f*s0)*q10d + s0*s0*q20d;
+      hits[0][1] = a*s0+b;
+
+      if (num_s > 1) {
+         hits[1][0] = q0rd + s1*(2.0f - 2.0f*s1)*q10d + s1*s1*q20d;
+         hits[1][1] = a*s1+b;
+         return 2;
+      } else {
+         return 1;
+      }
+   }
+}
+
+static int equal(float *a, float *b)
+{
+   return (a[0] == b[0] && a[1] == b[1]);
+}
+
+static int stbtt__compute_crossings_x(float x, float y, int nverts, stbtt_vertex *verts)
+{
+   int i;
+   float orig[2], ray[2] = { 1, 0 };
+   float y_frac;
+   int winding = 0;
+
+   // make sure y never passes through a vertex of the shape
+   y_frac = (float) STBTT_fmod(y, 1.0f);
+   if (y_frac < 0.01f)
+      y += 0.01f;
+   else if (y_frac > 0.99f)
+      y -= 0.01f;
+
+   orig[0] = x;
+   orig[1] = y;
+
+   // test a ray from (-infinity,y) to (x,y)
+   for (i=0; i < nverts; ++i) {
+      if (verts[i].type == STBTT_vline) {
+         int x0 = (int) verts[i-1].x, y0 = (int) verts[i-1].y;
+         int x1 = (int) verts[i  ].x, y1 = (int) verts[i  ].y;
+         if (y > STBTT_min(y0,y1) && y < STBTT_max(y0,y1) && x > STBTT_min(x0,x1)) {
+            float x_inter = (y - y0) / (y1 - y0) * (x1-x0) + x0;
+            if (x_inter < x)
+               winding += (y0 < y1) ? 1 : -1;
+         }
+      }
+      if (verts[i].type == STBTT_vcurve) {
+         int x0 = (int) verts[i-1].x , y0 = (int) verts[i-1].y ;
+         int x1 = (int) verts[i  ].cx, y1 = (int) verts[i  ].cy;
+         int x2 = (int) verts[i  ].x , y2 = (int) verts[i  ].y ;
+         int ax = STBTT_min(x0,STBTT_min(x1,x2)), ay = STBTT_min(y0,STBTT_min(y1,y2));
+         int by = STBTT_max(y0,STBTT_max(y1,y2));
+         if (y > ay && y < by && x > ax) {
+            float q0[2],q1[2],q2[2];
+            float hits[2][2];
+            q0[0] = (float)x0;
+            q0[1] = (float)y0;
+            q1[0] = (float)x1;
+            q1[1] = (float)y1;
+            q2[0] = (float)x2;
+            q2[1] = (float)y2;
+            if (equal(q0,q1) || equal(q1,q2)) {
+               x0 = (int)verts[i-1].x;
+               y0 = (int)verts[i-1].y;
+               x1 = (int)verts[i  ].x;
+               y1 = (int)verts[i  ].y;
+               if (y > STBTT_min(y0,y1) && y < STBTT_max(y0,y1) && x > STBTT_min(x0,x1)) {
+                  float x_inter = (y - y0) / (y1 - y0) * (x1-x0) + x0;
+                  if (x_inter < x)
+                     winding += (y0 < y1) ? 1 : -1;
+               }
+            } else {
+               int num_hits = stbtt__ray_intersect_bezier(orig, ray, q0, q1, q2, hits);
+               if (num_hits >= 1)
+                  if (hits[0][0] < 0)
+                     winding += (hits[0][1] < 0 ? -1 : 1);
+               if (num_hits >= 2)
+                  if (hits[1][0] < 0)
+                     winding += (hits[1][1] < 0 ? -1 : 1);
+            }
+         }
+      }
+   }
+   return winding;
+}
+
+static float stbtt__cuberoot( float x )
+{
+   if (x<0)
+      return -(float) STBTT_pow(-x,1.0f/3.0f);
+   else
+      return  (float) STBTT_pow( x,1.0f/3.0f);
+}
+
+// x^3 + a*x^2 + b*x + c = 0
+static int stbtt__solve_cubic(float a, float b, float c, float* r)
+{
+   float s = -a / 3;
+   float p = b - a*a / 3;
+   float q = a * (2*a*a - 9*b) / 27 + c;
+   float p3 = p*p*p;
+   float d = q*q + 4*p3 / 27;
+   if (d >= 0) {
+      float z = (float) STBTT_sqrt(d);
+      float u = (-q + z) / 2;
+      float v = (-q - z) / 2;
+      u = stbtt__cuberoot(u);
+      v = stbtt__cuberoot(v);
+      r[0] = s + u + v;
+      return 1;
+   } else {
+      float u = (float) STBTT_sqrt(-p/3);
+      float v = (float) STBTT_acos(-STBTT_sqrt(-27/p3) * q / 2) / 3; // p3 must be negative, since d is negative
+      float m = (float) STBTT_cos(v);
+      float n = (float) STBTT_cos(v-3.141592/2)*1.732050808f;
+      r[0] = s + u * 2 * m;
+      r[1] = s - u * (m + n);
+      r[2] = s - u * (m - n);
+
+      //STBTT_assert( STBTT_fabs(((r[0]+a)*r[0]+b)*r[0]+c) < 0.05f);  // these asserts may not be safe at all scales, though they're in bezier t parameter units so maybe?
+      //STBTT_assert( STBTT_fabs(((r[1]+a)*r[1]+b)*r[1]+c) < 0.05f);
+      //STBTT_assert( STBTT_fabs(((r[2]+a)*r[2]+b)*r[2]+c) < 0.05f);
+      return 3;
+   }
+}
+
+STBTT_DEF unsigned char * stbtt_GetGlyphSDF(const stbtt_fontinfo *info, float scale, int glyph, int padding, unsigned char onedge_value, float pixel_dist_scale, int *width, int *height, int *xoff, int *yoff)
+{
+   float scale_x = scale, scale_y = scale;
+   int ix0,iy0,ix1,iy1;
+   int w,h;
+   unsigned char *data;
+
+   if (scale == 0) return NULL;
+
+   stbtt_GetGlyphBitmapBoxSubpixel(info, glyph, scale, scale, 0.0f,0.0f, &ix0,&iy0,&ix1,&iy1);
+
+   // if empty, return NULL
+   if (ix0 == ix1 || iy0 == iy1)
+      return NULL;
+
+   ix0 -= padding;
+   iy0 -= padding;
+   ix1 += padding;
+   iy1 += padding;
+
+   w = (ix1 - ix0);
+   h = (iy1 - iy0);
+
+   if (width ) *width  = w;
+   if (height) *height = h;
+   if (xoff  ) *xoff   = ix0;
+   if (yoff  ) *yoff   = iy0;
+
+   // invert for y-downwards bitmaps
+   scale_y = -scale_y;
+
+   {
+      int x,y,i,j;
+      float *precompute;
+      stbtt_vertex *verts;
+      int num_verts = stbtt_GetGlyphShape(info, glyph, &verts);
+      data = (unsigned char *) STBTT_malloc(w * h, info->userdata);
+      precompute = (float *) STBTT_malloc(num_verts * sizeof(float), info->userdata);
+
+      for (i=0,j=num_verts-1; i < num_verts; j=i++) {
+         if (verts[i].type == STBTT_vline) {
+            float x0 = verts[i].x*scale_x, y0 = verts[i].y*scale_y;
+            float x1 = verts[j].x*scale_x, y1 = verts[j].y*scale_y;
+            float dist = (float) STBTT_sqrt((x1-x0)*(x1-x0) + (y1-y0)*(y1-y0));
+            precompute[i] = (dist == 0) ? 0.0f : 1.0f / dist;
+         } else if (verts[i].type == STBTT_vcurve) {
+            float x2 = verts[j].x *scale_x, y2 = verts[j].y *scale_y;
+            float x1 = verts[i].cx*scale_x, y1 = verts[i].cy*scale_y;
+            float x0 = verts[i].x *scale_x, y0 = verts[i].y *scale_y;
+            float bx = x0 - 2*x1 + x2, by = y0 - 2*y1 + y2;
+            float len2 = bx*bx + by*by;
+            if (len2 != 0.0f)
+               precompute[i] = 1.0f / (bx*bx + by*by);
+            else
+               precompute[i] = 0.0f;
+         } else
+            precompute[i] = 0.0f;
+      }
+
+      for (y=iy0; y < iy1; ++y) {
+         for (x=ix0; x < ix1; ++x) {
+            float val;
+            float min_dist = 999999.0f;
+            float sx = (float) x + 0.5f;
+            float sy = (float) y + 0.5f;
+            float x_gspace = (sx / scale_x);
+            float y_gspace = (sy / scale_y);
+
+            int winding = stbtt__compute_crossings_x(x_gspace, y_gspace, num_verts, verts); // @OPTIMIZE: this could just be a rasterization, but needs to be line vs. non-tesselated curves so a new path
+
+            for (i=0; i < num_verts; ++i) {
+               float x0 = verts[i].x*scale_x, y0 = verts[i].y*scale_y;
+
+               if (verts[i].type == STBTT_vline && precompute[i] != 0.0f) {
+                  float x1 = verts[i-1].x*scale_x, y1 = verts[i-1].y*scale_y;
+
+                  float dist,dist2 = (x0-sx)*(x0-sx) + (y0-sy)*(y0-sy);
+                  if (dist2 < min_dist*min_dist)
+                     min_dist = (float) STBTT_sqrt(dist2);
+
+                  // coarse culling against bbox
+                  //if (sx > STBTT_min(x0,x1)-min_dist && sx < STBTT_max(x0,x1)+min_dist &&
+                  //    sy > STBTT_min(y0,y1)-min_dist && sy < STBTT_max(y0,y1)+min_dist)
+                  dist = (float) STBTT_fabs((x1-x0)*(y0-sy) - (y1-y0)*(x0-sx)) * precompute[i];
+                  STBTT_assert(i != 0);
+                  if (dist < min_dist) {
+                     // check position along line
+                     // x' = x0 + t*(x1-x0), y' = y0 + t*(y1-y0)
+                     // minimize (x'-sx)*(x'-sx)+(y'-sy)*(y'-sy)
+                     float dx = x1-x0, dy = y1-y0;
+                     float px = x0-sx, py = y0-sy;
+                     // minimize (px+t*dx)^2 + (py+t*dy)^2 = px*px + 2*px*dx*t + t^2*dx*dx + py*py + 2*py*dy*t + t^2*dy*dy
+                     // derivative: 2*px*dx + 2*py*dy + (2*dx*dx+2*dy*dy)*t, set to 0 and solve
+                     float t = -(px*dx + py*dy) / (dx*dx + dy*dy);
+                     if (t >= 0.0f && t <= 1.0f)
+                        min_dist = dist;
+                  }
+               } else if (verts[i].type == STBTT_vcurve) {
+                  float x2 = verts[i-1].x *scale_x, y2 = verts[i-1].y *scale_y;
+                  float x1 = verts[i  ].cx*scale_x, y1 = verts[i  ].cy*scale_y;
+                  float box_x0 = STBTT_min(STBTT_min(x0,x1),x2);
+                  float box_y0 = STBTT_min(STBTT_min(y0,y1),y2);
+                  float box_x1 = STBTT_max(STBTT_max(x0,x1),x2);
+                  float box_y1 = STBTT_max(STBTT_max(y0,y1),y2);
+                  // coarse culling against bbox to avoid computing cubic unnecessarily
+                  if (sx > box_x0-min_dist && sx < box_x1+min_dist && sy > box_y0-min_dist && sy < box_y1+min_dist) {
+                     int num=0;
+                     float ax = x1-x0, ay = y1-y0;
+                     float bx = x0 - 2*x1 + x2, by = y0 - 2*y1 + y2;
+                     float mx = x0 - sx, my = y0 - sy;
+                     float res[3] = {0.f,0.f,0.f};
+                     float px,py,t,it,dist2;
+                     float a_inv = precompute[i];
+                     if (a_inv == 0.0) { // if a_inv is 0, it's 2nd degree so use quadratic formula
+                        float a = 3*(ax*bx + ay*by);
+                        float b = 2*(ax*ax + ay*ay) + (mx*bx+my*by);
+                        float c = mx*ax+my*ay;
+                        if (a == 0.0) { // if a is 0, it's linear
+                           if (b != 0.0) {
+                              res[num++] = -c/b;
+                           }
+                        } else {
+                           float discriminant = b*b - 4*a*c;
+                           if (discriminant < 0)
+                              num = 0;
+                           else {
+                              float root = (float) STBTT_sqrt(discriminant);
+                              res[0] = (-b - root)/(2*a);
+                              res[1] = (-b + root)/(2*a);
+                              num = 2; // don't bother distinguishing 1-solution case, as code below will still work
+                           }
+                        }
+                     } else {
+                        float b = 3*(ax*bx + ay*by) * a_inv; // could precompute this as it doesn't depend on sample point
+                        float c = (2*(ax*ax + ay*ay) + (mx*bx+my*by)) * a_inv;
+                        float d = (mx*ax+my*ay) * a_inv;
+                        num = stbtt__solve_cubic(b, c, d, res);
+                     }
+                     dist2 = (x0-sx)*(x0-sx) + (y0-sy)*(y0-sy);
+                     if (dist2 < min_dist*min_dist)
+                        min_dist = (float) STBTT_sqrt(dist2);
+
+                     if (num >= 1 && res[0] >= 0.0f && res[0] <= 1.0f) {
+                        t = res[0], it = 1.0f - t;
+                        px = it*it*x0 + 2*t*it*x1 + t*t*x2;
+                        py = it*it*y0 + 2*t*it*y1 + t*t*y2;
+                        dist2 = (px-sx)*(px-sx) + (py-sy)*(py-sy);
+                        if (dist2 < min_dist * min_dist)
+                           min_dist = (float) STBTT_sqrt(dist2);
+                     }
+                     if (num >= 2 && res[1] >= 0.0f && res[1] <= 1.0f) {
+                        t = res[1], it = 1.0f - t;
+                        px = it*it*x0 + 2*t*it*x1 + t*t*x2;
+                        py = it*it*y0 + 2*t*it*y1 + t*t*y2;
+                        dist2 = (px-sx)*(px-sx) + (py-sy)*(py-sy);
+                        if (dist2 < min_dist * min_dist)
+                           min_dist = (float) STBTT_sqrt(dist2);
+                     }
+                     if (num >= 3 && res[2] >= 0.0f && res[2] <= 1.0f) {
+                        t = res[2], it = 1.0f - t;
+                        px = it*it*x0 + 2*t*it*x1 + t*t*x2;
+                        py = it*it*y0 + 2*t*it*y1 + t*t*y2;
+                        dist2 = (px-sx)*(px-sx) + (py-sy)*(py-sy);
+                        if (dist2 < min_dist * min_dist)
+                           min_dist = (float) STBTT_sqrt(dist2);
+                     }
+                  }
+               }
+            }
+            if (winding == 0)
+               min_dist = -min_dist;  // if outside the shape, value is negative
+            val = onedge_value + pixel_dist_scale * min_dist;
+            if (val < 0)
+               val = 0;
+            else if (val > 255)
+               val = 255;
+            data[(y-iy0)*w+(x-ix0)] = (unsigned char) val;
+         }
+      }
+      STBTT_free(precompute, info->userdata);
+      STBTT_free(verts, info->userdata);
+   }
+   return data;
+}
+
+STBTT_DEF unsigned char * stbtt_GetCodepointSDF(const stbtt_fontinfo *info, float scale, int codepoint, int padding, unsigned char onedge_value, float pixel_dist_scale, int *width, int *height, int *xoff, int *yoff)
+{
+   return stbtt_GetGlyphSDF(info, scale, stbtt_FindGlyphIndex(info, codepoint), padding, onedge_value, pixel_dist_scale, width, height, xoff, yoff);
+}
+
+STBTT_DEF void stbtt_FreeSDF(unsigned char *bitmap, void *userdata)
+{
+   STBTT_free(bitmap, userdata);
+}
 
 //////////////////////////////////////////////////////////////////////////////
 //
@@ -3060,7 +4773,7 @@ STBTT_DEF void stbtt_GetPackedQuad(stbtt_packedchar *chardata, int pw, int ph, i
 //
 
 // check if a utf8 string contains a prefix which is the utf16 string; if so return length of matching utf8 string
-static stbtt_int32 stbtt__CompareUTF8toUTF16_bigendian_prefix(const stbtt_uint8 *s1, stbtt_int32 len1, const stbtt_uint8 *s2, stbtt_int32 len2) 
+static stbtt_int32 stbtt__CompareUTF8toUTF16_bigendian_prefix(stbtt_uint8 *s1, stbtt_int32 len1, stbtt_uint8 *s2, stbtt_int32 len2)
 {
    stbtt_int32 i=0;
 
@@ -3099,9 +4812,9 @@ static stbtt_int32 stbtt__CompareUTF8toUTF16_bigendian_prefix(const stbtt_uint8
    return i;
 }
 
-STBTT_DEF int stbtt_CompareUTF8toUTF16_bigendian(const char *s1, int len1, const char *s2, int len2) 
+static int stbtt_CompareUTF8toUTF16_bigendian_internal(char *s1, int len1, char *s2, int len2)
 {
-   return len1 == stbtt__CompareUTF8toUTF16_bigendian_prefix((const stbtt_uint8*) s1, len1, (const stbtt_uint8*) s2, len2);
+   return len1 == stbtt__CompareUTF8toUTF16_bigendian_prefix((stbtt_uint8*) s1, len1, (stbtt_uint8*) s2, len2);
 }
 
 // returns results in whatever encoding you request... but note that 2-byte encodings
@@ -3157,7 +4870,7 @@ static int stbtt__matchpair(stbtt_uint8 *fc, stbtt_uint32 nm, stbtt_uint8 *name,
                         return 1;
                   } else if (matchlen < nlen && name[matchlen] == ' ') {
                      ++matchlen;
-                     if (stbtt_CompareUTF8toUTF16_bigendian((char*) (name+matchlen), nlen-matchlen, (char*)(fc+stringOffset+off),slen))
+                     if (stbtt_CompareUTF8toUTF16_bigendian_internal((char*) (name+matchlen), nlen-matchlen, (char*)(fc+stringOffset+off),slen))
                         return 1;
                   }
                } else {
@@ -3203,7 +4916,7 @@ static int stbtt__matches(stbtt_uint8 *fc, stbtt_uint32 offset, stbtt_uint8 *nam
    return 0;
 }
 
-STBTT_DEF int stbtt_FindMatchingFont(const unsigned char *font_collection, const char *name_utf8, stbtt_int32 flags)
+static int stbtt_FindMatchingFont_internal(unsigned char *font_collection, char *name_utf8, stbtt_int32 flags)
 {
    stbtt_int32 i;
    for (i=0;;++i) {
@@ -3214,11 +4927,66 @@ STBTT_DEF int stbtt_FindMatchingFont(const unsigned char *font_collection, const
    }
 }
 
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-qual"
+#endif
+
+STBTT_DEF int stbtt_BakeFontBitmap(const unsigned char *data, int offset,
+                                float pixel_height, unsigned char *pixels, int pw, int ph,
+                                int first_char, int num_chars, stbtt_bakedchar *chardata)
+{
+   return stbtt_BakeFontBitmap_internal((unsigned char *) data, offset, pixel_height, pixels, pw, ph, first_char, num_chars, chardata);
+}
+
+STBTT_DEF int stbtt_GetFontOffsetForIndex(const unsigned char *data, int index)
+{
+   return stbtt_GetFontOffsetForIndex_internal((unsigned char *) data, index);
+}
+
+STBTT_DEF int stbtt_GetNumberOfFonts(const unsigned char *data)
+{
+   return stbtt_GetNumberOfFonts_internal((unsigned char *) data);
+}
+
+STBTT_DEF int stbtt_InitFont(stbtt_fontinfo *info, const unsigned char *data, int offset)
+{
+   return stbtt_InitFont_internal(info, (unsigned char *) data, offset);
+}
+
+STBTT_DEF int stbtt_FindMatchingFont(const unsigned char *fontdata, const char *name, int flags)
+{
+   return stbtt_FindMatchingFont_internal((unsigned char *) fontdata, (char *) name, flags);
+}
+
+STBTT_DEF int stbtt_CompareUTF8toUTF16_bigendian(const char *s1, int len1, const char *s2, int len2)
+{
+   return stbtt_CompareUTF8toUTF16_bigendian_internal((char *) s1, len1, (char *) s2, len2);
+}
+
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+
 #endif // STB_TRUETYPE_IMPLEMENTATION
 
 
 // FULL VERSION HISTORY
 //
+//   1.25 (2021-07-11) many fixes
+//   1.24 (2020-02-05) fix warning
+//   1.23 (2020-02-02) query SVG data for glyphs; query whole kerning table (but only kern not GPOS)
+//   1.22 (2019-08-11) minimize missing-glyph duplication; fix kerning if both 'GPOS' and 'kern' are defined
+//   1.21 (2019-02-25) fix warning
+//   1.20 (2019-02-07) PackFontRange skips missing codepoints; GetScaleFontVMetrics()
+//   1.19 (2018-02-11) OpenType GPOS kerning (horizontal only), STBTT_fmod
+//   1.18 (2018-01-29) add missing function
+//   1.17 (2017-07-23) make more arguments const; doc fix
+//   1.16 (2017-07-12) SDF support
+//   1.15 (2017-03-03) make more arguments const
+//   1.14 (2017-01-16) num-fonts-in-TTC function
+//   1.13 (2017-01-02) support OpenType fonts, certain Apple fonts
+//   1.12 (2016-10-25) suppress warnings about casting away const with -Wcast-qual
 //   1.11 (2016-04-02) fix unused-variable warning
 //   1.10 (2016-04-02) allow user-defined fabs() replacement
 //                     fix memory leak if fontsize=0.0
@@ -3265,3 +5033,45 @@ STBTT_DEF int stbtt_FindMatchingFont(const unsigned char *font_collection, const
 //   0.2  (2009-03-11) Fix unsigned/signed char warnings
 //   0.1  (2009-03-09) First public release
 //
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/renderdoc/CMakeLists.txt b/renderdoc/CMakeLists.txt
index 7d7d9c35fa..b4fb97de80 100644
--- a/renderdoc/CMakeLists.txt
+++ b/renderdoc/CMakeLists.txt
@@ -282,7 +282,7 @@ set(sources
     3rdparty/zstd/zstdmt_compress.h
     3rdparty/stb/stb_image.h
     3rdparty/stb/stb_image_write.h
-    3rdparty/stb/stb_image_resize.h
+    3rdparty/stb/stb_image_resize2.h
     3rdparty/stb/stb_impl.c
     3rdparty/stb/stb_truetype.h
     3rdparty/tinyexr/tinyexr.cpp
diff --git a/renderdoc/os/win32/win32_shellext.cpp b/renderdoc/os/win32/win32_shellext.cpp
index 25fc3966ba..c150027911 100644
--- a/renderdoc/os/win32/win32_shellext.cpp
+++ b/renderdoc/os/win32/win32_shellext.cpp
@@ -38,7 +38,8 @@
 #include "maths/formatpacking.h"
 #include "maths/half_convert.h"
 #include "serialise/rdcfile.h"
-#include "stb/stb_image_resize.h"
+
+#include "stb/stb_image_resize2.h"
 
 // {5D6BF029-A6BA-417A-8523-120492B1DCE3}
 static const GUID CLSID_RDCThumbnailProvider = {0x5d6bf029,
@@ -625,7 +626,7 @@ struct RDCThumbnailProvider : public IThumbnailProvider, IInitializeWithStream
       byte *resizedpixels = (byte *)malloc(3 * bi.bV5Width * bi.bV5Height);
 
       stbir_resize_uint8_srgb(thumbpixels, thumbwidth, thumbheight, 0, resizedpixels, bi.bV5Width,
-                              bi.bV5Height, 0, 3, -1, 0);
+                              bi.bV5Height, 0, STBIR_RGB);
 
       free(thumbpixels);
 
diff --git a/renderdoc/renderdoc.vcxproj b/renderdoc/renderdoc.vcxproj
index 318274be05..65709359dd 100644
--- a/renderdoc/renderdoc.vcxproj
+++ b/renderdoc/renderdoc.vcxproj
@@ -140,7 +140,7 @@
     <ClInclude Include="3rdparty\pugixml\pugiconfig.hpp" />
     <ClInclude Include="3rdparty\pugixml\pugixml.hpp" />
     <ClInclude Include="3rdparty\stb\stb_image.h" />
-    <ClInclude Include="3rdparty\stb\stb_image_resize.h" />
+    <ClInclude Include="3rdparty\stb\stb_image_resize2.h" />
     <ClInclude Include="3rdparty\stb\stb_image_write.h" />
     <ClInclude Include="3rdparty\stb\stb_truetype.h" />
     <ClInclude Include="3rdparty\superluminal\PerformanceAPI_capi.h" />
diff --git a/renderdoc/renderdoc.vcxproj.filters b/renderdoc/renderdoc.vcxproj.filters
index a0a6bef1ce..b29a17b0ba 100644
--- a/renderdoc/renderdoc.vcxproj.filters
+++ b/renderdoc/renderdoc.vcxproj.filters
@@ -294,7 +294,7 @@
     <ClInclude Include="3rdparty\plthook\plthook.h">
       <Filter>3rdparty\plthook</Filter>
     </ClInclude>
-    <ClInclude Include="3rdparty\stb\stb_image_resize.h">
+    <ClInclude Include="3rdparty\stb\stb_image_resize2.h">
       <Filter>3rdparty\stb</Filter>
     </ClInclude>
     <ClInclude Include="api\replay\version.h">
diff --git a/renderdoc/replay/capture_file.cpp b/renderdoc/replay/capture_file.cpp
index 9b314f22fd..390ef9e939 100644
--- a/renderdoc/replay/capture_file.cpp
+++ b/renderdoc/replay/capture_file.cpp
@@ -29,7 +29,7 @@
 #include "serialise/rdcfile.h"
 #include "serialise/serialiser.h"
 #include "stb/stb_image.h"
-#include "stb/stb_image_resize.h"
+#include "stb/stb_image_resize2.h"
 #include "stb/stb_image_write.h"
 
 static void writeToBytebuf(void *context, void *data, int size)
@@ -624,7 +624,7 @@ Thumbnail CaptureFile::GetThumbnail(FileType type, uint32_t maxsize)
         byte *resizedpixels = (byte *)malloc(3 * clampedWidth * clampedHeight);
 
         stbir_resize_uint8_srgb(thumbpixels, thumbwidth, thumbheight, 0, resizedpixels,
-                                clampedWidth, clampedHeight, 0, 3, -1, 0);
+                                clampedWidth, clampedHeight, 0, STBIR_RGB);
 
         free(allocatedBuffer);