src/nbvtf/stb/stb_dxt.h

   1 // stb_dxt.h - v1.10 - DXT1/DXT5 compressor - public domain
   2 // original by fabian "ryg" giesen - ported to C by stb
   3 // use '#define STB_DXT_IMPLEMENTATION' before including to create the implementation
   4 //
   5 // USAGE:
   6 //   call stb_compress_dxt_block() for every block (you must pad)
   7 //     source should be a 4x4 block of RGBA data in row-major order;
   8 //     Alpha channel is not stored if you specify alpha=0 (but you
   9 //     must supply some constant alpha in the alpha channel).
  10 //     You can turn on dithering and "high quality" using mode.
  11 //
  12 // version history:
  13 //   v1.10  - (i.c) various small quality improvements
  14 //   v1.09  - (stb) update documentation re: surprising alpha channel requirement
  15 //   v1.08  - (stb) fix bug in dxt-with-alpha block
  16 //   v1.07  - (stb) bc4; allow not using libc; add STB_DXT_STATIC
  17 //   v1.06  - (stb) fix to known-broken 1.05
  18 //   v1.05  - (stb) support bc5/3dc (Arvids Kokins), use extern "C" in C++ (Pavel Krajcevski)
  19 //   v1.04  - (ryg) default to no rounding bias for lerped colors (as per S3TC/DX10 spec);
  20 //            single color match fix (allow for inexact color interpolation);
  21 //            optimal DXT5 index finder; "high quality" mode that runs multiple refinement steps.
  22 //   v1.03  - (stb) endianness support
  23 //   v1.02  - (stb) fix alpha encoding bug
  24 //   v1.01  - (stb) fix bug converting to RGB that messed up quality, thanks ryg & cbloom
  25 //   v1.00  - (stb) first release
  26 //
  27 // contributors:
  28 //   Rich Geldreich (more accurate index selection)
  29 //   Kevin Schmidt (#defines for "freestanding" compilation)
  30 //   github:ppiastucki (BC4 support)
  31 //   Ignacio Castano - improve DXT endpoint quantization
  32 //
  33 // LICENSE
  34 //
  35 //   See end of file for license information.
  36
  37 #ifndef STB_INCLUDE_STB_DXT_H
  38 #define STB_INCLUDE_STB_DXT_H
  39
  40 #ifdef __cplusplus
  41 extern "C" {
  42 #endif
  43
  44 #ifdef STB_DXT_STATIC
  45 #define STBDDEF static
  46 #else
  47 #define STBDDEF extern
  48 #endif
  49
  50 // compression mode (bitflags)
  51 #define STB_DXT_NORMAL    0
  52 #define STB_DXT_DITHER    1   // use dithering. dubious win. never use for normal maps and the like!
  53 #define STB_DXT_HIGHQUAL  2   // high quality mode, does two refinement steps instead of 1. ~30-40% slower.
  54
  55 STBDDEF void stb_compress_dxt_block(unsigned char *dest, const unsigned char *src_rgba_four_bytes_per_pixel, int alpha, int mode);
  56 STBDDEF void stb_compress_bc4_block(unsigned char *dest, const unsigned char *src_r_one_byte_per_pixel);
  57 STBDDEF void stb_compress_bc5_block(unsigned char *dest, const unsigned char *src_rg_two_byte_per_pixel);
  58
  59 #define STB_COMPRESS_DXT_BLOCK
  60
  61 #ifdef __cplusplus
  62 }
  63 #endif
  64 #endif // STB_INCLUDE_STB_DXT_H
  65
  66 #ifdef STB_DXT_IMPLEMENTATION
  67
  68 // configuration options for DXT encoder. set them in the project/makefile or just define
  69 // them at the top.
  70
  71 // STB_DXT_USE_ROUNDING_BIAS
  72 //     use a rounding bias during color interpolation. this is closer to what "ideal"
  73 //     interpolation would do but doesn't match the S3TC/DX10 spec. old versions (pre-1.03)
  74 //     implicitly had this turned on.
  75 //
  76 //     in case you're targeting a specific type of hardware (e.g. console programmers):
  77 //     NVidia and Intel GPUs (as of 2010) as well as DX9 ref use DXT decoders that are closer
  78 //     to STB_DXT_USE_ROUNDING_BIAS. AMD/ATI, S3 and DX10 ref are closer to rounding with no bias.
  79 //     you also see "(a*5 + b*3) / 8" on some old GPU designs.
  80 // #define STB_DXT_USE_ROUNDING_BIAS
  81
  82 #include <stdlib.h>
  83
  84 #if !defined(STBD_ABS) || !defined(STBI_FABS)
  85 #include <math.h>
  86 #endif
  87
  88 #ifndef STBD_ABS
  89 #define STBD_ABS(i)           abs(i)
  90 #endif
  91
  92 #ifndef STBD_FABS
  93 #define STBD_FABS(x)          fabs(x)
  94 #endif
  95
  96 #ifndef STBD_MEMSET
  97 #include <string.h>
  98 #define STBD_MEMSET           memset
  99 #endif
 100
 101 static unsigned char stb__Expand5[32];
 102 static unsigned char stb__Expand6[64];
 103 static unsigned char stb__OMatch5[256][2];
 104 static unsigned char stb__OMatch6[256][2];
 105 static unsigned char stb__QuantRBTab[256+16];
 106 static unsigned char stb__QuantGTab[256+16];
 107
 108 static int stb__Mul8Bit(int a, int b)
 109 {
 110   int t = a*b + 128;
 111   return (t + (t >> 8)) >> 8;
 112 }
 113
 114 static void stb__From16Bit(unsigned char *out, unsigned short v)
 115 {
 116    int rv = (v & 0xf800) >> 11;
 117    int gv = (v & 0x07e0) >>  5;
 118    int bv = (v & 0x001f) >>  0;
 119
 120    out[0] = stb__Expand5[rv];
 121    out[1] = stb__Expand6[gv];
 122    out[2] = stb__Expand5[bv];
 123    out[3] = 0;
 124 }
 125
 126 static unsigned short stb__As16Bit(int r, int g, int b)
 127 {
 128    return (unsigned short)((stb__Mul8Bit(r,31) << 11) + (stb__Mul8Bit(g,63) << 5) + stb__Mul8Bit(b,31));
 129 }
 130
 131 // linear interpolation at 1/3 point between a and b, using desired rounding type
 132 static int stb__Lerp13(int a, int b)
 133 {
 134 #ifdef STB_DXT_USE_ROUNDING_BIAS
 135    // with rounding bias
 136    return a + stb__Mul8Bit(b-a, 0x55);
 137 #else
 138    // without rounding bias
 139    // replace "/ 3" by "* 0xaaab) >> 17" if your compiler sucks or you really need every ounce of speed.
 140    return (2*a + b) / 3;
 141 #endif
 142 }
 143
 144 // lerp RGB color
 145 static void stb__Lerp13RGB(unsigned char *out, unsigned char *p1, unsigned char *p2)
 146 {
 147    out[0] = (unsigned char)stb__Lerp13(p1[0], p2[0]);
 148    out[1] = (unsigned char)stb__Lerp13(p1[1], p2[1]);
 149    out[2] = (unsigned char)stb__Lerp13(p1[2], p2[2]);
 150 }
 151
 152 /****************************************************************************/
 153
 154 // compute table to reproduce constant colors as accurately as possible
 155 static void stb__PrepareOptTable(unsigned char *Table,const unsigned char *expand,int size)
 156 {
 157    int i,mn,mx;
 158    for (i=0;i<256;i++) {
 159       int bestErr = 256;
 160       for (mn=0;mn<size;mn++) {
 161          for (mx=0;mx<size;mx++) {
 162             int mine = expand[mn];
 163             int maxe = expand[mx];
 164             int err = STBD_ABS(stb__Lerp13(maxe, mine) - i);
 165
 166             // DX10 spec says that interpolation must be within 3% of "correct" result,
 167             // add this as error term. (normally we'd expect a random distribution of
 168             // +-1.5% error, but nowhere in the spec does it say that the error has to be
 169             // unbiased - better safe than sorry).
 170             err += STBD_ABS(maxe - mine) * 3 / 100;
 171
 172             if(err < bestErr)
 173             {
 174                Table[i*2+0] = (unsigned char)mx;
 175                Table[i*2+1] = (unsigned char)mn;
 176                bestErr = err;
 177             }
 178          }
 179       }
 180    }
 181 }
 182
 183 static void stb__EvalColors(unsigned char *color,unsigned short c0,unsigned short c1)
 184 {
 185    stb__From16Bit(color+ 0, c0);
 186    stb__From16Bit(color+ 4, c1);
 187    stb__Lerp13RGB(color+ 8, color+0, color+4);
 188    stb__Lerp13RGB(color+12, color+4, color+0);
 189 }
 190
 191 // Block dithering function. Simply dithers a block to 565 RGB.
 192 // (Floyd-Steinberg)
 193 static void stb__DitherBlock(unsigned char *dest, unsigned char *block)
 194 {
 195   int err[8],*ep1 = err,*ep2 = err+4, *et;
 196   int ch,y;
 197
 198   // process channels separately
 199   for (ch=0; ch<3; ++ch) {
 200       unsigned char *bp = block+ch, *dp = dest+ch;
 201       unsigned char *quant = (ch == 1) ? stb__QuantGTab+8 : stb__QuantRBTab+8;
 202       STBD_MEMSET(err, 0, sizeof(err));
 203       for(y=0; y<4; ++y) {
 204          dp[ 0] = quant[bp[ 0] + ((3*ep2[1] + 5*ep2[0]) >> 4)];
 205          ep1[0] = bp[ 0] - dp[ 0];
 206          dp[ 4] = quant[bp[ 4] + ((7*ep1[0] + 3*ep2[2] + 5*ep2[1] + ep2[0]) >> 4)];
 207          ep1[1] = bp[ 4] - dp[ 4];
 208          dp[ 8] = quant[bp[ 8] + ((7*ep1[1] + 3*ep2[3] + 5*ep2[2] + ep2[1]) >> 4)];
 209          ep1[2] = bp[ 8] - dp[ 8];
 210          dp[12] = quant[bp[12] + ((7*ep1[2] + 5*ep2[3] + ep2[2]) >> 4)];
 211          ep1[3] = bp[12] - dp[12];
 212          bp += 16;
 213          dp += 16;
 214          et = ep1, ep1 = ep2, ep2 = et; // swap
 215       }
 216    }
 217 }
 218
 219 // The color matching function
 220 static unsigned int stb__MatchColorsBlock(unsigned char *block, unsigned char *color,int dither)
 221 {
 222    unsigned int mask = 0;
 223    int dirr = color[0*4+0] - color[1*4+0];
 224    int dirg = color[0*4+1] - color[1*4+1];
 225    int dirb = color[0*4+2] - color[1*4+2];
 226    int dots[16];
 227    int stops[4];
 228    int i;
 229    int c0Point, halfPoint, c3Point;
 230
 231    for(i=0;i<16;i++)
 232       dots[i] = block[i*4+0]*dirr + block[i*4+1]*dirg + block[i*4+2]*dirb;
 233
 234    for(i=0;i<4;i++)
 235       stops[i] = color[i*4+0]*dirr + color[i*4+1]*dirg + color[i*4+2]*dirb;
 236
 237    // think of the colors as arranged on a line; project point onto that line, then choose
 238    // next color out of available ones. we compute the crossover points for "best color in top
 239    // half"/"best in bottom half" and then the same inside that subinterval.
 240    //
 241    // relying on this 1d approximation isn't always optimal in terms of euclidean distance,
 242    // but it's very close and a lot faster.
 243    // http://cbloomrants.blogspot.com/2008/12/12-08-08-dxtc-summary.html
 244
 245    c0Point   = (stops[1] + stops[3]);
 246    halfPoint = (stops[3] + stops[2]);
 247    c3Point   = (stops[2] + stops[0]);
 248
 249    if(!dither) {
 250       // the version without dithering is straightforward
 251       for (i=15;i>=0;i--) {
 252          int dot = dots[i]*2;
 253          mask <<= 2;
 254
 255          if(dot < halfPoint)
 256            mask |= (dot < c0Point) ? 1 : 3;
 257          else
 258            mask |= (dot < c3Point) ? 2 : 0;
 259       }
 260   } else {
 261       // with floyd-steinberg dithering
 262       int err[8],*ep1 = err,*ep2 = err+4;
 263       int *dp = dots, y;
 264
 265       c0Point   <<= 3;
 266       halfPoint <<= 3;
 267       c3Point   <<= 3;
 268       for(i=0;i<8;i++)
 269          err[i] = 0;
 270
 271       for(y=0;y<4;y++)
 272       {
 273          int dot,lmask,step;
 274
 275          dot = (dp[0] << 4) + (3*ep2[1] + 5*ep2[0]);
 276          if(dot < halfPoint)
 277            step = (dot < c0Point) ? 1 : 3;
 278          else
 279            step = (dot < c3Point) ? 2 : 0;
 280          ep1[0] = dp[0] - stops[step];
 281          lmask = step;
 282
 283          dot = (dp[1] << 4) + (7*ep1[0] + 3*ep2[2] + 5*ep2[1] + ep2[0]);
 284          if(dot < halfPoint)
 285            step = (dot < c0Point) ? 1 : 3;
 286          else
 287            step = (dot < c3Point) ? 2 : 0;
 288          ep1[1] = dp[1] - stops[step];
 289          lmask |= step<<2;
 290
 291          dot = (dp[2] << 4) + (7*ep1[1] + 3*ep2[3] + 5*ep2[2] + ep2[1]);
 292          if(dot < halfPoint)
 293            step = (dot < c0Point) ? 1 : 3;
 294          else
 295            step = (dot < c3Point) ? 2 : 0;
 296          ep1[2] = dp[2] - stops[step];
 297          lmask |= step<<4;
 298
 299          dot = (dp[3] << 4) + (7*ep1[2] + 5*ep2[3] + ep2[2]);
 300          if(dot < halfPoint)
 301            step = (dot < c0Point) ? 1 : 3;
 302          else
 303            step = (dot < c3Point) ? 2 : 0;
 304          ep1[3] = dp[3] - stops[step];
 305          lmask |= step<<6;
 306
 307          dp += 4;
 308          mask |= lmask << (y*8);
 309          { int *et = ep1; ep1 = ep2; ep2 = et; } // swap
 310       }
 311    }
 312
 313    return mask;
 314 }
 315
 316 // The color optimization function. (Clever code, part 1)
 317 static void stb__OptimizeColorsBlock(unsigned char *block, unsigned short *pmax16, unsigned short *pmin16)
 318 {
 319   int mind = 0x7fffffff,maxd = -0x7fffffff;
 320   unsigned char *minp, *maxp;
 321   double magn;
 322   int v_r,v_g,v_b;
 323   static const int nIterPower = 4;
 324   float covf[6],vfr,vfg,vfb;
 325
 326   // determine color distribution
 327   int cov[6];
 328   int mu[3],min[3],max[3];
 329   int ch,i,iter;
 330
 331   for(ch=0;ch<3;ch++)
 332   {
 333     const unsigned char *bp = ((const unsigned char *) block) + ch;
 334     int muv,minv,maxv;
 335
 336     muv = minv = maxv = bp[0];
 337     for(i=4;i<64;i+=4)
 338     {
 339       muv += bp[i];
 340       if (bp[i] < minv) minv = bp[i];
 341       else if (bp[i] > maxv) maxv = bp[i];
 342     }
 343
 344     mu[ch] = (muv + 8) >> 4;
 345     min[ch] = minv;
 346     max[ch] = maxv;
 347   }
 348
 349   // determine covariance matrix
 350   for (i=0;i<6;i++)
 351      cov[i] = 0;
 352
 353   for (i=0;i<16;i++)
 354   {
 355     int r = block[i*4+0] - mu[0];
 356     int g = block[i*4+1] - mu[1];
 357     int b = block[i*4+2] - mu[2];
 358
 359     cov[0] += r*r;
 360     cov[1] += r*g;
 361     cov[2] += r*b;
 362     cov[3] += g*g;
 363     cov[4] += g*b;
 364     cov[5] += b*b;
 365   }
 366
 367   // convert covariance matrix to float, find principal axis via power iter
 368   for(i=0;i<6;i++)
 369     covf[i] = cov[i] / 255.0f;
 370
 371   vfr = (float) (max[0] - min[0]);
 372   vfg = (float) (max[1] - min[1]);
 373   vfb = (float) (max[2] - min[2]);
 374
 375   for(iter=0;iter<nIterPower;iter++)
 376   {
 377     float r = vfr*covf[0] + vfg*covf[1] + vfb*covf[2];
 378     float g = vfr*covf[1] + vfg*covf[3] + vfb*covf[4];
 379     float b = vfr*covf[2] + vfg*covf[4] + vfb*covf[5];
 380
 381     vfr = r;
 382     vfg = g;
 383     vfb = b;
 384   }
 385
 386   magn = STBD_FABS(vfr);
 387   if (STBD_FABS(vfg) > magn) magn = STBD_FABS(vfg);
 388   if (STBD_FABS(vfb) > magn) magn = STBD_FABS(vfb);
 389
 390    if(magn < 4.0f) { // too small, default to luminance
 391       v_r = 299; // JPEG YCbCr luma coefs, scaled by 1000.
 392       v_g = 587;
 393       v_b = 114;
 394    } else {
 395       magn = 512.0 / magn;
 396       v_r = (int) (vfr * magn);
 397       v_g = (int) (vfg * magn);
 398       v_b = (int) (vfb * magn);
 399    }
 400
 401    // Pick colors at extreme points
 402    for(i=0;i<16;i++)
 403    {
 404       int dot = block[i*4+0]*v_r + block[i*4+1]*v_g + block[i*4+2]*v_b;
 405
 406       if (dot < mind) {
 407          mind = dot;
 408          minp = block+i*4;
 409       }
 410
 411       if (dot > maxd) {
 412          maxd = dot;
 413          maxp = block+i*4;
 414       }
 415    }
 416
 417    *pmax16 = stb__As16Bit(maxp[0],maxp[1],maxp[2]);
 418    *pmin16 = stb__As16Bit(minp[0],minp[1],minp[2]);
 419 }
 420
 421 static const float midpoints5[32] = {
 422    0.015686f, 0.047059f, 0.078431f, 0.111765f, 0.145098f, 0.176471f, 0.207843f, 0.241176f, 0.274510f, 0.305882f, 0.337255f, 0.370588f, 0.403922f, 0.435294f, 0.466667f, 0.5f,
 423    0.533333f, 0.564706f, 0.596078f, 0.629412f, 0.662745f, 0.694118f, 0.725490f, 0.758824f, 0.792157f, 0.823529f, 0.854902f, 0.888235f, 0.921569f, 0.952941f, 0.984314f, 1.0f
 424 };
 425
 426 static const float midpoints6[64] = {
 427    0.007843f, 0.023529f, 0.039216f, 0.054902f, 0.070588f, 0.086275f, 0.101961f, 0.117647f, 0.133333f, 0.149020f, 0.164706f, 0.180392f, 0.196078f, 0.211765f, 0.227451f, 0.245098f,
 428    0.262745f, 0.278431f, 0.294118f, 0.309804f, 0.325490f, 0.341176f, 0.356863f, 0.372549f, 0.388235f, 0.403922f, 0.419608f, 0.435294f, 0.450980f, 0.466667f, 0.482353f, 0.500000f,
 429    0.517647f, 0.533333f, 0.549020f, 0.564706f, 0.580392f, 0.596078f, 0.611765f, 0.627451f, 0.643137f, 0.658824f, 0.674510f, 0.690196f, 0.705882f, 0.721569f, 0.737255f, 0.754902f,
 430    0.772549f, 0.788235f, 0.803922f, 0.819608f, 0.835294f, 0.850980f, 0.866667f, 0.882353f, 0.898039f, 0.913725f, 0.929412f, 0.945098f, 0.960784f, 0.976471f, 0.992157f, 1.0f
 431 };
 432
 433 static unsigned short stb__Quantize5(float x)
 434 {
 435    unsigned short q;
 436    x = x < 0 ? 0 : x > 1 ? 1 : x;  // saturate
 437    q = (unsigned short)(x * 31);
 438    q += (x > midpoints5[q]);
 439    return q;
 440 }
 441
 442 static unsigned short stb__Quantize6(float x)
 443 {
 444    unsigned short q;
 445    x = x < 0 ? 0 : x > 1 ? 1 : x;  // saturate
 446    q = (unsigned short)(x * 63);
 447    q += (x > midpoints6[q]);
 448    return q;
 449 }
 450
 451 // The refinement function. (Clever code, part 2)
 452 // Tries to optimize colors to suit block contents better.
 453 // (By solving a least squares system via normal equations+Cramer's rule)
 454 static int stb__RefineBlock(unsigned char *block, unsigned short *pmax16, unsigned short *pmin16, unsigned int mask)
 455 {
 456    static const int w1Tab[4] = { 3,0,2,1 };
 457    static const int prods[4] = { 0x090000,0x000900,0x040102,0x010402 };
 458    // ^some magic to save a lot of multiplies in the accumulating loop...
 459    // (precomputed products of weights for least squares system, accumulated inside one 32-bit register)
 460
 461    float f;
 462    unsigned short oldMin, oldMax, min16, max16;
 463    int i, akku = 0, xx,xy,yy;
 464    int At1_r,At1_g,At1_b;
 465    int At2_r,At2_g,At2_b;
 466    unsigned int cm = mask;
 467
 468    oldMin = *pmin16;
 469    oldMax = *pmax16;
 470
 471    if((mask ^ (mask<<2)) < 4) // all pixels have the same index?
 472    {
 473       // yes, linear system would be singular; solve using optimal
 474       // single-color match on average color
 475       int r = 8, g = 8, b = 8;
 476       for (i=0;i<16;++i) {
 477          r += block[i*4+0];
 478          g += block[i*4+1];
 479          b += block[i*4+2];
 480       }
 481
 482       r >>= 4; g >>= 4; b >>= 4;
 483
 484       max16 = (stb__OMatch5[r][0]<<11) | (stb__OMatch6[g][0]<<5) | stb__OMatch5[b][0];
 485       min16 = (stb__OMatch5[r][1]<<11) | (stb__OMatch6[g][1]<<5) | stb__OMatch5[b][1];
 486    } else {
 487       At1_r = At1_g = At1_b = 0;
 488       At2_r = At2_g = At2_b = 0;
 489       for (i=0;i<16;++i,cm>>=2) {
 490          int step = cm&3;
 491          int w1 = w1Tab[step];
 492          int r = block[i*4+0];
 493          int g = block[i*4+1];
 494          int b = block[i*4+2];
 495
 496          akku    += prods[step];
 497          At1_r   += w1*r;
 498          At1_g   += w1*g;
 499          At1_b   += w1*b;
 500          At2_r   += r;
 501          At2_g   += g;
 502          At2_b   += b;
 503       }
 504
 505       At2_r = 3*At2_r - At1_r;
 506       At2_g = 3*At2_g - At1_g;
 507       At2_b = 3*At2_b - At1_b;
 508
 509       // extract solutions and decide solvability
 510       xx = akku >> 16;
 511       yy = (akku >> 8) & 0xff;
 512       xy = (akku >> 0) & 0xff;
 513
 514       f = 3.0f / 255.0f / (xx*yy - xy*xy);
 515
 516       max16 =  stb__Quantize5((At1_r*yy - At2_r * xy) * f) << 11;
 517       max16 |= stb__Quantize6((At1_g*yy - At2_g * xy) * f) << 5;
 518       max16 |= stb__Quantize5((At1_b*yy - At2_b * xy) * f) << 0;
 519
 520       min16 =  stb__Quantize5((At2_r*xx - At1_r * xy) * f) << 11;
 521       min16 |= stb__Quantize6((At2_g*xx - At1_g * xy) * f) << 5;
 522       min16 |= stb__Quantize5((At2_b*xx - At1_b * xy) * f) << 0;
 523    }
 524
 525    *pmin16 = min16;
 526    *pmax16 = max16;
 527    return oldMin != min16 || oldMax != max16;
 528 }
 529
 530 // Color block compression
 531 static void stb__CompressColorBlock(unsigned char *dest, unsigned char *block, int mode)
 532 {
 533    unsigned int mask;
 534    int i;
 535    int dither;
 536    int refinecount;
 537    unsigned short max16, min16;
 538    unsigned char dblock[16*4],color[4*4];
 539
 540    dither = mode & STB_DXT_DITHER;
 541    refinecount = (mode & STB_DXT_HIGHQUAL) ? 2 : 1;
 542
 543    // check if block is constant
 544    for (i=1;i<16;i++)
 545       if (((unsigned int *) block)[i] != ((unsigned int *) block)[0])
 546          break;
 547
 548    if(i == 16) { // constant color
 549       int r = block[0], g = block[1], b = block[2];
 550       mask  = 0xaaaaaaaa;
 551       max16 = (stb__OMatch5[r][0]<<11) | (stb__OMatch6[g][0]<<5) | stb__OMatch5[b][0];
 552       min16 = (stb__OMatch5[r][1]<<11) | (stb__OMatch6[g][1]<<5) | stb__OMatch5[b][1];
 553    } else {
 554       // first step: compute dithered version for PCA if desired
 555       if(dither)
 556          stb__DitherBlock(dblock,block);
 557
 558       // second step: pca+map along principal axis
 559       stb__OptimizeColorsBlock(dither ? dblock : block,&max16,&min16);
 560       if (max16 != min16) {
 561          stb__EvalColors(color,max16,min16);
 562          mask = stb__MatchColorsBlock(block,color,dither);
 563       } else
 564          mask = 0;
 565
 566       // third step: refine (multiple times if requested)
 567       for (i=0;i<refinecount;i++) {
 568          unsigned int lastmask = mask;
 569
 570          if (stb__RefineBlock(dither ? dblock : block,&max16,&min16,mask)) {
 571             if (max16 != min16) {
 572                stb__EvalColors(color,max16,min16);
 573                mask = stb__MatchColorsBlock(block,color,dither);
 574             } else {
 575                mask = 0;
 576                break;
 577             }
 578          }
 579
 580          if(mask == lastmask)
 581             break;
 582       }
 583   }
 584
 585   // write the color block
 586   if(max16 < min16)
 587   {
 588      unsigned short t = min16;
 589      min16 = max16;
 590      max16 = t;
 591      mask ^= 0x55555555;
 592   }
 593
 594   dest[0] = (unsigned char) (max16);
 595   dest[1] = (unsigned char) (max16 >> 8);
 596   dest[2] = (unsigned char) (min16);
 597   dest[3] = (unsigned char) (min16 >> 8);
 598   dest[4] = (unsigned char) (mask);
 599   dest[5] = (unsigned char) (mask >> 8);
 600   dest[6] = (unsigned char) (mask >> 16);
 601   dest[7] = (unsigned char) (mask >> 24);
 602 }
 603
 604 // Alpha block compression (this is easy for a change)
 605 static void stb__CompressAlphaBlock(unsigned char *dest,unsigned char *src, int stride)
 606 {
 607    int i,dist,bias,dist4,dist2,bits,mask;
 608
 609    // find min/max color
 610    int mn,mx;
 611    mn = mx = src[0];
 612
 613    for (i=1;i<16;i++)
 614    {
 615       if (src[i*stride] < mn) mn = src[i*stride];
 616       else if (src[i*stride] > mx) mx = src[i*stride];
 617    }
 618
 619    // encode them
 620    dest[0] = (unsigned char)mx;
 621    dest[1] = (unsigned char)mn;
 622    dest += 2;
 623
 624    // determine bias and emit color indices
 625    // given the choice of mx/mn, these indices are optimal:
 626    // http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/
 627    dist = mx-mn;
 628    dist4 = dist*4;
 629    dist2 = dist*2;
 630    bias = (dist < 8) ? (dist - 1) : (dist/2 + 2);
 631    bias -= mn * 7;
 632    bits = 0,mask=0;
 633
 634    for (i=0;i<16;i++) {
 635       int a = src[i*stride]*7 + bias;
 636       int ind,t;
 637
 638       // select index. this is a "linear scale" lerp factor between 0 (val=min) and 7 (val=max).
 639       t = (a >= dist4) ? -1 : 0; ind =  t & 4; a -= dist4 & t;
 640       t = (a >= dist2) ? -1 : 0; ind += t & 2; a -= dist2 & t;
 641       ind += (a >= dist);
 642
 643       // turn linear scale into DXT index (0/1 are extremal pts)
 644       ind = -ind & 7;
 645       ind ^= (2 > ind);
 646
 647       // write index
 648       mask |= ind << bits;
 649       if((bits += 3) >= 8) {
 650          *dest++ = (unsigned char)mask;
 651          mask >>= 8;
 652          bits -= 8;
 653       }
 654    }
 655 }
 656
 657 static void stb__InitDXT()
 658 {
 659    int i;
 660    for(i=0;i<32;i++)
 661       stb__Expand5[i] = (unsigned char)((i<<3)|(i>>2));
 662
 663    for(i=0;i<64;i++)
 664       stb__Expand6[i] = (unsigned char)((i<<2)|(i>>4));
 665
 666    for(i=0;i<256+16;i++)
 667    {
 668       int v = i-8 < 0 ? 0 : i-8 > 255 ? 255 : i-8;
 669       stb__QuantRBTab[i] = stb__Expand5[stb__Mul8Bit(v,31)];
 670       stb__QuantGTab[i] = stb__Expand6[stb__Mul8Bit(v,63)];
 671    }
 672
 673    stb__PrepareOptTable(&stb__OMatch5[0][0],stb__Expand5,32);
 674    stb__PrepareOptTable(&stb__OMatch6[0][0],stb__Expand6,64);
 675 }
 676
 677 void stb_compress_dxt_block(unsigned char *dest, const unsigned char *src, int alpha, int mode)
 678 {
 679    unsigned char data[16][4];
 680    static int init=1;
 681    if (init) {
 682       stb__InitDXT();
 683       init=0;
 684    }
 685
 686    if (alpha) {
 687       int i;
 688       stb__CompressAlphaBlock(dest,(unsigned char*) src+3, 4);
 689       dest += 8;
 690       // make a new copy of the data in which alpha is opaque,
 691       // because code uses a fast test for color constancy
 692       memcpy(data, src, 4*16);
 693       for (i=0; i < 16; ++i)
 694          data[i][3] = 255;
 695       src = &data[0][0];
 696    }
 697
 698    stb__CompressColorBlock(dest,(unsigned char*) src,mode);
 699 }
 700
 701 void stb_compress_bc4_block(unsigned char *dest, const unsigned char *src)
 702 {
 703    stb__CompressAlphaBlock(dest,(unsigned char*) src, 1);
 704 }
 705
 706 void stb_compress_bc5_block(unsigned char *dest, const unsigned char *src)
 707 {
 708    stb__CompressAlphaBlock(dest,(unsigned char*) src,2);
 709    stb__CompressAlphaBlock(dest + 8,(unsigned char*) src+1,2);
 710 }
 711 #endif // STB_DXT_IMPLEMENTATION
 712
 713 /*
 714 ------------------------------------------------------------------------------
 715 This software is available under 2 licenses -- choose whichever you prefer.
 716 ------------------------------------------------------------------------------
 717 ALTERNATIVE A - MIT License
 718 Copyright (c) 2017 Sean Barrett
 719 Permission is hereby granted, free of charge, to any person obtaining a copy of
 720 this software and associated documentation files (the "Software"), to deal in
 721 the Software without restriction, including without limitation the rights to
 722 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 723 of the Software, and to permit persons to whom the Software is furnished to do
 724 so, subject to the following conditions:
 725 The above copyright notice and this permission notice shall be included in all
 726 copies or substantial portions of the Software.
 727 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 728 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 729 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 730 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 731 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 732 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 733 SOFTWARE.
 734 ------------------------------------------------------------------------------
 735 ALTERNATIVE B - Public Domain (www.unlicense.org)
 736 This is free and unencumbered software released into the public domain.
 737 Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
 738 software, either in source code form or as a compiled binary, for any purpose,
 739 commercial or non-commercial, and by any means.
 740 In jurisdictions that recognize copyright laws, the author or authors of this
 741 software dedicate any and all copyright interest in the software to the public
 742 domain. We make this dedication for the benefit of the public at large and to
 743 the detriment of our heirs and successors. We intend this dedication to be an
 744 overt act of relinquishment in perpetuity of all present and future rights to
 745 this software under copyright law.
 746 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 747 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 748 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 749 AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 750 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 751 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 752 ------------------------------------------------------------------------------
 753 */