MCDV/stb_dxt.h

   1 // stb_dxt.h - v1.08b - DXT1/DXT5 compressor - public domain
   2 // original by fabian "ryg" giesen - ported to C by stb
   3 // use '#define STB_DXT_IMPLEMENTATION' before including to create the implementation
   4 //
   5 // USAGE:
   6 //   call stb_compress_dxt_block() for every block (you must pad)
   7 //     source should be a 4x4 block of RGBA data in row-major order;
   8 //     A is ignored if you specify alpha=0; you can turn on dithering
   9 //     and "high quality" using mode.
  10 //
  11 // version history:
  12 //   v1.08  - (stb) fix bug in dxt-with-alpha block
  13 //   v1.07  - (stb) bc4; allow not using libc; add STB_DXT_STATIC
  14 //   v1.06  - (stb) fix to known-broken 1.05
  15 //   v1.05  - (stb) support bc5/3dc (Arvids Kokins), use extern "C" in C++ (Pavel Krajcevski)
  16 //   v1.04  - (ryg) default to no rounding bias for lerped colors (as per S3TC/DX10 spec);
  17 //            single color match fix (allow for inexact color interpolation);
  18 //            optimal DXT5 index finder; "high quality" mode that runs multiple refinement steps.
  19 //   v1.03  - (stb) endianness support
  20 //   v1.02  - (stb) fix alpha encoding bug
  21 //   v1.01  - (stb) fix bug converting to RGB that messed up quality, thanks ryg & cbloom
  22 //   v1.00  - (stb) first release
  23 //
  24 // contributors:
  25 //   Kevin Schmidt (#defines for "freestanding" compilation)
  26 //   github:ppiastucki (BC4 support)
  27 //
  28 // LICENSE
  29 //
  30 //   See end of file for license information.
  31
#ifndef STB_INCLUDE_STB_DXT_H
#define STB_INCLUDE_STB_DXT_H

#ifdef __cplusplus
extern "C" {
#endif

// define STB_DXT_STATIC to have the public functions declared 'static'
#ifdef STB_DXT_STATIC
#define STBDDEF static
#else
#define STBDDEF extern
#endif

// compression mode (bitflags)
#define STB_DXT_NORMAL    0
#define STB_DXT_DITHER    1   // use dithering. dubious win. never use for normal maps and the like!
#define STB_DXT_HIGHQUAL  2   // high quality mode, does two refinement steps instead of 1. ~30-40% slower.

// Compresses one 4x4 block of RGBA pixels (64 bytes, row-major).
//   alpha == 0: writes an 8-byte color block to dest; the A bytes are not encoded.
//   alpha != 0: writes an 8-byte alpha block followed by an 8-byte color block (16 bytes).
//   mode is a combination of the STB_DXT_* flags above.
STBDDEF void stb_compress_dxt_block(unsigned char *dest, const unsigned char *src_rgba_four_bytes_per_pixel, int alpha, int mode);
// Compresses 16 single-channel bytes into one 8-byte block (same encoding as the alpha block above).
STBDDEF void stb_compress_bc4_block(unsigned char *dest, const unsigned char *src_r_one_byte_per_pixel);
// Compresses 16 two-channel pixels (32 bytes) into two 8-byte blocks: first channel, then second (16 bytes).
STBDDEF void stb_compress_bc5_block(unsigned char *dest, const unsigned char *src_rg_two_byte_per_pixel);

#define STB_COMPRESS_DXT_BLOCK

#ifdef __cplusplus
}
#endif
#endif // STB_INCLUDE_STB_DXT_H
  60
  61 #ifdef STB_DXT_IMPLEMENTATION
  62
  63 // configuration options for DXT encoder. set them in the project/makefile or just define
  64 // them at the top.
  65
  66 // STB_DXT_USE_ROUNDING_BIAS
  67 //     use a rounding bias during color interpolation. this is closer to what "ideal"
  68 //     interpolation would do but doesn't match the S3TC/DX10 spec. old versions (pre-1.03)
  69 //     implicitly had this turned on.
  70 //
  71 //     in case you're targeting a specific type of hardware (e.g. console programmers):
  72 //     NVidia and Intel GPUs (as of 2010) as well as DX9 ref use DXT decoders that are closer
  73 //     to STB_DXT_USE_ROUNDING_BIAS. AMD/ATI, S3 and DX10 ref are closer to rounding with no bias.
  74 //     you also see "(a*5 + b*3) / 8" on some old GPU designs.
  75 // #define STB_DXT_USE_ROUNDING_BIAS
  76
  77 #include <stdlib.h>
  78
  79 #if !defined(STBD_ABS) || !defined(STBI_FABS)
  80 #include <math.h>
  81 #endif
  82
  83 #ifndef STBD_ABS
  84 #define STBD_ABS(i)           abs(i)
  85 #endif
  86
  87 #ifndef STBD_FABS
  88 #define STBD_FABS(x)          fabs(x)
  89 #endif
  90
  91 #ifndef STBD_MEMSET
  92 #include <string.h>
  93 #define STBD_MEMSET           memset
  94 #endif
  95
// Lookup tables; all of them are filled in by stb__InitDXT().
static unsigned char stb__Expand5[32];          // 5-bit value -> 8-bit value (top bits replicated into the low bits)
static unsigned char stb__Expand6[64];          // 6-bit value -> 8-bit value (top bits replicated into the low bits)
static unsigned char stb__OMatch5[256][2];      // per 8-bit value: 5-bit endpoint pair picked by stb__PrepareOptTable
static unsigned char stb__OMatch6[256][2];      // per 8-bit value: 6-bit endpoint pair picked by stb__PrepareOptTable
static unsigned char stb__QuantRBTab[256 + 16]; // entry i: clamp(i-8, 0, 255) quantized to 5 bits and expanded back to 8
static unsigned char stb__QuantGTab[256 + 16];  // entry i: clamp(i-8, 0, 255) quantized to 6 bits and expanded back to 8
 102
 103 static int stb__Mul8Bit(int a, int b)
 104 {
 105         int t = a * b + 128;
 106         return (t + (t >> 8)) >> 8;
 107 }
 108
 109 static void stb__From16Bit(unsigned char *out, unsigned short v)
 110 {
 111         int rv = (v & 0xf800) >> 11;
 112         int gv = (v & 0x07e0) >> 5;
 113         int bv = (v & 0x001f) >> 0;
 114
 115         out[0] = stb__Expand5[rv];
 116         out[1] = stb__Expand6[gv];
 117         out[2] = stb__Expand5[bv];
 118         out[3] = 0;
 119 }
 120
 121 static unsigned short stb__As16Bit(int r, int g, int b)
 122 {
 123         return (stb__Mul8Bit(r, 31) << 11) + (stb__Mul8Bit(g, 63) << 5) + stb__Mul8Bit(b, 31);
 124 }
 125
 126 // linear interpolation at 1/3 point between a and b, using desired rounding type
 127 static int stb__Lerp13(int a, int b)
 128 {
 129 #ifdef STB_DXT_USE_ROUNDING_BIAS
 130         // with rounding bias
 131         return a + stb__Mul8Bit(b - a, 0x55);
 132 #else
 133         // without rounding bias
 134         // replace "/ 3" by "* 0xaaab) >> 17" if your compiler sucks or you really need every ounce of speed.
 135         return (2 * a + b) / 3;
 136 #endif
 137 }
 138
 139 // lerp RGB color
 140 static void stb__Lerp13RGB(unsigned char *out, unsigned char *p1, unsigned char *p2)
 141 {
 142         out[0] = stb__Lerp13(p1[0], p2[0]);
 143         out[1] = stb__Lerp13(p1[1], p2[1]);
 144         out[2] = stb__Lerp13(p1[2], p2[2]);
 145 }
 146
 147 /****************************************************************************/
 148
 149 // compute table to reproduce constant colors as accurately as possible
 150 static void stb__PrepareOptTable(unsigned char *Table, const unsigned char *expand, int size)
 151 {
 152         int i, mn, mx;
 153         for (i = 0; i<256; i++) {
 154                 int bestErr = 256;
 155                 for (mn = 0; mn<size; mn++) {
 156                         for (mx = 0; mx<size; mx++) {
 157                                 int mine = expand[mn];
 158                                 int maxe = expand[mx];
 159                                 int err = STBD_ABS(stb__Lerp13(maxe, mine) - i);
 160
 161                                 // DX10 spec says that interpolation must be within 3% of "correct" result,
 162                                 // add this as error term. (normally we'd expect a random distribution of
 163                                 // +-1.5% error, but nowhere in the spec does it say that the error has to be
 164                                 // unbiased - better safe than sorry).
 165                                 err += STBD_ABS(maxe - mine) * 3 / 100;
 166
 167                                 if (err < bestErr)
 168                                 {
 169                                         Table[i * 2 + 0] = mx;
 170                                         Table[i * 2 + 1] = mn;
 171                                         bestErr = err;
 172                                 }
 173                         }
 174                 }
 175         }
 176 }
 177
 178 static void stb__EvalColors(unsigned char *color, unsigned short c0, unsigned short c1)
 179 {
 180         stb__From16Bit(color + 0, c0);
 181         stb__From16Bit(color + 4, c1);
 182         stb__Lerp13RGB(color + 8, color + 0, color + 4);
 183         stb__Lerp13RGB(color + 12, color + 4, color + 0);
 184 }
 185
 186 // Block dithering function. Simply dithers a block to 565 RGB.
 187 // (Floyd-Steinberg)
 188 static void stb__DitherBlock(unsigned char *dest, unsigned char *block)
 189 {
 190         int err[8], *ep1 = err, *ep2 = err + 4, *et;
 191         int ch, y;
 192
 193         // process channels separately
 194         for (ch = 0; ch<3; ++ch) {
 195                 unsigned char *bp = block + ch, *dp = dest + ch;
 196                 unsigned char *quant = (ch == 1) ? stb__QuantGTab + 8 : stb__QuantRBTab + 8;
 197                 STBD_MEMSET(err, 0, sizeof(err));
 198                 for (y = 0; y<4; ++y) {
 199                         dp[0] = quant[bp[0] + ((3 * ep2[1] + 5 * ep2[0]) >> 4)];
 200                         ep1[0] = bp[0] - dp[0];
 201                         dp[4] = quant[bp[4] + ((7 * ep1[0] + 3 * ep2[2] + 5 * ep2[1] + ep2[0]) >> 4)];
 202                         ep1[1] = bp[4] - dp[4];
 203                         dp[8] = quant[bp[8] + ((7 * ep1[1] + 3 * ep2[3] + 5 * ep2[2] + ep2[1]) >> 4)];
 204                         ep1[2] = bp[8] - dp[8];
 205                         dp[12] = quant[bp[12] + ((7 * ep1[2] + 5 * ep2[3] + ep2[2]) >> 4)];
 206                         ep1[3] = bp[12] - dp[12];
 207                         bp += 16;
 208                         dp += 16;
 209                         et = ep1, ep1 = ep2, ep2 = et; // swap
 210                 }
 211         }
 212 }
 213
 214 // The color matching function
 215 static unsigned int stb__MatchColorsBlock(unsigned char *block, unsigned char *color, int dither)
 216 {
 217         unsigned int mask = 0;
 218         int dirr = color[0 * 4 + 0] - color[1 * 4 + 0];
 219         int dirg = color[0 * 4 + 1] - color[1 * 4 + 1];
 220         int dirb = color[0 * 4 + 2] - color[1 * 4 + 2];
 221         int dots[16];
 222         int stops[4];
 223         int i;
 224         int c0Point, halfPoint, c3Point;
 225
 226         for (i = 0; i<16; i++)
 227                 dots[i] = block[i * 4 + 0] * dirr + block[i * 4 + 1] * dirg + block[i * 4 + 2] * dirb;
 228
 229         for (i = 0; i<4; i++)
 230                 stops[i] = color[i * 4 + 0] * dirr + color[i * 4 + 1] * dirg + color[i * 4 + 2] * dirb;
 231
 232         // think of the colors as arranged on a line; project point onto that line, then choose
 233         // next color out of available ones. we compute the crossover points for "best color in top
 234         // half"/"best in bottom half" and then the same inside that subinterval.
 235         //
 236         // relying on this 1d approximation isn't always optimal in terms of euclidean distance,
 237         // but it's very close and a lot faster.
 238         // http://cbloomrants.blogspot.com/2008/12/12-08-08-dxtc-summary.html
 239
 240         c0Point = (stops[1] + stops[3]) >> 1;
 241         halfPoint = (stops[3] + stops[2]) >> 1;
 242         c3Point = (stops[2] + stops[0]) >> 1;
 243
 244         if (!dither) {
 245                 // the version without dithering is straightforward
 246                 for (i = 15; i >= 0; i--) {
 247                         int dot = dots[i];
 248                         mask <<= 2;
 249
 250                         if (dot < halfPoint)
 251                                 mask |= (dot < c0Point) ? 1 : 3;
 252                         else
 253                                 mask |= (dot < c3Point) ? 2 : 0;
 254                 }
 255         }
 256         else {
 257                 // with floyd-steinberg dithering
 258                 int err[8], *ep1 = err, *ep2 = err + 4;
 259                 int *dp = dots, y;
 260
 261                 c0Point <<= 4;
 262                 halfPoint <<= 4;
 263                 c3Point <<= 4;
 264                 for (i = 0; i<8; i++)
 265                         err[i] = 0;
 266
 267                 for (y = 0; y<4; y++)
 268                 {
 269                         int dot, lmask, step;
 270
 271                         dot = (dp[0] << 4) + (3 * ep2[1] + 5 * ep2[0]);
 272                         if (dot < halfPoint)
 273                                 step = (dot < c0Point) ? 1 : 3;
 274                         else
 275                                 step = (dot < c3Point) ? 2 : 0;
 276                         ep1[0] = dp[0] - stops[step];
 277                         lmask = step;
 278
 279                         dot = (dp[1] << 4) + (7 * ep1[0] + 3 * ep2[2] + 5 * ep2[1] + ep2[0]);
 280                         if (dot < halfPoint)
 281                                 step = (dot < c0Point) ? 1 : 3;
 282                         else
 283                                 step = (dot < c3Point) ? 2 : 0;
 284                         ep1[1] = dp[1] - stops[step];
 285                         lmask |= step << 2;
 286
 287                         dot = (dp[2] << 4) + (7 * ep1[1] + 3 * ep2[3] + 5 * ep2[2] + ep2[1]);
 288                         if (dot < halfPoint)
 289                                 step = (dot < c0Point) ? 1 : 3;
 290                         else
 291                                 step = (dot < c3Point) ? 2 : 0;
 292                         ep1[2] = dp[2] - stops[step];
 293                         lmask |= step << 4;
 294
 295                         dot = (dp[3] << 4) + (7 * ep1[2] + 5 * ep2[3] + ep2[2]);
 296                         if (dot < halfPoint)
 297                                 step = (dot < c0Point) ? 1 : 3;
 298                         else
 299                                 step = (dot < c3Point) ? 2 : 0;
 300                         ep1[3] = dp[3] - stops[step];
 301                         lmask |= step << 6;
 302
 303                         dp += 4;
 304                         mask |= lmask << (y * 8);
 305                         { int *et = ep1; ep1 = ep2; ep2 = et; } // swap
 306                 }
 307         }
 308
 309         return mask;
 310 }
 311
 312 // The color optimization function. (Clever code, part 1)
 313 static void stb__OptimizeColorsBlock(unsigned char *block, unsigned short *pmax16, unsigned short *pmin16)
 314 {
 315         int mind = 0x7fffffff, maxd = -0x7fffffff;
 316         unsigned char *minp, *maxp;
 317         double magn;
 318         int v_r, v_g, v_b;
 319         static const int nIterPower = 4;
 320         float covf[6], vfr, vfg, vfb;
 321
 322         // determine color distribution
 323         int cov[6];
 324         int mu[3], min[3], max[3];
 325         int ch, i, iter;
 326
 327         for (ch = 0; ch<3; ch++)
 328         {
 329                 const unsigned char *bp = ((const unsigned char *)block) + ch;
 330                 int muv, minv, maxv;
 331
 332                 muv = minv = maxv = bp[0];
 333                 for (i = 4; i<64; i += 4)
 334                 {
 335                         muv += bp[i];
 336                         if (bp[i] < minv) minv = bp[i];
 337                         else if (bp[i] > maxv) maxv = bp[i];
 338                 }
 339
 340                 mu[ch] = (muv + 8) >> 4;
 341                 min[ch] = minv;
 342                 max[ch] = maxv;
 343         }
 344
 345         // determine covariance matrix
 346         for (i = 0; i<6; i++)
 347                 cov[i] = 0;
 348
 349         for (i = 0; i<16; i++)
 350         {
 351                 int r = block[i * 4 + 0] - mu[0];
 352                 int g = block[i * 4 + 1] - mu[1];
 353                 int b = block[i * 4 + 2] - mu[2];
 354
 355                 cov[0] += r * r;
 356                 cov[1] += r * g;
 357                 cov[2] += r * b;
 358                 cov[3] += g * g;
 359                 cov[4] += g * b;
 360                 cov[5] += b * b;
 361         }
 362
 363         // convert covariance matrix to float, find principal axis via power iter
 364         for (i = 0; i<6; i++)
 365                 covf[i] = cov[i] / 255.0f;
 366
 367         vfr = (float)(max[0] - min[0]);
 368         vfg = (float)(max[1] - min[1]);
 369         vfb = (float)(max[2] - min[2]);
 370
 371         for (iter = 0; iter<nIterPower; iter++)
 372         {
 373                 float r = vfr * covf[0] + vfg * covf[1] + vfb * covf[2];
 374                 float g = vfr * covf[1] + vfg * covf[3] + vfb * covf[4];
 375                 float b = vfr * covf[2] + vfg * covf[4] + vfb * covf[5];
 376
 377                 vfr = r;
 378                 vfg = g;
 379                 vfb = b;
 380         }
 381
 382         magn = STBD_FABS(vfr);
 383         if (STBD_FABS(vfg) > magn) magn = STBD_FABS(vfg);
 384         if (STBD_FABS(vfb) > magn) magn = STBD_FABS(vfb);
 385
 386         if (magn < 4.0f) { // too small, default to luminance
 387                 v_r = 299; // JPEG YCbCr luma coefs, scaled by 1000.
 388                 v_g = 587;
 389                 v_b = 114;
 390         }
 391         else {
 392                 magn = 512.0 / magn;
 393                 v_r = (int)(vfr * magn);
 394                 v_g = (int)(vfg * magn);
 395                 v_b = (int)(vfb * magn);
 396         }
 397
 398         // Pick colors at extreme points
 399         for (i = 0; i<16; i++)
 400         {
 401                 int dot = block[i * 4 + 0] * v_r + block[i * 4 + 1] * v_g + block[i * 4 + 2] * v_b;
 402
 403                 if (dot < mind) {
 404                         mind = dot;
 405                         minp = block + i * 4;
 406                 }
 407
 408                 if (dot > maxd) {
 409                         maxd = dot;
 410                         maxp = block + i * 4;
 411                 }
 412         }
 413
 414         *pmax16 = stb__As16Bit(maxp[0], maxp[1], maxp[2]);
 415         *pmin16 = stb__As16Bit(minp[0], minp[1], minp[2]);
 416 }
 417
 418 static int stb__sclamp(float y, int p0, int p1)
 419 {
 420         int x = (int)y;
 421         if (x < p0) return p0;
 422         if (x > p1) return p1;
 423         return x;
 424 }
 425
 426 // The refinement function. (Clever code, part 2)
 427 // Tries to optimize colors to suit block contents better.
 428 // (By solving a least squares system via normal equations+Cramer's rule)
 429 static int stb__RefineBlock(unsigned char *block, unsigned short *pmax16, unsigned short *pmin16, unsigned int mask)
 430 {
 431         static const int w1Tab[4] = { 3,0,2,1 };
 432         static const int prods[4] = { 0x090000,0x000900,0x040102,0x010402 };
 433         // ^some magic to save a lot of multiplies in the accumulating loop...
 434         // (precomputed products of weights for least squares system, accumulated inside one 32-bit register)
 435
 436         float frb, fg;
 437         unsigned short oldMin, oldMax, min16, max16;
 438         int i, akku = 0, xx, xy, yy;
 439         int At1_r, At1_g, At1_b;
 440         int At2_r, At2_g, At2_b;
 441         unsigned int cm = mask;
 442
 443         oldMin = *pmin16;
 444         oldMax = *pmax16;
 445
 446         if ((mask ^ (mask << 2)) < 4) // all pixels have the same index?
 447         {
 448                 // yes, linear system would be singular; solve using optimal
 449                 // single-color match on average color
 450                 int r = 8, g = 8, b = 8;
 451                 for (i = 0; i<16; ++i) {
 452                         r += block[i * 4 + 0];
 453                         g += block[i * 4 + 1];
 454                         b += block[i * 4 + 2];
 455                 }
 456
 457                 r >>= 4; g >>= 4; b >>= 4;
 458
 459                 max16 = (stb__OMatch5[r][0] << 11) | (stb__OMatch6[g][0] << 5) | stb__OMatch5[b][0];
 460                 min16 = (stb__OMatch5[r][1] << 11) | (stb__OMatch6[g][1] << 5) | stb__OMatch5[b][1];
 461         }
 462         else {
 463                 At1_r = At1_g = At1_b = 0;
 464                 At2_r = At2_g = At2_b = 0;
 465                 for (i = 0; i<16; ++i, cm >>= 2) {
 466                         int step = cm & 3;
 467                         int w1 = w1Tab[step];
 468                         int r = block[i * 4 + 0];
 469                         int g = block[i * 4 + 1];
 470                         int b = block[i * 4 + 2];
 471
 472                         akku += prods[step];
 473                         At1_r += w1 * r;
 474                         At1_g += w1 * g;
 475                         At1_b += w1 * b;
 476                         At2_r += r;
 477                         At2_g += g;
 478                         At2_b += b;
 479                 }
 480
 481                 At2_r = 3 * At2_r - At1_r;
 482                 At2_g = 3 * At2_g - At1_g;
 483                 At2_b = 3 * At2_b - At1_b;
 484
 485                 // extract solutions and decide solvability
 486                 xx = akku >> 16;
 487                 yy = (akku >> 8) & 0xff;
 488                 xy = (akku >> 0) & 0xff;
 489
 490                 frb = 3.0f * 31.0f / 255.0f / (xx*yy - xy * xy);
 491                 fg = frb * 63.0f / 31.0f;
 492
 493                 // solve.
 494                 max16 = stb__sclamp((At1_r*yy - At2_r * xy)*frb + 0.5f, 0, 31) << 11;
 495                 max16 |= stb__sclamp((At1_g*yy - At2_g * xy)*fg + 0.5f, 0, 63) << 5;
 496                 max16 |= stb__sclamp((At1_b*yy - At2_b * xy)*frb + 0.5f, 0, 31) << 0;
 497
 498                 min16 = stb__sclamp((At2_r*xx - At1_r * xy)*frb + 0.5f, 0, 31) << 11;
 499                 min16 |= stb__sclamp((At2_g*xx - At1_g * xy)*fg + 0.5f, 0, 63) << 5;
 500                 min16 |= stb__sclamp((At2_b*xx - At1_b * xy)*frb + 0.5f, 0, 31) << 0;
 501         }
 502
 503         *pmin16 = min16;
 504         *pmax16 = max16;
 505         return oldMin != min16 || oldMax != max16;
 506 }
 507
 508 // Color block compression
 509 static void stb__CompressColorBlock(unsigned char *dest, unsigned char *block, int mode)
 510 {
 511         unsigned int mask;
 512         int i;
 513         int dither;
 514         int refinecount;
 515         unsigned short max16, min16;
 516         unsigned char dblock[16 * 4], color[4 * 4];
 517
 518         dither = mode & STB_DXT_DITHER;
 519         refinecount = (mode & STB_DXT_HIGHQUAL) ? 2 : 1;
 520
 521         // check if block is constant
 522         for (i = 1; i<16; i++)
 523                 if (((unsigned int *)block)[i] != ((unsigned int *)block)[0])
 524                         break;
 525
 526         if (i == 16) { // constant color
 527                 int r = block[0], g = block[1], b = block[2];
 528                 mask = 0xaaaaaaaa;
 529                 max16 = (stb__OMatch5[r][0] << 11) | (stb__OMatch6[g][0] << 5) | stb__OMatch5[b][0];
 530                 min16 = (stb__OMatch5[r][1] << 11) | (stb__OMatch6[g][1] << 5) | stb__OMatch5[b][1];
 531         }
 532         else {
 533                 // first step: compute dithered version for PCA if desired
 534                 if (dither)
 535                         stb__DitherBlock(dblock, block);
 536
 537                 // second step: pca+map along principal axis
 538                 stb__OptimizeColorsBlock(dither ? dblock : block, &max16, &min16);
 539                 if (max16 != min16) {
 540                         stb__EvalColors(color, max16, min16);
 541                         mask = stb__MatchColorsBlock(block, color, dither);
 542                 }
 543                 else
 544                         mask = 0;
 545
 546                 // third step: refine (multiple times if requested)
 547                 for (i = 0; i<refinecount; i++) {
 548                         unsigned int lastmask = mask;
 549
 550                         if (stb__RefineBlock(dither ? dblock : block, &max16, &min16, mask)) {
 551                                 if (max16 != min16) {
 552                                         stb__EvalColors(color, max16, min16);
 553                                         mask = stb__MatchColorsBlock(block, color, dither);
 554                                 }
 555                                 else {
 556                                         mask = 0;
 557                                         break;
 558                                 }
 559                         }
 560
 561                         if (mask == lastmask)
 562                                 break;
 563                 }
 564         }
 565
 566         // write the color block
 567         if (max16 < min16)
 568         {
 569                 unsigned short t = min16;
 570                 min16 = max16;
 571                 max16 = t;
 572                 mask ^= 0x55555555;
 573         }
 574
 575         dest[0] = (unsigned char)(max16);
 576         dest[1] = (unsigned char)(max16 >> 8);
 577         dest[2] = (unsigned char)(min16);
 578         dest[3] = (unsigned char)(min16 >> 8);
 579         dest[4] = (unsigned char)(mask);
 580         dest[5] = (unsigned char)(mask >> 8);
 581         dest[6] = (unsigned char)(mask >> 16);
 582         dest[7] = (unsigned char)(mask >> 24);
 583 }
 584
 585 // Alpha block compression (this is easy for a change)
 586 static void stb__CompressAlphaBlock(unsigned char *dest, unsigned char *src, int stride)
 587 {
 588         int i, dist, bias, dist4, dist2, bits, mask;
 589
 590         // find min/max color
 591         int mn, mx;
 592         mn = mx = src[0];
 593
 594         for (i = 1; i<16; i++)
 595         {
 596                 if (src[i*stride] < mn) mn = src[i*stride];
 597                 else if (src[i*stride] > mx) mx = src[i*stride];
 598         }
 599
 600         // encode them
 601         ((unsigned char *)dest)[0] = mx;
 602         ((unsigned char *)dest)[1] = mn;
 603         dest += 2;
 604
 605         // determine bias and emit color indices
 606         // given the choice of mx/mn, these indices are optimal:
 607         // http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/
 608         dist = mx - mn;
 609         dist4 = dist * 4;
 610         dist2 = dist * 2;
 611         bias = (dist < 8) ? (dist - 1) : (dist / 2 + 2);
 612         bias -= mn * 7;
 613         bits = 0, mask = 0;
 614
 615         for (i = 0; i<16; i++) {
 616                 int a = src[i*stride] * 7 + bias;
 617                 int ind, t;
 618
 619                 // select index. this is a "linear scale" lerp factor between 0 (val=min) and 7 (val=max).
 620                 t = (a >= dist4) ? -1 : 0; ind = t & 4; a -= dist4 & t;
 621                 t = (a >= dist2) ? -1 : 0; ind += t & 2; a -= dist2 & t;
 622                 ind += (a >= dist);
 623
 624                 // turn linear scale into DXT index (0/1 are extremal pts)
 625                 ind = -ind & 7;
 626                 ind ^= (2 > ind);
 627
 628                 // write index
 629                 mask |= ind << bits;
 630                 if ((bits += 3) >= 8) {
 631                         *dest++ = mask;
 632                         mask >>= 8;
 633                         bits -= 8;
 634                 }
 635         }
 636 }
 637
 638 static void stb__InitDXT()
 639 {
 640         int i;
 641         for (i = 0; i<32; i++)
 642                 stb__Expand5[i] = (i << 3) | (i >> 2);
 643
 644         for (i = 0; i<64; i++)
 645                 stb__Expand6[i] = (i << 2) | (i >> 4);
 646
 647         for (i = 0; i<256 + 16; i++)
 648         {
 649                 int v = i - 8 < 0 ? 0 : i - 8 > 255 ? 255 : i - 8;
 650                 stb__QuantRBTab[i] = stb__Expand5[stb__Mul8Bit(v, 31)];
 651                 stb__QuantGTab[i] = stb__Expand6[stb__Mul8Bit(v, 63)];
 652         }
 653
 654         stb__PrepareOptTable(&stb__OMatch5[0][0], stb__Expand5, 32);
 655         stb__PrepareOptTable(&stb__OMatch6[0][0], stb__Expand6, 64);
 656 }
 657
 658 void stb_compress_dxt_block(unsigned char *dest, const unsigned char *src, int alpha, int mode)
 659 {
 660         unsigned char data[16][4];
 661         static int init = 1;
 662         if (init) {
 663                 stb__InitDXT();
 664                 init = 0;
 665         }
 666
 667         if (alpha) {
 668                 int i;
 669                 stb__CompressAlphaBlock(dest, (unsigned char*)src + 3, 4);
 670                 dest += 8;
 671                 // make a new copy of the data in which alpha is opaque,
 672                 // because code uses a fast test for color constancy
 673                 memcpy(data, src, 4 * 16);
 674                 for (i = 0; i < 16; ++i)
 675                         data[i][3] = 255;
 676                 src = &data[0][0];
 677         }
 678
 679         stb__CompressColorBlock(dest, (unsigned char*)src, mode);
 680 }
 681
 682 void stb_compress_bc4_block(unsigned char *dest, const unsigned char *src)
 683 {
 684         stb__CompressAlphaBlock(dest, (unsigned char*)src, 1);
 685 }
 686
 687 void stb_compress_bc5_block(unsigned char *dest, const unsigned char *src)
 688 {
 689         stb__CompressAlphaBlock(dest, (unsigned char*)src, 2);
 690         stb__CompressAlphaBlock(dest + 8, (unsigned char*)src + 1, 2);
 691 }
 692 #endif // STB_DXT_IMPLEMENTATION
 693
 694 /*
 695 ------------------------------------------------------------------------------
 696 This software is available under 2 licenses -- choose whichever you prefer.
 697 ------------------------------------------------------------------------------
 698 ALTERNATIVE A - MIT License
 699 Copyright (c) 2017 Sean Barrett
 700 Permission is hereby granted, free of charge, to any person obtaining a copy of
 701 this software and associated documentation files (the "Software"), to deal in
 702 the Software without restriction, including without limitation the rights to
 703 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 704 of the Software, and to permit persons to whom the Software is furnished to do
 705 so, subject to the following conditions:
 706 The above copyright notice and this permission notice shall be included in all
 707 copies or substantial portions of the Software.
 708 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 709 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 710 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 711 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 712 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 713 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 714 SOFTWARE.
 715 ------------------------------------------------------------------------------
 716 ALTERNATIVE B - Public Domain (www.unlicense.org)
 717 This is free and unencumbered software released into the public domain.
 718 Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
 719 software, either in source code form or as a compiled binary, for any purpose,
 720 commercial or non-commercial, and by any means.
 721 In jurisdictions that recognize copyright laws, the author or authors of this
 722 software dedicate any and all copyright interest in the software to the public
 723 domain. We make this dedication for the benefit of the public at large and to
 724 the detriment of our heirs and successors. We intend this dedication to be an
 725 overt act of relinquishment in perpetuity of all present and future rights to
 726 this software under copyright law.
 727 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 728 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 729 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 730 AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 731 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 732 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 733 ------------------------------------------------------------------------------
 734 */