git.cworth.org Git - vogl/blob - src/voglcore/vogl_dxt_fast.cpp

   1 /**************************************************************************
   2  *
   3  * Copyright 2013-2014 RAD Game Tools and Valve Software
   4  * Copyright 2010-2014 Rich Geldreich and Tenacious Software LLC
   5  * All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a copy
   8  * of this software and associated documentation files (the "Software"), to deal
   9  * in the Software without restriction, including without limitation the rights
  10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11  * copies of the Software, and to permit persons to whom the Software is
  12  * furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included in
  15  * all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  23  * THE SOFTWARE.
  24  *
  25  **************************************************************************/
  26
  27 // File: vogl_dxt_fast.cpp
  28 // Parts of this module are derived from RYG's excellent public domain DXTx compressor.
  29 #include "vogl_core.h"
  30 #include "vogl_dxt_fast.h"
  31 #include "vogl_ryg_dxt.hpp"
  32
  33 namespace vogl
  34 {
  35     namespace dxt_fast
  36     {
  37         static inline int mul_8bit(int a, int b)
  38         {
  39             int t = a * b + 128;
  40             return (t + (t >> 8)) >> 8;
  41         }
  42
  43         static inline color_quad_u8 &unpack_color(color_quad_u8 &c, uint v)
  44         {
  45             uint rv = (v & 0xf800) >> 11;
  46             uint gv = (v & 0x07e0) >> 5;
  47             uint bv = (v & 0x001f) >> 0;
  48
  49             c.r = ryg_dxt::Expand5[rv];
  50             c.g = ryg_dxt::Expand6[gv];
  51             c.b = ryg_dxt::Expand5[bv];
  52             c.a = 0;
  53
  54             return c;
  55         }
  56
  57         static inline uint pack_color(const color_quad_u8 &c)
  58         {
  59             return (mul_8bit(c.r, 31) << 11) + (mul_8bit(c.g, 63) << 5) + mul_8bit(c.b, 31);
  60         }
  61
  62 #if 0
  63         static inline void lerp_color(color_quad_u8 &result, const color_quad_u8 &p1, const color_quad_u8 &p2, uint f)
  64         {
  65             VOGL_ASSERT(f <= 255);
  66
  67             result.r = static_cast<uint8>(p1.r + mul_8bit(p2.r - p1.r, f));
  68             result.g = static_cast<uint8>(p1.g + mul_8bit(p2.g - p1.g, f));
  69             result.b = static_cast<uint8>(p1.b + mul_8bit(p2.b - p1.b, f));
  70         }
  71 #endif
  72
  73         static inline void eval_colors(color_quad_u8 *pColors, uint c0, uint c1)
  74         {
  75             unpack_color(pColors[0], c0);
  76             unpack_color(pColors[1], c1);
  77
  78 #if 0
  79             lerp_color(pColors[2], pColors[0], pColors[1], 0x55);
  80             lerp_color(pColors[3], pColors[0], pColors[1], 0xAA);
  81 #else
  82             pColors[2].r = (pColors[0].r * 2 + pColors[1].r) / 3;
  83             pColors[2].g = (pColors[0].g * 2 + pColors[1].g) / 3;
  84             pColors[2].b = (pColors[0].b * 2 + pColors[1].b) / 3;
  85
  86             pColors[3].r = (pColors[1].r * 2 + pColors[0].r) / 3;
  87             pColors[3].g = (pColors[1].g * 2 + pColors[0].g) / 3;
  88             pColors[3].b = (pColors[1].b * 2 + pColors[0].b) / 3;
  89 #endif
  90         }
  91
  92         // false if all selectors equal
  93         static bool match_block_colors(uint n, const color_quad_u8 *pBlock, const color_quad_u8 *pColors, uint8 *pSelectors)
  94         {
  95             int dirr = pColors[0].r - pColors[1].r;
  96             int dirg = pColors[0].g - pColors[1].g;
  97             int dirb = pColors[0].b - pColors[1].b;
  98
  99             int stops[4];
 100             for (int i = 0; i < 4; i++)
 101                 stops[i] = pColors[i].r * dirr + pColors[i].g * dirg + pColors[i].b * dirb;
 102
 103             // 0 2 3 1
 104             int c0Point = stops[1] + stops[3];
 105             int halfPoint = stops[3] + stops[2];
 106             int c3Point = stops[2] + stops[0];
 107
 108             //dirr *= 2;
 109             //dirg *= 2;
 110             //dirb *= 2;
 111             c0Point >>= 1;
 112             halfPoint >>= 1;
 113             c3Point >>= 1;
 114
 115             bool status = false;
 116             for (uint i = 0; i < n; i++)
 117             {
 118                 int dot = pBlock[i].r * dirr + pBlock[i].g * dirg + pBlock[i].b * dirb;
 119
 120                 uint8 s;
 121                 if (dot < halfPoint)
 122                     s = (dot < c0Point) ? 1 : 3;
 123                 else
 124                     s = (dot < c3Point) ? 2 : 0;
 125
 126                 pSelectors[i] = s;
 127
 128                 if (s != pSelectors[0])
 129                     status = true;
 130             }
 131
 132             return status;
 133         }
 134
 135         static bool optimize_block_colors(uint n, const color_quad_u8 *block, uint &max16, uint &min16, uint ave_color[3], float axis[3])
 136         {
 137             int min[3], max[3];
 138
 139             for (uint ch = 0; ch < 3; ch++)
 140             {
 141                 const uint8 *bp = ((const uint8 *)block) + ch;
 142                 int minv, maxv;
 143
 144                 int64_t muv = bp[0];
 145                 minv = maxv = bp[0];
 146
 147                 const uint l = n << 2;
 148                 for (uint i = 4; i < l; i += 4)
 149                 {
 150                     muv += bp[i];
 151                     minv = math::minimum<int>(minv, bp[i]);
 152                     maxv = math::maximum<int>(maxv, bp[i]);
 153                 }
 154
 155                 ave_color[ch] = static_cast<int>((muv + (n / 2)) / n);
 156                 min[ch] = minv;
 157                 max[ch] = maxv;
 158             }
 159
 160             if ((min[0] == max[0]) && (min[1] == max[1]) && (min[2] == max[2]))
 161                 return false;
 162
 163             // determine covariance matrix
 164             double cov[6];
 165             for (int i = 0; i < 6; i++)
 166                 cov[i] = 0;
 167
 168             for (uint i = 0; i < n; i++)
 169             {
 170                 double r = (int)block[i].r - (int)ave_color[0];
 171                 double g = (int)block[i].g - (int)ave_color[1];
 172                 double b = (int)block[i].b - (int)ave_color[2];
 173
 174                 cov[0] += r * r;
 175                 cov[1] += r * g;
 176                 cov[2] += r * b;
 177                 cov[3] += g * g;
 178                 cov[4] += g * b;
 179                 cov[5] += b * b;
 180             }
 181
 182             double covf[6], vfr, vfg, vfb;
 183             for (int i = 0; i < 6; i++)
 184                 covf[i] = cov[i] * (1.0f / 255.0f);
 185
 186             vfr = max[0] - min[0];
 187             vfg = max[1] - min[1];
 188             vfb = max[2] - min[2];
 189
 190             static const uint nIterPower = 4;
 191             for (uint iter = 0; iter < nIterPower; iter++)
 192             {
 193                 double r = vfr * covf[0] + vfg * covf[1] + vfb * covf[2];
 194                 double g = vfr * covf[1] + vfg * covf[3] + vfb * covf[4];
 195                 double b = vfr * covf[2] + vfg * covf[4] + vfb * covf[5];
 196
 197                 vfr = r;
 198                 vfg = g;
 199                 vfb = b;
 200             }
 201
 202             double magn = math::maximum(math::maximum(fabs(vfr), fabs(vfg)), fabs(vfb));
 203             int v_r, v_g, v_b;
 204
 205             if (magn < 4.0f) // too small, default to luminance
 206             {
 207                 v_r = 148;
 208                 v_g = 300;
 209                 v_b = 58;
 210
 211                 axis[0] = (float)v_r;
 212                 axis[1] = (float)v_g;
 213                 axis[2] = (float)v_b;
 214             }
 215             else
 216             {
 217                 magn = 512.0f / magn;
 218                 vfr *= magn;
 219                 vfg *= magn;
 220                 vfb *= magn;
 221                 v_r = static_cast<int>(vfr);
 222                 v_g = static_cast<int>(vfg);
 223                 v_b = static_cast<int>(vfb);
 224
 225                 axis[0] = (float)vfr;
 226                 axis[1] = (float)vfg;
 227                 axis[2] = (float)vfb;
 228             }
 229
 230             int mind = block[0].r * v_r + block[0].g * v_g + block[0].b * v_b;
 231             int maxd = mind;
 232             color_quad_u8 minp(block[0]);
 233             color_quad_u8 maxp(block[0]);
 234
 235             for (uint i = 1; i < n; i++)
 236             {
 237                 int dot = block[i].r * v_r + block[i].g * v_g + block[i].b * v_b;
 238
 239                 if (dot < mind)
 240                 {
 241                     mind = dot;
 242                     minp = block[i];
 243                 }
 244
 245                 if (dot > maxd)
 246                 {
 247                     maxd = dot;
 248                     maxp = block[i];
 249                 }
 250             }
 251
 252             max16 = pack_color(maxp);
 253             min16 = pack_color(minp);
 254
 255             return true;
 256         }
 257
 258         // The refinement function. (Clever code, part 2)
 259         // Tries to optimize colors to suit block contents better.
 260         // (By solving a least squares system via normal equations+Cramer's rule)
 261         static bool refine_block(uint n, const color_quad_u8 *block, uint &max16, uint &min16, const uint8 *pSelectors)
 262         {
 263             static const int w1Tab[4] = { 3, 0, 2, 1 };
 264
 265             static const int prods_0[4] = { 0x00, 0x00, 0x02, 0x02 };
 266             static const int prods_1[4] = { 0x00, 0x09, 0x01, 0x04 };
 267             static const int prods_2[4] = { 0x09, 0x00, 0x04, 0x01 };
 268
 269             double akku_0 = 0;
 270             double akku_1 = 0;
 271             double akku_2 = 0;
 272             double At1_r, At1_g, At1_b;
 273             double At2_r, At2_g, At2_b;
 274
 275             At1_r = At1_g = At1_b = 0;
 276             At2_r = At2_g = At2_b = 0;
 277             for (uint i = 0; i < n; i++)
 278             {
 279                 double r = block[i].r;
 280                 double g = block[i].g;
 281                 double b = block[i].b;
 282                 int step = pSelectors[i];
 283
 284                 int w1 = w1Tab[step];
 285
 286                 akku_0 += prods_0[step];
 287                 akku_1 += prods_1[step];
 288                 akku_2 += prods_2[step];
 289                 At1_r += w1 * r;
 290                 At1_g += w1 * g;
 291                 At1_b += w1 * b;
 292                 At2_r += r;
 293                 At2_g += g;
 294                 At2_b += b;
 295             }
 296
 297             At2_r = 3 * At2_r - At1_r;
 298             At2_g = 3 * At2_g - At1_g;
 299             At2_b = 3 * At2_b - At1_b;
 300
 301             double xx = akku_2;
 302             double yy = akku_1;
 303             double xy = akku_0;
 304
 305             double t = xx * yy - xy * xy;
 306             if (!yy || !xx || (fabs(t) < .0000125f))
 307                 return false;
 308
 309             double frb = (3.0f * 31.0f / 255.0f) / t;
 310             double fg = frb * (63.0f / 31.0f);
 311
 312             uint oldMin = min16;
 313             uint oldMax = max16;
 314
 315             // solve.
 316             max16 = math::clamp<int>(static_cast<int>((At1_r * yy - At2_r * xy) * frb + 0.5f), 0, 31) << 11;
 317             max16 |= math::clamp<int>(static_cast<int>((At1_g * yy - At2_g * xy) * fg + 0.5f), 0, 63) << 5;
 318             max16 |= math::clamp<int>(static_cast<int>((At1_b * yy - At2_b * xy) * frb + 0.5f), 0, 31) << 0;
 319
 320             min16 = math::clamp<int>(static_cast<int>((At2_r * xx - At1_r * xy) * frb + 0.5f), 0, 31) << 11;
 321             min16 |= math::clamp<int>(static_cast<int>((At2_g * xx - At1_g * xy) * fg + 0.5f), 0, 63) << 5;
 322             min16 |= math::clamp<int>(static_cast<int>((At2_b * xx - At1_b * xy) * frb + 0.5f), 0, 31) << 0;
 323
 324             return (oldMin != min16) || (oldMax != max16);
 325         }
 326
 327         // false if all selectors equal
 328         static bool determine_selectors(uint n, const color_quad_u8 *block, uint min16, uint max16, uint8 *pSelectors)
 329         {
 330             color_quad_u8 color[4];
 331
 332             if (max16 != min16)
 333             {
 334                 eval_colors(color, min16, max16);
 335
 336                 return match_block_colors(n, block, color, pSelectors);
 337             }
 338
 339             memset(pSelectors, 0, n);
 340             return false;
 341         }
 342
 343         static uint64_t determine_error(uint n, const color_quad_u8 *block, uint min16, uint max16, uint64_t early_out_error)
 344         {
 345             color_quad_u8 color[4];
 346
 347             eval_colors(color, min16, max16);
 348
 349             int dirr = color[0].r - color[1].r;
 350             int dirg = color[0].g - color[1].g;
 351             int dirb = color[0].b - color[1].b;
 352
 353             int stops[4];
 354             for (int i = 0; i < 4; i++)
 355                 stops[i] = color[i].r * dirr + color[i].g * dirg + color[i].b * dirb;
 356
 357             // 0 2 3 1
 358             int c0Point = stops[1] + stops[3];
 359             int halfPoint = stops[3] + stops[2];
 360             int c3Point = stops[2] + stops[0];
 361
 362             c0Point >>= 1;
 363             halfPoint >>= 1;
 364             c3Point >>= 1;
 365
 366             uint64_t total_error = 0;
 367
 368             for (uint i = 0; i < n; i++)
 369             {
 370                 const color_quad_u8 &a = block[i];
 371
 372                 uint s = 0;
 373                 if (min16 != max16)
 374                 {
 375                     int dot = a.r * dirr + a.g * dirg + a.b * dirb;
 376
 377                     if (dot < halfPoint)
 378                         s = (dot < c0Point) ? 1 : 3;
 379                     else
 380                         s = (dot < c3Point) ? 2 : 0;
 381                 }
 382
 383                 const color_quad_u8 &b = color[s];
 384
 385                 int e = a[0] - b[0];
 386                 total_error += e * e;
 387
 388                 e = a[1] - b[1];
 389                 total_error += e * e;
 390
 391                 e = a[2] - b[2];
 392                 total_error += e * e;
 393
 394                 if (total_error >= early_out_error)
 395                     break;
 396             }
 397
 398             return total_error;
 399         }
 400
 401         static bool refine_endpoints(uint n, const color_quad_u8 *pBlock, uint &low16, uint &high16, uint8 *pSelectors)
 402         {
 403             bool optimized = false;
 404
 405             const int limits[3] = { 31, 63, 31 };
 406
 407             for (uint trial = 0; trial < 2; trial++)
 408             {
 409                 color_quad_u8 color[4];
 410                 eval_colors(color, low16, high16);
 411
 412                 uint64_t total_error[3] = { 0, 0, 0 };
 413
 414                 for (uint i = 0; i < n; i++)
 415                 {
 416                     const color_quad_u8 &a = pBlock[i];
 417
 418                     const uint s = pSelectors[i];
 419                     const color_quad_u8 &b = color[s];
 420
 421                     int e = a[0] - b[0];
 422                     total_error[0] += e * e;
 423
 424                     e = a[1] - b[1];
 425                     total_error[1] += e * e;
 426
 427                     e = a[2] - b[2];
 428                     total_error[2] += e * e;
 429                 }
 430
 431                 color_quad_u8 endpoints[2];
 432                 endpoints[0] = dxt1_block::unpack_color((uint16)low16, false);
 433                 endpoints[1] = dxt1_block::unpack_color((uint16)high16, false);
 434
 435                 color_quad_u8 expanded_endpoints[2];
 436                 expanded_endpoints[0] = dxt1_block::unpack_color((uint16)low16, true);
 437                 expanded_endpoints[1] = dxt1_block::unpack_color((uint16)high16, true);
 438
 439                 bool trial_optimized = false;
 440
 441                 for (uint axis = 0; axis < 3; axis++)
 442                 {
 443                     if (!total_error[axis])
 444                         continue;
 445
 446                     const sU8 *const pExpand = (axis == 1) ? ryg_dxt::Expand6 : ryg_dxt::Expand5;
 447
 448                     for (uint e = 0; e < 2; e++)
 449                     {
 450                         uint v[4];
 451                         v[e ^ 1] = expanded_endpoints[e ^ 1][axis];
 452
 453                         for (int t = -1; t <= 1; t += 2)
 454                         {
 455                             int a = endpoints[e][axis] + t;
 456                             if ((a < 0) || (a > limits[axis]))
 457                                 continue;
 458
 459                             v[e] = pExpand[a];
 460
 461                             //int delta = v[1] - v[0];
 462                             //v[2] = v[0] + mul_8bit(delta, 0x55);
 463                             //v[3] = v[0] + mul_8bit(delta, 0xAA);
 464
 465                             v[2] = (v[0] * 2 + v[1]) / 3;
 466                             v[3] = (v[0] + v[1] * 2) / 3;
 467
 468                             uint64_t axis_error = 0;
 469
 470                             for (uint i = 0; i < n; i++)
 471                             {
 472                                 const color_quad_u8 &p = pBlock[i];
 473
 474                                 int e = v[pSelectors[i]] - p[axis];
 475
 476                                 axis_error += e * e;
 477
 478                                 if (axis_error >= total_error[axis])
 479                                     break;
 480                             }
 481
 482                             if (axis_error < total_error[axis])
 483                             {
 484                                 //total_error[axis] = axis_error;
 485
 486                                 endpoints[e][axis] = (uint8)a;
 487                                 expanded_endpoints[e][axis] = (uint8)v[e];
 488
 489                                 if (e)
 490                                     high16 = dxt1_block::pack_color(endpoints[1], false);
 491                                 else
 492                                     low16 = dxt1_block::pack_color(endpoints[0], false);
 493
 494                                 determine_selectors(n, pBlock, low16, high16, pSelectors);
 495
 496                                 eval_colors(color, low16, high16);
 497
 498                                 utils::zero_object(total_error);
 499
 500                                 for (uint i = 0; i < n; i++)
 501                                 {
 502                                     const color_quad_u8 &a = pBlock[i];
 503
 504                                     const uint s = pSelectors[i];
 505                                     const color_quad_u8 &b = color[s];
 506
 507                                     int e = a[0] - b[0];
 508                                     total_error[0] += e * e;
 509
 510                                     e = a[1] - b[1];
 511                                     total_error[1] += e * e;
 512
 513                                     e = a[2] - b[2];
 514                                     total_error[2] += e * e;
 515                                 }
 516
 517                                 trial_optimized = true;
 518                             }
 519
 520                         } // t
 521
 522                     } // e
 523                 }     // axis
 524
 525                 if (!trial_optimized)
 526                     break;
 527
 528                 optimized = true;
 529
 530             } // for ( ; ; )
 531
 532             return optimized;
 533         }
 534
 535         static void refine_endpoints2(uint n, const color_quad_u8 *pBlock, uint &low16, uint &high16, uint8 *pSelectors, float axis[3])
 536         {
 537             uint64_t orig_error = determine_error(n, pBlock, low16, high16, cUINT64_MAX);
 538             if (!orig_error)
 539                 return;
 540
 541             float l = 1.0f / sqrt(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]);
 542             vec3F principle_axis(axis[0] * l, axis[1] * l, axis[2] * l);
 543
 544             const float dist_per_trial = 0.027063293f;
 545
 546             const uint cMaxProbeRange = 8;
 547             uint probe_low[cMaxProbeRange * 2 + 1];
 548             uint probe_high[cMaxProbeRange * 2 + 1];
 549
 550             int probe_range = 8;
 551             uint num_iters = 4;
 552
 553             const uint num_trials = probe_range * 2 + 1;
 554
 555             vec3F scaled_principle_axis(principle_axis * dist_per_trial);
 556             scaled_principle_axis[0] *= 31.0f;
 557             scaled_principle_axis[1] *= 63.0f;
 558             scaled_principle_axis[2] *= 31.0f;
 559             vec3F initial_ofs(scaled_principle_axis * (float)-probe_range);
 560             initial_ofs[0] += .5f;
 561             initial_ofs[1] += .5f;
 562             initial_ofs[2] += .5f;
 563
 564             uint64_t cur_error = orig_error;
 565
 566             for (uint iter = 0; iter < num_iters; iter++)
 567             {
 568                 color_quad_u8 endpoints[2];
 569
 570                 endpoints[0] = dxt1_block::unpack_color((uint16)low16, false);
 571                 endpoints[1] = dxt1_block::unpack_color((uint16)high16, false);
 572
 573                 vec3F low_color(endpoints[0][0], endpoints[0][1], endpoints[0][2]);
 574                 vec3F high_color(endpoints[1][0], endpoints[1][1], endpoints[1][2]);
 575
 576                 vec3F probe_low_color(low_color + initial_ofs);
 577                 for (uint i = 0; i < num_trials; i++)
 578                 {
 579                     int r = math::clamp((int)floor(probe_low_color[0]), 0, 31);
 580                     int g = math::clamp((int)floor(probe_low_color[1]), 0, 63);
 581                     int b = math::clamp((int)floor(probe_low_color[2]), 0, 31);
 582                     probe_low[i] = b | (g << 5U) | (r << 11U);
 583
 584                     probe_low_color += scaled_principle_axis;
 585                 }
 586
 587                 vec3F probe_high_color(high_color + initial_ofs);
 588                 for (uint i = 0; i < num_trials; i++)
 589                 {
 590                     int r = math::clamp((int)floor(probe_high_color[0]), 0, 31);
 591                     int g = math::clamp((int)floor(probe_high_color[1]), 0, 63);
 592                     int b = math::clamp((int)floor(probe_high_color[2]), 0, 31);
 593                     probe_high[i] = b | (g << 5U) | (r << 11U);
 594
 595                     probe_high_color += scaled_principle_axis;
 596                 }
 597
 598                 uint best_l = low16;
 599                 uint best_h = high16;
 600
 601                 enum
 602                 {
 603                     cMaxHash = 4
 604                 };
 605                 uint64_t hash[cMaxHash];
 606                 for (uint i = 0; i < cMaxHash; i++)
 607                     hash[i] = 0;
 608
 609                 uint c = best_l | (best_h << 16);
 610                 c = fast_hash(&c, sizeof(c));
 611                 hash[(c >> 6) & 3] = 1ULL << (c & 63);
 612
 613                 for (uint i = 0; i < num_trials; i++)
 614                 {
 615                     for (uint j = 0; j < num_trials; j++)
 616                     {
 617                         uint l = probe_low[i];
 618                         uint h = probe_high[j];
 619                         if (l < h)
 620                             utils::swap(l, h);
 621
 622                         uint c = l | (h << 16);
 623                         c = fast_hash(&c, sizeof(c));
 624                         uint64_t mask = 1ULL << (c & 63);
 625                         uint ofs = (c >> 6) & 3;
 626                         if (hash[ofs] & mask)
 627                             continue;
 628
 629                         hash[ofs] |= mask;
 630
 631                         uint64_t new_error = determine_error(n, pBlock, l, h, cur_error);
 632                         if (new_error < cur_error)
 633                         {
 634                             best_l = l;
 635                             best_h = h;
 636                             cur_error = new_error;
 637                         }
 638                     }
 639                 }
 640
 641                 bool improved = false;
 642
 643                 if ((best_l != low16) || (best_h != high16))
 644                 {
 645                     low16 = best_l;
 646                     high16 = best_h;
 647
 648                     determine_selectors(n, pBlock, low16, high16, pSelectors);
 649                     improved = true;
 650                 }
 651
 652                 if (refine_endpoints(n, pBlock, low16, high16, pSelectors))
 653                 {
 654                     improved = true;
 655
 656                     uint64_t cur_error = determine_error(n, pBlock, low16, high16, cUINT64_MAX);
 657                     if (!cur_error)
 658                         return;
 659                 }
 660
 661                 if (!improved)
 662                     break;
 663
 664             } // iter
 665
 666             //uint64_t end_error = determine_error(n, pBlock, low16, high16, UINT64_MAX);
 667             //if (end_error > orig_error) DebugBreak();
 668         }
 669
 670         static void compress_solid_block(uint n, uint ave_color[3], uint &low16, uint &high16, uint8 *pSelectors)
 671         {
 672             uint r = ave_color[0];
 673             uint g = ave_color[1];
 674             uint b = ave_color[2];
 675
 676             memset(pSelectors, 2, n);
 677
 678             low16 = (ryg_dxt::OMatch5[r][0] << 11) | (ryg_dxt::OMatch6[g][0] << 5) | ryg_dxt::OMatch5[b][0];
 679             high16 = (ryg_dxt::OMatch5[r][1] << 11) | (ryg_dxt::OMatch6[g][1] << 5) | ryg_dxt::OMatch5[b][1];
 680         }
 681
 682         void compress_color_block(uint n, const color_quad_u8 *block, uint &low16, uint &high16, uint8 *pSelectors, bool refine)
 683         {
 684             VOGL_ASSERT((n & 15) == 0);
 685
 686             uint ave_color[3];
 687             float axis[3];
 688
 689             if (!optimize_block_colors(n, block, low16, high16, ave_color, axis))
 690             {
 691                 compress_solid_block(n, ave_color, low16, high16, pSelectors);
 692             }
 693             else
 694             {
 695                 if (!determine_selectors(n, block, low16, high16, pSelectors))
 696                     compress_solid_block(n, ave_color, low16, high16, pSelectors);
 697                 else
 698                 {
 699                     if (refine_block(n, block, low16, high16, pSelectors))
 700                         determine_selectors(n, block, low16, high16, pSelectors);
 701
 702                     if (refine)
 703                         refine_endpoints2(n, block, low16, high16, pSelectors, axis);
 704                 }
 705             }
 706
 707             if (low16 < high16)
 708             {
 709                 utils::swap(low16, high16);
 710                 for (uint i = 0; i < n; i++)
 711                     pSelectors[i] ^= 1;
 712             }
 713         }
 714
 715         void compress_color_block(dxt1_block *pDXT1_block, const color_quad_u8 *pBlock, bool refine)
 716         {
 717             uint8 color_selectors[16];
 718             uint low16, high16;
 719             dxt_fast::compress_color_block(16, pBlock, low16, high16, color_selectors, refine);
 720
 721             pDXT1_block->set_low_color(static_cast<uint16>(low16));
 722             pDXT1_block->set_high_color(static_cast<uint16>(high16));
 723
 724             uint mask = 0;
 725             for (int i = 15; i >= 0; i--)
 726             {
 727                 mask <<= 2;
 728                 mask |= color_selectors[i];
 729             }
 730
 731             pDXT1_block->m_selectors[0] = (uint8)(mask & 0xFF);
 732             pDXT1_block->m_selectors[1] = (uint8)((mask >> 8) & 0xFF);
 733             pDXT1_block->m_selectors[2] = (uint8)((mask >> 16) & 0xFF);
 734             pDXT1_block->m_selectors[3] = (uint8)((mask >> 24) & 0xFF);
 735         }
 736
 737         void compress_alpha_block(uint n, const color_quad_u8 *block, uint &low8, uint &high8, uint8 *pSelectors, uint comp_index)
 738         {
 739             int min, max;
 740             min = max = block[0][comp_index];
 741
 742             for (uint i = 1; i < n; i++)
 743             {
 744                 min = math::minimum<int>(min, block[i][comp_index]);
 745                 max = math::maximum<int>(max, block[i][comp_index]);
 746             }
 747
 748             low8 = max;
 749             high8 = min;
 750
 751             int dist = max - min;
 752             int bias = min * 7 - (dist >> 1);
 753             int dist4 = dist * 4;
 754             int dist2 = dist * 2;
 755
 756             for (uint i = 0; i < n; i++)
 757             {
 758                 int a = block[i][comp_index] * 7 - bias;
 759                 int ind, t;
 760
 761                 t = (dist4 - a) >> 31;
 762                 ind = t & 4;
 763                 a -= dist4 & t;
 764                 t = (dist2 - a) >> 31;
 765                 ind += t & 2;
 766                 a -= dist2 & t;
 767                 t = (dist - a) >> 31;
 768                 ind += t & 1;
 769
 770                 ind = -ind & 7;
 771                 ind ^= (2 > ind);
 772
 773                 pSelectors[i] = static_cast<uint8>(ind);
 774             }
 775         }
 776
 777         void compress_alpha_block(dxt5_block *pDXT5_block, const color_quad_u8 *pBlock, uint comp_index)
 778         {
 779             uint8 selectors[16];
 780             uint low8, high8;
 781
 782             compress_alpha_block(16, pBlock, low8, high8, selectors, comp_index);
 783
 784             pDXT5_block->set_low_alpha(low8);
 785             pDXT5_block->set_high_alpha(high8);
 786
 787             uint mask = 0;
 788             uint bits = 0;
 789             uint8 *pDst = pDXT5_block->m_selectors;
 790
 791             for (uint i = 0; i < 16; i++)
 792             {
 793                 mask |= (selectors[i] << bits);
 794
 795                 if ((bits += 3) >= 8)
 796                 {
 797                     *pDst++ = static_cast<uint8>(mask);
 798                     mask >>= 8;
 799                     bits -= 8;
 800                 }
 801             }
 802         }
 803
 804         void find_representative_colors(uint n, const color_quad_u8 *pBlock, color_quad_u8 &lo, color_quad_u8 &hi)
 805         {
 806             uint64_t ave64[3];
 807             ave64[0] = 0;
 808             ave64[1] = 0;
 809             ave64[2] = 0;
 810
 811             for (uint i = 0; i < n; i++)
 812             {
 813                 ave64[0] += pBlock[i].r;
 814                 ave64[1] += pBlock[i].g;
 815                 ave64[2] += pBlock[i].b;
 816             }
 817
 818             uint ave[3];
 819             ave[0] = static_cast<uint>((ave64[0] + (n / 2)) / n);
 820             ave[1] = static_cast<uint>((ave64[1] + (n / 2)) / n);
 821             ave[2] = static_cast<uint>((ave64[2] + (n / 2)) / n);
 822
 823             int furthest_dist = -1;
 824             uint furthest_index = 0;
 825             for (uint i = 0; i < n; i++)
 826             {
 827                 int r = pBlock[i].r - ave[0];
 828                 int g = pBlock[i].g - ave[1];
 829                 int b = pBlock[i].b - ave[2];
 830                 int dist = r * r + g * g + b * b;
 831                 if (dist > furthest_dist)
 832                 {
 833                     furthest_dist = dist;
 834                     furthest_index = i;
 835                 }
 836             }
 837
 838             color_quad_u8 lo_color(pBlock[furthest_index]);
 839
 840             int opp_dist = -1;
 841             uint opp_index = 0;
 842             for (uint i = 0; i < n; i++)
 843             {
 844                 int r = pBlock[i].r - lo_color.r;
 845                 int g = pBlock[i].g - lo_color.g;
 846                 int b = pBlock[i].b - lo_color.b;
 847                 int dist = r * r + g * g + b * b;
 848                 if (dist > opp_dist)
 849                 {
 850                     opp_dist = dist;
 851                     opp_index = i;
 852                 }
 853             }
 854
 855             color_quad_u8 hi_color(pBlock[opp_index]);
 856
 857             for (uint i = 0; i < 3; i++)
 858             {
 859                 lo_color[i] = static_cast<uint8>((lo_color[i] + ave[i]) >> 1);
 860                 hi_color[i] = static_cast<uint8>((hi_color[i] + ave[i]) >> 1);
 861             }
 862
 863             const uint cMaxIters = 4;
 864             for (uint iter_index = 0; iter_index < cMaxIters; iter_index++)
 865             {
 866                 if ((lo_color[0] == hi_color[0]) && (lo_color[1] == hi_color[1]) && (lo_color[2] == hi_color[2]))
 867                     break;
 868
 869                 uint64_t new_color[2][3];
 870                 uint weight[2];
 871
 872                 utils::zero_object(new_color);
 873                 utils::zero_object(weight);
 874
 875                 int vec_r = hi_color[0] - lo_color[0];
 876                 int vec_g = hi_color[1] - lo_color[1];
 877                 int vec_b = hi_color[2] - lo_color[2];
 878
 879                 int lo_dot = vec_r * lo_color[0] + vec_g * lo_color[1] + vec_b * lo_color[2];
 880                 int hi_dot = vec_r * hi_color[0] + vec_g * hi_color[1] + vec_b * hi_color[2];
 881                 int mid_dot = lo_dot + hi_dot;
 882
 883                 vec_r *= 2;
 884                 vec_g *= 2;
 885                 vec_b *= 2;
 886
 887                 for (uint i = 0; i < n; i++)
 888                 {
 889                     const color_quad_u8 &c = pBlock[i];
 890
 891                     const int dot = c[0] * vec_r + c[1] * vec_g + c[2] * vec_b;
 892                     const uint match_index = (dot > mid_dot);
 893
 894                     new_color[match_index][0] += c.r;
 895                     new_color[match_index][1] += c.g;
 896                     new_color[match_index][2] += c.b;
 897                     weight[match_index]++;
 898                 }
 899
 900                 if ((!weight[0]) || (!weight[1]))
 901                     break;
 902
 903                 uint8 new_color8[2][3];
 904
 905                 for (uint j = 0; j < 2; j++)
 906                     for (uint i = 0; i < 3; i++)
 907                         new_color8[j][i] = static_cast<uint8>((new_color[j][i] + (weight[j] / 2)) / weight[j]);
 908
 909                 if ((new_color8[0][0] == lo_color[0]) && (new_color8[0][1] == lo_color[1]) && (new_color8[0][2] == lo_color[2]) &&
 910                     (new_color8[1][0] == hi_color[0]) && (new_color8[1][1] == hi_color[1]) && (new_color8[1][2] == hi_color[2]))
 911                     break;
 912
 913                 for (uint i = 0; i < 3; i++)
 914                 {
 915                     lo_color[i] = new_color8[0][i];
 916                     hi_color[i] = new_color8[1][i];
 917                 }
 918             }
 919
 920             uint energy[2] = { 0, 0 };
 921             for (uint i = 0; i < 3; i++)
 922             {
 923                 energy[0] += lo_color[i] * lo_color[i];
 924                 energy[1] += hi_color[i] * hi_color[i];
 925             }
 926
 927             if (energy[0] > energy[1])
 928                 utils::swap(lo_color, hi_color);
 929
 930             lo = lo_color;
 931             hi = hi_color;
 932         }
 933
 934     } // namespace dxt_fast
 935
 936 } // namespace vogl