git.cworth.org Git - vogl/blob - src/voglcore/vogl_dxt1.cpp

   1 /**************************************************************************
   2  *
   3  * Copyright 2013-2014 RAD Game Tools and Valve Software
   4  * Copyright 2010-2014 Rich Geldreich and Tenacious Software LLC
   5  * All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a copy
   8  * of this software and associated documentation files (the "Software"), to deal
   9  * in the Software without restriction, including without limitation the rights
  10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11  * copies of the Software, and to permit persons to whom the Software is
  12  * furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included in
  15  * all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  23  * THE SOFTWARE.
  24  *
  25  **************************************************************************/
  26
  27 // File: vogl_dxt1.cpp
  28 //
  29 // Notes:
  30 // This class is not optimized for performance on small blocks, unlike typical DXT1 compressors. It's optimized for scalability and quality:
  31 // - Very high quality in terms of avg. RMSE or Luma RMSE. Goal is to always match or beat every other known offline DXTc compressor: ATI_Compress, squish, NVidia texture tools, nvdxt.exe, etc.
  32 // - Reasonable scalability and stability with hundreds to many thousands of input colors (including inputs with many thousands of equal/nearly equal colors).
  33 // - Any quality optimization which results in even a tiny improvement is worth it -- as long as it's either a constant or linear slowdown.
  34 //   Tiny quality improvements can be extremely valuable in large clusters.
  35 // - Quality should scale well vs. CPU time cost, i.e. the more time you spend the higher the quality.
  36 #include "vogl_core.h"
  37 #include "vogl_dxt1.h"
  38 #include "vogl_ryg_dxt.hpp"
  39 #include "vogl_dxt_fast.h"
  40 #include "vogl_intersect.h"
  41 #include "vogl_vec_interval.h"
  42
  43 namespace vogl
  44 {
  45     //-----------------------------------------------------------------------------------------------------------------------------------------
  46
  47     static const int16 g_fast_probe_table[] = { 0, 1, 2, 3 };
  48     static const uint cFastProbeTableSize = sizeof(g_fast_probe_table) / sizeof(g_fast_probe_table[0]);
  49
  50     static const int16 g_normal_probe_table[] = { 0, 1, 3, 5, 7 };
  51     static const uint cNormalProbeTableSize = sizeof(g_normal_probe_table) / sizeof(g_normal_probe_table[0]);
  52
  53     static const int16 g_better_probe_table[] = { 0, 1, 2, 3, 5, 9, 15, 19, 27, 43 };
  54     static const uint cBetterProbeTableSize = sizeof(g_better_probe_table) / sizeof(g_better_probe_table[0]);
  55
  56     static const int16 g_uber_probe_table[] = { 0, 1, 2, 3, 5, 7, 9, 10, 13, 15, 19, 27, 43, 59, 91 };
  57     static const uint cUberProbeTableSize = sizeof(g_uber_probe_table) / sizeof(g_uber_probe_table[0]);
  58
  59     //-----------------------------------------------------------------------------------------------------------------------------------------
  60
  61     dxt1_endpoint_optimizer::dxt1_endpoint_optimizer()
  62         : m_pParams(NULL),
  63           m_pResults(NULL),
  64           m_pSolutions(NULL),
  65           m_perceptual(false),
  66           m_has_color_weighting(false),
  67           m_all_pixels_grayscale(false)
  68     {
  69         m_low_coords.reserve(512);
  70         m_high_coords.reserve(512);
  71
  72         m_unique_colors.reserve(512);
  73         m_temp_unique_colors.reserve(512);
  74         m_unique_packed_colors.reserve(512);
  75
  76         m_norm_unique_colors.reserve(512);
  77         m_norm_unique_colors_weighted.reserve(512);
  78
  79         m_lo_cells.reserve(128);
  80         m_hi_cells.reserve(128);
  81     }
  82
  83     void dxt1_endpoint_optimizer::clear()
  84     {
  85         m_pParams = NULL;
  86         m_pResults = NULL;
  87         m_pSolutions = NULL;
  88
  89         if (m_unique_color_hash_map.get_table_size() > 8192)
  90             m_unique_color_hash_map.clear();
  91         else
  92             m_unique_color_hash_map.reset();
  93
  94         if (m_solutions_tried.get_table_size() > 8192)
  95             m_solutions_tried.clear();
  96
  97         m_unique_colors.resize(0);
  98
  99         m_has_transparent_pixels = false;
 100         m_total_unique_color_weight = 0;
 101
 102         m_norm_unique_colors.resize(0);
 103         m_mean_norm_color.clear();
 104
 105         m_norm_unique_colors_weighted.resize(0);
 106         m_mean_norm_color_weighted.clear();
 107
 108         m_principle_axis.clear();
 109
 110         m_total_evals = 0;
 111         m_all_pixels_grayscale = false;
 112         m_has_color_weighting = false;
 113         m_perceptual = false;
 114     }
 115
 116     bool dxt1_endpoint_optimizer::handle_all_transparent_block()
 117     {
 118         m_pResults->m_low_color = 0;
 119         m_pResults->m_high_color = 0;
 120         m_pResults->m_alpha_block = true;
 121
 122         memset(m_pResults->m_pSelectors, 3, m_pParams->m_num_pixels);
 123
 124         return true;
 125     }
 126
 127     // All selectors are equal. Try compressing as if it was solid, using the block's average color, using ryg's optimal single color compression tables.
 128     bool dxt1_endpoint_optimizer::try_average_block_as_solid()
 129     {
 130         uint64_t tot_r = 0;
 131         uint64_t tot_g = 0;
 132         uint64_t tot_b = 0;
 133
 134         uint total_weight = 0;
 135         for (uint i = 0; i < m_unique_colors.size(); i++)
 136         {
 137             uint weight = m_unique_colors[i].m_weight;
 138             total_weight += weight;
 139
 140             tot_r += m_unique_colors[i].m_color.r * weight;
 141             tot_g += m_unique_colors[i].m_color.g * weight;
 142             tot_b += m_unique_colors[i].m_color.b * weight;
 143         }
 144
 145         const uint half_total_weight = total_weight >> 1;
 146         uint ave_r = static_cast<uint>((tot_r + half_total_weight) / total_weight);
 147         uint ave_g = static_cast<uint>((tot_g + half_total_weight) / total_weight);
 148         uint ave_b = static_cast<uint>((tot_b + half_total_weight) / total_weight);
 149
 150         uint low_color = (ryg_dxt::OMatch5[ave_r][0] << 11) | (ryg_dxt::OMatch6[ave_g][0] << 5) | ryg_dxt::OMatch5[ave_b][0];
 151         uint high_color = (ryg_dxt::OMatch5[ave_r][1] << 11) | (ryg_dxt::OMatch6[ave_g][1] << 5) | ryg_dxt::OMatch5[ave_b][1];
 152         bool improved = evaluate_solution(dxt1_solution_coordinates((uint16)low_color, (uint16)high_color), true, &m_best_solution);
 153
 154         if ((m_pParams->m_use_alpha_blocks) && (m_best_solution.m_error))
 155         {
 156             low_color = (ryg_dxt::OMatch5_3[ave_r][0] << 11) | (ryg_dxt::OMatch6_3[ave_g][0] << 5) | ryg_dxt::OMatch5_3[ave_b][0];
 157             high_color = (ryg_dxt::OMatch5_3[ave_r][1] << 11) | (ryg_dxt::OMatch6_3[ave_g][1] << 5) | ryg_dxt::OMatch5_3[ave_b][1];
 158             improved |= evaluate_solution(dxt1_solution_coordinates((uint16)low_color, (uint16)high_color), true, &m_best_solution);
 159         }
 160
 161         if (m_pParams->m_quality == cCRNDXTQualityUber)
 162         {
 163             // Try compressing as all-solid using the other (non-average) colors in the block in uber.
 164             for (uint i = 0; i < m_unique_colors.size(); i++)
 165             {
 166                 uint r = m_unique_colors[i].m_color[0];
 167                 uint g = m_unique_colors[i].m_color[1];
 168                 uint b = m_unique_colors[i].m_color[2];
 169                 if ((r == ave_r) && (g == ave_g) && (b == ave_b))
 170                     continue;
 171
 172                 uint low_color = (ryg_dxt::OMatch5[r][0] << 11) | (ryg_dxt::OMatch6[g][0] << 5) | ryg_dxt::OMatch5[b][0];
 173                 uint high_color = (ryg_dxt::OMatch5[r][1] << 11) | (ryg_dxt::OMatch6[g][1] << 5) | ryg_dxt::OMatch5[b][1];
 174                 improved |= evaluate_solution(dxt1_solution_coordinates((uint16)low_color, (uint16)high_color), true, &m_best_solution);
 175
 176                 if ((m_pParams->m_use_alpha_blocks) && (m_best_solution.m_error))
 177                 {
 178                     low_color = (ryg_dxt::OMatch5_3[r][0] << 11) | (ryg_dxt::OMatch6_3[g][0] << 5) | ryg_dxt::OMatch5_3[b][0];
 179                     high_color = (ryg_dxt::OMatch5_3[r][1] << 11) | (ryg_dxt::OMatch6_3[g][1] << 5) | ryg_dxt::OMatch5_3[b][1];
 180                     improved |= evaluate_solution(dxt1_solution_coordinates((uint16)low_color, (uint16)high_color), true, &m_best_solution);
 181                 }
 182             }
 183         }
 184
 185         return improved;
 186     }
 187
 188     // Block is solid, trying using ryg's optimal single color tables.
 189     bool dxt1_endpoint_optimizer::handle_solid_block()
 190     {
 191         int r = m_unique_colors[0].m_color.r;
 192         int g = m_unique_colors[0].m_color.g;
 193         int b = m_unique_colors[0].m_color.b;
 194
 195         //uint packed_color = dxt1_block::pack_color(r, g, b, true);
 196         //evaluate_solution(dxt1_solution_coordinates((uint16)packed_color, (uint16)packed_color), false, &m_best_solution);
 197
 198         uint low_color = (ryg_dxt::OMatch5[r][0] << 11) | (ryg_dxt::OMatch6[g][0] << 5) | ryg_dxt::OMatch5[b][0];
 199         uint high_color = (ryg_dxt::OMatch5[r][1] << 11) | (ryg_dxt::OMatch6[g][1] << 5) | ryg_dxt::OMatch5[b][1];
 200         evaluate_solution(dxt1_solution_coordinates((uint16)low_color, (uint16)high_color), false, &m_best_solution);
 201
 202         if ((m_pParams->m_use_alpha_blocks) && (m_best_solution.m_error))
 203         {
 204             low_color = (ryg_dxt::OMatch5_3[r][0] << 11) | (ryg_dxt::OMatch6_3[g][0] << 5) | ryg_dxt::OMatch5_3[b][0];
 205             high_color = (ryg_dxt::OMatch5_3[r][1] << 11) | (ryg_dxt::OMatch6_3[g][1] << 5) | ryg_dxt::OMatch5_3[b][1];
 206             evaluate_solution(dxt1_solution_coordinates((uint16)low_color, (uint16)high_color), true, &m_best_solution);
 207         }
 208
 209         return_solution(*m_pResults, m_best_solution);
 210
 211         return true;
 212     }
 213
 214     void dxt1_endpoint_optimizer::compute_vectors(const vec3F &perceptual_weights)
 215     {
 216         m_norm_unique_colors.resize(0);
 217         m_norm_unique_colors_weighted.resize(0);
 218
 219         m_mean_norm_color.clear();
 220         m_mean_norm_color_weighted.clear();
 221
 222         for (uint i = 0; i < m_unique_colors.size(); i++)
 223         {
 224             const color_quad_u8 &color = m_unique_colors[i].m_color;
 225             const uint weight = m_unique_colors[i].m_weight;
 226
 227             vec3F norm_color(color.r * 1.0f / 255.0f, color.g * 1.0f / 255.0f, color.b * 1.0f / 255.0f);
 228             vec3F norm_color_weighted(vec3F::mul_components(perceptual_weights, norm_color));
 229
 230             m_norm_unique_colors.push_back(norm_color);
 231             m_norm_unique_colors_weighted.push_back(norm_color_weighted);
 232
 233             m_mean_norm_color += norm_color * (float)weight;
 234             m_mean_norm_color_weighted += norm_color_weighted * (float)weight;
 235         }
 236
 237         if (m_total_unique_color_weight)
 238         {
 239             m_mean_norm_color *= (1.0f / m_total_unique_color_weight);
 240             m_mean_norm_color_weighted *= (1.0f / m_total_unique_color_weight);
 241         }
 242
 243         for (uint i = 0; i < m_unique_colors.size(); i++)
 244         {
 245             m_norm_unique_colors[i] -= m_mean_norm_color;
 246             m_norm_unique_colors_weighted[i] -= m_mean_norm_color_weighted;
 247         }
 248     }
 249
 250     // Compute PCA (principle axis, i.e. direction of largest variance) of input vectors.
 251     void dxt1_endpoint_optimizer::compute_pca(vec3F &axis, const vec3F_array &norm_colors, const vec3F &def)
 252     {
 253 #if 0
 254         axis.clear();
 255
 256         VOGL_ASSERT(m_unique_colors.size() == norm_colors.size());
 257
 258         // Incremental PCA
 259         bool first = true;
 260         for (uint i = 0; i < norm_colors.size(); i++)
 261         {
 262                 const uint weight = m_unique_colors[i].m_weight;
 263
 264                 for (uint j = 0; j < weight; j++)
 265                 {
 266                         vec3F x(norm_colors[i] * norm_colors[i][0]);
 267                         vec3F y(norm_colors[i] * norm_colors[i][1]);
 268                         vec3F z(norm_colors[i] * norm_colors[i][2]);
 269
 270                         vec3F v(first ? norm_colors[0] : axis);
 271                         first = false;
 272
 273                         v.normalize(&def);
 274
 275                         axis[0] += (x * v);
 276                         axis[1] += (y * v);
 277                         axis[2] += (z * v);
 278                 }
 279         }
 280
 281         axis.normalize(&def);
 282 #else
 283         double cov[6] = { 0, 0, 0, 0, 0, 0 };
 284
 285         //vec3F lo(math::cNearlyInfinite);
 286         //vec3F hi(-math::cNearlyInfinite);
 287
 288         for (uint i = 0; i < norm_colors.size(); i++)
 289         {
 290             const vec3F &v = norm_colors[i];
 291
 292             //if (v[0] < lo[0]) lo[0] = v[0];
 293             //if (v[1] < lo[1]) lo[1] = v[1];
 294             //if (v[2] < lo[2]) lo[2] = v[2];
 295             //if (v[0] > hi[0]) hi[0] = v[0];
 296             //if (v[1] > hi[1]) hi[1] = v[1];
 297             //if (v[2] > hi[2]) hi[2] = v[2];
 298
 299             float r = v[0];
 300             float g = v[1];
 301             float b = v[2];
 302
 303             if (m_unique_colors[i].m_weight > 1)
 304             {
 305                 const double weight = m_unique_colors[i].m_weight;
 306
 307                 cov[0] += r * r * weight;
 308                 cov[1] += r * g * weight;
 309                 cov[2] += r * b * weight;
 310                 cov[3] += g * g * weight;
 311                 cov[4] += g * b * weight;
 312                 cov[5] += b * b * weight;
 313             }
 314             else
 315             {
 316                 cov[0] += r * r;
 317                 cov[1] += r * g;
 318                 cov[2] += r * b;
 319                 cov[3] += g * g;
 320                 cov[4] += g * b;
 321                 cov[5] += b * b;
 322             }
 323         }
 324
 325         double vfr, vfg, vfb;
 326         //vfr = hi[0] - lo[0];
 327         //vfg = hi[1] - lo[1];
 328         //vfb = hi[2] - lo[2];
 329         // This is more stable.
 330         vfr = .9f;
 331         vfg = 1.0f;
 332         vfb = .7f;
 333
 334         const uint cNumIters = 8;
 335
 336         for (uint iter = 0; iter < cNumIters; iter++)
 337         {
 338             double r = vfr * cov[0] + vfg * cov[1] + vfb * cov[2];
 339             double g = vfr * cov[1] + vfg * cov[3] + vfb * cov[4];
 340             double b = vfr * cov[2] + vfg * cov[4] + vfb * cov[5];
 341
 342             double m = math::maximum(fabs(r), fabs(g), fabs(b));
 343             if (m > 1e-10)
 344             {
 345                 m = 1.0f / m;
 346                 r *= m;
 347                 g *= m;
 348                 b *= m;
 349             }
 350
 351             double delta = math::square(vfr - r) + math::square(vfg - g) + math::square(vfb - b);
 352
 353             vfr = r;
 354             vfg = g;
 355             vfb = b;
 356
 357             if ((iter > 2) && (delta < 1e-8))
 358                 break;
 359         }
 360
 361         double len = vfr * vfr + vfg * vfg + vfb * vfb;
 362
 363         if (len < 1e-10)
 364         {
 365             axis = def;
 366         }
 367         else
 368         {
 369             len = 1.0f / sqrt(len);
 370             vfr *= len;
 371             vfg *= len;
 372             vfb *= len;
 373
 374             axis.set(static_cast<float>(vfr), static_cast<float>(vfg), static_cast<float>(vfb));
 375         }
 376 #endif
 377     }
 378
 379     static const uint8 g_invTableNull[4] = { 0, 1, 2, 3 };
 380     static const uint8 g_invTableAlpha[4] = { 1, 0, 2, 3 };
 381     static const uint8 g_invTableColor[4] = { 1, 0, 3, 2 };
 382
 383     // Computes a valid (encodable) DXT1 solution (low/high colors, swizzled selectors) from input.
 384     void dxt1_endpoint_optimizer::return_solution(results &res, const potential_solution &solution)
 385     {
 386         bool invert_selectors;
 387
 388         if (solution.m_alpha_block)
 389             invert_selectors = (solution.m_coords.m_low_color > solution.m_coords.m_high_color);
 390         else
 391         {
 392             VOGL_ASSERT(solution.m_coords.m_low_color != solution.m_coords.m_high_color);
 393
 394             invert_selectors = (solution.m_coords.m_low_color < solution.m_coords.m_high_color);
 395         }
 396
 397         if (invert_selectors)
 398         {
 399             res.m_low_color = solution.m_coords.m_high_color;
 400             res.m_high_color = solution.m_coords.m_low_color;
 401         }
 402         else
 403         {
 404             res.m_low_color = solution.m_coords.m_low_color;
 405             res.m_high_color = solution.m_coords.m_high_color;
 406         }
 407
 408         const uint8 *pInvert_table = g_invTableNull;
 409         if (invert_selectors)
 410             pInvert_table = solution.m_alpha_block ? g_invTableAlpha : g_invTableColor;
 411
 412         const uint alpha_thresh = m_pParams->m_pixels_have_alpha ? (m_pParams->m_dxt1a_alpha_threshold << 24U) : 0;
 413
 414         const uint32 *pSrc_pixels = reinterpret_cast<const uint32 *>(m_pParams->m_pPixels);
 415         uint8 *pDst_selectors = res.m_pSelectors;
 416
 417         if ((m_unique_colors.size() == 1) && (!m_pParams->m_pixels_have_alpha))
 418         {
 419             uint32 c = utils::read_le32(pSrc_pixels);
 420
 421             VOGL_ASSERT(c >= alpha_thresh);
 422
 423             c |= 0xFF000000U;
 424
 425             unique_color_hash_map::const_iterator it(m_unique_color_hash_map.find(c));
 426             VOGL_ASSERT(it != m_unique_color_hash_map.end());
 427
 428             uint unique_color_index = it->second;
 429
 430             uint selector = pInvert_table[solution.m_selectors[unique_color_index]];
 431
 432             memset(pDst_selectors, selector, m_pParams->m_num_pixels);
 433         }
 434         else
 435         {
 436             uint8 *pDst_selectors_end = pDst_selectors + m_pParams->m_num_pixels;
 437
 438             uint8 prev_selector = 0;
 439             uint32 prev_color = 0;
 440
 441             do
 442             {
 443                 uint32 c = utils::read_le32(pSrc_pixels);
 444                 pSrc_pixels++;
 445
 446                 uint8 selector = 3;
 447
 448                 if (c >= alpha_thresh)
 449                 {
 450                     c |= 0xFF000000U;
 451
 452                     if (c == prev_color)
 453                         selector = prev_selector;
 454                     else
 455                     {
 456                         unique_color_hash_map::const_iterator it(m_unique_color_hash_map.find(c));
 457
 458                         VOGL_ASSERT(it != m_unique_color_hash_map.end());
 459
 460                         uint unique_color_index = it->second;
 461
 462                         selector = pInvert_table[solution.m_selectors[unique_color_index]];
 463
 464                         prev_color = c;
 465                         prev_selector = selector;
 466                     }
 467                 }
 468
 469                 *pDst_selectors++ = selector;
 470
 471             } while (pDst_selectors != pDst_selectors_end);
 472         }
 473
 474         res.m_alpha_block = solution.m_alpha_block;
 475         res.m_error = solution.m_error;
 476     }
 477
 478     inline vec3F dxt1_endpoint_optimizer::unpack_to_vec3F(uint16 packed_color)
 479     {
 480         color_quad_u8 c(dxt1_block::unpack_color(packed_color, false));
 481
 482         return vec3F(c.r * 1.0f / 31.0f, c.g * 1.0f / 63.0f, c.b * 1.0f / 31.0f);
 483     }
 484
 485     inline vec3F dxt1_endpoint_optimizer::unpack_to_vec3F_raw(uint16 packed_color)
 486     {
 487         color_quad_u8 c(dxt1_block::unpack_color(packed_color, false));
 488
 489         return vec3F(c.r, c.g, c.b);
 490     }
 491
 492     // Per-component 1D endpoint optimization.
 493     void dxt1_endpoint_optimizer::optimize_endpoint_comps()
 494     {
 495         if ((m_best_solution.m_alpha_block) || (!m_best_solution.m_error))
 496             return;
 497
 498         //color_quad_u8 orig_l(dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, false));
 499         //color_quad_u8 orig_h(dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, false));
 500         //uint orig_error = m_best_solution.m_error;
 501
 502         color_quad_u8 orig_l_scaled(dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, true));
 503         color_quad_u8 orig_h_scaled(dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, true));
 504
 505         color_quad_u8 min_color(0xFF, 0xFF, 0xFF, 0xFF);
 506         color_quad_u8 max_color(0, 0, 0, 0);
 507         for (uint i = 0; i < m_unique_colors.size(); i++)
 508         {
 509             min_color = color_quad_u8::component_min(min_color, m_unique_colors[i].m_color);
 510             max_color = color_quad_u8::component_max(max_color, m_unique_colors[i].m_color);
 511         }
 512
 513         // Try to separately optimize each component. This is a 1D problem so it's easy to compute accurate per-component error bounds.
 514         for (uint comp_index = 0; comp_index < 3; comp_index++)
 515         {
 516             uint ll[4];
 517             ll[0] = orig_l_scaled[comp_index];
 518             ll[1] = orig_h_scaled[comp_index];
 519             ll[2] = (ll[0] * 2 + ll[1]) / 3;
 520             ll[3] = (ll[0] + ll[1] * 2) / 3;
 521
 522             uint error_to_beat = 0;
 523             uint min_color_weight = 0;
 524             uint max_color_weight = 0;
 525             for (uint i = 0; i < m_unique_colors.size(); i++)
 526             {
 527                 uint c = m_unique_colors[i].m_color[comp_index];
 528                 uint w = m_unique_colors[i].m_weight;
 529
 530                 int delta = ll[m_best_solution.m_selectors[i]] - c;
 531                 error_to_beat += (int)w * (delta * delta);
 532
 533                 if (c == min_color[comp_index])
 534                     min_color_weight += w;
 535                 if (c == max_color[comp_index])
 536                     max_color_weight += w;
 537             }
 538
 539             if (!error_to_beat)
 540                 continue;
 541
 542             VOGL_ASSERT((min_color_weight > 0) && (max_color_weight > 0));
 543             const uint error_to_beat_div_min_color_weight = min_color_weight ? ((error_to_beat + min_color_weight - 1) / min_color_weight) : 0;
 544             const uint error_to_beat_div_max_color_weight = max_color_weight ? ((error_to_beat + max_color_weight - 1) / max_color_weight) : 0;
 545
 546             const uint m = (comp_index == 1) ? 63 : 31;
 547             const uint m_shift = (comp_index == 1) ? 3 : 2;
 548
 549             for (uint o = 0; o <= m; o++)
 550             {
 551                 uint tl[4];
 552
 553                 tl[0] = (comp_index == 1) ? ((o << 2) | (o >> 4)) : ((o << 3) | (o >> 2));
 554
 555                 for (uint h = 0; h < 8; h++)
 556                 {
 557                     const uint pl = h << m_shift;
 558                     const uint ph = ((h + 1) << m_shift) - 1;
 559
 560                     uint tl_l = (comp_index == 1) ? ((pl << 2) | (pl >> 4)) : ((pl << 3) | (pl >> 2));
 561                     uint tl_h = (comp_index == 1) ? ((ph << 2) | (ph >> 4)) : ((ph << 3) | (ph >> 2));
 562
 563                     tl_l = math::minimum(tl_l, tl[0]);
 564                     tl_h = math::maximum(tl_h, tl[0]);
 565
 566                     uint c_l = min_color[comp_index];
 567                     uint c_h = max_color[comp_index];
 568
 569                     if (c_h < tl_l)
 570                     {
 571                         uint min_possible_error = math::square<int>(tl_l - c_l);
 572                         if (min_possible_error > error_to_beat_div_min_color_weight)
 573                             continue;
 574                     }
 575                     else if (c_l > tl_h)
 576                     {
 577                         uint min_possible_error = math::square<int>(c_h - tl_h);
 578                         if (min_possible_error > error_to_beat_div_max_color_weight)
 579                             continue;
 580                     }
 581
 582                     for (uint p = pl; p <= ph; p++)
 583                     {
 584                         tl[1] = (comp_index == 1) ? ((p << 2) | (p >> 4)) : ((p << 3) | (p >> 2));
 585
 586                         tl[2] = (tl[0] * 2 + tl[1]) / 3;
 587                         tl[3] = (tl[0] + tl[1] * 2) / 3;
 588
 589                         uint trial_error = 0;
 590                         for (uint i = 0; i < m_unique_colors.size(); i++)
 591                         {
 592                             int delta = tl[m_best_solution.m_selectors[i]] - m_unique_colors[i].m_color[comp_index];
 593                             trial_error += m_unique_colors[i].m_weight * (delta * delta);
 594                             if (trial_error >= error_to_beat)
 595                                 break;
 596                         }
 597
 598                         //VOGL_ASSERT(trial_error >= min_possible_error);
 599
 600                         if (trial_error < error_to_beat)
 601                         {
 602                             color_quad_u8 l(dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, false));
 603                             color_quad_u8 h(dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, false));
 604                             l[comp_index] = static_cast<uint8>(o);
 605                             h[comp_index] = static_cast<uint8>(p);
 606
 607                             bool better = evaluate_solution(
 608                                 dxt1_solution_coordinates(dxt1_block::pack_color(l, false), dxt1_block::pack_color(h, false)),
 609                                 true, &m_best_solution);
 610                             VOGL_NOTE_UNUSED(better);
 611
 612                             if (better)
 613                             {
 614 #if 0
 615                                                         printf("comp: %u, orig: %u %u, new: %u %u, orig_error: %u, new_error: %u\n", comp_index,
 616                                                                orig_l[comp_index], orig_h[comp_index],
 617                                                                l[comp_index], h[comp_index],
 618                                                                orig_error, m_best_solution.m_error);
 619 #endif
 620                                 if (!m_best_solution.m_error)
 621                                     return;
 622
 623                                 error_to_beat = 0;
 624                                 for (uint i = 0; i < m_unique_colors.size(); i++)
 625                                 {
 626                                     int delta = tl[m_best_solution.m_selectors[i]] - m_unique_colors[i].m_color[comp_index];
 627                                     error_to_beat += m_unique_colors[i].m_weight * (delta * delta);
 628                                 }
 629
 630                             } // better
 631
 632                             //goto early_out;
 633                         } // if (trial_error < error_to_beat)
 634
 635                     } // for (uint p = 0; p <= m; p++)
 636                 }
 637
 638             } // for (uint o = 0; o <= m; o++)
 639
 640         } // comp_index
 641     }
 642
 643     // Voxel adjacency delta coordinations.
 644     static const struct adjacent_coords
 645     {
 646         int8 x, y, z;
 647     } g_adjacency[26] =
 648           {
 649               { -1, -1, -1 },
 650               { 0, -1, -1 },
 651               { 1, -1, -1 },
 652               { -1, 0, -1 },
 653               { 0, 0, -1 },
 654               { 1, 0, -1 },
 655               { -1, 1, -1 },
 656               { 0, 1, -1 },
 657               { 1, 1, -1 },
 658               { -1, -1, 0 },
 659               { 0, -1, 0 },
 660               { 1, -1, 0 },
 661               { -1, 0, 0 },
 662               { 1, 0, 0 },
 663               { -1, 1, 0 },
 664               { 0, 1, 0 },
 665               { 1, 1, 0 },
 666               { -1, -1, 1 },
 667               { 0, -1, 1 },
 668               { 1, -1, 1 },
 669               { -1, 0, 1 },
 670               { 0, 0, 1 },
 671               { 1, 0, 1 },
 672               { -1, 1, 1 },
 673               { 0, 1, 1 },
 674               { 1, 1, 1 }
 675           };
 676
 677     // Attempt to refine current solution's endpoints given the current selectors using least squares.
 678     bool dxt1_endpoint_optimizer::refine_solution(int refinement_level)
 679     {
 680         VOGL_ASSERT(m_best_solution.m_valid);
 681
 682         static const int w1Tab[4] = { 3, 0, 2, 1 };
 683
 684         static const int prods_0[4] = { 0x00, 0x00, 0x02, 0x02 };
 685         static const int prods_1[4] = { 0x00, 0x09, 0x01, 0x04 };
 686         static const int prods_2[4] = { 0x09, 0x00, 0x04, 0x01 };
 687
 688         double akku_0 = 0;
 689         double akku_1 = 0;
 690         double akku_2 = 0;
 691         double At1_r, At1_g, At1_b;
 692         double At2_r, At2_g, At2_b;
 693
 694         At1_r = At1_g = At1_b = 0;
 695         At2_r = At2_g = At2_b = 0;
 696         for (uint i = 0; i < m_unique_colors.size(); i++)
 697         {
 698             const color_quad_u8 &c = m_unique_colors[i].m_color;
 699             const double weight = m_unique_colors[i].m_weight;
 700
 701             double r = c.r * weight;
 702             double g = c.g * weight;
 703             double b = c.b * weight;
 704             int step = m_best_solution.m_selectors[i] ^ 1;
 705
 706             int w1 = w1Tab[step];
 707
 708             akku_0 += prods_0[step] * weight;
 709             akku_1 += prods_1[step] * weight;
 710             akku_2 += prods_2[step] * weight;
 711             At1_r += w1 * r;
 712             At1_g += w1 * g;
 713             At1_b += w1 * b;
 714             At2_r += r;
 715             At2_g += g;
 716             At2_b += b;
 717         }
 718
 719         At2_r = 3 * At2_r - At1_r;
 720         At2_g = 3 * At2_g - At1_g;
 721         At2_b = 3 * At2_b - At1_b;
 722
 723         double xx = akku_2;
 724         double yy = akku_1;
 725         double xy = akku_0;
 726
 727         double t = xx * yy - xy * xy;
 728         if (!yy || !xx || (fabs(t) < .0000125f))
 729             return false;
 730
 731         double frb = (3.0f * 31.0f / 255.0f) / t;
 732         double fg = frb * (63.0f / 31.0f);
 733
 734         bool improved = false;
 735
 736         if (refinement_level == 0)
 737         {
 738             uint max16;
 739             max16 = math::clamp<int>(static_cast<int>((At1_r * yy - At2_r * xy) * frb + 0.5f), 0, 31) << 11;
 740             max16 |= math::clamp<int>(static_cast<int>((At1_g * yy - At2_g * xy) * fg + 0.5f), 0, 63) << 5;
 741             max16 |= math::clamp<int>(static_cast<int>((At1_b * yy - At2_b * xy) * frb + 0.5f), 0, 31) << 0;
 742
 743             uint min16;
 744             min16 = math::clamp<int>(static_cast<int>((At2_r * xx - At1_r * xy) * frb + 0.5f), 0, 31) << 11;
 745             min16 |= math::clamp<int>(static_cast<int>((At2_g * xx - At1_g * xy) * fg + 0.5f), 0, 63) << 5;
 746             min16 |= math::clamp<int>(static_cast<int>((At2_b * xx - At1_b * xy) * frb + 0.5f), 0, 31) << 0;
 747
 748             dxt1_solution_coordinates nc((uint16)min16, (uint16)max16);
 749             nc.canonicalize();
 750             improved |= evaluate_solution(nc, true, &m_best_solution, false);
 751         }
 752         else if (refinement_level == 1)
 753         {
 754             // Try exploring the local lattice neighbors of the least squares optimized result.
 755             color_quad_u8 e[2];
 756
 757             e[0].clear();
 758             e[0][0] = (uint8)math::clamp<int>(static_cast<int>((At1_r * yy - At2_r * xy) * frb + 0.5f), 0, 31);
 759             e[0][1] = (uint8)math::clamp<int>(static_cast<int>((At1_g * yy - At2_g * xy) * fg + 0.5f), 0, 63);
 760             e[0][2] = (uint8)math::clamp<int>(static_cast<int>((At1_b * yy - At2_b * xy) * frb + 0.5f), 0, 31);
 761
 762             e[1].clear();
 763             e[1][0] = (uint8)math::clamp<int>(static_cast<int>((At2_r * xx - At1_r * xy) * frb + 0.5f), 0, 31);
 764             e[1][1] = (uint8)math::clamp<int>(static_cast<int>((At2_g * xx - At1_g * xy) * fg + 0.5f), 0, 63);
 765             e[1][2] = (uint8)math::clamp<int>(static_cast<int>((At2_b * xx - At1_b * xy) * frb + 0.5f), 0, 31);
 766
 767             for (uint i = 0; i < 2; i++)
 768             {
 769                 for (int rr = -1; rr <= 1; rr++)
 770                 {
 771                     for (int gr = -1; gr <= 1; gr++)
 772                     {
 773                         for (int br = -1; br <= 1; br++)
 774                         {
 775                             dxt1_solution_coordinates nc;
 776
 777                             color_quad_u8 c[2];
 778                             c[0] = e[0];
 779                             c[1] = e[1];
 780
 781                             c[i][0] = (uint8)math::clamp<int>(c[i][0] + rr, 0, 31);
 782                             c[i][1] = (uint8)math::clamp<int>(c[i][1] + gr, 0, 63);
 783                             c[i][2] = (uint8)math::clamp<int>(c[i][2] + br, 0, 31);
 784
 785                             nc.m_low_color = dxt1_block::pack_color(c[0], false);
 786                             nc.m_high_color = dxt1_block::pack_color(c[1], false);
 787
 788                             nc.canonicalize();
 789
 790                             if ((nc.m_low_color != m_best_solution.m_coords.m_low_color) || (nc.m_high_color != m_best_solution.m_coords.m_high_color))
 791                             {
 792                                 improved |= evaluate_solution(nc, true, &m_best_solution, false);
 793                             }
 794                         }
 795                     }
 796                 }
 797             }
 798         }
 799         else
 800         {
 801             // Try even harder to explore the local lattice neighbors of the least squares optimized result.
 802             color_quad_u8 e[2];
 803             e[0].clear();
 804             e[0][0] = (uint8)math::clamp<int>(static_cast<int>((At1_r * yy - At2_r * xy) * frb + 0.5f), 0, 31);
 805             e[0][1] = (uint8)math::clamp<int>(static_cast<int>((At1_g * yy - At2_g * xy) * fg + 0.5f), 0, 63);
 806             e[0][2] = (uint8)math::clamp<int>(static_cast<int>((At1_b * yy - At2_b * xy) * frb + 0.5f), 0, 31);
 807
 808             e[1].clear();
 809             e[1][0] = (uint8)math::clamp<int>(static_cast<int>((At2_r * xx - At1_r * xy) * frb + 0.5f), 0, 31);
 810             e[1][1] = (uint8)math::clamp<int>(static_cast<int>((At2_g * xx - At1_g * xy) * fg + 0.5f), 0, 63);
 811             e[1][2] = (uint8)math::clamp<int>(static_cast<int>((At2_b * xx - At1_b * xy) * frb + 0.5f), 0, 31);
 812
 813             for (int orr = -1; orr <= 1; orr++)
 814             {
 815                 for (int ogr = -1; ogr <= 1; ogr++)
 816                 {
 817                     for (int obr = -1; obr <= 1; obr++)
 818                     {
 819                         dxt1_solution_coordinates nc;
 820
 821                         color_quad_u8 c[2];
 822                         c[0] = e[0];
 823                         c[1] = e[1];
 824
 825                         c[0][0] = (uint8)math::clamp<int>(c[0][0] + orr, 0, 31);
 826                         c[0][1] = (uint8)math::clamp<int>(c[0][1] + ogr, 0, 63);
 827                         c[0][2] = (uint8)math::clamp<int>(c[0][2] + obr, 0, 31);
 828
 829                         for (int rr = -1; rr <= 1; rr++)
 830                         {
 831                             for (int gr = -1; gr <= 1; gr++)
 832                             {
 833                                 for (int br = -1; br <= 1; br++)
 834                                 {
 835                                     c[1][0] = (uint8)math::clamp<int>(c[1][0] + rr, 0, 31);
 836                                     c[1][1] = (uint8)math::clamp<int>(c[1][1] + gr, 0, 63);
 837                                     c[1][2] = (uint8)math::clamp<int>(c[1][2] + br, 0, 31);
 838
 839                                     nc.m_low_color = dxt1_block::pack_color(c[0], false);
 840                                     nc.m_high_color = dxt1_block::pack_color(c[1], false);
 841                                     nc.canonicalize();
 842
 843                                     improved |= evaluate_solution(nc, true, &m_best_solution, false);
 844                                 }
 845                             }
 846                         }
 847                     }
 848                 }
 849             }
 850         }
 851
 852         return improved;
 853     }
 854
 855     //-----------------------------------------------------------------------------------------------------------------------------------------
 856
 857     // Primary endpoint optimization entrypoint.
 858     bool dxt1_endpoint_optimizer::optimize_endpoints(vec3F &low_color, vec3F &high_color)
 859     {
 860         vec3F orig_low_color(low_color);
 861         vec3F orig_high_color(high_color);
 862
 863         m_trial_solution.clear();
 864
 865         uint num_passes;
 866         const int16 *pProbe_table = g_uber_probe_table;
 867         uint probe_range;
 868         float dist_per_trial = .015625f;
 869
 870         // How many probes, and the distance between each probe depends on the quality level.
 871         switch (m_pParams->m_quality)
 872         {
 873             case cCRNDXTQualitySuperFast:
 874                 pProbe_table = g_fast_probe_table;
 875                 probe_range = cFastProbeTableSize;
 876                 dist_per_trial = .027063293f;
 877                 num_passes = 1;
 878                 break;
 879             case cCRNDXTQualityFast:
 880                 pProbe_table = g_fast_probe_table;
 881                 probe_range = cFastProbeTableSize;
 882                 dist_per_trial = .027063293f;
 883                 num_passes = 2;
 884                 break;
 885             case cCRNDXTQualityNormal:
 886                 pProbe_table = g_normal_probe_table;
 887                 probe_range = cNormalProbeTableSize;
 888                 dist_per_trial = .027063293f;
 889                 num_passes = 2;
 890                 break;
 891             case cCRNDXTQualityBetter:
 892                 pProbe_table = g_better_probe_table;
 893                 probe_range = cBetterProbeTableSize;
 894                 num_passes = 2;
 895                 break;
 896             default:
 897                 pProbe_table = g_uber_probe_table;
 898                 probe_range = cUberProbeTableSize;
 899                 num_passes = 4;
 900                 break;
 901         }
 902
 903         m_solutions_tried.reset();
 904
 905         if (m_pParams->m_endpoint_caching)
 906         {
 907             // Try the previous X winning endpoints. This may not give us optimal results, but it may increase the probability of early outs while evaluating potential solutions.
 908             const uint num_prev_results = math::minimum<uint>(cMaxPrevResults, m_num_prev_results);
 909             for (uint i = 0; i < num_prev_results; i++)
 910             {
 911                 const dxt1_solution_coordinates &coords = m_prev_results[i];
 912
 913                 solution_hash_map::insert_result solution_res(m_solutions_tried.insert(coords.m_low_color | (coords.m_high_color << 16U)));
 914                 if (!solution_res.second)
 915                     continue;
 916
 917                 evaluate_solution(coords, true, &m_best_solution);
 918             }
 919
 920             if (!m_best_solution.m_error)
 921             {
 922                 // Got lucky - one of the previous endpoints is optimal.
 923                 return_solution(*m_pResults, m_best_solution);
 924                 return true;
 925             }
 926         }
 927
 928         if (m_pParams->m_quality >= cCRNDXTQualityBetter)
 929         {
 930             //evaluate_solution(dxt1_solution_coordinates(low_color, high_color), true, &m_best_solution);
 931             //refine_solution();
 932
 933             try_median4(orig_low_color, orig_high_color);
 934         }
 935
 936         uint probe_low[cUberProbeTableSize * 2 + 1];
 937         uint probe_high[cUberProbeTableSize * 2 + 1];
 938
 939         vec3F scaled_principle_axis[2];
 940
 941         scaled_principle_axis[1] = m_principle_axis * dist_per_trial;
 942         scaled_principle_axis[1][0] *= 31.0f;
 943         scaled_principle_axis[1][1] *= 63.0f;
 944         scaled_principle_axis[1][2] *= 31.0f;
 945
 946         scaled_principle_axis[0] = -scaled_principle_axis[1];
 947
 948         //vec3F initial_ofs(scaled_principle_axis * (float)-probe_range);
 949         //initial_ofs[0] += .5f;
 950         //initial_ofs[1] += .5f;
 951         //initial_ofs[2] += .5f;
 952
 953         low_color[0] = math::clamp(low_color[0] * 31.0f, 0.0f, 31.0f);
 954         low_color[1] = math::clamp(low_color[1] * 63.0f, 0.0f, 63.0f);
 955         low_color[2] = math::clamp(low_color[2] * 31.0f, 0.0f, 31.0f);
 956
 957         high_color[0] = math::clamp(high_color[0] * 31.0f, 0.0f, 31.0f);
 958         high_color[1] = math::clamp(high_color[1] * 63.0f, 0.0f, 63.0f);
 959         high_color[2] = math::clamp(high_color[2] * 31.0f, 0.0f, 31.0f);
 960
 961         for (uint pass = 0; pass < num_passes; pass++)
 962         {
 963             // Now separately sweep or probe the low and high colors along the principle axis, both positively and negatively.
 964             // This results in two arrays of candidate low/high endpoints. Every unique combination of candidate endpoints is tried as a potential solution.
 965             // In higher quality modes, the various nearby lattice neighbors of each candidate endpoint are also explored, which allows the current solution to "wobble" or "migrate"
 966             // to areas with lower error.
 967             // This entire process can be repeated up to X times (depending on the quality level) until a local minimum is established.
 968             // This method is very stable and scalable. It could be implemented more elegantly, but I'm now very cautious of touching this code.
 969             if (pass)
 970             {
 971                 low_color = unpack_to_vec3F_raw(m_best_solution.m_coords.m_low_color);
 972                 high_color = unpack_to_vec3F_raw(m_best_solution.m_coords.m_high_color);
 973             }
 974
 975             const uint64_t prev_best_error = m_best_solution.m_error;
 976             if (!prev_best_error)
 977                 break;
 978
 979             // Sweep low endpoint along principle axis, record positions
 980             int prev_packed_color[2] = { -1, -1 };
 981             uint num_low_trials = 0;
 982             vec3F initial_probe_low_color(low_color + vec3F(.5f));
 983             for (uint i = 0; i < probe_range; i++)
 984             {
 985                 const int ls = i ? 0 : 1;
 986                 int x = pProbe_table[i];
 987
 988                 for (int s = ls; s < 2; s++)
 989                 {
 990                     vec3F probe_low_color(initial_probe_low_color + scaled_principle_axis[s] * (float)x);
 991
 992                     int r = math::clamp((int)floor(probe_low_color[0]), 0, 31);
 993                     int g = math::clamp((int)floor(probe_low_color[1]), 0, 63);
 994                     int b = math::clamp((int)floor(probe_low_color[2]), 0, 31);
 995
 996                     int packed_color = b | (g << 5U) | (r << 11U);
 997                     if (packed_color != prev_packed_color[s])
 998                     {
 999                         probe_low[num_low_trials++] = packed_color;
1000                         prev_packed_color[s] = packed_color;
1001                     }
1002                 }
1003             }
1004
1005             prev_packed_color[0] = -1;
1006             prev_packed_color[1] = -1;
1007
1008             // Sweep high endpoint along principle axis, record positions
1009             uint num_high_trials = 0;
1010             vec3F initial_probe_high_color(high_color + vec3F(.5f));
1011             for (uint i = 0; i < probe_range; i++)
1012             {
1013                 const int ls = i ? 0 : 1;
1014                 int x = pProbe_table[i];
1015
1016                 for (int s = ls; s < 2; s++)
1017                 {
1018                     vec3F probe_high_color(initial_probe_high_color + scaled_principle_axis[s] * (float)x);
1019
1020                     int r = math::clamp((int)floor(probe_high_color[0]), 0, 31);
1021                     int g = math::clamp((int)floor(probe_high_color[1]), 0, 63);
1022                     int b = math::clamp((int)floor(probe_high_color[2]), 0, 31);
1023
1024                     int packed_color = b | (g << 5U) | (r << 11U);
1025                     if (packed_color != prev_packed_color[s])
1026                     {
1027                         probe_high[num_high_trials++] = packed_color;
1028                         prev_packed_color[s] = packed_color;
1029                     }
1030                 }
1031             }
1032
1033             // Now try all unique combinations.
1034             for (uint i = 0; i < num_low_trials; i++)
1035             {
1036                 for (uint j = 0; j < num_high_trials; j++)
1037                 {
1038                     dxt1_solution_coordinates coords((uint16)probe_low[i], (uint16)probe_high[j]);
1039                     coords.canonicalize();
1040
1041                     solution_hash_map::insert_result solution_res(m_solutions_tried.insert(coords.m_low_color | (coords.m_high_color << 16U)));
1042                     if (!solution_res.second)
1043                         continue;
1044
1045                     evaluate_solution(coords, true, &m_best_solution);
1046                 }
1047             }
1048
1049             if (m_pParams->m_quality >= cCRNDXTQualityNormal)
1050             {
1051                 // Generate new candidates by exploring the low color's direct lattice neighbors
1052                 color_quad_u8 lc(dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, false));
1053
1054                 for (int i = 0; i < 26; i++)
1055                 {
1056                     int r = lc.r + g_adjacency[i].x;
1057                     if ((r < 0) || (r > 31))
1058                         continue;
1059
1060                     int g = lc.g + g_adjacency[i].y;
1061                     if ((g < 0) || (g > 63))
1062                         continue;
1063
1064                     int b = lc.b + g_adjacency[i].z;
1065                     if ((b < 0) || (b > 31))
1066                         continue;
1067
1068                     dxt1_solution_coordinates coords(dxt1_block::pack_color(r, g, b, false), m_best_solution.m_coords.m_high_color);
1069                     coords.canonicalize();
1070
1071                     solution_hash_map::insert_result solution_res(m_solutions_tried.insert(coords.m_low_color | (coords.m_high_color << 16U)));
1072                     if (solution_res.second)
1073                         evaluate_solution(coords, true, &m_best_solution);
1074                 }
1075
1076                 if (m_pParams->m_quality == cCRNDXTQualityUber)
1077                 {
1078                     // Generate new candidates by exploring the low color's direct lattice neighbors - this time, explore much further separately on each axis.
1079                     lc = dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, false);
1080
1081                     for (int a = 0; a < 3; a++)
1082                     {
1083                         int limit = (a == 1) ? 63 : 31;
1084
1085                         for (int s = -2; s <= 2; s += 4)
1086                         {
1087                             color_quad_u8 c(lc);
1088                             int q = c[a] + s;
1089                             if ((q < 0) || (q > limit))
1090                                 continue;
1091
1092                             c[a] = (uint8)q;
1093
1094                             dxt1_solution_coordinates coords(dxt1_block::pack_color(c, false), m_best_solution.m_coords.m_high_color);
1095                             coords.canonicalize();
1096
1097                             solution_hash_map::insert_result solution_res(m_solutions_tried.insert(coords.m_low_color | (coords.m_high_color << 16U)));
1098                             if (solution_res.second)
1099                                 evaluate_solution(coords, true, &m_best_solution);
1100                         }
1101                     }
1102                 }
1103
1104                 // Generate new candidates by exploring the high color's direct lattice neighbors
1105                 color_quad_u8 hc(dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, false));
1106
1107                 for (int i = 0; i < 26; i++)
1108                 {
1109                     int r = hc.r + g_adjacency[i].x;
1110                     if ((r < 0) || (r > 31))
1111                         continue;
1112
1113                     int g = hc.g + g_adjacency[i].y;
1114                     if ((g < 0) || (g > 63))
1115                         continue;
1116
1117                     int b = hc.b + g_adjacency[i].z;
1118                     if ((b < 0) || (b > 31))
1119                         continue;
1120
1121                     dxt1_solution_coordinates coords(m_best_solution.m_coords.m_low_color, dxt1_block::pack_color(r, g, b, false));
1122                     coords.canonicalize();
1123
1124                     solution_hash_map::insert_result solution_res(m_solutions_tried.insert(coords.m_low_color | (coords.m_high_color << 16U)));
1125                     if (solution_res.second)
1126                         evaluate_solution(coords, true, &m_best_solution);
1127                 }
1128
1129                 if (m_pParams->m_quality == cCRNDXTQualityUber)
1130                 {
1131                     // Generate new candidates by exploring the high color's direct lattice neighbors - this time, explore much further separately on each axis.
1132                     hc = dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, false);
1133
1134                     for (int a = 0; a < 3; a++)
1135                     {
1136                         int limit = (a == 1) ? 63 : 31;
1137
1138                         for (int s = -2; s <= 2; s += 4)
1139                         {
1140                             color_quad_u8 c(hc);
1141                             int q = c[a] + s;
1142                             if ((q < 0) || (q > limit))
1143                                 continue;
1144
1145                             c[a] = (uint8)q;
1146
1147                             dxt1_solution_coordinates coords(m_best_solution.m_coords.m_low_color, dxt1_block::pack_color(c, false));
1148                             coords.canonicalize();
1149
1150                             solution_hash_map::insert_result solution_res(m_solutions_tried.insert(coords.m_low_color | (coords.m_high_color << 16U)));
1151                             if (solution_res.second)
1152                                 evaluate_solution(coords, true, &m_best_solution);
1153                         }
1154                     }
1155                 }
1156             }
1157
1158             if ((!m_best_solution.m_error) || ((pass) && (m_best_solution.m_error == prev_best_error)))
1159                 break;
1160
1161             if (m_pParams->m_quality >= cCRNDXTQualityUber)
1162             {
1163                 // Attempt to refine current solution's endpoints given the current selectors using least squares.
1164                 refine_solution(1);
1165             }
1166         }
1167
1168         if (m_pParams->m_quality >= cCRNDXTQualityNormal)
1169         {
1170             if ((m_best_solution.m_error) && (!m_pParams->m_pixels_have_alpha))
1171             {
1172                 bool choose_solid_block = false;
1173                 if (m_best_solution.are_selectors_all_equal())
1174                 {
1175                     // All selectors equal - try various solid-block optimizations
1176                     choose_solid_block = try_average_block_as_solid();
1177                 }
1178
1179                 if ((!choose_solid_block) && (m_pParams->m_quality == cCRNDXTQualityUber))
1180                 {
1181                     // Per-component 1D endpoint optimization.
1182                     optimize_endpoint_comps();
1183                 }
1184             }
1185
1186             if (m_pParams->m_quality == cCRNDXTQualityUber)
1187             {
1188                 if (m_best_solution.m_error)
1189                 {
1190                     // The pixels may have already been DXTc compressed by another compressor.
1191                     // It's usually possible to recover the endpoints used to previously pack the block.
1192                     try_combinatorial_encoding();
1193                 }
1194             }
1195         }
1196
1197         return_solution(*m_pResults, m_best_solution);
1198
1199         if (m_pParams->m_endpoint_caching)
1200         {
1201             // Remember result for later reruse.
1202             m_prev_results[m_num_prev_results & (cMaxPrevResults - 1)] = m_best_solution.m_coords;
1203             m_num_prev_results++;
1204         }
1205
1206         return true;
1207     }
1208
1209     static inline int mul_8bit(int a, int b)
1210     {
1211         int t = a * b + 128;
1212         return (t + (t >> 8)) >> 8;
1213     }
1214
1215     bool dxt1_endpoint_optimizer::handle_multicolor_block()
1216     {
1217         uint num_passes = 1;
1218         vec3F perceptual_weights(1.0f);
1219
1220         if (m_perceptual)
1221         {
1222             // Compute RGB weighting for use in perceptual mode.
1223             // The more saturated the block, the more the weights deviate from (1,1,1).
1224             float ave_redness = 0;
1225             float ave_blueness = 0;
1226             float ave_l = 0;
1227
1228             for (uint i = 0; i < m_unique_colors.size(); i++)
1229             {
1230                 const color_quad_u8 &c = m_unique_colors[i].m_color;
1231                 const float weight = (float)m_unique_colors[i].m_weight;
1232
1233                 int l = mul_8bit(c.r + c.g + c.b, 0x55); // /3
1234                 ave_l += l;
1235                 l = math::maximum(1, l);
1236
1237                 float scale = weight / static_cast<float>(l);
1238
1239                 ave_redness += scale * c.r;
1240                 ave_blueness += scale * c.b;
1241             }
1242
1243             ave_redness /= m_total_unique_color_weight;
1244             ave_blueness /= m_total_unique_color_weight;
1245             ave_l /= m_total_unique_color_weight;
1246
1247             ave_l = math::minimum(1.0f, ave_l * 16.0f / 255.0f);
1248
1249             //float r = ave_l * powf(math::saturate(ave_redness / 3.0f), 5.0f);
1250             //float b = ave_l * powf(math::saturate(ave_blueness / 3.0f), 5.0f);
1251
1252             float p = ave_l * powf(math::saturate(math::maximum(ave_redness, ave_blueness) * 1.0f / 3.0f), 2.75f);
1253
1254             if (p >= 1.0f)
1255                 num_passes = 1;
1256             else
1257             {
1258                 num_passes = 2;
1259                 perceptual_weights = vec3F::lerp(vec3F(.212f, .72f, .072f), perceptual_weights, p);
1260             }
1261         }
1262
1263         for (uint pass_index = 0; pass_index < num_passes; pass_index++)
1264         {
1265             compute_vectors(perceptual_weights);
1266
1267             compute_pca(m_principle_axis, m_norm_unique_colors_weighted, vec3F(.2837149f, 0.9540631f, 0.096277453f));
1268
1269 #if 0
1270                 matrix44F m(matrix44F::make_scale_matrix(perceptual_weights[0], perceptual_weights[1], perceptual_weights[2]));
1271                 matrix44F im(m.get_inverse());
1272                 im.transpose_in_place();
1273                 m_principle_axis = m_principle_axis * im;
1274 #else
1275             // Purposely scale the components of the principle axis by the perceptual weighting.
1276             // There's probably a cleaner way to go about this, but it works (more competitive in perceptual mode against nvdxt.exe or ATI_Compress).
1277             m_principle_axis[0] /= perceptual_weights[0];
1278             m_principle_axis[1] /= perceptual_weights[1];
1279             m_principle_axis[2] /= perceptual_weights[2];
1280 #endif
1281             m_principle_axis.normalize_in_place();
1282
1283             if (num_passes > 1)
1284             {
1285                 // Check for obviously wild principle axes and try to compensate by backing off the component weightings.
1286                 if (fabs(m_principle_axis[0]) >= .795f)
1287                     perceptual_weights.set(.424f, .6f, .072f);
1288                 else if (fabs(m_principle_axis[2]) >= .795f)
1289                     perceptual_weights.set(.212f, .6f, .212f);
1290                 else
1291                     break;
1292             }
1293         }
1294
1295         // Find bounds of projection onto (potentially skewed) principle axis.
1296         float l = 1e+9;
1297         float h = -1e+9;
1298
1299         for (uint i = 0; i < m_norm_unique_colors.size(); i++)
1300         {
1301             float d = m_norm_unique_colors[i].dot(m_principle_axis);
1302             l = math::minimum(l, d);
1303             h = math::maximum(h, d);
1304         }
1305
1306         vec3F low_color(m_mean_norm_color + l * m_principle_axis);
1307         vec3F high_color(m_mean_norm_color + h * m_principle_axis);
1308
1309         if (!low_color.is_within_bounds(0.0f, 1.0f))
1310         {
1311             // Low color is outside the lattice, so bring it back in by casting a ray.
1312             vec3F coord;
1313             float t;
1314             aabb3F bounds(vec3F(0.0f), vec3F(1.0f));
1315             intersection::result res = intersection::ray_aabb(coord, t, ray3F(low_color, m_principle_axis), bounds);
1316             if (res == intersection::cSuccess)
1317                 low_color = coord;
1318         }
1319
1320         if (!high_color.is_within_bounds(0.0f, 1.0f))
1321         {
1322             // High color is outside the lattice, so bring it back in by casting a ray.
1323             vec3F coord;
1324             float t;
1325             aabb3F bounds(vec3F(0.0f), vec3F(1.0f));
1326             intersection::result res = intersection::ray_aabb(coord, t, ray3F(high_color, -m_principle_axis), bounds);
1327             if (res == intersection::cSuccess)
1328                 high_color = coord;
1329         }
1330
1331         // Now optimize the endpoints using the projection bounds on the (potentially skewed) principle axis as a starting point.
1332         if (!optimize_endpoints(low_color, high_color))
1333             return false;
1334
1335         return true;
1336     }
1337
1338     bool dxt1_endpoint_optimizer::handle_grayscale_block()
1339     {
1340         // TODO
1341         return true;
1342     }
1343
1344     // Tries quantizing the block to 4 colors using vanilla LBG. It tries all combinations of the quantized results as potential endpoints.
1345     bool dxt1_endpoint_optimizer::try_median4(const vec3F &low_color, const vec3F &high_color)
1346     {
1347         vec3F means[4];
1348
1349         if (m_unique_colors.size() <= 4)
1350         {
1351             for (uint i = 0; i < 4; i++)
1352                 means[i] = m_norm_unique_colors[math::minimum<int>(m_norm_unique_colors.size() - 1, i)];
1353         }
1354         else
1355         {
1356             means[0] = low_color - m_mean_norm_color;
1357             means[3] = high_color - m_mean_norm_color;
1358             means[1] = vec3F::lerp(means[0], means[3], 1.0f / 3.0f);
1359             means[2] = vec3F::lerp(means[0], means[3], 2.0f / 3.0f);
1360
1361             fast_random rm;
1362
1363             const uint cMaxIters = 8;
1364             uint reassign_rover = 0;
1365             float prev_total_dist = math::cNearlyInfinite;
1366             for (uint iter = 0; iter < cMaxIters; iter++)
1367             {
1368                 vec3F new_means[4];
1369                 float new_weights[4];
1370                 utils::zero_object(new_means);
1371                 utils::zero_object(new_weights);
1372
1373                 float total_dist = 0;
1374
1375                 for (uint i = 0; i < m_unique_colors.size(); i++)
1376                 {
1377                     const vec3F &v = m_norm_unique_colors[i];
1378
1379                     float best_dist = means[0].squared_distance(v);
1380                     int best_index = 0;
1381
1382                     for (uint j = 1; j < 4; j++)
1383                     {
1384                         float dist = means[j].squared_distance(v);
1385                         if (dist < best_dist)
1386                         {
1387                             best_dist = dist;
1388                             best_index = j;
1389                         }
1390                     }
1391
1392                     total_dist += best_dist;
1393
1394                     new_means[best_index] += v * (float)m_unique_colors[i].m_weight;
1395                     new_weights[best_index] += (float)m_unique_colors[i].m_weight;
1396                 }
1397
1398                 uint highest_index = 0;
1399                 float highest_weight = 0;
1400                 bool empty_cell = false;
1401                 for (uint j = 0; j < 4; j++)
1402                 {
1403                     if (new_weights[j] > 0.0f)
1404                     {
1405                         means[j] = new_means[j] / new_weights[j];
1406                         if (new_weights[j] > highest_weight)
1407                         {
1408                             highest_weight = new_weights[j];
1409                             highest_index = j;
1410                         }
1411                     }
1412                     else
1413                         empty_cell = true;
1414                 }
1415
1416                 if (!empty_cell)
1417                 {
1418                     if (fabs(total_dist - prev_total_dist) < .00001f)
1419                         break;
1420
1421                     prev_total_dist = total_dist;
1422                 }
1423                 else
1424                     prev_total_dist = math::cNearlyInfinite;
1425
1426                 if ((empty_cell) && (iter != (cMaxIters - 1)))
1427                 {
1428                     const uint ri = (highest_index + reassign_rover) & 3;
1429                     reassign_rover++;
1430
1431                     for (uint j = 0; j < 4; j++)
1432                     {
1433                         if (new_weights[j] == 0.0f)
1434                         {
1435                             means[j] = means[ri];
1436                             means[j] += vec3F::make_random(rm, -.00196f, .00196f);
1437                         }
1438                     }
1439                 }
1440             }
1441         }
1442
1443         bool improved = false;
1444
1445         for (uint i = 0; i < 3; i++)
1446         {
1447             for (uint j = i + 1; j < 4; j++)
1448             {
1449                 const vec3F v0(means[i] + m_mean_norm_color);
1450                 const vec3F v1(means[j] + m_mean_norm_color);
1451
1452                 dxt1_solution_coordinates sc(
1453                     color_quad_u8((int)floor(.5f + v0[0] * 31.0f), (int)floor(.5f + v0[1] * 63.0f), (int)floor(.5f + v0[2] * 31.0f), 255),
1454                     color_quad_u8((int)floor(.5f + v1[0] * 31.0f), (int)floor(.5f + v1[1] * 63.0f), (int)floor(.5f + v1[2] * 31.0f), 255), false);
1455
1456                 sc.canonicalize();
1457
1458                 improved |= evaluate_solution(sc, true, &m_best_solution, false);
1459             }
1460         }
1461
1462         improved |= refine_solution((m_pParams->m_quality == cCRNDXTQualityUber) ? 1 : 0);
1463
1464         return improved;
1465     }
1466
1467     // Given candidate low/high endpoints, find the optimal selectors for 3 and 4 color blocks, compute the resulting error,
1468     // and use the candidate if it results in less error than the best found result so far.
1469     bool dxt1_endpoint_optimizer::evaluate_solution(
1470         const dxt1_solution_coordinates &coords,
1471         bool early_out,
1472         potential_solution *pBest_solution,
1473         bool alternate_rounding)
1474     {
1475         m_total_evals++;
1476
1477         if ((!m_pSolutions) || (alternate_rounding))
1478         {
1479             if (m_pParams->m_quality >= cCRNDXTQualityBetter)
1480                 return evaluate_solution_uber(m_trial_solution, coords, early_out, pBest_solution, alternate_rounding);
1481             else
1482                 return evaluate_solution_fast(m_trial_solution, coords, early_out, pBest_solution, alternate_rounding);
1483         }
1484
1485         evaluate_solution_uber(m_trial_solution, coords, false, NULL, alternate_rounding);
1486
1487         VOGL_ASSERT(m_trial_solution.m_valid);
1488
1489         // Caller has requested all considered candidate solutions for later analysis.
1490         m_pSolutions->resize(m_pSolutions->size() + 1);
1491         solution &new_solution = m_pSolutions->back();
1492         new_solution.m_selectors.resize(m_pParams->m_num_pixels);
1493         new_solution.m_results.m_pSelectors = &new_solution.m_selectors[0];
1494
1495         return_solution(new_solution.m_results, m_trial_solution);
1496
1497         if ((pBest_solution) && (m_trial_solution.m_error < m_best_solution.m_error))
1498         {
1499             *pBest_solution = m_trial_solution;
1500             return true;
1501         }
1502
1503         return false;
1504     }
1505
1506     inline uint dxt1_endpoint_optimizer::color_distance(bool perceptual, const color_quad_u8 &e1, const color_quad_u8 &e2, bool alpha)
1507     {
1508         if (perceptual)
1509         {
1510             return color::color_distance(true, e1, e2, alpha);
1511         }
1512         else if (m_pParams->m_grayscale_sampling)
1513         {
1514             // Computes error assuming shader will be converting the result to grayscale.
1515             int y0 = color::RGB_to_Y(e1);
1516             int y1 = color::RGB_to_Y(e2);
1517             int yd = y0 - y1;
1518             if (alpha)
1519             {
1520                 int da = (int)e1[3] - (int)e2[3];
1521                 return yd * yd + da * da;
1522             }
1523             else
1524             {
1525                 return yd * yd;
1526             }
1527         }
1528         else if (m_has_color_weighting)
1529         {
1530             // Compute error using user provided color component weights.
1531             int dr = (int)e1[0] - (int)e2[0];
1532             int dg = (int)e1[1] - (int)e2[1];
1533             int db = (int)e1[2] - (int)e2[2];
1534
1535             dr = (dr * dr) * m_pParams->m_color_weights[0];
1536             dg = (dg * dg) * m_pParams->m_color_weights[1];
1537             db = (db * db) * m_pParams->m_color_weights[2];
1538
1539             if (alpha)
1540             {
1541                 int da = (int)e1[3] - (int)e2[3];
1542                 da = (da * da) * (m_pParams->m_color_weights[0] + m_pParams->m_color_weights[1] + m_pParams->m_color_weights[2]);
1543                 return dr + dg + db + da;
1544             }
1545             else
1546             {
1547                 return dr + dg + db;
1548             }
1549         }
1550         else
1551         {
1552             return color::color_distance(false, e1, e2, alpha);
1553         }
1554     }
1555
1556     bool dxt1_endpoint_optimizer::evaluate_solution_uber(
1557         potential_solution &solution,
1558         const dxt1_solution_coordinates &coords,
1559         bool early_out,
1560         potential_solution *pBest_solution,
1561         bool alternate_rounding)
1562     {
1563         solution.m_coords = coords;
1564         solution.m_selectors.resize(m_unique_colors.size());
1565
1566         if ((pBest_solution) && (early_out))
1567             solution.m_error = pBest_solution->m_error;
1568         else
1569             solution.m_error = cUINT64_MAX;
1570
1571         solution.m_alpha_block = false;
1572         solution.m_valid = false;
1573
1574         uint first_block_type = 0;
1575         uint last_block_type = 1;
1576
1577         if ((m_pParams->m_pixels_have_alpha) || (m_pParams->m_force_alpha_blocks))
1578             first_block_type = 1;
1579         else if (!m_pParams->m_use_alpha_blocks)
1580             last_block_type = 0;
1581
1582         m_trial_selectors.resize(m_unique_colors.size());
1583
1584         color_quad_u8 colors[cDXT1SelectorValues];
1585
1586         colors[0] = dxt1_block::unpack_color(coords.m_low_color, true);
1587         colors[1] = dxt1_block::unpack_color(coords.m_high_color, true);
1588
1589         for (uint block_type = first_block_type; block_type <= last_block_type; block_type++)
1590         {
1591             uint64_t trial_error = 0;
1592
1593             if (!block_type)
1594             {
1595                 colors[2].set_noclamp_rgba((colors[0].r * 2 + colors[1].r + alternate_rounding) / 3, (colors[0].g * 2 + colors[1].g + alternate_rounding) / 3, (colors[0].b * 2 + colors[1].b + alternate_rounding) / 3, 0);
1596                 colors[3].set_noclamp_rgba((colors[1].r * 2 + colors[0].r + alternate_rounding) / 3, (colors[1].g * 2 + colors[0].g + alternate_rounding) / 3, (colors[1].b * 2 + colors[0].b + alternate_rounding) / 3, 0);
1597
1598                 if (m_perceptual)
1599                 {
1600                     for (int unique_color_index = (int)m_unique_colors.size() - 1; unique_color_index >= 0; unique_color_index--)
1601                     {
1602                         const color_quad_u8 &c = m_unique_colors[unique_color_index].m_color;
1603
1604                         uint best_error = color_distance(true, c, colors[0], false);
1605                         uint best_color_index = 0;
1606
1607                         uint err = color_distance(true, c, colors[1], false);
1608                         if (err < best_error)
1609                         {
1610                             best_error = err;
1611                             best_color_index = 1;
1612                         }
1613
1614                         err = color_distance(true, c, colors[2], false);
1615                         if (err < best_error)
1616                         {
1617                             best_error = err;
1618                             best_color_index = 2;
1619                         }
1620
1621                         err = color_distance(true, c, colors[3], false);
1622                         if (err < best_error)
1623                         {
1624                             best_error = err;
1625                             best_color_index = 3;
1626                         }
1627
1628                         trial_error += best_error * m_unique_colors[unique_color_index].m_weight;
1629                         if (trial_error >= solution.m_error)
1630                             break;
1631
1632                         m_trial_selectors[unique_color_index] = static_cast<uint8>(best_color_index);
1633                     }
1634                 }
1635                 else
1636                 {
1637                     for (int unique_color_index = (int)m_unique_colors.size() - 1; unique_color_index >= 0; unique_color_index--)
1638                     {
1639                         const color_quad_u8 &c = m_unique_colors[unique_color_index].m_color;
1640
1641                         uint best_error = color_distance(false, c, colors[0], false);
1642                         uint best_color_index = 0;
1643
1644                         uint err = color_distance(false, c, colors[1], false);
1645                         if (err < best_error)
1646                         {
1647                             best_error = err;
1648                             best_color_index = 1;
1649                         }
1650
1651                         err = color_distance(false, c, colors[2], false);
1652                         if (err < best_error)
1653                         {
1654                             best_error = err;
1655                             best_color_index = 2;
1656                         }
1657
1658                         err = color_distance(false, c, colors[3], false);
1659                         if (err < best_error)
1660                         {
1661                             best_error = err;
1662                             best_color_index = 3;
1663                         }
1664
1665                         trial_error += best_error * m_unique_colors[unique_color_index].m_weight;
1666                         if (trial_error >= solution.m_error)
1667                             break;
1668
1669                         m_trial_selectors[unique_color_index] = static_cast<uint8>(best_color_index);
1670                     }
1671                 }
1672             }
1673             else
1674             {
1675                 colors[2].set_noclamp_rgba((colors[0].r + colors[1].r + alternate_rounding) >> 1, (colors[0].g + colors[1].g + alternate_rounding) >> 1, (colors[0].b + colors[1].b + alternate_rounding) >> 1, 255U);
1676
1677                 if (m_perceptual)
1678                 {
1679                     for (int unique_color_index = (int)m_unique_colors.size() - 1; unique_color_index >= 0; unique_color_index--)
1680                     {
1681                         const color_quad_u8 &c = m_unique_colors[unique_color_index].m_color;
1682
1683                         uint best_error = color_distance(true, c, colors[0], false);
1684                         uint best_color_index = 0;
1685
1686                         uint err = color_distance(true, c, colors[1], false);
1687                         if (err < best_error)
1688                         {
1689                             best_error = err;
1690                             best_color_index = 1;
1691                         }
1692
1693                         err = color_distance(true, c, colors[2], false);
1694                         if (err < best_error)
1695                         {
1696                             best_error = err;
1697                             best_color_index = 2;
1698                         }
1699
1700                         trial_error += best_error * m_unique_colors[unique_color_index].m_weight;
1701                         if (trial_error >= solution.m_error)
1702                             break;
1703
1704                         m_trial_selectors[unique_color_index] = static_cast<uint8>(best_color_index);
1705                     }
1706                 }
1707                 else
1708                 {
1709                     for (int unique_color_index = (int)m_unique_colors.size() - 1; unique_color_index >= 0; unique_color_index--)
1710                     {
1711                         const color_quad_u8 &c = m_unique_colors[unique_color_index].m_color;
1712
1713                         uint best_error = color_distance(false, c, colors[0], false);
1714                         uint best_color_index = 0;
1715
1716                         uint err = color_distance(false, c, colors[1], false);
1717                         if (err < best_error)
1718                         {
1719                             best_error = err;
1720                             best_color_index = 1;
1721                         }
1722
1723                         err = color_distance(false, c, colors[2], false);
1724                         if (err < best_error)
1725                         {
1726                             best_error = err;
1727                             best_color_index = 2;
1728                         }
1729
1730                         trial_error += best_error * m_unique_colors[unique_color_index].m_weight;
1731                         if (trial_error >= solution.m_error)
1732                             break;
1733
1734                         m_trial_selectors[unique_color_index] = static_cast<uint8>(best_color_index);
1735                     }
1736                 }
1737             }
1738
1739             if (trial_error < solution.m_error)
1740             {
1741                 solution.m_error = trial_error;
1742                 solution.m_alpha_block = (block_type != 0);
1743                 solution.m_selectors = m_trial_selectors;
1744                 solution.m_valid = true;
1745             }
1746         }
1747
1748         if ((!solution.m_alpha_block) && (solution.m_coords.m_low_color == solution.m_coords.m_high_color))
1749         {
1750             uint s;
1751             if ((solution.m_coords.m_low_color & 31) != 31)
1752             {
1753                 solution.m_coords.m_low_color++;
1754                 s = 1;
1755             }
1756             else
1757             {
1758                 solution.m_coords.m_high_color--;
1759                 s = 0;
1760             }
1761
1762             for (uint i = 0; i < m_unique_colors.size(); i++)
1763                 solution.m_selectors[i] = static_cast<uint8>(s);
1764         }
1765
1766         if ((pBest_solution) && (solution.m_error < pBest_solution->m_error))
1767         {
1768             *pBest_solution = solution;
1769             return true;
1770         }
1771
1772         return false;
1773     }
1774
1775     bool dxt1_endpoint_optimizer::evaluate_solution_fast(
1776         potential_solution &solution,
1777         const dxt1_solution_coordinates &coords,
1778         bool early_out,
1779         potential_solution *pBest_solution,
1780         bool alternate_rounding)
1781     {
1782         solution.m_coords = coords;
1783         solution.m_selectors.resize(m_unique_colors.size());
1784
1785         if ((pBest_solution) && (early_out))
1786             solution.m_error = pBest_solution->m_error;
1787         else
1788             solution.m_error = cUINT64_MAX;
1789
1790         solution.m_alpha_block = false;
1791         solution.m_valid = false;
1792
1793         uint first_block_type = 0;
1794         uint last_block_type = 1;
1795
1796         if ((m_pParams->m_pixels_have_alpha) || (m_pParams->m_force_alpha_blocks))
1797             first_block_type = 1;
1798         else if (!m_pParams->m_use_alpha_blocks)
1799             last_block_type = 0;
1800
1801         m_trial_selectors.resize(m_unique_colors.size());
1802
1803         color_quad_u8 colors[cDXT1SelectorValues];
1804         colors[0] = dxt1_block::unpack_color(coords.m_low_color, true);
1805         colors[1] = dxt1_block::unpack_color(coords.m_high_color, true);
1806
1807         int vr = colors[1].r - colors[0].r;
1808         int vg = colors[1].g - colors[0].g;
1809         int vb = colors[1].b - colors[0].b;
1810         if (m_perceptual)
1811         {
1812             vr *= 8;
1813             vg *= 24;
1814         }
1815
1816         int stops[4];
1817         stops[0] = colors[0].r * vr + colors[0].g * vg + colors[0].b * vb;
1818         stops[1] = colors[1].r * vr + colors[1].g * vg + colors[1].b * vb;
1819
1820         int dirr = vr * 2;
1821         int dirg = vg * 2;
1822         int dirb = vb * 2;
1823
1824         for (uint block_type = first_block_type; block_type <= last_block_type; block_type++)
1825         {
1826             uint64_t trial_error = 0;
1827
1828             if (!block_type)
1829             {
1830                 colors[2].set_noclamp_rgba((colors[0].r * 2 + colors[1].r + alternate_rounding) / 3, (colors[0].g * 2 + colors[1].g + alternate_rounding) / 3, (colors[0].b * 2 + colors[1].b + alternate_rounding) / 3, 255U);
1831                 colors[3].set_noclamp_rgba((colors[1].r * 2 + colors[0].r + alternate_rounding) / 3, (colors[1].g * 2 + colors[0].g + alternate_rounding) / 3, (colors[1].b * 2 + colors[0].b + alternate_rounding) / 3, 255U);
1832
1833                 stops[2] = colors[2].r * vr + colors[2].g * vg + colors[2].b * vb;
1834                 stops[3] = colors[3].r * vr + colors[3].g * vg + colors[3].b * vb;
1835
1836                 // 0 2 3 1
1837                 int c0Point = stops[1] + stops[3];
1838                 int halfPoint = stops[3] + stops[2];
1839                 int c3Point = stops[2] + stops[0];
1840
1841                 for (int unique_color_index = (int)m_unique_colors.size() - 1; unique_color_index >= 0; unique_color_index--)
1842                 {
1843                     const color_quad_u8 &c = m_unique_colors[unique_color_index].m_color;
1844
1845                     int dot = c.r * dirr + c.g * dirg + c.b * dirb;
1846
1847                     uint8 best_color_index;
1848                     if (dot < halfPoint)
1849                         best_color_index = (dot < c3Point) ? 0 : 2;
1850                     else
1851                         best_color_index = (dot < c0Point) ? 3 : 1;
1852
1853                     uint best_error = color_distance(m_perceptual, c, colors[best_color_index], false);
1854
1855                     trial_error += best_error * m_unique_colors[unique_color_index].m_weight;
1856                     if (trial_error >= solution.m_error)
1857                         break;
1858
1859                     m_trial_selectors[unique_color_index] = static_cast<uint8>(best_color_index);
1860                 }
1861             }
1862             else
1863             {
1864                 colors[2].set_noclamp_rgba((colors[0].r + colors[1].r + alternate_rounding) >> 1, (colors[0].g + colors[1].g + alternate_rounding) >> 1, (colors[0].b + colors[1].b + alternate_rounding) >> 1, 255U);
1865
1866                 stops[2] = colors[2].r * vr + colors[2].g * vg + colors[2].b * vb;
1867
1868                 // 0 2 1
1869                 int c02Point = stops[0] + stops[2];
1870                 int c21Point = stops[2] + stops[1];
1871
1872                 for (int unique_color_index = (int)m_unique_colors.size() - 1; unique_color_index >= 0; unique_color_index--)
1873                 {
1874                     const color_quad_u8 &c = m_unique_colors[unique_color_index].m_color;
1875
1876                     int dot = c.r * dirr + c.g * dirg + c.b * dirb;
1877
1878                     uint8 best_color_index;
1879                     if (dot < c02Point)
1880                         best_color_index = 0;
1881                     else if (dot < c21Point)
1882                         best_color_index = 2;
1883                     else
1884                         best_color_index = 1;
1885
1886                     uint best_error = color_distance(m_perceptual, c, colors[best_color_index], false);
1887
1888                     trial_error += best_error * m_unique_colors[unique_color_index].m_weight;
1889                     if (trial_error >= solution.m_error)
1890                         break;
1891
1892                     m_trial_selectors[unique_color_index] = static_cast<uint8>(best_color_index);
1893                 }
1894             }
1895
1896             if (trial_error < solution.m_error)
1897             {
1898                 solution.m_error = trial_error;
1899                 solution.m_alpha_block = (block_type != 0);
1900                 solution.m_selectors = m_trial_selectors;
1901                 solution.m_valid = true;
1902             }
1903         }
1904
1905         if ((!solution.m_alpha_block) && (solution.m_coords.m_low_color == solution.m_coords.m_high_color))
1906         {
1907             uint s;
1908             if ((solution.m_coords.m_low_color & 31) != 31)
1909             {
1910                 solution.m_coords.m_low_color++;
1911                 s = 1;
1912             }
1913             else
1914             {
1915                 solution.m_coords.m_high_color--;
1916                 s = 0;
1917             }
1918
1919             for (uint i = 0; i < m_unique_colors.size(); i++)
1920                 solution.m_selectors[i] = static_cast<uint8>(s);
1921         }
1922
1923         if ((pBest_solution) && (solution.m_error < pBest_solution->m_error))
1924         {
1925             *pBest_solution = solution;
1926             return true;
1927         }
1928
1929         return false;
1930     }
1931
1932     unique_color dxt1_endpoint_optimizer::lerp_color(const color_quad_u8 &a, const color_quad_u8 &b, float f, int rounding)
1933     {
1934         color_quad_u8 res;
1935
1936         float r = rounding ? 1.0f : 0.0f;
1937         res[0] = static_cast<uint8>(math::clamp(math::float_to_int(r + math::lerp<float>(a[0], b[0], f)), 0, 255));
1938         res[1] = static_cast<uint8>(math::clamp(math::float_to_int(r + math::lerp<float>(a[1], b[1], f)), 0, 255));
1939         res[2] = static_cast<uint8>(math::clamp(math::float_to_int(r + math::lerp<float>(a[2], b[2], f)), 0, 255));
1940         res[3] = 255;
1941
1942         return unique_color(res, 1);
1943     }
1944
1945     // The block may have been already compressed using another DXTc compressor, such as squish, ATI_Compress, ryg_dxt, etc.
1946     // Attempt to recover the endpoints used by that block compressor.
1947     void dxt1_endpoint_optimizer::try_combinatorial_encoding()
1948     {
1949         if ((m_unique_colors.size() < 2) || (m_unique_colors.size() > 4))
1950             return;
1951
1952         m_temp_unique_colors = m_unique_colors;
1953
1954         if (m_temp_unique_colors.size() == 2)
1955         {
1956             // a    b    c   d
1957             // 0.0  1/3 2/3  1.0
1958
1959             for (uint k = 0; k < 2; k++)
1960             {
1961                 for (uint q = 0; q < 2; q++)
1962                 {
1963                     const uint r = q ^ 1;
1964
1965                     // a b
1966                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, 2.0f, k));
1967                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, 3.0f, k));
1968
1969                     // a c
1970                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, .5f, k));
1971                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, 1.5f, k));
1972
1973                     // a d
1974
1975                     // b c
1976                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, -1.0f, k));
1977                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, 2.0f, k));
1978
1979                     // b d
1980                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, -.5f, k));
1981                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, .5f, k));
1982
1983                     // c d
1984                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, -2.0f, k));
1985                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, -1.0f, k));
1986                 }
1987             }
1988         }
1989         else if (m_temp_unique_colors.size() == 3)
1990         {
1991             // a    b    c   d
1992             // 0.0  1/3 2/3  1.0
1993
1994             for (uint i = 0; i <= 2; i++)
1995             {
1996                 for (uint j = 0; j <= 2; j++)
1997                 {
1998                     if (i == j)
1999                         continue;
2000
2001                     // a b c
2002                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[i].m_color, m_temp_unique_colors[j].m_color, 1.5f));
2003
2004                     // a b d
2005                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[i].m_color, m_temp_unique_colors[j].m_color, 2.0f / 3.0f));
2006
2007                     // a c d
2008                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[i].m_color, m_temp_unique_colors[j].m_color, 1.0f / 3.0f));
2009
2010                     // b c d
2011                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[i].m_color, m_temp_unique_colors[j].m_color, -.5f));
2012                 }
2013             }
2014         }
2015
2016         m_unique_packed_colors.resize(0);
2017
2018         for (uint i = 0; i < m_temp_unique_colors.size(); i++)
2019         {
2020             const color_quad_u8 &unique_color = m_temp_unique_colors[i].m_color;
2021             const uint16 packed_color = dxt1_block::pack_color(unique_color, true);
2022
2023             if (std::find(m_unique_packed_colors.begin(), m_unique_packed_colors.end(), packed_color) != m_unique_packed_colors.end())
2024                 continue;
2025
2026             m_unique_packed_colors.push_back(packed_color);
2027         }
2028
2029         if (m_unique_packed_colors.size() < 2)
2030             return;
2031
2032         for (uint alt_rounding = 0; alt_rounding < 2; alt_rounding++)
2033         {
2034             for (uint i = 0; i < m_unique_packed_colors.size() - 1; i++)
2035             {
2036                 for (uint j = i + 1; j < m_unique_packed_colors.size(); j++)
2037                 {
2038                     evaluate_solution(
2039                         dxt1_solution_coordinates(m_unique_packed_colors[i], m_unique_packed_colors[j]),
2040                         true,
2041                         (alt_rounding == 0) ? &m_best_solution : NULL,
2042                         (alt_rounding != 0));
2043
2044                     if (m_trial_solution.m_error == 0)
2045                     {
2046                         if (alt_rounding)
2047                             m_best_solution = m_trial_solution;
2048
2049                         return;
2050                     }
2051                 }
2052             }
2053         }
2054
2055         return;
2056     }
2057
2058     // The fourth (transparent) color in 3 color "transparent" blocks is black, which can be optionally exploited for small gains in DXT1 mode if the caller
2059     // doesn't actually use alpha. (But not in DXT5 mode, because 3-color blocks aren't permitted by GPU's for DXT5.)
2060     bool dxt1_endpoint_optimizer::try_alpha_as_black_optimization()
2061     {
2062         const params *pOrig_params = m_pParams;
2063         VOGL_NOTE_UNUSED(pOrig_params);
2064         results *pOrig_results = m_pResults;
2065
2066         uint num_dark_colors = 0;
2067
2068         for (uint i = 0; i < m_unique_colors.size(); i++)
2069             if ((m_unique_colors[i].m_color[0] <= 4) && (m_unique_colors[i].m_color[1] <= 4) && (m_unique_colors[i].m_color[2] <= 4))
2070                 num_dark_colors++;
2071
2072         if ((!num_dark_colors) || (num_dark_colors == m_unique_colors.size()))
2073             return true;
2074
2075         params trial_params(*m_pParams);
2076         vogl::vector<color_quad_u8> trial_colors;
2077         trial_colors.insert(0, m_pParams->m_pPixels, m_pParams->m_num_pixels);
2078
2079         trial_params.m_pPixels = trial_colors.get_ptr();
2080         trial_params.m_pixels_have_alpha = true;
2081
2082         for (uint i = 0; i < trial_colors.size(); i++)
2083             if ((trial_colors[i][0] <= 4) && (trial_colors[i][1] <= 4) && (trial_colors[i][2] <= 4))
2084                 trial_colors[i][3] = 0;
2085
2086         results trial_results;
2087
2088         vogl::vector<uint8> trial_selectors(m_pParams->m_num_pixels);
2089         trial_results.m_pSelectors = trial_selectors.get_ptr();
2090
2091         if (!compute_internal(trial_params, trial_results, NULL))
2092             return false;
2093
2094         VOGL_ASSERT(trial_results.m_alpha_block);
2095
2096         color_quad_u8 c[4];
2097         dxt1_block::get_block_colors3(c, trial_results.m_low_color, trial_results.m_high_color);
2098
2099         uint64_t trial_error = 0;
2100
2101         for (uint i = 0; i < trial_colors.size(); i++)
2102         {
2103             if (trial_colors[i][3] == 0)
2104             {
2105                 VOGL_ASSERT(trial_selectors[i] == 3);
2106             }
2107             else
2108             {
2109                 VOGL_ASSERT(trial_selectors[i] != 3);
2110             }
2111
2112             trial_error += color_distance(m_perceptual, trial_colors[i], c[trial_selectors[i]], false);
2113         }
2114
2115         if (trial_error < pOrig_results->m_error)
2116         {
2117             pOrig_results->m_error = trial_error;
2118
2119             pOrig_results->m_low_color = trial_results.m_low_color;
2120             pOrig_results->m_high_color = trial_results.m_high_color;
2121
2122             if (pOrig_results->m_pSelectors)
2123                 memcpy(pOrig_results->m_pSelectors, trial_results.m_pSelectors, m_pParams->m_num_pixels);
2124
2125             pOrig_results->m_alpha_block = true;
2126         }
2127
2128         return true;
2129     }
2130
2131     bool dxt1_endpoint_optimizer::compute_internal(const params &p, results &r, solution_vec *pSolutions)
2132     {
2133         clear();
2134
2135         m_pParams = &p;
2136         m_pResults = &r;
2137         m_pSolutions = pSolutions;
2138
2139         m_has_color_weighting = (m_pParams->m_color_weights[0] != 1) || (m_pParams->m_color_weights[1] != 1) || (m_pParams->m_color_weights[2] != 1);
2140         m_perceptual = m_pParams->m_perceptual && !m_has_color_weighting && !m_pParams->m_grayscale_sampling;
2141
2142         find_unique_colors();
2143
2144         m_best_solution.clear();
2145
2146         if (m_unique_colors.is_empty())
2147             return handle_all_transparent_block();
2148         else if ((m_unique_colors.size() == 1) && (!m_has_transparent_pixels))
2149             return handle_solid_block();
2150         else
2151         {
2152             if (!handle_multicolor_block())
2153                 return false;
2154
2155             if ((m_all_pixels_grayscale) && (m_best_solution.m_error))
2156             {
2157                 if (!handle_grayscale_block())
2158                     return false;
2159             }
2160         }
2161
2162         return true;
2163     }
2164
2165     bool dxt1_endpoint_optimizer::compute(const params &p, results &r, solution_vec *pSolutions)
2166     {
2167         if (!p.m_pPixels)
2168             return false;
2169
2170         bool status = compute_internal(p, r, pSolutions);
2171         if (!status)
2172             return false;
2173
2174         if ((m_pParams->m_use_alpha_blocks) && (m_pParams->m_use_transparent_indices_for_black) && (!m_pParams->m_pixels_have_alpha) && (!pSolutions))
2175         {
2176             if (!try_alpha_as_black_optimization())
2177                 return false;
2178         }
2179
2180         return true;
2181     }
2182
2183     // Build array of unique colors and their weights.
2184     void dxt1_endpoint_optimizer::find_unique_colors()
2185     {
2186         m_has_transparent_pixels = false;
2187
2188         uint num_opaque_pixels = 0;
2189
2190         const uint alpha_thresh = m_pParams->m_pixels_have_alpha ? (m_pParams->m_dxt1a_alpha_threshold << 24U) : 0;
2191
2192         const uint32 *pSrc_pixels = reinterpret_cast<const uint32 *>(m_pParams->m_pPixels);
2193         const uint32 *pSrc_pixels_end = pSrc_pixels + m_pParams->m_num_pixels;
2194
2195         m_unique_colors.resize(m_pParams->m_num_pixels);
2196         uint num_unique_colors = 0;
2197
2198         m_all_pixels_grayscale = true;
2199
2200         do
2201         {
2202             uint32 c = utils::read_le32(pSrc_pixels);
2203             pSrc_pixels++;
2204
2205             if (c < alpha_thresh)
2206             {
2207                 m_has_transparent_pixels = true;
2208                 continue;
2209             }
2210
2211             if (m_all_pixels_grayscale)
2212             {
2213                 uint r = c & 0xFF;
2214                 uint g = (c >> 8) & 0xFF;
2215                 uint b = (c >> 16) & 0xFF;
2216                 if ((r != g) || (r != b))
2217                     m_all_pixels_grayscale = false;
2218             }
2219
2220             c |= 0xFF000000U;
2221
2222             unique_color_hash_map::insert_result ins_result(m_unique_color_hash_map.insert(c, num_unique_colors));
2223
2224             if (ins_result.second)
2225             {
2226                 utils::write_le32(&m_unique_colors[num_unique_colors].m_color.m_u32, c);
2227                 m_unique_colors[num_unique_colors].m_weight = 1;
2228                 num_unique_colors++;
2229             }
2230             else
2231                 m_unique_colors[ins_result.first->second].m_weight++;
2232
2233             num_opaque_pixels++;
2234
2235         } while (pSrc_pixels != pSrc_pixels_end);
2236
2237         m_unique_colors.resize(num_unique_colors);
2238
2239         m_total_unique_color_weight = num_opaque_pixels;
2240     }
2241
2242 } // namespace vogl