]> git.cworth.org Git - vogl/blob - src/voglcore/vogl_dxt1.cpp
Initial vogl checkin
[vogl] / src / voglcore / vogl_dxt1.cpp
1 /**************************************************************************
2  *
3  * Copyright 2013-2014 RAD Game Tools and Valve Software
4  * Copyright 2010-2014 Rich Geldreich and Tenacious Software LLC
5  * All Rights Reserved.
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  *
25  **************************************************************************/
26
27 // File: vogl_dxt1.cpp
28 //
29 // Notes:
30 // This class is not optimized for performance on small blocks, unlike typical DXT1 compressors. It's optimized for scalability and quality:
31 // - Very high quality in terms of avg. RMSE or Luma RMSE. Goal is to always match or beat every other known offline DXTc compressor: ATI_Compress, squish, NVidia texture tools, nvdxt.exe, etc.
32 // - Reasonable scalability and stability with hundreds to many thousands of input colors (including inputs with many thousands of equal/nearly equal colors).
33 // - Any quality optimization which results in even a tiny improvement is worth it -- as long as it's either a constant or linear slowdown.
34 //   Tiny quality improvements can be extremely valuable in large clusters.
35 // - Quality should scale well vs. CPU time cost, i.e. the more time you spend the higher the quality.
36 #include "vogl_core.h"
37 #include "vogl_dxt1.h"
38 #include "vogl_ryg_dxt.hpp"
39 #include "vogl_dxt_fast.h"
40 #include "vogl_intersect.h"
41 #include "vogl_vec_interval.h"
42
43 namespace vogl
44 {
45     //-----------------------------------------------------------------------------------------------------------------------------------------
46
47     static const int16 g_fast_probe_table[] = { 0, 1, 2, 3 };
48     static const uint cFastProbeTableSize = sizeof(g_fast_probe_table) / sizeof(g_fast_probe_table[0]);
49
50     static const int16 g_normal_probe_table[] = { 0, 1, 3, 5, 7 };
51     static const uint cNormalProbeTableSize = sizeof(g_normal_probe_table) / sizeof(g_normal_probe_table[0]);
52
53     static const int16 g_better_probe_table[] = { 0, 1, 2, 3, 5, 9, 15, 19, 27, 43 };
54     static const uint cBetterProbeTableSize = sizeof(g_better_probe_table) / sizeof(g_better_probe_table[0]);
55
56     static const int16 g_uber_probe_table[] = { 0, 1, 2, 3, 5, 7, 9, 10, 13, 15, 19, 27, 43, 59, 91 };
57     static const uint cUberProbeTableSize = sizeof(g_uber_probe_table) / sizeof(g_uber_probe_table[0]);
58
59     //-----------------------------------------------------------------------------------------------------------------------------------------
60
61     dxt1_endpoint_optimizer::dxt1_endpoint_optimizer()
62         : m_pParams(NULL),
63           m_pResults(NULL),
64           m_pSolutions(NULL),
65           m_perceptual(false),
66           m_has_color_weighting(false),
67           m_all_pixels_grayscale(false)
68     {
69         m_low_coords.reserve(512);
70         m_high_coords.reserve(512);
71
72         m_unique_colors.reserve(512);
73         m_temp_unique_colors.reserve(512);
74         m_unique_packed_colors.reserve(512);
75
76         m_norm_unique_colors.reserve(512);
77         m_norm_unique_colors_weighted.reserve(512);
78
79         m_lo_cells.reserve(128);
80         m_hi_cells.reserve(128);
81     }
82
83     void dxt1_endpoint_optimizer::clear()
84     {
85         m_pParams = NULL;
86         m_pResults = NULL;
87         m_pSolutions = NULL;
88
89         if (m_unique_color_hash_map.get_table_size() > 8192)
90             m_unique_color_hash_map.clear();
91         else
92             m_unique_color_hash_map.reset();
93
94         if (m_solutions_tried.get_table_size() > 8192)
95             m_solutions_tried.clear();
96
97         m_unique_colors.resize(0);
98
99         m_has_transparent_pixels = false;
100         m_total_unique_color_weight = 0;
101
102         m_norm_unique_colors.resize(0);
103         m_mean_norm_color.clear();
104
105         m_norm_unique_colors_weighted.resize(0);
106         m_mean_norm_color_weighted.clear();
107
108         m_principle_axis.clear();
109
110         m_total_evals = 0;
111         m_all_pixels_grayscale = false;
112         m_has_color_weighting = false;
113         m_perceptual = false;
114     }
115
116     bool dxt1_endpoint_optimizer::handle_all_transparent_block()
117     {
118         m_pResults->m_low_color = 0;
119         m_pResults->m_high_color = 0;
120         m_pResults->m_alpha_block = true;
121
122         memset(m_pResults->m_pSelectors, 3, m_pParams->m_num_pixels);
123
124         return true;
125     }
126
127     // All selectors are equal. Try compressing as if it was solid, using the block's average color, using ryg's optimal single color compression tables.
128     bool dxt1_endpoint_optimizer::try_average_block_as_solid()
129     {
130         uint64_t tot_r = 0;
131         uint64_t tot_g = 0;
132         uint64_t tot_b = 0;
133
134         uint total_weight = 0;
135         for (uint i = 0; i < m_unique_colors.size(); i++)
136         {
137             uint weight = m_unique_colors[i].m_weight;
138             total_weight += weight;
139
140             tot_r += m_unique_colors[i].m_color.r * weight;
141             tot_g += m_unique_colors[i].m_color.g * weight;
142             tot_b += m_unique_colors[i].m_color.b * weight;
143         }
144
145         const uint half_total_weight = total_weight >> 1;
146         uint ave_r = static_cast<uint>((tot_r + half_total_weight) / total_weight);
147         uint ave_g = static_cast<uint>((tot_g + half_total_weight) / total_weight);
148         uint ave_b = static_cast<uint>((tot_b + half_total_weight) / total_weight);
149
150         uint low_color = (ryg_dxt::OMatch5[ave_r][0] << 11) | (ryg_dxt::OMatch6[ave_g][0] << 5) | ryg_dxt::OMatch5[ave_b][0];
151         uint high_color = (ryg_dxt::OMatch5[ave_r][1] << 11) | (ryg_dxt::OMatch6[ave_g][1] << 5) | ryg_dxt::OMatch5[ave_b][1];
152         bool improved = evaluate_solution(dxt1_solution_coordinates((uint16)low_color, (uint16)high_color), true, &m_best_solution);
153
154         if ((m_pParams->m_use_alpha_blocks) && (m_best_solution.m_error))
155         {
156             low_color = (ryg_dxt::OMatch5_3[ave_r][0] << 11) | (ryg_dxt::OMatch6_3[ave_g][0] << 5) | ryg_dxt::OMatch5_3[ave_b][0];
157             high_color = (ryg_dxt::OMatch5_3[ave_r][1] << 11) | (ryg_dxt::OMatch6_3[ave_g][1] << 5) | ryg_dxt::OMatch5_3[ave_b][1];
158             improved |= evaluate_solution(dxt1_solution_coordinates((uint16)low_color, (uint16)high_color), true, &m_best_solution);
159         }
160
161         if (m_pParams->m_quality == cCRNDXTQualityUber)
162         {
163             // Try compressing as all-solid using the other (non-average) colors in the block in uber.
164             for (uint i = 0; i < m_unique_colors.size(); i++)
165             {
166                 uint r = m_unique_colors[i].m_color[0];
167                 uint g = m_unique_colors[i].m_color[1];
168                 uint b = m_unique_colors[i].m_color[2];
169                 if ((r == ave_r) && (g == ave_g) && (b == ave_b))
170                     continue;
171
172                 uint low_color = (ryg_dxt::OMatch5[r][0] << 11) | (ryg_dxt::OMatch6[g][0] << 5) | ryg_dxt::OMatch5[b][0];
173                 uint high_color = (ryg_dxt::OMatch5[r][1] << 11) | (ryg_dxt::OMatch6[g][1] << 5) | ryg_dxt::OMatch5[b][1];
174                 improved |= evaluate_solution(dxt1_solution_coordinates((uint16)low_color, (uint16)high_color), true, &m_best_solution);
175
176                 if ((m_pParams->m_use_alpha_blocks) && (m_best_solution.m_error))
177                 {
178                     low_color = (ryg_dxt::OMatch5_3[r][0] << 11) | (ryg_dxt::OMatch6_3[g][0] << 5) | ryg_dxt::OMatch5_3[b][0];
179                     high_color = (ryg_dxt::OMatch5_3[r][1] << 11) | (ryg_dxt::OMatch6_3[g][1] << 5) | ryg_dxt::OMatch5_3[b][1];
180                     improved |= evaluate_solution(dxt1_solution_coordinates((uint16)low_color, (uint16)high_color), true, &m_best_solution);
181                 }
182             }
183         }
184
185         return improved;
186     }
187
188     // Block is solid, trying using ryg's optimal single color tables.
189     bool dxt1_endpoint_optimizer::handle_solid_block()
190     {
191         int r = m_unique_colors[0].m_color.r;
192         int g = m_unique_colors[0].m_color.g;
193         int b = m_unique_colors[0].m_color.b;
194
195         //uint packed_color = dxt1_block::pack_color(r, g, b, true);
196         //evaluate_solution(dxt1_solution_coordinates((uint16)packed_color, (uint16)packed_color), false, &m_best_solution);
197
198         uint low_color = (ryg_dxt::OMatch5[r][0] << 11) | (ryg_dxt::OMatch6[g][0] << 5) | ryg_dxt::OMatch5[b][0];
199         uint high_color = (ryg_dxt::OMatch5[r][1] << 11) | (ryg_dxt::OMatch6[g][1] << 5) | ryg_dxt::OMatch5[b][1];
200         evaluate_solution(dxt1_solution_coordinates((uint16)low_color, (uint16)high_color), false, &m_best_solution);
201
202         if ((m_pParams->m_use_alpha_blocks) && (m_best_solution.m_error))
203         {
204             low_color = (ryg_dxt::OMatch5_3[r][0] << 11) | (ryg_dxt::OMatch6_3[g][0] << 5) | ryg_dxt::OMatch5_3[b][0];
205             high_color = (ryg_dxt::OMatch5_3[r][1] << 11) | (ryg_dxt::OMatch6_3[g][1] << 5) | ryg_dxt::OMatch5_3[b][1];
206             evaluate_solution(dxt1_solution_coordinates((uint16)low_color, (uint16)high_color), true, &m_best_solution);
207         }
208
209         return_solution(*m_pResults, m_best_solution);
210
211         return true;
212     }
213
214     void dxt1_endpoint_optimizer::compute_vectors(const vec3F &perceptual_weights)
215     {
216         m_norm_unique_colors.resize(0);
217         m_norm_unique_colors_weighted.resize(0);
218
219         m_mean_norm_color.clear();
220         m_mean_norm_color_weighted.clear();
221
222         for (uint i = 0; i < m_unique_colors.size(); i++)
223         {
224             const color_quad_u8 &color = m_unique_colors[i].m_color;
225             const uint weight = m_unique_colors[i].m_weight;
226
227             vec3F norm_color(color.r * 1.0f / 255.0f, color.g * 1.0f / 255.0f, color.b * 1.0f / 255.0f);
228             vec3F norm_color_weighted(vec3F::mul_components(perceptual_weights, norm_color));
229
230             m_norm_unique_colors.push_back(norm_color);
231             m_norm_unique_colors_weighted.push_back(norm_color_weighted);
232
233             m_mean_norm_color += norm_color * (float)weight;
234             m_mean_norm_color_weighted += norm_color_weighted * (float)weight;
235         }
236
237         if (m_total_unique_color_weight)
238         {
239             m_mean_norm_color *= (1.0f / m_total_unique_color_weight);
240             m_mean_norm_color_weighted *= (1.0f / m_total_unique_color_weight);
241         }
242
243         for (uint i = 0; i < m_unique_colors.size(); i++)
244         {
245             m_norm_unique_colors[i] -= m_mean_norm_color;
246             m_norm_unique_colors_weighted[i] -= m_mean_norm_color_weighted;
247         }
248     }
249
250     // Compute PCA (principle axis, i.e. direction of largest variance) of input vectors.
251     void dxt1_endpoint_optimizer::compute_pca(vec3F &axis, const vec3F_array &norm_colors, const vec3F &def)
252     {
253 #if 0
254         axis.clear();
255
256         VOGL_ASSERT(m_unique_colors.size() == norm_colors.size());
257
258         // Incremental PCA
259         bool first = true;
260         for (uint i = 0; i < norm_colors.size(); i++)
261         {
262                 const uint weight = m_unique_colors[i].m_weight;
263
264                 for (uint j = 0; j < weight; j++)
265                 {
266                         vec3F x(norm_colors[i] * norm_colors[i][0]);
267                         vec3F y(norm_colors[i] * norm_colors[i][1]);
268                         vec3F z(norm_colors[i] * norm_colors[i][2]);
269
270                         vec3F v(first ? norm_colors[0] : axis);
271                         first = false;
272
273                         v.normalize(&def);
274
275                         axis[0] += (x * v);
276                         axis[1] += (y * v);
277                         axis[2] += (z * v);
278                 }
279         }
280
281         axis.normalize(&def);
282 #else
283         double cov[6] = { 0, 0, 0, 0, 0, 0 };
284
285         //vec3F lo(math::cNearlyInfinite);
286         //vec3F hi(-math::cNearlyInfinite);
287
288         for (uint i = 0; i < norm_colors.size(); i++)
289         {
290             const vec3F &v = norm_colors[i];
291
292             //if (v[0] < lo[0]) lo[0] = v[0];
293             //if (v[1] < lo[1]) lo[1] = v[1];
294             //if (v[2] < lo[2]) lo[2] = v[2];
295             //if (v[0] > hi[0]) hi[0] = v[0];
296             //if (v[1] > hi[1]) hi[1] = v[1];
297             //if (v[2] > hi[2]) hi[2] = v[2];
298
299             float r = v[0];
300             float g = v[1];
301             float b = v[2];
302
303             if (m_unique_colors[i].m_weight > 1)
304             {
305                 const double weight = m_unique_colors[i].m_weight;
306
307                 cov[0] += r * r * weight;
308                 cov[1] += r * g * weight;
309                 cov[2] += r * b * weight;
310                 cov[3] += g * g * weight;
311                 cov[4] += g * b * weight;
312                 cov[5] += b * b * weight;
313             }
314             else
315             {
316                 cov[0] += r * r;
317                 cov[1] += r * g;
318                 cov[2] += r * b;
319                 cov[3] += g * g;
320                 cov[4] += g * b;
321                 cov[5] += b * b;
322             }
323         }
324
325         double vfr, vfg, vfb;
326         //vfr = hi[0] - lo[0];
327         //vfg = hi[1] - lo[1];
328         //vfb = hi[2] - lo[2];
329         // This is more stable.
330         vfr = .9f;
331         vfg = 1.0f;
332         vfb = .7f;
333
334         const uint cNumIters = 8;
335
336         for (uint iter = 0; iter < cNumIters; iter++)
337         {
338             double r = vfr * cov[0] + vfg * cov[1] + vfb * cov[2];
339             double g = vfr * cov[1] + vfg * cov[3] + vfb * cov[4];
340             double b = vfr * cov[2] + vfg * cov[4] + vfb * cov[5];
341
342             double m = math::maximum(fabs(r), fabs(g), fabs(b));
343             if (m > 1e-10)
344             {
345                 m = 1.0f / m;
346                 r *= m;
347                 g *= m;
348                 b *= m;
349             }
350
351             double delta = math::square(vfr - r) + math::square(vfg - g) + math::square(vfb - b);
352
353             vfr = r;
354             vfg = g;
355             vfb = b;
356
357             if ((iter > 2) && (delta < 1e-8))
358                 break;
359         }
360
361         double len = vfr * vfr + vfg * vfg + vfb * vfb;
362
363         if (len < 1e-10)
364         {
365             axis = def;
366         }
367         else
368         {
369             len = 1.0f / sqrt(len);
370             vfr *= len;
371             vfg *= len;
372             vfb *= len;
373
374             axis.set(static_cast<float>(vfr), static_cast<float>(vfg), static_cast<float>(vfb));
375         }
376 #endif
377     }
378
379     static const uint8 g_invTableNull[4] = { 0, 1, 2, 3 };
380     static const uint8 g_invTableAlpha[4] = { 1, 0, 2, 3 };
381     static const uint8 g_invTableColor[4] = { 1, 0, 3, 2 };
382
383     // Computes a valid (encodable) DXT1 solution (low/high colors, swizzled selectors) from input.
384     void dxt1_endpoint_optimizer::return_solution(results &res, const potential_solution &solution)
385     {
386         bool invert_selectors;
387
388         if (solution.m_alpha_block)
389             invert_selectors = (solution.m_coords.m_low_color > solution.m_coords.m_high_color);
390         else
391         {
392             VOGL_ASSERT(solution.m_coords.m_low_color != solution.m_coords.m_high_color);
393
394             invert_selectors = (solution.m_coords.m_low_color < solution.m_coords.m_high_color);
395         }
396
397         if (invert_selectors)
398         {
399             res.m_low_color = solution.m_coords.m_high_color;
400             res.m_high_color = solution.m_coords.m_low_color;
401         }
402         else
403         {
404             res.m_low_color = solution.m_coords.m_low_color;
405             res.m_high_color = solution.m_coords.m_high_color;
406         }
407
408         const uint8 *pInvert_table = g_invTableNull;
409         if (invert_selectors)
410             pInvert_table = solution.m_alpha_block ? g_invTableAlpha : g_invTableColor;
411
412         const uint alpha_thresh = m_pParams->m_pixels_have_alpha ? (m_pParams->m_dxt1a_alpha_threshold << 24U) : 0;
413
414         const uint32 *pSrc_pixels = reinterpret_cast<const uint32 *>(m_pParams->m_pPixels);
415         uint8 *pDst_selectors = res.m_pSelectors;
416
417         if ((m_unique_colors.size() == 1) && (!m_pParams->m_pixels_have_alpha))
418         {
419             uint32 c = utils::read_le32(pSrc_pixels);
420
421             VOGL_ASSERT(c >= alpha_thresh);
422
423             c |= 0xFF000000U;
424
425             unique_color_hash_map::const_iterator it(m_unique_color_hash_map.find(c));
426             VOGL_ASSERT(it != m_unique_color_hash_map.end());
427
428             uint unique_color_index = it->second;
429
430             uint selector = pInvert_table[solution.m_selectors[unique_color_index]];
431
432             memset(pDst_selectors, selector, m_pParams->m_num_pixels);
433         }
434         else
435         {
436             uint8 *pDst_selectors_end = pDst_selectors + m_pParams->m_num_pixels;
437
438             uint8 prev_selector = 0;
439             uint32 prev_color = 0;
440
441             do
442             {
443                 uint32 c = utils::read_le32(pSrc_pixels);
444                 pSrc_pixels++;
445
446                 uint8 selector = 3;
447
448                 if (c >= alpha_thresh)
449                 {
450                     c |= 0xFF000000U;
451
452                     if (c == prev_color)
453                         selector = prev_selector;
454                     else
455                     {
456                         unique_color_hash_map::const_iterator it(m_unique_color_hash_map.find(c));
457
458                         VOGL_ASSERT(it != m_unique_color_hash_map.end());
459
460                         uint unique_color_index = it->second;
461
462                         selector = pInvert_table[solution.m_selectors[unique_color_index]];
463
464                         prev_color = c;
465                         prev_selector = selector;
466                     }
467                 }
468
469                 *pDst_selectors++ = selector;
470
471             } while (pDst_selectors != pDst_selectors_end);
472         }
473
474         res.m_alpha_block = solution.m_alpha_block;
475         res.m_error = solution.m_error;
476     }
477
478     inline vec3F dxt1_endpoint_optimizer::unpack_to_vec3F(uint16 packed_color)
479     {
480         color_quad_u8 c(dxt1_block::unpack_color(packed_color, false));
481
482         return vec3F(c.r * 1.0f / 31.0f, c.g * 1.0f / 63.0f, c.b * 1.0f / 31.0f);
483     }
484
485     inline vec3F dxt1_endpoint_optimizer::unpack_to_vec3F_raw(uint16 packed_color)
486     {
487         color_quad_u8 c(dxt1_block::unpack_color(packed_color, false));
488
489         return vec3F(c.r, c.g, c.b);
490     }
491
492     // Per-component 1D endpoint optimization.
493     void dxt1_endpoint_optimizer::optimize_endpoint_comps()
494     {
495         if ((m_best_solution.m_alpha_block) || (!m_best_solution.m_error))
496             return;
497
498         //color_quad_u8 orig_l(dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, false));
499         //color_quad_u8 orig_h(dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, false));
500         //uint orig_error = m_best_solution.m_error;
501
502         color_quad_u8 orig_l_scaled(dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, true));
503         color_quad_u8 orig_h_scaled(dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, true));
504
505         color_quad_u8 min_color(0xFF, 0xFF, 0xFF, 0xFF);
506         color_quad_u8 max_color(0, 0, 0, 0);
507         for (uint i = 0; i < m_unique_colors.size(); i++)
508         {
509             min_color = color_quad_u8::component_min(min_color, m_unique_colors[i].m_color);
510             max_color = color_quad_u8::component_max(max_color, m_unique_colors[i].m_color);
511         }
512
513         // Try to separately optimize each component. This is a 1D problem so it's easy to compute accurate per-component error bounds.
514         for (uint comp_index = 0; comp_index < 3; comp_index++)
515         {
516             uint ll[4];
517             ll[0] = orig_l_scaled[comp_index];
518             ll[1] = orig_h_scaled[comp_index];
519             ll[2] = (ll[0] * 2 + ll[1]) / 3;
520             ll[3] = (ll[0] + ll[1] * 2) / 3;
521
522             uint error_to_beat = 0;
523             uint min_color_weight = 0;
524             uint max_color_weight = 0;
525             for (uint i = 0; i < m_unique_colors.size(); i++)
526             {
527                 uint c = m_unique_colors[i].m_color[comp_index];
528                 uint w = m_unique_colors[i].m_weight;
529
530                 int delta = ll[m_best_solution.m_selectors[i]] - c;
531                 error_to_beat += (int)w * (delta * delta);
532
533                 if (c == min_color[comp_index])
534                     min_color_weight += w;
535                 if (c == max_color[comp_index])
536                     max_color_weight += w;
537             }
538
539             if (!error_to_beat)
540                 continue;
541
542             VOGL_ASSERT((min_color_weight > 0) && (max_color_weight > 0));
543             const uint error_to_beat_div_min_color_weight = min_color_weight ? ((error_to_beat + min_color_weight - 1) / min_color_weight) : 0;
544             const uint error_to_beat_div_max_color_weight = max_color_weight ? ((error_to_beat + max_color_weight - 1) / max_color_weight) : 0;
545
546             const uint m = (comp_index == 1) ? 63 : 31;
547             const uint m_shift = (comp_index == 1) ? 3 : 2;
548
549             for (uint o = 0; o <= m; o++)
550             {
551                 uint tl[4];
552
553                 tl[0] = (comp_index == 1) ? ((o << 2) | (o >> 4)) : ((o << 3) | (o >> 2));
554
555                 for (uint h = 0; h < 8; h++)
556                 {
557                     const uint pl = h << m_shift;
558                     const uint ph = ((h + 1) << m_shift) - 1;
559
560                     uint tl_l = (comp_index == 1) ? ((pl << 2) | (pl >> 4)) : ((pl << 3) | (pl >> 2));
561                     uint tl_h = (comp_index == 1) ? ((ph << 2) | (ph >> 4)) : ((ph << 3) | (ph >> 2));
562
563                     tl_l = math::minimum(tl_l, tl[0]);
564                     tl_h = math::maximum(tl_h, tl[0]);
565
566                     uint c_l = min_color[comp_index];
567                     uint c_h = max_color[comp_index];
568
569                     if (c_h < tl_l)
570                     {
571                         uint min_possible_error = math::square<int>(tl_l - c_l);
572                         if (min_possible_error > error_to_beat_div_min_color_weight)
573                             continue;
574                     }
575                     else if (c_l > tl_h)
576                     {
577                         uint min_possible_error = math::square<int>(c_h - tl_h);
578                         if (min_possible_error > error_to_beat_div_max_color_weight)
579                             continue;
580                     }
581
582                     for (uint p = pl; p <= ph; p++)
583                     {
584                         tl[1] = (comp_index == 1) ? ((p << 2) | (p >> 4)) : ((p << 3) | (p >> 2));
585
586                         tl[2] = (tl[0] * 2 + tl[1]) / 3;
587                         tl[3] = (tl[0] + tl[1] * 2) / 3;
588
589                         uint trial_error = 0;
590                         for (uint i = 0; i < m_unique_colors.size(); i++)
591                         {
592                             int delta = tl[m_best_solution.m_selectors[i]] - m_unique_colors[i].m_color[comp_index];
593                             trial_error += m_unique_colors[i].m_weight * (delta * delta);
594                             if (trial_error >= error_to_beat)
595                                 break;
596                         }
597
598                         //VOGL_ASSERT(trial_error >= min_possible_error);
599
600                         if (trial_error < error_to_beat)
601                         {
602                             color_quad_u8 l(dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, false));
603                             color_quad_u8 h(dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, false));
604                             l[comp_index] = static_cast<uint8>(o);
605                             h[comp_index] = static_cast<uint8>(p);
606
607                             bool better = evaluate_solution(
608                                 dxt1_solution_coordinates(dxt1_block::pack_color(l, false), dxt1_block::pack_color(h, false)),
609                                 true, &m_best_solution);
610                             VOGL_NOTE_UNUSED(better);
611
612                             if (better)
613                             {
614 #if 0
615                                                         printf("comp: %u, orig: %u %u, new: %u %u, orig_error: %u, new_error: %u\n", comp_index,
616                                                                orig_l[comp_index], orig_h[comp_index],
617                                                                l[comp_index], h[comp_index],
618                                                                orig_error, m_best_solution.m_error);
619 #endif
620                                 if (!m_best_solution.m_error)
621                                     return;
622
623                                 error_to_beat = 0;
624                                 for (uint i = 0; i < m_unique_colors.size(); i++)
625                                 {
626                                     int delta = tl[m_best_solution.m_selectors[i]] - m_unique_colors[i].m_color[comp_index];
627                                     error_to_beat += m_unique_colors[i].m_weight * (delta * delta);
628                                 }
629
630                             } // better
631
632                             //goto early_out;
633                         } // if (trial_error < error_to_beat)
634
635                     } // for (uint p = 0; p <= m; p++)
636                 }
637
638             } // for (uint o = 0; o <= m; o++)
639
640         } // comp_index
641     }
642
643     // Voxel adjacency delta coordinations.
644     static const struct adjacent_coords
645     {
646         int8 x, y, z;
647     } g_adjacency[26] =
648           {
649               { -1, -1, -1 },
650               { 0, -1, -1 },
651               { 1, -1, -1 },
652               { -1, 0, -1 },
653               { 0, 0, -1 },
654               { 1, 0, -1 },
655               { -1, 1, -1 },
656               { 0, 1, -1 },
657               { 1, 1, -1 },
658               { -1, -1, 0 },
659               { 0, -1, 0 },
660               { 1, -1, 0 },
661               { -1, 0, 0 },
662               { 1, 0, 0 },
663               { -1, 1, 0 },
664               { 0, 1, 0 },
665               { 1, 1, 0 },
666               { -1, -1, 1 },
667               { 0, -1, 1 },
668               { 1, -1, 1 },
669               { -1, 0, 1 },
670               { 0, 0, 1 },
671               { 1, 0, 1 },
672               { -1, 1, 1 },
673               { 0, 1, 1 },
674               { 1, 1, 1 }
675           };
676
677     // Attempt to refine current solution's endpoints given the current selectors using least squares.
678     bool dxt1_endpoint_optimizer::refine_solution(int refinement_level)
679     {
680         VOGL_ASSERT(m_best_solution.m_valid);
681
682         static const int w1Tab[4] = { 3, 0, 2, 1 };
683
684         static const int prods_0[4] = { 0x00, 0x00, 0x02, 0x02 };
685         static const int prods_1[4] = { 0x00, 0x09, 0x01, 0x04 };
686         static const int prods_2[4] = { 0x09, 0x00, 0x04, 0x01 };
687
688         double akku_0 = 0;
689         double akku_1 = 0;
690         double akku_2 = 0;
691         double At1_r, At1_g, At1_b;
692         double At2_r, At2_g, At2_b;
693
694         At1_r = At1_g = At1_b = 0;
695         At2_r = At2_g = At2_b = 0;
696         for (uint i = 0; i < m_unique_colors.size(); i++)
697         {
698             const color_quad_u8 &c = m_unique_colors[i].m_color;
699             const double weight = m_unique_colors[i].m_weight;
700
701             double r = c.r * weight;
702             double g = c.g * weight;
703             double b = c.b * weight;
704             int step = m_best_solution.m_selectors[i] ^ 1;
705
706             int w1 = w1Tab[step];
707
708             akku_0 += prods_0[step] * weight;
709             akku_1 += prods_1[step] * weight;
710             akku_2 += prods_2[step] * weight;
711             At1_r += w1 * r;
712             At1_g += w1 * g;
713             At1_b += w1 * b;
714             At2_r += r;
715             At2_g += g;
716             At2_b += b;
717         }
718
719         At2_r = 3 * At2_r - At1_r;
720         At2_g = 3 * At2_g - At1_g;
721         At2_b = 3 * At2_b - At1_b;
722
723         double xx = akku_2;
724         double yy = akku_1;
725         double xy = akku_0;
726
727         double t = xx * yy - xy * xy;
728         if (!yy || !xx || (fabs(t) < .0000125f))
729             return false;
730
731         double frb = (3.0f * 31.0f / 255.0f) / t;
732         double fg = frb * (63.0f / 31.0f);
733
734         bool improved = false;
735
736         if (refinement_level == 0)
737         {
738             uint max16;
739             max16 = math::clamp<int>(static_cast<int>((At1_r * yy - At2_r * xy) * frb + 0.5f), 0, 31) << 11;
740             max16 |= math::clamp<int>(static_cast<int>((At1_g * yy - At2_g * xy) * fg + 0.5f), 0, 63) << 5;
741             max16 |= math::clamp<int>(static_cast<int>((At1_b * yy - At2_b * xy) * frb + 0.5f), 0, 31) << 0;
742
743             uint min16;
744             min16 = math::clamp<int>(static_cast<int>((At2_r * xx - At1_r * xy) * frb + 0.5f), 0, 31) << 11;
745             min16 |= math::clamp<int>(static_cast<int>((At2_g * xx - At1_g * xy) * fg + 0.5f), 0, 63) << 5;
746             min16 |= math::clamp<int>(static_cast<int>((At2_b * xx - At1_b * xy) * frb + 0.5f), 0, 31) << 0;
747
748             dxt1_solution_coordinates nc((uint16)min16, (uint16)max16);
749             nc.canonicalize();
750             improved |= evaluate_solution(nc, true, &m_best_solution, false);
751         }
752         else if (refinement_level == 1)
753         {
754             // Try exploring the local lattice neighbors of the least squares optimized result.
755             color_quad_u8 e[2];
756
757             e[0].clear();
758             e[0][0] = (uint8)math::clamp<int>(static_cast<int>((At1_r * yy - At2_r * xy) * frb + 0.5f), 0, 31);
759             e[0][1] = (uint8)math::clamp<int>(static_cast<int>((At1_g * yy - At2_g * xy) * fg + 0.5f), 0, 63);
760             e[0][2] = (uint8)math::clamp<int>(static_cast<int>((At1_b * yy - At2_b * xy) * frb + 0.5f), 0, 31);
761
762             e[1].clear();
763             e[1][0] = (uint8)math::clamp<int>(static_cast<int>((At2_r * xx - At1_r * xy) * frb + 0.5f), 0, 31);
764             e[1][1] = (uint8)math::clamp<int>(static_cast<int>((At2_g * xx - At1_g * xy) * fg + 0.5f), 0, 63);
765             e[1][2] = (uint8)math::clamp<int>(static_cast<int>((At2_b * xx - At1_b * xy) * frb + 0.5f), 0, 31);
766
767             for (uint i = 0; i < 2; i++)
768             {
769                 for (int rr = -1; rr <= 1; rr++)
770                 {
771                     for (int gr = -1; gr <= 1; gr++)
772                     {
773                         for (int br = -1; br <= 1; br++)
774                         {
775                             dxt1_solution_coordinates nc;
776
777                             color_quad_u8 c[2];
778                             c[0] = e[0];
779                             c[1] = e[1];
780
781                             c[i][0] = (uint8)math::clamp<int>(c[i][0] + rr, 0, 31);
782                             c[i][1] = (uint8)math::clamp<int>(c[i][1] + gr, 0, 63);
783                             c[i][2] = (uint8)math::clamp<int>(c[i][2] + br, 0, 31);
784
785                             nc.m_low_color = dxt1_block::pack_color(c[0], false);
786                             nc.m_high_color = dxt1_block::pack_color(c[1], false);
787
788                             nc.canonicalize();
789
790                             if ((nc.m_low_color != m_best_solution.m_coords.m_low_color) || (nc.m_high_color != m_best_solution.m_coords.m_high_color))
791                             {
792                                 improved |= evaluate_solution(nc, true, &m_best_solution, false);
793                             }
794                         }
795                     }
796                 }
797             }
798         }
799         else
800         {
801             // Try even harder to explore the local lattice neighbors of the least squares optimized result.
802             color_quad_u8 e[2];
803             e[0].clear();
804             e[0][0] = (uint8)math::clamp<int>(static_cast<int>((At1_r * yy - At2_r * xy) * frb + 0.5f), 0, 31);
805             e[0][1] = (uint8)math::clamp<int>(static_cast<int>((At1_g * yy - At2_g * xy) * fg + 0.5f), 0, 63);
806             e[0][2] = (uint8)math::clamp<int>(static_cast<int>((At1_b * yy - At2_b * xy) * frb + 0.5f), 0, 31);
807
808             e[1].clear();
809             e[1][0] = (uint8)math::clamp<int>(static_cast<int>((At2_r * xx - At1_r * xy) * frb + 0.5f), 0, 31);
810             e[1][1] = (uint8)math::clamp<int>(static_cast<int>((At2_g * xx - At1_g * xy) * fg + 0.5f), 0, 63);
811             e[1][2] = (uint8)math::clamp<int>(static_cast<int>((At2_b * xx - At1_b * xy) * frb + 0.5f), 0, 31);
812
813             for (int orr = -1; orr <= 1; orr++)
814             {
815                 for (int ogr = -1; ogr <= 1; ogr++)
816                 {
817                     for (int obr = -1; obr <= 1; obr++)
818                     {
819                         dxt1_solution_coordinates nc;
820
821                         color_quad_u8 c[2];
822                         c[0] = e[0];
823                         c[1] = e[1];
824
825                         c[0][0] = (uint8)math::clamp<int>(c[0][0] + orr, 0, 31);
826                         c[0][1] = (uint8)math::clamp<int>(c[0][1] + ogr, 0, 63);
827                         c[0][2] = (uint8)math::clamp<int>(c[0][2] + obr, 0, 31);
828
829                         for (int rr = -1; rr <= 1; rr++)
830                         {
831                             for (int gr = -1; gr <= 1; gr++)
832                             {
833                                 for (int br = -1; br <= 1; br++)
834                                 {
835                                     c[1][0] = (uint8)math::clamp<int>(c[1][0] + rr, 0, 31);
836                                     c[1][1] = (uint8)math::clamp<int>(c[1][1] + gr, 0, 63);
837                                     c[1][2] = (uint8)math::clamp<int>(c[1][2] + br, 0, 31);
838
839                                     nc.m_low_color = dxt1_block::pack_color(c[0], false);
840                                     nc.m_high_color = dxt1_block::pack_color(c[1], false);
841                                     nc.canonicalize();
842
843                                     improved |= evaluate_solution(nc, true, &m_best_solution, false);
844                                 }
845                             }
846                         }
847                     }
848                 }
849             }
850         }
851
852         return improved;
853     }
854
855     //-----------------------------------------------------------------------------------------------------------------------------------------
856
857     // Primary endpoint optimization entrypoint.
858     bool dxt1_endpoint_optimizer::optimize_endpoints(vec3F &low_color, vec3F &high_color)
859     {
860         vec3F orig_low_color(low_color);
861         vec3F orig_high_color(high_color);
862
863         m_trial_solution.clear();
864
865         uint num_passes;
866         const int16 *pProbe_table = g_uber_probe_table;
867         uint probe_range;
868         float dist_per_trial = .015625f;
869
870         // How many probes, and the distance between each probe depends on the quality level.
871         switch (m_pParams->m_quality)
872         {
873             case cCRNDXTQualitySuperFast:
874                 pProbe_table = g_fast_probe_table;
875                 probe_range = cFastProbeTableSize;
876                 dist_per_trial = .027063293f;
877                 num_passes = 1;
878                 break;
879             case cCRNDXTQualityFast:
880                 pProbe_table = g_fast_probe_table;
881                 probe_range = cFastProbeTableSize;
882                 dist_per_trial = .027063293f;
883                 num_passes = 2;
884                 break;
885             case cCRNDXTQualityNormal:
886                 pProbe_table = g_normal_probe_table;
887                 probe_range = cNormalProbeTableSize;
888                 dist_per_trial = .027063293f;
889                 num_passes = 2;
890                 break;
891             case cCRNDXTQualityBetter:
892                 pProbe_table = g_better_probe_table;
893                 probe_range = cBetterProbeTableSize;
894                 num_passes = 2;
895                 break;
896             default:
897                 pProbe_table = g_uber_probe_table;
898                 probe_range = cUberProbeTableSize;
899                 num_passes = 4;
900                 break;
901         }
902
903         m_solutions_tried.reset();
904
905         if (m_pParams->m_endpoint_caching)
906         {
907             // Try the previous X winning endpoints. This may not give us optimal results, but it may increase the probability of early outs while evaluating potential solutions.
908             const uint num_prev_results = math::minimum<uint>(cMaxPrevResults, m_num_prev_results);
909             for (uint i = 0; i < num_prev_results; i++)
910             {
911                 const dxt1_solution_coordinates &coords = m_prev_results[i];
912
913                 solution_hash_map::insert_result solution_res(m_solutions_tried.insert(coords.m_low_color | (coords.m_high_color << 16U)));
914                 if (!solution_res.second)
915                     continue;
916
917                 evaluate_solution(coords, true, &m_best_solution);
918             }
919
920             if (!m_best_solution.m_error)
921             {
922                 // Got lucky - one of the previous endpoints is optimal.
923                 return_solution(*m_pResults, m_best_solution);
924                 return true;
925             }
926         }
927
928         if (m_pParams->m_quality >= cCRNDXTQualityBetter)
929         {
930             //evaluate_solution(dxt1_solution_coordinates(low_color, high_color), true, &m_best_solution);
931             //refine_solution();
932
933             try_median4(orig_low_color, orig_high_color);
934         }
935
936         uint probe_low[cUberProbeTableSize * 2 + 1];
937         uint probe_high[cUberProbeTableSize * 2 + 1];
938
939         vec3F scaled_principle_axis[2];
940
941         scaled_principle_axis[1] = m_principle_axis * dist_per_trial;
942         scaled_principle_axis[1][0] *= 31.0f;
943         scaled_principle_axis[1][1] *= 63.0f;
944         scaled_principle_axis[1][2] *= 31.0f;
945
946         scaled_principle_axis[0] = -scaled_principle_axis[1];
947
948         //vec3F initial_ofs(scaled_principle_axis * (float)-probe_range);
949         //initial_ofs[0] += .5f;
950         //initial_ofs[1] += .5f;
951         //initial_ofs[2] += .5f;
952
953         low_color[0] = math::clamp(low_color[0] * 31.0f, 0.0f, 31.0f);
954         low_color[1] = math::clamp(low_color[1] * 63.0f, 0.0f, 63.0f);
955         low_color[2] = math::clamp(low_color[2] * 31.0f, 0.0f, 31.0f);
956
957         high_color[0] = math::clamp(high_color[0] * 31.0f, 0.0f, 31.0f);
958         high_color[1] = math::clamp(high_color[1] * 63.0f, 0.0f, 63.0f);
959         high_color[2] = math::clamp(high_color[2] * 31.0f, 0.0f, 31.0f);
960
961         for (uint pass = 0; pass < num_passes; pass++)
962         {
963             // Now separately sweep or probe the low and high colors along the principle axis, both positively and negatively.
964             // This results in two arrays of candidate low/high endpoints. Every unique combination of candidate endpoints is tried as a potential solution.
965             // In higher quality modes, the various nearby lattice neighbors of each candidate endpoint are also explored, which allows the current solution to "wobble" or "migrate"
966             // to areas with lower error.
967             // This entire process can be repeated up to X times (depending on the quality level) until a local minimum is established.
968             // This method is very stable and scalable. It could be implemented more elegantly, but I'm now very cautious of touching this code.
969             if (pass)
970             {
971                 low_color = unpack_to_vec3F_raw(m_best_solution.m_coords.m_low_color);
972                 high_color = unpack_to_vec3F_raw(m_best_solution.m_coords.m_high_color);
973             }
974
975             const uint64_t prev_best_error = m_best_solution.m_error;
976             if (!prev_best_error)
977                 break;
978
979             // Sweep low endpoint along principle axis, record positions
980             int prev_packed_color[2] = { -1, -1 };
981             uint num_low_trials = 0;
982             vec3F initial_probe_low_color(low_color + vec3F(.5f));
983             for (uint i = 0; i < probe_range; i++)
984             {
985                 const int ls = i ? 0 : 1;
986                 int x = pProbe_table[i];
987
988                 for (int s = ls; s < 2; s++)
989                 {
990                     vec3F probe_low_color(initial_probe_low_color + scaled_principle_axis[s] * (float)x);
991
992                     int r = math::clamp((int)floor(probe_low_color[0]), 0, 31);
993                     int g = math::clamp((int)floor(probe_low_color[1]), 0, 63);
994                     int b = math::clamp((int)floor(probe_low_color[2]), 0, 31);
995
996                     int packed_color = b | (g << 5U) | (r << 11U);
997                     if (packed_color != prev_packed_color[s])
998                     {
999                         probe_low[num_low_trials++] = packed_color;
1000                         prev_packed_color[s] = packed_color;
1001                     }
1002                 }
1003             }
1004
1005             prev_packed_color[0] = -1;
1006             prev_packed_color[1] = -1;
1007
1008             // Sweep high endpoint along principle axis, record positions
1009             uint num_high_trials = 0;
1010             vec3F initial_probe_high_color(high_color + vec3F(.5f));
1011             for (uint i = 0; i < probe_range; i++)
1012             {
1013                 const int ls = i ? 0 : 1;
1014                 int x = pProbe_table[i];
1015
1016                 for (int s = ls; s < 2; s++)
1017                 {
1018                     vec3F probe_high_color(initial_probe_high_color + scaled_principle_axis[s] * (float)x);
1019
1020                     int r = math::clamp((int)floor(probe_high_color[0]), 0, 31);
1021                     int g = math::clamp((int)floor(probe_high_color[1]), 0, 63);
1022                     int b = math::clamp((int)floor(probe_high_color[2]), 0, 31);
1023
1024                     int packed_color = b | (g << 5U) | (r << 11U);
1025                     if (packed_color != prev_packed_color[s])
1026                     {
1027                         probe_high[num_high_trials++] = packed_color;
1028                         prev_packed_color[s] = packed_color;
1029                     }
1030                 }
1031             }
1032
1033             // Now try all unique combinations.
1034             for (uint i = 0; i < num_low_trials; i++)
1035             {
1036                 for (uint j = 0; j < num_high_trials; j++)
1037                 {
1038                     dxt1_solution_coordinates coords((uint16)probe_low[i], (uint16)probe_high[j]);
1039                     coords.canonicalize();
1040
1041                     solution_hash_map::insert_result solution_res(m_solutions_tried.insert(coords.m_low_color | (coords.m_high_color << 16U)));
1042                     if (!solution_res.second)
1043                         continue;
1044
1045                     evaluate_solution(coords, true, &m_best_solution);
1046                 }
1047             }
1048
1049             if (m_pParams->m_quality >= cCRNDXTQualityNormal)
1050             {
1051                 // Generate new candidates by exploring the low color's direct lattice neighbors
1052                 color_quad_u8 lc(dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, false));
1053
1054                 for (int i = 0; i < 26; i++)
1055                 {
1056                     int r = lc.r + g_adjacency[i].x;
1057                     if ((r < 0) || (r > 31))
1058                         continue;
1059
1060                     int g = lc.g + g_adjacency[i].y;
1061                     if ((g < 0) || (g > 63))
1062                         continue;
1063
1064                     int b = lc.b + g_adjacency[i].z;
1065                     if ((b < 0) || (b > 31))
1066                         continue;
1067
1068                     dxt1_solution_coordinates coords(dxt1_block::pack_color(r, g, b, false), m_best_solution.m_coords.m_high_color);
1069                     coords.canonicalize();
1070
1071                     solution_hash_map::insert_result solution_res(m_solutions_tried.insert(coords.m_low_color | (coords.m_high_color << 16U)));
1072                     if (solution_res.second)
1073                         evaluate_solution(coords, true, &m_best_solution);
1074                 }
1075
1076                 if (m_pParams->m_quality == cCRNDXTQualityUber)
1077                 {
1078                     // Generate new candidates by exploring the low color's direct lattice neighbors - this time, explore much further separately on each axis.
1079                     lc = dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, false);
1080
1081                     for (int a = 0; a < 3; a++)
1082                     {
1083                         int limit = (a == 1) ? 63 : 31;
1084
1085                         for (int s = -2; s <= 2; s += 4)
1086                         {
1087                             color_quad_u8 c(lc);
1088                             int q = c[a] + s;
1089                             if ((q < 0) || (q > limit))
1090                                 continue;
1091
1092                             c[a] = (uint8)q;
1093
1094                             dxt1_solution_coordinates coords(dxt1_block::pack_color(c, false), m_best_solution.m_coords.m_high_color);
1095                             coords.canonicalize();
1096
1097                             solution_hash_map::insert_result solution_res(m_solutions_tried.insert(coords.m_low_color | (coords.m_high_color << 16U)));
1098                             if (solution_res.second)
1099                                 evaluate_solution(coords, true, &m_best_solution);
1100                         }
1101                     }
1102                 }
1103
1104                 // Generate new candidates by exploring the high color's direct lattice neighbors
1105                 color_quad_u8 hc(dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, false));
1106
1107                 for (int i = 0; i < 26; i++)
1108                 {
1109                     int r = hc.r + g_adjacency[i].x;
1110                     if ((r < 0) || (r > 31))
1111                         continue;
1112
1113                     int g = hc.g + g_adjacency[i].y;
1114                     if ((g < 0) || (g > 63))
1115                         continue;
1116
1117                     int b = hc.b + g_adjacency[i].z;
1118                     if ((b < 0) || (b > 31))
1119                         continue;
1120
1121                     dxt1_solution_coordinates coords(m_best_solution.m_coords.m_low_color, dxt1_block::pack_color(r, g, b, false));
1122                     coords.canonicalize();
1123
1124                     solution_hash_map::insert_result solution_res(m_solutions_tried.insert(coords.m_low_color | (coords.m_high_color << 16U)));
1125                     if (solution_res.second)
1126                         evaluate_solution(coords, true, &m_best_solution);
1127                 }
1128
1129                 if (m_pParams->m_quality == cCRNDXTQualityUber)
1130                 {
1131                     // Generate new candidates by exploring the high color's direct lattice neighbors - this time, explore much further separately on each axis.
1132                     hc = dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, false);
1133
1134                     for (int a = 0; a < 3; a++)
1135                     {
1136                         int limit = (a == 1) ? 63 : 31;
1137
1138                         for (int s = -2; s <= 2; s += 4)
1139                         {
1140                             color_quad_u8 c(hc);
1141                             int q = c[a] + s;
1142                             if ((q < 0) || (q > limit))
1143                                 continue;
1144
1145                             c[a] = (uint8)q;
1146
1147                             dxt1_solution_coordinates coords(m_best_solution.m_coords.m_low_color, dxt1_block::pack_color(c, false));
1148                             coords.canonicalize();
1149
1150                             solution_hash_map::insert_result solution_res(m_solutions_tried.insert(coords.m_low_color | (coords.m_high_color << 16U)));
1151                             if (solution_res.second)
1152                                 evaluate_solution(coords, true, &m_best_solution);
1153                         }
1154                     }
1155                 }
1156             }
1157
1158             if ((!m_best_solution.m_error) || ((pass) && (m_best_solution.m_error == prev_best_error)))
1159                 break;
1160
1161             if (m_pParams->m_quality >= cCRNDXTQualityUber)
1162             {
1163                 // Attempt to refine current solution's endpoints given the current selectors using least squares.
1164                 refine_solution(1);
1165             }
1166         }
1167
1168         if (m_pParams->m_quality >= cCRNDXTQualityNormal)
1169         {
1170             if ((m_best_solution.m_error) && (!m_pParams->m_pixels_have_alpha))
1171             {
1172                 bool choose_solid_block = false;
1173                 if (m_best_solution.are_selectors_all_equal())
1174                 {
1175                     // All selectors equal - try various solid-block optimizations
1176                     choose_solid_block = try_average_block_as_solid();
1177                 }
1178
1179                 if ((!choose_solid_block) && (m_pParams->m_quality == cCRNDXTQualityUber))
1180                 {
1181                     // Per-component 1D endpoint optimization.
1182                     optimize_endpoint_comps();
1183                 }
1184             }
1185
1186             if (m_pParams->m_quality == cCRNDXTQualityUber)
1187             {
1188                 if (m_best_solution.m_error)
1189                 {
1190                     // The pixels may have already been DXTc compressed by another compressor.
1191                     // It's usually possible to recover the endpoints used to previously pack the block.
1192                     try_combinatorial_encoding();
1193                 }
1194             }
1195         }
1196
1197         return_solution(*m_pResults, m_best_solution);
1198
1199         if (m_pParams->m_endpoint_caching)
1200         {
1201             // Remember result for later reruse.
1202             m_prev_results[m_num_prev_results & (cMaxPrevResults - 1)] = m_best_solution.m_coords;
1203             m_num_prev_results++;
1204         }
1205
1206         return true;
1207     }
1208
1209     static inline int mul_8bit(int a, int b)
1210     {
1211         int t = a * b + 128;
1212         return (t + (t >> 8)) >> 8;
1213     }
1214
1215     bool dxt1_endpoint_optimizer::handle_multicolor_block()
1216     {
1217         uint num_passes = 1;
1218         vec3F perceptual_weights(1.0f);
1219
1220         if (m_perceptual)
1221         {
1222             // Compute RGB weighting for use in perceptual mode.
1223             // The more saturated the block, the more the weights deviate from (1,1,1).
1224             float ave_redness = 0;
1225             float ave_blueness = 0;
1226             float ave_l = 0;
1227
1228             for (uint i = 0; i < m_unique_colors.size(); i++)
1229             {
1230                 const color_quad_u8 &c = m_unique_colors[i].m_color;
1231                 const float weight = (float)m_unique_colors[i].m_weight;
1232
1233                 int l = mul_8bit(c.r + c.g + c.b, 0x55); // /3
1234                 ave_l += l;
1235                 l = math::maximum(1, l);
1236
1237                 float scale = weight / static_cast<float>(l);
1238
1239                 ave_redness += scale * c.r;
1240                 ave_blueness += scale * c.b;
1241             }
1242
1243             ave_redness /= m_total_unique_color_weight;
1244             ave_blueness /= m_total_unique_color_weight;
1245             ave_l /= m_total_unique_color_weight;
1246
1247             ave_l = math::minimum(1.0f, ave_l * 16.0f / 255.0f);
1248
1249             //float r = ave_l * powf(math::saturate(ave_redness / 3.0f), 5.0f);
1250             //float b = ave_l * powf(math::saturate(ave_blueness / 3.0f), 5.0f);
1251
1252             float p = ave_l * powf(math::saturate(math::maximum(ave_redness, ave_blueness) * 1.0f / 3.0f), 2.75f);
1253
1254             if (p >= 1.0f)
1255                 num_passes = 1;
1256             else
1257             {
1258                 num_passes = 2;
1259                 perceptual_weights = vec3F::lerp(vec3F(.212f, .72f, .072f), perceptual_weights, p);
1260             }
1261         }
1262
1263         for (uint pass_index = 0; pass_index < num_passes; pass_index++)
1264         {
1265             compute_vectors(perceptual_weights);
1266
1267             compute_pca(m_principle_axis, m_norm_unique_colors_weighted, vec3F(.2837149f, 0.9540631f, 0.096277453f));
1268
1269 #if 0
1270                 matrix44F m(matrix44F::make_scale_matrix(perceptual_weights[0], perceptual_weights[1], perceptual_weights[2]));
1271                 matrix44F im(m.get_inverse());
1272                 im.transpose_in_place();
1273                 m_principle_axis = m_principle_axis * im;
1274 #else
1275             // Purposely scale the components of the principle axis by the perceptual weighting.
1276             // There's probably a cleaner way to go about this, but it works (more competitive in perceptual mode against nvdxt.exe or ATI_Compress).
1277             m_principle_axis[0] /= perceptual_weights[0];
1278             m_principle_axis[1] /= perceptual_weights[1];
1279             m_principle_axis[2] /= perceptual_weights[2];
1280 #endif
1281             m_principle_axis.normalize_in_place();
1282
1283             if (num_passes > 1)
1284             {
1285                 // Check for obviously wild principle axes and try to compensate by backing off the component weightings.
1286                 if (fabs(m_principle_axis[0]) >= .795f)
1287                     perceptual_weights.set(.424f, .6f, .072f);
1288                 else if (fabs(m_principle_axis[2]) >= .795f)
1289                     perceptual_weights.set(.212f, .6f, .212f);
1290                 else
1291                     break;
1292             }
1293         }
1294
1295         // Find bounds of projection onto (potentially skewed) principle axis.
1296         float l = 1e+9;
1297         float h = -1e+9;
1298
1299         for (uint i = 0; i < m_norm_unique_colors.size(); i++)
1300         {
1301             float d = m_norm_unique_colors[i].dot(m_principle_axis);
1302             l = math::minimum(l, d);
1303             h = math::maximum(h, d);
1304         }
1305
1306         vec3F low_color(m_mean_norm_color + l * m_principle_axis);
1307         vec3F high_color(m_mean_norm_color + h * m_principle_axis);
1308
1309         if (!low_color.is_within_bounds(0.0f, 1.0f))
1310         {
1311             // Low color is outside the lattice, so bring it back in by casting a ray.
1312             vec3F coord;
1313             float t;
1314             aabb3F bounds(vec3F(0.0f), vec3F(1.0f));
1315             intersection::result res = intersection::ray_aabb(coord, t, ray3F(low_color, m_principle_axis), bounds);
1316             if (res == intersection::cSuccess)
1317                 low_color = coord;
1318         }
1319
1320         if (!high_color.is_within_bounds(0.0f, 1.0f))
1321         {
1322             // High color is outside the lattice, so bring it back in by casting a ray.
1323             vec3F coord;
1324             float t;
1325             aabb3F bounds(vec3F(0.0f), vec3F(1.0f));
1326             intersection::result res = intersection::ray_aabb(coord, t, ray3F(high_color, -m_principle_axis), bounds);
1327             if (res == intersection::cSuccess)
1328                 high_color = coord;
1329         }
1330
1331         // Now optimize the endpoints using the projection bounds on the (potentially skewed) principle axis as a starting point.
1332         if (!optimize_endpoints(low_color, high_color))
1333             return false;
1334
1335         return true;
1336     }
1337
1338     bool dxt1_endpoint_optimizer::handle_grayscale_block()
1339     {
1340         // TODO
1341         return true;
1342     }
1343
1344     // Tries quantizing the block to 4 colors using vanilla LBG. It tries all combinations of the quantized results as potential endpoints.
1345     bool dxt1_endpoint_optimizer::try_median4(const vec3F &low_color, const vec3F &high_color)
1346     {
1347         vec3F means[4];
1348
1349         if (m_unique_colors.size() <= 4)
1350         {
1351             for (uint i = 0; i < 4; i++)
1352                 means[i] = m_norm_unique_colors[math::minimum<int>(m_norm_unique_colors.size() - 1, i)];
1353         }
1354         else
1355         {
1356             means[0] = low_color - m_mean_norm_color;
1357             means[3] = high_color - m_mean_norm_color;
1358             means[1] = vec3F::lerp(means[0], means[3], 1.0f / 3.0f);
1359             means[2] = vec3F::lerp(means[0], means[3], 2.0f / 3.0f);
1360
1361             fast_random rm;
1362
1363             const uint cMaxIters = 8;
1364             uint reassign_rover = 0;
1365             float prev_total_dist = math::cNearlyInfinite;
1366             for (uint iter = 0; iter < cMaxIters; iter++)
1367             {
1368                 vec3F new_means[4];
1369                 float new_weights[4];
1370                 utils::zero_object(new_means);
1371                 utils::zero_object(new_weights);
1372
1373                 float total_dist = 0;
1374
1375                 for (uint i = 0; i < m_unique_colors.size(); i++)
1376                 {
1377                     const vec3F &v = m_norm_unique_colors[i];
1378
1379                     float best_dist = means[0].squared_distance(v);
1380                     int best_index = 0;
1381
1382                     for (uint j = 1; j < 4; j++)
1383                     {
1384                         float dist = means[j].squared_distance(v);
1385                         if (dist < best_dist)
1386                         {
1387                             best_dist = dist;
1388                             best_index = j;
1389                         }
1390                     }
1391
1392                     total_dist += best_dist;
1393
1394                     new_means[best_index] += v * (float)m_unique_colors[i].m_weight;
1395                     new_weights[best_index] += (float)m_unique_colors[i].m_weight;
1396                 }
1397
1398                 uint highest_index = 0;
1399                 float highest_weight = 0;
1400                 bool empty_cell = false;
1401                 for (uint j = 0; j < 4; j++)
1402                 {
1403                     if (new_weights[j] > 0.0f)
1404                     {
1405                         means[j] = new_means[j] / new_weights[j];
1406                         if (new_weights[j] > highest_weight)
1407                         {
1408                             highest_weight = new_weights[j];
1409                             highest_index = j;
1410                         }
1411                     }
1412                     else
1413                         empty_cell = true;
1414                 }
1415
1416                 if (!empty_cell)
1417                 {
1418                     if (fabs(total_dist - prev_total_dist) < .00001f)
1419                         break;
1420
1421                     prev_total_dist = total_dist;
1422                 }
1423                 else
1424                     prev_total_dist = math::cNearlyInfinite;
1425
1426                 if ((empty_cell) && (iter != (cMaxIters - 1)))
1427                 {
1428                     const uint ri = (highest_index + reassign_rover) & 3;
1429                     reassign_rover++;
1430
1431                     for (uint j = 0; j < 4; j++)
1432                     {
1433                         if (new_weights[j] == 0.0f)
1434                         {
1435                             means[j] = means[ri];
1436                             means[j] += vec3F::make_random(rm, -.00196f, .00196f);
1437                         }
1438                     }
1439                 }
1440             }
1441         }
1442
1443         bool improved = false;
1444
1445         for (uint i = 0; i < 3; i++)
1446         {
1447             for (uint j = i + 1; j < 4; j++)
1448             {
1449                 const vec3F v0(means[i] + m_mean_norm_color);
1450                 const vec3F v1(means[j] + m_mean_norm_color);
1451
1452                 dxt1_solution_coordinates sc(
1453                     color_quad_u8((int)floor(.5f + v0[0] * 31.0f), (int)floor(.5f + v0[1] * 63.0f), (int)floor(.5f + v0[2] * 31.0f), 255),
1454                     color_quad_u8((int)floor(.5f + v1[0] * 31.0f), (int)floor(.5f + v1[1] * 63.0f), (int)floor(.5f + v1[2] * 31.0f), 255), false);
1455
1456                 sc.canonicalize();
1457
1458                 improved |= evaluate_solution(sc, true, &m_best_solution, false);
1459             }
1460         }
1461
1462         improved |= refine_solution((m_pParams->m_quality == cCRNDXTQualityUber) ? 1 : 0);
1463
1464         return improved;
1465     }
1466
1467     // Given candidate low/high endpoints, find the optimal selectors for 3 and 4 color blocks, compute the resulting error,
1468     // and use the candidate if it results in less error than the best found result so far.
1469     bool dxt1_endpoint_optimizer::evaluate_solution(
1470         const dxt1_solution_coordinates &coords,
1471         bool early_out,
1472         potential_solution *pBest_solution,
1473         bool alternate_rounding)
1474     {
1475         m_total_evals++;
1476
1477         if ((!m_pSolutions) || (alternate_rounding))
1478         {
1479             if (m_pParams->m_quality >= cCRNDXTQualityBetter)
1480                 return evaluate_solution_uber(m_trial_solution, coords, early_out, pBest_solution, alternate_rounding);
1481             else
1482                 return evaluate_solution_fast(m_trial_solution, coords, early_out, pBest_solution, alternate_rounding);
1483         }
1484
1485         evaluate_solution_uber(m_trial_solution, coords, false, NULL, alternate_rounding);
1486
1487         VOGL_ASSERT(m_trial_solution.m_valid);
1488
1489         // Caller has requested all considered candidate solutions for later analysis.
1490         m_pSolutions->resize(m_pSolutions->size() + 1);
1491         solution &new_solution = m_pSolutions->back();
1492         new_solution.m_selectors.resize(m_pParams->m_num_pixels);
1493         new_solution.m_results.m_pSelectors = &new_solution.m_selectors[0];
1494
1495         return_solution(new_solution.m_results, m_trial_solution);
1496
1497         if ((pBest_solution) && (m_trial_solution.m_error < m_best_solution.m_error))
1498         {
1499             *pBest_solution = m_trial_solution;
1500             return true;
1501         }
1502
1503         return false;
1504     }
1505
1506     inline uint dxt1_endpoint_optimizer::color_distance(bool perceptual, const color_quad_u8 &e1, const color_quad_u8 &e2, bool alpha)
1507     {
1508         if (perceptual)
1509         {
1510             return color::color_distance(true, e1, e2, alpha);
1511         }
1512         else if (m_pParams->m_grayscale_sampling)
1513         {
1514             // Computes error assuming shader will be converting the result to grayscale.
1515             int y0 = color::RGB_to_Y(e1);
1516             int y1 = color::RGB_to_Y(e2);
1517             int yd = y0 - y1;
1518             if (alpha)
1519             {
1520                 int da = (int)e1[3] - (int)e2[3];
1521                 return yd * yd + da * da;
1522             }
1523             else
1524             {
1525                 return yd * yd;
1526             }
1527         }
1528         else if (m_has_color_weighting)
1529         {
1530             // Compute error using user provided color component weights.
1531             int dr = (int)e1[0] - (int)e2[0];
1532             int dg = (int)e1[1] - (int)e2[1];
1533             int db = (int)e1[2] - (int)e2[2];
1534
1535             dr = (dr * dr) * m_pParams->m_color_weights[0];
1536             dg = (dg * dg) * m_pParams->m_color_weights[1];
1537             db = (db * db) * m_pParams->m_color_weights[2];
1538
1539             if (alpha)
1540             {
1541                 int da = (int)e1[3] - (int)e2[3];
1542                 da = (da * da) * (m_pParams->m_color_weights[0] + m_pParams->m_color_weights[1] + m_pParams->m_color_weights[2]);
1543                 return dr + dg + db + da;
1544             }
1545             else
1546             {
1547                 return dr + dg + db;
1548             }
1549         }
1550         else
1551         {
1552             return color::color_distance(false, e1, e2, alpha);
1553         }
1554     }
1555
1556     bool dxt1_endpoint_optimizer::evaluate_solution_uber(
1557         potential_solution &solution,
1558         const dxt1_solution_coordinates &coords,
1559         bool early_out,
1560         potential_solution *pBest_solution,
1561         bool alternate_rounding)
1562     {
1563         solution.m_coords = coords;
1564         solution.m_selectors.resize(m_unique_colors.size());
1565
1566         if ((pBest_solution) && (early_out))
1567             solution.m_error = pBest_solution->m_error;
1568         else
1569             solution.m_error = cUINT64_MAX;
1570
1571         solution.m_alpha_block = false;
1572         solution.m_valid = false;
1573
1574         uint first_block_type = 0;
1575         uint last_block_type = 1;
1576
1577         if ((m_pParams->m_pixels_have_alpha) || (m_pParams->m_force_alpha_blocks))
1578             first_block_type = 1;
1579         else if (!m_pParams->m_use_alpha_blocks)
1580             last_block_type = 0;
1581
1582         m_trial_selectors.resize(m_unique_colors.size());
1583
1584         color_quad_u8 colors[cDXT1SelectorValues];
1585
1586         colors[0] = dxt1_block::unpack_color(coords.m_low_color, true);
1587         colors[1] = dxt1_block::unpack_color(coords.m_high_color, true);
1588
1589         for (uint block_type = first_block_type; block_type <= last_block_type; block_type++)
1590         {
1591             uint64_t trial_error = 0;
1592
1593             if (!block_type)
1594             {
1595                 colors[2].set_noclamp_rgba((colors[0].r * 2 + colors[1].r + alternate_rounding) / 3, (colors[0].g * 2 + colors[1].g + alternate_rounding) / 3, (colors[0].b * 2 + colors[1].b + alternate_rounding) / 3, 0);
1596                 colors[3].set_noclamp_rgba((colors[1].r * 2 + colors[0].r + alternate_rounding) / 3, (colors[1].g * 2 + colors[0].g + alternate_rounding) / 3, (colors[1].b * 2 + colors[0].b + alternate_rounding) / 3, 0);
1597
1598                 if (m_perceptual)
1599                 {
1600                     for (int unique_color_index = (int)m_unique_colors.size() - 1; unique_color_index >= 0; unique_color_index--)
1601                     {
1602                         const color_quad_u8 &c = m_unique_colors[unique_color_index].m_color;
1603
1604                         uint best_error = color_distance(true, c, colors[0], false);
1605                         uint best_color_index = 0;
1606
1607                         uint err = color_distance(true, c, colors[1], false);
1608                         if (err < best_error)
1609                         {
1610                             best_error = err;
1611                             best_color_index = 1;
1612                         }
1613
1614                         err = color_distance(true, c, colors[2], false);
1615                         if (err < best_error)
1616                         {
1617                             best_error = err;
1618                             best_color_index = 2;
1619                         }
1620
1621                         err = color_distance(true, c, colors[3], false);
1622                         if (err < best_error)
1623                         {
1624                             best_error = err;
1625                             best_color_index = 3;
1626                         }
1627
1628                         trial_error += best_error * m_unique_colors[unique_color_index].m_weight;
1629                         if (trial_error >= solution.m_error)
1630                             break;
1631
1632                         m_trial_selectors[unique_color_index] = static_cast<uint8>(best_color_index);
1633                     }
1634                 }
1635                 else
1636                 {
1637                     for (int unique_color_index = (int)m_unique_colors.size() - 1; unique_color_index >= 0; unique_color_index--)
1638                     {
1639                         const color_quad_u8 &c = m_unique_colors[unique_color_index].m_color;
1640
1641                         uint best_error = color_distance(false, c, colors[0], false);
1642                         uint best_color_index = 0;
1643
1644                         uint err = color_distance(false, c, colors[1], false);
1645                         if (err < best_error)
1646                         {
1647                             best_error = err;
1648                             best_color_index = 1;
1649                         }
1650
1651                         err = color_distance(false, c, colors[2], false);
1652                         if (err < best_error)
1653                         {
1654                             best_error = err;
1655                             best_color_index = 2;
1656                         }
1657
1658                         err = color_distance(false, c, colors[3], false);
1659                         if (err < best_error)
1660                         {
1661                             best_error = err;
1662                             best_color_index = 3;
1663                         }
1664
1665                         trial_error += best_error * m_unique_colors[unique_color_index].m_weight;
1666                         if (trial_error >= solution.m_error)
1667                             break;
1668
1669                         m_trial_selectors[unique_color_index] = static_cast<uint8>(best_color_index);
1670                     }
1671                 }
1672             }
1673             else
1674             {
1675                 colors[2].set_noclamp_rgba((colors[0].r + colors[1].r + alternate_rounding) >> 1, (colors[0].g + colors[1].g + alternate_rounding) >> 1, (colors[0].b + colors[1].b + alternate_rounding) >> 1, 255U);
1676
1677                 if (m_perceptual)
1678                 {
1679                     for (int unique_color_index = (int)m_unique_colors.size() - 1; unique_color_index >= 0; unique_color_index--)
1680                     {
1681                         const color_quad_u8 &c = m_unique_colors[unique_color_index].m_color;
1682
1683                         uint best_error = color_distance(true, c, colors[0], false);
1684                         uint best_color_index = 0;
1685
1686                         uint err = color_distance(true, c, colors[1], false);
1687                         if (err < best_error)
1688                         {
1689                             best_error = err;
1690                             best_color_index = 1;
1691                         }
1692
1693                         err = color_distance(true, c, colors[2], false);
1694                         if (err < best_error)
1695                         {
1696                             best_error = err;
1697                             best_color_index = 2;
1698                         }
1699
1700                         trial_error += best_error * m_unique_colors[unique_color_index].m_weight;
1701                         if (trial_error >= solution.m_error)
1702                             break;
1703
1704                         m_trial_selectors[unique_color_index] = static_cast<uint8>(best_color_index);
1705                     }
1706                 }
1707                 else
1708                 {
1709                     for (int unique_color_index = (int)m_unique_colors.size() - 1; unique_color_index >= 0; unique_color_index--)
1710                     {
1711                         const color_quad_u8 &c = m_unique_colors[unique_color_index].m_color;
1712
1713                         uint best_error = color_distance(false, c, colors[0], false);
1714                         uint best_color_index = 0;
1715
1716                         uint err = color_distance(false, c, colors[1], false);
1717                         if (err < best_error)
1718                         {
1719                             best_error = err;
1720                             best_color_index = 1;
1721                         }
1722
1723                         err = color_distance(false, c, colors[2], false);
1724                         if (err < best_error)
1725                         {
1726                             best_error = err;
1727                             best_color_index = 2;
1728                         }
1729
1730                         trial_error += best_error * m_unique_colors[unique_color_index].m_weight;
1731                         if (trial_error >= solution.m_error)
1732                             break;
1733
1734                         m_trial_selectors[unique_color_index] = static_cast<uint8>(best_color_index);
1735                     }
1736                 }
1737             }
1738
1739             if (trial_error < solution.m_error)
1740             {
1741                 solution.m_error = trial_error;
1742                 solution.m_alpha_block = (block_type != 0);
1743                 solution.m_selectors = m_trial_selectors;
1744                 solution.m_valid = true;
1745             }
1746         }
1747
1748         if ((!solution.m_alpha_block) && (solution.m_coords.m_low_color == solution.m_coords.m_high_color))
1749         {
1750             uint s;
1751             if ((solution.m_coords.m_low_color & 31) != 31)
1752             {
1753                 solution.m_coords.m_low_color++;
1754                 s = 1;
1755             }
1756             else
1757             {
1758                 solution.m_coords.m_high_color--;
1759                 s = 0;
1760             }
1761
1762             for (uint i = 0; i < m_unique_colors.size(); i++)
1763                 solution.m_selectors[i] = static_cast<uint8>(s);
1764         }
1765
1766         if ((pBest_solution) && (solution.m_error < pBest_solution->m_error))
1767         {
1768             *pBest_solution = solution;
1769             return true;
1770         }
1771
1772         return false;
1773     }
1774
1775     bool dxt1_endpoint_optimizer::evaluate_solution_fast(
1776         potential_solution &solution,
1777         const dxt1_solution_coordinates &coords,
1778         bool early_out,
1779         potential_solution *pBest_solution,
1780         bool alternate_rounding)
1781     {
1782         solution.m_coords = coords;
1783         solution.m_selectors.resize(m_unique_colors.size());
1784
1785         if ((pBest_solution) && (early_out))
1786             solution.m_error = pBest_solution->m_error;
1787         else
1788             solution.m_error = cUINT64_MAX;
1789
1790         solution.m_alpha_block = false;
1791         solution.m_valid = false;
1792
1793         uint first_block_type = 0;
1794         uint last_block_type = 1;
1795
1796         if ((m_pParams->m_pixels_have_alpha) || (m_pParams->m_force_alpha_blocks))
1797             first_block_type = 1;
1798         else if (!m_pParams->m_use_alpha_blocks)
1799             last_block_type = 0;
1800
1801         m_trial_selectors.resize(m_unique_colors.size());
1802
1803         color_quad_u8 colors[cDXT1SelectorValues];
1804         colors[0] = dxt1_block::unpack_color(coords.m_low_color, true);
1805         colors[1] = dxt1_block::unpack_color(coords.m_high_color, true);
1806
1807         int vr = colors[1].r - colors[0].r;
1808         int vg = colors[1].g - colors[0].g;
1809         int vb = colors[1].b - colors[0].b;
1810         if (m_perceptual)
1811         {
1812             vr *= 8;
1813             vg *= 24;
1814         }
1815
1816         int stops[4];
1817         stops[0] = colors[0].r * vr + colors[0].g * vg + colors[0].b * vb;
1818         stops[1] = colors[1].r * vr + colors[1].g * vg + colors[1].b * vb;
1819
1820         int dirr = vr * 2;
1821         int dirg = vg * 2;
1822         int dirb = vb * 2;
1823
1824         for (uint block_type = first_block_type; block_type <= last_block_type; block_type++)
1825         {
1826             uint64_t trial_error = 0;
1827
1828             if (!block_type)
1829             {
1830                 colors[2].set_noclamp_rgba((colors[0].r * 2 + colors[1].r + alternate_rounding) / 3, (colors[0].g * 2 + colors[1].g + alternate_rounding) / 3, (colors[0].b * 2 + colors[1].b + alternate_rounding) / 3, 255U);
1831                 colors[3].set_noclamp_rgba((colors[1].r * 2 + colors[0].r + alternate_rounding) / 3, (colors[1].g * 2 + colors[0].g + alternate_rounding) / 3, (colors[1].b * 2 + colors[0].b + alternate_rounding) / 3, 255U);
1832
1833                 stops[2] = colors[2].r * vr + colors[2].g * vg + colors[2].b * vb;
1834                 stops[3] = colors[3].r * vr + colors[3].g * vg + colors[3].b * vb;
1835
1836                 // 0 2 3 1
1837                 int c0Point = stops[1] + stops[3];
1838                 int halfPoint = stops[3] + stops[2];
1839                 int c3Point = stops[2] + stops[0];
1840
1841                 for (int unique_color_index = (int)m_unique_colors.size() - 1; unique_color_index >= 0; unique_color_index--)
1842                 {
1843                     const color_quad_u8 &c = m_unique_colors[unique_color_index].m_color;
1844
1845                     int dot = c.r * dirr + c.g * dirg + c.b * dirb;
1846
1847                     uint8 best_color_index;
1848                     if (dot < halfPoint)
1849                         best_color_index = (dot < c3Point) ? 0 : 2;
1850                     else
1851                         best_color_index = (dot < c0Point) ? 3 : 1;
1852
1853                     uint best_error = color_distance(m_perceptual, c, colors[best_color_index], false);
1854
1855                     trial_error += best_error * m_unique_colors[unique_color_index].m_weight;
1856                     if (trial_error >= solution.m_error)
1857                         break;
1858
1859                     m_trial_selectors[unique_color_index] = static_cast<uint8>(best_color_index);
1860                 }
1861             }
1862             else
1863             {
1864                 colors[2].set_noclamp_rgba((colors[0].r + colors[1].r + alternate_rounding) >> 1, (colors[0].g + colors[1].g + alternate_rounding) >> 1, (colors[0].b + colors[1].b + alternate_rounding) >> 1, 255U);
1865
1866                 stops[2] = colors[2].r * vr + colors[2].g * vg + colors[2].b * vb;
1867
1868                 // 0 2 1
1869                 int c02Point = stops[0] + stops[2];
1870                 int c21Point = stops[2] + stops[1];
1871
1872                 for (int unique_color_index = (int)m_unique_colors.size() - 1; unique_color_index >= 0; unique_color_index--)
1873                 {
1874                     const color_quad_u8 &c = m_unique_colors[unique_color_index].m_color;
1875
1876                     int dot = c.r * dirr + c.g * dirg + c.b * dirb;
1877
1878                     uint8 best_color_index;
1879                     if (dot < c02Point)
1880                         best_color_index = 0;
1881                     else if (dot < c21Point)
1882                         best_color_index = 2;
1883                     else
1884                         best_color_index = 1;
1885
1886                     uint best_error = color_distance(m_perceptual, c, colors[best_color_index], false);
1887
1888                     trial_error += best_error * m_unique_colors[unique_color_index].m_weight;
1889                     if (trial_error >= solution.m_error)
1890                         break;
1891
1892                     m_trial_selectors[unique_color_index] = static_cast<uint8>(best_color_index);
1893                 }
1894             }
1895
1896             if (trial_error < solution.m_error)
1897             {
1898                 solution.m_error = trial_error;
1899                 solution.m_alpha_block = (block_type != 0);
1900                 solution.m_selectors = m_trial_selectors;
1901                 solution.m_valid = true;
1902             }
1903         }
1904
1905         if ((!solution.m_alpha_block) && (solution.m_coords.m_low_color == solution.m_coords.m_high_color))
1906         {
1907             uint s;
1908             if ((solution.m_coords.m_low_color & 31) != 31)
1909             {
1910                 solution.m_coords.m_low_color++;
1911                 s = 1;
1912             }
1913             else
1914             {
1915                 solution.m_coords.m_high_color--;
1916                 s = 0;
1917             }
1918
1919             for (uint i = 0; i < m_unique_colors.size(); i++)
1920                 solution.m_selectors[i] = static_cast<uint8>(s);
1921         }
1922
1923         if ((pBest_solution) && (solution.m_error < pBest_solution->m_error))
1924         {
1925             *pBest_solution = solution;
1926             return true;
1927         }
1928
1929         return false;
1930     }
1931
1932     unique_color dxt1_endpoint_optimizer::lerp_color(const color_quad_u8 &a, const color_quad_u8 &b, float f, int rounding)
1933     {
1934         color_quad_u8 res;
1935
1936         float r = rounding ? 1.0f : 0.0f;
1937         res[0] = static_cast<uint8>(math::clamp(math::float_to_int(r + math::lerp<float>(a[0], b[0], f)), 0, 255));
1938         res[1] = static_cast<uint8>(math::clamp(math::float_to_int(r + math::lerp<float>(a[1], b[1], f)), 0, 255));
1939         res[2] = static_cast<uint8>(math::clamp(math::float_to_int(r + math::lerp<float>(a[2], b[2], f)), 0, 255));
1940         res[3] = 255;
1941
1942         return unique_color(res, 1);
1943     }
1944
1945     // The block may have been already compressed using another DXTc compressor, such as squish, ATI_Compress, ryg_dxt, etc.
1946     // Attempt to recover the endpoints used by that block compressor.
1947     void dxt1_endpoint_optimizer::try_combinatorial_encoding()
1948     {
1949         if ((m_unique_colors.size() < 2) || (m_unique_colors.size() > 4))
1950             return;
1951
1952         m_temp_unique_colors = m_unique_colors;
1953
1954         if (m_temp_unique_colors.size() == 2)
1955         {
1956             // a    b    c   d
1957             // 0.0  1/3 2/3  1.0
1958
1959             for (uint k = 0; k < 2; k++)
1960             {
1961                 for (uint q = 0; q < 2; q++)
1962                 {
1963                     const uint r = q ^ 1;
1964
1965                     // a b
1966                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, 2.0f, k));
1967                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, 3.0f, k));
1968
1969                     // a c
1970                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, .5f, k));
1971                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, 1.5f, k));
1972
1973                     // a d
1974
1975                     // b c
1976                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, -1.0f, k));
1977                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, 2.0f, k));
1978
1979                     // b d
1980                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, -.5f, k));
1981                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, .5f, k));
1982
1983                     // c d
1984                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, -2.0f, k));
1985                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, -1.0f, k));
1986                 }
1987             }
1988         }
1989         else if (m_temp_unique_colors.size() == 3)
1990         {
1991             // a    b    c   d
1992             // 0.0  1/3 2/3  1.0
1993
1994             for (uint i = 0; i <= 2; i++)
1995             {
1996                 for (uint j = 0; j <= 2; j++)
1997                 {
1998                     if (i == j)
1999                         continue;
2000
2001                     // a b c
2002                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[i].m_color, m_temp_unique_colors[j].m_color, 1.5f));
2003
2004                     // a b d
2005                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[i].m_color, m_temp_unique_colors[j].m_color, 2.0f / 3.0f));
2006
2007                     // a c d
2008                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[i].m_color, m_temp_unique_colors[j].m_color, 1.0f / 3.0f));
2009
2010                     // b c d
2011                     m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[i].m_color, m_temp_unique_colors[j].m_color, -.5f));
2012                 }
2013             }
2014         }
2015
2016         m_unique_packed_colors.resize(0);
2017
2018         for (uint i = 0; i < m_temp_unique_colors.size(); i++)
2019         {
2020             const color_quad_u8 &unique_color = m_temp_unique_colors[i].m_color;
2021             const uint16 packed_color = dxt1_block::pack_color(unique_color, true);
2022
2023             if (std::find(m_unique_packed_colors.begin(), m_unique_packed_colors.end(), packed_color) != m_unique_packed_colors.end())
2024                 continue;
2025
2026             m_unique_packed_colors.push_back(packed_color);
2027         }
2028
2029         if (m_unique_packed_colors.size() < 2)
2030             return;
2031
2032         for (uint alt_rounding = 0; alt_rounding < 2; alt_rounding++)
2033         {
2034             for (uint i = 0; i < m_unique_packed_colors.size() - 1; i++)
2035             {
2036                 for (uint j = i + 1; j < m_unique_packed_colors.size(); j++)
2037                 {
2038                     evaluate_solution(
2039                         dxt1_solution_coordinates(m_unique_packed_colors[i], m_unique_packed_colors[j]),
2040                         true,
2041                         (alt_rounding == 0) ? &m_best_solution : NULL,
2042                         (alt_rounding != 0));
2043
2044                     if (m_trial_solution.m_error == 0)
2045                     {
2046                         if (alt_rounding)
2047                             m_best_solution = m_trial_solution;
2048
2049                         return;
2050                     }
2051                 }
2052             }
2053         }
2054
2055         return;
2056     }
2057
2058     // The fourth (transparent) color in 3 color "transparent" blocks is black, which can be optionally exploited for small gains in DXT1 mode if the caller
2059     // doesn't actually use alpha. (But not in DXT5 mode, because 3-color blocks aren't permitted by GPU's for DXT5.)
2060     bool dxt1_endpoint_optimizer::try_alpha_as_black_optimization()
2061     {
2062         const params *pOrig_params = m_pParams;
2063         VOGL_NOTE_UNUSED(pOrig_params);
2064         results *pOrig_results = m_pResults;
2065
2066         uint num_dark_colors = 0;
2067
2068         for (uint i = 0; i < m_unique_colors.size(); i++)
2069             if ((m_unique_colors[i].m_color[0] <= 4) && (m_unique_colors[i].m_color[1] <= 4) && (m_unique_colors[i].m_color[2] <= 4))
2070                 num_dark_colors++;
2071
2072         if ((!num_dark_colors) || (num_dark_colors == m_unique_colors.size()))
2073             return true;
2074
2075         params trial_params(*m_pParams);
2076         vogl::vector<color_quad_u8> trial_colors;
2077         trial_colors.insert(0, m_pParams->m_pPixels, m_pParams->m_num_pixels);
2078
2079         trial_params.m_pPixels = trial_colors.get_ptr();
2080         trial_params.m_pixels_have_alpha = true;
2081
2082         for (uint i = 0; i < trial_colors.size(); i++)
2083             if ((trial_colors[i][0] <= 4) && (trial_colors[i][1] <= 4) && (trial_colors[i][2] <= 4))
2084                 trial_colors[i][3] = 0;
2085
2086         results trial_results;
2087
2088         vogl::vector<uint8> trial_selectors(m_pParams->m_num_pixels);
2089         trial_results.m_pSelectors = trial_selectors.get_ptr();
2090
2091         if (!compute_internal(trial_params, trial_results, NULL))
2092             return false;
2093
2094         VOGL_ASSERT(trial_results.m_alpha_block);
2095
2096         color_quad_u8 c[4];
2097         dxt1_block::get_block_colors3(c, trial_results.m_low_color, trial_results.m_high_color);
2098
2099         uint64_t trial_error = 0;
2100
2101         for (uint i = 0; i < trial_colors.size(); i++)
2102         {
2103             if (trial_colors[i][3] == 0)
2104             {
2105                 VOGL_ASSERT(trial_selectors[i] == 3);
2106             }
2107             else
2108             {
2109                 VOGL_ASSERT(trial_selectors[i] != 3);
2110             }
2111
2112             trial_error += color_distance(m_perceptual, trial_colors[i], c[trial_selectors[i]], false);
2113         }
2114
2115         if (trial_error < pOrig_results->m_error)
2116         {
2117             pOrig_results->m_error = trial_error;
2118
2119             pOrig_results->m_low_color = trial_results.m_low_color;
2120             pOrig_results->m_high_color = trial_results.m_high_color;
2121
2122             if (pOrig_results->m_pSelectors)
2123                 memcpy(pOrig_results->m_pSelectors, trial_results.m_pSelectors, m_pParams->m_num_pixels);
2124
2125             pOrig_results->m_alpha_block = true;
2126         }
2127
2128         return true;
2129     }
2130
2131     bool dxt1_endpoint_optimizer::compute_internal(const params &p, results &r, solution_vec *pSolutions)
2132     {
2133         clear();
2134
2135         m_pParams = &p;
2136         m_pResults = &r;
2137         m_pSolutions = pSolutions;
2138
2139         m_has_color_weighting = (m_pParams->m_color_weights[0] != 1) || (m_pParams->m_color_weights[1] != 1) || (m_pParams->m_color_weights[2] != 1);
2140         m_perceptual = m_pParams->m_perceptual && !m_has_color_weighting && !m_pParams->m_grayscale_sampling;
2141
2142         find_unique_colors();
2143
2144         m_best_solution.clear();
2145
2146         if (m_unique_colors.is_empty())
2147             return handle_all_transparent_block();
2148         else if ((m_unique_colors.size() == 1) && (!m_has_transparent_pixels))
2149             return handle_solid_block();
2150         else
2151         {
2152             if (!handle_multicolor_block())
2153                 return false;
2154
2155             if ((m_all_pixels_grayscale) && (m_best_solution.m_error))
2156             {
2157                 if (!handle_grayscale_block())
2158                     return false;
2159             }
2160         }
2161
2162         return true;
2163     }
2164
2165     bool dxt1_endpoint_optimizer::compute(const params &p, results &r, solution_vec *pSolutions)
2166     {
2167         if (!p.m_pPixels)
2168             return false;
2169
2170         bool status = compute_internal(p, r, pSolutions);
2171         if (!status)
2172             return false;
2173
2174         if ((m_pParams->m_use_alpha_blocks) && (m_pParams->m_use_transparent_indices_for_black) && (!m_pParams->m_pixels_have_alpha) && (!pSolutions))
2175         {
2176             if (!try_alpha_as_black_optimization())
2177                 return false;
2178         }
2179
2180         return true;
2181     }
2182
2183     // Build array of unique colors and their weights.
2184     void dxt1_endpoint_optimizer::find_unique_colors()
2185     {
2186         m_has_transparent_pixels = false;
2187
2188         uint num_opaque_pixels = 0;
2189
2190         const uint alpha_thresh = m_pParams->m_pixels_have_alpha ? (m_pParams->m_dxt1a_alpha_threshold << 24U) : 0;
2191
2192         const uint32 *pSrc_pixels = reinterpret_cast<const uint32 *>(m_pParams->m_pPixels);
2193         const uint32 *pSrc_pixels_end = pSrc_pixels + m_pParams->m_num_pixels;
2194
2195         m_unique_colors.resize(m_pParams->m_num_pixels);
2196         uint num_unique_colors = 0;
2197
2198         m_all_pixels_grayscale = true;
2199
2200         do
2201         {
2202             uint32 c = utils::read_le32(pSrc_pixels);
2203             pSrc_pixels++;
2204
2205             if (c < alpha_thresh)
2206             {
2207                 m_has_transparent_pixels = true;
2208                 continue;
2209             }
2210
2211             if (m_all_pixels_grayscale)
2212             {
2213                 uint r = c & 0xFF;
2214                 uint g = (c >> 8) & 0xFF;
2215                 uint b = (c >> 16) & 0xFF;
2216                 if ((r != g) || (r != b))
2217                     m_all_pixels_grayscale = false;
2218             }
2219
2220             c |= 0xFF000000U;
2221
2222             unique_color_hash_map::insert_result ins_result(m_unique_color_hash_map.insert(c, num_unique_colors));
2223
2224             if (ins_result.second)
2225             {
2226                 utils::write_le32(&m_unique_colors[num_unique_colors].m_color.m_u32, c);
2227                 m_unique_colors[num_unique_colors].m_weight = 1;
2228                 num_unique_colors++;
2229             }
2230             else
2231                 m_unique_colors[ins_result.first->second].m_weight++;
2232
2233             num_opaque_pixels++;
2234
2235         } while (pSrc_pixels != pSrc_pixels_end);
2236
2237         m_unique_colors.resize(num_unique_colors);
2238
2239         m_total_unique_color_weight = num_opaque_pixels;
2240     }
2241
2242 } // namespace vogl