From f6a9034a9d6d58ab27b574a0c146a36782762d55 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Jos=C3=A9=20Fonseca?=
Date: Fri, 7 Dec 2012 11:14:11 +0000
Subject: [PATCH] thirdparty/directxtex: Import DirectXTex library.

Will be useful for converting DirectX image formats.
---
 thirdparty/directxtex/DirectXTex/BC.cpp       |  1131 ++
 thirdparty/directxtex/DirectXTex/BC.h         |   897 +
 thirdparty/directxtex/DirectXTex/BC4BC5.cpp   |   534 +
 thirdparty/directxtex/DirectXTex/BC6HBC7.cpp  |  2822 ++++
 thirdparty/directxtex/DirectXTex/DDS.h        |   214 +
 thirdparty/directxtex/DirectXTex/DirectXTex.h |   466 +
 .../directxtex/DirectXTex/DirectXTex.inl      |   223 +
 .../DirectXTex/DirectXTexCompress.cpp         |   697 +
 .../DirectXTex/DirectXTexConvert.cpp          |  2421 +++
 .../directxtex/DirectXTex/DirectXTexD3D11.cpp |   820 +
 .../directxtex/DirectXTex/DirectXTexDDS.cpp   |  1684 ++
 .../DirectXTex/DirectXTexFlipRotate.cpp       |   327 +
 .../directxtex/DirectXTex/DirectXTexImage.cpp |   674 +
 .../DirectXTex/DirectXTexMipmaps.cpp          |  1163 ++
 .../directxtex/DirectXTex/DirectXTexMisc.cpp  |   265 +
 .../DirectXTex/DirectXTexNormalMaps.cpp       |   377 +
 .../directxtex/DirectXTex/DirectXTexP.h       |   197 +
 .../DirectXTex/DirectXTexResize.cpp           |   358 +
 .../directxtex/DirectXTex/DirectXTexTGA.cpp   |  1387 ++
 .../directxtex/DirectXTex/DirectXTexUtil.cpp  |   759 +
 .../directxtex/DirectXTex/DirectXTexWIC.cpp   |   946 ++
 thirdparty/directxtex/DirectXTex/scoped.h     |    70 +
 .../directxtex/Microsoft Public License.rtf   |   234 +
 thirdparty/directxtex/ReadMe.txt              |   192 +
 thirdparty/directxtex/XNAMath/xnamath.h       |  3397 ++++
 .../directxtex/XNAMath/xnamathconvert.inl     |  6383 ++++++++
 .../directxtex/XNAMath/xnamathmatrix.inl      |  3293 ++++
 thirdparty/directxtex/XNAMath/xnamathmisc.inl |  2460 +++
 .../directxtex/XNAMath/xnamathvector.inl      | 13673 ++++++++++++++++
 29 files changed, 48064 insertions(+)
 create mode 100644 thirdparty/directxtex/DirectXTex/BC.cpp
 create mode 100644 thirdparty/directxtex/DirectXTex/BC.h
 create mode 100644 thirdparty/directxtex/DirectXTex/BC4BC5.cpp
 create mode 100644 thirdparty/directxtex/DirectXTex/BC6HBC7.cpp
 create mode 100644 thirdparty/directxtex/DirectXTex/DDS.h
 create mode 100644 thirdparty/directxtex/DirectXTex/DirectXTex.h
 create mode 100644 thirdparty/directxtex/DirectXTex/DirectXTex.inl
 create mode 100644 thirdparty/directxtex/DirectXTex/DirectXTexCompress.cpp
 create mode 100644 thirdparty/directxtex/DirectXTex/DirectXTexConvert.cpp
 create mode 100644 thirdparty/directxtex/DirectXTex/DirectXTexD3D11.cpp
 create mode 100644 thirdparty/directxtex/DirectXTex/DirectXTexDDS.cpp
 create mode 100644 thirdparty/directxtex/DirectXTex/DirectXTexFlipRotate.cpp
 create mode 100644 thirdparty/directxtex/DirectXTex/DirectXTexImage.cpp
 create mode 100644 thirdparty/directxtex/DirectXTex/DirectXTexMipmaps.cpp
 create mode 100644 thirdparty/directxtex/DirectXTex/DirectXTexMisc.cpp
 create mode 100644 thirdparty/directxtex/DirectXTex/DirectXTexNormalMaps.cpp
 create mode 100644 thirdparty/directxtex/DirectXTex/DirectXTexP.h
 create mode 100644 thirdparty/directxtex/DirectXTex/DirectXTexResize.cpp
 create mode 100644 thirdparty/directxtex/DirectXTex/DirectXTexTGA.cpp
 create mode 100644 thirdparty/directxtex/DirectXTex/DirectXTexUtil.cpp
 create mode 100644 thirdparty/directxtex/DirectXTex/DirectXTexWIC.cpp
 create mode 100644 thirdparty/directxtex/DirectXTex/scoped.h
 create mode 100644 thirdparty/directxtex/Microsoft Public License.rtf
 create mode 100644 thirdparty/directxtex/ReadMe.txt
 create mode 100644 thirdparty/directxtex/XNAMath/xnamath.h
 create mode 100644 thirdparty/directxtex/XNAMath/xnamathconvert.inl
create mode 100644 thirdparty/directxtex/XNAMath/xnamathmatrix.inl create mode 100644 thirdparty/directxtex/XNAMath/xnamathmisc.inl create mode 100644 thirdparty/directxtex/XNAMath/xnamathvector.inl diff --git a/thirdparty/directxtex/DirectXTex/BC.cpp b/thirdparty/directxtex/DirectXTex/BC.cpp new file mode 100644 index 0000000..169a68c --- /dev/null +++ b/thirdparty/directxtex/DirectXTex/BC.cpp @@ -0,0 +1,1131 @@ +//------------------------------------------------------------------------------------- +// BC.cpp +// +// Block-compression (BC) functionality for BC1, BC2, BC3 (orginal DXTn formats) +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkId=248926 +//------------------------------------------------------------------------------------- + +#include "directxtexp.h" + +// Experiemental encoding variants, not enabled by default +//#define COLOR_WEIGHTS +//#define COLOR_AVG_0WEIGHTS + +#include "BC.h" + +namespace DirectX +{ + +//------------------------------------------------------------------------------------- +// Constants +//------------------------------------------------------------------------------------- + +// Perceptual weightings for the importance of each channel. +static const HDRColorA g_Luminance (0.2125f / 0.7154f, 1.0f, 0.0721f / 0.7154f, 1.0f); +static const HDRColorA g_LuminanceInv(0.7154f / 0.2125f, 1.0f, 0.7154f / 0.0721f, 1.0f); + +//------------------------------------------------------------------------------------- +// Decode/Encode RGB 5/6/5 colors +//------------------------------------------------------------------------------------- +inline static void Decode565(_Out_ HDRColorA *pColor, _In_ const uint16_t w565) +{ + pColor->r = (float) ((w565 >> 11) & 31) * (1.0f / 31.0f); + pColor->g = (float) ((w565 >> 5) & 63) * (1.0f / 63.0f); + pColor->b = (float) ((w565 >> 0) & 31) * (1.0f / 31.0f); + pColor->a = 1.0f; +} + +inline static uint16_t Encode565(_In_ const HDRColorA *pColor) +{ + HDRColorA Color; + + Color.r = (pColor->r < 0.0f) ? 0.0f : (pColor->r > 1.0f) ? 1.0f : pColor->r; + Color.g = (pColor->g < 0.0f) ? 0.0f : (pColor->g > 1.0f) ? 1.0f : pColor->g; + Color.b = (pColor->b < 0.0f) ? 0.0f : (pColor->b > 1.0f) ? 1.0f : pColor->b; + + uint16_t w; + + w = (uint16_t) ((static_cast(Color.r * 31.0f + 0.5f) << 11) | + (static_cast(Color.g * 63.0f + 0.5f) << 5) | + (static_cast(Color.b * 31.0f + 0.5f) << 0)); + + return w; +} + + +//------------------------------------------------------------------------------------- +static void OptimizeRGB(_Out_ HDRColorA *pX, _Out_ HDRColorA *pY, + _In_count_c_(NUM_PIXELS_PER_BLOCK) const HDRColorA *pPoints, _In_ size_t cSteps, _In_ DWORD flags) +{ + static const float fEpsilon = (0.25f / 64.0f) * (0.25f / 64.0f); + static const float pC3[] = { 2.0f/2.0f, 1.0f/2.0f, 0.0f/2.0f }; + static const float pD3[] = { 0.0f/2.0f, 1.0f/2.0f, 2.0f/2.0f }; + static const float pC4[] = { 3.0f/3.0f, 2.0f/3.0f, 1.0f/3.0f, 0.0f/3.0f }; + static const float pD4[] = { 0.0f/3.0f, 1.0f/3.0f, 2.0f/3.0f, 3.0f/3.0f }; + + const float *pC = (3 == cSteps) ? pC3 : pC4; + const float *pD = (3 == cSteps) ? pD3 : pD4; + + // Find Min and Max points, as starting point + HDRColorA X = (flags & BC_FLAGS_UNIFORM) ? 
HDRColorA(1.f, 1.f, 1.f, 1.f) : g_Luminance; + HDRColorA Y = HDRColorA(0.0f, 0.0f, 0.0f, 1.0f); + + for(size_t iPoint = 0; iPoint < NUM_PIXELS_PER_BLOCK; iPoint++) + { +#ifdef COLOR_WEIGHTS + if(pPoints[iPoint].a > 0.0f) +#endif // COLOR_WEIGHTS + { + if(pPoints[iPoint].r < X.r) + X.r = pPoints[iPoint].r; + + if(pPoints[iPoint].g < X.g) + X.g = pPoints[iPoint].g; + + if(pPoints[iPoint].b < X.b) + X.b = pPoints[iPoint].b; + + if(pPoints[iPoint].r > Y.r) + Y.r = pPoints[iPoint].r; + + if(pPoints[iPoint].g > Y.g) + Y.g = pPoints[iPoint].g; + + if(pPoints[iPoint].b > Y.b) + Y.b = pPoints[iPoint].b; + } + } + + // Diagonal axis + HDRColorA AB; + + AB.r = Y.r - X.r; + AB.g = Y.g - X.g; + AB.b = Y.b - X.b; + + float fAB = AB.r * AB.r + AB.g * AB.g + AB.b * AB.b; + + // Single color block.. no need to root-find + if(fAB < FLT_MIN) + { + pX->r = X.r; pX->g = X.g; pX->b = X.b; + pY->r = Y.r; pY->g = Y.g; pY->b = Y.b; + return; + } + + // Try all four axis directions, to determine which diagonal best fits data + float fABInv = 1.0f / fAB; + + HDRColorA Dir; + Dir.r = AB.r * fABInv; + Dir.g = AB.g * fABInv; + Dir.b = AB.b * fABInv; + + HDRColorA Mid; + Mid.r = (X.r + Y.r) * 0.5f; + Mid.g = (X.g + Y.g) * 0.5f; + Mid.b = (X.b + Y.b) * 0.5f; + + float fDir[4]; + fDir[0] = fDir[1] = fDir[2] = fDir[3] = 0.0f; + + + for(size_t iPoint = 0; iPoint < NUM_PIXELS_PER_BLOCK; iPoint++) + { + HDRColorA Pt; + Pt.r = (pPoints[iPoint].r - Mid.r) * Dir.r; + Pt.g = (pPoints[iPoint].g - Mid.g) * Dir.g; + Pt.b = (pPoints[iPoint].b - Mid.b) * Dir.b; + + float f; + +#ifdef COLOR_WEIGHTS + f = Pt.r + Pt.g + Pt.b; + fDir[0] += pPoints[iPoint].a * f * f; + + f = Pt.r + Pt.g - Pt.b; + fDir[1] += pPoints[iPoint].a * f * f; + + f = Pt.r - Pt.g + Pt.b; + fDir[2] += pPoints[iPoint].a * f * f; + + f = Pt.r - Pt.g - Pt.b; + fDir[3] += pPoints[iPoint].a * f * f; +#else + f = Pt.r + Pt.g + Pt.b; + fDir[0] += f * f; + + f = Pt.r + Pt.g - Pt.b; + fDir[1] += f * f; + + f = Pt.r - Pt.g + Pt.b; + fDir[2] += f * f; + + f = Pt.r - Pt.g - Pt.b; + fDir[3] += f * f; +#endif // COLOR_WEIGHTS + } + + float fDirMax = fDir[0]; + size_t iDirMax = 0; + + for(size_t iDir = 1; iDir < 4; iDir++) + { + if(fDir[iDir] > fDirMax) + { + fDirMax = fDir[iDir]; + iDirMax = iDir; + } + } + + if(iDirMax & 2) + { + float f = X.g; X.g = Y.g; Y.g = f; + } + + if(iDirMax & 1) + { + float f = X.b; X.b = Y.b; Y.b = f; + } + + + // Two color block.. no need to root-find + if(fAB < 1.0f / 4096.0f) + { + pX->r = X.r; pX->g = X.g; pX->b = X.b; + pY->r = Y.r; pY->g = Y.g; pY->b = Y.b; + return; + } + + + // Use Newton's Method to find local minima of sum-of-squares error. 
+ float fSteps = (float) (cSteps - 1); + + for(size_t iIteration = 0; iIteration < 8; iIteration++) + { + // Calculate new steps + HDRColorA pSteps[4]; + + for(size_t iStep = 0; iStep < cSteps; iStep++) + { + pSteps[iStep].r = X.r * pC[iStep] + Y.r * pD[iStep]; + pSteps[iStep].g = X.g * pC[iStep] + Y.g * pD[iStep]; + pSteps[iStep].b = X.b * pC[iStep] + Y.b * pD[iStep]; + } + + + // Calculate color direction + Dir.r = Y.r - X.r; + Dir.g = Y.g - X.g; + Dir.b = Y.b - X.b; + + float fLen = (Dir.r * Dir.r + Dir.g * Dir.g + Dir.b * Dir.b); + + if(fLen < (1.0f / 4096.0f)) + break; + + float fScale = fSteps / fLen; + + Dir.r *= fScale; + Dir.g *= fScale; + Dir.b *= fScale; + + + // Evaluate function, and derivatives + float d2X, d2Y; + HDRColorA dX, dY; + d2X = d2Y = dX.r = dX.g = dX.b = dY.r = dY.g = dY.b = 0.0f; + + for(size_t iPoint = 0; iPoint < NUM_PIXELS_PER_BLOCK; iPoint++) + { + float fDot = (pPoints[iPoint].r - X.r) * Dir.r + + (pPoints[iPoint].g - X.g) * Dir.g + + (pPoints[iPoint].b - X.b) * Dir.b; + + + size_t iStep; + if(fDot <= 0.0f) + iStep = 0; + if(fDot >= fSteps) + iStep = cSteps - 1; + else + iStep = static_cast(fDot + 0.5f); + + + HDRColorA Diff; + Diff.r = pSteps[iStep].r - pPoints[iPoint].r; + Diff.g = pSteps[iStep].g - pPoints[iPoint].g; + Diff.b = pSteps[iStep].b - pPoints[iPoint].b; + +#ifdef COLOR_WEIGHTS + float fC = pC[iStep] * pPoints[iPoint].a * (1.0f / 8.0f); + float fD = pD[iStep] * pPoints[iPoint].a * (1.0f / 8.0f); +#else + float fC = pC[iStep] * (1.0f / 8.0f); + float fD = pD[iStep] * (1.0f / 8.0f); +#endif // COLOR_WEIGHTS + + d2X += fC * pC[iStep]; + dX.r += fC * Diff.r; + dX.g += fC * Diff.g; + dX.b += fC * Diff.b; + + d2Y += fD * pD[iStep]; + dY.r += fD * Diff.r; + dY.g += fD * Diff.g; + dY.b += fD * Diff.b; + } + + + // Move endpoints + if(d2X > 0.0f) + { + float f = -1.0f / d2X; + + X.r += dX.r * f; + X.g += dX.g * f; + X.b += dX.b * f; + } + + if(d2Y > 0.0f) + { + float f = -1.0f / d2Y; + + Y.r += dY.r * f; + Y.g += dY.g * f; + Y.b += dY.b * f; + } + + if((dX.r * dX.r < fEpsilon) && (dX.g * dX.g < fEpsilon) && (dX.b * dX.b < fEpsilon) && + (dY.r * dY.r < fEpsilon) && (dY.g * dY.g < fEpsilon) && (dY.b * dY.b < fEpsilon)) + { + break; + } + } + + pX->r = X.r; pX->g = X.g; pX->b = X.b; + pY->r = Y.r; pY->g = Y.g; pY->b = Y.b; +} + + +//------------------------------------------------------------------------------------- +inline static void DecodeBC1( _Out_cap_c_(NUM_PIXELS_PER_BLOCK) XMVECTOR *pColor, _In_ const D3DX_BC1 *pBC ) +{ + assert( pColor && pBC ); + static_assert( sizeof(D3DX_BC1) == 8, "D3DX_BC1 should be 8 bytes" ); + + static XMVECTORF32 s_Scale = { 1.f/31.f, 1.f/63.f, 1.f/31.f, 1.f }; + + XMVECTOR clr0 = XMLoadU565( reinterpret_cast(&pBC->rgb[0]) ); + XMVECTOR clr1 = XMLoadU565( reinterpret_cast(&pBC->rgb[1]) ); + + clr0 = XMVectorMultiply( clr0, s_Scale ); + clr1 = XMVectorMultiply( clr1, s_Scale ); + + clr0 = XMVectorSwizzle( clr0, 2, 1, 0, 3 ); + clr1 = XMVectorSwizzle( clr1, 2, 1, 0, 3 ); + + clr0 = XMVectorSelect( g_XMIdentityR3, clr0, g_XMSelect1110 ); + clr1 = XMVectorSelect( g_XMIdentityR3, clr1, g_XMSelect1110 ); + + XMVECTOR clr2, clr3; + if(pBC->rgb[0] <= pBC->rgb[1]) + { + clr2 = XMVectorLerp( clr0, clr1, 0.5f ); + clr3 = XMVectorZero(); // Alpha of 0 + } + else + { + clr2 = XMVectorLerp( clr0, clr1, 1.f/3.f ); + clr3 = XMVectorLerp( clr0, clr1, 2.f/3.f ); + } + + uint32_t dw = pBC->bitmap; + + for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i, dw >>= 2) + { + switch(dw & 3) + { + case 0: pColor[i] = clr0; break; + case 1: pColor[i] = 
clr1; break; + case 2: pColor[i] = clr2; break; + + case 3: + default: pColor[i] = clr3; break; + } + } +} + + +//------------------------------------------------------------------------------------- +#pragma warning(disable: 4616 6001 6201) + +static void EncodeBC1(_Out_ D3DX_BC1 *pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const HDRColorA *pColor, + _In_ bool bColorKey, _In_ float alphaRef, _In_ DWORD flags) +{ + assert( pBC && pColor ); + static_assert( sizeof(D3DX_BC1) == 8, "D3DX_BC1 should be 8 bytes" ); + + // Determine if we need to colorkey this block + size_t uSteps; + + if (bColorKey) + { + size_t uColorKey = 0; + + for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + if(pColor[i].a < alphaRef) + uColorKey++; + } + + if(NUM_PIXELS_PER_BLOCK == uColorKey) + { + pBC->rgb[0] = 0x0000; + pBC->rgb[1] = 0xffff; + pBC->bitmap = 0xffffffff; + return; + } + + uSteps = (uColorKey > 0) ? 3 : 4; + } + else + { + uSteps = 4; + } + + // Quantize block to R56B5, using Floyd Stienberg error diffusion. This + // increases the chance that colors will map directly to the quantized + // axis endpoints. + HDRColorA Color[NUM_PIXELS_PER_BLOCK]; + HDRColorA Error[NUM_PIXELS_PER_BLOCK]; + + if (flags & BC_FLAGS_DITHER_RGB) + memset(Error, 0x00, NUM_PIXELS_PER_BLOCK * sizeof(HDRColorA)); + + size_t i; + for(i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + HDRColorA Clr; + Clr.r = pColor[i].r; + Clr.g = pColor[i].g; + Clr.b = pColor[i].b; + + if (flags & BC_FLAGS_DITHER_RGB) + { + Clr.r += Error[i].r; + Clr.g += Error[i].g; + Clr.b += Error[i].b; + } + + Color[i].r = (float) static_cast(Clr.r * 31.0f + 0.5f) * (1.0f / 31.0f); + Color[i].g = (float) static_cast(Clr.g * 63.0f + 0.5f) * (1.0f / 63.0f); + Color[i].b = (float) static_cast(Clr.b * 31.0f + 0.5f) * (1.0f / 31.0f); + +#ifdef COLOR_WEIGHTS + Color[i].a = pColor[i].a; +#else + Color[i].a = 1.0f; +#endif // COLOR_WEIGHTS + + if (flags & BC_FLAGS_DITHER_RGB) + { + HDRColorA Diff; + Diff.r = Color[i].a * (Clr.r - Color[i].r); + Diff.g = Color[i].a * (Clr.g - Color[i].g); + Diff.b = Color[i].a * (Clr.b - Color[i].b); + + if(3 != (i & 3)) + { + assert( i < 15 ); + __analysis_assume( i < 15 ); + Error[i + 1].r += Diff.r * (7.0f / 16.0f); + Error[i + 1].g += Diff.g * (7.0f / 16.0f); + Error[i + 1].b += Diff.b * (7.0f / 16.0f); + } + + if(i < 12) + { + if(i & 3) + { + Error[i + 3].r += Diff.r * (3.0f / 16.0f); + Error[i + 3].g += Diff.g * (3.0f / 16.0f); + Error[i + 3].b += Diff.b * (3.0f / 16.0f); + } + + Error[i + 4].r += Diff.r * (5.0f / 16.0f); + Error[i + 4].g += Diff.g * (5.0f / 16.0f); + Error[i + 4].b += Diff.b * (5.0f / 16.0f); + + if(3 != (i & 3)) + { + assert( i < 11 ); + __analysis_assume(i < 11 ); + Error[i + 5].r += Diff.r * (1.0f / 16.0f); + Error[i + 5].g += Diff.g * (1.0f / 16.0f); + Error[i + 5].b += Diff.b * (1.0f / 16.0f); + } + } + } + + if ( !( flags & BC_FLAGS_UNIFORM ) ) + { + Color[i].r *= g_Luminance.r; + Color[i].g *= g_Luminance.g; + Color[i].b *= g_Luminance.b; + } + } + + // Perform 6D root finding function to find two endpoints of color axis. + // Then quantize and sort the endpoints depending on mode. 
+ HDRColorA ColorA, ColorB, ColorC, ColorD; + + OptimizeRGB(&ColorA, &ColorB, Color, uSteps, flags); + + if ( flags & BC_FLAGS_UNIFORM ) + { + ColorC = ColorA; + ColorD = ColorB; + } + else + { + ColorC.r = ColorA.r * g_LuminanceInv.r; + ColorC.g = ColorA.g * g_LuminanceInv.g; + ColorC.b = ColorA.b * g_LuminanceInv.b; + + ColorD.r = ColorB.r * g_LuminanceInv.r; + ColorD.g = ColorB.g * g_LuminanceInv.g; + ColorD.b = ColorB.b * g_LuminanceInv.b; + } + + uint16_t wColorA = Encode565(&ColorC); + uint16_t wColorB = Encode565(&ColorD); + + if((uSteps == 4) && (wColorA == wColorB)) + { + pBC->rgb[0] = wColorA; + pBC->rgb[1] = wColorB; + pBC->bitmap = 0x00000000; + return; + } + + Decode565(&ColorC, wColorA); + Decode565(&ColorD, wColorB); + + if ( flags & BC_FLAGS_UNIFORM ) + { + ColorA = ColorC; + ColorB = ColorD; + } + else + { + ColorA.r = ColorC.r * g_Luminance.r; + ColorA.g = ColorC.g * g_Luminance.g; + ColorA.b = ColorC.b * g_Luminance.b; + + ColorB.r = ColorD.r * g_Luminance.r; + ColorB.g = ColorD.g * g_Luminance.g; + ColorB.b = ColorD.b * g_Luminance.b; + } + + // Calculate color steps + HDRColorA Step[4]; + + if((3 == uSteps) == (wColorA <= wColorB)) + { + pBC->rgb[0] = wColorA; + pBC->rgb[1] = wColorB; + + Step[0] = ColorA; + Step[1] = ColorB; + } + else + { + pBC->rgb[0] = wColorB; + pBC->rgb[1] = wColorA; + + Step[0] = ColorB; + Step[1] = ColorA; + } + + static const size_t pSteps3[] = { 0, 2, 1 }; + static const size_t pSteps4[] = { 0, 2, 3, 1 }; + const size_t *pSteps; + + if(3 == uSteps) + { + pSteps = pSteps3; + + HDRColorALerp(&Step[2], &Step[0], &Step[1], 0.5f); + } + else + { + pSteps = pSteps4; + + HDRColorALerp(&Step[2], &Step[0], &Step[1], 1.0f / 3.0f); + HDRColorALerp(&Step[3], &Step[0], &Step[1], 2.0f / 3.0f); + } + + // Calculate color direction + HDRColorA Dir; + + Dir.r = Step[1].r - Step[0].r; + Dir.g = Step[1].g - Step[0].g; + Dir.b = Step[1].b - Step[0].b; + + float fSteps = (float) (uSteps - 1); + float fScale = (wColorA != wColorB) ? 
(fSteps / (Dir.r * Dir.r + Dir.g * Dir.g + Dir.b * Dir.b)) : 0.0f; + + Dir.r *= fScale; + Dir.g *= fScale; + Dir.b *= fScale; + + // Encode colors + uint32_t dw = 0; + if (flags & BC_FLAGS_DITHER_RGB) + memset(Error, 0x00, NUM_PIXELS_PER_BLOCK * sizeof(HDRColorA)); + + for(i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + if((3 == uSteps) && (pColor[i].a < alphaRef)) + { + dw = (3 << 30) | (dw >> 2); + } + else + { + HDRColorA Clr; + if ( flags & BC_FLAGS_UNIFORM ) + { + Clr.r = pColor[i].r; + Clr.g = pColor[i].g; + Clr.b = pColor[i].b; + } + else + { + Clr.r = pColor[i].r * g_Luminance.r; + Clr.g = pColor[i].g * g_Luminance.g; + Clr.b = pColor[i].b * g_Luminance.b; + } + + if (flags & BC_FLAGS_DITHER_RGB) + { + Clr.r += Error[i].r; + Clr.g += Error[i].g; + Clr.b += Error[i].b; + } + + float fDot = (Clr.r - Step[0].r) * Dir.r + (Clr.g - Step[0].g) * Dir.g + (Clr.b - Step[0].b) * Dir.b; + uint32_t iStep; + + if(fDot <= 0.0f) + iStep = 0; + else if(fDot >= fSteps) + iStep = 1; + else + iStep = static_cast( pSteps[static_cast(fDot + 0.5f)] ); + + dw = (iStep << 30) | (dw >> 2); + + if (flags & BC_FLAGS_DITHER_RGB) + { + HDRColorA Diff; + Diff.r = Color[i].a * (Clr.r - Step[iStep].r); + Diff.g = Color[i].a * (Clr.g - Step[iStep].g); + Diff.b = Color[i].a * (Clr.b - Step[iStep].b); + + if(3 != (i & 3)) + { + Error[i + 1].r += Diff.r * (7.0f / 16.0f); + Error[i + 1].g += Diff.g * (7.0f / 16.0f); + Error[i + 1].b += Diff.b * (7.0f / 16.0f); + } + + if(i < 12) + { + if(i & 3) + { + Error[i + 3].r += Diff.r * (3.0f / 16.0f); + Error[i + 3].g += Diff.g * (3.0f / 16.0f); + Error[i + 3].b += Diff.b * (3.0f / 16.0f); + } + + Error[i + 4].r += Diff.r * (5.0f / 16.0f); + Error[i + 4].g += Diff.g * (5.0f / 16.0f); + Error[i + 4].b += Diff.b * (5.0f / 16.0f); + + if(3 != (i & 3)) + { + Error[i + 5].r += Diff.r * (1.0f / 16.0f); + Error[i + 5].g += Diff.g * (1.0f / 16.0f); + Error[i + 5].b += Diff.b * (1.0f / 16.0f); + } + } + } + } + } + + pBC->bitmap = dw; +} + +//------------------------------------------------------------------------------------- +#ifdef COLOR_WEIGHTS +static void EncodeSolidBC1(_Out_ D3DX_BC1 *pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const HDRColorA *pColor) +{ +#ifdef COLOR_AVG_0WEIGHTS + // Compute avg color + HDRColorA Color; + Color.r = pColor[0].r; + Color.g = pColor[0].g; + Color.b = pColor[0].b; + + for(size_t i = 1; i < NUM_PIXELS_PER_BLOCK; ++i) + { + Color.r += pColor[i].r; + Color.g += pColor[i].g; + Color.b += pColor[i].b; + } + + Color.r *= 1.0f / 16.0f; + Color.g *= 1.0f / 16.0f; + Color.b *= 1.0f / 16.0f; + + uint16_t wColor = Encode565(&Color); +#else + uint16_t wColor = 0x0000; +#endif // COLOR_AVG_0WEIGHTS + + // Encode solid block + pBC->rgb[0] = wColor; + pBC->rgb[1] = wColor; + pBC->bitmap = 0x00000000; +} +#endif // COLOR_WEIGHTS + + +//===================================================================================== +// Entry points +//===================================================================================== + +//------------------------------------------------------------------------------------- +// BC1 Compression +//------------------------------------------------------------------------------------- +void D3DXDecodeBC1(XMVECTOR *pColor, const uint8_t *pBC) +{ + const D3DX_BC1 *pBC1 = reinterpret_cast(pBC); + DecodeBC1( pColor, pBC1 ); +} + +void D3DXEncodeBC1(uint8_t *pBC, const XMVECTOR *pColor, float alphaRef, DWORD flags) +{ + assert( pBC && pColor ); + + HDRColorA Color[NUM_PIXELS_PER_BLOCK]; + + if (flags & BC_FLAGS_DITHER_A) + { + float 
fError[NUM_PIXELS_PER_BLOCK]; + memset(fError, 0x00, NUM_PIXELS_PER_BLOCK * sizeof(float)); + + for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + HDRColorA clr; + XMStoreFloat4( reinterpret_cast( &clr ), pColor[i] ); + + float fAlph = clr.a + fError[i]; + + Color[i].r = clr.r; + Color[i].g = clr.g; + Color[i].b = clr.b; + Color[i].a = (float) static_cast(clr.a + fError[i] + 0.5f); + + float fDiff = fAlph - Color[i].a; + + if(3 != (i & 3)) + { + assert( i < 15 ); + __analysis_assume( i < 15 ); + fError[i + 1] += fDiff * (7.0f / 16.0f); + } + + if(i < 12) + { + if(i & 3) + fError[i + 3] += fDiff * (3.0f / 16.0f); + + fError[i + 4] += fDiff * (5.0f / 16.0f); + + if(3 != (i & 3)) + { + assert( i < 11 ); + __analysis_assume( i < 11 ); + fError[i + 5] += fDiff * (1.0f / 16.0f); + } + } + } + } + else + { + for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + XMStoreFloat4( reinterpret_cast( &Color[i] ), pColor[i] ); + } + } + + D3DX_BC1 *pBC1 = reinterpret_cast(pBC); + EncodeBC1(pBC1, Color, true, alphaRef, flags); +} + + +//------------------------------------------------------------------------------------- +// BC2 Compression +//------------------------------------------------------------------------------------- +void D3DXDecodeBC2(XMVECTOR *pColor, const uint8_t *pBC) +{ + assert( pColor && pBC ); + static_assert( sizeof(D3DX_BC2) == 16, "D3DX_BC2 should be 16 bytes" ); + + const D3DX_BC2 *pBC2 = reinterpret_cast(pBC); + + // RGB part + DecodeBC1(pColor, &pBC2->bc1); + + // 4-bit alpha part + DWORD dw = pBC2->bitmap[0]; + + for(size_t i = 0; i < 8; ++i, dw >>= 4) + pColor[i] = XMVectorSetW( pColor[i], (float) (dw & 0xf) * (1.0f / 15.0f) ); + + dw = pBC2->bitmap[1]; + + for(size_t i = 8; i < NUM_PIXELS_PER_BLOCK; ++i, dw >>= 4) + pColor[i] = XMVectorSetW( pColor[i], (float) (dw & 0xf) * (1.0f / 15.0f) ); +} + +void D3DXEncodeBC2(uint8_t *pBC, const XMVECTOR *pColor, DWORD flags) +{ + assert( pBC && pColor ); + static_assert( sizeof(D3DX_BC2) == 16, "D3DX_BC2 should be 16 bytes" ); + + HDRColorA Color[NUM_PIXELS_PER_BLOCK]; + for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + XMStoreFloat4( reinterpret_cast( &Color[i] ), pColor[i] ); + } + + D3DX_BC2 *pBC2 = reinterpret_cast(pBC); + + // 4-bit alpha part. Dithered using Floyd Stienberg error diffusion. 
+ pBC2->bitmap[0] = 0; + pBC2->bitmap[1] = 0; + + float fError[NUM_PIXELS_PER_BLOCK]; + if (flags & BC_FLAGS_DITHER_A) + memset(fError, 0x00, NUM_PIXELS_PER_BLOCK * sizeof(float)); + + for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + float fAlph = Color[i].a; + if (flags & BC_FLAGS_DITHER_A) + fAlph += fError[i]; + + uint32_t u = (uint32_t) static_cast(fAlph * 15.0f + 0.5f); + + pBC2->bitmap[i >> 3] >>= 4; + pBC2->bitmap[i >> 3] |= (u << 28); + + if (flags & BC_FLAGS_DITHER_A) + { + float fDiff = fAlph - (float) u * (1.0f / 15.0f); + + if(3 != (i & 3)) + { + assert( i < 15 ); + __analysis_assume( i < 15 ); + fError[i + 1] += fDiff * (7.0f / 16.0f); + } + + if(i < 12) + { + if(i & 3) + fError[i + 3] += fDiff * (3.0f / 16.0f); + + fError[i + 4] += fDiff * (5.0f / 16.0f); + + if(3 != (i & 3)) + { + assert( i < 11 ); + __analysis_assume( i < 11 ); + fError[i + 5] += fDiff * (1.0f / 16.0f); + } + } + } + } + + // RGB part +#ifdef COLOR_WEIGHTS + if(!pBC2->bitmap[0] && !pBC2->bitmap[1]) + { + EncodeSolidBC1(pBC2->dxt1, Color); + return; + } +#endif // COLOR_WEIGHTS + + EncodeBC1(&pBC2->bc1, Color, false, 0.f, flags); +} + + +//------------------------------------------------------------------------------------- +// BC3 Compression +//------------------------------------------------------------------------------------- +void D3DXDecodeBC3(XMVECTOR *pColor, const uint8_t *pBC) +{ + assert( pColor && pBC ); + static_assert( sizeof(D3DX_BC3) == 16, "D3DX_BC3 should be 16 bytes" ); + + const D3DX_BC3 *pBC3 = reinterpret_cast(pBC); + + // RGB part + DecodeBC1(pColor, &pBC3->bc1); + + // Adaptive 3-bit alpha part + float fAlpha[8]; + + fAlpha[0] = ((float) pBC3->alpha[0]) * (1.0f / 255.0f); + fAlpha[1] = ((float) pBC3->alpha[1]) * (1.0f / 255.0f); + + if(pBC3->alpha[0] > pBC3->alpha[1]) + { + for(size_t i = 1; i < 7; ++i) + fAlpha[i + 1] = (fAlpha[0] * (7 - i) + fAlpha[1] * i) * (1.0f / 7.0f); + } + else + { + for(size_t i = 1; i < 5; ++i) + fAlpha[i + 1] = (fAlpha[0] * (5 - i) + fAlpha[1] * i) * (1.0f / 5.0f); + + fAlpha[6] = 0.0f; + fAlpha[7] = 1.0f; + } + + DWORD dw = pBC3->bitmap[0] | (pBC3->bitmap[1] << 8) | (pBC3->bitmap[2] << 16); + + for(size_t i = 0; i < 8; ++i, dw >>= 3) + pColor[i] = XMVectorSetW( pColor[i], fAlpha[dw & 0x7] ); + + dw = pBC3->bitmap[3] | (pBC3->bitmap[4] << 8) | (pBC3->bitmap[5] << 16); + + for(size_t i = 8; i < NUM_PIXELS_PER_BLOCK; ++i, dw >>= 3) + pColor[i] = XMVectorSetW( pColor[i], fAlpha[dw & 0x7] ); +} + +void D3DXEncodeBC3(uint8_t *pBC, const XMVECTOR *pColor, DWORD flags) +{ + assert( pBC && pColor ); + static_assert( sizeof(D3DX_BC3) == 16, "D3DX_BC3 should be 16 bytes" ); + + HDRColorA Color[NUM_PIXELS_PER_BLOCK]; + for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + XMStoreFloat4( reinterpret_cast( &Color[i] ), pColor[i] ); + } + + D3DX_BC3 *pBC3 = reinterpret_cast(pBC); + + // Quantize block to A8, using Floyd Stienberg error diffusion. This + // increases the chance that colors will map directly to the quantized + // axis endpoints. 
+ float fAlpha[NUM_PIXELS_PER_BLOCK]; + float fError[NUM_PIXELS_PER_BLOCK]; + + float fMinAlpha = Color[0].a; + float fMaxAlpha = Color[0].a; + + if (flags & BC_FLAGS_DITHER_A) + memset(fError, 0x00, NUM_PIXELS_PER_BLOCK * sizeof(float)); + + for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + float fAlph = Color[i].a; + if (flags & BC_FLAGS_DITHER_A) + fAlph += fError[i]; + + fAlpha[i] = static_cast(fAlph * 255.0f + 0.5f) * (1.0f / 255.0f); + + if(fAlpha[i] < fMinAlpha) + fMinAlpha = fAlpha[i]; + else if(fAlpha[i] > fMaxAlpha) + fMaxAlpha = fAlpha[i]; + + if (flags & BC_FLAGS_DITHER_A) + { + float fDiff = fAlph - fAlpha[i]; + + if(3 != (i & 3)) + { + assert( i < 15 ); + __analysis_assume( i < 15 ); + fError[i + 1] += fDiff * (7.0f / 16.0f); + } + + if(i < 12) + { + if(i & 3) + fError[i + 3] += fDiff * (3.0f / 16.0f); + + fError[i + 4] += fDiff * (5.0f / 16.0f); + + if(3 != (i & 3)) + { + assert( i < 11 ); + __analysis_assume( i < 11 ); + fError[i + 5] += fDiff * (1.0f / 16.0f); + } + } + } + } + +#ifdef COLOR_WEIGHTS + if(0.0f == fMaxAlpha) + { + EncodeSolidBC1(&pBC3->dxt1, Color); + pBC3->alpha[0] = 0x00; + pBC3->alpha[1] = 0x00; + memset(pBC3->bitmap, 0x00, 6); + } +#endif + + // RGB part + EncodeBC1(&pBC3->bc1, Color, false, 0.f, flags); + + // Alpha part + if(1.0f == fMinAlpha) + { + pBC3->alpha[0] = 0xff; + pBC3->alpha[1] = 0xff; + memset(pBC3->bitmap, 0x00, 6); + return; + } + + // Optimize and Quantize Min and Max values + size_t uSteps = ((0.0f == fMinAlpha) || (1.0f == fMaxAlpha)) ? 6 : 8; + + float fAlphaA, fAlphaB; + OptimizeAlpha(&fAlphaA, &fAlphaB, fAlpha, uSteps); + + uint8_t bAlphaA = (uint8_t) static_cast(fAlphaA * 255.0f + 0.5f); + uint8_t bAlphaB = (uint8_t) static_cast(fAlphaB * 255.0f + 0.5f); + + fAlphaA = (float) bAlphaA * (1.0f / 255.0f); + fAlphaB = (float) bAlphaB * (1.0f / 255.0f); + + // Setup block + if((8 == uSteps) && (bAlphaA == bAlphaB)) + { + pBC3->alpha[0] = bAlphaA; + pBC3->alpha[1] = bAlphaB; + memset(pBC3->bitmap, 0x00, 6); + return; + } + + static const size_t pSteps6[] = { 0, 2, 3, 4, 5, 1 }; + static const size_t pSteps8[] = { 0, 2, 3, 4, 5, 6, 7, 1 }; + + const size_t *pSteps; + float fStep[8]; + + if(6 == uSteps) + { + pBC3->alpha[0] = bAlphaA; + pBC3->alpha[1] = bAlphaB; + + fStep[0] = fAlphaA; + fStep[1] = fAlphaB; + + for(size_t i = 1; i < 5; ++i) + fStep[i + 1] = (fStep[0] * (5 - i) + fStep[1] * i) * (1.0f / 5.0f); + + fStep[6] = 0.0f; + fStep[7] = 1.0f; + + pSteps = pSteps6; + } + else + { + pBC3->alpha[0] = bAlphaB; + pBC3->alpha[1] = bAlphaA; + + fStep[0] = fAlphaB; + fStep[1] = fAlphaA; + + for(size_t i = 1; i < 7; ++i) + fStep[i + 1] = (fStep[0] * (7 - i) + fStep[1] * i) * (1.0f / 7.0f); + + pSteps = pSteps8; + } + + // Encode alpha bitmap + float fSteps = (float) (uSteps - 1); + float fScale = (fStep[0] != fStep[1]) ? (fSteps / (fStep[1] - fStep[0])) : 0.0f; + + if (flags & BC_FLAGS_DITHER_A) + memset(fError, 0x00, NUM_PIXELS_PER_BLOCK * sizeof(float)); + + for(size_t iSet = 0; iSet < 2; iSet++) + { + uint32_t dw = 0; + + size_t iMin = iSet * 8; + size_t iLim = iMin + 8; + + for(size_t i = iMin; i < iLim; ++i) + { + float fAlph = Color[i].a; + if (flags & BC_FLAGS_DITHER_A) + fAlph += fError[i]; + float fDot = (fAlph - fStep[0]) * fScale; + + uint32_t iStep; + if(fDot <= 0.0f) + iStep = ((6 == uSteps) && (fAlph <= fStep[0] * 0.5f)) ? 6 : 0; + else if(fDot >= fSteps) + iStep = ((6 == uSteps) && (fAlph >= (fStep[1] + 1.0f) * 0.5f)) ? 
7 : 1; + else + iStep = static_cast( pSteps[static_cast(fDot + 0.5f)] ); + + dw = (iStep << 21) | (dw >> 3); + + if (flags & BC_FLAGS_DITHER_A) + { + float fDiff = (fAlph - fStep[iStep]); + + if(3 != (i & 3)) + fError[i + 1] += fDiff * (7.0f / 16.0f); + + if(i < 12) + { + if(i & 3) + fError[i + 3] += fDiff * (3.0f / 16.0f); + + fError[i + 4] += fDiff * (5.0f / 16.0f); + + if(3 != (i & 3)) + fError[i + 5] += fDiff * (1.0f / 16.0f); + } + } + } + + pBC3->bitmap[0 + iSet * 3] = ((uint8_t *) &dw)[0]; + pBC3->bitmap[1 + iSet * 3] = ((uint8_t *) &dw)[1]; + pBC3->bitmap[2 + iSet * 3] = ((uint8_t *) &dw)[2]; + } +} + +} // namespace \ No newline at end of file diff --git a/thirdparty/directxtex/DirectXTex/BC.h b/thirdparty/directxtex/DirectXTex/BC.h new file mode 100644 index 0000000..638058e --- /dev/null +++ b/thirdparty/directxtex/DirectXTex/BC.h @@ -0,0 +1,897 @@ +//------------------------------------------------------------------------------------- +// BC.h +// +// Block-compression (BC) functionality +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkId=248926 +//------------------------------------------------------------------------------------- + +#if defined(_MSC_VER) && (_MSC_VER > 1000) +#pragma once +#endif + +#include + +#ifdef USE_XNAMATH +#include +#else +#include +#include +#endif + +#include + +#pragma warning(push) +#pragma warning(disable : 4005) +#include +#pragma warning(pop) + +namespace DirectX +{ + +#ifndef USE_XNAMATH +typedef PackedVector::HALF HALF; +typedef PackedVector::XMHALF4 XMHALF4; +typedef PackedVector::XMU565 XMU565; +#endif + +//------------------------------------------------------------------------------------- +// Constants +//------------------------------------------------------------------------------------- + +const uint16_t F16S_MASK = 0x8000; // f16 sign mask +const uint16_t F16EM_MASK = 0x7fff; // f16 exp & mantissa mask +const uint16_t F16MAX = 0x7bff; // MAXFLT bit pattern for XMHALF + +#define SIGN_EXTEND(x,nb) ((((x)&(1<<((nb)-1)))?((~0)<<(nb)):0)|(x)) + +// Because these are used in SAL annotations, they need to remain macros rather than const values +#define NUM_PIXELS_PER_BLOCK 16 +#define BC6H_MAX_REGIONS 2 +#define BC6H_MAX_INDICES 16 +#define BC7_MAX_REGIONS 3 +#define BC7_MAX_INDICES 16 + +const size_t BC6H_NUM_CHANNELS = 3; +const size_t BC6H_MAX_SHAPES = 32; + +const size_t BC7_NUM_CHANNELS = 4; +const size_t BC7_MAX_SHAPES = 64; + +const uint32_t BC67_WEIGHT_MAX = 64; +const uint32_t BC67_WEIGHT_SHIFT = 6; +const uint32_t BC67_WEIGHT_ROUND = 32; + +extern const int g_aWeights2[4]; +extern const int g_aWeights3[8]; +extern const int g_aWeights4[16]; + +enum BC_FLAGS +{ + BC_FLAGS_NONE = 0x0, + BC_FLAGS_DITHER_RGB = 0x10000, // Enables dithering for RGB colors for BC1-3 + BC_FLAGS_DITHER_A = 0x20000, // Enables dithering for Alpha channel for BC1-3 + BC_FLAGS_UNIFORM = 0x40000, // By default, uses perceptual weighting for BC1-3; this flag makes it a uniform weighting +}; + +//------------------------------------------------------------------------------------- +// Structures +//------------------------------------------------------------------------------------- +class HDRColorA; + +class LDRColorA +{ +public: + uint8_t r, g, b, a; + + LDRColorA() 
{} + LDRColorA(uint8_t _r, uint8_t _g, uint8_t _b, uint8_t _a) : r(_r), g(_g), b(_b), a(_a) {} + + const uint8_t& operator [] (_In_range_(0,3) size_t uElement) const + { + switch(uElement) + { + case 0: return r; + case 1: return g; + case 2: return b; + case 3: return a; + default: assert(false); return r; + } + } + + uint8_t& operator [] (_In_range_(0,3) size_t uElement) + { + switch(uElement) + { + case 0: return r; + case 1: return g; + case 2: return b; + case 3: return a; + default: assert(false); return r; + } + } + + LDRColorA operator = (_In_ const HDRColorA& c); + + static void InterpolateRGB(_In_ const LDRColorA& c0, _In_ const LDRColorA& c1, _In_ size_t wc, _In_ size_t wcprec, _Out_ LDRColorA& out) + { + const int* aWeights = nullptr; + switch(wcprec) + { + case 2: aWeights = g_aWeights2; assert( wc < 4 ); __analysis_assume( wc < 4 ); break; + case 3: aWeights = g_aWeights3; assert( wc < 8 ); __analysis_assume( wc < 8 ); break; + case 4: aWeights = g_aWeights4; assert( wc < 16 ); __analysis_assume( wc < 16 ); break; + default: assert(false); out.r = out.g = out.b = 0; return; + } + out.r = uint8_t((uint32_t(c0.r) * uint32_t(BC67_WEIGHT_MAX - aWeights[wc]) + uint32_t(c1.r) * uint32_t(aWeights[wc]) + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT); + out.g = uint8_t((uint32_t(c0.g) * uint32_t(BC67_WEIGHT_MAX - aWeights[wc]) + uint32_t(c1.g) * uint32_t(aWeights[wc]) + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT); + out.b = uint8_t((uint32_t(c0.b) * uint32_t(BC67_WEIGHT_MAX - aWeights[wc]) + uint32_t(c1.b) * uint32_t(aWeights[wc]) + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT); + } + + static void InterpolateA(_In_ const LDRColorA& c0, _In_ const LDRColorA& c1, _In_ size_t wa, _In_ size_t waprec, _Out_ LDRColorA& out) + { + const int* aWeights = nullptr; + switch(waprec) + { + case 2: aWeights = g_aWeights2; assert( wa < 4 ); __analysis_assume( wa < 4 ); break; + case 3: aWeights = g_aWeights3; assert( wa < 8 ); __analysis_assume( wa < 8 ); break; + case 4: aWeights = g_aWeights4; assert( wa < 16 ); __analysis_assume( wa < 16 ); break; + default: assert(false); out.a = 0; return; + } + out.a = uint8_t((uint32_t(c0.a) * uint32_t(BC67_WEIGHT_MAX - aWeights[wa]) + uint32_t(c1.a) * uint32_t(aWeights[wa]) + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT); + } + + static void Interpolate(_In_ const LDRColorA& c0, _In_ const LDRColorA& c1, _In_ size_t wc, _In_ size_t wa, _In_ size_t wcprec, _In_ size_t waprec, _Out_ LDRColorA& out) + { + InterpolateRGB(c0, c1, wc, wcprec, out); + InterpolateA(c0, c1, wa, waprec, out); + } +}; + +class HDRColorA +{ +public: + float r, g, b, a; + +public: + HDRColorA() {} + HDRColorA(float _r, float _g, float _b, float _a) : r(_r), g(_g), b(_b), a(_a) {} + HDRColorA(const HDRColorA& c) : r(c.r), g(c.g), b(c.b), a(c.a) {} + HDRColorA(const LDRColorA& c) + { + r = float(c.r) * (1.0f/255.0f); + g = float(c.g) * (1.0f/255.0f); + b = float(c.b) * (1.0f/255.0f); + a = float(c.a) * (1.0f/255.0f); + } + + // binary operators + HDRColorA operator + ( _In_ const HDRColorA& c ) const + { + return HDRColorA(r + c.r, g + c.g, b + c.b, a + c.a); + } + + HDRColorA operator - ( _In_ const HDRColorA& c ) const + { + return HDRColorA(r - c.r, g - c.g, b - c.b, a - c.a); + } + + HDRColorA operator * ( _In_ float f ) const + { + return HDRColorA(r * f, g * f, b * f, a * f); + } + + HDRColorA operator / ( _In_ float f ) const + { + float fInv = 1.0f / f; + return HDRColorA(r * fInv, g * fInv, b * fInv, a * fInv); + } + + float operator * ( _In_ const HDRColorA& c ) const + { + return r * c.r + g * 
c.g + b * c.b + a * c.a; + } + + // assignment operators + HDRColorA& operator += ( _In_ const HDRColorA& c ) + { + r += c.r; + g += c.g; + b += c.b; + a += c.a; + return *this; + } + + HDRColorA& operator -= ( _In_ const HDRColorA& c ) + { + r -= c.r; + g -= c.g; + b -= c.b; + a -= c.a; + return *this; + } + + HDRColorA& operator *= ( _In_ float f ) + { + r *= f; + g *= f; + b *= f; + a *= f; + return *this; + } + + HDRColorA& operator /= ( _In_ float f ) + { + float fInv = 1.0f / f; + r *= fInv; + g *= fInv; + b *= fInv; + a *= fInv; + return *this; + } + + HDRColorA& operator = (_In_ const LDRColorA& c) + { + r = (float) c.r; + g = (float) c.g; + b = (float) c.b; + a = (float) c.a; + return *this; + } + + HDRColorA& Clamp(_In_ float fMin, _In_ float fMax) + { + r = std::min(fMax, std::max(fMin, r)); + g = std::min(fMax, std::max(fMin, g)); + b = std::min(fMax, std::max(fMin, b)); + a = std::min(fMax, std::max(fMin, a)); + return *this; + } + + LDRColorA ToLDRColorA() const + { + return LDRColorA((uint8_t) (r + 0.01f), (uint8_t) (g + 0.01f), (uint8_t) (b + 0.01f), (uint8_t) (a + 0.01f)); + } +}; + +inline LDRColorA LDRColorA::operator = (_In_ const HDRColorA& c) +{ + LDRColorA ret; + HDRColorA tmp(c); + tmp = tmp.Clamp(0.0f, 1.0f) * 255.0f; + ret.r = uint8_t(tmp.r + 0.001f); + ret.g = uint8_t(tmp.g + 0.001f); + ret.b = uint8_t(tmp.b + 0.001f); + ret.a = uint8_t(tmp.a + 0.001f); + return ret; +} + +struct LDREndPntPair +{ + LDRColorA A; + LDRColorA B; +}; + +struct HDREndPntPair +{ + HDRColorA A; + HDRColorA B; +}; + +inline HDRColorA* HDRColorALerp(_Out_ HDRColorA *pOut, _In_ const HDRColorA *pC1, _In_ const HDRColorA *pC2, _In_ float s) +{ + pOut->r = pC1->r + s * (pC2->r - pC1->r); + pOut->g = pC1->g + s * (pC2->g - pC1->g); + pOut->b = pC1->b + s * (pC2->b - pC1->b); + pOut->a = pC1->a + s * (pC2->a - pC1->a); + return pOut; +} + +#pragma pack(push,1) +// BC1/DXT1 compression (4 bits per texel) +struct D3DX_BC1 +{ + uint16_t rgb[2]; // 565 colors + uint32_t bitmap; // 2bpp rgb bitmap +}; + +// BC2/DXT2/3 compression (8 bits per texel) +struct D3DX_BC2 +{ + uint32_t bitmap[2]; // 4bpp alpha bitmap + D3DX_BC1 bc1; // BC1 rgb data +}; + +// BC3/DXT4/5 compression (8 bits per texel) +struct D3DX_BC3 +{ + uint8_t alpha[2]; // alpha values + uint8_t bitmap[6]; // 3bpp alpha bitmap + D3DX_BC1 bc1; // BC1 rgb data +}; +#pragma pack(pop) + +class INTColor +{ +public: + int r, g, b; + +public: + INTColor() {} + INTColor(int nr, int ng, int nb) {r = nr; g = ng; b = nb;} + INTColor(const INTColor& c) {r = c.r; g = c.g; b = c.b;} + + INTColor operator - ( _In_ const INTColor& c ) const + { + return INTColor(r - c.r, g - c.g, b - c.b); + } + + INTColor& operator += ( _In_ const INTColor& c ) + { + r += c.r; + g += c.g; + b += c.b; + return *this; + } + + INTColor& operator -= ( _In_ const INTColor& c ) + { + r -= c.r; + g -= c.g; + b -= c.b; + return *this; + } + + INTColor& operator &= ( _In_ const INTColor& c ) + { + r &= c.r; + g &= c.g; + b &= c.b; + return *this; + } + + int& operator [] ( _In_ uint8_t i ) + { + assert(i < sizeof(INTColor) / sizeof(int)); + __analysis_assume(i < sizeof(INTColor) / sizeof(int)); + return ((int*) this)[i]; + } + + void Set(_In_ const HDRColorA& c, _In_ bool bSigned) + { + XMHALF4 aF16; + + XMVECTOR v = XMLoadFloat4( (const XMFLOAT4*)& c ); + XMStoreHalf4( &aF16, v ); + + r = F16ToINT(aF16.x, bSigned); + g = F16ToINT(aF16.y, bSigned); + b = F16ToINT(aF16.z, bSigned); + } + + INTColor& Clamp(_In_ int iMin, _In_ int iMax) + { + r = std::min(iMax, std::max(iMin, 
r)); + g = std::min(iMax, std::max(iMin, g)); + b = std::min(iMax, std::max(iMin, b)); + return *this; + } + + INTColor& SignExtend(_In_ const LDRColorA& Prec) + { + r = SIGN_EXTEND(r, Prec.r); + g = SIGN_EXTEND(g, Prec.g); + b = SIGN_EXTEND(b, Prec.b); + return *this; + } + + void ToF16(_Out_cap_c_(3) HALF aF16[3], _In_ bool bSigned) const + { + aF16[0] = INT2F16(r, bSigned); + aF16[1] = INT2F16(g, bSigned); + aF16[2] = INT2F16(b, bSigned); + } + +private: + static int F16ToINT(_In_ const HALF& f, _In_ bool bSigned) + { + uint16_t input = *((const uint16_t*) &f); + int out, s; + if(bSigned) + { + s = input & F16S_MASK; + input &= F16EM_MASK; + if(input > F16MAX) out = F16MAX; + else out = input; + out = s ? -out : out; + } + else + { + if(input & F16S_MASK) out = 0; + else out = input; + } + return out; + } + + static HALF INT2F16(_In_ int input, _In_ bool bSigned) + { + HALF h; + uint16_t out; + if(bSigned) + { + int s = 0; + if(input < 0) + { + s = F16S_MASK; + input = -input; + } + out = uint16_t(s | input); + } + else + { + assert(input >= 0 && input <= F16MAX); + out = (uint16_t) input; + } + + *((uint16_t*) &h) = out; + return h; + } +}; + +struct INTEndPntPair +{ + INTColor A; + INTColor B; +}; + +template< size_t SizeInBytes > +class CBits +{ +public: + uint8_t GetBit(_Inout_ size_t& uStartBit) const + { + assert(uStartBit < 128); + __analysis_assume(uStartBit < 128); + size_t uIndex = uStartBit >> 3; + uint8_t ret = (m_uBits[uIndex] >> (uStartBit - (uIndex << 3))) & 0x01; + uStartBit++; + return ret; + } + + uint8_t GetBits(_Inout_ size_t& uStartBit, _In_ size_t uNumBits) const + { + if(uNumBits == 0) return 0; + assert(uStartBit + uNumBits <= 128 && uNumBits <= 8); + __analysis_assume(uStartBit + uNumBits <= 128 && uNumBits <= 8); + uint8_t ret; + size_t uIndex = uStartBit >> 3; + size_t uBase = uStartBit - (uIndex << 3); + if(uBase + uNumBits > 8) + { + size_t uFirstIndexBits = 8 - uBase; + size_t uNextIndexBits = uNumBits - uFirstIndexBits; + ret = (m_uBits[uIndex] >> uBase) | ((m_uBits[uIndex+1] & ((1 << uNextIndexBits) - 1)) << uFirstIndexBits); + } + else + { + ret = (m_uBits[uIndex] >> uBase) & ((1 << uNumBits) - 1); + } + assert(ret < (1 << uNumBits)); + uStartBit += uNumBits; + return ret; + } + + void SetBit(_Inout_ size_t& uStartBit, _In_ uint8_t uValue) + { + assert(uStartBit < 128 && uValue < 2); + __analysis_assume(uStartBit < 128 && uValue < 2); + size_t uIndex = uStartBit >> 3; + size_t uBase = uStartBit - (uIndex << 3); + m_uBits[uIndex] &= ~(1 << uBase); + m_uBits[uIndex] |= uValue << uBase; + uStartBit++; + } + + void SetBits(_Inout_ size_t& uStartBit, _In_ size_t uNumBits, _In_ uint8_t uValue) + { + if(uNumBits == 0) + return; + assert(uStartBit + uNumBits <= 128 && uNumBits <= 8); + __analysis_assume(uStartBit + uNumBits <= 128 && uNumBits <= 8); + assert(uValue < (1 << uNumBits)); + size_t uIndex = uStartBit >> 3; + size_t uBase = uStartBit - (uIndex << 3); + if(uBase + uNumBits > 8) + { + size_t uFirstIndexBits = 8 - uBase; + size_t uNextIndexBits = uNumBits - uFirstIndexBits; + m_uBits[uIndex] &= ~(((1 << uFirstIndexBits) - 1) << uBase); + m_uBits[uIndex] |= uValue << uBase; + m_uBits[uIndex+1] &= ~((1 << uNextIndexBits) - 1); + m_uBits[uIndex+1] |= uValue >> uFirstIndexBits; + } + else + { + m_uBits[uIndex] &= ~(((1 << uNumBits) - 1) << uBase); + m_uBits[uIndex] |= uValue << uBase; + } + uStartBit += uNumBits; + } + +private: + uint8_t m_uBits[ SizeInBytes ]; +}; + +#pragma warning(push) +#pragma warning(disable : 4127 4480 4512) + +// BC6H compression 
(16 bits per texel) +class D3DX_BC6H : private CBits< 16 > +{ +public: + void Decode(_In_ bool bSigned, _Out_cap_c_(NUM_PIXELS_PER_BLOCK) HDRColorA* pOut) const; + void Encode(_In_ bool bSigned, _In_count_c_(NUM_PIXELS_PER_BLOCK) const HDRColorA* const pIn); + +private: + enum EField : uint8_t + { + NA, // N/A + M, // Mode + D, // Shape + RW, + RX, + RY, + RZ, + GW, + GX, + GY, + GZ, + BW, + BX, + BY, + BZ, + }; + + struct ModeDescriptor + { + EField m_eField; + uint8_t m_uBit; + }; + + struct ModeInfo + { + uint8_t uMode; + uint8_t uPartitions; + bool bTransformed; + uint8_t uIndexPrec; + LDRColorA RGBAPrec[BC6H_MAX_REGIONS][2]; + }; + + struct EncodeParams + { + float fBestErr; + const bool bSigned; + uint8_t uMode; + uint8_t uShape; + const HDRColorA* const aHDRPixels; + INTEndPntPair aUnqEndPts[BC6H_MAX_SHAPES][BC6H_MAX_REGIONS]; + INTColor aIPixels[NUM_PIXELS_PER_BLOCK]; + + EncodeParams(const HDRColorA* const aOriginal, bool bSignedFormat) : + aHDRPixels(aOriginal), fBestErr(FLT_MAX), bSigned(bSignedFormat) + { + for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + aIPixels[i].Set(aOriginal[i], bSigned); + } + } + }; + + static int Quantize(_In_ int iValue, _In_ int prec, _In_ bool bSigned); + static int Unquantize(_In_ int comp, _In_ uint8_t uBitsPerComp, _In_ bool bSigned); + static int FinishUnquantize(_In_ int comp, _In_ bool bSigned); + + static bool EndPointsFit(_In_ const EncodeParams* pEP, _In_count_c_(BC6H_MAX_REGIONS) const INTEndPntPair aEndPts[]); + + void GeneratePaletteQuantized(_In_ const EncodeParams* pEP, _In_ const INTEndPntPair& endPts, + _Out_cap_c_(BC6H_MAX_INDICES) INTColor aPalette[]) const; + float MapColorsQuantized(_In_ const EncodeParams* pEP, _In_count_(np) const INTColor aColors[], _In_ size_t np, _In_ const INTEndPntPair &endPts) const; + float PerturbOne(_In_ const EncodeParams* pEP, _In_count_(np) const INTColor aColors[], _In_ size_t np, _In_ uint8_t ch, + _In_ const INTEndPntPair& oldEndPts, _Out_ INTEndPntPair& newEndPts, _In_ float fOldErr, _In_ int do_b) const; + void OptimizeOne(_In_ const EncodeParams* pEP, _In_count_(np) const INTColor aColors[], _In_ size_t np, _In_ float aOrgErr, + _In_ const INTEndPntPair &aOrgEndPts, _Out_ INTEndPntPair &aOptEndPts) const; + void OptimizeEndPoints(_In_ const EncodeParams* pEP, _In_count_c_(BC6H_MAX_REGIONS) const float aOrgErr[], + _In_count_c_(BC6H_MAX_REGIONS) const INTEndPntPair aOrgEndPts[], + _Inout_count_c_(BC6H_MAX_REGIONS) INTEndPntPair aOptEndPts[]) const; + static void SwapIndices(_In_ const EncodeParams* pEP, _Inout_count_c_(BC6H_MAX_REGIONS) INTEndPntPair aEndPts[], + _In_count_c_(NUM_PIXELS_PER_BLOCK) size_t aIndices[]); + void AssignIndices(_In_ const EncodeParams* pEP, _In_count_c_(BC6H_MAX_REGIONS) const INTEndPntPair aEndPts[], + _Out_cap_c_(NUM_PIXELS_PER_BLOCK) size_t aIndices[], + _Out_cap_c_(BC6H_MAX_REGIONS) float aTotErr[]) const; + void QuantizeEndPts(_In_ const EncodeParams* pEP, _Out_cap_c_(BC6H_MAX_REGIONS) INTEndPntPair* qQntEndPts) const; + void EmitBlock(_In_ const EncodeParams* pEP, _In_count_c_(BC6H_MAX_REGIONS) const INTEndPntPair aEndPts[], + _In_count_c_(NUM_PIXELS_PER_BLOCK) const size_t aIndices[]); + void Refine(_Inout_ EncodeParams* pEP); + + static void GeneratePaletteUnquantized(_In_ const EncodeParams* pEP, _In_ size_t uRegion, _Out_cap_c_(BC6H_MAX_INDICES) INTColor aPalette[]); + float MapColors(_In_ const EncodeParams* pEP, _In_ size_t uRegion, _In_ size_t np, _In_count_(np) const size_t* auIndex) const; + float RoughMSE(_Inout_ EncodeParams* pEP) const; + 
+private: + const static ModeDescriptor ms_aDesc[][82]; + const static ModeInfo ms_aInfo[]; + const static int ms_aModeToInfo[]; +}; + +// BC67 compression (16b bits per texel) +class D3DX_BC7 : private CBits< 16 > +{ +public: + void Decode(_Out_cap_c_(NUM_PIXELS_PER_BLOCK) HDRColorA* pOut) const; + void Encode(_In_count_c_(NUM_PIXELS_PER_BLOCK) const HDRColorA* const pIn); + +private: + struct ModeInfo + { + uint8_t uPartitions; + uint8_t uPartitionBits; + uint8_t uPBits; + uint8_t uRotationBits; + uint8_t uIndexModeBits; + uint8_t uIndexPrec; + uint8_t uIndexPrec2; + LDRColorA RGBAPrec; + LDRColorA RGBAPrecWithP; + }; + + struct EncodeParams + { + uint8_t uMode; + LDREndPntPair aEndPts[BC7_MAX_SHAPES][BC7_MAX_REGIONS]; + LDRColorA aLDRPixels[NUM_PIXELS_PER_BLOCK]; + const HDRColorA* const aHDRPixels; + + EncodeParams(const HDRColorA* const aOriginal) : aHDRPixels(aOriginal) {} + }; + + static uint8_t Quantize(_In_ uint8_t comp, _In_ uint8_t uPrec) + { + assert(0 < uPrec && uPrec <= 8); + uint8_t rnd = (uint8_t) std::min(255, uint16_t(comp) + (1 << (7 - uPrec))); + return rnd >> (8 - uPrec); + } + + static LDRColorA Quantize(_In_ const LDRColorA& c, _In_ const LDRColorA& RGBAPrec) + { + LDRColorA q; + q.r = Quantize(c.r, RGBAPrec.r); + q.g = Quantize(c.g, RGBAPrec.g); + q.b = Quantize(c.b, RGBAPrec.b); + if(RGBAPrec.a) + q.a = Quantize(c.a, RGBAPrec.a); + else + q.a = 255; + return q; + } + + static uint8_t Unquantize(_In_ uint8_t comp, _In_ size_t uPrec) + { + assert(0 < uPrec && uPrec <= 8); + comp = comp << (8 - uPrec); + return comp | (comp >> uPrec); + } + + static LDRColorA Unquantize(_In_ const LDRColorA& c, _In_ const LDRColorA& RGBAPrec) + { + LDRColorA q; + q.r = Unquantize(c.r, RGBAPrec.r); + q.g = Unquantize(c.g, RGBAPrec.g); + q.b = Unquantize(c.b, RGBAPrec.b); + q.a = RGBAPrec.a > 0 ? 
Unquantize(c.a, RGBAPrec.a) : 255; + return q; + } + + void GeneratePaletteQuantized(_In_ const EncodeParams* pEP, _In_ size_t uIndexMode, _In_ const LDREndPntPair& endpts, + _Out_cap_c_(BC7_MAX_INDICES) LDRColorA aPalette[]) const; + float PerturbOne(_In_ const EncodeParams* pEP, _In_count_(np) const LDRColorA colors[], _In_ size_t np, _In_ size_t uIndexMode, + _In_ size_t ch, _In_ const LDREndPntPair &old_endpts, + _Out_ LDREndPntPair &new_endpts, _In_ float old_err, _In_ uint8_t do_b) const; + void Exhaustive(_In_ const EncodeParams* pEP, _In_count_(np) const LDRColorA aColors[], _In_ size_t np, _In_ size_t uIndexMode, + _In_ size_t ch, _Inout_ float& fOrgErr, _Inout_ LDREndPntPair& optEndPt) const; + void OptimizeOne(_In_ const EncodeParams* pEP, _In_count_(np) const LDRColorA colors[], _In_ size_t np, _In_ size_t uIndexMode, + _In_ float orig_err, _In_ const LDREndPntPair &orig_endpts, _Out_ LDREndPntPair &opt_endpts) const; + void OptimizeEndPoints(_In_ const EncodeParams* pEP, _In_ size_t uShape, _In_ size_t uIndexMode, + _In_count_c_(BC7_MAX_REGIONS) const float orig_err[], + _In_count_c_(BC7_MAX_REGIONS) const LDREndPntPair orig_endpts[], + _Out_cap_c_(BC7_MAX_REGIONS) LDREndPntPair opt_endpts[]) const; + void AssignIndices(_In_ const EncodeParams* pEP, _In_ size_t uShape, _In_ size_t uIndexMode, + _In_count_c_(BC7_MAX_REGIONS) LDREndPntPair endpts[], + _Out_cap_c_(NUM_PIXELS_PER_BLOCK) size_t aIndices[], _Out_cap_c_(NUM_PIXELS_PER_BLOCK) size_t aIndices2[], + _Out_cap_c_(BC7_MAX_REGIONS) float afTotErr[]) const; + void EmitBlock(_In_ const EncodeParams* pEP, _In_ size_t uShape, _In_ size_t uRotation, _In_ size_t uIndexMode, + _In_count_c_(BC7_MAX_REGIONS) const LDREndPntPair aEndPts[], + _In_count_c_(NUM_PIXELS_PER_BLOCK) const size_t aIndex[], + _In_count_c_(NUM_PIXELS_PER_BLOCK) const size_t aIndex2[]); + float Refine(_In_ const EncodeParams* pEP, _In_ size_t uShape, _In_ size_t uRotation, _In_ size_t uIndexMode); + + float MapColors(_In_ const EncodeParams* pEP, _In_count_(np) const LDRColorA aColors[], _In_ size_t np, _In_ size_t uIndexMode, + _In_ const LDREndPntPair& endPts, _In_ float fMinErr) const; + static float RoughMSE(_Inout_ EncodeParams* pEP, _In_ size_t uShape, _In_ size_t uIndexMode); + +private: + const static ModeInfo ms_aInfo[]; +}; + +//------------------------------------------------------------------------------------- +template void OptimizeAlpha(float *pX, float *pY, const float *pPoints, size_t cSteps) +{ + static const float pC6[] = { 5.0f/5.0f, 4.0f/5.0f, 3.0f/5.0f, 2.0f/5.0f, 1.0f/5.0f, 0.0f/5.0f }; + static const float pD6[] = { 0.0f/5.0f, 1.0f/5.0f, 2.0f/5.0f, 3.0f/5.0f, 4.0f/5.0f, 5.0f/5.0f }; + static const float pC8[] = { 7.0f/7.0f, 6.0f/7.0f, 5.0f/7.0f, 4.0f/7.0f, 3.0f/7.0f, 2.0f/7.0f, 1.0f/7.0f, 0.0f/7.0f }; + static const float pD8[] = { 0.0f/7.0f, 1.0f/7.0f, 2.0f/7.0f, 3.0f/7.0f, 4.0f/7.0f, 5.0f/7.0f, 6.0f/7.0f, 7.0f/7.0f }; + + const float *pC = (6 == cSteps) ? pC6 : pC8; + const float *pD = (6 == cSteps) ? 
pD6 : pD8; + + float MAX_VALUE = 1.0f; + float MIN_VALUE; + if (bRange) + { + MIN_VALUE = -1.0f; + } + else + { + MIN_VALUE = 0.0f; + } + + // Find Min and Max points, as starting point + float fX = MAX_VALUE; + float fY = MIN_VALUE; + + if(8 == cSteps) + { + for(size_t iPoint = 0; iPoint < NUM_PIXELS_PER_BLOCK; iPoint++) + { + if(pPoints[iPoint] < fX) + fX = pPoints[iPoint]; + + if(pPoints[iPoint] > fY) + fY = pPoints[iPoint]; + } + } + else + { + for(size_t iPoint = 0; iPoint < NUM_PIXELS_PER_BLOCK; iPoint++) + { + if(pPoints[iPoint] < fX && pPoints[iPoint] > MIN_VALUE) + fX = pPoints[iPoint]; + + if(pPoints[iPoint] > fY && pPoints[iPoint] < MAX_VALUE) + fY = pPoints[iPoint]; + } + + if (fX == fY) + { + fY = MAX_VALUE; + } + } + + // Use Newton's Method to find local minima of sum-of-squares error. + float fSteps = (float) (cSteps - 1); + + for(size_t iIteration = 0; iIteration < 8; iIteration++) + { + float fScale; + + if((fY - fX) < (1.0f / 256.0f)) + break; + + fScale = fSteps / (fY - fX); + + // Calculate new steps + float pSteps[8]; + + for(size_t iStep = 0; iStep < cSteps; iStep++) + pSteps[iStep] = pC[iStep] * fX + pD[iStep] * fY; + + if(6 == cSteps) + { + pSteps[6] = MIN_VALUE; + pSteps[7] = MAX_VALUE; + } + + // Evaluate function, and derivatives + float dX = 0.0f; + float dY = 0.0f; + float d2X = 0.0f; + float d2Y = 0.0f; + + for(size_t iPoint = 0; iPoint < NUM_PIXELS_PER_BLOCK; iPoint++) + { + float fDot = (pPoints[iPoint] - fX) * fScale; + + size_t iStep; + + if(fDot <= 0.0f) + iStep = ((6 == cSteps) && (pPoints[iPoint] <= fX * 0.5f)) ? 6 : 0; + else if(fDot >= fSteps) + iStep = ((6 == cSteps) && (pPoints[iPoint] >= (fY + 1.0f) * 0.5f)) ? 7 : (cSteps - 1); + else + iStep = static_cast(fDot + 0.5f); + + + if(iStep < cSteps) + { + // D3DX had this computation backwards (pPoints[iPoint] - pSteps[iStep]) + // this fix improves RMS of the alpha component + float fDiff = pSteps[iStep] - pPoints[iPoint]; + + dX += pC[iStep] * fDiff; + d2X += pC[iStep] * pC[iStep]; + + dY += pD[iStep] * fDiff; + d2Y += pD[iStep] * pD[iStep]; + } + } + + // Move endpoints + if(d2X > 0.0f) + fX -= dX / d2X; + + if(d2Y > 0.0f) + fY -= dY / d2Y; + + if(fX > fY) + { + float f = fX; fX = fY; fY = f; + } + + if((dX * dX < (1.0f / 64.0f)) && (dY * dY < (1.0f / 64.0f))) + break; + } + + *pX = (fX < MIN_VALUE) ? MIN_VALUE : (fX > MAX_VALUE) ? MAX_VALUE : fX; + *pY = (fY < MIN_VALUE) ? MIN_VALUE : (fY > MAX_VALUE) ? 
MAX_VALUE : fY; +} +#pragma warning(pop) + + +//------------------------------------------------------------------------------------- +// Functions +//------------------------------------------------------------------------------------- + +typedef void (*BC_DECODE)(XMVECTOR *pColor, const uint8_t *pBC); +typedef void (*BC_ENCODE)(uint8_t *pDXT, const XMVECTOR *pColor, DWORD flags); + +void D3DXDecodeBC1(_Out_cap_c_(NUM_PIXELS_PER_BLOCK) XMVECTOR *pColor, _In_count_c_(8) const uint8_t *pBC); +void D3DXDecodeBC2(_Out_cap_c_(NUM_PIXELS_PER_BLOCK) XMVECTOR *pColor, _In_count_c_(16) const uint8_t *pBC); +void D3DXDecodeBC3(_Out_cap_c_(NUM_PIXELS_PER_BLOCK) XMVECTOR *pColor, _In_count_c_(16) const uint8_t *pBC); +void D3DXDecodeBC4U(_Out_cap_c_(NUM_PIXELS_PER_BLOCK) XMVECTOR *pColor, _In_count_c_(8) const uint8_t *pBC); +void D3DXDecodeBC4S(_Out_cap_c_(NUM_PIXELS_PER_BLOCK) XMVECTOR *pColor, _In_count_c_(8) const uint8_t *pBC); +void D3DXDecodeBC5U(_Out_cap_c_(NUM_PIXELS_PER_BLOCK) XMVECTOR *pColor, _In_count_c_(16) const uint8_t *pBC); +void D3DXDecodeBC5S(_Out_cap_c_(NUM_PIXELS_PER_BLOCK) XMVECTOR *pColor, _In_count_c_(16) const uint8_t *pBC); +void D3DXDecodeBC6HU(_Out_cap_c_(NUM_PIXELS_PER_BLOCK) XMVECTOR *pColor, _In_count_c_(16) const uint8_t *pBC); +void D3DXDecodeBC6HS(_Out_cap_c_(NUM_PIXELS_PER_BLOCK) XMVECTOR *pColor, _In_count_c_(16) const uint8_t *pBC); +void D3DXDecodeBC7(_Out_cap_c_(NUM_PIXELS_PER_BLOCK) XMVECTOR *pColor, _In_count_c_(16) const uint8_t *pBC); + +void D3DXEncodeBC1(_Out_cap_c_(8) uint8_t *pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const XMVECTOR *pColor, _In_ float alphaRef, _In_ DWORD flags); + // BC1 requires one additional parameter, so it doesn't match signature of BC_ENCODE above + +void D3DXEncodeBC2(_Out_cap_c_(16) uint8_t *pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const XMVECTOR *pColor, _In_ DWORD flags); +void D3DXEncodeBC3(_Out_cap_c_(16) uint8_t *pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const XMVECTOR *pColor, _In_ DWORD flags); +void D3DXEncodeBC4U(_Out_cap_c_(8) uint8_t *pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const XMVECTOR *pColor, _In_ DWORD flags); +void D3DXEncodeBC4S(_Out_cap_c_(8) uint8_t *pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const XMVECTOR *pColor, _In_ DWORD flags); +void D3DXEncodeBC5U(_Out_cap_c_(16) uint8_t *pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const XMVECTOR *pColor, _In_ DWORD flags); +void D3DXEncodeBC5S(_Out_cap_c_(16) uint8_t *pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const XMVECTOR *pColor, _In_ DWORD flags); +void D3DXEncodeBC6HU(_Out_cap_c_(16) uint8_t *pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const XMVECTOR *pColor, _In_ DWORD flags); +void D3DXEncodeBC6HS(_Out_cap_c_(16) uint8_t *pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const XMVECTOR *pColor, _In_ DWORD flags); +void D3DXEncodeBC7(_Out_cap_c_(16) uint8_t *pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const XMVECTOR *pColor, _In_ DWORD flags); + +}; // namespace diff --git a/thirdparty/directxtex/DirectXTex/BC4BC5.cpp b/thirdparty/directxtex/DirectXTex/BC4BC5.cpp new file mode 100644 index 0000000..62b5aee --- /dev/null +++ b/thirdparty/directxtex/DirectXTex/BC4BC5.cpp @@ -0,0 +1,534 @@ +//------------------------------------------------------------------------------------- +// BC4BC5.cpp +// +// Block-compression (BC) functionality for BC4 and BC5 (DirectX 10 texture compression) +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A 
+// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkId=248926 +//------------------------------------------------------------------------------------- + +#include "directxtexp.h" + +#include "BC.h" + +#pragma warning(disable : 4201) + +namespace DirectX +{ + +//------------------------------------------------------------------------------------ +// Constants +//------------------------------------------------------------------------------------ + +// Because these are used in SAL annotations, they need to remain macros rather than const values +#define BLOCK_LEN 4 + // length of each block in texel + +#define BLOCK_SIZE (BLOCK_LEN * BLOCK_LEN) + // total texels in a 4x4 block. + +//------------------------------------------------------------------------------------ +// Structures +//------------------------------------------------------------------------------------- + +// BC4U/BC5U +struct BC4_UNORM +{ + float R(size_t uOffset) const + { + size_t uIndex = GetIndex(uOffset); + return DecodeFromIndex(uIndex); + } + + float DecodeFromIndex(size_t uIndex) const + { + if (uIndex == 0) + return red_0 / 255.0f; + if (uIndex == 1) + return red_1 / 255.0f; + float fred_0 = red_0 / 255.0f; + float fred_1 = red_1 / 255.0f; + if (red_0 > red_1) + { + uIndex -= 1; + return (fred_0 * (7-uIndex) + fred_1 * uIndex) / 7.0f; + } + else + { + if (uIndex == 6) + return 0.0f; + if (uIndex == 7) + return 1.0f; + uIndex -= 1; + return (fred_0 * (5-uIndex) + fred_1 * uIndex) / 5.0f; + } + } + + size_t GetIndex(size_t uOffset) const + { + return (size_t) ((data >> (3*uOffset + 16)) & 0x07); + } + + void SetIndex(size_t uOffset, size_t uIndex) + { + data &= ~((uint64_t) 0x07 << (3*uOffset + 16)); + data |= ((uint64_t) uIndex << (3*uOffset + 16)); + } + + union + { + struct + { + uint8_t red_0; + uint8_t red_1; + uint8_t indices[6]; + }; + uint64_t data; + }; +}; + +// BC4S/BC5S +struct BC4_SNORM +{ + float R(size_t uOffset) const + { + size_t uIndex = GetIndex(uOffset); + return DecodeFromIndex(uIndex); + } + + float DecodeFromIndex(size_t uIndex) const + { + int8_t sred_0 = (red_0 == -128)? -127 : red_0; + int8_t sred_1 = (red_1 == -128)? 
-127 : red_1; + + if (uIndex == 0) + return sred_0 / 127.0f; + if (uIndex == 1) + return sred_1 / 127.0f; + float fred_0 = sred_0 / 127.0f; + float fred_1 = sred_1 / 127.0f; + if (red_0 > red_1) + { + uIndex -= 1; + return (fred_0 * (7-uIndex) + fred_1 * uIndex) / 7.0f; + } + else + { + if (uIndex == 6) + return -1.0f; + if (uIndex == 7) + return 1.0f; + uIndex -= 1; + return (fred_0 * (5-uIndex) + fred_1 * uIndex) / 5.0f; + } + } + + size_t GetIndex(size_t uOffset) const + { + return (size_t) ((data >> (3*uOffset + 16)) & 0x07); + } + + void SetIndex(size_t uOffset, size_t uIndex) + { + data &= ~((uint64_t) 0x07 << (3*uOffset + 16)); + data |= ((uint64_t) uIndex << (3*uOffset + 16)); + } + + union + { + struct + { + int8_t red_0; + int8_t red_1; + uint8_t indices[6]; + }; + uint64_t data; + }; +}; + + +//------------------------------------------------------------------------------------- +// Convert a floating point value to an 8-bit SNORM +//------------------------------------------------------------------------------------- +static void inline FloatToSNorm( _In_ float fVal, _Out_ int8_t *piSNorm ) +{ + const uint32_t dwMostNeg = ( 1 << ( 8 * sizeof( int8_t ) - 1 ) ); + + if( _isnan( fVal ) ) + fVal = 0; + else + if( fVal > 1 ) + fVal = 1; // Clamp to 1 + else + if( fVal < -1 ) + fVal = -1; // Clamp to -1 + + fVal = fVal * (int8_t) ( dwMostNeg - 1 ); + + if( fVal >= 0 ) + fVal += .5f; + else + fVal -= .5f; + + *piSNorm = (int8_t) (fVal); +} + + +//------------------------------------------------------------------------------ +static void FindEndPointsBC4U( _In_count_c_(BLOCK_SIZE) const float theTexelsU[], _Out_ uint8_t &endpointU_0, _Out_ uint8_t &endpointU_1) +{ + // The boundary of codec for signed/unsigned format + float MIN_NORM; + float MAX_NORM = 1.0f; + int8_t iStart, iEnd; + size_t i; + + MIN_NORM = 0.0f; + + // Find max/min of input texels + float fBlockMax = theTexelsU[0]; + float fBlockMin = theTexelsU[0]; + for (i = 0; i < BLOCK_SIZE; ++i) + { + if (theTexelsU[i]fBlockMax) + { + fBlockMax = theTexelsU[i]; + } + } + + // If there are boundary values in input texels, Should use 4 block-codec to guarantee + // the exact code of the boundary values. + bool bUsing4BlockCodec = ( MIN_NORM == fBlockMin || MAX_NORM == fBlockMax ); + + // Using Optimize + float fStart, fEnd; + + if (!bUsing4BlockCodec) + { + OptimizeAlpha(&fStart, &fEnd, theTexelsU, 8); + + iStart = (uint8_t) (fStart * 255.0f); + iEnd = (uint8_t) (fEnd * 255.0f); + + endpointU_0 = iEnd; + endpointU_1 = iStart; + } + else + { + OptimizeAlpha(&fStart, &fEnd, theTexelsU, 6); + + iStart = (uint8_t) (fStart * 255.0f); + iEnd = (uint8_t) (fEnd * 255.0f); + + endpointU_1 = iEnd; + endpointU_0 = iStart; + } +} + +static void FindEndPointsBC4S(_In_count_c_(BLOCK_SIZE) const float theTexelsU[], _Out_ int8_t &endpointU_0, _Out_ int8_t &endpointU_1) +{ + // The boundary of codec for signed/unsigned format + float MIN_NORM; + float MAX_NORM = 1.0f; + int8_t iStart, iEnd; + size_t i; + + MIN_NORM = -1.0f; + + // Find max/min of input texels + float fBlockMax = theTexelsU[0]; + float fBlockMin = theTexelsU[0]; + for (i = 0; i < BLOCK_SIZE; ++i) + { + if (theTexelsU[i]fBlockMax) + { + fBlockMax = theTexelsU[i]; + } + } + + // If there are boundary values in input texels, Should use 4 block-codec to guarantee + // the exact code of the boundary values. 
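+    // Note on the two BC4 palette modes (see BC4_SNORM::DecodeFromIndex above):
+    //   red_0 >  red_1 : indices 0..7 select red_0, red_1 and 6 interpolated values.
+    //   red_0 <= red_1 : indices 0..5 select red_0, red_1 and 4 interpolated values,
+    //                    while indices 6 and 7 decode to the exact extremes (-1.0 / 1.0
+    //                    here, 0.0 / 1.0 in the UNORM variant).
+    // So when the block already contains a boundary value, the endpoints are ordered below
+    // so the second mode is selected and OptimizeAlpha runs with 6 steps instead of 8.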
+ bool bUsing4BlockCodec = ( MIN_NORM == fBlockMin || MAX_NORM == fBlockMax ); + + // Using Optimize + float fStart, fEnd; + + if (!bUsing4BlockCodec) + { + OptimizeAlpha(&fStart, &fEnd, theTexelsU, 8); + + FloatToSNorm(fStart, &iStart); + FloatToSNorm(fEnd, &iEnd); + + endpointU_0 = iEnd; + endpointU_1 = iStart; + } + else + { + OptimizeAlpha(&fStart, &fEnd, theTexelsU, 6); + + FloatToSNorm(fStart, &iStart); + FloatToSNorm(fEnd, &iEnd); + + endpointU_1 = iEnd; + endpointU_0 = iStart; + } +} + + +//------------------------------------------------------------------------------ +static inline void FindEndPointsBC5U( _In_count_c_(BLOCK_SIZE) const float theTexelsU[], _In_count_c_(BLOCK_SIZE) const float theTexelsV[], + _Out_ uint8_t &endpointU_0, _Out_ uint8_t &endpointU_1, _Out_ uint8_t &endpointV_0, _Out_ uint8_t &endpointV_1) +{ + //Encoding the U and V channel by BC4 codec separately. + FindEndPointsBC4U( theTexelsU, endpointU_0, endpointU_1); + FindEndPointsBC4U( theTexelsV, endpointV_0, endpointV_1); +} + +static inline void FindEndPointsBC5S( _In_count_c_(BLOCK_SIZE) const float theTexelsU[], _In_count_c_(BLOCK_SIZE) const float theTexelsV[], + _Out_ int8_t &endpointU_0, _Out_ int8_t &endpointU_1, _Out_ int8_t &endpointV_0, _Out_ int8_t &endpointV_1) +{ + //Encoding the U and V channel by BC4 codec separately. + FindEndPointsBC4S( theTexelsU, endpointU_0, endpointU_1); + FindEndPointsBC4S( theTexelsV, endpointV_0, endpointV_1); +} + + +//------------------------------------------------------------------------------ +static void FindClosestUNORM(_Inout_ BC4_UNORM* pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const float theTexelsU[]) +{ + float rGradient[8]; + int i; + for (i = 0; i < 8; ++i) + { + rGradient[i] = pBC->DecodeFromIndex(i); + } + for (i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + size_t uBestIndex = 0; + float fBestDelta = 100000; + for (size_t uIndex = 0; uIndex < 8; uIndex++) + { + float fCurrentDelta = fabsf(rGradient[uIndex]-theTexelsU[i]); + if (fCurrentDelta < fBestDelta) + { + uBestIndex = uIndex; + fBestDelta = fCurrentDelta; + } + } + pBC->SetIndex(i, uBestIndex); + } +} + +static void FindClosestSNORM(_Inout_ BC4_SNORM* pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const float theTexelsU[]) +{ + float rGradient[8]; + int i; + for (i = 0; i < 8; ++i) + { + rGradient[i] = pBC->DecodeFromIndex(i); + } + for (i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + size_t uBestIndex = 0; + float fBestDelta = 100000; + for (size_t uIndex = 0; uIndex < 8; uIndex++) + { + float fCurrentDelta = fabsf(rGradient[uIndex]-theTexelsU[i]); + if (fCurrentDelta < fBestDelta) + { + uBestIndex = uIndex; + fBestDelta = fCurrentDelta; + } + } + pBC->SetIndex(i, uBestIndex); + } +} + + +//===================================================================================== +// Entry points +//===================================================================================== + +//------------------------------------------------------------------------------------- +// BC4 Compression +//------------------------------------------------------------------------------------- +void D3DXDecodeBC4U( XMVECTOR *pColor, const uint8_t *pBC ) +{ + assert( pColor && pBC ); + static_assert( sizeof(BC4_UNORM) == 8, "BC4_UNORM should be 8 bytes" ); + + const BC4_UNORM * pBC4 = reinterpret_cast(pBC); + + for (size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + pColor[i] = XMVectorSet( pBC4->R(i), 0, 0, 1.0f); + } +} + +void D3DXDecodeBC4S(XMVECTOR *pColor, const uint8_t *pBC) +{ + assert( pColor && pBC ); + static_assert( 
sizeof(BC4_SNORM) == 8, "BC4_SNORM should be 8 bytes" ); + + const BC4_SNORM * pBC4 = reinterpret_cast(pBC); + + for (size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + pColor[i] = XMVectorSet( pBC4->R(i), 0, 0, 1.0f); + } +} + +void D3DXEncodeBC4U( uint8_t *pBC, const XMVECTOR *pColor, DWORD flags ) +{ + UNREFERENCED_PARAMETER( flags ); + + assert( pBC && pColor ); + static_assert( sizeof(BC4_UNORM) == 8, "BC4_UNORM should be 8 bytes" ); + + memset(pBC, 0, sizeof(BC4_UNORM)); + BC4_UNORM * pBC4 = reinterpret_cast(pBC); + float theTexelsU[NUM_PIXELS_PER_BLOCK]; + + for (size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + theTexelsU[i] = XMVectorGetX( pColor[i] ); + } + + FindEndPointsBC4U(theTexelsU, pBC4->red_0, pBC4->red_1); + FindClosestUNORM(pBC4, theTexelsU); +} + +void D3DXEncodeBC4S( uint8_t *pBC, const XMVECTOR *pColor, DWORD flags ) +{ + UNREFERENCED_PARAMETER( flags ); + + assert( pBC && pColor ); + static_assert( sizeof(BC4_SNORM) == 8, "BC4_SNORM should be 8 bytes" ); + + memset(pBC, 0, sizeof(BC4_UNORM)); + BC4_SNORM * pBC4 = reinterpret_cast(pBC); + float theTexelsU[NUM_PIXELS_PER_BLOCK]; + + for (size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + theTexelsU[i] = XMVectorGetX( pColor[i] ); + } + + FindEndPointsBC4S(theTexelsU, pBC4->red_0, pBC4->red_1); + FindClosestSNORM(pBC4, theTexelsU); +} + + +//------------------------------------------------------------------------------------- +// BC5 Compression +//------------------------------------------------------------------------------------- +void D3DXDecodeBC5U(XMVECTOR *pColor, const uint8_t *pBC) +{ + assert( pColor && pBC ); + static_assert( sizeof(BC4_UNORM) == 8, "BC4_UNORM should be 8 bytes" ); + + const BC4_UNORM * pBCR = reinterpret_cast(pBC); + const BC4_UNORM * pBCG = reinterpret_cast(pBC+sizeof(BC4_UNORM)); + + for (size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + pColor[i] = XMVectorSet(pBCR->R(i), pBCG->R(i), 0, 1.0f); + } +} + +void D3DXDecodeBC5S(XMVECTOR *pColor, const uint8_t *pBC) +{ + assert( pColor && pBC ); + static_assert( sizeof(BC4_SNORM) == 8, "BC4_SNORM should be 8 bytes" ); + + const BC4_SNORM * pBCR = reinterpret_cast(pBC); + const BC4_SNORM * pBCG = reinterpret_cast(pBC+sizeof(BC4_SNORM)); + + for (size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + pColor[i] = XMVectorSet(pBCR->R(i), pBCG->R(i), 0, 1.0f); + } +} + +void D3DXEncodeBC5U( uint8_t *pBC, const XMVECTOR *pColor, DWORD flags ) +{ + UNREFERENCED_PARAMETER( flags ); + + assert( pBC && pColor ); + static_assert( sizeof(BC4_UNORM) == 8, "BC4_UNORM should be 8 bytes" ); + + memset(pBC, 0, sizeof(BC4_UNORM)*2); + BC4_UNORM * pBCR = reinterpret_cast(pBC); + BC4_UNORM * pBCG = reinterpret_cast(pBC+sizeof(BC4_UNORM)); + float theTexelsU[NUM_PIXELS_PER_BLOCK]; + float theTexelsV[NUM_PIXELS_PER_BLOCK]; + + for (size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + XMFLOAT4A clr; + XMStoreFloat4A( &clr, pColor[i] ); + theTexelsU[i] = clr.x; + theTexelsV[i] = clr.y; + } + + FindEndPointsBC5U( + theTexelsU, + theTexelsV, + pBCR->red_0, + pBCR->red_1, + pBCG->red_0, + pBCG->red_1); + + FindClosestUNORM(pBCR, theTexelsU); + FindClosestUNORM(pBCG, theTexelsV); +} + +void D3DXEncodeBC5S( uint8_t *pBC, const XMVECTOR *pColor, DWORD flags ) +{ + UNREFERENCED_PARAMETER( flags ); + + assert( pBC && pColor ); + static_assert( sizeof(BC4_SNORM) == 8, "BC4_SNORM should be 8 bytes" ); + + memset(pBC, 0, sizeof(BC4_UNORM)*2); + BC4_SNORM * pBCR = reinterpret_cast(pBC); + BC4_SNORM * pBCG = reinterpret_cast(pBC+sizeof(BC4_SNORM)); + float 
theTexelsU[NUM_PIXELS_PER_BLOCK]; + float theTexelsV[NUM_PIXELS_PER_BLOCK]; + + for (size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + XMFLOAT4A clr; + XMStoreFloat4A( &clr, pColor[i] ); + theTexelsU[i] = clr.x; + theTexelsV[i] = clr.y; + } + + FindEndPointsBC5S( + theTexelsU, + theTexelsV, + pBCR->red_0, + pBCR->red_1, + pBCG->red_0, + pBCG->red_1); + + FindClosestSNORM(pBCR, theTexelsU); + FindClosestSNORM(pBCG, theTexelsV); +} + +} // namespace \ No newline at end of file diff --git a/thirdparty/directxtex/DirectXTex/BC6HBC7.cpp b/thirdparty/directxtex/DirectXTex/BC6HBC7.cpp new file mode 100644 index 0000000..5ed640c --- /dev/null +++ b/thirdparty/directxtex/DirectXTex/BC6HBC7.cpp @@ -0,0 +1,2822 @@ +//------------------------------------------------------------------------------------- +// BC6HBC7.cpp +// +// Block-compression (BC) functionality for BC6H and BC7 (DirectX 11 texture compression) +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkId=248926 +//------------------------------------------------------------------------------------- + +#include "directxtexp.h" + +#include "BC.h" + +#ifndef USE_XNAMATH +using namespace DirectX::PackedVector; +#endif + +namespace DirectX +{ + +//------------------------------------------------------------------------------------- +// Constants +//------------------------------------------------------------------------------------- + +static const float fEpsilon = (0.25f / 64.0f) * (0.25f / 64.0f); +static const float pC3[] = { 2.0f/2.0f, 1.0f/2.0f, 0.0f/2.0f }; +static const float pD3[] = { 0.0f/2.0f, 1.0f/2.0f, 2.0f/2.0f }; +static const float pC4[] = { 3.0f/3.0f, 2.0f/3.0f, 1.0f/3.0f, 0.0f/3.0f }; +static const float pD4[] = { 0.0f/3.0f, 1.0f/3.0f, 2.0f/3.0f, 3.0f/3.0f }; + +const int g_aWeights2[] = {0, 21, 43, 64}; +const int g_aWeights3[] = {0, 9, 18, 27, 37, 46, 55, 64}; +const int g_aWeights4[] = {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64}; + +// Partition, Shape, Pixel (index into 4x4 block) +static const uint8_t g_aPartitionTable[3][64][16] = +{ + { // 1 Region case has no subsets (all 0) + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } + }, + + { // BC6H/BC7 Partition Set for 2 Subsets + { 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1 }, // Shape 0 + { 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1 }, // Shape 1 + { 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1 }, // Shape 2 + { 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1 }, // Shape 3 + { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1 }, // Shape 4 + { 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, // Shape 5 + { 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, // Shape 6 + { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1 }, // Shape 7 + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1 }, // Shape 8 + { 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, // Shape 9 + { 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, // Shape 10 + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1 }, // Shape 11 + { 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, // Shape 12 + { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }, // Shape 13 + { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, // Shape 14 + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1 }, // Shape 15 + { 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1 }, // Shape 16 + { 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 }, // Shape 17 + { 0, 0, 0, 0, 0, 0, 0, 0, 
1, 0, 0, 0, 1, 1, 1, 0 }, // Shape 18 + { 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0 }, // Shape 19 + { 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 }, // Shape 20 + { 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0 }, // Shape 21 + { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0 }, // Shape 22 + { 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1 }, // Shape 23 + { 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0 }, // Shape 24 + { 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0 }, // Shape 25 + { 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0 }, // Shape 26 + { 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0 }, // Shape 27 + { 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0 }, // Shape 28 + { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 }, // Shape 29 + { 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0 }, // Shape 30 + { 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0 }, // Shape 31 + + // BC7 Partition Set for 2 Subsets (second-half) + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 }, // Shape 32 + { 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1 }, // Shape 33 + { 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0 }, // Shape 34 + { 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0 }, // Shape 35 + { 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0 }, // Shape 36 + { 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0 }, // Shape 37 + { 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1 }, // Shape 38 + { 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1 }, // Shape 39 + { 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0 }, // Shape 40 + { 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0 }, // Shape 41 + { 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0 }, // Shape 42 + { 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0 }, // Shape 43 + { 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0 }, // Shape 44 + { 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1 }, // Shape 45 + { 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1 }, // Shape 46 + { 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0 }, // Shape 47 + { 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0 }, // Shape 48 + { 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0 }, // Shape 49 + { 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0 }, // Shape 50 + { 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0 }, // Shape 51 + { 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1 }, // Shape 52 + { 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1 }, // Shape 53 + { 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0 }, // Shape 54 + { 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0 }, // Shape 55 + { 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1 }, // Shape 56 + { 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1 }, // Shape 57 + { 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1 }, // Shape 58 + { 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1 }, // Shape 59 + { 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1 }, // Shape 60 + { 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 }, // Shape 61 + { 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0 }, // Shape 62 + { 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1 } // Shape 63 + }, + + { // BC7 Partition Set for 3 Subsets + { 0, 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 1, 2, 2, 2, 2 }, // Shape 0 + { 0, 0, 0, 1, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1 }, // Shape 1 + { 0, 0, 0, 0, 2, 0, 0, 1, 2, 2, 1, 1, 2, 2, 1, 1 }, // Shape 2 + { 0, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 1, 0, 1, 1, 1 }, // Shape 3 + { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2 }, // Shape 4 + { 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 2, 2 }, // Shape 5 + { 0, 0, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1 }, // 
Shape 6 + { 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1 }, // Shape 7 + { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2 }, // Shape 8 + { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2 }, // Shape 9 + { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2 }, // Shape 10 + { 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2 }, // Shape 11 + { 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2 }, // Shape 12 + { 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 2 }, // Shape 13 + { 0, 0, 1, 1, 0, 1, 1, 2, 1, 1, 2, 2, 1, 2, 2, 2 }, // Shape 14 + { 0, 0, 1, 1, 2, 0, 0, 1, 2, 2, 0, 0, 2, 2, 2, 0 }, // Shape 15 + { 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 2, 1, 1, 2, 2 }, // Shape 16 + { 0, 1, 1, 1, 0, 0, 1, 1, 2, 0, 0, 1, 2, 2, 0, 0 }, // Shape 17 + { 0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2 }, // Shape 18 + { 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1 }, // Shape 19 + { 0, 1, 1, 1, 0, 1, 1, 1, 0, 2, 2, 2, 0, 2, 2, 2 }, // Shape 20 + { 0, 0, 0, 1, 0, 0, 0, 1, 2, 2, 2, 1, 2, 2, 2, 1 }, // Shape 21 + { 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2 }, // Shape 22 + { 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 1, 0, 2, 2, 1, 0 }, // Shape 23 + { 0, 1, 2, 2, 0, 1, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0 }, // Shape 24 + { 0, 0, 1, 2, 0, 0, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2 }, // Shape 25 + { 0, 1, 1, 0, 1, 2, 2, 1, 1, 2, 2, 1, 0, 1, 1, 0 }, // Shape 26 + { 0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 1, 1, 2, 2, 1 }, // Shape 27 + { 0, 0, 2, 2, 1, 1, 0, 2, 1, 1, 0, 2, 0, 0, 2, 2 }, // Shape 28 + { 0, 1, 1, 0, 0, 1, 1, 0, 2, 0, 0, 2, 2, 2, 2, 2 }, // Shape 29 + { 0, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 0, 1, 1 }, // Shape 30 + { 0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 1, 1, 2, 2, 2, 1 }, // Shape 31 + { 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 2, 2, 2 }, // Shape 32 + { 0, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 2, 0, 0, 1, 1 }, // Shape 33 + { 0, 0, 1, 1, 0, 0, 1, 2, 0, 0, 2, 2, 0, 2, 2, 2 }, // Shape 34 + { 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0 }, // Shape 35 + { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0 }, // Shape 36 + { 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0 }, // Shape 37 + { 0, 1, 2, 0, 2, 0, 1, 2, 1, 2, 0, 1, 0, 1, 2, 0 }, // Shape 38 + { 0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2, 0, 0, 1, 1 }, // Shape 39 + { 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0, 1, 1 }, // Shape 40 + { 0, 1, 0, 1, 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2 }, // Shape 41 + { 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 1, 2, 1, 2, 1 }, // Shape 42 + { 0, 0, 2, 2, 1, 1, 2, 2, 0, 0, 2, 2, 1, 1, 2, 2 }, // Shape 43 + { 0, 0, 2, 2, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 1, 1 }, // Shape 44 + { 0, 2, 2, 0, 1, 2, 2, 1, 0, 2, 2, 0, 1, 2, 2, 1 }, // Shape 45 + { 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 0, 1 }, // Shape 46 + { 0, 0, 0, 0, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1 }, // Shape 47 + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 2, 2, 2 }, // Shape 48 + { 0, 2, 2, 2, 0, 1, 1, 1, 0, 2, 2, 2, 0, 1, 1, 1 }, // Shape 49 + { 0, 0, 0, 2, 1, 1, 1, 2, 0, 0, 0, 2, 1, 1, 1, 2 }, // Shape 50 + { 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2 }, // Shape 51 + { 0, 2, 2, 2, 0, 1, 1, 1, 0, 1, 1, 1, 0, 2, 2, 2 }, // Shape 52 + { 0, 0, 0, 2, 1, 1, 1, 2, 1, 1, 1, 2, 0, 0, 0, 2 }, // Shape 53 + { 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 2, 2 }, // Shape 54 + { 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 1, 2 }, // Shape 55 + { 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2 }, // Shape 56 + { 0, 0, 2, 2, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 2, 2 }, // Shape 57 + { 0, 0, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 0, 0, 2, 2 }, // Shape 58 + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2 }, // Shape 59 + { 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 1 
}, // Shape 60 + { 0, 2, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 1, 2, 2, 2 }, // Shape 61 + { 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }, // Shape 62 + { 0, 1, 1, 1, 2, 0, 1, 1, 2, 2, 0, 1, 2, 2, 2, 0 } // Shape 63 + } +}; + +// Partition, Shape, Fixup +static const uint8_t g_aFixUp[3][64][3] = +{ + { // No fix-ups for 1st subset for BC6H or BC7 + { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, + { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, + { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, + { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, + { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, + { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, + { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, + { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, + { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, + { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, + { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, + { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, + { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, + { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, + { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, + { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0} + }, + + { // BC6H/BC7 Partition Set Fixups for 2 Subsets + { 0,15, 0}, { 0,15, 0}, { 0,15, 0}, { 0,15, 0}, + { 0,15, 0}, { 0,15, 0}, { 0,15, 0}, { 0,15, 0}, + { 0,15, 0}, { 0,15, 0}, { 0,15, 0}, { 0,15, 0}, + { 0,15, 0}, { 0,15, 0}, { 0,15, 0}, { 0,15, 0}, + { 0,15, 0}, { 0, 2, 0}, { 0, 8, 0}, { 0, 2, 0}, + { 0, 2, 0}, { 0, 8, 0}, { 0, 8, 0}, { 0,15, 0}, + { 0, 2, 0}, { 0, 8, 0}, { 0, 2, 0}, { 0, 2, 0}, + { 0, 8, 0}, { 0, 8, 0}, { 0, 2, 0}, { 0, 2, 0}, + + // BC7 Partition Set Fixups for 2 Subsets (second-half) + { 0,15, 0}, { 0,15, 0}, { 0, 6, 0}, { 0, 8, 0}, + { 0, 2, 0}, { 0, 8, 0}, { 0,15, 0}, { 0,15, 0}, + { 0, 2, 0}, { 0, 8, 0}, { 0, 2, 0}, { 0, 2, 0}, + { 0, 2, 0}, { 0,15, 0}, { 0,15, 0}, { 0, 6, 0}, + { 0, 6, 0}, { 0, 2, 0}, { 0, 6, 0}, { 0, 8, 0}, + { 0,15, 0}, { 0,15, 0}, { 0, 2, 0}, { 0, 2, 0}, + { 0,15, 0}, { 0,15, 0}, { 0,15, 0}, { 0,15, 0}, + { 0,15, 0}, { 0, 2, 0}, { 0, 2, 0}, { 0,15, 0} + }, + + { // BC7 Partition Set Fixups for 3 Subsets + { 0, 3,15}, { 0, 3, 8}, { 0,15, 8}, { 0,15, 3}, + { 0, 8,15}, { 0, 3,15}, { 0,15, 3}, { 0,15, 8}, + { 0, 8,15}, { 0, 8,15}, { 0, 6,15}, { 0, 6,15}, + { 0, 6,15}, { 0, 5,15}, { 0, 3,15}, { 0, 3, 8}, + { 0, 3,15}, { 0, 3, 8}, { 0, 8,15}, { 0,15, 3}, + { 0, 3,15}, { 0, 3, 8}, { 0, 6,15}, { 0,10, 8}, + { 0, 5, 3}, { 0, 8,15}, { 0, 8, 6}, { 0, 6,10}, + { 0, 8,15}, { 0, 5,15}, { 0,15,10}, { 0,15, 8}, + { 0, 8,15}, { 0,15, 3}, { 0, 3,15}, { 0, 5,10}, + { 0, 6,10}, { 0,10, 8}, { 0, 8, 9}, { 0,15,10}, + { 0,15, 6}, { 0, 3,15}, { 0,15, 8}, { 0, 5,15}, + { 0,15, 3}, { 0,15, 6}, { 0,15, 6}, { 0,15, 8}, + { 0, 3,15}, { 0,15, 3}, { 0, 5,15}, { 0, 5,15}, + { 0, 5,15}, { 0, 8,15}, { 0, 5,15}, { 0,10,15}, + { 0, 5,15}, { 0,10,15}, { 0, 8,15}, { 0,13,15}, + { 0,15, 3}, { 0,12,15}, { 0, 3,15}, { 0, 3, 8} + } +}; + +// BC6H Compression +const D3DX_BC6H::ModeDescriptor D3DX_BC6H::ms_aDesc[14][82] = +{ + { // 0x00 - 10 5 5 5 + { M, 0}, { M, 1}, {GY, 4}, {BY, 4}, {BZ, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4}, + {RW, 5}, {RW, 6}, {RW, 7}, {RW, 8}, {RW, 9}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4}, + {GW, 5}, {GW, 6}, {GW, 7}, {GW, 8}, {GW, 9}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4}, + {BW, 5}, {BW, 6}, {BW, 7}, {BW, 8}, {BW, 9}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RX, 4}, + {GZ, 4}, {GY, 0}, {GY, 1}, {GY, 2}, {GY, 3}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GX, 4}, + {BZ, 0}, {GZ, 0}, {GZ, 1}, {GZ, 2}, {GZ, 3}, {BX, 0}, {BX, 
1}, {BX, 2}, {BX, 3}, {BX, 4}, + {BZ, 1}, {BY, 0}, {BY, 1}, {BY, 2}, {BY, 3}, {RY, 0}, {RY, 1}, {RY, 2}, {RY, 3}, {RY, 4}, + {BZ, 2}, {RZ, 0}, {RZ, 1}, {RZ, 2}, {RZ, 3}, {RZ, 4}, {BZ, 3}, { D, 0}, { D, 1}, { D, 2}, + { D, 3}, { D, 4}, + }, + + { // 0x01 - 7 6 6 6 + { M, 0}, { M, 1}, {GY, 5}, {GZ, 4}, {GZ, 5}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4}, + {RW, 5}, {RW, 6}, {BZ, 0}, {BZ, 1}, {BY, 4}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4}, + {GW, 5}, {GW, 6}, {BY, 5}, {BZ, 2}, {GY, 4}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4}, + {BW, 5}, {BW, 6}, {BZ, 3}, {BZ, 5}, {BZ, 4}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RX, 4}, + {RX, 5}, {GY, 0}, {GY, 1}, {GY, 2}, {GY, 3}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GX, 4}, + {GX, 5}, {GZ, 0}, {GZ, 1}, {GZ, 2}, {GZ, 3}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BX, 4}, + {BX, 5}, {BY, 0}, {BY, 1}, {BY, 2}, {BY, 3}, {RY, 0}, {RY, 1}, {RY, 2}, {RY, 3}, {RY, 4}, + {RY, 5}, {RZ, 0}, {RZ, 1}, {RZ, 2}, {RZ, 3}, {RZ, 4}, {RZ, 5}, { D, 0}, { D, 1}, { D, 2}, + { D, 3}, { D, 4}, + }, + + { // 0x02 - 11 5 4 4 + { M, 0}, { M, 1}, { M, 2}, { M, 3}, { M, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4}, + {RW, 5}, {RW, 6}, {RW, 7}, {RW, 8}, {RW, 9}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4}, + {GW, 5}, {GW, 6}, {GW, 7}, {GW, 8}, {GW, 9}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4}, + {BW, 5}, {BW, 6}, {BW, 7}, {BW, 8}, {BW, 9}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RX, 4}, + {RW,10}, {GY, 0}, {GY, 1}, {GY, 2}, {GY, 3}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GW,10}, + {BZ, 0}, {GZ, 0}, {GZ, 1}, {GZ, 2}, {GZ, 3}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BW,10}, + {BZ, 1}, {BY, 0}, {BY, 1}, {BY, 2}, {BY, 3}, {RY, 0}, {RY, 1}, {RY, 2}, {RY, 3}, {RY, 4}, + {BZ, 2}, {RZ, 0}, {RZ, 1}, {RZ, 2}, {RZ, 3}, {RZ, 4}, {BZ, 3}, { D, 0}, { D, 1}, { D, 2}, + { D, 3}, { D, 4}, + }, + + { // 0x06 - 11 4 5 4 + { M, 0}, { M, 1}, { M, 2}, { M, 3}, { M, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4}, + {RW, 5}, {RW, 6}, {RW, 7}, {RW, 8}, {RW, 9}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4}, + {GW, 5}, {GW, 6}, {GW, 7}, {GW, 8}, {GW, 9}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4}, + {BW, 5}, {BW, 6}, {BW, 7}, {BW, 8}, {BW, 9}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RW,10}, + {GZ, 4}, {GY, 0}, {GY, 1}, {GY, 2}, {GY, 3}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GX, 4}, + {GW,10}, {GZ, 0}, {GZ, 1}, {GZ, 2}, {GZ, 3}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BW,10}, + {BZ, 1}, {BY, 0}, {BY, 1}, {BY, 2}, {BY, 3}, {RY, 0}, {RY, 1}, {RY, 2}, {RY, 3}, {BZ, 0}, + {BZ, 2}, {RZ, 0}, {RZ, 1}, {RZ, 2}, {RZ, 3}, {GY, 4}, {BZ, 3}, { D, 0}, { D, 1}, { D, 2}, + { D, 3}, { D, 4}, + }, + + { // 0x0a - 11 4 4 5 + { M, 0}, { M, 1}, { M, 2}, { M, 3}, { M, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4}, + {RW, 5}, {RW, 6}, {RW, 7}, {RW, 8}, {RW, 9}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4}, + {GW, 5}, {GW, 6}, {GW, 7}, {GW, 8}, {GW, 9}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4}, + {BW, 5}, {BW, 6}, {BW, 7}, {BW, 8}, {BW, 9}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RW,10}, + {BY, 4}, {GY, 0}, {GY, 1}, {GY, 2}, {GY, 3}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GW,10}, + {BZ, 0}, {GZ, 0}, {GZ, 1}, {GZ, 2}, {GZ, 3}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BX, 4}, + {BW,10}, {BY, 0}, {BY, 1}, {BY, 2}, {BY, 3}, {RY, 0}, {RY, 1}, {RY, 2}, {RY, 3}, {BZ, 1}, + {BZ, 2}, {RZ, 0}, {RZ, 1}, {RZ, 2}, {RZ, 3}, {BZ, 4}, {BZ, 3}, { D, 0}, { D, 1}, { D, 2}, + { D, 3}, { D, 4}, + }, + + { // 0x0e - 9 5 5 5 + { M, 0}, { M, 1}, { M, 2}, { M, 3}, { M, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4}, + {RW, 5}, {RW, 6}, {RW, 7}, {RW, 8}, {BY, 4}, {GW, 0}, 
{GW, 1}, {GW, 2}, {GW, 3}, {GW, 4}, + {GW, 5}, {GW, 6}, {GW, 7}, {GW, 8}, {GY, 4}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4}, + {BW, 5}, {BW, 6}, {BW, 7}, {BW, 8}, {BZ, 4}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RX, 4}, + {GZ, 4}, {GY, 0}, {GY, 1}, {GY, 2}, {GY, 3}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GX, 4}, + {BZ, 0}, {GZ, 0}, {GZ, 1}, {GZ, 2}, {GZ, 3}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BX, 4}, + {BZ, 1}, {BY, 0}, {BY, 1}, {BY, 2}, {BY, 3}, {RY, 0}, {RY, 1}, {RY, 2}, {RY, 3}, {RY, 4}, + {BZ, 2}, {RZ, 0}, {RZ, 1}, {RZ, 2}, {RZ, 3}, {RZ, 4}, {BZ, 3}, { D, 0}, { D, 1}, { D, 2}, + { D, 3}, { D, 4}, + }, + + { // 0x12 - 8 6 5 5 + { M, 0}, { M, 1}, { M, 2}, { M, 3}, { M, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4}, + {RW, 5}, {RW, 6}, {RW, 7}, {GZ, 4}, {BY, 4}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4}, + {GW, 5}, {GW, 6}, {GW, 7}, {BZ, 2}, {GY, 4}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4}, + {BW, 5}, {BW, 6}, {BW, 7}, {BZ, 3}, {BZ, 4}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RX, 4}, + {RX, 5}, {GY, 0}, {GY, 1}, {GY, 2}, {GY, 3}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GX, 4}, + {BZ, 0}, {GZ, 0}, {GZ, 1}, {GZ, 2}, {GZ, 3}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BX, 4}, + {BZ, 1}, {BY, 0}, {BY, 1}, {BY, 2}, {BY, 3}, {RY, 0}, {RY, 1}, {RY, 2}, {RY, 3}, {RY, 4}, + {RY, 5}, {RZ, 0}, {RZ, 1}, {RZ, 2}, {RZ, 3}, {RZ, 4}, {RZ, 5}, { D, 0}, { D, 1}, { D, 2}, + { D, 3}, { D, 4}, + }, + + { // 0x16 - 8 5 6 5 + { M, 0}, { M, 1}, { M, 2}, { M, 3}, { M, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4}, + {RW, 5}, {RW, 6}, {RW, 7}, {BZ, 0}, {BY, 4}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4}, + {GW, 5}, {GW, 6}, {GW, 7}, {GY, 5}, {GY, 4}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4}, + {BW, 5}, {BW, 6}, {BW, 7}, {GZ, 5}, {BZ, 4}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RX, 4}, + {GZ, 4}, {GY, 0}, {GY, 1}, {GY, 2}, {GY, 3}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GX, 4}, + {GX, 5}, {GZ, 0}, {GZ, 1}, {GZ, 2}, {GZ, 3}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BX, 4}, + {BZ, 1}, {BY, 0}, {BY, 1}, {BY, 2}, {BY, 3}, {RY, 0}, {RY, 1}, {RY, 2}, {RY, 3}, {RY, 4}, + {BZ, 2}, {RZ, 0}, {RZ, 1}, {RZ, 2}, {RZ, 3}, {RZ, 4}, {BZ, 3}, { D, 0}, { D, 1}, { D, 2}, + { D, 3}, { D, 4}, + }, + + { // 0x1a - 8 5 5 6 + { M, 0}, { M, 1}, { M, 2}, { M, 3}, { M, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4}, + {RW, 5}, {RW, 6}, {RW, 7}, {BZ, 1}, {BY, 4}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4}, + {GW, 5}, {GW, 6}, {GW, 7}, {BY, 5}, {GY, 4}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4}, + {BW, 5}, {BW, 6}, {BW, 7}, {BZ, 5}, {BZ, 4}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RX, 4}, + {GZ, 4}, {GY, 0}, {GY, 1}, {GY, 2}, {GY, 3}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GX, 4}, + {BZ, 0}, {GZ, 0}, {GZ, 1}, {GZ, 2}, {GZ, 3}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BX, 4}, + {BX, 5}, {BY, 0}, {BY, 1}, {BY, 2}, {BY, 3}, {RY, 0}, {RY, 1}, {RY, 2}, {RY, 3}, {RY, 4}, + {BZ, 2}, {RZ, 0}, {RZ, 1}, {RZ, 2}, {RZ, 3}, {RZ, 4}, {BZ, 3}, { D, 0}, { D, 1}, { D, 2}, + { D, 3}, { D, 4}, + }, + + { // 0x1e - 6 6 6 6 + { M, 0}, { M, 1}, { M, 2}, { M, 3}, { M, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4}, + {RW, 5}, {GZ, 4}, {BZ, 0}, {BZ, 1}, {BY, 4}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4}, + {GW, 5}, {GY, 5}, {BY, 5}, {BZ, 2}, {GY, 4}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4}, + {BW, 5}, {GZ, 5}, {BZ, 3}, {BZ, 5}, {BZ, 4}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RX, 4}, + {RX, 5}, {GY, 0}, {GY, 1}, {GY, 2}, {GY, 3}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GX, 4}, + {GX, 5}, {GZ, 0}, {GZ, 1}, {GZ, 2}, {GZ, 3}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BX, 4}, + {BX, 5}, 
{BY, 0}, {BY, 1}, {BY, 2}, {BY, 3}, {RY, 0}, {RY, 1}, {RY, 2}, {RY, 3}, {RY, 4}, + {RY, 5}, {RZ, 0}, {RZ, 1}, {RZ, 2}, {RZ, 3}, {RZ, 4}, {RZ, 5}, { D, 0}, { D, 1}, { D, 2}, + { D, 3}, { D, 4}, + }, + + { // 0x03 - 10 10 + { M, 0}, { M, 1}, { M, 2}, { M, 3}, { M, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4}, + {RW, 5}, {RW, 6}, {RW, 7}, {RW, 8}, {RW, 9}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4}, + {GW, 5}, {GW, 6}, {GW, 7}, {GW, 8}, {GW, 9}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4}, + {BW, 5}, {BW, 6}, {BW, 7}, {BW, 8}, {BW, 9}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RX, 4}, + {RX, 5}, {RX, 6}, {RX, 7}, {RX, 8}, {RX, 9}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GX, 4}, + {GX, 5}, {GX, 6}, {GX, 7}, {GX, 8}, {GX, 9}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BX, 4}, + {BX, 5}, {BX, 6}, {BX, 7}, {BX, 8}, {BX, 9}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, + {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, + {NA, 0}, {NA, 0}, + }, + + { // 0x07 - 11 9 + { M, 0}, { M, 1}, { M, 2}, { M, 3}, { M, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4}, + {RW, 5}, {RW, 6}, {RW, 7}, {RW, 8}, {RW, 9}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4}, + {GW, 5}, {GW, 6}, {GW, 7}, {GW, 8}, {GW, 9}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4}, + {BW, 5}, {BW, 6}, {BW, 7}, {BW, 8}, {BW, 9}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RX, 4}, + {RX, 5}, {RX, 6}, {RX, 7}, {RX, 8}, {RW,10}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GX, 4}, + {GX, 5}, {GX, 6}, {GX, 7}, {GX, 8}, {GW,10}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BX, 4}, + {BX, 5}, {BX, 6}, {BX, 7}, {BX, 8}, {BW,10}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, + {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, + {NA, 0}, {NA, 0}, + }, + + { // 0x0b - 12 8 + { M, 0}, { M, 1}, { M, 2}, { M, 3}, { M, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4}, + {RW, 5}, {RW, 6}, {RW, 7}, {RW, 8}, {RW, 9}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4}, + {GW, 5}, {GW, 6}, {GW, 7}, {GW, 8}, {GW, 9}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4}, + {BW, 5}, {BW, 6}, {BW, 7}, {BW, 8}, {BW, 9}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RX, 4}, + {RX, 5}, {RX, 6}, {RX, 7}, {RW,11}, {RW,10}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GX, 4}, + {GX, 5}, {GX, 6}, {GX, 7}, {GW,11}, {GW,10}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BX, 4}, + {BX, 5}, {BX, 6}, {BX, 7}, {BW,11}, {BW,10}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, + {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, + {NA, 0}, {NA, 0}, + }, + + { // 0x0f - 16 4 + { M, 0}, { M, 1}, { M, 2}, { M, 3}, { M, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4}, + {RW, 5}, {RW, 6}, {RW, 7}, {RW, 8}, {RW, 9}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4}, + {GW, 5}, {GW, 6}, {GW, 7}, {GW, 8}, {GW, 9}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4}, + {BW, 5}, {BW, 6}, {BW, 7}, {BW, 8}, {BW, 9}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RW,15}, + {RW,14}, {RW,13}, {RW,12}, {RW,11}, {RW,10}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GW,15}, + {GW,14}, {GW,13}, {GW,12}, {GW,11}, {GW,10}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BW,15}, + {BW,14}, {BW,13}, {BW,12}, {BW,11}, {BW,10}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, + {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, + {NA, 0}, {NA, 0}, + }, +}; + +// Mode, Partitions, Transformed, IndexPrec, RGBAPrec +const D3DX_BC6H::ModeInfo D3DX_BC6H::ms_aInfo[] = +{ + {0x00, 1, true, 3, LDRColorA(10,10,10,0), LDRColorA( 5, 5, 5,0), LDRColorA(5,5,5,0), LDRColorA(5,5,5,0)}, // 
Mode 0 + {0x01, 1, true, 3, LDRColorA( 7, 7, 7,0), LDRColorA( 6, 6, 6,0), LDRColorA(6,6,6,0), LDRColorA(6,6,6,0)}, // Mode 1 + {0x02, 1, true, 3, LDRColorA(11,11,11,0), LDRColorA( 5, 4, 4,0), LDRColorA(5,4,4,0), LDRColorA(5,4,4,0)}, // Mode 2 + {0x06, 1, true, 3, LDRColorA(11,11,11,0), LDRColorA( 4, 5, 4,0), LDRColorA(4,5,4,0), LDRColorA(4,5,4,0)}, // Mode 3 + {0x0a, 1, true, 3, LDRColorA(11,11,11,0), LDRColorA( 4, 4, 5,0), LDRColorA(4,4,5,0), LDRColorA(4,4,5,0)}, // Mode 4 + {0x0e, 1, true, 3, LDRColorA( 9, 9, 9,0), LDRColorA( 5, 5, 5,0), LDRColorA(5,5,5,0), LDRColorA(5,5,5,0)}, // Mode 5 + {0x12, 1, true, 3, LDRColorA( 8, 8, 8,0), LDRColorA( 6, 5, 5,0), LDRColorA(6,5,5,0), LDRColorA(6,5,5,0)}, // Mode 6 + {0x16, 1, true, 3, LDRColorA( 8, 8, 8,0), LDRColorA( 5, 6, 5,0), LDRColorA(5,6,5,0), LDRColorA(5,6,5,0)}, // Mode 7 + {0x1a, 1, true, 3, LDRColorA( 8, 8, 8,0), LDRColorA( 5, 5, 6,0), LDRColorA(5,5,6,0), LDRColorA(5,5,6,0)}, // Mode 8 + {0x1e, 1, false, 3, LDRColorA( 6, 6, 6,0), LDRColorA( 6, 6, 6,0), LDRColorA(6,6,6,0), LDRColorA(6,6,6,0)}, // Mode 9 + {0x03, 0, false, 4, LDRColorA(10,10,10,0), LDRColorA(10,10,10,0), LDRColorA(0,0,0,0), LDRColorA(0,0,0,0)}, // Mode 10 + {0x07, 0, true, 4, LDRColorA(11,11,11,0), LDRColorA( 9, 9, 9,0), LDRColorA(0,0,0,0), LDRColorA(0,0,0,0)}, // Mode 11 + {0x0b, 0, true, 4, LDRColorA(12,12,12,0), LDRColorA( 8, 8, 8,0), LDRColorA(0,0,0,0), LDRColorA(0,0,0,0)}, // Mode 12 + {0x0f, 0, true, 4, LDRColorA(16,16,16,0), LDRColorA( 4, 4, 4,0), LDRColorA(0,0,0,0), LDRColorA(0,0,0,0)}, // Mode 13 +}; + +const int D3DX_BC6H::ms_aModeToInfo[] = +{ + 0, // 0x00 + 1, // 0x01 + 2, // 0x02 + 10, // 0x03 + -1, // 0x04 + -1, // 0x05 + 3, // 0x06 + 11, // 0x07 + -1, // 0x08 + -1, // 0x09 + 4, // 0x0a + 12, // 0x0b + -1, // 0x0c + -1, // 0x0d + 5, // 0x0e + 13, // 0x0f + -1, // 0x10 + -1, // 0x11 + 6, // 0x12 + -1, // 0x13 + -1, // 0x14 + -1, // 0x15 + 7, // 0x16 + -1, // 0x17 + -1, // 0x18 + -1, // 0x19 + 8, // 0x1a + -1, // 0x1b + -1, // 0x1c + -1, // 0x1d + 9, // 0x1e + -1, // 0x1f +}; + +// BC7 compression: uPartitions, uPartitionBits, uPBits, uRotationBits, uIndexModeBits, uIndexPrec, uIndexPrec2, RGBAPrec, RGBAPrecWithP +const D3DX_BC7::ModeInfo D3DX_BC7::ms_aInfo[] = +{ + {2, 4, 6, 0, 0, 3, 0, LDRColorA(4,4,4,0), LDRColorA(5,5,5,0)}, + // Mode 0: Color only, 3 Subsets, RGBP 4441 (unique P-bit), 3-bit indecies, 16 partitions + {1, 6, 2, 0, 0, 3, 0, LDRColorA(6,6,6,0), LDRColorA(7,7,7,0)}, + // Mode 1: Color only, 2 Subsets, RGBP 6661 (shared P-bit), 3-bit indecies, 64 partitions + {2, 6, 0, 0, 0, 2, 0, LDRColorA(5,5,5,0), LDRColorA(5,5,5,0)}, + // Mode 2: Color only, 3 Subsets, RGB 555, 2-bit indecies, 64 partitions + {1, 6, 4, 0, 0, 2, 0, LDRColorA(7,7,7,0), LDRColorA(8,8,8,0)}, + // Mode 3: Color only, 2 Subsets, RGBP 7771 (unique P-bit), 2-bits indecies, 64 partitions + {0, 0, 0, 2, 1, 2, 3, LDRColorA(5,5,5,6), LDRColorA(5,5,5,6)}, + // Mode 4: Color w/ Separate Alpha, 1 Subset, RGB 555, A6, 16x2/16x3-bit indices, 2-bit rotation, 1-bit index selector + {0, 0, 0, 2, 0, 2, 2, LDRColorA(7,7,7,8), LDRColorA(7,7,7,8)}, + // Mode 5: Color w/ Separate Alpha, 1 Subset, RGB 777, A8, 16x2/16x2-bit indices, 2-bit rotation + {0, 0, 2, 0, 0, 4, 0, LDRColorA(7,7,7,7), LDRColorA(8,8,8,8)}, + // Mode 6: Color+Alpha, 1 Subset, RGBAP 77771 (unique P-bit), 16x4-bit indecies + {1, 6, 4, 0, 0, 2, 0, LDRColorA(5,5,5,5), LDRColorA(6,6,6,6)} + // Mode 7: Color+Alpha, 2 Subsets, RGBAP 55551 (unique P-bit), 2-bit indices, 64 partitions +}; + + 
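+// How to read a ModeInfo row, using mode 1 as an example (illustrative bit budget only;
+// the authoritative layout is the D3D11 functional spec for BC7):
+//   {1, 6, 2, 0, 0, 3, 0, LDRColorA(6,6,6,0), LDRColorA(7,7,7,0)}
+//     2 mode bits + 6 partition bits + 2 shared P-bits
+//     + 2 subsets * 2 endpoints * (6+6+6) color bits                 = 72
+//     + 16 texels * 3 index bits - 2 fix-up (anchor) texels * 1 bit  = 46
+//     -> 128 bits total, i.e. exactly one 16-byte block.
+// RGBAPrecWithP (7,7,7,0) is the endpoint precision once the shared P-bit is appended.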
+//------------------------------------------------------------------------------------- +// Helper functions +//------------------------------------------------------------------------------------- +template< class T > +inline static void Swap( T& a, T& b ) +{ + T temp = a; + a = b; + b = temp; +} + +inline static bool IsFixUpOffset(_In_range_(0,2) size_t uPartitions, _In_range_(0,63) size_t uShape, _In_range_(0,15) size_t uOffset) +{ + assert(uPartitions < 3 && uShape < 64 && uOffset < 16); + __analysis_assume(uPartitions < 3 && uShape < 64 && uOffset < 16); + for(size_t p = 0; p <= uPartitions; p++) + { + if(uOffset == g_aFixUp[uPartitions][uShape][p]) + { + return true; + } + } + return false; +} + +inline static float ErrorMetricRGB(_In_ const LDRColorA& a, _In_ const LDRColorA& b) +{ + float er = float(a.r) - float(b.r); + float eg = float(a.g) - float(b.g); + float eb = float(a.b) - float(b.b); + // weigh the components nonuniformly + //er *= 0.299; + //eg *= 0.587; + //eb *= 0.114; + return er*er + eg*eg + eb*eb; +} + +inline static float ErrorMetricAlpha(_In_ const LDRColorA& a, _In_ const LDRColorA& b) +{ + float ea = float(a.a) - float(b.a); + return ea*ea; +} + +inline static float ErrorMetric(_In_ const LDRColorA& a, _In_ const LDRColorA& b) +{ + return ErrorMetricRGB(a, b) + ErrorMetricAlpha(a, b); +} + +inline static void TransformForward(_Inout_count_c_(BC6H_MAX_REGIONS) INTEndPntPair aEndPts[]) +{ + aEndPts[0].B -= aEndPts[0].A; + aEndPts[1].A -= aEndPts[0].A; + aEndPts[1].B -= aEndPts[0].A; +} + +inline static void TransformInverse(_Inout_count_c_(BC6H_MAX_REGIONS) INTEndPntPair aEndPts[], _In_ const LDRColorA& Prec, _In_ bool bSigned) +{ + INTColor WrapMask((1 << Prec.r) - 1, (1 << Prec.g) - 1, (1 << Prec.b) - 1); + aEndPts[0].B += aEndPts[0].A; aEndPts[0].B &= WrapMask; + aEndPts[1].A += aEndPts[0].A; aEndPts[1].A &= WrapMask; + aEndPts[1].B += aEndPts[0].A; aEndPts[1].B &= WrapMask; + if(bSigned) + { + aEndPts[0].B.SignExtend(Prec); + aEndPts[1].A.SignExtend(Prec); + aEndPts[1].B.SignExtend(Prec); + } +} + +inline static float Norm(_In_ const INTColor& a, _In_ const INTColor& b) +{ + float dr = float(a.r) - float(b.r); + float dg = float(a.g) - float(b.g); + float db = float(a.b) - float(b.b); + return dr * dr + dg * dg + db * db; +} + +// return # of bits needed to store n. handle signed or unsigned cases properly +inline static int NBits(_In_ int n, _In_ bool bIsSigned) +{ + int nb; + if(n == 0) + { + return 0; // no bits needed for 0, signed or not + } + else if(n > 0) + { + for(nb = 0; n; ++nb, n >>= 1); + return nb + (bIsSigned ? 1 : 0); + } + else + { + assert(bIsSigned); + for(nb = 0; n < -1; ++nb, n >>= 1) ; + return nb + 1; + } +} + + +//------------------------------------------------------------------------------------- +static float OptimizeRGB(_In_count_c_(NUM_PIXELS_PER_BLOCK) const HDRColorA* const pPoints, + _Out_ HDRColorA* pX, _Out_ HDRColorA* pY, + _In_ size_t cSteps, _In_ size_t cPixels, _In_count_(cPixels) const size_t* pIndex) +{ + float fError = FLT_MAX; + const float *pC = (3 == cSteps) ? pC3 : pC4; + const float *pD = (3 == cSteps) ? 
pD3 : pD4; + + // Find Min and Max points, as starting point + HDRColorA X(1.0f, 1.0f, 1.0f, 0.0f); + HDRColorA Y(0.0f, 0.0f, 0.0f, 0.0f); + + for(size_t iPoint = 0; iPoint < cPixels; iPoint++) + { + if(pPoints[pIndex[iPoint]].r < X.r) X.r = pPoints[pIndex[iPoint]].r; + if(pPoints[pIndex[iPoint]].g < X.g) X.g = pPoints[pIndex[iPoint]].g; + if(pPoints[pIndex[iPoint]].b < X.b) X.b = pPoints[pIndex[iPoint]].b; + if(pPoints[pIndex[iPoint]].r > Y.r) Y.r = pPoints[pIndex[iPoint]].r; + if(pPoints[pIndex[iPoint]].g > Y.g) Y.g = pPoints[pIndex[iPoint]].g; + if(pPoints[pIndex[iPoint]].b > Y.b) Y.b = pPoints[pIndex[iPoint]].b; + } + + // Diagonal axis + HDRColorA AB; + AB.r = Y.r - X.r; + AB.g = Y.g - X.g; + AB.b = Y.b - X.b; + + float fAB = AB.r * AB.r + AB.g * AB.g + AB.b * AB.b; + + // Single color block.. no need to root-find + if(fAB < FLT_MIN) + { + pX->r = X.r; pX->g = X.g; pX->b = X.b; + pY->r = Y.r; pY->g = Y.g; pY->b = Y.b; + return 0.0f; + } + + // Try all four axis directions, to determine which diagonal best fits data + float fABInv = 1.0f / fAB; + + HDRColorA Dir; + Dir.r = AB.r * fABInv; + Dir.g = AB.g * fABInv; + Dir.b = AB.b * fABInv; + + HDRColorA Mid; + Mid.r = (X.r + Y.r) * 0.5f; + Mid.g = (X.g + Y.g) * 0.5f; + Mid.b = (X.b + Y.b) * 0.5f; + + float fDir[4]; + fDir[0] = fDir[1] = fDir[2] = fDir[3] = 0.0f; + + for(size_t iPoint = 0; iPoint < cPixels; iPoint++) + { + HDRColorA Pt; + Pt.r = (pPoints[pIndex[iPoint]].r - Mid.r) * Dir.r; + Pt.g = (pPoints[pIndex[iPoint]].g - Mid.g) * Dir.g; + Pt.b = (pPoints[pIndex[iPoint]].b - Mid.b) * Dir.b; + + float f; + f = Pt.r + Pt.g + Pt.b; fDir[0] += f * f; + f = Pt.r + Pt.g - Pt.b; fDir[1] += f * f; + f = Pt.r - Pt.g + Pt.b; fDir[2] += f * f; + f = Pt.r - Pt.g - Pt.b; fDir[3] += f * f; + } + + float fDirMax = fDir[0]; + size_t iDirMax = 0; + + for(size_t iDir = 1; iDir < 4; iDir++) + { + if(fDir[iDir] > fDirMax) + { + fDirMax = fDir[iDir]; + iDirMax = iDir; + } + } + + if(iDirMax & 2) Swap( X.g, Y.g ); + if(iDirMax & 1) Swap( X.b, Y.b ); + + // Two color block.. no need to root-find + if(fAB < 1.0f / 4096.0f) + { + pX->r = X.r; pX->g = X.g; pX->b = X.b; + pY->r = Y.r; pY->g = Y.g; pY->b = Y.b; + return 0.0f; + } + + // Use Newton's Method to find local minima of sum-of-squares error. 
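+    // In outline (per color channel, with each pixel's palette index k_i held fixed for
+    // one iteration):
+    //     E(X,Y) = sum_i ( X*pC[k_i] + Y*pD[k_i] - p_i )^2
+    //     dX  ~ sum_i pC[k_i] * (step_{k_i} - p_i)     dY  ~ sum_i pD[k_i] * (step_{k_i} - p_i)
+    //     d2X ~ sum_i pC[k_i]^2                        d2Y ~ sum_i pD[k_i]^2
+    // Each endpoint is then moved by -dX/d2X (resp. -dY/d2Y).  The 1/8 factor folded into
+    // fC/fD below rescales both accumulators equally, so it cancels in that ratio.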
+ float fSteps = (float) (cSteps - 1); + + for(size_t iIteration = 0; iIteration < 8; iIteration++) + { + // Calculate new steps + HDRColorA pSteps[4]; + + for(size_t iStep = 0; iStep < cSteps; iStep++) + { + pSteps[iStep].r = X.r * pC[iStep] + Y.r * pD[iStep]; + pSteps[iStep].g = X.g * pC[iStep] + Y.g * pD[iStep]; + pSteps[iStep].b = X.b * pC[iStep] + Y.b * pD[iStep]; + } + + // Calculate color direction + Dir.r = Y.r - X.r; + Dir.g = Y.g - X.g; + Dir.b = Y.b - X.b; + + float fLen = (Dir.r * Dir.r + Dir.g * Dir.g + Dir.b * Dir.b); + + if(fLen < (1.0f / 4096.0f)) + break; + + float fScale = fSteps / fLen; + + Dir.r *= fScale; + Dir.g *= fScale; + Dir.b *= fScale; + + // Evaluate function, and derivatives + float d2X = 0.0f, d2Y = 0.0f; + HDRColorA dX(0.0f, 0.0f, 0.0f, 0.0f), dY(0.0f, 0.0f, 0.0f, 0.0f); + + for(size_t iPoint = 0; iPoint < cPixels; iPoint++) + { + float fDot = (pPoints[pIndex[iPoint]].r - X.r) * Dir.r + + (pPoints[pIndex[iPoint]].g - X.g) * Dir.g + + (pPoints[pIndex[iPoint]].b - X.b) * Dir.b; + + size_t iStep; + if(fDot <= 0.0f) + iStep = 0; + if(fDot >= fSteps) + iStep = cSteps - 1; + else + iStep = size_t(fDot + 0.5f); + + HDRColorA Diff; + Diff.r = pSteps[iStep].r - pPoints[pIndex[iPoint]].r; + Diff.g = pSteps[iStep].g - pPoints[pIndex[iPoint]].g; + Diff.b = pSteps[iStep].b - pPoints[pIndex[iPoint]].b; + + float fC = pC[iStep] * (1.0f / 8.0f); + float fD = pD[iStep] * (1.0f / 8.0f); + + d2X += fC * pC[iStep]; + dX.r += fC * Diff.r; + dX.g += fC * Diff.g; + dX.b += fC * Diff.b; + + d2Y += fD * pD[iStep]; + dY.r += fD * Diff.r; + dY.g += fD * Diff.g; + dY.b += fD * Diff.b; + } + + // Move endpoints + if(d2X > 0.0f) + { + float f = -1.0f / d2X; + + X.r += dX.r * f; + X.g += dX.g * f; + X.b += dX.b * f; + } + + if(d2Y > 0.0f) + { + float f = -1.0f / d2Y; + + Y.r += dY.r * f; + Y.g += dY.g * f; + Y.b += dY.b * f; + } + + if((dX.r * dX.r < fEpsilon) && (dX.g * dX.g < fEpsilon) && (dX.b * dX.b < fEpsilon) && + (dY.r * dY.r < fEpsilon) && (dY.g * dY.g < fEpsilon) && (dY.b * dY.b < fEpsilon)) + { + break; + } + } + + pX->r = X.r; pX->g = X.g; pX->b = X.b; + pY->r = Y.r; pY->g = Y.g; pY->b = Y.b; + return fError; +} + + +//------------------------------------------------------------------------------------- +static float OptimizeRGBA(_In_count_c_(NUM_PIXELS_PER_BLOCK) const HDRColorA* const pPoints, + _Out_ HDRColorA* pX, _Out_ HDRColorA* pY, + _In_ size_t cSteps, _In_ size_t cPixels, _In_count_(cPixels) const size_t* pIndex) +{ + float fError = FLT_MAX; + const float *pC = (3 == cSteps) ? pC3 : pC4; + const float *pD = (3 == cSteps) ? pD3 : pD4; + + // Find Min and Max points, as starting point + HDRColorA X(1.0f, 1.0f, 1.0f, 1.0f); + HDRColorA Y(0.0f, 0.0f, 0.0f, 0.0f); + + for(size_t iPoint = 0; iPoint < cPixels; iPoint++) + { + if(pPoints[pIndex[iPoint]].r < X.r) X.r = pPoints[pIndex[iPoint]].r; + if(pPoints[pIndex[iPoint]].g < X.g) X.g = pPoints[pIndex[iPoint]].g; + if(pPoints[pIndex[iPoint]].b < X.b) X.b = pPoints[pIndex[iPoint]].b; + if(pPoints[pIndex[iPoint]].a < X.a) X.a = pPoints[pIndex[iPoint]].a; + if(pPoints[pIndex[iPoint]].r > Y.r) Y.r = pPoints[pIndex[iPoint]].r; + if(pPoints[pIndex[iPoint]].g > Y.g) Y.g = pPoints[pIndex[iPoint]].g; + if(pPoints[pIndex[iPoint]].b > Y.b) Y.b = pPoints[pIndex[iPoint]].b; + if(pPoints[pIndex[iPoint]].a > Y.a) Y.a = pPoints[pIndex[iPoint]].a; + } + + // Diagonal axis + HDRColorA AB = Y - X; + float fAB = AB * AB; + + // Single color block.. 
no need to root-find + if(fAB < FLT_MIN) + { + *pX = X; + *pY = Y; + return 0.0f; + } + + // Try all four axis directions, to determine which diagonal best fits data + float fABInv = 1.0f / fAB; + HDRColorA Dir = AB * fABInv; + HDRColorA Mid = (X + Y) * 0.5f; + + float fDir[8]; + fDir[0] = fDir[1] = fDir[2] = fDir[3] = fDir[4] = fDir[5] = fDir[6] = fDir[7] = 0.0f; + + for(size_t iPoint = 0; iPoint < cPixels; iPoint++) + { + HDRColorA Pt; + Pt.r = (pPoints[pIndex[iPoint]].r - Mid.r) * Dir.r; + Pt.g = (pPoints[pIndex[iPoint]].g - Mid.g) * Dir.g; + Pt.b = (pPoints[pIndex[iPoint]].b - Mid.b) * Dir.b; + Pt.a = (pPoints[pIndex[iPoint]].a - Mid.a) * Dir.a; + + float f; + f = Pt.r + Pt.g + Pt.b + Pt.a; fDir[0] += f * f; + f = Pt.r + Pt.g + Pt.b - Pt.a; fDir[1] += f * f; + f = Pt.r + Pt.g - Pt.b + Pt.a; fDir[2] += f * f; + f = Pt.r + Pt.g - Pt.b - Pt.a; fDir[3] += f * f; + f = Pt.r - Pt.g + Pt.b + Pt.a; fDir[4] += f * f; + f = Pt.r - Pt.g + Pt.b - Pt.a; fDir[5] += f * f; + f = Pt.r - Pt.g - Pt.b + Pt.a; fDir[6] += f * f; + f = Pt.r - Pt.g - Pt.b - Pt.a; fDir[7] += f * f; + } + + float fDirMax = fDir[0]; + size_t iDirMax = 0; + + for(size_t iDir = 1; iDir < 8; iDir++) + { + if(fDir[iDir] > fDirMax) + { + fDirMax = fDir[iDir]; + iDirMax = iDir; + } + } + + if(iDirMax & 4) Swap(X.g, Y.g); + if(iDirMax & 2) Swap(X.b, Y.b); + if(iDirMax & 1) Swap(X.a, Y.a); + + // Two color block.. no need to root-find + if(fAB < 1.0f / 4096.0f) + { + *pX = X; + *pY = Y; + return 0.0f; + } + + // Use Newton's Method to find local minima of sum-of-squares error. + float fSteps = (float) (cSteps - 1); + + for(size_t iIteration = 0; iIteration < 8 && fError > 0.0f; iIteration++) + { + // Calculate new steps + HDRColorA pSteps[BC7_MAX_INDICES]; + + LDRColorA aSteps[BC7_MAX_INDICES]; + LDRColorA lX, lY; + lX = (X * 255.0f).ToLDRColorA(); + lY = (Y * 255.0f).ToLDRColorA(); + + for(size_t iStep = 0; iStep < cSteps; iStep++) + { + pSteps[iStep] = X * pC[iStep] + Y * pD[iStep]; + //LDRColorA::Interpolate(lX, lY, i, i, wcprec, waprec, aSteps[i]); + } + + // Calculate color direction + Dir = Y - X; + float fLen = Dir * Dir; + if(fLen < (1.0f / 4096.0f)) + break; + + float fScale = fSteps / fLen; + Dir *= fScale; + + // Evaluate function, and derivatives + float d2X = 0.0f, d2Y = 0.0f; + HDRColorA dX(0.0f, 0.0f, 0.0f, 0.0f), dY(0.0f, 0.0f, 0.0f, 0.0f); + + for(size_t iPoint = 0; iPoint < cPixels; ++iPoint) + { + float fDot = (pPoints[pIndex[iPoint]] - X) * Dir; + size_t iStep; + if(fDot <= 0.0f) + iStep = 0; + if(fDot >= fSteps) + iStep = cSteps - 1; + else + iStep = size_t(fDot + 0.5f); + + HDRColorA Diff = pSteps[iStep] - pPoints[pIndex[iPoint]]; + float fC = pC[iStep] * (1.0f / 8.0f); + float fD = pD[iStep] * (1.0f / 8.0f); + + d2X += fC * pC[iStep]; + dX += Diff * fC; + + d2Y += fD * pD[iStep]; + dY += Diff * fD; + } + + // Move endpoints + if(d2X > 0.0f) + { + float f = -1.0f / d2X; + X += dX * f; + } + + if(d2Y > 0.0f) + { + float f = -1.0f / d2Y; + Y += dY * f; + } + + if((dX * dX < fEpsilon) && (dY * dY < fEpsilon)) + break; + } + + *pX = X; + *pY = Y; + return fError; +} + + +//------------------------------------------------------------------------------------- +#pragma warning(disable: 4616 6001 6297) + +static float ComputeError(_Inout_ const LDRColorA& pixel, _In_count_x_(1 << uIndexPrec) const LDRColorA aPalette[], + _In_ uint8_t uIndexPrec, _In_ uint8_t uIndexPrec2, _Out_opt_ size_t* pBestIndex = nullptr, _Out_opt_ size_t* pBestIndex2 = nullptr) +{ + const size_t uNumIndices = 1 << uIndexPrec; + const size_t 
uNumIndices2 = 1 << uIndexPrec2; + float fTotalErr = 0; + float fBestErr = FLT_MAX; + + if(pBestIndex) + *pBestIndex = 0; + if(pBestIndex2) + *pBestIndex2 = 0; + + if(uIndexPrec2 == 0) + { + for(register size_t i = 0; i < uNumIndices && fBestErr > 0; i++) + { + float fErr = ErrorMetric(pixel, aPalette[i]); + if(fErr > fBestErr) // error increased, so we're done searching + break; + if(fErr < fBestErr) + { + fBestErr = fErr; + if(pBestIndex) + *pBestIndex = i; + } + } + fTotalErr += fBestErr; + } + else + { + for(register size_t i = 0; i < uNumIndices && fBestErr > 0; i++) + { + float fErr = ErrorMetricRGB(pixel, aPalette[i]); + if(fErr > fBestErr) // error increased, so we're done searching + break; + if(fErr < fBestErr) + { + fBestErr = fErr; + if(pBestIndex) + *pBestIndex = i; + } + } + fTotalErr += fBestErr; + fBestErr = FLT_MAX; + for(register size_t i = 0; i < uNumIndices2 && fBestErr > 0; i++) + { + float fErr = ErrorMetricAlpha(pixel, aPalette[i]); + if(fErr > fBestErr) // error increased, so we're done searching + break; + if(fErr < fBestErr) + { + fBestErr = fErr; + if(pBestIndex2) + *pBestIndex2 = i; + } + } + fTotalErr += fBestErr; + } + + return fTotalErr; +} + + +inline static void FillWithErrorColors( _Out_cap_c_(NUM_PIXELS_PER_BLOCK) HDRColorA* pOut ) +{ + for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { +#ifdef _DEBUG + // Use Magenta in debug as a highly-visible error color + pOut[i] = HDRColorA(1.0f, 0.0f, 1.0f, 1.0f); +#else + // In production use, default to black + pOut[i] = HDRColorA(0.0f, 0.0f, 0.0f, 1.0f); +#endif + } +} + + +//------------------------------------------------------------------------------------- +// BC6H Compression +//------------------------------------------------------------------------------------- +void D3DX_BC6H::Decode(bool bSigned, HDRColorA* pOut) const +{ + assert(pOut ); + + size_t uStartBit = 0; + uint8_t uMode = GetBits(uStartBit, 2); + if(uMode != 0x00 && uMode != 0x01) + { + uMode = (GetBits(uStartBit, 3) << 2) | uMode; + } + + assert( uMode < 32 ); + __analysis_assume( uMode < 32 ); + + if ( ms_aModeToInfo[uMode] >= 0 ) + { + assert(ms_aModeToInfo[uMode] < ARRAYSIZE(ms_aInfo)); + __analysis_assume(ms_aModeToInfo[uMode] < ARRAYSIZE(ms_aInfo)); + const ModeDescriptor* desc = ms_aDesc[ms_aModeToInfo[uMode]]; + + assert(ms_aModeToInfo[uMode] < ARRAYSIZE(ms_aDesc)); + __analysis_assume(ms_aModeToInfo[uMode] < ARRAYSIZE(ms_aDesc)); + const ModeInfo& info = ms_aInfo[ms_aModeToInfo[uMode]]; + + INTEndPntPair aEndPts[BC6H_MAX_REGIONS]; + memset(aEndPts, 0, BC6H_MAX_REGIONS * 2 * sizeof(INTColor)); + uint32_t uShape = 0; + + // Read header + const size_t uHeaderBits = info.uPartitions > 0 ? 
82 : 65; + while(uStartBit < uHeaderBits) + { + size_t uCurBit = uStartBit; + if(GetBit(uStartBit)) + { + switch(desc[uCurBit].m_eField) + { + case D: uShape |= 1 << uint32_t(desc[uCurBit].m_uBit); break; + case RW: aEndPts[0].A.r |= 1 << uint32_t(desc[uCurBit].m_uBit); break; + case RX: aEndPts[0].B.r |= 1 << uint32_t(desc[uCurBit].m_uBit); break; + case RY: aEndPts[1].A.r |= 1 << uint32_t(desc[uCurBit].m_uBit); break; + case RZ: aEndPts[1].B.r |= 1 << uint32_t(desc[uCurBit].m_uBit); break; + case GW: aEndPts[0].A.g |= 1 << uint32_t(desc[uCurBit].m_uBit); break; + case GX: aEndPts[0].B.g |= 1 << uint32_t(desc[uCurBit].m_uBit); break; + case GY: aEndPts[1].A.g |= 1 << uint32_t(desc[uCurBit].m_uBit); break; + case GZ: aEndPts[1].B.g |= 1 << uint32_t(desc[uCurBit].m_uBit); break; + case BW: aEndPts[0].A.b |= 1 << uint32_t(desc[uCurBit].m_uBit); break; + case BX: aEndPts[0].B.b |= 1 << uint32_t(desc[uCurBit].m_uBit); break; + case BY: aEndPts[1].A.b |= 1 << uint32_t(desc[uCurBit].m_uBit); break; + case BZ: aEndPts[1].B.b |= 1 << uint32_t(desc[uCurBit].m_uBit); break; + default: + { +#ifdef _DEBUG + OutputDebugStringA( "BC6H: Invalid header bits encountered during decoding\n" ); +#endif + FillWithErrorColors( pOut ); + return; + } + } + } + } + + assert( uShape < 64 ); + __analysis_assume( uShape < 64 ); + + // Sign extend necessary end points + if(bSigned) + { + aEndPts[0].A.SignExtend(info.RGBAPrec[0][0]); + } + if(bSigned || info.bTransformed) + { + assert( info.uPartitions < BC6H_MAX_REGIONS ); + __analysis_assume( info.uPartitions < BC6H_MAX_REGIONS ); + for(size_t p = 0; p <= info.uPartitions; ++p) + { + if(p != 0) + { + aEndPts[p].A.SignExtend(info.RGBAPrec[p][0]); + } + aEndPts[p].B.SignExtend(info.RGBAPrec[p][1]); + } + } + + // Inverse transform the end points + if(info.bTransformed) + { + TransformInverse(aEndPts, info.RGBAPrec[0][0], bSigned); + } + + // Read indices + for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + size_t uNumBits = IsFixUpOffset(info.uPartitions, uShape, i) ? info.uIndexPrec-1 : info.uIndexPrec; + if ( uStartBit + uNumBits > 128 ) + { +#ifdef _DEBUG + OutputDebugStringA( "BC6H: Invalid block encountered during decoding\n" ); +#endif + FillWithErrorColors( pOut ); + return; + } + uint8_t uIndex = GetBits(uStartBit, uNumBits); + + if ( uIndex >= ((info.uPartitions > 0) ? 8 : 16) ) + { +#ifdef _DEBUG + OutputDebugStringA( "BC6H: Invalid index encountered during decoding\n" ); +#endif + FillWithErrorColors( pOut ); + return; + } + + size_t uRegion = g_aPartitionTable[info.uPartitions][uShape][i]; + assert( uRegion < BC6H_MAX_REGIONS ); + __analysis_assume( uRegion < BC6H_MAX_REGIONS ); + + // Unquantize endpoints and interpolate + int r1 = Unquantize(aEndPts[uRegion].A.r, info.RGBAPrec[0][0].r, bSigned); + int g1 = Unquantize(aEndPts[uRegion].A.g, info.RGBAPrec[0][0].g, bSigned); + int b1 = Unquantize(aEndPts[uRegion].A.b, info.RGBAPrec[0][0].b, bSigned); + int r2 = Unquantize(aEndPts[uRegion].B.r, info.RGBAPrec[0][0].r, bSigned); + int g2 = Unquantize(aEndPts[uRegion].B.g, info.RGBAPrec[0][0].g, bSigned); + int b2 = Unquantize(aEndPts[uRegion].B.b, info.RGBAPrec[0][0].b, bSigned); + const int* aWeights = info.uPartitions > 0 ? 
g_aWeights3 : g_aWeights4; + INTColor fc; + fc.r = FinishUnquantize((r1 * (BC67_WEIGHT_MAX - aWeights[uIndex]) + r2 * aWeights[uIndex] + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT, bSigned); + fc.g = FinishUnquantize((g1 * (BC67_WEIGHT_MAX - aWeights[uIndex]) + g2 * aWeights[uIndex] + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT, bSigned); + fc.b = FinishUnquantize((b1 * (BC67_WEIGHT_MAX - aWeights[uIndex]) + b2 * aWeights[uIndex] + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT, bSigned); + + HALF rgb[3]; + fc.ToF16(rgb, bSigned); + + pOut[i].r = XMConvertHalfToFloat( rgb[0] ); + pOut[i].g = XMConvertHalfToFloat( rgb[1] ); + pOut[i].b = XMConvertHalfToFloat( rgb[2] ); + pOut[i].a = 1.0f; + } + } + else + { +#ifdef _DEBUG + OutputDebugStringA( "BC6H: Invalid mode encountered during decoding\n" ); +#endif + FillWithErrorColors( pOut ); + } +} + +void D3DX_BC6H::Encode(bool bSigned, const HDRColorA* const pIn) +{ + assert( pIn ); + + EncodeParams EP(pIn, bSigned); + + for(EP.uMode = 0; EP.uMode < ARRAYSIZE(ms_aInfo) && EP.fBestErr > 0; ++EP.uMode) + { + const uint8_t uShapes = ms_aInfo[EP.uMode].uPartitions ? 32 : 1; + // Number of rough cases to look at. reasonable values of this are 1, uShapes/4, and uShapes + // uShapes/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out + const size_t uItems = std::max(1, uShapes >> 2); + float afRoughMSE[BC6H_MAX_SHAPES]; + uint8_t auShape[BC6H_MAX_SHAPES]; + + // pick the best uItems shapes and refine these. + for(EP.uShape = 0; EP.uShape < uShapes; ++EP.uShape) + { + size_t uShape = EP.uShape; + afRoughMSE[uShape] = RoughMSE(&EP); + auShape[uShape] = static_cast(uShape); + } + + // Bubble up the first uItems items + for(register size_t i = 0; i < uItems; i++) + { + for(register size_t j = i + 1; j < uShapes; j++) + { + if(afRoughMSE[i] > afRoughMSE[j]) + { + Swap(afRoughMSE[i], afRoughMSE[j]); + Swap(auShape[i], auShape[j]); + } + } + } + + for(size_t i = 0; i < uItems && EP.fBestErr > 0; i++) + { + EP.uShape = auShape[i]; + Refine(&EP); + } + } +} + + +//------------------------------------------------------------------------------------- +int D3DX_BC6H::Quantize(int iValue, int prec, bool bSigned) +{ + assert(prec > 1); // didn't bother to make it work for 1 + int q, s = 0; + if(bSigned) + { + assert(iValue >= -F16MAX && iValue <= F16MAX); + if(iValue < 0) + { + s = 1; + iValue = -iValue; + } + q = (prec >= 16) ? iValue : (iValue << (prec-1)) / (F16MAX+1); + if(s) + q = -q; + assert (q > -(1 << (prec-1)) && q < (1 << (prec-1))); + } + else + { + assert(iValue >= 0 && iValue <= F16MAX); + q = (prec >= 15) ? iValue : (iValue << prec) / (F16MAX+1); + assert (q >= 0 && q < (1 << prec)); + } + + return q; +} + +int D3DX_BC6H::Unquantize(int comp, uint8_t uBitsPerComp, bool bSigned) +{ + int unq = 0, s = 0; + if(bSigned) + { + if(uBitsPerComp >= 16) + { + unq = comp; + } + else + { + if(comp < 0) + { + s = 1; + comp = -comp; + } + + if(comp == 0) unq = 0; + else if(comp >= ((1 << (uBitsPerComp - 1)) - 1)) unq = 0x7FFF; + else unq = ((comp << 15) + 0x4000) >> (uBitsPerComp-1); + + if(s) unq = -unq; + } + } + else + { + if(uBitsPerComp >= 15) unq = comp; + else if(comp == 0) unq = 0; + else if(comp == ((1 << uBitsPerComp) - 1)) unq = 0xFFFF; + else unq = ((comp << 16) + 0x8000) >> uBitsPerComp; + } + + return unq; +} + +int D3DX_BC6H::FinishUnquantize(int comp, bool bSigned) +{ + if(bSigned) + { + return (comp < 0) ? 
-(((-comp) * 31) >> 5) : (comp * 31) >> 5; // scale the magnitude by 31/32 + } + else + { + return (comp * 31) >> 6; // scale the magnitude by 31/64 + } +} + + +//------------------------------------------------------------------------------------- +bool D3DX_BC6H::EndPointsFit(const EncodeParams* pEP, const INTEndPntPair aEndPts[]) +{ + assert( pEP ); + const bool bTransformed = ms_aInfo[pEP->uMode].bTransformed; + const bool bIsSigned = pEP->bSigned; + const LDRColorA& Prec0 = ms_aInfo[pEP->uMode].RGBAPrec[0][0]; + const LDRColorA& Prec1 = ms_aInfo[pEP->uMode].RGBAPrec[0][1]; + const LDRColorA& Prec2 = ms_aInfo[pEP->uMode].RGBAPrec[1][0]; + const LDRColorA& Prec3 = ms_aInfo[pEP->uMode].RGBAPrec[1][1]; + + INTColor aBits[4]; + aBits[0].r = NBits(aEndPts[0].A.r, bIsSigned); + aBits[0].g = NBits(aEndPts[0].A.g, bIsSigned); + aBits[0].b = NBits(aEndPts[0].A.b, bIsSigned); + aBits[1].r = NBits(aEndPts[0].B.r, bTransformed || bIsSigned); + aBits[1].g = NBits(aEndPts[0].B.g, bTransformed || bIsSigned); + aBits[1].b = NBits(aEndPts[0].B.b, bTransformed || bIsSigned); + if(aBits[0].r > Prec0.r || aBits[1].r > Prec1.r || + aBits[0].g > Prec0.g || aBits[1].g > Prec1.g || + aBits[0].b > Prec0.b || aBits[1].b > Prec1.b) + return false; + + if(ms_aInfo[pEP->uMode].uPartitions) + { + aBits[2].r = NBits(aEndPts[1].A.r, bTransformed || bIsSigned); + aBits[2].g = NBits(aEndPts[1].A.g, bTransformed || bIsSigned); + aBits[2].b = NBits(aEndPts[1].A.b, bTransformed || bIsSigned); + aBits[3].r = NBits(aEndPts[1].B.r, bTransformed || bIsSigned); + aBits[3].g = NBits(aEndPts[1].B.g, bTransformed || bIsSigned); + aBits[3].b = NBits(aEndPts[1].B.b, bTransformed || bIsSigned); + + if(aBits[2].r > Prec2.r || aBits[3].r > Prec3.r || + aBits[2].g > Prec2.g || aBits[3].g > Prec3.g || + aBits[2].b > Prec2.b || aBits[3].b > Prec3.b) + return false; + } + + return true; +} + +void D3DX_BC6H::GeneratePaletteQuantized(const EncodeParams* pEP, const INTEndPntPair& endPts, INTColor aPalette[]) const +{ + assert( pEP ); + const size_t uIndexPrec = ms_aInfo[pEP->uMode].uIndexPrec; + const size_t uNumIndices = 1 << uIndexPrec; + assert( uNumIndices > 0 ); + __analysis_assume( uNumIndices > 0 ); + const LDRColorA& Prec = ms_aInfo[pEP->uMode].RGBAPrec[0][0]; + + // scale endpoints + INTEndPntPair unqEndPts; + unqEndPts.A.r = Unquantize(endPts.A.r, Prec.r, pEP->bSigned); + unqEndPts.A.g = Unquantize(endPts.A.g, Prec.g, pEP->bSigned); + unqEndPts.A.b = Unquantize(endPts.A.b, Prec.b, pEP->bSigned); + unqEndPts.B.r = Unquantize(endPts.B.r, Prec.r, pEP->bSigned); + unqEndPts.B.g = Unquantize(endPts.B.g, Prec.g, pEP->bSigned); + unqEndPts.B.b = Unquantize(endPts.B.b, Prec.b, pEP->bSigned); + + // interpolate + const int* aWeights = nullptr; + switch(uIndexPrec) + { + case 3: aWeights = g_aWeights3; assert(uNumIndices <= 8); __analysis_assume(uNumIndices <= 8); break; + case 4: aWeights = g_aWeights4; assert(uNumIndices <= 16); __analysis_assume(uNumIndices <= 16); break; + default: assert(false); for(size_t i=0; i < uNumIndices; ++i) aPalette[i] = INTColor(0,0,0); return; + } + + for (size_t i = 0; i < uNumIndices; ++i) + { + aPalette[i].r = FinishUnquantize( + (unqEndPts.A.r * (BC67_WEIGHT_MAX - aWeights[i]) + unqEndPts.B.r * aWeights[i] + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT, + pEP->bSigned); + aPalette[i].g = FinishUnquantize( + (unqEndPts.A.g * (BC67_WEIGHT_MAX - aWeights[i]) + unqEndPts.B.g * aWeights[i] + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT, + pEP->bSigned); + aPalette[i].b = FinishUnquantize( + (unqEndPts.A.b * 
(BC67_WEIGHT_MAX - aWeights[i]) + unqEndPts.B.b * aWeights[i] + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT, + pEP->bSigned); + } +} + +// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr +float D3DX_BC6H::MapColorsQuantized(const EncodeParams* pEP, const INTColor aColors[], size_t np, const INTEndPntPair &endPts) const +{ + assert( pEP ); + + const uint8_t uIndexPrec = ms_aInfo[pEP->uMode].uIndexPrec; + const uint8_t uNumIndices = 1 << uIndexPrec; + INTColor aPalette[BC6H_MAX_INDICES]; + GeneratePaletteQuantized(pEP, endPts, aPalette); + + float fTotErr = 0; + for(size_t i = 0; i < np; ++i) + { + float fBestErr = Norm(aColors[i], aPalette[0]); + for(int j = 1; j < uNumIndices && fBestErr > 0; ++j) + { + float fErr = Norm(aColors[i], aPalette[j]); + if(fErr > fBestErr) break; // error increased, so we're done searching + if(fErr < fBestErr) fBestErr = fErr; + } + fTotErr += fBestErr; + } + return fTotErr; +} + +float D3DX_BC6H::PerturbOne(const EncodeParams* pEP, const INTColor aColors[], size_t np, uint8_t ch, + const INTEndPntPair& oldEndPts, INTEndPntPair& newEndPts, float fOldErr, int do_b) const +{ + assert( pEP ); + uint8_t uPrec; + switch(ch) + { + case 0: uPrec = ms_aInfo[pEP->uMode].RGBAPrec[0][0].r; break; + case 1: uPrec = ms_aInfo[pEP->uMode].RGBAPrec[0][0].g; break; + case 2: uPrec = ms_aInfo[pEP->uMode].RGBAPrec[0][0].b; break; + default: assert(false); newEndPts = oldEndPts; return FLT_MAX; + } + INTEndPntPair tmpEndPts; + float fMinErr = fOldErr; + int beststep = 0; + + // copy real endpoints so we can perturb them + tmpEndPts = newEndPts = oldEndPts; + + // do a logarithmic search for the best error for this endpoint (which) + for(int step = 1 << (uPrec-1); step; step >>= 1) + { + bool bImproved = false; + for(int sign = -1; sign <= 1; sign += 2) + { + if(do_b == 0) + { + tmpEndPts.A[ch] = newEndPts.A[ch] + sign * step; + if(tmpEndPts.A[ch] < 0 || tmpEndPts.A[ch] >= (1 << uPrec)) + continue; + } + else + { + tmpEndPts.B[ch] = newEndPts.B[ch] + sign * step; + if(tmpEndPts.B[ch] < 0 || tmpEndPts.B[ch] >= (1 << uPrec)) + continue; + } + + float fErr = MapColorsQuantized(pEP, aColors, np, tmpEndPts); + + if(fErr < fMinErr) + { + bImproved = true; + fMinErr = fErr; + beststep = sign * step; + } + } + // if this was an improvement, move the endpoint and continue search from there + if(bImproved) + { + if(do_b == 0) + newEndPts.A[ch] += beststep; + else + newEndPts.B[ch] += beststep; + } + } + return fMinErr; +} + +void D3DX_BC6H::OptimizeOne(const EncodeParams* pEP, const INTColor aColors[], size_t np, float aOrgErr, + const INTEndPntPair &aOrgEndPts, INTEndPntPair &aOptEndPts) const +{ + assert( pEP ); + float aOptErr = aOrgErr; + aOptEndPts.A = aOrgEndPts.A; + aOptEndPts.B = aOrgEndPts.B; + + INTEndPntPair new_a, new_b; + INTEndPntPair newEndPts; + int do_b; + + // now optimize each channel separately + for(uint8_t ch = 0; ch < 3; ++ch) + { + // figure out which endpoint when perturbed gives the most improvement and start there + // if we just alternate, we can easily end up in a local minima + float fErr0 = PerturbOne(pEP, aColors, np, ch, aOptEndPts, new_a, aOptErr, 0); // perturb endpt A + float fErr1 = PerturbOne(pEP, aColors, np, ch, aOptEndPts, new_b, aOptErr, 1); // perturb endpt B + + if(fErr0 < fErr1) + { + if(fErr0 >= aOptErr) continue; + aOptEndPts.A[ch] = new_a.A[ch]; + aOptErr = fErr0; + do_b = 1; // do B next + } + else + { + if(fErr1 >= aOptErr) continue; + aOptEndPts.B[ch] = new_b.B[ch]; + 
aOptErr = fErr1; + do_b = 0; // do A next + } + + // now alternate endpoints and keep trying until there is no improvement + for(;;) + { + float fErr = PerturbOne(pEP, aColors, np, ch, aOptEndPts, newEndPts, aOptErr, do_b); + if(fErr >= aOptErr) + break; + if(do_b == 0) + aOptEndPts.A[ch] = newEndPts.A[ch]; + else + aOptEndPts.B[ch] = newEndPts.B[ch]; + aOptErr = fErr; + do_b = 1 - do_b; // now move the other endpoint + } + } +} + +void D3DX_BC6H::OptimizeEndPoints(const EncodeParams* pEP, const float aOrgErr[], const INTEndPntPair aOrgEndPts[], INTEndPntPair aOptEndPts[]) const +{ + assert( pEP ); + const uint8_t uPartitions = ms_aInfo[pEP->uMode].uPartitions; + assert( uPartitions < BC6H_MAX_REGIONS ); + __analysis_assume( uPartitions < BC6H_MAX_REGIONS ); + INTColor aPixels[NUM_PIXELS_PER_BLOCK]; + + for(size_t p = 0; p <= uPartitions; ++p) + { + // collect the pixels in the region + size_t np = 0; + for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + if(g_aPartitionTable[p][pEP->uShape][i] == p) + { + aPixels[np++] = pEP->aIPixels[i]; + } + } + + OptimizeOne(pEP, aPixels, np, aOrgErr[p], aOrgEndPts[p], aOptEndPts[p]); + } +} + +// Swap endpoints as needed to ensure that the indices at fix up have a 0 high-order bit +void D3DX_BC6H::SwapIndices(const EncodeParams* pEP, INTEndPntPair aEndPts[], size_t aIndices[]) +{ + assert( pEP ); + const size_t uPartitions = ms_aInfo[pEP->uMode].uPartitions; + const size_t uNumIndices = 1 << ms_aInfo[pEP->uMode].uIndexPrec; + const size_t uHighIndexBit = uNumIndices >> 1; + + assert( uPartitions < BC6H_MAX_REGIONS && pEP->uShape < BC6H_MAX_SHAPES ); + __analysis_assume( uPartitions < BC6H_MAX_REGIONS && pEP->uShape < BC6H_MAX_SHAPES ); + + for(size_t p = 0; p <= uPartitions; ++p) + { + size_t i = g_aFixUp[uPartitions][pEP->uShape][p]; + assert(g_aPartitionTable[uPartitions][pEP->uShape][i] == p); + if(aIndices[i] & uHighIndexBit) + { + // high bit is set, swap the aEndPts and indices for this region + Swap(aEndPts[p].A, aEndPts[p].B); + + for(size_t j = 0; j < NUM_PIXELS_PER_BLOCK; ++j) + if(g_aPartitionTable[uPartitions][pEP->uShape][j] == p) + aIndices[j] = uNumIndices - 1 - aIndices[j]; + } + } +} + +// assign indices given a tile, shape, and quantized endpoints, return toterr for each region +void D3DX_BC6H::AssignIndices(const EncodeParams* pEP, const INTEndPntPair aEndPts[], size_t aIndices[], float aTotErr[]) const +{ + assert( pEP ); + const uint8_t uPartitions = ms_aInfo[pEP->uMode].uPartitions; + const uint8_t uNumIndices = 1 << ms_aInfo[pEP->uMode].uIndexPrec; + + assert( uPartitions < BC6H_MAX_REGIONS && pEP->uShape < BC6H_MAX_SHAPES ); + __analysis_assume( uPartitions < BC6H_MAX_REGIONS && pEP->uShape < BC6H_MAX_SHAPES ); + + // build list of possibles + INTColor aPalette[BC6H_MAX_REGIONS][BC6H_MAX_INDICES]; + + for(size_t p = 0; p <= uPartitions; ++p) + { + GeneratePaletteQuantized(pEP, aEndPts[p], aPalette[p]); + aTotErr[p] = 0; + } + + for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + const uint8_t uRegion = g_aPartitionTable[uPartitions][pEP->uShape][i]; + assert( uRegion < BC6H_MAX_REGIONS ); + __analysis_assume( uRegion < BC6H_MAX_REGIONS ); + float fBestErr = Norm(pEP->aIPixels[i], aPalette[uRegion][0]); + aIndices[i] = 0; + + for(uint8_t j = 1; j < uNumIndices && fBestErr > 0; ++j) + { + float fErr = Norm(pEP->aIPixels[i], aPalette[uRegion][j]); + if(fErr > fBestErr) break; // error increased, so we're done searching + if(fErr < fBestErr) + { + fBestErr = fErr; + aIndices[i] = j; + } + } + aTotErr[uRegion] += fBestErr; + } 
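+        // aIndices[] now holds, for each pixel, the palette entry with the lowest error norm,
+        // and aTotErr[] holds the per-region error sums that Refine() compares between the
+        // original and the optimized endpoints before emitting a block.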
+} + +void D3DX_BC6H::QuantizeEndPts(const EncodeParams* pEP, INTEndPntPair* aQntEndPts) const +{ + assert( pEP && aQntEndPts ); + const INTEndPntPair* aUnqEndPts = pEP->aUnqEndPts[pEP->uShape]; + const LDRColorA& Prec = ms_aInfo[pEP->uMode].RGBAPrec[0][0]; + const uint8_t uPartitions = ms_aInfo[pEP->uMode].uPartitions; + assert( uPartitions < BC6H_MAX_REGIONS ); + __analysis_assume( uPartitions < BC6H_MAX_REGIONS ); + + for(size_t p = 0; p <= uPartitions; ++p) + { + aQntEndPts[p].A.r = Quantize(aUnqEndPts[p].A.r, Prec.r, pEP->bSigned); + aQntEndPts[p].A.g = Quantize(aUnqEndPts[p].A.g, Prec.g, pEP->bSigned); + aQntEndPts[p].A.b = Quantize(aUnqEndPts[p].A.b, Prec.b, pEP->bSigned); + aQntEndPts[p].B.r = Quantize(aUnqEndPts[p].B.r, Prec.r, pEP->bSigned); + aQntEndPts[p].B.g = Quantize(aUnqEndPts[p].B.g, Prec.g, pEP->bSigned); + aQntEndPts[p].B.b = Quantize(aUnqEndPts[p].B.b, Prec.b, pEP->bSigned); + } +} + +void D3DX_BC6H::EmitBlock(const EncodeParams* pEP, const INTEndPntPair aEndPts[], const size_t aIndices[]) +{ + assert( pEP ); + const uint8_t uRealMode = ms_aInfo[pEP->uMode].uMode; + const uint8_t uPartitions = ms_aInfo[pEP->uMode].uPartitions; + const uint8_t uIndexPrec = ms_aInfo[pEP->uMode].uIndexPrec; + const size_t uHeaderBits = uPartitions > 0 ? 82 : 65; + const ModeDescriptor* desc = ms_aDesc[pEP->uMode]; + size_t uStartBit = 0; + + while(uStartBit < uHeaderBits) + { + switch(desc[uStartBit].m_eField) + { + case M: SetBit(uStartBit, uint8_t(uRealMode >> desc[uStartBit].m_uBit) & 0x01); break; + case D: SetBit(uStartBit, uint8_t(pEP->uShape >> desc[uStartBit].m_uBit) & 0x01); break; + case RW: SetBit(uStartBit, uint8_t(aEndPts[0].A.r >> desc[uStartBit].m_uBit) & 0x01); break; + case RX: SetBit(uStartBit, uint8_t(aEndPts[0].B.r >> desc[uStartBit].m_uBit) & 0x01); break; + case RY: SetBit(uStartBit, uint8_t(aEndPts[1].A.r >> desc[uStartBit].m_uBit) & 0x01); break; + case RZ: SetBit(uStartBit, uint8_t(aEndPts[1].B.r >> desc[uStartBit].m_uBit) & 0x01); break; + case GW: SetBit(uStartBit, uint8_t(aEndPts[0].A.g >> desc[uStartBit].m_uBit) & 0x01); break; + case GX: SetBit(uStartBit, uint8_t(aEndPts[0].B.g >> desc[uStartBit].m_uBit) & 0x01); break; + case GY: SetBit(uStartBit, uint8_t(aEndPts[1].A.g >> desc[uStartBit].m_uBit) & 0x01); break; + case GZ: SetBit(uStartBit, uint8_t(aEndPts[1].B.g >> desc[uStartBit].m_uBit) & 0x01); break; + case BW: SetBit(uStartBit, uint8_t(aEndPts[0].A.b >> desc[uStartBit].m_uBit) & 0x01); break; + case BX: SetBit(uStartBit, uint8_t(aEndPts[0].B.b >> desc[uStartBit].m_uBit) & 0x01); break; + case BY: SetBit(uStartBit, uint8_t(aEndPts[1].A.b >> desc[uStartBit].m_uBit) & 0x01); break; + case BZ: SetBit(uStartBit, uint8_t(aEndPts[1].B.b >> desc[uStartBit].m_uBit) & 0x01); break; + default: assert(false); + } + } + + for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + if(IsFixUpOffset(ms_aInfo[pEP->uMode].uPartitions, pEP->uShape, i)) + SetBits(uStartBit, uIndexPrec - 1, static_cast( aIndices[i] )); + else + SetBits(uStartBit, uIndexPrec, static_cast( aIndices[i] )); + } + assert(uStartBit == 128); +} + +void D3DX_BC6H::Refine(EncodeParams* pEP) +{ + assert( pEP ); + const uint8_t uPartitions = ms_aInfo[pEP->uMode].uPartitions; + assert( uPartitions < BC6H_MAX_REGIONS ); + __analysis_assume( uPartitions < BC6H_MAX_REGIONS ); + + const bool bTransformed = ms_aInfo[pEP->uMode].bTransformed; + float aOrgErr[BC6H_MAX_REGIONS], aOptErr[BC6H_MAX_REGIONS]; + INTEndPntPair aOrgEndPts[BC6H_MAX_REGIONS], aOptEndPts[BC6H_MAX_REGIONS]; + size_t 
aOrgIdx[NUM_PIXELS_PER_BLOCK], aOptIdx[NUM_PIXELS_PER_BLOCK]; + + QuantizeEndPts(pEP, aOrgEndPts); + AssignIndices(pEP, aOrgEndPts, aOrgIdx, aOrgErr); + SwapIndices(pEP, aOrgEndPts, aOrgIdx); + + if(bTransformed) TransformForward(aOrgEndPts); + if(EndPointsFit(pEP, aOrgEndPts)) + { + if(bTransformed) TransformInverse(aOrgEndPts, ms_aInfo[pEP->uMode].RGBAPrec[0][0], pEP->bSigned); + OptimizeEndPoints(pEP, aOrgErr, aOrgEndPts, aOptEndPts); + AssignIndices(pEP, aOptEndPts, aOptIdx, aOptErr); + SwapIndices(pEP, aOptEndPts, aOptIdx); + + float fOrgTotErr = 0.0f, fOptTotErr = 0.0f; + for(size_t p = 0; p <= uPartitions; ++p) + { + fOrgTotErr += aOrgErr[p]; + fOptTotErr += aOptErr[p]; + } + + if(bTransformed) TransformForward(aOptEndPts); + if(EndPointsFit(pEP, aOptEndPts) && fOptTotErr < fOrgTotErr && fOptTotErr < pEP->fBestErr) + { + pEP->fBestErr = fOptTotErr; + EmitBlock(pEP, aOptEndPts, aOptIdx); + } + else if(fOrgTotErr < pEP->fBestErr) + { + // either it stopped fitting when we optimized it, or there was no improvement + // so go back to the unoptimized endpoints which we know will fit + if(bTransformed) TransformForward(aOrgEndPts); + pEP->fBestErr = fOrgTotErr; + EmitBlock(pEP, aOrgEndPts, aOrgIdx); + } + } +} + +void D3DX_BC6H::GeneratePaletteUnquantized(const EncodeParams* pEP, size_t uRegion, INTColor aPalette[]) +{ + assert( pEP ); + assert( uRegion < BC6H_MAX_REGIONS && pEP->uShape < BC6H_MAX_SHAPES ); + __analysis_assume( uRegion < BC6H_MAX_REGIONS && pEP->uShape < BC6H_MAX_SHAPES ); + const INTEndPntPair& endPts = pEP->aUnqEndPts[pEP->uShape][uRegion]; + const uint8_t uIndexPrec = ms_aInfo[pEP->uMode].uIndexPrec; + const uint8_t uNumIndices = 1 << uIndexPrec; + assert( uNumIndices > 0 ); + __analysis_assume( uNumIndices > 0 ); + + const int* aWeights = nullptr; + switch(uIndexPrec) + { + case 3: aWeights = g_aWeights3; assert(uNumIndices <= 8); __analysis_assume(uNumIndices <= 8); break; + case 4: aWeights = g_aWeights4; assert(uNumIndices <= 16); __analysis_assume(uNumIndices <= 16); break; + default: assert(false); for(size_t i = 0; i < uNumIndices; ++i) aPalette[i] = INTColor(0,0,0); return; + } + + for(register size_t i = 0; i < uNumIndices; ++i) + { + aPalette[i].r = (endPts.A.r * (BC67_WEIGHT_MAX - aWeights[i]) + endPts.B.r * aWeights[i] + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT; + aPalette[i].g = (endPts.A.g * (BC67_WEIGHT_MAX - aWeights[i]) + endPts.B.g * aWeights[i] + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT; + aPalette[i].b = (endPts.A.b * (BC67_WEIGHT_MAX - aWeights[i]) + endPts.B.b * aWeights[i] + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT; + } +} + +float D3DX_BC6H::MapColors(const EncodeParams* pEP, size_t uRegion, size_t np, const size_t* auIndex) const +{ + assert( pEP ); + const uint8_t uIndexPrec = ms_aInfo[pEP->uMode].uIndexPrec; + const uint8_t uNumIndices = 1 << uIndexPrec; + INTColor aPalette[BC6H_MAX_INDICES]; + GeneratePaletteUnquantized(pEP, uRegion, aPalette); + + float fTotalErr = 0.0f; + for(size_t i = 0; i < np; ++i) + { + float fBestErr = Norm(pEP->aIPixels[auIndex[i]], aPalette[0]); + for(uint8_t j = 1; j < uNumIndices && fBestErr > 0.0f; ++j) + { + float fErr = Norm(pEP->aIPixels[auIndex[i]], aPalette[j]); + if(fErr > fBestErr) break; // error increased, so we're done searching + if(fErr < fBestErr) fBestErr = fErr; + } + fTotalErr += fBestErr; + } + + return fTotalErr; +} + +float D3DX_BC6H::RoughMSE(EncodeParams* pEP) const +{ + assert( pEP ); + assert( pEP->uShape < BC6H_MAX_SHAPES); + __analysis_assume( pEP->uShape < BC6H_MAX_SHAPES); + + 
INTEndPntPair* aEndPts = pEP->aUnqEndPts[pEP->uShape]; + + const uint8_t uPartitions = ms_aInfo[pEP->uMode].uPartitions; + assert( uPartitions < BC6H_MAX_REGIONS ); + __analysis_assume( uPartitions < BC6H_MAX_REGIONS ); + + size_t auPixIdx[NUM_PIXELS_PER_BLOCK]; + + float fError = 0.0f; + for(size_t p = 0; p <= uPartitions; ++p) + { + size_t np = 0; + for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + if(g_aPartitionTable[uPartitions][pEP->uShape][i] == p) + { + auPixIdx[np++] = i; + } + } + + // handle simple cases + assert(np > 0); + if(np == 1) + { + aEndPts[p].A = pEP->aIPixels[auPixIdx[0]]; + aEndPts[p].B = pEP->aIPixels[auPixIdx[0]]; + continue; + } + else if(np == 2) + { + aEndPts[p].A = pEP->aIPixels[auPixIdx[0]]; + aEndPts[p].B = pEP->aIPixels[auPixIdx[1]]; + continue; + } + + HDRColorA epA, epB; + OptimizeRGB(pEP->aHDRPixels, &epA, &epB, 4, np, auPixIdx); + aEndPts[p].A.Set(epA, pEP->bSigned); + aEndPts[p].B.Set(epB, pEP->bSigned); + if(pEP->bSigned) + { + aEndPts[p].A.Clamp(-F16MAX, F16MAX); + aEndPts[p].B.Clamp(-F16MAX, F16MAX); + } + else + { + aEndPts[p].A.Clamp(0, F16MAX); + aEndPts[p].B.Clamp(0, F16MAX); + } + + fError += MapColors(pEP, p, np, auPixIdx); + } + + return fError; +} + + + +//------------------------------------------------------------------------------------- +// BC7 Compression +//------------------------------------------------------------------------------------- +void D3DX_BC7::Decode(HDRColorA* pOut) const +{ + assert( pOut ); + + size_t uFirst = 0; + while(uFirst < 128 && !GetBit(uFirst)) {} + uint8_t uMode = uint8_t(uFirst - 1); + + if(uMode < 8) + { + const uint8_t uPartitions = ms_aInfo[uMode].uPartitions; + assert( uPartitions < BC7_MAX_REGIONS ); + __analysis_assume( uPartitions < BC7_MAX_REGIONS ); + + const uint8_t uNumEndPts = (uPartitions + 1) << 1; + const uint8_t uIndexPrec = ms_aInfo[uMode].uIndexPrec; + const uint8_t uIndexPrec2 = ms_aInfo[uMode].uIndexPrec2; + register size_t i; + size_t uStartBit = uMode + 1; + uint8_t P[6]; + uint8_t uShape = GetBits(uStartBit, ms_aInfo[uMode].uPartitionBits); + assert( uShape < BC7_MAX_SHAPES ); + __analysis_assume( uShape < BC7_MAX_SHAPES ); + + uint8_t uRotation = GetBits(uStartBit, ms_aInfo[uMode].uRotationBits); + assert( uRotation < 4 ); + + uint8_t uIndexMode = GetBits(uStartBit, ms_aInfo[uMode].uIndexModeBits); + assert( uIndexMode < 2 ); + + LDRColorA c[BC7_MAX_REGIONS << 1]; + const LDRColorA RGBAPrec = ms_aInfo[uMode].RGBAPrec; + const LDRColorA RGBAPrecWithP = ms_aInfo[uMode].RGBAPrecWithP; + + assert( uNumEndPts <= (BC7_MAX_REGIONS << 1) ); + + // Red channel + for(i = 0; i < uNumEndPts; i++) + { + if ( uStartBit + RGBAPrec.r > 128 ) + { +#ifdef _DEBUG + OutputDebugStringA( "BC7: Invalid block encountered during decoding\n" ); +#endif + FillWithErrorColors( pOut ); + return; + } + + c[i].r = GetBits(uStartBit, RGBAPrec.r); + } + + // Green channel + for(i = 0; i < uNumEndPts; i++) + { + if ( uStartBit + RGBAPrec.g > 128 ) + { +#ifdef _DEBUG + OutputDebugStringA( "BC7: Invalid block encountered during decoding\n" ); +#endif + FillWithErrorColors( pOut ); + return; + } + + c[i].g = GetBits(uStartBit, RGBAPrec.g); + } + + // Blue channel + for(i = 0; i < uNumEndPts; i++) + { + if ( uStartBit + RGBAPrec.b > 128 ) + { +#ifdef _DEBUG + OutputDebugStringA( "BC7: Invalid block encountered during decoding\n" ); +#endif + FillWithErrorColors( pOut ); + return; + } + + c[i].b = GetBits(uStartBit, RGBAPrec.b); + } + + // Alpha channel + for(i = 0; i < uNumEndPts; i++) + { + if ( uStartBit + 
RGBAPrec.a > 128 ) + { +#ifdef _DEBUG + OutputDebugStringA( "BC7: Invalid block encountered during decoding\n" ); +#endif + FillWithErrorColors( pOut ); + return; + } + + c[i].a = RGBAPrec.a ? GetBits(uStartBit, RGBAPrec.a) : 255; + } + + // P-bits + assert( ms_aInfo[uMode].uPBits <= 6 ); + __analysis_assume( ms_aInfo[uMode].uPBits <= 6 ); + for(i = 0; i < ms_aInfo[uMode].uPBits; i++) + { + if ( uStartBit > 127 ) + { +#ifdef _DEBUG + OutputDebugStringA( "BC7: Invalid block encountered during decoding\n" ); +#endif + FillWithErrorColors( pOut ); + return; + } + + P[i] = GetBit(uStartBit); + } + + if(ms_aInfo[uMode].uPBits) + { + for(i = 0; i < uNumEndPts; i++) + { + size_t pi = i * ms_aInfo[uMode].uPBits / uNumEndPts; + for(register uint8_t ch = 0; ch < BC7_NUM_CHANNELS; ch++) + { + if(RGBAPrec[ch] != RGBAPrecWithP[ch]) + { + c[i][ch] = (c[i][ch] << 1) | P[pi]; + } + } + } + } + + for(i = 0; i < uNumEndPts; i++) + { + c[i] = Unquantize(c[i], RGBAPrecWithP); + } + + uint8_t w1[NUM_PIXELS_PER_BLOCK], w2[NUM_PIXELS_PER_BLOCK]; + + // read color indices + for(i = 0; i < NUM_PIXELS_PER_BLOCK; i++) + { + size_t uNumBits = IsFixUpOffset(ms_aInfo[uMode].uPartitions, uShape, i) ? uIndexPrec - 1 : uIndexPrec; + if ( uStartBit + uNumBits > 128 ) + { +#ifdef _DEBUG + OutputDebugStringA( "BC7: Invalid block encountered during decoding\n" ); +#endif + FillWithErrorColors( pOut ); + return; + } + w1[i] = GetBits(uStartBit, uNumBits); + } + + // read alpha indices + if(uIndexPrec2) + { + for(i = 0; i < NUM_PIXELS_PER_BLOCK; i++) + { + size_t uNumBits = i ? uIndexPrec2 : uIndexPrec2 - 1; + if ( uStartBit + uNumBits > 128 ) + { +#ifdef _DEBUG + OutputDebugStringA( "BC7: Invalid block encountered during decoding\n" ); +#endif + FillWithErrorColors( pOut ); + return; + } + w2[i] = GetBits(uStartBit, uNumBits ); + } + } + + for(i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + uint8_t uRegion = g_aPartitionTable[uPartitions][uShape][i]; + LDRColorA outPixel; + if(uIndexPrec2 == 0) + { + LDRColorA::Interpolate(c[uRegion << 1], c[(uRegion << 1) + 1], w1[i], w1[i], uIndexPrec, uIndexPrec, outPixel); + } + else + { + if(uIndexMode == 0) + { + LDRColorA::Interpolate(c[uRegion << 1], c[(uRegion << 1) + 1], w1[i], w2[i], uIndexPrec, uIndexPrec2, outPixel); + } + else + { + LDRColorA::Interpolate(c[uRegion << 1], c[(uRegion << 1) + 1], w2[i], w1[i], uIndexPrec2, uIndexPrec, outPixel); + } + } + + switch(uRotation) + { + case 1: Swap(outPixel.r, outPixel.a); break; + case 2: Swap(outPixel.g, outPixel.a); break; + case 3: Swap(outPixel.b, outPixel.a); break; + } + + pOut[i] = HDRColorA(outPixel); + } + } + else + { +#ifdef _DEBUG + OutputDebugStringA( "BC7: Invalid mode encountered during decoding\n" ); +#endif + FillWithErrorColors( pOut ); + } +} + +void D3DX_BC7::Encode(const HDRColorA* const pIn) +{ + assert( pIn ); + + D3DX_BC7 final = *this; + EncodeParams EP(pIn); + float fMSEBest = FLT_MAX; + + for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + EP.aLDRPixels[i].r = uint8_t( std::max( 0.0f, std::min( 255.0f, pIn[i].r * 255.0f + 0.01f ) ) ); + EP.aLDRPixels[i].g = uint8_t( std::max( 0.0f, std::min( 255.0f, pIn[i].g * 255.0f + 0.01f ) ) ); + EP.aLDRPixels[i].b = uint8_t( std::max( 0.0f, std::min( 255.0f, pIn[i].b * 255.0f + 0.01f ) ) ); + EP.aLDRPixels[i].a = uint8_t( std::max( 0.0f, std::min( 255.0f, pIn[i].a * 255.0f + 0.01f ) ) ); + } + + for(EP.uMode = 0; EP.uMode < 8 && fMSEBest > 0; ++EP.uMode) + { + const size_t uShapes = 1 << ms_aInfo[EP.uMode].uPartitionBits; + assert( uShapes <= BC7_MAX_SHAPES ); + 
__analysis_assume( uShapes <= BC7_MAX_SHAPES ); + + const size_t uNumRots = 1 << ms_aInfo[EP.uMode].uRotationBits; + const size_t uNumIdxMode = 1 << ms_aInfo[EP.uMode].uIndexModeBits; + // Number of rough cases to look at. reasonable values of this are 1, uShapes/4, and uShapes + // uShapes/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out + const size_t uItems = std::max(1, uShapes >> 2); + float afRoughMSE[BC7_MAX_SHAPES]; + size_t auShape[BC7_MAX_SHAPES]; + + for(size_t r = 0; r < uNumRots && fMSEBest > 0; ++r) + { + switch(r) + { + case 1: for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; i++) Swap(EP.aLDRPixels[i].r, EP.aLDRPixels[i].a); break; + case 2: for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; i++) Swap(EP.aLDRPixels[i].g, EP.aLDRPixels[i].a); break; + case 3: for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; i++) Swap(EP.aLDRPixels[i].b, EP.aLDRPixels[i].a); break; + } + + for(size_t im = 0; im < uNumIdxMode && fMSEBest > 0; ++im) + { + // pick the best uItems shapes and refine these. + for(size_t s = 0; s < uShapes; s++) + { + afRoughMSE[s] = RoughMSE(&EP, s, im); + auShape[s] = s; + } + + // Bubble up the first uItems items + for(size_t i = 0; i < uItems; i++) + { + for(size_t j = i + 1; j < uShapes; j++) + { + if(afRoughMSE[i] > afRoughMSE[j]) + { + Swap(afRoughMSE[i], afRoughMSE[j]); + Swap(auShape[i], auShape[j]); + } + } + } + + for(size_t i = 0; i < uItems && fMSEBest > 0; i++) + { + float fMSE = Refine(&EP, auShape[i], r, im); + if(fMSE < fMSEBest) + { + final = *this; + fMSEBest = fMSE; + } + } + } + + switch(r) + { + case 1: for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; i++) Swap(EP.aLDRPixels[i].r, EP.aLDRPixels[i].a); break; + case 2: for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; i++) Swap(EP.aLDRPixels[i].g, EP.aLDRPixels[i].a); break; + case 3: for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; i++) Swap(EP.aLDRPixels[i].b, EP.aLDRPixels[i].a); break; + } + } + } + + *this = final; +} + + +//------------------------------------------------------------------------------------- +void D3DX_BC7::GeneratePaletteQuantized(const EncodeParams* pEP, size_t uIndexMode, const LDREndPntPair& endPts, LDRColorA aPalette[]) const +{ + assert( pEP ); + const size_t uIndexPrec = uIndexMode ? ms_aInfo[pEP->uMode].uIndexPrec2 : ms_aInfo[pEP->uMode].uIndexPrec; + const size_t uIndexPrec2 = uIndexMode ? 
ms_aInfo[pEP->uMode].uIndexPrec : ms_aInfo[pEP->uMode].uIndexPrec2; + const size_t uNumIndices = 1 << uIndexPrec; + const size_t uNumIndices2 = 1 << uIndexPrec2; + assert( uNumIndices > 0 && uNumIndices2 > 0 ); + __analysis_assume( uNumIndices > 0 && uNumIndices2 > 0 ); + assert( (uNumIndices <= BC7_MAX_INDICES) && (uNumIndices2 <= BC7_MAX_INDICES) ); + __analysis_assume( (uNumIndices <= BC7_MAX_INDICES) && (uNumIndices2 <= BC7_MAX_INDICES) ); + + LDRColorA a = Unquantize(endPts.A, ms_aInfo[pEP->uMode].RGBAPrecWithP); + LDRColorA b = Unquantize(endPts.B, ms_aInfo[pEP->uMode].RGBAPrecWithP); + if(uIndexPrec2 == 0) + { + for(register size_t i = 0; i < uNumIndices; i++) + LDRColorA::Interpolate(a, b, i, i, uIndexPrec, uIndexPrec, aPalette[i]); + } + else + { + for(register size_t i = 0; i < uNumIndices; i++) + LDRColorA::InterpolateRGB(a, b, i, uIndexPrec, aPalette[i]); + for(register size_t i = 0; i < uNumIndices2; i++) + LDRColorA::InterpolateA(a, b, i, uIndexPrec2, aPalette[i]); + } +} + +float D3DX_BC7::PerturbOne(const EncodeParams* pEP, const LDRColorA aColors[], size_t np, size_t uIndexMode, size_t ch, + const LDREndPntPair &oldEndPts, LDREndPntPair &newEndPts, float fOldErr, uint8_t do_b) const +{ + assert( pEP ); + const int prec = ms_aInfo[pEP->uMode].RGBAPrecWithP[ch]; + LDREndPntPair tmp_endPts = newEndPts = oldEndPts; + float fMinErr = fOldErr; + uint8_t* pnew_c = (do_b ? &newEndPts.B[ch] : &newEndPts.A[ch]); + uint8_t* ptmp_c = (do_b ? &tmp_endPts.B[ch] : &tmp_endPts.A[ch]); + + // do a logarithmic search for the best error for this endpoint (which) + for(int step = 1 << (prec-1); step; step >>= 1) + { + bool bImproved = false; + int beststep = 0; + for(int sign = -1; sign <= 1; sign += 2) + { + int tmp = int(*pnew_c) + sign * step; + if(tmp < 0 || tmp >= (1 << prec)) + continue; + else + *ptmp_c = (uint8_t) tmp; + + float fTotalErr = MapColors(pEP, aColors, np, uIndexMode, tmp_endPts, fMinErr); + if(fTotalErr < fMinErr) + { + bImproved = true; + fMinErr = fTotalErr; + beststep = sign * step; + } + } + + // if this was an improvement, move the endpoint and continue search from there + if(bImproved) + *pnew_c = uint8_t(int(*pnew_c) + beststep); + } + return fMinErr; +} + +// perturb the endpoints at least -3 to 3. 
+// always ensure endpoint ordering is preserved (no need to overlap the scan) +void D3DX_BC7::Exhaustive(const EncodeParams* pEP, const LDRColorA aColors[], size_t np, size_t uIndexMode, size_t ch, + float& fOrgErr, LDREndPntPair& optEndPt) const +{ + assert( pEP ); + const uint8_t uPrec = ms_aInfo[pEP->uMode].RGBAPrecWithP[ch]; + LDREndPntPair tmpEndPt; + if(fOrgErr == 0) + return; + + int delta = 5; + + // ok figure out the range of A and B + tmpEndPt = optEndPt; + int alow = std::max(0, int(optEndPt.A[ch]) - delta); + int ahigh = std::min((1 << uPrec) - 1, int(optEndPt.A[ch]) + delta); + int blow = std::max(0, int(optEndPt.B[ch]) - delta); + int bhigh = std::min((1 << uPrec) - 1, int(optEndPt.B[ch]) + delta); + int amin = 0; + int bmin = 0; + + float fBestErr = fOrgErr; + if(optEndPt.A[ch] <= optEndPt.B[ch]) + { + // keep a <= b + for(int a = alow; a <= ahigh; ++a) + { + for(int b = std::max(a, blow); b < bhigh; ++b) + { + tmpEndPt.A[ch] = (uint8_t) a; + tmpEndPt.B[ch] = (uint8_t) b; + + float fErr = MapColors(pEP, aColors, np, uIndexMode, tmpEndPt, fBestErr); + if(fErr < fBestErr) + { + amin = a; + bmin = b; + fBestErr = fErr; + } + } + } + } + else + { + // keep b <= a + for(int b = blow; b < bhigh; ++b) + { + for(int a = std::max(b, alow); a <= ahigh; ++a) + { + tmpEndPt.A[ch] = (uint8_t) a; + tmpEndPt.B[ch] = (uint8_t) b; + + float fErr = MapColors(pEP, aColors, np, uIndexMode, tmpEndPt, fBestErr); + if(fErr < fBestErr) + { + amin = a; + bmin = b; + fBestErr = fErr; + } + } + } + } + + if(fBestErr < fOrgErr) + { + optEndPt.A[ch] = (uint8_t) amin; + optEndPt.B[ch] = (uint8_t) bmin; + fOrgErr = fBestErr; + } +} + +void D3DX_BC7::OptimizeOne(const EncodeParams* pEP, const LDRColorA aColors[], size_t np, size_t uIndexMode, + float fOrgErr, const LDREndPntPair& org, LDREndPntPair& opt) const +{ + assert( pEP ); + + float fOptErr = fOrgErr; + opt = org; + + LDREndPntPair new_a, new_b; + LDREndPntPair newEndPts; + uint8_t do_b; + + // now optimize each channel separately + for(size_t ch = 0; ch < BC7_NUM_CHANNELS; ++ch) + { + if(ms_aInfo[pEP->uMode].RGBAPrecWithP[ch] == 0) + continue; + + // figure out which endpoint when perturbed gives the most improvement and start there + // if we just alternate, we can easily end up in a local minima + float fErr0 = PerturbOne(pEP, aColors, np, uIndexMode, ch, opt, new_a, fOptErr, 0); // perturb endpt A + float fErr1 = PerturbOne(pEP, aColors, np, uIndexMode, ch, opt, new_b, fOptErr, 1); // perturb endpt B + + uint8_t& copt_a = opt.A[ch]; + uint8_t& copt_b = opt.B[ch]; + uint8_t& cnew_a = new_a.A[ch]; + uint8_t& cnew_b = new_a.B[ch]; + + if(fErr0 < fErr1) + { + if(fErr0 >= fOptErr) + continue; + copt_a = cnew_a; + fOptErr = fErr0; + do_b = 1; // do B next + } + else + { + if(fErr1 >= fOptErr) + continue; + copt_b = cnew_b; + fOptErr = fErr1; + do_b = 0; // do A next + } + + // now alternate endpoints and keep trying until there is no improvement + for( ; ; ) + { + float fErr = PerturbOne(pEP, aColors, np, uIndexMode, ch, opt, newEndPts, fOptErr, do_b); + if(fErr >= fOptErr) + break; + if(do_b == 0) + copt_a = cnew_a; + else + copt_b = cnew_b; + fOptErr = fErr; + do_b = 1 - do_b; // now move the other endpoint + } + } + + // finally, do a small exhaustive search around what we think is the global minima to be sure + for(size_t ch = 0; ch < BC7_NUM_CHANNELS; ch++) + Exhaustive(pEP, aColors, np, uIndexMode, ch, fOptErr, opt); +} + +void D3DX_BC7::OptimizeEndPoints(const EncodeParams* pEP, size_t uShape, size_t uIndexMode, const float afOrgErr[], + const 
LDREndPntPair aOrgEndPts[], LDREndPntPair aOptEndPts[]) const +{ + assert( pEP ); + const uint8_t uPartitions = ms_aInfo[pEP->uMode].uPartitions; + assert( uPartitions < BC7_MAX_REGIONS && uShape < BC7_MAX_SHAPES ); + __analysis_assume( uPartitions < BC7_MAX_REGIONS && uShape < BC7_MAX_SHAPES ); + + LDRColorA aPixels[NUM_PIXELS_PER_BLOCK]; + + for(size_t p = 0; p <= uPartitions; ++p) + { + // collect the pixels in the region + size_t np = 0; + for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + if(g_aPartitionTable[uPartitions][uShape][i] == p) + aPixels[np++] = pEP->aLDRPixels[i]; + + OptimizeOne(pEP, aPixels, np, uIndexMode, afOrgErr[p], aOrgEndPts[p], aOptEndPts[p]); + } +} + +void D3DX_BC7::AssignIndices(const EncodeParams* pEP, size_t uShape, size_t uIndexMode, LDREndPntPair endPts[], size_t aIndices[], size_t aIndices2[], + float afTotErr[]) const +{ + assert( pEP ); + assert( uShape < BC7_MAX_SHAPES ); + __analysis_assume( uShape < BC7_MAX_SHAPES ); + + const uint8_t uPartitions = ms_aInfo[pEP->uMode].uPartitions; + assert( uPartitions < BC7_MAX_REGIONS ); + __analysis_assume( uPartitions < BC7_MAX_REGIONS ); + + const uint8_t uIndexPrec = uIndexMode ? ms_aInfo[pEP->uMode].uIndexPrec2 : ms_aInfo[pEP->uMode].uIndexPrec; + const uint8_t uIndexPrec2 = uIndexMode ? ms_aInfo[pEP->uMode].uIndexPrec : ms_aInfo[pEP->uMode].uIndexPrec2; + const uint8_t uNumIndices = 1 << uIndexPrec; + const uint8_t uNumIndices2 = 1 << uIndexPrec2; + + assert( (uNumIndices <= BC7_MAX_INDICES) && (uNumIndices2 <= BC7_MAX_INDICES) ); + __analysis_assume( (uNumIndices <= BC7_MAX_INDICES) && (uNumIndices2 <= BC7_MAX_INDICES) ); + + const uint8_t uHighestIndexBit = uNumIndices >> 1; + const uint8_t uHighestIndexBit2 = uNumIndices2 >> 1; + LDRColorA aPalette[BC7_MAX_REGIONS][BC7_MAX_INDICES]; + + // build list of possibles + LDREndPntPair adjusted_endPts; + for(size_t p = 0; p <= uPartitions; p++) + { + GeneratePaletteQuantized(pEP, uIndexMode, endPts[p], aPalette[p]); + afTotErr[p] = 0; + } + + for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; i++) + { + uint8_t uRegion = g_aPartitionTable[uPartitions][uShape][i]; + assert( uRegion < BC7_MAX_REGIONS ); + __analysis_assume( uRegion < BC7_MAX_REGIONS ); + afTotErr[uRegion] += ComputeError(pEP->aLDRPixels[i], aPalette[uRegion], uIndexPrec, uIndexPrec2, &(aIndices[i]), &(aIndices2[i])); + } + + // swap endpoints as needed to ensure that the indices at index_positions have a 0 high-order bit + if(uIndexPrec2 == 0) + { + for(register size_t p = 0; p <= uPartitions; p++) + { + if(aIndices[g_aFixUp[uPartitions][uShape][p]] & uHighestIndexBit) + { + Swap(endPts[p].A, endPts[p].B); + for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; i++) + if(g_aPartitionTable[uPartitions][uShape][i] == p) + aIndices[i] = uNumIndices - 1 - aIndices[i]; + } + assert((aIndices[g_aFixUp[uPartitions][uShape][p]] & uHighestIndexBit) == 0); + } + } + else + { + for(register size_t p = 0; p <= uPartitions; p++) + { + if(aIndices[g_aFixUp[uPartitions][uShape][p]] & uHighestIndexBit) + { + Swap(endPts[p].A.r, endPts[p].B.r); + Swap(endPts[p].A.g, endPts[p].B.g); + Swap(endPts[p].A.b, endPts[p].B.b); + for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; i++) + if(g_aPartitionTable[uPartitions][uShape][i] == p) + aIndices[i] = uNumIndices - 1 - aIndices[i]; + } + assert((aIndices[g_aFixUp[uPartitions][uShape][p]] & uHighestIndexBit) == 0); + + if(aIndices2[0] & uHighestIndexBit2) + { + Swap(endPts[p].A.a, endPts[p].B.a); + for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; i++) + 
aIndices2[i] = uNumIndices2 - 1 - aIndices2[i]; + } + assert((aIndices2[0] & uHighestIndexBit2) == 0); + } + } +} + +void D3DX_BC7::EmitBlock(const EncodeParams* pEP, size_t uShape, size_t uRotation, size_t uIndexMode, const LDREndPntPair aEndPts[], const size_t aIndex[], const size_t aIndex2[]) +{ + assert( pEP ); + const uint8_t uPartitions = ms_aInfo[pEP->uMode].uPartitions; + assert( uPartitions < BC7_MAX_REGIONS ); + __analysis_assume( uPartitions < BC7_MAX_REGIONS ); + + const size_t uPBits = ms_aInfo[pEP->uMode].uPBits; + const size_t uIndexPrec = ms_aInfo[pEP->uMode].uIndexPrec; + const size_t uIndexPrec2 = ms_aInfo[pEP->uMode].uIndexPrec2; + const LDRColorA RGBAPrec = ms_aInfo[pEP->uMode].RGBAPrec; + const LDRColorA RGBAPrecWithP = ms_aInfo[pEP->uMode].RGBAPrecWithP; + register size_t i; + size_t uStartBit = 0; + SetBits(uStartBit, pEP->uMode, 0); + SetBits(uStartBit, 1, 1); + SetBits(uStartBit, ms_aInfo[pEP->uMode].uRotationBits, static_cast( uRotation )); + SetBits(uStartBit, ms_aInfo[pEP->uMode].uIndexModeBits, static_cast( uIndexMode )); + SetBits(uStartBit, ms_aInfo[pEP->uMode].uPartitionBits, static_cast( uShape )); + + if(uPBits) + { + const size_t uNumEP = (1 + uPartitions) << 1; + uint8_t aPVote[BC7_MAX_REGIONS << 1] = {0,0,0,0,0,0}; + uint8_t aCount[BC7_MAX_REGIONS << 1] = {0,0,0,0,0,0}; + for(uint8_t ch = 0; ch < BC7_NUM_CHANNELS; ch++) + { + uint8_t ep = 0; + for(i = 0; i <= uPartitions; i++) + { + if(RGBAPrec[ch] == RGBAPrecWithP[ch]) + { + SetBits(uStartBit, RGBAPrec[ch], aEndPts[i].A[ch]); + SetBits(uStartBit, RGBAPrec[ch], aEndPts[i].B[ch]); + } + else + { + SetBits(uStartBit, RGBAPrec[ch], aEndPts[i].A[ch] >> 1); + SetBits(uStartBit, RGBAPrec[ch], aEndPts[i].B[ch] >> 1); + size_t idx = ep++ * uPBits / uNumEP; + assert(idx < (BC7_MAX_REGIONS << 1)); + __analysis_assume(idx < (BC7_MAX_REGIONS << 1)); + aPVote[idx] += aEndPts[i].A[ch] & 0x01; + aCount[idx]++; + idx = ep++ * uPBits / uNumEP; + assert(idx < (BC7_MAX_REGIONS << 1)); + __analysis_assume(idx < (BC7_MAX_REGIONS << 1)); + aPVote[idx] += aEndPts[i].B[ch] & 0x01; + aCount[idx]++; + } + } + } + + for(i = 0; i < uPBits; i++) + { + SetBits(uStartBit, 1, aPVote[i] > (aCount[i] >> 1) ? 1 : 0); + } + } + else + { + for(size_t ch = 0; ch < BC7_NUM_CHANNELS; ch++) + { + for(i = 0; i <= uPartitions; i++) + { + SetBits(uStartBit, RGBAPrec[ch], aEndPts[i].A[ch] ); + SetBits(uStartBit, RGBAPrec[ch], aEndPts[i].B[ch] ); + } + } + } + + const size_t* aI1 = uIndexMode ? aIndex2 : aIndex; + const size_t* aI2 = uIndexMode ? aIndex : aIndex2; + for(i = 0; i < NUM_PIXELS_PER_BLOCK; i++) + { + if(IsFixUpOffset(ms_aInfo[pEP->uMode].uPartitions, uShape, i)) + SetBits(uStartBit, uIndexPrec - 1, static_cast( aI1[i] )); + else + SetBits(uStartBit, uIndexPrec, static_cast( aI1[i] )); + } + if(uIndexPrec2) + for(i = 0; i < NUM_PIXELS_PER_BLOCK; i++) + SetBits(uStartBit, i ? 
uIndexPrec2 : uIndexPrec2 - 1, static_cast( aI2[i] )); + + assert(uStartBit == 128); +} + +float D3DX_BC7::Refine(const EncodeParams* pEP, size_t uShape, size_t uRotation, size_t uIndexMode) +{ + assert( pEP ); + assert( uShape < BC7_MAX_SHAPES ); + __analysis_assume( uShape < BC7_MAX_SHAPES ); + const LDREndPntPair* aEndPts = pEP->aEndPts[uShape]; + + const size_t uPartitions = ms_aInfo[pEP->uMode].uPartitions; + assert( uPartitions < BC7_MAX_REGIONS ); + __analysis_assume( uPartitions < BC7_MAX_REGIONS ); + + LDREndPntPair aOrgEndPts[BC7_MAX_REGIONS]; + LDREndPntPair aOptEndPts[BC7_MAX_REGIONS]; + size_t aOrgIdx[NUM_PIXELS_PER_BLOCK]; + size_t aOrgIdx2[NUM_PIXELS_PER_BLOCK]; + size_t aOptIdx[NUM_PIXELS_PER_BLOCK]; + size_t aOptIdx2[NUM_PIXELS_PER_BLOCK]; + float aOrgErr[BC7_MAX_REGIONS]; + float aOptErr[BC7_MAX_REGIONS]; + + for(register size_t p = 0; p <= uPartitions; p++) + { + aOrgEndPts[p].A = Quantize(aEndPts[p].A, ms_aInfo[pEP->uMode].RGBAPrecWithP); + aOrgEndPts[p].B = Quantize(aEndPts[p].B, ms_aInfo[pEP->uMode].RGBAPrecWithP); + } + + AssignIndices(pEP, uShape, uIndexMode, aOrgEndPts, aOrgIdx, aOrgIdx2, aOrgErr); + OptimizeEndPoints(pEP, uShape, uIndexMode, aOrgErr, aOrgEndPts, aOptEndPts); + AssignIndices(pEP, uShape, uIndexMode, aOptEndPts, aOptIdx, aOptIdx2, aOptErr); + + float fOrgTotErr = 0, fOptTotErr = 0; + for(register size_t p = 0; p <= uPartitions; p++) + { + fOrgTotErr += aOrgErr[p]; + fOptTotErr += aOptErr[p]; + } + if(fOptTotErr < fOrgTotErr) + { + EmitBlock(pEP, uShape, uRotation, uIndexMode, aOptEndPts, aOptIdx, aOptIdx2); + return fOptTotErr; + } + else + { + EmitBlock(pEP, uShape, uRotation, uIndexMode, aOrgEndPts, aOrgIdx, aOrgIdx2); + return fOrgTotErr; + } +} + +float D3DX_BC7::MapColors(const EncodeParams* pEP, const LDRColorA aColors[], size_t np, size_t uIndexMode, const LDREndPntPair& endPts, float fMinErr) const +{ + assert( pEP ); + const uint8_t uIndexPrec = uIndexMode ? ms_aInfo[pEP->uMode].uIndexPrec2 : ms_aInfo[pEP->uMode].uIndexPrec; + const uint8_t uIndexPrec2 = uIndexMode ? ms_aInfo[pEP->uMode].uIndexPrec : ms_aInfo[pEP->uMode].uIndexPrec2; + LDRColorA aPalette[BC7_MAX_INDICES]; + float fTotalErr = 0; + + GeneratePaletteQuantized(pEP, uIndexMode, endPts, aPalette); + for(register size_t i = 0; i < np; ++i) + { + fTotalErr += ComputeError(aColors[i], aPalette, uIndexPrec, uIndexPrec2); + if(fTotalErr > fMinErr) // check for early exit + { + fTotalErr = FLT_MAX; + break; + } + } + + return fTotalErr; +} + +float D3DX_BC7::RoughMSE(EncodeParams* pEP, size_t uShape, size_t uIndexMode) +{ + assert( pEP ); + assert( uShape < BC7_MAX_SHAPES ); + __analysis_assume( uShape < BC7_MAX_SHAPES ); + LDREndPntPair* aEndPts = pEP->aEndPts[uShape]; + + const uint8_t uPartitions = ms_aInfo[pEP->uMode].uPartitions; + assert( uPartitions < BC7_MAX_REGIONS ); + __analysis_assume( uPartitions < BC7_MAX_REGIONS ); + + const uint8_t uIndexPrec = uIndexMode ? ms_aInfo[pEP->uMode].uIndexPrec2 : ms_aInfo[pEP->uMode].uIndexPrec; + const uint8_t uIndexPrec2 = uIndexMode ? 
ms_aInfo[pEP->uMode].uIndexPrec : ms_aInfo[pEP->uMode].uIndexPrec2; + const uint8_t uNumIndices = 1 << uIndexPrec; + const uint8_t uNumIndices2 = 1 << uIndexPrec2; + size_t auPixIdx[NUM_PIXELS_PER_BLOCK]; + LDRColorA aPalette[BC7_MAX_REGIONS][BC7_MAX_INDICES]; + + for(size_t p = 0; p <= uPartitions; p++) + { + size_t np = 0; + for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; i++) + { + if (g_aPartitionTable[uPartitions][uShape][i] == p) + { + auPixIdx[np++] = i; + } + } + + // handle simple cases + assert(np > 0); + if(np == 1) + { + aEndPts[p].A = pEP->aLDRPixels[auPixIdx[0]]; + aEndPts[p].B = pEP->aLDRPixels[auPixIdx[0]]; + continue; + } + else if(np == 2) + { + aEndPts[p].A = pEP->aLDRPixels[auPixIdx[0]]; + aEndPts[p].B = pEP->aLDRPixels[auPixIdx[1]]; + continue; + } + + if(uIndexPrec2 == 0) + { + HDRColorA epA, epB; + OptimizeRGBA(pEP->aHDRPixels, &epA, &epB, 4, np, auPixIdx); + epA.Clamp(0.0f, 1.0f); + epB.Clamp(0.0f, 1.0f); + epA *= 255.0f; + epB *= 255.0f; + aEndPts[p].A = epA.ToLDRColorA(); + aEndPts[p].B = epB.ToLDRColorA(); + } + else + { + uint8_t uMinAlpha = 255, uMaxAlpha = 0; + for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + uMinAlpha = std::min(uMinAlpha, pEP->aLDRPixels[auPixIdx[i]].a); + uMaxAlpha = std::max(uMaxAlpha, pEP->aLDRPixels[auPixIdx[i]].a); + } + + HDRColorA epA, epB; + OptimizeRGB(pEP->aHDRPixels, &epA, &epB, 4, np, auPixIdx); + epA.Clamp(0.0f, 1.0f); + epB.Clamp(0.0f, 1.0f); + epA *= 255.0f; + epB *= 255.0f; + aEndPts[p].A = epA.ToLDRColorA(); + aEndPts[p].B = epB.ToLDRColorA(); + aEndPts[p].A.a = uMinAlpha; + aEndPts[p].B.a = uMaxAlpha; + } + } + + if(uIndexPrec2 == 0) + { + for(size_t p = 0; p <= uPartitions; p++) + for(register size_t i = 0; i < uNumIndices; i++) + LDRColorA::Interpolate(aEndPts[p].A, aEndPts[p].B, i, i, uIndexPrec, uIndexPrec, aPalette[p][i]); + } + else + { + for(size_t p = 0; p <= uPartitions; p++) + { + for(register size_t i = 0; i < uNumIndices; i++) + LDRColorA::InterpolateRGB(aEndPts[p].A, aEndPts[p].B, i, uIndexPrec, aPalette[p][i]); + for(register size_t i = 0; i < uNumIndices2; i++) + LDRColorA::InterpolateA(aEndPts[p].A, aEndPts[p].B, i, uIndexPrec2, aPalette[p][i]); + } + } + + float fTotalErr = 0; + for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; i++) + { + uint8_t uRegion = g_aPartitionTable[uPartitions][uShape][i]; + fTotalErr += ComputeError(pEP->aLDRPixels[i], aPalette[uRegion], uIndexPrec, uIndexPrec2); + } + + return fTotalErr; +} + +//===================================================================================== +// Entry points +//===================================================================================== + +//------------------------------------------------------------------------------------- +// BC6H Compression +//------------------------------------------------------------------------------------- +void D3DXDecodeBC6HU(XMVECTOR *pColor, const uint8_t *pBC) +{ + assert( pColor && pBC ); + static_assert( sizeof(D3DX_BC6H) == 16, "D3DX_BC6H should be 16 bytes" ); + reinterpret_cast< const D3DX_BC6H* >( pBC )->Decode(false, reinterpret_cast(pColor)); +} + +void D3DXDecodeBC6HS(XMVECTOR *pColor, const uint8_t *pBC) +{ + assert( pColor && pBC ); + static_assert( sizeof(D3DX_BC6H) == 16, "D3DX_BC6H should be 16 bytes" ); + reinterpret_cast< const D3DX_BC6H* >( pBC )->Decode(true, reinterpret_cast(pColor)); +} + +void D3DXEncodeBC6HU(uint8_t *pBC, const XMVECTOR *pColor, DWORD flags) +{ + UNREFERENCED_PARAMETER(flags); + assert( pBC && pColor ); + static_assert( sizeof(D3DX_BC6H) 
== 16, "D3DX_BC6H should be 16 bytes" ); + reinterpret_cast< D3DX_BC6H* >( pBC )->Encode(false, reinterpret_cast(pColor)); +} + +void D3DXEncodeBC6HS(uint8_t *pBC, const XMVECTOR *pColor, DWORD flags) +{ + UNREFERENCED_PARAMETER(flags); + assert( pBC && pColor ); + static_assert( sizeof(D3DX_BC6H) == 16, "D3DX_BC6H should be 16 bytes" ); + reinterpret_cast< D3DX_BC6H* >( pBC )->Encode(true, reinterpret_cast(pColor)); +} + + +//------------------------------------------------------------------------------------- +// BC7 Compression +//------------------------------------------------------------------------------------- +void D3DXDecodeBC7(XMVECTOR *pColor, const uint8_t *pBC) +{ + assert( pColor && pBC ); + static_assert( sizeof(D3DX_BC7) == 16, "D3DX_BC7 should be 16 bytes" ); + reinterpret_cast< const D3DX_BC7* >( pBC )->Decode(reinterpret_cast(pColor)); +} + +void D3DXEncodeBC7(uint8_t *pBC, const XMVECTOR *pColor, DWORD flags) +{ + UNREFERENCED_PARAMETER(flags); + assert( pBC && pColor ); + static_assert( sizeof(D3DX_BC7) == 16, "D3DX_BC7 should be 16 bytes" ); + reinterpret_cast< D3DX_BC7* >( pBC )->Encode(reinterpret_cast(pColor)); +} + +} // namespace \ No newline at end of file diff --git a/thirdparty/directxtex/DirectXTex/DDS.h b/thirdparty/directxtex/DirectXTex/DDS.h new file mode 100644 index 0000000..6e91395 --- /dev/null +++ b/thirdparty/directxtex/DirectXTex/DDS.h @@ -0,0 +1,214 @@ +//-------------------------------------------------------------------------------------- +// dds.h +// +// This header defines constants and structures that are useful when parsing +// DDS files. DDS files were originally designed to use several structures +// and constants that are native to DirectDraw and are defined in ddraw.h, +// such as DDSURFACEDESC2 and DDSCAPS2. This file defines similar +// (compatible) constants and structures so that one can use DDS files +// without needing to include ddraw.h. +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// +// http://go.microsoft.com/fwlink/?LinkId=248926 +//-------------------------------------------------------------------------------------- + +#if defined(_MSC_VER) && (_MSC_VER > 1000) +#pragma once +#endif + +#include + +#pragma warning(push) +#pragma warning(disable : 4005) +#include +#pragma warning(pop) + +namespace DirectX +{ + +#pragma pack(push,1) + +const uint32_t DDS_MAGIC = 0x20534444; // "DDS " + +struct DDS_PIXELFORMAT +{ + uint32_t dwSize; + uint32_t dwFlags; + uint32_t dwFourCC; + uint32_t dwRGBBitCount; + uint32_t dwRBitMask; + uint32_t dwGBitMask; + uint32_t dwBBitMask; + uint32_t dwABitMask; +}; + +#define DDS_FOURCC 0x00000004 // DDPF_FOURCC +#define DDS_RGB 0x00000040 // DDPF_RGB +#define DDS_RGBA 0x00000041 // DDPF_RGB | DDPF_ALPHAPIXELS +#define DDS_LUMINANCE 0x00020000 // DDPF_LUMINANCE +#define DDS_LUMINANCEA 0x00020001 // DDPF_LUMINANCE | DDPF_ALPHAPIXELS +#define DDS_ALPHA 0x00000002 // DDPF_ALPHA +#define DDS_PAL8 0x00000020 // DDPF_PALETTEINDEXED8 + +#ifndef MAKEFOURCC + #define MAKEFOURCC(ch0, ch1, ch2, ch3) \ + ((uint32_t)(uint8_t)(ch0) | ((uint32_t)(uint8_t)(ch1) << 8) | \ + ((uint32_t)(uint8_t)(ch2) << 16) | ((uint32_t)(uint8_t)(ch3) << 24 )) +#endif /* defined(MAKEFOURCC) */ + +extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_DXT1 = + { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC('D','X','T','1'), 0, 0, 0, 0, 0 }; + +extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_DXT2 = + { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC('D','X','T','2'), 0, 0, 0, 0, 0 }; + +extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_DXT3 = + { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC('D','X','T','3'), 0, 0, 0, 0, 0 }; + +extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_DXT4 = + { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC('D','X','T','4'), 0, 0, 0, 0, 0 }; + +extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_DXT5 = + { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC('D','X','T','5'), 0, 0, 0, 0, 0 }; + +extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_BC4_UNORM = + { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC('B','C','4','U'), 0, 0, 0, 0, 0 }; + +extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_BC4_SNORM = + { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC('B','C','4','S'), 0, 0, 0, 0, 0 }; + +extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_BC5_UNORM = + { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC('B','C','5','U'), 0, 0, 0, 0, 0 }; + +extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_BC5_SNORM = + { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC('B','C','5','S'), 0, 0, 0, 0, 0 }; + +extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_R8G8_B8G8 = + { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC('R','G','B','G'), 0, 0, 0, 0, 0 }; + +extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_G8R8_G8B8 = + { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC('G','R','G','B'), 0, 0, 0, 0, 0 }; + +extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_A8R8G8B8 = + { sizeof(DDS_PIXELFORMAT), DDS_RGBA, 0, 32, 0x00ff0000, 0x0000ff00, 0x000000ff, 0xff000000 }; + +extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_X8R8G8B8 = + { sizeof(DDS_PIXELFORMAT), DDS_RGB, 0, 32, 0x00ff0000, 0x0000ff00, 0x000000ff, 0x00000000 }; + +extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_A8B8G8R8 = + { sizeof(DDS_PIXELFORMAT), DDS_RGBA, 0, 32, 0x000000ff, 0x0000ff00, 0x00ff0000, 0xff000000 }; + +extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_X8B8G8R8 = + { sizeof(DDS_PIXELFORMAT), 
DDS_RGB, 0, 32, 0x000000ff, 0x0000ff00, 0x00ff0000, 0x00000000 }; + +extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_G16R16 = + { sizeof(DDS_PIXELFORMAT), DDS_RGB, 0, 32, 0x0000ffff, 0xffff0000, 0x00000000, 0x00000000 }; + +extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_R5G6B5 = + { sizeof(DDS_PIXELFORMAT), DDS_RGB, 0, 16, 0x0000f800, 0x000007e0, 0x0000001f, 0x00000000 }; + +extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_A1R5G5B5 = + { sizeof(DDS_PIXELFORMAT), DDS_RGBA, 0, 16, 0x00007c00, 0x000003e0, 0x0000001f, 0x00008000 }; + +extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_A4R4G4B4 = + { sizeof(DDS_PIXELFORMAT), DDS_RGBA, 0, 16, 0x00000f00, 0x000000f0, 0x0000000f, 0x0000f000 }; + +extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_R8G8B8 = + { sizeof(DDS_PIXELFORMAT), DDS_RGB, 0, 24, 0x00ff0000, 0x0000ff00, 0x000000ff, 0x00000000 }; + +extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_L8 = + { sizeof(DDS_PIXELFORMAT), DDS_LUMINANCE, 0, 8, 0xff, 0x00, 0x00, 0x00 }; + +extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_L16 = + { sizeof(DDS_PIXELFORMAT), DDS_LUMINANCE, 0, 16, 0xffff, 0x0000, 0x0000, 0x0000 }; + +extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_A8L8 = + { sizeof(DDS_PIXELFORMAT), DDS_LUMINANCEA, 0, 16, 0x00ff, 0x0000, 0x0000, 0xff00 }; + +extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_A8 = + { sizeof(DDS_PIXELFORMAT), DDS_ALPHA, 0, 8, 0x00, 0x00, 0x00, 0xff }; + +// D3DFMT_A2R10G10B10/D3DFMT_A2B10G10R10 should be written using DX10 extension to avoid D3DX 10:10:10:2 reversal issue + +// This indicates the DDS_HEADER_DXT10 extension is present (the format is in dxgiFormat) +extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_DX10 = + { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC('D','X','1','0'), 0, 0, 0, 0, 0 }; + +#define DDS_HEADER_FLAGS_TEXTURE 0x00001007 // DDSD_CAPS | DDSD_HEIGHT | DDSD_WIDTH | DDSD_PIXELFORMAT +#define DDS_HEADER_FLAGS_MIPMAP 0x00020000 // DDSD_MIPMAPCOUNT +#define DDS_HEADER_FLAGS_VOLUME 0x00800000 // DDSD_DEPTH +#define DDS_HEADER_FLAGS_PITCH 0x00000008 // DDSD_PITCH +#define DDS_HEADER_FLAGS_LINEARSIZE 0x00080000 // DDSD_LINEARSIZE + +#define DDS_HEIGHT 0x00000002 // DDSD_HEIGHT +#define DDS_WIDTH 0x00000004 // DDSD_WIDTH + +#define DDS_SURFACE_FLAGS_TEXTURE 0x00001000 // DDSCAPS_TEXTURE +#define DDS_SURFACE_FLAGS_MIPMAP 0x00400008 // DDSCAPS_COMPLEX | DDSCAPS_MIPMAP +#define DDS_SURFACE_FLAGS_CUBEMAP 0x00000008 // DDSCAPS_COMPLEX + +#define DDS_CUBEMAP_POSITIVEX 0x00000600 // DDSCAPS2_CUBEMAP | DDSCAPS2_CUBEMAP_POSITIVEX +#define DDS_CUBEMAP_NEGATIVEX 0x00000a00 // DDSCAPS2_CUBEMAP | DDSCAPS2_CUBEMAP_NEGATIVEX +#define DDS_CUBEMAP_POSITIVEY 0x00001200 // DDSCAPS2_CUBEMAP | DDSCAPS2_CUBEMAP_POSITIVEY +#define DDS_CUBEMAP_NEGATIVEY 0x00002200 // DDSCAPS2_CUBEMAP | DDSCAPS2_CUBEMAP_NEGATIVEY +#define DDS_CUBEMAP_POSITIVEZ 0x00004200 // DDSCAPS2_CUBEMAP | DDSCAPS2_CUBEMAP_POSITIVEZ +#define DDS_CUBEMAP_NEGATIVEZ 0x00008200 // DDSCAPS2_CUBEMAP | DDSCAPS2_CUBEMAP_NEGATIVEZ + +#define DDS_CUBEMAP_ALLFACES ( DDS_CUBEMAP_POSITIVEX | DDS_CUBEMAP_NEGATIVEX |\ + DDS_CUBEMAP_POSITIVEY | DDS_CUBEMAP_NEGATIVEY |\ + DDS_CUBEMAP_POSITIVEZ | DDS_CUBEMAP_NEGATIVEZ ) + +#define DDS_CUBEMAP 0x00000200 // DDSCAPS2_CUBEMAP + +#define DDS_FLAGS_VOLUME 0x00200000 // DDSCAPS2_VOLUME + +// Subset here matches D3D10_RESOURCE_DIMENSION and D3D11_RESOURCE_DIMENSION +typedef enum DDS_RESOURCE_DIMENSION +{ + DDS_DIMENSION_TEXTURE1D = 2, + DDS_DIMENSION_TEXTURE2D = 3, + DDS_DIMENSION_TEXTURE3D = 
4, +} DDS_RESOURCE_DIMENSION; + +// Subset here matches D3D10_RESOURCE_MISC_FLAG and D3D11_RESOURCE_MISC_FLAG +typedef enum DDS_RESOURCE_MISC_FLAG +{ + DDS_RESOURCE_MISC_TEXTURECUBE = 0x4L, +} DDS_RESOURCE_MISC_FLAG; + +typedef struct +{ + uint32_t dwSize; + uint32_t dwFlags; + uint32_t dwHeight; + uint32_t dwWidth; + uint32_t dwPitchOrLinearSize; + uint32_t dwDepth; // only if DDS_HEADER_FLAGS_VOLUME is set in dwFlags + uint32_t dwMipMapCount; + uint32_t dwReserved1[11]; + DDS_PIXELFORMAT ddspf; + uint32_t dwCaps; + uint32_t dwCaps2; + uint32_t dwCaps3; + uint32_t dwCaps4; + uint32_t dwReserved2; +} DDS_HEADER; + +typedef struct +{ + DXGI_FORMAT dxgiFormat; + uint32_t resourceDimension; + uint32_t miscFlag; // see DDS_RESOURCE_MISC_FLAG + uint32_t arraySize; + uint32_t reserved; +} DDS_HEADER_DXT10; + +#pragma pack(pop) + +}; // namespace diff --git a/thirdparty/directxtex/DirectXTex/DirectXTex.h b/thirdparty/directxtex/DirectXTex/DirectXTex.h new file mode 100644 index 0000000..c4d4b73 --- /dev/null +++ b/thirdparty/directxtex/DirectXTex/DirectXTex.h @@ -0,0 +1,466 @@ +//------------------------------------------------------------------------------------- +// DirectXTex.h +// +// DirectX Texture Library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkId=248926 +//------------------------------------------------------------------------------------- + +#if defined(_MSC_VER) && (_MSC_VER > 1000) +#pragma once +#endif + +#pragma warning(push) +#pragma warning(disable : 4005) +#include +#pragma warning(pop) + +#include + +#include +#include + +#define DIRECTX_TEX_VERSION 100 + +namespace DirectX +{ + //--------------------------------------------------------------------------------- + // DXGI Format Utilities + bool IsValid( _In_ DXGI_FORMAT fmt ); + bool IsCompressed( _In_ DXGI_FORMAT fmt ); + bool IsPacked( _In_ DXGI_FORMAT fmt ); + bool IsVideo( _In_ DXGI_FORMAT fmt ); + bool IsSRGB( _In_ DXGI_FORMAT fmt ); + bool IsTypeless( _In_ DXGI_FORMAT fmt ); + + size_t BitsPerPixel( _In_ DXGI_FORMAT fmt ); + + enum CP_FLAGS + { + CP_FLAGS_NONE = 0x0, // Normal operation + CP_FLAGS_LEGACY_DWORD = 0x1, // Assume pitch is DWORD aligned instead of BYTE aligned + CP_FLAGS_24BPP = 0x10000, // Override with a legacy 24 bits-per-pixel format size + CP_FLAGS_16BPP = 0x20000, // Override with a legacy 16 bits-per-pixel format size + CP_FLAGS_8BPP = 0x40000, // Override with a legacy 8 bits-per-pixel format size + }; + + void ComputePitch( _In_ DXGI_FORMAT fmt, _In_ size_t width, _In_ size_t height, + _Out_ size_t& rowPitch, _Out_ size_t& slicePitch, _In_ DWORD flags = CP_FLAGS_NONE ); + + size_t ComputeScanlines( _In_ DXGI_FORMAT fmt, _In_ size_t height ); + + DXGI_FORMAT MakeSRGB( _In_ DXGI_FORMAT fmt ); + DXGI_FORMAT MakeTypeless( _In_ DXGI_FORMAT fmt ); + DXGI_FORMAT MakeTypelessUNORM( _In_ DXGI_FORMAT fmt ); + DXGI_FORMAT MakeTypelessFLOAT( _In_ DXGI_FORMAT fmt ); + + //--------------------------------------------------------------------------------- + // Texture metadata + enum TEX_DIMENSION + // Subset here matches D3D10_RESOURCE_DIMENSION and D3D11_RESOURCE_DIMENSION + { + TEX_DIMENSION_TEXTURE1D = 2, + TEX_DIMENSION_TEXTURE2D = 3, + TEX_DIMENSION_TEXTURE3D = 4, + }; + + enum TEX_MISC_FLAG + // Subset here 
matches D3D10_RESOURCE_MISC_FLAG and D3D11_RESOURCE_MISC_FLAG + { + TEX_MISC_TEXTURECUBE = 0x4L, + }; + + struct TexMetadata + { + size_t width; + size_t height; // Should be 1 for 1D textures + size_t depth; // Should be 1 for 1D or 2D textures + size_t arraySize; // For cubemap, this is a multiple of 6 + size_t mipLevels; + uint32_t miscFlags; + DXGI_FORMAT format; + TEX_DIMENSION dimension; + + size_t ComputeIndex( _In_ size_t mip, _In_ size_t item, _In_ size_t slice ) const; + // Returns size_t(-1) to indicate an out-of-range error + }; + + enum DDS_FLAGS + { + DDS_FLAGS_NONE = 0x0, + + DDS_FLAGS_LEGACY_DWORD = 0x1, + // Assume pitch is DWORD aligned instead of BYTE aligned (used by some legacy DDS files) + + DDS_FLAGS_NO_LEGACY_EXPANSION = 0x2, + // Do not implicitly convert legacy formats that result in larger pixel sizes (24 bpp, 3:3:2, A8L8, A4L4, P8, A8P8) + + DDS_FLAGS_NO_R10B10G10A2_FIXUP = 0x4, + // Do not use work-around for long-standing D3DX DDS file format issue which reversed the 10:10:10:2 color order masks + + DDS_FLAGS_FORCE_RGB = 0x8, + // Convert DXGI 1.1 BGR formats to DXGI_FORMAT_R8G8B8A8_UNORM to avoid use of optional WDDM 1.1 formats + + DDS_FLAGS_NO_16BPP = 0x10, + // Conversions avoid use of 565, 5551, and 4444 formats and instead expand to 8888 to avoid use of optional WDDM 1.2 formats + + DDS_FLAGS_FORCE_DX10_EXT = 0x10000, + // Always use the 'DX10' header extension for DDS writer (i.e. don't try to write DX9 compatible DDS files) + }; + + enum WIC_FLAGS + { + WIC_FLAGS_NONE = 0x0, + + WIC_FLAGS_FORCE_RGB = 0x1, + // Loads DXGI 1.1 BGR formats as DXGI_FORMAT_R8G8B8A8_UNORM to avoid use of optional WDDM 1.1 formats + + WIC_FLAGS_NO_X2_BIAS = 0x2, + // Loads DXGI 1.1 X2 10:10:10:2 format as DXGI_FORMAT_R10G10B10A2_UNORM + + WIC_FLAGS_NO_16BPP = 0x4, + // Loads 565, 5551, and 4444 formats as 8888 to avoid use of optional WDDM 1.2 formats + + WIC_FLAGS_ALLOW_MONO = 0x8, + // Loads 1-bit monochrome (black & white) as R1_UNORM rather than 8-bit greyscale + + WIC_FLAGS_ALL_FRAMES = 0x10, + // Loads all images in a multi-frame file, converting/resizing to match the first frame as needed, defaults to 0th frame otherwise + + WIC_FLAGS_DITHER = 0x10000, + // Use ordered 4x4 dithering for any required conversions + + WIC_FLAGS_DITHER_DIFFUSION = 0x20000, + // Use error-diffusion dithering for any required conversions + + WIC_FLAGS_FILTER_POINT = 0x100000, + WIC_FLAGS_FILTER_LINEAR = 0x200000, + WIC_FLAGS_FILTER_CUBIC = 0x300000, + WIC_FLAGS_FILTER_FANT = 0x400000, // Combination of Linear and Box filter + // Filtering mode to use for any required image resizing (only needed when loading arrays of differently sized images; defaults to Fant) + }; + + HRESULT GetMetadataFromDDSMemory( _In_bytecount_(size) LPCVOID pSource, _In_ size_t size, _In_ DWORD flags, + _Out_ TexMetadata& metadata ); + HRESULT GetMetadataFromDDSFile( _In_z_ LPCWSTR szFile, DWORD flags, + _Out_ TexMetadata& metadata ); + + HRESULT GetMetadataFromTGAMemory( _In_bytecount_(size) LPCVOID pSource, _In_ size_t size, + _Out_ TexMetadata& metadata ); + HRESULT GetMetadataFromTGAFile( _In_z_ LPCWSTR szFile, + _Out_ TexMetadata& metadata ); + + HRESULT GetMetadataFromWICMemory( _In_bytecount_(size) LPCVOID pSource, _In_ size_t size, _In_ DWORD flags, + _Out_ TexMetadata& metadata ); + HRESULT GetMetadataFromWICFile( _In_z_ LPCWSTR szFile, _In_ DWORD flags, + _Out_ TexMetadata& metadata ); + + //--------------------------------------------------------------------------------- + // Bitmap image container + struct 
Image + { + size_t width; + size_t height; + DXGI_FORMAT format; + size_t rowPitch; + size_t slicePitch; + uint8_t* pixels; + }; + + class ScratchImage + { + public: + ScratchImage() : _nimages(0), _size(0), _image(0), _memory(0) {} + ~ScratchImage() { Release(); } + + HRESULT Initialize( _In_ const TexMetadata& mdata ); + + HRESULT Initialize1D( _In_ DXGI_FORMAT fmt, _In_ size_t length, _In_ size_t arraySize, _In_ size_t mipLevels ); + HRESULT Initialize2D( _In_ DXGI_FORMAT fmt, _In_ size_t width, _In_ size_t height, _In_ size_t arraySize, _In_ size_t mipLevels ); + HRESULT Initialize3D( _In_ DXGI_FORMAT fmt, _In_ size_t width, _In_ size_t height, _In_ size_t depth, _In_ size_t mipLevels ); + HRESULT InitializeCube( _In_ DXGI_FORMAT fmt, _In_ size_t width, _In_ size_t height, _In_ size_t nCubes, _In_ size_t mipLevels ); + + HRESULT InitializeFromImage( _In_ const Image& srcImage, _In_ bool allow1D = false ); + HRESULT InitializeArrayFromImages( _In_count_(nImages) const Image* images, _In_ size_t nImages, _In_ bool allow1D = false ); + HRESULT InitializeCubeFromImages( _In_count_(nImages) const Image* images, _In_ size_t nImages ); + HRESULT Initialize3DFromImages( _In_count_(depth) const Image* images, _In_ size_t depth ); + + void Release(); + + bool OverrideFormat( _In_ DXGI_FORMAT f ); + + const TexMetadata& GetMetadata() const { return _metadata; } + const Image* GetImage(_In_ size_t mip, _In_ size_t item, _In_ size_t slice) const; + + const Image* GetImages() const { return _image; } + size_t GetImageCount() const { return _nimages; } + + uint8_t* GetPixels() const { return _memory; } + size_t GetPixelsSize() const { return _size; } + + private: + size_t _nimages; + size_t _size; + TexMetadata _metadata; + Image* _image; + uint8_t* _memory; + + // Hide copy constructor and assignment operator + ScratchImage( const ScratchImage& ); + ScratchImage& operator=( const ScratchImage& ); + }; + + //--------------------------------------------------------------------------------- + // Memory blob (allocated buffer pointer is always 16-byte aligned) + class Blob + { + public: + Blob() : _buffer(0), _size(0) {} + ~Blob() { Release(); } + + HRESULT Initialize( _In_ size_t size ); + + void Release(); + + void *GetBufferPointer() const { return _buffer; } + size_t GetBufferSize() const { return _size; } + + private: + void* _buffer; + size_t _size; + + // Hide copy constructor and assignment operator + Blob( const Blob& ); + Blob& operator=( const Blob& ); + }; + + //--------------------------------------------------------------------------------- + // Image I/O + + // DDS operations + HRESULT LoadFromDDSMemory( _In_bytecount_(size) LPCVOID pSource, _In_ size_t size, _In_ DWORD flags, + _Out_opt_ TexMetadata* metadata, _Out_ ScratchImage& image ); + HRESULT LoadFromDDSFile( _In_z_ LPCWSTR szFile, _In_ DWORD flags, + _Out_opt_ TexMetadata* metadata, _Out_ ScratchImage& image ); + + HRESULT SaveToDDSMemory( _In_ const Image& image, _In_ DWORD flags, + _Out_ Blob& blob ); + HRESULT SaveToDDSMemory( _In_count_(nimages) const Image* images, _In_ size_t nimages, _In_ const TexMetadata& metadata, _In_ DWORD flags, + _Out_ Blob& blob ); + + HRESULT SaveToDDSFile( _In_ const Image& image, _In_ DWORD flags, _In_z_ LPCWSTR szFile ); + HRESULT SaveToDDSFile( _In_count_(nimages) const Image* images, _In_ size_t nimages, _In_ const TexMetadata& metadata, _In_ DWORD flags, _In_z_ LPCWSTR szFile ); + + // TGA operations + HRESULT LoadFromTGAMemory( _In_bytecount_(size) LPCVOID pSource, _In_ size_t size, + 
_Out_opt_ TexMetadata* metadata, _Out_ ScratchImage& image ); + HRESULT LoadFromTGAFile( _In_z_ LPCWSTR szFile, + _Out_opt_ TexMetadata* metadata, _Out_ ScratchImage& image ); + + HRESULT SaveToTGAMemory( _In_ const Image& image, _Out_ Blob& blob ); + HRESULT SaveToTGAFile( _In_ const Image& image, _In_z_ LPCWSTR szFile ); + + // WIC operations + HRESULT LoadFromWICMemory( _In_bytecount_(size) LPCVOID pSource, _In_ size_t size, _In_ DWORD flags, + _Out_opt_ TexMetadata* metadata, _Out_ ScratchImage& image ); + HRESULT LoadFromWICFile( _In_z_ LPCWSTR szFile, _In_ DWORD flags, + _Out_opt_ TexMetadata* metadata, _Out_ ScratchImage& image ); + + HRESULT SaveToWICMemory( _In_ const Image& image, _In_ DWORD flags, _In_ REFGUID guidContainerFormat, + _Out_ Blob& blob, _In_opt_ const GUID* targetFormat = nullptr ); + HRESULT SaveToWICMemory( _In_count_(nimages) const Image* images, _In_ size_t nimages, _In_ DWORD flags, _In_ REFGUID guidContainerFormat, + _Out_ Blob& blob, _In_opt_ const GUID* targetFormat = nullptr ); + + HRESULT SaveToWICFile( _In_ const Image& image, _In_ DWORD flags, _In_ REFGUID guidContainerFormat, + _In_z_ LPCWSTR szFile, _In_opt_ const GUID* targetFormat = nullptr ); + HRESULT SaveToWICFile( _In_count_(nimages) const Image* images, _In_ size_t nimages, _In_ DWORD flags, _In_ REFGUID guidContainerFormat, + _In_z_ LPCWSTR szFile, _In_opt_ const GUID* targetFormat = nullptr ); + + enum WICCodecs + { + WIC_CODEC_BMP =1, // Windows Bitmap (.bmp) + WIC_CODEC_JPEG, // Joint Photographic Experts Group (.jpg, .jpeg) + WIC_CODEC_PNG, // Portable Network Graphics (.png) + WIC_CODEC_TIFF, // Tagged Image File Format (.tif, .tiff) + WIC_CODEC_GIF, // Graphics Interchange Format (.gif) + WIC_CODEC_WMP, // Windows Media Photo / HD Photo / JPEG XR (.hdp, .jxr, .wdp) + WIC_CODEC_ICO, // Windows Icon (.ico) + }; + + REFGUID GetWICCodec( _In_ WICCodecs codec ); + + //--------------------------------------------------------------------------------- + // Texture conversion, resizing, mipmap generation, and block compression + + enum TEX_FR_FLAGS + { + TEX_FR_ROTATE0 = 0x0, + TEX_FR_ROTATE90 = 0x1, + TEX_FR_ROTATE180 = 0x2, + TEX_FR_ROTATE270 = 0x3, + TEX_FR_FLIP_HORIZONTAL = 0x08, + TEX_FR_FLIP_VERTICAL = 0x10, + }; + + HRESULT FlipRotate( _In_ const Image& srcImage, _In_ DWORD flags, _Out_ ScratchImage& image ); + HRESULT FlipRotate( _In_count_(nimages) const Image* srcImages, _In_ size_t nimages, _In_ const TexMetadata& metadata, + _In_ DWORD flags, _Out_ ScratchImage& result ); + // Flip and/or rotate image + + enum TEX_FILTER_FLAGS + { + TEX_FILTER_DEFAULT = 0, + + // Clamp filtering only + + TEX_FILTER_SEPARATE_ALPHA = 0x100, + // Resize color and alpha channel independently + + TEX_FILTER_DITHER = 0x10000, + // Use ordered 4x4 dithering for any required conversions + TEX_FILTER_DITHER_DIFFUSION = 0x20000, + // Use error-diffusion dithering for any required conversions + + TEX_FILTER_POINT = 0x100000, + TEX_FILTER_LINEAR = 0x200000, + TEX_FILTER_CUBIC = 0x300000, + TEX_FILTER_FANT = 0x400000, // Equiv to Box filtering for mipmap generation + // Filtering mode to use for any required image resizing + + TEX_FILTER_SRGB_IN = 0x1000000, + TEX_FILTER_SRGB_OUT = 0x2000000, + TEX_FILTER_SRGB = 0x3000000, + // sRGB <-> RGB for use in conversion operations + // if the input format type is IsSRGB(), then SRGB_IN is on by default + // if the output format type is IsSRGB(), then SRGB_OUT is on by default + }; + + HRESULT Resize( _In_ const Image& srcImage, _In_ size_t width, _In_ size_t height, _In_ 
DWORD filter, + _Out_ ScratchImage& image ); + HRESULT Resize( _In_count_(nimages) const Image* srcImages, _In_ size_t nimages, _In_ const TexMetadata& metadata, + _In_ size_t width, _In_ size_t height, _In_ DWORD filter, _Out_ ScratchImage& result ); + // Resize the image to width x height. Defaults to Fant filtering. + // Note for a complex resize, the result will always have mipLevels == 1 + + HRESULT Convert( _In_ const Image& srcImage, _In_ DXGI_FORMAT format, _In_ DWORD filter, _In_ float threshold, + _Out_ ScratchImage& image ); + HRESULT Convert( _In_count_(nimages) const Image* srcImages, _In_ size_t nimages, _In_ const TexMetadata& metadata, + _In_ DXGI_FORMAT format, _In_ DWORD filter, _In_ float threshold, _Out_ ScratchImage& result ); + // Convert the image to a new format + + HRESULT GenerateMipMaps( _In_ const Image& baseImage, _In_ DWORD filter, _In_ size_t levels, + _Out_ ScratchImage& mipChain, bool allow1D = false ); + HRESULT GenerateMipMaps( _In_count_(nimages) const Image* srcImages, _In_ size_t nimages, _In_ const TexMetadata& metadata, + _In_ DWORD filter, _In_ size_t levels, _Out_ ScratchImage& mipChain ); + // levels of '0' indicates a full mipchain, otherwise it generates that number of total levels (including the source base image) + // Defaults to Fant filtering which is equivalent to a box filter + + HRESULT GenerateMipMaps3D( _In_count_(depth) const Image* baseImages, _In_ size_t depth, _In_ DWORD filter, _In_ size_t levels, + _Out_ ScratchImage& mipChain ); + HRESULT GenerateMipMaps3D( _In_count_(nimages) const Image* srcImages, _In_ size_t nimages, _In_ const TexMetadata& metadata, + _In_ DWORD filter, _In_ size_t levels, _Out_ ScratchImage& mipChain ); + // levels of '0' indicates a full mipchain, otherwise it generates that number of total levels (including the source base image) + // Defaults to Fant filtering which is equivalent to a box filter + + enum TEX_COMPRESS_FLAGS + { + TEX_COMPRESS_DEFAULT = 0, + + TEX_COMPRESS_RGB_DITHER = 0x10000, + // Enables dithering RGB colors for BC1-3 compression + + TEX_COMPRESS_A_DITHER = 0x20000, + // Enables dithering alpha for BC1-3 compression + + TEX_COMPRESS_DITHER = 0x30000, + // Enables both RGB and alpha dithering for BC1-3 compression + + TEX_COMPRESS_UNIFORM = 0x40000, + // Uniform color weighting for BC1-3 compression; by default uses perceptual weighting + + TEX_COMPRESS_PARALLEL = 0x10000000, + // Compress is free to use multithreading to improve performance (by default it does not use multithreading) + }; + + HRESULT Compress( _In_ const Image& srcImage, _In_ DXGI_FORMAT format, _In_ DWORD compress, _In_ float alphaRef, + _Out_ ScratchImage& cImage ); + HRESULT Compress( _In_count_(nimages) const Image* srcImages, _In_ size_t nimages, _In_ const TexMetadata& metadata, + _In_ DXGI_FORMAT format, _In_ DWORD compress, _In_ float alphaRef, _Out_ ScratchImage& cImages ); + // Note that alphaRef is only used by BC1.
0.5f is a typical value to use + + HRESULT Decompress( _In_ const Image& cImage, _In_ DXGI_FORMAT format, _Out_ ScratchImage& image ); + HRESULT Decompress( _In_count_(nimages) const Image* cImages, _In_ size_t nimages, _In_ const TexMetadata& metadata, + _In_ DXGI_FORMAT format, _Out_ ScratchImage& images ); + + //--------------------------------------------------------------------------------- + // Normal map operations + + enum CNMAP_FLAGS + { + CNMAP_DEFAULT = 0, + + CNMAP_CHANNEL_RED = 0x1, + CNMAP_CHANNEL_GREEN = 0x2, + CNMAP_CHANNEL_BLUE = 0x3, + CNMAP_CHANNEL_ALPHA = 0x4, + CNMAP_CHANNEL_LUMINANCE = 0x5, + // Channel selection when evaluating color value for height + // Luminance is a combination of red, green, and blue + + CNMAP_MIRROR_U = 0x1000, + CNMAP_MIRROR_V = 0x2000, + CNMAP_MIRROR = 0x3000, + // Use mirror semantics for scanline references (defaults to wrap) + + CNMAP_INVERT_SIGN = 0x4000, + // Inverts normal sign + + CNMAP_COMPUTE_OCCLUSION = 0x8000, + // Computes a crude occlusion term stored in the alpha channel + }; + + HRESULT ComputeNormalMap( _In_ const Image& srcImage, _In_ DWORD flags, _In_ float amplitude, + _In_ DXGI_FORMAT format, _Out_ ScratchImage& normalMap ); + HRESULT ComputeNormalMap( _In_count_(nimages) const Image* srcImages, _In_ size_t nimages, _In_ const TexMetadata& metadata, + _In_ DWORD flags, _In_ float amplitude, _In_ DXGI_FORMAT format, _Out_ ScratchImage& normalMaps ); + + //--------------------------------------------------------------------------------- + // Misc image operations + struct Rect + { + size_t x; + size_t y; + size_t w; + size_t h; + + Rect() {} + Rect( size_t _x, size_t _y, size_t _w, size_t _h ) : x(_x), y(_y), w(_w), h(_h) {} + }; + + HRESULT CopyRectangle( _In_ const Image& srcImage, _In_ const Rect& srcRect, _In_ const Image& dstImage, + _In_ DWORD filter, _In_ size_t xOffset, _In_ size_t yOffset ); + + HRESULT ComputeMSE( _In_ const Image& image1, _In_ const Image& image2, _Out_ float& mse, _Out_opt_cap_c_(4) float* mseV ); + + //--------------------------------------------------------------------------------- + // Direct3D 11 functions + bool IsSupportedTexture( _In_ ID3D11Device* pDevice, _In_ const TexMetadata& metadata ); + + HRESULT CreateTexture( _In_ ID3D11Device* pDevice, _In_count_(nimages) const Image* srcImages, _In_ size_t nimages, _In_ const TexMetadata& metadata, + _Deref_out_ ID3D11Resource** ppResource ); + + HRESULT CreateShaderResourceView( _In_ ID3D11Device* pDevice, _In_count_(nimages) const Image* srcImages, _In_ size_t nimages, _In_ const TexMetadata& metadata, + _Deref_out_ ID3D11ShaderResourceView** ppSRV ); + + HRESULT CaptureTexture( _In_ ID3D11Device* pDevice, _In_ ID3D11DeviceContext* pContext, _In_ ID3D11Resource* pSource, _Out_ ScratchImage& result ); + +#include "DirectXTex.inl" + +}; // namespace diff --git a/thirdparty/directxtex/DirectXTex/DirectXTex.inl b/thirdparty/directxtex/DirectXTex/DirectXTex.inl new file mode 100644 index 0000000..909cd40 --- /dev/null +++ b/thirdparty/directxtex/DirectXTex/DirectXTex.inl @@ -0,0 +1,223 @@ +//------------------------------------------------------------------------------------- +// DirectXTex.inl +// +// DirectX Texture Library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved.
+// +// http://go.microsoft.com/fwlink/?LinkId=248926 +//------------------------------------------------------------------------------------- + +#if defined(_MSC_VER) && (_MSC_VER > 1000) +#pragma once +#endif + +//===================================================================================== +// DXGI Format Utilities +//===================================================================================== + +inline bool IsValid( DXGI_FORMAT fmt ) +{ +#ifdef DXGI_1_2_FORMATS + return ( static_cast(fmt) >= 1 && static_cast(fmt) <= 115 ); +#else + return ( static_cast(fmt) >= 1 && static_cast(fmt) <= 99 ); +#endif +} + +inline bool IsCompressed( DXGI_FORMAT fmt ) +{ + switch ( fmt ) + { + case DXGI_FORMAT_BC1_TYPELESS: + case DXGI_FORMAT_BC1_UNORM: + case DXGI_FORMAT_BC1_UNORM_SRGB: + case DXGI_FORMAT_BC2_TYPELESS: + case DXGI_FORMAT_BC2_UNORM: + case DXGI_FORMAT_BC2_UNORM_SRGB: + case DXGI_FORMAT_BC3_TYPELESS: + case DXGI_FORMAT_BC3_UNORM: + case DXGI_FORMAT_BC3_UNORM_SRGB: + case DXGI_FORMAT_BC4_TYPELESS: + case DXGI_FORMAT_BC4_UNORM: + case DXGI_FORMAT_BC4_SNORM: + case DXGI_FORMAT_BC5_TYPELESS: + case DXGI_FORMAT_BC5_UNORM: + case DXGI_FORMAT_BC5_SNORM: + case DXGI_FORMAT_BC6H_TYPELESS: + case DXGI_FORMAT_BC6H_UF16: + case DXGI_FORMAT_BC6H_SF16: + case DXGI_FORMAT_BC7_TYPELESS: + case DXGI_FORMAT_BC7_UNORM: + case DXGI_FORMAT_BC7_UNORM_SRGB: + return true; + + default: + return false; + } +} + +inline bool IsPacked( DXGI_FORMAT fmt ) +{ + return ( (fmt == DXGI_FORMAT_R8G8_B8G8_UNORM) || (fmt == DXGI_FORMAT_G8R8_G8B8_UNORM) ); +} + +inline bool IsVideo( DXGI_FORMAT fmt ) +{ +#ifdef DXGI_1_2_FORMATS + switch ( fmt ) + { + case DXGI_FORMAT_AYUV: + case DXGI_FORMAT_Y410: + case DXGI_FORMAT_Y416: + case DXGI_FORMAT_NV12: + case DXGI_FORMAT_P010: + case DXGI_FORMAT_P016: + case DXGI_FORMAT_YUY2: + case DXGI_FORMAT_Y210: + case DXGI_FORMAT_Y216: + case DXGI_FORMAT_NV11: + // These video formats can be used with the 3D pipeline through special view mappings + return true; + + case DXGI_FORMAT_420_OPAQUE: + case DXGI_FORMAT_AI44: + case DXGI_FORMAT_IA44: + case DXGI_FORMAT_P8: + case DXGI_FORMAT_A8P8: + // These are limited use video formats not usable in any way by the 3D pipeline + return true; + + default: + return false; + } +#else // !DXGI_1_2_FORMATS + UNREFERENCED_PARAMETER(fmt); + return false; +#endif +} + +inline bool IsSRGB( DXGI_FORMAT fmt ) +{ + switch( fmt ) + { + case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB: + case DXGI_FORMAT_BC1_UNORM_SRGB: + case DXGI_FORMAT_BC2_UNORM_SRGB: + case DXGI_FORMAT_BC3_UNORM_SRGB: + case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB: + case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB: + case DXGI_FORMAT_BC7_UNORM_SRGB: + return true; + + default: + return false; + } +} + +inline bool IsTypeless( DXGI_FORMAT fmt ) +{ + switch( fmt ) + { + case DXGI_FORMAT_R32G32B32A32_TYPELESS: + case DXGI_FORMAT_R32G32B32_TYPELESS: + case DXGI_FORMAT_R16G16B16A16_TYPELESS: + case DXGI_FORMAT_R32G32_TYPELESS: + case DXGI_FORMAT_R32G8X24_TYPELESS: + case DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS: + case DXGI_FORMAT_X32_TYPELESS_G8X24_UINT: + case DXGI_FORMAT_R10G10B10A2_TYPELESS: + case DXGI_FORMAT_R8G8B8A8_TYPELESS: + case DXGI_FORMAT_R16G16_TYPELESS: + case DXGI_FORMAT_R32_TYPELESS: + case DXGI_FORMAT_R24G8_TYPELESS: + case DXGI_FORMAT_R24_UNORM_X8_TYPELESS: + case DXGI_FORMAT_X24_TYPELESS_G8_UINT: + case DXGI_FORMAT_R8G8_TYPELESS: + case DXGI_FORMAT_R16_TYPELESS: + case DXGI_FORMAT_R8_TYPELESS: + case DXGI_FORMAT_BC1_TYPELESS: + case DXGI_FORMAT_BC2_TYPELESS: + case DXGI_FORMAT_BC3_TYPELESS: + case 
DXGI_FORMAT_BC4_TYPELESS: + case DXGI_FORMAT_BC5_TYPELESS: + case DXGI_FORMAT_B8G8R8A8_TYPELESS: + case DXGI_FORMAT_B8G8R8X8_TYPELESS: + case DXGI_FORMAT_BC6H_TYPELESS: + case DXGI_FORMAT_BC7_TYPELESS: + return true; + + default: + return false; + } +} + +inline size_t ComputeScanlines( _In_ DXGI_FORMAT fmt, _In_ size_t height ) +{ + switch ( fmt ) + { + case DXGI_FORMAT_BC1_TYPELESS: + case DXGI_FORMAT_BC1_UNORM: + case DXGI_FORMAT_BC1_UNORM_SRGB: + case DXGI_FORMAT_BC2_TYPELESS: + case DXGI_FORMAT_BC2_UNORM: + case DXGI_FORMAT_BC2_UNORM_SRGB: + case DXGI_FORMAT_BC3_TYPELESS: + case DXGI_FORMAT_BC3_UNORM: + case DXGI_FORMAT_BC3_UNORM_SRGB: + case DXGI_FORMAT_BC4_TYPELESS: + case DXGI_FORMAT_BC4_UNORM: + case DXGI_FORMAT_BC4_SNORM: + case DXGI_FORMAT_BC5_TYPELESS: + case DXGI_FORMAT_BC5_UNORM: + case DXGI_FORMAT_BC5_SNORM: + case DXGI_FORMAT_BC6H_TYPELESS: + case DXGI_FORMAT_BC6H_UF16: + case DXGI_FORMAT_BC6H_SF16: + case DXGI_FORMAT_BC7_TYPELESS: + case DXGI_FORMAT_BC7_UNORM: + case DXGI_FORMAT_BC7_UNORM_SRGB: + return std::max( 1, (height + 3) / 4 ); + + default: + return height; + } +} + +//===================================================================================== +// Image I/O +//===================================================================================== +inline HRESULT SaveToDDSMemory( const Image& image, DWORD flags, Blob& blob ) +{ + TexMetadata mdata; + memset( &mdata, 0, sizeof(mdata) ); + mdata.width = image.width; + mdata.height = image.height; + mdata.depth = 1; + mdata.arraySize = 1; + mdata.mipLevels = 1; + mdata.format = image.format; + mdata.dimension = TEX_DIMENSION_TEXTURE2D; + + return SaveToDDSMemory( &image, 1, mdata, flags, blob ); +} + +inline HRESULT SaveToDDSFile( const Image& image, DWORD flags, LPCWSTR szFile ) +{ + TexMetadata mdata; + memset( &mdata, 0, sizeof(mdata) ); + mdata.width = image.width; + mdata.height = image.height; + mdata.depth = 1; + mdata.arraySize = 1; + mdata.mipLevels = 1; + mdata.format = image.format; + mdata.dimension = TEX_DIMENSION_TEXTURE2D; + + return SaveToDDSFile( &image, 1, mdata, flags, szFile ); +} diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexCompress.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexCompress.cpp new file mode 100644 index 0000000..b529086 --- /dev/null +++ b/thirdparty/directxtex/DirectXTex/DirectXTexCompress.cpp @@ -0,0 +1,697 @@ +//------------------------------------------------------------------------------------- +// DirectXTexCompress.cpp +// +// DirectX Texture Library - Texture compression +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// +// http://go.microsoft.com/fwlink/?LinkId=248926 +//------------------------------------------------------------------------------------- + +#include "directxtexp.h" + +#ifdef _OPENMP +#include +#pragma warning(disable : 4616 6001 6993) +#endif + +#include "bc.h" + +namespace DirectX +{ + +inline static DWORD _GetBCFlags( _In_ DWORD compress ) +{ + static_assert( TEX_COMPRESS_RGB_DITHER == BC_FLAGS_DITHER_RGB, "TEX_COMPRESS_* flags should match BC_FLAGS_*" ); + static_assert( TEX_COMPRESS_A_DITHER == BC_FLAGS_DITHER_A, "TEX_COMPRESS_* flags should match BC_FLAGS_*" ); + static_assert( TEX_COMPRESS_DITHER == (BC_FLAGS_DITHER_RGB | BC_FLAGS_DITHER_A), "TEX_COMPRESS_* flags should match BC_FLAGS_*" ); + static_assert( TEX_COMPRESS_UNIFORM == BC_FLAGS_UNIFORM, "TEX_COMPRESS_* flags should match BC_FLAGS_*" ); + return ( compress & (BC_FLAGS_DITHER_RGB|BC_FLAGS_DITHER_A|BC_FLAGS_UNIFORM) ); +} + + +//------------------------------------------------------------------------------------- +static HRESULT _CompressBC( _In_ const Image& image, _In_ const Image& result, _In_ DWORD bcflags, + _In_ float alphaRef, _In_ bool degenerate ) +{ + if ( !image.pixels || !result.pixels ) + return E_POINTER; + + assert( image.width == result.width ); + assert( image.height == result.height ); + + const DXGI_FORMAT format = image.format; + size_t sbpp = BitsPerPixel( format ); + if ( !sbpp ) + return E_FAIL; + + if ( sbpp < 8 ) + { + // We don't support compressing from monochrome (DXGI_FORMAT_R1_UNORM) + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + } + + // Round to bytes + sbpp = ( sbpp + 7 ) / 8; + + uint8_t *pDest = result.pixels; + + // Determine BC format encoder + BC_ENCODE pfEncode; + size_t blocksize; + switch(result.format) + { + case DXGI_FORMAT_BC1_UNORM: + case DXGI_FORMAT_BC1_UNORM_SRGB: pfEncode = nullptr; blocksize = 8; break; + case DXGI_FORMAT_BC2_UNORM: + case DXGI_FORMAT_BC2_UNORM_SRGB: pfEncode = D3DXEncodeBC2; blocksize = 16; break; + case DXGI_FORMAT_BC3_UNORM: + case DXGI_FORMAT_BC3_UNORM_SRGB: pfEncode = D3DXEncodeBC3; blocksize = 16; break; + case DXGI_FORMAT_BC4_UNORM: pfEncode = D3DXEncodeBC4U; blocksize = 8; break; + case DXGI_FORMAT_BC4_SNORM: pfEncode = D3DXEncodeBC4S; blocksize = 8; break; + case DXGI_FORMAT_BC5_UNORM: pfEncode = D3DXEncodeBC5U; blocksize = 16; break; + case DXGI_FORMAT_BC5_SNORM: pfEncode = D3DXEncodeBC5S; blocksize = 16; break; + case DXGI_FORMAT_BC6H_UF16: pfEncode = D3DXEncodeBC6HU; blocksize = 16; break; + case DXGI_FORMAT_BC6H_SF16: pfEncode = D3DXEncodeBC6HS; blocksize = 16; break; + case DXGI_FORMAT_BC7_UNORM: + case DXGI_FORMAT_BC7_UNORM_SRGB: pfEncode = D3DXEncodeBC7; blocksize = 16; break; + default: + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + } + + XMVECTOR temp[16]; + const uint8_t *pSrc = image.pixels; + const size_t rowPitch = image.rowPitch; + for( size_t h=0; h < image.height; h += 4 ) + { + const uint8_t *sptr = pSrc; + uint8_t* dptr = pDest; + for( size_t count = 0; count < rowPitch; count += sbpp*4 ) + { + if ( !_LoadScanline( &temp[0], 4, sptr, rowPitch, format ) ) + return E_FAIL; + + if ( image.height > 1 ) + { + if ( !_LoadScanline( &temp[4], 4, sptr + rowPitch, rowPitch, format ) ) + return E_FAIL; + + if ( image.height > 2 ) + { + if ( !_LoadScanline( &temp[8], 4, sptr + rowPitch*2, rowPitch, format ) ) + return E_FAIL; + + if ( !_LoadScanline( &temp[12], 4, sptr + rowPitch*3, rowPitch, format ) ) + return E_FAIL; + } + } + + if ( degenerate ) + { + assert( image.width < 4 || image.height < 4 ); + const size_t uSrc[] 
= { 0, 0, 0, 1 }; + + if ( image.width < 4 ) + { + for( size_t t=0; t < image.height && t < 4; ++t ) + { + for( size_t s = image.width; s < 4; ++s ) + { + temp[ t*4 + s ] = temp[ t*4 + uSrc[s] ]; + } + } + } + + if ( image.height < 4 ) + { + for( size_t t=image.height; t < 4; ++t ) + { + for( size_t s =0; s < 4; ++s ) + { + temp[ t*4 + s ] = temp[ uSrc[t]*4 + s ]; + } + } + } + } + + _ConvertScanline( temp, 16, result.format, format, 0 ); + + if ( pfEncode ) + pfEncode( dptr, temp, bcflags ); + else + D3DXEncodeBC1( dptr, temp, alphaRef, bcflags ); + + sptr += sbpp*4; + dptr += blocksize; + } + + pSrc += rowPitch*4; + pDest += result.rowPitch; + } + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +#ifdef _OPENMP +static HRESULT _CompressBC_Parallel( _In_ const Image& image, _In_ const Image& result, _In_ DWORD bcflags, + _In_ float alphaRef ) +{ + if ( !image.pixels || !result.pixels ) + return E_POINTER; + + // Parallel version doesn't support degenerate case + assert( ((image.width % 4) == 0) && ((image.height % 4) == 0 ) ); + + assert( image.width == result.width ); + assert( image.height == result.height ); + + const DXGI_FORMAT format = image.format; + size_t sbpp = BitsPerPixel( format ); + if ( !sbpp ) + return E_FAIL; + + if ( sbpp < 8 ) + { + // We don't support compressing from monochrome (DXGI_FORMAT_R1_UNORM) + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + } + + // Round to bytes + sbpp = ( sbpp + 7 ) / 8; + + // Determine BC format encoder + BC_ENCODE pfEncode; + size_t blocksize; + switch(result.format) + { + case DXGI_FORMAT_BC1_UNORM: + case DXGI_FORMAT_BC1_UNORM_SRGB: pfEncode = nullptr; blocksize = 8; break; + case DXGI_FORMAT_BC2_UNORM: + case DXGI_FORMAT_BC2_UNORM_SRGB: pfEncode = D3DXEncodeBC2; blocksize = 16; break; + case DXGI_FORMAT_BC3_UNORM: + case DXGI_FORMAT_BC3_UNORM_SRGB: pfEncode = D3DXEncodeBC3; blocksize = 16; break; + case DXGI_FORMAT_BC4_UNORM: pfEncode = D3DXEncodeBC4U; blocksize = 8; break; + case DXGI_FORMAT_BC4_SNORM: pfEncode = D3DXEncodeBC4S; blocksize = 8; break; + case DXGI_FORMAT_BC5_UNORM: pfEncode = D3DXEncodeBC5U; blocksize = 16; break; + case DXGI_FORMAT_BC5_SNORM: pfEncode = D3DXEncodeBC5S; blocksize = 16; break; + case DXGI_FORMAT_BC6H_UF16: pfEncode = D3DXEncodeBC6HU; blocksize = 16; break; + case DXGI_FORMAT_BC6H_SF16: pfEncode = D3DXEncodeBC6HS; blocksize = 16; break; + case DXGI_FORMAT_BC7_UNORM: + case DXGI_FORMAT_BC7_UNORM_SRGB: pfEncode = D3DXEncodeBC7; blocksize = 16; break; + default: + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + } + + // Refactored version of loop to support parallel independence + const size_t nBlocks = std::max(1, image.width / 4) * std::max(1, image.height / 4); + + bool fail = false; + +#pragma omp parallel for + for( int nb=0; nb < static_cast( nBlocks ); ++nb ) + { + const size_t nbWidth = std::max(1, image.width / 4); + + const size_t y = nb / nbWidth; + const size_t x = nb - (y*nbWidth); + + assert( x < image.width && y < image.height ); + + size_t rowPitch = image.rowPitch; + const uint8_t *pSrc = image.pixels + (y*4*rowPitch) + (x*4*sbpp); + + uint8_t *pDest = result.pixels + (nb*blocksize); + + XMVECTOR temp[16]; + if ( !_LoadScanline( &temp[0], 4, pSrc, rowPitch, format ) ) + fail = true; + + if ( !_LoadScanline( &temp[4], 4, pSrc + rowPitch, rowPitch, format ) ) + fail = true; + + if ( !_LoadScanline( &temp[8], 4, pSrc + rowPitch*2, rowPitch, format ) ) + fail = true; + + if ( !_LoadScanline( &temp[12], 4, pSrc + rowPitch*3,
rowPitch, format ) ) + fail = true; + + _ConvertScanline( temp, 16, result.format, format, 0 ); + + if ( pfEncode ) + pfEncode( pDest, temp, bcflags ); + else + D3DXEncodeBC1( pDest, temp, alphaRef, bcflags ); + } + + return (fail) ? E_FAIL : S_OK; +} + +#endif // _OPENMP + + +//------------------------------------------------------------------------------------- +static DXGI_FORMAT _DefaultDecompress( _In_ DXGI_FORMAT format ) +{ + switch( format ) + { + case DXGI_FORMAT_BC1_TYPELESS: + case DXGI_FORMAT_BC1_UNORM: + case DXGI_FORMAT_BC2_TYPELESS: + case DXGI_FORMAT_BC2_UNORM: + case DXGI_FORMAT_BC3_TYPELESS: + case DXGI_FORMAT_BC3_UNORM: + case DXGI_FORMAT_BC7_TYPELESS: + case DXGI_FORMAT_BC7_UNORM: + return DXGI_FORMAT_R8G8B8A8_UNORM; + + case DXGI_FORMAT_BC1_UNORM_SRGB: + case DXGI_FORMAT_BC2_UNORM_SRGB: + case DXGI_FORMAT_BC3_UNORM_SRGB: + case DXGI_FORMAT_BC7_UNORM_SRGB: + return DXGI_FORMAT_R8G8B8A8_UNORM_SRGB; + + case DXGI_FORMAT_BC4_TYPELESS: + case DXGI_FORMAT_BC4_UNORM: + return DXGI_FORMAT_R8_UNORM; + + case DXGI_FORMAT_BC4_SNORM: + return DXGI_FORMAT_R8_SNORM; + + case DXGI_FORMAT_BC5_TYPELESS: + case DXGI_FORMAT_BC5_UNORM: + return DXGI_FORMAT_R8G8_UNORM; + + case DXGI_FORMAT_BC5_SNORM: + return DXGI_FORMAT_R8G8_SNORM; + + case DXGI_FORMAT_BC6H_TYPELESS: + case DXGI_FORMAT_BC6H_UF16: + case DXGI_FORMAT_BC6H_SF16: + // We could use DXGI_FORMAT_R32G32B32_FLOAT here since BC6H is always Alpha 1.0, + // but this format is more supported by viewers + return DXGI_FORMAT_R32G32B32A32_FLOAT; + + default: + return DXGI_FORMAT_UNKNOWN; + } +} + + +//------------------------------------------------------------------------------------- +static HRESULT _DecompressBC( _In_ const Image& cImage, _In_ const Image& result ) +{ + if ( !cImage.pixels || !result.pixels ) + return E_POINTER; + + assert( cImage.width == result.width ); + assert( cImage.height == result.height ); + + // Image must be a multiple of 4 (degenerate cases of 1x1, 1x2, 2x1, and 2x2 are allowed) + size_t width = cImage.width; + if ( (width % 4) != 0 ) + { + if ( width != 1 && width != 2 ) + return E_INVALIDARG; + } + + size_t height = cImage.height; + if ( (height % 4) != 0 ) + { + if ( height != 1 && height != 2 ) + return E_INVALIDARG; + } + + const DXGI_FORMAT format = result.format; + size_t dbpp = BitsPerPixel( format ); + if ( !dbpp ) + return E_FAIL; + + if ( dbpp < 8 ) + { + // We don't support decompressing to monochrome (DXGI_FORMAT_R1_UNORM) + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + } + + // Round to bytes + dbpp = ( dbpp + 7 ) / 8; + + uint8_t *pDest = result.pixels; + if ( !pDest ) + return E_POINTER; + + // Promote "typeless" BC formats + DXGI_FORMAT cformat; + switch( cImage.format ) + { + case DXGI_FORMAT_BC1_TYPELESS: cformat = DXGI_FORMAT_BC1_UNORM; break; + case DXGI_FORMAT_BC2_TYPELESS: cformat = DXGI_FORMAT_BC2_UNORM; break; + case DXGI_FORMAT_BC3_TYPELESS: cformat = DXGI_FORMAT_BC3_UNORM; break; + case DXGI_FORMAT_BC4_TYPELESS: cformat = DXGI_FORMAT_BC4_UNORM; break; + case DXGI_FORMAT_BC5_TYPELESS: cformat = DXGI_FORMAT_BC5_UNORM; break; + case DXGI_FORMAT_BC6H_TYPELESS: cformat = DXGI_FORMAT_BC6H_UF16; break; + case DXGI_FORMAT_BC7_TYPELESS: cformat = DXGI_FORMAT_BC7_UNORM; break; + default: cformat = cImage.format; break; + } + + // Determine BC format decoder + BC_DECODE pfDecode; + size_t sbpp; + switch(cformat) + { + case DXGI_FORMAT_BC1_UNORM: + case DXGI_FORMAT_BC1_UNORM_SRGB: pfDecode = D3DXDecodeBC1; sbpp = 8; break; + case DXGI_FORMAT_BC2_UNORM: + case 
DXGI_FORMAT_BC2_UNORM_SRGB: pfDecode = D3DXDecodeBC2; sbpp = 16; break; + case DXGI_FORMAT_BC3_UNORM: + case DXGI_FORMAT_BC3_UNORM_SRGB: pfDecode = D3DXDecodeBC3; sbpp = 16; break; + case DXGI_FORMAT_BC4_UNORM: pfDecode = D3DXDecodeBC4U; sbpp = 8; break; + case DXGI_FORMAT_BC4_SNORM: pfDecode = D3DXDecodeBC4S; sbpp = 8; break; + case DXGI_FORMAT_BC5_UNORM: pfDecode = D3DXDecodeBC5U; sbpp = 16; break; + case DXGI_FORMAT_BC5_SNORM: pfDecode = D3DXDecodeBC5S; sbpp = 16; break; + case DXGI_FORMAT_BC6H_UF16: pfDecode = D3DXDecodeBC6HU; sbpp = 16; break; + case DXGI_FORMAT_BC6H_SF16: pfDecode = D3DXDecodeBC6HS; sbpp = 16; break; + case DXGI_FORMAT_BC7_UNORM: + case DXGI_FORMAT_BC7_UNORM_SRGB: pfDecode = D3DXDecodeBC7; sbpp = 16; break; + default: + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + } + + XMVECTOR temp[16]; + const uint8_t *pSrc = cImage.pixels; + const size_t rowPitch = result.rowPitch; + for( size_t h=0; h < cImage.height; h += 4 ) + { + const uint8_t *sptr = pSrc; + uint8_t* dptr = pDest; + for( size_t count = 0; count < cImage.rowPitch; count += sbpp ) + { + pfDecode( temp, sptr ); + _ConvertScanline( temp, 16, format, cformat, 0 ); + + if ( !_StoreScanline( dptr, rowPitch, format, &temp[0], 4 ) ) + return E_FAIL; + + if ( result.height > 1 ) + { + if ( !_StoreScanline( dptr + rowPitch, rowPitch, format, &temp[4], 4 ) ) + return E_FAIL; + + if ( result.height > 2 ) + { + if ( !_StoreScanline( dptr + rowPitch*2, rowPitch, format, &temp[8], 4 ) ) + return E_FAIL; + + if ( !_StoreScanline( dptr + rowPitch*3, rowPitch, format, &temp[12], 4 ) ) + return E_FAIL; + } + } + + sptr += sbpp; + dptr += dbpp*4; + } + + pSrc += cImage.rowPitch; + pDest += rowPitch*4; + } + + return S_OK; +} + + +//===================================================================================== +// Entry-points +//===================================================================================== + +//------------------------------------------------------------------------------------- +// Compression +//------------------------------------------------------------------------------------- +HRESULT Compress( const Image& srcImage, DXGI_FORMAT format, DWORD compress, float alphaRef, ScratchImage& image ) +{ + if ( IsCompressed(srcImage.format) || !IsCompressed(format) || IsTypeless(format) ) + return E_INVALIDARG; + + // Image size must be a multiple of 4 (degenerate cases for mipmaps are allowed) + bool degenerate = false; + + size_t width = srcImage.width; + if ( (width % 4) != 0 ) + { + if ( width != 1 && width != 2 ) + return E_INVALIDARG; + + degenerate = true; + } + + size_t height = srcImage.height; + if ( (height % 4) != 0 ) + { + if ( height != 1 && height != 2 ) + return E_INVALIDARG; + + degenerate = true; + } + + // Create compressed image + HRESULT hr = image.Initialize2D( format, width, height, 1, 1 ); + if ( FAILED(hr) ) + return hr; + + const Image *img = image.GetImage( 0, 0, 0 ); + if ( !img ) + { + image.Release(); + return E_POINTER; + } + + // Compress single image + if ( (compress & TEX_COMPRESS_PARALLEL) && !degenerate ) + { +#ifndef _OPENMP + return E_NOTIMPL; +#else + hr = _CompressBC_Parallel( srcImage, *img, _GetBCFlags( compress ), alphaRef ); +#endif // _OPENMP + } + else + { + hr = _CompressBC( srcImage, *img, _GetBCFlags( compress ), alphaRef, degenerate ); + } + + if ( FAILED(hr) ) + image.Release(); + + return hr; +} + +HRESULT Compress( const Image* srcImages, size_t nimages, const TexMetadata& metadata, + DXGI_FORMAT format, DWORD compress, float alphaRef, 
ScratchImage& cImages ) +{ + if ( !srcImages || !nimages ) + return E_INVALIDARG; + + if ( !IsCompressed(format) || IsTypeless(format) ) + return E_INVALIDARG; + + // Image size must be a multiple of 4 (degenerate cases for mipmaps are allowed) + size_t width = srcImages[0].width; + if ( (width % 4) != 0 ) + { + if ( width != 1 && width != 2 ) + return E_INVALIDARG; + } + + size_t height = srcImages[0].height; + if ( (height % 4) != 0 ) + { + if ( height != 1 && height != 2 ) + return E_INVALIDARG; + } + + cImages.Release(); + + TexMetadata mdata2 = metadata; + mdata2.format = format; + HRESULT hr = cImages.Initialize( mdata2 ); + if ( FAILED(hr) ) + return hr; + + if ( nimages != cImages.GetImageCount() ) + { + cImages.Release(); + return E_FAIL; + } + + const Image* dest = cImages.GetImages(); + if ( !dest ) + { + cImages.Release(); + return E_POINTER; + } + + for( size_t index=0; index < nimages; ++index ) + { + assert( dest[ index ].format == format ); + + const Image& src = srcImages[ index ]; + + height = src.height; + width = src.width; + if ( width != dest[ index ].width || height != dest[ index ].height ) + { + cImages.Release(); + return E_FAIL; + } + + bool degenerate = ((height < 4) || (width < 4)) != 0; + + if ( (compress & TEX_COMPRESS_PARALLEL) && !degenerate) + { +#ifndef _OPENMP + return E_NOTIMPL; +#else + if ( compress & TEX_COMPRESS_PARALLEL ) + { + hr = _CompressBC_Parallel( src, dest[ index ], _GetBCFlags( compress ), alphaRef ); + if ( FAILED(hr) ) + { + cImages.Release(); + return hr; + } + } +#endif // _OPENMP + } + else + { + hr = _CompressBC( src, dest[ index ], _GetBCFlags( compress ), alphaRef, degenerate ); + if ( FAILED(hr) ) + { + cImages.Release(); + return hr; + } + } + } + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Decompression +//------------------------------------------------------------------------------------- +HRESULT Decompress( const Image& cImage, DXGI_FORMAT format, ScratchImage& image ) +{ + if ( IsCompressed(format) || IsTypeless(format) ) + return E_INVALIDARG; + + if ( format == DXGI_FORMAT_UNKNOWN ) + { + // Pick a default decompressed format based on BC input format + format = _DefaultDecompress( cImage.format ); + if ( format == DXGI_FORMAT_UNKNOWN ) + { + // Input is not a compressed format + return E_INVALIDARG; + } + } + else if ( !IsCompressed(cImage.format) || !IsValid(format) ) + return E_INVALIDARG; + + // Create decompressed image + HRESULT hr = image.Initialize2D( format, cImage.width, cImage.height, 1, 1 ); + if ( FAILED(hr) ) + return hr; + + const Image *img = image.GetImage( 0, 0, 0 ); + if ( !img ) + { + image.Release(); + return E_POINTER; + } + + // Decompress single image + hr = _DecompressBC( cImage, *img ); + if ( FAILED(hr) ) + image.Release(); + + return hr; +} + +HRESULT Decompress( const Image* cImages, size_t nimages, const TexMetadata& metadata, + DXGI_FORMAT format, ScratchImage& images ) +{ + if ( !cImages || !nimages ) + return E_INVALIDARG; + + if ( IsCompressed(format) || IsTypeless(format) ) + return E_INVALIDARG; + + if ( format == DXGI_FORMAT_UNKNOWN ) + { + // Pick a default decompressed format based on BC input format + format = _DefaultDecompress( cImages[0].format ); + if ( format == DXGI_FORMAT_UNKNOWN ) + { + // Input is not a compressed format + return E_FAIL; + } + } + else if ( !IsValid(format) ) + return E_INVALIDARG; + + images.Release(); + + TexMetadata mdata2 = metadata; + mdata2.format = format; + HRESULT hr = 
images.Initialize( mdata2 ); + if ( FAILED(hr) ) + return hr; + + if ( nimages != images.GetImageCount() ) + { + images.Release(); + return E_FAIL; + } + + const Image* dest = images.GetImages(); + if ( !dest ) + { + images.Release(); + return E_POINTER; + } + + for( size_t index=0; index < nimages; ++index ) + { + assert( dest[ index ].format == format ); + + const Image& src = cImages[ index ]; + if ( !IsCompressed( src.format ) ) + { + images.Release(); + return E_FAIL; + } + + if ( src.width != dest[ index ].width || src.height != dest[ index ].height ) + { + images.Release(); + return E_FAIL; + } + + hr = _DecompressBC( src, dest[ index ] ); + if ( FAILED(hr) ) + { + images.Release(); + return hr; + } + } + + return S_OK; +} + +}; // namespace diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexConvert.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexConvert.cpp new file mode 100644 index 0000000..f509c9b --- /dev/null +++ b/thirdparty/directxtex/DirectXTex/DirectXTexConvert.cpp @@ -0,0 +1,2421 @@ +//------------------------------------------------------------------------------------- +// DirectXTexConvert.cpp +// +// DirectX Texture Library - Image conversion +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkId=248926 +//------------------------------------------------------------------------------------- + +#include "directxtexp.h" + +#ifdef USE_XNAMATH +#if XNAMATH_VERSION < 205 +#error This file requires XNAMATH v2.05 or later +#endif +#else +using namespace DirectX::PackedVector; +#endif + +namespace DirectX +{ + +//------------------------------------------------------------------------------------- +// Copies an image row with optional clearing of alpha value to 1.0 +// (can be used in place as well) otherwise copies the image row unmodified. 
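+// When TEXP_SCANLINE_SETALPHA is set in 'flags', the alpha bits of each pixel are forced to fully opaque for the formats handled below; any other format falls through to a plain copy.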
+//------------------------------------------------------------------------------------- +void _CopyScanline( LPVOID pDestination, size_t outSize, LPCVOID pSource, size_t inSize, DXGI_FORMAT format, DWORD flags ) +{ + assert( pDestination && outSize > 0 ); + assert( pSource && inSize > 0 ); + assert( IsValid(format) && !IsVideo(format) ); + + if ( flags & TEXP_SCANLINE_SETALPHA ) + { + switch( format ) + { + //----------------------------------------------------------------------------- + case DXGI_FORMAT_R32G32B32A32_TYPELESS: + case DXGI_FORMAT_R32G32B32A32_FLOAT: + case DXGI_FORMAT_R32G32B32A32_UINT: + case DXGI_FORMAT_R32G32B32A32_SINT: + { + uint32_t alpha; + if ( format == DXGI_FORMAT_R32G32B32A32_FLOAT ) + alpha = 0x3f800000; + else if ( format == DXGI_FORMAT_R32G32B32A32_SINT ) + alpha = 0x7fffffff; + else + alpha = 0xffffffff; + + if ( pDestination == pSource ) + { + uint32_t *dPtr = reinterpret_cast(pDestination); + for( size_t count = 0; count < outSize; count += 16 ) + { + dPtr += 3; + *(dPtr++) = alpha; + } + } + else + { + const uint32_t * __restrict sPtr = reinterpret_cast(pSource); + uint32_t * __restrict dPtr = reinterpret_cast(pDestination); + size_t size = std::min( outSize, inSize ); + for( size_t count = 0; count < size; count += 16 ) + { + *(dPtr++) = *(sPtr++); + *(dPtr++) = *(sPtr++); + *(dPtr++) = *(sPtr++); + *(dPtr++) = alpha; + sPtr++; + } + } + } + return; + + //----------------------------------------------------------------------------- + case DXGI_FORMAT_R16G16B16A16_TYPELESS: + case DXGI_FORMAT_R16G16B16A16_FLOAT: + case DXGI_FORMAT_R16G16B16A16_UNORM: + case DXGI_FORMAT_R16G16B16A16_UINT: + case DXGI_FORMAT_R16G16B16A16_SNORM: + case DXGI_FORMAT_R16G16B16A16_SINT: + { + uint16_t alpha; + if ( format == DXGI_FORMAT_R16G16B16A16_FLOAT ) + alpha = 0x3c00; + else if ( format == DXGI_FORMAT_R16G16B16A16_SNORM || format == DXGI_FORMAT_R16G16B16A16_SINT ) + alpha = 0x7fff; + else + alpha = 0xffff; + + if ( pDestination == pSource ) + { + uint16_t *dPtr = reinterpret_cast(pDestination); + for( size_t count = 0; count < outSize; count += 8 ) + { + dPtr += 3; + *(dPtr++) = alpha; + } + } + else + { + const uint16_t * __restrict sPtr = reinterpret_cast(pSource); + uint16_t * __restrict dPtr = reinterpret_cast(pDestination); + size_t size = std::min( outSize, inSize ); + for( size_t count = 0; count < size; count += 8 ) + { + *(dPtr++) = *(sPtr++); + *(dPtr++) = *(sPtr++); + *(dPtr++) = *(sPtr++); + *(dPtr++) = alpha; + sPtr++; + } + } + } + return; + + //----------------------------------------------------------------------------- + case DXGI_FORMAT_R10G10B10A2_TYPELESS: + case DXGI_FORMAT_R10G10B10A2_UNORM: + case DXGI_FORMAT_R10G10B10A2_UINT: + case DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM: + if ( pDestination == pSource ) + { + uint32_t *dPtr = reinterpret_cast(pDestination); + for( size_t count = 0; count < outSize; count += 4 ) + { +#pragma warning(suppress: 6001 6101) // PREFast doesn't properly understand the aliasing here. 
+ *dPtr |= 0xC0000000; + ++dPtr; + } + } + else + { + const uint32_t * __restrict sPtr = reinterpret_cast(pSource); + uint32_t * __restrict dPtr = reinterpret_cast(pDestination); + size_t size = std::min( outSize, inSize ); + for( size_t count = 0; count < size; count += 4 ) + { + *(dPtr++) = *(sPtr++) | 0xC0000000; + } + } + return; + + //----------------------------------------------------------------------------- + case DXGI_FORMAT_R8G8B8A8_TYPELESS: + case DXGI_FORMAT_R8G8B8A8_UNORM: + case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB: + case DXGI_FORMAT_R8G8B8A8_UINT: + case DXGI_FORMAT_R8G8B8A8_SNORM: + case DXGI_FORMAT_R8G8B8A8_SINT: + case DXGI_FORMAT_B8G8R8A8_UNORM: + case DXGI_FORMAT_B8G8R8A8_TYPELESS: + case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB: + { + const uint32_t alpha = ( format == DXGI_FORMAT_R8G8B8A8_SNORM || format == DXGI_FORMAT_R8G8B8A8_SINT ) ? 0x7f000000 : 0xff000000; + + if ( pDestination == pSource ) + { + uint32_t *dPtr = reinterpret_cast(pDestination); + for( size_t count = 0; count < outSize; count += 4 ) + { + uint32_t t = *dPtr & 0xFFFFFF; + t |= alpha; + *(dPtr++) = t; + } + } + else + { + const uint32_t * __restrict sPtr = reinterpret_cast(pSource); + uint32_t * __restrict dPtr = reinterpret_cast(pDestination); + size_t size = std::min( outSize, inSize ); + for( size_t count = 0; count < size; count += 4 ) + { + uint32_t t = *(sPtr++) & 0xFFFFFF; + t |= alpha; + *(dPtr++) = t; + } + } + } + return; + + //----------------------------------------------------------------------------- + case DXGI_FORMAT_B5G5R5A1_UNORM: + if ( pDestination == pSource ) + { + uint16_t *dPtr = reinterpret_cast(pDestination); + for( size_t count = 0; count < outSize; count += 2 ) + { + *(dPtr++) |= 0x8000; + } + } + else + { + const uint16_t * __restrict sPtr = reinterpret_cast(pSource); + uint16_t * __restrict dPtr = reinterpret_cast(pDestination); + size_t size = std::min( outSize, inSize ); + for( size_t count = 0; count < size; count += 2 ) + { + *(dPtr++) = *(sPtr++) | 0x8000; + } + } + return; + + //----------------------------------------------------------------------------- + case DXGI_FORMAT_A8_UNORM: + memset( pDestination, 0xff, outSize ); + return; + +#ifdef DXGI_1_2_FORMATS + //----------------------------------------------------------------------------- + case DXGI_FORMAT_B4G4R4A4_UNORM: + if ( pDestination == pSource ) + { + uint16_t *dPtr = reinterpret_cast(pDestination); + for( size_t count = 0; count < outSize; count += 2 ) + { + *(dPtr++) |= 0xF000; + } + } + else + { + const uint16_t * __restrict sPtr = reinterpret_cast(pSource); + uint16_t * __restrict dPtr = reinterpret_cast(pDestination); + size_t size = std::min( outSize, inSize ); + for( size_t count = 0; count < size; count += 2 ) + { + *(dPtr++) = *(sPtr++) | 0xF000; + } + } + return; +#endif // DXGI_1_2_FORMATS + } + } + + // Fall-through case is to just use memcpy (assuming this is not an in-place operation) + if ( pDestination == pSource ) + return; + + size_t size = std::min( outSize, inSize ); + memcpy_s( pDestination, outSize, pSource, size ); +} + + +//------------------------------------------------------------------------------------- +// Swizzles (RGB <-> BGR) an image row with optional clearing of alpha value to 1.0 +// (can be used in place as well) otherwise copies the image row unmodified. 
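+// A minimal usage sketch (again assuming `scanline`/`rowPitch` describe one row of the caller's
+// image): swap the red and blue channels of a BGRA row in place, leaving alpha untouched, by
+// passing 0 for the flags:
+//
+//     _SwizzleScanline( scanline, rowPitch, scanline, rowPitch,
+//                       DXGI_FORMAT_B8G8R8A8_UNORM, 0 );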
+//------------------------------------------------------------------------------------- +void _SwizzleScanline( LPVOID pDestination, size_t outSize, LPCVOID pSource, size_t inSize, DXGI_FORMAT format, DWORD flags ) +{ + assert( pDestination && outSize > 0 ); + assert( pSource && inSize > 0 ); + assert( IsValid(format) && !IsVideo(format) ); + + switch( format ) + { + //--------------------------------------------------------------------------------- + case DXGI_FORMAT_R10G10B10A2_TYPELESS: + case DXGI_FORMAT_R10G10B10A2_UNORM: + case DXGI_FORMAT_R10G10B10A2_UINT: + case DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM: + if ( flags & TEXP_SCANLINE_LEGACY ) + { + // Swap Red (R) and Blue (B) channel (used for D3DFMT_A2R10G10B10 legacy sources) + if ( pDestination == pSource ) + { + uint32_t *dPtr = reinterpret_cast(pDestination); + for( size_t count = 0; count < outSize; count += 4 ) + { +#pragma warning(suppress: 6001 6101) // PREFast doesn't properly understand the aliasing here. + uint32_t t = *dPtr; + + uint32_t t1 = (t & 0x3ff00000) >> 20; + uint32_t t2 = (t & 0x000003ff) << 20; + uint32_t t3 = (t & 0x000ffc00); + uint32_t ta = ( flags & TEXP_SCANLINE_SETALPHA ) ? 0xC0000000 : (t & 0xC0000000); + + *(dPtr++) = t1 | t2 | t3 | ta; + } + } + else + { + const uint32_t * __restrict sPtr = reinterpret_cast(pSource); + uint32_t * __restrict dPtr = reinterpret_cast(pDestination); + size_t size = std::min( outSize, inSize ); + for( size_t count = 0; count < size; count += 4 ) + { + uint32_t t = *(sPtr++); + + uint32_t t1 = (t & 0x3ff00000) >> 20; + uint32_t t2 = (t & 0x000003ff) << 20; + uint32_t t3 = (t & 0x000ffc00); + uint32_t ta = ( flags & TEXP_SCANLINE_SETALPHA ) ? 0xC0000000 : (t & 0xC0000000); + + *(dPtr++) = t1 | t2 | t3 | ta; + } + } + return; + } + break; + + //--------------------------------------------------------------------------------- + case DXGI_FORMAT_R8G8B8A8_TYPELESS: + case DXGI_FORMAT_R8G8B8A8_UNORM: + case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB: + case DXGI_FORMAT_B8G8R8A8_UNORM: + case DXGI_FORMAT_B8G8R8X8_UNORM: + case DXGI_FORMAT_B8G8R8A8_TYPELESS: + case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB: + case DXGI_FORMAT_B8G8R8X8_TYPELESS: + case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB: + // Swap Red (R) and Blue (B) channels (used to convert from DXGI 1.1 BGR formats to DXGI 1.0 RGB) + if ( pDestination == pSource ) + { + uint32_t *dPtr = reinterpret_cast(pDestination); + for( size_t count = 0; count < outSize; count += 4 ) + { + uint32_t t = *dPtr; + + uint32_t t1 = (t & 0x00ff0000) >> 16; + uint32_t t2 = (t & 0x000000ff) << 16; + uint32_t t3 = (t & 0x0000ff00); + uint32_t ta = ( flags & TEXP_SCANLINE_SETALPHA ) ? 0xff000000 : (t & 0xFF000000); + + *(dPtr++) = t1 | t2 | t3 | ta; + } + } + else + { + const uint32_t * __restrict sPtr = reinterpret_cast(pSource); + uint32_t * __restrict dPtr = reinterpret_cast(pDestination); + size_t size = std::min( outSize, inSize ); + for( size_t count = 0; count < size; count += 4 ) + { + uint32_t t = *(sPtr++); + + uint32_t t1 = (t & 0x00ff0000) >> 16; + uint32_t t2 = (t & 0x000000ff) << 16; + uint32_t t3 = (t & 0x0000ff00); + uint32_t ta = ( flags & TEXP_SCANLINE_SETALPHA ) ? 
0xff000000 : (t & 0xFF000000); + + *(dPtr++) = t1 | t2 | t3 | ta; + } + } + return; + } + + // Fall-through case is to just use memcpy (assuming this is not an in-place operation) + if ( pDestination == pSource ) + return; + + size_t size = std::min( outSize, inSize ); + memcpy_s( pDestination, outSize, pSource, size ); +} + + +//------------------------------------------------------------------------------------- +// Converts an image row with optional clearing of alpha value to 1.0 +// Returns true if supported, false if expansion case not supported +//------------------------------------------------------------------------------------- +bool _ExpandScanline( LPVOID pDestination, size_t outSize, DXGI_FORMAT outFormat, + LPCVOID pSource, size_t inSize, DXGI_FORMAT inFormat, DWORD flags ) +{ + assert( pDestination && outSize > 0 ); + assert( pSource && inSize > 0 ); + assert( IsValid(outFormat) && !IsVideo(outFormat) ); + assert( IsValid(inFormat) && !IsVideo(inFormat) ); + + switch( inFormat ) + { + case DXGI_FORMAT_B5G6R5_UNORM: + if ( outFormat != DXGI_FORMAT_R8G8B8A8_UNORM ) + return false; + + // DXGI_FORMAT_B5G6R5_UNORM -> DXGI_FORMAT_R8G8B8A8_UNORM + { + const uint16_t * __restrict sPtr = reinterpret_cast(pSource); + uint32_t * __restrict dPtr = reinterpret_cast(pDestination); + + for( size_t ocount = 0, icount = 0; ((icount < inSize) && (ocount < outSize)); icount += 2, ocount += 4 ) + { + uint16_t t = *(sPtr++); + + uint32_t t1 = ((t & 0xf800) >> 8) | ((t & 0xe000) >> 13); + uint32_t t2 = ((t & 0x07e0) << 5) | ((t & 0x0600) >> 5); + uint32_t t3 = ((t & 0x001f) << 19) | ((t & 0x001c) << 14); + + *(dPtr++) = t1 | t2 | t3 | 0xff000000; + } + } + return true; + + case DXGI_FORMAT_B5G5R5A1_UNORM: + if ( outFormat != DXGI_FORMAT_R8G8B8A8_UNORM ) + return false; + + // DXGI_FORMAT_B5G5R5A1_UNORM -> DXGI_FORMAT_R8G8B8A8_UNORM + { + const uint16_t * __restrict sPtr = reinterpret_cast(pSource); + uint32_t * __restrict dPtr = reinterpret_cast(pDestination); + + for( size_t ocount = 0, icount = 0; ((icount < inSize) && (ocount < outSize)); icount += 2, ocount += 4 ) + { + uint16_t t = *(sPtr++); + + uint32_t t1 = ((t & 0x7c00) >> 7) | ((t & 0x7000) >> 12); + uint32_t t2 = ((t & 0x03e0) << 6) | ((t & 0x0380) << 1); + uint32_t t3 = ((t & 0x001f) << 19) | ((t & 0x001c) << 14); + uint32_t ta = ( flags & TEXP_SCANLINE_SETALPHA ) ? 0xff000000 : ((t & 0x8000) ? 0xff000000 : 0); + + *(dPtr++) = t1 | t2 | t3 | ta; + } + } + return true; + +#ifdef DXGI_1_2_FORMATS + case DXGI_FORMAT_B4G4R4A4_UNORM: + if ( outFormat != DXGI_FORMAT_R8G8B8A8_UNORM ) + return false; + + // DXGI_FORMAT_B4G4R4A4_UNORM -> DXGI_FORMAT_R8G8B8A8_UNORM + { + const uint16_t * __restrict sPtr = reinterpret_cast(pSource); + uint32_t * __restrict dPtr = reinterpret_cast(pDestination); + + for( size_t ocount = 0, icount = 0; ((icount < inSize) && (ocount < outSize)); icount += 2, ocount += 4 ) + { + uint16_t t = *(sPtr++); + + uint32_t t1 = ((t & 0x0f00) >> 4) | ((t & 0x0f00) >> 8); + uint32_t t2 = ((t & 0x00f0) << 8) | ((t & 0x00f0) << 4); + uint32_t t3 = ((t & 0x000f) << 20) | ((t & 0x000f) << 16); + uint32_t ta = ( flags & TEXP_SCANLINE_SETALPHA ) ? 
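+                    // either force alpha fully opaque, or replicate the 4-bit alpha nibble into the full 8-bit alpha byte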
0xff000000 : (((t & 0xf000) << 16) | ((t & 0xf000) << 12)); + + *(dPtr++) = t1 | t2 | t3 | ta; + } + } + return true; +#endif // DXGI_1_2_FORMATS + } + + return false; +} + + +//------------------------------------------------------------------------------------- +// Loads an image row into standard RGBA XMVECTOR (aligned) array +//------------------------------------------------------------------------------------- +#define LOAD_SCANLINE( type, func )\ + if ( size >= sizeof(type) )\ + {\ + const type * __restrict sPtr = reinterpret_cast(pSource);\ + for( size_t icount = 0; icount < size; icount += sizeof(type) )\ + {\ + if ( dPtr >= ePtr ) break;\ + *(dPtr++) = func( sPtr++ );\ + }\ + return true;\ + }\ + return false; + +#define LOAD_SCANLINE3( type, func, defvec )\ + if ( size >= sizeof(type) )\ + {\ + const type * __restrict sPtr = reinterpret_cast(pSource);\ + for( size_t icount = 0; icount < size; icount += sizeof(type) )\ + {\ + XMVECTOR v = func( sPtr++ );\ + if ( dPtr >= ePtr ) break;\ + *(dPtr++) = XMVectorSelect( defvec, v, g_XMSelect1110 );\ + }\ + return true;\ + }\ + return false; + +#define LOAD_SCANLINE2( type, func, defvec )\ + if ( size >= sizeof(type) )\ + {\ + const type * __restrict sPtr = reinterpret_cast(pSource);\ + for( size_t icount = 0; icount < size; icount += sizeof(type) )\ + {\ + XMVECTOR v = func( sPtr++ );\ + if ( dPtr >= ePtr ) break;\ + *(dPtr++) = XMVectorSelect( defvec, v, g_XMSelect1100 );\ + }\ + return true;\ + }\ + return false; + +bool _LoadScanline( XMVECTOR* pDestination, size_t count, + LPCVOID pSource, size_t size, DXGI_FORMAT format ) +{ + assert( pDestination && count > 0 && (((uintptr_t)pDestination & 0xF) == 0) ); + assert( pSource && size > 0 ); + assert( IsValid(format) && !IsVideo(format) && !IsTypeless(format) && !IsCompressed(format) ); + + XMVECTOR* __restrict dPtr = pDestination; + if ( !dPtr ) + return false; + + const XMVECTOR* ePtr = pDestination + count; + + switch( format ) + { + case DXGI_FORMAT_R32G32B32A32_FLOAT: + { + size_t msize = (size > (sizeof(XMVECTOR)*count)) ? 
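+            // copy no more than the destination can hold (count vectors) and no more than the source provides (size bytes)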
(sizeof(XMVECTOR)*count) : size; + memcpy_s( dPtr, sizeof(XMVECTOR)*count, pSource, msize ); + } + return true; + + case DXGI_FORMAT_R32G32B32A32_UINT: + LOAD_SCANLINE( XMUINT4, XMLoadUInt4 ) + + case DXGI_FORMAT_R32G32B32A32_SINT: + LOAD_SCANLINE( XMINT4, XMLoadSInt4 ) + + case DXGI_FORMAT_R32G32B32_FLOAT: + LOAD_SCANLINE3( XMFLOAT3, XMLoadFloat3, g_XMIdentityR3 ) + + case DXGI_FORMAT_R32G32B32_UINT: + LOAD_SCANLINE3( XMUINT3, XMLoadUInt3, g_XMIdentityR3 ) + + case DXGI_FORMAT_R32G32B32_SINT: + LOAD_SCANLINE3( XMINT3, XMLoadSInt3, g_XMIdentityR3 ) + + case DXGI_FORMAT_R16G16B16A16_FLOAT: + LOAD_SCANLINE( XMHALF4, XMLoadHalf4 ) + + case DXGI_FORMAT_R16G16B16A16_UNORM: + LOAD_SCANLINE( XMUSHORTN4, XMLoadUShortN4 ) + + case DXGI_FORMAT_R16G16B16A16_UINT: + LOAD_SCANLINE( XMUSHORT4, XMLoadUShort4 ) + + case DXGI_FORMAT_R16G16B16A16_SNORM: + LOAD_SCANLINE( XMSHORTN4, XMLoadShortN4 ) + + case DXGI_FORMAT_R16G16B16A16_SINT: + LOAD_SCANLINE( XMSHORT4, XMLoadShort4 ) + + case DXGI_FORMAT_R32G32_FLOAT: + LOAD_SCANLINE2( XMFLOAT2, XMLoadFloat2, g_XMIdentityR3 ) + + case DXGI_FORMAT_R32G32_UINT: + LOAD_SCANLINE2( XMUINT2, XMLoadUInt2, g_XMIdentityR3 ) + + case DXGI_FORMAT_R32G32_SINT: + LOAD_SCANLINE2( XMINT2, XMLoadSInt2, g_XMIdentityR3 ) + + case DXGI_FORMAT_D32_FLOAT_S8X24_UINT: + if ( size >= (sizeof(float)+sizeof(uint32_t)) ) + { + const float * sPtr = reinterpret_cast(pSource); + for( size_t icount = 0; icount < size; icount += (sizeof(float)+sizeof(uint32_t)) ) + { + const uint8_t* ps8 = reinterpret_cast( &sPtr[1] ); + if ( dPtr >= ePtr ) break; + *(dPtr++) = XMVectorSet( sPtr[0], static_cast( *ps8 ), 0.f, 1.f ); + sPtr += 2; + } + return true; + } + return false; + + case DXGI_FORMAT_R10G10B10A2_UNORM: + case DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM: + LOAD_SCANLINE( XMUDECN4, XMLoadUDecN4 ); + + case DXGI_FORMAT_R10G10B10A2_UINT: + LOAD_SCANLINE( XMUDEC4, XMLoadUDec4 ); + + case DXGI_FORMAT_R11G11B10_FLOAT: + LOAD_SCANLINE3( XMFLOAT3PK, XMLoadFloat3PK, g_XMIdentityR3 ); + + case DXGI_FORMAT_R8G8B8A8_UNORM: + case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB: + LOAD_SCANLINE( XMUBYTEN4, XMLoadUByteN4 ) + + case DXGI_FORMAT_R8G8B8A8_UINT: + LOAD_SCANLINE( XMUBYTE4, XMLoadUByte4 ) + + case DXGI_FORMAT_R8G8B8A8_SNORM: + LOAD_SCANLINE( XMBYTEN4, XMLoadByteN4 ) + + case DXGI_FORMAT_R8G8B8A8_SINT: + LOAD_SCANLINE( XMBYTE4, XMLoadByte4 ) + + case DXGI_FORMAT_R16G16_FLOAT: + LOAD_SCANLINE2( XMHALF2, XMLoadHalf2, g_XMIdentityR3 ) + + case DXGI_FORMAT_R16G16_UNORM: + LOAD_SCANLINE2( XMUSHORTN2, XMLoadUShortN2, g_XMIdentityR3 ) + + case DXGI_FORMAT_R16G16_UINT: + LOAD_SCANLINE2( XMUSHORT2, XMLoadUShort2, g_XMIdentityR3 ) + + case DXGI_FORMAT_R16G16_SNORM: + LOAD_SCANLINE2( XMSHORTN2, XMLoadShortN2, g_XMIdentityR3 ) + + case DXGI_FORMAT_R16G16_SINT: + LOAD_SCANLINE2( XMSHORT2, XMLoadShort2, g_XMIdentityR3 ) + + case DXGI_FORMAT_D32_FLOAT: + case DXGI_FORMAT_R32_FLOAT: + if ( size >= sizeof(float) ) + { + const float* __restrict sPtr = reinterpret_cast(pSource); + for( size_t icount = 0; icount < size; icount += sizeof(float) ) + { + XMVECTOR v = XMLoadFloat( sPtr++ ); + if ( dPtr >= ePtr ) break; + *(dPtr++) = XMVectorSelect( g_XMIdentityR3, v, g_XMSelect1000 ); + } + return true; + } + return false; + + case DXGI_FORMAT_R32_UINT: + if ( size >= sizeof(uint32_t) ) + { + const uint32_t* __restrict sPtr = reinterpret_cast(pSource); + for( size_t icount = 0; icount < size; icount += sizeof(uint32_t) ) + { + XMVECTOR v = XMLoadInt( sPtr++ ); + v = XMConvertVectorUIntToFloat( v, 0 ); + if ( dPtr >= ePtr ) break; + *(dPtr++) 
= XMVectorSelect( g_XMIdentityR3, v, g_XMSelect1000 ); + } + return true; + } + return false; + + case DXGI_FORMAT_R32_SINT: + if ( size >= sizeof(int32_t) ) + { + const int32_t * __restrict sPtr = reinterpret_cast(pSource); + for( size_t icount = 0; icount < size; icount += sizeof(int32_t) ) + { + XMVECTOR v = XMLoadInt( reinterpret_cast (sPtr++) ); + v = XMConvertVectorIntToFloat( v, 0 ); + if ( dPtr >= ePtr ) break; + *(dPtr++) = XMVectorSelect( g_XMIdentityR3, v, g_XMSelect1000 ); + } + return true; + } + return false; + + case DXGI_FORMAT_D24_UNORM_S8_UINT: + if ( size >= sizeof(uint32_t) ) + { + const uint32_t * sPtr = reinterpret_cast(pSource); + for( size_t icount = 0; icount < size; icount += sizeof(uint32_t) ) + { + float d = static_cast( *sPtr & 0xFFFFFF ) / 16777215.f; + float s = static_cast( ( *sPtr & 0xFF000000 ) >> 24 ); + ++sPtr; + if ( dPtr >= ePtr ) break; + *(dPtr++) = XMVectorSet( d, s, 0.f, 1.f ); + } + return true; + } + return false; + + case DXGI_FORMAT_R8G8_UNORM: + LOAD_SCANLINE2( XMUBYTEN2, XMLoadUByteN2, g_XMIdentityR3 ) + + case DXGI_FORMAT_R8G8_UINT: + LOAD_SCANLINE2( XMUBYTE2, XMLoadUByte2, g_XMIdentityR3 ) + + case DXGI_FORMAT_R8G8_SNORM: + LOAD_SCANLINE2( XMBYTEN2, XMLoadByteN2, g_XMIdentityR3 ) + + case DXGI_FORMAT_R8G8_SINT: + LOAD_SCANLINE2( XMBYTE2, XMLoadByte2, g_XMIdentityR3 ) + + case DXGI_FORMAT_R16_FLOAT: + if ( size >= sizeof(HALF) ) + { + const HALF * __restrict sPtr = reinterpret_cast(pSource); + for( size_t icount = 0; icount < size; icount += sizeof(HALF) ) + { + if ( dPtr >= ePtr ) break; + *(dPtr++) = XMVectorSet( XMConvertHalfToFloat(*sPtr++), 0.f, 0.f, 1.f ); + } + return true; + } + return false; + + case DXGI_FORMAT_D16_UNORM: + case DXGI_FORMAT_R16_UNORM: + if ( size >= sizeof(uint16_t) ) + { + const uint16_t* __restrict sPtr = reinterpret_cast(pSource); + for( size_t icount = 0; icount < size; icount += sizeof(uint16_t) ) + { + if ( dPtr >= ePtr ) break; + *(dPtr++) = XMVectorSet( static_cast(*sPtr++) / 65535.f, 0.f, 0.f, 1.f ); + } + return true; + } + return false; + + case DXGI_FORMAT_R16_UINT: + if ( size >= sizeof(uint16_t) ) + { + const uint16_t * __restrict sPtr = reinterpret_cast(pSource); + for( size_t icount = 0; icount < size; icount += sizeof(uint16_t) ) + { + if ( dPtr >= ePtr ) break; + *(dPtr++) = XMVectorSet( static_cast(*sPtr++), 0.f, 0.f, 1.f ); + } + return true; + } + return false; + + case DXGI_FORMAT_R16_SNORM: + if ( size >= sizeof(int16_t) ) + { + const int16_t * __restrict sPtr = reinterpret_cast(pSource); + for( size_t icount = 0; icount < size; icount += sizeof(int16_t) ) + { + if ( dPtr >= ePtr ) break; + *(dPtr++) = XMVectorSet( static_cast(*sPtr++) / 32767.f, 0.f, 0.f, 1.f ); + } + return true; + } + return false; + + case DXGI_FORMAT_R16_SINT: + if ( size >= sizeof(int16_t) ) + { + const int16_t * __restrict sPtr = reinterpret_cast(pSource); + for( size_t icount = 0; icount < size; icount += sizeof(int16_t) ) + { + if ( dPtr >= ePtr ) break; + *(dPtr++) = XMVectorSet( static_cast(*sPtr++), 0.f, 0.f, 1.f ); + } + return true; + } + return false; + + case DXGI_FORMAT_R8_UNORM: + if ( size >= sizeof(uint8_t) ) + { + const uint8_t * __restrict sPtr = reinterpret_cast(pSource); + for( size_t icount = 0; icount < size; icount += sizeof(uint8_t) ) + { + if ( dPtr >= ePtr ) break; + *(dPtr++) = XMVectorSet( static_cast(*sPtr++) / 255.f, 0.f, 0.f, 1.f ); + } + return true; + } + return false; + + case DXGI_FORMAT_R8_UINT: + if ( size >= sizeof(uint8_t) ) + { + const uint8_t * __restrict sPtr = 
reinterpret_cast(pSource); + for( size_t icount = 0; icount < size; icount += sizeof(uint8_t) ) + { + if ( dPtr >= ePtr ) break; + *(dPtr++) = XMVectorSet( static_cast(*sPtr++), 0.f, 0.f, 1.f ); + } + return true; + } + return false; + + case DXGI_FORMAT_R8_SNORM: + if ( size >= sizeof(char) ) + { + const char * __restrict sPtr = reinterpret_cast(pSource); + for( size_t icount = 0; icount < size; icount += sizeof(char) ) + { + if ( dPtr >= ePtr ) break; + *(dPtr++) = XMVectorSet( static_cast(*sPtr++) / 127.f, 0.f, 0.f, 1.f ); + } + return true; + } + return false; + + case DXGI_FORMAT_R8_SINT: + if ( size >= sizeof(char) ) + { + const char * __restrict sPtr = reinterpret_cast(pSource); + for( size_t icount = 0; icount < size; icount += sizeof(char) ) + { + if ( dPtr >= ePtr ) break; + *(dPtr++) = XMVectorSet( static_cast(*sPtr++), 0.f, 0.f, 1.f ); + } + return true; + } + return false; + + case DXGI_FORMAT_A8_UNORM: + if ( size >= sizeof(uint8_t) ) + { + const uint8_t * __restrict sPtr = reinterpret_cast(pSource); + for( size_t icount = 0; icount < size; icount += sizeof(uint8_t) ) + { + if ( dPtr >= ePtr ) break; + *(dPtr++) = XMVectorSet( 0.f, 0.f, 0.f, static_cast(*sPtr++) / 255.f ); + } + return true; + } + return false; + + case DXGI_FORMAT_R1_UNORM: + if ( size >= sizeof(uint8_t) ) + { + const uint8_t * __restrict sPtr = reinterpret_cast(pSource); + for( size_t icount = 0; icount < size; icount += sizeof(uint8_t) ) + { + for( size_t bcount = 0; bcount < 8; ++bcount ) + { + if ( dPtr >= ePtr ) break; + *(dPtr++) = XMVectorSet( (((*sPtr >> bcount) & 0x1) ? 1.f : 0.f), 0.f, 0.f, 1.f ); + } + + ++sPtr; + } + return true; + } + return false; + + case DXGI_FORMAT_R9G9B9E5_SHAREDEXP: + LOAD_SCANLINE3( XMFLOAT3SE, XMLoadFloat3SE, g_XMIdentityR3 ) + + case DXGI_FORMAT_R8G8_B8G8_UNORM: + if ( size >= sizeof(XMUBYTEN4) ) + { + const XMUBYTEN4 * __restrict sPtr = reinterpret_cast(pSource); + for( size_t icount = 0; icount < size; icount += sizeof(XMUBYTEN4) ) + { + XMVECTOR v = XMLoadUByteN4( sPtr++ ); + XMVECTOR v1 = XMVectorSwizzle<0, 3, 2, 1>( v ); + if ( dPtr >= ePtr ) break; + *(dPtr++) = XMVectorSelect( g_XMIdentityR3, v, g_XMSelect1110 ); + if ( dPtr >= ePtr ) break; + *(dPtr++) = XMVectorSelect( g_XMIdentityR3, v1, g_XMSelect1110 ); + } + return true; + } + return false; + + case DXGI_FORMAT_G8R8_G8B8_UNORM: + if ( size >= sizeof(XMUBYTEN4) ) + { + const XMUBYTEN4 * __restrict sPtr = reinterpret_cast(pSource); + for( size_t icount = 0; icount < size; icount += sizeof(XMUBYTEN4) ) + { + XMVECTOR v = XMLoadUByteN4( sPtr++ ); + XMVECTOR v0 = XMVectorSwizzle<1, 0, 3, 2>( v ); + XMVECTOR v1 = XMVectorSwizzle<1, 2, 3, 0>( v ); + if ( dPtr >= ePtr ) break; + *(dPtr++) = XMVectorSelect( g_XMIdentityR3, v0, g_XMSelect1110 ); + if ( dPtr >= ePtr ) break; + *(dPtr++) = XMVectorSelect( g_XMIdentityR3, v1, g_XMSelect1110 ); + } + return true; + } + return false; + + case DXGI_FORMAT_B5G6R5_UNORM: + if ( size >= sizeof(XMU565) ) + { + static XMVECTORF32 s_Scale = { 1.f/31.f, 1.f/63.f, 1.f/31.f, 1.f }; + const XMU565 * __restrict sPtr = reinterpret_cast(pSource); + for( size_t icount = 0; icount < size; icount += sizeof(XMU565) ) + { + XMVECTOR v = XMLoadU565( sPtr++ ); + v = XMVectorMultiply( v, s_Scale ); + v = XMVectorSwizzle<2, 1, 0, 3>( v ); + if ( dPtr >= ePtr ) break; + *(dPtr++) = XMVectorSelect( g_XMIdentityR3, v, g_XMSelect1110 ); + } + return true; + } + return false; + + case DXGI_FORMAT_B5G5R5A1_UNORM: + if ( size >= sizeof(XMU555) ) + { + static XMVECTORF32 s_Scale = { 1.f/31.f, 1.f/31.f, 
1.f/31.f, 1.f }; + const XMU555 * __restrict sPtr = reinterpret_cast(pSource); + for( size_t icount = 0; icount < size; icount += sizeof(XMU555) ) + { + XMVECTOR v = XMLoadU555( sPtr++ ); + v = XMVectorMultiply( v, s_Scale ); + if ( dPtr >= ePtr ) break; + *(dPtr++) = XMVectorSwizzle<2, 1, 0, 3>( v ); + } + return true; + } + return false; + + case DXGI_FORMAT_B8G8R8A8_UNORM: + case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB: + if ( size >= sizeof(XMUBYTEN4) ) + { + const XMUBYTEN4 * __restrict sPtr = reinterpret_cast(pSource); + for( size_t icount = 0; icount < size; icount += sizeof(XMUBYTEN4) ) + { + XMVECTOR v = XMLoadUByteN4( sPtr++ ); + if ( dPtr >= ePtr ) break; + *(dPtr++) = XMVectorSwizzle<2, 1, 0, 3>( v ); + } + return true; + } + return false; + + case DXGI_FORMAT_B8G8R8X8_UNORM: + case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB: + if ( size >= sizeof(XMUBYTEN4) ) + { + const XMUBYTEN4 * __restrict sPtr = reinterpret_cast(pSource); + for( size_t icount = 0; icount < size; icount += sizeof(XMUBYTEN4) ) + { + XMVECTOR v = XMLoadUByteN4( sPtr++ ); + v = XMVectorSwizzle<2, 1, 0, 3>( v ); + if ( dPtr >= ePtr ) break; + *(dPtr++) = XMVectorSelect( g_XMIdentityR3, v, g_XMSelect1110 ); + } + return true; + } + return false; + +#ifdef DXGI_1_2_FORMATS + case DXGI_FORMAT_B4G4R4A4_UNORM: + if ( size >= sizeof(XMUNIBBLE4) ) + { + static XMVECTORF32 s_Scale = { 1.f/15.f, 1.f/15.f, 1.f/15.f, 1.f/15.f }; + const XMUNIBBLE4 * __restrict sPtr = reinterpret_cast(pSource); + for( size_t icount = 0; icount < size; icount += sizeof(XMUNIBBLE4) ) + { + XMVECTOR v = XMLoadUNibble4( sPtr++ ); + v = XMVectorMultiply( v, s_Scale ); + if ( dPtr >= ePtr ) break; + *(dPtr++) = XMVectorSwizzle<2, 1, 0, 3>( v ); + } + return true; + } + return false; + + // we don't support the video formats ( see IsVideo function ) +#endif // DXGI_1_2_FORMATS + + default: + return false; + } +} + + +//------------------------------------------------------------------------------------- +// Stores an image row from standard RGBA XMVECTOR (aligned) array +//------------------------------------------------------------------------------------- +#define STORE_SCANLINE( type, func )\ + if ( size >= sizeof(type) )\ + {\ + type * __restrict dPtr = reinterpret_cast(pDestination);\ + for( size_t icount = 0; icount < size; icount += sizeof(type) )\ + {\ + if ( sPtr >= ePtr ) break;\ + func( dPtr++, *sPtr++ );\ + }\ + }\ + return true; + +bool _StoreScanline( LPVOID pDestination, size_t size, DXGI_FORMAT format, + const XMVECTOR* pSource, size_t count ) +{ + assert( pDestination && size > 0 ); + assert( pSource && count > 0 && (((uintptr_t)pSource & 0xF) == 0) ); + assert( IsValid(format) && !IsVideo(format) && !IsTypeless(format) && !IsCompressed(format) ); + + const XMVECTOR* __restrict sPtr = pSource; + if ( !sPtr ) + return false; + + const XMVECTOR* ePtr = pSource + count; + + switch( format ) + { + case DXGI_FORMAT_R32G32B32A32_FLOAT: + STORE_SCANLINE( XMFLOAT4, XMStoreFloat4 ) + + case DXGI_FORMAT_R32G32B32A32_UINT: + STORE_SCANLINE( XMUINT4, XMStoreUInt4 ) + + case DXGI_FORMAT_R32G32B32A32_SINT: + STORE_SCANLINE( XMINT4, XMStoreSInt4 ) + + case DXGI_FORMAT_R32G32B32_FLOAT: + STORE_SCANLINE( XMFLOAT3, XMStoreFloat3 ) + + case DXGI_FORMAT_R32G32B32_UINT: + STORE_SCANLINE( XMUINT3, XMStoreUInt3 ) + + case DXGI_FORMAT_R32G32B32_SINT: + STORE_SCANLINE( XMINT3, XMStoreSInt3 ) + + case DXGI_FORMAT_R16G16B16A16_FLOAT: + STORE_SCANLINE( XMHALF4, XMStoreHalf4 ) + + case DXGI_FORMAT_R16G16B16A16_UNORM: + STORE_SCANLINE( XMUSHORTN4, XMStoreUShortN4 ) + + case 
DXGI_FORMAT_R16G16B16A16_UINT: + STORE_SCANLINE( XMUSHORT4, XMStoreUShort4 ) + + case DXGI_FORMAT_R16G16B16A16_SNORM: + STORE_SCANLINE( XMSHORTN4, XMStoreShortN4 ) + + case DXGI_FORMAT_R16G16B16A16_SINT: + STORE_SCANLINE( XMSHORT4, XMStoreShort4 ) + + case DXGI_FORMAT_R32G32_FLOAT: + STORE_SCANLINE( XMFLOAT2, XMStoreFloat2 ) + + case DXGI_FORMAT_R32G32_UINT: + STORE_SCANLINE( XMUINT2, XMStoreUInt2 ) + + case DXGI_FORMAT_R32G32_SINT: + STORE_SCANLINE( XMINT2, XMStoreSInt2 ) + + case DXGI_FORMAT_D32_FLOAT_S8X24_UINT: + if ( size >= (sizeof(float)+sizeof(uint32_t)) ) + { + float *dPtr = reinterpret_cast(pDestination); + for( size_t icount = 0; icount < size; icount += (sizeof(float)+sizeof(uint32_t)) ) + { + if ( sPtr >= ePtr ) break; + XMFLOAT4 f; + XMStoreFloat4( &f, *sPtr++ ); + dPtr[0] = f.x; + uint8_t* ps8 = reinterpret_cast( &dPtr[1] ); + ps8[0] = static_cast( std::min( 255.f, std::max( 0.f, f.y ) ) ); + ps8[1] = ps8[2] = ps8[3] = 0; + dPtr += 2; + } + } + return true; + + case DXGI_FORMAT_R10G10B10A2_UNORM: + case DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM: + STORE_SCANLINE( XMUDECN4, XMStoreUDecN4 ); + + case DXGI_FORMAT_R10G10B10A2_UINT: + STORE_SCANLINE( XMUDEC4, XMStoreUDec4 ); + + case DXGI_FORMAT_R11G11B10_FLOAT: + STORE_SCANLINE( XMFLOAT3PK, XMStoreFloat3PK ); + + case DXGI_FORMAT_R8G8B8A8_UNORM: + case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB: + STORE_SCANLINE( XMUBYTEN4, XMStoreUByteN4 ) + + case DXGI_FORMAT_R8G8B8A8_UINT: + STORE_SCANLINE( XMUBYTE4, XMStoreUByte4 ) + + case DXGI_FORMAT_R8G8B8A8_SNORM: + STORE_SCANLINE( XMBYTEN4, XMStoreByteN4 ) + + case DXGI_FORMAT_R8G8B8A8_SINT: + STORE_SCANLINE( XMBYTE4, XMStoreByte4 ) + + case DXGI_FORMAT_R16G16_FLOAT: + STORE_SCANLINE( XMHALF2, XMStoreHalf2 ) + + case DXGI_FORMAT_R16G16_UNORM: + STORE_SCANLINE( XMUSHORTN2, XMStoreUShortN2 ) + + case DXGI_FORMAT_R16G16_UINT: + STORE_SCANLINE( XMUSHORT2, XMStoreUShort2 ) + + case DXGI_FORMAT_R16G16_SNORM: + STORE_SCANLINE( XMSHORTN2, XMStoreShortN2 ) + + case DXGI_FORMAT_R16G16_SINT: + STORE_SCANLINE( XMSHORT2, XMStoreShort2 ) + + case DXGI_FORMAT_D32_FLOAT: + case DXGI_FORMAT_R32_FLOAT: + if ( size >= sizeof(float) ) + { + float * __restrict dPtr = reinterpret_cast(pDestination); + for( size_t icount = 0; icount < size; icount += sizeof(float) ) + { + if ( sPtr >= ePtr ) break; + XMStoreFloat( dPtr++, *(sPtr++) ); + } + } + return true; + + case DXGI_FORMAT_R32_UINT: + if ( size >= sizeof(uint32_t) ) + { + uint32_t * __restrict dPtr = reinterpret_cast(pDestination); + for( size_t icount = 0; icount < size; icount += sizeof(uint32_t) ) + { + if ( sPtr >= ePtr ) break; + XMVECTOR v = XMConvertVectorFloatToUInt( *(sPtr++), 0 ); + XMStoreInt( dPtr++, v ); + } + } + return true; + + case DXGI_FORMAT_R32_SINT: + if ( size >= sizeof(uint32_t) ) + { + uint32_t * __restrict dPtr = reinterpret_cast(pDestination); + for( size_t icount = 0; icount < size; icount += sizeof(uint32_t) ) + { + if ( sPtr >= ePtr ) break; + XMVECTOR v = XMConvertVectorFloatToInt( *(sPtr++), 0 ); + XMStoreInt( dPtr++, v ); + } + } + return true; + + case DXGI_FORMAT_D24_UNORM_S8_UINT: + if ( size >= sizeof(uint32_t) ) + { + static const XMVECTORF32 clamp = { 1.f, 255.f, 0.f, 0.f }; + XMVECTOR zero = XMVectorZero(); + uint32_t *dPtr = reinterpret_cast(pDestination); + for( size_t icount = 0; icount < size; icount += sizeof(uint32_t) ) + { + if ( sPtr >= ePtr ) break; + XMFLOAT4 f; + XMStoreFloat4( &f, XMVectorClamp( *sPtr++, zero, clamp ) ); + *dPtr++ = (static_cast( f.x * 16777215.f ) & 0xFFFFFF) + | ((static_cast( f.y ) & 0xFF) << 24); 
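+                // 24-bit depth packed into the low bits, 8-bit stencil into the top byte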
+ } + } + return true; + + case DXGI_FORMAT_R8G8_UNORM: + STORE_SCANLINE( XMUBYTEN2, XMStoreUByteN2 ) + + case DXGI_FORMAT_R8G8_UINT: + STORE_SCANLINE( XMUBYTE2, XMStoreUByte2 ) + + case DXGI_FORMAT_R8G8_SNORM: + STORE_SCANLINE( XMBYTEN2, XMStoreByteN2 ) + + case DXGI_FORMAT_R8G8_SINT: + STORE_SCANLINE( XMBYTE2, XMStoreByte2 ) + + case DXGI_FORMAT_R16_FLOAT: + if ( size >= sizeof(HALF) ) + { + HALF * __restrict dPtr = reinterpret_cast(pDestination); + for( size_t icount = 0; icount < size; icount += sizeof(HALF) ) + { + if ( sPtr >= ePtr ) break; + float v = XMVectorGetX( *sPtr++ ); + *(dPtr++) = XMConvertFloatToHalf(v); + } + } + return true; + + case DXGI_FORMAT_D16_UNORM: + case DXGI_FORMAT_R16_UNORM: + if ( size >= sizeof(int16_t) ) + { + int16_t * __restrict dPtr = reinterpret_cast(pDestination); + for( size_t icount = 0; icount < size; icount += sizeof(int16_t) ) + { + if ( sPtr >= ePtr ) break; + float v = XMVectorGetX( *sPtr++ ); + v = std::max( std::min( v, 1.f ), 0.f ); + *(dPtr++) = static_cast( v*65535.f + 0.5f ); + } + } + return true; + + case DXGI_FORMAT_R16_UINT: + if ( size >= sizeof(uint16_t) ) + { + uint16_t * __restrict dPtr = reinterpret_cast(pDestination); + for( size_t icount = 0; icount < size; icount += sizeof(uint16_t) ) + { + if ( sPtr >= ePtr ) break; + float v = XMVectorGetX( *sPtr++ ); + v = std::max( std::min( v, 65535.f ), 0.f ); + *(dPtr++) = static_cast(v); + } + } + return true; + + case DXGI_FORMAT_R16_SNORM: + if ( size >= sizeof(int16_t) ) + { + int16_t * __restrict dPtr = reinterpret_cast(pDestination); + for( size_t icount = 0; icount < size; icount += sizeof(int16_t) ) + { + if ( sPtr >= ePtr ) break; + float v = XMVectorGetX( *sPtr++ ); + v = std::max( std::min( v, 1.f ), -1.f ); + *(dPtr++) = static_cast( v * 32767.f ); + } + } + return true; + + case DXGI_FORMAT_R16_SINT: + if ( size >= sizeof(int16_t) ) + { + int16_t * __restrict dPtr = reinterpret_cast(pDestination); + for( size_t icount = 0; icount < size; icount += sizeof(int16_t) ) + { + if ( sPtr >= ePtr ) break; + float v = XMVectorGetX( *sPtr++ ); + v = std::max( std::min( v, 32767.f ), -32767.f ); + *(dPtr++) = static_cast(v); + } + } + return true; + + case DXGI_FORMAT_R8_UNORM: + if ( size >= sizeof(uint8_t) ) + { + uint8_t * __restrict dPtr = reinterpret_cast(pDestination); + for( size_t icount = 0; icount < size; icount += sizeof(uint8_t) ) + { + if ( sPtr >= ePtr ) break; + float v = XMVectorGetX( *sPtr++ ); + v = std::max( std::min( v, 1.f ), 0.f ); + *(dPtr++) = static_cast( v * 255.f); + } + } + return true; + + case DXGI_FORMAT_R8_UINT: + if ( size >= sizeof(uint8_t) ) + { + uint8_t * __restrict dPtr = reinterpret_cast(pDestination); + for( size_t icount = 0; icount < size; icount += sizeof(uint8_t) ) + { + if ( sPtr >= ePtr ) break; + float v = XMVectorGetX( *sPtr++ ); + v = std::max( std::min( v, 255.f ), 0.f ); + *(dPtr++) = static_cast(v); + } + } + return true; + + case DXGI_FORMAT_R8_SNORM: + if ( size >= sizeof(char) ) + { + char * __restrict dPtr = reinterpret_cast(pDestination); + for( size_t icount = 0; icount < size; icount += sizeof(char) ) + { + if ( sPtr >= ePtr ) break; + float v = XMVectorGetX( *sPtr++ ); + v = std::max( std::min( v, 1.f ), -1.f ); + *(dPtr++) = static_cast( v * 127.f ); + } + } + return true; + + case DXGI_FORMAT_R8_SINT: + if ( size >= sizeof(char) ) + { + char * __restrict dPtr = reinterpret_cast(pDestination); + for( size_t icount = 0; icount < size; icount += sizeof(char) ) + { + if ( sPtr >= ePtr ) break; + float v = XMVectorGetX( *sPtr++ 
); + v = std::max( std::min( v, 127.f ), -127.f ); + *(dPtr++) = static_cast( v ); + } + } + return true; + + case DXGI_FORMAT_A8_UNORM: + if ( size >= sizeof(uint8_t) ) + { + uint8_t * __restrict dPtr = reinterpret_cast(pDestination); + for( size_t icount = 0; icount < size; icount += sizeof(uint8_t) ) + { + if ( sPtr >= ePtr ) break; + float v = XMVectorGetW( *sPtr++ ); + v = std::max( std::min( v, 1.f ), 0.f ); + *(dPtr++) = static_cast( v * 255.f); + } + } + return true; + + case DXGI_FORMAT_R1_UNORM: + if ( size >= sizeof(uint8_t) ) + { + uint8_t * __restrict dPtr = reinterpret_cast(pDestination); + for( size_t icount = 0; icount < size; icount += sizeof(uint8_t) ) + { + uint8_t pixels = 0; + for( size_t bcount = 0; bcount < 8; ++bcount ) + { + if ( sPtr >= ePtr ) break; + float v = XMVectorGetX( *sPtr++ ); + if ( v > 0.5f ) + pixels |= 1 << bcount; + } + *(dPtr++) = pixels; + } + } + return true; + + case DXGI_FORMAT_R9G9B9E5_SHAREDEXP: + STORE_SCANLINE( XMFLOAT3SE, XMStoreFloat3SE ) + + case DXGI_FORMAT_R8G8_B8G8_UNORM: + if ( size >= sizeof(XMUBYTEN4) ) + { + XMUBYTEN4 * __restrict dPtr = reinterpret_cast(pDestination); + for( size_t icount = 0; icount < size; icount += sizeof(XMUBYTEN4) ) + { + if ( sPtr >= ePtr ) break; + XMVECTOR v0 = *sPtr++; + XMVECTOR v1 = (sPtr < ePtr) ? XMVectorSplatY( *sPtr++ ) : XMVectorZero(); + XMVECTOR v = XMVectorSelect( v1, v0, g_XMSelect1110 ); + XMStoreUByteN4( dPtr++, v ); + } + } + return true; + + case DXGI_FORMAT_G8R8_G8B8_UNORM: + if ( size >= sizeof(XMUBYTEN4) ) + { + static XMVECTORI32 select1101 = {XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1}; + + XMUBYTEN4 * __restrict dPtr = reinterpret_cast(pDestination); + for( size_t icount = 0; icount < size; icount += sizeof(XMUBYTEN4) ) + { + if ( sPtr >= ePtr ) break; + XMVECTOR v0 = XMVectorSwizzle<1, 0, 3, 2>( *sPtr++ ); + XMVECTOR v1 = (sPtr < ePtr) ? 
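+                // the second pixel of the pair contributes only its green channel; use zero if the row ends mid-pair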
XMVectorSplatY( *sPtr++ ) : XMVectorZero(); + XMVECTOR v = XMVectorSelect( v1, v0, select1101 ); + XMStoreUByteN4( dPtr++, v ); + } + } + return true; + + case DXGI_FORMAT_B5G6R5_UNORM: + if ( size >= sizeof(XMU565) ) + { + static XMVECTORF32 s_Scale = { 31.f, 63.f, 31.f, 1.f }; + XMU565 * __restrict dPtr = reinterpret_cast(pDestination); + for( size_t icount = 0; icount < size; icount += sizeof(XMU565) ) + { + if ( sPtr >= ePtr ) break; + XMVECTOR v = XMVectorSwizzle<2, 1, 0, 3>( *sPtr++ ); + v = XMVectorMultiply( v, s_Scale ); + XMStoreU565( dPtr++, v ); + } + } + return true; + + case DXGI_FORMAT_B5G5R5A1_UNORM: + if ( size >= sizeof(XMU555) ) + { + static XMVECTORF32 s_Scale = { 31.f, 31.f, 31.f, 1.f }; + XMU555 * __restrict dPtr = reinterpret_cast(pDestination); + for( size_t icount = 0; icount < size; icount += sizeof(XMU555) ) + { + if ( sPtr >= ePtr ) break; + XMVECTOR v = XMVectorSwizzle<2, 1, 0, 3>( *sPtr++ ); + v = XMVectorMultiply( v, s_Scale ); + XMStoreU555( dPtr++, v ); + } + } + return true; + + case DXGI_FORMAT_B8G8R8A8_UNORM: + case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB: + if ( size >= sizeof(XMUBYTEN4) ) + { + XMUBYTEN4 * __restrict dPtr = reinterpret_cast(pDestination); + for( size_t icount = 0; icount < size; icount += sizeof(XMUBYTEN4) ) + { + if ( sPtr >= ePtr ) break; + XMVECTOR v = XMVectorSwizzle<2, 1, 0, 3>( *sPtr++ ); + XMStoreUByteN4( dPtr++, v ); + } + } + return true; + + case DXGI_FORMAT_B8G8R8X8_UNORM: + case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB: + if ( size >= sizeof(XMUBYTEN4) ) + { + XMUBYTEN4 * __restrict dPtr = reinterpret_cast(pDestination); + for( size_t icount = 0; icount < size; icount += sizeof(XMUBYTEN4) ) + { + if ( sPtr >= ePtr ) break; + XMVECTOR v = XMVectorPermute<2, 1, 0, 7>( *sPtr++, g_XMIdentityR3 ); + XMStoreUByteN4( dPtr++, v ); + } + } + return true; + +#ifdef DXGI_1_2_FORMATS + case DXGI_FORMAT_B4G4R4A4_UNORM: + if ( size >= sizeof(XMUNIBBLE4) ) + { + static XMVECTORF32 s_Scale = { 15.f, 15.f, 15.f, 15.f }; + XMUNIBBLE4 * __restrict dPtr = reinterpret_cast(pDestination); + for( size_t icount = 0; icount < size; icount += sizeof(XMUNIBBLE4) ) + { + if ( sPtr >= ePtr ) break; + XMVECTOR v = XMVectorSwizzle<2, 1, 0, 3>( *sPtr++ ); + v = XMVectorMultiply( v, s_Scale ); + XMStoreUNibble4( dPtr++, v ); + } + } + return true; + + // We don't support the video formats ( see IsVideo function ) +#endif // DXGI_1_2_FORMATS + + default: + return false; + } +} + + +//------------------------------------------------------------------------------------- +// Convert DXGI image to/from GUID_WICPixelFormat128bppRGBAFloat (no range conversions) +//------------------------------------------------------------------------------------- +HRESULT _ConvertToR32G32B32A32( const Image& srcImage, ScratchImage& image ) +{ + if ( !srcImage.pixels ) + return E_POINTER; + + HRESULT hr = image.Initialize2D( DXGI_FORMAT_R32G32B32A32_FLOAT, srcImage.width, srcImage.height, 1, 1 ); + if ( FAILED(hr) ) + return hr; + + const Image *img = image.GetImage( 0, 0, 0 ); + if ( !img ) + { + image.Release(); + return E_POINTER; + } + + uint8_t* pDest = img->pixels; + if ( !pDest ) + { + image.Release(); + return E_POINTER; + } + + const uint8_t *pSrc = srcImage.pixels; + for( size_t h = 0; h < srcImage.height; ++h ) + { + if ( !_LoadScanline( reinterpret_cast(pDest), srcImage.width, pSrc, srcImage.rowPitch, srcImage.format ) ) + { + image.Release(); + return E_FAIL; + } + + pSrc += srcImage.rowPitch; + pDest += img->rowPitch; + } + + return S_OK; +} + +HRESULT _ConvertFromR32G32B32A32( _In_ 
const Image& srcImage, _In_ const Image& destImage ) +{ + assert( srcImage.format == DXGI_FORMAT_R32G32B32A32_FLOAT ); + + if ( !srcImage.pixels || !destImage.pixels ) + return E_POINTER; + + if ( srcImage.width != destImage.width || srcImage.height != destImage.height ) + return E_FAIL; + + const uint8_t *pSrc = srcImage.pixels; + uint8_t* pDest = destImage.pixels; + + for( size_t h = 0; h < srcImage.height; ++h ) + { + if ( !_StoreScanline( pDest, destImage.rowPitch, destImage.format, reinterpret_cast(pSrc), srcImage.width ) ) + return E_FAIL; + + pSrc += srcImage.rowPitch; + pDest += destImage.rowPitch; + } + + return S_OK; +} + +HRESULT _ConvertFromR32G32B32A32( const Image& srcImage, DXGI_FORMAT format, ScratchImage& image ) +{ + if ( !srcImage.pixels ) + return E_POINTER; + + HRESULT hr = image.Initialize2D( format, srcImage.width, srcImage.height, 1, 1 ); + if ( FAILED(hr) ) + return hr; + + const Image *img = image.GetImage( 0, 0, 0 ); + if ( !img ) + { + image.Release(); + return E_POINTER; + } + + hr = _ConvertFromR32G32B32A32( srcImage, *img ); + if ( FAILED(hr) ) + { + image.Release(); + return hr; + } + + return S_OK; +} + +HRESULT _ConvertFromR32G32B32A32( const Image* srcImages, size_t nimages, const TexMetadata& metadata, DXGI_FORMAT format, ScratchImage& result ) +{ + if ( !srcImages ) + return E_POINTER; + + result.Release(); + + assert( metadata.format == DXGI_FORMAT_R32G32B32A32_FLOAT ); + + TexMetadata mdata2 = metadata; + mdata2.format = format; + HRESULT hr = result.Initialize( mdata2 ); + if ( FAILED(hr) ) + return hr; + + if ( nimages != result.GetImageCount() ) + { + result.Release(); + return E_FAIL; + } + + const Image* dest = result.GetImages(); + if ( !dest ) + { + result.Release(); + return E_POINTER; + } + + for( size_t index=0; index < nimages; ++index ) + { + const Image& src = srcImages[ index ]; + const Image& dst = dest[ index ]; + + assert( src.format == DXGI_FORMAT_R32G32B32A32_FLOAT ); + assert( dst.format == format ); + + if ( src.width != dst.width || src.height != dst.height ) + { + result.Release(); + return E_FAIL; + } + + const uint8_t* pSrc = src.pixels; + uint8_t* pDest = dst.pixels; + if ( !pSrc || !pDest ) + { + result.Release(); + return E_POINTER; + } + + for( size_t h=0; h < src.height; ++h ) + { + if ( !_StoreScanline( pDest, dst.rowPitch, format, reinterpret_cast(pSrc), src.width ) ) + { + result.Release(); + return E_FAIL; + } + + pSrc += src.rowPitch; + pDest += dst.rowPitch; + } + } + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// RGB -> sRGB +//------------------------------------------------------------------------------------- +static const uint32_t g_fEncodeGamma22[] = +{ + 0x00000000, 0x3bd56bd3, 0x3c486344, 0x3c90da15, 0x3cbc2677, 0x3ce67704, 0x3d080183, 0x3d1c7728, + 0x3d30a8fb, 0x3d44a03c, 0x3d586400, 0x3d6bf9e7, 0x3d7f6679, 0x3d8956bd, 0x3d92e906, 0x3d9c6b70, + 0x3da5df22, 0x3daf451b, 0x3db89e3e, 0x3dc1eb50, 0x3dcb2d04, 0x3dd463f7, 0x3ddd90b9, 0x3de6b3ca, + 0x3defcda0, 0x3df8dea6, 0x3e00f3a0, 0x3e0573e3, 0x3e09f046, 0x3e0e68f0, 0x3e12de06, 0x3e174fa6, + 0x3e1bbdf2, 0x3e202906, 0x3e2490fd, 0x3e28f5f1, 0x3e2d57fb, 0x3e31b72f, 0x3e3613a4, 0x3e3a6d6e, + 0x3e3ec4a0, 0x3e43194d, 0x3e476b84, 0x3e4bbb57, 0x3e5008d7, 0x3e54540f, 0x3e589d0f, 0x3e5ce3e5, + 0x3e61289d, 0x3e656b44, 0x3e69abe5, 0x3e6dea8d, 0x3e722745, 0x3e766217, 0x3e7a9b0e, 0x3e7ed235, + 0x3e8183c9, 0x3e839d98, 0x3e85b68c, 0x3e87cea8, 0x3e89e5f2, 0x3e8bfc6b, 0x3e8e1219, 0x3e9026ff, + 0x3e923b20, 
0x3e944e7f, 0x3e966120, 0x3e987307, 0x3e9a8436, 0x3e9c94af, 0x3e9ea476, 0x3ea0b38e, + 0x3ea2c1fb, 0x3ea4cfbb, 0x3ea6dcd5, 0x3ea8e94a, 0x3eaaf51c, 0x3ead004e, 0x3eaf0ae2, 0x3eb114d9, + 0x3eb31e37, 0x3eb526fe, 0x3eb72f2f, 0x3eb936cd, 0x3ebb3dd8, 0x3ebd4454, 0x3ebf4a43, 0x3ec14fa5, + 0x3ec3547e, 0x3ec558cd, 0x3ec75c95, 0x3ec95fd8, 0x3ecb6297, 0x3ecd64d4, 0x3ecf6690, 0x3ed167ce, + 0x3ed3688e, 0x3ed568d1, 0x3ed76899, 0x3ed967e9, 0x3edb66bf, 0x3edd651f, 0x3edf630a, 0x3ee16080, + 0x3ee35d84, 0x3ee55a16, 0x3ee75636, 0x3ee951e8, 0x3eeb4d2a, 0x3eed4800, 0x3eef4269, 0x3ef13c68, + 0x3ef335fc, 0x3ef52f26, 0x3ef727ea, 0x3ef92046, 0x3efb183c, 0x3efd0fcd, 0x3eff06fa, 0x3f007ee2, + 0x3f017a16, 0x3f027519, 0x3f036fec, 0x3f046a8f, 0x3f056502, 0x3f065f47, 0x3f07595d, 0x3f085344, + 0x3f094cfe, 0x3f0a468b, 0x3f0b3feb, 0x3f0c391e, 0x3f0d3224, 0x3f0e2aff, 0x3f0f23af, 0x3f101c32, + 0x3f11148c, 0x3f120cba, 0x3f1304bf, 0x3f13fc9a, 0x3f14f44b, 0x3f15ebd3, 0x3f16e333, 0x3f17da6b, + 0x3f18d17a, 0x3f19c860, 0x3f1abf1f, 0x3f1bb5b7, 0x3f1cac28, 0x3f1da272, 0x3f1e9895, 0x3f1f8e92, + 0x3f20846a, 0x3f217a1c, 0x3f226fa8, 0x3f23650f, 0x3f245a52, 0x3f254f70, 0x3f264469, 0x3f27393f, + 0x3f282df1, 0x3f29227f, 0x3f2a16ea, 0x3f2b0b31, 0x3f2bff56, 0x3f2cf358, 0x3f2de738, 0x3f2edaf6, + 0x3f2fce91, 0x3f30c20b, 0x3f31b564, 0x3f32a89b, 0x3f339bb1, 0x3f348ea6, 0x3f35817a, 0x3f36742f, + 0x3f3766c3, 0x3f385936, 0x3f394b8a, 0x3f3a3dbe, 0x3f3b2fd3, 0x3f3c21c8, 0x3f3d139e, 0x3f3e0556, + 0x3f3ef6ee, 0x3f3fe868, 0x3f40d9c4, 0x3f41cb01, 0x3f42bc20, 0x3f43ad22, 0x3f449e06, 0x3f458ecc, + 0x3f467f75, 0x3f477001, 0x3f486071, 0x3f4950c2, 0x3f4a40f8, 0x3f4b3111, 0x3f4c210d, 0x3f4d10ed, + 0x3f4e00b2, 0x3f4ef05a, 0x3f4fdfe7, 0x3f50cf58, 0x3f51beae, 0x3f52ade8, 0x3f539d07, 0x3f548c0c, + 0x3f557af5, 0x3f5669c4, 0x3f575878, 0x3f584711, 0x3f593590, 0x3f5a23f6, 0x3f5b1241, 0x3f5c0072, + 0x3f5cee89, 0x3f5ddc87, 0x3f5eca6b, 0x3f5fb835, 0x3f60a5e7, 0x3f619380, 0x3f6280ff, 0x3f636e65, + 0x3f645bb3, 0x3f6548e8, 0x3f663604, 0x3f672309, 0x3f680ff4, 0x3f68fcc8, 0x3f69e983, 0x3f6ad627, + 0x3f6bc2b3, 0x3f6caf27, 0x3f6d9b83, 0x3f6e87c8, 0x3f6f73f5, 0x3f70600c, 0x3f714c0b, 0x3f7237f4, + 0x3f7323c4, 0x3f740f7f, 0x3f74fb22, 0x3f75e6af, 0x3f76d225, 0x3f77bd85, 0x3f78a8ce, 0x3f799401, + 0x3f7a7f1e, 0x3f7b6a25, 0x3f7c5516, 0x3f7d3ff1, 0x3f7e2ab6, 0x3f7f1566, 0x3f800000, 0x3f800000 +}; + +#pragma prefast(suppress : 25000, "FXMVECTOR is 16 bytes") +static inline XMVECTOR _TableEncodeGamma22( FXMVECTOR v ) +{ + float f[4]; + XMStoreFloat4( (XMFLOAT4*)f, v ); + + for( size_t i=0; i < 4; ++i ) + { + float f2 = sqrtf(f[i]) * 254.0f; + + uint32_t i2 = static_cast(f2); + i2 = std::min( i2, _countof( g_fEncodeGamma22 )-2 ); + + float fS = f2 - (float) i2; + float fA = ((float *) g_fEncodeGamma22)[i2]; + float fB = ((float *) g_fEncodeGamma22)[i2 + 1]; + + f[i] = fA + fS * (fB - fA); + } + + return XMLoadFloat4( (XMFLOAT4*)f ); +} + + +//------------------------------------------------------------------------------------- +// sRGB -> RGB +//------------------------------------------------------------------------------------- +static const uint32_t g_fDecodeGamma22[] = +{ + 0x00000000, 0x3b144eb0, 0x3b9ef3b0, 0x3bf84b42, 0x3c2a5c46, 0x3c59c180, 0x3c850eb5, 0x3c9da52a, + 0x3cb6967a, 0x3ccfd852, 0x3ce9628b, 0x3d01974b, 0x3d0e9b82, 0x3d1bbba3, 0x3d28f5bc, 0x3d364822, + 0x3d43b159, 0x3d51301d, 0x3d5ec344, 0x3d6c69c9, 0x3d7a22c4, 0x3d83f6ad, 0x3d8ae465, 0x3d91da35, + 0x3d98d7c7, 0x3d9fdcd2, 0x3da6e914, 0x3dadfc47, 0x3db51635, 0x3dbc36a3, 0x3dc35d62, 0x3dca8a3a, + 0x3dd1bd02, 0x3dd8f591, 
0x3de033bb, 0x3de7775d, 0x3deec050, 0x3df60e74, 0x3dfd61a6, 0x3e025ce5, + 0x3e060b61, 0x3e09bc38, 0x3e0d6f5f, 0x3e1124c8, 0x3e14dc68, 0x3e189630, 0x3e1c521a, 0x3e201016, + 0x3e23d01d, 0x3e279225, 0x3e2b5624, 0x3e2f1c10, 0x3e32e3e4, 0x3e36ad94, 0x3e3a7918, 0x3e3e4668, + 0x3e42157f, 0x3e45e654, 0x3e49b8e0, 0x3e4d8d1d, 0x3e516304, 0x3e553a8d, 0x3e5913b4, 0x3e5cee70, + 0x3e60cabf, 0x3e64a89b, 0x3e6887fb, 0x3e6c68db, 0x3e704b3a, 0x3e742f0e, 0x3e781454, 0x3e7bfb04, + 0x3e7fe321, 0x3e81e650, 0x3e83dbc0, 0x3e85d1dc, 0x3e87c8a3, 0x3e89c015, 0x3e8bb830, 0x3e8db0ee, + 0x3e8faa51, 0x3e91a454, 0x3e939ef9, 0x3e959a3b, 0x3e97961b, 0x3e999295, 0x3e9b8fa7, 0x3e9d8d52, + 0x3e9f8b93, 0x3ea18a6a, 0x3ea389d2, 0x3ea589cb, 0x3ea78a56, 0x3ea98b6e, 0x3eab8d15, 0x3ead8f47, + 0x3eaf9204, 0x3eb1954a, 0x3eb39917, 0x3eb59d6c, 0x3eb7a246, 0x3eb9a7a5, 0x3ebbad88, 0x3ebdb3ec, + 0x3ebfbad3, 0x3ec1c237, 0x3ec3ca1a, 0x3ec5d27c, 0x3ec7db58, 0x3ec9e4b4, 0x3ecbee85, 0x3ecdf8d3, + 0x3ed0039a, 0x3ed20ed8, 0x3ed41a8a, 0x3ed626b5, 0x3ed83351, 0x3eda4065, 0x3edc4de9, 0x3ede5be0, + 0x3ee06a4a, 0x3ee27923, 0x3ee4886a, 0x3ee69821, 0x3ee8a845, 0x3eeab8d8, 0x3eecc9d6, 0x3eeedb3f, + 0x3ef0ed13, 0x3ef2ff53, 0x3ef511fb, 0x3ef7250a, 0x3ef93883, 0x3efb4c61, 0x3efd60a7, 0x3eff7553, + 0x3f00c531, 0x3f01cfeb, 0x3f02dad9, 0x3f03e5f5, 0x3f04f145, 0x3f05fcc4, 0x3f070875, 0x3f081456, + 0x3f092067, 0x3f0a2ca8, 0x3f0b3917, 0x3f0c45b7, 0x3f0d5284, 0x3f0e5f7f, 0x3f0f6caa, 0x3f107a03, + 0x3f118789, 0x3f12953b, 0x3f13a31d, 0x3f14b12b, 0x3f15bf64, 0x3f16cdca, 0x3f17dc5e, 0x3f18eb1b, + 0x3f19fa05, 0x3f1b091b, 0x3f1c185c, 0x3f1d27c7, 0x3f1e375c, 0x3f1f471d, 0x3f205707, 0x3f21671b, + 0x3f227759, 0x3f2387c2, 0x3f249852, 0x3f25a90c, 0x3f26b9ef, 0x3f27cafb, 0x3f28dc30, 0x3f29ed8b, + 0x3f2aff11, 0x3f2c10bd, 0x3f2d2290, 0x3f2e348b, 0x3f2f46ad, 0x3f3058f7, 0x3f316b66, 0x3f327dfd, + 0x3f3390ba, 0x3f34a39d, 0x3f35b6a7, 0x3f36c9d6, 0x3f37dd2b, 0x3f38f0a5, 0x3f3a0443, 0x3f3b1808, + 0x3f3c2bf2, 0x3f3d4000, 0x3f3e5434, 0x3f3f688c, 0x3f407d07, 0x3f4191a8, 0x3f42a66c, 0x3f43bb54, + 0x3f44d05f, 0x3f45e58e, 0x3f46fadf, 0x3f481054, 0x3f4925ed, 0x3f4a3ba8, 0x3f4b5186, 0x3f4c6789, + 0x3f4d7daa, 0x3f4e93f0, 0x3f4faa57, 0x3f50c0e0, 0x3f51d78b, 0x3f52ee58, 0x3f540545, 0x3f551c55, + 0x3f563386, 0x3f574ad7, 0x3f58624b, 0x3f5979de, 0x3f5a9191, 0x3f5ba965, 0x3f5cc15b, 0x3f5dd971, + 0x3f5ef1a6, 0x3f6009fc, 0x3f612272, 0x3f623b08, 0x3f6353bc, 0x3f646c90, 0x3f658586, 0x3f669e98, + 0x3f67b7cb, 0x3f68d11b, 0x3f69ea8d, 0x3f6b041b, 0x3f6c1dc9, 0x3f6d3795, 0x3f6e5180, 0x3f6f6b8b, + 0x3f7085b2, 0x3f719ff7, 0x3f72ba5b, 0x3f73d4dc, 0x3f74ef7c, 0x3f760a38, 0x3f772512, 0x3f78400b, + 0x3f795b20, 0x3f7a7651, 0x3f7b91a2, 0x3f7cad0e, 0x3f7dc896, 0x3f7ee43c, 0x3f800000, 0x3f800000 +}; + + +#pragma prefast(suppress : 25000, "FXMVECTOR is 16 bytes") +static inline XMVECTOR _TableDecodeGamma22( FXMVECTOR v ) +{ + float f[4]; + XMStoreFloat4( (XMFLOAT4*)f, v ); + + for( size_t i=0; i < 4; ++i ) + { + float f2 = f[i] * f[i] * 254.0f; + uint32_t i2 = static_cast(f2); + i2 = std::min( i2, _countof(g_fDecodeGamma22)-2 ); + + float fS = f2 - (float) i2; + float fA = ((float *) g_fDecodeGamma22)[i2]; + float fB = ((float *) g_fDecodeGamma22)[i2 + 1]; + + f[i] = fA + fS * (fB - fA); + } + + return XMLoadFloat4( (XMFLOAT4*)f ); +} + + +//------------------------------------------------------------------------------------- +// Convert scanline based on source/target formats +//------------------------------------------------------------------------------------- +struct ConvertData +{ + DXGI_FORMAT format; + size_t 
datasize; + DWORD flags; +}; + +static const ConvertData g_ConvertTable[] = { + { DXGI_FORMAT_R32G32B32A32_FLOAT, 32, CONVF_FLOAT | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_R32G32B32A32_UINT, 32, CONVF_UINT | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_R32G32B32A32_SINT, 32, CONVF_SINT | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_R32G32B32_FLOAT, 32, CONVF_FLOAT | CONVF_R | CONVF_G | CONVF_B }, + { DXGI_FORMAT_R32G32B32_UINT, 32, CONVF_UINT | CONVF_R | CONVF_G | CONVF_B }, + { DXGI_FORMAT_R32G32B32_SINT, 32, CONVF_SINT | CONVF_R | CONVF_G | CONVF_B }, + { DXGI_FORMAT_R16G16B16A16_FLOAT, 16, CONVF_FLOAT | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_R16G16B16A16_UNORM, 16, CONVF_UNORM | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_R16G16B16A16_UINT, 16, CONVF_UINT | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_R16G16B16A16_SNORM, 16, CONVF_SNORM | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_R16G16B16A16_SINT, 16, CONVF_SINT | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_R32G32_FLOAT, 32, CONVF_FLOAT | CONVF_R | CONVF_G }, + { DXGI_FORMAT_R32G32_UINT, 32, CONVF_UINT | CONVF_R | CONVF_G }, + { DXGI_FORMAT_R32G32_SINT, 32, CONVF_SINT | CONVF_R | CONVF_G }, + { DXGI_FORMAT_D32_FLOAT_S8X24_UINT, 32, CONVF_FLOAT | CONVF_DEPTH | CONVF_STENCIL }, + { DXGI_FORMAT_R10G10B10A2_UNORM, 10, CONVF_UNORM | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_R10G10B10A2_UINT, 10, CONVF_UINT | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_R11G11B10_FLOAT, 10, CONVF_FLOAT | CONVF_R | CONVF_G | CONVF_B }, + { DXGI_FORMAT_R8G8B8A8_UNORM, 8, CONVF_UNORM | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_R8G8B8A8_UNORM_SRGB, 8, CONVF_UNORM | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_R8G8B8A8_UINT, 8, CONVF_UINT | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_R8G8B8A8_SNORM, 8, CONVF_SNORM | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_R8G8B8A8_SINT, 8, CONVF_SINT | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_R16G16_FLOAT, 16, CONVF_FLOAT | CONVF_R | CONVF_G }, + { DXGI_FORMAT_R16G16_UNORM, 16, CONVF_UNORM | CONVF_R | CONVF_G }, + { DXGI_FORMAT_R16G16_UINT, 16, CONVF_UINT | CONVF_R | CONVF_G }, + { DXGI_FORMAT_R16G16_SNORM, 16, CONVF_SNORM | CONVF_R | CONVF_G }, + { DXGI_FORMAT_R16G16_SINT, 16, CONVF_SINT | CONVF_R | CONVF_G }, + { DXGI_FORMAT_D32_FLOAT, 32, CONVF_FLOAT | CONVF_DEPTH }, + { DXGI_FORMAT_R32_FLOAT, 32, CONVF_FLOAT | CONVF_R }, + { DXGI_FORMAT_R32_UINT, 32, CONVF_UINT | CONVF_R }, + { DXGI_FORMAT_R32_SINT, 32, CONVF_SINT | CONVF_R }, + { DXGI_FORMAT_D24_UNORM_S8_UINT, 32, CONVF_UNORM | CONVF_DEPTH | CONVF_STENCIL }, + { DXGI_FORMAT_R8G8_UNORM, 8, CONVF_UNORM | CONVF_R | CONVF_G }, + { DXGI_FORMAT_R8G8_UINT, 8, CONVF_UINT | CONVF_R | CONVF_G }, + { DXGI_FORMAT_R8G8_SNORM, 8, CONVF_SNORM | CONVF_R | CONVF_G }, + { DXGI_FORMAT_R8G8_SINT, 8, CONVF_SINT | CONVF_R | CONVF_G }, + { DXGI_FORMAT_R16_FLOAT, 16, CONVF_FLOAT | CONVF_R }, + { DXGI_FORMAT_D16_UNORM, 16, CONVF_UNORM | CONVF_DEPTH }, + { DXGI_FORMAT_R16_UNORM, 16, CONVF_UNORM | CONVF_R }, + { DXGI_FORMAT_R16_UINT, 16, CONVF_UINT | CONVF_R }, + { DXGI_FORMAT_R16_SNORM, 16, CONVF_SNORM | CONVF_R }, + { DXGI_FORMAT_R16_SINT, 16, CONVF_SINT | CONVF_R }, + { DXGI_FORMAT_R8_UNORM, 8, CONVF_UNORM | CONVF_R }, + { DXGI_FORMAT_R8_UINT, 8, CONVF_UINT | CONVF_R }, + { DXGI_FORMAT_R8_SNORM, 8, CONVF_SNORM | CONVF_R }, + { DXGI_FORMAT_R8_SINT, 8, CONVF_SINT | CONVF_R }, + { 
DXGI_FORMAT_A8_UNORM, 8, CONVF_UNORM | CONVF_A }, + { DXGI_FORMAT_R1_UNORM, 1, CONVF_UNORM | CONVF_R }, + { DXGI_FORMAT_R9G9B9E5_SHAREDEXP, 9, CONVF_SHAREDEXP | CONVF_R | CONVF_G | CONVF_B }, + { DXGI_FORMAT_R8G8_B8G8_UNORM, 8, CONVF_UNORM | CONVF_PACKED | CONVF_R | CONVF_G | CONVF_B }, + { DXGI_FORMAT_G8R8_G8B8_UNORM, 8, CONVF_UNORM | CONVF_PACKED | CONVF_R | CONVF_G | CONVF_B }, + { DXGI_FORMAT_BC1_UNORM, 8, CONVF_UNORM | CONVF_BC | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_BC1_UNORM_SRGB, 8, CONVF_UNORM | CONVF_BC | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_BC2_UNORM, 8, CONVF_UNORM | CONVF_BC | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_BC2_UNORM_SRGB, 8, CONVF_UNORM | CONVF_BC | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_BC3_UNORM, 8, CONVF_UNORM | CONVF_BC | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_BC3_UNORM_SRGB, 8, CONVF_UNORM | CONVF_BC | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_BC4_UNORM, 8, CONVF_UNORM | CONVF_BC | CONVF_R }, + { DXGI_FORMAT_BC4_SNORM, 8, CONVF_SNORM | CONVF_BC | CONVF_R }, + { DXGI_FORMAT_BC5_UNORM, 8, CONVF_UNORM | CONVF_BC | CONVF_R | CONVF_G }, + { DXGI_FORMAT_BC5_SNORM, 8, CONVF_SNORM | CONVF_BC | CONVF_R | CONVF_G }, + { DXGI_FORMAT_B5G6R5_UNORM, 5, CONVF_UNORM | CONVF_R | CONVF_G | CONVF_B }, + { DXGI_FORMAT_B5G5R5A1_UNORM, 5, CONVF_UNORM | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_B8G8R8A8_UNORM, 8, CONVF_UNORM | CONVF_BGR | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_B8G8R8X8_UNORM, 8, CONVF_UNORM | CONVF_BGR | CONVF_R | CONVF_G | CONVF_B }, + { DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM, 10, CONVF_UNORM | CONVF_X2 | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_B8G8R8A8_UNORM_SRGB, 8, CONVF_UNORM | CONVF_BGR | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_B8G8R8X8_UNORM_SRGB, 8, CONVF_UNORM | CONVF_BGR | CONVF_R | CONVF_G | CONVF_B }, + { DXGI_FORMAT_BC6H_UF16, 16, CONVF_FLOAT | CONVF_BC | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_BC6H_SF16, 16, CONVF_FLOAT | CONVF_BC | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_BC7_UNORM, 8, CONVF_UNORM | CONVF_BC | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, + { DXGI_FORMAT_BC7_UNORM_SRGB, 8, CONVF_UNORM | CONVF_BC | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, +#ifdef DXGI_1_2_FORMATS + { DXGI_FORMAT_B4G4R4A4_UNORM, 4, CONVF_UNORM | CONVF_BGR | CONVF_R | CONVF_G | CONVF_B | CONVF_A }, +#endif +}; + +#pragma prefast( suppress : 25004, "Signature must match bsearch_s" ); +static int __cdecl _ConvertCompare( void *context, const void* ptr1, const void *ptr2 ) +{ + UNREFERENCED_PARAMETER(context); + const ConvertData *p1 = reinterpret_cast(ptr1); + const ConvertData *p2 = reinterpret_cast(ptr2); + if ( p1->format == p2->format ) return 0; + else return (p1->format < p2->format ) ? -1 : 1; +} + +DWORD _GetConvertFlags( DXGI_FORMAT format ) +{ +#ifdef _DEBUG + // Ensure conversion table is in ascending order + assert( _countof(g_ConvertTable) > 0 ); + DXGI_FORMAT lastvalue = g_ConvertTable[0].format; + for( size_t index=1; index < _countof(g_ConvertTable); ++index ) + { + assert( g_ConvertTable[index].format > lastvalue ); + lastvalue = g_ConvertTable[index].format; + } +#endif + + ConvertData key = { format, 0 }; + const ConvertData* in = (const ConvertData*) bsearch_s( &key, g_ConvertTable, _countof(g_ConvertTable), sizeof(ConvertData), + _ConvertCompare, 0 ); + return (in) ? 
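+    // formats not present in the table report no conversion flags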
in->flags : 0; +} + +void _ConvertScanline( XMVECTOR* pBuffer, size_t count, DXGI_FORMAT outFormat, DXGI_FORMAT inFormat, DWORD flags ) +{ + assert( pBuffer && count > 0 && (((uintptr_t)pBuffer & 0xF) == 0) ); + assert( IsValid(outFormat) && !IsVideo(outFormat) && !IsTypeless(outFormat) ); + assert( IsValid(inFormat) && !IsVideo(inFormat) && !IsTypeless(inFormat) ); + + if ( !pBuffer ) + return; + +#ifdef _DEBUG + // Ensure conversion table is in ascending order + assert( _countof(g_ConvertTable) > 0 ); + DXGI_FORMAT lastvalue = g_ConvertTable[0].format; + for( size_t index=1; index < _countof(g_ConvertTable); ++index ) + { + assert( g_ConvertTable[index].format > lastvalue ); + lastvalue = g_ConvertTable[index].format; + } +#endif + + // Determine conversion details about source and dest formats + ConvertData key = { inFormat, 0 }; + const ConvertData* in = (const ConvertData*) bsearch_s( &key, g_ConvertTable, _countof(g_ConvertTable), sizeof(ConvertData), + _ConvertCompare, 0 ); + key.format = outFormat; + const ConvertData* out = (const ConvertData*) bsearch_s( &key, g_ConvertTable, _countof(g_ConvertTable), sizeof(ConvertData), + _ConvertCompare, 0 ); + if ( !in || !out ) + { + assert(false); + return; + } + + assert( _GetConvertFlags( inFormat ) == in->flags ); + assert( _GetConvertFlags( outFormat ) == out->flags ); + + // Handle SRGB filtering modes + if ( IsSRGB( inFormat ) ) + flags |= TEX_FILTER_SRGB_IN; + + if ( IsSRGB( outFormat ) ) + flags |= TEX_FILTER_SRGB_OUT; + + if ( in->flags & CONVF_SNORM ) + flags &= ~TEX_FILTER_SRGB_IN; + + if ( out->flags & CONVF_SNORM ) + flags &= ~TEX_FILTER_SRGB_OUT; + + if ( (flags & (TEX_FILTER_SRGB_IN|TEX_FILTER_SRGB_OUT)) == (TEX_FILTER_SRGB_IN|TEX_FILTER_SRGB_OUT) ) + { + flags &= ~(TEX_FILTER_SRGB_IN|TEX_FILTER_SRGB_OUT); + } + + // sRGB input processing (sRGB -> RGB) + if ( flags & TEX_FILTER_SRGB_IN ) + { + if ( (in->flags & CONVF_FLOAT) || (in->flags & CONVF_UNORM) ) + { + XMVECTOR* ptr = pBuffer; + for( size_t i=0; i < count; ++i ) + { + // rgb = rgb^(2.2); a=a + XMVECTOR v = *ptr; + XMVECTOR v1 = _TableDecodeGamma22( v ); + *ptr++ = XMVectorSelect( v, v1, g_XMSelect1110 ); + } + } + } + + // Handle conversion special cases + DWORD diffFlags = in->flags ^ out->flags; + if ( diffFlags != 0) + { + if ( out->flags & CONVF_UNORM ) + { + if ( in->flags & CONVF_SNORM ) + { + // SNORM -> UNORM + XMVECTOR* ptr = pBuffer; + for( size_t i=0; i < count; ++i ) + { + XMVECTOR v = *ptr; + *ptr++ = XMVectorMultiplyAdd( v, g_XMOneHalf, g_XMOneHalf ); + } + } + else if ( in->flags & CONVF_FLOAT ) + { + // FLOAT -> UNORM + XMVECTOR* ptr = pBuffer; + for( size_t i=0; i < count; ++i ) + { + XMVECTOR v = *ptr; + *ptr++ = XMVectorSaturate( v ); + } + } + } + else if ( out->flags & CONVF_SNORM ) + { + if ( in->flags & CONVF_UNORM ) + { + // UNORM -> SNORM + static XMVECTORF32 two = { 2.0f, 2.0f, 2.0f, 2.0f }; + XMVECTOR* ptr = pBuffer; + for( size_t i=0; i < count; ++i ) + { + XMVECTOR v = *ptr; + *ptr++ = XMVectorMultiplyAdd( v, two, g_XMNegativeOne ); + } + } + else if ( in->flags & CONVF_FLOAT ) + { + // FLOAT -> SNORM + XMVECTOR* ptr = pBuffer; + for( size_t i=0; i < count; ++i ) + { + XMVECTOR v = *ptr; + *ptr++ = XMVectorClamp( v, g_XMNegativeOne, g_XMOne ); + } + } + } + + // !CONVF_A -> CONVF_A is handled because LoadScanline ensures alpha defaults to 1.0 for no-alpha formats + + // CONVF_PACKED cases are handled because LoadScanline/StoreScanline handles packing/unpacking + + if ( ((out->flags & CONVF_RGBA_MASK) == CONVF_A) && !(in->flags & CONVF_A) ) 
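+            // e.g. converting a format with no alpha channel to A8_UNORM: the output alpha is taken from the red channel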
+ { + // !CONVF_A -> A format + XMVECTOR* ptr = pBuffer; + for( size_t i=0; i < count; ++i ) + { + XMVECTOR v = *ptr; + *ptr++ = XMVectorSplatX( v ); + } + } + else if ( ((in->flags & CONVF_RGBA_MASK) == CONVF_A) && !(out->flags & CONVF_A) ) + { + // A format -> !CONVF_A + XMVECTOR* ptr = pBuffer; + for( size_t i=0; i < count; ++i ) + { + XMVECTOR v = *ptr; + *ptr++ = XMVectorSplatW( v ); + } + } + else if ( ((in->flags & CONVF_RGB_MASK) == CONVF_R) && ((out->flags & CONVF_RGB_MASK) == (CONVF_R|CONVF_G|CONVF_B)) ) + { + // R format -> RGB format + XMVECTOR* ptr = pBuffer; + for( size_t i=0; i < count; ++i ) + { + XMVECTOR v = *ptr; + XMVECTOR v1 = XMVectorSplatX( v ); + *ptr++ = XMVectorSelect( v, v1, g_XMSelect1110 ); + } + } + } + + // sRGB output processing (RGB -> sRGB) + if ( flags & TEX_FILTER_SRGB_OUT ) + { + if ( (out->flags & CONVF_FLOAT) || (out->flags & CONVF_UNORM) ) + { + XMVECTOR* ptr = pBuffer; + for( size_t i=0; i < count; ++i ) + { + // rgb = rgb^(1/2.2); a=a + XMVECTOR v = *ptr; + XMVECTOR v1 = _TableEncodeGamma22( v ); + *ptr++ = XMVectorSelect( v, v1, g_XMSelect1110 ); + } + } + } +} + + +//------------------------------------------------------------------------------------- +// Convert the source image using WIC +//------------------------------------------------------------------------------------- +static HRESULT _ConvertUsingWIC( _In_ const Image& srcImage, _In_ const WICPixelFormatGUID& pfGUID, + _In_ const WICPixelFormatGUID& targetGUID, + _In_ DWORD filter, _In_ float threshold, _In_ const Image& destImage ) +{ + assert( srcImage.width == destImage.width ); + assert( srcImage.height == destImage.height ); + + IWICImagingFactory* pWIC = _GetWIC(); + if ( !pWIC ) + return E_NOINTERFACE; + + ScopedObject FC; + HRESULT hr = pWIC->CreateFormatConverter( &FC ); + if ( FAILED(hr) ) + return hr; + + // Need to implement usage of TEX_FILTER_SRGB_IN/TEX_FILTER_SRGB_OUT + + BOOL canConvert = FALSE; + hr = FC->CanConvert( pfGUID, targetGUID, &canConvert ); + if ( FAILED(hr) || !canConvert ) + { + // This case is not an issue for the subset of WIC formats that map directly to DXGI + return E_UNEXPECTED; + } + + ScopedObject source; + hr = pWIC->CreateBitmapFromMemory( static_cast( srcImage.width ), static_cast( srcImage.height ), pfGUID, + static_cast( srcImage.rowPitch ), static_cast( srcImage.slicePitch ), + srcImage.pixels, &source ); + if ( FAILED(hr) ) + return hr; + + hr = FC->Initialize( source.Get(), targetGUID, _GetWICDither( filter ), 0, threshold, WICBitmapPaletteTypeCustom ); + if ( FAILED(hr) ) + return hr; + + hr = FC->CopyPixels( 0, static_cast( destImage.rowPitch ), static_cast( destImage.slicePitch ), destImage.pixels ); + if ( FAILED(hr) ) + return hr; + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Convert the source using WIC and then convert to DXGI format from there +//------------------------------------------------------------------------------------- +static HRESULT _ConvertFromWIC( _In_ const Image& srcImage, _In_ const WICPixelFormatGUID& pfGUID, + _In_ DWORD filter, _In_ float threshold, _In_ const Image& destImage ) +{ + assert( srcImage.width == destImage.width ); + assert( srcImage.height == destImage.height ); + + IWICImagingFactory* pWIC = _GetWIC(); + if ( !pWIC ) + return E_NOINTERFACE; + + ScopedObject FC; + HRESULT hr = pWIC->CreateFormatConverter( &FC ); + if ( FAILED(hr) ) + return hr; + + BOOL canConvert = FALSE; + hr = FC->CanConvert( pfGUID, 
GUID_WICPixelFormat128bppRGBAFloat, &canConvert ); + if ( FAILED(hr) || !canConvert ) + { + // This case is not an issue for the subset of WIC formats that map directly to DXGI + return E_UNEXPECTED; + } + + ScratchImage temp; + hr = temp.Initialize2D( DXGI_FORMAT_R32G32B32A32_FLOAT, srcImage.width, srcImage.height, 1, 1 ); + if ( FAILED(hr) ) + return hr; + + const Image *timg = temp.GetImage( 0, 0, 0 ); + if ( !timg ) + return E_POINTER; + + ScopedObject source; + hr = pWIC->CreateBitmapFromMemory( static_cast( srcImage.width ), static_cast( srcImage.height ), pfGUID, + static_cast( srcImage.rowPitch ), static_cast( srcImage.slicePitch ), + srcImage.pixels, &source ); + if ( FAILED(hr) ) + return hr; + + hr = FC->Initialize( source.Get(), GUID_WICPixelFormat128bppRGBAFloat, _GetWICDither( filter ), 0, threshold, WICBitmapPaletteTypeCustom ); + if ( FAILED(hr) ) + return hr; + + hr = FC->CopyPixels( 0, static_cast( timg->rowPitch ), static_cast( timg->slicePitch ), timg->pixels ); + if ( FAILED(hr) ) + return hr; + + // Perform conversion on temp image which is now in R32G32B32A32_FLOAT format to final image + uint8_t *pSrc = timg->pixels; + uint8_t *pDest = destImage.pixels; + if ( !pSrc || !pDest ) + return E_POINTER; + + for( size_t h = 0; h < srcImage.height; ++h ) + { + _ConvertScanline( reinterpret_cast(pSrc), srcImage.width, destImage.format, DXGI_FORMAT_R32G32B32A32_FLOAT, filter ); + + if ( !_StoreScanline( pDest, destImage.rowPitch, destImage.format, reinterpret_cast(pSrc), srcImage.width ) ) + return E_FAIL; + + pSrc += timg->rowPitch; + pDest += destImage.rowPitch; + } + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Convert the source from DXGI format then use WIC to convert to final format +//------------------------------------------------------------------------------------- +static HRESULT _ConvertToWIC( _In_ const Image& srcImage, + _In_ const WICPixelFormatGUID& targetGUID, _In_ DWORD filter, _In_ float threshold, _In_ const Image& destImage ) +{ + assert( srcImage.width == destImage.width ); + assert( srcImage.height == destImage.height ); + + IWICImagingFactory* pWIC = _GetWIC(); + if ( !pWIC ) + return E_NOINTERFACE; + + ScopedObject FC; + HRESULT hr = pWIC->CreateFormatConverter( &FC ); + if ( FAILED(hr) ) + return hr; + + BOOL canConvert = FALSE; + hr = FC->CanConvert( GUID_WICPixelFormat128bppRGBAFloat, targetGUID, &canConvert ); + if ( FAILED(hr) || !canConvert ) + { + // This case is not an issue for the subset of WIC formats that map directly to DXGI + return E_UNEXPECTED; + } + + ScratchImage temp; + hr = temp.Initialize2D( DXGI_FORMAT_R32G32B32A32_FLOAT, srcImage.width, srcImage.height, 1, 1 ); + if ( FAILED(hr) ) + return hr; + + const Image *timg = temp.GetImage( 0, 0, 0 ); + if ( !timg ) + return E_POINTER; + + const uint8_t *pSrc = srcImage.pixels; + if ( !pSrc ) + return E_POINTER; + + uint8_t *pDest = timg->pixels; + if ( !pDest ) + return E_POINTER; + + for( size_t h = 0; h < srcImage.height; ++h ) + { + if ( !_LoadScanline( reinterpret_cast(pDest), srcImage.width, pSrc, srcImage.rowPitch, srcImage.format ) ) + return E_FAIL; + + _ConvertScanline( reinterpret_cast(pDest), srcImage.width, DXGI_FORMAT_R32G32B32A32_FLOAT, srcImage.format, filter ); + + pSrc += srcImage.rowPitch; + pDest += timg->rowPitch; + } + + // Perform conversion on temp image which is now in R32G32B32A32_FLOAT format + ScopedObject source; + hr = pWIC->CreateBitmapFromMemory( static_cast( timg->width ), static_cast( 
timg->height ), GUID_WICPixelFormat128bppRGBAFloat, + static_cast( timg->rowPitch ), static_cast( timg->slicePitch ), + timg->pixels, &source ); + if ( FAILED(hr) ) + return hr; + + hr = FC->Initialize( source.Get(), targetGUID, _GetWICDither( filter ), 0, threshold, WICBitmapPaletteTypeCustom ); + if ( FAILED(hr) ) + return hr; + + hr = FC->CopyPixels( 0, static_cast( destImage.rowPitch ), static_cast( destImage.slicePitch ), destImage.pixels ); + if ( FAILED(hr) ) + return hr; + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Convert the source image (not using WIC) +//------------------------------------------------------------------------------------- +static HRESULT _Convert( _In_ const Image& srcImage, _In_ DWORD filter, _In_ const Image& destImage ) +{ + assert( srcImage.width == destImage.width ); + assert( srcImage.height == destImage.height ); + + ScopedAlignedArrayXMVECTOR scanline( reinterpret_cast( _aligned_malloc( (sizeof(XMVECTOR)*srcImage.width), 16 ) ) ); + if ( !scanline ) + return E_OUTOFMEMORY; + + const uint8_t *pSrc = srcImage.pixels; + uint8_t *pDest = destImage.pixels; + if ( !pSrc || !pDest ) + return E_POINTER; + + for( size_t h = 0; h < srcImage.height; ++h ) + { + if ( !_LoadScanline( scanline.get(), srcImage.width, pSrc, srcImage.rowPitch, srcImage.format ) ) + return E_FAIL; + + _ConvertScanline( scanline.get(), srcImage.width, destImage.format, srcImage.format, filter ); + + if ( !_StoreScanline( pDest, destImage.rowPitch, destImage.format, scanline.get(), srcImage.width ) ) + return E_FAIL; + + pSrc += srcImage.rowPitch; + pDest += destImage.rowPitch; + } + + return S_OK; +} + + +//===================================================================================== +// Entry-points +//===================================================================================== + +//------------------------------------------------------------------------------------- +// Convert image +//------------------------------------------------------------------------------------- +HRESULT Convert( const Image& srcImage, DXGI_FORMAT format, DWORD filter, float threshold, ScratchImage& image ) +{ + if ( (srcImage.format == format) || !IsValid( format ) ) + return E_INVALIDARG; + + if ( !srcImage.pixels ) + return E_POINTER; + + if ( IsCompressed(srcImage.format) || IsCompressed(format) + || IsVideo(srcImage.format) || IsVideo(format) + || IsTypeless(srcImage.format) || IsTypeless(format) ) + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + +#ifdef _AMD64_ + if ( (srcImage.width > 0xFFFFFFFF) || (srcImage.height > 0xFFFFFFFF) ) + return E_INVALIDARG; +#endif + + HRESULT hr = image.Initialize2D( format, srcImage.width, srcImage.height, 1, 1 ); + if ( FAILED(hr) ) + return hr; + + const Image *rimage = image.GetImage( 0, 0, 0 ); + if ( !rimage ) + { + image.Release(); + return E_POINTER; + } + + WICPixelFormatGUID pfGUID; + if ( _DXGIToWIC( srcImage.format, pfGUID ) ) + { + WICPixelFormatGUID targetGUID; + if ( _DXGIToWIC( format, targetGUID ) ) + { + // Case 1: Both source and target formats are WIC supported + hr = _ConvertUsingWIC( srcImage, pfGUID, targetGUID, filter, threshold, *rimage ); + } + else + { + // Case 2: Source format is supported by WIC, but not the target format + hr = _ConvertFromWIC( srcImage, pfGUID, filter, threshold, *rimage ); + } + } + else + { + WICPixelFormatGUID targetGUID; + if ( _DXGIToWIC( format, targetGUID ) ) + { + // Case 3: Source format is not supported by WIC, but does 
support the target format + hr = _ConvertToWIC( srcImage, targetGUID, filter, threshold, *rimage ); + } + else + { + // Case 4: Both source and target format are not supported by WIC + hr = _Convert( srcImage, filter, *rimage ); + } + } + + if ( FAILED(hr) ) + { + image.Release(); + return hr; + } + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Convert image (complex) +//------------------------------------------------------------------------------------- +HRESULT Convert( const Image* srcImages, size_t nimages, const TexMetadata& metadata, + DXGI_FORMAT format, DWORD filter, float threshold, ScratchImage& result ) +{ + if ( !srcImages || !nimages || (metadata.format == format) || !IsValid(format) ) + return E_INVALIDARG; + + if ( IsCompressed(metadata.format) || IsCompressed(format) + || IsVideo(metadata.format) || IsVideo(format) + || IsTypeless(metadata.format) || IsTypeless(format) ) + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + +#ifdef _AMD64_ + if ( (metadata.width > 0xFFFFFFFF) || (metadata.height > 0xFFFFFFFF) ) + return E_INVALIDARG; +#endif + + TexMetadata mdata2 = metadata; + mdata2.format = format; + HRESULT hr = result.Initialize( mdata2 ); + if ( FAILED(hr) ) + return hr; + + if ( nimages != result.GetImageCount() ) + { + result.Release(); + return E_FAIL; + } + + const Image* dest = result.GetImages(); + if ( !dest ) + { + result.Release(); + return E_POINTER; + } + + WICPixelFormatGUID pfGUID, targetGUID; + bool wicpf = _DXGIToWIC( metadata.format, pfGUID ); + bool wictargetpf = _DXGIToWIC( format, targetGUID ); + + for( size_t index=0; index < nimages; ++index ) + { + const Image& src = srcImages[ index ]; + if ( src.format != metadata.format ) + { + result.Release(); + return E_FAIL; + } + +#ifdef _AMD64_ + if ( (src.width > 0xFFFFFFFF) || (src.height > 0xFFFFFFFF) ) + return E_FAIL; +#endif + + const Image& dst = dest[ index ]; + assert( dst.format == format ); + + if ( src.width != dst.width || src.height != dst.height ) + { + result.Release(); + return E_FAIL; + } + + if ( wicpf ) + { + if ( wictargetpf ) + { + // Case 1: Both source and target formats are WIC supported + hr = _ConvertUsingWIC( src, pfGUID, targetGUID, filter, threshold, dst ); + } + else + { + // Case 2: Source format is supported by WIC, but not the target format + hr = _ConvertFromWIC( src, pfGUID, filter, threshold, dst ); + } + } + else + { + if ( wictargetpf ) + { + // Case 3: Source format is not supported by WIC, but does support the target format + hr = _ConvertToWIC( src, targetGUID, filter, threshold, dst ); + } + else + { + // Case 4: Both source and target format are not supported by WIC + hr = _Convert( src, filter, dst ); + } + } + + if ( FAILED(hr) ) + { + result.Release(); + return hr; + } + } + + return S_OK; +} + +}; // namespace diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexD3D11.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexD3D11.cpp new file mode 100644 index 0000000..70712f0 --- /dev/null +++ b/thirdparty/directxtex/DirectXTex/DirectXTexD3D11.cpp @@ -0,0 +1,820 @@ +//------------------------------------------------------------------------------------- +// DirectXTexD3D11.cpp +// +// DirectX Texture Library - Direct3D 11 helpers +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. 
+// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkId=248926 +//------------------------------------------------------------------------------------- + +#include "directxtexp.h" + +#include + +namespace DirectX +{ + +static HRESULT _Capture( _In_ ID3D11DeviceContext* pContext, _In_ ID3D11Resource* pSource, _In_ const TexMetadata& metadata, + _In_ const ScratchImage& result ) +{ + if ( !pContext || !pSource || !result.GetPixels() ) + return E_POINTER; + + if ( metadata.dimension == TEX_DIMENSION_TEXTURE3D ) + { + //--- Volume texture ---------------------------------------------------------- + assert( metadata.arraySize == 1 ); + + size_t height = metadata.height; + size_t depth = metadata.depth; + + for( size_t level = 0; level < metadata.mipLevels; ++level ) + { + UINT dindex = D3D11CalcSubresource( static_cast( level ), 0, static_cast( metadata.mipLevels ) ); + + D3D11_MAPPED_SUBRESOURCE mapped; + HRESULT hr = pContext->Map( pSource, dindex, D3D11_MAP_READ, 0, &mapped ); + if ( FAILED(hr) ) + return hr; + + const uint8_t* pslice = reinterpret_cast( mapped.pData ); + if ( !pslice ) + { + pContext->Unmap( pSource, dindex ); + return E_POINTER; + } + + size_t lines = ComputeScanlines( metadata.format, height ); + + for( size_t slice = 0; slice < depth; ++slice ) + { + const Image* img = result.GetImage( level, 0, slice ); + if ( !img ) + { + pContext->Unmap( pSource, dindex ); + return E_FAIL; + } + + if ( !img->pixels ) + { + pContext->Unmap( pSource, dindex ); + return E_POINTER; + } + + const uint8_t* sptr = pslice; + uint8_t* dptr = img->pixels; + for( size_t h = 0; h < lines; ++h ) + { + size_t msize = std::min( img->rowPitch, mapped.RowPitch ); + memcpy_s( dptr, img->rowPitch, sptr, msize ); + sptr += mapped.RowPitch; + dptr += img->rowPitch; + } + + pslice += mapped.DepthPitch; + } + + pContext->Unmap( pSource, dindex ); + + if ( height > 1 ) + height >>= 1; + if ( depth > 1 ) + depth >>= 1; + } + } + else + { + //--- 1D or 2D texture -------------------------------------------------------- + assert( metadata.depth == 1 ); + + for( size_t item = 0; item < metadata.arraySize; ++item ) + { + size_t height = metadata.height; + + for( size_t level = 0; level < metadata.mipLevels; ++level ) + { + UINT dindex = D3D11CalcSubresource( static_cast( level ), static_cast( item ), static_cast( metadata.mipLevels ) ); + + D3D11_MAPPED_SUBRESOURCE mapped; + HRESULT hr = pContext->Map( pSource, dindex, D3D11_MAP_READ, 0, &mapped ); + if ( FAILED(hr) ) + return hr; + + const Image* img = result.GetImage( level, item, 0 ); + if ( !img ) + { + pContext->Unmap( pSource, dindex ); + return E_FAIL; + } + + if ( !img->pixels ) + { + pContext->Unmap( pSource, dindex ); + return E_POINTER; + } + + size_t lines = ComputeScanlines( metadata.format, height ); + + const uint8_t* sptr = reinterpret_cast( mapped.pData ); + uint8_t* dptr = img->pixels; + for( size_t h = 0; h < lines; ++h ) + { + size_t msize = std::min( img->rowPitch, mapped.RowPitch ); + memcpy_s( dptr, img->rowPitch, sptr, msize ); + sptr += mapped.RowPitch; + dptr += img->rowPitch; + } + + pContext->Unmap( pSource, dindex ); + + if ( height > 1 ) + height >>= 1; + } + } + } + + return S_OK; +} + + +//===================================================================================== +// Entry-points +//===================================================================================== + +//------------------------------------------------------------------------------------- +// 
Determine if given texture metadata is supported on the given device +//------------------------------------------------------------------------------------- +bool IsSupportedTexture( ID3D11Device* pDevice, const TexMetadata& metadata ) +{ + if ( !pDevice ) + return false; + + D3D_FEATURE_LEVEL fl = pDevice->GetFeatureLevel(); + + // Validate format + DXGI_FORMAT fmt = metadata.format; + + if ( !IsValid( fmt ) ) + return false; + + if ( IsVideo(fmt) ) + return false; + + switch( fmt ) + { + case DXGI_FORMAT_BC4_TYPELESS: + case DXGI_FORMAT_BC4_UNORM: + case DXGI_FORMAT_BC4_SNORM: + case DXGI_FORMAT_BC5_TYPELESS: + case DXGI_FORMAT_BC5_UNORM: + case DXGI_FORMAT_BC5_SNORM: + if ( fl < D3D_FEATURE_LEVEL_10_0 ) + return false; + break; + + case DXGI_FORMAT_BC6H_TYPELESS: + case DXGI_FORMAT_BC6H_UF16: + case DXGI_FORMAT_BC6H_SF16: + case DXGI_FORMAT_BC7_TYPELESS: + case DXGI_FORMAT_BC7_UNORM: + case DXGI_FORMAT_BC7_UNORM_SRGB: + if ( fl < D3D_FEATURE_LEVEL_11_0 ) + return false; + break; + } + + // Validate miplevel count + if ( metadata.mipLevels > D3D11_REQ_MIP_LEVELS ) + return false; + + // Validate array size, dimension, and width/height + size_t arraySize = metadata.arraySize; + size_t iWidth = metadata.width; + size_t iHeight = metadata.height; + size_t iDepth = metadata.depth; + + // Most cases are known apriori based on feature level, but we use this for robustness to handle the few optional cases + UINT formatSupport = 0; + pDevice->CheckFormatSupport( fmt, &formatSupport ); + + switch ( metadata.dimension ) + { + case TEX_DIMENSION_TEXTURE1D: + if ( !(formatSupport & D3D11_FORMAT_SUPPORT_TEXTURE1D) ) + return false; + + if ( (arraySize > D3D11_REQ_TEXTURE1D_ARRAY_AXIS_DIMENSION) + || (iWidth > D3D11_REQ_TEXTURE1D_U_DIMENSION) ) + return false; + + if ( fl < D3D_FEATURE_LEVEL_11_0 ) + { + if ( (arraySize > D3D10_REQ_TEXTURE1D_ARRAY_AXIS_DIMENSION) + || (iWidth > D3D10_REQ_TEXTURE1D_U_DIMENSION) ) + return false; + + if ( fl < D3D_FEATURE_LEVEL_10_0 ) + { + if ( (arraySize > 1) || (iWidth > 4096 /*D3D_FL9_3_REQ_TEXTURE1D_U_DIMENSION*/) ) + return false; + + if ( (fl < D3D_FEATURE_LEVEL_9_3) && (iWidth > 2048 /*D3D_FL9_1_REQ_TEXTURE1D_U_DIMENSION*/ ) ) + return false; + } + } + break; + + case TEX_DIMENSION_TEXTURE2D: + if ( metadata.miscFlags & TEX_MISC_TEXTURECUBE ) + { + if ( !(formatSupport & D3D11_FORMAT_SUPPORT_TEXTURECUBE) ) + return false; + + if ( (arraySize > D3D11_REQ_TEXTURE2D_ARRAY_AXIS_DIMENSION) + || (iWidth > D3D11_REQ_TEXTURECUBE_DIMENSION) + || (iHeight > D3D11_REQ_TEXTURECUBE_DIMENSION)) + return false; + + if ( fl < D3D_FEATURE_LEVEL_11_0 ) + { + if ( (arraySize > D3D10_REQ_TEXTURE2D_ARRAY_AXIS_DIMENSION) + || (iWidth > D3D10_REQ_TEXTURECUBE_DIMENSION) + || (iHeight > D3D10_REQ_TEXTURECUBE_DIMENSION)) + return false; + + if ( (fl < D3D_FEATURE_LEVEL_10_1) && (arraySize != 6) ) + return false; + + if ( fl < D3D_FEATURE_LEVEL_10_0 ) + { + if ( (iWidth > 4096 /*D3D_FL9_3_REQ_TEXTURECUBE_DIMENSION*/ ) + || (iHeight > 4096 /*D3D_FL9_3_REQ_TEXTURECUBE_DIMENSION*/ ) ) + return false; + + if ( (fl < D3D_FEATURE_LEVEL_9_3) + && ( (iWidth > 512 /*D3D_FL9_1_REQ_TEXTURECUBE_DIMENSION*/) + || (iHeight > 512 /*D3D_FL9_1_REQ_TEXTURECUBE_DIMENSION*/) ) ) + return false; + } + } + } + else // Not a cube map + { + if ( !(formatSupport & D3D11_FORMAT_SUPPORT_TEXTURE2D) ) + return false; + + if ( (arraySize > D3D11_REQ_TEXTURE2D_ARRAY_AXIS_DIMENSION) + || (iWidth > D3D11_REQ_TEXTURE2D_U_OR_V_DIMENSION) + || (iHeight > D3D11_REQ_TEXTURE2D_U_OR_V_DIMENSION)) + return false; + + if ( fl < 
D3D_FEATURE_LEVEL_11_0 ) + { + if ( (arraySize > D3D10_REQ_TEXTURE2D_ARRAY_AXIS_DIMENSION) + || (iWidth > D3D10_REQ_TEXTURE2D_U_OR_V_DIMENSION) + || (iHeight > D3D10_REQ_TEXTURE2D_U_OR_V_DIMENSION)) + return false; + + if ( fl < D3D_FEATURE_LEVEL_10_0 ) + { + if ( (arraySize > 1) + || (iWidth > 4096 /*D3D_FL9_3_REQ_TEXTURE2D_U_OR_V_DIMENSION*/) + || (iHeight > 4096 /*D3D_FL9_3_REQ_TEXTURE2D_U_OR_V_DIMENSION*/) ) + return false; + + if ( (fl < D3D_FEATURE_LEVEL_9_3) + && ( (iWidth > 2048 /*D3D_FL9_1_REQ_TEXTURE2D_U_OR_V_DIMENSION*/) + || (iHeight > 2048 /*D3D_FL9_1_REQ_TEXTURE2D_U_OR_V_DIMENSION*/) ) ) + return false; + } + } + } + break; + + case TEX_DIMENSION_TEXTURE3D: + if ( !(formatSupport & D3D11_FORMAT_SUPPORT_TEXTURE3D) ) + return false; + + if ( (arraySize > 1) + || (iWidth > D3D11_REQ_TEXTURE3D_U_V_OR_W_DIMENSION) + || (iHeight > D3D11_REQ_TEXTURE3D_U_V_OR_W_DIMENSION) + || (iDepth > D3D11_REQ_TEXTURE3D_U_V_OR_W_DIMENSION) ) + return false; + + if ( fl < D3D_FEATURE_LEVEL_11_0 ) + { + if ( (iWidth > D3D10_REQ_TEXTURE3D_U_V_OR_W_DIMENSION) + || (iHeight > D3D10_REQ_TEXTURE3D_U_V_OR_W_DIMENSION) + || (iDepth > D3D10_REQ_TEXTURE3D_U_V_OR_W_DIMENSION) ) + return false; + + if ( fl < D3D_FEATURE_LEVEL_10_0 ) + { + if ( (iWidth > 256 /*D3D_FL9_1_REQ_TEXTURE3D_U_V_OR_W_DIMENSION*/) + || (iHeight > 256 /*D3D_FL9_1_REQ_TEXTURE3D_U_V_OR_W_DIMENSION*/) + || (iDepth > 256 /*D3D_FL9_1_REQ_TEXTURE3D_U_V_OR_W_DIMENSION*/) ) + return false; + } + } + break; + + default: + // Not a supported dimension + return false; + } + + return true; +} + + +//------------------------------------------------------------------------------------- +// Create a texture resource +//------------------------------------------------------------------------------------- +HRESULT CreateTexture( ID3D11Device* pDevice, const Image* srcImages, size_t nimages, const TexMetadata& metadata, + ID3D11Resource** ppResource ) +{ + if ( !pDevice || !srcImages || !nimages || !ppResource ) + return E_INVALIDARG; + + if ( !metadata.mipLevels || !metadata.arraySize ) + return E_INVALIDARG; + +#ifdef _AMD64_ + if ( (metadata.width > 0xFFFFFFFF) || (metadata.height > 0xFFFFFFFF) + || (metadata.mipLevels > 0xFFFFFFFF) || (metadata.arraySize > 0xFFFFFFFF) ) + return E_INVALIDARG; +#endif + + std::unique_ptr initData( new D3D11_SUBRESOURCE_DATA[ metadata.mipLevels * metadata.arraySize ] ); + if ( !initData ) + return E_OUTOFMEMORY; + + // Fill out subresource array + if ( metadata.dimension == TEX_DIMENSION_TEXTURE3D ) + { + //--- Volume case ------------------------------------------------------------- + if ( !metadata.depth ) + return E_INVALIDARG; + +#ifdef _AMD64_ + if ( metadata.depth > 0xFFFFFFFF ) + return E_INVALIDARG; +#endif + + if ( metadata.arraySize > 1 ) + // Direct3D 11 doesn't support arrays of 3D textures + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + + size_t depth = metadata.depth; + + size_t idx = 0; + for( size_t level = 0; level < metadata.mipLevels; ++level ) + { + size_t index = metadata.ComputeIndex( level, 0, 0 ); + if ( index >= nimages ) + return E_FAIL; + + const Image& img = srcImages[ index ]; + + if ( img.format != metadata.format ) + return E_FAIL; + + if ( !img.pixels ) + return E_POINTER; + + // Verify pixels in image 1 .. 
(depth-1) are exactly image->slicePitch apart + // For 3D textures, this relies on all slices of the same miplevel being continous in memory + // (this is how ScratchImage lays them out), which is why we just give the 0th slice to Direct3D 11 + const uint8_t* pSlice = img.pixels + img.slicePitch; + for( size_t slice = 1; slice < depth; ++slice ) + { + size_t tindex = metadata.ComputeIndex( level, 0, slice ); + if ( tindex >= nimages ) + return E_FAIL; + + const Image& timg = srcImages[ tindex ]; + + if ( !timg.pixels ) + return E_POINTER; + + if ( timg.pixels != pSlice + || timg.format != metadata.format + || timg.rowPitch != img.rowPitch + || timg.slicePitch != img.slicePitch ) + return E_FAIL; + + pSlice = timg.pixels + img.slicePitch; + } + + assert( idx < (metadata.mipLevels * metadata.arraySize) ); + + initData[idx].pSysMem = img.pixels; + initData[idx].SysMemPitch = static_cast( img.rowPitch ); + initData[idx].SysMemSlicePitch = static_cast( img.slicePitch ); + ++idx; + + if ( depth > 1 ) + depth >>= 1; + } + } + else + { + //--- 1D or 2D texture case --------------------------------------------------- + size_t idx = 0; + for( size_t item = 0; item < metadata.arraySize; ++item ) + { + for( size_t level = 0; level < metadata.mipLevels; ++level ) + { + size_t index = metadata.ComputeIndex( level, item, 0 ); + if ( index >= nimages ) + return E_FAIL; + + const Image& img = srcImages[ index ]; + + if ( img.format != metadata.format ) + return E_FAIL; + + if ( !img.pixels ) + return E_POINTER; + + assert( idx < (metadata.mipLevels * metadata.arraySize) ); + + initData[idx].pSysMem = img.pixels; + initData[idx].SysMemPitch = static_cast( img.rowPitch ); + initData[idx].SysMemSlicePitch = static_cast( img.slicePitch ); + ++idx; + } + } + } + + // Create texture using static initialization data + HRESULT hr = E_FAIL; + + switch ( metadata.dimension ) + { + case TEX_DIMENSION_TEXTURE1D: + { + D3D11_TEXTURE1D_DESC desc; + desc.Width = static_cast( metadata.width ); + desc.MipLevels = static_cast( metadata.mipLevels ); + desc.ArraySize = static_cast( metadata.arraySize ); + desc.Format = metadata.format; + desc.Usage = D3D11_USAGE_DEFAULT; + desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; + desc.CPUAccessFlags = 0; + desc.MiscFlags = 0; + + hr = pDevice->CreateTexture1D( &desc, initData.get(), reinterpret_cast(ppResource) ); + } + break; + + case TEX_DIMENSION_TEXTURE2D: + { + D3D11_TEXTURE2D_DESC desc; + desc.Width = static_cast( metadata.width ); + desc.Height = static_cast( metadata.height ); + desc.MipLevels = static_cast( metadata.mipLevels ); + desc.ArraySize = static_cast( metadata.arraySize ); + desc.Format = metadata.format; + desc.SampleDesc.Count = 1; + desc.SampleDesc.Quality = 0; + desc.Usage = D3D11_USAGE_DEFAULT; + desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; + desc.CPUAccessFlags = 0; + desc.MiscFlags = (metadata.miscFlags & TEX_MISC_TEXTURECUBE) ? 
D3D11_RESOURCE_MISC_TEXTURECUBE : 0; + + hr = pDevice->CreateTexture2D( &desc, initData.get(), reinterpret_cast(ppResource) ); + } + break; + + case TEX_DIMENSION_TEXTURE3D: + { + D3D11_TEXTURE3D_DESC desc; + desc.Width = static_cast( metadata.width ); + desc.Height = static_cast( metadata.height ); + desc.Depth = static_cast( metadata.depth ); + desc.MipLevels = static_cast( metadata.mipLevels ); + desc.Format = metadata.format; + desc.Usage = D3D11_USAGE_DEFAULT; + desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; + desc.CPUAccessFlags = 0; + desc.MiscFlags = 0; + + hr = pDevice->CreateTexture3D( &desc, initData.get(), reinterpret_cast(ppResource) ); + } + break; + } + + return hr; +} + + +//------------------------------------------------------------------------------------- +// Create a shader resource view and associated texture +//------------------------------------------------------------------------------------- +HRESULT CreateShaderResourceView( ID3D11Device* pDevice, const Image* srcImages, size_t nimages, const TexMetadata& metadata, + ID3D11ShaderResourceView** ppSRV ) +{ + if ( !ppSRV ) + return E_INVALIDARG; + + ScopedObject resource; + HRESULT hr = CreateTexture( pDevice, srcImages, nimages, metadata, &resource ); + if ( FAILED(hr) ) + return hr; + + assert( !resource.IsNull() ); + + D3D11_SHADER_RESOURCE_VIEW_DESC SRVDesc; + memset( &SRVDesc, 0, sizeof(SRVDesc) ); + SRVDesc.Format = metadata.format; + + switch ( metadata.dimension ) + { + case TEX_DIMENSION_TEXTURE1D: + if ( metadata.arraySize > 1 ) + { + SRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE1DARRAY; + SRVDesc.Texture1DArray.MipLevels = static_cast( metadata.mipLevels ); + SRVDesc.Texture1DArray.ArraySize = static_cast( metadata.arraySize ); + } + else + { + SRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE1D; + SRVDesc.Texture1D.MipLevels = static_cast( metadata.mipLevels ); + } + break; + + case TEX_DIMENSION_TEXTURE2D: + if ( metadata.miscFlags & TEX_MISC_TEXTURECUBE ) + { + if (metadata.arraySize > 6) + { + assert( (metadata.arraySize % 6) == 0 ); + SRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURECUBEARRAY; + SRVDesc.TextureCubeArray.MipLevels = static_cast( metadata.mipLevels ); + SRVDesc.TextureCubeArray.NumCubes = static_cast( metadata.arraySize / 6 ); + } + else + { + SRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURECUBE; + SRVDesc.TextureCube.MipLevels = static_cast( metadata.mipLevels ); + } + } + else if ( metadata.arraySize > 1 ) + { + SRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2DARRAY; + SRVDesc.Texture2DArray.MipLevels = static_cast( metadata.mipLevels ); + SRVDesc.Texture2DArray.ArraySize = static_cast( metadata.arraySize ); + } + else + { + SRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; + SRVDesc.Texture2D.MipLevels = static_cast( metadata.mipLevels ); + } + break; + + case TEX_DIMENSION_TEXTURE3D: + assert( metadata.arraySize == 1 ); + SRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE3D; + SRVDesc.Texture3D.MipLevels = static_cast( metadata.mipLevels ); + break; + + default: + return E_FAIL; + } + + hr = pDevice->CreateShaderResourceView( resource.Get(), &SRVDesc, ppSRV ); + if ( FAILED(hr) ) + return hr; + + assert( *ppSRV ); + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Save a texture resource to a DDS file in memory/on disk +//------------------------------------------------------------------------------------- +HRESULT CaptureTexture( ID3D11Device* pDevice, ID3D11DeviceContext* pContext, 
ID3D11Resource* pSource, ScratchImage& result ) +{ + if ( !pDevice || !pContext || !pSource ) + return E_INVALIDARG; + + D3D11_RESOURCE_DIMENSION resType = D3D11_RESOURCE_DIMENSION_UNKNOWN; + pSource->GetType( &resType ); + + HRESULT hr; + + switch( resType ) + { + case D3D11_RESOURCE_DIMENSION_TEXTURE1D: + { + ScopedObject pTexture; + hr = pSource->QueryInterface( __uuidof(ID3D11Texture1D), (void**) &pTexture ); + if ( FAILED(hr) ) + break; + + assert( pTexture.Get() ); + + D3D11_TEXTURE1D_DESC desc; + pTexture->GetDesc( &desc ); + + desc.BindFlags = 0; + desc.MiscFlags = 0; + desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ; + desc.Usage = D3D11_USAGE_STAGING; + + ScopedObject pStaging; + hr = pDevice->CreateTexture1D( &desc, 0, &pStaging ); + if ( FAILED(hr) ) + break; + + assert( pStaging.Get() ); + + pContext->CopyResource( pStaging.Get(), pSource ); + + TexMetadata mdata; + mdata.width = desc.Width; + mdata.height = mdata.depth = 1; + mdata.arraySize = desc.ArraySize; + mdata.mipLevels = desc.MipLevels; + mdata.miscFlags = 0; + mdata.format = desc.Format; + mdata.dimension = TEX_DIMENSION_TEXTURE1D; + + hr = result.Initialize( mdata ); + if ( FAILED(hr) ) + break; + + hr = _Capture( pContext, pStaging.Get(), mdata, result ); + } + break; + + case D3D11_RESOURCE_DIMENSION_TEXTURE2D: + { + ScopedObject pTexture; + hr = pSource->QueryInterface( __uuidof(ID3D11Texture2D), (void**) &pTexture ); + if ( FAILED(hr) ) + break; + + assert( pTexture.Get() ); + + D3D11_TEXTURE2D_DESC desc; + pTexture->GetDesc( &desc ); + + ScopedObject pStaging; + if ( desc.SampleDesc.Count > 1 ) + { + desc.SampleDesc.Count = 1; + desc.SampleDesc.Quality = 0; + + ScopedObject pTemp; + hr = pDevice->CreateTexture2D( &desc, 0, &pTemp ); + if ( FAILED(hr) ) + break; + + assert( pTemp.Get() ); + + DXGI_FORMAT fmt = desc.Format; + if ( IsTypeless(fmt) ) + { + // Assume a UNORM if it exists otherwise use FLOAT + fmt = MakeTypelessUNORM( fmt ); + fmt = MakeTypelessFLOAT( fmt ); + } + + UINT support = 0; + hr = pDevice->CheckFormatSupport( fmt, &support ); + if ( FAILED(hr) ) + break; + + if ( !(support & D3D11_FORMAT_SUPPORT_MULTISAMPLE_RESOLVE) ) + { + hr = E_FAIL; + break; + } + + for( UINT item = 0; item < desc.ArraySize; ++item ) + { + for( UINT level = 0; level < desc.MipLevels; ++level ) + { + UINT index = D3D11CalcSubresource( level, item, desc.MipLevels ); + + pContext->ResolveSubresource( pTemp.Get(), index, pSource, index, fmt ); + } + } + + desc.BindFlags = 0; + desc.MiscFlags &= D3D11_RESOURCE_MISC_TEXTURECUBE; + desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ; + desc.Usage = D3D11_USAGE_STAGING; + + hr = pDevice->CreateTexture2D( &desc, 0, &pStaging ); + if ( FAILED(hr) ) + break; + + assert( pStaging.Get() ); + + pContext->CopyResource( pStaging.Get(), pTemp.Get() ); + } + else + { + desc.BindFlags = 0; + desc.MiscFlags &= D3D11_RESOURCE_MISC_TEXTURECUBE; + desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ; + desc.Usage = D3D11_USAGE_STAGING; + + hr = pDevice->CreateTexture2D( &desc, 0, &pStaging ); + if ( FAILED(hr) ) + break; + + assert( pStaging.Get() ); + + pContext->CopyResource( pStaging.Get(), pSource ); + } + + TexMetadata mdata; + mdata.width = desc.Width; + mdata.height = desc.Height; + mdata.depth = 1; + mdata.arraySize = desc.ArraySize; + mdata.mipLevels = desc.MipLevels; + mdata.miscFlags = (desc.MiscFlags & D3D11_RESOURCE_MISC_TEXTURECUBE) ? 
TEX_MISC_TEXTURECUBE : 0; + mdata.format = desc.Format; + mdata.dimension = TEX_DIMENSION_TEXTURE2D; + + hr = result.Initialize( mdata ); + if ( FAILED(hr) ) + break; + + hr = _Capture( pContext, pStaging.Get(), mdata, result ); + } + break; + + case D3D11_RESOURCE_DIMENSION_TEXTURE3D: + { + ScopedObject pTexture; + hr = pSource->QueryInterface( __uuidof(ID3D11Texture3D), (void**) &pTexture ); + if ( FAILED(hr) ) + break; + + assert( pTexture.Get() ); + + D3D11_TEXTURE3D_DESC desc; + pTexture->GetDesc( &desc ); + + desc.BindFlags = 0; + desc.MiscFlags = 0; + desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ; + desc.Usage = D3D11_USAGE_STAGING; + + ScopedObject pStaging; + hr = pDevice->CreateTexture3D( &desc, 0, &pStaging ); + if ( FAILED(hr) ) + break; + + assert( pStaging.Get() ); + + pContext->CopyResource( pStaging.Get(), pSource ); + + TexMetadata mdata; + mdata.width = desc.Width; + mdata.height = desc.Height; + mdata.depth = desc.Depth; + mdata.arraySize = 1; + mdata.mipLevels = desc.MipLevels; + mdata.miscFlags = 0; + mdata.format = desc.Format; + mdata.dimension = TEX_DIMENSION_TEXTURE3D; + + hr = result.Initialize( mdata ); + if ( FAILED(hr) ) + break; + + hr = _Capture( pContext, pStaging.Get(), mdata, result ); + } + break; + + default: + hr = E_FAIL; + break; + } + + if ( FAILED(hr) ) + { + result.Release(); + return hr; + } + + return S_OK; +} + +}; // namespace diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexDDS.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexDDS.cpp new file mode 100644 index 0000000..c91e41b --- /dev/null +++ b/thirdparty/directxtex/DirectXTex/DirectXTexDDS.cpp @@ -0,0 +1,1684 @@ +//------------------------------------------------------------------------------------- +// DirectXTexDDS.cpp +// +// DirectX Texture Library - Microsoft DirectDraw Surface (DDS) file format reader/writer +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// +// http://go.microsoft.com/fwlink/?LinkId=248926 +//------------------------------------------------------------------------------------- + +#include "directxtexp.h" + +#include "dds.h" + +namespace DirectX +{ + +//------------------------------------------------------------------------------------- +// Legacy format mapping table (used for DDS files without 'DX10' extended header) +//------------------------------------------------------------------------------------- +enum CONVERSION_FLAGS +{ + CONV_FLAGS_NONE = 0x0, + CONV_FLAGS_EXPAND = 0x1, // Conversion requires expanded pixel size + CONV_FLAGS_NOALPHA = 0x2, // Conversion requires setting alpha to known value + CONV_FLAGS_SWIZZLE = 0x4, // BGR/RGB order swizzling required + CONV_FLAGS_PAL8 = 0x8, // Has an 8-bit palette + CONV_FLAGS_888 = 0x10, // Source is an 8:8:8 (24bpp) format + CONV_FLAGS_565 = 0x20, // Source is a 5:6:5 (16bpp) format + CONV_FLAGS_5551 = 0x40, // Source is a 5:5:5:1 (16bpp) format + CONV_FLAGS_4444 = 0x80, // Source is a 4:4:4:4 (16bpp) format + CONV_FLAGS_44 = 0x100, // Source is a 4:4 (8bpp) format + CONV_FLAGS_332 = 0x200, // Source is a 3:3:2 (8bpp) format + CONV_FLAGS_8332 = 0x400, // Source is a 8:3:3:2 (16bpp) format + CONV_FLAGS_A8P8 = 0x800, // Has an 8-bit palette with an alpha channel + CONV_FLAGS_DX10 = 0x10000, // Has the 'DX10' extension header +}; + +struct LegacyDDS +{ + DXGI_FORMAT format; + DWORD convFlags; + DDS_PIXELFORMAT ddpf; +}; + +const LegacyDDS g_LegacyDDSMap[] = +{ + { DXGI_FORMAT_BC1_UNORM, CONV_FLAGS_NONE, DDSPF_DXT1 }, // D3DFMT_DXT1 + { DXGI_FORMAT_BC2_UNORM, CONV_FLAGS_NONE, DDSPF_DXT3 }, // D3DFMT_DXT3 + { DXGI_FORMAT_BC3_UNORM, CONV_FLAGS_NONE, DDSPF_DXT5 }, // D3DFMT_DXT5 + + { DXGI_FORMAT_BC2_UNORM, CONV_FLAGS_NONE, DDSPF_DXT2 }, // D3DFMT_DXT2 (ignore premultiply) + { DXGI_FORMAT_BC3_UNORM, CONV_FLAGS_NONE, DDSPF_DXT4 }, // D3DFMT_DXT4 (ignore premultiply) + + { DXGI_FORMAT_BC4_UNORM, CONV_FLAGS_NONE, DDSPF_BC4_UNORM }, + { DXGI_FORMAT_BC4_SNORM, CONV_FLAGS_NONE, DDSPF_BC4_SNORM }, + { DXGI_FORMAT_BC5_UNORM, CONV_FLAGS_NONE, DDSPF_BC5_UNORM }, + { DXGI_FORMAT_BC5_SNORM, CONV_FLAGS_NONE, DDSPF_BC5_SNORM }, + + { DXGI_FORMAT_BC4_UNORM, CONV_FLAGS_NONE, { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC( 'A', 'T', 'I', '1' ), 0, 0, 0, 0, 0 } }, + { DXGI_FORMAT_BC5_UNORM, CONV_FLAGS_NONE, { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC( 'A', 'T', 'I', '2' ), 0, 0, 0, 0, 0 } }, + + { DXGI_FORMAT_R8G8_B8G8_UNORM, CONV_FLAGS_NONE, DDSPF_R8G8_B8G8 }, // D3DFMT_R8G8_B8G8 + { DXGI_FORMAT_G8R8_G8B8_UNORM, CONV_FLAGS_NONE, DDSPF_G8R8_G8B8 }, // D3DFMT_G8R8_G8B8 + + { DXGI_FORMAT_B8G8R8A8_UNORM, CONV_FLAGS_NONE, DDSPF_A8R8G8B8 }, // D3DFMT_A8R8G8B8 (uses DXGI 1.1 format) + { DXGI_FORMAT_B8G8R8X8_UNORM, CONV_FLAGS_NONE, DDSPF_X8R8G8B8 }, // D3DFMT_X8R8G8B8 (uses DXGI 1.1 format) + { DXGI_FORMAT_R8G8B8A8_UNORM, CONV_FLAGS_NONE, DDSPF_A8B8G8R8 }, // D3DFMT_A8B8G8R8 + { DXGI_FORMAT_R8G8B8A8_UNORM, CONV_FLAGS_NOALPHA, DDSPF_X8B8G8R8 }, // D3DFMT_X8B8G8R8 + { DXGI_FORMAT_R16G16_UNORM, CONV_FLAGS_NONE, DDSPF_G16R16 }, // D3DFMT_G16R16 + + { DXGI_FORMAT_R10G10B10A2_UNORM, CONV_FLAGS_SWIZZLE, { sizeof(DDS_PIXELFORMAT), DDS_RGB, 0, 32, 0x000003ff, 0x000ffc00, 0x3ff00000, 0xc0000000 } }, // D3DFMT_A2R10G10B10 (D3DX reversal issue workaround) + { DXGI_FORMAT_R10G10B10A2_UNORM, CONV_FLAGS_NONE, { sizeof(DDS_PIXELFORMAT), DDS_RGB, 0, 32, 0x3ff00000, 0x000ffc00, 0x000003ff, 0xc0000000 } }, // D3DFMT_A2B10G10R10 (D3DX reversal issue workaround) + + { DXGI_FORMAT_R8G8B8A8_UNORM, CONV_FLAGS_EXPAND 
+ | CONV_FLAGS_NOALPHA + | CONV_FLAGS_888, DDSPF_R8G8B8 }, // D3DFMT_R8G8B8 + + { DXGI_FORMAT_B5G6R5_UNORM, CONV_FLAGS_565, DDSPF_R5G6B5 }, // D3DFMT_R5G6B5 + { DXGI_FORMAT_B5G5R5A1_UNORM, CONV_FLAGS_5551, DDSPF_A1R5G5B5 }, // D3DFMT_A1R5G5B5 + { DXGI_FORMAT_B5G5R5A1_UNORM, CONV_FLAGS_5551 + | CONV_FLAGS_NOALPHA, { sizeof(DDS_PIXELFORMAT), DDS_RGB, 0, 16, 0x7c00, 0x03e0, 0x001f, 0x0000 } }, // D3DFMT_X1R5G5B5 + + { DXGI_FORMAT_R8G8B8A8_UNORM, CONV_FLAGS_EXPAND + | CONV_FLAGS_8332, { sizeof(DDS_PIXELFORMAT), DDS_RGB, 0, 16, 0x00e0, 0x001c, 0x0003, 0xff00 } }, // D3DFMT_A8R3G3B2 + { DXGI_FORMAT_B5G6R5_UNORM, CONV_FLAGS_EXPAND + | CONV_FLAGS_332, { sizeof(DDS_PIXELFORMAT), DDS_RGB, 0, 8, 0xe0, 0x1c, 0x03, 0x00 } }, // D3DFMT_R3G3B2 + + { DXGI_FORMAT_R8_UNORM, CONV_FLAGS_NONE, DDSPF_L8 }, // D3DFMT_L8 + { DXGI_FORMAT_R16_UNORM, CONV_FLAGS_NONE, DDSPF_L16 }, // D3DFMT_L16 + { DXGI_FORMAT_R8G8_UNORM, CONV_FLAGS_NONE, DDSPF_A8L8 }, // D3DFMT_A8L8 + + { DXGI_FORMAT_A8_UNORM, CONV_FLAGS_NONE, DDSPF_A8 }, // D3DFMT_A8 + + { DXGI_FORMAT_R16G16B16A16_UNORM, CONV_FLAGS_NONE, { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, 36, 0, 0, 0, 0, 0 } }, // D3DFMT_A16B16G16R16 + { DXGI_FORMAT_R16G16B16A16_SNORM, CONV_FLAGS_NONE, { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, 110, 0, 0, 0, 0, 0 } }, // D3DFMT_Q16W16V16U16 + { DXGI_FORMAT_R16_FLOAT, CONV_FLAGS_NONE, { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, 111, 0, 0, 0, 0, 0 } }, // D3DFMT_R16F + { DXGI_FORMAT_R16G16_FLOAT, CONV_FLAGS_NONE, { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, 112, 0, 0, 0, 0, 0 } }, // D3DFMT_G16R16F + { DXGI_FORMAT_R16G16B16A16_FLOAT, CONV_FLAGS_NONE, { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, 113, 0, 0, 0, 0, 0 } }, // D3DFMT_A16B16G16R16F + { DXGI_FORMAT_R32_FLOAT, CONV_FLAGS_NONE, { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, 114, 0, 0, 0, 0, 0 } }, // D3DFMT_R32F + { DXGI_FORMAT_R32G32_FLOAT, CONV_FLAGS_NONE, { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, 115, 0, 0, 0, 0, 0 } }, // D3DFMT_G32R32F + { DXGI_FORMAT_R32G32B32A32_FLOAT, CONV_FLAGS_NONE, { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, 116, 0, 0, 0, 0, 0 } }, // D3DFMT_A32B32G32R32F + + { DXGI_FORMAT_R32_FLOAT, CONV_FLAGS_NONE, { sizeof(DDS_PIXELFORMAT), DDS_RGB, 0, 32, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 } }, // D3DFMT_R32F (D3DX uses FourCC 114 instead) + + { DXGI_FORMAT_R8G8B8A8_UNORM, CONV_FLAGS_EXPAND + | CONV_FLAGS_PAL8 + | CONV_FLAGS_A8P8, { sizeof(DDS_PIXELFORMAT), DDS_PAL8, 0, 16, 0, 0, 0, 0 } }, // D3DFMT_A8P8 + { DXGI_FORMAT_R8G8B8A8_UNORM, CONV_FLAGS_EXPAND + | CONV_FLAGS_PAL8, { sizeof(DDS_PIXELFORMAT), DDS_PAL8, 0, 8, 0, 0, 0, 0 } }, // D3DFMT_P8 + +#ifdef DXGI_1_2_FORMATS + { DXGI_FORMAT_B4G4R4A4_UNORM, CONV_FLAGS_4444, DDSPF_A4R4G4B4 }, // D3DFMT_A4R4G4B4 (uses DXGI 1.2 format) + { DXGI_FORMAT_B4G4R4A4_UNORM, CONV_FLAGS_NOALPHA + | CONV_FLAGS_4444, { sizeof(DDS_PIXELFORMAT), DDS_RGB, 0, 16, 0x0f00, 0x00f0, 0x000f, 0x0000 } }, // D3DFMT_X4R4G4B4 (uses DXGI 1.2 format) + { DXGI_FORMAT_B4G4R4A4_UNORM, CONV_FLAGS_EXPAND + | CONV_FLAGS_44, { sizeof(DDS_PIXELFORMAT), DDS_LUMINANCE, 0, 8, 0x0f, 0x00, 0x00, 0xf0 } }, // D3DFMT_A4L4 (uses DXGI 1.2 format) +#else // !DXGI_1_2_FORMATS + { DXGI_FORMAT_R8G8B8A8_UNORM, CONV_FLAGS_EXPAND + | CONV_FLAGS_4444, DDSPF_A4R4G4B4 }, // D3DFMT_A4R4G4B4 + { DXGI_FORMAT_R8G8B8A8_UNORM, CONV_FLAGS_EXPAND + | CONV_FLAGS_NOALPHA + | CONV_FLAGS_4444, { sizeof(DDS_PIXELFORMAT), DDS_RGB, 0, 16, 0x0f00, 0x00f0, 0x000f, 0x0000 } }, // D3DFMT_X4R4G4B4 + { DXGI_FORMAT_R8G8B8A8_UNORM, CONV_FLAGS_EXPAND + | CONV_FLAGS_44, { sizeof(DDS_PIXELFORMAT), DDS_LUMINANCE, 0, 8, 0x0f, 0x00, 
0x00, 0xf0 } }, // D3DFMT_A4L4 +#endif +}; + +// Note that many common DDS reader/writers (including D3DX) swap +// the RED/BLUE masks for 10:10:10:2 formats. We assume +// below that the 'backwards' header mask is being used since it is most +// likely written by D3DX. The more robust solution is to use the 'DX10' +// header extension and specify the DXGI_FORMAT_R10G10B10A2_UNORM format directly. + +// We do not support the following legacy Direct3D 9 formats: +// BumpDuDv D3DFMT_V8U8, D3DFMT_Q8W8V8U8, D3DFMT_V16U16, D3DFMT_A2W10V10U10 +// BumpLuminance D3DFMT_L6V5U5, D3DFMT_X8L8V8U8 +// FourCC "UYVY" D3DFMT_UYVY +// FourCC "YUY2" D3DFMT_YUY2 +// FourCC 117 D3DFMT_CxV8U8 +// ZBuffer D3DFMT_D16_LOCKABLE +// FourCC 82 D3DFMT_D32F_LOCKABLE + +static DXGI_FORMAT _GetDXGIFormat( const DDS_PIXELFORMAT& ddpf, DWORD flags, _Inout_opt_ DWORD* convFlags ) +{ + const size_t MAP_SIZE = sizeof(g_LegacyDDSMap) / sizeof(LegacyDDS); + size_t index = 0; + for( index = 0; index < MAP_SIZE; ++index ) + { + const LegacyDDS* entry = &g_LegacyDDSMap[index]; + + if ( ddpf.dwFlags & entry->ddpf.dwFlags ) + { + if ( entry->ddpf.dwFlags & DDS_FOURCC ) + { + if ( ddpf.dwFourCC == entry->ddpf.dwFourCC ) + break; + } + else if ( entry->ddpf.dwFlags & DDS_PAL8 ) + { + if ( ddpf.dwRGBBitCount == entry->ddpf.dwRGBBitCount ) + break; + } + else if ( ddpf.dwRGBBitCount == entry->ddpf.dwRGBBitCount ) + { + // RGB, RGBA, ALPHA, LUMINANCE + if ( ddpf.dwRBitMask == entry->ddpf.dwRBitMask + && ddpf.dwGBitMask == entry->ddpf.dwGBitMask + && ddpf.dwBBitMask == entry->ddpf.dwBBitMask + && ddpf.dwABitMask == entry->ddpf.dwABitMask ) + break; + } + } + } + + if ( index >= MAP_SIZE ) + return DXGI_FORMAT_UNKNOWN; + + DWORD cflags = g_LegacyDDSMap[index].convFlags; + DXGI_FORMAT format = g_LegacyDDSMap[index].format; + + if ( (cflags & CONV_FLAGS_EXPAND) && (flags & DDS_FLAGS_NO_LEGACY_EXPANSION) ) + return DXGI_FORMAT_UNKNOWN; + + if ( (format == DXGI_FORMAT_R10G10B10A2_UNORM) && (flags & DDS_FLAGS_NO_R10B10G10A2_FIXUP) ) + { + cflags ^= CONV_FLAGS_SWIZZLE; + } + + if ( convFlags ) + *convFlags = cflags; + + return format; +} + + +//------------------------------------------------------------------------------------- +// Decodes DDS header including optional DX10 extended header +//------------------------------------------------------------------------------------- +static HRESULT _DecodeDDSHeader( _In_bytecount_(size) LPCVOID pSource, size_t size, DWORD flags, _Out_ TexMetadata& metadata, + _Inout_opt_ DWORD* convFlags ) +{ + if ( !pSource ) + return E_INVALIDARG; + + memset( &metadata, 0, sizeof(TexMetadata) ); + + if ( size < (sizeof(DDS_HEADER) + sizeof(uint32_t)) ) + { + return HRESULT_FROM_WIN32( ERROR_INVALID_DATA ); + } + + // DDS files always start with the same magic number ("DDS ") + uint32_t dwMagicNumber = *reinterpret_cast<const uint32_t*>(pSource); + if ( dwMagicNumber != DDS_MAGIC ) + { + return E_FAIL; + } + + const DDS_HEADER* pHeader = reinterpret_cast<const DDS_HEADER*>( (const uint8_t*)pSource + sizeof( uint32_t ) ); + assert( pHeader ); + + // Verify header to validate DDS file + if ( pHeader->dwSize != sizeof(DDS_HEADER) + || pHeader->ddspf.dwSize != sizeof(DDS_PIXELFORMAT) ) + { + return E_FAIL; + } + + metadata.mipLevels = pHeader->dwMipMapCount; + if ( metadata.mipLevels == 0 ) + metadata.mipLevels = 1; + + // Check for DX10 extension + if ( (pHeader->ddspf.dwFlags & DDS_FOURCC) + && (MAKEFOURCC( 'D', 'X', '1', '0' ) == pHeader->ddspf.dwFourCC) ) + { + // Buffer must be big enough for both headers and magic value + if ( size < 
(sizeof(DDS_HEADER)+sizeof(uint32_t)+sizeof(DDS_HEADER_DXT10)) ) + { + return E_FAIL; + } + + const DDS_HEADER_DXT10* d3d10ext = reinterpret_cast<const DDS_HEADER_DXT10*>( (const uint8_t*)pSource + sizeof( uint32_t ) + sizeof(DDS_HEADER) ); + if ( convFlags ) + *convFlags |= CONV_FLAGS_DX10; + + metadata.arraySize = d3d10ext->arraySize; + if ( metadata.arraySize == 0 ) + { + return HRESULT_FROM_WIN32( ERROR_INVALID_DATA ); + } + + metadata.format = d3d10ext->dxgiFormat; + if ( !IsValid( metadata.format ) ) + { + return HRESULT_FROM_WIN32( ERROR_INVALID_DATA ); + } + + switch ( d3d10ext->resourceDimension ) + { + case DDS_DIMENSION_TEXTURE1D: + + // D3DX writes 1D textures with a fixed Height of 1 + if ( (pHeader->dwFlags & DDS_HEIGHT) && pHeader->dwHeight != 1 ) + { + return HRESULT_FROM_WIN32( ERROR_INVALID_DATA ); + } + + metadata.width = pHeader->dwWidth; + metadata.height = 1; + metadata.depth = 1; + metadata.dimension = TEX_DIMENSION_TEXTURE1D; + break; + + case DDS_DIMENSION_TEXTURE2D: + if ( d3d10ext->miscFlag & DDS_RESOURCE_MISC_TEXTURECUBE ) + { + metadata.miscFlags |= TEX_MISC_TEXTURECUBE; + metadata.arraySize *= 6; + } + + metadata.width = pHeader->dwWidth; + metadata.height = pHeader->dwHeight; + metadata.depth = 1; + metadata.dimension = TEX_DIMENSION_TEXTURE2D; + break; + + case DDS_DIMENSION_TEXTURE3D: + if ( !(pHeader->dwFlags & DDS_HEADER_FLAGS_VOLUME) ) + { + return HRESULT_FROM_WIN32( ERROR_INVALID_DATA ); + } + + if ( metadata.arraySize > 1 ) + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + + metadata.width = pHeader->dwWidth; + metadata.height = pHeader->dwHeight; + metadata.depth = pHeader->dwDepth; + metadata.dimension = TEX_DIMENSION_TEXTURE3D; + break; + + default: + return HRESULT_FROM_WIN32( ERROR_INVALID_DATA ); + } + } + else + { + metadata.arraySize = 1; + + if ( pHeader->dwFlags & DDS_HEADER_FLAGS_VOLUME ) + { + metadata.width = pHeader->dwWidth; + metadata.height = pHeader->dwHeight; + metadata.depth = pHeader->dwDepth; + metadata.dimension = TEX_DIMENSION_TEXTURE3D; + } + else + { + if ( pHeader->dwCaps2 & DDS_CUBEMAP ) + { + // We require all six faces to be defined + if ( (pHeader->dwCaps2 & DDS_CUBEMAP_ALLFACES ) != DDS_CUBEMAP_ALLFACES ) + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + + metadata.arraySize = 6; + metadata.miscFlags |= TEX_MISC_TEXTURECUBE; + } + + metadata.width = pHeader->dwWidth; + metadata.height = pHeader->dwHeight; + metadata.depth = 1; + metadata.dimension = TEX_DIMENSION_TEXTURE2D; + + // Note there's no way for a legacy Direct3D 9 DDS to express a '1D' texture + } + + metadata.format = _GetDXGIFormat( pHeader->ddspf, flags, convFlags ); + + if ( metadata.format == DXGI_FORMAT_UNKNOWN ) + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + } + + // Special flag for handling BGR DXGI 1.1 formats + if (flags & DDS_FLAGS_FORCE_RGB) + { + switch ( metadata.format ) + { + case DXGI_FORMAT_B8G8R8A8_UNORM: + metadata.format = DXGI_FORMAT_R8G8B8A8_UNORM; + if ( convFlags ) + *convFlags |= CONV_FLAGS_SWIZZLE; + break; + + case DXGI_FORMAT_B8G8R8X8_UNORM: + metadata.format = DXGI_FORMAT_R8G8B8A8_UNORM; + if ( convFlags ) + *convFlags |= CONV_FLAGS_SWIZZLE | CONV_FLAGS_NOALPHA; + break; + + case DXGI_FORMAT_B8G8R8A8_TYPELESS: + metadata.format = DXGI_FORMAT_R8G8B8A8_TYPELESS; + if ( convFlags ) + *convFlags |= CONV_FLAGS_SWIZZLE; + break; + + case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB: + metadata.format = DXGI_FORMAT_R8G8B8A8_UNORM_SRGB; + if ( convFlags ) + *convFlags |= CONV_FLAGS_SWIZZLE; + break; + + case DXGI_FORMAT_B8G8R8X8_TYPELESS: + metadata.format = 
DXGI_FORMAT_R8G8B8A8_TYPELESS; + if ( convFlags ) + *convFlags |= CONV_FLAGS_SWIZZLE | CONV_FLAGS_NOALPHA; + break; + + case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB: + metadata.format = DXGI_FORMAT_R8G8B8A8_UNORM_SRGB; + if ( convFlags ) + *convFlags |= CONV_FLAGS_SWIZZLE | CONV_FLAGS_NOALPHA; + break; + } + } + + // Special flag for handling 16bpp formats + if (flags & DDS_FLAGS_NO_16BPP) + { + switch ( metadata.format ) + { + case DXGI_FORMAT_B5G6R5_UNORM: + case DXGI_FORMAT_B5G5R5A1_UNORM: +#ifdef DXGI_1_2_FORMATS + case DXGI_FORMAT_B4G4R4A4_UNORM: +#endif + metadata.format = DXGI_FORMAT_R8G8B8A8_UNORM; + if ( convFlags ) + { + *convFlags |= CONV_FLAGS_EXPAND; + if ( metadata.format == DXGI_FORMAT_B5G6R5_UNORM ) + *convFlags |= CONV_FLAGS_NOALPHA; + } + } + } + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Encodes DDS file header (magic value, header, optional DX10 extended header) +//------------------------------------------------------------------------------------- +HRESULT _EncodeDDSHeader( _In_ const TexMetadata& metadata, DWORD flags, + _Out_opt_cap_x_(maxsize) LPVOID pDestination, _In_ size_t maxsize, _Out_ size_t& required ) +{ + assert( IsValid( metadata.format ) && !IsVideo( metadata.format ) ); + + if ( metadata.arraySize > 1 ) + { + if ( (metadata.arraySize != 6) || (metadata.dimension != TEX_DIMENSION_TEXTURE2D) || !(metadata.miscFlags & TEX_MISC_TEXTURECUBE) ) + { + flags |= DDS_FLAGS_FORCE_DX10_EXT; + } + } + + DDS_PIXELFORMAT ddpf = { 0 }; + if ( !(flags & DDS_FLAGS_FORCE_DX10_EXT) ) + { + switch( metadata.format ) + { + case DXGI_FORMAT_R8G8B8A8_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_A8B8G8R8, sizeof(DDS_PIXELFORMAT) ); break; + case DXGI_FORMAT_R16G16_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_G16R16, sizeof(DDS_PIXELFORMAT) ); break; + case DXGI_FORMAT_R8G8_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_A8L8, sizeof(DDS_PIXELFORMAT) ); break; + case DXGI_FORMAT_R16_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_L16, sizeof(DDS_PIXELFORMAT) ); break; + case DXGI_FORMAT_R8_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_L8, sizeof(DDS_PIXELFORMAT) ); break; + case DXGI_FORMAT_A8_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_A8, sizeof(DDS_PIXELFORMAT) ); break; + case DXGI_FORMAT_R8G8_B8G8_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_R8G8_B8G8, sizeof(DDS_PIXELFORMAT) ); break; + case DXGI_FORMAT_G8R8_G8B8_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_G8R8_G8B8, sizeof(DDS_PIXELFORMAT) ); break; + case DXGI_FORMAT_BC1_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_DXT1, sizeof(DDS_PIXELFORMAT) ); break; + case DXGI_FORMAT_BC2_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_DXT3, sizeof(DDS_PIXELFORMAT) ); break; + case DXGI_FORMAT_BC3_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_DXT5, sizeof(DDS_PIXELFORMAT) ); break; + case DXGI_FORMAT_BC4_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_BC4_UNORM, sizeof(DDS_PIXELFORMAT) ); break; + case DXGI_FORMAT_BC4_SNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_BC4_SNORM, sizeof(DDS_PIXELFORMAT) ); break; + case DXGI_FORMAT_BC5_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_BC5_UNORM, sizeof(DDS_PIXELFORMAT) ); break; + case DXGI_FORMAT_BC5_SNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_BC5_SNORM, sizeof(DDS_PIXELFORMAT) ); break; + case DXGI_FORMAT_B5G6R5_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_R5G6B5, sizeof(DDS_PIXELFORMAT) ); break; + case DXGI_FORMAT_B5G5R5A1_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_A1R5G5B5, sizeof(DDS_PIXELFORMAT) ); break; + case 
DXGI_FORMAT_B8G8R8A8_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_A8R8G8B8, sizeof(DDS_PIXELFORMAT) ); break; // DXGI 1.1 + case DXGI_FORMAT_B8G8R8X8_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_X8R8G8B8, sizeof(DDS_PIXELFORMAT) ); break; // DXGI 1.1 + +#ifdef DXGI_1_2_FORMATS + case DXGI_FORMAT_B4G4R4A4_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_A4R4G4B4, sizeof(DDS_PIXELFORMAT) ); break; +#endif + + // Legacy D3DX formats using D3DFMT enum value as FourCC + case DXGI_FORMAT_R32G32B32A32_FLOAT: + ddpf.dwSize = sizeof(DDS_PIXELFORMAT); ddpf.dwFlags = DDS_FOURCC; ddpf.dwFourCC = 116; // D3DFMT_A32B32G32R32F + break; + case DXGI_FORMAT_R16G16B16A16_FLOAT: + ddpf.dwSize = sizeof(DDS_PIXELFORMAT); ddpf.dwFlags = DDS_FOURCC; ddpf.dwFourCC = 113; // D3DFMT_A16B16G16R16F + break; + case DXGI_FORMAT_R16G16B16A16_UNORM: + ddpf.dwSize = sizeof(DDS_PIXELFORMAT); ddpf.dwFlags = DDS_FOURCC; ddpf.dwFourCC = 36; // D3DFMT_A16B16G16R16 + break; + case DXGI_FORMAT_R16G16B16A16_SNORM: + ddpf.dwSize = sizeof(DDS_PIXELFORMAT); ddpf.dwFlags = DDS_FOURCC; ddpf.dwFourCC = 110; // D3DFMT_Q16W16V16U16 + break; + case DXGI_FORMAT_R32G32_FLOAT: + ddpf.dwSize = sizeof(DDS_PIXELFORMAT); ddpf.dwFlags = DDS_FOURCC; ddpf.dwFourCC = 115; // D3DFMT_G32R32F + break; + case DXGI_FORMAT_R16G16_FLOAT: + ddpf.dwSize = sizeof(DDS_PIXELFORMAT); ddpf.dwFlags = DDS_FOURCC; ddpf.dwFourCC = 112; // D3DFMT_G16R16F + break; + case DXGI_FORMAT_R32_FLOAT: + ddpf.dwSize = sizeof(DDS_PIXELFORMAT); ddpf.dwFlags = DDS_FOURCC; ddpf.dwFourCC = 114; // D3DFMT_R32F + break; + case DXGI_FORMAT_R16_FLOAT: + ddpf.dwSize = sizeof(DDS_PIXELFORMAT); ddpf.dwFlags = DDS_FOURCC; ddpf.dwFourCC = 111; // D3DFMT_R16F + break; + } + } + + required = sizeof(uint32_t) + sizeof(DDS_HEADER); + + if ( ddpf.dwSize == 0 ) + required += sizeof(DDS_HEADER_DXT10); + + if ( !pDestination ) + return S_OK; + + if ( maxsize < required ) + return E_NOT_SUFFICIENT_BUFFER; + + *reinterpret_cast(pDestination) = DDS_MAGIC; + + DDS_HEADER* header = reinterpret_cast( reinterpret_cast(pDestination) + sizeof(uint32_t) ); + assert( header ); + + memset( header, 0, sizeof(DDS_HEADER ) ); + header->dwSize = sizeof( DDS_HEADER ); + header->dwFlags = DDS_HEADER_FLAGS_TEXTURE; + header->dwCaps = DDS_SURFACE_FLAGS_TEXTURE; + + if (metadata.mipLevels > 0) + { + header->dwFlags |= DDS_HEADER_FLAGS_MIPMAP; + +#ifdef _AMD64_ + if ( metadata.mipLevels > 0xFFFFFFFF ) + return E_INVALIDARG; +#endif + + header->dwMipMapCount = static_cast( metadata.mipLevels ); + + if ( header->dwMipMapCount > 1 ) + header->dwCaps |= DDS_SURFACE_FLAGS_MIPMAP; + } + + switch( metadata.dimension ) + { + case TEX_DIMENSION_TEXTURE1D: +#ifdef _AMD64_ + if ( metadata.height > 0xFFFFFFFF ) + return E_INVALIDARG; +#endif + + header->dwWidth = static_cast( metadata.width ); + header->dwHeight = header->dwDepth = 1; + break; + + case TEX_DIMENSION_TEXTURE2D: +#ifdef _AMD64_ + if ( metadata.height > 0xFFFFFFFF + || metadata.width > 0xFFFFFFFF) + return E_INVALIDARG; +#endif + + header->dwHeight = static_cast( metadata.height ); + header->dwWidth = static_cast( metadata.width ); + header->dwDepth = 1; + + if ( metadata.miscFlags & TEX_MISC_TEXTURECUBE ) + { + header->dwCaps |= DDS_SURFACE_FLAGS_CUBEMAP; + header->dwCaps2 |= DDS_CUBEMAP_ALLFACES; + } + break; + + case TEX_DIMENSION_TEXTURE3D: +#ifdef _AMD64_ + if ( metadata.height > 0xFFFFFFFF + || metadata.width > 0xFFFFFFFF + || metadata.depth > 0xFFFFFFFF ) + return E_INVALIDARG; +#endif + + header->dwFlags |= DDS_HEADER_FLAGS_VOLUME; + header->dwCaps2 |= 
DDS_FLAGS_VOLUME; + header->dwHeight = static_cast( metadata.height ); + header->dwWidth = static_cast( metadata.width ); + header->dwDepth = static_cast( metadata.depth ); + break; + + default: + return E_FAIL; + } + + size_t rowPitch, slicePitch; + ComputePitch( metadata.format, metadata.width, metadata.height, rowPitch, slicePitch, CP_FLAGS_NONE ); + +#ifdef _AMD64_ + if ( slicePitch > 0xFFFFFFFF + || rowPitch > 0xFFFFFFFF ) + return E_FAIL; +#endif + + if ( IsCompressed( metadata.format ) ) + { + header->dwFlags |= DDS_HEADER_FLAGS_LINEARSIZE; + header->dwPitchOrLinearSize = static_cast( slicePitch ); + } + else + { + header->dwFlags |= DDS_HEADER_FLAGS_PITCH; + header->dwPitchOrLinearSize = static_cast( rowPitch ); + } + + if ( ddpf.dwSize == 0 ) + { + memcpy_s( &header->ddspf, sizeof(header->ddspf), &DDSPF_DX10, sizeof(DDS_PIXELFORMAT) ); + + DDS_HEADER_DXT10* ext = reinterpret_cast( reinterpret_cast(header) + sizeof(DDS_HEADER) ); + assert( ext ); + + memset( ext, 0, sizeof(DDS_HEADER_DXT10) ); + ext->dxgiFormat = metadata.format; + ext->resourceDimension = metadata.dimension; + +#ifdef _AMD64_ + if ( metadata.arraySize > 0xFFFFFFFF ) + return E_INVALIDARG; +#endif + + if ( metadata.miscFlags & TEX_MISC_TEXTURECUBE ) + { + ext->miscFlag |= TEX_MISC_TEXTURECUBE; + assert( (metadata.arraySize % 6) == 0 ); + ext->arraySize = static_cast( metadata.arraySize / 6 ); + } + else + { + ext->arraySize = static_cast( metadata.arraySize ); + } + } + else + { + memcpy_s( &header->ddspf, sizeof(header->ddspf), &ddpf, sizeof(ddpf) ); + } + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Converts an image row with optional clearing of alpha value to 1.0 +// Returns true if supported, false if expansion case not supported +//------------------------------------------------------------------------------------- +enum TEXP_LEGACY_FORMAT +{ + TEXP_LEGACY_UNKNOWN = 0, + TEXP_LEGACY_R8G8B8, + TEXP_LEGACY_R3G3B2, + TEXP_LEGACY_A8R3G3B2, + TEXP_LEGACY_P8, + TEXP_LEGACY_A8P8, + TEXP_LEGACY_A4L4, + TEXP_LEGACY_B4G4R4A4, +}; + +inline static TEXP_LEGACY_FORMAT _FindLegacyFormat( DWORD flags ) +{ + TEXP_LEGACY_FORMAT lformat = TEXP_LEGACY_UNKNOWN; + + if ( flags & CONV_FLAGS_PAL8 ) + { + lformat = ( flags & CONV_FLAGS_A8P8 ) ? 
TEXP_LEGACY_A8P8 : TEXP_LEGACY_P8; + } + else if ( flags & CONV_FLAGS_888 ) + lformat = TEXP_LEGACY_R8G8B8; + else if ( flags & CONV_FLAGS_332 ) + lformat = TEXP_LEGACY_R3G3B2; + else if ( flags & CONV_FLAGS_8332 ) + lformat = TEXP_LEGACY_A8R3G3B2; + else if ( flags & CONV_FLAGS_44 ) + lformat = TEXP_LEGACY_A4L4; +#ifndef DXGI_1_2_FORMATS + else if ( flags & CONV_FLAGS_4444 ) + lformat = TEXP_LEGACY_B4G4R4A4; +#endif + + return lformat; +} + +static bool _LegacyExpandScanline( _Out_bytecap_(outSize) LPVOID pDestination, size_t outSize, _In_ DXGI_FORMAT outFormat, + _In_bytecount_(inSize) LPCVOID pSource, size_t inSize, _In_ TEXP_LEGACY_FORMAT inFormat, + _In_opt_count_c_(256) const uint32_t* pal8, _In_ DWORD flags ) +{ + assert( pDestination && outSize > 0 ); + assert( pSource && inSize > 0 ); + assert( IsValid(outFormat) && !IsVideo(outFormat) ); + + switch( inFormat ) + { + case TEXP_LEGACY_R8G8B8: + if ( outFormat != DXGI_FORMAT_R8G8B8A8_UNORM ) + return false; + + // D3DFMT_R8G8B8 -> DXGI_FORMAT_R8G8B8A8_UNORM + { + const uint8_t * __restrict sPtr = reinterpret_cast(pSource); + uint32_t * __restrict dPtr = reinterpret_cast(pDestination); + + for( size_t ocount = 0, icount = 0; ((icount < inSize) && (ocount < outSize)); icount += 3, ocount += 4 ) + { + // 24bpp Direct3D 9 files are actually BGR, so need to swizzle as well + uint32_t t1 = ( *(sPtr) << 16 ); + uint32_t t2 = ( *(sPtr+1) << 8 ); + uint32_t t3 = *(sPtr+2); + + *(dPtr++) = t1 | t2 | t3 | 0xff000000; + sPtr += 3; + } + } + return true; + + case TEXP_LEGACY_R3G3B2: + switch( outFormat ) + { + case DXGI_FORMAT_R8G8B8A8_UNORM: + // D3DFMT_R3G3B2 -> DXGI_FORMAT_R8G8B8A8_UNORM + { + const uint8_t* __restrict sPtr = reinterpret_cast(pSource); + uint32_t * __restrict dPtr = reinterpret_cast(pDestination); + + for( size_t ocount = 0, icount = 0; ((icount < inSize) && (ocount < outSize)); ++icount, ocount += 4 ) + { + uint8_t t = *(sPtr++); + + uint32_t t1 = (t & 0xe0) | ((t & 0xe0) >> 3) | ((t & 0xc0) >> 6); + uint32_t t2 = ((t & 0x1c) << 11) | ((t & 0x1c) << 8) | ((t & 0x18) << 5); + uint32_t t3 = ((t & 0x03) << 22) | ((t & 0x03) << 20) | ((t & 0x03) << 18) | ((t & 0x03) << 16); + + *(dPtr++) = t1 | t2 | t3 | 0xff000000; + } + } + return true; + + case DXGI_FORMAT_B5G6R5_UNORM: + // D3DFMT_R3G3B2 -> DXGI_FORMAT_B5G6R5_UNORM + { + const uint8_t* __restrict sPtr = reinterpret_cast(pSource); + uint16_t * __restrict dPtr = reinterpret_cast(pDestination); + + for( size_t ocount = 0, icount = 0; ((icount < inSize) && (ocount < outSize)); ++icount, ocount += 2 ) + { + uint8_t t = *(sPtr++); + + uint16_t t1 = ((t & 0xe0) << 8) | ((t & 0xc0) << 5); + uint16_t t2 = ((t & 0x1c) << 6) | ((t & 0x1c) << 3); + uint16_t t3 = ((t & 0x03) << 3) | ((t & 0x03) << 1) | ((t & 0x02) >> 1); + + *(dPtr++) = t1 | t2 | t3; + } + } + return true; + } + break; + + case TEXP_LEGACY_A8R3G3B2: + if ( outFormat != DXGI_FORMAT_R8G8B8A8_UNORM ) + return false; + + // D3DFMT_A8R3G3B2 -> DXGI_FORMAT_R8G8B8A8_UNORM + { + const uint16_t* __restrict sPtr = reinterpret_cast(pSource); + uint32_t * __restrict dPtr = reinterpret_cast(pDestination); + + for( size_t ocount = 0, icount = 0; ((icount < inSize) && (ocount < outSize)); icount += 2, ocount += 4 ) + { + uint16_t t = *(sPtr++); + + uint32_t t1 = (t & 0x00e0) | ((t & 0x00e0) >> 3) | ((t & 0x00c0) >> 6); + uint32_t t2 = ((t & 0x001c) << 11) | ((t & 0x001c) << 8) | ((t & 0x0018) << 5); + uint32_t t3 = ((t & 0x0003) << 22) | ((t & 0x0003) << 20) | ((t & 0x0003) << 18) | ((t & 0x0003) << 16); + uint32_t ta = ( flags & 
TEXP_SCANLINE_SETALPHA ) ? 0xff000000 : ((t & 0xff00) << 16); + + *(dPtr++) = t1 | t2 | t3 | ta; + } + } + return true; + + case TEXP_LEGACY_P8: + if ( (outFormat != DXGI_FORMAT_R8G8B8A8_UNORM) || !pal8 ) + return false; + + // D3DFMT_P8 -> DXGI_FORMAT_R8G8B8A8_UNORM + { + const uint8_t* __restrict sPtr = reinterpret_cast(pSource); + uint32_t * __restrict dPtr = reinterpret_cast(pDestination); + + for( size_t ocount = 0, icount = 0; ((icount < inSize) && (ocount < outSize)); ++icount, ocount += 4 ) + { + uint8_t t = *(sPtr++); + + *(dPtr++) = pal8[ t ]; + } + } + return true; + + case TEXP_LEGACY_A8P8: + if ( (outFormat != DXGI_FORMAT_R8G8B8A8_UNORM) || !pal8 ) + return false; + + // D3DFMT_A8P8 -> DXGI_FORMAT_R8G8B8A8_UNORM + { + const uint16_t* __restrict sPtr = reinterpret_cast(pSource); + uint32_t * __restrict dPtr = reinterpret_cast(pDestination); + + for( size_t ocount = 0, icount = 0; ((icount < inSize) && (ocount < outSize)); icount += 2, ocount += 4 ) + { + uint16_t t = *(sPtr++); + + uint32_t t1 = pal8[ t & 0xff ]; + uint32_t ta = ( flags & TEXP_SCANLINE_SETALPHA ) ? 0xff000000 : ((t & 0xff00) << 16); + + *(dPtr++) = t1 | ta; + } + } + return true; + + case TEXP_LEGACY_A4L4: + switch( outFormat ) + { +#ifdef DXGI_1_2_FORMATS + case DXGI_FORMAT_B4G4R4A4_UNORM : + // D3DFMT_A4L4 -> DXGI_FORMAT_B4G4R4A4_UNORM + { + const uint8_t * __restrict sPtr = reinterpret_cast(pSource); + uint16_t * __restrict dPtr = reinterpret_cast(pDestination); + + for( size_t ocount = 0, icount = 0; ((icount < inSize) && (ocount < outSize)); ++icount, ocount += 2 ) + { + uint8_t t = *(sPtr++); + + uint16_t t1 = (t & 0x0f); + uint16_t ta = ( flags & TEXP_SCANLINE_SETALPHA ) ? 0xf000 : ((t & 0xf0) << 8); + + *(dPtr++) = t1 | (t1 << 4) | (t1 << 8) | ta; + } + } + return true; +#endif // DXGI_1_2_FORMATS + + case DXGI_FORMAT_R8G8B8A8_UNORM: + // D3DFMT_A4L4 -> DXGI_FORMAT_R8G8B8A8_UNORM + { + const uint8_t * __restrict sPtr = reinterpret_cast(pSource); + uint32_t * __restrict dPtr = reinterpret_cast(pDestination); + + for( size_t ocount = 0, icount = 0; ((icount < inSize) && (ocount < outSize)); ++icount, ocount += 4 ) + { + uint8_t t = *(sPtr++); + + uint32_t t1 = ((t & 0x0f) << 4) | (t & 0x0f); + uint32_t ta = ( flags & TEXP_SCANLINE_SETALPHA ) ? 0xff000000 : (((t & 0xf0) << 24) | ((t & 0xf0) << 20)); + + *(dPtr++) = t1 | (t1 << 8) | (t1 << 16) | ta; + } + } + return true; + } + break; + +#ifndef DXGI_1_2_FORMATS + case TEXP_LEGACY_B4G4R4A4: + if (outFormat != DXGI_FORMAT_R8G8B8A8_UNORM) + return false; + + // D3DFMT_A4R4G4B4 -> DXGI_FORMAT_R8G8B8A8_UNORM + { + const uint16_t * __restrict sPtr = reinterpret_cast(pSource); + uint32_t * __restrict dPtr = reinterpret_cast(pDestination); + + for( size_t ocount = 0, icount = 0; ((icount < inSize) && (ocount < outSize)); icount += 2, ocount += 4 ) + { + uint16_t t = *(sPtr++); + + uint32_t t1 = ((t & 0x0f00) >> 4) | ((t & 0x0f00) >> 8); + uint32_t t2 = ((t & 0x00f0) << 8) | ((t & 0x00f0) << 4); + uint32_t t3 = ((t & 0x000f) << 20) | ((t & 0x000f) << 16); + uint32_t ta = ( flags & TEXP_SCANLINE_SETALPHA ) ? 
0xff000000 : (((t & 0xf000) << 16) | ((t & 0xf000) << 12)); + + *(dPtr++) = t1 | t2 | t3 | ta; + } + } + return true; +#endif + } + + return false; +} + + +//------------------------------------------------------------------------------------- +// Converts or copies image data from pPixels into scratch image data +//------------------------------------------------------------------------------------- +static HRESULT _CopyImage( _In_bytecount_(size) const void* pPixels, _In_ size_t size, + _In_ const TexMetadata& metadata, _In_ DWORD cpFlags, _In_ DWORD convFlags, _In_opt_count_c_(256) const uint32_t *pal8, _In_ const ScratchImage& image ) +{ + assert( pPixels ); + assert( image.GetPixels() ); + + if ( !size ) + return E_FAIL; + + if ( convFlags & CONV_FLAGS_EXPAND ) + { + if ( convFlags & CONV_FLAGS_888 ) + cpFlags |= CP_FLAGS_24BPP; + else if ( convFlags & (CONV_FLAGS_565 | CONV_FLAGS_5551 | CONV_FLAGS_4444 | CONV_FLAGS_8332 | CONV_FLAGS_A8P8 ) ) + cpFlags |= CP_FLAGS_16BPP; + else if ( convFlags & (CONV_FLAGS_44 | CONV_FLAGS_332 | CONV_FLAGS_PAL8) ) + cpFlags |= CP_FLAGS_8BPP; + } + + size_t pixelSize, nimages; + _DetermineImageArray( metadata, cpFlags, nimages, pixelSize ); + if ( (nimages == 0) || (nimages != image.GetImageCount()) ) + { + return E_FAIL; + } + + assert( pixelSize <= size ); + + std::unique_ptr timages( new Image[nimages] ); + if ( !_SetupImageArray( (uint8_t*)pPixels, size, metadata, cpFlags, timages.get(), nimages ) ) + { + return E_FAIL; + } + + if ( nimages != image.GetImageCount() ) + { + return E_FAIL; + } + + const Image* images = image.GetImages(); + if ( !images ) + { + return E_FAIL; + } + + DWORD tflags = (convFlags & CONV_FLAGS_NOALPHA) ? TEXP_SCANLINE_SETALPHA : 0; + if ( convFlags & CONV_FLAGS_SWIZZLE ) + tflags |= TEXP_SCANLINE_LEGACY; + + switch (metadata.dimension) + { + case TEX_DIMENSION_TEXTURE1D: + case TEX_DIMENSION_TEXTURE2D: + { + size_t index = 0; + for( size_t item = 0; item < metadata.arraySize; ++item ) + { + for( size_t level = 0; level < metadata.mipLevels; ++level, ++index ) + { + if ( index >= nimages ) + return E_FAIL; + + if ( images[ index ].height != timages[ index ].height ) + return E_FAIL; + + size_t dpitch = images[ index ].rowPitch; + size_t spitch = timages[ index ].rowPitch; + + const uint8_t *pSrc = const_cast( timages[ index ].pixels ); + if ( !pSrc ) + return E_POINTER; + + uint8_t *pDest = images[ index ].pixels; + if ( !pDest ) + return E_POINTER; + + if ( IsCompressed( metadata.format ) ) + { + size_t csize = std::min( images[ index ].slicePitch, timages[ index ].slicePitch ); + memcpy_s( pDest, images[ index ].slicePitch, pSrc, csize ); + } + else + { + for( size_t h = 0; h < images[ index ].height; ++h ) + { + if ( convFlags & CONV_FLAGS_EXPAND ) + { +#ifdef DXGI_1_2_FORMATS + if ( convFlags & (CONV_FLAGS_565|CONV_FLAGS_5551|CONV_FLAGS_4444) ) +#else + if ( convFlags & (CONV_FLAGS_565|CONV_FLAGS_5551) ) +#endif + { + if ( !_ExpandScanline( pDest, dpitch, DXGI_FORMAT_R8G8B8A8_UNORM, + pSrc, spitch, + (convFlags & CONV_FLAGS_565) ? 
DXGI_FORMAT_B5G6R5_UNORM : DXGI_FORMAT_B5G5R5A1_UNORM, + tflags ) ) + return E_FAIL; + } + else + { + TEXP_LEGACY_FORMAT lformat = _FindLegacyFormat( convFlags ); + if ( !_LegacyExpandScanline( pDest, dpitch, metadata.format, + pSrc, spitch, lformat, pal8, + tflags ) ) + return E_FAIL; + } + } + else if ( convFlags & CONV_FLAGS_SWIZZLE ) + { + _SwizzleScanline( pDest, dpitch, pSrc, spitch, + metadata.format, tflags ); + } + else + { + _CopyScanline( pDest, dpitch, pSrc, spitch, + metadata.format, tflags ); + } + + pSrc += spitch; + pDest += dpitch; + } + } + } + } + } + break; + + case TEX_DIMENSION_TEXTURE3D: + { + size_t index = 0; + size_t d = metadata.depth; + + for( size_t level = 0; level < metadata.mipLevels; ++level ) + { + for( size_t slice = 0; slice < d; ++slice, ++index ) + { + if ( index >= nimages ) + return E_FAIL; + + if ( images[ index ].height != timages[ index ].height ) + return E_FAIL; + + size_t dpitch = images[ index ].rowPitch; + size_t spitch = timages[ index ].rowPitch; + + const uint8_t *pSrc = const_cast( timages[ index ].pixels ); + if ( !pSrc ) + return E_POINTER; + + uint8_t *pDest = images[ index ].pixels; + if ( !pDest ) + return E_POINTER; + + if ( IsCompressed( metadata.format ) ) + { + size_t csize = std::min( images[ index ].slicePitch, timages[ index ].slicePitch ); + memcpy_s( pDest, images[ index ].slicePitch, pSrc, csize ); + } + else + { + for( size_t h = 0; h < images[ index ].height; ++h ) + { + if ( convFlags & CONV_FLAGS_EXPAND ) + { +#ifdef DXGI_1_2_FORMATS + if ( convFlags & (CONV_FLAGS_565|CONV_FLAGS_5551|CONV_FLAGS_4444) ) +#else + if ( convFlags & (CONV_FLAGS_565|CONV_FLAGS_5551) ) +#endif + { + if ( !_ExpandScanline( pDest, dpitch, DXGI_FORMAT_R8G8B8A8_UNORM, + pSrc, spitch, + (convFlags & CONV_FLAGS_565) ? DXGI_FORMAT_B5G6R5_UNORM : DXGI_FORMAT_B5G5R5A1_UNORM, + tflags ) ) + return E_FAIL; + } + else + { + TEXP_LEGACY_FORMAT lformat = _FindLegacyFormat( convFlags ); + if ( !_LegacyExpandScanline( pDest, dpitch, metadata.format, + pSrc, spitch, lformat, pal8, + tflags ) ) + return E_FAIL; + } + } + else if ( convFlags & CONV_FLAGS_SWIZZLE ) + { + _SwizzleScanline( pDest, dpitch, pSrc, spitch, metadata.format, tflags ); + } + else + { + _CopyScanline( pDest, dpitch, pSrc, spitch, metadata.format, tflags ); + } + + pSrc += spitch; + pDest += dpitch; + } + } + } + + if ( d > 1 ) + d >>= 1; + } + } + break; + + default: + return E_FAIL; + } + + return S_OK; +} + +static HRESULT _CopyImageInPlace( DWORD convFlags, _In_ const ScratchImage& image ) +{ + if ( !image.GetPixels() ) + return E_FAIL; + + const Image* images = image.GetImages(); + if ( !images ) + return E_FAIL; + + const TexMetadata& metadata = image.GetMetadata(); + + DWORD tflags = (convFlags & CONV_FLAGS_NOALPHA) ? 
TEXP_SCANLINE_SETALPHA : 0; + if ( convFlags & CONV_FLAGS_SWIZZLE ) + tflags |= TEXP_SCANLINE_LEGACY; + + for( size_t i = 0; i < image.GetImageCount(); ++i ) + { + const Image* img = &images[ i ]; + uint8_t *pPixels = img->pixels; + if ( !pPixels ) + return E_POINTER; + + size_t rowPitch = img->rowPitch; + + for( size_t h = 0; h < img->height; ++h ) + { + if ( convFlags & CONV_FLAGS_SWIZZLE ) + { + _SwizzleScanline( pPixels, rowPitch, pPixels, rowPitch, metadata.format, tflags ); + } + else + { + _CopyScanline( pPixels, rowPitch, pPixels, rowPitch, metadata.format, tflags ); + } + + pPixels += rowPitch; + } + } + + return S_OK; +} + + +//===================================================================================== +// Entry-points +//===================================================================================== + +//------------------------------------------------------------------------------------- +// Obtain metadata from DDS file in memory/on disk +//------------------------------------------------------------------------------------- + +HRESULT GetMetadataFromDDSMemory( LPCVOID pSource, size_t size, DWORD flags, TexMetadata& metadata ) +{ + if ( !pSource || size == 0 ) + return E_INVALIDARG; + + return _DecodeDDSHeader( pSource, size, flags, metadata, 0 ); +} + +HRESULT GetMetadataFromDDSFile( LPCWSTR szFile, DWORD flags, TexMetadata& metadata ) +{ + if ( !szFile ) + return E_INVALIDARG; + +#if (_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/) + ScopedHandle hFile( safe_handle( CreateFile2( szFile, GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, 0 ) ) ); +#else + ScopedHandle hFile( safe_handle( CreateFileW( szFile, GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING, + FILE_FLAG_SEQUENTIAL_SCAN, 0 ) ) ); +#endif + if ( !hFile ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } + + // Get the file size + LARGE_INTEGER fileSize = {0}; + +#if (_WIN32_WINNT >= _WIN32_WINNT_VISTA) + FILE_STANDARD_INFO fileInfo; + if ( !GetFileInformationByHandleEx( hFile.get(), FileStandardInfo, &fileInfo, sizeof(fileInfo) ) ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } + fileSize = fileInfo.EndOfFile; +#else + if ( !GetFileSizeEx( hFile.get(), &fileSize ) ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } +#endif + + // File is too big for 32-bit allocation, so reject read (4 GB should be plenty large enough for a valid DDS file) + if ( fileSize.HighPart > 0 ) + { + return HRESULT_FROM_WIN32( ERROR_FILE_TOO_LARGE ); + } + + // Need at least enough data to fill the standard header and magic number to be a valid DDS + if ( fileSize.LowPart < ( sizeof(DDS_HEADER) + sizeof(uint32_t) ) ) + { + return E_FAIL; + } + + // Read the header in (including extended header if present) + const size_t MAX_HEADER_SIZE = sizeof(uint32_t) + sizeof(DDS_HEADER) + sizeof(DDS_HEADER_DXT10); + uint8_t header[MAX_HEADER_SIZE]; + + DWORD bytesRead = 0; + if ( !ReadFile( hFile.get(), header, MAX_HEADER_SIZE, &bytesRead, 0 ) ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } + + return _DecodeDDSHeader( header, bytesRead, flags, metadata, 0 ); +} + + +//------------------------------------------------------------------------------------- +// Load a DDS file in memory +//------------------------------------------------------------------------------------- +HRESULT LoadFromDDSMemory( LPCVOID pSource, size_t size, DWORD flags, TexMetadata* metadata, ScratchImage& image ) +{ + if ( !pSource || size == 0 ) + return E_INVALIDARG; + + image.Release(); + + DWORD convFlags = 0; + TexMetadata mdata; + 
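+    // [Editorial note, not part of the upstream DirectXTex sources] The parsing below walks a
+    // DDS blob laid out as: the 4-byte magic 'DDS ', a DDS_HEADER, an optional DDS_HEADER_DXT10
+    // (present only when the legacy header's FourCC is 'DX10'), an optional 256-entry 32-bit
+    // palette for legacy P8/A8P8 surfaces, and finally the pixel data. A minimal caller sketch,
+    // using hypothetical ddsData/ddsSize buffers and the public entry point from DirectXTex.h:
+    //
+    //     DirectX::TexMetadata info;
+    //     DirectX::ScratchImage img;
+    //     HRESULT hr = DirectX::LoadFromDDSMemory( ddsData, ddsSize, DirectX::DDS_FLAGS_NONE, &info, img );
+    //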
HRESULT hr = _DecodeDDSHeader( pSource, size, flags, mdata, &convFlags ); + if ( FAILED(hr) ) + return hr; + + size_t offset = sizeof(uint32_t) + sizeof(DDS_HEADER); + if ( convFlags & CONV_FLAGS_DX10 ) + offset += sizeof(DDS_HEADER_DXT10); + + assert( offset <= size ); + + const uint32_t *pal8 = nullptr; + if ( convFlags & CONV_FLAGS_PAL8 ) + { + pal8 = reinterpret_cast( reinterpret_cast(pSource) + offset ); + assert( pal8 ); + offset += ( 256 * sizeof(uint32_t) ); + if ( size < offset ) + return E_FAIL; + } + + hr = image.Initialize( mdata ); + if ( FAILED(hr) ) + return hr; + + LPCVOID pPixels = reinterpret_cast( reinterpret_cast(pSource) + offset ); + assert( pPixels ); + hr = _CopyImage( pPixels, size - offset, mdata, + (flags & DDS_FLAGS_LEGACY_DWORD) ? CP_FLAGS_LEGACY_DWORD : CP_FLAGS_NONE, convFlags, pal8, image ); + if ( FAILED(hr) ) + { + image.Release(); + return hr; + } + if ( metadata ) + memcpy( metadata, &mdata, sizeof(TexMetadata) ); + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Load a DDS file from disk +//------------------------------------------------------------------------------------- +HRESULT LoadFromDDSFile( LPCWSTR szFile, DWORD flags, TexMetadata* metadata, ScratchImage& image ) +{ + if ( !szFile ) + return E_INVALIDARG; + + image.Release(); + +#if (_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/) + ScopedHandle hFile( safe_handle ( CreateFile2( szFile, GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, 0 ) ) ); +#else + ScopedHandle hFile( safe_handle ( CreateFileW( szFile, GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING, + FILE_FLAG_SEQUENTIAL_SCAN, 0 ) ) ); +#endif + + if ( !hFile ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } + + // Get the file size + LARGE_INTEGER fileSize = {0}; + +#if (_WIN32_WINNT >= _WIN32_WINNT_VISTA) + FILE_STANDARD_INFO fileInfo; + if ( !GetFileInformationByHandleEx( hFile.get(), FileStandardInfo, &fileInfo, sizeof(fileInfo) ) ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } + fileSize = fileInfo.EndOfFile; +#else + if ( !GetFileSizeEx( hFile.get(), &fileSize ) ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } +#endif + + // File is too big for 32-bit allocation, so reject read (4 GB should be plenty large enough for a valid DDS file) + if ( fileSize.HighPart > 0 ) + { + return HRESULT_FROM_WIN32( ERROR_FILE_TOO_LARGE ); + } + + // Need at least enough data to fill the standard header and magic number to be a valid DDS + if ( fileSize.LowPart < ( sizeof(DDS_HEADER) + sizeof(uint32_t) ) ) + { + return E_FAIL; + } + + // Read the header in (including extended header if present) + const size_t MAX_HEADER_SIZE = sizeof(uint32_t) + sizeof(DDS_HEADER) + sizeof(DDS_HEADER_DXT10); + uint8_t header[MAX_HEADER_SIZE]; + + DWORD bytesRead = 0; + if ( !ReadFile( hFile.get(), header, MAX_HEADER_SIZE, &bytesRead, 0 ) ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } + + DWORD convFlags = 0; + TexMetadata mdata; + HRESULT hr = _DecodeDDSHeader( header, bytesRead, flags, mdata, &convFlags ); + if ( FAILED(hr) ) + return hr; + + DWORD offset = MAX_HEADER_SIZE; + + if ( !(convFlags & CONV_FLAGS_DX10) ) + { + // Must reset file position since we read more than the standard header above + LARGE_INTEGER filePos = { sizeof(uint32_t) + sizeof(DDS_HEADER), 0}; + if ( !SetFilePointerEx( hFile.get(), filePos, 0, FILE_BEGIN ) ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } + + offset = sizeof(uint32_t) + sizeof(DDS_HEADER); + } + + std::unique_ptr pal8; + if 
( convFlags & CONV_FLAGS_PAL8 ) + { + pal8.reset( new uint32_t[256] ); + if ( !pal8 ) + { + return E_OUTOFMEMORY; + } + + if ( !ReadFile( hFile.get(), pal8.get(), 256 * sizeof(uint32_t), &bytesRead, 0 ) ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } + + if ( bytesRead != (256 * sizeof(uint32_t)) ) + { + return E_FAIL; + } + + offset += ( 256 * sizeof(uint32_t) ); + } + + DWORD remaining = fileSize.LowPart - offset; + if ( remaining == 0 ) + return E_FAIL; + + hr = image.Initialize( mdata ); + if ( FAILED(hr) ) + return hr; + + if ( (convFlags & CONV_FLAGS_EXPAND) || (flags & DDS_FLAGS_LEGACY_DWORD) ) + { + std::unique_ptr temp( new uint8_t[ remaining ] ); + if ( !temp ) + { + image.Release(); + return E_OUTOFMEMORY; + } + + if ( !ReadFile( hFile.get(), temp.get(), remaining, &bytesRead, 0 ) ) + { + image.Release(); + return HRESULT_FROM_WIN32( GetLastError() ); + } + + if ( bytesRead != remaining ) + { + image.Release(); + return E_FAIL; + } + + hr = _CopyImage( temp.get(), remaining, mdata, + (flags & DDS_FLAGS_LEGACY_DWORD) ? CP_FLAGS_LEGACY_DWORD : CP_FLAGS_NONE, + convFlags, pal8.get(), image ); + if ( FAILED(hr) ) + { + image.Release(); + return hr; + } + } + else + { + if ( remaining > image.GetPixelsSize() ) + { + image.Release(); + return E_FAIL; + } + + if ( !ReadFile( hFile.get(), image.GetPixels(), static_cast( image.GetPixelsSize() ), &bytesRead, 0 ) ) + { + image.Release(); + return HRESULT_FROM_WIN32( GetLastError() ); + } + + if ( convFlags & (CONV_FLAGS_SWIZZLE|CONV_FLAGS_NOALPHA) ) + { + // Swizzle/copy image in place + hr = _CopyImageInPlace( convFlags, image ); + if ( FAILED(hr) ) + { + image.Release(); + return hr; + } + } + } + + if ( metadata ) + memcpy( metadata, &mdata, sizeof(TexMetadata) ); + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Save a DDS file to memory +//------------------------------------------------------------------------------------- +HRESULT SaveToDDSMemory( const Image* images, size_t nimages, const TexMetadata& metadata, DWORD flags, Blob& blob ) +{ + if ( !images || (nimages == 0) ) + return E_INVALIDARG; + + // Determine memory required + size_t required = 0; + HRESULT hr = _EncodeDDSHeader( metadata, flags, 0, 0, required ); + if ( FAILED(hr) ) + return hr; + + for( size_t i = 0; i < nimages; ++i ) + { + required += images[ i ].slicePitch; + if ( !images[ i ].pixels ) + return E_POINTER; + } + + assert( required > 0 ); + + blob.Release(); + + hr = blob.Initialize( required ); + if ( FAILED(hr) ) + return hr; + + uint8_t* pDestination = reinterpret_cast( blob.GetBufferPointer() ); + assert( pDestination ); + + hr = _EncodeDDSHeader( metadata, flags, pDestination, blob.GetBufferSize(), required ); + if ( FAILED(hr) ) + { + blob.Release(); + return hr; + } + + size_t remaining = blob.GetBufferSize() - required; + pDestination += required; + + if ( !remaining ) + { + blob.Release(); + return E_FAIL; + } + + switch( metadata.dimension ) + { + case DDS_DIMENSION_TEXTURE1D: + case DDS_DIMENSION_TEXTURE2D: + { + size_t index = 0; + for( size_t item = 0; item < metadata.arraySize; ++item ) + { + for( size_t level = 0; level < metadata.mipLevels; ++level ) + { + if ( index >= nimages ) + { + blob.Release(); + return E_FAIL; + } + + size_t pixsize = images[ index ].slicePitch; + if ( memcpy_s( pDestination, remaining, images[ index ].pixels, pixsize ) ) + { + blob.Release(); + return E_FAIL; + } + pDestination += pixsize; + remaining -= pixsize; + + ++index; + } + } + } + 
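+        // [Editorial note, not part of the upstream sources] For 1D/2D textures the payload is
+        // written item-major: the full mip chain of array element 0, then of element 1, and so
+        // on, which mirrors ScratchImage::GetImage's indexing (index = item * mipLevels + mip).
+        // For example, arraySize = 2 with mipLevels = 3 emits six subresources in the order
+        // (item 0, mip 0..2) followed by (item 1, mip 0..2).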
break; + + case DDS_DIMENSION_TEXTURE3D: + { + if ( metadata.arraySize != 1 ) + { + blob.Release(); + return E_FAIL; + } + + size_t d = metadata.depth; + + size_t index = 0; + for( size_t level = 0; level < metadata.mipLevels; ++level ) + { + for( size_t slice = 0; slice < d; ++slice ) + { + if ( index >= nimages ) + { + blob.Release(); + return E_FAIL; + } + + size_t pixsize = images[ index ].slicePitch; + if ( memcpy_s( pDestination, remaining, images[ index ].pixels, pixsize ) ) + { + blob.Release(); + return E_FAIL; + } + pDestination += pixsize; + remaining -= pixsize; + + ++index; + } + + if ( d > 1 ) + d >>= 1; + } + } + break; + + default: + blob.Release(); + return E_FAIL; + } + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Save a DDS file to disk +//------------------------------------------------------------------------------------- +HRESULT SaveToDDSFile( const Image* images, size_t nimages, const TexMetadata& metadata, DWORD flags, LPCWSTR szFile ) +{ + if ( !szFile ) + return E_INVALIDARG; + + // Create DDS Header + const size_t MAX_HEADER_SIZE = sizeof(uint32_t) + sizeof(DDS_HEADER) + sizeof(DDS_HEADER_DXT10); + uint8_t header[MAX_HEADER_SIZE]; + size_t required; + HRESULT hr = _EncodeDDSHeader( metadata, flags, header, MAX_HEADER_SIZE, required ); + if ( FAILED(hr) ) + return hr; + + // Create file and write header +#if (_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/) + ScopedHandle hFile( safe_handle( CreateFile2( szFile, GENERIC_WRITE, 0, CREATE_ALWAYS, 0 ) ) ); +#else + ScopedHandle hFile( safe_handle( CreateFileW( szFile, GENERIC_WRITE, 0, 0, CREATE_ALWAYS, 0, 0 ) ) ); +#endif + if ( !hFile ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } + + DWORD bytesWritten; + if ( !WriteFile( hFile.get(), header, static_cast( required ), &bytesWritten, 0 ) ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } + + if ( bytesWritten != required ) + { + return E_FAIL; + } + + // Write images + switch( metadata.dimension ) + { + case DDS_DIMENSION_TEXTURE1D: + case DDS_DIMENSION_TEXTURE2D: + { + size_t index = 0; + for( size_t item = 0; item < metadata.arraySize; ++item ) + { + for( size_t level = 0; level < metadata.mipLevels; ++level, ++index ) + { + if ( index >= nimages ) + return E_FAIL; + + if ( !images[ index ].pixels ) + return E_POINTER; + + size_t pixsize = images[ index ].slicePitch; + + if ( !WriteFile( hFile.get(), images[ index ].pixels, static_cast( pixsize ), &bytesWritten, 0 ) ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } + + if ( bytesWritten != pixsize ) + { + return E_FAIL; + } + } + } + } + break; + + case DDS_DIMENSION_TEXTURE3D: + { + if ( metadata.arraySize != 1 ) + return E_FAIL; + + size_t d = metadata.depth; + + size_t index = 0; + for( size_t level = 0; level < metadata.mipLevels; ++level ) + { + for( size_t slice = 0; slice < d; ++slice, ++index ) + { + if ( index >= nimages ) + return E_FAIL; + + if ( !images[ index ].pixels ) + return E_POINTER; + + size_t pixsize = images[ index ].slicePitch; + + if ( !WriteFile( hFile.get(), images[ index ].pixels, static_cast( pixsize ), &bytesWritten, 0 ) ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } + + if ( bytesWritten != pixsize ) + { + return E_FAIL; + } + } + + if ( d > 1 ) + d >>= 1; + } + } + break; + + default: + return E_FAIL; + } + + return S_OK; +} + +}; // namespace diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexFlipRotate.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexFlipRotate.cpp new file 
mode 100644 index 0000000..5ff94d5 --- /dev/null +++ b/thirdparty/directxtex/DirectXTex/DirectXTexFlipRotate.cpp @@ -0,0 +1,327 @@ +//------------------------------------------------------------------------------------- +// DirectXTexFlipRotate.cpp +// +// DirectX Texture Library - Image flip/rotate operations +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkId=248926 +//------------------------------------------------------------------------------------- + +#include "directxtexp.h" + +namespace DirectX +{ + +//------------------------------------------------------------------------------------- +// Do flip/rotate operation using WIC +//------------------------------------------------------------------------------------- +static HRESULT _PerformFlipRotateUsingWIC( _In_ const Image& srcImage, _In_ DWORD flags, + _In_ const WICPixelFormatGUID& pfGUID, _In_ const Image& destImage ) +{ + if ( !srcImage.pixels || !destImage.pixels ) + return E_POINTER; + + assert( srcImage.format == destImage.format ); + + IWICImagingFactory* pWIC = _GetWIC(); + if ( !pWIC ) + return E_NOINTERFACE; + + ScopedObject source; + HRESULT hr = pWIC->CreateBitmapFromMemory( static_cast( srcImage.width ), static_cast( srcImage.height ), pfGUID, + static_cast( srcImage.rowPitch ), static_cast( srcImage.slicePitch ), + srcImage.pixels, &source ); + if ( FAILED(hr) ) + return hr; + + ScopedObject FR; + hr = pWIC->CreateBitmapFlipRotator( &FR ); + if ( FAILED(hr) ) + return hr; + + hr = FR->Initialize( source.Get(), static_cast( flags ) ); + if ( FAILED(hr) ) + return hr; + + WICPixelFormatGUID pfFR; + hr = FR->GetPixelFormat( &pfFR ); + if ( FAILED(hr) ) + return hr; + + if ( memcmp( &pfFR, &pfGUID, sizeof(GUID) ) != 0 ) + { + // Flip/rotate should return the same format as the source... 
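+        // [Editorial note, not part of the upstream sources] destImage was allocated by the
+        // caller with srcImage.format (asserted above), so if the WIC flip/rotator reports a
+        // different pixel format the CopyPixels call below would produce mismatched data; the
+        // safe choice here is to fail rather than convert silently.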
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + } + + UINT nwidth, nheight; + hr = FR->GetSize( &nwidth, &nheight ); + if ( FAILED(hr) ) + return hr; + + if ( destImage.width != nwidth || destImage.height != nheight ) + return E_FAIL; + + hr = FR->CopyPixels( 0, static_cast( destImage.rowPitch ), static_cast( destImage.slicePitch ), destImage.pixels ); + if ( FAILED(hr) ) + return hr; + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Do conversion, flip/rotate using WIC, conversion cycle +//------------------------------------------------------------------------------------- +static HRESULT _PerformFlipRotateViaF32( _In_ const Image& srcImage, _In_ DWORD flags, _In_ const Image& destImage ) +{ + if ( !srcImage.pixels || !destImage.pixels ) + return E_POINTER; + + assert( srcImage.format != DXGI_FORMAT_R32G32B32A32_FLOAT ); + assert( srcImage.format == destImage.format ); + + ScratchImage temp; + HRESULT hr = _ConvertToR32G32B32A32( srcImage, temp ); + if ( FAILED(hr) ) + return hr; + + const Image *tsrc = temp.GetImage( 0, 0, 0 ); + if ( !tsrc ) + return E_POINTER; + + ScratchImage rtemp; + hr = rtemp.Initialize2D( DXGI_FORMAT_R32G32B32A32_FLOAT, destImage.width, destImage.height, 1, 1 ); + if ( FAILED(hr) ) + return hr; + + const Image *tdest = rtemp.GetImage( 0, 0, 0 ); + if ( !tdest ) + return E_POINTER; + + hr = _PerformFlipRotateUsingWIC( *tsrc, flags, GUID_WICPixelFormat128bppRGBAFloat, *tdest ); + if ( FAILED(hr) ) + return hr; + + temp.Release(); + + hr = _ConvertFromR32G32B32A32( *tdest, destImage ); + if ( FAILED(hr) ) + return hr; + + return S_OK; +} + + +//===================================================================================== +// Entry-points +//===================================================================================== + +//------------------------------------------------------------------------------------- +// Flip/rotate image +//------------------------------------------------------------------------------------- +HRESULT FlipRotate( const Image& srcImage, DWORD flags, ScratchImage& image ) +{ + if ( !srcImage.pixels ) + return E_POINTER; + + if ( !flags ) + return E_INVALIDARG; + +#ifdef _AMD64_ + if ( (srcImage.width > 0xFFFFFFFF) || (srcImage.height > 0xFFFFFFFF) ) + return E_INVALIDARG; +#endif + + if ( IsCompressed( srcImage.format ) ) + { + // We don't support flip/rotate operations on compressed images + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + } + + static_assert( TEX_FR_ROTATE0 == WICBitmapTransformRotate0, "TEX_FR_ROTATE0 no longer matches WIC" ); + static_assert( TEX_FR_ROTATE90 == WICBitmapTransformRotate90, "TEX_FR_ROTATE90 no longer matches WIC" ); + static_assert( TEX_FR_ROTATE180 == WICBitmapTransformRotate180, "TEX_FR_ROTATE180 no longer matches WIC" ); + static_assert( TEX_FR_ROTATE270 == WICBitmapTransformRotate270, "TEX_FR_ROTATE270 no longer matches WIC" ); + static_assert( TEX_FR_FLIP_HORIZONTAL == WICBitmapTransformFlipHorizontal, "TEX_FR_FLIP_HORIZONTAL no longer matches WIC" ); + static_assert( TEX_FR_FLIP_VERTICAL == WICBitmapTransformFlipVertical, "TEX_FR_FLIP_VERTICAL no longer matches WIC" ); + + // Only supports 90, 180, 270, or no rotation flags... 
not a combination of rotation flags + switch ( flags & (TEX_FR_ROTATE90|TEX_FR_ROTATE180|TEX_FR_ROTATE270) ) + { + case 0: + case TEX_FR_ROTATE90: + case TEX_FR_ROTATE180: + case TEX_FR_ROTATE270: + break; + + default: + return E_INVALIDARG; + } + + size_t nwidth = srcImage.width; + size_t nheight = srcImage.height; + + if (flags & (TEX_FR_ROTATE90|TEX_FR_ROTATE270)) + { + nwidth = srcImage.height; + nheight = srcImage.width; + } + + HRESULT hr = image.Initialize2D( srcImage.format, nwidth, nheight, 1, 1 ); + if ( FAILED(hr) ) + return hr; + + const Image *rimage = image.GetImage( 0, 0, 0 ); + if ( !rimage ) + return E_POINTER; + + WICPixelFormatGUID pfGUID; + if ( _DXGIToWIC( srcImage.format, pfGUID ) ) + { + // Case 1: Source format is supported by Windows Imaging Component + hr = _PerformFlipRotateUsingWIC( srcImage, flags, pfGUID, *rimage ); + } + else + { + // Case 2: Source format is not supported by WIC, so we have to convert, flip/rotate, and convert back + hr = _PerformFlipRotateViaF32( srcImage, flags, *rimage ); + } + + if ( FAILED(hr) ) + { + image.Release(); + return hr; + } + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Flip/rotate image (complex) +//------------------------------------------------------------------------------------- +HRESULT FlipRotate( const Image* srcImages, size_t nimages, const TexMetadata& metadata, + DWORD flags, ScratchImage& result ) +{ + if ( !srcImages || !nimages ) + return E_INVALIDARG; + + if ( IsCompressed( metadata.format ) ) + { + // We don't support flip/rotate operations on compressed images + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + } + + static_assert( TEX_FR_ROTATE0 == WICBitmapTransformRotate0, "TEX_FR_ROTATE0 no longer matches WIC" ); + static_assert( TEX_FR_ROTATE90 == WICBitmapTransformRotate90, "TEX_FR_ROTATE90 no longer matches WIC" ); + static_assert( TEX_FR_ROTATE180 == WICBitmapTransformRotate180, "TEX_FR_ROTATE180 no longer matches WIC" ); + static_assert( TEX_FR_ROTATE270 == WICBitmapTransformRotate270, "TEX_FR_ROTATE270 no longer matches WIC" ); + static_assert( TEX_FR_FLIP_HORIZONTAL == WICBitmapTransformFlipHorizontal, "TEX_FR_FLIP_HORIZONTAL no longer matches WIC" ); + static_assert( TEX_FR_FLIP_VERTICAL == WICBitmapTransformFlipVertical, "TEX_FR_FLIP_VERTICAL no longer matches WIC" ); + + // Only supports 90, 180, 270, or no rotation flags... 
not a combination of rotation flags + switch ( flags & (TEX_FR_ROTATE90|TEX_FR_ROTATE180|TEX_FR_ROTATE270) ) + { + case 0: + case TEX_FR_ROTATE90: + case TEX_FR_ROTATE180: + case TEX_FR_ROTATE270: + break; + + default: + return E_INVALIDARG; + } + + TexMetadata mdata2 = metadata; + + bool flipwh = false; + if (flags & (TEX_FR_ROTATE90|TEX_FR_ROTATE270)) + { + flipwh = true; + mdata2.width = metadata.height; + mdata2.height = metadata.width; + } + + HRESULT hr = result.Initialize( mdata2 ); + if ( FAILED(hr) ) + return hr; + + if ( nimages != result.GetImageCount() ) + { + result.Release(); + return E_FAIL; + } + + const Image* dest = result.GetImages(); + if ( !dest ) + { + result.Release(); + return E_POINTER; + } + + WICPixelFormatGUID pfGUID; + bool wicpf = _DXGIToWIC( metadata.format, pfGUID ); + + for( size_t index=0; index < nimages; ++index ) + { + const Image& src = srcImages[ index ]; + if ( src.format != metadata.format ) + { + result.Release(); + return E_FAIL; + } + +#ifdef _AMD64_ + if ( (src.width > 0xFFFFFFFF) || (src.height > 0xFFFFFFFF) ) + return E_FAIL; +#endif + + const Image& dst = dest[ index ]; + assert( dst.format == metadata.format ); + + if ( flipwh ) + { + if ( src.width != dst.height || src.height != dst.width ) + { + result.Release(); + return E_FAIL; + } + } + else + { + if ( src.width != dst.width || src.height != dst.height ) + { + result.Release(); + return E_FAIL; + } + } + + if (wicpf) + { + // Case 1: Source format is supported by Windows Imaging Component + hr = _PerformFlipRotateUsingWIC( src, flags, pfGUID, dst ); + } + else + { + // Case 2: Source format is not supported by WIC, so we have to convert, flip/rotate, and convert back + hr = _PerformFlipRotateViaF32( src, flags, dst ); + } + + if ( FAILED(hr) ) + { + result.Release(); + return hr; + } + } + + return S_OK; +} + +}; // namespace diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexImage.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexImage.cpp new file mode 100644 index 0000000..fb1b383 --- /dev/null +++ b/thirdparty/directxtex/DirectXTex/DirectXTexImage.cpp @@ -0,0 +1,674 @@ +//------------------------------------------------------------------------------------- +// DirectXTexImage.cpp +// +// DirectX Texture Library - Image container +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// +// http://go.microsoft.com/fwlink/?LinkId=248926 +//------------------------------------------------------------------------------------- + +#include "directxtexp.h" + +namespace DirectX +{ + +extern bool _CalculateMipLevels( _In_ size_t width, _In_ size_t height, _Inout_ size_t& mipLevels ); +extern bool _CalculateMipLevels3D( _In_ size_t width, _In_ size_t height, _In_ size_t depth, _Inout_ size_t& mipLevels ); + +//------------------------------------------------------------------------------------- +// Determines number of image array entries and pixel size +//------------------------------------------------------------------------------------- +void _DetermineImageArray( const TexMetadata& metadata, DWORD cpFlags, + size_t& nImages, size_t& pixelSize ) +{ + assert( metadata.width > 0 && metadata.height > 0 && metadata.depth > 0 ); + assert( metadata.arraySize > 0 ); + assert( metadata.mipLevels > 0 ); + + size_t _pixelSize = 0; + size_t _nimages = 0; + + switch( metadata.dimension ) + { + case TEX_DIMENSION_TEXTURE1D: + case TEX_DIMENSION_TEXTURE2D: + for( size_t item = 0; item < metadata.arraySize; ++item ) + { + size_t w = metadata.width; + size_t h = metadata.height; + + for( size_t level=0; level < metadata.mipLevels; ++level ) + { + size_t rowPitch, slicePitch; + ComputePitch( metadata.format, w, h, rowPitch, slicePitch, cpFlags ); + + _pixelSize += slicePitch; + ++_nimages; + + if ( h > 1 ) + h >>= 1; + + if ( w > 1 ) + w >>= 1; + } + } + break; + + case TEX_DIMENSION_TEXTURE3D: + { + size_t w = metadata.width; + size_t h = metadata.height; + size_t d = metadata.depth; + + for( size_t level=0; level < metadata.mipLevels; ++level ) + { + size_t rowPitch, slicePitch; + ComputePitch( metadata.format, w, h, rowPitch, slicePitch, cpFlags ); + + for( size_t slice=0; slice < d; ++slice ) + { + _pixelSize += slicePitch; + ++_nimages; + } + + if ( h > 1 ) + h >>= 1; + + if ( w > 1 ) + w >>= 1; + + if ( d > 1 ) + d >>= 1; + } + } + break; + + default: + assert( false ); + break; + } + + nImages = _nimages; + pixelSize = _pixelSize; +} + + +//------------------------------------------------------------------------------------- +// Fills in the image array entries +//------------------------------------------------------------------------------------- +bool _SetupImageArray( uint8_t *pMemory, size_t pixelSize, + const TexMetadata& metadata, DWORD cpFlags, + Image* images, size_t nImages ) +{ + assert( pMemory ); + assert( pixelSize > 0 ); + assert( nImages > 0 ); + + if ( !images ) + return false; + + size_t index = 0; + uint8_t* pixels = pMemory; + const uint8_t* pEndBits = pMemory + pixelSize; + + switch( metadata.dimension ) + { + case TEX_DIMENSION_TEXTURE1D: + case TEX_DIMENSION_TEXTURE2D: + if (metadata.arraySize == 0 || metadata.mipLevels == 0) + { + return false; + } + + for( size_t item = 0; item < metadata.arraySize; ++item ) + { + size_t w = metadata.width; + size_t h = metadata.height; + + for( size_t level=0; level < metadata.mipLevels; ++level ) + { + if ( index >= nImages ) + { + return false; + } + + size_t rowPitch, slicePitch; + ComputePitch( metadata.format, w, h, rowPitch, slicePitch, cpFlags ); + + images[index].width = w; + images[index].height = h; + images[index].format = metadata.format; + images[index].rowPitch = rowPitch; + images[index].slicePitch = slicePitch; + images[index].pixels = pixels; + ++index; + + pixels += slicePitch; + if ( pixels > pEndBits ) + { + return false; + } + + if ( h > 1 ) + h >>= 1; + + if ( w > 1 ) + w >>= 1; + } + } + return true; 
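+    // [Editorial note, not part of the upstream sources] For volumes the image array below is
+    // filled mip-major: all depth slices of level 0 back to back, then level 1, and so on, with
+    // width, height and depth each halving (clamped to 1) per level. A 4x4x4 volume with a full
+    // mip chain therefore yields 4 + 2 + 1 = 7 Image entries.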
+ + case TEX_DIMENSION_TEXTURE3D: + { + if (metadata.mipLevels == 0 || metadata.depth == 0) + { + return false; + } + + size_t w = metadata.width; + size_t h = metadata.height; + size_t d = metadata.depth; + + for( size_t level=0; level < metadata.mipLevels; ++level ) + { + size_t rowPitch, slicePitch; + ComputePitch( metadata.format, w, h, rowPitch, slicePitch, cpFlags ); + + for( size_t slice=0; slice < d; ++slice ) + { + if ( index >= nImages ) + { + return false; + } + + // We use the same memory organization that Direct3D 11 needs for D3D11_SUBRESOURCE_DATA + // with all slices of a given miplevel being continuous in memory + images[index].width = w; + images[index].height = h; + images[index].format = metadata.format; + images[index].rowPitch = rowPitch; + images[index].slicePitch = slicePitch; + images[index].pixels = pixels; + ++index; + + pixels += slicePitch; + if ( pixels > pEndBits ) + { + return false; + } + } + + if ( h > 1 ) + h >>= 1; + + if ( w > 1 ) + w >>= 1; + + if ( d > 1 ) + d >>= 1; + } + } + return true; + + default: + return false; + } +} + + +//===================================================================================== +// ScratchImage - Bitmap image container +//===================================================================================== + +//------------------------------------------------------------------------------------- +// Methods +//------------------------------------------------------------------------------------- +HRESULT ScratchImage::Initialize( const TexMetadata& mdata ) +{ + if ( !IsValid(mdata.format) || IsVideo(mdata.format) ) + return E_INVALIDARG; + + size_t mipLevels = mdata.mipLevels; + + switch( mdata.dimension ) + { + case TEX_DIMENSION_TEXTURE1D: + if ( !mdata.width || mdata.height != 1 || mdata.depth != 1 || !mdata.arraySize ) + return E_INVALIDARG; + + if ( !_CalculateMipLevels(mdata.width,1,mipLevels) ) + return E_INVALIDARG; + break; + + case TEX_DIMENSION_TEXTURE2D: + if ( !mdata.width || !mdata.height || mdata.depth != 1 || !mdata.arraySize ) + return E_INVALIDARG; + + if ( mdata.miscFlags & TEX_MISC_TEXTURECUBE ) + { + if ( (mdata.arraySize % 6) != 0 ) + return E_INVALIDARG; + } + + if ( !_CalculateMipLevels(mdata.width,mdata.height,mipLevels) ) + return E_INVALIDARG; + break; + + case TEX_DIMENSION_TEXTURE3D: + if ( !mdata.width || !mdata.height || !mdata.depth || mdata.arraySize != 1 ) + return E_INVALIDARG; + + if ( !_CalculateMipLevels3D(mdata.width,mdata.height,mdata.depth,mipLevels) ) + return E_INVALIDARG; + break; + + default: + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + } + + Release(); + + _metadata.width = mdata.width; + _metadata.height = mdata.height; + _metadata.depth = mdata.depth; + _metadata.arraySize = mdata.arraySize; + _metadata.mipLevels = mipLevels; + _metadata.miscFlags = mdata.miscFlags & TEX_MISC_TEXTURECUBE; + _metadata.format = mdata.format; + _metadata.dimension = mdata.dimension; + + size_t pixelSize, nimages; + _DetermineImageArray( _metadata, CP_FLAGS_NONE, nimages, pixelSize ); + + _image = new Image[ nimages ]; + if ( !_image ) + return E_OUTOFMEMORY; + + _nimages = nimages; + memset( _image, 0, sizeof(Image) * nimages ); + + _memory = reinterpret_cast( _aligned_malloc( pixelSize, 16 ) ); + if ( !_memory ) + { + Release(); + return E_OUTOFMEMORY; + } + _size = pixelSize; + if ( !_SetupImageArray( _memory, pixelSize, _metadata, CP_FLAGS_NONE, _image, nimages ) ) + { + Release(); + return E_FAIL; + } + + return S_OK; +} + +HRESULT ScratchImage::Initialize1D( DXGI_FORMAT 
fmt, size_t length, size_t arraySize, size_t mipLevels ) +{ + if ( !IsValid(fmt) || IsVideo(fmt) || !length || !arraySize ) + return E_INVALIDARG; + + // 1D is a special case of the 2D case + HRESULT hr = Initialize2D( fmt, length, 1, arraySize, mipLevels ); + if ( FAILED(hr) ) + return hr; + + _metadata.dimension = TEX_DIMENSION_TEXTURE1D; + + return S_OK; +} + +HRESULT ScratchImage::Initialize2D( DXGI_FORMAT fmt, size_t width, size_t height, size_t arraySize, size_t mipLevels ) +{ + if ( !IsValid(fmt) || IsVideo(fmt) || !width || !height || !arraySize ) + return E_INVALIDARG; + + if ( !_CalculateMipLevels(width,height,mipLevels) ) + return E_INVALIDARG; + + Release(); + + _metadata.width = width; + _metadata.height = height; + _metadata.depth = 1; + _metadata.arraySize = arraySize; + _metadata.mipLevels = mipLevels; + _metadata.miscFlags = 0; + _metadata.format = fmt; + _metadata.dimension = TEX_DIMENSION_TEXTURE2D; + + size_t pixelSize, nimages; + _DetermineImageArray( _metadata, CP_FLAGS_NONE, nimages, pixelSize ); + + _image = new Image[ nimages ]; + if ( !_image ) + return E_OUTOFMEMORY; + + _nimages = nimages; + memset( _image, 0, sizeof(Image) * nimages ); + + _memory = reinterpret_cast( _aligned_malloc( pixelSize, 16 ) ); + if ( !_memory ) + { + Release(); + return E_OUTOFMEMORY; + } + _size = pixelSize; + if ( !_SetupImageArray( _memory, pixelSize, _metadata, CP_FLAGS_NONE, _image, nimages ) ) + { + Release(); + return E_FAIL; + } + + return S_OK; +} + +HRESULT ScratchImage::Initialize3D( DXGI_FORMAT fmt, size_t width, size_t height, size_t depth, size_t mipLevels ) +{ + if ( !IsValid(fmt) || IsVideo(fmt) || !width || !height || !depth ) + return E_INVALIDARG; + + if ( !_CalculateMipLevels3D(width,height,depth,mipLevels) ) + return E_INVALIDARG; + + Release(); + + _metadata.width = width; + _metadata.height = height; + _metadata.depth = depth; + _metadata.arraySize = 1; // Direct3D 10.x/11 does not support arrays of 3D textures + _metadata.mipLevels = mipLevels; + _metadata.miscFlags = 0; + _metadata.format = fmt; + _metadata.dimension = TEX_DIMENSION_TEXTURE3D; + + size_t pixelSize, nimages; + _DetermineImageArray( _metadata, CP_FLAGS_NONE, nimages, pixelSize ); + + _image = new Image[ nimages ]; + if ( !_image ) + { + Release(); + return E_OUTOFMEMORY; + } + _nimages = nimages; + memset( _image, 0, sizeof(Image) * nimages ); + + _memory = reinterpret_cast( _aligned_malloc( pixelSize, 16 ) ); + if ( !_memory ) + { + Release(); + return E_OUTOFMEMORY; + } + _size = pixelSize; + + if ( !_SetupImageArray( _memory, pixelSize, _metadata, CP_FLAGS_NONE, _image, nimages ) ) + { + Release(); + return E_FAIL; + } + + return S_OK; +} + +HRESULT ScratchImage::InitializeCube( DXGI_FORMAT fmt, size_t width, size_t height, size_t nCubes, size_t mipLevels ) +{ + if ( !IsValid(fmt) || IsVideo(fmt) || !width || !height || !nCubes ) + return E_INVALIDARG; + + // A DirectX11 cubemap is just a 2D texture array that is a multiple of 6 for each cube + HRESULT hr = Initialize2D( fmt, width, height, nCubes * 6, mipLevels ); + if ( FAILED(hr) ) + return hr; + + _metadata.miscFlags |= TEX_MISC_TEXTURECUBE; + + return S_OK; +} + +HRESULT ScratchImage::InitializeFromImage( const Image& srcImage, bool allow1D ) +{ + HRESULT hr = ( srcImage.height > 1 || !allow1D ) + ? 
Initialize2D( srcImage.format, srcImage.width, srcImage.height, 1, 1 ) + : Initialize1D( srcImage.format, srcImage.width, 1, 1 ); + + if ( FAILED(hr) ) + return hr; + + const uint8_t* sptr = reinterpret_cast( srcImage.pixels ); + if ( !sptr ) + return E_POINTER; + + uint8_t* dptr = reinterpret_cast( _image[0].pixels ); + if ( !dptr ) + return E_POINTER; + + for( size_t y = 0; y < srcImage.height; ++y ) + { + _CopyScanline( dptr, _image[0].rowPitch, sptr, srcImage.rowPitch, srcImage.format, TEXP_SCANLINE_NONE ); + sptr += srcImage.rowPitch; + dptr += _image[0].rowPitch; + } + + return S_OK; +} + +HRESULT ScratchImage::InitializeArrayFromImages( const Image* images, size_t nImages, bool allow1D ) +{ + if ( !images || !nImages ) + return E_INVALIDARG; + + DXGI_FORMAT format = images[0].format; + size_t width = images[0].width; + size_t height = images[0].height; + + for( size_t index=0; index < nImages; ++index ) + { + if ( !images[index].pixels ) + return E_POINTER; + + if ( images[index].format != format || images[index].width != width || images[index].height != height ) + { + // All images must be the same format, width, and height + return E_FAIL; + } + } + + HRESULT hr = ( height > 1 || !allow1D ) + ? Initialize2D( format, width, height, nImages, 1 ) + : Initialize1D( format, width, nImages, 1 ); + + if ( FAILED(hr) ) + return hr; + + for( size_t index=0; index < nImages; ++index ) + { + const uint8_t* sptr = reinterpret_cast( images[index].pixels ); + if ( !sptr ) + return E_POINTER; + + assert( index < _nimages ); + uint8_t* dptr = reinterpret_cast( _image[index].pixels ); + if ( !dptr ) + return E_POINTER; + + for( size_t y = 0; y < height; ++y ) + { + _CopyScanline( dptr, _image[index].rowPitch, sptr, images[index].rowPitch, format, TEXP_SCANLINE_NONE ); + sptr += images[index].rowPitch; + dptr += _image[index].rowPitch; + } + } + + return S_OK; +} + +HRESULT ScratchImage::InitializeCubeFromImages( const Image* images, size_t nImages ) +{ + if ( !images || !nImages ) + return E_INVALIDARG; + + // A DirectX11 cubemap is just a 2D texture array that is a multiple of 6 for each cube + if ( ( nImages % 6 ) != 0 ) + return E_INVALIDARG; + + HRESULT hr = InitializeArrayFromImages( images, nImages, false ); + if ( FAILED(hr) ) + return hr; + + _metadata.miscFlags |= TEX_MISC_TEXTURECUBE; + + return S_OK; +} + +HRESULT ScratchImage::Initialize3DFromImages( const Image* images, size_t depth ) +{ + if ( !images || !depth ) + return E_INVALIDARG; + + DXGI_FORMAT format = images[0].format; + size_t width = images[0].width; + size_t height = images[0].height; + + for( size_t slice=0; slice < depth; ++slice ) + { + if ( !images[slice].pixels ) + return E_POINTER; + + if ( images[slice].format != format || images[slice].width != width || images[slice].height != height ) + { + // All images must be the same format, width, and height + return E_FAIL; + } + } + + HRESULT hr = Initialize3D( format, width, height, depth, 1 ); + if ( FAILED(hr) ) + return hr; + + for( size_t slice=0; slice < depth; ++slice ) + { + const uint8_t* sptr = reinterpret_cast( images[slice].pixels ); + if ( !sptr ) + return E_POINTER; + + assert( slice < _nimages ); + uint8_t* dptr = reinterpret_cast( _image[slice].pixels ); + if ( !dptr ) + return E_POINTER; + + for( size_t y = 0; y < height; ++y ) + { + _CopyScanline( dptr, _image[slice].rowPitch, sptr, images[slice].rowPitch, format, TEXP_SCANLINE_NONE ); + sptr += images[slice].rowPitch; + dptr += _image[slice].rowPitch; + } + } + + return S_OK; +} + +void 
ScratchImage::Release() +{ + _nimages = 0; + _size = 0; + + if ( _image ) + { + delete [] _image; + _image = 0; + } + + if ( _memory ) + { + _aligned_free( _memory ); + _memory = 0; + } + + memset(&_metadata, 0, sizeof(_metadata)); +} + +bool ScratchImage::OverrideFormat( DXGI_FORMAT f ) +{ + if ( !_image ) + return false; + + if ( !IsValid( f ) || IsVideo( f ) ) + return false; + + if ( ( BitsPerPixel( f ) != BitsPerPixel( _metadata.format ) ) + || ( IsCompressed( f ) != IsCompressed( _metadata.format ) ) + || ( IsPacked( f ) != IsPacked( _metadata.format ) ) ) + { + // Can't change the effective pitch of the format this way + return false; + } + + for( size_t index = 0; index < _nimages; ++index ) + { + _image[ index ].format = f; + } + + _metadata.format = f; + + return true; +} + +const Image* ScratchImage::GetImage(size_t mip, size_t item, size_t slice) const +{ + if ( mip >= _metadata.mipLevels ) + return nullptr; + + size_t index = 0; + + switch( _metadata.dimension ) + { + case TEX_DIMENSION_TEXTURE1D: + case TEX_DIMENSION_TEXTURE2D: + if ( slice > 0 ) + return nullptr; + + if ( item >= _metadata.arraySize ) + return nullptr; + + index = item*( _metadata.mipLevels ) + mip; + break; + + case TEX_DIMENSION_TEXTURE3D: + if ( item > 0 ) + { + // No support for arrays of volumes + return nullptr; + } + else + { + size_t d = _metadata.depth; + + for( size_t level = 0; level < mip; ++level ) + { + index += d; + if ( d > 1 ) + d >>= 1; + } + + if ( slice >= d ) + return nullptr; + + index += slice; + } + break; + + default: + return nullptr; + } + + return &_image[index]; +} + +}; // namespace diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexMipmaps.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexMipmaps.cpp new file mode 100644 index 0000000..cb1c4c2 --- /dev/null +++ b/thirdparty/directxtex/DirectXTex/DirectXTexMipmaps.cpp @@ -0,0 +1,1163 @@ +//------------------------------------------------------------------------------------- +// DirectXTexMipMaps.cpp +// +// DirectX Texture Library - Mip-map generation +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// +// http://go.microsoft.com/fwlink/?LinkId=248926 +//------------------------------------------------------------------------------------- + +#include "directxtexp.h" + +namespace DirectX +{ + +static const XMVECTORF32 s_boxScale = { 0.25f, 0.25f, 0.25f, 0.25f }; +static const XMVECTORF32 s_boxScale3D = { 0.125f, 0.125f, 0.125f, 0.125f }; + +//------------------------------------------------------------------------------------- +// Mipmap helper functions +//------------------------------------------------------------------------------------- +inline static bool ispow2( _In_ size_t x ) +{ + return ((x != 0) && !(x & (x - 1))); +} + +static size_t _CountMips( _In_ size_t width, _In_ size_t height) +{ + size_t mipLevels = 1; + + while ( height > 1 || width > 1 ) + { + if ( height > 1 ) + height >>= 1; + + if ( width > 1 ) + width >>= 1; + + ++mipLevels; + } + + return mipLevels; +} + +bool _CalculateMipLevels( _In_ size_t width, _In_ size_t height, _Inout_ size_t& mipLevels ) +{ + if ( mipLevels > 1 ) + { + size_t maxMips = _CountMips(width,height); + if ( mipLevels > maxMips ) + return false; + } + else if ( mipLevels == 0 ) + { + mipLevels = _CountMips(width,height); + } + else + { + mipLevels = 1; + } + return true; +} + +static size_t _CountMips3D( _In_ size_t width, _In_ size_t height, _In_ size_t depth) +{ + size_t mipLevels = 1; + + while ( height > 1 || width > 1 || depth > 1 ) + { + if ( height > 1 ) + height >>= 1; + + if ( width > 1 ) + width >>= 1; + + if ( depth > 1 ) + depth >>= 1; + + ++mipLevels; + } + + return mipLevels; +} + +bool _CalculateMipLevels3D( _In_ size_t width, _In_ size_t height, _In_ size_t depth, _Inout_ size_t& mipLevels ) +{ + if ( mipLevels > 1 ) + { + if ( !ispow2(width) || !ispow2(height) || !ispow2(depth) ) + return false; + + size_t maxMips = _CountMips3D(width,height,depth); + if ( mipLevels > maxMips ) + return false; + } + else if ( mipLevels == 0 && ispow2(width) && ispow2(height) && ispow2(depth) ) + { + mipLevels = _CountMips3D(width,height,depth); + } + else + { + mipLevels = 1; + } + return true; +} + +static HRESULT _EnsureWicBitmapPixelFormat( _In_ IWICImagingFactory* pWIC, _In_ IWICBitmap* src, _In_ DWORD filter, + _In_ const WICPixelFormatGUID& desiredPixelFormat, + _Deref_out_ IWICBitmap** dest ) +{ + if ( !pWIC || !src || !dest ) + return E_POINTER; + + *dest = nullptr; + + WICPixelFormatGUID actualPixelFormat; + HRESULT hr = src->GetPixelFormat( &actualPixelFormat ); + + if ( SUCCEEDED(hr) ) + { + if ( memcmp( &actualPixelFormat, &desiredPixelFormat, sizeof(WICPixelFormatGUID) ) == 0 ) + { + src->AddRef(); + *dest = src; + } + else + { + ScopedObject converter; + hr = pWIC->CreateFormatConverter( &converter ); + if ( SUCCEEDED(hr) ) + { + hr = converter->Initialize( src, desiredPixelFormat, _GetWICDither(filter), 0, 0, WICBitmapPaletteTypeCustom ); + } + + if ( SUCCEEDED(hr) ) + { + hr = pWIC->CreateBitmapFromSource( converter.Get(), WICBitmapCacheOnDemand, dest ); + } + } + } + + return hr; +} + +HRESULT _ResizeSeparateColorAndAlpha( _In_ IWICImagingFactory* pWIC, _In_ IWICBitmap* original, + _In_ size_t newWidth, _In_ size_t newHeight, _In_ DWORD filter, _Inout_ const Image* img ) +{ + if ( !pWIC || !original || !img ) + return E_POINTER; + + const WICBitmapInterpolationMode interpolationMode = _GetWICInterp(filter); + + WICPixelFormatGUID desiredPixelFormat = GUID_WICPixelFormatUndefined; + HRESULT hr = original->GetPixelFormat( &desiredPixelFormat ); + + size_t colorBytesInPixel = 0; + size_t colorBytesPerPixel = 0; + size_t 
colorWithAlphaBytesPerPixel = 0; + WICPixelFormatGUID colorPixelFormat = GUID_WICPixelFormatUndefined; + WICPixelFormatGUID colorWithAlphaPixelFormat = GUID_WICPixelFormatUndefined; + + if ( SUCCEEDED(hr) ) + { + ScopedObject componentInfo; + hr = pWIC->CreateComponentInfo( desiredPixelFormat, &componentInfo ); + + ScopedObject pixelFormatInfo; + if ( SUCCEEDED(hr) ) + { + hr = componentInfo->QueryInterface( __uuidof(IWICPixelFormatInfo), (void**)&pixelFormatInfo ); + } + + UINT bitsPerPixel = 0; + if ( SUCCEEDED(hr) ) + { + hr = pixelFormatInfo->GetBitsPerPixel( &bitsPerPixel ); + } + + if ( SUCCEEDED(hr) ) + { + if ( bitsPerPixel <= 32 ) + { + colorBytesInPixel = colorBytesPerPixel = 3; + colorPixelFormat = GUID_WICPixelFormat24bppBGR; + + colorWithAlphaBytesPerPixel = 4; + colorWithAlphaPixelFormat = GUID_WICPixelFormat32bppBGRA; + } + else + { +#if(_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/) || defined(_WIN7_PLATFORM_UPDATE) + if ( _IsWIC2() ) + { + colorBytesInPixel = colorBytesPerPixel = 12; + colorPixelFormat = GUID_WICPixelFormat96bppRGBFloat; + } + else +#endif + { + colorBytesInPixel = 12; + colorBytesPerPixel = 16; + colorPixelFormat = GUID_WICPixelFormat128bppRGBFloat; + } + + colorWithAlphaBytesPerPixel = 16; + colorWithAlphaPixelFormat = GUID_WICPixelFormat128bppRGBAFloat; + } + } + } + + // Resize color only image (no alpha channel) + ScopedObject resizedColor; + if ( SUCCEEDED(hr) ) + { + ScopedObject colorScaler; + + hr = pWIC->CreateBitmapScaler(&colorScaler); + if ( SUCCEEDED(hr) ) + { + ScopedObject converted; + + hr = _EnsureWicBitmapPixelFormat( pWIC, original, filter, colorPixelFormat, &converted ); + if ( SUCCEEDED(hr) ) + { + hr = colorScaler->Initialize( converted.Get(), static_cast(newWidth), static_cast(newHeight), interpolationMode ); + } + } + + if ( SUCCEEDED(hr) ) + { + ScopedObject resized; + + hr = pWIC->CreateBitmapFromSource( colorScaler.Get(), WICBitmapCacheOnDemand, &resized ); + if ( SUCCEEDED(hr) ) + { + hr = _EnsureWicBitmapPixelFormat( pWIC, resized.Get(), filter, colorPixelFormat, &resizedColor ); + } + } + } + + // Resize color+alpha image + ScopedObject resizedColorWithAlpha; + if ( SUCCEEDED(hr) ) + { + ScopedObject colorWithAlphaScaler; + + hr = pWIC->CreateBitmapScaler( &colorWithAlphaScaler ); + if ( SUCCEEDED(hr) ) + { + ScopedObject converted; + + hr = _EnsureWicBitmapPixelFormat( pWIC, original, filter, colorWithAlphaPixelFormat, &converted ); + if ( SUCCEEDED(hr) ) + { + hr = colorWithAlphaScaler->Initialize( converted.Get(), static_cast(newWidth), static_cast(newHeight), interpolationMode ); + } + } + + if ( SUCCEEDED(hr) ) + { + ScopedObject resized; + + hr = pWIC->CreateBitmapFromSource( colorWithAlphaScaler.Get(), WICBitmapCacheOnDemand, &resized ); + if ( SUCCEEDED(hr) ) + { + hr = _EnsureWicBitmapPixelFormat( pWIC, resized.Get(), filter, colorWithAlphaPixelFormat, &resizedColorWithAlpha ); + } + } + } + + // Merge pixels (copying color channels from color only image to color+alpha image) + if ( SUCCEEDED(hr) ) + { + ScopedObject colorLock; + ScopedObject colorWithAlphaLock; + + hr = resizedColor->Lock( nullptr, WICBitmapLockRead, &colorLock ); + if ( SUCCEEDED(hr) ) + { + hr = resizedColorWithAlpha->Lock( nullptr, WICBitmapLockWrite, &colorWithAlphaLock ); + } + + if ( SUCCEEDED(hr) ) + { + WICInProcPointer colorWithAlphaData = nullptr; + UINT colorWithAlphaSizeInBytes = 0; + UINT colorWithAlphaStride = 0; + + hr = colorWithAlphaLock->GetDataPointer( &colorWithAlphaSizeInBytes, &colorWithAlphaData ); + if ( SUCCEEDED(hr) ) + { + 
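// [Editor's note - illustrative commentary, not part of the imported DirectXTex source.]
// The merge step below copies only the color bytes of each pixel from the color-only
// resize result into the color+alpha resize result: the color channels therefore come
// from a resize performed without an alpha channel, while the alpha value produced by
// the normal RGBA resize is kept (presumably to keep alpha from influencing the color
// filtering). For pixel (i, j):
//
//     colorIndex          = j * colorStride          + i * colorBytesPerPixel
//     colorWithAlphaIndex = j * colorWithAlphaStride + i * colorWithAlphaBytesPerPixel
//
// and only colorBytesInPixel bytes are copied. In the 8-bit case that is 3 bytes into a
// 4-byte BGRA pixel; in the 128bpp float fallback it is the first 12 bytes (RGB) of a
// 16-byte RGBA pixel, which is why colorBytesInPixel and colorBytesPerPixel are tracked
// separately above.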
if ( !colorWithAlphaData ) + { + hr = E_POINTER; + } + else + { + hr = colorWithAlphaLock->GetStride( &colorWithAlphaStride ); + } + } + + WICInProcPointer colorData = nullptr; + UINT colorSizeInBytes = 0; + UINT colorStride = 0; + if ( SUCCEEDED(hr) ) + { + hr = colorLock->GetDataPointer( &colorSizeInBytes, &colorData ); + if ( SUCCEEDED(hr) ) + { + if ( !colorData ) + { + hr = E_POINTER; + } + else + { + hr = colorLock->GetStride( &colorStride ); + } + } + } + + for ( size_t j = 0; SUCCEEDED(hr) && j < newHeight; j++ ) + { + for ( size_t i = 0; SUCCEEDED(hr) && i < newWidth; i++ ) + { + size_t colorWithAlphaIndex = (j * colorWithAlphaStride) + (i * colorWithAlphaBytesPerPixel); + size_t colorIndex = (j * colorStride) + (i * colorBytesPerPixel); + + if ( ((colorWithAlphaIndex + colorBytesInPixel) > colorWithAlphaSizeInBytes) + || ( (colorIndex + colorBytesPerPixel) > colorSizeInBytes) ) + { + hr = E_INVALIDARG; + } + else + { + memcpy_s( colorWithAlphaData + colorWithAlphaIndex, colorWithAlphaBytesPerPixel, colorData + colorIndex, colorBytesInPixel ); + } + } + } + } + } + + if ( SUCCEEDED(hr) ) + { + ScopedObject wicBitmap; + hr = _EnsureWicBitmapPixelFormat( pWIC, resizedColorWithAlpha.Get(), filter, desiredPixelFormat, &wicBitmap ); + if ( SUCCEEDED(hr) ) + { + hr = wicBitmap->CopyPixels( nullptr, static_cast(img->rowPitch), static_cast(img->slicePitch), img->pixels ); + } + } + + return hr; +} + + +//------------------------------------------------------------------------------------- +// Generate a (2D) mip-map chain from a base image using WIC's image scaler +//------------------------------------------------------------------------------------- +static HRESULT _GenerateMipMapsUsingWIC( _In_ const Image& baseImage, _In_ DWORD filter, _In_ size_t levels, + _In_ const WICPixelFormatGUID& pfGUID, _In_ const ScratchImage& mipChain, _In_ size_t item ) +{ + assert( levels > 1 ); + + if ( !baseImage.pixels || !mipChain.GetPixels() ) + return E_POINTER; + + IWICImagingFactory* pWIC = _GetWIC(); + if ( !pWIC ) + return E_NOINTERFACE; + + size_t width = baseImage.width; + size_t height = baseImage.height; + + ScopedObject source; + HRESULT hr = pWIC->CreateBitmapFromMemory( static_cast( width ), static_cast( height ), pfGUID, + static_cast( baseImage.rowPitch ), static_cast( baseImage.slicePitch ), + baseImage.pixels, &source ); + if ( FAILED(hr) ) + return hr; + + // Copy base image to top miplevel + const Image *img0 = mipChain.GetImage( 0, item, 0 ); + if ( !img0 ) + return E_POINTER; + + uint8_t* pDest = img0->pixels; + if ( !pDest ) + return E_POINTER; + + const uint8_t *pSrc = baseImage.pixels; + for( size_t h=0; h < height; ++h ) + { + size_t msize = std::min( img0->rowPitch, baseImage.rowPitch ); + memcpy_s( pDest, img0->rowPitch, pSrc, msize ); + pSrc += baseImage.rowPitch; + pDest += img0->rowPitch; + } + + ScopedObject componentInfo; + hr = pWIC->CreateComponentInfo( pfGUID, &componentInfo ); + if ( FAILED(hr) ) + return hr; + + ScopedObject pixelFormatInfo; + hr = componentInfo->QueryInterface( __uuidof(IWICPixelFormatInfo2), (void**)&pixelFormatInfo ); + if ( FAILED(hr) ) + return hr; + + BOOL supportsTransparency = FALSE; + hr = pixelFormatInfo->SupportsTransparency( &supportsTransparency ); + if ( FAILED(hr) ) + return hr; + + // Resize base image to each target mip level + for( size_t level = 1; level < levels; ++level ) + { + const Image *img = mipChain.GetImage( level, item, 0 ); + if ( !img ) + return E_POINTER; + + if ( height > 1 ) + height >>= 1; + + if ( width > 1 ) + 
width >>= 1; + + assert( img->width == width && img->height == height && img->format == baseImage.format ); + + if ( (filter & TEX_FILTER_SEPARATE_ALPHA) && supportsTransparency ) + { + hr = _ResizeSeparateColorAndAlpha( pWIC, source.Get(), width, height, filter, img ); + if ( FAILED(hr) ) + return hr; + } + else + { + ScopedObject scaler; + hr = pWIC->CreateBitmapScaler( &scaler ); + if ( FAILED(hr) ) + return hr; + + hr = scaler->Initialize( source.Get(), static_cast( width ), static_cast( height ), _GetWICInterp( filter ) ); + if ( FAILED(hr) ) + return hr; + + WICPixelFormatGUID pfScaler; + hr = scaler->GetPixelFormat( &pfScaler ); + if ( FAILED(hr) ) + return hr; + + if ( memcmp( &pfScaler, &pfGUID, sizeof(WICPixelFormatGUID) ) == 0 ) + { + hr = scaler->CopyPixels( 0, static_cast( img->rowPitch ), static_cast( img->slicePitch ), img->pixels ); + if ( FAILED(hr) ) + return hr; + } + else + { + // The WIC bitmap scaler is free to return a different pixel format than the source image, so here we + // convert it back + ScopedObject FC; + hr = pWIC->CreateFormatConverter( &FC ); + if ( FAILED(hr) ) + return hr; + + hr = FC->Initialize( scaler.Get(), pfGUID, _GetWICDither( filter ), 0, 0, WICBitmapPaletteTypeCustom ); + if ( FAILED(hr) ) + return hr; + + hr = FC->CopyPixels( 0, static_cast( img->rowPitch ), static_cast( img->slicePitch ), img->pixels ); + if ( FAILED(hr) ) + return hr; + } + } + } + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Generate volume mip-map helpers +//------------------------------------------------------------------------------------- +static HRESULT _Setup3DMips( _In_count_(depth) const Image* baseImages, _In_ size_t depth, size_t levels, + _Out_ ScratchImage& mipChain ) +{ + if ( !baseImages || !depth ) + return E_INVALIDARG; + + assert( levels > 1 ); + + size_t width = baseImages[0].width; + size_t height = baseImages[0].height; + + HRESULT hr = mipChain.Initialize3D( baseImages[0].format, width, height, depth, levels ); + if ( FAILED(hr) ) + return hr; + + // Copy base images to top slice + for( size_t slice=0; slice < depth; ++slice ) + { + const Image& src = baseImages[slice]; + + const Image *dest = mipChain.GetImage( 0, 0, slice ); + if ( !dest ) + { + mipChain.Release(); + return E_POINTER; + } + + assert( src.format == dest->format ); + + uint8_t* pDest = dest->pixels; + if ( !pDest ) + { + mipChain.Release(); + return E_POINTER; + } + + const uint8_t *pSrc = src.pixels; + size_t rowPitch = src.rowPitch; + for( size_t h=0; h < height; ++h ) + { + size_t msize = std::min( dest->rowPitch, rowPitch ); + memcpy_s( pDest, dest->rowPitch, pSrc, msize ); + pSrc += rowPitch; + pDest += dest->rowPitch; + } + } + + return S_OK; +} + +static HRESULT _Generate3DMipsPointFilter( _In_ size_t depth, _In_ size_t levels, _In_ const ScratchImage& mipChain ) +{ + if ( !depth || !mipChain.GetImages() ) + return E_INVALIDARG; + + // This assumes that the base images are already placed into the mipChain at the top level... 
(see _Setup3DMips) + + assert( levels > 1 ); + + size_t width = mipChain.GetMetadata().width; + size_t height = mipChain.GetMetadata().height; + + assert( ispow2(width) && ispow2(height) && ispow2(depth) ); + + // Allocate temporary space (2 scanlines) + ScopedAlignedArrayXMVECTOR scanline( reinterpret_cast( _aligned_malloc( (sizeof(XMVECTOR)*width*2), 16 ) ) ); + if ( !scanline ) + return E_OUTOFMEMORY; + + XMVECTOR* target = scanline.get(); + + XMVECTOR* row = target + width; + + // Resize base image to each target mip level + for( size_t level=1; level < levels; ++level ) + { + if ( depth > 1 ) + { + // 3D point filter + for( size_t slice=0; slice < depth; slice += 2 ) + { + const Image* src = mipChain.GetImage( level-1, 0, slice ); + const Image* dest = mipChain.GetImage( level, 0, slice >> 1 ); + + if ( !src || !dest ) + return E_POINTER; + + const uint8_t* pSrc = src->pixels; + uint8_t* pDest = dest->pixels; + + size_t rowPitch = src->rowPitch; + + size_t nheight = height >> 1; + + for( size_t y = 0; y < nheight; ++y ) + { + if ( !_LoadScanline( row, width, pSrc, rowPitch, src->format ) ) + return E_FAIL; + pSrc += rowPitch*2; + + size_t nwidth = width >> 1; + + for( size_t x = 0; x < nwidth; ++x ) + { + target[ x ] = row[ x*2 ]; + } + + if ( !_StoreScanline( pDest, dest->rowPitch, dest->format, target, nwidth ) ) + return E_FAIL; + pDest += dest->rowPitch; + } + } + } + else + { + // 2D point filter + const Image* src = mipChain.GetImage( level-1, 0, 0 ); + const Image* dest = mipChain.GetImage( level, 0, 0 ); + + if ( !src || !dest ) + return E_POINTER; + + const uint8_t* pSrc = src->pixels; + uint8_t* pDest = dest->pixels; + + size_t rowPitch = src->rowPitch; + + size_t nheight = height >> 1; + + for( size_t y = 0; y < nheight; ++y ) + { + if ( !_LoadScanline( row, width, pSrc, rowPitch, src->format ) ) + return E_FAIL; + pSrc += rowPitch*2; + + size_t nwidth = width >> 1; + + for( size_t x = 0; x < nwidth; ++x ) + { + target[ x ] = row[ x*2 ]; + } + + if ( !_StoreScanline( pDest, dest->rowPitch, dest->format, target, nwidth ) ) + return E_FAIL; + pDest += dest->rowPitch; + } + } + + if ( height > 1 ) + height >>= 1; + + if ( width > 1 ) + width >>= 1; + + if ( depth > 1 ) + depth >>= 1; + } + + assert( height == 1 && width == 1 && depth == 1 ); + + return S_OK; +} + +static HRESULT _Generate3DMipsBoxFilter( _In_ size_t depth, _In_ size_t levels, _In_ const ScratchImage& mipChain ) +{ + if ( !depth || !mipChain.GetImages() ) + return E_INVALIDARG; + + // This assumes that the base images are already placed into the mipChain at the top level... 
(see _Setup3DMips) + + assert( levels > 1 ); + + size_t width = mipChain.GetMetadata().width; + size_t height = mipChain.GetMetadata().height; + + assert( ispow2(width) && ispow2(height) && ispow2(depth) ); + + // Allocate temporary space (5 scanlines) + ScopedAlignedArrayXMVECTOR scanline( reinterpret_cast( _aligned_malloc( (sizeof(XMVECTOR)*width*5), 16 ) ) ); + if ( !scanline ) + return E_OUTOFMEMORY; + + XMVECTOR* target = scanline.get(); + + XMVECTOR* urow0 = target + width; + XMVECTOR* urow1 = target + width*2; + XMVECTOR* vrow0 = target + width*3; + XMVECTOR* vrow1 = target + width*4; + + const XMVECTOR* urow2 = urow0 + 1; + const XMVECTOR* urow3 = urow1 + 1; + const XMVECTOR* vrow2 = vrow0 + 1; + const XMVECTOR* vrow3 = vrow1 + 1; + + // Resize base image to each target mip level + for( size_t level=1; level < levels; ++level ) + { + if ( height == 1) + { + urow0 = vrow0; + urow1 = vrow1; + } + + if ( width == 1 ) + { + urow2 = urow0; + urow3 = urow1; + vrow2 = vrow0; + vrow3 = vrow1; + } + + if ( depth > 1 ) + { + // 3D box filter + for( size_t slice=0; slice < depth; slice += 2 ) + { + const Image* srca = mipChain.GetImage( level-1, 0, slice ); + const Image* srcb = mipChain.GetImage( level-1, 0, slice+1 ); + const Image* dest = mipChain.GetImage( level, 0, slice >> 1 ); + + if ( !srca || !srcb || !dest ) + return E_POINTER; + + const uint8_t* pSrc1 = srca->pixels; + const uint8_t* pSrc2 = srcb->pixels; + uint8_t* pDest = dest->pixels; + + size_t aRowPitch = srca->rowPitch; + size_t bRowPitch = srcb->rowPitch; + + size_t nheight = height >> 1; + + for( size_t y = 0; y < nheight; ++y ) + { + if ( !_LoadScanline( urow0, width, pSrc1, aRowPitch, srca->format ) ) + return E_FAIL; + pSrc1 += aRowPitch; + + if ( urow0 != urow1 ) + { + if ( !_LoadScanline( urow1, width, pSrc1, aRowPitch, srca->format ) ) + return E_FAIL; + pSrc1 += aRowPitch; + } + + if ( urow0 != vrow0 ) + { + if ( !_LoadScanline( vrow0, width, pSrc2, bRowPitch, srcb->format ) ) + return E_FAIL; + pSrc2 += bRowPitch; + } + + if ( urow0 != vrow1 && vrow0 != vrow1 ) + { + if ( !_LoadScanline( vrow1, width, pSrc2, bRowPitch, srcb->format ) ) + return E_FAIL; + pSrc2 += bRowPitch; + } + + size_t nwidth = width >> 1; + + for( size_t x = 0; x < nwidth; ++x ) + { + size_t x2 = x*2; + + // Box filter: Average 2x2x2 pixels + XMVECTOR v = XMVectorAdd( urow0[ x2 ], urow1[ x2 ] ); + v = XMVectorAdd( v, urow2[ x2 ] ); + v = XMVectorAdd( v, urow3[ x2 ] ); + v = XMVectorAdd( v, vrow0[ x2 ] ); + v = XMVectorAdd( v, vrow1[ x2 ] ); + v = XMVectorAdd( v, vrow2[ x2 ] ); + v = XMVectorAdd( v, vrow3[ x2 ] ); + + target[ x ] = XMVectorMultiply( v, s_boxScale3D ); + } + + if ( !_StoreScanline( pDest, dest->rowPitch, dest->format, target, nwidth ) ) + return E_FAIL; + pDest += dest->rowPitch; + } + } + } + else + { + // 2D box filter + const Image* src = mipChain.GetImage( level-1, 0, 0 ); + const Image* dest = mipChain.GetImage( level, 0, 0 ); + + if ( !src || !dest ) + return E_POINTER; + + const uint8_t* pSrc = src->pixels; + uint8_t* pDest = dest->pixels; + + size_t rowPitch = src->rowPitch; + + size_t nheight = height >> 1; + + for( size_t y = 0; y < nheight; ++y ) + { + if ( !_LoadScanline( urow0, width, pSrc, rowPitch, src->format ) ) + return E_FAIL; + pSrc += rowPitch; + + if ( urow0 != urow1 ) + { + if ( !_LoadScanline( urow1, width, pSrc, rowPitch, src->format ) ) + return E_FAIL; + pSrc += rowPitch; + } + + size_t nwidth = width >> 1; + + for( size_t x = 0; x < nwidth; ++x ) + { + size_t x2 = x*2; + + // Box filter: Average 2x2 
pixels + XMVECTOR v = XMVectorAdd( urow0[ x2 ], urow1[ x2 ] ); + v = XMVectorAdd( v, urow2[ x2 ] ); + v = XMVectorAdd( v, urow3[ x2 ] ); + + target[ x ] = XMVectorMultiply( v, s_boxScale ); + } + + if ( !_StoreScanline( pDest, dest->rowPitch, dest->format, target, nwidth ) ) + return E_FAIL; + pDest += dest->rowPitch; + } + } + + if ( height > 1 ) + height >>= 1; + + if ( width > 1 ) + width >>= 1; + + if ( depth > 1 ) + depth >>= 1; + } + + assert( height == 1 && width == 1 && depth == 1 ); + + return S_OK; +} + + +//===================================================================================== +// Entry-points +//===================================================================================== + +//------------------------------------------------------------------------------------- +// Generate mipmap chain +//------------------------------------------------------------------------------------- +HRESULT GenerateMipMaps( const Image& baseImage, DWORD filter, size_t levels, ScratchImage& mipChain, bool allow1D ) +{ + if ( !IsValid( baseImage.format ) ) + return E_INVALIDARG; + + if ( !baseImage.pixels ) + return E_POINTER; + + if ( !_CalculateMipLevels(baseImage.width, baseImage.height, levels) ) + return E_INVALIDARG; + + if ( IsCompressed( baseImage.format ) || IsVideo( baseImage.format ) ) + { + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + } + + static_assert( TEX_FILTER_POINT == 0x100000, "TEX_FILTER_ flag values don't match TEX_FILTER_MASK" ); + switch(filter & TEX_FILTER_MASK) + { + case 0: + case TEX_FILTER_POINT: + case TEX_FILTER_FANT: // Equivalent to Box filter + case TEX_FILTER_LINEAR: + case TEX_FILTER_CUBIC: + { + WICPixelFormatGUID pfGUID; + if ( _DXGIToWIC( baseImage.format, pfGUID ) ) + { + // Case 1: Base image format is supported by Windows Imaging Component + HRESULT hr = (baseImage.height > 1 || !allow1D) + ? 
mipChain.Initialize2D( baseImage.format, baseImage.width, baseImage.height, 1, levels ) + : mipChain.Initialize1D( baseImage.format, baseImage.width, 1, levels ); + if ( FAILED(hr) ) + return hr; + + return _GenerateMipMapsUsingWIC( baseImage, filter, levels, pfGUID, mipChain, 0 ); + } + else + { + // Case 2: Base image format is not supported by WIC, so we have to convert, generate, and convert back + assert( baseImage.format != DXGI_FORMAT_R32G32B32A32_FLOAT ); + ScratchImage temp; + HRESULT hr = _ConvertToR32G32B32A32( baseImage, temp ); + if ( FAILED(hr) ) + return hr; + + const Image *timg = temp.GetImage( 0, 0, 0 ); + if ( !timg ) + return E_POINTER; + + ScratchImage tMipChain; + hr = _GenerateMipMapsUsingWIC( *timg, filter, levels, GUID_WICPixelFormat128bppRGBAFloat, tMipChain, 0 ); + if ( FAILED(hr) ) + return hr; + + temp.Release(); + + return _ConvertFromR32G32B32A32( tMipChain.GetImages(), tMipChain.GetImageCount(), tMipChain.GetMetadata(), baseImage.format, mipChain ); + } + } + break; + + default: + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + } +} + +HRESULT GenerateMipMaps( const Image* srcImages, size_t nimages, const TexMetadata& metadata, + DWORD filter, size_t levels, ScratchImage& mipChain ) +{ + if ( !srcImages || !nimages || !IsValid(metadata.format) ) + return E_INVALIDARG; + + if ( metadata.dimension == TEX_DIMENSION_TEXTURE3D + || IsCompressed( metadata.format ) || IsVideo( metadata.format ) ) + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + + if ( !_CalculateMipLevels(metadata.width, metadata.height, levels) ) + return E_INVALIDARG; + + static_assert( TEX_FILTER_POINT == 0x100000, "TEX_FILTER_ flag values don't match TEX_FILTER_MASK" ); + switch(filter & TEX_FILTER_MASK) + { + case 0: + case TEX_FILTER_POINT: + case TEX_FILTER_FANT: // Equivalent to Box filter + case TEX_FILTER_LINEAR: + case TEX_FILTER_CUBIC: + { + WICPixelFormatGUID pfGUID; + if ( _DXGIToWIC( metadata.format, pfGUID ) ) + { + // Case 1: Base image format is supported by Windows Imaging Component + TexMetadata mdata2 = metadata; + mdata2.mipLevels = levels; + HRESULT hr = mipChain.Initialize( mdata2 ); + if ( FAILED(hr) ) + return hr; + + for( size_t item = 0; item < metadata.arraySize; ++item ) + { + size_t index = metadata.ComputeIndex( 0, item, 0 ); + if ( index >= nimages ) + { + mipChain.Release(); + return E_FAIL; + } + + const Image& baseImage = srcImages[ index ]; + + hr = _GenerateMipMapsUsingWIC( baseImage, filter, levels, pfGUID, mipChain, item ); + if ( FAILED(hr) ) + { + mipChain.Release(); + return hr; + } + } + + return S_OK; + } + else + { + // Case 2: Base image format is not supported by WIC, so we have to convert, generate, and convert back + assert( metadata.format != DXGI_FORMAT_R32G32B32A32_FLOAT ); + + TexMetadata mdata2 = metadata; + mdata2.mipLevels = levels; + mdata2.format = DXGI_FORMAT_R32G32B32A32_FLOAT; + ScratchImage tMipChain; + HRESULT hr = tMipChain.Initialize( mdata2 ); + if ( FAILED(hr) ) + return hr; + + for( size_t item = 0; item < metadata.arraySize; ++item ) + { + size_t index = metadata.ComputeIndex( 0, item, 0 ); + if ( index >= nimages ) + return E_FAIL; + + const Image& baseImage = srcImages[ index ]; + + ScratchImage temp; + hr = _ConvertToR32G32B32A32( baseImage, temp ); + if ( FAILED(hr) ) + return hr; + + const Image *timg = temp.GetImage( 0, 0, 0 ); + if ( !timg ) + return E_POINTER; + + hr = _GenerateMipMapsUsingWIC( *timg, filter, levels, GUID_WICPixelFormat128bppRGBAFloat, tMipChain, item ); + if ( FAILED(hr) ) + return hr; + } + + 
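// [Editor's note - illustrative commentary, not part of the imported DirectXTex source.]
// At this point every array item has been mipped into tMipChain, which holds the whole
// chain in the DXGI_FORMAT_R32G32B32A32_FLOAT working format; the statement below
// converts the complete chain back to the caller's original format in one pass. As a
// worked example of the level count (hypothetical 256x256 source with a 6-element array
// and levels == 0): _CalculateMipLevels sets levels to 9
// (256 -> 128 -> 64 -> 32 -> 16 -> 8 -> 4 -> 2 -> 1), so tMipChain ends up holding
// 6 * 9 = 54 images before the conversion back.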
return _ConvertFromR32G32B32A32( tMipChain.GetImages(), tMipChain.GetImageCount(), tMipChain.GetMetadata(), metadata.format, mipChain ); + } + } + break; + + default: + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );; + } +} + + +//------------------------------------------------------------------------------------- +// Generate mipmap chain for volume texture +//------------------------------------------------------------------------------------- +HRESULT GenerateMipMaps3D( const Image* baseImages, size_t depth, DWORD filter, size_t levels, ScratchImage& mipChain ) +{ + if ( !baseImages || !depth ) + return E_INVALIDARG; + + DXGI_FORMAT format = baseImages[0].format; + size_t width = baseImages[0].width; + size_t height = baseImages[0].height; + + if ( !ispow2(width) || !ispow2(height) || !ispow2(depth) ) + return E_INVALIDARG; + + if ( !_CalculateMipLevels3D(width, height, depth, levels) ) + return E_INVALIDARG; + + for( size_t slice=0; slice < depth; ++slice ) + { + if ( !baseImages[slice].pixels ) + return E_POINTER; + + if ( baseImages[slice].format != format || baseImages[slice].width != width || baseImages[slice].height != height ) + { + // All base images must be the same format, width, and height + return E_FAIL; + } + } + + if ( IsCompressed( format ) ) + { + // We don't support generating mipmaps from compressed images, as those should be generated before compression + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + } + + HRESULT hr; + + static_assert( TEX_FILTER_POINT == 0x100000, "TEX_FILTER_ flag values don't match TEX_FILTER_MASK" ); + switch( filter & TEX_FILTER_MASK ) + { + case 0: + case TEX_FILTER_FANT: + hr = _Setup3DMips( baseImages, depth, levels, mipChain ); + if ( FAILED(hr) ) + return hr; + + // For decimation, Fant is equivalent to a Box filter + hr = _Generate3DMipsBoxFilter( depth, levels, mipChain ); + if ( FAILED(hr) ) + mipChain.Release(); + return hr; + + case WIC_FLAGS_FILTER_POINT: + hr = _Setup3DMips( baseImages, depth, levels, mipChain ); + if ( FAILED(hr) ) + return hr; + + hr = _Generate3DMipsPointFilter( depth, levels, mipChain ); + if ( FAILED(hr) ) + mipChain.Release(); + return hr; + + case WIC_FLAGS_FILTER_LINEAR: + // Need to implement a 3D bi-linear filter (2x2x2) + return E_NOTIMPL; + + case WIC_FLAGS_FILTER_CUBIC: + // Need to implement a 3D bi-cubic filter (3x3x3) + return E_NOTIMPL; + + default: + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );; + } +} + +HRESULT GenerateMipMaps3D( const Image* srcImages, size_t nimages, const TexMetadata& metadata, + DWORD filter, size_t levels, ScratchImage& mipChain ) +{ + if ( !srcImages || !nimages || !IsValid(metadata.format) + || !ispow2(metadata.width) || !ispow2(metadata.height) || !ispow2(metadata.depth) ) + return E_INVALIDARG; + + if ( metadata.dimension != TEX_DIMENSION_TEXTURE3D + || IsCompressed( metadata.format ) || IsVideo( metadata.format ) ) + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + + if ( !_CalculateMipLevels3D(metadata.width, metadata.height, metadata.depth, levels) ) + return E_INVALIDARG; + + std::vector baseImages; + baseImages.reserve( metadata.depth ); + for( size_t slice=0; slice < metadata.depth; ++slice ) + { + size_t index = metadata.ComputeIndex( 0, 0, slice ); + if ( index >= nimages ) + return E_FAIL; + + const Image& src = srcImages[ index ]; + if ( !src.pixels ) + return E_POINTER; + + if ( src.format != metadata.format || src.width != metadata.width || src.height != metadata.height ) + { + // All base images must be the same format, width, and height + 
return E_FAIL; + } + + baseImages.push_back( src ); + } + + assert( baseImages.size() == metadata.depth ); + + HRESULT hr; + + static_assert( TEX_FILTER_POINT == 0x100000, "TEX_FILTER_ flag values don't match TEX_FILTER_MASK" ); + switch( filter & TEX_FILTER_MASK ) + { + case 0: + case TEX_FILTER_FANT: + hr = _Setup3DMips( &baseImages[0], metadata.depth, levels, mipChain ); + if ( FAILED(hr) ) + return hr; + + // For decimation, Fant is equivalent to a Box filter + hr = _Generate3DMipsBoxFilter( metadata.depth, levels, mipChain ); + if ( FAILED(hr) ) + mipChain.Release(); + return hr; + + case WIC_FLAGS_FILTER_POINT: + hr = _Setup3DMips( &baseImages[0], metadata.depth, levels, mipChain ); + if ( FAILED(hr) ) + return hr; + + hr = _Generate3DMipsPointFilter( metadata.depth, levels, mipChain ); + if ( FAILED(hr) ) + mipChain.Release(); + return hr; + + case WIC_FLAGS_FILTER_LINEAR: + // Need to implement a 3D bi-linear filter (2x2x2) + return E_NOTIMPL; + + case WIC_FLAGS_FILTER_CUBIC: + // Need to implement a 3D bi-cubic filter (3x3x3) + return E_NOTIMPL; + + default: + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );; + } +} + +}; // namespace diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexMisc.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexMisc.cpp new file mode 100644 index 0000000..f550c12 --- /dev/null +++ b/thirdparty/directxtex/DirectXTex/DirectXTexMisc.cpp @@ -0,0 +1,265 @@ +//------------------------------------------------------------------------------------- +// DirectXTexMisc.cpp +// +// DirectX Texture Library - Misc image operations +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// +// http://go.microsoft.com/fwlink/?LinkId=248926 +//------------------------------------------------------------------------------------- + +#include "directxtexp.h" + +namespace DirectX +{ + +//------------------------------------------------------------------------------------- +static HRESULT _ComputeMSE( _In_ const Image& image1, _In_ const Image& image2, + _Out_ float& mse, _Out_opt_cap_c_(4) float* mseV ) +{ + if ( !image1.pixels || !image2.pixels ) + return E_POINTER; + + assert( image1.width == image2.width && image1.height == image2.height ); + assert( !IsCompressed( image1.format ) && !IsCompressed( image2.format ) ); + + const size_t width = image1.width; + + ScopedAlignedArrayXMVECTOR scanline( reinterpret_cast( _aligned_malloc( (sizeof(XMVECTOR)*width)*2, 16 ) ) ); + if ( !scanline ) + return E_OUTOFMEMORY; + + const uint8_t *pSrc1 = image1.pixels; + const size_t rowPitch1 = image1.rowPitch; + + const uint8_t *pSrc2 = image2.pixels; + const size_t rowPitch2 = image2.rowPitch; + + XMVECTOR acc = XMVectorZero(); + + for( size_t h = 0; h < image1.height; ++h ) + { + XMVECTOR* ptr1 = scanline.get(); + if ( !_LoadScanline( ptr1, width, pSrc1, rowPitch1, image1.format ) ) + return E_FAIL; + + XMVECTOR* ptr2 = scanline.get() + width; + if ( !_LoadScanline( ptr2, width, pSrc2, rowPitch2, image2.format ) ) + return E_FAIL; + + for( size_t i = 0; i < width; ++i, ++ptr1, ++ptr2 ) + { + // sum[ (I1 - I2)^2 ] + XMVECTOR v = XMVectorSubtract( *ptr1, *ptr2 ); + acc = XMVectorMultiplyAdd( v, v, acc ); + } + + pSrc1 += rowPitch1; + pSrc2 += rowPitch2; + } + + // MSE = sum[ (I1 - I2)^2 ] / w*h + XMVECTOR d = XMVectorReplicate( float(image1.width * image1.height) ); + XMVECTOR v = XMVectorDivide( acc, d ); + if ( mseV ) + { + XMStoreFloat4( reinterpret_cast( mseV ), v ); + mse = mseV[0] + mseV[1] + mseV[2] + mseV[3]; + } + else + { + XMFLOAT4 _mseV; + XMStoreFloat4( &_mseV, v ); + mse = _mseV.x + _mseV.y + _mseV.z + _mseV.w; + } + + return S_OK; +} + + +//===================================================================================== +// Entry points +//===================================================================================== + +//------------------------------------------------------------------------------------- +// Copies a rectangle from one image into another +//------------------------------------------------------------------------------------- +HRESULT CopyRectangle( const Image& srcImage, const Rect& srcRect, const Image& dstImage, DWORD filter, size_t xOffset, size_t yOffset ) +{ + if ( !srcImage.pixels || !dstImage.pixels ) + return E_POINTER; + + if ( IsCompressed( srcImage.format ) || IsCompressed( dstImage.format ) ) + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + + // Validate rectangle/offset + if ( !srcRect.w || !srcRect.h || ( (srcRect.x + srcRect.w) > srcImage.width ) || ( (srcRect.y + srcRect.h) > srcImage.height ) ) + { + return E_INVALIDARG; + } + + if ( ( (xOffset + srcRect.w) > dstImage.width ) || ( (yOffset + srcRect.h) > dstImage.height ) ) + { + return E_INVALIDARG; + } + + // Compute source bytes-per-pixel + size_t sbpp = BitsPerPixel( srcImage.format ); + if ( !sbpp ) + return E_FAIL; + + if ( sbpp < 8 ) + { + // We don't support monochrome (DXGI_FORMAT_R1_UNORM) + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + } + + const uint8_t* pEndSrc = srcImage.pixels + srcImage.rowPitch*srcImage.height; + const uint8_t* pEndDest = dstImage.pixels + dstImage.rowPitch*dstImage.height; + + // Round to bytes + sbpp = ( sbpp + 7 ) / 8; + + const 
uint8_t* pSrc = srcImage.pixels + (srcRect.y * srcImage.rowPitch) + (srcRect.x * sbpp); + + if ( srcImage.format == dstImage.format ) + { + // Direct copy case (avoid intermediate conversions) + uint8_t* pDest = dstImage.pixels + (yOffset * dstImage.rowPitch) + (xOffset * sbpp); + const size_t copyW = srcRect.w * sbpp; + for( size_t h=0; h < srcRect.h; ++h ) + { + if ( ( (pSrc+copyW) > pEndSrc ) || (pDest > pEndDest) ) + return E_FAIL; + + memcpy_s( pDest, pEndDest - pDest, pSrc, copyW ); + + pSrc += srcImage.rowPitch; + pDest += dstImage.rowPitch; + } + + return S_OK; + } + + // Compute destination bytes-per-pixel (not the same format as source) + size_t dbpp = BitsPerPixel( dstImage.format ); + if ( !dbpp ) + return E_FAIL; + + if ( dbpp < 8 ) + { + // We don't support monochrome (DXGI_FORMAT_R1_UNORM) + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + } + + // Round to bytes + dbpp = ( dbpp + 7 ) / 8; + + uint8_t* pDest = dstImage.pixels + (yOffset * dstImage.rowPitch) + (xOffset * dbpp); + + ScopedAlignedArrayXMVECTOR scanline( reinterpret_cast( _aligned_malloc( (sizeof(XMVECTOR)*srcRect.w), 16 ) ) ); + if ( !scanline ) + return E_OUTOFMEMORY; + + const size_t copyS = srcRect.w * sbpp; + const size_t copyD = srcRect.w * dbpp; + + for( size_t h=0; h < srcRect.h; ++h ) + { + if ( ( (pSrc+copyS) > pEndSrc) || ((pDest+copyD) > pEndDest) ) + return E_FAIL; + + if ( !_LoadScanline( scanline.get(), srcRect.w, pSrc, copyS, srcImage.format ) ) + return E_FAIL; + + _ConvertScanline( scanline.get(), srcRect.w, dstImage.format, srcImage.format, filter ); + + if ( !_StoreScanline( pDest, copyD, dstImage.format, scanline.get(), srcRect.w ) ) + return E_FAIL; + + pSrc += srcImage.rowPitch; + pDest += dstImage.rowPitch; + } + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Computes the Mean-Squared-Error (MSE) between two images +//------------------------------------------------------------------------------------- +HRESULT ComputeMSE( const Image& image1, const Image& image2, float& mse, float* mseV ) +{ + if ( !image1.pixels || !image2.pixels ) + return E_POINTER; + + if ( image1.width != image2.width || image1.height != image2.height ) + return E_INVALIDARG; + + if ( IsCompressed(image1.format) ) + { + if ( IsCompressed(image2.format) ) + { + // Case 1: both images are compressed, expand to RGBA32F + ScratchImage temp1; + HRESULT hr = Decompress( image1, DXGI_FORMAT_R32G32B32A32_FLOAT, temp1 ); + if ( FAILED(hr) ) + return hr; + + ScratchImage temp2; + hr = Decompress( image2, DXGI_FORMAT_R32G32B32A32_FLOAT, temp2 ); + if ( FAILED(hr) ) + return hr; + + const Image* img1 = temp1.GetImage(0,0,0); + const Image* img2 = temp2.GetImage(0,0,0); + if ( !img1 || !img2 ) + return E_POINTER; + + return _ComputeMSE( *img1, *img2, mse, mseV ); + } + else + { + // Case 2: image1 is compressed, expand to RGBA32F + ScratchImage temp; + HRESULT hr = Decompress( image1, DXGI_FORMAT_R32G32B32A32_FLOAT, temp ); + if ( FAILED(hr) ) + return hr; + + const Image* img = temp.GetImage(0,0,0); + if ( !img ) + return E_POINTER; + + return _ComputeMSE( *img, image2, mse, mseV ); + } + } + else + { + if ( IsCompressed(image2.format) ) + { + // Case 3: image2 is compressed, expand to RGBA32F + ScratchImage temp; + HRESULT hr = Decompress( image2, DXGI_FORMAT_R32G32B32A32_FLOAT, temp ); + if ( FAILED(hr) ) + return hr; + + const Image* img = temp.GetImage(0,0,0); + if ( !img ) + return E_POINTER; + + return _ComputeMSE( image1, *img, mse, mseV ); + } + else 
+ { + // Case 4: neither image is compressed + return _ComputeMSE( image1, image2, mse, mseV ); + } + } +} + +}; // namespace diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexNormalMaps.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexNormalMaps.cpp new file mode 100644 index 0000000..7591b91 --- /dev/null +++ b/thirdparty/directxtex/DirectXTex/DirectXTexNormalMaps.cpp @@ -0,0 +1,377 @@ +//------------------------------------------------------------------------------------- +// DirectXTexNormalMaps.cpp +// +// DirectX Texture Library - Normal map operations +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkId=248926 +//------------------------------------------------------------------------------------- + +#include "directxtexp.h" + +namespace DirectX +{ + +#pragma prefast(suppress : 25000, "FXMVECTOR is 16 bytes") +static inline float _EvaluateColor( _In_ FXMVECTOR val, _In_ DWORD flags ) +{ + XMFLOAT4A f; + + static XMVECTORF32 lScale = { 0.2125f, 0.7154f, 0.0721f, 1.f }; + + static_assert( CNMAP_CHANNEL_RED == 0x1, "CNMAP_CHANNEL_ flag values don't match mask" ); + switch( flags & 0xf ) + { + case 0: + case CNMAP_CHANNEL_RED: return XMVectorGetX( val ); + case CNMAP_CHANNEL_GREEN: return XMVectorGetY( val ); + case CNMAP_CHANNEL_BLUE: return XMVectorGetZ( val ); + case CNMAP_CHANNEL_ALPHA: return XMVectorGetW( val ); + + case CNMAP_CHANNEL_LUMINANCE: + { + XMVECTOR v = XMVectorMultiply( val, lScale ); + XMStoreFloat4A( &f, v ); + return f.x + f.y + f.z; + } + break; + + default: + assert(false); + return 0.f; + } +} + +static void _EvaluateRow( _In_count_(width) const XMVECTOR* pSource, _Out_cap_(width+2) float* pDest, + _In_ size_t width, _In_ DWORD flags ) +{ + assert( pSource && pDest ); + assert( width > 0 ); + + for( size_t x = 0; x < width; ++x ) + { + pDest[x+1] = _EvaluateColor( pSource[x], flags ); + } + + if ( flags & CNMAP_MIRROR_U ) + { + // Mirror in U + pDest[0] = _EvaluateColor( pSource[0], flags ); + pDest[width+1] = _EvaluateColor( pSource[width-1], flags ); + } + else + { + // Wrap in U + pDest[0] = _EvaluateColor( pSource[width-1], flags ); + pDest[width+1] = _EvaluateColor( pSource[0], flags ); + } +} + +static HRESULT _ComputeNMap( _In_ const Image& srcImage, _In_ DWORD flags, _In_ float amplitude, + _In_ DXGI_FORMAT format, _In_ const Image& normalMap ) +{ + if ( !srcImage.pixels || !normalMap.pixels ) + return E_INVALIDARG; + + assert( !IsCompressed(format) && !IsTypeless( format ) ); + + const DWORD convFlags = _GetConvertFlags( format ); + if ( !convFlags ) + return E_FAIL; + + if ( !( convFlags & (CONVF_UNORM | CONVF_SNORM | CONVF_FLOAT) ) ) + HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + + const size_t width = srcImage.width; + const size_t height = srcImage.height; + if ( width != normalMap.width || height != normalMap.height ) + return E_FAIL; + + // Allocate temporary space (4 scanlines and 3 evaluated rows) + ScopedAlignedArrayXMVECTOR scanline( reinterpret_cast( _aligned_malloc( (sizeof(XMVECTOR)*width*4), 16 ) ) ); + if ( !scanline ) + return E_OUTOFMEMORY; + + ScopedAlignedArrayFloat buffer( reinterpret_cast( _aligned_malloc( ( ( sizeof(float) * ( width + 2 ) ) * 3 ), 16 ) ) ); + if ( !buffer ) + return E_OUTOFMEMORY; + + uint8_t* pDest = normalMap.pixels; 
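// [Editor's note - illustrative sketch, not part of the imported DirectXTex source.]
// The loop below computes each normal by central differencing over a 3x3 neighborhood
// of evaluated height values: val0/val1/val2 hold rows y-1, y and y+1 (wrapped or
// mirrored at the image edges), each padded by one border sample, so source pixel x is
// centered at index x+1. For pixel x:
//
//     deltaZX = ( (val0[x]-val0[x+2]) + (val1[x]-val1[x+2]) + (val2[x]-val2[x+2]) ) * amplitude / 6
//     deltaZY = ( (val0[x]-val2[x])   + (val0[x+1]-val2[x+1]) + (val0[x+2]-val2[x+2]) ) * amplitude / 6
//     normal  = normalize( cross( (-1, 0, deltaZX), (0, -1, deltaZY) ) )
//             = normalize( (deltaZX, deltaZY, 1) )
//
// so a flat height field yields (0, 0, 1). If, say, every sample is 0.3 higher than the
// sample to its left and amplitude is 1, each row difference is -0.3, deltaZX = -0.15,
// and the normal tilts back toward -X, away from the ascending slope, as expected.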
+ if ( !pDest ) + return E_POINTER; + + XMVECTOR* row0 = scanline.get(); + XMVECTOR* row1 = row0 + width; + XMVECTOR* row2 = row1 + width; + XMVECTOR* target = row2 + width; + + float* val0 = buffer.get(); + float* val1 = val0 + width + 2; + float* val2 = val1 + width + 2; + + const size_t rowPitch = srcImage.rowPitch; + const uint8_t* pSrc = srcImage.pixels; + + // Read first scanline row into 'row1' + if ( !_LoadScanline( row1, width, pSrc, rowPitch, srcImage.format ) ) + return E_FAIL; + + // Setup 'row0' + if ( flags & CNMAP_MIRROR_V ) + { + // Mirror first row + memcpy_s( row0, rowPitch, row1, rowPitch ); + } + else + { + // Read last row (Wrap V) + if ( !_LoadScanline( row0, width, pSrc + (rowPitch * (height-1)), rowPitch, srcImage.format ) ) + return E_FAIL; + } + + // Evaluate the initial rows + _EvaluateRow( row0, val0, width, flags ); + _EvaluateRow( row1, val1, width, flags ); + + pSrc += rowPitch; + + for( size_t y = 0; y < height; ++y ) + { + // Load next scanline of source image + if ( y < (height-1) ) + { + if ( !_LoadScanline( row2, width, pSrc, rowPitch, srcImage.format ) ) + return E_FAIL; + } + else + { + if ( flags & CNMAP_MIRROR_V ) + { + // Use last row of source image + if ( !_LoadScanline( row2, width, srcImage.pixels + (rowPitch * (height-1)), rowPitch, srcImage.format ) ) + return E_FAIL; + } + else + { + // Use first row of source image (Wrap V) + if ( !_LoadScanline( row2, width, srcImage.pixels, rowPitch, srcImage.format ) ) + return E_FAIL; + } + } + + // Evaluate row + _EvaluateRow( row2, val2, width, flags ); + + // Generate target scanline + XMVECTOR *dptr = target; + for( size_t x = 0; x < width; ++x ) + { + // Compute normal via central differencing + float totDelta = ( val0[x] - val0[x+2] ) + ( val1[x] - val1[x+2] ) + ( val2[x] - val2[x+2] ); + float deltaZX = totDelta * amplitude / 6.f; + + totDelta = ( val0[x] - val2[x] ) + ( val0[x+1] - val2[x+1] ) + ( val0[x+2] - val2[x+2] ); + float deltaZY = totDelta * amplitude / 6.f; + + XMVECTOR vx = XMVectorSetZ( g_XMNegIdentityR0, deltaZX ); // (-1.0f, 0.0f, deltaZX) + XMVECTOR vy = XMVectorSetZ( g_XMNegIdentityR1, deltaZY ); // (0.0f, -1.0f, deltaZY) + + XMVECTOR normal = XMVector3Normalize( XMVector3Cross( vx, vy ) ); + + // Compute alpha (1.0 or an occlusion term) + float alpha = 1.f; + + if ( flags & CNMAP_COMPUTE_OCCLUSION ) + { + float delta = 0.f; + float c = val1[x+1]; + + float t = val0[x] - c; if ( t > 0.f ) delta += t; + t = val0[x+1] - c; if ( t > 0.f ) delta += t; + t = val0[x+2] - c; if ( t > 0.f ) delta += t; + t = val1[x] - c; if ( t > 0.f ) delta += t; + // Skip current pixel + t = val1[x+2] - c; if ( t > 0.f ) delta += t; + t = val2[x] - c; if ( t > 0.f ) delta += t; + t = val2[x+1] - c; if ( t > 0.f ) delta += t; + t = val2[x+2] - c; if ( t > 0.f ) delta += t; + + // Average delta (divide by 8, scale by amplitude factor) + delta *= 0.125f * amplitude; + if ( delta > 0.f ) + { + // If < 0, then no occlusion + float r = sqrtf( 1.f + delta*delta ); + alpha = (r - delta) / r; + } + } + + // Encode based on target format + if ( convFlags & CONVF_UNORM ) + { + // 0.5f*normal + 0.5f -or- invert sign case: -0.5f*normal + 0.5f + XMVECTOR n1 = XMVectorMultiplyAdd( (flags & CNMAP_INVERT_SIGN) ? 
g_XMNegativeOneHalf : g_XMOneHalf, normal, g_XMOneHalf ); + *dptr++ = XMVectorSetW( n1, alpha ); + } + else if ( flags & CNMAP_INVERT_SIGN ) + { + *dptr++ = XMVectorSetW( XMVectorNegate( normal ), alpha ); + } + else + { + *dptr++ = XMVectorSetW( normal, alpha ); + } + } + + if ( !_StoreScanline( pDest, normalMap.rowPitch, format, target, width ) ) + return E_FAIL; + + // Cycle buffers + float* temp = val0; + val0 = val1; + val1 = val2; + val2 = temp; + + pSrc += rowPitch; + pDest += normalMap.rowPitch; + } + + return S_OK; +} + + +//===================================================================================== +// Entry points +//===================================================================================== + +//------------------------------------------------------------------------------------- +// Generates a normal map from a height-map +//------------------------------------------------------------------------------------- +HRESULT ComputeNormalMap( const Image& srcImage, DWORD flags, float amplitude, + DXGI_FORMAT format, ScratchImage& normalMap ) +{ + if ( !srcImage.pixels || !IsValid(format) || IsCompressed( format ) || IsTypeless( format ) ) + return E_INVALIDARG; + + static_assert( CNMAP_CHANNEL_RED == 0x1, "CNMAP_CHANNEL_ flag values don't match mask" ); + switch( flags & 0xf ) + { + case 0: + case CNMAP_CHANNEL_RED: + case CNMAP_CHANNEL_GREEN: + case CNMAP_CHANNEL_BLUE: + case CNMAP_CHANNEL_ALPHA: + case CNMAP_CHANNEL_LUMINANCE: + break; + + default: + return E_INVALIDARG; + } + + if ( IsCompressed( srcImage.format ) || IsTypeless( srcImage.format ) ) + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + + // Setup target image + normalMap.Release(); + + HRESULT hr = normalMap.Initialize2D( format, srcImage.width, srcImage.height, 1, 1 ); + if ( FAILED(hr) ) + return hr; + + const Image *img = normalMap.GetImage( 0, 0, 0 ); + if ( !img ) + { + normalMap.Release(); + return E_POINTER; + } + + hr = _ComputeNMap( srcImage, flags, amplitude, format, *img ); + if ( FAILED(hr) ) + { + normalMap.Release(); + return hr; + } + + return S_OK; +} + +HRESULT ComputeNormalMap( const Image* srcImages, size_t nimages, const TexMetadata& metadata, + DWORD flags, float amplitude, DXGI_FORMAT format, ScratchImage& normalMaps ) +{ + if ( !srcImages || !nimages ) + return E_INVALIDARG; + + if ( !IsValid(format) || IsCompressed(format) || IsTypeless(format) ) + return E_INVALIDARG; + + static_assert( CNMAP_CHANNEL_RED == 0x1, "CNMAP_CHANNEL_ flag values don't match mask" ); + switch( flags & 0xf ) + { + case 0: + case CNMAP_CHANNEL_RED: + case CNMAP_CHANNEL_GREEN: + case CNMAP_CHANNEL_BLUE: + case CNMAP_CHANNEL_ALPHA: + case CNMAP_CHANNEL_LUMINANCE: + break; + + default: + return E_INVALIDARG; + } + + normalMaps.Release(); + + TexMetadata mdata2 = metadata; + mdata2.format = format; + HRESULT hr = normalMaps.Initialize( mdata2 ); + if ( FAILED(hr) ) + return hr; + + if ( nimages != normalMaps.GetImageCount() ) + { + normalMaps.Release(); + return E_FAIL; + } + + const Image* dest = normalMaps.GetImages(); + if ( !dest ) + { + normalMaps.Release(); + return E_POINTER; + } + + for( size_t index=0; index < nimages; ++index ) + { + assert( dest[ index ].format == format ); + + const Image& src = srcImages[ index ]; + if ( IsCompressed( src.format ) || IsTypeless( src.format ) ) + { + normalMaps.Release(); + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + } + + if ( src.width != dest[ index ].width || src.height != dest[ index ].height ) + { + normalMaps.Release(); + return E_FAIL; + 
} + + hr = _ComputeNMap( src, flags, amplitude, format, dest[ index ] ); + if ( FAILED(hr) ) + { + normalMaps.Release(); + return hr; + } + } + + return S_OK; +} + +}; // namespace diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexP.h b/thirdparty/directxtex/DirectXTex/DirectXTexP.h new file mode 100644 index 0000000..b530254 --- /dev/null +++ b/thirdparty/directxtex/DirectXTex/DirectXTexP.h @@ -0,0 +1,197 @@ +//------------------------------------------------------------------------------------- +// DirectXTexp.h +// +// DirectX Texture Library - Private header +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkId=248926 +//------------------------------------------------------------------------------------- + +#if defined(_MSC_VER) && (_MSC_VER > 1000) +#pragma once +#endif + +#define NOMINMAX +#include + +#ifdef USE_XNAMATH +#include +#else +#include +#include +#endif + +#include + +#include +#include + +#include + +#include +#include + +#include + +#if (_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/) || defined(_WIN7_PLATFORM_UPDATE) +#include +#endif + +#pragma warning(push) +#pragma warning(disable : 4005) +#include +#pragma warning(pop) + +#if (_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/) && !defined(DXGI_1_2_FORMATS) +#define DXGI_1_2_FORMATS +#endif + +#include "directxtex.h" + +#include "scoped.h" + +struct IWICImagingFactory; + +#define TEX_FILTER_MASK 0xF00000 + +namespace DirectX +{ + //--------------------------------------------------------------------------------- + // WIC helper functions + DXGI_FORMAT _WICToDXGI( _In_ const GUID& guid ); + bool _DXGIToWIC( _In_ DXGI_FORMAT format, _Out_ GUID& guid ); + + IWICImagingFactory* _GetWIC(); + + bool _IsWIC2(); + + inline WICBitmapDitherType _GetWICDither( _In_ DWORD flags ) + { + static_assert( TEX_FILTER_DITHER == 0x10000, "TEX_FILTER_DITHER* flag values don't match mask" ); + + static_assert( TEX_FILTER_DITHER == WIC_FLAGS_DITHER, "TEX_FILTER_DITHER* should match WIC_FLAGS_DITHER*" ); + static_assert( TEX_FILTER_DITHER_DIFFUSION == WIC_FLAGS_DITHER_DIFFUSION, "TEX_FILTER_DITHER* should match WIC_FLAGS_DITHER*" ); + + switch( flags & 0xF0000 ) + { + case TEX_FILTER_DITHER: + return WICBitmapDitherTypeOrdered4x4; + + case TEX_FILTER_DITHER_DIFFUSION: + return WICBitmapDitherTypeErrorDiffusion; + + default: + return WICBitmapDitherTypeNone; + } + } + + inline WICBitmapInterpolationMode _GetWICInterp( _In_ DWORD flags ) + { + static_assert( TEX_FILTER_POINT == 0x100000, "TEX_FILTER_ flag values don't match TEX_FILTER_MASK" ); + + static_assert( TEX_FILTER_POINT == WIC_FLAGS_FILTER_POINT, "TEX_FILTER_* flags should match WIC_FLAGS_FILTER_*" ); + static_assert( TEX_FILTER_LINEAR == WIC_FLAGS_FILTER_LINEAR, "TEX_FILTER_* flags should match WIC_FLAGS_FILTER_*" ); + static_assert( TEX_FILTER_CUBIC == WIC_FLAGS_FILTER_CUBIC, "TEX_FILTER_* flags should match WIC_FLAGS_FILTER_*" ); + static_assert( TEX_FILTER_FANT == WIC_FLAGS_FILTER_FANT, "TEX_FILTER_* flags should match WIC_FLAGS_FILTER_*" ); + + switch( flags & TEX_FILTER_MASK ) + { + case TEX_FILTER_POINT: + return WICBitmapInterpolationModeNearestNeighbor; + + case TEX_FILTER_LINEAR: + return WICBitmapInterpolationModeLinear; + + case TEX_FILTER_CUBIC: + return 
WICBitmapInterpolationModeCubic; + + case TEX_FILTER_FANT: + default: + return WICBitmapInterpolationModeFant; + } + } + + //--------------------------------------------------------------------------------- + // Image helper functions + void _DetermineImageArray( _In_ const TexMetadata& metadata, _In_ DWORD cpFlags, + _Out_ size_t& nImages, _Out_ size_t& pixelSize ); + + bool _SetupImageArray( _In_bytecount_(pixelSize) uint8_t *pMemory, _In_ size_t pixelSize, + _In_ const TexMetadata& metadata, _In_ DWORD cpFlags, + _Out_cap_(nImages) Image* images, _In_ size_t nImages ); + + //--------------------------------------------------------------------------------- + // Conversion helper functions + + enum TEXP_SCANLINE_FLAGS + { + TEXP_SCANLINE_NONE = 0, + TEXP_SCANLINE_SETALPHA = 0x1, // Set alpha channel to known opaque value + TEXP_SCANLINE_LEGACY = 0x2, // Enables specific legacy format conversion cases + }; + + enum CONVERT_FLAGS + { + CONVF_FLOAT = 0x1, + CONVF_UNORM = 0x2, + CONVF_UINT = 0x4, + CONVF_SNORM = 0x8, + CONVF_SINT = 0x10, + CONVF_DEPTH = 0x20, + CONVF_STENCIL = 0x40, + CONVF_SHAREDEXP = 0x80, + CONVF_BGR = 0x100, + CONVF_X2 = 0x200, + CONVF_PACKED = 0x400, + CONVF_BC = 0x800, + CONVF_R = 0x10000, + CONVF_G = 0x20000, + CONVF_B = 0x40000, + CONVF_A = 0x80000, + CONVF_RGB_MASK = 0x70000, + CONVF_RGBA_MASK = 0xF0000, + }; + + DWORD _GetConvertFlags( _In_ DXGI_FORMAT format ); + + void _CopyScanline( _Out_bytecap_(outSize) LPVOID pDestination, _In_ size_t outSize, + _In_bytecount_(inSize) LPCVOID pSource, _In_ size_t inSize, + _In_ DXGI_FORMAT format, _In_ DWORD flags ); + + void _SwizzleScanline( _Out_bytecap_(outSize) LPVOID pDestination, _In_ size_t outSize, + _In_bytecount_(inSize) LPCVOID pSource, _In_ size_t inSize, + _In_ DXGI_FORMAT format, _In_ DWORD flags ); + + bool _ExpandScanline( _Out_bytecap_(outSize) LPVOID pDestination, _In_ size_t outSize, + _In_ DXGI_FORMAT outFormat, + _In_bytecount_(inSize) LPCVOID pSource, _In_ size_t inSize, + _In_ DXGI_FORMAT inFormat, _In_ DWORD flags ); + + bool _LoadScanline( _Out_cap_(count) XMVECTOR* pDestination, _In_ size_t count, + _In_bytecount_(size) LPCVOID pSource, _In_ size_t size, _In_ DXGI_FORMAT format ); + + bool _StoreScanline( _Out_bytecap_(size) LPVOID pDestination, _In_ size_t size, _In_ DXGI_FORMAT format, + _In_count_(count) const XMVECTOR* pSource, _In_ size_t count ); + + HRESULT _ConvertToR32G32B32A32( _In_ const Image& srcImage, _Inout_ ScratchImage& image ); + + HRESULT _ConvertFromR32G32B32A32( _In_ const Image& srcImage, _In_ const Image& destImage ); + HRESULT _ConvertFromR32G32B32A32( _In_ const Image& srcImage, _In_ DXGI_FORMAT format, _Inout_ ScratchImage& image ); + HRESULT _ConvertFromR32G32B32A32( _In_count_(nimages) const Image* srcImages, _In_ size_t nimages, _In_ const TexMetadata& metadata, + _In_ DXGI_FORMAT format, _Out_ ScratchImage& result ); + + void _ConvertScanline( _Inout_count_(count) XMVECTOR* pBuffer, _In_ size_t count, + _In_ DXGI_FORMAT outFormat, _In_ DXGI_FORMAT inFormat, _In_ DWORD flags ); + + //--------------------------------------------------------------------------------- + // DDS helper functions + HRESULT _EncodeDDSHeader( _In_ const TexMetadata& metadata, DWORD flags, + _Out_opt_cap_x_(maxsize) LPVOID pDestination, _In_ size_t maxsize, _Out_ size_t& required ); + +}; // namespace diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexResize.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexResize.cpp new file mode 100644 index 0000000..f30afc3 --- /dev/null +++ 
b/thirdparty/directxtex/DirectXTex/DirectXTexResize.cpp @@ -0,0 +1,358 @@ +//------------------------------------------------------------------------------------- +// DirectXTexResize.cpp +// +// DirectX Texture Library - Image resizing operations +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkId=248926 +//------------------------------------------------------------------------------------- + +#include "directxtexp.h" + +namespace DirectX +{ + +extern HRESULT _ResizeSeparateColorAndAlpha( _In_ IWICImagingFactory* pWIC, _In_ IWICBitmap* original, + _In_ size_t newWidth, _In_ size_t newHeight, _In_ DWORD filter, _Inout_ const Image* img ); + +//------------------------------------------------------------------------------------- +// Do image resize using WIC +//------------------------------------------------------------------------------------- +static HRESULT _PerformResizeUsingWIC( _In_ const Image& srcImage, _In_ DWORD filter, + _In_ const WICPixelFormatGUID& pfGUID, _In_ const Image& destImage ) +{ + if ( !srcImage.pixels || !destImage.pixels ) + return E_POINTER; + + assert( srcImage.format == destImage.format ); + + IWICImagingFactory* pWIC = _GetWIC(); + if ( !pWIC ) + return E_NOINTERFACE; + + ScopedObject componentInfo; + HRESULT hr = pWIC->CreateComponentInfo( pfGUID, &componentInfo ); + if ( FAILED(hr) ) + return hr; + + ScopedObject pixelFormatInfo; + hr = componentInfo->QueryInterface( __uuidof(IWICPixelFormatInfo2), (void**)&pixelFormatInfo ); + if ( FAILED(hr) ) + return hr; + + BOOL supportsTransparency = FALSE; + hr = pixelFormatInfo->SupportsTransparency( &supportsTransparency ); + if ( FAILED(hr) ) + return hr; + + ScopedObject source; + hr = pWIC->CreateBitmapFromMemory( static_cast( srcImage.width ), static_cast( srcImage.height ), pfGUID, + static_cast( srcImage.rowPitch ), static_cast( srcImage.slicePitch ), + srcImage.pixels, &source ); + if ( FAILED(hr) ) + return hr; + + if ( (filter & TEX_FILTER_SEPARATE_ALPHA) && supportsTransparency ) + { + hr = _ResizeSeparateColorAndAlpha( pWIC, source.Get(), destImage.width, destImage.height, filter, &destImage ); + if ( FAILED(hr) ) + return hr; + } + else + { + ScopedObject scaler; + hr = pWIC->CreateBitmapScaler( &scaler ); + if ( FAILED(hr) ) + return hr; + + hr = scaler->Initialize( source.Get(), static_cast( destImage.width ), static_cast( destImage.height ), _GetWICInterp( filter ) ); + if ( FAILED(hr) ) + return hr; + + WICPixelFormatGUID pfScaler; + hr = scaler->GetPixelFormat( &pfScaler ); + if ( FAILED(hr) ) + return hr; + + if ( memcmp( &pfScaler, &pfGUID, sizeof(WICPixelFormatGUID) ) == 0 ) + { + hr = scaler->CopyPixels( 0, static_cast( destImage.rowPitch ), static_cast( destImage.slicePitch ), destImage.pixels ); + if ( FAILED(hr) ) + return hr; + } + else + { + // The WIC bitmap scaler is free to return a different pixel format than the source image, so here we + // convert it back + ScopedObject FC; + hr = pWIC->CreateFormatConverter( &FC ); + if ( FAILED(hr) ) + return hr; + + hr = FC->Initialize( scaler.Get(), pfGUID, _GetWICDither( filter ), 0, 0, WICBitmapPaletteTypeCustom ); + if ( FAILED(hr) ) + return hr; + + hr = FC->CopyPixels( 0, static_cast( destImage.rowPitch ), static_cast( destImage.slicePitch ), 
destImage.pixels ); + if ( FAILED(hr) ) + return hr; + } + } + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Do conversion, resize using WIC, conversion cycle +//------------------------------------------------------------------------------------- +static HRESULT _PerformResizeViaF32( _In_ const Image& srcImage, _In_ DWORD filter, _In_ const Image& destImage ) +{ + if ( !srcImage.pixels || !destImage.pixels ) + return E_POINTER; + + assert( srcImage.format != DXGI_FORMAT_R32G32B32A32_FLOAT ); + assert( srcImage.format == destImage.format ); + + ScratchImage temp; + HRESULT hr = _ConvertToR32G32B32A32( srcImage, temp ); + if ( FAILED(hr) ) + return hr; + + const Image *tsrc = temp.GetImage( 0, 0, 0 ); + if ( !tsrc ) + return E_POINTER; + + ScratchImage rtemp; + hr = rtemp.Initialize2D( DXGI_FORMAT_R32G32B32A32_FLOAT, destImage.width, destImage.height, 1, 1 ); + if ( FAILED(hr) ) + return hr; + + const Image *tdest = rtemp.GetImage( 0, 0, 0 ); + if ( !tdest ) + return E_POINTER; + + hr = _PerformResizeUsingWIC( *tsrc, filter, GUID_WICPixelFormat128bppRGBAFloat, *tdest ); + if ( FAILED(hr) ) + return hr; + + temp.Release(); + + hr = _ConvertFromR32G32B32A32( *tdest, destImage ); + if ( FAILED(hr) ) + return hr; + + return S_OK; +} + + +//===================================================================================== +// Entry-points +//===================================================================================== + +//------------------------------------------------------------------------------------- +// Resize image +//------------------------------------------------------------------------------------- +HRESULT Resize( const Image& srcImage, size_t width, size_t height, DWORD filter, ScratchImage& image ) +{ + if ( width == 0 || height == 0 ) + return E_INVALIDARG; + +#ifdef _AMD64_ + if ( (srcImage.width > 0xFFFFFFFF) || (srcImage.height > 0xFFFFFFFF) ) + return E_INVALIDARG; + + if ( (width > 0xFFFFFFFF) || (height > 0xFFFFFFFF) ) + return E_INVALIDARG; +#endif + + if ( !srcImage.pixels ) + return E_POINTER; + + if ( IsCompressed( srcImage.format ) ) + { + // We don't support resizing compressed images + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + } + + HRESULT hr = image.Initialize2D( srcImage.format, width, height, 1, 1 ); + if ( FAILED(hr) ) + return hr; + + const Image *rimage = image.GetImage( 0, 0, 0 ); + if ( !rimage ) + return E_POINTER; + + // WIC only supports CLAMP + + WICPixelFormatGUID pfGUID; + if ( _DXGIToWIC( srcImage.format, pfGUID ) ) + { + // Case 1: Source format is supported by Windows Imaging Component + hr = _PerformResizeUsingWIC( srcImage, filter, pfGUID, *rimage ); + } + else + { + // Case 2: Source format is not supported by WIC, so we have to convert, resize, and convert back + hr = _PerformResizeViaF32( srcImage, filter, *rimage ); + } + + if ( FAILED(hr) ) + { + image.Release(); + return hr; + } + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Resize image (complex) +//------------------------------------------------------------------------------------- +HRESULT Resize( const Image* srcImages, size_t nimages, const TexMetadata& metadata, + size_t width, size_t height, DWORD filter, ScratchImage& result ) +{ + if ( !srcImages || !nimages || width == 0 || height == 0 ) + return E_INVALIDARG; + +#ifdef _AMD64_ + if ( (width > 0xFFFFFFFF) || (height > 0xFFFFFFFF) ) + return E_INVALIDARG; +#endif + + 
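// [Editor's note - illustrative commentary, not part of the imported DirectXTex source.]
// The metadata copy below rewrites width/height, keeps the source arraySize/depth, and
// forces mipLevels to 1: only the top mip of each array item (or volume slice) is
// resized, and any existing mip chain is discarded. A minimal usage sketch for the
// 1D/2D array case (editor's example, variable names hypothetical), using only
// functions declared elsewhere in this patch, resizes first and then rebuilds the mips:
//
//     ScratchImage resized, mipped;
//     HRESULT hr = Resize( imgs, nimages, meta, 512, 512, TEX_FILTER_FANT, resized );
//     if ( SUCCEEDED(hr) )
//         hr = GenerateMipMaps( resized.GetImages(), resized.GetImageCount(),
//                               resized.GetMetadata(), TEX_FILTER_FANT, 0, mipped );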
TexMetadata mdata2 = metadata; + mdata2.width = width; + mdata2.height = height; + mdata2.mipLevels = 1; + HRESULT hr = result.Initialize( mdata2 ); + if ( FAILED(hr) ) + return hr; + + WICPixelFormatGUID pfGUID; + bool wicpf = _DXGIToWIC( metadata.format, pfGUID ); + + switch ( metadata.dimension ) + { + case TEX_DIMENSION_TEXTURE1D: + case TEX_DIMENSION_TEXTURE2D: + assert( metadata.depth == 1 ); + + for( size_t item = 0; item < metadata.arraySize; ++item ) + { + size_t srcIndex = metadata.ComputeIndex( 0, item, 0 ); + if ( srcIndex >= nimages ) + { + result.Release(); + return E_FAIL; + } + + const Image* srcimg = &srcImages[ srcIndex ]; + const Image* destimg = result.GetImage( 0, item, 0 ); + if ( !srcimg || !destimg ) + { + result.Release(); + return E_POINTER; + } + + if ( srcimg->format != metadata.format ) + { + result.Release(); + return E_FAIL; + } + +#ifdef _AMD64_ + if ( (srcimg->width > 0xFFFFFFFF) || (srcimg->height > 0xFFFFFFFF) ) + { + result.Release(); + return E_FAIL; + } +#endif + + if ( wicpf ) + { + // Case 1: Source format is supported by Windows Imaging Component + hr = _PerformResizeUsingWIC( *srcimg, filter, pfGUID, *destimg ); + } + else + { + // Case 2: Source format is not supported by WIC, so we have to convert, resize, and convert back + hr = _PerformResizeViaF32( *srcimg, filter, *destimg ); + } + + if ( FAILED(hr) ) + { + result.Release(); + return hr; + } + } + break; + + case TEX_DIMENSION_TEXTURE3D: + assert( metadata.arraySize == 1 ); + + for( size_t slice = 0; slice < metadata.depth; ++slice ) + { + size_t srcIndex = metadata.ComputeIndex( 0, 0, slice ); + if ( srcIndex >= nimages ) + { + result.Release(); + return E_FAIL; + } + + const Image* srcimg = &srcImages[ srcIndex ]; + const Image* destimg = result.GetImage( 0, 0, slice ); + if ( !srcimg || !destimg ) + { + result.Release(); + return E_POINTER; + } + + if ( srcimg->format != metadata.format ) + { + result.Release(); + return E_FAIL; + } + +#ifdef _AMD64_ + if ( (srcimg->width > 0xFFFFFFFF) || (srcimg->height > 0xFFFFFFFF) ) + { + result.Release(); + return E_FAIL; + } +#endif + + if ( wicpf ) + { + // Case 1: Source format is supported by Windows Imaging Component + hr = _PerformResizeUsingWIC( *srcimg, filter, pfGUID, *destimg ); + } + else + { + // Case 2: Source format is not supported by WIC, so we have to convert, resize, and convert back + hr = _PerformResizeViaF32( *srcimg, filter, *destimg ); + } + + if ( FAILED(hr) ) + { + result.Release(); + return hr; + } + } + break; + + default: + result.Release(); + return E_FAIL; + } + + return S_OK; +} + +}; // namespace diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexTGA.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexTGA.cpp new file mode 100644 index 0000000..87cc43f --- /dev/null +++ b/thirdparty/directxtex/DirectXTex/DirectXTexTGA.cpp @@ -0,0 +1,1387 @@ +//------------------------------------------------------------------------------------- +// DirectXTexTGA.cpp +// +// DirectX Texture Library - Targa Truevision (TGA) file format reader/writer +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// +// http://go.microsoft.com/fwlink/?LinkId=248926 +//------------------------------------------------------------------------------------- + +#include "directxtexp.h" + +// +// The implementation here has the following limitations: +// * Does not support files that contain color maps (these are rare in practice) +// * Interleaved files are not supported (deprecated aspect of TGA format) +// * Only supports 8-bit greyscale; 16-, 24-, and 32-bit truecolor images +// * Always writes uncompressed files (i.e. can read RLE compression, but does not write it) +// + +enum TGAImageType +{ + TGA_NO_IMAGE = 0, + TGA_COLOR_MAPPED = 1, + TGA_TRUECOLOR = 2, + TGA_BLACK_AND_WHITE = 3, + TGA_COLOR_MAPPED_RLE = 9, + TGA_TRUECOLOR_RLE = 10, + TGA_BLACK_AND_WHITE_RLE = 11, +}; + +enum TGADescriptorFlags +{ + TGA_FLAGS_INVERTX = 0x10, + TGA_FLAGS_INVERTY = 0x20, + TGA_FLAGS_INTERLEAVED_2WAY = 0x40, // Deprecated + TGA_FLAGS_INTERLEAVED_4WAY = 0x80, // Deprecated +}; + +const char* g_TGA20_Signature = "TRUEVISION-XFILE."; + +#pragma pack(push,1) +struct TGA_HEADER +{ + uint8_t bIDLength; + uint8_t bColorMapType; + uint8_t bImageType; + uint16_t wColorMapFirst; + uint16_t wColorMapLength; + uint8_t bColorMapSize; + uint16_t wXOrigin; + uint16_t wYOrigin; + uint16_t wWidth; + uint16_t wHeight; + uint8_t bBitsPerPixel; + uint8_t bDescriptor; +}; + +struct TGA_FOOTER +{ + uint16_t dwExtensionOffset; + uint16_t dwDeveloperOffset; + char Signature[18]; +}; + +struct TGA_EXTENSION +{ + uint16_t wSize; + char szAuthorName[41]; + char szAuthorComment[324]; + uint16_t wStampMonth; + uint16_t wStampDay; + uint16_t wStampYear; + uint16_t wStampHour; + uint16_t wStampMinute; + uint16_t wStampSecond; + char szJobName[41]; + uint16_t wJobHour; + uint16_t wJobMinute; + uint16_t wJobSecond; + char szSoftwareId[41]; + uint16_t wVersionNumber; + uint8_t bVersionLetter; + uint32_t dwKeyColor; + uint16_t wPixelNumerator; + uint16_t wPixelDenominator; + uint16_t wGammaNumerator; + uint16_t wGammaDenominator; + uint32_t dwColorOffset; + uint32_t dwStampOffset; + uint32_t dwScanOffset; + uint8_t bAttributesType; +}; +#pragma pack(pop) + +enum CONVERSION_FLAGS +{ + CONV_FLAGS_NONE = 0x0, + CONV_FLAGS_EXPAND = 0x1, // Conversion requires expanded pixel size + CONV_FLAGS_INVERTX = 0x2, // If set, scanlines are right-to-left + CONV_FLAGS_INVERTY = 0x4, // If set, scanlines are top-to-bottom + CONV_FLAGS_RLE = 0x8, // Source data is RLE compressed + + CONV_FLAGS_SWIZZLE = 0x10000, // Swizzle BGR<->RGB data + CONV_FLAGS_888 = 0x20000, // 24bpp format +}; + +namespace DirectX +{ + +//------------------------------------------------------------------------------------- +// Decodes TGA header +//------------------------------------------------------------------------------------- +static HRESULT _DecodeTGAHeader( _In_bytecount_(size) LPCVOID pSource, size_t size, _Out_ TexMetadata& metadata, size_t& offset, + _Inout_opt_ DWORD* convFlags ) +{ + if ( !pSource ) + return E_INVALIDARG; + + memset( &metadata, 0, sizeof(TexMetadata) ); + + if ( size < sizeof(TGA_HEADER) ) + { + return HRESULT_FROM_WIN32( ERROR_INVALID_DATA ); + } + + const TGA_HEADER* pHeader = reinterpret_cast( pSource ); + assert( pHeader ); + + if ( pHeader->bColorMapType != 0 + || pHeader->wColorMapLength != 0 ) + { + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + } + + if ( pHeader->bDescriptor & (TGA_FLAGS_INTERLEAVED_2WAY|TGA_FLAGS_INTERLEAVED_4WAY) ) + { + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + } + + if ( !pHeader->wWidth || !pHeader->wHeight ) + { + 
return HRESULT_FROM_WIN32( ERROR_INVALID_DATA ); + } + + switch ( pHeader->bImageType ) + { + case TGA_TRUECOLOR: + case TGA_TRUECOLOR_RLE: + switch( pHeader->bBitsPerPixel ) + { + case 16: + metadata.format = DXGI_FORMAT_B5G5R5A1_UNORM; + break; + + case 24: + metadata.format = DXGI_FORMAT_R8G8B8A8_UNORM; + if ( convFlags ) + *convFlags |= CONV_FLAGS_EXPAND; + // We could use DXGI_FORMAT_B8G8R8X8_UNORM, but we prefer DXGI 1.0 formats + break; + + case 32: + metadata.format = DXGI_FORMAT_R8G8B8A8_UNORM; + // We could use DXGI_FORMAT_B8G8R8A8_UNORM, but we prefer DXGI 1.0 formats + break; + } + + if ( convFlags && (pHeader->bImageType == TGA_TRUECOLOR_RLE) ) + { + *convFlags |= CONV_FLAGS_RLE; + } + break; + + case TGA_BLACK_AND_WHITE: + case TGA_BLACK_AND_WHITE_RLE: + switch( pHeader->bBitsPerPixel ) + { + case 8: + metadata.format = DXGI_FORMAT_R8_UNORM; + break; + + default: + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + } + + if ( convFlags && (pHeader->bImageType == TGA_BLACK_AND_WHITE_RLE) ) + { + *convFlags |= CONV_FLAGS_RLE; + } + break; + + case TGA_NO_IMAGE: + case TGA_COLOR_MAPPED: + case TGA_COLOR_MAPPED_RLE: + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + + default: + return HRESULT_FROM_WIN32( ERROR_INVALID_DATA ); + } + + metadata.width = pHeader->wWidth; + metadata.height = pHeader->wHeight; + metadata.depth = metadata.arraySize = metadata.mipLevels = 1; + metadata.dimension = TEX_DIMENSION_TEXTURE2D; + + if ( convFlags ) + { + if ( pHeader->bDescriptor & TGA_FLAGS_INVERTX ) + *convFlags |= CONV_FLAGS_INVERTX; + + if ( pHeader->bDescriptor & TGA_FLAGS_INVERTY ) + *convFlags |= CONV_FLAGS_INVERTY; + } + + offset = sizeof( TGA_HEADER ); + + if ( pHeader->bIDLength != 0 ) + { + offset += pHeader->bIDLength; + } + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Set alpha for images with all 0 alpha channel +//------------------------------------------------------------------------------------- +static HRESULT _SetAlphaChannelToOpaque( _In_ const Image* image ) +{ + assert( image ); + + uint8_t* pPixels = reinterpret_cast( image->pixels ); + if ( !pPixels ) + return E_POINTER; + + for( size_t y = 0; y < image->height; ++y ) + { + _CopyScanline( pPixels, image->rowPitch, pPixels, image->rowPitch, image->format, TEXP_SCANLINE_SETALPHA ); + pPixels += image->rowPitch; + } + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Uncompress pixel data from a TGA into the target image +//------------------------------------------------------------------------------------- +static HRESULT _UncompressPixels( _In_bytecount_(size) LPCVOID pSource, size_t size, _In_ const Image* image, DWORD convFlags ) +{ + assert( pSource && size > 0 ); + + if ( !image || !image->pixels ) + return E_POINTER; + + // Compute TGA image data pitch + size_t rowPitch; + if ( convFlags & CONV_FLAGS_EXPAND ) + { + rowPitch = image->width * 3; + } + else + { + size_t slicePitch; + ComputePitch( image->format, image->width, image->height, rowPitch, slicePitch, CP_FLAGS_NONE ); + } + + const uint8_t* sPtr = reinterpret_cast( pSource ); + const uint8_t* endPtr = sPtr + size; + + switch( image->format ) + { + //--------------------------------------------------------------------------- 8-bit + case DXGI_FORMAT_R8_UNORM: + for( size_t y=0; y < image->height; ++y ) + { + size_t offset = ( (convFlags & CONV_FLAGS_INVERTX ) ? 
(image->width - 1) : 0 ); + assert( offset < rowPitch); + + uint8_t* dPtr = reinterpret_cast( image->pixels ) + + ( image->rowPitch * ( (convFlags & CONV_FLAGS_INVERTY) ? y : (image->height - y - 1) ) ) + + offset; + + for( size_t x=0; x < image->width; ) + { + if ( sPtr >= endPtr ) + return E_FAIL; + + if ( *sPtr & 0x80 ) + { + // Repeat + size_t j = (*sPtr & 0x7F) + 1; + if ( ++sPtr >= endPtr ) + return E_FAIL; + + for( ; j > 0; --j, ++x ) + { + if ( x >= image->width ) + return E_FAIL; + + *dPtr = *sPtr; + + if ( convFlags & CONV_FLAGS_INVERTX ) + --dPtr; + else + ++dPtr; + } + + ++sPtr; + } + else + { + // Literal + size_t j = (*sPtr & 0x7F) + 1; + ++sPtr; + + if ( sPtr+j > endPtr ) + return E_FAIL; + + for( ; j > 0; --j, ++x ) + { + if ( x >= image->width ) + return E_FAIL; + + *dPtr = *(sPtr++); + + if ( convFlags & CONV_FLAGS_INVERTX ) + --dPtr; + else + ++dPtr; + } + } + } + } + break; + + //-------------------------------------------------------------------------- 16-bit + case DXGI_FORMAT_B5G5R5A1_UNORM: + { + bool nonzeroa = false; + for( size_t y=0; y < image->height; ++y ) + { + size_t offset = ( (convFlags & CONV_FLAGS_INVERTX ) ? (image->width - 1) : 0 ); + assert( offset*2 < rowPitch); + + uint16_t* dPtr = reinterpret_cast( reinterpret_cast( image->pixels ) + + ( image->rowPitch * ( (convFlags & CONV_FLAGS_INVERTY) ? y : (image->height - y - 1) ) ) ) + + offset; + + for( size_t x=0; x < image->width; ) + { + if ( sPtr >= endPtr ) + return E_FAIL; + + if ( *sPtr & 0x80 ) + { + // Repeat + size_t j = (*sPtr & 0x7F) + 1; + ++sPtr; + + if ( sPtr+1 >= endPtr ) + return E_FAIL; + + uint16_t t = *sPtr | (*(sPtr+1) << 8); + if ( t & 0x8000 ) + nonzeroa = true; + sPtr += 2; + + for( ; j > 0; --j, ++x ) + { + if ( x >= image->width ) + return E_FAIL; + + *dPtr = t; + + if ( convFlags & CONV_FLAGS_INVERTX ) + --dPtr; + else + ++dPtr; + } + } + else + { + // Literal + size_t j = (*sPtr & 0x7F) + 1; + ++sPtr; + + if ( sPtr+(j*2) > endPtr ) + return E_FAIL; + + for( ; j > 0; --j, ++x ) + { + if ( x >= image->width ) + return E_FAIL; + + uint16_t t = *sPtr | (*(sPtr+1) << 8); + if ( t & 0x8000 ) + nonzeroa = true; + sPtr += 2; + *dPtr = t; + + if ( convFlags & CONV_FLAGS_INVERTX ) + --dPtr; + else + ++dPtr; + } + } + } + } + + // If there are no non-zero alpha channel entries, we'll assume alpha is not used and force it to opaque + if ( !nonzeroa ) + { + HRESULT hr = _SetAlphaChannelToOpaque( image ); + if ( FAILED(hr) ) + return hr; + } + } + break; + + //----------------------------------------------------------------------- 24/32-bit + case DXGI_FORMAT_R8G8B8A8_UNORM: + { + bool nonzeroa = false; + for( size_t y=0; y < image->height; ++y ) + { + size_t offset = ( (convFlags & CONV_FLAGS_INVERTX ) ? (image->width - 1) : 0 ); + + uint32_t* dPtr = reinterpret_cast( reinterpret_cast( image->pixels ) + + ( image->rowPitch * ( (convFlags & CONV_FLAGS_INVERTY) ? 
y : (image->height - y - 1) ) ) ) + + offset; + + for( size_t x=0; x < image->width; ) + { + if ( sPtr >= endPtr ) + return E_FAIL; + + if ( *sPtr & 0x80 ) + { + // Repeat + size_t j = (*sPtr & 0x7F) + 1; + ++sPtr; + + DWORD t; + if ( convFlags & CONV_FLAGS_EXPAND ) + { + assert( offset*3 < rowPitch); + + if ( sPtr+2 >= endPtr ) + return E_FAIL; + + // BGR -> RGBA + t = ( *sPtr << 16 ) | ( *(sPtr+1) << 8 ) | ( *(sPtr+2) ) | 0xFF000000; + sPtr += 3; + + nonzeroa = true; + } + else + { + assert( offset*4 < rowPitch); + + if ( sPtr+3 >= endPtr ) + return E_FAIL; + + // BGRA -> RGBA + t = ( *sPtr << 16 ) | ( *(sPtr+1) << 8 ) | ( *(sPtr+2) ) | ( *(sPtr+3) << 24 ); + + if ( *(sPtr+3) > 0 ) + nonzeroa = true; + + sPtr += 4; + } + + for( ; j > 0; --j, ++x ) + { + if ( x >= image->width ) + return E_FAIL; + + *dPtr = t; + + if ( convFlags & CONV_FLAGS_INVERTX ) + --dPtr; + else + ++dPtr; + } + } + else + { + // Literal + size_t j = (*sPtr & 0x7F) + 1; + ++sPtr; + + if ( convFlags & CONV_FLAGS_EXPAND ) + { + if ( sPtr+(j*3) > endPtr ) + return E_FAIL; + } + else + { + if ( sPtr+(j*4) > endPtr ) + return E_FAIL; + } + + for( ; j > 0; --j, ++x ) + { + if ( x >= image->width ) + return E_FAIL; + + if ( convFlags & CONV_FLAGS_EXPAND ) + { + assert( offset*3 < rowPitch); + + if ( sPtr+2 >= endPtr ) + return E_FAIL; + + // BGR -> RGBA + *dPtr = ( *sPtr << 16 ) | ( *(sPtr+1) << 8 ) | ( *(sPtr+2) ) | 0xFF000000; + sPtr += 3; + + nonzeroa = true; + } + else + { + assert( offset*4 < rowPitch); + + if ( sPtr+3 >= endPtr ) + return E_FAIL; + + // BGRA -> RGBA + *dPtr = ( *sPtr << 16 ) | ( *(sPtr+1) << 8 ) | ( *(sPtr+2) ) | ( *(sPtr+3) << 24 ); + + if ( *(sPtr+3) > 0 ) + nonzeroa = true; + + sPtr += 4; + } + + if ( convFlags & CONV_FLAGS_INVERTX ) + --dPtr; + else + ++dPtr; + } + } + } + } + + // If there are no non-zero alpha channel entries, we'll assume alpha is not used and force it to opaque + if ( !nonzeroa ) + { + HRESULT hr = _SetAlphaChannelToOpaque( image ); + if ( FAILED(hr) ) + return hr; + } + } + break; + + //--------------------------------------------------------------------------------- + default: + return E_FAIL; + } + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Copies pixel data from a TGA into the target image +//------------------------------------------------------------------------------------- +static HRESULT _CopyPixels( _In_bytecount_(size) LPCVOID pSource, size_t size, _In_ const Image* image, DWORD convFlags ) +{ + assert( pSource && size > 0 ); + + if ( !image || !image->pixels ) + return E_POINTER; + + // Compute TGA image data pitch + size_t rowPitch; + if ( convFlags & CONV_FLAGS_EXPAND ) + { + rowPitch = image->width * 3; + } + else + { + size_t slicePitch; + ComputePitch( image->format, image->width, image->height, rowPitch, slicePitch, CP_FLAGS_NONE ); + } + + const uint8_t* sPtr = reinterpret_cast( pSource ); + const uint8_t* endPtr = sPtr + size; + + switch( image->format ) + { + //--------------------------------------------------------------------------- 8-bit + case DXGI_FORMAT_R8_UNORM: + for( size_t y=0; y < image->height; ++y ) + { + size_t offset = ( (convFlags & CONV_FLAGS_INVERTX ) ? (image->width - 1) : 0 ); + assert( offset < rowPitch); + + uint8_t* dPtr = reinterpret_cast( image->pixels ) + + ( image->rowPitch * ( (convFlags & CONV_FLAGS_INVERTY) ? 
y : (image->height - y - 1) ) ) + + offset; + + for( size_t x=0; x < image->width; ++x ) + { + if ( sPtr >= endPtr ) + return E_FAIL; + + *dPtr = *(sPtr++); + + if ( convFlags & CONV_FLAGS_INVERTX ) + --dPtr; + else + ++dPtr; + } + } + break; + + //-------------------------------------------------------------------------- 16-bit + case DXGI_FORMAT_B5G5R5A1_UNORM: + { + bool nonzeroa = false; + for( size_t y=0; y < image->height; ++y ) + { + size_t offset = ( (convFlags & CONV_FLAGS_INVERTX ) ? (image->width - 1) : 0 ); + assert( offset*2 < rowPitch); + + uint16_t* dPtr = reinterpret_cast( reinterpret_cast( image->pixels ) + + ( image->rowPitch * ( (convFlags & CONV_FLAGS_INVERTY) ? y : (image->height - y - 1) ) ) ) + + offset; + + for( size_t x=0; x < image->width; ++x ) + { + if ( sPtr+1 >= endPtr ) + return E_FAIL; + + uint16_t t = *sPtr | (*(sPtr+1) << 8); + sPtr += 2; + *dPtr = t; + + if ( t & 0x8000 ) + nonzeroa = true; + + if ( convFlags & CONV_FLAGS_INVERTX ) + --dPtr; + else + ++dPtr; + } + } + + // If there are no non-zero alpha channel entries, we'll assume alpha is not used and force it to opaque + if ( !nonzeroa ) + { + HRESULT hr = _SetAlphaChannelToOpaque( image ); + if ( FAILED(hr) ) + return hr; + } + } + break; + + //----------------------------------------------------------------------- 24/32-bit + case DXGI_FORMAT_R8G8B8A8_UNORM: + { + bool nonzeroa = false; + for( size_t y=0; y < image->height; ++y ) + { + size_t offset = ( (convFlags & CONV_FLAGS_INVERTX ) ? (image->width - 1) : 0 ); + + uint32_t* dPtr = reinterpret_cast( reinterpret_cast( image->pixels ) + + ( image->rowPitch * ( (convFlags & CONV_FLAGS_INVERTY) ? y : (image->height - y - 1) ) ) ) + + offset; + + for( size_t x=0; x < image->width; ++x ) + { + if ( convFlags & CONV_FLAGS_EXPAND ) + { + assert( offset*3 < rowPitch); + + if ( sPtr+2 >= endPtr ) + return E_FAIL; + + // BGR -> RGBA + *dPtr = ( *sPtr << 16 ) | ( *(sPtr+1) << 8 ) | ( *(sPtr+2) ) | 0xFF000000; + sPtr += 3; + + nonzeroa = true; + } + else + { + assert( offset*4 < rowPitch); + + if ( sPtr+3 >= endPtr ) + return E_FAIL; + + // BGRA -> RGBA + *dPtr = ( *sPtr << 16 ) | ( *(sPtr+1) << 8 ) | ( *(sPtr+2) ) | ( *(sPtr+3) << 24 ); + + if ( *(sPtr+3) > 0 ) + nonzeroa = true; + + sPtr += 4; + } + + if ( convFlags & CONV_FLAGS_INVERTX ) + --dPtr; + else + ++dPtr; + } + } + + // If there are no non-zero alpha channel entries, we'll assume alpha is not used and force it to opaque + if ( !nonzeroa ) + { + HRESULT hr = _SetAlphaChannelToOpaque( image ); + if ( FAILED(hr) ) + return hr; + } + } + break; + + //--------------------------------------------------------------------------------- + default: + return E_FAIL; + } + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Encodes TGA file header +//------------------------------------------------------------------------------------- +static HRESULT _EncodeTGAHeader( _In_ const Image& image, _Out_ TGA_HEADER& header, DWORD& convFlags ) +{ + assert( IsValid( image.format ) && !IsVideo( image.format ) ); + + memset( &header, 0, sizeof(TGA_HEADER) ); + + if ( (image.width > 0xFFFF) + || (image.height > 0xFFFF) ) + { + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + } + + header.wWidth = static_cast( image.width ); + header.wHeight = static_cast( image.height ); + + switch( image.format ) + { + case DXGI_FORMAT_R8G8B8A8_UNORM: + case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB: + header.bImageType = TGA_TRUECOLOR; + header.bBitsPerPixel = 32; + 
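+        // The low nibble of the descriptor holds the TGA attribute (alpha) bit count -- 8 bits
+        // of alpha here -- and TGA_FLAGS_INVERTY marks a top-left origin (top-down scanlines).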
header.bDescriptor = TGA_FLAGS_INVERTY | 8; + convFlags |= CONV_FLAGS_SWIZZLE; + break; + + case DXGI_FORMAT_B8G8R8A8_UNORM: + case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB: + header.bImageType = TGA_TRUECOLOR; + header.bBitsPerPixel = 32; + header.bDescriptor = TGA_FLAGS_INVERTY | 8; + break; + + case DXGI_FORMAT_B8G8R8X8_UNORM: + case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB: + header.bImageType = TGA_TRUECOLOR; + header.bBitsPerPixel = 24; + header.bDescriptor = TGA_FLAGS_INVERTY; + convFlags |= CONV_FLAGS_888; + break; + + case DXGI_FORMAT_R8_UNORM: + case DXGI_FORMAT_A8_UNORM: + header.bImageType = TGA_BLACK_AND_WHITE; + header.bBitsPerPixel = 8; + header.bDescriptor = TGA_FLAGS_INVERTY; + break; + + case DXGI_FORMAT_B5G5R5A1_UNORM: + header.bImageType = TGA_TRUECOLOR; + header.bBitsPerPixel = 16; + header.bDescriptor = TGA_FLAGS_INVERTY | 1; + break; + + default: + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + } + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Copies BGRX data to form BGR 24bpp data +//------------------------------------------------------------------------------------- +#pragma warning(suppress: 6001 6101) // In the case where outSize is insufficient we do not write to pDestination +static void _Copy24bppScanline( _Out_bytecap_(outSize) LPVOID pDestination, _In_ size_t outSize, + _In_bytecount_(inSize) LPCVOID pSource, _In_ size_t inSize ) +{ + assert( pDestination && outSize > 0 ); + assert( pSource && inSize > 0 ); + + assert( pDestination != pSource ); + + const uint32_t * __restrict sPtr = reinterpret_cast(pSource); + uint8_t * __restrict dPtr = reinterpret_cast(pDestination); + + const uint8_t* endPtr = dPtr + outSize; + + for( size_t count = 0; count < inSize; count += 4 ) + { + uint32_t t = *(sPtr++); + + if ( dPtr+2 > endPtr ) + return; + + *(dPtr++) = uint8_t(t & 0xFF); // Blue + *(dPtr++) = uint8_t((t & 0xFF00) >> 8); // Green + *(dPtr++) = uint8_t((t & 0xFF0000) >> 16); // Red + } +} + + +//===================================================================================== +// Entry-points +//===================================================================================== + +//------------------------------------------------------------------------------------- +// Obtain metadata from TGA file in memory/on disk +//------------------------------------------------------------------------------------- +HRESULT GetMetadataFromTGAMemory( LPCVOID pSource, size_t size, TexMetadata& metadata ) +{ + if ( !pSource || size == 0 ) + return E_INVALIDARG; + + size_t offset; + return _DecodeTGAHeader( pSource, size, metadata, offset, 0 ); +} + +HRESULT GetMetadataFromTGAFile( LPCWSTR szFile, TexMetadata& metadata ) +{ + if ( !szFile ) + return E_INVALIDARG; + +#if (_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/) + ScopedHandle hFile( safe_handle( CreateFile2( szFile, GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, 0 ) ) ); +#else + ScopedHandle hFile( safe_handle( CreateFileW( szFile, GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING, + FILE_FLAG_SEQUENTIAL_SCAN, 0 ) ) ); +#endif + if ( !hFile ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } + + // Get the file size + LARGE_INTEGER fileSize = {0}; + +#if (_WIN32_WINNT >= _WIN32_WINNT_VISTA) + FILE_STANDARD_INFO fileInfo; + if ( !GetFileInformationByHandleEx( hFile.get(), FileStandardInfo, &fileInfo, sizeof(fileInfo) ) ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } + fileSize = fileInfo.EndOfFile; +#else + if ( !GetFileSizeEx( hFile.get(), 
&fileSize ) ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } +#endif + + // File is too big for 32-bit allocation, so reject read (4 GB should be plenty large enough for a valid TGA file) + if ( fileSize.HighPart > 0 ) + { + return HRESULT_FROM_WIN32( ERROR_FILE_TOO_LARGE ); + } + + // Need at least enough data to fill the standard header to be a valid TGA + if ( fileSize.LowPart < ( sizeof(TGA_HEADER) ) ) + { + return E_FAIL; + } + + // Read the standard header (we don't need the file footer to parse the file) + uint8_t header[sizeof(TGA_HEADER)]; + DWORD bytesRead = 0; + if ( !ReadFile( hFile.get(), header, sizeof(TGA_HEADER), &bytesRead, 0 ) ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } + + size_t offset; + return _DecodeTGAHeader( header, bytesRead, metadata, offset, 0 ); +} + + +//------------------------------------------------------------------------------------- +// Load a TGA file in memory +//------------------------------------------------------------------------------------- +HRESULT LoadFromTGAMemory( LPCVOID pSource, size_t size, TexMetadata* metadata, ScratchImage& image ) +{ + if ( !pSource || size == 0 ) + return E_INVALIDARG; + + image.Release(); + + size_t offset; + DWORD convFlags = 0; + TexMetadata mdata; + HRESULT hr = _DecodeTGAHeader( pSource, size, mdata, offset, &convFlags ); + if ( FAILED(hr) ) + return hr; + + if ( offset > size ) + return E_FAIL; + + LPCVOID pPixels = reinterpret_cast( reinterpret_cast(pSource) + offset ); + assert( pPixels ); + + size_t remaining = size - offset; + if ( remaining == 0 ) + return E_FAIL; + + hr = image.Initialize2D( mdata.format, mdata.width, mdata.height, 1, 1 ); + if ( FAILED(hr) ) + return hr; + + if ( convFlags & CONV_FLAGS_RLE ) + { + hr = _UncompressPixels( pPixels, remaining, image.GetImage(0,0,0), convFlags ); + } + else + { + hr = _CopyPixels( pPixels, remaining, image.GetImage(0,0,0), convFlags ); + } + + if ( FAILED(hr) ) + { + image.Release(); + return hr; + } + + if ( metadata ) + memcpy( metadata, &mdata, sizeof(TexMetadata) ); + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Load a TGA file from disk +//------------------------------------------------------------------------------------- +HRESULT LoadFromTGAFile( LPCWSTR szFile, TexMetadata* metadata, ScratchImage& image ) +{ + if ( !szFile ) + return E_INVALIDARG; + + image.Release(); + +#if (_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/) + ScopedHandle hFile( safe_handle( CreateFile2( szFile, GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, 0 ) ) ); +#else + ScopedHandle hFile( safe_handle( CreateFileW( szFile, GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING, + FILE_FLAG_SEQUENTIAL_SCAN, 0 ) ) ); +#endif + if ( !hFile ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } + + // Get the file size + LARGE_INTEGER fileSize = {0}; + +#if (_WIN32_WINNT >= _WIN32_WINNT_VISTA) + FILE_STANDARD_INFO fileInfo; + if ( !GetFileInformationByHandleEx( hFile.get(), FileStandardInfo, &fileInfo, sizeof(fileInfo) ) ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } + fileSize = fileInfo.EndOfFile; +#else + if ( !GetFileSizeEx( hFile.get(), &fileSize ) ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } +#endif + + // File is too big for 32-bit allocation, so reject read (4 GB should be plenty large enough for a valid TGA file) + if ( fileSize.HighPart > 0 ) + { + return HRESULT_FROM_WIN32( ERROR_FILE_TOO_LARGE ); + } + + // Need at least enough data to fill the header to be a valid 
TGA + if ( fileSize.LowPart < sizeof(TGA_HEADER) ) + { + return E_FAIL; + } + + // Read the header + uint8_t header[sizeof(TGA_HEADER)]; + DWORD bytesRead = 0; + if ( !ReadFile( hFile.get(), header, sizeof(TGA_HEADER), &bytesRead, 0 ) ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } + + size_t offset; + DWORD convFlags = 0; + TexMetadata mdata; + HRESULT hr = _DecodeTGAHeader( header, bytesRead, mdata, offset, &convFlags ); + if ( FAILED(hr) ) + return hr; + + // Read the pixels + DWORD remaining = static_cast( fileSize.LowPart - offset ); + if ( remaining == 0 ) + return E_FAIL; + + if ( offset > sizeof(TGA_HEADER) ) + { + // Skip past the id string + LARGE_INTEGER filePos = { static_cast(offset), 0 }; + if ( !SetFilePointerEx( hFile.get(), filePos, 0, FILE_BEGIN ) ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } + } + + hr = image.Initialize2D( mdata.format, mdata.width, mdata.height, 1, 1 ); + if ( FAILED(hr) ) + return hr; + + assert( image.GetPixels() ); + + if ( !(convFlags & (CONV_FLAGS_RLE | CONV_FLAGS_EXPAND | CONV_FLAGS_INVERTX)) && (convFlags & CONV_FLAGS_INVERTY) ) + { + // This case we can read directly into the image buffer in place + if ( !ReadFile( hFile.get(), image.GetPixels(), static_cast( image.GetPixelsSize() ), &bytesRead, 0 ) ) + { + image.Release(); + return HRESULT_FROM_WIN32( GetLastError() ); + } + + if ( bytesRead != image.GetPixelsSize() ) + { + image.Release(); + return E_FAIL; + } + + switch( mdata.format ) + { + case DXGI_FORMAT_R8G8B8A8_UNORM: + { + // TGA stores 32-bit data in BGRA form, need to swizzle to RGBA + assert( image.GetImageCount() == 1 ); + const Image* img = image.GetImage(0,0,0); + if ( !img ) + return E_POINTER; + + uint8_t *pPixels = img->pixels; + if ( !pPixels ) + return E_POINTER; + + size_t rowPitch = img->rowPitch; + + // Scan for non-zero alpha channel + bool nonzeroa = false; + + for( size_t h = 0; h < img->height; ++h ) + { + const uint32_t* sPtr = reinterpret_cast( pPixels ); + + for( size_t x=0; x < img->width; ++x ) + { + if ( (*sPtr) & 0xff000000 ) + { + nonzeroa = true; + break; + } + + ++sPtr; + } + + if ( nonzeroa ) + break; + + pPixels += rowPitch; + } + + DWORD tflags = ( !nonzeroa ) ? 
TEXP_SCANLINE_SETALPHA : TEXP_SCANLINE_NONE; + + // Swizzle scanlines + pPixels = img->pixels; + + for( size_t h = 0; h < img->height; ++h ) + { + _SwizzleScanline( pPixels, rowPitch, pPixels, rowPitch, mdata.format, tflags ); + + pPixels += rowPitch; + } + } + break; + + // If we start using DXGI_FORMAT_B8G8R8X8_UNORM or DXGI_FORMAT_B8G8R8A8_UNORM we need to check for a fully 0 alpha channel + + case DXGI_FORMAT_B5G5R5A1_UNORM: + { + assert( image.GetImageCount() == 1 ); + const Image* img = image.GetImage(0,0,0); + if ( !img ) + return E_POINTER; + + // Scan for non-zero alpha channel + bool nonzeroa = false; + + const uint8_t *pPixels = img->pixels; + if ( !pPixels ) + return E_POINTER; + + size_t rowPitch = img->rowPitch; + + for( size_t h = 0; h < img->height; ++h ) + { + const uint16_t* sPtr = reinterpret_cast( pPixels ); + + for( size_t x=0; x < img->width; ++x ) + { + if ( *sPtr & 0x8000 ) + { + nonzeroa = true; + break; + } + + ++sPtr; + } + + if ( nonzeroa ) + break; + + pPixels += rowPitch; + } + + // If there are no non-zero alpha channel entries, we'll assume alpha is not used and force it to opaque + if ( !nonzeroa ) + { + hr = _SetAlphaChannelToOpaque( img ); + if ( FAILED(hr) ) + return hr; + } + } + break; + } + } + else // RLE || EXPAND || INVERTX || !INVERTY + { + std::unique_ptr temp( new uint8_t[ remaining ] ); + if ( !temp ) + { + image.Release(); + return E_OUTOFMEMORY; + } + + if ( !ReadFile( hFile.get(), temp.get(), remaining, &bytesRead, 0 ) ) + { + image.Release(); + return HRESULT_FROM_WIN32( GetLastError() ); + } + + if ( bytesRead != remaining ) + { + image.Release(); + return E_FAIL; + } + + if ( convFlags & CONV_FLAGS_RLE ) + { + hr = _UncompressPixels( temp.get(), remaining, image.GetImage(0,0,0), convFlags ); + } + else + { + hr = _CopyPixels( temp.get(), remaining, image.GetImage(0,0,0), convFlags ); + } + + if ( FAILED(hr) ) + { + image.Release(); + return hr; + } + } + + if ( metadata ) + memcpy( metadata, &mdata, sizeof(TexMetadata) ); + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Save a TGA file to memory +//------------------------------------------------------------------------------------- +HRESULT SaveToTGAMemory( const Image& image, Blob& blob ) +{ + if ( !image.pixels ) + return E_POINTER; + + TGA_HEADER tga_header; + DWORD convFlags = 0; + HRESULT hr = _EncodeTGAHeader( image, tga_header, convFlags ); + if ( FAILED(hr) ) + return hr; + + blob.Release(); + + // Determine memory required for image data + size_t rowPitch, slicePitch; + if ( convFlags & CONV_FLAGS_888 ) + { + rowPitch = image.width * 3; + slicePitch = image.height * rowPitch; + } + else + { + ComputePitch( image.format, image.width, image.height, rowPitch, slicePitch, CP_FLAGS_NONE ); + } + + hr = blob.Initialize( sizeof(TGA_HEADER) + slicePitch ); + if ( FAILED(hr) ) + return hr; + + // Copy header + uint8_t* dPtr = reinterpret_cast( blob.GetBufferPointer() ); + assert( dPtr != 0 ); + memcpy_s( dPtr, blob.GetBufferSize(), &tga_header, sizeof(TGA_HEADER) ); + dPtr += sizeof(TGA_HEADER); + + const uint8_t* pPixels = reinterpret_cast( image.pixels ); + assert( pPixels ); + + for( size_t y = 0; y < image.height; ++y ) + { + // Copy pixels + if ( convFlags & CONV_FLAGS_888 ) + { + _Copy24bppScanline( dPtr, rowPitch, pPixels, image.rowPitch ); + } + else if ( convFlags & CONV_FLAGS_SWIZZLE ) + { + _SwizzleScanline( dPtr, rowPitch, pPixels, image.rowPitch, image.format, TEXP_SCANLINE_NONE ); + } + else + { + 
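+            // Format already matches the TGA byte layout; no repacking or swizzle needed,
+            // so copy the scanline as-is.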
_CopyScanline( dPtr, rowPitch, pPixels, image.rowPitch, image.format, TEXP_SCANLINE_NONE ); + } + + dPtr += rowPitch; + pPixels += image.rowPitch; + } + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Save a TGA file to disk +//------------------------------------------------------------------------------------- +HRESULT SaveToTGAFile( const Image& image, LPCWSTR szFile ) +{ + if ( !szFile ) + return E_INVALIDARG; + + if ( !image.pixels ) + return E_POINTER; + + TGA_HEADER tga_header; + DWORD convFlags = 0; + HRESULT hr = _EncodeTGAHeader( image, tga_header, convFlags ); + if ( FAILED(hr) ) + return hr; + + // Create file and write header +#if (_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/) + ScopedHandle hFile( safe_handle( CreateFile2( szFile, GENERIC_WRITE, 0, CREATE_ALWAYS, 0 ) ) ); +#else + ScopedHandle hFile( safe_handle( CreateFileW( szFile, GENERIC_WRITE, 0, 0, CREATE_ALWAYS, 0, 0 ) ) ); +#endif + if ( !hFile ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } + + // Determine size for TGA pixel data + size_t rowPitch, slicePitch; + if ( convFlags & CONV_FLAGS_888 ) + { + rowPitch = image.width * 3; + slicePitch = image.height * rowPitch; + } + else + { + ComputePitch( image.format, image.width, image.height, rowPitch, slicePitch, CP_FLAGS_NONE ); + } + + if ( slicePitch < 65535 ) + { + // For small images, it is better to create an in-memory file and write it out + Blob blob; + + hr = SaveToTGAMemory( image, blob ); + if ( FAILED(hr) ) + return hr; + + // Write blob + const DWORD bytesToWrite = static_cast( blob.GetBufferSize() ); + DWORD bytesWritten; + if ( !WriteFile( hFile.get(), blob.GetBufferPointer(), bytesToWrite, + &bytesWritten, 0 ) ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } + + if ( bytesWritten != bytesToWrite ) + { + return E_FAIL; + } + } + else + { + // Otherwise, write the image one scanline at a time... 
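+        // A single-row scratch buffer is sufficient, since each scanline is converted and
+        // written to the file individually.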
+ std::unique_ptr temp( new uint8_t[ rowPitch ] ); + if ( !temp ) + return E_OUTOFMEMORY; + + // Write header + DWORD bytesWritten; + if ( !WriteFile( hFile.get(), &tga_header, sizeof(TGA_HEADER), &bytesWritten, 0 ) ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } + + if ( bytesWritten != sizeof(TGA_HEADER) ) + return E_FAIL; + + // Write pixels + const uint8_t* pPixels = reinterpret_cast( image.pixels ); + + for( size_t y = 0; y < image.height; ++y ) + { + // Copy pixels + if ( convFlags & CONV_FLAGS_888 ) + { + _Copy24bppScanline( temp.get(), rowPitch, pPixels, image.rowPitch ); + } + else if ( convFlags & CONV_FLAGS_SWIZZLE ) + { + _SwizzleScanline( temp.get(), rowPitch, pPixels, image.rowPitch, image.format, TEXP_SCANLINE_NONE ); + } + else + { + _CopyScanline( temp.get(), rowPitch, pPixels, image.rowPitch, image.format, TEXP_SCANLINE_NONE ); + } + + pPixels += image.rowPitch; + + if ( !WriteFile( hFile.get(), temp.get(), static_cast( rowPitch ), &bytesWritten, 0 ) ) + { + return HRESULT_FROM_WIN32( GetLastError() ); + } + + if ( bytesWritten != rowPitch ) + return E_FAIL; + } + } + + return S_OK; +} + +}; // namespace diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexUtil.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexUtil.cpp new file mode 100644 index 0000000..b7fe756 --- /dev/null +++ b/thirdparty/directxtex/DirectXTex/DirectXTexUtil.cpp @@ -0,0 +1,759 @@ +//------------------------------------------------------------------------------------- +// DirectXTexUtil.cpp +// +// DirectX Texture Library - Utilities +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// +// http://go.microsoft.com/fwlink/?LinkId=248926 +//------------------------------------------------------------------------------------- + +#include "directxtexp.h" + +//------------------------------------------------------------------------------------- +// WIC Pixel Format Translation Data +//------------------------------------------------------------------------------------- +struct WICTranslate +{ + GUID wic; + DXGI_FORMAT format; +}; + +static WICTranslate g_WICFormats[] = +{ + { GUID_WICPixelFormat128bppRGBAFloat, DXGI_FORMAT_R32G32B32A32_FLOAT }, + + { GUID_WICPixelFormat64bppRGBAHalf, DXGI_FORMAT_R16G16B16A16_FLOAT }, + { GUID_WICPixelFormat64bppRGBA, DXGI_FORMAT_R16G16B16A16_UNORM }, + + { GUID_WICPixelFormat32bppRGBA, DXGI_FORMAT_R8G8B8A8_UNORM }, + { GUID_WICPixelFormat32bppBGRA, DXGI_FORMAT_B8G8R8A8_UNORM }, // DXGI 1.1 + { GUID_WICPixelFormat32bppBGR, DXGI_FORMAT_B8G8R8X8_UNORM }, // DXGI 1.1 + + { GUID_WICPixelFormat32bppRGBA1010102XR, DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM }, // DXGI 1.1 + { GUID_WICPixelFormat32bppRGBA1010102, DXGI_FORMAT_R10G10B10A2_UNORM }, + { GUID_WICPixelFormat32bppRGBE, DXGI_FORMAT_R9G9B9E5_SHAREDEXP }, + + { GUID_WICPixelFormat16bppBGRA5551, DXGI_FORMAT_B5G5R5A1_UNORM }, + { GUID_WICPixelFormat16bppBGR565, DXGI_FORMAT_B5G6R5_UNORM }, + + { GUID_WICPixelFormat32bppGrayFloat, DXGI_FORMAT_R32_FLOAT }, + { GUID_WICPixelFormat16bppGrayHalf, DXGI_FORMAT_R16_FLOAT }, + { GUID_WICPixelFormat16bppGray, DXGI_FORMAT_R16_UNORM }, + { GUID_WICPixelFormat8bppGray, DXGI_FORMAT_R8_UNORM }, + + { GUID_WICPixelFormat8bppAlpha, DXGI_FORMAT_A8_UNORM }, + + { GUID_WICPixelFormatBlackWhite, DXGI_FORMAT_R1_UNORM }, +}; + +static bool g_WIC2 = false; + +namespace DirectX +{ + +//===================================================================================== +// WIC Utilities +//===================================================================================== + +DXGI_FORMAT _WICToDXGI( const GUID& guid ) +{ + for( size_t i=0; i < _countof(g_WICFormats); ++i ) + { + if ( memcmp( &g_WICFormats[i].wic, &guid, sizeof(GUID) ) == 0 ) + return g_WICFormats[i].format; + } + +#if (_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/) || defined(_WIN7_PLATFORM_UPDATE) + if ( g_WIC2 ) + { + if ( memcmp( &GUID_WICPixelFormat96bppRGBFloat, &guid, sizeof(GUID) ) == 0 ) + return DXGI_FORMAT_R32G32B32_FLOAT; + } +#endif + + return DXGI_FORMAT_UNKNOWN; +} + +bool _DXGIToWIC( DXGI_FORMAT format, GUID& guid ) +{ + switch( format ) + { + case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB: + memcpy( &guid, &GUID_WICPixelFormat32bppRGBA, sizeof(GUID) ); + return true; + + case DXGI_FORMAT_D32_FLOAT: + memcpy( &guid, &GUID_WICPixelFormat32bppGrayFloat, sizeof(GUID) ); + return true; + + case DXGI_FORMAT_D16_UNORM: + memcpy( &guid, &GUID_WICPixelFormat16bppGray, sizeof(GUID) ); + return true; + + case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB: + memcpy( &guid, &GUID_WICPixelFormat32bppBGRA, sizeof(GUID) ); + return true; + + case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB: + memcpy( &guid, &GUID_WICPixelFormat32bppBGR, sizeof(GUID) ); + return true; + +#if (_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/) || defined(_WIN7_PLATFORM_UPDATE) + case DXGI_FORMAT_R32G32B32_FLOAT: + if ( g_WIC2 ) + { + memcpy( &guid, &GUID_WICPixelFormat96bppRGBFloat, sizeof(GUID) ); + return true; + } + break; +#endif + + default: + for( size_t i=0; i < _countof(g_WICFormats); ++i ) + { + if ( g_WICFormats[i].format == format ) + { + memcpy( &guid, &g_WICFormats[i].wic, sizeof(GUID) ); + return true; + } + } + break; + } + + memcpy( &guid, 
&GUID_NULL, sizeof(GUID) ); + return false; +} + +bool _IsWIC2() +{ + return g_WIC2; +} + +IWICImagingFactory* _GetWIC() +{ + static IWICImagingFactory* s_Factory = nullptr; + + if ( s_Factory ) + return s_Factory; + +#if(_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/) || defined(_WIN7_PLATFORM_UPDATE) + HRESULT hr = CoCreateInstance( + CLSID_WICImagingFactory2, + nullptr, + CLSCTX_INPROC_SERVER, + __uuidof(IWICImagingFactory2), + (LPVOID*)&s_Factory + ); + + if ( SUCCEEDED(hr) ) + { + // WIC2 is available on Windows 8 and Windows 7 SP1 with KB 2670838 installed + g_WIC2 = true; + } + else + { + hr = CoCreateInstance( + CLSID_WICImagingFactory1, + nullptr, + CLSCTX_INPROC_SERVER, + __uuidof(IWICImagingFactory), + (LPVOID*)&s_Factory + ); + + if ( FAILED(hr) ) + { + s_Factory = nullptr; + return nullptr; + } + } +#else + HRESULT hr = CoCreateInstance( + CLSID_WICImagingFactory, + nullptr, + CLSCTX_INPROC_SERVER, + __uuidof(IWICImagingFactory), + (LPVOID*)&s_Factory + ); + + if ( FAILED(hr) ) + { + s_Factory = nullptr; + return nullptr; + } +#endif + + return s_Factory; +} + + +//------------------------------------------------------------------------------------- +// Public helper function to get common WIC codec GUIDs +//------------------------------------------------------------------------------------- +REFGUID GetWICCodec( _In_ WICCodecs codec ) +{ + switch( codec ) + { + case WIC_CODEC_BMP: + return GUID_ContainerFormatBmp; + + case WIC_CODEC_JPEG: + return GUID_ContainerFormatJpeg; + + case WIC_CODEC_PNG: + return GUID_ContainerFormatPng; + + case WIC_CODEC_TIFF: + return GUID_ContainerFormatTiff; + + case WIC_CODEC_GIF: + return GUID_ContainerFormatGif; + + case WIC_CODEC_WMP: + return GUID_ContainerFormatWmp; + + case WIC_CODEC_ICO: + return GUID_ContainerFormatIco; + + default: + return GUID_NULL; + } +} + + +//===================================================================================== +// DXGI Format Utilities +//===================================================================================== + +//------------------------------------------------------------------------------------- +// Returns bits-per-pixel for a given DXGI format, or 0 on failure +//------------------------------------------------------------------------------------- +size_t BitsPerPixel( DXGI_FORMAT fmt ) +{ + switch( fmt ) + { + case DXGI_FORMAT_R32G32B32A32_TYPELESS: + case DXGI_FORMAT_R32G32B32A32_FLOAT: + case DXGI_FORMAT_R32G32B32A32_UINT: + case DXGI_FORMAT_R32G32B32A32_SINT: + return 128; + + case DXGI_FORMAT_R32G32B32_TYPELESS: + case DXGI_FORMAT_R32G32B32_FLOAT: + case DXGI_FORMAT_R32G32B32_UINT: + case DXGI_FORMAT_R32G32B32_SINT: + return 96; + + case DXGI_FORMAT_R16G16B16A16_TYPELESS: + case DXGI_FORMAT_R16G16B16A16_FLOAT: + case DXGI_FORMAT_R16G16B16A16_UNORM: + case DXGI_FORMAT_R16G16B16A16_UINT: + case DXGI_FORMAT_R16G16B16A16_SNORM: + case DXGI_FORMAT_R16G16B16A16_SINT: + case DXGI_FORMAT_R32G32_TYPELESS: + case DXGI_FORMAT_R32G32_FLOAT: + case DXGI_FORMAT_R32G32_UINT: + case DXGI_FORMAT_R32G32_SINT: + case DXGI_FORMAT_R32G8X24_TYPELESS: + case DXGI_FORMAT_D32_FLOAT_S8X24_UINT: + case DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS: + case DXGI_FORMAT_X32_TYPELESS_G8X24_UINT: + return 64; + + case DXGI_FORMAT_R10G10B10A2_TYPELESS: + case DXGI_FORMAT_R10G10B10A2_UNORM: + case DXGI_FORMAT_R10G10B10A2_UINT: + case DXGI_FORMAT_R11G11B10_FLOAT: + case DXGI_FORMAT_R8G8B8A8_TYPELESS: + case DXGI_FORMAT_R8G8B8A8_UNORM: + case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB: + case DXGI_FORMAT_R8G8B8A8_UINT: + case 
DXGI_FORMAT_R8G8B8A8_SNORM: + case DXGI_FORMAT_R8G8B8A8_SINT: + case DXGI_FORMAT_R16G16_TYPELESS: + case DXGI_FORMAT_R16G16_FLOAT: + case DXGI_FORMAT_R16G16_UNORM: + case DXGI_FORMAT_R16G16_UINT: + case DXGI_FORMAT_R16G16_SNORM: + case DXGI_FORMAT_R16G16_SINT: + case DXGI_FORMAT_R32_TYPELESS: + case DXGI_FORMAT_D32_FLOAT: + case DXGI_FORMAT_R32_FLOAT: + case DXGI_FORMAT_R32_UINT: + case DXGI_FORMAT_R32_SINT: + case DXGI_FORMAT_R24G8_TYPELESS: + case DXGI_FORMAT_D24_UNORM_S8_UINT: + case DXGI_FORMAT_R24_UNORM_X8_TYPELESS: + case DXGI_FORMAT_X24_TYPELESS_G8_UINT: + case DXGI_FORMAT_R9G9B9E5_SHAREDEXP: + case DXGI_FORMAT_R8G8_B8G8_UNORM: + case DXGI_FORMAT_G8R8_G8B8_UNORM: + case DXGI_FORMAT_B8G8R8A8_UNORM: + case DXGI_FORMAT_B8G8R8X8_UNORM: + case DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM: + case DXGI_FORMAT_B8G8R8A8_TYPELESS: + case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB: + case DXGI_FORMAT_B8G8R8X8_TYPELESS: + case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB: + return 32; + + case DXGI_FORMAT_R8G8_TYPELESS: + case DXGI_FORMAT_R8G8_UNORM: + case DXGI_FORMAT_R8G8_UINT: + case DXGI_FORMAT_R8G8_SNORM: + case DXGI_FORMAT_R8G8_SINT: + case DXGI_FORMAT_R16_TYPELESS: + case DXGI_FORMAT_R16_FLOAT: + case DXGI_FORMAT_D16_UNORM: + case DXGI_FORMAT_R16_UNORM: + case DXGI_FORMAT_R16_UINT: + case DXGI_FORMAT_R16_SNORM: + case DXGI_FORMAT_R16_SINT: + case DXGI_FORMAT_B5G6R5_UNORM: + case DXGI_FORMAT_B5G5R5A1_UNORM: + return 16; + + case DXGI_FORMAT_R8_TYPELESS: + case DXGI_FORMAT_R8_UNORM: + case DXGI_FORMAT_R8_UINT: + case DXGI_FORMAT_R8_SNORM: + case DXGI_FORMAT_R8_SINT: + case DXGI_FORMAT_A8_UNORM: + return 8; + + case DXGI_FORMAT_R1_UNORM: + return 1; + + case DXGI_FORMAT_BC1_TYPELESS: + case DXGI_FORMAT_BC1_UNORM: + case DXGI_FORMAT_BC1_UNORM_SRGB: + case DXGI_FORMAT_BC4_TYPELESS: + case DXGI_FORMAT_BC4_UNORM: + case DXGI_FORMAT_BC4_SNORM: + return 4; + + case DXGI_FORMAT_BC2_TYPELESS: + case DXGI_FORMAT_BC2_UNORM: + case DXGI_FORMAT_BC2_UNORM_SRGB: + case DXGI_FORMAT_BC3_TYPELESS: + case DXGI_FORMAT_BC3_UNORM: + case DXGI_FORMAT_BC3_UNORM_SRGB: + case DXGI_FORMAT_BC5_TYPELESS: + case DXGI_FORMAT_BC5_UNORM: + case DXGI_FORMAT_BC5_SNORM: + case DXGI_FORMAT_BC6H_TYPELESS: + case DXGI_FORMAT_BC6H_UF16: + case DXGI_FORMAT_BC6H_SF16: + case DXGI_FORMAT_BC7_TYPELESS: + case DXGI_FORMAT_BC7_UNORM: + case DXGI_FORMAT_BC7_UNORM_SRGB: + return 8; + +#ifdef DXGI_1_2_FORMATS + case DXGI_FORMAT_B4G4R4A4_UNORM: + return 16; + + // We don't support the video formats ( see IsVideo function ) + +#endif // DXGI_1_2_FORMATS + + default: + return 0; + } +} + + +//------------------------------------------------------------------------------------- +// Computes the image row pitch in bytes, and the slice ptich (size in bytes of the image) +// based on DXGI format, width, and height +//------------------------------------------------------------------------------------- +void ComputePitch( DXGI_FORMAT fmt, size_t width, size_t height, + size_t& rowPitch, size_t& slicePitch, DWORD flags ) +{ + assert( IsValid(fmt) && !IsVideo(fmt) ); + + if ( IsCompressed(fmt) ) + { + size_t bpb = ( fmt == DXGI_FORMAT_BC1_TYPELESS + || fmt == DXGI_FORMAT_BC1_UNORM + || fmt == DXGI_FORMAT_BC1_UNORM_SRGB + || fmt == DXGI_FORMAT_BC4_TYPELESS + || fmt == DXGI_FORMAT_BC4_UNORM + || fmt == DXGI_FORMAT_BC4_SNORM) ? 
8 : 16; + size_t nbw = std::max( 1, (width + 3) / 4 ); + size_t nbh = std::max( 1, (height + 3) / 4 ); + rowPitch = nbw * bpb; + + slicePitch = rowPitch * nbh; + } + else if ( IsPacked(fmt) ) + { + rowPitch = ( ( width + 1 ) >> 1) * 4; + + slicePitch = rowPitch * height; + } + else + { + size_t bpp; + + if ( flags & CP_FLAGS_24BPP ) + bpp = 24; + else if ( flags & CP_FLAGS_16BPP ) + bpp = 16; + else if ( flags & CP_FLAGS_8BPP ) + bpp = 8; + else + bpp = BitsPerPixel( fmt ); + + if ( flags & CP_FLAGS_LEGACY_DWORD ) + { + // Special computation for some incorrectly created DDS files based on + // legacy DirectDraw assumptions about pitch alignment + rowPitch = ( ( width * bpp + 31 ) / 32 ) * sizeof(uint32_t); + slicePitch = rowPitch * height; + } + else + { + rowPitch = ( width * bpp + 7 ) / 8; + slicePitch = rowPitch * height; + } + } +} + + +//------------------------------------------------------------------------------------- +// Converts to an SRGB equivalent type if available +//------------------------------------------------------------------------------------- +DXGI_FORMAT MakeSRGB( _In_ DXGI_FORMAT fmt ) +{ + switch( fmt ) + { + case DXGI_FORMAT_R8G8B8A8_UNORM: + return DXGI_FORMAT_R8G8B8A8_UNORM_SRGB; + + case DXGI_FORMAT_BC1_UNORM: + return DXGI_FORMAT_BC1_UNORM_SRGB; + + case DXGI_FORMAT_BC2_UNORM: + return DXGI_FORMAT_BC2_UNORM_SRGB; + + case DXGI_FORMAT_BC3_UNORM: + return DXGI_FORMAT_BC3_UNORM_SRGB; + + case DXGI_FORMAT_B8G8R8A8_UNORM: + return DXGI_FORMAT_B8G8R8A8_UNORM_SRGB; + + case DXGI_FORMAT_B8G8R8X8_UNORM: + return DXGI_FORMAT_B8G8R8X8_UNORM_SRGB; + + case DXGI_FORMAT_BC7_UNORM: + return DXGI_FORMAT_BC7_UNORM_SRGB; + + default: + return fmt; + } +} + + +//------------------------------------------------------------------------------------- +// Converts to a format to an equivalent TYPELESS format if available +//------------------------------------------------------------------------------------- +DXGI_FORMAT MakeTypeless( _In_ DXGI_FORMAT fmt ) +{ + switch( fmt ) + { + case DXGI_FORMAT_R32G32B32A32_FLOAT: + case DXGI_FORMAT_R32G32B32A32_UINT: + case DXGI_FORMAT_R32G32B32A32_SINT: + return DXGI_FORMAT_R32G32B32A32_TYPELESS; + + case DXGI_FORMAT_R32G32B32_FLOAT: + case DXGI_FORMAT_R32G32B32_UINT: + case DXGI_FORMAT_R32G32B32_SINT: + return DXGI_FORMAT_R32G32B32_TYPELESS; + + case DXGI_FORMAT_R16G16B16A16_FLOAT: + case DXGI_FORMAT_R16G16B16A16_UNORM: + case DXGI_FORMAT_R16G16B16A16_UINT: + case DXGI_FORMAT_R16G16B16A16_SNORM: + case DXGI_FORMAT_R16G16B16A16_SINT: + return DXGI_FORMAT_R16G16B16A16_TYPELESS; + + case DXGI_FORMAT_R32G32_FLOAT: + case DXGI_FORMAT_R32G32_UINT: + case DXGI_FORMAT_R32G32_SINT: + return DXGI_FORMAT_R32G32_TYPELESS; + + case DXGI_FORMAT_R10G10B10A2_UNORM: + case DXGI_FORMAT_R10G10B10A2_UINT: + return DXGI_FORMAT_R10G10B10A2_TYPELESS; + + case DXGI_FORMAT_R8G8B8A8_UNORM: + case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB: + case DXGI_FORMAT_R8G8B8A8_UINT: + case DXGI_FORMAT_R8G8B8A8_SNORM: + case DXGI_FORMAT_R8G8B8A8_SINT: + return DXGI_FORMAT_R8G8B8A8_TYPELESS; + + case DXGI_FORMAT_R16G16_FLOAT: + case DXGI_FORMAT_R16G16_UNORM: + case DXGI_FORMAT_R16G16_UINT: + case DXGI_FORMAT_R16G16_SNORM: + case DXGI_FORMAT_R16G16_SINT: + return DXGI_FORMAT_R16G16_TYPELESS; + + case DXGI_FORMAT_D32_FLOAT: + case DXGI_FORMAT_R32_FLOAT: + case DXGI_FORMAT_R32_UINT: + case DXGI_FORMAT_R32_SINT: + return DXGI_FORMAT_R32_TYPELESS; + + case DXGI_FORMAT_R8G8_UNORM: + case DXGI_FORMAT_R8G8_UINT: + case DXGI_FORMAT_R8G8_SNORM: + case DXGI_FORMAT_R8G8_SINT: + return 
DXGI_FORMAT_R8G8_TYPELESS; + + case DXGI_FORMAT_R16_FLOAT: + case DXGI_FORMAT_D16_UNORM: + case DXGI_FORMAT_R16_UNORM: + case DXGI_FORMAT_R16_UINT: + case DXGI_FORMAT_R16_SNORM: + case DXGI_FORMAT_R16_SINT: + return DXGI_FORMAT_R16_TYPELESS; + + case DXGI_FORMAT_R8_UNORM: + case DXGI_FORMAT_R8_UINT: + case DXGI_FORMAT_R8_SNORM: + case DXGI_FORMAT_R8_SINT: + case DXGI_FORMAT_A8_UNORM: + return DXGI_FORMAT_R8_TYPELESS; + + case DXGI_FORMAT_BC1_UNORM: + case DXGI_FORMAT_BC1_UNORM_SRGB: + return DXGI_FORMAT_BC1_TYPELESS; + + case DXGI_FORMAT_BC2_UNORM: + case DXGI_FORMAT_BC2_UNORM_SRGB: + return DXGI_FORMAT_BC2_TYPELESS; + + case DXGI_FORMAT_BC3_UNORM: + case DXGI_FORMAT_BC3_UNORM_SRGB: + return DXGI_FORMAT_BC3_TYPELESS; + + case DXGI_FORMAT_BC4_UNORM: + case DXGI_FORMAT_BC4_SNORM: + return DXGI_FORMAT_BC4_TYPELESS; + + case DXGI_FORMAT_BC5_UNORM: + case DXGI_FORMAT_BC5_SNORM: + return DXGI_FORMAT_BC5_TYPELESS; + + case DXGI_FORMAT_B8G8R8A8_UNORM: + case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB: + return DXGI_FORMAT_B8G8R8A8_TYPELESS; + + case DXGI_FORMAT_B8G8R8X8_UNORM: + case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB: + return DXGI_FORMAT_B8G8R8X8_TYPELESS; + + case DXGI_FORMAT_BC6H_UF16: + case DXGI_FORMAT_BC6H_SF16: + return DXGI_FORMAT_BC6H_TYPELESS; + + case DXGI_FORMAT_BC7_UNORM: + case DXGI_FORMAT_BC7_UNORM_SRGB: + return DXGI_FORMAT_BC7_TYPELESS; + + default: + return fmt; + } +} + + +//------------------------------------------------------------------------------------- +// Converts to a TYPELESS format to an equivalent UNORM format if available +//------------------------------------------------------------------------------------- +DXGI_FORMAT MakeTypelessUNORM( _In_ DXGI_FORMAT fmt ) +{ + switch( fmt ) + { + case DXGI_FORMAT_R16G16B16A16_TYPELESS: + return DXGI_FORMAT_R16G16B16A16_UNORM; + + case DXGI_FORMAT_R10G10B10A2_TYPELESS: + return DXGI_FORMAT_R10G10B10A2_UNORM; + + case DXGI_FORMAT_R8G8B8A8_TYPELESS: + return DXGI_FORMAT_R8G8B8A8_UNORM; + + case DXGI_FORMAT_R16G16_TYPELESS: + return DXGI_FORMAT_R16G16_UNORM; + + case DXGI_FORMAT_R8G8_TYPELESS: + return DXGI_FORMAT_R8G8_UNORM; + + case DXGI_FORMAT_R16_TYPELESS: + return DXGI_FORMAT_R16_UNORM; + + case DXGI_FORMAT_R8_TYPELESS: + return DXGI_FORMAT_R8_UNORM; + + case DXGI_FORMAT_BC1_TYPELESS: + return DXGI_FORMAT_BC1_UNORM; + + case DXGI_FORMAT_BC2_TYPELESS: + return DXGI_FORMAT_BC2_UNORM; + + case DXGI_FORMAT_BC3_TYPELESS: + return DXGI_FORMAT_BC3_UNORM; + + case DXGI_FORMAT_BC4_TYPELESS: + return DXGI_FORMAT_BC4_UNORM; + + case DXGI_FORMAT_BC5_TYPELESS: + return DXGI_FORMAT_BC5_UNORM; + + case DXGI_FORMAT_B8G8R8A8_TYPELESS: + return DXGI_FORMAT_B8G8R8A8_UNORM; + + case DXGI_FORMAT_B8G8R8X8_TYPELESS: + return DXGI_FORMAT_B8G8R8X8_UNORM; + + case DXGI_FORMAT_BC7_TYPELESS: + return DXGI_FORMAT_BC7_UNORM; + + default: + return fmt; + } +} + + +//------------------------------------------------------------------------------------- +// Converts to a TYPELESS format to an equivalent FLOAT format if available +//------------------------------------------------------------------------------------- +DXGI_FORMAT MakeTypelessFLOAT( _In_ DXGI_FORMAT fmt ) +{ + switch( fmt ) + { + case DXGI_FORMAT_R32G32B32A32_TYPELESS: + return DXGI_FORMAT_R32G32B32A32_FLOAT; + + case DXGI_FORMAT_R32G32B32_TYPELESS: + return DXGI_FORMAT_R32G32B32_FLOAT; + + case DXGI_FORMAT_R16G16B16A16_TYPELESS: + return DXGI_FORMAT_R16G16B16A16_FLOAT; + + case DXGI_FORMAT_R32G32_TYPELESS: + return DXGI_FORMAT_R32G32_FLOAT; + + case DXGI_FORMAT_R16G16_TYPELESS: + return 
DXGI_FORMAT_R16G16_FLOAT; + + case DXGI_FORMAT_R32_TYPELESS: + return DXGI_FORMAT_R32_FLOAT; + + case DXGI_FORMAT_R16_TYPELESS: + return DXGI_FORMAT_R16_FLOAT; + + default: + return fmt; + } +} + + +//===================================================================================== +// TexMetadata +//===================================================================================== + +size_t TexMetadata::ComputeIndex( _In_ size_t mip, _In_ size_t item, _In_ size_t slice ) const +{ + if ( mip >= mipLevels ) + return size_t(-1); + + switch( dimension ) + { + case TEX_DIMENSION_TEXTURE1D: + case TEX_DIMENSION_TEXTURE2D: + if ( slice > 0 ) + return size_t(-1); + + if ( item >= arraySize ) + return size_t(-1); + + return (item*( mipLevels ) + mip); + + case TEX_DIMENSION_TEXTURE3D: + if ( item > 0 ) + { + // No support for arrays of volumes + return size_t(-1); + } + else + { + size_t index = 0; + size_t d = depth; + + for( size_t level = 0; level < mip; ++level ) + { + index += d; + if ( d > 1 ) + d >>= 1; + } + + if ( slice >= d ) + return size_t(-1); + + index += slice; + + return index; + } + break; + + default: + return size_t(-1); + } +} + + +//===================================================================================== +// Blob - Bitmap image container +//===================================================================================== + +void Blob::Release() +{ + if ( _buffer ) + { + _aligned_free( _buffer ); + _buffer = nullptr; + } + + _size = 0; +} + +HRESULT Blob::Initialize( size_t size ) +{ + if ( !size ) + return E_INVALIDARG; + + Release(); + + _buffer = _aligned_malloc( size, 16 ); + if ( !_buffer ) + { + Release(); + return E_OUTOFMEMORY; + } + + _size = size; + + return S_OK; +} + +}; // namespace diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexWIC.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexWIC.cpp new file mode 100644 index 0000000..7e3e3c9 --- /dev/null +++ b/thirdparty/directxtex/DirectXTex/DirectXTexWIC.cpp @@ -0,0 +1,946 @@ +//------------------------------------------------------------------------------------- +// DirectXTexWIC.cpp +// +// DirectX Texture Library - WIC-based file reader/writer +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkId=248926 +//------------------------------------------------------------------------------------- + +#include "directxtexp.h" + +//------------------------------------------------------------------------------------- +// WIC Pixel Format nearest conversion table +//------------------------------------------------------------------------------------- + +struct WICConvert +{ + GUID source; + GUID target; +}; + +static WICConvert g_WICConvert[] = +{ + // Directly support the formats listed in XnaTexUtil::g_WICFormats, so no conversion required + // Note target GUID in this conversion table must be one of those directly supported formats. 
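+    // Each entry maps a WIC pixel format with no direct DXGI equivalent to the closest
+    // directly supported WIC format; the trailing comment on each row is the resulting DXGI_FORMAT.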
+ + { GUID_WICPixelFormat1bppIndexed, GUID_WICPixelFormat32bppRGBA }, // DXGI_FORMAT_R8G8B8A8_UNORM + { GUID_WICPixelFormat2bppIndexed, GUID_WICPixelFormat32bppRGBA }, // DXGI_FORMAT_R8G8B8A8_UNORM + { GUID_WICPixelFormat4bppIndexed, GUID_WICPixelFormat32bppRGBA }, // DXGI_FORMAT_R8G8B8A8_UNORM + { GUID_WICPixelFormat8bppIndexed, GUID_WICPixelFormat32bppRGBA }, // DXGI_FORMAT_R8G8B8A8_UNORM + + { GUID_WICPixelFormat2bppGray, GUID_WICPixelFormat8bppGray }, // DXGI_FORMAT_R8_UNORM + { GUID_WICPixelFormat4bppGray, GUID_WICPixelFormat8bppGray }, // DXGI_FORMAT_R8_UNORM + + { GUID_WICPixelFormat16bppGrayFixedPoint, GUID_WICPixelFormat16bppGrayHalf }, // DXGI_FORMAT_R16_FLOAT + { GUID_WICPixelFormat32bppGrayFixedPoint, GUID_WICPixelFormat32bppGrayFloat }, // DXGI_FORMAT_R32_FLOAT + + { GUID_WICPixelFormat16bppBGR555, GUID_WICPixelFormat16bppBGRA5551 }, // DXGI_FORMAT_B5G5R5A1_UNORM + { GUID_WICPixelFormat32bppBGR101010, GUID_WICPixelFormat32bppRGBA1010102 }, // DXGI_FORMAT_R10G10B10A2_UNORM + + { GUID_WICPixelFormat24bppBGR, GUID_WICPixelFormat32bppRGBA }, // DXGI_FORMAT_R8G8B8A8_UNORM + { GUID_WICPixelFormat24bppRGB, GUID_WICPixelFormat32bppRGBA }, // DXGI_FORMAT_R8G8B8A8_UNORM + { GUID_WICPixelFormat32bppPBGRA, GUID_WICPixelFormat32bppRGBA }, // DXGI_FORMAT_R8G8B8A8_UNORM + { GUID_WICPixelFormat32bppPRGBA, GUID_WICPixelFormat32bppRGBA }, // DXGI_FORMAT_R8G8B8A8_UNORM + + { GUID_WICPixelFormat48bppRGB, GUID_WICPixelFormat64bppRGBA }, // DXGI_FORMAT_R16G16B16A16_UNORM + { GUID_WICPixelFormat48bppBGR, GUID_WICPixelFormat64bppRGBA }, // DXGI_FORMAT_R16G16B16A16_UNORM + { GUID_WICPixelFormat64bppBGRA, GUID_WICPixelFormat64bppRGBA }, // DXGI_FORMAT_R16G16B16A16_UNORM + { GUID_WICPixelFormat64bppPRGBA, GUID_WICPixelFormat64bppRGBA }, // DXGI_FORMAT_R16G16B16A16_UNORM + { GUID_WICPixelFormat64bppPBGRA, GUID_WICPixelFormat64bppRGBA }, // DXGI_FORMAT_R16G16B16A16_UNORM + + { GUID_WICPixelFormat48bppRGBFixedPoint, GUID_WICPixelFormat64bppRGBAHalf }, // DXGI_FORMAT_R16G16B16A16_FLOAT + { GUID_WICPixelFormat48bppBGRFixedPoint, GUID_WICPixelFormat64bppRGBAHalf }, // DXGI_FORMAT_R16G16B16A16_FLOAT + { GUID_WICPixelFormat64bppRGBAFixedPoint, GUID_WICPixelFormat64bppRGBAHalf }, // DXGI_FORMAT_R16G16B16A16_FLOAT + { GUID_WICPixelFormat64bppBGRAFixedPoint, GUID_WICPixelFormat64bppRGBAHalf }, // DXGI_FORMAT_R16G16B16A16_FLOAT + { GUID_WICPixelFormat64bppRGBFixedPoint, GUID_WICPixelFormat64bppRGBAHalf }, // DXGI_FORMAT_R16G16B16A16_FLOAT + { GUID_WICPixelFormat64bppRGBHalf, GUID_WICPixelFormat64bppRGBAHalf }, // DXGI_FORMAT_R16G16B16A16_FLOAT + { GUID_WICPixelFormat48bppRGBHalf, GUID_WICPixelFormat64bppRGBAHalf }, // DXGI_FORMAT_R16G16B16A16_FLOAT + + { GUID_WICPixelFormat128bppPRGBAFloat, GUID_WICPixelFormat128bppRGBAFloat }, // DXGI_FORMAT_R32G32B32A32_FLOAT + { GUID_WICPixelFormat128bppRGBFloat, GUID_WICPixelFormat128bppRGBAFloat }, // DXGI_FORMAT_R32G32B32A32_FLOAT + { GUID_WICPixelFormat128bppRGBAFixedPoint, GUID_WICPixelFormat128bppRGBAFloat }, // DXGI_FORMAT_R32G32B32A32_FLOAT + { GUID_WICPixelFormat128bppRGBFixedPoint, GUID_WICPixelFormat128bppRGBAFloat }, // DXGI_FORMAT_R32G32B32A32_FLOAT + + { GUID_WICPixelFormat32bppCMYK, GUID_WICPixelFormat32bppRGBA }, // DXGI_FORMAT_R8G8B8A8_UNORM + { GUID_WICPixelFormat64bppCMYK, GUID_WICPixelFormat64bppRGBA }, // DXGI_FORMAT_R16G16B16A16_UNORM + { GUID_WICPixelFormat40bppCMYKAlpha, GUID_WICPixelFormat64bppRGBA }, // DXGI_FORMAT_R16G16B16A16_UNORM + { GUID_WICPixelFormat80bppCMYKAlpha, GUID_WICPixelFormat64bppRGBA }, // DXGI_FORMAT_R16G16B16A16_UNORM + +#if 
(_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/) || defined(_WIN7_PLATFORM_UPDATE) + { GUID_WICPixelFormat32bppRGB, GUID_WICPixelFormat32bppRGBA }, // DXGI_FORMAT_R8G8B8A8_UNORM + { GUID_WICPixelFormat64bppRGB, GUID_WICPixelFormat64bppRGBA }, // DXGI_FORMAT_R16G16B16A16_UNORM + { GUID_WICPixelFormat64bppPRGBAHalf, GUID_WICPixelFormat64bppRGBAHalf }, // DXGI_FORMAT_R16G16B16A16_FLOAT +#endif + + // We don't support n-channel formats +}; + +namespace DirectX +{ + +//------------------------------------------------------------------------------------- +// Returns the DXGI format and optionally the WIC pixel GUID to convert to +//------------------------------------------------------------------------------------- +static DXGI_FORMAT _DetermineFormat( _In_ const WICPixelFormatGUID& pixelFormat, _In_ DWORD flags, + _Out_opt_ WICPixelFormatGUID* pConvert ) +{ + if ( pConvert ) + memset( pConvert, 0, sizeof(WICPixelFormatGUID) ); + + DXGI_FORMAT format = _WICToDXGI( pixelFormat ); + + if ( format == DXGI_FORMAT_UNKNOWN ) + { + if ( memcmp( &GUID_WICPixelFormat96bppRGBFixedPoint, &pixelFormat, sizeof(WICPixelFormatGUID) ) == 0 ) + { +#if (_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/) || defined(_WIN7_PLATFORM_UPDATE) + if ( _IsWIC2() ) + { + if ( pConvert ) + memcpy( pConvert, &GUID_WICPixelFormat96bppRGBFloat, sizeof(WICPixelFormatGUID) ); + format = DXGI_FORMAT_R32G32B32_FLOAT; + } + else +#endif + { + if ( pConvert ) + memcpy( pConvert, &GUID_WICPixelFormat128bppRGBAFloat, sizeof(WICPixelFormatGUID) ); + format = DXGI_FORMAT_R32G32B32A32_FLOAT; + } + } + else + { + for( size_t i=0; i < _countof(g_WICConvert); ++i ) + { + if ( memcmp( &g_WICConvert[i].source, &pixelFormat, sizeof(WICPixelFormatGUID) ) == 0 ) + { + if ( pConvert ) + memcpy( pConvert, &g_WICConvert[i].target, sizeof(WICPixelFormatGUID) ); + + format = _WICToDXGI( g_WICConvert[i].target ); + assert( format != DXGI_FORMAT_UNKNOWN ); + break; + } + } + } + } + + // Handle special cases based on flags + switch (format) + { + case DXGI_FORMAT_B8G8R8A8_UNORM: // BGRA + case DXGI_FORMAT_B8G8R8X8_UNORM: // BGRX + if ( flags & WIC_FLAGS_FORCE_RGB ) + { + format = DXGI_FORMAT_R8G8B8A8_UNORM; + if ( pConvert ) + memcpy( pConvert, &GUID_WICPixelFormat32bppRGBA, sizeof(WICPixelFormatGUID) ); + } + break; + + case DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM: + if ( flags & WIC_FLAGS_NO_X2_BIAS ) + { + format = DXGI_FORMAT_R10G10B10A2_UNORM; + if ( pConvert ) + memcpy( pConvert, &GUID_WICPixelFormat32bppRGBA1010102, sizeof(WICPixelFormatGUID) ); + } + break; + + case DXGI_FORMAT_B5G5R5A1_UNORM: + case DXGI_FORMAT_B5G6R5_UNORM: + if ( flags & WIC_FLAGS_NO_16BPP ) + { + format = DXGI_FORMAT_R8G8B8A8_UNORM; + if ( pConvert ) + memcpy( pConvert, &GUID_WICPixelFormat32bppRGBA, sizeof(WICPixelFormatGUID) ); + } + break; + + case DXGI_FORMAT_R1_UNORM: + if ( !(flags & WIC_FLAGS_ALLOW_MONO ) ) + { + // By default we want to promote a black & white to gresycale since R1 is not a generally supported D3D format + format = DXGI_FORMAT_R8_UNORM; + if ( pConvert ) + memcpy( pConvert, &GUID_WICPixelFormat8bppGray, sizeof(WICPixelFormatGUID) ); + } + } + + return format; +} + + +//------------------------------------------------------------------------------------- +// Determines metadata for image +//------------------------------------------------------------------------------------- +static HRESULT _DecodeMetadata( _In_ DWORD flags, + _In_ IWICBitmapDecoder *decoder, _In_ IWICBitmapFrameDecode *frame, + _Out_ TexMetadata& metadata, _Out_opt_ WICPixelFormatGUID* 
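+                                // when non-null, receives the WIC pixel format GUID the
+                                // frame should be converted to before its pixels are copied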
pConvert ) +{ + if ( !decoder || !frame ) + return E_POINTER; + + memset( &metadata, 0, sizeof(TexMetadata) ); + metadata.depth = 1; + metadata.mipLevels = 1; + metadata.dimension = TEX_DIMENSION_TEXTURE2D; + + UINT w, h; + HRESULT hr = frame->GetSize( &w, &h ); + if ( FAILED(hr) ) + return hr; + + metadata.width = w; + metadata.height = h; + + if ( flags & WIC_FLAGS_ALL_FRAMES ) + { + UINT fcount; + hr = decoder->GetFrameCount( &fcount ); + if ( FAILED(hr) ) + return hr; + + metadata.arraySize = fcount; + } + else + metadata.arraySize = 1; + + WICPixelFormatGUID pixelFormat; + hr = frame->GetPixelFormat( &pixelFormat ); + if ( FAILED(hr) ) + return hr; + + metadata.format = _DetermineFormat( pixelFormat, flags, pConvert ); + if ( metadata.format == DXGI_FORMAT_UNKNOWN ) + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Decodes a single frame +//------------------------------------------------------------------------------------- +static HRESULT _DecodeSingleFrame( _In_ DWORD flags, _In_ const TexMetadata& metadata, _In_ const WICPixelFormatGUID& convertGUID, + _In_ IWICBitmapFrameDecode *frame, _Inout_ ScratchImage& image ) +{ + if ( !frame ) + return E_POINTER; + + HRESULT hr = image.Initialize2D( metadata.format, metadata.width, metadata.height, 1, 1 ); + if ( FAILED(hr) ) + return hr; + + const Image *img = image.GetImage( 0, 0, 0 ); + if ( !img ) + return E_POINTER; + + IWICImagingFactory* pWIC = _GetWIC(); + if ( !pWIC ) + return E_NOINTERFACE; + + if ( memcmp( &convertGUID, &GUID_NULL, sizeof(GUID) ) == 0 ) + { + hr = frame->CopyPixels( 0, static_cast( img->rowPitch ), static_cast( img->slicePitch ), img->pixels ); + if ( FAILED(hr) ) + return hr; + } + else + { + ScopedObject FC; + hr = pWIC->CreateFormatConverter( &FC ); + if ( FAILED(hr) ) + return hr; + + hr = FC->Initialize( frame, convertGUID, _GetWICDither( flags ), 0, 0, WICBitmapPaletteTypeCustom ); + if ( FAILED(hr) ) + return hr; + + hr = FC->CopyPixels( 0, static_cast( img->rowPitch ), static_cast( img->slicePitch ), img->pixels ); + if ( FAILED(hr) ) + return hr; + } + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Decodes an image array, resizing/format converting as needed +//------------------------------------------------------------------------------------- +static HRESULT _DecodeMultiframe( _In_ DWORD flags, _In_ const TexMetadata& metadata, + _In_ IWICBitmapDecoder *decoder, _Inout_ ScratchImage& image ) +{ + if ( !decoder ) + return E_POINTER; + + HRESULT hr = image.Initialize2D( metadata.format, metadata.width, metadata.height, metadata.arraySize, 1 ); + if ( FAILED(hr) ) + return hr; + + IWICImagingFactory* pWIC = _GetWIC(); + if ( !pWIC ) + return E_NOINTERFACE; + + WICPixelFormatGUID sourceGUID; + if ( !_DXGIToWIC( metadata.format, sourceGUID ) ) + return E_FAIL; + + for( size_t index = 0; index < metadata.arraySize; ++index ) + { + const Image* img = image.GetImage( 0, index, 0 ); + if ( !img ) + return E_POINTER; + + ScopedObject frame; + hr = decoder->GetFrame( static_cast( index ), &frame ); + if ( FAILED(hr) ) + return hr; + + WICPixelFormatGUID pfGuid; + hr = frame->GetPixelFormat( &pfGuid ); + if ( FAILED(hr) ) + return hr; + + UINT w, h; + hr = frame->GetSize( &w, &h ); + if ( FAILED(hr) ) + return hr; + + if ( memcmp( &pfGuid, &sourceGUID, sizeof(WICPixelFormatGUID) ) == 0 ) + { + if ( w == metadata.width && h == 
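+                // Four cases follow: copy the frame directly when its size and format
+                // already match the metadata, scale only, convert only, or convert and
+                // then scale, so every frame lands in the array's common size and format.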
metadata.height ) + { + // This frame does not need resized or format converted, just copy... + hr = frame->CopyPixels( 0, static_cast( img->rowPitch ), static_cast( img->slicePitch ), img->pixels ); + if ( FAILED(hr) ) + return hr; + } + else + { + // This frame needs resizing, but not format converted + ScopedObject scaler; + hr = pWIC->CreateBitmapScaler( &scaler ); + if ( FAILED(hr) ) + return hr; + + hr = scaler->Initialize( frame.Get(), static_cast( metadata.width ), static_cast( metadata.height ), _GetWICInterp( flags ) ); + if ( FAILED(hr) ) + return hr; + + hr = scaler->CopyPixels( 0, static_cast( img->rowPitch ), static_cast( img->slicePitch ), img->pixels ); + if ( FAILED(hr) ) + return hr; + } + } + else + { + // This frame required format conversion + ScopedObject FC; + hr = pWIC->CreateFormatConverter( &FC ); + if ( FAILED(hr) ) + return hr; + + hr = FC->Initialize( frame.Get(), pfGuid, _GetWICDither( flags ), 0, 0, WICBitmapPaletteTypeCustom ); + if ( FAILED(hr) ) + return hr; + + if ( w == metadata.width && h == metadata.height ) + { + // This frame is the same size, no need to scale + hr = FC->CopyPixels( 0, static_cast( img->rowPitch ), static_cast( img->slicePitch ), img->pixels ); + if ( FAILED(hr) ) + return hr; + } + else + { + // This frame needs resizing and format converted + ScopedObject scaler; + hr = pWIC->CreateBitmapScaler( &scaler ); + if ( FAILED(hr) ) + return hr; + + hr = scaler->Initialize( FC.Get(), static_cast( metadata.width ), static_cast( metadata.height ), _GetWICInterp( flags ) ); + if ( FAILED(hr) ) + return hr; + + hr = scaler->CopyPixels( 0, static_cast( img->rowPitch ), static_cast( img->slicePitch ), img->pixels ); + if ( FAILED(hr) ) + return hr; + } + } + } + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Encodes a single frame +//------------------------------------------------------------------------------------- +static HRESULT _EncodeImage( _In_ const Image& image, _In_ DWORD flags, _In_ IWICBitmapFrameEncode* frame, _In_opt_ IPropertyBag2* props, _In_opt_ const GUID* targetFormat ) +{ + if ( !frame ) + return E_INVALIDARG; + + if ( !image.pixels ) + return E_POINTER; + + WICPixelFormatGUID pfGuid; + if ( !_DXGIToWIC( image.format, pfGuid ) ) + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + + HRESULT hr = frame->Initialize( props ); + if ( FAILED(hr) ) + return hr; + +#ifdef _AMD64_ + if ( (image.width > 0xFFFFFFFF) || (image.height > 0xFFFFFFFF) ) + return E_INVALIDARG; +#endif + + hr = frame->SetSize( static_cast( image.width ), static_cast( image.height ) ); + if ( FAILED(hr) ) + return hr; + + hr = frame->SetResolution( 72, 72 ); + if ( FAILED(hr) ) + return hr; + + WICPixelFormatGUID targetGuid = (targetFormat) ? 
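+        // Use the caller-requested container pixel format when one is given, otherwise
+        // the image's own WIC format; if the GUID the encoder settles on differs from
+        // the source format, a format converter is interposed before WriteSource() below.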
(*targetFormat) : pfGuid; + hr = frame->SetPixelFormat( &targetGuid ); + if ( FAILED(hr) ) + return hr; + + if ( memcmp( &targetGuid, &pfGuid, sizeof(WICPixelFormatGUID) ) != 0 ) + { + // Conversion required to write + IWICImagingFactory* pWIC = _GetWIC(); + if ( !pWIC ) + return E_NOINTERFACE; + + ScopedObject source; + hr = pWIC->CreateBitmapFromMemory( static_cast( image.width ), static_cast( image.height ), pfGuid, + static_cast( image.rowPitch ), static_cast( image.slicePitch ), + image.pixels, &source ); + if ( FAILED(hr) ) + return hr; + + ScopedObject FC; + hr = pWIC->CreateFormatConverter( &FC ); + if ( FAILED(hr) ) + return hr; + + hr = FC->Initialize( source.Get(), targetGuid, _GetWICDither( flags ), 0, 0, WICBitmapPaletteTypeCustom ); + if ( FAILED(hr) ) + return hr; + + WICRect rect = { 0, 0, static_cast( image.width ), static_cast( image.height ) }; + hr = frame->WriteSource( FC.Get(), &rect ); + if ( FAILED(hr) ) + return hr; + } + else + { + // No conversion required + hr = frame->WritePixels( static_cast( image.height ), static_cast( image.rowPitch ), static_cast( image.slicePitch ), + reinterpret_cast( image.pixels ) ); + if ( FAILED(hr) ) + return hr; + } + + hr = frame->Commit(); + if ( FAILED(hr) ) + return hr; + + return S_OK; +} + +static HRESULT _EncodeSingleFrame( _In_ const Image& image, _In_ DWORD flags, + _In_ REFGUID guidContainerFormat, _Inout_ IStream* stream, _In_opt_ const GUID* targetFormat ) +{ + if ( !stream ) + return E_INVALIDARG; + + // Initialize WIC + IWICImagingFactory* pWIC = _GetWIC(); + if ( !pWIC ) + return E_NOINTERFACE; + + ScopedObject encoder; + HRESULT hr = pWIC->CreateEncoder( guidContainerFormat, 0, &encoder ); + if ( FAILED(hr) ) + return hr; + + hr = encoder->Initialize( stream, WICBitmapEncoderNoCache ); + if ( FAILED(hr) ) + return hr; + + ScopedObject frame; + ScopedObject props; + hr = encoder->CreateNewFrame( &frame, &props ); + if ( FAILED(hr) ) + return hr; + + if ( memcmp( &guidContainerFormat, &GUID_ContainerFormatBmp, sizeof(WICPixelFormatGUID) ) == 0 ) + { + // Opt-in to the Windows 8 support for writing 32-bit Windows BMP files with an alpha channel if supported + PROPBAG2 option = { 0 }; + option.pstrName = L"EnableV5Header32bppBGRA"; + + VARIANT varValue; + varValue.vt = VT_BOOL; + varValue.boolVal = VARIANT_TRUE; + hr = props->Write( 1, &option, &varValue ); + if ( FAILED(hr) ) + { + // Fails on older versions of WIC, so we default to the null property bag + props.Reset(); + } + } + + hr = _EncodeImage( image, flags, frame.Get(), props.Get(), targetFormat ); + if ( FAILED(hr) ) + return hr; + + hr = encoder->Commit(); + if ( FAILED(hr) ) + return hr; + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Encodes an image array +//------------------------------------------------------------------------------------- +static HRESULT _EncodeMultiframe( _In_count_(nimages) const Image* images, _In_ size_t nimages, _In_ DWORD flags, + _In_ REFGUID guidContainerFormat, _Inout_ IStream* stream, _In_opt_ const GUID* targetFormat ) +{ + if ( !stream || nimages < 2 ) + return E_INVALIDARG; + + if ( !images ) + return E_POINTER; + + // Initialize WIC + IWICImagingFactory* pWIC = _GetWIC(); + if ( !pWIC ) + return E_NOINTERFACE; + + ScopedObject encoder; + HRESULT hr = pWIC->CreateEncoder( guidContainerFormat, 0, &encoder ); + if ( FAILED(hr) ) + return hr; + + ScopedObject einfo; + hr = encoder->GetEncoderInfo( &einfo ); + if ( FAILED(hr) ) + return hr; + + BOOL mframe = 
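+        // Ask the codec whether it can hold more than one frame; encoders that cannot
+        // (typically only TIFF and GIF can) make _EncodeMultiframe() fail with
+        // ERROR_NOT_SUPPORTED rather than silently dropping images.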
FALSE; + hr = einfo->DoesSupportMultiframe( &mframe ); + if ( FAILED(hr) ) + return hr; + + if ( !mframe ) + return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); + + hr = encoder->Initialize( stream, WICBitmapEncoderNoCache ); + if ( FAILED(hr) ) + return hr; + + for( size_t index=0; index < nimages; ++index ) + { + ScopedObject frame; + hr = encoder->CreateNewFrame( &frame, nullptr ); + if ( FAILED(hr) ) + return hr; + + hr = _EncodeImage( images[index], flags, frame.Get(), nullptr, targetFormat ); + if ( FAILED(hr) ) + return hr; + } + + hr = encoder->Commit(); + if ( FAILED(hr) ) + return hr; + + return S_OK; +} + + +//===================================================================================== +// Entry-points +//===================================================================================== + +//------------------------------------------------------------------------------------- +// Obtain metadata from WIC-supported file in memory +//------------------------------------------------------------------------------------- +HRESULT GetMetadataFromWICMemory( LPCVOID pSource, size_t size, DWORD flags, TexMetadata& metadata ) +{ + if ( !pSource || size == 0 ) + return E_INVALIDARG; + +#ifdef _AMD64_ + if ( size > 0xFFFFFFFF ) + return HRESULT_FROM_WIN32( ERROR_FILE_TOO_LARGE ); +#endif + + IWICImagingFactory* pWIC = _GetWIC(); + if ( !pWIC ) + return E_NOINTERFACE; + + // Create input stream for memory + ScopedObject stream; + HRESULT hr = pWIC->CreateStream( &stream ); + if ( FAILED(hr) ) + return hr; + + hr = stream->InitializeFromMemory( reinterpret_cast( const_cast( pSource ) ), + static_cast( size ) ); + if ( FAILED(hr) ) + return hr; + + // Initialize WIC + ScopedObject decoder; + hr = pWIC->CreateDecoderFromStream( stream.Get(), 0, WICDecodeMetadataCacheOnDemand, &decoder ); + if ( FAILED(hr) ) + return hr; + + ScopedObject frame; + hr = decoder->GetFrame( 0, &frame ); + if ( FAILED(hr) ) + return hr; + + // Get metadata + hr = _DecodeMetadata( flags, decoder.Get(), frame.Get(), metadata, 0 ); + if ( FAILED(hr) ) + return hr; + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Obtain metadata from WIC-supported file on disk +//------------------------------------------------------------------------------------- +HRESULT GetMetadataFromWICFile( LPCWSTR szFile, DWORD flags, TexMetadata& metadata ) +{ + if ( !szFile ) + return E_INVALIDARG; + + IWICImagingFactory* pWIC = _GetWIC(); + if ( !pWIC ) + return E_NOINTERFACE; + + // Initialize WIC + ScopedObject decoder; + HRESULT hr = pWIC->CreateDecoderFromFilename( szFile, 0, GENERIC_READ, WICDecodeMetadataCacheOnDemand, &decoder ); + if ( FAILED(hr) ) + return hr; + + ScopedObject frame; + hr = decoder->GetFrame( 0, &frame ); + if ( FAILED(hr) ) + return hr; + + // Get metadata + hr = _DecodeMetadata( flags, decoder.Get(), frame.Get(), metadata, 0 ); + if ( FAILED(hr) ) + return hr; + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Load a WIC-supported file in memory +//------------------------------------------------------------------------------------- +HRESULT LoadFromWICMemory( LPCVOID pSource, size_t size, DWORD flags, TexMetadata* metadata, ScratchImage& image ) +{ + if ( !pSource || size == 0 ) + return E_INVALIDARG; + +#ifdef _AMD64_ + if ( size > 0xFFFFFFFF ) + return HRESULT_FROM_WIN32( ERROR_FILE_TOO_LARGE ); +#endif + + IWICImagingFactory* pWIC = _GetWIC(); + if ( !pWIC ) + return 
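+        // Overall flow shared by the loaders: wrap the source in an IWICStream, create
+        // a decoder, derive TexMetadata (plus an optional conversion GUID), then decode
+        // either the first frame or, with WIC_FLAGS_ALL_FRAMES, every frame into the
+        // caller's ScratchImage.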
E_NOINTERFACE; + + image.Release(); + + // Create input stream for memory + ScopedObject stream; + HRESULT hr = pWIC->CreateStream( &stream ); + if ( FAILED(hr) ) + return hr; + + hr = stream->InitializeFromMemory( reinterpret_cast( const_cast( pSource ) ), static_cast( size ) ); + if ( FAILED(hr) ) + return hr; + + // Initialize WIC + ScopedObject decoder; + hr = pWIC->CreateDecoderFromStream( stream.Get(), 0, WICDecodeMetadataCacheOnDemand, &decoder ); + if ( FAILED(hr) ) + return hr; + + ScopedObject frame; + hr = decoder->GetFrame( 0, &frame ); + if ( FAILED(hr) ) + return hr; + + // Get metadata + TexMetadata mdata; + WICPixelFormatGUID convertGUID = {0}; + hr = _DecodeMetadata( flags, decoder.Get(), frame.Get(), mdata, &convertGUID ); + if ( FAILED(hr) ) + return hr; + + if ( (mdata.arraySize > 1) && (flags & WIC_FLAGS_ALL_FRAMES) ) + { + hr = _DecodeMultiframe( flags, mdata, decoder.Get(), image ); + } + else + { + hr = _DecodeSingleFrame( flags, mdata, convertGUID, frame.Get(), image ); + } + + if ( FAILED(hr) ) + { + image.Release(); + return hr; + } + + if ( metadata ) + memcpy( metadata, &mdata, sizeof(TexMetadata) ); + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Load a WIC-supported file from disk +//------------------------------------------------------------------------------------- +HRESULT LoadFromWICFile( LPCWSTR szFile, DWORD flags, TexMetadata* metadata, ScratchImage& image ) +{ + if ( !szFile ) + return E_INVALIDARG; + + IWICImagingFactory* pWIC = _GetWIC(); + if ( !pWIC ) + return E_NOINTERFACE; + + image.Release(); + + // Initialize WIC + ScopedObject decoder; + HRESULT hr = pWIC->CreateDecoderFromFilename( szFile, 0, GENERIC_READ, WICDecodeMetadataCacheOnDemand, &decoder ); + if ( FAILED(hr) ) + return hr; + + ScopedObject frame; + hr = decoder->GetFrame( 0, &frame ); + if ( FAILED(hr) ) + return hr; + + // Get metadata + TexMetadata mdata; + WICPixelFormatGUID convertGUID = {0}; + hr = _DecodeMetadata( flags, decoder.Get(), frame.Get(), mdata, &convertGUID ); + if ( FAILED(hr) ) + return hr; + + if ( (mdata.arraySize > 1) && (flags & WIC_FLAGS_ALL_FRAMES) ) + { + hr = _DecodeMultiframe( flags, mdata, decoder.Get(), image ); + } + else + { + hr = _DecodeSingleFrame( flags, mdata, convertGUID, frame.Get(), image ); + } + + if ( FAILED(hr) ) + { + image.Release(); + return hr; + } + + if ( metadata ) + memcpy( metadata, &mdata, sizeof(TexMetadata) ); + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Save a WIC-supported file to memory +//------------------------------------------------------------------------------------- +HRESULT SaveToWICMemory( const Image& image, DWORD flags, REFGUID guidContainerFormat, Blob& blob, const GUID* targetFormat ) +{ + if ( !image.pixels ) + return E_POINTER; + + blob.Release(); + + ScopedObject stream; + HRESULT hr = CreateStreamOnHGlobal( 0, TRUE, &stream ); + if ( FAILED(hr) ) + return hr; + + hr = _EncodeSingleFrame( image, flags, guidContainerFormat, stream.Get(), targetFormat ); + if ( FAILED(hr) ) + return hr; + + // Copy stream data into blob + STATSTG stat; + hr = stream->Stat( &stat, STATFLAG_NONAME ); + if ( FAILED(hr) ) + return hr; + + if ( stat.cbSize.HighPart > 0 ) + return HRESULT_FROM_WIN32( ERROR_FILE_TOO_LARGE ); + + hr = blob.Initialize( stat.cbSize.LowPart ); + if ( FAILED(hr) ) + return hr; + + LARGE_INTEGER li = { 0 }; + hr = stream->Seek( li, STREAM_SEEK_SET, 0 ); + if ( 
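+    // The frame was encoded into the memory-backed IStream above; the stream has just
+    // been rewound, and its contents are now read back into the caller's Blob.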
FAILED(hr) ) + return hr; + + DWORD bytesRead; + hr = stream->Read( blob.GetBufferPointer(), static_cast( blob.GetBufferSize() ), &bytesRead ); + if ( FAILED(hr) ) + return hr; + + if ( bytesRead != blob.GetBufferSize() ) + return E_FAIL; + + return S_OK; +} + +HRESULT SaveToWICMemory( const Image* images, size_t nimages, DWORD flags, REFGUID guidContainerFormat, Blob& blob, const GUID* targetFormat ) +{ + if ( !images || nimages == 0 ) + return E_INVALIDARG; + + blob.Release(); + + ScopedObject stream; + HRESULT hr = CreateStreamOnHGlobal( 0, TRUE, &stream ); + if ( FAILED(hr) ) + return hr; + + if ( nimages > 1 ) + hr = _EncodeMultiframe( images, nimages, flags, guidContainerFormat, stream.Get(), targetFormat ); + else + hr = _EncodeSingleFrame( images[0], flags, guidContainerFormat, stream.Get(), targetFormat ); + + if ( FAILED(hr) ) + return hr; + + // Copy stream data into blob + STATSTG stat; + hr = stream->Stat( &stat, STATFLAG_NONAME ); + if ( FAILED(hr) ) + return hr; + + if ( stat.cbSize.HighPart > 0 ) + return HRESULT_FROM_WIN32( ERROR_FILE_TOO_LARGE ); + + hr = blob.Initialize( stat.cbSize.LowPart ); + if ( FAILED(hr) ) + return hr; + + LARGE_INTEGER li = { 0 }; + hr = stream->Seek( li, STREAM_SEEK_SET, 0 ); + if ( FAILED(hr) ) + return hr; + + DWORD bytesRead; + hr = stream->Read( blob.GetBufferPointer(), static_cast( blob.GetBufferSize() ), &bytesRead ); + if ( FAILED(hr) ) + return hr; + + if ( bytesRead != blob.GetBufferSize() ) + return E_FAIL; + + return S_OK; +} + + +//------------------------------------------------------------------------------------- +// Save a WIC-supported file to disk +//------------------------------------------------------------------------------------- +HRESULT SaveToWICFile( const Image& image, DWORD flags, REFGUID guidContainerFormat, LPCWSTR szFile, const GUID* targetFormat ) +{ + if ( !szFile ) + return E_INVALIDARG; + + if ( !image.pixels ) + return E_POINTER; + + IWICImagingFactory* pWIC = _GetWIC(); + if ( !pWIC ) + return E_NOINTERFACE; + + ScopedObject stream; + HRESULT hr = pWIC->CreateStream( &stream ); + if ( FAILED(hr) ) + return hr; + + hr = stream->InitializeFromFilename( szFile, GENERIC_WRITE ); + if ( FAILED(hr) ) + return hr; + + hr = _EncodeSingleFrame( image, flags, guidContainerFormat, stream.Get(), targetFormat ); + if ( FAILED(hr) ) + return hr; + + return S_OK; +} + +HRESULT SaveToWICFile( const Image* images, size_t nimages, DWORD flags, REFGUID guidContainerFormat, LPCWSTR szFile, const GUID* targetFormat ) +{ + if ( !szFile || !images || nimages == 0 ) + return E_INVALIDARG; + + IWICImagingFactory* pWIC = _GetWIC(); + if ( !pWIC ) + return E_NOINTERFACE; + + ScopedObject stream; + HRESULT hr = pWIC->CreateStream( &stream ); + if ( FAILED(hr) ) + return hr; + + hr = stream->InitializeFromFilename( szFile, GENERIC_WRITE ); + if ( FAILED(hr) ) + return hr; + + if ( nimages > 1 ) + hr = _EncodeMultiframe( images, nimages, flags, guidContainerFormat, stream.Get(), targetFormat ); + else + hr = _EncodeSingleFrame( images[0], flags, guidContainerFormat, stream.Get(), targetFormat ); + + if ( FAILED(hr) ) + return hr; + + return S_OK; +} + +}; // namespace diff --git a/thirdparty/directxtex/DirectXTex/scoped.h b/thirdparty/directxtex/DirectXTex/scoped.h new file mode 100644 index 0000000..358290d --- /dev/null +++ b/thirdparty/directxtex/DirectXTex/scoped.h @@ -0,0 +1,70 @@ +//------------------------------------------------------------------------------------- +// scoped.h +// +// Utility header with helper classes for 
exception-safe handling of resources +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +//------------------------------------------------------------------------------------- + +#if defined(_MSC_VER) && (_MSC_VER > 1000) +#pragma once +#endif + +#include +#include +#include + +//--------------------------------------------------------------------------------- +struct aligned_deleter { void operator()(void* p) { _aligned_free(p); } }; + +typedef std::unique_ptr ScopedAlignedArrayFloat; + +#ifdef USE_XNAMATH +typedef std::unique_ptr ScopedAlignedArrayXMVECTOR; +#else +typedef std::unique_ptr ScopedAlignedArrayXMVECTOR; +#endif + +//--------------------------------------------------------------------------------- +struct handle_closer { void operator()(HANDLE h) { assert(h != INVALID_HANDLE_VALUE); if (h) CloseHandle(h); } }; + +typedef public std::unique_ptr ScopedHandle; + +inline HANDLE safe_handle( HANDLE h ) { return (h == INVALID_HANDLE_VALUE) ? 0 : h; } + + +//--------------------------------------------------------------------------------- +template class ScopedObject +{ +public: + explicit ScopedObject( T *p = 0 ) : _pointer(p) {} + ~ScopedObject() + { + if ( _pointer ) + { + _pointer->Release(); + _pointer = nullptr; + } + } + + bool IsNull() const { return (!_pointer); } + + T& operator*() { return *_pointer; } + T* operator->() { return _pointer; } + T** operator&() { return &_pointer; } + + void Reset(T *p = 0) { if ( _pointer ) { _pointer->Release(); } _pointer = p; } + + T* Get() const { return _pointer; } + +private: + ScopedObject(const ScopedObject&); + ScopedObject& operator=(const ScopedObject&); + + T* _pointer; +}; diff --git a/thirdparty/directxtex/Microsoft Public License.rtf b/thirdparty/directxtex/Microsoft Public License.rtf new file mode 100644 index 0000000..390c7ad --- /dev/null +++ b/thirdparty/directxtex/Microsoft Public License.rtf @@ -0,0 +1,234 @@ +{\rtf1\adeflang1025\ansi\ansicpg1252\uc1\adeff1\deff0\stshfdbch0\stshfloch0\stshfhich0\stshfbi0\deflang1033\deflangfe1033\themelang1033\themelangfe0\themelangcs0{\fonttbl{\f0\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f1\fbidi \fswiss\fcharset0\fprq2{\*\panose 020b0604020202020204}Arial;} +{\f34\fbidi \froman\fcharset0\fprq2{\*\panose 02040503050406030204}Cambria Math;}{\f36\fbidi \froman\fcharset0\fprq2{\*\panose 02040503050406030204}Cambria;}{\f38\fbidi \fswiss\fcharset0\fprq2{\*\panose 020b0604030504040204}Tahoma;} +{\f39\fbidi \fswiss\fcharset0\fprq2{\*\panose 00000000000000000000}Verdana;}{\flomajor\f31500\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;} +{\fdbmajor\f31501\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\fhimajor\f31502\fbidi \froman\fcharset0\fprq2{\*\panose 02040503050406030204}Cambria;} +{\fbimajor\f31503\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\flominor\f31504\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;} +{\fdbminor\f31505\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\fhiminor\f31506\fbidi \fswiss\fcharset0\fprq2{\*\panose 020f0502020204030204}Calibri;} +{\fbiminor\f31507\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times 
New Roman;}{\f40\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\f41\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;} +{\f43\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\f44\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\f45\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\f46\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);} +{\f47\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\f48\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\f50\fbidi \fswiss\fcharset238\fprq2 Arial CE;}{\f51\fbidi \fswiss\fcharset204\fprq2 Arial Cyr;} +{\f53\fbidi \fswiss\fcharset161\fprq2 Arial Greek;}{\f54\fbidi \fswiss\fcharset162\fprq2 Arial Tur;}{\f55\fbidi \fswiss\fcharset177\fprq2 Arial (Hebrew);}{\f56\fbidi \fswiss\fcharset178\fprq2 Arial (Arabic);} +{\f57\fbidi \fswiss\fcharset186\fprq2 Arial Baltic;}{\f58\fbidi \fswiss\fcharset163\fprq2 Arial (Vietnamese);}{\f380\fbidi \froman\fcharset238\fprq2 Cambria Math CE;}{\f381\fbidi \froman\fcharset204\fprq2 Cambria Math Cyr;} +{\f383\fbidi \froman\fcharset161\fprq2 Cambria Math Greek;}{\f384\fbidi \froman\fcharset162\fprq2 Cambria Math Tur;}{\f387\fbidi \froman\fcharset186\fprq2 Cambria Math Baltic;}{\f388\fbidi \froman\fcharset163\fprq2 Cambria Math (Vietnamese);} +{\f400\fbidi \froman\fcharset238\fprq2 Cambria CE;}{\f401\fbidi \froman\fcharset204\fprq2 Cambria Cyr;}{\f403\fbidi \froman\fcharset161\fprq2 Cambria Greek;}{\f404\fbidi \froman\fcharset162\fprq2 Cambria Tur;} +{\f407\fbidi \froman\fcharset186\fprq2 Cambria Baltic;}{\f408\fbidi \froman\fcharset163\fprq2 Cambria (Vietnamese);}{\f420\fbidi \fswiss\fcharset238\fprq2 Tahoma CE;}{\f421\fbidi \fswiss\fcharset204\fprq2 Tahoma Cyr;} +{\f423\fbidi \fswiss\fcharset161\fprq2 Tahoma Greek;}{\f424\fbidi \fswiss\fcharset162\fprq2 Tahoma Tur;}{\f425\fbidi \fswiss\fcharset177\fprq2 Tahoma (Hebrew);}{\f426\fbidi \fswiss\fcharset178\fprq2 Tahoma (Arabic);} +{\f427\fbidi \fswiss\fcharset186\fprq2 Tahoma Baltic;}{\f428\fbidi \fswiss\fcharset163\fprq2 Tahoma (Vietnamese);}{\f429\fbidi \fswiss\fcharset222\fprq2 Tahoma (Thai);}{\f430\fbidi \fswiss\fcharset238\fprq2 Verdana CE;} +{\f431\fbidi \fswiss\fcharset204\fprq2 Verdana Cyr;}{\f433\fbidi \fswiss\fcharset161\fprq2 Verdana Greek;}{\f434\fbidi \fswiss\fcharset162\fprq2 Verdana Tur;}{\f437\fbidi \fswiss\fcharset186\fprq2 Verdana Baltic;} +{\f438\fbidi \fswiss\fcharset163\fprq2 Verdana (Vietnamese);}{\flomajor\f31508\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\flomajor\f31509\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;} +{\flomajor\f31511\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\flomajor\f31512\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\flomajor\f31513\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);} +{\flomajor\f31514\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}{\flomajor\f31515\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\flomajor\f31516\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);} +{\fdbmajor\f31518\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\fdbmajor\f31519\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}{\fdbmajor\f31521\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;} +{\fdbmajor\f31522\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\fdbmajor\f31523\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\fdbmajor\f31524\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);} +{\fdbmajor\f31525\fbidi \froman\fcharset186\fprq2 Times New Roman 
Baltic;}{\fdbmajor\f31526\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\fhimajor\f31528\fbidi \froman\fcharset238\fprq2 Cambria CE;} +{\fhimajor\f31529\fbidi \froman\fcharset204\fprq2 Cambria Cyr;}{\fhimajor\f31531\fbidi \froman\fcharset161\fprq2 Cambria Greek;}{\fhimajor\f31532\fbidi \froman\fcharset162\fprq2 Cambria Tur;} +{\fhimajor\f31535\fbidi \froman\fcharset186\fprq2 Cambria Baltic;}{\fhimajor\f31536\fbidi \froman\fcharset163\fprq2 Cambria (Vietnamese);}{\fbimajor\f31538\fbidi \froman\fcharset238\fprq2 Times New Roman CE;} +{\fbimajor\f31539\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}{\fbimajor\f31541\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\fbimajor\f31542\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;} +{\fbimajor\f31543\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\fbimajor\f31544\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}{\fbimajor\f31545\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;} +{\fbimajor\f31546\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\flominor\f31548\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\flominor\f31549\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;} +{\flominor\f31551\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\flominor\f31552\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\flominor\f31553\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);} +{\flominor\f31554\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}{\flominor\f31555\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\flominor\f31556\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);} +{\fdbminor\f31558\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\fdbminor\f31559\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}{\fdbminor\f31561\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;} +{\fdbminor\f31562\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\fdbminor\f31563\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\fdbminor\f31564\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);} +{\fdbminor\f31565\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\fdbminor\f31566\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\fhiminor\f31568\fbidi \fswiss\fcharset238\fprq2 Calibri CE;} +{\fhiminor\f31569\fbidi \fswiss\fcharset204\fprq2 Calibri Cyr;}{\fhiminor\f31571\fbidi \fswiss\fcharset161\fprq2 Calibri Greek;}{\fhiminor\f31572\fbidi \fswiss\fcharset162\fprq2 Calibri Tur;} +{\fhiminor\f31575\fbidi \fswiss\fcharset186\fprq2 Calibri Baltic;}{\fhiminor\f31576\fbidi \fswiss\fcharset163\fprq2 Calibri (Vietnamese);}{\fbiminor\f31578\fbidi \froman\fcharset238\fprq2 Times New Roman CE;} +{\fbiminor\f31579\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}{\fbiminor\f31581\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\fbiminor\f31582\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;} +{\fbiminor\f31583\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\fbiminor\f31584\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}{\fbiminor\f31585\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;} +{\fbiminor\f31586\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}}{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0; 
+\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}{\*\defchp }{\*\defpap +\ql \li0\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 }\noqfpromote {\stylesheet{\ql \li0\ri0\nowidctlpar\wrapdefault\faauto\rin0\lin0\itap0 \rtlch\fcs1 \af1\afs24\alang1025 \ltrch\fcs0 +\f1\fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 \snext0 \sqformat \spriority0 Normal;}{\s1\ql \li0\ri0\nowidctlpar\wrapdefault\faauto\outlinelevel0\rin0\lin0\itap0 \rtlch\fcs1 \af1\afs24\alang1025 \ltrch\fcs0 +\f1\fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink15 \sqformat \spriority9 heading 1;}{\s2\ql \li0\ri0\nowidctlpar\wrapdefault\faauto\outlinelevel1\rin0\lin0\itap0 \rtlch\fcs1 \af1\afs24\alang1025 \ltrch\fcs0 +\f1\fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink16 \sqformat \spriority9 heading 2;}{\*\cs10 \additive \ssemihidden Default Paragraph Font;}{\* +\ts11\tsrowd\trftsWidthB3\trpaddl108\trpaddr108\trpaddfl3\trpaddft3\trpaddfb3\trpaddfr3\trcbpat1\trcfpat1\tblind0\tblindtype3\tsvertalt\tsbrdrt\tsbrdrl\tsbrdrb\tsbrdrr\tsbrdrdgl\tsbrdrdgr\tsbrdrh\tsbrdrv +\ql \li0\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs20\alang1025 \ltrch\fcs0 \fs20\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 \snext11 \ssemihidden \sunhideused Normal Table;}{\*\cs15 \additive +\rtlch\fcs1 \ab\af0\afs32 \ltrch\fcs0 \b\f36\fs32\kerning32 \sbasedon10 \slink1 \slocked \spriority9 Heading 1 Char;}{\*\cs16 \additive \rtlch\fcs1 \ab\ai\af0\afs28 \ltrch\fcs0 \b\i\f36\fs28 \sbasedon10 \slink2 \slocked \spriority9 Heading 2 Char;}{ +\s17\ql \li0\ri0\nowidctlpar\wrapdefault\faauto\rin0\lin0\itap0 \rtlch\fcs1 \af38\afs16\alang1025 \ltrch\fcs0 \f38\fs16\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext17 \slink18 \ssemihidden \sunhideused \styrsid7424395 Balloon Text;} +{\*\cs18 \additive \rtlch\fcs1 \af38\afs16 \ltrch\fcs0 \f38\fs16 \sbasedon10 \slink17 \slocked \ssemihidden \styrsid7424395 Balloon Text Char;}{\*\cs19 \additive \rtlch\fcs1 \af0\afs16 \ltrch\fcs0 \fs16 +\sbasedon10 \ssemihidden \sunhideused \styrsid4538388 annotation reference;}{\s20\ql \li0\ri0\nowidctlpar\wrapdefault\faauto\rin0\lin0\itap0 \rtlch\fcs1 \af1\afs20\alang1025 \ltrch\fcs0 \f1\fs20\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 +\sbasedon0 \snext20 \slink21 \ssemihidden \sunhideused \styrsid4538388 annotation text;}{\*\cs21 \additive \rtlch\fcs1 \af1 \ltrch\fcs0 \f1 \sbasedon10 \slink20 \slocked \ssemihidden \styrsid4538388 Comment Text Char;}{ +\s22\ql \li0\ri0\nowidctlpar\wrapdefault\faauto\rin0\lin0\itap0 \rtlch\fcs1 \ab\af1\afs20\alang1025 \ltrch\fcs0 \b\f1\fs20\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 \sbasedon20 \snext20 \slink23 \ssemihidden \sunhideused \styrsid4538388 +annotation subject;}{\*\cs23 \additive \rtlch\fcs1 \ab\af1 \ltrch\fcs0 \b\f1 \sbasedon21 \slink22 \slocked \ssemihidden \styrsid4538388 Comment Subject Char;}}{\*\rsidtbl \rsid213160\rsid284417\rsid417145\rsid481196\rsid551334\rsid723397\rsid786968 +\rsid1382437\rsid1390003\rsid1521043\rsid1530955\rsid1708989\rsid1783212\rsid1903779\rsid2431884\rsid3165084\rsid3416120\rsid3419781\rsid3754103\rsid3768194\rsid3831520\rsid4538130\rsid4538388\rsid4552277\rsid4680449\rsid4729674\rsid4865270\rsid4987534 
+\rsid5128131\rsid5186068\rsid5601121\rsid5864350\rsid6186044\rsid6311778\rsid6384507\rsid6434687\rsid6561471\rsid6910344\rsid6947552\rsid7033180\rsid7424395\rsid7682010\rsid7690850\rsid7744081\rsid8151618\rsid8196281\rsid8198206\rsid8342723\rsid8350925 +\rsid8722561\rsid8852349\rsid8934457\rsid8944153\rsid9573035\rsid9635349\rsid9638545\rsid9724918\rsid10044820\rsid10095979\rsid10228618\rsid10449644\rsid10494075\rsid11166278\rsid11166751\rsid11285353\rsid11366513\rsid11494815\rsid11932529\rsid12061202 +\rsid12533699\rsid12536400\rsid12916885\rsid13264736\rsid13322831\rsid13440556\rsid13455614\rsid13597357\rsid13768671\rsid14097590\rsid14157399\rsid14229900\rsid14305025\rsid14314735\rsid14436896\rsid14565916\rsid14572556\rsid14688892\rsid14752433 +\rsid14904394\rsid15086147\rsid15749945\rsid15814398\rsid15927751\rsid16071312\rsid16126175\rsid16279402\rsid16391569\rsid16404661\rsid16452939\rsid16537688\rsid16606866\rsid16674896}{\mmathPr\mmathFont34\mbrkBin0\mbrkBinSub0\msmallFrac0\mdispDef1 +\mlMargin0\mrMargin0\mdefJc1\mwrapIndent1440\mintLim0\mnaryLim1}{\info{\title Microsoft Permissive License (Ms-PL)}{\author Jonr}{\operator Chuck Walbourn}{\creatim\yr2007\mo2\dy23\hr15\min10}{\revtim\yr2011\mo8\dy15\hr15\min2} +{\printim\yr2006\mo9\dy28\hr8\min46}{\version3}{\edmins1}{\nofpages1}{\nofwords391}{\nofchars2230}{\*\company Microsoft}{\nofcharsws2616}{\vern49273}}{\*\userprops {\propname _NewReviewCycle}\proptype30{\staticval }}{\*\xmlnstbl {\xmlns1 http://schemas.mi +crosoft.com/office/word/2003/wordml}{\xmlns2 urn:schemas-microsoft-com:office:smarttags}}\paperw12240\paperh15840\margl1440\margr1440\margt1440\margb1440\gutter0\ltrsect +\widowctrl\ftnbj\aenddoc\trackmoves0\trackformatting1\donotembedsysfont0\relyonvml0\donotembedlingdata1\grfdocevents0\validatexml0\showplaceholdtext0\ignoremixedcontent0\saveinvalidxml0\showxmlerrors0\hyphcaps0\horzdoc\dghspace120\dgvspace120 +\dghorigin1701\dgvorigin1984\dghshow0\dgvshow3\jcompress\viewkind1\viewscale100\splytwnine\ftnlytwnine\htmautsp\useltbaln\alntblind\lytcalctblwd\lyttblrtgr\lnbrkrule\nobrkwrptbl\snaptogridincell\allowfieldendsel\wrppunct\asianbrkrule\rsidroot10494075 +\newtblstyruls\nogrowautofit\utinl \fet0{\*\wgrffmtfilter 2450}\ilfomacatclnup0\ltrpar \sectd \ltrsect\linex0\sectdefaultcl\sftnbj {\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang {\pntxta .}} +{\*\pnseclvl3\pndec\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang {\pntxta )}}{\*\pnseclvl5\pndec\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb (} +{\pntxta )}}{\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}\pard\plain \ltrpar +\s1\ql \li0\ri0\sb180\nowidctlpar\wrapdefault\faauto\outlinelevel0\rin0\lin0\itap0 \rtlch\fcs1 \af1\afs24\alang1025 \ltrch\fcs0 \f1\fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 {\rtlch\fcs1 \af1\afs31 \ltrch\fcs0 +\fs31\cf1\kerning36\insrsid10494075\charrsid14688892 Microsoft}{\rtlch\fcs1 \af1\afs31 \ltrch\fcs0 \fs31\cf1\kerning36\insrsid10494075 }{\rtlch\fcs1 \af1\afs31 \ltrch\fcs0 \fs31\cf1\kerning36\insrsid5601121 Public}{\rtlch\fcs1 \af1\afs31 \ltrch\fcs0 +\fs31\cf1\kerning36\insrsid14688892 }{\rtlch\fcs1 \af1\afs31 \ltrch\fcs0 \fs31\cf1\kerning36\insrsid10494075 License (Ms-PL}{\rtlch\fcs1 \af1\afs31 
\ltrch\fcs0 \fs31\cf1\kerning36\insrsid4552277 )}{\rtlch\fcs1 \af1\afs31 \ltrch\fcs0 +\fs31\cf1\kerning36\insrsid10494075 +\par }\pard\plain \ltrpar\ql \li0\ri0\sl336\slmult1\nowidctlpar\wrapdefault\faauto\rin0\lin0\itap0 \rtlch\fcs1 \af1\afs24\alang1025 \ltrch\fcs0 \f1\fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 {\rtlch\fcs1 \ab\af39\afs17 \ltrch\fcs0 +\b\f39\fs17\insrsid10494075 +\par This license governs use of the accompanying software. If you use the software, you accept this license. If you do not accept the license, do not use the software. +\par }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid6910344 +\par }\pard\plain \ltrpar\s2\ql \li0\ri0\nowidctlpar\wrapdefault\faauto\outlinelevel1\rin0\lin0\itap0 \rtlch\fcs1 \af1\afs24\alang1025 \ltrch\fcs0 \f1\fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 {\rtlch\fcs1 \ab\af39\afs23 \ltrch\fcs0 +\b\f39\fs23\insrsid10494075 1. Definitions +\par }\pard\plain \ltrpar\ql \li0\ri0\sl336\slmult1\nowidctlpar\wrapdefault\faauto\rin0\lin0\itap0 \rtlch\fcs1 \af1\afs24\alang1025 \ltrch\fcs0 \f1\fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 {\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 +\f39\fs17\insrsid10494075 The terms \'93reproduce,\'94 \'93reproduction}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid7744081 ,}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 \'94 }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 +\f39\fs17\insrsid551334 \'93derivative works,\'94}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid7744081\charrsid7744081 }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 and \'93distribution\'94 have the same meaning here as under +{\*\xmlopen\xmlns2{\factoidname place}}{\*\xmlopen\xmlns2{\factoidname country-region}}U.S.{\*\xmlclose}{\*\xmlclose} copyright law. +\par }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid12536400 A \'93contribution\'94 is the original software}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid4865270 ,}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid12536400 }{\rtlch\fcs1 +\af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid11932529 or}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid12536400 any additions or changes to the software. 
+\par }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid551334 A \'93c}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid551334\charrsid551334 ontributor\'94 }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid12536400 is}{\rtlch\fcs1 \af39\afs17 +\ltrch\fcs0 \f39\fs17\insrsid12536400\charrsid551334 }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid551334\charrsid551334 any person that }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid12536400 +distributes its contribution under this license.}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 +\par }\pard \ltrpar\ql \li0\ri0\nowidctlpar\wrapdefault\faauto\rin0\lin0\itap0\pararsid14229900 {\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid4729674\delrsid4729674 }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 \'93Licensed patents +\'94 }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid12536400 are }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid3831520 a contributor\rquote s }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 patent claims }{\rtlch\fcs1 +\af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid3831520 that }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 read directly on }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid3831520 its contribution.}{\rtlch\fcs1 \af1 \ltrch\fcs0 +\insrsid14229900\charrsid14229900 +\par }\pard\plain \ltrpar\s2\ql \li0\ri0\nowidctlpar\wrapdefault\faauto\outlinelevel1\rin0\lin0\itap0 \rtlch\fcs1 \af1\afs24\alang1025 \ltrch\fcs0 \f1\fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 {\rtlch\fcs1 \ab\af39\afs23 \ltrch\fcs0 +\b\f39\fs23\insrsid5186068 +\par }{\rtlch\fcs1 \ab\af39\afs23 \ltrch\fcs0 \b\f39\fs23\insrsid10494075 2. Grant of Rights +\par }\pard\plain \ltrpar\ql \li0\ri0\sl336\slmult1\nowidctlpar\wrapdefault\faauto\rin0\lin0\itap0 \rtlch\fcs1 \af1\afs24\alang1025 \ltrch\fcs0 \f1\fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 {\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 +\f39\fs17\insrsid10494075 (A) Copyright Grant- Subject to the terms of this license, including the license conditions and limitations in section 3, }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid3754103 each contributor }{\rtlch\fcs1 \af39\afs17 +\ltrch\fcs0 \f39\fs17\insrsid10494075 grants you a non-exclusive, worldwide, royalty-free copyright license to reproduce }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid3754103 its contribution}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 +\f39\fs17\insrsid10494075 , prepare derivative works of }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid3754103 its contribution}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid12536400 ,}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 +\f39\fs17\insrsid10494075 and distribute }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid3754103 its contribution}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 or any derivative works that you create. 
+\par (B) Patent Grant- Subject to the terms of this license, including the license conditions and limitations in section 3, }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid9724918 each contributor }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 +\f39\fs17\insrsid10494075 grants you a non-exclusive, worldwide, royalty-free license under }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid15814398 its }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 +licensed patents to make, have made, use, sell, offer for sale, }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid1390003 import, }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 and/or otherwise dispose of }{\rtlch\fcs1 \af39\afs17 +\ltrch\fcs0 \f39\fs17\insrsid8944153 its contribution in }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 the software or derivative works of }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid8944153 the contribution in }{\rtlch\fcs1 +\af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 the software. +\par }\pard\plain \ltrpar\s2\ql \li0\ri0\nowidctlpar\wrapdefault\faauto\outlinelevel1\rin0\lin0\itap0 \rtlch\fcs1 \af1\afs24\alang1025 \ltrch\fcs0 \f1\fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 {\rtlch\fcs1 \ab\af39\afs23 \ltrch\fcs0 +\b\f39\fs23\insrsid5186068 +\par }{\rtlch\fcs1 \ab\af39\afs23 \ltrch\fcs0 \b\f39\fs23\insrsid10494075 3. Conditions and Limitations +\par }\pard\plain \ltrpar\ql \li0\ri0\sl336\slmult1\nowidctlpar\wrapdefault\faauto\rin0\lin0\itap0 \rtlch\fcs1 \af1\afs24\alang1025 \ltrch\fcs0 \f1\fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 {\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 +\f39\fs17\insrsid1530955 }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 (A) No Trademark License- This license does not grant you rights to use }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid1708989 any contributors\rquote }{ +\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 name, logo, or trademarks. +\par (B) If you }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid8934457 bring a patent claim against }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10095979 any contributor}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 + over patents that you }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid6947552 claim }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid7682010 are }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid6947552 infringe}{\rtlch\fcs1 +\af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid7682010 d by}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 the software, your }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid7682010 patent }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 +\f39\fs17\insrsid10494075 license}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid7682010 from such contributor}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 to the software ends automatically. +\par (C) If you distribute }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid3165084 any portion of }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 +the software, you must retain all copyright, patent, trademark, and attribution notices that are present in the software. 
+\par (D) If you distribute }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid15749945 any portion of the }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 software in source code form}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 +\f39\fs17\insrsid14904394 ,}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 you may do so only under this license}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid6384507 }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 +\f39\fs17\insrsid14904394 by including }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 a complete copy of this license with your distribution}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid6384507 .}{\rtlch\fcs1 \af39\afs17 +\ltrch\fcs0 \f39\fs17\insrsid10494075 }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid6384507 I}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 f you distribute }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid15749945 +any portion of }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 the software in compiled or object code form}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid16452939 ,}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 + you may only do so under a license that complies with this license. +\par }\pard \ltrpar\ql \li0\ri0\sl336\slmult1\nowidctlpar\wrapdefault\faauto\rin0\lin0\itap0\pararsid14572556 {\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 (E) The software is licensed \'93as-is.\'94 You bear the risk of using it. }{ +\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid284417 The contributors }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 +give no express warranties, guarantees or conditions. You may have additional consumer rights under your local laws which this license cannot change. 
To the extent permitted under your local laws, }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 +\f39\fs17\insrsid1783212 the contributors }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 exclude the implied warranties of merchantability, fitness for a particular purpose and non-infringement.}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 +\f39\fs17\insrsid10494075\charrsid14572556 +\par }{\*\themedata 504b030414000600080000002100e9de0fbfff0000001c020000130000005b436f6e74656e745f54797065735d2e786d6cac91cb4ec3301045f748fc83e52d4a +9cb2400825e982c78ec7a27cc0c8992416c9d8b2a755fbf74cd25442a820166c2cd933f79e3be372bd1f07b5c3989ca74aaff2422b24eb1b475da5df374fd9ad +5689811a183c61a50f98f4babebc2837878049899a52a57be670674cb23d8e90721f90a4d2fa3802cb35762680fd800ecd7551dc18eb899138e3c943d7e503b6 +b01d583deee5f99824e290b4ba3f364eac4a430883b3c092d4eca8f946c916422ecab927f52ea42b89a1cd59c254f919b0e85e6535d135a8de20f20b8c12c3b0 +0c895fcf6720192de6bf3b9e89ecdbd6596cbcdd8eb28e7c365ecc4ec1ff1460f53fe813d3cc7f5b7f020000ffff0300504b030414000600080000002100a5d6 +a7e7c0000000360100000b0000005f72656c732f2e72656c73848fcf6ac3300c87ef85bd83d17d51d2c31825762fa590432fa37d00e1287f68221bdb1bebdb4f +c7060abb0884a4eff7a93dfeae8bf9e194e720169aaa06c3e2433fcb68e1763dbf7f82c985a4a725085b787086a37bdbb55fbc50d1a33ccd311ba548b6309512 +0f88d94fbc52ae4264d1c910d24a45db3462247fa791715fd71f989e19e0364cd3f51652d73760ae8fa8c9ffb3c330cc9e4fc17faf2ce545046e37944c69e462 +a1a82fe353bd90a865aad41ed0b5b8f9d6fd010000ffff0300504b0304140006000800000021006b799616830000008a0000001c0000007468656d652f746865 +6d652f7468656d654d616e616765722e786d6c0ccc4d0ac3201040e17da17790d93763bb284562b2cbaebbf600439c1a41c7a0d29fdbd7e5e38337cedf14d59b +4b0d592c9c070d8a65cd2e88b7f07c2ca71ba8da481cc52c6ce1c715e6e97818c9b48d13df49c873517d23d59085adb5dd20d6b52bd521ef2cdd5eb9246a3d8b +4757e8d3f729e245eb2b260a0238fd010000ffff0300504b03041400060008000000210096b5ade296060000501b0000160000007468656d652f7468656d652f +7468656d65312e786d6cec594f6fdb3614bf0fd87720746f6327761a07758ad8b19b2d4d1bc46e871e698996d850a240d2497d1bdae38001c3ba618715d86d87 +615b8116d8a5fb34d93a6c1dd0afb0475292c5585e9236d88aad3e2412f9e3fbff1e1fa9abd7eec70c1d1221294fda5efd72cd4324f1794093b0eddd1ef62fad +79482a9c0498f184b4bd2991deb58df7dfbb8ad755446282607d22d771db8b944ad79796a40fc3585ee62949606ecc458c15bc8a702910f808e8c66c69b9565b +5d8a314d3c94e018c8de1a8fa94fd05093f43672e23d06af89927ac06762a049136785c10607758d9053d965021d62d6f6804fc08f86e4bef210c352c144dbab +999fb7b4717509af678b985ab0b6b4ae6f7ed9ba6c4170b06c788a705430adf71bad2b5b057d03606a1ed7ebf5babd7a41cf00b0ef83a6569632cd467faddec9 +699640f6719e76b7d6ac355c7c89feca9cccad4ea7d36c65b258a206641f1b73f8b5da6a6373d9c11b90c537e7f08dce66b7bbeae00dc8e257e7f0fd2badd586 +8b37a088d1e4600ead1ddaef67d40bc898b3ed4af81ac0d76a197c86826828a24bb318f3442d8ab518dfe3a20f000d6458d104a9694ac6d88728eee2782428d6 +0cf03ac1a5193be4cbb921cd0b495fd054b5bd0f530c1931a3f7eaf9f7af9e3f45c70f9e1d3ff8e9f8e1c3e3073f5a42ceaa6d9c84e5552fbffdeccfc71fa33f +9e7ef3f2d117d57859c6fffac327bffcfc793510d26726ce8b2f9ffcf6ecc98baf3efdfdbb4715f04d814765f890c644a29be408edf3181433567125272371be +15c308d3f28acd249438c19a4b05fd9e8a1cf4cd296699771c393ac4b5e01d01e5a30a787d72cf1178108989a2159c77a2d801ee72ce3a5c545a6147f32a9979 +3849c26ae66252c6ed637c58c5bb8b13c7bfbd490a75330f4b47f16e441c31f7184e140e494214d273fc80900aedee52ead87597fa824b3e56e82e451d4c2b4d +32a423279a668bb6690c7e9956e90cfe766cb37b077538abd27a8b1cba48c80acc2a841f12e698f13a9e281c57911ce298950d7e03aba84ac8c154f8655c4f2a 
+f074481847bd804859b5e696007d4b4edfc150b12addbecba6b18b148a1e54d1bc81392f23b7f84137c2715a851dd0242a633f900710a218ed715505dfe56e86 +e877f0034e16bafb0e258ebb4faf06b769e888340b103d3311da9750aa9d0a1cd3e4efca31a3508f6d0c5c5c398602f8e2ebc71591f5b616e24dd893aa3261fb +44f95d843b5974bb5c04f4edafb95b7892ec1108f3f98de75dc97d5772bdff7cc95d94cf672db4b3da0a6557f70db629362d72bcb0431e53c6066acac80d699a +6409fb44d08741bdce9c0e4971624a2378cceaba830b05366b90e0ea23aaa241845368b0eb9e2612ca8c742851ca251ceccc70256d8d87265dd96361531f186c +3d9058edf2c00eafe8e1fc5c509031bb4d680e9f39a3154de0accc56ae644441edd76156d7429d995bdd88664a9dc3ad50197c38af1a0c16d684060441db0256 +5e85f3b9660d0713cc48a0ed6ef7dedc2dc60b17e92219e180643ed27acffba86e9c94c78ab90980d8a9f0913ee49d62b512b79626fb06dccee2a432bbc60276 +b9f7dec44b7904cfbca4f3f6443ab2a49c9c2c41476dafd55c6e7ac8c769db1bc399161ee314bc2e75cf8759081743be1236ec4f4d6693e5336fb672c5dc24a8 +c33585b5fb9cc24e1d4885545b58463634cc5416022cd19cacfccb4d30eb45296023fd35a458598360f8d7a4003bbaae25e331f155d9d9a5116d3bfb9a95523e +51440ca2e0088dd844ec6370bf0e55d027a012ae264c45d02f708fa6ad6da6dce29c255df9f6cae0ec38666984b372ab5334cf640b37795cc860de4ae2816e95 +b21be5ceaf8a49f90b52a51cc6ff3355f47e0237052b81f6800fd7b802239daf6d8f0b1571a8426944fdbe80c6c1d40e8816b88b8569082ab84c36ff0539d4ff +6dce591a26ade1c0a7f669880485fd484582903d284b26fa4e2156cff62e4b9265844c4495c495a9157b440e091bea1ab8aaf7760f4510eaa69a6465c0e04ec6 +9ffb9e65d028d44d4e39df9c1a52ecbd3607fee9cec7263328e5d661d3d0e4f62f44acd855ed7ab33cdf7bcb8ae889599bd5c8b3029895b6825696f6af29c239 +b75a5bb1e6345e6ee6c28117e73586c1a2214ae1be07e93fb0ff51e133fb65426fa843be0fb515c187064d0cc206a2fa926d3c902e907670048d931db4c1a449 +59d366ad93b65abe595f70a75bf03d616c2dd959fc7d4e6317cd99cbcec9c58b34766661c7d6766ca1a9c1b327531486c6f941c638c67cd22a7f75e2a37be0e8 +2db8df9f30254d30c1372581a1f51c983c80e4b71ccdd28dbf000000ffff0300504b0304140006000800000021000dd1909fb60000001b010000270000007468 +656d652f7468656d652f5f72656c732f7468656d654d616e616765722e786d6c2e72656c73848f4d0ac2301484f78277086f6fd3ba109126dd88d0add40384e4 +350d363f2451eced0dae2c082e8761be9969bb979dc9136332de3168aa1a083ae995719ac16db8ec8e4052164e89d93b64b060828e6f37ed1567914b284d2624 +52282e3198720e274a939cd08a54f980ae38a38f56e422a3a641c8bbd048f7757da0f19b017cc524bd62107bd5001996509affb3fd381a89672f1f165dfe5141 +73d9850528a2c6cce0239baa4c04ca5bbabac4df000000ffff0300504b01022d0014000600080000002100e9de0fbfff0000001c020000130000000000000000 +0000000000000000005b436f6e74656e745f54797065735d2e786d6c504b01022d0014000600080000002100a5d6a7e7c0000000360100000b00000000000000 +000000000000300100005f72656c732f2e72656c73504b01022d00140006000800000021006b799616830000008a0000001c0000000000000000000000000019 +0200007468656d652f7468656d652f7468656d654d616e616765722e786d6c504b01022d001400060008000000210096b5ade296060000501b00001600000000 +000000000000000000d60200007468656d652f7468656d652f7468656d65312e786d6c504b01022d00140006000800000021000dd1909fb60000001b01000027 +00000000000000000000000000a00900007468656d652f7468656d652f5f72656c732f7468656d654d616e616765722e786d6c2e72656c73504b050600000000050005005d0100009b0a00000000} +{\*\colorschememapping 3c3f786d6c2076657273696f6e3d22312e302220656e636f64696e673d225554462d3822207374616e64616c6f6e653d22796573223f3e0d0a3c613a636c724d +617020786d6c6e733a613d22687474703a2f2f736368656d61732e6f70656e786d6c666f726d6174732e6f72672f64726177696e676d6c2f323030362f6d6169 
+6e22206267313d226c743122207478313d22646b3122206267323d226c743222207478323d22646b322220616363656e74313d22616363656e74312220616363 +656e74323d22616363656e74322220616363656e74333d22616363656e74332220616363656e74343d22616363656e74342220616363656e74353d22616363656e74352220616363656e74363d22616363656e74362220686c696e6b3d22686c696e6b2220666f6c486c696e6b3d22666f6c486c696e6b222f3e} +{\*\latentstyles\lsdstimax267\lsdlockeddef0\lsdsemihiddendef1\lsdunhideuseddef1\lsdqformatdef0\lsdprioritydef99{\lsdlockedexcept \lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority0 \lsdlocked0 Normal; +\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 1;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 2;\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 3; +\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 4;\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 5;\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 6;\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 7;\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 8; +\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 9;\lsdqformat1 \lsdpriority39 \lsdlocked0 toc 1;\lsdqformat1 \lsdpriority39 \lsdlocked0 toc 2;\lsdqformat1 \lsdpriority39 \lsdlocked0 toc 3;\lsdqformat1 \lsdpriority39 \lsdlocked0 toc 4; +\lsdqformat1 \lsdpriority39 \lsdlocked0 toc 5;\lsdqformat1 \lsdpriority39 \lsdlocked0 toc 6;\lsdqformat1 \lsdpriority39 \lsdlocked0 toc 7;\lsdqformat1 \lsdpriority39 \lsdlocked0 toc 8;\lsdqformat1 \lsdpriority39 \lsdlocked0 toc 9; +\lsdqformat1 \lsdpriority35 \lsdlocked0 caption;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority10 \lsdlocked0 Title;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority11 \lsdlocked0 Subtitle; +\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority22 \lsdlocked0 Strong;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority20 \lsdlocked0 Emphasis;\lsdsemihidden0 \lsdunhideused0 \lsdpriority59 \lsdlocked0 Table Grid; +\lsdunhideused0 \lsdlocked0 Placeholder Text;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority1 \lsdlocked0 No Spacing;\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List;\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid;\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List;\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading;\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid;\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List Accent 1; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 1; +\lsdsemihidden0 
\lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1 Accent 1;\lsdunhideused0 \lsdlocked0 Revision;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority34 \lsdlocked0 List Paragraph; +\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority29 \lsdlocked0 Quote;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority30 \lsdlocked0 Intense Quote;\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2 Accent 1; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 1; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List Accent 1; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List Accent 2; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 2; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1 Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2 Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 2; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List Accent 2; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid Accent 2; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid Accent 3; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1 Accent 3; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2 Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 3; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading Accent 3; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading Accent 4; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 
Light Grid Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 4; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1 Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2 Accent 4; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 4; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List Accent 4; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List Accent 5; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 5; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1 Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2 Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 5; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List Accent 5; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid Accent 5; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid Accent 6; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1 Accent 6; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2 Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 6; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading Accent 6; +\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority19 \lsdlocked0 Subtle Emphasis; +\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority21 \lsdlocked0 Intense Emphasis;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority31 \lsdlocked0 Subtle Reference; +\lsdsemihidden0 \lsdunhideused0 
\lsdqformat1 \lsdpriority32 \lsdlocked0 Intense Reference;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority33 \lsdlocked0 Book Title;\lsdpriority37 \lsdlocked0 Bibliography; +\lsdqformat1 \lsdpriority39 \lsdlocked0 TOC Heading;}}{\*\datastore 0105000002000000180000004d73786d6c322e534158584d4c5265616465722e362e3000000000000000000000060000 +d0cf11e0a1b11ae1000000000000000000000000000000003e000300feff090006000000000000000000000001000000010000000000000000100000feffffff00000000feffffff0000000000000000ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff +fffffffffffffffffdfffffffeffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff +ffffffffffffffffffffffffffffffff52006f006f007400200045006e00740072007900000000000000000000000000000000000000000000000000000000000000000000000000000000000000000016000500ffffffffffffffffffffffff0c6ad98892f1d411a65f0040963251e5000000000000000000000000808a +33fc965bcc01feffffff00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000ffffffffffffffffffffffff00000000000000000000000000000000000000000000000000000000 +00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000ffffffffffffffffffffffff0000000000000000000000000000000000000000000000000000 +000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000ffffffffffffffffffffffff000000000000000000000000000000000000000000000000 +0000000000000000000000000000000000000000000000000105000000000000}} \ No newline at end of file diff --git a/thirdparty/directxtex/ReadMe.txt 
b/thirdparty/directxtex/ReadMe.txt new file mode 100644 index 0000000..0423b92 --- /dev/null +++ b/thirdparty/directxtex/ReadMe.txt @@ -0,0 +1,192 @@ +DIRECTX TEXTURE LIBRARY (DirectXTex) +------------------------------------ + +Copyright (c) Microsoft Corporation. All rights reserved. + +November 15, 2012 + +This package contains DirectXTex, a shared source library for reading and writing DDS +files, and performing various texture content processing operations including +resizing, format conversion, mip-map generation, block compression for Direct3D runtime +texture resources, and height-map to normal-map conversion. This library makes +use of the Windows Imaging Component (WIC) APIs. It also includes a simple .TGA reader and +writer since this image file format is commonly used for texture content processing pipelines, +but is not currently supported by a built-in WIC codec. + +The source is written for Visual C++ 2010 using the Direct3D headers from either +a current DirectX SDK or Windows SDK. It can also be compiled using Visual Studio 2012 and the +Windows SDK 8.0 headers. + +It is recommended that you make use of Visual C++ 2010 Service Pack 1 or VS 2012, and +Windows 7 Service Pack 1 or Windows 8. + +DDSTextureLoader\ + This contains a streamlined version of the DirectX SDK sample DDSWithoutD3DX11 texture + loading code for a simple light-weight runtime DDS loader. This version only supports + Direct3D 11 and performs no runtime pixel data conversions (i.e. 24bpp legacy DDS files + always fail). This is ideal for runtime usage, and supports the full complement of + Direct3D 11 texture resources (1D, 2D, volume maps, cubemaps, mipmap levels, + texture arrays, BC formats, etc.). + +WICTextureLoader\ + This contains a Direct3D 11 2D texture loader that uses WIC to load a bitmap + (BMP, JPEG, PNG, HD Photo, or other WIC supported file container), resize if needed + based on the current feature level (or by explicit parameter), format convert to a + DXGI_FORMAT if required, and then create a 2D texture. Furthermore, if a Direct3D 11 + device context is provided and the current device supports it for the given pixel format, + it will auto-generate mipmaps. Note this does not support 1D textures, volume textures, + cubemaps, or texture arrays. DDSTextureLoader is recommended for fully "precooked" textures + for maximum performance and image quality, but this loader can be useful for creating + simple 2D textures from standard image files at runtime. + + Note: This function is not thread-safe if given a non-NULL device context for the auto-gen + mip-map support. + +DirectXTex\ + This contains the DirectXTex library. This includes a full-featured DDS reader and writer + including legacy format conversions, a TGA reader and writer, a WIC-based bitmap reader and + writer (BMP, JPEG, PNG, TIFF, and HD Photo), and various texture processing functions. This + is intended primarily for tool usage. + + Note that the majority of the header files here are intended for internal implementation + of the library only (BC.h, DDS.h, DirectXTexP.h, and scoped.h). Only DirectXTex.h is + meant as a 'public' header for the library. + +Texconv\ + This DirectXTex sample is an implementation of the "texconv" command-line texture utility + from the DirectX SDK utilizing DirectXTex rather than D3DX. + + It supports the same arguments as the Texture Conversion Tool Extended (texconvex.exe) DirectX + SDK utility. See .
The primary differences + are that the -10 and -11 arguments are not applicable; the filter names are POINT, LINEAR, CUBIC, + FANT, POINT_DITHER, LINEAR_DITHER, CUBIC_DITHER, and FANT_DITHER; and the .TGA file format is supported. + This also includes support for JPEG XR/HD Photo bitmap formats (see + ) + +DDSView\ + This DirectXTex sample is a simple Direct3D 11-based viewer for DDS files. For array textures + or volume maps, the "<" and ">" keyboard keys will show different images contained in the DDS. + The "1" through "0" keys can also be used to jump to a specific image index. + +XNAMath\ + This contains a copy of XNA Math version 2.05, which is an updated version of the library. This is + required if building content with USE_XNAMATH (the default for the VS 2010 projects). The VS 2012 + projects use DirectXMath in the Windows SDK 8.0 instead. + For details see + + +All content and source code for this package except XNA Math are bound to the Microsoft Public License (Ms-PL) +. The XNA Math library is subject +to the DirectX SDK (June 2010) End-User License Agreement. + +http://go.microsoft.com/fwlink/?LinkId=248926 + + +------------------------------------ +RELEASE NOTES + +* The DirectXTex library does not support block compression or decompression of mipmapped non-power-of-2 textures, + although DDSTextureLoader will load these files correctly if the underlying device supports it. + +* The DirectXTex library only supports CLAMP filtering, and does not yet support MIRROR or WRAP filtering + (WIC operations only support CLAMP filtering). + +* The DirectXTex library only supports box and POINT filtering, and does not support LINEAR or CUBIC filtering, + for 3D volume mipmap-generation. + +* Due to the underlying Windows BMP WIC codec, alpha channels are not supported for 16bpp or 32bpp BMP pixel format files. The Windows 8 + version of the Windows BMP WIC codec does support 32bpp pixel formats with alpha when using the BITMAPV5HEADER file header. Note the updated + WIC is available on Windows 7 SP1 with KB 2670838 installed. + +* The WIC conversion cases currently ignore TEX_FILTER_SRGB_IN and TEX_FILTER_SRGB_OUT. + +* For the DXGI 1.1 version of DirectXTex, 4:4:4:4 pixel format DDS files are always expanded to 8:8:8:8 upon load since DXGI 1.0 + and DXGI 1.1 versions of Direct3D do not support these resource formats. The DXGI 1.2 versions of DirectXTex and DDSTextureLoader + make use of the DXGI_FORMAT_B4G4R4A4_UNORM format instead. + +* While DXGI 1.0 and DXGI 1.1 include 5:6:5 (DXGI_FORMAT_B5G6R5_UNORM) and 5:5:5:1 (DXGI_FORMAT_B5G5R5A1_UNORM) + pixel format enumerations, the DirectX 10.x and 11.0 Runtimes do not support these formats for use with Direct3D. The DirectX 11.1 runtime, + DXGI 1.2, and the WDDM 1.2 driver model fully support 16bpp formats (5:6:5, 5:5:5:1, and 4:4:4:4). The DXGI 1.2 version of WICTextureLoader + will load 16bpp pixel images as 5:6:5 or 5:5:5:1 rather than expand them to 32bpp RGBA. + +* WICTextureLoader cannot load .TGA files unless the system has a 3rd party WIC codec installed. You must use the DirectXTex + library for TGA file format support without relying on an add-on WIC codec. + +* Loading of 96bpp floating-point TIFF files results in a corrupted image prior to Windows 8. This fix is available on Windows 7 SP1 with + KB 2670838 installed.
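As a quick illustration of the tool-oriented DirectXTex workflow described above, the following sketch loads a DDS file into a ScratchImage, generates a full mip chain, block-compresses the result to BC1, and writes it back out. The function names and flag values used here (LoadFromDDSFile, GenerateMipMaps, Compress, SaveToDDSFile, DDS_FLAGS_NONE, TEX_FILTER_DEFAULT, TEX_COMPRESS_DEFAULT) are assumed to match the declarations in DirectXTex.h as imported by this patch; treat the signatures in that header as authoritative.

    #include <windows.h>
    #include "DirectXTex.h"

    // Sketch only: load a DDS, build a full mip chain, compress to BC1, save.
    HRESULT ProcessTexture( const wchar_t* inFile, const wchar_t* outFile )
    {
        using namespace DirectX;

        TexMetadata info;
        ScratchImage source;
        HRESULT hr = LoadFromDDSFile( inFile, DDS_FLAGS_NONE, &info, source );
        if ( FAILED(hr) ) return hr;

        // levels == 0 requests a full mip chain down to 1x1
        ScratchImage mipChain;
        hr = GenerateMipMaps( *source.GetImage(0, 0, 0), TEX_FILTER_DEFAULT, 0, mipChain );
        if ( FAILED(hr) ) return hr;

        // 0.5f is the conventional alpha threshold for BC1 punch-through alpha
        ScratchImage compressed;
        hr = Compress( mipChain.GetImages(), mipChain.GetImageCount(), mipChain.GetMetadata(),
                       DXGI_FORMAT_BC1_UNORM, TEX_COMPRESS_DEFAULT, 0.5f, compressed );
        if ( FAILED(hr) ) return hr;

        return SaveToDDSFile( compressed.GetImages(), compressed.GetImageCount(),
                              compressed.GetMetadata(), DDS_FLAGS_NONE, outFile );
    }

Note that, per the release notes above, block compression of mipmapped non-power-of-2 textures is not supported, so the Compress step in this sketch assumes power-of-2 source dimensions.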
+ + +------------------------------------ +RELEASE HISTORY + +November 15, 2012 + Added support for WIC2 when available on Windows 8 and Windows 7 with KB 2670838 + Added optional targetGUID parameter to SaveWIC* APIs to influence final container pixel format choice + Fixed bug in SaveDDS* which was generating invalid DDS files for 1D dimension textures + Improved robustness of CaptureTexture when resolving MSAA source textures + Sync'd DDSTextureLoader, ScreenGrab, and WICTextureLoader standalone versions with latest DirectXTK release + +September 28, 2012 + Added ScreenGrab module for creating runtime screenshots + Renamed project files for better naming consistency + New Typeless utilities for DirectXTex + Some minor code cleanup for DirectXTex's WIC writer function + Bug fixes and new -tu/-tf options for texconv + +June 22, 2012 + Moved to using XNA Math 2.05 instead of XNA Math 2.04 for USE_XNAMATH builds + Fixed BGR vs. RGB color channel swizzle problem with 24bpp legacy .DDS files in DirectXTex + Update to DirectXTex WIC and WICTextureLoader for additional 96bpp float format handling on Windows 8 + +May 31, 2012 + Minor fix for DDSTextureLoader's retry fallback that can happen with 10level9 feature levels + Switched to use "_DEBUG" instead of "DEBUG" and cleaned up debug warnings + Added Metro style application project files for DirectXTex + +April 20, 2012 + DirectXTex's WIC-based writer opts-in for the Windows 8 BMP encoder option for writing 32 bpp RGBA files with the BITMAPV5HEADER + +March 30, 2012 + WICTextureLoader updated with Windows 8 WIC pixel formats + DirectXTex updated with limited non-power-of-2 texture support and TEX_FILTER_SEPARATE_ALPHA option + Texconv updated with '-sepalpha' command-line option + Added USE_XNAMATH control define to build DirectXTex using either XNAMath or DirectXMath + Added VS 2012 project files (which use DirectXMath instead of XNAMath and define DXGI_1_2_FORMATS) + +March 15, 2012 + Fix for resource leak in CreateShaderResourceView() Direct3D 11 helper function in DirectXTex + +March 5, 2012 + Fix for too much temp memory allocated by WICTextureLoader; cleaned up legacy 'min/max' macro usage in DirectXTex + +February 21, 2012 + WICTextureLoader updated to handle systems and device drivers without BGRA or 16bpp format support + +February 20, 2012 + Some code cleanup for DirectXTex and DDSTextureLoader + Fixed bug in 10:10:10:2 format fixup in the LoadDDSFromMemory function + Fixed bugs in "non-zero alpha" special-case handling in LoadTGAFromFile + Fixed bug in _SwizzleScanline when copying alpha channel for BGRA<->RGBA swizzling + +February 11, 2012 + Update of DDSTextureLoader to also build in Metro style apps; added WICTextureLoader + Added CMYK WIC pixel formats to the DirectXTex conversion table + +January 30, 2012 + Minor code-cleanup for DirectXTex to enable use of PCH through 'directxtexp.h' header + +January 24, 2012 + Some code-cleanup for DirectXTex + Added DXGI 1.2 implementation for DDSTextureLoader and DirectXTex guarded with DXGI_1_2_FORMATS compilation define + +December 16, 2011 + Fixed x64 compilation warnings in DDSTextureLoader + +November 30, 2011 + Fixed some of the constants used in IsSupportedTexture(), + added ability to strip off top levels of mips in DDSTextureLoader, + changed DirectXTex to use CoCreateInstance rather than LoadLibrary to obtain the WIC factory, + a few minor /analyze related annotations for DirectXTex + +October 27, 2011 + Original release \ No newline at end of file diff --git 
a/thirdparty/directxtex/XNAMath/xnamath.h b/thirdparty/directxtex/XNAMath/xnamath.h new file mode 100644 index 0000000..941af11 --- /dev/null +++ b/thirdparty/directxtex/XNAMath/xnamath.h @@ -0,0 +1,3397 @@ +/************************************************************************ +* * +* XNAMath.h -- SIMD C++ Math library for Windows and Xbox 360 * +* * +* Copyright (c) Microsoft Corp. All rights reserved. * +* * +************************************************************************/ + +#if defined(_MSC_VER) && (_MSC_VER > 1000) +#pragma once +#endif + +#ifndef __XNAMATH_H__ +#define __XNAMATH_H__ + +#ifdef __XBOXMATH_H__ +#error XNAMATH and XBOXMATH are incompatible in the same compilation module. Use one or the other. +#endif + +#define XNAMATH_VERSION 205 + +#if !defined(_XM_X64_) && !defined(_XM_X86_) +#if defined(_M_AMD64) || defined(_AMD64_) +#define _XM_X64_ +#elif defined(_M_IX86) || defined(_X86_) +#define _XM_X86_ +#endif +#endif + + +#if !defined(_XM_BIGENDIAN_) && !defined(_XM_LITTLEENDIAN_) +#if defined(_XM_X64_) || defined(_XM_X86_) +#define _XM_LITTLEENDIAN_ +#elif defined(_XBOX_VER) +#define _XM_BIGENDIAN_ +#else +#error xnamath.h does not support this target +#endif +#endif + +#if defined(_XM_X86_) || defined(_XM_X64_) +#define _XM_SSE_INTRINSICS_ +#if !defined(__cplusplus) && !defined(_XM_NO_INTRINSICS_) +#error xnamath.h only supports C compliation for Xbox 360 targets and no intrinsics cases for x86/x64 +#endif +#elif defined(_XBOX_VER) +#if !defined(__VMX128_SUPPORTED) && !defined(_XM_NO_INTRINSICS_) +#error xnamath.h requires VMX128 compiler support for XBOX 360 +#endif // !__VMX128_SUPPORTED && !_XM_NO_INTRINSICS_ +#define _XM_VMX128_INTRINSICS_ +#elif !defined(_XM_NO_INTRINSICS_) +#error xnamath.h does not support this target +#endif + + +#if defined(_XM_SSE_INTRINSICS_) +#ifndef _XM_NO_INTRINSICS_ +#include +#include +#endif +#elif defined(_XM_VMX128_INTRINSICS_) +#error This version of xnamath.h does not support Xbox 360 +#endif + +#if defined(_XM_SSE_INTRINSICS_) +#pragma warning(push) +#pragma warning(disable:4985) +#endif +#include +#if defined(_XM_SSE_INTRINSICS_) +#pragma warning(pop) +#endif + + +#include + + +#if !defined(XMINLINE) +#if !defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#define XMINLINE __inline +#else +#define XMINLINE __forceinline +#endif +#endif + +#if !defined(XMFINLINE) +#define XMFINLINE __forceinline +#endif + +#if !defined(XMDEBUG) +#if defined(_DEBUG) +#define XMDEBUG +#endif +#endif // !XMDEBUG + +#if !defined(XMASSERT) +#if defined(_PREFAST_) +#define XMASSERT(Expression) __analysis_assume((Expression)) +#elif defined(XMDEBUG) // !_PREFAST_ +#define XMASSERT(Expression) ((VOID)((Expression) || (XMAssert(#Expression, __FILE__, __LINE__), 0))) +#else // !XMDEBUG +#define XMASSERT(Expression) ((VOID)0) +#endif // !XMDEBUG +#endif // !XMASSERT + +#if !defined(XM_NO_ALIGNMENT) +#define _DECLSPEC_ALIGN_16_ __declspec(align(16)) +#else +#define _DECLSPEC_ALIGN_16_ +#endif + + +#if defined(_MSC_VER) && (_MSC_VER<1500) && (_MSC_VER>=1400) +#define _XM_ISVS2005_ +#endif + +/**************************************************************************** + * + * Constant definitions + * + ****************************************************************************/ + +#define XM_PI 3.141592654f +#define XM_2PI 6.283185307f +#define XM_1DIVPI 0.318309886f +#define XM_1DIV2PI 0.159154943f +#define XM_PIDIV2 1.570796327f +#define XM_PIDIV4 0.785398163f + +#define XM_SELECT_0 0x00000000 +#define XM_SELECT_1 0xFFFFFFFF + +#define XM_PERMUTE_0X 
0x00010203 +#define XM_PERMUTE_0Y 0x04050607 +#define XM_PERMUTE_0Z 0x08090A0B +#define XM_PERMUTE_0W 0x0C0D0E0F +#define XM_PERMUTE_1X 0x10111213 +#define XM_PERMUTE_1Y 0x14151617 +#define XM_PERMUTE_1Z 0x18191A1B +#define XM_PERMUTE_1W 0x1C1D1E1F + +#define XM_CRMASK_CR6 0x000000F0 +#define XM_CRMASK_CR6TRUE 0x00000080 +#define XM_CRMASK_CR6FALSE 0x00000020 +#define XM_CRMASK_CR6BOUNDS XM_CRMASK_CR6FALSE + + +#define XM_CACHE_LINE_SIZE 64 + +/**************************************************************************** + * + * Macros + * + ****************************************************************************/ + +// Unit conversion + +XMFINLINE FLOAT XMConvertToRadians(FLOAT fDegrees) { return fDegrees * (XM_PI / 180.0f); } +XMFINLINE FLOAT XMConvertToDegrees(FLOAT fRadians) { return fRadians * (180.0f / XM_PI); } + +// Condition register evaluation proceeding a recording (Rc) comparison + +#define XMComparisonAllTrue(CR) (((CR) & XM_CRMASK_CR6TRUE) == XM_CRMASK_CR6TRUE) +#define XMComparisonAnyTrue(CR) (((CR) & XM_CRMASK_CR6FALSE) != XM_CRMASK_CR6FALSE) +#define XMComparisonAllFalse(CR) (((CR) & XM_CRMASK_CR6FALSE) == XM_CRMASK_CR6FALSE) +#define XMComparisonAnyFalse(CR) (((CR) & XM_CRMASK_CR6TRUE) != XM_CRMASK_CR6TRUE) +#define XMComparisonMixed(CR) (((CR) & XM_CRMASK_CR6) == 0) +#define XMComparisonAllInBounds(CR) (((CR) & XM_CRMASK_CR6BOUNDS) == XM_CRMASK_CR6BOUNDS) +#define XMComparisonAnyOutOfBounds(CR) (((CR) & XM_CRMASK_CR6BOUNDS) != XM_CRMASK_CR6BOUNDS) + + +#define XMMin(a, b) (((a) < (b)) ? (a) : (b)) +#define XMMax(a, b) (((a) > (b)) ? (a) : (b)) + +/**************************************************************************** + * + * Data types + * + ****************************************************************************/ + +#pragma warning(push) +#pragma warning(disable:4201 4365 4324) + +#ifdef _XM_BIGENDIAN_ +#pragma bitfield_order(push) +#pragma bitfield_order(lsb_to_msb) +#endif + +//------------------------------------------------------------------------------ +#if defined(_XM_NO_INTRINSICS_) && !defined(_XBOX_VER) +// The __vector4 structure is an intrinsic on Xbox but must be separately defined +// for x86/x64 +typedef struct __vector4 +{ + union + { + float vector4_f32[4]; + unsigned int vector4_u32[4]; +#ifndef XM_STRICT_VECTOR4 + struct + { + FLOAT x; + FLOAT y; + FLOAT z; + FLOAT w; + }; + FLOAT v[4]; + UINT u[4]; +#endif // !XM_STRICT_VECTOR4 + }; +} __vector4; +#endif // _XM_NO_INTRINSICS_ + +//------------------------------------------------------------------------------ +#if (defined (_XM_X86_) || defined(_XM_X64_)) && defined(_XM_NO_INTRINSICS_) +typedef UINT __vector4i[4]; +#else +typedef __declspec(align(16)) UINT __vector4i[4]; +#endif + +//------------------------------------------------------------------------------ +// Vector intrinsic: Four 32 bit floating point components aligned on a 16 byte +// boundary and mapped to hardware vector registers +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +typedef __m128 XMVECTOR; +#else +typedef __vector4 XMVECTOR; +#endif + +// Fix-up for (1st-3rd) XMVECTOR parameters that are pass-in-register for x86 and Xbox 360, but not for other targets +#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +typedef const XMVECTOR FXMVECTOR; +#elif defined(_XM_X86_) && !defined(_XM_NO_INTRINSICS_) +typedef const XMVECTOR FXMVECTOR; +#elif defined(__cplusplus) +typedef const XMVECTOR& FXMVECTOR; +#else +typedef const XMVECTOR FXMVECTOR; +#endif + +// Fix-up for (4th+) XMVECTOR 
parameters to pass in-register for Xbox 360 and by reference otherwise +#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +typedef const XMVECTOR CXMVECTOR; +#elif defined(__cplusplus) +typedef const XMVECTOR& CXMVECTOR; +#else +typedef const XMVECTOR CXMVECTOR; +#endif + +//------------------------------------------------------------------------------ +// Conversion types for constants +typedef _DECLSPEC_ALIGN_16_ struct XMVECTORF32 { + union { + float f[4]; + XMVECTOR v; + }; + +#if defined(__cplusplus) + inline operator XMVECTOR() const { return v; } + inline operator const float*() const { return f; } +#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const { return reinterpret_cast(&v)[0]; } + inline operator __m128d() const { return reinterpret_cast(&v)[0]; } +#endif +#endif // __cplusplus +} XMVECTORF32; + +typedef _DECLSPEC_ALIGN_16_ struct XMVECTORI32 { + union { + INT i[4]; + XMVECTOR v; + }; +#if defined(__cplusplus) + inline operator XMVECTOR() const { return v; } +#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const { return reinterpret_cast(&v)[0]; } + inline operator __m128d() const { return reinterpret_cast(&v)[0]; } +#endif +#endif // __cplusplus +} XMVECTORI32; + +typedef _DECLSPEC_ALIGN_16_ struct XMVECTORU8 { + union { + BYTE u[16]; + XMVECTOR v; + }; +#if defined(__cplusplus) + inline operator XMVECTOR() const { return v; } +#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const { return reinterpret_cast(&v)[0]; } + inline operator __m128d() const { return reinterpret_cast(&v)[0]; } +#endif +#endif // __cplusplus +} XMVECTORU8; + +typedef _DECLSPEC_ALIGN_16_ struct XMVECTORU32 { + union { + UINT u[4]; + XMVECTOR v; + }; +#if defined(__cplusplus) + inline operator XMVECTOR() const { return v; } +#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const { return reinterpret_cast(&v)[0]; } + inline operator __m128d() const { return reinterpret_cast(&v)[0]; } +#endif +#endif // __cplusplus +} XMVECTORU32; + +//------------------------------------------------------------------------------ +// Vector operators +#if defined(__cplusplus) && !defined(XM_NO_OPERATOR_OVERLOADS) + +XMVECTOR operator+ (FXMVECTOR V); +XMVECTOR operator- (FXMVECTOR V); + +XMVECTOR& operator+= (XMVECTOR& V1, FXMVECTOR V2); +XMVECTOR& operator-= (XMVECTOR& V1, FXMVECTOR V2); +XMVECTOR& operator*= (XMVECTOR& V1, FXMVECTOR V2); +XMVECTOR& operator/= (XMVECTOR& V1, FXMVECTOR V2); +XMVECTOR& operator*= (XMVECTOR& V, FLOAT S); +XMVECTOR& operator/= (XMVECTOR& V, FLOAT S); + +XMVECTOR operator+ (FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR operator- (FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR operator* (FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR operator/ (FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR operator* (FXMVECTOR V, FLOAT S); +XMVECTOR operator* (FLOAT S, FXMVECTOR V); +XMVECTOR operator/ (FXMVECTOR V, FLOAT S); + +#endif // __cplusplus && !XM_NO_OPERATOR_OVERLOADS + +//------------------------------------------------------------------------------ +// Matrix type: Sixteen 32 bit floating point components aligned on a +// 16 byte boundary and mapped to four hardware vector registers +#if (defined(_XM_X86_) || defined(_XM_X64_)) && defined(_XM_NO_INTRINSICS_) +typedef struct _XMMATRIX +#else +typedef _DECLSPEC_ALIGN_16_ struct _XMMATRIX +#endif +{ +#if defined(_XM_NO_INTRINSICS_) || !defined(XM_STRICT_MATRIX) + union + { + XMVECTOR 
r[4]; + struct + { + FLOAT _11, _12, _13, _14; + FLOAT _21, _22, _23, _24; + FLOAT _31, _32, _33, _34; + FLOAT _41, _42, _43, _44; + }; + FLOAT m[4][4]; + }; +#else + XMVECTOR r[4]; +#endif + +#ifdef __cplusplus + + _XMMATRIX() {}; + _XMMATRIX(FXMVECTOR R0, FXMVECTOR R1, FXMVECTOR R2, CXMVECTOR R3); + _XMMATRIX(FLOAT m00, FLOAT m01, FLOAT m02, FLOAT m03, + FLOAT m10, FLOAT m11, FLOAT m12, FLOAT m13, + FLOAT m20, FLOAT m21, FLOAT m22, FLOAT m23, + FLOAT m30, FLOAT m31, FLOAT m32, FLOAT m33); + explicit _XMMATRIX(_In_count_c_(16) CONST FLOAT *pArray); + +#if defined(_XM_NO_INTRINSICS_) || !defined(XM_STRICT_MATRIX) + FLOAT operator() (UINT Row, UINT Column) CONST { return m[Row][Column]; } + FLOAT& operator() (UINT Row, UINT Column) { return m[Row][Column]; } +#endif + + _XMMATRIX& operator= (CONST _XMMATRIX& M); + +#ifndef XM_NO_OPERATOR_OVERLOADS + _XMMATRIX& operator*= (CONST _XMMATRIX& M); + _XMMATRIX operator* (CONST _XMMATRIX& M) CONST; +#endif // !XM_NO_OPERATOR_OVERLOADS + +#endif // __cplusplus + +} XMMATRIX; + +// Fix-up for XMMATRIX parameters to pass in-register on Xbox 360, by reference otherwise +#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +typedef const XMMATRIX CXMMATRIX; +#elif defined(__cplusplus) +typedef const XMMATRIX& CXMMATRIX; +#else +typedef const XMMATRIX CXMMATRIX; +#endif + +//------------------------------------------------------------------------------ +// 16 bit floating point number consisting of a sign bit, a 5 bit biased +// exponent, and a 10 bit mantissa +typedef USHORT HALF; + +//------------------------------------------------------------------------------ +// 2D Vector; 32 bit floating point components +typedef struct _XMFLOAT2 +{ + FLOAT x; + FLOAT y; + +#ifdef __cplusplus + + _XMFLOAT2() {}; + _XMFLOAT2(FLOAT _x, FLOAT _y) : x(_x), y(_y) {}; + _XMFLOAT2(_In_count_c_(2) CONST FLOAT *pArray); + + _XMFLOAT2& operator= (CONST _XMFLOAT2& Float2); + +#endif // __cplusplus + +} XMFLOAT2; + +// 2D Vector; 32 bit floating point components aligned on a 16 byte boundary +#ifdef __cplusplus +__declspec(align(16)) struct XMFLOAT2A : public XMFLOAT2 +{ + XMFLOAT2A() : XMFLOAT2() {}; + XMFLOAT2A(FLOAT _x, FLOAT _y) : XMFLOAT2(_x, _y) {}; + XMFLOAT2A(_In_count_c_(2) CONST FLOAT *pArray) : XMFLOAT2(pArray) {}; + + XMFLOAT2A& operator= (CONST XMFLOAT2A& Float2); +}; +#else +typedef __declspec(align(16)) XMFLOAT2 XMFLOAT2A; +#endif // __cplusplus + +//------------------------------------------------------------------------------ +// 2D Vector; 32 bit signed integer components +typedef struct _XMINT2 +{ + INT x; + INT y; + +#ifdef __cplusplus + + _XMINT2() {}; + _XMINT2(INT _x, INT _y) : x(_x), y(_y) {}; + explicit _XMINT2(_In_count_c_(2) CONST INT *pArray); + + _XMINT2& operator= (CONST _XMINT2& Int2); + +#endif // __cplusplus + +} XMINT2; + +// 2D Vector; 32 bit unsigned integer components +typedef struct _XMUINT2 +{ + UINT x; + UINT y; + +#ifdef __cplusplus + + _XMUINT2() {}; + _XMUINT2(UINT _x, UINT _y) : x(_x), y(_y) {}; + explicit _XMUINT2(_In_count_c_(2) CONST UINT *pArray); + + _XMUINT2& operator= (CONST _XMUINT2& UInt2); + +#endif // __cplusplus + +} XMUINT2; + +//------------------------------------------------------------------------------ +// 2D Vector; 16 bit floating point components +typedef struct _XMHALF2 +{ + HALF x; + HALF y; + +#ifdef __cplusplus + + _XMHALF2() {}; + _XMHALF2(HALF _x, HALF _y) : x(_x), y(_y) {}; + explicit _XMHALF2(_In_count_c_(2) CONST HALF *pArray); + _XMHALF2(FLOAT _x, FLOAT _y); + explicit 
_XMHALF2(_In_count_c_(2) CONST FLOAT *pArray); + + _XMHALF2& operator= (CONST _XMHALF2& Half2); + +#endif // __cplusplus + +} XMHALF2; + +//------------------------------------------------------------------------------ +// 2D Vector; 16 bit signed normalized integer components +typedef struct _XMSHORTN2 +{ + SHORT x; + SHORT y; + +#ifdef __cplusplus + + _XMSHORTN2() {}; + _XMSHORTN2(SHORT _x, SHORT _y) : x(_x), y(_y) {}; + explicit _XMSHORTN2(_In_count_c_(2) CONST SHORT *pArray); + _XMSHORTN2(FLOAT _x, FLOAT _y); + explicit _XMSHORTN2(_In_count_c_(2) CONST FLOAT *pArray); + + _XMSHORTN2& operator= (CONST _XMSHORTN2& ShortN2); + +#endif // __cplusplus + +} XMSHORTN2; + +// 2D Vector; 16 bit signed integer components +typedef struct _XMSHORT2 +{ + SHORT x; + SHORT y; + +#ifdef __cplusplus + + _XMSHORT2() {}; + _XMSHORT2(SHORT _x, SHORT _y) : x(_x), y(_y) {}; + explicit _XMSHORT2(_In_count_c_(2) CONST SHORT *pArray); + _XMSHORT2(FLOAT _x, FLOAT _y); + explicit _XMSHORT2(_In_count_c_(2) CONST FLOAT *pArray); + + _XMSHORT2& operator= (CONST _XMSHORT2& Short2); + +#endif // __cplusplus + +} XMSHORT2; + +// 2D Vector; 16 bit unsigned normalized integer components +typedef struct _XMUSHORTN2 +{ + USHORT x; + USHORT y; + +#ifdef __cplusplus + + _XMUSHORTN2() {}; + _XMUSHORTN2(USHORT _x, USHORT _y) : x(_x), y(_y) {}; + explicit _XMUSHORTN2(_In_count_c_(2) CONST USHORT *pArray); + _XMUSHORTN2(FLOAT _x, FLOAT _y); + explicit _XMUSHORTN2(_In_count_c_(2) CONST FLOAT *pArray); + + _XMUSHORTN2& operator= (CONST _XMUSHORTN2& UShortN2); + +#endif // __cplusplus + +} XMUSHORTN2; + +// 2D Vector; 16 bit unsigned integer components +typedef struct _XMUSHORT2 +{ + USHORT x; + USHORT y; + +#ifdef __cplusplus + + _XMUSHORT2() {}; + _XMUSHORT2(USHORT _x, USHORT _y) : x(_x), y(_y) {}; + explicit _XMUSHORT2(_In_count_c_(2) CONST USHORT *pArray); + _XMUSHORT2(FLOAT _x, FLOAT _y); + explicit _XMUSHORT2(_In_count_c_(2) CONST FLOAT *pArray); + + _XMUSHORT2& operator= (CONST _XMUSHORT2& UShort2); + +#endif // __cplusplus + +} XMUSHORT2; + +//------------------------------------------------------------------------------ +// 2D Vector; 8 bit signed normalized integer components +typedef struct _XMBYTEN2 +{ + CHAR x; + CHAR y; + +#ifdef __cplusplus + + _XMBYTEN2() {}; + _XMBYTEN2(CHAR _x, CHAR _y) : x(_x), y(_y) {}; + explicit _XMBYTEN2(_In_count_c_(2) CONST CHAR *pArray); + _XMBYTEN2(FLOAT _x, FLOAT _y); + explicit _XMBYTEN2(_In_count_c_(2) CONST FLOAT *pArray); + + _XMBYTEN2& operator= (CONST _XMBYTEN2& ByteN2); + +#endif // __cplusplus + +} XMBYTEN2; + +// 2D Vector; 8 bit signed integer components +typedef struct _XMBYTE2 +{ + CHAR x; + CHAR y; + +#ifdef __cplusplus + + _XMBYTE2() {}; + _XMBYTE2(CHAR _x, CHAR _y) : x(_x), y(_y) {}; + explicit _XMBYTE2(_In_count_c_(2) CONST CHAR *pArray); + _XMBYTE2(FLOAT _x, FLOAT _y); + explicit _XMBYTE2(_In_count_c_(2) CONST FLOAT *pArray); + + _XMBYTE2& operator= (CONST _XMBYTE2& Byte2); + +#endif // __cplusplus + +} XMBYTE2; + +// 2D Vector; 8 bit unsigned normalized integer components +typedef struct _XMUBYTEN2 +{ + BYTE x; + BYTE y; + +#ifdef __cplusplus + + _XMUBYTEN2() {}; + _XMUBYTEN2(BYTE _x, BYTE _y) : x(_x), y(_y) {}; + explicit _XMUBYTEN2(_In_count_c_(2) CONST BYTE *pArray); + _XMUBYTEN2(FLOAT _x, FLOAT _y); + explicit _XMUBYTEN2(_In_count_c_(2) CONST FLOAT *pArray); + + _XMUBYTEN2& operator= (CONST _XMUBYTEN2& UByteN2); + +#endif // __cplusplus + +} XMUBYTEN2; + + +// 2D Vector; 8 bit unsigned integer components +typedef struct _XMUBYTE2 +{ + BYTE x; + BYTE y; + +#ifdef 
__cplusplus + + _XMUBYTE2() {}; + _XMUBYTE2(BYTE _x, BYTE _y) : x(_x), y(_y) {}; + explicit _XMUBYTE2(_In_count_c_(2) CONST BYTE *pArray); + _XMUBYTE2(FLOAT _x, FLOAT _y); + explicit _XMUBYTE2(_In_count_c_(2) CONST FLOAT *pArray); + + _XMUBYTE2& operator= (CONST _XMUBYTE2& UByte2); + +#endif // __cplusplus + +} XMUBYTE2; + +//------------------------------------------------------------------------------ +// 3D Vector; 32 bit floating point components +typedef struct _XMFLOAT3 +{ + FLOAT x; + FLOAT y; + FLOAT z; + +#ifdef __cplusplus + + _XMFLOAT3() {}; + _XMFLOAT3(FLOAT _x, FLOAT _y, FLOAT _z) : x(_x), y(_y), z(_z) {}; + _XMFLOAT3(_In_count_c_(3) CONST FLOAT *pArray); + + _XMFLOAT3& operator= (CONST _XMFLOAT3& Float3); + +#endif // __cplusplus + +} XMFLOAT3; + +// 3D Vector; 32 bit floating point components aligned on a 16 byte boundary +#ifdef __cplusplus +__declspec(align(16)) struct XMFLOAT3A : public XMFLOAT3 +{ + XMFLOAT3A() : XMFLOAT3() {}; + XMFLOAT3A(FLOAT _x, FLOAT _y, FLOAT _z) : XMFLOAT3(_x, _y, _z) {}; + XMFLOAT3A(_In_count_c_(3) CONST FLOAT *pArray) : XMFLOAT3(pArray) {}; + + XMFLOAT3A& operator= (CONST XMFLOAT3A& Float3); +}; +#else +typedef __declspec(align(16)) XMFLOAT3 XMFLOAT3A; +#endif // __cplusplus + +//------------------------------------------------------------------------------ +// 3D Vector; 32 bit signed integer components +typedef struct _XMINT3 +{ + INT x; + INT y; + INT z; + +#ifdef __cplusplus + + _XMINT3() {}; + _XMINT3(INT _x, INT _y, INT _z) : x(_x), y(_y), z(_z) {}; + explicit _XMINT3(_In_count_c_(3) CONST INT *pArray); + + _XMINT3& operator= (CONST _XMINT3& Int3); + +#endif // __cplusplus + +} XMINT3; + +// 3D Vector; 32 bit unsigned integer components +typedef struct _XMUINT3 +{ + UINT x; + UINT y; + UINT z; + +#ifdef __cplusplus + + _XMUINT3() {}; + _XMUINT3(UINT _x, UINT _y, UINT _z) : x(_x), y(_y), z(_z) {}; + explicit _XMUINT3(_In_count_c_(3) CONST UINT *pArray); + + _XMUINT3& operator= (CONST _XMUINT3& UInt3); + +#endif // __cplusplus + +} XMUINT3; + +//------------------------------------------------------------------------------ +// 3D Vector; 11-11-10 bit normalized components packed into a 32 bit integer +// The normalized 3D Vector is packed into 32 bits as follows: a 10 bit signed, +// normalized integer for the z component and 11 bit signed, normalized +// integers for the x and y components. The z component is stored in the +// most significant bits and the x component in the least significant bits +// (Z10Y11X11): [32] zzzzzzzz zzyyyyyy yyyyyxxx xxxxxxxx [0] +typedef struct _XMHENDN3 +{ + union + { + struct + { + INT x : 11; // -1023/1023 to 1023/1023 + INT y : 11; // -1023/1023 to 1023/1023 + INT z : 10; // -511/511 to 511/511 + }; + UINT v; + }; + +#ifdef __cplusplus + + _XMHENDN3() {}; + explicit _XMHENDN3(UINT Packed) : v(Packed) {}; + _XMHENDN3(FLOAT _x, FLOAT _y, FLOAT _z); + explicit _XMHENDN3(_In_count_c_(3) CONST FLOAT *pArray); + + operator UINT () const { return v; } + + _XMHENDN3& operator= (CONST _XMHENDN3& HenDN3); + _XMHENDN3& operator= (CONST UINT Packed); + +#endif // __cplusplus + +} XMHENDN3; + +// 3D Vector; 11-11-10 bit components packed into a 32 bit integer +// The 3D Vector is packed into 32 bits as follows: a 10 bit signed, +// integer for the z component and 11 bit signed integers for the +// x and y components. 
The z component is stored in the +// most significant bits and the x component in the least significant bits +// (Z10Y11X11): [32] zzzzzzzz zzyyyyyy yyyyyxxx xxxxxxxx [0] +typedef struct _XMHEND3 +{ + union + { + struct + { + INT x : 11; // -1023 to 1023 + INT y : 11; // -1023 to 1023 + INT z : 10; // -511 to 511 + }; + UINT v; + }; + +#ifdef __cplusplus + + _XMHEND3() {}; + explicit _XMHEND3(UINT Packed) : v(Packed) {}; + _XMHEND3(FLOAT _x, FLOAT _y, FLOAT _z); + explicit _XMHEND3(_In_count_c_(3) CONST FLOAT *pArray); + + operator UINT () const { return v; } + + _XMHEND3& operator= (CONST _XMHEND3& HenD3); + _XMHEND3& operator= (CONST UINT Packed); + +#endif // __cplusplus + +} XMHEND3; + +// 3D Vector; 11-11-10 bit normalized components packed into a 32 bit integer +// The normalized 3D Vector is packed into 32 bits as follows: a 10 bit unsigned, +// normalized integer for the z component and 11 bit unsigned, normalized +// integers for the x and y components. The z component is stored in the +// most significant bits and the x component in the least significant bits +// (Z10Y11X11): [32] zzzzzzzz zzyyyyyy yyyyyxxx xxxxxxxx [0] +typedef struct _XMUHENDN3 +{ + union + { + struct + { + UINT x : 11; // 0/2047 to 2047/2047 + UINT y : 11; // 0/2047 to 2047/2047 + UINT z : 10; // 0/1023 to 1023/1023 + }; + UINT v; + }; + +#ifdef __cplusplus + + _XMUHENDN3() {}; + explicit _XMUHENDN3(UINT Packed) : v(Packed) {}; + _XMUHENDN3(FLOAT _x, FLOAT _y, FLOAT _z); + explicit _XMUHENDN3(_In_count_c_(3) CONST FLOAT *pArray); + + operator UINT () const { return v; } + + _XMUHENDN3& operator= (CONST _XMUHENDN3& UHenDN3); + _XMUHENDN3& operator= (CONST UINT Packed); + +#endif // __cplusplus + +} XMUHENDN3; + +// 3D Vector; 11-11-10 bit components packed into a 32 bit integer +// The 3D Vector is packed into 32 bits as follows: a 10 bit unsigned +// integer for the z component and 11 bit unsigned integers +// for the x and y components. The z component is stored in the +// most significant bits and the x component in the least significant bits +// (Z10Y11X11): [32] zzzzzzzz zzyyyyyy yyyyyxxx xxxxxxxx [0] +typedef struct _XMUHEND3 +{ + union + { + struct + { + UINT x : 11; // 0 to 2047 + UINT y : 11; // 0 to 2047 + UINT z : 10; // 0 to 1023 + }; + UINT v; + }; + +#ifdef __cplusplus + + _XMUHEND3() {}; + explicit _XMUHEND3(UINT Packed) : v(Packed) {}; + _XMUHEND3(FLOAT _x, FLOAT _y, FLOAT _z); + explicit _XMUHEND3(_In_count_c_(3) CONST FLOAT *pArray); + + operator UINT () const { return v; } + + _XMUHEND3& operator= (CONST _XMUHEND3& UHenD3); + _XMUHEND3& operator= (CONST UINT Packed); + +#endif // __cplusplus + +} XMUHEND3; + +// 3D Vector; 10-11-11 bit normalized components packed into a 32 bit integer +// The normalized 3D Vector is packed into 32 bits as follows: a 10 bit signed, +// normalized integer for the x component and 11 bit signed, normalized +// integers for the y and z components. 
The z component is stored in the +// most significant bits and the x component in the least significant bits +// (Z11Y11X10): [32] zzzzzzzz zzzyyyyy yyyyyyxx xxxxxxxx [0] +typedef struct _XMDHENN3 +{ + union + { + struct + { + INT x : 10; // -511/511 to 511/511 + INT y : 11; // -1023/1023 to 1023/1023 + INT z : 11; // -1023/1023 to 1023/1023 + }; + UINT v; + }; + +#ifdef __cplusplus + + _XMDHENN3() {}; + explicit _XMDHENN3(UINT Packed) : v(Packed) {}; + _XMDHENN3(FLOAT _x, FLOAT _y, FLOAT _z); + explicit _XMDHENN3(_In_count_c_(3) CONST FLOAT *pArray); + + operator UINT () const { return v; } + + _XMDHENN3& operator= (CONST _XMDHENN3& DHenN3); + _XMDHENN3& operator= (CONST UINT Packed); + +#endif // __cplusplus + +} XMDHENN3; + +// 3D Vector; 10-11-11 bit components packed into a 32 bit integer +// The 3D Vector is packed into 32 bits as follows: a 10 bit signed, +// integer for the x component and 11 bit signed integers for the +// y and z components. The w component is stored in the +// most significant bits and the x component in the least significant bits +// (Z11Y11X10): [32] zzzzzzzz zzzyyyyy yyyyyyxx xxxxxxxx [0] +typedef struct _XMDHEN3 +{ + union + { + struct + { + INT x : 10; // -511 to 511 + INT y : 11; // -1023 to 1023 + INT z : 11; // -1023 to 1023 + }; + UINT v; + }; + +#ifdef __cplusplus + + _XMDHEN3() {}; + explicit _XMDHEN3(UINT Packed) : v(Packed) {}; + _XMDHEN3(FLOAT _x, FLOAT _y, FLOAT _z); + explicit _XMDHEN3(_In_count_c_(3) CONST FLOAT *pArray); + + operator UINT () const { return v; } + + _XMDHEN3& operator= (CONST _XMDHEN3& DHen3); + _XMDHEN3& operator= (CONST UINT Packed); + +#endif // __cplusplus + +} XMDHEN3; + +// 3D Vector; 10-11-11 bit normalized components packed into a 32 bit integer +// The normalized 3D Vector is packed into 32 bits as follows: a 10 bit unsigned, +// normalized integer for the x component and 11 bit unsigned, normalized +// integers for the y and z components. The w component is stored in the +// most significant bits and the x component in the least significant bits +// (Z11Y11X10): [32] zzzzzzzz zzzyyyyy yyyyyyxx xxxxxxxx [0] +typedef struct _XMUDHENN3 +{ + union + { + struct + { + UINT x : 10; // 0/1023 to 1023/1023 + UINT y : 11; // 0/2047 to 2047/2047 + UINT z : 11; // 0/2047 to 2047/2047 + }; + UINT v; + }; + +#ifdef __cplusplus + + _XMUDHENN3() {}; + explicit _XMUDHENN3(UINT Packed) : v(Packed) {}; + _XMUDHENN3(FLOAT _x, FLOAT _y, FLOAT _z); + explicit _XMUDHENN3(_In_count_c_(3) CONST FLOAT *pArray); + + operator UINT () const { return v; } + + _XMUDHENN3& operator= (CONST _XMUDHENN3& UDHenN3); + _XMUDHENN3& operator= (CONST UINT Packed); + +#endif // __cplusplus + +} XMUDHENN3; + +// 3D Vector; 10-11-11 bit components packed into a 32 bit integer +// The 3D Vector is packed into 32 bits as follows: a 10 bit unsigned, +// integer for the x component and 11 bit unsigned integers +// for the y and z components. 
The w component is stored in the +// most significant bits and the x component in the least significant bits +// (Z11Y11X10): [32] zzzzzzzz zzzyyyyy yyyyyyxx xxxxxxxx [0] +typedef struct _XMUDHEN3 +{ + union + { + struct + { + UINT x : 10; // 0 to 1023 + UINT y : 11; // 0 to 2047 + UINT z : 11; // 0 to 2047 + }; + UINT v; + }; + +#ifdef __cplusplus + + _XMUDHEN3() {}; + explicit _XMUDHEN3(UINT Packed) : v(Packed) {}; + _XMUDHEN3(FLOAT _x, FLOAT _y, FLOAT _z); + explicit _XMUDHEN3(_In_count_c_(3) CONST FLOAT *pArray); + + operator UINT () const { return v; } + + _XMUDHEN3& operator= (CONST _XMUDHEN3& UDHen3); + _XMUDHEN3& operator= (CONST UINT Packed); + +#endif // __cplusplus + +} XMUDHEN3; + +//------------------------------------------------------------------------------ +// 3D vector: 5/6/5 unsigned integer components +typedef struct _XMU565 +{ + union + { + struct + { + USHORT x : 5; + USHORT y : 6; + USHORT z : 5; + }; + USHORT v; + }; + +#ifdef __cplusplus + + _XMU565() {}; + explicit _XMU565(USHORT Packed) : v(Packed) {}; + _XMU565(CHAR _x, CHAR _y, CHAR _z) : x(_x), y(_y), z(_z) {}; + explicit _XMU565(_In_count_c_(3) CONST CHAR *pArray); + _XMU565(FLOAT _x, FLOAT _y, FLOAT _z); + explicit _XMU565(_In_count_c_(3) CONST FLOAT *pArray); + + operator USHORT () const { return v; } + + _XMU565& operator= (CONST _XMU565& U565); + _XMU565& operator= (CONST USHORT Packed); + +#endif // __cplusplus + +} XMU565; + +//------------------------------------------------------------------------------ +// 3D vector: 11/11/10 floating-point components +// The 3D vector is packed into 32 bits as follows: a 5-bit biased exponent +// and 6-bit mantissa for x component, a 5-bit biased exponent and +// 6-bit mantissa for y component, a 5-bit biased exponent and a 5-bit +// mantissa for z. The z component is stored in the most significant bits +// and the x component in the least significant bits. No sign bits so +// all partial-precision numbers are positive. +// (Z10Y11X11): [32] ZZZZZzzz zzzYYYYY yyyyyyXX XXXxxxxx [0] +typedef struct _XMFLOAT3PK +{ + union + { + struct + { + UINT xm : 6; + UINT xe : 5; + UINT ym : 6; + UINT ye : 5; + UINT zm : 5; + UINT ze : 5; + }; + UINT v; + }; + +#ifdef __cplusplus + + _XMFLOAT3PK() {}; + explicit _XMFLOAT3PK(UINT Packed) : v(Packed) {}; + _XMFLOAT3PK(FLOAT _x, FLOAT _y, FLOAT _z); + explicit _XMFLOAT3PK(_In_count_c_(3) CONST FLOAT *pArray); + + operator UINT () const { return v; } + + _XMFLOAT3PK& operator= (CONST _XMFLOAT3PK& float3pk); + _XMFLOAT3PK& operator= (CONST UINT Packed); + +#endif // __cplusplus + +} XMFLOAT3PK; + +//------------------------------------------------------------------------------ +// 3D vector: 9/9/9 floating-point components with shared 5-bit exponent +// The 3D vector is packed into 32 bits as follows: a 5-bit biased exponent +// with 9-bit mantissa for the x, y, and z component. The shared exponent +// is stored in the most significant bits and the x component mantissa is in +// the least significant bits. No sign bits so all partial-precision numbers +// are positive. 
+// (E5Z9Y9X9): [32] EEEEEzzz zzzzzzyy yyyyyyyx xxxxxxxx [0] +typedef struct _XMFLOAT3SE +{ + union + { + struct + { + UINT xm : 9; + UINT ym : 9; + UINT zm : 9; + UINT e : 5; + }; + UINT v; + }; + +#ifdef __cplusplus + + _XMFLOAT3SE() {}; + explicit _XMFLOAT3SE(UINT Packed) : v(Packed) {}; + _XMFLOAT3SE(FLOAT _x, FLOAT _y, FLOAT _z); + explicit _XMFLOAT3SE(_In_count_c_(3) CONST FLOAT *pArray); + + operator UINT () const { return v; } + + _XMFLOAT3SE& operator= (CONST _XMFLOAT3SE& float3se); + _XMFLOAT3SE& operator= (CONST UINT Packed); + +#endif // __cplusplus + +} XMFLOAT3SE; + +//------------------------------------------------------------------------------ +// 4D Vector; 32 bit floating point components +typedef struct _XMFLOAT4 +{ + FLOAT x; + FLOAT y; + FLOAT z; + FLOAT w; + +#ifdef __cplusplus + + _XMFLOAT4() {}; + _XMFLOAT4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w) : x(_x), y(_y), z(_z), w(_w) {}; + _XMFLOAT4(_In_count_c_(4) CONST FLOAT *pArray); + + _XMFLOAT4& operator= (CONST _XMFLOAT4& Float4); + +#endif // __cplusplus + +} XMFLOAT4; + +// 4D Vector; 32 bit floating point components aligned on a 16 byte boundary +#ifdef __cplusplus +__declspec(align(16)) struct XMFLOAT4A : public XMFLOAT4 +{ + XMFLOAT4A() : XMFLOAT4() {}; + XMFLOAT4A(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w) : XMFLOAT4(_x, _y, _z, _w) {}; + XMFLOAT4A(_In_count_c_(4) CONST FLOAT *pArray) : XMFLOAT4(pArray) {}; + + XMFLOAT4A& operator= (CONST XMFLOAT4A& Float4); +}; +#else +typedef __declspec(align(16)) XMFLOAT4 XMFLOAT4A; +#endif // __cplusplus + +//------------------------------------------------------------------------------ +// 4D Vector; 32 bit signed integer components +typedef struct _XMINT4 +{ + INT x; + INT y; + INT z; + INT w; + +#ifdef __cplusplus + + _XMINT4() {}; + _XMINT4(INT _x, INT _y, INT _z, INT _w) : x(_x), y(_y), z(_z), w(_w) {}; + explicit _XMINT4(_In_count_c_(4) CONST INT *pArray); + + _XMINT4& operator= (CONST _XMINT4& Int4); + +#endif // __cplusplus + +} XMINT4; + +// 4D Vector; 32 bit unsigned integer components +typedef struct _XMUINT4 +{ + UINT x; + UINT y; + UINT z; + UINT w; + +#ifdef __cplusplus + + _XMUINT4() {}; + _XMUINT4(UINT _x, UINT _y, UINT _z, UINT _w) : x(_x), y(_y), z(_z), w(_w) {}; + explicit _XMUINT4(_In_count_c_(4) CONST UINT *pArray); + + _XMUINT4& operator= (CONST _XMUINT4& UInt4); + +#endif // __cplusplus + +} XMUINT4; + +//------------------------------------------------------------------------------ +// 4D Vector; 16 bit floating point components +typedef struct _XMHALF4 +{ + HALF x; + HALF y; + HALF z; + HALF w; + +#ifdef __cplusplus + + _XMHALF4() {}; + _XMHALF4(HALF _x, HALF _y, HALF _z, HALF _w) : x(_x), y(_y), z(_z), w(_w) {}; + explicit _XMHALF4(_In_count_c_(4) CONST HALF *pArray); + _XMHALF4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w); + explicit _XMHALF4(_In_count_c_(4) CONST FLOAT *pArray); + + _XMHALF4& operator= (CONST _XMHALF4& Half4); + +#endif // __cplusplus + +} XMHALF4; + +//------------------------------------------------------------------------------ +// 4D Vector; 16 bit signed normalized integer components +typedef struct _XMSHORTN4 +{ + SHORT x; + SHORT y; + SHORT z; + SHORT w; + +#ifdef __cplusplus + + _XMSHORTN4() {}; + _XMSHORTN4(SHORT _x, SHORT _y, SHORT _z, SHORT _w) : x(_x), y(_y), z(_z), w(_w) {}; + explicit _XMSHORTN4(_In_count_c_(4) CONST SHORT *pArray); + _XMSHORTN4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w); + explicit _XMSHORTN4(_In_count_c_(4) CONST FLOAT *pArray); + + _XMSHORTN4& operator= (CONST _XMSHORTN4& ShortN4); + +#endif // __cplusplus 
+ +} XMSHORTN4; + +// 4D Vector; 16 bit signed integer components +typedef struct _XMSHORT4 +{ + SHORT x; + SHORT y; + SHORT z; + SHORT w; + +#ifdef __cplusplus + + _XMSHORT4() {}; + _XMSHORT4(SHORT _x, SHORT _y, SHORT _z, SHORT _w) : x(_x), y(_y), z(_z), w(_w) {}; + explicit _XMSHORT4(_In_count_c_(4) CONST SHORT *pArray); + _XMSHORT4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w); + explicit _XMSHORT4(_In_count_c_(4) CONST FLOAT *pArray); + + _XMSHORT4& operator= (CONST _XMSHORT4& Short4); + +#endif // __cplusplus + +} XMSHORT4; + +// 4D Vector; 16 bit unsigned normalized integer components +typedef struct _XMUSHORTN4 +{ + USHORT x; + USHORT y; + USHORT z; + USHORT w; + +#ifdef __cplusplus + + _XMUSHORTN4() {}; + _XMUSHORTN4(USHORT _x, USHORT _y, USHORT _z, USHORT _w) : x(_x), y(_y), z(_z), w(_w) {}; + explicit _XMUSHORTN4(_In_count_c_(4) CONST USHORT *pArray); + _XMUSHORTN4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w); + explicit _XMUSHORTN4(_In_count_c_(4) CONST FLOAT *pArray); + + _XMUSHORTN4& operator= (CONST _XMUSHORTN4& UShortN4); + +#endif // __cplusplus + +} XMUSHORTN4; + +// 4D Vector; 16 bit unsigned integer components +typedef struct _XMUSHORT4 +{ + USHORT x; + USHORT y; + USHORT z; + USHORT w; + +#ifdef __cplusplus + + _XMUSHORT4() {}; + _XMUSHORT4(USHORT _x, USHORT _y, USHORT _z, USHORT _w) : x(_x), y(_y), z(_z), w(_w) {}; + explicit _XMUSHORT4(_In_count_c_(4) CONST USHORT *pArray); + _XMUSHORT4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w); + explicit _XMUSHORT4(_In_count_c_(4) CONST FLOAT *pArray); + + _XMUSHORT4& operator= (CONST _XMUSHORT4& UShort4); + +#endif // __cplusplus + +} XMUSHORT4; + +//------------------------------------------------------------------------------ +// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer +// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, +// normalized integer for the w component and 10 bit signed, normalized +// integers for the z, y, and x components. The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] +typedef struct _XMXDECN4 +{ + union + { + struct + { + INT x : 10; // -511/511 to 511/511 + INT y : 10; // -511/511 to 511/511 + INT z : 10; // -511/511 to 511/511 + UINT w : 2; // 0/3 to 3/3 + }; + UINT v; + }; + +#ifdef __cplusplus + + _XMXDECN4() {}; + explicit _XMXDECN4(UINT Packed) : v(Packed) {}; + _XMXDECN4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w); + explicit _XMXDECN4(_In_count_c_(4) CONST FLOAT *pArray); + + operator UINT () const { return v; } + + _XMXDECN4& operator= (CONST _XMXDECN4& XDecN4); + _XMXDECN4& operator= (CONST UINT Packed); + +#endif // __cplusplus + +} XMXDECN4; + +// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer +// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned +// integer for the w component and 10 bit signed integers for the +// z, y, and x components. 
The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] +typedef struct _XMXDEC4 +{ + union + { + struct + { + INT x : 10; // -511 to 511 + INT y : 10; // -511 to 511 + INT z : 10; // -511 to 511 + UINT w : 2; // 0 to 3 + }; + UINT v; + }; + +#ifdef __cplusplus + + _XMXDEC4() {}; + explicit _XMXDEC4(UINT Packed) : v(Packed) {}; + _XMXDEC4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w); + explicit _XMXDEC4(_In_count_c_(4) CONST FLOAT *pArray); + + operator UINT () const { return v; } + + _XMXDEC4& operator= (CONST _XMXDEC4& XDec4); + _XMXDEC4& operator= (CONST UINT Packed); + +#endif // __cplusplus + +} XMXDEC4; + +// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer +// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit signed, +// normalized integer for the w component and 10 bit signed, normalized +// integers for the z, y, and x components. The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] +typedef struct _XMDECN4 +{ + union + { + struct + { + INT x : 10; // -511/511 to 511/511 + INT y : 10; // -511/511 to 511/511 + INT z : 10; // -511/511 to 511/511 + INT w : 2; // -1/1 to 1/1 + }; + UINT v; + }; + +#ifdef __cplusplus + + _XMDECN4() {}; + explicit _XMDECN4(UINT Packed) : v(Packed) {}; + _XMDECN4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w); + explicit _XMDECN4(_In_count_c_(4) CONST FLOAT *pArray); + + operator UINT () const { return v; } + + _XMDECN4& operator= (CONST _XMDECN4& DecN4); + _XMDECN4& operator= (CONST UINT Packed); + +#endif // __cplusplus + +} XMDECN4; + +// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer +// The 4D Vector is packed into 32 bits as follows: a 2 bit signed, +// integer for the w component and 10 bit signed integers for the +// z, y, and x components. The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] +typedef struct _XMDEC4 +{ + union + { + struct + { + INT x : 10; // -511 to 511 + INT y : 10; // -511 to 511 + INT z : 10; // -511 to 511 + INT w : 2; // -1 to 1 + }; + UINT v; + }; + +#ifdef __cplusplus + + _XMDEC4() {}; + explicit _XMDEC4(UINT Packed) : v(Packed) {}; + _XMDEC4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w); + explicit _XMDEC4(_In_count_c_(4) CONST FLOAT *pArray); + + operator UINT () const { return v; } + + _XMDEC4& operator= (CONST _XMDEC4& Dec4); + _XMDEC4& operator= (CONST UINT Packed); + +#endif // __cplusplus + +} XMDEC4; + +// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer +// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, +// normalized integer for the w component and 10 bit unsigned, normalized +// integers for the z, y, and x components. 
The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] +typedef struct _XMUDECN4 +{ + union + { + struct + { + UINT x : 10; // 0/1023 to 1023/1023 + UINT y : 10; // 0/1023 to 1023/1023 + UINT z : 10; // 0/1023 to 1023/1023 + UINT w : 2; // 0/3 to 3/3 + }; + UINT v; + }; + +#ifdef __cplusplus + + _XMUDECN4() {}; + explicit _XMUDECN4(UINT Packed) : v(Packed) {}; + _XMUDECN4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w); + explicit _XMUDECN4(_In_count_c_(4) CONST FLOAT *pArray); + + operator UINT () const { return v; } + + _XMUDECN4& operator= (CONST _XMUDECN4& UDecN4); + _XMUDECN4& operator= (CONST UINT Packed); + +#endif // __cplusplus + +} XMUDECN4; + +// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer +// The 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, +// integer for the w component and 10 bit unsigned integers +// for the z, y, and x components. The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] +typedef struct _XMUDEC4 +{ + union + { + struct + { + UINT x : 10; // 0 to 1023 + UINT y : 10; // 0 to 1023 + UINT z : 10; // 0 to 1023 + UINT w : 2; // 0 to 3 + }; + UINT v; + }; + +#ifdef __cplusplus + + _XMUDEC4() {}; + explicit _XMUDEC4(UINT Packed) : v(Packed) {}; + _XMUDEC4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w); + explicit _XMUDEC4(_In_count_c_(4) CONST FLOAT *pArray); + + operator UINT () const { return v; } + + _XMUDEC4& operator= (CONST _XMUDEC4& UDec4); + _XMUDEC4& operator= (CONST UINT Packed); + +#endif // __cplusplus + +} XMUDEC4; + +//------------------------------------------------------------------------------ +// 4D Vector; 20-20-20-4 bit normalized components packed into a 64 bit integer +// The normalized 4D Vector is packed into 64 bits as follows: a 4 bit unsigned, +// normalized integer for the w component and 20 bit signed, normalized +// integers for the z, y, and x components. The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W4Z20Y20X20): [64] wwwwzzzz zzzzzzzz zzzzzzzz yyyyyyyy yyyyyyyy yyyyxxxx xxxxxxxx xxxxxxxx [0] +typedef struct _XMXICON4 +{ + union + { + struct + { + INT64 x : 20; // -524287/524287 to 524287/524287 + INT64 y : 20; // -524287/524287 to 524287/524287 + INT64 z : 20; // -524287/524287 to 524287/524287 + UINT64 w : 4; // 0/15 to 15/15 + }; + UINT64 v; + }; + +#ifdef __cplusplus + + _XMXICON4() {}; + explicit _XMXICON4(UINT64 Packed) : v(Packed) {}; + _XMXICON4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w); + explicit _XMXICON4(_In_count_c_(4) CONST FLOAT *pArray); + + operator UINT64 () const { return v; } + + _XMXICON4& operator= (CONST _XMXICON4& XIcoN4); + _XMXICON4& operator= (CONST UINT64 Packed); + +#endif // __cplusplus + +} XMXICON4; + +// 4D Vector; 20-20-20-4 bit components packed into a 64 bit integer +// The 4D Vector is packed into 64 bits as follows: a 4 bit unsigned +// integer for the w component and 20 bit signed integers for the +// z, y, and x components. 
The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W4Z20Y20X20): [64] wwwwzzzz zzzzzzzz zzzzzzzz yyyyyyyy yyyyyyyy yyyyxxxx xxxxxxxx xxxxxxxx [0] +typedef struct _XMXICO4 +{ + union + { + struct + { + INT64 x : 20; // -524287 to 524287 + INT64 y : 20; // -524287 to 524287 + INT64 z : 20; // -524287 to 524287 + UINT64 w : 4; // 0 to 15 + }; + UINT64 v; + }; + +#ifdef __cplusplus + + _XMXICO4() {}; + explicit _XMXICO4(UINT64 Packed) : v(Packed) {}; + _XMXICO4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w); + explicit _XMXICO4(_In_count_c_(4) CONST FLOAT *pArray); + + operator UINT64 () const { return v; } + + _XMXICO4& operator= (CONST _XMXICO4& XIco4); + _XMXICO4& operator= (CONST UINT64 Packed); + +#endif // __cplusplus + +} XMXICO4; + +// 4D Vector; 20-20-20-4 bit normalized components packed into a 64 bit integer +// The normalized 4D Vector is packed into 64 bits as follows: a 4 bit signed, +// normalized integer for the w component and 20 bit signed, normalized +// integers for the z, y, and x components. The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W4Z20Y20X20): [64] wwwwzzzz zzzzzzzz zzzzzzzz yyyyyyyy yyyyyyyy yyyyxxxx xxxxxxxx xxxxxxxx [0] +typedef struct _XMICON4 +{ + union + { + struct + { + INT64 x : 20; // -524287/524287 to 524287/524287 + INT64 y : 20; // -524287/524287 to 524287/524287 + INT64 z : 20; // -524287/524287 to 524287/524287 + INT64 w : 4; // -7/7 to 7/7 + }; + UINT64 v; + }; + +#ifdef __cplusplus + + _XMICON4() {}; + explicit _XMICON4(UINT64 Packed) : v(Packed) {}; + _XMICON4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w); + explicit _XMICON4(_In_count_c_(4) CONST FLOAT *pArray); + + operator UINT64 () const { return v; } + + _XMICON4& operator= (CONST _XMICON4& IcoN4); + _XMICON4& operator= (CONST UINT64 Packed); + +#endif // __cplusplus + +} XMICON4; + +// 4D Vector; 20-20-20-4 bit components packed into a 64 bit integer +// The 4D Vector is packed into 64 bits as follows: a 4 bit signed, +// integer for the w component and 20 bit signed integers for the +// z, y, and x components. The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W4Z20Y20X20): [64] wwwwzzzz zzzzzzzz zzzzzzzz yyyyyyyy yyyyyyyy yyyyxxxx xxxxxxxx xxxxxxxx [0] +typedef struct _XMICO4 +{ + union + { + struct + { + INT64 x : 20; // -524287 to 524287 + INT64 y : 20; // -524287 to 524287 + INT64 z : 20; // -524287 to 524287 + INT64 w : 4; // -7 to 7 + }; + UINT64 v; + }; + +#ifdef __cplusplus + + _XMICO4() {}; + explicit _XMICO4(UINT64 Packed) : v(Packed) {}; + _XMICO4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w); + explicit _XMICO4(_In_count_c_(4) CONST FLOAT *pArray); + + operator UINT64 () const { return v; } + + _XMICO4& operator= (CONST _XMICO4& Ico4); + _XMICO4& operator= (CONST UINT64 Packed); + +#endif // __cplusplus + +} XMICO4; + +// 4D Vector; 20-20-20-4 bit normalized components packed into a 64 bit integer +// The normalized 4D Vector is packed into 64 bits as follows: a 4 bit unsigned, +// normalized integer for the w component and 20 bit unsigned, normalized +// integers for the z, y, and x components. 
The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W4Z20Y20X20): [64] wwwwzzzz zzzzzzzz zzzzzzzz yyyyyyyy yyyyyyyy yyyyxxxx xxxxxxxx xxxxxxxx [0] +typedef struct _XMUICON4 +{ + union + { + struct + { + UINT64 x : 20; // 0/1048575 to 1048575/1048575 + UINT64 y : 20; // 0/1048575 to 1048575/1048575 + UINT64 z : 20; // 0/1048575 to 1048575/1048575 + UINT64 w : 4; // 0/15 to 15/15 + }; + UINT64 v; + }; + +#ifdef __cplusplus + + _XMUICON4() {}; + explicit _XMUICON4(UINT64 Packed) : v(Packed) {}; + _XMUICON4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w); + explicit _XMUICON4(_In_count_c_(4) CONST FLOAT *pArray); + + operator UINT64 () const { return v; } + + _XMUICON4& operator= (CONST _XMUICON4& UIcoN4); + _XMUICON4& operator= (CONST UINT64 Packed); + +#endif // __cplusplus + +} XMUICON4; + +// 4D Vector; 20-20-20-4 bit components packed into a 64 bit integer +// The 4D Vector is packed into 64 bits as follows: a 4 bit unsigned +// integer for the w component and 20 bit unsigned integers for the +// z, y, and x components. The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W4Z20Y20X20): [64] wwwwzzzz zzzzzzzz zzzzzzzz yyyyyyyy yyyyyyyy yyyyxxxx xxxxxxxx xxxxxxxx [0] +typedef struct _XMUICO4 +{ + union + { + struct + { + UINT64 x : 20; // 0 to 1048575 + UINT64 y : 20; // 0 to 1048575 + UINT64 z : 20; // 0 to 1048575 + UINT64 w : 4; // 0 to 15 + }; + UINT64 v; + }; + +#ifdef __cplusplus + + _XMUICO4() {}; + explicit _XMUICO4(UINT64 Packed) : v(Packed) {}; + _XMUICO4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w); + explicit _XMUICO4(_In_count_c_(4) CONST FLOAT *pArray); + + operator UINT64 () const { return v; } + + _XMUICO4& operator= (CONST _XMUICO4& UIco4); + _XMUICO4& operator= (CONST UINT64 Packed); + +#endif // __cplusplus + +} XMUICO4; + +//------------------------------------------------------------------------------ +// ARGB Color; 8-8-8-8 bit unsigned normalized integer components packed into +// a 32 bit integer. The normalized color is packed into 32 bits using 8 bit +// unsigned, normalized integers for the alpha, red, green, and blue components. 
+// The alpha component is stored in the most significant bits and the blue +// component in the least significant bits (A8R8G8B8): +// [32] aaaaaaaa rrrrrrrr gggggggg bbbbbbbb [0] +typedef struct _XMCOLOR +{ + union + { + struct + { + UINT b : 8; // Blue: 0/255 to 255/255 + UINT g : 8; // Green: 0/255 to 255/255 + UINT r : 8; // Red: 0/255 to 255/255 + UINT a : 8; // Alpha: 0/255 to 255/255 + }; + UINT c; + }; + +#ifdef __cplusplus + + _XMCOLOR() {}; + _XMCOLOR(UINT Color) : c(Color) {}; + _XMCOLOR(FLOAT _r, FLOAT _g, FLOAT _b, FLOAT _a); + explicit _XMCOLOR(_In_count_c_(4) CONST FLOAT *pArray); + + operator UINT () const { return c; } + + _XMCOLOR& operator= (CONST _XMCOLOR& Color); + _XMCOLOR& operator= (CONST UINT Color); + +#endif // __cplusplus + +} XMCOLOR; + +//------------------------------------------------------------------------------ +// 4D Vector; 8 bit signed normalized integer components +typedef struct _XMBYTEN4 +{ + union + { + struct + { + CHAR x; + CHAR y; + CHAR z; + CHAR w; + }; + UINT v; + }; + +#ifdef __cplusplus + + _XMBYTEN4() {}; + _XMBYTEN4(CHAR _x, CHAR _y, CHAR _z, CHAR _w) : x(_x), y(_y), z(_z), w(_w) {}; + explicit _XMBYTEN4(UINT Packed) : v(Packed) {}; + explicit _XMBYTEN4(_In_count_c_(4) CONST CHAR *pArray); + _XMBYTEN4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w); + explicit _XMBYTEN4(_In_count_c_(4) CONST FLOAT *pArray); + + _XMBYTEN4& operator= (CONST _XMBYTEN4& ByteN4); + _XMBYTEN4& operator= (UINT Packed) { v = Packed; return *this; } + +#endif // __cplusplus + +} XMBYTEN4; + +// 4D Vector; 8 bit signed integer components +typedef struct _XMBYTE4 +{ + union + { + struct + { + CHAR x; + CHAR y; + CHAR z; + CHAR w; + }; + UINT v; + }; + +#ifdef __cplusplus + + _XMBYTE4() {}; + _XMBYTE4(CHAR _x, CHAR _y, CHAR _z, CHAR _w) : x(_x), y(_y), z(_z), w(_w) {}; + explicit _XMBYTE4(UINT Packed) : v(Packed) {}; + explicit _XMBYTE4(_In_count_c_(4) CONST CHAR *pArray); + _XMBYTE4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w); + explicit _XMBYTE4(_In_count_c_(4) CONST FLOAT *pArray); + + _XMBYTE4& operator= (CONST _XMBYTE4& Byte4); + _XMBYTE4& operator= (UINT Packed) { v = Packed; return *this; } + +#endif // __cplusplus + +} XMBYTE4; + +// 4D Vector; 8 bit unsigned normalized integer components +typedef struct _XMUBYTEN4 +{ + union + { + struct + { + BYTE x; + BYTE y; + BYTE z; + BYTE w; + }; + UINT v; + }; + +#ifdef __cplusplus + + _XMUBYTEN4() {}; + _XMUBYTEN4(BYTE _x, BYTE _y, BYTE _z, BYTE _w) : x(_x), y(_y), z(_z), w(_w) {}; + explicit _XMUBYTEN4(UINT Packed) : v(Packed) {}; + explicit _XMUBYTEN4(_In_count_c_(4) CONST BYTE *pArray); + _XMUBYTEN4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w); + explicit _XMUBYTEN4(_In_count_c_(4) CONST FLOAT *pArray); + + _XMUBYTEN4& operator= (CONST _XMUBYTEN4& UByteN4); + _XMUBYTEN4& operator= (UINT Packed) { v = Packed; return *this; } + +#endif // __cplusplus + +} XMUBYTEN4; + +// 4D Vector; 8 bit unsigned integer components +typedef struct _XMUBYTE4 +{ + union + { + struct + { + BYTE x; + BYTE y; + BYTE z; + BYTE w; + }; + UINT v; + }; + +#ifdef __cplusplus + + _XMUBYTE4() {}; + _XMUBYTE4(BYTE _x, BYTE _y, BYTE _z, BYTE _w) : x(_x), y(_y), z(_z), w(_w) {}; + explicit _XMUBYTE4(UINT Packed) : v(Packed) {}; + explicit _XMUBYTE4(_In_count_c_(4) CONST BYTE *pArray); + _XMUBYTE4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w); + explicit _XMUBYTE4(_In_count_c_(4) CONST FLOAT *pArray); + + _XMUBYTE4& operator= (CONST _XMUBYTE4& UByte4); + _XMUBYTE4& operator= (UINT Packed) { v = Packed; return *this; } + +#endif // __cplusplus + +} XMUBYTE4; + 
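The packed ARGB layout documented above can be exercised without the rest of the library; as an illustrative sketch (not part of the imported header), the following standalone snippet packs four floats into the A8R8G8B8 layout described for XMCOLOR, using plain round-to-nearest scaling (the library's own conversion may differ in edge cases):

#include <stdint.h>

// Illustrative only -- not part of xnamath.h.  Packs r,g,b,a in [0,1] into
// [32] aaaaaaaa rrrrrrrr gggggggg bbbbbbbb [0], as documented for XMCOLOR.
static uint32_t PackColorA8R8G8B8(float r, float g, float b, float a)
{
    // Clamp each channel to [0,1], then scale to an 8-bit unsigned normalized value.
    auto to8 = [](float f) -> uint32_t {
        if (f < 0.0f) f = 0.0f;
        if (f > 1.0f) f = 1.0f;
        return static_cast<uint32_t>(f * 255.0f + 0.5f);
    };
    return (to8(a) << 24) | (to8(r) << 16) | (to8(g) << 8) | to8(b);
}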
+//------------------------------------------------------------------------------ +// 4D vector; 4 bit unsigned integer components +typedef struct _XMUNIBBLE4 +{ + union + { + struct + { + USHORT x : 4; + USHORT y : 4; + USHORT z : 4; + USHORT w : 4; + }; + USHORT v; + }; + +#ifdef __cplusplus + + _XMUNIBBLE4() {}; + explicit _XMUNIBBLE4(USHORT Packed) : v(Packed) {}; + _XMUNIBBLE4(CHAR _x, CHAR _y, CHAR _z, CHAR _w) : x(_x), y(_y), z(_z), w(_w) {}; + explicit _XMUNIBBLE4(_In_count_c_(4) CONST CHAR *pArray); + _XMUNIBBLE4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w); + explicit _XMUNIBBLE4(_In_count_c_(4) CONST FLOAT *pArray); + + operator USHORT () const { return v; } + + _XMUNIBBLE4& operator= (CONST _XMUNIBBLE4& UNibble4); + _XMUNIBBLE4& operator= (CONST USHORT Packed); + +#endif // __cplusplus + +} XMUNIBBLE4; + +//------------------------------------------------------------------------------ +// 4D vector: 5/5/5/1 unsigned integer components +typedef struct _XMU555 +{ + union + { + struct + { + USHORT x : 5; + USHORT y : 5; + USHORT z : 5; + USHORT w : 1; + }; + USHORT v; + }; + +#ifdef __cplusplus + + _XMU555() {}; + explicit _XMU555(USHORT Packed) : v(Packed) {}; + _XMU555(CHAR _x, CHAR _y, CHAR _z, BOOL _w) : x(_x), y(_y), z(_z), w(_w ? 0x1 : 0) {}; + _XMU555(_In_count_c_(3) CONST CHAR *pArray, BOOL _w); + _XMU555(FLOAT _x, FLOAT _y, FLOAT _z, BOOL _w); + _XMU555(_In_count_c_(3) CONST FLOAT *pArray, BOOL _w); + + operator USHORT () const { return v; } + + _XMU555& operator= (CONST _XMU555& U555); + _XMU555& operator= (CONST USHORT Packed); + +#endif // __cplusplus + +} XMU555; + +//------------------------------------------------------------------------------ +// 3x3 Matrix: 32 bit floating point components +typedef struct _XMFLOAT3X3 +{ + union + { + struct + { + FLOAT _11, _12, _13; + FLOAT _21, _22, _23; + FLOAT _31, _32, _33; + }; + FLOAT m[3][3]; + }; + +#ifdef __cplusplus + + _XMFLOAT3X3() {}; + _XMFLOAT3X3(FLOAT m00, FLOAT m01, FLOAT m02, + FLOAT m10, FLOAT m11, FLOAT m12, + FLOAT m20, FLOAT m21, FLOAT m22); + explicit _XMFLOAT3X3(_In_count_c_(9) CONST FLOAT *pArray); + + FLOAT operator() (UINT Row, UINT Column) CONST { return m[Row][Column]; } + FLOAT& operator() (UINT Row, UINT Column) { return m[Row][Column]; } + + _XMFLOAT3X3& operator= (CONST _XMFLOAT3X3& Float3x3); + +#endif // __cplusplus + +} XMFLOAT3X3; + +//------------------------------------------------------------------------------ +// 4x3 Matrix: 32 bit floating point components +typedef struct _XMFLOAT4X3 +{ + union + { + struct + { + FLOAT _11, _12, _13; + FLOAT _21, _22, _23; + FLOAT _31, _32, _33; + FLOAT _41, _42, _43; + }; + FLOAT m[4][3]; + }; + +#ifdef __cplusplus + + _XMFLOAT4X3() {}; + _XMFLOAT4X3(FLOAT m00, FLOAT m01, FLOAT m02, + FLOAT m10, FLOAT m11, FLOAT m12, + FLOAT m20, FLOAT m21, FLOAT m22, + FLOAT m30, FLOAT m31, FLOAT m32); + explicit _XMFLOAT4X3(_In_count_c_(12) CONST FLOAT *pArray); + + FLOAT operator() (UINT Row, UINT Column) CONST { return m[Row][Column]; } + FLOAT& operator() (UINT Row, UINT Column) { return m[Row][Column]; } + + _XMFLOAT4X3& operator= (CONST _XMFLOAT4X3& Float4x3); + +#endif // __cplusplus + +} XMFLOAT4X3; + +// 4x3 Matrix: 32 bit floating point components aligned on a 16 byte boundary +#ifdef __cplusplus +__declspec(align(16)) struct XMFLOAT4X3A : public XMFLOAT4X3 +{ + XMFLOAT4X3A() : XMFLOAT4X3() {}; + XMFLOAT4X3A(FLOAT m00, FLOAT m01, FLOAT m02, + FLOAT m10, FLOAT m11, FLOAT m12, + FLOAT m20, FLOAT m21, FLOAT m22, + FLOAT m30, FLOAT m31, FLOAT m32) : + 
XMFLOAT4X3(m00,m01,m02,m10,m11,m12,m20,m21,m22,m30,m31,m32) {}; + explicit XMFLOAT4X3A(_In_count_c_(12) CONST FLOAT *pArray) : XMFLOAT4X3(pArray) {} + + FLOAT operator() (UINT Row, UINT Column) CONST { return m[Row][Column]; } + FLOAT& operator() (UINT Row, UINT Column) { return m[Row][Column]; } + + XMFLOAT4X3A& operator= (CONST XMFLOAT4X3A& Float4x3); +}; +#else +typedef __declspec(align(16)) XMFLOAT4X3 XMFLOAT4X3A; +#endif // __cplusplus + +//------------------------------------------------------------------------------ +// 4x4 Matrix: 32 bit floating point components +typedef struct _XMFLOAT4X4 +{ + union + { + struct + { + FLOAT _11, _12, _13, _14; + FLOAT _21, _22, _23, _24; + FLOAT _31, _32, _33, _34; + FLOAT _41, _42, _43, _44; + }; + FLOAT m[4][4]; + }; + +#ifdef __cplusplus + + _XMFLOAT4X4() {}; + _XMFLOAT4X4(FLOAT m00, FLOAT m01, FLOAT m02, FLOAT m03, + FLOAT m10, FLOAT m11, FLOAT m12, FLOAT m13, + FLOAT m20, FLOAT m21, FLOAT m22, FLOAT m23, + FLOAT m30, FLOAT m31, FLOAT m32, FLOAT m33); + explicit _XMFLOAT4X4(_In_count_c_(16) CONST FLOAT *pArray); + + FLOAT operator() (UINT Row, UINT Column) CONST { return m[Row][Column]; } + FLOAT& operator() (UINT Row, UINT Column) { return m[Row][Column]; } + + _XMFLOAT4X4& operator= (CONST _XMFLOAT4X4& Float4x4); + +#endif // __cplusplus + +} XMFLOAT4X4; + +// 4x4 Matrix: 32 bit floating point components aligned on a 16 byte boundary +#ifdef __cplusplus +__declspec(align(16)) struct XMFLOAT4X4A : public XMFLOAT4X4 +{ + XMFLOAT4X4A() : XMFLOAT4X4() {}; + XMFLOAT4X4A(FLOAT m00, FLOAT m01, FLOAT m02, FLOAT m03, + FLOAT m10, FLOAT m11, FLOAT m12, FLOAT m13, + FLOAT m20, FLOAT m21, FLOAT m22, FLOAT m23, + FLOAT m30, FLOAT m31, FLOAT m32, FLOAT m33) + : XMFLOAT4X4(m00,m01,m02,m03,m10,m11,m12,m13,m20,m21,m22,m23,m30,m31,m32,m33) {}; + explicit XMFLOAT4X4A(_In_count_c_(16) CONST FLOAT *pArray) : XMFLOAT4X4(pArray) {} + + FLOAT operator() (UINT Row, UINT Column) CONST { return m[Row][Column]; } + FLOAT& operator() (UINT Row, UINT Column) { return m[Row][Column]; } + + XMFLOAT4X4A& operator= (CONST XMFLOAT4X4A& Float4x4); +}; +#else +typedef __declspec(align(16)) XMFLOAT4X4 XMFLOAT4X4A; +#endif // __cplusplus + + +#ifdef _XM_BIGENDIAN_ +#pragma bitfield_order(pop) +#endif + +#pragma warning(pop) + +/**************************************************************************** + * + * Data conversion operations + * + ****************************************************************************/ + +#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_VMX128_INTRINSICS_) +#else +XMVECTOR XMConvertVectorIntToFloat(FXMVECTOR VInt, UINT DivExponent); +XMVECTOR XMConvertVectorFloatToInt(FXMVECTOR VFloat, UINT MulExponent); +XMVECTOR XMConvertVectorUIntToFloat(FXMVECTOR VUInt, UINT DivExponent); +XMVECTOR XMConvertVectorFloatToUInt(FXMVECTOR VFloat, UINT MulExponent); +#endif + +FLOAT XMConvertHalfToFloat(HALF Value); +FLOAT* XMConvertHalfToFloatStream(_Out_bytecap_x_(sizeof(FLOAT)+OutputStride*(HalfCount-1)) FLOAT* pOutputStream, + _In_ size_t OutputStride, + _In_bytecount_x_(sizeof(HALF)+InputStride*(HalfCount-1)) CONST HALF* pInputStream, + _In_ size_t InputStride, _In_ size_t HalfCount); +HALF XMConvertFloatToHalf(FLOAT Value); +HALF* XMConvertFloatToHalfStream(_Out_bytecap_x_(sizeof(HALF)+OutputStride*(FloatCount-1)) HALF* pOutputStream, + _In_ size_t OutputStride, + _In_bytecount_x_(sizeof(FLOAT)+InputStride*(FloatCount-1)) CONST FLOAT* pInputStream, + _In_ size_t InputStride, _In_ size_t FloatCount); + +#if defined(_XM_NO_INTRINSICS_) || 
defined(_XM_SSE_INTRINSICS_) +XMVECTOR XMVectorSetBinaryConstant(UINT C0, UINT C1, UINT C2, UINT C3); +XMVECTOR XMVectorSplatConstant(INT IntConstant, UINT DivExponent); +XMVECTOR XMVectorSplatConstantInt(INT IntConstant); + +// VMX128 versions defined below as macros +#endif + +/**************************************************************************** + * + * Load operations + * + ****************************************************************************/ + +XMVECTOR XMLoadInt(_In_ CONST UINT* pSource); +XMVECTOR XMLoadFloat(_In_ CONST FLOAT* pSource); + +XMVECTOR XMLoadInt2(_In_count_c_(2) CONST UINT* pSource); +XMVECTOR XMLoadInt2A(_In_count_c_(2) CONST UINT* PSource); +XMVECTOR XMLoadFloat2(_In_ CONST XMFLOAT2* pSource); +XMVECTOR XMLoadFloat2A(_In_ CONST XMFLOAT2A* pSource); +XMVECTOR XMLoadSInt2(_In_ CONST XMINT2* pSource); +XMVECTOR XMLoadUInt2(_In_ CONST XMUINT2* pSource); +XMVECTOR XMLoadHalf2(_In_ CONST XMHALF2* pSource); +XMVECTOR XMLoadShortN2(_In_ CONST XMSHORTN2* pSource); +XMVECTOR XMLoadShort2(_In_ CONST XMSHORT2* pSource); +XMVECTOR XMLoadUShortN2(_In_ CONST XMUSHORTN2* pSource); +XMVECTOR XMLoadUShort2(_In_ CONST XMUSHORT2* pSource); +XMVECTOR XMLoadByteN2(_In_ CONST XMBYTEN2* pSource); +XMVECTOR XMLoadByte2(_In_ CONST XMBYTE2* pSource); +XMVECTOR XMLoadUByteN2(_In_ CONST XMUBYTEN2* pSource); +XMVECTOR XMLoadUByte2(_In_ CONST XMUBYTE2* pSource); + +XMVECTOR XMLoadInt3(_In_count_c_(3) CONST UINT* pSource); +XMVECTOR XMLoadInt3A(_In_count_c_(3) CONST UINT* pSource); +XMVECTOR XMLoadFloat3(_In_ CONST XMFLOAT3* pSource); +XMVECTOR XMLoadFloat3A(_In_ CONST XMFLOAT3A* pSource); +XMVECTOR XMLoadSInt3(_In_ CONST XMINT3* pSource); +XMVECTOR XMLoadUInt3(_In_ CONST XMUINT3* pSource); +XMVECTOR XMLoadHenDN3(_In_ CONST XMHENDN3* pSource); +XMVECTOR XMLoadHenD3(_In_ CONST XMHEND3* pSource); +XMVECTOR XMLoadUHenDN3(_In_ CONST XMUHENDN3* pSource); +XMVECTOR XMLoadUHenD3(_In_ CONST XMUHEND3* pSource); +XMVECTOR XMLoadDHenN3(_In_ CONST XMDHENN3* pSource); +XMVECTOR XMLoadDHen3(_In_ CONST XMDHEN3* pSource); +XMVECTOR XMLoadUDHenN3(_In_ CONST XMUDHENN3* pSource); +XMVECTOR XMLoadUDHen3(_In_ CONST XMUDHEN3* pSource); +XMVECTOR XMLoadU565(_In_ CONST XMU565* pSource); +XMVECTOR XMLoadFloat3PK(_In_ CONST XMFLOAT3PK* pSource); +XMVECTOR XMLoadFloat3SE(_In_ CONST XMFLOAT3SE* pSource); + +XMVECTOR XMLoadInt4(_In_count_c_(4) CONST UINT* pSource); +XMVECTOR XMLoadInt4A(_In_count_c_(4) CONST UINT* pSource); +XMVECTOR XMLoadFloat4(_In_ CONST XMFLOAT4* pSource); +XMVECTOR XMLoadFloat4A(_In_ CONST XMFLOAT4A* pSource); +XMVECTOR XMLoadSInt4(_In_ CONST XMINT4* pSource); +XMVECTOR XMLoadUInt4(_In_ CONST XMUINT4* pSource); +XMVECTOR XMLoadHalf4(_In_ CONST XMHALF4* pSource); +XMVECTOR XMLoadShortN4(_In_ CONST XMSHORTN4* pSource); +XMVECTOR XMLoadShort4(_In_ CONST XMSHORT4* pSource); +XMVECTOR XMLoadUShortN4(_In_ CONST XMUSHORTN4* pSource); +XMVECTOR XMLoadUShort4(_In_ CONST XMUSHORT4* pSource); +XMVECTOR XMLoadXIcoN4(_In_ CONST XMXICON4* pSource); +XMVECTOR XMLoadXIco4(_In_ CONST XMXICO4* pSource); +XMVECTOR XMLoadIcoN4(_In_ CONST XMICON4* pSource); +XMVECTOR XMLoadIco4(_In_ CONST XMICO4* pSource); +XMVECTOR XMLoadUIcoN4(_In_ CONST XMUICON4* pSource); +XMVECTOR XMLoadUIco4(_In_ CONST XMUICO4* pSource); +XMVECTOR XMLoadXDecN4(_In_ CONST XMXDECN4* pSource); +XMVECTOR XMLoadXDec4(_In_ CONST XMXDEC4* pSource); +XMVECTOR XMLoadDecN4(_In_ CONST XMDECN4* pSource); +XMVECTOR XMLoadDec4(_In_ CONST XMDEC4* pSource); +XMVECTOR XMLoadUDecN4(_In_ CONST XMUDECN4* pSource); +XMVECTOR XMLoadUDec4(_In_ CONST XMUDEC4* 
pSource); +XMVECTOR XMLoadByteN4(_In_ CONST XMBYTEN4* pSource); +XMVECTOR XMLoadByte4(_In_ CONST XMBYTE4* pSource); +XMVECTOR XMLoadUByteN4(_In_ CONST XMUBYTEN4* pSource); +XMVECTOR XMLoadUByte4(_In_ CONST XMUBYTE4* pSource); +XMVECTOR XMLoadUNibble4(_In_ CONST XMUNIBBLE4* pSource); +XMVECTOR XMLoadU555(_In_ CONST XMU555* pSource); +XMVECTOR XMLoadColor(_In_ CONST XMCOLOR* pSource); + +XMMATRIX XMLoadFloat3x3(_In_ CONST XMFLOAT3X3* pSource); +XMMATRIX XMLoadFloat4x3(_In_ CONST XMFLOAT4X3* pSource); +XMMATRIX XMLoadFloat4x3A(_In_ CONST XMFLOAT4X3A* pSource); +XMMATRIX XMLoadFloat4x4(_In_ CONST XMFLOAT4X4* pSource); +XMMATRIX XMLoadFloat4x4A(_In_ CONST XMFLOAT4X4A* pSource); + +/**************************************************************************** + * + * Store operations + * + ****************************************************************************/ + +VOID XMStoreInt(_Out_ UINT* pDestination, FXMVECTOR V); +VOID XMStoreFloat(_Out_ FLOAT* pDestination, FXMVECTOR V); + +VOID XMStoreInt2(_Out_cap_c_(2) UINT* pDestination, FXMVECTOR V); +VOID XMStoreInt2A(_Out_cap_c_(2) UINT* pDestination, FXMVECTOR V); +VOID XMStoreFloat2(_Out_ XMFLOAT2* pDestination, FXMVECTOR V); +VOID XMStoreFloat2A(_Out_ XMFLOAT2A* pDestination, FXMVECTOR V); +VOID XMStoreSInt2(_Out_ XMINT2* pDestination, FXMVECTOR V); +VOID XMStoreUInt2(_Out_ XMUINT2* pDestination, FXMVECTOR V); +VOID XMStoreHalf2(_Out_ XMHALF2* pDestination, FXMVECTOR V); +VOID XMStoreShortN2(_Out_ XMSHORTN2* pDestination, FXMVECTOR V); +VOID XMStoreShort2(_Out_ XMSHORT2* pDestination, FXMVECTOR V); +VOID XMStoreUShortN2(_Out_ XMUSHORTN2* pDestination, FXMVECTOR V); +VOID XMStoreUShort2(_Out_ XMUSHORT2* pDestination, FXMVECTOR V); +VOID XMStoreByteN2(_Out_ XMBYTEN2* pDestination, FXMVECTOR V); +VOID XMStoreByte2(_Out_ XMBYTE2* pDestination, FXMVECTOR V); +VOID XMStoreUByteN2(_Out_ XMUBYTEN2* pDestination, FXMVECTOR V); +VOID XMStoreUByte2(_Out_ XMUBYTE2* pDestination, FXMVECTOR V); + +VOID XMStoreInt3(_Out_cap_c_(3) UINT* pDestination, FXMVECTOR V); +VOID XMStoreInt3A(_Out_cap_c_(3) UINT* pDestination, FXMVECTOR V); +VOID XMStoreFloat3(_Out_ XMFLOAT3* pDestination, FXMVECTOR V); +VOID XMStoreFloat3A(_Out_ XMFLOAT3A* pDestination, FXMVECTOR V); +VOID XMStoreSInt3(_Out_ XMINT3* pDestination, FXMVECTOR V); +VOID XMStoreUInt3(_Out_ XMUINT3* pDestination, FXMVECTOR V); +VOID XMStoreHenDN3(_Out_ XMHENDN3* pDestination, FXMVECTOR V); +VOID XMStoreHenD3(_Out_ XMHEND3* pDestination, FXMVECTOR V); +VOID XMStoreUHenDN3(_Out_ XMUHENDN3* pDestination, FXMVECTOR V); +VOID XMStoreUHenD3(_Out_ XMUHEND3* pDestination, FXMVECTOR V); +VOID XMStoreDHenN3(_Out_ XMDHENN3* pDestination, FXMVECTOR V); +VOID XMStoreDHen3(_Out_ XMDHEN3* pDestination, FXMVECTOR V); +VOID XMStoreUDHenN3(_Out_ XMUDHENN3* pDestination, FXMVECTOR V); +VOID XMStoreUDHen3(_Out_ XMUDHEN3* pDestination, FXMVECTOR V); +VOID XMStoreU565(_Out_ XMU565* pDestination, FXMVECTOR V); +VOID XMStoreFloat3PK(_Out_ XMFLOAT3PK* pDestination, FXMVECTOR V); +VOID XMStoreFloat3SE(_Out_ XMFLOAT3SE* pDestination, FXMVECTOR V); + +VOID XMStoreInt4(_Out_cap_c_(4) UINT* pDestination, FXMVECTOR V); +VOID XMStoreInt4A(_Out_cap_c_(4) UINT* pDestination, FXMVECTOR V); +VOID XMStoreInt4NC(_Out_cap_c_(4) UINT* pDestination, FXMVECTOR V); +VOID XMStoreFloat4(_Out_ XMFLOAT4* pDestination, FXMVECTOR V); +VOID XMStoreFloat4A(_Out_ XMFLOAT4A* pDestination, FXMVECTOR V); +VOID XMStoreFloat4NC(_Out_ XMFLOAT4* pDestination, FXMVECTOR V); +VOID XMStoreSInt4(_Out_ XMINT4* pDestination, FXMVECTOR V); +VOID XMStoreUInt4(_Out_ 
XMUINT4* pDestination, FXMVECTOR V); +VOID XMStoreHalf4(_Out_ XMHALF4* pDestination, FXMVECTOR V); +VOID XMStoreShortN4(_Out_ XMSHORTN4* pDestination, FXMVECTOR V); +VOID XMStoreShort4(_Out_ XMSHORT4* pDestination, FXMVECTOR V); +VOID XMStoreUShortN4(_Out_ XMUSHORTN4* pDestination, FXMVECTOR V); +VOID XMStoreUShort4(_Out_ XMUSHORT4* pDestination, FXMVECTOR V); +VOID XMStoreXIcoN4(_Out_ XMXICON4* pDestination, FXMVECTOR V); +VOID XMStoreXIco4(_Out_ XMXICO4* pDestination, FXMVECTOR V); +VOID XMStoreIcoN4(_Out_ XMICON4* pDestination, FXMVECTOR V); +VOID XMStoreIco4(_Out_ XMICO4* pDestination, FXMVECTOR V); +VOID XMStoreUIcoN4(_Out_ XMUICON4* pDestination, FXMVECTOR V); +VOID XMStoreUIco4(_Out_ XMUICO4* pDestination, FXMVECTOR V); +VOID XMStoreXDecN4(_Out_ XMXDECN4* pDestination, FXMVECTOR V); +VOID XMStoreXDec4(_Out_ XMXDEC4* pDestination, FXMVECTOR V); +VOID XMStoreDecN4(_Out_ XMDECN4* pDestination, FXMVECTOR V); +VOID XMStoreDec4(_Out_ XMDEC4* pDestination, FXMVECTOR V); +VOID XMStoreUDecN4(_Out_ XMUDECN4* pDestination, FXMVECTOR V); +VOID XMStoreUDec4(_Out_ XMUDEC4* pDestination, FXMVECTOR V); +VOID XMStoreByteN4(_Out_ XMBYTEN4* pDestination, FXMVECTOR V); +VOID XMStoreByte4(_Out_ XMBYTE4* pDestination, FXMVECTOR V); +VOID XMStoreUByteN4(_Out_ XMUBYTEN4* pDestination, FXMVECTOR V); +VOID XMStoreUByte4(_Out_ XMUBYTE4* pDestination, FXMVECTOR V); +VOID XMStoreUNibble4(_Out_ XMUNIBBLE4* pDestination, FXMVECTOR V); +VOID XMStoreU555(_Out_ XMU555* pDestination, FXMVECTOR V); +VOID XMStoreColor(_Out_ XMCOLOR* pDestination, FXMVECTOR V); + +VOID XMStoreFloat3x3(_Out_ XMFLOAT3X3* pDestination, CXMMATRIX M); +VOID XMStoreFloat3x3NC(_Out_ XMFLOAT3X3* pDestination, CXMMATRIX M); +VOID XMStoreFloat4x3(_Out_ XMFLOAT4X3* pDestination, CXMMATRIX M); +VOID XMStoreFloat4x3A(_Out_ XMFLOAT4X3A* pDestination, CXMMATRIX M); +VOID XMStoreFloat4x3NC(_Out_ XMFLOAT4X3* pDestination, CXMMATRIX M); +VOID XMStoreFloat4x4(_Out_ XMFLOAT4X4* pDestination, CXMMATRIX M); +VOID XMStoreFloat4x4A(_Out_ XMFLOAT4X4A* pDestination, CXMMATRIX M); +VOID XMStoreFloat4x4NC(_Out_ XMFLOAT4X4* pDestination, CXMMATRIX M); + +/**************************************************************************** + * + * General vector operations + * + ****************************************************************************/ + +XMVECTOR XMVectorZero(); +XMVECTOR XMVectorSet(FLOAT x, FLOAT y, FLOAT z, FLOAT w); +XMVECTOR XMVectorSetInt(UINT x, UINT y, UINT z, UINT w); +XMVECTOR XMVectorReplicate(FLOAT Value); +XMVECTOR XMVectorReplicatePtr(_In_ CONST FLOAT *pValue); +XMVECTOR XMVectorReplicateInt(UINT Value); +XMVECTOR XMVectorReplicateIntPtr(_In_ CONST UINT *pValue); +XMVECTOR XMVectorTrueInt(); +XMVECTOR XMVectorFalseInt(); +XMVECTOR XMVectorSplatX(FXMVECTOR V); +XMVECTOR XMVectorSplatY(FXMVECTOR V); +XMVECTOR XMVectorSplatZ(FXMVECTOR V); +XMVECTOR XMVectorSplatW(FXMVECTOR V); +XMVECTOR XMVectorSplatOne(); +XMVECTOR XMVectorSplatInfinity(); +XMVECTOR XMVectorSplatQNaN(); +XMVECTOR XMVectorSplatEpsilon(); +XMVECTOR XMVectorSplatSignMask(); + +FLOAT XMVectorGetByIndex(FXMVECTOR V,UINT i); +FLOAT XMVectorGetX(FXMVECTOR V); +FLOAT XMVectorGetY(FXMVECTOR V); +FLOAT XMVectorGetZ(FXMVECTOR V); +FLOAT XMVectorGetW(FXMVECTOR V); + +VOID XMVectorGetByIndexPtr(_Out_ FLOAT *f, FXMVECTOR V, UINT i); +VOID XMVectorGetXPtr(_Out_ FLOAT *x, FXMVECTOR V); +VOID XMVectorGetYPtr(_Out_ FLOAT *y, FXMVECTOR V); +VOID XMVectorGetZPtr(_Out_ FLOAT *z, FXMVECTOR V); +VOID XMVectorGetWPtr(_Out_ FLOAT *w, FXMVECTOR V); + +UINT XMVectorGetIntByIndex(FXMVECTOR V,UINT i); 
+UINT XMVectorGetIntX(FXMVECTOR V); +UINT XMVectorGetIntY(FXMVECTOR V); +UINT XMVectorGetIntZ(FXMVECTOR V); +UINT XMVectorGetIntW(FXMVECTOR V); + +VOID XMVectorGetIntByIndexPtr(_Out_ UINT *x,FXMVECTOR V, UINT i); +VOID XMVectorGetIntXPtr(_Out_ UINT *x, FXMVECTOR V); +VOID XMVectorGetIntYPtr(_Out_ UINT *y, FXMVECTOR V); +VOID XMVectorGetIntZPtr(_Out_ UINT *z, FXMVECTOR V); +VOID XMVectorGetIntWPtr(_Out_ UINT *w, FXMVECTOR V); + +XMVECTOR XMVectorSetByIndex(FXMVECTOR V,FLOAT f,UINT i); +XMVECTOR XMVectorSetX(FXMVECTOR V, FLOAT x); +XMVECTOR XMVectorSetY(FXMVECTOR V, FLOAT y); +XMVECTOR XMVectorSetZ(FXMVECTOR V, FLOAT z); +XMVECTOR XMVectorSetW(FXMVECTOR V, FLOAT w); + +XMVECTOR XMVectorSetByIndexPtr(FXMVECTOR V, _In_ CONST FLOAT *f, UINT i); +XMVECTOR XMVectorSetXPtr(FXMVECTOR V, _In_ CONST FLOAT *x); +XMVECTOR XMVectorSetYPtr(FXMVECTOR V, _In_ CONST FLOAT *y); +XMVECTOR XMVectorSetZPtr(FXMVECTOR V, _In_ CONST FLOAT *z); +XMVECTOR XMVectorSetWPtr(FXMVECTOR V, _In_ CONST FLOAT *w); + +XMVECTOR XMVectorSetIntByIndex(FXMVECTOR V, UINT x,UINT i); +XMVECTOR XMVectorSetIntX(FXMVECTOR V, UINT x); +XMVECTOR XMVectorSetIntY(FXMVECTOR V, UINT y); +XMVECTOR XMVectorSetIntZ(FXMVECTOR V, UINT z); +XMVECTOR XMVectorSetIntW(FXMVECTOR V, UINT w); + +XMVECTOR XMVectorSetIntByIndexPtr(FXMVECTOR V, _In_ CONST UINT *x, UINT i); +XMVECTOR XMVectorSetIntXPtr(FXMVECTOR V, _In_ CONST UINT *x); +XMVECTOR XMVectorSetIntYPtr(FXMVECTOR V, _In_ CONST UINT *y); +XMVECTOR XMVectorSetIntZPtr(FXMVECTOR V, _In_ CONST UINT *z); +XMVECTOR XMVectorSetIntWPtr(FXMVECTOR V, _In_ CONST UINT *w); + +XMVECTOR XMVectorPermuteControl(UINT ElementIndex0, UINT ElementIndex1, UINT ElementIndex2, UINT ElementIndex3); +XMVECTOR XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Control); +XMVECTOR XMVectorSelectControl(UINT VectorIndex0, UINT VectorIndex1, UINT VectorIndex2, UINT VectorIndex3); +XMVECTOR XMVectorSelect(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Control); +XMVECTOR XMVectorMergeXY(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorMergeZW(FXMVECTOR V1, FXMVECTOR V2); + +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) +XMVECTOR XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, UINT Elements); +XMVECTOR XMVectorRotateLeft(FXMVECTOR V, UINT Elements); +XMVECTOR XMVectorRotateRight(FXMVECTOR V, UINT Elements); +XMVECTOR XMVectorSwizzle(FXMVECTOR V, UINT E0, UINT E1, UINT E2, UINT E3); +XMVECTOR XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, UINT VSLeftRotateElements, + UINT Select0, UINT Select1, UINT Select2, UINT Select3); + +// VMX128 versions defined below as macros +#endif + +XMVECTOR XMVectorEqual(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorEqualR(_Out_ UINT* pCR, FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorEqualInt(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorEqualIntR(_Out_ UINT* pCR, FXMVECTOR V, FXMVECTOR V2); +XMVECTOR XMVectorNearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon); +XMVECTOR XMVectorNotEqual(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorNotEqualInt(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorGreater(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorGreaterR(_Out_ UINT* pCR, FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorGreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorGreaterOrEqualR(_Out_ UINT* pCR, FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorLess(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorLessOrEqual(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorInBounds(FXMVECTOR V, FXMVECTOR Bounds); +XMVECTOR XMVectorInBoundsR(_Out_ UINT* pCR, FXMVECTOR V, FXMVECTOR Bounds); + 
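As a brief usage sketch against the select helpers declared above (illustrative only, not part of the imported header, and assuming the usual convention that a control index of 0 selects from the first vector and 1 from the second):

#include "xnamath.h"   // the header being imported under thirdparty/directxtex/XNAMath/

// Illustrative only: take x and z from V1, y and w from V2.
inline XMVECTOR BlendExample(FXMVECTOR V1, FXMVECTOR V2)
{
    XMVECTOR Control = XMVectorSelectControl(0, 1, 0, 1); // 0 = first vector, 1 = second
    return XMVectorSelect(V1, V2, Control);
}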
+XMVECTOR XMVectorIsNaN(FXMVECTOR V); +XMVECTOR XMVectorIsInfinite(FXMVECTOR V); + +XMVECTOR XMVectorMin(FXMVECTOR V1,FXMVECTOR V2); +XMVECTOR XMVectorMax(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorRound(FXMVECTOR V); +XMVECTOR XMVectorTruncate(FXMVECTOR V); +XMVECTOR XMVectorFloor(FXMVECTOR V); +XMVECTOR XMVectorCeiling(FXMVECTOR V); +XMVECTOR XMVectorClamp(FXMVECTOR V, FXMVECTOR Min, FXMVECTOR Max); +XMVECTOR XMVectorSaturate(FXMVECTOR V); + +XMVECTOR XMVectorAndInt(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorAndCInt(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorOrInt(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorNorInt(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorXorInt(FXMVECTOR V1, FXMVECTOR V2); + +XMVECTOR XMVectorNegate(FXMVECTOR V); +XMVECTOR XMVectorAdd(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorAddAngles(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorSubtract(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorSubtractAngles(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorMultiply(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorMultiplyAdd(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3); +XMVECTOR XMVectorDivide(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorNegativeMultiplySubtract(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3); +XMVECTOR XMVectorScale(FXMVECTOR V, FLOAT ScaleFactor); +XMVECTOR XMVectorReciprocalEst(FXMVECTOR V); +XMVECTOR XMVectorReciprocal(FXMVECTOR V); +XMVECTOR XMVectorSqrtEst(FXMVECTOR V); +XMVECTOR XMVectorSqrt(FXMVECTOR V); +XMVECTOR XMVectorReciprocalSqrtEst(FXMVECTOR V); +XMVECTOR XMVectorReciprocalSqrt(FXMVECTOR V); +XMVECTOR XMVectorExpEst(FXMVECTOR V); +XMVECTOR XMVectorExp(FXMVECTOR V); +XMVECTOR XMVectorLogEst(FXMVECTOR V); +XMVECTOR XMVectorLog(FXMVECTOR V); +XMVECTOR XMVectorPowEst(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorPow(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorAbs(FXMVECTOR V); +XMVECTOR XMVectorMod(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVectorModAngles(FXMVECTOR Angles); +XMVECTOR XMVectorSin(FXMVECTOR V); +XMVECTOR XMVectorSinEst(FXMVECTOR V); +XMVECTOR XMVectorCos(FXMVECTOR V); +XMVECTOR XMVectorCosEst(FXMVECTOR V); +VOID XMVectorSinCos(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos, FXMVECTOR V); +VOID XMVectorSinCosEst(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos, FXMVECTOR V); +XMVECTOR XMVectorTan(FXMVECTOR V); +XMVECTOR XMVectorTanEst(FXMVECTOR V); +XMVECTOR XMVectorSinH(FXMVECTOR V); +XMVECTOR XMVectorSinHEst(FXMVECTOR V); +XMVECTOR XMVectorCosH(FXMVECTOR V); +XMVECTOR XMVectorCosHEst(FXMVECTOR V); +XMVECTOR XMVectorTanH(FXMVECTOR V); +XMVECTOR XMVectorTanHEst(FXMVECTOR V); +XMVECTOR XMVectorASin(FXMVECTOR V); +XMVECTOR XMVectorASinEst(FXMVECTOR V); +XMVECTOR XMVectorACos(FXMVECTOR V); +XMVECTOR XMVectorACosEst(FXMVECTOR V); +XMVECTOR XMVectorATan(FXMVECTOR V); +XMVECTOR XMVectorATanEst(FXMVECTOR V); +XMVECTOR XMVectorATan2(FXMVECTOR Y, FXMVECTOR X); +XMVECTOR XMVectorATan2Est(FXMVECTOR Y, FXMVECTOR X); +XMVECTOR XMVectorLerp(FXMVECTOR V0, FXMVECTOR V1, FLOAT t); +XMVECTOR XMVectorLerpV(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR T); +XMVECTOR XMVectorHermite(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, CXMVECTOR Tangent1, FLOAT t); +XMVECTOR XMVectorHermiteV(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, CXMVECTOR Tangent1, CXMVECTOR T); +XMVECTOR XMVectorCatmullRom(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, CXMVECTOR Position3, FLOAT t); +XMVECTOR XMVectorCatmullRomV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, CXMVECTOR Position3, CXMVECTOR T); +XMVECTOR 
XMVectorBaryCentric(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, FLOAT f, FLOAT g); +XMVECTOR XMVectorBaryCentricV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, CXMVECTOR F, CXMVECTOR G); + +/**************************************************************************** + * + * 2D vector operations + * + ****************************************************************************/ + + +BOOL XMVector2Equal(FXMVECTOR V1, FXMVECTOR V2); +UINT XMVector2EqualR(FXMVECTOR V1, FXMVECTOR V2); +BOOL XMVector2EqualInt(FXMVECTOR V1, FXMVECTOR V2); +UINT XMVector2EqualIntR(FXMVECTOR V1, FXMVECTOR V2); +BOOL XMVector2NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon); +BOOL XMVector2NotEqual(FXMVECTOR V1, FXMVECTOR V2); +BOOL XMVector2NotEqualInt(FXMVECTOR V1, FXMVECTOR V2); +BOOL XMVector2Greater(FXMVECTOR V1, FXMVECTOR V2); +UINT XMVector2GreaterR(FXMVECTOR V1, FXMVECTOR V2); +BOOL XMVector2GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2); +UINT XMVector2GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2); +BOOL XMVector2Less(FXMVECTOR V1, FXMVECTOR V2); +BOOL XMVector2LessOrEqual(FXMVECTOR V1, FXMVECTOR V2); +BOOL XMVector2InBounds(FXMVECTOR V, FXMVECTOR Bounds); +UINT XMVector2InBoundsR(FXMVECTOR V, FXMVECTOR Bounds); + +BOOL XMVector2IsNaN(FXMVECTOR V); +BOOL XMVector2IsInfinite(FXMVECTOR V); + +XMVECTOR XMVector2Dot(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVector2Cross(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVector2LengthSq(FXMVECTOR V); +XMVECTOR XMVector2ReciprocalLengthEst(FXMVECTOR V); +XMVECTOR XMVector2ReciprocalLength(FXMVECTOR V); +XMVECTOR XMVector2LengthEst(FXMVECTOR V); +XMVECTOR XMVector2Length(FXMVECTOR V); +XMVECTOR XMVector2NormalizeEst(FXMVECTOR V); +XMVECTOR XMVector2Normalize(FXMVECTOR V); +XMVECTOR XMVector2ClampLength(FXMVECTOR V, FLOAT LengthMin, FLOAT LengthMax); +XMVECTOR XMVector2ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax); +XMVECTOR XMVector2Reflect(FXMVECTOR Incident, FXMVECTOR Normal); +XMVECTOR XMVector2Refract(FXMVECTOR Incident, FXMVECTOR Normal, FLOAT RefractionIndex); +XMVECTOR XMVector2RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex); +XMVECTOR XMVector2Orthogonal(FXMVECTOR V); +XMVECTOR XMVector2AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2); +XMVECTOR XMVector2AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2); +XMVECTOR XMVector2AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVector2LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point); +XMVECTOR XMVector2IntersectLine(FXMVECTOR Line1Point1, FXMVECTOR Line1Point2, FXMVECTOR Line2Point1, CXMVECTOR Line2Point2); +XMVECTOR XMVector2Transform(FXMVECTOR V, CXMMATRIX M); +XMFLOAT4* XMVector2TransformStream(_Out_bytecap_x_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_bytecount_x_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) CONST XMFLOAT2* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, CXMMATRIX M); +XMFLOAT4* XMVector2TransformStreamNC(_Out_bytecap_x_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_bytecount_x_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) CONST XMFLOAT2* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, CXMMATRIX M); +XMVECTOR XMVector2TransformCoord(FXMVECTOR V, CXMMATRIX M); +XMFLOAT2* XMVector2TransformCoordStream(_Out_bytecap_x_(sizeof(XMFLOAT2)+OutputStride*(VectorCount-1)) XMFLOAT2* pOutputStream, + _In_ size_t OutputStride, + 
_In_bytecount_x_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) CONST XMFLOAT2* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, CXMMATRIX M); +XMVECTOR XMVector2TransformNormal(FXMVECTOR V, CXMMATRIX M); +XMFLOAT2* XMVector2TransformNormalStream(_Out_bytecap_x_(sizeof(XMFLOAT2)+OutputStride*(VectorCount-1)) XMFLOAT2* pOutputStream, + _In_ size_t OutputStride, + _In_bytecount_x_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) CONST XMFLOAT2* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, CXMMATRIX M); + +/**************************************************************************** + * + * 3D vector operations + * + ****************************************************************************/ + + +BOOL XMVector3Equal(FXMVECTOR V1, FXMVECTOR V2); +UINT XMVector3EqualR(FXMVECTOR V1, FXMVECTOR V2); +BOOL XMVector3EqualInt(FXMVECTOR V1, FXMVECTOR V2); +UINT XMVector3EqualIntR(FXMVECTOR V1, FXMVECTOR V2); +BOOL XMVector3NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon); +BOOL XMVector3NotEqual(FXMVECTOR V1, FXMVECTOR V2); +BOOL XMVector3NotEqualInt(FXMVECTOR V1, FXMVECTOR V2); +BOOL XMVector3Greater(FXMVECTOR V1, FXMVECTOR V2); +UINT XMVector3GreaterR(FXMVECTOR V1, FXMVECTOR V2); +BOOL XMVector3GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2); +UINT XMVector3GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2); +BOOL XMVector3Less(FXMVECTOR V1, FXMVECTOR V2); +BOOL XMVector3LessOrEqual(FXMVECTOR V1, FXMVECTOR V2); +BOOL XMVector3InBounds(FXMVECTOR V, FXMVECTOR Bounds); +UINT XMVector3InBoundsR(FXMVECTOR V, FXMVECTOR Bounds); + +BOOL XMVector3IsNaN(FXMVECTOR V); +BOOL XMVector3IsInfinite(FXMVECTOR V); + +XMVECTOR XMVector3Dot(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVector3Cross(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVector3LengthSq(FXMVECTOR V); +XMVECTOR XMVector3ReciprocalLengthEst(FXMVECTOR V); +XMVECTOR XMVector3ReciprocalLength(FXMVECTOR V); +XMVECTOR XMVector3LengthEst(FXMVECTOR V); +XMVECTOR XMVector3Length(FXMVECTOR V); +XMVECTOR XMVector3NormalizeEst(FXMVECTOR V); +XMVECTOR XMVector3Normalize(FXMVECTOR V); +XMVECTOR XMVector3ClampLength(FXMVECTOR V, FLOAT LengthMin, FLOAT LengthMax); +XMVECTOR XMVector3ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax); +XMVECTOR XMVector3Reflect(FXMVECTOR Incident, FXMVECTOR Normal); +XMVECTOR XMVector3Refract(FXMVECTOR Incident, FXMVECTOR Normal, FLOAT RefractionIndex); +XMVECTOR XMVector3RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex); +XMVECTOR XMVector3Orthogonal(FXMVECTOR V); +XMVECTOR XMVector3AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2); +XMVECTOR XMVector3AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2); +XMVECTOR XMVector3AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVector3LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point); +VOID XMVector3ComponentsFromNormal(_Out_ XMVECTOR* pParallel, _Out_ XMVECTOR* pPerpendicular, FXMVECTOR V, FXMVECTOR Normal); +XMVECTOR XMVector3Rotate(FXMVECTOR V, FXMVECTOR RotationQuaternion); +XMVECTOR XMVector3InverseRotate(FXMVECTOR V, FXMVECTOR RotationQuaternion); +XMVECTOR XMVector3Transform(FXMVECTOR V, CXMMATRIX M); +XMFLOAT4* XMVector3TransformStream(_Out_bytecap_x_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_bytecount_x_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) CONST XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, CXMMATRIX M); +XMFLOAT4* 
XMVector3TransformStreamNC(_Out_bytecap_x_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_bytecount_x_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) CONST XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, CXMMATRIX M); +XMVECTOR XMVector3TransformCoord(FXMVECTOR V, CXMMATRIX M); +XMFLOAT3* XMVector3TransformCoordStream(_Out_bytecap_x_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_bytecount_x_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) CONST XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, CXMMATRIX M); +XMVECTOR XMVector3TransformNormal(FXMVECTOR V, CXMMATRIX M); +XMFLOAT3* XMVector3TransformNormalStream(_Out_bytecap_x_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_bytecount_x_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) CONST XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, CXMMATRIX M); +XMVECTOR XMVector3Project(FXMVECTOR V, FLOAT ViewportX, FLOAT ViewportY, FLOAT ViewportWidth, FLOAT ViewportHeight, FLOAT ViewportMinZ, FLOAT ViewportMaxZ, + CXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World); +XMFLOAT3* XMVector3ProjectStream(_Out_bytecap_x_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_bytecount_x_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) CONST XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, + FLOAT ViewportX, FLOAT ViewportY, FLOAT ViewportWidth, FLOAT ViewportHeight, FLOAT ViewportMinZ, FLOAT ViewportMaxZ, + CXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World); +XMVECTOR XMVector3Unproject(FXMVECTOR V, FLOAT ViewportX, FLOAT ViewportY, FLOAT ViewportWidth, FLOAT ViewportHeight, FLOAT ViewportMinZ, FLOAT ViewportMaxZ, + CXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World); +XMFLOAT3* XMVector3UnprojectStream(_Out_bytecap_x_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_bytecount_x_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) CONST XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, + FLOAT ViewportX, FLOAT ViewportY, FLOAT ViewportWidth, FLOAT ViewportHeight, FLOAT ViewportMinZ, FLOAT ViewportMaxZ, + CXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World); + +/**************************************************************************** + * + * 4D vector operations + * + ****************************************************************************/ + +BOOL XMVector4Equal(FXMVECTOR V1, FXMVECTOR V2); +UINT XMVector4EqualR(FXMVECTOR V1, FXMVECTOR V2); +BOOL XMVector4EqualInt(FXMVECTOR V1, FXMVECTOR V2); +UINT XMVector4EqualIntR(FXMVECTOR V1, FXMVECTOR V2); +BOOL XMVector4NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon); +BOOL XMVector4NotEqual(FXMVECTOR V1, FXMVECTOR V2); +BOOL XMVector4NotEqualInt(FXMVECTOR V1, FXMVECTOR V2); +BOOL XMVector4Greater(FXMVECTOR V1, FXMVECTOR V2); +UINT XMVector4GreaterR(FXMVECTOR V1, FXMVECTOR V2); +BOOL XMVector4GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2); +UINT XMVector4GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2); +BOOL XMVector4Less(FXMVECTOR V1, FXMVECTOR V2); +BOOL XMVector4LessOrEqual(FXMVECTOR V1, FXMVECTOR V2); +BOOL XMVector4InBounds(FXMVECTOR V, FXMVECTOR Bounds); +UINT XMVector4InBoundsR(FXMVECTOR V, FXMVECTOR Bounds); + +BOOL XMVector4IsNaN(FXMVECTOR V); +BOOL XMVector4IsInfinite(FXMVECTOR V); + 
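A short usage sketch for the stream variants declared above (illustrative only, not part of the imported header); it assumes tightly packed input and output arrays, so both strides are simply sizeof(XMFLOAT3):

#include "xnamath.h"

// Illustrative only: transform 'count' points by M into a separate output
// array, following the (output, output stride, input, input stride, count,
// matrix) calling convention shown in the declarations above.
inline void TransformCoordArray(XMFLOAT3* out, const XMFLOAT3* in,
                                size_t count, CXMMATRIX M)
{
    XMVector3TransformCoordStream(out, sizeof(XMFLOAT3),
                                  in, sizeof(XMFLOAT3),
                                  count, M);
}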
+XMVECTOR XMVector4Dot(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVector4Cross(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3); +XMVECTOR XMVector4LengthSq(FXMVECTOR V); +XMVECTOR XMVector4ReciprocalLengthEst(FXMVECTOR V); +XMVECTOR XMVector4ReciprocalLength(FXMVECTOR V); +XMVECTOR XMVector4LengthEst(FXMVECTOR V); +XMVECTOR XMVector4Length(FXMVECTOR V); +XMVECTOR XMVector4NormalizeEst(FXMVECTOR V); +XMVECTOR XMVector4Normalize(FXMVECTOR V); +XMVECTOR XMVector4ClampLength(FXMVECTOR V, FLOAT LengthMin, FLOAT LengthMax); +XMVECTOR XMVector4ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax); +XMVECTOR XMVector4Reflect(FXMVECTOR Incident, FXMVECTOR Normal); +XMVECTOR XMVector4Refract(FXMVECTOR Incident, FXMVECTOR Normal, FLOAT RefractionIndex); +XMVECTOR XMVector4RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex); +XMVECTOR XMVector4Orthogonal(FXMVECTOR V); +XMVECTOR XMVector4AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2); +XMVECTOR XMVector4AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2); +XMVECTOR XMVector4AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XMVector4Transform(FXMVECTOR V, CXMMATRIX M); +XMFLOAT4* XMVector4TransformStream(_Out_bytecap_x_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_bytecount_x_(sizeof(XMFLOAT4)+InputStride*(VectorCount-1)) CONST XMFLOAT4* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, CXMMATRIX M); + +/**************************************************************************** + * + * Matrix operations + * + ****************************************************************************/ + +BOOL XMMatrixIsNaN(CXMMATRIX M); +BOOL XMMatrixIsInfinite(CXMMATRIX M); +BOOL XMMatrixIsIdentity(CXMMATRIX M); + +XMMATRIX XMMatrixMultiply(CXMMATRIX M1, CXMMATRIX M2); +XMMATRIX XMMatrixMultiplyTranspose(CXMMATRIX M1, CXMMATRIX M2); +XMMATRIX XMMatrixTranspose(CXMMATRIX M); +XMMATRIX XMMatrixInverse(_Out_opt_ XMVECTOR* pDeterminant, CXMMATRIX M); +XMVECTOR XMMatrixDeterminant(CXMMATRIX M); +BOOL XMMatrixDecompose(_Out_ XMVECTOR *outScale, _Out_ XMVECTOR *outRotQuat, _Out_ XMVECTOR *outTrans, CXMMATRIX M); + +XMMATRIX XMMatrixIdentity(); +XMMATRIX XMMatrixSet(FLOAT m00, FLOAT m01, FLOAT m02, FLOAT m03, + FLOAT m10, FLOAT m11, FLOAT m12, FLOAT m13, + FLOAT m20, FLOAT m21, FLOAT m22, FLOAT m23, + FLOAT m30, FLOAT m31, FLOAT m32, FLOAT m33); +XMMATRIX XMMatrixTranslation(FLOAT OffsetX, FLOAT OffsetY, FLOAT OffsetZ); +XMMATRIX XMMatrixTranslationFromVector(FXMVECTOR Offset); +XMMATRIX XMMatrixScaling(FLOAT ScaleX, FLOAT ScaleY, FLOAT ScaleZ); +XMMATRIX XMMatrixScalingFromVector(FXMVECTOR Scale); +XMMATRIX XMMatrixRotationX(FLOAT Angle); +XMMATRIX XMMatrixRotationY(FLOAT Angle); +XMMATRIX XMMatrixRotationZ(FLOAT Angle); +XMMATRIX XMMatrixRotationRollPitchYaw(FLOAT Pitch, FLOAT Yaw, FLOAT Roll); +XMMATRIX XMMatrixRotationRollPitchYawFromVector(FXMVECTOR Angles); +XMMATRIX XMMatrixRotationNormal(FXMVECTOR NormalAxis, FLOAT Angle); +XMMATRIX XMMatrixRotationAxis(FXMVECTOR Axis, FLOAT Angle); +XMMATRIX XMMatrixRotationQuaternion(FXMVECTOR Quaternion); +XMMATRIX XMMatrixTransformation2D(FXMVECTOR ScalingOrigin, FLOAT ScalingOrientation, FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, FLOAT Rotation, CXMVECTOR Translation); +XMMATRIX XMMatrixTransformation(FXMVECTOR ScalingOrigin, FXMVECTOR ScalingOrientationQuaternion, FXMVECTOR Scaling, + CXMVECTOR RotationOrigin, CXMVECTOR RotationQuaternion, CXMVECTOR Translation); +XMMATRIX 
XMMatrixAffineTransformation2D(FXMVECTOR Scaling, FXMVECTOR RotationOrigin, FLOAT Rotation, FXMVECTOR Translation); +XMMATRIX XMMatrixAffineTransformation(FXMVECTOR Scaling, FXMVECTOR RotationOrigin, FXMVECTOR RotationQuaternion, CXMVECTOR Translation); +XMMATRIX XMMatrixReflect(FXMVECTOR ReflectionPlane); +XMMATRIX XMMatrixShadow(FXMVECTOR ShadowPlane, FXMVECTOR LightPosition); + +XMMATRIX XMMatrixLookAtLH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, FXMVECTOR UpDirection); +XMMATRIX XMMatrixLookAtRH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, FXMVECTOR UpDirection); +XMMATRIX XMMatrixLookToLH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection); +XMMATRIX XMMatrixLookToRH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection); +XMMATRIX XMMatrixPerspectiveLH(FLOAT ViewWidth, FLOAT ViewHeight, FLOAT NearZ, FLOAT FarZ); +XMMATRIX XMMatrixPerspectiveRH(FLOAT ViewWidth, FLOAT ViewHeight, FLOAT NearZ, FLOAT FarZ); +XMMATRIX XMMatrixPerspectiveFovLH(FLOAT FovAngleY, FLOAT AspectHByW, FLOAT NearZ, FLOAT FarZ); +XMMATRIX XMMatrixPerspectiveFovRH(FLOAT FovAngleY, FLOAT AspectHByW, FLOAT NearZ, FLOAT FarZ); +XMMATRIX XMMatrixPerspectiveOffCenterLH(FLOAT ViewLeft, FLOAT ViewRight, FLOAT ViewBottom, FLOAT ViewTop, FLOAT NearZ, FLOAT FarZ); +XMMATRIX XMMatrixPerspectiveOffCenterRH(FLOAT ViewLeft, FLOAT ViewRight, FLOAT ViewBottom, FLOAT ViewTop, FLOAT NearZ, FLOAT FarZ); +XMMATRIX XMMatrixOrthographicLH(FLOAT ViewWidth, FLOAT ViewHeight, FLOAT NearZ, FLOAT FarZ); +XMMATRIX XMMatrixOrthographicRH(FLOAT ViewWidth, FLOAT ViewHeight, FLOAT NearZ, FLOAT FarZ); +XMMATRIX XMMatrixOrthographicOffCenterLH(FLOAT ViewLeft, FLOAT ViewRight, FLOAT ViewBottom, FLOAT ViewTop, FLOAT NearZ, FLOAT FarZ); +XMMATRIX XMMatrixOrthographicOffCenterRH(FLOAT ViewLeft, FLOAT ViewRight, FLOAT ViewBottom, FLOAT ViewTop, FLOAT NearZ, FLOAT FarZ); + + +/**************************************************************************** + * + * Quaternion operations + * + ****************************************************************************/ + +BOOL XMQuaternionEqual(FXMVECTOR Q1, FXMVECTOR Q2); +BOOL XMQuaternionNotEqual(FXMVECTOR Q1, FXMVECTOR Q2); + +BOOL XMQuaternionIsNaN(FXMVECTOR Q); +BOOL XMQuaternionIsInfinite(FXMVECTOR Q); +BOOL XMQuaternionIsIdentity(FXMVECTOR Q); + +XMVECTOR XMQuaternionDot(FXMVECTOR Q1, FXMVECTOR Q2); +XMVECTOR XMQuaternionMultiply(FXMVECTOR Q1, FXMVECTOR Q2); +XMVECTOR XMQuaternionLengthSq(FXMVECTOR Q); +XMVECTOR XMQuaternionReciprocalLength(FXMVECTOR Q); +XMVECTOR XMQuaternionLength(FXMVECTOR Q); +XMVECTOR XMQuaternionNormalizeEst(FXMVECTOR Q); +XMVECTOR XMQuaternionNormalize(FXMVECTOR Q); +XMVECTOR XMQuaternionConjugate(FXMVECTOR Q); +XMVECTOR XMQuaternionInverse(FXMVECTOR Q); +XMVECTOR XMQuaternionLn(FXMVECTOR Q); +XMVECTOR XMQuaternionExp(FXMVECTOR Q); +XMVECTOR XMQuaternionSlerp(FXMVECTOR Q0, FXMVECTOR Q1, FLOAT t); +XMVECTOR XMQuaternionSlerpV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR T); +XMVECTOR XMQuaternionSquad(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, CXMVECTOR Q3, FLOAT t); +XMVECTOR XMQuaternionSquadV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, CXMVECTOR Q3, CXMVECTOR T); +VOID XMQuaternionSquadSetup(_Out_ XMVECTOR* pA, _Out_ XMVECTOR* pB, _Out_ XMVECTOR* pC, FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, CXMVECTOR Q3); +XMVECTOR XMQuaternionBaryCentric(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, FLOAT f, FLOAT g); +XMVECTOR XMQuaternionBaryCentricV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, CXMVECTOR F, CXMVECTOR G); + +XMVECTOR XMQuaternionIdentity(); +XMVECTOR 
XMQuaternionRotationRollPitchYaw(FLOAT Pitch, FLOAT Yaw, FLOAT Roll); +XMVECTOR XMQuaternionRotationRollPitchYawFromVector(FXMVECTOR Angles); +XMVECTOR XMQuaternionRotationNormal(FXMVECTOR NormalAxis, FLOAT Angle); +XMVECTOR XMQuaternionRotationAxis(FXMVECTOR Axis, FLOAT Angle); +XMVECTOR XMQuaternionRotationMatrix(CXMMATRIX M); + +VOID XMQuaternionToAxisAngle(_Out_ XMVECTOR* pAxis, _Out_ FLOAT* pAngle, FXMVECTOR Q); + +/**************************************************************************** + * + * Plane operations + * + ****************************************************************************/ + +BOOL XMPlaneEqual(FXMVECTOR P1, FXMVECTOR P2); +BOOL XMPlaneNearEqual(FXMVECTOR P1, FXMVECTOR P2, FXMVECTOR Epsilon); +BOOL XMPlaneNotEqual(FXMVECTOR P1, FXMVECTOR P2); + +BOOL XMPlaneIsNaN(FXMVECTOR P); +BOOL XMPlaneIsInfinite(FXMVECTOR P); + +XMVECTOR XMPlaneDot(FXMVECTOR P, FXMVECTOR V); +XMVECTOR XMPlaneDotCoord(FXMVECTOR P, FXMVECTOR V); +XMVECTOR XMPlaneDotNormal(FXMVECTOR P, FXMVECTOR V); +XMVECTOR XMPlaneNormalizeEst(FXMVECTOR P); +XMVECTOR XMPlaneNormalize(FXMVECTOR P); +XMVECTOR XMPlaneIntersectLine(FXMVECTOR P, FXMVECTOR LinePoint1, FXMVECTOR LinePoint2); +VOID XMPlaneIntersectPlane(_Out_ XMVECTOR* pLinePoint1, _Out_ XMVECTOR* pLinePoint2, FXMVECTOR P1, FXMVECTOR P2); +XMVECTOR XMPlaneTransform(FXMVECTOR P, CXMMATRIX M); +XMFLOAT4* XMPlaneTransformStream(_Out_bytecap_x_(sizeof(XMFLOAT4)+OutputStride*(PlaneCount-1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_bytecount_x_(sizeof(XMFLOAT4)+InputStride*(PlaneCount-1)) CONST XMFLOAT4* pInputStream, + _In_ size_t InputStride, _In_ size_t PlaneCount, CXMMATRIX M); + +XMVECTOR XMPlaneFromPointNormal(FXMVECTOR Point, FXMVECTOR Normal); +XMVECTOR XMPlaneFromPoints(FXMVECTOR Point1, FXMVECTOR Point2, FXMVECTOR Point3); + +/**************************************************************************** + * + * Color operations + * + ****************************************************************************/ + +BOOL XMColorEqual(FXMVECTOR C1, FXMVECTOR C2); +BOOL XMColorNotEqual(FXMVECTOR C1, FXMVECTOR C2); +BOOL XMColorGreater(FXMVECTOR C1, FXMVECTOR C2); +BOOL XMColorGreaterOrEqual(FXMVECTOR C1, FXMVECTOR C2); +BOOL XMColorLess(FXMVECTOR C1, FXMVECTOR C2); +BOOL XMColorLessOrEqual(FXMVECTOR C1, FXMVECTOR C2); + +BOOL XMColorIsNaN(FXMVECTOR C); +BOOL XMColorIsInfinite(FXMVECTOR C); + +XMVECTOR XMColorNegative(FXMVECTOR C); +XMVECTOR XMColorModulate(FXMVECTOR C1, FXMVECTOR C2); +XMVECTOR XMColorAdjustSaturation(FXMVECTOR C, FLOAT Saturation); +XMVECTOR XMColorAdjustContrast(FXMVECTOR C, FLOAT Contrast); + +/**************************************************************************** + * + * Miscellaneous operations + * + ****************************************************************************/ + +BOOL XMVerifyCPUSupport(); + +VOID XMAssert(_In_z_ CONST CHAR* pExpression, _In_z_ CONST CHAR* pFileName, UINT LineNumber); + +XMVECTOR XMFresnelTerm(FXMVECTOR CosIncidentAngle, FXMVECTOR RefractionIndex); + +BOOL XMScalarNearEqual(FLOAT S1, FLOAT S2, FLOAT Epsilon); +FLOAT XMScalarModAngle(FLOAT Value); +FLOAT XMScalarSin(FLOAT Value); +FLOAT XMScalarCos(FLOAT Value); +VOID XMScalarSinCos(_Out_ FLOAT* pSin, _Out_ FLOAT* pCos, FLOAT Value); +FLOAT XMScalarASin(FLOAT Value); +FLOAT XMScalarACos(FLOAT Value); +FLOAT XMScalarSinEst(FLOAT Value); +FLOAT XMScalarCosEst(FLOAT Value); +VOID XMScalarSinCosEst(_Out_ FLOAT* pSin, _Out_ FLOAT* pCos, FLOAT Value); +FLOAT XMScalarASinEst(FLOAT Value); +FLOAT XMScalarACosEst(FLOAT Value); + 
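As a usage sketch only (not part of the imported xnamath.h or DirectXTex; the helper name and literal values below are hypothetical), the look-at, perspective, multiply, and transform prototypes declared above compose in the usual way:

    // Illustrative sketch, not part of XNAMath or DirectXTex.
    inline XMVECTOR TransformPointToClipSpace(FXMVECTOR Point)
    {
        // Camera at (0, 2, -5) looking at the origin, +Y up.
        XMMATRIX View = XMMatrixLookAtLH(XMVectorSet(0.0f, 2.0f, -5.0f, 0.0f),
                                         XMVectorSet(0.0f, 0.0f,  0.0f, 0.0f),
                                         XMVectorSet(0.0f, 1.0f,  0.0f, 0.0f));
        // 90-degree vertical field of view, square aspect ratio, near/far planes.
        XMMATRIX Proj = XMMatrixPerspectiveFovLH(XM_PIDIV2, 1.0f, 0.1f, 100.0f);
        // XMVector3TransformCoord treats w as 1 and divides the result by the projected w.
        return XMVector3TransformCoord(Point, XMMatrixMultiply(View, Proj));
    }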
+
+/****************************************************************************
+ *
+ * Templates
+ *
+ ****************************************************************************/
+
+#if defined(__cplusplus)
+
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+
+// PermuteHelper internal template (SSE only)
+namespace XNAMathInternal
+{
+    // Slow path fallback for permutes that do not map to a single SSE shuffle opcode.
+    template<UINT Shuffle, bool WhichX, bool WhichY, bool WhichZ, bool WhichW> struct PermuteHelper
+    {
+        static XMVECTOR Permute(FXMVECTOR v1, FXMVECTOR v2)
+        {
+            static const XMVECTORU32 selectMask =
+            {
+                WhichX ? 0xFFFFFFFF : 0,
+                WhichY ? 0xFFFFFFFF : 0,
+                WhichZ ? 0xFFFFFFFF : 0,
+                WhichW ? 0xFFFFFFFF : 0,
+            };
+
+            XMVECTOR shuffled1 = _mm_shuffle_ps(v1, v1, Shuffle);
+            XMVECTOR shuffled2 = _mm_shuffle_ps(v2, v2, Shuffle);
+
+            XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1);
+            XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2);
+
+            return _mm_or_ps(masked1, masked2);
+        }
+    };
+
+    // Fast path for permutes that only read from the first vector.
+    template<UINT Shuffle> struct PermuteHelper<Shuffle, false, false, false, false>
+    {
+        static XMVECTOR Permute(FXMVECTOR v1, FXMVECTOR v2) { (v2); return _mm_shuffle_ps(v1, v1, Shuffle); }
+    };
+
+    // Fast path for permutes that only read from the second vector.
+    template<UINT Shuffle> struct PermuteHelper<Shuffle, true, true, true, true>
+    {
+        static XMVECTOR Permute(FXMVECTOR v1, FXMVECTOR v2){ (v1); return _mm_shuffle_ps(v2, v2, Shuffle); }
+    };
+
+    // Fast path for permutes that read XY from the first vector, ZW from the second.
+    template<UINT Shuffle> struct PermuteHelper<Shuffle, false, false, true, true>
+    {
+        static XMVECTOR Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v1, v2, Shuffle); }
+    };
+
+    // Fast path for permutes that read XY from the second vector, ZW from the first.
+    template<UINT Shuffle> struct PermuteHelper<Shuffle, true, true, false, false>
+    {
+        static XMVECTOR Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v2, v1, Shuffle); }
+    };
+};
+
+#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_
+
+// General permute template
+template<UINT PermuteX, UINT PermuteY, UINT PermuteZ, UINT PermuteW>
+    inline XMVECTOR XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2)
+{
+#if defined(_MSC_VER) && (_MSC_VER >= 1600)
+    static_assert(PermuteX <= 7, "PermuteX template parameter out of range");
+    static_assert(PermuteY <= 7, "PermuteY template parameter out of range");
+    static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range");
+    static_assert(PermuteW <= 7, "PermuteW template parameter out of range");
+#else
+    XMASSERT(PermuteX <= 7);
+    XMASSERT(PermuteY <= 7);
+    XMASSERT(PermuteZ <= 7);
+    XMASSERT(PermuteW <= 7);
+#endif
+
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    const UINT Shuffle = _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3);
+
+    const bool WhichX = PermuteX > 3;
+    const bool WhichY = PermuteY > 3;
+    const bool WhichZ = PermuteZ > 3;
+    const bool WhichW = PermuteW > 3;
+
+    return XNAMathInternal::PermuteHelper<Shuffle, WhichX, WhichY, WhichZ, WhichW>::Permute(V1, V2);
+#else
+
+    XMVECTOR c = XMVectorPermuteControl( PermuteX, PermuteY, PermuteZ, PermuteW );
+    return XMVectorPermute( V1, V2, c );
+
+#endif
+}
+
+// Special-case permute templates
+template<> inline XMVECTOR XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { (V2); return V1; }
+template<> inline XMVECTOR XMVectorPermute<4,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { (V1); return V2; }
+
+//------------------------------------------------------------------------------
+
+// General swizzle template
+template<UINT SwizzleX, UINT SwizzleY, UINT SwizzleZ, UINT SwizzleW>
+    inline XMVECTOR XMVectorSwizzle(FXMVECTOR V)
+{
+#if defined(_MSC_VER) && (_MSC_VER >= 1600)
+    static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
+    static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
+    static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
+    static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");
+#else
+    XMASSERT(SwizzleX <= 3);
+    XMASSERT(SwizzleY <= 3);
+    XMASSERT(SwizzleZ <= 3);
+    XMASSERT(SwizzleW <= 3);
+#endif
+
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    return _mm_shuffle_ps( V, V, _MM_SHUFFLE( SwizzleW, SwizzleZ, SwizzleY, SwizzleX ) );
+#elif defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    return __vpermwi(V, ((SwizzleX & 3) << 6) | ((SwizzleY & 3) << 4) | ((SwizzleZ & 3) << 2) | (SwizzleW & 3) );
+#else
+
+    return XMVectorSwizzle( V, SwizzleX, SwizzleY, SwizzleZ, SwizzleW );
+
+#endif
+}
+
+// Specialized swizzles
+template<> inline XMVECTOR XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; }
+
+//------------------------------------------------------------------------------
+
+template<UINT Elements>
+    inline XMVECTOR XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2)
+{
+#if defined(_MSC_VER) && (_MSC_VER >= 1600)
+    static_assert( Elements < 4, "Elements template parameter out of range" );
+#else
+    XMASSERT( Elements < 4 );
+#endif
+
+#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+#else
+    return XMVectorPermute<Elements, (Elements + 1), (Elements + 2), (Elements + 3)>(V1, V2);
+#endif
+}
+
+template<UINT Elements>
+    inline XMVECTOR XMVectorRotateLeft(FXMVECTOR V)
+{
+#if defined(_MSC_VER) && (_MSC_VER >= 1600)
+    static_assert( Elements < 4, "Elements template parameter out of range" );
+#else
+    XMASSERT( Elements < 4 );
+#endif
+
+#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+#else
+    return XMVectorSwizzle<Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3>(V);
+#endif
+}
+
+template<UINT Elements>
+    inline XMVECTOR XMVectorRotateRight(FXMVECTOR V)
+{
+#if defined(_MSC_VER) && (_MSC_VER >= 1600)
+    static_assert( Elements < 4, "Elements template parameter out of range" );
+#else
+    XMASSERT( Elements < 4 );
+#endif
+
+#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+#else
+    return XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3, (6 - Elements) & 3, (7 - Elements) & 3>(V);
+#endif
+}
+
+template<UINT VSLeftRotateElements, UINT Select0, UINT Select1, UINT Select2, UINT Select3>
+    inline XMVECTOR XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS)
+{
+#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+#else
+    XMVECTOR Control = XMVectorSelectControl(Select0&1, Select1&1, Select2&1, Select3&1);
+    return XMVectorSelect( VD, XMVectorRotateLeft<VSLeftRotateElements>(VS), Control );
+#endif
+}
+
+#endif // __cplusplus
+
+/****************************************************************************
+ *
+ * Globals
+ *
+ ****************************************************************************/
+
+// The purpose of the following global constants is to prevent redundant
+// reloading of the constants when they are referenced by more than one
+// separate inline math routine called within the same function. Declaring
+// a constant locally within a routine is sufficient to prevent redundant
+// reloads of that constant when that single routine is called multiple
+// times in a function, but if the constant is used (and declared) in a
+// separate math routine it would be reloaded.
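To make the comment above concrete, here is a minimal sketch of the pattern it describes (illustrative only; g_IllustrativeHalf and the two helpers are hypothetical and not part of xnamath.h). The constant is published through the XMGLOBALCONST macro defined immediately below, so it has external, selectany linkage and every inline routine references the same object:

    // Illustrative sketch, not part of XNAMath or DirectXTex.
    XMGLOBALCONST XMVECTORF32 g_IllustrativeHalf = {0.5f, 0.5f, 0.5f, 0.5f};

    XMFINLINE XMVECTOR HalveA(FXMVECTOR V) { return XMVectorMultiply(V, g_IllustrativeHalf); }
    XMFINLINE XMVECTOR HalveB(FXMVECTOR V) { return XMVectorMultiply(V, g_IllustrativeHalf); }

    // When HalveA and HalveB are both inlined into the same caller, the compiler
    // can load g_IllustrativeHalf once and keep it in a register; two routine-local
    // constants would each be reloaded, which is the situation described above.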
+ +#define XMGLOBALCONST extern CONST __declspec(selectany) + +XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients0 = {1.0f, -0.166666667f, 8.333333333e-3f, -1.984126984e-4f}; +XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients1 = {2.755731922e-6f, -2.505210839e-8f, 1.605904384e-10f, -7.647163732e-13f}; +XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients2 = {2.811457254e-15f, -8.220635247e-18f, 1.957294106e-20f, -3.868170171e-23f}; +XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients0 = {1.0f, -0.5f, 4.166666667e-2f, -1.388888889e-3f}; +XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients1 = {2.480158730e-5f, -2.755731922e-7f, 2.087675699e-9f, -1.147074560e-11f}; +XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients2 = {4.779477332e-14f, -1.561920697e-16f, 4.110317623e-19f, -8.896791392e-22f}; +XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients0 = {1.0f, 0.333333333f, 0.133333333f, 5.396825397e-2f}; +XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients1 = {2.186948854e-2f, 8.863235530e-3f, 3.592128167e-3f, 1.455834485e-3f}; +XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients2 = {5.900274264e-4f, 2.391290764e-4f, 9.691537707e-5f, 3.927832950e-5f}; +XMGLOBALCONST XMVECTORF32 g_XMASinCoefficients0 = {-0.05806367563904f, -0.41861972469416f, 0.22480114791621f, 2.17337241360606f}; +XMGLOBALCONST XMVECTORF32 g_XMASinCoefficients1 = {0.61657275907170f, 4.29696498283455f, -1.18942822255452f, -6.53784832094831f}; +XMGLOBALCONST XMVECTORF32 g_XMASinCoefficients2 = {-1.36926553863413f, -4.48179294237210f, 1.41810672941833f, 5.48179257935713f}; +XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients0 = {1.0f, 0.333333334f, 0.2f, 0.142857143f}; +XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients1 = {1.111111111e-1f, 9.090909091e-2f, 7.692307692e-2f, 6.666666667e-2f}; +XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients2 = {5.882352941e-2f, 5.263157895e-2f, 4.761904762e-2f, 4.347826087e-2f}; +XMGLOBALCONST XMVECTORF32 g_XMSinEstCoefficients = {1.0f, -1.66521856991541e-1f, 8.199913018755e-3f, -1.61475937228e-4f}; +XMGLOBALCONST XMVECTORF32 g_XMCosEstCoefficients = {1.0f, -4.95348008918096e-1f, 3.878259962881e-2f, -9.24587976263e-4f}; +XMGLOBALCONST XMVECTORF32 g_XMTanEstCoefficients = {2.484f, -1.954923183e-1f, 2.467401101f, XM_1DIVPI}; +XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients = {7.689891418951e-1f, 1.104742493348f, 8.661844266006e-1f, XM_PIDIV2}; +XMGLOBALCONST XMVECTORF32 g_XMASinEstCoefficients = {-1.36178272886711f, 2.37949493464538f, -8.08228565650486e-1f, 2.78440142746736e-1f}; +XMGLOBALCONST XMVECTORF32 g_XMASinEstConstants = {1.00000011921f, XM_PIDIV2, 0.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMPiConstants0 = {XM_PI, XM_2PI, XM_1DIVPI, XM_1DIV2PI}; +XMGLOBALCONST XMVECTORF32 g_XMIdentityR0 = {1.0f, 0.0f, 0.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMIdentityR1 = {0.0f, 1.0f, 0.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMIdentityR2 = {0.0f, 0.0f, 1.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMIdentityR3 = {0.0f, 0.0f, 0.0f, 1.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR0 = {-1.0f,0.0f, 0.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR1 = {0.0f,-1.0f, 0.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR2 = {0.0f, 0.0f,-1.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR3 = {0.0f, 0.0f, 0.0f,-1.0f}; +XMGLOBALCONST XMVECTORI32 g_XMNegativeZero = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; +XMGLOBALCONST XMVECTORI32 g_XMNegate3 = {0x80000000, 0x80000000, 0x80000000, 0x00000000}; +XMGLOBALCONST XMVECTORI32 g_XMMask3 = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000}; +XMGLOBALCONST XMVECTORI32 g_XMMaskX = {0xFFFFFFFF, 0x00000000, 0x00000000, 
0x00000000}; +XMGLOBALCONST XMVECTORI32 g_XMMaskY = {0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000}; +XMGLOBALCONST XMVECTORI32 g_XMMaskZ = {0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000}; +XMGLOBALCONST XMVECTORI32 g_XMMaskW = {0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF}; +XMGLOBALCONST XMVECTORF32 g_XMOne = { 1.0f, 1.0f, 1.0f, 1.0f}; +XMGLOBALCONST XMVECTORF32 g_XMOne3 = { 1.0f, 1.0f, 1.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMZero = { 0.0f, 0.0f, 0.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegativeOne = {-1.0f,-1.0f,-1.0f,-1.0f}; +XMGLOBALCONST XMVECTORF32 g_XMOneHalf = { 0.5f, 0.5f, 0.5f, 0.5f}; +XMGLOBALCONST XMVECTORF32 g_XMNegativeOneHalf = {-0.5f,-0.5f,-0.5f,-0.5f}; +XMGLOBALCONST XMVECTORF32 g_XMNegativeTwoPi = {-XM_2PI, -XM_2PI, -XM_2PI, -XM_2PI}; +XMGLOBALCONST XMVECTORF32 g_XMNegativePi = {-XM_PI, -XM_PI, -XM_PI, -XM_PI}; +XMGLOBALCONST XMVECTORF32 g_XMHalfPi = {XM_PIDIV2, XM_PIDIV2, XM_PIDIV2, XM_PIDIV2}; +XMGLOBALCONST XMVECTORF32 g_XMPi = {XM_PI, XM_PI, XM_PI, XM_PI}; +XMGLOBALCONST XMVECTORF32 g_XMReciprocalPi = {XM_1DIVPI, XM_1DIVPI, XM_1DIVPI, XM_1DIVPI}; +XMGLOBALCONST XMVECTORF32 g_XMTwoPi = {XM_2PI, XM_2PI, XM_2PI, XM_2PI}; +XMGLOBALCONST XMVECTORF32 g_XMReciprocalTwoPi = {XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI}; +XMGLOBALCONST XMVECTORF32 g_XMEpsilon = {1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f}; +XMGLOBALCONST XMVECTORI32 g_XMInfinity = {0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000}; +XMGLOBALCONST XMVECTORI32 g_XMQNaN = {0x7FC00000, 0x7FC00000, 0x7FC00000, 0x7FC00000}; +XMGLOBALCONST XMVECTORI32 g_XMQNaNTest = {0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF}; +XMGLOBALCONST XMVECTORI32 g_XMAbsMask = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; +XMGLOBALCONST XMVECTORI32 g_XMFltMin = {0x00800000, 0x00800000, 0x00800000, 0x00800000}; +XMGLOBALCONST XMVECTORI32 g_XMFltMax = {0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF}; +XMGLOBALCONST XMVECTORI32 g_XMNegOneMask = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}; +XMGLOBALCONST XMVECTORI32 g_XMMaskA8R8G8B8 = {0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000}; +XMGLOBALCONST XMVECTORI32 g_XMFlipA8R8G8B8 = {0x00000000, 0x00000000, 0x00000000, 0x80000000}; +XMGLOBALCONST XMVECTORF32 g_XMFixAA8R8G8B8 = {0.0f,0.0f,0.0f,(float)(0x80000000U)}; +XMGLOBALCONST XMVECTORF32 g_XMNormalizeA8R8G8B8 = {1.0f/(255.0f*(float)(0x10000)),1.0f/(255.0f*(float)(0x100)),1.0f/255.0f,1.0f/(255.0f*(float)(0x1000000))}; +XMGLOBALCONST XMVECTORI32 g_XMMaskA2B10G10R10 = {0x000003FF, 0x000FFC00, 0x3FF00000, 0xC0000000}; +XMGLOBALCONST XMVECTORI32 g_XMFlipA2B10G10R10 = {0x00000200, 0x00080000, 0x20000000, 0x80000000}; +XMGLOBALCONST XMVECTORF32 g_XMFixAA2B10G10R10 = {-512.0f,-512.0f*(float)(0x400),-512.0f*(float)(0x100000),(float)(0x80000000U)}; +XMGLOBALCONST XMVECTORF32 g_XMNormalizeA2B10G10R10 = {1.0f/511.0f,1.0f/(511.0f*(float)(0x400)),1.0f/(511.0f*(float)(0x100000)),1.0f/(3.0f*(float)(0x40000000))}; +XMGLOBALCONST XMVECTORI32 g_XMMaskX16Y16 = {0x0000FFFF, 0xFFFF0000, 0x00000000, 0x00000000}; +XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16 = {0x00008000, 0x00000000, 0x00000000, 0x00000000}; +XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16 = {-32768.0f,0.0f,0.0f,0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16 = {1.0f/32767.0f,1.0f/(32767.0f*65536.0f),0.0f,0.0f}; +XMGLOBALCONST XMVECTORI32 g_XMMaskX16Y16Z16W16 = {0x0000FFFF, 0x0000FFFF, 0xFFFF0000, 0xFFFF0000}; +XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16Z16W16 = {0x00008000, 0x00008000, 0x00000000, 0x00000000}; +XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16Z16W16 = 
{-32768.0f,-32768.0f,0.0f,0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16Z16W16 = {1.0f/32767.0f,1.0f/32767.0f,1.0f/(32767.0f*65536.0f),1.0f/(32767.0f*65536.0f)}; +XMGLOBALCONST XMVECTORF32 g_XMNoFraction = {8388608.0f,8388608.0f,8388608.0f,8388608.0f}; +XMGLOBALCONST XMVECTORI32 g_XMMaskByte = {0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF}; +XMGLOBALCONST XMVECTORF32 g_XMNegateX = {-1.0f, 1.0f, 1.0f, 1.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegateY = { 1.0f,-1.0f, 1.0f, 1.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegateZ = { 1.0f, 1.0f,-1.0f, 1.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegateW = { 1.0f, 1.0f, 1.0f,-1.0f}; +XMGLOBALCONST XMVECTORI32 g_XMSelect0101 = {XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1}; +XMGLOBALCONST XMVECTORI32 g_XMSelect1010 = {XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0}; +XMGLOBALCONST XMVECTORI32 g_XMOneHalfMinusEpsilon = { 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD}; +XMGLOBALCONST XMVECTORI32 g_XMSelect1000 = {XM_SELECT_1, XM_SELECT_0, XM_SELECT_0, XM_SELECT_0}; +XMGLOBALCONST XMVECTORI32 g_XMSelect1100 = {XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0}; +XMGLOBALCONST XMVECTORI32 g_XMSelect1110 = {XM_SELECT_1, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0}; +XMGLOBALCONST XMVECTORI32 g_XMSwizzleXYXY = {XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0X, XM_PERMUTE_0Y}; +XMGLOBALCONST XMVECTORI32 g_XMSwizzleXYZX = {XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0X}; +XMGLOBALCONST XMVECTORI32 g_XMSwizzleYXZW = {XM_PERMUTE_0Y, XM_PERMUTE_0X, XM_PERMUTE_0Z, XM_PERMUTE_0W}; +XMGLOBALCONST XMVECTORI32 g_XMSwizzleYZXW = {XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0X, XM_PERMUTE_0W}; +XMGLOBALCONST XMVECTORI32 g_XMSwizzleZXYW = {XM_PERMUTE_0Z, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0W}; +XMGLOBALCONST XMVECTORI32 g_XMPermute0X0Y1X1Y = {XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_1Y}; +XMGLOBALCONST XMVECTORI32 g_XMPermute0Z0W1Z1W = {XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_1W}; +XMGLOBALCONST XMVECTORF32 g_XMFixupY16 = {1.0f,1.0f/65536.0f,0.0f,0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMFixupY16W16 = {1.0f,1.0f,1.0f/65536.0f,1.0f/65536.0f}; +XMGLOBALCONST XMVECTORI32 g_XMFlipY = {0,0x80000000,0,0}; +XMGLOBALCONST XMVECTORI32 g_XMFlipZ = {0,0,0x80000000,0}; +XMGLOBALCONST XMVECTORI32 g_XMFlipW = {0,0,0,0x80000000}; +XMGLOBALCONST XMVECTORI32 g_XMFlipYZ = {0,0x80000000,0x80000000,0}; +XMGLOBALCONST XMVECTORI32 g_XMFlipZW = {0,0,0x80000000,0x80000000}; +XMGLOBALCONST XMVECTORI32 g_XMFlipYW = {0,0x80000000,0,0x80000000}; +XMGLOBALCONST XMVECTORI32 g_XMMaskHenD3 = {0x7FF,0x7ff<<11,0x3FF<<22,0}; +XMGLOBALCONST XMVECTORI32 g_XMMaskDHen3 = {0x3FF,0x7ff<<10,0x7FF<<21,0}; +XMGLOBALCONST XMVECTORF32 g_XMAddUHenD3 = {0,0,32768.0f*65536.0f,0}; +XMGLOBALCONST XMVECTORF32 g_XMAddHenD3 = {-1024.0f,-1024.0f*2048.0f,0,0}; +XMGLOBALCONST XMVECTORF32 g_XMAddDHen3 = {-512.0f,-1024.0f*1024.0f,0,0}; +XMGLOBALCONST XMVECTORF32 g_XMMulHenD3 = {1.0f,1.0f/2048.0f,1.0f/(2048.0f*2048.0f),0}; +XMGLOBALCONST XMVECTORF32 g_XMMulDHen3 = {1.0f,1.0f/1024.0f,1.0f/(1024.0f*2048.0f),0}; +XMGLOBALCONST XMVECTORI32 g_XMXorHenD3 = {0x400,0x400<<11,0,0}; +XMGLOBALCONST XMVECTORI32 g_XMXorDHen3 = {0x200,0x400<<10,0,0}; +XMGLOBALCONST XMVECTORI32 g_XMMaskIco4 = {0xFFFFF,0xFFFFF000,0xFFFFF,0xF0000000}; +XMGLOBALCONST XMVECTORI32 g_XMXorXIco4 = {0x80000,0,0x80000,0x80000000}; +XMGLOBALCONST XMVECTORI32 g_XMXorIco4 = {0x80000,0,0x80000,0}; +XMGLOBALCONST XMVECTORF32 g_XMAddXIco4 = {-8.0f*65536.0f,0,-8.0f*65536.0f,32768.0f*65536.0f}; +XMGLOBALCONST XMVECTORF32 g_XMAddUIco4 = 
{0,32768.0f*65536.0f,0,32768.0f*65536.0f}; +XMGLOBALCONST XMVECTORF32 g_XMAddIco4 = {-8.0f*65536.0f,0,-8.0f*65536.0f,0}; +XMGLOBALCONST XMVECTORF32 g_XMMulIco4 = {1.0f,1.0f/4096.0f,1.0f,1.0f/(4096.0f*65536.0f)}; +XMGLOBALCONST XMVECTORI32 g_XMMaskDec4 = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<30}; +XMGLOBALCONST XMVECTORI32 g_XMXorDec4 = {0x200,0x200<<10,0x200<<20,0}; +XMGLOBALCONST XMVECTORF32 g_XMAddUDec4 = {0,0,0,32768.0f*65536.0f}; +XMGLOBALCONST XMVECTORF32 g_XMAddDec4 = {-512.0f,-512.0f*1024.0f,-512.0f*1024.0f*1024.0f,0}; +XMGLOBALCONST XMVECTORF32 g_XMMulDec4 = {1.0f,1.0f/1024.0f,1.0f/(1024.0f*1024.0f),1.0f/(1024.0f*1024.0f*1024.0f)}; +XMGLOBALCONST XMVECTORI32 g_XMMaskByte4 = {0xFF,0xFF00,0xFF0000,0xFF000000}; +XMGLOBALCONST XMVECTORI32 g_XMXorByte4 = {0x80,0x8000,0x800000,0x00000000}; +XMGLOBALCONST XMVECTORF32 g_XMAddByte4 = {-128.0f,-128.0f*256.0f,-128.0f*65536.0f,0}; +XMGLOBALCONST XMVECTORF32 g_XMFixUnsigned = {32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f}; +XMGLOBALCONST XMVECTORF32 g_XMMaxInt = {65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f}; +XMGLOBALCONST XMVECTORF32 g_XMMaxUInt = {65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f}; +XMGLOBALCONST XMVECTORF32 g_XMUnsignedFix = {32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f}; + + +/**************************************************************************** + * + * Implementation + * + ****************************************************************************/ + +#pragma warning(push) +#pragma warning(disable:4068 4214 4204 4365 4616 6001) + +#pragma prefast(push) +#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") + +#if !defined(__cplusplus) && !defined(_XBOX) && defined(_XM_ISVS2005_) + +/* Work around VC 2005 bug where math.h defines logf with a semicolon at the end. 
+ * Note this is fixed as of Visual Studio 2005 Service Pack 1
+ */
+
+#undef logf
+#define logf(x) ((float)log((double)(x)))
+
+#endif // !defined(__cplusplus) && !defined(_XBOX) && defined(_XM_ISVS2005_)
+
+
+//------------------------------------------------------------------------------
+
+#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_VMX128_INTRINSICS_)
+#else
+
+XMFINLINE XMVECTOR XMVectorSetBinaryConstant(UINT C0, UINT C1, UINT C2, UINT C3)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 vResult;
+    vResult.u[0] = (0-(C0&1)) & 0x3F800000;
+    vResult.u[1] = (0-(C1&1)) & 0x3F800000;
+    vResult.u[2] = (0-(C2&1)) & 0x3F800000;
+    vResult.u[3] = (0-(C3&1)) & 0x3F800000;
+    return vResult.v;
+#else // XM_SSE_INTRINSICS_
+    static const XMVECTORU32 g_vMask1 = {1,1,1,1};
+    // Move the parms to a vector
+    __m128i vTemp = _mm_set_epi32(C3,C2,C1,C0);
+    // Mask off the low bits
+    vTemp = _mm_and_si128(vTemp,g_vMask1);
+    // 0xFFFFFFFF on true bits
+    vTemp = _mm_cmpeq_epi32(vTemp,g_vMask1);
+    // 0xFFFFFFFF -> 1.0f, 0x00000000 -> 0.0f
+    vTemp = _mm_and_si128(vTemp,g_XMOne);
+    return reinterpret_cast<const __m128 *>(&vTemp)[0];
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorSplatConstant(INT IntConstant, UINT DivExponent)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMASSERT( IntConstant >= -16 && IntConstant <= 15 );
+    XMASSERT(DivExponent<32);
+    {
+    XMVECTORI32 V = { IntConstant, IntConstant, IntConstant, IntConstant };
+    return XMConvertVectorIntToFloat( V.v, DivExponent);
+    }
+#else // XM_SSE_INTRINSICS_
+    XMASSERT( IntConstant >= -16 && IntConstant <= 15 );
+    XMASSERT(DivExponent<32);
+    // Splat the int
+    __m128i vScale = _mm_set1_epi32(IntConstant);
+    // Convert to a float
+    XMVECTOR vResult = _mm_cvtepi32_ps(vScale);
+    // Convert DivExponent into 1.0f/(1<<DivExponent)
+    UINT uScale = 0x3F800000U - (DivExponent << 23);
+    // Splat the scalar value (it's really a float)
+    vScale = _mm_set1_epi32(uScale);
+    // Multiply by the reciprocal of the scale
+    vResult = _mm_mul_ps(vResult,reinterpret_cast<const __m128 *>(&vScale)[0]);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorSplatConstantInt(INT IntConstant)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMASSERT( IntConstant >= -16 && IntConstant <= 15 );
+    {
+    XMVECTORI32 V = { IntConstant, IntConstant, IntConstant, IntConstant };
+    return V.v;
+    }
+#else // XM_SSE_INTRINSICS_
+    XMASSERT( IntConstant >= -16 && IntConstant <= 15 );
+    __m128i V = _mm_set1_epi32( IntConstant );
+    return reinterpret_cast<__m128 *>(&V)[0];
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, UINT Elements)
+{
+    return XMVectorPermute(V1, V2, XMVectorPermuteControl((Elements), ((Elements) + 1), ((Elements) + 2), ((Elements) + 3)));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorRotateLeft(FXMVECTOR V, UINT Elements)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMASSERT( Elements < 4 );
+    {
+    XMVECTORF32 vResult = { V.vector4_f32[Elements & 3], V.vector4_f32[(Elements + 1) & 3],
+                            V.vector4_f32[(Elements + 2) & 3], V.vector4_f32[(Elements + 3) & 3] };
+    return vResult.v;
+    }
+#else // XM_SSE_INTRINSICS_
+    FLOAT fx = XMVectorGetByIndex(V,(Elements) & 3);
+    FLOAT fy = XMVectorGetByIndex(V,((Elements) + 1) & 3);
+    FLOAT fz = XMVectorGetByIndex(V,((Elements) + 2) & 3);
+    FLOAT fw = XMVectorGetByIndex(V,((Elements) + 3) & 3);
+    return _mm_set_ps( fw, fz, fy, fx );
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorRotateRight(FXMVECTOR V, UINT
Elements) +{ +#if defined(_XM_NO_INTRINSICS_) + XMASSERT( Elements < 4 ); + { + XMVECTORF32 vResult = { V.vector4_f32[(4 - (Elements)) & 3], V.vector4_f32[(5 - (Elements)) & 3], + V.vector4_f32[(6 - (Elements)) & 3], V.vector4_f32[(7 - (Elements)) & 3] }; + return vResult.v; + } +#else // XM_SSE_INTRINSICS_ + FLOAT fx = XMVectorGetByIndex(V,(4 - (Elements)) & 3); + FLOAT fy = XMVectorGetByIndex(V,(5 - (Elements)) & 3); + FLOAT fz = XMVectorGetByIndex(V,(6 - (Elements)) & 3); + FLOAT fw = XMVectorGetByIndex(V,(7 - (Elements)) & 3); + return _mm_set_ps( fw, fz, fy, fx ); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorSwizzle(FXMVECTOR V, UINT E0, UINT E1, UINT E2, UINT E3) +{ +#if defined(_XM_NO_INTRINSICS_) + XMASSERT( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) ); + { + XMVECTORF32 vResult = { V.vector4_f32[E0], V.vector4_f32[E1], V.vector4_f32[E2], V.vector4_f32[E3] }; + return vResult.v; + } +#else // XM_SSE_INTRINSICS_ + FLOAT fx = XMVectorGetByIndex(V,E0); + FLOAT fy = XMVectorGetByIndex(V,E1); + FLOAT fz = XMVectorGetByIndex(V,E2); + FLOAT fw = XMVectorGetByIndex(V,E3); + return _mm_set_ps( fw, fz, fy, fx ); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, UINT VSLeftRotateElements, + UINT Select0, UINT Select1, UINT Select2, UINT Select3) +{ + XMVECTOR Control = XMVectorSelectControl(Select0&1, Select1&1, Select2&1, Select3&1); + return XMVectorSelect( VD, XMVectorRotateLeft(VS, VSLeftRotateElements), Control ); +} + +#endif + +//------------------------------------------------------------------------------ + +#include "xnamathconvert.inl" +#include "xnamathvector.inl" +#include "xnamathmatrix.inl" +#include "xnamathmisc.inl" + +#pragma prefast(pop) +#pragma warning(pop) + +#endif // __XNAMATH_H__ + diff --git a/thirdparty/directxtex/XNAMath/xnamathconvert.inl b/thirdparty/directxtex/XNAMath/xnamathconvert.inl new file mode 100644 index 0000000..63ed20d --- /dev/null +++ b/thirdparty/directxtex/XNAMath/xnamathconvert.inl @@ -0,0 +1,6383 @@ +/************************************************************************ +* * +* xnamathconvert.inl -- SIMD C++ Math library for Windows and Xbox 360 * +* Conversion, loading, and storing functions * +* * +* Copyright (c) Microsoft Corp. All rights reserved. 
* +* * +************************************************************************/ + +#if defined(_MSC_VER) && (_MSC_VER > 1000) +#pragma once +#endif + +#ifndef __XNAMATHCONVERT_INL__ +#define __XNAMATHCONVERT_INL__ + +#define XM_PACK_FACTOR (FLOAT)(1 << 22) +#define XM_UNPACK_FACTOR_UNSIGNED (FLOAT)(1 << 23) +#define XM_UNPACK_FACTOR_SIGNED XM_PACK_FACTOR + +#define XM_UNPACK_UNSIGNEDN_OFFSET(BitsX, BitsY, BitsZ, BitsW) \ + {-XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsX)) - 1), \ + -XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsY)) - 1), \ + -XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsZ)) - 1), \ + -XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsW)) - 1)} + +#define XM_UNPACK_UNSIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \ + {XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsX)) - 1), \ + XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsY)) - 1), \ + XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsZ)) - 1), \ + XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsW)) - 1)} + +#define XM_UNPACK_SIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \ + {-XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsX) - 1)) - 1), \ + -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsY) - 1)) - 1), \ + -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsZ) - 1)) - 1), \ + -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsW) - 1)) - 1)} + +//#define XM_UNPACK_SIGNEDN_OFFSET(BitsX, BitsY, BitsZ, BitsW) \ +// {-XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsX) - 1)) - 1) * 3.0f, \ +// -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsY) - 1)) - 1) * 3.0f, \ +// -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsZ) - 1)) - 1) * 3.0f, \ +// -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsW) - 1)) - 1) * 3.0f} + +#define XM_PACK_UNSIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \ + {-(FLOAT)((1 << (BitsX)) - 1) / XM_PACK_FACTOR, \ + -(FLOAT)((1 << (BitsY)) - 1) / XM_PACK_FACTOR, \ + -(FLOAT)((1 << (BitsZ)) - 1) / XM_PACK_FACTOR, \ + -(FLOAT)((1 << (BitsW)) - 1) / XM_PACK_FACTOR} + +#define XM_PACK_SIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \ + {-(FLOAT)((1 << ((BitsX) - 1)) - 1) / XM_PACK_FACTOR, \ + -(FLOAT)((1 << ((BitsY) - 1)) - 1) / XM_PACK_FACTOR, \ + -(FLOAT)((1 << ((BitsZ) - 1)) - 1) / XM_PACK_FACTOR, \ + -(FLOAT)((1 << ((BitsW) - 1)) - 1) / XM_PACK_FACTOR} + +#define XM_PACK_OFFSET XMVectorSplatConstant(3, 0) +//#define XM_UNPACK_OFFSET XM_PACK_OFFSET + +/**************************************************************************** + * + * Data conversion + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE FLOAT XMConvertHalfToFloat +( + HALF Value +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) + + UINT Mantissa; + UINT Exponent; + UINT Result; + + Mantissa = (UINT)(Value & 0x03FF); + + if ((Value & 0x7C00) != 0) // The value is normalized + { + Exponent = (UINT)((Value >> 10) & 0x1F); + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x0400) == 0); + + Mantissa &= 0x03FF; + } + else // The value is zero + { + Exponent = (UINT)-112; + } + + Result = ((Value & 0x8000) << 16) | // Sign + ((Exponent + 112) << 23) | // Exponent + (Mantissa << 13); // Mantissa + + return *(FLOAT*)&Result; + +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif +} + +//------------------------------------------------------------------------------ + +XMINLINE FLOAT* 
XMConvertHalfToFloatStream +( + FLOAT* pOutputStream, + size_t OutputStride, + CONST HALF* pInputStream, + size_t InputStride, + size_t HalfCount +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) + + size_t i; + CONST BYTE* pHalf = (CONST BYTE*)pInputStream; + BYTE* pFloat = (BYTE*)pOutputStream; + + XMASSERT(pOutputStream); + XMASSERT(pInputStream); + + for (i = 0; i < HalfCount; i++) + { + *(FLOAT*)pFloat = XMConvertHalfToFloat(*(const HALF*)pHalf); + pHalf += InputStride; + pFloat += OutputStride; + } + + return pOutputStream; + +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE HALF XMConvertFloatToHalf +( + FLOAT Value +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) + UINT Result; + + UINT IValue = ((UINT *)(&Value))[0]; + UINT Sign = (IValue & 0x80000000U) >> 16U; + IValue = IValue & 0x7FFFFFFFU; // Hack off the sign + + if (IValue > 0x47FFEFFFU) + { + // The number is too large to be represented as a half. Saturate to infinity. + Result = 0x7FFFU; + } + else + { + if (IValue < 0x38800000U) + { + // The number is too small to be represented as a normalized half. + // Convert it to a denormalized value. + UINT Shift = 113U - (IValue >> 23U); + IValue = (0x800000U | (IValue & 0x7FFFFFU)) >> Shift; + } + else + { + // Rebias the exponent to represent the value as a normalized half. + IValue += 0xC8000000U; + } + + Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U)&0x7FFFU; + } + return (HALF)(Result|Sign); + +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif +} + +//------------------------------------------------------------------------------ + +XMINLINE HALF* XMConvertFloatToHalfStream +( + HALF* pOutputStream, + size_t OutputStride, + CONST FLOAT* pInputStream, + size_t InputStride, + size_t FloatCount +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) + + size_t i; + BYTE* pFloat = (BYTE*)pInputStream; + BYTE* pHalf = (BYTE*)pOutputStream; + + XMASSERT(pOutputStream); + XMASSERT(pInputStream); + + for (i = 0; i < FloatCount; i++) + { + *(HALF*)pHalf = XMConvertFloatToHalf(*(FLOAT*)pFloat); + pFloat += InputStride; + pHalf += OutputStride; + } + return pOutputStream; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) +// For VMX128, these routines are all defines in the main header + +#pragma warning(push) +#pragma warning(disable:4701) // Prevent warnings about 'Result' potentially being used without having been initialized + +XMINLINE XMVECTOR XMConvertVectorIntToFloat +( + FXMVECTOR VInt, + UINT DivExponent +) +{ +#if defined(_XM_NO_INTRINSICS_) + UINT ElementIndex; + FLOAT fScale; + XMVECTOR Result; + XMASSERT(DivExponent<32); + fScale = 1.0f / (FLOAT)(1U << DivExponent); + ElementIndex = 0; + do { + INT iTemp = (INT)VInt.vector4_u32[ElementIndex]; + Result.vector4_f32[ElementIndex] = ((FLOAT)iTemp) * fScale; + } while (++ElementIndex<4); + return Result; +#else // _XM_SSE_INTRINSICS_ + XMASSERT(DivExponent<32); + // Convert to floats + XMVECTOR vResult = _mm_cvtepi32_ps(reinterpret_cast(&VInt)[0]); + // Convert DivExponent into 1.0f/(1<(&vScale)[0]); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +XMINLINE XMVECTOR 
XMConvertVectorFloatToInt +( + FXMVECTOR VFloat, + UINT MulExponent +) +{ +#if defined(_XM_NO_INTRINSICS_) + UINT ElementIndex; + XMVECTOR Result; + FLOAT fScale; + XMASSERT(MulExponent<32); + // Get the scalar factor. + fScale = (FLOAT)(1U << MulExponent); + ElementIndex = 0; + do { + INT iResult; + FLOAT fTemp = VFloat.vector4_f32[ElementIndex]*fScale; + if (fTemp <= -(65536.0f*32768.0f)) { + iResult = (-0x7FFFFFFF)-1; + } else if (fTemp > (65536.0f*32768.0f)-128.0f) { + iResult = 0x7FFFFFFF; + } else { + iResult = (INT)fTemp; + } + Result.vector4_u32[ElementIndex] = (UINT)iResult; + } while (++ElementIndex<4); + return Result; +#else // _XM_SSE_INTRINSICS_ + XMASSERT(MulExponent<32); + XMVECTOR vResult = _mm_set_ps1((FLOAT)(1U << MulExponent)); + vResult = _mm_mul_ps(vResult,VFloat); + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(vResult); + // If there was positive overflow, set to 0x7FFFFFFF + vResult = _mm_and_ps(vOverflow,g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast(&vResulti)[0]); + vOverflow = _mm_or_ps(vOverflow,vResult); + return vOverflow; +#endif +} + +//------------------------------------------------------------------------------ + +XMINLINE XMVECTOR XMConvertVectorUIntToFloat +( + FXMVECTOR VUInt, + UINT DivExponent +) +{ +#if defined(_XM_NO_INTRINSICS_) + UINT ElementIndex; + FLOAT fScale; + XMVECTOR Result; + XMASSERT(DivExponent<32); + fScale = 1.0f / (FLOAT)(1U << DivExponent); + ElementIndex = 0; + do { + Result.vector4_f32[ElementIndex] = (FLOAT)VUInt.vector4_u32[ElementIndex] * fScale; + } while (++ElementIndex<4); + return Result; +#else // _XM_SSE_INTRINSICS_ + XMASSERT(DivExponent<32); + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. + XMVECTOR vMask = _mm_and_ps(VUInt,g_XMNegativeZero); + // Force all values positive + XMVECTOR vResult = _mm_xor_ps(VUInt,vMask); + // Convert to floats + vResult = _mm_cvtepi32_ps(reinterpret_cast(&vResult)[0]); + // Convert 0x80000000 -> 0xFFFFFFFF + __m128i iMask = _mm_srai_epi32(reinterpret_cast(&vMask)[0],31); + // For only the ones that are too big, add the fixup + vMask = _mm_and_ps(reinterpret_cast(&iMask)[0],g_XMFixUnsigned); + vResult = _mm_add_ps(vResult,vMask); + // Convert DivExponent into 1.0f/(1<(&iMask)[0]); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +XMINLINE XMVECTOR XMConvertVectorFloatToUInt +( + FXMVECTOR VFloat, + UINT MulExponent +) +{ +#if defined(_XM_NO_INTRINSICS_) + UINT ElementIndex; + XMVECTOR Result; + FLOAT fScale; + XMASSERT(MulExponent<32); + // Get the scalar factor. 
+ fScale = (FLOAT)(1U << MulExponent); + ElementIndex = 0; + do { + UINT uResult; + FLOAT fTemp = VFloat.vector4_f32[ElementIndex]*fScale; + if (fTemp <= 0.0f) { + uResult = 0; + } else if (fTemp >= (65536.0f*65536.0f)) { + uResult = 0xFFFFFFFFU; + } else { + uResult = (UINT)fTemp; + } + Result.vector4_u32[ElementIndex] = uResult; + } while (++ElementIndex<4); + return Result; +#else // _XM_SSE_INTRINSICS_ + XMASSERT(MulExponent<32); + XMVECTOR vResult = _mm_set_ps1(static_cast(1U << MulExponent)); + vResult = _mm_mul_ps(vResult,VFloat); + // Clamp to >=0 + vResult = _mm_max_ps(vResult,g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? + XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); + // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue,vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult,vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned pnly if greater than 0x80000000 + vMask = _mm_and_ps(vMask,g_XMNegativeZero); + vResult = _mm_xor_ps(reinterpret_cast(&vResulti)[0],vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult,vOverflow); + return vResult; +#endif +} + +#pragma warning(pop) + +#endif // _XM_NO_INTRINSICS_ || _XM_SSE_INTRINSICS_ + +/**************************************************************************** + * + * Vector and matrix load operations + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadInt(CONST UINT* pSource) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + XMASSERT(pSource); + XMASSERT(((UINT_PTR)pSource & 3) == 0); + + V.vector4_u32[0] = *pSource; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + XMASSERT(((UINT_PTR)pSource & 3) == 0); + + return _mm_load_ss( (const float*)pSource ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadFloat(CONST FLOAT* pSource) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + XMASSERT(pSource); + XMASSERT(((UINT_PTR)pSource & 3) == 0); + + V.vector4_f32[0] = *pSource; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + XMASSERT(((UINT_PTR)pSource & 3) == 0); + + return _mm_load_ss( pSource ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadInt2 +( + CONST UINT* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + + XMASSERT(pSource); + + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + + return V; +#elif defined(_XM_SSE_INTRINSICS_) + + XMASSERT(pSource); + + __m128 x = _mm_load_ss( (const float*)pSource ); + __m128 y = _mm_load_ss( (const float*)(pSource+1) ); + return _mm_unpacklo_ps( x, y ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadSInt2 +( + CONST XMINT2* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + 
XMASSERT(pSource); + + V.vector4_f32[0] = (float)pSource->x; + V.vector4_f32[1] = (float)pSource->y; + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + + __m128 x = _mm_load_ss( (const float*)&pSource->x ); + __m128 y = _mm_load_ss( (const float*)&pSource->y ); + __m128 V = _mm_unpacklo_ps( x, y ); + return _mm_cvtepi32_ps(reinterpret_cast(&V)[0]); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadUInt2 +( + CONST XMUINT2* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + XMASSERT(pSource); + + V.vector4_f32[0] = (float)pSource->x; + V.vector4_f32[1] = (float)pSource->y; + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + + __m128 x = _mm_load_ss( (const float*)&pSource->x ); + __m128 y = _mm_load_ss( (const float*)&pSource->y ); + __m128 V = _mm_unpacklo_ps( x, y ); + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. + XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero); + // Force all values positive + XMVECTOR vResult = _mm_xor_ps(V,vMask); + // Convert to floats + vResult = _mm_cvtepi32_ps(reinterpret_cast(&vResult)[0]); + // Convert 0x80000000 -> 0xFFFFFFFF + __m128i iMask = _mm_srai_epi32(reinterpret_cast(&vMask)[0],31); + // For only the ones that are too big, add the fixup + vMask = _mm_and_ps(reinterpret_cast(&iMask)[0],g_XMFixUnsigned); + vResult = _mm_add_ps(vResult,vMask); + return vResult; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadInt2A +( + CONST UINT* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + + XMASSERT(pSource); + XMASSERT(((UINT_PTR)pSource & 0xF) == 0); + + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + + XMASSERT(pSource); + XMASSERT(((UINT_PTR)pSource & 0xF) == 0); + + __m128i V = _mm_loadl_epi64( (const __m128i*)pSource ); + return reinterpret_cast<__m128 *>(&V)[0]; + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadFloat2 +( + CONST XMFLOAT2* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + XMASSERT(pSource); + + ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0]; + ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0]; + return V; +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + + __m128 x = _mm_load_ss( &pSource->x ); + __m128 y = _mm_load_ss( &pSource->y ); + return _mm_unpacklo_ps( x, y ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadFloat2A +( + CONST XMFLOAT2A* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + + XMASSERT(pSource); + XMASSERT(((UINT_PTR)pSource & 0xF) == 0); + + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + XMASSERT(((UINT_PTR)pSource & 0xF) == 0); + + __m128i V = _mm_loadl_epi64( (const __m128i*)pSource ); + return reinterpret_cast<__m128 *>(&V)[0]; +#else // _XM_VMX128_INTRINSICS_ +#endif // 
_XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadHalf2 +( + CONST XMHALF2* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMASSERT(pSource); + { + XMVECTOR vResult = { + XMConvertHalfToFloat(pSource->x), + XMConvertHalfToFloat(pSource->y), + 0.0f, + 0.0f + }; + return vResult; + } +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + XMVECTOR vResult = { + XMConvertHalfToFloat(pSource->x), + XMConvertHalfToFloat(pSource->y), + 0.0f, + 0.0f + }; + return vResult; + +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadShortN2 +( + CONST XMSHORTN2* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMASSERT(pSource); + { + XMVECTOR vResult = { + (pSource->x == -32768) ? -1.f : ((FLOAT)pSource->x * (1.0f/32767.0f)), + (pSource->y == -32768) ? -1.f : ((FLOAT)pSource->y * (1.0f/32767.0f)), + 0.0f, + 0.0f + }; + return vResult; + } + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); + // x needs to be sign extended + vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(reinterpret_cast(&vTemp)[0]); + // x - 0x8000 to undo the signed order. + vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16); + // Convert -1.0f - 1.0f + vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16); + // Clamp result (for case of -32768) + return _mm_max_ps( vTemp, g_XMNegativeOne ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadShort2 +( + CONST XMSHORT2* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + + XMASSERT(pSource); + + V.vector4_f32[0] = (FLOAT)pSource->x; + V.vector4_f32[1] = (FLOAT)pSource->y; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); + // x needs to be sign extended + vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(reinterpret_cast(&vTemp)[0]); + // x - 0x8000 to undo the signed order. 
+ vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16); + // Y is 65536 too large + return _mm_mul_ps(vTemp,g_XMFixupY16); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadUShortN2 +( + CONST XMUSHORTN2* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + + XMASSERT(pSource); + + V.vector4_f32[0] = (FLOAT)pSource->x / 65535.0f; + V.vector4_f32[1] = (FLOAT)pSource->y / 65535.0f; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixupY16 = {1.0f/65535.0f,1.0f/(65535.0f*65536.0f),0.0f,0.0f}; + static const XMVECTORF32 FixaddY16 = {0,32768.0f*65536.0f,0,0}; + XMASSERT(pSource); + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); + // y needs to be sign flipped + vTemp = _mm_xor_ps(vTemp,g_XMFlipY); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(reinterpret_cast(&vTemp)[0]); + // y + 0x8000 to undo the signed order. + vTemp = _mm_add_ps(vTemp,FixaddY16); + // Y is 65536 times too large + vTemp = _mm_mul_ps(vTemp,FixupY16); + return vTemp; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadUShort2 +( + CONST XMUSHORT2* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + + XMASSERT(pSource); + + V.vector4_f32[0] = (FLOAT)pSource->x; + V.vector4_f32[1] = (FLOAT)pSource->y; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixaddY16 = {0,32768.0f,0,0}; + XMASSERT(pSource); + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); + // y needs to be sign flipped + vTemp = _mm_xor_ps(vTemp,g_XMFlipY); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(reinterpret_cast(&vTemp)[0]); + // Y is 65536 times too large + vTemp = _mm_mul_ps(vTemp,g_XMFixupY16); + // y + 0x8000 to undo the signed order. + vTemp = _mm_add_ps(vTemp,FixaddY16); + return vTemp; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadByteN2 +( + CONST XMBYTEN2* pSource +) +{ + XMASSERT(pSource); + { + XMVECTOR vResult = { + (pSource->x == -128) ? -1.f : ((FLOAT)pSource->x * (1.0f/127.0f)), + (pSource->y == -128) ? 
-1.f : ((FLOAT)pSource->y * (1.0f/127.0f)), + 0.0f, + 0.0f + }; + return vResult; + } +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadByte2 +( + CONST XMBYTE2* pSource +) +{ + XMASSERT(pSource); + { + XMVECTOR vResult = { + (FLOAT)pSource->x, + (FLOAT)pSource->y, + 0.0f, + 0.0f + }; + return vResult; + } +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadUByteN2 +( + CONST XMUBYTEN2* pSource +) +{ + XMASSERT(pSource); + { + XMVECTOR vResult = { + (FLOAT)pSource->x * (1.0f/255.0f), + (FLOAT)pSource->y * (1.0f/255.0f), + 0.0f, + 0.0f + }; + return vResult; + } +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadUByte2 +( + CONST XMUBYTE2* pSource +) +{ + XMASSERT(pSource); + { + XMVECTOR vResult = { + (FLOAT)pSource->x, + (FLOAT)pSource->y, + 0.0f, + 0.0f + }; + return vResult; + } +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadInt3 +( + CONST UINT* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + + XMASSERT(pSource); + + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + +#ifdef _XM_ISVS2005_ + __m128i V = _mm_set_epi32( 0, *(pSource+2), *(pSource+1), *pSource ); + return reinterpret_cast<__m128 *>(&V)[0]; +#else + __m128 x = _mm_load_ss( (const float*)pSource ); + __m128 y = _mm_load_ss( (const float*)(pSource+1) ); + __m128 z = _mm_load_ss( (const float*)(pSource+2) ); + __m128 xy = _mm_unpacklo_ps( x, y ); + return _mm_movelh_ps( xy, z ); +#endif // !_XM_ISVS2005_ +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadSInt3 +( + CONST XMINT3* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + XMASSERT(pSource); + +#ifdef _XBOX_VER + V = XMLoadInt3( (const UINT*)pSource ); + return XMConvertVectorIntToFloat( V, 0 ); +#else + V.vector4_f32[0] = (float)pSource->x; + V.vector4_f32[1] = (float)pSource->y; + V.vector4_f32[2] = (float)pSource->z; + return V; +#endif + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + +#ifdef _XM_ISVS2005_ + __m128i V = _mm_set_epi32( 0, pSource->z, pSource->y, pSource->x ); + return _mm_cvtepi32_ps(V); +#else + __m128 x = _mm_load_ss( (const float*)&pSource->x ); + __m128 y = _mm_load_ss( (const float*)&pSource->y ); + __m128 z = _mm_load_ss( (const float*)&pSource->z ); + __m128 xy = _mm_unpacklo_ps( x, y ); + __m128 V = _mm_movelh_ps( xy, z ); + return _mm_cvtepi32_ps(reinterpret_cast(&V)[0]); +#endif // !_XM_ISVS2005_ +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadUInt3 +( + CONST XMUINT3* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + XMASSERT(pSource); + + V.vector4_f32[0] = (float)pSource->x; + V.vector4_f32[1] = (float)pSource->y; + V.vector4_f32[2] = (float)pSource->z; + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + +#ifdef _XM_ISVS2005_ + __m128i V = _mm_set_epi32( 0, pSource->z, pSource->y, pSource->x ); + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. 
+    XMVECTOR vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&V)[0],g_XMNegativeZero);
+    // Force all values positive
+    XMVECTOR vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&V)[0],vMask);
+#else
+    __m128 x = _mm_load_ss( (const float*)&pSource->x );
+    __m128 y = _mm_load_ss( (const float*)&pSource->y );
+    __m128 z = _mm_load_ss( (const float*)&pSource->z );
+    __m128 xy = _mm_unpacklo_ps( x, y );
+    __m128 V = _mm_movelh_ps( xy, z );
+    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
+    // Determine which ones need the fix.
+    XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero);
+    // Force all values positive
+    XMVECTOR vResult = _mm_xor_ps(V,vMask);
+#endif // !_XM_ISVS2005_
+    // Convert to floats
+    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
+    // Convert 0x80000000 -> 0xFFFFFFFF
+    __m128i iMask = _mm_srai_epi32(reinterpret_cast<const __m128i *>(&vMask)[0],31);
+    // For only the ones that are too big, add the fixup
+    vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&iMask)[0],g_XMFixUnsigned);
+    vResult = _mm_add_ps(vResult,vMask);
+    return vResult;
+
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadInt3A
+(
+    CONST UINT* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR V;
+
+    XMASSERT(pSource);
+    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
+
+    V.vector4_u32[0] = pSource[0];
+    V.vector4_u32[1] = pSource[1];
+    V.vector4_u32[2] = pSource[2];
+
+    return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMASSERT(pSource);
+
+    // Reads an extra integer that is 'undefined'
+
+    __m128i V = _mm_load_si128( (const __m128i*)pSource );
+    return reinterpret_cast<__m128 *>(&V)[0];
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadFloat3
+(
+    CONST XMFLOAT3* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    XMASSERT(pSource);
+
+    ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0];
+    ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0];
+    ((UINT *)(&V.vector4_f32[2]))[0] = ((const UINT *)(&pSource->z))[0];
+    return V;
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMASSERT(pSource);
+
+#ifdef _XM_ISVS2005_
+    // This reads 1 float past the end of the structure; the extra value should be ignored.
+    // Need to continue to do this for VS 2005 due to a compiler issue, but prefer the new method
+    // to avoid triggering issues with memory debug tools (like AV)
+    return _mm_loadu_ps( &pSource->x );
+#else
+    __m128 x = _mm_load_ss( &pSource->x );
+    __m128 y = _mm_load_ss( &pSource->y );
+    __m128 z = _mm_load_ss( &pSource->z );
+    __m128 xy = _mm_unpacklo_ps( x, y );
+    return _mm_movelh_ps( xy, z );
+#endif // !_XM_ISVS2005_
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadFloat3A
+(
+    CONST XMFLOAT3A* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR V;
+
+    XMASSERT(pSource);
+    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
+
+    V.vector4_f32[0] = pSource->x;
+    V.vector4_f32[1] = pSource->y;
+    V.vector4_f32[2] = pSource->z;
+
+    return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMASSERT(pSource);
+    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
+
+    // This reads 1 float past the end of the structure; the extra value should be ignored.
+ return _mm_load_ps( &pSource->x ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadUHenDN3 +( + CONST XMUHENDN3* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + UINT Element; + + XMASSERT(pSource); + + Element = pSource->v & 0x7FF; + V.vector4_f32[0] = (FLOAT)Element / 2047.0f; + Element = (pSource->v >> 11) & 0x7FF; + V.vector4_f32[1] = (FLOAT)Element / 2047.0f; + Element = (pSource->v >> 22) & 0x3FF; + V.vector4_f32[2] = (FLOAT)Element / 1023.0f; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 UHenDN3Mul = {1.0f/2047.0f,1.0f/(2047.0f*2048.0f),1.0f/(1023.0f*2048.0f*2048.0f),0}; + XMASSERT(pSource); + // Get the 32 bit value and splat it + XMVECTOR vResult = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Mask off x, y and z + vResult = _mm_and_ps(vResult,g_XMMaskHenD3); + // Convert x and y to unsigned + vResult = _mm_xor_ps(vResult,g_XMFlipZ); + // Convert to float + vResult = _mm_cvtepi32_ps(reinterpret_cast(&vResult)[0]); + // Convert x and y back to signed + vResult = _mm_add_ps(vResult,g_XMAddUHenD3); + // Normalize x,y and z to -1.0f-1.0f + vResult = _mm_mul_ps(vResult,UHenDN3Mul); + return vResult; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadUHenD3 +( + CONST XMUHEND3* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + UINT Element; + + XMASSERT(pSource); + + Element = pSource->v & 0x7FF; + V.vector4_f32[0] = (FLOAT)Element; + Element = (pSource->v >> 11) & 0x7FF; + V.vector4_f32[1] = (FLOAT)Element; + Element = (pSource->v >> 22) & 0x3FF; + V.vector4_f32[2] = (FLOAT)Element; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + // Get the 32 bit value and splat it + XMVECTOR vResult = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Mask off x, y and z + vResult = _mm_and_ps(vResult,g_XMMaskHenD3); + // Convert x and y to unsigned + vResult = _mm_xor_ps(vResult,g_XMFlipZ); + // Convert to float + vResult = _mm_cvtepi32_ps(reinterpret_cast(&vResult)[0]); + // Convert x and y back to signed + vResult = _mm_add_ps(vResult,g_XMAddUHenD3); + // Normalize x and y to -1024-1023.0f and z to -512-511.0f + vResult = _mm_mul_ps(vResult,g_XMMulHenD3); + return vResult; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadHenDN3 +( + CONST XMHENDN3* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + UINT Element; + static CONST UINT SignExtendXY[] = {0x00000000, 0xFFFFF800}; + static CONST UINT SignExtendZ[] = {0x00000000, 0xFFFFFC00}; + + XMASSERT(pSource); + XMASSERT((pSource->v & 0x7FF) != 0x400); + XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400); + XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200); + + Element = pSource->v & 0x7FF; + V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]) / 1023.0f; + Element = (pSource->v >> 11) & 0x7FF; + V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]) / 1023.0f; + Element = (pSource->v >> 22) & 0x3FF; + V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendZ[Element >> 9]) / 511.0f; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 HenDN3Mul = 
{1.0f/1023.0f,1.0f/(1023.0f*2048.0f),1.0f/(511.0f*2048.0f*2048.0f),0}; + XMASSERT(pSource); + XMASSERT((pSource->v & 0x7FF) != 0x400); + XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400); + XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200); + // Get the 32 bit value and splat it + XMVECTOR vResult = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Mask off x, y and z + vResult = _mm_and_ps(vResult,g_XMMaskHenD3); + // Convert x and y to unsigned + vResult = _mm_xor_ps(vResult,g_XMXorHenD3); + // Convert to float + vResult = _mm_cvtepi32_ps(reinterpret_cast(&vResult)[0]); + // Convert x and y back to signed + vResult = _mm_add_ps(vResult,g_XMAddHenD3); + // Normalize x,y and z to -1.0f-1.0f + vResult = _mm_mul_ps(vResult,HenDN3Mul); + return vResult; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadHenD3 +( + CONST XMHEND3* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + UINT Element; + static CONST UINT SignExtendXY[] = {0x00000000, 0xFFFFF800}; + static CONST UINT SignExtendZ[] = {0x00000000, 0xFFFFFC00}; + + XMASSERT(pSource); + XMASSERT((pSource->v & 0x7FF) != 0x400); + XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400); + XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200); + + Element = pSource->v & 0x7FF; + V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]); + Element = (pSource->v >> 11) & 0x7FF; + V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]); + Element = (pSource->v >> 22) & 0x3FF; + V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendZ[Element >> 9]); + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + XMASSERT((pSource->v & 0x7FF) != 0x400); + XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400); + XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200); + // Get the 32 bit value and splat it + XMVECTOR vResult = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Mask off x, y and z + vResult = _mm_and_ps(vResult,g_XMMaskHenD3); + // Convert x and y to unsigned + vResult = _mm_xor_ps(vResult,g_XMXorHenD3); + // Convert to float + vResult = _mm_cvtepi32_ps(reinterpret_cast(&vResult)[0]); + // Convert x and y back to signed + vResult = _mm_add_ps(vResult,g_XMAddHenD3); + // Normalize x and y to -1024-1023.0f and z to -512-511.0f + vResult = _mm_mul_ps(vResult,g_XMMulHenD3); + return vResult; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadUDHenN3 +( + CONST XMUDHENN3* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + UINT Element; + + XMASSERT(pSource); + + Element = pSource->v & 0x3FF; + V.vector4_f32[0] = (FLOAT)Element / 1023.0f; + Element = (pSource->v >> 10) & 0x7FF; + V.vector4_f32[1] = (FLOAT)Element / 2047.0f; + Element = (pSource->v >> 21) & 0x7FF; + V.vector4_f32[2] = (FLOAT)Element / 2047.0f; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 UDHenN3Mul = {1.0f/1023.0f,1.0f/(2047.0f*1024.0f),1.0f/(2047.0f*1024.0f*2048.0f),0}; + XMASSERT(pSource); + // Get the 32 bit value and splat it + XMVECTOR vResult = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Mask off x, y and z + vResult = _mm_and_ps(vResult,g_XMMaskDHen3); + // Convert x and y to unsigned + vResult = _mm_xor_ps(vResult,g_XMFlipZ); + // Convert to float + vResult = 
_mm_cvtepi32_ps(reinterpret_cast(&vResult)[0]); + // Convert x and y back to signed + vResult = _mm_add_ps(vResult,g_XMAddUHenD3); + // Normalize x,y and z to -1.0f-1.0f + vResult = _mm_mul_ps(vResult,UDHenN3Mul); + return vResult; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadUDHen3 +( + CONST XMUDHEN3* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + UINT Element; + + XMASSERT(pSource); + + Element = pSource->v & 0x3FF; + V.vector4_f32[0] = (FLOAT)Element; + Element = (pSource->v >> 10) & 0x7FF; + V.vector4_f32[1] = (FLOAT)Element; + Element = (pSource->v >> 21) & 0x7FF; + V.vector4_f32[2] = (FLOAT)Element; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + // Get the 32 bit value and splat it + XMVECTOR vResult = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Mask off x, y and z + vResult = _mm_and_ps(vResult,g_XMMaskDHen3); + // Convert x and y to unsigned + vResult = _mm_xor_ps(vResult,g_XMFlipZ); + // Convert to float + vResult = _mm_cvtepi32_ps(reinterpret_cast(&vResult)[0]); + // Convert x and y back to signed + vResult = _mm_add_ps(vResult,g_XMAddUHenD3); + // Normalize x to 0-1023.0f and y and z to 0-2047.0f + vResult = _mm_mul_ps(vResult,g_XMMulDHen3); + return vResult; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadDHenN3 +( + CONST XMDHENN3* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + UINT Element; + static CONST UINT SignExtendX[] = {0x00000000, 0xFFFFFC00}; + static CONST UINT SignExtendYZ[] = {0x00000000, 0xFFFFF800}; + + XMASSERT(pSource); + XMASSERT((pSource->v & 0x3FF) != 0x200); + XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400); + XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400); + + Element = pSource->v & 0x3FF; + V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendX[Element >> 9]) / 511.0f; + Element = (pSource->v >> 10) & 0x7FF; + V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]) / 1023.0f; + Element = (pSource->v >> 21) & 0x7FF; + V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]) / 1023.0f; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 DHenN3Mul = {1.0f/511.0f,1.0f/(1023.0f*1024.0f),1.0f/(1023.0f*1024.0f*2048.0f),0}; + XMASSERT(pSource); + XMASSERT((pSource->v & 0x3FF) != 0x200); + XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400); + XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400); + // Get the 32 bit value and splat it + XMVECTOR vResult = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Mask off x, y and z + vResult = _mm_and_ps(vResult,g_XMMaskDHen3); + // Convert x and y to unsigned + vResult = _mm_xor_ps(vResult,g_XMXorDHen3); + // Convert to float + vResult = _mm_cvtepi32_ps(reinterpret_cast(&vResult)[0]); + // Convert x and y back to signed + vResult = _mm_add_ps(vResult,g_XMAddDHen3); + // Normalize x,y and z to -1.0f-1.0f + vResult = _mm_mul_ps(vResult,DHenN3Mul); + return vResult; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadDHen3 +( + CONST XMDHEN3* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + UINT Element; + static CONST UINT SignExtendX[] = 
{0x00000000, 0xFFFFFC00}; + static CONST UINT SignExtendYZ[] = {0x00000000, 0xFFFFF800}; + + XMASSERT(pSource); + XMASSERT((pSource->v & 0x3FF) != 0x200); + XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400); + XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400); + + Element = pSource->v & 0x3FF; + V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendX[Element >> 9]); + Element = (pSource->v >> 10) & 0x7FF; + V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]); + Element = (pSource->v >> 21) & 0x7FF; + V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]); + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + XMASSERT((pSource->v & 0x3FF) != 0x200); + XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400); + XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400); + // Get the 32 bit value and splat it + XMVECTOR vResult = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Mask off x, y and z + vResult = _mm_and_ps(vResult,g_XMMaskDHen3); + // Convert x and y to unsigned + vResult = _mm_xor_ps(vResult,g_XMXorDHen3); + // Convert to float + vResult = _mm_cvtepi32_ps(reinterpret_cast(&vResult)[0]); + // Convert x and y back to signed + vResult = _mm_add_ps(vResult,g_XMAddDHen3); + // Normalize x to -210-511.0f and y and z to -1024-1023.0f + vResult = _mm_mul_ps(vResult,g_XMMulDHen3); + return vResult; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadU565 +( + CONST XMU565* pSource +) +{ +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + static const XMVECTORI32 U565And = {0x1F,0x3F<<5,0x1F<<11,0}; + static const XMVECTORF32 U565Mul = {1.0f,1.0f/32.0f,1.0f/2048.f,0}; + XMASSERT(pSource); + // Get the 32 bit value and splat it + XMVECTOR vResult = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Mask off x, y and z + vResult = _mm_and_ps(vResult,U565And); + // Convert to float + vResult = _mm_cvtepi32_ps(reinterpret_cast(&vResult)[0]); + // Normalize x, y, and z + vResult = _mm_mul_ps(vResult,U565Mul); + return vResult; +#else + XMVECTOR V; + UINT Element; + + XMASSERT(pSource); + + Element = pSource->v & 0x1F; + V.vector4_f32[0] = (FLOAT)Element; + Element = (pSource->v >> 5) & 0x3F; + V.vector4_f32[1] = (FLOAT)Element; + Element = (pSource->v >> 11) & 0x1F; + V.vector4_f32[2] = (FLOAT)Element; + + return V; +#endif // !_XM_SSE_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadFloat3PK +( + CONST XMFLOAT3PK* pSource +) +{ + _DECLSPEC_ALIGN_16_ UINT Result[4]; + UINT Mantissa; + UINT Exponent; + + XMASSERT(pSource); + + // X Channel (6-bit mantissa) + Mantissa = pSource->xm; + + if ( pSource->xe == 0x1f ) // INF or NAN + { + Result[0] = 0x7f800000 | (pSource->xm << 17); + } + else + { + if ( pSource->xe != 0 ) // The value is normalized + { + Exponent = pSource->xe; + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x40) == 0); + + Mantissa &= 0x3F; + } + else // The value is zero + { + Exponent = (UINT)-112; + } + + Result[0] = ((Exponent + 112) << 23) | (Mantissa << 17); + } + + // Y Channel (6-bit mantissa) + Mantissa = pSource->ym; + + if ( pSource->ye == 0x1f ) // INF or NAN + { + Result[1] = 0x7f800000 | (pSource->ym << 17); + } + else + { + if ( pSource->ye != 0 ) // The 
value is normalized + { + Exponent = pSource->ye; + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x40) == 0); + + Mantissa &= 0x3F; + } + else // The value is zero + { + Exponent = (UINT)-112; + } + + Result[1] = ((Exponent + 112) << 23) | (Mantissa << 17); + } + + // Z Channel (5-bit mantissa) + Mantissa = pSource->zm; + + if ( pSource->ze == 0x1f ) // INF or NAN + { + Result[2] = 0x7f800000 | (pSource->zm << 17); + } + else + { + if ( pSource->ze != 0 ) // The value is normalized + { + Exponent = pSource->ze; + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x20) == 0); + + Mantissa &= 0x1F; + } + else // The value is zero + { + Exponent = (UINT)-112; + } + + Result[2] = ((Exponent + 112) << 23) | (Mantissa << 18); + } + + return XMLoadFloat3A( (const XMFLOAT3A*)&Result ); +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadFloat3SE +( + CONST XMFLOAT3SE* pSource +) +{ + _DECLSPEC_ALIGN_16_ UINT Result[4]; + UINT Mantissa; + UINT Exponent, ExpBits; + + XMASSERT(pSource); + + if ( pSource->e == 0x1f ) // INF or NAN + { + Result[0] = 0x7f800000 | (pSource->xm << 14); + Result[1] = 0x7f800000 | (pSource->ym << 14); + Result[2] = 0x7f800000 | (pSource->zm << 14); + } + else if ( pSource->e != 0 ) // The values are all normalized + { + Exponent = pSource->e; + + ExpBits = (Exponent + 112) << 23; + + Mantissa = pSource->xm; + Result[0] = ExpBits | (Mantissa << 14); + + Mantissa = pSource->ym; + Result[1] = ExpBits | (Mantissa << 14); + + Mantissa = pSource->zm; + Result[2] = ExpBits | (Mantissa << 14); + } + else + { + // X Channel + Mantissa = pSource->xm; + + if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x200) == 0); + + Mantissa &= 0x1FF; + } + else // The value is zero + { + Exponent = (UINT)-112; + } + + Result[0] = ((Exponent + 112) << 23) | (Mantissa << 14); + + // Y Channel + Mantissa = pSource->ym; + + if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x200) == 0); + + Mantissa &= 0x1FF; + } + else // The value is zero + { + Exponent = (UINT)-112; + } + + Result[1] = ((Exponent + 112) << 23) | (Mantissa << 14); + + // Z Channel + Mantissa = pSource->zm; + + if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x200) == 0); + + Mantissa &= 0x1FF; + } + else // The value is zero + { + Exponent = (UINT)-112; + } + + Result[2] = ((Exponent + 112) << 23) | (Mantissa << 14); + } + + return XMLoadFloat3A( (const XMFLOAT3A*)&Result ); +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadInt4 +( + CONST UINT* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + + XMASSERT(pSource); + + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = pSource[3]; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + + 
XMASSERT(pSource); + + __m128i V = _mm_loadu_si128( (const __m128i*)pSource ); + return reinterpret_cast<__m128 *>(&V)[0]; + +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadSInt4 +( + CONST XMINT4* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + XMASSERT(pSource); + +#ifdef _XBOX_VER + V = XMLoadInt4( (const UINT*)pSource ); + return XMConvertVectorIntToFloat( V, 0 ); +#else + V.vector4_f32[0] = (float)pSource->x; + V.vector4_f32[1] = (float)pSource->y; + V.vector4_f32[2] = (float)pSource->z; + V.vector4_f32[3] = (float)pSource->w; + return V; +#endif + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + __m128i V = _mm_loadu_si128( (const __m128i*)pSource ); + return _mm_cvtepi32_ps(V); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadUInt4 +( + CONST XMUINT4* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + XMASSERT(pSource); + + V.vector4_f32[0] = (float)pSource->x; + V.vector4_f32[1] = (float)pSource->y; + V.vector4_f32[2] = (float)pSource->z; + V.vector4_f32[3] = (float)pSource->w; + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + __m128i V = _mm_loadu_si128( (const __m128i*)pSource ); + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. + XMVECTOR vMask = _mm_and_ps(reinterpret_cast(&V)[0],g_XMNegativeZero); + // Force all values positive + XMVECTOR vResult = _mm_xor_ps(reinterpret_cast(&V)[0],vMask); + // Convert to floats + vResult = _mm_cvtepi32_ps(reinterpret_cast(&vResult)[0]); + // Convert 0x80000000 -> 0xFFFFFFFF + __m128i iMask = _mm_srai_epi32(reinterpret_cast(&vMask)[0],31); + // For only the ones that are too big, add the fixup + vMask = _mm_and_ps(reinterpret_cast(&iMask)[0],g_XMFixUnsigned); + vResult = _mm_add_ps(vResult,vMask); + return vResult; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadInt4A +( + CONST UINT* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + + XMASSERT(pSource); + XMASSERT(((UINT_PTR)pSource & 0xF) == 0); + + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = pSource[3]; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + + XMASSERT(pSource); + XMASSERT(((UINT_PTR)pSource & 0xF) == 0); + + __m128i V = _mm_load_si128( (const __m128i*)pSource ); + return reinterpret_cast<__m128 *>(&V)[0]; + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadFloat4 +( + CONST XMFLOAT4* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + XMASSERT(pSource); + + ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0]; + ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0]; + ((UINT *)(&V.vector4_f32[2]))[0] = ((const UINT *)(&pSource->z))[0]; + ((UINT *)(&V.vector4_f32[3]))[0] = ((const UINT *)(&pSource->w))[0]; + return V; +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + + return _mm_loadu_ps( &pSource->x ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 
+#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadFloat4A +( + CONST XMFLOAT4A* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + + XMASSERT(pSource); + XMASSERT(((UINT_PTR)pSource & 0xF) == 0); + + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = pSource->z; + V.vector4_f32[3] = pSource->w; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + + XMASSERT(pSource); + XMASSERT(((UINT_PTR)pSource & 0xF) == 0); + + return _mm_load_ps( &pSource->x ); + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadHalf4 +( + CONST XMHALF4* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMASSERT(pSource); + { + XMVECTOR vResult = { + XMConvertHalfToFloat(pSource->x), + XMConvertHalfToFloat(pSource->y), + XMConvertHalfToFloat(pSource->z), + XMConvertHalfToFloat(pSource->w) + }; + return vResult; + } +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + XMVECTOR vResult = { + XMConvertHalfToFloat(pSource->x), + XMConvertHalfToFloat(pSource->y), + XMConvertHalfToFloat(pSource->z), + XMConvertHalfToFloat(pSource->w) + }; + return vResult; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadShortN4 +( + CONST XMSHORTN4* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMASSERT(pSource); + { + XMVECTOR vResult = { + (pSource->x == -32768) ? -1.f : ((FLOAT)pSource->x * (1.0f/32767.0f)), + (pSource->y == -32768) ? -1.f : ((FLOAT)pSource->y * (1.0f/32767.0f)), + (pSource->z == -32768) ? -1.f : ((FLOAT)pSource->z * (1.0f/32767.0f)), + (pSource->w == -32768) ? -1.f : ((FLOAT)pSource->w * (1.0f/32767.0f)) + }; + return vResult; + } +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + // Splat the color in all four entries (x,z,y,w) + __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); + // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 + __m128 vTemp = _mm_and_ps(reinterpret_cast(&vIntd)[0],g_XMMaskX16Y16Z16W16); + // x and z are unsigned! Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(reinterpret_cast(&vTemp)[0]); + // x and z - 0x8000 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16); + // Convert to -1.0f - 1.0f + vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16Z16W16); + // Very important! 
The entries are x,z,y,w, flip it to x,y,z,w + vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0)); + // Clamp result (for case of -32768) + return _mm_max_ps( vTemp, g_XMNegativeOne ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadShort4 +( + CONST XMSHORT4* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + + XMASSERT(pSource); + + V.vector4_f32[0] = (FLOAT)pSource->x; + V.vector4_f32[1] = (FLOAT)pSource->y; + V.vector4_f32[2] = (FLOAT)pSource->z; + V.vector4_f32[3] = (FLOAT)pSource->w; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + // Splat the color in all four entries (x,z,y,w) + __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); + // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 + __m128 vTemp = _mm_and_ps(reinterpret_cast(&vIntd)[0],g_XMMaskX16Y16Z16W16); + // x and z are unsigned! Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(reinterpret_cast(&vTemp)[0]); + // x and z - 0x8000 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16); + // Fix y and w because they are 65536 too large + vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16); + // Very important! The entries are x,z,y,w, flip it to x,y,z,w + return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0)); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadUShortN4 +( + CONST XMUSHORTN4* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + + XMASSERT(pSource); + + V.vector4_f32[0] = (FLOAT)pSource->x / 65535.0f; + V.vector4_f32[1] = (FLOAT)pSource->y / 65535.0f; + V.vector4_f32[2] = (FLOAT)pSource->z / 65535.0f; + V.vector4_f32[3] = (FLOAT)pSource->w / 65535.0f; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + static const XMVECTORF32 FixupY16W16 = {1.0f/65535.0f,1.0f/65535.0f,1.0f/(65535.0f*65536.0f),1.0f/(65535.0f*65536.0f)}; + static const XMVECTORF32 FixaddY16W16 = {0,0,32768.0f*65536.0f,32768.0f*65536.0f}; + XMASSERT(pSource); + // Splat the color in all four entries (x,z,y,w) + __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); + // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 + __m128 vTemp = _mm_and_ps(reinterpret_cast(&vIntd)[0],g_XMMaskX16Y16Z16W16); + // y and w are signed! Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp,g_XMFlipZW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(reinterpret_cast(&vTemp)[0]); + // y and w + 0x8000 to complete the conversion + vTemp = _mm_add_ps(vTemp,FixaddY16W16); + // Fix y and w because they are 65536 too large + vTemp = _mm_mul_ps(vTemp,FixupY16W16); + // Very important! 
The entries are x,z,y,w, flip it to x,y,z,w + return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0)); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadUShort4 +( + CONST XMUSHORT4* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + + XMASSERT(pSource); + + V.vector4_f32[0] = (FLOAT)pSource->x; + V.vector4_f32[1] = (FLOAT)pSource->y; + V.vector4_f32[2] = (FLOAT)pSource->z; + V.vector4_f32[3] = (FLOAT)pSource->w; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + static const XMVECTORF32 FixaddY16W16 = {0,0,32768.0f,32768.0f}; + XMASSERT(pSource); + // Splat the color in all four entries (x,z,y,w) + __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); + // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 + __m128 vTemp = _mm_and_ps(reinterpret_cast(&vIntd)[0],g_XMMaskX16Y16Z16W16); + // y and w are signed! Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp,g_XMFlipZW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(reinterpret_cast(&vTemp)[0]); + // Fix y and w because they are 65536 too large + vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16); + // y and w + 0x8000 to complete the conversion + vTemp = _mm_add_ps(vTemp,FixaddY16W16); + // Very important! The entries are x,z,y,w, flip it to x,y,z,w + return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0)); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadXIcoN4 +( + CONST XMXICON4* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + UINT Element; + static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000}; + + XMASSERT(pSource); + XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull); + XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull); + XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull); + + Element = (UINT)(pSource->v & 0xFFFFF); + V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f; + Element = (UINT)((pSource->v >> 20) & 0xFFFFF); + V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f; + Element = (UINT)((pSource->v >> 40) & 0xFFFFF); + V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f; + V.vector4_f32[3] = (FLOAT)(pSource->v >> 60) / 15.0f; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull); + XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull); + XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull); + static const XMVECTORF32 LoadXIcoN4Mul = {1.0f/524287.0f,1.0f/(524287.0f*4096.0f),1.0f/524287.0f,1.0f/(15.0f*4096.0f*65536.0f)}; + XMASSERT(pSource); + // Grab the 64 bit structure + __m128d vResultd = _mm_load_sd(reinterpret_cast(&pSource->v)); + // By shifting down 8 bits, y and z are in seperate 32 bit elements + __m128i vResulti = _mm_srli_si128(reinterpret_cast(&vResultd)[0],8/8); + // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z + XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast(&vResultd)[0],reinterpret_cast(&vResulti)[0],_MM_SHUFFLE(1,0,1,0)); + // Fix the entries to x,y,z,w + vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0)); + // Mask x,y,z and w + vTemp = _mm_and_ps(vTemp,g_XMMaskIco4); + // x and z are unsigned! 
Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMXorXIco4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(reinterpret_cast(&vTemp)[0]); + // x and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddXIco4); + // Fix y and w because they are too large + vTemp = _mm_mul_ps(vTemp,LoadXIcoN4Mul); + return vTemp; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadXIco4 +( + CONST XMXICO4* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + UINT Element; + static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000}; + + XMASSERT(pSource); + XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull); + XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull); + XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull); + + Element = (UINT)(pSource->v & 0xFFFFF); + V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]); + Element = (UINT)((pSource->v >> 20) & 0xFFFFF); + V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]); + Element = (UINT)((pSource->v >> 40) & 0xFFFFF); + V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]); + V.vector4_f32[3] = (FLOAT)(pSource->v >> 60); + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull); + XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull); + XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull); + XMASSERT(pSource); + // Grab the 64 bit structure + __m128d vResultd = _mm_load_sd(reinterpret_cast(&pSource->v)); + // By shifting down 8 bits, y and z are in seperate 32 bit elements + __m128i vResulti = _mm_srli_si128(reinterpret_cast(&vResultd)[0],8/8); + // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z + XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast(&vResultd)[0],reinterpret_cast(&vResulti)[0],_MM_SHUFFLE(1,0,1,0)); + // Fix the entries to x,y,z,w + vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0)); + // Mask x,y,z and w + vTemp = _mm_and_ps(vTemp,g_XMMaskIco4); + // x and z are unsigned! 
Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMXorXIco4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(reinterpret_cast(&vTemp)[0]); + // x and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddXIco4); + // Fix y and w because they are too large + vTemp = _mm_mul_ps(vTemp,g_XMMulIco4); + return vTemp; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadUIcoN4 +( + CONST XMUICON4* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + + XMASSERT(pSource); + + V.vector4_f32[0] = (FLOAT)(pSource->v & 0xFFFFF) / 1048575.0f; + V.vector4_f32[1] = (FLOAT)((pSource->v >> 20) & 0xFFFFF) / 1048575.0f; + V.vector4_f32[2] = (FLOAT)((pSource->v >> 40) & 0xFFFFF) / 1048575.0f; + V.vector4_f32[3] = (FLOAT)(pSource->v >> 60) / 15.0f; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 LoadUIcoN4Mul = {1.0f/1048575.0f,1.0f/(1048575.0f*4096.0f),1.0f/1048575.0f,1.0f/(15.0f*4096.0f*65536.0f)}; + XMASSERT(pSource); + // Grab the 64 bit structure + __m128d vResultd = _mm_load_sd(reinterpret_cast(&pSource->v)); + // By shifting down 8 bits, y and z are in seperate 32 bit elements + __m128i vResulti = _mm_srli_si128(reinterpret_cast(&vResultd)[0],8/8); + // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z + XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast(&vResultd)[0],reinterpret_cast(&vResulti)[0],_MM_SHUFFLE(1,0,1,0)); + // Fix the entries to x,y,z,w + vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0)); + // Mask x,y,z and w + vTemp = _mm_and_ps(vTemp,g_XMMaskIco4); + // x and z are unsigned! Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMFlipYW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(reinterpret_cast(&vTemp)[0]); + // x and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddUIco4); + // Fix y and w because they are too large + vTemp = _mm_mul_ps(vTemp,LoadUIcoN4Mul); + return vTemp; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadUIco4 +( + CONST XMUICO4* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + + XMASSERT(pSource); + + V.vector4_f32[0] = (FLOAT)(pSource->v & 0xFFFFF); + V.vector4_f32[1] = (FLOAT)((pSource->v >> 20) & 0xFFFFF); + V.vector4_f32[2] = (FLOAT)((pSource->v >> 40) & 0xFFFFF); + V.vector4_f32[3] = (FLOAT)(pSource->v >> 60); + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + // Grab the 64 bit structure + __m128d vResultd = _mm_load_sd(reinterpret_cast(&pSource->v)); + // By shifting down 8 bits, y and z are in seperate 32 bit elements + __m128i vResulti = _mm_srli_si128(reinterpret_cast(&vResultd)[0],8/8); + // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z + XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast(&vResultd)[0],reinterpret_cast(&vResulti)[0],_MM_SHUFFLE(1,0,1,0)); + // Fix the entries to x,y,z,w + vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0)); + // Mask x,y,z and w + vTemp = _mm_and_ps(vTemp,g_XMMaskIco4); + // x and z are unsigned! 
Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMFlipYW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(reinterpret_cast(&vTemp)[0]); + // x and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddUIco4); + // Fix y and w because they are too large + vTemp = _mm_mul_ps(vTemp,g_XMMulIco4); + return vTemp; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadIcoN4 +( + CONST XMICON4* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + UINT Element; + static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000}; + static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFF0}; + + XMASSERT(pSource); + + Element = (UINT)(pSource->v & 0xFFFFF); + V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f; + Element = (UINT)((pSource->v >> 20) & 0xFFFFF); + V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f; + Element = (UINT)((pSource->v >> 40) & 0xFFFFF); + V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f; + Element = (UINT)(pSource->v >> 60); + V.vector4_f32[3] = (FLOAT)(INT)(Element | SignExtendW[Element >> 3]) / 7.0f; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 LoadIcoN4Mul = {1.0f/524287.0f,1.0f/(524287.0f*4096.0f),1.0f/524287.0f,1.0f/(7.0f*4096.0f*65536.0f)}; + XMASSERT(pSource); + // Grab the 64 bit structure + __m128d vResultd = _mm_load_sd(reinterpret_cast(&pSource->v)); + // By shifting down 8 bits, y and z are in seperate 32 bit elements + __m128i vResulti = _mm_srli_si128(reinterpret_cast(&vResultd)[0],8/8); + // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z + XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast(&vResultd)[0],reinterpret_cast(&vResulti)[0],_MM_SHUFFLE(1,0,1,0)); + // Fix the entries to x,y,z,w + vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0)); + // Mask x,y,z and w + vTemp = _mm_and_ps(vTemp,g_XMMaskIco4); + // x and z are unsigned! 
Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMXorIco4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(reinterpret_cast(&vTemp)[0]); + // x and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddIco4); + // Fix y and w because they are too large + vTemp = _mm_mul_ps(vTemp,LoadIcoN4Mul); + return vTemp; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadIco4 +( + CONST XMICO4* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + UINT Element; + static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000}; + static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFF0}; + + XMASSERT(pSource); + + Element = (UINT)(pSource->v & 0xFFFFF); + V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]); + Element = (UINT)((pSource->v >> 20) & 0xFFFFF); + V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]); + Element = (UINT)((pSource->v >> 40) & 0xFFFFF); + V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]); + Element = (UINT)(pSource->v >> 60); + V.vector4_f32[3] = (FLOAT)(INT)(Element | SignExtendW[Element >> 3]); + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + // Grab the 64 bit structure + __m128d vResultd = _mm_load_sd(reinterpret_cast(&pSource->v)); + // By shifting down 8 bits, y and z are in seperate 32 bit elements + __m128i vResulti = _mm_srli_si128(reinterpret_cast(&vResultd)[0],8/8); + // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z + XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast(&vResultd)[0],reinterpret_cast(&vResulti)[0],_MM_SHUFFLE(1,0,1,0)); + // Fix the entries to x,y,z,w + vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0)); + // Mask x,y,z and w + vTemp = _mm_and_ps(vTemp,g_XMMaskIco4); + // x and z are unsigned! 
Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMXorIco4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(reinterpret_cast(&vTemp)[0]); + // x and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddIco4); + // Fix y and w because they are too large + vTemp = _mm_mul_ps(vTemp,g_XMMulIco4); + return vTemp; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadXDecN4 +( + CONST XMXDECN4* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + UINT Element; + static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00}; + + XMASSERT(pSource); + XMASSERT((pSource->v & 0x3FF) != 0x200); + XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200); + XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200); + + Element = pSource->v & 0x3FF; + V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f; + Element = (pSource->v >> 10) & 0x3FF; + V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f; + Element = (pSource->v >> 20) & 0x3FF; + V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f; + V.vector4_f32[3] = (FLOAT)(pSource->v >> 30) / 3.0f; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + // Splat the color in all four entries + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskA2B10G10R10); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMFlipA2B10G10R10); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(reinterpret_cast(&vTemp)[0]); + // RGB + 0, A + 0x80000000.f to undo the signed order. + vTemp = _mm_add_ps(vTemp,g_XMFixAA2B10G10R10); + // Convert 0-255 to 0.0f-1.0f + return _mm_mul_ps(vTemp,g_XMNormalizeA2B10G10R10); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadXDec4 +( + CONST XMXDEC4* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + UINT Element; + static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00}; + + XMASSERT(pSource); + XMASSERT((pSource->v & 0x3FF) != 0x200); + XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200); + XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200); + + Element = pSource->v & 0x3FF; + V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]); + Element = (pSource->v >> 10) & 0x3FF; + V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]); + Element = (pSource->v >> 20) & 0x3FF; + V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]); + V.vector4_f32[3] = (FLOAT)(pSource->v >> 30); + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT((pSource->v & 0x3FF) != 0x200); + XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200); + XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200); + static const XMVECTORI32 XDec4Xor = {0x200, 0x200<<10, 0x200<<20, 0x80000000}; + static const XMVECTORF32 XDec4Add = {-512.0f,-512.0f*1024.0f,-512.0f*1024.0f*1024.0f,32768*65536.0f}; + XMASSERT(pSource); + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); + // a is unsigned! 
Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp,XDec4Xor); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(reinterpret_cast(&vTemp)[0]); + // RGB + 0, A + 0x80000000.f to undo the signed order. + vTemp = _mm_add_ps(vTemp,XDec4Add); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp,g_XMMulDec4); + return vTemp; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadUDecN4 +( + CONST XMUDECN4* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + UINT Element; + + XMASSERT(pSource); + + Element = pSource->v & 0x3FF; + V.vector4_f32[0] = (FLOAT)Element / 1023.0f; + Element = (pSource->v >> 10) & 0x3FF; + V.vector4_f32[1] = (FLOAT)Element / 1023.0f; + Element = (pSource->v >> 20) & 0x3FF; + V.vector4_f32[2] = (FLOAT)Element / 1023.0f; + V.vector4_f32[3] = (FLOAT)(pSource->v >> 30) / 3.0f; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + static const XMVECTORF32 UDecN4Mul = {1.0f/1023.0f,1.0f/(1023.0f*1024.0f),1.0f/(1023.0f*1024.0f*1024.0f),1.0f/(3.0f*1024.0f*1024.0f*1024.0f)}; + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(reinterpret_cast(&vTemp)[0]); + // RGB + 0, A + 0x80000000.f to undo the signed order. + vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp,UDecN4Mul); + return vTemp; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadUDec4 +( + CONST XMUDEC4* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + UINT Element; + + XMASSERT(pSource); + + Element = pSource->v & 0x3FF; + V.vector4_f32[0] = (FLOAT)Element; + Element = (pSource->v >> 10) & 0x3FF; + V.vector4_f32[1] = (FLOAT)Element; + Element = (pSource->v >> 20) & 0x3FF; + V.vector4_f32[2] = (FLOAT)Element; + V.vector4_f32[3] = (FLOAT)(pSource->v >> 30); + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(reinterpret_cast(&vTemp)[0]); + // RGB + 0, A + 0x80000000.f to undo the signed order. 
+    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
+    // Scale y, z and w back down from their bit positions
+    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
+    return vTemp;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadDecN4
+(
+    CONST XMDECN4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR V;
+    UINT Element;
+    static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00};
+    static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFFC};
+
+    XMASSERT(pSource);
+    XMASSERT((pSource->v & 0x3FF) != 0x200);
+    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
+    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
+    XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);
+
+    Element = pSource->v & 0x3FF;
+    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
+    Element = (pSource->v >> 10) & 0x3FF;
+    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
+    Element = (pSource->v >> 20) & 0x3FF;
+    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
+    Element = pSource->v >> 30;
+    V.vector4_f32[3] = (FLOAT)(SHORT)(Element | SignExtendW[Element >> 1]);
+
+    return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMASSERT(pSource);
+    XMASSERT((pSource->v & 0x3FF) != 0x200);
+    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
+    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
+    XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);
+    static const XMVECTORF32 DecN4Mul = {1.0f/511.0f,1.0f/(511.0f*1024.0f),1.0f/(511.0f*1024.0f*1024.0f),1.0f/(1024.0f*1024.0f*1024.0f)};
+    // Splat the 32 bit value in all four entries
+    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
+    // Mask off the 10 bit x, y and z fields and the 2 bit w field
+    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
+    // x, y and z are signed! Flip their sign bits so the fields read as unsigned
+    vTemp = _mm_xor_ps(vTemp,g_XMXorDec4);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
+    // Subtract the bias from x, y and z to restore their signed values.
+ vTemp = _mm_add_ps(vTemp,g_XMAddDec4); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp,DecN4Mul); + return vTemp; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadDec4 +( + CONST XMDEC4* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + UINT Element; + static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00}; + static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFFC}; + + XMASSERT(pSource); + XMASSERT((pSource->v & 0x3FF) != 0x200); + XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200); + XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200); + XMASSERT(((pSource->v >> 30) & 0x3) != 0x2); + + Element = pSource->v & 0x3FF; + V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]); + Element = (pSource->v >> 10) & 0x3FF; + V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]); + Element = (pSource->v >> 20) & 0x3FF; + V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]); + Element = pSource->v >> 30; + V.vector4_f32[3] = (FLOAT)(SHORT)(Element | SignExtendW[Element >> 1]); + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT((pSource->v & 0x3FF) != 0x200); + XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200); + XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200); + XMASSERT(((pSource->v >> 30) & 0x3) != 0x2); + XMASSERT(pSource); + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMXorDec4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(reinterpret_cast(&vTemp)[0]); + // RGB + 0, A + 0x80000000.f to undo the signed order. + vTemp = _mm_add_ps(vTemp,g_XMAddDec4); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp,g_XMMulDec4); + return vTemp; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadUByteN4 +( + CONST XMUBYTEN4* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + + XMASSERT(pSource); + + V.vector4_f32[0] = (FLOAT)pSource->x / 255.0f; + V.vector4_f32[1] = (FLOAT)pSource->y / 255.0f; + V.vector4_f32[2] = (FLOAT)pSource->z / 255.0f; + V.vector4_f32[3] = (FLOAT)pSource->w / 255.0f; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 LoadUByteN4Mul = {1.0f/255.0f,1.0f/(255.0f*256.0f),1.0f/(255.0f*65536.0f),1.0f/(255.0f*65536.0f*256.0f)}; + XMASSERT(pSource); + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskByte4); + // w is signed! 
Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp,g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(reinterpret_cast(&vTemp)[0]); + // w + 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp,LoadUByteN4Mul); + return vTemp; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadUByte4 +( + CONST XMUBYTE4* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + + XMASSERT(pSource); + + V.vector4_f32[0] = (FLOAT)pSource->x; + V.vector4_f32[1] = (FLOAT)pSource->y; + V.vector4_f32[2] = (FLOAT)pSource->z; + V.vector4_f32[3] = (FLOAT)pSource->w; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 LoadUByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)}; + XMASSERT(pSource); + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskByte4); + // w is signed! Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp,g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(reinterpret_cast(&vTemp)[0]); + // w + 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp,LoadUByte4Mul); + return vTemp; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadByteN4 +( + CONST XMBYTEN4* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + + XMASSERT(pSource); + + V.vector4_f32[0] = (pSource->x == -128) ? -1.f : ((FLOAT)pSource->x / 127.0f); + V.vector4_f32[1] = (pSource->y == -128) ? -1.f : ((FLOAT)pSource->y / 127.0f); + V.vector4_f32[2] = (pSource->z == -128) ? -1.f : ((FLOAT)pSource->z / 127.0f); + V.vector4_f32[3] = (pSource->w == -128) ? -1.f : ((FLOAT)pSource->w / 127.0f); + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 LoadByteN4Mul = {1.0f/127.0f,1.0f/(127.0f*256.0f),1.0f/(127.0f*65536.0f),1.0f/(127.0f*65536.0f*256.0f)}; + XMASSERT(pSource); + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskByte4); + // x,y and z are unsigned! 
Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMXorByte4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(reinterpret_cast(&vTemp)[0]); + // x, y and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddByte4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp,LoadByteN4Mul); + // Clamp result (for case of -128) + return _mm_max_ps( vTemp, g_XMNegativeOne ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadByte4 +( + CONST XMBYTE4* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + + XMASSERT(pSource); + + V.vector4_f32[0] = (FLOAT)pSource->x; + V.vector4_f32[1] = (FLOAT)pSource->y; + V.vector4_f32[2] = (FLOAT)pSource->z; + V.vector4_f32[3] = (FLOAT)pSource->w; + + return V; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 LoadByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)}; + XMASSERT(pSource); + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskByte4); + // x,y and z are unsigned! Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMXorByte4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(reinterpret_cast(&vTemp)[0]); + // x, y and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddByte4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp,LoadByte4Mul); + return vTemp; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadUNibble4 +( + CONST XMUNIBBLE4* pSource +) +{ +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + static const XMVECTORI32 UNibble4And = {0xF,0xF0,0xF00,0xF000}; + static const XMVECTORF32 UNibble4Mul = {1.0f,1.0f/16.f,1.0f/256.f,1.0f/4096.f}; + XMASSERT(pSource); + // Get the 32 bit value and splat it + XMVECTOR vResult = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Mask off x, y and z + vResult = _mm_and_ps(vResult,UNibble4And); + // Convert to float + vResult = _mm_cvtepi32_ps(reinterpret_cast(&vResult)[0]); + // Normalize x, y, and z + vResult = _mm_mul_ps(vResult,UNibble4Mul); + return vResult; +#else + XMVECTOR V; + UINT Element; + + XMASSERT(pSource); + + Element = pSource->v & 0xF; + V.vector4_f32[0] = (FLOAT)Element; + Element = (pSource->v >> 4) & 0xF; + V.vector4_f32[1] = (FLOAT)Element; + Element = (pSource->v >> 8) & 0xF; + V.vector4_f32[2] = (FLOAT)Element; + Element = (pSource->v >> 12) & 0xF; + V.vector4_f32[3] = (FLOAT)Element; + + return V; +#endif // !_XM_SSE_INTRISICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadU555 +( + CONST XMU555* pSource +) +{ +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + static const XMVECTORI32 U555And = {0x1F,0x1F<<5,0x1F<<10,0x8000}; + static const XMVECTORF32 U555Mul = {1.0f,1.0f/32.f,1.0f/1024.f,1.0f/32768.f}; + XMASSERT(pSource); + // Get the 32 bit value and splat it + XMVECTOR vResult = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Mask off x, y and z + vResult = _mm_and_ps(vResult,U555And); + // Convert to float + vResult = 
_mm_cvtepi32_ps(reinterpret_cast(&vResult)[0]); + // Normalize x, y, and z + vResult = _mm_mul_ps(vResult,U555Mul); + return vResult; +#else + XMVECTOR V; + UINT Element; + + XMASSERT(pSource); + + Element = pSource->v & 0x1F; + V.vector4_f32[0] = (FLOAT)Element; + Element = (pSource->v >> 5) & 0x1F; + V.vector4_f32[1] = (FLOAT)Element; + Element = (pSource->v >> 10) & 0x1F; + V.vector4_f32[2] = (FLOAT)Element; + Element = (pSource->v >> 15) & 0x1; + V.vector4_f32[3] = (FLOAT)Element; + + return V; +#endif // !_XM_SSE_INTRISICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMLoadColor +( + CONST XMCOLOR* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMASSERT(pSource); + { + // INT -> Float conversions are done in one instruction. + // UINT -> Float calls a runtime function. Keep in INT + INT iColor = (INT)(pSource->c); + XMVECTOR vColor = { + (FLOAT)((iColor >> 16) & 0xFF) * (1.0f/255.0f), + (FLOAT)((iColor >> 8) & 0xFF) * (1.0f/255.0f), + (FLOAT)(iColor & 0xFF) * (1.0f/255.0f), + (FLOAT)((iColor >> 24) & 0xFF) * (1.0f/255.0f) + }; + return vColor; + } +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + // Splat the color in all four entries + __m128i vInt = _mm_set1_epi32(pSource->c); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vInt = _mm_and_si128(vInt,g_XMMaskA8R8G8B8); + // a is unsigned! Flip the bit to convert the order to signed + vInt = _mm_xor_si128(vInt,g_XMFlipA8R8G8B8); + // Convert to floating point numbers + XMVECTOR vTemp = _mm_cvtepi32_ps(vInt); + // RGB + 0, A + 0x80000000.f to undo the signed order. + vTemp = _mm_add_ps(vTemp,g_XMFixAA8R8G8B8); + // Convert 0-255 to 0.0f-1.0f + return _mm_mul_ps(vTemp,g_XMNormalizeA8R8G8B8); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMLoadFloat3x3 +( + CONST XMFLOAT3X3* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + + XMASSERT(pSource); + + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + M.r[3].vector4_f32[0] = 0.0f; + M.r[3].vector4_f32[1] = 0.0f; + M.r[3].vector4_f32[2] = 0.0f; + M.r[3].vector4_f32[3] = 1.0f; + + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + XMVECTOR V1, V2, V3, Z, T1, T2, T3, T4, T5; + + Z = _mm_setzero_ps(); + + XMASSERT(pSource); + + V1 = _mm_loadu_ps( &pSource->m[0][0] ); + V2 = _mm_loadu_ps( &pSource->m[1][1] ); + V3 = _mm_load_ss( &pSource->m[2][2] ); + + T1 = _mm_unpackhi_ps( V1, Z ); + T2 = _mm_unpacklo_ps( V2, Z ); + T3 = _mm_shuffle_ps( V3, T2, _MM_SHUFFLE( 0, 1, 0, 0 ) ); + T4 = _mm_movehl_ps( T2, T3 ); + T5 = _mm_movehl_ps( Z, T1 ); + + M.r[0] = _mm_movelh_ps( V1, T1 ); + M.r[1] = _mm_add_ps( T4, T5 ); + M.r[2] = _mm_shuffle_ps( V2, V3, _MM_SHUFFLE(1, 0, 3, 2) ); + M.r[3] = g_XMIdentityR3; + + return M; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX 
XMLoadFloat4x3 +( + CONST XMFLOAT4X3* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMMATRIX M; + XMASSERT(pSource); + + ((UINT *)(&M.r[0].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[0][0]))[0]; + ((UINT *)(&M.r[0].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[0][1]))[0]; + ((UINT *)(&M.r[0].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[0][2]))[0]; + M.r[0].vector4_f32[3] = 0.0f; + + ((UINT *)(&M.r[1].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[1][0]))[0]; + ((UINT *)(&M.r[1].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[1][1]))[0]; + ((UINT *)(&M.r[1].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[1][2]))[0]; + M.r[1].vector4_f32[3] = 0.0f; + + ((UINT *)(&M.r[2].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[2][0]))[0]; + ((UINT *)(&M.r[2].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[2][1]))[0]; + ((UINT *)(&M.r[2].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[2][2]))[0]; + M.r[2].vector4_f32[3] = 0.0f; + + ((UINT *)(&M.r[3].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[3][0]))[0]; + ((UINT *)(&M.r[3].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[3][1]))[0]; + ((UINT *)(&M.r[3].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[3][2]))[0]; + M.r[3].vector4_f32[3] = 1.0f; + + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + // Use unaligned load instructions to + // load the 12 floats + // vTemp1 = x1,y1,z1,x2 + XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]); + // vTemp2 = y2,z2,x3,y3 + XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]); + // vTemp4 = z3,x4,y4,z4 + XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]); + // vTemp3 = x3,y3,z3,z3 + XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2)); + // vTemp2 = y2,z2,x2,x2 + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0)); + // vTemp2 = x2,y2,z2,z2 + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(1,1,0,2)); + // vTemp1 = x1,y1,z1,0 + vTemp1 = _mm_and_ps(vTemp1,g_XMMask3); + // vTemp2 = x2,y2,z2,0 + vTemp2 = _mm_and_ps(vTemp2,g_XMMask3); + // vTemp3 = x3,y3,z3,0 + vTemp3 = _mm_and_ps(vTemp3,g_XMMask3); + // vTemp4i = x4,y4,z4,0 + __m128i vTemp4i = _mm_srli_si128(reinterpret_cast(&vTemp4)[0],32/8); + // vTemp4i = x4,y4,z4,1.0f + vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3); + XMMATRIX M(vTemp1, + vTemp2, + vTemp3, + reinterpret_cast(&vTemp4i)[0]); + return M; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMLoadFloat4x3A +( + CONST XMFLOAT4X3A* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + + XMASSERT(pSource); + XMASSERT(((UINT_PTR)pSource & 0xF) == 0); + + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = 1.0f; + + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + // Use aligned load instructions to + // load the 12 floats + // vTemp1 = x1,y1,z1,x2 + XMVECTOR vTemp1 
= _mm_load_ps(&pSource->m[0][0]); + // vTemp2 = y2,z2,x3,y3 + XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]); + // vTemp4 = z3,x4,y4,z4 + XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]); + // vTemp3 = x3,y3,z3,z3 + XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2)); + // vTemp2 = y2,z2,x2,x2 + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0)); + // vTemp2 = x2,y2,z2,z2 + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(1,1,0,2)); + // vTemp1 = x1,y1,z1,0 + vTemp1 = _mm_and_ps(vTemp1,g_XMMask3); + // vTemp2 = x2,y2,z2,0 + vTemp2 = _mm_and_ps(vTemp2,g_XMMask3); + // vTemp3 = x3,y3,z3,0 + vTemp3 = _mm_and_ps(vTemp3,g_XMMask3); + // vTemp4i = x4,y4,z4,0 + __m128i vTemp4i = _mm_srli_si128(reinterpret_cast(&vTemp4)[0],32/8); + // vTemp4i = x4,y4,z4,1.0f + vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3); + XMMATRIX M(vTemp1, + vTemp2, + vTemp3, + reinterpret_cast(&vTemp4i)[0]); + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMLoadFloat4x4 +( + CONST XMFLOAT4X4* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMMATRIX M; + XMASSERT(pSource); + + ((UINT *)(&M.r[0].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[0][0]))[0]; + ((UINT *)(&M.r[0].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[0][1]))[0]; + ((UINT *)(&M.r[0].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[0][2]))[0]; + ((UINT *)(&M.r[0].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[0][3]))[0]; + + ((UINT *)(&M.r[1].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[1][0]))[0]; + ((UINT *)(&M.r[1].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[1][1]))[0]; + ((UINT *)(&M.r[1].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[1][2]))[0]; + ((UINT *)(&M.r[1].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[1][3]))[0]; + + ((UINT *)(&M.r[2].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[2][0]))[0]; + ((UINT *)(&M.r[2].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[2][1]))[0]; + ((UINT *)(&M.r[2].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[2][2]))[0]; + ((UINT *)(&M.r[2].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[2][3]))[0]; + + ((UINT *)(&M.r[3].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[3][0]))[0]; + ((UINT *)(&M.r[3].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[3][1]))[0]; + ((UINT *)(&M.r[3].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[3][2]))[0]; + ((UINT *)(&M.r[3].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[3][3]))[0]; + + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSource); + XMMATRIX M; + + M.r[0] = _mm_loadu_ps( &pSource->_11 ); + M.r[1] = _mm_loadu_ps( &pSource->_21 ); + M.r[2] = _mm_loadu_ps( &pSource->_31 ); + M.r[3] = _mm_loadu_ps( &pSource->_41 ); + + return M; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMLoadFloat4x4A +( + CONST XMFLOAT4X4A* pSource +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + + XMASSERT(pSource); + XMASSERT(((UINT_PTR)pSource & 0xF) == 0); + + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = pSource->m[0][3]; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = pSource->m[1][3]; + + 
M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = pSource->m[2][3]; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = pSource->m[3][3]; + + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + + XMASSERT(pSource); + + M.r[0] = _mm_load_ps( &pSource->_11 ); + M.r[1] = _mm_load_ps( &pSource->_21 ); + M.r[2] = _mm_load_ps( &pSource->_31 ); + M.r[3] = _mm_load_ps( &pSource->_41 ); + + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +/**************************************************************************** + * + * Vector and matrix store operations + * + ****************************************************************************/ + +XMFINLINE VOID XMStoreInt +( + UINT* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + *pDestination = XMVectorGetIntX( V ); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + _mm_store_ss( (float*)pDestination, V ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreFloat +( + FLOAT* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + *pDestination = XMVectorGetX( V ); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + _mm_store_ss( pDestination, V ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreInt2 +( + UINT* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( (float*)&pDestination[0], V ); + _mm_store_ss( (float*)&pDestination[1], T ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreSInt2 +( + XMINT2* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + pDestination->x = (INT)V.vector4_f32[0]; + pDestination->y = (INT)V.vector4_f32[1]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast(&vResulti)[0]); + vOverflow = _mm_or_ps(vOverflow,vResult); + // Write two ints + XMVECTOR T = _mm_shuffle_ps( vOverflow, vOverflow, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( 
(float*)&pDestination->x, vOverflow ); + _mm_store_ss( (float*)&pDestination->y, T ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreUInt2 +( + XMUINT2* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + pDestination->x = (UINT)V.vector4_f32[0]; + pDestination->y = (UINT)V.vector4_f32[1]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? + XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); + // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue,vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult,vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned pnly if greater than 0x80000000 + vMask = _mm_and_ps(vMask,g_XMNegativeZero); + vResult = _mm_xor_ps(reinterpret_cast(&vResulti)[0],vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult,vOverflow); + // Write two uints + XMVECTOR T = _mm_shuffle_ps( vResult, vResult, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( (float*)&pDestination->x, vResult ); + _mm_store_ss( (float*)&pDestination->y, T ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreInt2A +( + UINT* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 0xF) == 0); + + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + +#elif defined(_XM_SSE_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 0xF) == 0); + + _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast(&V)[0] ); + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreFloat2 +( + XMFLOAT2* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( &pDestination->x, V ); + _mm_store_ss( &pDestination->y, T ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreFloat2A +( + XMFLOAT2A* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 0xF) == 0); + + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + +#elif defined(_XM_SSE_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 0xF) == 0); + + _mm_storel_epi64( (__m128i*)pDestination, 
reinterpret_cast(&V)[0] ); + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreHalf2 +( + XMHALF2* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + + pDestination->x = XMConvertFloatToHalf(V.vector4_f32[0]); + pDestination->y = XMConvertFloatToHalf(V.vector4_f32[1]); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V)); + pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreShortN2 +( + XMSHORTN2* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(N, Scale.v); + N = XMVectorRound(N); + + pDestination->x = (SHORT)N.vector4_f32[0]; + pDestination->y = (SHORT)N.vector4_f32[1]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static CONST XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f}; + + XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne); + vResult = _mm_min_ps(vResult,g_XMOne); + vResult = _mm_mul_ps(vResult,Scale); + __m128i vResulti = _mm_cvtps_epi32(vResult); + vResulti = _mm_packs_epi32(vResulti,vResulti); + _mm_store_ss(reinterpret_cast(&pDestination->x),reinterpret_cast(&vResulti)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreShort2 +( + XMSHORT2* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTOR Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f}; + static CONST XMVECTOR Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, Min, Max); + N = XMVectorRound(N); + + pDestination->x = (SHORT)N.vector4_f32[0]; + pDestination->y = (SHORT)N.vector4_f32[1]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static CONST XMVECTORF32 Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f}; + static CONST XMVECTORF32 Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f}; + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,Min); + vResult = _mm_min_ps(vResult,Max); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // Pack the ints into shorts + vInt = _mm_packs_epi32(vInt,vInt); + _mm_store_ss(reinterpret_cast(&pDestination->x),reinterpret_cast(&vInt)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreUShortN2 +( + XMUSHORTN2* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; + + XMASSERT(pDestination); + + N = XMVectorSaturate(V); + N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v); + N = XMVectorTruncate(N); + + pDestination->x = (SHORT)N.vector4_f32[0]; + pDestination->y = (SHORT)N.vector4_f32[1]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static CONST XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 
65535.0f}; + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,g_XMOne); + vResult = _mm_mul_ps(vResult,Scale); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // Since the SSE pack instruction clamps using signed rules, + // manually extract the values to store them to memory + pDestination->x = static_cast(_mm_extract_epi16(vInt,0)); + pDestination->y = static_cast(_mm_extract_epi16(vInt,2)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreUShort2 +( + XMUSHORT2* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTOR Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, XMVectorZero(), Max); + N = XMVectorRound(N); + + pDestination->x = (SHORT)N.vector4_f32[0]; + pDestination->y = (SHORT)N.vector4_f32[1]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static CONST XMVECTORF32 Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,Max); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // Since the SSE pack instruction clamps using signed rules, + // manually extract the values to store them to memory + pDestination->x = static_cast(_mm_extract_epi16(vInt,0)); + pDestination->y = static_cast(_mm_extract_epi16(vInt,2)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreByteN2 +( + XMBYTEN2* pDestination, + FXMVECTOR V +) +{ + XMVECTOR N; + XMFLOAT4A tmp; + static CONST XMVECTORF32 Scale = {127.0f, 127.0f, 127.0f, 127.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(N, Scale.v); + N = XMVectorRound(N); + + XMStoreFloat4A( &tmp, N ); + + pDestination->x = (CHAR)tmp.x; + pDestination->y = (CHAR)tmp.y; +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreByte2 +( + XMBYTE2* pDestination, + FXMVECTOR V +) +{ + XMVECTOR N; + XMFLOAT4A tmp; + static CONST XMVECTOR Min = {-127.0f, -127.0f, -127.0f, -127.0f}; + static CONST XMVECTOR Max = {127.0f, 127.0f, 127.0f, 127.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, Min, Max); + N = XMVectorRound(N); + + XMStoreFloat4A( &tmp, N ); + + pDestination->x = (CHAR)tmp.x; + pDestination->y = (CHAR)tmp.y; +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreUByteN2 +( + XMUBYTEN2* pDestination, + FXMVECTOR V +) +{ + XMVECTOR N; + XMFLOAT4A tmp; + static CONST XMVECTORF32 Scale = {255.0f, 255.0f, 255.0f, 255.0f}; + + XMASSERT(pDestination); + + N = XMVectorSaturate(V); + N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v); + N = XMVectorTruncate(N); + + XMStoreFloat4A( &tmp, N ); + + pDestination->x = (BYTE)tmp.x; + pDestination->y = (BYTE)tmp.y; +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreUByte2 +( + XMUBYTE2* pDestination, + FXMVECTOR V +) +{ + XMVECTOR N; + static CONST XMVECTOR Max = {255.0f, 255.0f, 255.0f, 255.0f}; + XMFLOAT4A tmp; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, XMVectorZero(), Max); + N = XMVectorRound(N); + + 
XMStoreFloat4A( &tmp, N ); + + pDestination->x = (BYTE)tmp.x; + pDestination->y = (BYTE)tmp.y; +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreInt3 +( + UINT* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; + +#elif defined(_XM_SSE_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + XMVECTOR T1 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( (float*)pDestination, V ); + _mm_store_ss( (float*)&pDestination[1], T1 ); + _mm_store_ss( (float*)&pDestination[2], T2 ); + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreSInt3 +( + XMINT3* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + pDestination->x = (INT)V.vector4_f32[0]; + pDestination->y = (INT)V.vector4_f32[1]; + pDestination->z = (INT)V.vector4_f32[2]; + +#elif defined(_XM_SSE_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast(&vResulti)[0]); + vOverflow = _mm_or_ps(vOverflow,vResult); + // Write 3 uints + XMVECTOR T1 = _mm_shuffle_ps(vOverflow,vOverflow,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = _mm_shuffle_ps(vOverflow,vOverflow,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( (float*)&pDestination->x, vOverflow ); + _mm_store_ss( (float*)&pDestination->y, T1 ); + _mm_store_ss( (float*)&pDestination->z, T2 ); + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreUInt3 +( + XMUINT3* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + pDestination->x = (UINT)V.vector4_f32[0]; + pDestination->y = (UINT)V.vector4_f32[1]; + pDestination->z = (UINT)V.vector4_f32[2]; + +#elif defined(_XM_SSE_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? 
+ XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); + // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue,vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult,vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned pnly if greater than 0x80000000 + vMask = _mm_and_ps(vMask,g_XMNegativeZero); + vResult = _mm_xor_ps(reinterpret_cast(&vResulti)[0],vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult,vOverflow); + // Write 3 uints + XMVECTOR T1 = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( (float*)&pDestination->x, vResult ); + _mm_store_ss( (float*)&pDestination->y, T1 ); + _mm_store_ss( (float*)&pDestination->z, T2 ); + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreInt3A +( + UINT* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 0xF) == 0); + + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; + +#elif defined(_XM_SSE_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 0xF) == 0); + + XMVECTOR T = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2)); + _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast(&V)[0] ); + _mm_store_ss( (float*)&pDestination[2], T ); + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreFloat3 +( + XMFLOAT3* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; + +#elif defined(_XM_SSE_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + XMVECTOR T1 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( &pDestination->x, V ); + _mm_store_ss( &pDestination->y, T1 ); + _mm_store_ss( &pDestination->z, T2 ); + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreFloat3A +( + XMFLOAT3A* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 0xF) == 0); + + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; + +#elif defined(_XM_SSE_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 0xF) == 0); + + XMVECTOR T = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2)); + _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast(&V)[0] ); + _mm_store_ss( &pDestination->z, T ); + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreUHenDN3 +( + XMUHENDN3* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTORF32 Scale = {2047.0f, 2047.0f, 
1023.0f, 0.0f}; + + XMASSERT(pDestination); + + N = XMVectorSaturate(V); + N = XMVectorMultiply(N, Scale.v); + + pDestination->v = (((UINT)N.vector4_f32[2] & 0x3FF) << 22) | + (((UINT)N.vector4_f32[1] & 0x7FF) << 11) | + (((UINT)N.vector4_f32[0] & 0x7FF)); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static const XMVECTORF32 ScaleUHenDN3 = {2047.0f, 2047.0f*2048.0f,1023.0f*(2048.0f*2048.0f)/2.0f,1.0f}; + static const XMVECTORI32 MaskUHenDN3 = {0x7FF,0x7FF<<11,0x3FF<<(22-1),0}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleUHenDN3); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskUHenDN3); + // Do a horizontal or of 3 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1)); + // i = x|y + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1)); + // Add Z to itself to perform a single bit left shift + vResulti2 = _mm_add_epi32(vResulti2,vResulti2); + // i = x|y|z + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),reinterpret_cast(&vResulti)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreUHenD3 +( + XMUHEND3* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTOR Max = {2047.0f, 2047.0f, 1023.0f, 0.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, XMVectorZero(), Max); + + pDestination->v = (((UINT)N.vector4_f32[2] & 0x3FF) << 22) | + (((UINT)N.vector4_f32[1] & 0x7FF) << 11) | + (((UINT)N.vector4_f32[0] & 0x7FF)); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static const XMVECTORF32 MaxUHenD3 = { 2047.0f, 2047.0f, 1023.0f, 1.0f}; + static const XMVECTORF32 ScaleUHenD3 = {1.0f, 2048.0f,(2048.0f*2048.0f)/2.0f,1.0f}; + static const XMVECTORI32 MaskUHenD3 = {0x7FF,0x7FF<<11,0x3FF<<(22-1),0}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,MaxUHenD3); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleUHenD3); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskUHenD3); + // Do a horizontal or of 3 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1)); + // i = x|y + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1)); + // Add Z to itself to perform a single bit left shift + vResulti2 = _mm_add_epi32(vResulti2,vResulti2); + // i = x|y|z + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),reinterpret_cast(&vResulti)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreHenDN3 +( + XMHENDN3* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTORF32 Scale = {1023.0f, 1023.0f, 511.0f, 1.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(N, Scale.v); + + pDestination->v = 
(((INT)N.vector4_f32[2] & 0x3FF) << 22) | + (((INT)N.vector4_f32[1] & 0x7FF) << 11) | + (((INT)N.vector4_f32[0] & 0x7FF)); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static const XMVECTORF32 ScaleHenDN3 = {1023.0f, 1023.0f*2048.0f,511.0f*(2048.0f*2048.0f),1.0f}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleHenDN3); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,g_XMMaskHenD3); + // Do a horizontal or of all 4 entries + vResult = _mm_shuffle_ps(reinterpret_cast(&vResulti)[0],reinterpret_cast(&vResulti)[0],_MM_SHUFFLE(0,3,2,1)); + vResulti = _mm_or_si128(vResulti,reinterpret_cast(&vResult)[0]); + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1)); + vResulti = _mm_or_si128(vResulti,reinterpret_cast(&vResult)[0]); + _mm_store_ss(reinterpret_cast(&pDestination->v),reinterpret_cast(&vResulti)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreHenD3 +( + XMHEND3* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTOR Min = {-1023.0f, -1023.0f, -511.0f, -1.0f}; + static CONST XMVECTOR Max = {1023.0f, 1023.0f, 511.0f, 1.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, Min, Max); + + pDestination->v = (((INT)N.vector4_f32[2] & 0x3FF) << 22) | + (((INT)N.vector4_f32[1] & 0x7FF) << 11) | + (((INT)N.vector4_f32[0] & 0x7FF)); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static const XMVECTORF32 MinHenD3 = {-1023.0f,-1023.0f,-511.0f,-1.0f}; + static const XMVECTORF32 MaxHenD3 = { 1023.0f, 1023.0f, 511.0f, 1.0f}; + static const XMVECTORF32 ScaleHenD3 = {1.0f, 2048.0f,(2048.0f*2048.0f),1.0f}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,MinHenD3); + vResult = _mm_min_ps(vResult,MaxHenD3); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleHenD3); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,g_XMMaskHenD3); + // Do a horizontal or of all 4 entries + vResult = _mm_shuffle_ps(reinterpret_cast(&vResulti)[0],reinterpret_cast(&vResulti)[0],_MM_SHUFFLE(0,3,2,1)); + vResulti = _mm_or_si128(vResulti,reinterpret_cast(&vResult)[0]); + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1)); + vResulti = _mm_or_si128(vResulti,reinterpret_cast(&vResult)[0]); + _mm_store_ss(reinterpret_cast(&pDestination->v),reinterpret_cast(&vResulti)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreUDHenN3 +( + XMUDHENN3* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTORF32 Scale = {1023.0f, 2047.0f, 2047.0f, 0.0f}; + + XMASSERT(pDestination); + + N = XMVectorSaturate(V); + N = XMVectorMultiply(N, Scale.v); + + pDestination->v = (((UINT)N.vector4_f32[2] & 0x7FF) << 21) | + (((UINT)N.vector4_f32[1] & 0x7FF) << 10) | + (((UINT)N.vector4_f32[0] & 0x3FF)); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static const XMVECTORF32 ScaleUDHenN3 = {1023.0f,2047.0f*1024.0f,2047.0f*(1024.0f*2048.0f)/2.0f,1.0f}; + static const XMVECTORI32 MaskUDHenN3 = 
{0x3FF,0x7FF<<10,0x7FF<<(21-1),0}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleUDHenN3); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskUDHenN3); + // Do a horizontal or of 3 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1)); + // i = x|y + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1)); + // Add Z to itself to perform a single bit left shift + vResulti2 = _mm_add_epi32(vResulti2,vResulti2); + // i = x|y|z + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),reinterpret_cast(&vResulti)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreUDHen3 +( + XMUDHEN3* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTOR Max = {1023.0f, 2047.0f, 2047.0f, 0.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, XMVectorZero(), Max); + + pDestination->v = (((UINT)N.vector4_f32[2] & 0x7FF) << 21) | + (((UINT)N.vector4_f32[1] & 0x7FF) << 10) | + (((UINT)N.vector4_f32[0] & 0x3FF)); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static const XMVECTORF32 MaxUDHen3 = { 1023.0f, 2047.0f, 2047.0f, 1.0f}; + static const XMVECTORF32 ScaleUDHen3 = {1.0f, 1024.0f,(1024.0f*2048.0f)/2.0f,1.0f}; + static const XMVECTORI32 MaskUDHen3 = {0x3FF,0x7FF<<10,0x7FF<<(21-1),0}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,MaxUDHen3); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleUDHen3); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskUDHen3); + // Do a horizontal or of 3 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1)); + // i = x|y + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1)); + // Add Z to itself to perform a single bit left shift + vResulti2 = _mm_add_epi32(vResulti2,vResulti2); + // i = x|y|z + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),reinterpret_cast(&vResulti)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreDHenN3 +( + XMDHENN3* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTORF32 Scale = {511.0f, 1023.0f, 1023.0f, 1.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(N, Scale.v); + + pDestination->v = (((INT)N.vector4_f32[2] & 0x7FF) << 21) | + (((INT)N.vector4_f32[1] & 0x7FF) << 10) | + (((INT)N.vector4_f32[0] & 0x3FF)); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static const XMVECTORF32 ScaleDHenN3 = {511.0f, 1023.0f*1024.0f,1023.0f*(1024.0f*2048.0f),1.0f}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = 
_mm_mul_ps(vResult,ScaleDHenN3); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,g_XMMaskDHen3); + // Do a horizontal or of all 4 entries + vResult = _mm_shuffle_ps(reinterpret_cast(&vResulti)[0],reinterpret_cast(&vResulti)[0],_MM_SHUFFLE(0,3,2,1)); + vResulti = _mm_or_si128(vResulti,reinterpret_cast(&vResult)[0]); + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1)); + vResulti = _mm_or_si128(vResulti,reinterpret_cast(&vResult)[0]); + _mm_store_ss(reinterpret_cast(&pDestination->v),reinterpret_cast(&vResulti)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreDHen3 +( + XMDHEN3* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTOR Min = {-511.0f, -1023.0f, -1023.0f, -1.0f}; + static CONST XMVECTOR Max = {511.0f, 1023.0f, 1023.0f, 1.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, Min, Max); + + pDestination->v = (((INT)N.vector4_f32[2] & 0x7FF) << 21) | + (((INT)N.vector4_f32[1] & 0x7FF) << 10) | + (((INT)N.vector4_f32[0] & 0x3FF)); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static const XMVECTORF32 MinDHen3 = {-511.0f,-1023.0f,-1023.0f,-1.0f}; + static const XMVECTORF32 MaxDHen3 = { 511.0f, 1023.0f, 1023.0f, 1.0f}; + static const XMVECTORF32 ScaleDHen3 = {1.0f, 1024.0f,(1024.0f*2048.0f),1.0f}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,MinDHen3); + vResult = _mm_min_ps(vResult,MaxDHen3); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleDHen3); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,g_XMMaskDHen3); + // Do a horizontal or of all 4 entries + vResult = _mm_shuffle_ps(reinterpret_cast(&vResulti)[0],reinterpret_cast(&vResulti)[0],_MM_SHUFFLE(0,3,2,1)); + vResulti = _mm_or_si128(vResulti,reinterpret_cast(&vResult)[0]); + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1)); + vResulti = _mm_or_si128(vResulti,reinterpret_cast(&vResult)[0]); + _mm_store_ss(reinterpret_cast(&pDestination->v),reinterpret_cast(&vResulti)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreU565 +( + XMU565* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + XMASSERT(pDestination); + static CONST XMVECTORF32 Max = {31.0f, 63.0f, 31.0f, 0.0f}; + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,Max); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // No SSE operations will write to 16-bit values, so we have to extract them manually + USHORT x = static_cast(_mm_extract_epi16(vInt,0)); + USHORT y = static_cast(_mm_extract_epi16(vInt,2)); + USHORT z = static_cast(_mm_extract_epi16(vInt,4)); + pDestination->v = ((z & 0x1F) << 11) | + ((y & 0x3F) << 5) | + ((x & 0x1F)); +#else + XMVECTOR N; + static CONST XMVECTORF32 Max = {31.0f, 63.0f, 31.0f, 0.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, XMVectorZero(), Max.v); + N = XMVectorRound(N); + + pDestination->v = (((USHORT)N.vector4_f32[2] & 0x1F) << 11) | + (((USHORT)N.vector4_f32[1] & 0x3F) << 5) | + (((USHORT)N.vector4_f32[0] & 0x1F)); +#endif !_XM_SSE_INTRINSICS_ +} + 
+//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreFloat3PK +( + XMFLOAT3PK* pDestination, + FXMVECTOR V +) +{ + _DECLSPEC_ALIGN_16_ UINT IValue[4]; + UINT I, Sign, j; + UINT Result[3]; + + XMASSERT(pDestination); + + XMStoreFloat3A( (XMFLOAT3A*)&IValue, V ); + + // X & Y Channels (5-bit exponent, 6-bit mantissa) + for(j=0; j < 2; ++j) + { + Sign = IValue[j] & 0x80000000; + I = IValue[j] & 0x7FFFFFFF; + + if ((I & 0x7F800000) == 0x7F800000) + { + // INF or NAN + Result[j] = 0x7c0; + if (( I & 0x7FFFFF ) != 0) + { + Result[j] = 0x7c0 | (((I>>17)|(I>>11)|(I>>6)|(I))&0x3f); + } + else if ( Sign ) + { + // -INF is clamped to 0 since 3PK is positive only + Result[j] = 0; + } + } + else if ( Sign ) + { + // 3PK is positive only, so clamp to zero + Result[j] = 0; + } + else if (I > 0x477E0000U) + { + // The number is too large to be represented as a float11, set to max + Result[j] = 0x7BF; + } + else + { + if (I < 0x38800000U) + { + // The number is too small to be represented as a normalized float11 + // Convert it to a denormalized value. + UINT Shift = 113U - (I >> 23U); + I = (0x800000U | (I & 0x7FFFFFU)) >> Shift; + } + else + { + // Rebias the exponent to represent the value as a normalized float11 + I += 0xC8000000U; + } + + Result[j] = ((I + 0xFFFFU + ((I >> 17U) & 1U)) >> 17U)&0x7ffU; + } + } + + // Z Channel (5-bit exponent, 5-bit mantissa) + Sign = IValue[2] & 0x80000000; + I = IValue[2] & 0x7FFFFFFF; + + if ((I & 0x7F800000) == 0x7F800000) + { + // INF or NAN + Result[2] = 0x3e0; + if ( I & 0x7FFFFF ) + { + Result[2] = 0x3e0 | (((I>>18)|(I>>13)|(I>>3)|(I))&0x1f); + } + else if ( Sign ) + { + // -INF is clamped to 0 since 3PK is positive only + Result[2] = 0; + } + } + else if ( Sign ) + { + // 3PK is positive only, so clamp to zero + Result[2] = 0; + } + else if (I > 0x477C0000U) + { + // The number is too large to be represented as a float10, set to max + Result[2] = 0x3df; + } + else + { + if (I < 0x38800000U) + { + // The number is too small to be represented as a normalized float10 + // Convert it to a denormalized value. 
+ UINT Shift = 113U - (I >> 23U); + I = (0x800000U | (I & 0x7FFFFFU)) >> Shift; + } + else + { + // Rebias the exponent to represent the value as a normalized float10 + I += 0xC8000000U; + } + + Result[2] = ((I + 0x1FFFFU + ((I >> 18U) & 1U)) >> 18U)&0x3ffU; + } + + // Pack Result into memory + pDestination->v = (Result[0] & 0x7ff) + | ( (Result[1] & 0x7ff) << 11 ) + | ( (Result[2] & 0x3ff) << 22 ); +} + + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreFloat3SE +( + XMFLOAT3SE* pDestination, + FXMVECTOR V +) +{ + _DECLSPEC_ALIGN_16_ UINT IValue[4]; + UINT I, Sign, j, T; + UINT Frac[3]; + UINT Exp[3]; + + + XMASSERT(pDestination); + + XMStoreFloat3A( (XMFLOAT3A*)&IValue, V ); + + // X, Y, Z Channels (5-bit exponent, 9-bit mantissa) + for(j=0; j < 3; ++j) + { + Sign = IValue[j] & 0x80000000; + I = IValue[j] & 0x7FFFFFFF; + + if ((I & 0x7F800000) == 0x7F800000) + { + // INF or NAN + Exp[j] = 0x1f; + if (( I & 0x7FFFFF ) != 0) + { + Frac[j] = ((I>>14)|(I>>5)|(I))&0x1ff; + } + else if ( Sign ) + { + // -INF is clamped to 0 since 3SE is positive only + Exp[j] = Frac[j] = 0; + } + } + else if ( Sign ) + { + // 3SE is positive only, so clamp to zero + Exp[j] = Frac[j] = 0; + } + else if (I > 0x477FC000U) + { + // The number is too large, set to max + Exp[j] = 0x1e; + Frac[j] = 0x1ff; + } + else + { + if (I < 0x38800000U) + { + // The number is too small to be represented as a normalized float11 + // Convert it to a denormalized value. + UINT Shift = 113U - (I >> 23U); + I = (0x800000U | (I & 0x7FFFFFU)) >> Shift; + } + else + { + // Rebias the exponent to represent the value as a normalized float11 + I += 0xC8000000U; + } + + T = ((I + 0x1FFFU + ((I >> 14U) & 1U)) >> 14U)&0x3fffU; + + Exp[j] = (T & 0x3E00) >> 9; + Frac[j] = T & 0x1ff; + } + } + + // Adjust to a shared exponent + T = XMMax( Exp[0], XMMax( Exp[1], Exp[2] ) ); + + Frac[0] = Frac[0] >> (T - Exp[0]); + Frac[1] = Frac[1] >> (T - Exp[1]); + Frac[2] = Frac[2] >> (T - Exp[2]); + + // Store packed into memory + pDestination->xm = Frac[0]; + pDestination->ym = Frac[1]; + pDestination->zm = Frac[2]; + pDestination->e = T; +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreInt4 +( + UINT* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; + pDestination[3] = V.vector4_u32[3]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + + _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] ); + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreInt4A +( + UINT* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 0xF) == 0); + + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; + pDestination[3] = V.vector4_u32[3]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 0xF) == 0); + + _mm_store_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] ); + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreSInt4 +( + 
XMINT4* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + + pDestination->x = (INT)V.vector4_f32[0]; + pDestination->y = (INT)V.vector4_f32[1]; + pDestination->z = (INT)V.vector4_f32[2]; + pDestination->w = (INT)V.vector4_f32[3]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast(&vResulti)[0]); + vOverflow = _mm_or_ps(vOverflow,vResult); + _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast(&vOverflow)[0] ); + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreUInt4 +( + XMUINT4* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + + pDestination->x = (UINT)V.vector4_f32[0]; + pDestination->y = (UINT)V.vector4_f32[1]; + pDestination->z = (UINT)V.vector4_f32[2]; + pDestination->w = (UINT)V.vector4_f32[3]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? + XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); + // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue,vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult,vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned pnly if greater than 0x80000000 + vMask = _mm_and_ps(vMask,g_XMNegativeZero); + vResult = _mm_xor_ps(reinterpret_cast(&vResulti)[0],vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult,vOverflow); + _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast(&vResult)[0] ); + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreInt4NC +( + UINT* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; + pDestination[3] = V.vector4_u32[3]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast(&V)[0] ); + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreFloat4 +( + XMFLOAT4* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; + pDestination->w = V.vector4_f32[3]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + + _mm_storeu_ps( &pDestination->x, V ); + +#else // _XM_VMX128_INTRINSICS_ +#endif // 
_XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreFloat4A +( + XMFLOAT4A* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 0xF) == 0); + + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; + pDestination->w = V.vector4_f32[3]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 0xF) == 0); + + _mm_store_ps( &pDestination->x, V ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreFloat4NC +( + XMFLOAT4* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; + pDestination->w = V.vector4_f32[3]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + _mm_storeu_ps( &pDestination->x, V ); + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreHalf4 +( + XMHALF4* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + + pDestination->x = XMConvertFloatToHalf(V.vector4_f32[0]); + pDestination->y = XMConvertFloatToHalf(V.vector4_f32[1]); + pDestination->z = XMConvertFloatToHalf(V.vector4_f32[2]); + pDestination->w = XMConvertFloatToHalf(V.vector4_f32[3]); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V)); + pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V)); + pDestination->z = XMConvertFloatToHalf(XMVectorGetZ(V)); + pDestination->w = XMConvertFloatToHalf(XMVectorGetW(V)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreShortN4 +( + XMSHORTN4* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(N, Scale.v); + N = XMVectorRound(N); + + pDestination->x = (SHORT)N.vector4_f32[0]; + pDestination->y = (SHORT)N.vector4_f32[1]; + pDestination->z = (SHORT)N.vector4_f32[2]; + pDestination->w = (SHORT)N.vector4_f32[3]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static CONST XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f}; + + XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne); + vResult = _mm_min_ps(vResult,g_XMOne); + vResult = _mm_mul_ps(vResult,Scale); + __m128i vResulti = _mm_cvtps_epi32(vResult); + vResulti = _mm_packs_epi32(vResulti,vResulti); + _mm_store_sd(reinterpret_cast(&pDestination->x),reinterpret_cast(&vResulti)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreShort4 +( + XMSHORT4* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTOR 
Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f}; + static CONST XMVECTOR Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, Min, Max); + N = XMVectorRound(N); + + pDestination->x = (SHORT)N.vector4_f32[0]; + pDestination->y = (SHORT)N.vector4_f32[1]; + pDestination->z = (SHORT)N.vector4_f32[2]; + pDestination->w = (SHORT)N.vector4_f32[3]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static CONST XMVECTORF32 Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f}; + static CONST XMVECTORF32 Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f}; + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,Min); + vResult = _mm_min_ps(vResult,Max); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // Pack the ints into shorts + vInt = _mm_packs_epi32(vInt,vInt); + _mm_store_sd(reinterpret_cast(&pDestination->x),reinterpret_cast(&vInt)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreUShortN4 +( + XMUSHORTN4* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; + + XMASSERT(pDestination); + + N = XMVectorSaturate(V); + N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v); + N = XMVectorTruncate(N); + + pDestination->x = (SHORT)N.vector4_f32[0]; + pDestination->y = (SHORT)N.vector4_f32[1]; + pDestination->z = (SHORT)N.vector4_f32[2]; + pDestination->w = (SHORT)N.vector4_f32[3]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static CONST XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,g_XMOne); + vResult = _mm_mul_ps(vResult,Scale); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // Since the SSE pack instruction clamps using signed rules, + // manually extract the values to store them to memory + pDestination->x = static_cast(_mm_extract_epi16(vInt,0)); + pDestination->y = static_cast(_mm_extract_epi16(vInt,2)); + pDestination->z = static_cast(_mm_extract_epi16(vInt,4)); + pDestination->w = static_cast(_mm_extract_epi16(vInt,6)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreUShort4 +( + XMUSHORT4* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTOR Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, XMVectorZero(), Max); + N = XMVectorRound(N); + + pDestination->x = (SHORT)N.vector4_f32[0]; + pDestination->y = (SHORT)N.vector4_f32[1]; + pDestination->z = (SHORT)N.vector4_f32[2]; + pDestination->w = (SHORT)N.vector4_f32[3]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static CONST XMVECTORF32 Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,Max); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // Since the SSE pack instruction clamps using signed rules, + // manually extract the values to store them to memory + pDestination->x = static_cast(_mm_extract_epi16(vInt,0)); + pDestination->y = static_cast(_mm_extract_epi16(vInt,2)); + pDestination->z = 
static_cast(_mm_extract_epi16(vInt,4)); + pDestination->w = static_cast(_mm_extract_epi16(vInt,6)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreXIcoN4 +( + XMXICON4* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f}; + static CONST XMVECTORF32 Scale = {524287.0f, 524287.0f, 524287.0f, 15.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, Min.v, g_XMOne.v); + N = XMVectorMultiply(N, Scale.v); + N = XMVectorRound(N); + + pDestination->v = ((UINT64)N.vector4_f32[3] << 60) | + (((INT64)N.vector4_f32[2] & 0xFFFFF) << 40) | + (((INT64)N.vector4_f32[1] & 0xFFFFF) << 20) | + (((INT64)N.vector4_f32[0] & 0xFFFFF)); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + // Note: Masks are x,w,y and z + static const XMVECTORF32 MinXIcoN4 = {-1.0f, 0.0f,-1.0f,-1.0f}; + static const XMVECTORF32 ScaleXIcoN4 = {524287.0f,15.0f*4096.0f*65536.0f*0.5f,524287.0f*4096.0f,524287.0f}; + static const XMVECTORI32 MaskXIcoN4 = {0xFFFFF,0xF<<((60-32)-1),0xFFFFF000,0xFFFFF}; + + // Clamp to bounds + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0)); + vResult = _mm_max_ps(vResult,MinXIcoN4); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleXIcoN4); + // Convert to integer (w is unsigned) + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off unused bits + vResulti = _mm_and_si128(vResulti,MaskXIcoN4); + // Isolate Y + __m128i vResulti2 = _mm_and_si128(vResulti,g_XMMaskY); + // Double Y (Really W) to fixup for unsigned conversion + vResulti = _mm_add_epi32(vResulti,vResulti2); + // Shift y and z to straddle the 32-bit boundary + vResulti2 = _mm_srli_si128(vResulti,(64+12)/8); + // Shift it into place + vResulti2 = _mm_slli_si128(vResulti2,20/8); + // i = x|y<<20|z<<40|w<<60 + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_sd(reinterpret_cast(&pDestination->v),reinterpret_cast(&vResulti)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreXIco4 +( + XMXICO4* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTORF32 Min = {-524287.0f, -524287.0f, -524287.0f, 0.0f}; + static CONST XMVECTORF32 Max = {524287.0f, 524287.0f, 524287.0f, 15.0f}; + + XMASSERT(pDestination); + N = XMVectorClamp(V, Min.v, Max.v); + pDestination->v = ((UINT64)N.vector4_f32[3] << 60) | + (((INT64)N.vector4_f32[2] & 0xFFFFF) << 40) | + (((INT64)N.vector4_f32[1] & 0xFFFFF) << 20) | + (((INT64)N.vector4_f32[0] & 0xFFFFF)); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + // Note: Masks are x,w,y and z + static const XMVECTORF32 MinXIco4 = {-524287.0f, 0.0f,-524287.0f,-524287.0f}; + static const XMVECTORF32 MaxXIco4 = { 524287.0f,15.0f, 524287.0f, 524287.0f}; + static const XMVECTORF32 ScaleXIco4 = {1.0f,4096.0f*65536.0f*0.5f,4096.0f,1.0f}; + static const XMVECTORI32 MaskXIco4 = {0xFFFFF,0xF<<((60-1)-32),0xFFFFF000,0xFFFFF}; + // Clamp to bounds + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0)); + vResult = _mm_max_ps(vResult,MinXIco4); + vResult = _mm_min_ps(vResult,MaxXIco4); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleXIco4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + 
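+    // ScaleXIcoN4/ScaleXIco4 scale the W lane by 2^27 (4096*65536*0.5) rather than 2^28 so that
+    // w == 15 cannot overflow the signed 32-bit conversion above; the "Double Y (Really W)" add
+    // below restores the missing factor of two, landing w at bit 60 of the packed result.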
// Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskXIco4); + // Isolate Y + __m128i vResulti2 = _mm_and_si128(vResulti,g_XMMaskY); + // Double Y (Really W) to fixup for unsigned conversion + vResulti = _mm_add_epi32(vResulti,vResulti2); + // Shift y and z to straddle the 32-bit boundary + vResulti2 = _mm_srli_si128(vResulti,(64+12)/8); + // Shift it into place + vResulti2 = _mm_slli_si128(vResulti2,20/8); + // i = x|y<<20|z<<40|w<<60 + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_sd(reinterpret_cast(&pDestination->v),reinterpret_cast(&vResulti)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreUIcoN4 +( + XMUICON4* pDestination, + FXMVECTOR V +) +{ + #define XM_URange ((FLOAT)(1 << 20)) + #define XM_URangeDiv2 ((FLOAT)(1 << 19)) + #define XM_UMaxXYZ ((FLOAT)((1 << 20) - 1)) + #define XM_UMaxW ((FLOAT)((1 << 4) - 1)) + #define XM_ScaleXYZ (-(FLOAT)((1 << 20) - 1) / XM_PACK_FACTOR) + #define XM_ScaleW (-(FLOAT)((1 << 4) - 1) / XM_PACK_FACTOR) + #define XM_Scale (-1.0f / XM_PACK_FACTOR) + #define XM_Offset (3.0f) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTORF32 Scale = {1048575.0f, 1048575.0f, 1048575.0f, 15.0f}; + + XMASSERT(pDestination); + + N = XMVectorSaturate(V); + N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v); + + pDestination->v = ((UINT64)N.vector4_f32[3] << 60) | + (((UINT64)N.vector4_f32[2] & 0xFFFFF) << 40) | + (((UINT64)N.vector4_f32[1] & 0xFFFFF) << 20) | + (((UINT64)N.vector4_f32[0] & 0xFFFFF)); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + // Note: Masks are x,w,y and z + static const XMVECTORF32 ScaleUIcoN4 = {1048575.0f,15.0f*4096.0f*65536.0f,1048575.0f*4096.0f,1048575.0f}; + static const XMVECTORI32 MaskUIcoN4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF}; + static const XMVECTORF32 AddUIcoN4 = {0.0f,-32768.0f*65536.0f,-32768.0f*65536.0f,0.0f}; + // Clamp to bounds + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0)); + vResult = _mm_max_ps(vResult,g_XMZero); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleUIcoN4); + // Adjust for unsigned entries + vResult = _mm_add_ps(vResult,AddUIcoN4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Fix the signs on the unsigned entries + vResulti = _mm_xor_si128(vResulti,g_XMFlipYZ); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskUIcoN4); + // Shift y and z to straddle the 32-bit boundary + __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8); + // Shift it into place + vResulti2 = _mm_slli_si128(vResulti2,20/8); + // i = x|y<<20|z<<40|w<<60 + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_sd(reinterpret_cast(&pDestination->v),reinterpret_cast(&vResulti)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ + + #undef XM_URange + #undef XM_URangeDiv2 + #undef XM_UMaxXYZ + #undef XM_UMaxW + #undef XM_ScaleXYZ + #undef XM_ScaleW + #undef XM_Scale + #undef XM_Offset +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreUIco4 +( + XMUICO4* pDestination, + FXMVECTOR V +) +{ + #define XM_Scale (-1.0f / XM_PACK_FACTOR) + #define XM_URange ((FLOAT)(1 << 20)) + #define XM_URangeDiv2 ((FLOAT)(1 << 19)) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTOR Max = {1048575.0f, 1048575.0f, 1048575.0f, 15.0f}; + + 
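+    // XMUICO4 packs three unsigned 20-bit fields and a 4-bit w: x | (y << 20) | (z << 40) | ((UINT64)w << 60).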
XMASSERT(pDestination); + + N = XMVectorClamp(V, XMVectorZero(), Max); + N = XMVectorRound(N); + + pDestination->v = ((UINT64)N.vector4_f32[3] << 60) | + (((UINT64)N.vector4_f32[2] & 0xFFFFF) << 40) | + (((UINT64)N.vector4_f32[1] & 0xFFFFF) << 20) | + (((UINT64)N.vector4_f32[0] & 0xFFFFF)); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + // Note: Masks are x,w,y and z + static const XMVECTORF32 MaxUIco4 = { 1048575.0f, 15.0f, 1048575.0f, 1048575.0f}; + static const XMVECTORF32 ScaleUIco4 = {1.0f,4096.0f*65536.0f,4096.0f,1.0f}; + static const XMVECTORI32 MaskUIco4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF}; + static const XMVECTORF32 AddUIco4 = {0.0f,-32768.0f*65536.0f,-32768.0f*65536.0f,0.0f}; + // Clamp to bounds + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0)); + vResult = _mm_max_ps(vResult,g_XMZero); + vResult = _mm_min_ps(vResult,MaxUIco4); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleUIco4); + vResult = _mm_add_ps(vResult,AddUIco4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + vResulti = _mm_xor_si128(vResulti,g_XMFlipYZ); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskUIco4); + // Shift y and z to straddle the 32-bit boundary + __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8); + // Shift it into place + vResulti2 = _mm_slli_si128(vResulti2,20/8); + // i = x|y<<20|z<<40|w<<60 + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_sd(reinterpret_cast(&pDestination->v),reinterpret_cast(&vResulti)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ + + #undef XM_Scale + #undef XM_URange + #undef XM_URangeDiv2 +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreIcoN4 +( + XMICON4* pDestination, + FXMVECTOR V +) +{ + #define XM_Scale (-1.0f / XM_PACK_FACTOR) + #define XM_URange ((FLOAT)(1 << 4)) + #define XM_Offset (3.0f) + #define XM_UMaxXYZ ((FLOAT)((1 << (20 - 1)) - 1)) + #define XM_UMaxW ((FLOAT)((1 << (4 - 1)) - 1)) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTORF32 Scale = {524287.0f, 524287.0f, 524287.0f, 7.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiplyAdd(N, Scale.v, g_XMNegativeZero.v); + N = XMVectorRound(N); + + pDestination->v = ((UINT64)N.vector4_f32[3] << 60) | + (((UINT64)N.vector4_f32[2] & 0xFFFFF) << 40) | + (((UINT64)N.vector4_f32[1] & 0xFFFFF) << 20) | + (((UINT64)N.vector4_f32[0] & 0xFFFFF)); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + // Note: Masks are x,w,y and z + static const XMVECTORF32 ScaleIcoN4 = {524287.0f,7.0f*4096.0f*65536.0f,524287.0f*4096.0f,524287.0f}; + static const XMVECTORI32 MaskIcoN4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF}; + // Clamp to bounds + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0)); + vResult = _mm_max_ps(vResult,g_XMNegativeOne); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleIcoN4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskIcoN4); + // Shift y and z to straddle the 32-bit boundary + __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8); + // Shift it into place + vResulti2 = _mm_slli_si128(vResulti2,20/8); + // i = x|y<<20|z<<40|w<<60 + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_sd(reinterpret_cast(&pDestination->v),reinterpret_cast(&vResulti)[0]); +#else // 
_XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ + + #undef XM_Scale + #undef XM_URange + #undef XM_Offset + #undef XM_UMaxXYZ + #undef XM_UMaxW +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreIco4 +( + XMICO4* pDestination, + FXMVECTOR V +) +{ + #define XM_Scale (-1.0f / XM_PACK_FACTOR) + #define XM_URange ((FLOAT)(1 << 4)) + #define XM_Offset (3.0f) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTOR Min = {-524287.0f, -524287.0f, -524287.0f, -7.0f}; + static CONST XMVECTOR Max = {524287.0f, 524287.0f, 524287.0f, 7.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, Min, Max); + N = XMVectorRound(N); + + pDestination->v = ((INT64)N.vector4_f32[3] << 60) | + (((INT64)N.vector4_f32[2] & 0xFFFFF) << 40) | + (((INT64)N.vector4_f32[1] & 0xFFFFF) << 20) | + (((INT64)N.vector4_f32[0] & 0xFFFFF)); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + // Note: Masks are x,w,y and z + static const XMVECTORF32 MinIco4 = {-524287.0f,-7.0f,-524287.0f,-524287.0f}; + static const XMVECTORF32 MaxIco4 = { 524287.0f, 7.0f, 524287.0f, 524287.0f}; + static const XMVECTORF32 ScaleIco4 = {1.0f,4096.0f*65536.0f,4096.0f,1.0f}; + static const XMVECTORI32 MaskIco4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF}; + // Clamp to bounds + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0)); + vResult = _mm_max_ps(vResult,MinIco4); + vResult = _mm_min_ps(vResult,MaxIco4); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleIco4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskIco4); + // Shift y and z to straddle the 32-bit boundary + __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8); + // Shift it into place + vResulti2 = _mm_slli_si128(vResulti2,20/8); + // i = x|y<<20|z<<40|w<<60 + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_sd(reinterpret_cast(&pDestination->v),reinterpret_cast(&vResulti)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ + + #undef XM_Scale + #undef XM_URange + #undef XM_Offset +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreXDecN4 +( + XMXDECN4* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f}; + static CONST XMVECTORF32 Scale = {511.0f, 511.0f, 511.0f, 3.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, Min.v, g_XMOne.v); + N = XMVectorMultiply(N, Scale.v); + N = XMVectorRound(N); + + pDestination->v = ((UINT)N.vector4_f32[3] << 30) | + (((INT)N.vector4_f32[2] & 0x3FF) << 20) | + (((INT)N.vector4_f32[1] & 0x3FF) << 10) | + (((INT)N.vector4_f32[0] & 0x3FF)); + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f}; + static const XMVECTORF32 Scale = {511.0f, 511.0f*1024.0f, 511.0f*1048576.0f,3.0f*536870912.0f}; + static const XMVECTORI32 ScaleMask = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<29}; + XMASSERT(pDestination); + XMVECTOR vResult = _mm_max_ps(V,Min); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,Scale); + // Convert to int (W is unsigned) + __m128i vResulti = _mm_cvtps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,ScaleMask); + // To fix W, add itself to shift it up to <<30 instead of <<29 + __m128i vResultw = _mm_and_si128(vResulti,g_XMMaskW); + 
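+    // Scale's W lane is 3*2^29 (3.0f*536870912.0f), not 3*2^30, because the latter exceeds INT_MAX
+    // and the signed conversion above would overflow; the add just below doubles the masked W lane
+    // to make up the factor of two.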
vResulti = _mm_add_epi32(vResulti,vResultw); + // Do a horizontal or of all 4 entries + vResult = _mm_shuffle_ps(reinterpret_cast(&vResulti)[0],reinterpret_cast(&vResulti)[0],_MM_SHUFFLE(0,3,2,1)); + vResulti = _mm_or_si128(vResulti,reinterpret_cast(&vResult)[0]); + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1)); + vResulti = _mm_or_si128(vResulti,reinterpret_cast(&vResult)[0]); + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1)); + vResulti = _mm_or_si128(vResulti,reinterpret_cast(&vResult)[0]); + _mm_store_ss(reinterpret_cast(&pDestination->v),reinterpret_cast(&vResulti)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreXDec4 +( + XMXDEC4* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTOR Min = {-511.0f, -511.0f, -511.0f, 0.0f}; + static CONST XMVECTOR Max = {511.0f, 511.0f, 511.0f, 3.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, Min, Max); + + pDestination->v = ((UINT)N.vector4_f32[3] << 30) | + (((INT)N.vector4_f32[2] & 0x3FF) << 20) | + (((INT)N.vector4_f32[1] & 0x3FF) << 10) | + (((INT)N.vector4_f32[0] & 0x3FF)); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static const XMVECTORF32 MinXDec4 = {-511.0f,-511.0f,-511.0f, 0.0f}; + static const XMVECTORF32 MaxXDec4 = { 511.0f, 511.0f, 511.0f, 3.0f}; + static const XMVECTORF32 ScaleXDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f}; + static const XMVECTORI32 MaskXDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,MinXDec4); + vResult = _mm_min_ps(vResult,MaxXDec4); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleXDec4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskXDec4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // Perform a single bit left shift on y|w + vResulti2 = _mm_add_epi32(vResulti2,vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),reinterpret_cast(&vResulti)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreUDecN4 +( + XMUDECN4* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTORF32 Scale = {1023.0f, 1023.0f, 1023.0f, 3.0f}; + + XMASSERT(pDestination); + + N = XMVectorSaturate(V); + N = XMVectorMultiply(N, Scale.v); + + pDestination->v = ((UINT)N.vector4_f32[3] << 30) | + (((UINT)N.vector4_f32[2] & 0x3FF) << 20) | + (((UINT)N.vector4_f32[1] & 0x3FF) << 10) | + (((UINT)N.vector4_f32[0] & 0x3FF)); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static const XMVECTORF32 ScaleUDecN4 = {1023.0f,1023.0f*1024.0f*0.5f,1023.0f*1024.0f*1024.0f,3.0f*1024.0f*1024.0f*1024.0f*0.5f}; + static const XMVECTORI32 MaskUDecN4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult 
= _mm_mul_ps(vResult,ScaleUDecN4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskUDecN4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // Perform a left shift by one bit on y|w + vResulti2 = _mm_add_epi32(vResulti2,vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),reinterpret_cast(&vResulti)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreUDec4 +( + XMUDEC4* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTOR Max = {1023.0f, 1023.0f, 1023.0f, 3.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, XMVectorZero(), Max); + + pDestination->v = ((UINT)N.vector4_f32[3] << 30) | + (((UINT)N.vector4_f32[2] & 0x3FF) << 20) | + (((UINT)N.vector4_f32[1] & 0x3FF) << 10) | + (((UINT)N.vector4_f32[0] & 0x3FF)); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static const XMVECTORF32 MaxUDec4 = { 1023.0f, 1023.0f, 1023.0f, 3.0f}; + static const XMVECTORF32 ScaleUDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f}; + static const XMVECTORI32 MaskUDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,MaxUDec4); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleUDec4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskUDec4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // Perform a left shift by one bit on y|w + vResulti2 = _mm_add_epi32(vResulti2,vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),reinterpret_cast(&vResulti)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreDecN4 +( + XMDECN4* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTORF32 Scale = {511.0f, 511.0f, 511.0f, 1.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(N, Scale.v); + + pDestination->v = ((INT)N.vector4_f32[3] << 30) | + (((INT)N.vector4_f32[2] & 0x3FF) << 20) | + (((INT)N.vector4_f32[1] & 0x3FF) << 10) | + (((INT)N.vector4_f32[0] & 0x3FF)); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static const XMVECTORF32 ScaleDecN4 = {511.0f,511.0f*1024.0f,511.0f*1024.0f*1024.0f,1.0f*1024.0f*1024.0f*1024.0f}; + static const XMVECTORI32 MaskDecN4= {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<30}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleDecN4); + 
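+    // Negative components survive the truncating conversion below as two's-complement integers;
+    // masking with MaskDecN4 then keeps their low bits, which is exactly the signed field encoding
+    // (e.g. x = -1.0f scales to -511 and packs as 0x201 in the 10-bit x field).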
// Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskDecN4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),reinterpret_cast(&vResulti)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreDec4 +( + XMDEC4* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTOR Min = {-511.0f, -511.0f, -511.0f, -1.0f}; + static CONST XMVECTOR Max = {511.0f, 511.0f, 511.0f, 1.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, Min, Max); + + pDestination->v = ((INT)N.vector4_f32[3] << 30) | + (((INT)N.vector4_f32[2] & 0x3FF) << 20) | + (((INT)N.vector4_f32[1] & 0x3FF) << 10) | + (((INT)N.vector4_f32[0] & 0x3FF)); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static const XMVECTORF32 MinDec4 = {-511.0f,-511.0f,-511.0f,-1.0f}; + static const XMVECTORF32 MaxDec4 = { 511.0f, 511.0f, 511.0f, 1.0f}; + static const XMVECTORF32 ScaleDec4 = {1.0f,1024.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f}; + static const XMVECTORI32 MaskDec4= {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<30}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,MinDec4); + vResult = _mm_min_ps(vResult,MaxDec4); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleDec4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskDec4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),reinterpret_cast(&vResulti)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreUByteN4 +( + XMUBYTEN4* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTORF32 Scale = {255.0f, 255.0f, 255.0f, 255.0f}; + + XMASSERT(pDestination); + + N = XMVectorSaturate(V); + N = XMVectorMultiply(N, Scale.v); + N = XMVectorRound(N); + + pDestination->x = (BYTE)N.vector4_f32[0]; + pDestination->y = (BYTE)N.vector4_f32[1]; + pDestination->z = (BYTE)N.vector4_f32[2]; + pDestination->w = (BYTE)N.vector4_f32[3]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static const XMVECTORF32 ScaleUByteN4 = {255.0f,255.0f*256.0f*0.5f,255.0f*256.0f*256.0f,255.0f*256.0f*256.0f*256.0f*0.5f}; + static const XMVECTORI32 MaskUByteN4 = {0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleUByteN4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = 
_mm_and_si128(vResulti,MaskUByteN4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // Perform a single bit left shift to fix y|w + vResulti2 = _mm_add_epi32(vResulti2,vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),reinterpret_cast(&vResulti)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreUByte4 +( + XMUBYTE4* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTOR Max = {255.0f, 255.0f, 255.0f, 255.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, XMVectorZero(), Max); + N = XMVectorRound(N); + + pDestination->x = (BYTE)N.vector4_f32[0]; + pDestination->y = (BYTE)N.vector4_f32[1]; + pDestination->z = (BYTE)N.vector4_f32[2]; + pDestination->w = (BYTE)N.vector4_f32[3]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static const XMVECTORF32 MaxUByte4 = { 255.0f, 255.0f, 255.0f, 255.0f}; + static const XMVECTORF32 ScaleUByte4 = {1.0f,256.0f*0.5f,256.0f*256.0f,256.0f*256.0f*256.0f*0.5f}; + static const XMVECTORI32 MaskUByte4 = {0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,MaxUByte4); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleUByte4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskUByte4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // Perform a single bit left shift to fix y|w + vResulti2 = _mm_add_epi32(vResulti2,vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),reinterpret_cast(&vResulti)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreByteN4 +( + XMBYTEN4* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTORF32 Scale = {127.0f, 127.0f, 127.0f, 127.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(V, Scale.v); + N = XMVectorRound(N); + + pDestination->x = (CHAR)N.vector4_f32[0]; + pDestination->y = (CHAR)N.vector4_f32[1]; + pDestination->z = (CHAR)N.vector4_f32[2]; + pDestination->w = (CHAR)N.vector4_f32[3]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static const XMVECTORF32 ScaleByteN4 = {127.0f,127.0f*256.0f,127.0f*256.0f*256.0f,127.0f*256.0f*256.0f*256.0f}; + static const XMVECTORI32 MaskByteN4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleByteN4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction 
+ vResulti = _mm_and_si128(vResulti,MaskByteN4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),reinterpret_cast(&vResulti)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreByte4 +( + XMBYTE4* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTOR Min = {-127.0f, -127.0f, -127.0f, -127.0f}; + static CONST XMVECTOR Max = {127.0f, 127.0f, 127.0f, 127.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, Min, Max); + N = XMVectorRound(N); + + pDestination->x = (CHAR)N.vector4_f32[0]; + pDestination->y = (CHAR)N.vector4_f32[1]; + pDestination->z = (CHAR)N.vector4_f32[2]; + pDestination->w = (CHAR)N.vector4_f32[3]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static const XMVECTORF32 MinByte4 = {-127.0f,-127.0f,-127.0f,-127.0f}; + static const XMVECTORF32 MaxByte4 = { 127.0f, 127.0f, 127.0f, 127.0f}; + static const XMVECTORF32 ScaleByte4 = {1.0f,256.0f,256.0f*256.0f,256.0f*256.0f*256.0f}; + static const XMVECTORI32 MaskByte4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,MinByte4); + vResult = _mm_min_ps(vResult,MaxByte4); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleByte4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskByte4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),reinterpret_cast(&vResulti)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreUNibble4 +( + XMUNIBBLE4* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + XMASSERT(pDestination); + static CONST XMVECTORF32 Max = {15.0f,15.0f,15.0f,15.0f}; + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,Max); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // No SSE operations will write to 16-bit values, so we have to extract them manually + USHORT x = static_cast(_mm_extract_epi16(vInt,0)); + USHORT y = static_cast(_mm_extract_epi16(vInt,2)); + USHORT z = static_cast(_mm_extract_epi16(vInt,4)); + USHORT w = static_cast(_mm_extract_epi16(vInt,6)); + pDestination->v = ((w & 0xF) << 12) | + ((z & 0xF) << 8) | + ((y & 0xF) << 4) | + ((x & 0xF)); +#else + XMVECTOR N; + static CONST XMVECTORF32 Max = {15.0f,15.0f,15.0f,15.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, XMVectorZero(), Max.v); + N = XMVectorRound(N); + + pDestination->v = (((USHORT)N.vector4_f32[3] & 0xF) << 12) | + (((USHORT)N.vector4_f32[2] & 0xF) << 8) | + (((USHORT)N.vector4_f32[1] & 
0xF) << 4) | + (((USHORT)N.vector4_f32[0] & 0xF)); +#endif !_XM_SSE_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreU555( + XMU555* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + XMASSERT(pDestination); + static CONST XMVECTORF32 Max = {31.0f, 31.0f, 31.0f, 1.0f}; + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,Max); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // No SSE operations will write to 16-bit values, so we have to extract them manually + USHORT x = static_cast(_mm_extract_epi16(vInt,0)); + USHORT y = static_cast(_mm_extract_epi16(vInt,2)); + USHORT z = static_cast(_mm_extract_epi16(vInt,4)); + USHORT w = static_cast(_mm_extract_epi16(vInt,6)); + pDestination->v = ((w) ? 0x8000 : 0) | + ((z & 0x1F) << 10) | + ((y & 0x1F) << 5) | + ((x & 0x1F)); +#else + XMVECTOR N; + static CONST XMVECTORF32 Max = {31.0f, 31.0f, 31.0f, 1.0f}; + + XMASSERT(pDestination); + + N = XMVectorClamp(V, XMVectorZero(), Max.v); + N = XMVectorRound(N); + + pDestination->v = ((N.vector4_f32[3] > 0.f) ? 0x8000 : 0) | + (((USHORT)N.vector4_f32[2] & 0x1F) << 10) | + (((USHORT)N.vector4_f32[1] & 0x1F) << 5) | + (((USHORT)N.vector4_f32[0] & 0x1F)); +#endif !_XM_SSE_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreColor +( + XMCOLOR* pDestination, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + static CONST XMVECTORF32 Scale = {255.0f, 255.0f, 255.0f, 255.0f}; + + XMASSERT(pDestination); + + N = XMVectorSaturate(V); + N = XMVectorMultiply(N, Scale.v); + N = XMVectorRound(N); + + pDestination->c = ((UINT)N.vector4_f32[3] << 24) | + ((UINT)N.vector4_f32[0] << 16) | + ((UINT)N.vector4_f32[1] << 8) | + ((UINT)N.vector4_f32[2]); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + static CONST XMVECTORF32 Scale = {255.0f,255.0f,255.0f,255.0f}; + // Set <0 to 0 + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + // Set>1 to 1 + vResult = _mm_min_ps(vResult,g_XMOne); + // Convert to 0-255 + vResult = _mm_mul_ps(vResult,Scale); + // Shuffle RGBA to ARGB + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,0,1,2)); + // Convert to int + __m128i vInt = _mm_cvtps_epi32(vResult); + // Mash to shorts + vInt = _mm_packs_epi32(vInt,vInt); + // Mash to bytes + vInt = _mm_packus_epi16(vInt,vInt); + // Store the color + _mm_store_ss(reinterpret_cast(&pDestination->c),reinterpret_cast<__m128 *>(&vInt)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreFloat3x3 +( + XMFLOAT3X3* pDestination, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_SSE_INTRINSICS_) + + XMStoreFloat3x3NC(pDestination, M); + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreFloat3x3NC +( + XMFLOAT3X3* pDestination, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = 
M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + XMVECTOR vTemp1 = M.r[0]; + XMVECTOR vTemp2 = M.r[1]; + XMVECTOR vTemp3 = M.r[2]; + XMVECTOR vWork = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,0,2,2)); + vTemp1 = _mm_shuffle_ps(vTemp1,vWork,_MM_SHUFFLE(2,0,1,0)); + _mm_storeu_ps(&pDestination->m[0][0],vTemp1); + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1)); + _mm_storeu_ps(&pDestination->m[1][1],vTemp2); + vTemp3 = _mm_shuffle_ps(vTemp3,vTemp3,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss(&pDestination->m[2][2],vTemp3); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreFloat4x3 +( + XMFLOAT4X3* pDestination, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_SSE_INTRINSICS_) + + XMStoreFloat4x3NC(pDestination, M); + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreFloat4x3A +( + XMFLOAT4X3A* pDestination, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 0xF) == 0); + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 0xF) == 0); + // x1,y1,z1,w1 + XMVECTOR vTemp1 = M.r[0]; + // x2,y2,z2,w2 + XMVECTOR vTemp2 = M.r[1]; + // x3,y3,z3,w3 + XMVECTOR vTemp3 = M.r[2]; + // x4,y4,z4,w4 + XMVECTOR vTemp4 = M.r[3]; + // z1,z1,x2,y2 + XMVECTOR vTemp = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(1,0,2,2)); + // y2,z2,x3,y3 (Final) + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1)); + // x1,y1,z1,x2 (Final) + vTemp1 = _mm_shuffle_ps(vTemp1,vTemp,_MM_SHUFFLE(2,0,1,0)); + // z3,z3,x4,x4 + vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2)); + // z3,x4,y4,z4 (Final) + vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0)); + // Store in 3 operations + _mm_store_ps(&pDestination->m[0][0],vTemp1); + _mm_store_ps(&pDestination->m[1][1],vTemp2); + _mm_store_ps(&pDestination->m[2][2],vTemp3); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreFloat4x3NC +( + XMFLOAT4X3* pDestination, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + 
pDestination->m[1][2] = M.r[1].vector4_f32[2]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + XMVECTOR vTemp1 = M.r[0]; + XMVECTOR vTemp2 = M.r[1]; + XMVECTOR vTemp3 = M.r[2]; + XMVECTOR vTemp4 = M.r[3]; + XMVECTOR vTemp2x = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1)); + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(2,2,0,0)); + vTemp1 = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,2,1,0)); + vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2)); + vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0)); + _mm_storeu_ps(&pDestination->m[0][0],vTemp1); + _mm_storeu_ps(&pDestination->m[1][1],vTemp2x); + _mm_storeu_ps(&pDestination->m[2][2],vTemp3); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreFloat4x4 +( + XMFLOAT4X4* pDestination, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) + + XMStoreFloat4x4NC(pDestination, M); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + + _mm_storeu_ps( &pDestination->_11, M.r[0] ); + _mm_storeu_ps( &pDestination->_21, M.r[1] ); + _mm_storeu_ps( &pDestination->_31, M.r[2] ); + _mm_storeu_ps( &pDestination->_41, M.r[3] ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreFloat4x4A +( + XMFLOAT4X4A* pDestination, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 0xF) == 0); + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + pDestination->m[0][3] = M.r[0].vector4_f32[3]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + pDestination->m[1][3] = M.r[1].vector4_f32[3]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + pDestination->m[2][3] = M.r[2].vector4_f32[3]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + pDestination->m[3][3] = M.r[3].vector4_f32[3]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + + _mm_store_ps( &pDestination->_11, M.r[0] ); + _mm_store_ps( &pDestination->_21, M.r[1] ); + _mm_store_ps( &pDestination->_31, M.r[2] ); + _mm_store_ps( &pDestination->_41, M.r[3] ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMStoreFloat4x4NC +( + XMFLOAT4X4* pDestination, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMASSERT(pDestination); + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + pDestination->m[0][3] = M.r[0].vector4_f32[3]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + 
pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + pDestination->m[1][3] = M.r[1].vector4_f32[3]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + pDestination->m[2][3] = M.r[2].vector4_f32[3]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + pDestination->m[3][3] = M.r[3].vector4_f32[3]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + _mm_storeu_ps(&pDestination->m[0][0],M.r[0]); + _mm_storeu_ps(&pDestination->m[1][0],M.r[1]); + _mm_storeu_ps(&pDestination->m[2][0],M.r[2]); + _mm_storeu_ps(&pDestination->m[3][0],M.r[3]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +#endif // __XNAMATHCONVERT_INL__ + diff --git a/thirdparty/directxtex/XNAMath/xnamathmatrix.inl b/thirdparty/directxtex/XNAMath/xnamathmatrix.inl new file mode 100644 index 0000000..eb9f164 --- /dev/null +++ b/thirdparty/directxtex/XNAMath/xnamathmatrix.inl @@ -0,0 +1,3293 @@ +/************************************************************************ +* * +* xnamathmatrix.inl -- SIMD C++ Math library for Windows and Xbox 360 * +* Matrix functions * +* * +* Copyright (c) Microsoft Corp. All rights reserved. * +* * +************************************************************************/ + +#if defined(_MSC_VER) && (_MSC_VER > 1000) +#pragma once +#endif + +#ifndef __XNAMATHMATRIX_INL__ +#define __XNAMATHMATRIX_INL__ + +/**************************************************************************** + * + * Matrix + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +// Return TRUE if any entry in the matrix is NaN +XMFINLINE BOOL XMMatrixIsNaN +( + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + UINT i, uTest; + const UINT *pWork; + + i = 16; + pWork = (const UINT *)(&M.m[0][0]); + do { + // Fetch value into integer unit + uTest = pWork[0]; + // Remove sign + uTest &= 0x7FFFFFFFU; + // NaN is 0x7F800001 through 0x7FFFFFFF inclusive + uTest -= 0x7F800001U; + if (uTest<0x007FFFFFU) { + break; // NaN found + } + ++pWork; // Next entry + } while (--i); + return (i!=0); // i == 0 if nothing matched +#elif defined(_XM_SSE_INTRINSICS_) + // Load in registers + XMVECTOR vX = M.r[0]; + XMVECTOR vY = M.r[1]; + XMVECTOR vZ = M.r[2]; + XMVECTOR vW = M.r[3]; + // Test themselves to check for NaN + vX = _mm_cmpneq_ps(vX,vX); + vY = _mm_cmpneq_ps(vY,vY); + vZ = _mm_cmpneq_ps(vZ,vZ); + vW = _mm_cmpneq_ps(vW,vW); + // Or all the results + vX = _mm_or_ps(vX,vZ); + vY = _mm_or_ps(vY,vW); + vX = _mm_or_ps(vX,vY); + // If any tested true, return true + return (_mm_movemask_ps(vX)!=0); +#else +#endif +} + +//------------------------------------------------------------------------------ + +// Return TRUE if any entry in the matrix is +/-INF +XMFINLINE BOOL XMMatrixIsInfinite +( + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + UINT i, uTest; + const UINT *pWork; + + i = 16; + pWork = (const UINT *)(&M.m[0][0]); + do { + // Fetch value into integer unit + uTest = pWork[0]; + // Remove sign + uTest &= 0x7FFFFFFFU; + // INF is 0x7F800000 + if 
(uTest==0x7F800000U) { + break; // INF found + } + ++pWork; // Next entry + } while (--i); + return (i!=0); // i == 0 if nothing matched +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bits + XMVECTOR vTemp1 = _mm_and_ps(M.r[0],g_XMAbsMask); + XMVECTOR vTemp2 = _mm_and_ps(M.r[1],g_XMAbsMask); + XMVECTOR vTemp3 = _mm_and_ps(M.r[2],g_XMAbsMask); + XMVECTOR vTemp4 = _mm_and_ps(M.r[3],g_XMAbsMask); + // Compare to infinity + vTemp1 = _mm_cmpeq_ps(vTemp1,g_XMInfinity); + vTemp2 = _mm_cmpeq_ps(vTemp2,g_XMInfinity); + vTemp3 = _mm_cmpeq_ps(vTemp3,g_XMInfinity); + vTemp4 = _mm_cmpeq_ps(vTemp4,g_XMInfinity); + // Or the answers together + vTemp1 = _mm_or_ps(vTemp1,vTemp2); + vTemp3 = _mm_or_ps(vTemp3,vTemp4); + vTemp1 = _mm_or_ps(vTemp1,vTemp3); + // If any are infinity, the signs are true. + return (_mm_movemask_ps(vTemp1)!=0); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Return TRUE if the XMMatrix is equal to identity +XMFINLINE BOOL XMMatrixIsIdentity +( + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + unsigned int uOne, uZero; + const unsigned int *pWork; + + // Use the integer pipeline to reduce branching to a minimum + pWork = (const unsigned int*)(&M.m[0][0]); + // Convert 1.0f to zero and or them together + uOne = pWork[0]^0x3F800000U; + // Or all the 0.0f entries together + uZero = pWork[1]; + uZero |= pWork[2]; + uZero |= pWork[3]; + // 2nd row + uZero |= pWork[4]; + uOne |= pWork[5]^0x3F800000U; + uZero |= pWork[6]; + uZero |= pWork[7]; + // 3rd row + uZero |= pWork[8]; + uZero |= pWork[9]; + uOne |= pWork[10]^0x3F800000U; + uZero |= pWork[11]; + // 4th row + uZero |= pWork[12]; + uZero |= pWork[13]; + uZero |= pWork[14]; + uOne |= pWork[15]^0x3F800000U; + // If all zero entries are zero, the uZero==0 + uZero &= 0x7FFFFFFF; // Allow -0.0f + // If all 1.0f entries are 1.0f, then uOne==0 + uOne |= uZero; + return (uOne==0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = _mm_cmpeq_ps(M.r[0],g_XMIdentityR0); + XMVECTOR vTemp2 = _mm_cmpeq_ps(M.r[1],g_XMIdentityR1); + XMVECTOR vTemp3 = _mm_cmpeq_ps(M.r[2],g_XMIdentityR2); + XMVECTOR vTemp4 = _mm_cmpeq_ps(M.r[3],g_XMIdentityR3); + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + vTemp3 = _mm_and_ps(vTemp3,vTemp4); + vTemp1 = _mm_and_ps(vTemp1,vTemp3); + return (_mm_movemask_ps(vTemp1)==0x0f); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Perform a 4x4 matrix multiply by a 4x4 matrix +XMFINLINE XMMATRIX XMMatrixMultiply +( + CXMMATRIX M1, + CXMMATRIX M2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMMATRIX mResult; + // Cache the invariants in registers + float x = M1.m[0][0]; + float y = M1.m[0][1]; + float z = M1.m[0][2]; + float w = M1.m[0][3]; + // Perform the operation on the first row + mResult.m[0][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); + mResult.m[0][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); + mResult.m[0][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); + mResult.m[0][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); + // Repeat for all the other rows + x = M1.m[1][0]; + y = M1.m[1][1]; + z = M1.m[1][2]; + w = M1.m[1][3]; + 
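+    // Each result row i is x*M2.row0 + y*M2.row1 + z*M2.row2 + w*M2.row3, with (x,y,z,w)
+    // taken from row i of M1 (row-vector convention).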
mResult.m[1][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); + mResult.m[1][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); + mResult.m[1][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); + mResult.m[1][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); + x = M1.m[2][0]; + y = M1.m[2][1]; + z = M1.m[2][2]; + w = M1.m[2][3]; + mResult.m[2][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); + mResult.m[2][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); + mResult.m[2][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); + mResult.m[2][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); + x = M1.m[3][0]; + y = M1.m[3][1]; + z = M1.m[3][2]; + w = M1.m[3][3]; + mResult.m[3][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); + mResult.m[3][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); + mResult.m[3][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); + mResult.m[3][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); + return mResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX mResult; + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vY = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the opertion on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + // Perform a binary add to reduce cumulative errors + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + mResult.r[0] = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + mResult.r[1] = vX; + vW = M1.r[2]; + vX = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + mResult.r[2] = vX; + vW = M1.r[3]; + vX = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + mResult.r[3] = vX; + return mResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMMatrixMultiplyTranspose +( + CXMMATRIX M1, + CXMMATRIX M2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMMATRIX mResult; + // Cache the invariants in registers + 
float x = M2.m[0][0]; + float y = M2.m[1][0]; + float z = M2.m[2][0]; + float w = M2.m[3][0]; + // Perform the operation on the first row + mResult.m[0][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); + mResult.m[0][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); + mResult.m[0][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); + mResult.m[0][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); + // Repeat for all the other rows + x = M2.m[0][1]; + y = M2.m[1][1]; + z = M2.m[2][1]; + w = M2.m[3][1]; + mResult.m[1][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); + mResult.m[1][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); + mResult.m[1][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); + mResult.m[1][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); + x = M2.m[0][2]; + y = M2.m[1][2]; + z = M2.m[2][2]; + w = M2.m[3][2]; + mResult.m[2][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); + mResult.m[2][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); + mResult.m[2][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); + mResult.m[2][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); + x = M2.m[0][3]; + y = M2.m[1][3]; + z = M2.m[2][3]; + w = M2.m[3][3]; + mResult.m[3][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); + mResult.m[3][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); + mResult.m[3][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); + mResult.m[3][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); + return mResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX Product; + XMMATRIX Result; + Product = XMMatrixMultiply(M1, M2); + Result = XMMatrixTranspose(Product); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMMatrixTranspose +( + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX P; + XMMATRIX MT; + + // Original matrix: + // + // m00m01m02m03 + // m10m11m12m13 + // m20m21m22m23 + // m30m31m32m33 + + P.r[0] = XMVectorMergeXY(M.r[0], M.r[2]); // m00m20m01m21 + P.r[1] = XMVectorMergeXY(M.r[1], M.r[3]); // m10m30m11m31 + P.r[2] = XMVectorMergeZW(M.r[0], M.r[2]); // m02m22m03m23 + P.r[3] = XMVectorMergeZW(M.r[1], M.r[3]); // m12m32m13m33 + + MT.r[0] = XMVectorMergeXY(P.r[0], P.r[1]); // m00m10m20m30 + MT.r[1] = XMVectorMergeZW(P.r[0], P.r[1]); // m01m11m21m31 + MT.r[2] = XMVectorMergeXY(P.r[2], P.r[3]); // m02m12m22m32 + MT.r[3] = XMVectorMergeZW(P.r[2], P.r[3]); // m03m13m23m33 + + return MT; + +#elif defined(_XM_SSE_INTRINSICS_) + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0],M.r[1],_MM_SHUFFLE(1,0,1,0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0],M.r[1],_MM_SHUFFLE(3,2,3,2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2],M.r[3],_MM_SHUFFLE(1,0,1,0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2],M.r[3],_MM_SHUFFLE(3,2,3,2)); + XMMATRIX mResult; + + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); + 
return mResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Return the inverse and the determinant of a 4x4 matrix +XMINLINE XMMATRIX XMMatrixInverse +( + XMVECTOR* pDeterminant, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX R; + XMMATRIX MT; + XMVECTOR D0, D1, D2; + XMVECTOR C0, C1, C2, C3, C4, C5, C6, C7; + XMVECTOR V0[4], V1[4]; + XMVECTOR Determinant; + XMVECTOR Reciprocal; + XMMATRIX Result; + static CONST XMVECTORU32 SwizzleXXYY = {XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0Y}; + static CONST XMVECTORU32 SwizzleZWZW = {XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_0Z, XM_PERMUTE_0W}; + static CONST XMVECTORU32 SwizzleYZXY = {XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0X, XM_PERMUTE_0Y}; + static CONST XMVECTORU32 SwizzleZWYZ = {XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_0Y, XM_PERMUTE_0Z}; + static CONST XMVECTORU32 SwizzleWXWX = {XM_PERMUTE_0W, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0X}; + static CONST XMVECTORU32 SwizzleZXYX = {XM_PERMUTE_0Z, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0X}; + static CONST XMVECTORU32 SwizzleYWXZ = {XM_PERMUTE_0Y, XM_PERMUTE_0W, XM_PERMUTE_0X, XM_PERMUTE_0Z}; + static CONST XMVECTORU32 SwizzleWZWY = {XM_PERMUTE_0W, XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_0Y}; + static CONST XMVECTORU32 Permute0X0Z1X1Z = {XM_PERMUTE_0X, XM_PERMUTE_0Z, XM_PERMUTE_1X, XM_PERMUTE_1Z}; + static CONST XMVECTORU32 Permute0Y0W1Y1W = {XM_PERMUTE_0Y, XM_PERMUTE_0W, XM_PERMUTE_1Y, XM_PERMUTE_1W}; + static CONST XMVECTORU32 Permute1Y0Y0W0X = {XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_0W, XM_PERMUTE_0X}; + static CONST XMVECTORU32 Permute0W0X0Y1X = {XM_PERMUTE_0W, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_1X}; + static CONST XMVECTORU32 Permute0Z1Y1X0Z = {XM_PERMUTE_0Z, XM_PERMUTE_1Y, XM_PERMUTE_1X, XM_PERMUTE_0Z}; + static CONST XMVECTORU32 Permute0W1Y0Y0Z = {XM_PERMUTE_0W, XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_0Z}; + static CONST XMVECTORU32 Permute0Z0Y1X0X = {XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_0X}; + static CONST XMVECTORU32 Permute1Y0X0W1X = {XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_1X}; + static CONST XMVECTORU32 Permute1W0Y0W0X = {XM_PERMUTE_1W, XM_PERMUTE_0Y, XM_PERMUTE_0W, XM_PERMUTE_0X}; + static CONST XMVECTORU32 Permute0W0X0Y1Z = {XM_PERMUTE_0W, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_1Z}; + static CONST XMVECTORU32 Permute0Z1W1Z0Z = {XM_PERMUTE_0Z, XM_PERMUTE_1W, XM_PERMUTE_1Z, XM_PERMUTE_0Z}; + static CONST XMVECTORU32 Permute0W1W0Y0Z = {XM_PERMUTE_0W, XM_PERMUTE_1W, XM_PERMUTE_0Y, XM_PERMUTE_0Z}; + static CONST XMVECTORU32 Permute0Z0Y1Z0X = {XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_1Z, XM_PERMUTE_0X}; + static CONST XMVECTORU32 Permute1W0X0W1Z = {XM_PERMUTE_1W, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_1Z}; + + MT = XMMatrixTranspose(M); + + V0[0] = XMVectorPermute(MT.r[2], MT.r[2], SwizzleXXYY.v); + V1[0] = XMVectorPermute(MT.r[3], MT.r[3], SwizzleZWZW.v); + V0[1] = XMVectorPermute(MT.r[0], MT.r[0], SwizzleXXYY.v); + V1[1] = XMVectorPermute(MT.r[1], MT.r[1], SwizzleZWZW.v); + V0[2] = XMVectorPermute(MT.r[2], MT.r[0], Permute0X0Z1X1Z.v); + V1[2] = XMVectorPermute(MT.r[3], MT.r[1], Permute0Y0W1Y1W.v); + + D0 = XMVectorMultiply(V0[0], V1[0]); + D1 = XMVectorMultiply(V0[1], V1[1]); + D2 = XMVectorMultiply(V0[2], V1[2]); + + V0[0] = XMVectorPermute(MT.r[2], MT.r[2], SwizzleZWZW.v); + V1[0] = XMVectorPermute(MT.r[3], MT.r[3], SwizzleXXYY.v); + V0[1] = XMVectorPermute(MT.r[0], MT.r[0], 
SwizzleZWZW.v); + V1[1] = XMVectorPermute(MT.r[1], MT.r[1], SwizzleXXYY.v); + V0[2] = XMVectorPermute(MT.r[2], MT.r[0], Permute0Y0W1Y1W.v); + V1[2] = XMVectorPermute(MT.r[3], MT.r[1], Permute0X0Z1X1Z.v); + + D0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], D0); + D1 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], D1); + D2 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], D2); + + V0[0] = XMVectorPermute(MT.r[1], MT.r[1], SwizzleYZXY.v); + V1[0] = XMVectorPermute(D0, D2, Permute1Y0Y0W0X.v); + V0[1] = XMVectorPermute(MT.r[0], MT.r[0], SwizzleZXYX.v); + V1[1] = XMVectorPermute(D0, D2, Permute0W1Y0Y0Z.v); + V0[2] = XMVectorPermute(MT.r[3], MT.r[3], SwizzleYZXY.v); + V1[2] = XMVectorPermute(D1, D2, Permute1W0Y0W0X.v); + V0[3] = XMVectorPermute(MT.r[2], MT.r[2], SwizzleZXYX.v); + V1[3] = XMVectorPermute(D1, D2, Permute0W1W0Y0Z.v); + + C0 = XMVectorMultiply(V0[0], V1[0]); + C2 = XMVectorMultiply(V0[1], V1[1]); + C4 = XMVectorMultiply(V0[2], V1[2]); + C6 = XMVectorMultiply(V0[3], V1[3]); + + V0[0] = XMVectorPermute(MT.r[1], MT.r[1], SwizzleZWYZ.v); + V1[0] = XMVectorPermute(D0, D2, Permute0W0X0Y1X.v); + V0[1] = XMVectorPermute(MT.r[0], MT.r[0], SwizzleWZWY.v); + V1[1] = XMVectorPermute(D0, D2, Permute0Z0Y1X0X.v); + V0[2] = XMVectorPermute(MT.r[3], MT.r[3], SwizzleZWYZ.v); + V1[2] = XMVectorPermute(D1, D2, Permute0W0X0Y1Z.v); + V0[3] = XMVectorPermute(MT.r[2], MT.r[2], SwizzleWZWY.v); + V1[3] = XMVectorPermute(D1, D2, Permute0Z0Y1Z0X.v); + + C0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0); + C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2); + C4 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4); + C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6); + + V0[0] = XMVectorPermute(MT.r[1], MT.r[1], SwizzleWXWX.v); + V1[0] = XMVectorPermute(D0, D2, Permute0Z1Y1X0Z.v); + V0[1] = XMVectorPermute(MT.r[0], MT.r[0], SwizzleYWXZ.v); + V1[1] = XMVectorPermute(D0, D2, Permute1Y0X0W1X.v); + V0[2] = XMVectorPermute(MT.r[3], MT.r[3], SwizzleWXWX.v); + V1[2] = XMVectorPermute(D1, D2, Permute0Z1W1Z0Z.v); + V0[3] = XMVectorPermute(MT.r[2], MT.r[2], SwizzleYWXZ.v); + V1[3] = XMVectorPermute(D1, D2, Permute1W0X0W1Z.v); + + C1 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0); + C0 = XMVectorMultiplyAdd(V0[0], V1[0], C0); + C3 = XMVectorMultiplyAdd(V0[1], V1[1], C2); + C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2); + C5 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4); + C4 = XMVectorMultiplyAdd(V0[2], V1[2], C4); + C7 = XMVectorMultiplyAdd(V0[3], V1[3], C6); + C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6); + + R.r[0] = XMVectorSelect(C0, C1, g_XMSelect0101.v); + R.r[1] = XMVectorSelect(C2, C3, g_XMSelect0101.v); + R.r[2] = XMVectorSelect(C4, C5, g_XMSelect0101.v); + R.r[3] = XMVectorSelect(C6, C7, g_XMSelect0101.v); + + Determinant = XMVector4Dot(R.r[0], MT.r[0]); + + if (pDeterminant) + *pDeterminant = Determinant; + + Reciprocal = XMVectorReciprocal(Determinant); + + Result.r[0] = XMVectorMultiply(R.r[0], Reciprocal); + Result.r[1] = XMVectorMultiply(R.r[1], Reciprocal); + Result.r[2] = XMVectorMultiply(R.r[2], Reciprocal); + Result.r[3] = XMVectorMultiply(R.r[3], Reciprocal); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX MT = XMMatrixTranspose(M); + XMVECTOR V00 = _mm_shuffle_ps(MT.r[2], MT.r[2],_MM_SHUFFLE(1,1,0,0)); + XMVECTOR V10 = _mm_shuffle_ps(MT.r[3], MT.r[3],_MM_SHUFFLE(3,2,3,2)); + XMVECTOR V01 = _mm_shuffle_ps(MT.r[0], MT.r[0],_MM_SHUFFLE(1,1,0,0)); + XMVECTOR V11 = _mm_shuffle_ps(MT.r[1], MT.r[1],_MM_SHUFFLE(3,2,3,2)); + XMVECTOR 
V02 = _mm_shuffle_ps(MT.r[2], MT.r[0],_MM_SHUFFLE(2,0,2,0)); + XMVECTOR V12 = _mm_shuffle_ps(MT.r[3], MT.r[1],_MM_SHUFFLE(3,1,3,1)); + + XMVECTOR D0 = _mm_mul_ps(V00,V10); + XMVECTOR D1 = _mm_mul_ps(V01,V11); + XMVECTOR D2 = _mm_mul_ps(V02,V12); + + V00 = _mm_shuffle_ps(MT.r[2],MT.r[2],_MM_SHUFFLE(3,2,3,2)); + V10 = _mm_shuffle_ps(MT.r[3],MT.r[3],_MM_SHUFFLE(1,1,0,0)); + V01 = _mm_shuffle_ps(MT.r[0],MT.r[0],_MM_SHUFFLE(3,2,3,2)); + V11 = _mm_shuffle_ps(MT.r[1],MT.r[1],_MM_SHUFFLE(1,1,0,0)); + V02 = _mm_shuffle_ps(MT.r[2],MT.r[0],_MM_SHUFFLE(3,1,3,1)); + V12 = _mm_shuffle_ps(MT.r[3],MT.r[1],_MM_SHUFFLE(2,0,2,0)); + + V00 = _mm_mul_ps(V00,V10); + V01 = _mm_mul_ps(V01,V11); + V02 = _mm_mul_ps(V02,V12); + D0 = _mm_sub_ps(D0,V00); + D1 = _mm_sub_ps(D1,V01); + D2 = _mm_sub_ps(D2,V02); + // V11 = D0Y,D0W,D2Y,D2Y + V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,1,3,1)); + V00 = _mm_shuffle_ps(MT.r[1], MT.r[1],_MM_SHUFFLE(1,0,2,1)); + V10 = _mm_shuffle_ps(V11,D0,_MM_SHUFFLE(0,3,0,2)); + V01 = _mm_shuffle_ps(MT.r[0], MT.r[0],_MM_SHUFFLE(0,1,0,2)); + V11 = _mm_shuffle_ps(V11,D0,_MM_SHUFFLE(2,1,2,1)); + // V13 = D1Y,D1W,D2W,D2W + XMVECTOR V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,3,3,1)); + V02 = _mm_shuffle_ps(MT.r[3], MT.r[3],_MM_SHUFFLE(1,0,2,1)); + V12 = _mm_shuffle_ps(V13,D1,_MM_SHUFFLE(0,3,0,2)); + XMVECTOR V03 = _mm_shuffle_ps(MT.r[2], MT.r[2],_MM_SHUFFLE(0,1,0,2)); + V13 = _mm_shuffle_ps(V13,D1,_MM_SHUFFLE(2,1,2,1)); + + XMVECTOR C0 = _mm_mul_ps(V00,V10); + XMVECTOR C2 = _mm_mul_ps(V01,V11); + XMVECTOR C4 = _mm_mul_ps(V02,V12); + XMVECTOR C6 = _mm_mul_ps(V03,V13); + + // V11 = D0X,D0Y,D2X,D2X + V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(0,0,1,0)); + V00 = _mm_shuffle_ps(MT.r[1], MT.r[1],_MM_SHUFFLE(2,1,3,2)); + V10 = _mm_shuffle_ps(D0,V11,_MM_SHUFFLE(2,1,0,3)); + V01 = _mm_shuffle_ps(MT.r[0], MT.r[0],_MM_SHUFFLE(1,3,2,3)); + V11 = _mm_shuffle_ps(D0,V11,_MM_SHUFFLE(0,2,1,2)); + // V13 = D1X,D1Y,D2Z,D2Z + V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(2,2,1,0)); + V02 = _mm_shuffle_ps(MT.r[3], MT.r[3],_MM_SHUFFLE(2,1,3,2)); + V12 = _mm_shuffle_ps(D1,V13,_MM_SHUFFLE(2,1,0,3)); + V03 = _mm_shuffle_ps(MT.r[2], MT.r[2],_MM_SHUFFLE(1,3,2,3)); + V13 = _mm_shuffle_ps(D1,V13,_MM_SHUFFLE(0,2,1,2)); + + V00 = _mm_mul_ps(V00,V10); + V01 = _mm_mul_ps(V01,V11); + V02 = _mm_mul_ps(V02,V12); + V03 = _mm_mul_ps(V03,V13); + C0 = _mm_sub_ps(C0,V00); + C2 = _mm_sub_ps(C2,V01); + C4 = _mm_sub_ps(C4,V02); + C6 = _mm_sub_ps(C6,V03); + + V00 = _mm_shuffle_ps(MT.r[1],MT.r[1],_MM_SHUFFLE(0,3,0,3)); + // V10 = D0Z,D0Z,D2X,D2Y + V10 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,0,2,2)); + V10 = _mm_shuffle_ps(V10,V10,_MM_SHUFFLE(0,2,3,0)); + V01 = _mm_shuffle_ps(MT.r[0],MT.r[0],_MM_SHUFFLE(2,0,3,1)); + // V11 = D0X,D0W,D2X,D2Y + V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,0,3,0)); + V11 = _mm_shuffle_ps(V11,V11,_MM_SHUFFLE(2,1,0,3)); + V02 = _mm_shuffle_ps(MT.r[3],MT.r[3],_MM_SHUFFLE(0,3,0,3)); + // V12 = D1Z,D1Z,D2Z,D2W + V12 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,2,2,2)); + V12 = _mm_shuffle_ps(V12,V12,_MM_SHUFFLE(0,2,3,0)); + V03 = _mm_shuffle_ps(MT.r[2],MT.r[2],_MM_SHUFFLE(2,0,3,1)); + // V13 = D1X,D1W,D2Z,D2W + V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,2,3,0)); + V13 = _mm_shuffle_ps(V13,V13,_MM_SHUFFLE(2,1,0,3)); + + V00 = _mm_mul_ps(V00,V10); + V01 = _mm_mul_ps(V01,V11); + V02 = _mm_mul_ps(V02,V12); + V03 = _mm_mul_ps(V03,V13); + XMVECTOR C1 = _mm_sub_ps(C0,V00); + C0 = _mm_add_ps(C0,V00); + XMVECTOR C3 = _mm_add_ps(C2,V01); + C2 = _mm_sub_ps(C2,V01); + XMVECTOR C5 = _mm_sub_ps(C4,V02); + C4 = _mm_add_ps(C4,V02); + XMVECTOR C7 = _mm_add_ps(C6,V03); 
+ C6 = _mm_sub_ps(C6,V03); + + C0 = _mm_shuffle_ps(C0,C1,_MM_SHUFFLE(3,1,2,0)); + C2 = _mm_shuffle_ps(C2,C3,_MM_SHUFFLE(3,1,2,0)); + C4 = _mm_shuffle_ps(C4,C5,_MM_SHUFFLE(3,1,2,0)); + C6 = _mm_shuffle_ps(C6,C7,_MM_SHUFFLE(3,1,2,0)); + C0 = _mm_shuffle_ps(C0,C0,_MM_SHUFFLE(3,1,2,0)); + C2 = _mm_shuffle_ps(C2,C2,_MM_SHUFFLE(3,1,2,0)); + C4 = _mm_shuffle_ps(C4,C4,_MM_SHUFFLE(3,1,2,0)); + C6 = _mm_shuffle_ps(C6,C6,_MM_SHUFFLE(3,1,2,0)); + // Get the determinate + XMVECTOR vTemp = XMVector4Dot(C0,MT.r[0]); + if (pDeterminant) + *pDeterminant = vTemp; + vTemp = _mm_div_ps(g_XMOne,vTemp); + XMMATRIX mResult; + mResult.r[0] = _mm_mul_ps(C0,vTemp); + mResult.r[1] = _mm_mul_ps(C2,vTemp); + mResult.r[2] = _mm_mul_ps(C4,vTemp); + mResult.r[3] = _mm_mul_ps(C6,vTemp); + return mResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMVECTOR XMMatrixDeterminant +( + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V0, V1, V2, V3, V4, V5; + XMVECTOR P0, P1, P2, R, S; + XMVECTOR Result; + static CONST XMVECTORU32 SwizzleYXXX = {XM_PERMUTE_0Y, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X}; + static CONST XMVECTORU32 SwizzleZZYY = {XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_0Y}; + static CONST XMVECTORU32 SwizzleWWWZ = {XM_PERMUTE_0W, XM_PERMUTE_0W, XM_PERMUTE_0W, XM_PERMUTE_0Z}; + static CONST XMVECTOR Sign = {1.0f, -1.0f, 1.0f, -1.0f}; + + V0 = XMVectorPermute(M.r[2], M.r[2], SwizzleYXXX.v); + V1 = XMVectorPermute(M.r[3], M.r[3], SwizzleZZYY.v); + V2 = XMVectorPermute(M.r[2], M.r[2], SwizzleYXXX.v); + V3 = XMVectorPermute(M.r[3], M.r[3], SwizzleWWWZ.v); + V4 = XMVectorPermute(M.r[2], M.r[2], SwizzleZZYY.v); + V5 = XMVectorPermute(M.r[3], M.r[3], SwizzleWWWZ.v); + + P0 = XMVectorMultiply(V0, V1); + P1 = XMVectorMultiply(V2, V3); + P2 = XMVectorMultiply(V4, V5); + + V0 = XMVectorPermute(M.r[2], M.r[2], SwizzleZZYY.v); + V1 = XMVectorPermute(M.r[3], M.r[3], SwizzleYXXX.v); + V2 = XMVectorPermute(M.r[2], M.r[2], SwizzleWWWZ.v); + V3 = XMVectorPermute(M.r[3], M.r[3], SwizzleYXXX.v); + V4 = XMVectorPermute(M.r[2], M.r[2], SwizzleWWWZ.v); + V5 = XMVectorPermute(M.r[3], M.r[3], SwizzleZZYY.v); + + P0 = XMVectorNegativeMultiplySubtract(V0, V1, P0); + P1 = XMVectorNegativeMultiplySubtract(V2, V3, P1); + P2 = XMVectorNegativeMultiplySubtract(V4, V5, P2); + + V0 = XMVectorPermute(M.r[1], M.r[1], SwizzleWWWZ.v); + V1 = XMVectorPermute(M.r[1], M.r[1], SwizzleZZYY.v); + V2 = XMVectorPermute(M.r[1], M.r[1], SwizzleYXXX.v); + + S = XMVectorMultiply(M.r[0], Sign); + R = XMVectorMultiply(V0, P0); + R = XMVectorNegativeMultiplySubtract(V1, P1, R); + R = XMVectorMultiplyAdd(V2, P2, R); + + Result = XMVector4Dot(S, R); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR V0, V1, V2, V3, V4, V5; + XMVECTOR P0, P1, P2, R, S; + XMVECTOR Result; + static CONST XMVECTORU32 SwizzleYXXX = {XM_PERMUTE_0Y, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X}; + static CONST XMVECTORU32 SwizzleZZYY = {XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_0Y}; + static CONST XMVECTORU32 SwizzleWWWZ = {XM_PERMUTE_0W, XM_PERMUTE_0W, XM_PERMUTE_0W, XM_PERMUTE_0Z}; + static CONST XMVECTORF32 Sign = {1.0f, -1.0f, 1.0f, -1.0f}; + + V0 = XMVectorPermute(M.r[2], M.r[2], SwizzleYXXX); + V1 = XMVectorPermute(M.r[3], M.r[3], SwizzleZZYY); + V2 = XMVectorPermute(M.r[2], M.r[2], SwizzleYXXX); + V3 = XMVectorPermute(M.r[3], M.r[3], SwizzleWWWZ); + V4 = XMVectorPermute(M.r[2], M.r[2], SwizzleZZYY); + 
V5 = XMVectorPermute(M.r[3], M.r[3], SwizzleWWWZ); + + P0 = _mm_mul_ps(V0, V1); + P1 = _mm_mul_ps(V2, V3); + P2 = _mm_mul_ps(V4, V5); + + V0 = XMVectorPermute(M.r[2], M.r[2], SwizzleZZYY); + V1 = XMVectorPermute(M.r[3], M.r[3], SwizzleYXXX); + V2 = XMVectorPermute(M.r[2], M.r[2], SwizzleWWWZ); + V3 = XMVectorPermute(M.r[3], M.r[3], SwizzleYXXX); + V4 = XMVectorPermute(M.r[2], M.r[2], SwizzleWWWZ); + V5 = XMVectorPermute(M.r[3], M.r[3], SwizzleZZYY); + + P0 = XMVectorNegativeMultiplySubtract(V0, V1, P0); + P1 = XMVectorNegativeMultiplySubtract(V2, V3, P1); + P2 = XMVectorNegativeMultiplySubtract(V4, V5, P2); + + V0 = XMVectorPermute(M.r[1], M.r[1], SwizzleWWWZ); + V1 = XMVectorPermute(M.r[1], M.r[1], SwizzleZZYY); + V2 = XMVectorPermute(M.r[1], M.r[1], SwizzleYXXX); + + S = _mm_mul_ps(M.r[0], Sign); + R = _mm_mul_ps(V0, P0); + R = XMVectorNegativeMultiplySubtract(V1, P1, R); + R = XMVectorMultiplyAdd(V2, P2, R); + + Result = XMVector4Dot(S, R); + + return Result; + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +#undef XMRANKDECOMPOSE +#undef XM_DECOMP_EPSILON + +#define XMRANKDECOMPOSE(a, b, c, x, y, z) \ + if((x) < (y)) \ + { \ + if((y) < (z)) \ + { \ + (a) = 2; \ + (b) = 1; \ + (c) = 0; \ + } \ + else \ + { \ + (a) = 1; \ + \ + if((x) < (z)) \ + { \ + (b) = 2; \ + (c) = 0; \ + } \ + else \ + { \ + (b) = 0; \ + (c) = 2; \ + } \ + } \ + } \ + else \ + { \ + if((x) < (z)) \ + { \ + (a) = 2; \ + (b) = 0; \ + (c) = 1; \ + } \ + else \ + { \ + (a) = 0; \ + \ + if((y) < (z)) \ + { \ + (b) = 2; \ + (c) = 1; \ + } \ + else \ + { \ + (b) = 1; \ + (c) = 2; \ + } \ + } \ + } + +#define XM_DECOMP_EPSILON 0.0001f + +XMINLINE BOOL XMMatrixDecompose +( + XMVECTOR *outScale, + XMVECTOR *outRotQuat, + XMVECTOR *outTrans, + CXMMATRIX M +) +{ + FLOAT fDet; + FLOAT *pfScales; + XMVECTOR *ppvBasis[3]; + XMMATRIX matTemp; + UINT a, b, c; + static const XMVECTOR *pvCanonicalBasis[3] = { + &g_XMIdentityR0.v, + &g_XMIdentityR1.v, + &g_XMIdentityR2.v + }; + + XMASSERT( outScale != NULL ); + XMASSERT( outRotQuat != NULL ); + XMASSERT( outTrans != NULL ); + + // Get the translation + outTrans[0] = M.r[3]; + + ppvBasis[0] = &matTemp.r[0]; + ppvBasis[1] = &matTemp.r[1]; + ppvBasis[2] = &matTemp.r[2]; + + matTemp.r[0] = M.r[0]; + matTemp.r[1] = M.r[1]; + matTemp.r[2] = M.r[2]; + matTemp.r[3] = g_XMIdentityR3.v; + + pfScales = (FLOAT *)outScale; + + XMVectorGetXPtr(&pfScales[0],XMVector3Length(ppvBasis[0][0])); + XMVectorGetXPtr(&pfScales[1],XMVector3Length(ppvBasis[1][0])); + XMVectorGetXPtr(&pfScales[2],XMVector3Length(ppvBasis[2][0])); + pfScales[3] = 0.f; + + XMRANKDECOMPOSE(a, b, c, pfScales[0], pfScales[1], pfScales[2]) + + if(pfScales[a] < XM_DECOMP_EPSILON) + { + ppvBasis[a][0] = pvCanonicalBasis[a][0]; + } + ppvBasis[a][0] = XMVector3Normalize(ppvBasis[a][0]); + + if(pfScales[b] < XM_DECOMP_EPSILON) + { + UINT aa, bb, cc; + FLOAT fAbsX, fAbsY, fAbsZ; + + fAbsX = fabsf(XMVectorGetX(ppvBasis[a][0])); + fAbsY = fabsf(XMVectorGetY(ppvBasis[a][0])); + fAbsZ = fabsf(XMVectorGetZ(ppvBasis[a][0])); + + XMRANKDECOMPOSE(aa, bb, cc, fAbsX, fAbsY, fAbsZ) + + ppvBasis[b][0] = XMVector3Cross(ppvBasis[a][0],pvCanonicalBasis[cc][0]); + } + + ppvBasis[b][0] = XMVector3Normalize(ppvBasis[b][0]); + + if(pfScales[c] < XM_DECOMP_EPSILON) + { + ppvBasis[c][0] = XMVector3Cross(ppvBasis[a][0],ppvBasis[b][0]); + } + + ppvBasis[c][0] = XMVector3Normalize(ppvBasis[c][0]); + + fDet = XMVectorGetX(XMMatrixDeterminant(matTemp)); + + // use Kramer's rule to check for handedness of coordinate system + if(fDet < 0.0f) 
+ { + // switch coordinate system by negating the scale and inverting the basis vector on the x-axis + pfScales[a] = -pfScales[a]; + ppvBasis[a][0] = XMVectorNegate(ppvBasis[a][0]); + + fDet = -fDet; + } + + fDet -= 1.0f; + fDet *= fDet; + + if(XM_DECOMP_EPSILON < fDet) + { + // Non-SRT matrix encountered + return FALSE; + } + + // generate the quaternion from the matrix + outRotQuat[0] = XMQuaternionRotationMatrix(matTemp); + return TRUE; +} + +#undef XMRANKDECOMPOSE +#undef XM_DECOMP_EPSILON + +//------------------------------------------------------------------------------ +// Transformation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMMatrixIdentity() +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0] = g_XMIdentityR0.v; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = g_XMIdentityR3.v; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = g_XMIdentityR0; + M.r[1] = g_XMIdentityR1; + M.r[2] = g_XMIdentityR2; + M.r[3] = g_XMIdentityR3; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMMatrixSet +( + FLOAT m00, FLOAT m01, FLOAT m02, FLOAT m03, + FLOAT m10, FLOAT m11, FLOAT m12, FLOAT m13, + FLOAT m20, FLOAT m21, FLOAT m22, FLOAT m23, + FLOAT m30, FLOAT m31, FLOAT m32, FLOAT m33 +) +{ + XMMATRIX M; + + M.r[0] = XMVectorSet(m00, m01, m02, m03); + M.r[1] = XMVectorSet(m10, m11, m12, m13); + M.r[2] = XMVectorSet(m20, m21, m22, m23); + M.r[3] = XMVectorSet(m30, m31, m32, m33); + + return M; +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMMatrixTranslation +( + FLOAT OffsetX, + FLOAT OffsetY, + FLOAT OffsetZ +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + + M.m[0][0] = 1.0f; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 1.0f; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 1.0f; + M.m[2][3] = 0.0f; + + M.m[3][0] = OffsetX; + M.m[3][1] = OffsetY; + M.m[3][2] = OffsetZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = g_XMIdentityR0; + M.r[1] = g_XMIdentityR1; + M.r[2] = g_XMIdentityR2; + M.r[3] = _mm_set_ps(1.0f,OffsetZ,OffsetY,OffsetX); + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMMatrixTranslationFromVector +( + FXMVECTOR Offset +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.m[0][0] = 1.0f; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 1.0f; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 1.0f; + M.m[2][3] = 0.0f; + + M.m[3][0] = Offset.vector4_f32[0]; + M.m[3][1] = Offset.vector4_f32[1]; + M.m[3][2] = Offset.vector4_f32[2]; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_and_ps(Offset,g_XMMask3); + vTemp = _mm_or_ps(vTemp,g_XMIdentityR3); + XMMATRIX M; + M.r[0] = g_XMIdentityR0; + M.r[1] = g_XMIdentityR1; + M.r[2] = g_XMIdentityR2; + M.r[3] = vTemp; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + 
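+//------------------------------------------------------------------------------
+// Editor note: hedged usage sketch, not part of the upstream XNAMath source.
+// It illustrates how the translation helpers above compose with the earlier
+// matrix routines under XNAMath's row-vector convention (v' = v * M), so the
+// left operand of XMMatrixMultiply is applied first. XM_PIDIV2 is the pi/2
+// constant from xnamath.h; the variable names are illustrative only.
+//
+// XMMATRIX R = XMMatrixRotationZ(XM_PIDIV2); // rotate about Z
+// XMMATRIX T = XMMatrixTranslation(1.0f, 2.0f, 3.0f); // then offset
+// XMMATRIX World = XMMatrixMultiply(R, T); // rotation applied first, translation second
+// // Equivalent translation built from a vector offset:
+// XMMATRIX T2 = XMMatrixTranslationFromVector(XMVectorSet(1.0f, 2.0f, 3.0f, 0.0f));
+//------------------------------------------------------------------------------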
+//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMMatrixScaling +( + FLOAT ScaleX, + FLOAT ScaleY, + FLOAT ScaleZ +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + + M.r[0] = XMVectorSet(ScaleX, 0.0f, 0.0f, 0.0f); + M.r[1] = XMVectorSet(0.0f, ScaleY, 0.0f, 0.0f); + M.r[2] = XMVectorSet(0.0f, 0.0f, ScaleZ, 0.0f); + + M.r[3] = g_XMIdentityR3.v; + + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_set_ps( 0, 0, 0, ScaleX ); + M.r[1] = _mm_set_ps( 0, 0, ScaleY, 0 ); + M.r[2] = _mm_set_ps( 0, ScaleZ, 0, 0 ); + M.r[3] = g_XMIdentityR3; + return M; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMMatrixScalingFromVector +( + FXMVECTOR Scale +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMMATRIX M; + M.m[0][0] = Scale.vector4_f32[0]; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = Scale.vector4_f32[1]; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = Scale.vector4_f32[2]; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_and_ps(Scale,g_XMMaskX); + M.r[1] = _mm_and_ps(Scale,g_XMMaskY); + M.r[2] = _mm_and_ps(Scale,g_XMMaskZ); + M.r[3] = g_XMIdentityR3; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMMATRIX XMMatrixRotationX +( + FLOAT Angle +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMMATRIX M; + + FLOAT fSinAngle = sinf(Angle); + FLOAT fCosAngle = cosf(Angle); + + M.m[0][0] = 1.0f; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = fCosAngle; + M.m[1][2] = fSinAngle; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = -fSinAngle; + M.m[2][2] = fCosAngle; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + FLOAT SinAngle = sinf(Angle); + FLOAT CosAngle = cosf(Angle); + + XMVECTOR vSin = _mm_set_ss(SinAngle); + XMVECTOR vCos = _mm_set_ss(CosAngle); + // x = 0,y = cos,z = sin, w = 0 + vCos = _mm_shuffle_ps(vCos,vSin,_MM_SHUFFLE(3,0,0,3)); + XMMATRIX M; + M.r[0] = g_XMIdentityR0; + M.r[1] = vCos; + // x = 0,y = sin,z = cos, w = 0 + vCos = _mm_shuffle_ps(vCos,vCos,_MM_SHUFFLE(3,1,2,0)); + // x = 0,y = -sin,z = cos, w = 0 + vCos = _mm_mul_ps(vCos,g_XMNegateY); + M.r[2] = vCos; + M.r[3] = g_XMIdentityR3; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMMATRIX XMMatrixRotationY +( + FLOAT Angle +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMMATRIX M; + + FLOAT fSinAngle = sinf(Angle); + FLOAT fCosAngle = cosf(Angle); + + M.m[0][0] = fCosAngle; + M.m[0][1] = 0.0f; + M.m[0][2] = -fSinAngle; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 1.0f; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = fSinAngle; + M.m[2][1] = 0.0f; + M.m[2][2] = fCosAngle; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + FLOAT SinAngle = sinf(Angle); + FLOAT CosAngle = 
cosf(Angle); + + XMVECTOR vSin = _mm_set_ss(SinAngle); + XMVECTOR vCos = _mm_set_ss(CosAngle); + // x = sin,y = 0,z = cos, w = 0 + vSin = _mm_shuffle_ps(vSin,vCos,_MM_SHUFFLE(3,0,3,0)); + XMMATRIX M; + M.r[2] = vSin; + M.r[1] = g_XMIdentityR1; + // x = cos,y = 0,z = sin, w = 0 + vSin = _mm_shuffle_ps(vSin,vSin,_MM_SHUFFLE(3,0,1,2)); + // x = cos,y = 0,z = -sin, w = 0 + vSin = _mm_mul_ps(vSin,g_XMNegateZ); + M.r[0] = vSin; + M.r[3] = g_XMIdentityR3; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMMATRIX XMMatrixRotationZ +( + FLOAT Angle +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMMATRIX M; + + FLOAT fSinAngle = sinf(Angle); + FLOAT fCosAngle = cosf(Angle); + + M.m[0][0] = fCosAngle; + M.m[0][1] = fSinAngle; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = -fSinAngle; + M.m[1][1] = fCosAngle; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 1.0f; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + FLOAT SinAngle = sinf(Angle); + FLOAT CosAngle = cosf(Angle); + + XMVECTOR vSin = _mm_set_ss(SinAngle); + XMVECTOR vCos = _mm_set_ss(CosAngle); + // x = cos,y = sin,z = 0, w = 0 + vCos = _mm_unpacklo_ps(vCos,vSin); + XMMATRIX M; + M.r[0] = vCos; + // x = sin,y = cos,z = 0, w = 0 + vCos = _mm_shuffle_ps(vCos,vCos,_MM_SHUFFLE(3,2,0,1)); + // x = cos,y = -sin,z = 0, w = 0 + vCos = _mm_mul_ps(vCos,g_XMNegateX); + M.r[1] = vCos; + M.r[2] = g_XMIdentityR2; + M.r[3] = g_XMIdentityR3; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMMATRIX XMMatrixRotationRollPitchYaw +( + FLOAT Pitch, + FLOAT Yaw, + FLOAT Roll +) +{ + XMVECTOR Angles; + XMMATRIX M; + + Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f); + M = XMMatrixRotationRollPitchYawFromVector(Angles); + + return M; +} + +//------------------------------------------------------------------------------ + +XMINLINE XMMATRIX XMMatrixRotationRollPitchYawFromVector +( + FXMVECTOR Angles // +) +{ + XMVECTOR Q; + XMMATRIX M; + + Q = XMQuaternionRotationRollPitchYawFromVector(Angles); + M = XMMatrixRotationQuaternion(Q); + + return M; +} + +//------------------------------------------------------------------------------ + +XMINLINE XMMATRIX XMMatrixRotationNormal +( + FXMVECTOR NormalAxis, + FLOAT Angle +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR A; + XMVECTOR N0, N1; + XMVECTOR V0, V1, V2; + XMVECTOR R0, R1, R2; + XMVECTOR C0, C1, C2; + XMMATRIX M; + static CONST XMVECTORU32 SwizzleYZXW = {XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0X, XM_PERMUTE_0W}; + static CONST XMVECTORU32 SwizzleZXYW = {XM_PERMUTE_0Z, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0W}; + static CONST XMVECTORU32 Permute0Z1Y1Z0X = {XM_PERMUTE_0Z, XM_PERMUTE_1Y, XM_PERMUTE_1Z, XM_PERMUTE_0X}; + static CONST XMVECTORU32 Permute0Y1X0Y1X = {XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_0Y, XM_PERMUTE_1X}; + static CONST XMVECTORU32 Permute0X1X1Y0W = {XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0W}; + static CONST XMVECTORU32 Permute1Z0Y1W0W = {XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_1W, XM_PERMUTE_0W}; + static CONST XMVECTORU32 Permute1X1Y0Z0W = {XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0Z, XM_PERMUTE_0W}; + + FLOAT fSinAngle = sinf(Angle); + FLOAT fCosAngle = cosf(Angle); + + A 
= XMVectorSet(fSinAngle, fCosAngle, 1.0f - fCosAngle, 0.0f); + + C2 = XMVectorSplatZ(A); + C1 = XMVectorSplatY(A); + C0 = XMVectorSplatX(A); + + N0 = XMVectorPermute(NormalAxis, NormalAxis, SwizzleYZXW.v); + N1 = XMVectorPermute(NormalAxis, NormalAxis, SwizzleZXYW.v); + + V0 = XMVectorMultiply(C2, N0); + V0 = XMVectorMultiply(V0, N1); + + R0 = XMVectorMultiply(C2, NormalAxis); + R0 = XMVectorMultiplyAdd(R0, NormalAxis, C1); + + R1 = XMVectorMultiplyAdd(C0, NormalAxis, V0); + R2 = XMVectorNegativeMultiplySubtract(C0, NormalAxis, V0); + + V0 = XMVectorSelect(A, R0, g_XMSelect1110.v); + V1 = XMVectorPermute(R1, R2, Permute0Z1Y1Z0X.v); + V2 = XMVectorPermute(R1, R2, Permute0Y1X0Y1X.v); + + M.r[0] = XMVectorPermute(V0, V1, Permute0X1X1Y0W.v); + M.r[1] = XMVectorPermute(V0, V1, Permute1Z0Y1W0W.v); + M.r[2] = XMVectorPermute(V0, V2, Permute1X1Y0Z0W.v); + M.r[3] = g_XMIdentityR3.v; + + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR N0, N1; + XMVECTOR V0, V1, V2; + XMVECTOR R0, R1, R2; + XMVECTOR C0, C1, C2; + XMMATRIX M; + + FLOAT fSinAngle = sinf(Angle); + FLOAT fCosAngle = cosf(Angle); + + C2 = _mm_set_ps1(1.0f - fCosAngle); + C1 = _mm_set_ps1(fCosAngle); + C0 = _mm_set_ps1(fSinAngle); + + N0 = _mm_shuffle_ps(NormalAxis,NormalAxis,_MM_SHUFFLE(3,0,2,1)); +// N0 = XMVectorPermute(NormalAxis, NormalAxis, SwizzleYZXW); + N1 = _mm_shuffle_ps(NormalAxis,NormalAxis,_MM_SHUFFLE(3,1,0,2)); +// N1 = XMVectorPermute(NormalAxis, NormalAxis, SwizzleZXYW); + + V0 = _mm_mul_ps(C2, N0); + V0 = _mm_mul_ps(V0, N1); + + R0 = _mm_mul_ps(C2, NormalAxis); + R0 = _mm_mul_ps(R0, NormalAxis); + R0 = _mm_add_ps(R0, C1); + + R1 = _mm_mul_ps(C0, NormalAxis); + R1 = _mm_add_ps(R1, V0); + R2 = _mm_mul_ps(C0, NormalAxis); + R2 = _mm_sub_ps(V0,R2); + + V0 = _mm_and_ps(R0,g_XMMask3); +// V0 = XMVectorSelect(A, R0, g_XMSelect1110); + V1 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(2,1,2,0)); + V1 = _mm_shuffle_ps(V1,V1,_MM_SHUFFLE(0,3,2,1)); +// V1 = XMVectorPermute(R1, R2, Permute0Z1Y1Z0X); + V2 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(0,0,1,1)); + V2 = _mm_shuffle_ps(V2,V2,_MM_SHUFFLE(2,0,2,0)); +// V2 = XMVectorPermute(R1, R2, Permute0Y1X0Y1X); + + R2 = _mm_shuffle_ps(V0,V1,_MM_SHUFFLE(1,0,3,0)); + R2 = _mm_shuffle_ps(R2,R2,_MM_SHUFFLE(1,3,2,0)); + M.r[0] = R2; +// M.r[0] = XMVectorPermute(V0, V1, Permute0X1X1Y0W); + R2 = _mm_shuffle_ps(V0,V1,_MM_SHUFFLE(3,2,3,1)); + R2 = _mm_shuffle_ps(R2,R2,_MM_SHUFFLE(1,3,0,2)); + M.r[1] = R2; +// M.r[1] = XMVectorPermute(V0, V1, Permute1Z0Y1W0W); + V2 = _mm_shuffle_ps(V2,V0,_MM_SHUFFLE(3,2,1,0)); +// R2 = _mm_shuffle_ps(R2,R2,_MM_SHUFFLE(3,2,1,0)); + M.r[2] = V2; +// M.r[2] = XMVectorPermute(V0, V2, Permute1X1Y0Z0W); + M.r[3] = g_XMIdentityR3; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMMATRIX XMMatrixRotationAxis +( + FXMVECTOR Axis, + FLOAT Angle +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Normal; + XMMATRIX M; + + XMASSERT(!XMVector3Equal(Axis, XMVectorZero())); + XMASSERT(!XMVector3IsInfinite(Axis)); + + Normal = XMVector3Normalize(Axis); + M = XMMatrixRotationNormal(Normal, Angle); + + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(!XMVector3Equal(Axis, XMVectorZero())); + XMASSERT(!XMVector3IsInfinite(Axis)); + XMVECTOR Normal = XMVector3Normalize(Axis); + XMMATRIX M = XMMatrixRotationNormal(Normal, Angle); + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + 
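+//------------------------------------------------------------------------------
+// Editor note: hedged usage sketch, not part of the upstream XNAMath source.
+// As the code above shows, XMMatrixRotationAxis normalizes the axis and then
+// defers to XMMatrixRotationNormal, so a caller that already has a unit-length
+// axis can use XMMatrixRotationNormal directly and skip the extra normalize.
+// XM_PIDIV4 is the pi/4 constant from xnamath.h; names are illustrative only.
+//
+// XMVECTOR axis = XMVectorSet(0.0f, 1.0f, 0.0f, 0.0f); // Y axis, already unit length
+// XMMATRIX R = XMMatrixRotationNormal(axis, XM_PIDIV4); // same result as XMMatrixRotationAxis here
+// XMVECTOR p = XMVector3Transform(XMVectorSet(1.0f, 0.0f, 0.0f, 1.0f), R);
+//------------------------------------------------------------------------------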
+//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMMatrixRotationQuaternion +( + FXMVECTOR Quaternion +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + XMVECTOR Q0, Q1; + XMVECTOR V0, V1, V2; + XMVECTOR R0, R1, R2; + static CONST XMVECTOR Constant1110 = {1.0f, 1.0f, 1.0f, 0.0f}; + static CONST XMVECTORU32 SwizzleXXYW = {XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0W}; + static CONST XMVECTORU32 SwizzleZYZW = {XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0W}; + static CONST XMVECTORU32 SwizzleYZXW = {XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0X, XM_PERMUTE_0W}; + static CONST XMVECTORU32 Permute0Y0X0X1W = {XM_PERMUTE_0Y, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_1W}; + static CONST XMVECTORU32 Permute0Z0Z0Y1W = {XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_1W}; + static CONST XMVECTORU32 Permute0Y1X1Y0Z = {XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0Z}; + static CONST XMVECTORU32 Permute0X1Z0X1Z = {XM_PERMUTE_0X, XM_PERMUTE_1Z, XM_PERMUTE_0X, XM_PERMUTE_1Z}; + static CONST XMVECTORU32 Permute0X1X1Y0W = {XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0W}; + static CONST XMVECTORU32 Permute1Z0Y1W0W = {XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_1W, XM_PERMUTE_0W}; + static CONST XMVECTORU32 Permute1X1Y0Z0W = {XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0Z, XM_PERMUTE_0W}; + + Q0 = XMVectorAdd(Quaternion, Quaternion); + Q1 = XMVectorMultiply(Quaternion, Q0); + + V0 = XMVectorPermute(Q1, Constant1110, Permute0Y0X0X1W.v); + V1 = XMVectorPermute(Q1, Constant1110, Permute0Z0Z0Y1W.v); + R0 = XMVectorSubtract(Constant1110, V0); + R0 = XMVectorSubtract(R0, V1); + + V0 = XMVectorPermute(Quaternion, Quaternion, SwizzleXXYW.v); + V1 = XMVectorPermute(Q0, Q0, SwizzleZYZW.v); + V0 = XMVectorMultiply(V0, V1); + + V1 = XMVectorSplatW(Quaternion); + V2 = XMVectorPermute(Q0, Q0, SwizzleYZXW.v); + V1 = XMVectorMultiply(V1, V2); + + R1 = XMVectorAdd(V0, V1); + R2 = XMVectorSubtract(V0, V1); + + V0 = XMVectorPermute(R1, R2, Permute0Y1X1Y0Z.v); + V1 = XMVectorPermute(R1, R2, Permute0X1Z0X1Z.v); + + M.r[0] = XMVectorPermute(R0, V0, Permute0X1X1Y0W.v); + M.r[1] = XMVectorPermute(R0, V0, Permute1Z0Y1W0W.v); + M.r[2] = XMVectorPermute(R0, V1, Permute1X1Y0Z0W.v); + M.r[3] = g_XMIdentityR3.v; + + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + XMVECTOR Q0, Q1; + XMVECTOR V0, V1, V2; + XMVECTOR R0, R1, R2; + static CONST XMVECTORF32 Constant1110 = {1.0f, 1.0f, 1.0f, 0.0f}; + + Q0 = _mm_add_ps(Quaternion,Quaternion); + Q1 = _mm_mul_ps(Quaternion,Q0); + + V0 = _mm_shuffle_ps(Q1,Q1,_MM_SHUFFLE(3,0,0,1)); + V0 = _mm_and_ps(V0,g_XMMask3); +// V0 = XMVectorPermute(Q1, Constant1110,Permute0Y0X0X1W); + V1 = _mm_shuffle_ps(Q1,Q1,_MM_SHUFFLE(3,1,2,2)); + V1 = _mm_and_ps(V1,g_XMMask3); +// V1 = XMVectorPermute(Q1, Constant1110,Permute0Z0Z0Y1W); + R0 = _mm_sub_ps(Constant1110,V0); + R0 = _mm_sub_ps(R0, V1); + + V0 = _mm_shuffle_ps(Quaternion,Quaternion,_MM_SHUFFLE(3,1,0,0)); +// V0 = XMVectorPermute(Quaternion, Quaternion,SwizzleXXYW); + V1 = _mm_shuffle_ps(Q0,Q0,_MM_SHUFFLE(3,2,1,2)); +// V1 = XMVectorPermute(Q0, Q0,SwizzleZYZW); + V0 = _mm_mul_ps(V0, V1); + + V1 = _mm_shuffle_ps(Quaternion,Quaternion,_MM_SHUFFLE(3,3,3,3)); +// V1 = XMVectorSplatW(Quaternion); + V2 = _mm_shuffle_ps(Q0,Q0,_MM_SHUFFLE(3,0,2,1)); +// V2 = XMVectorPermute(Q0, Q0,SwizzleYZXW); + V1 = _mm_mul_ps(V1, V2); + + R1 = _mm_add_ps(V0, V1); + R2 = _mm_sub_ps(V0, V1); + + V0 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(1,0,2,1)); + V0 = 
_mm_shuffle_ps(V0,V0,_MM_SHUFFLE(1,3,2,0)); +// V0 = XMVectorPermute(R1, R2,Permute0Y1X1Y0Z); + V1 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(2,2,0,0)); + V1 = _mm_shuffle_ps(V1,V1,_MM_SHUFFLE(2,0,2,0)); +// V1 = XMVectorPermute(R1, R2,Permute0X1Z0X1Z); + + Q1 = _mm_shuffle_ps(R0,V0,_MM_SHUFFLE(1,0,3,0)); + Q1 = _mm_shuffle_ps(Q1,Q1,_MM_SHUFFLE(1,3,2,0)); + M.r[0] = Q1; +// M.r[0] = XMVectorPermute(R0, V0,Permute0X1X1Y0W); + Q1 = _mm_shuffle_ps(R0,V0,_MM_SHUFFLE(3,2,3,1)); + Q1 = _mm_shuffle_ps(Q1,Q1,_MM_SHUFFLE(1,3,0,2)); + M.r[1] = Q1; +// M.r[1] = XMVectorPermute(R0, V0,Permute1Z0Y1W0W); + Q1 = _mm_shuffle_ps(V1,R0,_MM_SHUFFLE(3,2,1,0)); + M.r[2] = Q1; +// M.r[2] = XMVectorPermute(R0, V1,Permute1X1Y0Z0W); + M.r[3] = g_XMIdentityR3; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMMATRIX XMMatrixTransformation2D +( + FXMVECTOR ScalingOrigin, + FLOAT ScalingOrientation, + FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, + FLOAT Rotation, + CXMVECTOR Translation +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + XMVECTOR VScaling; + XMVECTOR NegScalingOrigin; + XMVECTOR VScalingOrigin; + XMMATRIX MScalingOriginI; + XMMATRIX MScalingOrientation; + XMMATRIX MScalingOrientationT; + XMMATRIX MScaling; + XMVECTOR VRotationOrigin; + XMMATRIX MRotation; + XMVECTOR VTranslation; + + // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation * + // MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + VScalingOrigin = XMVectorSelect(g_XMSelect1100.v, ScalingOrigin, g_XMSelect1100.v); + NegScalingOrigin = XMVectorNegate(VScalingOrigin); + + MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin); + MScalingOrientation = XMMatrixRotationZ(ScalingOrientation); + MScalingOrientationT = XMMatrixTranspose(MScalingOrientation); + VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v); + MScaling = XMMatrixScalingFromVector(VScaling); + VRotationOrigin = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v); + MRotation = XMMatrixRotationZ(Rotation); + VTranslation = XMVectorSelect(g_XMSelect1100.v, Translation,g_XMSelect1100.v); + + M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT); + M = XMMatrixMultiply(M, MScaling); + M = XMMatrixMultiply(M, MScalingOrientation); + M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin); + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + XMVECTOR VScaling; + XMVECTOR NegScalingOrigin; + XMVECTOR VScalingOrigin; + XMMATRIX MScalingOriginI; + XMMATRIX MScalingOrientation; + XMMATRIX MScalingOrientationT; + XMMATRIX MScaling; + XMVECTOR VRotationOrigin; + XMMATRIX MRotation; + XMVECTOR VTranslation; + + // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation * + // MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + static const XMVECTORU32 Mask2 = {0xFFFFFFFF,0xFFFFFFFF,0,0}; + static const XMVECTORF32 ZWOne = {0,0,1.0f,1.0f}; + + VScalingOrigin = _mm_and_ps(ScalingOrigin, Mask2); + NegScalingOrigin = XMVectorNegate(VScalingOrigin); + + MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin); + MScalingOrientation = XMMatrixRotationZ(ScalingOrientation); 
+ MScalingOrientationT = XMMatrixTranspose(MScalingOrientation); + VScaling = _mm_and_ps(Scaling, Mask2); + VScaling = _mm_or_ps(VScaling,ZWOne); + MScaling = XMMatrixScalingFromVector(VScaling); + VRotationOrigin = _mm_and_ps(RotationOrigin, Mask2); + MRotation = XMMatrixRotationZ(Rotation); + VTranslation = _mm_and_ps(Translation, Mask2); + + M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT); + M = XMMatrixMultiply(M, MScaling); + M = XMMatrixMultiply(M, MScalingOrientation); + M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin); + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMMATRIX XMMatrixTransformation +( + FXMVECTOR ScalingOrigin, + FXMVECTOR ScalingOrientationQuaternion, + FXMVECTOR Scaling, + CXMVECTOR RotationOrigin, + CXMVECTOR RotationQuaternion, + CXMVECTOR Translation +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + XMVECTOR NegScalingOrigin; + XMVECTOR VScalingOrigin; + XMMATRIX MScalingOriginI; + XMMATRIX MScalingOrientation; + XMMATRIX MScalingOrientationT; + XMMATRIX MScaling; + XMVECTOR VRotationOrigin; + XMMATRIX MRotation; + XMVECTOR VTranslation; + + // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation * + // MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + VScalingOrigin = XMVectorSelect(g_XMSelect1110.v, ScalingOrigin, g_XMSelect1110.v); + NegScalingOrigin = XMVectorNegate(ScalingOrigin); + + MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin); + MScalingOrientation = XMMatrixRotationQuaternion(ScalingOrientationQuaternion); + MScalingOrientationT = XMMatrixTranspose(MScalingOrientation); + MScaling = XMMatrixScalingFromVector(Scaling); + VRotationOrigin = XMVectorSelect(g_XMSelect1110.v, RotationOrigin, g_XMSelect1110.v); + MRotation = XMMatrixRotationQuaternion(RotationQuaternion); + VTranslation = XMVectorSelect(g_XMSelect1110.v, Translation, g_XMSelect1110.v); + + M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT); + M = XMMatrixMultiply(M, MScaling); + M = XMMatrixMultiply(M, MScalingOrientation); + M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin); + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + XMVECTOR NegScalingOrigin; + XMVECTOR VScalingOrigin; + XMMATRIX MScalingOriginI; + XMMATRIX MScalingOrientation; + XMMATRIX MScalingOrientationT; + XMMATRIX MScaling; + XMVECTOR VRotationOrigin; + XMMATRIX MRotation; + XMVECTOR VTranslation; + + // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation * + // MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + VScalingOrigin = _mm_and_ps(ScalingOrigin,g_XMMask3); + NegScalingOrigin = XMVectorNegate(ScalingOrigin); + + MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin); + MScalingOrientation = XMMatrixRotationQuaternion(ScalingOrientationQuaternion); + MScalingOrientationT = XMMatrixTranspose(MScalingOrientation); + MScaling = XMMatrixScalingFromVector(Scaling); + VRotationOrigin = 
_mm_and_ps(RotationOrigin,g_XMMask3); + MRotation = XMMatrixRotationQuaternion(RotationQuaternion); + VTranslation = _mm_and_ps(Translation,g_XMMask3); + + M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT); + M = XMMatrixMultiply(M, MScaling); + M = XMMatrixMultiply(M, MScalingOrientation); + M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin); + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMMATRIX XMMatrixAffineTransformation2D +( + FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, + FLOAT Rotation, + FXMVECTOR Translation +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + XMVECTOR VScaling; + XMMATRIX MScaling; + XMVECTOR VRotationOrigin; + XMMATRIX MRotation; + XMVECTOR VTranslation; + + // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v); + MScaling = XMMatrixScalingFromVector(VScaling); + VRotationOrigin = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v); + MRotation = XMMatrixRotationZ(Rotation); + VTranslation = XMVectorSelect(g_XMSelect1100.v, Translation,g_XMSelect1100.v); + + M = MScaling; + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + XMVECTOR VScaling; + XMMATRIX MScaling; + XMVECTOR VRotationOrigin; + XMMATRIX MRotation; + XMVECTOR VTranslation; + static const XMVECTORU32 Mask2 = {0xFFFFFFFFU,0xFFFFFFFFU,0,0}; + static const XMVECTORF32 ZW1 = {0,0,1.0f,1.0f}; + + // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + VScaling = _mm_and_ps(Scaling, Mask2); + VScaling = _mm_or_ps(VScaling, ZW1); + MScaling = XMMatrixScalingFromVector(VScaling); + VRotationOrigin = _mm_and_ps(RotationOrigin, Mask2); + MRotation = XMMatrixRotationZ(Rotation); + VTranslation = _mm_and_ps(Translation, Mask2); + + M = MScaling; + M.r[3] = _mm_sub_ps(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = _mm_add_ps(M.r[3], VRotationOrigin); + M.r[3] = _mm_add_ps(M.r[3], VTranslation); + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMMATRIX XMMatrixAffineTransformation +( + FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, + FXMVECTOR RotationQuaternion, + CXMVECTOR Translation +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + XMMATRIX MScaling; + XMVECTOR VRotationOrigin; + XMMATRIX MRotation; + XMVECTOR VTranslation; + + // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + MScaling = XMMatrixScalingFromVector(Scaling); + VRotationOrigin = XMVectorSelect(g_XMSelect1110.v, RotationOrigin,g_XMSelect1110.v); + MRotation = XMMatrixRotationQuaternion(RotationQuaternion); + VTranslation = XMVectorSelect(g_XMSelect1110.v, Translation,g_XMSelect1110.v); + + M = MScaling; + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = 
XMVectorAdd(M.r[3], VTranslation); + + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + XMMATRIX MScaling; + XMVECTOR VRotationOrigin; + XMMATRIX MRotation; + XMVECTOR VTranslation; + + // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + MScaling = XMMatrixScalingFromVector(Scaling); + VRotationOrigin = _mm_and_ps(RotationOrigin,g_XMMask3); + MRotation = XMMatrixRotationQuaternion(RotationQuaternion); + VTranslation = _mm_and_ps(Translation,g_XMMask3); + + M = MScaling; + M.r[3] = _mm_sub_ps(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = _mm_add_ps(M.r[3], VRotationOrigin); + M.r[3] = _mm_add_ps(M.r[3], VTranslation); + + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMMatrixReflect +( + FXMVECTOR ReflectionPlane +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR P; + XMVECTOR S; + XMVECTOR A, B, C, D; + XMMATRIX M; + static CONST XMVECTOR NegativeTwo = {-2.0f, -2.0f, -2.0f, 0.0f}; + + XMASSERT(!XMVector3Equal(ReflectionPlane, XMVectorZero())); + XMASSERT(!XMPlaneIsInfinite(ReflectionPlane)); + + P = XMPlaneNormalize(ReflectionPlane); + S = XMVectorMultiply(P, NegativeTwo); + + A = XMVectorSplatX(P); + B = XMVectorSplatY(P); + C = XMVectorSplatZ(P); + D = XMVectorSplatW(P); + + M.r[0] = XMVectorMultiplyAdd(A, S, g_XMIdentityR0.v); + M.r[1] = XMVectorMultiplyAdd(B, S, g_XMIdentityR1.v); + M.r[2] = XMVectorMultiplyAdd(C, S, g_XMIdentityR2.v); + M.r[3] = XMVectorMultiplyAdd(D, S, g_XMIdentityR3.v); + + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + static CONST XMVECTORF32 NegativeTwo = {-2.0f, -2.0f, -2.0f, 0.0f}; + + XMASSERT(!XMVector3Equal(ReflectionPlane, XMVectorZero())); + XMASSERT(!XMPlaneIsInfinite(ReflectionPlane)); + + XMVECTOR P = XMPlaneNormalize(ReflectionPlane); + XMVECTOR S = _mm_mul_ps(P,NegativeTwo); + XMVECTOR X = _mm_shuffle_ps(P,P,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR Y = _mm_shuffle_ps(P,P,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR Z = _mm_shuffle_ps(P,P,_MM_SHUFFLE(2,2,2,2)); + P = _mm_shuffle_ps(P,P,_MM_SHUFFLE(3,3,3,3)); + X = _mm_mul_ps(X,S); + Y = _mm_mul_ps(Y,S); + Z = _mm_mul_ps(Z,S); + P = _mm_mul_ps(P,S); + X = _mm_add_ps(X,g_XMIdentityR0); + Y = _mm_add_ps(Y,g_XMIdentityR1); + Z = _mm_add_ps(Z,g_XMIdentityR2); + P = _mm_add_ps(P,g_XMIdentityR3); + M.r[0] = X; + M.r[1] = Y; + M.r[2] = Z; + M.r[3] = P; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMMatrixShadow +( + FXMVECTOR ShadowPlane, + FXMVECTOR LightPosition +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR P; + XMVECTOR Dot; + XMVECTOR A, B, C, D; + XMMATRIX M; + static CONST XMVECTORU32 Select0001 = {XM_SELECT_0, XM_SELECT_0, XM_SELECT_0, XM_SELECT_1}; + + XMASSERT(!XMVector3Equal(ShadowPlane, XMVectorZero())); + XMASSERT(!XMPlaneIsInfinite(ShadowPlane)); + + P = XMPlaneNormalize(ShadowPlane); + Dot = XMPlaneDot(P, LightPosition); + P = XMVectorNegate(P); + D = XMVectorSplatW(P); + C = XMVectorSplatZ(P); + B = XMVectorSplatY(P); + A = XMVectorSplatX(P); + Dot = XMVectorSelect(Select0001.v, Dot, Select0001.v); + M.r[3] = XMVectorMultiplyAdd(D, LightPosition, Dot); + Dot = XMVectorRotateLeft(Dot, 1); + M.r[2] = XMVectorMultiplyAdd(C, LightPosition, Dot); + Dot = XMVectorRotateLeft(Dot, 1); + M.r[1] = XMVectorMultiplyAdd(B, LightPosition, 
Dot); + Dot = XMVectorRotateLeft(Dot, 1); + M.r[0] = XMVectorMultiplyAdd(A, LightPosition, Dot); + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + XMASSERT(!XMVector3Equal(ShadowPlane, XMVectorZero())); + XMASSERT(!XMPlaneIsInfinite(ShadowPlane)); + XMVECTOR P = XMPlaneNormalize(ShadowPlane); + XMVECTOR Dot = XMPlaneDot(P,LightPosition); + // Negate + P = _mm_mul_ps(P,g_XMNegativeOne); + XMVECTOR X = _mm_shuffle_ps(P,P,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR Y = _mm_shuffle_ps(P,P,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR Z = _mm_shuffle_ps(P,P,_MM_SHUFFLE(2,2,2,2)); + P = _mm_shuffle_ps(P,P,_MM_SHUFFLE(3,3,3,3)); + Dot = _mm_and_ps(Dot,g_XMMaskW); + X = _mm_mul_ps(X,LightPosition); + Y = _mm_mul_ps(Y,LightPosition); + Z = _mm_mul_ps(Z,LightPosition); + P = _mm_mul_ps(P,LightPosition); + P = _mm_add_ps(P,Dot); + Dot = _mm_shuffle_ps(Dot,Dot,_MM_SHUFFLE(0,3,2,1)); + Z = _mm_add_ps(Z,Dot); + Dot = _mm_shuffle_ps(Dot,Dot,_MM_SHUFFLE(0,3,2,1)); + Y = _mm_add_ps(Y,Dot); + Dot = _mm_shuffle_ps(Dot,Dot,_MM_SHUFFLE(0,3,2,1)); + X = _mm_add_ps(X,Dot); + // Store the resulting matrix + M.r[0] = X; + M.r[1] = Y; + M.r[2] = Z; + M.r[3] = P; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// View and projection initialization operations +//------------------------------------------------------------------------------ + + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMMatrixLookAtLH +( + FXMVECTOR EyePosition, + FXMVECTOR FocusPosition, + FXMVECTOR UpDirection +) +{ + XMVECTOR EyeDirection; + XMMATRIX M; + + EyeDirection = XMVectorSubtract(FocusPosition, EyePosition); + M = XMMatrixLookToLH(EyePosition, EyeDirection, UpDirection); + + return M; +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMMatrixLookAtRH +( + FXMVECTOR EyePosition, + FXMVECTOR FocusPosition, + FXMVECTOR UpDirection +) +{ + XMVECTOR NegEyeDirection; + XMMATRIX M; + + NegEyeDirection = XMVectorSubtract(EyePosition, FocusPosition); + M = XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection); + + return M; +} + +//------------------------------------------------------------------------------ + +XMINLINE XMMATRIX XMMatrixLookToLH +( + FXMVECTOR EyePosition, + FXMVECTOR EyeDirection, + FXMVECTOR UpDirection +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR NegEyePosition; + XMVECTOR D0, D1, D2; + XMVECTOR R0, R1, R2; + XMMATRIX M; + + XMASSERT(!XMVector3Equal(EyeDirection, XMVectorZero())); + XMASSERT(!XMVector3IsInfinite(EyeDirection)); + XMASSERT(!XMVector3Equal(UpDirection, XMVectorZero())); + XMASSERT(!XMVector3IsInfinite(UpDirection)); + + R2 = XMVector3Normalize(EyeDirection); + + R0 = XMVector3Cross(UpDirection, R2); + R0 = XMVector3Normalize(R0); + + R1 = XMVector3Cross(R2, R0); + + NegEyePosition = XMVectorNegate(EyePosition); + + D0 = XMVector3Dot(R0, NegEyePosition); + D1 = XMVector3Dot(R1, NegEyePosition); + D2 = XMVector3Dot(R2, NegEyePosition); + + M.r[0] = XMVectorSelect(D0, R0, g_XMSelect1110.v); + M.r[1] = XMVectorSelect(D1, R1, g_XMSelect1110.v); + M.r[2] = XMVectorSelect(D2, R2, g_XMSelect1110.v); + M.r[3] = g_XMIdentityR3.v; + + M = XMMatrixTranspose(M); + + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + + XMASSERT(!XMVector3Equal(EyeDirection, XMVectorZero())); + XMASSERT(!XMVector3IsInfinite(EyeDirection)); + XMASSERT(!XMVector3Equal(UpDirection, 
XMVectorZero())); + XMASSERT(!XMVector3IsInfinite(UpDirection)); + + XMVECTOR R2 = XMVector3Normalize(EyeDirection); + XMVECTOR R0 = XMVector3Cross(UpDirection, R2); + R0 = XMVector3Normalize(R0); + XMVECTOR R1 = XMVector3Cross(R2,R0); + XMVECTOR NegEyePosition = _mm_mul_ps(EyePosition,g_XMNegativeOne); + XMVECTOR D0 = XMVector3Dot(R0,NegEyePosition); + XMVECTOR D1 = XMVector3Dot(R1,NegEyePosition); + XMVECTOR D2 = XMVector3Dot(R2,NegEyePosition); + R0 = _mm_and_ps(R0,g_XMMask3); + R1 = _mm_and_ps(R1,g_XMMask3); + R2 = _mm_and_ps(R2,g_XMMask3); + D0 = _mm_and_ps(D0,g_XMMaskW); + D1 = _mm_and_ps(D1,g_XMMaskW); + D2 = _mm_and_ps(D2,g_XMMaskW); + D0 = _mm_or_ps(D0,R0); + D1 = _mm_or_ps(D1,R1); + D2 = _mm_or_ps(D2,R2); + M.r[0] = D0; + M.r[1] = D1; + M.r[2] = D2; + M.r[3] = g_XMIdentityR3; + M = XMMatrixTranspose(M); + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMMatrixLookToRH +( + FXMVECTOR EyePosition, + FXMVECTOR EyeDirection, + FXMVECTOR UpDirection +) +{ + XMVECTOR NegEyeDirection; + XMMATRIX M; + + NegEyeDirection = XMVectorNegate(EyeDirection); + M = XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection); + + return M; +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMMatrixPerspectiveLH +( + FLOAT ViewWidth, + FLOAT ViewHeight, + FLOAT NearZ, + FLOAT FarZ +) +{ +#if defined(_XM_NO_INTRINSICS_) + + FLOAT TwoNearZ, fRange; + XMMATRIX M; + + XMASSERT(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + XMASSERT(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + + TwoNearZ = NearZ + NearZ; + fRange = FarZ / (FarZ - NearZ); + M.m[0][0] = TwoNearZ / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 0.0f; + + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + XMASSERT(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + + XMMATRIX M; + FLOAT TwoNearZ = NearZ + NearZ; + FLOAT fRange = FarZ / (FarZ - NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ / ViewWidth, + TwoNearZ / ViewHeight, + fRange, + -fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // TwoNearZ / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,1.0f + vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,1.0f + vTemp = _mm_setzero_ps(); + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,0 + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); + M.r[3] = vTemp; + + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMMatrixPerspectiveRH +( + FLOAT ViewWidth, + FLOAT 
ViewHeight, + FLOAT NearZ, + FLOAT FarZ +) +{ +#if defined(_XM_NO_INTRINSICS_) + + FLOAT TwoNearZ, fRange; + XMMATRIX M; + + XMASSERT(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + XMASSERT(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + + TwoNearZ = NearZ + NearZ; + fRange = FarZ / (NearZ - FarZ); + M.m[0][0] = TwoNearZ / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = -1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 0.0f; + + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + XMASSERT(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + + XMMATRIX M; + FLOAT TwoNearZ = NearZ + NearZ; + FLOAT fRange = FarZ / (NearZ-FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ / ViewWidth, + TwoNearZ / ViewHeight, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // TwoNearZ / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,-1.0f + vValues = _mm_shuffle_ps(vValues,g_XMNegIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,-1.0f + vTemp = _mm_setzero_ps(); + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,0 + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); + M.r[3] = vTemp; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMMatrixPerspectiveFovLH +( + FLOAT FovAngleY, + FLOAT AspectHByW, + FLOAT NearZ, + FLOAT FarZ +) +{ +#if defined(_XM_NO_INTRINSICS_) + + FLOAT SinFov; + FLOAT CosFov; + FLOAT Height; + FLOAT Width; + XMMATRIX M; + + XMASSERT(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); + XMASSERT(!XMScalarNearEqual(AspectHByW, 0.0f, 0.00001f)); + XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + Height = CosFov / SinFov; + Width = Height / AspectHByW; + + M.r[0] = XMVectorSet(Width, 0.0f, 0.0f, 0.0f); + M.r[1] = XMVectorSet(0.0f, Height, 0.0f, 0.0f); + M.r[2] = XMVectorSet(0.0f, 0.0f, FarZ / (FarZ - NearZ), 1.0f); + M.r[3] = XMVectorSet(0.0f, 0.0f, -M.r[2].vector4_f32[2] * NearZ, 0.0f); + + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); + XMASSERT(!XMScalarNearEqual(AspectHByW, 0.0f, 0.00001f)); + XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + XMMATRIX M; + FLOAT SinFov; + FLOAT CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + FLOAT fRange = FarZ / (FarZ-NearZ); + // Note: This is recorded on the stack + FLOAT Height = CosFov / SinFov; + XMVECTOR rMem = { + Height / AspectHByW, + Height, + fRange, + -fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // CosFov / SinFov,0,0,0 + M.r[0] = vTemp; + // 0,Height / 
AspectHByW,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,1.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,0.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); + M.r[3] = vTemp; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMMatrixPerspectiveFovRH +( + FLOAT FovAngleY, + FLOAT AspectHByW, + FLOAT NearZ, + FLOAT FarZ +) +{ +#if defined(_XM_NO_INTRINSICS_) + + FLOAT SinFov; + FLOAT CosFov; + FLOAT Height; + FLOAT Width; + XMMATRIX M; + + XMASSERT(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); + XMASSERT(!XMScalarNearEqual(AspectHByW, 0.0f, 0.00001f)); + XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + Height = CosFov / SinFov; + Width = Height / AspectHByW; + + M.r[0] = XMVectorSet(Width, 0.0f, 0.0f, 0.0f); + M.r[1] = XMVectorSet(0.0f, Height, 0.0f, 0.0f); + M.r[2] = XMVectorSet(0.0f, 0.0f, FarZ / (NearZ - FarZ), -1.0f); + M.r[3] = XMVectorSet(0.0f, 0.0f, M.r[2].vector4_f32[2] * NearZ, 0.0f); + + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); + XMASSERT(!XMScalarNearEqual(AspectHByW, 0.0f, 0.00001f)); + XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + XMMATRIX M; + FLOAT SinFov; + FLOAT CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + FLOAT fRange = FarZ / (NearZ-FarZ); + // Note: This is recorded on the stack + FLOAT Height = CosFov / SinFov; + XMVECTOR rMem = { + Height / AspectHByW, + Height, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // CosFov / SinFov,0,0,0 + M.r[0] = vTemp; + // 0,Height / AspectHByW,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,-1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues,g_XMNegIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,-1.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); + M.r[2] = vTemp; + // 0,0,fRange * NearZ,0.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); + M.r[3] = vTemp; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMMatrixPerspectiveOffCenterLH +( + FLOAT ViewLeft, + FLOAT ViewRight, + FLOAT ViewBottom, + FLOAT ViewTop, + FLOAT NearZ, + FLOAT FarZ +) +{ +#if defined(_XM_NO_INTRINSICS_) + + FLOAT TwoNearZ; + FLOAT ReciprocalWidth; + FLOAT ReciprocalHeight; + FLOAT fRange; + XMMATRIX M; + + XMASSERT(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + XMASSERT(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + + TwoNearZ = NearZ + NearZ; + ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + fRange = FarZ / (FarZ-NearZ); + + M.m[0][0] = TwoNearZ * ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ * 
ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = -(ViewLeft + ViewRight) * ReciprocalWidth; + M.m[2][1] = -(ViewTop + ViewBottom) * ReciprocalHeight; + M.m[2][2] = fRange; + M.m[2][3] = 1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + XMASSERT(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + XMMATRIX M; + FLOAT TwoNearZ = NearZ+NearZ; + FLOAT ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + FLOAT ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + FLOAT fRange = FarZ / (FarZ-NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ*ReciprocalWidth, + TwoNearZ*ReciprocalHeight, + -fRange * NearZ, + 0 + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // TwoNearZ*ReciprocalWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ*ReciprocalHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // 0,0,fRange,1.0f + M.r[2] = XMVectorSet( -(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + 1.0f ); + // 0,0,-fRange * NearZ,0.0f + vValues = _mm_and_ps(vValues,g_XMMaskZ); + M.r[3] = vValues; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMMatrixPerspectiveOffCenterRH +( + FLOAT ViewLeft, + FLOAT ViewRight, + FLOAT ViewBottom, + FLOAT ViewTop, + FLOAT NearZ, + FLOAT FarZ +) +{ +#if defined(_XM_NO_INTRINSICS_) + + FLOAT TwoNearZ; + FLOAT ReciprocalWidth; + FLOAT ReciprocalHeight; + FLOAT fRange; + XMMATRIX M; + + XMASSERT(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + XMASSERT(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + + TwoNearZ = NearZ + NearZ; + ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + fRange = FarZ / (NearZ-FarZ); + + M.m[0][0] = TwoNearZ * ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ * ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = (ViewLeft + ViewRight) * ReciprocalWidth; + M.m[2][1] = (ViewTop + ViewBottom) * ReciprocalHeight; + M.m[2][2] = fRange; + M.m[2][3] = -1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + XMASSERT(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + + XMMATRIX M; + FLOAT TwoNearZ = NearZ+NearZ; + FLOAT ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + FLOAT ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + FLOAT fRange = FarZ / (NearZ-FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ*ReciprocalWidth, + TwoNearZ*ReciprocalHeight, + fRange * NearZ, + 0 + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // TwoNearZ*ReciprocalWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ*ReciprocalHeight,0,0 + 
vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // 0,0,fRange,1.0f + M.r[2] = XMVectorSet((ViewLeft + ViewRight) * ReciprocalWidth, + (ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + -1.0f); + // 0,0,-fRange * NearZ,0.0f + vValues = _mm_and_ps(vValues,g_XMMaskZ); + M.r[3] = vValues; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMMatrixOrthographicLH +( + FLOAT ViewWidth, + FLOAT ViewHeight, + FLOAT NearZ, + FLOAT FarZ +) +{ +#if defined(_XM_NO_INTRINSICS_) + + FLOAT fRange; + XMMATRIX M; + + XMASSERT(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + XMASSERT(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + + fRange = 1.0f / (FarZ-NearZ); + M.r[0] = XMVectorSet(2.0f / ViewWidth, 0.0f, 0.0f, 0.0f); + M.r[1] = XMVectorSet(0.0f, 2.0f / ViewHeight, 0.0f, 0.0f); + M.r[2] = XMVectorSet(0.0f, 0.0f, fRange, 0.0f); + M.r[3] = XMVectorSet(0.0f, 0.0f, -fRange * NearZ, 1.0f); + + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + XMASSERT(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + XMMATRIX M; + FLOAT fRange = 1.0f / (FarZ-NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + 2.0f / ViewWidth, + 2.0f / ViewHeight, + fRange, + -fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // 2.0f / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,2.0f / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,0.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,0,0,0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,1.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,1,0,0)); + M.r[3] = vTemp; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMMatrixOrthographicRH +( + FLOAT ViewWidth, + FLOAT ViewHeight, + FLOAT NearZ, + FLOAT FarZ +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + + XMASSERT(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + XMASSERT(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + + M.r[0] = XMVectorSet(2.0f / ViewWidth, 0.0f, 0.0f, 0.0f); + M.r[1] = XMVectorSet(0.0f, 2.0f / ViewHeight, 0.0f, 0.0f); + M.r[2] = XMVectorSet(0.0f, 0.0f, 1.0f / (NearZ - FarZ), 0.0f); + M.r[3] = XMVectorSet(0.0f, 0.0f, M.r[2].vector4_f32[2] * NearZ, 1.0f); + + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + XMASSERT(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + XMMATRIX M; + FLOAT fRange = 1.0f / (NearZ-FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + 2.0f / ViewWidth, + 2.0f / ViewHeight, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // 2.0f / ViewWidth,0,0,0 + 
M.r[0] = vTemp; + // 0,2.0f / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,0.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,0,0,0)); + M.r[2] = vTemp; + // 0,0,fRange * NearZ,1.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,1,0,0)); + M.r[3] = vTemp; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMMatrixOrthographicOffCenterLH +( + FLOAT ViewLeft, + FLOAT ViewRight, + FLOAT ViewBottom, + FLOAT ViewTop, + FLOAT NearZ, + FLOAT FarZ +) +{ +#if defined(_XM_NO_INTRINSICS_) + + FLOAT ReciprocalWidth; + FLOAT ReciprocalHeight; + XMMATRIX M; + + XMASSERT(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + XMASSERT(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + + ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + + M.r[0] = XMVectorSet(ReciprocalWidth + ReciprocalWidth, 0.0f, 0.0f, 0.0f); + M.r[1] = XMVectorSet(0.0f, ReciprocalHeight + ReciprocalHeight, 0.0f, 0.0f); + M.r[2] = XMVectorSet(0.0f, 0.0f, 1.0f / (FarZ - NearZ), 0.0f); + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + -M.r[2].vector4_f32[2] * NearZ, + 1.0f); + + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + FLOAT fReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + FLOAT fReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + FLOAT fRange = 1.0f / (FarZ-NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + fReciprocalWidth, + fReciprocalHeight, + fRange, + 1.0f + }; + XMVECTOR rMem2 = { + -(ViewLeft + ViewRight), + -(ViewTop + ViewBottom), + -NearZ, + 1.0f + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // fReciprocalWidth*2,0,0,0 + vTemp = _mm_add_ss(vTemp,vTemp); + M.r[0] = vTemp; + // 0,fReciprocalHeight*2,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + vTemp = _mm_add_ps(vTemp,vTemp); + M.r[1] = vTemp; + // 0,0,fRange,0.0f + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskZ); + M.r[2] = vTemp; + // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*-NearZ,1.0f + vValues = _mm_mul_ps(vValues,rMem2); + M.r[3] = vValues; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMMATRIX XMMatrixOrthographicOffCenterRH +( + FLOAT ViewLeft, + FLOAT ViewRight, + FLOAT ViewBottom, + FLOAT ViewTop, + FLOAT NearZ, + FLOAT FarZ +) +{ +#if defined(_XM_NO_INTRINSICS_) + + FLOAT ReciprocalWidth; + FLOAT ReciprocalHeight; + XMMATRIX M; + + XMASSERT(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + XMASSERT(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + + ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + + M.r[0] = XMVectorSet(ReciprocalWidth + ReciprocalWidth, 0.0f, 0.0f, 0.0f); + M.r[1] = XMVectorSet(0.0f, ReciprocalHeight + ReciprocalHeight, 0.0f, 0.0f); + M.r[2] = 
XMVectorSet(0.0f, 0.0f, 1.0f / (NearZ - FarZ), 0.0f); + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + M.r[2].vector4_f32[2] * NearZ, + 1.0f); + + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + FLOAT fReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + FLOAT fReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + FLOAT fRange = 1.0f / (NearZ-FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + fReciprocalWidth, + fReciprocalHeight, + fRange, + 1.0f + }; + XMVECTOR rMem2 = { + -(ViewLeft + ViewRight), + -(ViewTop + ViewBottom), + NearZ, + 1.0f + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // fReciprocalWidth*2,0,0,0 + vTemp = _mm_add_ss(vTemp,vTemp); + M.r[0] = vTemp; + // 0,fReciprocalHeight*2,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + vTemp = _mm_add_ps(vTemp,vTemp); + M.r[1] = vTemp; + // 0,0,fRange,0.0f + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskZ); + M.r[2] = vTemp; + // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*-NearZ,1.0f + vValues = _mm_mul_ps(vValues,rMem2); + M.r[3] = vValues; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + + +#ifdef __cplusplus + +/**************************************************************************** + * + * XMMATRIX operators and methods + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMMATRIX::_XMMATRIX +( + FXMVECTOR R0, + FXMVECTOR R1, + FXMVECTOR R2, + CXMVECTOR R3 +) +{ + r[0] = R0; + r[1] = R1; + r[2] = R2; + r[3] = R3; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMMATRIX::_XMMATRIX +( + FLOAT m00, FLOAT m01, FLOAT m02, FLOAT m03, + FLOAT m10, FLOAT m11, FLOAT m12, FLOAT m13, + FLOAT m20, FLOAT m21, FLOAT m22, FLOAT m23, + FLOAT m30, FLOAT m31, FLOAT m32, FLOAT m33 +) +{ + r[0] = XMVectorSet(m00, m01, m02, m03); + r[1] = XMVectorSet(m10, m11, m12, m13); + r[2] = XMVectorSet(m20, m21, m22, m23); + r[3] = XMVectorSet(m30, m31, m32, m33); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMMATRIX::_XMMATRIX +( + CONST FLOAT* pArray +) +{ + r[0] = XMLoadFloat4((const XMFLOAT4*)pArray); + r[1] = XMLoadFloat4((const XMFLOAT4*)(pArray + 4)); + r[2] = XMLoadFloat4((const XMFLOAT4*)(pArray + 8)); + r[3] = XMLoadFloat4((const XMFLOAT4*)(pArray + 12)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMMATRIX& _XMMATRIX::operator= +( + CONST _XMMATRIX& M +) +{ + r[0] = M.r[0]; + r[1] = M.r[1]; + r[2] = M.r[2]; + r[3] = M.r[3]; + return *this; +} + +//------------------------------------------------------------------------------ + +#ifndef XM_NO_OPERATOR_OVERLOADS + +#if !defined(_XBOX_VER) && defined(_XM_ISVS2005_) && defined(_XM_X64_) +#pragma warning(push) +#pragma warning(disable : 4328) +#endif + +XMFINLINE _XMMATRIX& _XMMATRIX::operator*= +( + CONST _XMMATRIX& M +) +{ + *this = XMMatrixMultiply(*this, M); + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMMATRIX _XMMATRIX::operator* +( + CONST _XMMATRIX& M +) CONST +{ + return XMMatrixMultiply(*this, M); +} + +#if 
!defined(_XBOX_VER) && defined(_XM_ISVS2005_) && defined(_XM_X64_) +#pragma warning(pop) +#endif + +#endif // !XM_NO_OPERATOR_OVERLOADS + +/**************************************************************************** + * + * XMFLOAT3X3 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMFLOAT3X3::_XMFLOAT3X3 +( + FLOAT m00, FLOAT m01, FLOAT m02, + FLOAT m10, FLOAT m11, FLOAT m12, + FLOAT m20, FLOAT m21, FLOAT m22 +) +{ + m[0][0] = m00; + m[0][1] = m01; + m[0][2] = m02; + + m[1][0] = m10; + m[1][1] = m11; + m[1][2] = m12; + + m[2][0] = m20; + m[2][1] = m21; + m[2][2] = m22; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMFLOAT3X3::_XMFLOAT3X3 +( + CONST FLOAT* pArray +) +{ + UINT Row; + UINT Column; + + for (Row = 0; Row < 3; Row++) + { + for (Column = 0; Column < 3; Column++) + { + m[Row][Column] = pArray[Row * 3 + Column]; + } + } +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMFLOAT3X3& _XMFLOAT3X3::operator= +( + CONST _XMFLOAT3X3& Float3x3 +) +{ + _11 = Float3x3._11; + _12 = Float3x3._12; + _13 = Float3x3._13; + _21 = Float3x3._21; + _22 = Float3x3._22; + _23 = Float3x3._23; + _31 = Float3x3._31; + _32 = Float3x3._32; + _33 = Float3x3._33; + + return *this; +} + +/**************************************************************************** + * + * XMFLOAT4X3 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMFLOAT4X3::_XMFLOAT4X3 +( + FLOAT m00, FLOAT m01, FLOAT m02, + FLOAT m10, FLOAT m11, FLOAT m12, + FLOAT m20, FLOAT m21, FLOAT m22, + FLOAT m30, FLOAT m31, FLOAT m32 +) +{ + m[0][0] = m00; + m[0][1] = m01; + m[0][2] = m02; + + m[1][0] = m10; + m[1][1] = m11; + m[1][2] = m12; + + m[2][0] = m20; + m[2][1] = m21; + m[2][2] = m22; + + m[3][0] = m30; + m[3][1] = m31; + m[3][2] = m32; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMFLOAT4X3::_XMFLOAT4X3 +( + CONST FLOAT* pArray +) +{ + UINT Row; + UINT Column; + + for (Row = 0; Row < 4; Row++) + { + for (Column = 0; Column < 3; Column++) + { + m[Row][Column] = pArray[Row * 3 + Column]; + } + } +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMFLOAT4X3& _XMFLOAT4X3::operator= +( + CONST _XMFLOAT4X3& Float4x3 +) +{ + XMVECTOR V1 = XMLoadFloat4((const XMFLOAT4*)&Float4x3._11); + XMVECTOR V2 = XMLoadFloat4((const XMFLOAT4*)&Float4x3._22); + XMVECTOR V3 = XMLoadFloat4((const XMFLOAT4*)&Float4x3._33); + + XMStoreFloat4((XMFLOAT4*)&_11, V1); + XMStoreFloat4((XMFLOAT4*)&_22, V2); + XMStoreFloat4((XMFLOAT4*)&_33, V3); + + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMFLOAT4X3A& XMFLOAT4X3A::operator= +( + CONST XMFLOAT4X3A& Float4x3 +) +{ + XMVECTOR V1 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x3._11); + XMVECTOR V2 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x3._22); + XMVECTOR V3 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x3._33); + + XMStoreFloat4A((XMFLOAT4A*)&_11, V1); + XMStoreFloat4A((XMFLOAT4A*)&_22, V2); + XMStoreFloat4A((XMFLOAT4A*)&_33, V3); + + return *this; +} + +/**************************************************************************** + * + * XMFLOAT4X4 operators + * + 
****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMFLOAT4X4::_XMFLOAT4X4 +( + FLOAT m00, FLOAT m01, FLOAT m02, FLOAT m03, + FLOAT m10, FLOAT m11, FLOAT m12, FLOAT m13, + FLOAT m20, FLOAT m21, FLOAT m22, FLOAT m23, + FLOAT m30, FLOAT m31, FLOAT m32, FLOAT m33 +) +{ + m[0][0] = m00; + m[0][1] = m01; + m[0][2] = m02; + m[0][3] = m03; + + m[1][0] = m10; + m[1][1] = m11; + m[1][2] = m12; + m[1][3] = m13; + + m[2][0] = m20; + m[2][1] = m21; + m[2][2] = m22; + m[2][3] = m23; + + m[3][0] = m30; + m[3][1] = m31; + m[3][2] = m32; + m[3][3] = m33; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMFLOAT4X4::_XMFLOAT4X4 +( + CONST FLOAT* pArray +) +{ + UINT Row; + UINT Column; + + for (Row = 0; Row < 4; Row++) + { + for (Column = 0; Column < 4; Column++) + { + m[Row][Column] = pArray[Row * 4 + Column]; + } + } +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMFLOAT4X4& _XMFLOAT4X4::operator= +( + CONST _XMFLOAT4X4& Float4x4 +) +{ + XMVECTOR V1 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._11); + XMVECTOR V2 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._21); + XMVECTOR V3 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._31); + XMVECTOR V4 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._41); + + XMStoreFloat4((XMFLOAT4*)&_11, V1); + XMStoreFloat4((XMFLOAT4*)&_21, V2); + XMStoreFloat4((XMFLOAT4*)&_31, V3); + XMStoreFloat4((XMFLOAT4*)&_41, V4); + + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMFLOAT4X4A& XMFLOAT4X4A::operator= +( + CONST XMFLOAT4X4A& Float4x4 +) +{ + XMVECTOR V1 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._11); + XMVECTOR V2 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._21); + XMVECTOR V3 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._31); + XMVECTOR V4 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._41); + + XMStoreFloat4A((XMFLOAT4A*)&_11, V1); + XMStoreFloat4A((XMFLOAT4A*)&_21, V2); + XMStoreFloat4A((XMFLOAT4A*)&_31, V3); + XMStoreFloat4A((XMFLOAT4A*)&_41, V4); + + return *this; +} + +#endif // __cplusplus + +#endif // __XNAMATHMATRIX_INL__ + diff --git a/thirdparty/directxtex/XNAMath/xnamathmisc.inl b/thirdparty/directxtex/XNAMath/xnamathmisc.inl new file mode 100644 index 0000000..d4d4ef2 --- /dev/null +++ b/thirdparty/directxtex/XNAMath/xnamathmisc.inl @@ -0,0 +1,2460 @@ +/************************************************************************ +* * +* xnamathmisc.inl -- SIMD C++ Math library for Windows and Xbox 360 * +* Quaternion, plane, and color functions * +* * +* Copyright (c) Microsoft Corp. All rights reserved. 
* +* * +************************************************************************/ + +#if defined(_MSC_VER) && (_MSC_VER > 1000) +#pragma once +#endif + +#ifndef __XNAMATHMISC_INL__ +#define __XNAMATHMISC_INL__ + +/**************************************************************************** + * + * Quaternion + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMQuaternionEqual +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) +{ + return XMVector4Equal(Q1, Q2); +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMQuaternionNotEqual +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) +{ + return XMVector4NotEqual(Q1, Q2); +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMQuaternionIsNaN +( + FXMVECTOR Q +) +{ + return XMVector4IsNaN(Q); +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMQuaternionIsInfinite +( + FXMVECTOR Q +) +{ + return XMVector4IsInfinite(Q); +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMQuaternionIsIdentity +( + FXMVECTOR Q +) +{ +#if defined(_XM_NO_INTRINSICS_) + + return XMVector4Equal(Q, g_XMIdentityR3.v); + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(Q,g_XMIdentityR3); + return (_mm_movemask_ps(vTemp)==0x0f); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMQuaternionDot +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) +{ + return XMVector4Dot(Q1, Q2); +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMQuaternionMultiply +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR NegativeQ1; + XMVECTOR Q2X; + XMVECTOR Q2Y; + XMVECTOR Q2Z; + XMVECTOR Q2W; + XMVECTOR Q1WZYX; + XMVECTOR Q1ZWXY; + XMVECTOR Q1YXWZ; + XMVECTOR Result; + CONST XMVECTORU32 ControlWZYX = {XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_1X}; + CONST XMVECTORU32 ControlZWXY = {XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_1Y}; + CONST XMVECTORU32 ControlYXWZ = {XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_1Z}; + + NegativeQ1 = XMVectorNegate(Q1); + + Q2W = XMVectorSplatW(Q2); + Q2X = XMVectorSplatX(Q2); + Q2Y = XMVectorSplatY(Q2); + Q2Z = XMVectorSplatZ(Q2); + + Q1WZYX = XMVectorPermute(Q1, NegativeQ1, ControlWZYX.v); + Q1ZWXY = XMVectorPermute(Q1, NegativeQ1, ControlZWXY.v); + Q1YXWZ = XMVectorPermute(Q1, NegativeQ1, ControlYXWZ.v); + + Result = XMVectorMultiply(Q1, Q2W); + Result = XMVectorMultiplyAdd(Q1WZYX, Q2X, Result); + Result = XMVectorMultiplyAdd(Q1ZWXY, Q2Y, Result); + Result = XMVectorMultiplyAdd(Q1YXWZ, Q2Z, Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + static CONST XMVECTORF32 ControlWZYX = { 1.0f,-1.0f, 1.0f,-1.0f}; + static CONST XMVECTORF32 ControlZWXY = { 1.0f, 1.0f,-1.0f,-1.0f}; + static CONST 
XMVECTORF32 ControlYXWZ = {-1.0f, 1.0f, 1.0f,-1.0f}; + // Copy to SSE registers and use as few as possible for x86 + XMVECTOR Q2X = Q2; + XMVECTOR Q2Y = Q2; + XMVECTOR Q2Z = Q2; + XMVECTOR vResult = Q2; + // Splat with one instruction + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,3,3,3)); + Q2X = _mm_shuffle_ps(Q2X,Q2X,_MM_SHUFFLE(0,0,0,0)); + Q2Y = _mm_shuffle_ps(Q2Y,Q2Y,_MM_SHUFFLE(1,1,1,1)); + Q2Z = _mm_shuffle_ps(Q2Z,Q2Z,_MM_SHUFFLE(2,2,2,2)); + // Retire Q1 and perform Q1*Q2W + vResult = _mm_mul_ps(vResult,Q1); + XMVECTOR Q1Shuffle = Q1; + // Shuffle the copies of Q1 + Q1Shuffle = _mm_shuffle_ps(Q1Shuffle,Q1Shuffle,_MM_SHUFFLE(0,1,2,3)); + // Mul by Q1WZYX + Q2X = _mm_mul_ps(Q2X,Q1Shuffle); + Q1Shuffle = _mm_shuffle_ps(Q1Shuffle,Q1Shuffle,_MM_SHUFFLE(2,3,0,1)); + // Flip the signs on y and z + Q2X = _mm_mul_ps(Q2X,ControlWZYX); + // Mul by Q1ZWXY + Q2Y = _mm_mul_ps(Q2Y,Q1Shuffle); + Q1Shuffle = _mm_shuffle_ps(Q1Shuffle,Q1Shuffle,_MM_SHUFFLE(0,1,2,3)); + // Flip the signs on z and w + Q2Y = _mm_mul_ps(Q2Y,ControlZWXY); + // Mul by Q1YXWZ + Q2Z = _mm_mul_ps(Q2Z,Q1Shuffle); + vResult = _mm_add_ps(vResult,Q2X); + // Flip the signs on x and w + Q2Z = _mm_mul_ps(Q2Z,ControlYXWZ); + Q2Y = _mm_add_ps(Q2Y,Q2Z); + vResult = _mm_add_ps(vResult,Q2Y); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMQuaternionLengthSq +( + FXMVECTOR Q +) +{ + return XMVector4LengthSq(Q); +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMQuaternionReciprocalLength +( + FXMVECTOR Q +) +{ + return XMVector4ReciprocalLength(Q); +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMQuaternionLength +( + FXMVECTOR Q +) +{ + return XMVector4Length(Q); +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMQuaternionNormalizeEst +( + FXMVECTOR Q +) +{ + return XMVector4NormalizeEst(Q); +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMQuaternionNormalize +( + FXMVECTOR Q +) +{ + return XMVector4Normalize(Q); +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMQuaternionConjugate +( + FXMVECTOR Q +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result = { + -Q.vector4_f32[0], + -Q.vector4_f32[1], + -Q.vector4_f32[2], + Q.vector4_f32[3] + }; + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 NegativeOne3 = {-1.0f,-1.0f,-1.0f,1.0f}; + XMVECTOR Result = _mm_mul_ps(Q,NegativeOne3); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMQuaternionInverse +( + FXMVECTOR Q +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Conjugate; + XMVECTOR L; + XMVECTOR Control; + XMVECTOR Result; + CONST XMVECTOR Zero = XMVectorZero(); + + L = XMVector4LengthSq(Q); + Conjugate = XMQuaternionConjugate(Q); + + Control = XMVectorLessOrEqual(L, g_XMEpsilon.v); + + L = XMVectorReciprocal(L); + Result = XMVectorMultiply(Conjugate, L); + + Result = XMVectorSelect(Result, Zero, Control); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR Conjugate; + XMVECTOR L; + XMVECTOR Control; + XMVECTOR Result; + XMVECTOR Zero = 
XMVectorZero(); + + L = XMVector4LengthSq(Q); + Conjugate = XMQuaternionConjugate(Q); + Control = XMVectorLessOrEqual(L, g_XMEpsilon); + Result = _mm_div_ps(Conjugate,L); + Result = XMVectorSelect(Result, Zero, Control); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMQuaternionLn +( + FXMVECTOR Q +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Q0; + XMVECTOR QW; + XMVECTOR Theta; + XMVECTOR SinTheta; + XMVECTOR S; + XMVECTOR ControlW; + XMVECTOR Result; + static CONST XMVECTOR OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f}; + + QW = XMVectorSplatW(Q); + Q0 = XMVectorSelect(g_XMSelect1110.v, Q, g_XMSelect1110.v); + + ControlW = XMVectorInBounds(QW, OneMinusEpsilon); + + Theta = XMVectorACos(QW); + SinTheta = XMVectorSin(Theta); + + S = XMVectorReciprocal(SinTheta); + S = XMVectorMultiply(Theta, S); + + Result = XMVectorMultiply(Q0, S); + + Result = XMVectorSelect(Q0, Result, ControlW); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + static CONST XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f}; + static CONST XMVECTORF32 NegOneMinusEpsilon = {-(1.0f - 0.00001f), -(1.0f - 0.00001f),-(1.0f - 0.00001f),-(1.0f - 0.00001f)}; + // Get W only + XMVECTOR QW = _mm_shuffle_ps(Q,Q,_MM_SHUFFLE(3,3,3,3)); + // W = 0 + XMVECTOR Q0 = _mm_and_ps(Q,g_XMMask3); + // Use W if within bounds + XMVECTOR ControlW = _mm_cmple_ps(QW,OneMinusEpsilon); + XMVECTOR vTemp2 = _mm_cmpge_ps(QW,NegOneMinusEpsilon); + ControlW = _mm_and_ps(ControlW,vTemp2); + // Get theta + XMVECTOR vTheta = XMVectorACos(QW); + // Get Sine of theta + vTemp2 = XMVectorSin(vTheta); + // theta/sine of theta + vTheta = _mm_div_ps(vTheta,vTemp2); + // Here's the answer + vTheta = _mm_mul_ps(vTheta,Q0); + // Was W in bounds? 
If not, return input as is + vTheta = XMVectorSelect(Q0,vTheta,ControlW); + return vTheta; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMQuaternionExp +( + FXMVECTOR Q +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Theta; + XMVECTOR SinTheta; + XMVECTOR CosTheta; + XMVECTOR S; + XMVECTOR Control; + XMVECTOR Zero; + XMVECTOR Result; + + Theta = XMVector3Length(Q); + XMVectorSinCos(&SinTheta, &CosTheta, Theta); + + S = XMVectorReciprocal(Theta); + S = XMVectorMultiply(SinTheta, S); + + Result = XMVectorMultiply(Q, S); + + Zero = XMVectorZero(); + Control = XMVectorNearEqual(Theta, Zero, g_XMEpsilon.v); + Result = XMVectorSelect(Result, Q, Control); + + Result = XMVectorSelect(CosTheta, Result, g_XMSelect1110.v); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR Theta; + XMVECTOR SinTheta; + XMVECTOR CosTheta; + XMVECTOR S; + XMVECTOR Control; + XMVECTOR Zero; + XMVECTOR Result; + Theta = XMVector3Length(Q); + XMVectorSinCos(&SinTheta, &CosTheta, Theta); + S = _mm_div_ps(SinTheta,Theta); + Result = _mm_mul_ps(Q, S); + Zero = XMVectorZero(); + Control = XMVectorNearEqual(Theta, Zero, g_XMEpsilon); + Result = XMVectorSelect(Result,Q,Control); + Result = _mm_and_ps(Result,g_XMMask3); + CosTheta = _mm_and_ps(CosTheta,g_XMMaskW); + Result = _mm_or_ps(Result,CosTheta); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMVECTOR XMQuaternionSlerp +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FLOAT t +) +{ + XMVECTOR T = XMVectorReplicate(t); + return XMQuaternionSlerpV(Q0, Q1, T); +} + +//------------------------------------------------------------------------------ + +XMINLINE XMVECTOR XMQuaternionSlerpV +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR T +) +{ +#if defined(_XM_NO_INTRINSICS_) + + // Result = Q0 * sin((1.0 - t) * Omega) / sin(Omega) + Q1 * sin(t * Omega) / sin(Omega) + XMVECTOR Omega; + XMVECTOR CosOmega; + XMVECTOR SinOmega; + XMVECTOR InvSinOmega; + XMVECTOR V01; + XMVECTOR C1000; + XMVECTOR SignMask; + XMVECTOR S0; + XMVECTOR S1; + XMVECTOR Sign; + XMVECTOR Control; + XMVECTOR Result; + XMVECTOR Zero; + CONST XMVECTOR OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f}; + + XMASSERT((T.vector4_f32[1] == T.vector4_f32[0]) && (T.vector4_f32[2] == T.vector4_f32[0]) && (T.vector4_f32[3] == T.vector4_f32[0])); + + CosOmega = XMQuaternionDot(Q0, Q1); + + Zero = XMVectorZero(); + Control = XMVectorLess(CosOmega, Zero); + Sign = XMVectorSelect(g_XMOne.v, g_XMNegativeOne.v, Control); + + CosOmega = XMVectorMultiply(CosOmega, Sign); + + Control = XMVectorLess(CosOmega, OneMinusEpsilon); + + SinOmega = XMVectorNegativeMultiplySubtract(CosOmega, CosOmega, g_XMOne.v); + SinOmega = XMVectorSqrt(SinOmega); + + Omega = XMVectorATan2(SinOmega, CosOmega); + + SignMask = XMVectorSplatSignMask(); + C1000 = XMVectorSetBinaryConstant(1, 0, 0, 0); + V01 = XMVectorShiftLeft(T, Zero, 2); + SignMask = XMVectorShiftLeft(SignMask, Zero, 3); + V01 = XMVectorXorInt(V01, SignMask); + V01 = XMVectorAdd(C1000, V01); + + InvSinOmega = XMVectorReciprocal(SinOmega); + + S0 = XMVectorMultiply(V01, Omega); + S0 = XMVectorSin(S0); + S0 = XMVectorMultiply(S0, InvSinOmega); + + S0 = XMVectorSelect(V01, S0, Control); + + S1 = XMVectorSplatY(S0); + S0 = XMVectorSplatX(S0); + + S1 = XMVectorMultiply(S1, Sign); + + 
Result = XMVectorMultiply(Q0, S0); + Result = XMVectorMultiplyAdd(Q1, S1, Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Result = Q0 * sin((1.0 - t) * Omega) / sin(Omega) + Q1 * sin(t * Omega) / sin(Omega) + XMVECTOR Omega; + XMVECTOR CosOmega; + XMVECTOR SinOmega; + XMVECTOR V01; + XMVECTOR S0; + XMVECTOR S1; + XMVECTOR Sign; + XMVECTOR Control; + XMVECTOR Result; + XMVECTOR Zero; + static const XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f}; + static const XMVECTORI32 SignMask2 = {0x80000000,0x00000000,0x00000000,0x00000000}; + static const XMVECTORI32 MaskXY = {0xFFFFFFFF,0xFFFFFFFF,0x00000000,0x00000000}; + + XMASSERT((XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T))); + + CosOmega = XMQuaternionDot(Q0, Q1); + + Zero = XMVectorZero(); + Control = XMVectorLess(CosOmega, Zero); + Sign = XMVectorSelect(g_XMOne, g_XMNegativeOne, Control); + + CosOmega = _mm_mul_ps(CosOmega, Sign); + + Control = XMVectorLess(CosOmega, OneMinusEpsilon); + + SinOmega = _mm_mul_ps(CosOmega,CosOmega); + SinOmega = _mm_sub_ps(g_XMOne,SinOmega); + SinOmega = _mm_sqrt_ps(SinOmega); + + Omega = XMVectorATan2(SinOmega, CosOmega); + + V01 = _mm_shuffle_ps(T,T,_MM_SHUFFLE(2,3,0,1)); + V01 = _mm_and_ps(V01,MaskXY); + V01 = _mm_xor_ps(V01,SignMask2); + V01 = _mm_add_ps(g_XMIdentityR0, V01); + + S0 = _mm_mul_ps(V01, Omega); + S0 = XMVectorSin(S0); + S0 = _mm_div_ps(S0, SinOmega); + + S0 = XMVectorSelect(V01, S0, Control); + + S1 = XMVectorSplatY(S0); + S0 = XMVectorSplatX(S0); + + S1 = _mm_mul_ps(S1, Sign); + Result = _mm_mul_ps(Q0, S0); + S1 = _mm_mul_ps(S1, Q1); + Result = _mm_add_ps(Result,S1); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMQuaternionSquad +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + CXMVECTOR Q3, + FLOAT t +) +{ + XMVECTOR T = XMVectorReplicate(t); + return XMQuaternionSquadV(Q0, Q1, Q2, Q3, T); +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMQuaternionSquadV +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + CXMVECTOR Q3, + CXMVECTOR T +) +{ + XMVECTOR Q03; + XMVECTOR Q12; + XMVECTOR TP; + XMVECTOR Two; + XMVECTOR Result; + + XMASSERT( (XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T)) ); + + TP = T; + Two = XMVectorSplatConstant(2, 0); + + Q03 = XMQuaternionSlerpV(Q0, Q3, T); + Q12 = XMQuaternionSlerpV(Q1, Q2, T); + + TP = XMVectorNegativeMultiplySubtract(TP, TP, TP); + TP = XMVectorMultiply(TP, Two); + + Result = XMQuaternionSlerpV(Q03, Q12, TP); + + return Result; + +} + +//------------------------------------------------------------------------------ + +XMINLINE VOID XMQuaternionSquadSetup +( + XMVECTOR* pA, + XMVECTOR* pB, + XMVECTOR* pC, + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + CXMVECTOR Q3 +) +{ + XMVECTOR SQ0, SQ2, SQ3; + XMVECTOR InvQ1, InvQ2; + XMVECTOR LnQ0, LnQ1, LnQ2, LnQ3; + XMVECTOR ExpQ02, ExpQ13; + XMVECTOR LS01, LS12, LS23; + XMVECTOR LD01, LD12, LD23; + XMVECTOR Control0, Control1, Control2; + XMVECTOR NegativeOneQuarter; + + XMASSERT(pA); + XMASSERT(pB); + XMASSERT(pC); + + LS12 = XMQuaternionLengthSq(XMVectorAdd(Q1, Q2)); + LD12 = XMQuaternionLengthSq(XMVectorSubtract(Q1, Q2)); + SQ2 = XMVectorNegate(Q2); + + Control1 = XMVectorLess(LS12, LD12); + 
SQ2 = XMVectorSelect(Q2, SQ2, Control1); + + LS01 = XMQuaternionLengthSq(XMVectorAdd(Q0, Q1)); + LD01 = XMQuaternionLengthSq(XMVectorSubtract(Q0, Q1)); + SQ0 = XMVectorNegate(Q0); + + LS23 = XMQuaternionLengthSq(XMVectorAdd(SQ2, Q3)); + LD23 = XMQuaternionLengthSq(XMVectorSubtract(SQ2, Q3)); + SQ3 = XMVectorNegate(Q3); + + Control0 = XMVectorLess(LS01, LD01); + Control2 = XMVectorLess(LS23, LD23); + + SQ0 = XMVectorSelect(Q0, SQ0, Control0); + SQ3 = XMVectorSelect(Q3, SQ3, Control2); + + InvQ1 = XMQuaternionInverse(Q1); + InvQ2 = XMQuaternionInverse(SQ2); + + LnQ0 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ0)); + LnQ2 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ2)); + LnQ1 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, Q1)); + LnQ3 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, SQ3)); + + NegativeOneQuarter = XMVectorSplatConstant(-1, 2); + + ExpQ02 = XMVectorMultiply(XMVectorAdd(LnQ0, LnQ2), NegativeOneQuarter); + ExpQ13 = XMVectorMultiply(XMVectorAdd(LnQ1, LnQ3), NegativeOneQuarter); + ExpQ02 = XMQuaternionExp(ExpQ02); + ExpQ13 = XMQuaternionExp(ExpQ13); + + *pA = XMQuaternionMultiply(Q1, ExpQ02); + *pB = XMQuaternionMultiply(SQ2, ExpQ13); + *pC = SQ2; +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMQuaternionBaryCentric +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + FLOAT f, + FLOAT g +) +{ + XMVECTOR Q01; + XMVECTOR Q02; + FLOAT s; + XMVECTOR Result; + + s = f + g; + + if ((s < 0.00001f) && (s > -0.00001f)) + { + Result = Q0; + } + else + { + Q01 = XMQuaternionSlerp(Q0, Q1, s); + Q02 = XMQuaternionSlerp(Q0, Q2, s); + + Result = XMQuaternionSlerp(Q01, Q02, g / s); + } + + return Result; +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMQuaternionBaryCentricV +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + CXMVECTOR F, + CXMVECTOR G +) +{ + XMVECTOR Q01; + XMVECTOR Q02; + XMVECTOR S, GS; + XMVECTOR Epsilon; + XMVECTOR Result; + + XMASSERT( (XMVectorGetY(F) == XMVectorGetX(F)) && (XMVectorGetZ(F) == XMVectorGetX(F)) && (XMVectorGetW(F) == XMVectorGetX(F)) ); + XMASSERT( (XMVectorGetY(G) == XMVectorGetX(G)) && (XMVectorGetZ(G) == XMVectorGetX(G)) && (XMVectorGetW(G) == XMVectorGetX(G)) ); + + Epsilon = XMVectorSplatConstant(1, 16); + + S = XMVectorAdd(F, G); + + if (XMVector4InBounds(S, Epsilon)) + { + Result = Q0; + } + else + { + Q01 = XMQuaternionSlerpV(Q0, Q1, S); + Q02 = XMQuaternionSlerpV(Q0, Q2, S); + GS = XMVectorReciprocal(S); + GS = XMVectorMultiply(G, GS); + + Result = XMQuaternionSlerpV(Q01, Q02, GS); + } + + return Result; +} + +//------------------------------------------------------------------------------ +// Transformation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMQuaternionIdentity() +{ +#if defined(_XM_NO_INTRINSICS_) + return g_XMIdentityR3.v; +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMIdentityR3; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMQuaternionRotationRollPitchYaw +( + FLOAT Pitch, + FLOAT Yaw, + FLOAT Roll +) +{ + XMVECTOR Angles; + XMVECTOR Q; + + Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f); + Q = XMQuaternionRotationRollPitchYawFromVector(Angles); + + return Q; +} + 
+//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMQuaternionRotationRollPitchYawFromVector +( + FXMVECTOR Angles // +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Q, Q0, Q1; + XMVECTOR P0, P1, Y0, Y1, R0, R1; + XMVECTOR HalfAngles; + XMVECTOR SinAngles, CosAngles; + static CONST XMVECTORU32 ControlPitch = {XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X}; + static CONST XMVECTORU32 ControlYaw = {XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y}; + static CONST XMVECTORU32 ControlRoll = {XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z}; + static CONST XMVECTOR Sign = {1.0f, -1.0f, -1.0f, 1.0f}; + + HalfAngles = XMVectorMultiply(Angles, g_XMOneHalf.v); + XMVectorSinCos(&SinAngles, &CosAngles, HalfAngles); + + P0 = XMVectorPermute(SinAngles, CosAngles, ControlPitch.v); + Y0 = XMVectorPermute(SinAngles, CosAngles, ControlYaw.v); + R0 = XMVectorPermute(SinAngles, CosAngles, ControlRoll.v); + P1 = XMVectorPermute(CosAngles, SinAngles, ControlPitch.v); + Y1 = XMVectorPermute(CosAngles, SinAngles, ControlYaw.v); + R1 = XMVectorPermute(CosAngles, SinAngles, ControlRoll.v); + + Q1 = XMVectorMultiply(P1, Sign); + Q0 = XMVectorMultiply(P0, Y0); + Q1 = XMVectorMultiply(Q1, Y1); + Q0 = XMVectorMultiply(Q0, R0); + Q = XMVectorMultiplyAdd(Q1, R1, Q0); + + return Q; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR Q, Q0, Q1; + XMVECTOR P0, P1, Y0, Y1, R0, R1; + XMVECTOR HalfAngles; + XMVECTOR SinAngles, CosAngles; + static CONST XMVECTORI32 ControlPitch = {XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X}; + static CONST XMVECTORI32 ControlYaw = {XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y}; + static CONST XMVECTORI32 ControlRoll = {XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z}; + static CONST XMVECTORF32 Sign = {1.0f, -1.0f, -1.0f, 1.0f}; + + HalfAngles = _mm_mul_ps(Angles, g_XMOneHalf); + XMVectorSinCos(&SinAngles, &CosAngles, HalfAngles); + + P0 = XMVectorPermute(SinAngles, CosAngles, ControlPitch); + Y0 = XMVectorPermute(SinAngles, CosAngles, ControlYaw); + R0 = XMVectorPermute(SinAngles, CosAngles, ControlRoll); + P1 = XMVectorPermute(CosAngles, SinAngles, ControlPitch); + Y1 = XMVectorPermute(CosAngles, SinAngles, ControlYaw); + R1 = XMVectorPermute(CosAngles, SinAngles, ControlRoll); + + Q1 = _mm_mul_ps(P1, Sign); + Q0 = _mm_mul_ps(P0, Y0); + Q1 = _mm_mul_ps(Q1, Y1); + Q0 = _mm_mul_ps(Q0, R0); + Q = _mm_mul_ps(Q1, R1); + Q = _mm_add_ps(Q,Q0); + return Q; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMQuaternionRotationNormal +( + FXMVECTOR NormalAxis, + FLOAT Angle +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Q; + XMVECTOR N; + XMVECTOR Scale; + + N = XMVectorSelect(g_XMOne.v, NormalAxis, g_XMSelect1110.v); + + XMScalarSinCos(&Scale.vector4_f32[2], &Scale.vector4_f32[3], 0.5f * Angle); + + Scale.vector4_f32[0] = Scale.vector4_f32[1] = Scale.vector4_f32[2]; + + Q = XMVectorMultiply(N, Scale); + + return Q; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR N = _mm_and_ps(NormalAxis,g_XMMask3); + N = _mm_or_ps(N,g_XMIdentityR3); + XMVECTOR Scale = _mm_set_ps1(0.5f * Angle); + XMVECTOR vSine; + XMVECTOR vCosine; + XMVectorSinCos(&vSine,&vCosine,Scale); + Scale = _mm_and_ps(vSine,g_XMMask3); + vCosine = _mm_and_ps(vCosine,g_XMMaskW); + Scale = _mm_or_ps(Scale,vCosine); + N = _mm_mul_ps(N,Scale); + return N; +#else // _XM_VMX128_INTRINSICS_ 
+#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMQuaternionRotationAxis +( + FXMVECTOR Axis, + FLOAT Angle +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Normal; + XMVECTOR Q; + + XMASSERT(!XMVector3Equal(Axis, XMVectorZero())); + XMASSERT(!XMVector3IsInfinite(Axis)); + + Normal = XMVector3Normalize(Axis); + Q = XMQuaternionRotationNormal(Normal, Angle); + + return Q; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR Normal; + XMVECTOR Q; + + XMASSERT(!XMVector3Equal(Axis, XMVectorZero())); + XMASSERT(!XMVector3IsInfinite(Axis)); + + Normal = XMVector3Normalize(Axis); + Q = XMQuaternionRotationNormal(Normal, Angle); + return Q; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMVECTOR XMQuaternionRotationMatrix +( + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) + + XMVECTOR Q0, Q1, Q2; + XMVECTOR M00, M11, M22; + XMVECTOR CQ0, CQ1, C; + XMVECTOR CX, CY, CZ, CW; + XMVECTOR SQ1, Scale; + XMVECTOR Rsq, Sqrt, VEqualsNaN; + XMVECTOR A, B, P; + XMVECTOR PermuteSplat, PermuteSplatT; + XMVECTOR SignB, SignBT; + XMVECTOR PermuteControl, PermuteControlT; + XMVECTOR Result; + static CONST XMVECTORF32 OneQuarter = {0.25f, 0.25f, 0.25f, 0.25f}; + static CONST XMVECTORF32 SignPNNP = {1.0f, -1.0f, -1.0f, 1.0f}; + static CONST XMVECTORF32 SignNPNP = {-1.0f, 1.0f, -1.0f, 1.0f}; + static CONST XMVECTORF32 SignNNPP = {-1.0f, -1.0f, 1.0f, 1.0f}; + static CONST XMVECTORF32 SignPNPP = {1.0f, -1.0f, 1.0f, 1.0f}; + static CONST XMVECTORF32 SignPPNP = {1.0f, 1.0f, -1.0f, 1.0f}; + static CONST XMVECTORF32 SignNPPP = {-1.0f, 1.0f, 1.0f, 1.0f}; + static CONST XMVECTORU32 Permute0X0X0Y0W = {XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0W}; + static CONST XMVECTORU32 Permute0Y0Z0Z1W = {XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_1W}; + static CONST XMVECTORU32 SplatX = {XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X}; + static CONST XMVECTORU32 SplatY = {XM_PERMUTE_0Y, XM_PERMUTE_0Y, XM_PERMUTE_0Y, XM_PERMUTE_0Y}; + static CONST XMVECTORU32 SplatZ = {XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Z}; + static CONST XMVECTORU32 SplatW = {XM_PERMUTE_0W, XM_PERMUTE_0W, XM_PERMUTE_0W, XM_PERMUTE_0W}; + static CONST XMVECTORU32 PermuteC = {XM_PERMUTE_0X, XM_PERMUTE_0Z, XM_PERMUTE_1X, XM_PERMUTE_1Y}; + static CONST XMVECTORU32 PermuteA = {XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Z, XM_PERMUTE_0W}; + static CONST XMVECTORU32 PermuteB = {XM_PERMUTE_1X, XM_PERMUTE_1W, XM_PERMUTE_0Z, XM_PERMUTE_0W}; + static CONST XMVECTORU32 Permute0 = {XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1Z, XM_PERMUTE_1Y}; + static CONST XMVECTORU32 Permute1 = {XM_PERMUTE_1X, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Z}; + static CONST XMVECTORU32 Permute2 = {XM_PERMUTE_1Z, XM_PERMUTE_1Y, XM_PERMUTE_0Z, XM_PERMUTE_1X}; + static CONST XMVECTORU32 Permute3 = {XM_PERMUTE_1Y, XM_PERMUTE_1Z, XM_PERMUTE_1X, XM_PERMUTE_0W}; + + M00 = XMVectorSplatX(M.r[0]); + M11 = XMVectorSplatY(M.r[1]); + M22 = XMVectorSplatZ(M.r[2]); + + Q0 = XMVectorMultiply(SignPNNP.v, M00); + Q0 = XMVectorMultiplyAdd(SignNPNP.v, M11, Q0); + Q0 = XMVectorMultiplyAdd(SignNNPP.v, M22, Q0); + + Q1 = XMVectorAdd(Q0, g_XMOne.v); + + Rsq = XMVectorReciprocalSqrt(Q1); + VEqualsNaN = XMVectorIsNaN(Rsq); + Sqrt = XMVectorMultiply(Q1, Rsq); + Q1 = XMVectorSelect(Sqrt, Q1, VEqualsNaN); + + Q1 = 
XMVectorMultiply(Q1, g_XMOneHalf.v); + + SQ1 = XMVectorMultiply(Rsq, g_XMOneHalf.v); + + CQ0 = XMVectorPermute(Q0, Q0, Permute0X0X0Y0W.v); + CQ1 = XMVectorPermute(Q0, g_XMEpsilon.v, Permute0Y0Z0Z1W.v); + C = XMVectorGreaterOrEqual(CQ0, CQ1); + + CX = XMVectorSplatX(C); + CY = XMVectorSplatY(C); + CZ = XMVectorSplatZ(C); + CW = XMVectorSplatW(C); + + PermuteSplat = XMVectorSelect(SplatZ.v, SplatY.v, CZ); + SignB = XMVectorSelect(SignNPPP.v, SignPPNP.v, CZ); + PermuteControl = XMVectorSelect(Permute2.v, Permute1.v, CZ); + + PermuteSplat = XMVectorSelect(PermuteSplat, SplatZ.v, CX); + SignB = XMVectorSelect(SignB, SignNPPP.v, CX); + PermuteControl = XMVectorSelect(PermuteControl, Permute2.v, CX); + + PermuteSplatT = XMVectorSelect(PermuteSplat,SplatX.v, CY); + SignBT = XMVectorSelect(SignB, SignPNPP.v, CY); + PermuteControlT = XMVectorSelect(PermuteControl,Permute0.v, CY); + + PermuteSplat = XMVectorSelect(PermuteSplat, PermuteSplatT, CX); + SignB = XMVectorSelect(SignB, SignBT, CX); + PermuteControl = XMVectorSelect(PermuteControl, PermuteControlT, CX); + + PermuteSplat = XMVectorSelect(PermuteSplat,SplatW.v, CW); + SignB = XMVectorSelect(SignB, g_XMNegativeOne.v, CW); + PermuteControl = XMVectorSelect(PermuteControl,Permute3.v, CW); + + Scale = XMVectorPermute(SQ1, SQ1, PermuteSplat); + + P = XMVectorPermute(M.r[1], M.r[2],PermuteC.v); // {M10, M12, M20, M21} + A = XMVectorPermute(M.r[0], P, PermuteA.v); // {M01, M12, M20, M03} + B = XMVectorPermute(M.r[0], P, PermuteB.v); // {M10, M21, M02, M03} + + Q2 = XMVectorMultiplyAdd(SignB, B, A); + Q2 = XMVectorMultiply(Q2, Scale); + + Result = XMVectorPermute(Q1, Q2, PermuteControl); + + return Result; + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Conversion operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMQuaternionToAxisAngle +( + XMVECTOR* pAxis, + FLOAT* pAngle, + FXMVECTOR Q +) +{ + XMASSERT(pAxis); + XMASSERT(pAngle); + + *pAxis = Q; + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + *pAngle = 2.0f * acosf(XMVectorGetW(Q)); +#else + *pAngle = 2.0f * XMScalarACos(XMVectorGetW(Q)); +#endif +} + +/**************************************************************************** + * + * Plane + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMPlaneEqual +( + FXMVECTOR P1, + FXMVECTOR P2 +) +{ + return XMVector4Equal(P1, P2); +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMPlaneNearEqual +( + FXMVECTOR P1, + FXMVECTOR P2, + FXMVECTOR Epsilon +) +{ + XMVECTOR NP1 = XMPlaneNormalize(P1); + XMVECTOR NP2 = XMPlaneNormalize(P2); + return XMVector4NearEqual(NP1, NP2, Epsilon); +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMPlaneNotEqual +( + FXMVECTOR P1, + FXMVECTOR P2 +) +{ + return XMVector4NotEqual(P1, P2); +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMPlaneIsNaN +( + FXMVECTOR 
P +) +{ + return XMVector4IsNaN(P); +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMPlaneIsInfinite +( + FXMVECTOR P +) +{ + return XMVector4IsInfinite(P); +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMPlaneDot +( + FXMVECTOR P, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + return XMVector4Dot(P, V); + +#elif defined(_XM_SSE_INTRINSICS_) + __m128 vTemp2 = V; + __m128 vTemp = _mm_mul_ps(P,vTemp2); + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp,_MM_SHUFFLE(1,0,0,0)); // Copy X to the Z position and Y to the W position + vTemp2 = _mm_add_ps(vTemp2,vTemp); // Add Z = X+Z; W = Y+W; + vTemp = _mm_shuffle_ps(vTemp,vTemp2,_MM_SHUFFLE(0,3,0,0)); // Copy W to the Z position + vTemp = _mm_add_ps(vTemp,vTemp2); // Add Z and W together + return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(2,2,2,2)); // Splat Z and return +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMPlaneDotCoord +( + FXMVECTOR P, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V3; + XMVECTOR Result; + + // Result = P[0] * V[0] + P[1] * V[1] + P[2] * V[2] + P[3] + V3 = XMVectorSelect(g_XMOne.v, V, g_XMSelect1110.v); + Result = XMVector4Dot(P, V3); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp2 = _mm_and_ps(V,g_XMMask3); + vTemp2 = _mm_or_ps(vTemp2,g_XMIdentityR3); + XMVECTOR vTemp = _mm_mul_ps(P,vTemp2); + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp,_MM_SHUFFLE(1,0,0,0)); // Copy X to the Z position and Y to the W position + vTemp2 = _mm_add_ps(vTemp2,vTemp); // Add Z = X+Z; W = Y+W; + vTemp = _mm_shuffle_ps(vTemp,vTemp2,_MM_SHUFFLE(0,3,0,0)); // Copy W to the Z position + vTemp = _mm_add_ps(vTemp,vTemp2); // Add Z and W together + return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(2,2,2,2)); // Splat Z and return +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMPlaneDotNormal +( + FXMVECTOR P, + FXMVECTOR V +) +{ + return XMVector3Dot(P, V); +} + +//------------------------------------------------------------------------------ +// XMPlaneNormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. 
+ +XMFINLINE XMVECTOR XMPlaneNormalizeEst +( + FXMVECTOR P +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector3ReciprocalLength(P); + Result = XMVectorMultiply(P, Result); + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(P,P); + // x=Dot.y, y=Dot.z + XMVECTOR vTemp = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(2,1,2,1)); + // Result.x = x+y + vDot = _mm_add_ss(vDot,vTemp); + // x=Dot.z + vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1)); + // Result.x = (x+y)+z + vDot = _mm_add_ss(vDot,vTemp); + // Splat x + vDot = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(0,0,0,0)); + // Get the reciprocal + vDot = _mm_rsqrt_ps(vDot); + // Get the reciprocal + vDot = _mm_mul_ps(vDot,P); + return vDot; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMPlaneNormalize +( + FXMVECTOR P +) +{ +#if defined(_XM_NO_INTRINSICS_) + FLOAT fLengthSq = sqrtf((P.vector4_f32[0]*P.vector4_f32[0])+(P.vector4_f32[1]*P.vector4_f32[1])+(P.vector4_f32[2]*P.vector4_f32[2])); + // Prevent divide by zero + if (fLengthSq) { + fLengthSq = 1.0f/fLengthSq; + } + { + XMVECTOR vResult = { + P.vector4_f32[0]*fLengthSq, + P.vector4_f32[1]*fLengthSq, + P.vector4_f32[2]*fLengthSq, + P.vector4_f32[3]*fLengthSq + }; + return vResult; + } +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z only + XMVECTOR vLengthSq = _mm_mul_ps(P,P); + XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(2,1,2,1)); + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1)); + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(P,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vLengthSq); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMPlaneIntersectLine +( + FXMVECTOR P, + FXMVECTOR LinePoint1, + FXMVECTOR LinePoint2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V1; + XMVECTOR V2; + XMVECTOR D; + XMVECTOR ReciprocalD; + XMVECTOR VT; + XMVECTOR Point; + XMVECTOR Zero; + XMVECTOR Control; + XMVECTOR Result; + + V1 = XMVector3Dot(P, LinePoint1); + V2 = XMVector3Dot(P, LinePoint2); + D = XMVectorSubtract(V1, V2); + + ReciprocalD = XMVectorReciprocal(D); + VT = XMPlaneDotCoord(P, LinePoint1); + VT = XMVectorMultiply(VT, ReciprocalD); + + Point = XMVectorSubtract(LinePoint2, LinePoint1); + Point = XMVectorMultiplyAdd(Point, VT, LinePoint1); + + Zero = XMVectorZero(); + Control = XMVectorNearEqual(D, Zero, g_XMEpsilon.v); + + Result = XMVectorSelect(Point, g_XMQNaN.v, Control); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR V1; + XMVECTOR V2; + XMVECTOR D; + XMVECTOR VT; + XMVECTOR Point; + XMVECTOR Zero; + XMVECTOR Control; + XMVECTOR Result; + + V1 = XMVector3Dot(P, LinePoint1); + V2 = XMVector3Dot(P, LinePoint2); + D = _mm_sub_ps(V1, V2); + + VT = XMPlaneDotCoord(P, LinePoint1); + VT = _mm_div_ps(VT, D); + + 
Point = _mm_sub_ps(LinePoint2, LinePoint1); + Point = _mm_mul_ps(Point,VT); + Point = _mm_add_ps(Point,LinePoint1); + Zero = XMVectorZero(); + Control = XMVectorNearEqual(D, Zero, g_XMEpsilon); + Result = XMVectorSelect(Point, g_XMQNaN, Control); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE VOID XMPlaneIntersectPlane +( + XMVECTOR* pLinePoint1, + XMVECTOR* pLinePoint2, + FXMVECTOR P1, + FXMVECTOR P2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V1; + XMVECTOR V2; + XMVECTOR V3; + XMVECTOR LengthSq; + XMVECTOR RcpLengthSq; + XMVECTOR Point; + XMVECTOR P1W; + XMVECTOR P2W; + XMVECTOR Control; + XMVECTOR LinePoint1; + XMVECTOR LinePoint2; + + XMASSERT(pLinePoint1); + XMASSERT(pLinePoint2); + + V1 = XMVector3Cross(P2, P1); + + LengthSq = XMVector3LengthSq(V1); + + V2 = XMVector3Cross(P2, V1); + + P1W = XMVectorSplatW(P1); + Point = XMVectorMultiply(V2, P1W); + + V3 = XMVector3Cross(V1, P1); + + P2W = XMVectorSplatW(P2); + Point = XMVectorMultiplyAdd(V3, P2W, Point); + + RcpLengthSq = XMVectorReciprocal(LengthSq); + LinePoint1 = XMVectorMultiply(Point, RcpLengthSq); + + LinePoint2 = XMVectorAdd(LinePoint1, V1); + + Control = XMVectorLessOrEqual(LengthSq, g_XMEpsilon.v); + *pLinePoint1 = XMVectorSelect(LinePoint1,g_XMQNaN.v, Control); + *pLinePoint2 = XMVectorSelect(LinePoint2,g_XMQNaN.v, Control); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pLinePoint1); + XMASSERT(pLinePoint2); + XMVECTOR V1; + XMVECTOR V2; + XMVECTOR V3; + XMVECTOR LengthSq; + XMVECTOR Point; + XMVECTOR P1W; + XMVECTOR P2W; + XMVECTOR Control; + XMVECTOR LinePoint1; + XMVECTOR LinePoint2; + + V1 = XMVector3Cross(P2, P1); + + LengthSq = XMVector3LengthSq(V1); + + V2 = XMVector3Cross(P2, V1); + + P1W = _mm_shuffle_ps(P1,P1,_MM_SHUFFLE(3,3,3,3)); + Point = _mm_mul_ps(V2, P1W); + + V3 = XMVector3Cross(V1, P1); + + P2W = _mm_shuffle_ps(P2,P2,_MM_SHUFFLE(3,3,3,3)); + V3 = _mm_mul_ps(V3,P2W); + Point = _mm_add_ps(Point,V3); + LinePoint1 = _mm_div_ps(Point,LengthSq); + + LinePoint2 = _mm_add_ps(LinePoint1, V1); + + Control = XMVectorLessOrEqual(LengthSq, g_XMEpsilon); + *pLinePoint1 = XMVectorSelect(LinePoint1,g_XMQNaN, Control); + *pLinePoint2 = XMVectorSelect(LinePoint2,g_XMQNaN, Control); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMPlaneTransform +( + FXMVECTOR P, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR X; + XMVECTOR Y; + XMVECTOR Z; + XMVECTOR W; + XMVECTOR Result; + + W = XMVectorSplatW(P); + Z = XMVectorSplatZ(P); + Y = XMVectorSplatY(P); + X = XMVectorSplatX(P); + + Result = XMVectorMultiply(W, M.r[3]); + Result = XMVectorMultiplyAdd(Z, M.r[2], Result); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR X = _mm_shuffle_ps(P,P,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR Y = _mm_shuffle_ps(P,P,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR Z = _mm_shuffle_ps(P,P,_MM_SHUFFLE(2,2,2,2)); + XMVECTOR W = _mm_shuffle_ps(P,P,_MM_SHUFFLE(3,3,3,3)); + X = _mm_mul_ps(X, M.r[0]); + Y = _mm_mul_ps(Y, M.r[1]); + Z = _mm_mul_ps(Z, M.r[2]); + W = _mm_mul_ps(W, M.r[3]); + X = _mm_add_ps(X,Z); + Y = _mm_add_ps(Y,W); + X = _mm_add_ps(X,Y); + return X; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + 
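Note: XMPlaneTransform above multiplies the plane coefficients (a, b, c, d) by the rows of M exactly like a row vector, so for a general point transform the caller is expected to pass the inverse transpose of that transform. A minimal, illustrative usage sketch under that assumption (names chosen for illustration only):

    // Planes transform with the inverse transpose of the matrix that transforms points.
    XMMATRIX World        = XMMatrixMultiply(XMMatrixRotationY(XM_PIDIV4),
                                             XMMatrixTranslation(0.0f, 3.0f, 0.0f));
    XMVECTOR Determinant;
    XMMATRIX InvTranspose = XMMatrixTranspose(XMMatrixInverse(&Determinant, World));
    XMVECTOR Plane        = XMPlaneNormalize(XMVectorSet(0.0f, 1.0f, 0.0f, -2.0f)); // y == 2
    XMVECTOR Moved        = XMPlaneTransform(Plane, InvTranspose);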
+//------------------------------------------------------------------------------ + +XMFINLINE XMFLOAT4* XMPlaneTransformStream +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + CONST XMFLOAT4* pInputStream, + size_t InputStride, + size_t PlaneCount, + CXMMATRIX M +) +{ + return XMVector4TransformStream(pOutputStream, + OutputStride, + pInputStream, + InputStride, + PlaneCount, + M); +} + +//------------------------------------------------------------------------------ +// Conversion operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMPlaneFromPointNormal +( + FXMVECTOR Point, + FXMVECTOR Normal +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR W; + XMVECTOR Result; + + W = XMVector3Dot(Point, Normal); + W = XMVectorNegate(W); + Result = XMVectorSelect(W, Normal, g_XMSelect1110.v); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR W; + XMVECTOR Result; + W = XMVector3Dot(Point,Normal); + W = _mm_mul_ps(W,g_XMNegativeOne); + Result = _mm_and_ps(Normal,g_XMMask3); + W = _mm_and_ps(W,g_XMMaskW); + Result = _mm_or_ps(Result,W); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMPlaneFromPoints +( + FXMVECTOR Point1, + FXMVECTOR Point2, + FXMVECTOR Point3 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N; + XMVECTOR D; + XMVECTOR V21; + XMVECTOR V31; + XMVECTOR Result; + + V21 = XMVectorSubtract(Point1, Point2); + V31 = XMVectorSubtract(Point1, Point3); + + N = XMVector3Cross(V21, V31); + N = XMVector3Normalize(N); + + D = XMPlaneDotNormal(N, Point1); + D = XMVectorNegate(D); + + Result = XMVectorSelect(D, N, g_XMSelect1110.v); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR N; + XMVECTOR D; + XMVECTOR V21; + XMVECTOR V31; + XMVECTOR Result; + + V21 = _mm_sub_ps(Point1, Point2); + V31 = _mm_sub_ps(Point1, Point3); + + N = XMVector3Cross(V21, V31); + N = XMVector3Normalize(N); + + D = XMPlaneDotNormal(N, Point1); + D = _mm_mul_ps(D,g_XMNegativeOne); + N = _mm_and_ps(N,g_XMMask3); + D = _mm_and_ps(D,g_XMMaskW); + Result = _mm_or_ps(D,N); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +/**************************************************************************** + * + * Color + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMColorEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVector4Equal(C1, C2); +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMColorNotEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVector4NotEqual(C1, C2); +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMColorGreater +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVector4Greater(C1, C2); +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMColorGreaterOrEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVector4GreaterOrEqual(C1, C2); 
+} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMColorLess +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVector4Less(C1, C2); +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMColorLessOrEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVector4LessOrEqual(C1, C2); +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMColorIsNaN +( + FXMVECTOR C +) +{ + return XMVector4IsNaN(C); +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMColorIsInfinite +( + FXMVECTOR C +) +{ + return XMVector4IsInfinite(C); +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMColorNegative +( + FXMVECTOR vColor +) +{ +#if defined(_XM_NO_INTRINSICS_) +// XMASSERT(XMVector4GreaterOrEqual(C, XMVectorReplicate(0.0f))); +// XMASSERT(XMVector4LessOrEqual(C, XMVectorReplicate(1.0f))); + XMVECTOR vResult = { + 1.0f - vColor.vector4_f32[0], + 1.0f - vColor.vector4_f32[1], + 1.0f - vColor.vector4_f32[2], + vColor.vector4_f32[3] + }; + return vResult; + +#elif defined(_XM_SSE_INTRINSICS_) + // Negate only x,y and z. + XMVECTOR vTemp = _mm_xor_ps(vColor,g_XMNegate3); + // Add 1,1,1,0 to -x,-y,-z,w + return _mm_add_ps(vTemp,g_XMOne3); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMColorModulate +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVectorMultiply(C1, C2); +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMColorAdjustSaturation +( + FXMVECTOR vColor, + FLOAT fSaturation +) +{ +#if defined(_XM_NO_INTRINSICS_) + CONST XMVECTOR gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f}; + + // Luminance = 0.2125f * C[0] + 0.7154f * C[1] + 0.0721f * C[2]; + // Result = (C - Luminance) * Saturation + Luminance; + + FLOAT fLuminance = (vColor.vector4_f32[0]*gvLuminance.vector4_f32[0])+(vColor.vector4_f32[1]*gvLuminance.vector4_f32[1])+(vColor.vector4_f32[2]*gvLuminance.vector4_f32[2]); + XMVECTOR vResult = { + ((vColor.vector4_f32[0] - fLuminance)*fSaturation)+fLuminance, + ((vColor.vector4_f32[1] - fLuminance)*fSaturation)+fLuminance, + ((vColor.vector4_f32[2] - fLuminance)*fSaturation)+fLuminance, + vColor.vector4_f32[3]}; + return vResult; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f}; +// Mul RGB by intensity constants + XMVECTOR vLuminance = _mm_mul_ps(vColor,gvLuminance); +// vResult.x = vLuminance.y, vResult.y = vLuminance.y, +// vResult.z = vLuminance.z, vResult.w = vLuminance.z + XMVECTOR vResult = vLuminance; + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(2,2,1,1)); +// vLuminance.x += vLuminance.y + vLuminance = _mm_add_ss(vLuminance,vResult); +// Splat vLuminance.z + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(2,2,2,2)); +// vLuminance.x += vLuminance.z (Dot product) + vLuminance = _mm_add_ss(vLuminance,vResult); +// Splat vLuminance + vLuminance = _mm_shuffle_ps(vLuminance,vLuminance,_MM_SHUFFLE(0,0,0,0)); +// Splat fSaturation + XMVECTOR vSaturation = 
_mm_set_ps1(fSaturation); +// vResult = ((vColor-vLuminance)*vSaturation)+vLuminance; + vResult = _mm_sub_ps(vColor,vLuminance); + vResult = _mm_mul_ps(vResult,vSaturation); + vResult = _mm_add_ps(vResult,vLuminance); +// Retain w from the source color + vLuminance = _mm_shuffle_ps(vResult,vColor,_MM_SHUFFLE(3,2,2,2)); // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w + vResult = _mm_shuffle_ps(vResult,vLuminance,_MM_SHUFFLE(3,0,1,0)); // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w + return vResult; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMColorAdjustContrast +( + FXMVECTOR vColor, + FLOAT fContrast +) +{ +#if defined(_XM_NO_INTRINSICS_) + // Result = (vColor - 0.5f) * fContrast + 0.5f; + XMVECTOR vResult = { + ((vColor.vector4_f32[0]-0.5f) * fContrast) + 0.5f, + ((vColor.vector4_f32[1]-0.5f) * fContrast) + 0.5f, + ((vColor.vector4_f32[2]-0.5f) * fContrast) + 0.5f, + vColor.vector4_f32[3] // Leave W untouched + }; + return vResult; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vScale = _mm_set_ps1(fContrast); // Splat the scale + XMVECTOR vResult = _mm_sub_ps(vColor,g_XMOneHalf); // Subtract 0.5f from the source (Saving source) + vResult = _mm_mul_ps(vResult,vScale); // Mul by scale + vResult = _mm_add_ps(vResult,g_XMOneHalf); // Add 0.5f +// Retain w from the source color + vScale = _mm_shuffle_ps(vResult,vColor,_MM_SHUFFLE(3,2,2,2)); // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w + vResult = _mm_shuffle_ps(vResult,vScale,_MM_SHUFFLE(3,0,1,0)); // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w + return vResult; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +/**************************************************************************** + * + * Miscellaneous + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMINLINE BOOL XMVerifyCPUSupport() +{ +#if defined(_XM_NO_INTRINSICS_) || !defined(_XM_SSE_INTRINSICS_) + return TRUE; +#else // _XM_SSE_INTRINSICS_ + // Note that on Windows 2000 or older, SSE2 detection is not supported so this will always fail + // Detecting SSE2 on older versions of Windows would require using cpuid directly + return ( IsProcessorFeaturePresent( PF_XMMI_INSTRUCTIONS_AVAILABLE ) && IsProcessorFeaturePresent( PF_XMMI64_INSTRUCTIONS_AVAILABLE ) ); +#endif +} + + +//------------------------------------------------------------------------------ + +#define XMASSERT_LINE_STRING_SIZE 16 + +XMINLINE VOID XMAssert +( + CONST CHAR* pExpression, + CONST CHAR* pFileName, + UINT LineNumber +) +{ + CHAR aLineString[XMASSERT_LINE_STRING_SIZE]; + CHAR* pLineString; + UINT Line; + + aLineString[XMASSERT_LINE_STRING_SIZE - 2] = '0'; + aLineString[XMASSERT_LINE_STRING_SIZE - 1] = '\0'; + for (Line = LineNumber, pLineString = aLineString + XMASSERT_LINE_STRING_SIZE - 2; + Line != 0 && pLineString >= aLineString; + Line /= 10, pLineString--) + { + *pLineString = (CHAR)('0' + (Line % 10)); + } + +#ifndef NO_OUTPUT_DEBUG_STRING + OutputDebugStringA("Assertion failed: "); + OutputDebugStringA(pExpression); + OutputDebugStringA(", file "); + OutputDebugStringA(pFileName); + OutputDebugStringA(", line "); + OutputDebugStringA(pLineString + 1); + OutputDebugStringA("\r\n"); +#else + DbgPrint("Assertion failed: %s, file %s, line %d\r\n", pExpression, 
pFileName, LineNumber); +#endif + + __debugbreak(); +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMFresnelTerm +( + FXMVECTOR CosIncidentAngle, + FXMVECTOR RefractionIndex +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR G; + XMVECTOR D, S; + XMVECTOR V0, V1, V2, V3; + XMVECTOR Result; + + // Result = 0.5f * (g - c)^2 / (g + c)^2 * ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) where + // c = CosIncidentAngle + // g = sqrt(c^2 + RefractionIndex^2 - 1) + + XMASSERT(!XMVector4IsInfinite(CosIncidentAngle)); + + G = XMVectorMultiplyAdd(RefractionIndex, RefractionIndex, g_XMNegativeOne.v); + G = XMVectorMultiplyAdd(CosIncidentAngle, CosIncidentAngle, G); + G = XMVectorAbs(G); + G = XMVectorSqrt(G); + + S = XMVectorAdd(G, CosIncidentAngle); + D = XMVectorSubtract(G, CosIncidentAngle); + + V0 = XMVectorMultiply(D, D); + V1 = XMVectorMultiply(S, S); + V1 = XMVectorReciprocal(V1); + V0 = XMVectorMultiply(g_XMOneHalf.v, V0); + V0 = XMVectorMultiply(V0, V1); + + V2 = XMVectorMultiplyAdd(CosIncidentAngle, S, g_XMNegativeOne.v); + V3 = XMVectorMultiplyAdd(CosIncidentAngle, D, g_XMOne.v); + V2 = XMVectorMultiply(V2, V2); + V3 = XMVectorMultiply(V3, V3); + V3 = XMVectorReciprocal(V3); + V2 = XMVectorMultiplyAdd(V2, V3, g_XMOne.v); + + Result = XMVectorMultiply(V0, V2); + + Result = XMVectorSaturate(Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Result = 0.5f * (g - c)^2 / (g + c)^2 * ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) where + // c = CosIncidentAngle + // g = sqrt(c^2 + RefractionIndex^2 - 1) + + XMASSERT(!XMVector4IsInfinite(CosIncidentAngle)); + + // G = sqrt(abs((RefractionIndex^2-1) + CosIncidentAngle^2)) + XMVECTOR G = _mm_mul_ps(RefractionIndex,RefractionIndex); + XMVECTOR vTemp = _mm_mul_ps(CosIncidentAngle,CosIncidentAngle); + G = _mm_sub_ps(G,g_XMOne); + vTemp = _mm_add_ps(vTemp,G); + // max((0-vTemp),vTemp) == abs(vTemp) + // The abs is needed to deal with refraction and cosine being zero + G = _mm_setzero_ps(); + G = _mm_sub_ps(G,vTemp); + G = _mm_max_ps(G,vTemp); + // Last operation, the sqrt() + G = _mm_sqrt_ps(G); + + // Calc G-C and G+C + XMVECTOR GAddC = _mm_add_ps(G,CosIncidentAngle); + XMVECTOR GSubC = _mm_sub_ps(G,CosIncidentAngle); + // Perform the term (0.5f *(g - c)^2) / (g + c)^2 + XMVECTOR vResult = _mm_mul_ps(GSubC,GSubC); + vTemp = _mm_mul_ps(GAddC,GAddC); + vResult = _mm_mul_ps(vResult,g_XMOneHalf); + vResult = _mm_div_ps(vResult,vTemp); + // Perform the term ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) + GAddC = _mm_mul_ps(GAddC,CosIncidentAngle); + GSubC = _mm_mul_ps(GSubC,CosIncidentAngle); + GAddC = _mm_sub_ps(GAddC,g_XMOne); + GSubC = _mm_add_ps(GSubC,g_XMOne); + GAddC = _mm_mul_ps(GAddC,GAddC); + GSubC = _mm_mul_ps(GSubC,GSubC); + GAddC = _mm_div_ps(GAddC,GSubC); + GAddC = _mm_add_ps(GAddC,g_XMOne); + // Multiply the two term parts + vResult = _mm_mul_ps(vResult,GAddC); + // Clamp to 0.0 - 1.0f + vResult = _mm_max_ps(vResult,g_XMZero); + vResult = _mm_min_ps(vResult,g_XMOne); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMScalarNearEqual +( + FLOAT S1, + FLOAT S2, + FLOAT Epsilon +) +{ + FLOAT Delta = S1 - S2; +#if defined(_XM_NO_INTRINSICS_) + UINT AbsDelta = *(const UINT*)&Delta & 0x7FFFFFFF; + return (*(FLOAT*)&AbsDelta <= Epsilon); +#elif defined(_XM_SSE_INTRINSICS_) + return (fabsf(Delta) <= Epsilon); +#else + return 
(__fabs(Delta) <= Epsilon); +#endif +} + +//------------------------------------------------------------------------------ +// Modulo the range of the given angle such that -XM_PI <= Angle < XM_PI +XMFINLINE FLOAT XMScalarModAngle +( + FLOAT Angle +) +{ + // Note: The modulo is performed with unsigned math only to work + // around a precision error on numbers that are close to PI + float fTemp; +#if defined(_XM_NO_INTRINSICS_) || !defined(_XM_VMX128_INTRINSICS_) + // Normalize the range from 0.0f to XM_2PI + Angle = Angle + XM_PI; + // Perform the modulo, unsigned + fTemp = fabsf(Angle); + fTemp = fTemp - (XM_2PI * (FLOAT)((INT)(fTemp/XM_2PI))); + // Restore the number to the range of -XM_PI to XM_PI-epsilon + fTemp = fTemp - XM_PI; + // If the modulo'd value was negative, restore negation + if (Angle<0.0f) { + fTemp = -fTemp; + } + return fTemp; +#else +#endif +} + +//------------------------------------------------------------------------------ + +XMINLINE FLOAT XMScalarSin +( + FLOAT Value +) +{ +#if defined(_XM_NO_INTRINSICS_) + + FLOAT ValueMod; + FLOAT ValueSq; + XMVECTOR V0123, V0246, V1357, V9111315, V17192123; + XMVECTOR V1, V7, V8; + XMVECTOR R0, R1, R2; + + ValueMod = XMScalarModAngle(Value); + + // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - V^15 / 15! + + // V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI) + + ValueSq = ValueMod * ValueMod; + + V0123 = XMVectorSet(1.0f, ValueMod, ValueSq, ValueSq * ValueMod); + V1 = XMVectorSplatY(V0123); + V0246 = XMVectorMultiply(V0123, V0123); + V1357 = XMVectorMultiply(V0246, V1); + V7 = XMVectorSplatW(V1357); + V8 = XMVectorMultiply(V7, V1); + V9111315 = XMVectorMultiply(V1357, V8); + V17192123 = XMVectorMultiply(V9111315, V8); + + R0 = XMVector4Dot(V1357, g_XMSinCoefficients0.v); + R1 = XMVector4Dot(V9111315, g_XMSinCoefficients1.v); + R2 = XMVector4Dot(V17192123, g_XMSinCoefficients2.v); + + return R0.vector4_f32[0] + R1.vector4_f32[0] + R2.vector4_f32[0]; + +#elif defined(_XM_SSE_INTRINSICS_) + return sinf( Value ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE FLOAT XMScalarCos +( + FLOAT Value +) +{ +#if defined(_XM_NO_INTRINSICS_) + + FLOAT ValueMod; + FLOAT ValueSq; + XMVECTOR V0123, V0246, V8101214, V16182022; + XMVECTOR V2, V6, V8; + XMVECTOR R0, R1, R2; + + ValueMod = XMScalarModAngle(Value); + + // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + + // V^12 / 12! - V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! 
(for -PI <= V < PI) + + ValueSq = ValueMod * ValueMod; + + V0123 = XMVectorSet(1.0f, ValueMod, ValueSq, ValueSq * ValueMod); + V0246 = XMVectorMultiply(V0123, V0123); + + V2 = XMVectorSplatZ(V0123); + V6 = XMVectorSplatW(V0246); + V8 = XMVectorMultiply(V6, V2); + + V8101214 = XMVectorMultiply(V0246, V8); + V16182022 = XMVectorMultiply(V8101214, V8); + + R0 = XMVector4Dot(V0246, g_XMCosCoefficients0.v); + R1 = XMVector4Dot(V8101214, g_XMCosCoefficients1.v); + R2 = XMVector4Dot(V16182022, g_XMCosCoefficients2.v); + + return R0.vector4_f32[0] + R1.vector4_f32[0] + R2.vector4_f32[0]; + +#elif defined(_XM_SSE_INTRINSICS_) + return cosf(Value); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE VOID XMScalarSinCos +( + FLOAT* pSin, + FLOAT* pCos, + FLOAT Value +) +{ +#if defined(_XM_NO_INTRINSICS_) + + FLOAT ValueMod; + FLOAT ValueSq; + XMVECTOR V0123, V0246, V1357, V8101214, V9111315, V16182022, V17192123; + XMVECTOR V1, V2, V6, V8; + XMVECTOR S0, S1, S2, C0, C1, C2; + + XMASSERT(pSin); + XMASSERT(pCos); + + ValueMod = XMScalarModAngle(Value); + + // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - V^15 / 15! + + // V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI) + // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + + // V^12 / 12! - V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI) + + ValueSq = ValueMod * ValueMod; + + V0123 = XMVectorSet(1.0f, ValueMod, ValueSq, ValueSq * ValueMod); + + V1 = XMVectorSplatY(V0123); + V2 = XMVectorSplatZ(V0123); + + V0246 = XMVectorMultiply(V0123, V0123); + V1357 = XMVectorMultiply(V0246, V1); + + V6 = XMVectorSplatW(V0246); + V8 = XMVectorMultiply(V6, V2); + + V8101214 = XMVectorMultiply(V0246, V8); + V9111315 = XMVectorMultiply(V1357, V8); + V16182022 = XMVectorMultiply(V8101214, V8); + V17192123 = XMVectorMultiply(V9111315, V8); + + C0 = XMVector4Dot(V0246, g_XMCosCoefficients0.v); + S0 = XMVector4Dot(V1357, g_XMSinCoefficients0.v); + C1 = XMVector4Dot(V8101214, g_XMCosCoefficients1.v); + S1 = XMVector4Dot(V9111315, g_XMSinCoefficients1.v); + C2 = XMVector4Dot(V16182022, g_XMCosCoefficients2.v); + S2 = XMVector4Dot(V17192123, g_XMSinCoefficients2.v); + + *pCos = C0.vector4_f32[0] + C1.vector4_f32[0] + C2.vector4_f32[0]; + *pSin = S0.vector4_f32[0] + S1.vector4_f32[0] + S2.vector4_f32[0]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSin); + XMASSERT(pCos); + + *pSin = sinf(Value); + *pCos = cosf(Value); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE FLOAT XMScalarASin +( + FLOAT Value +) +{ +#if defined(_XM_NO_INTRINSICS_) + + FLOAT AbsValue, Value2, Value3, D; + XMVECTOR AbsV, R0, R1, Result; + XMVECTOR V3; + + *(UINT*)&AbsValue = *(const UINT*)&Value & 0x7FFFFFFF; + + Value2 = Value * AbsValue; + Value3 = Value * Value2; + D = (Value - Value2) / sqrtf(1.00000011921f - AbsValue); + + AbsV = XMVectorReplicate(AbsValue); + + V3.vector4_f32[0] = Value3; + V3.vector4_f32[1] = 1.0f; + V3.vector4_f32[2] = Value3; + V3.vector4_f32[3] = 1.0f; + + R1 = XMVectorSet(D, D, Value, Value); + R1 = XMVectorMultiply(R1, V3); + + R0 = XMVectorMultiplyAdd(AbsV, g_XMASinCoefficients0.v, g_XMASinCoefficients1.v); + R0 = XMVectorMultiplyAdd(AbsV, R0, g_XMASinCoefficients2.v); + + Result = XMVector4Dot(R0, R1); + + return 
Result.vector4_f32[0]; + +#elif defined(_XM_SSE_INTRINSICS_) + return asinf(Value); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE FLOAT XMScalarACos +( + FLOAT Value +) +{ +#if defined(_XM_NO_INTRINSICS_) + + return XM_PIDIV2 - XMScalarASin(Value); + +#elif defined(_XM_SSE_INTRINSICS_) + return acosf(Value); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE FLOAT XMScalarSinEst +( + FLOAT Value +) +{ +#if defined(_XM_NO_INTRINSICS_) + + FLOAT ValueSq; + XMVECTOR V; + XMVECTOR Y; + XMVECTOR Result; + + XMASSERT(Value >= -XM_PI); + XMASSERT(Value < XM_PI); + + // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! (for -PI <= V < PI) + + ValueSq = Value * Value; + + V = XMVectorSet(1.0f, Value, ValueSq, ValueSq * Value); + Y = XMVectorSplatY(V); + V = XMVectorMultiply(V, V); + V = XMVectorMultiply(V, Y); + + Result = XMVector4Dot(V, g_XMSinEstCoefficients.v); + + return Result.vector4_f32[0]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(Value >= -XM_PI); + XMASSERT(Value < XM_PI); + float ValueSq = Value*Value; + XMVECTOR vValue = _mm_set_ps1(Value); + XMVECTOR vTemp = _mm_set_ps(ValueSq * Value,ValueSq,Value,1.0f); + vTemp = _mm_mul_ps(vTemp,vTemp); + vTemp = _mm_mul_ps(vTemp,vValue); + // vTemp = Value,Value^3,Value^5,Value^7 + vTemp = _mm_mul_ps(vTemp,g_XMSinEstCoefficients); + vValue = _mm_shuffle_ps(vValue,vTemp,_MM_SHUFFLE(1,0,0,0)); // Copy X to the Z position and Y to the W position + vValue = _mm_add_ps(vValue,vTemp); // Add Z = X+Z; W = Y+W; + vTemp = _mm_shuffle_ps(vTemp,vValue,_MM_SHUFFLE(0,3,0,0)); // Copy W to the Z position + vTemp = _mm_add_ps(vTemp,vValue); // Add Z and W together + vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(2,2,2,2)); // Splat Z and return +#if defined(_MSC_VER) && (_MSC_VER>=1500) + return _mm_cvtss_f32(vTemp); +#else + return vTemp.m128_f32[0]; +#endif +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE FLOAT XMScalarCosEst +( + FLOAT Value +) +{ +#if defined(_XM_NO_INTRINSICS_) + FLOAT ValueSq; + XMVECTOR V; + XMVECTOR Result; + XMASSERT(Value >= -XM_PI); + XMASSERT(Value < XM_PI); + // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! 
(for -PI <= V < PI) + ValueSq = Value * Value; + V = XMVectorSet(1.0f, Value, ValueSq, ValueSq * Value); + V = XMVectorMultiply(V, V); + Result = XMVector4Dot(V, g_XMCosEstCoefficients.v); + return Result.vector4_f32[0]; +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(Value >= -XM_PI); + XMASSERT(Value < XM_PI); + float ValueSq = Value*Value; + XMVECTOR vValue = _mm_setzero_ps(); + XMVECTOR vTemp = _mm_set_ps(ValueSq * Value,ValueSq,Value,1.0f); + vTemp = _mm_mul_ps(vTemp,vTemp); + // vTemp = 1.0f,Value^2,Value^4,Value^6 + vTemp = _mm_mul_ps(vTemp,g_XMCosEstCoefficients); + vValue = _mm_shuffle_ps(vValue,vTemp,_MM_SHUFFLE(1,0,0,0)); // Copy X to the Z position and Y to the W position + vValue = _mm_add_ps(vValue,vTemp); // Add Z = X+Z; W = Y+W; + vTemp = _mm_shuffle_ps(vTemp,vValue,_MM_SHUFFLE(0,3,0,0)); // Copy W to the Z position + vTemp = _mm_add_ps(vTemp,vValue); // Add Z and W together + vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(2,2,2,2)); // Splat Z and return +#if defined(_MSC_VER) && (_MSC_VER>=1500) + return _mm_cvtss_f32(vTemp); +#else + return vTemp.m128_f32[0]; +#endif +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMScalarSinCosEst +( + FLOAT* pSin, + FLOAT* pCos, + FLOAT Value +) +{ +#if defined(_XM_NO_INTRINSICS_) + + FLOAT ValueSq; + XMVECTOR V, Sin, Cos; + XMVECTOR Y; + + XMASSERT(pSin); + XMASSERT(pCos); + XMASSERT(Value >= -XM_PI); + XMASSERT(Value < XM_PI); + + // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! (for -PI <= V < PI) + // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! (for -PI <= V < PI) + + ValueSq = Value * Value; + V = XMVectorSet(1.0f, Value, ValueSq, Value * ValueSq); + Y = XMVectorSplatY(V); + Cos = XMVectorMultiply(V, V); + Sin = XMVectorMultiply(Cos, Y); + + Cos = XMVector4Dot(Cos, g_XMCosEstCoefficients.v); + Sin = XMVector4Dot(Sin, g_XMSinEstCoefficients.v); + + *pCos = Cos.vector4_f32[0]; + *pSin = Sin.vector4_f32[0]; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSin); + XMASSERT(pCos); + XMASSERT(Value >= -XM_PI); + XMASSERT(Value < XM_PI); + float ValueSq = Value * Value; + XMVECTOR Cos = _mm_set_ps(Value * ValueSq,ValueSq,Value,1.0f); + XMVECTOR Sin = _mm_set_ps1(Value); + Cos = _mm_mul_ps(Cos,Cos); + Sin = _mm_mul_ps(Sin,Cos); + // Cos = 1.0f,Value^2,Value^4,Value^6 + Cos = XMVector4Dot(Cos,g_XMCosEstCoefficients); + _mm_store_ss(pCos,Cos); + // Sin = Value,Value^3,Value^5,Value^7 + Sin = XMVector4Dot(Sin, g_XMSinEstCoefficients); + _mm_store_ss(pSin,Sin); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE FLOAT XMScalarASinEst +( + FLOAT Value +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR VR, CR, CS; + XMVECTOR Result; + FLOAT AbsV, V2, D; + CONST FLOAT OnePlusEps = 1.00000011921f; + + *(UINT*)&AbsV = *(const UINT*)&Value & 0x7FFFFFFF; + V2 = Value * AbsV; + D = OnePlusEps - AbsV; + + CS = XMVectorSet(Value, 1.0f, 1.0f, V2); + VR = XMVectorSet(sqrtf(D), Value, V2, D * AbsV); + CR = XMVectorMultiply(CS, g_XMASinEstCoefficients.v); + + Result = XMVector4Dot(VR, CR); + + return Result.vector4_f32[0]; + +#elif defined(_XM_SSE_INTRINSICS_) + CONST FLOAT OnePlusEps = 1.00000011921f; + FLOAT AbsV = fabsf(Value); + FLOAT V2 = Value * AbsV; // Square with sign retained + FLOAT D = OnePlusEps - AbsV; + + XMVECTOR Result = _mm_set_ps(V2,1.0f,1.0f,Value); + XMVECTOR VR = _mm_set_ps(D * 
AbsV,V2,Value,sqrtf(D)); + Result = _mm_mul_ps(Result, g_XMASinEstCoefficients); + Result = XMVector4Dot(VR,Result); +#if defined(_MSC_VER) && (_MSC_VER>=1500) + return _mm_cvtss_f32(Result); +#else + return Result.m128_f32[0]; +#endif +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE FLOAT XMScalarACosEst +( + FLOAT Value +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR VR, CR, CS; + XMVECTOR Result; + FLOAT AbsV, V2, D; + CONST FLOAT OnePlusEps = 1.00000011921f; + + // return XM_PIDIV2 - XMScalarASin(Value); + + *(UINT*)&AbsV = *(const UINT*)&Value & 0x7FFFFFFF; + V2 = Value * AbsV; + D = OnePlusEps - AbsV; + + CS = XMVectorSet(Value, 1.0f, 1.0f, V2); + VR = XMVectorSet(sqrtf(D), Value, V2, D * AbsV); + CR = XMVectorMultiply(CS, g_XMASinEstCoefficients.v); + + Result = XMVector4Dot(VR, CR); + + return XM_PIDIV2 - Result.vector4_f32[0]; + +#elif defined(_XM_SSE_INTRINSICS_) + CONST FLOAT OnePlusEps = 1.00000011921f; + FLOAT AbsV = fabsf(Value); + FLOAT V2 = Value * AbsV; // Value^2 retaining sign + FLOAT D = OnePlusEps - AbsV; + XMVECTOR Result = _mm_set_ps(V2,1.0f,1.0f,Value); + XMVECTOR VR = _mm_set_ps(D * AbsV,V2,Value,sqrtf(D)); + Result = _mm_mul_ps(Result,g_XMASinEstCoefficients); + Result = XMVector4Dot(VR,Result); +#if defined(_MSC_VER) && (_MSC_VER>=1500) + return XM_PIDIV2 - _mm_cvtss_f32(Result); +#else + return XM_PIDIV2 - Result.m128_f32[0]; +#endif +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +#endif // __XNAMATHMISC_INL__ + diff --git a/thirdparty/directxtex/XNAMath/xnamathvector.inl b/thirdparty/directxtex/XNAMath/xnamathvector.inl new file mode 100644 index 0000000..37b7d13 --- /dev/null +++ b/thirdparty/directxtex/XNAMath/xnamathvector.inl @@ -0,0 +1,13673 @@ +/************************************************************************ +* * +* xnamathvector.inl -- SIMD C++ Math library for Windows and Xbox 360 * +* Vector functions * +* * +* Copyright (c) Microsoft Corp. All rights reserved. 
* +* * +************************************************************************/ + +#if defined(_MSC_VER) && (_MSC_VER > 1000) +#pragma once +#endif + +#ifndef __XNAMATHVECTOR_INL__ +#define __XNAMATHVECTOR_INL__ + +#if defined(_XM_NO_INTRINSICS_) +#define XMISNAN(x) ((*(UINT*)&(x) & 0x7F800000) == 0x7F800000 && (*(UINT*)&(x) & 0x7FFFFF) != 0) +#define XMISINF(x) ((*(UINT*)&(x) & 0x7FFFFFFF) == 0x7F800000) +#endif + +/**************************************************************************** + * + * General Vector + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Assignment operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Return a vector with all elements equaling zero +XMFINLINE XMVECTOR XMVectorZero() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult = {0.0f,0.0f,0.0f,0.0f}; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_setzero_ps(); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Initialize a vector with four floating point values +XMFINLINE XMVECTOR XMVectorSet +( + FLOAT x, + FLOAT y, + FLOAT z, + FLOAT w +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = {x,y,z,w}; + return vResult.v; +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_set_ps( w, z, y, x ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Initialize a vector with four integer values +XMFINLINE XMVECTOR XMVectorSetInt +( + UINT x, + UINT y, + UINT z, + UINT w +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult = {x,y,z,w}; + return vResult.v; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_set_epi32( w, z, y, x ); + return reinterpret_cast<__m128 *>(&V)[0]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Initialize a vector with a replicated floating point value +XMFINLINE XMVECTOR XMVectorReplicate +( + FLOAT Value +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) + XMVECTORF32 vResult = {Value,Value,Value,Value}; + return vResult.v; +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_set_ps1( Value ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Initialize a vector with a replicated floating point value passed by pointer +XMFINLINE XMVECTOR XMVectorReplicatePtr +( + CONST FLOAT *pValue +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) + FLOAT Value = pValue[0]; + XMVECTORF32 vResult = {Value,Value,Value,Value}; + return vResult.v; +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_load_ps1( pValue ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Initialize a vector with a replicated integer value +XMFINLINE XMVECTOR XMVectorReplicateInt +( + UINT Value +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) + XMVECTORU32 vResult = {Value,Value,Value,Value}; + return vResult.v; +#elif 
defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_set1_epi32( Value ); + return reinterpret_cast(&vTemp)[0]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Initialize a vector with a replicated integer value passed by pointer +XMFINLINE XMVECTOR XMVectorReplicateIntPtr +( + CONST UINT *pValue +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) + UINT Value = pValue[0]; + XMVECTORU32 vResult = {Value,Value,Value,Value}; + return vResult.v; +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_load_ps1(reinterpret_cast(pValue)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Initialize a vector with all bits set (true mask) +XMFINLINE XMVECTOR XMVectorTrueInt() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult = {0xFFFFFFFFU,0xFFFFFFFFU,0xFFFFFFFFU,0xFFFFFFFFU}; + return vResult.v; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_set1_epi32(-1); + return reinterpret_cast<__m128 *>(&V)[0]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Initialize a vector with all bits clear (false mask) +XMFINLINE XMVECTOR XMVectorFalseInt() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult = {0.0f,0.0f,0.0f,0.0f}; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_setzero_ps(); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Replicate the x component of the vector +XMFINLINE XMVECTOR XMVectorSplatX +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_f32[0] = + vResult.vector4_f32[1] = + vResult.vector4_f32[2] = + vResult.vector4_f32[3] = V.vector4_f32[0]; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_shuffle_ps( V, V, _MM_SHUFFLE(0, 0, 0, 0) ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Replicate the y component of the vector +XMFINLINE XMVECTOR XMVectorSplatY +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_f32[0] = + vResult.vector4_f32[1] = + vResult.vector4_f32[2] = + vResult.vector4_f32[3] = V.vector4_f32[1]; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_shuffle_ps( V, V, _MM_SHUFFLE(1, 1, 1, 1) ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Replicate the z component of the vector +XMFINLINE XMVECTOR XMVectorSplatZ +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_f32[0] = + vResult.vector4_f32[1] = + vResult.vector4_f32[2] = + vResult.vector4_f32[3] = V.vector4_f32[2]; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_shuffle_ps( V, V, _MM_SHUFFLE(2, 2, 2, 2) ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Replicate the w component of the vector +XMFINLINE XMVECTOR XMVectorSplatW +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_f32[0] = + 
vResult.vector4_f32[1] = + vResult.vector4_f32[2] = + vResult.vector4_f32[3] = V.vector4_f32[3]; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_shuffle_ps( V, V, _MM_SHUFFLE(3, 3, 3, 3) ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Return a vector of 1.0f,1.0f,1.0f,1.0f +XMFINLINE XMVECTOR XMVectorSplatOne() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_f32[0] = + vResult.vector4_f32[1] = + vResult.vector4_f32[2] = + vResult.vector4_f32[3] = 1.0f; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMOne; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Return a vector of INF,INF,INF,INF +XMFINLINE XMVECTOR XMVectorSplatInfinity() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_u32[0] = + vResult.vector4_u32[1] = + vResult.vector4_u32[2] = + vResult.vector4_u32[3] = 0x7F800000; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMInfinity; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Return a vector of Q_NAN,Q_NAN,Q_NAN,Q_NAN +XMFINLINE XMVECTOR XMVectorSplatQNaN() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_u32[0] = + vResult.vector4_u32[1] = + vResult.vector4_u32[2] = + vResult.vector4_u32[3] = 0x7FC00000; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMQNaN; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Return a vector of 1.192092896e-7f,1.192092896e-7f,1.192092896e-7f,1.192092896e-7f +XMFINLINE XMVECTOR XMVectorSplatEpsilon() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_u32[0] = + vResult.vector4_u32[1] = + vResult.vector4_u32[2] = + vResult.vector4_u32[3] = 0x34000000; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMEpsilon; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Return a vector of -0.0f (0x80000000),-0.0f,-0.0f,-0.0f +XMFINLINE XMVECTOR XMVectorSplatSignMask() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_u32[0] = + vResult.vector4_u32[1] = + vResult.vector4_u32[2] = + vResult.vector4_u32[3] = 0x80000000U; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_set1_epi32( 0x80000000 ); + return reinterpret_cast<__m128*>(&V)[0]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Return a floating point value via an index. This is not a recommended +// function to use due to performance loss. +XMFINLINE FLOAT XMVectorGetByIndex(FXMVECTOR V,UINT i) +{ + XMASSERT( i <= 3 ); +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[i]; +#elif defined(_XM_SSE_INTRINSICS_) + return V.m128_f32[i]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Return the X component in an FPU register. 
+// This causes Load/Hit/Store on VMX targets +XMFINLINE FLOAT XMVectorGetX(FXMVECTOR V) +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[0]; +#elif defined(_XM_SSE_INTRINSICS_) +#if defined(_MSC_VER) && (_MSC_VER>=1500) + return _mm_cvtss_f32(V); +#else + return V.m128_f32[0]; +#endif +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Return the Y component in an FPU register. +// This causes Load/Hit/Store on VMX targets +XMFINLINE FLOAT XMVectorGetY(FXMVECTOR V) +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[1]; +#elif defined(_XM_SSE_INTRINSICS_) +#if defined(_MSC_VER) && (_MSC_VER>=1500) + XMVECTOR vTemp = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1)); + return _mm_cvtss_f32(vTemp); +#else + return V.m128_f32[1]; +#endif +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Return the Z component in an FPU register. +// This causes Load/Hit/Store on VMX targets +XMFINLINE FLOAT XMVectorGetZ(FXMVECTOR V) +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[2]; +#elif defined(_XM_SSE_INTRINSICS_) +#if defined(_MSC_VER) && (_MSC_VER>=1500) + XMVECTOR vTemp = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2)); + return _mm_cvtss_f32(vTemp); +#else + return V.m128_f32[2]; +#endif +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Return the W component in an FPU register. +// This causes Load/Hit/Store on VMX targets +XMFINLINE FLOAT XMVectorGetW(FXMVECTOR V) +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[3]; +#elif defined(_XM_SSE_INTRINSICS_) +#if defined(_MSC_VER) && (_MSC_VER>=1500) + XMVECTOR vTemp = _mm_shuffle_ps(V,V,_MM_SHUFFLE(3,3,3,3)); + return _mm_cvtss_f32(vTemp); +#else + return V.m128_f32[3]; +#endif +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Store a component indexed by i into a 32 bit float location in memory. +// This causes Load/Hit/Store on VMX targets +XMFINLINE VOID XMVectorGetByIndexPtr(FLOAT *f,FXMVECTOR V,UINT i) +{ + XMASSERT( f != 0 ); + XMASSERT( i < 4 ); +#if defined(_XM_NO_INTRINSICS_) + *f = V.vector4_f32[i]; +#elif defined(_XM_SSE_INTRINSICS_) + *f = V.m128_f32[i]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Store the X component into a 32 bit float location in memory. +XMFINLINE VOID XMVectorGetXPtr(FLOAT *x,FXMVECTOR V) +{ + XMASSERT( x != 0 ); +#if defined(_XM_NO_INTRINSICS_) + *x = V.vector4_f32[0]; +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ss(x,V); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Store the Y component into a 32 bit float location in memory. +XMFINLINE VOID XMVectorGetYPtr(FLOAT *y,FXMVECTOR V) +{ + XMASSERT( y != 0 ); +#if defined(_XM_NO_INTRINSICS_) + *y = V.vector4_f32[1]; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1)); + _mm_store_ss(y,vResult); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Store the Z component into a 32 bit float location in memory. 
+XMFINLINE VOID XMVectorGetZPtr(FLOAT *z,FXMVECTOR V)
+{
+    XMASSERT( z != 0 );
+#if defined(_XM_NO_INTRINSICS_)
+    *z = V.vector4_f32[2];
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
+    _mm_store_ss(z,vResult);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Store the W component into a 32 bit float location in memory.
+XMFINLINE VOID XMVectorGetWPtr(FLOAT *w,FXMVECTOR V)
+{
+    XMASSERT( w != 0 );
+#if defined(_XM_NO_INTRINSICS_)
+    *w = V.vector4_f32[3];
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(3,3,3,3));
+    _mm_store_ss(w,vResult);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Return an integer value via an index. This is not a recommended
+// function to use due to performance loss.
+XMFINLINE UINT XMVectorGetIntByIndex(FXMVECTOR V, UINT i)
+{
+    XMASSERT( i < 4 );
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[i];
+#elif defined(_XM_SSE_INTRINSICS_)
+#if defined(_MSC_VER) && (_MSC_VER<1400)
+    XMVECTORU32 tmp;
+    tmp.v = V;
+    return tmp.u[i];
+#else
+    return V.m128_u32[i];
+#endif
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Return the X component in an integer register.
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE UINT XMVectorGetIntX(FXMVECTOR V)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[0];
+#elif defined(_XM_SSE_INTRINSICS_)
+    return static_cast<UINT>(_mm_cvtsi128_si32(reinterpret_cast<const __m128i *>(&V)[0]));
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Return the Y component in an integer register.
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE UINT XMVectorGetIntY(FXMVECTOR V)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[1];
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vResulti = _mm_shuffle_epi32(reinterpret_cast<const __m128i *>(&V)[0],_MM_SHUFFLE(1,1,1,1));
+    return static_cast<UINT>(_mm_cvtsi128_si32(vResulti));
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Return the Z component in an integer register.
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE UINT XMVectorGetIntZ(FXMVECTOR V)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[2];
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vResulti = _mm_shuffle_epi32(reinterpret_cast<const __m128i *>(&V)[0],_MM_SHUFFLE(2,2,2,2));
+    return static_cast<UINT>(_mm_cvtsi128_si32(vResulti));
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Return the W component in an integer register.
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE UINT XMVectorGetIntW(FXMVECTOR V)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[3];
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vResulti = _mm_shuffle_epi32(reinterpret_cast<const __m128i *>(&V)[0],_MM_SHUFFLE(3,3,3,3));
+    return static_cast<UINT>(_mm_cvtsi128_si32(vResulti));
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Store a component indexed by i into a 32 bit integer location in memory.
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE VOID XMVectorGetIntByIndexPtr(UINT *x,FXMVECTOR V,UINT i)
+{
+    XMASSERT( x != 0 );
+    XMASSERT( i < 4 );
+#if defined(_XM_NO_INTRINSICS_)
+    *x = V.vector4_u32[i];
+#elif defined(_XM_SSE_INTRINSICS_)
+#if defined(_MSC_VER) && (_MSC_VER<1400)
+    XMVECTORU32 tmp;
+    tmp.v = V;
+    *x = tmp.u[i];
+#else
+    *x = V.m128_u32[i];
+#endif
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Store the X component into a 32 bit integer location in memory.
+XMFINLINE VOID XMVectorGetIntXPtr(UINT *x,FXMVECTOR V)
+{
+    XMASSERT( x != 0 );
+#if defined(_XM_NO_INTRINSICS_)
+    *x = V.vector4_u32[0];
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_ss(reinterpret_cast(x),V);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Store the Y component into a 32 bit integer location in memory.
+XMFINLINE VOID XMVectorGetIntYPtr(UINT *y,FXMVECTOR V)
+{
+    XMASSERT( y != 0 );
+#if defined(_XM_NO_INTRINSICS_)
+    *y = V.vector4_u32[1];
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
+    _mm_store_ss(reinterpret_cast(y),vResult);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Store the Z component into a 32 bit integer location in memory.
+XMFINLINE VOID XMVectorGetIntZPtr(UINT *z,FXMVECTOR V)
+{
+    XMASSERT( z != 0 );
+#if defined(_XM_NO_INTRINSICS_)
+    *z = V.vector4_u32[2];
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
+    _mm_store_ss(reinterpret_cast(z),vResult);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Store the W component into a 32 bit integer location in memory.
+XMFINLINE VOID XMVectorGetIntWPtr(UINT *w,FXMVECTOR V) +{ + XMASSERT( w != 0 ); +#if defined(_XM_NO_INTRINSICS_) + *w = V.vector4_u32[3]; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(3,3,3,3)); + _mm_store_ss(reinterpret_cast(w),vResult); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Set a single indexed floating point component +// This causes Load/Hit/Store on VMX targets +XMFINLINE XMVECTOR XMVectorSetByIndex(FXMVECTOR V, FLOAT f,UINT i) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + XMASSERT( i <= 3 ); + U = V; + U.vector4_f32[i] = f; + return U; +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT( i <= 3 ); + XMVECTOR U = V; + U.m128_f32[i] = f; + return U; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to a passed floating point value +// This causes Load/Hit/Store on VMX targets +XMFINLINE XMVECTOR XMVectorSetX(FXMVECTOR V, FLOAT x) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_f32[0] = x; + U.vector4_f32[1] = V.vector4_f32[1]; + U.vector4_f32[2] = V.vector4_f32[2]; + U.vector4_f32[3] = V.vector4_f32[3]; + return U; +#elif defined(_XM_SSE_INTRINSICS_) +#if defined(_XM_ISVS2005_) + XMVECTOR vResult = V; + vResult.m128_f32[0] = x; + return vResult; +#else + XMVECTOR vResult = _mm_set_ss(x); + vResult = _mm_move_ss(V,vResult); + return vResult; +#endif // _XM_ISVS2005_ +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Sets the Y component of a vector to a passed floating point value +// This causes Load/Hit/Store on VMX targets +XMFINLINE XMVECTOR XMVectorSetY(FXMVECTOR V, FLOAT y) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_f32[0] = V.vector4_f32[0]; + U.vector4_f32[1] = y; + U.vector4_f32[2] = V.vector4_f32[2]; + U.vector4_f32[3] = V.vector4_f32[3]; + return U; +#elif defined(_XM_SSE_INTRINSICS_) +#if defined(_XM_ISVS2005_) + XMVECTOR vResult = V; + vResult.m128_f32[1] = y; + return vResult; +#else + // Swap y and x + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(3,2,0,1)); + // Convert input to vector + XMVECTOR vTemp = _mm_set_ss(y); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap y and x again + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,2,0,1)); + return vResult; +#endif // _XM_ISVS2005_ +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} +// Sets the Z component of a vector to a passed floating point value +// This causes Load/Hit/Store on VMX targets +XMFINLINE XMVECTOR XMVectorSetZ(FXMVECTOR V, FLOAT z) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_f32[0] = V.vector4_f32[0]; + U.vector4_f32[1] = V.vector4_f32[1]; + U.vector4_f32[2] = z; + U.vector4_f32[3] = V.vector4_f32[3]; + return U; +#elif defined(_XM_SSE_INTRINSICS_) +#if defined(_XM_ISVS2005_) + XMVECTOR vResult = V; + vResult.m128_f32[2] = z; + return vResult; +#else + // Swap z and x + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(3,0,1,2)); + // Convert input to vector + XMVECTOR vTemp = _mm_set_ss(z); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap z and x again + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,0,1,2)); + return vResult; +#endif // _XM_ISVS2005_ +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// 
Sets the W component of a vector to a passed floating point value +// This causes Load/Hit/Store on VMX targets +XMFINLINE XMVECTOR XMVectorSetW(FXMVECTOR V, FLOAT w) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_f32[0] = V.vector4_f32[0]; + U.vector4_f32[1] = V.vector4_f32[1]; + U.vector4_f32[2] = V.vector4_f32[2]; + U.vector4_f32[3] = w; + return U; +#elif defined(_XM_SSE_INTRINSICS_) +#if defined(_XM_ISVS2005_) + XMVECTOR vResult = V; + vResult.m128_f32[3] = w; + return vResult; +#else + // Swap w and x + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(0,2,1,3)); + // Convert input to vector + XMVECTOR vTemp = _mm_set_ss(w); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap w and x again + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,2,1,3)); + return vResult; +#endif // _XM_ISVS2005_ +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Sets a component of a vector to a floating point value passed by pointer +// This causes Load/Hit/Store on VMX targets +XMFINLINE XMVECTOR XMVectorSetByIndexPtr(FXMVECTOR V,CONST FLOAT *f,UINT i) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + XMASSERT( f != 0 ); + XMASSERT( i <= 3 ); + U = V; + U.vector4_f32[i] = *f; + return U; +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT( f != 0 ); + XMASSERT( i <= 3 ); + XMVECTOR U = V; + U.m128_f32[i] = *f; + return U; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to a floating point value passed by pointer +XMFINLINE XMVECTOR XMVectorSetXPtr(FXMVECTOR V,CONST FLOAT *x) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + XMASSERT( x != 0 ); + U.vector4_f32[0] = *x; + U.vector4_f32[1] = V.vector4_f32[1]; + U.vector4_f32[2] = V.vector4_f32[2]; + U.vector4_f32[3] = V.vector4_f32[3]; + return U; +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT( x != 0 ); + XMVECTOR vResult = _mm_load_ss(x); + vResult = _mm_move_ss(V,vResult); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Sets the Y component of a vector to a floating point value passed by pointer +XMFINLINE XMVECTOR XMVectorSetYPtr(FXMVECTOR V,CONST FLOAT *y) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + XMASSERT( y != 0 ); + U.vector4_f32[0] = V.vector4_f32[0]; + U.vector4_f32[1] = *y; + U.vector4_f32[2] = V.vector4_f32[2]; + U.vector4_f32[3] = V.vector4_f32[3]; + return U; +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT( y != 0 ); + // Swap y and x + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(3,2,0,1)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(y); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap y and x again + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,2,0,1)); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Sets the Z component of a vector to a floating point value passed by pointer +XMFINLINE XMVECTOR XMVectorSetZPtr(FXMVECTOR V,CONST FLOAT *z) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + XMASSERT( z != 0 ); + U.vector4_f32[0] = V.vector4_f32[0]; + U.vector4_f32[1] = V.vector4_f32[1]; + U.vector4_f32[2] = *z; + U.vector4_f32[3] = V.vector4_f32[3]; + return U; +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT( z != 0 ); + // Swap z and x + XMVECTOR vResult = 
_mm_shuffle_ps(V,V,_MM_SHUFFLE(3,0,1,2)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(z); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap z and x again + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,0,1,2)); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Sets the W component of a vector to a floating point value passed by pointer +XMFINLINE XMVECTOR XMVectorSetWPtr(FXMVECTOR V,CONST FLOAT *w) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + XMASSERT( w != 0 ); + U.vector4_f32[0] = V.vector4_f32[0]; + U.vector4_f32[1] = V.vector4_f32[1]; + U.vector4_f32[2] = V.vector4_f32[2]; + U.vector4_f32[3] = *w; + return U; +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT( w != 0 ); + // Swap w and x + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(0,2,1,3)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(w); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap w and x again + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,2,1,3)); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Sets a component of a vector to an integer passed by value +// This causes Load/Hit/Store on VMX targets +XMFINLINE XMVECTOR XMVectorSetIntByIndex(FXMVECTOR V, UINT x, UINT i) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + XMASSERT( i <= 3 ); + U = V; + U.vector4_u32[i] = x; + return U; +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT( i <= 3 ); + XMVECTORU32 tmp; + tmp.v = V; + tmp.u[i] = x; + return tmp; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to an integer passed by value +// This causes Load/Hit/Store on VMX targets +XMFINLINE XMVECTOR XMVectorSetIntX(FXMVECTOR V, UINT x) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_u32[0] = x; + U.vector4_u32[1] = V.vector4_u32[1]; + U.vector4_u32[2] = V.vector4_u32[2]; + U.vector4_u32[3] = V.vector4_u32[3]; + return U; +#elif defined(_XM_SSE_INTRINSICS_) +#if defined(_XM_ISVS2005_) + XMVECTOR vResult = V; + vResult.m128_i32[0] = x; + return vResult; +#else + __m128i vTemp = _mm_cvtsi32_si128(x); + XMVECTOR vResult = _mm_move_ss(V,reinterpret_cast(&vTemp)[0]); + return vResult; +#endif // _XM_ISVS2005_ +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Sets the Y component of a vector to an integer passed by value +// This causes Load/Hit/Store on VMX targets +XMFINLINE XMVECTOR XMVectorSetIntY(FXMVECTOR V, UINT y) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_u32[0] = V.vector4_u32[0]; + U.vector4_u32[1] = y; + U.vector4_u32[2] = V.vector4_u32[2]; + U.vector4_u32[3] = V.vector4_u32[3]; + return U; +#elif defined(_XM_SSE_INTRINSICS_) +#if defined(_XM_ISVS2005_) + XMVECTOR vResult = V; + vResult.m128_i32[1] = y; + return vResult; +#else // Swap y and x + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(3,2,0,1)); + // Convert input to vector + __m128i vTemp = _mm_cvtsi32_si128(y); + // Replace the x component + vResult = _mm_move_ss(vResult,reinterpret_cast(&vTemp)[0]); + // Swap y and x again + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,2,0,1)); + return vResult; +#endif // _XM_ISVS2005_ +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Sets the Z component of a 
vector to an integer passed by value +// This causes Load/Hit/Store on VMX targets +XMFINLINE XMVECTOR XMVectorSetIntZ(FXMVECTOR V, UINT z) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_u32[0] = V.vector4_u32[0]; + U.vector4_u32[1] = V.vector4_u32[1]; + U.vector4_u32[2] = z; + U.vector4_u32[3] = V.vector4_u32[3]; + return U; +#elif defined(_XM_SSE_INTRINSICS_) +#if defined(_XM_ISVS2005_) + XMVECTOR vResult = V; + vResult.m128_i32[2] = z; + return vResult; +#else + // Swap z and x + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(3,0,1,2)); + // Convert input to vector + __m128i vTemp = _mm_cvtsi32_si128(z); + // Replace the x component + vResult = _mm_move_ss(vResult,reinterpret_cast(&vTemp)[0]); + // Swap z and x again + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,0,1,2)); + return vResult; +#endif // _XM_ISVS2005_ +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Sets the W component of a vector to an integer passed by value +// This causes Load/Hit/Store on VMX targets +XMFINLINE XMVECTOR XMVectorSetIntW(FXMVECTOR V, UINT w) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_u32[0] = V.vector4_u32[0]; + U.vector4_u32[1] = V.vector4_u32[1]; + U.vector4_u32[2] = V.vector4_u32[2]; + U.vector4_u32[3] = w; + return U; +#elif defined(_XM_SSE_INTRINSICS_) +#if defined(_XM_ISVS2005_) + XMVECTOR vResult = V; + vResult.m128_i32[3] = w; + return vResult; +#else + // Swap w and x + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(0,2,1,3)); + // Convert input to vector + __m128i vTemp = _mm_cvtsi32_si128(w); + // Replace the x component + vResult = _mm_move_ss(vResult,reinterpret_cast(&vTemp)[0]); + // Swap w and x again + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,2,1,3)); + return vResult; +#endif // _XM_ISVS2005_ +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Sets a component of a vector to an integer value passed by pointer +// This causes Load/Hit/Store on VMX targets +XMFINLINE XMVECTOR XMVectorSetIntByIndexPtr(FXMVECTOR V, CONST UINT *x,UINT i) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + XMASSERT( x != 0 ); + XMASSERT( i <= 3 ); + U = V; + U.vector4_u32[i] = *x; + return U; +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT( x != 0 ); + XMASSERT( i <= 3 ); + XMVECTORU32 tmp; + tmp.v = V; + tmp.u[i] = *x; + return tmp; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to an integer value passed by pointer +XMFINLINE XMVECTOR XMVectorSetIntXPtr(FXMVECTOR V,CONST UINT *x) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + XMASSERT( x != 0 ); + U.vector4_u32[0] = *x; + U.vector4_u32[1] = V.vector4_u32[1]; + U.vector4_u32[2] = V.vector4_u32[2]; + U.vector4_u32[3] = V.vector4_u32[3]; + return U; +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT( x != 0 ); + XMVECTOR vTemp = _mm_load_ss(reinterpret_cast(x)); + XMVECTOR vResult = _mm_move_ss(V,vTemp); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Sets the Y component of a vector to an integer value passed by pointer +XMFINLINE XMVECTOR XMVectorSetIntYPtr(FXMVECTOR V,CONST UINT *y) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + XMASSERT( y != 0 ); + U.vector4_u32[0] = V.vector4_u32[0]; + U.vector4_u32[1] = *y; + U.vector4_u32[2] = V.vector4_u32[2]; + U.vector4_u32[3] 
= V.vector4_u32[3]; + return U; +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT( y != 0 ); + // Swap y and x + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(3,2,0,1)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(reinterpret_cast(y)); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap y and x again + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,2,0,1)); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Sets the Z component of a vector to an integer value passed by pointer +XMFINLINE XMVECTOR XMVectorSetIntZPtr(FXMVECTOR V,CONST UINT *z) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + XMASSERT( z != 0 ); + U.vector4_u32[0] = V.vector4_u32[0]; + U.vector4_u32[1] = V.vector4_u32[1]; + U.vector4_u32[2] = *z; + U.vector4_u32[3] = V.vector4_u32[3]; + return U; +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT( z != 0 ); + // Swap z and x + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(3,0,1,2)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(reinterpret_cast(z)); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap z and x again + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,0,1,2)); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Sets the W component of a vector to an integer value passed by pointer +XMFINLINE XMVECTOR XMVectorSetIntWPtr(FXMVECTOR V,CONST UINT *w) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + XMASSERT( w != 0 ); + U.vector4_u32[0] = V.vector4_u32[0]; + U.vector4_u32[1] = V.vector4_u32[1]; + U.vector4_u32[2] = V.vector4_u32[2]; + U.vector4_u32[3] = *w; + return U; +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT( w != 0 ); + // Swap w and x + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(0,2,1,3)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(reinterpret_cast(w)); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap w and x again + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,2,1,3)); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Define a control vector to be used in XMVectorPermute +// operations. Visualize the two vectors V1 and V2 given +// in a permute as arranged back to back in a linear fashion, +// such that they form an array of 8 floating point values. +// The four integers specified in XMVectorPermuteControl +// will serve as indices into the array to select components +// from the two vectors. ElementIndex0 is used to select +// an element from the vectors to be placed in the first +// component of the resulting vector, ElementIndex1 is used +// to select an element for the second component, etc. 
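+//
+// Editor's note, not part of the imported XNAMath source: a minimal usage
+// sketch, assuming two already-loaded vectors A and B (names illustrative
+// only). Indices 0-3 select components of the first vector, 4-7 components
+// of the second:
+//
+//     XMVECTOR Control = XMVectorPermuteControl(0, 4, 1, 5);
+//     XMVECTOR R = XMVectorPermute(A, B, Control); // R = (A.x, B.x, A.y, B.y)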
+ +XMFINLINE XMVECTOR XMVectorPermuteControl +( + UINT ElementIndex0, + UINT ElementIndex1, + UINT ElementIndex2, + UINT ElementIndex3 +) +{ +#if defined(_XM_SSE_INTRINSICS_) || defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vControl; + static CONST UINT ControlElement[] = { + XM_PERMUTE_0X, + XM_PERMUTE_0Y, + XM_PERMUTE_0Z, + XM_PERMUTE_0W, + XM_PERMUTE_1X, + XM_PERMUTE_1Y, + XM_PERMUTE_1Z, + XM_PERMUTE_1W + }; + XMASSERT(ElementIndex0 < 8); + XMASSERT(ElementIndex1 < 8); + XMASSERT(ElementIndex2 < 8); + XMASSERT(ElementIndex3 < 8); + + vControl.u[0] = ControlElement[ElementIndex0]; + vControl.u[1] = ControlElement[ElementIndex1]; + vControl.u[2] = ControlElement[ElementIndex2]; + vControl.u[3] = ControlElement[ElementIndex3]; + return vControl.v; +#else +#endif +} + +//------------------------------------------------------------------------------ + +// Using a control vector made up of 16 bytes from 0-31, remap V1 and V2's byte +// entries into a single 16 byte vector and return it. Index 0-15 = V1, +// 16-31 = V2 +XMFINLINE XMVECTOR XMVectorPermute +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Control +) +{ +#if defined(_XM_NO_INTRINSICS_) + const BYTE *aByte[2]; + XMVECTOR Result; + UINT i, uIndex, VectorIndex; + const BYTE *pControl; + BYTE *pWork; + + // Indices must be in range from 0 to 31 + XMASSERT((Control.vector4_u32[0] & 0xE0E0E0E0) == 0); + XMASSERT((Control.vector4_u32[1] & 0xE0E0E0E0) == 0); + XMASSERT((Control.vector4_u32[2] & 0xE0E0E0E0) == 0); + XMASSERT((Control.vector4_u32[3] & 0xE0E0E0E0) == 0); + + // 0-15 = V1, 16-31 = V2 + aByte[0] = (const BYTE*)(&V1); + aByte[1] = (const BYTE*)(&V2); + i = 16; + pControl = (const BYTE *)(&Control); + pWork = (BYTE *)(&Result); + do { + // Get the byte to map from + uIndex = pControl[0]; + ++pControl; + VectorIndex = (uIndex>>4)&1; + uIndex &= 0x0F; +#if defined(_XM_LITTLEENDIAN_) + uIndex ^= 3; // Swap byte ordering on little endian machines +#endif + pWork[0] = aByte[VectorIndex][uIndex]; + ++pWork; + } while (--i); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) +#if defined(_PREFAST_) || defined(XMDEBUG) + // Indices must be in range from 0 to 31 + static const XMVECTORI32 PremuteTest = {0xE0E0E0E0,0xE0E0E0E0,0xE0E0E0E0,0xE0E0E0E0}; + XMVECTOR vAssert = _mm_and_ps(Control,PremuteTest); + __m128i vAsserti = _mm_cmpeq_epi32(reinterpret_cast(&vAssert)[0],g_XMZero); + XMASSERT(_mm_movemask_ps(*reinterpret_cast(&vAsserti)) == 0xf); +#endif + // Store the vectors onto local memory on the stack + XMVECTOR Array[2]; + Array[0] = V1; + Array[1] = V2; + // Output vector, on the stack + XMVECTORU8 vResult; + // Get pointer to the two vectors on the stack + const BYTE *pInput = reinterpret_cast(Array); + // Store the Control vector on the stack to access the bytes + // don't use Control, it can cause a register variable to spill on the stack. + XMVECTORU8 vControl; + vControl.v = Control; // Write to memory + UINT i = 0; + do { + UINT ComponentIndex = vControl.u[i] & 0x1FU; + ComponentIndex ^= 3; // Swap byte ordering + vResult.u[i] = pInput[ComponentIndex]; + } while (++i<16); + return vResult; +#else // _XM_SSE_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Define a control vector to be used in XMVectorSelect +// operations. The four integers specified in XMVectorSelectControl +// serve as indices to select between components in two vectors. 
+// The first index controls selection for the first component of +// the vectors involved in a select operation, the second index +// controls selection for the second component etc. A value of +// zero for an index causes the corresponding component from the first +// vector to be selected whereas a one causes the component from the +// second vector to be selected instead. + +XMFINLINE XMVECTOR XMVectorSelectControl +( + UINT VectorIndex0, + UINT VectorIndex1, + UINT VectorIndex2, + UINT VectorIndex3 +) +{ +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + // x=Index0,y=Index1,z=Index2,w=Index3 + __m128i vTemp = _mm_set_epi32(VectorIndex3,VectorIndex2,VectorIndex1,VectorIndex0); + // Any non-zero entries become 0xFFFFFFFF else 0 + vTemp = _mm_cmpgt_epi32(vTemp,g_XMZero); + return reinterpret_cast<__m128 *>(&vTemp)[0]; +#else + XMVECTOR ControlVector; + CONST UINT ControlElement[] = + { + XM_SELECT_0, + XM_SELECT_1 + }; + + XMASSERT(VectorIndex0 < 2); + XMASSERT(VectorIndex1 < 2); + XMASSERT(VectorIndex2 < 2); + XMASSERT(VectorIndex3 < 2); + + ControlVector.vector4_u32[0] = ControlElement[VectorIndex0]; + ControlVector.vector4_u32[1] = ControlElement[VectorIndex1]; + ControlVector.vector4_u32[2] = ControlElement[VectorIndex2]; + ControlVector.vector4_u32[3] = ControlElement[VectorIndex3]; + + return ControlVector; + +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorSelect +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Control +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result.vector4_u32[0] = (V1.vector4_u32[0] & ~Control.vector4_u32[0]) | (V2.vector4_u32[0] & Control.vector4_u32[0]); + Result.vector4_u32[1] = (V1.vector4_u32[1] & ~Control.vector4_u32[1]) | (V2.vector4_u32[1] & Control.vector4_u32[1]); + Result.vector4_u32[2] = (V1.vector4_u32[2] & ~Control.vector4_u32[2]) | (V2.vector4_u32[2] & Control.vector4_u32[2]); + Result.vector4_u32[3] = (V1.vector4_u32[3] & ~Control.vector4_u32[3]) | (V2.vector4_u32[3] & Control.vector4_u32[3]); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = _mm_andnot_ps(Control,V1); + XMVECTOR vTemp2 = _mm_and_ps(V2,Control); + return _mm_or_ps(vTemp1,vTemp2); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorMergeXY +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result.vector4_u32[0] = V1.vector4_u32[0]; + Result.vector4_u32[1] = V2.vector4_u32[0]; + Result.vector4_u32[2] = V1.vector4_u32[1]; + Result.vector4_u32[3] = V2.vector4_u32[1]; + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_unpacklo_ps( V1, V2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorMergeZW +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result.vector4_u32[0] = V1.vector4_u32[2]; + Result.vector4_u32[1] = V2.vector4_u32[2]; + Result.vector4_u32[2] = V1.vector4_u32[3]; + Result.vector4_u32[3] = V2.vector4_u32[3]; + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_unpackhi_ps( V1, V2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// 
Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + + Control.vector4_u32[0] = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; + + return Control; + +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpeq_ps( V1, V2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorEqualR +( + UINT* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + UINT ux, uy, uz, uw, CR; + XMVECTOR Control; + + XMASSERT( pCR ); + + ux = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uy = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uz = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uw = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0; + CR = 0; + if (ux&uy&uz&uw) + { + // All elements are greater + CR = XM_CRMASK_CR6TRUE; + } + else if (!(ux|uy|uz|uw)) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + Control.vector4_u32[0] = ux; + Control.vector4_u32[1] = uy; + Control.vector4_u32[2] = uz; + Control.vector4_u32[3] = uw; + return Control; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT( pCR ); + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); + UINT CR = 0; + int iTest = _mm_movemask_ps(vTemp); + if (iTest==0xf) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vTemp; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Treat the components of the vectors as unsigned integers and +// compare individual bits between the two. This is useful for +// comparing control vectors and result vectors returned from +// other comparison operations. + +XMFINLINE XMVECTOR XMVectorEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + + Control.vector4_u32[0] = (V1.vector4_u32[0] == V2.vector4_u32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_u32[1] == V2.vector4_u32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_u32[2] == V2.vector4_u32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_u32[3] == V2.vector4_u32[3]) ? 
0xFFFFFFFF : 0; + + return Control; + +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_cmpeq_epi32( reinterpret_cast(&V1)[0],reinterpret_cast(&V2)[0] ); + return reinterpret_cast<__m128 *>(&V)[0]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorEqualIntR +( + UINT* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + + XMASSERT(pCR); + + Control = XMVectorEqualInt(V1, V2); + + *pCR = 0; + + if (XMVector4EqualInt(Control, XMVectorTrueInt())) + { + // All elements are equal + *pCR |= XM_CRMASK_CR6TRUE; + } + else if (XMVector4EqualInt(Control, XMVectorFalseInt())) + { + // All elements are not equal + *pCR |= XM_CRMASK_CR6FALSE; + } + + return Control; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pCR); + __m128i V = _mm_cmpeq_epi32( reinterpret_cast(&V1)[0],reinterpret_cast(&V2)[0] ); + int iTemp = _mm_movemask_ps(reinterpret_cast(&V)[0]); + UINT CR = 0; + if (iTemp==0x0F) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTemp) + { + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return reinterpret_cast<__m128 *>(&V)[0]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorNearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) +{ +#if defined(_XM_NO_INTRINSICS_) + + FLOAT fDeltax, fDeltay, fDeltaz, fDeltaw; + XMVECTOR Control; + + fDeltax = V1.vector4_f32[0]-V2.vector4_f32[0]; + fDeltay = V1.vector4_f32[1]-V2.vector4_f32[1]; + fDeltaz = V1.vector4_f32[2]-V2.vector4_f32[2]; + fDeltaw = V1.vector4_f32[3]-V2.vector4_f32[3]; + + fDeltax = fabsf(fDeltax); + fDeltay = fabsf(fDeltay); + fDeltaz = fabsf(fDeltaz); + fDeltaw = fabsf(fDeltaw); + + Control.vector4_u32[0] = (fDeltax <= Epsilon.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[1] = (fDeltay <= Epsilon.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[2] = (fDeltaz <= Epsilon.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[3] = (fDeltaw <= Epsilon.vector4_f32[3]) ? 0xFFFFFFFFU : 0; + + return Control; + +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1,V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp,vDelta); + vTemp = _mm_max_ps(vTemp,vDelta); + vTemp = _mm_cmple_ps(vTemp,Epsilon); + return vTemp; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorNotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_f32[0] != V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_f32[1] != V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_f32[2] != V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_f32[3] != V2.vector4_f32[3]) ? 
0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpneq_ps( V1, V2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorNotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_u32[0] != V2.vector4_u32[0]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[1] = (V1.vector4_u32[1] != V2.vector4_u32[1]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[2] = (V1.vector4_u32[2] != V2.vector4_u32[2]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[3] = (V1.vector4_u32[3] != V2.vector4_u32[3]) ? 0xFFFFFFFFU : 0; + return Control; + +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_cmpeq_epi32( reinterpret_cast(&V1)[0],reinterpret_cast(&V2)[0] ); + return _mm_xor_ps(reinterpret_cast<__m128 *>(&V)[0],g_XMNegOneMask); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorGreater +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpgt_ps( V1, V2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorGreaterR +( + UINT* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + UINT ux, uy, uz, uw, CR; + XMVECTOR Control; + + XMASSERT( pCR ); + + ux = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uy = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uz = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uw = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0; + CR = 0; + if (ux&uy&uz&uw) + { + // All elements are greater + CR = XM_CRMASK_CR6TRUE; + } + else if (!(ux|uy|uz|uw)) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + Control.vector4_u32[0] = ux; + Control.vector4_u32[1] = uy; + Control.vector4_u32[2] = uz; + Control.vector4_u32[3] = uw; + return Control; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT( pCR ); + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); + UINT CR = 0; + int iTest = _mm_movemask_ps(vTemp); + if (iTest==0xf) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vTemp; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorGreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 
0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpge_ps( V1, V2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorGreaterOrEqualR +( + UINT* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + UINT ux, uy, uz, uw, CR; + XMVECTOR Control; + + XMASSERT( pCR ); + + ux = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uy = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uz = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uw = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0; + CR = 0; + if (ux&uy&uz&uw) + { + // All elements are greater + CR = XM_CRMASK_CR6TRUE; + } + else if (!(ux|uy|uz|uw)) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + Control.vector4_u32[0] = ux; + Control.vector4_u32[1] = uy; + Control.vector4_u32[2] = uz; + Control.vector4_u32[3] = uw; + return Control; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT( pCR ); + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + UINT CR = 0; + int iTest = _mm_movemask_ps(vTemp); + if (iTest==0xf) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vTemp; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorLess +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_f32[0] < V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_f32[1] < V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_f32[2] < V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_f32[3] < V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmplt_ps( V1, V2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorLessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_f32[0] <= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_f32[1] <= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_f32[2] <= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_f32[3] <= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmple_ps( V1, V2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorInBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 
0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2,V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + return vTemp1; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorInBoundsR +( + UINT* pCR, + FXMVECTOR V, + FXMVECTOR Bounds +) +{ +#if defined(_XM_NO_INTRINSICS_) + UINT ux, uy, uz, uw, CR; + XMVECTOR Control; + + XMASSERT( pCR != 0 ); + + ux = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uy = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uz = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uw = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 0xFFFFFFFFU : 0; + + CR = 0; + + if (ux&uy&uz&uw) + { + // All elements are in bounds + CR = XM_CRMASK_CR6BOUNDS; + } + *pCR = CR; + Control.vector4_u32[0] = ux; + Control.vector4_u32[1] = uy; + Control.vector4_u32[2] = uz; + Control.vector4_u32[3] = uw; + return Control; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT( pCR != 0 ); + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2,V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + + UINT CR = 0; + if (_mm_movemask_ps(vTemp1)==0xf) { + // All elements are in bounds + CR = XM_CRMASK_CR6BOUNDS; + } + *pCR = CR; + return vTemp1; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorIsNaN +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = XMISNAN(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[1] = XMISNAN(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[2] = XMISNAN(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[3] = XMISNAN(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0; + return Control; + +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the exponent + __m128i vTempInf = _mm_and_si128(reinterpret_cast(&V)[0],g_XMInfinity); + // Mask off the mantissa + __m128i vTempNan = _mm_and_si128(reinterpret_cast(&V)[0],g_XMQNaNTest); + // Are any of the exponents == 0x7F800000? + vTempInf = _mm_cmpeq_epi32(vTempInf,g_XMInfinity); + // Are any of the mantissa's zero? 
(SSE2 doesn't have a neq test) + vTempNan = _mm_cmpeq_epi32(vTempNan,g_XMZero); + // Perform a not on the NaN test to be true on NON-zero mantissas + vTempNan = _mm_andnot_si128(vTempNan,vTempInf); + // If any are NaN, the signs are true after the merge above + return reinterpret_cast(&vTempNan)[0]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorIsInfinite +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = XMISINF(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[1] = XMISINF(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[2] = XMISINF(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[3] = XMISINF(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0; + return Control; + +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + __m128 vTemp = _mm_and_ps(V,g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); + // If any are infinity, the signs are true. + return vTemp; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Rounding and clamping operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorMin +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = (V1.vector4_f32[0] < V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0]; + Result.vector4_f32[1] = (V1.vector4_f32[1] < V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1]; + Result.vector4_f32[2] = (V1.vector4_f32[2] < V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2]; + Result.vector4_f32[3] = (V1.vector4_f32[3] < V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3]; + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_min_ps( V1, V2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorMax +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0]; + Result.vector4_f32[1] = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1]; + Result.vector4_f32[2] = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2]; + Result.vector4_f32[3] = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 
V1.vector4_f32[3] : V2.vector4_f32[3]; + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_max_ps( V1, V2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorRound +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + XMVECTOR Bias; + CONST XMVECTOR Zero = XMVectorZero(); + CONST XMVECTOR BiasPos = XMVectorReplicate(0.5f); + CONST XMVECTOR BiasNeg = XMVectorReplicate(-0.5f); + + Bias = XMVectorLess(V, Zero); + Bias = XMVectorSelect(BiasPos, BiasNeg, Bias); + Result = XMVectorAdd(V, Bias); + Result = XMVectorTruncate(Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // To handle NAN, INF and numbers greater than 8388608, use masking + // Get the abs value + __m128i vTest = _mm_and_si128(reinterpret_cast(&V)[0],g_XMAbsMask); + // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF + vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction); + // Convert to int and back to float for rounding + __m128i vInt = _mm_cvtps_epi32(V); + // Convert back to floats + XMVECTOR vResult = _mm_cvtepi32_ps(vInt); + // All numbers less than 8388608 will use the round to int + vResult = _mm_and_ps(vResult,reinterpret_cast(&vTest)[0]); + // All others, use the ORIGINAL value + vTest = _mm_andnot_si128(vTest,reinterpret_cast(&V)[0]); + vResult = _mm_or_ps(vResult,reinterpret_cast(&vTest)[0]); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorTruncate +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + UINT i; + + // Avoid C4701 + Result.vector4_f32[0] = 0.0f; + + for (i = 0; i < 4; i++) + { + if (XMISNAN(V.vector4_f32[i])) + { + Result.vector4_u32[i] = 0x7FC00000; + } + else if (fabsf(V.vector4_f32[i]) < 8388608.0f) + { + Result.vector4_f32[i] = (FLOAT)((INT)V.vector4_f32[i]); + } + else + { + Result.vector4_f32[i] = V.vector4_f32[i]; + } + } + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // To handle NAN, INF and numbers greater than 8388608, use masking + // Get the abs value + __m128i vTest = _mm_and_si128(reinterpret_cast(&V)[0],g_XMAbsMask); + // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF + vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction); + // Convert to int and back to float for rounding with truncation + __m128i vInt = _mm_cvttps_epi32(V); + // Convert back to floats + XMVECTOR vResult = _mm_cvtepi32_ps(vInt); + // All numbers less than 8388608 will use the round to int + vResult = _mm_and_ps(vResult,reinterpret_cast(&vTest)[0]); + // All others, use the ORIGINAL value + vTest = _mm_andnot_si128(vTest,reinterpret_cast(&V)[0]); + vResult = _mm_or_ps(vResult,reinterpret_cast(&vTest)[0]); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorFloor +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR vResult = { + floorf(V.vector4_f32[0]), + floorf(V.vector4_f32[1]), + floorf(V.vector4_f32[2]), + floorf(V.vector4_f32[3]) + }; + return vResult; + +#elif defined(_XM_SSE_INTRINSICS_) + // To handle NAN, INF and numbers greater than 8388608, use masking + // Get the abs value + __m128i vTest = 
_mm_and_si128(reinterpret_cast(&V)[0],g_XMAbsMask); + // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF + vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction); + // Convert to int and back to float for rounding + XMVECTOR vResult = _mm_sub_ps(V,g_XMOneHalfMinusEpsilon); + __m128i vInt = _mm_cvtps_epi32(vResult); + // Convert back to floats + vResult = _mm_cvtepi32_ps(vInt); + // All numbers less than 8388608 will use the round to int + vResult = _mm_and_ps(vResult,reinterpret_cast(&vTest)[0]); + // All others, use the ORIGINAL value + vTest = _mm_andnot_si128(vTest,reinterpret_cast(&V)[0]); + vResult = _mm_or_ps(vResult,reinterpret_cast(&vTest)[0]); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorCeiling +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult = { + ceilf(V.vector4_f32[0]), + ceilf(V.vector4_f32[1]), + ceilf(V.vector4_f32[2]), + ceilf(V.vector4_f32[3]) + }; + return vResult; + +#elif defined(_XM_SSE_INTRINSICS_) + // To handle NAN, INF and numbers greater than 8388608, use masking + // Get the abs value + __m128i vTest = _mm_and_si128(reinterpret_cast(&V)[0],g_XMAbsMask); + // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF + vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction); + // Convert to int and back to float for rounding + XMVECTOR vResult = _mm_add_ps(V,g_XMOneHalfMinusEpsilon); + __m128i vInt = _mm_cvtps_epi32(vResult); + // Convert back to floats + vResult = _mm_cvtepi32_ps(vInt); + // All numbers less than 8388608 will use the round to int + vResult = _mm_and_ps(vResult,reinterpret_cast(&vTest)[0]); + // All others, use the ORIGINAL value + vTest = _mm_andnot_si128(vTest,reinterpret_cast(&V)[0]); + vResult = _mm_or_ps(vResult,reinterpret_cast(&vTest)[0]); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorClamp +( + FXMVECTOR V, + FXMVECTOR Min, + FXMVECTOR Max +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + XMASSERT(XMVector4LessOrEqual(Min, Max)); + + Result = XMVectorMax(Min, V); + Result = XMVectorMin(Max, Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult; + XMASSERT(XMVector4LessOrEqual(Min, Max)); + vResult = _mm_max_ps(Min,V); + vResult = _mm_min_ps(vResult,Max); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorSaturate +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + CONST XMVECTOR Zero = XMVectorZero(); + + return XMVectorClamp(V, Zero, g_XMOne.v); + +#elif defined(_XM_SSE_INTRINSICS_) + // Set <0 to 0 + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + // Set>1 to 1 + return _mm_min_ps(vResult,g_XMOne); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Bitwise logical operations +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorAndInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result.vector4_u32[0] = V1.vector4_u32[0] & V2.vector4_u32[0]; + Result.vector4_u32[1] = 
V1.vector4_u32[1] & V2.vector4_u32[1]; + Result.vector4_u32[2] = V1.vector4_u32[2] & V2.vector4_u32[2]; + Result.vector4_u32[3] = V1.vector4_u32[3] & V2.vector4_u32[3]; + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_and_ps(V1,V2); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorAndCInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result.vector4_u32[0] = V1.vector4_u32[0] & ~V2.vector4_u32[0]; + Result.vector4_u32[1] = V1.vector4_u32[1] & ~V2.vector4_u32[1]; + Result.vector4_u32[2] = V1.vector4_u32[2] & ~V2.vector4_u32[2]; + Result.vector4_u32[3] = V1.vector4_u32[3] & ~V2.vector4_u32[3]; + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_andnot_si128( reinterpret_cast(&V2)[0], reinterpret_cast(&V1)[0] ); + return reinterpret_cast<__m128 *>(&V)[0]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorOrInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result.vector4_u32[0] = V1.vector4_u32[0] | V2.vector4_u32[0]; + Result.vector4_u32[1] = V1.vector4_u32[1] | V2.vector4_u32[1]; + Result.vector4_u32[2] = V1.vector4_u32[2] | V2.vector4_u32[2]; + Result.vector4_u32[3] = V1.vector4_u32[3] | V2.vector4_u32[3]; + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_or_si128( reinterpret_cast(&V1)[0], reinterpret_cast(&V2)[0] ); + return reinterpret_cast<__m128 *>(&V)[0]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorNorInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result.vector4_u32[0] = ~(V1.vector4_u32[0] | V2.vector4_u32[0]); + Result.vector4_u32[1] = ~(V1.vector4_u32[1] | V2.vector4_u32[1]); + Result.vector4_u32[2] = ~(V1.vector4_u32[2] | V2.vector4_u32[2]); + Result.vector4_u32[3] = ~(V1.vector4_u32[3] | V2.vector4_u32[3]); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + __m128i Result; + Result = _mm_or_si128( reinterpret_cast(&V1)[0], reinterpret_cast(&V2)[0] ); + Result = _mm_andnot_si128( Result,g_XMNegOneMask); + return reinterpret_cast<__m128 *>(&Result)[0]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorXorInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result.vector4_u32[0] = V1.vector4_u32[0] ^ V2.vector4_u32[0]; + Result.vector4_u32[1] = V1.vector4_u32[1] ^ V2.vector4_u32[1]; + Result.vector4_u32[2] = V1.vector4_u32[2] ^ V2.vector4_u32[2]; + Result.vector4_u32[3] = V1.vector4_u32[3] ^ V2.vector4_u32[3]; + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_xor_si128( reinterpret_cast(&V1)[0], reinterpret_cast(&V2)[0] ); + return reinterpret_cast<__m128 *>(&V)[0]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + 
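+// Editor's note, not part of the imported XNAMath source: a minimal usage
+// sketch for the multiply-add helpers in this section, assuming already-loaded
+// vectors A, X and B (names illustrative only):
+//
+//     XMVECTOR R = XMVectorMultiplyAdd(A, X, B);              // R = A*X + B
+//     XMVECTOR S = XMVectorNegativeMultiplySubtract(A, X, B); // S = B - A*X
+//
+// On the SSE path these expand to separate _mm_mul_ps and _mm_add_ps /
+// _mm_sub_ps calls; no fused multiply-add instruction is used below.
+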
+//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorNegate +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result.vector4_f32[0] = -V.vector4_f32[0]; + Result.vector4_f32[1] = -V.vector4_f32[1]; + Result.vector4_f32[2] = -V.vector4_f32[2]; + Result.vector4_f32[3] = -V.vector4_f32[3]; + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR Z; + + Z = _mm_setzero_ps(); + + return _mm_sub_ps( Z, V ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorAdd +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result.vector4_f32[0] = V1.vector4_f32[0] + V2.vector4_f32[0]; + Result.vector4_f32[1] = V1.vector4_f32[1] + V2.vector4_f32[1]; + Result.vector4_f32[2] = V1.vector4_f32[2] + V2.vector4_f32[2]; + Result.vector4_f32[3] = V1.vector4_f32[3] + V2.vector4_f32[3]; + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_add_ps( V1, V2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorAddAngles +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Mask; + XMVECTOR Offset; + XMVECTOR Result; + CONST XMVECTOR Zero = XMVectorZero(); + + // Add the given angles together. If the range of V1 is such + // that -Pi <= V1 < Pi and the range of V2 is such that + // -2Pi <= V2 <= 2Pi, then the range of the resulting angle + // will be -Pi <= Result < Pi. + Result = XMVectorAdd(V1, V2); + + Mask = XMVectorLess(Result, g_XMNegativePi.v); + Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask); + + Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v); + Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask); + + Result = XMVectorAdd(Result, Offset); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Adjust the angles + XMVECTOR vResult = _mm_add_ps(V1,V2); + // Less than Pi? + XMVECTOR vOffset = _mm_cmplt_ps(vResult,g_XMNegativePi); + vOffset = _mm_and_ps(vOffset,g_XMTwoPi); + // Add 2Pi to all entries less than -Pi + vResult = _mm_add_ps(vResult,vOffset); + // Greater than or equal to Pi? 
+ vOffset = _mm_cmpge_ps(vResult,g_XMPi); + vOffset = _mm_and_ps(vOffset,g_XMTwoPi); + // Sub 2Pi to all entries greater than Pi + vResult = _mm_sub_ps(vResult,vOffset); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorSubtract +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result.vector4_f32[0] = V1.vector4_f32[0] - V2.vector4_f32[0]; + Result.vector4_f32[1] = V1.vector4_f32[1] - V2.vector4_f32[1]; + Result.vector4_f32[2] = V1.vector4_f32[2] - V2.vector4_f32[2]; + Result.vector4_f32[3] = V1.vector4_f32[3] - V2.vector4_f32[3]; + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_sub_ps( V1, V2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorSubtractAngles +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Mask; + XMVECTOR Offset; + XMVECTOR Result; + CONST XMVECTOR Zero = XMVectorZero(); + + // Subtract the given angles. If the range of V1 is such + // that -Pi <= V1 < Pi and the range of V2 is such that + // -2Pi <= V2 <= 2Pi, then the range of the resulting angle + // will be -Pi <= Result < Pi. + Result = XMVectorSubtract(V1, V2); + + Mask = XMVectorLess(Result, g_XMNegativePi.v); + Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask); + + Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v); + Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask); + + Result = XMVectorAdd(Result, Offset); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Adjust the angles + XMVECTOR vResult = _mm_sub_ps(V1,V2); + // Less than Pi? + XMVECTOR vOffset = _mm_cmplt_ps(vResult,g_XMNegativePi); + vOffset = _mm_and_ps(vOffset,g_XMTwoPi); + // Add 2Pi to all entries less than -Pi + vResult = _mm_add_ps(vResult,vOffset); + // Greater than or equal to Pi? 
+ vOffset = _mm_cmpge_ps(vResult,g_XMPi); + vOffset = _mm_and_ps(vOffset,g_XMTwoPi); + // Sub 2Pi to all entries greater than Pi + vResult = _mm_sub_ps(vResult,vOffset); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorMultiply +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result = { + V1.vector4_f32[0] * V2.vector4_f32[0], + V1.vector4_f32[1] * V2.vector4_f32[1], + V1.vector4_f32[2] * V2.vector4_f32[2], + V1.vector4_f32[3] * V2.vector4_f32[3] + }; + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_mul_ps( V1, V2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorMultiplyAdd +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult = { + (V1.vector4_f32[0] * V2.vector4_f32[0]) + V3.vector4_f32[0], + (V1.vector4_f32[1] * V2.vector4_f32[1]) + V3.vector4_f32[1], + (V1.vector4_f32[2] * V2.vector4_f32[2]) + V3.vector4_f32[2], + (V1.vector4_f32[3] * V2.vector4_f32[3]) + V3.vector4_f32[3] + }; + return vResult; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_mul_ps( V1, V2 ); + return _mm_add_ps(vResult, V3 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorDivide +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = V1.vector4_f32[0] / V2.vector4_f32[0]; + Result.vector4_f32[1] = V1.vector4_f32[1] / V2.vector4_f32[1]; + Result.vector4_f32[2] = V1.vector4_f32[2] / V2.vector4_f32[2]; + Result.vector4_f32[3] = V1.vector4_f32[3] / V2.vector4_f32[3]; + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_div_ps( V1, V2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorNegativeMultiplySubtract +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR vResult = { + V3.vector4_f32[0] - (V1.vector4_f32[0] * V2.vector4_f32[0]), + V3.vector4_f32[1] - (V1.vector4_f32[1] * V2.vector4_f32[1]), + V3.vector4_f32[2] - (V1.vector4_f32[2] * V2.vector4_f32[2]), + V3.vector4_f32[3] - (V1.vector4_f32[3] * V2.vector4_f32[3]) + }; + return vResult; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR R = _mm_mul_ps( V1, V2 ); + return _mm_sub_ps( V3, R ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorScale +( + FXMVECTOR V, + FLOAT ScaleFactor +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult = { + V.vector4_f32[0] * ScaleFactor, + V.vector4_f32[1] * ScaleFactor, + V.vector4_f32[2] * ScaleFactor, + V.vector4_f32[3] * ScaleFactor + }; + return vResult; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_set_ps1(ScaleFactor); + return _mm_mul_ps(vResult,V); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorReciprocalEst +( + FXMVECTOR V +) +{ 
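+    // Note: the scalar path below spells out IEEE-style special cases: NaN
+    // inputs produce QNaN (0x7FC00000) and signed zeros produce signed
+    // infinity (0x7F800000 with the input's sign bit).  The SSE path instead
+    // relies on _mm_rcp_ps, which is only an approximation (roughly 12 bits
+    // of precision) and leaves those edge cases to the hardware.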
+#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + UINT i; + + // Avoid C4701 + Result.vector4_f32[0] = 0.0f; + + for (i = 0; i < 4; i++) + { + if (XMISNAN(V.vector4_f32[i])) + { + Result.vector4_u32[i] = 0x7FC00000; + } + else if (V.vector4_f32[i] == 0.0f || V.vector4_f32[i] == -0.0f) + { + Result.vector4_u32[i] = 0x7F800000 | (V.vector4_u32[i] & 0x80000000); + } + else + { + Result.vector4_f32[i] = 1.f / V.vector4_f32[i]; + } + } + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_rcp_ps(V); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorReciprocal +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + return XMVectorReciprocalEst(V); + +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_div_ps(g_XMOne,V); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Return an estimated square root +XMFINLINE XMVECTOR XMVectorSqrtEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Select; + + // if (x == +Infinity) sqrt(x) = +Infinity + // if (x == +0.0f) sqrt(x) = +0.0f + // if (x == -0.0f) sqrt(x) = -0.0f + // if (x < 0.0f) sqrt(x) = QNaN + + XMVECTOR Result = XMVectorReciprocalSqrtEst(V); + XMVECTOR Zero = XMVectorZero(); + XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v); + XMVECTOR VEqualsZero = XMVectorEqual(V, Zero); + Result = XMVectorMultiply(V, Result); + Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero); + Result = XMVectorSelect(V, Result, Select); + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_sqrt_ps(V); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorSqrt +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Zero; + XMVECTOR VEqualsInfinity, VEqualsZero; + XMVECTOR Select; + XMVECTOR Result; + + // if (x == +Infinity) sqrt(x) = +Infinity + // if (x == +0.0f) sqrt(x) = +0.0f + // if (x == -0.0f) sqrt(x) = -0.0f + // if (x < 0.0f) sqrt(x) = QNaN + + Result = XMVectorReciprocalSqrt(V); + Zero = XMVectorZero(); + VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v); + VEqualsZero = XMVectorEqual(V, Zero); + Result = XMVectorMultiply(V, Result); + Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero); + Result = XMVectorSelect(V, Result, Select); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_sqrt_ps(V); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorReciprocalSqrtEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + // if (x == +Infinity) rsqrt(x) = 0 + // if (x == +0.0f) rsqrt(x) = +Infinity + // if (x == -0.0f) rsqrt(x) = -Infinity + // if (x < 0.0f) rsqrt(x) = QNaN + + XMVECTOR Result; + UINT i; + + // Avoid C4701 + Result.vector4_f32[0] = 0.0f; + + for (i = 0; i < 4; i++) + { + if (XMISNAN(V.vector4_f32[i])) + { + Result.vector4_u32[i] = 0x7FC00000; + } + else if (V.vector4_f32[i] == 0.0f || V.vector4_f32[i] == -0.0f) + { + Result.vector4_u32[i] = 0x7F800000 | (V.vector4_u32[i] & 0x80000000); + } + else if (V.vector4_f32[i] < 0.0f) + { + Result.vector4_u32[i] = 0x7FFFFFFF; + } + else if (XMISINF(V.vector4_f32[i])) + { + Result.vector4_f32[i] = 0.0f; 
+ } + else + { + Result.vector4_f32[i] = 1.0f / sqrtf(V.vector4_f32[i]); + } + } + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_rsqrt_ps(V); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorReciprocalSqrt +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + return XMVectorReciprocalSqrtEst(V); + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_sqrt_ps(V); + vResult = _mm_div_ps(g_XMOne,vResult); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorExpEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = powf(2.0f, V.vector4_f32[0]); + Result.vector4_f32[1] = powf(2.0f, V.vector4_f32[1]); + Result.vector4_f32[2] = powf(2.0f, V.vector4_f32[2]); + Result.vector4_f32[3] = powf(2.0f, V.vector4_f32[3]); + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_setr_ps( + powf(2.0f,XMVectorGetX(V)), + powf(2.0f,XMVectorGetY(V)), + powf(2.0f,XMVectorGetZ(V)), + powf(2.0f,XMVectorGetW(V))); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMVECTOR XMVectorExp +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR E, S; + XMVECTOR R, R2, R3, R4; + XMVECTOR V0, V1; + XMVECTOR C0X, C0Y, C0Z, C0W; + XMVECTOR C1X, C1Y, C1Z, C1W; + XMVECTOR Result; + static CONST XMVECTOR C0 = {1.0f, -6.93147182e-1f, 2.40226462e-1f, -5.55036440e-2f}; + static CONST XMVECTOR C1 = {9.61597636e-3f, -1.32823968e-3f, 1.47491097e-4f, -1.08635004e-5f}; + + R = XMVectorFloor(V); + E = XMVectorExpEst(R); + R = XMVectorSubtract(V, R); + R2 = XMVectorMultiply(R, R); + R3 = XMVectorMultiply(R, R2); + R4 = XMVectorMultiply(R2, R2); + + C0X = XMVectorSplatX(C0); + C0Y = XMVectorSplatY(C0); + C0Z = XMVectorSplatZ(C0); + C0W = XMVectorSplatW(C0); + + C1X = XMVectorSplatX(C1); + C1Y = XMVectorSplatY(C1); + C1Z = XMVectorSplatZ(C1); + C1W = XMVectorSplatW(C1); + + V0 = XMVectorMultiplyAdd(R, C0Y, C0X); + V0 = XMVectorMultiplyAdd(R2, C0Z, V0); + V0 = XMVectorMultiplyAdd(R3, C0W, V0); + + V1 = XMVectorMultiplyAdd(R, C1Y, C1X); + V1 = XMVectorMultiplyAdd(R2, C1Z, V1); + V1 = XMVectorMultiplyAdd(R3, C1W, V1); + + S = XMVectorMultiplyAdd(R4, V1, V0); + + S = XMVectorReciprocal(S); + Result = XMVectorMultiply(E, S); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + static CONST XMVECTORF32 C0 = {1.0f, -6.93147182e-1f, 2.40226462e-1f, -5.55036440e-2f}; + static CONST XMVECTORF32 C1 = {9.61597636e-3f, -1.32823968e-3f, 1.47491097e-4f, -1.08635004e-5f}; + + // Get the integer of the input + XMVECTOR R = XMVectorFloor(V); + // Get the exponent estimate + XMVECTOR E = XMVectorExpEst(R); + // Get the fractional only + R = _mm_sub_ps(V,R); + // Get R^2 + XMVECTOR R2 = _mm_mul_ps(R,R); + // And R^3 + XMVECTOR R3 = _mm_mul_ps(R,R2); + + XMVECTOR V0 = _mm_load_ps1(&C0.f[1]); + V0 = _mm_mul_ps(V0,R); + XMVECTOR vConstants = _mm_load_ps1(&C0.f[0]); + V0 = _mm_add_ps(V0,vConstants); + vConstants = _mm_load_ps1(&C0.f[2]); + vConstants = _mm_mul_ps(vConstants,R2); + V0 = _mm_add_ps(V0,vConstants); + vConstants = _mm_load_ps1(&C0.f[3]); + vConstants = _mm_mul_ps(vConstants,R3); + V0 = _mm_add_ps(V0,vConstants); + 
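+    // V0 above is the cubic C0[0] + C0[1]*R + C0[2]*R^2 + C0[3]*R^3; V1 below
+    // continues the series with the C1 coefficients.  Together they form a
+    // degree-7 Taylor expansion of 2^-R, so the final division
+    // E / (V0 + R^4*V1) gives approximately 2^floor(V) * 2^frac(V) = 2^V.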
+ XMVECTOR V1 = _mm_load_ps1(&C1.f[1]); + V1 = _mm_mul_ps(V1,R); + vConstants = _mm_load_ps1(&C1.f[0]); + V1 = _mm_add_ps(V1,vConstants); + vConstants = _mm_load_ps1(&C1.f[2]); + vConstants = _mm_mul_ps(vConstants,R2); + V1 = _mm_add_ps(V1,vConstants); + vConstants = _mm_load_ps1(&C1.f[3]); + vConstants = _mm_mul_ps(vConstants,R3); + V1 = _mm_add_ps(V1,vConstants); + // R2 = R^4 + R2 = _mm_mul_ps(R2,R2); + R2 = _mm_mul_ps(R2,V1); + R2 = _mm_add_ps(R2,V0); + E = _mm_div_ps(E,R2); + return E; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorLogEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + FLOAT fScale = (1.0f / logf(2.0f)); + XMVECTOR Result; + + Result.vector4_f32[0] = logf(V.vector4_f32[0])*fScale; + Result.vector4_f32[1] = logf(V.vector4_f32[1])*fScale; + Result.vector4_f32[2] = logf(V.vector4_f32[2])*fScale; + Result.vector4_f32[3] = logf(V.vector4_f32[3])*fScale; + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vScale = _mm_set_ps1(1.0f / logf(2.0f)); + XMVECTOR vResult = _mm_setr_ps( + logf(XMVectorGetX(V)), + logf(XMVectorGetY(V)), + logf(XMVectorGetZ(V)), + logf(XMVectorGetW(V))); + vResult = _mm_mul_ps(vResult,vScale); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMVECTOR XMVectorLog +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + FLOAT fScale = (1.0f / logf(2.0f)); + XMVECTOR Result; + + Result.vector4_f32[0] = logf(V.vector4_f32[0])*fScale; + Result.vector4_f32[1] = logf(V.vector4_f32[1])*fScale; + Result.vector4_f32[2] = logf(V.vector4_f32[2])*fScale; + Result.vector4_f32[3] = logf(V.vector4_f32[3])*fScale; + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vScale = _mm_set_ps1(1.0f / logf(2.0f)); + XMVECTOR vResult = _mm_setr_ps( + logf(XMVectorGetX(V)), + logf(XMVectorGetY(V)), + logf(XMVectorGetZ(V)), + logf(XMVectorGetW(V))); + vResult = _mm_mul_ps(vResult,vScale); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorPowEst +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result.vector4_f32[0] = powf(V1.vector4_f32[0], V2.vector4_f32[0]); + Result.vector4_f32[1] = powf(V1.vector4_f32[1], V2.vector4_f32[1]); + Result.vector4_f32[2] = powf(V1.vector4_f32[2], V2.vector4_f32[2]); + Result.vector4_f32[3] = powf(V1.vector4_f32[3], V2.vector4_f32[3]); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_setr_ps( + powf(XMVectorGetX(V1),XMVectorGetX(V2)), + powf(XMVectorGetY(V1),XMVectorGetY(V2)), + powf(XMVectorGetZ(V1),XMVectorGetZ(V2)), + powf(XMVectorGetW(V1),XMVectorGetW(V2))); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorPow +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) + + return XMVectorPowEst(V1, V2); + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorAbs +( + FXMVECTOR V +) +{ 
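+    // The SSE path computes the absolute value as max(0 - V, V), which avoids
+    // loading a separate sign-bit mask constant at the cost of one subtract.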
+#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult = { + fabsf(V.vector4_f32[0]), + fabsf(V.vector4_f32[1]), + fabsf(V.vector4_f32[2]), + fabsf(V.vector4_f32[3]) + }; + return vResult; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_setzero_ps(); + vResult = _mm_sub_ps(vResult,V); + vResult = _mm_max_ps(vResult,V); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorMod +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Reciprocal; + XMVECTOR Quotient; + XMVECTOR Result; + + // V1 % V2 = V1 - V2 * truncate(V1 / V2) + Reciprocal = XMVectorReciprocal(V2); + Quotient = XMVectorMultiply(V1, Reciprocal); + Quotient = XMVectorTruncate(Quotient); + Result = XMVectorNegativeMultiplySubtract(V2, Quotient, V1); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_div_ps(V1, V2); + vResult = XMVectorTruncate(vResult); + vResult = _mm_mul_ps(vResult,V2); + vResult = _mm_sub_ps(V1,vResult); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorModAngles +( + FXMVECTOR Angles +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + XMVECTOR Result; + + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + V = XMVectorMultiply(Angles, g_XMReciprocalTwoPi.v); + V = XMVectorRound(V); + Result = XMVectorNegativeMultiplySubtract(g_XMTwoPi.v, V, Angles); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + XMVECTOR vResult = _mm_mul_ps(Angles,g_XMReciprocalTwoPi); + // Use the inline function due to complexity for rounding + vResult = XMVectorRound(vResult); + vResult = _mm_mul_ps(vResult,g_XMTwoPi); + vResult = _mm_sub_ps(Angles,vResult); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMVECTOR XMVectorSin +( + FXMVECTOR V +) +{ + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V1, V2, V3, V5, V7, V9, V11, V13, V15, V17, V19, V21, V23; + XMVECTOR S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11; + XMVECTOR Result; + + V1 = XMVectorModAngles(V); + + // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - + // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! 
(for -PI <= V < PI) + V2 = XMVectorMultiply(V1, V1); + V3 = XMVectorMultiply(V2, V1); + V5 = XMVectorMultiply(V3, V2); + V7 = XMVectorMultiply(V5, V2); + V9 = XMVectorMultiply(V7, V2); + V11 = XMVectorMultiply(V9, V2); + V13 = XMVectorMultiply(V11, V2); + V15 = XMVectorMultiply(V13, V2); + V17 = XMVectorMultiply(V15, V2); + V19 = XMVectorMultiply(V17, V2); + V21 = XMVectorMultiply(V19, V2); + V23 = XMVectorMultiply(V21, V2); + + S1 = XMVectorSplatY(g_XMSinCoefficients0.v); + S2 = XMVectorSplatZ(g_XMSinCoefficients0.v); + S3 = XMVectorSplatW(g_XMSinCoefficients0.v); + S4 = XMVectorSplatX(g_XMSinCoefficients1.v); + S5 = XMVectorSplatY(g_XMSinCoefficients1.v); + S6 = XMVectorSplatZ(g_XMSinCoefficients1.v); + S7 = XMVectorSplatW(g_XMSinCoefficients1.v); + S8 = XMVectorSplatX(g_XMSinCoefficients2.v); + S9 = XMVectorSplatY(g_XMSinCoefficients2.v); + S10 = XMVectorSplatZ(g_XMSinCoefficients2.v); + S11 = XMVectorSplatW(g_XMSinCoefficients2.v); + + Result = XMVectorMultiplyAdd(S1, V3, V1); + Result = XMVectorMultiplyAdd(S2, V5, Result); + Result = XMVectorMultiplyAdd(S3, V7, Result); + Result = XMVectorMultiplyAdd(S4, V9, Result); + Result = XMVectorMultiplyAdd(S5, V11, Result); + Result = XMVectorMultiplyAdd(S6, V13, Result); + Result = XMVectorMultiplyAdd(S7, V15, Result); + Result = XMVectorMultiplyAdd(S8, V17, Result); + Result = XMVectorMultiplyAdd(S9, V19, Result); + Result = XMVectorMultiplyAdd(S10, V21, Result); + Result = XMVectorMultiplyAdd(S11, V23, Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR vResult = XMVectorModAngles(V); + // Each on is V to the "num" power + // V2 = V1^2 + XMVECTOR V2 = _mm_mul_ps(vResult,vResult); + // V1^3 + XMVECTOR vPower = _mm_mul_ps(vResult,V2); + XMVECTOR vConstants = _mm_load_ps1(&g_XMSinCoefficients0.f[1]); + vConstants = _mm_mul_ps(vConstants,vPower); + vResult = _mm_add_ps(vResult,vConstants); + + // V^5 + vPower = _mm_mul_ps(vPower,V2); + vConstants = _mm_load_ps1(&g_XMSinCoefficients0.f[2]); + vConstants = _mm_mul_ps(vConstants,vPower); + vResult = _mm_add_ps(vResult,vConstants); + + // V^7 + vPower = _mm_mul_ps(vPower,V2); + vConstants = _mm_load_ps1(&g_XMSinCoefficients0.f[3]); + vConstants = _mm_mul_ps(vConstants,vPower); + vResult = _mm_add_ps(vResult,vConstants); + + // V^9 + vPower = _mm_mul_ps(vPower,V2); + vConstants = _mm_load_ps1(&g_XMSinCoefficients1.f[0]); + vConstants = _mm_mul_ps(vConstants,vPower); + vResult = _mm_add_ps(vResult,vConstants); + + // V^11 + vPower = _mm_mul_ps(vPower,V2); + vConstants = _mm_load_ps1(&g_XMSinCoefficients1.f[1]); + vConstants = _mm_mul_ps(vConstants,vPower); + vResult = _mm_add_ps(vResult,vConstants); + + // V^13 + vPower = _mm_mul_ps(vPower,V2); + vConstants = _mm_load_ps1(&g_XMSinCoefficients1.f[2]); + vConstants = _mm_mul_ps(vConstants,vPower); + vResult = _mm_add_ps(vResult,vConstants); + + // V^15 + vPower = _mm_mul_ps(vPower,V2); + vConstants = _mm_load_ps1(&g_XMSinCoefficients1.f[3]); + vConstants = _mm_mul_ps(vConstants,vPower); + vResult = _mm_add_ps(vResult,vConstants); + + // V^17 + vPower = _mm_mul_ps(vPower,V2); + vConstants = _mm_load_ps1(&g_XMSinCoefficients2.f[0]); + vConstants = _mm_mul_ps(vConstants,vPower); + vResult = _mm_add_ps(vResult,vConstants); + + // V^19 + vPower = _mm_mul_ps(vPower,V2); + vConstants = _mm_load_ps1(&g_XMSinCoefficients2.f[1]); + vConstants = _mm_mul_ps(vConstants,vPower); + vResult = _mm_add_ps(vResult,vConstants); + + // V^21 + vPower = _mm_mul_ps(vPower,V2); + vConstants = 
_mm_load_ps1(&g_XMSinCoefficients2.f[2]); + vConstants = _mm_mul_ps(vConstants,vPower); + vResult = _mm_add_ps(vResult,vConstants); + + // V^23 + vPower = _mm_mul_ps(vPower,V2); + vConstants = _mm_load_ps1(&g_XMSinCoefficients2.f[3]); + vConstants = _mm_mul_ps(vConstants,vPower); + vResult = _mm_add_ps(vResult,vConstants); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMVECTOR XMVectorCos +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V1, V2, V4, V6, V8, V10, V12, V14, V16, V18, V20, V22; + XMVECTOR C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11; + XMVECTOR Result; + + V1 = XMVectorModAngles(V); + + // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! - + // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI) + V2 = XMVectorMultiply(V1, V1); + V4 = XMVectorMultiply(V2, V2); + V6 = XMVectorMultiply(V4, V2); + V8 = XMVectorMultiply(V4, V4); + V10 = XMVectorMultiply(V6, V4); + V12 = XMVectorMultiply(V6, V6); + V14 = XMVectorMultiply(V8, V6); + V16 = XMVectorMultiply(V8, V8); + V18 = XMVectorMultiply(V10, V8); + V20 = XMVectorMultiply(V10, V10); + V22 = XMVectorMultiply(V12, V10); + + C1 = XMVectorSplatY(g_XMCosCoefficients0.v); + C2 = XMVectorSplatZ(g_XMCosCoefficients0.v); + C3 = XMVectorSplatW(g_XMCosCoefficients0.v); + C4 = XMVectorSplatX(g_XMCosCoefficients1.v); + C5 = XMVectorSplatY(g_XMCosCoefficients1.v); + C6 = XMVectorSplatZ(g_XMCosCoefficients1.v); + C7 = XMVectorSplatW(g_XMCosCoefficients1.v); + C8 = XMVectorSplatX(g_XMCosCoefficients2.v); + C9 = XMVectorSplatY(g_XMCosCoefficients2.v); + C10 = XMVectorSplatZ(g_XMCosCoefficients2.v); + C11 = XMVectorSplatW(g_XMCosCoefficients2.v); + + Result = XMVectorMultiplyAdd(C1, V2, g_XMOne.v); + Result = XMVectorMultiplyAdd(C2, V4, Result); + Result = XMVectorMultiplyAdd(C3, V6, Result); + Result = XMVectorMultiplyAdd(C4, V8, Result); + Result = XMVectorMultiplyAdd(C5, V10, Result); + Result = XMVectorMultiplyAdd(C6, V12, Result); + Result = XMVectorMultiplyAdd(C7, V14, Result); + Result = XMVectorMultiplyAdd(C8, V16, Result); + Result = XMVectorMultiplyAdd(C9, V18, Result); + Result = XMVectorMultiplyAdd(C10, V20, Result); + Result = XMVectorMultiplyAdd(C11, V22, Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR V2 = XMVectorModAngles(V); + // Each on is V to the "num" power + // V2 = V1^2 + V2 = _mm_mul_ps(V2,V2); + // V^2 + XMVECTOR vConstants = _mm_load_ps1(&g_XMCosCoefficients0.f[1]); + vConstants = _mm_mul_ps(vConstants,V2); + XMVECTOR vResult = _mm_add_ps(vConstants,g_XMOne); + + // V^4 + XMVECTOR vPower = _mm_mul_ps(V2,V2); + vConstants = _mm_load_ps1(&g_XMCosCoefficients0.f[2]); + vConstants = _mm_mul_ps(vConstants,vPower); + vResult = _mm_add_ps(vResult,vConstants); + + // V^6 + vPower = _mm_mul_ps(vPower,V2); + vConstants = _mm_load_ps1(&g_XMCosCoefficients0.f[3]); + vConstants = _mm_mul_ps(vConstants,vPower); + vResult = _mm_add_ps(vResult,vConstants); + + // V^8 + vPower = _mm_mul_ps(vPower,V2); + vConstants = _mm_load_ps1(&g_XMCosCoefficients1.f[0]); + vConstants = _mm_mul_ps(vConstants,vPower); + vResult = _mm_add_ps(vResult,vConstants); + + // V^10 + vPower = _mm_mul_ps(vPower,V2); + vConstants = _mm_load_ps1(&g_XMCosCoefficients1.f[1]); + vConstants = _mm_mul_ps(vConstants,vPower); + vResult = _mm_add_ps(vResult,vConstants); + + // V^12 + 
vPower = _mm_mul_ps(vPower,V2); + vConstants = _mm_load_ps1(&g_XMCosCoefficients1.f[2]); + vConstants = _mm_mul_ps(vConstants,vPower); + vResult = _mm_add_ps(vResult,vConstants); + + // V^14 + vPower = _mm_mul_ps(vPower,V2); + vConstants = _mm_load_ps1(&g_XMCosCoefficients1.f[3]); + vConstants = _mm_mul_ps(vConstants,vPower); + vResult = _mm_add_ps(vResult,vConstants); + + // V^16 + vPower = _mm_mul_ps(vPower,V2); + vConstants = _mm_load_ps1(&g_XMCosCoefficients2.f[0]); + vConstants = _mm_mul_ps(vConstants,vPower); + vResult = _mm_add_ps(vResult,vConstants); + + // V^18 + vPower = _mm_mul_ps(vPower,V2); + vConstants = _mm_load_ps1(&g_XMCosCoefficients2.f[1]); + vConstants = _mm_mul_ps(vConstants,vPower); + vResult = _mm_add_ps(vResult,vConstants); + + // V^20 + vPower = _mm_mul_ps(vPower,V2); + vConstants = _mm_load_ps1(&g_XMCosCoefficients2.f[2]); + vConstants = _mm_mul_ps(vConstants,vPower); + vResult = _mm_add_ps(vResult,vConstants); + + // V^22 + vPower = _mm_mul_ps(vPower,V2); + vConstants = _mm_load_ps1(&g_XMCosCoefficients2.f[3]); + vConstants = _mm_mul_ps(vConstants,vPower); + vResult = _mm_add_ps(vResult,vConstants); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE VOID XMVectorSinCos +( + XMVECTOR* pSin, + XMVECTOR* pCos, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13; + XMVECTOR V14, V15, V16, V17, V18, V19, V20, V21, V22, V23; + XMVECTOR S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11; + XMVECTOR C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11; + XMVECTOR Sin, Cos; + + XMASSERT(pSin); + XMASSERT(pCos); + + V1 = XMVectorModAngles(V); + + // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - + // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI) + // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! - + // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! 
(for -PI <= V < PI) + + V2 = XMVectorMultiply(V1, V1); + V3 = XMVectorMultiply(V2, V1); + V4 = XMVectorMultiply(V2, V2); + V5 = XMVectorMultiply(V3, V2); + V6 = XMVectorMultiply(V3, V3); + V7 = XMVectorMultiply(V4, V3); + V8 = XMVectorMultiply(V4, V4); + V9 = XMVectorMultiply(V5, V4); + V10 = XMVectorMultiply(V5, V5); + V11 = XMVectorMultiply(V6, V5); + V12 = XMVectorMultiply(V6, V6); + V13 = XMVectorMultiply(V7, V6); + V14 = XMVectorMultiply(V7, V7); + V15 = XMVectorMultiply(V8, V7); + V16 = XMVectorMultiply(V8, V8); + V17 = XMVectorMultiply(V9, V8); + V18 = XMVectorMultiply(V9, V9); + V19 = XMVectorMultiply(V10, V9); + V20 = XMVectorMultiply(V10, V10); + V21 = XMVectorMultiply(V11, V10); + V22 = XMVectorMultiply(V11, V11); + V23 = XMVectorMultiply(V12, V11); + + S1 = XMVectorSplatY(g_XMSinCoefficients0.v); + S2 = XMVectorSplatZ(g_XMSinCoefficients0.v); + S3 = XMVectorSplatW(g_XMSinCoefficients0.v); + S4 = XMVectorSplatX(g_XMSinCoefficients1.v); + S5 = XMVectorSplatY(g_XMSinCoefficients1.v); + S6 = XMVectorSplatZ(g_XMSinCoefficients1.v); + S7 = XMVectorSplatW(g_XMSinCoefficients1.v); + S8 = XMVectorSplatX(g_XMSinCoefficients2.v); + S9 = XMVectorSplatY(g_XMSinCoefficients2.v); + S10 = XMVectorSplatZ(g_XMSinCoefficients2.v); + S11 = XMVectorSplatW(g_XMSinCoefficients2.v); + + C1 = XMVectorSplatY(g_XMCosCoefficients0.v); + C2 = XMVectorSplatZ(g_XMCosCoefficients0.v); + C3 = XMVectorSplatW(g_XMCosCoefficients0.v); + C4 = XMVectorSplatX(g_XMCosCoefficients1.v); + C5 = XMVectorSplatY(g_XMCosCoefficients1.v); + C6 = XMVectorSplatZ(g_XMCosCoefficients1.v); + C7 = XMVectorSplatW(g_XMCosCoefficients1.v); + C8 = XMVectorSplatX(g_XMCosCoefficients2.v); + C9 = XMVectorSplatY(g_XMCosCoefficients2.v); + C10 = XMVectorSplatZ(g_XMCosCoefficients2.v); + C11 = XMVectorSplatW(g_XMCosCoefficients2.v); + + Sin = XMVectorMultiplyAdd(S1, V3, V1); + Sin = XMVectorMultiplyAdd(S2, V5, Sin); + Sin = XMVectorMultiplyAdd(S3, V7, Sin); + Sin = XMVectorMultiplyAdd(S4, V9, Sin); + Sin = XMVectorMultiplyAdd(S5, V11, Sin); + Sin = XMVectorMultiplyAdd(S6, V13, Sin); + Sin = XMVectorMultiplyAdd(S7, V15, Sin); + Sin = XMVectorMultiplyAdd(S8, V17, Sin); + Sin = XMVectorMultiplyAdd(S9, V19, Sin); + Sin = XMVectorMultiplyAdd(S10, V21, Sin); + Sin = XMVectorMultiplyAdd(S11, V23, Sin); + + Cos = XMVectorMultiplyAdd(C1, V2, g_XMOne.v); + Cos = XMVectorMultiplyAdd(C2, V4, Cos); + Cos = XMVectorMultiplyAdd(C3, V6, Cos); + Cos = XMVectorMultiplyAdd(C4, V8, Cos); + Cos = XMVectorMultiplyAdd(C5, V10, Cos); + Cos = XMVectorMultiplyAdd(C6, V12, Cos); + Cos = XMVectorMultiplyAdd(C7, V14, Cos); + Cos = XMVectorMultiplyAdd(C8, V16, Cos); + Cos = XMVectorMultiplyAdd(C9, V18, Cos); + Cos = XMVectorMultiplyAdd(C10, V20, Cos); + Cos = XMVectorMultiplyAdd(C11, V22, Cos); + + *pSin = Sin; + *pCos = Cos; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSin); + XMASSERT(pCos); + XMVECTOR V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13; + XMVECTOR V14, V15, V16, V17, V18, V19, V20, V21, V22, V23; + XMVECTOR S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11; + XMVECTOR C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11; + XMVECTOR Sin, Cos; + + V1 = XMVectorModAngles(V); + + // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - + // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI) + // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! - + // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! 
(for -PI <= V < PI) + + V2 = XMVectorMultiply(V1, V1); + V3 = XMVectorMultiply(V2, V1); + V4 = XMVectorMultiply(V2, V2); + V5 = XMVectorMultiply(V3, V2); + V6 = XMVectorMultiply(V3, V3); + V7 = XMVectorMultiply(V4, V3); + V8 = XMVectorMultiply(V4, V4); + V9 = XMVectorMultiply(V5, V4); + V10 = XMVectorMultiply(V5, V5); + V11 = XMVectorMultiply(V6, V5); + V12 = XMVectorMultiply(V6, V6); + V13 = XMVectorMultiply(V7, V6); + V14 = XMVectorMultiply(V7, V7); + V15 = XMVectorMultiply(V8, V7); + V16 = XMVectorMultiply(V8, V8); + V17 = XMVectorMultiply(V9, V8); + V18 = XMVectorMultiply(V9, V9); + V19 = XMVectorMultiply(V10, V9); + V20 = XMVectorMultiply(V10, V10); + V21 = XMVectorMultiply(V11, V10); + V22 = XMVectorMultiply(V11, V11); + V23 = XMVectorMultiply(V12, V11); + + S1 = _mm_load_ps1(&g_XMSinCoefficients0.f[1]); + S2 = _mm_load_ps1(&g_XMSinCoefficients0.f[2]); + S3 = _mm_load_ps1(&g_XMSinCoefficients0.f[3]); + S4 = _mm_load_ps1(&g_XMSinCoefficients1.f[0]); + S5 = _mm_load_ps1(&g_XMSinCoefficients1.f[1]); + S6 = _mm_load_ps1(&g_XMSinCoefficients1.f[2]); + S7 = _mm_load_ps1(&g_XMSinCoefficients1.f[3]); + S8 = _mm_load_ps1(&g_XMSinCoefficients2.f[0]); + S9 = _mm_load_ps1(&g_XMSinCoefficients2.f[1]); + S10 = _mm_load_ps1(&g_XMSinCoefficients2.f[2]); + S11 = _mm_load_ps1(&g_XMSinCoefficients2.f[3]); + + C1 = _mm_load_ps1(&g_XMCosCoefficients0.f[1]); + C2 = _mm_load_ps1(&g_XMCosCoefficients0.f[2]); + C3 = _mm_load_ps1(&g_XMCosCoefficients0.f[3]); + C4 = _mm_load_ps1(&g_XMCosCoefficients1.f[0]); + C5 = _mm_load_ps1(&g_XMCosCoefficients1.f[1]); + C6 = _mm_load_ps1(&g_XMCosCoefficients1.f[2]); + C7 = _mm_load_ps1(&g_XMCosCoefficients1.f[3]); + C8 = _mm_load_ps1(&g_XMCosCoefficients2.f[0]); + C9 = _mm_load_ps1(&g_XMCosCoefficients2.f[1]); + C10 = _mm_load_ps1(&g_XMCosCoefficients2.f[2]); + C11 = _mm_load_ps1(&g_XMCosCoefficients2.f[3]); + + S1 = _mm_mul_ps(S1,V3); + Sin = _mm_add_ps(S1,V1); + Sin = XMVectorMultiplyAdd(S2, V5, Sin); + Sin = XMVectorMultiplyAdd(S3, V7, Sin); + Sin = XMVectorMultiplyAdd(S4, V9, Sin); + Sin = XMVectorMultiplyAdd(S5, V11, Sin); + Sin = XMVectorMultiplyAdd(S6, V13, Sin); + Sin = XMVectorMultiplyAdd(S7, V15, Sin); + Sin = XMVectorMultiplyAdd(S8, V17, Sin); + Sin = XMVectorMultiplyAdd(S9, V19, Sin); + Sin = XMVectorMultiplyAdd(S10, V21, Sin); + Sin = XMVectorMultiplyAdd(S11, V23, Sin); + + Cos = _mm_mul_ps(C1,V2); + Cos = _mm_add_ps(Cos,g_XMOne); + Cos = XMVectorMultiplyAdd(C2, V4, Cos); + Cos = XMVectorMultiplyAdd(C3, V6, Cos); + Cos = XMVectorMultiplyAdd(C4, V8, Cos); + Cos = XMVectorMultiplyAdd(C5, V10, Cos); + Cos = XMVectorMultiplyAdd(C6, V12, Cos); + Cos = XMVectorMultiplyAdd(C7, V14, Cos); + Cos = XMVectorMultiplyAdd(C8, V16, Cos); + Cos = XMVectorMultiplyAdd(C9, V18, Cos); + Cos = XMVectorMultiplyAdd(C10, V20, Cos); + Cos = XMVectorMultiplyAdd(C11, V22, Cos); + + *pSin = Sin; + *pCos = Cos; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMVECTOR XMVectorTan +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + // Cody and Waite algorithm to compute tangent. 
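+    // Outline: the quadrant index VA = round(V * 2/Pi) is removed from V in
+    // two steps using the constant pair C0, C1 (together approximating Pi/2,
+    // split so the subtraction keeps extra bits), leaving VC in roughly
+    // [-Pi/4, Pi/4].  tan(VC) is then evaluated as a rational approximation
+    // N(VC)/D(VC); odd quadrants use -D/N instead, since
+    // tan(x + Pi/2) = -1/tan(x).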
+ + XMVECTOR VA, VB, VC, VC2; + XMVECTOR T0, T1, T2, T3, T4, T5, T6, T7; + XMVECTOR C0, C1, TwoDivPi, Epsilon; + XMVECTOR N, D; + XMVECTOR R0, R1; + XMVECTOR VIsZero, VCNearZero, VBIsEven; + XMVECTOR Zero; + XMVECTOR Result; + UINT i; + static CONST XMVECTOR TanCoefficients0 = {1.0f, -4.667168334e-1f, 2.566383229e-2f, -3.118153191e-4f}; + static CONST XMVECTOR TanCoefficients1 = {4.981943399e-7f, -1.333835001e-1f, 3.424887824e-3f, -1.786170734e-5f}; + static CONST XMVECTOR TanConstants = {1.570796371f, 6.077100628e-11f, 0.000244140625f, 2.0f / XM_PI}; + static CONST XMVECTORU32 Mask = {0x1, 0x1, 0x1, 0x1}; + + TwoDivPi = XMVectorSplatW(TanConstants); + + Zero = XMVectorZero(); + + C0 = XMVectorSplatX(TanConstants); + C1 = XMVectorSplatY(TanConstants); + Epsilon = XMVectorSplatZ(TanConstants); + + VA = XMVectorMultiply(V, TwoDivPi); + + VA = XMVectorRound(VA); + + VC = XMVectorNegativeMultiplySubtract(VA, C0, V); + + VB = XMVectorAbs(VA); + + VC = XMVectorNegativeMultiplySubtract(VA, C1, VC); + + for (i = 0; i < 4; i++) + { + VB.vector4_u32[i] = (UINT)VB.vector4_f32[i]; + } + + VC2 = XMVectorMultiply(VC, VC); + + T7 = XMVectorSplatW(TanCoefficients1); + T6 = XMVectorSplatZ(TanCoefficients1); + T4 = XMVectorSplatX(TanCoefficients1); + T3 = XMVectorSplatW(TanCoefficients0); + T5 = XMVectorSplatY(TanCoefficients1); + T2 = XMVectorSplatZ(TanCoefficients0); + T1 = XMVectorSplatY(TanCoefficients0); + T0 = XMVectorSplatX(TanCoefficients0); + + VBIsEven = XMVectorAndInt(VB, Mask.v); + VBIsEven = XMVectorEqualInt(VBIsEven, Zero); + + N = XMVectorMultiplyAdd(VC2, T7, T6); + D = XMVectorMultiplyAdd(VC2, T4, T3); + N = XMVectorMultiplyAdd(VC2, N, T5); + D = XMVectorMultiplyAdd(VC2, D, T2); + N = XMVectorMultiply(VC2, N); + D = XMVectorMultiplyAdd(VC2, D, T1); + N = XMVectorMultiplyAdd(VC, N, VC); + VCNearZero = XMVectorInBounds(VC, Epsilon); + D = XMVectorMultiplyAdd(VC2, D, T0); + + N = XMVectorSelect(N, VC, VCNearZero); + D = XMVectorSelect(D, g_XMOne.v, VCNearZero); + + R0 = XMVectorNegate(N); + R1 = XMVectorReciprocal(D); + R0 = XMVectorReciprocal(R0); + R1 = XMVectorMultiply(N, R1); + R0 = XMVectorMultiply(D, R0); + + VIsZero = XMVectorEqual(V, Zero); + + Result = XMVectorSelect(R0, R1, VBIsEven); + + Result = XMVectorSelect(Result, Zero, VIsZero); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Cody and Waite algorithm to compute tangent. 
+ + XMVECTOR VA, VB, VC, VC2; + XMVECTOR T0, T1, T2, T3, T4, T5, T6, T7; + XMVECTOR C0, C1, TwoDivPi, Epsilon; + XMVECTOR N, D; + XMVECTOR R0, R1; + XMVECTOR VIsZero, VCNearZero, VBIsEven; + XMVECTOR Zero; + XMVECTOR Result; + static CONST XMVECTORF32 TanCoefficients0 = {1.0f, -4.667168334e-1f, 2.566383229e-2f, -3.118153191e-4f}; + static CONST XMVECTORF32 TanCoefficients1 = {4.981943399e-7f, -1.333835001e-1f, 3.424887824e-3f, -1.786170734e-5f}; + static CONST XMVECTORF32 TanConstants = {1.570796371f, 6.077100628e-11f, 0.000244140625f, 2.0f / XM_PI}; + static CONST XMVECTORI32 Mask = {0x1, 0x1, 0x1, 0x1}; + + TwoDivPi = XMVectorSplatW(TanConstants); + + Zero = XMVectorZero(); + + C0 = XMVectorSplatX(TanConstants); + C1 = XMVectorSplatY(TanConstants); + Epsilon = XMVectorSplatZ(TanConstants); + + VA = XMVectorMultiply(V, TwoDivPi); + + VA = XMVectorRound(VA); + + VC = XMVectorNegativeMultiplySubtract(VA, C0, V); + + VB = XMVectorAbs(VA); + + VC = XMVectorNegativeMultiplySubtract(VA, C1, VC); + + reinterpret_cast<__m128i *>(&VB)[0] = _mm_cvttps_epi32(VB); + + VC2 = XMVectorMultiply(VC, VC); + + T7 = XMVectorSplatW(TanCoefficients1); + T6 = XMVectorSplatZ(TanCoefficients1); + T4 = XMVectorSplatX(TanCoefficients1); + T3 = XMVectorSplatW(TanCoefficients0); + T5 = XMVectorSplatY(TanCoefficients1); + T2 = XMVectorSplatZ(TanCoefficients0); + T1 = XMVectorSplatY(TanCoefficients0); + T0 = XMVectorSplatX(TanCoefficients0); + + VBIsEven = XMVectorAndInt(VB,Mask); + VBIsEven = XMVectorEqualInt(VBIsEven, Zero); + + N = XMVectorMultiplyAdd(VC2, T7, T6); + D = XMVectorMultiplyAdd(VC2, T4, T3); + N = XMVectorMultiplyAdd(VC2, N, T5); + D = XMVectorMultiplyAdd(VC2, D, T2); + N = XMVectorMultiply(VC2, N); + D = XMVectorMultiplyAdd(VC2, D, T1); + N = XMVectorMultiplyAdd(VC, N, VC); + VCNearZero = XMVectorInBounds(VC, Epsilon); + D = XMVectorMultiplyAdd(VC2, D, T0); + + N = XMVectorSelect(N, VC, VCNearZero); + D = XMVectorSelect(D, g_XMOne, VCNearZero); + R0 = XMVectorNegate(N); + R1 = _mm_div_ps(N,D); + R0 = _mm_div_ps(D,R0); + VIsZero = XMVectorEqual(V, Zero); + Result = XMVectorSelect(R0, R1, VBIsEven); + Result = XMVectorSelect(Result, Zero, VIsZero); + + return Result; + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMVECTOR XMVectorSinH +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V1, V2; + XMVECTOR E1, E2; + XMVECTOR Result; + static CONST XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) + + V1 = XMVectorMultiplyAdd(V, Scale.v, g_XMNegativeOne.v); + V2 = XMVectorNegativeMultiplySubtract(V, Scale.v, g_XMNegativeOne.v); + + E1 = XMVectorExp(V1); + E2 = XMVectorExp(V2); + + Result = XMVectorSubtract(E1, E2); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR V1, V2; + XMVECTOR E1, E2; + XMVECTOR Result; + static CONST XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) + + V1 = _mm_mul_ps(V, Scale); + V1 = _mm_add_ps(V1,g_XMNegativeOne); + V2 = _mm_mul_ps(V, Scale); + V2 = _mm_sub_ps(g_XMNegativeOne,V2); + E1 = XMVectorExp(V1); + E2 = XMVectorExp(V2); + + Result = _mm_sub_ps(E1, E2); + + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMVECTOR XMVectorCosH +( + FXMVECTOR V +) +{ 
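+    // cosh(V) = (e^V + e^-V) / 2.  Scale = 1/ln(2) converts the exponents to
+    // base 2, and the g_XMNegativeOne term folds the divide-by-two into the
+    // exponent: 2^(V/ln2 - 1) + 2^(-V/ln2 - 1) == (e^V + e^-V) / 2.
+    // XMVectorSinH above is identical except that it subtracts the two terms.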
+#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V1, V2; + XMVECTOR E1, E2; + XMVECTOR Result; + static CONST XMVECTOR Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) + + V1 = XMVectorMultiplyAdd(V, Scale, g_XMNegativeOne.v); + V2 = XMVectorNegativeMultiplySubtract(V, Scale, g_XMNegativeOne.v); + + E1 = XMVectorExp(V1); + E2 = XMVectorExp(V2); + + Result = XMVectorAdd(E1, E2); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR V1, V2; + XMVECTOR E1, E2; + XMVECTOR Result; + static CONST XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) + + V1 = _mm_mul_ps(V,Scale); + V1 = _mm_add_ps(V1,g_XMNegativeOne); + V2 = _mm_mul_ps(V, Scale); + V2 = _mm_sub_ps(g_XMNegativeOne,V2); + E1 = XMVectorExp(V1); + E2 = XMVectorExp(V2); + Result = _mm_add_ps(E1, E2); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMVECTOR XMVectorTanH +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR E; + XMVECTOR Result; + static CONST XMVECTORF32 Scale = {2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f}; // 2.0f / ln(2.0f) + + E = XMVectorMultiply(V, Scale.v); + E = XMVectorExp(E); + E = XMVectorMultiplyAdd(E, g_XMOneHalf.v, g_XMOneHalf.v); + E = XMVectorReciprocal(E); + + Result = XMVectorSubtract(g_XMOne.v, E); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + static CONST XMVECTORF32 Scale = {2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f}; // 2.0f / ln(2.0f) + + XMVECTOR E = _mm_mul_ps(V, Scale); + E = XMVectorExp(E); + E = _mm_mul_ps(E,g_XMOneHalf); + E = _mm_add_ps(E,g_XMOneHalf); + E = XMVectorReciprocal(E); + E = _mm_sub_ps(g_XMOne, E); + return E; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMVECTOR XMVectorASin +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V2, V3, AbsV; + XMVECTOR C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11; + XMVECTOR R0, R1, R2, R3, R4; + XMVECTOR OneMinusAbsV; + XMVECTOR Rsq; + XMVECTOR Result; + static CONST XMVECTOR OnePlusEpsilon = {1.00000011921f, 1.00000011921f, 1.00000011921f, 1.00000011921f}; + + // asin(V) = V * (C0 + C1 * V + C2 * V^2 + C3 * V^3 + C4 * V^4 + C5 * V^5) + (1 - V) * rsq(1 - V) * + // V * (C6 + C7 * V + C8 * V^2 + C9 * V^3 + C10 * V^4 + C11 * V^5) + + AbsV = XMVectorAbs(V); + + V2 = XMVectorMultiply(V, V); + V3 = XMVectorMultiply(V2, AbsV); + + R4 = XMVectorNegativeMultiplySubtract(AbsV, V, V); + + OneMinusAbsV = XMVectorSubtract(OnePlusEpsilon, AbsV); + Rsq = XMVectorReciprocalSqrt(OneMinusAbsV); + + C0 = XMVectorSplatX(g_XMASinCoefficients0.v); + C1 = XMVectorSplatY(g_XMASinCoefficients0.v); + C2 = XMVectorSplatZ(g_XMASinCoefficients0.v); + C3 = XMVectorSplatW(g_XMASinCoefficients0.v); + + C4 = XMVectorSplatX(g_XMASinCoefficients1.v); + C5 = XMVectorSplatY(g_XMASinCoefficients1.v); + C6 = XMVectorSplatZ(g_XMASinCoefficients1.v); + C7 = XMVectorSplatW(g_XMASinCoefficients1.v); + + C8 = XMVectorSplatX(g_XMASinCoefficients2.v); + C9 = XMVectorSplatY(g_XMASinCoefficients2.v); + C10 = XMVectorSplatZ(g_XMASinCoefficients2.v); + C11 = XMVectorSplatW(g_XMASinCoefficients2.v); + + R0 = XMVectorMultiplyAdd(C3, AbsV, C7); + R1 = XMVectorMultiplyAdd(C1, AbsV, 
C5); + R2 = XMVectorMultiplyAdd(C2, AbsV, C6); + R3 = XMVectorMultiplyAdd(C0, AbsV, C4); + + R0 = XMVectorMultiplyAdd(R0, AbsV, C11); + R1 = XMVectorMultiplyAdd(R1, AbsV, C9); + R2 = XMVectorMultiplyAdd(R2, AbsV, C10); + R3 = XMVectorMultiplyAdd(R3, AbsV, C8); + + R0 = XMVectorMultiplyAdd(R2, V3, R0); + R1 = XMVectorMultiplyAdd(R3, V3, R1); + + R0 = XMVectorMultiply(V, R0); + R1 = XMVectorMultiply(R4, R1); + + Result = XMVectorMultiplyAdd(R1, Rsq, R0); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + static CONST XMVECTORF32 OnePlusEpsilon = {1.00000011921f, 1.00000011921f, 1.00000011921f, 1.00000011921f}; + + // asin(V) = V * (C0 + C1 * V + C2 * V^2 + C3 * V^3 + C4 * V^4 + C5 * V^5) + (1 - V) * rsq(1 - V) * + // V * (C6 + C7 * V + C8 * V^2 + C9 * V^3 + C10 * V^4 + C11 * V^5) + // Get abs(V) + XMVECTOR vAbsV = _mm_setzero_ps(); + vAbsV = _mm_sub_ps(vAbsV,V); + vAbsV = _mm_max_ps(vAbsV,V); + + XMVECTOR R0 = vAbsV; + XMVECTOR vConstants = _mm_load_ps1(&g_XMASinCoefficients0.f[3]); + R0 = _mm_mul_ps(R0,vConstants); + vConstants = _mm_load_ps1(&g_XMASinCoefficients1.f[3]); + R0 = _mm_add_ps(R0,vConstants); + + XMVECTOR R1 = vAbsV; + vConstants = _mm_load_ps1(&g_XMASinCoefficients0.f[1]); + R1 = _mm_mul_ps(R1,vConstants); + vConstants = _mm_load_ps1(&g_XMASinCoefficients1.f[1]); + R1 = _mm_add_ps(R1, vConstants); + + XMVECTOR R2 = vAbsV; + vConstants = _mm_load_ps1(&g_XMASinCoefficients0.f[2]); + R2 = _mm_mul_ps(R2,vConstants); + vConstants = _mm_load_ps1(&g_XMASinCoefficients1.f[2]); + R2 = _mm_add_ps(R2, vConstants); + + XMVECTOR R3 = vAbsV; + vConstants = _mm_load_ps1(&g_XMASinCoefficients0.f[0]); + R3 = _mm_mul_ps(R3,vConstants); + vConstants = _mm_load_ps1(&g_XMASinCoefficients1.f[0]); + R3 = _mm_add_ps(R3, vConstants); + + vConstants = _mm_load_ps1(&g_XMASinCoefficients2.f[3]); + R0 = _mm_mul_ps(R0,vAbsV); + R0 = _mm_add_ps(R0,vConstants); + + vConstants = _mm_load_ps1(&g_XMASinCoefficients2.f[1]); + R1 = _mm_mul_ps(R1,vAbsV); + R1 = _mm_add_ps(R1,vConstants); + + vConstants = _mm_load_ps1(&g_XMASinCoefficients2.f[2]); + R2 = _mm_mul_ps(R2,vAbsV); + R2 = _mm_add_ps(R2,vConstants); + + vConstants = _mm_load_ps1(&g_XMASinCoefficients2.f[0]); + R3 = _mm_mul_ps(R3,vAbsV); + R3 = _mm_add_ps(R3,vConstants); + + // V3 = V^3 + vConstants = _mm_mul_ps(V,V); + vConstants = _mm_mul_ps(vConstants, vAbsV); + // Mul by V^3 + R2 = _mm_mul_ps(R2,vConstants); + R3 = _mm_mul_ps(R3,vConstants); + // Merge the results + R0 = _mm_add_ps(R0,R2); + R1 = _mm_add_ps(R1,R3); + + R0 = _mm_mul_ps(R0,V); + // vConstants = V-(V^2 retaining sign) + vConstants = _mm_mul_ps(vAbsV, V); + vConstants = _mm_sub_ps(V,vConstants); + R1 = _mm_mul_ps(R1,vConstants); + vConstants = _mm_sub_ps(OnePlusEpsilon,vAbsV); + // Do NOT use rsqrt/mul. 
This needs the precision + vConstants = _mm_sqrt_ps(vConstants); + R1 = _mm_div_ps(R1,vConstants); + R0 = _mm_add_ps(R0,R1); + return R0; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMVECTOR XMVectorACos +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V2, V3, AbsV; + XMVECTOR C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11; + XMVECTOR R0, R1, R2, R3, R4; + XMVECTOR OneMinusAbsV; + XMVECTOR Rsq; + XMVECTOR Result; + static CONST XMVECTOR OnePlusEpsilon = {1.00000011921f, 1.00000011921f, 1.00000011921f, 1.00000011921f}; + + // acos(V) = PI / 2 - asin(V) + + AbsV = XMVectorAbs(V); + + V2 = XMVectorMultiply(V, V); + V3 = XMVectorMultiply(V2, AbsV); + + R4 = XMVectorNegativeMultiplySubtract(AbsV, V, V); + + OneMinusAbsV = XMVectorSubtract(OnePlusEpsilon, AbsV); + Rsq = XMVectorReciprocalSqrt(OneMinusAbsV); + + C0 = XMVectorSplatX(g_XMASinCoefficients0.v); + C1 = XMVectorSplatY(g_XMASinCoefficients0.v); + C2 = XMVectorSplatZ(g_XMASinCoefficients0.v); + C3 = XMVectorSplatW(g_XMASinCoefficients0.v); + + C4 = XMVectorSplatX(g_XMASinCoefficients1.v); + C5 = XMVectorSplatY(g_XMASinCoefficients1.v); + C6 = XMVectorSplatZ(g_XMASinCoefficients1.v); + C7 = XMVectorSplatW(g_XMASinCoefficients1.v); + + C8 = XMVectorSplatX(g_XMASinCoefficients2.v); + C9 = XMVectorSplatY(g_XMASinCoefficients2.v); + C10 = XMVectorSplatZ(g_XMASinCoefficients2.v); + C11 = XMVectorSplatW(g_XMASinCoefficients2.v); + + R0 = XMVectorMultiplyAdd(C3, AbsV, C7); + R1 = XMVectorMultiplyAdd(C1, AbsV, C5); + R2 = XMVectorMultiplyAdd(C2, AbsV, C6); + R3 = XMVectorMultiplyAdd(C0, AbsV, C4); + + R0 = XMVectorMultiplyAdd(R0, AbsV, C11); + R1 = XMVectorMultiplyAdd(R1, AbsV, C9); + R2 = XMVectorMultiplyAdd(R2, AbsV, C10); + R3 = XMVectorMultiplyAdd(R3, AbsV, C8); + + R0 = XMVectorMultiplyAdd(R2, V3, R0); + R1 = XMVectorMultiplyAdd(R3, V3, R1); + + R0 = XMVectorMultiply(V, R0); + R1 = XMVectorMultiply(R4, R1); + + Result = XMVectorMultiplyAdd(R1, Rsq, R0); + + Result = XMVectorSubtract(g_XMHalfPi.v, Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + static CONST XMVECTORF32 OnePlusEpsilon = {1.00000011921f, 1.00000011921f, 1.00000011921f, 1.00000011921f}; + // Uses only 6 registers for good code on x86 targets + // acos(V) = PI / 2 - asin(V) + // Get abs(V) + XMVECTOR vAbsV = _mm_setzero_ps(); + vAbsV = _mm_sub_ps(vAbsV,V); + vAbsV = _mm_max_ps(vAbsV,V); + // Perform the series in precision groups to + // retain precision across 20 bits. 
(3 bits of imprecision due to operations) + XMVECTOR R0 = vAbsV; + XMVECTOR vConstants = _mm_load_ps1(&g_XMASinCoefficients0.f[3]); + R0 = _mm_mul_ps(R0,vConstants); + vConstants = _mm_load_ps1(&g_XMASinCoefficients1.f[3]); + R0 = _mm_add_ps(R0,vConstants); + R0 = _mm_mul_ps(R0,vAbsV); + vConstants = _mm_load_ps1(&g_XMASinCoefficients2.f[3]); + R0 = _mm_add_ps(R0,vConstants); + + XMVECTOR R1 = vAbsV; + vConstants = _mm_load_ps1(&g_XMASinCoefficients0.f[1]); + R1 = _mm_mul_ps(R1,vConstants); + vConstants = _mm_load_ps1(&g_XMASinCoefficients1.f[1]); + R1 = _mm_add_ps(R1,vConstants); + R1 = _mm_mul_ps(R1, vAbsV); + vConstants = _mm_load_ps1(&g_XMASinCoefficients2.f[1]); + R1 = _mm_add_ps(R1,vConstants); + + XMVECTOR R2 = vAbsV; + vConstants = _mm_load_ps1(&g_XMASinCoefficients0.f[2]); + R2 = _mm_mul_ps(R2,vConstants); + vConstants = _mm_load_ps1(&g_XMASinCoefficients1.f[2]); + R2 = _mm_add_ps(R2,vConstants); + R2 = _mm_mul_ps(R2, vAbsV); + vConstants = _mm_load_ps1(&g_XMASinCoefficients2.f[2]); + R2 = _mm_add_ps(R2,vConstants); + + XMVECTOR R3 = vAbsV; + vConstants = _mm_load_ps1(&g_XMASinCoefficients0.f[0]); + R3 = _mm_mul_ps(R3,vConstants); + vConstants = _mm_load_ps1(&g_XMASinCoefficients1.f[0]); + R3 = _mm_add_ps(R3,vConstants); + R3 = _mm_mul_ps(R3, vAbsV); + vConstants = _mm_load_ps1(&g_XMASinCoefficients2.f[0]); + R3 = _mm_add_ps(R3,vConstants); + + // vConstants = V^3 + vConstants = _mm_mul_ps(V,V); + vConstants = _mm_mul_ps(vConstants,vAbsV); + R2 = _mm_mul_ps(R2,vConstants); + R3 = _mm_mul_ps(R3,vConstants); + // Add the pair of values together here to retain + // as much precision as possible + R0 = _mm_add_ps(R0,R2); + R1 = _mm_add_ps(R1,R3); + + R0 = _mm_mul_ps(R0,V); + // vConstants = V-(V*abs(V)) + vConstants = _mm_mul_ps(V,vAbsV); + vConstants = _mm_sub_ps(V,vConstants); + R1 = _mm_mul_ps(R1,vConstants); + // Episilon exists to allow 1.0 as an answer + vConstants = _mm_sub_ps(OnePlusEpsilon, vAbsV); + // Use sqrt instead of rsqrt for precision + vConstants = _mm_sqrt_ps(vConstants); + R1 = _mm_div_ps(R1,vConstants); + R1 = _mm_add_ps(R1,R0); + vConstants = _mm_sub_ps(g_XMHalfPi,R1); + return vConstants; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMVECTOR XMVectorATan +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + // Cody and Waite algorithm to compute inverse tangent. 
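+    // Outline: work with F = |V|.  For F > 1 the reciprocal is used and Pi/2
+    // is added afterwards (atan(x) = Pi/2 - atan(1/x)); for F > 2 - sqrt(3)
+    // the identity atan(F) = Pi/6 + atan((sqrt(3)*F - 1) / (F + sqrt(3)))
+    // reduces the argument further.  A rational approximation N(G)/D(G) in
+    // G = F^2 is then applied, the sign and quadrant adjustments are undone,
+    // and inputs beyond +/-MaxV saturate to +/-Pi/2.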
+ + XMVECTOR N, D; + XMVECTOR VF, G, ReciprocalF, AbsF, FA, FB; + XMVECTOR Sqrt3, Sqrt3MinusOne, TwoMinusSqrt3; + XMVECTOR HalfPi, OneThirdPi, OneSixthPi, Epsilon, MinV, MaxV; + XMVECTOR Zero; + XMVECTOR NegativeHalfPi; + XMVECTOR Angle1, Angle2; + XMVECTOR F_GT_One, F_GT_TwoMinusSqrt3, AbsF_LT_Epsilon, V_LT_Zero, V_GT_MaxV, V_LT_MinV; + XMVECTOR NegativeResult, Result; + XMVECTOR P0, P1, P2, P3, Q0, Q1, Q2, Q3; + static CONST XMVECTOR ATanConstants0 = {-1.3688768894e+1f, -2.0505855195e+1f, -8.4946240351f, -8.3758299368e-1f}; + static CONST XMVECTOR ATanConstants1 = {4.1066306682e+1f, 8.6157349597e+1f, 5.9578436142e+1f, 1.5024001160e+1f}; + static CONST XMVECTOR ATanConstants2 = {1.732050808f, 7.320508076e-1f, 2.679491924e-1f, 0.000244140625f}; // + static CONST XMVECTOR ATanConstants3 = {XM_PIDIV2, XM_PI / 3.0f, XM_PI / 6.0f, 8.507059173e+37f}; // + + Zero = XMVectorZero(); + + P0 = XMVectorSplatX(ATanConstants0); + P1 = XMVectorSplatY(ATanConstants0); + P2 = XMVectorSplatZ(ATanConstants0); + P3 = XMVectorSplatW(ATanConstants0); + + Q0 = XMVectorSplatX(ATanConstants1); + Q1 = XMVectorSplatY(ATanConstants1); + Q2 = XMVectorSplatZ(ATanConstants1); + Q3 = XMVectorSplatW(ATanConstants1); + + Sqrt3 = XMVectorSplatX(ATanConstants2); + Sqrt3MinusOne = XMVectorSplatY(ATanConstants2); + TwoMinusSqrt3 = XMVectorSplatZ(ATanConstants2); + Epsilon = XMVectorSplatW(ATanConstants2); + + HalfPi = XMVectorSplatX(ATanConstants3); + OneThirdPi = XMVectorSplatY(ATanConstants3); + OneSixthPi = XMVectorSplatZ(ATanConstants3); + MaxV = XMVectorSplatW(ATanConstants3); + + VF = XMVectorAbs(V); + ReciprocalF = XMVectorReciprocal(VF); + + F_GT_One = XMVectorGreater(VF, g_XMOne.v); + + VF = XMVectorSelect(VF, ReciprocalF, F_GT_One); + Angle1 = XMVectorSelect(Zero, HalfPi, F_GT_One); + Angle2 = XMVectorSelect(OneSixthPi, OneThirdPi, F_GT_One); + + F_GT_TwoMinusSqrt3 = XMVectorGreater(VF, TwoMinusSqrt3); + + FA = XMVectorMultiplyAdd(Sqrt3MinusOne, VF, VF); + FA = XMVectorAdd(FA, g_XMNegativeOne.v); + FB = XMVectorAdd(VF, Sqrt3); + FB = XMVectorReciprocal(FB); + FA = XMVectorMultiply(FA, FB); + + VF = XMVectorSelect(VF, FA, F_GT_TwoMinusSqrt3); + Angle1 = XMVectorSelect(Angle1, Angle2, F_GT_TwoMinusSqrt3); + + AbsF = XMVectorAbs(VF); + AbsF_LT_Epsilon = XMVectorLess(AbsF, Epsilon); + + G = XMVectorMultiply(VF, VF); + + D = XMVectorAdd(G, Q3); + D = XMVectorMultiplyAdd(D, G, Q2); + D = XMVectorMultiplyAdd(D, G, Q1); + D = XMVectorMultiplyAdd(D, G, Q0); + D = XMVectorReciprocal(D); + + N = XMVectorMultiplyAdd(P3, G, P2); + N = XMVectorMultiplyAdd(N, G, P1); + N = XMVectorMultiplyAdd(N, G, P0); + N = XMVectorMultiply(N, G); + Result = XMVectorMultiply(N, D); + + Result = XMVectorMultiplyAdd(Result, VF, VF); + + Result = XMVectorSelect(Result, VF, AbsF_LT_Epsilon); + + NegativeResult = XMVectorNegate(Result); + Result = XMVectorSelect(Result, NegativeResult, F_GT_One); + + Result = XMVectorAdd(Result, Angle1); + + V_LT_Zero = XMVectorLess(V, Zero); + NegativeResult = XMVectorNegate(Result); + Result = XMVectorSelect(Result, NegativeResult, V_LT_Zero); + + MinV = XMVectorNegate(MaxV); + NegativeHalfPi = XMVectorNegate(HalfPi); + V_GT_MaxV = XMVectorGreater(V, MaxV); + V_LT_MinV = XMVectorLess(V, MinV); + Result = XMVectorSelect(Result, g_XMHalfPi.v, V_GT_MaxV); + Result = XMVectorSelect(Result, NegativeHalfPi, V_LT_MinV); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + static CONST XMVECTORF32 ATanConstants0 = {-1.3688768894e+1f, -2.0505855195e+1f, -8.4946240351f, -8.3758299368e-1f}; + static CONST XMVECTORF32 
ATanConstants1 = {4.1066306682e+1f, 8.6157349597e+1f, 5.9578436142e+1f, 1.5024001160e+1f}; + static CONST XMVECTORF32 ATanConstants2 = {1.732050808f, 7.320508076e-1f, 2.679491924e-1f, 0.000244140625f}; // + static CONST XMVECTORF32 ATanConstants3 = {XM_PIDIV2, XM_PI / 3.0f, XM_PI / 6.0f, 8.507059173e+37f}; // + + XMVECTOR VF = XMVectorAbs(V); + XMVECTOR F_GT_One = _mm_cmpgt_ps(VF,g_XMOne); + XMVECTOR ReciprocalF = XMVectorReciprocal(VF); + VF = XMVectorSelect(VF, ReciprocalF, F_GT_One); + XMVECTOR Zero = XMVectorZero(); + XMVECTOR HalfPi = _mm_load_ps1(&ATanConstants3.f[0]); + XMVECTOR Angle1 = XMVectorSelect(Zero, HalfPi, F_GT_One); + // Pi/3 + XMVECTOR vConstants = _mm_load_ps1(&ATanConstants3.f[1]); + // Pi/6 + XMVECTOR Angle2 = _mm_load_ps1(&ATanConstants3.f[2]); + Angle2 = XMVectorSelect(Angle2, vConstants, F_GT_One); + + // 1-sqrt(3) + XMVECTOR FA = _mm_load_ps1(&ATanConstants2.f[1]); + FA = _mm_mul_ps(FA,VF); + FA = _mm_add_ps(FA,VF); + FA = _mm_add_ps(FA,g_XMNegativeOne); + // sqrt(3) + vConstants = _mm_load_ps1(&ATanConstants2.f[0]); + vConstants = _mm_add_ps(vConstants,VF); + FA = _mm_div_ps(FA,vConstants); + + // 2-sqrt(3) + vConstants = _mm_load_ps1(&ATanConstants2.f[2]); + // >2-sqrt(3)? + vConstants = _mm_cmpgt_ps(VF,vConstants); + VF = XMVectorSelect(VF, FA, vConstants); + Angle1 = XMVectorSelect(Angle1, Angle2, vConstants); + + XMVECTOR AbsF = XMVectorAbs(VF); + + XMVECTOR G = _mm_mul_ps(VF,VF); + XMVECTOR D = _mm_load_ps1(&ATanConstants1.f[3]); + D = _mm_add_ps(D,G); + D = _mm_mul_ps(D,G); + vConstants = _mm_load_ps1(&ATanConstants1.f[2]); + D = _mm_add_ps(D,vConstants); + D = _mm_mul_ps(D,G); + vConstants = _mm_load_ps1(&ATanConstants1.f[1]); + D = _mm_add_ps(D,vConstants); + D = _mm_mul_ps(D,G); + vConstants = _mm_load_ps1(&ATanConstants1.f[0]); + D = _mm_add_ps(D,vConstants); + + XMVECTOR N = _mm_load_ps1(&ATanConstants0.f[3]); + N = _mm_mul_ps(N,G); + vConstants = _mm_load_ps1(&ATanConstants0.f[2]); + N = _mm_add_ps(N,vConstants); + N = _mm_mul_ps(N,G); + vConstants = _mm_load_ps1(&ATanConstants0.f[1]); + N = _mm_add_ps(N,vConstants); + N = _mm_mul_ps(N,G); + vConstants = _mm_load_ps1(&ATanConstants0.f[0]); + N = _mm_add_ps(N,vConstants); + N = _mm_mul_ps(N,G); + XMVECTOR Result = _mm_div_ps(N,D); + + Result = _mm_mul_ps(Result,VF); + Result = _mm_add_ps(Result,VF); + // Epsilon + vConstants = _mm_load_ps1(&ATanConstants2.f[3]); + vConstants = _mm_cmpge_ps(vConstants,AbsF); + Result = XMVectorSelect(Result,VF,vConstants); + + XMVECTOR NegativeResult = _mm_mul_ps(Result,g_XMNegativeOne); + Result = XMVectorSelect(Result,NegativeResult,F_GT_One); + Result = _mm_add_ps(Result,Angle1); + + Zero = _mm_cmpge_ps(Zero,V); + NegativeResult = _mm_mul_ps(Result,g_XMNegativeOne); + Result = XMVectorSelect(Result,NegativeResult,Zero); + + XMVECTOR MaxV = _mm_load_ps1(&ATanConstants3.f[3]); + XMVECTOR MinV = _mm_mul_ps(MaxV,g_XMNegativeOne); + // Negate HalfPi + HalfPi = _mm_mul_ps(HalfPi,g_XMNegativeOne); + MaxV = _mm_cmple_ps(MaxV,V); + MinV = _mm_cmpge_ps(MinV,V); + Result = XMVectorSelect(Result,g_XMHalfPi,MaxV); + // HalfPi = -HalfPi + Result = XMVectorSelect(Result,HalfPi,MinV); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMVECTOR XMVectorATan2 +( + FXMVECTOR Y, + FXMVECTOR X +) +{ +#if defined(_XM_NO_INTRINSICS_) + + // Return the inverse tangent of Y / X in the range of -Pi to Pi with the following exceptions: + + // Y == 0 and X is 
Negative -> Pi with the sign of Y + // y == 0 and x is positive -> 0 with the sign of y + // Y != 0 and X == 0 -> Pi / 2 with the sign of Y + // Y != 0 and X is Negative -> atan(y/x) + (PI with the sign of Y) + // X == -Infinity and Finite Y -> Pi with the sign of Y + // X == +Infinity and Finite Y -> 0 with the sign of Y + // Y == Infinity and X is Finite -> Pi / 2 with the sign of Y + // Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y + // Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y + + XMVECTOR Reciprocal; + XMVECTOR V; + XMVECTOR YSign; + XMVECTOR Pi, PiOverTwo, PiOverFour, ThreePiOverFour; + XMVECTOR YEqualsZero, XEqualsZero, XIsPositive, YEqualsInfinity, XEqualsInfinity; + XMVECTOR ATanResultValid; + XMVECTOR R0, R1, R2, R3, R4, R5; + XMVECTOR Zero; + XMVECTOR Result; + static CONST XMVECTOR ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f}; + + Zero = XMVectorZero(); + ATanResultValid = XMVectorTrueInt(); + + Pi = XMVectorSplatX(ATan2Constants); + PiOverTwo = XMVectorSplatY(ATan2Constants); + PiOverFour = XMVectorSplatZ(ATan2Constants); + ThreePiOverFour = XMVectorSplatW(ATan2Constants); + + YEqualsZero = XMVectorEqual(Y, Zero); + XEqualsZero = XMVectorEqual(X, Zero); + XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v); + XIsPositive = XMVectorEqualInt(XIsPositive, Zero); + YEqualsInfinity = XMVectorIsInfinite(Y); + XEqualsInfinity = XMVectorIsInfinite(X); + + YSign = XMVectorAndInt(Y, g_XMNegativeZero.v); + Pi = XMVectorOrInt(Pi, YSign); + PiOverTwo = XMVectorOrInt(PiOverTwo, YSign); + PiOverFour = XMVectorOrInt(PiOverFour, YSign); + ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign); + + R1 = XMVectorSelect(Pi, YSign, XIsPositive); + R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero); + R3 = XMVectorSelect(R2, R1, YEqualsZero); + R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive); + R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity); + Result = XMVectorSelect(R3, R5, YEqualsInfinity); + ATanResultValid = XMVectorEqualInt(Result, ATanResultValid); + + Reciprocal = XMVectorReciprocal(X); + V = XMVectorMultiply(Y, Reciprocal); + R0 = XMVectorATan(V); + + R1 = XMVectorSelect( Pi, Zero, XIsPositive ); + R2 = XMVectorAdd(R0, R1); + + Result = XMVectorSelect(Result, R2, ATanResultValid); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + static CONST XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f}; + + // Mask if Y>0 && Y!=INF + XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y); + // Get the sign of (Y&0x80000000) + XMVECTOR YSign = _mm_and_ps(Y, g_XMNegativeZero); + // Get the sign bits of X + XMVECTOR XIsPositive = _mm_and_ps(X,g_XMNegativeZero); + // Change them to masks + XIsPositive = XMVectorEqualInt(XIsPositive,g_XMZero); + // Get Pi + XMVECTOR Pi = _mm_load_ps1(&ATan2Constants.f[0]); + // Copy the sign of Y + Pi = _mm_or_ps(Pi,YSign); + XMVECTOR R1 = XMVectorSelect(Pi,YSign,XIsPositive); + // Mask for X==0 + XMVECTOR vConstants = _mm_cmpeq_ps(X,g_XMZero); + // Get Pi/2 with with sign of Y + XMVECTOR PiOverTwo = _mm_load_ps1(&ATan2Constants.f[1]); + PiOverTwo = _mm_or_ps(PiOverTwo,YSign); + XMVECTOR R2 = XMVectorSelect(g_XMNegOneMask,PiOverTwo,vConstants); + // Mask for Y==0 + vConstants = _mm_cmpeq_ps(Y,g_XMZero); + R2 = XMVectorSelect(R2,R1,vConstants); + // Get Pi/4 with sign of Y + XMVECTOR PiOverFour = _mm_load_ps1(&ATan2Constants.f[2]); + PiOverFour = _mm_or_ps(PiOverFour,YSign); + // Get (Pi*3)/4 with sign of Y + XMVECTOR ThreePiOverFour = 
_mm_load_ps1(&ATan2Constants.f[3]); + ThreePiOverFour = _mm_or_ps(ThreePiOverFour,YSign); + vConstants = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive); + XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X); + vConstants = XMVectorSelect(PiOverTwo,vConstants,XEqualsInfinity); + + XMVECTOR vResult = XMVectorSelect(R2,vConstants,YEqualsInfinity); + vConstants = XMVectorSelect(R1,vResult,YEqualsInfinity); + // At this point, any entry that's zero will get the result + // from XMVectorATan(), otherwise, return the failsafe value + vResult = XMVectorSelect(vResult,vConstants,XEqualsInfinity); + // Any entries not 0xFFFFFFFF, are considered precalculated + XMVECTOR ATanResultValid = XMVectorEqualInt(vResult,g_XMNegOneMask); + // Let's do the ATan2 function + vConstants = _mm_div_ps(Y,X); + vConstants = XMVectorATan(vConstants); + // Discard entries that have been declared void + + XMVECTOR R3 = XMVectorSelect( Pi, g_XMZero, XIsPositive ); + vConstants = _mm_add_ps( vConstants, R3 ); + + vResult = XMVectorSelect(vResult,vConstants,ATanResultValid); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorSinEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V2, V3, V5, V7; + XMVECTOR S1, S2, S3; + XMVECTOR Result; + + // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! (for -PI <= V < PI) + V2 = XMVectorMultiply(V, V); + V3 = XMVectorMultiply(V2, V); + V5 = XMVectorMultiply(V3, V2); + V7 = XMVectorMultiply(V5, V2); + + S1 = XMVectorSplatY(g_XMSinEstCoefficients.v); + S2 = XMVectorSplatZ(g_XMSinEstCoefficients.v); + S3 = XMVectorSplatW(g_XMSinEstCoefficients.v); + + Result = XMVectorMultiplyAdd(S1, V3, V); + Result = XMVectorMultiplyAdd(S2, V5, Result); + Result = XMVectorMultiplyAdd(S3, V7, Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! 
(for -PI <= V < PI) + XMVECTOR V2 = _mm_mul_ps(V,V); + XMVECTOR V3 = _mm_mul_ps(V2,V); + XMVECTOR vResult = _mm_load_ps1(&g_XMSinEstCoefficients.f[1]); + vResult = _mm_mul_ps(vResult,V3); + vResult = _mm_add_ps(vResult,V); + XMVECTOR vConstants = _mm_load_ps1(&g_XMSinEstCoefficients.f[2]); + // V^5 + V3 = _mm_mul_ps(V3,V2); + vConstants = _mm_mul_ps(vConstants,V3); + vResult = _mm_add_ps(vResult,vConstants); + vConstants = _mm_load_ps1(&g_XMSinEstCoefficients.f[3]); + // V^7 + V3 = _mm_mul_ps(V3,V2); + vConstants = _mm_mul_ps(vConstants,V3); + vResult = _mm_add_ps(vResult,vConstants); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorCosEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V2, V4, V6; + XMVECTOR C0, C1, C2, C3; + XMVECTOR Result; + + V2 = XMVectorMultiply(V, V); + V4 = XMVectorMultiply(V2, V2); + V6 = XMVectorMultiply(V4, V2); + + C0 = XMVectorSplatX(g_XMCosEstCoefficients.v); + C1 = XMVectorSplatY(g_XMCosEstCoefficients.v); + C2 = XMVectorSplatZ(g_XMCosEstCoefficients.v); + C3 = XMVectorSplatW(g_XMCosEstCoefficients.v); + + Result = XMVectorMultiplyAdd(C1, V2, C0); + Result = XMVectorMultiplyAdd(C2, V4, Result); + Result = XMVectorMultiplyAdd(C3, V6, Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Get V^2 + XMVECTOR V2 = _mm_mul_ps(V,V); + XMVECTOR vResult = _mm_load_ps1(&g_XMCosEstCoefficients.f[1]); + vResult = _mm_mul_ps(vResult,V2); + XMVECTOR vConstants = _mm_load_ps1(&g_XMCosEstCoefficients.f[0]); + vResult = _mm_add_ps(vResult,vConstants); + vConstants = _mm_load_ps1(&g_XMCosEstCoefficients.f[2]); + // Get V^4 + XMVECTOR V4 = _mm_mul_ps(V2, V2); + vConstants = _mm_mul_ps(vConstants,V4); + vResult = _mm_add_ps(vResult,vConstants); + vConstants = _mm_load_ps1(&g_XMCosEstCoefficients.f[3]); + // It's really V^6 + V4 = _mm_mul_ps(V4,V2); + vConstants = _mm_mul_ps(vConstants,V4); + vResult = _mm_add_ps(vResult,vConstants); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID XMVectorSinCosEst +( + XMVECTOR* pSin, + XMVECTOR* pCos, + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V2, V3, V4, V5, V6, V7; + XMVECTOR S1, S2, S3; + XMVECTOR C0, C1, C2, C3; + XMVECTOR Sin, Cos; + + XMASSERT(pSin); + XMASSERT(pCos); + + // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! (for -PI <= V < PI) + // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! 
(for -PI <= V < PI) + V2 = XMVectorMultiply(V, V); + V3 = XMVectorMultiply(V2, V); + V4 = XMVectorMultiply(V2, V2); + V5 = XMVectorMultiply(V3, V2); + V6 = XMVectorMultiply(V3, V3); + V7 = XMVectorMultiply(V4, V3); + + S1 = XMVectorSplatY(g_XMSinEstCoefficients.v); + S2 = XMVectorSplatZ(g_XMSinEstCoefficients.v); + S3 = XMVectorSplatW(g_XMSinEstCoefficients.v); + + C0 = XMVectorSplatX(g_XMCosEstCoefficients.v); + C1 = XMVectorSplatY(g_XMCosEstCoefficients.v); + C2 = XMVectorSplatZ(g_XMCosEstCoefficients.v); + C3 = XMVectorSplatW(g_XMCosEstCoefficients.v); + + Sin = XMVectorMultiplyAdd(S1, V3, V); + Sin = XMVectorMultiplyAdd(S2, V5, Sin); + Sin = XMVectorMultiplyAdd(S3, V7, Sin); + + Cos = XMVectorMultiplyAdd(C1, V2, C0); + Cos = XMVectorMultiplyAdd(C2, V4, Cos); + Cos = XMVectorMultiplyAdd(C3, V6, Cos); + + *pSin = Sin; + *pCos = Cos; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pSin); + XMASSERT(pCos); + XMVECTOR V2, V3, V4, V5, V6, V7; + XMVECTOR S1, S2, S3; + XMVECTOR C0, C1, C2, C3; + XMVECTOR Sin, Cos; + + // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! (for -PI <= V < PI) + // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! (for -PI <= V < PI) + V2 = XMVectorMultiply(V, V); + V3 = XMVectorMultiply(V2, V); + V4 = XMVectorMultiply(V2, V2); + V5 = XMVectorMultiply(V3, V2); + V6 = XMVectorMultiply(V3, V3); + V7 = XMVectorMultiply(V4, V3); + + S1 = _mm_load_ps1(&g_XMSinEstCoefficients.f[1]); + S2 = _mm_load_ps1(&g_XMSinEstCoefficients.f[2]); + S3 = _mm_load_ps1(&g_XMSinEstCoefficients.f[3]); + + C0 = _mm_load_ps1(&g_XMCosEstCoefficients.f[0]); + C1 = _mm_load_ps1(&g_XMCosEstCoefficients.f[1]); + C2 = _mm_load_ps1(&g_XMCosEstCoefficients.f[2]); + C3 = _mm_load_ps1(&g_XMCosEstCoefficients.f[3]); + + Sin = XMVectorMultiplyAdd(S1, V3, V); + Sin = XMVectorMultiplyAdd(S2, V5, Sin); + Sin = XMVectorMultiplyAdd(S3, V7, Sin); + + Cos = XMVectorMultiplyAdd(C1, V2, C0); + Cos = XMVectorMultiplyAdd(C2, V4, Cos); + Cos = XMVectorMultiplyAdd(C3, V6, Cos); + + *pSin = Sin; + *pCos = Cos; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorTanEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V1, V2, V1T0, V1T1, V2T2; + XMVECTOR T0, T1, T2; + XMVECTOR N, D; + XMVECTOR OneOverPi; + XMVECTOR Result; + + OneOverPi = XMVectorSplatW(g_XMTanEstCoefficients.v); + + V1 = XMVectorMultiply(V, OneOverPi); + V1 = XMVectorRound(V1); + + V1 = XMVectorNegativeMultiplySubtract(g_XMPi.v, V1, V); + + T0 = XMVectorSplatX(g_XMTanEstCoefficients.v); + T1 = XMVectorSplatY(g_XMTanEstCoefficients.v); + T2 = XMVectorSplatZ(g_XMTanEstCoefficients.v); + + V2T2 = XMVectorNegativeMultiplySubtract(V1, V1, T2); + V2 = XMVectorMultiply(V1, V1); + V1T0 = XMVectorMultiply(V1, T0); + V1T1 = XMVectorMultiply(V1, T1); + + D = XMVectorReciprocalEst(V2T2); + N = XMVectorMultiplyAdd(V2, V1T1, V1T0); + + Result = XMVectorMultiply(N, D); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR V1, V2, V1T0, V1T1, V2T2; + XMVECTOR T0, T1, T2; + XMVECTOR N, D; + XMVECTOR OneOverPi; + XMVECTOR Result; + + OneOverPi = XMVectorSplatW(g_XMTanEstCoefficients); + + V1 = XMVectorMultiply(V, OneOverPi); + V1 = XMVectorRound(V1); + + V1 = XMVectorNegativeMultiplySubtract(g_XMPi, V1, V); + + T0 = XMVectorSplatX(g_XMTanEstCoefficients); + T1 = XMVectorSplatY(g_XMTanEstCoefficients); + T2 = XMVectorSplatZ(g_XMTanEstCoefficients); + + V2T2 = XMVectorNegativeMultiplySubtract(V1, V1, T2); + V2 = 
XMVectorMultiply(V1, V1); + V1T0 = XMVectorMultiply(V1, T0); + V1T1 = XMVectorMultiply(V1, T1); + + D = XMVectorReciprocalEst(V2T2); + N = XMVectorMultiplyAdd(V2, V1T1, V1T0); + + Result = XMVectorMultiply(N, D); + + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorSinHEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V1, V2; + XMVECTOR E1, E2; + XMVECTOR Result; + static CONST XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) + + V1 = XMVectorMultiplyAdd(V, Scale.v, g_XMNegativeOne.v); + V2 = XMVectorNegativeMultiplySubtract(V, Scale.v, g_XMNegativeOne.v); + + E1 = XMVectorExpEst(V1); + E2 = XMVectorExpEst(V2); + + Result = XMVectorSubtract(E1, E2); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR V1, V2; + XMVECTOR E1, E2; + XMVECTOR Result; + static CONST XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) + + V1 = _mm_mul_ps(V,Scale); + V1 = _mm_add_ps(V1,g_XMNegativeOne); + V2 = _mm_mul_ps(V,Scale); + V2 = _mm_sub_ps(g_XMNegativeOne,V2); + E1 = XMVectorExpEst(V1); + E2 = XMVectorExpEst(V2); + Result = _mm_sub_ps(E1, E2); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorCosHEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V1, V2; + XMVECTOR E1, E2; + XMVECTOR Result; + static CONST XMVECTOR Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) + + V1 = XMVectorMultiplyAdd(V, Scale, g_XMNegativeOne.v); + V2 = XMVectorNegativeMultiplySubtract(V, Scale, g_XMNegativeOne.v); + + E1 = XMVectorExpEst(V1); + E2 = XMVectorExpEst(V2); + + Result = XMVectorAdd(E1, E2); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR V1, V2; + XMVECTOR E1, E2; + XMVECTOR Result; + static CONST XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) + + V1 = _mm_mul_ps(V,Scale); + V1 = _mm_add_ps(V1,g_XMNegativeOne); + V2 = _mm_mul_ps(V, Scale); + V2 = _mm_sub_ps(g_XMNegativeOne,V2); + E1 = XMVectorExpEst(V1); + E2 = XMVectorExpEst(V2); + Result = _mm_add_ps(E1, E2); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorTanHEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR E; + XMVECTOR Result; + static CONST XMVECTOR Scale = {2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f}; // 2.0f / ln(2.0f) + + E = XMVectorMultiply(V, Scale); + E = XMVectorExpEst(E); + E = XMVectorMultiplyAdd(E, g_XMOneHalf.v, g_XMOneHalf.v); + E = XMVectorReciprocalEst(E); + + Result = XMVectorSubtract(g_XMOne.v, E); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + static CONST XMVECTORF32 Scale = {2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f}; // 2.0f / ln(2.0f) + + XMVECTOR E = _mm_mul_ps(V, Scale); + E = XMVectorExpEst(E); + E = _mm_mul_ps(E,g_XMOneHalf); + E = _mm_add_ps(E,g_XMOneHalf); + E = XMVectorReciprocalEst(E); + E = _mm_sub_ps(g_XMOne, E); + return 
E; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorASinEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR AbsV, V2, VD, VC0, V2C3; + XMVECTOR C0, C1, C2, C3; + XMVECTOR D, Rsq, SqrtD; + XMVECTOR OnePlusEps; + XMVECTOR Result; + + AbsV = XMVectorAbs(V); + + OnePlusEps = XMVectorSplatX(g_XMASinEstConstants.v); + + C0 = XMVectorSplatX(g_XMASinEstCoefficients.v); + C1 = XMVectorSplatY(g_XMASinEstCoefficients.v); + C2 = XMVectorSplatZ(g_XMASinEstCoefficients.v); + C3 = XMVectorSplatW(g_XMASinEstCoefficients.v); + + D = XMVectorSubtract(OnePlusEps, AbsV); + + Rsq = XMVectorReciprocalSqrtEst(D); + SqrtD = XMVectorMultiply(D, Rsq); + + V2 = XMVectorMultiply(V, AbsV); + V2C3 = XMVectorMultiply(V2, C3); + VD = XMVectorMultiply(D, AbsV); + VC0 = XMVectorMultiply(V, C0); + + Result = XMVectorMultiply(V, C1); + Result = XMVectorMultiplyAdd(V2, C2, Result); + Result = XMVectorMultiplyAdd(V2C3, VD, Result); + Result = XMVectorMultiplyAdd(VC0, SqrtD, Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Get abs(V) + XMVECTOR vAbsV = _mm_setzero_ps(); + vAbsV = _mm_sub_ps(vAbsV,V); + vAbsV = _mm_max_ps(vAbsV,V); + + XMVECTOR D = _mm_load_ps1(&g_XMASinEstConstants.f[0]); + D = _mm_sub_ps(D,vAbsV); + // Since this is an estimate, rqsrt is okay + XMVECTOR vConstants = _mm_rsqrt_ps(D); + XMVECTOR SqrtD = _mm_mul_ps(D,vConstants); + // V2 = V^2 retaining sign + XMVECTOR V2 = _mm_mul_ps(V,vAbsV); + D = _mm_mul_ps(D,vAbsV); + + XMVECTOR vResult = _mm_load_ps1(&g_XMASinEstCoefficients.f[1]); + vResult = _mm_mul_ps(vResult,V); + vConstants = _mm_load_ps1(&g_XMASinEstCoefficients.f[2]); + vConstants = _mm_mul_ps(vConstants,V2); + vResult = _mm_add_ps(vResult,vConstants); + + vConstants = _mm_load_ps1(&g_XMASinEstCoefficients.f[3]); + vConstants = _mm_mul_ps(vConstants,V2); + vConstants = _mm_mul_ps(vConstants,D); + vResult = _mm_add_ps(vResult,vConstants); + + vConstants = _mm_load_ps1(&g_XMASinEstCoefficients.f[0]); + vConstants = _mm_mul_ps(vConstants,V); + vConstants = _mm_mul_ps(vConstants,SqrtD); + vResult = _mm_add_ps(vResult,vConstants); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorACosEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR AbsV, V2, VD, VC0, V2C3; + XMVECTOR C0, C1, C2, C3; + XMVECTOR D, Rsq, SqrtD; + XMVECTOR OnePlusEps, HalfPi; + XMVECTOR Result; + + // acos(V) = PI / 2 - asin(V) + + AbsV = XMVectorAbs(V); + + OnePlusEps = XMVectorSplatX(g_XMASinEstConstants.v); + HalfPi = XMVectorSplatY(g_XMASinEstConstants.v); + + C0 = XMVectorSplatX(g_XMASinEstCoefficients.v); + C1 = XMVectorSplatY(g_XMASinEstCoefficients.v); + C2 = XMVectorSplatZ(g_XMASinEstCoefficients.v); + C3 = XMVectorSplatW(g_XMASinEstCoefficients.v); + + D = XMVectorSubtract(OnePlusEps, AbsV); + + Rsq = XMVectorReciprocalSqrtEst(D); + SqrtD = XMVectorMultiply(D, Rsq); + + V2 = XMVectorMultiply(V, AbsV); + V2C3 = XMVectorMultiply(V2, C3); + VD = XMVectorMultiply(D, AbsV); + VC0 = XMVectorMultiply(V, C0); + + Result = XMVectorMultiply(V, C1); + Result = XMVectorMultiplyAdd(V2, C2, Result); + Result = XMVectorMultiplyAdd(V2C3, VD, Result); + Result = XMVectorMultiplyAdd(VC0, SqrtD, Result); + Result = XMVectorSubtract(HalfPi, Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) 
+ // acos(V) = PI / 2 - asin(V) + // Get abs(V) + XMVECTOR vAbsV = _mm_setzero_ps(); + vAbsV = _mm_sub_ps(vAbsV,V); + vAbsV = _mm_max_ps(vAbsV,V); + // Calc D + XMVECTOR D = _mm_load_ps1(&g_XMASinEstConstants.f[0]); + D = _mm_sub_ps(D,vAbsV); + // SqrtD = sqrt(D-abs(V)) estimated + XMVECTOR vConstants = _mm_rsqrt_ps(D); + XMVECTOR SqrtD = _mm_mul_ps(D,vConstants); + // V2 = V^2 while retaining sign + XMVECTOR V2 = _mm_mul_ps(V, vAbsV); + // Drop vAbsV here. D = (Const-abs(V))*abs(V) + D = _mm_mul_ps(D, vAbsV); + + XMVECTOR vResult = _mm_load_ps1(&g_XMASinEstCoefficients.f[1]); + vResult = _mm_mul_ps(vResult,V); + vConstants = _mm_load_ps1(&g_XMASinEstCoefficients.f[2]); + vConstants = _mm_mul_ps(vConstants,V2); + vResult = _mm_add_ps(vResult,vConstants); + + vConstants = _mm_load_ps1(&g_XMASinEstCoefficients.f[3]); + vConstants = _mm_mul_ps(vConstants,V2); + vConstants = _mm_mul_ps(vConstants,D); + vResult = _mm_add_ps(vResult,vConstants); + + vConstants = _mm_load_ps1(&g_XMASinEstCoefficients.f[0]); + vConstants = _mm_mul_ps(vConstants,V); + vConstants = _mm_mul_ps(vConstants,SqrtD); + vResult = _mm_add_ps(vResult,vConstants); + + vConstants = _mm_load_ps1(&g_XMASinEstConstants.f[1]); + vResult = _mm_sub_ps(vConstants,vResult); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorATanEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR AbsV, V2S2, N, D; + XMVECTOR S0, S1, S2; + XMVECTOR HalfPi; + XMVECTOR Result; + + S0 = XMVectorSplatX(g_XMATanEstCoefficients.v); + S1 = XMVectorSplatY(g_XMATanEstCoefficients.v); + S2 = XMVectorSplatZ(g_XMATanEstCoefficients.v); + HalfPi = XMVectorSplatW(g_XMATanEstCoefficients.v); + + AbsV = XMVectorAbs(V); + + V2S2 = XMVectorMultiplyAdd(V, V, S2); + N = XMVectorMultiplyAdd(AbsV, HalfPi, S0); + D = XMVectorMultiplyAdd(AbsV, S1, V2S2); + N = XMVectorMultiply(N, V); + D = XMVectorReciprocalEst(D); + + Result = XMVectorMultiply(N, D); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Get abs(V) + XMVECTOR vAbsV = _mm_setzero_ps(); + vAbsV = _mm_sub_ps(vAbsV,V); + vAbsV = _mm_max_ps(vAbsV,V); + + XMVECTOR vResult = _mm_load_ps1(&g_XMATanEstCoefficients.f[3]); + vResult = _mm_mul_ps(vResult,vAbsV); + XMVECTOR vConstants = _mm_load_ps1(&g_XMATanEstCoefficients.f[0]); + vResult = _mm_add_ps(vResult,vConstants); + vResult = _mm_mul_ps(vResult,V); + + XMVECTOR D = _mm_mul_ps(V,V); + vConstants = _mm_load_ps1(&g_XMATanEstCoefficients.f[2]); + D = _mm_add_ps(D,vConstants); + vConstants = _mm_load_ps1(&g_XMATanEstCoefficients.f[1]); + vConstants = _mm_mul_ps(vConstants,vAbsV); + D = _mm_add_ps(D,vConstants); + vResult = _mm_div_ps(vResult,D); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorATan2Est +( + FXMVECTOR Y, + FXMVECTOR X +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Reciprocal; + XMVECTOR V; + XMVECTOR YSign; + XMVECTOR Pi, PiOverTwo, PiOverFour, ThreePiOverFour; + XMVECTOR YEqualsZero, XEqualsZero, XIsPositive, YEqualsInfinity, XEqualsInfinity; + XMVECTOR ATanResultValid; + XMVECTOR R0, R1, R2, R3, R4, R5; + XMVECTOR Zero; + XMVECTOR Result; + static CONST XMVECTOR ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f}; + + Zero = XMVectorZero(); + ATanResultValid = XMVectorTrueInt(); + + Pi = 
XMVectorSplatX(ATan2Constants); + PiOverTwo = XMVectorSplatY(ATan2Constants); + PiOverFour = XMVectorSplatZ(ATan2Constants); + ThreePiOverFour = XMVectorSplatW(ATan2Constants); + + YEqualsZero = XMVectorEqual(Y, Zero); + XEqualsZero = XMVectorEqual(X, Zero); + XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v); + XIsPositive = XMVectorEqualInt(XIsPositive, Zero); + YEqualsInfinity = XMVectorIsInfinite(Y); + XEqualsInfinity = XMVectorIsInfinite(X); + + YSign = XMVectorAndInt(Y, g_XMNegativeZero.v); + Pi = XMVectorOrInt(Pi, YSign); + PiOverTwo = XMVectorOrInt(PiOverTwo, YSign); + PiOverFour = XMVectorOrInt(PiOverFour, YSign); + ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign); + + R1 = XMVectorSelect(Pi, YSign, XIsPositive); + R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero); + R3 = XMVectorSelect(R2, R1, YEqualsZero); + R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive); + R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity); + Result = XMVectorSelect(R3, R5, YEqualsInfinity); + ATanResultValid = XMVectorEqualInt(Result, ATanResultValid); + + Reciprocal = XMVectorReciprocalEst(X); + V = XMVectorMultiply(Y, Reciprocal); + R0 = XMVectorATanEst(V); + + R1 = XMVectorSelect( Pi, Zero, XIsPositive ); + R2 = XMVectorAdd(R0, R1); + + Result = XMVectorSelect(Result, R2, ATanResultValid); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + static CONST XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f}; + + // Mask if Y>0 && Y!=INF + XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y); + // Get the sign of (Y&0x80000000) + XMVECTOR YSign = _mm_and_ps(Y, g_XMNegativeZero); + // Get the sign bits of X + XMVECTOR XIsPositive = _mm_and_ps(X,g_XMNegativeZero); + // Change them to masks + XIsPositive = XMVectorEqualInt(XIsPositive,g_XMZero); + // Get Pi + XMVECTOR Pi = _mm_load_ps1(&ATan2Constants.f[0]); + // Copy the sign of Y + Pi = _mm_or_ps(Pi,YSign); + XMVECTOR R1 = XMVectorSelect(Pi,YSign,XIsPositive); + // Mask for X==0 + XMVECTOR vConstants = _mm_cmpeq_ps(X,g_XMZero); + // Get Pi/2 with with sign of Y + XMVECTOR PiOverTwo = _mm_load_ps1(&ATan2Constants.f[1]); + PiOverTwo = _mm_or_ps(PiOverTwo,YSign); + XMVECTOR R2 = XMVectorSelect(g_XMNegOneMask,PiOverTwo,vConstants); + // Mask for Y==0 + vConstants = _mm_cmpeq_ps(Y,g_XMZero); + R2 = XMVectorSelect(R2,R1,vConstants); + // Get Pi/4 with sign of Y + XMVECTOR PiOverFour = _mm_load_ps1(&ATan2Constants.f[2]); + PiOverFour = _mm_or_ps(PiOverFour,YSign); + // Get (Pi*3)/4 with sign of Y + XMVECTOR ThreePiOverFour = _mm_load_ps1(&ATan2Constants.f[3]); + ThreePiOverFour = _mm_or_ps(ThreePiOverFour,YSign); + vConstants = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive); + XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X); + vConstants = XMVectorSelect(PiOverTwo,vConstants,XEqualsInfinity); + + XMVECTOR vResult = XMVectorSelect(R2,vConstants,YEqualsInfinity); + vConstants = XMVectorSelect(R1,vResult,YEqualsInfinity); + // At this point, any entry that's zero will get the result + // from XMVectorATan(), otherwise, return the failsafe value + vResult = XMVectorSelect(vResult,vConstants,XEqualsInfinity); + // Any entries not 0xFFFFFFFF, are considered precalculated + XMVECTOR ATanResultValid = XMVectorEqualInt(vResult,g_XMNegOneMask); + // Let's do the ATan2 function + XMVECTOR Reciprocal = _mm_rcp_ps(X); + vConstants = _mm_mul_ps(Y, Reciprocal); + vConstants = XMVectorATanEst(vConstants); + // Discard entries that have been declared void + + XMVECTOR R3 = XMVectorSelect( Pi, g_XMZero, 
XIsPositive ); + vConstants = _mm_add_ps( vConstants, R3 ); + + vResult = XMVectorSelect(vResult,vConstants,ATanResultValid); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorLerp +( + FXMVECTOR V0, + FXMVECTOR V1, + FLOAT t +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Scale; + XMVECTOR Length; + XMVECTOR Result; + + // V0 + t * (V1 - V0) + Scale = XMVectorReplicate(t); + Length = XMVectorSubtract(V1, V0); + Result = XMVectorMultiplyAdd(Length, Scale, V0); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR L, S; + XMVECTOR Result; + + L = _mm_sub_ps( V1, V0 ); + + S = _mm_set_ps1( t ); + + Result = _mm_mul_ps( L, S ); + + return _mm_add_ps( Result, V0 ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorLerpV +( + FXMVECTOR V0, + FXMVECTOR V1, + FXMVECTOR T +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Length; + XMVECTOR Result; + + // V0 + T * (V1 - V0) + Length = XMVectorSubtract(V1, V0); + Result = XMVectorMultiplyAdd(Length, T, V0); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR Length; + XMVECTOR Result; + + Length = _mm_sub_ps( V1, V0 ); + + Result = _mm_mul_ps( Length, T ); + + return _mm_add_ps( Result, V0 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorHermite +( + FXMVECTOR Position0, + FXMVECTOR Tangent0, + FXMVECTOR Position1, + CXMVECTOR Tangent1, + FLOAT t +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR P0; + XMVECTOR T0; + XMVECTOR P1; + XMVECTOR T1; + XMVECTOR Result; + FLOAT t2; + FLOAT t3; + + // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 + + // (t^3 - 2 * t^2 + t) * Tangent0 + + // (-2 * t^3 + 3 * t^2) * Position1 + + // (t^3 - t^2) * Tangent1 + t2 = t * t; + t3 = t * t2; + + P0 = XMVectorReplicate(2.0f * t3 - 3.0f * t2 + 1.0f); + T0 = XMVectorReplicate(t3 - 2.0f * t2 + t); + P1 = XMVectorReplicate(-2.0f * t3 + 3.0f * t2); + T1 = XMVectorReplicate(t3 - t2); + + Result = XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(T0, Tangent0, Result); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(T1, Tangent1, Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + FLOAT t2 = t * t; + FLOAT t3 = t * t2; + + XMVECTOR P0 = _mm_set_ps1(2.0f * t3 - 3.0f * t2 + 1.0f); + XMVECTOR T0 = _mm_set_ps1(t3 - 2.0f * t2 + t); + XMVECTOR P1 = _mm_set_ps1(-2.0f * t3 + 3.0f * t2); + XMVECTOR T1 = _mm_set_ps1(t3 - t2); + + XMVECTOR vResult = _mm_mul_ps(P0, Position0); + XMVECTOR vTemp = _mm_mul_ps(T0, Tangent0); + vResult = _mm_add_ps(vResult,vTemp); + vTemp = _mm_mul_ps(P1, Position1); + vResult = _mm_add_ps(vResult,vTemp); + vTemp = _mm_mul_ps(T1, Tangent1); + vResult = _mm_add_ps(vResult,vTemp); + return vResult; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorHermiteV +( + FXMVECTOR Position0, + FXMVECTOR Tangent0, + FXMVECTOR Position1, + CXMVECTOR Tangent1, + CXMVECTOR T +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR P0; + XMVECTOR T0; + XMVECTOR P1; + XMVECTOR T1; + XMVECTOR Result; + 
XMVECTOR T2; + XMVECTOR T3; + + // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 + + // (t^3 - 2 * t^2 + t) * Tangent0 + + // (-2 * t^3 + 3 * t^2) * Position1 + + // (t^3 - t^2) * Tangent1 + T2 = XMVectorMultiply(T, T); + T3 = XMVectorMultiply(T , T2); + + P0 = XMVectorReplicate(2.0f * T3.vector4_f32[0] - 3.0f * T2.vector4_f32[0] + 1.0f); + T0 = XMVectorReplicate(T3.vector4_f32[1] - 2.0f * T2.vector4_f32[1] + T.vector4_f32[1]); + P1 = XMVectorReplicate(-2.0f * T3.vector4_f32[2] + 3.0f * T2.vector4_f32[2]); + T1 = XMVectorReplicate(T3.vector4_f32[3] - T2.vector4_f32[3]); + + Result = XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(T0, Tangent0, Result); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(T1, Tangent1, Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 CatMulT2 = {-3.0f,-2.0f,3.0f,-1.0f}; + static const XMVECTORF32 CatMulT3 = {2.0f,1.0f,-2.0f,1.0f}; + + // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 + + // (t^3 - 2 * t^2 + t) * Tangent0 + + // (-2 * t^3 + 3 * t^2) * Position1 + + // (t^3 - t^2) * Tangent1 + XMVECTOR T2 = _mm_mul_ps(T,T); + XMVECTOR T3 = _mm_mul_ps(T,T2); + // Mul by the constants against t^2 + T2 = _mm_mul_ps(T2,CatMulT2); + // Mul by the constants against t^3 + T3 = _mm_mul_ps(T3,CatMulT3); + // T3 now has the pre-result. + T3 = _mm_add_ps(T3,T2); + // I need to add t.y only + T2 = _mm_and_ps(T,g_XMMaskY); + T3 = _mm_add_ps(T3,T2); + // Add 1.0f to x + T3 = _mm_add_ps(T3,g_XMIdentityR0); + // Now, I have the constants created + // Mul the x constant to Position0 + XMVECTOR vResult = _mm_shuffle_ps(T3,T3,_MM_SHUFFLE(0,0,0,0)); + vResult = _mm_mul_ps(vResult,Position0); + // Mul the y constant to Tangent0 + T2 = _mm_shuffle_ps(T3,T3,_MM_SHUFFLE(1,1,1,1)); + T2 = _mm_mul_ps(T2,Tangent0); + vResult = _mm_add_ps(vResult,T2); + // Mul the z constant to Position1 + T2 = _mm_shuffle_ps(T3,T3,_MM_SHUFFLE(2,2,2,2)); + T2 = _mm_mul_ps(T2,Position1); + vResult = _mm_add_ps(vResult,T2); + // Mul the w constant to Tangent1 + T3 = _mm_shuffle_ps(T3,T3,_MM_SHUFFLE(3,3,3,3)); + T3 = _mm_mul_ps(T3,Tangent1); + vResult = _mm_add_ps(vResult,T3); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorCatmullRom +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + CXMVECTOR Position3, + FLOAT t +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR P0; + XMVECTOR P1; + XMVECTOR P2; + XMVECTOR P3; + XMVECTOR Result; + FLOAT t2; + FLOAT t3; + + // Result = ((-t^3 + 2 * t^2 - t) * Position0 + + // (3 * t^3 - 5 * t^2 + 2) * Position1 + + // (-3 * t^3 + 4 * t^2 + t) * Position2 + + // (t^3 - t^2) * Position3) * 0.5 + t2 = t * t; + t3 = t * t2; + + P0 = XMVectorReplicate((-t3 + 2.0f * t2 - t) * 0.5f); + P1 = XMVectorReplicate((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f); + P2 = XMVectorReplicate((-3.0f * t3 + 4.0f * t2 + t) * 0.5f); + P3 = XMVectorReplicate((t3 - t2) * 0.5f); + + Result = XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(P2, Position2, Result); + Result = XMVectorMultiplyAdd(P3, Position3, Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + FLOAT t2 = t * t; + FLOAT t3 = t * t2; + + XMVECTOR P0 = _mm_set_ps1((-t3 + 2.0f * t2 - t) * 0.5f); + XMVECTOR P1 = _mm_set_ps1((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f); + XMVECTOR P2 = 
_mm_set_ps1((-3.0f * t3 + 4.0f * t2 + t) * 0.5f); + XMVECTOR P3 = _mm_set_ps1((t3 - t2) * 0.5f); + + P0 = _mm_mul_ps(P0, Position0); + P1 = _mm_mul_ps(P1, Position1); + P2 = _mm_mul_ps(P2, Position2); + P3 = _mm_mul_ps(P3, Position3); + P0 = _mm_add_ps(P0,P1); + P2 = _mm_add_ps(P2,P3); + P0 = _mm_add_ps(P0,P2); + return P0; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorCatmullRomV +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + CXMVECTOR Position3, + CXMVECTOR T +) +{ +#if defined(_XM_NO_INTRINSICS_) + float fx = T.vector4_f32[0]; + float fy = T.vector4_f32[1]; + float fz = T.vector4_f32[2]; + float fw = T.vector4_f32[3]; + XMVECTOR vResult = { + 0.5f*((-fx*fx*fx+2*fx*fx-fx)*Position0.vector4_f32[0]+ + (3*fx*fx*fx-5*fx*fx+2)*Position1.vector4_f32[0]+ + (-3*fx*fx*fx+4*fx*fx+fx)*Position2.vector4_f32[0]+ + (fx*fx*fx-fx*fx)*Position3.vector4_f32[0]), + 0.5f*((-fy*fy*fy+2*fy*fy-fy)*Position0.vector4_f32[1]+ + (3*fy*fy*fy-5*fy*fy+2)*Position1.vector4_f32[1]+ + (-3*fy*fy*fy+4*fy*fy+fy)*Position2.vector4_f32[1]+ + (fy*fy*fy-fy*fy)*Position3.vector4_f32[1]), + 0.5f*((-fz*fz*fz+2*fz*fz-fz)*Position0.vector4_f32[2]+ + (3*fz*fz*fz-5*fz*fz+2)*Position1.vector4_f32[2]+ + (-3*fz*fz*fz+4*fz*fz+fz)*Position2.vector4_f32[2]+ + (fz*fz*fz-fz*fz)*Position3.vector4_f32[2]), + 0.5f*((-fw*fw*fw+2*fw*fw-fw)*Position0.vector4_f32[3]+ + (3*fw*fw*fw-5*fw*fw+2)*Position1.vector4_f32[3]+ + (-3*fw*fw*fw+4*fw*fw+fw)*Position2.vector4_f32[3]+ + (fw*fw*fw-fw*fw)*Position3.vector4_f32[3]) + }; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Catmul2 = {2.0f,2.0f,2.0f,2.0f}; + static const XMVECTORF32 Catmul3 = {3.0f,3.0f,3.0f,3.0f}; + static const XMVECTORF32 Catmul4 = {4.0f,4.0f,4.0f,4.0f}; + static const XMVECTORF32 Catmul5 = {5.0f,5.0f,5.0f,5.0f}; + // Cache T^2 and T^3 + XMVECTOR T2 = _mm_mul_ps(T,T); + XMVECTOR T3 = _mm_mul_ps(T,T2); + // Perform the Position0 term + XMVECTOR vResult = _mm_add_ps(T2,T2); + vResult = _mm_sub_ps(vResult,T); + vResult = _mm_sub_ps(vResult,T3); + vResult = _mm_mul_ps(vResult,Position0); + // Perform the Position1 term and add + XMVECTOR vTemp = _mm_mul_ps(T3,Catmul3); + XMVECTOR vTemp2 = _mm_mul_ps(T2,Catmul5); + vTemp = _mm_sub_ps(vTemp,vTemp2); + vTemp = _mm_add_ps(vTemp,Catmul2); + vTemp = _mm_mul_ps(vTemp,Position1); + vResult = _mm_add_ps(vResult,vTemp); + // Perform the Position2 term and add + vTemp = _mm_mul_ps(T2,Catmul4); + vTemp2 = _mm_mul_ps(T3,Catmul3); + vTemp = _mm_sub_ps(vTemp,vTemp2); + vTemp = _mm_add_ps(vTemp,T); + vTemp = _mm_mul_ps(vTemp,Position2); + vResult = _mm_add_ps(vResult,vTemp); + // Position3 is the last term + T3 = _mm_sub_ps(T3,T2); + T3 = _mm_mul_ps(T3,Position3); + vResult = _mm_add_ps(vResult,T3); + // Multiply by 0.5f and exit + vResult = _mm_mul_ps(vResult,g_XMOneHalf); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorBaryCentric +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + FLOAT f, + FLOAT g +) +{ +#if defined(_XM_NO_INTRINSICS_) + + // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0) + XMVECTOR P10; + XMVECTOR P20; + XMVECTOR ScaleF; + XMVECTOR ScaleG; + XMVECTOR Result; + + P10 = XMVectorSubtract(Position1, Position0); + ScaleF = 
XMVectorReplicate(f); + + P20 = XMVectorSubtract(Position2, Position0); + ScaleG = XMVectorReplicate(g); + + Result = XMVectorMultiplyAdd(P10, ScaleF, Position0); + Result = XMVectorMultiplyAdd(P20, ScaleG, Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR R1 = _mm_sub_ps(Position1,Position0); + XMVECTOR SF = _mm_set_ps1(f); + XMVECTOR R2 = _mm_sub_ps(Position2,Position0); + XMVECTOR SG = _mm_set_ps1(g); + R1 = _mm_mul_ps(R1,SF); + R2 = _mm_mul_ps(R2,SG); + R1 = _mm_add_ps(R1,Position0); + R1 = _mm_add_ps(R1,R2); + return R1; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVectorBaryCentricV +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + CXMVECTOR F, + CXMVECTOR G +) +{ +#if defined(_XM_NO_INTRINSICS_) + + // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0) + XMVECTOR P10; + XMVECTOR P20; + XMVECTOR Result; + + P10 = XMVectorSubtract(Position1, Position0); + P20 = XMVectorSubtract(Position2, Position0); + + Result = XMVectorMultiplyAdd(P10, F, Position0); + Result = XMVectorMultiplyAdd(P20, G, Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR R1 = _mm_sub_ps(Position1,Position0); + XMVECTOR R2 = _mm_sub_ps(Position2,Position0); + R1 = _mm_mul_ps(R1,F); + R2 = _mm_mul_ps(R2,G); + R1 = _mm_add_ps(R1,Position0); + R1 = _mm_add_ps(R1,R2); + return R1; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +/**************************************************************************** + * + * 2D Vector + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector2Equal +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); +// z and w are don't care + return (((_mm_movemask_ps(vTemp)&3)==3) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAllTrue(XMVector2EqualR(V1, V2)); +#endif +} + + +//------------------------------------------------------------------------------ + +XMFINLINE UINT XMVector2EqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + UINT CR = 0; + + if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && + (V1.vector4_f32[1] == V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && + (V1.vector4_f32[1] != V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); +// z and w are don't care + int iTest = _mm_movemask_ps(vTemp)&3; + UINT CR = 0; + if (iTest==3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector2EqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return 
(((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(reinterpret_cast(&V1)[0],reinterpret_cast(&V2)[0]); + return (((_mm_movemask_ps(reinterpret_cast(&vTemp)[0])&3)==3) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAllTrue(XMVector2EqualIntR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE UINT XMVector2EqualIntR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + UINT CR = 0; + if ((V1.vector4_u32[0] == V2.vector4_u32[0]) && + (V1.vector4_u32[1] == V2.vector4_u32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) && + (V1.vector4_u32[1] != V2.vector4_u32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(reinterpret_cast(&V1)[0],reinterpret_cast(&V2)[0]); + int iTest = _mm_movemask_ps(reinterpret_cast(&vTemp)[0])&3; + UINT CR = 0; + if (iTest==3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector2NearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) +{ +#if defined(_XM_NO_INTRINSICS_) + FLOAT dx, dy; + dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]); + dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]); + return ((dx <= Epsilon.vector4_f32[0]) && + (dy <= Epsilon.vector4_f32[1])); +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1,V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp,vDelta); + vTemp = _mm_max_ps(vTemp,vDelta); + vTemp = _mm_cmple_ps(vTemp,Epsilon); + // z and w are don't care + return (((_mm_movemask_ps(vTemp)&3)==0x3) != 0); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector2NotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); +// z and w are don't care + return (((_mm_movemask_ps(vTemp)&3)!=3) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAnyFalse(XMVector2EqualR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector2NotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(reinterpret_cast(&V1)[0],reinterpret_cast(&V2)[0]); + return (((_mm_movemask_ps(reinterpret_cast(&vTemp)[0])&3)!=3) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAnyFalse(XMVector2EqualIntR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector2Greater +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1])) != 
0); + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); +// z and w are don't care + return (((_mm_movemask_ps(vTemp)&3)==3) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAllTrue(XMVector2GreaterR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE UINT XMVector2GreaterR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + UINT CR = 0; + if ((V1.vector4_f32[0] > V2.vector4_f32[0]) && + (V1.vector4_f32[1] > V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) && + (V1.vector4_f32[1] <= V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp)&3; + UINT CR = 0; + if (iTest==3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector2GreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&3)==3) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAllTrue(XMVector2GreaterOrEqualR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE UINT XMVector2GreaterOrEqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + UINT CR = 0; + if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && + (V1.vector4_f32[1] >= V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && + (V1.vector4_f32[1] < V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp)&3; + UINT CR = 0; + if (iTest == 3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector2Less +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmplt_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&3)==3) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAllTrue(XMVector2GreaterR(V2, V1)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector2LessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmple_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&3)==3) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAllTrue(XMVector2GreaterOrEqualR(V2, V1)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE 
BOOL XMVector2InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) +{ + #if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1])) != 0); + #elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2,V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + // x and y in bounds? (z and w are don't care) + return (((_mm_movemask_ps(vTemp1)&0x3)==0x3) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAllInBounds(XMVector2InBoundsR(V, Bounds)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE UINT XMVector2InBoundsR +( + FXMVECTOR V, + FXMVECTOR Bounds +) +{ +#if defined(_XM_NO_INTRINSICS_) + UINT CR = 0; + if ((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1])) + { + CR = XM_CRMASK_CR6BOUNDS; + } + return CR; + +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2,V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + // x and y in bounds? (z and w are don't care) + return ((_mm_movemask_ps(vTemp1)&0x3)==0x3) ? XM_CRMASK_CR6BOUNDS : 0; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector2IsNaN +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1])); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the exponent + __m128i vTempInf = _mm_and_si128(reinterpret_cast(&V)[0],g_XMInfinity); + // Mask off the mantissa + __m128i vTempNan = _mm_and_si128(reinterpret_cast(&V)[0],g_XMQNaNTest); + // Are any of the exponents == 0x7F800000? + vTempInf = _mm_cmpeq_epi32(vTempInf,g_XMInfinity); + // Are any of the mantissa's zero? (SSE2 doesn't have a neq test) + vTempNan = _mm_cmpeq_epi32(vTempNan,g_XMZero); + // Perform a not on the NaN test to be true on NON-zero mantissas + vTempNan = _mm_andnot_si128(vTempNan,vTempInf); + // If x or y are NaN, the signs are true after the merge above + return ((_mm_movemask_ps(reinterpret_cast(&vTempNan)[0])&3) != 0); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector2IsInfinite +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + return (XMISINF(V.vector4_f32[0]) || + XMISINF(V.vector4_f32[1])); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + __m128 vTemp = _mm_and_ps(V,g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); + // If x or z are infinity, the signs are true. 
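+    // (Only the x and y lanes matter for this 2D check: the movemask result
+    // below is masked with 3, so the z and w components are don't-cares.)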
+ return ((_mm_movemask_ps(vTemp)&3) != 0); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector2Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result.vector4_f32[0] = + Result.vector4_f32[1] = + Result.vector4_f32[2] = + Result.vector4_f32[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1]; + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V1,V2); + // vTemp has y splatted + XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,1,1,1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0)); + return vLengthSq; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector2Cross +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + FLOAT fCross = (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]); + XMVECTOR vResult = { + fCross, + fCross, + fCross, + fCross + }; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Swap x and y + XMVECTOR vResult = _mm_shuffle_ps(V2,V2,_MM_SHUFFLE(0,1,0,1)); + // Perform the muls + vResult = _mm_mul_ps(vResult,V1); + // Splat y + XMVECTOR vTemp = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(1,1,1,1)); + // Sub the values + vResult = _mm_sub_ss(vResult,vTemp); + // Splat the cross product + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,0,0,0)); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector2LengthSq +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + return XMVector2Dot(V, V); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has y splatted + XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,1,1,1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0)); + return vLengthSq; +#else + return XMVector2Dot(V, V); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector2ReciprocalLengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector2LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has y splatted + XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,1,1,1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = _mm_rsqrt_ss(vLengthSq); + vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0)); + return vLengthSq; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE 
XMVECTOR XMVector2ReciprocalLength +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector2LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has y splatted + XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,1,1,1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = _mm_sqrt_ss(vLengthSq); + vLengthSq = _mm_div_ss(g_XMOne,vLengthSq); + vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0)); + return vLengthSq; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector2LengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorSqrtEst(Result); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has y splatted + XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,1,1,1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = _mm_sqrt_ss(vLengthSq); + vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0)); + return vLengthSq; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector2Length +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorSqrt(Result); + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has y splatted + XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,1,1,1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0)); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// XMVector2NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. 
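+// In the SSE path the reciprocal length comes from _mm_rsqrt_ss, which is only
+// accurate to about 12 bits of precision; use XMVector2Normalize when a fully
+// accurate unit vector is required.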
+ +XMFINLINE XMVECTOR XMVector2NormalizeEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2ReciprocalLength(V); + Result = XMVectorMultiply(V, Result); + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has y splatted + XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,1,1,1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = _mm_rsqrt_ss(vLengthSq); + vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0)); + vLengthSq = _mm_mul_ps(vLengthSq,V); + return vLengthSq; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector2Normalize +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + FLOAT fLength; + XMVECTOR vResult; + + vResult = XMVector2Length( V ); + fLength = vResult.vector4_f32[0]; + + // Prevent divide by zero + if (fLength > 0) { + fLength = 1.0f/fLength; + } + + vResult.vector4_f32[0] = V.vector4_f32[0]*fLength; + vResult.vector4_f32[1] = V.vector4_f32[1]*fLength; + vResult.vector4_f32[2] = V.vector4_f32[2]*fLength; + vResult.vector4_f32[3] = V.vector4_f32[3]*fLength; + return vResult; + +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y only + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,1,1,1)); + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector2ClampLength +( + FXMVECTOR V, + FLOAT LengthMin, + FLOAT LengthMax +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR ClampMax; + XMVECTOR ClampMin; + + ClampMax = XMVectorReplicate(LengthMax); + ClampMin = XMVectorReplicate(LengthMin); + + return XMVector2ClampLengthV(V, ClampMin, ClampMax); + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR ClampMax = _mm_set_ps1(LengthMax); + XMVECTOR ClampMin = _mm_set_ps1(LengthMin); + return XMVector2ClampLengthV(V, ClampMin, ClampMax); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector2ClampLengthV +( + FXMVECTOR V, + FXMVECTOR LengthMin, + FXMVECTOR LengthMax +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR ClampLength; + XMVECTOR LengthSq; + XMVECTOR RcpLength; + XMVECTOR Length; + XMVECTOR Normal; + XMVECTOR 
Zero; + XMVECTOR InfiniteLength; + XMVECTOR ZeroLength; + XMVECTOR Select; + XMVECTOR ControlMax; + XMVECTOR ControlMin; + XMVECTOR Control; + XMVECTOR Result; + + XMASSERT((LengthMin.vector4_f32[1] == LengthMin.vector4_f32[0])); + XMASSERT((LengthMax.vector4_f32[1] == LengthMax.vector4_f32[0])); + XMASSERT(XMVector2GreaterOrEqual(LengthMin, XMVectorZero())); + XMASSERT(XMVector2GreaterOrEqual(LengthMax, XMVectorZero())); + XMASSERT(XMVector2GreaterOrEqual(LengthMax, LengthMin)); + + LengthSq = XMVector2LengthSq(V); + + Zero = XMVectorZero(); + + RcpLength = XMVectorReciprocalSqrt(LengthSq); + + InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); + ZeroLength = XMVectorEqual(LengthSq, Zero); + + Length = XMVectorMultiply(LengthSq, RcpLength); + + Normal = XMVectorMultiply(V, RcpLength); + + Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + + ControlMax = XMVectorGreater(Length, LengthMax); + ControlMin = XMVectorLess(Length, LengthMin); + + ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + + Result = XMVectorMultiply(Normal, ClampLength); + + // Preserve the original vector (with no precision loss) if the length falls within the given range + Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR ClampLength; + XMVECTOR LengthSq; + XMVECTOR RcpLength; + XMVECTOR Length; + XMVECTOR Normal; + XMVECTOR InfiniteLength; + XMVECTOR ZeroLength; + XMVECTOR Select; + XMVECTOR ControlMax; + XMVECTOR ControlMin; + XMVECTOR Control; + XMVECTOR Result; + + XMASSERT((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin))); + XMASSERT((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax))); + XMASSERT(XMVector2GreaterOrEqual(LengthMin, g_XMZero)); + XMASSERT(XMVector2GreaterOrEqual(LengthMax, g_XMZero)); + XMASSERT(XMVector2GreaterOrEqual(LengthMax, LengthMin)); + LengthSq = XMVector2LengthSq(V); + RcpLength = XMVectorReciprocalSqrt(LengthSq); + InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity); + ZeroLength = XMVectorEqual(LengthSq, g_XMZero); + Length = _mm_mul_ps(LengthSq, RcpLength); + Normal = _mm_mul_ps(V, RcpLength); + Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + ControlMax = XMVectorGreater(Length, LengthMax); + ControlMin = XMVectorLess(Length, LengthMin); + ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + Result = _mm_mul_ps(Normal, ClampLength); + // Preserve the original vector (with no precision loss) if the length falls within the given range + Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector2Reflect +( + FXMVECTOR Incident, + FXMVECTOR Normal +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + Result = XMVector2Dot(Incident, Normal); + Result = XMVectorAdd(Result, Result); + Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); + + return Result; 
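+    // Note: XMVectorNegativeMultiplySubtract(a, b, c) evaluates c - a * b, so the
+    // three calls above expand to Incident - (2 * dot(Incident, Normal)) * Normal.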
+ +#elif defined(_XM_SSE_INTRINSICS_) + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + XMVECTOR Result = XMVector2Dot(Incident,Normal); + Result = _mm_add_ps(Result, Result); + Result = _mm_mul_ps(Result, Normal); + Result = _mm_sub_ps(Incident,Result); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector2Refract +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FLOAT RefractionIndex +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Index; + Index = XMVectorReplicate(RefractionIndex); + return XMVector2RefractV(Incident, Normal, Index); + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR Index = _mm_set_ps1(RefractionIndex); + return XMVector2RefractV(Incident,Normal,Index); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Return the refraction of a 2D vector +XMFINLINE XMVECTOR XMVector2RefractV +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FXMVECTOR RefractionIndex +) +{ +#if defined(_XM_NO_INTRINSICS_) + float IDotN; + float RX,RY; + XMVECTOR vResult; + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + IDotN = (Incident.vector4_f32[0]*Normal.vector4_f32[0])+(Incident.vector4_f32[1]*Normal.vector4_f32[1]); + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + RY = 1.0f-(IDotN*IDotN); + RX = 1.0f-(RY*RefractionIndex.vector4_f32[0]*RefractionIndex.vector4_f32[0]); + RY = 1.0f-(RY*RefractionIndex.vector4_f32[1]*RefractionIndex.vector4_f32[1]); + if (RX>=0.0f) { + RX = (RefractionIndex.vector4_f32[0]*Incident.vector4_f32[0])-(Normal.vector4_f32[0]*((RefractionIndex.vector4_f32[0]*IDotN)+sqrtf(RX))); + } else { + RX = 0.0f; + } + if (RY>=0.0f) { + RY = (RefractionIndex.vector4_f32[1]*Incident.vector4_f32[1])-(Normal.vector4_f32[1]*((RefractionIndex.vector4_f32[1]*IDotN)+sqrtf(RY))); + } else { + RY = 0.0f; + } + vResult.vector4_f32[0] = RX; + vResult.vector4_f32[1] = RY; + vResult.vector4_f32[2] = 0.0f; + vResult.vector4_f32[3] = 0.0f; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + // Get the 2D Dot product of Incident-Normal + XMVECTOR IDotN = _mm_mul_ps(Incident,Normal); + XMVECTOR vTemp = _mm_shuffle_ps(IDotN,IDotN,_MM_SHUFFLE(1,1,1,1)); + IDotN = _mm_add_ss(IDotN,vTemp); + IDotN = _mm_shuffle_ps(IDotN,IDotN,_MM_SHUFFLE(0,0,0,0)); + // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + vTemp = _mm_mul_ps(IDotN,IDotN); + vTemp = _mm_sub_ps(g_XMOne,vTemp); + vTemp = _mm_mul_ps(vTemp,RefractionIndex); + vTemp = _mm_mul_ps(vTemp,RefractionIndex); + vTemp = _mm_sub_ps(g_XMOne,vTemp); + // If any terms are <=0, sqrt() will fail, punt to zero + XMVECTOR vMask = _mm_cmpgt_ps(vTemp,g_XMZero); + // R = RefractionIndex * IDotN + sqrt(R) + vTemp = _mm_sqrt_ps(vTemp); + XMVECTOR vResult = _mm_mul_ps(RefractionIndex,IDotN); + vTemp = _mm_add_ps(vTemp,vResult); + // Result = RefractionIndex * Incident - Normal * R + vResult = _mm_mul_ps(RefractionIndex,Incident); + vTemp = _mm_mul_ps(vTemp,Normal); + vResult = 
_mm_sub_ps(vResult,vTemp); + vResult = _mm_and_ps(vResult,vMask); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector2Orthogonal +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result.vector4_f32[0] = -V.vector4_f32[1]; + Result.vector4_f32[1] = V.vector4_f32[0]; + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(3,2,0,1)); + vResult = _mm_mul_ps(vResult,g_XMNegateX); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector2AngleBetweenNormalsEst +( + FXMVECTOR N1, + FXMVECTOR N2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR NegativeOne; + XMVECTOR One; + XMVECTOR Result; + + Result = XMVector2Dot(N1, N2); + NegativeOne = XMVectorSplatConstant(-1, 0); + One = XMVectorSplatOne(); + Result = XMVectorClamp(Result, NegativeOne, One); + Result = XMVectorACosEst(Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XMVector2Dot(N1,N2); + // Clamp to -1.0f to 1.0f + vResult = _mm_max_ps(vResult,g_XMNegativeOne); + vResult = _mm_min_ps(vResult,g_XMOne);; + vResult = XMVectorACosEst(vResult); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector2AngleBetweenNormals +( + FXMVECTOR N1, + FXMVECTOR N2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR NegativeOne; + XMVECTOR One; + XMVECTOR Result; + + Result = XMVector2Dot(N1, N2); + NegativeOne = XMVectorSplatConstant(-1, 0); + One = XMVectorSplatOne(); + Result = XMVectorClamp(Result, NegativeOne, One); + Result = XMVectorACos(Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XMVector2Dot(N1,N2); + // Clamp to -1.0f to 1.0f + vResult = _mm_max_ps(vResult,g_XMNegativeOne); + vResult = _mm_min_ps(vResult,g_XMOne);; + vResult = XMVectorACos(vResult); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector2AngleBetweenVectors +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR L1; + XMVECTOR L2; + XMVECTOR Dot; + XMVECTOR CosAngle; + XMVECTOR NegativeOne; + XMVECTOR One; + XMVECTOR Result; + + L1 = XMVector2ReciprocalLength(V1); + L2 = XMVector2ReciprocalLength(V2); + + Dot = XMVector2Dot(V1, V2); + + L1 = XMVectorMultiply(L1, L2); + + CosAngle = XMVectorMultiply(Dot, L1); + NegativeOne = XMVectorSplatConstant(-1, 0); + One = XMVectorSplatOne(); + CosAngle = XMVectorClamp(CosAngle, NegativeOne, One); + + Result = XMVectorACos(CosAngle); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR L1; + XMVECTOR L2; + XMVECTOR Dot; + XMVECTOR CosAngle; + XMVECTOR Result; + L1 = XMVector2ReciprocalLength(V1); + L2 = XMVector2ReciprocalLength(V2); + Dot = XMVector2Dot(V1, V2); + L1 = _mm_mul_ps(L1, L2); + CosAngle = _mm_mul_ps(Dot, L1); + CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne,g_XMOne); + Result = XMVectorACos(CosAngle); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + 
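+
+//------------------------------------------------------------------------------
+// Minimal usage sketch for the 2D angle helper above. The wrapper name
+// AngleBetweenFloat2 and its XMFLOAT2 arguments are illustrative assumptions,
+// not part of the library interface.
+
+XMFINLINE FLOAT AngleBetweenFloat2
+(
+    CONST XMFLOAT2* pA,
+    CONST XMFLOAT2* pB
+)
+{
+    // Load the caller's 2D data, compute the replicated angle, return lane 0
+    XMVECTOR A = XMLoadFloat2(pA);
+    XMVECTOR B = XMLoadFloat2(pB);
+    XMVECTOR Angle = XMVector2AngleBetweenVectors(A, B);
+    return XMVectorGetX(Angle);
+}
+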
+//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector2LinePointDistance +( + FXMVECTOR LinePoint1, + FXMVECTOR LinePoint2, + FXMVECTOR Point +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR PointVector; + XMVECTOR LineVector; + XMVECTOR ReciprocalLengthSq; + XMVECTOR PointProjectionScale; + XMVECTOR DistanceVector; + XMVECTOR Result; + + // Given a vector PointVector from LinePoint1 to Point and a vector + // LineVector from LinePoint1 to LinePoint2, the scaled distance + // PointProjectionScale from LinePoint1 to the perpendicular projection + // of PointVector onto the line is defined as: + // + // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector) + + PointVector = XMVectorSubtract(Point, LinePoint1); + LineVector = XMVectorSubtract(LinePoint2, LinePoint1); + + ReciprocalLengthSq = XMVector2LengthSq(LineVector); + ReciprocalLengthSq = XMVectorReciprocal(ReciprocalLengthSq); + + PointProjectionScale = XMVector2Dot(PointVector, LineVector); + PointProjectionScale = XMVectorMultiply(PointProjectionScale, ReciprocalLengthSq); + + DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale); + DistanceVector = XMVectorSubtract(PointVector, DistanceVector); + + Result = XMVector2Length(DistanceVector); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR PointVector = _mm_sub_ps(Point,LinePoint1); + XMVECTOR LineVector = _mm_sub_ps(LinePoint2,LinePoint1); + XMVECTOR ReciprocalLengthSq = XMVector2LengthSq(LineVector); + XMVECTOR vResult = XMVector2Dot(PointVector,LineVector); + vResult = _mm_div_ps(vResult,ReciprocalLengthSq); + vResult = _mm_mul_ps(vResult,LineVector); + vResult = _mm_sub_ps(PointVector,vResult); + vResult = XMVector2Length(vResult); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector2IntersectLine +( + FXMVECTOR Line1Point1, + FXMVECTOR Line1Point2, + FXMVECTOR Line2Point1, + CXMVECTOR Line2Point2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V1; + XMVECTOR V2; + XMVECTOR V3; + XMVECTOR C1; + XMVECTOR C2; + XMVECTOR Result; + CONST XMVECTOR Zero = XMVectorZero(); + + V1 = XMVectorSubtract(Line1Point2, Line1Point1); + V2 = XMVectorSubtract(Line2Point2, Line2Point1); + V3 = XMVectorSubtract(Line1Point1, Line2Point1); + + C1 = XMVector2Cross(V1, V2); + C2 = XMVector2Cross(V2, V3); + + if (XMVector2NearEqual(C1, Zero, g_XMEpsilon.v)) + { + if (XMVector2NearEqual(C2, Zero, g_XMEpsilon.v)) + { + // Coincident + Result = g_XMInfinity.v; + } + else + { + // Parallel + Result = g_XMQNaN.v; + } + } + else + { + // Intersection point = Line1Point1 + V1 * (C2 / C1) + XMVECTOR Scale; + Scale = XMVectorReciprocal(C1); + Scale = XMVectorMultiply(C2, Scale); + Result = XMVectorMultiplyAdd(V1, Scale, Line1Point1); + } + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR V1 = _mm_sub_ps(Line1Point2, Line1Point1); + XMVECTOR V2 = _mm_sub_ps(Line2Point2, Line2Point1); + XMVECTOR V3 = _mm_sub_ps(Line1Point1, Line2Point1); + // Generate the cross products + XMVECTOR C1 = XMVector2Cross(V1, V2); + XMVECTOR C2 = XMVector2Cross(V2, V3); + // If C1 is not close to epsilon, use the calculated value + XMVECTOR vResultMask = _mm_setzero_ps(); + vResultMask = _mm_sub_ps(vResultMask,C1); + vResultMask = _mm_max_ps(vResultMask,C1); + // 0xFFFFFFFF if the calculated value is to be used + vResultMask = 
_mm_cmpgt_ps(vResultMask,g_XMEpsilon); + // If C1 is close to epsilon, which fail type is it? INFINITY or NAN? + XMVECTOR vFailMask = _mm_setzero_ps(); + vFailMask = _mm_sub_ps(vFailMask,C2); + vFailMask = _mm_max_ps(vFailMask,C2); + vFailMask = _mm_cmple_ps(vFailMask,g_XMEpsilon); + XMVECTOR vFail = _mm_and_ps(vFailMask,g_XMInfinity); + vFailMask = _mm_andnot_ps(vFailMask,g_XMQNaN); + // vFail is NAN or INF + vFail = _mm_or_ps(vFail,vFailMask); + // Intersection point = Line1Point1 + V1 * (C2 / C1) + XMVECTOR vResult = _mm_div_ps(C2,C1); + vResult = _mm_mul_ps(vResult,V1); + vResult = _mm_add_ps(vResult,Line1Point1); + // Use result, or failure value + vResult = _mm_and_ps(vResult,vResultMask); + vResultMask = _mm_andnot_ps(vResultMask,vFail); + vResult = _mm_or_ps(vResult,vResultMask); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector2Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR X; + XMVECTOR Y; + XMVECTOR Result; + + Y = XMVectorSplatY(V); + X = XMVectorSplatX(V); + + Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(0,0,0,0)); + vResult = _mm_mul_ps(vResult,M.r[0]); + XMVECTOR vTemp = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1)); + vTemp = _mm_mul_ps(vTemp,M.r[1]); + vResult = _mm_add_ps(vResult,vTemp); + vResult = _mm_add_ps(vResult,M.r[3]); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMFLOAT4* XMVector2TransformStream +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + CONST XMFLOAT2* pInputStream, + size_t InputStride, + size_t VectorCount, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + XMVECTOR X; + XMVECTOR Y; + XMVECTOR Result; + size_t i; + CONST BYTE* pInputVector = (CONST BYTE*)pInputStream; + BYTE* pOutputVector = (BYTE*)pOutputStream; + + XMASSERT(pOutputStream); + XMASSERT(pInputStream); + + for (i = 0; i < VectorCount; i++) + { + V = XMLoadFloat2((const XMFLOAT2*)pInputVector); + Y = XMVectorSplatY(V); + X = XMVectorSplatX(V); +// Y = XMVectorReplicate(((XMFLOAT2*)pInputVector)->y); +// X = XMVectorReplicate(((XMFLOAT2*)pInputVector)->x); + + Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + XMStoreFloat4((XMFLOAT4*)pOutputVector, Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pOutputStream); + XMASSERT(pInputStream); + size_t i; + CONST BYTE* pInputVector = (CONST BYTE*)pInputStream; + BYTE* pOutputVector = (BYTE*)pOutputStream; + + for (i = 0; i < VectorCount; i++) + { + XMVECTOR X = _mm_load_ps1(&reinterpret_cast(pInputVector)->x); + XMVECTOR vResult = _mm_load_ps1(&reinterpret_cast(pInputVector)->y); + vResult = _mm_mul_ps(vResult,M.r[1]); + vResult = _mm_add_ps(vResult,M.r[3]); + X = _mm_mul_ps(X,M.r[0]); + vResult = _mm_add_ps(vResult,X); + _mm_storeu_ps(reinterpret_cast(pOutputVector),vResult); + pInputVector += InputStride; + pOutputVector += OutputStride; + } + return pOutputStream; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + 
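+
+//------------------------------------------------------------------------------
+// Minimal usage sketch for XMVector2TransformStream, assuming tightly packed,
+// caller-owned arrays; the wrapper name TransformPoints2 and its parameter
+// names are illustrative assumptions.
+
+XMFINLINE XMFLOAT4* TransformPoints2
+(
+    XMFLOAT4* pOut,
+    CONST XMFLOAT2* pIn,
+    size_t Count,
+    CXMMATRIX M
+)
+{
+    // Strides are the element sizes because both arrays are tightly packed
+    return XMVector2TransformStream(pOut, sizeof(XMFLOAT4),
+                                    pIn, sizeof(XMFLOAT2),
+                                    Count, M);
+}
+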
+//------------------------------------------------------------------------------ + +XMINLINE XMFLOAT4* XMVector2TransformStreamNC +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + CONST XMFLOAT2* pInputStream, + size_t InputStride, + size_t VectorCount, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_SSE_INTRINSICS_) + return XMVector2TransformStream( pOutputStream, OutputStride, pInputStream, InputStride, VectorCount, M ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector2TransformCoord +( + FXMVECTOR V, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR X; + XMVECTOR Y; + XMVECTOR InverseW; + XMVECTOR Result; + + Y = XMVectorSplatY(V); + X = XMVectorSplatX(V); + + Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + InverseW = XMVectorSplatW(Result); + InverseW = XMVectorReciprocal(InverseW); + + Result = XMVectorMultiply(Result, InverseW); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(0,0,0,0)); + vResult = _mm_mul_ps(vResult,M.r[0]); + XMVECTOR vTemp = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1)); + vTemp = _mm_mul_ps(vTemp,M.r[1]); + vResult = _mm_add_ps(vResult,vTemp); + vResult = _mm_add_ps(vResult,M.r[3]); + vTemp = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,3,3,3)); + vResult = _mm_div_ps(vResult,vTemp); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMFLOAT2* XMVector2TransformCoordStream +( + XMFLOAT2* pOutputStream, + size_t OutputStride, + CONST XMFLOAT2* pInputStream, + size_t InputStride, + size_t VectorCount, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + XMVECTOR X; + XMVECTOR Y; + XMVECTOR InverseW; + XMVECTOR Result; + size_t i; + CONST BYTE* pInputVector = (CONST BYTE*)pInputStream; + BYTE* pOutputVector = (BYTE*)pOutputStream; + + XMASSERT(pOutputStream); + XMASSERT(pInputStream); + + for (i = 0; i < VectorCount; i++) + { + V = XMLoadFloat2((const XMFLOAT2*)pInputVector); + Y = XMVectorSplatY(V); + X = XMVectorSplatX(V); +// Y = XMVectorReplicate(((XMFLOAT2*)pInputVector)->y); +// X = XMVectorReplicate(((XMFLOAT2*)pInputVector)->x); + + Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + InverseW = XMVectorSplatW(Result); + InverseW = XMVectorReciprocal(InverseW); + + Result = XMVectorMultiply(Result, InverseW); + + XMStoreFloat2((XMFLOAT2*)pOutputVector, Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pOutputStream); + XMASSERT(pInputStream); + size_t i; + CONST BYTE *pInputVector = (CONST BYTE*)pInputStream; + BYTE *pOutputVector = (BYTE*)pOutputStream; + + for (i = 0; i < VectorCount; i++) + { + XMVECTOR X = _mm_load_ps1(&reinterpret_cast(pInputVector)->x); + XMVECTOR vResult = _mm_load_ps1(&reinterpret_cast(pInputVector)->y); + vResult = _mm_mul_ps(vResult,M.r[1]); + vResult = _mm_add_ps(vResult,M.r[3]); + X = _mm_mul_ps(X,M.r[0]); + vResult = _mm_add_ps(vResult,X); + X = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,3,3,3)); + vResult = _mm_div_ps(vResult,X); + 
_mm_store_sd(reinterpret_cast(pOutputVector),reinterpret_cast<__m128d *>(&vResult)[0]); + pInputVector += InputStride; + pOutputVector += OutputStride; + } + return pOutputStream; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector2TransformNormal +( + FXMVECTOR V, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR X; + XMVECTOR Y; + XMVECTOR Result; + + Y = XMVectorSplatY(V); + X = XMVectorSplatX(V); + + Result = XMVectorMultiply(Y, M.r[1]); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(0,0,0,0)); + vResult = _mm_mul_ps(vResult,M.r[0]); + XMVECTOR vTemp = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1)); + vTemp = _mm_mul_ps(vTemp,M.r[1]); + vResult = _mm_add_ps(vResult,vTemp); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMFLOAT2* XMVector2TransformNormalStream +( + XMFLOAT2* pOutputStream, + size_t OutputStride, + CONST XMFLOAT2* pInputStream, + size_t InputStride, + size_t VectorCount, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + XMVECTOR X; + XMVECTOR Y; + XMVECTOR Result; + size_t i; + CONST BYTE* pInputVector = (CONST BYTE*)pInputStream; + BYTE* pOutputVector = (BYTE*)pOutputStream; + + XMASSERT(pOutputStream); + XMASSERT(pInputStream); + + for (i = 0; i < VectorCount; i++) + { + V = XMLoadFloat2((const XMFLOAT2*)pInputVector); + Y = XMVectorSplatY(V); + X = XMVectorSplatX(V); +// Y = XMVectorReplicate(((XMFLOAT2*)pInputVector)->y); +// X = XMVectorReplicate(((XMFLOAT2*)pInputVector)->x); + + Result = XMVectorMultiply(Y, M.r[1]); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + XMStoreFloat2((XMFLOAT2*)pOutputVector, Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pOutputStream); + XMASSERT(pInputStream); + size_t i; + CONST BYTE*pInputVector = (CONST BYTE*)pInputStream; + BYTE *pOutputVector = (BYTE*)pOutputStream; + for (i = 0; i < VectorCount; i++) + { + XMVECTOR X = _mm_load_ps1(&reinterpret_cast(pInputVector)->x); + XMVECTOR vResult = _mm_load_ps1(&reinterpret_cast(pInputVector)->y); + vResult = _mm_mul_ps(vResult,M.r[1]); + X = _mm_mul_ps(X,M.r[0]); + vResult = _mm_add_ps(vResult,X); + _mm_store_sd(reinterpret_cast(pOutputVector),reinterpret_cast(&vResult)[0]); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +/**************************************************************************** + * + * 3D Vector + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector3Equal +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2])) != 0); 
+#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)==7) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAllTrue(XMVector3EqualR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE UINT XMVector3EqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + UINT CR = 0; + if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && + (V1.vector4_f32[1] == V2.vector4_f32[1]) && + (V1.vector4_f32[2] == V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && + (V1.vector4_f32[1] != V2.vector4_f32[1]) && + (V1.vector4_f32[2] != V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp)&7; + UINT CR = 0; + if (iTest==7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector3EqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(reinterpret_cast(&V1)[0],reinterpret_cast(&V2)[0]); + return (((_mm_movemask_ps(reinterpret_cast(&vTemp)[0])&7)==7) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAllTrue(XMVector3EqualIntR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE UINT XMVector3EqualIntR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + UINT CR = 0; + if ((V1.vector4_u32[0] == V2.vector4_u32[0]) && + (V1.vector4_u32[1] == V2.vector4_u32[1]) && + (V1.vector4_u32[2] == V2.vector4_u32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) && + (V1.vector4_u32[1] != V2.vector4_u32[1]) && + (V1.vector4_u32[2] != V2.vector4_u32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(reinterpret_cast(&V1)[0],reinterpret_cast(&V2)[0]); + int iTemp = _mm_movemask_ps(reinterpret_cast(&vTemp)[0])&7; + UINT CR = 0; + if (iTemp==7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTemp) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector3NearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) +{ +#if defined(_XM_NO_INTRINSICS_) + FLOAT dx, dy, dz; + + dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]); + dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]); + dz = fabsf(V1.vector4_f32[2]-V2.vector4_f32[2]); + return (((dx <= Epsilon.vector4_f32[0]) && + (dy <= Epsilon.vector4_f32[1]) && + (dz <= Epsilon.vector4_f32[2])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1,V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp,vDelta); + vTemp = _mm_max_ps(vTemp,vDelta); + vTemp = _mm_cmple_ps(vTemp,Epsilon); + // w is don't care + return 
(((_mm_movemask_ps(vTemp)&7)==0x7) != 0); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector3NotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)!=7) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAnyFalse(XMVector3EqualR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector3NotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(reinterpret_cast(&V1)[0],reinterpret_cast(&V2)[0]); + return (((_mm_movemask_ps(reinterpret_cast(&vTemp)[0])&7)!=7) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAnyFalse(XMVector3EqualIntR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector3Greater +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)==7) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAllTrue(XMVector3GreaterR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE UINT XMVector3GreaterR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + UINT CR = 0; + if ((V1.vector4_f32[0] > V2.vector4_f32[0]) && + (V1.vector4_f32[1] > V2.vector4_f32[1]) && + (V1.vector4_f32[2] > V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) && + (V1.vector4_f32[1] <= V2.vector4_f32[1]) && + (V1.vector4_f32[2] <= V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); + UINT CR = 0; + int iTest = _mm_movemask_ps(vTemp)&7; + if (iTest==7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector3GreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)==7) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAllTrue(XMVector3GreaterOrEqualR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE UINT XMVector3GreaterOrEqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + 
+ UINT CR = 0; + if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && + (V1.vector4_f32[1] >= V2.vector4_f32[1]) && + (V1.vector4_f32[2] >= V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && + (V1.vector4_f32[1] < V2.vector4_f32[1]) && + (V1.vector4_f32[2] < V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + UINT CR = 0; + int iTest = _mm_movemask_ps(vTemp)&7; + if (iTest==7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector3Less +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmplt_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)==7) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAllTrue(XMVector3GreaterR(V2, V1)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector3LessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmple_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)==7) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAllTrue(XMVector3GreaterOrEqualR(V2, V1)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector3InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && + (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2,V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + // x,y and z in bounds? 
(w is don't care) + return (((_mm_movemask_ps(vTemp1)&0x7)==0x7) != 0); +#else + return XMComparisonAllInBounds(XMVector3InBoundsR(V, Bounds)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE UINT XMVector3InBoundsR +( + FXMVECTOR V, + FXMVECTOR Bounds +) +{ +#if defined(_XM_NO_INTRINSICS_) + UINT CR = 0; + if ((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && + (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2])) + { + CR = XM_CRMASK_CR6BOUNDS; + } + return CR; + +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2,V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + // x,y and z in bounds? (w is don't care) + return ((_mm_movemask_ps(vTemp1)&0x7)==0x7) ? XM_CRMASK_CR6BOUNDS : 0; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector3IsNaN +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1]) || + XMISNAN(V.vector4_f32[2])); + +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the exponent + __m128i vTempInf = _mm_and_si128(reinterpret_cast(&V)[0],g_XMInfinity); + // Mask off the mantissa + __m128i vTempNan = _mm_and_si128(reinterpret_cast(&V)[0],g_XMQNaNTest); + // Are any of the exponents == 0x7F800000? + vTempInf = _mm_cmpeq_epi32(vTempInf,g_XMInfinity); + // Are any of the mantissa's zero? (SSE2 doesn't have a neq test) + vTempNan = _mm_cmpeq_epi32(vTempNan,g_XMZero); + // Perform a not on the NaN test to be true on NON-zero mantissas + vTempNan = _mm_andnot_si128(vTempNan,vTempInf); + // If x, y or z are NaN, the signs are true after the merge above + return ((_mm_movemask_ps(reinterpret_cast(&vTempNan)[0])&7) != 0); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector3IsInfinite +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISINF(V.vector4_f32[0]) || + XMISINF(V.vector4_f32[1]) || + XMISINF(V.vector4_f32[2])); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + __m128 vTemp = _mm_and_ps(V,g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); + // If x,y or z are infinity, the signs are true. 
+ return ((_mm_movemask_ps(vTemp)&7) != 0); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector3Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + FLOAT fValue = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2]; + XMVECTOR vResult = { + fValue, + fValue, + fValue, + fValue + }; + return vResult; + +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(V1,V2); + // x=Dot.vector4_f32[1], y=Dot.vector4_f32[2] + XMVECTOR vTemp = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(2,1,2,1)); + // Result.vector4_f32[0] = x+y + vDot = _mm_add_ss(vDot,vTemp); + // x=Dot.vector4_f32[2] + vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1)); + // Result.vector4_f32[0] = (x+y)+z + vDot = _mm_add_ss(vDot,vTemp); + // Splat x + return _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(0,0,0,0)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector3Cross +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult = { + (V1.vector4_f32[1] * V2.vector4_f32[2]) - (V1.vector4_f32[2] * V2.vector4_f32[1]), + (V1.vector4_f32[2] * V2.vector4_f32[0]) - (V1.vector4_f32[0] * V2.vector4_f32[2]), + (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]), + 0.0f + }; + return vResult; + +#elif defined(_XM_SSE_INTRINSICS_) + // y1,z1,x1,w1 + XMVECTOR vTemp1 = _mm_shuffle_ps(V1,V1,_MM_SHUFFLE(3,0,2,1)); + // z2,x2,y2,w2 + XMVECTOR vTemp2 = _mm_shuffle_ps(V2,V2,_MM_SHUFFLE(3,1,0,2)); + // Perform the left operation + XMVECTOR vResult = _mm_mul_ps(vTemp1,vTemp2); + // z1,x1,y1,w1 + vTemp1 = _mm_shuffle_ps(vTemp1,vTemp1,_MM_SHUFFLE(3,0,2,1)); + // y2,z2,x2,w2 + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(3,1,0,2)); + // Perform the right operation + vTemp1 = _mm_mul_ps(vTemp1,vTemp2); + // Subract the right from left, and return answer + vResult = _mm_sub_ps(vResult,vTemp1); + // Set w to zero + return _mm_and_ps(vResult,g_XMMask3); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector3LengthSq +( + FXMVECTOR V +) +{ + return XMVector3Dot(V, V); +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector3ReciprocalLengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and y + XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,2,1,2)); + // x+z, y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // y,y,y,y + vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1)); + // x+z+y,??,??,?? 
+ vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // Splat the length squared + vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0)); + // Get the reciprocal + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector3ReciprocalLength +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(V,V); + // x=Dot.y, y=Dot.z + XMVECTOR vTemp = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(2,1,2,1)); + // Result.x = x+y + vDot = _mm_add_ss(vDot,vTemp); + // x=Dot.z + vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1)); + // Result.x = (x+y)+z + vDot = _mm_add_ss(vDot,vTemp); + // Splat x + vDot = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(0,0,0,0)); + // Get the reciprocal + vDot = _mm_sqrt_ps(vDot); + // Get the reciprocal + vDot = _mm_div_ps(g_XMOne,vDot); + return vDot; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector3LengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorSqrtEst(Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and y + XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,2,1,2)); + // x+z, y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // y,y,y,y + vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1)); + // x+z+y,??,??,?? + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // Splat the length squared + vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector3Length +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorSqrt(Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and y + XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,2,1,2)); + // x+z, y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // y,y,y,y + vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1)); + // x+z+y,??,??,?? + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // Splat the length squared + vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// XMVector3NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. 
+ +XMFINLINE XMVECTOR XMVector3NormalizeEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector3ReciprocalLength(V); + Result = XMVectorMultiply(V, Result); + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(V,V); + // x=Dot.y, y=Dot.z + XMVECTOR vTemp = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(2,1,2,1)); + // Result.x = x+y + vDot = _mm_add_ss(vDot,vTemp); + // x=Dot.z + vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1)); + // Result.x = (x+y)+z + vDot = _mm_add_ss(vDot,vTemp); + // Splat x + vDot = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(0,0,0,0)); + // Get the reciprocal + vDot = _mm_rsqrt_ps(vDot); + // Perform the normalization + vDot = _mm_mul_ps(vDot,V); + return vDot; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector3Normalize +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + FLOAT fLength; + XMVECTOR vResult; + + vResult = XMVector3Length( V ); + fLength = vResult.vector4_f32[0]; + + // Prevent divide by zero + if (fLength > 0) { + fLength = 1.0f/fLength; + } + + vResult.vector4_f32[0] = V.vector4_f32[0]*fLength; + vResult.vector4_f32[1] = V.vector4_f32[1]*fLength; + vResult.vector4_f32[2] = V.vector4_f32[2]*fLength; + vResult.vector4_f32[3] = V.vector4_f32[3]*fLength; + return vResult; + +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z only + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(2,1,2,1)); + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1)); + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector3ClampLength +( + FXMVECTOR V, + FLOAT LengthMin, + FLOAT LengthMax +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR ClampMax; + XMVECTOR ClampMin; + + ClampMax = XMVectorReplicate(LengthMax); + ClampMin = XMVectorReplicate(LengthMin); + + return XMVector3ClampLengthV(V, ClampMin, ClampMax); + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR ClampMax = _mm_set_ps1(LengthMax); + XMVECTOR ClampMin = _mm_set_ps1(LengthMin); + return XMVector3ClampLengthV(V,ClampMin,ClampMax); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR 
XMVector3ClampLengthV +( + FXMVECTOR V, + FXMVECTOR LengthMin, + FXMVECTOR LengthMax +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR ClampLength; + XMVECTOR LengthSq; + XMVECTOR RcpLength; + XMVECTOR Length; + XMVECTOR Normal; + XMVECTOR Zero; + XMVECTOR InfiniteLength; + XMVECTOR ZeroLength; + XMVECTOR Select; + XMVECTOR ControlMax; + XMVECTOR ControlMin; + XMVECTOR Control; + XMVECTOR Result; + + XMASSERT((LengthMin.vector4_f32[1] == LengthMin.vector4_f32[0]) && (LengthMin.vector4_f32[2] == LengthMin.vector4_f32[0])); + XMASSERT((LengthMax.vector4_f32[1] == LengthMax.vector4_f32[0]) && (LengthMax.vector4_f32[2] == LengthMax.vector4_f32[0])); + XMASSERT(XMVector3GreaterOrEqual(LengthMin, XMVectorZero())); + XMASSERT(XMVector3GreaterOrEqual(LengthMax, XMVectorZero())); + XMASSERT(XMVector3GreaterOrEqual(LengthMax, LengthMin)); + + LengthSq = XMVector3LengthSq(V); + + Zero = XMVectorZero(); + + RcpLength = XMVectorReciprocalSqrt(LengthSq); + + InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); + ZeroLength = XMVectorEqual(LengthSq, Zero); + + Normal = XMVectorMultiply(V, RcpLength); + + Length = XMVectorMultiply(LengthSq, RcpLength); + + Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + + ControlMax = XMVectorGreater(Length, LengthMax); + ControlMin = XMVectorLess(Length, LengthMin); + + ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + + Result = XMVectorMultiply(Normal, ClampLength); + + // Preserve the original vector (with no precision loss) if the length falls within the given range + Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR ClampLength; + XMVECTOR LengthSq; + XMVECTOR RcpLength; + XMVECTOR Length; + XMVECTOR Normal; + XMVECTOR InfiniteLength; + XMVECTOR ZeroLength; + XMVECTOR Select; + XMVECTOR ControlMax; + XMVECTOR ControlMin; + XMVECTOR Control; + XMVECTOR Result; + + XMASSERT((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin))); + XMASSERT((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax))); + XMASSERT(XMVector3GreaterOrEqual(LengthMin, g_XMZero)); + XMASSERT(XMVector3GreaterOrEqual(LengthMax, g_XMZero)); + XMASSERT(XMVector3GreaterOrEqual(LengthMax, LengthMin)); + + LengthSq = XMVector3LengthSq(V); + RcpLength = XMVectorReciprocalSqrt(LengthSq); + InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity); + ZeroLength = XMVectorEqual(LengthSq,g_XMZero); + Normal = _mm_mul_ps(V, RcpLength); + Length = _mm_mul_ps(LengthSq, RcpLength); + Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + ControlMax = XMVectorGreater(Length, LengthMax); + ControlMin = XMVectorLess(Length, LengthMin); + ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + Result = _mm_mul_ps(Normal, ClampLength); + // Preserve the original vector (with no precision loss) if the length falls within the given range + Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // 
_XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector3Reflect +( + FXMVECTOR Incident, + FXMVECTOR Normal +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + Result = XMVector3Dot(Incident, Normal); + Result = XMVectorAdd(Result, Result); + Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + XMVECTOR Result = XMVector3Dot(Incident, Normal); + Result = _mm_add_ps(Result, Result); + Result = _mm_mul_ps(Result, Normal); + Result = _mm_sub_ps(Incident,Result); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector3Refract +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FLOAT RefractionIndex +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Index; + Index = XMVectorReplicate(RefractionIndex); + return XMVector3RefractV(Incident, Normal, Index); + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR Index = _mm_set_ps1(RefractionIndex); + return XMVector3RefractV(Incident,Normal,Index); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector3RefractV +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FXMVECTOR RefractionIndex +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR IDotN; + XMVECTOR R; + CONST XMVECTOR Zero = XMVectorZero(); + + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + + IDotN = XMVector3Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); + R = XMVectorMultiply(R, RefractionIndex); + R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); + + if (XMVector4LessOrEqual(R, Zero)) + { + // Total internal reflection + return Zero; + } + else + { + XMVECTOR Result; + + // R = RefractionIndex * IDotN + sqrt(R) + R = XMVectorSqrt(R); + R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); + + // Result = RefractionIndex * Incident - Normal * R + Result = XMVectorMultiply(RefractionIndex, Incident); + Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); + + return Result; + } + +#elif defined(_XM_SSE_INTRINSICS_) + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + XMVECTOR IDotN = XMVector3Dot(Incident, Normal); + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR R = _mm_mul_ps(IDotN, IDotN); + R = _mm_sub_ps(g_XMOne,R); + R = _mm_mul_ps(R, RefractionIndex); + R = _mm_mul_ps(R, RefractionIndex); + R = _mm_sub_ps(g_XMOne,R); + + XMVECTOR vResult = _mm_cmple_ps(R,g_XMZero); + if (_mm_movemask_ps(vResult)==0x0f) + { + // Total internal reflection + vResult = g_XMZero; + } + else + { + // R = RefractionIndex * IDotN + sqrt(R) + R = _mm_sqrt_ps(R); + vResult = _mm_mul_ps(RefractionIndex,IDotN); + R = _mm_add_ps(R,vResult); + // Result = RefractionIndex * Incident 
- Normal * R + vResult = _mm_mul_ps(RefractionIndex, Incident); + R = _mm_mul_ps(R,Normal); + vResult = _mm_sub_ps(vResult,R); + } + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector3Orthogonal +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR NegativeV; + XMVECTOR Z, YZYY; + XMVECTOR ZIsNegative, YZYYIsNegative; + XMVECTOR S, D; + XMVECTOR R0, R1; + XMVECTOR Select; + XMVECTOR Zero; + XMVECTOR Result; + static CONST XMVECTORU32 Permute1X0X0X0X = {XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X}; + static CONST XMVECTORU32 Permute0Y0Z0Y0Y= {XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_0Y}; + + Zero = XMVectorZero(); + Z = XMVectorSplatZ(V); + YZYY = XMVectorPermute(V, V, Permute0Y0Z0Y0Y.v); + + NegativeV = XMVectorSubtract(Zero, V); + + ZIsNegative = XMVectorLess(Z, Zero); + YZYYIsNegative = XMVectorLess(YZYY, Zero); + + S = XMVectorAdd(YZYY, Z); + D = XMVectorSubtract(YZYY, Z); + + Select = XMVectorEqualInt(ZIsNegative, YZYYIsNegative); + + R0 = XMVectorPermute(NegativeV, S, Permute1X0X0X0X.v); + R1 = XMVectorPermute(V, D, Permute1X0X0X0X.v); + + Result = XMVectorSelect(R1, R0, Select); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR NegativeV; + XMVECTOR Z, YZYY; + XMVECTOR ZIsNegative, YZYYIsNegative; + XMVECTOR S, D; + XMVECTOR R0, R1; + XMVECTOR Select; + XMVECTOR Zero; + XMVECTOR Result; + static CONST XMVECTORI32 Permute1X0X0X0X = {XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X}; + static CONST XMVECTORI32 Permute0Y0Z0Y0Y= {XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_0Y}; + + Zero = XMVectorZero(); + Z = XMVectorSplatZ(V); + YZYY = XMVectorPermute(V, V, Permute0Y0Z0Y0Y); + + NegativeV = _mm_sub_ps(Zero, V); + + ZIsNegative = XMVectorLess(Z, Zero); + YZYYIsNegative = XMVectorLess(YZYY, Zero); + + S = _mm_add_ps(YZYY, Z); + D = _mm_sub_ps(YZYY, Z); + + Select = XMVectorEqualInt(ZIsNegative, YZYYIsNegative); + + R0 = XMVectorPermute(NegativeV, S, Permute1X0X0X0X); + R1 = XMVectorPermute(V, D,Permute1X0X0X0X); + Result = XMVectorSelect(R1, R0, Select); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector3AngleBetweenNormalsEst +( + FXMVECTOR N1, + FXMVECTOR N2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + XMVECTOR NegativeOne; + XMVECTOR One; + + Result = XMVector3Dot(N1, N2); + NegativeOne = XMVectorSplatConstant(-1, 0); + One = XMVectorSplatOne(); + Result = XMVectorClamp(Result, NegativeOne, One); + Result = XMVectorACosEst(Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XMVector3Dot(N1,N2); + // Clamp to -1.0f to 1.0f + vResult = _mm_max_ps(vResult,g_XMNegativeOne); + vResult = _mm_min_ps(vResult,g_XMOne); + vResult = XMVectorACosEst(vResult); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector3AngleBetweenNormals +( + FXMVECTOR N1, + FXMVECTOR N2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + XMVECTOR NegativeOne; + XMVECTOR One; + + Result = XMVector3Dot(N1, N2); + NegativeOne = XMVectorSplatConstant(-1, 0); + One = XMVectorSplatOne(); + Result = XMVectorClamp(Result, NegativeOne, One); 
+ Result = XMVectorACos(Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XMVector3Dot(N1,N2); + // Clamp to -1.0f to 1.0f + vResult = _mm_max_ps(vResult,g_XMNegativeOne); + vResult = _mm_min_ps(vResult,g_XMOne); + vResult = XMVectorACos(vResult); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector3AngleBetweenVectors +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR L1; + XMVECTOR L2; + XMVECTOR Dot; + XMVECTOR CosAngle; + XMVECTOR NegativeOne; + XMVECTOR One; + XMVECTOR Result; + + L1 = XMVector3ReciprocalLength(V1); + L2 = XMVector3ReciprocalLength(V2); + + Dot = XMVector3Dot(V1, V2); + + L1 = XMVectorMultiply(L1, L2); + + NegativeOne = XMVectorSplatConstant(-1, 0); + One = XMVectorSplatOne(); + + CosAngle = XMVectorMultiply(Dot, L1); + + CosAngle = XMVectorClamp(CosAngle, NegativeOne, One); + + Result = XMVectorACos(CosAngle); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR L1; + XMVECTOR L2; + XMVECTOR Dot; + XMVECTOR CosAngle; + XMVECTOR Result; + + L1 = XMVector3ReciprocalLength(V1); + L2 = XMVector3ReciprocalLength(V2); + Dot = XMVector3Dot(V1, V2); + L1 = _mm_mul_ps(L1, L2); + CosAngle = _mm_mul_ps(Dot, L1); + CosAngle = XMVectorClamp(CosAngle,g_XMNegativeOne,g_XMOne); + Result = XMVectorACos(CosAngle); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector3LinePointDistance +( + FXMVECTOR LinePoint1, + FXMVECTOR LinePoint2, + FXMVECTOR Point +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR PointVector; + XMVECTOR LineVector; + XMVECTOR ReciprocalLengthSq; + XMVECTOR PointProjectionScale; + XMVECTOR DistanceVector; + XMVECTOR Result; + + // Given a vector PointVector from LinePoint1 to Point and a vector + // LineVector from LinePoint1 to LinePoint2, the scaled distance + // PointProjectionScale from LinePoint1 to the perpendicular projection + // of PointVector onto the line is defined as: + // + // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector) + + PointVector = XMVectorSubtract(Point, LinePoint1); + LineVector = XMVectorSubtract(LinePoint2, LinePoint1); + + ReciprocalLengthSq = XMVector3LengthSq(LineVector); + ReciprocalLengthSq = XMVectorReciprocal(ReciprocalLengthSq); + + PointProjectionScale = XMVector3Dot(PointVector, LineVector); + PointProjectionScale = XMVectorMultiply(PointProjectionScale, ReciprocalLengthSq); + + DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale); + DistanceVector = XMVectorSubtract(PointVector, DistanceVector); + + Result = XMVector3Length(DistanceVector); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR PointVector = _mm_sub_ps(Point,LinePoint1); + XMVECTOR LineVector = _mm_sub_ps(LinePoint2,LinePoint1); + XMVECTOR ReciprocalLengthSq = XMVector3LengthSq(LineVector); + XMVECTOR vResult = XMVector3Dot(PointVector,LineVector); + vResult = _mm_div_ps(vResult,ReciprocalLengthSq); + vResult = _mm_mul_ps(vResult,LineVector); + vResult = _mm_sub_ps(PointVector,vResult); + vResult = XMVector3Length(vResult); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE VOID 
XMVector3ComponentsFromNormal +( + XMVECTOR* pParallel, + XMVECTOR* pPerpendicular, + FXMVECTOR V, + FXMVECTOR Normal +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Parallel; + XMVECTOR Scale; + + XMASSERT(pParallel); + XMASSERT(pPerpendicular); + + Scale = XMVector3Dot(V, Normal); + + Parallel = XMVectorMultiply(Normal, Scale); + + *pParallel = Parallel; + *pPerpendicular = XMVectorSubtract(V, Parallel); + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pParallel); + XMASSERT(pPerpendicular); + XMVECTOR Scale = XMVector3Dot(V, Normal); + XMVECTOR Parallel = _mm_mul_ps(Normal,Scale); + *pParallel = Parallel; + *pPerpendicular = _mm_sub_ps(V,Parallel); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Transform a vector using a rotation expressed as a unit quaternion + +XMFINLINE XMVECTOR XMVector3Rotate +( + FXMVECTOR V, + FXMVECTOR RotationQuaternion +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR A; + XMVECTOR Q; + XMVECTOR Result; + + A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v); + Q = XMQuaternionConjugate(RotationQuaternion); + Result = XMQuaternionMultiply(Q, A); + Result = XMQuaternionMultiply(Result, RotationQuaternion); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR A; + XMVECTOR Q; + XMVECTOR Result; + + A = _mm_and_ps(V,g_XMMask3); + Q = XMQuaternionConjugate(RotationQuaternion); + Result = XMQuaternionMultiply(Q, A); + Result = XMQuaternionMultiply(Result, RotationQuaternion); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Transform a vector using the inverse of a rotation expressed as a unit quaternion + +XMFINLINE XMVECTOR XMVector3InverseRotate +( + FXMVECTOR V, + FXMVECTOR RotationQuaternion +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR A; + XMVECTOR Q; + XMVECTOR Result; + + A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v); + Result = XMQuaternionMultiply(RotationQuaternion, A); + Q = XMQuaternionConjugate(RotationQuaternion); + Result = XMQuaternionMultiply(Result, Q); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR A; + XMVECTOR Q; + XMVECTOR Result; + A = _mm_and_ps(V,g_XMMask3); + Result = XMQuaternionMultiply(RotationQuaternion, A); + Q = XMQuaternionConjugate(RotationQuaternion); + Result = XMQuaternionMultiply(Result, Q); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector3Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR X; + XMVECTOR Y; + XMVECTOR Z; + XMVECTOR Result; + + Z = XMVectorSplatZ(V); + Y = XMVectorSplatY(V); + X = XMVectorSplatX(V); + + Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(0,0,0,0)); + vResult = _mm_mul_ps(vResult,M.r[0]); + XMVECTOR vTemp = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1)); + vTemp = _mm_mul_ps(vTemp,M.r[1]); + vResult = _mm_add_ps(vResult,vTemp); + vTemp = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2)); + vTemp = _mm_mul_ps(vTemp,M.r[2]); + vResult = _mm_add_ps(vResult,vTemp); + vResult = _mm_add_ps(vResult,M.r[3]); + return 
vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMFLOAT4* XMVector3TransformStream +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + CONST XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + XMVECTOR X; + XMVECTOR Y; + XMVECTOR Z; + XMVECTOR Result; + size_t i; + CONST BYTE* pInputVector = (CONST BYTE*)pInputStream; + BYTE* pOutputVector = (BYTE*)pOutputStream; + + XMASSERT(pOutputStream); + XMASSERT(pInputStream); + + for (i = 0; i < VectorCount; i++) + { + V = XMLoadFloat3((const XMFLOAT3*)pInputVector); + Z = XMVectorSplatZ(V); + Y = XMVectorSplatY(V); + X = XMVectorSplatX(V); + + Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + XMStoreFloat4((XMFLOAT4*)pOutputVector, Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pOutputStream); + XMASSERT(pInputStream); + size_t i; + CONST BYTE* pInputVector = (CONST BYTE*)pInputStream; + BYTE* pOutputVector = (BYTE*)pOutputStream; + + for (i = 0; i < VectorCount; i++) + { + XMVECTOR X = _mm_load_ps1(&reinterpret_cast(pInputVector)->x); + XMVECTOR Y = _mm_load_ps1(&reinterpret_cast(pInputVector)->y); + XMVECTOR vResult = _mm_load_ps1(&reinterpret_cast(pInputVector)->z); + vResult = _mm_mul_ps(vResult,M.r[2]); + vResult = _mm_add_ps(vResult,M.r[3]); + Y = _mm_mul_ps(Y,M.r[1]); + vResult = _mm_add_ps(vResult,Y); + X = _mm_mul_ps(X,M.r[0]); + vResult = _mm_add_ps(vResult,X); + _mm_storeu_ps(reinterpret_cast(pOutputVector),vResult); + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMFLOAT4* XMVector3TransformStreamNC +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + CONST XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_SSE_INTRINSICS_) + return XMVector3TransformStream( pOutputStream, OutputStride, pInputStream, InputStride, VectorCount, M ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector3TransformCoord +( + FXMVECTOR V, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR X; + XMVECTOR Y; + XMVECTOR Z; + XMVECTOR InverseW; + XMVECTOR Result; + + Z = XMVectorSplatZ(V); + Y = XMVectorSplatY(V); + X = XMVectorSplatX(V); + + Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + InverseW = XMVectorSplatW(Result); + InverseW = XMVectorReciprocal(InverseW); + + Result = XMVectorMultiply(Result, InverseW); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(0,0,0,0)); + vResult = _mm_mul_ps(vResult,M.r[0]); + XMVECTOR vTemp = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1)); + vTemp = _mm_mul_ps(vTemp,M.r[1]); + vResult = _mm_add_ps(vResult,vTemp); + vTemp = 
_mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2)); + vTemp = _mm_mul_ps(vTemp,M.r[2]); + vResult = _mm_add_ps(vResult,vTemp); + vResult = _mm_add_ps(vResult,M.r[3]); + vTemp = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,3,3,3)); + vResult = _mm_div_ps(vResult,vTemp); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMFLOAT3* XMVector3TransformCoordStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + CONST XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + XMVECTOR X; + XMVECTOR Y; + XMVECTOR Z; + XMVECTOR InverseW; + XMVECTOR Result; + size_t i; + CONST BYTE* pInputVector = (CONST BYTE*)pInputStream; + BYTE* pOutputVector = (BYTE*)pOutputStream; + + XMASSERT(pOutputStream); + XMASSERT(pInputStream); + + for (i = 0; i < VectorCount; i++) + { + V = XMLoadFloat3((const XMFLOAT3*)pInputVector); + Z = XMVectorSplatZ(V); + Y = XMVectorSplatY(V); + X = XMVectorSplatX(V); +// Z = XMVectorReplicate(((XMFLOAT3*)pInputVector)->z); +// Y = XMVectorReplicate(((XMFLOAT3*)pInputVector)->y); +// X = XMVectorReplicate(((XMFLOAT3*)pInputVector)->x); + + Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + InverseW = XMVectorSplatW(Result); + InverseW = XMVectorReciprocal(InverseW); + + Result = XMVectorMultiply(Result, InverseW); + + XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pOutputStream); + XMASSERT(pInputStream); + + size_t i; + CONST BYTE *pInputVector = (CONST BYTE*)pInputStream; + BYTE *pOutputVector = (BYTE*)pOutputStream; + + for (i = 0; i < VectorCount; i++) + { + XMVECTOR X = _mm_load_ps1(&reinterpret_cast(pInputVector)->x); + XMVECTOR Y = _mm_load_ps1(&reinterpret_cast(pInputVector)->y); + XMVECTOR vResult = _mm_load_ps1(&reinterpret_cast(pInputVector)->z); + vResult = _mm_mul_ps(vResult,M.r[2]); + vResult = _mm_add_ps(vResult,M.r[3]); + Y = _mm_mul_ps(Y,M.r[1]); + vResult = _mm_add_ps(vResult,Y); + X = _mm_mul_ps(X,M.r[0]); + vResult = _mm_add_ps(vResult,X); + + X = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,3,3,3)); + vResult = _mm_div_ps(vResult,X); + _mm_store_ss(&reinterpret_cast(pOutputVector)->x,vResult); + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1)); + _mm_store_ss(&reinterpret_cast(pOutputVector)->y,vResult); + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1)); + _mm_store_ss(&reinterpret_cast(pOutputVector)->z,vResult); + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector3TransformNormal +( + FXMVECTOR V, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR X; + XMVECTOR Y; + XMVECTOR Z; + XMVECTOR Result; + + Z = XMVectorSplatZ(V); + Y = XMVectorSplatY(V); + X = XMVectorSplatX(V); + + Result = XMVectorMultiply(Z, M.r[2]); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = 
_mm_shuffle_ps(V,V,_MM_SHUFFLE(0,0,0,0)); + vResult = _mm_mul_ps(vResult,M.r[0]); + XMVECTOR vTemp = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1)); + vTemp = _mm_mul_ps(vTemp,M.r[1]); + vResult = _mm_add_ps(vResult,vTemp); + vTemp = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2)); + vTemp = _mm_mul_ps(vTemp,M.r[2]); + vResult = _mm_add_ps(vResult,vTemp); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMFLOAT3* XMVector3TransformNormalStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + CONST XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + XMVECTOR X; + XMVECTOR Y; + XMVECTOR Z; + XMVECTOR Result; + size_t i; + CONST BYTE* pInputVector = (CONST BYTE*)pInputStream; + BYTE* pOutputVector = (BYTE*)pOutputStream; + + XMASSERT(pOutputStream); + XMASSERT(pInputStream); + + for (i = 0; i < VectorCount; i++) + { + V = XMLoadFloat3((const XMFLOAT3*)pInputVector); + Z = XMVectorSplatZ(V); + Y = XMVectorSplatY(V); + X = XMVectorSplatX(V); +// Z = XMVectorReplicate(((XMFLOAT3*)pInputVector)->z); +// Y = XMVectorReplicate(((XMFLOAT3*)pInputVector)->y); +// X = XMVectorReplicate(((XMFLOAT3*)pInputVector)->x); + + Result = XMVectorMultiply(Z, M.r[2]); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pOutputStream); + XMASSERT(pInputStream); + + size_t i; + CONST BYTE *pInputVector = (CONST BYTE*)pInputStream; + BYTE *pOutputVector = (BYTE*)pOutputStream; + + for (i = 0; i < VectorCount; i++) + { + XMVECTOR X = _mm_load_ps1(&reinterpret_cast(pInputVector)->x); + XMVECTOR Y = _mm_load_ps1(&reinterpret_cast(pInputVector)->y); + XMVECTOR vResult = _mm_load_ps1(&reinterpret_cast(pInputVector)->z); + vResult = _mm_mul_ps(vResult,M.r[2]); + Y = _mm_mul_ps(Y,M.r[1]); + vResult = _mm_add_ps(vResult,Y); + X = _mm_mul_ps(X,M.r[0]); + vResult = _mm_add_ps(vResult,X); + _mm_store_ss(&reinterpret_cast(pOutputVector)->x,vResult); + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1)); + _mm_store_ss(&reinterpret_cast(pOutputVector)->y,vResult); + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1)); + _mm_store_ss(&reinterpret_cast(pOutputVector)->z,vResult); + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMVECTOR XMVector3Project +( + FXMVECTOR V, + FLOAT ViewportX, + FLOAT ViewportY, + FLOAT ViewportWidth, + FLOAT ViewportHeight, + FLOAT ViewportMinZ, + FLOAT ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX Transform; + XMVECTOR Scale; + XMVECTOR Offset; + XMVECTOR Result; + FLOAT HalfViewportWidth = ViewportWidth * 0.5f; + FLOAT HalfViewportHeight = ViewportHeight * 0.5f; + + Scale = XMVectorSet(HalfViewportWidth, + -HalfViewportHeight, + ViewportMaxZ - ViewportMinZ, + 0.0f); + + Offset = XMVectorSet(ViewportX + HalfViewportWidth, + ViewportY + HalfViewportHeight, + ViewportMinZ, + 0.0f); + + Transform = XMMatrixMultiply(World, 
View); + Transform = XMMatrixMultiply(Transform, Projection); + + Result = XMVector3TransformCoord(V, Transform); + + Result = XMVectorMultiplyAdd(Result, Scale, Offset); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX Transform; + XMVECTOR Scale; + XMVECTOR Offset; + XMVECTOR Result; + FLOAT HalfViewportWidth = ViewportWidth * 0.5f; + FLOAT HalfViewportHeight = ViewportHeight * 0.5f; + + Scale = XMVectorSet(HalfViewportWidth, + -HalfViewportHeight, + ViewportMaxZ - ViewportMinZ, + 0.0f); + + Offset = XMVectorSet(ViewportX + HalfViewportWidth, + ViewportY + HalfViewportHeight, + ViewportMinZ, + 0.0f); + Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Result = XMVector3TransformCoord(V, Transform); + Result = _mm_mul_ps(Result,Scale); + Result = _mm_add_ps(Result,Offset); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMFLOAT3* XMVector3ProjectStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + CONST XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + FLOAT ViewportX, + FLOAT ViewportY, + FLOAT ViewportWidth, + FLOAT ViewportHeight, + FLOAT ViewportMinZ, + FLOAT ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX Transform; + XMVECTOR V; + XMVECTOR Scale; + XMVECTOR Offset; + XMVECTOR Result; + size_t i; + FLOAT HalfViewportWidth = ViewportWidth * 0.5f; + FLOAT HalfViewportHeight = ViewportHeight * 0.5f; + CONST BYTE* pInputVector = (CONST BYTE*)pInputStream; + BYTE* pOutputVector = (BYTE*)pOutputStream; + + XMASSERT(pOutputStream); + XMASSERT(pInputStream); + + Scale = XMVectorSet(HalfViewportWidth, + -HalfViewportHeight, + ViewportMaxZ - ViewportMinZ, + 1.0f); + + Offset = XMVectorSet(ViewportX + HalfViewportWidth, + ViewportY + HalfViewportHeight, + ViewportMinZ, + 0.0f); + + Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + + for (i = 0; i < VectorCount; i++) + { + V = XMLoadFloat3((const XMFLOAT3*)pInputVector); + + Result = XMVector3TransformCoord(V, Transform); + + Result = XMVectorMultiplyAdd(Result, Scale, Offset); + + XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pOutputStream); + XMASSERT(pInputStream); + XMMATRIX Transform; + XMVECTOR V; + XMVECTOR Scale; + XMVECTOR Offset; + XMVECTOR Result; + size_t i; + FLOAT HalfViewportWidth = ViewportWidth * 0.5f; + FLOAT HalfViewportHeight = ViewportHeight * 0.5f; + CONST BYTE* pInputVector = (CONST BYTE*)pInputStream; + BYTE* pOutputVector = (BYTE*)pOutputStream; + + Scale = XMVectorSet(HalfViewportWidth, + -HalfViewportHeight, + ViewportMaxZ - ViewportMinZ, + 1.0f); + + Offset = XMVectorSet(ViewportX + HalfViewportWidth, + ViewportY + HalfViewportHeight, + ViewportMinZ, + 0.0f); + + Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + + for (i = 0; i < VectorCount; i++) + { + V = XMLoadFloat3((const XMFLOAT3*)pInputVector); + + Result = XMVector3TransformCoord(V, Transform); + + Result = _mm_mul_ps(Result,Scale); + Result = _mm_add_ps(Result,Offset); + XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); + pInputVector += InputStride; + pOutputVector += OutputStride; + } + return pOutputStream; + 
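+    // Both code paths above apply the same per-point sequence: transform by World*View*Projection
+    // via XMVector3TransformCoord (which performs the divide by w), then map the normalized
+    // device coordinates into the viewport with a single multiply-add of Scale and Offset.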
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector3Unproject +( + FXMVECTOR V, + FLOAT ViewportX, + FLOAT ViewportY, + FLOAT ViewportWidth, + FLOAT ViewportHeight, + FLOAT ViewportMinZ, + FLOAT ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX Transform; + XMVECTOR Scale; + XMVECTOR Offset; + XMVECTOR Determinant; + XMVECTOR Result; + CONST XMVECTOR D = XMVectorSet(-1.0f, 1.0f, 0.0f, 0.0f); + + Scale = XMVectorSet(ViewportWidth * 0.5f, + -ViewportHeight * 0.5f, + ViewportMaxZ - ViewportMinZ, + 1.0f); + Scale = XMVectorReciprocal(Scale); + + Offset = XMVectorSet(-ViewportX, + -ViewportY, + -ViewportMinZ, + 0.0f); + Offset = XMVectorMultiplyAdd(Scale, Offset, D); + + Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(&Determinant, Transform); + + Result = XMVectorMultiplyAdd(V, Scale, Offset); + + Result = XMVector3TransformCoord(Result, Transform); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX Transform; + XMVECTOR Scale; + XMVECTOR Offset; + XMVECTOR Determinant; + XMVECTOR Result; + CONST XMVECTORF32 D = {-1.0f, 1.0f, 0.0f, 0.0f}; + + Scale = XMVectorSet(ViewportWidth * 0.5f, + -ViewportHeight * 0.5f, + ViewportMaxZ - ViewportMinZ, + 1.0f); + Scale = XMVectorReciprocal(Scale); + + Offset = XMVectorSet(-ViewportX, + -ViewportY, + -ViewportMinZ, + 0.0f); + Offset = _mm_mul_ps(Offset,Scale); + Offset = _mm_add_ps(Offset,D); + + Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(&Determinant, Transform); + + Result = _mm_mul_ps(V,Scale); + Result = _mm_add_ps(Result,Offset); + + Result = XMVector3TransformCoord(Result, Transform); + + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMINLINE XMFLOAT3* XMVector3UnprojectStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + CONST XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + FLOAT ViewportX, + FLOAT ViewportY, + FLOAT ViewportWidth, + FLOAT ViewportHeight, + FLOAT ViewportMinZ, + FLOAT ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX Transform; + XMVECTOR Scale; + XMVECTOR Offset; + XMVECTOR V; + XMVECTOR Determinant; + XMVECTOR Result; + size_t i; + CONST BYTE* pInputVector = (CONST BYTE*)pInputStream; + BYTE* pOutputVector = (BYTE*)pOutputStream; + CONST XMVECTOR D = XMVectorSet(-1.0f, 1.0f, 0.0f, 0.0f); + + XMASSERT(pOutputStream); + XMASSERT(pInputStream); + + Scale = XMVectorSet(ViewportWidth * 0.5f, + -ViewportHeight * 0.5f, + ViewportMaxZ - ViewportMinZ, + 1.0f); + Scale = XMVectorReciprocal(Scale); + + Offset = XMVectorSet(-ViewportX, + -ViewportY, + -ViewportMinZ, + 0.0f); + Offset = XMVectorMultiplyAdd(Scale, Offset, D); + + Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(&Determinant, Transform); + + for (i = 0; i < VectorCount; i++) + { + V = XMLoadFloat3((const XMFLOAT3*)pInputVector); + + Result = XMVectorMultiplyAdd(V, Scale, Offset); + + Result = XMVector3TransformCoord(Result, Transform); + + XMStoreFloat3((XMFLOAT3*)pOutputVector, 
Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pOutputStream); + XMASSERT(pInputStream); + XMMATRIX Transform; + XMVECTOR Scale; + XMVECTOR Offset; + XMVECTOR V; + XMVECTOR Determinant; + XMVECTOR Result; + size_t i; + CONST BYTE* pInputVector = (CONST BYTE*)pInputStream; + BYTE* pOutputVector = (BYTE*)pOutputStream; + CONST XMVECTORF32 D = {-1.0f, 1.0f, 0.0f, 0.0f}; + + Scale = XMVectorSet(ViewportWidth * 0.5f, + -ViewportHeight * 0.5f, + ViewportMaxZ - ViewportMinZ, + 1.0f); + Scale = XMVectorReciprocal(Scale); + + Offset = XMVectorSet(-ViewportX, + -ViewportY, + -ViewportMinZ, + 0.0f); + Offset = _mm_mul_ps(Offset,Scale); + Offset = _mm_add_ps(Offset,D); + + Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(&Determinant, Transform); + + for (i = 0; i < VectorCount; i++) + { + V = XMLoadFloat3((const XMFLOAT3*)pInputVector); + + Result = XMVectorMultiplyAdd(V, Scale, Offset); + + Result = XMVector3TransformCoord(Result, Transform); + + XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +/**************************************************************************** + * + * 4D Vector + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector4Equal +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2]) && (V1.vector4_f32[3] == V2.vector4_f32[3])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); + return ((_mm_movemask_ps(vTemp)==0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4EqualR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE UINT XMVector4EqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + UINT CR = 0; + + if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && + (V1.vector4_f32[1] == V2.vector4_f32[1]) && + (V1.vector4_f32[2] == V2.vector4_f32[2]) && + (V1.vector4_f32[3] == V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && + (V1.vector4_f32[1] != V2.vector4_f32[1]) && + (V1.vector4_f32[2] != V2.vector4_f32[2]) && + (V1.vector4_f32[3] != V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp); + UINT CR = 0; + if (iTest==0xf) // All equal? + { + CR = XM_CRMASK_CR6TRUE; + } + else if (iTest==0) // All not equal? 
+ { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector4EqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2]) && (V1.vector4_u32[3] == V2.vector4_u32[3])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(reinterpret_cast(&V1)[0],reinterpret_cast(&V2)[0]); + return ((_mm_movemask_ps(reinterpret_cast(&vTemp)[0])==0xf) != 0); +#else + return XMComparisonAllTrue(XMVector4EqualIntR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE UINT XMVector4EqualIntR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + UINT CR = 0; + if (V1.vector4_u32[0] == V2.vector4_u32[0] && + V1.vector4_u32[1] == V2.vector4_u32[1] && + V1.vector4_u32[2] == V2.vector4_u32[2] && + V1.vector4_u32[3] == V2.vector4_u32[3]) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (V1.vector4_u32[0] != V2.vector4_u32[0] && + V1.vector4_u32[1] != V2.vector4_u32[1] && + V1.vector4_u32[2] != V2.vector4_u32[2] && + V1.vector4_u32[3] != V2.vector4_u32[3]) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(reinterpret_cast(&V1)[0],reinterpret_cast(&V2)[0]); + int iTest = _mm_movemask_ps(reinterpret_cast(&vTemp)[0]); + UINT CR = 0; + if (iTest==0xf) // All equal? + { + CR = XM_CRMASK_CR6TRUE; + } + else if (iTest==0) // All not equal? + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +XMFINLINE BOOL XMVector4NearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) +{ +#if defined(_XM_NO_INTRINSICS_) + FLOAT dx, dy, dz, dw; + + dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]); + dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]); + dz = fabsf(V1.vector4_f32[2]-V2.vector4_f32[2]); + dw = fabsf(V1.vector4_f32[3]-V2.vector4_f32[3]); + return (((dx <= Epsilon.vector4_f32[0]) && + (dy <= Epsilon.vector4_f32[1]) && + (dz <= Epsilon.vector4_f32[2]) && + (dw <= Epsilon.vector4_f32[3])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1,V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp,vDelta); + vTemp = _mm_max_ps(vTemp,vDelta); + vTemp = _mm_cmple_ps(vTemp,Epsilon); + return ((_mm_movemask_ps(vTemp)==0xf) != 0); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector4NotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2]) || (V1.vector4_f32[3] != V2.vector4_f32[3])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpneq_ps(V1,V2); + return ((_mm_movemask_ps(vTemp)) != 0); +#else + return XMComparisonAnyFalse(XMVector4EqualR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector4NotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + 
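+    // Unlike XMVector4NotEqual, this routine compares the raw 32-bit integer lanes, so the test
+    // is bitwise: NaNs with identical bit patterns compare equal, while -0.0f differs from +0.0f.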
return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2]) || (V1.vector4_u32[3] != V2.vector4_u32[3])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(reinterpret_cast(&V1)[0],reinterpret_cast(&V2)[0]); + return ((_mm_movemask_ps(reinterpret_cast(&vTemp)[0])!=0xF) != 0); +#else + return XMComparisonAnyFalse(XMVector4EqualIntR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector4Greater +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2]) && (V1.vector4_f32[3] > V2.vector4_f32[3])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); + return ((_mm_movemask_ps(vTemp)==0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE UINT XMVector4GreaterR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + UINT CR = 0; + if (V1.vector4_f32[0] > V2.vector4_f32[0] && + V1.vector4_f32[1] > V2.vector4_f32[1] && + V1.vector4_f32[2] > V2.vector4_f32[2] && + V1.vector4_f32[3] > V2.vector4_f32[3]) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (V1.vector4_f32[0] <= V2.vector4_f32[0] && + V1.vector4_f32[1] <= V2.vector4_f32[1] && + V1.vector4_f32[2] <= V2.vector4_f32[2] && + V1.vector4_f32[3] <= V2.vector4_f32[3]) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_SSE_INTRINSICS_) + UINT CR = 0; + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp); + if (iTest==0xf) { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector4GreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2]) && (V1.vector4_f32[3] >= V2.vector4_f32[3])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + return ((_mm_movemask_ps(vTemp)==0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE UINT XMVector4GreaterOrEqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + UINT CR = 0; + if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && + (V1.vector4_f32[1] >= V2.vector4_f32[1]) && + (V1.vector4_f32[2] >= V2.vector4_f32[2]) && + (V1.vector4_f32[3] >= V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && + (V1.vector4_f32[1] < V2.vector4_f32[1]) && + (V1.vector4_f32[2] < V2.vector4_f32[2]) && + (V1.vector4_f32[3] < V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_SSE_INTRINSICS_) + UINT CR = 0; + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp); + if (iTest==0x0f) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#else // _XM_VMX128_INTRINSICS_ 
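+    // The _XM_VMX128_INTRINSICS_ branch is empty here, as it is throughout this file; the
+    // VMX128 (Xbox 360) implementations are presumably omitted from this public import.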
+#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector4Less +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2]) && (V1.vector4_f32[3] < V2.vector4_f32[3])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmplt_ps(V1,V2); + return ((_mm_movemask_ps(vTemp)==0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterR(V2, V1)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector4LessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2]) && (V1.vector4_f32[3] <= V2.vector4_f32[3])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmple_ps(V1,V2); + return ((_mm_movemask_ps(vTemp)==0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V2, V1)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector4InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && + (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) && + (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3])) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2,V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + // All in bounds? + return ((_mm_movemask_ps(vTemp1)==0x0f) != 0); +#else + return XMComparisonAllInBounds(XMVector4InBoundsR(V, Bounds)); +#endif +} + +//------------------------------------------------------------------------------ + +XMFINLINE UINT XMVector4InBoundsR +( + FXMVECTOR V, + FXMVECTOR Bounds +) +{ +#if defined(_XM_NO_INTRINSICS_) + + UINT CR = 0; + if ((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && + (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) && + (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3])) + { + CR = XM_CRMASK_CR6BOUNDS; + } + return CR; + +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2,V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + // All in bounds? + return (_mm_movemask_ps(vTemp1)==0x0f) ? 
XM_CRMASK_CR6BOUNDS : 0; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector4IsNaN +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1]) || + XMISNAN(V.vector4_f32[2]) || + XMISNAN(V.vector4_f32[3])); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. NaN is always not equal + XMVECTOR vTempNan = _mm_cmpneq_ps(V,V); + // If any are NaN, the mask is non-zero + return (_mm_movemask_ps(vTempNan)!=0); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE BOOL XMVector4IsInfinite +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + return (XMISINF(V.vector4_f32[0]) || + XMISINF(V.vector4_f32[1]) || + XMISINF(V.vector4_f32[2]) || + XMISINF(V.vector4_f32[3])); + +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + XMVECTOR vTemp = _mm_and_ps(V,g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); + // If any are infinity, the signs are true. + return (_mm_movemask_ps(vTemp) != 0); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector4Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result.vector4_f32[0] = + Result.vector4_f32[1] = + Result.vector4_f32[2] = + Result.vector4_f32[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2] + V1.vector4_f32[3] * V2.vector4_f32[3]; + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp2 = V2; + XMVECTOR vTemp = _mm_mul_ps(V1,vTemp2); + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp,_MM_SHUFFLE(1,0,0,0)); // Copy X to the Z position and Y to the W position + vTemp2 = _mm_add_ps(vTemp2,vTemp); // Add Z = X+Z; W = Y+W; + vTemp = _mm_shuffle_ps(vTemp,vTemp2,_MM_SHUFFLE(0,3,0,0)); // Copy W to the Z position + vTemp = _mm_add_ps(vTemp,vTemp2); // Add Z and W together + return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(2,2,2,2)); // Splat Z and return +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector4Cross +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + + Result.vector4_f32[0] = (((V2.vector4_f32[2]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[2]))*V1.vector4_f32[1])-(((V2.vector4_f32[1]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[1]))*V1.vector4_f32[2])+(((V2.vector4_f32[1]*V3.vector4_f32[2])-(V2.vector4_f32[2]*V3.vector4_f32[1]))*V1.vector4_f32[3]); + Result.vector4_f32[1] = (((V2.vector4_f32[3]*V3.vector4_f32[2])-(V2.vector4_f32[2]*V3.vector4_f32[3]))*V1.vector4_f32[0])-(((V2.vector4_f32[3]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[3]))*V1.vector4_f32[2])+(((V2.vector4_f32[2]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[2]))*V1.vector4_f32[3]); + Result.vector4_f32[2] = 
(((V2.vector4_f32[1]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[1]))*V1.vector4_f32[0])-(((V2.vector4_f32[0]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[0]))*V1.vector4_f32[1])+(((V2.vector4_f32[0]*V3.vector4_f32[1])-(V2.vector4_f32[1]*V3.vector4_f32[0]))*V1.vector4_f32[3]); + Result.vector4_f32[3] = (((V2.vector4_f32[2]*V3.vector4_f32[1])-(V2.vector4_f32[1]*V3.vector4_f32[2]))*V1.vector4_f32[0])-(((V2.vector4_f32[2]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[2]))*V1.vector4_f32[1])+(((V2.vector4_f32[1]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[1]))*V1.vector4_f32[2]); + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // V2zwyz * V3wzwy + XMVECTOR vResult = _mm_shuffle_ps(V2,V2,_MM_SHUFFLE(2,1,3,2)); + XMVECTOR vTemp3 = _mm_shuffle_ps(V3,V3,_MM_SHUFFLE(1,3,2,3)); + vResult = _mm_mul_ps(vResult,vTemp3); + // - V2wzwy * V3zwyz + XMVECTOR vTemp2 = _mm_shuffle_ps(V2,V2,_MM_SHUFFLE(1,3,2,3)); + vTemp3 = _mm_shuffle_ps(vTemp3,vTemp3,_MM_SHUFFLE(1,3,0,1)); + vTemp2 = _mm_mul_ps(vTemp2,vTemp3); + vResult = _mm_sub_ps(vResult,vTemp2); + // term1 * V1yxxx + XMVECTOR vTemp1 = _mm_shuffle_ps(V1,V1,_MM_SHUFFLE(0,0,0,1)); + vResult = _mm_mul_ps(vResult,vTemp1); + + // V2ywxz * V3wxwx + vTemp2 = _mm_shuffle_ps(V2,V2,_MM_SHUFFLE(2,0,3,1)); + vTemp3 = _mm_shuffle_ps(V3,V3,_MM_SHUFFLE(0,3,0,3)); + vTemp3 = _mm_mul_ps(vTemp3,vTemp2); + // - V2wxwx * V3ywxz + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(2,1,2,1)); + vTemp1 = _mm_shuffle_ps(V3,V3,_MM_SHUFFLE(2,0,3,1)); + vTemp2 = _mm_mul_ps(vTemp2,vTemp1); + vTemp3 = _mm_sub_ps(vTemp3,vTemp2); + // vResult - temp * V1zzyy + vTemp1 = _mm_shuffle_ps(V1,V1,_MM_SHUFFLE(1,1,2,2)); + vTemp1 = _mm_mul_ps(vTemp1,vTemp3); + vResult = _mm_sub_ps(vResult,vTemp1); + + // V2yzxy * V3zxyx + vTemp2 = _mm_shuffle_ps(V2,V2,_MM_SHUFFLE(1,0,2,1)); + vTemp3 = _mm_shuffle_ps(V3,V3,_MM_SHUFFLE(0,1,0,2)); + vTemp3 = _mm_mul_ps(vTemp3,vTemp2); + // - V2zxyx * V3yzxy + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(2,0,2,1)); + vTemp1 = _mm_shuffle_ps(V3,V3,_MM_SHUFFLE(1,0,2,1)); + vTemp1 = _mm_mul_ps(vTemp1,vTemp2); + vTemp3 = _mm_sub_ps(vTemp3,vTemp1); + // vResult + term * V1wwwz + vTemp1 = _mm_shuffle_ps(V1,V1,_MM_SHUFFLE(2,3,3,3)); + vTemp3 = _mm_mul_ps(vTemp3,vTemp1); + vResult = _mm_add_ps(vResult,vTemp3); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector4LengthSq +( + FXMVECTOR V +) +{ + return XMVector4Dot(V, V); +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector4ReciprocalLengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? 
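+    // The final add below completes the horizontal sum of the four squared components;
+    // _mm_rsqrt_ps then returns only a fast estimate (roughly 12 bits of precision),
+    // which is what makes this the "Est" variant rather than a full-precision reciprocal length.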
+ vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Get the reciprocal + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector4ReciprocalLength +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Get the reciprocal + vLengthSq = _mm_sqrt_ps(vLengthSq); + // Accurate! + vLengthSq = _mm_div_ps(g_XMOne,vLengthSq); + return vLengthSq; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector4LengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorSqrtEst(Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Prepare for the division + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector4Length +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorSqrt(Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? 
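+    // The add below finishes the horizontal sum; unlike the Est variants, this path uses the
+    // full-precision _mm_sqrt_ps, so the splatted length is accurate to normal float precision.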
+ vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Prepare for the division + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// XMVector4NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. + +XMFINLINE XMVECTOR XMVector4NormalizeEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector4ReciprocalLength(V); + Result = XMVectorMultiply(V, Result); + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Get the reciprocal + XMVECTOR vResult = _mm_rsqrt_ps(vLengthSq); + // Reciprocal mul to perform the normalization + vResult = _mm_mul_ps(vResult,V); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector4Normalize +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + FLOAT fLength; + XMVECTOR vResult; + + vResult = XMVector4Length( V ); + fLength = vResult.vector4_f32[0]; + + // Prevent divide by zero + if (fLength > 0) { + fLength = 1.0f/fLength; + } + + vResult.vector4_f32[0] = V.vector4_f32[0]*fLength; + vResult.vector4_f32[1] = V.vector4_f32[1]*fLength; + vResult.vector4_f32[2] = V.vector4_f32[2]*fLength; + vResult.vector4_f32[3] = V.vector4_f32[3]*fLength; + return vResult; + +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? 
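+    // The masking applied after the square root below handles the special cases explicitly:
+    // a zero-length input produces a zero vector rather than a divide-by-zero result,
+    // and an input whose length overflows to infinity produces QNaN.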
+ vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector4ClampLength +( + FXMVECTOR V, + FLOAT LengthMin, + FLOAT LengthMax +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR ClampMax; + XMVECTOR ClampMin; + + ClampMax = XMVectorReplicate(LengthMax); + ClampMin = XMVectorReplicate(LengthMin); + + return XMVector4ClampLengthV(V, ClampMin, ClampMax); + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR ClampMax = _mm_set_ps1(LengthMax); + XMVECTOR ClampMin = _mm_set_ps1(LengthMin); + return XMVector4ClampLengthV(V, ClampMin, ClampMax); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector4ClampLengthV +( + FXMVECTOR V, + FXMVECTOR LengthMin, + FXMVECTOR LengthMax +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR ClampLength; + XMVECTOR LengthSq; + XMVECTOR RcpLength; + XMVECTOR Length; + XMVECTOR Normal; + XMVECTOR Zero; + XMVECTOR InfiniteLength; + XMVECTOR ZeroLength; + XMVECTOR Select; + XMVECTOR ControlMax; + XMVECTOR ControlMin; + XMVECTOR Control; + XMVECTOR Result; + + XMASSERT((LengthMin.vector4_f32[1] == LengthMin.vector4_f32[0]) && (LengthMin.vector4_f32[2] == LengthMin.vector4_f32[0]) && (LengthMin.vector4_f32[3] == LengthMin.vector4_f32[0])); + XMASSERT((LengthMax.vector4_f32[1] == LengthMax.vector4_f32[0]) && (LengthMax.vector4_f32[2] == LengthMax.vector4_f32[0]) && (LengthMax.vector4_f32[3] == LengthMax.vector4_f32[0])); + XMASSERT(XMVector4GreaterOrEqual(LengthMin, XMVectorZero())); + XMASSERT(XMVector4GreaterOrEqual(LengthMax, XMVectorZero())); + XMASSERT(XMVector4GreaterOrEqual(LengthMax, LengthMin)); + + LengthSq = XMVector4LengthSq(V); + + Zero = XMVectorZero(); + + RcpLength = XMVectorReciprocalSqrt(LengthSq); + + InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); + ZeroLength = XMVectorEqual(LengthSq, Zero); + + Normal = XMVectorMultiply(V, RcpLength); + + Length = XMVectorMultiply(LengthSq, RcpLength); + + Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + + ControlMax = XMVectorGreater(Length, LengthMax); + ControlMin = XMVectorLess(Length, LengthMin); + + ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + + Result = XMVectorMultiply(Normal, ClampLength); + + // Preserve the original 
vector (with no precision loss) if the length falls within the given range + Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR ClampLength; + XMVECTOR LengthSq; + XMVECTOR RcpLength; + XMVECTOR Length; + XMVECTOR Normal; + XMVECTOR Zero; + XMVECTOR InfiniteLength; + XMVECTOR ZeroLength; + XMVECTOR Select; + XMVECTOR ControlMax; + XMVECTOR ControlMin; + XMVECTOR Control; + XMVECTOR Result; + + XMASSERT((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetW(LengthMin) == XMVectorGetX(LengthMin))); + XMASSERT((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetW(LengthMax) == XMVectorGetX(LengthMax))); + XMASSERT(XMVector4GreaterOrEqual(LengthMin, g_XMZero)); + XMASSERT(XMVector4GreaterOrEqual(LengthMax, g_XMZero)); + XMASSERT(XMVector4GreaterOrEqual(LengthMax, LengthMin)); + + LengthSq = XMVector4LengthSq(V); + Zero = XMVectorZero(); + RcpLength = XMVectorReciprocalSqrt(LengthSq); + InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity); + ZeroLength = XMVectorEqual(LengthSq, Zero); + Normal = _mm_mul_ps(V, RcpLength); + Length = _mm_mul_ps(LengthSq, RcpLength); + Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + ControlMax = XMVectorGreater(Length, LengthMax); + ControlMin = XMVectorLess(Length, LengthMin); + ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + Result = _mm_mul_ps(Normal, ClampLength); + // Preserve the original vector (with no precision loss) if the length falls within the given range + Control = XMVectorEqualInt(ControlMax,ControlMin); + Result = XMVectorSelect(Result,V,Control); + return Result; + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector4Reflect +( + FXMVECTOR Incident, + FXMVECTOR Normal +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + Result = XMVector4Dot(Incident, Normal); + Result = XMVectorAdd(Result, Result); + Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + XMVECTOR Result = XMVector4Dot(Incident,Normal); + Result = _mm_add_ps(Result,Result); + Result = _mm_mul_ps(Result,Normal); + Result = _mm_sub_ps(Incident,Result); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector4Refract +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FLOAT RefractionIndex +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Index; + Index = XMVectorReplicate(RefractionIndex); + return XMVector4RefractV(Incident, Normal, Index); + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR Index = _mm_set_ps1(RefractionIndex); + return XMVector4RefractV(Incident,Normal,Index); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE 
XMVECTOR XMVector4RefractV +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FXMVECTOR RefractionIndex +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR IDotN; + XMVECTOR R; + CONST XMVECTOR Zero = XMVectorZero(); + + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + + IDotN = XMVector4Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); + R = XMVectorMultiply(R, RefractionIndex); + R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); + + if (XMVector4LessOrEqual(R, Zero)) + { + // Total internal reflection + return Zero; + } + else + { + XMVECTOR Result; + + // R = RefractionIndex * IDotN + sqrt(R) + R = XMVectorSqrt(R); + R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); + + // Result = RefractionIndex * Incident - Normal * R + Result = XMVectorMultiply(RefractionIndex, Incident); + Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); + + return Result; + } + +#elif defined(_XM_SSE_INTRINSICS_) + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + + XMVECTOR IDotN = XMVector4Dot(Incident,Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR R = _mm_mul_ps(IDotN,IDotN); + R = _mm_sub_ps(g_XMOne,R); + R = _mm_mul_ps(R, RefractionIndex); + R = _mm_mul_ps(R, RefractionIndex); + R = _mm_sub_ps(g_XMOne,R); + + XMVECTOR vResult = _mm_cmple_ps(R,g_XMZero); + if (_mm_movemask_ps(vResult)==0x0f) + { + // Total internal reflection + vResult = g_XMZero; + } + else + { + // R = RefractionIndex * IDotN + sqrt(R) + R = _mm_sqrt_ps(R); + vResult = _mm_mul_ps(RefractionIndex, IDotN); + R = _mm_add_ps(R,vResult); + // Result = RefractionIndex * Incident - Normal * R + vResult = _mm_mul_ps(RefractionIndex, Incident); + R = _mm_mul_ps(R,Normal); + vResult = _mm_sub_ps(vResult,R); + } + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector4Orthogonal +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = V.vector4_f32[2]; + Result.vector4_f32[1] = V.vector4_f32[3]; + Result.vector4_f32[2] = -V.vector4_f32[0]; + Result.vector4_f32[3] = -V.vector4_f32[1]; + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FlipZW = {1.0f,1.0f,-1.0f,-1.0f}; + XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,0,3,2)); + vResult = _mm_mul_ps(vResult,FlipZW); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector4AngleBetweenNormalsEst +( + FXMVECTOR N1, + FXMVECTOR N2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR NegativeOne; + XMVECTOR One; + XMVECTOR Result; + + Result = XMVector4Dot(N1, N2); + NegativeOne = XMVectorSplatConstant(-1, 0); + One = XMVectorSplatOne(); + Result = XMVectorClamp(Result, NegativeOne, One); + Result = XMVectorACosEst(Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XMVector4Dot(N1,N2); + // Clamp to -1.0f to 1.0f + vResult 
= _mm_max_ps(vResult,g_XMNegativeOne); + vResult = _mm_min_ps(vResult,g_XMOne);; + vResult = XMVectorACosEst(vResult); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector4AngleBetweenNormals +( + FXMVECTOR N1, + FXMVECTOR N2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR NegativeOne; + XMVECTOR One; + XMVECTOR Result; + + Result = XMVector4Dot(N1, N2); + NegativeOne = XMVectorSplatConstant(-1, 0); + One = XMVectorSplatOne(); + Result = XMVectorClamp(Result, NegativeOne, One); + Result = XMVectorACos(Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XMVector4Dot(N1,N2); + // Clamp to -1.0f to 1.0f + vResult = _mm_max_ps(vResult,g_XMNegativeOne); + vResult = _mm_min_ps(vResult,g_XMOne);; + vResult = XMVectorACos(vResult); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector4AngleBetweenVectors +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR L1; + XMVECTOR L2; + XMVECTOR Dot; + XMVECTOR CosAngle; + XMVECTOR NegativeOne; + XMVECTOR One; + XMVECTOR Result; + + L1 = XMVector4ReciprocalLength(V1); + L2 = XMVector4ReciprocalLength(V2); + + Dot = XMVector4Dot(V1, V2); + + L1 = XMVectorMultiply(L1, L2); + + CosAngle = XMVectorMultiply(Dot, L1); + NegativeOne = XMVectorSplatConstant(-1, 0); + One = XMVectorSplatOne(); + CosAngle = XMVectorClamp(CosAngle, NegativeOne, One); + + Result = XMVectorACos(CosAngle); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR L1; + XMVECTOR L2; + XMVECTOR Dot; + XMVECTOR CosAngle; + XMVECTOR Result; + + L1 = XMVector4ReciprocalLength(V1); + L2 = XMVector4ReciprocalLength(V2); + Dot = XMVector4Dot(V1, V2); + L1 = _mm_mul_ps(L1,L2); + CosAngle = _mm_mul_ps(Dot,L1); + CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne, g_XMOne); + Result = XMVectorACos(CosAngle); + return Result; + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR XMVector4Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + FLOAT fX = (M.m[0][0]*V.vector4_f32[0])+(M.m[1][0]*V.vector4_f32[1])+(M.m[2][0]*V.vector4_f32[2])+(M.m[3][0]*V.vector4_f32[3]); + FLOAT fY = (M.m[0][1]*V.vector4_f32[0])+(M.m[1][1]*V.vector4_f32[1])+(M.m[2][1]*V.vector4_f32[2])+(M.m[3][1]*V.vector4_f32[3]); + FLOAT fZ = (M.m[0][2]*V.vector4_f32[0])+(M.m[1][2]*V.vector4_f32[1])+(M.m[2][2]*V.vector4_f32[2])+(M.m[3][2]*V.vector4_f32[3]); + FLOAT fW = (M.m[0][3]*V.vector4_f32[0])+(M.m[1][3]*V.vector4_f32[1])+(M.m[2][3]*V.vector4_f32[2])+(M.m[3][3]*V.vector4_f32[3]); + XMVECTOR vResult = { + fX, + fY, + fZ, + fW + }; + return vResult; + +#elif defined(_XM_SSE_INTRINSICS_) + // Splat x,y,z and w + XMVECTOR vTempX = _mm_shuffle_ps(V,V,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vTempY = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vTempZ = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2)); + XMVECTOR vTempW = _mm_shuffle_ps(V,V,_MM_SHUFFLE(3,3,3,3)); + // Mul by the matrix + vTempX = _mm_mul_ps(vTempX,M.r[0]); + vTempY = _mm_mul_ps(vTempY,M.r[1]); + vTempZ = _mm_mul_ps(vTempZ,M.r[2]); + vTempW = _mm_mul_ps(vTempW,M.r[3]); + // Add them all together + vTempX = _mm_add_ps(vTempX,vTempY); + vTempZ = 
_mm_add_ps(vTempZ,vTempW);
+    vTempX = _mm_add_ps(vTempX,vTempZ);
+    return vTempX;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMFLOAT4* XMVector4TransformStream
+(
+    XMFLOAT4* pOutputStream,
+    size_t OutputStride,
+    CONST XMFLOAT4* pInputStream,
+    size_t InputStride,
+    size_t VectorCount,
+    CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR V;
+    XMVECTOR X;
+    XMVECTOR Y;
+    XMVECTOR Z;
+    XMVECTOR W;
+    XMVECTOR Result;
+    size_t i;
+    CONST BYTE* pInputVector = (CONST BYTE*)pInputStream;
+    BYTE* pOutputVector = (BYTE*)pOutputStream;
+
+    XMASSERT(pOutputStream);
+    XMASSERT(pInputStream);
+
+    for (i = 0; i < VectorCount; i++)
+    {
+        V = XMLoadFloat4((const XMFLOAT4*)pInputVector);
+        W = XMVectorSplatW(V);
+        Z = XMVectorSplatZ(V);
+        Y = XMVectorSplatY(V);
+        X = XMVectorSplatX(V);
+//        W = XMVectorReplicate(((XMFLOAT4*)pInputVector)->w);
+//        Z = XMVectorReplicate(((XMFLOAT4*)pInputVector)->z);
+//        Y = XMVectorReplicate(((XMFLOAT4*)pInputVector)->y);
+//        X = XMVectorReplicate(((XMFLOAT4*)pInputVector)->x);
+
+        Result = XMVectorMultiply(W, M.r[3]);
+        Result = XMVectorMultiplyAdd(Z, M.r[2], Result);
+        Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
+        Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+        XMStoreFloat4((XMFLOAT4*)pOutputVector, Result);
+
+        pInputVector += InputStride;
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+    size_t i;
+
+    XMASSERT(pOutputStream);
+    XMASSERT(pInputStream);
+
+    const BYTE*pInputVector = reinterpret_cast<const BYTE*>(pInputStream);
+    BYTE* pOutputVector = reinterpret_cast<BYTE*>(pOutputStream);
+    for (i = 0; i < VectorCount; i++)
+    {
+        // Fetch the row and splat it
+        XMVECTOR vTempx = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+        XMVECTOR vTempy = _mm_shuffle_ps(vTempx,vTempx,_MM_SHUFFLE(1,1,1,1));
+        XMVECTOR vTempz = _mm_shuffle_ps(vTempx,vTempx,_MM_SHUFFLE(2,2,2,2));
+        XMVECTOR vTempw = _mm_shuffle_ps(vTempx,vTempx,_MM_SHUFFLE(3,3,3,3));
+        vTempx = _mm_shuffle_ps(vTempx,vTempx,_MM_SHUFFLE(0,0,0,0));
+        vTempx = _mm_mul_ps(vTempx,M.r[0]);
+        vTempy = _mm_mul_ps(vTempy,M.r[1]);
+        vTempz = _mm_mul_ps(vTempz,M.r[2]);
+        vTempw = _mm_mul_ps(vTempw,M.r[3]);
+        vTempx = _mm_add_ps(vTempx,vTempy);
+        vTempw = _mm_add_ps(vTempw,vTempz);
+        vTempw = _mm_add_ps(vTempw,vTempx);
+        // Store the transformed vector
+        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector),vTempw);
+
+        pInputVector += InputStride;
+        pOutputVector += OutputStride;
+    }
+    return pOutputStream;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+#ifdef __cplusplus
+
+/****************************************************************************
+ *
+ * XMVECTOR operators
+ *
+ ****************************************************************************/
+
+#ifndef XM_NO_OPERATOR_OVERLOADS
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR operator+ (FXMVECTOR V)
+{
+    return V;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR operator- (FXMVECTOR V)
+{
+    return XMVectorNegate(V);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR& operator+=
+(
+    XMVECTOR& V1,
+    FXMVECTOR V2
+)
+{
+    V1 = XMVectorAdd(V1, V2);
+    return V1;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR& operator-=
+(
+ 
XMVECTOR& V1, + FXMVECTOR V2 +) +{ + V1 = XMVectorSubtract(V1, V2); + return V1; +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR& operator*= +( + XMVECTOR& V1, + FXMVECTOR V2 +) +{ + V1 = XMVectorMultiply(V1, V2); + return V1; +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR& operator/= +( + XMVECTOR& V1, + FXMVECTOR V2 +) +{ + V1 = XMVectorDivide(V1,V2); + return V1; +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR& operator*= +( + XMVECTOR& V, + CONST FLOAT S +) +{ + V = XMVectorScale(V, S); + return V; +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR& operator/= +( + XMVECTOR& V, + CONST FLOAT S +) +{ + V = XMVectorScale(V, 1.0f / S); + return V; +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR operator+ +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + return XMVectorAdd(V1, V2); +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR operator- +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + return XMVectorSubtract(V1, V2); +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR operator* +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + return XMVectorMultiply(V1, V2); +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR operator/ +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + return XMVectorDivide(V1,V2); +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR operator* +( + FXMVECTOR V, + CONST FLOAT S +) +{ + return XMVectorScale(V, S); +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR operator/ +( + FXMVECTOR V, + CONST FLOAT S +) +{ + return XMVectorScale(V, 1.0f / S); +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMVECTOR operator* +( + FLOAT S, + FXMVECTOR V +) +{ + return XMVectorScale(V, S); +} + +#endif // !XM_NO_OPERATOR_OVERLOADS + +/**************************************************************************** + * + * XMFLOAT2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMFLOAT2::_XMFLOAT2 +( + CONST FLOAT* pArray +) +{ + x = pArray[0]; + y = pArray[1]; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMFLOAT2& _XMFLOAT2::operator= +( + CONST _XMFLOAT2& Float2 +) +{ + x = Float2.x; + y = Float2.y; + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMFLOAT2A& XMFLOAT2A::operator= +( + CONST XMFLOAT2A& Float2 +) +{ + x = Float2.x; + y = Float2.y; + return *this; +} + +/**************************************************************************** + * + * XMINT2 operators + * + ****************************************************************************/ + +XMFINLINE _XMINT2::_XMINT2 +( + CONST INT *pArray +) +{ + x = pArray[0]; + y = pArray[1]; +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMINT2& _XMINT2::operator= +( + CONST _XMINT2& Int2 +) +{ + x = Int2.x; + y = Int2.y; 
+ return *this; +} + +/**************************************************************************** + * + * XMUINT2 operators + * + ****************************************************************************/ + +XMFINLINE _XMUINT2::_XMUINT2 +( + CONST UINT *pArray +) +{ + x = pArray[0]; + y = pArray[1]; +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMUINT2& _XMUINT2::operator= +( + CONST _XMUINT2& UInt2 +) +{ + x = UInt2.x; + y = UInt2.y; + return *this; +} + +/**************************************************************************** + * + * XMHALF2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMHALF2::_XMHALF2 +( + CONST HALF* pArray +) +{ + x = pArray[0]; + y = pArray[1]; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMHALF2::_XMHALF2 +( + FLOAT _x, + FLOAT _y +) +{ + x = XMConvertFloatToHalf(_x); + y = XMConvertFloatToHalf(_y); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMHALF2::_XMHALF2 +( + CONST FLOAT* pArray +) +{ + x = XMConvertFloatToHalf(pArray[0]); + y = XMConvertFloatToHalf(pArray[1]); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMHALF2& _XMHALF2::operator= +( + CONST _XMHALF2& Half2 +) +{ + x = Half2.x; + y = Half2.y; + return *this; +} + +/**************************************************************************** + * + * XMSHORTN2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMSHORTN2::_XMSHORTN2 +( + CONST SHORT* pArray +) +{ + x = pArray[0]; + y = pArray[1]; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMSHORTN2::_XMSHORTN2 +( + FLOAT _x, + FLOAT _y +) +{ + XMStoreShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMSHORTN2::_XMSHORTN2 +( + CONST FLOAT* pArray +) +{ + XMStoreShortN2(this, XMLoadFloat2((const XMFLOAT2*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMSHORTN2& _XMSHORTN2::operator= +( + CONST _XMSHORTN2& ShortN2 +) +{ + x = ShortN2.x; + y = ShortN2.y; + return *this; +} + +/**************************************************************************** + * + * XMSHORT2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMSHORT2::_XMSHORT2 +( + CONST SHORT* pArray +) +{ + x = pArray[0]; + y = pArray[1]; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMSHORT2::_XMSHORT2 +( + FLOAT _x, + FLOAT _y +) +{ + XMStoreShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMSHORT2::_XMSHORT2 +( + CONST FLOAT* pArray +) +{ + XMStoreShort2(this, XMLoadFloat2((const XMFLOAT2*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMSHORT2& _XMSHORT2::operator= +( + CONST _XMSHORT2& Short2 +) +{ + x = Short2.x; 
+ y = Short2.y; + return *this; +} + +/**************************************************************************** + * + * XMUSHORTN2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUSHORTN2::_XMUSHORTN2 +( + CONST USHORT* pArray +) +{ + x = pArray[0]; + y = pArray[1]; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUSHORTN2::_XMUSHORTN2 +( + FLOAT _x, + FLOAT _y +) +{ + XMStoreUShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUSHORTN2::_XMUSHORTN2 +( + CONST FLOAT* pArray +) +{ + XMStoreUShortN2(this, XMLoadFloat2((const XMFLOAT2*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUSHORTN2& _XMUSHORTN2::operator= +( + CONST _XMUSHORTN2& UShortN2 +) +{ + x = UShortN2.x; + y = UShortN2.y; + return *this; +} + +/**************************************************************************** + * + * XMUSHORT2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUSHORT2::_XMUSHORT2 +( + CONST USHORT* pArray +) +{ + x = pArray[0]; + y = pArray[1]; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUSHORT2::_XMUSHORT2 +( + FLOAT _x, + FLOAT _y +) +{ + XMStoreUShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUSHORT2::_XMUSHORT2 +( + CONST FLOAT* pArray +) +{ + XMStoreUShort2(this, XMLoadFloat2((const XMFLOAT2*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUSHORT2& _XMUSHORT2::operator= +( + CONST _XMUSHORT2& UShort2 +) +{ + x = UShort2.x; + y = UShort2.y; + return *this; +} + +/**************************************************************************** + * + * XMBYTEN2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMBYTEN2::_XMBYTEN2 +( + CONST CHAR* pArray +) +{ + x = pArray[0]; + y = pArray[1]; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMBYTEN2::_XMBYTEN2 +( + FLOAT _x, + FLOAT _y +) +{ + XMStoreByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMBYTEN2::_XMBYTEN2 +( + CONST FLOAT* pArray +) +{ + XMStoreByteN2(this, XMLoadFloat2((const XMFLOAT2*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMBYTEN2& _XMBYTEN2::operator= +( + CONST _XMBYTEN2& ByteN2 +) +{ + x = ByteN2.x; + y = ByteN2.y; + return *this; +} + +/**************************************************************************** + * + * XMBYTE2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMBYTE2::_XMBYTE2 +( + CONST CHAR* pArray +) +{ + x = pArray[0]; + y = pArray[1]; +} + 
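+// Usage sketch (illustrative only, not part of the upstream source): the packed
+// 2-component byte types above are usually built from floats and expanded back to an
+// XMVECTOR with the matching load helper, e.g.
+//     XMBYTEN2 packed(0.5f, -1.0f);        // stored as signed, normalized bytes
+//     XMVECTOR v = XMLoadByteN2(&packed);  // x,y come back as approximately 0.5f, -1.0f
+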
+//------------------------------------------------------------------------------ + +XMFINLINE _XMBYTE2::_XMBYTE2 +( + FLOAT _x, + FLOAT _y +) +{ + XMStoreByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMBYTE2::_XMBYTE2 +( + CONST FLOAT* pArray +) +{ + XMStoreByte2(this, XMLoadFloat2((const XMFLOAT2*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMBYTE2& _XMBYTE2::operator= +( + CONST _XMBYTE2& Byte2 +) +{ + x = Byte2.x; + y = Byte2.y; + return *this; +} + +/**************************************************************************** + * + * XMUBYTEN2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUBYTEN2::_XMUBYTEN2 +( + CONST BYTE* pArray +) +{ + x = pArray[0]; + y = pArray[1]; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUBYTEN2::_XMUBYTEN2 +( + FLOAT _x, + FLOAT _y +) +{ + XMStoreUByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUBYTEN2::_XMUBYTEN2 +( + CONST FLOAT* pArray +) +{ + XMStoreUByteN2(this, XMLoadFloat2((const XMFLOAT2*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUBYTEN2& _XMUBYTEN2::operator= +( + CONST _XMUBYTEN2& UByteN2 +) +{ + x = UByteN2.x; + y = UByteN2.y; + return *this; +} + +/**************************************************************************** + * + * XMUBYTE2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUBYTE2::_XMUBYTE2 +( + CONST BYTE* pArray +) +{ + x = pArray[0]; + y = pArray[1]; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUBYTE2::_XMUBYTE2 +( + FLOAT _x, + FLOAT _y +) +{ + XMStoreUByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUBYTE2::_XMUBYTE2 +( + CONST FLOAT* pArray +) +{ + XMStoreUByte2(this, XMLoadFloat2((const XMFLOAT2*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUBYTE2& _XMUBYTE2::operator= +( + CONST _XMUBYTE2& UByte2 +) +{ + x = UByte2.x; + y = UByte2.y; + return *this; +} + +/**************************************************************************** + * + * XMFLOAT3 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMFLOAT3::_XMFLOAT3 +( + CONST FLOAT* pArray +) +{ + x = pArray[0]; + y = pArray[1]; + z = pArray[2]; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMFLOAT3& _XMFLOAT3::operator= +( + CONST _XMFLOAT3& Float3 +) +{ + x = Float3.x; + y = Float3.y; + z = Float3.z; + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMFLOAT3A& XMFLOAT3A::operator= +( + CONST XMFLOAT3A& Float3 +) +{ + x = Float3.x; + y = Float3.y; + z = Float3.z; + return *this; +} + 
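+// Usage sketch (illustrative only, not part of the upstream source): XMFLOAT3 is the
+// plain storage form; computation is done on XMVECTOR and stored back afterwards, e.g.
+//     XMFLOAT3 stored(1.0f, 2.0f, 3.0f);
+//     XMVECTOR v = XMLoadFloat3(&stored);  // storage -> register form
+//     v = XMVector3Normalize(v);
+//     XMStoreFloat3(&stored, v);           // register -> storage form
+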
+/**************************************************************************** + * + * XMINT3 operators + * + ****************************************************************************/ + +XMFINLINE _XMINT3::_XMINT3 +( + CONST INT *pArray +) +{ + x = pArray[0]; + y = pArray[1]; + z = pArray[2]; +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMINT3& _XMINT3::operator= +( + CONST _XMINT3& Int3 +) +{ + x = Int3.x; + y = Int3.y; + z = Int3.z; + return *this; +} + +/**************************************************************************** + * + * XMUINT3 operators + * + ****************************************************************************/ + +XMFINLINE _XMUINT3::_XMUINT3 +( + CONST UINT *pArray +) +{ + x = pArray[0]; + y = pArray[1]; + z = pArray[2]; +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMUINT3& _XMUINT3::operator= +( + CONST _XMUINT3& UInt3 +) +{ + x = UInt3.x; + y = UInt3.y; + z = UInt3.z; + return *this; +} + +/**************************************************************************** + * + * XMHENDN3 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMHENDN3::_XMHENDN3 +( + FLOAT _x, + FLOAT _y, + FLOAT _z +) +{ + XMStoreHenDN3(this, XMVectorSet(_x, _y, _z, 0.0f)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMHENDN3::_XMHENDN3 +( + CONST FLOAT* pArray +) +{ + XMStoreHenDN3(this, XMLoadFloat3((const XMFLOAT3*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMHENDN3& _XMHENDN3::operator= +( + CONST _XMHENDN3& HenDN3 +) +{ + v = HenDN3.v; + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMHENDN3& _XMHENDN3::operator= +( + CONST UINT Packed +) +{ + v = Packed; + return *this; +} + +/**************************************************************************** + * + * XMHEND3 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMHEND3::_XMHEND3 +( + FLOAT _x, + FLOAT _y, + FLOAT _z +) +{ + XMStoreHenD3(this, XMVectorSet(_x, _y, _z, 0.0f)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMHEND3::_XMHEND3 +( + CONST FLOAT* pArray +) +{ + XMStoreHenD3(this, XMLoadFloat3((const XMFLOAT3*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMHEND3& _XMHEND3::operator= +( + CONST _XMHEND3& HenD3 +) +{ + v = HenD3.v; + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMHEND3& _XMHEND3::operator= +( + CONST UINT Packed +) +{ + v = Packed; + return *this; +} + +/**************************************************************************** + * + * XMUHENDN3 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUHENDN3::_XMUHENDN3 +( + FLOAT _x, + FLOAT _y, + FLOAT _z +) +{ + XMStoreUHenDN3(this, XMVectorSet(_x, _y, _z, 0.0f)); +} + 
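+// Usage sketch (illustrative only, not part of the upstream source): the constructor
+// above packs three [0,1] floats into a single 32-bit word with an 11:11:10 bit layout, e.g.
+//     XMUHENDN3 packed(0.25f, 0.5f, 1.0f);
+//     XMVECTOR v = XMLoadUHenDN3(&packed); // expands back with 11/11/10-bit precision
+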
+//------------------------------------------------------------------------------ + +XMFINLINE _XMUHENDN3::_XMUHENDN3 +( + CONST FLOAT* pArray +) +{ + XMStoreUHenDN3(this, XMLoadFloat3((const XMFLOAT3*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUHENDN3& _XMUHENDN3::operator= +( + CONST _XMUHENDN3& UHenDN3 +) +{ + v = UHenDN3.v; + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUHENDN3& _XMUHENDN3::operator= +( + CONST UINT Packed +) +{ + v = Packed; + return *this; +} + +/**************************************************************************** + * + * XMUHEND3 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUHEND3::_XMUHEND3 +( + FLOAT _x, + FLOAT _y, + FLOAT _z +) +{ + XMStoreUHenD3(this, XMVectorSet(_x, _y, _z, 0.0f)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUHEND3::_XMUHEND3 +( + CONST FLOAT* pArray +) +{ + XMStoreUHenD3(this, XMLoadFloat3((const XMFLOAT3*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUHEND3& _XMUHEND3::operator= +( + CONST _XMUHEND3& UHenD3 +) +{ + v = UHenD3.v; + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUHEND3& _XMUHEND3::operator= +( + CONST UINT Packed +) +{ + v = Packed; + return *this; +} + +/**************************************************************************** + * + * XMDHENN3 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMDHENN3::_XMDHENN3 +( + FLOAT _x, + FLOAT _y, + FLOAT _z +) +{ + XMStoreDHenN3(this, XMVectorSet(_x, _y, _z, 0.0f)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMDHENN3::_XMDHENN3 +( + CONST FLOAT* pArray +) +{ + XMStoreDHenN3(this, XMLoadFloat3((const XMFLOAT3*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMDHENN3& _XMDHENN3::operator= +( + CONST _XMDHENN3& DHenN3 +) +{ + v = DHenN3.v; + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMDHENN3& _XMDHENN3::operator= +( + CONST UINT Packed +) +{ + v = Packed; + return *this; +} + +/**************************************************************************** + * + * XMDHEN3 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMDHEN3::_XMDHEN3 +( + FLOAT _x, + FLOAT _y, + FLOAT _z +) +{ + XMStoreDHen3(this, XMVectorSet(_x, _y, _z, 0.0f)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMDHEN3::_XMDHEN3 +( + CONST FLOAT* pArray +) +{ + XMStoreDHen3(this, XMLoadFloat3((const XMFLOAT3*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMDHEN3& _XMDHEN3::operator= +( + CONST _XMDHEN3& DHen3 +) +{ + v = DHen3.v; + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE 
_XMDHEN3& _XMDHEN3::operator= +( + CONST UINT Packed +) +{ + v = Packed; + return *this; +} + +/**************************************************************************** + * + * XMUDHENN3 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUDHENN3::_XMUDHENN3 +( + FLOAT _x, + FLOAT _y, + FLOAT _z +) +{ + XMStoreUDHenN3(this, XMVectorSet(_x, _y, _z, 0.0f)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUDHENN3::_XMUDHENN3 +( + CONST FLOAT* pArray +) +{ + XMStoreUDHenN3(this, XMLoadFloat3((const XMFLOAT3*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUDHENN3& _XMUDHENN3::operator= +( + CONST _XMUDHENN3& UDHenN3 +) +{ + v = UDHenN3.v; + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUDHENN3& _XMUDHENN3::operator= +( + CONST UINT Packed +) +{ + v = Packed; + return *this; +} + +/**************************************************************************** + * + * XMUDHEN3 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUDHEN3::_XMUDHEN3 +( + FLOAT _x, + FLOAT _y, + FLOAT _z +) +{ + XMStoreUDHen3(this, XMVectorSet(_x, _y, _z, 0.0f)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUDHEN3::_XMUDHEN3 +( + CONST FLOAT* pArray +) +{ + XMStoreUDHen3(this, XMLoadFloat3((const XMFLOAT3*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUDHEN3& _XMUDHEN3::operator= +( + CONST _XMUDHEN3& UDHen3 +) +{ + v = UDHen3.v; + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUDHEN3& _XMUDHEN3::operator= +( + CONST UINT Packed +) +{ + v = Packed; + return *this; +} + +/**************************************************************************** + * + * XMU565 operators + * + ****************************************************************************/ + +XMFINLINE _XMU565::_XMU565 +( + CONST CHAR *pArray +) +{ + x = pArray[0]; + y = pArray[1]; + z = pArray[2]; +} + +XMFINLINE _XMU565::_XMU565 +( + FLOAT _x, + FLOAT _y, + FLOAT _z +) +{ + XMStoreU565(this, XMVectorSet( _x, _y, _z, 0.0f )); +} + +XMFINLINE _XMU565::_XMU565 +( + CONST FLOAT *pArray +) +{ + XMStoreU565(this, XMLoadFloat3((const XMFLOAT3*)pArray )); +} + +XMFINLINE _XMU565& _XMU565::operator= +( + CONST _XMU565& U565 +) +{ + v = U565.v; + return *this; +} + +XMFINLINE _XMU565& _XMU565::operator= +( + CONST USHORT Packed +) +{ + v = Packed; + return *this; +} + +/**************************************************************************** + * + * XMFLOAT3PK operators + * + ****************************************************************************/ + +XMFINLINE _XMFLOAT3PK::_XMFLOAT3PK +( + FLOAT _x, + FLOAT _y, + FLOAT _z +) +{ + XMStoreFloat3PK(this, XMVectorSet( _x, _y, _z, 0.0f )); +} + +XMFINLINE _XMFLOAT3PK::_XMFLOAT3PK +( + CONST FLOAT *pArray +) +{ + XMStoreFloat3PK(this, XMLoadFloat3((const XMFLOAT3*)pArray )); +} + +XMFINLINE _XMFLOAT3PK& _XMFLOAT3PK::operator= +( + CONST _XMFLOAT3PK& float3pk +) +{ + v = float3pk.v; + return *this; +} + +XMFINLINE _XMFLOAT3PK& _XMFLOAT3PK::operator= 
+( + CONST UINT Packed +) +{ + v = Packed; + return *this; +} + +/**************************************************************************** + * + * XMFLOAT3SE operators + * + ****************************************************************************/ + +XMFINLINE _XMFLOAT3SE::_XMFLOAT3SE +( + FLOAT _x, + FLOAT _y, + FLOAT _z +) +{ + XMStoreFloat3SE(this, XMVectorSet( _x, _y, _z, 0.0f )); +} + +XMFINLINE _XMFLOAT3SE::_XMFLOAT3SE +( + CONST FLOAT *pArray +) +{ + XMStoreFloat3SE(this, XMLoadFloat3((const XMFLOAT3*)pArray )); +} + +XMFINLINE _XMFLOAT3SE& _XMFLOAT3SE::operator= +( + CONST _XMFLOAT3SE& float3se +) +{ + v = float3se.v; + return *this; +} + +XMFINLINE _XMFLOAT3SE& _XMFLOAT3SE::operator= +( + CONST UINT Packed +) +{ + v = Packed; + return *this; +} + +/**************************************************************************** + * + * XMFLOAT4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMFLOAT4::_XMFLOAT4 +( + CONST FLOAT* pArray +) +{ + x = pArray[0]; + y = pArray[1]; + z = pArray[2]; + w = pArray[3]; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMFLOAT4& _XMFLOAT4::operator= +( + CONST _XMFLOAT4& Float4 +) +{ + x = Float4.x; + y = Float4.y; + z = Float4.z; + w = Float4.w; + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMFLOAT4A& XMFLOAT4A::operator= +( + CONST XMFLOAT4A& Float4 +) +{ + x = Float4.x; + y = Float4.y; + z = Float4.z; + w = Float4.w; + return *this; +} + +/**************************************************************************** + * + * XMINT4 operators + * + ****************************************************************************/ + +XMFINLINE _XMINT4::_XMINT4 +( + CONST INT *pArray +) +{ + x = pArray[0]; + y = pArray[1]; + z = pArray[2]; + w = pArray[3]; +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMINT4& _XMINT4::operator= +( + CONST _XMINT4& Int4 +) +{ + x = Int4.x; + y = Int4.y; + z = Int4.z; + w = Int4.w; + return *this; +} + +/**************************************************************************** + * + * XMUINT4 operators + * + ****************************************************************************/ + +XMFINLINE _XMUINT4::_XMUINT4 +( + CONST UINT *pArray +) +{ + x = pArray[0]; + y = pArray[1]; + z = pArray[2]; + w = pArray[3]; +} + +//------------------------------------------------------------------------------ + +XMFINLINE XMUINT4& _XMUINT4::operator= +( + CONST _XMUINT4& UInt4 +) +{ + x = UInt4.x; + y = UInt4.y; + z = UInt4.z; + w = UInt4.w; + return *this; +} + +/**************************************************************************** + * + * XMHALF4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMHALF4::_XMHALF4 +( + CONST HALF* pArray +) +{ + x = pArray[0]; + y = pArray[1]; + z = pArray[2]; + w = pArray[3]; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMHALF4::_XMHALF4 +( + FLOAT _x, + FLOAT _y, + FLOAT _z, + FLOAT _w +) +{ + x = XMConvertFloatToHalf(_x); + y = XMConvertFloatToHalf(_y); + z = XMConvertFloatToHalf(_z); + w = XMConvertFloatToHalf(_w); +} + 
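+// Usage sketch (illustrative only, not part of the upstream source): the constructor
+// above converts each 32-bit float to a 16-bit half, so values round-trip with reduced
+// precision, e.g.
+//     XMHALF4 h(1.0f, 0.5f, -2.0f, 0.0f);
+//     FLOAT x = XMConvertHalfToFloat(h.x);  // 1.0f (exactly representable as a half)
+//     XMVECTOR v = XMLoadHalf4(&h);         // all four components widened back to floats
+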
+//------------------------------------------------------------------------------ + +XMFINLINE _XMHALF4::_XMHALF4 +( + CONST FLOAT* pArray +) +{ + XMConvertFloatToHalfStream(&x, sizeof(HALF), pArray, sizeof(FLOAT), 4); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMHALF4& _XMHALF4::operator= +( + CONST _XMHALF4& Half4 +) +{ + x = Half4.x; + y = Half4.y; + z = Half4.z; + w = Half4.w; + return *this; +} + +/**************************************************************************** + * + * XMSHORTN4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMSHORTN4::_XMSHORTN4 +( + CONST SHORT* pArray +) +{ + x = pArray[0]; + y = pArray[1]; + z = pArray[2]; + w = pArray[3]; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMSHORTN4::_XMSHORTN4 +( + FLOAT _x, + FLOAT _y, + FLOAT _z, + FLOAT _w +) +{ + XMStoreShortN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMSHORTN4::_XMSHORTN4 +( + CONST FLOAT* pArray +) +{ + XMStoreShortN4(this, XMLoadFloat4((const XMFLOAT4*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMSHORTN4& _XMSHORTN4::operator= +( + CONST _XMSHORTN4& ShortN4 +) +{ + x = ShortN4.x; + y = ShortN4.y; + z = ShortN4.z; + w = ShortN4.w; + return *this; +} + +/**************************************************************************** + * + * XMSHORT4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMSHORT4::_XMSHORT4 +( + CONST SHORT* pArray +) +{ + x = pArray[0]; + y = pArray[1]; + z = pArray[2]; + w = pArray[3]; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMSHORT4::_XMSHORT4 +( + FLOAT _x, + FLOAT _y, + FLOAT _z, + FLOAT _w +) +{ + XMStoreShort4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMSHORT4::_XMSHORT4 +( + CONST FLOAT* pArray +) +{ + XMStoreShort4(this, XMLoadFloat4((const XMFLOAT4*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMSHORT4& _XMSHORT4::operator= +( + CONST _XMSHORT4& Short4 +) +{ + x = Short4.x; + y = Short4.y; + z = Short4.z; + w = Short4.w; + return *this; +} + +/**************************************************************************** + * + * XMUSHORTN4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUSHORTN4::_XMUSHORTN4 +( + CONST USHORT* pArray +) +{ + x = pArray[0]; + y = pArray[1]; + z = pArray[2]; + w = pArray[3]; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUSHORTN4::_XMUSHORTN4 +( + FLOAT _x, + FLOAT _y, + FLOAT _z, + FLOAT _w +) +{ + XMStoreUShortN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUSHORTN4::_XMUSHORTN4 +( + CONST FLOAT* pArray +) +{ + XMStoreUShortN4(this, XMLoadFloat4((const XMFLOAT4*)pArray)); 
+} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUSHORTN4& _XMUSHORTN4::operator= +( + CONST _XMUSHORTN4& UShortN4 +) +{ + x = UShortN4.x; + y = UShortN4.y; + z = UShortN4.z; + w = UShortN4.w; + return *this; +} + +/**************************************************************************** + * + * XMUSHORT4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUSHORT4::_XMUSHORT4 +( + CONST USHORT* pArray +) +{ + x = pArray[0]; + y = pArray[1]; + z = pArray[2]; + w = pArray[3]; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUSHORT4::_XMUSHORT4 +( + FLOAT _x, + FLOAT _y, + FLOAT _z, + FLOAT _w +) +{ + XMStoreUShort4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUSHORT4::_XMUSHORT4 +( + CONST FLOAT* pArray +) +{ + XMStoreUShort4(this, XMLoadFloat4((const XMFLOAT4*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUSHORT4& _XMUSHORT4::operator= +( + CONST _XMUSHORT4& UShort4 +) +{ + x = UShort4.x; + y = UShort4.y; + z = UShort4.z; + w = UShort4.w; + return *this; +} + +/**************************************************************************** + * + * XMXDECN4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMXDECN4::_XMXDECN4 +( + FLOAT _x, + FLOAT _y, + FLOAT _z, + FLOAT _w +) +{ + XMStoreXDecN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMXDECN4::_XMXDECN4 +( + CONST FLOAT* pArray +) +{ + XMStoreXDecN4(this, XMLoadFloat4((const XMFLOAT4*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMXDECN4& _XMXDECN4::operator= +( + CONST _XMXDECN4& XDecN4 +) +{ + v = XDecN4.v; + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMXDECN4& _XMXDECN4::operator= +( + CONST UINT Packed +) +{ + v = Packed; + return *this; +} + +/**************************************************************************** + * + * XMXDEC4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMXDEC4::_XMXDEC4 +( + FLOAT _x, + FLOAT _y, + FLOAT _z, + FLOAT _w +) +{ + XMStoreXDec4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMXDEC4::_XMXDEC4 +( + CONST FLOAT* pArray +) +{ + XMStoreXDec4(this, XMLoadFloat4((const XMFLOAT4*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMXDEC4& _XMXDEC4::operator= +( + CONST _XMXDEC4& XDec4 +) +{ + v = XDec4.v; + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMXDEC4& _XMXDEC4::operator= +( + CONST UINT Packed +) +{ + v = Packed; + return *this; +} + +/**************************************************************************** + * + * XMDECN4 operators + * + 
****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMDECN4::_XMDECN4 +( + FLOAT _x, + FLOAT _y, + FLOAT _z, + FLOAT _w +) +{ + XMStoreDecN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMDECN4::_XMDECN4 +( + CONST FLOAT* pArray +) +{ + XMStoreDecN4(this, XMLoadFloat4((const XMFLOAT4*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMDECN4& _XMDECN4::operator= +( + CONST _XMDECN4& DecN4 +) +{ + v = DecN4.v; + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMDECN4& _XMDECN4::operator= +( + CONST UINT Packed +) +{ + v = Packed; + return *this; +} + +/**************************************************************************** + * + * XMDEC4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMDEC4::_XMDEC4 +( + FLOAT _x, + FLOAT _y, + FLOAT _z, + FLOAT _w +) +{ + XMStoreDec4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMDEC4::_XMDEC4 +( + CONST FLOAT* pArray +) +{ + XMStoreDec4(this, XMLoadFloat4((const XMFLOAT4*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMDEC4& _XMDEC4::operator= +( + CONST _XMDEC4& Dec4 +) +{ + v = Dec4.v; + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMDEC4& _XMDEC4::operator= +( + CONST UINT Packed +) +{ + v = Packed; + return *this; +} + +/**************************************************************************** + * + * XMUDECN4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUDECN4::_XMUDECN4 +( + FLOAT _x, + FLOAT _y, + FLOAT _z, + FLOAT _w +) +{ + XMStoreUDecN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUDECN4::_XMUDECN4 +( + CONST FLOAT* pArray +) +{ + XMStoreUDecN4(this, XMLoadFloat4((const XMFLOAT4*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUDECN4& _XMUDECN4::operator= +( + CONST _XMUDECN4& UDecN4 +) +{ + v = UDecN4.v; + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUDECN4& _XMUDECN4::operator= +( + CONST UINT Packed +) +{ + v = Packed; + return *this; +} + +/**************************************************************************** + * + * XMUDEC4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUDEC4::_XMUDEC4 +( + FLOAT _x, + FLOAT _y, + FLOAT _z, + FLOAT _w +) +{ + XMStoreUDec4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUDEC4::_XMUDEC4 +( + CONST FLOAT* pArray +) +{ + XMStoreUDec4(this, XMLoadFloat4((const XMFLOAT4*)pArray)); +} + 
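+// Usage sketch (illustrative only, not part of the upstream source): the UDecN4 form
+// defined above is a common 10:10:10:2 unsigned-normalized layout for normals and colors, e.g.
+//     XMUDECN4 packed(1.0f, 0.5f, 0.25f, 1.0f);
+//     XMVECTOR v = XMLoadUDecN4(&packed);  // x,y,z return with ~10-bit precision, w with 2-bit
+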
+//------------------------------------------------------------------------------ + +XMFINLINE _XMUDEC4& _XMUDEC4::operator= +( + CONST _XMUDEC4& UDec4 +) +{ + v = UDec4.v; + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUDEC4& _XMUDEC4::operator= +( + CONST UINT Packed +) +{ + v = Packed; + return *this; +} + +/**************************************************************************** + * + * XMXICON4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMXICON4::_XMXICON4 +( + FLOAT _x, + FLOAT _y, + FLOAT _z, + FLOAT _w +) +{ + XMStoreXIcoN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMXICON4::_XMXICON4 +( + CONST FLOAT* pArray +) +{ + XMStoreXIcoN4(this, XMLoadFloat4((const XMFLOAT4*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMXICON4& _XMXICON4::operator= +( + CONST _XMXICON4& XIcoN4 +) +{ + v = XIcoN4.v; + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMXICON4& _XMXICON4::operator= +( + CONST UINT64 Packed +) +{ + v = Packed; + return *this; +} + +/**************************************************************************** + * + * XMXICO4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMXICO4::_XMXICO4 +( + FLOAT _x, + FLOAT _y, + FLOAT _z, + FLOAT _w +) +{ + XMStoreXIco4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMXICO4::_XMXICO4 +( + CONST FLOAT* pArray +) +{ + XMStoreXIco4(this, XMLoadFloat4((const XMFLOAT4*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMXICO4& _XMXICO4::operator= +( + CONST _XMXICO4& XIco4 +) +{ + v = XIco4.v; + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMXICO4& _XMXICO4::operator= +( + CONST UINT64 Packed +) +{ + v = Packed; + return *this; +} + +/**************************************************************************** + * + * XMICON4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMICON4::_XMICON4 +( + FLOAT _x, + FLOAT _y, + FLOAT _z, + FLOAT _w +) +{ + XMStoreIcoN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMICON4::_XMICON4 +( + CONST FLOAT* pArray +) +{ + XMStoreIcoN4(this, XMLoadFloat4((const XMFLOAT4*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMICON4& _XMICON4::operator= +( + CONST _XMICON4& IcoN4 +) +{ + v = IcoN4.v; + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMICON4& _XMICON4::operator= +( + CONST UINT64 Packed +) +{ + v = Packed; + return *this; +} + +/**************************************************************************** + * + * XMICO4 operators + * + 
****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMICO4::_XMICO4 +( + FLOAT _x, + FLOAT _y, + FLOAT _z, + FLOAT _w +) +{ + XMStoreIco4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMICO4::_XMICO4 +( + CONST FLOAT* pArray +) +{ + XMStoreIco4(this, XMLoadFloat4((const XMFLOAT4*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMICO4& _XMICO4::operator= +( + CONST _XMICO4& Ico4 +) +{ + v = Ico4.v; + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMICO4& _XMICO4::operator= +( + CONST UINT64 Packed +) +{ + v = Packed; + return *this; +} + +/**************************************************************************** + * + * XMUICON4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUICON4::_XMUICON4 +( + FLOAT _x, + FLOAT _y, + FLOAT _z, + FLOAT _w +) +{ + XMStoreUIcoN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUICON4::_XMUICON4 +( + CONST FLOAT* pArray +) +{ + XMStoreUIcoN4(this, XMLoadFloat4((const XMFLOAT4*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUICON4& _XMUICON4::operator= +( + CONST _XMUICON4& UIcoN4 +) +{ + v = UIcoN4.v; + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUICON4& _XMUICON4::operator= +( + CONST UINT64 Packed +) +{ + v = Packed; + return *this; +} + +/**************************************************************************** + * + * XMUICO4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUICO4::_XMUICO4 +( + FLOAT _x, + FLOAT _y, + FLOAT _z, + FLOAT _w +) +{ + XMStoreUIco4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUICO4::_XMUICO4 +( + CONST FLOAT* pArray +) +{ + XMStoreUIco4(this, XMLoadFloat4((const XMFLOAT4*)pArray)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUICO4& _XMUICO4::operator= +( + CONST _XMUICO4& UIco4 +) +{ + v = UIco4.v; + return *this; +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMUICO4& _XMUICO4::operator= +( + CONST UINT64 Packed +) +{ + v = Packed; + return *this; +} + +/**************************************************************************** + * + * XMCOLOR4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +XMFINLINE _XMCOLOR::_XMCOLOR +( + FLOAT _r, + FLOAT _g, + FLOAT _b, + FLOAT _a +) +{ + XMStoreColor(this, XMVectorSet(_r, _g, _b, _a)); +} + +//------------------------------------------------------------------------------ + +XMFINLINE _XMCOLOR::_XMCOLOR +( + CONST FLOAT* pArray +) +{ + XMStoreColor(this, XMLoadFloat4((const XMFLOAT4*)pArray)); 
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMCOLOR& _XMCOLOR::operator=
+(
+    CONST _XMCOLOR& Color
+)
+{
+    c = Color.c;
+    return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMCOLOR& _XMCOLOR::operator=
+(
+    CONST UINT Color
+)
+{
+    c = Color;
+    return *this;
+}
+
+/****************************************************************************
+ *
+ * XMBYTEN4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMBYTEN4::_XMBYTEN4
+(
+    CONST CHAR* pArray
+)
+{
+    x = pArray[0];
+    y = pArray[1];
+    z = pArray[2];
+    w = pArray[3];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMBYTEN4::_XMBYTEN4
+(
+    FLOAT _x,
+    FLOAT _y,
+    FLOAT _z,
+    FLOAT _w
+)
+{
+    XMStoreByteN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMBYTEN4::_XMBYTEN4
+(
+    CONST FLOAT* pArray
+)
+{
+    XMStoreByteN4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMBYTEN4& _XMBYTEN4::operator=
+(
+    CONST _XMBYTEN4& ByteN4
+)
+{
+    x = ByteN4.x;
+    y = ByteN4.y;
+    z = ByteN4.z;
+    w = ByteN4.w;
+    return *this;
+}
+
+/****************************************************************************
+ *
+ * XMBYTE4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMBYTE4::_XMBYTE4
+(
+    CONST CHAR* pArray
+)
+{
+    x = pArray[0];
+    y = pArray[1];
+    z = pArray[2];
+    w = pArray[3];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMBYTE4::_XMBYTE4
+(
+    FLOAT _x,
+    FLOAT _y,
+    FLOAT _z,
+    FLOAT _w
+)
+{
+    XMStoreByte4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMBYTE4::_XMBYTE4
+(
+    CONST FLOAT* pArray
+)
+{
+    XMStoreByte4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMBYTE4& _XMBYTE4::operator=
+(
+    CONST _XMBYTE4& Byte4
+)
+{
+    x = Byte4.x;
+    y = Byte4.y;
+    z = Byte4.z;
+    w = Byte4.w;
+    return *this;
+}
+
+/****************************************************************************
+ *
+ * XMUBYTEN4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUBYTEN4::_XMUBYTEN4
+(
+    CONST BYTE* pArray
+)
+{
+    x = pArray[0];
+    y = pArray[1];
+    z = pArray[2];
+    w = pArray[3];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUBYTEN4::_XMUBYTEN4
+(
+    FLOAT _x,
+    FLOAT _y,
+    FLOAT _z,
+    FLOAT _w
+)
+{
+    XMStoreUByteN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUBYTEN4::_XMUBYTEN4
+(
+    CONST FLOAT* pArray
+)
+{
+    XMStoreUByteN4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUBYTEN4& _XMUBYTEN4::operator=
+(
+    CONST _XMUBYTEN4& UByteN4
+)
+{
+    x = UByteN4.x;
+    y = UByteN4.y;
+    z = UByteN4.z;
+    w = UByteN4.w;
+    return *this;
+}
+
+/****************************************************************************
+ *
+ * XMUBYTE4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUBYTE4::_XMUBYTE4
+(
+    CONST BYTE* pArray
+)
+{
+    x = pArray[0];
+    y = pArray[1];
+    z = pArray[2];
+    w = pArray[3];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUBYTE4::_XMUBYTE4
+(
+    FLOAT _x,
+    FLOAT _y,
+    FLOAT _z,
+    FLOAT _w
+)
+{
+    XMStoreUByte4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUBYTE4::_XMUBYTE4
+(
+    CONST FLOAT* pArray
+)
+{
+    XMStoreUByte4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUBYTE4& _XMUBYTE4::operator=
+(
+    CONST _XMUBYTE4& UByte4
+)
+{
+    x = UByte4.x;
+    y = UByte4.y;
+    z = UByte4.z;
+    w = UByte4.w;
+    return *this;
+}
+
+/****************************************************************************
+ *
+ * XMUNIBBLE4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUNIBBLE4::_XMUNIBBLE4
+(
+    CONST CHAR *pArray
+)
+{
+    x = pArray[0];
+    y = pArray[1];
+    z = pArray[2];
+    w = pArray[3];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUNIBBLE4::_XMUNIBBLE4
+(
+    FLOAT _x,
+    FLOAT _y,
+    FLOAT _z,
+    FLOAT _w
+)
+{
+    XMStoreUNibble4(this, XMVectorSet( _x, _y, _z, _w ));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUNIBBLE4::_XMUNIBBLE4
+(
+    CONST FLOAT *pArray
+)
+{
+    XMStoreUNibble4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUNIBBLE4& _XMUNIBBLE4::operator=
+(
+    CONST _XMUNIBBLE4& UNibble4
+)
+{
+    v = UNibble4.v;
+    return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUNIBBLE4& _XMUNIBBLE4::operator=
+(
+    CONST USHORT Packed
+)
+{
+    v = Packed;
+    return *this;
+}
+
+/****************************************************************************
+ *
+ * XMU555 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMU555::_XMU555
+(
+    CONST CHAR *pArray,
+    BOOL _w
+)
+{
+    x = pArray[0];
+    y = pArray[1];
+    z = pArray[2];
+    w = _w;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMU555::_XMU555
+(
+    FLOAT _x,
+    FLOAT _y,
+    FLOAT _z,
+    BOOL _w
+)
+{
+    XMStoreU555(this, XMVectorSet(_x, _y, _z, ((_w) ? 1.0f : 0.0f) ));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMU555::_XMU555
+(
+    CONST FLOAT *pArray,
+    BOOL _w
+)
+{
+    XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pArray);
+    XMStoreU555(this, XMVectorSetW(V, ((_w) ? 1.0f : 0.0f) ));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMU555& _XMU555::operator=
+(
+    CONST _XMU555& U555
+)
+{
+    v = U555.v;
+    return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMU555& _XMU555::operator=
+(
+    CONST USHORT Packed
+)
+{
+    v = Packed;
+    return *this;
+}
+
+#endif // __cplusplus
+
+#if defined(_XM_NO_INTRINSICS_)
+#undef XMISNAN
+#undef XMISINF
+#endif
+
+#endif // __XNAMATHVECTOR_INL__
+
-- 
2.43.0