]> git.cworth.org Git - apitrace/blobdiff - thirdparty/directxtex/XNAMath/xnamathmisc.inl
thirdparty/directxtex: Import DirectXTex library.
[apitrace] / thirdparty / directxtex / XNAMath / xnamathmisc.inl
diff --git a/thirdparty/directxtex/XNAMath/xnamathmisc.inl b/thirdparty/directxtex/XNAMath/xnamathmisc.inl
new file mode 100644 (file)
index 0000000..d4d4ef2
--- /dev/null
@@ -0,0 +1,2460 @@
+/************************************************************************
+*                                                                       *
+* xnamathmisc.inl -- SIMD C++ Math library for Windows and Xbox 360     *
+*                    Quaternion, plane, and color functions             *
+*                                                                       *
+* Copyright (c) Microsoft Corp. All rights reserved.                    *
+*                                                                       *
+************************************************************************/
+
+#if defined(_MSC_VER) && (_MSC_VER > 1000)
+#pragma once
+#endif
+
+#ifndef __XNAMATHMISC_INL__
+#define __XNAMATHMISC_INL__
+
+/****************************************************************************
+ *
+ * Quaternion
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+// Comparison operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMQuaternionEqual
+(
+    FXMVECTOR Q1,
+    FXMVECTOR Q2
+)
+{
+    return XMVector4Equal(Q1, Q2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMQuaternionNotEqual
+(
+    FXMVECTOR Q1,
+    FXMVECTOR Q2
+)
+{
+    return XMVector4NotEqual(Q1, Q2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMQuaternionIsNaN
+(
+    FXMVECTOR Q
+)
+{
+    return XMVector4IsNaN(Q);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMQuaternionIsInfinite
+(
+    FXMVECTOR Q
+)
+{
+    return XMVector4IsInfinite(Q);
+}
+
+//------------------------------------------------------------------------------
+
// Tests whether Q is exactly the identity quaternion <0, 0, 0, 1>
// (row 3 of the identity matrix).
XMFINLINE BOOL XMQuaternionIsIdentity
(
    FXMVECTOR Q
)
{
#if defined(_XM_NO_INTRINSICS_)

    return XMVector4Equal(Q, g_XMIdentityR3.v);

#elif defined(_XM_SSE_INTRINSICS_)
    // Per-lane equality against <0,0,0,1>; movemask packs the four lane
    // results into 4 bits, so 0x0f means every component matched.
    XMVECTOR vTemp = _mm_cmpeq_ps(Q,g_XMIdentityR3);
    return (_mm_movemask_ps(vTemp)==0x0f);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+// Computation operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionDot
+(
+    FXMVECTOR Q1,
+    FXMVECTOR Q2
+)
+{
+    return XMVector4Dot(Q1, Q2);
+}
+
+//------------------------------------------------------------------------------
+
// Computes the quaternion product of Q1 and Q2. NOTE(review): per the
// XNAMath/DirectXMath convention the result represents the rotation of Q1
// followed by the rotation of Q2 -- confirm against the library docs before
// relying on the ordering.
XMFINLINE XMVECTOR XMQuaternionMultiply
(
    FXMVECTOR Q1,
    FXMVECTOR Q2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR         NegativeQ1;
    XMVECTOR         Q2X;
    XMVECTOR         Q2Y;
    XMVECTOR         Q2Z;
    XMVECTOR         Q2W;
    XMVECTOR         Q1WZYX;
    XMVECTOR         Q1ZWXY;
    XMVECTOR         Q1YXWZ;
    XMVECTOR         Result;
    // Permute controls selecting from Q1 (0*) / -Q1 (1*) to build the
    // shuffled, partially negated operands of the product expansion.
    CONST XMVECTORU32 ControlWZYX = {XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_1X};
    CONST XMVECTORU32 ControlZWXY = {XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_1Y};
    CONST XMVECTORU32 ControlYXWZ = {XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_1Z};

    NegativeQ1 = XMVectorNegate(Q1);

    // Broadcast each component of Q2 across all four lanes.
    Q2W = XMVectorSplatW(Q2);
    Q2X = XMVectorSplatX(Q2);
    Q2Y = XMVectorSplatY(Q2);
    Q2Z = XMVectorSplatZ(Q2);

    Q1WZYX = XMVectorPermute(Q1, NegativeQ1, ControlWZYX.v);
    Q1ZWXY = XMVectorPermute(Q1, NegativeQ1, ControlZWXY.v);
    Q1YXWZ = XMVectorPermute(Q1, NegativeQ1, ControlYXWZ.v);

    // Result = Q1*Q2.w + Q1WZYX*Q2.x + Q1ZWXY*Q2.y + Q1YXWZ*Q2.z
    Result = XMVectorMultiply(Q1, Q2W);
    Result = XMVectorMultiplyAdd(Q1WZYX, Q2X, Result);
    Result = XMVectorMultiplyAdd(Q1ZWXY, Q2Y, Result);
    Result = XMVectorMultiplyAdd(Q1YXWZ, Q2Z, Result);

    return Result;

#elif defined(_XM_SSE_INTRINSICS_)
    // Sign masks applied after each shuffle; same expansion as above but the
    // negations are done by multiplying with +/-1 constants.
    static CONST XMVECTORF32 ControlWZYX = { 1.0f,-1.0f, 1.0f,-1.0f};
    static CONST XMVECTORF32 ControlZWXY = { 1.0f, 1.0f,-1.0f,-1.0f};
    static CONST XMVECTORF32 ControlYXWZ = {-1.0f, 1.0f, 1.0f,-1.0f};
    // Copy to SSE registers and use as few as possible for x86
    XMVECTOR Q2X = Q2;
    XMVECTOR Q2Y = Q2;
    XMVECTOR Q2Z = Q2;
    XMVECTOR vResult = Q2;
    // Splat with one instruction
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,3,3,3));
    Q2X = _mm_shuffle_ps(Q2X,Q2X,_MM_SHUFFLE(0,0,0,0));
    Q2Y = _mm_shuffle_ps(Q2Y,Q2Y,_MM_SHUFFLE(1,1,1,1));
    Q2Z = _mm_shuffle_ps(Q2Z,Q2Z,_MM_SHUFFLE(2,2,2,2));
    // Retire Q1 and perform Q1*Q2W
    vResult = _mm_mul_ps(vResult,Q1);
    XMVECTOR Q1Shuffle = Q1;
    // Shuffle the copies of Q1
    Q1Shuffle = _mm_shuffle_ps(Q1Shuffle,Q1Shuffle,_MM_SHUFFLE(0,1,2,3));
    // Mul by Q1WZYX
    Q2X = _mm_mul_ps(Q2X,Q1Shuffle);
    Q1Shuffle = _mm_shuffle_ps(Q1Shuffle,Q1Shuffle,_MM_SHUFFLE(2,3,0,1));
    // Flip the signs on y and z
    Q2X = _mm_mul_ps(Q2X,ControlWZYX);
    // Mul by Q1ZWXY
    Q2Y = _mm_mul_ps(Q2Y,Q1Shuffle);
    Q1Shuffle = _mm_shuffle_ps(Q1Shuffle,Q1Shuffle,_MM_SHUFFLE(0,1,2,3));
    // Flip the signs on z and w
    Q2Y = _mm_mul_ps(Q2Y,ControlZWXY);
    // Mul by Q1YXWZ
    Q2Z = _mm_mul_ps(Q2Z,Q1Shuffle);
    vResult = _mm_add_ps(vResult,Q2X);
    // Flip the signs on x and w
    Q2Z = _mm_mul_ps(Q2Z,ControlYXWZ);
    Q2Y = _mm_add_ps(Q2Y,Q2Z);
    vResult = _mm_add_ps(vResult,Q2Y);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionLengthSq
+(
+    FXMVECTOR Q
+)
+{
+    return XMVector4LengthSq(Q);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionReciprocalLength
+(
+    FXMVECTOR Q
+)
+{
+    return XMVector4ReciprocalLength(Q);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionLength
+(
+    FXMVECTOR Q
+)
+{
+    return XMVector4Length(Q);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionNormalizeEst
+(
+    FXMVECTOR Q
+)
+{
+    return XMVector4NormalizeEst(Q);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionNormalize
+(
+    FXMVECTOR Q
+)
+{
+    return XMVector4Normalize(Q);
+}
+
+//------------------------------------------------------------------------------
+
// Returns the conjugate of Q: the vector part <x,y,z> is negated and the
// scalar part w is preserved.
XMFINLINE XMVECTOR XMQuaternionConjugate
(
    FXMVECTOR Q
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result = {
        -Q.vector4_f32[0],
        -Q.vector4_f32[1],
        -Q.vector4_f32[2],
        Q.vector4_f32[3]
    };
    return Result;
#elif defined(_XM_SSE_INTRINSICS_)
    // Flip the signs of x, y, z with one multiply by <-1,-1,-1,1>.
    static const XMVECTORF32 NegativeOne3 = {-1.0f,-1.0f,-1.0f,1.0f};
    XMVECTOR Result = _mm_mul_ps(Q,NegativeOne3);
    return Result;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
// Returns the multiplicative inverse of Q: conjugate(Q) / |Q|^2.
// If |Q|^2 is at or below epsilon the inverse is ill-defined and the zero
// vector is returned instead.
XMFINLINE XMVECTOR XMQuaternionInverse
(
    FXMVECTOR Q
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR        Conjugate;
    XMVECTOR        L;
    XMVECTOR        Control;
    XMVECTOR        Result;
    CONST XMVECTOR  Zero = XMVectorZero();

    L = XMVector4LengthSq(Q);
    Conjugate = XMQuaternionConjugate(Q);

    // Remember (per lane) whether the squared length was too small.
    Control = XMVectorLessOrEqual(L, g_XMEpsilon.v);

    L = XMVectorReciprocal(L);
    Result = XMVectorMultiply(Conjugate, L);

    // Replace the (possibly non-finite) quotient with zero where degenerate.
    Result = XMVectorSelect(Result, Zero, Control);

    return Result;

#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR        Conjugate;
    XMVECTOR        L;
    XMVECTOR        Control;
    XMVECTOR        Result;
    XMVECTOR  Zero = XMVectorZero();

    L = XMVector4LengthSq(Q);
    Conjugate = XMQuaternionConjugate(Q);
    Control = XMVectorLessOrEqual(L, g_XMEpsilon);
    // True divide here instead of reciprocal-multiply for full precision.
    Result = _mm_div_ps(Conjugate,L);
    Result = XMVectorSelect(Result, Zero, Control);
    return Result;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
// Natural logarithm of a unit quaternion Q = <v*sin(theta), cos(theta)>:
// returns <v*theta, 0>, computed as Q0 * theta/sin(theta) with w zeroed.
// When |w| is not safely below 1 (theta ~= 0, sin(theta) ~= 0) the vector
// part is returned unscaled to avoid the near-zero division.
XMFINLINE XMVECTOR XMQuaternionLn
(
    FXMVECTOR Q
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Q0;
    XMVECTOR QW;
    XMVECTOR Theta;
    XMVECTOR SinTheta;
    XMVECTOR S;
    XMVECTOR ControlW;
    XMVECTOR Result;
    static CONST XMVECTOR OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f};

    QW = XMVectorSplatW(Q);
    // Q0 = Q with w forced to the mask value (zero in the w lane).
    Q0 = XMVectorSelect(g_XMSelect1110.v, Q, g_XMSelect1110.v);

    // ControlW is true where |w| < 1 - epsilon, i.e. acos is well-behaved.
    ControlW = XMVectorInBounds(QW, OneMinusEpsilon);

    Theta = XMVectorACos(QW);
    SinTheta = XMVectorSin(Theta);

    // S = theta / sin(theta)
    S = XMVectorReciprocal(SinTheta);
    S = XMVectorMultiply(Theta, S);

    Result = XMVectorMultiply(Q0, S);

    // Out of bounds -> return the unscaled vector part.
    Result = XMVectorSelect(Q0, Result, ControlW);

    return Result;

#elif defined(_XM_SSE_INTRINSICS_)
    static CONST XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f};
    static CONST XMVECTORF32 NegOneMinusEpsilon = {-(1.0f - 0.00001f), -(1.0f - 0.00001f),-(1.0f - 0.00001f),-(1.0f - 0.00001f)};
    // Get W only
    XMVECTOR QW = _mm_shuffle_ps(Q,Q,_MM_SHUFFLE(3,3,3,3));
    // W = 0
    XMVECTOR Q0 = _mm_and_ps(Q,g_XMMask3);
    // Use W if within bounds
    XMVECTOR ControlW = _mm_cmple_ps(QW,OneMinusEpsilon);
    XMVECTOR vTemp2 = _mm_cmpge_ps(QW,NegOneMinusEpsilon);
    ControlW = _mm_and_ps(ControlW,vTemp2);
    // Get theta
    XMVECTOR vTheta = XMVectorACos(QW);
    // Get Sine of theta
    vTemp2 = XMVectorSin(vTheta);
    // theta/sine of theta
    vTheta = _mm_div_ps(vTheta,vTemp2);
    // Here's the answer
    vTheta = _mm_mul_ps(vTheta,Q0);
    // Was W in bounds? If not, return input as is
    vTheta = XMVectorSelect(Q0,vTheta,ControlW);
    return vTheta;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
// Exponential of a pure quaternion Q = <v, 0>:
//   exp(Q) = <v/|v| * sin(|v|), cos(|v|)>
// computed as Q * sin(theta)/theta with w then replaced by cos(theta).
// When theta is near zero the sin(theta)/theta factor degenerates, so Q's
// vector part is used unscaled in that case.
XMFINLINE XMVECTOR XMQuaternionExp
(
    FXMVECTOR Q
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Theta;
    XMVECTOR SinTheta;
    XMVECTOR CosTheta;
    XMVECTOR S;
    XMVECTOR Control;
    XMVECTOR Zero;
    XMVECTOR Result;

    // theta = |v| (3-component length; w is assumed zero on input)
    Theta = XMVector3Length(Q);
    XMVectorSinCos(&SinTheta, &CosTheta, Theta);

    // S = sin(theta) / theta
    S = XMVectorReciprocal(Theta);
    S = XMVectorMultiply(SinTheta, S);

    Result = XMVectorMultiply(Q, S);

    // Near theta == 0, fall back to the unscaled input.
    Zero = XMVectorZero();
    Control = XMVectorNearEqual(Theta, Zero, g_XMEpsilon.v);
    Result = XMVectorSelect(Result, Q, Control);

    // Force the w lane to cos(theta).
    Result = XMVectorSelect(CosTheta, Result, g_XMSelect1110.v);

    return Result;

#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR Theta;
    XMVECTOR SinTheta;
    XMVECTOR CosTheta;
    XMVECTOR S;
    XMVECTOR Control;
    XMVECTOR Zero;
    XMVECTOR Result;
    Theta = XMVector3Length(Q);
    XMVectorSinCos(&SinTheta, &CosTheta, Theta);
    S = _mm_div_ps(SinTheta,Theta);
    Result = _mm_mul_ps(Q, S);
    Zero = XMVectorZero();
    Control = XMVectorNearEqual(Theta, Zero, g_XMEpsilon);
    Result = XMVectorSelect(Result,Q,Control);
    // Merge: xyz from Result, w from CosTheta.
    Result = _mm_and_ps(Result,g_XMMask3);
    CosTheta = _mm_and_ps(CosTheta,g_XMMaskW);
    Result = _mm_or_ps(Result,CosTheta);
    return Result;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMVECTOR XMQuaternionSlerp
+(
+    FXMVECTOR Q0,
+    FXMVECTOR Q1,
+    FLOAT    t
+)
+{
+    XMVECTOR T = XMVectorReplicate(t);
+    return XMQuaternionSlerpV(Q0, Q1, T);
+}
+
+//------------------------------------------------------------------------------
+
// Spherical linear interpolation between quaternions Q0 and Q1 by the
// replicated scalar T:
//   Result = Q0*sin((1-t)*Omega)/sin(Omega) + Q1*sin(t*Omega)/sin(Omega)
// The shorter arc is always taken (Q1's weight is sign-flipped when the
// dot product is negative), and when the inputs are nearly parallel
// (cos(Omega) >= 1 - epsilon) the sin-based weights degenerate and plain
// linear weights (1-t, t) are used instead.
XMINLINE XMVECTOR XMQuaternionSlerpV
(
    FXMVECTOR Q0,
    FXMVECTOR Q1,
    FXMVECTOR T
)
{
#if defined(_XM_NO_INTRINSICS_)

    // Result = Q0 * sin((1.0 - t) * Omega) / sin(Omega) + Q1 * sin(t * Omega) / sin(Omega)
    XMVECTOR Omega;
    XMVECTOR CosOmega;
    XMVECTOR SinOmega;
    XMVECTOR InvSinOmega;
    XMVECTOR V01;
    XMVECTOR C1000;
    XMVECTOR SignMask;
    XMVECTOR S0;
    XMVECTOR S1;
    XMVECTOR Sign;
    XMVECTOR Control;
    XMVECTOR Result;
    XMVECTOR Zero;
    CONST XMVECTOR OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f};

    // T must be a replicated scalar (all four lanes equal).
    XMASSERT((T.vector4_f32[1] == T.vector4_f32[0]) && (T.vector4_f32[2] == T.vector4_f32[0]) && (T.vector4_f32[3] == T.vector4_f32[0]));

    CosOmega = XMQuaternionDot(Q0, Q1);

    // Take the shorter arc: negate Q1's contribution when the dot < 0.
    Zero = XMVectorZero();
    Control = XMVectorLess(CosOmega, Zero);
    Sign = XMVectorSelect(g_XMOne.v, g_XMNegativeOne.v, Control);

    CosOmega = XMVectorMultiply(CosOmega, Sign);

    // Control is true where the slerp weights are numerically safe.
    Control = XMVectorLess(CosOmega, OneMinusEpsilon);

    SinOmega = XMVectorNegativeMultiplySubtract(CosOmega, CosOmega, g_XMOne.v);
    SinOmega = XMVectorSqrt(SinOmega);

    Omega = XMVectorATan2(SinOmega, CosOmega);

    // Build V01 = <1 - t, t, 0, 0> with bit tricks (shift + sign flip).
    SignMask = XMVectorSplatSignMask();
    C1000 = XMVectorSetBinaryConstant(1, 0, 0, 0);
    V01 = XMVectorShiftLeft(T, Zero, 2);
    SignMask = XMVectorShiftLeft(SignMask, Zero, 3);
    V01 = XMVectorXorInt(V01, SignMask);
    V01 = XMVectorAdd(C1000, V01);

    InvSinOmega = XMVectorReciprocal(SinOmega);

    // S0 = <sin((1-t)*Omega), sin(t*Omega), ...> / sin(Omega)
    S0 = XMVectorMultiply(V01, Omega);
    S0 = XMVectorSin(S0);
    S0 = XMVectorMultiply(S0, InvSinOmega);

    // Fall back to linear weights where the quaternions are nearly parallel.
    S0 = XMVectorSelect(V01, S0, Control);

    S1 = XMVectorSplatY(S0);
    S0 = XMVectorSplatX(S0);

    S1 = XMVectorMultiply(S1, Sign);

    Result = XMVectorMultiply(Q0, S0);
    Result = XMVectorMultiplyAdd(Q1, S1, Result);

    return Result;

#elif defined(_XM_SSE_INTRINSICS_)
    // Result = Q0 * sin((1.0 - t) * Omega) / sin(Omega) + Q1 * sin(t * Omega) / sin(Omega)
    XMVECTOR Omega;
    XMVECTOR CosOmega;
    XMVECTOR SinOmega;
    XMVECTOR V01;
    XMVECTOR S0;
    XMVECTOR S1;
    XMVECTOR Sign;
    XMVECTOR Control;
    XMVECTOR Result;
    XMVECTOR Zero;
    static const XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f};
    static const XMVECTORI32 SignMask2 = {0x80000000,0x00000000,0x00000000,0x00000000};
    static const XMVECTORI32 MaskXY = {0xFFFFFFFF,0xFFFFFFFF,0x00000000,0x00000000};

    // T must be a replicated scalar (all four lanes equal).
    XMASSERT((XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T)));

    CosOmega = XMQuaternionDot(Q0, Q1);

    // Take the shorter arc: negate Q1's contribution when the dot < 0.
    Zero = XMVectorZero();
    Control = XMVectorLess(CosOmega, Zero);
    Sign = XMVectorSelect(g_XMOne, g_XMNegativeOne, Control);

    CosOmega = _mm_mul_ps(CosOmega, Sign);

    Control = XMVectorLess(CosOmega, OneMinusEpsilon);

    // sin(Omega) = sqrt(1 - cos^2(Omega))
    SinOmega = _mm_mul_ps(CosOmega,CosOmega);
    SinOmega = _mm_sub_ps(g_XMOne,SinOmega);
    SinOmega = _mm_sqrt_ps(SinOmega);

    Omega = XMVectorATan2(SinOmega, CosOmega);

    // V01 = <1 - t, t, 0, 0>
    V01 = _mm_shuffle_ps(T,T,_MM_SHUFFLE(2,3,0,1));
    V01 = _mm_and_ps(V01,MaskXY);
    V01 = _mm_xor_ps(V01,SignMask2);
    V01 = _mm_add_ps(g_XMIdentityR0, V01);

    S0 = _mm_mul_ps(V01, Omega);
    S0 = XMVectorSin(S0);
    S0 = _mm_div_ps(S0, SinOmega);

    // Fall back to linear weights where the quaternions are nearly parallel.
    S0 = XMVectorSelect(V01, S0, Control);

    S1 = XMVectorSplatY(S0);
    S0 = XMVectorSplatX(S0);

    S1 = _mm_mul_ps(S1, Sign);
    Result = _mm_mul_ps(Q0, S0);
    S1 = _mm_mul_ps(S1, Q1);
    Result = _mm_add_ps(Result,S1);
    return Result;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionSquad
+(
+    FXMVECTOR Q0,
+    FXMVECTOR Q1,
+    FXMVECTOR Q2,
+    CXMVECTOR Q3,
+    FLOAT    t
+)
+{
+    XMVECTOR T = XMVectorReplicate(t);
+    return XMQuaternionSquadV(Q0, Q1, Q2, Q3, T);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionSquadV
+(
+    FXMVECTOR Q0,
+    FXMVECTOR Q1,
+    FXMVECTOR Q2,
+    CXMVECTOR Q3,
+    CXMVECTOR T
+)
+{
+    XMVECTOR Q03;
+    XMVECTOR Q12;
+    XMVECTOR TP;
+    XMVECTOR Two;
+    XMVECTOR Result;
+
+    XMASSERT( (XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T)) );
+
+    TP = T;
+    Two = XMVectorSplatConstant(2, 0);
+
+    Q03 = XMQuaternionSlerpV(Q0, Q3, T);
+    Q12 = XMQuaternionSlerpV(Q1, Q2, T);
+
+    TP = XMVectorNegativeMultiplySubtract(TP, TP, TP);
+    TP = XMVectorMultiply(TP, Two);
+
+    Result = XMQuaternionSlerpV(Q03, Q12, TP);
+
+    return Result;
+
+}
+
+//------------------------------------------------------------------------------
+
// Computes the inner control points *pA, *pB and the adjusted third key *pC
// for a subsequent XMQuaternionSquad over the keys Q0..Q3. Because Q and -Q
// encode the same rotation, each neighbor is first flipped (if needed) onto
// the same hemisphere as its adjacent key -- a key is negated when the
// squared length of the sum is smaller than that of the difference, i.e.
// when the dot product is negative. The control points are then
//   A = Q1 * exp(-(ln(Q1^-1 * SQ0) + ln(Q1^-1 * SQ2)) / 4)
//   B = SQ2 * exp(-(ln(SQ2^-1 * Q1) + ln(SQ2^-1 * SQ3)) / 4)
// and C is the hemisphere-corrected Q2.
XMINLINE VOID XMQuaternionSquadSetup
(
    XMVECTOR* pA,
    XMVECTOR* pB,
    XMVECTOR* pC,
    FXMVECTOR  Q0,
    FXMVECTOR  Q1,
    FXMVECTOR  Q2,
    CXMVECTOR  Q3
)
{
    XMVECTOR SQ0, SQ2, SQ3;
    XMVECTOR InvQ1, InvQ2;
    XMVECTOR LnQ0, LnQ1, LnQ2, LnQ3;
    XMVECTOR ExpQ02, ExpQ13;
    XMVECTOR LS01, LS12, LS23;
    XMVECTOR LD01, LD12, LD23;
    XMVECTOR Control0, Control1, Control2;
    XMVECTOR NegativeOneQuarter;

    XMASSERT(pA);
    XMASSERT(pB);
    XMASSERT(pC);

    // Flip Q2 onto Q1's hemisphere first; SQ2 feeds the Q3 comparison below.
    LS12 = XMQuaternionLengthSq(XMVectorAdd(Q1, Q2));
    LD12 = XMQuaternionLengthSq(XMVectorSubtract(Q1, Q2));
    SQ2 = XMVectorNegate(Q2);

    Control1 = XMVectorLess(LS12, LD12);
    SQ2 = XMVectorSelect(Q2, SQ2, Control1);

    LS01 = XMQuaternionLengthSq(XMVectorAdd(Q0, Q1));
    LD01 = XMQuaternionLengthSq(XMVectorSubtract(Q0, Q1));
    SQ0 = XMVectorNegate(Q0);

    LS23 = XMQuaternionLengthSq(XMVectorAdd(SQ2, Q3));
    LD23 = XMQuaternionLengthSq(XMVectorSubtract(SQ2, Q3));
    SQ3 = XMVectorNegate(Q3);

    Control0 = XMVectorLess(LS01, LD01);
    Control2 = XMVectorLess(LS23, LD23);

    SQ0 = XMVectorSelect(Q0, SQ0, Control0);
    SQ3 = XMVectorSelect(Q3, SQ3, Control2);

    InvQ1 = XMQuaternionInverse(Q1);
    InvQ2 = XMQuaternionInverse(SQ2);

    LnQ0 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ0));
    LnQ2 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ2));
    LnQ1 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, Q1));
    LnQ3 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, SQ3));

    // -1/4 as a splatted constant (value -1 scaled by 2^-2).
    NegativeOneQuarter = XMVectorSplatConstant(-1, 2);

    ExpQ02 = XMVectorMultiply(XMVectorAdd(LnQ0, LnQ2), NegativeOneQuarter);
    ExpQ13 = XMVectorMultiply(XMVectorAdd(LnQ1, LnQ3), NegativeOneQuarter);
    ExpQ02 = XMQuaternionExp(ExpQ02);
    ExpQ13 = XMQuaternionExp(ExpQ13);

    *pA = XMQuaternionMultiply(Q1, ExpQ02);
    *pB = XMQuaternionMultiply(SQ2, ExpQ13);
    *pC = SQ2;
}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionBaryCentric
+(
+    FXMVECTOR Q0,
+    FXMVECTOR Q1,
+    FXMVECTOR Q2,
+    FLOAT    f,
+    FLOAT    g
+)
+{
+    XMVECTOR Q01;
+    XMVECTOR Q02;
+    FLOAT    s;
+    XMVECTOR Result;
+
+    s = f + g;
+
+    if ((s < 0.00001f) && (s > -0.00001f))
+    {
+        Result = Q0;
+    }
+    else
+    {
+        Q01 = XMQuaternionSlerp(Q0, Q1, s);
+        Q02 = XMQuaternionSlerp(Q0, Q2, s);
+
+        Result = XMQuaternionSlerp(Q01, Q02, g / s);
+    }
+
+    return Result;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionBaryCentricV
+(
+    FXMVECTOR Q0,
+    FXMVECTOR Q1,
+    FXMVECTOR Q2,
+    CXMVECTOR F,
+    CXMVECTOR G
+)
+{
+    XMVECTOR Q01;
+    XMVECTOR Q02;
+    XMVECTOR S, GS;
+    XMVECTOR Epsilon;
+    XMVECTOR Result;
+
+    XMASSERT( (XMVectorGetY(F) == XMVectorGetX(F)) && (XMVectorGetZ(F) == XMVectorGetX(F)) && (XMVectorGetW(F) == XMVectorGetX(F)) );
+    XMASSERT( (XMVectorGetY(G) == XMVectorGetX(G)) && (XMVectorGetZ(G) == XMVectorGetX(G)) && (XMVectorGetW(G) == XMVectorGetX(G)) );
+
+    Epsilon = XMVectorSplatConstant(1, 16);
+
+    S = XMVectorAdd(F, G);
+
+    if (XMVector4InBounds(S, Epsilon))
+    {
+        Result = Q0;
+    }
+    else
+    {
+        Q01 = XMQuaternionSlerpV(Q0, Q1, S);
+        Q02 = XMQuaternionSlerpV(Q0, Q2, S);
+        GS = XMVectorReciprocal(S);
+        GS = XMVectorMultiply(G, GS);
+
+        Result = XMQuaternionSlerpV(Q01, Q02, GS);
+    }
+
+    return Result;
+}
+
+//------------------------------------------------------------------------------
+// Transformation operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
// Returns the identity quaternion <0, 0, 0, 1> (no rotation); this is row 3
// of the identity matrix.
XMFINLINE XMVECTOR XMQuaternionIdentity()
{
#if defined(_XM_NO_INTRINSICS_)
    return g_XMIdentityR3.v;
#elif defined(_XM_SSE_INTRINSICS_)
    return g_XMIdentityR3;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionRotationRollPitchYaw
+(
+    FLOAT Pitch,
+    FLOAT Yaw,
+    FLOAT Roll
+)
+{
+    XMVECTOR Angles;
+    XMVECTOR Q;
+
+    Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f);
+    Q = XMQuaternionRotationRollPitchYawFromVector(Angles);
+
+    return Q;
+}
+
+//------------------------------------------------------------------------------
+
// Builds a rotation quaternion from Euler angles packed as
// <Pitch, Yaw, Roll, 0> (radians). The result is assembled from the sines
// and cosines of the half-angles, combining the per-axis half-angle terms
// with permutes and one sign vector instead of three explicit quaternion
// multiplies.
XMFINLINE XMVECTOR XMQuaternionRotationRollPitchYawFromVector
(
    FXMVECTOR Angles // <Pitch, Yaw, Roll, 0>
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR                Q, Q0, Q1;
    XMVECTOR                P0, P1, Y0, Y1, R0, R1;
    XMVECTOR                HalfAngles;
    XMVECTOR                SinAngles, CosAngles;
    // Each control picks the one "sin" (resp. "cos") lane belonging to its
    // axis and fills the rest from the other source vector.
    static CONST XMVECTORU32 ControlPitch = {XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X};
    static CONST XMVECTORU32 ControlYaw = {XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y};
    static CONST XMVECTORU32 ControlRoll = {XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z};
    static CONST XMVECTOR   Sign = {1.0f, -1.0f, -1.0f, 1.0f};

    HalfAngles = XMVectorMultiply(Angles, g_XMOneHalf.v);
    XMVectorSinCos(&SinAngles, &CosAngles, HalfAngles);

    P0 = XMVectorPermute(SinAngles, CosAngles, ControlPitch.v);
    Y0 = XMVectorPermute(SinAngles, CosAngles, ControlYaw.v);
    R0 = XMVectorPermute(SinAngles, CosAngles, ControlRoll.v);
    P1 = XMVectorPermute(CosAngles, SinAngles, ControlPitch.v);
    Y1 = XMVectorPermute(CosAngles, SinAngles, ControlYaw.v);
    R1 = XMVectorPermute(CosAngles, SinAngles, ControlRoll.v);

    // Q = P0*Y0*R0 + Sign * P1*Y1*R1
    Q1 = XMVectorMultiply(P1, Sign);
    Q0 = XMVectorMultiply(P0, Y0);
    Q1 = XMVectorMultiply(Q1, Y1);
    Q0 = XMVectorMultiply(Q0, R0);
    Q = XMVectorMultiplyAdd(Q1, R1, Q0);

    return Q;

#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR                Q, Q0, Q1;
    XMVECTOR                P0, P1, Y0, Y1, R0, R1;
    XMVECTOR                HalfAngles;
    XMVECTOR                SinAngles, CosAngles;
    static CONST XMVECTORI32 ControlPitch = {XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X};
    static CONST XMVECTORI32 ControlYaw = {XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y};
    static CONST XMVECTORI32 ControlRoll = {XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z};
    static CONST XMVECTORF32 Sign = {1.0f, -1.0f, -1.0f, 1.0f};

    HalfAngles = _mm_mul_ps(Angles, g_XMOneHalf);
    XMVectorSinCos(&SinAngles, &CosAngles, HalfAngles);

    P0 = XMVectorPermute(SinAngles, CosAngles, ControlPitch);
    Y0 = XMVectorPermute(SinAngles, CosAngles, ControlYaw);
    R0 = XMVectorPermute(SinAngles, CosAngles, ControlRoll);
    P1 = XMVectorPermute(CosAngles, SinAngles, ControlPitch);
    Y1 = XMVectorPermute(CosAngles, SinAngles, ControlYaw);
    R1 = XMVectorPermute(CosAngles, SinAngles, ControlRoll);

    // Q = P0*Y0*R0 + Sign * P1*Y1*R1
    Q1 = _mm_mul_ps(P1, Sign);
    Q0 = _mm_mul_ps(P0, Y0);
    Q1 = _mm_mul_ps(Q1, Y1);
    Q0 = _mm_mul_ps(Q0, R0);
    Q = _mm_mul_ps(Q1, R1);
    Q = _mm_add_ps(Q,Q0);
    return Q;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
// Builds a rotation quaternion about NormalAxis (assumed already normalized;
// not checked here) by Angle radians:
//   Q = <axis * sin(Angle/2), cos(Angle/2)>
XMFINLINE XMVECTOR XMQuaternionRotationNormal
(
    FXMVECTOR NormalAxis,
    FLOAT    Angle
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Q;
    XMVECTOR N;
    XMVECTOR Scale;

    // N = <axis.x, axis.y, axis.z, 1>
    N = XMVectorSelect(g_XMOne.v, NormalAxis, g_XMSelect1110.v);

    // Scale.z = sin(Angle/2), Scale.w = cos(Angle/2)
    XMScalarSinCos(&Scale.vector4_f32[2], &Scale.vector4_f32[3], 0.5f * Angle);

    // Broadcast the sine into x and y so N*Scale = <axis*sin, cos>.
    Scale.vector4_f32[0] = Scale.vector4_f32[1] = Scale.vector4_f32[2];

    Q = XMVectorMultiply(N, Scale);

    return Q;

#elif defined(_XM_SSE_INTRINSICS_)
    // N = <axis.x, axis.y, axis.z, 1>
    XMVECTOR N = _mm_and_ps(NormalAxis,g_XMMask3);
    N = _mm_or_ps(N,g_XMIdentityR3);
    XMVECTOR Scale = _mm_set_ps1(0.5f * Angle);
    XMVECTOR vSine;
    XMVECTOR vCosine;
    XMVectorSinCos(&vSine,&vCosine,Scale);
    // Scale = <sin, sin, sin, cos>
    Scale = _mm_and_ps(vSine,g_XMMask3);
    vCosine = _mm_and_ps(vCosine,g_XMMaskW);
    Scale = _mm_or_ps(Scale,vCosine);
    N = _mm_mul_ps(N,Scale);
    return N;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
// Builds a rotation quaternion about an arbitrary (non-normalized) Axis by
// Angle radians. Axis must be nonzero and finite; it is normalized and then
// handed to XMQuaternionRotationNormal.
XMFINLINE XMVECTOR XMQuaternionRotationAxis
(
    FXMVECTOR Axis,
    FLOAT    Angle
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Normal;
    XMVECTOR Q;

    XMASSERT(!XMVector3Equal(Axis, XMVectorZero()));
    XMASSERT(!XMVector3IsInfinite(Axis));

    Normal = XMVector3Normalize(Axis);
    Q = XMQuaternionRotationNormal(Normal, Angle);

    return Q;

#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR Normal;
    XMVECTOR Q;

    XMASSERT(!XMVector3Equal(Axis, XMVectorZero()));
    XMASSERT(!XMVector3IsInfinite(Axis));

    Normal = XMVector3Normalize(Axis);
    Q = XMQuaternionRotationNormal(Normal, Angle);
    return Q;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
// Builds a rotation quaternion from the upper 3x3 of rotation matrix M.
// This is a branch-free formulation of the usual matrix-to-quaternion case
// analysis: the four candidate magnitudes are derived from signed sums of
// the diagonal (M00, M11, M22), the numerically largest component is chosen
// with a cascade of vector selects, and the remaining components come from
// signed combinations of the off-diagonal terms.
XMINLINE XMVECTOR XMQuaternionRotationMatrix
(
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)

    XMVECTOR Q0, Q1, Q2;
    XMVECTOR M00, M11, M22;
    XMVECTOR CQ0, CQ1, C;
    XMVECTOR CX, CY, CZ, CW;
    XMVECTOR SQ1, Scale;
    XMVECTOR Rsq, Sqrt, VEqualsNaN;
    XMVECTOR A, B, P;
    XMVECTOR PermuteSplat, PermuteSplatT;
    XMVECTOR SignB, SignBT;
    XMVECTOR PermuteControl, PermuteControlT;
    XMVECTOR Result;
    static CONST XMVECTORF32 OneQuarter = {0.25f, 0.25f, 0.25f, 0.25f};
    static CONST XMVECTORF32 SignPNNP = {1.0f, -1.0f, -1.0f, 1.0f};
    static CONST XMVECTORF32 SignNPNP = {-1.0f, 1.0f, -1.0f, 1.0f};
    static CONST XMVECTORF32 SignNNPP = {-1.0f, -1.0f, 1.0f, 1.0f};
    static CONST XMVECTORF32 SignPNPP = {1.0f, -1.0f, 1.0f, 1.0f};
    static CONST XMVECTORF32 SignPPNP = {1.0f, 1.0f, -1.0f, 1.0f};
    static CONST XMVECTORF32 SignNPPP = {-1.0f, 1.0f, 1.0f, 1.0f};
    static CONST XMVECTORU32 Permute0X0X0Y0W = {XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0W};
    static CONST XMVECTORU32 Permute0Y0Z0Z1W = {XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_1W};
    static CONST XMVECTORU32 SplatX = {XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X};
    static CONST XMVECTORU32 SplatY = {XM_PERMUTE_0Y, XM_PERMUTE_0Y, XM_PERMUTE_0Y, XM_PERMUTE_0Y};
    static CONST XMVECTORU32 SplatZ = {XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Z};
    static CONST XMVECTORU32 SplatW = {XM_PERMUTE_0W, XM_PERMUTE_0W, XM_PERMUTE_0W, XM_PERMUTE_0W};
    static CONST XMVECTORU32 PermuteC = {XM_PERMUTE_0X, XM_PERMUTE_0Z, XM_PERMUTE_1X, XM_PERMUTE_1Y};
    static CONST XMVECTORU32 PermuteA = {XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Z, XM_PERMUTE_0W};
    static CONST XMVECTORU32 PermuteB = {XM_PERMUTE_1X, XM_PERMUTE_1W, XM_PERMUTE_0Z, XM_PERMUTE_0W};
    static CONST XMVECTORU32 Permute0 = {XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1Z, XM_PERMUTE_1Y};
    static CONST XMVECTORU32 Permute1 = {XM_PERMUTE_1X, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Z};
    static CONST XMVECTORU32 Permute2 = {XM_PERMUTE_1Z, XM_PERMUTE_1Y, XM_PERMUTE_0Z, XM_PERMUTE_1X};
    static CONST XMVECTORU32 Permute3 = {XM_PERMUTE_1Y, XM_PERMUTE_1Z, XM_PERMUTE_1X, XM_PERMUTE_0W};

    M00 = XMVectorSplatX(M.r[0]);
    M11 = XMVectorSplatY(M.r[1]);
    M22 = XMVectorSplatZ(M.r[2]);

    // Q0 = 4 * <x^2, y^2, z^2, w^2> - 1, from the signed diagonal sums.
    Q0 = XMVectorMultiply(SignPNNP.v, M00);
    Q0 = XMVectorMultiplyAdd(SignNPNP.v, M11, Q0);
    Q0 = XMVectorMultiplyAdd(SignNNPP.v, M22, Q0);

    // Q1 = 4 * <x^2, y^2, z^2, w^2>
    Q1 = XMVectorAdd(Q0, g_XMOne.v);

    // sqrt(Q1) via Q1 * rsqrt(Q1), keeping Q1 where rsqrt produced NaN
    // (i.e. where the component is zero).
    Rsq = XMVectorReciprocalSqrt(Q1);
    VEqualsNaN = XMVectorIsNaN(Rsq);
    Sqrt = XMVectorMultiply(Q1, Rsq);
    Q1 = XMVectorSelect(Sqrt, Q1, VEqualsNaN);

    // Q1 = candidate magnitudes <|x|, |y|, |z|, |w|>
    Q1 = XMVectorMultiply(Q1, g_XMOneHalf.v);

    // SQ1 = the 1/(4*candidate) scale factors.
    SQ1 = XMVectorMultiply(Rsq, g_XMOneHalf.v);

    // C = per-case comparisons that drive the select cascade below.
    CQ0 = XMVectorPermute(Q0, Q0, Permute0X0X0Y0W.v);
    CQ1 = XMVectorPermute(Q0, g_XMEpsilon.v, Permute0Y0Z0Z1W.v);
    C = XMVectorGreaterOrEqual(CQ0, CQ1);

    CX = XMVectorSplatX(C);
    CY = XMVectorSplatY(C);
    CZ = XMVectorSplatZ(C);
    CW = XMVectorSplatW(C);

    // Cascade of selects: pick the splat lane, sign vector and output
    // permute that correspond to the largest quaternion component.
    PermuteSplat = XMVectorSelect(SplatZ.v, SplatY.v, CZ);
    SignB = XMVectorSelect(SignNPPP.v, SignPPNP.v, CZ);
    PermuteControl = XMVectorSelect(Permute2.v, Permute1.v, CZ);

    PermuteSplat = XMVectorSelect(PermuteSplat, SplatZ.v, CX);
    SignB = XMVectorSelect(SignB, SignNPPP.v, CX);
    PermuteControl = XMVectorSelect(PermuteControl, Permute2.v, CX);

    PermuteSplatT = XMVectorSelect(PermuteSplat,SplatX.v, CY);
    SignBT = XMVectorSelect(SignB, SignPNPP.v, CY);
    PermuteControlT = XMVectorSelect(PermuteControl,Permute0.v, CY);

    PermuteSplat = XMVectorSelect(PermuteSplat, PermuteSplatT, CX);
    SignB = XMVectorSelect(SignB, SignBT, CX);
    PermuteControl = XMVectorSelect(PermuteControl, PermuteControlT, CX);

    PermuteSplat = XMVectorSelect(PermuteSplat,SplatW.v, CW);
    SignB = XMVectorSelect(SignB, g_XMNegativeOne.v, CW);
    PermuteControl = XMVectorSelect(PermuteControl,Permute3.v, CW);

    Scale = XMVectorPermute(SQ1, SQ1, PermuteSplat);

    P = XMVectorPermute(M.r[1], M.r[2],PermuteC.v);  // {M10, M12, M20, M21}
    A = XMVectorPermute(M.r[0], P, PermuteA.v);       // {M01, M12, M20, M03}
    B = XMVectorPermute(M.r[0], P, PermuteB.v);       // {M10, M21, M02, M03}

    // Off-diagonal sums/differences scaled by the chosen component.
    Q2 = XMVectorMultiplyAdd(SignB, B, A);
    Q2 = XMVectorMultiply(Q2, Scale);

    // Merge the directly-computed component (from Q1) with the derived ones.
    Result = XMVectorPermute(Q1, Q2, PermuteControl);

    return Result;

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+// Conversion operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
// Decomposes quaternion Q into an axis and an angle. The axis written to
// *pAxis is the raw quaternion (its xyz is the unnormalized rotation axis;
// callers normalize if they need a unit axis); *pAngle = 2*acos(w) radians.
XMFINLINE VOID XMQuaternionToAxisAngle
(
    XMVECTOR* pAxis,
    FLOAT*    pAngle,
    FXMVECTOR  Q
)
{
    XMASSERT(pAxis);
    XMASSERT(pAngle);

    *pAxis = Q;

#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    // CRT acosf on SSE builds; the scalar helper elsewhere.
    *pAngle = 2.0f * acosf(XMVectorGetW(Q));
#else
    *pAngle = 2.0f * XMScalarACos(XMVectorGetW(Q));
#endif
}
+
+/****************************************************************************
+ *
+ * Plane
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+// Comparison operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMPlaneEqual
+(
+    FXMVECTOR P1,
+    FXMVECTOR P2
+)
+{
+    return XMVector4Equal(P1, P2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMPlaneNearEqual
+(
+    FXMVECTOR P1,
+    FXMVECTOR P2,
+    FXMVECTOR Epsilon
+)
+{
+    XMVECTOR NP1 = XMPlaneNormalize(P1);
+    XMVECTOR NP2 = XMPlaneNormalize(P2);
+    return XMVector4NearEqual(NP1, NP2, Epsilon);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMPlaneNotEqual
+(
+    FXMVECTOR P1,
+    FXMVECTOR P2
+)
+{
+    return XMVector4NotEqual(P1, P2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMPlaneIsNaN
+(
+    FXMVECTOR P
+)
+{
+    return XMVector4IsNaN(P);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMPlaneIsInfinite
+(
+    FXMVECTOR P
+)
+{
+    return XMVector4IsInfinite(P);
+}
+
+//------------------------------------------------------------------------------
+// Computation operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
// Computes the 4D dot product of the plane P = (a, b, c, d) with the
// 4-vector V, replicated into all four components of the result.
XMFINLINE XMVECTOR XMPlaneDot
(
    FXMVECTOR P,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    return XMVector4Dot(P, V);

#elif defined(_XM_SSE_INTRINSICS_)
    // Horizontal add of P*V built from shuffles only (no SSE3 haddps),
    // so this runs on baseline SSE/SSE2 hardware.
    __m128 vTemp2 = V;
    __m128 vTemp = _mm_mul_ps(P,vTemp2);
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp,_MM_SHUFFLE(1,0,0,0)); // Copy X to the Z position and Y to the W position
    vTemp2 = _mm_add_ps(vTemp2,vTemp);          // Add Z = X+Z; W = Y+W;
    vTemp = _mm_shuffle_ps(vTemp,vTemp2,_MM_SHUFFLE(0,3,0,0));  // Copy W to the Z position
    vTemp = _mm_add_ps(vTemp,vTemp2);           // Add Z and W together
    return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(2,2,2,2));    // Splat Z and return
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
// Computes the dot product of the plane P = (a, b, c, d) with the 3D point V,
// treating V's w as 1: a*Vx + b*Vy + c*Vz + d. The scalar result is
// replicated into all four components. The sign indicates which side of the
// plane the point lies on (for a normalized plane, the magnitude is the
// distance to the plane).
XMFINLINE XMVECTOR XMPlaneDotCoord
(
    FXMVECTOR P,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V3;
    XMVECTOR Result;

    // Result = P[0] * V[0] + P[1] * V[1] + P[2] * V[2] + P[3]
    V3 = XMVectorSelect(g_XMOne.v, V, g_XMSelect1110.v);
    Result = XMVector4Dot(P, V3);

    return Result;

#elif defined(_XM_SSE_INTRINSICS_)
    // Force V.w to exactly 1.0f (mask off w, OR in the w=1 identity row),
    // then perform the same shuffle-based 4D dot product as XMPlaneDot.
    XMVECTOR vTemp2 = _mm_and_ps(V,g_XMMask3);
    vTemp2 = _mm_or_ps(vTemp2,g_XMIdentityR3);
    XMVECTOR vTemp = _mm_mul_ps(P,vTemp2);
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp,_MM_SHUFFLE(1,0,0,0)); // Copy X to the Z position and Y to the W position
    vTemp2 = _mm_add_ps(vTemp2,vTemp);          // Add Z = X+Z; W = Y+W;
    vTemp = _mm_shuffle_ps(vTemp,vTemp2,_MM_SHUFFLE(0,3,0,0));  // Copy W to the Z position
    vTemp = _mm_add_ps(vTemp,vTemp2);           // Add Z and W together
    return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(2,2,2,2));    // Splat Z and return
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMPlaneDotNormal
+(
+    FXMVECTOR P,
+    FXMVECTOR V
+)
+{
+    return XMVector3Dot(P, V);
+}
+
+//------------------------------------------------------------------------------
+// XMPlaneNormalizeEst uses a reciprocal estimate and
+// returns QNaN on zero and infinite vectors.
+
// Normalizes the plane so its (a, b, c) normal has unit length, scaling all
// four components (including d) by 1/|(a,b,c)|. "Est" = estimated: the SSE
// path uses the hardware reciprocal-sqrt approximation (~12 bits of
// precision) instead of a true sqrt/divide.
XMFINLINE XMVECTOR XMPlaneNormalizeEst
(
    FXMVECTOR P
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result = XMVector3ReciprocalLength(P);
    Result = XMVectorMultiply(P, Result);
    return Result;

#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product (x,y,z only)
    XMVECTOR vDot = _mm_mul_ps(P,P);
    // x=Dot.y, y=Dot.z
    XMVECTOR vTemp = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(2,1,2,1));
    // Result.x = x+y
    vDot = _mm_add_ss(vDot,vTemp);
    // x=Dot.z
    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1));
    // Result.x = (x+y)+z
    vDot = _mm_add_ss(vDot,vTemp);
    // Splat x
    vDot = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(0,0,0,0));
    // Get the estimated reciprocal square root of the length
    vDot = _mm_rsqrt_ps(vDot);
    // Scale the whole plane (a, b, c and d) by it
    vDot = _mm_mul_ps(vDot,P);
    return vDot;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
// Normalizes the plane so its (a, b, c) normal has unit length, scaling all
// four components (including d) by 1/|(a,b,c)|.
// NOTE(review): the two paths differ on degenerate input -- the scalar path
// returns an all-zero plane for zero-length normals, while the SSE path
// zeroes only infinite-length planes and lets a zero length divide produce
// NaN/Inf. Confirm which behavior callers rely on before changing either.
XMFINLINE XMVECTOR XMPlaneNormalize
(
    FXMVECTOR P
)
{
#if defined(_XM_NO_INTRINSICS_)
    // Despite the name, fLengthSq holds the LENGTH of (a, b, c), not its
    // square -- the sqrtf is applied immediately.
    FLOAT fLengthSq = sqrtf((P.vector4_f32[0]*P.vector4_f32[0])+(P.vector4_f32[1]*P.vector4_f32[1])+(P.vector4_f32[2]*P.vector4_f32[2]));
    // Prevent divide by zero
    if (fLengthSq) {
        fLengthSq = 1.0f/fLengthSq;
    }
    {
    XMVECTOR vResult = {
        P.vector4_f32[0]*fLengthSq,
        P.vector4_f32[1]*fLengthSq,
        P.vector4_f32[2]*fLengthSq,
        P.vector4_f32[3]*fLengthSq
    };
    return vResult;
    }
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x,y and z only
    XMVECTOR vLengthSq = _mm_mul_ps(P,P);
    XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(2,1,2,1));
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1));
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0));
    // Prepare for the division
    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
    // Failsafe on zero (Or epsilon) length planes
    // If the length is infinity, set the elements to zero
    vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
    // Reciprocal mul to perform the normalization
    vResult = _mm_div_ps(P,vResult);
    // Any that are infinity, set to zero
    vResult = _mm_and_ps(vResult,vLengthSq);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
// Computes the point where the infinite line through LinePoint1 and
// LinePoint2 intersects the plane P. If the line is parallel to the plane
// (the projection of the line direction onto the plane normal is within
// g_XMEpsilon of zero), QNaN is returned in every component.
XMFINLINE XMVECTOR XMPlaneIntersectLine
(
    FXMVECTOR P,
    FXMVECTOR LinePoint1,
    FXMVECTOR LinePoint2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V1;
    XMVECTOR V2;
    XMVECTOR D;
    XMVECTOR ReciprocalD;
    XMVECTOR VT;
    XMVECTOR Point;
    XMVECTOR Zero;
    XMVECTOR Control;
    XMVECTOR Result;

    // D = dot(normal, LinePoint1) - dot(normal, LinePoint2)
    //   = -dot(normal, line direction)
    V1 = XMVector3Dot(P, LinePoint1);
    V2 = XMVector3Dot(P, LinePoint2);
    D = XMVectorSubtract(V1, V2);

    // Parametric distance along the line: t = planedot(LinePoint1) / D
    ReciprocalD = XMVectorReciprocal(D);
    VT = XMPlaneDotCoord(P, LinePoint1);
    VT = XMVectorMultiply(VT, ReciprocalD);

    // Point = LinePoint1 + t * (LinePoint2 - LinePoint1)
    Point = XMVectorSubtract(LinePoint2, LinePoint1);
    Point = XMVectorMultiplyAdd(Point, VT, LinePoint1);

    // Parallel line (D ~ 0): replace the result with QNaN.
    Zero = XMVectorZero();
    Control = XMVectorNearEqual(D, Zero, g_XMEpsilon.v);

    Result = XMVectorSelect(Point, g_XMQNaN.v, Control);

    return Result;

#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR V1;
    XMVECTOR V2;
    XMVECTOR D;
    XMVECTOR VT;
    XMVECTOR Point;
    XMVECTOR Zero;
    XMVECTOR Control;
    XMVECTOR Result;

    V1 = XMVector3Dot(P, LinePoint1);
    V2 = XMVector3Dot(P, LinePoint2);
    D = _mm_sub_ps(V1, V2);

    // True divide here (vs. the reciprocal estimate above); a near-zero D
    // may produce Inf/NaN, but those lanes are replaced by the select below.
    VT = XMPlaneDotCoord(P, LinePoint1);
    VT = _mm_div_ps(VT, D);

    Point = _mm_sub_ps(LinePoint2, LinePoint1);
    Point = _mm_mul_ps(Point,VT);
    Point = _mm_add_ps(Point,LinePoint1);
    Zero = XMVectorZero();
    Control = XMVectorNearEqual(D, Zero, g_XMEpsilon);
    Result = XMVectorSelect(Point, g_XMQNaN, Control);
    return Result;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
// Computes the line along which planes P1 and P2 intersect, returned as two
// points on that line (*pLinePoint1 and *pLinePoint2). The line direction is
// cross(P2.normal, P1.normal). If the planes are parallel (that cross
// product has squared length <= g_XMEpsilon) both output points are QNaN.
XMINLINE VOID XMPlaneIntersectPlane
(
    XMVECTOR* pLinePoint1,
    XMVECTOR* pLinePoint2,
    FXMVECTOR  P1,
    FXMVECTOR  P2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V1;
    XMVECTOR V2;
    XMVECTOR V3;
    XMVECTOR LengthSq;
    XMVECTOR RcpLengthSq;
    XMVECTOR Point;
    XMVECTOR P1W;
    XMVECTOR P2W;
    XMVECTOR Control;
    XMVECTOR LinePoint1;
    XMVECTOR LinePoint2;

    XMASSERT(pLinePoint1);
    XMASSERT(pLinePoint2);

    // V1 is the direction of the intersection line.
    V1 = XMVector3Cross(P2, P1);

    LengthSq = XMVector3LengthSq(V1);

    // Point on the line: (d1 * cross(n2, dir) + d2 * cross(dir, n1)) / |dir|^2
    V2 = XMVector3Cross(P2, V1);

    P1W = XMVectorSplatW(P1);
    Point = XMVectorMultiply(V2, P1W);

    V3 = XMVector3Cross(V1, P1);

    P2W = XMVectorSplatW(P2);
    Point = XMVectorMultiplyAdd(V3, P2W, Point);

    RcpLengthSq = XMVectorReciprocal(LengthSq);
    LinePoint1 = XMVectorMultiply(Point, RcpLengthSq);

    // Second point: step along the line direction from the first point.
    LinePoint2 = XMVectorAdd(LinePoint1, V1);

    // Parallel planes: emit QNaN in both outputs.
    Control = XMVectorLessOrEqual(LengthSq, g_XMEpsilon.v);
    *pLinePoint1 = XMVectorSelect(LinePoint1,g_XMQNaN.v, Control);
    *pLinePoint2 = XMVectorSelect(LinePoint2,g_XMQNaN.v, Control);

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pLinePoint1);
    XMASSERT(pLinePoint2);
    XMVECTOR V1;
    XMVECTOR V2;
    XMVECTOR V3;
    XMVECTOR LengthSq;
    XMVECTOR Point;
    XMVECTOR P1W;
    XMVECTOR P2W;
    XMVECTOR Control;
    XMVECTOR LinePoint1;
    XMVECTOR LinePoint2;

    V1 = XMVector3Cross(P2, P1);

    LengthSq = XMVector3LengthSq(V1);

    V2 = XMVector3Cross(P2, V1);

    // Splat the plane d terms (w components) for the weighted sum.
    P1W = _mm_shuffle_ps(P1,P1,_MM_SHUFFLE(3,3,3,3));
    Point = _mm_mul_ps(V2, P1W);

    V3 = XMVector3Cross(V1, P1);

    P2W = _mm_shuffle_ps(P2,P2,_MM_SHUFFLE(3,3,3,3));
    V3 = _mm_mul_ps(V3,P2W);
    Point = _mm_add_ps(Point,V3);
    // True divide (the scalar path uses a reciprocal estimate instead).
    LinePoint1 = _mm_div_ps(Point,LengthSq);

    LinePoint2 = _mm_add_ps(LinePoint1, V1);

    Control = XMVectorLessOrEqual(LengthSq, g_XMEpsilon);
    *pLinePoint1 = XMVectorSelect(LinePoint1,g_XMQNaN, Control);
    *pLinePoint2 = XMVectorSelect(LinePoint2,g_XMQNaN, Control);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
// Transforms the plane P = (a, b, c, d) by the matrix M, treating the plane
// as a row 4-vector: result = a*M.r[0] + b*M.r[1] + c*M.r[2] + d*M.r[3].
// NOTE(review): per the XNAMath/D3DX documentation for plane transforms, M
// should be the inverse transpose of the matrix used to transform points --
// confirm at call sites; this function applies M as given.
XMFINLINE XMVECTOR XMPlaneTransform
(
    FXMVECTOR P,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR X;
    XMVECTOR Y;
    XMVECTOR Z;
    XMVECTOR W;
    XMVECTOR Result;

    W = XMVectorSplatW(P);
    Z = XMVectorSplatZ(P);
    Y = XMVectorSplatY(P);
    X = XMVectorSplatX(P);

    Result = XMVectorMultiply(W, M.r[3]);
    Result = XMVectorMultiplyAdd(Z, M.r[2], Result);
    Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
    Result = XMVectorMultiplyAdd(X, M.r[0], Result);

    return Result;

#elif defined(_XM_SSE_INTRINSICS_)
    // Splat each plane component and accumulate the row combination.
    XMVECTOR X = _mm_shuffle_ps(P,P,_MM_SHUFFLE(0,0,0,0));
    XMVECTOR Y = _mm_shuffle_ps(P,P,_MM_SHUFFLE(1,1,1,1));
    XMVECTOR Z = _mm_shuffle_ps(P,P,_MM_SHUFFLE(2,2,2,2));
    XMVECTOR W = _mm_shuffle_ps(P,P,_MM_SHUFFLE(3,3,3,3));
    X = _mm_mul_ps(X, M.r[0]);
    Y = _mm_mul_ps(Y, M.r[1]);
    Z = _mm_mul_ps(Z, M.r[2]);
    W = _mm_mul_ps(W, M.r[3]);
    // Pairwise sums balance the dependency chains.
    X = _mm_add_ps(X,Z);
    Y = _mm_add_ps(Y,W);
    X = _mm_add_ps(X,Y);
    return X;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMFLOAT4* XMPlaneTransformStream
+(
+    XMFLOAT4*       pOutputStream,
+    size_t          OutputStride,
+    CONST XMFLOAT4* pInputStream,
+    size_t          InputStride,
+    size_t          PlaneCount,
+    CXMMATRIX       M
+)
+{
+    return XMVector4TransformStream(pOutputStream,
+                                    OutputStride,
+                                    pInputStream,
+                                    InputStride,
+                                    PlaneCount,
+                                    M);
+}
+
+//------------------------------------------------------------------------------
+// Conversion operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
// Builds a plane from a point on the plane and its normal:
// result = (Normal.x, Normal.y, Normal.z, -dot3(Point, Normal)).
// The normal is used as given (not normalized here).
XMFINLINE XMVECTOR XMPlaneFromPointNormal
(
    FXMVECTOR Point,
    FXMVECTOR Normal
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR W;
    XMVECTOR Result;

    W = XMVector3Dot(Point, Normal);
    W = XMVectorNegate(W);
    // Keep Normal in xyz, the negated dot product in w.
    Result = XMVectorSelect(W, Normal, g_XMSelect1110.v);

    return Result;

#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR W;
    XMVECTOR Result;
    W = XMVector3Dot(Point,Normal);
    // Negate by multiplying with -1, then merge xyz from Normal with w from W.
    W = _mm_mul_ps(W,g_XMNegativeOne);
    Result = _mm_and_ps(Normal,g_XMMask3);
    W = _mm_and_ps(W,g_XMMaskW);
    Result = _mm_or_ps(Result,W);
    return Result;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
// Builds the plane containing the three given points. The normal is the
// normalized cross product of the edge vectors (Point1 - Point2) and
// (Point1 - Point3); d = -dot(normal, Point1). The winding of the points
// determines which way the normal faces.
XMFINLINE XMVECTOR XMPlaneFromPoints
(
    FXMVECTOR Point1,
    FXMVECTOR Point2,
    FXMVECTOR Point3
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    XMVECTOR D;
    XMVECTOR V21;
    XMVECTOR V31;
    XMVECTOR Result;

    V21 = XMVectorSubtract(Point1, Point2);
    V31 = XMVectorSubtract(Point1, Point3);

    N = XMVector3Cross(V21, V31);
    N = XMVector3Normalize(N);

    D = XMPlaneDotNormal(N, Point1);
    D = XMVectorNegate(D);

    // Normal in xyz, -dot(N, Point1) in w.
    Result = XMVectorSelect(D, N, g_XMSelect1110.v);

    return Result;

#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR N;
    XMVECTOR D;
    XMVECTOR V21;
    XMVECTOR V31;
    XMVECTOR Result;

    V21 = _mm_sub_ps(Point1, Point2);
    V31 = _mm_sub_ps(Point1, Point3);

    N = XMVector3Cross(V21, V31);
    N = XMVector3Normalize(N);

    D = XMPlaneDotNormal(N, Point1);
    // Negate d, then merge: xyz from N, w from D.
    D = _mm_mul_ps(D,g_XMNegativeOne);
    N = _mm_and_ps(N,g_XMMask3);
    D = _mm_and_ps(D,g_XMMaskW);
    Result = _mm_or_ps(D,N);
    return Result;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+/****************************************************************************
+ *
+ * Color
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+// Comparison operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMColorEqual
+(
+    FXMVECTOR C1,
+    FXMVECTOR C2
+)
+{
+    return XMVector4Equal(C1, C2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMColorNotEqual
+(
+    FXMVECTOR C1,
+    FXMVECTOR C2
+)
+{
+    return XMVector4NotEqual(C1, C2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMColorGreater
+(
+    FXMVECTOR C1,
+    FXMVECTOR C2
+)
+{
+    return XMVector4Greater(C1, C2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMColorGreaterOrEqual
+(
+    FXMVECTOR C1,
+    FXMVECTOR C2
+)
+{
+    return XMVector4GreaterOrEqual(C1, C2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMColorLess
+(
+    FXMVECTOR C1,
+    FXMVECTOR C2
+)
+{
+    return XMVector4Less(C1, C2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMColorLessOrEqual
+(
+    FXMVECTOR C1,
+    FXMVECTOR C2
+)
+{
+    return XMVector4LessOrEqual(C1, C2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMColorIsNaN
+(
+    FXMVECTOR C
+)
+{
+    return XMVector4IsNaN(C);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMColorIsInfinite
+(
+    FXMVECTOR C
+)
+{
+    return XMVector4IsInfinite(C);
+}
+
+//------------------------------------------------------------------------------
+// Computation operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
// Returns the negative (photographic complement) of the color:
// (1-r, 1-g, 1-b, a). The alpha channel is passed through unchanged.
XMFINLINE XMVECTOR XMColorNegative
(
    FXMVECTOR vColor
)
{
#if defined(_XM_NO_INTRINSICS_)
//    XMASSERT(XMVector4GreaterOrEqual(C, XMVectorReplicate(0.0f)));
//    XMASSERT(XMVector4LessOrEqual(C, XMVectorReplicate(1.0f)));
    XMVECTOR vResult = {
        1.0f - vColor.vector4_f32[0],
        1.0f - vColor.vector4_f32[1],
        1.0f - vColor.vector4_f32[2],
        vColor.vector4_f32[3]
    };
    return vResult;

#elif defined(_XM_SSE_INTRINSICS_)
    // Negate only x,y and z (sign-bit flip via XOR).
    XMVECTOR vTemp = _mm_xor_ps(vColor,g_XMNegate3);
    // Add 1,1,1,0 to -x,-y,-z,w
    return _mm_add_ps(vTemp,g_XMOne3);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMColorModulate
+(
+    FXMVECTOR C1,
+    FXMVECTOR C2
+)
+{
+    return XMVectorMultiply(C1, C2);
+}
+
+//------------------------------------------------------------------------------
+
// Adjusts the saturation of a color by interpolating between its luminance
// (grayscale value, Rec.709-style weights) and the color itself:
// fSaturation == 0 yields pure grayscale, 1 yields the original color,
// values > 1 oversaturate. Alpha is passed through unchanged.
XMFINLINE XMVECTOR XMColorAdjustSaturation
(
    FXMVECTOR vColor,
    FLOAT    fSaturation
)
{
#if defined(_XM_NO_INTRINSICS_)
    CONST XMVECTOR gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f};

    // Luminance = 0.2125f * C[0] + 0.7154f * C[1] + 0.0721f * C[2];
    // Result = (C - Luminance) * Saturation + Luminance;

    FLOAT fLuminance = (vColor.vector4_f32[0]*gvLuminance.vector4_f32[0])+(vColor.vector4_f32[1]*gvLuminance.vector4_f32[1])+(vColor.vector4_f32[2]*gvLuminance.vector4_f32[2]);
    XMVECTOR vResult = {
        ((vColor.vector4_f32[0] - fLuminance)*fSaturation)+fLuminance,
        ((vColor.vector4_f32[1] - fLuminance)*fSaturation)+fLuminance,
        ((vColor.vector4_f32[2] - fLuminance)*fSaturation)+fLuminance,
        vColor.vector4_f32[3]};
    return vResult;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f};
// Mul RGB by intensity constants
    XMVECTOR vLuminance = _mm_mul_ps(vColor,gvLuminance);      
// vResult.x = vLuminance.y, vResult.y = vLuminance.y,
// vResult.z = vLuminance.z, vResult.w = vLuminance.z 
    XMVECTOR vResult = vLuminance;
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(2,2,1,1)); 
// vLuminance.x += vLuminance.y
    vLuminance = _mm_add_ss(vLuminance,vResult);
// Splat vLuminance.z
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(2,2,2,2));
// vLuminance.x += vLuminance.z (Dot product)
    vLuminance = _mm_add_ss(vLuminance,vResult);
// Splat vLuminance
    vLuminance = _mm_shuffle_ps(vLuminance,vLuminance,_MM_SHUFFLE(0,0,0,0));
// Splat fSaturation
    XMVECTOR vSaturation = _mm_set_ps1(fSaturation);
// vResult = ((vColor-vLuminance)*vSaturation)+vLuminance;
    vResult = _mm_sub_ps(vColor,vLuminance);
    vResult = _mm_mul_ps(vResult,vSaturation);
    vResult = _mm_add_ps(vResult,vLuminance);
// Retain w from the source color
    vLuminance = _mm_shuffle_ps(vResult,vColor,_MM_SHUFFLE(3,2,2,2));   // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w
    vResult = _mm_shuffle_ps(vResult,vLuminance,_MM_SHUFFLE(3,0,1,0));  // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w
    return vResult;
// NOTE(review): the #elif below tests XM_NO_MISALIGNED_VECTOR_ACCESS where
// every sibling function ends with "#else // _XM_VMX128_INTRINSICS_"; it
// looks like leftover from the stripped VMX128 path. Harmless on the
// no-intrinsics and SSE configurations this import targets -- confirm
// before building any other configuration.
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
// Adjusts the contrast of a color by interpolating each RGB channel between
// middle gray (0.5) and the channel value: fContrast == 0 yields flat gray,
// 1 yields the original color, values > 1 increase contrast. Alpha is passed
// through unchanged.
XMFINLINE XMVECTOR XMColorAdjustContrast
(
    FXMVECTOR vColor,
    FLOAT    fContrast
)
{
#if defined(_XM_NO_INTRINSICS_)
    // Result = (vColor - 0.5f) * fContrast + 0.5f;
    XMVECTOR vResult = {
        ((vColor.vector4_f32[0]-0.5f) * fContrast) + 0.5f,
        ((vColor.vector4_f32[1]-0.5f) * fContrast) + 0.5f,
        ((vColor.vector4_f32[2]-0.5f) * fContrast) + 0.5f,
        vColor.vector4_f32[3]        // Leave W untouched
    };
    return vResult;

#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vScale = _mm_set_ps1(fContrast);           // Splat the scale
    XMVECTOR vResult = _mm_sub_ps(vColor,g_XMOneHalf);  // Subtract 0.5f from the source (Saving source)
    vResult = _mm_mul_ps(vResult,vScale);               // Mul by scale
    vResult = _mm_add_ps(vResult,g_XMOneHalf);          // Add 0.5f
// Retain w from the source color
    vScale = _mm_shuffle_ps(vResult,vColor,_MM_SHUFFLE(3,2,2,2));   // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w
    vResult = _mm_shuffle_ps(vResult,vScale,_MM_SHUFFLE(3,0,1,0));  // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w
    return vResult;
// NOTE(review): same suspicious #elif as XMColorAdjustSaturation -- looks
// like leftover from the stripped VMX128 path; harmless on the
// configurations this import targets.
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
+
+/****************************************************************************
+ *
+ * Miscellaneous
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMINLINE BOOL XMVerifyCPUSupport()
+{
+#if defined(_XM_NO_INTRINSICS_) || !defined(_XM_SSE_INTRINSICS_)
+       return TRUE;
+#else // _XM_SSE_INTRINSICS_
+       // Note that on Windows 2000 or older, SSE2 detection is not supported so this will always fail
+       // Detecting SSE2 on older versions of Windows would require using cpuid directly
+       return ( IsProcessorFeaturePresent( PF_XMMI_INSTRUCTIONS_AVAILABLE ) && IsProcessorFeaturePresent( PF_XMMI64_INSTRUCTIONS_AVAILABLE ) );
+#endif
+}
+
+
+//------------------------------------------------------------------------------
+
+#define XMASSERT_LINE_STRING_SIZE 16
+
// Reports a failed assertion (expression text, file name and line number) to
// the debugger output and then breaks into the debugger. Invoked by the
// XMASSERT macro in checked builds.
XMINLINE VOID XMAssert
(
    CONST CHAR* pExpression,
    CONST CHAR* pFileName,
    UINT        LineNumber
)
{
    CHAR        aLineString[XMASSERT_LINE_STRING_SIZE];
    CHAR*       pLineString;
    UINT        Line;

    // Convert LineNumber to decimal text by writing digits backwards from
    // the end of the buffer (last slot holds the terminating '\0'). After
    // the loop pLineString points one before the most significant digit, so
    // pLineString + 1 is the finished string. Leading bytes stay
    // uninitialized but are never read.
    // NOTE(review): if LineNumber were 0 the loop body never runs and an
    // empty string is printed; __LINE__ is never 0 in practice.
    aLineString[XMASSERT_LINE_STRING_SIZE - 2] = '0';
    aLineString[XMASSERT_LINE_STRING_SIZE - 1] = '\0';
    for (Line = LineNumber, pLineString = aLineString + XMASSERT_LINE_STRING_SIZE - 2;
         Line != 0 && pLineString >= aLineString;
         Line /= 10, pLineString--)
    {
        *pLineString = (CHAR)('0' + (Line % 10));
    }

#ifndef NO_OUTPUT_DEBUG_STRING
    OutputDebugStringA("Assertion failed: ");
    OutputDebugStringA(pExpression);
    OutputDebugStringA(", file ");
    OutputDebugStringA(pFileName);
    OutputDebugStringA(", line ");
    OutputDebugStringA(pLineString + 1);
    OutputDebugStringA("\r\n");
#else
    // Kernel-mode / no-OutputDebugString builds use DbgPrint, which can
    // format the line number directly.
    DbgPrint("Assertion failed: %s, file %s, line %d\r\n", pExpression, pFileName, LineNumber);
#endif

    __debugbreak();
}
+
+//------------------------------------------------------------------------------
+
// Computes the Fresnel reflectance term for unpolarized light from the
// cosine of the incident angle and the material's refraction index, per
// component, clamped to [0, 1].
XMFINLINE XMVECTOR XMFresnelTerm
(
    FXMVECTOR CosIncidentAngle,
    FXMVECTOR RefractionIndex
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR G;
    XMVECTOR D, S;
    XMVECTOR V0, V1, V2, V3;
    XMVECTOR Result;

    // Result = 0.5f * (g - c)^2 / (g + c)^2 * ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) where
    // c = CosIncidentAngle
    // g = sqrt(c^2 + RefractionIndex^2 - 1)

    XMASSERT(!XMVector4IsInfinite(CosIncidentAngle));

    // g = sqrt(abs(c^2 + n^2 - 1)); abs guards against small negative
    // values from rounding when c and n are near zero.
    G = XMVectorMultiplyAdd(RefractionIndex, RefractionIndex, g_XMNegativeOne.v);
    G = XMVectorMultiplyAdd(CosIncidentAngle, CosIncidentAngle, G);
    G = XMVectorAbs(G);
    G = XMVectorSqrt(G);

    S = XMVectorAdd(G, CosIncidentAngle);
    D = XMVectorSubtract(G, CosIncidentAngle);

    // V0 = 0.5 * (g - c)^2 / (g + c)^2
    V0 = XMVectorMultiply(D, D);
    V1 = XMVectorMultiply(S, S);
    V1 = XMVectorReciprocal(V1);
    V0 = XMVectorMultiply(g_XMOneHalf.v, V0);
    V0 = XMVectorMultiply(V0, V1);

    // V2 = (c*(g + c) - 1)^2 / (c*(g - c) + 1)^2 + 1
    V2 = XMVectorMultiplyAdd(CosIncidentAngle, S, g_XMNegativeOne.v);
    V3 = XMVectorMultiplyAdd(CosIncidentAngle, D, g_XMOne.v);
    V2 = XMVectorMultiply(V2, V2);
    V3 = XMVectorMultiply(V3, V3);
    V3 = XMVectorReciprocal(V3);
    V2 = XMVectorMultiplyAdd(V2, V3, g_XMOne.v);

    Result = XMVectorMultiply(V0, V2);

    // Clamp to [0, 1].
    Result = XMVectorSaturate(Result);

    return Result;

#elif defined(_XM_SSE_INTRINSICS_)
    // Result = 0.5f * (g - c)^2 / (g + c)^2 * ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) where
    // c = CosIncidentAngle
    // g = sqrt(c^2 + RefractionIndex^2 - 1)

    XMASSERT(!XMVector4IsInfinite(CosIncidentAngle));

    // G = sqrt(abs((RefractionIndex^2-1) + CosIncidentAngle^2))
    XMVECTOR G = _mm_mul_ps(RefractionIndex,RefractionIndex);
    XMVECTOR vTemp = _mm_mul_ps(CosIncidentAngle,CosIncidentAngle);
    G = _mm_sub_ps(G,g_XMOne);
    vTemp = _mm_add_ps(vTemp,G);
    // max((0-vTemp),vTemp) == abs(vTemp)
    // The abs is needed to deal with refraction and cosine being zero
    G = _mm_setzero_ps();
    G = _mm_sub_ps(G,vTemp);
    G = _mm_max_ps(G,vTemp);
    // Last operation, the sqrt()
    G = _mm_sqrt_ps(G);

    // Calc G-C and G+C
    XMVECTOR GAddC = _mm_add_ps(G,CosIncidentAngle);
    XMVECTOR GSubC = _mm_sub_ps(G,CosIncidentAngle);
    // Perform the term (0.5f *(g - c)^2) / (g + c)^2 
    XMVECTOR vResult = _mm_mul_ps(GSubC,GSubC);
    vTemp = _mm_mul_ps(GAddC,GAddC);
    vResult = _mm_mul_ps(vResult,g_XMOneHalf);
    vResult = _mm_div_ps(vResult,vTemp);
    // Perform the term ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1)
    GAddC = _mm_mul_ps(GAddC,CosIncidentAngle);
    GSubC = _mm_mul_ps(GSubC,CosIncidentAngle);
    GAddC = _mm_sub_ps(GAddC,g_XMOne);
    GSubC = _mm_add_ps(GSubC,g_XMOne);
    GAddC = _mm_mul_ps(GAddC,GAddC);
    GSubC = _mm_mul_ps(GSubC,GSubC);
    GAddC = _mm_div_ps(GAddC,GSubC);
    GAddC = _mm_add_ps(GAddC,g_XMOne);
    // Multiply the two term parts
    vResult = _mm_mul_ps(vResult,GAddC);
    // Clamp to 0.0 - 1.0f
    vResult = _mm_max_ps(vResult,g_XMZero);
    vResult = _mm_min_ps(vResult,g_XMOne);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMScalarNearEqual
+(
+    FLOAT S1,
+    FLOAT S2,
+    FLOAT Epsilon
+)
+{
+    FLOAT Delta = S1 - S2;
+#if defined(_XM_NO_INTRINSICS_)
+    UINT  AbsDelta = *(const UINT*)&Delta & 0x7FFFFFFF;
+    return (*(FLOAT*)&AbsDelta <= Epsilon);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return (fabsf(Delta) <= Epsilon);
+#else
+    return (__fabs(Delta) <= Epsilon);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Modulo the range of the given angle such that -XM_PI <= Angle < XM_PI
//------------------------------------------------------------------------------
// Modulo the range of the given angle such that -XM_PI <= Angle < XM_PI
XMFINLINE FLOAT XMScalarModAngle
(
    FLOAT Angle
)
{
    // Note: The modulo is performed with unsigned math only to work
    // around a precision error on numbers that are close to PI
    float fTemp;
#if defined(_XM_NO_INTRINSICS_) || !defined(_XM_VMX128_INTRINSICS_)
    // Normalize the range from 0.0f to XM_2PI
    Angle = Angle + XM_PI;
    // Perform the modulo, unsigned (truncation toward zero via the
    // float -> INT -> FLOAT round trip)
    fTemp = fabsf(Angle);
    fTemp = fTemp - (XM_2PI * (FLOAT)((INT)(fTemp/XM_2PI)));
    // Restore the number to the range of -XM_PI to XM_PI-epsilon
    fTemp = fTemp - XM_PI;
    // If the modulo'd value was negative, restore negation
    if (Angle<0.0f) {
        fTemp = -fTemp;
    }
    return fTemp;
#else
#endif
}
+
+//------------------------------------------------------------------------------
+
// Computes sin(Value) for an angle in radians.
XMINLINE FLOAT XMScalarSin
(
    FLOAT Value
)
{
#if defined(_XM_NO_INTRINSICS_)

    FLOAT                  ValueMod;
    FLOAT                  ValueSq;
    XMVECTOR               V0123, V0246, V1357, V9111315, V17192123;
    XMVECTOR               V1, V7, V8;
    XMVECTOR               R0, R1, R2;

    // Reduce the angle to -PI <= V < PI so the polynomial below converges.
    ValueMod = XMScalarModAngle(Value);

    // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - V^15 / 15! +
    //           V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI)

    ValueSq = ValueMod * ValueMod;

    // Build vectors of successive odd powers of ValueMod; each 4-wide dot
    // product against a coefficient table evaluates four Taylor terms.
    V0123     = XMVectorSet(1.0f, ValueMod, ValueSq, ValueSq * ValueMod);
    V1        = XMVectorSplatY(V0123);
    V0246     = XMVectorMultiply(V0123, V0123);
    V1357     = XMVectorMultiply(V0246, V1);
    V7        = XMVectorSplatW(V1357);
    V8        = XMVectorMultiply(V7, V1);
    V9111315  = XMVectorMultiply(V1357, V8);
    V17192123 = XMVectorMultiply(V9111315, V8);

    R0        = XMVector4Dot(V1357, g_XMSinCoefficients0.v);
    R1        = XMVector4Dot(V9111315, g_XMSinCoefficients1.v);
    R2        = XMVector4Dot(V17192123, g_XMSinCoefficients2.v);

    return R0.vector4_f32[0] + R1.vector4_f32[0] + R2.vector4_f32[0];

#elif defined(_XM_SSE_INTRINSICS_)
    return sinf( Value );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
// Computes cos(Value) for an angle in radians.
XMINLINE FLOAT XMScalarCos
(
    FLOAT Value
)
{
#if defined(_XM_NO_INTRINSICS_)

    FLOAT                  ValueMod;
    FLOAT                  ValueSq;
    XMVECTOR               V0123, V0246, V8101214, V16182022;
    XMVECTOR               V2, V6, V8;
    XMVECTOR               R0, R1, R2;

    // Reduce the angle to -PI <= V < PI so the polynomial below converges.
    ValueMod = XMScalarModAngle(Value);

    // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! +
    //           V^12 / 12! - V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI)

    ValueSq = ValueMod * ValueMod;

    // Build vectors of successive even powers of ValueMod; each 4-wide dot
    // product against a coefficient table evaluates four Taylor terms.
    V0123 = XMVectorSet(1.0f, ValueMod, ValueSq, ValueSq * ValueMod);
    V0246 = XMVectorMultiply(V0123, V0123);

    V2 = XMVectorSplatZ(V0123);
    V6 = XMVectorSplatW(V0246);
    V8 = XMVectorMultiply(V6, V2);

    V8101214 = XMVectorMultiply(V0246, V8);
    V16182022 = XMVectorMultiply(V8101214, V8);

    R0 = XMVector4Dot(V0246, g_XMCosCoefficients0.v);
    R1 = XMVector4Dot(V8101214, g_XMCosCoefficients1.v);
    R2 = XMVector4Dot(V16182022, g_XMCosCoefficients2.v);

    return R0.vector4_f32[0] + R1.vector4_f32[0] + R2.vector4_f32[0];

#elif defined(_XM_SSE_INTRINSICS_)
    return cosf(Value);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
// Computes both sin(Value) and cos(Value) (angle in radians) in one call,
// sharing the range reduction and power-vector setup between the two
// polynomial evaluations.
XMINLINE VOID XMScalarSinCos
(
    FLOAT* pSin,
    FLOAT* pCos,
    FLOAT  Value
)
{
#if defined(_XM_NO_INTRINSICS_)

    FLOAT                  ValueMod;
    FLOAT                  ValueSq;
    XMVECTOR               V0123, V0246, V1357, V8101214, V9111315, V16182022, V17192123;
    XMVECTOR               V1, V2, V6, V8;
    XMVECTOR               S0, S1, S2, C0, C1, C2;

    XMASSERT(pSin);
    XMASSERT(pCos);

    // Reduce the angle to -PI <= V < PI so the polynomials below converge.
    ValueMod = XMScalarModAngle(Value);

    // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - V^15 / 15! +
    //           V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI)
    // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! +
    //           V^12 / 12! - V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI)

    ValueSq = ValueMod * ValueMod;

    // Shared power vectors: even powers feed the cosine terms, odd powers
    // feed the sine terms; each 4-wide dot product evaluates four terms.
    V0123 = XMVectorSet(1.0f, ValueMod, ValueSq, ValueSq * ValueMod);

    V1 = XMVectorSplatY(V0123);
    V2 = XMVectorSplatZ(V0123);

    V0246 = XMVectorMultiply(V0123, V0123);
    V1357 = XMVectorMultiply(V0246, V1);

    V6 = XMVectorSplatW(V0246);
    V8 = XMVectorMultiply(V6, V2);

    V8101214 = XMVectorMultiply(V0246, V8);
    V9111315 = XMVectorMultiply(V1357, V8);
    V16182022 = XMVectorMultiply(V8101214, V8);
    V17192123 = XMVectorMultiply(V9111315, V8);

    C0 = XMVector4Dot(V0246, g_XMCosCoefficients0.v);
    S0 = XMVector4Dot(V1357, g_XMSinCoefficients0.v);
    C1 = XMVector4Dot(V8101214, g_XMCosCoefficients1.v);
    S1 = XMVector4Dot(V9111315, g_XMSinCoefficients1.v);
    C2 = XMVector4Dot(V16182022, g_XMCosCoefficients2.v);
    S2 = XMVector4Dot(V17192123, g_XMSinCoefficients2.v);

    *pCos = C0.vector4_f32[0] + C1.vector4_f32[0] + C2.vector4_f32[0];
    *pSin = S0.vector4_f32[0] + S1.vector4_f32[0] + S2.vector4_f32[0];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSin);
    XMASSERT(pCos);

    *pSin = sinf(Value);
    *pCos = cosf(Value);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
+XMINLINE FLOAT XMScalarASin
+(
+    FLOAT Value
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    FLOAT AbsValue, Value2, Value3, D;
+    XMVECTOR AbsV, R0, R1, Result;
+    XMVECTOR V3;
+
+    *(UINT*)&AbsValue = *(const UINT*)&Value & 0x7FFFFFFF;
+
+    Value2 = Value * AbsValue;
+    Value3 = Value * Value2;
+    D = (Value - Value2) / sqrtf(1.00000011921f - AbsValue);
+
+    AbsV = XMVectorReplicate(AbsValue);
+
+    V3.vector4_f32[0] = Value3;
+    V3.vector4_f32[1] = 1.0f;
+    V3.vector4_f32[2] = Value3;
+    V3.vector4_f32[3] = 1.0f;
+
+    R1 = XMVectorSet(D, D, Value, Value);
+    R1 = XMVectorMultiply(R1, V3);
+
+    R0 = XMVectorMultiplyAdd(AbsV, g_XMASinCoefficients0.v, g_XMASinCoefficients1.v);
+    R0 = XMVectorMultiplyAdd(AbsV, R0, g_XMASinCoefficients2.v);
+
+    Result = XMVector4Dot(R0, R1);
+
+    return Result.vector4_f32[0];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+    return asinf(Value);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE FLOAT XMScalarACos
+(
+    FLOAT Value
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    return XM_PIDIV2 - XMScalarASin(Value);
+
+#elif defined(_XM_SSE_INTRINSICS_)
+    return acosf(Value);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
XMFINLINE FLOAT XMScalarSinEst
(
    FLOAT Value
)
{
#if defined(_XM_NO_INTRINSICS_)

    FLOAT                  ValueSq;
    XMVECTOR               V;
    XMVECTOR               Y;
    XMVECTOR               Result;

    // Estimated (low-precision) sine; input must already be in [-PI, PI).
    XMASSERT(Value >= -XM_PI);
    XMASSERT(Value < XM_PI);

    // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! (for -PI <= V < PI)

    ValueSq = Value * Value;

    // Build (1, V, V^2, V^3); squaring gives (1, V^2, V^4, V^6), then
    // multiplying by V gives the odd powers (V, V^3, V^5, V^7).
    V = XMVectorSet(1.0f, Value, ValueSq, ValueSq * Value);
    Y = XMVectorSplatY(V);
    V = XMVectorMultiply(V, V);
    V = XMVectorMultiply(V, Y);

    // Dot the odd powers with the series coefficients to sum the terms.
    Result = XMVector4Dot(V, g_XMSinEstCoefficients.v);

    return Result.vector4_f32[0];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(Value >= -XM_PI);
    XMASSERT(Value < XM_PI);
    float ValueSq = Value*Value;
    XMVECTOR vValue = _mm_set_ps1(Value);
    XMVECTOR vTemp = _mm_set_ps(ValueSq * Value,ValueSq,Value,1.0f); 
    vTemp = _mm_mul_ps(vTemp,vTemp);
    vTemp = _mm_mul_ps(vTemp,vValue);
    // vTemp = Value,Value^3,Value^5,Value^7
    vTemp = _mm_mul_ps(vTemp,g_XMSinEstCoefficients);
    // Hand-rolled horizontal add: fold the four coefficient products into
    // lane Z; the other lanes hold don't-care values along the way.
    vValue = _mm_shuffle_ps(vValue,vTemp,_MM_SHUFFLE(1,0,0,0)); // Copy X to the Z position and Y to the W position
    vValue = _mm_add_ps(vValue,vTemp);          // Add Z = X+Z; W = Y+W;
    vTemp = _mm_shuffle_ps(vTemp,vValue,_MM_SHUFFLE(0,3,0,0));  // Copy W to the Z position
    vTemp = _mm_add_ps(vTemp,vValue);           // Add Z and W together
    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(2,2,2,2));    // Splat Z and return
#if defined(_MSC_VER) && (_MSC_VER>=1500)
    // _mm_cvtss_f32 is only available on VS2008+.
    return _mm_cvtss_f32(vTemp);    
#else
    return vTemp.m128_f32[0];
#endif
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
XMFINLINE FLOAT XMScalarCosEst
(
    FLOAT Value
)
{
#if defined(_XM_NO_INTRINSICS_)
    FLOAT    ValueSq;
    XMVECTOR V;
    XMVECTOR Result;
    // Estimated (low-precision) cosine; input must already be in [-PI, PI).
    XMASSERT(Value >= -XM_PI);
    XMASSERT(Value < XM_PI);
    // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! (for -PI <= V < PI)
    ValueSq = Value * Value;
    // (1, V, V^2, V^3) squared gives the even powers (1, V^2, V^4, V^6).
    V = XMVectorSet(1.0f, Value, ValueSq, ValueSq * Value);
    V = XMVectorMultiply(V, V);
    // Dot the even powers with the series coefficients to sum the terms.
    Result = XMVector4Dot(V, g_XMCosEstCoefficients.v);
    return Result.vector4_f32[0];
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(Value >= -XM_PI);
    XMASSERT(Value < XM_PI);
    float ValueSq = Value*Value;
    XMVECTOR vValue = _mm_setzero_ps();
    XMVECTOR vTemp = _mm_set_ps(ValueSq * Value,ValueSq,Value,1.0f); 
    vTemp = _mm_mul_ps(vTemp,vTemp);
    // vTemp = 1.0f,Value^2,Value^4,Value^6
    vTemp = _mm_mul_ps(vTemp,g_XMCosEstCoefficients);
    // Hand-rolled horizontal add: fold the four coefficient products into
    // lane Z; the other lanes hold don't-care values along the way.
    vValue = _mm_shuffle_ps(vValue,vTemp,_MM_SHUFFLE(1,0,0,0)); // Copy X to the Z position and Y to the W position
    vValue = _mm_add_ps(vValue,vTemp);          // Add Z = X+Z; W = Y+W;
    vTemp = _mm_shuffle_ps(vTemp,vValue,_MM_SHUFFLE(0,3,0,0));  // Copy W to the Z position
    vTemp = _mm_add_ps(vTemp,vValue);           // Add Z and W together
    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(2,2,2,2));    // Splat Z and return
#if defined(_MSC_VER) && (_MSC_VER>=1500)
    // _mm_cvtss_f32 is only available on VS2008+.
    return _mm_cvtss_f32(vTemp);    
#else
    return vTemp.m128_f32[0];
#endif
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMScalarSinCosEst
+(
+    FLOAT* pSin,
+    FLOAT* pCos,
+    FLOAT  Value
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    FLOAT    ValueSq;
+    XMVECTOR V, Sin, Cos;
+    XMVECTOR Y;
+
+    XMASSERT(pSin);
+    XMASSERT(pCos);
+    XMASSERT(Value >= -XM_PI);
+    XMASSERT(Value < XM_PI);
+
+    // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! (for -PI <= V < PI)
+    // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! (for -PI <= V < PI)
+
+    ValueSq = Value * Value;
+    V = XMVectorSet(1.0f, Value, ValueSq, Value * ValueSq);
+    Y = XMVectorSplatY(V);
+    Cos = XMVectorMultiply(V, V);
+    Sin = XMVectorMultiply(Cos, Y);
+
+    Cos = XMVector4Dot(Cos, g_XMCosEstCoefficients.v);
+    Sin = XMVector4Dot(Sin, g_XMSinEstCoefficients.v);
+
+    *pCos = Cos.vector4_f32[0];
+    *pSin = Sin.vector4_f32[0];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMASSERT(pSin);
+    XMASSERT(pCos);
+    XMASSERT(Value >= -XM_PI);
+    XMASSERT(Value < XM_PI);
+    float ValueSq = Value * Value;
+    XMVECTOR Cos = _mm_set_ps(Value * ValueSq,ValueSq,Value,1.0f);
+    XMVECTOR Sin = _mm_set_ps1(Value);
+    Cos = _mm_mul_ps(Cos,Cos);
+    Sin = _mm_mul_ps(Sin,Cos);
+    // Cos = 1.0f,Value^2,Value^4,Value^6
+    Cos = XMVector4Dot(Cos,g_XMCosEstCoefficients);
+    _mm_store_ss(pCos,Cos);
+    // Sin = Value,Value^3,Value^5,Value^7
+    Sin = XMVector4Dot(Sin, g_XMSinEstCoefficients);
+    _mm_store_ss(pSin,Sin);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE FLOAT XMScalarASinEst
+(
+    FLOAT Value
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR VR, CR, CS;
+    XMVECTOR Result;
+    FLOAT AbsV, V2, D;
+    CONST FLOAT OnePlusEps = 1.00000011921f;
+
+    *(UINT*)&AbsV = *(const UINT*)&Value & 0x7FFFFFFF;
+    V2 = Value * AbsV;
+    D = OnePlusEps - AbsV;
+
+    CS = XMVectorSet(Value, 1.0f, 1.0f, V2);
+    VR = XMVectorSet(sqrtf(D), Value, V2, D * AbsV);
+    CR = XMVectorMultiply(CS, g_XMASinEstCoefficients.v);
+
+    Result = XMVector4Dot(VR, CR);
+
+    return Result.vector4_f32[0];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+    CONST FLOAT OnePlusEps = 1.00000011921f;
+    FLOAT AbsV = fabsf(Value);
+    FLOAT V2 = Value * AbsV;    // Square with sign retained
+    FLOAT D = OnePlusEps - AbsV;
+
+    XMVECTOR Result = _mm_set_ps(V2,1.0f,1.0f,Value);
+    XMVECTOR VR = _mm_set_ps(D * AbsV,V2,Value,sqrtf(D));
+    Result = _mm_mul_ps(Result, g_XMASinEstCoefficients);
+    Result = XMVector4Dot(VR,Result);
+#if defined(_MSC_VER) && (_MSC_VER>=1500)
+    return _mm_cvtss_f32(Result);    
+#else
+    return Result.m128_f32[0];
+#endif
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE FLOAT XMScalarACosEst
+(
+    FLOAT Value
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR VR, CR, CS;
+    XMVECTOR Result;
+    FLOAT AbsV, V2, D;
+    CONST FLOAT OnePlusEps = 1.00000011921f;
+
+    // return XM_PIDIV2 - XMScalarASin(Value);
+
+    *(UINT*)&AbsV = *(const UINT*)&Value & 0x7FFFFFFF;
+    V2 = Value * AbsV;
+    D = OnePlusEps - AbsV;
+
+    CS = XMVectorSet(Value, 1.0f, 1.0f, V2);
+    VR = XMVectorSet(sqrtf(D), Value, V2, D * AbsV);
+    CR = XMVectorMultiply(CS, g_XMASinEstCoefficients.v);
+
+    Result = XMVector4Dot(VR, CR);
+
+    return XM_PIDIV2 - Result.vector4_f32[0];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+    CONST FLOAT OnePlusEps = 1.00000011921f;
+    FLOAT AbsV = fabsf(Value);
+    FLOAT V2 = Value * AbsV;    // Value^2 retaining sign
+    FLOAT D = OnePlusEps - AbsV;
+    XMVECTOR Result = _mm_set_ps(V2,1.0f,1.0f,Value);
+    XMVECTOR VR = _mm_set_ps(D * AbsV,V2,Value,sqrtf(D));
+    Result = _mm_mul_ps(Result,g_XMASinEstCoefficients);
+    Result = XMVector4Dot(VR,Result);
+#if defined(_MSC_VER) && (_MSC_VER>=1500)
+    return XM_PIDIV2 - _mm_cvtss_f32(Result);    
+#else
+    return XM_PIDIV2 - Result.m128_f32[0];
+#endif
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+#endif // __XNAMATHMISC_INL__
+