/************************************************************************
*                                                                       *
* xnamathmisc.inl -- SIMD C++ Math library for Windows and Xbox 360     *
*                    Quaternion, plane, and color functions             *
*                                                                       *
* Copyright (c) Microsoft Corp. All rights reserved.                    *
*                                                                       *
************************************************************************/

#if defined(_MSC_VER) && (_MSC_VER > 1000)
#pragma once
#endif

#ifndef __XNAMATHMISC_INL__
#define __XNAMATHMISC_INL__

/****************************************************************************
 *
 * Quaternion
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
// Comparison operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Compares two quaternions by forwarding both operands to XMVector4Equal
// and returning its result unchanged.
XMFINLINE BOOL XMQuaternionEqual
(
    FXMVECTOR Q1,
    FXMVECTOR Q2
)
{
    const BOOL bMatch = XMVector4Equal(Q1, Q2);
    return bMatch;
}

//------------------------------------------------------------------------------

// Tests two quaternions for inequality by forwarding both operands to
// XMVector4NotEqual and returning its result unchanged.
XMFINLINE BOOL XMQuaternionNotEqual
(
    FXMVECTOR Q1,
    FXMVECTOR Q2
)
{
    const BOOL bDiffers = XMVector4NotEqual(Q1, Q2);
    return bDiffers;
}

//------------------------------------------------------------------------------

// NaN test for a quaternion; the check itself is performed by XMVector4IsNaN
// on the full four-component vector.
XMFINLINE BOOL XMQuaternionIsNaN
(
    FXMVECTOR Q
)
{
    const BOOL bHasNaN = XMVector4IsNaN(Q);
    return bHasNaN;
}

//------------------------------------------------------------------------------

// Infinity test for a quaternion; the check itself is performed by
// XMVector4IsInfinite on the full four-component vector.
XMFINLINE BOOL XMQuaternionIsInfinite
(
    FXMVECTOR Q
)
{
    const BOOL bHasInfinity = XMVector4IsInfinite(Q);
    return bHasInfinity;
}

//------------------------------------------------------------------------------

// Reports whether Q equals the library constant g_XMIdentityR3 in all four
// components.
XMFINLINE BOOL XMQuaternionIsIdentity
(
    FXMVECTOR Q
)
{
#if defined(_XM_NO_INTRINSICS_)

    // Scalar path: four-component comparison against the constant.
    const BOOL bIsIdentity = XMVector4Equal(Q, g_XMIdentityR3.v);
    return bIsIdentity;

#elif defined(_XM_SSE_INTRINSICS_)
    // Compare lane by lane, then gather one result bit per lane.
    const int iLaneBits = _mm_movemask_ps(_mm_cmpeq_ps(Q,g_XMIdentityR3));
    // Identity only when all four lanes compared equal.
    return (iLaneBits==0x0f);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Computation operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Dot product of two quaternions, computed by XMVector4Dot over all four
// components; the vector it returns is passed back unchanged.
XMFINLINE XMVECTOR XMQuaternionDot
(
    FXMVECTOR Q1,
    FXMVECTOR Q2
)
{
    XMVECTOR DotProduct = XMVector4Dot(Q1, Q2);
    return DotProduct;
}

//------------------------------------------------------------------------------

// Quaternion product of Q1 and Q2.
//
// The result is accumulated as
//     Q1 * Q2.w
//   + ( Q1.w, -Q1.z,  Q1.y, -Q1.x) * Q2.x
//   + ( Q1.z,  Q1.w, -Q1.x, -Q1.y) * Q2.y
//   + (-Q1.y,  Q1.x,  Q1.w, -Q1.z) * Q2.z
XMFINLINE XMVECTOR XMQuaternionMultiply
(
    FXMVECTOR Q1,
    FXMVECTOR Q2
)
{
#if defined(_XM_NO_INTRINSICS_)

    // Each control picks a lane from Q1 (XM_PERMUTE_0*) or from -Q1 (XM_PERMUTE_1*),
    // which yields the reordered, sign-adjusted copies of Q1 listed above.
    CONST XMVECTORU32 ControlWZYX = {XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_1X};
    CONST XMVECTORU32 ControlZWXY = {XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_1Y};
    CONST XMVECTORU32 ControlYXWZ = {XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_1Z};

    XMVECTOR MinusQ1 = XMVectorNegate(Q1);

    // Start with the Q2.w term.
    XMVECTOR Accum = XMVectorMultiply(Q1, XMVectorSplatW(Q2));

    // Add the Q2.x term.
    XMVECTOR Shuffled = XMVectorPermute(Q1, MinusQ1, ControlWZYX.v);
    Accum = XMVectorMultiplyAdd(Shuffled, XMVectorSplatX(Q2), Accum);

    // Add the Q2.y term.
    Shuffled = XMVectorPermute(Q1, MinusQ1, ControlZWXY.v);
    Accum = XMVectorMultiplyAdd(Shuffled, XMVectorSplatY(Q2), Accum);

    // Add the Q2.z term.
    Shuffled = XMVectorPermute(Q1, MinusQ1, ControlYXWZ.v);
    Accum = XMVectorMultiplyAdd(Shuffled, XMVectorSplatZ(Q2), Accum);

    return Accum;

#elif defined(_XM_SSE_INTRINSICS_)
    // Sign masks applied after each shuffle of Q1
    static CONST XMVECTORF32 ControlWZYX = { 1.0f,-1.0f, 1.0f,-1.0f};
    static CONST XMVECTORF32 ControlZWXY = { 1.0f, 1.0f,-1.0f,-1.0f};
    static CONST XMVECTORF32 ControlYXWZ = {-1.0f, 1.0f, 1.0f,-1.0f};
    // Broadcast each component of Q2 across a register
    XMVECTOR vSum   = _mm_shuffle_ps(Q2,Q2,_MM_SHUFFLE(3,3,3,3));
    XMVECTOR vTermX = _mm_shuffle_ps(Q2,Q2,_MM_SHUFFLE(0,0,0,0));
    XMVECTOR vTermY = _mm_shuffle_ps(Q2,Q2,_MM_SHUFFLE(1,1,1,1));
    XMVECTOR vTermZ = _mm_shuffle_ps(Q2,Q2,_MM_SHUFFLE(2,2,2,2));
    // Q2.w * Q1
    vSum = _mm_mul_ps(vSum,Q1);
    // Q1 reordered to (w,z,y,x)
    XMVECTOR vQ1Perm = _mm_shuffle_ps(Q1,Q1,_MM_SHUFFLE(0,1,2,3));
    vTermX = _mm_mul_ps(vTermX,vQ1Perm);
    // Negate the y and w lanes of the Q2.x term
    vTermX = _mm_mul_ps(vTermX,ControlWZYX);
    // Q1 reordered to (z,w,x,y)
    vQ1Perm = _mm_shuffle_ps(vQ1Perm,vQ1Perm,_MM_SHUFFLE(2,3,0,1));
    vTermY = _mm_mul_ps(vTermY,vQ1Perm);
    // Negate the z and w lanes of the Q2.y term
    vTermY = _mm_mul_ps(vTermY,ControlZWXY);
    // Q1 reordered to (y,x,w,z)
    vQ1Perm = _mm_shuffle_ps(vQ1Perm,vQ1Perm,_MM_SHUFFLE(0,1,2,3));
    vTermZ = _mm_mul_ps(vTermZ,vQ1Perm);
    // Negate the x and w lanes of the Q2.z term
    vTermZ = _mm_mul_ps(vTermZ,ControlYXWZ);
    // Sum as (w-term + x-term) + (y-term + z-term)
    vSum = _mm_add_ps(vSum,vTermX);
    vTermY = _mm_add_ps(vTermY,vTermZ);
    vSum = _mm_add_ps(vSum,vTermY);
    return vSum;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Squared length of a quaternion, obtained from XMVector4LengthSq and
// returned as the vector that helper produces.
XMFINLINE XMVECTOR XMQuaternionLengthSq
(
    FXMVECTOR Q
)
{
    XMVECTOR LengthSq = XMVector4LengthSq(Q);
    return LengthSq;
}

//------------------------------------------------------------------------------

// Reciprocal of the quaternion's length, obtained from
// XMVector4ReciprocalLength and returned unchanged.
XMFINLINE XMVECTOR XMQuaternionReciprocalLength
(
    FXMVECTOR Q
)
{
    XMVECTOR InvLength = XMVector4ReciprocalLength(Q);
    return InvLength;
}

//------------------------------------------------------------------------------

// Length (magnitude) of a quaternion, obtained from XMVector4Length and
// returned unchanged.
XMFINLINE XMVECTOR XMQuaternionLength
(
    FXMVECTOR Q
)
{
    XMVECTOR Length = XMVector4Length(Q);
    return Length;
}

//------------------------------------------------------------------------------

// Estimated normalization of a quaternion; the work is done by
// XMVector4NormalizeEst and its result is returned unchanged.
XMFINLINE XMVECTOR XMQuaternionNormalizeEst
(
    FXMVECTOR Q
)
{
    XMVECTOR Normalized = XMVector4NormalizeEst(Q);
    return Normalized;
}

//------------------------------------------------------------------------------

// Normalization of a quaternion; the work is done by XMVector4Normalize and
// its result is returned unchanged.
XMFINLINE XMVECTOR XMQuaternionNormalize
(
    FXMVECTOR Q
)
{
    XMVECTOR Normalized = XMVector4Normalize(Q);
    return Normalized;
}

//------------------------------------------------------------------------------

// Conjugate of Q: the x, y and z components are negated, w is kept.
XMFINLINE XMVECTOR XMQuaternionConjugate
(
    FXMVECTOR Q
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Conj;
    Conj.vector4_f32[0] = -Q.vector4_f32[0];
    Conj.vector4_f32[1] = -Q.vector4_f32[1];
    Conj.vector4_f32[2] = -Q.vector4_f32[2];
    Conj.vector4_f32[3] = Q.vector4_f32[3];
    return Conj;
#elif defined(_XM_SSE_INTRINSICS_)
    // Multiplying by (-1,-1,-1,1) flips the sign of x, y and z only.
    static const XMVECTORF32 FlipXYZ = {-1.0f,-1.0f,-1.0f,1.0f};
    return _mm_mul_ps(Q,FlipXYZ);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Inverse of Q, computed as conjugate(Q) divided by the squared length of Q.
// Lanes whose squared length is less than or equal to g_XMEpsilon are
// replaced by zero instead of the quotient.
XMFINLINE XMVECTOR XMQuaternionInverse
(
    FXMVECTOR Q
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR LengthSq = XMVector4LengthSq(Q);
    XMVECTOR Conj = XMQuaternionConjugate(Q);

    // Mask of lanes that are too short to invert.
    XMVECTOR Degenerate = XMVectorLessOrEqual(LengthSq, g_XMEpsilon.v);

    // conjugate * (1 / lengthSq)
    XMVECTOR InvLengthSq = XMVectorReciprocal(LengthSq);
    XMVECTOR Quotient = XMVectorMultiply(Conj, InvLengthSq);

    CONST XMVECTOR Zero = XMVectorZero();
    return XMVectorSelect(Quotient, Zero, Degenerate);

#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR LengthSq = XMVector4LengthSq(Q);
    XMVECTOR Conj = XMQuaternionConjugate(Q);
    // Mask of lanes that are too short to invert.
    XMVECTOR Degenerate = XMVectorLessOrEqual(LengthSq, g_XMEpsilon);
    // conjugate / lengthSq
    XMVECTOR Quotient = _mm_div_ps(Conj,LengthSq);
    XMVECTOR Zero = XMVectorZero();
    return XMVectorSelect(Quotient, Zero, Degenerate);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Natural logarithm of a quaternion.
//
// With Q0 = Q with its w component cleared and Theta = acos(Q.w), the result
// is Q0 * (Theta / sin(Theta)) when Q.w lies within +/-(1 - 0.00001);
// otherwise Q0 itself is returned.
XMFINLINE XMVECTOR XMQuaternionLn
(
    FXMVECTOR Q
)
{
#if defined(_XM_NO_INTRINSICS_)

    static CONST XMVECTOR OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f};

    XMVECTOR W = XMVectorSplatW(Q);
    // x, y, z from Q; w taken from g_XMSelect1110
    XMVECTOR VectorPart = XMVectorSelect(g_XMSelect1110.v, Q, g_XMSelect1110.v);

    XMVECTOR InRange = XMVectorInBounds(W, OneMinusEpsilon);

    XMVECTOR Angle = XMVectorACos(W);
    XMVECTOR SinAngle = XMVectorSin(Angle);

    // Angle * (1 / sin(Angle))
    XMVECTOR Ratio = XMVectorReciprocal(SinAngle);
    Ratio = XMVectorMultiply(Angle, Ratio);

    XMVECTOR Scaled = XMVectorMultiply(VectorPart, Ratio);

    // Scaled value where W was in range, unscaled vector part elsewhere
    return XMVectorSelect(VectorPart, Scaled, InRange);

#elif defined(_XM_SSE_INTRINSICS_)
    static CONST XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f};
    static CONST XMVECTORF32 NegOneMinusEpsilon = {-(1.0f - 0.00001f), -(1.0f - 0.00001f),-(1.0f - 0.00001f),-(1.0f - 0.00001f)};
    // Broadcast W
    XMVECTOR vW = _mm_shuffle_ps(Q,Q,_MM_SHUFFLE(3,3,3,3));
    // Q with W cleared
    XMVECTOR vVectorPart = _mm_and_ps(Q,g_XMMask3);
    // In range when -(1-eps) <= W <= (1-eps)
    XMVECTOR vInRange = _mm_cmple_ps(vW,OneMinusEpsilon);
    XMVECTOR vAboveLower = _mm_cmpge_ps(vW,NegOneMinusEpsilon);
    vInRange = _mm_and_ps(vInRange,vAboveLower);
    // Angle and its sine
    XMVECTOR vAngle = XMVectorACos(vW);
    XMVECTOR vSinAngle = XMVectorSin(vAngle);
    // (Angle / sin(Angle)) * vector part
    XMVECTOR vScaled = _mm_div_ps(vAngle,vSinAngle);
    vScaled = _mm_mul_ps(vScaled,vVectorPart);
    // Out-of-range W falls back to the vector part (Q with W cleared)
    return XMVectorSelect(vVectorPart,vScaled,vInRange);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Exponential of a quaternion.
//
// Theta is the length of Q's x, y, z part. The result's x, y, z are
// Q * (sin(Theta) / Theta), or Q itself where Theta is within g_XMEpsilon of
// zero; the result's w is cos(Theta).
XMFINLINE XMVECTOR XMQuaternionExp
(
    FXMVECTOR Q
)
{
#if defined(_XM_NO_INTRINSICS_) 

    XMVECTOR Angle = XMVector3Length(Q);

    XMVECTOR SinAngle;
    XMVECTOR CosAngle;
    XMVectorSinCos(&SinAngle, &CosAngle, Angle);

    // sin(Angle) * (1 / Angle)
    XMVECTOR Ratio = XMVectorReciprocal(Angle);
    Ratio = XMVectorMultiply(SinAngle, Ratio);

    XMVECTOR Scaled = XMVectorMultiply(Q, Ratio);

    // Near-zero angle: use Q directly rather than the scaled value
    XMVECTOR Zero = XMVectorZero();
    XMVECTOR NearZero = XMVectorNearEqual(Angle, Zero, g_XMEpsilon.v);
    Scaled = XMVectorSelect(Scaled, Q, NearZero);

    // x, y, z from the scaled vector; w from the cosine
    return XMVectorSelect(CosAngle, Scaled, g_XMSelect1110.v);

#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR Angle = XMVector3Length(Q);
    XMVECTOR SinAngle;
    XMVECTOR CosAngle;
    XMVectorSinCos(&SinAngle, &CosAngle, Angle);
    // Q * (sin(Angle) / Angle)
    XMVECTOR Ratio = _mm_div_ps(SinAngle,Angle);
    XMVECTOR Scaled = _mm_mul_ps(Q, Ratio);
    // Near-zero angle: use Q directly rather than the scaled value
    XMVECTOR Zero = XMVectorZero();
    XMVECTOR NearZero = XMVectorNearEqual(Angle, Zero, g_XMEpsilon);
    Scaled = XMVectorSelect(Scaled,Q,NearZero);
    // Keep x, y, z of the scaled vector and merge in w from the cosine
    Scaled = _mm_and_ps(Scaled,g_XMMask3);
    CosAngle = _mm_and_ps(CosAngle,g_XMMaskW);
    return _mm_or_ps(Scaled,CosAngle);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Scalar-parameter front end for XMQuaternionSlerpV: replicates t into every
// lane of a vector and forwards the call.
XMINLINE XMVECTOR XMQuaternionSlerp
(
    FXMVECTOR Q0,
    FXMVECTOR Q1,
    FLOAT    t
)
{
    return XMQuaternionSlerpV(Q0, Q1, XMVectorReplicate(t));
}

//------------------------------------------------------------------------------

// Spherical linear interpolation between Q0 and Q1.
//
//   Result = Q0 * sin((1 - t) * Omega) / sin(Omega) + Q1 * sin(t * Omega) / sin(Omega)
//
// T must hold the same value t in all four lanes (asserted). Omega is the
// angle whose cosine is |dot(Q0, Q1)|; when the dot product is negative the
// Q1 weight is negated. When the cosine is not below 1 - 0.00001 the weights
// fall back to the plain (1 - t, t) pair.
XMINLINE XMVECTOR XMQuaternionSlerpV
(
    FXMVECTOR Q0,
    FXMVECTOR Q1,
    FXMVECTOR T
)
{
#if defined(_XM_NO_INTRINSICS_)

    CONST XMVECTOR OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f};

    XMASSERT((T.vector4_f32[1] == T.vector4_f32[0]) && (T.vector4_f32[2] == T.vector4_f32[0]) && (T.vector4_f32[3] == T.vector4_f32[0]));

    XMVECTOR CosAngle = XMQuaternionDot(Q0, Q1);

    // Flip = -1 where the dot product is negative, +1 elsewhere
    XMVECTOR Zero = XMVectorZero();
    XMVECTOR Mask = XMVectorLess(CosAngle, Zero);
    XMVECTOR Flip = XMVectorSelect(g_XMOne.v, g_XMNegativeOne.v, Mask);

    // Work with the non-negative cosine from here on
    CosAngle = XMVectorMultiply(CosAngle, Flip);

    // Mask now marks lanes where the trigonometric weights are used
    Mask = XMVectorLess(CosAngle, OneMinusEpsilon);

    // sin = sqrt(1 - cos * cos)
    XMVECTOR SinAngle = XMVectorNegativeMultiplySubtract(CosAngle, CosAngle, g_XMOne.v);
    SinAngle = XMVectorSqrt(SinAngle);

    XMVECTOR Angle = XMVectorATan2(SinAngle, CosAngle);

    // Build Weights = (1 - t, t, 0, 0)
    XMVECTOR SignBits = XMVectorSplatSignMask();
    XMVECTOR C1000 = XMVectorSetBinaryConstant(1, 0, 0, 0);
    XMVECTOR Weights = XMVectorShiftLeft(T, Zero, 2);
    SignBits = XMVectorShiftLeft(SignBits, Zero, 3);
    Weights = XMVectorXorInt(Weights, SignBits);
    Weights = XMVectorAdd(C1000, Weights);

    XMVECTOR InvSinAngle = XMVectorReciprocal(SinAngle);

    // sin(weight * Angle) / sin(Angle)
    XMVECTOR Scales = XMVectorMultiply(Weights, Angle);
    Scales = XMVectorSin(Scales);
    Scales = XMVectorMultiply(Scales, InvSinAngle);

    Scales = XMVectorSelect(Weights, Scales, Mask);

    XMVECTOR ScaleQ1 = XMVectorSplatY(Scales);
    XMVECTOR ScaleQ0 = XMVectorSplatX(Scales);

    ScaleQ1 = XMVectorMultiply(ScaleQ1, Flip);

    XMVECTOR Blend = XMVectorMultiply(Q0, ScaleQ0);
    Blend = XMVectorMultiplyAdd(Q1, ScaleQ1, Blend);

    return Blend;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f};
    static const XMVECTORI32 SignMask2 = {0x80000000,0x00000000,0x00000000,0x00000000};
    static const XMVECTORI32 MaskXY = {0xFFFFFFFF,0xFFFFFFFF,0x00000000,0x00000000};

    XMASSERT((XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T)));

    XMVECTOR CosAngle = XMQuaternionDot(Q0, Q1);

    // Flip = -1 where the dot product is negative, +1 elsewhere
    XMVECTOR Zero = XMVectorZero();
    XMVECTOR Mask = XMVectorLess(CosAngle, Zero);
    XMVECTOR Flip = XMVectorSelect(g_XMOne, g_XMNegativeOne, Mask);

    // Work with the non-negative cosine from here on
    CosAngle = _mm_mul_ps(CosAngle, Flip);

    // Mask now marks lanes where the trigonometric weights are used
    Mask = XMVectorLess(CosAngle, OneMinusEpsilon);

    // sin = sqrt(1 - cos * cos)
    XMVECTOR SinAngle = _mm_mul_ps(CosAngle,CosAngle);
    SinAngle = _mm_sub_ps(g_XMOne,SinAngle);
    SinAngle = _mm_sqrt_ps(SinAngle);

    XMVECTOR Angle = XMVectorATan2(SinAngle, CosAngle);

    // Build Weights = (1 - t, t, 0, 0): keep t in x and y, negate x, add (1,0,0,0)
    XMVECTOR Weights = _mm_shuffle_ps(T,T,_MM_SHUFFLE(2,3,0,1));
    Weights = _mm_and_ps(Weights,MaskXY);
    Weights = _mm_xor_ps(Weights,SignMask2);
    Weights = _mm_add_ps(g_XMIdentityR0, Weights);

    // sin(weight * Angle) / sin(Angle)
    XMVECTOR Scales = _mm_mul_ps(Weights, Angle);
    Scales = XMVectorSin(Scales);
    Scales = _mm_div_ps(Scales, SinAngle);

    Scales = XMVectorSelect(Weights, Scales, Mask);

    XMVECTOR ScaleQ1 = XMVectorSplatY(Scales);
    XMVECTOR ScaleQ0 = XMVectorSplatX(Scales);

    ScaleQ1 = _mm_mul_ps(ScaleQ1, Flip);
    XMVECTOR Blend = _mm_mul_ps(Q0, ScaleQ0);
    ScaleQ1 = _mm_mul_ps(ScaleQ1, Q1);
    Blend = _mm_add_ps(Blend,ScaleQ1);
    return Blend;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Scalar-parameter front end for XMQuaternionSquadV: replicates t into every
// lane of a vector and forwards the call.
XMFINLINE XMVECTOR XMQuaternionSquad
(
    FXMVECTOR Q0,
    FXMVECTOR Q1,
    FXMVECTOR Q2,
    CXMVECTOR Q3,
    FLOAT    t
)
{
    return XMQuaternionSquadV(Q0, Q1, Q2, Q3, XMVectorReplicate(t));
}

//------------------------------------------------------------------------------

// Spherical quadrangle interpolation built from three slerps:
//   slerp( slerp(Q0, Q3, T), slerp(Q1, Q2, T), 2 * (T - T * T) )
// T must hold the same value in all four lanes (asserted).
XMFINLINE XMVECTOR XMQuaternionSquadV
(
    FXMVECTOR Q0,
    FXMVECTOR Q1,
    FXMVECTOR Q2,
    CXMVECTOR Q3,
    CXMVECTOR T
)
{
    XMASSERT( (XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T)) );

    XMVECTOR Two = XMVectorSplatConstant(2, 0);

    // Interpolate the outer pair and the inner pair with the same parameter.
    XMVECTOR Outer = XMQuaternionSlerpV(Q0, Q3, T);
    XMVECTOR Inner = XMQuaternionSlerpV(Q1, Q2, T);

    // Blend factor between the two intermediate results: (T - T * T) * 2
    XMVECTOR BlendT = XMVectorNegativeMultiplySubtract(T, T, T);
    BlendT = XMVectorMultiply(BlendT, Two);

    return XMQuaternionSlerpV(Outer, Inner, BlendT);

}

//------------------------------------------------------------------------------

// Computes the three control quaternions written to *pA, *pB and *pC from the
// four inputs Q0..Q3.
//
// For each neighbouring pair, the second operand is negated when the squared
// length of the sum is smaller than the squared length of the difference.
// The outputs are then
//   *pA = Q1  * exp(-1/4 * (ln(Q1^-1  * SQ0) + ln(Q1^-1  * SQ2)))
//   *pB = SQ2 * exp(-1/4 * (ln(SQ2^-1 * Q1)  + ln(SQ2^-1 * SQ3)))
//   *pC = SQ2
// where SQ0, SQ2, SQ3 are the possibly negated inputs. All three output
// pointers must be non-null (asserted).
XMINLINE VOID XMQuaternionSquadSetup
(
    XMVECTOR* pA,
    XMVECTOR* pB,
    XMVECTOR* pC,
    FXMVECTOR  Q0,
    FXMVECTOR  Q1,
    FXMVECTOR  Q2,
    CXMVECTOR  Q3
)
{
    XMASSERT(pA);
    XMASSERT(pB);
    XMASSERT(pC);

    // Q2 relative to Q1
    XMVECTOR SumSq12 = XMQuaternionLengthSq(XMVectorAdd(Q1, Q2));
    XMVECTOR DiffSq12 = XMQuaternionLengthSq(XMVectorSubtract(Q1, Q2));
    XMVECTOR Flip2 = XMVectorLess(SumSq12, DiffSq12);
    XMVECTOR SQ2 = XMVectorSelect(Q2, XMVectorNegate(Q2), Flip2);

    // Q0 relative to Q1
    XMVECTOR SumSq01 = XMQuaternionLengthSq(XMVectorAdd(Q0, Q1));
    XMVECTOR DiffSq01 = XMQuaternionLengthSq(XMVectorSubtract(Q0, Q1));
    XMVECTOR Flip0 = XMVectorLess(SumSq01, DiffSq01);
    XMVECTOR SQ0 = XMVectorSelect(Q0, XMVectorNegate(Q0), Flip0);

    // Q3 relative to the (possibly negated) Q2
    XMVECTOR SumSq23 = XMQuaternionLengthSq(XMVectorAdd(SQ2, Q3));
    XMVECTOR DiffSq23 = XMQuaternionLengthSq(XMVectorSubtract(SQ2, Q3));
    XMVECTOR Flip3 = XMVectorLess(SumSq23, DiffSq23);
    XMVECTOR SQ3 = XMVectorSelect(Q3, XMVectorNegate(Q3), Flip3);

    XMVECTOR InvQ1 = XMQuaternionInverse(Q1);
    XMVECTOR InvSQ2 = XMQuaternionInverse(SQ2);

    XMVECTOR LogPrev1 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ0));
    XMVECTOR LogNext1 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ2));
    XMVECTOR LogPrev2 = XMQuaternionLn(XMQuaternionMultiply(InvSQ2, Q1));
    XMVECTOR LogNext2 = XMQuaternionLn(XMQuaternionMultiply(InvSQ2, SQ3));

    XMVECTOR NegQuarter = XMVectorSplatConstant(-1, 2);

    XMVECTOR Exp1 = XMVectorMultiply(XMVectorAdd(LogPrev1, LogNext1), NegQuarter);
    XMVECTOR Exp2 = XMVectorMultiply(XMVectorAdd(LogPrev2, LogNext2), NegQuarter);
    Exp1 = XMQuaternionExp(Exp1);
    Exp2 = XMQuaternionExp(Exp2);

    // Outputs are written only after every input has been consumed.
    *pA = XMQuaternionMultiply(Q1, Exp1);
    *pB = XMQuaternionMultiply(SQ2, Exp2);
    *pC = SQ2;
}

//------------------------------------------------------------------------------

// Barycentric blend of three quaternions using scalar weights f and g.
//
// With s = f + g: when s lies strictly inside (-0.00001, 0.00001) Q0 is
// returned; otherwise the result is
//   slerp( slerp(Q0, Q1, s), slerp(Q0, Q2, s), g / s ).
XMFINLINE XMVECTOR XMQuaternionBaryCentric
(
    FXMVECTOR Q0,
    FXMVECTOR Q1,
    FXMVECTOR Q2,
    FLOAT    f,
    FLOAT    g
)
{
    const FLOAT WeightSum = f + g;

    // Weights cancel out: nothing to blend towards, and g / s would be unusable.
    if ((WeightSum < 0.00001f) && (WeightSum > -0.00001f))
    {
        return Q0;
    }

    XMVECTOR Toward1 = XMQuaternionSlerp(Q0, Q1, WeightSum);
    XMVECTOR Toward2 = XMQuaternionSlerp(Q0, Q2, WeightSum);

    return XMQuaternionSlerp(Toward1, Toward2, g / WeightSum);
}

//------------------------------------------------------------------------------

// Vector-parameter barycentric blend of three quaternions.
//
// F and G must each hold one value replicated across all four lanes
// (asserted). With S = F + G: when XMVector4InBounds(S, Epsilon) reports S
// within the bound Epsilon = XMVectorSplatConstant(1, 16), Q0 is returned;
// otherwise the result is
//   slerp( slerp(Q0, Q1, S), slerp(Q0, Q2, S), G * (1 / S) ).
XMFINLINE XMVECTOR XMQuaternionBaryCentricV
(
    FXMVECTOR Q0,
    FXMVECTOR Q1,
    FXMVECTOR Q2,
    CXMVECTOR F,
    CXMVECTOR G
)
{
    XMASSERT( (XMVectorGetY(F) == XMVectorGetX(F)) && (XMVectorGetZ(F) == XMVectorGetX(F)) && (XMVectorGetW(F) == XMVectorGetX(F)) );
    XMASSERT( (XMVectorGetY(G) == XMVectorGetX(G)) && (XMVectorGetZ(G) == XMVectorGetX(G)) && (XMVectorGetW(G) == XMVectorGetX(G)) );

    XMVECTOR Bound = XMVectorSplatConstant(1, 16);

    XMVECTOR WeightSum = XMVectorAdd(F, G);

    // Weights cancel out: return the first quaternion untouched.
    if (XMVector4InBounds(WeightSum, Bound))
    {
        return Q0;
    }

    XMVECTOR Toward1 = XMQuaternionSlerpV(Q0, Q1, WeightSum);
    XMVECTOR Toward2 = XMQuaternionSlerpV(Q0, Q2, WeightSum);

    // G * (1 / S)
    XMVECTOR Ratio = XMVectorReciprocal(WeightSum);
    Ratio = XMVectorMultiply(G, Ratio);

    return XMQuaternionSlerpV(Toward1, Toward2, Ratio);
}

//------------------------------------------------------------------------------
// Transformation operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Returns the library constant g_XMIdentityR3 as the identity quaternion.
XMFINLINE XMVECTOR XMQuaternionIdentity()
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR Identity = g_XMIdentityR3.v;
    return Identity;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR Identity = g_XMIdentityR3;
    return Identity;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Scalar front end for XMQuaternionRotationRollPitchYawFromVector: packs the
// angles as <Pitch, Yaw, Roll, 0> and forwards the call.
XMFINLINE XMVECTOR XMQuaternionRotationRollPitchYaw
(
    FLOAT Pitch,
    FLOAT Yaw,
    FLOAT Roll
)
{
    return XMQuaternionRotationRollPitchYawFromVector(XMVectorSet(Pitch, Yaw, Roll, 0.0f));
}

//------------------------------------------------------------------------------

// Builds a quaternion from the angles packed as <Pitch, Yaw, Roll, 0>.
//
// With s* / c* the sine / cosine of each half angle, the permutes produce
//   P0 = (sP, cP, cP, cP)   Y0 = (cY, sY, cY, cY)   R0 = (cR, cR, sR, cR)
//   P1 = (cP, sP, sP, sP)   Y1 = (sY, cY, sY, sY)   R1 = (sR, sR, cR, sR)
// and the result is  P0 * Y0 * R0 + (P1 * Sign) * Y1 * R1
// with Sign = (1, -1, -1, 1).
XMFINLINE XMVECTOR XMQuaternionRotationRollPitchYawFromVector
(
    FXMVECTOR Angles // <Pitch, Yaw, Roll, 0>
)
{
#if defined(_XM_NO_INTRINSICS_)

    static CONST XMVECTORU32 ControlPitch = {XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X};
    static CONST XMVECTORU32 ControlYaw = {XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y};
    static CONST XMVECTORU32 ControlRoll = {XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z};
    static CONST XMVECTOR   Sign = {1.0f, -1.0f, -1.0f, 1.0f};

    // Sine and cosine of every half angle
    XMVECTOR Half = XMVectorMultiply(Angles, g_XMOneHalf.v);
    XMVECTOR SinHalf, CosHalf;
    XMVectorSinCos(&SinHalf, &CosHalf, Half);

    // First product: one sine lane per axis, cosines elsewhere
    XMVECTOR P0 = XMVectorPermute(SinHalf, CosHalf, ControlPitch.v);
    XMVECTOR Y0 = XMVectorPermute(SinHalf, CosHalf, ControlYaw.v);
    XMVECTOR R0 = XMVectorPermute(SinHalf, CosHalf, ControlRoll.v);
    // Second product: roles of sine and cosine swapped
    XMVECTOR P1 = XMVectorPermute(CosHalf, SinHalf, ControlPitch.v);
    XMVECTOR Y1 = XMVectorPermute(CosHalf, SinHalf, ControlYaw.v);
    XMVECTOR R1 = XMVectorPermute(CosHalf, SinHalf, ControlRoll.v);

    XMVECTOR TermA = XMVectorMultiply(P0, Y0);
    TermA = XMVectorMultiply(TermA, R0);

    XMVECTOR TermB = XMVectorMultiply(P1, Sign);
    TermB = XMVectorMultiply(TermB, Y1);

    return XMVectorMultiplyAdd(TermB, R1, TermA);

#elif defined(_XM_SSE_INTRINSICS_)
    static CONST XMVECTORI32 ControlPitch = {XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X};
    static CONST XMVECTORI32 ControlYaw = {XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y};
    static CONST XMVECTORI32 ControlRoll = {XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z};
    static CONST XMVECTORF32 Sign = {1.0f, -1.0f, -1.0f, 1.0f};

    // Sine and cosine of every half angle
    XMVECTOR Half = _mm_mul_ps(Angles, g_XMOneHalf);
    XMVECTOR SinHalf, CosHalf;
    XMVectorSinCos(&SinHalf, &CosHalf, Half);

    // First product: one sine lane per axis, cosines elsewhere
    XMVECTOR P0 = XMVectorPermute(SinHalf, CosHalf, ControlPitch);
    XMVECTOR Y0 = XMVectorPermute(SinHalf, CosHalf, ControlYaw);
    XMVECTOR R0 = XMVectorPermute(SinHalf, CosHalf, ControlRoll);
    // Second product: roles of sine and cosine swapped
    XMVECTOR P1 = XMVectorPermute(CosHalf, SinHalf, ControlPitch);
    XMVECTOR Y1 = XMVectorPermute(CosHalf, SinHalf, ControlYaw);
    XMVECTOR R1 = XMVectorPermute(CosHalf, SinHalf, ControlRoll);

    XMVECTOR TermA = _mm_mul_ps(P0, Y0);
    TermA = _mm_mul_ps(TermA, R0);

    XMVECTOR TermB = _mm_mul_ps(P1, Sign);
    TermB = _mm_mul_ps(TermB, Y1);
    TermB = _mm_mul_ps(TermB, R1);

    return _mm_add_ps(TermB,TermA);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Builds a rotation quaternion from an axis and an angle:
//   xyz = NormalAxis.xyz * sin(Angle / 2),  w = cos(Angle / 2).
// The axis is used as given; no normalization is performed here.
XMFINLINE XMVECTOR XMQuaternionRotationNormal
(
    FXMVECTOR NormalAxis,
    FLOAT    Angle
)
{
#if defined(_XM_NO_INTRINSICS_)

    // (sin, sin, sin, cos) of the half angle
    XMVECTOR SinCos;
    XMScalarSinCos(&SinCos.vector4_f32[2], &SinCos.vector4_f32[3], 0.5f * Angle);
    SinCos.vector4_f32[1] = SinCos.vector4_f32[2];
    SinCos.vector4_f32[0] = SinCos.vector4_f32[1];

    // Axis in x, y, z with w taken from g_XMOne
    XMVECTOR AxisW1 = XMVectorSelect(g_XMOne.v, NormalAxis, g_XMSelect1110.v);

    return XMVectorMultiply(AxisW1, SinCos);

#elif defined(_XM_SSE_INTRINSICS_)
    // Axis in x, y, z; w replaced by the w lane of g_XMIdentityR3
    XMVECTOR vAxisW1 = _mm_and_ps(NormalAxis,g_XMMask3);
    vAxisW1 = _mm_or_ps(vAxisW1,g_XMIdentityR3);
    // Sine and cosine of the half angle in every lane
    XMVECTOR vHalfAngle = _mm_set_ps1(0.5f * Angle);
    XMVECTOR vSinHalf;
    XMVECTOR vCosHalf;
    XMVectorSinCos(&vSinHalf,&vCosHalf,vHalfAngle);
    // Merge into (sin, sin, sin, cos)
    XMVECTOR vSinCos = _mm_and_ps(vSinHalf,g_XMMask3);
    vCosHalf = _mm_and_ps(vCosHalf,g_XMMaskW);
    vSinCos = _mm_or_ps(vSinCos,vCosHalf);
    return _mm_mul_ps(vAxisW1,vSinCos);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Builds a rotation quaternion from an arbitrary axis and an angle by
// normalizing the axis with XMVector3Normalize and forwarding to
// XMQuaternionRotationNormal. The axis must not be zero or infinite (asserted).
XMFINLINE XMVECTOR XMQuaternionRotationAxis
(
    FXMVECTOR Axis,
    FLOAT    Angle
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(!XMVector3Equal(Axis, XMVectorZero()));
    XMASSERT(!XMVector3IsInfinite(Axis));

    return XMQuaternionRotationNormal(XMVector3Normalize(Axis), Angle);

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(!XMVector3Equal(Axis, XMVectorZero()));
    XMASSERT(!XMVector3IsInfinite(Axis));

    return XMQuaternionRotationNormal(XMVector3Normalize(Axis), Angle);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Builds a quaternion from the upper-left 3x3 part of matrix M (rows r[0]..r[2]).
//
// The routine is branch-free: it computes one candidate value per quaternion
// component from the matrix diagonal, compares those candidates to decide
// which component to take directly from the diagonal, and derives the other
// three from sums/differences of off-diagonal elements scaled by the chosen
// component. The choice is encoded in three selected vectors (a splat
// control, a sign vector and a final permute control).
// NOTE(review): presumably M is expected to be a pure rotation matrix - confirm
// against the library documentation.
XMINLINE XMVECTOR XMQuaternionRotationMatrix
(
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)

    XMVECTOR Q0, Q1, Q2;
    XMVECTOR M00, M11, M22;
    XMVECTOR CQ0, CQ1, C;
    XMVECTOR CX, CY, CZ, CW;
    XMVECTOR SQ1, Scale;
    XMVECTOR Rsq, Sqrt, VEqualsNaN;
    XMVECTOR A, B, P;
    XMVECTOR PermuteSplat, PermuteSplatT;
    XMVECTOR SignB, SignBT;
    XMVECTOR PermuteControl, PermuteControlT;
    XMVECTOR Result;
    static CONST XMVECTORF32 OneQuarter = {0.25f, 0.25f, 0.25f, 0.25f};
    // Sign patterns; P = +1, N = -1, listed in x, y, z, w order.
    static CONST XMVECTORF32 SignPNNP = {1.0f, -1.0f, -1.0f, 1.0f};
    static CONST XMVECTORF32 SignNPNP = {-1.0f, 1.0f, -1.0f, 1.0f};
    static CONST XMVECTORF32 SignNNPP = {-1.0f, -1.0f, 1.0f, 1.0f};
    static CONST XMVECTORF32 SignPNPP = {1.0f, -1.0f, 1.0f, 1.0f};
    static CONST XMVECTORF32 SignPPNP = {1.0f, 1.0f, -1.0f, 1.0f};
    static CONST XMVECTORF32 SignNPPP = {-1.0f, 1.0f, 1.0f, 1.0f};
    // Permute controls; 0* selects from the first operand, 1* from the second.
    static CONST XMVECTORU32 Permute0X0X0Y0W = {XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0W};
    static CONST XMVECTORU32 Permute0Y0Z0Z1W = {XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_1W};
    static CONST XMVECTORU32 SplatX = {XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X};
    static CONST XMVECTORU32 SplatY = {XM_PERMUTE_0Y, XM_PERMUTE_0Y, XM_PERMUTE_0Y, XM_PERMUTE_0Y};
    static CONST XMVECTORU32 SplatZ = {XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Z};
    static CONST XMVECTORU32 SplatW = {XM_PERMUTE_0W, XM_PERMUTE_0W, XM_PERMUTE_0W, XM_PERMUTE_0W};
    static CONST XMVECTORU32 PermuteC = {XM_PERMUTE_0X, XM_PERMUTE_0Z, XM_PERMUTE_1X, XM_PERMUTE_1Y};
    static CONST XMVECTORU32 PermuteA = {XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Z, XM_PERMUTE_0W};
    static CONST XMVECTORU32 PermuteB = {XM_PERMUTE_1X, XM_PERMUTE_1W, XM_PERMUTE_0Z, XM_PERMUTE_0W};
    static CONST XMVECTORU32 Permute0 = {XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1Z, XM_PERMUTE_1Y};
    static CONST XMVECTORU32 Permute1 = {XM_PERMUTE_1X, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Z};
    static CONST XMVECTORU32 Permute2 = {XM_PERMUTE_1Z, XM_PERMUTE_1Y, XM_PERMUTE_0Z, XM_PERMUTE_1X};
    static CONST XMVECTORU32 Permute3 = {XM_PERMUTE_1Y, XM_PERMUTE_1Z, XM_PERMUTE_1X, XM_PERMUTE_0W};

    // Broadcast the three diagonal elements.
    M00 = XMVectorSplatX(M.r[0]);
    M11 = XMVectorSplatY(M.r[1]);
    M22 = XMVectorSplatZ(M.r[2]);

    // Q0 = ( M00 - M11 - M22,
    //       -M00 + M11 - M22,
    //       -M00 - M11 + M22,
    //        M00 + M11 + M22)
    Q0 = XMVectorMultiply(SignPNNP.v, M00);
    Q0 = XMVectorMultiplyAdd(SignNPNP.v, M11, Q0);
    Q0 = XMVectorMultiplyAdd(SignNNPP.v, M22, Q0);

    Q1 = XMVectorAdd(Q0, g_XMOne.v);

    // sqrt(Q1) computed as Q1 * rsqrt(Q1); lanes where the reciprocal square
    // root is NaN keep the unmodified Q1 value instead.
    Rsq = XMVectorReciprocalSqrt(Q1);
    VEqualsNaN = XMVectorIsNaN(Rsq);
    Sqrt = XMVectorMultiply(Q1, Rsq);
    Q1 = XMVectorSelect(Sqrt, Q1, VEqualsNaN);

    // Q1 = 0.5 * sqrt(1 + Q0): the directly computed candidate for each component.
    Q1 = XMVectorMultiply(Q1, g_XMOneHalf.v);

    // SQ1 = 0.5 / sqrt(1 + Q0): scale applied to the off-diagonal combinations.
    SQ1 = XMVectorMultiply(Rsq, g_XMOneHalf.v);

    // C = (Q0.x >= Q0.y, Q0.x >= Q0.z, Q0.y >= Q0.z, Q0.w >= g_XMEpsilon.w)
    CQ0 = XMVectorPermute(Q0, Q0, Permute0X0X0Y0W.v);
    CQ1 = XMVectorPermute(Q0, g_XMEpsilon.v, Permute0Y0Z0Z1W.v);
    C = XMVectorGreaterOrEqual(CQ0, CQ1);

    CX = XMVectorSplatX(C);
    CY = XMVectorSplatY(C);
    CZ = XMVectorSplatZ(C);
    CW = XMVectorSplatW(C);

    // Selection cascade. Each stage picks the second operand where its
    // comparison mask is set and keeps the first operand otherwise.
    // Stage 1: Y settings where Q0.y >= Q0.z, Z settings otherwise.
    PermuteSplat = XMVectorSelect(SplatZ.v, SplatY.v, CZ);
    SignB = XMVectorSelect(SignNPPP.v, SignPPNP.v, CZ);
    PermuteControl = XMVectorSelect(Permute2.v, Permute1.v, CZ);

    // Stage 2: where Q0.x >= Q0.y, provisionally switch to the Z settings.
    PermuteSplat = XMVectorSelect(PermuteSplat, SplatZ.v, CX);
    SignB = XMVectorSelect(SignB, SignNPPP.v, CX);
    PermuteControl = XMVectorSelect(PermuteControl, Permute2.v, CX);

    // Stage 3: temporaries holding the X settings where Q0.x >= Q0.z.
    PermuteSplatT = XMVectorSelect(PermuteSplat,SplatX.v, CY);
    SignBT = XMVectorSelect(SignB, SignPNPP.v, CY);
    PermuteControlT = XMVectorSelect(PermuteControl,Permute0.v, CY);

    // Stage 4: adopt the stage-3 temporaries where Q0.x >= Q0.y.
    PermuteSplat = XMVectorSelect(PermuteSplat, PermuteSplatT, CX);
    SignB = XMVectorSelect(SignB, SignBT, CX);
    PermuteControl = XMVectorSelect(PermuteControl, PermuteControlT, CX);

    // Stage 5: the W settings override everything where Q0.w >= epsilon.
    PermuteSplat = XMVectorSelect(PermuteSplat,SplatW.v, CW);
    SignB = XMVectorSelect(SignB, g_XMNegativeOne.v, CW);
    PermuteControl = XMVectorSelect(PermuteControl,Permute3.v, CW);

    // Broadcast the scale factor that belongs to the chosen component.
    Scale = XMVectorPermute(SQ1, SQ1, PermuteSplat);

    P = XMVectorPermute(M.r[1], M.r[2],PermuteC.v);  // {M10, M12, M20, M21}
    A = XMVectorPermute(M.r[0], P, PermuteA.v);       // {M01, M12, M20, M03}
    B = XMVectorPermute(M.r[0], P, PermuteB.v);       // {M10, M21, M02, M03}

    // Q2 = (A + SignB * B) * Scale: the three components derived from
    // off-diagonal elements.
    Q2 = XMVectorMultiplyAdd(SignB, B, A);
    Q2 = XMVectorMultiply(Q2, Scale);

    // Merge: the chosen component comes from Q1, the remaining three from Q2.
    Result = XMVectorPermute(Q1, Q2, PermuteControl);

    return Result;

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Conversion operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Extracts an axis and an angle from quaternion Q.
//
// *pAxis receives a copy of Q (all four components, not normalized here) and
// *pAngle receives 2 * acos(Q.w). Both pointers must be non-null (asserted).
XMFINLINE VOID XMQuaternionToAxisAngle
(
    XMVECTOR* pAxis,
    FLOAT*    pAngle,
    FXMVECTOR  Q
)
{
    XMASSERT(pAxis);
    XMASSERT(pAngle);

    *pAxis = Q;

    // The SSE build uses the C runtime acosf; other builds use XMScalarACos.
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    const FLOAT HalfAngle = acosf(XMVectorGetW(Q));
#else
    const FLOAT HalfAngle = XMScalarACos(XMVectorGetW(Q));
#endif
    *pAngle = 2.0f * HalfAngle;
}

/****************************************************************************
 *
 * Plane
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
// Comparison operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Compares two planes by forwarding both operands to XMVector4Equal and
// returning its result unchanged.
XMFINLINE BOOL XMPlaneEqual
(
    FXMVECTOR P1,
    FXMVECTOR P2
)
{
    const BOOL bMatch = XMVector4Equal(P1, P2);
    return bMatch;
}

//------------------------------------------------------------------------------

// Tolerance-based plane comparison: both planes are first passed through
// XMPlaneNormalize, then compared with XMVector4NearEqual using Epsilon.
XMFINLINE BOOL XMPlaneNearEqual
(
    FXMVECTOR P1,
    FXMVECTOR P2,
    FXMVECTOR Epsilon
)
{
    return XMVector4NearEqual(XMPlaneNormalize(P1), XMPlaneNormalize(P2), Epsilon);
}

//------------------------------------------------------------------------------

// Tests two planes for inequality by forwarding both operands to
// XMVector4NotEqual and returning its result unchanged.
XMFINLINE BOOL XMPlaneNotEqual
(
    FXMVECTOR P1,
    FXMVECTOR P2
)
{
    const BOOL bDiffers = XMVector4NotEqual(P1, P2);
    return bDiffers;
}

//------------------------------------------------------------------------------

// NaN test for a plane; the check itself is performed by XMVector4IsNaN on
// the full four-component vector.
XMFINLINE BOOL XMPlaneIsNaN
(
    FXMVECTOR P
)
{
    const BOOL bHasNaN = XMVector4IsNaN(P);
    return bHasNaN;
}

//------------------------------------------------------------------------------

// Infinity test for a plane; forwards the plane coefficients to
// XMVector4IsInfinite.
XMFINLINE BOOL XMPlaneIsInfinite
(
    FXMVECTOR P
)
{
    const BOOL bHasInfinity = XMVector4IsInfinite(P);
    return bHasInfinity;
}

//------------------------------------------------------------------------------
// Computation operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Four-component dot product of plane P with vector V:
//   P.x*V.x + P.y*V.y + P.z*V.z + P.w*V.w
// The scalar build forwards to XMVector4Dot; the SSE build computes the sum
// with shuffles and returns it broadcast to every component.
XMFINLINE XMVECTOR XMPlaneDot
(
    FXMVECTOR P,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    return XMVector4Dot(P, V);

#elif defined(_XM_SSE_INTRINSICS_)
    // Per-component products (Px*Vx, Py*Vy, Pz*Vz, Pw*Vw)
    __m128 vProducts = _mm_mul_ps(P,V);
    // Place the x and y products in the z and w lanes (x/y lanes are don't-care)
    __m128 vPairs = _mm_shuffle_ps(V,vProducts,_MM_SHUFFLE(1,0,0,0));
    // z = x+z, w = y+w
    vPairs = _mm_add_ps(vPairs,vProducts);
    // Bring the (y+w) partial sum into the z lane
    vProducts = _mm_shuffle_ps(vProducts,vPairs,_MM_SHUFFLE(0,3,0,0));
    // z = (y+w) + (x+z)
    vProducts = _mm_add_ps(vProducts,vPairs);
    // Broadcast the finished sum from the z lane
    return _mm_shuffle_ps(vProducts,vProducts,_MM_SHUFFLE(2,2,2,2));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Dot product of plane P with the point V, treating V.w as 1:
//   P.x*V.x + P.y*V.y + P.z*V.z + P.w
// V's own w component is ignored.
XMFINLINE XMVECTOR XMPlaneDotCoord
(
    FXMVECTOR P,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    // Build (V.x, V.y, V.z, 1) and take the full four-component dot product
    XMVECTOR vPoint = XMVectorSelect(g_XMOne.v, V, g_XMSelect1110.v);

    return XMVector4Dot(P, vPoint);

#elif defined(_XM_SSE_INTRINSICS_)
    // Clear V.w, then OR in (0,0,0,1) to obtain (V.x, V.y, V.z, 1)
    XMVECTOR vPoint = _mm_and_ps(V,g_XMMask3);
    vPoint = _mm_or_ps(vPoint,g_XMIdentityR3);
    // Per-component products
    XMVECTOR vProducts = _mm_mul_ps(P,vPoint);
    // Place the x and y products in the z and w lanes (x/y lanes are don't-care)
    XMVECTOR vPairs = _mm_shuffle_ps(vPoint,vProducts,_MM_SHUFFLE(1,0,0,0));
    // z = x+z, w = y+w
    vPairs = _mm_add_ps(vPairs,vProducts);
    // Bring the (y+w) partial sum into the z lane
    vProducts = _mm_shuffle_ps(vProducts,vPairs,_MM_SHUFFLE(0,3,0,0));
    // z = (y+w) + (x+z)
    vProducts = _mm_add_ps(vProducts,vPairs);
    // Broadcast the finished sum from the z lane
    return _mm_shuffle_ps(vProducts,vProducts,_MM_SHUFFLE(2,2,2,2));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Dot product of the plane's normal part with V; forwards P and V to
// XMVector3Dot, so only the x, y and z components take part.
XMFINLINE XMVECTOR XMPlaneDotNormal
(
    FXMVECTOR P,
    FXMVECTOR V
)
{
    const XMVECTOR vDot = XMVector3Dot(P, V);
    return vDot;
}

//------------------------------------------------------------------------------
// XMPlaneNormalizeEst scales all four plane coefficients by the reciprocal
// length of the plane normal (x,y,z). It uses a reciprocal estimate and
// returns QNaN on zero and infinite vectors.

XMFINLINE XMVECTOR XMPlaneNormalizeEst
(
    FXMVECTOR P
)
{
#if defined(_XM_NO_INTRINSICS_)

    // 1/|normal| from the three-component reciprocal-length helper
    XMVECTOR vRcpLength = XMVector3ReciprocalLength(P);

    return XMVectorMultiply(P, vRcpLength);

#elif defined(_XM_SSE_INTRINSICS_)
    // Squares of every component
    XMVECTOR vSquares = _mm_mul_ps(P,P);
    // Lane x = y*y, lane y = z*z
    XMVECTOR vYZ = _mm_shuffle_ps(vSquares,vSquares,_MM_SHUFFLE(2,1,2,1));
    // Lane x = x*x + y*y
    XMVECTOR vLengthSq = _mm_add_ss(vSquares,vYZ);
    // Every lane = z*z
    vYZ = _mm_shuffle_ps(vYZ,vYZ,_MM_SHUFFLE(1,1,1,1));
    // Lane x = (x*x + y*y) + z*z
    vLengthSq = _mm_add_ss(vLengthSq,vYZ);
    // Broadcast the squared length to all lanes
    vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0));
    // Approximate 1/sqrt(length squared)
    XMVECTOR vRcpLength = _mm_rsqrt_ps(vLengthSq);
    // Scale the plane by the reciprocal length
    return _mm_mul_ps(vRcpLength,P);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Scales all four plane coefficients by 1/|normal|, where the length is
// taken over x, y and z only.
//   Scalar path: a zero-length normal leaves the scale at zero, so the
//                result is all zeros rather than a division by zero.
//   SSE path:    the result is masked to zero when the squared length
//                compares equal to infinity.
XMFINLINE XMVECTOR XMPlaneNormalize
(
    FXMVECTOR P
)
{
#if defined(_XM_NO_INTRINSICS_)
    const FLOAT fX = P.vector4_f32[0];
    const FLOAT fY = P.vector4_f32[1];
    const FLOAT fZ = P.vector4_f32[2];
    const FLOAT fLength = sqrtf((fX*fX)+(fY*fY)+(fZ*fZ));
    // Only invert a non-zero length; a zero length is used as the scale as-is
    const FLOAT fScale = (fLength != 0.0f) ? (1.0f/fLength) : fLength;
    XMVECTOR vResult = {
        fX*fScale,
        fY*fScale,
        fZ*fScale,
        P.vector4_f32[3]*fScale
    };
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    // Squared length of (x,y,z), accumulated in lane x
    XMVECTOR vSquares = _mm_mul_ps(P,P);
    XMVECTOR vYZ = _mm_shuffle_ps(vSquares,vSquares,_MM_SHUFFLE(2,1,2,1));
    XMVECTOR vLengthSq = _mm_add_ss(vSquares,vYZ);
    vYZ = _mm_shuffle_ps(vYZ,vYZ,_MM_SHUFFLE(1,1,1,1));
    vLengthSq = _mm_add_ss(vLengthSq,vYZ);
    // Broadcast the squared length to all lanes
    vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0));
    // Actual length, used as the divisor
    XMVECTOR vLength = _mm_sqrt_ps(vLengthSq);
    // All-ones mask where the squared length is not infinity
    XMVECTOR vFiniteMask = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
    // Normalize with a true division
    XMVECTOR vResult = _mm_div_ps(P,vLength);
    // Zero the result where the squared length was infinity
    return _mm_and_ps(vResult,vFiniteMask);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Intersects plane P with the line through LinePoint1 and LinePoint2.
//   d      = dot3(P, LinePoint1) - dot3(P, LinePoint2)
//   t      = XMPlaneDotCoord(P, LinePoint1) / d
//   result = LinePoint1 + t * (LinePoint2 - LinePoint1)
// Components where d is within g_XMEpsilon of zero are replaced by QNaN.
XMFINLINE XMVECTOR XMPlaneIntersectLine
(
    FXMVECTOR P,
    FXMVECTOR LinePoint1,
    FXMVECTOR LinePoint2
)
{
#if defined(_XM_NO_INTRINSICS_)

    // Denominator: difference of the two normal projections
    XMVECTOR vDot1 = XMVector3Dot(P, LinePoint1);
    XMVECTOR vDot2 = XMVector3Dot(P, LinePoint2);
    XMVECTOR vDenominator = XMVectorSubtract(vDot1, vDot2);

    // Parametric position of the hit along the line
    XMVECTOR vRcpDenominator = XMVectorReciprocal(vDenominator);
    XMVECTOR vT = XMVectorMultiply(XMPlaneDotCoord(P, LinePoint1), vRcpDenominator);

    // LinePoint1 + t * (LinePoint2 - LinePoint1)
    XMVECTOR vHit = XMVectorMultiplyAdd(XMVectorSubtract(LinePoint2, LinePoint1), vT, LinePoint1);

    // Select QNaN wherever the denominator is (nearly) zero
    XMVECTOR vDegenerate = XMVectorNearEqual(vDenominator, XMVectorZero(), g_XMEpsilon.v);

    return XMVectorSelect(vHit, g_XMQNaN.v, vDegenerate);

#elif defined(_XM_SSE_INTRINSICS_)
    // Denominator: difference of the two normal projections
    XMVECTOR vDot1 = XMVector3Dot(P, LinePoint1);
    XMVECTOR vDot2 = XMVector3Dot(P, LinePoint2);
    XMVECTOR vDenominator = _mm_sub_ps(vDot1, vDot2);

    // Parametric position of the hit along the line
    XMVECTOR vT = XMPlaneDotCoord(P, LinePoint1);
    vT = _mm_div_ps(vT, vDenominator);

    // LinePoint1 + t * (LinePoint2 - LinePoint1)
    XMVECTOR vHit = _mm_sub_ps(LinePoint2, LinePoint1);
    vHit = _mm_mul_ps(vHit,vT);
    vHit = _mm_add_ps(vHit,LinePoint1);

    // Select QNaN wherever the denominator is (nearly) zero
    XMVECTOR vDegenerate = XMVectorNearEqual(vDenominator, XMVectorZero(), g_XMEpsilon);
    return XMVectorSelect(vHit, g_XMQNaN, vDegenerate);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Computes two points on the line where planes P1 and P2 intersect.
//   direction    = cross3(P2, P1)
//   *pLinePoint1 = (cross3(P2, direction) * P1.w + cross3(direction, P1) * P2.w)
//                  / |direction|^2
//   *pLinePoint2 = *pLinePoint1 + direction
// When |direction|^2 <= g_XMEpsilon both outputs are set to QNaN.
// Both output pointers must be non-null (checked with XMASSERT).
XMINLINE VOID XMPlaneIntersectPlane
(
    XMVECTOR* pLinePoint1,
    XMVECTOR* pLinePoint2,
    FXMVECTOR  P1,
    FXMVECTOR  P2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR vDirection;
    XMVECTOR vDirLengthSq;
    XMVECTOR vNumerator;
    XMVECTOR vFirstPoint;
    XMVECTOR vSecondPoint;
    XMVECTOR vDegenerate;

    XMASSERT(pLinePoint1);
    XMASSERT(pLinePoint2);

    // Direction of the intersection line and its squared length
    vDirection = XMVector3Cross(P2, P1);
    vDirLengthSq = XMVector3LengthSq(vDirection);

    // Numerator of the point on the line, built from both plane distances
    vNumerator = XMVectorMultiply(XMVector3Cross(P2, vDirection), XMVectorSplatW(P1));
    vNumerator = XMVectorMultiplyAdd(XMVector3Cross(vDirection, P1), XMVectorSplatW(P2), vNumerator);

    vFirstPoint = XMVectorMultiply(vNumerator, XMVectorReciprocal(vDirLengthSq));
    vSecondPoint = XMVectorAdd(vFirstPoint, vDirection);

    // A (near) zero direction yields QNaN outputs
    vDegenerate = XMVectorLessOrEqual(vDirLengthSq, g_XMEpsilon.v);
    *pLinePoint1 = XMVectorSelect(vFirstPoint,g_XMQNaN.v, vDegenerate);
    *pLinePoint2 = XMVectorSelect(vSecondPoint,g_XMQNaN.v, vDegenerate);

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pLinePoint1);
    XMASSERT(pLinePoint2);

    // Direction of the intersection line and its squared length
    XMVECTOR vDirection = XMVector3Cross(P2, P1);
    XMVECTOR vDirLengthSq = XMVector3LengthSq(vDirection);

    // cross3(P2, direction) * P1.w
    XMVECTOR vNumerator = XMVector3Cross(P2, vDirection);
    vNumerator = _mm_mul_ps(vNumerator, _mm_shuffle_ps(P1,P1,_MM_SHUFFLE(3,3,3,3)));

    // cross3(direction, P1) * P2.w
    XMVECTOR vSecondTerm = XMVector3Cross(vDirection, P1);
    vSecondTerm = _mm_mul_ps(vSecondTerm, _mm_shuffle_ps(P2,P2,_MM_SHUFFLE(3,3,3,3)));

    vNumerator = _mm_add_ps(vNumerator,vSecondTerm);
    XMVECTOR vFirstPoint = _mm_div_ps(vNumerator,vDirLengthSq);
    XMVECTOR vSecondPoint = _mm_add_ps(vFirstPoint, vDirection);

    // A (near) zero direction yields QNaN outputs
    XMVECTOR vDegenerate = XMVectorLessOrEqual(vDirLengthSq, g_XMEpsilon);
    *pLinePoint1 = XMVectorSelect(vFirstPoint,g_XMQNaN, vDegenerate);
    *pLinePoint2 = XMVectorSelect(vSecondPoint,g_XMQNaN, vDegenerate);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Multiplies the plane P (as a four-component row vector) by matrix M:
//   P.x*M.r[0] + P.y*M.r[1] + P.z*M.r[2] + P.w*M.r[3]
XMFINLINE XMVECTOR XMPlaneTransform
(
    FXMVECTOR P,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_)

    // Accumulate starting from the w term, then z, y and x
    XMVECTOR vSum = XMVectorMultiply(XMVectorSplatW(P), M.r[3]);

    vSum = XMVectorMultiplyAdd(XMVectorSplatZ(P), M.r[2], vSum);
    vSum = XMVectorMultiplyAdd(XMVectorSplatY(P), M.r[1], vSum);
    vSum = XMVectorMultiplyAdd(XMVectorSplatX(P), M.r[0], vSum);

    return vSum;

#elif defined(_XM_SSE_INTRINSICS_)
    // Each plane component broadcast and multiplied by its matrix row
    XMVECTOR vTerm0 = _mm_mul_ps(_mm_shuffle_ps(P,P,_MM_SHUFFLE(0,0,0,0)), M.r[0]);
    XMVECTOR vTerm1 = _mm_mul_ps(_mm_shuffle_ps(P,P,_MM_SHUFFLE(1,1,1,1)), M.r[1]);
    XMVECTOR vTerm2 = _mm_mul_ps(_mm_shuffle_ps(P,P,_MM_SHUFFLE(2,2,2,2)), M.r[2]);
    XMVECTOR vTerm3 = _mm_mul_ps(_mm_shuffle_ps(P,P,_MM_SHUFFLE(3,3,3,3)), M.r[3]);
    // Pairwise sums: (x + z) and (y + w), then the total
    XMVECTOR vEven = _mm_add_ps(vTerm0,vTerm2);
    XMVECTOR vOdd = _mm_add_ps(vTerm1,vTerm3);
    return _mm_add_ps(vEven,vOdd);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Transforms a stream of PlaneCount planes by M. All arguments are passed
// unchanged to XMVector4TransformStream and its return value is returned.
XMFINLINE XMFLOAT4* XMPlaneTransformStream
(
    XMFLOAT4*       pOutputStream,
    size_t          OutputStride,
    CONST XMFLOAT4* pInputStream,
    size_t          InputStride,
    size_t          PlaneCount,
    CXMMATRIX       M
)
{
    XMFLOAT4* pResult = XMVector4TransformStream(pOutputStream, OutputStride,
                                                 pInputStream, InputStride,
                                                 PlaneCount, M);
    return pResult;
}

//------------------------------------------------------------------------------
// Conversion operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Builds a plane from a point on the plane and the plane normal.
// Result = (Normal.x, Normal.y, Normal.z, -dot3(Point, Normal)).
XMFINLINE XMVECTOR XMPlaneFromPointNormal
(
    FXMVECTOR Point,
    FXMVECTOR Normal
)
{
#if defined(_XM_NO_INTRINSICS_)

    // w coefficient: negated projection of the point onto the normal
    XMVECTOR vNegDistance = XMVectorNegate(XMVector3Dot(Point, Normal));

    // x,y,z come from Normal, w from the negated distance
    return XMVectorSelect(vNegDistance, Normal, g_XMSelect1110.v);

#elif defined(_XM_SSE_INTRINSICS_)
    // w coefficient: negated projection of the point onto the normal
    XMVECTOR vNegDistance = XMVector3Dot(Point,Normal);
    vNegDistance = _mm_mul_ps(vNegDistance,g_XMNegativeOne);
    // Keep only x,y,z of the normal and only w of the distance, then merge
    XMVECTOR vNormalXYZ = _mm_and_ps(Normal,g_XMMask3);
    vNegDistance = _mm_and_ps(vNegDistance,g_XMMaskW);
    return _mm_or_ps(vNormalXYZ,vNegDistance);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Builds a plane through three points.
//   N      = normalize(cross3(Point1 - Point2, Point1 - Point3))
//   Result = (N.x, N.y, N.z, -dot3(N, Point1))
XMFINLINE XMVECTOR XMPlaneFromPoints
(
    FXMVECTOR Point1,
    FXMVECTOR Point2,
    FXMVECTOR Point3
)
{
#if defined(_XM_NO_INTRINSICS_)

    // Two edge vectors sharing Point1
    XMVECTOR vEdge21 = XMVectorSubtract(Point1, Point2);
    XMVECTOR vEdge31 = XMVectorSubtract(Point1, Point3);

    // Unit normal of the triangle
    XMVECTOR vNormal = XMVector3Normalize(XMVector3Cross(vEdge21, vEdge31));

    // w coefficient: negated projection of Point1 onto the normal
    XMVECTOR vNegDistance = XMVectorNegate(XMPlaneDotNormal(vNormal, Point1));

    return XMVectorSelect(vNegDistance, vNormal, g_XMSelect1110.v);

#elif defined(_XM_SSE_INTRINSICS_)
    // Two edge vectors sharing Point1
    XMVECTOR vEdge21 = _mm_sub_ps(Point1, Point2);
    XMVECTOR vEdge31 = _mm_sub_ps(Point1, Point3);

    // Unit normal of the triangle
    XMVECTOR vNormal = XMVector3Cross(vEdge21, vEdge31);
    vNormal = XMVector3Normalize(vNormal);

    // w coefficient: negated projection of Point1 onto the normal
    XMVECTOR vNegDistance = XMPlaneDotNormal(vNormal, Point1);
    vNegDistance = _mm_mul_ps(vNegDistance,g_XMNegativeOne);

    // Keep only x,y,z of the normal and only w of the distance, then merge
    vNormal = _mm_and_ps(vNormal,g_XMMask3);
    vNegDistance = _mm_and_ps(vNegDistance,g_XMMaskW);
    return _mm_or_ps(vNegDistance,vNormal);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

/****************************************************************************
 *
 * Color
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
// Comparison operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Equality test for two colors; forwards both to XMVector4Equal.
XMFINLINE BOOL XMColorEqual
(
    FXMVECTOR C1,
    FXMVECTOR C2
)
{
    const BOOL bEqual = XMVector4Equal(C1, C2);
    return bEqual;
}

//------------------------------------------------------------------------------

// Inequality test for two colors; forwards both to XMVector4NotEqual.
XMFINLINE BOOL XMColorNotEqual
(
    FXMVECTOR C1,
    FXMVECTOR C2
)
{
    const BOOL bDifferent = XMVector4NotEqual(C1, C2);
    return bDifferent;
}

//------------------------------------------------------------------------------

// Greater-than test for two colors; forwards both to XMVector4Greater.
XMFINLINE BOOL XMColorGreater
(
    FXMVECTOR C1,
    FXMVECTOR C2
)
{
    const BOOL bGreater = XMVector4Greater(C1, C2);
    return bGreater;
}

//------------------------------------------------------------------------------

// Greater-or-equal test for two colors; forwards both to
// XMVector4GreaterOrEqual.
XMFINLINE BOOL XMColorGreaterOrEqual
(
    FXMVECTOR C1,
    FXMVECTOR C2
)
{
    const BOOL bGreaterOrEqual = XMVector4GreaterOrEqual(C1, C2);
    return bGreaterOrEqual;
}

//------------------------------------------------------------------------------

// Less-than test for two colors; forwards both to XMVector4Less.
XMFINLINE BOOL XMColorLess
(
    FXMVECTOR C1,
    FXMVECTOR C2
)
{
    const BOOL bLess = XMVector4Less(C1, C2);
    return bLess;
}

//------------------------------------------------------------------------------

// Less-or-equal test for two colors; forwards both to XMVector4LessOrEqual.
XMFINLINE BOOL XMColorLessOrEqual
(
    FXMVECTOR C1,
    FXMVECTOR C2
)
{
    const BOOL bLessOrEqual = XMVector4LessOrEqual(C1, C2);
    return bLessOrEqual;
}

//------------------------------------------------------------------------------

// NaN test for a color; forwards the color to XMVector4IsNaN.
XMFINLINE BOOL XMColorIsNaN
(
    FXMVECTOR C
)
{
    const BOOL bHasNaN = XMVector4IsNaN(C);
    return bHasNaN;
}

//------------------------------------------------------------------------------

// Infinity test for a color; forwards the color to XMVector4IsInfinite.
XMFINLINE BOOL XMColorIsInfinite
(
    FXMVECTOR C
)
{
    const BOOL bHasInfinity = XMVector4IsInfinite(C);
    return bHasInfinity;
}

//------------------------------------------------------------------------------
// Computation operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Returns the negative of a color: (1 - x, 1 - y, 1 - z, w).
// The w component is passed through untouched. No range check is performed
// on the input.
XMFINLINE XMVECTOR XMColorNegative
(
    FXMVECTOR vColor
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;

    vResult.vector4_f32[0] = 1.0f - vColor.vector4_f32[0];
    vResult.vector4_f32[1] = 1.0f - vColor.vector4_f32[1];
    vResult.vector4_f32[2] = 1.0f - vColor.vector4_f32[2];
    vResult.vector4_f32[3] = vColor.vector4_f32[3];

    return vResult;

#elif defined(_XM_SSE_INTRINSICS_)
    // Flip the sign of x, y and z (w is left as-is), then add (1,1,1,0)
    XMVECTOR vNegated = _mm_xor_ps(vColor,g_XMNegate3);
    return _mm_add_ps(vNegated,g_XMOne3);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Modulates two colors by forwarding them to XMVectorMultiply.
XMFINLINE XMVECTOR XMColorModulate
(
    FXMVECTOR C1,
    FXMVECTOR C2
)
{
    const XMVECTOR vProduct = XMVectorMultiply(C1, C2);
    return vProduct;
}

//------------------------------------------------------------------------------

// Adjusts the saturation of a color.
//   Luminance = 0.2125*x + 0.7154*y + 0.0721*z
//   Result.xyz = (Color.xyz - Luminance) * fSaturation + Luminance
//   Result.w   = Color.w (unchanged)
XMFINLINE XMVECTOR XMColorAdjustSaturation
(
    FXMVECTOR vColor,
    FLOAT    fSaturation
)
{
#if defined(_XM_NO_INTRINSICS_)
    // Luminance weights for the x, y and z channels
    CONST FLOAT fWeightX = 0.2125f;
    CONST FLOAT fWeightY = 0.7154f;
    CONST FLOAT fWeightZ = 0.0721f;

    FLOAT fLuminance = (vColor.vector4_f32[0]*fWeightX)+(vColor.vector4_f32[1]*fWeightY)+(vColor.vector4_f32[2]*fWeightZ);
    XMVECTOR vResult;

    // Interpolate each channel away from / toward the luminance
    vResult.vector4_f32[0] = ((vColor.vector4_f32[0] - fLuminance)*fSaturation)+fLuminance;
    vResult.vector4_f32[1] = ((vColor.vector4_f32[1] - fLuminance)*fSaturation)+fLuminance;
    vResult.vector4_f32[2] = ((vColor.vector4_f32[2] - fLuminance)*fSaturation)+fLuminance;
    vResult.vector4_f32[3] = vColor.vector4_f32[3];
    return vResult;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f};
    // Weighted channels: (0.2125*x, 0.7154*y, 0.0721*z, 0)
    XMVECTOR vLuma = _mm_mul_ps(vColor,gvLuminance);
    // Scratch lanes: (weighted y, weighted y, weighted z, weighted z)
    XMVECTOR vScratch = _mm_shuffle_ps(vLuma,vLuma,_MM_SHUFFLE(2,2,1,1));
    // Lane x = weighted x + weighted y
    vLuma = _mm_add_ss(vLuma,vScratch);
    // Every scratch lane = weighted z
    vScratch = _mm_shuffle_ps(vScratch,vScratch,_MM_SHUFFLE(2,2,2,2));
    // Lane x = full luminance
    vLuma = _mm_add_ss(vLuma,vScratch);
    // Broadcast the luminance to all lanes
    vLuma = _mm_shuffle_ps(vLuma,vLuma,_MM_SHUFFLE(0,0,0,0));
    // Broadcast the saturation factor
    XMVECTOR vFactor = _mm_set_ps1(fSaturation);
    // ((color - luminance) * saturation) + luminance
    XMVECTOR vAdjusted = _mm_sub_ps(vColor,vLuma);
    vAdjusted = _mm_mul_ps(vAdjusted,vFactor);
    vAdjusted = _mm_add_ps(vAdjusted,vLuma);
    // Restore the source w: first build (adj.z, adj.z, color.z, color.w) ...
    XMVECTOR vZW = _mm_shuffle_ps(vAdjusted,vColor,_MM_SHUFFLE(3,2,2,2));
    // ... then assemble (adj.x, adj.y, adj.z, color.w)
    return _mm_shuffle_ps(vAdjusted,vZW,_MM_SHUFFLE(3,0,1,0));
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Adjusts the contrast of a color around the 0.5 midpoint.
//   Result.xyz = (Color.xyz - 0.5) * fContrast + 0.5
//   Result.w   = Color.w (unchanged)
XMFINLINE XMVECTOR XMColorAdjustContrast
(
    FXMVECTOR vColor,
    FLOAT    fContrast
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;

    vResult.vector4_f32[0] = ((vColor.vector4_f32[0]-0.5f) * fContrast) + 0.5f;
    vResult.vector4_f32[1] = ((vColor.vector4_f32[1]-0.5f) * fContrast) + 0.5f;
    vResult.vector4_f32[2] = ((vColor.vector4_f32[2]-0.5f) * fContrast) + 0.5f;
    // w is copied straight from the source
    vResult.vector4_f32[3] = vColor.vector4_f32[3];
    return vResult;

#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast the contrast factor
    XMVECTOR vFactor = _mm_set_ps1(fContrast);
    // (color - 0.5) * contrast + 0.5, applied to all four lanes
    XMVECTOR vAdjusted = _mm_sub_ps(vColor,g_XMOneHalf);
    vAdjusted = _mm_mul_ps(vAdjusted,vFactor);
    vAdjusted = _mm_add_ps(vAdjusted,g_XMOneHalf);
    // Restore the source w: first build (adj.z, adj.z, color.z, color.w) ...
    XMVECTOR vZW = _mm_shuffle_ps(vAdjusted,vColor,_MM_SHUFFLE(3,2,2,2));
    // ... then assemble (adj.x, adj.y, adj.z, color.w)
    return _mm_shuffle_ps(vAdjusted,vZW,_MM_SHUFFLE(3,0,1,0));
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

/****************************************************************************
 *
 * Miscellaneous
 *
 ****************************************************************************/

//------------------------------------------------------------------------------

// Reports whether the CPU supports the instruction sets this build needs.
// Builds without SSE intrinsics always report TRUE. SSE builds require
// both PF_XMMI_INSTRUCTIONS_AVAILABLE and PF_XMMI64_INSTRUCTIONS_AVAILABLE
// to be reported by IsProcessorFeaturePresent.
XMINLINE BOOL XMVerifyCPUSupport()
{
#if defined(_XM_NO_INTRINSICS_) || !defined(_XM_SSE_INTRINSICS_)
    return TRUE;
#else // _XM_SSE_INTRINSICS_
    // Note that on Windows 2000 or older, SSE2 detection is not supported so this will always fail
    // Detecting SSE2 on older versions of Windows would require using cpuid directly
    BOOL bSupported = IsProcessorFeaturePresent( PF_XMMI_INSTRUCTIONS_AVAILABLE );
    if (bSupported)
    {
        // Only query the second feature when the first one is present
        bSupported = IsProcessorFeaturePresent( PF_XMMI64_INSTRUCTIONS_AVAILABLE );
    }
    return ( bSupported != 0 );
#endif
}


//------------------------------------------------------------------------------

#define XMASSERT_LINE_STRING_SIZE 16

// Reports a failed assertion and breaks into the debugger.
//   pExpression - text of the expression that failed.
//   pFileName   - source file containing the assertion.
//   LineNumber  - source line of the assertion.
// The message is emitted with OutputDebugStringA, or with DbgPrint when
// NO_OUTPUT_DEBUG_STRING is defined; __debugbreak() is then executed.
XMINLINE VOID XMAssert
(
    CONST CHAR* pExpression,
    CONST CHAR* pFileName,
    UINT        LineNumber
)
{
    CHAR        aLineString[XMASSERT_LINE_STRING_SIZE];
    CHAR*       pLineString;
    UINT        Line;

    // Convert LineNumber to decimal text, writing digits right-to-left in
    // front of the terminator. The do/while always emits at least one digit,
    // so a line number of 0 is reported as "0" instead of an empty string.
    // pLineString ends up pointing at the first (most significant) digit.
    pLineString = aLineString + XMASSERT_LINE_STRING_SIZE - 1;
    *pLineString = '\0';
    Line = LineNumber;
    do
    {
        pLineString--;
        *pLineString = (CHAR)('0' + (Line % 10));
        Line /= 10;
    } while (Line != 0 && pLineString > aLineString);

#ifndef NO_OUTPUT_DEBUG_STRING
    OutputDebugStringA("Assertion failed: ");
    OutputDebugStringA(pExpression);
    OutputDebugStringA(", file ");
    OutputDebugStringA(pFileName);
    OutputDebugStringA(", line ");
    OutputDebugStringA(pLineString);
    OutputDebugStringA("\r\n");
#else
    DbgPrint("Assertion failed: %s, file %s, line %d\r\n", pExpression, pFileName, LineNumber);
#endif

    __debugbreak();
}

//------------------------------------------------------------------------------

// Computes the Fresnel term for unpolarized light, per component.
//   Result = 0.5f * (g - c)^2 / (g + c)^2 * ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1)
// where
//   c = CosIncidentAngle
//   g = sqrt(abs(c^2 + RefractionIndex^2 - 1))
// The result is clamped to [0, 1]. CosIncidentAngle must not be infinite
// (checked with XMASSERT).
XMFINLINE XMVECTOR XMFresnelTerm
(
    FXMVECTOR CosIncidentAngle,
    FXMVECTOR RefractionIndex
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR vG;
    XMVECTOR vGPlusC, vGMinusC;
    XMVECTOR vFirstTerm, vRcpSumSq;
    XMVECTOR vNumerator, vDenominator, vSecondTerm;

    XMASSERT(!XMVector4IsInfinite(CosIncidentAngle));

    // g = sqrt(abs(RefractionIndex^2 - 1 + c^2))
    vG = XMVectorMultiplyAdd(RefractionIndex, RefractionIndex, g_XMNegativeOne.v);
    vG = XMVectorMultiplyAdd(CosIncidentAngle, CosIncidentAngle, vG);
    vG = XMVectorSqrt(XMVectorAbs(vG));

    vGPlusC = XMVectorAdd(vG, CosIncidentAngle);
    vGMinusC = XMVectorSubtract(vG, CosIncidentAngle);

    // First factor: 0.5 * (g - c)^2 / (g + c)^2
    vFirstTerm = XMVectorMultiply(vGMinusC, vGMinusC);
    vRcpSumSq = XMVectorReciprocal(XMVectorMultiply(vGPlusC, vGPlusC));
    vFirstTerm = XMVectorMultiply(g_XMOneHalf.v, vFirstTerm);
    vFirstTerm = XMVectorMultiply(vFirstTerm, vRcpSumSq);

    // Second factor: (c*(g + c) - 1)^2 / (c*(g - c) + 1)^2 + 1
    vNumerator = XMVectorMultiplyAdd(CosIncidentAngle, vGPlusC, g_XMNegativeOne.v);
    vDenominator = XMVectorMultiplyAdd(CosIncidentAngle, vGMinusC, g_XMOne.v);
    vNumerator = XMVectorMultiply(vNumerator, vNumerator);
    vDenominator = XMVectorMultiply(vDenominator, vDenominator);
    vDenominator = XMVectorReciprocal(vDenominator);
    vSecondTerm = XMVectorMultiplyAdd(vNumerator, vDenominator, g_XMOne.v);

    // Product of both factors, clamped to [0, 1]
    return XMVectorSaturate(XMVectorMultiply(vFirstTerm, vSecondTerm));

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(!XMVector4IsInfinite(CosIncidentAngle));

    // inner = c^2 + (RefractionIndex^2 - 1)
    XMVECTOR vInner = _mm_mul_ps(RefractionIndex,RefractionIndex);
    XMVECTOR vCosSq = _mm_mul_ps(CosIncidentAngle,CosIncidentAngle);
    vInner = _mm_sub_ps(vInner,g_XMOne);
    vInner = _mm_add_ps(vCosSq,vInner);
    // abs(inner) computed as max(0 - inner, inner); the abs guards the sqrt
    // when refraction and cosine are both zero
    XMVECTOR vG = _mm_setzero_ps();
    vG = _mm_sub_ps(vG,vInner);
    vG = _mm_max_ps(vG,vInner);
    // g = sqrt(abs(inner))
    vG = _mm_sqrt_ps(vG);

    // g + c and g - c
    XMVECTOR vGPlusC = _mm_add_ps(vG,CosIncidentAngle);
    XMVECTOR vGMinusC = _mm_sub_ps(vG,CosIncidentAngle);

    // First factor: (0.5 * (g - c)^2) / (g + c)^2
    XMVECTOR vFirstTerm = _mm_mul_ps(vGMinusC,vGMinusC);
    XMVECTOR vSumSq = _mm_mul_ps(vGPlusC,vGPlusC);
    vFirstTerm = _mm_mul_ps(vFirstTerm,g_XMOneHalf);
    vFirstTerm = _mm_div_ps(vFirstTerm,vSumSq);

    // Second factor: (c*(g + c) - 1)^2 / (c*(g - c) + 1)^2 + 1
    XMVECTOR vNumerator = _mm_mul_ps(vGPlusC,CosIncidentAngle);
    XMVECTOR vDenominator = _mm_mul_ps(vGMinusC,CosIncidentAngle);
    vNumerator = _mm_sub_ps(vNumerator,g_XMOne);
    vDenominator = _mm_add_ps(vDenominator,g_XMOne);
    vNumerator = _mm_mul_ps(vNumerator,vNumerator);
    vDenominator = _mm_mul_ps(vDenominator,vDenominator);
    XMVECTOR vSecondTerm = _mm_div_ps(vNumerator,vDenominator);
    vSecondTerm = _mm_add_ps(vSecondTerm,g_XMOne);

    // Product of both factors, clamped to [0, 1]
    XMVECTOR vResult = _mm_mul_ps(vFirstTerm,vSecondTerm);
    vResult = _mm_max_ps(vResult,g_XMZero);
    vResult = _mm_min_ps(vResult,g_XMOne);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Returns non-zero when S1 and S2 differ by no more than Epsilon, i.e.
// |S1 - S2| <= Epsilon. A NaN difference compares false.
XMFINLINE BOOL XMScalarNearEqual
(
    FLOAT S1,
    FLOAT S2,
    FLOAT Epsilon
)
{
    FLOAT Delta = S1 - S2;
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
    // fabsf clears the sign bit exactly as the former integer mask did, but
    // without reading a FLOAT through a UINT pointer (undefined behavior
    // under strict aliasing rules).
    return (fabsf(Delta) <= Epsilon);
#else
    return (__fabs(Delta) <= Epsilon);
#endif
}

//------------------------------------------------------------------------------
// Wraps the given angle (radians) so that -XM_PI <= Angle < XM_PI.
// Only the non-VMX128 / no-intrinsics build path has an implementation in
// this block.
XMFINLINE FLOAT XMScalarModAngle
(
    FLOAT Angle
)
{
    // Note: the modulo is carried out on the absolute value to work around
    // a precision error on numbers that are close to PI
    float fShifted;
    float fWrapped;
#if defined(_XM_NO_INTRINSICS_) || !defined(_XM_VMX128_INTRINSICS_)
    // Shift by PI so the target interval starts at zero
    fShifted = Angle + XM_PI;
    // Magnitude of the shifted angle, reduced modulo 2*PI by truncation
    fWrapped = fabsf(fShifted);
    fWrapped = fWrapped - (XM_2PI * (FLOAT)((INT)(fWrapped/XM_2PI)));
    // Undo the shift
    fWrapped = fWrapped - XM_PI;
    // A negative shifted angle needs the sign of the result flipped back
    return (fShifted<0.0f) ? -fWrapped : fWrapped;
#else
#endif
}

//------------------------------------------------------------------------------

// Sine of Value (radians).
// The scalar build wraps the angle with XMScalarModAngle and evaluates a
// 12-term odd power series (V^1 .. V^23) against the g_XMSinCoefficients0/1/2
// tables; the SSE build calls sinf.
XMINLINE FLOAT XMScalarSin
(
    FLOAT Value
)
{
#if defined(_XM_NO_INTRINSICS_)

    // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - V^15 / 15! +
    //           V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI)

    const FLOAT fAngle = XMScalarModAngle(Value);
    const FLOAT fAngleSq = fAngle * fAngle;

    // Powers 0..3, then squared to get the even powers 0,2,4,6
    XMVECTOR vPow0123 = XMVectorSet(1.0f, fAngle, fAngleSq, fAngleSq * fAngle);
    XMVECTOR vPow1 = XMVectorSplatY(vPow0123);
    XMVECTOR vPow0246 = XMVectorMultiply(vPow0123, vPow0123);

    // Odd powers 1,3,5,7 and the V^8 step used to reach the higher groups
    XMVECTOR vPow1357 = XMVectorMultiply(vPow0246, vPow1);
    XMVECTOR vPow8 = XMVectorMultiply(XMVectorSplatW(vPow1357), vPow1);
    XMVECTOR vPow9_15 = XMVectorMultiply(vPow1357, vPow8);
    XMVECTOR vPow17_23 = XMVectorMultiply(vPow9_15, vPow8);

    // Weight each group of four powers by its coefficient table
    XMVECTOR vSum0 = XMVector4Dot(vPow1357, g_XMSinCoefficients0.v);
    XMVECTOR vSum1 = XMVector4Dot(vPow9_15, g_XMSinCoefficients1.v);
    XMVECTOR vSum2 = XMVector4Dot(vPow17_23, g_XMSinCoefficients2.v);

    return vSum0.vector4_f32[0] + vSum1.vector4_f32[0] + vSum2.vector4_f32[0];

#elif defined(_XM_SSE_INTRINSICS_)
    return sinf( Value );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Cosine of Value (radians).
// The scalar build wraps the angle with XMScalarModAngle and evaluates a
// 12-term even power series (V^0 .. V^22) against the g_XMCosCoefficients0/1/2
// tables; the SSE build calls cosf.
XMINLINE FLOAT XMScalarCos
(
    FLOAT Value
)
{
#if defined(_XM_NO_INTRINSICS_)

    // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! +
    //           V^12 / 12! - V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI)

    const FLOAT fAngle = XMScalarModAngle(Value);
    const FLOAT fAngleSq = fAngle * fAngle;

    // Powers 0..3, then squared to get the even powers 0,2,4,6
    XMVECTOR vPow0123 = XMVectorSet(1.0f, fAngle, fAngleSq, fAngleSq * fAngle);
    XMVECTOR vPow0246 = XMVectorMultiply(vPow0123, vPow0123);

    // V^8 = V^6 * V^2, used to step to the higher power groups
    XMVECTOR vPow8 = XMVectorMultiply(XMVectorSplatW(vPow0246), XMVectorSplatZ(vPow0123));
    XMVECTOR vPow8_14 = XMVectorMultiply(vPow0246, vPow8);
    XMVECTOR vPow16_22 = XMVectorMultiply(vPow8_14, vPow8);

    // Weight each group of four powers by its coefficient table
    XMVECTOR vSum0 = XMVector4Dot(vPow0246, g_XMCosCoefficients0.v);
    XMVECTOR vSum1 = XMVector4Dot(vPow8_14, g_XMCosCoefficients1.v);
    XMVECTOR vSum2 = XMVector4Dot(vPow16_22, g_XMCosCoefficients2.v);

    return vSum0.vector4_f32[0] + vSum1.vector4_f32[0] + vSum2.vector4_f32[0];

#elif defined(_XM_SSE_INTRINSICS_)
    return cosf(Value);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Computes the sine and cosine of Value (radians) in one call.
//   pSin - receives sin(Value); must be non-null.
//   pCos - receives cos(Value); must be non-null.
// The scalar build wraps the angle with XMScalarModAngle and evaluates both
// 12-term power series from shared powers; the SSE build calls sinf/cosf.
XMINLINE VOID XMScalarSinCos
(
    FLOAT* pSin,
    FLOAT* pCos,
    FLOAT  Value
)
{
#if defined(_XM_NO_INTRINSICS_)

    FLOAT                  fAngle;
    FLOAT                  fAngleSq;
    XMVECTOR               vPow0123, vPow0246, vPow1357;
    XMVECTOR               vPow8, vPow8_14, vPow9_15, vPow16_22, vPow17_23;
    XMVECTOR               vSin0, vSin1, vSin2, vCos0, vCos1, vCos2;

    XMASSERT(pSin);
    XMASSERT(pCos);

    // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - V^15 / 15! +
    //           V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI)
    // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! +
    //           V^12 / 12! - V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI)

    fAngle = XMScalarModAngle(Value);
    fAngleSq = fAngle * fAngle;

    // Powers 0..3, the even powers 0,2,4,6 and the odd powers 1,3,5,7
    vPow0123 = XMVectorSet(1.0f, fAngle, fAngleSq, fAngleSq * fAngle);
    vPow0246 = XMVectorMultiply(vPow0123, vPow0123);
    vPow1357 = XMVectorMultiply(vPow0246, XMVectorSplatY(vPow0123));

    // V^8 = V^6 * V^2, used to step to the higher power groups
    vPow8 = XMVectorMultiply(XMVectorSplatW(vPow0246), XMVectorSplatZ(vPow0123));

    vPow8_14 = XMVectorMultiply(vPow0246, vPow8);
    vPow9_15 = XMVectorMultiply(vPow1357, vPow8);
    vPow16_22 = XMVectorMultiply(vPow8_14, vPow8);
    vPow17_23 = XMVectorMultiply(vPow9_15, vPow8);

    // Weight each group of four powers by its coefficient table
    vCos0 = XMVector4Dot(vPow0246, g_XMCosCoefficients0.v);
    vSin0 = XMVector4Dot(vPow1357, g_XMSinCoefficients0.v);
    vCos1 = XMVector4Dot(vPow8_14, g_XMCosCoefficients1.v);
    vSin1 = XMVector4Dot(vPow9_15, g_XMSinCoefficients1.v);
    vCos2 = XMVector4Dot(vPow16_22, g_XMCosCoefficients2.v);
    vSin2 = XMVector4Dot(vPow17_23, g_XMSinCoefficients2.v);

    *pCos = vCos0.vector4_f32[0] + vCos1.vector4_f32[0] + vCos2.vector4_f32[0];
    *pSin = vSin0.vector4_f32[0] + vSin1.vector4_f32[0] + vSin2.vector4_f32[0];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSin);
    XMASSERT(pCos);

    // Defer to the C runtime for both results
    *pSin = sinf(Value);
    *pCos = cosf(Value);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Arcsine of Value, in radians.
// The scalar build evaluates a polynomial approximation built from the
// g_XMASinCoefficients0/1/2 tables; the SSE build calls asinf.
XMINLINE FLOAT XMScalarASin
(
    FLOAT Value
)
{
#if defined(_XM_NO_INTRINSICS_)

    FLOAT AbsValue, Value2, Value3, D;
    XMVECTOR AbsV, R0, R1, Result;
    XMVECTOR V3;

    // |Value|. fabsf clears the sign bit exactly as the former integer mask
    // did, but without accessing a FLOAT through a UINT pointer (undefined
    // behavior under strict aliasing rules).
    AbsValue = fabsf(Value);

    Value2 = Value * AbsValue;
    Value3 = Value * Value2;
    // The constant is slightly above 1 so the square root argument stays
    // positive (no division by zero) when |Value| == 1.
    D = (Value - Value2) / sqrtf(1.00000011921f - AbsValue);

    AbsV = XMVectorReplicate(AbsValue);

    V3.vector4_f32[0] = Value3;
    V3.vector4_f32[1] = 1.0f;
    V3.vector4_f32[2] = Value3;
    V3.vector4_f32[3] = 1.0f;

    // R1 = (D * Value3, D, Value * Value3, Value)
    R1 = XMVectorSet(D, D, Value, Value);
    R1 = XMVectorMultiply(R1, V3);

    // Per-lane quadratic in |Value|: (|V| * C0 + C1) * |V| + C2
    R0 = XMVectorMultiplyAdd(AbsV, g_XMASinCoefficients0.v, g_XMASinCoefficients1.v);
    R0 = XMVectorMultiplyAdd(AbsV, R0, g_XMASinCoefficients2.v);

    // Combine the four weighted terms into the final value
    Result = XMVector4Dot(R0, R1);

    return Result.vector4_f32[0];

#elif defined(_XM_SSE_INTRINSICS_)
    return asinf(Value);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Arccosine of Value, in radians.
// The scalar build derives it as XM_PIDIV2 - XMScalarASin(Value); the SSE
// build calls acosf.
XMINLINE FLOAT XMScalarACos
(
    FLOAT Value
)
{
#if defined(_XM_NO_INTRINSICS_)

    const FLOAT fArcSine = XMScalarASin(Value);

    return XM_PIDIV2 - fArcSine;

#elif defined(_XM_SSE_INTRINSICS_)
    return acosf(Value);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Estimates the sine of Value from four odd powers of Value weighted by
// g_XMSinEstCoefficients:
//
//   sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7!   (for -PI <= V < PI)
//
// Value is asserted to satisfy -XM_PI <= Value < XM_PI.
XMFINLINE FLOAT XMScalarSinEst
(
    FLOAT Value
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(Value >= -XM_PI);
    XMASSERT(Value < XM_PI);

    const FLOAT Squared = Value * Value;

    // Powers = (1, V, V^2, V^3)
    XMVECTOR Powers = XMVectorSet(1.0f, Value, Squared, Squared * Value);
    const XMVECTOR Linear = XMVectorSplatY(Powers);     // (V, V, V, V)

    Powers = XMVectorMultiply(Powers, Powers);          // (1, V^2, V^4, V^6)
    Powers = XMVectorMultiply(Powers, Linear);          // (V, V^3, V^5, V^7)

    const XMVECTOR Sum = XMVector4Dot(Powers, g_XMSinEstCoefficients.v);

    return Sum.vector4_f32[0];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(Value >= -XM_PI);
    XMASSERT(Value < XM_PI);

    const float Squared = Value * Value;
    XMVECTOR vSplat = _mm_set_ps1(Value);
    // vTerms = (1, V, V^2, V^3) in x,y,z,w (_mm_set_ps takes w first)
    XMVECTOR vTerms = _mm_set_ps(Squared * Value, Squared, Value, 1.0f);
    vTerms = _mm_mul_ps(vTerms, vTerms);                    // (1, V^2, V^4, V^6)
    vTerms = _mm_mul_ps(vTerms, vSplat);                    // (V, V^3, V^5, V^7)
    vTerms = _mm_mul_ps(vTerms, g_XMSinEstCoefficients);    // weighted terms (t.x, t.y, t.z, t.w)

    // Horizontal sum of the four terms; only lane Z carries the result.
    // vSplat = (V, V, t.x, t.y)
    vSplat = _mm_shuffle_ps(vSplat, vTerms, _MM_SHUFFLE(1,0,0,0));
    // vSplat.z = t.x + t.z; vSplat.w = t.y + t.w
    vSplat = _mm_add_ps(vSplat, vTerms);
    // vTerms.z = vSplat.w = t.y + t.w
    vTerms = _mm_shuffle_ps(vTerms, vSplat, _MM_SHUFFLE(0,3,0,0));
    // vTerms.z = (t.y + t.w) + (t.x + t.z)
    vTerms = _mm_add_ps(vTerms, vSplat);
    // Broadcast lane Z so the scalar extraction below reads it from lane X
    vTerms = _mm_shuffle_ps(vTerms, vTerms, _MM_SHUFFLE(2,2,2,2));
#if defined(_MSC_VER) && (_MSC_VER>=1500)
    return _mm_cvtss_f32(vTerms);
#else
    return vTerms.m128_f32[0];
#endif
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Estimates the cosine of Value from four even powers of Value weighted by
// g_XMCosEstCoefficients:
//
//   cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6!   (for -PI <= V < PI)
//
// Value is asserted to satisfy -XM_PI <= Value < XM_PI.
XMFINLINE FLOAT XMScalarCosEst
(
    FLOAT Value
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMASSERT(Value >= -XM_PI);
    XMASSERT(Value < XM_PI);

    const FLOAT Squared = Value * Value;

    // Powers = (1, V, V^2, V^3); squaring per component gives (1, V^2, V^4, V^6)
    XMVECTOR Powers = XMVectorSet(1.0f, Value, Squared, Squared * Value);
    Powers = XMVectorMultiply(Powers, Powers);

    const XMVECTOR Sum = XMVector4Dot(Powers, g_XMCosEstCoefficients.v);
    return Sum.vector4_f32[0];
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(Value >= -XM_PI);
    XMASSERT(Value < XM_PI);

    const float Squared = Value * Value;
    XMVECTOR vAccum = _mm_setzero_ps();
    // vTerms = (1, V, V^2, V^3) in x,y,z,w (_mm_set_ps takes w first)
    XMVECTOR vTerms = _mm_set_ps(Squared * Value, Squared, Value, 1.0f);
    vTerms = _mm_mul_ps(vTerms, vTerms);                    // (1, V^2, V^4, V^6)
    vTerms = _mm_mul_ps(vTerms, g_XMCosEstCoefficients);    // weighted terms (t.x, t.y, t.z, t.w)

    // Horizontal sum of the four terms; only lane Z carries the result.
    // vAccum = (0, 0, t.x, t.y)
    vAccum = _mm_shuffle_ps(vAccum, vTerms, _MM_SHUFFLE(1,0,0,0));
    // vAccum.z = t.x + t.z; vAccum.w = t.y + t.w
    vAccum = _mm_add_ps(vAccum, vTerms);
    // vTerms.z = vAccum.w = t.y + t.w
    vTerms = _mm_shuffle_ps(vTerms, vAccum, _MM_SHUFFLE(0,3,0,0));
    // vTerms.z = (t.y + t.w) + (t.x + t.z)
    vTerms = _mm_add_ps(vTerms, vAccum);
    // Broadcast lane Z so the scalar extraction below reads it from lane X
    vTerms = _mm_shuffle_ps(vTerms, vTerms, _MM_SHUFFLE(2,2,2,2));
#if defined(_MSC_VER) && (_MSC_VER>=1500)
    return _mm_cvtss_f32(vTerms);
#else
    return vTerms.m128_f32[0];
#endif
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Estimates the sine and cosine of Value in one pass, sharing the power
// vector between the two series:
//
//   sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7!   (for -PI <= V < PI)
//   cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6!   (for -PI <= V < PI)
//
// pSin  - receives the sine estimate; asserted non-null.
// pCos  - receives the cosine estimate; asserted non-null.
// Value - asserted to satisfy -XM_PI <= Value < XM_PI.
//
// *pCos is written before *pSin.
XMFINLINE VOID XMScalarSinCosEst
(
    FLOAT* pSin,
    FLOAT* pCos,
    FLOAT  Value
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pSin);
    XMASSERT(pCos);
    XMASSERT(Value >= -XM_PI);
    XMASSERT(Value < XM_PI);

    const FLOAT Squared = Value * Value;

    // Powers = (1, V, V^2, V^3)
    const XMVECTOR Powers = XMVectorSet(1.0f, Value, Squared, Value * Squared);
    const XMVECTOR Linear = XMVectorSplatY(Powers);                     // (V, V, V, V)
    const XMVECTOR EvenPowers = XMVectorMultiply(Powers, Powers);       // (1, V^2, V^4, V^6)
    const XMVECTOR OddPowers = XMVectorMultiply(EvenPowers, Linear);    // (V, V^3, V^5, V^7)

    const XMVECTOR CosSum = XMVector4Dot(EvenPowers, g_XMCosEstCoefficients.v);
    const XMVECTOR SinSum = XMVector4Dot(OddPowers, g_XMSinEstCoefficients.v);

    *pCos = CosSum.vector4_f32[0];
    *pSin = SinSum.vector4_f32[0];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSin);
    XMASSERT(pCos);
    XMASSERT(Value >= -XM_PI);
    XMASSERT(Value < XM_PI);

    const float Squared = Value * Value;
    // vEven starts as (1, V, V^2, V^3) in x,y,z,w (_mm_set_ps takes w first)
    XMVECTOR vEven = _mm_set_ps(Value * Squared, Squared, Value, 1.0f);
    XMVECTOR vOdd = _mm_set_ps1(Value);
    vEven = _mm_mul_ps(vEven, vEven);       // (1, V^2, V^4, V^6)
    vOdd = _mm_mul_ps(vOdd, vEven);         // (V, V^3, V^5, V^7)

    // Cosine: weight the even powers and store the low lane of the dot product
    const XMVECTOR vCosSum = XMVector4Dot(vEven, g_XMCosEstCoefficients);
    _mm_store_ss(pCos, vCosSum);

    // Sine: weight the odd powers and store the low lane of the dot product
    const XMVECTOR vSinSum = XMVector4Dot(vOdd, g_XMSinEstCoefficients);
    _mm_store_ss(pSin, vSinSum);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Estimates the arcsine of Value.
//
// With c = g_XMASinEstCoefficients, A = |Value|, V2 = Value * A and
// D = OnePlusEps - A, both code paths evaluate
//
//   c.x * Value * sqrt(D) + c.y * Value + c.z * V2 + c.w * V2 * D * A
//
// as a single 4-component dot product and return its first component.
//
// NOTE(review): no range assert is present. An |Value| above OnePlusEps makes
// sqrtf's argument negative - confirm callers keep Value within [-1, 1].
XMFINLINE FLOAT XMScalarASinEst
(
    FLOAT Value
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR VR, CR, CS;
    XMVECTOR Result;
    FLOAT AbsV, V2, D;
    CONST FLOAT OnePlusEps = 1.00000011921f;

    // |Value| via fabsf, matching the SSE path below; this avoids masking the
    // sign bit through a UINT pointer cast (a strict-aliasing violation).
    AbsV = fabsf(Value);
    V2 = Value * AbsV;          // Square with sign retained
    D = OnePlusEps - AbsV;      // Stays positive at |Value| == 1

    CS = XMVectorSet(Value, 1.0f, 1.0f, V2);
    VR = XMVectorSet(sqrtf(D), Value, V2, D * AbsV);
    CR = XMVectorMultiply(CS, g_XMASinEstCoefficients.v);

    Result = XMVector4Dot(VR, CR);

    return Result.vector4_f32[0];

#elif defined(_XM_SSE_INTRINSICS_)
    CONST FLOAT OnePlusEps = 1.00000011921f;
    FLOAT AbsV = fabsf(Value);
    FLOAT V2 = Value * AbsV;    // Square with sign retained
    FLOAT D = OnePlusEps - AbsV;

    // _mm_set_ps takes w first: Result = (Value, 1, 1, V2), VR = (sqrt(D), Value, V2, D * AbsV)
    XMVECTOR Result = _mm_set_ps(V2,1.0f,1.0f,Value);
    XMVECTOR VR = _mm_set_ps(D * AbsV,V2,Value,sqrtf(D));
    Result = _mm_mul_ps(Result, g_XMASinEstCoefficients);
    Result = XMVector4Dot(VR,Result);
#if defined(_MSC_VER) && (_MSC_VER>=1500)
    return _mm_cvtss_f32(Result);
#else
    return Result.m128_f32[0];
#endif
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Estimates the arccosine of Value as XM_PIDIV2 minus the arcsine estimate.
//
// Both the no-intrinsics and SSE builds previously repeated the body of
// XMScalarASinEst line for line before subtracting from XM_PIDIV2; the
// estimate is now obtained from XMScalarASinEst directly, the same way
// XMScalarACos is derived from XMScalarASin. The arithmetic performed is
// unchanged.
//
// NOTE(review): like XMScalarASinEst, no range assert is present - confirm
// callers keep Value within [-1, 1].
XMFINLINE FLOAT XMScalarACosEst
(
    FLOAT Value
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)

    return XM_PIDIV2 - XMScalarASinEst(Value);

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

#endif // __XNAMATHMISC_INL__