/************************************************************************
*                                                                       *
* xnamathconvert.inl -- SIMD C++ Math library for Windows and Xbox 360 *
*                                                                       *
* Conversion, loading, and storing functions                            *
*                                                                       *
* Copyright (c) Microsoft Corp. All rights reserved.                    *
*                                                                       *
************************************************************************/

#if defined(_MSC_VER) && (_MSC_VER > 1000)
#pragma once
#endif

#ifndef __XNAMATHCONVERT_INL__
#define __XNAMATHCONVERT_INL__

#define XM_PACK_FACTOR                  (FLOAT)(1 << 22)
#define XM_UNPACK_FACTOR_UNSIGNED       (FLOAT)(1 << 23)
#define XM_UNPACK_FACTOR_SIGNED         XM_PACK_FACTOR
#define XM_UNPACK_UNSIGNEDN_OFFSET(BitsX, BitsY, BitsZ, BitsW) \
    {-XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsX)) - 1), \
     -XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsY)) - 1), \
     -XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsZ)) - 1), \
     -XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsW)) - 1)}

#define XM_UNPACK_UNSIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \
    {XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsX)) - 1), \
     XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsY)) - 1), \
     XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsZ)) - 1), \
     XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsW)) - 1)}

#define XM_UNPACK_SIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \
    {-XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsX) - 1)) - 1), \
     -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsY) - 1)) - 1), \
     -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsZ) - 1)) - 1), \
     -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsW) - 1)) - 1)}

//#define XM_UNPACK_SIGNEDN_OFFSET(BitsX, BitsY, BitsZ, BitsW) \
//    {-XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsX) - 1)) - 1) * 3.0f, \
//     -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsY) - 1)) - 1) * 3.0f, \
//     -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsZ) - 1)) - 1) * 3.0f, \
//     -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsW) - 1)) - 1) * 3.0f}

#define XM_PACK_UNSIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \
    {-(FLOAT)((1 << (BitsX)) - 1) / XM_PACK_FACTOR, \
     -(FLOAT)((1 << (BitsY)) - 1) / XM_PACK_FACTOR, \
     -(FLOAT)((1 << (BitsZ)) - 1) / XM_PACK_FACTOR, \
     -(FLOAT)((1 << (BitsW)) - 1) / XM_PACK_FACTOR}

#define XM_PACK_SIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \
    {-(FLOAT)((1 << ((BitsX) - 1)) - 1) / XM_PACK_FACTOR, \
     -(FLOAT)((1 << ((BitsY) - 1)) - 1) / XM_PACK_FACTOR, \
     -(FLOAT)((1 << ((BitsZ) - 1)) - 1) / XM_PACK_FACTOR, \
     -(FLOAT)((1 << ((BitsW) - 1)) - 1) / XM_PACK_FACTOR}

#define XM_PACK_OFFSET   XMVectorSplatConstant(3, 0)
//#define XM_UNPACK_OFFSET XM_PACK_OFFSET
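
// Worked example of the scale macros above, derived from the definitions:
// XM_UNPACK_UNSIGNEDN_SCALE(8,8,8,8) expands to {2^23/255, 2^23/255,
// 2^23/255, 2^23/255}, i.e. roughly 32896.5f per component, so an 8-bit
// value b scaled by this and later divided back by XM_UNPACK_FACTOR_UNSIGNED
// (2^23) recovers the normalized value b/255.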
/****************************************************************************
 *
 * Data conversion operations
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
XMFINLINE FLOAT XMConvertHalfToFloat
(
    HALF Value
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)

    UINT Mantissa;
    UINT Exponent;
    UINT Result;

    Mantissa = (UINT)(Value & 0x03FF);

    if ((Value & 0x7C00) != 0)  // The value is normalized
    {
        Exponent = (UINT)((Value >> 10) & 0x1F);
    }
    else if (Mantissa != 0)     // The value is denormalized
    {
        // Normalize the value in the resulting float
        Exponent = 1;
        do
        {
            Exponent--;
            Mantissa <<= 1;
        } while ((Mantissa & 0x0400) == 0);
        Mantissa &= 0x03FF;
    }
    else                        // The value is zero
    {
        Exponent = (UINT)-112;
    }

    Result = ((Value & 0x8000) << 16) | // Sign
             ((Exponent + 112) << 23) | // Exponent
             (Mantissa << 13);          // Mantissa

    return *(FLOAT*)&Result;

#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
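
// Worked example: the half 0x3C00 (sign 0, exponent field 0x0F, mantissa 0)
// takes the normalized path, so Result = ((15 + 112) << 23) = 0x3F800000,
// which is exactly 1.0f. The +112 rebias converts the half exponent bias
// of 15 into the single-precision bias of 127 (127 - 15 = 112).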
//------------------------------------------------------------------------------

XMINLINE FLOAT* XMConvertHalfToFloatStream
(
    FLOAT*      pOutputStream,
    UINT        OutputStride,
    CONST HALF* pInputStream,
    UINT        InputStride,
    UINT        HalfCount
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)

    UINT  i;
    CONST BYTE* pHalf = (CONST BYTE*)pInputStream;
    BYTE* pFloat = (BYTE*)pOutputStream;

    XMASSERT(pOutputStream);
    XMASSERT(pInputStream);

    for (i = 0; i < HalfCount; i++)
    {
        *(FLOAT*)pFloat = XMConvertHalfToFloat(*(const HALF*)pHalf);
        pHalf += InputStride;
        pFloat += OutputStride;
    }

    return pOutputStream;

#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
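
// Illustrative usage (a sketch; the buffers and count are hypothetical):
// expand a tightly packed HALF array into a tightly packed FLOAT array by
// passing the element sizes as the strides:
//
//     HALF  src[64];  // e.g. one row of R16_FLOAT texel data
//     FLOAT dst[64];
//     XMConvertHalfToFloatStream(dst, sizeof(FLOAT), src, sizeof(HALF), 64);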
//------------------------------------------------------------------------------

XMFINLINE HALF XMConvertFloatToHalf
(
    FLOAT Value
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)

    UINT Result;

    UINT IValue = ((UINT *)(&Value))[0];
    UINT Sign = (IValue & 0x80000000U) >> 16U;
    IValue = IValue & 0x7FFFFFFFU;      // Hack off the sign

    if (IValue > 0x47FFEFFFU)
    {
        // The number is too large to be represented as a half. Saturate to infinity.
        Result = 0x7FFFU;
    }
    else
    {
        if (IValue < 0x38800000U)
        {
            // The number is too small to be represented as a normalized half.
            // Convert it to a denormalized value.
            UINT Shift = 113U - (IValue >> 23U);
            IValue = (0x800000U | (IValue & 0x7FFFFFU)) >> Shift;
        }
        else
        {
            // Rebias the exponent to represent the value as a normalized half.
            IValue += 0xC8000000U;
        }

        Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U)&0x7FFFU;
    }
    return (HALF)(Result|Sign);

#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
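
// Worked example: 1.0f is 0x3F800000. It is neither too large (> 0x47FFEFFF)
// nor too small (< 0x38800000), so the rebias adds 0xC8000000, giving
// 0x07800000, and the round-to-nearest-even shift yields
// ((0x07800000 + 0x0FFF) >> 13) & 0x7FFF = 0x3C00, the half encoding of 1.0.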
//------------------------------------------------------------------------------

XMINLINE HALF* XMConvertFloatToHalfStream
(
    HALF*        pOutputStream,
    UINT         OutputStride,
    CONST FLOAT* pInputStream,
    UINT         InputStride,
    UINT         FloatCount
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)

    UINT  i;
    BYTE* pFloat = (BYTE*)pInputStream;
    BYTE* pHalf = (BYTE*)pOutputStream;

    XMASSERT(pOutputStream);
    XMASSERT(pInputStream);

    for (i = 0; i < FloatCount; i++)
    {
        *(HALF*)pHalf = XMConvertFloatToHalf(*(FLOAT*)pFloat);
        pFloat += InputStride;
        pHalf += OutputStride;
    }

    return pOutputStream;

#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
// For VMX128, these routines are all defines in the main header

#pragma warning(push)
#pragma warning(disable:4701) // Prevent warnings about 'Result' potentially being used without having been initialized

XMINLINE XMVECTOR XMConvertVectorIntToFloat
(
    FXMVECTOR VInt,
    UINT      DivExponent
)
{
#if defined(_XM_NO_INTRINSICS_)
    UINT ElementIndex = 0;
    FLOAT fScale;
    XMVECTOR Result;
    XMASSERT(DivExponent<32);
    fScale = 1.0f / (FLOAT)(1U << DivExponent);
    do {
        INT iTemp = (INT)VInt.vector4_u32[ElementIndex];
        Result.vector4_f32[ElementIndex] = ((FLOAT)iTemp) * fScale;
    } while (++ElementIndex<4);
    return Result;
#else // _XM_SSE_INTRINSICS_
    XMASSERT(DivExponent<32);
    // Convert to floats
    XMVECTOR vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&VInt)[0]);
    // Convert DivExponent into 1.0f/(1<<DivExponent)
    UINT uScale = 0x3F800000U - (DivExponent << 23);
    // Splat the scalar value
    __m128i vScale = _mm_set1_epi32(uScale);
    vResult = _mm_mul_ps(vResult,reinterpret_cast<const __m128 *>(&vScale)[0]);
    return vResult;
#endif
}
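
// Worked example of the exponent trick: 1.0f is 0x3F800000, and subtracting
// DivExponent from the 8-bit exponent field (which starts at bit 23) divides
// by 2^DivExponent. For DivExponent = 16, uScale = 0x3F800000 - (16 << 23) =
// 0x37800000, the bit pattern of 1.0f/65536.0f, so the multiply above turns
// 16.16 fixed-point lanes into floats.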
//------------------------------------------------------------------------------

XMINLINE XMVECTOR XMConvertVectorFloatToInt
(
    FXMVECTOR VFloat,
    UINT      MulExponent
)
{
#if defined(_XM_NO_INTRINSICS_)
    UINT ElementIndex = 0;
    XMVECTOR Result;
    FLOAT fScale;
    XMASSERT(MulExponent<32);
    // Get the scalar factor.
    fScale = (FLOAT)(1U << MulExponent);
    do {
        INT iResult;
        FLOAT fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
        if (fTemp <= -(65536.0f*32768.0f)) {
            iResult = (-0x7FFFFFFF)-1;
        } else if (fTemp > (65536.0f*32768.0f)-128.0f) {
            iResult = 0x7FFFFFFF;
        } else {
            iResult = (INT)fTemp;
        }
        Result.vector4_u32[ElementIndex] = (UINT)iResult;
    } while (++ElementIndex<4);
    return Result;
#else // _XM_SSE_INTRINSICS_
    XMASSERT(MulExponent<32);
    XMVECTOR vResult = _mm_set_ps1((FLOAT)(1U << MulExponent));
    vResult = _mm_mul_ps(vResult,VFloat);
    // In case of positive overflow, detect it
    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxInt);
    // Float to int conversion
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // If there was positive overflow, set to 0x7FFFFFFF
    vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
    vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast<const __m128 *>(&vResulti)[0]);
    vOverflow = _mm_or_ps(vOverflow,vResult);
    return vOverflow;
#endif
}
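
// Note on the overflow handling above: for lanes greater than g_XMMaxInt,
// _mm_cvttps_epi32 would return the "integer indefinite" value 0x80000000,
// so those lanes are instead forced to 0x7FFFFFFF by merging g_XMAbsMask
// through the comparison mask. Negative overflow already saturates to
// 0x80000000, which is the desired INT_MIN.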
//------------------------------------------------------------------------------

XMINLINE XMVECTOR XMConvertVectorUIntToFloat
(
    FXMVECTOR VUInt,
    UINT      DivExponent
)
{
#if defined(_XM_NO_INTRINSICS_)
    UINT ElementIndex = 0;
    FLOAT fScale;
    XMVECTOR Result;
    XMASSERT(DivExponent<32);
    fScale = 1.0f / (FLOAT)(1U << DivExponent);
    do {
        Result.vector4_f32[ElementIndex] = (FLOAT)VUInt.vector4_u32[ElementIndex] * fScale;
    } while (++ElementIndex<4);
    return Result;
#else // _XM_SSE_INTRINSICS_
    XMASSERT(DivExponent<32);
    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
    // Determine which ones need the fix.
    XMVECTOR vMask = _mm_and_ps(VUInt,g_XMNegativeZero);
    // Force all values positive
    XMVECTOR vResult = _mm_xor_ps(VUInt,vMask);
    // Convert to floats
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert 0x80000000 -> 0xFFFFFFFF
    __m128i iMask = _mm_srai_epi32(reinterpret_cast<const __m128i *>(&vMask)[0],31);
    // For only the ones that are too big, add the fixup
    vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&iMask)[0],g_XMFixUnsigned);
    vResult = _mm_add_ps(vResult,vMask);
    // Convert DivExponent into 1.0f/(1<<DivExponent)
    UINT uScale = 0x3F800000U - (DivExponent << 23);
    iMask = _mm_set1_epi32(uScale);
    vResult = _mm_mul_ps(vResult,reinterpret_cast<const __m128 *>(&iMask)[0]);
    return vResult;
#endif
}
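
// Worked example of the unsigned fixup: for the lane value 0xFFFFFFFF the
// sign bit is set, so the xor clears it to 0x7FFFFFFF before the signed
// conversion, and g_XMFixUnsigned (32768.0f*65536.0f = 2^31) is added back
// only in that lane, reconstructing approximately 4294967295.0f.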
//------------------------------------------------------------------------------

XMINLINE XMVECTOR XMConvertVectorFloatToUInt
(
    FXMVECTOR VFloat,
    UINT      MulExponent
)
{
#if defined(_XM_NO_INTRINSICS_)
    UINT ElementIndex = 0;
    XMVECTOR Result;
    FLOAT fScale;
    XMASSERT(MulExponent<32);
    // Get the scalar factor.
    fScale = (FLOAT)(1U << MulExponent);
    do {
        UINT uResult;
        FLOAT fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
        if (fTemp <= 0.0f) {
            uResult = 0;
        } else if (fTemp >= (65536.0f*65536.0f)) {
            uResult = 0xFFFFFFFFU;
        } else {
            uResult = (UINT)fTemp;
        }
        Result.vector4_u32[ElementIndex] = uResult;
    } while (++ElementIndex<4);
    return Result;
#else // _XM_SSE_INTRINSICS_
    XMASSERT(MulExponent<32);
    XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
    vResult = _mm_mul_ps(vResult,VFloat);
    // Clamp to >=0
    vResult = _mm_max_ps(vResult,g_XMZero);
    // Any numbers that are too big, set to 0xFFFFFFFFU
    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
    XMVECTOR vValue = g_XMUnsignedFix;
    // Too large for a signed integer?
    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
    vValue = _mm_and_ps(vValue,vMask);
    // Perform fixup only on numbers too large (Keeps low bit precision)
    vResult = _mm_sub_ps(vResult,vValue);
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Convert from signed to unsigned only if greater than 0x80000000
    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
    vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],vMask);
    // On those that are too large, set to 0xFFFFFFFF
    vResult = _mm_or_ps(vResult,vOverflow);
    return vResult;
#endif
}

#pragma warning(pop)

#endif // _XM_NO_INTRINSICS_ || _XM_SSE_INTRINSICS_
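
// Illustrative round trip (a sketch; 'data' is hypothetical): treating raw
// integers as 16.16 fixed point and back.
//
//     XMVECTOR vInt   = XMLoadInt4(data);                     // raw 16.16 values
//     XMVECTOR vFloat = XMConvertVectorIntToFloat(vInt, 16);  // divide by 65536
//     XMVECTOR vBack  = XMConvertVectorFloatToInt(vFloat, 16);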
/****************************************************************************
 *
 * Vector and matrix load operations
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
XMFINLINE XMVECTOR XMLoadInt(CONST UINT* pSource)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 3) == 0);

    V.vector4_u32[0] = *pSource;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 3) == 0);

    return _mm_load_ss( (const float*)pSource );
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat(CONST FLOAT* pSource)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 3) == 0);

    V.vector4_f32[0] = *pSource;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 3) == 0);

    return _mm_load_ss( pSource );
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadInt2
(
    CONST UINT* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);

    V.vector4_u32[0] = pSource[0];
    V.vector4_u32[1] = pSource[1];

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);

    __m128 x = _mm_load_ss( (const float*)pSource );
    __m128 y = _mm_load_ss( (const float*)(pSource+1) );
    return _mm_unpacklo_ps( x, y );
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadSInt2
(
    CONST XMINT2* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);

    V.vector4_f32[0] = (float)pSource->x;
    V.vector4_f32[1] = (float)pSource->y;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);

    __m128 x = _mm_load_ss( (const float*)&pSource->x );
    __m128 y = _mm_load_ss( (const float*)&pSource->y );
    __m128 V = _mm_unpacklo_ps( x, y );
    return _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&V)[0]);
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUInt2
(
    CONST XMUINT2* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);

    V.vector4_f32[0] = (float)pSource->x;
    V.vector4_f32[1] = (float)pSource->y;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);

    __m128 x = _mm_load_ss( (const float*)&pSource->x );
    __m128 y = _mm_load_ss( (const float*)&pSource->y );
    __m128 V = _mm_unpacklo_ps( x, y );
    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
    // Determine which ones need the fix.
    XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero);
    // Force all values positive
    XMVECTOR vResult = _mm_xor_ps(V,vMask);
    // Convert to floats
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert 0x80000000 -> 0xFFFFFFFF
    __m128i iMask = _mm_srai_epi32(reinterpret_cast<const __m128i *>(&vMask)[0],31);
    // For only the ones that are too big, add the fixup
    vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&iMask)[0],g_XMFixUnsigned);
    vResult = _mm_add_ps(vResult,vMask);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadInt2A
(
    CONST UINT* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    V.vector4_u32[0] = pSource[0];
    V.vector4_u32[1] = pSource[1];

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    __m128i V = _mm_loadl_epi64( (const __m128i*)pSource );
    return reinterpret_cast<__m128 *>(&V)[0];

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat2
(
    CONST XMFLOAT2* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);

    ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0];
    ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0];

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);

    __m128 x = _mm_load_ss( &pSource->x );
    __m128 y = _mm_load_ss( &pSource->y );
    return _mm_unpacklo_ps( x, y );
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat2A
(
    CONST XMFLOAT2A* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    V.vector4_f32[0] = pSource->x;
    V.vector4_f32[1] = pSource->y;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    __m128i V = _mm_loadl_epi64( (const __m128i*)pSource );
    return reinterpret_cast<__m128 *>(&V)[0];

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadHalf2
(
    CONST XMHALF2* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMASSERT(pSource);
    {
    XMVECTOR vResult = {
        XMConvertHalfToFloat(pSource->x),
        XMConvertHalfToFloat(pSource->y),
        0.0f,
        0.0f
    };
    return vResult;
    }
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    {
    XMVECTOR vResult = {
        XMConvertHalfToFloat(pSource->x),
        XMConvertHalfToFloat(pSource->y),
        0.0f,
        0.0f
    };
    return vResult;
    }
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadShortN2
(
    CONST XMSHORTN2* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMASSERT(pSource);
    {
    XMVECTOR vResult = {
        (pSource->x == -32768) ? -1.f : ((FLOAT)pSource->x * (1.0f/32767.0f)),
        (pSource->y == -32768) ? -1.f : ((FLOAT)pSource->y * (1.0f/32767.0f)),
        0.0f,
        0.0f
    };
    return vResult;
    }
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    // Splat the two shorts in all four entries (WORD alignment okay,
    // DWORD alignment preferred)
    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
    vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
    // x needs to be sign extended
    vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // x - 0x8000 to undo the signed order.
    vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16);
    // Convert -1.0f - 1.0f
    vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16);
    // Clamp result (for case of -32768)
    return _mm_max_ps( vTemp, g_XMNegativeOne );
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
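
// Worked example: for x = 32767 the masked lane converts to 32767.0f and the
// final multiply by 1/32767 gives exactly 1.0f. For x = -32768 the scaled
// result is -32768/32767, slightly below -1.0f, which the closing
// _mm_max_ps clamps to exactly -1.0f.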
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadShort2
(
    CONST XMSHORT2* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);

    V.vector4_f32[0] = (FLOAT)pSource->x;
    V.vector4_f32[1] = (FLOAT)pSource->y;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    // Splat the two shorts in all four entries (WORD alignment okay,
    // DWORD alignment preferred)
    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
    vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
    // x needs to be sign extended
    vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // x - 0x8000 to undo the signed order.
    vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16);
    // Y is 65536 too large
    return _mm_mul_ps(vTemp,g_XMFixupY16);
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUShortN2
(
    CONST XMUSHORTN2* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);

    V.vector4_f32[0] = (FLOAT)pSource->x / 65535.0f;
    V.vector4_f32[1] = (FLOAT)pSource->y / 65535.0f;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 FixupY16 = {1.0f/65535.0f,1.0f/(65535.0f*65536.0f),0.0f,0.0f};
    static const XMVECTORF32 FixaddY16 = {0,32768.0f*65536.0f,0,0};
    XMASSERT(pSource);
    // Splat the two shorts in all four entries (WORD alignment okay,
    // DWORD alignment preferred)
    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
    vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
    // y needs to be sign flipped
    vTemp = _mm_xor_ps(vTemp,g_XMFlipY);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // y + 0x8000 to undo the signed order.
    vTemp = _mm_add_ps(vTemp,FixaddY16);
    // Y is 65536 times too large
    vTemp = _mm_mul_ps(vTemp,FixupY16);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUShort2
(
    CONST XMUSHORT2* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);

    V.vector4_f32[0] = (FLOAT)pSource->x;
    V.vector4_f32[1] = (FLOAT)pSource->y;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 FixaddY16 = {0,32768.0f,0,0};
    XMASSERT(pSource);
    // Splat the two shorts in all four entries (WORD alignment okay,
    // DWORD alignment preferred)
    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
    vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
    // y needs to be sign flipped
    vTemp = _mm_xor_ps(vTemp,g_XMFlipY);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Y is 65536 times too large
    vTemp = _mm_mul_ps(vTemp,g_XMFixupY16);
    // y + 0x8000 to undo the signed order.
    vTemp = _mm_add_ps(vTemp,FixaddY16);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadByteN2
(
    CONST XMBYTEN2* pSource
)
{
    XMASSERT(pSource);
    {
    XMVECTOR vResult = {
        (pSource->x == -128) ? -1.f : ((FLOAT)pSource->x * (1.0f/127.0f)),
        (pSource->y == -128) ? -1.f : ((FLOAT)pSource->y * (1.0f/127.0f)),
        0.0f,
        0.0f
    };
    return vResult;
    }
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadByte2
(
    CONST XMBYTE2* pSource
)
{
    XMASSERT(pSource);
    {
    XMVECTOR vResult = {
        (FLOAT)pSource->x,
        (FLOAT)pSource->y,
        0.0f,
        0.0f
    };
    return vResult;
    }
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUByteN2
(
    CONST XMUBYTEN2* pSource
)
{
    XMASSERT(pSource);
    {
    XMVECTOR vResult = {
        (FLOAT)pSource->x * (1.0f/255.0f),
        (FLOAT)pSource->y * (1.0f/255.0f),
        0.0f,
        0.0f
    };
    return vResult;
    }
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUByte2
(
    CONST XMUBYTE2* pSource
)
{
    XMASSERT(pSource);
    {
    XMVECTOR vResult = {
        (FLOAT)pSource->x,
        (FLOAT)pSource->y,
        0.0f,
        0.0f
    };
    return vResult;
    }
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadInt3
(
    CONST UINT* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);

    V.vector4_u32[0] = pSource[0];
    V.vector4_u32[1] = pSource[1];
    V.vector4_u32[2] = pSource[2];

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);

#ifdef _XM_ISVS2005_
    __m128i V = _mm_set_epi32( 0, *(pSource+2), *(pSource+1), *pSource );
    return reinterpret_cast<__m128 *>(&V)[0];
#else
    __m128 x = _mm_load_ss( (const float*)pSource );
    __m128 y = _mm_load_ss( (const float*)(pSource+1) );
    __m128 z = _mm_load_ss( (const float*)(pSource+2) );
    __m128 xy = _mm_unpacklo_ps( x, y );
    return _mm_movelh_ps( xy, z );
#endif // !_XM_ISVS2005_
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadSInt3
(
    CONST XMINT3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);

#ifdef _XBOX_VER
    V = XMLoadInt3( (const UINT*)pSource );
    return XMConvertVectorIntToFloat( V, 0 );
#else
    V.vector4_f32[0] = (float)pSource->x;
    V.vector4_f32[1] = (float)pSource->y;
    V.vector4_f32[2] = (float)pSource->z;
    return V;
#endif

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);

#ifdef _XM_ISVS2005_
    __m128i V = _mm_set_epi32( 0, pSource->z, pSource->y, pSource->x );
    return _mm_cvtepi32_ps(V);
#else
    __m128 x = _mm_load_ss( (const float*)&pSource->x );
    __m128 y = _mm_load_ss( (const float*)&pSource->y );
    __m128 z = _mm_load_ss( (const float*)&pSource->z );
    __m128 xy = _mm_unpacklo_ps( x, y );
    __m128 V = _mm_movelh_ps( xy, z );
    return _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&V)[0]);
#endif // !_XM_ISVS2005_
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUInt3
(
    CONST XMUINT3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);

    V.vector4_f32[0] = (float)pSource->x;
    V.vector4_f32[1] = (float)pSource->y;
    V.vector4_f32[2] = (float)pSource->z;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);

#ifdef _XM_ISVS2005_
    __m128i V = _mm_set_epi32( 0, pSource->z, pSource->y, pSource->x );
    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
    // Determine which ones need the fix.
    XMVECTOR vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&V)[0],g_XMNegativeZero);
    // Force all values positive
    XMVECTOR vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&V)[0],vMask);
#else
    __m128 x = _mm_load_ss( (const float*)&pSource->x );
    __m128 y = _mm_load_ss( (const float*)&pSource->y );
    __m128 z = _mm_load_ss( (const float*)&pSource->z );
    __m128 xy = _mm_unpacklo_ps( x, y );
    __m128 V = _mm_movelh_ps( xy, z );
    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
    // Determine which ones need the fix.
    XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero);
    // Force all values positive
    XMVECTOR vResult = _mm_xor_ps(V,vMask);
#endif // !_XM_ISVS2005_
    // Convert to floats
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert 0x80000000 -> 0xFFFFFFFF
    __m128i iMask = _mm_srai_epi32(reinterpret_cast<const __m128i *>(&vMask)[0],31);
    // For only the ones that are too big, add the fixup
    vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&iMask)[0],g_XMFixUnsigned);
    vResult = _mm_add_ps(vResult,vMask);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadInt3A
(
    CONST UINT* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    V.vector4_u32[0] = pSource[0];
    V.vector4_u32[1] = pSource[1];
    V.vector4_u32[2] = pSource[2];

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    // Reads an extra integer that is 'undefined'
    __m128i V = _mm_load_si128( (const __m128i*)pSource );
    return reinterpret_cast<__m128 *>(&V)[0];

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat3
(
    CONST XMFLOAT3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);

    ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0];
    ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0];
    ((UINT *)(&V.vector4_f32[2]))[0] = ((const UINT *)(&pSource->z))[0];

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);

#ifdef _XM_ISVS2005_
    // This reads 1 float past the memory that should be ignored.
    // Need to continue to do this for VS 2005 due to compiler issue but prefer new method
    // to avoid triggering issues with memory debug tools (like AV)
    return _mm_loadu_ps( &pSource->x );
#else
    __m128 x = _mm_load_ss( &pSource->x );
    __m128 y = _mm_load_ss( &pSource->y );
    __m128 z = _mm_load_ss( &pSource->z );
    __m128 xy = _mm_unpacklo_ps( x, y );
    return _mm_movelh_ps( xy, z );
#endif // !_XM_ISVS2005_
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat3A
(
    CONST XMFLOAT3A* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    V.vector4_f32[0] = pSource->x;
    V.vector4_f32[1] = pSource->y;
    V.vector4_f32[2] = pSource->z;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    // This reads 1 float past the memory that should be ignored.
    return _mm_load_ps( &pSource->x );

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUHenDN3
(
    CONST XMUHENDN3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR          V;
    UINT              Element;

    XMASSERT(pSource);

    Element = pSource->v & 0x7FF;
    V.vector4_f32[0] = (FLOAT)Element / 2047.0f;
    Element = (pSource->v >> 11) & 0x7FF;
    V.vector4_f32[1] = (FLOAT)Element / 2047.0f;
    Element = (pSource->v >> 22) & 0x3FF;
    V.vector4_f32[2] = (FLOAT)Element / 1023.0f;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 UHenDN3Mul = {1.0f/2047.0f,1.0f/(2047.0f*2048.0f),1.0f/(1023.0f*2048.0f*2048.0f),0};
    XMASSERT(pSource);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,g_XMMaskHenD3);
    // Convert x and y to unsigned
    vResult = _mm_xor_ps(vResult,g_XMFlipZ);
    // Convert to floats
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert x and y back to signed
    vResult = _mm_add_ps(vResult,g_XMAddUHenD3);
    // Normalize x,y and z to -1.0f-1.0f
    vResult = _mm_mul_ps(vResult,UHenDN3Mul);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUHenD3
(
    CONST XMUHEND3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR          V;
    UINT              Element;

    XMASSERT(pSource);

    Element = pSource->v & 0x7FF;
    V.vector4_f32[0] = (FLOAT)Element;
    Element = (pSource->v >> 11) & 0x7FF;
    V.vector4_f32[1] = (FLOAT)Element;
    Element = (pSource->v >> 22) & 0x3FF;
    V.vector4_f32[2] = (FLOAT)Element;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,g_XMMaskHenD3);
    // Convert x and y to unsigned
    vResult = _mm_xor_ps(vResult,g_XMFlipZ);
    // Convert to floats
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert x and y back to signed
    vResult = _mm_add_ps(vResult,g_XMAddUHenD3);
    // Normalize x and y to -1024-1023.0f and z to -512-511.0f
    vResult = _mm_mul_ps(vResult,g_XMMulHenD3);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadHenDN3
(
    CONST XMHENDN3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR          V;
    UINT              Element;
    static CONST UINT SignExtendXY[] = {0x00000000, 0xFFFFF800};
    static CONST UINT SignExtendZ[]  = {0x00000000, 0xFFFFFC00};

    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200);

    Element = pSource->v & 0x7FF;
    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]) / 1023.0f;
    Element = (pSource->v >> 11) & 0x7FF;
    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]) / 1023.0f;
    Element = (pSource->v >> 22) & 0x3FF;
    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendZ[Element >> 9]) / 511.0f;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 HenDN3Mul = {1.0f/1023.0f,1.0f/(1023.0f*2048.0f),1.0f/(511.0f*2048.0f*2048.0f),0};
    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,g_XMMaskHenD3);
    // Convert x and y to unsigned
    vResult = _mm_xor_ps(vResult,g_XMXorHenD3);
    // Convert to floats
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert x and y back to signed
    vResult = _mm_add_ps(vResult,g_XMAddHenD3);
    // Normalize x,y and z to -1.0f-1.0f
    vResult = _mm_mul_ps(vResult,HenDN3Mul);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
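
// Worked example of the branchless sign extension used above: for an 11-bit
// field, bit 10 is the sign, so SignExtendXY[Element >> 10] is 0xFFFFF800
// only when the sign bit is set. The field value 0x7FF becomes
// 0x7FF | 0xFFFFF800 = 0xFFFFFFFF, i.e. -1, which normalizes to -1.0f/1023.0f.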
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadHenD3
(
    CONST XMHEND3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR          V;
    UINT              Element;
    static CONST UINT SignExtendXY[] = {0x00000000, 0xFFFFF800};
    static CONST UINT SignExtendZ[]  = {0x00000000, 0xFFFFFC00};

    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200);

    Element = pSource->v & 0x7FF;
    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]);
    Element = (pSource->v >> 11) & 0x7FF;
    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]);
    Element = (pSource->v >> 22) & 0x3FF;
    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendZ[Element >> 9]);

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,g_XMMaskHenD3);
    // Convert x and y to unsigned
    vResult = _mm_xor_ps(vResult,g_XMXorHenD3);
    // Convert to floats
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert x and y back to signed
    vResult = _mm_add_ps(vResult,g_XMAddHenD3);
    // Normalize x and y to -1024-1023.0f and z to -512-511.0f
    vResult = _mm_mul_ps(vResult,g_XMMulHenD3);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUDHenN3
(
    CONST XMUDHENN3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR          V;
    UINT              Element;

    XMASSERT(pSource);

    Element = pSource->v & 0x3FF;
    V.vector4_f32[0] = (FLOAT)Element / 1023.0f;
    Element = (pSource->v >> 10) & 0x7FF;
    V.vector4_f32[1] = (FLOAT)Element / 2047.0f;
    Element = (pSource->v >> 21) & 0x7FF;
    V.vector4_f32[2] = (FLOAT)Element / 2047.0f;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 UDHenN3Mul = {1.0f/1023.0f,1.0f/(2047.0f*1024.0f),1.0f/(2047.0f*1024.0f*2048.0f),0};
    XMASSERT(pSource);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,g_XMMaskDHen3);
    // Convert x and y to unsigned
    vResult = _mm_xor_ps(vResult,g_XMFlipZ);
    // Convert to floats
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert x and y back to signed
    vResult = _mm_add_ps(vResult,g_XMAddUHenD3);
    // Normalize x,y and z to -1.0f-1.0f
    vResult = _mm_mul_ps(vResult,UDHenN3Mul);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUDHen3
(
    CONST XMUDHEN3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR          V;
    UINT              Element;

    XMASSERT(pSource);

    Element = pSource->v & 0x3FF;
    V.vector4_f32[0] = (FLOAT)Element;
    Element = (pSource->v >> 10) & 0x7FF;
    V.vector4_f32[1] = (FLOAT)Element;
    Element = (pSource->v >> 21) & 0x7FF;
    V.vector4_f32[2] = (FLOAT)Element;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,g_XMMaskDHen3);
    // Convert x and y to unsigned
    vResult = _mm_xor_ps(vResult,g_XMFlipZ);
    // Convert to floats
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert x and y back to signed
    vResult = _mm_add_ps(vResult,g_XMAddUHenD3);
    // Normalize x to 0-1023.0f and y and z to 0-2047.0f
    vResult = _mm_mul_ps(vResult,g_XMMulDHen3);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadDHenN3
(
    CONST XMDHENN3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR          V;
    UINT              Element;
    static CONST UINT SignExtendX[]  = {0x00000000, 0xFFFFFC00};
    static CONST UINT SignExtendYZ[] = {0x00000000, 0xFFFFF800};

    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400);

    Element = pSource->v & 0x3FF;
    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendX[Element >> 9]) / 511.0f;
    Element = (pSource->v >> 10) & 0x7FF;
    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]) / 1023.0f;
    Element = (pSource->v >> 21) & 0x7FF;
    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]) / 1023.0f;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 DHenN3Mul = {1.0f/511.0f,1.0f/(1023.0f*1024.0f),1.0f/(1023.0f*1024.0f*2048.0f),0};
    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,g_XMMaskDHen3);
    // Convert x and y to unsigned
    vResult = _mm_xor_ps(vResult,g_XMXorDHen3);
    // Convert to floats
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert x and y back to signed
    vResult = _mm_add_ps(vResult,g_XMAddDHen3);
    // Normalize x,y and z to -1.0f-1.0f
    vResult = _mm_mul_ps(vResult,DHenN3Mul);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadDHen3
(
    CONST XMDHEN3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR          V;
    UINT              Element;
    static CONST UINT SignExtendX[]  = {0x00000000, 0xFFFFFC00};
    static CONST UINT SignExtendYZ[] = {0x00000000, 0xFFFFF800};

    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400);

    Element = pSource->v & 0x3FF;
    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendX[Element >> 9]);
    Element = (pSource->v >> 10) & 0x7FF;
    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]);
    Element = (pSource->v >> 21) & 0x7FF;
    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]);

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,g_XMMaskDHen3);
    // Convert x and y to unsigned
    vResult = _mm_xor_ps(vResult,g_XMXorDHen3);
    // Convert to floats
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert x and y back to signed
    vResult = _mm_add_ps(vResult,g_XMAddDHen3);
    // Normalize x to -512-511.0f and y and z to -1024-1023.0f
    vResult = _mm_mul_ps(vResult,g_XMMulDHen3);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadU565
(
    CONST XMU565* pSource
)
{
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    static const XMVECTORI32 U565And = {0x1F,0x3F<<5,0x1F<<11,0};
    static const XMVECTORF32 U565Mul = {1.0f,1.0f/32.0f,1.0f/2048.f,0};
    XMASSERT(pSource);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,U565And);
    // Convert to floats
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Normalize x, y, and z
    vResult = _mm_mul_ps(vResult,U565Mul);
    return vResult;
#else
    XMVECTOR          V;
    UINT              Element;

    XMASSERT(pSource);

    Element = pSource->v & 0x1F;
    V.vector4_f32[0] = (FLOAT)Element;
    Element = (pSource->v >> 5) & 0x3F;
    V.vector4_f32[1] = (FLOAT)Element;
    Element = (pSource->v >> 11) & 0x1F;
    V.vector4_f32[2] = (FLOAT)Element;

    return V;
#endif // !_XM_SSE_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat3PK
(
    CONST XMFLOAT3PK* pSource
)
{
    _DECLSPEC_ALIGN_16_ UINT Result[4];
    UINT Mantissa;
    UINT Exponent;

    XMASSERT(pSource);

    // X Channel (6-bit mantissa)
    Mantissa = pSource->xm;

    if ( pSource->xe == 0x1f ) // INF or NAN
    {
        Result[0] = 0x7f800000 | (pSource->xm << 17);
    }
    else
    {
        if ( pSource->xe != 0 ) // The value is normalized
        {
            Exponent = pSource->xe;
        }
        else if (Mantissa != 0) // The value is denormalized
        {
            // Normalize the value in the resulting float
            Exponent = 1;
            do
            {
                Exponent--;
                Mantissa <<= 1;
            } while ((Mantissa & 0x40) == 0);
            Mantissa &= 0x3F;
        }
        else // The value is zero
        {
            Exponent = (UINT)-112;
        }

        Result[0] = ((Exponent + 112) << 23) | (Mantissa << 17);
    }

    // Y Channel (6-bit mantissa)
    Mantissa = pSource->ym;

    if ( pSource->ye == 0x1f ) // INF or NAN
    {
        Result[1] = 0x7f800000 | (pSource->ym << 17);
    }
    else
    {
        if ( pSource->ye != 0 ) // The value is normalized
        {
            Exponent = pSource->ye;
        }
        else if (Mantissa != 0) // The value is denormalized
        {
            // Normalize the value in the resulting float
            Exponent = 1;
            do
            {
                Exponent--;
                Mantissa <<= 1;
            } while ((Mantissa & 0x40) == 0);
            Mantissa &= 0x3F;
        }
        else // The value is zero
        {
            Exponent = (UINT)-112;
        }

        Result[1] = ((Exponent + 112) << 23) | (Mantissa << 17);
    }

    // Z Channel (5-bit mantissa)
    Mantissa = pSource->zm;

    if ( pSource->ze == 0x1f ) // INF or NAN
    {
        Result[2] = 0x7f800000 | (pSource->zm << 17);
    }
    else
    {
        if ( pSource->ze != 0 ) // The value is normalized
        {
            Exponent = pSource->ze;
        }
        else if (Mantissa != 0) // The value is denormalized
        {
            // Normalize the value in the resulting float
            Exponent = 1;
            do
            {
                Exponent--;
                Mantissa <<= 1;
            } while ((Mantissa & 0x20) == 0);
            Mantissa &= 0x1F;
        }
        else // The value is zero
        {
            Exponent = (UINT)-112;
        }

        Result[2] = ((Exponent + 112) << 23) | (Mantissa << 18);
    }

    return XMLoadFloat3A( (const XMFLOAT3A*)&Result );
}
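
// Worked example: the packed x channel stores a 5-bit exponent (bias 15) and
// a 6-bit mantissa. For xe = 15, xm = 0 the normalized path produces
// ((15 + 112) << 23) = 0x3F800000 = 1.0f, the same rebias used for halves;
// the 6-bit mantissa is shifted up by 17 to sit at the top of the 23-bit
// single-precision mantissa.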
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat3SE
(
    CONST XMFLOAT3SE* pSource
)
{
    _DECLSPEC_ALIGN_16_ UINT Result[4];
    UINT Mantissa;
    UINT Exponent, ExpBits;

    XMASSERT(pSource);

    if ( pSource->e == 0x1f ) // INF or NAN
    {
        Result[0] = 0x7f800000 | (pSource->xm << 14);
        Result[1] = 0x7f800000 | (pSource->ym << 14);
        Result[2] = 0x7f800000 | (pSource->zm << 14);
    }
    else if ( pSource->e != 0 ) // The values are all normalized
    {
        Exponent = pSource->e;

        ExpBits = (Exponent + 112) << 23;

        Mantissa = pSource->xm;
        Result[0] = ExpBits | (Mantissa << 14);

        Mantissa = pSource->ym;
        Result[1] = ExpBits | (Mantissa << 14);

        Mantissa = pSource->zm;
        Result[2] = ExpBits | (Mantissa << 14);
    }
    else
    {
        // X Channel
        Mantissa = pSource->xm;

        if (Mantissa != 0) // The value is denormalized
        {
            // Normalize the value in the resulting float
            Exponent = 1;
            do
            {
                Exponent--;
                Mantissa <<= 1;
            } while ((Mantissa & 0x200) == 0);
            Mantissa &= 0x1FF;
        }
        else // The value is zero
        {
            Exponent = (UINT)-112;
        }

        Result[0] = ((Exponent + 112) << 23) | (Mantissa << 14);

        // Y Channel
        Mantissa = pSource->ym;

        if (Mantissa != 0) // The value is denormalized
        {
            // Normalize the value in the resulting float
            Exponent = 1;
            do
            {
                Exponent--;
                Mantissa <<= 1;
            } while ((Mantissa & 0x200) == 0);
            Mantissa &= 0x1FF;
        }
        else // The value is zero
        {
            Exponent = (UINT)-112;
        }

        Result[1] = ((Exponent + 112) << 23) | (Mantissa << 14);

        // Z Channel
        Mantissa = pSource->zm;

        if (Mantissa != 0) // The value is denormalized
        {
            // Normalize the value in the resulting float
            Exponent = 1;
            do
            {
                Exponent--;
                Mantissa <<= 1;
            } while ((Mantissa & 0x200) == 0);
            Mantissa &= 0x1FF;
        }
        else // The value is zero
        {
            Exponent = (UINT)-112;
        }

        Result[2] = ((Exponent + 112) << 23) | (Mantissa << 14);
    }

    return XMLoadFloat3A( (const XMFLOAT3A*)&Result );
}
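
// Worked example: all three mantissas share the single 5-bit exponent e.
// For e = 15 and xm = 0 the x channel decodes to ((15 + 112) << 23) =
// 0x3F800000 = 1.0f; a 9-bit mantissa value m adds m/512 of that magnitude,
// since the mantissa is shifted up by 14 into the single-precision mantissa.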
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadInt4
(
    CONST UINT* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);

    V.vector4_u32[0] = pSource[0];
    V.vector4_u32[1] = pSource[1];
    V.vector4_u32[2] = pSource[2];
    V.vector4_u32[3] = pSource[3];

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);

    __m128i V = _mm_loadu_si128( (const __m128i*)pSource );
    return reinterpret_cast<__m128 *>(&V)[0];

#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadSInt4
(
    CONST XMINT4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);

#ifdef _XBOX_VER
    V = XMLoadInt4( (const UINT*)pSource );
    return XMConvertVectorIntToFloat( V, 0 );
#else
    V.vector4_f32[0] = (float)pSource->x;
    V.vector4_f32[1] = (float)pSource->y;
    V.vector4_f32[2] = (float)pSource->z;
    V.vector4_f32[3] = (float)pSource->w;
    return V;
#endif

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    __m128i V = _mm_loadu_si128( (const __m128i*)pSource );
    return _mm_cvtepi32_ps(V);
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUInt4
(
    CONST XMUINT4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);

    V.vector4_f32[0] = (float)pSource->x;
    V.vector4_f32[1] = (float)pSource->y;
    V.vector4_f32[2] = (float)pSource->z;
    V.vector4_f32[3] = (float)pSource->w;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    __m128i V = _mm_loadu_si128( (const __m128i*)pSource );
    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
    // Determine which ones need the fix.
    XMVECTOR vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&V)[0],g_XMNegativeZero);
    // Force all values positive
    XMVECTOR vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&V)[0],vMask);
    // Convert to floats
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert 0x80000000 -> 0xFFFFFFFF
    __m128i iMask = _mm_srai_epi32(reinterpret_cast<const __m128i *>(&vMask)[0],31);
    // For only the ones that are too big, add the fixup
    vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&iMask)[0],g_XMFixUnsigned);
    vResult = _mm_add_ps(vResult,vMask);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadInt4A
(
    CONST UINT* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    V.vector4_u32[0] = pSource[0];
    V.vector4_u32[1] = pSource[1];
    V.vector4_u32[2] = pSource[2];
    V.vector4_u32[3] = pSource[3];

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    __m128i V = _mm_load_si128( (const __m128i*)pSource );
    return reinterpret_cast<__m128 *>(&V)[0];

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat4
(
    CONST XMFLOAT4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);

    ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0];
    ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0];
    ((UINT *)(&V.vector4_f32[2]))[0] = ((const UINT *)(&pSource->z))[0];
    ((UINT *)(&V.vector4_f32[3]))[0] = ((const UINT *)(&pSource->w))[0];

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);

    return _mm_loadu_ps( &pSource->x );
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat4A
(
    CONST XMFLOAT4A* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    V.vector4_f32[0] = pSource->x;
    V.vector4_f32[1] = pSource->y;
    V.vector4_f32[2] = pSource->z;
    V.vector4_f32[3] = pSource->w;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    return _mm_load_ps( &pSource->x );

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadHalf4
(
    CONST XMHALF4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMASSERT(pSource);
    {
    XMVECTOR vResult = {
        XMConvertHalfToFloat(pSource->x),
        XMConvertHalfToFloat(pSource->y),
        XMConvertHalfToFloat(pSource->z),
        XMConvertHalfToFloat(pSource->w)
    };
    return vResult;
    }
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    {
    XMVECTOR vResult = {
        XMConvertHalfToFloat(pSource->x),
        XMConvertHalfToFloat(pSource->y),
        XMConvertHalfToFloat(pSource->z),
        XMConvertHalfToFloat(pSource->w)
    };
    return vResult;
    }
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadShortN4
(
    CONST XMSHORTN4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMASSERT(pSource);
    {
    XMVECTOR vResult = {
        (pSource->x == -32768) ? -1.f : ((FLOAT)pSource->x * (1.0f/32767.0f)),
        (pSource->y == -32768) ? -1.f : ((FLOAT)pSource->y * (1.0f/32767.0f)),
        (pSource->z == -32768) ? -1.f : ((FLOAT)pSource->z * (1.0f/32767.0f)),
        (pSource->w == -32768) ? -1.f : ((FLOAT)pSource->w * (1.0f/32767.0f))
    };
    return vResult;
    }
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    // Splat the color in all four entries (x,z,y,w)
    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
    // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000
    __m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
    // x and z are unsigned! Flip the bits to convert the order to signed
    vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // x and z - 0x8000 to complete the conversion
    vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16);
    // Convert to -1.0f - 1.0f
    vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16Z16W16);
    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
    // Clamp result (for case of -32768)
    return _mm_max_ps( vTemp, g_XMNegativeOne );
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
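
// Worked example: _mm_load1_pd above splats the whole 64-bit struct into both
// halves of the register, so after masking, the lanes hold x,z,y,w; the
// _MM_SHUFFLE(3,1,2,0) swizzle restores x,y,z,w. For w = 32767 the final
// lane normalizes to exactly 1.0f after the clamp.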
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadShort4
(
    CONST XMSHORT4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);

    V.vector4_f32[0] = (FLOAT)pSource->x;
    V.vector4_f32[1] = (FLOAT)pSource->y;
    V.vector4_f32[2] = (FLOAT)pSource->z;
    V.vector4_f32[3] = (FLOAT)pSource->w;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    // Splat the color in all four entries (x,z,y,w)
    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
    // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000
    __m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
    // x and z are unsigned! Flip the bits to convert the order to signed
    vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // x and z - 0x8000 to complete the conversion
    vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16);
    // Fix y and w because they are 65536 too large
    vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16);
    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
    return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUShortN4
(
    CONST XMUSHORTN4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);

    V.vector4_f32[0] = (FLOAT)pSource->x / 65535.0f;
    V.vector4_f32[1] = (FLOAT)pSource->y / 65535.0f;
    V.vector4_f32[2] = (FLOAT)pSource->z / 65535.0f;
    V.vector4_f32[3] = (FLOAT)pSource->w / 65535.0f;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 FixupY16W16 = {1.0f/65535.0f,1.0f/65535.0f,1.0f/(65535.0f*65536.0f),1.0f/(65535.0f*65536.0f)};
    static const XMVECTORF32 FixaddY16W16 = {0,0,32768.0f*65536.0f,32768.0f*65536.0f};
    XMASSERT(pSource);
    // Splat the color in all four entries (x,z,y,w)
    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
    // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000
    __m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
    // y and w are signed! Flip the bits to convert the order to unsigned
    vTemp = _mm_xor_ps(vTemp,g_XMFlipZW);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // y and w + 0x8000 to complete the conversion
    vTemp = _mm_add_ps(vTemp,FixaddY16W16);
    // Fix y and w because they are 65536 too large
    vTemp = _mm_mul_ps(vTemp,FixupY16W16);
    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
    return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUShort4
(
    CONST XMUSHORT4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);

    V.vector4_f32[0] = (FLOAT)pSource->x;
    V.vector4_f32[1] = (FLOAT)pSource->y;
    V.vector4_f32[2] = (FLOAT)pSource->z;
    V.vector4_f32[3] = (FLOAT)pSource->w;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 FixaddY16W16 = {0,0,32768.0f,32768.0f};
    XMASSERT(pSource);
    // Splat the color in all four entries (x,z,y,w)
    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
    // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000
    __m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
    // y and w are signed! Flip the bits to convert the order to unsigned
    vTemp = _mm_xor_ps(vTemp,g_XMFlipZW);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Fix y and w because they are 65536 too large
    vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16);
    // y and w + 0x8000 to complete the conversion
    vTemp = _mm_add_ps(vTemp,FixaddY16W16);
    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
    return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadXIcoN4
(
    CONST XMXICON4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    UINT Element;
    static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000};

    XMASSERT(pSource);
    XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull);
    XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull);
    XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull);

    Element = (UINT)(pSource->v & 0xFFFFF);
    V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
    Element = (UINT)((pSource->v >> 20) & 0xFFFFF);
    V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
    Element = (UINT)((pSource->v >> 40) & 0xFFFFF);
    V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
    V.vector4_f32[3] = (FLOAT)(pSource->v >> 60) / 15.0f;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull);
    XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull);
    XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull);
    static const XMVECTORF32 LoadXIcoN4Mul = {1.0f/524287.0f,1.0f/(524287.0f*4096.0f),1.0f/524287.0f,1.0f/(15.0f*4096.0f*65536.0f)};

    // Grab the 64 bit structure
    __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
    // By shifting down 8 bits, y and z are in separate 32 bit elements
    __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
    // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
    XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
    // Fix the entries to x,y,z,w
    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
    // Mask off x, y, z and w
    vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
    // Flip the sign bits of the signed fields so they convert through the signed path
    vTemp = _mm_xor_ps(vTemp,g_XMXorXIco4);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Subtract the biases to complete the conversion
    vTemp = _mm_add_ps(vTemp,g_XMAddXIco4);
    // Fix y and w because they are too large
    vTemp = _mm_mul_ps(vTemp,LoadXIcoN4Mul);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
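// Layout sketch (illustrative): XMXICON4 packs x, y and z as signed 20-bit
// fields and w as an unsigned 4-bit field in one 64-bit word, so a scalar
// decode of x mirrors the no-intrinsics path above:
//
//     UINT  Element = (UINT)(pSource->v & 0xFFFFF);              // low 20 bits
//     INT   iX      = (INT)(Element | ((Element >> 19) ? 0xFFF00000 : 0));
//     FLOAT x       = (FLOAT)iX / 524287.0f;                     // to [-1,1]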
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadXIco4
(
    CONST XMXICO4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    UINT Element;
    static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000};

    XMASSERT(pSource);
    XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull);
    XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull);
    XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull);

    Element = (UINT)(pSource->v & 0xFFFFF);
    V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
    Element = (UINT)((pSource->v >> 20) & 0xFFFFF);
    V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
    Element = (UINT)((pSource->v >> 40) & 0xFFFFF);
    V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
    V.vector4_f32[3] = (FLOAT)(pSource->v >> 60);

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull);
    XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull);
    XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull);

    // Grab the 64 bit structure
    __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
    // By shifting down 8 bits, y and z are in separate 32 bit elements
    __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
    // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
    XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
    // Fix the entries to x,y,z,w
    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
    // Mask off x, y, z and w
    vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
    // Flip the sign bits of the signed fields so they convert through the signed path
    vTemp = _mm_xor_ps(vTemp,g_XMXorXIco4);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Subtract the biases to complete the conversion
    vTemp = _mm_add_ps(vTemp,g_XMAddXIco4);
    // Fix y and w because they are too large
    vTemp = _mm_mul_ps(vTemp,g_XMMulIco4);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUIcoN4
(
    CONST XMUICON4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);

    V.vector4_f32[0] = (FLOAT)(pSource->v & 0xFFFFF) / 1048575.0f;
    V.vector4_f32[1] = (FLOAT)((pSource->v >> 20) & 0xFFFFF) / 1048575.0f;
    V.vector4_f32[2] = (FLOAT)((pSource->v >> 40) & 0xFFFFF) / 1048575.0f;
    V.vector4_f32[3] = (FLOAT)(pSource->v >> 60) / 15.0f;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 LoadUIcoN4Mul = {1.0f/1048575.0f,1.0f/(1048575.0f*4096.0f),1.0f/1048575.0f,1.0f/(15.0f*4096.0f*65536.0f)};

    // Grab the 64 bit structure
    __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
    // By shifting down 8 bits, y and z are in separate 32 bit elements
    __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
    // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
    XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
    // Fix the entries to x,y,z,w
    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
    // Mask off x, y, z and w
    vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
    // y and w reach the sign bit! Flip them so the values convert through the signed path
    vTemp = _mm_xor_ps(vTemp,g_XMFlipYW);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Add the biases back to complete the conversion
    vTemp = _mm_add_ps(vTemp,g_XMAddUIco4);
    // Fix y and w because they are too large
    vTemp = _mm_mul_ps(vTemp,LoadUIcoN4Mul);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUIco4
(
    CONST XMUICO4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);

    V.vector4_f32[0] = (FLOAT)(pSource->v & 0xFFFFF);
    V.vector4_f32[1] = (FLOAT)((pSource->v >> 20) & 0xFFFFF);
    V.vector4_f32[2] = (FLOAT)((pSource->v >> 40) & 0xFFFFF);
    V.vector4_f32[3] = (FLOAT)(pSource->v >> 60);

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    // Grab the 64 bit structure
    __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
    // By shifting down 8 bits, y and z are in separate 32 bit elements
    __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
    // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
    XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
    // Fix the entries to x,y,z,w
    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
    // Mask off x, y, z and w
    vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
    // y and w reach the sign bit! Flip them so the values convert through the signed path
    vTemp = _mm_xor_ps(vTemp,g_XMFlipYW);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Add the biases back to complete the conversion
    vTemp = _mm_add_ps(vTemp,g_XMAddUIco4);
    // Fix y and w because they are too large
    vTemp = _mm_mul_ps(vTemp,g_XMMulIco4);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadIcoN4
(
    CONST XMICON4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    UINT Element;
    static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000};
    static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFF0};

    XMASSERT(pSource);

    Element = (UINT)(pSource->v & 0xFFFFF);
    V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
    Element = (UINT)((pSource->v >> 20) & 0xFFFFF);
    V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
    Element = (UINT)((pSource->v >> 40) & 0xFFFFF);
    V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
    Element = (UINT)(pSource->v >> 60);
    V.vector4_f32[3] = (FLOAT)(INT)(Element | SignExtendW[Element >> 3]) / 7.0f;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 LoadIcoN4Mul = {1.0f/524287.0f,1.0f/(524287.0f*4096.0f),1.0f/524287.0f,1.0f/(7.0f*4096.0f*65536.0f)};

    // Grab the 64 bit structure
    __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
    // By shifting down 8 bits, y and z are in separate 32 bit elements
    __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
    // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
    XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
    // Fix the entries to x,y,z,w
    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
    // Mask off x, y, z and w
    vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
    // Flip the sign bits of the signed fields so they convert through the signed path
    vTemp = _mm_xor_ps(vTemp,g_XMXorIco4);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Subtract the biases to complete the conversion
    vTemp = _mm_add_ps(vTemp,g_XMAddIco4);
    // Fix y and w because they are too large
    vTemp = _mm_mul_ps(vTemp,LoadIcoN4Mul);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadIco4
(
    CONST XMICO4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    UINT Element;
    static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000};
    static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFF0};

    XMASSERT(pSource);

    Element = (UINT)(pSource->v & 0xFFFFF);
    V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
    Element = (UINT)((pSource->v >> 20) & 0xFFFFF);
    V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
    Element = (UINT)((pSource->v >> 40) & 0xFFFFF);
    V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
    Element = (UINT)(pSource->v >> 60);
    V.vector4_f32[3] = (FLOAT)(INT)(Element | SignExtendW[Element >> 3]);

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    // Grab the 64 bit structure
    __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
    // By shifting down 8 bits, y and z are in separate 32 bit elements
    __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
    // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
    XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
    // Fix the entries to x,y,z,w
    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
    // Mask off x, y, z and w
    vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
    // Flip the sign bits of the signed fields so they convert through the signed path
    vTemp = _mm_xor_ps(vTemp,g_XMXorIco4);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Subtract the biases to complete the conversion
    vTemp = _mm_add_ps(vTemp,g_XMAddIco4);
    // Fix y and w because they are too large
    vTemp = _mm_mul_ps(vTemp,g_XMMulIco4);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadXDecN4
(
    CONST XMXDECN4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    UINT Element;
    static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00};

    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);

    Element = pSource->v & 0x3FF;
    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
    Element = (pSource->v >> 10) & 0x3FF;
    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
    Element = (pSource->v >> 20) & 0x3FF;
    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
    V.vector4_f32[3] = (FLOAT)(pSource->v >> 30) / 3.0f;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    // Splat the value in all four entries
    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off the 10:10:10:2 fields
    vTemp = _mm_and_ps(vTemp,g_XMMaskA2B10G10R10);
    // Flip the sign bits of x, y and z (and the high bit of w) so the
    // whole word converts through the signed integer path
    vTemp = _mm_xor_ps(vTemp,g_XMFlipA2B10G10R10);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Undo the bias after the conversion
    vTemp = _mm_add_ps(vTemp,g_XMFixAA2B10G10R10);
    // Normalize x, y and z to -1.0f..1.0f and w to 0.0f..1.0f
    return _mm_mul_ps(vTemp,g_XMNormalizeA2B10G10R10);
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
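// Decode sketch (illustrative): x, y and z are signed 10-bit fields normalized
// by 511, w is an unsigned 2-bit field normalized by 3:
//
//     XMXDECN4 dec;
//     dec.v = (3u << 30) | (511u << 20) | (0u << 10) | 511u;
//     XMVECTOR v = XMLoadXDecN4(&dec);     // approx (1.0f, 0.0f, 1.0f, 1.0f)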
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadXDec4
(
    CONST XMXDEC4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    UINT Element;
    static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00};

    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);

    Element = pSource->v & 0x3FF;
    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
    Element = (pSource->v >> 10) & 0x3FF;
    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
    Element = (pSource->v >> 20) & 0x3FF;
    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
    V.vector4_f32[3] = (FLOAT)(pSource->v >> 30);

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT((pSource->v & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
    static const XMVECTORI32 XDec4Xor = {0x200, 0x200<<10, 0x200<<20, 0x80000000};
    static const XMVECTORF32 XDec4Add = {-512.0f,-512.0f*1024.0f,-512.0f*1024.0f*1024.0f,32768*65536.0f};

    // Splat the value in all four entries
    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off the 10:10:10:2 fields
    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
    // Flip the sign bits of x, y and z (and the high bit of w) so the
    // whole word converts through the signed integer path
    vTemp = _mm_xor_ps(vTemp,XDec4Xor);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Undo the bias after the conversion
    vTemp = _mm_add_ps(vTemp,XDec4Add);
    // Scale y, z and w back down
    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUDecN4
(
    CONST XMUDECN4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    UINT Element;

    XMASSERT(pSource);

    Element = pSource->v & 0x3FF;
    V.vector4_f32[0] = (FLOAT)Element / 1023.0f;
    Element = (pSource->v >> 10) & 0x3FF;
    V.vector4_f32[1] = (FLOAT)Element / 1023.0f;
    Element = (pSource->v >> 20) & 0x3FF;
    V.vector4_f32[2] = (FLOAT)Element / 1023.0f;
    V.vector4_f32[3] = (FLOAT)(pSource->v >> 30) / 3.0f;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    static const XMVECTORF32 UDecN4Mul = {1.0f/1023.0f,1.0f/(1023.0f*1024.0f),1.0f/(1023.0f*1024.0f*1024.0f),1.0f/(3.0f*1024.0f*1024.0f*1024.0f)};
    // Splat the value in all four entries
    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off the 10:10:10:2 fields
    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
    // w occupies the sign bit! Flip it so the value converts through the signed path
    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Add the w bias back after the conversion
    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
    // Convert 0-1023 (0-3 for w) to 0.0f-1.0f
    vTemp = _mm_mul_ps(vTemp,UDecN4Mul);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
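// Note on the SSE constants (sketch): after masking, the y, z and w fields are
// still shifted up by 10, 20 and 30 bits, so each reciprocal in UDecN4Mul folds
// the unshift and the normalization into a single multiply; for y:
//
//     (FLOAT)(bits << 10) * (1.0f/(1023.0f*1024.0f)) == (FLOAT)bits / 1023.0f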
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUDec4
(
    CONST XMUDEC4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    UINT Element;

    XMASSERT(pSource);

    Element = pSource->v & 0x3FF;
    V.vector4_f32[0] = (FLOAT)Element;
    Element = (pSource->v >> 10) & 0x3FF;
    V.vector4_f32[1] = (FLOAT)Element;
    Element = (pSource->v >> 20) & 0x3FF;
    V.vector4_f32[2] = (FLOAT)Element;
    V.vector4_f32[3] = (FLOAT)(pSource->v >> 30);

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    // Splat the value in all four entries
    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off the 10:10:10:2 fields
    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
    // w occupies the sign bit! Flip it so the value converts through the signed path
    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Add the w bias back after the conversion
    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
    // Scale y, z and w back down
    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadDecN4
(
    CONST XMDECN4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    UINT Element;
    static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00};
    static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFFC};

    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);

    Element = pSource->v & 0x3FF;
    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
    Element = (pSource->v >> 10) & 0x3FF;
    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
    Element = (pSource->v >> 20) & 0x3FF;
    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
    Element = pSource->v >> 30;
    V.vector4_f32[3] = (FLOAT)(SHORT)(Element | SignExtendW[Element >> 1]);

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT((pSource->v & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);
    static const XMVECTORF32 DecN4Mul = {1.0f/511.0f,1.0f/(511.0f*1024.0f),1.0f/(511.0f*1024.0f*1024.0f),1.0f/(1024.0f*1024.0f*1024.0f)};
    // Splat the value in all four entries
    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off the 10:10:10:2 fields
    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
    // Flip the sign bits of x, y and z so the fields convert through the signed path
    vTemp = _mm_xor_ps(vTemp,g_XMXorDec4);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Subtract the biases to complete the conversion
    vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
    // Normalize x, y and z and scale w back down
    vTemp = _mm_mul_ps(vTemp,DecN4Mul);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadDec4
(
    CONST XMDEC4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    UINT Element;
    static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00};
    static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFFC};

    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);

    Element = pSource->v & 0x3FF;
    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
    Element = (pSource->v >> 10) & 0x3FF;
    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
    Element = (pSource->v >> 20) & 0x3FF;
    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
    Element = pSource->v >> 30;
    V.vector4_f32[3] = (FLOAT)(SHORT)(Element | SignExtendW[Element >> 1]);

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT((pSource->v & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);

    // Splat the value in all four entries
    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off the 10:10:10:2 fields
    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
    // Flip the sign bits of x, y and z so the fields convert through the signed path
    vTemp = _mm_xor_ps(vTemp,g_XMXorDec4);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Subtract the biases to complete the conversion
    vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
    // Scale y, z and w back down
    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUByteN4
(
    CONST XMUBYTEN4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);

    V.vector4_f32[0] = (FLOAT)pSource->x / 255.0f;
    V.vector4_f32[1] = (FLOAT)pSource->y / 255.0f;
    V.vector4_f32[2] = (FLOAT)pSource->z / 255.0f;
    V.vector4_f32[3] = (FLOAT)pSource->w / 255.0f;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 LoadUByteN4Mul = {1.0f/255.0f,1.0f/(255.0f*256.0f),1.0f/(255.0f*65536.0f),1.0f/(255.0f*65536.0f*256.0f)};
    XMASSERT(pSource);
    // Splat the value in all four entries
    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0xFF,y&0xFF00,z&0xFF0000,w&0xFF000000
    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
    // w is signed! Flip the bits to convert the order to unsigned
    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Add the w bias back to complete the conversion
    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
    // Fix y, z and w because they are too large, and normalize
    vTemp = _mm_mul_ps(vTemp,LoadUByteN4Mul);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
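// Usage sketch (illustrative): a common case is unpacking an RGBA8 color to
// normalized floats:
//
//     XMUBYTEN4 c;
//     c.x = 255; c.y = 128; c.z = 0; c.w = 255;
//     XMVECTOR v = XMLoadUByteN4(&c);      // approx (1.0f, 0.502f, 0.0f, 1.0f)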
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUByte4
(
    CONST XMUBYTE4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);

    V.vector4_f32[0] = (FLOAT)pSource->x;
    V.vector4_f32[1] = (FLOAT)pSource->y;
    V.vector4_f32[2] = (FLOAT)pSource->z;
    V.vector4_f32[3] = (FLOAT)pSource->w;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 LoadUByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)};
    XMASSERT(pSource);
    // Splat the value in all four entries
    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0xFF,y&0xFF00,z&0xFF0000,w&0xFF000000
    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
    // w is signed! Flip the bits to convert the order to unsigned
    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Add the w bias back to complete the conversion
    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
    // Fix y, z and w because they are too large
    vTemp = _mm_mul_ps(vTemp,LoadUByte4Mul);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadByteN4
(
    CONST XMBYTEN4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);

    V.vector4_f32[0] = (pSource->x == -128) ? -1.f : ((FLOAT)pSource->x / 127.0f);
    V.vector4_f32[1] = (pSource->y == -128) ? -1.f : ((FLOAT)pSource->y / 127.0f);
    V.vector4_f32[2] = (pSource->z == -128) ? -1.f : ((FLOAT)pSource->z / 127.0f);
    V.vector4_f32[3] = (pSource->w == -128) ? -1.f : ((FLOAT)pSource->w / 127.0f);

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 LoadByteN4Mul = {1.0f/127.0f,1.0f/(127.0f*256.0f),1.0f/(127.0f*65536.0f),1.0f/(127.0f*65536.0f*256.0f)};
    XMASSERT(pSource);
    // Splat the value in all four entries
    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0xFF,y&0xFF00,z&0xFF0000,w&0xFF000000
    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
    // Flip the sign bits of x, y and z so the fields convert through the signed path
    vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Subtract the x, y and z biases to complete the conversion
    vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
    // Fix y, z and w because they are too large, and normalize
    vTemp = _mm_mul_ps(vTemp,LoadByteN4Mul);
    // Clamp result (for case of -128)
    return _mm_max_ps( vTemp, g_XMNegativeOne );
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
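// Edge case sketch (illustrative): -128/127 would fall below -1.0f, so both
// paths clamp it; the SSE path uses the final _mm_max_ps for this:
//
//     XMBYTEN4 b;
//     b.x = -128; b.y = -127; b.z = 0; b.w = 127;
//     XMVECTOR v = XMLoadByteN4(&b);       // v = (-1.0f, -1.0f, 0.0f, 1.0f)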
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadByte4
(
    CONST XMBYTE4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);

    V.vector4_f32[0] = (FLOAT)pSource->x;
    V.vector4_f32[1] = (FLOAT)pSource->y;
    V.vector4_f32[2] = (FLOAT)pSource->z;
    V.vector4_f32[3] = (FLOAT)pSource->w;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 LoadByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)};
    XMASSERT(pSource);
    // Splat the value in all four entries
    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0xFF,y&0xFF00,z&0xFF0000,w&0xFF000000
    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
    // Flip the sign bits of x, y and z so the fields convert through the signed path
    vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Subtract the x, y and z biases to complete the conversion
    vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
    // Fix y, z and w because they are too large
    vTemp = _mm_mul_ps(vTemp,LoadByte4Mul);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUNibble4
(
     CONST XMUNIBBLE4* pSource
)
{
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    static const XMVECTORI32 UNibble4And = {0xF,0xF0,0xF00,0xF000};
    static const XMVECTORF32 UNibble4Mul = {1.0f,1.0f/16.f,1.0f/256.f,1.0f/4096.f};
    XMASSERT(pSource);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y, z and w
    vResult = _mm_and_ps(vResult,UNibble4And);
    // Convert to float
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Scale y, z and w back down to 0-15
    vResult = _mm_mul_ps(vResult,UNibble4Mul);
    return vResult;
#else
    XMVECTOR V;
    UINT Element;

    XMASSERT(pSource);

    Element = pSource->v & 0xF;
    V.vector4_f32[0] = (FLOAT)Element;
    Element = (pSource->v >> 4) & 0xF;
    V.vector4_f32[1] = (FLOAT)Element;
    Element = (pSource->v >> 8) & 0xF;
    V.vector4_f32[2] = (FLOAT)Element;
    Element = (pSource->v >> 12) & 0xF;
    V.vector4_f32[3] = (FLOAT)Element;

    return V;
#endif // !_XM_SSE_INTRINSICS_
}
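// Usage sketch (illustrative): each nibble loads as a raw integer 0-15:
//
//     XMUNIBBLE4 n;
//     n.v = 0xF051;                        // w=0xF, z=0x0, y=0x5, x=0x1
//     XMVECTOR v = XMLoadUNibble4(&n);     // v = (1.0f, 5.0f, 0.0f, 15.0f)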
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadU555
(
     CONST XMU555* pSource
)
{
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    static const XMVECTORI32 U555And = {0x1F,0x1F<<5,0x1F<<10,0x8000};
    static const XMVECTORF32 U555Mul = {1.0f,1.0f/32.f,1.0f/1024.f,1.0f/32768.f};
    XMASSERT(pSource);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y, z and w
    vResult = _mm_and_ps(vResult,U555And);
    // Convert to float
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Scale y, z and w back down (w becomes 0.0f or 1.0f)
    vResult = _mm_mul_ps(vResult,U555Mul);
    return vResult;
#else
    XMVECTOR V;
    UINT Element;

    XMASSERT(pSource);

    Element = pSource->v & 0x1F;
    V.vector4_f32[0] = (FLOAT)Element;
    Element = (pSource->v >> 5) & 0x1F;
    V.vector4_f32[1] = (FLOAT)Element;
    Element = (pSource->v >> 10) & 0x1F;
    V.vector4_f32[2] = (FLOAT)Element;
    Element = (pSource->v >> 15) & 0x1;
    V.vector4_f32[3] = (FLOAT)Element;

    return V;
#endif // !_XM_SSE_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadColor
(
    CONST XMCOLOR* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pSource);
    {
    // INT -> Float conversions are done in one instruction.
    // UINT -> Float calls a runtime function. Keep in INT
    INT iColor = (INT)(pSource->c);
    XMVECTORF32 vColor = {
        (FLOAT)((iColor >> 16) & 0xFF) * (1.0f/255.0f),
        (FLOAT)((iColor >> 8) & 0xFF) * (1.0f/255.0f),
        (FLOAT)(iColor & 0xFF) * (1.0f/255.0f),
        (FLOAT)((iColor >> 24) & 0xFF) * (1.0f/255.0f)
    };
    return vColor.v;
    }
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    // Splat the color in all four entries
    __m128i vInt = _mm_set1_epi32(pSource->c);
    // Mask R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000
    vInt = _mm_and_si128(vInt,g_XMMaskA8R8G8B8);
    // a is unsigned! Flip the bit to convert the order to signed
    vInt = _mm_xor_si128(vInt,g_XMFlipA8R8G8B8);
    // Convert to floating point numbers
    XMVECTOR vTemp = _mm_cvtepi32_ps(vInt);
    // RGB + 0, A + 0x80000000.f to undo the signed order.
    vTemp = _mm_add_ps(vTemp,g_XMFixAA8R8G8B8);
    // Convert 0-255 to 0.0f-1.0f
    return _mm_mul_ps(vTemp,g_XMNormalizeA8R8G8B8);
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
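// Usage sketch (illustrative): XMCOLOR is packed as 0xAARRGGBB and loads as
// (r, g, b, a) normalized to 0.0f-1.0f:
//
//     XMCOLOR color;
//     color.c = 0xFF0080FF;                // A=0xFF, R=0x00, G=0x80, B=0xFF
//     XMVECTOR v = XMLoadColor(&color);    // approx (0.0f, 0.502f, 1.0f, 1.0f)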
//------------------------------------------------------------------------------

XMFINLINE XMMATRIX XMLoadFloat3x3
(
    CONST XMFLOAT3X3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMMATRIX M;

    XMASSERT(pSource);

    M.r[0].vector4_f32[0] = pSource->m[0][0];
    M.r[0].vector4_f32[1] = pSource->m[0][1];
    M.r[0].vector4_f32[2] = pSource->m[0][2];
    M.r[0].vector4_f32[3] = 0.0f;

    M.r[1].vector4_f32[0] = pSource->m[1][0];
    M.r[1].vector4_f32[1] = pSource->m[1][1];
    M.r[1].vector4_f32[2] = pSource->m[1][2];
    M.r[1].vector4_f32[3] = 0.0f;

    M.r[2].vector4_f32[0] = pSource->m[2][0];
    M.r[2].vector4_f32[1] = pSource->m[2][1];
    M.r[2].vector4_f32[2] = pSource->m[2][2];
    M.r[2].vector4_f32[3] = 0.0f;

    M.r[3].vector4_f32[0] = 0.0f;
    M.r[3].vector4_f32[1] = 0.0f;
    M.r[3].vector4_f32[2] = 0.0f;
    M.r[3].vector4_f32[3] = 1.0f;

    return M;

#elif defined(_XM_SSE_INTRINSICS_)
    XMMATRIX M;
    XMVECTOR V1, V2, V3, Z, T1, T2, T3, T4, T5;

    Z = _mm_setzero_ps();

    XMASSERT(pSource);

    V1 = _mm_loadu_ps( &pSource->m[0][0] );
    V2 = _mm_loadu_ps( &pSource->m[1][1] );
    V3 = _mm_load_ss( &pSource->m[2][2] );

    T1 = _mm_unpackhi_ps( V1, Z );
    T2 = _mm_unpacklo_ps( V2, Z );
    T3 = _mm_shuffle_ps( V3, T2, _MM_SHUFFLE( 0, 1, 0, 0 ) );
    T4 = _mm_movehl_ps( T2, T3 );
    T5 = _mm_movehl_ps( Z, T1 );

    M.r[0] = _mm_movelh_ps( V1, T1 );
    M.r[1] = _mm_add_ps( T4, T5 );
    M.r[2] = _mm_shuffle_ps( V2, V3, _MM_SHUFFLE(1, 0, 3, 2) );
    M.r[3] = g_XMIdentityR3;

    return M;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
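// Usage sketch (illustrative): a 3x3 matrix is promoted to a full XMMATRIX with
// the fourth row and column taken from the identity:
//
//     XMFLOAT3X3 rot;                      // assume rot holds a 3x3 rotation
//     XMMATRIX M = XMLoadFloat3x3(&rot);   // M.r[3] == (0,0,0,1)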
//------------------------------------------------------------------------------

XMFINLINE XMMATRIX XMLoadFloat4x3
(
    CONST XMFLOAT4X3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMMATRIX M;

    XMASSERT(pSource);

    ((UINT *)(&M.r[0].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[0][0]))[0];
    ((UINT *)(&M.r[0].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[0][1]))[0];
    ((UINT *)(&M.r[0].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[0][2]))[0];
    M.r[0].vector4_f32[3] = 0.0f;

    ((UINT *)(&M.r[1].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[1][0]))[0];
    ((UINT *)(&M.r[1].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[1][1]))[0];
    ((UINT *)(&M.r[1].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[1][2]))[0];
    M.r[1].vector4_f32[3] = 0.0f;

    ((UINT *)(&M.r[2].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[2][0]))[0];
    ((UINT *)(&M.r[2].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[2][1]))[0];
    ((UINT *)(&M.r[2].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[2][2]))[0];
    M.r[2].vector4_f32[3] = 0.0f;

    ((UINT *)(&M.r[3].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[3][0]))[0];
    ((UINT *)(&M.r[3].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[3][1]))[0];
    ((UINT *)(&M.r[3].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[3][2]))[0];
    M.r[3].vector4_f32[3] = 1.0f;

    return M;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    // Use unaligned load instructions to
    // load the 12 floats
    // vTemp1 = x1,y1,z1,x2
    XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]);
    // vTemp2 = y2,z2,x3,y3
    XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]);
    // vTemp4 = z3,x4,y4,z4
    XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]);
    // vTemp3 = x3,y3,z3,z3
    XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2));
    // vTemp2 = y2,z2,x2,x2
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
    // vTemp2 = x2,y2,z2,z2
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(1,1,0,2));
    // vTemp1 = x1,y1,z1,0
    vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
    // vTemp2 = x2,y2,z2,0
    vTemp2 = _mm_and_ps(vTemp2,g_XMMask3);
    // vTemp3 = x3,y3,z3,0
    vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
    // vTemp4i = x4,y4,z4,0
    __m128i vTemp4i = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vTemp4)[0],32/8);
    // vTemp4i = x4,y4,z4,1.0f
    vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
    XMMATRIX M(vTemp1,
            vTemp2,
            vTemp3,
            reinterpret_cast<const __m128 *>(&vTemp4i)[0]);
    return M;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMMATRIX XMLoadFloat4x3A
(
    CONST XMFLOAT4X3A* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMMATRIX M;

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    M.r[0].vector4_f32[0] = pSource->m[0][0];
    M.r[0].vector4_f32[1] = pSource->m[0][1];
    M.r[0].vector4_f32[2] = pSource->m[0][2];
    M.r[0].vector4_f32[3] = 0.0f;

    M.r[1].vector4_f32[0] = pSource->m[1][0];
    M.r[1].vector4_f32[1] = pSource->m[1][1];
    M.r[1].vector4_f32[2] = pSource->m[1][2];
    M.r[1].vector4_f32[3] = 0.0f;

    M.r[2].vector4_f32[0] = pSource->m[2][0];
    M.r[2].vector4_f32[1] = pSource->m[2][1];
    M.r[2].vector4_f32[2] = pSource->m[2][2];
    M.r[2].vector4_f32[3] = 0.0f;

    M.r[3].vector4_f32[0] = pSource->m[3][0];
    M.r[3].vector4_f32[1] = pSource->m[3][1];
    M.r[3].vector4_f32[2] = pSource->m[3][2];
    M.r[3].vector4_f32[3] = 1.0f;

    return M;

#elif defined(_XM_SSE_INTRINSICS_)

    // Use aligned load instructions to
    // load the 12 floats
    // vTemp1 = x1,y1,z1,x2
    XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]);
    // vTemp2 = y2,z2,x3,y3
    XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]);
    // vTemp4 = z3,x4,y4,z4
    XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]);
    // vTemp3 = x3,y3,z3,z3
    XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2));
    // vTemp2 = y2,z2,x2,x2
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
    // vTemp2 = x2,y2,z2,z2
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(1,1,0,2));
    // vTemp1 = x1,y1,z1,0
    vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
    // vTemp2 = x2,y2,z2,0
    vTemp2 = _mm_and_ps(vTemp2,g_XMMask3);
    // vTemp3 = x3,y3,z3,0
    vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
    // vTemp4i = x4,y4,z4,0
    __m128i vTemp4i = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vTemp4)[0],32/8);
    // vTemp4i = x4,y4,z4,1.0f
    vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
    XMMATRIX M(vTemp1,
            vTemp2,
            vTemp3,
            reinterpret_cast<const __m128 *>(&vTemp4i)[0]);
    return M;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMMATRIX XMLoadFloat4x4
(
    CONST XMFLOAT4X4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMMATRIX M;

    XMASSERT(pSource);

    ((UINT *)(&M.r[0].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[0][0]))[0];
    ((UINT *)(&M.r[0].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[0][1]))[0];
    ((UINT *)(&M.r[0].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[0][2]))[0];
    ((UINT *)(&M.r[0].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[0][3]))[0];

    ((UINT *)(&M.r[1].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[1][0]))[0];
    ((UINT *)(&M.r[1].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[1][1]))[0];
    ((UINT *)(&M.r[1].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[1][2]))[0];
    ((UINT *)(&M.r[1].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[1][3]))[0];

    ((UINT *)(&M.r[2].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[2][0]))[0];
    ((UINT *)(&M.r[2].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[2][1]))[0];
    ((UINT *)(&M.r[2].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[2][2]))[0];
    ((UINT *)(&M.r[2].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[2][3]))[0];

    ((UINT *)(&M.r[3].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[3][0]))[0];
    ((UINT *)(&M.r[3].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[3][1]))[0];
    ((UINT *)(&M.r[3].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[3][2]))[0];
    ((UINT *)(&M.r[3].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[3][3]))[0];

    return M;

#elif defined(_XM_SSE_INTRINSICS_)
    XMMATRIX M;

    XMASSERT(pSource);

    M.r[0] = _mm_loadu_ps( &pSource->_11 );
    M.r[1] = _mm_loadu_ps( &pSource->_21 );
    M.r[2] = _mm_loadu_ps( &pSource->_31 );
    M.r[3] = _mm_loadu_ps( &pSource->_41 );

    return M;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE XMMATRIX XMLoadFloat4x4A
(
    CONST XMFLOAT4X4A* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMMATRIX M;

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    M.r[0].vector4_f32[0] = pSource->m[0][0];
    M.r[0].vector4_f32[1] = pSource->m[0][1];
    M.r[0].vector4_f32[2] = pSource->m[0][2];
    M.r[0].vector4_f32[3] = pSource->m[0][3];

    M.r[1].vector4_f32[0] = pSource->m[1][0];
    M.r[1].vector4_f32[1] = pSource->m[1][1];
    M.r[1].vector4_f32[2] = pSource->m[1][2];
    M.r[1].vector4_f32[3] = pSource->m[1][3];

    M.r[2].vector4_f32[0] = pSource->m[2][0];
    M.r[2].vector4_f32[1] = pSource->m[2][1];
    M.r[2].vector4_f32[2] = pSource->m[2][2];
    M.r[2].vector4_f32[3] = pSource->m[2][3];

    M.r[3].vector4_f32[0] = pSource->m[3][0];
    M.r[3].vector4_f32[1] = pSource->m[3][1];
    M.r[3].vector4_f32[2] = pSource->m[3][2];
    M.r[3].vector4_f32[3] = pSource->m[3][3];

    return M;

#elif defined(_XM_SSE_INTRINSICS_)
    XMMATRIX M;

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    M.r[0] = _mm_load_ps( &pSource->_11 );
    M.r[1] = _mm_load_ps( &pSource->_21 );
    M.r[2] = _mm_load_ps( &pSource->_31 );
    M.r[3] = _mm_load_ps( &pSource->_41 );

    return M;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
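// Usage sketch (illustrative): the A-suffixed types are declared with 16-byte
// alignment, so _mm_load_ps can be used directly; the plain XMFLOAT4X4 variant
// above falls back to _mm_loadu_ps instead:
//
//     XMFLOAT4X4A m44;                     // 16-byte aligned by declaration
//     XMMATRIX M = XMLoadFloat4x4A(&m44);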
/****************************************************************************
 *
 * Vector and matrix store operations
 *
 ****************************************************************************/
XMFINLINE VOID XMStoreInt
(
    UINT* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);

    *pDestination = XMVectorGetIntX( V );

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);

    _mm_store_ss( (float*)pDestination, V );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreFloat
(
    FLOAT* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);

    *pDestination = XMVectorGetX( V );

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);

    _mm_store_ss( pDestination, V );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreInt2
(
    UINT* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);

    pDestination[0] = V.vector4_u32[0];
    pDestination[1] = V.vector4_u32[1];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);

    XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
    _mm_store_ss( (float*)&pDestination[0], V );
    _mm_store_ss( (float*)&pDestination[1], T );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreSInt2
(
    XMINT2* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);

    pDestination->x = (INT)V.vector4_f32[0];
    pDestination->y = (INT)V.vector4_f32[1];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);

    // In case of positive overflow, detect it
    XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
    // Float to int conversion
    __m128i vResulti = _mm_cvttps_epi32(V);
    // If there was positive overflow, set to 0x7FFFFFFF
    XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
    vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast<const __m128 *>(&vResulti)[0]);
    vOverflow = _mm_or_ps(vOverflow,vResult);

    XMVECTOR T = _mm_shuffle_ps( vOverflow, vOverflow, _MM_SHUFFLE( 1, 1, 1, 1 ) );
    _mm_store_ss( (float*)&pDestination->x, vOverflow );
    _mm_store_ss( (float*)&pDestination->y, T );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
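// Behavior sketch (illustrative): the SSE path above saturates positive
// overflow to 0x7FFFFFFF, while _mm_cvttps_epi32 truncates toward zero:
//
//     XMINT2 out;
//     XMStoreSInt2(&out, XMVectorSet(3.0e9f, -5.9f, 0.0f, 0.0f));
//     // out.x == 0x7FFFFFFF (saturated), out.y == -5 (truncated)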
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreUInt2
(
    XMUINT2* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);

    pDestination->x = (UINT)V.vector4_f32[0];
    pDestination->y = (UINT)V.vector4_f32[1];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);

    // Clamp to >=0
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    // Any numbers that are too big, set to 0xFFFFFFFFU
    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
    XMVECTOR vValue = g_XMUnsignedFix;
    // Too large for a signed integer?
    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
    vValue = _mm_and_ps(vValue,vMask);
    // Perform fixup only on numbers too large (Keeps low bit precision)
    vResult = _mm_sub_ps(vResult,vValue);
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Convert from signed to unsigned only if greater than 0x80000000
    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
    vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],vMask);
    // On those that are too large, set to 0xFFFFFFFF
    vResult = _mm_or_ps(vResult,vOverflow);

    XMVECTOR T = _mm_shuffle_ps( vResult, vResult, _MM_SHUFFLE( 1, 1, 1, 1 ) );
    _mm_store_ss( (float*)&pDestination->x, vResult );
    _mm_store_ss( (float*)&pDestination->y, T );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
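// Note on g_XMUnsignedFix (sketch): floats >= 2^31 cannot go through the signed
// cvttps conversion directly, so 2^31 is subtracted first and the sign bit is
// XORed back in afterwards; for any x in [2^31, 2^32):
//
//     (UINT)((INT)(x - 2147483648.0f)) ^ 0x80000000 == (UINT)x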
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreInt2A
(
    UINT* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);

    pDestination[0] = V.vector4_u32[0];
    pDestination[1] = V.vector4_u32[1];

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);

    _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreFloat2
(
    XMFLOAT2* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);

    pDestination->x = V.vector4_f32[0];
    pDestination->y = V.vector4_f32[1];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);

    XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
    _mm_store_ss( &pDestination->x, V );
    _mm_store_ss( &pDestination->y, T );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreFloat2A
(
    XMFLOAT2A* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);

    pDestination->x = V.vector4_f32[0];
    pDestination->y = V.vector4_f32[1];

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);

    _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreHalf2
(
    XMHALF2* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);

    pDestination->x = XMConvertFloatToHalf(V.vector4_f32[0]);
    pDestination->y = XMConvertFloatToHalf(V.vector4_f32[1]);

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V));
    pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreShortN2
(
    XMSHORTN2* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
    N = XMVectorMultiply(N, Scale.v);
    N = XMVectorRound(N);

    pDestination->x = (SHORT)N.vector4_f32[0];
    pDestination->y = (SHORT)N.vector4_f32[1];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static CONST XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f};

    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
    vResult = _mm_min_ps(vResult,g_XMOne);
    vResult = _mm_mul_ps(vResult,Scale);
    __m128i vResulti = _mm_cvtps_epi32(vResult);
    vResulti = _mm_packs_epi32(vResulti,vResulti);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->x),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
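// Round-trip sketch (illustrative): inputs are clamped to [-1,1] and scaled by
// 32767 with round-to-nearest:
//
//     XMSHORTN2 out;
//     XMStoreShortN2(&out, XMVectorSet(-1.0f, 0.25f, 0.0f, 0.0f));
//     // out.x == -32767, out.y == 8192 (0.25f * 32767 = 8191.75, rounded)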
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreShort2
(
    XMSHORT2* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTOR Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
    static CONST XMVECTOR Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, Min, Max);
    N = XMVectorRound(N);

    pDestination->x = (SHORT)N.vector4_f32[0];
    pDestination->y = (SHORT)N.vector4_f32[1];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static CONST XMVECTORF32 Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
    static CONST XMVECTORF32 Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};

    XMVECTOR vResult = _mm_max_ps(V,Min);
    vResult = _mm_min_ps(vResult,Max);
    // Convert to int with rounding
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // Pack the ints into shorts
    vInt = _mm_packs_epi32(vInt,vInt);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->x),reinterpret_cast<const __m128 *>(&vInt)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreUShortN2
(
    XMUSHORTN2* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};

    XMASSERT(pDestination);

    N = XMVectorSaturate(V);
    N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v);
    N = XMVectorTruncate(N);

    pDestination->x = (USHORT)N.vector4_f32[0];
    pDestination->y = (USHORT)N.vector4_f32[1];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static CONST XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};

    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,g_XMOne);
    vResult = _mm_mul_ps(vResult,Scale);
    // Convert to int with rounding
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // Since the SSE pack instruction clamps using signed rules,
    // manually extract the values to store them to memory
    pDestination->x = static_cast<USHORT>(_mm_extract_epi16(vInt,0));
    pDestination->y = static_cast<USHORT>(_mm_extract_epi16(vInt,2));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreUShort2
(
    XMUSHORT2* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTOR Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, XMVectorZero(), Max);
    N = XMVectorRound(N);

    pDestination->x = (USHORT)N.vector4_f32[0];
    pDestination->y = (USHORT)N.vector4_f32[1];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static CONST XMVECTORF32 Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};

    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,Max);
    // Convert to int with rounding
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // Since the SSE pack instruction clamps using signed rules,
    // manually extract the values to store them to memory
    pDestination->x = static_cast<USHORT>(_mm_extract_epi16(vInt,0));
    pDestination->y = static_cast<USHORT>(_mm_extract_epi16(vInt,2));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreByteN2
(
    XMBYTEN2* pDestination,
    FXMVECTOR V
)
{
    XMVECTOR N;
    XMFLOAT4A tmp;
    static CONST XMVECTORF32 Scale = {127.0f, 127.0f, 127.0f, 127.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
    N = XMVectorMultiply(N, Scale.v);
    N = XMVectorRound(N);

    XMStoreFloat4A( &tmp, N );

    pDestination->x = (CHAR)tmp.x;
    pDestination->y = (CHAR)tmp.y;
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreByte2
(
    XMBYTE2* pDestination,
    FXMVECTOR V
)
{
    XMVECTOR N;
    XMFLOAT4A tmp;
    static CONST XMVECTOR Min = {-127.0f, -127.0f, -127.0f, -127.0f};
    static CONST XMVECTOR Max = {127.0f, 127.0f, 127.0f, 127.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, Min, Max);
    N = XMVectorRound(N);

    XMStoreFloat4A( &tmp, N );

    pDestination->x = (CHAR)tmp.x;
    pDestination->y = (CHAR)tmp.y;
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreUByteN2
(
    XMUBYTEN2* pDestination,
    FXMVECTOR V
)
{
    XMVECTOR N;
    XMFLOAT4A tmp;
    static CONST XMVECTORF32 Scale = {255.0f, 255.0f, 255.0f, 255.0f};

    XMASSERT(pDestination);

    N = XMVectorSaturate(V);
    N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v);
    N = XMVectorTruncate(N);

    XMStoreFloat4A( &tmp, N );

    pDestination->x = (BYTE)tmp.x;
    pDestination->y = (BYTE)tmp.y;
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreUByte2
(
    XMUBYTE2* pDestination,
    FXMVECTOR V
)
{
    XMVECTOR N;
    XMFLOAT4A tmp;
    static CONST XMVECTOR Max = {255.0f, 255.0f, 255.0f, 255.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, XMVectorZero(), Max);
    N = XMVectorRound(N);

    XMStoreFloat4A( &tmp, N );

    pDestination->x = (BYTE)tmp.x;
    pDestination->y = (BYTE)tmp.y;
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreInt3
(
    UINT* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);

    pDestination[0] = V.vector4_u32[0];
    pDestination[1] = V.vector4_u32[1];
    pDestination[2] = V.vector4_u32[2];

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);

    XMVECTOR T1 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
    XMVECTOR T2 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss( (float*)pDestination, V );
    _mm_store_ss( (float*)&pDestination[1], T1 );
    _mm_store_ss( (float*)&pDestination[2], T2 );

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreSInt3
(
    XMINT3* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);

    pDestination->x = (INT)V.vector4_f32[0];
    pDestination->y = (INT)V.vector4_f32[1];
    pDestination->z = (INT)V.vector4_f32[2];

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);

    // In case of positive overflow, detect it
    XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
    // Float to int conversion
    __m128i vResulti = _mm_cvttps_epi32(V);
    // If there was positive overflow, set to 0x7FFFFFFF
    XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
    vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast<const __m128 *>(&vResulti)[0]);
    vOverflow = _mm_or_ps(vOverflow,vResult);

    XMVECTOR T1 = _mm_shuffle_ps(vOverflow,vOverflow,_MM_SHUFFLE(1,1,1,1));
    XMVECTOR T2 = _mm_shuffle_ps(vOverflow,vOverflow,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss( (float*)&pDestination->x, vOverflow );
    _mm_store_ss( (float*)&pDestination->y, T1 );
    _mm_store_ss( (float*)&pDestination->z, T2 );

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreUInt3
(
    XMUINT3* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);

    pDestination->x = (UINT)V.vector4_f32[0];
    pDestination->y = (UINT)V.vector4_f32[1];
    pDestination->z = (UINT)V.vector4_f32[2];

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);

    // Clamp to >=0
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    // Any numbers that are too big, set to 0xFFFFFFFFU
    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
    XMVECTOR vValue = g_XMUnsignedFix;
    // Too large for a signed integer?
    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
    vValue = _mm_and_ps(vValue,vMask);
    // Perform fixup only on numbers too large (Keeps low bit precision)
    vResult = _mm_sub_ps(vResult,vValue);
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Convert from signed to unsigned only if greater than 0x80000000
    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
    vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],vMask);
    // On those that are too large, set to 0xFFFFFFFF
    vResult = _mm_or_ps(vResult,vOverflow);

    XMVECTOR T1 = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(1,1,1,1));
    XMVECTOR T2 = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss( (float*)&pDestination->x, vResult );
    _mm_store_ss( (float*)&pDestination->y, T1 );
    _mm_store_ss( (float*)&pDestination->z, T2 );

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreInt3A
(
    UINT* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);

    pDestination[0] = V.vector4_u32[0];
    pDestination[1] = V.vector4_u32[1];
    pDestination[2] = V.vector4_u32[2];

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);

    XMVECTOR T = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
    _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
    _mm_store_ss( (float*)&pDestination[2], T );

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreFloat3
(
    XMFLOAT3* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);

    pDestination->x = V.vector4_f32[0];
    pDestination->y = V.vector4_f32[1];
    pDestination->z = V.vector4_f32[2];

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);

    XMVECTOR T1 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
    XMVECTOR T2 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss( &pDestination->x, V );
    _mm_store_ss( &pDestination->y, T1 );
    _mm_store_ss( &pDestination->z, T2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
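/* Illustrative sketch (not part of the library): the three-float store writes
   each component with a scalar _mm_store_ss, so no destination padding is
   needed past the z member:

       XMFLOAT3 f;
       XMStoreFloat3( &f, XMVectorSet( 1.0f, 2.0f, 3.0f, 99.0f ) );
       // f == { 1.0f, 2.0f, 3.0f }; the w lane is ignored
*/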
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreFloat3A
(
    XMFLOAT3A* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);

    pDestination->x = V.vector4_f32[0];
    pDestination->y = V.vector4_f32[1];
    pDestination->z = V.vector4_f32[2];

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);

    XMVECTOR T = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
    _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
    _mm_store_ss( &pDestination->z, T );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreUHenDN3
(
    XMUHENDN3* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTORF32 Scale = {2047.0f, 2047.0f, 1023.0f, 0.0f};

    XMASSERT(pDestination);

    N = XMVectorSaturate(V);
    N = XMVectorMultiply(N, Scale.v);

    pDestination->v = (((UINT)N.vector4_f32[2] & 0x3FF) << 22) |
                      (((UINT)N.vector4_f32[1] & 0x7FF) << 11) |
                      (((UINT)N.vector4_f32[0] & 0x7FF));

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static const XMVECTORF32 ScaleUHenDN3 = {2047.0f, 2047.0f*2048.0f,1023.0f*(2048.0f*2048.0f)/2.0f,1.0f};
    static const XMVECTORI32 MaskUHenDN3 = {0x7FF,0x7FF<<11,0x3FF<<(22-1),0};
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,g_XMOne);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleUHenDN3);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskUHenDN3);
    // Do a horizontal or of 3 entries
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1));
    // x = x|y
    vResulti = _mm_or_si128(vResulti,vResulti2);
    // Move Z to the x position
    vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1));
    // Add Z to itself to perform a single bit left shift
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
    // x = x|y|z
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
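/* Illustrative note (not part of the library): the SSE constants above fold
   the per-component bit shifts into the scale factors, e.g. y is scaled by
   2047*2048 so its integer result already sits 11 bits up. z cannot be
   pre-shifted a full 22 bits without overflowing the float-to-int conversion,
   so it is scaled by half (2048*2048/2) and the final add-to-itself supplies
   the last 1-bit shift. For V = (1,1,1,*) the stored value is
   (0x3FF<<22)|(0x7FF<<11)|0x7FF = 0xFFFFFFFF.
*/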
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreUHenD3
(
    XMUHEND3* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTOR Max = {2047.0f, 2047.0f, 1023.0f, 0.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, XMVectorZero(), Max);

    pDestination->v = (((UINT)N.vector4_f32[2] & 0x3FF) << 22) |
                      (((UINT)N.vector4_f32[1] & 0x7FF) << 11) |
                      (((UINT)N.vector4_f32[0] & 0x7FF));

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static const XMVECTORF32 MaxUHenD3 = { 2047.0f, 2047.0f, 1023.0f, 1.0f};
    static const XMVECTORF32 ScaleUHenD3 = {1.0f, 2048.0f,(2048.0f*2048.0f)/2.0f,1.0f};
    static const XMVECTORI32 MaskUHenD3 = {0x7FF,0x7FF<<11,0x3FF<<(22-1),0};
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,MaxUHenD3);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleUHenD3);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskUHenD3);
    // Do a horizontal or of 3 entries
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1));
    // x = x|y
    vResulti = _mm_or_si128(vResulti,vResulti2);
    // Move Z to the x position
    vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1));
    // Add Z to itself to perform a single bit left shift
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
    // x = x|y|z
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreHenDN3
(
    XMHENDN3* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTORF32 Scale = {1023.0f, 1023.0f, 511.0f, 1.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
    N = XMVectorMultiply(N, Scale.v);

    pDestination->v = (((INT)N.vector4_f32[2] & 0x3FF) << 22) |
                      (((INT)N.vector4_f32[1] & 0x7FF) << 11) |
                      (((INT)N.vector4_f32[0] & 0x7FF));

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static const XMVECTORF32 ScaleHenDN3 = {1023.0f, 1023.0f*2048.0f,511.0f*(2048.0f*2048.0f),1.0f};
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
    vResult = _mm_min_ps(vResult,g_XMOne);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleHenDN3);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,g_XMMaskHenD3);
    // Do a horizontal or of all 4 entries
    vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreHenD3
(
    XMHEND3* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTOR Min = {-1023.0f, -1023.0f, -511.0f, -1.0f};
    static CONST XMVECTOR Max = {1023.0f, 1023.0f, 511.0f, 1.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, Min, Max);

    pDestination->v = (((INT)N.vector4_f32[2] & 0x3FF) << 22) |
                      (((INT)N.vector4_f32[1] & 0x7FF) << 11) |
                      (((INT)N.vector4_f32[0] & 0x7FF));

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static const XMVECTORF32 MinHenD3 = {-1023.0f,-1023.0f,-511.0f,-1.0f};
    static const XMVECTORF32 MaxHenD3 = { 1023.0f, 1023.0f, 511.0f, 1.0f};
    static const XMVECTORF32 ScaleHenD3 = {1.0f, 2048.0f,(2048.0f*2048.0f),1.0f};
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,MinHenD3);
    vResult = _mm_min_ps(vResult,MaxHenD3);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleHenD3);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,g_XMMaskHenD3);
    // Do a horizontal or of all 4 entries
    vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreUDHenN3
(
    XMUDHENN3* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTORF32 Scale = {1023.0f, 2047.0f, 2047.0f, 0.0f};

    XMASSERT(pDestination);

    N = XMVectorSaturate(V);
    N = XMVectorMultiply(N, Scale.v);

    pDestination->v = (((UINT)N.vector4_f32[2] & 0x7FF) << 21) |
                      (((UINT)N.vector4_f32[1] & 0x7FF) << 10) |
                      (((UINT)N.vector4_f32[0] & 0x3FF));

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static const XMVECTORF32 ScaleUDHenN3 = {1023.0f,2047.0f*1024.0f,2047.0f*(1024.0f*2048.0f)/2.0f,1.0f};
    static const XMVECTORI32 MaskUDHenN3 = {0x3FF,0x7FF<<10,0x7FF<<(21-1),0};
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,g_XMOne);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleUDHenN3);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskUDHenN3);
    // Do a horizontal or of 3 entries
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1));
    // x = x|y
    vResulti = _mm_or_si128(vResulti,vResulti2);
    // Move Z to the x position
    vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1));
    // Add Z to itself to perform a single bit left shift
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
    // x = x|y|z
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreUDHen3
(
    XMUDHEN3* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTOR Max = {1023.0f, 2047.0f, 2047.0f, 0.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, XMVectorZero(), Max);

    pDestination->v = (((UINT)N.vector4_f32[2] & 0x7FF) << 21) |
                      (((UINT)N.vector4_f32[1] & 0x7FF) << 10) |
                      (((UINT)N.vector4_f32[0] & 0x3FF));

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static const XMVECTORF32 MaxUDHen3 = { 1023.0f, 2047.0f, 2047.0f, 1.0f};
    static const XMVECTORF32 ScaleUDHen3 = {1.0f, 1024.0f,(1024.0f*2048.0f)/2.0f,1.0f};
    static const XMVECTORI32 MaskUDHen3 = {0x3FF,0x7FF<<10,0x7FF<<(21-1),0};
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,MaxUDHen3);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleUDHen3);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskUDHen3);
    // Do a horizontal or of 3 entries
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1));
    // x = x|y
    vResulti = _mm_or_si128(vResulti,vResulti2);
    // Move Z to the x position
    vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1));
    // Add Z to itself to perform a single bit left shift
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
    // x = x|y|z
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreDHenN3
(
    XMDHENN3* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTORF32 Scale = {511.0f, 1023.0f, 1023.0f, 1.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
    N = XMVectorMultiply(N, Scale.v);

    pDestination->v = (((INT)N.vector4_f32[2] & 0x7FF) << 21) |
                      (((INT)N.vector4_f32[1] & 0x7FF) << 10) |
                      (((INT)N.vector4_f32[0] & 0x3FF));

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static const XMVECTORF32 ScaleDHenN3 = {511.0f, 1023.0f*1024.0f,1023.0f*(1024.0f*2048.0f),1.0f};
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
    vResult = _mm_min_ps(vResult,g_XMOne);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleDHenN3);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,g_XMMaskDHen3);
    // Do a horizontal or of all 4 entries
    vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreDHen3
(
    XMDHEN3* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTOR Min = {-511.0f, -1023.0f, -1023.0f, -1.0f};
    static CONST XMVECTOR Max = {511.0f, 1023.0f, 1023.0f, 1.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, Min, Max);

    pDestination->v = (((INT)N.vector4_f32[2] & 0x7FF) << 21) |
                      (((INT)N.vector4_f32[1] & 0x7FF) << 10) |
                      (((INT)N.vector4_f32[0] & 0x3FF));

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static const XMVECTORF32 MinDHen3 = {-511.0f,-1023.0f,-1023.0f,-1.0f};
    static const XMVECTORF32 MaxDHen3 = { 511.0f, 1023.0f, 1023.0f, 1.0f};
    static const XMVECTORF32 ScaleDHen3 = {1.0f, 1024.0f,(1024.0f*2048.0f),1.0f};
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,MinDHen3);
    vResult = _mm_min_ps(vResult,MaxDHen3);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleDHen3);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,g_XMMaskDHen3);
    // Do a horizontal or of all 4 entries
    vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreU565
(
    XMU565* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    XMASSERT(pDestination);
    static CONST XMVECTORF32 Max = {31.0f, 63.0f, 31.0f, 0.0f};
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,Max);
    // Convert to int with rounding
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // No SSE operations will write to 16-bit values, so we have to extract them manually
    USHORT x = static_cast<USHORT>(_mm_extract_epi16(vInt,0));
    USHORT y = static_cast<USHORT>(_mm_extract_epi16(vInt,2));
    USHORT z = static_cast<USHORT>(_mm_extract_epi16(vInt,4));
    pDestination->v = ((z & 0x1F) << 11) |
                      ((y & 0x3F) << 5) |
                      ((x & 0x1F));
#else
    XMVECTOR N;
    static CONST XMVECTORF32 Max = {31.0f, 63.0f, 31.0f, 0.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, XMVectorZero(), Max.v);
    N = XMVectorRound(N);

    pDestination->v = (((USHORT)N.vector4_f32[2] & 0x1F) << 11) |
                      (((USHORT)N.vector4_f32[1] & 0x3F) << 5) |
                      (((USHORT)N.vector4_f32[0] & 0x1F));
#endif // !_XM_SSE_INTRINSICS_
}
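/* Illustrative sketch (not part of the library): XMU565 holds unnormalized
   5:6:5 integer components (x in the low 5 bits, y in the middle 6, z in the
   top 5), so callers pass values already in integer range:

       XMU565 c;
       XMStoreU565( &c, XMVectorSet( 31.0f, 63.0f, 31.0f, 0.0f ) );
       // c.v == 0xFFFF
*/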
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreFloat3PK
(
    XMFLOAT3PK* pDestination,
    FXMVECTOR V
)
{
    _DECLSPEC_ALIGN_16_ UINT IValue[4];
    UINT I, Sign, j;
    UINT Result[3];

    XMASSERT(pDestination);

    XMStoreFloat3A( (XMFLOAT3A*)&IValue, V );

    // X & Y Channels (5-bit exponent, 6-bit mantissa)
    for(j=0; j < 2; ++j)
    {
        Sign = IValue[j] & 0x80000000;
        I = IValue[j] & 0x7FFFFFFF;

        if ((I & 0x7F800000) == 0x7F800000)
        {
            // INF or NAN
            Result[j] = 0x7c0;
            if (( I & 0x7FFFFF ) != 0)
            {
                // NAN: fold the mantissa bits down so the result is non-zero
                Result[j] = 0x7c0 | (((I>>17)|(I>>11)|(I>>6)|(I))&0x3f);
            }
            else if ( Sign )
            {
                // -INF is clamped to 0 since 3PK is positive only
                Result[j] = 0;
            }
        }
        else if ( Sign )
        {
            // 3PK is positive only, so clamp to zero
            Result[j] = 0;
        }
        else if (I > 0x477E0000U)
        {
            // The number is too large to be represented as a float11, set to max
            Result[j] = 0x7BF;
        }
        else
        {
            if (I < 0x38800000U)
            {
                // The number is too small to be represented as a normalized float11
                // Convert it to a denormalized value.
                UINT Shift = 113U - (I >> 23U);
                I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
            }
            else
            {
                // Rebias the exponent to represent the value as a normalized float11
                I += 0xC8000000U;
            }

            Result[j] = ((I + 0xFFFFU + ((I >> 17U) & 1U)) >> 17U)&0x7ffU;
        }
    }

    // Z Channel (5-bit exponent, 5-bit mantissa)
    Sign = IValue[2] & 0x80000000;
    I = IValue[2] & 0x7FFFFFFF;

    if ((I & 0x7F800000) == 0x7F800000)
    {
        // INF or NAN
        Result[2] = 0x3e0;
        if (( I & 0x7FFFFF ) != 0)
        {
            // NAN: fold the mantissa bits down so the result is non-zero
            Result[2] = 0x3e0 | (((I>>18)|(I>>13)|(I>>3)|(I))&0x1f);
        }
        else if ( Sign )
        {
            // -INF is clamped to 0 since 3PK is positive only
            Result[2] = 0;
        }
    }
    else if ( Sign )
    {
        // 3PK is positive only, so clamp to zero
        Result[2] = 0;
    }
    else if (I > 0x477C0000U)
    {
        // The number is too large to be represented as a float10, set to max
        Result[2] = 0x3DF;
    }
    else
    {
        if (I < 0x38800000U)
        {
            // The number is too small to be represented as a normalized float10
            // Convert it to a denormalized value.
            UINT Shift = 113U - (I >> 23U);
            I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
        }
        else
        {
            // Rebias the exponent to represent the value as a normalized float10
            I += 0xC8000000U;
        }

        Result[2] = ((I + 0x1FFFFU + ((I >> 18U) & 1U)) >> 18U)&0x3ffU;
    }

    // Pack Result into memory
    pDestination->v = (Result[0] & 0x7ff)
                      | ( (Result[1] & 0x7ff) << 11 )
                      | ( (Result[2] & 0x3ff) << 22 );
}
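/* Illustrative worked example (not part of the library): for x = 1.0f the
   integer image is I = 0x3F800000. Rebiasing adds 0xC8000000 (i.e. subtracts
   112 << 23), giving I = 0x07800000. Round-to-nearest-even then yields
   Result = ((0x07800000 + 0xFFFF + 0) >> 17) & 0x7ff = 0x3C0, the float11
   pattern with exponent 15 and mantissa 0: exactly 1.0.
*/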
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreFloat3SE
(
    XMFLOAT3SE* pDestination,
    FXMVECTOR V
)
{
    _DECLSPEC_ALIGN_16_ UINT IValue[4];
    UINT I, Sign, j, T;
    UINT Frac[3];
    UINT Exp[3];

    XMASSERT(pDestination);

    XMStoreFloat3A( (XMFLOAT3A*)&IValue, V );

    // X, Y, Z Channels (5-bit exponent, 9-bit mantissa)
    for(j=0; j < 3; ++j)
    {
        Sign = IValue[j] & 0x80000000;
        I = IValue[j] & 0x7FFFFFFF;

        if ((I & 0x7F800000) == 0x7F800000)
        {
            // INF or NAN
            Exp[j] = 0x1f;
            Frac[j] = 0;
            if (( I & 0x7FFFFF ) != 0)
            {
                // NAN: fold the mantissa bits down so the result is non-zero
                Frac[j] = ((I>>14)|(I>>5)|(I))&0x1ff;
            }
            else if ( Sign )
            {
                // -INF is clamped to 0 since 3SE is positive only
                Exp[j] = Frac[j] = 0;
            }
        }
        else if ( Sign )
        {
            // 3SE is positive only, so clamp to zero
            Exp[j] = Frac[j] = 0;
        }
        else if (I > 0x477FC000U)
        {
            // The number is too large, set to max
            Exp[j] = 0x1e;
            Frac[j] = 0x1ff;
        }
        else
        {
            if (I < 0x38800000U)
            {
                // The number is too small to be represented as a normalized value
                // Convert it to a denormalized value.
                UINT Shift = 113U - (I >> 23U);
                I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
            }
            else
            {
                // Rebias the exponent to represent the value as a normalized value
                I += 0xC8000000U;
            }

            T = ((I + 0x1FFFU + ((I >> 14U) & 1U)) >> 14U)&0x3fffU;

            Exp[j] = (T & 0x3E00) >> 9;
            Frac[j] = T & 0x1ff;
        }
    }

    // Adjust to a shared exponent
    T = XMMax( Exp[0], XMMax( Exp[1], Exp[2] ) );

    Frac[0] = Frac[0] >> (T - Exp[0]);
    Frac[1] = Frac[1] >> (T - Exp[1]);
    Frac[2] = Frac[2] >> (T - Exp[2]);

    // Store packed into memory
    pDestination->xm = Frac[0];
    pDestination->ym = Frac[1];
    pDestination->zm = Frac[2];
    pDestination->e = T;
}
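/* Illustrative note (not part of the library): the shared-exponent format
   keeps one 5-bit exponent for all three 9-bit mantissas, so smaller
   components lose precision rather than range. For example, if Exp ends up as
   {15, 15, 10} then T = 15 and Frac[2] is shifted right by 5 bits, dropping
   its 5 low-order mantissa bits while x and y are stored unchanged.
*/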
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreInt4
(
    UINT* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);

    pDestination[0] = V.vector4_u32[0];
    pDestination[1] = V.vector4_u32[1];
    pDestination[2] = V.vector4_u32[2];
    pDestination[3] = V.vector4_u32[3];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);

    _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreInt4A
(
    UINT* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);

    pDestination[0] = V.vector4_u32[0];
    pDestination[1] = V.vector4_u32[1];
    pDestination[2] = V.vector4_u32[2];
    pDestination[3] = V.vector4_u32[3];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);

    _mm_store_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
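/* Illustrative sketch (not part of the library): XMStoreInt4A requires a
   16-byte-aligned destination, which _DECLSPEC_ALIGN_16_ provides:

       _DECLSPEC_ALIGN_16_ UINT dst[4];
       XMStoreInt4A( dst, XMVectorSetInt( 1, 2, 3, 4 ) );
*/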
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreSInt4
(
    XMINT4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);

    pDestination->x = (INT)V.vector4_f32[0];
    pDestination->y = (INT)V.vector4_f32[1];
    pDestination->z = (INT)V.vector4_f32[2];
    pDestination->w = (INT)V.vector4_f32[3];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);

    // In case of positive overflow, detect it
    XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
    // Float to int conversion
    __m128i vResulti = _mm_cvttps_epi32(V);
    // If there was positive overflow, set to 0x7FFFFFFF
    XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
    vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast<const __m128 *>(&vResulti)[0]);
    vOverflow = _mm_or_ps(vOverflow,vResult);
    _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&vOverflow)[0] );

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreUInt4
(
    XMUINT4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);

    pDestination->x = (UINT)V.vector4_f32[0];
    pDestination->y = (UINT)V.vector4_f32[1];
    pDestination->z = (UINT)V.vector4_f32[2];
    pDestination->w = (UINT)V.vector4_f32[3];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);

    // Clamp to >=0
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    // Any numbers that are too big, set to 0xFFFFFFFFU
    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
    XMVECTOR vValue = g_XMUnsignedFix;
    // Too large for a signed integer?
    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
    vValue = _mm_and_ps(vValue,vMask);
    // Perform fixup only on numbers too large (Keeps low bit precision)
    vResult = _mm_sub_ps(vResult,vValue);
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Convert from signed to unsigned only if greater than 0x80000000
    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
    vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],vMask);
    // On those that are too large, set to 0xFFFFFFFF
    vResult = _mm_or_ps(vResult,vOverflow);
    _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&vResult)[0] );

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreInt4NC
(
    UINT* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);

    pDestination[0] = V.vector4_u32[0];
    pDestination[1] = V.vector4_u32[1];
    pDestination[2] = V.vector4_u32[2];
    pDestination[3] = V.vector4_u32[3];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);

    _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreFloat4
(
    XMFLOAT4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);

    pDestination->x = V.vector4_f32[0];
    pDestination->y = V.vector4_f32[1];
    pDestination->z = V.vector4_f32[2];
    pDestination->w = V.vector4_f32[3];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);

    _mm_storeu_ps( &pDestination->x, V );

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreFloat4A
(
    XMFLOAT4A* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);

    pDestination->x = V.vector4_f32[0];
    pDestination->y = V.vector4_f32[1];
    pDestination->z = V.vector4_f32[2];
    pDestination->w = V.vector4_f32[3];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);

    _mm_store_ps( &pDestination->x, V );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
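/* Illustrative sketch (not part of the library): the A variant maps straight
   to the aligned _mm_store_ps, so the destination must be 16-byte aligned
   (XMFLOAT4A carries that alignment in its declaration):

       XMFLOAT4A f;
       XMStoreFloat4A( &f, XMVectorReplicate( 2.0f ) );
*/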
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreFloat4NC
(
    XMFLOAT4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);

    pDestination->x = V.vector4_f32[0];
    pDestination->y = V.vector4_f32[1];
    pDestination->z = V.vector4_f32[2];
    pDestination->w = V.vector4_f32[3];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);

    _mm_storeu_ps( &pDestination->x, V );

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreHalf4
(
    XMHALF4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);

    pDestination->x = XMConvertFloatToHalf(V.vector4_f32[0]);
    pDestination->y = XMConvertFloatToHalf(V.vector4_f32[1]);
    pDestination->z = XMConvertFloatToHalf(V.vector4_f32[2]);
    pDestination->w = XMConvertFloatToHalf(V.vector4_f32[3]);

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V));
    pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V));
    pDestination->z = XMConvertFloatToHalf(XMVectorGetZ(V));
    pDestination->w = XMConvertFloatToHalf(XMVectorGetW(V));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
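/* Illustrative sketch (not part of the library): the SSE baseline has no
   half-float instruction, so each lane goes through the scalar converter:

       XMHALF4 h;
       XMStoreHalf4( &h, XMVectorSet( 1.0f, 0.5f, -2.0f, 0.0f ) );
       // h.x == 0x3C00, h.y == 0x3800, h.z == 0xC000, h.w == 0x0000
*/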
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreShortN4
(
    XMSHORTN4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
    N = XMVectorMultiply(N, Scale.v);
    N = XMVectorRound(N);

    pDestination->x = (SHORT)N.vector4_f32[0];
    pDestination->y = (SHORT)N.vector4_f32[1];
    pDestination->z = (SHORT)N.vector4_f32[2];
    pDestination->w = (SHORT)N.vector4_f32[3];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static CONST XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f};

    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
    vResult = _mm_min_ps(vResult,g_XMOne);
    vResult = _mm_mul_ps(vResult,Scale);
    __m128i vResulti = _mm_cvtps_epi32(vResult);
    vResulti = _mm_packs_epi32(vResulti,vResulti);
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->x),reinterpret_cast<const __m128d *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
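/* Illustrative note (not part of the library): _mm_packs_epi32 saturates to
   [-32768, 32767] on its own, so after the clamp to [-1,1] and the scale by
   32767 the pack cannot overflow; the single _mm_store_sd then writes all
   four 16-bit results at once.
*/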
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreShort4
(
    XMSHORT4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTOR Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
    static CONST XMVECTOR Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, Min, Max);
    N = XMVectorRound(N);

    pDestination->x = (SHORT)N.vector4_f32[0];
    pDestination->y = (SHORT)N.vector4_f32[1];
    pDestination->z = (SHORT)N.vector4_f32[2];
    pDestination->w = (SHORT)N.vector4_f32[3];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static CONST XMVECTORF32 Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
    static CONST XMVECTORF32 Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};

    XMVECTOR vResult = _mm_max_ps(V,Min);
    vResult = _mm_min_ps(vResult,Max);
    // Convert to int with rounding
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // Pack the ints into shorts
    vInt = _mm_packs_epi32(vInt,vInt);
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->x),reinterpret_cast<const __m128d *>(&vInt)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreUShortN4
(
    XMUSHORTN4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};

    XMASSERT(pDestination);

    N = XMVectorSaturate(V);
    N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v);
    N = XMVectorTruncate(N);

    pDestination->x = (USHORT)N.vector4_f32[0];
    pDestination->y = (USHORT)N.vector4_f32[1];
    pDestination->z = (USHORT)N.vector4_f32[2];
    pDestination->w = (USHORT)N.vector4_f32[3];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static CONST XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};

    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,g_XMOne);
    vResult = _mm_mul_ps(vResult,Scale);
    // Convert to int with rounding
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // Since the SSE pack instruction clamps using signed rules,
    // manually extract the values to store them to memory
    pDestination->x = static_cast<USHORT>(_mm_extract_epi16(vInt,0));
    pDestination->y = static_cast<USHORT>(_mm_extract_epi16(vInt,2));
    pDestination->z = static_cast<USHORT>(_mm_extract_epi16(vInt,4));
    pDestination->w = static_cast<USHORT>(_mm_extract_epi16(vInt,6));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
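/* Illustrative note (not part of the library): _mm_packs_epi32 uses signed
   saturation, so a value such as 65535 would be clamped to 32767 if packed.
   That is why this path extracts the 16-bit halves with _mm_extract_epi16
   instead; lanes 0, 2, 4 and 6 are the low words of the four 32-bit results.
*/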
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreUShort4
(
    XMUSHORT4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTOR Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, XMVectorZero(), Max);
    N = XMVectorRound(N);

    pDestination->x = (USHORT)N.vector4_f32[0];
    pDestination->y = (USHORT)N.vector4_f32[1];
    pDestination->z = (USHORT)N.vector4_f32[2];
    pDestination->w = (USHORT)N.vector4_f32[3];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static CONST XMVECTORF32 Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};

    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,Max);
    // Convert to int with rounding
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // Since the SSE pack instruction clamps using signed rules,
    // manually extract the values to store them to memory
    pDestination->x = static_cast<USHORT>(_mm_extract_epi16(vInt,0));
    pDestination->y = static_cast<USHORT>(_mm_extract_epi16(vInt,2));
    pDestination->z = static_cast<USHORT>(_mm_extract_epi16(vInt,4));
    pDestination->w = static_cast<USHORT>(_mm_extract_epi16(vInt,6));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreXIcoN4
(
    XMXICON4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f};
    static CONST XMVECTORF32 Scale = {524287.0f, 524287.0f, 524287.0f, 15.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, Min.v, g_XMOne.v);
    N = XMVectorMultiply(N, Scale.v);
    N = XMVectorRound(N);

    pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
                      (((INT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
                      (((INT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
                      (((INT64)N.vector4_f32[0] & 0xFFFFF));

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    // Note: Masks are x,w,y and z
    static const XMVECTORF32 MinXIcoN4 = {-1.0f, 0.0f,-1.0f,-1.0f};
    static const XMVECTORF32 ScaleXIcoN4 = {524287.0f,15.0f*4096.0f*65536.0f*0.5f,524287.0f*4096.0f,524287.0f};
    static const XMVECTORI32 MaskXIcoN4 = {0xFFFFF,0xF<<((60-32)-1),0xFFFFF000,0xFFFFF};

    // Clamp to bounds (the components are processed as x,w,y,z)
    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
    vResult = _mm_max_ps(vResult,MinXIcoN4);
    vResult = _mm_min_ps(vResult,g_XMOne);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleXIcoN4);
    // Convert to integer (w is unsigned)
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off unused bits
    vResulti = _mm_and_si128(vResulti,MaskXIcoN4);
    // Isolate w (held in the y position)
    __m128i vResulti2 = _mm_and_si128(vResulti,g_XMMaskY);
    // Double Y (Really W) to fixup for unsigned conversion
    vResulti = _mm_add_epi32(vResulti,vResulti2);
    // Shift y and z to straddle the 32-bit boundary
    vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
    // Shift it into place
    vResulti2 = _mm_slli_si128(vResulti2,20/8);
    // i = x|y<<20|z<<40|w<<60
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
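/* Illustrative note (not part of the library): the components are shuffled to
   x,w,y,z so that after conversion the two fields that must straddle the
   64-bit store (y at bit 20 and z at bit 40) sit in adjacent lanes; the
   _mm_srli_si128/_mm_slli_si128 pair then slides them into
   i = x | y<<20 | z<<40 | w<<60 before _mm_store_sd writes the packed QWORD.
*/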
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreXIco4
(
    XMXICO4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTORF32 Min = {-524287.0f, -524287.0f, -524287.0f, 0.0f};
    static CONST XMVECTORF32 Max = {524287.0f, 524287.0f, 524287.0f, 15.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, Min.v, Max.v);

    pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
                      (((INT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
                      (((INT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
                      (((INT64)N.vector4_f32[0] & 0xFFFFF));

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    // Note: Masks are x,w,y and z
    static const XMVECTORF32 MinXIco4 = {-524287.0f, 0.0f,-524287.0f,-524287.0f};
    static const XMVECTORF32 MaxXIco4 = { 524287.0f,15.0f, 524287.0f, 524287.0f};
    static const XMVECTORF32 ScaleXIco4 = {1.0f,4096.0f*65536.0f*0.5f,4096.0f,1.0f};
    static const XMVECTORI32 MaskXIco4 = {0xFFFFF,0xF<<((60-1)-32),0xFFFFF000,0xFFFFF};

    // Clamp to bounds (the components are processed as x,w,y,z)
    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
    vResult = _mm_max_ps(vResult,MinXIco4);
    vResult = _mm_min_ps(vResult,MaxXIco4);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleXIco4);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskXIco4);
    // Isolate w (held in the y position)
    __m128i vResulti2 = _mm_and_si128(vResulti,g_XMMaskY);
    // Double Y (Really W) to fixup for unsigned conversion
    vResulti = _mm_add_epi32(vResulti,vResulti2);
    // Shift y and z to straddle the 32-bit boundary
    vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
    // Shift it into place
    vResulti2 = _mm_slli_si128(vResulti2,20/8);
    // i = x|y<<20|z<<40|w<<60
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreUIcoN4
(
    XMUICON4* pDestination,
    FXMVECTOR V
)
{
#define XM_URange ((FLOAT)(1 << 20))
#define XM_URangeDiv2 ((FLOAT)(1 << 19))
#define XM_UMaxXYZ ((FLOAT)((1 << 20) - 1))
#define XM_UMaxW ((FLOAT)((1 << 4) - 1))
#define XM_ScaleXYZ (-(FLOAT)((1 << 20) - 1) / XM_PACK_FACTOR)
#define XM_ScaleW (-(FLOAT)((1 << 4) - 1) / XM_PACK_FACTOR)
#define XM_Scale (-1.0f / XM_PACK_FACTOR)
#define XM_Offset (3.0f)

#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTORF32 Scale = {1048575.0f, 1048575.0f, 1048575.0f, 15.0f};

    XMASSERT(pDestination);

    N = XMVectorSaturate(V);
    N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v);

    pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
                      (((UINT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
                      (((UINT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
                      (((UINT64)N.vector4_f32[0] & 0xFFFFF));

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    // Note: Masks are x,w,y and z
    static const XMVECTORF32 ScaleUIcoN4 = {1048575.0f,15.0f*4096.0f*65536.0f,1048575.0f*4096.0f,1048575.0f};
    static const XMVECTORI32 MaskUIcoN4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF};
    static const XMVECTORF32 AddUIcoN4 = {0.0f,-32768.0f*65536.0f,-32768.0f*65536.0f,0.0f};

    // Clamp to bounds (the components are processed as x,w,y,z)
    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
    vResult = _mm_max_ps(vResult,g_XMZero);
    vResult = _mm_min_ps(vResult,g_XMOne);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleUIcoN4);
    // Adjust for unsigned entries
    vResult = _mm_add_ps(vResult,AddUIcoN4);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Fix the signs on the unsigned entries
    vResulti = _mm_xor_si128(vResulti,g_XMFlipYZ);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskUIcoN4);
    // Shift y and z to straddle the 32-bit boundary
    __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
    // Shift it into place
    vResulti2 = _mm_slli_si128(vResulti2,20/8);
    // i = x|y<<20|z<<40|w<<60
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

#undef XM_URange
#undef XM_URangeDiv2
#undef XM_UMaxXYZ
#undef XM_UMaxW
#undef XM_ScaleXYZ
#undef XM_ScaleW
#undef XM_Scale
#undef XM_Offset
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreUIco4
(
    XMUICO4* pDestination,
    FXMVECTOR V
)
{
#define XM_Scale (-1.0f / XM_PACK_FACTOR)
#define XM_URange ((FLOAT)(1 << 20))
#define XM_URangeDiv2 ((FLOAT)(1 << 19))

#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTOR Max = {1048575.0f, 1048575.0f, 1048575.0f, 15.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, XMVectorZero(), Max);
    N = XMVectorRound(N);

    pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
                      (((UINT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
                      (((UINT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
                      (((UINT64)N.vector4_f32[0] & 0xFFFFF));

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    // Note: Masks are x,w,y and z
    static const XMVECTORF32 MaxUIco4 = { 1048575.0f, 15.0f, 1048575.0f, 1048575.0f};
    static const XMVECTORF32 ScaleUIco4 = {1.0f,4096.0f*65536.0f,4096.0f,1.0f};
    static const XMVECTORI32 MaskUIco4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF};
    static const XMVECTORF32 AddUIco4 = {0.0f,-32768.0f*65536.0f,-32768.0f*65536.0f,0.0f};

    // Clamp to bounds (the components are processed as x,w,y,z)
    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
    vResult = _mm_max_ps(vResult,g_XMZero);
    vResult = _mm_min_ps(vResult,MaxUIco4);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleUIco4);
    // Adjust for unsigned entries
    vResult = _mm_add_ps(vResult,AddUIco4);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Fix the signs on the unsigned entries
    vResulti = _mm_xor_si128(vResulti,g_XMFlipYZ);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskUIco4);
    // Shift y and z to straddle the 32-bit boundary
    __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
    // Shift it into place
    vResulti2 = _mm_slli_si128(vResulti2,20/8);
    // i = x|y<<20|z<<40|w<<60
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

#undef XM_Scale
#undef XM_URange
#undef XM_URangeDiv2
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreIcoN4
(
    XMICON4* pDestination,
    FXMVECTOR V
)
{
#define XM_Scale (-1.0f / XM_PACK_FACTOR)
#define XM_URange ((FLOAT)(1 << 4))
#define XM_Offset (3.0f)
#define XM_UMaxXYZ ((FLOAT)((1 << (20 - 1)) - 1))
#define XM_UMaxW ((FLOAT)((1 << (4 - 1)) - 1))

#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTORF32 Scale = {524287.0f, 524287.0f, 524287.0f, 7.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
    N = XMVectorMultiplyAdd(N, Scale.v, g_XMNegativeZero.v);
    N = XMVectorRound(N);

    pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
                      (((UINT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
                      (((UINT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
                      (((UINT64)N.vector4_f32[0] & 0xFFFFF));

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    // Note: Masks are x,w,y and z
    static const XMVECTORF32 ScaleIcoN4 = {524287.0f,7.0f*4096.0f*65536.0f,524287.0f*4096.0f,524287.0f};
    static const XMVECTORI32 MaskIcoN4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF};

    // Clamp to bounds (the components are processed as x,w,y,z)
    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
    vResult = _mm_max_ps(vResult,g_XMNegativeOne);
    vResult = _mm_min_ps(vResult,g_XMOne);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleIcoN4);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskIcoN4);
    // Shift y and z to straddle the 32-bit boundary
    __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
    // Shift it into place
    vResulti2 = _mm_slli_si128(vResulti2,20/8);
    // i = x|y<<20|z<<40|w<<60
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

#undef XM_Scale
#undef XM_URange
#undef XM_Offset
#undef XM_UMaxXYZ
#undef XM_UMaxW
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreIco4
(
    XMICO4* pDestination,
    FXMVECTOR V
)
{
#define XM_Scale (-1.0f / XM_PACK_FACTOR)
#define XM_URange ((FLOAT)(1 << 4))
#define XM_Offset (3.0f)

#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTOR Min = {-524287.0f, -524287.0f, -524287.0f, -7.0f};
    static CONST XMVECTOR Max = {524287.0f, 524287.0f, 524287.0f, 7.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, Min, Max);
    N = XMVectorRound(N);

    pDestination->v = ((INT64)N.vector4_f32[3] << 60) |
                      (((INT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
                      (((INT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
                      (((INT64)N.vector4_f32[0] & 0xFFFFF));

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    // Note: Masks are x,w,y and z
    static const XMVECTORF32 MinIco4 = {-524287.0f,-7.0f,-524287.0f,-524287.0f};
    static const XMVECTORF32 MaxIco4 = { 524287.0f, 7.0f, 524287.0f, 524287.0f};
    static const XMVECTORF32 ScaleIco4 = {1.0f,4096.0f*65536.0f,4096.0f,1.0f};
    static const XMVECTORI32 MaskIco4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF};

    // Clamp to bounds (the components are processed as x,w,y,z)
    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
    vResult = _mm_max_ps(vResult,MinIco4);
    vResult = _mm_min_ps(vResult,MaxIco4);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleIco4);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskIco4);
    // Shift y and z to straddle the 32-bit boundary
    __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
    // Shift it into place
    vResulti2 = _mm_slli_si128(vResulti2,20/8);
    // i = x|y<<20|z<<40|w<<60
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

#undef XM_Scale
#undef XM_URange
#undef XM_Offset
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreXDecN4
(
    XMXDECN4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f};
    static CONST XMVECTORF32 Scale = {511.0f, 511.0f, 511.0f, 3.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, Min.v, g_XMOne.v);
    N = XMVectorMultiply(N, Scale.v);
    N = XMVectorRound(N);

    pDestination->v = ((UINT)N.vector4_f32[3] << 30) |
                      (((INT)N.vector4_f32[2] & 0x3FF) << 20) |
                      (((INT)N.vector4_f32[1] & 0x3FF) << 10) |
                      (((INT)N.vector4_f32[0] & 0x3FF));

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f};
    static const XMVECTORF32 Scale = {511.0f, 511.0f*1024.0f, 511.0f*1048576.0f,3.0f*536870912.0f};
    static const XMVECTORI32 ScaleMask = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<29};
    XMASSERT(pDestination);
    XMVECTOR vResult = _mm_max_ps(V,Min);
    vResult = _mm_min_ps(vResult,g_XMOne);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,Scale);
    // Convert to int (W is unsigned)
    __m128i vResulti = _mm_cvtps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,ScaleMask);
    // To fix W, add itself to shift it up to <<30 instead of <<29
    __m128i vResultw = _mm_and_si128(vResulti,g_XMMaskW);
    vResulti = _mm_add_epi32(vResulti,vResultw);
    // Do a horizontal or of all 4 entries
    vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreXDec4
(
    XMXDEC4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTOR Min = {-511.0f, -511.0f, -511.0f, 0.0f};
    static CONST XMVECTOR Max = {511.0f, 511.0f, 511.0f, 3.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, Min, Max);

    pDestination->v = ((UINT)N.vector4_f32[3] << 30) |
                      (((INT)N.vector4_f32[2] & 0x3FF) << 20) |
                      (((INT)N.vector4_f32[1] & 0x3FF) << 10) |
                      (((INT)N.vector4_f32[0] & 0x3FF));

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static const XMVECTORF32 MinXDec4 = {-511.0f,-511.0f,-511.0f, 0.0f};
    static const XMVECTORF32 MaxXDec4 = { 511.0f, 511.0f, 511.0f, 3.0f};
    static const XMVECTORF32 ScaleXDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f};
    static const XMVECTORI32 MaskXDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)};

    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,MinXDec4);
    vResult = _mm_min_ps(vResult,MaxXDec4);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleXDec4);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskXDec4);
    // Do a horizontal or of 4 entries
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
    // x = x|z, y = y|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    // Move the y|w combination to the x position
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
    // Perform a single bit left shift on y|w
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
    // i = x|y|z|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreUDecN4
(
    XMUDECN4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTORF32 Scale = {1023.0f, 1023.0f, 1023.0f, 3.0f};

    XMASSERT(pDestination);

    N = XMVectorSaturate(V);
    N = XMVectorMultiply(N, Scale.v);

    pDestination->v = ((UINT)N.vector4_f32[3] << 30) |
                      (((UINT)N.vector4_f32[2] & 0x3FF) << 20) |
                      (((UINT)N.vector4_f32[1] & 0x3FF) << 10) |
                      (((UINT)N.vector4_f32[0] & 0x3FF));

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static const XMVECTORF32 ScaleUDecN4 = {1023.0f,1023.0f*1024.0f*0.5f,1023.0f*1024.0f*1024.0f,3.0f*1024.0f*1024.0f*1024.0f*0.5f};
    static const XMVECTORI32 MaskUDecN4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)};

    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,g_XMOne);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleUDecN4);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskUDecN4);
    // Do a horizontal or of 4 entries
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
    // x = x|z, y = y|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    // Move the y|w combination to the x position
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
    // Perform a left shift by one bit on y|w
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
    // i = x|y|z|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
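/* Illustrative sketch (not part of the library): XMUDECN4 matches the common
   10:10:10:2 unsigned-normalized layout. Storing full white with the w field
   at maximum:

       XMUDECN4 c;
       XMStoreUDecN4( &c, XMVectorSet( 1.0f, 1.0f, 1.0f, 1.0f ) );
       // c.v == 0xFFFFFFFF  (3<<30 | 1023<<20 | 1023<<10 | 1023)
*/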
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreUDec4
(
    XMUDEC4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTOR Max = {1023.0f, 1023.0f, 1023.0f, 3.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, XMVectorZero(), Max);

    pDestination->v = ((UINT)N.vector4_f32[3] << 30) |
                      (((UINT)N.vector4_f32[2] & 0x3FF) << 20) |
                      (((UINT)N.vector4_f32[1] & 0x3FF) << 10) |
                      (((UINT)N.vector4_f32[0] & 0x3FF));

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static const XMVECTORF32 MaxUDec4 = { 1023.0f, 1023.0f, 1023.0f, 3.0f};
    static const XMVECTORF32 ScaleUDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f};
    static const XMVECTORI32 MaskUDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)};

    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,MaxUDec4);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleUDec4);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskUDec4);
    // Do a horizontal or of 4 entries
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
    // x = x|z, y = y|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    // Move the y|w combination to the x position
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
    // Perform a left shift by one bit on y|w
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
    // i = x|y|z|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreDecN4
(
    XMDECN4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTORF32 Scale = {511.0f, 511.0f, 511.0f, 1.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
    N = XMVectorMultiply(N, Scale.v);

    pDestination->v = ((INT)N.vector4_f32[3] << 30) |
                      (((INT)N.vector4_f32[2] & 0x3FF) << 20) |
                      (((INT)N.vector4_f32[1] & 0x3FF) << 10) |
                      (((INT)N.vector4_f32[0] & 0x3FF));

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static const XMVECTORF32 ScaleDecN4 = {511.0f,511.0f*1024.0f,511.0f*1024.0f*1024.0f,1.0f*1024.0f*1024.0f*1024.0f};
    static const XMVECTORI32 MaskDecN4= {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<30};

    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
    vResult = _mm_min_ps(vResult,g_XMOne);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleDecN4);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskDecN4);
    // Do a horizontal or of 4 entries
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
    // x = x|z, y = y|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    // Move the y|w combination to the x position
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
    // i = x|y|z|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreDec4
(
    XMDEC4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTOR Min = {-511.0f, -511.0f, -511.0f, -1.0f};
    static CONST XMVECTOR Max = {511.0f, 511.0f, 511.0f, 1.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, Min, Max);

    pDestination->v = ((INT)N.vector4_f32[3] << 30) |
                      (((INT)N.vector4_f32[2] & 0x3FF) << 20) |
                      (((INT)N.vector4_f32[1] & 0x3FF) << 10) |
                      (((INT)N.vector4_f32[0] & 0x3FF));

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static const XMVECTORF32 MinDec4 = {-511.0f,-511.0f,-511.0f,-1.0f};
    static const XMVECTORF32 MaxDec4 = { 511.0f, 511.0f, 511.0f, 1.0f};
    static const XMVECTORF32 ScaleDec4 = {1.0f,1024.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f};
    static const XMVECTORI32 MaskDec4= {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<30};

    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,MinDec4);
    vResult = _mm_min_ps(vResult,MaxDec4);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleDec4);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskDec4);
    // Do a horizontal or of 4 entries
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
    // x = x|z, y = y|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    // Move the y|w combination to the x position
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
    // i = x|y|z|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreUByteN4
(
    XMUBYTEN4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTORF32 Scale = {255.0f, 255.0f, 255.0f, 255.0f};

    XMASSERT(pDestination);

    N = XMVectorSaturate(V);
    N = XMVectorMultiply(N, Scale.v);
    N = XMVectorRound(N);

    pDestination->x = (BYTE)N.vector4_f32[0];
    pDestination->y = (BYTE)N.vector4_f32[1];
    pDestination->z = (BYTE)N.vector4_f32[2];
    pDestination->w = (BYTE)N.vector4_f32[3];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    // y and w are scaled to half their final bit position; see the shift fix below
    static const XMVECTORF32 ScaleUByteN4 = {255.0f,255.0f*256.0f*0.5f,255.0f*256.0f*256.0f,255.0f*256.0f*256.0f*256.0f*0.5f};
    static const XMVECTORI32 MaskUByteN4 = {0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)};
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,g_XMOne);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleUByteN4);
    // Convert to int (with truncation)
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskUByteN4);
    // Do a horizontal or of 4 entries
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
    // x = x|z, y = y|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    // Move the y|w entry down to the x position
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
    // Perform a single bit left shift to fix y|w
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
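
// NOTE on the half-scale trick above (an editorial inference from the
// constants, not original documentation): w's final position is bits 24-31,
// but 255.0f*256.0f*256.0f*256.0f = 4278190080.0f exceeds INT_MAX
// (2147483647), so _mm_cvttps_epi32 would return the "integer indefinite"
// value 0x80000000. The y and w lanes are therefore scaled to half their
// final bit position and masked with pre-shifted masks (0xFF<<7 and
// 0xFF<<23). After the horizontal OR they travel together in one lane, so a
// single doubling (_mm_add_epi32 of the lane with itself, i.e. a one-bit
// left shift) moves both to their final bits 8-15 and 24-31 at once.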

//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreUByte4
(
    XMUBYTE4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTOR Max = {255.0f, 255.0f, 255.0f, 255.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, XMVectorZero(), Max);
    N = XMVectorRound(N);

    pDestination->x = (BYTE)N.vector4_f32[0];
    pDestination->y = (BYTE)N.vector4_f32[1];
    pDestination->z = (BYTE)N.vector4_f32[2];
    pDestination->w = (BYTE)N.vector4_f32[3];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static const XMVECTORF32 MaxUByte4 = { 255.0f, 255.0f, 255.0f, 255.0f};
    // y and w use the same half-position scale as XMStoreUByteN4 above
    static const XMVECTORF32 ScaleUByte4 = {1.0f,256.0f*0.5f,256.0f*256.0f,256.0f*256.0f*256.0f*0.5f};
    static const XMVECTORI32 MaskUByte4 = {0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)};
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,MaxUByte4);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleUByte4);
    // Convert to int (with truncation)
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskUByte4);
    // Do a horizontal or of 4 entries
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
    // x = x|z, y = y|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    // Move the y|w entry down to the x position
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
    // Perform a single bit left shift to fix y|w
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreByteN4
(
    XMBYTEN4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTORF32 Scale = {127.0f, 127.0f, 127.0f, 127.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
    N = XMVectorMultiply(N, Scale.v);
    N = XMVectorRound(N);

    pDestination->x = (CHAR)N.vector4_f32[0];
    pDestination->y = (CHAR)N.vector4_f32[1];
    pDestination->z = (CHAR)N.vector4_f32[2];
    pDestination->w = (CHAR)N.vector4_f32[3];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static const XMVECTORF32 ScaleByteN4 = {127.0f,127.0f*256.0f,127.0f*256.0f*256.0f,127.0f*256.0f*256.0f*256.0f};
    static const XMVECTORI32 MaskByteN4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24};
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
    vResult = _mm_min_ps(vResult,g_XMOne);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleByteN4);
    // Convert to int (with truncation)
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskByteN4);
    // Do a horizontal or of 4 entries
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
    // x = x|z, y = y|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    // Move the y|w entry down to the x position
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
    // i = x|y|z|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
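
// NOTE (editorial annotation, not part of the original library): the signed
// byte paths need no half-scale trick, because the largest scaled magnitude,
// 127.0f*256.0f*256.0f*256.0f = 2130706432.0f, still fits in a signed 32-bit
// int. Also note the clamp maps [-1,1] onto the symmetric range [-127,127];
// the byte value -128 is never produced.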

//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreByte4
(
    XMBYTE4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTOR Min = {-127.0f, -127.0f, -127.0f, -127.0f};
    static CONST XMVECTOR Max = {127.0f, 127.0f, 127.0f, 127.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, Min, Max);
    N = XMVectorRound(N);

    pDestination->x = (CHAR)N.vector4_f32[0];
    pDestination->y = (CHAR)N.vector4_f32[1];
    pDestination->z = (CHAR)N.vector4_f32[2];
    pDestination->w = (CHAR)N.vector4_f32[3];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static const XMVECTORF32 MinByte4 = {-127.0f,-127.0f,-127.0f,-127.0f};
    static const XMVECTORF32 MaxByte4 = { 127.0f, 127.0f, 127.0f, 127.0f};
    static const XMVECTORF32 ScaleByte4 = {1.0f,256.0f,256.0f*256.0f,256.0f*256.0f*256.0f};
    static const XMVECTORI32 MaskByte4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24};
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,MinByte4);
    vResult = _mm_min_ps(vResult,MaxByte4);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleByte4);
    // Convert to int (with truncation)
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskByte4);
    // Do a horizontal or of 4 entries
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
    // x = x|z, y = y|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    // Move the y|w entry down to the x position
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
    // i = x|y|z|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreUNibble4
(
    XMUNIBBLE4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    XMASSERT(pDestination);
    static CONST XMVECTORF32 Max = {15.0f,15.0f,15.0f,15.0f};
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,Max);
    // Convert to int with rounding
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // No SSE operations will write to 16-bit values, so we have to extract them manually
    USHORT x = static_cast<USHORT>(_mm_extract_epi16(vInt,0));
    USHORT y = static_cast<USHORT>(_mm_extract_epi16(vInt,2));
    USHORT z = static_cast<USHORT>(_mm_extract_epi16(vInt,4));
    USHORT w = static_cast<USHORT>(_mm_extract_epi16(vInt,6));
    pDestination->v = ((w & 0xF) << 12) |
                      ((z & 0xF) << 8) |
                      ((y & 0xF) << 4) |
                      ((x & 0xF));
#else
    XMVECTOR N;
    static CONST XMVECTORF32 Max = {15.0f,15.0f,15.0f,15.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, XMVectorZero(), Max.v);
    N = XMVectorRound(N);

    pDestination->v = (((USHORT)N.vector4_f32[3] & 0xF) << 12) |
                      (((USHORT)N.vector4_f32[2] & 0xF) << 8) |
                      (((USHORT)N.vector4_f32[1] & 0xF) << 4) |
                      (((USHORT)N.vector4_f32[0] & 0xF));
#endif // !_XM_SSE_INTRINSICS_
}
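
// NOTE (editorial annotation, not part of the original library): each
// component is clamped to [0,15] and packed 4 bits wide, x lowest. A hedged
// usage sketch:
//
//     XMUNIBBLE4 packed;
//     XMStoreUNibble4(&packed, XMVectorSet(1.0f, 2.0f, 3.0f, 4.0f));
//     // packed.v == 0x4321 (w=4 in bits 12-15 ... x=1 in bits 0-3)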

//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreU555(
    XMU555* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    XMASSERT(pDestination);
    static CONST XMVECTORF32 Max = {31.0f, 31.0f, 31.0f, 1.0f};
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,Max);
    // Convert to int with rounding
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // No SSE operations will write to 16-bit values, so we have to extract them manually
    USHORT x = static_cast<USHORT>(_mm_extract_epi16(vInt,0));
    USHORT y = static_cast<USHORT>(_mm_extract_epi16(vInt,2));
    USHORT z = static_cast<USHORT>(_mm_extract_epi16(vInt,4));
    USHORT w = static_cast<USHORT>(_mm_extract_epi16(vInt,6));
    pDestination->v = ((w) ? 0x8000 : 0) |
                      ((z & 0x1F) << 10) |
                      ((y & 0x1F) << 5) |
                      ((x & 0x1F));
#else
    XMVECTOR N;
    static CONST XMVECTORF32 Max = {31.0f, 31.0f, 31.0f, 1.0f};

    XMASSERT(pDestination);

    N = XMVectorClamp(V, XMVectorZero(), Max.v);
    N = XMVectorRound(N);

    pDestination->v = ((N.vector4_f32[3] > 0.f) ? 0x8000 : 0) |
                      (((USHORT)N.vector4_f32[2] & 0x1F) << 10) |
                      (((USHORT)N.vector4_f32[1] & 0x1F) << 5) |
                      (((USHORT)N.vector4_f32[0] & 0x1F));
#endif // !_XM_SSE_INTRINSICS_
}
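
// NOTE (editorial annotation, not part of the original library): XMU555 is
// the classic X1R5G5B5-style layout, with x in bits 0-4, y in bits 5-9, z in
// bits 10-14, and w reduced to the single flag bit 15 (any nonzero w sets
// it). A hedged usage sketch:
//
//     XMU555 packed;
//     XMStoreU555(&packed, XMVectorSet(31.0f, 0.0f, 0.0f, 1.0f));
//     // packed.v == 0x801F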

//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreColor
(
    XMCOLOR* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR N;
    static CONST XMVECTORF32 Scale = {255.0f, 255.0f, 255.0f, 255.0f};

    XMASSERT(pDestination);

    N = XMVectorSaturate(V);
    N = XMVectorMultiply(N, Scale.v);
    N = XMVectorRound(N);

    pDestination->c = ((UINT)N.vector4_f32[3] << 24) |
                      ((UINT)N.vector4_f32[0] << 16) |
                      ((UINT)N.vector4_f32[1] << 8) |
                      ((UINT)N.vector4_f32[2]);

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static CONST XMVECTORF32 Scale = {255.0f,255.0f,255.0f,255.0f};
    // Set <0 to 0
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    // Set >1 to 1
    vResult = _mm_min_ps(vResult,g_XMOne);
    // Convert to 0-255
    vResult = _mm_mul_ps(vResult,Scale);
    // Shuffle RGBA to ARGB
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,0,1,2));
    // Convert to int
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // Mash to shorts
    vInt = _mm_packs_epi32(vInt,vInt);
    // Mash to bytes
    vInt = _mm_packus_epi16(vInt,vInt);
    // Store the color
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->c),reinterpret_cast<__m128 *>(&vInt)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
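
// NOTE (editorial annotation, not part of the original library): the shuffle
// places the lanes in B,G,R,A order so that the two saturating packs emit the
// little-endian byte sequence B,G,R,A, i.e. the 0xAARRGGBB DWORD that
// D3DCOLOR expects. A hedged usage sketch:
//
//     XMCOLOR color;
//     XMStoreColor(&color, XMVectorSet(1.0f, 0.0f, 0.0f, 1.0f)); // opaque red
//     // color.c == 0xFFFF0000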

//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreFloat3x3
(
    XMFLOAT3X3* pDestination,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_SSE_INTRINSICS_)

    XMStoreFloat3x3NC(pDestination, M);

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreFloat3x3NC
(
    XMFLOAT3X3* pDestination,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);

    pDestination->m[0][0] = M.r[0].vector4_f32[0];
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
    pDestination->m[0][2] = M.r[0].vector4_f32[2];

    pDestination->m[1][0] = M.r[1].vector4_f32[0];
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
    pDestination->m[1][2] = M.r[1].vector4_f32[2];

    pDestination->m[2][0] = M.r[2].vector4_f32[0];
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
    pDestination->m[2][2] = M.r[2].vector4_f32[2];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    XMVECTOR vTemp1 = M.r[0];
    XMVECTOR vTemp2 = M.r[1];
    XMVECTOR vTemp3 = M.r[2];
    // z1,z1,x2,x2
    XMVECTOR vWork = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,0,2,2));
    // x1,y1,z1,x2
    vTemp1 = _mm_shuffle_ps(vTemp1,vWork,_MM_SHUFFLE(2,0,1,0));
    _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
    // y2,z2,x3,y3
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
    _mm_storeu_ps(&pDestination->m[1][1],vTemp2);
    // z3,z3,z3,z3
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp3,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss(&pDestination->m[2][2],vTemp3);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
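
// NOTE (editorial annotation, an inference from the shuffles above): a 3x3
// float matrix is 9 contiguous floats, so the rows are repacked into
// [x1 y1 z1 x2], [y2 z2 x3 y3] and a final scalar z3, letting the whole
// matrix be written with two unaligned vector stores plus one scalar store
// instead of nine scalar stores.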

//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreFloat4x3
(
    XMFLOAT4X3* pDestination,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_SSE_INTRINSICS_)

    XMStoreFloat4x3NC(pDestination, M);

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreFloat4x3A
(
    XMFLOAT4X3A* pDestination,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);

    pDestination->m[0][0] = M.r[0].vector4_f32[0];
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
    pDestination->m[0][2] = M.r[0].vector4_f32[2];

    pDestination->m[1][0] = M.r[1].vector4_f32[0];
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
    pDestination->m[1][2] = M.r[1].vector4_f32[2];

    pDestination->m[2][0] = M.r[2].vector4_f32[0];
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
    pDestination->m[2][2] = M.r[2].vector4_f32[2];

    pDestination->m[3][0] = M.r[3].vector4_f32[0];
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
    pDestination->m[3][2] = M.r[3].vector4_f32[2];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
    // x1,y1,z1,w1
    XMVECTOR vTemp1 = M.r[0];
    // x2,y2,z2,w2
    XMVECTOR vTemp2 = M.r[1];
    // x3,y3,z3,w3
    XMVECTOR vTemp3 = M.r[2];
    // x4,y4,z4,w4
    XMVECTOR vTemp4 = M.r[3];
    // z1,z1,x2,y2
    XMVECTOR vTemp = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(1,0,2,2));
    // y2,z2,x3,y3 (Final)
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
    // x1,y1,z1,x2 (Final)
    vTemp1 = _mm_shuffle_ps(vTemp1,vTemp,_MM_SHUFFLE(2,0,1,0));
    // z3,z3,x4,x4
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
    // z3,x4,y4,z4 (Final)
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
    // Store in 3 operations
    _mm_store_ps(&pDestination->m[0][0],vTemp1);
    _mm_store_ps(&pDestination->m[1][1],vTemp2);
    _mm_store_ps(&pDestination->m[2][2],vTemp3);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
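
// NOTE (editorial annotation, not part of the original library): a 4x3 float
// matrix is 12 contiguous floats, exactly three XMVECTORs, so the four rows
// collapse into three 16-byte stores: [x1 y1 z1 x2], [y2 z2 x3 y3] and
// [z3 x4 y4 z4]. The A variant may use aligned stores because the destination
// is asserted to be 16-byte aligned.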

//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreFloat4x3NC
(
    XMFLOAT4X3* pDestination,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);

    pDestination->m[0][0] = M.r[0].vector4_f32[0];
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
    pDestination->m[0][2] = M.r[0].vector4_f32[2];

    pDestination->m[1][0] = M.r[1].vector4_f32[0];
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
    pDestination->m[1][2] = M.r[1].vector4_f32[2];

    pDestination->m[2][0] = M.r[2].vector4_f32[0];
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
    pDestination->m[2][2] = M.r[2].vector4_f32[2];

    pDestination->m[3][0] = M.r[3].vector4_f32[0];
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
    pDestination->m[3][2] = M.r[3].vector4_f32[2];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    XMVECTOR vTemp1 = M.r[0];
    XMVECTOR vTemp2 = M.r[1];
    XMVECTOR vTemp3 = M.r[2];
    XMVECTOR vTemp4 = M.r[3];
    // y2,z2,x3,y3 (Final)
    XMVECTOR vTemp2x = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
    // x2,x2,z1,z1
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(2,2,0,0));
    // x1,y1,z1,x2 (Final)
    vTemp1 = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,2,1,0));
    // z3,z3,x4,x4
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
    // z3,x4,y4,z4 (Final)
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
    // Store with unaligned writes
    _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
    _mm_storeu_ps(&pDestination->m[1][1],vTemp2x);
    _mm_storeu_ps(&pDestination->m[2][2],vTemp3);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreFloat4x4
(
    XMFLOAT4X4* pDestination,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS)

    XMStoreFloat4x4NC(pDestination, M);

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);

    _mm_storeu_ps( &pDestination->_11, M.r[0] );
    _mm_storeu_ps( &pDestination->_21, M.r[1] );
    _mm_storeu_ps( &pDestination->_31, M.r[2] );
    _mm_storeu_ps( &pDestination->_41, M.r[3] );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
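
// NOTE (editorial annotation, not part of the original library):
// XMStoreFloat4x4 uses unaligned stores (_mm_storeu_ps), so the destination
// only needs float alignment; when the destination is known to be 16-byte
// aligned, XMStoreFloat4x4A below uses the faster aligned _mm_store_ps.
// A hedged usage sketch:
//
//     XMFLOAT4X4 m;                       // arbitrary alignment
//     XMStoreFloat4x4(&m, XMMatrixIdentity());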

//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreFloat4x4A
(
    XMFLOAT4X4A* pDestination,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);

    pDestination->m[0][0] = M.r[0].vector4_f32[0];
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
    pDestination->m[0][2] = M.r[0].vector4_f32[2];
    pDestination->m[0][3] = M.r[0].vector4_f32[3];

    pDestination->m[1][0] = M.r[1].vector4_f32[0];
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
    pDestination->m[1][2] = M.r[1].vector4_f32[2];
    pDestination->m[1][3] = M.r[1].vector4_f32[3];

    pDestination->m[2][0] = M.r[2].vector4_f32[0];
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
    pDestination->m[2][2] = M.r[2].vector4_f32[2];
    pDestination->m[2][3] = M.r[2].vector4_f32[3];

    pDestination->m[3][0] = M.r[3].vector4_f32[0];
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
    pDestination->m[3][2] = M.r[3].vector4_f32[2];
    pDestination->m[3][3] = M.r[3].vector4_f32[3];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);

    _mm_store_ps( &pDestination->_11, M.r[0] );
    _mm_store_ps( &pDestination->_21, M.r[1] );
    _mm_store_ps( &pDestination->_31, M.r[2] );
    _mm_store_ps( &pDestination->_41, M.r[3] );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreFloat4x4NC
(
    XMFLOAT4X4* pDestination,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);

    pDestination->m[0][0] = M.r[0].vector4_f32[0];
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
    pDestination->m[0][2] = M.r[0].vector4_f32[2];
    pDestination->m[0][3] = M.r[0].vector4_f32[3];

    pDestination->m[1][0] = M.r[1].vector4_f32[0];
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
    pDestination->m[1][2] = M.r[1].vector4_f32[2];
    pDestination->m[1][3] = M.r[1].vector4_f32[3];

    pDestination->m[2][0] = M.r[2].vector4_f32[0];
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
    pDestination->m[2][2] = M.r[2].vector4_f32[2];
    pDestination->m[2][3] = M.r[2].vector4_f32[3];

    pDestination->m[3][0] = M.r[3].vector4_f32[0];
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
    pDestination->m[3][2] = M.r[3].vector4_f32[2];
    pDestination->m[3][3] = M.r[3].vector4_f32[3];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    _mm_storeu_ps(&pDestination->m[0][0],M.r[0]);
    _mm_storeu_ps(&pDestination->m[1][0],M.r[1]);
    _mm_storeu_ps(&pDestination->m[2][0],M.r[2]);
    _mm_storeu_ps(&pDestination->m[3][0],M.r[3]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

#endif // __XNAMATHCONVERT_INL__