Code Structure Analysis - c:\Program Files\Microsoft Visual Studio 9.0\VC\include\dvec.h


1		/**
2		*** Copyright (C) 1985-1999 Intel Corporation. All rights reserved.
3		***
4		*** The information and source code contained herein is the exclusive
5		*** property of Intel Corporation and may not be disclosed, examined
6		*** or reproduced in whole or in part without explicit written authorization
7		*** from the company.
8		***
9		**/
10
11		/*
12		* Definition of a C++ class interface to Willamette New Instruction intrinsics.
13		*
14		* File name : dvec.h class definitions
15		*
16		* Concept: A C++ abstraction of Willamette new instructions designed to improve
17		* programmer productivity. Speed and accuracy are sacrificed for utility.
18		* Facilitates an easy transition to compiler intrinsics
19		* or assembly language.
20		*
21		*/
22
23		#ifndef _DVEC_H_INCLUDED
24		#define _DVEC_H_INCLUDED
25		#ifndef RC_INVOKED
26
27		#if !defined __cplusplus
28		#error ERROR: This file is only supported in C++ compilations!
29		#endif /* !__cplusplus */
30
31		#if defined(_M_CEE_PURE)
32		#error ERROR: This file is not supported in the pure mode!
33		#else
34
35		#include <emmintrin.h> /* Willamette New Instructions Intrinsics include file */
36		#include <assert.h>
37		#include <fvec.h>
38		#include <crtdefs.h>
39
40		#ifdef _MSC_VER
41		#pragma pack(push,_CRT_PACKING)
42		#endif /* _MSC_VER */
43
44		/* Define _ENABLE_VEC_DEBUG to enable std::ostream inserters for debug output */
45		#if defined(_ENABLE_VEC_DEBUG)
46		#include <iostream>
47		#endif
48
49		#pragma pack(push,16) /* Must ensure class & union 16-B aligned */
50
51		#define EXPLICIT explicit
52
53		/* EMM Functionality Intrinsics */
54
55		class I8vec16; /* 16 elements, each element a signed or unsigned char data type */
56		class Is8vec16; /* 16 elements, each element a signed char data type */
57		class Iu8vec16; /* 16 elements, each element an unsigned char data type */
58		class I16vec8; /* 8 elements, each element a signed or unsigned short */
59		class Is16vec8; /* 8 elements, each element a signed short */
60		class Iu16vec8; /* 8 elements, each element an unsigned short */
61		class I32vec4; /* 4 elements, each element a signed or unsigned long */
62		class Is32vec4; /* 4 elements, each element a signed long */
63		class Iu32vec4; /* 4 elements, each element a unsigned long */
64		class I64vec2; /* 2 element, each a __m64 data type */
65		class I128vec1; /* 1 element, a __m128i data type */
66
/*
 * Element-access helpers.
 *
 * Each macro yields an lvalue for element number `element` of `vector`,
 * reinterpreting the storage of `vector` as an array of the element type
 * named in the cast.  `vector` must be an lvalue, because its address is
 * taken; `element` is an index counted in units of that element type.
 * No bounds checking is performed here.
 */
#define _MM_16UB(element,vector) (*((unsigned char*)&(vector) + (element)))
#define _MM_16B(element,vector) (*((signed char*)&(vector) + (element)))

#define _MM_8UW(element,vector) (*((unsigned short*)&(vector) + (element)))
#define _MM_8W(element,vector) (*((short*)&(vector) + (element)))

#define _MM_4UDW(element,vector) (*((unsigned int*)&(vector) + (element)))
#define _MM_4DW(element,vector) (*((int*)&(vector) + (element)))

#define _MM_2QW(element,vector) (*((__int64*)&(vector) + (element)))
77
78
79		/* We need a m128i constant, keeping performance in mind*/
80
81		#pragma warning(push)
82		#pragma warning(disable : 4640)
83		inline const __m128i get_mask128()
84		{
85		static const __m128i mask128 = _mm_set1_epi64(M64(0xffffffffffffffffi64));
86		return mask128;
87		}
88		#pragma warning(pop)
89
90
91		/* M128 Class:
92		* 1 element, a __m128i data type
93		* Contructors & Logical Operations
94		*/
95
96		class M128
97		{
98		protected:
99		__m128i vec;
100
101		public:
102		M128() { }
103		M128(__m128i mm) { vec = mm; }
104
105		operator __m128i() const { return vec; }
106
107		/* Logical Operations */
108		M128& operator&=(const M128 &a) { return *this = (M128) _mm_and_si128(vec,a); }
109		M128& operator\|=(const M128 &a) { return *this = (M128) _mm_or_si128(vec,a); }
110		M128& operator^=(const M128 &a) { return *this = (M128) _mm_xor_si128(vec,a); }
111
112		};
113
114		inline M128 operator&(const M128 &a, const M128 &b) { return _mm_and_si128(a,b); }
115		inline M128 operator\|(const M128 &a, const M128 &b) { return _mm_or_si128(a,b); }
116		inline M128 operator^(const M128 &a, const M128 &b) { return _mm_xor_si128(a,b); }
117		inline M128 andnot(const M128 &a, const M128 &b) { return _mm_andnot_si128(a,b); }
118
119		/* I128vec1 Class:
120		* 1 element, a __m128i data type
121		* Contains Operations which can operate on any __m6128i data type
122		*/
123
124		class I128vec1 : public M128
125		{
126		public:
127		I128vec1() { }
128		I128vec1(__m128i mm) : M128(mm) { }
129
130		I128vec1& operator= (const M128 &a) { return *this = (I128vec1) a; }
131		I128vec1& operator&=(const M128 &a) { return *this = (I128vec1) _mm_and_si128(vec,a); }
132		I128vec1& operator\|=(const M128 &a) { return *this = (I128vec1) _mm_or_si128(vec,a); }
133		I128vec1& operator^=(const M128 &a) { return *this = (I128vec1) _mm_xor_si128(vec,a); }
134
135		};
136
137		/* I64vec2 Class:
138		* 2 elements, each element signed or unsigned 64-bit integer
139		*/
140		class I64vec2 : public M128
141		{
142		public:
143		I64vec2() { }
144		I64vec2(__m128i mm) : M128(mm) { }
145
146		I64vec2(__m64 q1, __m64 q0)
147		{
148		_MM_2QW(0,vec) = (__int64)&q0;
149		_MM_2QW(1,vec) = (__int64)&q1;
150		}
151
152		/* Assignment Operator */
153		I64vec2& operator= (const M128 &a) { return *this = (I64vec2) a; }
154
155		/* Logical Assignment Operators */
156		I64vec2& operator&=(const M128 &a) { return *this = (I64vec2) _mm_and_si128(vec,a); }
157		I64vec2& operator\|=(const M128 &a) { return *this = (I64vec2) _mm_or_si128(vec,a); }
158		I64vec2& operator^=(const M128 &a) { return *this = (I64vec2) _mm_xor_si128(vec,a); }
159
160		/* Addition & Subtraction Assignment Operators */
161		I64vec2& operator +=(const I64vec2 &a) { return *this = (I64vec2) _mm_add_epi64(vec,a); }
162		I64vec2& operator -=(const I64vec2 &a) { return *this = (I64vec2) _mm_sub_epi64(vec,a); }
163
164		/* Shift Logical Operators */
165		I64vec2 operator<<(const I64vec2 &a) { return _mm_sll_epi64(vec,a); }
166		I64vec2 operator<<(int count) { return _mm_slli_epi64(vec,count); }
167		I64vec2& operator<<=(const I64vec2 &a) { return *this = (I64vec2) _mm_sll_epi64(vec,a); }
168		I64vec2& operator<<=(int count) { return *this = (I64vec2) _mm_slli_epi64(vec,count); }
169		I64vec2 operator>>(const I64vec2 &a) { return _mm_srl_epi64(vec,a); }
170		I64vec2 operator>>(int count) { return _mm_srli_epi64(vec,count); }
171		I64vec2& operator>>=(const I64vec2 &a) { return *this = (I64vec2) _mm_srl_epi64(vec,a); }
172		I64vec2& operator>>=(int count) { return *this = (I64vec2) _mm_srli_epi64(vec,count); }
173
174		/* Element Access for Debug, No data modified */
175		const __int64& operator[](int i)const
176		{
177		assert(static_cast<unsigned int>(i) < 2); /* Only 2 elements to access */
178		return _MM_2QW(i,vec);
179		}
180
181		/* Element Access and Assignment for Debug */
182		__int64& operator[](int i)
183		{
184		assert(static_cast<unsigned int>(i) < 2); /* Only 2 elements to access */
185		return _MM_2QW(i,vec);
186		}
187
188
189		};
190
191		/* Unpacks */
192		inline I64vec2 unpack_low(const I64vec2 &a, const I64vec2 &b) {return _mm_unpacklo_epi64(a,b); }
193		inline I64vec2 unpack_high(const I64vec2 &a, const I64vec2 &b) {return _mm_unpackhi_epi64(a,b); }
194
195		/* I32vec4 Class:
196		* 4 elements, each element either a signed or unsigned int
197		*/
198		class I32vec4 : public M128
199		{
200		public:
201		I32vec4() { }
202		I32vec4(__m128i mm) : M128(mm) { }
203
204		/* Assignment Operator */
205		I32vec4& operator= (const M128 &a) { return *this = (I32vec4) a; }
206
207		/* Logicals Operators */
208		I32vec4& operator&=(const M128 &a) { return *this = (I32vec4) _mm_and_si128(vec,a); }
209		I32vec4& operator\|=(const M128 &a) { return *this = (I32vec4) _mm_or_si128(vec,a); }
210		I32vec4& operator^=(const M128 &a) { return *this = (I32vec4) _mm_xor_si128(vec,a); }
211
212		/* Addition & Subtraction Assignment Operators */
213		I32vec4& operator +=(const I32vec4 &a) { return *this = (I32vec4)_mm_add_epi32(vec,a); }
214		I32vec4& operator -=(const I32vec4 &a) { return *this = (I32vec4)_mm_sub_epi32(vec,a); }
215
216		/* Shift Logical Operators */
217		I32vec4 operator<<(const I32vec4 &a) { return _mm_sll_epi32(vec,a); }
218		I32vec4 operator<<(int count) { return _mm_slli_epi32(vec,count); }
219		I32vec4& operator<<=(const I32vec4 &a) { return *this = (I32vec4)_mm_sll_epi32(vec,a); }
220		I32vec4& operator<<=(int count) { return *this = (I32vec4)_mm_slli_epi32(vec,count); }
221
222		};
223
224		inline I32vec4 cmpeq(const I32vec4 &a, const I32vec4 &b) { return _mm_cmpeq_epi32(a,b); }
225		inline I32vec4 cmpneq(const I32vec4 &a, const I32vec4 &b) { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b), get_mask128()); }
226
227		inline I32vec4 unpack_low(const I32vec4 &a, const I32vec4 &b) { return _mm_unpacklo_epi32(a,b); }
228		inline I32vec4 unpack_high(const I32vec4 &a, const I32vec4 &b) { return _mm_unpackhi_epi32(a,b); }
229
230		/* Is32vec4 Class:
231		* 4 elements, each element signed integer
232		*/
233		class Is32vec4 : public I32vec4
234		{
235		public:
236		Is32vec4() { }
237		Is32vec4(__m128i mm) : I32vec4(mm) { }
238		Is32vec4(int i3, int i2, int i1, int i0)
239		{
240		_MM_4DW(0,vec) = i0;
241		_MM_4DW(1,vec) = i1;
242		_MM_4DW(2,vec) = i2;
243		_MM_4DW(3,vec) = i3;
244		}
245
246		/* Assignment Operator */
247		Is32vec4& operator= (const M128 &a) { return *this = (Is32vec4) a; }
248
249		/* Logical Operators */
250		Is32vec4& operator&=(const M128 &a) { return *this = (Is32vec4) _mm_and_si128(vec,a); }
251		Is32vec4& operator\|=(const M128 &a) { return *this = (Is32vec4) _mm_or_si128(vec,a); }
252		Is32vec4& operator^=(const M128 &a) { return *this = (Is32vec4) _mm_xor_si128(vec,a); }
253
254		/* Addition & Subtraction Assignment Operators */
255		Is32vec4& operator +=(const I32vec4 &a) { return *this = (Is32vec4)_mm_add_epi32(vec,a); }
256		Is32vec4& operator -=(const I32vec4 &a) { return *this = (Is32vec4)_mm_sub_epi32(vec,a); }
257
258		/* Shift Logical Operators */
259		Is32vec4 operator<<(const M128 &a) { return _mm_sll_epi32(vec,a); }
260		Is32vec4 operator<<(int count) { return _mm_slli_epi32(vec,count); }
261		Is32vec4& operator<<=(const M128 &a) { return *this = (Is32vec4)_mm_sll_epi32(vec,a); }
262		Is32vec4& operator<<=(int count) { return *this = (Is32vec4)_mm_slli_epi32(vec,count); }
263		/* Shift Arithmetic Operations */
264		Is32vec4 operator>>(const M128 &a) { return _mm_sra_epi32(vec,a); }
265		Is32vec4 operator>>(int count) { return _mm_srai_epi32(vec,count); }
266		Is32vec4& operator>>=(const M128 &a) { return *this = (Is32vec4) _mm_sra_epi32(vec,a); }
267		Is32vec4& operator>>=(int count) { return *this = (Is32vec4) _mm_srai_epi32(vec,count); }
268
269		#if defined(_ENABLE_VEC_DEBUG)
270		/* Output for Debug */
271		friend std::ostream& operator<< (std::ostream &os, const Is32vec4 &a)
272		{
273		os << "[3]:" << _MM_4DW(3,a)
274		<< " [2]:" << _MM_4DW(2,a)
275		<< " [1]:" << _MM_4DW(1,a)
276		<< " [0]:" << _MM_4DW(0,a);
277		return os;
278		}
279		#endif
280
281		/* Element Access for Debug, No data modified */
282		const int& operator[](int i)const
283		{
284		assert(static_cast<unsigned int>(i) < 4); /* Only 4 elements to access */
285		return _MM_4DW(i,vec);
286		}
287
288		/* Element Access for Debug */
289		int& operator[](int i)
290		{
291		assert(static_cast<unsigned int>(i) < 4); /* Only 4 elements to access */
292		return _MM_4DW(i,vec);
293		}
294		};
295
296		/* Compares */
297		inline Is32vec4 cmpeq(const Is32vec4 &a, const Is32vec4 &b) { return _mm_cmpeq_epi32(a,b); }
298		inline Is32vec4 cmpneq(const Is32vec4 &a, const Is32vec4 &b) { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b), get_mask128()); }
299		inline Is32vec4 cmpgt(const Is32vec4 &a, const Is32vec4 &b) { return _mm_cmpgt_epi32(a,b); }
300		inline Is32vec4 cmplt(const Is32vec4 &a, const Is32vec4 &b) { return _mm_cmpgt_epi32(b,a); }
301
302		/* Unpacks */
303		inline Is32vec4 unpack_low(const Is32vec4 &a, const Is32vec4 &b) { return _mm_unpacklo_epi32(a,b); }
304		inline Is32vec4 unpack_high(const Is32vec4 &a, const Is32vec4 &b) { return _mm_unpackhi_epi32(a,b); }
305
306
307
308		/* Iu32vec4 Class:
309		* 4 elements, each element unsigned int
310		*/
311		class Iu32vec4 : public I32vec4
312		{
313		public:
314		Iu32vec4() { }
315		Iu32vec4(__m128i mm) : I32vec4(mm) { }
316		Iu32vec4(unsigned int ui3, unsigned int ui2, unsigned int ui1, unsigned int ui0)
317		{
318		_MM_4UDW(0,vec) = ui0;
319		_MM_4UDW(1,vec) = ui1;
320		_MM_4UDW(2,vec) = ui2;
321		_MM_4UDW(3,vec) = ui3;
322		}
323
324		/* Assignment Operator */
325		Iu32vec4& operator= (const M128 &a) { return *this = (Iu32vec4) a; }
326
327		/* Logical Assignment Operators */
328		Iu32vec4& operator&=(const M128 &a) { return *this = (Iu32vec4) _mm_and_si128(vec,a); }
329		Iu32vec4& operator\|=(const M128 &a) { return *this = (Iu32vec4) _mm_or_si128(vec,a); }
330		Iu32vec4& operator^=(const M128 &a) { return *this = (Iu32vec4) _mm_xor_si128(vec,a); }
331
332		/* Addition & Subtraction Assignment Operators */
333		Iu32vec4& operator +=(const I32vec4 &a) { return *this = (Iu32vec4)_mm_add_epi32(vec,a); }
334		Iu32vec4& operator -=(const I32vec4 &a) { return *this = (Iu32vec4)_mm_sub_epi32(vec,a); }
335
336		/* Shift Logical Operators */
337		Iu32vec4 operator<<(const M128 &a) { return _mm_sll_epi32(vec,a); }
338		Iu32vec4 operator<<(int count) { return _mm_slli_epi32(vec,count); }
339		Iu32vec4& operator<<=(const M128 &a) { return *this = (Iu32vec4)_mm_sll_epi32(vec,a); }
340		Iu32vec4& operator<<=(int count) { return *this = (Iu32vec4)_mm_slli_epi32(vec,count); }
341		Iu32vec4 operator>>(const M128 &a) { return _mm_srl_epi32(vec,a); }
342		Iu32vec4 operator>>(int count) { return _mm_srli_epi32(vec,count); }
343		Iu32vec4& operator>>=(const M128 &a) { return *this = (Iu32vec4) _mm_srl_epi32(vec,a); }
344		Iu32vec4& operator>>=(int count) { return *this = (Iu32vec4) _mm_srli_epi32(vec,count); }
345
346		#if defined(_ENABLE_VEC_DEBUG)
347		/* Output for Debug */
348		friend std::ostream& operator<< (std::ostream &os, const Iu32vec4 &a)
349		{
350		os << "[3]:" << _MM_4UDW(3,a)
351		<< " [2]:" << _MM_4UDW(2,a)
352		<< " [1]:" << _MM_4UDW(1,a)
353		<< " [0]:" << _MM_4UDW(0,a);
354		return os;
355		}
356		#endif
357
358		/* Element Access for Debug, No data modified */
359		const unsigned int& operator[](int i)const
360		{
361		assert(static_cast<unsigned int>(i) < 4); /* Only 4 elements to access */
362		return _MM_4UDW(i,vec);
363		}
364
365		/* Element Access and Assignment for Debug */
366		unsigned int& operator[](int i)
367		{
368		assert(static_cast<unsigned int>(i) < 4); /* Only 4 elements to access */
369		return _MM_4UDW(i,vec);
370		}
371		};
372
373		inline I64vec2 operator*(const Iu32vec4 &a, const Iu32vec4 &b) { return _mm_mul_epu32(a,b); }
374		inline Iu32vec4 cmpeq(const Iu32vec4 &a, const Iu32vec4 &b) { return _mm_cmpeq_epi32(a,b); }
375		inline Iu32vec4 cmpneq(const Iu32vec4 &a, const Iu32vec4 &b) { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b), get_mask128()); }
376
377		inline Iu32vec4 unpack_low(const Iu32vec4 &a, const Iu32vec4 &b) { return _mm_unpacklo_epi32(a,b); }
378		inline Iu32vec4 unpack_high(const Iu32vec4 &a, const Iu32vec4 &b) { return _mm_unpackhi_epi32(a,b); }
379
380		/* I16vec8 Class:
381		* 8 elements, each element either unsigned or signed short
382		*/
383		class I16vec8 : public M128
384		{
385		public:
386		I16vec8() { }
387		I16vec8(__m128i mm) : M128(mm) { }
388
389		/* Assignment Operator */
390		I16vec8& operator= (const M128 &a) { return *this = (I16vec8) a; }
391
392		/* Logical Assignment Operators */
393		I16vec8& operator&=(const M128 &a) { return *this = (I16vec8) _mm_and_si128(vec,a); }
394		I16vec8& operator\|=(const M128 &a) { return *this = (I16vec8) _mm_or_si128(vec,a); }
395		I16vec8& operator^=(const M128 &a) { return *this = (I16vec8) _mm_xor_si128(vec,a); }
396
397		/* Addition & Subtraction Assignment Operators */
398		I16vec8& operator +=(const I16vec8 &a) { return *this = (I16vec8) _mm_add_epi16(vec,a); }
399		I16vec8& operator -=(const I16vec8 &a) { return *this = (I16vec8) _mm_sub_epi16(vec,a); }
400		I16vec8& operator =(const I16vec8 &a) { return this = (I16vec8) _mm_mullo_epi16(vec,a); }
401
402		/* Shift Logical Operators */
403		I16vec8 operator<<(const M128 &a) { return _mm_sll_epi16(vec,a); }
404		I16vec8 operator<<(int count) { return _mm_slli_epi16(vec,count); }
405		I16vec8& operator<<=(const M128 &a) { return *this = (I16vec8)_mm_sll_epi16(vec,a); }
406		I16vec8& operator<<=(int count) { return *this = (I16vec8)_mm_slli_epi16(vec,count); }
407
408		};
409
410
411		inline I16vec8 operator*(const I16vec8 &a, const I16vec8 &b) { return _mm_mullo_epi16(a,b); }
412
413		inline I16vec8 cmpeq(const I16vec8 &a, const I16vec8 &b) { return _mm_cmpeq_epi16(a,b); }
414		inline I16vec8 cmpneq(const I16vec8 &a, const I16vec8 &b) { return _mm_andnot_si128(_mm_cmpeq_epi16(a,b), get_mask128()); }
415
416		inline I16vec8 unpack_low(const I16vec8 &a, const I16vec8 &b) { return _mm_unpacklo_epi16(a,b); }
417		inline I16vec8 unpack_high(const I16vec8 &a, const I16vec8 &b) { return _mm_unpackhi_epi16(a,b); }
418
419		/* Is16vec8 Class:
420		* 8 elements, each element signed short
421		*/
422		class Is16vec8 : public I16vec8
423		{
424		public:
425		Is16vec8() { }
426		Is16vec8(__m128i mm) : I16vec8(mm) { }
427		Is16vec8(signed short s7,signed short s6,signed short s5,signed short s4,signed short s3,signed short s2,signed short s1,signed short s0)
428		{
429		_MM_8W(0,vec) = s0;
430		_MM_8W(1,vec) = s1;
431		_MM_8W(2,vec) = s2;
432		_MM_8W(3,vec) = s3;
433		_MM_8W(4,vec) = s4;
434		_MM_8W(5,vec) = s5;
435		_MM_8W(6,vec) = s6;
436		_MM_8W(7,vec) = s7;
437		}
438
439		/* Assignment Operator */
440		Is16vec8& operator= (const M128 &a) { return *this = (Is16vec8) a; }
441
442		/* Logical Assignment Operators */
443		Is16vec8& operator&=(const M128 &a) { return *this = (Is16vec8) _mm_and_si128(vec,a); }
444		Is16vec8& operator\|=(const M128 &a) { return *this = (Is16vec8) _mm_or_si128(vec,a); }
445		Is16vec8& operator^=(const M128 &a) { return *this = (Is16vec8) _mm_xor_si128(vec,a); }
446
447		/* Addition & Subtraction Assignment Operators */
448		Is16vec8& operator +=(const I16vec8 &a) { return *this = (Is16vec8) _mm_add_epi16(vec,a); }
449		Is16vec8& operator -=(const I16vec8 &a) { return *this = (Is16vec8) _mm_sub_epi16(vec,a); }
450		Is16vec8& operator =(const I16vec8 &a) { return this = (Is16vec8) _mm_mullo_epi16(vec,a); }
451
452		/* Shift Logical Operators */
453		Is16vec8 operator<<(const M128 &a) { return _mm_sll_epi16(vec,a); }
454		Is16vec8 operator<<(int count) { return _mm_slli_epi16(vec,count); }
455		Is16vec8& operator<<=(const M128 &a) { return *this = (Is16vec8)_mm_sll_epi16(vec,a); }
456		Is16vec8& operator<<=(int count) { return *this = (Is16vec8)_mm_slli_epi16(vec,count); }
457		/* Shift Arithmetic Operators */
458		Is16vec8 operator>>(const M128 &a) { return _mm_sra_epi16(vec,a); }
459		Is16vec8 operator>>(int count) { return _mm_srai_epi16(vec,count); }
460		Is16vec8& operator>>=(const M128 &a) { return *this = (Is16vec8)_mm_sra_epi16(vec,a); }
461		Is16vec8& operator>>=(int count) { return *this = (Is16vec8)_mm_srai_epi16(vec,count); }
462
463		#if defined(_ENABLE_VEC_DEBUG)
464		/* Output for Debug */
465		friend std::ostream& operator<< (std::ostream &os, const Is16vec8 &a)
466		{
467		os << "[7]:" << _MM_8W(7,a)
468		<< " [6]:" << _MM_8W(6,a)
469		<< " [5]:" << _MM_8W(5,a)
470		<< " [4]:" << _MM_8W(4,a)
471		<< " [3]:" << _MM_8W(3,a)
472		<< " [2]:" << _MM_8W(2,a)
473		<< " [1]:" << _MM_8W(1,a)
474		<< " [0]:" << _MM_8W(0,a);
475		return os;
476		}
477		#endif
478
479		/* Element Access for Debug, No data modified */
480		const signed short& operator[](int i)const
481		{
482		assert(static_cast<unsigned int>(i) < 8); /* Only 8 elements to access */
483		return _MM_8W(i,vec);
484		}
485
486		/* Element Access and Assignment for Debug */
487		signed short& operator[](int i)
488		{
489		assert(static_cast<unsigned int>(i) < 8); /* Only 8 elements to access */
490		return _MM_8W(i,vec);
491		}
492		};
493
494		inline Is16vec8 operator*(const Is16vec8 &a, const Is16vec8 &b) { return _mm_mullo_epi16(a,b); }
495
496
497		/* Additional Is16vec8 functions: compares, unpacks, sat add/sub */
498		inline Is16vec8 cmpeq(const Is16vec8 &a, const Is16vec8 &b) { return _mm_cmpeq_epi16(a,b); }
499		inline Is16vec8 cmpneq(const Is16vec8 &a, const Is16vec8 &b) { return _mm_andnot_si128(_mm_cmpeq_epi16(a,b), get_mask128()); }
500		inline Is16vec8 cmpgt(const Is16vec8 &a, const Is16vec8 &b) { return _mm_cmpgt_epi16(a,b); }
501		inline Is16vec8 cmplt(const Is16vec8 &a, const Is16vec8 &b) { return _mm_cmpgt_epi16(b,a); }
502
503		inline Is16vec8 unpack_low(const Is16vec8 &a, const Is16vec8 &b) { return _mm_unpacklo_epi16(a,b); }
504		inline Is16vec8 unpack_high(const Is16vec8 &a, const Is16vec8 &b) { return _mm_unpackhi_epi16(a,b); }
505
506		inline Is16vec8 mul_high(const Is16vec8 &a, const Is16vec8 &b) { return _mm_mulhi_epi16(a,b); }
507		inline Is32vec4 mul_add(const Is16vec8 &a, const Is16vec8 &b) { return _mm_madd_epi16(a,b);}
508
509		inline Is16vec8 sat_add(const Is16vec8 &a, const Is16vec8 &b) { return _mm_adds_epi16(a,b); }
510		inline Is16vec8 sat_sub(const Is16vec8 &a, const Is16vec8 &b) { return _mm_subs_epi16(a,b); }
511
512		inline Is16vec8 simd_max(const Is16vec8 &a, const Is16vec8 &b) { return _mm_max_epi16(a,b); }
513		inline Is16vec8 simd_min(const Is16vec8 &a, const Is16vec8 &b) { return _mm_min_epi16(a,b); }
514
515
516		/* Iu16vec8 Class:
517		* 8 elements, each element unsigned short
518		*/
519		class Iu16vec8 : public I16vec8
520		{
521		public:
522		Iu16vec8() { }
523		Iu16vec8(__m128i mm) : I16vec8(mm) { }
524		Iu16vec8(unsigned short s7,unsigned short s6,unsigned short s5,unsigned short s4, unsigned short s3,unsigned short s2,unsigned short s1,unsigned short s0)
525		{
526		_MM_8UW(0,vec) = s0;
527		_MM_8UW(1,vec) = s1;
528		_MM_8UW(2,vec) = s2;
529		_MM_8UW(3,vec) = s3;
530		_MM_8UW(4,vec) = s4;
531		_MM_8UW(5,vec) = s5;
532		_MM_8UW(6,vec) = s6;
533		_MM_8UW(7,vec) = s7;
534		}
535
536		/* Assignment Operator */
537		Iu16vec8& operator= (const M128 &a) { return *this = (Iu16vec8) a; }
538		/* Logical Assignment Operators */
539		Iu16vec8& operator&=(const M128 &a) { return *this = (Iu16vec8) _mm_and_si128(vec,a); }
540		Iu16vec8& operator\|=(const M128 &a) { return *this = (Iu16vec8) _mm_or_si128(vec,a); }
541		Iu16vec8& operator^=(const M128 &a) { return *this = (Iu16vec8) _mm_xor_si128(vec,a); }
542		/* Addition & Subtraction Assignment Operators */
543		Iu16vec8& operator +=(const I16vec8 &a) { return *this = (Iu16vec8) _mm_add_epi16(vec,a); }
544		Iu16vec8& operator -=(const I16vec8 &a) { return *this = (Iu16vec8) _mm_sub_epi16(vec,a); }
545		Iu16vec8& operator =(const I16vec8 &a) { return this = (Iu16vec8) _mm_mullo_epi16(vec,a); }
546
547		/* Shift Logical Operators */
548		Iu16vec8 operator<<(const M128 &a) { return _mm_sll_epi16(vec,a); }
549		Iu16vec8 operator<<(int count) { return _mm_slli_epi16(vec,count); }
550		Iu16vec8& operator<<=(const M128 &a) { return *this = (Iu16vec8)_mm_sll_epi16(vec,a); }
551		Iu16vec8& operator<<=(int count) { return *this = (Iu16vec8)_mm_slli_epi16(vec,count); }
552		Iu16vec8 operator>>(const M128 &a) { return _mm_srl_epi16(vec,a); }
553		Iu16vec8 operator>>(int count) { return _mm_srli_epi16(vec,count); }
554		Iu16vec8& operator>>=(const M128 &a) { return *this = (Iu16vec8) _mm_srl_epi16(vec,a); }
555		Iu16vec8& operator>>=(int count) { return *this = (Iu16vec8) _mm_srli_epi16(vec,count); }
556
557
558		#if defined(_ENABLE_VEC_DEBUG)
559		/* Output for Debug */
560		friend std::ostream& operator << (std::ostream &os, const Iu16vec8 &a)
561		{
562		os << "[7]:" << unsigned short(_MM_8UW(7,a))
563		<< " [6]:" << unsigned short(_MM_8UW(6,a))
564		<< " [5]:" << unsigned short(_MM_8UW(5,a))
565		<< " [4]:" << unsigned short(_MM_8UW(4,a))
566		<< " [3]:" << unsigned short(_MM_8UW(3,a))
567		<< " [2]:" << unsigned short(_MM_8UW(2,a))
568		<< " [1]:" << unsigned short(_MM_8UW(1,a))
569		<< " [0]:" << unsigned short(_MM_8UW(0,a));
570		return os;
571		}
572		#endif
573
574		/* Element Access for Debug, No data modified */
575		const unsigned short& operator[](int i)const
576		{
577		assert(static_cast<unsigned int>(i) < 8); /* Only 8 elements to access */
578		return _MM_8UW(i,vec);
579		}
580
581		/* Element Access for Debug */
582		unsigned short& operator[](int i)
583		{
584		assert(static_cast<unsigned int>(i) < 8); /* Only 8 elements to access */
585		return _MM_8UW(i,vec);
586		}
587		};
588
589		inline Iu16vec8 operator*(const Iu16vec8 &a, const Iu16vec8 &b) { return _mm_mullo_epi16(a,b); }
590
591		/* Additional Iu16vec8 functions: cmpeq,cmpneq, unpacks, sat add/sub */
592		inline Iu16vec8 cmpeq(const Iu16vec8 &a, const Iu16vec8 &b) { return _mm_cmpeq_epi16(a,b); }
593		inline Iu16vec8 cmpneq(const Iu16vec8 &a, const Iu16vec8 &b) { return _mm_andnot_si128(_mm_cmpeq_epi16(a,b), get_mask128()); }
594
595		inline Iu16vec8 unpack_low(const Iu16vec8 &a, const Iu16vec8 &b) { return _mm_unpacklo_epi16(a,b); }
596		inline Iu16vec8 unpack_high(const Iu16vec8 &a, const Iu16vec8 &b) { return _mm_unpackhi_epi16(a,b); }
597
598		inline Iu16vec8 sat_add(const Iu16vec8 &a, const Iu16vec8 &b) { return _mm_adds_epu16(a,b); }
599		inline Iu16vec8 sat_sub(const Iu16vec8 &a, const Iu16vec8 &b) { return _mm_subs_epu16(a,b); }
600
601		inline Iu16vec8 simd_avg(const Iu16vec8 &a, const Iu16vec8 &b) { return _mm_avg_epu16(a,b); }
602		inline I16vec8 mul_high(const Iu16vec8 &a, const Iu16vec8 &b) { return _mm_mulhi_epu16(a,b); }
603
604		/* I8vec16 Class:
605		* 16 elements, each element either unsigned or signed char
606		*/
607		class I8vec16 : public M128
608		{
609		public:
610		I8vec16() { }
611		I8vec16(__m128i mm) : M128(mm) { }
612
613		/* Assignment Operator */
614		I8vec16& operator= (const M128 &a) { return *this = (I8vec16) a; }
615
616		/* Logical Assignment Operators */
617		I8vec16& operator&=(const M128 &a) { return *this = (I8vec16) _mm_and_si128(vec,a); }
618		I8vec16& operator\|=(const M128 &a) { return *this = (I8vec16) _mm_or_si128(vec,a); }
619		I8vec16& operator^=(const M128 &a) { return *this = (I8vec16) _mm_xor_si128(vec,a); }
620
621		/* Addition & Subtraction Assignment Operators */
622		I8vec16& operator +=(const I8vec16 &a) { return *this = (I8vec16) _mm_add_epi8(vec,a); }
623		I8vec16& operator -=(const I8vec16 &a) { return *this = (I8vec16) _mm_sub_epi8(vec,a); }
624
625		};
626
627		inline I8vec16 cmpeq(const I8vec16 &a, const I8vec16 &b) { return _mm_cmpeq_epi8(a,b); }
628		inline I8vec16 cmpneq(const I8vec16 &a, const I8vec16 &b) { return _mm_andnot_si128(_mm_cmpeq_epi8(a,b), get_mask128()); }
629
630		inline I8vec16 unpack_low(const I8vec16 &a, const I8vec16 &b) { return _mm_unpacklo_epi8(a,b); }
631		inline I8vec16 unpack_high(const I8vec16 &a, const I8vec16 &b) { return _mm_unpackhi_epi8(a,b); }
632
633		/* Is8vec16 Class:
634		* 16 elements, each element a signed char
635		*/
636		class Is8vec16 : public I8vec16
637		{
638		public:
639		Is8vec16() { }
640		Is8vec16(__m128i mm) : I8vec16(mm) { }
641
642		/* Assignment Operator */
643		Is8vec16& operator= (const M128 &a) { return *this = (Is8vec16) a; }
644
645		/* Logical Assignment Operators */
646		Is8vec16& operator&=(const M128 &a) { return *this = (Is8vec16) _mm_and_si128(vec,a); }
647		Is8vec16& operator\|=(const M128 &a) { return *this = (Is8vec16) _mm_or_si128(vec,a); }
648		Is8vec16& operator^=(const M128 &a) { return *this = (Is8vec16) _mm_xor_si128(vec,a); }
649
650		/* Addition & Subtraction Assignment Operators */
651		Is8vec16& operator +=(const I8vec16 &a) { return *this = (Is8vec16) _mm_add_epi8(vec,a); }
652		Is8vec16& operator -=(const I8vec16 &a) { return *this = (Is8vec16) _mm_sub_epi8(vec,a); }
653
654		#if defined(_ENABLE_VEC_DEBUG)
655		/* Output for Debug */
656		friend std::ostream& operator << (std::ostream &os, const Is8vec16 &a)
657		{
658		os << "[15]:" << short(_MM_16B(15,a))
659		<< " [14]:" << short(_MM_16B(14,a))
660		<< " [13]:" << short(_MM_16B(13,a))
661		<< " [12]:" << short(_MM_16B(12,a))
662		<< " [11]:" << short(_MM_16B(11,a))
663		<< " [10]:" << short(_MM_16B(10,a))
664		<< " [9]:" << short(_MM_16B(9,a))
665		<< " [8]:" << short(_MM_16B(8,a))
666		<< " [7]:" << short(_MM_16B(7,a))
667		<< " [6]:" << short(_MM_16B(6,a))
668		<< " [5]:" << short(_MM_16B(5,a))
669		<< " [4]:" << short(_MM_16B(4,a))
670		<< " [3]:" << short(_MM_16B(3,a))
671		<< " [2]:" << short(_MM_16B(2,a))
672		<< " [1]:" << short(_MM_16B(1,a))
673		<< " [0]:" << short(_MM_16B(0,a));
674		return os;
675		}
676		#endif
677
678		/* Element Access for Debug, No data modified */
679		const signed char& operator[](int i)const
680		{
681		assert(static_cast<unsigned int>(i) < 16); /* Only 16 elements to access */
682		return _MM_16B(i,vec);
683		}
684
685		/* Element Access for Debug */
686		signed char& operator[](int i)
687		{
688		assert(static_cast<unsigned int>(i) < 16); /* Only 16 elements to access */
689		return _MM_16B(i,vec);
690		}
691
692		};
693
694		inline Is8vec16 cmpeq(const Is8vec16 &a, const Is8vec16 &b) { return _mm_cmpeq_epi8(a,b); }
695		inline Is8vec16 cmpneq(const Is8vec16 &a, const Is8vec16 &b) { return _mm_andnot_si128(_mm_cmpeq_epi8(a,b), get_mask128()); }
696		inline Is8vec16 cmpgt(const Is8vec16 &a, const Is8vec16 &b) { return _mm_cmpgt_epi8(a,b); }
697		inline Is8vec16 cmplt(const Is8vec16 &a, const Is8vec16 &b) { return _mm_cmplt_epi8(a,b); }
698
699		inline Is8vec16 unpack_low(const Is8vec16 &a, const Is8vec16 &b) { return _mm_unpacklo_epi8(a,b); }
700		inline Is8vec16 unpack_high(const Is8vec16 &a, const Is8vec16 &b) { return _mm_unpackhi_epi8(a,b); }
701
702		inline Is8vec16 sat_add(const Is8vec16 &a, const Is8vec16 &b) { return _mm_adds_epi8(a,b); }
703		inline Is8vec16 sat_sub(const Is8vec16 &a, const Is8vec16 &b) { return _mm_subs_epi8(a,b); }
704
705		/* Iu8vec16 Class:
706		* 16 elements, each element a unsigned char
707		*/
708		class Iu8vec16 : public I8vec16
709		{
710		public:
711		Iu8vec16() { }
712		Iu8vec16(__m128i mm) : I8vec16(mm) { }
713
714		/* Assignment Operator */
715		Iu8vec16& operator= (const M128 &a) { return *this = (Iu8vec16) a; }
716
717		/* Logical Assignment Operators */
718		Iu8vec16& operator&=(const M128 &a) { return *this = (Iu8vec16) _mm_and_si128(vec,a); }
719		Iu8vec16& operator\|=(const M128 &a) { return *this = (Iu8vec16) _mm_or_si128(vec,a); }
720		Iu8vec16& operator^=(const M128 &a) { return *this = (Iu8vec16) _mm_xor_si128(vec,a); }
721
722		/* Addition & Subtraction Assignment Operators */
723		Iu8vec16& operator +=(const I8vec16 &a) { return *this = (Iu8vec16) _mm_add_epi8(vec,a); }
724		Iu8vec16& operator -=(const I8vec16 &a) { return *this = (Iu8vec16) _mm_sub_epi8(vec,a); }
725
726		#if defined(_ENABLE_VEC_DEBUG)
727		/* Output for Debug */
728		friend std::ostream& operator << (std::ostream &os, const Iu8vec16 &a)
729		{
730		os << "[15]:" << unsigned short(_MM_16UB(15,a))
731		<< " [14]:" << unsigned short(_MM_16UB(14,a))
732		<< " [13]:" << unsigned short(_MM_16UB(13,a))
733		<< " [12]:" << unsigned short(_MM_16UB(12,a))
734		<< " [11]:" << unsigned short(_MM_16UB(11,a))
735		<< " [10]:" << unsigned short(_MM_16UB(10,a))
736		<< " [9]:" << unsigned short(_MM_16UB(9,a))
737		<< " [8]:" << unsigned short(_MM_16UB(8,a))
738		<< " [7]:" << unsigned short(_MM_16UB(7,a))
739		<< " [6]:" << unsigned short(_MM_16UB(6,a))
740		<< " [5]:" << unsigned short(_MM_16UB(5,a))
741		<< " [4]:" << unsigned short(_MM_16UB(4,a))
742		<< " [3]:" << unsigned short(_MM_16UB(3,a))
743		<< " [2]:" << unsigned short(_MM_16UB(2,a))
744		<< " [1]:" << unsigned short(_MM_16UB(1,a))
745		<< " [0]:" << unsigned short(_MM_16UB(0,a));
746		return os;
747		}
748		#endif
749
750		/* Element Access for Debug, No data modified */
751		const unsigned char& operator[](int i)const
752		{
753		assert(static_cast<unsigned int>(i) < 16); /* Only 16 elements to access */
754		return _MM_16UB(i,vec);
755		}
756
757		/* Element Access for Debug */
758		unsigned char& operator[](int i)
759		{
760		assert(static_cast<unsigned int>(i) < 16); /* Only 16 elements to access */
761		return _MM_16UB(i,vec);
762		}
763
764		};
765
766		inline Iu8vec16 cmpeq(const Iu8vec16 &a, const Iu8vec16 &b) { return _mm_cmpeq_epi8(a,b); }
767		inline Iu8vec16 cmpneq(const Iu8vec16 &a, const Iu8vec16 &b) { return _mm_andnot_si128(_mm_cmpeq_epi8(a,b), get_mask128()); }
768
769		inline Iu8vec16 unpack_low(const Iu8vec16 &a, const Iu8vec16 &b) { return _mm_unpacklo_epi8(a,b); }
770		inline Iu8vec16 unpack_high(const Iu8vec16 &a, const Iu8vec16 &b) { return _mm_unpackhi_epi8(a,b); }
771
772		inline Iu8vec16 sat_add(const Iu8vec16 &a, const Iu8vec16 &b) { return _mm_adds_epu8(a,b); }
773		inline Iu8vec16 sat_sub(const Iu8vec16 &a, const Iu8vec16 &b) { return _mm_subs_epu8(a,b); }
774
775		inline I64vec2 sum_abs(const Iu8vec16 &a, const Iu8vec16 &b) { return _mm_sad_epu8(a,b); }
776
777		inline Iu8vec16 simd_avg(const Iu8vec16 &a, const Iu8vec16 &b) { return _mm_avg_epu8(a,b); }
778		inline Iu8vec16 simd_max(const Iu8vec16 &a, const Iu8vec16 &b) { return _mm_max_epu8(a,b); }
779		inline Iu8vec16 simd_min(const Iu8vec16 &a, const Iu8vec16 &b) { return _mm_min_epu8(a,b); }
780
781		/* Pack & Saturates */
782
783		inline Is16vec8 pack_sat(const Is32vec4 &a, const Is32vec4 &b) { return _mm_packs_epi32(a,b); }
784		inline Is8vec16 pack_sat(const Is16vec8 &a, const Is16vec8 &b) { return _mm_packs_epi16(a,b); }
785		inline Iu8vec16 packu_sat(const Is16vec8 &a, const Is16vec8 &b) { return _mm_packus_epi16(a,b);}
786
787		/******************************* Logicals **************************************/
788		#define IVEC128_LOGICALS(vect,element) \
789		inline I##vect##vec##element operator& (const I##vect##vec##element &a, const I##vect##vec##element &b) \
790		{ return _mm_and_si128( a,b); } \
791		inline I##vect##vec##element operator\| (const I##vect##vec##element &a, const I##vect##vec##element &b) \
792		{ return _mm_or_si128( a,b); } \
793		inline I##vect##vec##element operator^ (const I##vect##vec##element &a, const I##vect##vec##element &b) \
794		{ return _mm_xor_si128( a,b); } \
795		inline I##vect##vec##element andnot (const I##vect##vec##element &a, const I##vect##vec##element &b) \
796		{ return _mm_andnot_si128( a,b); }
797
798		IVEC128_LOGICALS(8,16)
799		IVEC128_LOGICALS(u8,16)
800		IVEC128_LOGICALS(s8,16)
801		IVEC128_LOGICALS(16,8)
802		IVEC128_LOGICALS(u16,8)
803		IVEC128_LOGICALS(s16,8)
804		IVEC128_LOGICALS(32,4)
805		IVEC128_LOGICALS(u32,4)
806		IVEC128_LOGICALS(s32,4)
807		IVEC128_LOGICALS(64,2)
808		IVEC128_LOGICALS(128,1)
809		#undef IVEC128_LOGICALS
810
811		/******************************* Add & Sub **************************************/
812		#define IVEC128_ADD_SUB(vect,element,opsize) \
813		inline I##vect##vec##element operator+ (const I##vect##vec##element &a, const I##vect##vec##element &b) \
814		{ return _mm_add_##opsize( a,b); } \
815		inline I##vect##vec##element operator- (const I##vect##vec##element &a, const I##vect##vec##element &b) \
816		{ return _mm_sub_##opsize( a,b); }
817
818		IVEC128_ADD_SUB(8,16, epi8)
819		IVEC128_ADD_SUB(u8,16, epi8)
820		IVEC128_ADD_SUB(s8,16, epi8)
821		IVEC128_ADD_SUB(16,8, epi16)
822		IVEC128_ADD_SUB(u16,8, epi16)
823		IVEC128_ADD_SUB(s16,8, epi16)
824		IVEC128_ADD_SUB(32,4, epi32)
825		IVEC128_ADD_SUB(u32,4, epi32)
826		IVEC128_ADD_SUB(s32,4, epi32)
827		IVEC128_ADD_SUB(64,2, epi64)
828		#undef IVEC128_ADD_SUB
829
830		/******************************* Conditional Select **************************************/
831		/* version of: retval = (a OP b)? c : d; *
832		* Where OP is one of the possible comparision operators. *
833		* Example: r = select_eq(a,b,c,d); *
834		* if "member at position x of the vector a" == "member at position x of vector b" *
835		* assign the corresponding member in r from c, else assign from d. *
836		******************************* Conditional Select **************************************/
837
838		#define IVEC128_SELECT(vect12,vect34,element,selop,arg1,arg2) \
839		inline I##vect34##vec##element select_##selop (const I##vect12##vec##element &a, const I##vect12##vec##element &b, const I##vect34##vec##element &c, const I##vect34##vec##element &d) \
840		{ \
841		I##vect12##vec##element mask = cmp##selop(a,b); \
842		return( I##vect34##vec##element ((mask & arg1 ) \| I##vect12##vec##element ((_mm_andnot_si128(mask, arg2 ))))); \
843		}
844		IVEC128_SELECT(8,s8,16,eq,c,d)
845		IVEC128_SELECT(8,u8,16,eq,c,d)
846		IVEC128_SELECT(8,8,16,eq,c,d)
847		IVEC128_SELECT(8,s8,16,neq,c,d)
848		IVEC128_SELECT(8,u8,16,neq,c,d)

Lines 849 ... 858 are skipped.

859		IVEC128_SELECT(32,u32,4,eq,c,d)
860		IVEC128_SELECT(32,32,4,eq,c,d)
861		IVEC128_SELECT(32,s32,4,neq,c,d)
862		IVEC128_SELECT(32,u32,4,neq,c,d)
863		IVEC128_SELECT(32,32,4,neq,c,d)
864
865		IVEC128_SELECT(s8,s8,16,gt,c,d)
866		IVEC128_SELECT(s8,u8,16,gt,c,d)
867		IVEC128_SELECT(s8,8,16,gt,c,d)
868		IVEC128_SELECT(s8,s8,16,lt,c,d)
869		IVEC128_SELECT(s8,u8,16,lt,c,d)
870		IVEC128_SELECT(s8,8,16,lt,c,d)
871
872		IVEC128_SELECT(s16,s16,8,gt,c,d)
873		IVEC128_SELECT(s16,u16,8,gt,c,d)
874		IVEC128_SELECT(s16,16,8,gt,c,d)
875		IVEC128_SELECT(s16,s16,8,lt,c,d)
876		IVEC128_SELECT(s16,u16,8,lt,c,d)
877		IVEC128_SELECT(s16,16,8,lt,c,d)
878
879
880		#undef IVEC128_SELECT
881
882
883		class F64vec2
884		{
885		protected:
886		__m128d vec;
887		public:
888
889		/* Constructors: __m128d, 2 doubles */
890		F64vec2() {}
891
892		/* initialize 2 DP FP with __m128d data type */
893		F64vec2(__m128d m) { vec = m;}
894
895		/* initialize 2 DP FPs with 2 doubles */
896		F64vec2(double d1, double d0) { vec= _mm_set_pd(d1,d0); }
897
898		/* Explicitly initialize each of 2 DP FPs with same double */
899		EXPLICIT F64vec2(double d) { vec = _mm_set1_pd(d); }
900
901		/* Conversion functions */
902		operator __m128d() const { return vec; } /* Convert to __m128d */
903
904		/* Logical Operators */
905		friend F64vec2 operator &(const F64vec2 &a, const F64vec2 &b) { return _mm_and_pd(a,b); }
906		friend F64vec2 operator \|(const F64vec2 &a, const F64vec2 &b) { return _mm_or_pd(a,b); }
907		friend F64vec2 operator ^(const F64vec2 &a, const F64vec2 &b) { return _mm_xor_pd(a,b); }
908
909		/* Arithmetic Operators */
910		friend F64vec2 operator +(const F64vec2 &a, const F64vec2 &b) { return _mm_add_pd(a,b); }
911		friend F64vec2 operator -(const F64vec2 &a, const F64vec2 &b) { return _mm_sub_pd(a,b); }
912		friend F64vec2 operator *(const F64vec2 &a, const F64vec2 &b) { return _mm_mul_pd(a,b); }
913		friend F64vec2 operator /(const F64vec2 &a, const F64vec2 &b) { return _mm_div_pd(a,b); }
914
915		F64vec2& operator +=(F64vec2 &a) { return *this = _mm_add_pd(vec,a); }
916		F64vec2& operator -=(F64vec2 &a) { return *this = _mm_sub_pd(vec,a); }
917		F64vec2& operator =(F64vec2 &a) { return this = _mm_mul_pd(vec,a); }
918		F64vec2& operator /=(F64vec2 &a) { return *this = _mm_div_pd(vec,a); }
919		F64vec2& operator &=(F64vec2 &a) { return *this = _mm_and_pd(vec,a); }
920		F64vec2& operator \|=(F64vec2 &a) { return *this = _mm_or_pd(vec,a); }
921		F64vec2& operator ^=(F64vec2 &a) { return *this = _mm_xor_pd(vec,a); }
922
923		/* Horizontal Add */
924		friend double add_horizontal(F64vec2 &a)
925		{
926		F64vec2 ftemp = _mm_add_sd(a,_mm_shuffle_pd(a, a, 1));
927		return ftemp[0];
928		}
929
930		/* And Not */
931		friend F64vec2 andnot(const F64vec2 &a, const F64vec2 &b) { return _mm_andnot_pd(a,b); }
932
933		/* Square Root */
934		friend F64vec2 sqrt(const F64vec2 &a) { return _mm_sqrt_pd(a); }
935
936		/* Compares: Mask is returned */
937		/* Macros expand to all compare intrinsics. Example:
938		friend F64vec2 cmpeq(const F64vec2 &a, const F64vec2 &b)
939		{ return _mm_cmpeq_ps(a,b);} */
940		#define F64vec2_COMP(op) \
941		friend F64vec2 cmp##op (const F64vec2 &a, const F64vec2 &b) { return _mm_cmp##op##_pd(a,b); }
942		F64vec2_COMP(eq) /* expanded to cmpeq(a,b) */
943		F64vec2_COMP(lt) /* expanded to cmplt(a,b) */
944		F64vec2_COMP(le) /* expanded to cmple(a,b) */
945		F64vec2_COMP(gt) /* expanded to cmpgt(a,b) */
946		F64vec2_COMP(ge) /* expanded to cmpge(a,b) */
947		F64vec2_COMP(ngt) /* expanded to cmpngt(a,b) */
948		F64vec2_COMP(nge) /* expanded to cmpnge(a,b) */
949		F64vec2_COMP(neq) /* expanded to cmpneq(a,b) */
950		F64vec2_COMP(nlt) /* expanded to cmpnlt(a,b) */
951		F64vec2_COMP(nle) /* expanded to cmpnle(a,b) */
952		#undef F64vec2_COMP
953
954		/* Min and Max */
955		friend F64vec2 simd_min(const F64vec2 &a, const F64vec2 &b) { return _mm_min_pd(a,b); }
956		friend F64vec2 simd_max(const F64vec2 &a, const F64vec2 &b) { return _mm_max_pd(a,b); }
957
958		/* Compare lower DP FP values */
959		#define F64vec2_COMI(op) \
960		friend int comi##op (const F64vec2 &a, const F64vec2 &b) { return _mm_comi##op##_sd(a,b); }
961		F64vec2_COMI(eq) /* expanded to comieq(a,b) */
962		F64vec2_COMI(lt) /* expanded to comilt(a,b) */
963		F64vec2_COMI(le) /* expanded to comile(a,b) */
964		F64vec2_COMI(gt) /* expanded to comigt(a,b) */
965		F64vec2_COMI(ge) /* expanded to comige(a,b) */
966		F64vec2_COMI(neq) /* expanded to comineq(a,b) */
967		#undef F64vec2_COMI
968
969		/* Compare lower DP FP values */
970		#define F64vec2_UCOMI(op) \
971		friend int ucomi##op (const F64vec2 &a, const F64vec2 &b) { return _mm_ucomi##op##_sd(a,b); }
972		F64vec2_UCOMI(eq) /* expanded to ucomieq(a,b) */
973		F64vec2_UCOMI(lt) /* expanded to ucomilt(a,b) */
974		F64vec2_UCOMI(le) /* expanded to ucomile(a,b) */
975		F64vec2_UCOMI(gt) /* expanded to ucomigt(a,b) */
976		F64vec2_UCOMI(ge) /* expanded to ucomige(a,b) */
977		F64vec2_UCOMI(neq) /* expanded to ucomineq(a,b) */
978		#undef F64vec2_UCOMI
979
980		/* Debug Features */
981		#if defined(_ENABLE_VEC_DEBUG)
982		/* Output */
983		friend std::ostream & operator<<(std::ostream & os, const F64vec2 &a)
984		{
985		/* To use: cout << "Elements of F64vec2 fvec are: " << fvec; */
986		double dp = (double)&a;
987		os << " [1]:" << *(dp+1)
988		<< " [0]:" << *dp;
989		return os;
990		}
991		#endif
992		/* Element Access Only, no modifications to elements*/
993		const double& operator[](int i) const
994		{
995		/* Assert enabled only during debug /DDEBUG */
996		assert((0 <= i) && (i <= 1)); /* User should only access elements 0-1 */
997		double dp = (double)&vec;
998		return *(dp+i);
999		}
1000		/* Element Access and Modification*/
1001		double& operator[](int i)
1002		{
1003		/* Assert enabled only during debug /DDEBUG */
1004		assert((0 <= i) && (i <= 1)); /* User should only access elements 0-1 */
1005		double dp = (double)&vec;
1006		return *(dp+i);
1007		}
1008		};
1009
1010		/* Miscellaneous */
1011
1012		/* Interleave low order data elements of a and b into destination */
1013		inline F64vec2 unpack_low(const F64vec2 &a, const F64vec2 &b)
1014		{ return _mm_unpacklo_pd(a, b); }
1015
1016		/* Interleave high order data elements of a and b into target */
1017		inline F64vec2 unpack_high(const F64vec2 &a, const F64vec2 &b)
1018		{ return _mm_unpackhi_pd(a, b); }
1019
1020		/* Move Mask to Integer returns 4 bit mask formed of most significant bits of a */
1021		inline int move_mask(const F64vec2 &a)
1022		{ return _mm_movemask_pd(a);}
1023
1024		/* Data Motion Functions */
1025
1026		/* Load Unaligned loadu_pd: Unaligned */
1027		inline void loadu(F64vec2 &a, double *p)
1028		{ a = _mm_loadu_pd(p); }
1029
1030		/* Store Temporal storeu_pd: Unaligned */
1031		inline void storeu(double *p, const F64vec2 &a)
1032		{ _mm_storeu_pd(p, a); }
1033
1034		/* Cacheability Support */
1035
1036		/* Non-Temporal Store */
1037		inline void store_nta(double *p, F64vec2 &a)
1038		{ _mm_stream_pd(p,a);}
1039
1040		#define F64vec2_SELECT(op) \
1041		inline F64vec2 select_##op (const F64vec2 &a, const F64vec2 &b, const F64vec2 &c, const F64vec2 &d) \
1042		{ \
1043		F64vec2 mask = _mm_cmp##op##_pd(a,b); \
1044		return( (mask & c) \| F64vec2((_mm_andnot_pd(mask,d)))); \
1045		}
1046		F64vec2_SELECT(eq) /* generates select_eq(a,b) */
1047		F64vec2_SELECT(lt) /* generates select_lt(a,b) */
1048		F64vec2_SELECT(le) /* generates select_le(a,b) */
1049		F64vec2_SELECT(gt) /* generates select_gt(a,b) */
1050		F64vec2_SELECT(ge) /* generates select_ge(a,b) */
1051		F64vec2_SELECT(neq) /* generates select_neq(a,b) */
1052		F64vec2_SELECT(nlt) /* generates select_nlt(a,b) */
1053		F64vec2_SELECT(nle) /* generates select_nle(a,b) */
1054		#undef F64vec2_SELECT
1055
1056		/* Convert the lower DP FP value of a to a 32 bit signed integer using Truncate*/
1057		inline int F64vec2ToInt(const F64vec2 &a)
1058		{
1059
1060		return _mm_cvttsd_si32(a);
1061
1062		}
1063
1064		/* Convert the 4 SP FP values of a to DP FP values */
1065		inline F64vec2 F32vec4ToF64vec2(const F32vec4 &a)
1066		{
1067		return _mm_cvtps_pd(a);
1068		}
1069
1070		/* Convert the 2 DP FP values of a to SP FP values */
1071		inline F32vec4 F64vec2ToF32vec4(const F64vec2 &a)
1072		{
1073		return _mm_cvtpd_ps(a);
1074		}
1075
1076		/* Convert the signed int in b to a DP FP value. Upper DP FP value in a passed through */
1077		inline F64vec2 IntToF64vec2(const F64vec2 &a, int b)
1078		{
1079		return _mm_cvtsi32_sd(a,b);
1080		}
1081
1082		#pragma pack(pop) /* 16-B aligned */
1083
1084		#ifdef _MSC_VER
1085		#pragma pack(pop)
1086		#endif /* _MSC_VER */
1087
1088		#endif /* defined(_M_CEE_PURE) */
1089
1090		#endif /* RC_INVOKED */
1091		#endif /* _DVEC_H_INCLUDED */
1092