vectorize.h
/***
 * These C++ classes in flexSIMD.h enable performance and allow easy switching
 * between either SSE, AVX, FMA3 or FMA4.
 * The file flexSIMD.cpp has sample code that demonstrates some approaches for how to get speed
 * from FD kernels using these classes.
 * These classes are simpler than Intel's F32vec4 and F32vec8, and have no license restrictions.
 *
 * The kernels should achieve 8-15 GFlops on most modern cores in SSE mode. Unfortunately, I found no
 * advantage to using AVX or FMA on recent CPUs. This is very, very disappointing. I have performance
 * test results of this code on numerous CPUs in an accompanying file.
 *
 * Perhaps this code can be used as a basis for experts to show how to achieve better performance from new
 * hardware, especially heterogeneous hardware.
 *
 * These classes easily switch between either SSE, AVX, FMA3 or FMA4 float
 * operations for multiply & add. The need for efficiency strongly affects the class design.
 * We only use language features that are very predictable by the compiler, just a little
 * more than macros. This is why we can't use virtual functions and why the classes are fixed length.
 * We also seek to avoid temporary return variables from functions.
 * We could have used templates for these classes, but I don't trust templates.
 *
 * Note that SSE & AVX are most efficient when they operate on several groups at once.
 * A multiply may take 3-5 cycles to complete, but
 * the CPU can work on 3-5 multiplies at the same time, IF THEY ARE INDEPENDENT OF EACH OTHER. We can't work on too
 * many floating point values, because then we can't keep most data in the 16 ymm (AVX) registers.
 * I suggest you operate on 32 float values at once. The reason these classes only contain 16 floats
 * rather than 32 is that the SSE class would need too many __m128 objects, which would be inefficient
 * usage of registers. You should use 2 of these SIMD_CLASS16 in sequence. See example code at end. Note that
 * the compiler can reorder instructions to reduce waiting on instruction dependencies.
 *
 * Some CPUs are moving to 16-float (512-bit) SIMD instructions.
 * And 16 float values (64 bytes) is the cache line size used by most (all?) current CPUs.
 *
 * This code was created by Christof Stork. It is in the public domain, freely available, with no license.
 * You are free to copy and modify the code. If you use or copy much of the code, I request that you credit me.
 * If you make improvements, I'm interested in what you did. cstork@pcisys.net
 * Thank you!
 */
#define BLOCK_SIZE 16
#include <assert.h>
//#include <flexSIMD.h>


#ifndef __SSE__
#error You have an ancient non-SSE compiler, or you need to add compiler flags.
#endif


#ifdef __GNUC__
#warning flag __GNUC__ found. Using GNU compiler includes.
#include <xmmintrin.h>
//#include <x86intrin.h>

#ifdef __AVX__
#include <immintrin.h>   // __m256 and the AVX intrinsics need immintrin.h under GCC as well
#endif // AVX

#ifdef __FMA4__
#include <fma4intrin.h>
#endif

#else // not GNUC
#warning flag __GNUC__ not found. Using Intel compiler includes.

#include <xmmintrin.h>

#ifdef __AVX__
#include <immintrin.h>
#endif // AVX

#ifdef __FMA__
#define __FMA3__
#include <fmaintrin.h>
#endif


#endif // GNUC



#ifdef __AVX__
#warning AVX is supported by your compiler!
#else
#warning AVX is not supported by your compiler, or you need to add compiler flags!
#endif
#ifdef __FMA4__
#warning Compiler supports FMA4!
#endif
#if defined ( __FMA__ ) || defined ( __FMA3__ )
#warning Compiler supports FMA3!
#endif


#ifdef __FMA4__
#define SIMD_Float16 AVX_FMA4_Float16
#elif defined ( __FMA__ ) || defined ( __FMA3__ )
#define SIMD_Float16 AVX_FMA3_Float16
#elif defined __AVX__
#define SIMD_Float16 AVX_Float16
#else
#define SIMD_Float16 SSE_Float16
#endif

// This class hides the __m128 & __m256 objects.
// It is needed so the same code works with either the SSE or the AVX SIMD_Float16.
class SIMD_constant {
public:
    union {
#ifdef __AVX__
        __m256 _ymm;
#endif
        __m128 _xmm;
    };
};


// This is just an interface class to show how all the other classes work.
// We can't use virtual functions because that kills performance.
// We don't bother implementing any of the functions since they won't be used.
class Abstract_Float16 {

    static const char* description() {return "Abstract class for 16 floats. ";}

private:
    // private to make sure no one creates one of these.
    Abstract_Float16() // empty constructor
    {
    }
    Abstract_Float16( float* array); // initialize with an array of 16 floats.
                                     // The array needs to be aligned to 16 or 32 bytes.

    Abstract_Float16(Abstract_Float16& dum) // copy constructor
    {
    }
    Abstract_Float16(float c) // default all values to a constant, such as 0.0
    {
    }

public:

    inline void load( float* array);  // read in 16 float values
    inline void loadu( float* array); // read in 16 float values, unaligned

    inline void stream( float* array); // Efficient write bypassing cache. Memory is not read into cache,
                                       // which the cache would otherwise have to do to "own" the memory.
                                       // Use this if you don't need the output for a long time.

    inline void store( float* array);  // Write keeping the output in cache. Causes a memory read so the cache owns the memory.


    // These operators are easier to implement because they don't require a temporary.
    inline Abstract_Float16& operator*=( const Abstract_Float16& B);
    inline Abstract_Float16& operator*=( const float* Barray); // use directly with an aligned array of 16 floats
    inline Abstract_Float16& operator*=( const SIMD_constant& sConst ); // use a constant for all operations.

    inline Abstract_Float16& operator+=( const Abstract_Float16& B);
    inline Abstract_Float16& operator+=( const float* B);
    inline Abstract_Float16& operator+=( const SIMD_constant& sConst );

    inline Abstract_Float16& operator-=( const Abstract_Float16& B);
    inline Abstract_Float16& operator-=( const float* B);
    inline Abstract_Float16& operator-=( const SIMD_constant& sConst );

    // We are only bothering with multiply, add, and subtract in this class. You can expand to the other functions.

    // We can't use operator* because that creates an inefficient temporary variable.
    // Use like this:
    //   Abstract_Float16 r1, A, B;
    //   r1.mult( A, B ); // r1= A * B
    //   r1.add( A, B );  // r1= A + B
    //   r1.sub( A, B );  // r1= A - B
    inline Abstract_Float16& mult(const Abstract_Float16& A, const Abstract_Float16& B);
    inline Abstract_Float16& mult(const Abstract_Float16& A, const SIMD_constant& sConst );
    inline Abstract_Float16& mult(const Abstract_Float16& A, const float* B);

    inline Abstract_Float16& add(const Abstract_Float16& A, const Abstract_Float16& B);
    inline Abstract_Float16& add(const Abstract_Float16& A, const SIMD_constant& sConst );
    inline Abstract_Float16& add(const Abstract_Float16& A, const float* B);

    inline Abstract_Float16& sub(const Abstract_Float16& A, const Abstract_Float16& B);
    inline Abstract_Float16& sub(const Abstract_Float16& A, const SIMD_constant& sConst );
    inline Abstract_Float16& sub(const Abstract_Float16& A, const float* B);

    // New CPUs support efficient multiply-add instructions.
    // Use like this:
    //   Abstract_Float16 r1;
    //   r1.fMultAdd( B, C ); // r1= r1 + B * C
    inline Abstract_Float16& fMultAdd( const Abstract_Float16& mult1, const Abstract_Float16& mult2 );
    inline Abstract_Float16& fMultAdd( const Abstract_Float16& mult1, const SIMD_constant& mult2Const );
};


class SSE_Float16 {
    // perform basic SSE instructions on 16 floating point values

    // most recent CPUs have 16 xmm registers

    __m128 _xmm0;
    __m128 _xmm1;
    __m128 _xmm2;
    __m128 _xmm3;


public:

    static const char* description() {return "Basic SSE, 16 floats";}

    SSE_Float16() // empty constructor
    {
    }
    SSE_Float16( float* array) __attribute__((always_inline)) // initialize from a float array
    {
        assert( ((unsigned long)array) % 16 == 0 );
        _xmm0= _mm_load_ps( array );
        _xmm1= _mm_load_ps( array+ 4 );
        _xmm2= _mm_load_ps( array+ 2*4 );
        _xmm3= _mm_load_ps( array+ 3*4 );
    }
    SSE_Float16( const SSE_Float16& B) // copy constructor
    {
        _xmm0= B._xmm0 ;
        _xmm1= B._xmm1 ;
        _xmm2= B._xmm2 ;
        _xmm3= B._xmm3 ;
    }
    SSE_Float16( float constVal ) // set all 16 values to a constant
    {
        _xmm0= _mm_set1_ps( constVal );
        _xmm1= _xmm0;
        _xmm2= _xmm0;
        _xmm3= _xmm0;
    }

    static inline void setConstant( SIMD_constant& sConst, const float& constVal ) __attribute__((always_inline)) {
        sConst._xmm= _mm_set1_ps( constVal );
    }

    inline void stream( float* array) __attribute__((always_inline)) // write bypassing cache
    {
        assert( ((unsigned long)array) % 16 == 0 );
        _mm_stream_ps( array, _xmm0 );
        _mm_stream_ps( array+ 4, _xmm1 );
        _mm_stream_ps( array+ 2*4, _xmm2 );
        _mm_stream_ps( array+ 3*4, _xmm3 );
    }
    inline void store( float* array) // write through the cache
    {
        assert( ((unsigned long)array) % 16 == 0 );
        _mm_store_ps( array, _xmm0 );
        _mm_store_ps( array+ 4, _xmm1 );
        _mm_store_ps( array+ 2*4, _xmm2 );
        _mm_store_ps( array+ 3*4, _xmm3 );
    }
    inline void load( float* array) // aligned read of 16 floats
    {
        assert( ((unsigned long)array) % 16 == 0 );
        _xmm0= _mm_load_ps( array );
        _xmm1= _mm_load_ps( array+ 4 );
        _xmm2= _mm_load_ps( array+ 2*4 );
        _xmm3= _mm_load_ps( array+ 3*4 );
    }
    inline void loadu( float* array) // unaligned read of 16 floats
    {
        _xmm0= _mm_loadu_ps( array );
        _xmm1= _mm_loadu_ps( array+ 4 );
        _xmm2= _mm_loadu_ps( array+ 2*4 );
        _xmm3= _mm_loadu_ps( array+ 3*4 );
    }

    // The reason these don't call an operator* is that the compiler implements
    // a = _mm_mul_ps(b,c) by doing: a=b; a=_mm_mul_ps(a,c);
    // The underlying SSE mulps instruction is really a *= operation: it has to write its result over one of its inputs.
    // If we implemented *= by calling operator*, I'm not sure the compiler is smart enough to see that the copy isn't needed.
    inline SSE_Float16& operator*=( const SSE_Float16& B) __attribute__((always_inline))
    {
        _xmm0= _mm_mul_ps( _xmm0, B._xmm0 );
        _xmm1= _mm_mul_ps( _xmm1, B._xmm1 );
        _xmm2= _mm_mul_ps( _xmm2, B._xmm2 );
        _xmm3= _mm_mul_ps( _xmm3, B._xmm3 );

        return *this;
    }

    inline SSE_Float16& operator*=( const float* B)
    {
        assert( ((unsigned long)B) % 16 == 0 );
        _xmm0= _mm_mul_ps( _xmm0, _mm_load_ps(B+0*4) );
        _xmm1= _mm_mul_ps( _xmm1, _mm_load_ps(B+1*4) );
        _xmm2= _mm_mul_ps( _xmm2, _mm_load_ps(B+2*4) );
        _xmm3= _mm_mul_ps( _xmm3, _mm_load_ps(B+3*4) );

        return *this;
    }
    inline SSE_Float16& operator*=( const SIMD_constant& sConst )
    {
        _xmm0= _mm_mul_ps( _xmm0, sConst._xmm );
        _xmm1= _mm_mul_ps( _xmm1, sConst._xmm );
        _xmm2= _mm_mul_ps( _xmm2, sConst._xmm );
        _xmm3= _mm_mul_ps( _xmm3, sConst._xmm );

        return *this;
    }

    inline SSE_Float16& operator+=( const SSE_Float16& B)
    {
        _xmm0= _mm_add_ps( _xmm0, B._xmm0 );
        _xmm1= _mm_add_ps( _xmm1, B._xmm1 );
        _xmm2= _mm_add_ps( _xmm2, B._xmm2 );
        _xmm3= _mm_add_ps( _xmm3, B._xmm3 );

        return *this;
    }
    inline SSE_Float16& operator+=( const float* B) __attribute__((always_inline))
    {
        assert( ((unsigned long)B) % 16 == 0 );
        _xmm0= _mm_add_ps( _xmm0, _mm_load_ps(B+0*4) );
        _xmm1= _mm_add_ps( _xmm1, _mm_load_ps(B+1*4) );
        _xmm2= _mm_add_ps( _xmm2, _mm_load_ps(B+2*4) );
        _xmm3= _mm_add_ps( _xmm3, _mm_load_ps(B+3*4) );

        return *this;
    }
    inline SSE_Float16& operator+=( const SIMD_constant& sConst )
    {
        _xmm0= _mm_add_ps( _xmm0, sConst._xmm );
        _xmm1= _mm_add_ps( _xmm1, sConst._xmm );
        _xmm2= _mm_add_ps( _xmm2, sConst._xmm );
        _xmm3= _mm_add_ps( _xmm3, sConst._xmm );

        return *this;
    }

    inline SSE_Float16& operator-=( const SSE_Float16& B)
    {
        _xmm0= _mm_sub_ps( _xmm0, B._xmm0 );
        _xmm1= _mm_sub_ps( _xmm1, B._xmm1 );
        _xmm2= _mm_sub_ps( _xmm2, B._xmm2 );
        _xmm3= _mm_sub_ps( _xmm3, B._xmm3 );

        return *this;
    }
    inline SSE_Float16& operator-=( const float* B)
    {
        assert( ((unsigned long)B) % 16 == 0 );
        _xmm0= _mm_sub_ps( _xmm0, _mm_load_ps(B+0*4) );
        _xmm1= _mm_sub_ps( _xmm1, _mm_load_ps(B+1*4) );
        _xmm2= _mm_sub_ps( _xmm2, _mm_load_ps(B+2*4) );
        _xmm3= _mm_sub_ps( _xmm3, _mm_load_ps(B+3*4) );

        return *this;
    }
    inline SSE_Float16& operator-=( const SIMD_constant& sConst )
    {
        _xmm0= _mm_sub_ps( _xmm0, sConst._xmm );
        _xmm1= _mm_sub_ps( _xmm1, sConst._xmm );
        _xmm2= _mm_sub_ps( _xmm2, sConst._xmm );
        _xmm3= _mm_sub_ps( _xmm3, sConst._xmm );

        return *this;
    }

    // SSE_Float16 lhs; lhs.mult( A, B ); // lhs= A*B
    // We can't use operator* because that creates an inefficient temporary variable.
    inline SSE_Float16& mult(const SSE_Float16& A, const SSE_Float16& B)
    {
        _xmm0= _mm_mul_ps( A._xmm0, B._xmm0 );
        _xmm1= _mm_mul_ps( A._xmm1, B._xmm1 );
        _xmm2= _mm_mul_ps( A._xmm2, B._xmm2 );
        _xmm3= _mm_mul_ps( A._xmm3, B._xmm3 );

        return *this;
    }
    inline SSE_Float16& mult(const SSE_Float16& A, const SIMD_constant& sConst )
    {
        _xmm0= _mm_mul_ps( A._xmm0, sConst._xmm );
        _xmm1= _mm_mul_ps( A._xmm1, sConst._xmm );
        _xmm2= _mm_mul_ps( A._xmm2, sConst._xmm );
        _xmm3= _mm_mul_ps( A._xmm3, sConst._xmm );
        return *this;
    }
    inline SSE_Float16& mult(const SSE_Float16& A, const float* B)
    {
        assert( ((unsigned long)B) % 16 == 0 );
        _xmm0= _mm_mul_ps( A._xmm0, _mm_load_ps(B+0*4) );
        _xmm1= _mm_mul_ps( A._xmm1, _mm_load_ps(B+1*4) );
        _xmm2= _mm_mul_ps( A._xmm2, _mm_load_ps(B+2*4) );
        _xmm3= _mm_mul_ps( A._xmm3, _mm_load_ps(B+3*4) );

        return *this;
    }

    inline SSE_Float16& add(const SSE_Float16& A, const SSE_Float16& B)
    {
        _xmm0= _mm_add_ps( A._xmm0, B._xmm0 );
        _xmm1= _mm_add_ps( A._xmm1, B._xmm1 );
        _xmm2= _mm_add_ps( A._xmm2, B._xmm2 );
        _xmm3= _mm_add_ps( A._xmm3, B._xmm3 );

        return *this;
    }
    inline SSE_Float16& add(const SSE_Float16& A, const SIMD_constant& sConst )
    {
        _xmm0= _mm_add_ps( A._xmm0, sConst._xmm );
        _xmm1= _mm_add_ps( A._xmm1, sConst._xmm );
        _xmm2= _mm_add_ps( A._xmm2, sConst._xmm );
        _xmm3= _mm_add_ps( A._xmm3, sConst._xmm );

        return *this;
    }
    inline SSE_Float16& add(const SSE_Float16& A, const float* B)
    {
        assert( ((unsigned long)B) % 16 == 0 );
        _xmm0= _mm_add_ps( A._xmm0, _mm_load_ps(B+0*4) );
        _xmm1= _mm_add_ps( A._xmm1, _mm_load_ps(B+1*4) );
        _xmm2= _mm_add_ps( A._xmm2, _mm_load_ps(B+2*4) );
        _xmm3= _mm_add_ps( A._xmm3, _mm_load_ps(B+3*4) );

        return *this;
    }


    inline SSE_Float16& sub(const SSE_Float16& A, const SSE_Float16& B)
    {
        _xmm0= _mm_sub_ps( A._xmm0, B._xmm0 );
        _xmm1= _mm_sub_ps( A._xmm1, B._xmm1 );
        _xmm2= _mm_sub_ps( A._xmm2, B._xmm2 );
        _xmm3= _mm_sub_ps( A._xmm3, B._xmm3 );

        return *this;
    }
    inline SSE_Float16& sub(const SSE_Float16& A, const SIMD_constant& sConst )
    {
        _xmm0= _mm_sub_ps( A._xmm0, sConst._xmm );
        _xmm1= _mm_sub_ps( A._xmm1, sConst._xmm );
        _xmm2= _mm_sub_ps( A._xmm2, sConst._xmm );
        _xmm3= _mm_sub_ps( A._xmm3, sConst._xmm );

        return *this;
    }
    inline SSE_Float16& sub(const SSE_Float16& A, const float* B)
    {
        assert( ((unsigned long)B) % 16 == 0 );
        _xmm0= _mm_sub_ps( A._xmm0, _mm_load_ps(B+0*4) );
        _xmm1= _mm_sub_ps( A._xmm1, _mm_load_ps(B+1*4) );
        _xmm2= _mm_sub_ps( A._xmm2, _mm_load_ps(B+2*4) );
        _xmm3= _mm_sub_ps( A._xmm3, _mm_load_ps(B+3*4) );

        return *this;
    }

    inline SSE_Float16& fMultAdd( const SSE_Float16& mult1, const SSE_Float16& mult2 )
    {
        _xmm0= _mm_add_ps( _xmm0, _mm_mul_ps( mult1._xmm0, mult2._xmm0 ) );
        _xmm1= _mm_add_ps( _xmm1, _mm_mul_ps( mult1._xmm1, mult2._xmm1 ) );
        _xmm2= _mm_add_ps( _xmm2, _mm_mul_ps( mult1._xmm2, mult2._xmm2 ) );
        _xmm3= _mm_add_ps( _xmm3, _mm_mul_ps( mult1._xmm3, mult2._xmm3 ) );

        return *this;
    }
    inline SSE_Float16& fMultAdd( const SSE_Float16& mult1, const SIMD_constant& mult2Const )
    {
        _xmm0= _mm_add_ps( _xmm0, _mm_mul_ps( mult1._xmm0, mult2Const._xmm ) );
        _xmm1= _mm_add_ps( _xmm1, _mm_mul_ps( mult1._xmm1, mult2Const._xmm ) );
        _xmm2= _mm_add_ps( _xmm2, _mm_mul_ps( mult1._xmm2, mult2Const._xmm ) );
        _xmm3= _mm_add_ps( _xmm3, _mm_mul_ps( mult1._xmm3, mult2Const._xmm ) );

        return *this;
    }
};

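// A minimal usage sketch (an illustrative helper, not part of the library's sample
// code; the function name and arguments are assumptions): it shows the temporary-free
// calling convention documented above, using SSE_Float16 directly.
// All three arrays are assumed to be 16-byte aligned and to hold at least 16 floats.
static inline void example_sse_mul16( float* out, float* a, float* b )
{
    SSE_Float16 ra( a );   // load 16 floats from a into 4 xmm registers
    SSE_Float16 rb( b );   // load 16 floats from b
    SSE_Float16 r;         // result; no temporary object is returned anywhere

    r.mult( ra, rb );      // r = a * b
    r += b;                // r += b, reading the aligned array directly
    r.store( out );        // write the 16 results through the cache
}
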
#ifdef __AVX__

class AVX_Float16 {
protected:
    __m256 _ymm0;
    __m256 _ymm1;


public:
    int length() {return 16;}

    static const char* description() {return "Generic AVX, No FMA, 16 floats";}

    AVX_Float16( float* array) // initialize from a float array
    {
        assert( ((unsigned long)array) % 32 == 0 );
        _ymm0= _mm256_load_ps( array );
        _ymm1= _mm256_load_ps( array+ 8 );
    }
    AVX_Float16( const AVX_Float16& B) // copy constructor
    {
        _ymm0= B._ymm0 ;
        _ymm1= B._ymm1 ;
    }
    AVX_Float16() // empty constructor
    {
    }
    AVX_Float16( float constVal ) // set all 16 values to a constant
    {
        _ymm0= _mm256_set1_ps( constVal );
        _ymm1= _ymm0;
    }

    inline static void setConstant( SIMD_constant& sConst, const float& constVal ) {
        sConst._ymm= _mm256_set1_ps( constVal );
    }

    inline void stream( float* array) // write bypassing cache
    {
        assert( ((unsigned long)array) % 32 == 0 );
        _mm256_stream_ps( array, _ymm0 );
        _mm256_stream_ps( array+ 8, _ymm1 );
    }
    inline void store( float* array) // write through the cache
    {
        assert( ((unsigned long)array) % 32 == 0 );
        _mm256_store_ps( array, _ymm0 );
        _mm256_store_ps( array+ 8, _ymm1 );
    }
    inline void load( float* array) // aligned read of 16 floats
    {
        assert( ((unsigned long)array) % 32 == 0 );
        _ymm0= _mm256_load_ps( array );
        _ymm1= _mm256_load_ps( array+ 8 );
    }
    inline void loadu( float* array) // unaligned read of 16 floats
    {
        _ymm0= _mm256_loadu_ps( array );
        _ymm1= _mm256_loadu_ps( array+ 8 );
    }

    // The _mm256_mul_ps() can output to a different register.
    inline AVX_Float16& mult(const AVX_Float16& A, const AVX_Float16& B)
    {
        _ymm0= _mm256_mul_ps( A._ymm0, B._ymm0 );
        _ymm1= _mm256_mul_ps( A._ymm1, B._ymm1 );

        return *this;
    }
    inline AVX_Float16& mult(const AVX_Float16& A, const SIMD_constant& sConst )
    {
        _ymm0= _mm256_mul_ps( A._ymm0, sConst._ymm );
        _ymm1= _mm256_mul_ps( A._ymm1, sConst._ymm );

        return *this;
    }
    inline AVX_Float16& mult(const AVX_Float16& A, const float* B)
    {
        assert( ((unsigned long)B) % 32 == 0 );
        _ymm0= _mm256_mul_ps( A._ymm0, _mm256_load_ps(B+0*8) );
        _ymm1= _mm256_mul_ps( A._ymm1, _mm256_load_ps(B+1*8) );

        return *this;
    }

    inline AVX_Float16& add(const AVX_Float16& A, const AVX_Float16& B)
    {
        _ymm0= _mm256_add_ps( A._ymm0, B._ymm0 );
        _ymm1= _mm256_add_ps( A._ymm1, B._ymm1 );

        return *this;
    }
    inline AVX_Float16& add(const AVX_Float16& A, const SIMD_constant& sConst )
    {
        _ymm0= _mm256_add_ps( A._ymm0, sConst._ymm );
        _ymm1= _mm256_add_ps( A._ymm1, sConst._ymm );

        return *this;
    }
    inline AVX_Float16& add(const AVX_Float16& A, const float* B)
    {
        assert( ((unsigned long)B) % 32 == 0 );
        _ymm0= _mm256_add_ps( A._ymm0, _mm256_load_ps(B+0*8) );
        _ymm1= _mm256_add_ps( A._ymm1, _mm256_load_ps(B+1*8) );

        return *this;
    }


    inline AVX_Float16& sub(const AVX_Float16& A, const AVX_Float16& B)
    {
        _ymm0= _mm256_sub_ps( A._ymm0, B._ymm0 );
        _ymm1= _mm256_sub_ps( A._ymm1, B._ymm1 );

        return *this;
    }
    inline AVX_Float16& sub(const AVX_Float16& A, const SIMD_constant& sConst )
    {
        _ymm0= _mm256_sub_ps( A._ymm0, sConst._ymm );
        _ymm1= _mm256_sub_ps( A._ymm1, sConst._ymm );

        return *this;
    }
    inline AVX_Float16& sub(const AVX_Float16& A, const float* B)
    {
        assert( ((unsigned long)B) % 32 == 0 );
        _ymm0= _mm256_sub_ps( A._ymm0, _mm256_load_ps(B+0*8) );
        _ymm1= _mm256_sub_ps( A._ymm1, _mm256_load_ps(B+1*8) );

        return *this;
    }

    // For AVX, operator*= can simply call the three-argument mult() above.
    inline AVX_Float16& operator*=( const AVX_Float16& B)
    {
        return this->mult( *this , B);
    }

    inline AVX_Float16& operator*=( const float* B)
    {
        assert( ((unsigned long)B) % 32 == 0 );
        return this->mult( *this , B);
    }
    inline AVX_Float16& operator*=( const SIMD_constant& sConst )
    {
        return this->mult( *this , sConst);
    }

    inline AVX_Float16& operator+=( const AVX_Float16& B)
    {
        _ymm0= _mm256_add_ps( _ymm0, B._ymm0 );
        _ymm1= _mm256_add_ps( _ymm1, B._ymm1 );

        return *this;
    }
    inline AVX_Float16& operator+=( const float* B)
    {
        assert( ((unsigned long)B) % 32 == 0 );
        _ymm0= _mm256_add_ps( _ymm0, _mm256_load_ps(B+0*8) );
        _ymm1= _mm256_add_ps( _ymm1, _mm256_load_ps(B+1*8) );

        return *this;
    }
    inline AVX_Float16& operator+=( const SIMD_constant& sConst )
    {
        _ymm0= _mm256_add_ps( _ymm0, sConst._ymm );
        _ymm1= _mm256_add_ps( _ymm1, sConst._ymm );

        return *this;
    }

    inline AVX_Float16& operator-=( const AVX_Float16& B)
    {
        _ymm0= _mm256_sub_ps( _ymm0, B._ymm0 );
        _ymm1= _mm256_sub_ps( _ymm1, B._ymm1 );

        return *this;
    }
    inline AVX_Float16& operator-=( const float* B)
    {
        assert( ((unsigned long)B) % 32 == 0 );
        _ymm0= _mm256_sub_ps( _ymm0, _mm256_load_ps(B+0*8) );
        _ymm1= _mm256_sub_ps( _ymm1, _mm256_load_ps(B+1*8) );

        return *this;
    }
    inline AVX_Float16& operator-=( const SIMD_constant& sConst )
    {
        _ymm0= _mm256_sub_ps( _ymm0, sConst._ymm );
        _ymm1= _mm256_sub_ps( _ymm1, sConst._ymm );

        return *this;
    }


    inline AVX_Float16& fMultAdd( const AVX_Float16& mult1, const AVX_Float16& mult2 )
    {
        _ymm0= _mm256_add_ps( _ymm0, _mm256_mul_ps( mult1._ymm0, mult2._ymm0 ) );
        _ymm1= _mm256_add_ps( _ymm1, _mm256_mul_ps( mult1._ymm1, mult2._ymm1 ) );
        return *this;
    }
    inline AVX_Float16& fMultAdd( const AVX_Float16& mult1, const SIMD_constant& sConst )
    {
        _ymm0= _mm256_add_ps( _ymm0, _mm256_mul_ps( mult1._ymm0, sConst._ymm ) );
        _ymm1= _mm256_add_ps( _ymm1, _mm256_mul_ps( mult1._ymm1, sConst._ymm ) );
        return *this;
    }

};
#endif // __AVX__

#ifdef __FMA4__
class AVX_FMA4_Float16 : public AVX_Float16 {
public:
    static const char* description() {return "AVX, AMD FMA, 16 floats";}

    AVX_FMA4_Float16( float* array) : AVX_Float16(array) {}
    AVX_FMA4_Float16( float c) : AVX_Float16(c) {}
    AVX_FMA4_Float16( const AVX_FMA4_Float16& B) // copy constructor
    {
        _ymm0= B._ymm0 ;
        _ymm1= B._ymm1 ;
    }
    AVX_FMA4_Float16() // empty constructor
    {
    }

    inline AVX_FMA4_Float16& fMultAdd( const AVX_FMA4_Float16& mult1, const AVX_FMA4_Float16& mult2 )
    {
        _ymm0= _mm256_macc_ps( mult1._ymm0, mult2._ymm0, _ymm0 );
        _ymm1= _mm256_macc_ps( mult1._ymm1, mult2._ymm1, _ymm1 );
        return *this;
    }
    inline AVX_FMA4_Float16& fMultAdd( const AVX_FMA4_Float16& mult1, const SIMD_constant& sConst )
    {
        _ymm0= _mm256_macc_ps( mult1._ymm0, sConst._ymm, _ymm0 );
        _ymm1= _mm256_macc_ps( mult1._ymm1, sConst._ymm, _ymm1 );
        return *this;
    }
};
#endif

#ifdef __FMA__
class AVX_FMA3_Float16 : public AVX_Float16 {
public:
    static const char* description() {return "AVX, Intel FMA, 16 floats";}

    AVX_FMA3_Float16( float* array) : AVX_Float16(array) {}
    AVX_FMA3_Float16( float c) : AVX_Float16(c) {}
    AVX_FMA3_Float16( const AVX_FMA3_Float16& B) // copy constructor
    {
        _ymm0= B._ymm0 ;
        _ymm1= B._ymm1 ;
    }
    AVX_FMA3_Float16() // empty constructor
    {
    }


    inline AVX_FMA3_Float16& fMultAdd( const AVX_FMA3_Float16& mult1, const AVX_FMA3_Float16& mult2 )
    {
        _ymm0= _mm256_fmadd_ps( mult1._ymm0, mult2._ymm0, _ymm0 );
        _ymm1= _mm256_fmadd_ps( mult1._ymm1, mult2._ymm1, _ymm1 );
        return *this;
    }
    inline AVX_FMA3_Float16& fMultAdd( const AVX_FMA3_Float16& mult1, const SIMD_constant& sConst )
    {
        _ymm0= _mm256_fmadd_ps( mult1._ymm0, sConst._ymm, _ymm0 );
        _ymm1= _mm256_fmadd_ps( mult1._ymm1, sConst._ymm, _ymm1 );
        return *this;
    }
};
#endif
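
// A sketch of the 32-floats-at-a-time pattern recommended in the header comment
// (a hypothetical helper, not taken from the flexSIMD.cpp sample code): two
// SIMD_Float16 blocks are processed back to back so their multiply/add chains are
// independent and the CPU can overlap them. out and b are assumed to be 32-byte
// aligned (16 bytes suffices in SSE mode) and to hold at least 32 floats.
static inline void example_axpy_32( float* out, float* b, float scale )
{
    SIMD_constant sc;
    SIMD_Float16::setConstant( sc, scale );       // broadcast the scalar once

    SIMD_Float16 r0( out ), r1( out + 16 );       // current output, two blocks of 16
    SIMD_Float16 b0( b ),   b1( b + 16 );         // input, two blocks of 16

    r0.fMultAdd( b0, sc );                        // out[0..15]  += scale * b[0..15]
    r1.fMultAdd( b1, sc );                        // out[16..31] += scale * b[16..31]

    r0.stream( out );                             // write back, bypassing the cache
    r1.stream( out + 16 );
}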

static void cpuid( int result[4], int idnum) {
#ifdef __INTEL_COMPILER
    __cpuid(result, idnum );
#else // __INTEL_COMPILER
    int i0, i1, i2, i3;
    __asm("cpuid": "=a"(i0), "=b"(i1), "=c"(i2), "=d"(i3) : "a"(idnum), "c"(0) : );
    result[0]= i0;
    result[1]= i1;
    result[2]= i2;
    result[3]= i3;
#endif // __INTEL_COMPILER
}

// This is needed because the CPU may support AVX, but the OS may not.
static long xgetbv(int iarg) {
#ifndef __AVX__
    return 0;
#else
#ifdef __INTEL_COMPILER
    int i0;
    i0= _xgetbv(iarg);
    return i0;
#else // __INTEL_COMPILER
    int i0, i1;
    __asm("xgetbv": "=a"(i0), "=d"(i1) : "c"(iarg) : );
    //long l2= ((long)i1) << 32;
    //return ( i0 | l2 );
    return i0 ;
#endif // __INTEL_COMPILER
#endif
}

static bool AVX_works(void) {
    static int ireturn= -1;
    if ( ireturn >= 0 )
        return ireturn;

    int cpuflags[4];
    cpuid(cpuflags,1);

    ireturn= 0;
    if ( ((cpuflags[2] & (0x1<<27)) != 0) &&    // OSXSAVE
         ((cpuflags[2] & (0x1<<28)) != 0) &&    // AVX
         ((xgetbv(0) & 0x6) == 0x6) )           // OS saves xmm & ymm state
        ireturn= 1;

    return ireturn;
}

static bool SSE_works(void) {
    static int ireturn= -1;
    if ( ireturn >= 0 )
        return ireturn;

    int cpuflags[4];
    cpuid(cpuflags,1);

    ireturn= 0;
    if ( (cpuflags[3] & (0x1<<25)) != 0 )       // SSE
        ireturn= 1;

    return ireturn;
}

static bool FMA4_works(void) {
    static int ireturn= -1;
    if ( ireturn >= 0 )
        return ireturn;

    ireturn= 0;
    if ( AVX_works() ) {
        int cpuflags[4];
        cpuid( cpuflags, 0x80000001);
        if ( (cpuflags[2] & (0x1<<16)) != 0 )   // FMA4
            ireturn= 1;
    }
    return ireturn;
}

static bool FMA3_works(void) {
    static int ireturn= -1;
    if ( ireturn >= 0 )
        return ireturn;

    ireturn= 0;
    if ( AVX_works() ) {
        int cpuflags[4];
        cpuid( cpuflags, 1);
        if ( (cpuflags[2] & (0x1<<12)) != 0 )   // FMA3
            ireturn= 1;
    }
    return ireturn;
}
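
// A small sketch (hypothetical helper, not part of the original file): before running
// any kernels, check that the instruction set this translation unit was compiled for
// is actually usable on the running CPU and OS, using the detection functions above.
static bool SIMD_runtime_supported(void) {
#if defined ( __FMA4__ )
    return FMA4_works();
#elif defined ( __FMA__ ) || defined ( __FMA3__ )
    return FMA3_works();
#elif defined __AVX__
    return AVX_works();
#else
    return SSE_works();
#endif
}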


//#endif /* FLEXSIMD_H_ */