48 #error You have an ancient non-SSE compiler, or you need to add compiler flags.
53 #warning Flag __GNUC__ found. Using GNU compiler includes.
54 #include <xmmintrin.h>
57 #include <fma4intrin.h>
61 #warning Flag __GNUC__ not found. Using Intel compiler includes.
63 #include <xmmintrin.h>
66 #include <immintrin.h>
71 #include <fmaintrin.h>
80 #warning AVX is supported by your compiler!
82 #warning AVX is not supported by your compiler, or you need to add compiler flags!
85 #warning Compiler supports FMA4!
87 #if defined ( __FMA__ ) || defined ( __FMA3__ )
88 #warning Compiler supports FMA3!
93 #define SIMD_Float16 AVX_FMA4_Float16
94 #elif defined ( __FMA__ ) || defined ( __FMA3__ )
95 #define SIMD_Float16 AVX_FMA3_Float16
97 #define SIMD_Float16 AVX_Float16
99 #define SIMD_Float16 SSE_Float16
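Whichever class the preprocessor chooses above, client code is written once against the SIMD_Float16 alias. A minimal sketch of the intended use, assuming the selected class exposes the constructors, setConstant, operator*= and store members shown in the listings below, that SIMD_constant resolves to the matching constant type, and that data points to 16 floats with the alignment the class asserts (16 bytes for SSE, 32 bytes for the AVX variants):

void scale16( float* data, float factor )
{
    SIMD_constant c;
    SIMD_Float16::setConstant( c, factor ); // broadcast factor into every lane
    SIMD_Float16 v( data );                 // aligned load of 16 floats
    v *= c;                                 // one multiply per lane
    v.store( data );                        // aligned store back
}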
122 static const char* description() { return "Abstract class for 16 floats. ";}
141 inline void load( float* array);
142 inline void loadu( float* array);
144 inline void stream( float* array);
148 inline void store( float* array);
214 assert( ((unsigned long)array) % 16 == 0 );
215 _xmm0= _mm_load_ps( array );
216 _xmm1= _mm_load_ps( array+ 4 );
217 _xmm2= _mm_load_ps( array+ 2*4 );
218 _xmm3= _mm_load_ps( array+ 3*4 );
229 _xmm0= _mm_set1_ps( constVal );
236 sConst._xmm= _mm_set1_ps( constVal );
239 inline void stream( float* array) __attribute__((always_inline))
241 assert( ((unsigned long)array) % 16 == 0 );
242 _mm_stream_ps( array, _xmm0 );
243 _mm_stream_ps( array+ 4, _xmm1 );
244 _mm_stream_ps( array+ 2*4, _xmm2 );
245 _mm_stream_ps( array+ 3*4, _xmm3 );
249 assert( ((unsigned long)array) % 16 == 0 );
250 _mm_store_ps( array, _xmm0 );
251 _mm_store_ps( array+ 4, _xmm1 );
252 _mm_store_ps( array+ 2*4, _xmm2 );
253 _mm_store_ps( array+ 3*4, _xmm3 );
255 inline void load( float* array)
257 assert( ((unsigned long)array) % 16 == 0 );
258 _xmm0= _mm_load_ps( array );
259 _xmm1= _mm_load_ps( array+ 4 );
260 _xmm2= _mm_load_ps( array+ 2*4 );
261 _xmm3= _mm_load_ps( array+ 3*4 );
265 _xmm0= _mm_loadu_ps( array );
266 _xmm1= _mm_loadu_ps( array+ 4 );
267 _xmm2= _mm_loadu_ps( array+ 2*4 );
268 _xmm3= _mm_loadu_ps( array+ 3*4 );
278 _xmm0= _mm_mul_ps( _xmm0, B._xmm0 );
279 _xmm1= _mm_mul_ps( _xmm1, B._xmm1 );
280 _xmm2= _mm_mul_ps( _xmm2, B._xmm2 );
281 _xmm3= _mm_mul_ps( _xmm3, B._xmm3 );
288 assert( ((unsigned long)B) % 16 == 0 );
289 _xmm0= _mm_mul_ps( _xmm0, _mm_load_ps(B+0*4) );
290 _xmm1= _mm_mul_ps( _xmm1, _mm_load_ps(B+1*4) );
291 _xmm2= _mm_mul_ps( _xmm2, _mm_load_ps(B+2*4) );
292 _xmm3= _mm_mul_ps( _xmm3, _mm_load_ps(B+3*4) );
298 _xmm0= _mm_mul_ps( _xmm0, sConst._xmm );
299 _xmm1= _mm_mul_ps( _xmm1, sConst._xmm );
300 _xmm2= _mm_mul_ps( _xmm2, sConst._xmm );
301 _xmm3= _mm_mul_ps( _xmm3, sConst._xmm );
308 _xmm0= _mm_add_ps( _xmm0, B._xmm0 );
309 _xmm1= _mm_add_ps( _xmm1, B._xmm1 );
310 _xmm2= _mm_add_ps( _xmm2, B._xmm2 );
311 _xmm3= _mm_add_ps( _xmm3, B._xmm3 );
317 assert( ((unsigned long)B) % 16 == 0 );
318 _xmm0= _mm_add_ps( _xmm0, _mm_load_ps(B+0*4) );
319 _xmm1= _mm_add_ps( _xmm1, _mm_load_ps(B+1*4) );
320 _xmm2= _mm_add_ps( _xmm2, _mm_load_ps(B+2*4) );
321 _xmm3= _mm_add_ps( _xmm3, _mm_load_ps(B+3*4) );
327 _xmm0= _mm_add_ps( _xmm0, sConst._xmm );
328 _xmm1= _mm_add_ps( _xmm1, sConst._xmm );
329 _xmm2= _mm_add_ps( _xmm2, sConst._xmm );
330 _xmm3= _mm_add_ps( _xmm3, sConst._xmm );
337 _xmm0= _mm_sub_ps( _xmm0, B._xmm0 );
338 _xmm1= _mm_sub_ps( _xmm1, B._xmm1 );
339 _xmm2= _mm_sub_ps( _xmm2, B._xmm2 );
340 _xmm3= _mm_sub_ps( _xmm3, B._xmm3 );
346 assert( ((unsigned long)B) % 16 == 0 );
347 _xmm0= _mm_sub_ps( _xmm0, _mm_load_ps(B+0*4) );
348 _xmm1= _mm_sub_ps( _xmm1, _mm_load_ps(B+1*4) );
349 _xmm2= _mm_sub_ps( _xmm2, _mm_load_ps(B+2*4) );
350 _xmm3= _mm_sub_ps( _xmm3, _mm_load_ps(B+3*4) );
356 _xmm0= _mm_sub_ps( _xmm0, sConst._xmm );
357 _xmm1= _mm_sub_ps( _xmm1, sConst._xmm );
358 _xmm2= _mm_sub_ps( _xmm2, sConst._xmm );
359 _xmm3= _mm_sub_ps( _xmm3, sConst._xmm );
369 _xmm0= _mm_mul_ps( A._xmm0, B._xmm0 );
370 _xmm1= _mm_mul_ps( A._xmm1, B._xmm1 );
371 _xmm2= _mm_mul_ps( A._xmm2, B._xmm2 );
372 _xmm3= _mm_mul_ps( A._xmm3, B._xmm3 );
378 _xmm0= _mm_mul_ps( A._xmm0, sConst._xmm );
379 _xmm1= _mm_mul_ps( A._xmm1, sConst._xmm );
380 _xmm2= _mm_mul_ps( A._xmm2, sConst._xmm );
381 _xmm3= _mm_mul_ps( A._xmm3, sConst._xmm );
386 assert( ((unsigned long)B) % 16 == 0 );
387 _xmm0= _mm_mul_ps( A._xmm0, _mm_load_ps(B+0*4) );
388 _xmm1= _mm_mul_ps( A._xmm1, _mm_load_ps(B+1*4) );
389 _xmm2= _mm_mul_ps( A._xmm2, _mm_load_ps(B+2*4) );
390 _xmm3= _mm_mul_ps( A._xmm3, _mm_load_ps(B+3*4) );
397 _xmm0= _mm_add_ps( A._xmm0, B._xmm0 );
398 _xmm1= _mm_add_ps( A._xmm1, B._xmm1 );
399 _xmm2= _mm_add_ps( A._xmm2, B._xmm2 );
400 _xmm3= _mm_add_ps( A._xmm3, B._xmm3 );
406 _xmm0= _mm_add_ps( A._xmm0, sConst._xmm );
407 _xmm1= _mm_add_ps( A._xmm1, sConst._xmm );
408 _xmm2= _mm_add_ps( A._xmm2, sConst._xmm );
409 _xmm3= _mm_add_ps( A._xmm3, sConst._xmm );
415 assert( ((unsigned long)B) % 16 == 0 );
416 _xmm0= _mm_add_ps( A._xmm0, _mm_load_ps(B+0*4) );
417 _xmm1= _mm_add_ps( A._xmm1, _mm_load_ps(B+1*4) );
418 _xmm2= _mm_add_ps( A._xmm2, _mm_load_ps(B+2*4) );
419 _xmm3= _mm_add_ps( A._xmm3, _mm_load_ps(B+3*4) );
427 _xmm0= _mm_sub_ps( A._xmm0, B._xmm0 );
428 _xmm1= _mm_sub_ps( A._xmm1, B._xmm1 );
429 _xmm2= _mm_sub_ps( A._xmm2, B._xmm2 );
430 _xmm3= _mm_sub_ps( A._xmm3, B._xmm3 );
436 _xmm0= _mm_sub_ps( A._xmm0, sConst._xmm );
437 _xmm1= _mm_sub_ps( A._xmm1, sConst._xmm );
438 _xmm2= _mm_sub_ps( A._xmm2, sConst._xmm );
439 _xmm3= _mm_sub_ps( A._xmm3, sConst._xmm );
445 assert( ((unsigned long)B) % 16 == 0 );
446 _xmm0= _mm_sub_ps( A._xmm0, _mm_load_ps(B+0*4) );
447 _xmm1= _mm_sub_ps( A._xmm1, _mm_load_ps(B+1*4) );
448 _xmm2= _mm_sub_ps( A._xmm2, _mm_load_ps(B+2*4) );
449 _xmm3= _mm_sub_ps( A._xmm3, _mm_load_ps(B+3*4) );
457 _xmm0= _mm_add_ps( _xmm0, _mm_mul_ps( mult1._xmm0, mult2._xmm0 ) );
458 _xmm1= _mm_add_ps( _xmm1, _mm_mul_ps( mult1._xmm1, mult2._xmm1 ) );
459 _xmm2= _mm_add_ps( _xmm2, _mm_mul_ps( mult1._xmm2, mult2._xmm2 ) );
460 _xmm3= _mm_add_ps( _xmm3, _mm_mul_ps( mult1._xmm3, mult2._xmm3 ) );
466 _xmm0= _mm_add_ps( _xmm0, _mm_mul_ps( mult1._xmm0, mult2Const._xmm ) );
467 _xmm1= _mm_add_ps( _xmm1, _mm_mul_ps( mult1._xmm1, mult2Const._xmm ) );
468 _xmm2= _mm_add_ps( _xmm2, _mm_mul_ps( mult1._xmm2, mult2Const._xmm ) );
469 _xmm3= _mm_add_ps( _xmm3, _mm_mul_ps( mult1._xmm3, mult2Const._xmm ) );
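Note that on plain SSE the fMultAdd above is only emulated: each lane is computed as a multiply followed by a separate add, so the intermediate product is rounded before the accumulation. The FMA4 and FMA3 classes further down map the same call onto a single fused instruction with one rounding. Roughly, per lane:

// SSE emulation:  acc = round( round(mult1 * mult2) + acc )   // two roundings
// FMA4 / FMA3:    acc = round( mult1 * mult2 + acc )          // one rounding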
486 int length() { return 16;}
488 static const char* description() { return "Generic AVX, No FMA, 16 floats";}
490 AVX_Float16( float* array)
492 assert( ((unsigned long)array) % 32 == 0 );
493 _ymm0= _mm256_load_ps( array );
494 _ymm1= _mm256_load_ps( array+ 8 );
496 AVX_Float16( const AVX_Float16& B)
504 AVX_Float16( float constVal )
506 _ymm0= _mm256_set1_ps( constVal );
510 inline static void setConstant( SIMD_constant& sConst, const float& constVal ) {
511 sConst._ymm= _mm256_set1_ps( constVal );
514 inline void stream( float* array)
516 assert( ((unsigned long)array) % 32 == 0 );
517 _mm256_stream_ps( array, _ymm0 );
518 _mm256_stream_ps( array+ 8, _ymm1 );
520 inline void store( float* array)
522 assert( ((unsigned long)array) % 32 == 0 );
523 _mm256_store_ps( array, _ymm0 );
524 _mm256_store_ps( array+ 8, _ymm1 );
526 inline void load( float* array)
528 assert( ((unsigned long)array) % 32 == 0 );
529 _ymm0= _mm256_load_ps( array );
530 _ymm1= _mm256_load_ps( array+ 8 );
532 inline void loadu( float* array)
534 _ymm0= _mm256_loadu_ps( array );
535 _ymm1= _mm256_loadu_ps( array+ 8 );
540 inline AVX_Float16& mult( const AVX_Float16& A, const AVX_Float16& B)
542 _ymm0= _mm256_mul_ps( A._ymm0, B._ymm0 );
543 _ymm1= _mm256_mul_ps( A._ymm1, B._ymm1 );
547 inline AVX_Float16& mult( const AVX_Float16& A, const SIMD_constant& sConst )
549 _ymm0= _mm256_mul_ps( A._ymm0, sConst._ymm );
550 _ymm1= _mm256_mul_ps( A._ymm1, sConst._ymm );
554 inline AVX_Float16& mult( const AVX_Float16& A, const float* B)
556 assert( ((unsigned long)B) % 32 == 0 );
557 _ymm0= _mm256_mul_ps( A._ymm0, _mm256_load_ps(B+0*8) );
558 _ymm1= _mm256_mul_ps( A._ymm1, _mm256_load_ps(B+1*8) );
563 inline AVX_Float16& add( const AVX_Float16& A, const AVX_Float16& B)
565 _ymm0= _mm256_add_ps( A._ymm0, B._ymm0 );
566 _ymm1= _mm256_add_ps( A._ymm1, B._ymm1 );
570 inline AVX_Float16& add( const AVX_Float16& A, const SIMD_constant& sConst )
572 _ymm0= _mm256_add_ps( A._ymm0, sConst._ymm );
573 _ymm1= _mm256_add_ps( A._ymm1, sConst._ymm );
577 inline AVX_Float16& add( const AVX_Float16& A, const float* B)
579 assert( ((unsigned long)B) % 32 == 0 );
580 _ymm0= _mm256_add_ps( A._ymm0, _mm256_load_ps(B+0*8) );
581 _ymm1= _mm256_add_ps( A._ymm1, _mm256_load_ps(B+1*8) );
587 inline AVX_Float16& sub( const AVX_Float16& A, const AVX_Float16& B)
589 _ymm0= _mm256_sub_ps( A._ymm0, B._ymm0 );
590 _ymm1= _mm256_sub_ps( A._ymm1, B._ymm1 );
594 inline AVX_Float16& sub( const AVX_Float16& A, const SIMD_constant& sConst )
596 _ymm0= _mm256_sub_ps( A._ymm0, sConst._ymm );
597 _ymm1= _mm256_sub_ps( A._ymm1, sConst._ymm );
601 inline AVX_Float16& sub( const AVX_Float16& A, const float* B)
603 assert( ((unsigned long)B) % 32 == 0 );
604 _ymm0= _mm256_sub_ps( A._ymm0, _mm256_load_ps(B+0*8) );
605 _ymm1= _mm256_sub_ps( A._ymm1, _mm256_load_ps(B+1*8) );
611 inline AVX_Float16& operator*=( const AVX_Float16& B)
613 return this->mult( *this , B);
616 inline AVX_Float16& operator*=( const float* B)
618 assert( ((unsigned long)B) % 32 == 0 );
619 return this->mult( *this , B);
621 inline AVX_Float16& operator*=( const SIMD_constant& sConst )
623 return this->mult( *this , sConst);
626 inline AVX_Float16& operator+=( const AVX_Float16& B)
628 _ymm0= _mm256_add_ps( _ymm0, B._ymm0 );
629 _ymm1= _mm256_add_ps( _ymm1, B._ymm1 );
633 inline AVX_Float16& operator+=( const float* B)
635 assert( ((unsigned long)B) % 32 == 0 );
636 _ymm0= _mm256_add_ps( _ymm0, _mm256_load_ps(B+0*8) );
637 _ymm1= _mm256_add_ps( _ymm1, _mm256_load_ps(B+1*8) );
641 inline AVX_Float16& operator+=( const SIMD_constant& sConst )
643 _ymm0= _mm256_add_ps( _ymm0, sConst._ymm );
644 _ymm1= _mm256_add_ps( _ymm1, sConst._ymm );
649 inline AVX_Float16& operator-=( const AVX_Float16& B)
651 _ymm0= _mm256_sub_ps( _ymm0, B._ymm0 );
652 _ymm1= _mm256_sub_ps( _ymm1, B._ymm1 );
656 inline AVX_Float16& operator-=( const float* B)
658 assert( ((unsigned long)B) % 32 == 0 );
659 _ymm0= _mm256_sub_ps( _ymm0, _mm256_load_ps(B+0*8) );
660 _ymm1= _mm256_sub_ps( _ymm1, _mm256_load_ps(B+1*8) );
664 inline AVX_Float16& operator-=( const SIMD_constant& sConst )
666 _ymm0= _mm256_sub_ps( _ymm0, sConst._ymm );
667 _ymm1= _mm256_sub_ps( _ymm1, sConst._ymm );
673 inline AVX_Float16& fMultAdd( const AVX_Float16& mult1, const AVX_Float16& mult2 )
675 _ymm0= _mm256_add_ps( _ymm0, _mm256_mul_ps( mult1._ymm0, mult2._ymm0 ) );
676 _ymm1= _mm256_add_ps( _ymm1, _mm256_mul_ps( mult1._ymm1, mult2._ymm1 ) );
679 inline AVX_Float16& fMultAdd( const AVX_Float16& mult1, const SIMD_constant& sConst )
681 _ymm0= _mm256_add_ps( _ymm0, _mm256_mul_ps( mult1._ymm0, sConst._ymm ) );
682 _ymm1= _mm256_add_ps( _ymm1, _mm256_mul_ps( mult1._ymm1, sConst._ymm ) );
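As a usage sketch of the AVX class: the function name saxpy16, the requirement that n be a multiple of 16, and the 32-byte alignment of x and y are assumptions for the example, not part of the listing.

void saxpy16( int n, float a, float* x, float* y )
{
    SIMD_constant ac;
    AVX_Float16::setConstant( ac, a );      // broadcast the scalar a
    for (int i= 0; i < n; i+= 16) {         // 16 floats per iteration
        AVX_Float16 vy( y+ i );             // aligned load of y
        AVX_Float16 vx( x+ i );             // aligned load of x
        vy.fMultAdd( vx, ac );              // vy += vx * a (mul+add here, fused in the FMA classes)
        vy.store( y+ i );                   // aligned store back
    }
}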
691 class AVX_FMA4_Float16 : public AVX_Float16 {
693 static const char* description() { return "AVX, AMD FMA, 16 floats";}
695 AVX_FMA4_Float16( float* array) : AVX_Float16(array) {};
696 AVX_FMA4_Float16( float c) : AVX_Float16(c) {};
697 AVX_FMA4_Float16( const AVX_FMA4_Float16& B)
706 inline AVX_FMA4_Float16& fMultAdd( const AVX_FMA4_Float16& mult1, const AVX_FMA4_Float16& mult2 )
708 _ymm0= _mm256_macc_ps( mult1._ymm0, mult2._ymm0, _ymm0 );
709 _ymm1= _mm256_macc_ps( mult1._ymm1, mult2._ymm1, _ymm1 );
712 inline AVX_FMA4_Float16& fMultAdd( const AVX_FMA4_Float16& mult1, const SIMD_constant& sConst )
714 _ymm0= _mm256_macc_ps( mult1._ymm0, sConst._ymm, _ymm0 );
715 _ymm1= _mm256_macc_ps( mult1._ymm1, sConst._ymm, _ymm1 );
722 class AVX_FMA3_Float16 : public AVX_Float16 {
724 static const char* description() { return "AVX, Intel FMA, 16 floats";}
726 AVX_FMA3_Float16( float* array) : AVX_Float16(array) {};
727 AVX_FMA3_Float16( float c) : AVX_Float16(c) {};
728 AVX_FMA3_Float16( const AVX_FMA3_Float16& B)
738 inline AVX_FMA3_Float16& fMultAdd( const AVX_FMA3_Float16& mult1, const AVX_FMA3_Float16& mult2 )
740 _ymm0= _mm256_fmadd_ps( mult1._ymm0, mult2._ymm0, _ymm0 );
741 _ymm1= _mm256_fmadd_ps( mult1._ymm1, mult2._ymm1, _ymm1 );
744 inline AVX_FMA3_Float16& fMultAdd( const AVX_FMA3_Float16& mult1, const SIMD_constant& sConst )
746 _ymm0= _mm256_fmadd_ps( mult1._ymm0, sConst._ymm, _ymm0 );
747 _ymm1= _mm256_fmadd_ps( mult1._ymm1, sConst._ymm, _ymm1 );
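Both fused classes compute the same a*b + c per lane; only the intrinsic family differs:

// FMA4 (AMD four-operand encoding, <fma4intrin.h> included above):  _mm256_macc_ps( a, b, c )  -> a*b + c
// FMA3 (three-operand encoding, <fmaintrin.h> included above):      _mm256_fmadd_ps( a, b, c ) -> a*b + c,
//      where at the instruction level the destination register reuses one of the three sources.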
756 static void cpuid( int result[4], int idnum) {
757 #ifdef __INTEL_COMPILER
758 __cpuid(result, idnum );
759 #else // __INTEL_COMPILER
761 __asm( "cpuid": "=a"(i0), "=b"(i1), "=c"(i2), "=d"(i3) : "a"(idnum), "c"(0) : );
766 #endif // __INTEL_COMPILER
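The feature tests below index result[2] (ECX) and result[3] (EDX), matching the __cpuid convention of returning EAX, EBX, ECX, EDX in result[0..3]. A small illustrative call, mirroring the pattern of the *_works() helpers (the name has_sse2 is hypothetical, not part of the listing; SSE2 is reported in leaf 1, EDX bit 26):

static bool has_sse2( void ) {
    int cpuflags[4];                              // filled as EAX, EBX, ECX, EDX
    cpuid( cpuflags, 1 );                         // standard feature-information leaf
    return ( (cpuflags[3] & (0x1<<26)) != 0 );    // EDX bit 26 = SSE2
}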
770 static long xgetbv( int iarg) {
774 #ifdef __INTEL_COMPILER
778 #else // __INTEL_COMPILER
780 __asm( "xgetbv": "=a"(i0), "=d"(i1) : "c"(iarg) : );
784 #endif // __INTEL_COMPILER
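xgetbv(0) reads XCR0, which records which register states the operating system saves and restores on context switches: bit 1 covers the XMM state and bit 2 the YMM state. The mask 0x6 used by AVX_works() below therefore requires both:

// (xgetbv(0) & 0x6) == 0x6   <=>   the OS preserves both XMM and YMM registers,
//                                  a prerequisite for safely executing AVX code.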
788 static bool AVX_works( void) {
789 static int ireturn= -1;
797 if ( ((cpuflags[2] & (0x1<<27)) != 0) &&
798      ((cpuflags[2] & (0x1<<28)) != 0) &&
799      ((xgetbv(0) & 0x6) == 0x6) )
805 static bool SSE_works( void) {
806 static int ireturn= -1;
814 if ( (cpuflags[3] & (0x1<<25)) != 0 )
820 static bool FMA4_works( void) {
821 static int ireturn= -1;
828 cpuid( cpuflags, 0x80000001);
829 if ( (cpuflags[2] & (0x1<<16)) != 0 )
835 static bool FMA3_works( void) {
836 static int ireturn= -1;
844 if ( (cpuflags[2] & (0x1<<12)) != 0 )
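A sketch of how these runtime checks can back up the compile-time selection at the top of the file, for example by warning at startup when the binary was built for an ISA the CPU or OS cannot actually run. The function name checkISA, the use of stderr, and the __FMA4__ guard are assumptions for the example:

#include <cstdio>
static void checkISA( void )
{
#if defined ( __AVX__ )
    if ( !AVX_works() )
        fprintf( stderr, "Warning: built with AVX, but this CPU/OS cannot run it.\n" );
#else
    if ( !SSE_works() )
        fprintf( stderr, "Warning: CPUID does not report SSE support.\n" );
#endif
#if defined ( __FMA4__ )
    if ( !FMA4_works() )
        fprintf( stderr, "Warning: built with FMA4, but the CPU lacks it.\n" );
#elif defined ( __FMA__ ) || defined ( __FMA3__ )
    if ( !FMA3_works() )
        fprintf( stderr, "Warning: built with FMA3, but the CPU lacks it.\n" );
#endif
}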