/**
***
*** Copyright (C) 2007-2010 Intel Corporation.  All rights reserved.
***
*** The information and source code contained herein is the exclusive
*** property of Intel Corporation and may not be disclosed, examined
*** or reproduced in whole or in part except as expressly provided
*** by the accompanying LICENSE AGREEMENT
***
**/

/*
 *  Definition of a C++ class interface to MIC processor intrinsics.
 *
 *    File name : micvec.h  class definitions 
 *
 *    Concept: A C++ abstraction of MIC processor SIMD vector extension
 *      designed to improve programmer productivity.  Speed and accuracy are 
 *      sacrificed for utility.  Facilitates an easy transition to compiler
 *      intrinsics or assembly language. 
 *
 */

#ifndef MICVEC_H_INCLUDED
#define MICVEC_H_INCLUDED

#if !defined __cplusplus
    #error ERROR: This file is only supported in C++ compilations!
#endif /* !__cplusplus */

/* MIC Intrinsics include file */
#include <immintrin.h> 
#include <assert.h>

#ifndef _MSC_VER
#pragma pack(push,64) /* Must ensure class & union 64-B aligned */
#endif


/*
 * EXPLICIT expands to the `explicit` keyword when compiling with
 * MSVC 5.0 or later (_MSC_VER >= 1100), on Linux/Unix/Apple targets,
 * or with the Intel compiler.  On any other toolchain it expands to
 * nothing and a build message is emitted.
 */
#if (_MSC_VER >= 1100) || defined (__linux__) || defined(__unix__) || defined(__APPLE__)
        #define EXPLICIT explicit
#else
   #if (__INTEL_COMPILER)
        #define EXPLICIT explicit /* Intel compiler: the explicit keyword is used */
   #else
        #define EXPLICIT /* nothing */
        #pragma message( "explicit keyword not recognized") 
   #endif
#endif

/*
 * Figure out whether and how to define the output operators.
 * The stream operators are compiled only when one of the macros tested
 * below is already defined (these look like the include guards of
 * various <iostream> implementations, so <iostream> must be included
 * before this header for the operators to exist).
 * MICVEC_STD is the qualifier written in front of `ostream`: `std::`
 * for the first group of macros and empty for the second (presumably
 * pre-standard <iostream.h> headers -- confirm).
 */
#if defined(_IOSTREAM_) || defined(_CPP_IOSTREAM) || defined(_GLIBCXX_IOSTREAM)
#define MICVEC_DEFINE_OUTPUT_OPERATORS
#define MICVEC_STD std::
#elif defined(_INC_IOSTREAM) || defined(_IOSTREAM_H_)
#define MICVEC_DEFINE_OUTPUT_OPERATORS
#define MICVEC_STD
#endif

/* Load helpers used by the array constructors: SETA_PS wraps
 * _mm512_load_ps and SETA_PD wraps _mm512_load_pd.
 * NOTE(review): x presumably has to be 64-byte aligned -- confirm
 * against the intrinsic reference. */
# define SETA_PS(x)     _mm512_load_ps((x))
# define SETA_PD(x)     _mm512_load_pd((x))

/* MIC Functionality Intrinsics */

/* Forward declarations of the vector classes provided by this header. */
class M512;         /*  1 element,  a __m512i data type */
class I32vec16;     /* 16 elements, each element a signed or unsigned 32-bit integer */
class Is32vec16;    /* 16 elements, each element a signed 32-bit integer */
class Iu32vec16;    /* 16 elements, each element an unsigned 32-bit integer */
class I64vec8;      /*  8 elements, each element a 64-bit integer */
class F32vec16;     /* 16 elements, each element a float32 type */
class F64vec8;      /*  8 elements, each element a float64 type */
class VecMask16;    /* 16 elements, writemask */

/*
 * Class VecMask16
 * 16 elements, writemask
 * Constructors & Bitwise Operations
 */

class VecMask16
{
    /*
     * Thin wrapper around the __mmask write-mask type.  Converts
     * implicitly to and from __mmask so it can be handed directly to
     * the masked intrinsics.
     */
protected:
    __mmask m;
public:
    /* Default mask: 0xffff. */
    VecMask16()
        { m = 0xffff; }

    /* Wrap an existing raw mask. */
    VecMask16(__mmask mm)
        { m = mm; }

    /* Build a mask from an integer through _mm512_int2mask. */
    VecMask16(int mm)
        { m = _mm512_int2mask(mm); }

    /* Implicit conversion back to the raw mask type. */
    operator __mmask() const
        { return m; }

    /* Compound bitwise assignments: update this mask in place and
       return a copy of the updated value. */
    VecMask16 operator &= (VecMask16 a)
        { return *this = (VecMask16) _mm512_kand(m,a); }

    VecMask16 operator |= (VecMask16 a)
        { return *this = (VecMask16) _mm512_kor(m,a); }

    VecMask16 operator ^= (VecMask16 a)
        { return *this = (VecMask16) _mm512_kxor(m,a); }

    /* Complement.  Returns the inverted mask and leaves this object
       untouched, as expected of a unary operator (it previously stored
       the result back into *this, so `b = ~a` also changed a). */
    VecMask16 operator ~ () const
        { return (VecMask16) _mm512_knot(m); }

    /* In-place and-not: this mask is replaced by _mm512_kandn(m, a)
       and the new value is returned. */
    VecMask16 andn (VecMask16 a)
        { return *this = (VecMask16) _mm512_kandn(m,a); }

    /* In-place reversed and-not: this mask is replaced by
       _mm512_kandnr(m, a) and the new value is returned. */
    VecMask16 andnr (VecMask16 a)
        { return *this = (VecMask16) _mm512_kandnr(m,a); }
};
 
/* Bitwise AND of two write-masks. */
inline VecMask16 operator & (VecMask16 a, VecMask16 b)
{
    VecMask16 combined = _mm512_kand(a, b);
    return combined;
}

/* Bitwise OR of two write-masks. */
inline VecMask16 operator | (VecMask16 a, VecMask16 b)
{
    VecMask16 combined = _mm512_kor(a, b);
    return combined;
}

/* Bitwise XOR of two write-masks. */
inline VecMask16 operator ^ (VecMask16 a, VecMask16 b)
{
    VecMask16 combined = _mm512_kxor(a, b);
    return combined;
}



/*
 * Class M512
 * 1 element, a __m512i data type
 * Constructors & Logical Operations
 */

class M512
{
    /*
     * Base class holding one raw __m512i value.  It offers the bitwise
     * operations that do not depend on how the 512 bits are divided
     * into elements; the typed integer vector classes derive from it.
     */
protected:
    __m512i vec;

public:
    /* Default constructor: vec is left uninitialized. */
    M512()                       { }

    /* Wrap an existing raw vector. */
    M512(__m512i mm)             { vec = mm; }

    /* Implicit conversion back to the raw vector type. */
    operator __m512i() const     { return vec; }

    /* Logical Operations */
    M512& operator &= (const M512 &a)
        { return *this = (M512) _mm512_and_epi32(vec,a); }

    M512& operator |= (const M512 &a)
        { return *this = (M512) _mm512_or_epi32(vec,a); }

    M512& operator ^= (const M512 &a)
        { return *this = (M512) _mm512_xor_epi32(vec,a); }

    /* Horizontal reductions across the 32-bit lanes.  They only read
       vec, so they are const and usable through a const reference. */
    int reduce_or () const
        { return  _mm512_reduce_or_epi32(vec); }

    int reduce_and () const
        { return  _mm512_reduce_and_epi32(vec); }
};

/* Bitwise AND of two raw 512-bit values. */
inline M512 operator & (const M512 &a, const M512 &b)
{
    M512 result(_mm512_and_epi32(a, b));
    return result;
}

/* Bitwise OR of two raw 512-bit values. */
inline M512 operator | (const M512 &a, const M512 &b)
{
    M512 result(_mm512_or_epi32(a, b));
    return result;
}

/* Bitwise XOR of two raw 512-bit values. */
inline M512 operator ^ (const M512 &a, const M512 &b)
{
    M512 result(_mm512_xor_epi32(a, b));
    return result;
}

/* And-not of two raw 512-bit values, forwarded to _mm512_andnot_epi32(a, b).
 * NOTE(review): presumably (~a) & b -- confirm against the intrinsic reference. */
inline M512 andnot(const M512 &a, const M512 &b)
{
    M512 result(_mm512_andnot_epi32(a, b));
    return result;
}


/*
 * Class I32vec16
 * 16 elements, each element is a signed or unsigned int 
 */
class I32vec16 : public M512
{
    /*
     * Sign-agnostic operations on 16 packed 32-bit integer lanes.
     * Is32vec16 and Iu32vec16 derive from this class.
     */
public:
    /* Default constructor: the lanes are left uninitialized. */
    I32vec16() { }

    /* Wrap an existing raw vector. */
    I32vec16(__m512i mm) : M512(mm)     { }

    /* Broadcast one value to every lane; the 64-bit overload is
       truncated to int first. */
    EXPLICIT I32vec16(const int i)     { vec = _mm512_set1_epi32(i); }
    EXPLICIT I32vec16(const __int64 i) { vec = _mm512_set1_epi32((int)i); }

    /* Fill the vector from four values through _mm512_set4_epi32.
       Note that the arguments are forwarded in reverse order. */
    I32vec16(const int i3, const int i2, const int i1, const int i0)
        { vec = _mm512_set4_epi32(i0, i1, i2, i3); }

    /* Set all 16 lanes individually through _mm512_set_epi32. */
    I32vec16(const int i15, const int i14, const int i13, const int i12,
             const int i11, const int i10, const int  i9, const int  i8,
             const int  i7, const int  i6, const int  i5, const int  i4,
             const int  i3, const int  i2, const int  i1, const int  i0)
        {
            vec = _mm512_set_epi32(i15, i14, i13, i12,
                                   i11, i10,  i9,  i8,
                                   i7,  i6,  i5,  i4,
                                   i3,  i2,  i1,  i0);
        }

    /* Load 16 ints from memory through _mm512_load_epi32.
       NOTE(review): presumably requires 64-byte alignment -- confirm. */
    I32vec16(int i[16]) { vec = _mm512_load_epi32((int*)i); }

    /* Special constructor - gather elements from memory locations:
       base address a, per-lane indices b, scale _MM_SCALE_4. */
    I32vec16(int *a, I32vec16 b)
        {
            vec = _mm512_i32gather_epi32(b, (float*)a, _MM_SCALE_4);
        }

    /* Assignment Operator: accepts any M512 and copies its bits. */
    I32vec16& operator = (const M512 &a) { return *this = (I32vec16) a; }

    /* Logicals Operators */
    I32vec16& operator &= (const M512 &a)
        { return *this = (I32vec16) _mm512_and_epi32(vec,a); }

    I32vec16& operator |= (const M512 &a)
        { return *this = (I32vec16) _mm512_or_epi32(vec,a); }

    I32vec16& operator ^= (const M512 &a)
        { return *this = (I32vec16) _mm512_xor_epi32(vec,a); }

    /* Addition & Subtraction Assignment Operators */
    I32vec16& operator += (const I32vec16 &a)
        { return *this = (I32vec16)_mm512_add_epi32(vec,a); }

    I32vec16& operator -= (const I32vec16 &a)
        { return *this = (I32vec16)_mm512_sub_epi32(vec,a); }

    /* Shift Logical Operators: each lane is shifted left by the count
       held in the matching lane of a. */
    I32vec16 operator << (const I32vec16 &a) const
        { return _mm512_sllv_epi32(vec,a); }

    I32vec16& operator <<= (const I32vec16 &a)
        { return *this = (I32vec16)_mm512_sllv_epi32(vec,a); }

    /* Reduce functions */
    friend int reduce_or(const M512 &a)  { return _mm512_reduce_or_epi32(a);}
    friend int reduce_and(const M512 &a) { return _mm512_reduce_and_epi32(a);}

    int reduce_mul () const { return  _mm512_reduce_mul_epi32(vec); }
    int reduce_add () const { return  _mm512_reduce_add_epi32(vec); }

    /* Register swizzles: each returns a new vector and leaves this
       object unchanged. */
    I32vec16 cdab() const { return (I32vec16)_mm512_swizzle_epi32(vec, _MM_SWIZ_REG_CDAB); }
    I32vec16 badc() const { return (I32vec16)_mm512_swizzle_epi32(vec, _MM_SWIZ_REG_BADC); }
    I32vec16 aaaa() const { return (I32vec16)_mm512_swizzle_epi32(vec, _MM_SWIZ_REG_AAAA); }
    I32vec16 bbbb() const { return (I32vec16)_mm512_swizzle_epi32(vec, _MM_SWIZ_REG_BBBB); }
    I32vec16 cccc() const { return (I32vec16)_mm512_swizzle_epi32(vec, _MM_SWIZ_REG_CCCC); }
    I32vec16 dddd() const { return (I32vec16)_mm512_swizzle_epi32(vec, _MM_SWIZ_REG_DDDD); }
    I32vec16 dacb() const { return (I32vec16)_mm512_swizzle_epi32(vec, _MM_SWIZ_REG_DACB); }

    /* Shuffle: p32 is applied by _mm512_shuffle_epi32, then p128 by the
       128-bit permute.  The unmasked permute is used because only a
       source vector and a pattern are supplied; the masked form
       additionally takes a pass-through vector and a write-mask. */
    template <_MM_PERM_ENUM p128, _MM_PERM_ENUM p32>
    I32vec16 shuffle() const
        { return (I32vec16)_mm512_permute4f128_epi32(
             _mm512_shuffle_epi32(vec, p32), p128); }
};

/* Per-lane equality test; the result is the mask produced by
 * _mm512_cmpeq_epi32_mask. */
inline VecMask16  cmpeq(const I32vec16 &a, const I32vec16 &b)
{
    return VecMask16(_mm512_cmpeq_epi32_mask(a, b));
}

/* Per-lane inequality test; the result is the mask produced by
 * _mm512_cmpneq_epi32_mask. */
inline VecMask16  cmpneq(const I32vec16 &a, const I32vec16 &b)
{
    return VecMask16(_mm512_cmpneq_epi32_mask(a, b));
}


/*
 * Class Is32vec16
 * 16 elements, each element signed integer 
 */
class Is32vec16 : public I32vec16
{
    /*
     * 16 packed signed 32-bit integers.  Adds arithmetic right shift,
     * negation, min/max reductions, element access, stream output and
     * scatter to the sign-agnostic operations of I32vec16.
     */
public:
    /* Constructors mirror those of I32vec16 and forward to them. */
    Is32vec16() { }
    Is32vec16(__m512i mm) : I32vec16(mm) { }
    EXPLICIT Is32vec16(const int i) : I32vec16(i) { }
    EXPLICIT Is32vec16(const __int64 i) : I32vec16(i) { }

    Is32vec16(const int i3, const int i2, const int i1, const int i0) :
        I32vec16(i3, i2, i1, i0) { }

    Is32vec16(const int i15, const int i14, const int i13, const int i12,
              const int i11, const int i10, const int  i9, const int  i8,
              const int  i7, const int  i6, const int  i5, const int  i4,
              const int  i3, const int  i2, const int  i1, const int  i0)
        : I32vec16(i15, i14, i13, i12, i11, i10, i9, i8,
                    i7,  i6,  i5,  i4,  i3,  i2, i1, i0) {}

    Is32vec16(int i[16]) : I32vec16(i) { }

    /* Special constructor - gather elements from memory locations */
    Is32vec16(int *a, I32vec16 b) : I32vec16(a, b) { }

    /* Assignment Operator: accepts any M512 and copies its bits. */
    Is32vec16& operator= (const M512 &a) { return *this = (Is32vec16) a; }

    /* Logical Operators */
    Is32vec16& operator&=(const M512 &a)
        { return *this = (Is32vec16) _mm512_and_epi32(vec,a); }

    Is32vec16& operator|=(const M512 &a)
        { return *this = (Is32vec16) _mm512_or_epi32(vec,a); }

    Is32vec16& operator^=(const M512 &a)
        { return *this = (Is32vec16) _mm512_xor_epi32(vec,a); }

    /* Addition & Subtraction Assignment Operators */
    Is32vec16& operator +=(const I32vec16 &a)
        { return *this = (Is32vec16)_mm512_add_epi32(vec,a); }

    Is32vec16& operator -=(const I32vec16 &a)
        { return *this = (Is32vec16)_mm512_sub_epi32(vec,a); }

    /* Arithmetic Operators */
    friend Is32vec16 operator +(const Is32vec16 &a, const Is32vec16 &b)
        { return _mm512_add_epi32(a,b); }

    friend Is32vec16 operator -(const Is32vec16 &a, const Is32vec16 &b)
        { return _mm512_sub_epi32(a,b); }

    /* Shift Logical Operators (per-lane shift counts taken from a) */
    Is32vec16 operator<<(const M512 &a) const
        { return _mm512_sllv_epi32(vec,a); }

    Is32vec16& operator<<=(const M512 &a)
        { return *this = (Is32vec16)_mm512_sllv_epi32(vec,a); }

    /* Shift Arithmetic Operations (per-lane shift counts taken from a) */
    Is32vec16 operator>>(const M512 &a) const
        { return _mm512_srav_epi32(vec,a); }

    Is32vec16& operator>>=(const M512 &a)
        { return *this = (Is32vec16) _mm512_srav_epi32(vec,a); }

    /* Unary minus, computed as 0 - vec.  Both overloads are kept so
       that the set of declared members is unchanged. */
    Is32vec16 operator - ( ) const
        { return Is32vec16(0) - vec; }

    Is32vec16 operator - ( )
        { return Is32vec16(0) - vec; }

    /* Reduce functions */
    int reduce_max() const
        { return  _mm512_reduce_max_epi32(vec); }

    int reduce_min() const
        { return  _mm512_reduce_min_epi32(vec); }

    friend int reduce_add(const Is32vec16 &a)
        { return _mm512_reduce_add_epi32(a); }

    friend int reduce_mul(const Is32vec16 &a)
        { return _mm512_reduce_mul_epi32(a); }

    friend int reduce_min(const Is32vec16 &a)
        { return _mm512_reduce_min_epi32(a); }

    friend int reduce_max(const Is32vec16 &a)
        { return _mm512_reduce_max_epi32(a); }

#if defined(MICVEC_DEFINE_OUTPUT_OPERATORS)
    /* Print the 16 lanes as "{e0, e1, ..., e15}", lane 0 first. */
    friend MICVEC_STD ostream& operator<< (MICVEC_STD ostream &os, const Is32vec16 &a)
        {
            const int *ip = (const int*) &a;
            os << "{" << ip[0];
            for (int lane = 1; lane < 16; ++lane)
                os << ", " << ip[lane];
            os << "}";
            return os;
        }
#endif

    /* Element Access for Debug, No data modified */
    const int& operator[](int i)const
        {
            assert(static_cast<unsigned int>(i) < 16);    /* Only 16 elements to access */
            const int *ip = (const int*)&vec;
            return *(ip+i);
        }

    /* Element Access for Debug */
    int& operator[](int i)
        {
            assert(static_cast<unsigned int>(i) < 16);    /* Only 16 elements to access */
            int *ip = (int*)&vec;
            return *(ip+i);
        }

    /* Scatter the lanes to memory: base address f, per-lane indices i,
       scale _MM_SCALE_4.  The integer scatter intrinsic is used because
       vec is an integer vector (the float variant _mm512_i32scatter_ps
       expects a __m512 value); this matches Iu32vec16::scatter. */
    void scatter(int * f, I32vec16 i) const
        {
            _mm512_i32scatter_epi32((float*)f, i, vec, _MM_SCALE_4);
        }

    /* Register swizzles: each returns a new vector and leaves this
       object unchanged. */
    Is32vec16 cdab() const { return (Is32vec16)_mm512_swizzle_epi32(vec, _MM_SWIZ_REG_CDAB); }
    Is32vec16 badc() const { return (Is32vec16)_mm512_swizzle_epi32(vec, _MM_SWIZ_REG_BADC); }
    Is32vec16 aaaa() const { return (Is32vec16)_mm512_swizzle_epi32(vec, _MM_SWIZ_REG_AAAA); }
    Is32vec16 bbbb() const { return (Is32vec16)_mm512_swizzle_epi32(vec, _MM_SWIZ_REG_BBBB); }
    Is32vec16 cccc() const { return (Is32vec16)_mm512_swizzle_epi32(vec, _MM_SWIZ_REG_CCCC); }
    Is32vec16 dddd() const { return (Is32vec16)_mm512_swizzle_epi32(vec, _MM_SWIZ_REG_DDDD); }
    Is32vec16 dacb() const { return (Is32vec16)_mm512_swizzle_epi32(vec, _MM_SWIZ_REG_DACB); }

    /* Shuffle: p32 is applied by _mm512_shuffle_epi32, then p128 by the
       128-bit permute.  The unmasked permute is used because only a
       source vector and a pattern are supplied; the masked form
       additionally takes a pass-through vector and a write-mask. */
    template <_MM_PERM_ENUM p128, _MM_PERM_ENUM p32>
    Is32vec16 shuffle() const
        { return (Is32vec16)_mm512_permute4f128_epi32(
             _mm512_shuffle_epi32(vec, p32), p128); }
};

/* Compares */
/* Per-lane a == b. */
inline VecMask16 cmpeq(const Is32vec16 &a, const Is32vec16 &b)
{
    return VecMask16(_mm512_cmpeq_epi32_mask(a, b));
}

/* Per-lane a != b. */
inline VecMask16 cmpneq(const Is32vec16 &a, const Is32vec16 &b)
{
    return VecMask16(_mm512_cmpneq_epi32_mask(a, b));
}

/* Per-lane a > b, expressed as b < a with the less-than compare. */
inline VecMask16 cmpgt(const Is32vec16 &a, const Is32vec16 &b)
{
    return VecMask16(_mm512_cmplt_epi32_mask(b, a));
}

/* Per-lane a < b. */
inline VecMask16 cmplt(const Is32vec16 &a, const Is32vec16 &b)
{
    return VecMask16(_mm512_cmplt_epi32_mask(a, b));
}




/*
 * Class Iu32vec16
 * 16 elements, each element unsigned int
 */
class Iu32vec16 : public I32vec16  
{
public:
    Iu32vec16() { }
    Iu32vec16(__m512i mm) : I32vec16(mm) { }

    EXPLICIT Iu32vec16(const unsigned int ui) : I32vec16((int)ui) { }
    EXPLICIT Iu32vec16(const unsigned __int64 ui) : I32vec16((__int64)ui) { }

    Iu32vec16(const unsigned int ui3, const unsigned int ui2,
              const unsigned int ui1, const unsigned int ui0)
        : I32vec16(ui3, ui2, ui1, ui0) { }

    Iu32vec16(const unsigned int ui15, const unsigned int ui14,
              const unsigned int ui13, const unsigned int  ui12,
              const unsigned int ui11, const unsigned int  ui10,
              const unsigned int  ui9, const unsigned int  ui8,
              const unsigned int  ui7, const unsigned int  ui6,
              const unsigned int  ui5, const unsigned int  ui4,
              const unsigned int  ui3, const unsigned int  ui2,
              const unsigned int  ui1, const unsigned int  ui0)
        : I32vec16(ui15, ui14, ui13, ui12, ui11, ui10, ui9, ui8,
                    ui7,  ui6,  ui5,  ui4,  ui3,  ui2, ui1, ui0) { }

    Iu32vec16(unsigned int ui[16]) : I32vec16((int*) ui) { }
    
    /* Special constructor - gather elements from memory locations */
    Iu32vec16(unsigned int *a, I32vec16 b) : I32vec16((int*)a, b) { }

    /* Assignment Operator */
    Iu32vec16& operator= (const M512 &a)
        { return *this = (Iu32vec16) a; }

    /* Logical Assignment Operators */
    Iu32vec16& operator&=(const M512 &a)
        { return *this = (Iu32vec16) _mm512_and_epi32(vec,a); }

    Iu32vec16& operator|=(const M512 &a)
        { return *this = (Iu32vec16) _mm512_or_epi32(vec,a); }

    Iu32vec16& operator^=(const M512 &a)
        { return *this = (Iu32vec16) _mm512_xor_epi32(vec,a); }

    /* Addition & Subtraction Assignment Operators */
    Iu32vec16& operator +=(const Iu32vec16 &a)
        { return *this = (Iu32vec16)_mm512_add_epi32(vec,a); }

    Iu32vec16& operator -=(const Iu32vec16 &a)
        { return *this = (Iu32vec16)_mm512_sub_epi32(vec,a); }   

    /* Shift Logical Operators */
    Iu32vec16 operator<<(const M512 &a)
        { return _mm512_sllv_epi32(vec,a); }

    Iu32vec16& operator<<=(const M512 &a)
        { return *this = (Iu32vec16)_mm512_sllv_epi32(vec,a); }

    Iu32vec16 operator>>(const M512 &a)
        { return _mm512_srlv_epi32(vec,a); }

    Iu32vec16& operator>>=(const M512 &a)
        { return *this = (Iu32vec16) _mm512_srlv_epi32(vec,a); }

#if defined(MICVEC_DEFINE_OUTPUT_OPERATORS)
    friend MICVEC_STD ostream& operator<< (MICVEC_STD ostream &os, const Iu32vec16 &a)
        {
            unsigned int *ip = (unsigned int*) &a;
            os   << "{" << *(ip + 0)
                 << ", " << *(ip + 1)
                 << ", " << *(ip + 2)
                 << ", " << *(ip + 3)
                 << ", " << *(ip + 4)
                 << ", " << *(ip + 5)
                 << ", " << *(ip + 6)
                 << ", " << *(ip + 7)
                 << ", " << *(ip + 8)
                 << ", " << *(ip + 9)
                 << ", " << *(ip + 10)
                 << ", " << *(ip + 11)
                 << ", " << *(ip + 12)
                 << ", " << *(ip + 13)
                 << ", " << *(ip + 14)
                 << ", " << *(ip + 15) << "}";
            return os;
        }
#endif

    /* Element Access for Debug, No data modified */
    const unsigned int& operator[](int i)const
        {
            assert(static_cast<unsigned int>(i) < 16);    /* Only 16 elements to access */
            unsigned int *ip = (unsigned int*)&vec;
            return *(ip+i);
        } 
    
    /* Element Access for Debug */
    unsigned int& operator[](int i)
        {
            assert(static_cast<unsigned int>(i) < 16);    /* Only 16 elements to access */
            unsigned int *ip = (unsigned int*)&vec;
            return *(ip+i);
        } 
    
    void scatter(unsigned int * f, I32vec16 i)
        {
            _mm512_i32scatter_epi32((float*)f, i, vec, _MM_SCALE_4);
        }


    /* Register swizzles */
    Iu32vec16 cdab() { return (Iu32vec16)_mm512_swizzle_epi32(vec, _MM_SWIZ_REG_CDAB); }
    Iu32vec16 badc() { return (Iu32vec16)_mm512_swizzle_epi32(vec, _MM_SWIZ_REG_BADC); }
    Iu32vec16 aaaa() { return (Iu32vec16)_mm512_swizzle_epi32(vec, _MM_SWIZ_REG_AAAA); }
    Iu32vec16 bbbb() { return (Iu32vec16)_mm512_swizzle_epi32(vec, _MM_SWIZ_REG_BBBB); }
    Iu32vec16 cccc() { return (Iu32vec16)_mm512_swizzle_epi32(vec, _MM_SWIZ_REG_CCCC); }
    Iu32vec16 dddd() { return (Iu32vec16)_mm512_swizzle_epi32(vec, _MM_SWIZ_REG_DDDD); }
    Iu32vec16 dacb() { return (Iu32vec16)_mm512_swizzle_epi32(vec, _MM_SWIZ_REG_DACB); }

    /* Shuffle */
    template <_MM_PERM_ENUM p128, _MM_PERM_ENUM p32>
    Iu32vec16 shuffle()
        { return (Iu32vec16)_mm512_mask_permute4f128_epi32(
             _mm512_shuffle_epi32(vec, p32), p128); }
};



/*
 * Class I64vec8
 * 8 elements, each element signed or unsigned 64-bit integer 
 */
class I64vec8 : public M512
{
    /*
     * 8 packed 64-bit integers of unspecified signedness: construction,
     * bitwise assignment, element access, stream output, swizzles and
     * shuffle.
     */
public:
    /* Default constructor: the lanes are left uninitialized. */
    I64vec8() { }

    /* Wrap an existing raw vector. */
    I64vec8(__m512i mm) : M512(mm) { }

    /* Broadcast one value to all 8 lanes. */
    EXPLICIT I64vec8(const __int64 i)
        { vec = _mm512_set1_epi64(i);}

    EXPLICIT I64vec8(const int i)
        { vec = _mm512_set1_epi64(i);}

    /* Fill the vector from four values through _mm512_set4_epi64.
       Note that the arguments are forwarded in reverse order. */
    I64vec8(const __int64 i3, const __int64 i2,
            const __int64 i1, const __int64 i0)
        {
            vec = _mm512_set4_epi64(i0, i1, i2, i3);
        }

    /* Set all 8 lanes individually through _mm512_set_epi64. */
    I64vec8(const __int64 i7, const __int64 i6,
            const __int64 i5, const __int64 i4,
            const __int64 i3, const __int64 i2,
            const __int64 i1, const __int64 i0)
        {
            vec = _mm512_set_epi64(i7, i6, i5, i4,
                                   i3, i2, i1, i0);
        }

    /* Load 8 values from memory through _mm512_load_epi64.
       NOTE(review): presumably requires 64-byte alignment -- confirm. */
    I64vec8(__int64 i[8])
        {
            vec = _mm512_load_epi64((char*) i);
        }

    /* Assignment Operator: accepts any M512 and copies its bits. */
    I64vec8& operator = (const M512 &a)
        { return *this = (I64vec8) a; }

    /* Logical Assignment Operators */
    I64vec8& operator &= (const M512 &a)
        { return *this = (I64vec8)_mm512_and_epi64(vec,a); }

    I64vec8& operator |= (const M512 &a)
        { return  *this = (I64vec8)_mm512_or_epi64(vec,a); }

    I64vec8& operator ^= (const M512 &a)
        { return *this = (I64vec8)_mm512_xor_epi64(vec,a); }

#if defined(MICVEC_DEFINE_OUTPUT_OPERATORS)
    /* Print the 8 lanes as "{e0, e1, ..., e7}", lane 0 first. */
    friend MICVEC_STD ostream& operator << (MICVEC_STD ostream &os,
                                            const I64vec8 &a)
        {
            const __int64 *ip = (const __int64*)&a;
            os << "{" << ip[0];
            for (int lane = 1; lane < 8; ++lane)
                os << ", " << ip[lane];
            os << "}";
            return os;
        }
#endif

    /* Element Access Only, no modifications to elements*/
    const __int64& operator[](const int i) const
        {
            /* Assert enabled only during debug /DDEBUG */
            assert((0 <= i) && (i <= 7)); /* User should only access elements 0-7 */
            const __int64 *ip = (const __int64*)&vec;
            return *(ip+i);
        }

    /* Element Access and Modification*/
    __int64& operator[](const int i)
        {
            /* Assert enabled only during debug /DDEBUG */
            assert((0 <= i) && (i <= 7)); /* User should only access elements 0-7 */
            __int64  *ip = (__int64*)&vec;
            return *(ip+i);
        }

    /* Register swizzles: each returns a new vector and leaves this
       object unchanged. */
    I64vec8 cdab() const { return (I64vec8)_mm512_swizzle_epi64(vec, _MM_SWIZ_REG_CDAB); }
    I64vec8 badc() const { return (I64vec8)_mm512_swizzle_epi64(vec, _MM_SWIZ_REG_BADC); }
    I64vec8 aaaa() const { return (I64vec8)_mm512_swizzle_epi64(vec, _MM_SWIZ_REG_AAAA); }
    I64vec8 bbbb() const { return (I64vec8)_mm512_swizzle_epi64(vec, _MM_SWIZ_REG_BBBB); }
    I64vec8 cccc() const { return (I64vec8)_mm512_swizzle_epi64(vec, _MM_SWIZ_REG_CCCC); }
    I64vec8 dddd() const { return (I64vec8)_mm512_swizzle_epi64(vec, _MM_SWIZ_REG_DDDD); }
    I64vec8 dacb() const { return (I64vec8)_mm512_swizzle_epi64(vec, _MM_SWIZ_REG_DACB); }

    /* Shuffle: p32 is applied by _mm512_shuffle_epi32, then p128 by the
       128-bit permute.  The unmasked permute is used because only a
       source vector and a pattern are supplied; the masked form
       additionally takes a pass-through vector and a write-mask.
       NOTE(review): both steps use the 32-bit-lane intrinsics although
       the elements are 64-bit -- confirm the intended granularity. */
    template <_MM_PERM_ENUM p128, _MM_PERM_ENUM p32>
    I64vec8 shuffle() const
        { return (I64vec8)_mm512_permute4f128_epi32(
             _mm512_shuffle_epi32(vec, p32), p128); }
};


/* Per-lane unsigned a == b. */
inline VecMask16 cmpeq(const Iu32vec16 &a, const Iu32vec16 &b)
{
    return VecMask16(_mm512_cmpeq_epu32_mask(a, b));
}

/* Per-lane unsigned a != b. */
inline VecMask16 cmpneq(const Iu32vec16 &a, const Iu32vec16 &b)
{
    return VecMask16(_mm512_cmpneq_epu32_mask(a, b));
}


/****************************** Logicals *************************************/

/* Bitwise AND, typed as I32vec16. */
inline I32vec16 operator & (const I32vec16 &a, const I32vec16 &b)
{
    return I32vec16(_mm512_and_epi32(a, b));
}

/* Bitwise OR, typed as I32vec16. */
inline I32vec16 operator | (const I32vec16 &a, const I32vec16 &b)
{
    return I32vec16(_mm512_or_epi32(a, b));
}

/* Bitwise XOR, typed as I32vec16. */
inline I32vec16 operator ^ (const I32vec16 &a, const I32vec16 &b)
{
    return I32vec16(_mm512_xor_epi32(a, b));
}

/* And-not, forwarded to _mm512_andnot_epi32(a, b). */
inline I32vec16 andn (const I32vec16 &a, const I32vec16 &b)
{
    return I32vec16(_mm512_andnot_epi32(a, b));
}


/* Bitwise AND, typed as Iu32vec16. */
inline Iu32vec16 operator & (const Iu32vec16 &a, const Iu32vec16 &b)
{
    return Iu32vec16(_mm512_and_epi32(a, b));
}

/* Bitwise OR, typed as Iu32vec16. */
inline Iu32vec16 operator | (const Iu32vec16 &a, const Iu32vec16 &b)
{
    return Iu32vec16(_mm512_or_epi32(a, b));
}

/* Bitwise XOR, typed as Iu32vec16. */
inline Iu32vec16 operator ^ (const Iu32vec16 &a, const Iu32vec16 &b)
{
    return Iu32vec16(_mm512_xor_epi32(a, b));
}

/* And-not, forwarded to _mm512_andnot_epi32(a, b). */
inline Iu32vec16 andn (const Iu32vec16 &a, const Iu32vec16 &b)
{
    return Iu32vec16(_mm512_andnot_epi32(a, b));
}


/* Bitwise AND, typed as Is32vec16. */
inline Is32vec16 operator & (const Is32vec16 &a, const Is32vec16 &b)
{
    return Is32vec16(_mm512_and_epi32(a, b));
}

/* Bitwise OR, typed as Is32vec16. */
inline Is32vec16 operator | (const Is32vec16 &a, const Is32vec16 &b)
{
    return Is32vec16(_mm512_or_epi32(a, b));
}

/* Bitwise XOR, typed as Is32vec16. */
inline Is32vec16 operator ^ (const Is32vec16 &a, const Is32vec16 &b)
{
    return Is32vec16(_mm512_xor_epi32(a, b));
}

/* And-not, forwarded to _mm512_andnot_epi32(a, b). */
inline Is32vec16 andn (const Is32vec16 &a, const Is32vec16 &b)
{
    return Is32vec16(_mm512_andnot_epi32(a, b));
}


/* Bitwise AND, typed as I64vec8. */
inline I64vec8 operator & (const I64vec8 &a, const I64vec8 &b)
{
    return I64vec8(_mm512_and_epi64(a, b));
}

/* Bitwise OR, typed as I64vec8. */
inline I64vec8 operator | (const I64vec8 &a, const I64vec8 &b)
{
    return I64vec8(_mm512_or_epi64(a, b));
}

/* Bitwise XOR, typed as I64vec8. */
inline I64vec8 operator ^ (const I64vec8 &a, const I64vec8 &b)
{
    return I64vec8(_mm512_xor_epi64(a, b));
}

/* And-not, forwarded to _mm512_andnot_epi64(a, b). */
inline I64vec8 andn (const I64vec8 &a, const I64vec8 &b)
{
    return I64vec8(_mm512_andnot_epi64(a, b));
}


/****************************** Add & Sub ************************************/

/* Per-lane 32-bit addition. */
inline I32vec16 operator + (const I32vec16 &a, const I32vec16 &b)
{
    return I32vec16(_mm512_add_epi32(a, b));
}

/* Per-lane 32-bit subtraction (a - b). */
inline I32vec16 operator - (const I32vec16 &a, const I32vec16 &b)
{
    return I32vec16(_mm512_sub_epi32(a, b));
}


/*
 * Conditional select. 
 *      Example: r = select(mask,c,d);
 *      if mask bit at position x is set (1) assign the corresponding
 *      element in r from c, else assign from d.
 */
/* Masked blend: starts from b and moves in the lanes of a selected by
 * mask, using _mm512_mask_mov_ps.
 * NOTE(review): the float move is used for every T, so T must convert
 * to and from the type that intrinsic expects -- confirm which vector
 * classes this is meant to support. */
template <class T> T select(const VecMask16 mask,
                            const T &a,  const T &b)
{
    T blended = (T)(_mm512_mask_mov_ps(b, mask, a));
    return blended;
}

/* Compare and Select.
 *      version of: retval = (a OP b)? c : d;
 *      Where OP is one of the possible comparison operators.
 *      Example: r = select_eq(a,b,c,d);
 *      if "member at position x of the vector a" ==
 *         "member at position x of vector b"
 *      assign the corresponding member in r from c, else assign from d.
 */

/*
 * Generates select_<selop>(a, b, c, d) for one combination of operand
 * type (I<vect12>vec<element>) and result type (I<vect34>vec<element>).
 * a and b are compared with _mm512_cmp<selop>_epi32_mask; the masked OR
 * of c with itself is then applied on top of d.
 */
#define IVEC512_SELECT(vect12,vect34,element,selop)                         \
    inline I##vect34##vec##element select_##selop (                         \
            const I##vect12##vec##element &a,                               \
            const I##vect12##vec##element &b,                               \
            const I##vect34##vec##element &c,                               \
            const I##vect34##vec##element &d)                               \
{                                                                           \
    const __mmask chosen = _mm512_cmp##selop##_epi32_mask(a,b);             \
    const I##vect34##vec##element fallback = d;                             \
    return I##vect34##vec##element(                                         \
               _mm512_mask_or_epi32(fallback, chosen, c, c));               \
}

    /* Equality selects for signed, unsigned and sign-agnostic results */
    IVEC512_SELECT(32,s32,16,eq)
    IVEC512_SELECT(32,u32,16,eq)
    IVEC512_SELECT(32,32,16,eq)
    /* Inequality selects for the same three result types */
    IVEC512_SELECT(32,s32,16,neq)
    IVEC512_SELECT(32,u32,16,neq)
    IVEC512_SELECT(32,32,16,neq)

#undef IVEC512_SELECT


/*
 * Class F32vec16
 * 16 elements, each element float32 
 */
class F32vec16
{
protected:
        __m512 vec;
public:
    
    /* Constructors: __m512, 16 floats, 4 floats, 1 float */
    F32vec16()                      {  }
    F32vec16(__m512 m)              { vec = m; }

    EXPLICIT F32vec16(const float f)
        { vec = _mm512_set1_ps(f); }
        
    EXPLICIT F32vec16(const double f)
        { vec = _mm512_set1_ps((float)f); }
        
    F32vec16(const float f3, const float f2, const float f1, const float f0)
        {
            vec = _mm512_set4_ps(f0,f1,f2,f3);
        }

    F32vec16(const float f15, const float f14, const float f13, const float f12,
             const float f11, const float f10, const float f9, const float f8,
             const float f7, const float f6, const float f5, const float f4,
             const float f3, const float f2, const float f1, const float f0)
        {
            vec = _mm512_set_ps(f15, f14, f13, f12,
                                f11, f10, f9, f8,
                                f7, f6, f5, f4,
                                f3, f2, f1, f0);
        }

    F32vec16(float f[16])           {  vec = SETA_PS(f);}

    /* Special constructor - gather elements from memory locations */
    F32vec16(float *a, I32vec16 b)
        {
            vec = _mm512_i32gather_ps(b, a, _MM_SCALE_4);
        }

    /* Conversion functions */
    operator  __m512() const
        { return vec; }

    /* Logical Operators: the float lanes are reinterpreted as integers,
       combined bitwise, and reinterpreted back. */
    F32vec16& operator&=(const F32vec16 &a)
        {
            vec = _mm512_castsi512_ps(
                      _mm512_and_epi32(_mm512_castps_si512(vec),
                                       _mm512_castps_si512(a.vec)));
            return *this;
        }

    F32vec16& operator|=(const F32vec16 &a)
        {
            vec = _mm512_castsi512_ps(
                      _mm512_or_epi32(_mm512_castps_si512(vec),
                                      _mm512_castps_si512(a.vec)));
            return *this;
        }

    F32vec16& operator^=(const F32vec16 &a)
        {
            vec = _mm512_castsi512_ps(
                      _mm512_xor_epi32(_mm512_castps_si512(vec),
                                       _mm512_castps_si512(a.vec)));
            return *this;
        }

    /* Arithmetic Operators */
    /* Vector (+) vector, per lane */
    friend F32vec16 operator +(const F32vec16 &a, const F32vec16 &b)
        { return F32vec16(_mm512_add_ps(a.vec, b.vec)); }

    /* Vector (+) scalar: the scalar is broadcast to all lanes first */
    friend F32vec16 operator +(const F32vec16 &a, const float b)
        {
            const F32vec16 splat(b);
            return F32vec16(_mm512_add_ps(a.vec, splat.vec));
        }

    /* Vector (-) vector, per lane */
    friend F32vec16 operator -(const F32vec16 &a, const F32vec16 &b)
        { return F32vec16(_mm512_sub_ps(a.vec, b.vec)); }

    /* Vector (-) scalar */
    friend F32vec16 operator -(const F32vec16 &a, const float b)
        {
            const F32vec16 splat(b);
            return F32vec16(_mm512_sub_ps(a.vec, splat.vec));
        }

    /* Vector (*) vector, per lane */
    friend F32vec16 operator *(const F32vec16 &a, const F32vec16 &b)
        { return F32vec16(_mm512_mul_ps(a.vec, b.vec)); }

    /* Vector (*) scalar */
    friend F32vec16 operator *(const F32vec16 &a, const float b)
        {
            const F32vec16 splat(b);
            return F32vec16(_mm512_mul_ps(a.vec, splat.vec));
        }

    /* Vector (/) vector, per lane */
    friend F32vec16 operator /(const F32vec16 &a, const F32vec16 &b)
        { return F32vec16(_mm512_div_ps(a.vec, b.vec)); }

    /* Vector (/) scalar */
    friend F32vec16 operator /(const F32vec16 &a, const float b)
        {
            const F32vec16 splat(b);
            return F32vec16(_mm512_div_ps(a.vec, splat.vec));
        }

    /* In-place add of a vector / of a broadcast scalar */
    F32vec16& operator +=(const F32vec16 &b)
        {
            vec = _mm512_add_ps(vec, b.vec);
            return *this;
        }

    F32vec16& operator +=(const float b)
        {
            vec = _mm512_add_ps(vec, _mm512_set1_ps(b));
            return *this;
        }

    /* In-place subtract of a vector / of a broadcast scalar */
    F32vec16& operator -=(const F32vec16 &b)
        {
            vec = _mm512_sub_ps(vec, b.vec);
            return *this;
        }

    F32vec16& operator -=(const float b)
        {
            vec = _mm512_sub_ps(vec, _mm512_set1_ps(b));
            return *this;
        }

    /* In-place multiply by a vector / by a broadcast scalar */
    F32vec16& operator *=(const F32vec16 &b)
        {
            vec = _mm512_mul_ps(vec, b.vec);
            return *this;
        }

    F32vec16& operator *=(const float b)
        {
            vec = _mm512_mul_ps(vec, _mm512_set1_ps(b));
            return *this;
        }

    /* In-place divide by a vector / by a broadcast scalar */
    F32vec16& operator /=(const F32vec16 &b)
        {
            vec = _mm512_div_ps(vec, b.vec);
            return *this;
        }

    F32vec16& operator /=(const float b)
        {
            vec = _mm512_div_ps(vec, _mm512_set1_ps(b));
            return *this;
        }


    /* Unary '-' */
    /* Negation is computed as (broadcast 0.0) - vec.
       NOTE(review): being a subtraction rather than a sign-bit flip,
       a +0.0 lane presumably stays +0.0 -- confirm this is intended. */
    F32vec16 operator - ( ) const
        {
            const F32vec16 zero(0.0);
            return zero - vec;
        }

    F32vec16 operator - ( )
        {
            const F32vec16 zero(0.0);
            return zero - vec;
        }
    
    /* Reduce functions */
    friend float reduce_add(const F32vec16 &a)  { return _mm512_reduce_add_ps(a);}
    friend float reduce_mul(const F32vec16 &a)  { return _mm512_reduce_mul_ps(a);}
    friend float reduce_max(const F32vec16 &a)  { return _mm512_reduce_max_ps(a);}
    friend float reduce_min(const F32vec16 &a)  { return _mm512_reduce_min_ps(a);}

    /* Horizontal add and mul - for compatibility with SSE's fvec.h */
    friend float add_horizontal(const F32vec16 &a)  { return _mm512_reduce_add_ps(a);}
    friend float mul_horizontal(const F32vec16 &a)  { return _mm512_reduce_mul_ps(a);}

    float reduce_max() { return  _mm512_reduce_max_ps(vec); }
    float reduce_min() { return  _mm512_reduce_min_ps(vec); }
    float reduce_add() { return  _mm512_reduce_add_ps(vec); }
    float reduce_mul() { return  _mm512_reduce_mul_ps(vec); }

    /* Square Root */
    /* Per-lane square root */
    friend F32vec16 sqrt(const F32vec16& a)
        { return F32vec16(_mm512_sqrt_ps(a.vec)); }

    /* Per-lane minimum and maximum */
    friend F32vec16 min(const F32vec16 &a, const F32vec16 &b)
        { return F32vec16(_mm512_min_ps(a.vec, b.vec)); }

    friend F32vec16 max(const F32vec16 &a, const F32vec16 &b)
        { return F32vec16(_mm512_max_ps(a.vec, b.vec)); }

    /* Same operations under the names used by SSE's fvec.h */
    friend F32vec16 simd_min(const F32vec16 &a, const F32vec16 &b)
        { return F32vec16(_mm512_min_ps(a.vec, b.vec)); }

    friend F32vec16 simd_max(const F32vec16 &a, const F32vec16 &b)
        { return F32vec16(_mm512_max_ps(a.vec, b.vec)); }

    /* Compare */
    /* Generates friend cmp<op>(a, b), returning the mask produced by
       _mm512_cmp<op>_ps_mask for the two operands. */
    #define Fvec32s16_COMP(op) \
    friend VecMask16 cmp##op (const F32vec16 &a, const F32vec16 &b) \
        { return VecMask16(_mm512_cmp##op##_ps_mask(a.vec, b.vec)); }

        /* Direct predicates */
        Fvec32s16_COMP(eq)
        Fvec32s16_COMP(lt)
        Fvec32s16_COMP(le)
        Fvec32s16_COMP(unord)
        /* Negated predicates */
        Fvec32s16_COMP(neq)
        Fvec32s16_COMP(nlt)
        Fvec32s16_COMP(nle)
        Fvec32s16_COMP(ord)

    #undef Fvec32s16_COMP

    /* Math */
    /* Generates friend <func>(a), forwarding to _mm512_<func>_ps. */
    #define Fvec32s16_MATH(func)\
    friend F32vec16 func(const F32vec16& a) \
        { return F32vec16(_mm512_##func##_ps(a.vec)); }

        Fvec32s16_MATH(acos)
        Fvec32s16_MATH(acosh)
        Fvec32s16_MATH(asin)
        Fvec32s16_MATH(asinh)
        Fvec32s16_MATH(atan)
        Fvec32s16_MATH(atanh)
        Fvec32s16_MATH(cbrt)
        Fvec32s16_MATH(ceil)
        Fvec32s16_MATH(cos)
        Fvec32s16_MATH(cosh)
        Fvec32s16_MATH(erf)
        Fvec32s16_MATH(erfc)
        Fvec32s16_MATH(erfinv)
        Fvec32s16_MATH(exp2)
        Fvec32s16_MATH(exp)
        Fvec32s16_MATH(floor)
        Fvec32s16_MATH(invsqrt)
        Fvec32s16_MATH(log10)
        Fvec32s16_MATH(log2)
        Fvec32s16_MATH(log)
        Fvec32s16_MATH(nearbyint)
        Fvec32s16_MATH(rint)
        Fvec32s16_MATH(sin)
        Fvec32s16_MATH(sinh)
        Fvec32s16_MATH(tan)
        Fvec32s16_MATH(tanh)
        Fvec32s16_MATH(trunc)

    #undef Fvec32s16_MATH

    /* Generates friend <func>(a, b), forwarding to _mm512_<func>_ps. */
    #define Fvec32s16_MATH2(func)\
    friend F32vec16 func(const F32vec16& a, const F32vec16& b) \
        { return F32vec16(_mm512_##func##_ps(a.vec, b.vec)); }

        Fvec32s16_MATH2(atan2)
        Fvec32s16_MATH2(hypot)
        Fvec32s16_MATH2(pow)

    #undef Fvec32s16_MATH2

/* Debug Features */

#if defined(MICVEC_DEFINE_OUTPUT_OPERATORS)
    /*
     * Stream insertion: prints the 16 float elements, in memory order, as
     * "{e0, e1, ..., e15}" and returns the stream.
     */
    friend MICVEC_STD ostream& operator << (MICVEC_STD ostream &os,
                                         const F32vec16 &a)
        {
            /* View the object's storage as an array of 16 floats. */
            const float *lane = (const float*)&a;

            os << "{" << lane[0];
            for (int k = 1; k <= 15; ++k)
                os << ", " << lane[k];
            os << "}";
            return os;
        }
#endif

    /*
     * Read-only element access.
     * Returns a const reference to element i of the vector's storage.
     * The bounds check is an assert(), so it disappears when NDEBUG is defined.
     */
    const float& operator[](const int i) const
        {
            assert((0 <= i) && (i <= 15)); /* valid element indices are 0-15 */
            const float *lanes = (const float*)&vec;
            return lanes[i];
        }

    /*
     * Read/write element access.
     * Returns a mutable reference to element i of the vector's storage.
     * The bounds check is an assert(), so it disappears when NDEBUG is defined.
     */
    float& operator[](const int i)
        {
            assert((0 <= i) && (i <= 15)); /* valid element indices are 0-15 */
            float *lanes = (float*)&vec;
            return lanes[i];
        }

    void scatter(float * f, I32vec16 i)
        {
            _mm512_i32scatter_ps(f, i, vec, _MM_SCALE_4);
        }

    /*
     * Register swizzles.
     * Each method returns a new vector produced by _mm512_swizzle_ps with the
     * _MM_SWIZ_REG_* pattern named by the method; *this is left untouched, so
     * the methods are const and usable on const objects and references.
     */
    F32vec16 cdab() const { return (F32vec16)_mm512_swizzle_ps(vec, _MM_SWIZ_REG_CDAB); }
    F32vec16 badc() const { return (F32vec16)_mm512_swizzle_ps(vec, _MM_SWIZ_REG_BADC); }
    F32vec16 aaaa() const { return (F32vec16)_mm512_swizzle_ps(vec, _MM_SWIZ_REG_AAAA); }
    F32vec16 bbbb() const { return (F32vec16)_mm512_swizzle_ps(vec, _MM_SWIZ_REG_BBBB); }
    F32vec16 cccc() const { return (F32vec16)_mm512_swizzle_ps(vec, _MM_SWIZ_REG_CCCC); }
    F32vec16 dddd() const { return (F32vec16)_mm512_swizzle_ps(vec, _MM_SWIZ_REG_DDDD); }
    F32vec16 dacb() const { return (F32vec16)_mm512_swizzle_ps(vec, _MM_SWIZ_REG_DACB); }

    /*
     * Two-level shuffle of the vector's bit pattern.
     *   p32  - pattern handed to _mm512_shuffle_epi32 (applied first)
     *   p128 - pattern handed to _mm512_permute4f128_epi32 (applied second)
     * The float vector is reinterpreted as integers for the two integer
     * intrinsics and reinterpreted back for the result.  The unmasked permute
     * is used: the masked variant needs an extra source and mask operand,
     * which this interface does not supply.
     */
    template <_MM_PERM_ENUM p128, _MM_PERM_ENUM p32>
    F32vec16 shuffle() const
        { return _mm512_castsi512_ps(_mm512_permute4f128_epi32(
             _mm512_shuffle_epi32(_mm512_castps_si512(vec), p32), p128)); }
};

/*
 * Class F64vec8
 * 8 elements, each element float64.
 * Thin wrapper around one __m512d; it converts implicitly from and to
 * __m512d, so objects can be handed directly to the _mm512_*_pd intrinsics.
 */
class F64vec8
{
protected:
    __m512d vec;
public:
    /* Default constructor: the vector is left uninitialized. */
    F64vec8() {}
    /* Wrap a raw __m512d value. */
    F64vec8(__m512d m)           { vec = m;}
    /* Load 8 doubles from d via SETA_PD (_mm512_load_pd); d may point to const data. */
    F64vec8(const double d[8])  {  vec = SETA_PD(d);}

    /* Build from 8 scalars, forwarded to _mm512_set_pd in the order given. */
    F64vec8(const double d7, const double d6, const double d5, const double d4,
            const double d3, const double d2, const double d1, const double d0)
        {
            vec = _mm512_set_pd(d7, d6, d5, d4,
                                     d3, d2, d1, d0);
        }

    /*
     * Build from a repeated group of 4 scalars via _mm512_set4_pd.
     * NOTE(review): the arguments are forwarded as (d0,d1,d2,d3), i.e. reversed
     * relative to the 8-argument constructor - confirm the resulting lane
     * order against the intrinsic's documentation.
     */
    F64vec8(const double d3, const double d2, const double d1, const double d0)
        {
            vec = _mm512_set4_pd(d0,d1,d2,d3);
        }

    /* Broadcast one scalar to all 8 elements. */
    EXPLICIT F64vec8(const double d) { vec = _mm512_set1_pd(d); }
    EXPLICIT F64vec8(const float d) { vec = _mm512_set1_pd((double)d); }

    /* Conversion functions */
    operator  __m512d() const    { return vec; }

    /* Bitwise compound assignment on the raw 64-bit element patterns. */
    F64vec8& operator&=(const F64vec8 &a) { return *this = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(vec),_mm512_castpd_si512(a))); }
    F64vec8& operator|=(const F64vec8 &a) { return *this = _mm512_castsi512_pd(_mm512_or_epi64(_mm512_castpd_si512(vec),_mm512_castpd_si512(a))); }
    F64vec8& operator^=(const F64vec8 &a) { return *this = _mm512_castsi512_pd(_mm512_xor_epi64(_mm512_castpd_si512(vec),_mm512_castpd_si512(a))); }

    /* Arithmetic Operators (the scalar overloads broadcast b first) */
    friend F64vec8 operator +(const F64vec8 &a, const F64vec8 &b)
        { return _mm512_add_pd(a,b); }

    friend F64vec8 operator +(const F64vec8 &a, const double b)
        { return _mm512_add_pd(a,_mm512_set1_pd(b)); }

    friend F64vec8 operator -(const F64vec8 &a, const F64vec8 &b)
        { return _mm512_sub_pd(a,b); }

    friend F64vec8 operator -(const F64vec8 &a, const double b)
        { return _mm512_sub_pd(a,_mm512_set1_pd(b)); }

    friend F64vec8 operator *(const F64vec8 &a, const F64vec8 &b)
        { return _mm512_mul_pd(a,b); }

    friend F64vec8 operator *(const F64vec8 &a, const double b)
        { return _mm512_mul_pd(a,_mm512_set1_pd(b)); }

    friend F64vec8 operator /(const F64vec8 &a, const F64vec8 &b)
        { return _mm512_div_pd(a,b); }

    friend F64vec8 operator /(const F64vec8 &a, const double b)
        { return _mm512_div_pd(a,_mm512_set1_pd(b)); }

    F64vec8& operator +=(const F64vec8 &b)
        { return *this = _mm512_add_pd(vec, b); }

    F64vec8& operator +=(const double b)
        { return *this = _mm512_add_pd(vec, _mm512_set1_pd(b)); }

    F64vec8& operator -=(const F64vec8 &b)
        { return *this = _mm512_sub_pd(vec, b); }

    F64vec8& operator -=(const double b)
        { return *this = _mm512_sub_pd(vec, _mm512_set1_pd(b)); }

    F64vec8& operator *=(const F64vec8 &b)
        { return *this = _mm512_mul_pd(vec, b); }

    F64vec8& operator *=(const double b)
        { return *this = _mm512_mul_pd(vec, _mm512_set1_pd(b)); }

    F64vec8& operator /=(const F64vec8 &b)
        { return *this = _mm512_div_pd(vec, b); }

    F64vec8& operator /=(const double b)
        { return *this = _mm512_div_pd(vec, _mm512_set1_pd(b)); }

    /*
     * Unary '-', computed as (0.0 - x) per element.
     * NOTE(review): because this is a subtraction from zero rather than a
     * sign-bit flip, a +0.0 element presumably stays +0.0 - confirm no caller
     * depends on getting -0.0.
     */
    F64vec8 operator - ( ) const { return F64vec8(0.0) - vec; }
    F64vec8 operator - ( )       { return F64vec8(0.0) - vec; }

    /* Min and Max */
    friend F64vec8 min(const F64vec8 &a, const F64vec8 &b) { return _mm512_min_pd(a,b); }
    friend F64vec8 max(const F64vec8 &a, const F64vec8 &b) { return _mm512_max_pd(a,b); }

    /* SIMD Min and Max - added for compatibility with SSE's fvec.h */
    friend F64vec8 simd_min(const F64vec8 &a, const F64vec8 &b) { return _mm512_min_pd(a,b); }
    friend F64vec8 simd_max(const F64vec8 &a, const F64vec8 &b) { return _mm512_max_pd(a,b); }

    /* Reduce functions: collapse the 8 elements to one scalar */
    friend double reduce_add(const F64vec8 &a)      { return _mm512_reduce_add_pd(a);}
    friend double reduce_mul(const F64vec8 &a)      { return _mm512_reduce_mul_pd(a);}
    friend double reduce_max(const F64vec8 &a)      { return _mm512_reduce_max_pd(a);}
    friend double reduce_min(const F64vec8 &a)      { return _mm512_reduce_min_pd(a);}

    /* Horizontal add and mul - for compatibility with SSE's dvec.h */
    friend double add_horizontal(const F64vec8 &a)      { return _mm512_reduce_add_pd(a);}
    friend double mul_horizontal(const F64vec8 &a)      { return _mm512_reduce_mul_pd(a);}

    /* Square Root */
    friend F64vec8 sqrt(const F64vec8& a){ return _mm512_sqrt_pd(a);}

    /* Compare: cmp<op>(a,b) forwards to _mm512_cmp<op>_pd_mask */
    #define Fvec64s8_COMP(op) \
    friend VecMask16 cmp##op (const F64vec8 &a, const F64vec8 &b) \
        { return _mm512_cmp##op##_pd_mask(a,b); }

        Fvec64s8_COMP(eq)
        Fvec64s8_COMP(lt)
        Fvec64s8_COMP(le)
        Fvec64s8_COMP(unord)
        Fvec64s8_COMP(neq)
        Fvec64s8_COMP(nlt)
        Fvec64s8_COMP(nle)
        Fvec64s8_COMP(ord)

    #undef Fvec64s8_COMP

    /* Math: <func>(a) forwards to _mm512_<func>_pd(a) */
    #define Fvec64s8_MATH(func) \
    friend F64vec8 func(const F64vec8& a) \
        { return  _mm512_##func##_pd(a);}

        Fvec64s8_MATH(acos)
        Fvec64s8_MATH(acosh)
        Fvec64s8_MATH(asin)
        Fvec64s8_MATH(asinh)
        Fvec64s8_MATH(atan)
        Fvec64s8_MATH(atanh)
        Fvec64s8_MATH(cbrt)
        Fvec64s8_MATH(ceil)
        Fvec64s8_MATH(cos)
        Fvec64s8_MATH(cosh)
        Fvec64s8_MATH(erf)
        Fvec64s8_MATH(erfc)
        Fvec64s8_MATH(erfinv)
        Fvec64s8_MATH(exp2)
        Fvec64s8_MATH(exp)
        Fvec64s8_MATH(floor)
        Fvec64s8_MATH(invsqrt)
        Fvec64s8_MATH(log10)
        Fvec64s8_MATH(log2)
        Fvec64s8_MATH(log)
        Fvec64s8_MATH(nearbyint)
        Fvec64s8_MATH(rint)
        Fvec64s8_MATH(svml_round)
        Fvec64s8_MATH(sin)
        Fvec64s8_MATH(sinh)
        Fvec64s8_MATH(tan)
        Fvec64s8_MATH(tanh)
        Fvec64s8_MATH(trunc)

    #undef Fvec64s8_MATH

    /* Math: <func>(a,b) forwards to _mm512_<func>_pd(a,b) */
    #define Fvec64s8_MATH2(func) \
    friend F64vec8 func(const F64vec8& a, const F64vec8& b) \
        { return  _mm512_##func##_pd(a, b);}

        Fvec64s8_MATH2(atan2)
        Fvec64s8_MATH2(hypot)
        Fvec64s8_MATH2(pow)

    #undef Fvec64s8_MATH2

    /* Debug Features */

#if defined(MICVEC_DEFINE_OUTPUT_OPERATORS)
    /* Prints the 8 elements, in memory order, as "{e0, e1, ..., e7}". */
    friend MICVEC_STD ostream& operator<< (MICVEC_STD ostream &os,
                                         const F64vec8 &a)
        {
            const double *fp = (const double*)&a.vec;
            os << "{" << fp[0];
            for (int k = 1; k <= 7; ++k)
                os << ", " << fp[k];
            os << "}";
            return os;
        }
#endif

    /* Element Access Only, no modifications to elements*/
    const double& operator[](const int i) const
        {
            /* assert() is compiled out when NDEBUG is defined */
            assert((0 <= i) && (i <= 7)); /* User should only access elements 0-7 */
            const double *fp = (const double*)&vec;
            return *(fp+i);
        }

    /* Element Access and Modification*/
    double& operator[](const int i)
        {
            /* assert() is compiled out when NDEBUG is defined */
            assert((0 <= i) && (i <= 7)); /* User should only access elements 0-7 */
            double *fp = (double*)&vec;
            return *(fp+i);
        }

    /*
     * Register swizzles.
     * Use the double-precision swizzle so operand and result are both
     * __m512d (the integer form takes and returns __m512i, which this class
     * cannot be constructed from).  *this is not modified.
     */
    F64vec8 cdab() const { return (F64vec8)_mm512_swizzle_pd(vec, _MM_SWIZ_REG_CDAB); }
    F64vec8 badc() const { return (F64vec8)_mm512_swizzle_pd(vec, _MM_SWIZ_REG_BADC); }
    F64vec8 aaaa() const { return (F64vec8)_mm512_swizzle_pd(vec, _MM_SWIZ_REG_AAAA); }
    F64vec8 bbbb() const { return (F64vec8)_mm512_swizzle_pd(vec, _MM_SWIZ_REG_BBBB); }
    F64vec8 cccc() const { return (F64vec8)_mm512_swizzle_pd(vec, _MM_SWIZ_REG_CCCC); }
    F64vec8 dddd() const { return (F64vec8)_mm512_swizzle_pd(vec, _MM_SWIZ_REG_DDDD); }
    F64vec8 dacb() const { return (F64vec8)_mm512_swizzle_pd(vec, _MM_SWIZ_REG_DACB); }

    /*
     * Two-level shuffle of the vector's bit pattern:
     * _mm512_shuffle_epi32 with p32 first, then _mm512_permute4f128_epi32
     * with p128.  The double vector is reinterpreted as integers for the two
     * integer intrinsics and reinterpreted back for the result.  The unmasked
     * permute is used because no mask/source operand is available here.
     */
    template <_MM_PERM_ENUM p128, _MM_PERM_ENUM p32>
    F64vec8 shuffle() const
        { return _mm512_castsi512_pd(_mm512_permute4f128_epi32(
             _mm512_shuffle_epi32(_mm512_castpd_si512(vec), p32), p128)); }

};


/* Logical operations */
/* Bitwise AND of two float vectors, performed on their 32-bit integer views. */
inline F32vec16 operator & (const F32vec16 &a, const F32vec16 &b)
{
    const __m512i lhs_bits = _mm512_castps_si512(a);
    const __m512i rhs_bits = _mm512_castps_si512(b);
    return _mm512_castsi512_ps(_mm512_and_epi32(lhs_bits, rhs_bits));
}

/* Bitwise AND of two double vectors, performed on their 64-bit integer views. */
inline F64vec8  operator & (const F64vec8 &a,  const F64vec8 &b)
{
    const __m512i lhs_bits = _mm512_castpd_si512(a);
    const __m512i rhs_bits = _mm512_castpd_si512(b);
    return _mm512_castsi512_pd(_mm512_and_epi64(lhs_bits, rhs_bits));
}

/* Bitwise OR of two float vectors, performed on their 32-bit integer views. */
inline F32vec16 operator | (const F32vec16 &a, const F32vec16 &b)
{
    const __m512i lhs_bits = _mm512_castps_si512(a);
    const __m512i rhs_bits = _mm512_castps_si512(b);
    return _mm512_castsi512_ps(_mm512_or_epi32(lhs_bits, rhs_bits));
}

/* Bitwise OR of two double vectors, performed on their 64-bit integer views. */
inline F64vec8  operator | (const F64vec8 &a,  const F64vec8 &b)
{
    const __m512i lhs_bits = _mm512_castpd_si512(a);
    const __m512i rhs_bits = _mm512_castpd_si512(b);
    return _mm512_castsi512_pd(_mm512_or_epi64(lhs_bits, rhs_bits));
}

/* Bitwise XOR of two float vectors, performed on their 32-bit integer views. */
inline F32vec16 operator ^ (const F32vec16 &a, const F32vec16 &b)
{
    const __m512i lhs_bits = _mm512_castps_si512(a);
    const __m512i rhs_bits = _mm512_castps_si512(b);
    return _mm512_castsi512_ps(_mm512_xor_epi32(lhs_bits, rhs_bits));
}

/* Bitwise XOR of two double vectors, performed on their 64-bit integer views. */
inline F64vec8  operator ^ (const F64vec8 &a,  const F64vec8 &b)
{
    const __m512i lhs_bits = _mm512_castpd_si512(a);
    const __m512i rhs_bits = _mm512_castpd_si512(b);
    return _mm512_castsi512_pd(_mm512_xor_epi64(lhs_bits, rhs_bits));
}

/*
 * AND-NOT of two float vectors: both operands are viewed as 32-bit integers
 * and handed to _mm512_andnot_epi32 in the order (a, b).
 */
inline F32vec16 andn (const F32vec16 &a, const F32vec16 &b)
{
    const __m512i first_bits  = _mm512_castps_si512(a);
    const __m512i second_bits = _mm512_castps_si512(b);
    return _mm512_castsi512_ps(_mm512_andnot_epi32(first_bits, second_bits));
}

/*
 * AND-NOT of two double vectors: both operands are viewed as 64-bit integers
 * and handed to _mm512_andnot_epi64 in the order (a, b).
 */
inline F64vec8  andn (const F64vec8 &a,  const F64vec8 &b)
{
    const __m512i first_bits  = _mm512_castpd_si512(a);
    const __m512i second_bits = _mm512_castpd_si512(b);
    return _mm512_castsi512_pd(_mm512_andnot_epi64(first_bits, second_bits));
}


/*
 * Conversions.
 * Conversion primitives are implemented as macros due to immediate-operand
 * restrictions.
 */

/*
 * Every conversion macro below expands to a fully parenthesized expression,
 * so a use followed by a postfix operator (e.g. MACRO(v)[0]) applies that
 * operator to the converted vector object and not to the raw intrinsic result.
 */

/*
 * Convert float64 vector to int32 vector.
 * rc is the rounding-control argument passed through to the intrinsic.
 */
#define F64vec8ToI32vec16(v1,rc) \
    ((I32vec16)_mm512_cvtfxpnt_roundpd_epi32lo(v1, rc))

/*
 * Convert float64 vector to float32 vector.
 * rc is the rounding-control argument passed through to the intrinsic.
 */
#define F64vec8ToF32vec16(v1,rc) \
    ((F32vec16)_mm512_cvt_roundpd_pslo(v1, rc))

/*
 * Convert float64 vector to unsigned int32 vector.
 * rc is the rounding-control argument passed through to the intrinsic.
 */
#define F64vec8ToIu32vec16(v1,rc) \
    ((Iu32vec16)_mm512_cvtfxpnt_roundpd_epu32lo(v1, rc))

/*
 * Convert int32 vector to float64 vector.
 */
#define I32vec16ToF64vec8(v1) \
    ((F64vec8)_mm512_cvtepi32lo_pd(v1))

/*
 * Convert int32 vector to float32 vector.
 * Rounding uses _MM_FROUND_CUR_DIRECTION; expadj is passed through to the
 * intrinsic as its exponent-adjustment argument.
 */
#define I32vec16ToF32vec16(v1,expadj) \
    ((F32vec16)_mm512_cvtfxpnt_round_adjustepi32_ps(v1, \
                                                    _MM_FROUND_CUR_DIRECTION, \
                                                    expadj))

/*
 * Convert float32 vector to float64 vector.
 */
#define F32vec16ToF64vec8(v1) \
    ((F64vec8)_mm512_cvtpslo_pd(v1))

/*
 * Convert float32 vector to int32 vector.
 * NOTE(review): the result is wrapped as Is32vec16 whereas F64vec8ToI32vec16
 * uses I32vec16 - confirm the difference is intentional.
 */
#define F32vec16ToI32vec16(v1,rc,expadj) \
    ((Is32vec16)_mm512_cvtfxpnt_round_adjustps_epi32(v1, rc, expadj))

/*
 * Convert float32 vector to unsigned int32 vector.
 */
#define F32vec16ToIu32vec16(v1,rc,expadj) \
    ((Iu32vec16)_mm512_cvtfxpnt_round_adjustps_epu32(v1, rc, expadj))

/*
 * Convert unsigned int32 vector to float32 vector.
 * Rounding uses _MM_FROUND_CUR_DIRECTION; expadj is passed through to the
 * intrinsic as its exponent-adjustment argument.
 */
#define Iu32vec16ToF32vec16(v1,expadj) \
    ((F32vec16)_mm512_cvtfxpnt_round_adjustepu32_ps(v1, \
                                                    _MM_FROUND_CUR_DIRECTION, \
                                                    expadj))

/*
 * Convert unsigned int32 vector to float64 vector.
 */

#define Iu32vec16ToF64vec8(v1) \
    ((F64vec8)_mm512_cvtepu32lo_pd(v1))


/*
 * select_<op>(a, b, c, d) for F64vec8.
 * a and b are compared with _mm512_cmp<op>_pd_mask.  Elements whose mask bit
 * is set receive (c | c), i.e. c; the remaining elements keep d, which is
 * passed as the pass-through source of _mm512_mask_or_epi64.
 */
#define F64vec8_SELECT(op) \
inline F64vec8 select_##op (const F64vec8 &a, const F64vec8 &b, \
    const F64vec8 &c, const F64vec8 &d)                         \
{                                                               \
    const __mmask hit = _mm512_cmp##op##_pd_mask(a,b);          \
    const __m512i if_set   = _mm512_castpd_si512(c);            \
    const __m512i if_clear = _mm512_castpd_si512(d);            \
    return _mm512_castsi512_pd(                                 \
        _mm512_mask_or_epi64(if_clear, hit, if_set, if_set));   \
}

F64vec8_SELECT(eq)        // select_eq(a,b,c,d)
F64vec8_SELECT(lt)        // select_lt(a,b,c,d)
F64vec8_SELECT(le)        // select_le(a,b,c,d)
F64vec8_SELECT(neq)       // select_neq(a,b,c,d)
F64vec8_SELECT(nlt)       // select_nlt(a,b,c,d)
F64vec8_SELECT(nle)       // select_nle(a,b,c,d)
#undef F64vec8_SELECT

/*
 * select_<op>(a, b, c, d) for F32vec16.
 * a and b are compared with _mm512_cmp<op>_ps_mask.  Elements whose mask bit
 * is set receive (c | c), i.e. c; the remaining elements keep d, which is
 * passed as the pass-through source of _mm512_mask_or_epi32.
 */
#define F32vec16_SELECT(op) \
inline F32vec16 select_##op (const F32vec16 &a, const F32vec16 &b,  \
     const F32vec16 &c, const F32vec16 &d)                          \
{                                                                   \
    const __mmask hit = _mm512_cmp##op##_ps_mask(a,b);              \
    const __m512i if_set   = _mm512_castps_si512(c);                \
    const __m512i if_clear = _mm512_castps_si512(d);                \
    return _mm512_castsi512_ps(                                     \
        _mm512_mask_or_epi32(if_clear, hit, if_set, if_set));       \
}

F32vec16_SELECT(eq)     // select_eq(a,b,c,d)
F32vec16_SELECT(lt)     // select_lt(a,b,c,d)
F32vec16_SELECT(le)     // select_le(a,b,c,d)
F32vec16_SELECT(neq)    // select_neq(a,b,c,d)
F32vec16_SELECT(nlt)    // select_nlt(a,b,c,d)
F32vec16_SELECT(nle)    // select_nle(a,b,c,d)
#undef F32vec16_SELECT

#undef MICVEC_DEFINE_OUTPUT_OPERATORS
#undef MICVEC_STD

#ifndef _MSC_VER
#pragma pack(pop) /* 64-B aligned */
#endif

#endif // MICVEC_H_INCLUDED
