Author: Michael R. Crusoe <michael.crusoe@gmail.com>
Description: Enable building on non-x86 architectures via the SIMDE library
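 RAxML's AVX kernels are written directly against x86 intrinsics, so this
 file previously compiled only on amd64. The port is mechanical: the
 <xmmintrin.h>/<pmmintrin.h>/<immintrin.h> includes are replaced by the
 SIMDE (SIMD Everywhere, https://github.com/simd-everywhere/simde) headers
 vendored under debian/include/, every __m256d becomes simde__m256d, every
 _mm256_* intrinsic becomes its simde_mm256_* equivalent, _CMP_LT_OS
 becomes SIMDE_CMP_LT_OS, and the optional FMA fast path is keyed to
 SIMDE's feature macro SIMDE_X86_FMA_NATIVE (provided via simde/x86/fma.h)
 instead of the build system's _FMA flag. On x86, SIMDE expands to the
 original intrinsics, so amd64 builds are unchanged; on other
 architectures it falls back to portable (or NEON/AltiVec-accelerated)
 implementations.
 .
 A minimal sketch of the pattern (illustrative only, not part of the
 patched sources; it assumes the SIMDE headers are on the include path):
 .
   #include <simde/x86/avx.h>
 .
   /* Horizontal sum of four doubles, using the same hadd/permute trick
    * as the hadd3() helper patched below; builds on any architecture. */
   static double sum4(const double *p)    /* p must be 32-byte aligned */
   {
     simde__m256d v = simde_mm256_load_pd(p);
     v = simde_mm256_hadd_pd(v, v);             /* [a+b, a+b, c+d, c+d] */
     simde__m256d s = simde_mm256_permute2f128_pd(v, v, 1); /* swap lanes */
     v = simde_mm256_add_pd(v, s);              /* total in every lane */
     double out[4] __attribute__ ((aligned (32)));
     simde_mm256_store_pd(out, v);
     return out[0];
   }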
--- raxml.orig/avxLikelihood.c
+++ raxml/avxLikelihood.c
@@ -10,13 +10,10 @@
 #include <limits.h>
 #include "axml.h"
 #include <stdint.h>
-#include <xmmintrin.h>
-#include <pmmintrin.h>
-#include <immintrin.h>
-
-#ifdef _FMA
-#include <x86intrin.h>
-#define FMAMACC(a,b,c) _mm256_fmadd_pd(b,c,a) 
+#include "debian/include/simde/x86/avx.h"
+
+#ifdef SIMDE_FMA_NATIVE
+#define FMAMACC(a,b,c) simde_mm256_fmadd_pd(b,c,a) 
 #endif
 
 extern const unsigned int mask32[32];
@@ -24,38 +21,38 @@
 const union __attribute__ ((aligned (BYTE_ALIGNMENT)))
 {
   uint64_t i[4];
-  __m256d m;
+  simde__m256d m;
   
 } absMask_AVX = {{0x7fffffffffffffffULL, 0x7fffffffffffffffULL, 0x7fffffffffffffffULL, 0x7fffffffffffffffULL}};
 
 
 
-static inline __m256d hadd4(__m256d v, __m256d u)
+static inline simde__m256d hadd4(simde__m256d v, simde__m256d u) /* (sum of v's lanes) * (sum of u's lanes), in every lane */
 { 
-  __m256d
+  simde__m256d
     a, b;
   
-  v = _mm256_hadd_pd(v, v);
-  a = _mm256_permute2f128_pd(v, v, 1);
-  v = _mm256_add_pd(a, v);
-
-  u = _mm256_hadd_pd(u, u);
-  b = _mm256_permute2f128_pd(u, u, 1);
-  u = _mm256_add_pd(b, u);
+  v = simde_mm256_hadd_pd(v, v);
+  a = simde_mm256_permute2f128_pd(v, v, 1);
+  v = simde_mm256_add_pd(a, v);
+
+  u = simde_mm256_hadd_pd(u, u);
+  b = simde_mm256_permute2f128_pd(u, u, 1);
+  u = simde_mm256_add_pd(b, u);
 
-  v = _mm256_mul_pd(v, u);	
+  v = simde_mm256_mul_pd(v, u);	
   
   return v;
 }
 
-static inline __m256d hadd3(__m256d v)
+static inline simde__m256d hadd3(simde__m256d v) /* sum of v's lanes, in every lane */
 { 
-  __m256d
+  simde__m256d
     a;
   
-  v = _mm256_hadd_pd(v, v);
-  a = _mm256_permute2f128_pd(v, v, 1);
-  v = _mm256_add_pd(a, v);
+  v = simde_mm256_hadd_pd(v, v);
+  a = simde_mm256_permute2f128_pd(v, v, 1);
+  v = simde_mm256_add_pd(a, v);
   
   return v;
 }
@@ -77,9 +74,9 @@
     scale, 
     addScale = 0;
  
-  __m256d 
-    minlikelihood_avx = _mm256_set1_pd( minlikelihood ),
-    twoto = _mm256_set1_pd(twotothe256);
+  simde__m256d 
+    minlikelihood_avx = simde_mm256_set1_pd( minlikelihood ),
+    twoto = simde_mm256_set1_pd(twotothe256);
  
 
   switch(tipCase)
@@ -94,8 +91,8 @@
 
 	for (i = 1; i < 16; i++)
 	  {	   
-	    __m256d 
-	      tv = _mm256_load_pd(&(tipVector[i * 4]));
+	    simde__m256d 
+	      tv = simde_mm256_load_pd(&(tipVector[i * 4]));
 
 	    int 
 	      j;
@@ -105,13 +102,13 @@
 		for (j = 0; j < 4; j++)
 		  for (k = 0; k < 4; k++)
 		    {		 
-		      __m256d 
-			left1 = _mm256_load_pd(&left[j * 16 + k * 4]);		  		  		  
+		      simde__m256d 
+			left1 = simde_mm256_load_pd(&left[j * 16 + k * 4]);		  		  		  
 		      
-		      left1 = _mm256_mul_pd(left1, tv);		  
+		      left1 = simde_mm256_mul_pd(left1, tv);		  
 		      left1 = hadd3(left1);
 		      
-		      _mm256_store_pd(&umpX1[i * 64 + j * 16 + k * 4], left1);
+		      simde_mm256_store_pd(&umpX1[i * 64 + j * 16 + k * 4], left1);
 		    }
 	      }
 	    
@@ -121,13 +118,13 @@
 		for (j = 0; j < 4; j++)
 		  for (k = 0; k < 4; k++)
 		    {		 
-		      __m256d 
-			left1 = _mm256_load_pd(&right[j * 16 + k * 4]);		  		  		  
+		      simde__m256d 
+			left1 = simde_mm256_load_pd(&right[j * 16 + k * 4]);		  		  		  
 		      
-		      left1 = _mm256_mul_pd(left1, tv);		  
+		      left1 = simde_mm256_mul_pd(left1, tv);		  
 		      left1 = hadd3(left1);
 		      
-		      _mm256_store_pd(&umpX2[i * 64 + j * 16 + k * 4], left1);
+		      simde_mm256_store_pd(&umpX2[i * 64 + j * 16 + k * 4], left1);
 		    }	  
 	      }
 	    
@@ -141,27 +138,27 @@
 	    
 	    for(k = 0; k < 4; k++)
 	      {
-		__m256d	   
-		  xv = _mm256_setzero_pd();
+		simde__m256d	   
+		  xv = simde_mm256_setzero_pd();
 	       
 		int 
 		  l;
 		
 		for(l = 0; l < 4; l++)
 		  {	       	     				      	      																	   
-		    __m256d
-		      x1v =  _mm256_mul_pd(_mm256_load_pd(&uX1[k * 16 + l * 4]), _mm256_load_pd(&uX2[k * 16 + l * 4]));
+		    simde__m256d
+		      x1v =  simde_mm256_mul_pd(simde_mm256_load_pd(&uX1[k * 16 + l * 4]), simde_mm256_load_pd(&uX2[k * 16 + l * 4]));
 		
-		    __m256d 
-		      evv = _mm256_load_pd(&extEV[l * 4]);
-#ifdef _FMA
+		    simde__m256d 
+		      evv = simde_mm256_load_pd(&extEV[l * 4]);
+#ifdef SIMDE_X86_FMA_NATIVE
 		    xv = FMAMACC(xv,x1v,evv);
 #else						  
-		    xv = _mm256_add_pd(xv, _mm256_mul_pd(x1v, evv));
+		    xv = simde_mm256_add_pd(xv, simde_mm256_mul_pd(x1v, evv));
 #endif
 		  }
 		
-		_mm256_store_pd(&x3[16 * i + 4 * k], xv);
+		simde_mm256_store_pd(&x3[16 * i + 4 * k], xv);
 	      }	         	   	    
 	  }
       }
@@ -176,8 +173,8 @@
 	  {
 	    if(mask32[i] & x1_presenceMap)
 	      {
-		__m256d 
-		  tv = _mm256_load_pd(&(tipVector[i*4]));
+		simde__m256d 
+		  tv = simde_mm256_load_pd(&(tipVector[i*4]));
 		
 		int 
 		  j;
@@ -185,20 +182,20 @@
 		for (j = 0; j < 4; j++)
 		  for (k = 0; k < 4; k++)
 		    {		 
-		      __m256d 
-			left1 = _mm256_load_pd(&left[j * 16 + k * 4]);		  		  		  
+		      simde__m256d 
+			left1 = simde_mm256_load_pd(&left[j * 16 + k * 4]);		  		  		  
 		      
-		      left1 = _mm256_mul_pd(left1, tv);		  
+		      left1 = simde_mm256_mul_pd(left1, tv);		  
 		      left1 = hadd3(left1);
 		      
-		      _mm256_store_pd(&umpX1[i * 64 + j * 16 + k * 4], left1);
+		      simde_mm256_store_pd(&umpX1[i * 64 + j * 16 + k * 4], left1);
 		    }	 
 	      }
 	  }   	
 	
 	for(i = 0; i < n; i++)
 	  { 
-	    __m256d
+	    simde__m256d
 	      xv[4];	    	   
 	    
 	    scale = 1;
@@ -206,51 +203,51 @@
 
 	    for(k = 0; k < 4; k++)
 	      {
-		__m256d	   		 
-		  xvr = _mm256_load_pd(&(x2[i * 16 + k * 4]));
+		simde__m256d	   		 
+		  xvr = simde_mm256_load_pd(&(x2[i * 16 + k * 4]));
 
 		int 
 		  l;
 
-		xv[k]  = _mm256_setzero_pd();
+		xv[k]  = simde_mm256_setzero_pd();
 		  
 		for(l = 0; l < 4; l++)
 		  {	       	     				      	      															
-		    __m256d  
-		      x1v = _mm256_load_pd(&uX1[k * 16 + l * 4]),		     
-		      x2v = _mm256_mul_pd(xvr, _mm256_load_pd(&right[k * 16 + l * 4]));			    
+		    simde__m256d  
+		      x1v = simde_mm256_load_pd(&uX1[k * 16 + l * 4]),		     
+		      x2v = simde_mm256_mul_pd(xvr, simde_mm256_load_pd(&right[k * 16 + l * 4]));			    
 			
 		    x2v = hadd3(x2v);
-		    x1v = _mm256_mul_pd(x1v, x2v);			
+		    x1v = simde_mm256_mul_pd(x1v, x2v);			
 		
-		    __m256d 
-		      evv = _mm256_load_pd(&extEV[l * 4]);
+		    simde__m256d 
+		      evv = simde_mm256_load_pd(&extEV[l * 4]);
 			
-#ifdef _FMA
+#ifdef SIMDE_X86_FMA_NATIVE
 		    xv[k] = FMAMACC(xv[k],x1v,evv);
 #else			  
-		    xv[k] = _mm256_add_pd(xv[k], _mm256_mul_pd(x1v, evv));
+		    xv[k] = simde_mm256_add_pd(xv[k], simde_mm256_mul_pd(x1v, evv));
 #endif
 		  }
 		    
 		if(scale)
 		  {
-		    __m256d 	     
-		      v1 = _mm256_and_pd(xv[k], absMask_AVX.m);
+		    simde__m256d 	     
+		      v1 = simde_mm256_and_pd(xv[k], absMask_AVX.m);
 
-		    v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+		    v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 		    
-		    if(_mm256_movemask_pd( v1 ) != 15)
+		    if(simde_mm256_movemask_pd( v1 ) != 15)
 		      scale = 0;
 		  }
 	      }	    
 
 	    if(scale)
 	      {		
-		xv[0] = _mm256_mul_pd(xv[0], twoto);
-		xv[1] = _mm256_mul_pd(xv[1], twoto);
-		xv[2] = _mm256_mul_pd(xv[2], twoto);
-		xv[3] = _mm256_mul_pd(xv[3], twoto);
+		xv[0] = simde_mm256_mul_pd(xv[0], twoto);
+		xv[1] = simde_mm256_mul_pd(xv[1], twoto);
+		xv[2] = simde_mm256_mul_pd(xv[2], twoto);
+		xv[3] = simde_mm256_mul_pd(xv[3], twoto);
 
 		if(useFastScaling)
 		  addScale += wgt[i];
@@ -258,10 +255,10 @@
 		  ex3[i] += 1;
 	      }
 
-	    _mm256_store_pd(&x3[16 * i],      xv[0]);
-	    _mm256_store_pd(&x3[16 * i + 4],  xv[1]);
-	    _mm256_store_pd(&x3[16 * i + 8],  xv[2]);
-	    _mm256_store_pd(&x3[16 * i + 12], xv[3]);
+	    simde_mm256_store_pd(&x3[16 * i],      xv[0]);
+	    simde_mm256_store_pd(&x3[16 * i + 4],  xv[1]);
+	    simde_mm256_store_pd(&x3[16 * i + 8],  xv[2]);
+	    simde_mm256_store_pd(&x3[16 * i + 12], xv[3]);
 	  }
       }
       break;
@@ -269,55 +266,55 @@
       {
 	for(i = 0; i < n; i++)
 	  {	
-	    __m256d
+	    simde__m256d
 	      xv[4];
 	    
 	    scale = 1;
 
 	    for(k = 0; k < 4; k++)
 	      {
-		__m256d	   
+		simde__m256d	   
 		 
-		  xvl = _mm256_load_pd(&(x1[i * 16 + k * 4])),
-		  xvr = _mm256_load_pd(&(x2[i * 16 + k * 4]));
+		  xvl = simde_mm256_load_pd(&(x1[i * 16 + k * 4])),
+		  xvr = simde_mm256_load_pd(&(x2[i * 16 + k * 4]));
 
 		int 
 		  l;
 
-		xv[k] = _mm256_setzero_pd();
+		xv[k] = simde_mm256_setzero_pd();
 
 		for(l = 0; l < 4; l++)
 		  {	       	     				      	      															
-		    __m256d 
-		      x1v = _mm256_mul_pd(xvl, _mm256_load_pd(&left[k * 16 + l * 4])),
-		      x2v = _mm256_mul_pd(xvr, _mm256_load_pd(&right[k * 16 + l * 4]));			    
+		    simde__m256d 
+		      x1v = simde_mm256_mul_pd(xvl, simde_mm256_load_pd(&left[k * 16 + l * 4])),
+		      x2v = simde_mm256_mul_pd(xvr, simde_mm256_load_pd(&right[k * 16 + l * 4]));			    
 			
 		    x1v = hadd4(x1v, x2v);			
 		
-		    __m256d 
-		      evv = _mm256_load_pd(&extEV[l * 4]);
+		    simde__m256d 
+		      evv = simde_mm256_load_pd(&extEV[l * 4]);
 						  
-		    xv[k] = _mm256_add_pd(xv[k], _mm256_mul_pd(x1v, evv));
+		    xv[k] = simde_mm256_add_pd(xv[k], simde_mm256_mul_pd(x1v, evv));
 		  }
 		
 		if(scale)
 		  {
-		    __m256d 	     
-		      v1 = _mm256_and_pd(xv[k], absMask_AVX.m);
+		    simde__m256d 	     
+		      v1 = simde_mm256_and_pd(xv[k], absMask_AVX.m);
 
-		    v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+		    v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 		    
-		    if(_mm256_movemask_pd( v1 ) != 15)
+		    if(simde_mm256_movemask_pd( v1 ) != 15)
 		      scale = 0;
 		  }
 	      }
 
 	     if(scale)
 	      {	
-		xv[0] = _mm256_mul_pd(xv[0], twoto);
-		xv[1] = _mm256_mul_pd(xv[1], twoto);
-		xv[2] = _mm256_mul_pd(xv[2], twoto);
-		xv[3] = _mm256_mul_pd(xv[3], twoto);
+		xv[0] = simde_mm256_mul_pd(xv[0], twoto);
+		xv[1] = simde_mm256_mul_pd(xv[1], twoto);
+		xv[2] = simde_mm256_mul_pd(xv[2], twoto);
+		xv[3] = simde_mm256_mul_pd(xv[3], twoto);
 
 		if(useFastScaling)
 		  addScale += wgt[i];
@@ -325,10 +322,10 @@
 		  ex3[i] += 1;		
 	      }
 		
-	    _mm256_store_pd(&x3[16 * i],      xv[0]);
-	    _mm256_store_pd(&x3[16 * i + 4],  xv[1]);
-	    _mm256_store_pd(&x3[16 * i + 8],  xv[2]);
-	    _mm256_store_pd(&x3[16 * i + 12], xv[3]);
+	    simde_mm256_store_pd(&x3[16 * i],      xv[0]);
+	    simde_mm256_store_pd(&x3[16 * i + 4],  xv[1]);
+	    simde_mm256_store_pd(&x3[16 * i + 8],  xv[2]);
+	    simde_mm256_store_pd(&x3[16 * i + 12], xv[3]);
 	  }
       }
       break;
@@ -362,9 +359,9 @@
     scaleGap,
     addScale = 0;
  
-  __m256d 
-    minlikelihood_avx = _mm256_set1_pd( minlikelihood ),
-    twoto = _mm256_set1_pd(twotothe256);
+  simde__m256d 
+    minlikelihood_avx = simde_mm256_set1_pd( minlikelihood ),
+    twoto = simde_mm256_set1_pd(twotothe256);
  
   double
     *x1,
@@ -385,8 +382,8 @@
 
 	for (i = 1; i < 16; i++)
 	  {
-	    __m256d 
-	      tv = _mm256_load_pd(&(tipVector[i * 4]));
+	    simde__m256d 
+	      tv = simde_mm256_load_pd(&(tipVector[i * 4]));
 
 	    int 
 	      j;
@@ -396,13 +393,13 @@
 		for (j = 0; j < 4; j++)
 		  for (k = 0; k < 4; k++)
 		    {		 
-		      __m256d 
-			left1 = _mm256_load_pd(&left[j * 16 + k * 4]);		  		  		  
+		      simde__m256d 
+			left1 = simde_mm256_load_pd(&left[j * 16 + k * 4]);		  		  		  
 		      
-		      left1 = _mm256_mul_pd(left1, tv);		  
+		      left1 = simde_mm256_mul_pd(left1, tv);		  
 		      left1 = hadd3(left1);
 		      
-		      _mm256_store_pd(&umpX1[i * 64 + j * 16 + k * 4], left1);
+		      simde_mm256_store_pd(&umpX1[i * 64 + j * 16 + k * 4], left1);
 		    }
 	      }
 	  
@@ -411,13 +408,13 @@
 		for (j = 0; j < 4; j++)
 		  for (k = 0; k < 4; k++)
 		    {		 
-		      __m256d 
-			left1 = _mm256_load_pd(&right[j * 16 + k * 4]);		  		  		  
+		      simde__m256d 
+			left1 = simde_mm256_load_pd(&right[j * 16 + k * 4]);		  		  		  
 		      
-		      left1 = _mm256_mul_pd(left1, tv);		  
+		      left1 = simde_mm256_mul_pd(left1, tv);		  
 		      left1 = hadd3(left1);
 		      
-		      _mm256_store_pd(&umpX2[i * 64 + j * 16 + k * 4], left1);
+		      simde_mm256_store_pd(&umpX2[i * 64 + j * 16 + k * 4], left1);
 		    }	  
 	      }
 	  }   	
@@ -430,27 +427,27 @@
 	  
 	  for(k = 0; k < 4; k++)
 	    {
-	      __m256d	   
-		xv = _mm256_setzero_pd();
+	      simde__m256d	   
+		xv = simde_mm256_setzero_pd();
 	      
 	      int 
 		l;
 	      
 	      for(l = 0; l < 4; l++)
 		{	       	     				      	      																	   
-		  __m256d
-		    x1v =  _mm256_mul_pd(_mm256_load_pd(&uX1[k * 16 + l * 4]), _mm256_load_pd(&uX2[k * 16 + l * 4]));
+		  simde__m256d
+		    x1v =  simde_mm256_mul_pd(simde_mm256_load_pd(&uX1[k * 16 + l * 4]), simde_mm256_load_pd(&uX2[k * 16 + l * 4]));
 		  
-		  __m256d 
-		    evv = _mm256_load_pd(&extEV[l * 4]);
-#ifdef _FMA
+		  simde__m256d 
+		    evv = simde_mm256_load_pd(&extEV[l * 4]);
+#ifdef SIMDE_X86_FMA_NATIVE
 		  xv = FMAMACC(xv,x1v,evv);
 #else						  
-		  xv = _mm256_add_pd(xv, _mm256_mul_pd(x1v, evv));
+		  xv = simde_mm256_add_pd(xv, simde_mm256_mul_pd(x1v, evv));
 #endif
 		}
 		    
-	      _mm256_store_pd(&x3[4 * k], xv);
+	      simde_mm256_store_pd(&x3[4 * k], xv);
 	    }
 	}
 	
@@ -465,27 +462,27 @@
 	    
 		for(k = 0; k < 4; k++)
 		  {
-		    __m256d	   
-		      xv = _mm256_setzero_pd();
+		    simde__m256d	   
+		      xv = simde_mm256_setzero_pd();
 	       
 		    int 
 		      l;
 		
 		    for(l = 0; l < 4; l++)
 		      {	       	     				      	      																	   
-			__m256d
-			  x1v =  _mm256_mul_pd(_mm256_load_pd(&uX1[k * 16 + l * 4]), _mm256_load_pd(&uX2[k * 16 + l * 4]));
+			simde__m256d
+			  x1v =  simde_mm256_mul_pd(simde_mm256_load_pd(&uX1[k * 16 + l * 4]), simde_mm256_load_pd(&uX2[k * 16 + l * 4]));
 			
-			__m256d 
-			  evv = _mm256_load_pd(&extEV[l * 4]);
-#ifdef _FMA
+			simde__m256d 
+			  evv = simde_mm256_load_pd(&extEV[l * 4]);
+#ifdef SIMDE_X86_FMA_NATIVE
 			xv = FMAMACC(xv,x1v,evv);
 #else						  
-			xv = _mm256_add_pd(xv, _mm256_mul_pd(x1v, evv));
+			xv = simde_mm256_add_pd(xv, simde_mm256_mul_pd(x1v, evv));
 #endif
 		      }
 		    
-		    _mm256_store_pd(&x3[4 * k], xv);
+		    simde_mm256_store_pd(&x3[4 * k], xv);
 		  }
 
 		x3 += 16;
@@ -501,8 +498,8 @@
        
 	for (i = 1; i < 16; i++)
 	  {
-	    __m256d 
-	      tv = _mm256_load_pd(&(tipVector[i*4]));
+	    simde__m256d 
+	      tv = simde_mm256_load_pd(&(tipVector[i*4]));
 	    
 	    int 
 	      j;
@@ -512,19 +509,19 @@
 		for (j = 0; j < 4; j++)
 		  for (k = 0; k < 4; k++)
 		    {		 
-		      __m256d 
-			left1 = _mm256_load_pd(&left[j * 16 + k * 4]);		  		  		  
+		      simde__m256d 
+			left1 = simde_mm256_load_pd(&left[j * 16 + k * 4]);		  		  		  
 		      
-		      left1 = _mm256_mul_pd(left1, tv);		  
+		      left1 = simde_mm256_mul_pd(left1, tv);		  
 		      left1 = hadd3(left1);
 		      
-		      _mm256_store_pd(&umpX1[i * 64 + j * 16 + k * 4], left1);
+		      simde_mm256_store_pd(&umpX1[i * 64 + j * 16 + k * 4], left1);
 		    }	 
 	      }	    	
 	  }	
 
 	{ 
-	  __m256d
+	  simde__m256d
 	    xv[4];
 	  
 	  scaleGap = 1;
@@ -535,57 +532,57 @@
 
 	  for(k = 0; k < 4; k++)
 	    {
-	      __m256d	   		 
-		xvr = _mm256_load_pd(&(x2[k * 4]));
+	      simde__m256d	   		 
+		xvr = simde_mm256_load_pd(&(x2[k * 4]));
 
 	      int 
 		l;
 
-	      xv[k]  = _mm256_setzero_pd();
+	      xv[k]  = simde_mm256_setzero_pd();
 		  
 	      for(l = 0; l < 4; l++)
 		{	       	     				      	      															
-		  __m256d  
-		    x1v = _mm256_load_pd(&uX1[k * 16 + l * 4]),		     
-		    x2v = _mm256_mul_pd(xvr, _mm256_load_pd(&right[k * 16 + l * 4]));			    
+		  simde__m256d  
+		    x1v = simde_mm256_load_pd(&uX1[k * 16 + l * 4]),		     
+		    x2v = simde_mm256_mul_pd(xvr, simde_mm256_load_pd(&right[k * 16 + l * 4]));			    
 			
 		  x2v = hadd3(x2v);
-		  x1v = _mm256_mul_pd(x1v, x2v);			
+		  x1v = simde_mm256_mul_pd(x1v, x2v);			
 		
-		  __m256d 
-		    evv = _mm256_load_pd(&extEV[l * 4]);
+		  simde__m256d 
+		    evv = simde_mm256_load_pd(&extEV[l * 4]);
 			
-#ifdef _FMA
+#ifdef SIMDE_X86_FMA_NATIVE
 		  xv[k] = FMAMACC(xv[k],x1v,evv);
 #else			  
-		  xv[k] = _mm256_add_pd(xv[k], _mm256_mul_pd(x1v, evv));
+		  xv[k] = simde_mm256_add_pd(xv[k], simde_mm256_mul_pd(x1v, evv));
 #endif
 		}
 		    
 	      if(scaleGap)
 		{
-		  __m256d 	     
-		    v1 = _mm256_and_pd(xv[k], absMask_AVX.m);
+		  simde__m256d 	     
+		    v1 = simde_mm256_and_pd(xv[k], absMask_AVX.m);
 		  
-		  v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+		  v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 		    
-		  if(_mm256_movemask_pd( v1 ) != 15)
+		  if(simde_mm256_movemask_pd( v1 ) != 15)
 		    scaleGap = 0;
 		}
 	    }
 	
 	  if(scaleGap)
 	    {
-	      xv[0] = _mm256_mul_pd(xv[0], twoto);
-	      xv[1] = _mm256_mul_pd(xv[1], twoto);
-	      xv[2] = _mm256_mul_pd(xv[2], twoto);
-	      xv[3] = _mm256_mul_pd(xv[3], twoto);	    
+	      xv[0] = simde_mm256_mul_pd(xv[0], twoto);
+	      xv[1] = simde_mm256_mul_pd(xv[1], twoto);
+	      xv[2] = simde_mm256_mul_pd(xv[2], twoto);
+	      xv[3] = simde_mm256_mul_pd(xv[3], twoto);	    
 	    }
 
-	  _mm256_store_pd(&x3[0],      xv[0]);
-	  _mm256_store_pd(&x3[4],  xv[1]);
-	  _mm256_store_pd(&x3[8],  xv[2]);
-	  _mm256_store_pd(&x3[12], xv[3]);
+	  simde_mm256_store_pd(&x3[0],      xv[0]);
+	  simde_mm256_store_pd(&x3[4],  xv[1]);
+	  simde_mm256_store_pd(&x3[8],  xv[2]);
+	  simde_mm256_store_pd(&x3[12], xv[3]);
 	}
 	
 	x3 = x3_start;
@@ -612,7 +609,7 @@
 		    x2_ptr += 16;
 		  }
 		
-		__m256d
+		simde__m256d
 		  xv[4];	    	   
 		
 		scale = 1;
@@ -620,51 +617,51 @@
 		
 		for(k = 0; k < 4; k++)
 		  {
-		    __m256d	   		 
-		      xvr = _mm256_load_pd(&(x2[k * 4]));
+		    simde__m256d	   		 
+		      xvr = simde_mm256_load_pd(&(x2[k * 4]));
 		    
 		    int 
 		      l;
 		    
-		    xv[k]  = _mm256_setzero_pd();
+		    xv[k]  = simde_mm256_setzero_pd();
 		    
 		    for(l = 0; l < 4; l++)
 		      {	       	     				      	      															
-			__m256d  
-			  x1v = _mm256_load_pd(&uX1[k * 16 + l * 4]),		     
-			  x2v = _mm256_mul_pd(xvr, _mm256_load_pd(&right[k * 16 + l * 4]));			    
+			simde__m256d  
+			  x1v = simde_mm256_load_pd(&uX1[k * 16 + l * 4]),		     
+			  x2v = simde_mm256_mul_pd(xvr, simde_mm256_load_pd(&right[k * 16 + l * 4]));			    
 			
 			x2v = hadd3(x2v);
-			x1v = _mm256_mul_pd(x1v, x2v);			
+			x1v = simde_mm256_mul_pd(x1v, x2v);			
 			
-			__m256d 
-			  evv = _mm256_load_pd(&extEV[l * 4]);
+			simde__m256d 
+			  evv = simde_mm256_load_pd(&extEV[l * 4]);
 			
-#ifdef _FMA
+#ifdef SIMDE_X86_FMA_NATIVE
 			xv[k] = FMAMACC(xv[k],x1v,evv);
 #else			  
-			xv[k] = _mm256_add_pd(xv[k], _mm256_mul_pd(x1v, evv));
+			xv[k] = simde_mm256_add_pd(xv[k], simde_mm256_mul_pd(x1v, evv));
 #endif
 		      }
 		    
 		    if(scale)
 		      {
-			__m256d 	     
-			  v1 = _mm256_and_pd(xv[k], absMask_AVX.m);
+			simde__m256d 	     
+			  v1 = simde_mm256_and_pd(xv[k], absMask_AVX.m);
 			
-			v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+			v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 			
-			if(_mm256_movemask_pd( v1 ) != 15)
+			if(simde_mm256_movemask_pd( v1 ) != 15)
 			  scale = 0;
 		      }
 		  }	    
 	      
 		if(scale)
 		  {
-		    xv[0] = _mm256_mul_pd(xv[0], twoto);
-		    xv[1] = _mm256_mul_pd(xv[1], twoto);
-		    xv[2] = _mm256_mul_pd(xv[2], twoto);
-		    xv[3] = _mm256_mul_pd(xv[3], twoto);
+		    xv[0] = simde_mm256_mul_pd(xv[0], twoto);
+		    xv[1] = simde_mm256_mul_pd(xv[1], twoto);
+		    xv[2] = simde_mm256_mul_pd(xv[2], twoto);
+		    xv[3] = simde_mm256_mul_pd(xv[3], twoto);
 
 		    if(useFastScaling)
 		      addScale += wgt[i];
@@ -672,10 +669,10 @@
 		      ex3[i] += 1;		   
 		  }
 	      
-		_mm256_store_pd(&x3[0],      xv[0]);
-		_mm256_store_pd(&x3[4],  xv[1]);
-		_mm256_store_pd(&x3[8],  xv[2]);
-		_mm256_store_pd(&x3[12], xv[3]);
+		simde_mm256_store_pd(&x3[0],      xv[0]);
+		simde_mm256_store_pd(&x3[4],  xv[1]);
+		simde_mm256_store_pd(&x3[8],  xv[2]);
+		simde_mm256_store_pd(&x3[12], xv[3]);
 	      
 		x3 += 16;
 	      }
@@ -689,61 +686,61 @@
 	  x2 = x2_gapColumn;	    
 	  x3 = x3_gapColumn;
 
-	  __m256d
+	  simde__m256d
 	    xv[4];
 	    
 	  scaleGap = 1;
 
 	  for(k = 0; k < 4; k++)
 	    {
-	      __m256d	   
+	      simde__m256d	   
 		
-		xvl = _mm256_load_pd(&(x1[k * 4])),
-		xvr = _mm256_load_pd(&(x2[k * 4]));
+		xvl = simde_mm256_load_pd(&(x1[k * 4])),
+		xvr = simde_mm256_load_pd(&(x2[k * 4]));
 
 	      int 
 		l;
 
-	      xv[k] = _mm256_setzero_pd();
+	      xv[k] = simde_mm256_setzero_pd();
 
 	      for(l = 0; l < 4; l++)
 		{	       	     				      	      															
-		  __m256d 
-		    x1v = _mm256_mul_pd(xvl, _mm256_load_pd(&left[k * 16 + l * 4])),
-		    x2v = _mm256_mul_pd(xvr, _mm256_load_pd(&right[k * 16 + l * 4]));			    
+		  simde__m256d 
+		    x1v = simde_mm256_mul_pd(xvl, simde_mm256_load_pd(&left[k * 16 + l * 4])),
+		    x2v = simde_mm256_mul_pd(xvr, simde_mm256_load_pd(&right[k * 16 + l * 4]));			    
 		  
 		  x1v = hadd4(x1v, x2v);			
 		  
-		  __m256d 
-		    evv = _mm256_load_pd(&extEV[l * 4]);
+		  simde__m256d 
+		    evv = simde_mm256_load_pd(&extEV[l * 4]);
 		  
-		  xv[k] = _mm256_add_pd(xv[k], _mm256_mul_pd(x1v, evv));
+		  xv[k] = simde_mm256_add_pd(xv[k], simde_mm256_mul_pd(x1v, evv));
 		}
 		
 	      if(scaleGap)
 		  {
-		    __m256d 	     
-		      v1 = _mm256_and_pd(xv[k], absMask_AVX.m);
+		    simde__m256d 	     
+		      v1 = simde_mm256_and_pd(xv[k], absMask_AVX.m);
 
-		    v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+		    v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 		    
-		    if(_mm256_movemask_pd( v1 ) != 15)
+		    if(simde_mm256_movemask_pd( v1 ) != 15)
 		      scaleGap = 0;
 		  }
 	    }
 
 	  if(scaleGap)
 	    {
-	      xv[0] = _mm256_mul_pd(xv[0], twoto);
-	      xv[1] = _mm256_mul_pd(xv[1], twoto);
-	      xv[2] = _mm256_mul_pd(xv[2], twoto);
-	      xv[3] = _mm256_mul_pd(xv[3], twoto);	       
+	      xv[0] = simde_mm256_mul_pd(xv[0], twoto);
+	      xv[1] = simde_mm256_mul_pd(xv[1], twoto);
+	      xv[2] = simde_mm256_mul_pd(xv[2], twoto);
+	      xv[3] = simde_mm256_mul_pd(xv[3], twoto);	       
 	    }
 		
-	  _mm256_store_pd(&x3[0],  xv[0]);
-	  _mm256_store_pd(&x3[4],  xv[1]);
-	  _mm256_store_pd(&x3[8],  xv[2]);
-	  _mm256_store_pd(&x3[12], xv[3]);
+	  simde_mm256_store_pd(&x3[0],  xv[0]);
+	  simde_mm256_store_pd(&x3[4],  xv[1]);
+	  simde_mm256_store_pd(&x3[8],  xv[2]);
+	  simde_mm256_store_pd(&x3[12], xv[3]);
 	}	  
       
 	x3 = x3_start;
@@ -778,55 +775,55 @@
 		    x2_ptr += 16;
 		  }
 
-		__m256d
+		simde__m256d
 		  xv[4];
 	    
 		scale = 1;
 
 		for(k = 0; k < 4; k++)
 		  {
-		    __m256d	   
+		    simde__m256d	   
 		      
-		      xvl = _mm256_load_pd(&(x1[k * 4])),
-		      xvr = _mm256_load_pd(&(x2[k * 4]));
+		      xvl = simde_mm256_load_pd(&(x1[k * 4])),
+		      xvr = simde_mm256_load_pd(&(x2[k * 4]));
 		    
 		    int 
 		      l;
 		    
-		    xv[k] = _mm256_setzero_pd();
+		    xv[k] = simde_mm256_setzero_pd();
 		    
 		    for(l = 0; l < 4; l++)
 		      {	       	     				      	      															
-			__m256d 
-			  x1v = _mm256_mul_pd(xvl, _mm256_load_pd(&left[k * 16 + l * 4])),
-			  x2v = _mm256_mul_pd(xvr, _mm256_load_pd(&right[k * 16 + l * 4]));			    
+			simde__m256d 
+			  x1v = simde_mm256_mul_pd(xvl, simde_mm256_load_pd(&left[k * 16 + l * 4])),
+			  x2v = simde_mm256_mul_pd(xvr, simde_mm256_load_pd(&right[k * 16 + l * 4]));			    
 			
 			x1v = hadd4(x1v, x2v);			
 			
-			__m256d 
-			  evv = _mm256_load_pd(&extEV[l * 4]);
+			simde__m256d 
+			  evv = simde_mm256_load_pd(&extEV[l * 4]);
 			
-			xv[k] = _mm256_add_pd(xv[k], _mm256_mul_pd(x1v, evv));
+			xv[k] = simde_mm256_add_pd(xv[k], simde_mm256_mul_pd(x1v, evv));
 		      }
 		    
 		    if(scale)
 		      {
-			__m256d 	     
-			  v1 = _mm256_and_pd(xv[k], absMask_AVX.m);
+			simde__m256d 	     
+			  v1 = simde_mm256_and_pd(xv[k], absMask_AVX.m);
 			
-			v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+			v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 			
-			if(_mm256_movemask_pd( v1 ) != 15)
+			if(simde_mm256_movemask_pd( v1 ) != 15)
 			  scale = 0;
 		      }
 		  }
 
 		if(scale)
 		  {
-		    xv[0] = _mm256_mul_pd(xv[0], twoto);
-		    xv[1] = _mm256_mul_pd(xv[1], twoto);
-		    xv[2] = _mm256_mul_pd(xv[2], twoto);
-		    xv[3] = _mm256_mul_pd(xv[3], twoto);
+		    xv[0] = simde_mm256_mul_pd(xv[0], twoto);
+		    xv[1] = simde_mm256_mul_pd(xv[1], twoto);
+		    xv[2] = simde_mm256_mul_pd(xv[2], twoto);
+		    xv[3] = simde_mm256_mul_pd(xv[3], twoto);
 		    
 		    if(useFastScaling)
 		      addScale += wgt[i];
@@ -834,10 +831,10 @@
 		      ex3[i] += 1;
 		  }
 		
-		_mm256_store_pd(&x3[0],      xv[0]);
-		_mm256_store_pd(&x3[4],  xv[1]);
-		_mm256_store_pd(&x3[8],  xv[2]);
-		_mm256_store_pd(&x3[12], xv[3]);
+		simde_mm256_store_pd(&x3[0],      xv[0]);
+		simde_mm256_store_pd(&x3[4],  xv[1]);
+		simde_mm256_store_pd(&x3[8],  xv[2]);
+		simde_mm256_store_pd(&x3[12], xv[3]);
 	      
 		x3 += 16;
 	      }
@@ -871,9 +868,9 @@
     i, 
     addScale = 0;
    
-  __m256d 
-    minlikelihood_avx = _mm256_set1_pd( minlikelihood ),
-    twoto = _mm256_set1_pd(twotothe256);
+  simde__m256d 
+    minlikelihood_avx = simde_mm256_set1_pd( minlikelihood ),
+    twoto = simde_mm256_set1_pd(twotothe256);
   
   switch(tipCase)
     {
@@ -889,27 +886,27 @@
 	  x1 = &(tipVector[4 * tipX1[i]]);
 	  x2 = &(tipVector[4 * tipX2[i]]);
 	  
-	  __m256d	   
-	    vv = _mm256_setzero_pd();
+	  simde__m256d	   
+	    vv = simde_mm256_setzero_pd();
 	   	   	    
 	  for(l = 0; l < 4; l++)
 	    {	       	     				      	      															
-	      __m256d 
-		x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
-		x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+	      simde__m256d 
+		x1v = simde_mm256_mul_pd(simde_mm256_load_pd(x1), simde_mm256_load_pd(&le[l * 4])),
+		x2v = simde_mm256_mul_pd(simde_mm256_load_pd(x2), simde_mm256_load_pd(&ri[l * 4]));			    
 			
 	      x1v = hadd4(x1v, x2v);			
 		
-	      __m256d 
-		evv = _mm256_load_pd(&EV[l * 4]);
-#ifdef _FMA
+	      simde__m256d 
+		evv = simde_mm256_load_pd(&EV[l * 4]);
+#ifdef SIMDE_X86_FMA_NATIVE
 	      vv = FMAMACC(vv,x1v,evv);
 #else				
-	      vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));						      	
+	      vv = simde_mm256_add_pd(vv, simde_mm256_mul_pd(x1v, evv));						      	
 #endif
 	    }	  		  
 
-	  _mm256_store_pd(&x3_start[4 * i], vv);	    	   	    
+	  simde_mm256_store_pd(&x3_start[4 * i], vv);	    	   	    
 	}
       break;
     case TIP_INNER:      
@@ -924,36 +921,36 @@
 	  le =  &left[cptr[i] * 16];
 	  ri =  &right[cptr[i] * 16];
 
-	  __m256d	   
-	    vv = _mm256_setzero_pd();
+	  simde__m256d	   
+	    vv = simde_mm256_setzero_pd();
 	  
 	  for(l = 0; l < 4; l++)
 	    {	       	     				      	      															
-	      __m256d 
-		x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
-		x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+	      simde__m256d 
+		x1v = simde_mm256_mul_pd(simde_mm256_load_pd(x1), simde_mm256_load_pd(&le[l * 4])),
+		x2v = simde_mm256_mul_pd(simde_mm256_load_pd(x2), simde_mm256_load_pd(&ri[l * 4]));			    
 			
 	      x1v = hadd4(x1v, x2v);			
 		
-	      __m256d 
-		evv = _mm256_load_pd(&EV[l * 4]);
+	      simde__m256d 
+		evv = simde_mm256_load_pd(&EV[l * 4]);
 				
-#ifdef _FMA
+#ifdef SIMDE_X86_FMA_NATIVE
 	      vv = FMAMACC(vv,x1v,evv);
 #else	      
-	      vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));
+	      vv = simde_mm256_add_pd(vv, simde_mm256_mul_pd(x1v, evv));
 #endif
 	    }	  		  
 	  
 	  
-	  __m256d 	     
-	    v1 = _mm256_and_pd(vv, absMask_AVX.m);
+	  simde__m256d 	     
+	    v1 = simde_mm256_and_pd(vv, absMask_AVX.m);
 
-	  v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	  v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 	    
-	  if(_mm256_movemask_pd( v1 ) == 15)
+	  if(simde_mm256_movemask_pd( v1 ) == 15)
 	    {	     	      
-	      vv = _mm256_mul_pd(vv, twoto);	      
+	      vv = simde_mm256_mul_pd(vv, twoto);	      
 	      
 	      if(useFastScaling)
 		addScale += wgt[i];
@@ -961,7 +958,7 @@
 		ex3[i] += 1;	      	     
 	    }       
 	  
-	  _mm256_store_pd(&x3_start[4 * i], vv);	 	  	  
+	  simde_mm256_store_pd(&x3_start[4 * i], vv);	 	  	  
 	}
       break;
     case INNER_INNER:
@@ -977,35 +974,35 @@
 	  le =  &left[cptr[i] * 16];
 	  ri =  &right[cptr[i] * 16];
 
-	  __m256d	   
-	    vv = _mm256_setzero_pd();
+	  simde__m256d	   
+	    vv = simde_mm256_setzero_pd();
 	  
 	  for(l = 0; l < 4; l++)
 	    {	       	     				      	      															
-	      __m256d 
-		x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
-		x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+	      simde__m256d 
+		x1v = simde_mm256_mul_pd(simde_mm256_load_pd(x1), simde_mm256_load_pd(&le[l * 4])),
+		x2v = simde_mm256_mul_pd(simde_mm256_load_pd(x2), simde_mm256_load_pd(&ri[l * 4]));			    
 			
 	      x1v = hadd4(x1v, x2v);			
 		
-	      __m256d 
-		evv = _mm256_load_pd(&EV[l * 4]);
-#ifdef _FMA
+	      simde__m256d 
+		evv = simde_mm256_load_pd(&EV[l * 4]);
+#ifdef SIMDE_X86_FMA_NATIVE
 	      vv = FMAMACC(vv,x1v,evv);
 #else						
-	      vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));						      	
+	      vv = simde_mm256_add_pd(vv, simde_mm256_mul_pd(x1v, evv));						      	
 #endif
 	    }	  		  
 
 	 
-	  __m256d 	     
-	    v1 = _mm256_and_pd(vv, absMask_AVX.m);
+	  simde__m256d 	     
+	    v1 = simde_mm256_and_pd(vv, absMask_AVX.m);
 
-	  v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	  v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 	    
-	  if(_mm256_movemask_pd( v1 ) == 15)
+	  if(simde_mm256_movemask_pd( v1 ) == 15)
 	    {	
-	      vv = _mm256_mul_pd(vv, twoto);
+	      vv = simde_mm256_mul_pd(vv, twoto);
 	      
 	      if(useFastScaling)
 		addScale += wgt[i];
@@ -1013,7 +1010,7 @@
 		ex3[i] += 1;	   
 	    }	
 
-	  _mm256_store_pd(&x3_start[4 * i], vv);
+	  simde_mm256_store_pd(&x3_start[4 * i], vv);
 	  	  
 	}
       break;
@@ -1048,9 +1045,9 @@
     scaleGap = 0,
     addScale = 0;
    
-  __m256d 
-    minlikelihood_avx = _mm256_set1_pd( minlikelihood ),
-    twoto = _mm256_set1_pd(twotothe256);
+  simde__m256d 
+    minlikelihood_avx = simde_mm256_set1_pd( minlikelihood ),
+    twoto = simde_mm256_set1_pd(twotothe256);
   
 
   {
@@ -1064,41 +1061,41 @@
     le =  &left[maxCats * 16];
     ri =  &right[maxCats * 16];
 
-    __m256d	   
-      vv = _mm256_setzero_pd();
+    simde__m256d	   
+      vv = simde_mm256_setzero_pd();
 	  
     for(l = 0; l < 4; l++)
       {	       	     				      	      															
-	__m256d 
-	  x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
-	  x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+	simde__m256d 
+	  x1v = simde_mm256_mul_pd(simde_mm256_load_pd(x1), simde_mm256_load_pd(&le[l * 4])),
+	  x2v = simde_mm256_mul_pd(simde_mm256_load_pd(x2), simde_mm256_load_pd(&ri[l * 4]));			    
 	
 	x1v = hadd4(x1v, x2v);			
 	
-	__m256d 
-	  evv = _mm256_load_pd(&EV[l * 4]);
-#ifdef _FMA
+	simde__m256d 
+	  evv = simde_mm256_load_pd(&EV[l * 4]);
+#ifdef SIMDE_X86_FMA_NATIVE
 	vv = FMAMACC(vv,x1v,evv);
 #else						
-	vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));						      	
+	vv = simde_mm256_add_pd(vv, simde_mm256_mul_pd(x1v, evv));						      	
 #endif
       }	  		  
 
     if(tipCase != TIP_TIP)
       {
-	__m256d 	     
-	  v1 = _mm256_and_pd(vv, absMask_AVX.m);
+	simde__m256d 	     
+	  v1 = simde_mm256_and_pd(vv, absMask_AVX.m);
     
-	v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
     
-	if(_mm256_movemask_pd( v1 ) == 15)
+	if(simde_mm256_movemask_pd( v1 ) == 15)
 	  {
-	    vv = _mm256_mul_pd(vv, twoto);	      	 
+	    vv = simde_mm256_mul_pd(vv, twoto);	      	 
 	    scaleGap = 1;
 	  }
       }
     
-    _mm256_store_pd(x3, vv);    
+    simde_mm256_store_pd(x3, vv);    
   }
 
   switch(tipCase)
@@ -1126,27 +1123,27 @@
 	      else	 	  
 		ri =  &right[cptr[i] * 16];
 	  	  
-	      __m256d	   
-		vv = _mm256_setzero_pd();
+	      simde__m256d	   
+		vv = simde_mm256_setzero_pd();
 	      
 	      for(l = 0; l < 4; l++)
 		{	       	     				      	      															
-		  __m256d 
-		    x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
-		    x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+		  simde__m256d 
+		    x1v = simde_mm256_mul_pd(simde_mm256_load_pd(x1), simde_mm256_load_pd(&le[l * 4])),
+		    x2v = simde_mm256_mul_pd(simde_mm256_load_pd(x2), simde_mm256_load_pd(&ri[l * 4]));			    
 		  
 		  x1v = hadd4(x1v, x2v);			
 		  
-		  __m256d 
-		    evv = _mm256_load_pd(&EV[l * 4]);
-#ifdef _FMA
+		  simde__m256d 
+		    evv = simde_mm256_load_pd(&EV[l * 4]);
+#ifdef SIMDE_X86_FMA_NATIVE
 		  vv = FMAMACC(vv,x1v,evv);
 #else				
-		  vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));						      	
+		  vv = simde_mm256_add_pd(vv, simde_mm256_mul_pd(x1v, evv));						      	
 #endif
 		}	  		  
 
-	      _mm256_store_pd(x3, vv);	 
+	      simde_mm256_store_pd(x3, vv);	 
 	      
 	      x3_ptr += 4;
 	    }
@@ -1190,36 +1187,36 @@
 		  x2_ptr += 4;
 		}	  	 
 
-	      __m256d	   
-		vv = _mm256_setzero_pd();
+	      simde__m256d	   
+		vv = simde_mm256_setzero_pd();
 	      
 	      for(l = 0; l < 4; l++)
 		{	       	     				      	      															
-		  __m256d 
-		    x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
-		    x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+		  simde__m256d 
+		    x1v = simde_mm256_mul_pd(simde_mm256_load_pd(x1), simde_mm256_load_pd(&le[l * 4])),
+		    x2v = simde_mm256_mul_pd(simde_mm256_load_pd(x2), simde_mm256_load_pd(&ri[l * 4]));			    
 		  
 		  x1v = hadd4(x1v, x2v);			
 		  
-		  __m256d 
-		    evv = _mm256_load_pd(&EV[l * 4]);
+		  simde__m256d 
+		    evv = simde_mm256_load_pd(&EV[l * 4]);
 		  
-#ifdef _FMA
+#ifdef SIMDE_X86_FMA_NATIVE
 		  vv = FMAMACC(vv,x1v,evv);
 #else	      
-		  vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));
+		  vv = simde_mm256_add_pd(vv, simde_mm256_mul_pd(x1v, evv));
 #endif
 		}	  		  
 	  
 	  
-	      __m256d 	     
-		v1 = _mm256_and_pd(vv, absMask_AVX.m);
+	      simde__m256d 	     
+		v1 = simde_mm256_and_pd(vv, absMask_AVX.m);
 	      
-	      v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	      v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 	      
-	      if(_mm256_movemask_pd( v1 ) == 15)
+	      if(simde_mm256_movemask_pd( v1 ) == 15)
 		{	     	      
-		  vv = _mm256_mul_pd(vv, twoto);	      
+		  vv = simde_mm256_mul_pd(vv, twoto);	      
 		  
 		  if(useFastScaling)
 		    addScale += wgt[i];
@@ -1227,7 +1224,7 @@
 		    ex3[i] += 1;		 
 		}       
 	  
-	      _mm256_store_pd(x3, vv);	 	  	  
+	      simde_mm256_store_pd(x3, vv);	 	  	  
 
 	      x3_ptr += 4;
 	    }
@@ -1277,35 +1274,35 @@
 		  x2_ptr += 4;
 		}	 	  	  	  
 	  
-	      __m256d	   
-		vv = _mm256_setzero_pd();
+	      simde__m256d	   
+		vv = simde_mm256_setzero_pd();
 	      
 	      for(l = 0; l < 4; l++)
 		{	       	     				      	      															
-		  __m256d 
-		    x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
-		    x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+		  simde__m256d 
+		    x1v = simde_mm256_mul_pd(simde_mm256_load_pd(x1), simde_mm256_load_pd(&le[l * 4])),
+		    x2v = simde_mm256_mul_pd(simde_mm256_load_pd(x2), simde_mm256_load_pd(&ri[l * 4]));			    
 		  
 		  x1v = hadd4(x1v, x2v);			
 		  
-		  __m256d 
-		    evv = _mm256_load_pd(&EV[l * 4]);
-#ifdef _FMA
+		  simde__m256d 
+		    evv = simde_mm256_load_pd(&EV[l * 4]);
+#ifdef SIMDE_FMA_NATIVE
 		  vv = FMAMACC(vv,x1v,evv);
 #else						
-		  vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));						      	
+		  vv = simde_mm256_add_pd(vv, simde_mm256_mul_pd(x1v, evv));						      	
 #endif
 		}	  		  
 	      
 	      
-	      __m256d 	     
-		v1 = _mm256_and_pd(vv, absMask_AVX.m);
+	      simde__m256d 	     
+		v1 = simde_mm256_and_pd(vv, absMask_AVX.m);
 	      
-	      v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	      v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 	      
-	      if(_mm256_movemask_pd( v1 ) == 15)
+	      if(simde_mm256_movemask_pd( v1 ) == 15)
 		{	
-		  vv = _mm256_mul_pd(vv, twoto);	      
+		  vv = simde_mm256_mul_pd(vv, twoto);	      
 		  
 		  if(useFastScaling)
 		    addScale += wgt[i];
@@ -1313,7 +1310,7 @@
 		    ex3[i] += 1;		
 		}	
 	      
-	      _mm256_store_pd(x3, vv);
+	      simde_mm256_store_pd(x3, vv);
 	      
 	      x3_ptr += 4;
 	    }	  	  
@@ -1338,7 +1335,7 @@
 
   int i, l, scale, addScale = 0;
 
-#ifdef _FMA
+#ifdef SIMDE_X86_FMA_NATIVE
   int k;
 #endif
 
@@ -1355,78 +1352,78 @@
 	    vr = &(tipVector[20 * tipX2[i]]);
 	    v  = &x3[20 * i];	    	    	   	    
 
-	    __m256d vv[5];
+	    simde__m256d vv[5];
 	    
-	    vv[0] = _mm256_setzero_pd();
-	    vv[1] = _mm256_setzero_pd();
-	    vv[2] = _mm256_setzero_pd();
-	    vv[3] = _mm256_setzero_pd();
-	    vv[4] = _mm256_setzero_pd();	   	    
+	    vv[0] = simde_mm256_setzero_pd();
+	    vv[1] = simde_mm256_setzero_pd();
+	    vv[2] = simde_mm256_setzero_pd();
+	    vv[3] = simde_mm256_setzero_pd();
+	    vv[4] = simde_mm256_setzero_pd();	   	    
 
 	    for(l = 0; l < 20; l++)
 	      {	       
-		__m256d 
-		  x1v = _mm256_setzero_pd(),
-		  x2v = _mm256_setzero_pd();	
+		simde__m256d 
+		  x1v = simde_mm256_setzero_pd(),
+		  x2v = simde_mm256_setzero_pd();	
 				
 		double 
 		  *ev = &extEV[l * 20],
 		  *lv = &le[l * 20],
 		  *rv = &ri[l * 20];														
 
-#ifdef _FMA		
+#ifdef SIMDE_X86_FMA_NATIVE
 		for(k = 0; k < 20; k += 4) 
 		  {
-		    __m256d vlv = _mm256_load_pd(&vl[k]);
-		    __m256d lvv = _mm256_load_pd(&lv[k]);
+		    simde__m256d vlv = simde_mm256_load_pd(&vl[k]);
+		    simde__m256d lvv = simde_mm256_load_pd(&lv[k]);
 		    x1v = FMAMACC(x1v,vlv,lvv);
-		    __m256d vrv = _mm256_load_pd(&vr[k]);
-		    __m256d rvv = _mm256_load_pd(&rv[k]);
+		    simde__m256d vrv = simde_mm256_load_pd(&vr[k]);
+		    simde__m256d rvv = simde_mm256_load_pd(&rv[k]);
 		    x2v = FMAMACC(x2v,vrv,rvv);
 		  }
 #else		
-		x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
-		x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
-		x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
-		x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
-		x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
-
-		x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
-		x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
-		x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
-		x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
-		x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));	
+		x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[0]), simde_mm256_load_pd(&lv[0])));
+		x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[4]), simde_mm256_load_pd(&lv[4])));
+		x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[8]), simde_mm256_load_pd(&lv[8])));
+		x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[12]), simde_mm256_load_pd(&lv[12])));
+		x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[16]), simde_mm256_load_pd(&lv[16])));
+
+		x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[0]), simde_mm256_load_pd(&rv[0])));			    
+		x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[4]), simde_mm256_load_pd(&rv[4])));				    
+		x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[8]), simde_mm256_load_pd(&rv[8])));			    
+		x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[12]), simde_mm256_load_pd(&rv[12])));				    
+		x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[16]), simde_mm256_load_pd(&rv[16])));	
 #endif
 
 		x1v = hadd4(x1v, x2v);			
-#ifdef _FMA
+#ifdef SIMDE_X86_FMA_NATIVE
 		for(k = 0; k < 5; k++) 
 		  {
-		    __m256d evv = _mm256_load_pd(&ev[k*4]);
+		    simde__m256d evv = simde_mm256_load_pd(&ev[k*4]);
 		    vv[k] = FMAMACC(vv[k],x1v,evv);
 		  }	  
 #else		
-		__m256d 
+		simde__m256d 
 		  evv[5];
 	    	
-		evv[0] = _mm256_load_pd(&ev[0]);
-		evv[1] = _mm256_load_pd(&ev[4]);
-		evv[2] = _mm256_load_pd(&ev[8]);
-		evv[3] = _mm256_load_pd(&ev[12]);
-		evv[4] = _mm256_load_pd(&ev[16]);		
-		
-		vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
-		vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
-		vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
-		vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
-		vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      		      	  
-#endif
-	      }
-	    _mm256_store_pd(&v[0], vv[0]);
-	    _mm256_store_pd(&v[4], vv[1]);
-	    _mm256_store_pd(&v[8], vv[2]);
-	    _mm256_store_pd(&v[12], vv[3]);
-	    _mm256_store_pd(&v[16], vv[4]);
+		evv[0] = simde_mm256_load_pd(&ev[0]);
+		evv[1] = simde_mm256_load_pd(&ev[4]);
+		evv[2] = simde_mm256_load_pd(&ev[8]);
+		evv[3] = simde_mm256_load_pd(&ev[12]);
+		evv[4] = simde_mm256_load_pd(&ev[16]);		
+		
+		vv[0] = simde_mm256_add_pd(vv[0], simde_mm256_mul_pd(x1v, evv[0]));
+		vv[1] = simde_mm256_add_pd(vv[1], simde_mm256_mul_pd(x1v, evv[1]));
+		vv[2] = simde_mm256_add_pd(vv[2], simde_mm256_mul_pd(x1v, evv[2]));
+		vv[3] = simde_mm256_add_pd(vv[3], simde_mm256_mul_pd(x1v, evv[3]));
+		vv[4] = simde_mm256_add_pd(vv[4], simde_mm256_mul_pd(x1v, evv[4]));				      		      	  
+#endif
+	      }
+	    simde_mm256_store_pd(&v[0], vv[0]);
+	    simde_mm256_store_pd(&v[4], vv[1]);
+	    simde_mm256_store_pd(&v[8], vv[2]);
+	    simde_mm256_store_pd(&v[12], vv[3]);
+	    simde_mm256_store_pd(&v[16], vv[4]);
 	  }
       }
       break;
@@ -1440,96 +1437,96 @@
 	  vr = &x2[20 * i];
 	  v  = &x3[20 * i];	   
 	  
-	  __m256d vv[5];
+	  simde__m256d vv[5];
 	  
-	  vv[0] = _mm256_setzero_pd();
-	  vv[1] = _mm256_setzero_pd();
-	  vv[2] = _mm256_setzero_pd();
-	  vv[3] = _mm256_setzero_pd();
-	  vv[4] = _mm256_setzero_pd();
+	  vv[0] = simde_mm256_setzero_pd();
+	  vv[1] = simde_mm256_setzero_pd();
+	  vv[2] = simde_mm256_setzero_pd();
+	  vv[3] = simde_mm256_setzero_pd();
+	  vv[4] = simde_mm256_setzero_pd();
 	  
 	 
 
 	  for(l = 0; l < 20; l++)
 	    {	       
-	      __m256d 
-		x1v = _mm256_setzero_pd(),
-		x2v = _mm256_setzero_pd();	
+	      simde__m256d 
+		x1v = simde_mm256_setzero_pd(),
+		x2v = simde_mm256_setzero_pd();	
 	      
 	      double 
 		*ev = &extEV[l * 20],
 		*lv = &le[l * 20],
 		*rv = &ri[l * 20];														
-#ifdef _FMA
+#ifdef SIMDE_X86_FMA_NATIVE
 	      for(k = 0; k < 20; k += 4) 
 		{
-		  __m256d vlv = _mm256_load_pd(&vl[k]);
-		  __m256d lvv = _mm256_load_pd(&lv[k]);
+		  simde__m256d vlv = simde_mm256_load_pd(&vl[k]);
+		  simde__m256d lvv = simde_mm256_load_pd(&lv[k]);
 		  x1v = FMAMACC(x1v,vlv,lvv);
-		  __m256d vrv = _mm256_load_pd(&vr[k]);
-		  __m256d rvv = _mm256_load_pd(&rv[k]);
+		  simde__m256d vrv = simde_mm256_load_pd(&vr[k]);
+		  simde__m256d rvv = simde_mm256_load_pd(&rv[k]);
 		  x2v = FMAMACC(x2v,vrv,rvv);
 		}
 #else	      
-	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
-	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
-	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
-	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
-	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
-	      
-	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
-	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
-	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
-	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
-	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));
+	      x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[0]), simde_mm256_load_pd(&lv[0])));
+	      x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[4]), simde_mm256_load_pd(&lv[4])));
+	      x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[8]), simde_mm256_load_pd(&lv[8])));
+	      x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[12]), simde_mm256_load_pd(&lv[12])));
+	      x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[16]), simde_mm256_load_pd(&lv[16])));
+	      
+	      x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[0]), simde_mm256_load_pd(&rv[0])));			    
+	      x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[4]), simde_mm256_load_pd(&rv[4])));				    
+	      x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[8]), simde_mm256_load_pd(&rv[8])));			    
+	      x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[12]), simde_mm256_load_pd(&rv[12])));				    
+	      x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[16]), simde_mm256_load_pd(&rv[16])));
 #endif
 
 	      x1v = hadd4(x1v, x2v);			
 	      
-	      __m256d 
+	      simde__m256d 
 		evv[5];
 	      
-	      evv[0] = _mm256_load_pd(&ev[0]);
-	      evv[1] = _mm256_load_pd(&ev[4]);
-	      evv[2] = _mm256_load_pd(&ev[8]);
-	      evv[3] = _mm256_load_pd(&ev[12]);
-	      evv[4] = _mm256_load_pd(&ev[16]);		
+	      evv[0] = simde_mm256_load_pd(&ev[0]);
+	      evv[1] = simde_mm256_load_pd(&ev[4]);
+	      evv[2] = simde_mm256_load_pd(&ev[8]);
+	      evv[3] = simde_mm256_load_pd(&ev[12]);
+	      evv[4] = simde_mm256_load_pd(&ev[16]);		
 
-#ifdef _FMA
+#ifdef SIMDE_X86_FMA_NATIVE
 	      for(k = 0; k < 5; k++)
 		vv[k] = FMAMACC(vv[k],x1v,evv[k]);		 
 #else	      
-	      vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
-	      vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
-	      vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
-	      vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
-	      vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      	
+	      vv[0] = simde_mm256_add_pd(vv[0], simde_mm256_mul_pd(x1v, evv[0]));
+	      vv[1] = simde_mm256_add_pd(vv[1], simde_mm256_mul_pd(x1v, evv[1]));
+	      vv[2] = simde_mm256_add_pd(vv[2], simde_mm256_mul_pd(x1v, evv[2]));
+	      vv[3] = simde_mm256_add_pd(vv[3], simde_mm256_mul_pd(x1v, evv[3]));
+	      vv[4] = simde_mm256_add_pd(vv[4], simde_mm256_mul_pd(x1v, evv[4]));				      	
 #endif
 	    }	  
 
 	   	     
-	  __m256d minlikelihood_avx = _mm256_set1_pd( minlikelihood );
+	  simde__m256d minlikelihood_avx = simde_mm256_set1_pd( minlikelihood );
 	  
 	  scale = 1;
 	  
 	  for(l = 0; scale && (l < 20); l += 4)
 	    {	       
-	      __m256d 
-		v1 = _mm256_and_pd(vv[l / 4], absMask_AVX.m);
-	      v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	      simde__m256d 
+		v1 = simde_mm256_and_pd(vv[l / 4], absMask_AVX.m);
+	      v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 	      
-	      if(_mm256_movemask_pd( v1 ) != 15)
+	      if(simde_mm256_movemask_pd( v1 ) != 15)
 		scale = 0;
 	    }	    	  	  
 	 
 
 	  if(scale)
 	    {
-	      __m256d 
-		twoto = _mm256_set1_pd(twotothe256);
+	      simde__m256d 
+		twoto = simde_mm256_set1_pd(twotothe256);
 	      
 	      for(l = 0; l < 20; l += 4)
-		vv[l / 4] = _mm256_mul_pd(vv[l / 4] , twoto);		    		 
+		vv[l / 4] = simde_mm256_mul_pd(vv[l / 4] , twoto);		    		 
 	  
 	      if(useFastScaling)
 		addScale += wgt[i];
@@ -1537,11 +1534,11 @@
 		ex3[i]  += 1;	      
 	    }
 
-	  _mm256_store_pd(&v[0], vv[0]);
-	  _mm256_store_pd(&v[4], vv[1]);
-	  _mm256_store_pd(&v[8], vv[2]);
-	  _mm256_store_pd(&v[12], vv[3]);
-	  _mm256_store_pd(&v[16], vv[4]);	       
+	  simde_mm256_store_pd(&v[0], vv[0]);
+	  simde_mm256_store_pd(&v[4], vv[1]);
+	  simde_mm256_store_pd(&v[8], vv[2]);
+	  simde_mm256_store_pd(&v[12], vv[3]);
+	  simde_mm256_store_pd(&v[16], vv[4]);	       
 	}
       break;
     case INNER_INNER:
@@ -1554,84 +1551,84 @@
 	  vr = &x2[20 * i];
 	  v = &x3[20 * i];
 
-	  __m256d vv[5];
+	  simde__m256d vv[5];
 	  
-	  vv[0] = _mm256_setzero_pd();
-	  vv[1] = _mm256_setzero_pd();
-	  vv[2] = _mm256_setzero_pd();
-	  vv[3] = _mm256_setzero_pd();
-	  vv[4] = _mm256_setzero_pd();
+	  vv[0] = simde_mm256_setzero_pd();
+	  vv[1] = simde_mm256_setzero_pd();
+	  vv[2] = simde_mm256_setzero_pd();
+	  vv[3] = simde_mm256_setzero_pd();
+	  vv[4] = simde_mm256_setzero_pd();
 	  
 	  for(l = 0; l < 20; l++)
 	    {	       
-	      __m256d 
-		x1v = _mm256_setzero_pd(),
-		x2v = _mm256_setzero_pd();	
+	      simde__m256d 
+		x1v = simde_mm256_setzero_pd(),
+		x2v = simde_mm256_setzero_pd();	
 	      
 	      double 
 		*ev = &extEV[l * 20],
 		*lv = &le[l * 20],
 		*rv = &ri[l * 20];														
 	      
-	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
-	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
-	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
-	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
-	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
-	      
-	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
-	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
-	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
-	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
-	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));
+	      x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[0]), simde_mm256_load_pd(&lv[0])));
+	      x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[4]), simde_mm256_load_pd(&lv[4])));
+	      x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[8]), simde_mm256_load_pd(&lv[8])));
+	      x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[12]), simde_mm256_load_pd(&lv[12])));
+	      x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[16]), simde_mm256_load_pd(&lv[16])));
+	      
+	      x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[0]), simde_mm256_load_pd(&rv[0])));			    
+	      x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[4]), simde_mm256_load_pd(&rv[4])));				    
+	      x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[8]), simde_mm256_load_pd(&rv[8])));			    
+	      x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[12]), simde_mm256_load_pd(&rv[12])));				    
+	      x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[16]), simde_mm256_load_pd(&rv[16])));
 
 	      x1v = hadd4(x1v, x2v);			
-#ifdef _FMA
+#ifdef SIMDE_X86_FMA_NATIVE
 	       for(k = 0; k < 5; k++) 
 		 {
-		   __m256d evv = _mm256_load_pd(&ev[k*4]);
+		   simde__m256d evv = simde_mm256_load_pd(&ev[k*4]);
 		   vv[k] = FMAMACC(vv[k],x1v,evv);
 		 }
 #else	      
-	      __m256d 
+	      simde__m256d 
 		evv[5];
 	      
-	      evv[0] = _mm256_load_pd(&ev[0]);
-	      evv[1] = _mm256_load_pd(&ev[4]);
-	      evv[2] = _mm256_load_pd(&ev[8]);
-	      evv[3] = _mm256_load_pd(&ev[12]);
-	      evv[4] = _mm256_load_pd(&ev[16]);		
-	      
-	      vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
-	      vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
-	      vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
-	      vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
-	      vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      	
+	      evv[0] = simde_mm256_load_pd(&ev[0]);
+	      evv[1] = simde_mm256_load_pd(&ev[4]);
+	      evv[2] = simde_mm256_load_pd(&ev[8]);
+	      evv[3] = simde_mm256_load_pd(&ev[12]);
+	      evv[4] = simde_mm256_load_pd(&ev[16]);		
+	      
+	      vv[0] = simde_mm256_add_pd(vv[0], simde_mm256_mul_pd(x1v, evv[0]));
+	      vv[1] = simde_mm256_add_pd(vv[1], simde_mm256_mul_pd(x1v, evv[1]));
+	      vv[2] = simde_mm256_add_pd(vv[2], simde_mm256_mul_pd(x1v, evv[2]));
+	      vv[3] = simde_mm256_add_pd(vv[3], simde_mm256_mul_pd(x1v, evv[3]));
+	      vv[4] = simde_mm256_add_pd(vv[4], simde_mm256_mul_pd(x1v, evv[4]));				      	
 #endif
 	    }	  
 
 	   	     
-	  __m256d minlikelihood_avx = _mm256_set1_pd( minlikelihood );
+	  simde__m256d minlikelihood_avx = simde_mm256_set1_pd( minlikelihood );
 	  
 	  scale = 1;
 	  
 	  for(l = 0; scale && (l < 20); l += 4)
 	    {	       
-	      __m256d 
-		v1 = _mm256_and_pd(vv[l / 4], absMask_AVX.m);
-	      v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	      simde__m256d 
+		v1 = simde_mm256_and_pd(vv[l / 4], absMask_AVX.m);
+	      v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 	      
-	      if(_mm256_movemask_pd( v1 ) != 15)
+	      if(simde_mm256_movemask_pd( v1 ) != 15)
 		scale = 0;
 	    }	    	  	  
 
 	  if(scale)
 	    {
-	      __m256d 
-		twoto = _mm256_set1_pd(twotothe256);
+	      simde__m256d 
+		twoto = simde_mm256_set1_pd(twotothe256);
 	      
 	      for(l = 0; l < 20; l += 4)
-		vv[l / 4] = _mm256_mul_pd(vv[l / 4] , twoto);		    		 
+		vv[l / 4] = simde_mm256_mul_pd(vv[l / 4] , twoto);		    		 
 	  
 	      if(useFastScaling)
 		addScale += wgt[i];
@@ -1639,11 +1636,11 @@
 		ex3[i]  += 1;	      
 	    }
 
-	  _mm256_store_pd(&v[0], vv[0]);
-	  _mm256_store_pd(&v[4], vv[1]);
-	  _mm256_store_pd(&v[8], vv[2]);
-	  _mm256_store_pd(&v[12], vv[3]);
-	  _mm256_store_pd(&v[16], vv[4]);
+	  simde_mm256_store_pd(&v[0], vv[0]);
+	  simde_mm256_store_pd(&v[4], vv[1]);
+	  simde_mm256_store_pd(&v[8], vv[2]);
+	  simde_mm256_store_pd(&v[12], vv[3]);
+	  simde_mm256_store_pd(&v[16], vv[4]);
 	 
 	}
       break;
@@ -1680,7 +1677,7 @@
     addScale = 0,
     scaleGap = 0;
 
-#ifdef _FMA
+#ifdef SIMDE_X86_FMA_NATIVE
   int k;
 #endif
 
@@ -1692,96 +1689,96 @@
     vr = x2_gapColumn;
     v  = x3_gapColumn;
 
-    __m256d vv[5];
+    simde__m256d vv[5];
     
-    vv[0] = _mm256_setzero_pd();
-    vv[1] = _mm256_setzero_pd();
-    vv[2] = _mm256_setzero_pd();
-    vv[3] = _mm256_setzero_pd();
-    vv[4] = _mm256_setzero_pd();
+    vv[0] = simde_mm256_setzero_pd();
+    vv[1] = simde_mm256_setzero_pd();
+    vv[2] = simde_mm256_setzero_pd();
+    vv[3] = simde_mm256_setzero_pd();
+    vv[4] = simde_mm256_setzero_pd();
     
     for(l = 0; l < 20; l++)
       {	       
-	__m256d 
-	  x1v = _mm256_setzero_pd(),
-	  x2v = _mm256_setzero_pd();	
+	simde__m256d 
+	  x1v = simde_mm256_setzero_pd(),
+	  x2v = simde_mm256_setzero_pd();	
 	
 	double 
 	  *ev = &extEV[l * 20],
 	  *lv = &le[l * 20],
 	  *rv = &ri[l * 20];														
 	
-	x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
-	x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
-	x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
-	x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
-	x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
-	
-	x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
-	x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
-	x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
-	x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
-	x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));
+	x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[0]), simde_mm256_load_pd(&lv[0])));
+	x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[4]), simde_mm256_load_pd(&lv[4])));
+	x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[8]), simde_mm256_load_pd(&lv[8])));
+	x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[12]), simde_mm256_load_pd(&lv[12])));
+	x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[16]), simde_mm256_load_pd(&lv[16])));
+	
+	x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[0]), simde_mm256_load_pd(&rv[0])));			    
+	x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[4]), simde_mm256_load_pd(&rv[4])));				    
+	x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[8]), simde_mm256_load_pd(&rv[8])));			    
+	x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[12]), simde_mm256_load_pd(&rv[12])));				    
+	x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[16]), simde_mm256_load_pd(&rv[16])));
 	
 	x1v = hadd4(x1v, x2v);			
-#ifdef _FMA
+#ifdef SIMDE_FMA_NATIVE
 	for(k = 0; k < 5; k++) 
 	  {
-	    __m256d evv = _mm256_load_pd(&ev[k*4]);
+	    simde__m256d evv = simde_mm256_load_pd(&ev[k*4]);
 	    vv[k] = FMAMACC(vv[k],x1v,evv);
 	  }
 #else	      
-	__m256d 
+	simde__m256d 
 	  evv[5];
 	
-	evv[0] = _mm256_load_pd(&ev[0]);
-	evv[1] = _mm256_load_pd(&ev[4]);
-	evv[2] = _mm256_load_pd(&ev[8]);
-	evv[3] = _mm256_load_pd(&ev[12]);
-	evv[4] = _mm256_load_pd(&ev[16]);		
-	
-	vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
-	vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
-	vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
-	vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
-	vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      	
+	evv[0] = simde_mm256_load_pd(&ev[0]);
+	evv[1] = simde_mm256_load_pd(&ev[4]);
+	evv[2] = simde_mm256_load_pd(&ev[8]);
+	evv[3] = simde_mm256_load_pd(&ev[12]);
+	evv[4] = simde_mm256_load_pd(&ev[16]);		
+	
+	vv[0] = simde_mm256_add_pd(vv[0], simde_mm256_mul_pd(x1v, evv[0]));
+	vv[1] = simde_mm256_add_pd(vv[1], simde_mm256_mul_pd(x1v, evv[1]));
+	vv[2] = simde_mm256_add_pd(vv[2], simde_mm256_mul_pd(x1v, evv[2]));
+	vv[3] = simde_mm256_add_pd(vv[3], simde_mm256_mul_pd(x1v, evv[3]));
+	vv[4] = simde_mm256_add_pd(vv[4], simde_mm256_mul_pd(x1v, evv[4]));				      	
 #endif
       }	  
 
 
      if(tipCase != TIP_TIP)
        {
-	 __m256d minlikelihood_avx = _mm256_set1_pd( minlikelihood );
+	 simde__m256d minlikelihood_avx = simde_mm256_set1_pd( minlikelihood );
 	  
 	 scale = 1;
 	  
 	 for(l = 0; scale && (l < 20); l += 4)
 	   {	       
-	     __m256d 
-	       v1 = _mm256_and_pd(vv[l / 4], absMask_AVX.m);
-	     v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	     simde__m256d 
+	       v1 = simde_mm256_and_pd(vv[l / 4], absMask_AVX.m);
+	     v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 	     
-	     if(_mm256_movemask_pd( v1 ) != 15)
+	     if(simde_mm256_movemask_pd( v1 ) != 15)
 	       scale = 0;
 	   }	    	  	  
 
 	 if(scale)
 	   {
-	      __m256d 
-		twoto = _mm256_set1_pd(twotothe256);
+	      simde__m256d 
+		twoto = simde_mm256_set1_pd(twotothe256);
 	      
 	      for(l = 0; l < 20; l += 4)
-		vv[l / 4] = _mm256_mul_pd(vv[l / 4] , twoto);		    		 	      	     	      
+		vv[l / 4] = simde_mm256_mul_pd(vv[l / 4] , twoto);		    		 	      	     	      
 	   
 	      scaleGap = 1;
 	   }
        }
 
-     _mm256_store_pd(&v[0], vv[0]);
-     _mm256_store_pd(&v[4], vv[1]);
-     _mm256_store_pd(&v[8], vv[2]);
-     _mm256_store_pd(&v[12], vv[3]);
-     _mm256_store_pd(&v[16], vv[4]);     
+     simde_mm256_store_pd(&v[0], vv[0]);
+     simde_mm256_store_pd(&v[4], vv[1]);
+     simde_mm256_store_pd(&v[8], vv[2]);
+     simde_mm256_store_pd(&v[12], vv[3]);
+     simde_mm256_store_pd(&v[16], vv[4]);     
   }
 
 
@@ -1808,79 +1805,79 @@
 		else	 	  
 		  ri =  &right[cptr[i] * 400];
 
-		__m256d vv[5];
+		simde__m256d vv[5];
 		
-		vv[0] = _mm256_setzero_pd();
-		vv[1] = _mm256_setzero_pd();
-		vv[2] = _mm256_setzero_pd();
-		vv[3] = _mm256_setzero_pd();
-		vv[4] = _mm256_setzero_pd();	   	    
+		vv[0] = simde_mm256_setzero_pd();
+		vv[1] = simde_mm256_setzero_pd();
+		vv[2] = simde_mm256_setzero_pd();
+		vv[3] = simde_mm256_setzero_pd();
+		vv[4] = simde_mm256_setzero_pd();	   	    
 		
 		for(l = 0; l < 20; l++)
 		  {	       
-		    __m256d 
-		      x1v = _mm256_setzero_pd(),
-		      x2v = _mm256_setzero_pd();	
+		    simde__m256d 
+		      x1v = simde_mm256_setzero_pd(),
+		      x2v = simde_mm256_setzero_pd();	
 		    
 		    double 
 		      *ev = &extEV[l * 20],
 		      *lv = &le[l * 20],
 		      *rv = &ri[l * 20];														
 		    
-#ifdef _FMA		
+#ifdef SIMDE_FMA_NATIVE		
 		    for(k = 0; k < 20; k += 4) 
 		      {
-			__m256d vlv = _mm256_load_pd(&vl[k]);
-			__m256d lvv = _mm256_load_pd(&lv[k]);
+			simde__m256d vlv = simde_mm256_load_pd(&vl[k]);
+			simde__m256d lvv = simde_mm256_load_pd(&lv[k]);
 			x1v = FMAMACC(x1v,vlv,lvv);
-			__m256d vrv = _mm256_load_pd(&vr[k]);
-			__m256d rvv = _mm256_load_pd(&rv[k]);
+			simde__m256d vrv = simde_mm256_load_pd(&vr[k]);
+			simde__m256d rvv = simde_mm256_load_pd(&rv[k]);
 			x2v = FMAMACC(x2v,vrv,rvv);
 		      }
 #else		
-		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
-		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
-		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
-		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
-		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
-		    
-		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
-		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
-		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
-		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
-		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));	
+		    x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[0]), simde_mm256_load_pd(&lv[0])));
+		    x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[4]), simde_mm256_load_pd(&lv[4])));
+		    x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[8]), simde_mm256_load_pd(&lv[8])));
+		    x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[12]), simde_mm256_load_pd(&lv[12])));
+		    x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[16]), simde_mm256_load_pd(&lv[16])));
+		    
+		    x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[0]), simde_mm256_load_pd(&rv[0])));			    
+		    x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[4]), simde_mm256_load_pd(&rv[4])));				    
+		    x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[8]), simde_mm256_load_pd(&rv[8])));			    
+		    x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[12]), simde_mm256_load_pd(&rv[12])));				    
+		    x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[16]), simde_mm256_load_pd(&rv[16])));	
 #endif
 		    
 		    x1v = hadd4(x1v, x2v);			
-#ifdef _FMA
+#ifdef SIMDE_FMA_NATIVE
 		    for(k = 0; k < 5; k++) 
 		      {
-			__m256d evv = _mm256_load_pd(&ev[k*4]);
+			simde__m256d evv = simde_mm256_load_pd(&ev[k*4]);
 			vv[k] = FMAMACC(vv[k],x1v,evv);
 		      }	  
 #else		
-		    __m256d 
+		    simde__m256d 
 		      evv[5];
 		    
-		    evv[0] = _mm256_load_pd(&ev[0]);
-		    evv[1] = _mm256_load_pd(&ev[4]);
-		    evv[2] = _mm256_load_pd(&ev[8]);
-		    evv[3] = _mm256_load_pd(&ev[12]);
-		    evv[4] = _mm256_load_pd(&ev[16]);		
-		    
-		    vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
-		    vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
-		    vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
-		    vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
-		    vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      		      	  
+		    evv[0] = simde_mm256_load_pd(&ev[0]);
+		    evv[1] = simde_mm256_load_pd(&ev[4]);
+		    evv[2] = simde_mm256_load_pd(&ev[8]);
+		    evv[3] = simde_mm256_load_pd(&ev[12]);
+		    evv[4] = simde_mm256_load_pd(&ev[16]);		
+		    
+		    vv[0] = simde_mm256_add_pd(vv[0], simde_mm256_mul_pd(x1v, evv[0]));
+		    vv[1] = simde_mm256_add_pd(vv[1], simde_mm256_mul_pd(x1v, evv[1]));
+		    vv[2] = simde_mm256_add_pd(vv[2], simde_mm256_mul_pd(x1v, evv[2]));
+		    vv[3] = simde_mm256_add_pd(vv[3], simde_mm256_mul_pd(x1v, evv[3]));
+		    vv[4] = simde_mm256_add_pd(vv[4], simde_mm256_mul_pd(x1v, evv[4]));				      		      	  
 #endif
 		  }
 		
-		_mm256_store_pd(&v[0], vv[0]);
-		_mm256_store_pd(&v[4], vv[1]);
-		_mm256_store_pd(&v[8], vv[2]);
-		_mm256_store_pd(&v[12], vv[3]);
-		_mm256_store_pd(&v[16], vv[4]);
+		simde_mm256_store_pd(&v[0], vv[0]);
+		simde_mm256_store_pd(&v[4], vv[1]);
+		simde_mm256_store_pd(&v[8], vv[2]);
+		simde_mm256_store_pd(&v[12], vv[3]);
+		simde_mm256_store_pd(&v[16], vv[4]);
 
 		x3_ptr += 20;
 	      }
@@ -1924,93 +1921,93 @@
 		  x2_ptr += 20;
 		}	  	  
 	  
-	      __m256d vv[5];
+	      simde__m256d vv[5];
 	      
-	      vv[0] = _mm256_setzero_pd();
-	      vv[1] = _mm256_setzero_pd();
-	      vv[2] = _mm256_setzero_pd();
-	      vv[3] = _mm256_setzero_pd();
-	      vv[4] = _mm256_setzero_pd();
+	      vv[0] = simde_mm256_setzero_pd();
+	      vv[1] = simde_mm256_setzero_pd();
+	      vv[2] = simde_mm256_setzero_pd();
+	      vv[3] = simde_mm256_setzero_pd();
+	      vv[4] = simde_mm256_setzero_pd();
 	      	      	      
 	      for(l = 0; l < 20; l++)
 		{	       
-		  __m256d 
-		    x1v = _mm256_setzero_pd(),
-		    x2v = _mm256_setzero_pd();	
+		  simde__m256d 
+		    x1v = simde_mm256_setzero_pd(),
+		    x2v = simde_mm256_setzero_pd();	
 		  
 		  double 
 		    *ev = &extEV[l * 20],
 		    *lv = &le[l * 20],
 		    *rv = &ri[l * 20];														
-#ifdef _FMA
+#ifdef SIMDE_FMA_NATIVE
 		  for(k = 0; k < 20; k += 4) 
 		    {
-		      __m256d vlv = _mm256_load_pd(&vl[k]);
-		      __m256d lvv = _mm256_load_pd(&lv[k]);
+		      simde__m256d vlv = simde_mm256_load_pd(&vl[k]);
+		      simde__m256d lvv = simde_mm256_load_pd(&lv[k]);
 		      x1v = FMAMACC(x1v,vlv,lvv);
-		      __m256d vrv = _mm256_load_pd(&vr[k]);
-		      __m256d rvv = _mm256_load_pd(&rv[k]);
+		      simde__m256d vrv = simde_mm256_load_pd(&vr[k]);
+		      simde__m256d rvv = simde_mm256_load_pd(&rv[k]);
 		      x2v = FMAMACC(x2v,vrv,rvv);
 		    }
 #else	      
-		  x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
-		  x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
-		  x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
-		  x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
-		  x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
-		  
-		  x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
-		  x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
-		  x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
-		  x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
-		  x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));
+		  x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[0]), simde_mm256_load_pd(&lv[0])));
+		  x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[4]), simde_mm256_load_pd(&lv[4])));
+		  x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[8]), simde_mm256_load_pd(&lv[8])));
+		  x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[12]), simde_mm256_load_pd(&lv[12])));
+		  x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[16]), simde_mm256_load_pd(&lv[16])));
+		  
+		  x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[0]), simde_mm256_load_pd(&rv[0])));			    
+		  x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[4]), simde_mm256_load_pd(&rv[4])));				    
+		  x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[8]), simde_mm256_load_pd(&rv[8])));			    
+		  x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[12]), simde_mm256_load_pd(&rv[12])));				    
+		  x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[16]), simde_mm256_load_pd(&rv[16])));
 #endif
 		  
 		  x1v = hadd4(x1v, x2v);			
 		  
-		  __m256d 
+		  simde__m256d 
 		    evv[5];
 		  
-		  evv[0] = _mm256_load_pd(&ev[0]);
-		  evv[1] = _mm256_load_pd(&ev[4]);
-		  evv[2] = _mm256_load_pd(&ev[8]);
-		  evv[3] = _mm256_load_pd(&ev[12]);
-		  evv[4] = _mm256_load_pd(&ev[16]);		
+		  evv[0] = simde_mm256_load_pd(&ev[0]);
+		  evv[1] = simde_mm256_load_pd(&ev[4]);
+		  evv[2] = simde_mm256_load_pd(&ev[8]);
+		  evv[3] = simde_mm256_load_pd(&ev[12]);
+		  evv[4] = simde_mm256_load_pd(&ev[16]);		
 		  
-#ifdef _FMA
+#ifdef SIMDE_FMA_NATIVE
 		  for(k = 0; k < 5; k++)
 		    vv[k] = FMAMACC(vv[k],x1v,evv[k]);		 
 #else	      
-		  vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
-		  vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
-		  vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
-		  vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
-		  vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      	
+		  vv[0] = simde_mm256_add_pd(vv[0], simde_mm256_mul_pd(x1v, evv[0]));
+		  vv[1] = simde_mm256_add_pd(vv[1], simde_mm256_mul_pd(x1v, evv[1]));
+		  vv[2] = simde_mm256_add_pd(vv[2], simde_mm256_mul_pd(x1v, evv[2]));
+		  vv[3] = simde_mm256_add_pd(vv[3], simde_mm256_mul_pd(x1v, evv[3]));
+		  vv[4] = simde_mm256_add_pd(vv[4], simde_mm256_mul_pd(x1v, evv[4]));				      	
 #endif
 		}	  
 
 	   	     
-	      __m256d minlikelihood_avx = _mm256_set1_pd( minlikelihood );
+	      simde__m256d minlikelihood_avx = simde_mm256_set1_pd( minlikelihood );
 	  
 	      scale = 1;
 	      
 	      for(l = 0; scale && (l < 20); l += 4)
 		{	       
-		  __m256d 
-		    v1 = _mm256_and_pd(vv[l / 4], absMask_AVX.m);
-		  v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+		  simde__m256d 
+		    v1 = simde_mm256_and_pd(vv[l / 4], absMask_AVX.m);
+		  v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 		  
-		  if(_mm256_movemask_pd( v1 ) != 15)
+		  if(simde_mm256_movemask_pd( v1 ) != 15)
 		    scale = 0;
 		}	    	  	  
 	 
 	      if(scale)
 		{
-		  __m256d 
-		    twoto = _mm256_set1_pd(twotothe256);
+		  simde__m256d 
+		    twoto = simde_mm256_set1_pd(twotothe256);
 		  
 		  for(l = 0; l < 20; l += 4)
-		    vv[l / 4] = _mm256_mul_pd(vv[l / 4] , twoto);		    		 
+		    vv[l / 4] = simde_mm256_mul_pd(vv[l / 4] , twoto);		    		 
 		  
 		  if(useFastScaling)
 		    addScale += wgt[i];
@@ -2018,11 +2015,11 @@
 		    ex3[i]  += 1;	      
 		}
 
-	      _mm256_store_pd(&v[0], vv[0]);
-	      _mm256_store_pd(&v[4], vv[1]);
-	      _mm256_store_pd(&v[8], vv[2]);
-	      _mm256_store_pd(&v[12], vv[3]);
-	      _mm256_store_pd(&v[16], vv[4]);	       
+	      simde_mm256_store_pd(&v[0], vv[0]);
+	      simde_mm256_store_pd(&v[4], vv[1]);
+	      simde_mm256_store_pd(&v[8], vv[2]);
+	      simde_mm256_store_pd(&v[12], vv[3]);
+	      simde_mm256_store_pd(&v[16], vv[4]);	       
 	      
 	      x3_ptr += 20;
 	    }
@@ -2070,84 +2067,84 @@
 		    x2_ptr += 20;
 		  }	 	  	 
 		
-		__m256d vv[5];
+		simde__m256d vv[5];
 		
-		vv[0] = _mm256_setzero_pd();
-		vv[1] = _mm256_setzero_pd();
-		vv[2] = _mm256_setzero_pd();
-		vv[3] = _mm256_setzero_pd();
-		vv[4] = _mm256_setzero_pd();
+		vv[0] = simde_mm256_setzero_pd();
+		vv[1] = simde_mm256_setzero_pd();
+		vv[2] = simde_mm256_setzero_pd();
+		vv[3] = simde_mm256_setzero_pd();
+		vv[4] = simde_mm256_setzero_pd();
 		
 		for(l = 0; l < 20; l++)
 		  {	       
-		    __m256d 
-		      x1v = _mm256_setzero_pd(),
-		      x2v = _mm256_setzero_pd();	
+		    simde__m256d 
+		      x1v = simde_mm256_setzero_pd(),
+		      x2v = simde_mm256_setzero_pd();	
 		    
 		    double 
 		      *ev = &extEV[l * 20],
 		      *lv = &le[l * 20],
 		      *rv = &ri[l * 20];														
 		    
-		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
-		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
-		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
-		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
-		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
-		    
-		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
-		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
-		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
-		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
-		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));
+		    x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[0]), simde_mm256_load_pd(&lv[0])));
+		    x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[4]), simde_mm256_load_pd(&lv[4])));
+		    x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[8]), simde_mm256_load_pd(&lv[8])));
+		    x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[12]), simde_mm256_load_pd(&lv[12])));
+		    x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[16]), simde_mm256_load_pd(&lv[16])));
+		    
+		    x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[0]), simde_mm256_load_pd(&rv[0])));			    
+		    x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[4]), simde_mm256_load_pd(&rv[4])));				    
+		    x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[8]), simde_mm256_load_pd(&rv[8])));			    
+		    x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[12]), simde_mm256_load_pd(&rv[12])));				    
+		    x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[16]), simde_mm256_load_pd(&rv[16])));
 		    
 		    x1v = hadd4(x1v, x2v);			
-#ifdef _FMA
+#ifdef SIMDE_FMA_NATIVE
 		    for(k = 0; k < 5; k++) 
 		      {
-			__m256d evv = _mm256_load_pd(&ev[k*4]);
+			simde__m256d evv = simde_mm256_load_pd(&ev[k*4]);
 			vv[k] = FMAMACC(vv[k],x1v,evv);
 		      }
 #else	      
-		    __m256d 
+		    simde__m256d 
 		      evv[5];
 		    
-		    evv[0] = _mm256_load_pd(&ev[0]);
-		    evv[1] = _mm256_load_pd(&ev[4]);
-		    evv[2] = _mm256_load_pd(&ev[8]);
-		    evv[3] = _mm256_load_pd(&ev[12]);
-		    evv[4] = _mm256_load_pd(&ev[16]);		
-		    
-		    vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
-		    vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
-		    vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
-		    vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
-		    vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      	
+		    evv[0] = simde_mm256_load_pd(&ev[0]);
+		    evv[1] = simde_mm256_load_pd(&ev[4]);
+		    evv[2] = simde_mm256_load_pd(&ev[8]);
+		    evv[3] = simde_mm256_load_pd(&ev[12]);
+		    evv[4] = simde_mm256_load_pd(&ev[16]);		
+		    
+		    vv[0] = simde_mm256_add_pd(vv[0], simde_mm256_mul_pd(x1v, evv[0]));
+		    vv[1] = simde_mm256_add_pd(vv[1], simde_mm256_mul_pd(x1v, evv[1]));
+		    vv[2] = simde_mm256_add_pd(vv[2], simde_mm256_mul_pd(x1v, evv[2]));
+		    vv[3] = simde_mm256_add_pd(vv[3], simde_mm256_mul_pd(x1v, evv[3]));
+		    vv[4] = simde_mm256_add_pd(vv[4], simde_mm256_mul_pd(x1v, evv[4]));				      	
 #endif
 		  }	  
 
 	   	     
-		__m256d minlikelihood_avx = _mm256_set1_pd( minlikelihood );
+		simde__m256d minlikelihood_avx = simde_mm256_set1_pd( minlikelihood );
 		
 		scale = 1;
 		
 		for(l = 0; scale && (l < 20); l += 4)
 		  {	       
-		    __m256d 
-		      v1 = _mm256_and_pd(vv[l / 4], absMask_AVX.m);
-		    v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+		    simde__m256d 
+		      v1 = simde_mm256_and_pd(vv[l / 4], absMask_AVX.m);
+		    v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 		    
-		    if(_mm256_movemask_pd( v1 ) != 15)
+		    if(simde_mm256_movemask_pd( v1 ) != 15)
 		      scale = 0;
 		  }	    	  	  
 		
 		if(scale)
 		  {
-		    __m256d 
-		      twoto = _mm256_set1_pd(twotothe256);
+		    simde__m256d 
+		      twoto = simde_mm256_set1_pd(twotothe256);
 		    
 		    for(l = 0; l < 20; l += 4)
-		      vv[l / 4] = _mm256_mul_pd(vv[l / 4] , twoto);		    		 
+		      vv[l / 4] = simde_mm256_mul_pd(vv[l / 4] , twoto);		    		 
 		    
 		    if(useFastScaling)
 		      addScale += wgt[i];
@@ -2155,11 +2152,11 @@
 		      ex3[i]  += 1;	      
 		  }
 
-		_mm256_store_pd(&v[0], vv[0]);
-		_mm256_store_pd(&v[4], vv[1]);
-		_mm256_store_pd(&v[8], vv[2]);
-		_mm256_store_pd(&v[12], vv[3]);
-		_mm256_store_pd(&v[16], vv[4]);
+		simde_mm256_store_pd(&v[0], vv[0]);
+		simde_mm256_store_pd(&v[4], vv[1]);
+		simde_mm256_store_pd(&v[8], vv[2]);
+		simde_mm256_store_pd(&v[12], vv[3]);
+		simde_mm256_store_pd(&v[16], vv[4]);
 
 		 x3_ptr += 20;
 	     }
@@ -2203,11 +2200,11 @@
 
 
 #if GCC_VERSION < 40500
-   __m256d
-    bitmask = _mm256_set_pd(0,0,0,-1);
+   simde__m256d
+    bitmask = simde_mm256_set_pd(0,0,0,-1);
 #else
-  __m256i
-    bitmask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+  simde__m256i
+    bitmask = simde_mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
 #endif 
   
   switch(tipCase) 
@@ -2230,28 +2227,28 @@
 		  *ll =  &left[k * 20],
 		  *rr =  &right[k * 20];
 		
-		__m256d 
-		  umpX1v = _mm256_setzero_pd(),
-		  umpX2v = _mm256_setzero_pd();
+		simde__m256d 
+		  umpX1v = simde_mm256_setzero_pd(),
+		  umpX2v = simde_mm256_setzero_pd();
 		
 		for(l = 0; l < 20; l+=4) 
 		  {
-		    __m256d vv = _mm256_load_pd(&v[l]);
-#ifdef _FMA
-		    __m256d llv = _mm256_load_pd(&ll[l]);
+		    simde__m256d vv = simde_mm256_load_pd(&v[l]);
+#ifdef SIMDE_FMA_NATIVE
+		    simde__m256d llv = simde_mm256_load_pd(&ll[l]);
 		    umpX1v = FMAMACC(umpX1v,vv,llv);
-		    __m256d rrv = _mm256_load_pd(&rr[l]);
+		    simde__m256d rrv = simde_mm256_load_pd(&rr[l]);
 		    umpX2v = FMAMACC(umpX2v,vv,rrv);
 #else		    
-		    umpX1v = _mm256_add_pd(umpX1v,_mm256_mul_pd(vv,_mm256_load_pd(&ll[l])));
-		    umpX2v = _mm256_add_pd(umpX2v,_mm256_mul_pd(vv,_mm256_load_pd(&rr[l])));
+		    umpX1v = simde_mm256_add_pd(umpX1v,simde_mm256_mul_pd(vv,simde_mm256_load_pd(&ll[l])));
+		    umpX2v = simde_mm256_add_pd(umpX2v,simde_mm256_mul_pd(vv,simde_mm256_load_pd(&rr[l])));
 #endif
 		  }
 		
 		umpX1v = hadd3(umpX1v);
 		umpX2v = hadd3(umpX2v);
-		_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
-		_mm256_maskstore_pd(&umpX2[80 * i + k], bitmask, umpX2v);
+		simde_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
+		simde_mm256_maskstore_pd(&umpX2[80 * i + k], bitmask, umpX2v);
 	      } 	     
 	  }
 
@@ -2262,61 +2259,61 @@
 	   
 	    for(j = 0; j < 4; j++) 
 	      {     	
-		__m256d vv[5];  
+		simde__m256d vv[5];  
 
 		v = &x3[i * 80 + j * 20];
 			
-		vv[0] = _mm256_setzero_pd();
-		vv[1] = _mm256_setzero_pd();
-		vv[2] = _mm256_setzero_pd();
-		vv[3] = _mm256_setzero_pd();
-		vv[4] = _mm256_setzero_pd();
+		vv[0] = simde_mm256_setzero_pd();
+		vv[1] = simde_mm256_setzero_pd();
+		vv[2] = simde_mm256_setzero_pd();
+		vv[3] = simde_mm256_setzero_pd();
+		vv[4] = simde_mm256_setzero_pd();
 
 		for(k = 0; k < 20; k++) 
 		  {			 
 		    x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
 
-		    __m256d x1px2v = _mm256_set1_pd(x1px2);		    
+		    simde__m256d x1px2v = simde_mm256_set1_pd(x1px2);		    
 		    
-		    __m256d extEvv = _mm256_load_pd(&extEV[20 * k]);
-#ifdef _FMA
+		    simde__m256d extEvv = simde_mm256_load_pd(&extEV[20 * k]);
+#ifdef SIMDE_FMA_NATIVE
 		    vv[0] = FMAMACC(vv[0],x1px2v,extEvv);
 #else
-		    vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v,extEvv));
+		    vv[0] = simde_mm256_add_pd(vv[0],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		    _mm256_store_pd(&v[0],vv[0]);
+		    simde_mm256_store_pd(&v[0],vv[0]);
 		    
-		    extEvv = _mm256_load_pd(&extEV[20 * k + 4]);
-#ifdef _FMA
+		    extEvv = simde_mm256_load_pd(&extEV[20 * k + 4]);
+#ifdef SIMDE_FMA_NATIVE
 		    vv[1] = FMAMACC(vv[1],x1px2v,extEvv);
 #else
-		    vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v,extEvv));
+		    vv[1] = simde_mm256_add_pd(vv[1],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		    _mm256_store_pd(&v[4],vv[1]);
+		    simde_mm256_store_pd(&v[4],vv[1]);
 
-		    extEvv = _mm256_load_pd(&extEV[20 * k + 8]);
-#ifdef _FMA
+		    extEvv = simde_mm256_load_pd(&extEV[20 * k + 8]);
+#ifdef SIMDE_FMA_NATIVE
 		    vv[2] = FMAMACC(vv[2],x1px2v,extEvv);
 #else
-		    vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v,extEvv));
+		    vv[2] = simde_mm256_add_pd(vv[2],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		    _mm256_store_pd(&v[8],vv[2]);
+		    simde_mm256_store_pd(&v[8],vv[2]);
 
-		    extEvv = _mm256_load_pd(&extEV[20 * k + 12]);
-#ifdef _FMA
+		    extEvv = simde_mm256_load_pd(&extEV[20 * k + 12]);
+#ifdef SIMDE_FMA_NATIVE
 		    vv[3] = FMAMACC(vv[3],x1px2v,extEvv);
 #else
-		    vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v,extEvv));
+		    vv[3] = simde_mm256_add_pd(vv[3],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		    _mm256_store_pd(&v[12],vv[3]);
+		    simde_mm256_store_pd(&v[12],vv[3]);
 
-		    extEvv = _mm256_load_pd(&extEV[20 * k + 16]);
-#ifdef _FMA
+		    extEvv = simde_mm256_load_pd(&extEV[20 * k + 16]);
+#ifdef SIMDE_FMA_NATIVE
 		    vv[4] = FMAMACC(vv[4],x1px2v,extEvv);
 #else
-		    vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v,extEvv));
+		    vv[4] = simde_mm256_add_pd(vv[4],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		    _mm256_store_pd(&v[16],vv[4]);
+		    simde_mm256_store_pd(&v[16],vv[4]);
 		  } 
 	      } 
 	  } 
@@ -2335,20 +2332,20 @@
 
 	    for(k = 0; k < 80; k++) 
 	      {
-		__m256d umpX1v = _mm256_setzero_pd();
+		simde__m256d umpX1v = simde_mm256_setzero_pd();
 		for(l = 0; l < 20; l+=4) 
 		  {
-		    __m256d vv = _mm256_load_pd(&v[l]);
-		    __m256d leftv = _mm256_load_pd(&left[k * 20 + l]);
-#ifdef _FMA
+		    simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		    simde__m256d leftv = simde_mm256_load_pd(&left[k * 20 + l]);
+#ifdef SIMDE_FMA_NATIVE
 		   
 		    umpX1v = FMAMACC(umpX1v, vv, leftv);
 #else
-		    umpX1v = _mm256_add_pd(umpX1v, _mm256_mul_pd(vv, leftv));
+		    umpX1v = simde_mm256_add_pd(umpX1v, simde_mm256_mul_pd(vv, leftv));
 #endif
 		  }
 		umpX1v = hadd3(umpX1v);
-		_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
+		simde_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
 	      } 
 	  }
 	
@@ -2362,131 +2359,131 @@
 		
 		for(l = 0; l < 20; l++) 
 		  {
-		    __m256d ump_x2v = _mm256_setzero_pd();
+		    simde__m256d ump_x2v = simde_mm256_setzero_pd();
 		    		  
-		    __m256d vv = _mm256_load_pd(&v[0]);
-		    __m256d rightv = _mm256_load_pd(&right[k*400+l*20+0]);
-#ifdef _FMA
+		    simde__m256d vv = simde_mm256_load_pd(&v[0]);
+		    simde__m256d rightv = simde_mm256_load_pd(&right[k*400+l*20+0]);
+#ifdef SIMDE_FMA_NATIVE
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 		    
-		    vv = _mm256_load_pd(&v[4]);
-		    rightv = _mm256_load_pd(&right[k*400+l*20+4]);
-#ifdef _FMA
+		    vv = simde_mm256_load_pd(&v[4]);
+		    rightv = simde_mm256_load_pd(&right[k*400+l*20+4]);
+#ifdef SIMDE_FMA_NATIVE
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 
-		    vv = _mm256_load_pd(&v[8]);
-		    rightv = _mm256_load_pd(&right[k*400+l*20+8]);
-#ifdef _FMA
+		    vv = simde_mm256_load_pd(&v[8]);
+		    rightv = simde_mm256_load_pd(&right[k*400+l*20+8]);
+#ifdef SIMDE_FMA_NATIVE
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 
-		    vv = _mm256_load_pd(&v[12]);
-		    rightv = _mm256_load_pd(&right[k*400+l*20+12]);
-#ifdef _FMA
+		    vv = simde_mm256_load_pd(&v[12]);
+		    rightv = simde_mm256_load_pd(&right[k*400+l*20+12]);
+#ifdef SIMDE_FMA_NATIVE
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 
-		    vv = _mm256_load_pd(&v[16]);
-		    rightv = _mm256_load_pd(&right[k*400+l*20+16]);
-#ifdef _FMA
+		    vv = simde_mm256_load_pd(&v[16]);
+		    rightv = simde_mm256_load_pd(&right[k*400+l*20+16]);
+#ifdef SIMDE_FMA_NATIVE
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 		    
 		    ump_x2v = hadd3(ump_x2v);
-		    _mm256_maskstore_pd(&ump_x2[l], bitmask, ump_x2v);
+		    simde_mm256_maskstore_pd(&ump_x2[l], bitmask, ump_x2v);
 		  }
 		
 		v = &(x3[80 * i + 20 * k]);
 	
 
-		__m256d vv[5]; 
+		simde__m256d vv[5]; 
 
-		vv[0] = _mm256_setzero_pd();
-		vv[1] = _mm256_setzero_pd();
-		vv[2] = _mm256_setzero_pd();
-		vv[3] = _mm256_setzero_pd();
-		vv[4] = _mm256_setzero_pd();
+		vv[0] = simde_mm256_setzero_pd();
+		vv[1] = simde_mm256_setzero_pd();
+		vv[2] = simde_mm256_setzero_pd();
+		vv[3] = simde_mm256_setzero_pd();
+		vv[4] = simde_mm256_setzero_pd();
 		
 		for(l = 0; l < 20; l++) 
 		  {
 		    x1px2 = uX1[k * 20 + l]	* ump_x2[l];
-		    __m256d x1px2v = _mm256_set1_pd(x1px2);	
+		    simde__m256d x1px2v = simde_mm256_set1_pd(x1px2);	
 	    		 
-#ifdef _FMA
-		    __m256d ev = _mm256_load_pd(&extEV[l * 20 + 0]);
+#ifdef SIMDE_FMA_NATIVE
+		    simde__m256d ev = simde_mm256_load_pd(&extEV[l * 20 + 0]);
 		    vv[0] = FMAMACC(vv[0],x1px2v, ev);
 #else
-		    vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 0])));
+		    vv[0] = simde_mm256_add_pd(vv[0],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 0])));
 #endif
-		    _mm256_store_pd(&v[0],vv[0]);
+		    simde_mm256_store_pd(&v[0],vv[0]);
 
-#ifdef _FMA
-		    ev = _mm256_load_pd(&extEV[l * 20 + 4]);
+#ifdef SIMDE_FMA_NATIVE
+		    ev = simde_mm256_load_pd(&extEV[l * 20 + 4]);
 		    vv[1] = FMAMACC(vv[1],x1px2v, ev);
 #else
-		    vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 4])));
+		    vv[1] = simde_mm256_add_pd(vv[1],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 4])));
 #endif
-		    _mm256_store_pd(&v[4],vv[1]);
+		    simde_mm256_store_pd(&v[4],vv[1]);
 
-#ifdef _FMA
-		    ev = _mm256_load_pd(&extEV[l * 20 + 8]);
+#ifdef SIMDE_FMA_NATIVE
+		    ev = simde_mm256_load_pd(&extEV[l * 20 + 8]);
 		    vv[2] = FMAMACC(vv[2],x1px2v, ev);
 #else
-		    vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 8])));
+		    vv[2] = simde_mm256_add_pd(vv[2],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 8])));
 #endif
-		    _mm256_store_pd(&v[8],vv[2]);
+		    simde_mm256_store_pd(&v[8],vv[2]);
 		    
-#ifdef _FMA
-		    ev = _mm256_load_pd(&extEV[l * 20 + 12]);
+#ifdef SIMDE_FMA_NATIVE
+		    ev = simde_mm256_load_pd(&extEV[l * 20 + 12]);
 		    vv[3] = FMAMACC(vv[3],x1px2v, ev);
 #else
-		    vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 12])));
+		    vv[3] = simde_mm256_add_pd(vv[3],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 12])));
 #endif
-		    _mm256_store_pd(&v[12],vv[3]);
+		    simde_mm256_store_pd(&v[12],vv[3]);
 
 
-#ifdef _FMA
-		    ev = _mm256_load_pd(&extEV[l * 20 + 16]);
+#ifdef SIMDE_FMA_NATIVE
+		    ev = simde_mm256_load_pd(&extEV[l * 20 + 16]);
 		    vv[4] = FMAMACC(vv[4],x1px2v, ev);
 #else
-		    vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 16])));
+		    vv[4] = simde_mm256_add_pd(vv[4],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 16])));
 #endif
-		    _mm256_store_pd(&v[16],vv[4]);
+		    simde_mm256_store_pd(&v[16],vv[4]);
 
 		  } 
 	      }
 	   
 	    v = &x3[80 * i];
-	    __m256d minlikelihood_avx = _mm256_set1_pd(minlikelihood);
+	    simde__m256d minlikelihood_avx = simde_mm256_set1_pd(minlikelihood);
 	    scale = 1;
 	    for(l = 0; scale && (l < 80); l += 4) 
 	      {
-		__m256d vv = _mm256_load_pd(&v[l]);
-		__m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
-		vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
-		if(_mm256_movemask_pd(vv_abs) != 15)
+		simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		simde__m256d vv_abs = simde_mm256_and_pd(vv,absMask_AVX.m);
+		vv_abs = simde_mm256_cmp_pd(vv_abs,minlikelihood_avx,SIMDE_CMP_LT_OS);
+		if(simde_mm256_movemask_pd(vv_abs) != 15)
 		  scale = 0;
 	      }
 	    
 	    if(scale) 
 	      {		
-		__m256d twotothe256v = _mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
+		simde__m256d twotothe256v = simde_mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
 		for(l = 0; l < 80; l += 4) 
 		  {
-		    __m256d vv = _mm256_load_pd(&v[l]);
-		    _mm256_store_pd(&v[l],_mm256_mul_pd(vv,twotothe256v));
+		    simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		    simde_mm256_store_pd(&v[l],simde_mm256_mul_pd(vv,twotothe256v));
 		  }
 		if(useFastScaling)
 		  addScale += wgt[i];				
@@ -2507,153 +2504,153 @@
 	      vr = &(x2[80 * i + 20 * k]);
 	      v  = &(x3[80 * i + 20 * k]);	      	   
 
-	      __m256d vv[5]; 
+	      simde__m256d vv[5]; 
 	      
-	      vv[0] = _mm256_setzero_pd();
-	      vv[1] = _mm256_setzero_pd();
-	      vv[2] = _mm256_setzero_pd();
-	      vv[3] = _mm256_setzero_pd();
-	      vv[4] = _mm256_setzero_pd();
+	      vv[0] = simde_mm256_setzero_pd();
+	      vv[1] = simde_mm256_setzero_pd();
+	      vv[2] = simde_mm256_setzero_pd();
+	      vv[3] = simde_mm256_setzero_pd();
+	      vv[4] = simde_mm256_setzero_pd();
 	      
 	      for(l = 0; l < 20; l++) 
 		{		  
-		  __m256d al = _mm256_setzero_pd();
-		  __m256d ar = _mm256_setzero_pd();
+		  simde__m256d al = simde_mm256_setzero_pd();
+		  simde__m256d ar = simde_mm256_setzero_pd();
        		  
-		  __m256d leftv  = _mm256_load_pd(&left[k * 400 + l * 20 + 0]);
-		  __m256d rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 0]);
-		  __m256d vlv = _mm256_load_pd(&vl[0]);
-		  __m256d vrv = _mm256_load_pd(&vr[0]);
+		  simde__m256d leftv  = simde_mm256_load_pd(&left[k * 400 + l * 20 + 0]);
+		  simde__m256d rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 0]);
+		  simde__m256d vlv = simde_mm256_load_pd(&vl[0]);
+		  simde__m256d vrv = simde_mm256_load_pd(&vr[0]);
 		  
-#ifdef _FMA
+#ifdef SIMDE_FMA_NATIVE
 		    
 		  al = FMAMACC(al, vlv, leftv);
 		  ar = FMAMACC(ar, vrv, rightv);
 #else
-		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));		  
+		  al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		  ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));		  
 #endif
 
-		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 4]);
-		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 4]);
-		  vlv = _mm256_load_pd(&vl[4]);
-		  vrv = _mm256_load_pd(&vr[4]);
-#ifdef _FMA
+		  leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 4]);
+		  rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 4]);
+		  vlv = simde_mm256_load_pd(&vl[4]);
+		  vrv = simde_mm256_load_pd(&vr[4]);
+#ifdef SIMDE_FMA_NATIVE
 		    
 		  al = FMAMACC(al, vlv, leftv);
 		  ar = FMAMACC(ar, vrv, rightv);
 #else
-		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+		  al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		  ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 
-		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 8]);
-		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 8]);
-		  vlv = _mm256_load_pd(&vl[8]);
-		  vrv = _mm256_load_pd(&vr[8]);
-#ifdef _FMA
+		  leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 8]);
+		  rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 8]);
+		  vlv = simde_mm256_load_pd(&vl[8]);
+		  vrv = simde_mm256_load_pd(&vr[8]);
+#ifdef SIMDE_FMA_NATIVE
 		    
 		  al = FMAMACC(al, vlv, leftv);
 		  ar = FMAMACC(ar, vrv, rightv);
 #else
-		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+		  al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		  ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 
-		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 12]);
-		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 12]);
-		  vlv = _mm256_load_pd(&vl[12]);
-		  vrv = _mm256_load_pd(&vr[12]);
-#ifdef _FMA
+		  leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 12]);
+		  rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 12]);
+		  vlv = simde_mm256_load_pd(&vl[12]);
+		  vrv = simde_mm256_load_pd(&vr[12]);
+#ifdef SIMDE_FMA_NATIVE
 		    
 		  al = FMAMACC(al, vlv, leftv);
 		  ar = FMAMACC(ar, vrv, rightv);
 #else
-		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+		  al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		  ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 
-		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 16]);
-		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 16]);
-		  vlv = _mm256_load_pd(&vl[16]);
-		  vrv = _mm256_load_pd(&vr[16]);
+		  leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 16]);
+		  rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 16]);
+		  vlv = simde_mm256_load_pd(&vl[16]);
+		  vrv = simde_mm256_load_pd(&vr[16]);
 
-#ifdef _FMA		    
+#ifdef SIMDE_FMA_NATIVE		    
 		  al = FMAMACC(al, vlv, leftv);
 		  ar = FMAMACC(ar, vrv, rightv);
 #else
-		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+		  al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		  ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 
 		  /**************************************************************************************************************/
 
 		  al = hadd3(al);
 		  ar = hadd3(ar);
-		  al = _mm256_mul_pd(ar,al);
+		  al = simde_mm256_mul_pd(ar,al);
 		  
 		  /************************************************************************************************************/
-#ifdef _FMA		    
-		  __m256d ev =  _mm256_load_pd(&extEV[20 * l + 0]);
+#ifdef SIMDE_FMA_NATIVE		    
+		  simde__m256d ev =  simde_mm256_load_pd(&extEV[20 * l + 0]);
 		  vv[0] = FMAMACC(vv[0], al, ev);		 
 #else
-		  vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 0])));			  		 		  
+		  vv[0] = simde_mm256_add_pd(vv[0],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 0])));			  		 		  
 #endif
-		  _mm256_store_pd(&v[0],vv[0]);
+		  simde_mm256_store_pd(&v[0],vv[0]);
 
-#ifdef _FMA		    
-		  ev =  _mm256_load_pd(&extEV[20 * l + 4]);
+#ifdef SIMDE_FMA_NATIVE		    
+		  ev =  simde_mm256_load_pd(&extEV[20 * l + 4]);
 		  vv[1] = FMAMACC(vv[1], al, ev);		 
 #else
-		  vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 4])));		  		 
+		  vv[1] = simde_mm256_add_pd(vv[1],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 4])));		  		 
 #endif
-		  _mm256_store_pd(&v[4],vv[1]);
+		  simde_mm256_store_pd(&v[4],vv[1]);
 
-#ifdef _FMA		    
-		  ev =  _mm256_load_pd(&extEV[20 * l + 8]);
+#ifdef SIMDE_FMA_NATIVE		    
+		  ev =  simde_mm256_load_pd(&extEV[20 * l + 8]);
 		  vv[2] = FMAMACC(vv[2], al, ev);		 
 #else
-		  vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 8])));		  		 
+		  vv[2] = simde_mm256_add_pd(vv[2],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 8])));		  		 
 #endif
-		  _mm256_store_pd(&v[8],vv[2]);
+		  simde_mm256_store_pd(&v[8],vv[2]);
 
-#ifdef _FMA		    
-		  ev =  _mm256_load_pd(&extEV[20 * l + 12]);
+#ifdef SIMDE_FMA_NATIVE		    
+		  ev =  simde_mm256_load_pd(&extEV[20 * l + 12]);
 		  vv[3] = FMAMACC(vv[3], al, ev);		 
 #else
-		  vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 12])));		  		 
+		  vv[3] = simde_mm256_add_pd(vv[3],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 12])));		  		 
 #endif
-		  _mm256_store_pd(&v[12],vv[3]);
+		  simde_mm256_store_pd(&v[12],vv[3]);
 
-#ifdef _FMA		    
-		  ev =  _mm256_load_pd(&extEV[20 * l + 16]);
+#ifdef SIMDE_FMA_NATIVE		    
+		  ev =  simde_mm256_load_pd(&extEV[20 * l + 16]);
 		  vv[4] = FMAMACC(vv[4], al, ev);		 
 #else
-		  vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 16])));			 	  
+		  vv[4] = simde_mm256_add_pd(vv[4],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 16])));			 	  
 #endif
-		  _mm256_store_pd(&v[16],vv[4]);		 
+		  simde_mm256_store_pd(&v[16],vv[4]);		 
 		} 
 	    }
 	  v = &(x3[80 * i]);
 	  scale = 1;
-	  __m256d minlikelihood_avx = _mm256_set1_pd(minlikelihood);	 
+	  simde__m256d minlikelihood_avx = simde_mm256_set1_pd(minlikelihood);	 
 
 	  for(l = 0; scale && (l < 80); l += 4) 
 	    {
-	      __m256d vv = _mm256_load_pd(&v[l]);
-	      __m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
-	      vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
-	      if(_mm256_movemask_pd(vv_abs) != 15)
+	      simde__m256d vv = simde_mm256_load_pd(&v[l]);
+	      simde__m256d vv_abs = simde_mm256_and_pd(vv,absMask_AVX.m);
+	      vv_abs = simde_mm256_cmp_pd(vv_abs,minlikelihood_avx,SIMDE_CMP_LT_OS);
+	      if(simde_mm256_movemask_pd(vv_abs) != 15)
 		scale = 0;	     
 	    }
 
 	  if(scale) 
 	    {		     	      
-	      __m256d twotothe256v = _mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
+	      simde__m256d twotothe256v = simde_mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
 	      for(l = 0; l < 80; l += 4) 
 		{
-		  __m256d vv = _mm256_load_pd(&v[l]);
-		  _mm256_store_pd(&v[l],_mm256_mul_pd(vv,twotothe256v));
+		  simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		  simde_mm256_store_pd(&v[l],simde_mm256_mul_pd(vv,twotothe256v));
 		}
 	      if(useFastScaling)
 		addScale += wgt[i];					
@@ -2701,11 +2698,11 @@
 
 
 #if GCC_VERSION < 40500
-   __m256d
-    bitmask = _mm256_set_pd(0,0,0,-1);
+   simde__m256d
+    bitmask = simde_mm256_set_pd(0,0,0,-1);
 #else
-  __m256i
-    bitmask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+  simde__m256i
+    bitmask = simde_mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
 #endif 
   
   switch(tipCase) 
@@ -2726,30 +2723,30 @@
 		  *ll =  &left[k * 20],
 		  *rr =  &right[k * 20];
 		
-		__m256d 
-		  umpX1v = _mm256_setzero_pd(),
-		  umpX2v = _mm256_setzero_pd();
+		simde__m256d 
+		  umpX1v = simde_mm256_setzero_pd(),
+		  umpX2v = simde_mm256_setzero_pd();
 		
 		v = &(tipVector[k / 20][20 * i]);
 
 		for(l = 0; l < 20; l+=4) 
 		  {
-		    __m256d vv = _mm256_load_pd(&v[l]);
-#ifdef _FMA
-		    __m256d llv = _mm256_load_pd(&ll[l]);
+		    simde__m256d vv = simde_mm256_load_pd(&v[l]);
+#ifdef SIMDE_FMA_NATIVE
+		    simde__m256d llv = simde_mm256_load_pd(&ll[l]);
 		    umpX1v = FMAMACC(umpX1v,vv,llv);
-		    __m256d rrv = _mm256_load_pd(&rr[l]);
+		    simde__m256d rrv = simde_mm256_load_pd(&rr[l]);
 		    umpX2v = FMAMACC(umpX2v,vv,rrv);
 #else		    
-		    umpX1v = _mm256_add_pd(umpX1v,_mm256_mul_pd(vv,_mm256_load_pd(&ll[l])));
-		    umpX2v = _mm256_add_pd(umpX2v,_mm256_mul_pd(vv,_mm256_load_pd(&rr[l])));
+		    umpX1v = simde_mm256_add_pd(umpX1v,simde_mm256_mul_pd(vv,simde_mm256_load_pd(&ll[l])));
+		    umpX2v = simde_mm256_add_pd(umpX2v,simde_mm256_mul_pd(vv,simde_mm256_load_pd(&rr[l])));
 #endif
 		  }
 		
 		umpX1v = hadd3(umpX1v);
 		umpX2v = hadd3(umpX2v);
-		_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
-		_mm256_maskstore_pd(&umpX2[80 * i + k], bitmask, umpX2v);
+		simde_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
+		simde_mm256_maskstore_pd(&umpX2[80 * i + k], bitmask, umpX2v);
 	      } 
 	  }
 
@@ -2760,61 +2757,61 @@
 	   
 	    for(j = 0; j < 4; j++) 
 	      {     	
-		__m256d vv[5];  
+		simde__m256d vv[5];  
 
 		v = &x3[i * 80 + j * 20];
 			
-		vv[0] = _mm256_setzero_pd();
-		vv[1] = _mm256_setzero_pd();
-		vv[2] = _mm256_setzero_pd();
-		vv[3] = _mm256_setzero_pd();
-		vv[4] = _mm256_setzero_pd();
+		vv[0] = simde_mm256_setzero_pd();
+		vv[1] = simde_mm256_setzero_pd();
+		vv[2] = simde_mm256_setzero_pd();
+		vv[3] = simde_mm256_setzero_pd();
+		vv[4] = simde_mm256_setzero_pd();
 
 		for(k = 0; k < 20; k++) 
 		  {			 
 		    x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
 
-		    __m256d x1px2v = _mm256_set1_pd(x1px2);		    
+		    simde__m256d x1px2v = simde_mm256_set1_pd(x1px2);		    
 		    
-		    __m256d extEvv = _mm256_load_pd(&extEV[j][20 * k]);
-#ifdef _FMA
+		    simde__m256d extEvv = simde_mm256_load_pd(&extEV[j][20 * k]);
+#ifdef SIMDE_FMA_NATIVE
 		    vv[0] = FMAMACC(vv[0],x1px2v,extEvv);
 #else
-		    vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v,extEvv));
+		    vv[0] = simde_mm256_add_pd(vv[0],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		    _mm256_store_pd(&v[0],vv[0]);
+		    simde_mm256_store_pd(&v[0],vv[0]);
 		    
-		    extEvv = _mm256_load_pd(&extEV[j][20 * k + 4]);
-#ifdef _FMA
+		    extEvv = simde_mm256_load_pd(&extEV[j][20 * k + 4]);
+#ifdef SIMDE_FMA_NATIVE
 		    vv[1] = FMAMACC(vv[1],x1px2v,extEvv);
 #else
-		    vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v,extEvv));
+		    vv[1] = simde_mm256_add_pd(vv[1],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		    _mm256_store_pd(&v[4],vv[1]);
+		    simde_mm256_store_pd(&v[4],vv[1]);
 
-		    extEvv = _mm256_load_pd(&extEV[j][20 * k + 8]);
-#ifdef _FMA
+		    extEvv = simde_mm256_load_pd(&extEV[j][20 * k + 8]);
+#ifdef SIMDE_FMA_NATIVE
 		    vv[2] = FMAMACC(vv[2],x1px2v,extEvv);
 #else
-		    vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v,extEvv));
+		    vv[2] = simde_mm256_add_pd(vv[2],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		    _mm256_store_pd(&v[8],vv[2]);
+		    simde_mm256_store_pd(&v[8],vv[2]);
 
-		    extEvv = _mm256_load_pd(&extEV[j][20 * k + 12]);
-#ifdef _FMA
+		    extEvv = simde_mm256_load_pd(&extEV[j][20 * k + 12]);
+#ifdef SIMDE_FMA_NATIVE
 		    vv[3] = FMAMACC(vv[3],x1px2v,extEvv);
 #else
-		    vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v,extEvv));
+		    vv[3] = simde_mm256_add_pd(vv[3],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		    _mm256_store_pd(&v[12],vv[3]);
+		    simde_mm256_store_pd(&v[12],vv[3]);
 
-		    extEvv = _mm256_load_pd(&extEV[j][20 * k + 16]);
-#ifdef _FMA
+		    extEvv = simde_mm256_load_pd(&extEV[j][20 * k + 16]);
+#ifdef SIMDE_FMA_NATIVE
 		    vv[4] = FMAMACC(vv[4],x1px2v,extEvv);
 #else
-		    vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v,extEvv));
+		    vv[4] = simde_mm256_add_pd(vv[4],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		    _mm256_store_pd(&v[16],vv[4]);
+		    simde_mm256_store_pd(&v[16],vv[4]);
 		  } 
 	      } 
 	  } 
@@ -2831,23 +2828,23 @@
 	  {	   
 	    for(k = 0; k < 80; k++) 
 	      {
-		__m256d umpX1v = _mm256_setzero_pd();
+		simde__m256d umpX1v = simde_mm256_setzero_pd();
 		
 		 v = &(tipVector[k / 20][20 * i]);
 
 		for(l = 0; l < 20; l+=4) 
 		  {
-		    __m256d vv = _mm256_load_pd(&v[l]);
-		    __m256d leftv = _mm256_load_pd(&left[k * 20 + l]);
-#ifdef _FMA
+		    simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		    simde__m256d leftv = simde_mm256_load_pd(&left[k * 20 + l]);
+#ifdef SIMDE_FMA_NATIVE
 		   
 		    umpX1v = FMAMACC(umpX1v, vv, leftv);
 #else
-		    umpX1v = _mm256_add_pd(umpX1v, _mm256_mul_pd(vv, leftv));
+		    umpX1v = simde_mm256_add_pd(umpX1v, simde_mm256_mul_pd(vv, leftv));
 #endif
 		  }
 		umpX1v = hadd3(umpX1v);
-		_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
+		simde_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
 	      } 
 	  }
 	
@@ -2861,131 +2858,131 @@
 		
 		for(l = 0; l < 20; l++) 
 		  {
-		    __m256d ump_x2v = _mm256_setzero_pd();
+		    simde__m256d ump_x2v = simde_mm256_setzero_pd();
 		    		  
-		    __m256d vv = _mm256_load_pd(&v[0]);
-		    __m256d rightv = _mm256_load_pd(&right[k*400+l*20+0]);
-#ifdef _FMA
+		    simde__m256d vv = simde_mm256_load_pd(&v[0]);
+		    simde__m256d rightv = simde_mm256_load_pd(&right[k*400+l*20+0]);
+#ifdef SIMDE_FMA_NATIVE
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 		    
-		    vv = _mm256_load_pd(&v[4]);
-		    rightv = _mm256_load_pd(&right[k*400+l*20+4]);
-#ifdef _FMA
+		    vv = simde_mm256_load_pd(&v[4]);
+		    rightv = simde_mm256_load_pd(&right[k*400+l*20+4]);
+#ifdef SIMDE_FMA_NATIVE
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 
-		    vv = _mm256_load_pd(&v[8]);
-		    rightv = _mm256_load_pd(&right[k*400+l*20+8]);
-#ifdef _FMA
+		    vv = simde_mm256_load_pd(&v[8]);
+		    rightv = simde_mm256_load_pd(&right[k*400+l*20+8]);
+#ifdef SIMDE_FMA_NATIVE
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 
-		    vv = _mm256_load_pd(&v[12]);
-		    rightv = _mm256_load_pd(&right[k*400+l*20+12]);
-#ifdef _FMA
+		    vv = simde_mm256_load_pd(&v[12]);
+		    rightv = simde_mm256_load_pd(&right[k*400+l*20+12]);
+#ifdef SIMDE_FMA_NATIVE
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 
-		    vv = _mm256_load_pd(&v[16]);
-		    rightv = _mm256_load_pd(&right[k*400+l*20+16]);
-#ifdef _FMA
+		    vv = simde_mm256_load_pd(&v[16]);
+		    rightv = simde_mm256_load_pd(&right[k*400+l*20+16]);
+#ifdef SIMDE_FMA_NATIVE
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 		    
 		    ump_x2v = hadd3(ump_x2v);
-		    _mm256_maskstore_pd(&ump_x2[l], bitmask, ump_x2v);
+		    simde_mm256_maskstore_pd(&ump_x2[l], bitmask, ump_x2v);
 		  }
 		
 		v = &(x3[80 * i + 20 * k]);
 	
 
-		__m256d vv[5]; 
+		simde__m256d vv[5]; 
 
-		vv[0] = _mm256_setzero_pd();
-		vv[1] = _mm256_setzero_pd();
-		vv[2] = _mm256_setzero_pd();
-		vv[3] = _mm256_setzero_pd();
-		vv[4] = _mm256_setzero_pd();
+		vv[0] = simde_mm256_setzero_pd();
+		vv[1] = simde_mm256_setzero_pd();
+		vv[2] = simde_mm256_setzero_pd();
+		vv[3] = simde_mm256_setzero_pd();
+		vv[4] = simde_mm256_setzero_pd();
 		
 		for(l = 0; l < 20; l++) 
 		  {
 		    x1px2 = uX1[k * 20 + l]	* ump_x2[l];
-		    __m256d x1px2v = _mm256_set1_pd(x1px2);	
+		    simde__m256d x1px2v = simde_mm256_set1_pd(x1px2);	
 	    		 
-#ifdef _FMA
-		    __m256d ev = _mm256_load_pd(&extEV[k][l * 20 + 0]);
+#ifdef SIMDE_FMA_NATIVE
+		    simde__m256d ev = simde_mm256_load_pd(&extEV[k][l * 20 + 0]);
 		    vv[0] = FMAMACC(vv[0],x1px2v, ev);
 #else
-		    vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[k][l * 20 + 0])));
+		    vv[0] = simde_mm256_add_pd(vv[0],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[k][l * 20 + 0])));
 #endif
-		    _mm256_store_pd(&v[0],vv[0]);
+		    simde_mm256_store_pd(&v[0],vv[0]);
 
-#ifdef _FMA
-		    ev = _mm256_load_pd(&extEV[k][l * 20 + 4]);
+#ifdef SIMDE_FMA_NATIVE
+		    ev = simde_mm256_load_pd(&extEV[k][l * 20 + 4]);
 		    vv[1] = FMAMACC(vv[1],x1px2v, ev);
 #else
-		    vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[k][l * 20 + 4])));
+		    vv[1] = simde_mm256_add_pd(vv[1],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[k][l * 20 + 4])));
 #endif
-		    _mm256_store_pd(&v[4],vv[1]);
+		    simde_mm256_store_pd(&v[4],vv[1]);
 
-#ifdef _FMA
-		    ev = _mm256_load_pd(&extEV[k][l * 20 + 8]);
+#ifdef SIMDE_FMA_NATIVE
+		    ev = simde_mm256_load_pd(&extEV[k][l * 20 + 8]);
 		    vv[2] = FMAMACC(vv[2],x1px2v, ev);
 #else
-		    vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[k][l * 20 + 8])));
+		    vv[2] = simde_mm256_add_pd(vv[2],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[k][l * 20 + 8])));
 #endif
-		    _mm256_store_pd(&v[8],vv[2]);
+		    simde_mm256_store_pd(&v[8],vv[2]);
 		    
-#ifdef _FMA
-		    ev = _mm256_load_pd(&extEV[k][l * 20 + 12]);
+#ifdef SIMDE_FMA_NATIVE
+		    ev = simde_mm256_load_pd(&extEV[k][l * 20 + 12]);
 		    vv[3] = FMAMACC(vv[3],x1px2v, ev);
 #else
-		    vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[k][l * 20 + 12])));
+		    vv[3] = simde_mm256_add_pd(vv[3],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[k][l * 20 + 12])));
 #endif
-		    _mm256_store_pd(&v[12],vv[3]);
+		    simde_mm256_store_pd(&v[12],vv[3]);
 
 
-#ifdef _FMA
-		    ev = _mm256_load_pd(&extEV[k][l * 20 + 16]);
+#ifdef SIMDE_FMA_NATIVE
+		    ev = simde_mm256_load_pd(&extEV[k][l * 20 + 16]);
 		    vv[4] = FMAMACC(vv[4],x1px2v, ev);
 #else
-		    vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[k][l * 20 + 16])));
+		    vv[4] = simde_mm256_add_pd(vv[4],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[k][l * 20 + 16])));
 #endif
-		    _mm256_store_pd(&v[16],vv[4]);
+		    simde_mm256_store_pd(&v[16],vv[4]);
 
 		  } 
 	      }
 	   
 	    v = &x3[80 * i];
-	    __m256d minlikelihood_avx = _mm256_set1_pd(minlikelihood);
+	    simde__m256d minlikelihood_avx = simde_mm256_set1_pd(minlikelihood);
 	    scale = 1;
 	    for(l = 0; scale && (l < 80); l += 4) 
 	      {
-		__m256d vv = _mm256_load_pd(&v[l]);
-		__m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
-		vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
-		if(_mm256_movemask_pd(vv_abs) != 15)
+		simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		simde__m256d vv_abs = simde_mm256_and_pd(vv,absMask_AVX.m);
+		vv_abs = simde_mm256_cmp_pd(vv_abs,minlikelihood_avx,SIMDE_CMP_LT_OS);
+		if(simde_mm256_movemask_pd(vv_abs) != 15)
 		  scale = 0;
 	      }
 	    
 	    if(scale) 
 	      {		
-		__m256d twotothe256v = _mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
+		simde__m256d twotothe256v = simde_mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
 		for(l = 0; l < 80; l += 4) 
 		  {
-		    __m256d vv = _mm256_load_pd(&v[l]);
-		    _mm256_store_pd(&v[l],_mm256_mul_pd(vv,twotothe256v));
+		    simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		    simde_mm256_store_pd(&v[l],simde_mm256_mul_pd(vv,twotothe256v));
 		  }
 		if(useFastScaling)
 		  addScale += wgt[i];				
@@ -3006,153 +3003,153 @@
 	      vr = &(x2[80 * i + 20 * k]);
 	      v  = &(x3[80 * i + 20 * k]);	      	   
 
-	      __m256d vv[5]; 
+	      simde__m256d vv[5]; 
 	      
-	      vv[0] = _mm256_setzero_pd();
-	      vv[1] = _mm256_setzero_pd();
-	      vv[2] = _mm256_setzero_pd();
-	      vv[3] = _mm256_setzero_pd();
-	      vv[4] = _mm256_setzero_pd();
+	      vv[0] = simde_mm256_setzero_pd();
+	      vv[1] = simde_mm256_setzero_pd();
+	      vv[2] = simde_mm256_setzero_pd();
+	      vv[3] = simde_mm256_setzero_pd();
+	      vv[4] = simde_mm256_setzero_pd();
 	      
 	      for(l = 0; l < 20; l++) 
 		{		  
-		  __m256d al = _mm256_setzero_pd();
-		  __m256d ar = _mm256_setzero_pd();
+		  simde__m256d al = simde_mm256_setzero_pd();
+		  simde__m256d ar = simde_mm256_setzero_pd();
        		  
-		  __m256d leftv  = _mm256_load_pd(&left[k * 400 + l * 20 + 0]);
-		  __m256d rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 0]);
-		  __m256d vlv = _mm256_load_pd(&vl[0]);
-		  __m256d vrv = _mm256_load_pd(&vr[0]);
+		  simde__m256d leftv  = simde_mm256_load_pd(&left[k * 400 + l * 20 + 0]);
+		  simde__m256d rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 0]);
+		  simde__m256d vlv = simde_mm256_load_pd(&vl[0]);
+		  simde__m256d vrv = simde_mm256_load_pd(&vr[0]);
 		  
-#ifdef _FMA
+#ifdef SIMDE_FMA_NATIVE
 		    
 		  al = FMAMACC(al, vlv, leftv);
 		  ar = FMAMACC(ar, vrv, rightv);
 #else
-		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));		  
+		  al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		  ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));		  
 #endif
 
-		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 4]);
-		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 4]);
-		  vlv = _mm256_load_pd(&vl[4]);
-		  vrv = _mm256_load_pd(&vr[4]);
-#ifdef _FMA
+		  leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 4]);
+		  rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 4]);
+		  vlv = simde_mm256_load_pd(&vl[4]);
+		  vrv = simde_mm256_load_pd(&vr[4]);
+#ifdef SIMDE_FMA_NATIVE
 		    
 		  al = FMAMACC(al, vlv, leftv);
 		  ar = FMAMACC(ar, vrv, rightv);
 #else
-		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+		  al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		  ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 
-		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 8]);
-		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 8]);
-		  vlv = _mm256_load_pd(&vl[8]);
-		  vrv = _mm256_load_pd(&vr[8]);
-#ifdef _FMA
+		  leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 8]);
+		  rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 8]);
+		  vlv = simde_mm256_load_pd(&vl[8]);
+		  vrv = simde_mm256_load_pd(&vr[8]);
+#ifdef SIMDE_FMA_NATIVE
 		    
 		  al = FMAMACC(al, vlv, leftv);
 		  ar = FMAMACC(ar, vrv, rightv);
 #else
-		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+		  al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		  ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 
-		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 12]);
-		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 12]);
-		  vlv = _mm256_load_pd(&vl[12]);
-		  vrv = _mm256_load_pd(&vr[12]);
-#ifdef _FMA
+		  leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 12]);
+		  rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 12]);
+		  vlv = simde_mm256_load_pd(&vl[12]);
+		  vrv = simde_mm256_load_pd(&vr[12]);
+#ifdef SIMDE_FMA_NATIVE
 		    
 		  al = FMAMACC(al, vlv, leftv);
 		  ar = FMAMACC(ar, vrv, rightv);
 #else
-		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+		  al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		  ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 
-		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 16]);
-		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 16]);
-		  vlv = _mm256_load_pd(&vl[16]);
-		  vrv = _mm256_load_pd(&vr[16]);
+		  leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 16]);
+		  rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 16]);
+		  vlv = simde_mm256_load_pd(&vl[16]);
+		  vrv = simde_mm256_load_pd(&vr[16]);
 
-#ifdef _FMA		    
+#ifdef SIMDE_FMA_NATIVE		    
 		  al = FMAMACC(al, vlv, leftv);
 		  ar = FMAMACC(ar, vrv, rightv);
 #else
-		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+		  al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		  ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 
 		  /**************************************************************************************************************/
 
 		  al = hadd3(al);
 		  ar = hadd3(ar);
-		  al = _mm256_mul_pd(ar,al);
+		  al = simde_mm256_mul_pd(ar,al);
 		  
 		  /************************************************************************************************************/
-#ifdef _FMA		    
-		  __m256d ev =  _mm256_load_pd(&extEV[k][20 * l + 0]);
+#ifdef SIMDE_FMA_NATIVE		    
+		  simde__m256d ev =  simde_mm256_load_pd(&extEV[k][20 * l + 0]);
 		  vv[0] = FMAMACC(vv[0], al, ev);		 
 #else
-		  vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(al, _mm256_load_pd(&extEV[k][20 * l + 0])));			  		 		  
+		  vv[0] = simde_mm256_add_pd(vv[0],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[k][20 * l + 0])));			  		 		  
 #endif
-		  _mm256_store_pd(&v[0],vv[0]);
+		  simde_mm256_store_pd(&v[0],vv[0]);
 
-#ifdef _FMA		    
-		  ev =  _mm256_load_pd(&extEV[k][20 * l + 4]);
+#ifdef SIMDE_FMA_NATIVE		    
+		  ev =  simde_mm256_load_pd(&extEV[k][20 * l + 4]);
 		  vv[1] = FMAMACC(vv[1], al, ev);		 
 #else
-		  vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(al, _mm256_load_pd(&extEV[k][20 * l + 4])));		  		 
+		  vv[1] = simde_mm256_add_pd(vv[1],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[k][20 * l + 4])));		  		 
 #endif
-		  _mm256_store_pd(&v[4],vv[1]);
+		  simde_mm256_store_pd(&v[4],vv[1]);
 
-#ifdef _FMA		    
-		  ev =  _mm256_load_pd(&extEV[k][20 * l + 8]);
+#ifdef SIMDE_FMA_NATIVE		    
+		  ev =  simde_mm256_load_pd(&extEV[k][20 * l + 8]);
 		  vv[2] = FMAMACC(vv[2], al, ev);		 
 #else
-		  vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(al, _mm256_load_pd(&extEV[k][20 * l + 8])));		  		 
+		  vv[2] = simde_mm256_add_pd(vv[2],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[k][20 * l + 8])));		  		 
 #endif
-		  _mm256_store_pd(&v[8],vv[2]);
+		  simde_mm256_store_pd(&v[8],vv[2]);
 
-#ifdef _FMA		    
-		  ev =  _mm256_load_pd(&extEV[k][20 * l + 12]);
+#ifdef SIMDE_FMA_NATIVE		    
+		  ev =  simde_mm256_load_pd(&extEV[k][20 * l + 12]);
 		  vv[3] = FMAMACC(vv[3], al, ev);		 
 #else
-		  vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(al, _mm256_load_pd(&extEV[k][20 * l + 12])));		  		 
+		  vv[3] = simde_mm256_add_pd(vv[3],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[k][20 * l + 12])));		  		 
 #endif
-		  _mm256_store_pd(&v[12],vv[3]);
+		  simde_mm256_store_pd(&v[12],vv[3]);
 
-#ifdef _FMA		    
-		  ev =  _mm256_load_pd(&extEV[k][20 * l + 16]);
+#ifdef SIMDE_FMA_NATIVE		    
+		  ev =  simde_mm256_load_pd(&extEV[k][20 * l + 16]);
 		  vv[4] = FMAMACC(vv[4], al, ev);		 
 #else
-		  vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(al, _mm256_load_pd(&extEV[k][20 * l + 16])));			 	  
+		  vv[4] = simde_mm256_add_pd(vv[4],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[k][20 * l + 16])));			 	  
 #endif
-		  _mm256_store_pd(&v[16],vv[4]);		 
+		  simde_mm256_store_pd(&v[16],vv[4]);		 
 		} 
 	    }
 	  v = &(x3[80 * i]);
 	  scale = 1;
-	  __m256d minlikelihood_avx = _mm256_set1_pd(minlikelihood);	 
+	  simde__m256d minlikelihood_avx = simde_mm256_set1_pd(minlikelihood);	 
 
 	  for(l = 0; scale && (l < 80); l += 4) 
 	    {
-	      __m256d vv = _mm256_load_pd(&v[l]);
-	      __m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
-	      vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
-	      if(_mm256_movemask_pd(vv_abs) != 15)
+	      simde__m256d vv = simde_mm256_load_pd(&v[l]);
+	      simde__m256d vv_abs = simde_mm256_and_pd(vv,absMask_AVX.m);
+	      vv_abs = simde_mm256_cmp_pd(vv_abs,minlikelihood_avx,SIMDE_CMP_LT_OS);
+	      if(simde_mm256_movemask_pd(vv_abs) != 15)
 		scale = 0;	     
 	    }
 
 	  if(scale) 
 	    {		     	      
-	      __m256d twotothe256v = _mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
+	      simde__m256d twotothe256v = simde_mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
 	      for(l = 0; l < 80; l += 4) 
 		{
-		  __m256d vv = _mm256_load_pd(&v[l]);
-		  _mm256_store_pd(&v[l],_mm256_mul_pd(vv,twotothe256v));
+		  simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		  simde_mm256_store_pd(&v[l],simde_mm256_mul_pd(vv,twotothe256v));
 		}
 	      if(useFastScaling)
 		addScale += wgt[i];					
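
The scaling block above recurs throughout these hunks: when every entry of a likelihood vector has underflowed below minlikelihood, the whole vector is multiplied by 2^256 and the event is recorded. A self-contained sketch of the check, with an illustrative 8-element buffer and threshold (RAxML itself uses 80-element vectors and its own minlikelihood constant):

#include <stdio.h>
#include "simde/x86/avx.h"

int main(void)
{
  double v[8] __attribute__ ((aligned (32))) =
    { 1e-20, 2e-20, 3e-20, 4e-20, 1e-20, 2e-20, 3e-20, 4e-20 };
  double minlikelihood = 1e-16;                 /* illustrative threshold */

  simde__m256d minlikelihood_avx = simde_mm256_set1_pd(minlikelihood);
  simde__m256d absmask = simde_mm256_castsi256_pd(
      simde_mm256_set1_epi64x(0x7fffffffffffffffLL));
  int scale = 1, l;

  for (l = 0; scale && (l < 8); l += 4)
    {
      simde__m256d vv = simde_mm256_load_pd(&v[l]);
      simde__m256d vv_abs = simde_mm256_and_pd(vv, absmask);
      vv_abs = simde_mm256_cmp_pd(vv_abs, minlikelihood_avx, SIMDE_CMP_LT_OS);
      if (simde_mm256_movemask_pd(vv_abs) != 15)  /* 15: all four lanes set */
        scale = 0;
    }

  if (scale)
    {
      simde__m256d twotothe256v = simde_mm256_set1_pd(0x1p256);
      for (l = 0; l < 8; l += 4)
        {
          simde__m256d vv = simde_mm256_load_pd(&v[l]);
          simde_mm256_store_pd(&v[l], simde_mm256_mul_pd(vv, twotothe256v));
        }
    }

  printf("scaled: %d, v[0] = %g\n", scale, v[0]);
  return 0;
}

The absolute-value mask clears only the sign bit, so negative underflows are caught the same way as positive ones.
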
@@ -3206,11 +3203,11 @@
 
 
 #if GCC_VERSION < 40500
-   __m256d
-    bitmask = _mm256_set_pd(0,0,0,-1);
+   simde__m256d
+    bitmask = simde_mm256_set_pd(0,0,0,-1);
 #else
-  __m256i
-    bitmask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+  simde__m256i
+    bitmask = simde_mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
 #endif 
   
   switch(tipCase) 
@@ -3233,28 +3230,28 @@
 		  *ll =  &left[k * 20],
 		  *rr =  &right[k * 20];
 		
-		__m256d 
-		  umpX1v = _mm256_setzero_pd(),
-		  umpX2v = _mm256_setzero_pd();
+		simde__m256d 
+		  umpX1v = simde_mm256_setzero_pd(),
+		  umpX2v = simde_mm256_setzero_pd();
 		
 		for(l = 0; l < 20; l+=4) 
 		  {
-		    __m256d vv = _mm256_load_pd(&v[l]);
-#ifdef _FMA
-		    __m256d llv = _mm256_load_pd(&ll[l]);
+		    simde__m256d vv = simde_mm256_load_pd(&v[l]);
+#ifdef SIMDE_FMA_NATIVE
+		    simde__m256d llv = simde_mm256_load_pd(&ll[l]);
 		    umpX1v = FMAMACC(umpX1v,vv,llv);
-		    __m256d rrv = _mm256_load_pd(&rr[l]);
+		    simde__m256d rrv = simde_mm256_load_pd(&rr[l]);
 		    umpX2v = FMAMACC(umpX2v,vv,rrv);
 #else		    
-		    umpX1v = _mm256_add_pd(umpX1v,_mm256_mul_pd(vv,_mm256_load_pd(&ll[l])));
-		    umpX2v = _mm256_add_pd(umpX2v,_mm256_mul_pd(vv,_mm256_load_pd(&rr[l])));
+		    umpX1v = simde_mm256_add_pd(umpX1v,simde_mm256_mul_pd(vv,simde_mm256_load_pd(&ll[l])));
+		    umpX2v = simde_mm256_add_pd(umpX2v,simde_mm256_mul_pd(vv,simde_mm256_load_pd(&rr[l])));
 #endif
 		  }
 		
 		umpX1v = hadd3(umpX1v);
 		umpX2v = hadd3(umpX2v);
-		_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
-		_mm256_maskstore_pd(&umpX2[80 * i + k], bitmask, umpX2v);
+		simde_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
+		simde_mm256_maskstore_pd(&umpX2[80 * i + k], bitmask, umpX2v);
 	      } 
 	  }
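
hadd3 leaves the horizontal sum broadcast across all four lanes, and the masked store above then writes only the low lane back to memory. A sketch of that masked store in isolation, matching the GCC >= 4.5 branch; the values are illustrative:

#include <stdio.h>
#include "simde/x86/avx.h"

int main(void)
{
  double out[4] __attribute__ ((aligned (32))) = { -1.0, -1.0, -1.0, -1.0 };

  /* mask enabling only the lowest 64-bit lane */
  simde__m256i bitmask = simde_mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);

  simde__m256d v = simde_mm256_set1_pd(42.0);
  simde_mm256_maskstore_pd(out, bitmask, v);        /* writes out[0] only */

  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 42 -1 -1 -1 */
  return 0;
}
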
 
@@ -3265,61 +3262,61 @@
 	  
 	  for(j = 0; j < 4; j++) 
 	    {     	
-	      __m256d vv[5];  
+	      simde__m256d vv[5];  
 	      
 	      v = &x3_gapColumn[j * 20];
 	      
-	      vv[0] = _mm256_setzero_pd();
-	      vv[1] = _mm256_setzero_pd();
-	      vv[2] = _mm256_setzero_pd();
-	      vv[3] = _mm256_setzero_pd();
-	      vv[4] = _mm256_setzero_pd();
+	      vv[0] = simde_mm256_setzero_pd();
+	      vv[1] = simde_mm256_setzero_pd();
+	      vv[2] = simde_mm256_setzero_pd();
+	      vv[3] = simde_mm256_setzero_pd();
+	      vv[4] = simde_mm256_setzero_pd();
 	      
 	      for(k = 0; k < 20; k++) 
 		{			 
 		  x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
 		  
-		  __m256d x1px2v = _mm256_set1_pd(x1px2);		    
+		  simde__m256d x1px2v = simde_mm256_set1_pd(x1px2);		    
 		  
-		  __m256d extEvv = _mm256_load_pd(&extEV[20 * k]);
-#ifdef _FMA
+		  simde__m256d extEvv = simde_mm256_load_pd(&extEV[20 * k]);
+#ifdef SIMDE_FMA_NATIVE
 		  vv[0] = FMAMACC(vv[0],x1px2v,extEvv);
 #else
-		  vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v,extEvv));
+		  vv[0] = simde_mm256_add_pd(vv[0],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		  _mm256_store_pd(&v[0],vv[0]);
+		  simde_mm256_store_pd(&v[0],vv[0]);
 		  
-		  extEvv = _mm256_load_pd(&extEV[20 * k + 4]);
-#ifdef _FMA
+		  extEvv = simde_mm256_load_pd(&extEV[20 * k + 4]);
+#ifdef SIMDE_FMA_NATIVE
 		  vv[1] = FMAMACC(vv[1],x1px2v,extEvv);
 #else
-		  vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v,extEvv));
+		  vv[1] = simde_mm256_add_pd(vv[1],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		  _mm256_store_pd(&v[4],vv[1]);
+		  simde_mm256_store_pd(&v[4],vv[1]);
 		  
-		  extEvv = _mm256_load_pd(&extEV[20 * k + 8]);
-#ifdef _FMA
+		  extEvv = simde_mm256_load_pd(&extEV[20 * k + 8]);
+#ifdef SIMDE_FMA_NATIVE
 		  vv[2] = FMAMACC(vv[2],x1px2v,extEvv);
 #else
-		  vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v,extEvv));
+		  vv[2] = simde_mm256_add_pd(vv[2],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		  _mm256_store_pd(&v[8],vv[2]);
+		  simde_mm256_store_pd(&v[8],vv[2]);
 		  
-		  extEvv = _mm256_load_pd(&extEV[20 * k + 12]);
-#ifdef _FMA
+		  extEvv = simde_mm256_load_pd(&extEV[20 * k + 12]);
+#ifdef SIMDE_FMA_NATIVE
 		  vv[3] = FMAMACC(vv[3],x1px2v,extEvv);
 #else
-		  vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v,extEvv));
+		  vv[3] = simde_mm256_add_pd(vv[3],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		  _mm256_store_pd(&v[12],vv[3]);
+		  simde_mm256_store_pd(&v[12],vv[3]);
 		  
-		  extEvv = _mm256_load_pd(&extEV[20 * k + 16]);
-#ifdef _FMA
+		  extEvv = simde_mm256_load_pd(&extEV[20 * k + 16]);
+#ifdef SIMDE_FMA_NATIVE
 		  vv[4] = FMAMACC(vv[4],x1px2v,extEvv);
 #else
-		  vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v,extEvv));
+		  vv[4] = simde_mm256_add_pd(vv[4],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		  _mm256_store_pd(&v[16],vv[4]);
+		  simde_mm256_store_pd(&v[16],vv[4]);
 		} 
 	    } 
 	}
@@ -3334,61 +3331,61 @@
 	   
 		for(j = 0; j < 4; j++) 
 		  {     	
-		    __m256d vv[5];  
+		    simde__m256d vv[5];  
 		    
 		    v = &x3_ptr[j * 20];
 			
-		    vv[0] = _mm256_setzero_pd();
-		    vv[1] = _mm256_setzero_pd();
-		    vv[2] = _mm256_setzero_pd();
-		    vv[3] = _mm256_setzero_pd();
-		    vv[4] = _mm256_setzero_pd();
+		    vv[0] = simde_mm256_setzero_pd();
+		    vv[1] = simde_mm256_setzero_pd();
+		    vv[2] = simde_mm256_setzero_pd();
+		    vv[3] = simde_mm256_setzero_pd();
+		    vv[4] = simde_mm256_setzero_pd();
 
 		    for(k = 0; k < 20; k++) 
 		      {			 
 			x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
 			
-			__m256d x1px2v = _mm256_set1_pd(x1px2);		    
+			simde__m256d x1px2v = simde_mm256_set1_pd(x1px2);		    
 			
-			__m256d extEvv = _mm256_load_pd(&extEV[20 * k]);
-#ifdef _FMA
+			simde__m256d extEvv = simde_mm256_load_pd(&extEV[20 * k]);
+#ifdef SIMDE_FMA_NATIVE
 			vv[0] = FMAMACC(vv[0],x1px2v,extEvv);
 #else
-			vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v,extEvv));
+			vv[0] = simde_mm256_add_pd(vv[0],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-			_mm256_store_pd(&v[0],vv[0]);
+			simde_mm256_store_pd(&v[0],vv[0]);
 			
-			extEvv = _mm256_load_pd(&extEV[20 * k + 4]);
-#ifdef _FMA
+			extEvv = simde_mm256_load_pd(&extEV[20 * k + 4]);
+#ifdef SIMDE_FMA_NATIVE
 			vv[1] = FMAMACC(vv[1],x1px2v,extEvv);
 #else
-			vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v,extEvv));
+			vv[1] = simde_mm256_add_pd(vv[1],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-			_mm256_store_pd(&v[4],vv[1]);
+			simde_mm256_store_pd(&v[4],vv[1]);
 			
-			extEvv = _mm256_load_pd(&extEV[20 * k + 8]);
-#ifdef _FMA
+			extEvv = simde_mm256_load_pd(&extEV[20 * k + 8]);
+#ifdef SIMDE_FMA_NATIVE
 			vv[2] = FMAMACC(vv[2],x1px2v,extEvv);
 #else
-			vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v,extEvv));
+			vv[2] = simde_mm256_add_pd(vv[2],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-			_mm256_store_pd(&v[8],vv[2]);
+			simde_mm256_store_pd(&v[8],vv[2]);
 			
-			extEvv = _mm256_load_pd(&extEV[20 * k + 12]);
-#ifdef _FMA
+			extEvv = simde_mm256_load_pd(&extEV[20 * k + 12]);
+#ifdef SIMDE_FMA_NATIVE
 			vv[3] = FMAMACC(vv[3],x1px2v,extEvv);
 #else
-			vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v,extEvv));
+			vv[3] = simde_mm256_add_pd(vv[3],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-			_mm256_store_pd(&v[12],vv[3]);
+			simde_mm256_store_pd(&v[12],vv[3]);
 			
-			extEvv = _mm256_load_pd(&extEV[20 * k + 16]);
-#ifdef _FMA
+			extEvv = simde_mm256_load_pd(&extEV[20 * k + 16]);
+#ifdef SIMDE_FMA_NATIVE
 			vv[4] = FMAMACC(vv[4],x1px2v,extEvv);
 #else
-			vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v,extEvv));
+			vv[4] = simde_mm256_add_pd(vv[4],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-			_mm256_store_pd(&v[16],vv[4]);
+			simde_mm256_store_pd(&v[16],vv[4]);
 		      } 
 		  }
 		x3_ptr += 80;		  
@@ -3410,20 +3407,20 @@
 
 	    for(k = 0; k < 80; k++) 
 	      {
-		__m256d umpX1v = _mm256_setzero_pd();
+		simde__m256d umpX1v = simde_mm256_setzero_pd();
 		for(l = 0; l < 20; l+=4) 
 		  {
-		    __m256d vv = _mm256_load_pd(&v[l]);
-		    __m256d leftv = _mm256_load_pd(&left[k * 20 + l]);
-#ifdef _FMA
+		    simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		    simde__m256d leftv = simde_mm256_load_pd(&left[k * 20 + l]);
+#ifdef SIMDE_FMA_NATIVE
 		   
 		    umpX1v = FMAMACC(umpX1v, vv, leftv);
 #else
-		    umpX1v = _mm256_add_pd(umpX1v, _mm256_mul_pd(vv, leftv));
+		    umpX1v = simde_mm256_add_pd(umpX1v, simde_mm256_mul_pd(vv, leftv));
 #endif
 		  }
 		umpX1v = hadd3(umpX1v);
-		_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
+		simde_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
 	      } 
 	  }
 
@@ -3436,132 +3433,132 @@
 		
 		for(l = 0; l < 20; l++) 
 		  {
-		    __m256d ump_x2v = _mm256_setzero_pd();
+		    simde__m256d ump_x2v = simde_mm256_setzero_pd();
 		    		  
-		    __m256d vv = _mm256_load_pd(&v[0]);
-		    __m256d rightv = _mm256_load_pd(&right[k*400+l*20+0]);
-#ifdef _FMA
+		    simde__m256d vv = simde_mm256_load_pd(&v[0]);
+		    simde__m256d rightv = simde_mm256_load_pd(&right[k*400+l*20+0]);
+#ifdef SIMDE_FMA_NATIVE
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 		    
-		    vv = _mm256_load_pd(&v[4]);
-		    rightv = _mm256_load_pd(&right[k*400+l*20+4]);
-#ifdef _FMA
+		    vv = simde_mm256_load_pd(&v[4]);
+		    rightv = simde_mm256_load_pd(&right[k*400+l*20+4]);
+#ifdef SIMDE_FMA_NATIVE
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 
-		    vv = _mm256_load_pd(&v[8]);
-		    rightv = _mm256_load_pd(&right[k*400+l*20+8]);
-#ifdef _FMA
+		    vv = simde_mm256_load_pd(&v[8]);
+		    rightv = simde_mm256_load_pd(&right[k*400+l*20+8]);
+#ifdef SIMDE_FMA_NATIVE
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 
-		    vv = _mm256_load_pd(&v[12]);
-		    rightv = _mm256_load_pd(&right[k*400+l*20+12]);
-#ifdef _FMA
+		    vv = simde_mm256_load_pd(&v[12]);
+		    rightv = simde_mm256_load_pd(&right[k*400+l*20+12]);
+#ifdef SIMDE_FMA_NATIVE
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 
-		    vv = _mm256_load_pd(&v[16]);
-		    rightv = _mm256_load_pd(&right[k*400+l*20+16]);
-#ifdef _FMA
+		    vv = simde_mm256_load_pd(&v[16]);
+		    rightv = simde_mm256_load_pd(&right[k*400+l*20+16]);
+#ifdef SIMDE_FMA_NATIVE
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 		    
 		    ump_x2v = hadd3(ump_x2v);
-		    _mm256_maskstore_pd(&ump_x2[l], bitmask, ump_x2v);
+		    simde_mm256_maskstore_pd(&ump_x2[l], bitmask, ump_x2v);
 		  }
 		
 		v = &x3_gapColumn[20 * k];
 	
-		__m256d vv[5]; 
+		simde__m256d vv[5]; 
 
-		vv[0] = _mm256_setzero_pd();
-		vv[1] = _mm256_setzero_pd();
-		vv[2] = _mm256_setzero_pd();
-		vv[3] = _mm256_setzero_pd();
-		vv[4] = _mm256_setzero_pd();
+		vv[0] = simde_mm256_setzero_pd();
+		vv[1] = simde_mm256_setzero_pd();
+		vv[2] = simde_mm256_setzero_pd();
+		vv[3] = simde_mm256_setzero_pd();
+		vv[4] = simde_mm256_setzero_pd();
 		
 		for(l = 0; l < 20; l++) 
 		  {
 		    x1px2 = uX1[k * 20 + l]	* ump_x2[l];
-		    __m256d x1px2v = _mm256_set1_pd(x1px2);	
+		    simde__m256d x1px2v = simde_mm256_set1_pd(x1px2);	
 	    		 
-#ifdef _FMA
-		    __m256d ev = _mm256_load_pd(&extEV[l * 20 + 0]);
+#ifdef SIMDE_FMA_NATIVE
+		    simde__m256d ev = simde_mm256_load_pd(&extEV[l * 20 + 0]);
 		    vv[0] = FMAMACC(vv[0],x1px2v, ev);
 #else
-		    vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 0])));
+		    vv[0] = simde_mm256_add_pd(vv[0],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 0])));
 #endif
-		    _mm256_store_pd(&v[0],vv[0]);
+		    simde_mm256_store_pd(&v[0],vv[0]);
 
-#ifdef _FMA
-		    ev = _mm256_load_pd(&extEV[l * 20 + 4]);
+#ifdef SIMDE_FMA_NATIVE
+		    ev = simde_mm256_load_pd(&extEV[l * 20 + 4]);
 		    vv[1] = FMAMACC(vv[1],x1px2v, ev);
 #else
-		    vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 4])));
+		    vv[1] = simde_mm256_add_pd(vv[1],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 4])));
 #endif
-		    _mm256_store_pd(&v[4],vv[1]);
+		    simde_mm256_store_pd(&v[4],vv[1]);
 
-#ifdef _FMA
-		    ev = _mm256_load_pd(&extEV[l * 20 + 8]);
+#ifdef SIMDE_FMA_NATIVE
+		    ev = simde_mm256_load_pd(&extEV[l * 20 + 8]);
 		    vv[2] = FMAMACC(vv[2],x1px2v, ev);
 #else
-		    vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 8])));
+		    vv[2] = simde_mm256_add_pd(vv[2],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 8])));
 #endif
-		    _mm256_store_pd(&v[8],vv[2]);
+		    simde_mm256_store_pd(&v[8],vv[2]);
 		    
-#ifdef _FMA
-		    ev = _mm256_load_pd(&extEV[l * 20 + 12]);
+#ifdef SIMDE_FMA_NATIVE
+		    ev = simde_mm256_load_pd(&extEV[l * 20 + 12]);
 		    vv[3] = FMAMACC(vv[3],x1px2v, ev);
 #else
-		    vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 12])));
+		    vv[3] = simde_mm256_add_pd(vv[3],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 12])));
 #endif
-		    _mm256_store_pd(&v[12],vv[3]);
+		    simde_mm256_store_pd(&v[12],vv[3]);
 
 
-#ifdef _FMA
-		    ev = _mm256_load_pd(&extEV[l * 20 + 16]);
+#ifdef SIMDE_FMA_NATIVE
+		    ev = simde_mm256_load_pd(&extEV[l * 20 + 16]);
 		    vv[4] = FMAMACC(vv[4],x1px2v, ev);
 #else
-		    vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 16])));
+		    vv[4] = simde_mm256_add_pd(vv[4],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 16])));
 #endif
-		    _mm256_store_pd(&v[16],vv[4]);
+		    simde_mm256_store_pd(&v[16],vv[4]);
 
 		  } 
 	      }
 	   
 	    v = x3_gapColumn;
-	    __m256d minlikelihood_avx = _mm256_set1_pd(minlikelihood);
+	    simde__m256d minlikelihood_avx = simde_mm256_set1_pd(minlikelihood);
 	    scale = 1;
 	    for(l = 0; scale && (l < 80); l += 4) 
 	      {
-		__m256d vv = _mm256_load_pd(&v[l]);
-		__m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
-		vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
-		if(_mm256_movemask_pd(vv_abs) != 15)
+		simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		simde__m256d vv_abs = simde_mm256_and_pd(vv,absMask_AVX.m);
+		vv_abs = simde_mm256_cmp_pd(vv_abs,minlikelihood_avx,SIMDE_CMP_LT_OS);
+		if(simde_mm256_movemask_pd(vv_abs) != 15)
 		  scale = 0;
 	      }
 	    
 	    if(scale) 
 	      {		
-		__m256d twotothe256v = _mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
+		simde__m256d twotothe256v = simde_mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
 		gapScaling = 1;
 
 		for(l = 0; l < 80; l += 4) 
 		  {
-		    __m256d vv = _mm256_load_pd(&v[l]);
-		    _mm256_store_pd(&v[l],_mm256_mul_pd(vv,twotothe256v));
+		    simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		    simde_mm256_store_pd(&v[l],simde_mm256_mul_pd(vv,twotothe256v));
 		  }	
 	      } 
 	}       
@@ -3596,131 +3593,131 @@
 		    
 		    for(l = 0; l < 20; l++) 
 		      {
-			__m256d ump_x2v = _mm256_setzero_pd();
+			simde__m256d ump_x2v = simde_mm256_setzero_pd();
 		    	
-			__m256d vv = _mm256_load_pd(&v[0]);
-			__m256d rightv = _mm256_load_pd(&right[k*400+l*20+0]);
-#ifdef _FMA
+			simde__m256d vv = simde_mm256_load_pd(&v[0]);
+			simde__m256d rightv = simde_mm256_load_pd(&right[k*400+l*20+0]);
+#ifdef SIMDE_FMA_NATIVE
 			ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-			ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+			ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 			
-			vv = _mm256_load_pd(&v[4]);
-			rightv = _mm256_load_pd(&right[k*400+l*20+4]);
-#ifdef _FMA
+			vv = simde_mm256_load_pd(&v[4]);
+			rightv = simde_mm256_load_pd(&right[k*400+l*20+4]);
+#ifdef SIMDE_FMA_NATIVE
 			ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-			ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+			ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 			
-			vv = _mm256_load_pd(&v[8]);
-			rightv = _mm256_load_pd(&right[k*400+l*20+8]);
-#ifdef _FMA
+			vv = simde_mm256_load_pd(&v[8]);
+			rightv = simde_mm256_load_pd(&right[k*400+l*20+8]);
+#ifdef SIMDE_FMA_NATIVE
 			ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-			ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+			ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 			
-			vv = _mm256_load_pd(&v[12]);
-			rightv = _mm256_load_pd(&right[k*400+l*20+12]);
-#ifdef _FMA
+			vv = simde_mm256_load_pd(&v[12]);
+			rightv = simde_mm256_load_pd(&right[k*400+l*20+12]);
+#ifdef SIMDE_FMA_NATIVE
 			ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-			ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+			ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 			
-			vv = _mm256_load_pd(&v[16]);
-			rightv = _mm256_load_pd(&right[k*400+l*20+16]);
-#ifdef _FMA
+			vv = simde_mm256_load_pd(&v[16]);
+			rightv = simde_mm256_load_pd(&right[k*400+l*20+16]);
+#ifdef SIMDE_FMA_NATIVE
 			ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-			ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+			ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 			
 			ump_x2v = hadd3(ump_x2v);
-			_mm256_maskstore_pd(&ump_x2[l], bitmask, ump_x2v);
+			simde_mm256_maskstore_pd(&ump_x2[l], bitmask, ump_x2v);
 		      }
 		  
 		    
 		    v = &x3_ptr[k * 20];
 		    
-		    __m256d vv[5]; 
+		    simde__m256d vv[5]; 
 		    
-		    vv[0] = _mm256_setzero_pd();
-		    vv[1] = _mm256_setzero_pd();
-		    vv[2] = _mm256_setzero_pd();
-		    vv[3] = _mm256_setzero_pd();
-		    vv[4] = _mm256_setzero_pd();
+		    vv[0] = simde_mm256_setzero_pd();
+		    vv[1] = simde_mm256_setzero_pd();
+		    vv[2] = simde_mm256_setzero_pd();
+		    vv[3] = simde_mm256_setzero_pd();
+		    vv[4] = simde_mm256_setzero_pd();
 		    
 		    for(l = 0; l < 20; l++) 
 		      {
 			x1px2 = uX1[k * 20 + l]	* ump_x2[l];
-			__m256d x1px2v = _mm256_set1_pd(x1px2);	
+			simde__m256d x1px2v = simde_mm256_set1_pd(x1px2);	
 			
-#ifdef _FMA
-			__m256d ev = _mm256_load_pd(&extEV[l * 20 + 0]);
+#ifdef SIMDE_FMA_NATIVE
+			simde__m256d ev = simde_mm256_load_pd(&extEV[l * 20 + 0]);
 			vv[0] = FMAMACC(vv[0],x1px2v, ev);
 #else
-			vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 0])));
+			vv[0] = simde_mm256_add_pd(vv[0],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 0])));
 #endif
-			_mm256_store_pd(&v[0],vv[0]);
+			simde_mm256_store_pd(&v[0],vv[0]);
 			
-#ifdef _FMA
-			ev = _mm256_load_pd(&extEV[l * 20 + 4]);
+#ifdef SIMDE_FMA_NATIVE
+			ev = simde_mm256_load_pd(&extEV[l * 20 + 4]);
 			vv[1] = FMAMACC(vv[1],x1px2v, ev);
 #else
-			vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 4])));
+			vv[1] = simde_mm256_add_pd(vv[1],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 4])));
 #endif
-			_mm256_store_pd(&v[4],vv[1]);
+			simde_mm256_store_pd(&v[4],vv[1]);
 			
-#ifdef _FMA
-			ev = _mm256_load_pd(&extEV[l * 20 + 8]);
+#ifdef SIMDE_FMA_NATIVE
+			ev = simde_mm256_load_pd(&extEV[l * 20 + 8]);
 			vv[2] = FMAMACC(vv[2],x1px2v, ev);
 #else
-			vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 8])));
+			vv[2] = simde_mm256_add_pd(vv[2],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 8])));
 #endif
-			_mm256_store_pd(&v[8],vv[2]);
+			simde_mm256_store_pd(&v[8],vv[2]);
 			
-#ifdef _FMA
-			ev = _mm256_load_pd(&extEV[l * 20 + 12]);
+#ifdef SIMDE_FMA_NATIVE
+			ev = simde_mm256_load_pd(&extEV[l * 20 + 12]);
 			vv[3] = FMAMACC(vv[3],x1px2v, ev);
 #else
-			vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 12])));
+			vv[3] = simde_mm256_add_pd(vv[3],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 12])));
 #endif
-			_mm256_store_pd(&v[12],vv[3]);
+			simde_mm256_store_pd(&v[12],vv[3]);
 			
 			
-#ifdef _FMA
-			ev = _mm256_load_pd(&extEV[l * 20 + 16]);
+#ifdef SIMDE_FMA_NATIVE
+			ev = simde_mm256_load_pd(&extEV[l * 20 + 16]);
 			vv[4] = FMAMACC(vv[4],x1px2v, ev);
 #else
-			vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 16])));
+			vv[4] = simde_mm256_add_pd(vv[4],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 16])));
 #endif
-			_mm256_store_pd(&v[16],vv[4]);
+			simde_mm256_store_pd(&v[16],vv[4]);
 			
 		      } 
 		  }
 		
 		v = x3_ptr;
-		__m256d minlikelihood_avx = _mm256_set1_pd(minlikelihood);
+		simde__m256d minlikelihood_avx = simde_mm256_set1_pd(minlikelihood);
 		scale = 1;
 		for(l = 0; scale && (l < 80); l += 4) 
 		  {
-		    __m256d vv = _mm256_load_pd(&v[l]);
-		    __m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
-		    vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
-		    if(_mm256_movemask_pd(vv_abs) != 15)
+		    simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		    simde__m256d vv_abs = simde_mm256_and_pd(vv,absMask_AVX.m);
+		    vv_abs = simde_mm256_cmp_pd(vv_abs,minlikelihood_avx,SIMDE_CMP_LT_OS);
+		    if(simde_mm256_movemask_pd(vv_abs) != 15)
 		      scale = 0;
 		  }
 	    
 		if(scale) 
 		  {		
-		    __m256d twotothe256v = _mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
+		    simde__m256d twotothe256v = simde_mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
 		    for(l = 0; l < 80; l += 4) 
 		      {
-			__m256d vv = _mm256_load_pd(&v[l]);
-			_mm256_store_pd(&v[l],_mm256_mul_pd(vv,twotothe256v));
+			simde__m256d vv = simde_mm256_load_pd(&v[l]);
+			simde_mm256_store_pd(&v[l],simde_mm256_mul_pd(vv,twotothe256v));
 		      }
 		    if(useFastScaling)
 		      addScale += wgt[i];				
@@ -3739,156 +3736,156 @@
 	  vr = &(x2_gapColumn[20 * k]);
 	  v  = &(x3_gapColumn[20 * k]);	      	   
 
-	  __m256d vv[5]; 
+	  simde__m256d vv[5]; 
 	  
-	  vv[0] = _mm256_setzero_pd();
-	  vv[1] = _mm256_setzero_pd();
-	  vv[2] = _mm256_setzero_pd();
-	  vv[3] = _mm256_setzero_pd();
-	  vv[4] = _mm256_setzero_pd();
+	  vv[0] = simde_mm256_setzero_pd();
+	  vv[1] = simde_mm256_setzero_pd();
+	  vv[2] = simde_mm256_setzero_pd();
+	  vv[3] = simde_mm256_setzero_pd();
+	  vv[4] = simde_mm256_setzero_pd();
 	  
 	  for(l = 0; l < 20; l++) 
 	    {		  
-	      __m256d al = _mm256_setzero_pd();
-	      __m256d ar = _mm256_setzero_pd();
+	      simde__m256d al = simde_mm256_setzero_pd();
+	      simde__m256d ar = simde_mm256_setzero_pd();
 	      
-	      __m256d leftv  = _mm256_load_pd(&left[k * 400 + l * 20 + 0]);
-	      __m256d rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 0]);
-	      __m256d vlv = _mm256_load_pd(&vl[0]);
-	      __m256d vrv = _mm256_load_pd(&vr[0]);
+	      simde__m256d leftv  = simde_mm256_load_pd(&left[k * 400 + l * 20 + 0]);
+	      simde__m256d rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 0]);
+	      simde__m256d vlv = simde_mm256_load_pd(&vl[0]);
+	      simde__m256d vrv = simde_mm256_load_pd(&vr[0]);
 	      
-#ifdef _FMA
+#ifdef SIMDE_FMA_NATIVE
 	      
 	      al = FMAMACC(al, vlv, leftv);
 	      ar = FMAMACC(ar, vrv, rightv);
 #else
-	      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-	      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));		  
+	      al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+	      ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));		  
 #endif
 	      
-	      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 4]);
-	      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 4]);
-	      vlv = _mm256_load_pd(&vl[4]);
-	      vrv = _mm256_load_pd(&vr[4]);
-#ifdef _FMA
+	      leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 4]);
+	      rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 4]);
+	      vlv = simde_mm256_load_pd(&vl[4]);
+	      vrv = simde_mm256_load_pd(&vr[4]);
+#ifdef SIMDE_FMA_NATIVE
 	      
 	      al = FMAMACC(al, vlv, leftv);
 	      ar = FMAMACC(ar, vrv, rightv);
 #else
-	      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-	      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+	      al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+	      ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 	      
-	      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 8]);
-	      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 8]);
-	      vlv = _mm256_load_pd(&vl[8]);
-	      vrv = _mm256_load_pd(&vr[8]);
-#ifdef _FMA
+	      leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 8]);
+	      rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 8]);
+	      vlv = simde_mm256_load_pd(&vl[8]);
+	      vrv = simde_mm256_load_pd(&vr[8]);
+#ifdef SIMDE_FMA_NATIVE
 	      
 	      al = FMAMACC(al, vlv, leftv);
 	      ar = FMAMACC(ar, vrv, rightv);
 #else
-	      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-	      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+	      al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+	      ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 	      
-	      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 12]);
-	      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 12]);
-	      vlv = _mm256_load_pd(&vl[12]);
-	      vrv = _mm256_load_pd(&vr[12]);
-#ifdef _FMA
+	      leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 12]);
+	      rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 12]);
+	      vlv = simde_mm256_load_pd(&vl[12]);
+	      vrv = simde_mm256_load_pd(&vr[12]);
+#ifdef SIMDE_FMA_NATIVE
 	      
 	      al = FMAMACC(al, vlv, leftv);
 	      ar = FMAMACC(ar, vrv, rightv);
 #else
-	      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-	      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+	      al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+	      ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 	      
-	      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 16]);
-	      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 16]);
-	      vlv = _mm256_load_pd(&vl[16]);
-	      vrv = _mm256_load_pd(&vr[16]);
+	      leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 16]);
+	      rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 16]);
+	      vlv = simde_mm256_load_pd(&vl[16]);
+	      vrv = simde_mm256_load_pd(&vr[16]);
 	      
-#ifdef _FMA		    
+#ifdef SIMDE_FMA_NATIVE		    
 	      al = FMAMACC(al, vlv, leftv);
 	      ar = FMAMACC(ar, vrv, rightv);
 #else
-	      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-	      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+	      al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+	      ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 	      
 	      /**************************************************************************************************************/
 	      
 	      al = hadd3(al);
 	      ar = hadd3(ar);
-	      al = _mm256_mul_pd(ar,al);
+	      al = simde_mm256_mul_pd(ar,al);
 	      
 	      /************************************************************************************************************/
-#ifdef _FMA		    
-	      __m256d ev =  _mm256_load_pd(&extEV[20 * l + 0]);
+#ifdef SIMDE_FMA_NATIVE		    
+	      simde__m256d ev =  simde_mm256_load_pd(&extEV[20 * l + 0]);
 	      vv[0] = FMAMACC(vv[0], al, ev);		 
 #else
-	      vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 0])));			  		 		  
+	      vv[0] = simde_mm256_add_pd(vv[0],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 0])));			  		 		  
 #endif
-	      _mm256_store_pd(&v[0],vv[0]);
+	      simde_mm256_store_pd(&v[0],vv[0]);
 	      
-#ifdef _FMA		    
-	      ev =  _mm256_load_pd(&extEV[20 * l + 4]);
+#ifdef SIMDE_FMA_NATIVE		    
+	      ev =  simde_mm256_load_pd(&extEV[20 * l + 4]);
 	      vv[1] = FMAMACC(vv[1], al, ev);		 
 #else
-	      vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 4])));		  		 
+	      vv[1] = simde_mm256_add_pd(vv[1],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 4])));		  		 
 #endif
-	      _mm256_store_pd(&v[4],vv[1]);
+	      simde_mm256_store_pd(&v[4],vv[1]);
 	      
-#ifdef _FMA		    
-	      ev =  _mm256_load_pd(&extEV[20 * l + 8]);
+#ifdef SIMDE_FMA_NATIVE		    
+	      ev =  simde_mm256_load_pd(&extEV[20 * l + 8]);
 	      vv[2] = FMAMACC(vv[2], al, ev);		 
 #else
-	      vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 8])));		  		 
+	      vv[2] = simde_mm256_add_pd(vv[2],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 8])));		  		 
 #endif
-	      _mm256_store_pd(&v[8],vv[2]);
+	      simde_mm256_store_pd(&v[8],vv[2]);
 	      
-#ifdef _FMA		    
-	      ev =  _mm256_load_pd(&extEV[20 * l + 12]);
+#ifdef SIMDE_FMA_NATIVE		    
+	      ev =  simde_mm256_load_pd(&extEV[20 * l + 12]);
 	      vv[3] = FMAMACC(vv[3], al, ev);		 
 #else
-	      vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 12])));		  		 
+	      vv[3] = simde_mm256_add_pd(vv[3],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 12])));		  		 
 #endif
-	      _mm256_store_pd(&v[12],vv[3]);
+	      simde_mm256_store_pd(&v[12],vv[3]);
 	      
-#ifdef _FMA		    
-	      ev =  _mm256_load_pd(&extEV[20 * l + 16]);
+#ifdef SIMDE_FMA_NATIVE		    
+	      ev =  simde_mm256_load_pd(&extEV[20 * l + 16]);
 	      vv[4] = FMAMACC(vv[4], al, ev);		 
 #else
-	      vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 16])));			 	  
+	      vv[4] = simde_mm256_add_pd(vv[4],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 16])));			 	  
 #endif
-	      _mm256_store_pd(&v[16],vv[4]);		 
+	      simde_mm256_store_pd(&v[16],vv[4]);		 
 	    } 
 	}
 	
       v = x3_gapColumn;
       scale = 1;
-      __m256d minlikelihood_avx = _mm256_set1_pd(minlikelihood);	 
+      simde__m256d minlikelihood_avx = simde_mm256_set1_pd(minlikelihood);	 
       
       for(l = 0; scale && (l < 80); l += 4) 
 	{
-	  __m256d vv = _mm256_load_pd(&v[l]);
-	  __m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
-	  vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
-	  if(_mm256_movemask_pd(vv_abs) != 15)
+	  simde__m256d vv = simde_mm256_load_pd(&v[l]);
+	  simde__m256d vv_abs = simde_mm256_and_pd(vv,absMask_AVX.m);
+	  vv_abs = simde_mm256_cmp_pd(vv_abs,minlikelihood_avx,SIMDE_CMP_LT_OS);
+	  if(simde_mm256_movemask_pd(vv_abs) != 15)
 	    scale = 0;	     
 	}
 
       if(scale) 
 	{		     	      
-	  __m256d twotothe256v = _mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
+	  simde__m256d twotothe256v = simde_mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
 	  gapScaling = 1;
 
 	  for(l = 0; l < 80; l += 4) 
 	    {
-	      __m256d vv = _mm256_load_pd(&v[l]);
-	      _mm256_store_pd(&v[l],_mm256_mul_pd(vv,twotothe256v));
+	      simde__m256d vv = simde_mm256_load_pd(&v[l]);
+	      simde_mm256_store_pd(&v[l],simde_mm256_mul_pd(vv,twotothe256v));
 	    }
 	  
 	} 
@@ -3932,155 +3929,155 @@
 		  vr = &(x2[20 * k]);
 		  v  = &(x3_ptr[20 * k]);	      	   
 		  
-		  __m256d vv[5]; 
+		  simde__m256d vv[5]; 
 		  
-		  vv[0] = _mm256_setzero_pd();
-		  vv[1] = _mm256_setzero_pd();
-		  vv[2] = _mm256_setzero_pd();
-		  vv[3] = _mm256_setzero_pd();
-		  vv[4] = _mm256_setzero_pd();
+		  vv[0] = simde_mm256_setzero_pd();
+		  vv[1] = simde_mm256_setzero_pd();
+		  vv[2] = simde_mm256_setzero_pd();
+		  vv[3] = simde_mm256_setzero_pd();
+		  vv[4] = simde_mm256_setzero_pd();
 		  
 		  for(l = 0; l < 20; l++) 
 		    {		  
-		      __m256d al = _mm256_setzero_pd();
-		      __m256d ar = _mm256_setzero_pd();
+		      simde__m256d al = simde_mm256_setzero_pd();
+		      simde__m256d ar = simde_mm256_setzero_pd();
 		      
-		      __m256d leftv  = _mm256_load_pd(&left[k * 400 + l * 20 + 0]);
-		      __m256d rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 0]);
-		      __m256d vlv = _mm256_load_pd(&vl[0]);
-		      __m256d vrv = _mm256_load_pd(&vr[0]);
+		      simde__m256d leftv  = simde_mm256_load_pd(&left[k * 400 + l * 20 + 0]);
+		      simde__m256d rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 0]);
+		      simde__m256d vlv = simde_mm256_load_pd(&vl[0]);
+		      simde__m256d vrv = simde_mm256_load_pd(&vr[0]);
 		      
-#ifdef _FMA
+#ifdef SIMDE_FMA_NATIVE
 		      
 		      al = FMAMACC(al, vlv, leftv);
 		      ar = FMAMACC(ar, vrv, rightv);
 #else
-		      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));		  
+		      al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		      ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));		  
 #endif
 		      
-		      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 4]);
-		      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 4]);
-		      vlv = _mm256_load_pd(&vl[4]);
-		      vrv = _mm256_load_pd(&vr[4]);
-#ifdef _FMA
+		      leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 4]);
+		      rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 4]);
+		      vlv = simde_mm256_load_pd(&vl[4]);
+		      vrv = simde_mm256_load_pd(&vr[4]);
+#ifdef SIMDE_FMA_NATIVE
 		      
 		      al = FMAMACC(al, vlv, leftv);
 		      ar = FMAMACC(ar, vrv, rightv);
 #else
-		      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+		      al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		      ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 		      
-		      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 8]);
-		      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 8]);
-		      vlv = _mm256_load_pd(&vl[8]);
-		      vrv = _mm256_load_pd(&vr[8]);
-#ifdef _FMA
+		      leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 8]);
+		      rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 8]);
+		      vlv = simde_mm256_load_pd(&vl[8]);
+		      vrv = simde_mm256_load_pd(&vr[8]);
+#ifdef SIMDE_FMA_NATIVE
 		      
 		      al = FMAMACC(al, vlv, leftv);
 		      ar = FMAMACC(ar, vrv, rightv);
 #else
-		      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+		      al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		      ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 		      
-		      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 12]);
-		      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 12]);
-		      vlv = _mm256_load_pd(&vl[12]);
-		      vrv = _mm256_load_pd(&vr[12]);
-#ifdef _FMA
+		      leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 12]);
+		      rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 12]);
+		      vlv = simde_mm256_load_pd(&vl[12]);
+		      vrv = simde_mm256_load_pd(&vr[12]);
+#ifdef SIMDE_FMA_NATIVE
 		      
 		      al = FMAMACC(al, vlv, leftv);
 		      ar = FMAMACC(ar, vrv, rightv);
 #else
-		      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+		      al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		      ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 		      
-		      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 16]);
-		      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 16]);
-		      vlv = _mm256_load_pd(&vl[16]);
-		      vrv = _mm256_load_pd(&vr[16]);
+		      leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 16]);
+		      rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 16]);
+		      vlv = simde_mm256_load_pd(&vl[16]);
+		      vrv = simde_mm256_load_pd(&vr[16]);
 		      
-#ifdef _FMA		    
+#ifdef SIMDE_FMA_NATIVE		    
 		      al = FMAMACC(al, vlv, leftv);
 		      ar = FMAMACC(ar, vrv, rightv);
 #else
-		      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+		      al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		      ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 		      
 		      /**************************************************************************************************************/
 		      
 		      al = hadd3(al);
 		      ar = hadd3(ar);
-		      al = _mm256_mul_pd(ar,al);
+		      al = simde_mm256_mul_pd(ar,al);
 		      
 		      /************************************************************************************************************/
-#ifdef _FMA		    
-		      __m256d ev =  _mm256_load_pd(&extEV[20 * l + 0]);
+#ifdef SIMDE_FMA_NATIVE		    
+		      simde__m256d ev =  simde_mm256_load_pd(&extEV[20 * l + 0]);
 		      vv[0] = FMAMACC(vv[0], al, ev);		 
 #else
-		      vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 0])));			  		 		  
+		      vv[0] = simde_mm256_add_pd(vv[0],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 0])));			  		 		  
 #endif
-		      _mm256_store_pd(&v[0],vv[0]);
+		      simde_mm256_store_pd(&v[0],vv[0]);
 		      
-#ifdef _FMA		    
-		      ev =  _mm256_load_pd(&extEV[20 * l + 4]);
+#ifdef SIMDE_FMA_NATIVE		    
+		      ev =  simde_mm256_load_pd(&extEV[20 * l + 4]);
 		      vv[1] = FMAMACC(vv[1], al, ev);		 
 #else
-		      vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 4])));		  		 
+		      vv[1] = simde_mm256_add_pd(vv[1],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 4])));		  		 
 #endif
-		      _mm256_store_pd(&v[4],vv[1]);
+		      simde_mm256_store_pd(&v[4],vv[1]);
 		      
-#ifdef _FMA		    
-		      ev =  _mm256_load_pd(&extEV[20 * l + 8]);
+#ifdef SIMDE_FMA_NATIVE		    
+		      ev =  simde_mm256_load_pd(&extEV[20 * l + 8]);
 		      vv[2] = FMAMACC(vv[2], al, ev);		 
 #else
-		      vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 8])));		  		 
+		      vv[2] = simde_mm256_add_pd(vv[2],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 8])));		  		 
 #endif
-		      _mm256_store_pd(&v[8],vv[2]);
+		      simde_mm256_store_pd(&v[8],vv[2]);
 		      
-#ifdef _FMA		    
-		      ev =  _mm256_load_pd(&extEV[20 * l + 12]);
+#ifdef SIMDE_FMA_NATIVE		    
+		      ev =  simde_mm256_load_pd(&extEV[20 * l + 12]);
 		      vv[3] = FMAMACC(vv[3], al, ev);		 
 #else
-		      vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 12])));		  		 
+		      vv[3] = simde_mm256_add_pd(vv[3],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 12])));		  		 
 #endif
-		      _mm256_store_pd(&v[12],vv[3]);
+		      simde_mm256_store_pd(&v[12],vv[3]);
 		      
-#ifdef _FMA		    
-		      ev =  _mm256_load_pd(&extEV[20 * l + 16]);
+#ifdef SIMDE_FMA_NATIVE		    
+		      ev =  simde_mm256_load_pd(&extEV[20 * l + 16]);
 		      vv[4] = FMAMACC(vv[4], al, ev);		 
 #else
-		      vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 16])));			 	  
+		      vv[4] = simde_mm256_add_pd(vv[4],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 16])));			 	  
 #endif
-		      _mm256_store_pd(&v[16],vv[4]);		 
+		      simde_mm256_store_pd(&v[16],vv[4]);		 
 		    }
 		}
 	      
 	      v = x3_ptr;
 	      scale = 1;
 	      
-	      __m256d minlikelihood_avx = _mm256_set1_pd(minlikelihood);	 
+	      simde__m256d minlikelihood_avx = simde_mm256_set1_pd(minlikelihood);	 
 	      
 	      for(l = 0; scale && (l < 80); l += 4) 
 		{
-		  __m256d vv = _mm256_load_pd(&v[l]);
-		  __m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
-		  vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
-		  if(_mm256_movemask_pd(vv_abs) != 15)
+		  simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		  simde__m256d vv_abs = simde_mm256_and_pd(vv,absMask_AVX.m);
+		  vv_abs = simde_mm256_cmp_pd(vv_abs,minlikelihood_avx,SIMDE_CMP_LT_OS);
+		  if(simde_mm256_movemask_pd(vv_abs) != 15)
 		    scale = 0;	     
 		}
 	      
 	      if(scale) 
 		{		     	      
-		  __m256d twotothe256v = _mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
+		  simde__m256d twotothe256v = simde_mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
 		  for(l = 0; l < 80; l += 4) 
 		    {
-		      __m256d vv = _mm256_load_pd(&v[l]);
-		      _mm256_store_pd(&v[l],_mm256_mul_pd(vv,twotothe256v));
+		      simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		      simde_mm256_store_pd(&v[l],simde_mm256_mul_pd(vv,twotothe256v));
 		    }
 		  if(useFastScaling)
 		    addScale += wgt[i];					
--- raxml.orig/axml.c
+++ raxml/axml.c
@@ -66,17 +66,7 @@
 
 #endif
 
-#if ! (defined(__ppc) || defined(__powerpc__) || defined(PPC))
-#include <xmmintrin.h>
-/*
-  special bug fix, enforces denormalized numbers to be flushed to zero,
-  without this program is a tiny bit faster though.
-  #include <emmintrin.h> 
-  #define MM_DAZ_MASK    0x0040
-  #define MM_DAZ_ON    0x0040
-  #define MM_DAZ_OFF    0x0000
-*/
-#endif
+#include "debian/include/simde/x86/sse.h"
 
 #include "axml.h"
 #include "globalVariables.h"
@@ -13707,11 +13697,11 @@
     
     
     
-#if ! (defined(__ppc) || defined(__powerpc__) || defined(PPC))
+#if defined(SIMDE_SSE_NATIVE)
     
     /* 
        David Defour's command  
-       _mm_setcsr( _mm_getcsr() | (_MM_FLUSH_ZERO_ON | MM_DAZ_ON));  
+       simde_mm_setcsr( simde_mm_getcsr() | (_MM_FLUSH_ZERO_ON | MM_DAZ_ON));  
     */
     
     _mm_setcsr( _mm_getcsr() | _MM_FLUSH_ZERO_ON);
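
Guarding the MXCSR manipulation with SIMDE_SSE_NATIVE keeps the flush-to-zero speedup on hardware that actually has the control register, while non-x86 builds simply skip it. A minimal sketch of the guarded call, using the same macro name the patch relies on:

#include "simde/x86/sse.h"

/* enable flush-to-zero only where a native MXCSR register exists */
static void enable_flush_to_zero(void)
{
#if defined(SIMDE_SSE_NATIVE)
  _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON);
#endif
}
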
--- raxml.orig/bipartitionList.c
+++ raxml/bipartitionList.c
@@ -45,12 +45,7 @@
 #include "axml.h"
 #include "rmq.h" //include range minimum queries for fast plausibility checker
 
-#ifdef __SIM_SSE3
-
-#include <xmmintrin.h>
-#include <pmmintrin.h>
-
-#endif
+#include "debian/include/simde/x86/sse3.h"
 
 #ifdef _USE_PTHREADS
 #include <pthread.h>
--- raxml.orig/evaluateGenericSpecial.c
+++ raxml/evaluateGenericSpecial.c
@@ -40,11 +40,7 @@
 #include "axml.h"
 
 
-#ifdef __SIM_SSE3
-#include <xmmintrin.h>
-#include <pmmintrin.h>
-/*#include <tmmintrin.h>*/
-#endif
+#include "debian/include/simde/x86/sse3.h"
 
 #ifdef _USE_PTHREADS
 extern volatile double *reductionBuffer;
@@ -1028,20 +1024,20 @@
 
       diagptable = &diagptable_start[20 * cptr[i]];	           	 
 
-      __m128d tv = _mm_setzero_pd();	    
+      simde__m128d tv = simde_mm_setzero_pd();	    
 
       for(l = 0; l < 20; l+=2)
       {
-        __m128d lv = _mm_load_pd(&left[l]);
-        __m128d rv = _mm_load_pd(&right[l]);
-        __m128d mul = _mm_mul_pd(lv, rv);
-        __m128d dv = _mm_load_pd(&diagptable[l]);
+        simde__m128d lv = simde_mm_load_pd(&left[l]);
+        simde__m128d rv = simde_mm_load_pd(&right[l]);
+        simde__m128d mul = simde_mm_mul_pd(lv, rv);
+        simde__m128d dv = simde_mm_load_pd(&diagptable[l]);
 
-        tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));		   
+        tv = simde_mm_add_pd(tv, simde_mm_mul_pd(mul, dv));		   
       }		 		
 
-      tv = _mm_hadd_pd(tv, tv);
-      _mm_storel_pd(&term, tv);
+      tv = simde_mm_hadd_pd(tv, tv);
+      simde_mm_storel_pd(&term, tv);
 
       if(fastScaling)
 	term = LOG(term);
@@ -1074,20 +1070,20 @@
 
       diagptable = &diagptable_start[20 * cptr[i]];	  	
 
-      __m128d tv = _mm_setzero_pd();	    
+      simde__m128d tv = simde_mm_setzero_pd();	    
 
       for(l = 0; l < 20; l+=2)
       {
-        __m128d lv = _mm_load_pd(&left[l]);
-        __m128d rv = _mm_load_pd(&right[l]);
-        __m128d mul = _mm_mul_pd(lv, rv);
-        __m128d dv = _mm_load_pd(&diagptable[l]);
+        simde__m128d lv = simde_mm_load_pd(&left[l]);
+        simde__m128d rv = simde_mm_load_pd(&right[l]);
+        simde__m128d mul = simde_mm_mul_pd(lv, rv);
+        simde__m128d dv = simde_mm_load_pd(&diagptable[l]);
 
-        tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));		   
+        tv = simde_mm_add_pd(tv, simde_mm_mul_pd(mul, dv));		   
       }		 		
 
-      tv = _mm_hadd_pd(tv, tv);
-      _mm_storel_pd(&term, tv);
+      tv = simde_mm_hadd_pd(tv, tv);
+      simde_mm_storel_pd(&term, tv);
       
       if(fastScaling)
 	term = LOG(term);	 
@@ -1121,7 +1117,7 @@
     for (i = 0; i < n; i++) 
     {	
       double t[2] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-      __m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
+      simde__m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
 
       x1 = &(tipVector[4 * tipX1[i]]);
 
@@ -1135,22 +1131,22 @@
 
       diagptable = &diagptable_start[4 * cptr[i]];
 
-      x1v1 =  _mm_load_pd(&x1[0]);
-      x1v2 =  _mm_load_pd(&x1[2]);
-      x2v1 =  _mm_load_pd(&x2[0]);
-      x2v2 =  _mm_load_pd(&x2[2]);
-      dv1  =  _mm_load_pd(&diagptable[0]);
-      dv2  =  _mm_load_pd(&diagptable[2]);
-
-      x1v1 = _mm_mul_pd(x1v1, x2v1);
-      x1v1 = _mm_mul_pd(x1v1, dv1);
+      x1v1 =  simde_mm_load_pd(&x1[0]);
+      x1v2 =  simde_mm_load_pd(&x1[2]);
+      x2v1 =  simde_mm_load_pd(&x2[0]);
+      x2v2 =  simde_mm_load_pd(&x2[2]);
+      dv1  =  simde_mm_load_pd(&diagptable[0]);
+      dv2  =  simde_mm_load_pd(&diagptable[2]);
+
+      x1v1 = simde_mm_mul_pd(x1v1, x2v1);
+      x1v1 = simde_mm_mul_pd(x1v1, dv1);
 
-      x1v2 = _mm_mul_pd(x1v2, x2v2);
-      x1v2 = _mm_mul_pd(x1v2, dv2);
+      x1v2 = simde_mm_mul_pd(x1v2, x2v2);
+      x1v2 = simde_mm_mul_pd(x1v2, dv2);
 
-      x1v1 = _mm_add_pd(x1v1, x1v2);
+      x1v1 = simde_mm_add_pd(x1v1, x1v2);
 
-      _mm_store_pd(t, x1v1);
+      simde_mm_store_pd(t, x1v1);
 
       if(fastScaling)
 	term = LOG(t[0] + t[1]);      
@@ -1165,7 +1161,7 @@
     for (i = 0; i < n; i++) 
     { 
       double t[2] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-      __m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
+      simde__m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
 
       if(isGap(x1_gap, i))
         x1 = x1_gapColumn;
@@ -1185,22 +1181,22 @@
 
       diagptable = &diagptable_start[4 * cptr[i]];	
 
-      x1v1 =  _mm_load_pd(&x1[0]);
-      x1v2 =  _mm_load_pd(&x1[2]);
-      x2v1 =  _mm_load_pd(&x2[0]);
-      x2v2 =  _mm_load_pd(&x2[2]);
-      dv1  =  _mm_load_pd(&diagptable[0]);
-      dv2  =  _mm_load_pd(&diagptable[2]);
-
-      x1v1 = _mm_mul_pd(x1v1, x2v1);
-      x1v1 = _mm_mul_pd(x1v1, dv1);
+      x1v1 =  simde_mm_load_pd(&x1[0]);
+      x1v2 =  simde_mm_load_pd(&x1[2]);
+      x2v1 =  simde_mm_load_pd(&x2[0]);
+      x2v2 =  simde_mm_load_pd(&x2[2]);
+      dv1  =  simde_mm_load_pd(&diagptable[0]);
+      dv2  =  simde_mm_load_pd(&diagptable[2]);
+
+      x1v1 = simde_mm_mul_pd(x1v1, x2v1);
+      x1v1 = simde_mm_mul_pd(x1v1, dv1);
 
-      x1v2 = _mm_mul_pd(x1v2, x2v2);
-      x1v2 = _mm_mul_pd(x1v2, dv2);
+      x1v2 = simde_mm_mul_pd(x1v2, x2v2);
+      x1v2 = simde_mm_mul_pd(x1v2, dv2);
 
-      x1v1 = _mm_add_pd(x1v1, x1v2);
+      x1v1 = simde_mm_add_pd(x1v1, x1v2);
 
-      _mm_store_pd(t, x1v1);
+      simde_mm_store_pd(t, x1v1);
 
       if(fastScaling)
 	term = LOG(t[0] + t[1]);
@@ -1236,20 +1232,20 @@
 	  
 	  diagptable = &diagptable_start[20 * cptr[i]];	           	 
 #ifdef __SIM_SSE3
-	  __m128d tv = _mm_setzero_pd();	    
+	  simde__m128d tv = simde_mm_setzero_pd();	    
 	  
 	  for(l = 0; l < 20; l+=2)
 	    {
-	      __m128d lv = _mm_load_pd(&left[l]);
-	      __m128d rv = _mm_load_pd(&right[l]);
-	      __m128d mul = _mm_mul_pd(lv, rv);
-	      __m128d dv = _mm_load_pd(&diagptable[l]);
+	      simde__m128d lv = simde_mm_load_pd(&left[l]);
+	      simde__m128d rv = simde_mm_load_pd(&right[l]);
+	      simde__m128d mul = simde_mm_mul_pd(lv, rv);
+	      simde__m128d dv = simde_mm_load_pd(&diagptable[l]);
 	      
-	      tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));		   
+	      tv = simde_mm_add_pd(tv, simde_mm_mul_pd(mul, dv));		   
 	    }		 		
 	  
-	  tv = _mm_hadd_pd(tv, tv);
-	  _mm_storel_pd(&term, tv);
+	  tv = simde_mm_hadd_pd(tv, tv);
+	  simde_mm_storel_pd(&term, tv);
 #else  
 	  for(l = 0, term = 0.0; l < 20; l++)
 	    term += left[l] * right[l] * diagptable[l];	 	  	  
@@ -1272,20 +1268,20 @@
 	  
 	  diagptable = &diagptable_start[20 * cptr[i]];	  	
 #ifdef __SIM_SSE3
-	    __m128d tv = _mm_setzero_pd();	    
+	    simde__m128d tv = simde_mm_setzero_pd();	    
 	      	    
 	    for(l = 0; l < 20; l+=2)
 	      {
-		__m128d lv = _mm_load_pd(&left[l]);
-		__m128d rv = _mm_load_pd(&right[l]);
-		__m128d mul = _mm_mul_pd(lv, rv);
-		__m128d dv = _mm_load_pd(&diagptable[l]);
+		simde__m128d lv = simde_mm_load_pd(&left[l]);
+		simde__m128d rv = simde_mm_load_pd(&right[l]);
+		simde__m128d mul = simde_mm_mul_pd(lv, rv);
+		simde__m128d dv = simde_mm_load_pd(&diagptable[l]);
 		
-		tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));		   
+		tv = simde_mm_add_pd(tv, simde_mm_mul_pd(mul, dv));		   
 	      }		 		
 	      
-	      tv = _mm_hadd_pd(tv, tv);
-	      _mm_storel_pd(&term, tv);
+	      tv = simde_mm_hadd_pd(tv, tv);
+	      simde_mm_storel_pd(&term, tv);
 #else  
 	  for(l = 0, term = 0.0; l < 20; l++)
 	    term += left[l] * right[l] * diagptable[l];	
@@ -1487,7 +1483,7 @@
 	  diagptable = &(diagptable_start[2 * cptr[i]]);	    	    	  
 	
 #ifdef __SIM_SSE3	  
-	  _mm_store_pd(t, _mm_mul_pd(_mm_load_pd(x1), _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(diagptable))));
+	  simde_mm_store_pd(t, simde_mm_mul_pd(simde_mm_load_pd(x1), simde_mm_mul_pd(simde_mm_load_pd(x2), simde_mm_load_pd(diagptable))));
 	  
 	  if(fastScaling)
 	    term = LOG(FABS(t[0] + t[1]));
@@ -1518,7 +1514,7 @@
 	  
 	  diagptable = &diagptable_start[2 * cptr[i]];		  
 #ifdef __SIM_SSE3	  
-	  _mm_store_pd(t, _mm_mul_pd(_mm_load_pd(x1), _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(diagptable))));
+	  simde_mm_store_pd(t, simde_mm_mul_pd(simde_mm_load_pd(x1), simde_mm_mul_pd(simde_mm_load_pd(x2), simde_mm_load_pd(diagptable))));
 	  
 	  if(fastScaling)
 	    term = LOG(FABS(t[0] + t[1]));
@@ -1560,26 +1556,26 @@
 	{
 #ifdef __SIM_SSE3
 	  double t[2] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-	  __m128d termv, x1v, x2v, dv;
+	  simde__m128d termv, x1v, x2v, dv;
 #endif
 	  x1 = &(tipVector[2 * tipX1[i]]);	 
 	  x2 = &x2_start[8 * i];	          	  	
 #ifdef __SIM_SSE3	
-	  termv = _mm_set1_pd(0.0);	    	   
+	  termv = simde_mm_set1_pd(0.0);	    	   
 	  
 	  for(j = 0; j < 4; j++)
 	    {
-	      x1v = _mm_load_pd(&x1[0]);
-	      x2v = _mm_load_pd(&x2[j * 2]);
-	      dv   = _mm_load_pd(&diagptable[j * 2]);
+	      x1v = simde_mm_load_pd(&x1[0]);
+	      x2v = simde_mm_load_pd(&x2[j * 2]);
+	      dv   = simde_mm_load_pd(&diagptable[j * 2]);
 	      
-	      x1v = _mm_mul_pd(x1v, x2v);
-	      x1v = _mm_mul_pd(x1v, dv);
+	      x1v = simde_mm_mul_pd(x1v, x2v);
+	      x1v = simde_mm_mul_pd(x1v, dv);
 	      
-	      termv = _mm_add_pd(termv, x1v);	      	      
+	      termv = simde_mm_add_pd(termv, x1v);	      	      
 	    }
 	  
-	  _mm_store_pd(t, termv);	        
+	  simde_mm_store_pd(t, termv);	        
 	  
 	  if(fastScaling)
 	    term = LOG(0.25 * (FABS(t[0] + t[1])));
@@ -1605,27 +1601,27 @@
 	{
 #ifdef __SIM_SSE3
 	  double t[2] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-	  __m128d termv, x1v, x2v, dv;
+	  simde__m128d termv, x1v, x2v, dv;
 #endif	  	 	  	  
 	  x1 = &x1_start[8 * i];
 	  x2 = &x2_start[8 * i];
 	  	  
 #ifdef __SIM_SSE3	
-	  termv = _mm_set1_pd(0.0);	    	   
+	  termv = simde_mm_set1_pd(0.0);	    	   
 	  
 	  for(j = 0; j < 4; j++)
 	    {
-	      x1v = _mm_load_pd(&x1[j * 2]);
-	      x2v = _mm_load_pd(&x2[j * 2]);
-	      dv   = _mm_load_pd(&diagptable[j * 2]);
+	      x1v = simde_mm_load_pd(&x1[j * 2]);
+	      x2v = simde_mm_load_pd(&x2[j * 2]);
+	      dv   = simde_mm_load_pd(&diagptable[j * 2]);
 	      
-	      x1v = _mm_mul_pd(x1v, x2v);
-	      x1v = _mm_mul_pd(x1v, dv);
+	      x1v = simde_mm_mul_pd(x1v, x2v);
+	      x1v = simde_mm_mul_pd(x1v, dv);
 	      
-	      termv = _mm_add_pd(termv, x1v);	      	      
+	      termv = simde_mm_add_pd(termv, x1v);	      	      
 	    }
 	  
-	  _mm_store_pd(t, termv);
+	  simde_mm_store_pd(t, termv);
 	  
 	  
 	  if(fastScaling)
@@ -1739,7 +1735,7 @@
 	{	
 #ifdef __SIM_SSE3
 	  double t[2] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-	  __m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
+	  simde__m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
 #endif
 	  x1 = &(tipVector[4 * tipX1[i]]);
 	  x2 = &x2_start[4 * i];
@@ -1747,22 +1743,22 @@
 	  diagptable = &diagptable_start[4 * cptr[i]];
 	  
 #ifdef __SIM_SSE3	    	  
-	  x1v1 =  _mm_load_pd(&x1[0]);
-	  x1v2 =  _mm_load_pd(&x1[2]);
-	  x2v1 =  _mm_load_pd(&x2[0]);
-	  x2v2 =  _mm_load_pd(&x2[2]);
-	  dv1  =  _mm_load_pd(&diagptable[0]);
-	  dv2  =  _mm_load_pd(&diagptable[2]);
-	  
-	  x1v1 = _mm_mul_pd(x1v1, x2v1);
-	  x1v1 = _mm_mul_pd(x1v1, dv1);
+	  x1v1 =  simde_mm_load_pd(&x1[0]);
+	  x1v2 =  simde_mm_load_pd(&x1[2]);
+	  x2v1 =  simde_mm_load_pd(&x2[0]);
+	  x2v2 =  simde_mm_load_pd(&x2[2]);
+	  dv1  =  simde_mm_load_pd(&diagptable[0]);
+	  dv2  =  simde_mm_load_pd(&diagptable[2]);
+	  
+	  x1v1 = simde_mm_mul_pd(x1v1, x2v1);
+	  x1v1 = simde_mm_mul_pd(x1v1, dv1);
 	  
-	  x1v2 = _mm_mul_pd(x1v2, x2v2);
-	  x1v2 = _mm_mul_pd(x1v2, dv2);
+	  x1v2 = simde_mm_mul_pd(x1v2, x2v2);
+	  x1v2 = simde_mm_mul_pd(x1v2, dv2);
 	  
-	  x1v1 = _mm_add_pd(x1v1, x1v2);
+	  x1v1 = simde_mm_add_pd(x1v1, x1v2);
 	  
-	  _mm_store_pd(t, x1v1);
+	  simde_mm_store_pd(t, x1v1);
 	  
 	  if(fastScaling)
 	    term = LOG(FABS(t[0] + t[1]));
@@ -1800,7 +1796,7 @@
 	{ 
 #ifdef __SIM_SSE3
 	  double t[2] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-	   __m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
+	   simde__m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
 #endif
 	  x1 = &x1_start[4 * i];
 	  x2 = &x2_start[4 * i];
@@ -1808,22 +1804,22 @@
 	  diagptable = &diagptable_start[4 * cptr[i]];	
 	  
 #ifdef __SIM_SSE3	  
-	  x1v1 =  _mm_load_pd(&x1[0]);
-	  x1v2 =  _mm_load_pd(&x1[2]);
-	  x2v1 =  _mm_load_pd(&x2[0]);
-	  x2v2 =  _mm_load_pd(&x2[2]);
-	  dv1  =  _mm_load_pd(&diagptable[0]);
-	  dv2  =  _mm_load_pd(&diagptable[2]);
-	  
-	  x1v1 = _mm_mul_pd(x1v1, x2v1);
-	  x1v1 = _mm_mul_pd(x1v1, dv1);
+	  x1v1 =  simde_mm_load_pd(&x1[0]);
+	  x1v2 =  simde_mm_load_pd(&x1[2]);
+	  x2v1 =  simde_mm_load_pd(&x2[0]);
+	  x2v2 =  simde_mm_load_pd(&x2[2]);
+	  dv1  =  simde_mm_load_pd(&diagptable[0]);
+	  dv2  =  simde_mm_load_pd(&diagptable[2]);
+	  
+	  x1v1 = simde_mm_mul_pd(x1v1, x2v1);
+	  x1v1 = simde_mm_mul_pd(x1v1, dv1);
 	  
-	  x1v2 = _mm_mul_pd(x1v2, x2v2);
-	  x1v2 = _mm_mul_pd(x1v2, dv2);
+	  x1v2 = simde_mm_mul_pd(x1v2, x2v2);
+	  x1v2 = simde_mm_mul_pd(x1v2, dv2);
 	  
-	  x1v1 = _mm_add_pd(x1v1, x1v2);
+	  x1v1 = simde_mm_add_pd(x1v1, x1v2);
 	  
-	  _mm_store_pd(t, x1v1);
+	  simde_mm_store_pd(t, x1v1);
 	  
 	  if(fastScaling)
 	    term = LOG(FABS(t[0] + t[1]));
@@ -1874,7 +1870,7 @@
       for (i = 0; i < n; i++)
 	{
 	  double t[2] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-	  __m128d termv, x1v, x2v, dv;
+	  simde__m128d termv, x1v, x2v, dv;
 
 	  x1 = &(tipVector[4 * tipX1[i]]);	 
 	  if(x2_gap[i / 32] & mask32[i % 32])
@@ -1886,30 +1882,30 @@
 	    }
 	  
 	
-	  termv = _mm_set1_pd(0.0);	    	   
+	  termv = simde_mm_set1_pd(0.0);	    	   
 	  
 	  for(j = 0; j < 4; j++)
 	    {
-	      x1v = _mm_load_pd(&x1[0]);
-	      x2v = _mm_load_pd(&x2[j * 4]);
-	      dv   = _mm_load_pd(&diagptable[j * 4]);
+	      x1v = simde_mm_load_pd(&x1[0]);
+	      x2v = simde_mm_load_pd(&x2[j * 4]);
+	      dv   = simde_mm_load_pd(&diagptable[j * 4]);
 	      
-	      x1v = _mm_mul_pd(x1v, x2v);
-	      x1v = _mm_mul_pd(x1v, dv);
+	      x1v = simde_mm_mul_pd(x1v, x2v);
+	      x1v = simde_mm_mul_pd(x1v, dv);
 	      
-	      termv = _mm_add_pd(termv, x1v);
+	      termv = simde_mm_add_pd(termv, x1v);
 	      
-	      x1v = _mm_load_pd(&x1[2]);
-	      x2v = _mm_load_pd(&x2[j * 4 + 2]);
-	      dv   = _mm_load_pd(&diagptable[j * 4 + 2]);
+	      x1v = simde_mm_load_pd(&x1[2]);
+	      x2v = simde_mm_load_pd(&x2[j * 4 + 2]);
+	      dv   = simde_mm_load_pd(&diagptable[j * 4 + 2]);
 	      
-	      x1v = _mm_mul_pd(x1v, x2v);
-	      x1v = _mm_mul_pd(x1v, dv);
+	      x1v = simde_mm_mul_pd(x1v, x2v);
+	      x1v = simde_mm_mul_pd(x1v, dv);
 	      
-	      termv = _mm_add_pd(termv, x1v);
+	      termv = simde_mm_add_pd(termv, x1v);
 	    }
 	  
-	  _mm_store_pd(t, termv);	  	 
+	  simde_mm_store_pd(t, termv);	  	 
 
 	  if(fastScaling)
 	    term = LOG(0.25 * FABS(t[0] + t[1]));
@@ -1926,7 +1922,7 @@
 	{
 
 	  double t[2] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-	  __m128d termv, x1v, x2v, dv;
+	  simde__m128d termv, x1v, x2v, dv;
 
 	  if(x1_gap[i / 32] & mask32[i % 32])
 	    x1 = x1_gapColumn;
@@ -1944,30 +1940,30 @@
 	      x2_ptr += 16;
 	    }
 	
-	  termv = _mm_set1_pd(0.0);	  	 
+	  termv = simde_mm_set1_pd(0.0);	  	 
 	  
 	  for(j = 0; j < 4; j++)
 	    {
-	      x1v = _mm_load_pd(&x1[j * 4]);
-	      x2v = _mm_load_pd(&x2[j * 4]);
-	      dv   = _mm_load_pd(&diagptable[j * 4]);
+	      x1v = simde_mm_load_pd(&x1[j * 4]);
+	      x2v = simde_mm_load_pd(&x2[j * 4]);
+	      dv   = simde_mm_load_pd(&diagptable[j * 4]);
 	      
-	      x1v = _mm_mul_pd(x1v, x2v);
-	      x1v = _mm_mul_pd(x1v, dv);
+	      x1v = simde_mm_mul_pd(x1v, x2v);
+	      x1v = simde_mm_mul_pd(x1v, dv);
 	      
-	      termv = _mm_add_pd(termv, x1v);
+	      termv = simde_mm_add_pd(termv, x1v);
 	      
-	      x1v = _mm_load_pd(&x1[j * 4 + 2]);
-	      x2v = _mm_load_pd(&x2[j * 4 + 2]);
-	      dv   = _mm_load_pd(&diagptable[j * 4 + 2]);
+	      x1v = simde_mm_load_pd(&x1[j * 4 + 2]);
+	      x2v = simde_mm_load_pd(&x2[j * 4 + 2]);
+	      dv   = simde_mm_load_pd(&diagptable[j * 4 + 2]);
 	      
-	      x1v = _mm_mul_pd(x1v, x2v);
-	      x1v = _mm_mul_pd(x1v, dv);
+	      x1v = simde_mm_mul_pd(x1v, x2v);
+	      x1v = simde_mm_mul_pd(x1v, dv);
 	      
-	      termv = _mm_add_pd(termv, x1v);
+	      termv = simde_mm_add_pd(termv, x1v);
 	    }
 	  
-	  _mm_store_pd(t, termv);
+	  simde_mm_store_pd(t, termv);
 
 	  if(fastScaling)
 	    term = LOG(0.25 * FABS(t[0] + t[1]));
@@ -2007,36 +2003,36 @@
 	{
 #ifdef __SIM_SSE3
 	  double t[2] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-	  __m128d termv, x1v, x2v, dv;
+	  simde__m128d termv, x1v, x2v, dv;
 #endif
 	  x1 = &(tipVector[4 * tipX1[i]]);	 
 	  x2 = &x2_start[16 * i];	 
 	  
 #ifdef __SIM_SSE3	
-	  termv = _mm_set1_pd(0.0);	    	   
+	  termv = simde_mm_set1_pd(0.0);	    	   
 	  
 	  for(j = 0; j < 4; j++)
 	    {
-	      x1v = _mm_load_pd(&x1[0]);
-	      x2v = _mm_load_pd(&x2[j * 4]);
-	      dv   = _mm_load_pd(&diagptable[j * 4]);
+	      x1v = simde_mm_load_pd(&x1[0]);
+	      x2v = simde_mm_load_pd(&x2[j * 4]);
+	      dv   = simde_mm_load_pd(&diagptable[j * 4]);
 	      
-	      x1v = _mm_mul_pd(x1v, x2v);
-	      x1v = _mm_mul_pd(x1v, dv);
+	      x1v = simde_mm_mul_pd(x1v, x2v);
+	      x1v = simde_mm_mul_pd(x1v, dv);
 	      
-	      termv = _mm_add_pd(termv, x1v);
+	      termv = simde_mm_add_pd(termv, x1v);
 	      
-	      x1v = _mm_load_pd(&x1[2]);
-	      x2v = _mm_load_pd(&x2[j * 4 + 2]);
-	      dv   = _mm_load_pd(&diagptable[j * 4 + 2]);
+	      x1v = simde_mm_load_pd(&x1[2]);
+	      x2v = simde_mm_load_pd(&x2[j * 4 + 2]);
+	      dv   = simde_mm_load_pd(&diagptable[j * 4 + 2]);
 	      
-	      x1v = _mm_mul_pd(x1v, x2v);
-	      x1v = _mm_mul_pd(x1v, dv);
+	      x1v = simde_mm_mul_pd(x1v, x2v);
+	      x1v = simde_mm_mul_pd(x1v, dv);
 	      
-	      termv = _mm_add_pd(termv, x1v);
+	      termv = simde_mm_add_pd(termv, x1v);
 	    }
 	  
-	  _mm_store_pd(t, termv);
+	  simde_mm_store_pd(t, termv);
 	  
 	  
 	  if(fastScaling)
@@ -2063,37 +2059,37 @@
 	{
 #ifdef __SIM_SSE3
 	  double t[2] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-	  __m128d termv, x1v, x2v, dv;
+	  simde__m128d termv, x1v, x2v, dv;
 #endif
 	  	 	  	  
 	  x1 = &x1_start[16 * i];
 	  x2 = &x2_start[16 * i];	  	  
 	
 #ifdef __SIM_SSE3	
-	  termv = _mm_set1_pd(0.0);	  	 
+	  termv = simde_mm_set1_pd(0.0);	  	 
 	  
 	  for(j = 0; j < 4; j++)
 	    {
-	      x1v = _mm_load_pd(&x1[j * 4]);
-	      x2v = _mm_load_pd(&x2[j * 4]);
-	      dv   = _mm_load_pd(&diagptable[j * 4]);
+	      x1v = simde_mm_load_pd(&x1[j * 4]);
+	      x2v = simde_mm_load_pd(&x2[j * 4]);
+	      dv   = simde_mm_load_pd(&diagptable[j * 4]);
 	      
-	      x1v = _mm_mul_pd(x1v, x2v);
-	      x1v = _mm_mul_pd(x1v, dv);
+	      x1v = simde_mm_mul_pd(x1v, x2v);
+	      x1v = simde_mm_mul_pd(x1v, dv);
 	      
-	      termv = _mm_add_pd(termv, x1v);
+	      termv = simde_mm_add_pd(termv, x1v);
 	      
-	      x1v = _mm_load_pd(&x1[j * 4 + 2]);
-	      x2v = _mm_load_pd(&x2[j * 4 + 2]);
-	      dv   = _mm_load_pd(&diagptable[j * 4 + 2]);
+	      x1v = simde_mm_load_pd(&x1[j * 4 + 2]);
+	      x2v = simde_mm_load_pd(&x2[j * 4 + 2]);
+	      dv   = simde_mm_load_pd(&diagptable[j * 4 + 2]);
 	      
-	      x1v = _mm_mul_pd(x1v, x2v);
-	      x1v = _mm_mul_pd(x1v, dv);
+	      x1v = simde_mm_mul_pd(x1v, x2v);
+	      x1v = simde_mm_mul_pd(x1v, dv);
 	      
-	      termv = _mm_add_pd(termv, x1v);
+	      termv = simde_mm_add_pd(termv, x1v);
 	    }
 	  
-	  _mm_store_pd(t, termv);
+	  simde_mm_store_pd(t, termv);
 
 	  if(fastScaling)
 	    term = LOG(0.25 * FABS(t[0] + t[1]));
@@ -2215,7 +2211,7 @@
       for (i = 0; i < n; i++) 
 	{
 #ifdef __SIM_SSE3
-	  __m128d tv = _mm_setzero_pd();
+	  simde__m128d tv = simde_mm_setzero_pd();
 	  left = &(tipVector[20 * tipX1[i]]);	  	  
 	  
 	  for(j = 0, term = 0.0; j < 4; j++)
@@ -2224,12 +2220,12 @@
 	      right = &(x2[80 * i + 20 * j]);
 	      for(l = 0; l < 20; l+=2)
 		{
-		  __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
-		  tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));		   
+		  simde__m128d mul = simde_mm_mul_pd(simde_mm_load_pd(&left[l]), simde_mm_load_pd(&right[l]));
+		  tv = simde_mm_add_pd(tv, simde_mm_mul_pd(mul, simde_mm_load_pd(&d[l])));		   
 		}		 		
 	    }
-	  tv = _mm_hadd_pd(tv, tv);
-	  _mm_storel_pd(&term, tv);
+	  tv = simde_mm_hadd_pd(tv, tv);
+	  simde_mm_storel_pd(&term, tv);
 	  
 #else
 	  left = &(tipVector[20 * tipX1[i]]);	  	  
@@ -2255,7 +2251,7 @@
       for (i = 0; i < n; i++) 
 	{	  	 	             
 #ifdef __SIM_SSE3
-	  __m128d tv = _mm_setzero_pd();	 	  	  
+	  simde__m128d tv = simde_mm_setzero_pd();	 	  	  
 	      
 	  for(j = 0, term = 0.0; j < 4; j++)
 	    {
@@ -2265,12 +2261,12 @@
 	      
 	      for(l = 0; l < 20; l+=2)
 		{
-		  __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
-		  tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));		   
+		  simde__m128d mul = simde_mm_mul_pd(simde_mm_load_pd(&left[l]), simde_mm_load_pd(&right[l]));
+		  tv = simde_mm_add_pd(tv, simde_mm_mul_pd(mul, simde_mm_load_pd(&d[l])));		   
 		}		 		
 	    }
-	  tv = _mm_hadd_pd(tv, tv);
-	  _mm_storel_pd(&term, tv);	  
+	  tv = simde_mm_hadd_pd(tv, tv);
+	  simde_mm_storel_pd(&term, tv);	  
 #else
 	  for(j = 0, term = 0.0; j < 4; j++)
 	    {
@@ -2393,7 +2389,7 @@
         x2_ptr += 80;
       }
 
-      __m128d tv = _mm_setzero_pd();
+      simde__m128d tv = simde_mm_setzero_pd();
       left = &(tipVector[20 * tipX1[i]]);	  	  
 
       for(j = 0, term = 0.0; j < 4; j++)
@@ -2402,13 +2398,13 @@
         right = &(x2v[20 * j]);
         for(l = 0; l < 20; l+=2)
         {
-          __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
-          tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));		   
+          simde__m128d mul = simde_mm_mul_pd(simde_mm_load_pd(&left[l]), simde_mm_load_pd(&right[l]));
+          tv = simde_mm_add_pd(tv, simde_mm_mul_pd(mul, simde_mm_load_pd(&d[l])));		   
         }		 		
       }
 
-      tv = _mm_hadd_pd(tv, tv);
-      _mm_storel_pd(&term, tv);
+      tv = simde_mm_hadd_pd(tv, tv);
+      simde_mm_storel_pd(&term, tv);
 
 
       if(fastScaling)
@@ -2439,7 +2435,7 @@
         x2_ptr += 80;
       }
 
-      __m128d tv = _mm_setzero_pd();	 	  	  
+      simde__m128d tv = simde_mm_setzero_pd();	 	  	  
 
       for(j = 0, term = 0.0; j < 4; j++)
       {
@@ -2449,12 +2445,12 @@
 
         for(l = 0; l < 20; l+=2)
         {
-          __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
-          tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));		   
+          simde__m128d mul = simde_mm_mul_pd(simde_mm_load_pd(&left[l]), simde_mm_load_pd(&right[l]));
+          tv = simde_mm_add_pd(tv, simde_mm_mul_pd(mul, simde_mm_load_pd(&d[l])));		   
         }		 		
       }
-      tv = _mm_hadd_pd(tv, tv);
-      _mm_storel_pd(&term, tv);	  
+      tv = simde_mm_hadd_pd(tv, tv);
+      simde_mm_storel_pd(&term, tv);	  
 
       if(fastScaling)
         term = LOG(0.25 * term);
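
Every hunk in this file is the same mechanical substitution: the two-lane
dot-product kernel keeps its structure, and only the _mm_* spellings become
simde_mm_* spellings. A minimal standalone sketch of that kernel, assuming the
SIMDE headers are on the include path (the patch itself points at the vendored
copy under debian/include); dot20 and its parameter names are illustrative,
not RAxML's:

    #include <simde/x86/sse3.h>

    /* Sum of left[l] * right[l] * diag[l] over 20 doubles, two lanes at a
       time; all three arrays must be 16-byte aligned for simde_mm_load_pd. */
    static double dot20(const double *left, const double *right, const double *diag)
    {
      simde__m128d tv = simde_mm_setzero_pd();
      double term;
      int l;

      for(l = 0; l < 20; l += 2)
        {
          simde__m128d mul = simde_mm_mul_pd(simde_mm_load_pd(&left[l]),
                                             simde_mm_load_pd(&right[l]));
          tv = simde_mm_add_pd(tv, simde_mm_mul_pd(mul, simde_mm_load_pd(&diag[l])));
        }

      tv = simde_mm_hadd_pd(tv, tv);   /* fold the two partial sums into both lanes */
      simde_mm_storel_pd(&term, tv);   /* write out the low lane */
      return term;
    }
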
--- raxml.orig/evaluatePartialGenericSpecial.c
+++ raxml/evaluatePartialGenericSpecial.c
@@ -40,11 +40,7 @@
 #include <string.h>
 #include "axml.h"
 
-#ifdef __SIM_SSE3
-#include <xmmintrin.h>
-#include <pmmintrin.h>
-#endif
-
+#include "debian/include/simde/x86/sse3.h"
 
 /********************** GTRCAT ***************************************/
 
@@ -104,46 +100,46 @@
 
     for(l = 0; l < 20; l+=2)
       {
-	__m128d d1v = _mm_mul_pd(_mm_load_pd(&x1[l]), _mm_load_pd(&e1[l]));
-	__m128d d2v = _mm_mul_pd(_mm_load_pd(&x2[l]), _mm_load_pd(&e2[l]));
+	simde__m128d d1v = simde_mm_mul_pd(simde_mm_load_pd(&x1[l]), simde_mm_load_pd(&e1[l]));
+	simde__m128d d2v = simde_mm_mul_pd(simde_mm_load_pd(&x2[l]), simde_mm_load_pd(&e2[l]));
 	
-	_mm_store_pd(&d1[l], d1v);
-	_mm_store_pd(&d2[l], d2v);	
+	simde_mm_store_pd(&d1[l], d1v);
+	simde_mm_store_pd(&d2[l], d2v);	
       }
 
-    __m128d zero = _mm_setzero_pd();
+    simde__m128d zero = simde_mm_setzero_pd();
 
     for(l = 0; l < 20; l+=2)
-      _mm_store_pd(&x3[l], zero);
+      simde_mm_store_pd(&x3[l], zero);
                 
     for(l = 0; l < 20; l++)
       { 	      
 	double *ev = &EV[l * 20];
-	__m128d ump_x1v = _mm_setzero_pd();
-	__m128d ump_x2v = _mm_setzero_pd();
-	__m128d x1px2v;
+	simde__m128d ump_x1v = simde_mm_setzero_pd();
+	simde__m128d ump_x2v = simde_mm_setzero_pd();
+	simde__m128d x1px2v;
 
 	for(k = 0; k < 20; k+=2)
 	  {       
-	    __m128d eiv = _mm_load_pd(&EI[20 * l + k]);
-	    __m128d d1v = _mm_load_pd(&d1[k]);
-	    __m128d d2v = _mm_load_pd(&d2[k]);
-	    ump_x1v = _mm_add_pd(ump_x1v, _mm_mul_pd(d1v, eiv));
-	    ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(d2v, eiv));	  
+	    simde__m128d eiv = simde_mm_load_pd(&EI[20 * l + k]);
+	    simde__m128d d1v = simde_mm_load_pd(&d1[k]);
+	    simde__m128d d2v = simde_mm_load_pd(&d2[k]);
+	    ump_x1v = simde_mm_add_pd(ump_x1v, simde_mm_mul_pd(d1v, eiv));
+	    ump_x2v = simde_mm_add_pd(ump_x2v, simde_mm_mul_pd(d2v, eiv));	  
 	  }
 
-	ump_x1v = _mm_hadd_pd(ump_x1v, ump_x1v);
-	ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
+	ump_x1v = simde_mm_hadd_pd(ump_x1v, ump_x1v);
+	ump_x2v = simde_mm_hadd_pd(ump_x2v, ump_x2v);
 
-	x1px2v = _mm_mul_pd(ump_x1v, ump_x2v);
+	x1px2v = simde_mm_mul_pd(ump_x1v, ump_x2v);
 
 	for(k = 0; k < 20; k+=2)
 	  {
-	    __m128d ex3v = _mm_load_pd(&x3[k]);
-	    __m128d EVV  = _mm_load_pd(&ev[k]);
-	    ex3v = _mm_add_pd(ex3v, _mm_mul_pd(x1px2v, EVV));
+	    simde__m128d ex3v = simde_mm_load_pd(&x3[k]);
+	    simde__m128d EVV  = simde_mm_load_pd(&ev[k]);
+	    ex3v = simde_mm_add_pd(ex3v, simde_mm_mul_pd(x1px2v, EVV));
 	    
-	    _mm_store_pd(&x3[k], ex3v);	   	   
+	    simde_mm_store_pd(&x3[k], ex3v);	   	   
 	  }
       }                      
     
@@ -153,12 +149,12 @@
     
     if(scale)
       {	      
-	__m128d twoto = _mm_set_pd(twotothe256, twotothe256);
+	simde__m128d twoto = simde_mm_set_pd(twotothe256, twotothe256);
 
 	for(l = 0; l < 20; l+=2)
 	  {
-	    __m128d ex3v = _mm_mul_pd(_mm_load_pd(&x3[l]),twoto);
-	    _mm_store_pd(&x3[l], ex3v);	
+	    simde__m128d ex3v = simde_mm_mul_pd(simde_mm_load_pd(&x3[l]),twoto);
+	    simde_mm_store_pd(&x3[l], ex3v);	
 	  }
  	
 	/*
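
The evaluatePartialGenericSpecial.c hunks above port the same idea with two
running accumulators whose folded sums are multiplied together. A sketch under
the same assumptions (SIMDE on the include path, 16-byte-aligned 20-element
inputs); paired_dot20 is an illustrative name:

    #include <simde/x86/sse3.h>

    /* Returns (d1 . ei) * (d2 . ei) for 20-element aligned inputs. */
    static double paired_dot20(const double *d1, const double *d2, const double *ei)
    {
      simde__m128d u1 = simde_mm_setzero_pd(), u2 = simde_mm_setzero_pd();
      double out;
      int k;

      for(k = 0; k < 20; k += 2)
        {
          simde__m128d eiv = simde_mm_load_pd(&ei[k]);
          u1 = simde_mm_add_pd(u1, simde_mm_mul_pd(simde_mm_load_pd(&d1[k]), eiv));
          u2 = simde_mm_add_pd(u2, simde_mm_mul_pd(simde_mm_load_pd(&d2[k]), eiv));
        }

      u1 = simde_mm_hadd_pd(u1, u1);   /* fold each accumulator */
      u2 = simde_mm_hadd_pd(u2, u2);
      simde_mm_storel_pd(&out, simde_mm_mul_pd(u1, u2));
      return out;
    }
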
--- raxml.orig/fastDNAparsimony.c
+++ raxml/fastDNAparsimony.c
@@ -58,21 +58,7 @@
 
 #endif
 
-
-#ifdef __SIM_SSE3
-
-#include <xmmintrin.h>
-#include <pmmintrin.h>
-  
-#endif
-
-#ifdef __AVX
-
-#include <xmmintrin.h>
-#include <immintrin.h>
-
-#endif
-
+#include "debian/include/simde/x86/avx.h"
 
 #include "axml.h"
 
@@ -85,30 +71,30 @@
 #ifdef __SIM_SSE3
 
 #define INTS_PER_VECTOR 4
-#define INT_TYPE __m128i
-#define CAST __m128i*
-#define SET_ALL_BITS_ONE _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
-#define SET_ALL_BITS_ZERO _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000)
-#define VECTOR_LOAD _mm_load_si128
-#define VECTOR_BIT_AND _mm_and_si128
-#define VECTOR_BIT_OR  _mm_or_si128
-#define VECTOR_STORE  _mm_store_si128
-#define VECTOR_AND_NOT _mm_andnot_si128
+#define INT_TYPE simde__m128i
+#define CAST simde__m128i*
+#define SET_ALL_BITS_ONE simde_mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
+#define SET_ALL_BITS_ZERO simde_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000)
+#define VECTOR_LOAD simde_mm_load_si128
+#define VECTOR_BIT_AND simde_mm_and_si128
+#define VECTOR_BIT_OR  simde_mm_or_si128
+#define VECTOR_STORE  simde_mm_store_si128
+#define VECTOR_AND_NOT simde_mm_andnot_si128
 
 #endif
 
 #ifdef __AVX
 
 #define INTS_PER_VECTOR 8
-#define INT_TYPE __m256d
+#define INT_TYPE simde__m256d
 #define CAST double*
-#define SET_ALL_BITS_ONE (__m256d)_mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
-#define SET_ALL_BITS_ZERO (__m256d)_mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000)
-#define VECTOR_LOAD _mm256_load_pd
-#define VECTOR_BIT_AND _mm256_and_pd
-#define VECTOR_BIT_OR  _mm256_or_pd
-#define VECTOR_STORE  _mm256_store_pd
-#define VECTOR_AND_NOT _mm256_andnot_pd
+#define SET_ALL_BITS_ONE simde_mm256_castsi256_pd(simde_mm256_set1_epi32(0xFFFFFFFF))
+#define SET_ALL_BITS_ZERO simde_mm256_setzero_pd()
+#define VECTOR_LOAD simde_mm256_load_pd
+#define VECTOR_BIT_AND simde_mm256_and_pd
+#define VECTOR_BIT_OR  simde_mm256_or_pd
+#define VECTOR_STORE  simde_mm256_store_pd
+#define VECTOR_AND_NOT simde_mm256_andnot_pd
 
 #endif
 
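The SET_ALL_BITS_ONE definition above has to reinterpret bits, not convert
values: simde_mm256_set_pd() takes doubles, so an argument of
0xFFFFFFFFFFFFFFFF would be converted to the value 2^64 (bit pattern
0x43F0000000000000) rather than an all-ones mask, silently breaking
VECTOR_BIT_AND and VECTOR_AND_NOT. A small check of the difference, assuming
SIMDE on the include path:

    #include <simde/x86/avx.h>
    #include <inttypes.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
      /* Reinterprets integer bits: every 64-bit lane is 0xffffffffffffffff. */
      simde__m256d ones  = simde_mm256_castsi256_pd(simde_mm256_set1_epi32(0xFFFFFFFF));
      /* Converts the integer *value* to double: every lane is 2^64. */
      simde__m256d wrong = simde_mm256_set1_pd((double)0xFFFFFFFFFFFFFFFFULL);

      uint64_t bits[4];
      memcpy(bits, &ones, sizeof(bits));
      printf("%016" PRIx64 "\n", bits[0]);   /* prints ffffffffffffffff */
      memcpy(bits, &wrong, sizeof(bits));
      printf("%016" PRIx64 "\n", bits[0]);   /* prints 43f0000000000000 */
      return 0;
    }
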
--- raxml.orig/makenewzGenericSpecial.c
+++ raxml/makenewzGenericSpecial.c
@@ -43,11 +43,7 @@
 #include <string.h>
 #include "axml.h"
 
-#ifdef __SIM_SSE3
-#include <xmmintrin.h>
-#include <pmmintrin.h>
-/*#include <tmmintrin.h>*/
-#endif
+#include "debian/include/simde/x86/sse3.h"
 
 #ifdef _USE_PTHREADS
 extern volatile double *reductionBuffer;
@@ -82,7 +78,7 @@
 	  for(j = 0; j < 2; j++)
 	    sum[i * 2 + j]     = x1[j] * x2[j];
 #else
-	  _mm_store_pd(&sum[i * 2], _mm_mul_pd( _mm_load_pd(x1), _mm_load_pd(x2)));
+	  simde_mm_store_pd(&sum[i * 2], simde_mm_mul_pd( simde_mm_load_pd(x1), simde_mm_load_pd(x2)));
 #endif
 	}
       break;
@@ -96,7 +92,7 @@
 	  for(j = 0; j < 2; j++)
 	    sum[i * 2 + j]     = x1[j] * x2[j];
 #else
-	  _mm_store_pd(&sum[i * 2], _mm_mul_pd( _mm_load_pd(x1), _mm_load_pd(x2)));  
+	  simde_mm_store_pd(&sum[i * 2], simde_mm_mul_pd( simde_mm_load_pd(x1), simde_mm_load_pd(x2)));  
 #endif
 	}
       break;
@@ -109,7 +105,7 @@
 	  for(j = 0; j < 2; j++)
 	    sum[i * 2 + j]     = x1[j] * x2[j];
 #else
-	  _mm_store_pd(&sum[i * 2], _mm_mul_pd( _mm_load_pd(x1), _mm_load_pd(x2)));   
+	  simde_mm_store_pd(&sum[i * 2], simde_mm_mul_pd( simde_mm_load_pd(x1), simde_mm_load_pd(x2)));   
 #endif
 	}
       break;
@@ -185,8 +181,8 @@
         x1 = &(tipVector[4 * tipX1[i]]);
         x2 = &(tipVector[4 * tipX2[i]]);
 
-        _mm_store_pd( &sum[i*4 + 0], _mm_mul_pd( _mm_load_pd( &x1[0] ), _mm_load_pd( &x2[0] )));
-        _mm_store_pd( &sum[i*4 + 2], _mm_mul_pd( _mm_load_pd( &x1[2] ), _mm_load_pd( &x2[2] )));
+        simde_mm_store_pd( &sum[i*4 + 0], simde_mm_mul_pd( simde_mm_load_pd( &x1[0] ), simde_mm_load_pd( &x2[0] )));
+        simde_mm_store_pd( &sum[i*4 + 2], simde_mm_mul_pd( simde_mm_load_pd( &x1[2] ), simde_mm_load_pd( &x2[2] )));
       }
       break;
     case TIP_INNER:
@@ -201,8 +197,8 @@
           x2_ptr += 4;
         }
 
-        _mm_store_pd( &sum[i*4 + 0], _mm_mul_pd( _mm_load_pd( &x1[0] ), _mm_load_pd( &x2[0] )));
-        _mm_store_pd( &sum[i*4 + 2], _mm_mul_pd( _mm_load_pd( &x1[2] ), _mm_load_pd( &x2[2] )));
+        simde_mm_store_pd( &sum[i*4 + 0], simde_mm_mul_pd( simde_mm_load_pd( &x1[0] ), simde_mm_load_pd( &x2[0] )));
+        simde_mm_store_pd( &sum[i*4 + 2], simde_mm_mul_pd( simde_mm_load_pd( &x1[2] ), simde_mm_load_pd( &x2[2] )));
       }
       break;
     case INNER_INNER:
@@ -224,8 +220,8 @@
           x2_ptr += 4;
         }
 
-        _mm_store_pd( &sum[i*4 + 0], _mm_mul_pd( _mm_load_pd( &x1[0] ), _mm_load_pd( &x2[0] )));
-        _mm_store_pd( &sum[i*4 + 2], _mm_mul_pd( _mm_load_pd( &x1[2] ), _mm_load_pd( &x2[2] )));
+        simde_mm_store_pd( &sum[i*4 + 0], simde_mm_mul_pd( simde_mm_load_pd( &x1[0] ), simde_mm_load_pd( &x2[0] )));
+        simde_mm_store_pd( &sum[i*4 + 2], simde_mm_mul_pd( simde_mm_load_pd( &x1[2] ), simde_mm_load_pd( &x2[2] )));
 
       }    
       break;
@@ -251,8 +247,8 @@
 	  x1 = &(tipVector[4 * tipX1[i]]);
 	  x2 = &(tipVector[4 * tipX2[i]]);
 
-	  _mm_store_pd( &sum[i*4 + 0], _mm_mul_pd( _mm_load_pd( &x1[0] ), _mm_load_pd( &x2[0] )));
-	  _mm_store_pd( &sum[i*4 + 2], _mm_mul_pd( _mm_load_pd( &x1[2] ), _mm_load_pd( &x2[2] )));
+	  simde_mm_store_pd( &sum[i*4 + 0], simde_mm_mul_pd( simde_mm_load_pd( &x1[0] ), simde_mm_load_pd( &x2[0] )));
+	  simde_mm_store_pd( &sum[i*4 + 2], simde_mm_mul_pd( simde_mm_load_pd( &x1[2] ), simde_mm_load_pd( &x2[2] )));
 	}
       break;
     case TIP_INNER:
@@ -261,8 +257,8 @@
 	  x1 = &(tipVector[4 * tipX1[i]]);
 	  x2 = &x2_start[4 * i];
 
-	  _mm_store_pd( &sum[i*4 + 0], _mm_mul_pd( _mm_load_pd( &x1[0] ), _mm_load_pd( &x2[0] )));
-	  _mm_store_pd( &sum[i*4 + 2], _mm_mul_pd( _mm_load_pd( &x1[2] ), _mm_load_pd( &x2[2] )));
+	  simde_mm_store_pd( &sum[i*4 + 0], simde_mm_mul_pd( simde_mm_load_pd( &x1[0] ), simde_mm_load_pd( &x2[0] )));
+	  simde_mm_store_pd( &sum[i*4 + 2], simde_mm_mul_pd( simde_mm_load_pd( &x1[2] ), simde_mm_load_pd( &x2[2] )));
 	}
       break;
     case INNER_INNER:
@@ -271,8 +267,8 @@
 	  x1 = &x1_start[4 * i];
 	  x2 = &x2_start[4 * i];
 
-	  _mm_store_pd( &sum[i*4 + 0], _mm_mul_pd( _mm_load_pd( &x1[0] ), _mm_load_pd( &x2[0] )));
-	  _mm_store_pd( &sum[i*4 + 2], _mm_mul_pd( _mm_load_pd( &x1[2] ), _mm_load_pd( &x2[2] )));
+	  simde_mm_store_pd( &sum[i*4 + 0], simde_mm_mul_pd( simde_mm_load_pd( &x1[0] ), simde_mm_load_pd( &x2[0] )));
+	  simde_mm_store_pd( &sum[i*4 + 2], simde_mm_mul_pd( simde_mm_load_pd( &x1[2] ), simde_mm_load_pd( &x2[2] )));
 
 	}    
       break;
@@ -356,7 +352,7 @@
   double e2[4] __attribute__ ((aligned (BYTE_ALIGNMENT)));
   double dd1, dd2, dd3;
 
-  __m128d
+  simde__m128d
     e1v[2],
     e2v[2];
 
@@ -369,11 +365,11 @@
   e1[3] = EIGN[2];
   e2[3] = EIGN[2] * EIGN[2];
 
-  e1v[0]= _mm_load_pd(&e1[0]);
-  e1v[1]= _mm_load_pd(&e1[2]);
+  e1v[0]= simde_mm_load_pd(&e1[0]);
+  e1v[1]= simde_mm_load_pd(&e1[2]);
 
-  e2v[0]= _mm_load_pd(&e2[0]);
-  e2v[1]= _mm_load_pd(&e2[2]);
+  e2v[0]= simde_mm_load_pd(&e2[0]);
+  e2v[1]= simde_mm_load_pd(&e2[2]);
 
   d = d_start = (double *)rax_malloc(numberOfCategories * 4 * sizeof(double));
 
@@ -399,22 +395,22 @@
       
       d = &d_start[4 * cptr[i]];  
       
-      __m128d tmp_0v =_mm_mul_pd(_mm_load_pd(&d[0]),_mm_load_pd(&s[0]));
-      __m128d tmp_1v =_mm_mul_pd(_mm_load_pd(&d[2]),_mm_load_pd(&s[2]));
+      simde__m128d tmp_0v =simde_mm_mul_pd(simde_mm_load_pd(&d[0]),simde_mm_load_pd(&s[0]));
+      simde__m128d tmp_1v =simde_mm_mul_pd(simde_mm_load_pd(&d[2]),simde_mm_load_pd(&s[2]));
 
-      __m128d inv_Liv    = _mm_add_pd(tmp_0v, tmp_1v);      
+      simde__m128d inv_Liv    = simde_mm_add_pd(tmp_0v, tmp_1v);      
             	  
-      __m128d dlnLidlzv   = _mm_add_pd(_mm_mul_pd(tmp_0v, e1v[0]), _mm_mul_pd(tmp_1v, e1v[1]));	  
-      __m128d d2lnLidlz2v = _mm_add_pd(_mm_mul_pd(tmp_0v, e2v[0]), _mm_mul_pd(tmp_1v, e2v[1]));
+      simde__m128d dlnLidlzv   = simde_mm_add_pd(simde_mm_mul_pd(tmp_0v, e1v[0]), simde_mm_mul_pd(tmp_1v, e1v[1]));	  
+      simde__m128d d2lnLidlz2v = simde_mm_add_pd(simde_mm_mul_pd(tmp_0v, e2v[0]), simde_mm_mul_pd(tmp_1v, e2v[1]));
 
 
-      inv_Liv   = _mm_hadd_pd(inv_Liv, inv_Liv);
-      dlnLidlzv = _mm_hadd_pd(dlnLidlzv, dlnLidlzv);
-      d2lnLidlz2v = _mm_hadd_pd(d2lnLidlz2v, d2lnLidlz2v);                 
+      inv_Liv   = simde_mm_hadd_pd(inv_Liv, inv_Liv);
+      dlnLidlzv = simde_mm_hadd_pd(dlnLidlzv, dlnLidlzv);
+      d2lnLidlz2v = simde_mm_hadd_pd(d2lnLidlz2v, d2lnLidlz2v);                 
  
-      _mm_storel_pd(&inv_Li, inv_Liv);     
-      _mm_storel_pd(&dlnLidlz, dlnLidlzv);                 
-      _mm_storel_pd(&d2lnLidlz2, d2lnLidlz2v);      
+      simde_mm_storel_pd(&inv_Li, inv_Liv);     
+      simde_mm_storel_pd(&dlnLidlz, dlnLidlzv);                 
+      simde_mm_storel_pd(&d2lnLidlz2, d2lnLidlz2v);      
 
       inv_Li = 1.0/FABS(inv_Li);
 
@@ -539,9 +535,9 @@
 
         for(l = 0; l < 20; l+=2)
         {
-          __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+          simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[l]), simde_mm_load_pd(&right[l]));
 
-          _mm_store_pd(&sum[l], sumv);		 
+          simde_mm_store_pd(&sum[l], sumv);		 
         }
 
       }
@@ -563,9 +559,9 @@
 
         for(l = 0; l < 20; l+=2)
         {
-          __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+          simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[l]), simde_mm_load_pd(&right[l]));
 
-          _mm_store_pd(&sum[l], sumv);		 
+          simde_mm_store_pd(&sum[l], sumv);		 
         }
 
       }
@@ -593,9 +589,9 @@
 
         for(l = 0; l < 20; l+=2)
         {
-          __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+          simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[l]), simde_mm_load_pd(&right[l]));
 
-          _mm_store_pd(&sum[l], sumv);		 
+          simde_mm_store_pd(&sum[l], sumv);		 
         }
       }
       break;
@@ -625,9 +621,9 @@
 #ifdef __SIM_SSE3
 	  for(l = 0; l < 20; l+=2)
 	    {
-	      __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+	      simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[l]), simde_mm_load_pd(&right[l]));
 	      
-	      _mm_store_pd(&sum[l], sumv);		 
+	      simde_mm_store_pd(&sum[l], sumv);		 
 	    }
 #else
 	  for(l = 0; l < 20; l++)
@@ -644,9 +640,9 @@
 #ifdef __SIM_SSE3
 	  for(l = 0; l < 20; l+=2)
 	    {
-	      __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+	      simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[l]), simde_mm_load_pd(&right[l]));
 	      
-	      _mm_store_pd(&sum[l], sumv);		 
+	      simde_mm_store_pd(&sum[l], sumv);		 
 	    }
 #else
 	  for(l = 0; l < 20; l++)
@@ -663,9 +659,9 @@
 #ifdef __SIM_SSE3
 	  for(l = 0; l < 20; l+=2)
 	    {
-	      __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+	      simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[l]), simde_mm_load_pd(&right[l]));
 	      
-	      _mm_store_pd(&sum[l], sumv);		 
+	      simde_mm_store_pd(&sum[l], sumv);		 
 	    }
 #else
 	  for(l = 0; l < 20; l++)
@@ -908,33 +904,33 @@
 	wr1 = r * wgt[i],
 	wr2 = r * r * wgt[i];
 
-      __m128d a0 = _mm_setzero_pd();
-      __m128d a1 = _mm_setzero_pd();
-      __m128d a2 = _mm_setzero_pd();
+      simde__m128d a0 = simde_mm_setzero_pd();
+      simde__m128d a1 = simde_mm_setzero_pd();
+      simde__m128d a2 = simde_mm_setzero_pd();
 
       d1 = &d_start[20 * cptr[i]];
       sum = &sumtable[20 * i];
           
       for(l = 0; l < 20; l+=2)
 	{	  
-	  __m128d tmpv = _mm_mul_pd(_mm_load_pd(&d1[l]), _mm_load_pd(&sum[l]));
+	  simde__m128d tmpv = simde_mm_mul_pd(simde_mm_load_pd(&d1[l]), simde_mm_load_pd(&sum[l]));
 	  
-	  a0 = _mm_add_pd(a0, tmpv);
-	  __m128d sv = _mm_load_pd(&s[l]);	  
+	  a0 = simde_mm_add_pd(a0, tmpv);
+	  simde__m128d sv = simde_mm_load_pd(&s[l]);	  
 	  
-	  a1 = _mm_add_pd(a1, _mm_mul_pd(tmpv, sv));
-	  __m128d ev = _mm_load_pd(&e[l]);	  
+	  a1 = simde_mm_add_pd(a1, simde_mm_mul_pd(tmpv, sv));
+	  simde__m128d ev = simde_mm_load_pd(&e[l]);	  
 
-	  a2 = _mm_add_pd(a2, _mm_mul_pd(tmpv, ev));
+	  a2 = simde_mm_add_pd(a2, simde_mm_mul_pd(tmpv, ev));
 	}
 
-      a0 = _mm_hadd_pd(a0, a0);
-      a1 = _mm_hadd_pd(a1, a1);
-      a2 = _mm_hadd_pd(a2, a2);
+      a0 = simde_mm_hadd_pd(a0, a0);
+      a1 = simde_mm_hadd_pd(a1, a1);
+      a2 = simde_mm_hadd_pd(a2, a2);
 
-      _mm_storel_pd(&inv_Li, a0);     
-      _mm_storel_pd(&dlnLidlz, a1);                 
-      _mm_storel_pd(&d2lnLidlz2, a2);
+      simde_mm_storel_pd(&inv_Li, a0);     
+      simde_mm_storel_pd(&dlnLidlz, a1);                 
+      simde_mm_storel_pd(&d2lnLidlz2, a2);
       
       inv_Li = 1.0/FABS(inv_Li);
 
@@ -1937,7 +1933,7 @@
 	      sum[j * 2 + k] = x1[k] * x2[k];
 #else
 	  for(j = 0; j < 4; j++)
-	    _mm_store_pd( &sum[j*2], _mm_mul_pd( _mm_load_pd( &x1[0] ), _mm_load_pd( &x2[0] )));	 
+	    simde_mm_store_pd( &sum[j*2], simde_mm_mul_pd( simde_mm_load_pd( &x1[0] ), simde_mm_load_pd( &x2[0] )));	 
 #endif
 	}
       break;
@@ -1954,7 +1950,7 @@
 	      sum[j * 2 + k] = x1[k] * x2[j * 2 + k];
 #else
 	  for(j = 0; j < 4; j++)
-	    _mm_store_pd( &sum[j*2], _mm_mul_pd( _mm_load_pd( &x1[0] ), _mm_load_pd( &x2[j * 2] )));
+	    simde_mm_store_pd( &sum[j*2], simde_mm_mul_pd( simde_mm_load_pd( &x1[0] ), simde_mm_load_pd( &x2[j * 2] )));
 #endif
 	}
       break;
@@ -1970,7 +1966,7 @@
 	      sum[j * 2 + k] = x1[j * 2 + k] * x2[j * 2 + k];
 #else
 	  for(j = 0; j < 4; j++)
-	    _mm_store_pd( &sum[j*2], _mm_mul_pd( _mm_load_pd( &x1[j * 2] ), _mm_load_pd( &x2[j * 2] )));
+	    simde_mm_store_pd( &sum[j*2], simde_mm_mul_pd( simde_mm_load_pd( &x1[j * 2] ), simde_mm_load_pd( &x2[j * 2] )));
 #endif
 	}
       break;
@@ -2009,7 +2005,7 @@
 #else
 	  for(j = 0; j < 4; j++)	    
 	    for(k = 0; k < 4; k+=2)
-	      _mm_store_pd( &sum[j*4 + k], _mm_mul_pd( _mm_load_pd( &x1[k] ), _mm_load_pd( &x2[k] )));
+	      simde_mm_store_pd( &sum[j*4 + k], simde_mm_mul_pd( simde_mm_load_pd( &x1[k] ), simde_mm_load_pd( &x2[k] )));
 #endif
 	}
       break;
@@ -2034,7 +2030,7 @@
 #else
 	  for(j = 0; j < 4; j++)	    
 	    for(k = 0; k < 4; k+=2)
-	      _mm_store_pd( &sum[j*4 + k], _mm_mul_pd( _mm_load_pd( &x1[k] ), _mm_load_pd( &x2[j * 4 + k] )));
+	      simde_mm_store_pd( &sum[j*4 + k], simde_mm_mul_pd( simde_mm_load_pd( &x1[k] ), simde_mm_load_pd( &x2[j * 4 + k] )));
 #endif
 	}
       break;
@@ -2066,7 +2062,7 @@
 #else
 	   for(j = 0; j < 4; j++)	    
 	    for(k = 0; k < 4; k+=2)
-	      _mm_store_pd( &sum[j*4 + k], _mm_mul_pd( _mm_load_pd( &x1[j * 4 + k] ), _mm_load_pd( &x2[j * 4 + k] )));
+	      simde_mm_store_pd( &sum[j*4 + k], simde_mm_mul_pd( simde_mm_load_pd( &x1[j * 4 + k] ), simde_mm_load_pd( &x2[j * 4 + k] )));
 #endif
 	}
       break;
@@ -2103,7 +2099,7 @@
 #else
 	  for(j = 0; j < 4; j++)	    
 	    for(k = 0; k < 4; k+=2)
-	      _mm_store_pd( &sum[j*4 + k], _mm_mul_pd( _mm_load_pd( &x1[k] ), _mm_load_pd( &x2[k] )));
+	      simde_mm_store_pd( &sum[j*4 + k], simde_mm_mul_pd( simde_mm_load_pd( &x1[k] ), simde_mm_load_pd( &x2[k] )));
 #endif
 	}
       break;
@@ -2120,7 +2116,7 @@
 #else
 	  for(j = 0; j < 4; j++)	    
 	    for(k = 0; k < 4; k+=2)
-	      _mm_store_pd( &sum[j*4 + k], _mm_mul_pd( _mm_load_pd( &x1[k] ), _mm_load_pd( &x2[j * 4 + k] )));
+	      simde_mm_store_pd( &sum[j*4 + k], simde_mm_mul_pd( simde_mm_load_pd( &x1[k] ), simde_mm_load_pd( &x2[j * 4 + k] )));
 #endif
 	}
       break;
@@ -2137,7 +2133,7 @@
 #else
 	   for(j = 0; j < 4; j++)	    
 	    for(k = 0; k < 4; k+=2)
-	      _mm_store_pd( &sum[j*4 + k], _mm_mul_pd( _mm_load_pd( &x1[j * 4 + k] ), _mm_load_pd( &x2[j * 4 + k] )));
+	      simde_mm_store_pd( &sum[j*4 + k], simde_mm_mul_pd( simde_mm_load_pd( &x1[j * 4 + k] ), simde_mm_load_pd( &x2[j * 4 + k] )));
 #endif
 	}
       break;
@@ -2243,9 +2239,9 @@
 
   for (i = 0; i < upper; i++)
     { 
-      __m128d a0 = _mm_setzero_pd();
-      __m128d a1 = _mm_setzero_pd();
-      __m128d a2 = _mm_setzero_pd();
+      simde__m128d a0 = simde_mm_setzero_pd();
+      simde__m128d a1 = simde_mm_setzero_pd();
+      simde__m128d a2 = simde_mm_setzero_pd();
 
       sum = &sumtable[i * 8];         
 
@@ -2256,20 +2252,20 @@
 	    *d1 = &diagptable1[j * 2],
 	    *d2 = &diagptable2[j * 2];
   	 	 	 
-	  __m128d tmpv = _mm_mul_pd(_mm_load_pd(d0), _mm_load_pd(&sum[j * 2]));
-	  a0 = _mm_add_pd(a0, tmpv);
-	  a1 = _mm_add_pd(a1, _mm_mul_pd(tmpv, _mm_load_pd(d1)));
-	  a2 = _mm_add_pd(a2, _mm_mul_pd(tmpv, _mm_load_pd(d2)));
+	  simde__m128d tmpv = simde_mm_mul_pd(simde_mm_load_pd(d0), simde_mm_load_pd(&sum[j * 2]));
+	  a0 = simde_mm_add_pd(a0, tmpv);
+	  a1 = simde_mm_add_pd(a1, simde_mm_mul_pd(tmpv, simde_mm_load_pd(d1)));
+	  a2 = simde_mm_add_pd(a2, simde_mm_mul_pd(tmpv, simde_mm_load_pd(d2)));
 	    	 	  
 	}
 
-      a0 = _mm_hadd_pd(a0, a0);
-      a1 = _mm_hadd_pd(a1, a1);
-      a2 = _mm_hadd_pd(a2, a2);
-
-      _mm_storel_pd(&inv_Li, a0);     
-      _mm_storel_pd(&dlnLidlz, a1);
-      _mm_storel_pd(&d2lnLidlz2, a2); 
+      a0 = simde_mm_hadd_pd(a0, a0);
+      a1 = simde_mm_hadd_pd(a1, a1);
+      a2 = simde_mm_hadd_pd(a2, a2);
+
+      simde_mm_storel_pd(&inv_Li, a0);     
+      simde_mm_storel_pd(&dlnLidlz, a1);
+      simde_mm_storel_pd(&d2lnLidlz2, a2); 
 
       inv_Li = 1.0 / FABS(inv_Li);
      
@@ -2401,9 +2397,9 @@
 
   for (i = 0; i < upper; i++)
     { 
-      __m128d a0 = _mm_setzero_pd();
-      __m128d a1 = _mm_setzero_pd();
-      __m128d a2 = _mm_setzero_pd();
+      simde__m128d a0 = simde_mm_setzero_pd();
+      simde__m128d a1 = simde_mm_setzero_pd();
+      simde__m128d a2 = simde_mm_setzero_pd();
 
       sum = &sumtable[i * 16];         
 
@@ -2416,20 +2412,20 @@
   	 	 
 	  for(l = 0; l < 4; l+=2)
 	    {
-	      __m128d tmpv = _mm_mul_pd(_mm_load_pd(&d0[l]), _mm_load_pd(&sum[j * 4 + l]));
-	      a0 = _mm_add_pd(a0, tmpv);
-	      a1 = _mm_add_pd(a1, _mm_mul_pd(tmpv, _mm_load_pd(&d1[l])));
-	      a2 = _mm_add_pd(a2, _mm_mul_pd(tmpv, _mm_load_pd(&d2[l])));
+	      simde__m128d tmpv = simde_mm_mul_pd(simde_mm_load_pd(&d0[l]), simde_mm_load_pd(&sum[j * 4 + l]));
+	      a0 = simde_mm_add_pd(a0, tmpv);
+	      a1 = simde_mm_add_pd(a1, simde_mm_mul_pd(tmpv, simde_mm_load_pd(&d1[l])));
+	      a2 = simde_mm_add_pd(a2, simde_mm_mul_pd(tmpv, simde_mm_load_pd(&d2[l])));
 	    }	 	  
 	}
 
-      a0 = _mm_hadd_pd(a0, a0);
-      a1 = _mm_hadd_pd(a1, a1);
-      a2 = _mm_hadd_pd(a2, a2);
-
-      _mm_storel_pd(&inv_Li, a0);     
-      _mm_storel_pd(&dlnLidlz, a1);
-      _mm_storel_pd(&d2lnLidlz2, a2);       
+      a0 = simde_mm_hadd_pd(a0, a0);
+      a1 = simde_mm_hadd_pd(a1, a1);
+      a2 = simde_mm_hadd_pd(a2, a2);
+
+      simde_mm_storel_pd(&inv_Li, a0);     
+      simde_mm_storel_pd(&dlnLidlz, a1);
+      simde_mm_storel_pd(&d2lnLidlz2, a2);       
 
       inv_Li = 1.0 / FABS(inv_Li);
      
@@ -2475,9 +2471,9 @@
 #ifdef __SIM_SSE3
 	      for(k = 0; k < 20; k+=2)
 		{
-		  __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+		  simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[k]), simde_mm_load_pd(&right[k]));
 		  
-		  _mm_store_pd(&sum[k], sumv);		 
+		  simde_mm_store_pd(&sum[k], sumv);		 
 		}
 #else
 	      for(k = 0; k < 20; k++)
@@ -2498,9 +2494,9 @@
 #ifdef __SIM_SSE3
 	      for(k = 0; k < 20; k+=2)
 		{
-		  __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+		  simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[k]), simde_mm_load_pd(&right[k]));
 		  
-		  _mm_store_pd(&sum[k], sumv);		 
+		  simde_mm_store_pd(&sum[k], sumv);		 
 		}
 #else
 	      for(k = 0; k < 20; k++)
@@ -2521,9 +2517,9 @@
 #ifdef __SIM_SSE3
 	      for(k = 0; k < 20; k+=2)
 		{
-		  __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+		  simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[k]), simde_mm_load_pd(&right[k]));
 		  
-		  _mm_store_pd(&sum[k], sumv);		 
+		  simde_mm_store_pd(&sum[k], sumv);		 
 		}
 #else
 	      for(k = 0; k < 20; k++)
@@ -2557,9 +2553,9 @@
 #ifdef __SIM_SSE3
 	      for(k = 0; k < 20; k+=2)
 		{
-		  __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+		  simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[k]), simde_mm_load_pd(&right[k]));
 		  
-		  _mm_store_pd(&sum[k], sumv);		 
+		  simde_mm_store_pd(&sum[k], sumv);		 
 		}
 #else
 	      for(k = 0; k < 20; k++)
@@ -2581,9 +2577,9 @@
 #ifdef __SIM_SSE3
 	      for(k = 0; k < 20; k+=2)
 		{
-		  __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+		  simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[k]), simde_mm_load_pd(&right[k]));
 		  
-		  _mm_store_pd(&sum[k], sumv);		 
+		  simde_mm_store_pd(&sum[k], sumv);		 
 		}
 #else
 	      for(k = 0; k < 20; k++)
@@ -2604,9 +2600,9 @@
 #ifdef __SIM_SSE3
 	      for(k = 0; k < 20; k+=2)
 		{
-		  __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+		  simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[k]), simde_mm_load_pd(&right[k]));
 		  
-		  _mm_store_pd(&sum[k], sumv);		 
+		  simde_mm_store_pd(&sum[k], sumv);		 
 		}
 #else
 	      for(k = 0; k < 20; k++)
@@ -2650,9 +2646,9 @@
 
           for(k = 0; k < 20; k+=2)
           {
-            __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+            simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[k]), simde_mm_load_pd(&right[k]));
 
-            _mm_store_pd(&sum[k], sumv);		 
+            simde_mm_store_pd(&sum[k], sumv);		 
           }
 
         }
@@ -2678,9 +2674,9 @@
 
           for(k = 0; k < 20; k+=2)
           {
-            __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+            simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[k]), simde_mm_load_pd(&right[k]));
 
-            _mm_store_pd(&sum[k], sumv);		 
+            simde_mm_store_pd(&sum[k], sumv);		 
           }
         }
       }
@@ -2712,9 +2708,9 @@
 
           for(k = 0; k < 20; k+=2)
           {
-            __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+            simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[k]), simde_mm_load_pd(&right[k]));
 
-            _mm_store_pd(&sum[k], sumv);		 
+            simde_mm_store_pd(&sum[k], sumv);		 
           }
         }
       }
@@ -3133,9 +3129,9 @@
 
   for (i = 0; i < upper; i++)
     { 
-      __m128d a0 = _mm_setzero_pd();
-      __m128d a1 = _mm_setzero_pd();
-      __m128d a2 = _mm_setzero_pd();
+      simde__m128d a0 = simde_mm_setzero_pd();
+      simde__m128d a1 = simde_mm_setzero_pd();
+      simde__m128d a2 = simde_mm_setzero_pd();
 
       sum = &sumtable[i * 80];         
 
@@ -3148,20 +3144,20 @@
   	 	 
 	  for(l = 0; l < 20; l+=2)
 	    {
-	      __m128d tmpv = _mm_mul_pd(_mm_load_pd(&d0[l]), _mm_load_pd(&sum[j * 20 +l]));
-	      a0 = _mm_add_pd(a0, tmpv);
-	      a1 = _mm_add_pd(a1, _mm_mul_pd(tmpv, _mm_load_pd(&d1[l])));
-	      a2 = _mm_add_pd(a2, _mm_mul_pd(tmpv, _mm_load_pd(&d2[l])));
+	      simde__m128d tmpv = simde_mm_mul_pd(simde_mm_load_pd(&d0[l]), simde_mm_load_pd(&sum[j * 20 +l]));
+	      a0 = simde_mm_add_pd(a0, tmpv);
+	      a1 = simde_mm_add_pd(a1, simde_mm_mul_pd(tmpv, simde_mm_load_pd(&d1[l])));
+	      a2 = simde_mm_add_pd(a2, simde_mm_mul_pd(tmpv, simde_mm_load_pd(&d2[l])));
 	    }	 	  
 	}
 
-      a0 = _mm_hadd_pd(a0, a0);
-      a1 = _mm_hadd_pd(a1, a1);
-      a2 = _mm_hadd_pd(a2, a2);
-
-      _mm_storel_pd(&inv_Li, a0);
-      _mm_storel_pd(&dlnLidlz, a1);
-      _mm_storel_pd(&d2lnLidlz2, a2);
+      a0 = simde_mm_hadd_pd(a0, a0);
+      a1 = simde_mm_hadd_pd(a1, a1);
+      a2 = simde_mm_hadd_pd(a2, a2);
+
+      simde_mm_storel_pd(&inv_Li, a0);
+      simde_mm_storel_pd(&dlnLidlz, a1);
+      simde_mm_storel_pd(&d2lnLidlz2, a2);
 
       inv_Li = 1.0 / FABS(inv_Li);
 
@@ -3224,28 +3220,28 @@
 	    *d1 = &diagptable1[j * 20],
 	    *d2 = &diagptable2[j * 20];
 	  
-	  __m128d 
-	    a0 = _mm_setzero_pd(),
-	    a1 = _mm_setzero_pd(),
-	    a2 = _mm_setzero_pd();
+	  simde__m128d 
+	    a0 = simde_mm_setzero_pd(),
+	    a1 = simde_mm_setzero_pd(),
+	    a2 = simde_mm_setzero_pd();
   	 	 
 	  for(l = 0; l < 20; l+=2)
 	    {
-	      __m128d 
-		tmpv = _mm_mul_pd(_mm_load_pd(&d0[l]), _mm_load_pd(&sum[j * 20 +l]));
+	      simde__m128d 
+		tmpv = simde_mm_mul_pd(simde_mm_load_pd(&d0[l]), simde_mm_load_pd(&sum[j * 20 +l]));
 	      
-	      a0 = _mm_add_pd(a0, tmpv);
-	      a1 = _mm_add_pd(a1, _mm_mul_pd(tmpv, _mm_load_pd(&d1[l])));
-	      a2 = _mm_add_pd(a2, _mm_mul_pd(tmpv, _mm_load_pd(&d2[l])));
+	      a0 = simde_mm_add_pd(a0, tmpv);
+	      a1 = simde_mm_add_pd(a1, simde_mm_mul_pd(tmpv, simde_mm_load_pd(&d1[l])));
+	      a2 = simde_mm_add_pd(a2, simde_mm_mul_pd(tmpv, simde_mm_load_pd(&d2[l])));
 	    }
 	  
-	  a0 = _mm_hadd_pd(a0, a0);
-	  a1 = _mm_hadd_pd(a1, a1);
-	  a2 = _mm_hadd_pd(a2, a2);
-
-	  _mm_storel_pd(&l0, a0);
-	  _mm_storel_pd(&l1, a1);
-	  _mm_storel_pd(&l2, a2);
+	  a0 = simde_mm_hadd_pd(a0, a0);
+	  a1 = simde_mm_hadd_pd(a1, a1);
+	  a2 = simde_mm_hadd_pd(a2, a2);
+
+	  simde_mm_storel_pd(&l0, a0);
+	  simde_mm_storel_pd(&l1, a1);
+	  simde_mm_storel_pd(&l2, a2);
 	  
 	  inv_Li     += weights[j] * l0;
 	  dlnLidlz   += weights[j] * l1;
--- raxml.orig/newviewGenericSpecial.c
+++ raxml/newviewGenericSpecial.c
@@ -41,24 +41,16 @@
 #include <limits.h>
 #include "axml.h"
 
-#ifdef __SIM_SSE3
 
 #include <stdint.h>
-#include <xmmintrin.h>
-#include <pmmintrin.h>
+#include "debian/include/simde/x86/sse3.h"
 
 const union __attribute__ ((aligned (BYTE_ALIGNMENT)))
 {
        uint64_t i[2];
-       __m128d m;
+       simde__m128d m;
 } absMask = {{0x7fffffffffffffffULL , 0x7fffffffffffffffULL }};
 
-
-
-
-#endif
-
-
 #ifdef _USE_PTHREADS
 #include <pthread.h>
 extern volatile int NumberOfThreads;
@@ -911,7 +903,7 @@
 
 	for(i = 0; i < numberOfCategories; i++)
 	  {	   
-	    __m128d 
+	    simde__m128d 
 	      d1_0, d1_1,
 	      d2_0, d2_1;
  
@@ -924,11 +916,11 @@
 		d2[j+1] = EXP(rptr[i] * ez2[j]);
 	      }
 
-	    d1_0 = _mm_load_pd(&d1[0]);
-	    d1_1 = _mm_load_pd(&d1[2]);
+	    d1_0 = simde_mm_load_pd(&d1[0]);
+	    d1_1 = simde_mm_load_pd(&d1[2]);
 
-	    d2_0 = _mm_load_pd(&d2[0]);
-	    d2_1 = _mm_load_pd(&d2[2]);
+	    d2_0 = simde_mm_load_pd(&d2[0]);
+	    d2_1 = simde_mm_load_pd(&d2[2]);
 	    
 
 	    for(j = 0; j < 4; j++)
@@ -936,15 +928,15 @@
 		double *ll = &left[i * 16 + j * 4];
 		double *rr = &right[i * 16 + j * 4];	       
 
-		__m128d eev = _mm_load_pd(&EI_16[4 * j]);
+		simde__m128d eev = simde_mm_load_pd(&EI_16[4 * j]);
 		
-		_mm_store_pd(&ll[0], _mm_mul_pd(d1_0, eev));
-		_mm_store_pd(&rr[0], _mm_mul_pd(d2_0, eev));
+		simde_mm_store_pd(&ll[0], simde_mm_mul_pd(d1_0, eev));
+		simde_mm_store_pd(&rr[0], simde_mm_mul_pd(d2_0, eev));
 		
-		eev = _mm_load_pd(&EI_16[4 * j + 2]);
+		eev = simde_mm_load_pd(&EI_16[4 * j + 2]);
 		
-		_mm_store_pd(&ll[2], _mm_mul_pd(d1_1, eev));
-		_mm_store_pd(&rr[2], _mm_mul_pd(d2_1, eev));
+		simde_mm_store_pd(&ll[2], simde_mm_mul_pd(d1_1, eev));
+		simde_mm_store_pd(&rr[2], simde_mm_mul_pd(d2_1, eev));
 
 		
 	      }
@@ -955,7 +947,7 @@
 	    i = maxCat;
 	    	    
 	    {	   
-	      __m128d 
+	      simde__m128d 
 		d1_0, d1_1,
 		d2_0, d2_1;
 	      
@@ -968,26 +960,26 @@
 		  d2[j+1] = EXP(ez2[j]);
 		}	     
 	      
-	      d1_0 = _mm_load_pd(&d1[0]);
-	      d1_1 = _mm_load_pd(&d1[2]);
+	      d1_0 = simde_mm_load_pd(&d1[0]);
+	      d1_1 = simde_mm_load_pd(&d1[2]);
 	      
-	      d2_0 = _mm_load_pd(&d2[0]);
-	      d2_1 = _mm_load_pd(&d2[2]);
+	      d2_0 = simde_mm_load_pd(&d2[0]);
+	      d2_1 = simde_mm_load_pd(&d2[2]);
 	      	      
 	      for(j = 0; j < 4; j++)
 		{	       
 		  double *ll = &left[i * 16 + j * 4];
 		  double *rr = &right[i * 16 + j * 4];	       
 		  
-		  __m128d eev = _mm_load_pd(&EI_16[4 * j]);
+		  simde__m128d eev = simde_mm_load_pd(&EI_16[4 * j]);
 		  
-		  _mm_store_pd(&ll[0], _mm_mul_pd(d1_0, eev));
-		  _mm_store_pd(&rr[0], _mm_mul_pd(d2_0, eev));
+		  simde_mm_store_pd(&ll[0], simde_mm_mul_pd(d1_0, eev));
+		  simde_mm_store_pd(&rr[0], simde_mm_mul_pd(d2_0, eev));
 		  
-		  eev = _mm_load_pd(&EI_16[4 * j + 2]);
+		  eev = simde_mm_load_pd(&EI_16[4 * j + 2]);
 		  
-		  _mm_store_pd(&ll[2], _mm_mul_pd(d1_1, eev));
-		  _mm_store_pd(&rr[2], _mm_mul_pd(d2_1, eev));
+		  simde_mm_store_pd(&ll[2], simde_mm_mul_pd(d1_1, eev));
+		  simde_mm_store_pd(&rr[2], simde_mm_mul_pd(d2_1, eev));
 		  
 		  
 		}
@@ -1356,24 +1348,24 @@
 	    le =  &left[cptr[i] * 4];
 	    ri =  &right[cptr[i] * 4];
 
-	    _mm_store_pd(x3, _mm_setzero_pd());	    
+	    simde_mm_store_pd(x3, simde_mm_setzero_pd());	    
 	    	     
 	    for(l = 0; l < 2; l++)
 	      {		 		 						   		  		 		 
-		__m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&le[l * 2]));
-		__m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&ri[l * 2]));
+		simde__m128d al = simde_mm_mul_pd(simde_mm_load_pd(x1), simde_mm_load_pd(&le[l * 2]));
+		simde__m128d ar = simde_mm_mul_pd(simde_mm_load_pd(x2), simde_mm_load_pd(&ri[l * 2]));
 		
-		al = _mm_hadd_pd(al, al);
-		ar = _mm_hadd_pd(ar, ar);
+		al = simde_mm_hadd_pd(al, al);
+		ar = simde_mm_hadd_pd(ar, ar);
 		
-		al = _mm_mul_pd(al, ar);
+		al = simde_mm_mul_pd(al, ar);
 		
-		__m128d vv  = _mm_load_pd(x3);
-		__m128d EVV = _mm_load_pd(&EV[2 * l]);
+		simde__m128d vv  = simde_mm_load_pd(x3);
+		simde__m128d EVV = simde_mm_load_pd(&EV[2 * l]);
 		
-		vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+		vv = simde_mm_add_pd(vv, simde_mm_mul_pd(al, EVV));
 		
-		_mm_store_pd(x3, vv);		     	  		   		  
+		simde_mm_store_pd(x3, vv);		     	  		   		  
 	      }	    	   
 	  }
       }
@@ -1389,41 +1381,41 @@
 	    le =  &left[cptr[i] * 4];
 	    ri =  &right[cptr[i] * 4];
 
-	    _mm_store_pd(x3, _mm_setzero_pd());	    
+	    simde_mm_store_pd(x3, simde_mm_setzero_pd());	    
 	    	     
 	    for(l = 0; l < 2; l++)
 	      {		 		 						   		  		 		 
-		__m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&le[l * 2]));
-		__m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&ri[l * 2]));
+		simde__m128d al = simde_mm_mul_pd(simde_mm_load_pd(x1), simde_mm_load_pd(&le[l * 2]));
+		simde__m128d ar = simde_mm_mul_pd(simde_mm_load_pd(x2), simde_mm_load_pd(&ri[l * 2]));
 		
-		al = _mm_hadd_pd(al, al);
-		ar = _mm_hadd_pd(ar, ar);
+		al = simde_mm_hadd_pd(al, al);
+		ar = simde_mm_hadd_pd(ar, ar);
 		
-		al = _mm_mul_pd(al, ar);
+		al = simde_mm_mul_pd(al, ar);
 		
-		__m128d vv  = _mm_load_pd(x3);
-		__m128d EVV = _mm_load_pd(&EV[2 * l]);
+		simde__m128d vv  = simde_mm_load_pd(x3);
+		simde__m128d EVV = simde_mm_load_pd(&EV[2 * l]);
 		
-		vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+		vv = simde_mm_add_pd(vv, simde_mm_mul_pd(al, EVV));
 		
-		_mm_store_pd(x3, vv);		     	  		   		  
+		simde_mm_store_pd(x3, vv);		     	  		   		  
 	      }	 
 	    
-	    __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
+	    simde__m128d minlikelihood_sse = simde_mm_set1_pd( minlikelihood );
 	 
 	    scale = 1;
 	    
-	    __m128d v1 = _mm_and_pd(_mm_load_pd(x3), absMask.m);
-	    v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	    if(_mm_movemask_pd( v1 ) != 3)
+	    simde__m128d v1 = simde_mm_and_pd(simde_mm_load_pd(x3), absMask.m);
+	    v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+	    if(simde_mm_movemask_pd( v1 ) != 3)
 	      scale = 0;	  	         
 	    
 	    if(scale)
 	      {
-		__m128d twoto = _mm_set_pd(twotothe256, twotothe256);
+		simde__m128d twoto = simde_mm_set_pd(twotothe256, twotothe256);
 		
-		__m128d ex3v = _mm_load_pd(x3);		  
-		_mm_store_pd(x3, _mm_mul_pd(ex3v,twoto));		    		   		  
+		simde__m128d ex3v = simde_mm_load_pd(x3);		  
+		simde_mm_store_pd(x3, simde_mm_mul_pd(ex3v,twoto));		    		   		  
 		
 		if(useFastScaling)
 		  addScale += wgt[i];
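
The scaling test in this hunk (and throughout the rest of the file) masks off
the sign bits with absMask, compares both lanes against minlikelihood, and
multiplies by 2^256 only when both entries have underflowed;
simde_mm_movemask_pd() returning 3 means both comparison lanes are true. A
standalone sketch, assuming SIMDE on the include path; rescale2 is an
illustrative name:

    #include <simde/x86/sse3.h>
    #include <stdint.h>

    static const union { uint64_t i[2]; simde__m128d m; } abs_mask =
      {{ 0x7fffffffffffffffULL, 0x7fffffffffffffffULL }};

    /* Rescales the 16-byte-aligned pair x[0..1] by twotothe256 iff both
       entries satisfy |x| < minlikelihood; returns 1 when it rescaled. */
    static int rescale2(double *x, double minlikelihood, double twotothe256)
    {
      simde__m128d v = simde_mm_and_pd(simde_mm_load_pd(x), abs_mask.m);  /* |x| */
      v = simde_mm_cmplt_pd(v, simde_mm_set1_pd(minlikelihood));

      if(simde_mm_movemask_pd(v) != 3)   /* at least one entry is still large enough */
        return 0;

      simde_mm_store_pd(x, simde_mm_mul_pd(simde_mm_load_pd(x),
                                           simde_mm_set1_pd(twotothe256)));
      return 1;
    }
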
@@ -1443,41 +1435,41 @@
 	  le = &left[cptr[i] * 4];
 	  ri = &right[cptr[i] * 4];
 
-	  _mm_store_pd(x3, _mm_setzero_pd());	    
+	  simde_mm_store_pd(x3, simde_mm_setzero_pd());	    
 	  
 	  for(l = 0; l < 2; l++)
 	    {		 		 						   		  		 		 
-	      __m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&le[l * 2]));
-	      __m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&ri[l * 2]));
+	      simde__m128d al = simde_mm_mul_pd(simde_mm_load_pd(x1), simde_mm_load_pd(&le[l * 2]));
+	      simde__m128d ar = simde_mm_mul_pd(simde_mm_load_pd(x2), simde_mm_load_pd(&ri[l * 2]));
 	      
-	      al = _mm_hadd_pd(al, al);
-	      ar = _mm_hadd_pd(ar, ar);
+	      al = simde_mm_hadd_pd(al, al);
+	      ar = simde_mm_hadd_pd(ar, ar);
 	      
-	      al = _mm_mul_pd(al, ar);
+	      al = simde_mm_mul_pd(al, ar);
 	      
-	      __m128d vv  = _mm_load_pd(x3);
-	      __m128d EVV = _mm_load_pd(&EV[2 * l]);
+	      simde__m128d vv  = simde_mm_load_pd(x3);
+	      simde__m128d EVV = simde_mm_load_pd(&EV[2 * l]);
 	      
-	      vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+	      vv = simde_mm_add_pd(vv, simde_mm_mul_pd(al, EVV));
 	      
-	      _mm_store_pd(x3, vv);		     	  		   		  
+	      simde_mm_store_pd(x3, vv);		     	  		   		  
 	    }	 	 	 	  
 
-	  __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
+	  simde__m128d minlikelihood_sse = simde_mm_set1_pd( minlikelihood );
 	 
 	  scale = 1;
 	  	  
-	  __m128d v1 = _mm_and_pd(_mm_load_pd(x3), absMask.m);
-	  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	  if(_mm_movemask_pd( v1 ) != 3)
+	  simde__m128d v1 = simde_mm_and_pd(simde_mm_load_pd(x3), absMask.m);
+	  v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+	  if(simde_mm_movemask_pd( v1 ) != 3)
 	    scale = 0;	  	         
 	 
 	  if(scale)
 	    {
-	      __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
+	      simde__m128d twoto = simde_mm_set_pd(twotothe256, twotothe256);
 	      	    
-	      __m128d ex3v = _mm_load_pd(x3);		  
-	      _mm_store_pd(x3, _mm_mul_pd(ex3v,twoto));		    		   		  
+	      simde__m128d ex3v = simde_mm_load_pd(x3);		  
+	      simde_mm_store_pd(x3, simde_mm_mul_pd(ex3v,twoto));		    		   		  
 	     
 	      if(useFastScaling)
 		addScale += wgt[i];
@@ -1523,24 +1515,24 @@
 	   {	     	     	    
 	     x3 = &(x3_start[8 * i + 2 * k]);	     
 	    	         
-	     _mm_store_pd(x3, _mm_setzero_pd());	    
+	     simde_mm_store_pd(x3, simde_mm_setzero_pd());	    
 	    	     
 	     for(l = 0; l < 2; l++)
 	       {		 		 						   		  		 		 
-		 __m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&left[k * 4 + l * 2]));
-		 __m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&right[k * 4 + l * 2]));
+		 simde__m128d al = simde_mm_mul_pd(simde_mm_load_pd(x1), simde_mm_load_pd(&left[k * 4 + l * 2]));
+		 simde__m128d ar = simde_mm_mul_pd(simde_mm_load_pd(x2), simde_mm_load_pd(&right[k * 4 + l * 2]));
 		 		       
-		 al = _mm_hadd_pd(al, al);
-		 ar = _mm_hadd_pd(ar, ar);
+		 al = simde_mm_hadd_pd(al, al);
+		 ar = simde_mm_hadd_pd(ar, ar);
 		   
-		 al = _mm_mul_pd(al, ar);
+		 al = simde_mm_mul_pd(al, ar);
 		   
-		 __m128d vv  = _mm_load_pd(x3);
-		 __m128d EVV = _mm_load_pd(&EV[2 * l]);
+		 simde__m128d vv  = simde_mm_load_pd(x3);
+		 simde__m128d EVV = simde_mm_load_pd(&EV[2 * l]);
 		 
-		 vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+		 vv = simde_mm_add_pd(vv, simde_mm_mul_pd(al, EVV));
 		 
-		 _mm_store_pd(x3, vv);		     	  		   		  
+		 simde_mm_store_pd(x3, vv);		     	  		   		  
 	       }	     	    
 	   }
        }
@@ -1555,48 +1547,48 @@
 	     x2 = &(x2_start[8 * i + 2 * k]);
 	     x3 = &(x3_start[8 * i + 2 * k]);	     
 	    	         
-	     _mm_store_pd(x3, _mm_setzero_pd());	    
+	     simde_mm_store_pd(x3, simde_mm_setzero_pd());	    
 	    	     
 	     for(l = 0; l < 2; l++)
 	       {		 		 						   		  		 		 
-		 __m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&left[k * 4 + l * 2]));
-		 __m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&right[k * 4 + l * 2]));
+		 simde__m128d al = simde_mm_mul_pd(simde_mm_load_pd(x1), simde_mm_load_pd(&left[k * 4 + l * 2]));
+		 simde__m128d ar = simde_mm_mul_pd(simde_mm_load_pd(x2), simde_mm_load_pd(&right[k * 4 + l * 2]));
 		 		       
-		 al = _mm_hadd_pd(al, al);
-		 ar = _mm_hadd_pd(ar, ar);
+		 al = simde_mm_hadd_pd(al, al);
+		 ar = simde_mm_hadd_pd(ar, ar);
 		   
-		 al = _mm_mul_pd(al, ar);
+		 al = simde_mm_mul_pd(al, ar);
 		   
-		 __m128d vv  = _mm_load_pd(x3);
-		 __m128d EVV = _mm_load_pd(&EV[2 * l]);
+		 simde__m128d vv  = simde_mm_load_pd(x3);
+		 simde__m128d EVV = simde_mm_load_pd(&EV[2 * l]);
 		 
-		 vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+		 vv = simde_mm_add_pd(vv, simde_mm_mul_pd(al, EVV));
 		 
-		 _mm_store_pd(x3, vv);		     	  		   		  
+		 simde_mm_store_pd(x3, vv);		     	  		   		  
 	       }	     	    
 	   }
 	
 	 x3 = &(x3_start[8 * i]);
-	 __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
+	 simde__m128d minlikelihood_sse = simde_mm_set1_pd( minlikelihood );
 	 
 	 scale = 1;
 	 for(l = 0; scale && (l < 8); l += 2)
 	   {
-	     __m128d vv = _mm_load_pd(&x3[l]);
-	     __m128d v1 = _mm_and_pd(vv, absMask.m);
-	     v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	     if(_mm_movemask_pd( v1 ) != 3)
+	     simde__m128d vv = simde_mm_load_pd(&x3[l]);
+	     simde__m128d v1 = simde_mm_and_pd(vv, absMask.m);
+	     v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+	     if(simde_mm_movemask_pd( v1 ) != 3)
 	       scale = 0;
 	   }	    	         
 	 
 	 if(scale)
 	   {
-	     __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
+	     simde__m128d twoto = simde_mm_set_pd(twotothe256, twotothe256);
 	     
 	     for(l = 0; l < 8; l+=2)
 	       {
-		 __m128d ex3v = _mm_load_pd(&x3[l]);		  
-		 _mm_store_pd(&x3[l], _mm_mul_pd(ex3v,twoto));	
+		 simde__m128d ex3v = simde_mm_load_pd(&x3[l]);		  
+		 simde_mm_store_pd(&x3[l], simde_mm_mul_pd(ex3v,twoto));	
 	       }		   		  
 	     
 	     if(useFastScaling)
@@ -1615,48 +1607,48 @@
 	     x2 = &(x2_start[8 * i + 2 * k]);
 	     x3 = &(x3_start[8 * i + 2 * k]);	     
 	    	         
-	     _mm_store_pd(x3, _mm_setzero_pd());	    
+	     simde_mm_store_pd(x3, simde_mm_setzero_pd());	    
 	    	     
 	     for(l = 0; l < 2; l++)
 	       {		 		 						   		  		 		 
-		 __m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&left[k * 4 + l * 2]));
-		 __m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&right[k * 4 + l * 2]));
+		 simde__m128d al = simde_mm_mul_pd(simde_mm_load_pd(x1), simde_mm_load_pd(&left[k * 4 + l * 2]));
+		 simde__m128d ar = simde_mm_mul_pd(simde_mm_load_pd(x2), simde_mm_load_pd(&right[k * 4 + l * 2]));
 		 		       
-		 al = _mm_hadd_pd(al, al);
-		 ar = _mm_hadd_pd(ar, ar);
+		 al = simde_mm_hadd_pd(al, al);
+		 ar = simde_mm_hadd_pd(ar, ar);
 		   
-		 al = _mm_mul_pd(al, ar);
+		 al = simde_mm_mul_pd(al, ar);
 		   
-		 __m128d vv  = _mm_load_pd(x3);
-		 __m128d EVV = _mm_load_pd(&EV[2 * l]);
+		 simde__m128d vv  = simde_mm_load_pd(x3);
+		 simde__m128d EVV = simde_mm_load_pd(&EV[2 * l]);
 		 
-		 vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+		 vv = simde_mm_add_pd(vv, simde_mm_mul_pd(al, EVV));
 		 
-		 _mm_store_pd(x3, vv);		     	  		   		  
+		 simde_mm_store_pd(x3, vv);		     	  		   		  
 	       }	     	    
 	   }
 	
 	 x3 = &(x3_start[8 * i]);
-	 __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
+	 simde__m128d minlikelihood_sse = simde_mm_set1_pd( minlikelihood );
 	 
 	 scale = 1;
 	 for(l = 0; scale && (l < 8); l += 2)
 	   {
-	     __m128d vv = _mm_load_pd(&x3[l]);
-	     __m128d v1 = _mm_and_pd(vv, absMask.m);
-	     v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	     if(_mm_movemask_pd( v1 ) != 3)
+	     simde__m128d vv = simde_mm_load_pd(&x3[l]);
+	     simde__m128d v1 = simde_mm_and_pd(vv, absMask.m);
+	     v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+	     if(simde_mm_movemask_pd( v1 ) != 3)
 	       scale = 0;
 	   }	    	         
 	 
 	 if(scale)
 	   {
-	     __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
+	     simde__m128d twoto = simde_mm_set_pd(twotothe256, twotothe256);
 	     
 	     for(l = 0; l < 8; l+=2)
 	       {
-		 __m128d ex3v = _mm_load_pd(&x3[l]);		  
-		 _mm_store_pd(&x3[l], _mm_mul_pd(ex3v,twoto));	
+		 simde__m128d ex3v = simde_mm_load_pd(&x3[l]);		  
+		 simde_mm_store_pd(&x3[l], simde_mm_mul_pd(ex3v,twoto));	
 	       }		   		  
 	     
 	     if(useFastScaling)
@@ -2034,9 +2026,9 @@
     scaleGap = 0,
     addScale = 0;
 
-  __m128d
-    minlikelihood_sse = _mm_set1_pd( minlikelihood ),
-    sc = _mm_set1_pd(twotothe256),
+  simde__m128d
+    minlikelihood_sse = simde_mm_set1_pd( minlikelihood ),
+    sc = simde_mm_set1_pd(twotothe256),
     EVV[8];  
 
   for(i = 0; i < 4; i++)
@@ -2044,7 +2036,7 @@
       EV_t[4 * j + i] = EV[4 * i + j];
 
   for(i = 0; i < 8; i++)
-    EVV[i] = _mm_load_pd(&EV_t[i * 2]);
+    EVV[i] = simde_mm_load_pd(&EV_t[i * 2]);
 
   {
     x1 = x1_gapColumn;	      
@@ -2054,137 +2046,137 @@
     le =  &left[maxCats * 16];	     	 
     ri =  &right[maxCats * 16];		   	  	  	  	         
 
-    __m128d x1_0 = _mm_load_pd( &x1[0] );
-    __m128d x1_2 = _mm_load_pd( &x1[2] );
+    simde__m128d x1_0 = simde_mm_load_pd( &x1[0] );
+    simde__m128d x1_2 = simde_mm_load_pd( &x1[2] );
 
-    __m128d left_k0_0 = _mm_load_pd( &le[0] );
-    __m128d left_k0_2 = _mm_load_pd( &le[2] );
-    __m128d left_k1_0 = _mm_load_pd( &le[4] );
-    __m128d left_k1_2 = _mm_load_pd( &le[6] );
-    __m128d left_k2_0 = _mm_load_pd( &le[8] );
-    __m128d left_k2_2 = _mm_load_pd( &le[10] );
-    __m128d left_k3_0 = _mm_load_pd( &le[12] );
-    __m128d left_k3_2 = _mm_load_pd( &le[14] );
-
-    left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-    left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-
-    left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-    left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-
-    left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-    left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-    left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-
-    left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-    left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-
-    left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-    left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-
-    left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-    left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-    left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-
-    __m128d x2_0 = _mm_load_pd( &x2[0] );
-    __m128d x2_2 = _mm_load_pd( &x2[2] );
-
-    __m128d right_k0_0 = _mm_load_pd( &ri[0] );
-    __m128d right_k0_2 = _mm_load_pd( &ri[2] );
-    __m128d right_k1_0 = _mm_load_pd( &ri[4] );
-    __m128d right_k1_2 = _mm_load_pd( &ri[6] );
-    __m128d right_k2_0 = _mm_load_pd( &ri[8] );
-    __m128d right_k2_2 = _mm_load_pd( &ri[10] );
-    __m128d right_k3_0 = _mm_load_pd( &ri[12] );
-    __m128d right_k3_2 = _mm_load_pd( &ri[14] );
-
-    right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-    right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-
-    right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-    right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-
-    right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-    right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-    right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-
-    right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-    right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-
-    right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-    right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-
-    right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-    right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-    right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
-
-    __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-    __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
-
-    __m128d EV_t_l0_k0 = EVV[0];
-    __m128d EV_t_l0_k2 = EVV[1];
-    __m128d EV_t_l1_k0 = EVV[2];
-    __m128d EV_t_l1_k2 = EVV[3];
-    __m128d EV_t_l2_k0 = EVV[4];
-    __m128d EV_t_l2_k2 = EVV[5];
-    __m128d EV_t_l3_k0 = EVV[6];
-    __m128d EV_t_l3_k2 = EVV[7];
-
-    EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-    EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-
-    EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-    EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-
-    EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-
-    EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-    EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-
-    EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-    EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-    EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+    simde__m128d left_k0_0 = simde_mm_load_pd( &le[0] );
+    simde__m128d left_k0_2 = simde_mm_load_pd( &le[2] );
+    simde__m128d left_k1_0 = simde_mm_load_pd( &le[4] );
+    simde__m128d left_k1_2 = simde_mm_load_pd( &le[6] );
+    simde__m128d left_k2_0 = simde_mm_load_pd( &le[8] );
+    simde__m128d left_k2_2 = simde_mm_load_pd( &le[10] );
+    simde__m128d left_k3_0 = simde_mm_load_pd( &le[12] );
+    simde__m128d left_k3_2 = simde_mm_load_pd( &le[14] );
+
+    left_k0_0 = simde_mm_mul_pd(x1_0, left_k0_0);
+    left_k0_2 = simde_mm_mul_pd(x1_2, left_k0_2);
+
+    left_k1_0 = simde_mm_mul_pd(x1_0, left_k1_0);
+    left_k1_2 = simde_mm_mul_pd(x1_2, left_k1_2);
+
+    left_k0_0 = simde_mm_hadd_pd( left_k0_0, left_k0_2 );
+    left_k1_0 = simde_mm_hadd_pd( left_k1_0, left_k1_2);
+    left_k0_0 = simde_mm_hadd_pd( left_k0_0, left_k1_0);
+
+    left_k2_0 = simde_mm_mul_pd(x1_0, left_k2_0);
+    left_k2_2 = simde_mm_mul_pd(x1_2, left_k2_2);
+
+    left_k3_0 = simde_mm_mul_pd(x1_0, left_k3_0);
+    left_k3_2 = simde_mm_mul_pd(x1_2, left_k3_2);
+
+    left_k2_0 = simde_mm_hadd_pd( left_k2_0, left_k2_2);
+    left_k3_0 = simde_mm_hadd_pd( left_k3_0, left_k3_2);
+    left_k2_0 = simde_mm_hadd_pd( left_k2_0, left_k3_0);
+
+    simde__m128d x2_0 = simde_mm_load_pd( &x2[0] );
+    simde__m128d x2_2 = simde_mm_load_pd( &x2[2] );
+
+    simde__m128d right_k0_0 = simde_mm_load_pd( &ri[0] );
+    simde__m128d right_k0_2 = simde_mm_load_pd( &ri[2] );
+    simde__m128d right_k1_0 = simde_mm_load_pd( &ri[4] );
+    simde__m128d right_k1_2 = simde_mm_load_pd( &ri[6] );
+    simde__m128d right_k2_0 = simde_mm_load_pd( &ri[8] );
+    simde__m128d right_k2_2 = simde_mm_load_pd( &ri[10] );
+    simde__m128d right_k3_0 = simde_mm_load_pd( &ri[12] );
+    simde__m128d right_k3_2 = simde_mm_load_pd( &ri[14] );
+
+    right_k0_0 = simde_mm_mul_pd( x2_0, right_k0_0);
+    right_k0_2 = simde_mm_mul_pd( x2_2, right_k0_2);
+
+    right_k1_0 = simde_mm_mul_pd( x2_0, right_k1_0);
+    right_k1_2 = simde_mm_mul_pd( x2_2, right_k1_2);
+
+    right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k0_2);
+    right_k1_0 = simde_mm_hadd_pd( right_k1_0, right_k1_2);
+    right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k1_0);
+
+    right_k2_0 = simde_mm_mul_pd( x2_0, right_k2_0);
+    right_k2_2 = simde_mm_mul_pd( x2_2, right_k2_2);
+
+    right_k3_0 = simde_mm_mul_pd( x2_0, right_k3_0);
+    right_k3_2 = simde_mm_mul_pd( x2_2, right_k3_2);
+
+    right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k2_2);
+    right_k3_0 = simde_mm_hadd_pd( right_k3_0, right_k3_2);
+    right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k3_0);	   
+
+    simde__m128d x1px2_k0 = simde_mm_mul_pd( left_k0_0, right_k0_0 );
+    simde__m128d x1px2_k2 = simde_mm_mul_pd( left_k2_0, right_k2_0 );
+
+    simde__m128d EV_t_l0_k0 = EVV[0];
+    simde__m128d EV_t_l0_k2 = EVV[1];
+    simde__m128d EV_t_l1_k0 = EVV[2];
+    simde__m128d EV_t_l1_k2 = EVV[3];
+    simde__m128d EV_t_l2_k0 = EVV[4];
+    simde__m128d EV_t_l2_k2 = EVV[5];
+    simde__m128d EV_t_l3_k0 = EVV[6];
+    simde__m128d EV_t_l3_k2 = EVV[7];
+
+    EV_t_l0_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+    EV_t_l0_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+    EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+    EV_t_l1_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+    EV_t_l1_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+    EV_t_l1_k0 = simde_mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+    EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+    EV_t_l2_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+    EV_t_l2_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+    EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+    EV_t_l3_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+    EV_t_l3_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+    EV_t_l3_k0 = simde_mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
 
-    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	  	 	    		  
+    EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	  	 	    		  
     
     if(tipCase != TIP_TIP)
       {    
 	scale = 1;
 	
-	__m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
+	simde__m128d v1 = simde_mm_and_pd(EV_t_l0_k0, absMask.m);
 	
-	v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+	v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
 	
-	if(_mm_movemask_pd( v1 ) != 3)
+	if(simde_mm_movemask_pd( v1 ) != 3)
 	  scale = 0;
 	else
 	  {
-	    v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
-	    v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	    if(_mm_movemask_pd( v1 ) != 3)
+	    v1 = simde_mm_and_pd(EV_t_l2_k0, absMask.m);
+	    v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+	    if(simde_mm_movemask_pd( v1 ) != 3)
 	      scale = 0;
 	  }
 
       if(scale)
 	{		      
-	  _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
-	  _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));	      	      
+	  simde_mm_store_pd(&x3[0], simde_mm_mul_pd(EV_t_l0_k0, sc));
+	  simde_mm_store_pd(&x3[2], simde_mm_mul_pd(EV_t_l2_k0, sc));	      	      
 	  
 	  scaleGap = TRUE;	   
 	}	
       else
 	{
-	  _mm_store_pd(x3, EV_t_l0_k0);
-	  _mm_store_pd(&x3[2], EV_t_l2_k0);
+	  simde_mm_store_pd(x3, EV_t_l0_k0);
+	  simde_mm_store_pd(&x3[2], EV_t_l2_k0);
 	}
       }
     else
       {
-	_mm_store_pd(x3, EV_t_l0_k0);
-	_mm_store_pd(&x3[2], EV_t_l2_k0);
+	simde_mm_store_pd(x3, EV_t_l0_k0);
+	simde_mm_store_pd(&x3[2], EV_t_l2_k0);
       }
   }
   
@@ -2210,104 +2202,104 @@
           else	 	  
             ri =  &right[cptr[i] * 16];
 
-          __m128d x1_0 = _mm_load_pd( &x1[0] );
-          __m128d x1_2 = _mm_load_pd( &x1[2] );
+          simde__m128d x1_0 = simde_mm_load_pd( &x1[0] );
+          simde__m128d x1_2 = simde_mm_load_pd( &x1[2] );
 
-          __m128d left_k0_0 = _mm_load_pd( &le[0] );
-          __m128d left_k0_2 = _mm_load_pd( &le[2] );
-          __m128d left_k1_0 = _mm_load_pd( &le[4] );
-          __m128d left_k1_2 = _mm_load_pd( &le[6] );
-          __m128d left_k2_0 = _mm_load_pd( &le[8] );
-          __m128d left_k2_2 = _mm_load_pd( &le[10] );
-          __m128d left_k3_0 = _mm_load_pd( &le[12] );
-          __m128d left_k3_2 = _mm_load_pd( &le[14] );
-
-          left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-          left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-
-          left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-          left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-
-          left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-          left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-          left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-
-          left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-          left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-
-          left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-          left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-
-          left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-          left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-          left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-
-          __m128d x2_0 = _mm_load_pd( &x2[0] );
-          __m128d x2_2 = _mm_load_pd( &x2[2] );
-
-          __m128d right_k0_0 = _mm_load_pd( &ri[0] );
-          __m128d right_k0_2 = _mm_load_pd( &ri[2] );
-          __m128d right_k1_0 = _mm_load_pd( &ri[4] );
-          __m128d right_k1_2 = _mm_load_pd( &ri[6] );
-          __m128d right_k2_0 = _mm_load_pd( &ri[8] );
-          __m128d right_k2_2 = _mm_load_pd( &ri[10] );
-          __m128d right_k3_0 = _mm_load_pd( &ri[12] );
-          __m128d right_k3_2 = _mm_load_pd( &ri[14] );
-
-          right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-          right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-
-          right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-          right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-
-          right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-          right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-          right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-
-          right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-          right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-
-          right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-          right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-
-          right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-          right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-          right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
-
-          __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-          __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );	  	  
-
-          __m128d EV_t_l0_k0 = EVV[0];
-          __m128d EV_t_l0_k2 = EVV[1];
-          __m128d EV_t_l1_k0 = EVV[2];
-          __m128d EV_t_l1_k2 = EVV[3];
-          __m128d EV_t_l2_k0 = EVV[4];
-          __m128d EV_t_l2_k2 = EVV[5];
-          __m128d EV_t_l3_k0 = EVV[6];
-          __m128d EV_t_l3_k2 = EVV[7];
-
-          EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-          EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-
-          EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-          EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-
-          EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-
-          EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-          EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-
-          EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-          EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-          EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+          simde__m128d left_k0_0 = simde_mm_load_pd( &le[0] );
+          simde__m128d left_k0_2 = simde_mm_load_pd( &le[2] );
+          simde__m128d left_k1_0 = simde_mm_load_pd( &le[4] );
+          simde__m128d left_k1_2 = simde_mm_load_pd( &le[6] );
+          simde__m128d left_k2_0 = simde_mm_load_pd( &le[8] );
+          simde__m128d left_k2_2 = simde_mm_load_pd( &le[10] );
+          simde__m128d left_k3_0 = simde_mm_load_pd( &le[12] );
+          simde__m128d left_k3_2 = simde_mm_load_pd( &le[14] );
+
+          left_k0_0 = simde_mm_mul_pd(x1_0, left_k0_0);
+          left_k0_2 = simde_mm_mul_pd(x1_2, left_k0_2);
+
+          left_k1_0 = simde_mm_mul_pd(x1_0, left_k1_0);
+          left_k1_2 = simde_mm_mul_pd(x1_2, left_k1_2);
+
+          left_k0_0 = simde_mm_hadd_pd( left_k0_0, left_k0_2 );
+          left_k1_0 = simde_mm_hadd_pd( left_k1_0, left_k1_2);
+          left_k0_0 = simde_mm_hadd_pd( left_k0_0, left_k1_0);
+
+          left_k2_0 = simde_mm_mul_pd(x1_0, left_k2_0);
+          left_k2_2 = simde_mm_mul_pd(x1_2, left_k2_2);
+
+          left_k3_0 = simde_mm_mul_pd(x1_0, left_k3_0);
+          left_k3_2 = simde_mm_mul_pd(x1_2, left_k3_2);
+
+          left_k2_0 = simde_mm_hadd_pd( left_k2_0, left_k2_2);
+          left_k3_0 = simde_mm_hadd_pd( left_k3_0, left_k3_2);
+          left_k2_0 = simde_mm_hadd_pd( left_k2_0, left_k3_0);
+
+          simde__m128d x2_0 = simde_mm_load_pd( &x2[0] );
+          simde__m128d x2_2 = simde_mm_load_pd( &x2[2] );
+
+          simde__m128d right_k0_0 = simde_mm_load_pd( &ri[0] );
+          simde__m128d right_k0_2 = simde_mm_load_pd( &ri[2] );
+          simde__m128d right_k1_0 = simde_mm_load_pd( &ri[4] );
+          simde__m128d right_k1_2 = simde_mm_load_pd( &ri[6] );
+          simde__m128d right_k2_0 = simde_mm_load_pd( &ri[8] );
+          simde__m128d right_k2_2 = simde_mm_load_pd( &ri[10] );
+          simde__m128d right_k3_0 = simde_mm_load_pd( &ri[12] );
+          simde__m128d right_k3_2 = simde_mm_load_pd( &ri[14] );
+
+          right_k0_0 = simde_mm_mul_pd( x2_0, right_k0_0);
+          right_k0_2 = simde_mm_mul_pd( x2_2, right_k0_2);
+
+          right_k1_0 = simde_mm_mul_pd( x2_0, right_k1_0);
+          right_k1_2 = simde_mm_mul_pd( x2_2, right_k1_2);
+
+          right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k0_2);
+          right_k1_0 = simde_mm_hadd_pd( right_k1_0, right_k1_2);
+          right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k1_0);
+
+          right_k2_0 = simde_mm_mul_pd( x2_0, right_k2_0);
+          right_k2_2 = simde_mm_mul_pd( x2_2, right_k2_2);
+
+          right_k3_0 = simde_mm_mul_pd( x2_0, right_k3_0);
+          right_k3_2 = simde_mm_mul_pd( x2_2, right_k3_2);
+
+          right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k2_2);
+          right_k3_0 = simde_mm_hadd_pd( right_k3_0, right_k3_2);
+          right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k3_0);	   
+
+          simde__m128d x1px2_k0 = simde_mm_mul_pd( left_k0_0, right_k0_0 );
+          simde__m128d x1px2_k2 = simde_mm_mul_pd( left_k2_0, right_k2_0 );	  	  
+
+          simde__m128d EV_t_l0_k0 = EVV[0];
+          simde__m128d EV_t_l0_k2 = EVV[1];
+          simde__m128d EV_t_l1_k0 = EVV[2];
+          simde__m128d EV_t_l1_k2 = EVV[3];
+          simde__m128d EV_t_l2_k0 = EVV[4];
+          simde__m128d EV_t_l2_k2 = EVV[5];
+          simde__m128d EV_t_l3_k0 = EVV[6];
+          simde__m128d EV_t_l3_k2 = EVV[7];
+
+          EV_t_l0_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+          EV_t_l0_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+          EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+          EV_t_l1_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+          EV_t_l1_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+          EV_t_l1_k0 = simde_mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+          EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+          EV_t_l2_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+          EV_t_l2_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+          EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+          EV_t_l3_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+          EV_t_l3_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+          EV_t_l3_k0 = simde_mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
 
-          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	 
+          EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	 
 
-          _mm_store_pd(x3, EV_t_l0_k0);
-          _mm_store_pd(&x3[2], EV_t_l2_k0);	  	 	   	    
+          simde_mm_store_pd(x3, EV_t_l0_k0);
+          simde_mm_store_pd(&x3[2], EV_t_l2_k0);	  	 	   	    
 
           x3_ptr += 4;
         }
@@ -2350,121 +2342,121 @@
             x2_ptr += 4;
           }	  	  	  	  
 
-          __m128d x1_0 = _mm_load_pd( &x1[0] );
-          __m128d x1_2 = _mm_load_pd( &x1[2] );
+          simde__m128d x1_0 = simde_mm_load_pd( &x1[0] );
+          simde__m128d x1_2 = simde_mm_load_pd( &x1[2] );
 
-          __m128d left_k0_0 = _mm_load_pd( &le[0] );
-          __m128d left_k0_2 = _mm_load_pd( &le[2] );
-          __m128d left_k1_0 = _mm_load_pd( &le[4] );
-          __m128d left_k1_2 = _mm_load_pd( &le[6] );
-          __m128d left_k2_0 = _mm_load_pd( &le[8] );
-          __m128d left_k2_2 = _mm_load_pd( &le[10] );
-          __m128d left_k3_0 = _mm_load_pd( &le[12] );
-          __m128d left_k3_2 = _mm_load_pd( &le[14] );
-
-          left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-          left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-
-          left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-          left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-
-          left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-          left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-          left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-
-          left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-          left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-
-          left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-          left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-
-          left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-          left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-          left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-
-          __m128d x2_0 = _mm_load_pd( &x2[0] );
-          __m128d x2_2 = _mm_load_pd( &x2[2] );
-
-          __m128d right_k0_0 = _mm_load_pd( &ri[0] );
-          __m128d right_k0_2 = _mm_load_pd( &ri[2] );
-          __m128d right_k1_0 = _mm_load_pd( &ri[4] );
-          __m128d right_k1_2 = _mm_load_pd( &ri[6] );
-          __m128d right_k2_0 = _mm_load_pd( &ri[8] );
-          __m128d right_k2_2 = _mm_load_pd( &ri[10] );
-          __m128d right_k3_0 = _mm_load_pd( &ri[12] );
-          __m128d right_k3_2 = _mm_load_pd( &ri[14] );
-
-          right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-          right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-
-          right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-          right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-
-          right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-          right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-          right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-
-          right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-          right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-
-          right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-          right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-
-          right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-          right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-          right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
-
-          __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-          __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
-
-          __m128d EV_t_l0_k0 = EVV[0];
-          __m128d EV_t_l0_k2 = EVV[1];
-          __m128d EV_t_l1_k0 = EVV[2];
-          __m128d EV_t_l1_k2 = EVV[3];
-          __m128d EV_t_l2_k0 = EVV[4];
-          __m128d EV_t_l2_k2 = EVV[5];
-          __m128d EV_t_l3_k0 = EVV[6];
-          __m128d EV_t_l3_k2 = EVV[7];
-
-
-          EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-          EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-
-          EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-          EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-
-          EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-
-          EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-          EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-
-          EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-          EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-          EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+          simde__m128d left_k0_0 = simde_mm_load_pd( &le[0] );
+          simde__m128d left_k0_2 = simde_mm_load_pd( &le[2] );
+          simde__m128d left_k1_0 = simde_mm_load_pd( &le[4] );
+          simde__m128d left_k1_2 = simde_mm_load_pd( &le[6] );
+          simde__m128d left_k2_0 = simde_mm_load_pd( &le[8] );
+          simde__m128d left_k2_2 = simde_mm_load_pd( &le[10] );
+          simde__m128d left_k3_0 = simde_mm_load_pd( &le[12] );
+          simde__m128d left_k3_2 = simde_mm_load_pd( &le[14] );
+
+          left_k0_0 = simde_mm_mul_pd(x1_0, left_k0_0);
+          left_k0_2 = simde_mm_mul_pd(x1_2, left_k0_2);
+
+          left_k1_0 = simde_mm_mul_pd(x1_0, left_k1_0);
+          left_k1_2 = simde_mm_mul_pd(x1_2, left_k1_2);
+
+          left_k0_0 = simde_mm_hadd_pd( left_k0_0, left_k0_2 );
+          left_k1_0 = simde_mm_hadd_pd( left_k1_0, left_k1_2);
+          left_k0_0 = simde_mm_hadd_pd( left_k0_0, left_k1_0);
+
+          left_k2_0 = simde_mm_mul_pd(x1_0, left_k2_0);
+          left_k2_2 = simde_mm_mul_pd(x1_2, left_k2_2);
+
+          left_k3_0 = simde_mm_mul_pd(x1_0, left_k3_0);
+          left_k3_2 = simde_mm_mul_pd(x1_2, left_k3_2);
+
+          left_k2_0 = simde_mm_hadd_pd( left_k2_0, left_k2_2);
+          left_k3_0 = simde_mm_hadd_pd( left_k3_0, left_k3_2);
+          left_k2_0 = simde_mm_hadd_pd( left_k2_0, left_k3_0);
+
+          simde__m128d x2_0 = simde_mm_load_pd( &x2[0] );
+          simde__m128d x2_2 = simde_mm_load_pd( &x2[2] );
+
+          simde__m128d right_k0_0 = simde_mm_load_pd( &ri[0] );
+          simde__m128d right_k0_2 = simde_mm_load_pd( &ri[2] );
+          simde__m128d right_k1_0 = simde_mm_load_pd( &ri[4] );
+          simde__m128d right_k1_2 = simde_mm_load_pd( &ri[6] );
+          simde__m128d right_k2_0 = simde_mm_load_pd( &ri[8] );
+          simde__m128d right_k2_2 = simde_mm_load_pd( &ri[10] );
+          simde__m128d right_k3_0 = simde_mm_load_pd( &ri[12] );
+          simde__m128d right_k3_2 = simde_mm_load_pd( &ri[14] );
+
+          right_k0_0 = simde_mm_mul_pd( x2_0, right_k0_0);
+          right_k0_2 = simde_mm_mul_pd( x2_2, right_k0_2);
+
+          right_k1_0 = simde_mm_mul_pd( x2_0, right_k1_0);
+          right_k1_2 = simde_mm_mul_pd( x2_2, right_k1_2);
+
+          right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k0_2);
+          right_k1_0 = simde_mm_hadd_pd( right_k1_0, right_k1_2);
+          right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k1_0);
+
+          right_k2_0 = simde_mm_mul_pd( x2_0, right_k2_0);
+          right_k2_2 = simde_mm_mul_pd( x2_2, right_k2_2);
+
+          right_k3_0 = simde_mm_mul_pd( x2_0, right_k3_0);
+          right_k3_2 = simde_mm_mul_pd( x2_2, right_k3_2);
+
+          right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k2_2);
+          right_k3_0 = simde_mm_hadd_pd( right_k3_0, right_k3_2);
+          right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k3_0);	   
+
+          simde__m128d x1px2_k0 = simde_mm_mul_pd( left_k0_0, right_k0_0 );
+          simde__m128d x1px2_k2 = simde_mm_mul_pd( left_k2_0, right_k2_0 );
+
+          simde__m128d EV_t_l0_k0 = EVV[0];
+          simde__m128d EV_t_l0_k2 = EVV[1];
+          simde__m128d EV_t_l1_k0 = EVV[2];
+          simde__m128d EV_t_l1_k2 = EVV[3];
+          simde__m128d EV_t_l2_k0 = EVV[4];
+          simde__m128d EV_t_l2_k2 = EVV[5];
+          simde__m128d EV_t_l3_k0 = EVV[6];
+          simde__m128d EV_t_l3_k2 = EVV[7];
+
+
+          EV_t_l0_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+          EV_t_l0_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+          EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+          EV_t_l1_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+          EV_t_l1_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+          EV_t_l1_k0 = simde_mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+          EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+          EV_t_l2_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+          EV_t_l2_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+          EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+          EV_t_l3_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+          EV_t_l3_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+          EV_t_l3_k0 = simde_mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
 
-          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	  	 	    		  
+          EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	  	 	    		  
 
           scale = 1;
 
-          __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
-          v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-          if(_mm_movemask_pd( v1 ) != 3)
+          simde__m128d v1 = simde_mm_and_pd(EV_t_l0_k0, absMask.m);
+          v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+          if(simde_mm_movemask_pd( v1 ) != 3)
             scale = 0;
           else
           {
-            v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
-            v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-            if(_mm_movemask_pd( v1 ) != 3)
+            v1 = simde_mm_and_pd(EV_t_l2_k0, absMask.m);
+            v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+            if(simde_mm_movemask_pd( v1 ) != 3)
               scale = 0;
           }
 
           if(scale)
           {		      
-            _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
-            _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));	      	      
+            simde_mm_store_pd(&x3[0], simde_mm_mul_pd(EV_t_l0_k0, sc));
+            simde_mm_store_pd(&x3[2], simde_mm_mul_pd(EV_t_l2_k0, sc));	      	      
 
 	    if(useFastScaling)
 	      addScale += wgt[i];
@@ -2473,8 +2465,8 @@
           }	
           else
           {
-            _mm_store_pd(x3, EV_t_l0_k0);
-            _mm_store_pd(&x3[2], EV_t_l2_k0);
+            simde_mm_store_pd(x3, EV_t_l0_k0);
+            simde_mm_store_pd(&x3[2], EV_t_l2_k0);
           }
 
           x3_ptr += 4;
@@ -2523,120 +2515,120 @@
 		  x2_ptr += 4;
 		}	 	  	  	  
 
-	      __m128d x1_0 = _mm_load_pd( &x1[0] );
-	      __m128d x1_2 = _mm_load_pd( &x1[2] );
+	      simde__m128d x1_0 = simde_mm_load_pd( &x1[0] );
+	      simde__m128d x1_2 = simde_mm_load_pd( &x1[2] );
 	      
-	      __m128d left_k0_0 = _mm_load_pd( &le[0] );
-	      __m128d left_k0_2 = _mm_load_pd( &le[2] );
-	      __m128d left_k1_0 = _mm_load_pd( &le[4] );
-	      __m128d left_k1_2 = _mm_load_pd( &le[6] );
-	      __m128d left_k2_0 = _mm_load_pd( &le[8] );
-	      __m128d left_k2_2 = _mm_load_pd( &le[10] );
-	      __m128d left_k3_0 = _mm_load_pd( &le[12] );
-	      __m128d left_k3_2 = _mm_load_pd( &le[14] );
-	      
-	      left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-	      left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-	      
-	      left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-	      left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-	      
-	      left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-	      left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-	      left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-	      
-	      left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-	      left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-	      
-	      left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-	      left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-	      
-	      left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-	      left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-	      left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-	      
-	      __m128d x2_0 = _mm_load_pd( &x2[0] );
-	      __m128d x2_2 = _mm_load_pd( &x2[2] );
-	      
-	      __m128d right_k0_0 = _mm_load_pd( &ri[0] );
-	      __m128d right_k0_2 = _mm_load_pd( &ri[2] );
-	      __m128d right_k1_0 = _mm_load_pd( &ri[4] );
-	      __m128d right_k1_2 = _mm_load_pd( &ri[6] );
-	      __m128d right_k2_0 = _mm_load_pd( &ri[8] );
-	      __m128d right_k2_2 = _mm_load_pd( &ri[10] );
-	      __m128d right_k3_0 = _mm_load_pd( &ri[12] );
-	      __m128d right_k3_2 = _mm_load_pd( &ri[14] );
-	      
-	      right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-	      right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-	      
-	      right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-	      right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-	      
-	      right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-	      right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-	      right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-	      
-	      right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-	      right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-	      
-	      right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-	      right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-	      
-	      right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-	      right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-	      right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
-	      
-	      __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-	      __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
-	      
-	      __m128d EV_t_l0_k0 = EVV[0];
-	      __m128d EV_t_l0_k2 = EVV[1];
-	      __m128d EV_t_l1_k0 = EVV[2];
-	      __m128d EV_t_l1_k2 = EVV[3];
-	      __m128d EV_t_l2_k0 = EVV[4];
-	      __m128d EV_t_l2_k2 = EVV[5];
-	      __m128d EV_t_l3_k0 = EVV[6];
-	      __m128d EV_t_l3_k2 = EVV[7];
-
-	      EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	      EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	      EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-	      
-	      EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	      EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-	      
-	      EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	      EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-	      
-	      EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	      EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	      EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-	      
-	      EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	      EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	      EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+	      simde__m128d left_k0_0 = simde_mm_load_pd( &le[0] );
+	      simde__m128d left_k0_2 = simde_mm_load_pd( &le[2] );
+	      simde__m128d left_k1_0 = simde_mm_load_pd( &le[4] );
+	      simde__m128d left_k1_2 = simde_mm_load_pd( &le[6] );
+	      simde__m128d left_k2_0 = simde_mm_load_pd( &le[8] );
+	      simde__m128d left_k2_2 = simde_mm_load_pd( &le[10] );
+	      simde__m128d left_k3_0 = simde_mm_load_pd( &le[12] );
+	      simde__m128d left_k3_2 = simde_mm_load_pd( &le[14] );
+	      
+	      left_k0_0 = simde_mm_mul_pd(x1_0, left_k0_0);
+	      left_k0_2 = simde_mm_mul_pd(x1_2, left_k0_2);
+	      
+	      left_k1_0 = simde_mm_mul_pd(x1_0, left_k1_0);
+	      left_k1_2 = simde_mm_mul_pd(x1_2, left_k1_2);
+	      
+	      left_k0_0 = simde_mm_hadd_pd( left_k0_0, left_k0_2 );
+	      left_k1_0 = simde_mm_hadd_pd( left_k1_0, left_k1_2);
+	      left_k0_0 = simde_mm_hadd_pd( left_k0_0, left_k1_0);
+	      
+	      left_k2_0 = simde_mm_mul_pd(x1_0, left_k2_0);
+	      left_k2_2 = simde_mm_mul_pd(x1_2, left_k2_2);
+	      
+	      left_k3_0 = simde_mm_mul_pd(x1_0, left_k3_0);
+	      left_k3_2 = simde_mm_mul_pd(x1_2, left_k3_2);
+	      
+	      left_k2_0 = simde_mm_hadd_pd( left_k2_0, left_k2_2);
+	      left_k3_0 = simde_mm_hadd_pd( left_k3_0, left_k3_2);
+	      left_k2_0 = simde_mm_hadd_pd( left_k2_0, left_k3_0);
+	      
+	      simde__m128d x2_0 = simde_mm_load_pd( &x2[0] );
+	      simde__m128d x2_2 = simde_mm_load_pd( &x2[2] );
+	      
+	      simde__m128d right_k0_0 = simde_mm_load_pd( &ri[0] );
+	      simde__m128d right_k0_2 = simde_mm_load_pd( &ri[2] );
+	      simde__m128d right_k1_0 = simde_mm_load_pd( &ri[4] );
+	      simde__m128d right_k1_2 = simde_mm_load_pd( &ri[6] );
+	      simde__m128d right_k2_0 = simde_mm_load_pd( &ri[8] );
+	      simde__m128d right_k2_2 = simde_mm_load_pd( &ri[10] );
+	      simde__m128d right_k3_0 = simde_mm_load_pd( &ri[12] );
+	      simde__m128d right_k3_2 = simde_mm_load_pd( &ri[14] );
+	      
+	      right_k0_0 = simde_mm_mul_pd( x2_0, right_k0_0);
+	      right_k0_2 = simde_mm_mul_pd( x2_2, right_k0_2);
+	      
+	      right_k1_0 = simde_mm_mul_pd( x2_0, right_k1_0);
+	      right_k1_2 = simde_mm_mul_pd( x2_2, right_k1_2);
+	      
+	      right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k0_2);
+	      right_k1_0 = simde_mm_hadd_pd( right_k1_0, right_k1_2);
+	      right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k1_0);
+	      
+	      right_k2_0 = simde_mm_mul_pd( x2_0, right_k2_0);
+	      right_k2_2 = simde_mm_mul_pd( x2_2, right_k2_2);
+	      
+	      right_k3_0 = simde_mm_mul_pd( x2_0, right_k3_0);
+	      right_k3_2 = simde_mm_mul_pd( x2_2, right_k3_2);
+	      
+	      right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k2_2);
+	      right_k3_0 = simde_mm_hadd_pd( right_k3_0, right_k3_2);
+	      right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k3_0);	   
+	      
+	      simde__m128d x1px2_k0 = simde_mm_mul_pd( left_k0_0, right_k0_0 );
+	      simde__m128d x1px2_k2 = simde_mm_mul_pd( left_k2_0, right_k2_0 );
+	      
+	      simde__m128d EV_t_l0_k0 = EVV[0];
+	      simde__m128d EV_t_l0_k2 = EVV[1];
+	      simde__m128d EV_t_l1_k0 = EVV[2];
+	      simde__m128d EV_t_l1_k2 = EVV[3];
+	      simde__m128d EV_t_l2_k0 = EVV[4];
+	      simde__m128d EV_t_l2_k2 = EVV[5];
+	      simde__m128d EV_t_l3_k0 = EVV[6];
+	      simde__m128d EV_t_l3_k2 = EVV[7];
+
+	      EV_t_l0_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+	      EV_t_l0_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+	      EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+	      
+	      EV_t_l1_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+	      EV_t_l1_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+	      
+	      EV_t_l1_k0 = simde_mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+	      EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+	      
+	      EV_t_l2_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+	      EV_t_l2_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+	      EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+	      
+	      EV_t_l3_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+	      EV_t_l3_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+	      EV_t_l3_k0 = simde_mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
 	      
-	      EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	  	 	    		  	 
+	      EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	  	 	    		  	 
 	      
 	      scale = 1;
 	      
-	      __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
-	      v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	      if(_mm_movemask_pd( v1 ) != 3)
+	      simde__m128d v1 = simde_mm_and_pd(EV_t_l0_k0, absMask.m);
+	      v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+	      if(simde_mm_movemask_pd( v1 ) != 3)
 		scale = 0;
 	      else
 		{
-		  v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
-		  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		  if(_mm_movemask_pd( v1 ) != 3)
+		  v1 = simde_mm_and_pd(EV_t_l2_k0, absMask.m);
+		  v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+		  if(simde_mm_movemask_pd( v1 ) != 3)
 		    scale = 0;
 		}
 
 	      if(scale)
 		{		      
-		  EV_t_l0_k0 = _mm_mul_pd(EV_t_l0_k0, sc);
-		  EV_t_l2_k0 = _mm_mul_pd(EV_t_l2_k0, sc);	      	      
+		  EV_t_l0_k0 = simde_mm_mul_pd(EV_t_l0_k0, sc);
+		  EV_t_l2_k0 = simde_mm_mul_pd(EV_t_l2_k0, sc);	      	      
 		  
 		  if(useFastScaling)
 		    addScale += wgt[i];
@@ -2644,8 +2636,8 @@
 		    ex3[i] += 1;            
 		}	
 
-	      _mm_store_pd(&x3[0], EV_t_l0_k0);
-	      _mm_store_pd(&x3[2], EV_t_l2_k0);
+	      simde_mm_store_pd(&x3[0], EV_t_l0_k0);
+	      simde_mm_store_pd(&x3[2], EV_t_l2_k0);
 	      
 	      x3_ptr += 4;
 	    }
@@ -2679,9 +2671,9 @@
     scale, 
     addScale = 0;
    
-  __m128d
-    minlikelihood_sse = _mm_set1_pd( minlikelihood ),
-    sc = _mm_set1_pd(twotothe256),
+  simde__m128d
+    minlikelihood_sse = simde_mm_set1_pd( minlikelihood ),
+    sc = simde_mm_set1_pd(twotothe256),
     EVV[8];  
   
   for(i = 0; i < 4; i++)
@@ -2689,7 +2681,7 @@
       EV_t[4 * j + i] = EV[4 * i + j];
   
   for(i = 0; i < 8; i++)
-    EVV[i] = _mm_load_pd(&EV_t[i * 2]);
+    EVV[i] = simde_mm_load_pd(&EV_t[i * 2]);
   
   switch(tipCase)
     {
@@ -2704,104 +2696,104 @@
 	  le =  &left[cptr[i] * 16];
 	  ri =  &right[cptr[i] * 16];
 	  
-	  __m128d x1_0 = _mm_load_pd( &x1[0] );
-	  __m128d x1_2 = _mm_load_pd( &x1[2] );
+	  simde__m128d x1_0 = simde_mm_load_pd( &x1[0] );
+	  simde__m128d x1_2 = simde_mm_load_pd( &x1[2] );
 	  
-	  __m128d left_k0_0 = _mm_load_pd( &le[0] );
-	  __m128d left_k0_2 = _mm_load_pd( &le[2] );
-	  __m128d left_k1_0 = _mm_load_pd( &le[4] );
-	  __m128d left_k1_2 = _mm_load_pd( &le[6] );
-	  __m128d left_k2_0 = _mm_load_pd( &le[8] );
-	  __m128d left_k2_2 = _mm_load_pd( &le[10] );
-	  __m128d left_k3_0 = _mm_load_pd( &le[12] );
-	  __m128d left_k3_2 = _mm_load_pd( &le[14] );
-	  
-	  left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-	  left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-	  
-	  left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-	  left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-	  
-	  left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-	  left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-	  left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-	  
-	  left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-	  left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-	  
-	  left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-	  left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-	  
-	  left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-	  left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-	  left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-	  
-	  __m128d x2_0 = _mm_load_pd( &x2[0] );
-	  __m128d x2_2 = _mm_load_pd( &x2[2] );
-	  
-	  __m128d right_k0_0 = _mm_load_pd( &ri[0] );
-	  __m128d right_k0_2 = _mm_load_pd( &ri[2] );
-	  __m128d right_k1_0 = _mm_load_pd( &ri[4] );
-	  __m128d right_k1_2 = _mm_load_pd( &ri[6] );
-	  __m128d right_k2_0 = _mm_load_pd( &ri[8] );
-	  __m128d right_k2_2 = _mm_load_pd( &ri[10] );
-	  __m128d right_k3_0 = _mm_load_pd( &ri[12] );
-	  __m128d right_k3_2 = _mm_load_pd( &ri[14] );
-	  
-	  right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-	  right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-	  
-	  right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-	  right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-	  
-	  right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-	  right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-	  right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-	  
-	  right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-	  right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-	  
-	  right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-	  right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-	  
-	  right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-	  right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-	  right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
-	  
-	  __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-	  __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );	  	  
-
-	  __m128d EV_t_l0_k0 = EVV[0];
-	  __m128d EV_t_l0_k2 = EVV[1];
-	  __m128d EV_t_l1_k0 = EVV[2];
-	  __m128d EV_t_l1_k2 = EVV[3];
-	  __m128d EV_t_l2_k0 = EVV[4];
-	  __m128d EV_t_l2_k2 = EVV[5];
-	  __m128d EV_t_l3_k0 = EVV[6];
-	  __m128d EV_t_l3_k2 = EVV[7];
-	  
-	  EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	  EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	  EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-	  
-	  EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	  EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-	  
-	  EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	  EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-	  
-	  EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	  EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	  EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+	  simde__m128d left_k0_0 = simde_mm_load_pd( &le[0] );
+	  simde__m128d left_k0_2 = simde_mm_load_pd( &le[2] );
+	  simde__m128d left_k1_0 = simde_mm_load_pd( &le[4] );
+	  simde__m128d left_k1_2 = simde_mm_load_pd( &le[6] );
+	  simde__m128d left_k2_0 = simde_mm_load_pd( &le[8] );
+	  simde__m128d left_k2_2 = simde_mm_load_pd( &le[10] );
+	  simde__m128d left_k3_0 = simde_mm_load_pd( &le[12] );
+	  simde__m128d left_k3_2 = simde_mm_load_pd( &le[14] );
+	  
+	  left_k0_0 = simde_mm_mul_pd(x1_0, left_k0_0);
+	  left_k0_2 = simde_mm_mul_pd(x1_2, left_k0_2);
+	  
+	  left_k1_0 = simde_mm_mul_pd(x1_0, left_k1_0);
+	  left_k1_2 = simde_mm_mul_pd(x1_2, left_k1_2);
+	  
+	  left_k0_0 = simde_mm_hadd_pd( left_k0_0, left_k0_2 );
+	  left_k1_0 = simde_mm_hadd_pd( left_k1_0, left_k1_2);
+	  left_k0_0 = simde_mm_hadd_pd( left_k0_0, left_k1_0);
+	  
+	  left_k2_0 = simde_mm_mul_pd(x1_0, left_k2_0);
+	  left_k2_2 = simde_mm_mul_pd(x1_2, left_k2_2);
+	  
+	  left_k3_0 = simde_mm_mul_pd(x1_0, left_k3_0);
+	  left_k3_2 = simde_mm_mul_pd(x1_2, left_k3_2);
+	  
+	  left_k2_0 = simde_mm_hadd_pd( left_k2_0, left_k2_2);
+	  left_k3_0 = simde_mm_hadd_pd( left_k3_0, left_k3_2);
+	  left_k2_0 = simde_mm_hadd_pd( left_k2_0, left_k3_0);
+	  
+	  simde__m128d x2_0 = simde_mm_load_pd( &x2[0] );
+	  simde__m128d x2_2 = simde_mm_load_pd( &x2[2] );
+	  
+	  simde__m128d right_k0_0 = simde_mm_load_pd( &ri[0] );
+	  simde__m128d right_k0_2 = simde_mm_load_pd( &ri[2] );
+	  simde__m128d right_k1_0 = simde_mm_load_pd( &ri[4] );
+	  simde__m128d right_k1_2 = simde_mm_load_pd( &ri[6] );
+	  simde__m128d right_k2_0 = simde_mm_load_pd( &ri[8] );
+	  simde__m128d right_k2_2 = simde_mm_load_pd( &ri[10] );
+	  simde__m128d right_k3_0 = simde_mm_load_pd( &ri[12] );
+	  simde__m128d right_k3_2 = simde_mm_load_pd( &ri[14] );
+	  
+	  right_k0_0 = simde_mm_mul_pd( x2_0, right_k0_0);
+	  right_k0_2 = simde_mm_mul_pd( x2_2, right_k0_2);
+	  
+	  right_k1_0 = simde_mm_mul_pd( x2_0, right_k1_0);
+	  right_k1_2 = simde_mm_mul_pd( x2_2, right_k1_2);
+	  
+	  right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k0_2);
+	  right_k1_0 = simde_mm_hadd_pd( right_k1_0, right_k1_2);
+	  right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k1_0);
+	  
+	  right_k2_0 = simde_mm_mul_pd( x2_0, right_k2_0);
+	  right_k2_2 = simde_mm_mul_pd( x2_2, right_k2_2);
+	  
+	  right_k3_0 = simde_mm_mul_pd( x2_0, right_k3_0);
+	  right_k3_2 = simde_mm_mul_pd( x2_2, right_k3_2);
+	  
+	  right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k2_2);
+	  right_k3_0 = simde_mm_hadd_pd( right_k3_0, right_k3_2);
+	  right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k3_0);	   
+	  
+	  simde__m128d x1px2_k0 = simde_mm_mul_pd( left_k0_0, right_k0_0 );
+	  simde__m128d x1px2_k2 = simde_mm_mul_pd( left_k2_0, right_k2_0 );	  	  
+
+	  simde__m128d EV_t_l0_k0 = EVV[0];
+	  simde__m128d EV_t_l0_k2 = EVV[1];
+	  simde__m128d EV_t_l1_k0 = EVV[2];
+	  simde__m128d EV_t_l1_k2 = EVV[3];
+	  simde__m128d EV_t_l2_k0 = EVV[4];
+	  simde__m128d EV_t_l2_k2 = EVV[5];
+	  simde__m128d EV_t_l3_k0 = EVV[6];
+	  simde__m128d EV_t_l3_k2 = EVV[7];
+	  
+	  EV_t_l0_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+	  EV_t_l0_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+	  EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+	  
+	  EV_t_l1_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+	  EV_t_l1_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+	  
+	  EV_t_l1_k0 = simde_mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+	  EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+	  
+	  EV_t_l2_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+	  EV_t_l2_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+	  EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
 	  	  
-	  EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	  EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	  EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+	  EV_t_l3_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+	  EV_t_l3_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+	  EV_t_l3_k0 = simde_mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
 	  
-	  EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	 
+	  EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	 
 	  	  
-	  _mm_store_pd(x3, EV_t_l0_k0);
-	  _mm_store_pd(&x3[2], EV_t_l2_k0);	  	 	   	    
+	  simde_mm_store_pd(x3, EV_t_l0_k0);
+	  simde_mm_store_pd(&x3[2], EV_t_l2_k0);	  	 	   	    
 	}
       break;
     case TIP_INNER:      
@@ -2814,121 +2806,121 @@
 	  le =  &left[cptr[i] * 16];
 	  ri =  &right[cptr[i] * 16];
 
-	  __m128d x1_0 = _mm_load_pd( &x1[0] );
-	  __m128d x1_2 = _mm_load_pd( &x1[2] );
+	  simde__m128d x1_0 = simde_mm_load_pd( &x1[0] );
+	  simde__m128d x1_2 = simde_mm_load_pd( &x1[2] );
 	  
-	  __m128d left_k0_0 = _mm_load_pd( &le[0] );
-	  __m128d left_k0_2 = _mm_load_pd( &le[2] );
-	  __m128d left_k1_0 = _mm_load_pd( &le[4] );
-	  __m128d left_k1_2 = _mm_load_pd( &le[6] );
-	  __m128d left_k2_0 = _mm_load_pd( &le[8] );
-	  __m128d left_k2_2 = _mm_load_pd( &le[10] );
-	  __m128d left_k3_0 = _mm_load_pd( &le[12] );
-	  __m128d left_k3_2 = _mm_load_pd( &le[14] );
-	  
-	  left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-	  left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-	  
-	  left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-	  left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-	  
-	  left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-	  left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-	  left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-	  
-	  left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-	  left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-	  
-	  left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-	  left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-	  
-	  left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-	  left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-	  left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-	  
-	  __m128d x2_0 = _mm_load_pd( &x2[0] );
-	  __m128d x2_2 = _mm_load_pd( &x2[2] );
-	  
-	  __m128d right_k0_0 = _mm_load_pd( &ri[0] );
-	  __m128d right_k0_2 = _mm_load_pd( &ri[2] );
-	  __m128d right_k1_0 = _mm_load_pd( &ri[4] );
-	  __m128d right_k1_2 = _mm_load_pd( &ri[6] );
-	  __m128d right_k2_0 = _mm_load_pd( &ri[8] );
-	  __m128d right_k2_2 = _mm_load_pd( &ri[10] );
-	  __m128d right_k3_0 = _mm_load_pd( &ri[12] );
-	  __m128d right_k3_2 = _mm_load_pd( &ri[14] );
-	  
-	  right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-	  right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-	  
-	  right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-	  right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-	  
-	  right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-	  right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-	  right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-	  
-	  right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-	  right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-	  
-	  right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-	  right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-	  
-	  right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-	  right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-	  right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
-	  
-	  __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-	  __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
-	  
-	  __m128d EV_t_l0_k0 = EVV[0];
-	  __m128d EV_t_l0_k2 = EVV[1];
-	  __m128d EV_t_l1_k0 = EVV[2];
-	  __m128d EV_t_l1_k2 = EVV[3];
-	  __m128d EV_t_l2_k0 = EVV[4];
-	  __m128d EV_t_l2_k2 = EVV[5];
-	  __m128d EV_t_l3_k0 = EVV[6];
-	  __m128d EV_t_l3_k2 = EVV[7];
+	  simde__m128d left_k0_0 = simde_mm_load_pd( &le[0] );
+	  simde__m128d left_k0_2 = simde_mm_load_pd( &le[2] );
+	  simde__m128d left_k1_0 = simde_mm_load_pd( &le[4] );
+	  simde__m128d left_k1_2 = simde_mm_load_pd( &le[6] );
+	  simde__m128d left_k2_0 = simde_mm_load_pd( &le[8] );
+	  simde__m128d left_k2_2 = simde_mm_load_pd( &le[10] );
+	  simde__m128d left_k3_0 = simde_mm_load_pd( &le[12] );
+	  simde__m128d left_k3_2 = simde_mm_load_pd( &le[14] );
+	  
+	  left_k0_0 = simde_mm_mul_pd(x1_0, left_k0_0);
+	  left_k0_2 = simde_mm_mul_pd(x1_2, left_k0_2);
+	  
+	  left_k1_0 = simde_mm_mul_pd(x1_0, left_k1_0);
+	  left_k1_2 = simde_mm_mul_pd(x1_2, left_k1_2);
+	  
+	  left_k0_0 = simde_mm_hadd_pd( left_k0_0, left_k0_2 );
+	  left_k1_0 = simde_mm_hadd_pd( left_k1_0, left_k1_2);
+	  left_k0_0 = simde_mm_hadd_pd( left_k0_0, left_k1_0);
+	  
+	  left_k2_0 = simde_mm_mul_pd(x1_0, left_k2_0);
+	  left_k2_2 = simde_mm_mul_pd(x1_2, left_k2_2);
+	  
+	  left_k3_0 = simde_mm_mul_pd(x1_0, left_k3_0);
+	  left_k3_2 = simde_mm_mul_pd(x1_2, left_k3_2);
+	  
+	  left_k2_0 = simde_mm_hadd_pd( left_k2_0, left_k2_2);
+	  left_k3_0 = simde_mm_hadd_pd( left_k3_0, left_k3_2);
+	  left_k2_0 = simde_mm_hadd_pd( left_k2_0, left_k3_0);
+	  
+	  simde__m128d x2_0 = simde_mm_load_pd( &x2[0] );
+	  simde__m128d x2_2 = simde_mm_load_pd( &x2[2] );
+	  
+	  simde__m128d right_k0_0 = simde_mm_load_pd( &ri[0] );
+	  simde__m128d right_k0_2 = simde_mm_load_pd( &ri[2] );
+	  simde__m128d right_k1_0 = simde_mm_load_pd( &ri[4] );
+	  simde__m128d right_k1_2 = simde_mm_load_pd( &ri[6] );
+	  simde__m128d right_k2_0 = simde_mm_load_pd( &ri[8] );
+	  simde__m128d right_k2_2 = simde_mm_load_pd( &ri[10] );
+	  simde__m128d right_k3_0 = simde_mm_load_pd( &ri[12] );
+	  simde__m128d right_k3_2 = simde_mm_load_pd( &ri[14] );
+	  
+	  right_k0_0 = simde_mm_mul_pd( x2_0, right_k0_0);
+	  right_k0_2 = simde_mm_mul_pd( x2_2, right_k0_2);
+	  
+	  right_k1_0 = simde_mm_mul_pd( x2_0, right_k1_0);
+	  right_k1_2 = simde_mm_mul_pd( x2_2, right_k1_2);
+	  
+	  right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k0_2);
+	  right_k1_0 = simde_mm_hadd_pd( right_k1_0, right_k1_2);
+	  right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k1_0);
+	  
+	  right_k2_0 = simde_mm_mul_pd( x2_0, right_k2_0);
+	  right_k2_2 = simde_mm_mul_pd( x2_2, right_k2_2);
+	  
+	  right_k3_0 = simde_mm_mul_pd( x2_0, right_k3_0);
+	  right_k3_2 = simde_mm_mul_pd( x2_2, right_k3_2);
+	  
+	  right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k2_2);
+	  right_k3_0 = simde_mm_hadd_pd( right_k3_0, right_k3_2);
+	  right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k3_0);	   
+	  
+	  simde__m128d x1px2_k0 = simde_mm_mul_pd( left_k0_0, right_k0_0 );
+	  simde__m128d x1px2_k2 = simde_mm_mul_pd( left_k2_0, right_k2_0 );
+	  
+	  simde__m128d EV_t_l0_k0 = EVV[0];
+	  simde__m128d EV_t_l0_k2 = EVV[1];
+	  simde__m128d EV_t_l1_k0 = EVV[2];
+	  simde__m128d EV_t_l1_k2 = EVV[3];
+	  simde__m128d EV_t_l2_k0 = EVV[4];
+	  simde__m128d EV_t_l2_k2 = EVV[5];
+	  simde__m128d EV_t_l3_k0 = EVV[6];
+	  simde__m128d EV_t_l3_k2 = EVV[7];
 	 
 	  
-	  EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	  EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	  EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-	  
-	  EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	  EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-	  
-	  EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	  EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-	  
-	  EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	  EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	  EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+	  EV_t_l0_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+	  EV_t_l0_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+	  EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+	  
+	  EV_t_l1_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+	  EV_t_l1_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+	  
+	  EV_t_l1_k0 = simde_mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+	  EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+	  
+	  EV_t_l2_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+	  EV_t_l2_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+	  EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
 	  	  
-	  EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	  EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	  EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+	  EV_t_l3_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+	  EV_t_l3_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+	  EV_t_l3_k0 = simde_mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
 	  
-	  EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	  	 	    		  
+	  EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	  	 	    		  
 	 
 	  scale = 1;
 	  	  	  	    
-	  __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
-	  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	  if(_mm_movemask_pd( v1 ) != 3)
+	  simde__m128d v1 = simde_mm_and_pd(EV_t_l0_k0, absMask.m);
+	  v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+	  if(simde_mm_movemask_pd( v1 ) != 3)
 	    scale = 0;
 	  else
 	    {
-	      v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
-	      v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	      if(_mm_movemask_pd( v1 ) != 3)
+	      v1 = simde_mm_and_pd(EV_t_l2_k0, absMask.m);
+	      v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+	      if(simde_mm_movemask_pd( v1 ) != 3)
 		scale = 0;
 	    }
 	  	  
 	  if(scale)
 	    {		      
-	      _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
-	      _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));	      	      
+	      simde_mm_store_pd(&x3[0], simde_mm_mul_pd(EV_t_l0_k0, sc));
+	      simde_mm_store_pd(&x3[2], simde_mm_mul_pd(EV_t_l2_k0, sc));	      	      
 	      
 	      if(useFastScaling)
 		addScale += wgt[i];
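
The hunks in this file are mechanical renames: each simde_* type and function keeps the exact semantics of the x86 intrinsic it replaces, and SIMDE supplies a portable fallback on non-x86 targets. The workhorse throughout is the horizontal add; a scalar sketch of its contract (hypothetical helper, not SIMDE's actual implementation):

    /* simde_mm_hadd_pd(a, b) packs the two within-vector sums:
     * result lane 0 = a[0] + a[1], result lane 1 = b[0] + b[1].
     * With native SSE3 available, SIMDE forwards to the haddpd instruction;
     * elsewhere it emulates these semantics in portable code. */
    typedef struct { double d[2]; } vec2d;

    static vec2d hadd_pd_sketch(vec2d a, vec2d b)
    {
      vec2d r;
      r.d[0] = a.d[0] + a.d[1];
      r.d[1] = b.d[0] + b.d[1];
      return r;
    }
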
@@ -2937,8 +2929,8 @@
 	    }	
 	  else
 	    {
-	      _mm_store_pd(x3, EV_t_l0_k0);
-	      _mm_store_pd(&x3[2], EV_t_l2_k0);
+	      simde_mm_store_pd(x3, EV_t_l0_k0);
+	      simde_mm_store_pd(&x3[2], EV_t_l2_k0);
 	    }
 	 
 	  	  
@@ -2954,121 +2946,121 @@
 	  le =  &left[cptr[i] * 16];
 	  ri =  &right[cptr[i] * 16];
 
-	  __m128d x1_0 = _mm_load_pd( &x1[0] );
-	  __m128d x1_2 = _mm_load_pd( &x1[2] );
+	  simde__m128d x1_0 = simde_mm_load_pd( &x1[0] );
+	  simde__m128d x1_2 = simde_mm_load_pd( &x1[2] );
 	  
-	  __m128d left_k0_0 = _mm_load_pd( &le[0] );
-	  __m128d left_k0_2 = _mm_load_pd( &le[2] );
-	  __m128d left_k1_0 = _mm_load_pd( &le[4] );
-	  __m128d left_k1_2 = _mm_load_pd( &le[6] );
-	  __m128d left_k2_0 = _mm_load_pd( &le[8] );
-	  __m128d left_k2_2 = _mm_load_pd( &le[10] );
-	  __m128d left_k3_0 = _mm_load_pd( &le[12] );
-	  __m128d left_k3_2 = _mm_load_pd( &le[14] );
-	  
-	  left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-	  left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-	  
-	  left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-	  left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-	  
-	  left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-	  left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-	  left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-	  
-	  left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-	  left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-	  
-	  left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-	  left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-	  
-	  left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-	  left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-	  left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-	  
-	  __m128d x2_0 = _mm_load_pd( &x2[0] );
-	  __m128d x2_2 = _mm_load_pd( &x2[2] );
-	  
-	  __m128d right_k0_0 = _mm_load_pd( &ri[0] );
-	  __m128d right_k0_2 = _mm_load_pd( &ri[2] );
-	  __m128d right_k1_0 = _mm_load_pd( &ri[4] );
-	  __m128d right_k1_2 = _mm_load_pd( &ri[6] );
-	  __m128d right_k2_0 = _mm_load_pd( &ri[8] );
-	  __m128d right_k2_2 = _mm_load_pd( &ri[10] );
-	  __m128d right_k3_0 = _mm_load_pd( &ri[12] );
-	  __m128d right_k3_2 = _mm_load_pd( &ri[14] );
-	  
-	  right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-	  right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-	  
-	  right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-	  right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-	  
-	  right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-	  right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-	  right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-	  
-	  right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-	  right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-	  
-	  right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-	  right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-	  
-	  right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-	  right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-	  right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
-	  
-	  __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-	  __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
-	  
-	  __m128d EV_t_l0_k0 = EVV[0];
-	  __m128d EV_t_l0_k2 = EVV[1];
-	  __m128d EV_t_l1_k0 = EVV[2];
-	  __m128d EV_t_l1_k2 = EVV[3];
-	  __m128d EV_t_l2_k0 = EVV[4];
-	  __m128d EV_t_l2_k2 = EVV[5];
-	  __m128d EV_t_l3_k0 = EVV[6];
-	  __m128d EV_t_l3_k2 = EVV[7];
+	  simde__m128d left_k0_0 = simde_mm_load_pd( &le[0] );
+	  simde__m128d left_k0_2 = simde_mm_load_pd( &le[2] );
+	  simde__m128d left_k1_0 = simde_mm_load_pd( &le[4] );
+	  simde__m128d left_k1_2 = simde_mm_load_pd( &le[6] );
+	  simde__m128d left_k2_0 = simde_mm_load_pd( &le[8] );
+	  simde__m128d left_k2_2 = simde_mm_load_pd( &le[10] );
+	  simde__m128d left_k3_0 = simde_mm_load_pd( &le[12] );
+	  simde__m128d left_k3_2 = simde_mm_load_pd( &le[14] );
+	  
+	  left_k0_0 = simde_mm_mul_pd(x1_0, left_k0_0);
+	  left_k0_2 = simde_mm_mul_pd(x1_2, left_k0_2);
+	  
+	  left_k1_0 = simde_mm_mul_pd(x1_0, left_k1_0);
+	  left_k1_2 = simde_mm_mul_pd(x1_2, left_k1_2);
+	  
+	  left_k0_0 = simde_mm_hadd_pd( left_k0_0, left_k0_2 );
+	  left_k1_0 = simde_mm_hadd_pd( left_k1_0, left_k1_2);
+	  left_k0_0 = simde_mm_hadd_pd( left_k0_0, left_k1_0);
+	  
+	  left_k2_0 = simde_mm_mul_pd(x1_0, left_k2_0);
+	  left_k2_2 = simde_mm_mul_pd(x1_2, left_k2_2);
+	  
+	  left_k3_0 = simde_mm_mul_pd(x1_0, left_k3_0);
+	  left_k3_2 = simde_mm_mul_pd(x1_2, left_k3_2);
+	  
+	  left_k2_0 = simde_mm_hadd_pd( left_k2_0, left_k2_2);
+	  left_k3_0 = simde_mm_hadd_pd( left_k3_0, left_k3_2);
+	  left_k2_0 = simde_mm_hadd_pd( left_k2_0, left_k3_0);
+	  
+	  simde__m128d x2_0 = simde_mm_load_pd( &x2[0] );
+	  simde__m128d x2_2 = simde_mm_load_pd( &x2[2] );
+	  
+	  simde__m128d right_k0_0 = simde_mm_load_pd( &ri[0] );
+	  simde__m128d right_k0_2 = simde_mm_load_pd( &ri[2] );
+	  simde__m128d right_k1_0 = simde_mm_load_pd( &ri[4] );
+	  simde__m128d right_k1_2 = simde_mm_load_pd( &ri[6] );
+	  simde__m128d right_k2_0 = simde_mm_load_pd( &ri[8] );
+	  simde__m128d right_k2_2 = simde_mm_load_pd( &ri[10] );
+	  simde__m128d right_k3_0 = simde_mm_load_pd( &ri[12] );
+	  simde__m128d right_k3_2 = simde_mm_load_pd( &ri[14] );
+	  
+	  right_k0_0 = simde_mm_mul_pd( x2_0, right_k0_0);
+	  right_k0_2 = simde_mm_mul_pd( x2_2, right_k0_2);
+	  
+	  right_k1_0 = simde_mm_mul_pd( x2_0, right_k1_0);
+	  right_k1_2 = simde_mm_mul_pd( x2_2, right_k1_2);
+	  
+	  right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k0_2);
+	  right_k1_0 = simde_mm_hadd_pd( right_k1_0, right_k1_2);
+	  right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k1_0);
+	  
+	  right_k2_0 = simde_mm_mul_pd( x2_0, right_k2_0);
+	  right_k2_2 = simde_mm_mul_pd( x2_2, right_k2_2);
+	  
+	  right_k3_0 = simde_mm_mul_pd( x2_0, right_k3_0);
+	  right_k3_2 = simde_mm_mul_pd( x2_2, right_k3_2);
+	  
+	  right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k2_2);
+	  right_k3_0 = simde_mm_hadd_pd( right_k3_0, right_k3_2);
+	  right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k3_0);	   
+	  
+	  simde__m128d x1px2_k0 = simde_mm_mul_pd( left_k0_0, right_k0_0 );
+	  simde__m128d x1px2_k2 = simde_mm_mul_pd( left_k2_0, right_k2_0 );
+	  
+	  simde__m128d EV_t_l0_k0 = EVV[0];
+	  simde__m128d EV_t_l0_k2 = EVV[1];
+	  simde__m128d EV_t_l1_k0 = EVV[2];
+	  simde__m128d EV_t_l1_k2 = EVV[3];
+	  simde__m128d EV_t_l2_k0 = EVV[4];
+	  simde__m128d EV_t_l2_k2 = EVV[5];
+	  simde__m128d EV_t_l3_k0 = EVV[6];
+	  simde__m128d EV_t_l3_k2 = EVV[7];
 	 
 	  
-	  EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	  EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	  EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-	  
-	  EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	  EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-	  
-	  EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	  EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-	  
-	  EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	  EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	  EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+	  EV_t_l0_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+	  EV_t_l0_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+	  EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+	  
+	  EV_t_l1_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+	  EV_t_l1_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+	  
+	  EV_t_l1_k0 = simde_mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+	  EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+	  
+	  EV_t_l2_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+	  EV_t_l2_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+	  EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
 	  	  
-	  EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	  EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	  EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+	  EV_t_l3_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+	  EV_t_l3_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+	  EV_t_l3_k0 = simde_mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
 	  
-	  EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	  	 	    		  	 
+	  EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	  	 	    		  	 
 
 	  scale = 1;
 	  	  
-	  __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
-	  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	  if(_mm_movemask_pd( v1 ) != 3)
+	  simde__m128d v1 = simde_mm_and_pd(EV_t_l0_k0, absMask.m);
+	  v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+	  if(simde_mm_movemask_pd( v1 ) != 3)
 	    scale = 0;
 	  else
 	    {
-	      v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
-	      v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	      if(_mm_movemask_pd( v1 ) != 3)
+	      v1 = simde_mm_and_pd(EV_t_l2_k0, absMask.m);
+	      v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+	      if(simde_mm_movemask_pd( v1 ) != 3)
 		scale = 0;
 	    }
 	  	  
 	  if(scale)
 	    {		      
-	      _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
-	      _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));	      	      
+	      simde_mm_store_pd(&x3[0], simde_mm_mul_pd(EV_t_l0_k0, sc));
+	      simde_mm_store_pd(&x3[2], simde_mm_mul_pd(EV_t_l2_k0, sc));	      	      
 	      
 	      if(useFastScaling)
 		addScale += wgt[i];
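
The mul/hadd cascade renamed above computes four 4-element dot products at once: the site vector x1 (or x2) against the four rows of the left (or right) transition matrix for one rate category, leaving {row 0, row 1} packed in *_k0_0 and {row 2, row 3} in *_k2_0. A scalar equivalent (illustrative helper, not part of RAxML):

    /* Dot products of the conditional-likelihood vector x[0..3] with four
     * consecutive rows of a 4x4 matrix m, as produced by one
     * simde_mm_mul_pd pair plus three simde_mm_hadd_pd calls above. */
    static void dot4x4_sketch(const double x[4], const double m[16],
                              double out[4])
    {
      for (int k = 0; k < 4; k++) {
        double acc = 0.0;
        for (int s = 0; s < 4; s++)
          acc += x[s] * m[k * 4 + s];
        out[k] = acc;
      }
    }
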
@@ -3077,8 +3069,8 @@
 	    }	
 	  else
 	    {
-	      _mm_store_pd(x3, EV_t_l0_k0);
-	      _mm_store_pd(&x3[2], EV_t_l2_k0);
+	      simde_mm_store_pd(x3, EV_t_l0_k0);
+	      simde_mm_store_pd(&x3[2], EV_t_l2_k0);
 	    }
 	  	  
 	}
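
The underflow test ported just above clears the sign bit by ANDing with absMask, compares both lanes against minlikelihood_sse, and only scales when simde_mm_movemask_pd reports both lanes set (== 3) for each pair, i.e. when all four entries have underflowed. In scalar terms (sketch, using fabs from <math.h>):

    #include <math.h>

    /* Scale a site only when every freshly computed entry is smaller than
     * minlikelihood in magnitude; the caller then multiplies the vector by
     * twotothe256 (2^256) and, under fast scaling, bumps the scaling
     * counter by wgt[i]. */
    static int needs_scaling_sketch(const double x3[4], double minlikelihood)
    {
      for (int l = 0; l < 4; l++)
        if (fabs(x3[l]) >= minlikelihood)
          return 0;
      return 1;
    }
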
@@ -3123,7 +3115,7 @@
     maxima[2] __attribute__ ((aligned (BYTE_ALIGNMENT))),       
     EV_t[16] __attribute__ ((aligned (BYTE_ALIGNMENT)));      
     
-  __m128d 
+  simde__m128d 
     values[8],
     EVV[8];  
 
@@ -3132,7 +3124,7 @@
       EV_t[4 * l + k] = EV[4 * k + l];
 
   for(k = 0; k < 8; k++)
-    EVV[k] = _mm_load_pd(&EV_t[k * 2]);
+    EVV[k] = simde_mm_load_pd(&EV_t[k * 2]);
    
   switch(tipCase)
     {
@@ -3143,8 +3135,8 @@
 
 	for (i = 1; i < 16; i++)
 	  {
-	    __m128d x1_1 = _mm_load_pd(&(tipVector[i*4]));
-	    __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2]));	   
+	    simde__m128d x1_1 = simde_mm_load_pd(&(tipVector[i*4]));
+	    simde__m128d x1_2 = simde_mm_load_pd(&(tipVector[i*4 + 2]));	   
 
 
 	    if(mask32[i] & x1_presenceMap)
@@ -3152,16 +3144,16 @@
 		for (j = 0; j < 4; j++)
 		  for (k = 0; k < 4; k++)
 		    {		 
-		      __m128d left1 = _mm_load_pd(&left[j*16 + k*4]);
-		      __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]);
+		      simde__m128d left1 = simde_mm_load_pd(&left[j*16 + k*4]);
+		      simde__m128d left2 = simde_mm_load_pd(&left[j*16 + k*4 + 2]);
 		      
-		      __m128d acc = _mm_setzero_pd();
+		      simde__m128d acc = simde_mm_setzero_pd();
 		      
-		      acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
-		      acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
+		      acc = simde_mm_add_pd(acc, simde_mm_mul_pd(left1, x1_1));
+		      acc = simde_mm_add_pd(acc, simde_mm_mul_pd(left2, x1_2));
 		      
-		      acc = _mm_hadd_pd(acc, acc);
-		      _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);
+		      acc = simde_mm_hadd_pd(acc, acc);
+		      simde_mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);
 		    }
 	      }
 
@@ -3171,16 +3163,16 @@
 		for (j = 0; j < 4; j++)
 		  for (k = 0; k < 4; k++)
 		    {
-		      __m128d left1 = _mm_load_pd(&right[j*16 + k*4]);
-		      __m128d left2 = _mm_load_pd(&right[j*16 + k*4 + 2]);
+		      simde__m128d left1 = simde_mm_load_pd(&right[j*16 + k*4]);
+		      simde__m128d left2 = simde_mm_load_pd(&right[j*16 + k*4 + 2]);
 		      
-		      __m128d acc = _mm_setzero_pd();
+		      simde__m128d acc = simde_mm_setzero_pd();
 		      
-		      acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
-		      acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
+		      acc = simde_mm_add_pd(acc, simde_mm_mul_pd(left1, x1_1));
+		      acc = simde_mm_add_pd(acc, simde_mm_mul_pd(left2, x1_2));
 		      
-		      acc = _mm_hadd_pd(acc, acc);
-		      _mm_storel_pd(&umpX2[i*16 + j*4 + k], acc);
+		      acc = simde_mm_hadd_pd(acc, acc);
+		      simde_mm_storel_pd(&umpX2[i*16 + j*4 + k], acc);
 		      
 		    }
 	      }
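
These two hunks port the tip lookup tables: for every possible (possibly ambiguous) nucleotide code i, the dot product of its tip-likelihood vector with each of the 16 P-matrix rows is precomputed once, so the per-site inner loop reduces to an indexed load. x1_presenceMap skips codes that never occur in the alignment. A scalar sketch of the table being filled (hypothetical helper, not RAxML's code):

    /* umpX1[i*16 + j*4 + k] = dot(tipVector[i*4 .. i*4+3],
     *                             left[j*16 + k*4 .. j*16 + k*4+3])
     * for tip-state code i, rate category j, ancestral state k. */
    static void precompute_ump_sketch(const double *tipVector,
                                      const double *left,
                                      double *umpX1)
    {
      for (int i = 1; i < 16; i++)
        for (int j = 0; j < 4; j++)
          for (int k = 0; k < 4; k++) {
            double acc = 0.0;
            for (int s = 0; s < 4; s++)
              acc += left[j * 16 + k * 4 + s] * tipVector[i * 4 + s];
            umpX1[i * 16 + j * 4 + k] = acc;
          }
    }
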
@@ -3196,56 +3188,56 @@
 	    
 	    for (j = 0; j < 4; j++)
 	       {				 		  		  		   
-		 __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
-		 __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
+		 simde__m128d uX1_k0_sse = simde_mm_load_pd( &uX1[j * 4] );
+		 simde__m128d uX1_k2_sse = simde_mm_load_pd( &uX1[j * 4 + 2] );
 		 				  
 		   
-		 __m128d uX2_k0_sse = _mm_load_pd( &uX2[j * 4] );
-		 __m128d uX2_k2_sse = _mm_load_pd( &uX2[j * 4 + 2] );
+		 simde__m128d uX2_k0_sse = simde_mm_load_pd( &uX2[j * 4] );
+		 simde__m128d uX2_k2_sse = simde_mm_load_pd( &uX2[j * 4 + 2] );
  		 
 
 		
 		 /* multiply left * right */		
 		 
-		 __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, uX2_k0_sse );
-		 __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, uX2_k2_sse );
+		 simde__m128d x1px2_k0 = simde_mm_mul_pd( uX1_k0_sse, uX2_k0_sse );
+		 simde__m128d x1px2_k2 = simde_mm_mul_pd( uX1_k2_sse, uX2_k2_sse );
 		 
 		 
 		 
 		 /* multiply with EV matrix (!?) */
 		
 		 
-		 __m128d EV_t_l0_k0 = EVV[0];
-		 __m128d EV_t_l0_k2 = EVV[1];
-		 __m128d EV_t_l1_k0 = EVV[2];
-		 __m128d EV_t_l1_k2 = EVV[3];
-		 __m128d EV_t_l2_k0 = EVV[4];
-		 __m128d EV_t_l2_k2 = EVV[5];
-		 __m128d EV_t_l3_k0 = EVV[6]; 
-		 __m128d EV_t_l3_k2 = EVV[7];
-		 
-		 EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-		 EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-		 EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-		 
-		 EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-		 EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-		 
-		 EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-		 EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-		 
-		 EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-		 EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-		 EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-		 
-		 EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-		 EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-		 EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+		 simde__m128d EV_t_l0_k0 = EVV[0];
+		 simde__m128d EV_t_l0_k2 = EVV[1];
+		 simde__m128d EV_t_l1_k0 = EVV[2];
+		 simde__m128d EV_t_l1_k2 = EVV[3];
+		 simde__m128d EV_t_l2_k0 = EVV[4];
+		 simde__m128d EV_t_l2_k2 = EVV[5];
+		 simde__m128d EV_t_l3_k0 = EVV[6]; 
+		 simde__m128d EV_t_l3_k2 = EVV[7];
+		 
+		 EV_t_l0_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+		 EV_t_l0_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+		 EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+		 
+		 EV_t_l1_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+		 EV_t_l1_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+		 
+		 EV_t_l1_k0 = simde_mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+		 EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+		 
+		 EV_t_l2_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+		 EV_t_l2_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+		 EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+		 
+		 EV_t_l3_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+		 EV_t_l3_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+		 EV_t_l3_k0 = simde_mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
 		 
-		 EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+		 EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
 		 
-		 _mm_store_pd( &x3[j * 4 + 0], EV_t_l0_k0 );
-		 _mm_store_pd( &x3[j * 4 + 2], EV_t_l2_k0 );
+		 simde_mm_store_pd( &x3[j * 4 + 0], EV_t_l0_k0 );
+		 simde_mm_store_pd( &x3[j * 4 + 2], EV_t_l2_k0 );
 	       }
 	  }
       }
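
The EV_t_* block renamed above finishes a site update: the per-state products x1px2 are multiplied by the eigenvector matrix, read through the transposed copy EV_t (filled earlier as EV_t[4*l + k] = EV[4*k + l], so each output lane can be formed with the same hadd reduction). Scalar equivalent (illustrative, not RAxML's code):

    /* x3[l] = sum_k p[k] * EV[4*k + l], where p = x1px2 holds the four
     * per-ancestral-state products for one rate category. */
    static void ev_multiply_sketch(const double p[4], const double EV[16],
                                   double x3[4])
    {
      for (int l = 0; l < 4; l++) {
        double acc = 0.0;
        for (int k = 0; k < 4; k++)
          acc += p[k] * EV[4 * k + l];
        x3[l] = acc;
      }
    }
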
@@ -3259,29 +3251,29 @@
 	  {
 	    if(mask32[i] & x1_presenceMap)
 	      {
-		__m128d x1_1 = _mm_load_pd(&(tipVector[i*4]));
-		__m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2]));	   
+		simde__m128d x1_1 = simde_mm_load_pd(&(tipVector[i*4]));
+		simde__m128d x1_2 = simde_mm_load_pd(&(tipVector[i*4 + 2]));	   
 		
 		for (j = 0; j < 4; j++)
 		  for (k = 0; k < 4; k++)
 		    {		 
-		      __m128d left1 = _mm_load_pd(&left[j*16 + k*4]);
-		      __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]);
+		      simde__m128d left1 = simde_mm_load_pd(&left[j*16 + k*4]);
+		      simde__m128d left2 = simde_mm_load_pd(&left[j*16 + k*4 + 2]);
 		      
-		      __m128d acc = _mm_setzero_pd();
+		      simde__m128d acc = simde_mm_setzero_pd();
 		      
-		      acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
-		      acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
+		      acc = simde_mm_add_pd(acc, simde_mm_mul_pd(left1, x1_1));
+		      acc = simde_mm_add_pd(acc, simde_mm_mul_pd(left2, x1_2));
 		      
-		      acc = _mm_hadd_pd(acc, acc);
-		      _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);		 
+		      acc = simde_mm_hadd_pd(acc, acc);
+		      simde_mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);		 
 		    }
 	      }	   
 	  }
 
 	 for (i = 0; i < n; i++)
 	   {
-	     __m128d maxv =_mm_setzero_pd();
+	     simde__m128d maxv = simde_mm_setzero_pd();
 	     
 	     x2 = &x2_start[i * 16];
 	     x3 = &x3_start[i * 16];
@@ -3299,117 +3291,117 @@
 		 double *right_k1_p = &right[j*16 + 1*4];
 		 double *right_k2_p = &right[j*16 + 2*4];
 		 double *right_k3_p = &right[j*16 + 3*4];
-		 __m128d x2_0 = _mm_load_pd( &x2_p[0] );
-		 __m128d x2_2 = _mm_load_pd( &x2_p[2] );
+		 simde__m128d x2_0 = simde_mm_load_pd( &x2_p[0] );
+		 simde__m128d x2_2 = simde_mm_load_pd( &x2_p[2] );
 
-		 __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
-		 __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
-		 __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
-		 __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
-		 __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
-		 __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
-		 __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
-		 __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
+		 simde__m128d right_k0_0 = simde_mm_load_pd( &right_k0_p[0] );
+		 simde__m128d right_k0_2 = simde_mm_load_pd( &right_k0_p[2] );
+		 simde__m128d right_k1_0 = simde_mm_load_pd( &right_k1_p[0] );
+		 simde__m128d right_k1_2 = simde_mm_load_pd( &right_k1_p[2] );
+		 simde__m128d right_k2_0 = simde_mm_load_pd( &right_k2_p[0] );
+		 simde__m128d right_k2_2 = simde_mm_load_pd( &right_k2_p[2] );
+		 simde__m128d right_k3_0 = simde_mm_load_pd( &right_k3_p[0] );
+		 simde__m128d right_k3_2 = simde_mm_load_pd( &right_k3_p[2] );
 
 
 
-		 right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-		 right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+		 right_k0_0 = simde_mm_mul_pd( x2_0, right_k0_0);
+		 right_k0_2 = simde_mm_mul_pd( x2_2, right_k0_2);
 
-		 right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-		 right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+		 right_k1_0 = simde_mm_mul_pd( x2_0, right_k1_0);
+		 right_k1_2 = simde_mm_mul_pd( x2_2, right_k1_2);
 
-		 right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-		 right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-		 right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+		 right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k0_2);
+		 right_k1_0 = simde_mm_hadd_pd( right_k1_0, right_k1_2);
+		 right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k1_0);
 
 
-		 right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-		 right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+		 right_k2_0 = simde_mm_mul_pd( x2_0, right_k2_0);
+		 right_k2_2 = simde_mm_mul_pd( x2_2, right_k2_2);
 
 
-		 right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-		 right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+		 right_k3_0 = simde_mm_mul_pd( x2_0, right_k3_0);
+		 right_k3_2 = simde_mm_mul_pd( x2_2, right_k3_2);
 
-		 right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-		 right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-		 right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
+		 right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k2_2);
+		 right_k3_0 = simde_mm_hadd_pd( right_k3_0, right_k3_2);
+		 right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k3_0);
 
 		 {
 		  
 		   /* load left side from tip vector */
 		  
 		   
-		   __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
-		   __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
+		   simde__m128d uX1_k0_sse = simde_mm_load_pd( &uX1[j * 4] );
+		   simde__m128d uX1_k2_sse = simde_mm_load_pd( &uX1[j * 4 + 2] );
 		 
 		 
 		   
 		   /* multiply left * right */
 		  
 		   
-		   __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, right_k0_0 );
-		   __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, right_k2_0 );
+		   simde__m128d x1px2_k0 = simde_mm_mul_pd( uX1_k0_sse, right_k0_0 );
+		   simde__m128d x1px2_k2 = simde_mm_mul_pd( uX1_k2_sse, right_k2_0 );
 		   
 		   
 		   
 		   /* multiply with EV matrix (!?) */		 
 
-		   __m128d EV_t_l0_k0 = EVV[0];
-		   __m128d EV_t_l0_k2 = EVV[1];
-		   __m128d EV_t_l1_k0 = EVV[2];
-		   __m128d EV_t_l1_k2 = EVV[3];
-		   __m128d EV_t_l2_k0 = EVV[4];
-		   __m128d EV_t_l2_k2 = EVV[5];
-		   __m128d EV_t_l3_k0 = EVV[6]; 
-		   __m128d EV_t_l3_k2 = EVV[7];
+		   simde__m128d EV_t_l0_k0 = EVV[0];
+		   simde__m128d EV_t_l0_k2 = EVV[1];
+		   simde__m128d EV_t_l1_k0 = EVV[2];
+		   simde__m128d EV_t_l1_k2 = EVV[3];
+		   simde__m128d EV_t_l2_k0 = EVV[4];
+		   simde__m128d EV_t_l2_k2 = EVV[5];
+		   simde__m128d EV_t_l3_k0 = EVV[6]; 
+		   simde__m128d EV_t_l3_k2 = EVV[7];
 
 		   
-		   EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-		   EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-		   EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+		   EV_t_l0_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+		   EV_t_l0_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+		   EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
 		   
-		   EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-		   EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+		   EV_t_l1_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+		   EV_t_l1_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
 		   
-		   EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-		   EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+		   EV_t_l1_k0 = simde_mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+		   EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
 		   
-		   EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-		   EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-		   EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+		   EV_t_l2_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+		   EV_t_l2_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+		   EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
 		   		   
-		   EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-		   EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-		   EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+		   EV_t_l3_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+		   EV_t_l3_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+		   EV_t_l3_k0 = simde_mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
 		   
-		   EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+		   EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
 		   
 		   values[j * 2]     = EV_t_l0_k0;
 		   values[j * 2 + 1] = EV_t_l2_k0;		   		   
 		   
-		   maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
-		   maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));		   
+		   maxv = simde_mm_max_pd(maxv, simde_mm_and_pd(EV_t_l0_k0, absMask.m));
+		   maxv = simde_mm_max_pd(maxv, simde_mm_and_pd(EV_t_l2_k0, absMask.m));		   
 		 }
 	       }
 
 	     
-	     _mm_store_pd(maxima, maxv);
+	     simde_mm_store_pd(maxima, maxv);
 
 	     max = MAX(maxima[0], maxima[1]);
 
 	     if(max < minlikelihood)
 	       {
-		 __m128d sv = _mm_set1_pd(twotothe256);
+		 simde__m128d sv = simde_mm_set1_pd(twotothe256);
 	       		       	   	 	     
-		 _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));	   
-		 _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
-		 _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
-		 _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
-		 _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));	   
-		 _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
-		 _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
-		 _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));	     
+		 simde_mm_store_pd(&x3[0], simde_mm_mul_pd(values[0], sv));	   
+		 simde_mm_store_pd(&x3[2], simde_mm_mul_pd(values[1], sv));
+		 simde_mm_store_pd(&x3[4], simde_mm_mul_pd(values[2], sv));
+		 simde_mm_store_pd(&x3[6], simde_mm_mul_pd(values[3], sv));
+		 simde_mm_store_pd(&x3[8], simde_mm_mul_pd(values[4], sv));	   
+		 simde_mm_store_pd(&x3[10], simde_mm_mul_pd(values[5], sv));
+		 simde_mm_store_pd(&x3[12], simde_mm_mul_pd(values[6], sv));
+		 simde_mm_store_pd(&x3[14], simde_mm_mul_pd(values[7], sv));	     
 		 
 		 if(useFastScaling)
 		   addScale += wgt[i];
@@ -3418,14 +3410,14 @@
 	       }
 	     else
 	       {
-		 _mm_store_pd(&x3[0], values[0]);	   
-		 _mm_store_pd(&x3[2], values[1]);
-		 _mm_store_pd(&x3[4], values[2]);
-		 _mm_store_pd(&x3[6], values[3]);
-		 _mm_store_pd(&x3[8], values[4]);	   
-		 _mm_store_pd(&x3[10], values[5]);
-		 _mm_store_pd(&x3[12], values[6]);
-		 _mm_store_pd(&x3[14], values[7]);
+		 simde_mm_store_pd(&x3[0], values[0]);	   
+		 simde_mm_store_pd(&x3[2], values[1]);
+		 simde_mm_store_pd(&x3[4], values[2]);
+		 simde_mm_store_pd(&x3[6], values[3]);
+		 simde_mm_store_pd(&x3[8], values[4]);	   
+		 simde_mm_store_pd(&x3[10], values[5]);
+		 simde_mm_store_pd(&x3[12], values[6]);
+		 simde_mm_store_pd(&x3[14], values[7]);
 	       }
 	   }
       }
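
In the 16-entry loops (4 rate categories x 4 states) the underflow test is amortized: maxv accumulates the largest magnitude seen while the vector is built, and the whole site is rescaled by 2^256 only if that maximum is still below minlikelihood. A scalar sketch (hypothetical helper; the real code keeps the intermediate results in values[8] before storing):

    #include <math.h>

    static int rescale_site_sketch(double x3[16], double minlikelihood,
                                   double twotothe256)
    {
      double max = 0.0;
      for (int l = 0; l < 16; l++)
        if (fabs(x3[l]) > max)
          max = fabs(x3[l]);
      if (max >= minlikelihood)
        return 0;               /* values stored unchanged */
      for (int l = 0; l < 16; l++)
        x3[l] *= twotothe256;   /* caller counts this via addScale += wgt[i] */
      return 1;
    }
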
@@ -3433,7 +3425,7 @@
     case INNER_INNER:     
      for (i = 0; i < n; i++)
        {
-	 __m128d maxv =_mm_setzero_pd();
+	 simde__m128d maxv = simde_mm_setzero_pd();
 	 
 
 	 x1 = &x1_start[i * 16];
@@ -3449,37 +3441,37 @@
 	     double *left_k2_p = &left[j*16 + 2*4];
 	     double *left_k3_p = &left[j*16 + 3*4];
 	     
-	     __m128d x1_0 = _mm_load_pd( &x1_p[0] );
-	     __m128d x1_2 = _mm_load_pd( &x1_p[2] );
+	     simde__m128d x1_0 = simde_mm_load_pd( &x1_p[0] );
+	     simde__m128d x1_2 = simde_mm_load_pd( &x1_p[2] );
 	     
-	     __m128d left_k0_0 = _mm_load_pd( &left_k0_p[0] );
-	     __m128d left_k0_2 = _mm_load_pd( &left_k0_p[2] );
-	     __m128d left_k1_0 = _mm_load_pd( &left_k1_p[0] );
-	     __m128d left_k1_2 = _mm_load_pd( &left_k1_p[2] );
-	     __m128d left_k2_0 = _mm_load_pd( &left_k2_p[0] );
-	     __m128d left_k2_2 = _mm_load_pd( &left_k2_p[2] );
-	     __m128d left_k3_0 = _mm_load_pd( &left_k3_p[0] );
-	     __m128d left_k3_2 = _mm_load_pd( &left_k3_p[2] );
+	     simde__m128d left_k0_0 = simde_mm_load_pd( &left_k0_p[0] );
+	     simde__m128d left_k0_2 = simde_mm_load_pd( &left_k0_p[2] );
+	     simde__m128d left_k1_0 = simde_mm_load_pd( &left_k1_p[0] );
+	     simde__m128d left_k1_2 = simde_mm_load_pd( &left_k1_p[2] );
+	     simde__m128d left_k2_0 = simde_mm_load_pd( &left_k2_p[0] );
+	     simde__m128d left_k2_2 = simde_mm_load_pd( &left_k2_p[2] );
+	     simde__m128d left_k3_0 = simde_mm_load_pd( &left_k3_p[0] );
+	     simde__m128d left_k3_2 = simde_mm_load_pd( &left_k3_p[2] );
 	     
-	     left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-	     left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
+	     left_k0_0 = simde_mm_mul_pd(x1_0, left_k0_0);
+	     left_k0_2 = simde_mm_mul_pd(x1_2, left_k0_2);
 	     
-	     left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-	     left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
+	     left_k1_0 = simde_mm_mul_pd(x1_0, left_k1_0);
+	     left_k1_2 = simde_mm_mul_pd(x1_2, left_k1_2);
 	     
-	     left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-	     left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-	     left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
+	     left_k0_0 = simde_mm_hadd_pd( left_k0_0, left_k0_2 );
+	     left_k1_0 = simde_mm_hadd_pd( left_k1_0, left_k1_2);
+	     left_k0_0 = simde_mm_hadd_pd( left_k0_0, left_k1_0);
 	     
-	     left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-	     left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
+	     left_k2_0 = simde_mm_mul_pd(x1_0, left_k2_0);
+	     left_k2_2 = simde_mm_mul_pd(x1_2, left_k2_2);
 	     
-	     left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-	     left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
+	     left_k3_0 = simde_mm_mul_pd(x1_0, left_k3_0);
+	     left_k3_2 = simde_mm_mul_pd(x1_2, left_k3_2);
 	     
-	     left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-	     left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-	     left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
+	     left_k2_0 = simde_mm_hadd_pd( left_k2_0, left_k2_2);
+	     left_k3_0 = simde_mm_hadd_pd( left_k3_0, left_k3_2);
+	     left_k2_0 = simde_mm_hadd_pd( left_k2_0, left_k3_0);
 	     
 	     
 	     
@@ -3490,106 +3482,106 @@
 	     double *right_k1_p = &right[j*16 + 1*4];
 	     double *right_k2_p = &right[j*16 + 2*4];
 	     double *right_k3_p = &right[j*16 + 3*4];
-	     __m128d x2_0 = _mm_load_pd( &x2_p[0] );
-	     __m128d x2_2 = _mm_load_pd( &x2_p[2] );
+	     simde__m128d x2_0 = simde_mm_load_pd( &x2_p[0] );
+	     simde__m128d x2_2 = simde_mm_load_pd( &x2_p[2] );
 	     
-	     __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
-	     __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
-	     __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
-	     __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
-	     __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
-	     __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
-	     __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
-	     __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
+	     simde__m128d right_k0_0 = simde_mm_load_pd( &right_k0_p[0] );
+	     simde__m128d right_k0_2 = simde_mm_load_pd( &right_k0_p[2] );
+	     simde__m128d right_k1_0 = simde_mm_load_pd( &right_k1_p[0] );
+	     simde__m128d right_k1_2 = simde_mm_load_pd( &right_k1_p[2] );
+	     simde__m128d right_k2_0 = simde_mm_load_pd( &right_k2_p[0] );
+	     simde__m128d right_k2_2 = simde_mm_load_pd( &right_k2_p[2] );
+	     simde__m128d right_k3_0 = simde_mm_load_pd( &right_k3_p[0] );
+	     simde__m128d right_k3_2 = simde_mm_load_pd( &right_k3_p[2] );
 	     
-	     right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-	     right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+	     right_k0_0 = simde_mm_mul_pd( x2_0, right_k0_0);
+	     right_k0_2 = simde_mm_mul_pd( x2_2, right_k0_2);
 	     
-	     right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-	     right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+	     right_k1_0 = simde_mm_mul_pd( x2_0, right_k1_0);
+	     right_k1_2 = simde_mm_mul_pd( x2_2, right_k1_2);
 	     
-	     right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-	     right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-	     right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+	     right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k0_2);
+	     right_k1_0 = simde_mm_hadd_pd( right_k1_0, right_k1_2);
+	     right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k1_0);
 	     
-	     right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-	     right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+	     right_k2_0 = simde_mm_mul_pd( x2_0, right_k2_0);
+	     right_k2_2 = simde_mm_mul_pd( x2_2, right_k2_2);
 	     
 	     
-	     right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-	     right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+	     right_k3_0 = simde_mm_mul_pd( x2_0, right_k3_0);
+	     right_k3_2 = simde_mm_mul_pd( x2_2, right_k3_2);
 	     
-	     right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-	     right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-	     right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
+	     right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k2_2);
+	     right_k3_0 = simde_mm_hadd_pd( right_k3_0, right_k3_2);
+	     right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k3_0);	   
 
              
              /* multiply left * right */
             
 
-	     __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-	     __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
+	     simde__m128d x1px2_k0 = simde_mm_mul_pd( left_k0_0, right_k0_0 );
+	     simde__m128d x1px2_k2 = simde_mm_mul_pd( left_k2_0, right_k2_0 );
 
 
              
              /* multiply with EV matrix (!?) */            
 
-	     __m128d EV_t_l0_k0 = EVV[0];
-	     __m128d EV_t_l0_k2 = EVV[1];
-	     __m128d EV_t_l1_k0 = EVV[2];
-	     __m128d EV_t_l1_k2 = EVV[3];
-	     __m128d EV_t_l2_k0 = EVV[4];
-	     __m128d EV_t_l2_k2 = EVV[5];
-	     __m128d EV_t_l3_k0 = EVV[6]; 
-	     __m128d EV_t_l3_k2 = EVV[7];
+	     simde__m128d EV_t_l0_k0 = EVV[0];
+	     simde__m128d EV_t_l0_k2 = EVV[1];
+	     simde__m128d EV_t_l1_k0 = EVV[2];
+	     simde__m128d EV_t_l1_k2 = EVV[3];
+	     simde__m128d EV_t_l2_k0 = EVV[4];
+	     simde__m128d EV_t_l2_k2 = EVV[5];
+	     simde__m128d EV_t_l3_k0 = EVV[6]; 
+	     simde__m128d EV_t_l3_k2 = EVV[7];
 
 
-	    EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	    EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+	    EV_t_l0_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+	    EV_t_l0_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+	    EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
 
-	    EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	    EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+	    EV_t_l1_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+	    EV_t_l1_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
 
-	    EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+	    EV_t_l1_k0 = simde_mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+	    EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
 
-	    EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	    EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+	    EV_t_l2_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+	    EV_t_l2_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+	    EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
 
 
-	    EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-            EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-            EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+	    EV_t_l3_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+            EV_t_l3_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+            EV_t_l3_k0 = simde_mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
 
-            EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+            EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
 
 	    
 	    values[j * 2] = EV_t_l0_k0;
 	    values[j * 2 + 1] = EV_t_l2_k0;            	   	    
 
-	    maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
-	    maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));
+	    maxv = simde_mm_max_pd(maxv, simde_mm_and_pd(EV_t_l0_k0, absMask.m));
+	    maxv = simde_mm_max_pd(maxv, simde_mm_and_pd(EV_t_l2_k0, absMask.m));
            }
 	 	 
 	 
-	 _mm_store_pd(maxima, maxv);
+	 simde_mm_store_pd(maxima, maxv);
 	 
 	 max = MAX(maxima[0], maxima[1]);
 	 
 	 if(max < minlikelihood)
 	   {
-	     __m128d sv = _mm_set1_pd(twotothe256);
+	     simde__m128d sv = simde_mm_set1_pd(twotothe256);
 	       		       	   	 	     
-	     _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));	   
-	     _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
-	     _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
-	     _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
-	     _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));	   
-	     _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
-	     _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
-	     _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));	     
+	     simde_mm_store_pd(&x3[0], simde_mm_mul_pd(values[0], sv));	   
+	     simde_mm_store_pd(&x3[2], simde_mm_mul_pd(values[1], sv));
+	     simde_mm_store_pd(&x3[4], simde_mm_mul_pd(values[2], sv));
+	     simde_mm_store_pd(&x3[6], simde_mm_mul_pd(values[3], sv));
+	     simde_mm_store_pd(&x3[8], simde_mm_mul_pd(values[4], sv));	   
+	     simde_mm_store_pd(&x3[10], simde_mm_mul_pd(values[5], sv));
+	     simde_mm_store_pd(&x3[12], simde_mm_mul_pd(values[6], sv));
+	     simde_mm_store_pd(&x3[14], simde_mm_mul_pd(values[7], sv));	     
 	     
 	     if(useFastScaling)
 	       addScale += wgt[i];
@@ -3598,14 +3590,14 @@
 	   }
 	 else
 	   {
-	     _mm_store_pd(&x3[0], values[0]);	   
-	     _mm_store_pd(&x3[2], values[1]);
-	     _mm_store_pd(&x3[4], values[2]);
-	     _mm_store_pd(&x3[6], values[3]);
-	     _mm_store_pd(&x3[8], values[4]);	   
-	     _mm_store_pd(&x3[10], values[5]);
-	     _mm_store_pd(&x3[12], values[6]);
-	     _mm_store_pd(&x3[14], values[7]);
+	     simde_mm_store_pd(&x3[0], values[0]);	   
+	     simde_mm_store_pd(&x3[2], values[1]);
+	     simde_mm_store_pd(&x3[4], values[2]);
+	     simde_mm_store_pd(&x3[6], values[3]);
+	     simde_mm_store_pd(&x3[8], values[4]);	   
+	     simde_mm_store_pd(&x3[10], values[5]);
+	     simde_mm_store_pd(&x3[12], values[6]);
+	     simde_mm_store_pd(&x3[14], values[7]);
 	   }	 
        }
    
@@ -3648,7 +3640,7 @@
     maxima[2] __attribute__ ((aligned (BYTE_ALIGNMENT))),        
     EV_t[16] __attribute__ ((aligned (BYTE_ALIGNMENT)));      
     
-  __m128d 
+  simde__m128d 
     values[8],
     EVV[8];  
 
@@ -3657,7 +3649,7 @@
       EV_t[4 * l + k] = EV[4 * k + l];
 
   for(k = 0; k < 8; k++)
-    EVV[k] = _mm_load_pd(&EV_t[k * 2]);      
+    EVV[k] = simde_mm_load_pd(&EV_t[k * 2]);      
  
   
 
@@ -3670,24 +3662,24 @@
 
 	for (i = 1; i < 16; i++)
 	  {	    
-	    __m128d x1_1 = _mm_load_pd(&(tipVector[i*4]));
-	    __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2]));	   
+	    simde__m128d x1_1 = simde_mm_load_pd(&(tipVector[i*4]));
+	    simde__m128d x1_2 = simde_mm_load_pd(&(tipVector[i*4 + 2]));	   
 	    
 	    if((mask32[i] & x1_presenceMap) || i == 15)
 	      {
 		for (j = 0; j < 4; j++)
 		  for (k = 0; k < 4; k++)
 		    {			 	 
-		      __m128d left1 = _mm_load_pd(&left[j*16 + k*4]);
-		      __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]);
+		      simde__m128d left1 = simde_mm_load_pd(&left[j*16 + k*4]);
+		      simde__m128d left2 = simde_mm_load_pd(&left[j*16 + k*4 + 2]);
 		      
-		      __m128d acc = _mm_setzero_pd();
+		      simde__m128d acc = simde_mm_setzero_pd();
 		      
-		      acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
-		      acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
+		      acc = simde_mm_add_pd(acc, simde_mm_mul_pd(left1, x1_1));
+		      acc = simde_mm_add_pd(acc, simde_mm_mul_pd(left2, x1_2));
 		      
-		      acc = _mm_hadd_pd(acc, acc);
-		      _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);
+		      acc = simde_mm_hadd_pd(acc, acc);
+		      simde_mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);
 		    }
 	      }
 	  
@@ -3696,16 +3688,16 @@
 		for (j = 0; j < 4; j++)
 		  for (k = 0; k < 4; k++)
 		    {
-		      __m128d left1 = _mm_load_pd(&right[j*16 + k*4]);
-		      __m128d left2 = _mm_load_pd(&right[j*16 + k*4 + 2]);
+		      simde__m128d left1 = simde_mm_load_pd(&right[j*16 + k*4]);
+		      simde__m128d left2 = simde_mm_load_pd(&right[j*16 + k*4 + 2]);
 		      
-		      __m128d acc = _mm_setzero_pd();
+		      simde__m128d acc = simde_mm_setzero_pd();
 		      
-		      acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
-		      acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
+		      acc = simde_mm_add_pd(acc, simde_mm_mul_pd(left1, x1_1));
+		      acc = simde_mm_add_pd(acc, simde_mm_mul_pd(left2, x1_2));
 		      
-		      acc = _mm_hadd_pd(acc, acc);
-		      _mm_storel_pd(&umpX2[i*16 + j*4 + k], acc);
+		      acc = simde_mm_hadd_pd(acc, acc);
+		      simde_mm_storel_pd(&umpX2[i*16 + j*4 + k], acc);
 		      
 		    }
 	      }   	
@@ -3716,46 +3708,46 @@
 	
 	for (j = 0; j < 4; j++)
 	  {				 		  		  		   
-	    __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
-	    __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
+	    simde__m128d uX1_k0_sse = simde_mm_load_pd( &uX1[j * 4] );
+	    simde__m128d uX1_k2_sse = simde_mm_load_pd( &uX1[j * 4 + 2] );
 	    	    
-	    __m128d uX2_k0_sse = _mm_load_pd( &uX2[j * 4] );
-	    __m128d uX2_k2_sse = _mm_load_pd( &uX2[j * 4 + 2] );
+	    simde__m128d uX2_k0_sse = simde_mm_load_pd( &uX2[j * 4] );
+	    simde__m128d uX2_k2_sse = simde_mm_load_pd( &uX2[j * 4 + 2] );
 	    
-	    __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, uX2_k0_sse );
-	    __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, uX2_k2_sse );		    		    		   
+	    simde__m128d x1px2_k0 = simde_mm_mul_pd( uX1_k0_sse, uX2_k0_sse );
+	    simde__m128d x1px2_k2 = simde_mm_mul_pd( uX1_k2_sse, uX2_k2_sse );		    		    		   
 	    
-	    __m128d EV_t_l0_k0 = EVV[0];
-	    __m128d EV_t_l0_k2 = EVV[1];
-	    __m128d EV_t_l1_k0 = EVV[2];
-	    __m128d EV_t_l1_k2 = EVV[3];
-	    __m128d EV_t_l2_k0 = EVV[4];
-	    __m128d EV_t_l2_k2 = EVV[5];
-	    __m128d EV_t_l3_k0 = EVV[6]; 
-	    __m128d EV_t_l3_k2 = EVV[7];
-	    
-	    EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	    EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-	    
-	    EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	    EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-	    
-	    EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-	    
-	    EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	    EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-	    
-	    EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	    EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	    EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+	    simde__m128d EV_t_l0_k0 = EVV[0];
+	    simde__m128d EV_t_l0_k2 = EVV[1];
+	    simde__m128d EV_t_l1_k0 = EVV[2];
+	    simde__m128d EV_t_l1_k2 = EVV[3];
+	    simde__m128d EV_t_l2_k0 = EVV[4];
+	    simde__m128d EV_t_l2_k2 = EVV[5];
+	    simde__m128d EV_t_l3_k0 = EVV[6]; 
+	    simde__m128d EV_t_l3_k2 = EVV[7];
+	    
+	    EV_t_l0_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+	    EV_t_l0_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+	    EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+	    
+	    EV_t_l1_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+	    EV_t_l1_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+	    
+	    EV_t_l1_k0 = simde_mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+	    EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+	    
+	    EV_t_l2_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+	    EV_t_l2_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+	    EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+	    
+	    EV_t_l3_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+	    EV_t_l3_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+	    EV_t_l3_k0 = simde_mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
 	    
-	    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+	    EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
 	    	  
-	    _mm_store_pd( &x3_gapColumn[j * 4 + 0], EV_t_l0_k0 );
-	    _mm_store_pd( &x3_gapColumn[j * 4 + 2], EV_t_l2_k0 );	   
+	    simde_mm_store_pd( &x3_gapColumn[j * 4 + 0], EV_t_l0_k0 );
+	    simde_mm_store_pd( &x3_gapColumn[j * 4 + 2], EV_t_l2_k0 );	   
 	  }  
 	
        
@@ -3770,52 +3762,52 @@
 		
 		for (j = 0; j < 4; j++)
 		  {				 		  		  		   
-		    __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
-		    __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
+		    simde__m128d uX1_k0_sse = simde_mm_load_pd( &uX1[j * 4] );
+		    simde__m128d uX1_k2_sse = simde_mm_load_pd( &uX1[j * 4 + 2] );
 		    
 		    
-		    __m128d uX2_k0_sse = _mm_load_pd( &uX2[j * 4] );
-		    __m128d uX2_k2_sse = _mm_load_pd( &uX2[j * 4 + 2] );
+		    simde__m128d uX2_k0_sse = simde_mm_load_pd( &uX2[j * 4] );
+		    simde__m128d uX2_k2_sse = simde_mm_load_pd( &uX2[j * 4 + 2] );
 		    		    		    
 		    /* multiply left * right */		   
 		    
-		    __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, uX2_k0_sse );
-		    __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, uX2_k2_sse );
+		    simde__m128d x1px2_k0 = simde_mm_mul_pd( uX1_k0_sse, uX2_k0_sse );
+		    simde__m128d x1px2_k2 = simde_mm_mul_pd( uX1_k2_sse, uX2_k2_sse );
 		    
 		    		   
 		    /* multiply with EV matrix (!?) */		   
 		    
-		    __m128d EV_t_l0_k0 = EVV[0];
-		    __m128d EV_t_l0_k2 = EVV[1];
-		    __m128d EV_t_l1_k0 = EVV[2];
-		    __m128d EV_t_l1_k2 = EVV[3];
-		    __m128d EV_t_l2_k0 = EVV[4];
-		    __m128d EV_t_l2_k2 = EVV[5];
-		    __m128d EV_t_l3_k0 = EVV[6]; 
-		    __m128d EV_t_l3_k2 = EVV[7];
+		    simde__m128d EV_t_l0_k0 = EVV[0];
+		    simde__m128d EV_t_l0_k2 = EVV[1];
+		    simde__m128d EV_t_l1_k0 = EVV[2];
+		    simde__m128d EV_t_l1_k2 = EVV[3];
+		    simde__m128d EV_t_l2_k0 = EVV[4];
+		    simde__m128d EV_t_l2_k2 = EVV[5];
+		    simde__m128d EV_t_l3_k0 = EVV[6]; 
+		    simde__m128d EV_t_l3_k2 = EVV[7];
 		    
-		    EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-		    EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-		    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+		    EV_t_l0_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+		    EV_t_l0_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+		    EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
 		    
-		    EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-		    EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+		    EV_t_l1_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+		    EV_t_l1_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
 		    
-		    EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-		    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+		    EV_t_l1_k0 = simde_mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+		    EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
 		    
-		    EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-		    EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-		    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+		    EV_t_l2_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+		    EV_t_l2_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+		    EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
 		    
-		    EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-		    EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-		    EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+		    EV_t_l3_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+		    EV_t_l3_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+		    EV_t_l3_k0 = simde_mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
 		    
-		    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+		    EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
 		    
-		    _mm_store_pd( &x3[j * 4 + 0], EV_t_l0_k0 );
-		    _mm_store_pd( &x3[j * 4 + 2], EV_t_l2_k0 );
+		    simde_mm_store_pd( &x3[j * 4 + 0], EV_t_l0_k0 );
+		    simde_mm_store_pd( &x3[j * 4 + 2], EV_t_l2_k0 );
 		  }
 		
 		x3 += 16;
@@ -3833,28 +3825,28 @@
 	  {
 	    if((mask32[i] & x1_presenceMap) || i == 15)
 	      {
-		__m128d x1_1 = _mm_load_pd(&(tipVector[i*4]));
-		__m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2]));	   
+		simde__m128d x1_1 = simde_mm_load_pd(&(tipVector[i*4]));
+		simde__m128d x1_2 = simde_mm_load_pd(&(tipVector[i*4 + 2]));	   
 		
 		for (j = 0; j < 4; j++)
 		  for (k = 0; k < 4; k++)
 		    {		 
-		      __m128d left1 = _mm_load_pd(&left[j*16 + k*4]);
-		      __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]);
+		      simde__m128d left1 = simde_mm_load_pd(&left[j*16 + k*4]);
+		      simde__m128d left2 = simde_mm_load_pd(&left[j*16 + k*4 + 2]);
 		      
-		      __m128d acc = _mm_setzero_pd();
+		      simde__m128d acc = simde_mm_setzero_pd();
 		      
-		      acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
-		      acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
+		      acc = simde_mm_add_pd(acc, simde_mm_mul_pd(left1, x1_1));
+		      acc = simde_mm_add_pd(acc, simde_mm_mul_pd(left2, x1_2));
 		      
-		      acc = _mm_hadd_pd(acc, acc);
-		      _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);		 
+		      acc = simde_mm_hadd_pd(acc, acc);
+		      simde_mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);		 
 		    }
 	      }
 	  }
 
 	{
-	  __m128d maxv =_mm_setzero_pd();
+	  simde__m128d maxv = simde_mm_setzero_pd();
 	  
 	  scaleGap = 0;
 	  
@@ -3870,82 +3862,82 @@
 	      double *right_k1_p = &right[j*16 + 1*4];
 	      double *right_k2_p = &right[j*16 + 2*4];
 	      double *right_k3_p = &right[j*16 + 3*4];
-	      __m128d x2_0 = _mm_load_pd( &x2_p[0] );
-	      __m128d x2_2 = _mm_load_pd( &x2_p[2] );
+	      simde__m128d x2_0 = simde_mm_load_pd( &x2_p[0] );
+	      simde__m128d x2_2 = simde_mm_load_pd( &x2_p[2] );
 	      
-	      __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
-	      __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
-	      __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
-	      __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
-	      __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
-	      __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
-	      __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
-	      __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
+	      simde__m128d right_k0_0 = simde_mm_load_pd( &right_k0_p[0] );
+	      simde__m128d right_k0_2 = simde_mm_load_pd( &right_k0_p[2] );
+	      simde__m128d right_k1_0 = simde_mm_load_pd( &right_k1_p[0] );
+	      simde__m128d right_k1_2 = simde_mm_load_pd( &right_k1_p[2] );
+	      simde__m128d right_k2_0 = simde_mm_load_pd( &right_k2_p[0] );
+	      simde__m128d right_k2_2 = simde_mm_load_pd( &right_k2_p[2] );
+	      simde__m128d right_k3_0 = simde_mm_load_pd( &right_k3_p[0] );
+	      simde__m128d right_k3_2 = simde_mm_load_pd( &right_k3_p[2] );
 	      	      
-	      right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-	      right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+	      right_k0_0 = simde_mm_mul_pd( x2_0, right_k0_0);
+	      right_k0_2 = simde_mm_mul_pd( x2_2, right_k0_2);
 	      
-	      right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-	      right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+	      right_k1_0 = simde_mm_mul_pd( x2_0, right_k1_0);
+	      right_k1_2 = simde_mm_mul_pd( x2_2, right_k1_2);
 	      
-	      right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-	      right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-	      right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+	      right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k0_2);
+	      right_k1_0 = simde_mm_hadd_pd( right_k1_0, right_k1_2);
+	      right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k1_0);
 	      	       
-	      right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-	      right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+	      right_k2_0 = simde_mm_mul_pd( x2_0, right_k2_0);
+	      right_k2_2 = simde_mm_mul_pd( x2_2, right_k2_2);
 	      	       
-	      right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-	      right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+	      right_k3_0 = simde_mm_mul_pd( x2_0, right_k3_0);
+	      right_k3_2 = simde_mm_mul_pd( x2_2, right_k3_2);
 	      
-	      right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-	      right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-	      right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
-	      
-	      __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
-	      __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
-	      
-	      __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, right_k0_0 );
-	      __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, right_k2_0 );
-	      
-	      __m128d EV_t_l0_k0 = EVV[0];
-	      __m128d EV_t_l0_k2 = EVV[1];
-	      __m128d EV_t_l1_k0 = EVV[2];
-	      __m128d EV_t_l1_k2 = EVV[3];
-	      __m128d EV_t_l2_k0 = EVV[4];
-	      __m128d EV_t_l2_k2 = EVV[5];
-	      __m128d EV_t_l3_k0 = EVV[6]; 
-	      __m128d EV_t_l3_k2 = EVV[7];
-	      
-	      EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	      EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	      EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-	      
-	      EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	      EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-	      
-	      EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	      EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-	      
-	      EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	      EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	      EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-	      
-	      EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	      EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	      EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+	      right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k2_2);
+	      right_k3_0 = simde_mm_hadd_pd( right_k3_0, right_k3_2);
+	      right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k3_0);
+	      
+	      simde__m128d uX1_k0_sse = simde_mm_load_pd( &uX1[j * 4] );
+	      simde__m128d uX1_k2_sse = simde_mm_load_pd( &uX1[j * 4 + 2] );
+	      
+	      simde__m128d x1px2_k0 = simde_mm_mul_pd( uX1_k0_sse, right_k0_0 );
+	      simde__m128d x1px2_k2 = simde_mm_mul_pd( uX1_k2_sse, right_k2_0 );
+	      
+	      simde__m128d EV_t_l0_k0 = EVV[0];
+	      simde__m128d EV_t_l0_k2 = EVV[1];
+	      simde__m128d EV_t_l1_k0 = EVV[2];
+	      simde__m128d EV_t_l1_k2 = EVV[3];
+	      simde__m128d EV_t_l2_k0 = EVV[4];
+	      simde__m128d EV_t_l2_k2 = EVV[5];
+	      simde__m128d EV_t_l3_k0 = EVV[6]; 
+	      simde__m128d EV_t_l3_k2 = EVV[7];
+	      
+	      EV_t_l0_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+	      EV_t_l0_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+	      EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+	      
+	      EV_t_l1_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+	      EV_t_l1_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+	      
+	      EV_t_l1_k0 = simde_mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+	      EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+	      
+	      EV_t_l2_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+	      EV_t_l2_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+	      EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+	      
+	      EV_t_l3_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+	      EV_t_l3_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+	      EV_t_l3_k0 = simde_mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
 	      
-	      EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+	      EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
 	      
 	      values[j * 2]     = EV_t_l0_k0;
 	      values[j * 2 + 1] = EV_t_l2_k0;		   		   
 	      
-	      maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
-	      maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));		   	     		   
+	      maxv = simde_mm_max_pd(maxv, simde_mm_and_pd(EV_t_l0_k0, absMask.m));
+	      maxv = simde_mm_max_pd(maxv, simde_mm_and_pd(EV_t_l2_k0, absMask.m));		   	     		   
 	    }
 
 	  
-	  _mm_store_pd(maxima, maxv);
+	  simde_mm_store_pd(maxima, maxv);
 		 
 	  max = MAX(maxima[0], maxima[1]);
 	  
@@ -3953,27 +3945,27 @@
 	    {
 	      scaleGap = 1;
 	      
-	      __m128d sv = _mm_set1_pd(twotothe256);
+	      simde__m128d sv = simde_mm_set1_pd(twotothe256);
 	      
-	      _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));	   
-	      _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
-	      _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
-	      _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
-	      _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));	   
-	      _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
-	      _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
-	      _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));	     	      	     
+	      simde_mm_store_pd(&x3[0], simde_mm_mul_pd(values[0], sv));	   
+	      simde_mm_store_pd(&x3[2], simde_mm_mul_pd(values[1], sv));
+	      simde_mm_store_pd(&x3[4], simde_mm_mul_pd(values[2], sv));
+	      simde_mm_store_pd(&x3[6], simde_mm_mul_pd(values[3], sv));
+	      simde_mm_store_pd(&x3[8], simde_mm_mul_pd(values[4], sv));	   
+	      simde_mm_store_pd(&x3[10], simde_mm_mul_pd(values[5], sv));
+	      simde_mm_store_pd(&x3[12], simde_mm_mul_pd(values[6], sv));
+	      simde_mm_store_pd(&x3[14], simde_mm_mul_pd(values[7], sv));	     	      	     
 	    }
 	  else
 	    {
-	      _mm_store_pd(&x3[0], values[0]);	   
-	      _mm_store_pd(&x3[2], values[1]);
-	      _mm_store_pd(&x3[4], values[2]);
-	      _mm_store_pd(&x3[6], values[3]);
-	      _mm_store_pd(&x3[8], values[4]);	   
-	      _mm_store_pd(&x3[10], values[5]);
-	      _mm_store_pd(&x3[12], values[6]);
-	      _mm_store_pd(&x3[14], values[7]);
+	      simde_mm_store_pd(&x3[0], values[0]);	   
+	      simde_mm_store_pd(&x3[2], values[1]);
+	      simde_mm_store_pd(&x3[4], values[2]);
+	      simde_mm_store_pd(&x3[6], values[3]);
+	      simde_mm_store_pd(&x3[8], values[4]);	   
+	      simde_mm_store_pd(&x3[10], values[5]);
+	      simde_mm_store_pd(&x3[12], values[6]);
+	      simde_mm_store_pd(&x3[14], values[7]);
 	    }
 	}		       	
       	
@@ -3993,7 +3985,7 @@
 	       }
 	     else
 	       {				 
-		 __m128d maxv =_mm_setzero_pd();		 
+		 simde__m128d maxv = simde_mm_setzero_pd();		 
 		 
 		 if(x2_gap[i / 32] & mask32[i % 32])
 		   x2 = x2_gapColumn;
@@ -4013,117 +4005,117 @@
 		     double *right_k1_p = &right[j*16 + 1*4];
 		     double *right_k2_p = &right[j*16 + 2*4];
 		     double *right_k3_p = &right[j*16 + 3*4];
-		     __m128d x2_0 = _mm_load_pd( &x2_p[0] );
-		     __m128d x2_2 = _mm_load_pd( &x2_p[2] );
+		     simde__m128d x2_0 = simde_mm_load_pd( &x2_p[0] );
+		     simde__m128d x2_2 = simde_mm_load_pd( &x2_p[2] );
 		     
-		     __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
-		     __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
-		     __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
-		     __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
-		     __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
-		     __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
-		     __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
-		     __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
+		     simde__m128d right_k0_0 = simde_mm_load_pd( &right_k0_p[0] );
+		     simde__m128d right_k0_2 = simde_mm_load_pd( &right_k0_p[2] );
+		     simde__m128d right_k1_0 = simde_mm_load_pd( &right_k1_p[0] );
+		     simde__m128d right_k1_2 = simde_mm_load_pd( &right_k1_p[2] );
+		     simde__m128d right_k2_0 = simde_mm_load_pd( &right_k2_p[0] );
+		     simde__m128d right_k2_2 = simde_mm_load_pd( &right_k2_p[2] );
+		     simde__m128d right_k3_0 = simde_mm_load_pd( &right_k3_p[0] );
+		     simde__m128d right_k3_2 = simde_mm_load_pd( &right_k3_p[2] );
 		     
 		     		     
-		     right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-		     right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+		     right_k0_0 = simde_mm_mul_pd( x2_0, right_k0_0);
+		     right_k0_2 = simde_mm_mul_pd( x2_2, right_k0_2);
 		     
-		     right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-		     right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+		     right_k1_0 = simde_mm_mul_pd( x2_0, right_k1_0);
+		     right_k1_2 = simde_mm_mul_pd( x2_2, right_k1_2);
 		     
-		     right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-		     right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-		     right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+		     right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k0_2);
+		     right_k1_0 = simde_mm_hadd_pd( right_k1_0, right_k1_2);
+		     right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k1_0);
 		     
 		     
-		     right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-		     right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+		     right_k2_0 = simde_mm_mul_pd( x2_0, right_k2_0);
+		     right_k2_2 = simde_mm_mul_pd( x2_2, right_k2_2);
 		     
 		     
-		     right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-		     right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+		     right_k3_0 = simde_mm_mul_pd( x2_0, right_k3_0);
+		     right_k3_2 = simde_mm_mul_pd( x2_2, right_k3_2);
 		     
-		     right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-		     right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-		     right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
+		     right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k2_2);
+		     right_k3_0 = simde_mm_hadd_pd( right_k3_0, right_k3_2);
+		     right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k3_0);
 		     
 		     {
 		       //
 		       // load left side from tip vector
 		       //
 		       
-		       __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
-		       __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
+		       simde__m128d uX1_k0_sse = simde_mm_load_pd( &uX1[j * 4] );
+		       simde__m128d uX1_k2_sse = simde_mm_load_pd( &uX1[j * 4 + 2] );
 		       
 		       
 		       //
 		       // multiply left * right
 		       //
 		       
-		       __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, right_k0_0 );
-		       __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, right_k2_0 );
+		       simde__m128d x1px2_k0 = simde_mm_mul_pd( uX1_k0_sse, right_k0_0 );
+		       simde__m128d x1px2_k2 = simde_mm_mul_pd( uX1_k2_sse, right_k2_0 );
 		       
 		       
 		       //
 		       // multiply with EV matrix (!?)
 		       //		   		  
 		       
-		       __m128d EV_t_l0_k0 = EVV[0];
-		       __m128d EV_t_l0_k2 = EVV[1];
-		       __m128d EV_t_l1_k0 = EVV[2];
-		       __m128d EV_t_l1_k2 = EVV[3];
-		       __m128d EV_t_l2_k0 = EVV[4];
-		       __m128d EV_t_l2_k2 = EVV[5];
-		       __m128d EV_t_l3_k0 = EVV[6]; 
-		       __m128d EV_t_l3_k2 = EVV[7];
+		       simde__m128d EV_t_l0_k0 = EVV[0];
+		       simde__m128d EV_t_l0_k2 = EVV[1];
+		       simde__m128d EV_t_l1_k0 = EVV[2];
+		       simde__m128d EV_t_l1_k2 = EVV[3];
+		       simde__m128d EV_t_l2_k0 = EVV[4];
+		       simde__m128d EV_t_l2_k2 = EVV[5];
+		       simde__m128d EV_t_l3_k0 = EVV[6]; 
+		       simde__m128d EV_t_l3_k2 = EVV[7];
 		       
 		       
-		       EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-		       EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-		       EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+		       EV_t_l0_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+		       EV_t_l0_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+		       EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
 		       
-		       EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-		       EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+		       EV_t_l1_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+		       EV_t_l1_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
 		       
-		       EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-		       EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+		       EV_t_l1_k0 = simde_mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+		       EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
 		       
-		       EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-		       EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-		       EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+		       EV_t_l2_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+		       EV_t_l2_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+		       EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
 		       
-		       EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-		       EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-		       EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+		       EV_t_l3_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+		       EV_t_l3_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+		       EV_t_l3_k0 = simde_mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
 		       
-		       EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+		       EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
 		       
 		       values[j * 2]     = EV_t_l0_k0;
 		       values[j * 2 + 1] = EV_t_l2_k0;		   		   
 			   
-		       maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
-		       maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));		   
+		       maxv = simde_mm_max_pd(maxv, simde_mm_and_pd(EV_t_l0_k0, absMask.m));
+		       maxv = simde_mm_max_pd(maxv, simde_mm_and_pd(EV_t_l2_k0, absMask.m));		   
 		     }		   
 		   }
 
 	     
-		 _mm_store_pd(maxima, maxv);
+		 simde_mm_store_pd(maxima, maxv);
 		 
 		 max = MAX(maxima[0], maxima[1]);
 		 
 		 if(max < minlikelihood)
 		   {
-		     __m128d sv = _mm_set1_pd(twotothe256);
+		     simde__m128d sv = simde_mm_set1_pd(twotothe256);
 		     
-		     _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));	   
-		     _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
-		     _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
-		     _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
-		     _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));	   
-		     _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
-		     _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
-		     _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));	     
+		     simde_mm_store_pd(&x3[0], simde_mm_mul_pd(values[0], sv));	   
+		     simde_mm_store_pd(&x3[2], simde_mm_mul_pd(values[1], sv));
+		     simde_mm_store_pd(&x3[4], simde_mm_mul_pd(values[2], sv));
+		     simde_mm_store_pd(&x3[6], simde_mm_mul_pd(values[3], sv));
+		     simde_mm_store_pd(&x3[8], simde_mm_mul_pd(values[4], sv));	   
+		     simde_mm_store_pd(&x3[10], simde_mm_mul_pd(values[5], sv));
+		     simde_mm_store_pd(&x3[12], simde_mm_mul_pd(values[6], sv));
+		     simde_mm_store_pd(&x3[14], simde_mm_mul_pd(values[7], sv));	     
 		     
 		     if(useFastScaling)
 		       addScale += wgt[i];
@@ -4132,14 +4124,14 @@
 		   }
 		 else
 		   {
-		     _mm_store_pd(&x3[0], values[0]);	   
-		     _mm_store_pd(&x3[2], values[1]);
-		     _mm_store_pd(&x3[4], values[2]);
-		     _mm_store_pd(&x3[6], values[3]);
-		     _mm_store_pd(&x3[8], values[4]);	   
-		     _mm_store_pd(&x3[10], values[5]);
-		     _mm_store_pd(&x3[12], values[6]);
-		     _mm_store_pd(&x3[14], values[7]);
+		     simde_mm_store_pd(&x3[0], values[0]);	   
+		     simde_mm_store_pd(&x3[2], values[1]);
+		     simde_mm_store_pd(&x3[4], values[2]);
+		     simde_mm_store_pd(&x3[6], values[3]);
+		     simde_mm_store_pd(&x3[8], values[4]);	   
+		     simde_mm_store_pd(&x3[10], values[5]);
+		     simde_mm_store_pd(&x3[12], values[6]);
+		     simde_mm_store_pd(&x3[14], values[7]);
 		   }		 
 		 
 		 x3 += 16;
@@ -4149,7 +4141,7 @@
       break;
     case INNER_INNER:         
       {
-	__m128d maxv =_mm_setzero_pd();
+	simde__m128d maxv = simde_mm_setzero_pd();
 	
 	scaleGap = 0;
 	
@@ -4166,37 +4158,37 @@
 	    double *left_k2_p = &left[j*16 + 2*4];
 	    double *left_k3_p = &left[j*16 + 3*4];
 	    
-	    __m128d x1_0 = _mm_load_pd( &x1_p[0] );
-	    __m128d x1_2 = _mm_load_pd( &x1_p[2] );
+	    simde__m128d x1_0 = simde_mm_load_pd( &x1_p[0] );
+	    simde__m128d x1_2 = simde_mm_load_pd( &x1_p[2] );
 	    
-	    __m128d left_k0_0 = _mm_load_pd( &left_k0_p[0] );
-	    __m128d left_k0_2 = _mm_load_pd( &left_k0_p[2] );
-	    __m128d left_k1_0 = _mm_load_pd( &left_k1_p[0] );
-	    __m128d left_k1_2 = _mm_load_pd( &left_k1_p[2] );
-	    __m128d left_k2_0 = _mm_load_pd( &left_k2_p[0] );
-	    __m128d left_k2_2 = _mm_load_pd( &left_k2_p[2] );
-	    __m128d left_k3_0 = _mm_load_pd( &left_k3_p[0] );
-	    __m128d left_k3_2 = _mm_load_pd( &left_k3_p[2] );
-	    
-	    left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-	    left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-	    
-	    left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-	    left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-	    
-	    left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-	    left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-	    left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-	    
-	    left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-	    left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-	    
-	    left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-	    left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-	    
-	    left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-	    left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-	    left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
+	    simde__m128d left_k0_0 = simde_mm_load_pd( &left_k0_p[0] );
+	    simde__m128d left_k0_2 = simde_mm_load_pd( &left_k0_p[2] );
+	    simde__m128d left_k1_0 = simde_mm_load_pd( &left_k1_p[0] );
+	    simde__m128d left_k1_2 = simde_mm_load_pd( &left_k1_p[2] );
+	    simde__m128d left_k2_0 = simde_mm_load_pd( &left_k2_p[0] );
+	    simde__m128d left_k2_2 = simde_mm_load_pd( &left_k2_p[2] );
+	    simde__m128d left_k3_0 = simde_mm_load_pd( &left_k3_p[0] );
+	    simde__m128d left_k3_2 = simde_mm_load_pd( &left_k3_p[2] );
+	    
+	    left_k0_0 = simde_mm_mul_pd(x1_0, left_k0_0);
+	    left_k0_2 = simde_mm_mul_pd(x1_2, left_k0_2);
+	    
+	    left_k1_0 = simde_mm_mul_pd(x1_0, left_k1_0);
+	    left_k1_2 = simde_mm_mul_pd(x1_2, left_k1_2);
+	    
+	    left_k0_0 = simde_mm_hadd_pd( left_k0_0, left_k0_2 );
+	    left_k1_0 = simde_mm_hadd_pd( left_k1_0, left_k1_2);
+	    left_k0_0 = simde_mm_hadd_pd( left_k0_0, left_k1_0);
+	    
+	    left_k2_0 = simde_mm_mul_pd(x1_0, left_k2_0);
+	    left_k2_2 = simde_mm_mul_pd(x1_2, left_k2_2);
+	    
+	    left_k3_0 = simde_mm_mul_pd(x1_0, left_k3_0);
+	    left_k3_2 = simde_mm_mul_pd(x1_2, left_k3_2);
+	    
+	    left_k2_0 = simde_mm_hadd_pd( left_k2_0, left_k2_2);
+	    left_k3_0 = simde_mm_hadd_pd( left_k3_0, left_k3_2);
+	    left_k2_0 = simde_mm_hadd_pd( left_k2_0, left_k3_0);
 	    
 	    
 	    double *x2_p = &x2[j*4];
@@ -4204,107 +4196,107 @@
 	    double *right_k1_p = &right[j*16 + 1*4];
 	    double *right_k2_p = &right[j*16 + 2*4];
 	    double *right_k3_p = &right[j*16 + 3*4];
-	    __m128d x2_0 = _mm_load_pd( &x2_p[0] );
-	    __m128d x2_2 = _mm_load_pd( &x2_p[2] );
+	    simde__m128d x2_0 = simde_mm_load_pd( &x2_p[0] );
+	    simde__m128d x2_2 = simde_mm_load_pd( &x2_p[2] );
 	    
-	    __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
-	    __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
-	    __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
-	    __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
-	    __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
-	    __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
-	    __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
-	    __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
-	    
-	    right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-	    right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-	    
-	    right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-	    right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-	    
-	    right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-	    right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-	    right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+	    simde__m128d right_k0_0 = simde_mm_load_pd( &right_k0_p[0] );
+	    simde__m128d right_k0_2 = simde_mm_load_pd( &right_k0_p[2] );
+	    simde__m128d right_k1_0 = simde_mm_load_pd( &right_k1_p[0] );
+	    simde__m128d right_k1_2 = simde_mm_load_pd( &right_k1_p[2] );
+	    simde__m128d right_k2_0 = simde_mm_load_pd( &right_k2_p[0] );
+	    simde__m128d right_k2_2 = simde_mm_load_pd( &right_k2_p[2] );
+	    simde__m128d right_k3_0 = simde_mm_load_pd( &right_k3_p[0] );
+	    simde__m128d right_k3_2 = simde_mm_load_pd( &right_k3_p[2] );
+	    
+	    right_k0_0 = simde_mm_mul_pd( x2_0, right_k0_0);
+	    right_k0_2 = simde_mm_mul_pd( x2_2, right_k0_2);
+	    
+	    right_k1_0 = simde_mm_mul_pd( x2_0, right_k1_0);
+	    right_k1_2 = simde_mm_mul_pd( x2_2, right_k1_2);
+	    
+	    right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k0_2);
+	    right_k1_0 = simde_mm_hadd_pd( right_k1_0, right_k1_2);
+	    right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k1_0);
 	    
-	    right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-	    right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+	    right_k2_0 = simde_mm_mul_pd( x2_0, right_k2_0);
+	    right_k2_2 = simde_mm_mul_pd( x2_2, right_k2_2);
 	    	    
-	    right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-	    right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+	    right_k3_0 = simde_mm_mul_pd( x2_0, right_k3_0);
+	    right_k3_2 = simde_mm_mul_pd( x2_2, right_k3_2);
 	    
-	    right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-	    right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-	    right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   		 		
-	    
-	    __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-	    __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );		 		 	   
-	    
-	    __m128d EV_t_l0_k0 = EVV[0];
-	    __m128d EV_t_l0_k2 = EVV[1];
-	    __m128d EV_t_l1_k0 = EVV[2];
-	    __m128d EV_t_l1_k2 = EVV[3];
-	    __m128d EV_t_l2_k0 = EVV[4];
-	    __m128d EV_t_l2_k2 = EVV[5];
-	    __m128d EV_t_l3_k0 = EVV[6]; 
-	    __m128d EV_t_l3_k2 = EVV[7];
-	    
-	    EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	    EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-	    
-	    EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	    EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-	    
-	    EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-	    
-	    EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	    EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-	    
-	    EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	    EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	    EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+	    right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k2_2);
+	    right_k3_0 = simde_mm_hadd_pd( right_k3_0, right_k3_2);
+	    right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k3_0);	   		 		
+	    
+	    simde__m128d x1px2_k0 = simde_mm_mul_pd( left_k0_0, right_k0_0 );
+	    simde__m128d x1px2_k2 = simde_mm_mul_pd( left_k2_0, right_k2_0 );		 		 	   
+	    
+	    simde__m128d EV_t_l0_k0 = EVV[0];
+	    simde__m128d EV_t_l0_k2 = EVV[1];
+	    simde__m128d EV_t_l1_k0 = EVV[2];
+	    simde__m128d EV_t_l1_k2 = EVV[3];
+	    simde__m128d EV_t_l2_k0 = EVV[4];
+	    simde__m128d EV_t_l2_k2 = EVV[5];
+	    simde__m128d EV_t_l3_k0 = EVV[6]; 
+	    simde__m128d EV_t_l3_k2 = EVV[7];
+	    
+	    EV_t_l0_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+	    EV_t_l0_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+	    EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+	    
+	    EV_t_l1_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+	    EV_t_l1_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+	    
+	    EV_t_l1_k0 = simde_mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+	    EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+	    
+	    EV_t_l2_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+	    EV_t_l2_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+	    EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+	    
+	    EV_t_l3_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+	    EV_t_l3_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+	    EV_t_l3_k0 = simde_mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
 	    
-	    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+	    EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
 	    
 	    
 	    values[j * 2] = EV_t_l0_k0;
 	    values[j * 2 + 1] = EV_t_l2_k0;            	   	    
 	    
-	    maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
-	    maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));
+	    maxv = simde_mm_max_pd(maxv, simde_mm_and_pd(EV_t_l0_k0, absMask.m));
+	    maxv = simde_mm_max_pd(maxv, simde_mm_and_pd(EV_t_l2_k0, absMask.m));
 	  }
 		     
-	_mm_store_pd(maxima, maxv);
+	simde_mm_store_pd(maxima, maxv);
 	
 	max = MAX(maxima[0], maxima[1]);
 	
 	if(max < minlikelihood)
 	  {
-	    __m128d sv = _mm_set1_pd(twotothe256);
+	    simde__m128d sv = simde_mm_set1_pd(twotothe256);
 	    
 	    scaleGap = 1;
 	    
-	    _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));	   
-	    _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
-	    _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
-	    _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
-	    _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));	   
-	    _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
-	    _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
-	    _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));	     	    	 
+	    simde_mm_store_pd(&x3[0], simde_mm_mul_pd(values[0], sv));	   
+	    simde_mm_store_pd(&x3[2], simde_mm_mul_pd(values[1], sv));
+	    simde_mm_store_pd(&x3[4], simde_mm_mul_pd(values[2], sv));
+	    simde_mm_store_pd(&x3[6], simde_mm_mul_pd(values[3], sv));
+	    simde_mm_store_pd(&x3[8], simde_mm_mul_pd(values[4], sv));	   
+	    simde_mm_store_pd(&x3[10], simde_mm_mul_pd(values[5], sv));
+	    simde_mm_store_pd(&x3[12], simde_mm_mul_pd(values[6], sv));
+	    simde_mm_store_pd(&x3[14], simde_mm_mul_pd(values[7], sv));	     	    	 
 	  }
 	else
 	  {
-	    _mm_store_pd(&x3[0], values[0]);	   
-	    _mm_store_pd(&x3[2], values[1]);
-	    _mm_store_pd(&x3[4], values[2]);
-	    _mm_store_pd(&x3[6], values[3]);
-	    _mm_store_pd(&x3[8], values[4]);	   
-	    _mm_store_pd(&x3[10], values[5]);
-	    _mm_store_pd(&x3[12], values[6]);
-	    _mm_store_pd(&x3[14], values[7]);
+	    simde_mm_store_pd(&x3[0], values[0]);	   
+	    simde_mm_store_pd(&x3[2], values[1]);
+	    simde_mm_store_pd(&x3[4], values[2]);
+	    simde_mm_store_pd(&x3[6], values[3]);
+	    simde_mm_store_pd(&x3[8], values[4]);	   
+	    simde_mm_store_pd(&x3[10], values[5]);
+	    simde_mm_store_pd(&x3[12], values[6]);
+	    simde_mm_store_pd(&x3[14], values[7]);
 	  }
       }
 
@@ -4325,7 +4317,7 @@
 	   }
 	 else
 	   {
-	     __m128d maxv =_mm_setzero_pd();	     	    
+	     simde__m128d maxv = simde_mm_setzero_pd();	     	    
 	     
 	     if(x1_gap[i / 32] & mask32[i % 32])
 	       x1 = x1_gapColumn;
@@ -4353,37 +4345,37 @@
 		 double *left_k2_p = &left[j*16 + 2*4];
 		 double *left_k3_p = &left[j*16 + 3*4];
 		 
-		 __m128d x1_0 = _mm_load_pd( &x1_p[0] );
-		 __m128d x1_2 = _mm_load_pd( &x1_p[2] );
+		 simde__m128d x1_0 = simde_mm_load_pd( &x1_p[0] );
+		 simde__m128d x1_2 = simde_mm_load_pd( &x1_p[2] );
 		 
-		 __m128d left_k0_0 = _mm_load_pd( &left_k0_p[0] );
-		 __m128d left_k0_2 = _mm_load_pd( &left_k0_p[2] );
-		 __m128d left_k1_0 = _mm_load_pd( &left_k1_p[0] );
-		 __m128d left_k1_2 = _mm_load_pd( &left_k1_p[2] );
-		 __m128d left_k2_0 = _mm_load_pd( &left_k2_p[0] );
-		 __m128d left_k2_2 = _mm_load_pd( &left_k2_p[2] );
-		 __m128d left_k3_0 = _mm_load_pd( &left_k3_p[0] );
-		 __m128d left_k3_2 = _mm_load_pd( &left_k3_p[2] );
-		 
-		 left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-		 left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-		 
-		 left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-		 left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-		 
-		 left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-		 left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-		 left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-		 
-		 left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-		 left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-		 
-		 left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-		 left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-		 
-		 left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-		 left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-		 left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
+		 simde__m128d left_k0_0 = simde_mm_load_pd( &left_k0_p[0] );
+		 simde__m128d left_k0_2 = simde_mm_load_pd( &left_k0_p[2] );
+		 simde__m128d left_k1_0 = simde_mm_load_pd( &left_k1_p[0] );
+		 simde__m128d left_k1_2 = simde_mm_load_pd( &left_k1_p[2] );
+		 simde__m128d left_k2_0 = simde_mm_load_pd( &left_k2_p[0] );
+		 simde__m128d left_k2_2 = simde_mm_load_pd( &left_k2_p[2] );
+		 simde__m128d left_k3_0 = simde_mm_load_pd( &left_k3_p[0] );
+		 simde__m128d left_k3_2 = simde_mm_load_pd( &left_k3_p[2] );
+		 
+		 left_k0_0 = simde_mm_mul_pd(x1_0, left_k0_0);
+		 left_k0_2 = simde_mm_mul_pd(x1_2, left_k0_2);
+		 
+		 left_k1_0 = simde_mm_mul_pd(x1_0, left_k1_0);
+		 left_k1_2 = simde_mm_mul_pd(x1_2, left_k1_2);
+		 
+		 left_k0_0 = simde_mm_hadd_pd( left_k0_0, left_k0_2 );
+		 left_k1_0 = simde_mm_hadd_pd( left_k1_0, left_k1_2);
+		 left_k0_0 = simde_mm_hadd_pd( left_k0_0, left_k1_0);
+		 
+		 left_k2_0 = simde_mm_mul_pd(x1_0, left_k2_0);
+		 left_k2_2 = simde_mm_mul_pd(x1_2, left_k2_2);
+		 
+		 left_k3_0 = simde_mm_mul_pd(x1_0, left_k3_0);
+		 left_k3_2 = simde_mm_mul_pd(x1_2, left_k3_2);
+		 
+		 left_k2_0 = simde_mm_hadd_pd( left_k2_0, left_k2_2);
+		 left_k3_0 = simde_mm_hadd_pd( left_k3_0, left_k3_2);
+		 left_k2_0 = simde_mm_hadd_pd( left_k2_0, left_k3_0);
 		 
 		 
 		 //
@@ -4394,107 +4386,107 @@
 		 double *right_k1_p = &right[j*16 + 1*4];
 		 double *right_k2_p = &right[j*16 + 2*4];
 		 double *right_k3_p = &right[j*16 + 3*4];
-		 __m128d x2_0 = _mm_load_pd( &x2_p[0] );
-		 __m128d x2_2 = _mm_load_pd( &x2_p[2] );
+		 simde__m128d x2_0 = simde_mm_load_pd( &x2_p[0] );
+		 simde__m128d x2_2 = simde_mm_load_pd( &x2_p[2] );
 		 
-		 __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
-		 __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
-		 __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
-		 __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
-		 __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
-		 __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
-		 __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
-		 __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
-		 
-		 right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-		 right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-		 
-		 right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-		 right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-		 
-		 right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-		 right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-		 right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-		 
-		 right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-		 right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-		 
-		 
-		 right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-		 right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-		 
-		 right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-		 right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-		 right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
+		 simde__m128d right_k0_0 = simde_mm_load_pd( &right_k0_p[0] );
+		 simde__m128d right_k0_2 = simde_mm_load_pd( &right_k0_p[2] );
+		 simde__m128d right_k1_0 = simde_mm_load_pd( &right_k1_p[0] );
+		 simde__m128d right_k1_2 = simde_mm_load_pd( &right_k1_p[2] );
+		 simde__m128d right_k2_0 = simde_mm_load_pd( &right_k2_p[0] );
+		 simde__m128d right_k2_2 = simde_mm_load_pd( &right_k2_p[2] );
+		 simde__m128d right_k3_0 = simde_mm_load_pd( &right_k3_p[0] );
+		 simde__m128d right_k3_2 = simde_mm_load_pd( &right_k3_p[2] );
+		 
+		 right_k0_0 = simde_mm_mul_pd( x2_0, right_k0_0);
+		 right_k0_2 = simde_mm_mul_pd( x2_2, right_k0_2);
+		 
+		 right_k1_0 = simde_mm_mul_pd( x2_0, right_k1_0);
+		 right_k1_2 = simde_mm_mul_pd( x2_2, right_k1_2);
+		 
+		 right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k0_2);
+		 right_k1_0 = simde_mm_hadd_pd( right_k1_0, right_k1_2);
+		 right_k0_0 = simde_mm_hadd_pd( right_k0_0, right_k1_0);
+		 
+		 right_k2_0 = simde_mm_mul_pd( x2_0, right_k2_0);
+		 right_k2_2 = simde_mm_mul_pd( x2_2, right_k2_2);
+		 
+		 
+		 right_k3_0 = simde_mm_mul_pd( x2_0, right_k3_0);
+		 right_k3_2 = simde_mm_mul_pd( x2_2, right_k3_2);
+		 
+		 right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k2_2);
+		 right_k3_0 = simde_mm_hadd_pd( right_k3_0, right_k3_2);
+		 right_k2_0 = simde_mm_hadd_pd( right_k2_0, right_k3_0);	   
 		 
 		 //
 		 // multiply left * right
 		 //
 		 
-		 __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-		 __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
+		 simde__m128d x1px2_k0 = simde_mm_mul_pd( left_k0_0, right_k0_0 );
+		 simde__m128d x1px2_k2 = simde_mm_mul_pd( left_k2_0, right_k2_0 );
 		 
 		 
 		 //
 		 // multiply with EV matrix (!?)
 		 //	     
 		 
-		 __m128d EV_t_l0_k0 = EVV[0];
-		 __m128d EV_t_l0_k2 = EVV[1];
-		 __m128d EV_t_l1_k0 = EVV[2];
-		 __m128d EV_t_l1_k2 = EVV[3];
-		 __m128d EV_t_l2_k0 = EVV[4];
-		 __m128d EV_t_l2_k2 = EVV[5];
-		 __m128d EV_t_l3_k0 = EVV[6]; 
-		 __m128d EV_t_l3_k2 = EVV[7];
+		 simde__m128d EV_t_l0_k0 = EVV[0];
+		 simde__m128d EV_t_l0_k2 = EVV[1];
+		 simde__m128d EV_t_l1_k0 = EVV[2];
+		 simde__m128d EV_t_l1_k2 = EVV[3];
+		 simde__m128d EV_t_l2_k0 = EVV[4];
+		 simde__m128d EV_t_l2_k2 = EVV[5];
+		 simde__m128d EV_t_l3_k0 = EVV[6]; 
+		 simde__m128d EV_t_l3_k2 = EVV[7];
 		 
 		 
-		 EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-		 EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-		 EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+		 EV_t_l0_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+		 EV_t_l0_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+		 EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
 		 
-		 EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-		 EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+		 EV_t_l1_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+		 EV_t_l1_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
 		 
-		 EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-		 EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+		 EV_t_l1_k0 = simde_mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+		 EV_t_l0_k0 = simde_mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
 		 
-		 EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-		 EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-		 EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+		 EV_t_l2_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+		 EV_t_l2_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+		 EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
 		 
 		 
-		 EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-		 EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-		 EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+		 EV_t_l3_k0 = simde_mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+		 EV_t_l3_k2 = simde_mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+		 EV_t_l3_k0 = simde_mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
 		 
-		 EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+		 EV_t_l2_k0 = simde_mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
 		 
 		 
 		 values[j * 2] = EV_t_l0_k0;
 		 values[j * 2 + 1] = EV_t_l2_k0;            	   	    
 		 
-		 maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
-		 maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));
+		 maxv = simde_mm_max_pd(maxv, simde_mm_and_pd(EV_t_l0_k0, absMask.m));
+		 maxv = simde_mm_max_pd(maxv, simde_mm_and_pd(EV_t_l2_k0, absMask.m));
 	       }
 	     
 	     
-	     _mm_store_pd(maxima, maxv);
+	     simde_mm_store_pd(maxima, maxv);
 	     
 	     max = MAX(maxima[0], maxima[1]);
 	     
 	     if(max < minlikelihood)
 	       {
-		 __m128d sv = _mm_set1_pd(twotothe256);
+		 simde__m128d sv = simde_mm_set1_pd(twotothe256);
 		 
-		 _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));	   
-		 _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
-		 _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
-		 _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
-		 _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));	   
-		 _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
-		 _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
-		 _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));	     
+		 simde_mm_store_pd(&x3[0], simde_mm_mul_pd(values[0], sv));	   
+		 simde_mm_store_pd(&x3[2], simde_mm_mul_pd(values[1], sv));
+		 simde_mm_store_pd(&x3[4], simde_mm_mul_pd(values[2], sv));
+		 simde_mm_store_pd(&x3[6], simde_mm_mul_pd(values[3], sv));
+		 simde_mm_store_pd(&x3[8], simde_mm_mul_pd(values[4], sv));	   
+		 simde_mm_store_pd(&x3[10], simde_mm_mul_pd(values[5], sv));
+		 simde_mm_store_pd(&x3[12], simde_mm_mul_pd(values[6], sv));
+		 simde_mm_store_pd(&x3[14], simde_mm_mul_pd(values[7], sv));	     
 		 
 		 if(useFastScaling)
 		   addScale += wgt[i];
@@ -4503,14 +4495,14 @@
 	       }
 	     else
 	       {
-		 _mm_store_pd(&x3[0], values[0]);	   
-		 _mm_store_pd(&x3[2], values[1]);
-		 _mm_store_pd(&x3[4], values[2]);
-		 _mm_store_pd(&x3[6], values[3]);
-		 _mm_store_pd(&x3[8], values[4]);	   
-		 _mm_store_pd(&x3[10], values[5]);
-		 _mm_store_pd(&x3[12], values[6]);
-		 _mm_store_pd(&x3[14], values[7]);
+		 simde_mm_store_pd(&x3[0], values[0]);	   
+		 simde_mm_store_pd(&x3[2], values[1]);
+		 simde_mm_store_pd(&x3[4], values[2]);
+		 simde_mm_store_pd(&x3[6], values[3]);
+		 simde_mm_store_pd(&x3[8], values[4]);	   
+		 simde_mm_store_pd(&x3[10], values[5]);
+		 simde_mm_store_pd(&x3[12], values[6]);
+		 simde_mm_store_pd(&x3[14], values[7]);
 	       }	 
 
 	    
@@ -4793,12 +4785,12 @@
     ri = &right[maxCats * 400];	  
 
     for(l = 0; l < 20; l+=2)
-      _mm_store_pd(&v[l], _mm_setzero_pd());	      		
+      simde_mm_store_pd(&v[l], simde_mm_setzero_pd());	      		
 
     for(l = 0; l < 20; l++)
     {
-      __m128d x1v = _mm_setzero_pd();
-      __m128d x2v = _mm_setzero_pd();
+      simde__m128d x1v = simde_mm_setzero_pd();
+      simde__m128d x2v = simde_mm_setzero_pd();
       double 
         *ev = &extEV[l * 20],
         *lv = &le[l * 20],
@@ -4807,45 +4799,45 @@
 
       for(j = 0; j < 20; j+=2)
       {
-        x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));		    
-        x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
+        x1v = simde_mm_add_pd(x1v, simde_mm_mul_pd(simde_mm_load_pd(&vl[j]), simde_mm_load_pd(&lv[j])));		    
+        x2v = simde_mm_add_pd(x2v, simde_mm_mul_pd(simde_mm_load_pd(&vr[j]), simde_mm_load_pd(&rv[j])));
       }
 
-      x1v = _mm_hadd_pd(x1v, x1v);
-      x2v = _mm_hadd_pd(x2v, x2v);
+      x1v = simde_mm_hadd_pd(x1v, x1v);
+      x2v = simde_mm_hadd_pd(x2v, x2v);
 
-      x1v = _mm_mul_pd(x1v, x2v);
+      x1v = simde_mm_mul_pd(x1v, x2v);
 
       for(j = 0; j < 20; j+=2)
       {
-        __m128d vv = _mm_load_pd(&v[j]);
-        vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
-        _mm_store_pd(&v[j], vv);
+        simde__m128d vv = simde_mm_load_pd(&v[j]);
+        vv = simde_mm_add_pd(vv, simde_mm_mul_pd(x1v, simde_mm_load_pd(&ev[j])));
+        simde_mm_store_pd(&v[j], vv);
       }		    	
     }
 
     if(tipCase != TIP_TIP)
     { 	    
-      __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
+      simde__m128d minlikelihood_sse = simde_mm_set1_pd( minlikelihood );
 
       scale = 1;
       for(l = 0; scale && (l < 20); l += 2)
       {
-        __m128d vv = _mm_load_pd(&v[l]);
-        __m128d v1 = _mm_and_pd(vv, absMask.m);
-        v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-        if(_mm_movemask_pd( v1 ) != 3)
+        simde__m128d vv = simde_mm_load_pd(&v[l]);
+        simde__m128d v1 = simde_mm_and_pd(vv, absMask.m);
+        v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+        if(simde_mm_movemask_pd( v1 ) != 3)
           scale = 0;
       }	    	        
 
       if(scale)
       {
-        __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
+        simde__m128d twoto = simde_mm_set_pd(twotothe256, twotothe256);
 
         for(l = 0; l < 20; l+=2)
         {
-          __m128d ex3v = _mm_load_pd(&v[l]);		  
-          _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
+          simde__m128d ex3v = simde_mm_load_pd(&v[l]);		  
+          simde_mm_store_pd(&v[l], simde_mm_mul_pd(ex3v,twoto));	
         }		   		  
 
         scaleGap = TRUE;	   
@@ -4876,12 +4868,12 @@
               ri =  &right[cptr[i] * 400];
 
             for(l = 0; l < 20; l+=2)
-              _mm_store_pd(&v[l], _mm_setzero_pd());	      		
+              simde_mm_store_pd(&v[l], simde_mm_setzero_pd());	      		
 
             for(l = 0; l < 20; l++)
             {
-              __m128d x1v = _mm_setzero_pd();
-              __m128d x2v = _mm_setzero_pd();	 
+              simde__m128d x1v = simde_mm_setzero_pd();
+              simde__m128d x2v = simde_mm_setzero_pd();	 
               double 
                 *ev = &extEV[l * 20],
                 *lv = &le[l * 20],
@@ -4889,20 +4881,20 @@
 
               for(j = 0; j < 20; j+=2)
               {
-                x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));		    
-                x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
+                x1v = simde_mm_add_pd(x1v, simde_mm_mul_pd(simde_mm_load_pd(&vl[j]), simde_mm_load_pd(&lv[j])));		    
+                x2v = simde_mm_add_pd(x2v, simde_mm_mul_pd(simde_mm_load_pd(&vr[j]), simde_mm_load_pd(&rv[j])));
               }
 
-              x1v = _mm_hadd_pd(x1v, x1v);
-              x2v = _mm_hadd_pd(x2v, x2v);
+              x1v = simde_mm_hadd_pd(x1v, x1v);
+              x2v = simde_mm_hadd_pd(x2v, x2v);
 
-              x1v = _mm_mul_pd(x1v, x2v);
+              x1v = simde_mm_mul_pd(x1v, x2v);
 
               for(j = 0; j < 20; j+=2)
               {
-                __m128d vv = _mm_load_pd(&v[j]);
-                vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
-                _mm_store_pd(&v[j], vv);
+                simde__m128d vv = simde_mm_load_pd(&v[j]);
+                vv = simde_mm_add_pd(vv, simde_mm_mul_pd(x1v, simde_mm_load_pd(&ev[j])));
+                simde_mm_store_pd(&v[j], vv);
               }		   
             }
 
@@ -4951,12 +4943,12 @@
             }	  	  	  	  		  
 
             for(l = 0; l < 20; l+=2)
-              _mm_store_pd(&v[l], _mm_setzero_pd());	      			   
+              simde_mm_store_pd(&v[l], simde_mm_setzero_pd());	      			   
 
             for(l = 0; l < 20; l++)
             {
-              __m128d x1v = _mm_setzero_pd();
-              __m128d x2v = _mm_setzero_pd();	
+              simde__m128d x1v = simde_mm_setzero_pd();
+              simde__m128d x2v = simde_mm_setzero_pd();	
               double 
                 *ev = &extEV[l * 20],
                 *lv = &le[l * 20],
@@ -4964,33 +4956,33 @@
 
               for(j = 0; j < 20; j+=2)
               {
-                x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));		    
-                x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
+                x1v = simde_mm_add_pd(x1v, simde_mm_mul_pd(simde_mm_load_pd(&vl[j]), simde_mm_load_pd(&lv[j])));		    
+                x2v = simde_mm_add_pd(x2v, simde_mm_mul_pd(simde_mm_load_pd(&vr[j]), simde_mm_load_pd(&rv[j])));
               }
 
-              x1v = _mm_hadd_pd(x1v, x1v);
-              x2v = _mm_hadd_pd(x2v, x2v);
+              x1v = simde_mm_hadd_pd(x1v, x1v);
+              x2v = simde_mm_hadd_pd(x2v, x2v);
 
-              x1v = _mm_mul_pd(x1v, x2v);
+              x1v = simde_mm_mul_pd(x1v, x2v);
 
               for(j = 0; j < 20; j+=2)
               {
-                __m128d vv = _mm_load_pd(&v[j]);
-                vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
-                _mm_store_pd(&v[j], vv);
+                simde__m128d vv = simde_mm_load_pd(&v[j]);
+                vv = simde_mm_add_pd(vv, simde_mm_mul_pd(x1v, simde_mm_load_pd(&ev[j])));
+                simde_mm_store_pd(&v[j], vv);
               }		    
             }
 
             { 	    
-              __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
+              simde__m128d minlikelihood_sse = simde_mm_set1_pd( minlikelihood );
 
               scale = 1;
               for(l = 0; scale && (l < 20); l += 2)
               {
-                __m128d vv = _mm_load_pd(&v[l]);
-                __m128d v1 = _mm_and_pd(vv, absMask.m);
-                v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-                if(_mm_movemask_pd( v1 ) != 3)
+                simde__m128d vv = simde_mm_load_pd(&v[l]);
+                simde__m128d v1 = simde_mm_and_pd(vv, absMask.m);
+                v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+                if(simde_mm_movemask_pd( v1 ) != 3)
                   scale = 0;
               }	    	  
             }
@@ -4998,12 +4990,12 @@
 
             if(scale)
 	      {
-		__m128d twoto = _mm_set_pd(twotothe256, twotothe256);
+		simde__m128d twoto = simde_mm_set_pd(twotothe256, twotothe256);
 		
 		for(l = 0; l < 20; l+=2)
 		  {
-		    __m128d ex3v = _mm_load_pd(&v[l]);
-		    _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));		    
+		    simde__m128d ex3v = simde_mm_load_pd(&v[l]);
+		    simde_mm_store_pd(&v[l], simde_mm_mul_pd(ex3v,twoto));		    
 		  }
 		
 		if(useFastScaling)
@@ -5060,12 +5052,12 @@
           }	 	  	  	  
 
           for(l = 0; l < 20; l+=2)
-            _mm_store_pd(&v[l], _mm_setzero_pd());	      		
+            simde_mm_store_pd(&v[l], simde_mm_setzero_pd());	      		
 
           for(l = 0; l < 20; l++)
           {
-            __m128d x1v = _mm_setzero_pd();
-            __m128d x2v = _mm_setzero_pd();
+            simde__m128d x1v = simde_mm_setzero_pd();
+            simde__m128d x2v = simde_mm_setzero_pd();
             double 
               *ev = &extEV[l * 20],
               *lv = &le[l * 20],
@@ -5073,46 +5065,46 @@
 
             for(j = 0; j < 20; j+=2)
             {
-              x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));		    
-              x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
+              x1v = simde_mm_add_pd(x1v, simde_mm_mul_pd(simde_mm_load_pd(&vl[j]), simde_mm_load_pd(&lv[j])));		    
+              x2v = simde_mm_add_pd(x2v, simde_mm_mul_pd(simde_mm_load_pd(&vr[j]), simde_mm_load_pd(&rv[j])));
             }
 
-            x1v = _mm_hadd_pd(x1v, x1v);
-            x2v = _mm_hadd_pd(x2v, x2v);
+            x1v = simde_mm_hadd_pd(x1v, x1v);
+            x2v = simde_mm_hadd_pd(x2v, x2v);
 
-            x1v = _mm_mul_pd(x1v, x2v);
+            x1v = simde_mm_mul_pd(x1v, x2v);
 
             for(j = 0; j < 20; j+=2)
             {
-              __m128d vv = _mm_load_pd(&v[j]);
-              vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
-              _mm_store_pd(&v[j], vv);
+              simde__m128d vv = simde_mm_load_pd(&v[j]);
+              vv = simde_mm_add_pd(vv, simde_mm_mul_pd(x1v, simde_mm_load_pd(&ev[j])));
+              simde_mm_store_pd(&v[j], vv);
             }		    
 
           }
 
           { 	    
-            __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
+            simde__m128d minlikelihood_sse = simde_mm_set1_pd( minlikelihood );
 
             scale = 1;
             for(l = 0; scale && (l < 20); l += 2)
             {
-              __m128d vv = _mm_load_pd(&v[l]);
-              __m128d v1 = _mm_and_pd(vv, absMask.m);
-              v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-              if(_mm_movemask_pd( v1 ) != 3)
+              simde__m128d vv = simde_mm_load_pd(&v[l]);
+              simde__m128d v1 = simde_mm_and_pd(vv, absMask.m);
+              v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+              if(simde_mm_movemask_pd( v1 ) != 3)
                 scale = 0;
             }	    	  
           }
 
           if(scale)
           {
-            __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
+            simde__m128d twoto = simde_mm_set_pd(twotothe256, twotothe256);
 
             for(l = 0; l < 20; l+=2)
             {
-              __m128d ex3v = _mm_load_pd(&v[l]);		  
-              _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
+              simde__m128d ex3v = simde_mm_load_pd(&v[l]);		  
+              simde_mm_store_pd(&v[l], simde_mm_mul_pd(ex3v,twoto));	
             }		   		  
 	    
 	    if(useFastScaling)
@@ -5164,7 +5156,7 @@
 	    v  = &x3[20 * i];
 #ifdef __SIM_SSE3
 	    for(l = 0; l < 20; l+=2)
-	      _mm_store_pd(&v[l], _mm_setzero_pd());	      		
+	      simde_mm_store_pd(&v[l], simde_mm_setzero_pd());	      		
 #else
 	    for(l = 0; l < 20; l++)
 	      v[l] = 0.0;
@@ -5173,8 +5165,8 @@
 	    for(l = 0; l < 20; l++)
 	      {
 #ifdef __SIM_SSE3
-		__m128d x1v = _mm_setzero_pd();
-		__m128d x2v = _mm_setzero_pd();	 
+		simde__m128d x1v = simde_mm_setzero_pd();
+		simde__m128d x2v = simde_mm_setzero_pd();	 
 		double 
 		  *ev = &extEV[l * 20],
 		  *lv = &le[l * 20],
@@ -5182,20 +5174,20 @@
 
 		for(j = 0; j < 20; j+=2)
 		  {
-		    x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));		    
-		    x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
+		    x1v = simde_mm_add_pd(x1v, simde_mm_mul_pd(simde_mm_load_pd(&vl[j]), simde_mm_load_pd(&lv[j])));		    
+		    x2v = simde_mm_add_pd(x2v, simde_mm_mul_pd(simde_mm_load_pd(&vr[j]), simde_mm_load_pd(&rv[j])));
 		  }
 
-		x1v = _mm_hadd_pd(x1v, x1v);
-		x2v = _mm_hadd_pd(x2v, x2v);
+		x1v = simde_mm_hadd_pd(x1v, x1v);
+		x2v = simde_mm_hadd_pd(x2v, x2v);
 
-		x1v = _mm_mul_pd(x1v, x2v);
+		x1v = simde_mm_mul_pd(x1v, x2v);
 		
 		for(j = 0; j < 20; j+=2)
 		  {
-		    __m128d vv = _mm_load_pd(&v[j]);
-		    vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
-		    _mm_store_pd(&v[j], vv);
+		    simde__m128d vv = simde_mm_load_pd(&v[j]);
+		    vv = simde_mm_add_pd(vv, simde_mm_mul_pd(x1v, simde_mm_load_pd(&ev[j])));
+		    simde_mm_store_pd(&v[j], vv);
 		  }		    
 #else
 		ump_x1 = 0.0;
@@ -5229,7 +5221,7 @@
 
 #ifdef __SIM_SSE3
 	    for(l = 0; l < 20; l+=2)
-	      _mm_store_pd(&v[l], _mm_setzero_pd());	      		
+	      simde_mm_store_pd(&v[l], simde_mm_setzero_pd());	      		
 #else
 	    for(l = 0; l < 20; l++)
 	      v[l] = 0.0;
@@ -5240,8 +5232,8 @@
 	      {
 #ifdef __SIM_SSE3
 
-		__m128d x1v = _mm_setzero_pd();
-		__m128d x2v = _mm_setzero_pd();	
+		simde__m128d x1v = simde_mm_setzero_pd();
+		simde__m128d x2v = simde_mm_setzero_pd();	
 		double 
 		  *ev = &extEV[l * 20],
 		  *lv = &le[l * 20],
@@ -5249,20 +5241,20 @@
 
 		for(j = 0; j < 20; j+=2)
 		  {
-		    x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));		    
-		    x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
+		    x1v = simde_mm_add_pd(x1v, simde_mm_mul_pd(simde_mm_load_pd(&vl[j]), simde_mm_load_pd(&lv[j])));		    
+		    x2v = simde_mm_add_pd(x2v, simde_mm_mul_pd(simde_mm_load_pd(&vr[j]), simde_mm_load_pd(&rv[j])));
 		  }
 
-		x1v = _mm_hadd_pd(x1v, x1v);
-		x2v = _mm_hadd_pd(x2v, x2v);
+		x1v = simde_mm_hadd_pd(x1v, x1v);
+		x2v = simde_mm_hadd_pd(x2v, x2v);
 
-		x1v = _mm_mul_pd(x1v, x2v);
+		x1v = simde_mm_mul_pd(x1v, x2v);
 		
 		for(j = 0; j < 20; j+=2)
 		  {
-		    __m128d vv = _mm_load_pd(&v[j]);
-		    vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
-		    _mm_store_pd(&v[j], vv);
+		    simde__m128d vv = simde_mm_load_pd(&v[j]);
+		    vv = simde_mm_add_pd(vv, simde_mm_mul_pd(x1v, simde_mm_load_pd(&ev[j])));
+		    simde_mm_store_pd(&v[j], vv);
 		  }		    
 #else
 		ump_x1 = 0.0;
@@ -5282,15 +5274,15 @@
 	      }
 #ifdef __SIM_SSE3
 	    { 	    
-	      __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
+	      simde__m128d minlikelihood_sse = simde_mm_set1_pd( minlikelihood );
 	      
 	      scale = 1;
 	      for(l = 0; scale && (l < 20); l += 2)
 		{
-		  __m128d vv = _mm_load_pd(&v[l]);
-		  __m128d v1 = _mm_and_pd(vv, absMask.m);
-		  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		  if(_mm_movemask_pd( v1 ) != 3)
+		  simde__m128d vv = simde_mm_load_pd(&v[l]);
+		  simde__m128d v1 = simde_mm_and_pd(vv, absMask.m);
+		  v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+		  if(simde_mm_movemask_pd( v1 ) != 3)
 		    scale = 0;
 		}	    	  
 	    }
@@ -5303,12 +5295,12 @@
 	    if(scale)
 	      {
 #ifdef __SIM_SSE3
-		__m128d twoto = _mm_set_pd(twotothe256, twotothe256);
+		simde__m128d twoto = simde_mm_set_pd(twotothe256, twotothe256);
 
 		for(l = 0; l < 20; l+=2)
 		  {
-		    __m128d ex3v = _mm_load_pd(&v[l]);
-		    _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));		    
+		    simde__m128d ex3v = simde_mm_load_pd(&v[l]);
+		    simde_mm_store_pd(&v[l], simde_mm_mul_pd(ex3v,twoto));		    
 		  }
 #else
 		for(l = 0; l < 20; l++)
@@ -5335,7 +5327,7 @@
 
 #ifdef __SIM_SSE3
 	    for(l = 0; l < 20; l+=2)
-	      _mm_store_pd(&v[l], _mm_setzero_pd());	      		
+	      simde_mm_store_pd(&v[l], simde_mm_setzero_pd());	      		
 #else
 	    for(l = 0; l < 20; l++)
 	      v[l] = 0.0;
@@ -5344,8 +5336,8 @@
 	  for(l = 0; l < 20; l++)
 	    {
 #ifdef __SIM_SSE3
-		__m128d x1v = _mm_setzero_pd();
-		__m128d x2v = _mm_setzero_pd();
+		simde__m128d x1v = simde_mm_setzero_pd();
+		simde__m128d x2v = simde_mm_setzero_pd();
 		double 
 		  *ev = &extEV[l * 20],
 		  *lv = &le[l * 20],
@@ -5354,20 +5346,20 @@
 
 		for(j = 0; j < 20; j+=2)
 		  {
-		    x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));		    
-		    x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
+		    x1v = simde_mm_add_pd(x1v, simde_mm_mul_pd(simde_mm_load_pd(&vl[j]), simde_mm_load_pd(&lv[j])));		    
+		    x2v = simde_mm_add_pd(x2v, simde_mm_mul_pd(simde_mm_load_pd(&vr[j]), simde_mm_load_pd(&rv[j])));
 		  }
 
-		x1v = _mm_hadd_pd(x1v, x1v);
-		x2v = _mm_hadd_pd(x2v, x2v);
+		x1v = simde_mm_hadd_pd(x1v, x1v);
+		x2v = simde_mm_hadd_pd(x2v, x2v);
 
-		x1v = _mm_mul_pd(x1v, x2v);
+		x1v = simde_mm_mul_pd(x1v, x2v);
 		
 		for(j = 0; j < 20; j+=2)
 		  {
-		    __m128d vv = _mm_load_pd(&v[j]);
-		    vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
-		    _mm_store_pd(&v[j], vv);
+		    simde__m128d vv = simde_mm_load_pd(&v[j]);
+		    vv = simde_mm_add_pd(vv, simde_mm_mul_pd(x1v, simde_mm_load_pd(&ev[j])));
+		    simde_mm_store_pd(&v[j], vv);
 		  }		    
 #else
 	      ump_x1 = 0.0;
@@ -5387,15 +5379,15 @@
 	    }
 #ifdef __SIM_SSE3
 	    { 	    
-	      __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
+	      simde__m128d minlikelihood_sse = simde_mm_set1_pd( minlikelihood );
 	      
 	      scale = 1;
 	      for(l = 0; scale && (l < 20); l += 2)
 		{
-		  __m128d vv = _mm_load_pd(&v[l]);
-		  __m128d v1 = _mm_and_pd(vv, absMask.m);
-		  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		  if(_mm_movemask_pd( v1 ) != 3)
+		  simde__m128d vv = simde_mm_load_pd(&v[l]);
+		  simde__m128d v1 = simde_mm_and_pd(vv, absMask.m);
+		  v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+		  if(simde_mm_movemask_pd( v1 ) != 3)
 		    scale = 0;
 		}	    	  
 	    }
@@ -5408,12 +5400,12 @@
 	   if(scale)
 	     {
 #ifdef __SIM_SSE3
-	       __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
+	       simde__m128d twoto = simde_mm_set_pd(twotothe256, twotothe256);
 	       
 	       for(l = 0; l < 20; l+=2)
 		 {
-		   __m128d ex3v = _mm_load_pd(&v[l]);		  
-		   _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
+		   simde__m128d ex3v = simde_mm_load_pd(&v[l]);		  
+		   simde_mm_store_pd(&v[l], simde_mm_mul_pd(ex3v,twoto));	
 		 }		   		  
 #else
 	       for(l = 0; l < 20; l++)
@@ -5926,21 +5918,21 @@
 		double *ll =  &left[k * 20];
 		double *rr =  &right[k * 20];
 		
-		__m128d umpX1v = _mm_setzero_pd();
-		__m128d umpX2v = _mm_setzero_pd();
+		simde__m128d umpX1v = simde_mm_setzero_pd();
+		simde__m128d umpX2v = simde_mm_setzero_pd();
 
 		for(l = 0; l < 20; l+=2)
 		  {
-		    __m128d vv = _mm_load_pd(&v[l]);
-		    umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));
-		    umpX2v = _mm_add_pd(umpX2v, _mm_mul_pd(vv, _mm_load_pd(&rr[l])));					
+		    simde__m128d vv = simde_mm_load_pd(&v[l]);
+		    umpX1v = simde_mm_add_pd(umpX1v, simde_mm_mul_pd(vv, simde_mm_load_pd(&ll[l])));
+		    umpX2v = simde_mm_add_pd(umpX2v, simde_mm_mul_pd(vv, simde_mm_load_pd(&rr[l])));					
 		  }
 		
-		umpX1v = _mm_hadd_pd(umpX1v, umpX1v);
-		umpX2v = _mm_hadd_pd(umpX2v, umpX2v);
+		umpX1v = simde_mm_hadd_pd(umpX1v, umpX1v);
+		umpX2v = simde_mm_hadd_pd(umpX2v, umpX2v);
 		
-		_mm_storel_pd(&umpX1[80 * i + k], umpX1v);
-		_mm_storel_pd(&umpX2[80 * i + k], umpX2v);
+		simde_mm_storel_pd(&umpX1[80 * i + k], umpX1v);
+		simde_mm_storel_pd(&umpX2[80 * i + k], umpX2v);
 #else
 		umpX1[80 * i + k] = 0.0;
 		umpX2[80 * i + k] = 0.0;
@@ -5964,24 +5956,24 @@
 		v = &x3[i * 80 + j * 20];
 
 #ifdef __SIM_SSE3
-		__m128d zero =  _mm_setzero_pd();
+		simde__m128d zero =  simde_mm_setzero_pd();
 		for(k = 0; k < 20; k+=2)		  		    
-		  _mm_store_pd(&v[k], zero);
+		  simde_mm_store_pd(&v[k], zero);
 
 		for(k = 0; k < 20; k++)
 		  { 
 		    double *eev = &extEV[k * 20];
 		    x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
-		    __m128d x1px2v = _mm_set1_pd(x1px2);
+		    simde__m128d x1px2v = simde_mm_set1_pd(x1px2);
 
 		    for(l = 0; l < 20; l+=2)
 		      {
-		      	__m128d vv = _mm_load_pd(&v[l]);
-			__m128d ee = _mm_load_pd(&eev[l]);
+		      	simde__m128d vv = simde_mm_load_pd(&v[l]);
+			simde__m128d ee = simde_mm_load_pd(&eev[l]);
 
-			vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
+			vv = simde_mm_add_pd(vv, simde_mm_mul_pd(x1px2v,ee));
 			
-			_mm_store_pd(&v[l], vv);
+			simde_mm_store_pd(&v[l], vv);
 		      }
 		  }
 
@@ -6016,16 +6008,16 @@
 #ifdef __SIM_SSE3
 		double *ll =  &left[k * 20];
 				
-		__m128d umpX1v = _mm_setzero_pd();
+		simde__m128d umpX1v = simde_mm_setzero_pd();
 		
 		for(l = 0; l < 20; l+=2)
 		  {
-		    __m128d vv = _mm_load_pd(&v[l]);
-		    umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));		    					
+		    simde__m128d vv = simde_mm_load_pd(&v[l]);
+		    umpX1v = simde_mm_add_pd(umpX1v, simde_mm_mul_pd(vv, simde_mm_load_pd(&ll[l])));		    					
 		  }
 		
-		umpX1v = _mm_hadd_pd(umpX1v, umpX1v);				
-		_mm_storel_pd(&umpX1[80 * i + k], umpX1v);		
+		umpX1v = simde_mm_hadd_pd(umpX1v, umpX1v);				
+		simde_mm_storel_pd(&umpX1[80 * i + k], umpX1v);		
 #else	    
 		umpX1[80 * i + k] = 0.0;
 
@@ -6047,40 +6039,40 @@
 		for(l = 0; l < 20; l++)
 		  {		   
 		    double *r =  &right[k * 400 + l * 20];
-		    __m128d ump_x2v = _mm_setzero_pd();	    
+		    simde__m128d ump_x2v = simde_mm_setzero_pd();	    
 		    
 		    for(j = 0; j < 20; j+= 2)
 		      {
-			__m128d vv = _mm_load_pd(&v[j]);
-			__m128d rr = _mm_load_pd(&r[j]);
-			ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr));
+			simde__m128d vv = simde_mm_load_pd(&v[j]);
+			simde__m128d rr = simde_mm_load_pd(&r[j]);
+			ump_x2v = simde_mm_add_pd(ump_x2v, simde_mm_mul_pd(vv, rr));
 		      }
 		     
-		    ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
+		    ump_x2v = simde_mm_hadd_pd(ump_x2v, ump_x2v);
 		    
-		    _mm_storel_pd(&ump_x2[l], ump_x2v);		   		     
+		    simde_mm_storel_pd(&ump_x2[l], ump_x2v);		   		     
 		  }
 
 		v = &(x3[80 * i + 20 * k]);
 
-		__m128d zero =  _mm_setzero_pd();
+		simde__m128d zero =  simde_mm_setzero_pd();
 		for(l = 0; l < 20; l+=2)		  		    
-		  _mm_store_pd(&v[l], zero);
+		  simde_mm_store_pd(&v[l], zero);
 		  
 		for(l = 0; l < 20; l++)
 		  {
 		    double *eev = &extEV[l * 20];
 		    x1px2 = uX1[k * 20 + l]  * ump_x2[l];
-		    __m128d x1px2v = _mm_set1_pd(x1px2);
+		    simde__m128d x1px2v = simde_mm_set1_pd(x1px2);
 		  
 		    for(j = 0; j < 20; j+=2)
 		      {
-			__m128d vv = _mm_load_pd(&v[j]);
-			__m128d ee = _mm_load_pd(&eev[j]);
+			simde__m128d vv = simde_mm_load_pd(&v[j]);
+			simde__m128d ee = simde_mm_load_pd(&eev[j]);
 			
-			vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
+			vv = simde_mm_add_pd(vv, simde_mm_mul_pd(x1px2v,ee));
 			
-			_mm_store_pd(&v[j], vv);
+			simde_mm_store_pd(&v[j], vv);
 		      }		     		    
 		  }			
 #else
@@ -6109,15 +6101,15 @@
 #ifdef __SIM_SSE3
 	    { 
 	      v = &(x3[80 * i]);
-	      __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
+	      simde__m128d minlikelihood_sse = simde_mm_set1_pd( minlikelihood );
 	      
 	      scale = 1;
 	      for(l = 0; scale && (l < 80); l += 2)
 		{
-		  __m128d vv = _mm_load_pd(&v[l]);
-		  __m128d v1 = _mm_and_pd(vv, absMask.m);
-		  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		  if(_mm_movemask_pd( v1 ) != 3)
+		  simde__m128d vv = simde_mm_load_pd(&v[l]);
+		  simde__m128d v1 = simde_mm_and_pd(vv, absMask.m);
+		  v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+		  if(simde_mm_movemask_pd( v1 ) != 3)
 		    scale = 0;
 		}	    	  
 	    }
@@ -6131,12 +6123,12 @@
 	    if (scale)
 	      {
 #ifdef __SIM_SSE3
-	       __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
+	       simde__m128d twoto = simde_mm_set_pd(twotothe256, twotothe256);
 	       
 	       for(l = 0; l < 80; l+=2)
 		 {
-		   __m128d ex3v = _mm_load_pd(&v[l]);		  
-		   _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
+		   simde__m128d ex3v = simde_mm_load_pd(&v[l]);		  
+		   simde_mm_store_pd(&v[l], simde_mm_mul_pd(ex3v,twoto));	
 		 }		   		  
 #else
 		for(l = 0; l < 80; l++)
@@ -6161,9 +6153,9 @@
 	     v =  &(x3[80 * i + 20 * k]);
 
 #ifdef __SIM_SSE3
-	     __m128d zero =  _mm_setzero_pd();
+	     simde__m128d zero =  simde_mm_setzero_pd();
 	     for(l = 0; l < 20; l+=2)		  		    
-	       _mm_store_pd(&v[l], zero);
+	       simde_mm_store_pd(&v[l], zero);
 #else
 	     for(l = 0; l < 20; l++)
 	       v[l] = 0;
@@ -6173,8 +6165,8 @@
 	       {		 
 #ifdef __SIM_SSE3
 		 {
-		   __m128d al = _mm_setzero_pd();
-		   __m128d ar = _mm_setzero_pd();
+		   simde__m128d al = simde_mm_setzero_pd();
+		   simde__m128d ar = simde_mm_setzero_pd();
 
 		   double *ll   = &left[k * 400 + l * 20];
 		   double *rr   = &right[k * 400 + l * 20];
@@ -6182,28 +6174,28 @@
 		   
 		   for(j = 0; j < 20; j+=2)
 		     {
-		       __m128d lv  = _mm_load_pd(&ll[j]);
-		       __m128d rv  = _mm_load_pd(&rr[j]);
-		       __m128d vll = _mm_load_pd(&vl[j]);
-		       __m128d vrr = _mm_load_pd(&vr[j]);
+		       simde__m128d lv  = simde_mm_load_pd(&ll[j]);
+		       simde__m128d rv  = simde_mm_load_pd(&rr[j]);
+		       simde__m128d vll = simde_mm_load_pd(&vl[j]);
+		       simde__m128d vrr = simde_mm_load_pd(&vr[j]);
 		       
-		       al = _mm_add_pd(al, _mm_mul_pd(vll, lv));
-		       ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv));
+		       al = simde_mm_add_pd(al, simde_mm_mul_pd(vll, lv));
+		       ar = simde_mm_add_pd(ar, simde_mm_mul_pd(vrr, rv));
 		     }  		 
 		       
-		   al = _mm_hadd_pd(al, al);
-		   ar = _mm_hadd_pd(ar, ar);
+		   al = simde_mm_hadd_pd(al, al);
+		   ar = simde_mm_hadd_pd(ar, ar);
 		   
-		   al = _mm_mul_pd(al, ar);
+		   al = simde_mm_mul_pd(al, ar);
 
 		   for(j = 0; j < 20; j+=2)
 		     {
-		       __m128d vv  = _mm_load_pd(&v[j]);
-		       __m128d EVV = _mm_load_pd(&EVEV[j]);
+		       simde__m128d vv  = simde_mm_load_pd(&v[j]);
+		       simde__m128d EVV = simde_mm_load_pd(&EVEV[j]);
 
-		       vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+		       vv = simde_mm_add_pd(vv, simde_mm_mul_pd(al, EVV));
 
-		       _mm_store_pd(&v[j], vv);
+		       simde_mm_store_pd(&v[j], vv);
 		     }		  		   		  
 		 }		 
 #else
@@ -6228,15 +6220,15 @@
 #ifdef __SIM_SSE3
 	 { 
 	   v = &(x3[80 * i]);
-	   __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
+	   simde__m128d minlikelihood_sse = simde_mm_set1_pd( minlikelihood );
 	   
 	   scale = 1;
 	   for(l = 0; scale && (l < 80); l += 2)
 	     {
-	       __m128d vv = _mm_load_pd(&v[l]);
-	       __m128d v1 = _mm_and_pd(vv, absMask.m);
-	       v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	       if(_mm_movemask_pd( v1 ) != 3)
+	       simde__m128d vv = simde_mm_load_pd(&v[l]);
+	       simde__m128d v1 = simde_mm_and_pd(vv, absMask.m);
+	       v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+	       if(simde_mm_movemask_pd( v1 ) != 3)
 		 scale = 0;
 	     }	    	  
 	 }
@@ -6250,12 +6242,12 @@
 	 if (scale)
 	   {
 #ifdef __SIM_SSE3
-	       __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
+	       simde__m128d twoto = simde_mm_set_pd(twotothe256, twotothe256);
 	       
 	       for(l = 0; l < 80; l+=2)
 		 {
-		   __m128d ex3v = _mm_load_pd(&v[l]);		  
-		   _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
+		   simde__m128d ex3v = simde_mm_load_pd(&v[l]);		  
+		   simde_mm_store_pd(&v[l], simde_mm_mul_pd(ex3v,twoto));	
 		 }		   		  
 #else	     
 	     for(l = 0; l < 80; l++)
@@ -6311,21 +6303,21 @@
 		double *ll =  &left[k * 20];
 		double *rr =  &right[k * 20];
 		
-		__m128d umpX1v = _mm_setzero_pd();
-		__m128d umpX2v = _mm_setzero_pd();
+		simde__m128d umpX1v = simde_mm_setzero_pd();
+		simde__m128d umpX2v = simde_mm_setzero_pd();
 
 		for(l = 0; l < 20; l+=2)
 		  {
-		    __m128d vv = _mm_load_pd(&v[l]);
-		    umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));
-		    umpX2v = _mm_add_pd(umpX2v, _mm_mul_pd(vv, _mm_load_pd(&rr[l])));					
+		    simde__m128d vv = simde_mm_load_pd(&v[l]);
+		    umpX1v = simde_mm_add_pd(umpX1v, simde_mm_mul_pd(vv, simde_mm_load_pd(&ll[l])));
+		    umpX2v = simde_mm_add_pd(umpX2v, simde_mm_mul_pd(vv, simde_mm_load_pd(&rr[l])));					
 		  }
 		
-		umpX1v = _mm_hadd_pd(umpX1v, umpX1v);
-		umpX2v = _mm_hadd_pd(umpX2v, umpX2v);
+		umpX1v = simde_mm_hadd_pd(umpX1v, umpX1v);
+		umpX2v = simde_mm_hadd_pd(umpX2v, umpX2v);
 		
-		_mm_storel_pd(&umpX1[80 * i + k], umpX1v);
-		_mm_storel_pd(&umpX2[80 * i + k], umpX2v);
+		simde_mm_storel_pd(&umpX1[80 * i + k], umpX1v);
+		simde_mm_storel_pd(&umpX2[80 * i + k], umpX2v);
 #else
 		umpX1[80 * i + k] = 0.0;
 		umpX2[80 * i + k] = 0.0;
@@ -6349,24 +6341,24 @@
 		v = &x3[i * 80 + j * 20];
 
 #ifdef __SIM_SSE3
-		__m128d zero =  _mm_setzero_pd();
+		simde__m128d zero =  simde_mm_setzero_pd();
 		for(k = 0; k < 20; k+=2)		  		    
-		  _mm_store_pd(&v[k], zero);
+		  simde_mm_store_pd(&v[k], zero);
 
 		for(k = 0; k < 20; k++)
 		  { 
 		    double *eev = &extEV[j][k * 20];
 		    x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
-		    __m128d x1px2v = _mm_set1_pd(x1px2);
+		    simde__m128d x1px2v = simde_mm_set1_pd(x1px2);
 
 		    for(l = 0; l < 20; l+=2)
 		      {
-		      	__m128d vv = _mm_load_pd(&v[l]);
-			__m128d ee = _mm_load_pd(&eev[l]);
+		      	simde__m128d vv = simde_mm_load_pd(&v[l]);
+			simde__m128d ee = simde_mm_load_pd(&eev[l]);
 
-			vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
+			vv = simde_mm_add_pd(vv, simde_mm_mul_pd(x1px2v,ee));
 			
-			_mm_store_pd(&v[l], vv);
+			simde_mm_store_pd(&v[l], vv);
 		      }
 		  }
 
@@ -6402,16 +6394,16 @@
 #ifdef __SIM_SSE3
 		double *ll =  &left[k * 20];
 				
-		__m128d umpX1v = _mm_setzero_pd();
+		simde__m128d umpX1v = simde_mm_setzero_pd();
 		
 		for(l = 0; l < 20; l+=2)
 		  {
-		    __m128d vv = _mm_load_pd(&v[l]);
-		    umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));		    					
+		    simde__m128d vv = simde_mm_load_pd(&v[l]);
+		    umpX1v = simde_mm_add_pd(umpX1v, simde_mm_mul_pd(vv, simde_mm_load_pd(&ll[l])));		    					
 		  }
 		
-		umpX1v = _mm_hadd_pd(umpX1v, umpX1v);				
-		_mm_storel_pd(&umpX1[80 * i + k], umpX1v);		
+		umpX1v = simde_mm_hadd_pd(umpX1v, umpX1v);				
+		simde_mm_storel_pd(&umpX1[80 * i + k], umpX1v);		
 #else	    
 		umpX1[80 * i + k] = 0.0;
 
@@ -6433,40 +6425,40 @@
 		for(l = 0; l < 20; l++)
 		  {		   
 		    double *r =  &right[k * 400 + l * 20];
-		    __m128d ump_x2v = _mm_setzero_pd();	    
+		    simde__m128d ump_x2v = simde_mm_setzero_pd();	    
 		    
 		    for(j = 0; j < 20; j+= 2)
 		      {
-			__m128d vv = _mm_load_pd(&v[j]);
-			__m128d rr = _mm_load_pd(&r[j]);
-			ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr));
+			simde__m128d vv = simde_mm_load_pd(&v[j]);
+			simde__m128d rr = simde_mm_load_pd(&r[j]);
+			ump_x2v = simde_mm_add_pd(ump_x2v, simde_mm_mul_pd(vv, rr));
 		      }
 		     
-		    ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
+		    ump_x2v = simde_mm_hadd_pd(ump_x2v, ump_x2v);
 		    
-		    _mm_storel_pd(&ump_x2[l], ump_x2v);		   		     
+		    simde_mm_storel_pd(&ump_x2[l], ump_x2v);		   		     
 		  }
 
 		v = &(x3[80 * i + 20 * k]);
 
-		__m128d zero =  _mm_setzero_pd();
+		simde__m128d zero =  simde_mm_setzero_pd();
 		for(l = 0; l < 20; l+=2)		  		    
-		  _mm_store_pd(&v[l], zero);
+		  simde_mm_store_pd(&v[l], zero);
 		  
 		for(l = 0; l < 20; l++)
 		  {
 		    double *eev = &extEV[k][l * 20];
 		    x1px2 = uX1[k * 20 + l]  * ump_x2[l];
-		    __m128d x1px2v = _mm_set1_pd(x1px2);
+		    simde__m128d x1px2v = simde_mm_set1_pd(x1px2);
 		  
 		    for(j = 0; j < 20; j+=2)
 		      {
-			__m128d vv = _mm_load_pd(&v[j]);
-			__m128d ee = _mm_load_pd(&eev[j]);
+			simde__m128d vv = simde_mm_load_pd(&v[j]);
+			simde__m128d ee = simde_mm_load_pd(&eev[j]);
 			
-			vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
+			vv = simde_mm_add_pd(vv, simde_mm_mul_pd(x1px2v,ee));
 			
-			_mm_store_pd(&v[j], vv);
+			simde_mm_store_pd(&v[j], vv);
 		      }		     		    
 		  }			
 #else
@@ -6495,15 +6487,15 @@
 #ifdef __SIM_SSE3
 	    { 
 	      v = &(x3[80 * i]);
-	      __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
+	      simde__m128d minlikelihood_sse = simde_mm_set1_pd( minlikelihood );
 	      
 	      scale = 1;
 	      for(l = 0; scale && (l < 80); l += 2)
 		{
-		  __m128d vv = _mm_load_pd(&v[l]);
-		  __m128d v1 = _mm_and_pd(vv, absMask.m);
-		  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		  if(_mm_movemask_pd( v1 ) != 3)
+		  simde__m128d vv = simde_mm_load_pd(&v[l]);
+		  simde__m128d v1 = simde_mm_and_pd(vv, absMask.m);
+		  v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+		  if(simde_mm_movemask_pd( v1 ) != 3)
 		    scale = 0;
 		}	    	  
 	    }
@@ -6517,12 +6509,12 @@
 	    if (scale)
 	      {
 #ifdef __SIM_SSE3
-	       __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
+	       simde__m128d twoto = simde_mm_set_pd(twotothe256, twotothe256);
 	       
 	       for(l = 0; l < 80; l+=2)
 		 {
-		   __m128d ex3v = _mm_load_pd(&v[l]);		  
-		   _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
+		   simde__m128d ex3v = simde_mm_load_pd(&v[l]);		  
+		   simde_mm_store_pd(&v[l], simde_mm_mul_pd(ex3v,twoto));	
 		 }		   		  
 #else
 		for(l = 0; l < 80; l++)
@@ -6547,9 +6539,9 @@
 	     v =  &(x3[80 * i + 20 * k]);
 
 #ifdef __SIM_SSE3
-	     __m128d zero =  _mm_setzero_pd();
+	     simde__m128d zero =  simde_mm_setzero_pd();
 	     for(l = 0; l < 20; l+=2)		  		    
-	       _mm_store_pd(&v[l], zero);
+	       simde_mm_store_pd(&v[l], zero);
 #else
 	     for(l = 0; l < 20; l++)
 	       v[l] = 0;
@@ -6559,8 +6551,8 @@
 	       {		 
 #ifdef __SIM_SSE3
 		 {
-		   __m128d al = _mm_setzero_pd();
-		   __m128d ar = _mm_setzero_pd();
+		   simde__m128d al = simde_mm_setzero_pd();
+		   simde__m128d ar = simde_mm_setzero_pd();
 
 		   double *ll   = &left[k * 400 + l * 20];
 		   double *rr   = &right[k * 400 + l * 20];
@@ -6568,28 +6560,28 @@
 		   
 		   for(j = 0; j < 20; j+=2)
 		     {
-		       __m128d lv  = _mm_load_pd(&ll[j]);
-		       __m128d rv  = _mm_load_pd(&rr[j]);
-		       __m128d vll = _mm_load_pd(&vl[j]);
-		       __m128d vrr = _mm_load_pd(&vr[j]);
+		       simde__m128d lv  = simde_mm_load_pd(&ll[j]);
+		       simde__m128d rv  = simde_mm_load_pd(&rr[j]);
+		       simde__m128d vll = simde_mm_load_pd(&vl[j]);
+		       simde__m128d vrr = simde_mm_load_pd(&vr[j]);
 		       
-		       al = _mm_add_pd(al, _mm_mul_pd(vll, lv));
-		       ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv));
+		       al = simde_mm_add_pd(al, simde_mm_mul_pd(vll, lv));
+		       ar = simde_mm_add_pd(ar, simde_mm_mul_pd(vrr, rv));
 		     }  		 
 		       
-		   al = _mm_hadd_pd(al, al);
-		   ar = _mm_hadd_pd(ar, ar);
+		   al = simde_mm_hadd_pd(al, al);
+		   ar = simde_mm_hadd_pd(ar, ar);
 		   
-		   al = _mm_mul_pd(al, ar);
+		   al = simde_mm_mul_pd(al, ar);
 
 		   for(j = 0; j < 20; j+=2)
 		     {
-		       __m128d vv  = _mm_load_pd(&v[j]);
-		       __m128d EVV = _mm_load_pd(&EVEV[j]);
+		       simde__m128d vv  = simde_mm_load_pd(&v[j]);
+		       simde__m128d EVV = simde_mm_load_pd(&EVEV[j]);
 
-		       vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+		       vv = simde_mm_add_pd(vv, simde_mm_mul_pd(al, EVV));
 
-		       _mm_store_pd(&v[j], vv);
+		       simde_mm_store_pd(&v[j], vv);
 		     }		  		   		  
 		 }		 
 #else
@@ -6614,15 +6606,15 @@
 #ifdef __SIM_SSE3
 	 { 
 	   v = &(x3[80 * i]);
-	   __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
+	   simde__m128d minlikelihood_sse = simde_mm_set1_pd( minlikelihood );
 	   
 	   scale = 1;
 	   for(l = 0; scale && (l < 80); l += 2)
 	     {
-	       __m128d vv = _mm_load_pd(&v[l]);
-	       __m128d v1 = _mm_and_pd(vv, absMask.m);
-	       v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	       if(_mm_movemask_pd( v1 ) != 3)
+	       simde__m128d vv = simde_mm_load_pd(&v[l]);
+	       simde__m128d v1 = simde_mm_and_pd(vv, absMask.m);
+	       v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+	       if(simde_mm_movemask_pd( v1 ) != 3)
 		 scale = 0;
 	     }	    	  
 	 }
@@ -6636,12 +6628,12 @@
 	 if (scale)
 	   {
 #ifdef __SIM_SSE3
-	       __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
+	       simde__m128d twoto = simde_mm_set_pd(twotothe256, twotothe256);
 	       
 	       for(l = 0; l < 80; l+=2)
 		 {
-		   __m128d ex3v = _mm_load_pd(&v[l]);		  
-		   _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
+		   simde__m128d ex3v = simde_mm_load_pd(&v[l]);		  
+		   simde_mm_store_pd(&v[l], simde_mm_mul_pd(ex3v,twoto));	
 		 }		   		  
 #else	     
 	     for(l = 0; l < 80; l++)
@@ -6703,21 +6695,21 @@
             double *ll =  &left[k * 20];
             double *rr =  &right[k * 20];
 
-            __m128d umpX1v = _mm_setzero_pd();
-            __m128d umpX2v = _mm_setzero_pd();
+            simde__m128d umpX1v = simde_mm_setzero_pd();
+            simde__m128d umpX2v = simde_mm_setzero_pd();
 
             for(l = 0; l < 20; l+=2)
             {
-              __m128d vv = _mm_load_pd(&v[l]);
-              umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));
-              umpX2v = _mm_add_pd(umpX2v, _mm_mul_pd(vv, _mm_load_pd(&rr[l])));					
+              simde__m128d vv = simde_mm_load_pd(&v[l]);
+              umpX1v = simde_mm_add_pd(umpX1v, simde_mm_mul_pd(vv, simde_mm_load_pd(&ll[l])));
+              umpX2v = simde_mm_add_pd(umpX2v, simde_mm_mul_pd(vv, simde_mm_load_pd(&rr[l])));					
             }
 
-            umpX1v = _mm_hadd_pd(umpX1v, umpX1v);
-            umpX2v = _mm_hadd_pd(umpX2v, umpX2v);
+            umpX1v = simde_mm_hadd_pd(umpX1v, umpX1v);
+            umpX2v = simde_mm_hadd_pd(umpX2v, umpX2v);
 
-            _mm_storel_pd(&umpX1[80 * i + k], umpX1v);
-            _mm_storel_pd(&umpX2[80 * i + k], umpX2v);
+            simde_mm_storel_pd(&umpX1[80 * i + k], umpX1v);
+            simde_mm_storel_pd(&umpX2[80 * i + k], umpX2v);
           }
         }
 
@@ -6729,24 +6721,24 @@
           {
             v = &x3_gapColumn[j * 20];
 
-            __m128d zero =  _mm_setzero_pd();
+            simde__m128d zero =  simde_mm_setzero_pd();
             for(k = 0; k < 20; k+=2)		  		    
-              _mm_store_pd(&v[k], zero);
+              simde_mm_store_pd(&v[k], zero);
 
             for(k = 0; k < 20; k++)
             { 
               double *eev = &extEV[k * 20];
               x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
-              __m128d x1px2v = _mm_set1_pd(x1px2);
+              simde__m128d x1px2v = simde_mm_set1_pd(x1px2);
 
               for(l = 0; l < 20; l+=2)
               {
-                __m128d vv = _mm_load_pd(&v[l]);
-                __m128d ee = _mm_load_pd(&eev[l]);
+                simde__m128d vv = simde_mm_load_pd(&v[l]);
+                simde__m128d ee = simde_mm_load_pd(&eev[l]);
 
-                vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
+                vv = simde_mm_add_pd(vv, simde_mm_mul_pd(x1px2v,ee));
 
-                _mm_store_pd(&v[l], vv);
+                simde_mm_store_pd(&v[l], vv);
               }
             }
           }	   
@@ -6764,24 +6756,24 @@
               v = &x3_ptr[j * 20];
 
 
-              __m128d zero =  _mm_setzero_pd();
+              simde__m128d zero =  simde_mm_setzero_pd();
               for(k = 0; k < 20; k+=2)		  		    
-                _mm_store_pd(&v[k], zero);
+                simde_mm_store_pd(&v[k], zero);
 
               for(k = 0; k < 20; k++)
               { 
                 double *eev = &extEV[k * 20];
                 x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
-                __m128d x1px2v = _mm_set1_pd(x1px2);
+                simde__m128d x1px2v = simde_mm_set1_pd(x1px2);
 
                 for(l = 0; l < 20; l+=2)
                 {
-                  __m128d vv = _mm_load_pd(&v[l]);
-                  __m128d ee = _mm_load_pd(&eev[l]);
+                  simde__m128d vv = simde_mm_load_pd(&v[l]);
+                  simde__m128d ee = simde_mm_load_pd(&eev[l]);
 
-                  vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
+                  vv = simde_mm_add_pd(vv, simde_mm_mul_pd(x1px2v,ee));
 
-                  _mm_store_pd(&v[l], vv);
+                  simde_mm_store_pd(&v[l], vv);
                 }
               }
             }	   
@@ -6803,16 +6795,16 @@
           {
             double *ll =  &left[k * 20];
 
-            __m128d umpX1v = _mm_setzero_pd();
+            simde__m128d umpX1v = simde_mm_setzero_pd();
 
             for(l = 0; l < 20; l+=2)
             {
-              __m128d vv = _mm_load_pd(&v[l]);
-              umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));		    					
+              simde__m128d vv = simde_mm_load_pd(&v[l]);
+              umpX1v = simde_mm_add_pd(umpX1v, simde_mm_mul_pd(vv, simde_mm_load_pd(&ll[l])));		    					
             }
 
-            umpX1v = _mm_hadd_pd(umpX1v, umpX1v);				
-            _mm_storel_pd(&umpX1[80 * i + k], umpX1v);		
+            umpX1v = simde_mm_hadd_pd(umpX1v, umpX1v);				
+            simde_mm_storel_pd(&umpX1[80 * i + k], umpX1v);		
 
           }
         }
@@ -6827,40 +6819,40 @@
             for(l = 0; l < 20; l++)
             {		   
               double *r =  &right[k * 400 + l * 20];
-              __m128d ump_x2v = _mm_setzero_pd();	    
+              simde__m128d ump_x2v = simde_mm_setzero_pd();	    
 
               for(j = 0; j < 20; j+= 2)
               {
-                __m128d vv = _mm_load_pd(&v[j]);
-                __m128d rr = _mm_load_pd(&r[j]);
-                ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr));
+                simde__m128d vv = simde_mm_load_pd(&v[j]);
+                simde__m128d rr = simde_mm_load_pd(&r[j]);
+                ump_x2v = simde_mm_add_pd(ump_x2v, simde_mm_mul_pd(vv, rr));
               }
 
-              ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
+              ump_x2v = simde_mm_hadd_pd(ump_x2v, ump_x2v);
 
-              _mm_storel_pd(&ump_x2[l], ump_x2v);		   		     
+              simde_mm_storel_pd(&ump_x2[l], ump_x2v);		   		     
             }
 
             v = &(x3_gapColumn[20 * k]);
 
-            __m128d zero =  _mm_setzero_pd();
+            simde__m128d zero =  simde_mm_setzero_pd();
             for(l = 0; l < 20; l+=2)		  		    
-              _mm_store_pd(&v[l], zero);
+              simde_mm_store_pd(&v[l], zero);
 
             for(l = 0; l < 20; l++)
             {
               double *eev = &extEV[l * 20];
               x1px2 = uX1[k * 20 + l]  * ump_x2[l];
-              __m128d x1px2v = _mm_set1_pd(x1px2);
+              simde__m128d x1px2v = simde_mm_set1_pd(x1px2);
 
               for(j = 0; j < 20; j+=2)
               {
-                __m128d vv = _mm_load_pd(&v[j]);
-                __m128d ee = _mm_load_pd(&eev[j]);
+                simde__m128d vv = simde_mm_load_pd(&v[j]);
+                simde__m128d ee = simde_mm_load_pd(&eev[j]);
 
-                vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
+                vv = simde_mm_add_pd(vv, simde_mm_mul_pd(x1px2v,ee));
 
-                _mm_store_pd(&v[j], vv);
+                simde_mm_store_pd(&v[j], vv);
               }		     		    
             }			
 
@@ -6868,15 +6860,15 @@
 
           { 
             v = x3_gapColumn;
-            __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
+            simde__m128d minlikelihood_sse = simde_mm_set1_pd( minlikelihood );
 
             scale = 1;
             for(l = 0; scale && (l < 80); l += 2)
             {
-              __m128d vv = _mm_load_pd(&v[l]);
-              __m128d v1 = _mm_and_pd(vv, absMask.m);
-              v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-              if(_mm_movemask_pd( v1 ) != 3)
+              simde__m128d vv = simde_mm_load_pd(&v[l]);
+              simde__m128d v1 = simde_mm_and_pd(vv, absMask.m);
+              v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+              if(simde_mm_movemask_pd( v1 ) != 3)
                 scale = 0;
             }	    	  
           }
@@ -6885,12 +6877,12 @@
           if (scale)
           {
             gapScaling = 1;
-            __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
+            simde__m128d twoto = simde_mm_set_pd(twotothe256, twotothe256);
 
             for(l = 0; l < 80; l+=2)
             {
-              __m128d ex3v = _mm_load_pd(&v[l]);		  
-              _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
+              simde__m128d ex3v = simde_mm_load_pd(&v[l]);		  
+              simde_mm_store_pd(&v[l], simde_mm_mul_pd(ex3v,twoto));	
             }		   		  	      	    	       
           }
         }
@@ -6926,40 +6918,40 @@
               for(l = 0; l < 20; l++)
               {		   
                 double *r =  &right[k * 400 + l * 20];
-                __m128d ump_x2v = _mm_setzero_pd();	    
+                simde__m128d ump_x2v = simde_mm_setzero_pd();	    
 
                 for(j = 0; j < 20; j+= 2)
                 {
-                  __m128d vv = _mm_load_pd(&v[j]);
-                  __m128d rr = _mm_load_pd(&r[j]);
-                  ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr));
+                  simde__m128d vv = simde_mm_load_pd(&v[j]);
+                  simde__m128d rr = simde_mm_load_pd(&r[j]);
+                  ump_x2v = simde_mm_add_pd(ump_x2v, simde_mm_mul_pd(vv, rr));
                 }
 
-                ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
+                ump_x2v = simde_mm_hadd_pd(ump_x2v, ump_x2v);
 
-                _mm_storel_pd(&ump_x2[l], ump_x2v);		   		     
+                simde_mm_storel_pd(&ump_x2[l], ump_x2v);		   		     
               }
 
               v = &x3_ptr[20 * k];
 
-              __m128d zero =  _mm_setzero_pd();
+              simde__m128d zero =  simde_mm_setzero_pd();
               for(l = 0; l < 20; l+=2)		  		    
-                _mm_store_pd(&v[l], zero);
+                simde_mm_store_pd(&v[l], zero);
 
               for(l = 0; l < 20; l++)
               {
                 double *eev = &extEV[l * 20];
                 x1px2 = uX1[k * 20 + l]  * ump_x2[l];
-                __m128d x1px2v = _mm_set1_pd(x1px2);
+                simde__m128d x1px2v = simde_mm_set1_pd(x1px2);
 
                 for(j = 0; j < 20; j+=2)
                 {
-                  __m128d vv = _mm_load_pd(&v[j]);
-                  __m128d ee = _mm_load_pd(&eev[j]);
+                  simde__m128d vv = simde_mm_load_pd(&v[j]);
+                  simde__m128d ee = simde_mm_load_pd(&eev[j]);
 
-                  vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
+                  vv = simde_mm_add_pd(vv, simde_mm_mul_pd(x1px2v,ee));
 
-                  _mm_store_pd(&v[j], vv);
+                  simde_mm_store_pd(&v[j], vv);
                 }		     		    
               }			
 
@@ -6968,15 +6960,15 @@
 
             { 
               v = x3_ptr;
-              __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
+              simde__m128d minlikelihood_sse = simde_mm_set1_pd( minlikelihood );
 
               scale = 1;
               for(l = 0; scale && (l < 80); l += 2)
               {
-                __m128d vv = _mm_load_pd(&v[l]);
-                __m128d v1 = _mm_and_pd(vv, absMask.m);
-                v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-                if(_mm_movemask_pd( v1 ) != 3)
+                simde__m128d vv = simde_mm_load_pd(&v[l]);
+                simde__m128d v1 = simde_mm_and_pd(vv, absMask.m);
+                v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+                if(simde_mm_movemask_pd( v1 ) != 3)
                   scale = 0;
               }	    	  
             }
@@ -6984,12 +6976,12 @@
 
             if (scale)
             {
-              __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
+              simde__m128d twoto = simde_mm_set_pd(twotothe256, twotothe256);
 
               for(l = 0; l < 80; l+=2)
               {
-                __m128d ex3v = _mm_load_pd(&v[l]);		  
-                _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
+                simde__m128d ex3v = simde_mm_load_pd(&v[l]);		  
+                simde_mm_store_pd(&v[l], simde_mm_mul_pd(ex3v,twoto));	
               }		   		  
 
               if(useFastScaling)
@@ -7011,15 +7003,15 @@
           vr = &(x2_gapColumn[20 * k]);
           v =  &(x3_gapColumn[20 * k]);
 
-          __m128d zero =  _mm_setzero_pd();
+          simde__m128d zero =  simde_mm_setzero_pd();
           for(l = 0; l < 20; l+=2)		  		    
-            _mm_store_pd(&v[l], zero);
+            simde_mm_store_pd(&v[l], zero);
 
           for(l = 0; l < 20; l++)
           {		 
             {
-              __m128d al = _mm_setzero_pd();
-              __m128d ar = _mm_setzero_pd();
+              simde__m128d al = simde_mm_setzero_pd();
+              simde__m128d ar = simde_mm_setzero_pd();
 
               double *ll   = &left[k * 400 + l * 20];
               double *rr   = &right[k * 400 + l * 20];
@@ -7027,28 +7019,28 @@
 
               for(j = 0; j < 20; j+=2)
               {
-                __m128d lv  = _mm_load_pd(&ll[j]);
-                __m128d rv  = _mm_load_pd(&rr[j]);
-                __m128d vll = _mm_load_pd(&vl[j]);
-                __m128d vrr = _mm_load_pd(&vr[j]);
+                simde__m128d lv  = simde_mm_load_pd(&ll[j]);
+                simde__m128d rv  = simde_mm_load_pd(&rr[j]);
+                simde__m128d vll = simde_mm_load_pd(&vl[j]);
+                simde__m128d vrr = simde_mm_load_pd(&vr[j]);
 
-                al = _mm_add_pd(al, _mm_mul_pd(vll, lv));
-                ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv));
+                al = simde_mm_add_pd(al, simde_mm_mul_pd(vll, lv));
+                ar = simde_mm_add_pd(ar, simde_mm_mul_pd(vrr, rv));
               }  		 
 
-              al = _mm_hadd_pd(al, al);
-              ar = _mm_hadd_pd(ar, ar);
+              al = simde_mm_hadd_pd(al, al);
+              ar = simde_mm_hadd_pd(ar, ar);
 
-              al = _mm_mul_pd(al, ar);
+              al = simde_mm_mul_pd(al, ar);
 
               for(j = 0; j < 20; j+=2)
               {
-                __m128d vv  = _mm_load_pd(&v[j]);
-                __m128d EVV = _mm_load_pd(&EVEV[j]);
+                simde__m128d vv  = simde_mm_load_pd(&v[j]);
+                simde__m128d EVV = simde_mm_load_pd(&EVEV[j]);
 
-                vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+                vv = simde_mm_add_pd(vv, simde_mm_mul_pd(al, EVV));
 
-                _mm_store_pd(&v[j], vv);
+                simde_mm_store_pd(&v[j], vv);
               }		  		   		  
             }		 
 
@@ -7058,15 +7050,15 @@
 
         { 
           v = x3_gapColumn;
-          __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
+          simde__m128d minlikelihood_sse = simde_mm_set1_pd( minlikelihood );
 
           scale = 1;
           for(l = 0; scale && (l < 80); l += 2)
           {
-            __m128d vv = _mm_load_pd(&v[l]);
-            __m128d v1 = _mm_and_pd(vv, absMask.m);
-            v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-            if(_mm_movemask_pd( v1 ) != 3)
+            simde__m128d vv = simde_mm_load_pd(&v[l]);
+            simde__m128d v1 = simde_mm_and_pd(vv, absMask.m);
+            v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+            if(simde_mm_movemask_pd( v1 ) != 3)
               scale = 0;
           }	    	  
         }
@@ -7074,12 +7066,12 @@
         if (scale)
         {
           gapScaling = 1;
-          __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
+          simde__m128d twoto = simde_mm_set_pd(twotothe256, twotothe256);
 
           for(l = 0; l < 80; l+=2)
           {
-            __m128d ex3v = _mm_load_pd(&v[l]);		  
-            _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
+            simde__m128d ex3v = simde_mm_load_pd(&v[l]);		  
+            simde_mm_store_pd(&v[l], simde_mm_mul_pd(ex3v,twoto));	
           }		   		  
 
 
@@ -7122,15 +7114,15 @@
             vr = &(x2v[20 * k]);
             v =  &x3_ptr[20 * k];
 
-            __m128d zero =  _mm_setzero_pd();
+            simde__m128d zero =  simde_mm_setzero_pd();
             for(l = 0; l < 20; l+=2)		  		    
-              _mm_store_pd(&v[l], zero);
+              simde_mm_store_pd(&v[l], zero);
 
             for(l = 0; l < 20; l++)
             {		 
               {
-                __m128d al = _mm_setzero_pd();
-                __m128d ar = _mm_setzero_pd();
+                simde__m128d al = simde_mm_setzero_pd();
+                simde__m128d ar = simde_mm_setzero_pd();
 
                 double *ll   = &left[k * 400 + l * 20];
                 double *rr   = &right[k * 400 + l * 20];
@@ -7138,28 +7130,28 @@
 
                 for(j = 0; j < 20; j+=2)
                 {
-                  __m128d lv  = _mm_load_pd(&ll[j]);
-                  __m128d rv  = _mm_load_pd(&rr[j]);
-                  __m128d vll = _mm_load_pd(&vl[j]);
-                  __m128d vrr = _mm_load_pd(&vr[j]);
+                  simde__m128d lv  = simde_mm_load_pd(&ll[j]);
+                  simde__m128d rv  = simde_mm_load_pd(&rr[j]);
+                  simde__m128d vll = simde_mm_load_pd(&vl[j]);
+                  simde__m128d vrr = simde_mm_load_pd(&vr[j]);
 
-                  al = _mm_add_pd(al, _mm_mul_pd(vll, lv));
-                  ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv));
+                  al = simde_mm_add_pd(al, simde_mm_mul_pd(vll, lv));
+                  ar = simde_mm_add_pd(ar, simde_mm_mul_pd(vrr, rv));
                 }  		 
 
-                al = _mm_hadd_pd(al, al);
-                ar = _mm_hadd_pd(ar, ar);
+                al = simde_mm_hadd_pd(al, al);
+                ar = simde_mm_hadd_pd(ar, ar);
 
-                al = _mm_mul_pd(al, ar);
+                al = simde_mm_mul_pd(al, ar);
 
                 for(j = 0; j < 20; j+=2)
                 {
-                  __m128d vv  = _mm_load_pd(&v[j]);
-                  __m128d EVV = _mm_load_pd(&EVEV[j]);
+                  simde__m128d vv  = simde_mm_load_pd(&v[j]);
+                  simde__m128d EVV = simde_mm_load_pd(&EVEV[j]);
 
-                  vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+                  vv = simde_mm_add_pd(vv, simde_mm_mul_pd(al, EVV));
 
-                  _mm_store_pd(&v[j], vv);
+                  simde_mm_store_pd(&v[j], vv);
                 }		  		   		  
               }		 
 
@@ -7170,15 +7162,15 @@
 
           { 
             v = x3_ptr;
-            __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
+            simde__m128d minlikelihood_sse = simde_mm_set1_pd( minlikelihood );
 
             scale = 1;
             for(l = 0; scale && (l < 80); l += 2)
             {
-              __m128d vv = _mm_load_pd(&v[l]);
-              __m128d v1 = _mm_and_pd(vv, absMask.m);
-              v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-              if(_mm_movemask_pd( v1 ) != 3)
+              simde__m128d vv = simde_mm_load_pd(&v[l]);
+              simde__m128d v1 = simde_mm_and_pd(vv, absMask.m);
+              v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+              if(simde_mm_movemask_pd( v1 ) != 3)
                 scale = 0;
             }	    	  
           }
@@ -7186,12 +7178,12 @@
 
           if (scale)
           {
-            __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
+            simde__m128d twoto = simde_mm_set_pd(twotothe256, twotothe256);
 
             for(l = 0; l < 80; l+=2)
             {
-              __m128d ex3v = _mm_load_pd(&v[l]);		  
-              _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
+              simde__m128d ex3v = simde_mm_load_pd(&v[l]);		  
+              simde_mm_store_pd(&v[l], simde_mm_mul_pd(ex3v,twoto));	
             }		   		  
 
             if(useFastScaling)
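
For readers following the hunks above: the change is purely mechanical. Every x86 type and intrinsic is replaced by its SIMDE counterpart (__m128d becomes simde__m128d, _mm_foo becomes simde_mm_foo), which compiles down to the native SSE instruction on x86 and to a portable fallback (scalar, NEON, etc.) on other architectures. Below is a minimal standalone sketch, not part of the patch, of the recurring stride-2 multiply-accumulate / horizontal-add pattern seen in the umpX1v and ump_x2v loops. The include path, the dot20 name, and the test data are illustrative assumptions only; the simde_mm_* calls themselves are real SIMDE API, mirroring the Intel intrinsics one to one.

  /* sketch.c - illustrative only; assumes the SIMDE headers are on the
   * include path (this patch vendors them under debian/include/). */
  #include <simde/x86/sse3.h>

  /* Dot product of two 16-byte-aligned arrays of 20 doubles, using the
   * same stride-2 multiply-accumulate + hadd + storel pattern as the
   * umpX1v/ump_x2v loops in the hunks above. */
  static double dot20(const double *a, const double *b)
  {
    simde__m128d acc = simde_mm_setzero_pd();
    int l;

    for (l = 0; l < 20; l += 2)
      acc = simde_mm_add_pd(acc,
                            simde_mm_mul_pd(simde_mm_load_pd(&a[l]),
                                            simde_mm_load_pd(&b[l])));

    acc = simde_mm_hadd_pd(acc, acc);  /* fold the two lanes together */

    {
      double r;
      simde_mm_storel_pd(&r, acc);     /* keep the low lane only, as the
                                          code above does into umpX1[] */
      return r;
    }
  }

  int main(void)
  {
    static double a[20] __attribute__ ((aligned (16)));
    static double b[20] __attribute__ ((aligned (16)));
    int i;

    for (i = 0; i < 20; i++) { a[i] = 1.0; b[i] = (double) i; }
    return dot20(a, b) == 190.0 ? 0 : 1;  /* 0+1+...+19 == 190 */
  }
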
--- raxml.orig/Makefile.AVX.PTHREADS.gcc
+++ raxml/Makefile.AVX.PTHREADS.gcc
@@ -3,7 +3,7 @@
 
 CC = gcc 
 
-CFLAGS = -D_USE_PTHREADS  -D__SIM_SSE3 -O2 -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops  -D__AVX #-Wall -pedantic -Wunused-parameter -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes  -Wdeclaration-after-statement -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast
+CFLAGS := -D_USE_PTHREADS  -D__SIM_SSE3 -O2 -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops  -D__AVX #-Wall -pedantic -Wunused-parameter -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes  -Wdeclaration-after-statement -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast
 
 
 LIBRARIES = -lm -pthread 
@@ -53,4 +53,4 @@
 clean : 
 	$(RM) *.o raxmlHPC-PTHREADS-AVX
 
-dev: raxmlHPC-PTHREADS-AVX
\ No newline at end of file
+dev: raxmlHPC-PTHREADS-AVX
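
A side note on the one non-mechanical change above (explanatory commentary, not part of the patch): GNU Make expands "=" variables recursively (the right-hand side is re-evaluated each time the variable is used) and ":=" variables simply (the right-hand side is evaluated once, at the point of assignment). Since this particular right-hand side contains no variable references, the resulting value is identical either way; the difference only shows up in how later assignments and appends expand. A tiny illustration with made-up variable names:

  EXTRA = one
  LAZY  = $(EXTRA) flags    # re-expanded on use: sees later changes to EXTRA
  EAGER := $(EXTRA) flags   # expanded here: frozen as "one flags"
  EXTRA = two               # LAZY now yields "two flags", EAGER still "one flags"

With CFLAGS simply expanded, any subsequent "CFLAGS += ..." (for instance from a downstream packaging script) is itself expanded immediately rather than deferred, which is presumably why the packaging switches the flavour here.
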
