Author: Michael R. Crusoe <michael.crusoe@gmail.com>
Description: Use "SIMD Everywhere" (SIMDe) to enable wider portability
 Replace the x86-only AVX intrinsics in the likelihood kernels with their
 SIMDe equivalents, so that this code path also builds on non-x86
 architectures.
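
The hunks below are a mechanical rename: every __m256d / __m256i type becomes
simde__m256d / simde__m256i, every _mm256_* intrinsic becomes simde_mm256_*,
comparison constants such as _CMP_LT_OS become SIMDE_CMP_LT_OS, and the three
x86 intrinsic headers are replaced by a single SIMDe header (here a copy
vendored under debian/include/). A minimal, self-contained sketch of the same
substitution pattern, not part of the patch, assuming the upstream
<simde/x86/avx2.h> header layout and a hypothetical example.c:

  /* Build with e.g.  cc -I<path-to-simde> example.c
   * No -mavx (and no x86 CPU) is required: SIMDe supplies a portable
   * fallback when the native instructions are unavailable. */
  #include <stdalign.h>
  #include <stdio.h>
  #include <simde/x86/avx2.h>

  int main(void)
  {
    /* simde_mm256_load_pd, like _mm256_load_pd, expects 32-byte alignment. */
    alignas(32) double a[4] = { 1.0, 2.0, 3.0, 4.0 };
    alignas(32) double b[4] = { 0.5, 0.5, 0.5, 0.5 };
    alignas(32) double r[4];

    simde__m256d va = simde_mm256_load_pd(a);      /* was _mm256_load_pd */
    simde__m256d vb = simde_mm256_load_pd(b);      /* was _mm256_load_pd */
    simde__m256d vr = simde_mm256_mul_pd(va, vb);  /* was _mm256_mul_pd  */

    /* The scaling test used throughout the patch: compare against a
     * threshold, then take a movemask; _CMP_LT_OS -> SIMDE_CMP_LT_OS. */
    simde__m256d lt = simde_mm256_cmp_pd(vr, simde_mm256_set1_pd(1.0),
                                         SIMDE_CMP_LT_OS);

    simde_mm256_store_pd(r, vr);                   /* was _mm256_store_pd */
    printf("%g %g %g %g  mask=%d\n", r[0], r[1], r[2], r[3],
           simde_mm256_movemask_pd(lt));
    return 0;
  }

On x86 with AVX enabled, SIMDe forwards these calls to the native intrinsics,
so the patched code should perform as before; elsewhere it falls back to
portable implementations, which is what lets this "AVX" build flavour work on
the other Debian architectures.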
--- examl.orig/examl/avxLikelihood.c
+++ examl/examl/avxLikelihood.c
@@ -10,9 +10,7 @@
 #include <limits.h>
 #include "axml.h"
 #include <stdint.h>
-#include <xmmintrin.h>
-#include <pmmintrin.h>
-#include <immintrin.h>
+#include "../debian/include/simde/x86/avx2.h"
 
 #ifdef _FMA
 #include <x86intrin.h>
@@ -24,38 +22,38 @@
 const union __attribute__ ((aligned (BYTE_ALIGNMENT)))
 {
   uint64_t i[4];
-  __m256d m;
+  simde__m256d m;
   
 } absMask_AVX = {{0x7fffffffffffffffULL, 0x7fffffffffffffffULL, 0x7fffffffffffffffULL, 0x7fffffffffffffffULL}};
 
 
 
-static inline __m256d hadd4(__m256d v, __m256d u)
+static inline simde__m256d hadd4(simde__m256d v, simde__m256d u)
 { 
-  __m256d
+  simde__m256d
     a, b;
   
-  v = _mm256_hadd_pd(v, v);
-  a = _mm256_permute2f128_pd(v, v, 1);
-  v = _mm256_add_pd(a, v);
-
-  u = _mm256_hadd_pd(u, u);
-  b = _mm256_permute2f128_pd(u, u, 1);
-  u = _mm256_add_pd(b, u);
+  v = simde_mm256_hadd_pd(v, v);
+  a = simde_mm256_permute2f128_pd(v, v, 1);
+  v = simde_mm256_add_pd(a, v);
+
+  u = simde_mm256_hadd_pd(u, u);
+  b = simde_mm256_permute2f128_pd(u, u, 1);
+  u = simde_mm256_add_pd(b, u);
 
-  v = _mm256_mul_pd(v, u);	
+  v = simde_mm256_mul_pd(v, u);	
   
   return v;
 }
 
-static inline __m256d hadd3(__m256d v)
+static inline simde__m256d hadd3(simde__m256d v)
 { 
-  __m256d
+  simde__m256d
     a;
   
-  v = _mm256_hadd_pd(v, v);
-  a = _mm256_permute2f128_pd(v, v, 1);
-  v = _mm256_add_pd(a, v);
+  v = simde_mm256_hadd_pd(v, v);
+  a = simde_mm256_permute2f128_pd(v, v, 1);
+  v = simde_mm256_add_pd(a, v);
   
   return v;
 }
@@ -75,9 +73,9 @@
     scale, 
     addScale = 0;
  
-  __m256d 
-    minlikelihood_avx = _mm256_set1_pd( minlikelihood ),
-    twoto = _mm256_set1_pd(twotothe256);
+  simde__m256d 
+    minlikelihood_avx = simde_mm256_set1_pd( minlikelihood ),
+    twoto = simde_mm256_set1_pd(twotothe256);
  
 
   switch(tipCase)
@@ -92,8 +90,8 @@
 
 	for (i = 1; i < 16; i++)
 	  {
-	    __m256d 
-	      tv = _mm256_load_pd(&(tipVector[i * 4]));
+	    simde__m256d 
+	      tv = simde_mm256_load_pd(&(tipVector[i * 4]));
 
 	    int 
 	      j;
@@ -101,25 +99,25 @@
 	    for (j = 0; j < 4; j++)
 	      for (k = 0; k < 4; k++)
 		{		 
-		  __m256d 
-		    left1 = _mm256_load_pd(&left[j * 16 + k * 4]);		  		  		  
+		  simde__m256d 
+		    left1 = simde_mm256_load_pd(&left[j * 16 + k * 4]);		  		  		  
 
-		  left1 = _mm256_mul_pd(left1, tv);		  
+		  left1 = simde_mm256_mul_pd(left1, tv);		  
 		  left1 = hadd3(left1);
 		  		  		  
-		  _mm256_store_pd(&umpX1[i * 64 + j * 16 + k * 4], left1);
+		  simde_mm256_store_pd(&umpX1[i * 64 + j * 16 + k * 4], left1);
 		}
 	  
 	    for (j = 0; j < 4; j++)
 	      for (k = 0; k < 4; k++)
 		{		 
-		  __m256d 
-		    left1 = _mm256_load_pd(&right[j * 16 + k * 4]);		  		  		  
+		  simde__m256d 
+		    left1 = simde_mm256_load_pd(&right[j * 16 + k * 4]);		  		  		  
 
-		  left1 = _mm256_mul_pd(left1, tv);		  
+		  left1 = simde_mm256_mul_pd(left1, tv);		  
 		  left1 = hadd3(left1);
 		  		  		  
-		  _mm256_store_pd(&umpX2[i * 64 + j * 16 + k * 4], left1);
+		  simde_mm256_store_pd(&umpX2[i * 64 + j * 16 + k * 4], left1);
 		}	    
 	  }   	
 	  
@@ -131,27 +129,27 @@
 	    
 	    for(k = 0; k < 4; k++)
 	      {
-		__m256d	   
-		  xv = _mm256_setzero_pd();
+		simde__m256d	   
+		  xv = simde_mm256_setzero_pd();
 	       
 		int 
 		  l;
 		
 		for(l = 0; l < 4; l++)
 		  {	       	     				      	      																	   
-		    __m256d
-		      x1v =  _mm256_mul_pd(_mm256_load_pd(&uX1[k * 16 + l * 4]), _mm256_load_pd(&uX2[k * 16 + l * 4]));
+		    simde__m256d
+		      x1v =  simde_mm256_mul_pd(simde_mm256_load_pd(&uX1[k * 16 + l * 4]), simde_mm256_load_pd(&uX2[k * 16 + l * 4]));
 		
-		    __m256d 
-		      evv = _mm256_load_pd(&extEV[l * 4]);
+		    simde__m256d 
+		      evv = simde_mm256_load_pd(&extEV[l * 4]);
 #ifdef _FMA
 		    xv = FMAMACC(xv,x1v,evv);
 #else						  
-		    xv = _mm256_add_pd(xv, _mm256_mul_pd(x1v, evv));
+		    xv = simde_mm256_add_pd(xv, simde_mm256_mul_pd(x1v, evv));
 #endif
 		  }
 		
-		_mm256_store_pd(&x3[16 * i + 4 * k], xv);
+		simde_mm256_store_pd(&x3[16 * i + 4 * k], xv);
 	      }	         	   	    
 	  }
       }
@@ -164,8 +162,8 @@
 
 	for (i = 1; i < 16; i++)
 	  {
-	    __m256d 
-	      tv = _mm256_load_pd(&(tipVector[i*4]));
+	    simde__m256d 
+	      tv = simde_mm256_load_pd(&(tipVector[i*4]));
 
 	    int 
 	      j;
@@ -173,19 +171,19 @@
 	    for (j = 0; j < 4; j++)
 	      for (k = 0; k < 4; k++)
 		{		 
-		  __m256d 
-		    left1 = _mm256_load_pd(&left[j * 16 + k * 4]);		  		  		  
+		  simde__m256d 
+		    left1 = simde_mm256_load_pd(&left[j * 16 + k * 4]);		  		  		  
 
-		  left1 = _mm256_mul_pd(left1, tv);		  
+		  left1 = simde_mm256_mul_pd(left1, tv);		  
 		  left1 = hadd3(left1);
 		  		  		  
-		  _mm256_store_pd(&umpX1[i * 64 + j * 16 + k * 4], left1);
+		  simde_mm256_store_pd(&umpX1[i * 64 + j * 16 + k * 4], left1);
 		}	 	   
 	  }   	
 	
 	for(i = 0; i < n; i++)
 	  { 
-	    __m256d
+	    simde__m256d
 	      xv[4];	    	   
 	    
 	    scale = 1;
@@ -193,58 +191,58 @@
 
 	    for(k = 0; k < 4; k++)
 	      {
-		__m256d	   		 
-		  xvr = _mm256_load_pd(&(x2[i * 16 + k * 4]));
+		simde__m256d	   		 
+		  xvr = simde_mm256_load_pd(&(x2[i * 16 + k * 4]));
 
 		int 
 		  l;
 
-		xv[k]  = _mm256_setzero_pd();
+		xv[k]  = simde_mm256_setzero_pd();
 		  
 		for(l = 0; l < 4; l++)
 		  {	       	     				      	      															
-		    __m256d  
-		      x1v = _mm256_load_pd(&uX1[k * 16 + l * 4]),		     
-		      x2v = _mm256_mul_pd(xvr, _mm256_load_pd(&right[k * 16 + l * 4]));			    
+		    simde__m256d  
+		      x1v = simde_mm256_load_pd(&uX1[k * 16 + l * 4]),		     
+		      x2v = simde_mm256_mul_pd(xvr, simde_mm256_load_pd(&right[k * 16 + l * 4]));			    
 			
 		    x2v = hadd3(x2v);
-		    x1v = _mm256_mul_pd(x1v, x2v);			
+		    x1v = simde_mm256_mul_pd(x1v, x2v);			
 		
-		    __m256d 
-		      evv = _mm256_load_pd(&extEV[l * 4]);
+		    simde__m256d 
+		      evv = simde_mm256_load_pd(&extEV[l * 4]);
 			
 #ifdef _FMA
 		    xv[k] = FMAMACC(xv[k],x1v,evv);
 #else			  
-		    xv[k] = _mm256_add_pd(xv[k], _mm256_mul_pd(x1v, evv));
+		    xv[k] = simde_mm256_add_pd(xv[k], simde_mm256_mul_pd(x1v, evv));
 #endif
 		  }
 		    
 		if(scale)
 		  {
-		    __m256d 	     
-		      v1 = _mm256_and_pd(xv[k], absMask_AVX.m);
+		    simde__m256d 	     
+		      v1 = simde_mm256_and_pd(xv[k], absMask_AVX.m);
 
-		    v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+		    v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 		    
-		    if(_mm256_movemask_pd( v1 ) != 15)
+		    if(simde_mm256_movemask_pd( v1 ) != 15)
 		      scale = 0;
 		  }
 	      }	    
 
 	    if(scale)
 	      {
-		xv[0] = _mm256_mul_pd(xv[0], twoto);
-		xv[1] = _mm256_mul_pd(xv[1], twoto);
-		xv[2] = _mm256_mul_pd(xv[2], twoto);
-		xv[3] = _mm256_mul_pd(xv[3], twoto);
+		xv[0] = simde_mm256_mul_pd(xv[0], twoto);
+		xv[1] = simde_mm256_mul_pd(xv[1], twoto);
+		xv[2] = simde_mm256_mul_pd(xv[2], twoto);
+		xv[3] = simde_mm256_mul_pd(xv[3], twoto);
 		addScale += wgt[i];
 	      }
 
-	    _mm256_store_pd(&x3[16 * i],      xv[0]);
-	    _mm256_store_pd(&x3[16 * i + 4],  xv[1]);
-	    _mm256_store_pd(&x3[16 * i + 8],  xv[2]);
-	    _mm256_store_pd(&x3[16 * i + 12], xv[3]);
+	    simde_mm256_store_pd(&x3[16 * i],      xv[0]);
+	    simde_mm256_store_pd(&x3[16 * i + 4],  xv[1]);
+	    simde_mm256_store_pd(&x3[16 * i + 8],  xv[2]);
+	    simde_mm256_store_pd(&x3[16 * i + 12], xv[3]);
 	  }
       }
       break;
@@ -252,62 +250,62 @@
       {
 	for(i = 0; i < n; i++)
 	  {	
-	    __m256d
+	    simde__m256d
 	      xv[4];
 	    
 	    scale = 1;
 
 	    for(k = 0; k < 4; k++)
 	      {
-		__m256d	   
+		simde__m256d	   
 		 
-		  xvl = _mm256_load_pd(&(x1[i * 16 + k * 4])),
-		  xvr = _mm256_load_pd(&(x2[i * 16 + k * 4]));
+		  xvl = simde_mm256_load_pd(&(x1[i * 16 + k * 4])),
+		  xvr = simde_mm256_load_pd(&(x2[i * 16 + k * 4]));
 
 		int 
 		  l;
 
-		xv[k] = _mm256_setzero_pd();
+		xv[k] = simde_mm256_setzero_pd();
 
 		for(l = 0; l < 4; l++)
 		  {	       	     				      	      															
-		    __m256d 
-		      x1v = _mm256_mul_pd(xvl, _mm256_load_pd(&left[k * 16 + l * 4])),
-		      x2v = _mm256_mul_pd(xvr, _mm256_load_pd(&right[k * 16 + l * 4]));			    
+		    simde__m256d 
+		      x1v = simde_mm256_mul_pd(xvl, simde_mm256_load_pd(&left[k * 16 + l * 4])),
+		      x2v = simde_mm256_mul_pd(xvr, simde_mm256_load_pd(&right[k * 16 + l * 4]));			    
 			
 		    x1v = hadd4(x1v, x2v);			
 		
-		    __m256d 
-		      evv = _mm256_load_pd(&extEV[l * 4]);
+		    simde__m256d 
+		      evv = simde_mm256_load_pd(&extEV[l * 4]);
 						  
-		    xv[k] = _mm256_add_pd(xv[k], _mm256_mul_pd(x1v, evv));
+		    xv[k] = simde_mm256_add_pd(xv[k], simde_mm256_mul_pd(x1v, evv));
 		  }
 		
 		if(scale)
 		  {
-		    __m256d 	     
-		      v1 = _mm256_and_pd(xv[k], absMask_AVX.m);
+		    simde__m256d 	     
+		      v1 = simde_mm256_and_pd(xv[k], absMask_AVX.m);
 
-		    v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+		    v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 		    
-		    if(_mm256_movemask_pd( v1 ) != 15)
+		    if(simde_mm256_movemask_pd( v1 ) != 15)
 		      scale = 0;
 		  }
 	      }
 
 	     if(scale)
 	      {
-		xv[0] = _mm256_mul_pd(xv[0], twoto);
-		xv[1] = _mm256_mul_pd(xv[1], twoto);
-		xv[2] = _mm256_mul_pd(xv[2], twoto);
-		xv[3] = _mm256_mul_pd(xv[3], twoto);
+		xv[0] = simde_mm256_mul_pd(xv[0], twoto);
+		xv[1] = simde_mm256_mul_pd(xv[1], twoto);
+		xv[2] = simde_mm256_mul_pd(xv[2], twoto);
+		xv[3] = simde_mm256_mul_pd(xv[3], twoto);
 		addScale += wgt[i];
 	      }
 		
-	    _mm256_store_pd(&x3[16 * i],      xv[0]);
-	    _mm256_store_pd(&x3[16 * i + 4],  xv[1]);
-	    _mm256_store_pd(&x3[16 * i + 8],  xv[2]);
-	    _mm256_store_pd(&x3[16 * i + 12], xv[3]);
+	    simde_mm256_store_pd(&x3[16 * i],      xv[0]);
+	    simde_mm256_store_pd(&x3[16 * i + 4],  xv[1]);
+	    simde_mm256_store_pd(&x3[16 * i + 8],  xv[2]);
+	    simde_mm256_store_pd(&x3[16 * i + 12], xv[3]);
 	  }
       }
       break;
@@ -338,9 +336,9 @@
     i,     
     addScale = 0;
    
-  __m256d 
-    minlikelihood_avx = _mm256_set1_pd( minlikelihood ),
-    twoto = _mm256_set1_pd(twotothe256);
+  simde__m256d 
+    minlikelihood_avx = simde_mm256_set1_pd( minlikelihood ),
+    twoto = simde_mm256_set1_pd(twotothe256);
   
   switch(tipCase)
     {
@@ -356,27 +354,27 @@
 	  x1 = &(tipVector[4 * tipX1[i]]);
 	  x2 = &(tipVector[4 * tipX2[i]]);
 	  
-	  __m256d	   
-	    vv = _mm256_setzero_pd();
+	  simde__m256d	   
+	    vv = simde_mm256_setzero_pd();
 	   	   	    
 	  for(l = 0; l < 4; l++)
 	    {	       	     				      	      															
-	      __m256d 
-		x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
-		x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+	      simde__m256d 
+		x1v = simde_mm256_mul_pd(simde_mm256_load_pd(x1), simde_mm256_load_pd(&le[l * 4])),
+		x2v = simde_mm256_mul_pd(simde_mm256_load_pd(x2), simde_mm256_load_pd(&ri[l * 4]));			    
 			
 	      x1v = hadd4(x1v, x2v);			
 		
-	      __m256d 
-		evv = _mm256_load_pd(&EV[l * 4]);
+	      simde__m256d 
+		evv = simde_mm256_load_pd(&EV[l * 4]);
 #ifdef _FMA
 	      vv = FMAMACC(vv,x1v,evv);
 #else				
-	      vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));						      	
+	      vv = simde_mm256_add_pd(vv, simde_mm256_mul_pd(x1v, evv));						      	
 #endif
 	    }	  		  
 
-	  _mm256_store_pd(&x3_start[4 * i], vv);	    	   	    
+	  simde_mm256_store_pd(&x3_start[4 * i], vv);	    	   	    
 	}
       break;
     case TIP_INNER:      
@@ -391,40 +389,40 @@
 	  le =  &left[cptr[i] * 16];
 	  ri =  &right[cptr[i] * 16];
 
-	  __m256d	   
-	    vv = _mm256_setzero_pd();
+	  simde__m256d	   
+	    vv = simde_mm256_setzero_pd();
 	  
 	  for(l = 0; l < 4; l++)
 	    {	       	     				      	      															
-	      __m256d 
-		x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
-		x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+	      simde__m256d 
+		x1v = simde_mm256_mul_pd(simde_mm256_load_pd(x1), simde_mm256_load_pd(&le[l * 4])),
+		x2v = simde_mm256_mul_pd(simde_mm256_load_pd(x2), simde_mm256_load_pd(&ri[l * 4]));			    
 			
 	      x1v = hadd4(x1v, x2v);			
 		
-	      __m256d 
-		evv = _mm256_load_pd(&EV[l * 4]);
+	      simde__m256d 
+		evv = simde_mm256_load_pd(&EV[l * 4]);
 				
 #ifdef _FMA
 	      vv = FMAMACC(vv,x1v,evv);
 #else	      
-	      vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));
+	      vv = simde_mm256_add_pd(vv, simde_mm256_mul_pd(x1v, evv));
 #endif
 	    }	  		  
 	  
 	  
-	  __m256d 	     
-	    v1 = _mm256_and_pd(vv, absMask_AVX.m);
+	  simde__m256d 	     
+	    v1 = simde_mm256_and_pd(vv, absMask_AVX.m);
 
-	  v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	  v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 	    
-	  if(_mm256_movemask_pd( v1 ) == 15)
+	  if(simde_mm256_movemask_pd( v1 ) == 15)
 	    {	     	      
-	      vv = _mm256_mul_pd(vv, twoto);	      
+	      vv = simde_mm256_mul_pd(vv, twoto);	      
 	      addScale += wgt[i];
 	    }       
 	  
-	  _mm256_store_pd(&x3_start[4 * i], vv);	 	  	  
+	  simde_mm256_store_pd(&x3_start[4 * i], vv);	 	  	  
 	}
       break;
     case INNER_INNER:
@@ -440,39 +438,39 @@
 	  le =  &left[cptr[i] * 16];
 	  ri =  &right[cptr[i] * 16];
 
-	  __m256d	   
-	    vv = _mm256_setzero_pd();
+	  simde__m256d	   
+	    vv = simde_mm256_setzero_pd();
 	  
 	  for(l = 0; l < 4; l++)
 	    {	       	     				      	      															
-	      __m256d 
-		x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
-		x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+	      simde__m256d 
+		x1v = simde_mm256_mul_pd(simde_mm256_load_pd(x1), simde_mm256_load_pd(&le[l * 4])),
+		x2v = simde_mm256_mul_pd(simde_mm256_load_pd(x2), simde_mm256_load_pd(&ri[l * 4]));			    
 			
 	      x1v = hadd4(x1v, x2v);			
 		
-	      __m256d 
-		evv = _mm256_load_pd(&EV[l * 4]);
+	      simde__m256d 
+		evv = simde_mm256_load_pd(&EV[l * 4]);
 #ifdef _FMA
 	      vv = FMAMACC(vv,x1v,evv);
 #else						
-	      vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));						      	
+	      vv = simde_mm256_add_pd(vv, simde_mm256_mul_pd(x1v, evv));						      	
 #endif
 	    }	  		  
 
 	 
-	  __m256d 	     
-	    v1 = _mm256_and_pd(vv, absMask_AVX.m);
+	  simde__m256d 	     
+	    v1 = simde_mm256_and_pd(vv, absMask_AVX.m);
 
-	  v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	  v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 	    
-	  if(_mm256_movemask_pd( v1 ) == 15)
+	  if(simde_mm256_movemask_pd( v1 ) == 15)
 	    {	
-	      vv = _mm256_mul_pd(vv, twoto);	      
+	      vv = simde_mm256_mul_pd(vv, twoto);	      
 	      addScale += wgt[i];
 	    }	
 
-	  _mm256_store_pd(&x3_start[4 * i], vv);
+	  simde_mm256_store_pd(&x3_start[4 * i], vv);
 	  	  
 	}
       break;
@@ -512,19 +510,19 @@
 	    vr = &(tipVector[20 * tipX2[i]]);
 	    v  = &x3[20 * i];	    	    	   	    
 
-	    __m256d vv[5];
+	    simde__m256d vv[5];
 	    
-	    vv[0] = _mm256_setzero_pd();
-	    vv[1] = _mm256_setzero_pd();
-	    vv[2] = _mm256_setzero_pd();
-	    vv[3] = _mm256_setzero_pd();
-	    vv[4] = _mm256_setzero_pd();	   	    
+	    vv[0] = simde_mm256_setzero_pd();
+	    vv[1] = simde_mm256_setzero_pd();
+	    vv[2] = simde_mm256_setzero_pd();
+	    vv[3] = simde_mm256_setzero_pd();
+	    vv[4] = simde_mm256_setzero_pd();	   	    
 
 	    for(l = 0; l < 20; l++)
 	      {	       
-		__m256d 
-		  x1v = _mm256_setzero_pd(),
-		  x2v = _mm256_setzero_pd();	
+		simde__m256d 
+		  x1v = simde_mm256_setzero_pd(),
+		  x2v = simde_mm256_setzero_pd();	
 				
 		double 
 		  *ev = &extEV[l * 20],
@@ -534,56 +532,56 @@
 #ifdef _FMA		
 		for(k = 0; k < 20; k += 4) 
 		  {
-		    __m256d vlv = _mm256_load_pd(&vl[k]);
-		    __m256d lvv = _mm256_load_pd(&lv[k]);
+		    simde__m256d vlv = simde_mm256_load_pd(&vl[k]);
+		    simde__m256d lvv = simde_mm256_load_pd(&lv[k]);
 		    x1v = FMAMACC(x1v,vlv,lvv);
-		    __m256d vrv = _mm256_load_pd(&vr[k]);
-		    __m256d rvv = _mm256_load_pd(&rv[k]);
+		    simde__m256d vrv = simde_mm256_load_pd(&vr[k]);
+		    simde__m256d rvv = simde_mm256_load_pd(&rv[k]);
 		    x2v = FMAMACC(x2v,vrv,rvv);
 		  }
 #else		
-		x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
-		x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
-		x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
-		x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
-		x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
-
-		x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
-		x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
-		x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
-		x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
-		x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));	
+		x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[0]), simde_mm256_load_pd(&lv[0])));
+		x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[4]), simde_mm256_load_pd(&lv[4])));
+		x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[8]), simde_mm256_load_pd(&lv[8])));
+		x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[12]), simde_mm256_load_pd(&lv[12])));
+		x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[16]), simde_mm256_load_pd(&lv[16])));
+
+		x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[0]), simde_mm256_load_pd(&rv[0])));			    
+		x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[4]), simde_mm256_load_pd(&rv[4])));				    
+		x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[8]), simde_mm256_load_pd(&rv[8])));			    
+		x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[12]), simde_mm256_load_pd(&rv[12])));				    
+		x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[16]), simde_mm256_load_pd(&rv[16])));	
 #endif
 
 		x1v = hadd4(x1v, x2v);			
 #ifdef _FMA
 		for(k = 0; k < 5; k++) 
 		  {
-		    __m256d evv = _mm256_load_pd(&ev[k*4]);
+		    simde__m256d evv = simde_mm256_load_pd(&ev[k*4]);
 		    vv[k] = FMAMACC(vv[k],x1v,evv);
 		  }	  
 #else		
-		__m256d 
+		simde__m256d 
 		  evv[5];
 	    	
-		evv[0] = _mm256_load_pd(&ev[0]);
-		evv[1] = _mm256_load_pd(&ev[4]);
-		evv[2] = _mm256_load_pd(&ev[8]);
-		evv[3] = _mm256_load_pd(&ev[12]);
-		evv[4] = _mm256_load_pd(&ev[16]);		
-		
-		vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
-		vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
-		vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
-		vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
-		vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      		      	  
+		evv[0] = simde_mm256_load_pd(&ev[0]);
+		evv[1] = simde_mm256_load_pd(&ev[4]);
+		evv[2] = simde_mm256_load_pd(&ev[8]);
+		evv[3] = simde_mm256_load_pd(&ev[12]);
+		evv[4] = simde_mm256_load_pd(&ev[16]);		
+		
+		vv[0] = simde_mm256_add_pd(vv[0], simde_mm256_mul_pd(x1v, evv[0]));
+		vv[1] = simde_mm256_add_pd(vv[1], simde_mm256_mul_pd(x1v, evv[1]));
+		vv[2] = simde_mm256_add_pd(vv[2], simde_mm256_mul_pd(x1v, evv[2]));
+		vv[3] = simde_mm256_add_pd(vv[3], simde_mm256_mul_pd(x1v, evv[3]));
+		vv[4] = simde_mm256_add_pd(vv[4], simde_mm256_mul_pd(x1v, evv[4]));				      		      	  
 #endif
 	      }
-	    _mm256_store_pd(&v[0], vv[0]);
-	    _mm256_store_pd(&v[4], vv[1]);
-	    _mm256_store_pd(&v[8], vv[2]);
-	    _mm256_store_pd(&v[12], vv[3]);
-	    _mm256_store_pd(&v[16], vv[4]);
+	    simde_mm256_store_pd(&v[0], vv[0]);
+	    simde_mm256_store_pd(&v[4], vv[1]);
+	    simde_mm256_store_pd(&v[8], vv[2]);
+	    simde_mm256_store_pd(&v[12], vv[3]);
+	    simde_mm256_store_pd(&v[16], vv[4]);
 	  }
       }
       break;
@@ -597,21 +595,21 @@
 	  vr = &x2[20 * i];
 	  v  = &x3[20 * i];	   
 	  
-	  __m256d vv[5];
+	  simde__m256d vv[5];
 	  
-	  vv[0] = _mm256_setzero_pd();
-	  vv[1] = _mm256_setzero_pd();
-	  vv[2] = _mm256_setzero_pd();
-	  vv[3] = _mm256_setzero_pd();
-	  vv[4] = _mm256_setzero_pd();
+	  vv[0] = simde_mm256_setzero_pd();
+	  vv[1] = simde_mm256_setzero_pd();
+	  vv[2] = simde_mm256_setzero_pd();
+	  vv[3] = simde_mm256_setzero_pd();
+	  vv[4] = simde_mm256_setzero_pd();
 	  
 	 
 
 	  for(l = 0; l < 20; l++)
 	    {	       
-	      __m256d 
-		x1v = _mm256_setzero_pd(),
-		x2v = _mm256_setzero_pd();	
+	      simde__m256d 
+		x1v = simde_mm256_setzero_pd(),
+		x2v = simde_mm256_setzero_pd();	
 	      
 	      double 
 		*ev = &extEV[l * 20],
@@ -620,84 +618,84 @@
 #ifdef _FMA
 	      for(k = 0; k < 20; k += 4) 
 		{
-		  __m256d vlv = _mm256_load_pd(&vl[k]);
-		  __m256d lvv = _mm256_load_pd(&lv[k]);
+		  simde__m256d vlv = simde_mm256_load_pd(&vl[k]);
+		  simde__m256d lvv = simde_mm256_load_pd(&lv[k]);
 		  x1v = FMAMACC(x1v,vlv,lvv);
-		  __m256d vrv = _mm256_load_pd(&vr[k]);
-		  __m256d rvv = _mm256_load_pd(&rv[k]);
+		  simde__m256d vrv = simde_mm256_load_pd(&vr[k]);
+		  simde__m256d rvv = simde_mm256_load_pd(&rv[k]);
 		  x2v = FMAMACC(x2v,vrv,rvv);
 		}
 #else	      
-	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
-	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
-	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
-	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
-	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
-	      
-	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
-	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
-	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
-	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
-	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));
+	      x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[0]), simde_mm256_load_pd(&lv[0])));
+	      x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[4]), simde_mm256_load_pd(&lv[4])));
+	      x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[8]), simde_mm256_load_pd(&lv[8])));
+	      x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[12]), simde_mm256_load_pd(&lv[12])));
+	      x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[16]), simde_mm256_load_pd(&lv[16])));
+	      
+	      x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[0]), simde_mm256_load_pd(&rv[0])));			    
+	      x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[4]), simde_mm256_load_pd(&rv[4])));				    
+	      x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[8]), simde_mm256_load_pd(&rv[8])));			    
+	      x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[12]), simde_mm256_load_pd(&rv[12])));				    
+	      x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[16]), simde_mm256_load_pd(&rv[16])));
 #endif
 
 	      x1v = hadd4(x1v, x2v);			
 	      
-	      __m256d 
+	      simde__m256d 
 		evv[5];
 	      
-	      evv[0] = _mm256_load_pd(&ev[0]);
-	      evv[1] = _mm256_load_pd(&ev[4]);
-	      evv[2] = _mm256_load_pd(&ev[8]);
-	      evv[3] = _mm256_load_pd(&ev[12]);
-	      evv[4] = _mm256_load_pd(&ev[16]);		
+	      evv[0] = simde_mm256_load_pd(&ev[0]);
+	      evv[1] = simde_mm256_load_pd(&ev[4]);
+	      evv[2] = simde_mm256_load_pd(&ev[8]);
+	      evv[3] = simde_mm256_load_pd(&ev[12]);
+	      evv[4] = simde_mm256_load_pd(&ev[16]);		
 
 #ifdef _FMA
 	      for(k = 0; k < 5; k++)
 		vv[k] = FMAMACC(vv[k],x1v,evv[k]);		 
 #else	      
-	      vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
-	      vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
-	      vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
-	      vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
-	      vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      	
+	      vv[0] = simde_mm256_add_pd(vv[0], simde_mm256_mul_pd(x1v, evv[0]));
+	      vv[1] = simde_mm256_add_pd(vv[1], simde_mm256_mul_pd(x1v, evv[1]));
+	      vv[2] = simde_mm256_add_pd(vv[2], simde_mm256_mul_pd(x1v, evv[2]));
+	      vv[3] = simde_mm256_add_pd(vv[3], simde_mm256_mul_pd(x1v, evv[3]));
+	      vv[4] = simde_mm256_add_pd(vv[4], simde_mm256_mul_pd(x1v, evv[4]));				      	
 #endif
 	    }	  
 
 	   	     
-	  __m256d minlikelihood_avx = _mm256_set1_pd( minlikelihood );
+	  simde__m256d minlikelihood_avx = simde_mm256_set1_pd( minlikelihood );
 	  
 	  scale = 1;
 	  
 	  for(l = 0; scale && (l < 20); l += 4)
 	    {	       
-	      __m256d 
-		v1 = _mm256_and_pd(vv[l / 4], absMask_AVX.m);
-	      v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	      simde__m256d 
+		v1 = simde_mm256_and_pd(vv[l / 4], absMask_AVX.m);
+	      v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 	      
-	      if(_mm256_movemask_pd( v1 ) != 15)
+	      if(simde_mm256_movemask_pd( v1 ) != 15)
 		scale = 0;
 	    }	    	  	  
 	 
 
 	  if(scale)
 	    {
-	      __m256d 
-		twoto = _mm256_set1_pd(twotothe256);
+	      simde__m256d 
+		twoto = simde_mm256_set1_pd(twotothe256);
 	      
 	      for(l = 0; l < 20; l += 4)
-		vv[l / 4] = _mm256_mul_pd(vv[l / 4] , twoto);		    		 
+		vv[l / 4] = simde_mm256_mul_pd(vv[l / 4] , twoto);		    		 
 	  
 	     
 	      addScale += wgt[i];
 	     	      
 	    }
 
-	  _mm256_store_pd(&v[0], vv[0]);
-	  _mm256_store_pd(&v[4], vv[1]);
-	  _mm256_store_pd(&v[8], vv[2]);
-	  _mm256_store_pd(&v[12], vv[3]);
-	  _mm256_store_pd(&v[16], vv[4]);	       
+	  simde_mm256_store_pd(&v[0], vv[0]);
+	  simde_mm256_store_pd(&v[4], vv[1]);
+	  simde_mm256_store_pd(&v[8], vv[2]);
+	  simde_mm256_store_pd(&v[12], vv[3]);
+	  simde_mm256_store_pd(&v[16], vv[4]);	       
 	}
       break;
     case INNER_INNER:
@@ -710,94 +708,94 @@
 	  vr = &x2[20 * i];
 	  v = &x3[20 * i];
 
-	  __m256d vv[5];
+	  simde__m256d vv[5];
 	  
-	  vv[0] = _mm256_setzero_pd();
-	  vv[1] = _mm256_setzero_pd();
-	  vv[2] = _mm256_setzero_pd();
-	  vv[3] = _mm256_setzero_pd();
-	  vv[4] = _mm256_setzero_pd();
+	  vv[0] = simde_mm256_setzero_pd();
+	  vv[1] = simde_mm256_setzero_pd();
+	  vv[2] = simde_mm256_setzero_pd();
+	  vv[3] = simde_mm256_setzero_pd();
+	  vv[4] = simde_mm256_setzero_pd();
 	  
 	  for(l = 0; l < 20; l++)
 	    {	       
-	      __m256d 
-		x1v = _mm256_setzero_pd(),
-		x2v = _mm256_setzero_pd();	
+	      simde__m256d 
+		x1v = simde_mm256_setzero_pd(),
+		x2v = simde_mm256_setzero_pd();	
 	      
 	      double 
 		*ev = &extEV[l * 20],
 		*lv = &le[l * 20],
 		*rv = &ri[l * 20];														
 	      
-	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
-	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
-	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
-	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
-	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
-	      
-	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
-	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
-	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
-	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
-	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));
+	      x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[0]), simde_mm256_load_pd(&lv[0])));
+	      x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[4]), simde_mm256_load_pd(&lv[4])));
+	      x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[8]), simde_mm256_load_pd(&lv[8])));
+	      x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[12]), simde_mm256_load_pd(&lv[12])));
+	      x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[16]), simde_mm256_load_pd(&lv[16])));
+	      
+	      x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[0]), simde_mm256_load_pd(&rv[0])));			    
+	      x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[4]), simde_mm256_load_pd(&rv[4])));				    
+	      x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[8]), simde_mm256_load_pd(&rv[8])));			    
+	      x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[12]), simde_mm256_load_pd(&rv[12])));				    
+	      x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[16]), simde_mm256_load_pd(&rv[16])));
 
 	      x1v = hadd4(x1v, x2v);			
 #ifdef _FMA
 	       for(k = 0; k < 5; k++) 
 		 {
-		   __m256d evv = _mm256_load_pd(&ev[k*4]);
+		   simde__m256d evv = simde_mm256_load_pd(&ev[k*4]);
 		   vv[k] = FMAMACC(vv[k],x1v,evv);
 		 }
 #else	      
-	      __m256d 
+	      simde__m256d 
 		evv[5];
 	      
-	      evv[0] = _mm256_load_pd(&ev[0]);
-	      evv[1] = _mm256_load_pd(&ev[4]);
-	      evv[2] = _mm256_load_pd(&ev[8]);
-	      evv[3] = _mm256_load_pd(&ev[12]);
-	      evv[4] = _mm256_load_pd(&ev[16]);		
-	      
-	      vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
-	      vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
-	      vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
-	      vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
-	      vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      	
+	      evv[0] = simde_mm256_load_pd(&ev[0]);
+	      evv[1] = simde_mm256_load_pd(&ev[4]);
+	      evv[2] = simde_mm256_load_pd(&ev[8]);
+	      evv[3] = simde_mm256_load_pd(&ev[12]);
+	      evv[4] = simde_mm256_load_pd(&ev[16]);		
+	      
+	      vv[0] = simde_mm256_add_pd(vv[0], simde_mm256_mul_pd(x1v, evv[0]));
+	      vv[1] = simde_mm256_add_pd(vv[1], simde_mm256_mul_pd(x1v, evv[1]));
+	      vv[2] = simde_mm256_add_pd(vv[2], simde_mm256_mul_pd(x1v, evv[2]));
+	      vv[3] = simde_mm256_add_pd(vv[3], simde_mm256_mul_pd(x1v, evv[3]));
+	      vv[4] = simde_mm256_add_pd(vv[4], simde_mm256_mul_pd(x1v, evv[4]));				      	
 #endif
 	    }	  
 
 	   	     
-	  __m256d minlikelihood_avx = _mm256_set1_pd( minlikelihood );
+	  simde__m256d minlikelihood_avx = simde_mm256_set1_pd( minlikelihood );
 	  
 	  scale = 1;
 	  
 	  for(l = 0; scale && (l < 20); l += 4)
 	    {	       
-	      __m256d 
-		v1 = _mm256_and_pd(vv[l / 4], absMask_AVX.m);
-	      v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	      simde__m256d 
+		v1 = simde_mm256_and_pd(vv[l / 4], absMask_AVX.m);
+	      v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 	      
-	      if(_mm256_movemask_pd( v1 ) != 15)
+	      if(simde_mm256_movemask_pd( v1 ) != 15)
 		scale = 0;
 	    }	    	  	  
 
 	  if(scale)
 	    {
-	      __m256d 
-		twoto = _mm256_set1_pd(twotothe256);
+	      simde__m256d 
+		twoto = simde_mm256_set1_pd(twotothe256);
 	      
 	      for(l = 0; l < 20; l += 4)
-		vv[l / 4] = _mm256_mul_pd(vv[l / 4] , twoto);		    		 
+		vv[l / 4] = simde_mm256_mul_pd(vv[l / 4] , twoto);		    		 
 	  
 	     
 	      addScale += wgt[i];	      
 	    }
 
-	  _mm256_store_pd(&v[0], vv[0]);
-	  _mm256_store_pd(&v[4], vv[1]);
-	  _mm256_store_pd(&v[8], vv[2]);
-	  _mm256_store_pd(&v[12], vv[3]);
-	  _mm256_store_pd(&v[16], vv[4]);
+	  simde_mm256_store_pd(&v[0], vv[0]);
+	  simde_mm256_store_pd(&v[4], vv[1]);
+	  simde_mm256_store_pd(&v[8], vv[2]);
+	  simde_mm256_store_pd(&v[12], vv[3]);
+	  simde_mm256_store_pd(&v[16], vv[4]);
 	 
 	}
       break;
@@ -839,11 +837,11 @@
 
 
 #if GCC_VERSION < 40500
-   __m256d
-    bitmask = _mm256_set_pd(0,0,0,-1);
+   simde__m256d
+    bitmask = simde_mm256_set_pd(0,0,0,-1);
 #else
-  __m256i
-    bitmask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+  simde__m256i
+    bitmask = simde_mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
 #endif 
   
   switch(tipCase) 
@@ -864,30 +862,30 @@
 		  *ll =  &left[k * 20],
 		  *rr =  &right[k * 20];
 		
-		__m256d 
-		  umpX1v = _mm256_setzero_pd(),
-		  umpX2v = _mm256_setzero_pd();
+		simde__m256d 
+		  umpX1v = simde_mm256_setzero_pd(),
+		  umpX2v = simde_mm256_setzero_pd();
 		
 		v = &(tipVector[k / 20][20 * i]);
 
 		for(l = 0; l < 20; l+=4) 
 		  {
-		    __m256d vv = _mm256_load_pd(&v[l]);
+		    simde__m256d vv = simde_mm256_load_pd(&v[l]);
 #ifdef _FMA
-		    __m256d llv = _mm256_load_pd(&ll[l]);
+		    simde__m256d llv = simde_mm256_load_pd(&ll[l]);
 		    umpX1v = FMAMACC(umpX1v,vv,llv);
-		    __m256d rrv = _mm256_load_pd(&rr[l]);
+		    simde__m256d rrv = simde_mm256_load_pd(&rr[l]);
 		    umpX2v = FMAMACC(umpX2v,vv,rrv);
 #else		    
-		    umpX1v = _mm256_add_pd(umpX1v,_mm256_mul_pd(vv,_mm256_load_pd(&ll[l])));
-		    umpX2v = _mm256_add_pd(umpX2v,_mm256_mul_pd(vv,_mm256_load_pd(&rr[l])));
+		    umpX1v = simde_mm256_add_pd(umpX1v,simde_mm256_mul_pd(vv,simde_mm256_load_pd(&ll[l])));
+		    umpX2v = simde_mm256_add_pd(umpX2v,simde_mm256_mul_pd(vv,simde_mm256_load_pd(&rr[l])));
 #endif
 		  }
 		
 		umpX1v = hadd3(umpX1v);
 		umpX2v = hadd3(umpX2v);
-		_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
-		_mm256_maskstore_pd(&umpX2[80 * i + k], bitmask, umpX2v);
+		simde_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
+		simde_mm256_maskstore_pd(&umpX2[80 * i + k], bitmask, umpX2v);
 	      } 
 	  }
 
@@ -898,61 +896,61 @@
 	   
 	    for(j = 0; j < 4; j++) 
 	      {     	
-		__m256d vv[5];  
+		simde__m256d vv[5];  
 
 		v = &x3[i * 80 + j * 20];
 			
-		vv[0] = _mm256_setzero_pd();
-		vv[1] = _mm256_setzero_pd();
-		vv[2] = _mm256_setzero_pd();
-		vv[3] = _mm256_setzero_pd();
-		vv[4] = _mm256_setzero_pd();
+		vv[0] = simde_mm256_setzero_pd();
+		vv[1] = simde_mm256_setzero_pd();
+		vv[2] = simde_mm256_setzero_pd();
+		vv[3] = simde_mm256_setzero_pd();
+		vv[4] = simde_mm256_setzero_pd();
 
 		for(k = 0; k < 20; k++) 
 		  {			 
 		    x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
 
-		    __m256d x1px2v = _mm256_set1_pd(x1px2);		    
+		    simde__m256d x1px2v = simde_mm256_set1_pd(x1px2);		    
 		    
-		    __m256d extEvv = _mm256_load_pd(&extEV[j][20 * k]);
+		    simde__m256d extEvv = simde_mm256_load_pd(&extEV[j][20 * k]);
 #ifdef _FMA
 		    vv[0] = FMAMACC(vv[0],x1px2v,extEvv);
 #else
-		    vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v,extEvv));
+		    vv[0] = simde_mm256_add_pd(vv[0],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		    _mm256_store_pd(&v[0],vv[0]);
+		    simde_mm256_store_pd(&v[0],vv[0]);
 		    
-		    extEvv = _mm256_load_pd(&extEV[j][20 * k + 4]);
+		    extEvv = simde_mm256_load_pd(&extEV[j][20 * k + 4]);
 #ifdef _FMA
 		    vv[1] = FMAMACC(vv[1],x1px2v,extEvv);
 #else
-		    vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v,extEvv));
+		    vv[1] = simde_mm256_add_pd(vv[1],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		    _mm256_store_pd(&v[4],vv[1]);
+		    simde_mm256_store_pd(&v[4],vv[1]);
 
-		    extEvv = _mm256_load_pd(&extEV[j][20 * k + 8]);
+		    extEvv = simde_mm256_load_pd(&extEV[j][20 * k + 8]);
 #ifdef _FMA
 		    vv[2] = FMAMACC(vv[2],x1px2v,extEvv);
 #else
-		    vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v,extEvv));
+		    vv[2] = simde_mm256_add_pd(vv[2],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		    _mm256_store_pd(&v[8],vv[2]);
+		    simde_mm256_store_pd(&v[8],vv[2]);
 
-		    extEvv = _mm256_load_pd(&extEV[j][20 * k + 12]);
+		    extEvv = simde_mm256_load_pd(&extEV[j][20 * k + 12]);
 #ifdef _FMA
 		    vv[3] = FMAMACC(vv[3],x1px2v,extEvv);
 #else
-		    vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v,extEvv));
+		    vv[3] = simde_mm256_add_pd(vv[3],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		    _mm256_store_pd(&v[12],vv[3]);
+		    simde_mm256_store_pd(&v[12],vv[3]);
 
-		    extEvv = _mm256_load_pd(&extEV[j][20 * k + 16]);
+		    extEvv = simde_mm256_load_pd(&extEV[j][20 * k + 16]);
 #ifdef _FMA
 		    vv[4] = FMAMACC(vv[4],x1px2v,extEvv);
 #else
-		    vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v,extEvv));
+		    vv[4] = simde_mm256_add_pd(vv[4],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		    _mm256_store_pd(&v[16],vv[4]);
+		    simde_mm256_store_pd(&v[16],vv[4]);
 		  } 
 	      } 
 	  } 
@@ -969,23 +967,23 @@
 	  {	   
 	    for(k = 0; k < 80; k++) 
 	      {
-		__m256d umpX1v = _mm256_setzero_pd();
+		simde__m256d umpX1v = simde_mm256_setzero_pd();
 		
 		 v = &(tipVector[k / 20][20 * i]);
 
 		for(l = 0; l < 20; l+=4) 
 		  {
-		    __m256d vv = _mm256_load_pd(&v[l]);
-		    __m256d leftv = _mm256_load_pd(&left[k * 20 + l]);
+		    simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		    simde__m256d leftv = simde_mm256_load_pd(&left[k * 20 + l]);
 #ifdef _FMA
 		   
 		    umpX1v = FMAMACC(umpX1v, vv, leftv);
 #else
-		    umpX1v = _mm256_add_pd(umpX1v, _mm256_mul_pd(vv, leftv));
+		    umpX1v = simde_mm256_add_pd(umpX1v, simde_mm256_mul_pd(vv, leftv));
 #endif
 		  }
 		umpX1v = hadd3(umpX1v);
-		_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
+		simde_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
 	      } 
 	  }
 	
@@ -999,131 +997,131 @@
 		
 		for(l = 0; l < 20; l++) 
 		  {
-		    __m256d ump_x2v = _mm256_setzero_pd();
+		    simde__m256d ump_x2v = simde_mm256_setzero_pd();
 		    		  
-		    __m256d vv = _mm256_load_pd(&v[0]);
-		    __m256d rightv = _mm256_load_pd(&right[k*400+l*20+0]);
+		    simde__m256d vv = simde_mm256_load_pd(&v[0]);
+		    simde__m256d rightv = simde_mm256_load_pd(&right[k*400+l*20+0]);
 #ifdef _FMA
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 		    
-		    vv = _mm256_load_pd(&v[4]);
-		    rightv = _mm256_load_pd(&right[k*400+l*20+4]);
+		    vv = simde_mm256_load_pd(&v[4]);
+		    rightv = simde_mm256_load_pd(&right[k*400+l*20+4]);
 #ifdef _FMA
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 
-		    vv = _mm256_load_pd(&v[8]);
-		    rightv = _mm256_load_pd(&right[k*400+l*20+8]);
+		    vv = simde_mm256_load_pd(&v[8]);
+		    rightv = simde_mm256_load_pd(&right[k*400+l*20+8]);
 #ifdef _FMA
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 
-		    vv = _mm256_load_pd(&v[12]);
-		    rightv = _mm256_load_pd(&right[k*400+l*20+12]);
+		    vv = simde_mm256_load_pd(&v[12]);
+		    rightv = simde_mm256_load_pd(&right[k*400+l*20+12]);
 #ifdef _FMA
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 
-		    vv = _mm256_load_pd(&v[16]);
-		    rightv = _mm256_load_pd(&right[k*400+l*20+16]);
+		    vv = simde_mm256_load_pd(&v[16]);
+		    rightv = simde_mm256_load_pd(&right[k*400+l*20+16]);
 #ifdef _FMA
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 		    
 		    ump_x2v = hadd3(ump_x2v);
-		    _mm256_maskstore_pd(&ump_x2[l], bitmask, ump_x2v);
+		    simde_mm256_maskstore_pd(&ump_x2[l], bitmask, ump_x2v);
 		  }
 		
 		v = &(x3[80 * i + 20 * k]);
 	
 
-		__m256d vv[5]; 
+		simde__m256d vv[5]; 
 
-		vv[0] = _mm256_setzero_pd();
-		vv[1] = _mm256_setzero_pd();
-		vv[2] = _mm256_setzero_pd();
-		vv[3] = _mm256_setzero_pd();
-		vv[4] = _mm256_setzero_pd();
+		vv[0] = simde_mm256_setzero_pd();
+		vv[1] = simde_mm256_setzero_pd();
+		vv[2] = simde_mm256_setzero_pd();
+		vv[3] = simde_mm256_setzero_pd();
+		vv[4] = simde_mm256_setzero_pd();
 		
 		for(l = 0; l < 20; l++) 
 		  {
 		    x1px2 = uX1[k * 20 + l]	* ump_x2[l];
-		    __m256d x1px2v = _mm256_set1_pd(x1px2);	
+		    simde__m256d x1px2v = simde_mm256_set1_pd(x1px2);	
 	    		 
 #ifdef _FMA
-		    __m256d ev = _mm256_load_pd(&extEV[l * 20 + 0]);
+		    simde__m256d ev = simde_mm256_load_pd(&extEV[l * 20 + 0]);
 		    vv[0] = FMAMACC(vv[0],x1px2v, ev);
 #else
-		    vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[k][l * 20 + 0])));
+		    vv[0] = simde_mm256_add_pd(vv[0],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[k][l * 20 + 0])));
 #endif
-		    _mm256_store_pd(&v[0],vv[0]);
+		    simde_mm256_store_pd(&v[0],vv[0]);
 
 #ifdef _FMA
-		    ev = _mm256_load_pd(&extEV[l * 20 + 4]);
+		    ev = simde_mm256_load_pd(&extEV[l * 20 + 4]);
 		    vv[1] = FMAMACC(vv[1],x1px2v, ev);
 #else
-		    vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[k][l * 20 + 4])));
+		    vv[1] = simde_mm256_add_pd(vv[1],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[k][l * 20 + 4])));
 #endif
-		    _mm256_store_pd(&v[4],vv[1]);
+		    simde_mm256_store_pd(&v[4],vv[1]);
 
 #ifdef _FMA
-		    ev = _mm256_load_pd(&extEV[l * 20 + 8]);
+		    ev = simde_mm256_load_pd(&extEV[l * 20 + 8]);
 		    vv[2] = FMAMACC(vv[2],x1px2v, ev);
 #else
-		    vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[k][l * 20 + 8])));
+		    vv[2] = simde_mm256_add_pd(vv[2],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[k][l * 20 + 8])));
 #endif
-		    _mm256_store_pd(&v[8],vv[2]);
+		    simde_mm256_store_pd(&v[8],vv[2]);
 		    
 #ifdef _FMA
-		    ev = _mm256_load_pd(&extEV[l * 20 + 12]);
+		    ev = simde_mm256_load_pd(&extEV[l * 20 + 12]);
 		    vv[3] = FMAMACC(vv[3],x1px2v, ev);
 #else
-		    vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[k][l * 20 + 12])));
+		    vv[3] = simde_mm256_add_pd(vv[3],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[k][l * 20 + 12])));
 #endif
-		    _mm256_store_pd(&v[12],vv[3]);
+		    simde_mm256_store_pd(&v[12],vv[3]);
 
 
 #ifdef _FMA
-		    ev = _mm256_load_pd(&extEV[l * 20 + 16]);
+		    ev = simde_mm256_load_pd(&extEV[l * 20 + 16]);
 		    vv[4] = FMAMACC(vv[4],x1px2v, ev);
 #else
-		    vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[k][l * 20 + 16])));
+		    vv[4] = simde_mm256_add_pd(vv[4],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[k][l * 20 + 16])));
 #endif
-		    _mm256_store_pd(&v[16],vv[4]);
+		    simde_mm256_store_pd(&v[16],vv[4]);
 
 		  } 
 	      }
 	   
 	    v = &x3[80 * i];
-	    __m256d minlikelihood_avx = _mm256_set1_pd(minlikelihood);
+	    simde__m256d minlikelihood_avx = simde_mm256_set1_pd(minlikelihood);
 	    scale = 1;
 	    for(l = 0; scale && (l < 80); l += 4) 
 	      {
-		__m256d vv = _mm256_load_pd(&v[l]);
-		__m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
-		vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
-		if(_mm256_movemask_pd(vv_abs) != 15)
+		simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		simde__m256d vv_abs = simde_mm256_and_pd(vv,absMask_AVX.m);
+		vv_abs = simde_mm256_cmp_pd(vv_abs,minlikelihood_avx,SIMDE_CMP_LT_OS);
+		if(simde_mm256_movemask_pd(vv_abs) != 15)
 		  scale = 0;
 	      }
 	    
 	    if(scale) 
 	      {		
-		__m256d twotothe256v = _mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
+		simde__m256d twotothe256v = simde_mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
 		for(l = 0; l < 80; l += 4) 
 		  {
-		    __m256d vv = _mm256_load_pd(&v[l]);
-		    _mm256_store_pd(&v[l],_mm256_mul_pd(vv,twotothe256v));
+		    simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		    simde_mm256_store_pd(&v[l],simde_mm256_mul_pd(vv,twotothe256v));
 		  }
 		if(useFastScaling)
 		  addScale += wgt[i];				
@@ -1144,153 +1142,153 @@
 	      vr = &(x2[80 * i + 20 * k]);
 	      v  = &(x3[80 * i + 20 * k]);	      	   
 
-	      __m256d vv[5]; 
+	      simde__m256d vv[5]; 
 	      
-	      vv[0] = _mm256_setzero_pd();
-	      vv[1] = _mm256_setzero_pd();
-	      vv[2] = _mm256_setzero_pd();
-	      vv[3] = _mm256_setzero_pd();
-	      vv[4] = _mm256_setzero_pd();
+	      vv[0] = simde_mm256_setzero_pd();
+	      vv[1] = simde_mm256_setzero_pd();
+	      vv[2] = simde_mm256_setzero_pd();
+	      vv[3] = simde_mm256_setzero_pd();
+	      vv[4] = simde_mm256_setzero_pd();
 	      
 	      for(l = 0; l < 20; l++) 
 		{		  
-		  __m256d al = _mm256_setzero_pd();
-		  __m256d ar = _mm256_setzero_pd();
+		  simde__m256d al = simde_mm256_setzero_pd();
+		  simde__m256d ar = simde_mm256_setzero_pd();
        		  
-		  __m256d leftv  = _mm256_load_pd(&left[k * 400 + l * 20 + 0]);
-		  __m256d rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 0]);
-		  __m256d vlv = _mm256_load_pd(&vl[0]);
-		  __m256d vrv = _mm256_load_pd(&vr[0]);
+		  simde__m256d leftv  = simde_mm256_load_pd(&left[k * 400 + l * 20 + 0]);
+		  simde__m256d rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 0]);
+		  simde__m256d vlv = simde_mm256_load_pd(&vl[0]);
+		  simde__m256d vrv = simde_mm256_load_pd(&vr[0]);
 		  
 #ifdef _FMA
 		    
 		  al = FMAMACC(al, vlv, leftv);
 		  ar = FMAMACC(ar, vrv, rightv);
 #else
-		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));		  
+		  al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		  ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));		  
 #endif
 
-		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 4]);
-		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 4]);
-		  vlv = _mm256_load_pd(&vl[4]);
-		  vrv = _mm256_load_pd(&vr[4]);
+		  leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 4]);
+		  rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 4]);
+		  vlv = simde_mm256_load_pd(&vl[4]);
+		  vrv = simde_mm256_load_pd(&vr[4]);
 #ifdef _FMA
 		    
 		  al = FMAMACC(al, vlv, leftv);
 		  ar = FMAMACC(ar, vrv, rightv);
 #else
-		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+		  al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		  ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 
-		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 8]);
-		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 8]);
-		  vlv = _mm256_load_pd(&vl[8]);
-		  vrv = _mm256_load_pd(&vr[8]);
+		  leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 8]);
+		  rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 8]);
+		  vlv = simde_mm256_load_pd(&vl[8]);
+		  vrv = simde_mm256_load_pd(&vr[8]);
 #ifdef _FMA
 		    
 		  al = FMAMACC(al, vlv, leftv);
 		  ar = FMAMACC(ar, vrv, rightv);
 #else
-		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+		  al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		  ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 
-		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 12]);
-		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 12]);
-		  vlv = _mm256_load_pd(&vl[12]);
-		  vrv = _mm256_load_pd(&vr[12]);
+		  leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 12]);
+		  rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 12]);
+		  vlv = simde_mm256_load_pd(&vl[12]);
+		  vrv = simde_mm256_load_pd(&vr[12]);
 #ifdef _FMA
 		    
 		  al = FMAMACC(al, vlv, leftv);
 		  ar = FMAMACC(ar, vrv, rightv);
 #else
-		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+		  al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		  ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 
-		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 16]);
-		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 16]);
-		  vlv = _mm256_load_pd(&vl[16]);
-		  vrv = _mm256_load_pd(&vr[16]);
+		  leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 16]);
+		  rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 16]);
+		  vlv = simde_mm256_load_pd(&vl[16]);
+		  vrv = simde_mm256_load_pd(&vr[16]);
 
 #ifdef _FMA		    
 		  al = FMAMACC(al, vlv, leftv);
 		  ar = FMAMACC(ar, vrv, rightv);
 #else
-		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+		  al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		  ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 
 		  /**************************************************************************************************************/
 
 		  al = hadd3(al);
 		  ar = hadd3(ar);
-		  al = _mm256_mul_pd(ar,al);
+		  al = simde_mm256_mul_pd(ar,al);
 		  
 		  /************************************************************************************************************/
 #ifdef _FMA		    
-		  __m256d ev =  _mm256_load_pd(&extEV[20 * l + 0]);
+		  simde__m256d ev =  simde_mm256_load_pd(&extEV[20 * l + 0]);
 		  vv[0] = FMAMACC(vv[0], al, ev);		 
 #else
-		  vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(al, _mm256_load_pd(&extEV[k][20 * l + 0])));			  		 		  
+		  vv[0] = simde_mm256_add_pd(vv[0],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[k][20 * l + 0])));			  		 		  
 #endif
-		  _mm256_store_pd(&v[0],vv[0]);
+		  simde_mm256_store_pd(&v[0],vv[0]);
 
 #ifdef _FMA		    
-		  ev =  _mm256_load_pd(&extEV[20 * l + 4]);
+		  ev =  simde_mm256_load_pd(&extEV[20 * l + 4]);
 		  vv[1] = FMAMACC(vv[1], al, ev);		 
 #else
-		  vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(al, _mm256_load_pd(&extEV[k][20 * l + 4])));		  		 
+		  vv[1] = simde_mm256_add_pd(vv[1],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[k][20 * l + 4])));		  		 
 #endif
-		  _mm256_store_pd(&v[4],vv[1]);
+		  simde_mm256_store_pd(&v[4],vv[1]);
 
 #ifdef _FMA		    
-		  ev =  _mm256_load_pd(&extEV[20 * l + 8]);
+		  ev =  simde_mm256_load_pd(&extEV[20 * l + 8]);
 		  vv[2] = FMAMACC(vv[2], al, ev);		 
 #else
-		  vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(al, _mm256_load_pd(&extEV[k][20 * l + 8])));		  		 
+		  vv[2] = simde_mm256_add_pd(vv[2],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[k][20 * l + 8])));		  		 
 #endif
-		  _mm256_store_pd(&v[8],vv[2]);
+		  simde_mm256_store_pd(&v[8],vv[2]);
 
 #ifdef _FMA		    
-		  ev =  _mm256_load_pd(&extEV[20 * l + 12]);
+		  ev =  simde_mm256_load_pd(&extEV[20 * l + 12]);
 		  vv[3] = FMAMACC(vv[3], al, ev);		 
 #else
-		  vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(al, _mm256_load_pd(&extEV[k][20 * l + 12])));		  		 
+		  vv[3] = simde_mm256_add_pd(vv[3],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[k][20 * l + 12])));		  		 
 #endif
-		  _mm256_store_pd(&v[12],vv[3]);
+		  simde_mm256_store_pd(&v[12],vv[3]);
 
 #ifdef _FMA		    
-		  ev =  _mm256_load_pd(&extEV[20 * l + 16]);
+		  ev =  simde_mm256_load_pd(&extEV[20 * l + 16]);
 		  vv[4] = FMAMACC(vv[4], al, ev);		 
 #else
-		  vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(al, _mm256_load_pd(&extEV[k][20 * l + 16])));			 	  
+		  vv[4] = simde_mm256_add_pd(vv[4],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[k][20 * l + 16])));			 	  
 #endif
-		  _mm256_store_pd(&v[16],vv[4]);		 
+		  simde_mm256_store_pd(&v[16],vv[4]);		 
 		} 
 	    }
 	  v = &(x3[80 * i]);
 	  scale = 1;
-	  __m256d minlikelihood_avx = _mm256_set1_pd(minlikelihood);	 
+	  simde__m256d minlikelihood_avx = simde_mm256_set1_pd(minlikelihood);	 
 
 	  for(l = 0; scale && (l < 80); l += 4) 
 	    {
-	      __m256d vv = _mm256_load_pd(&v[l]);
-	      __m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
-	      vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
-	      if(_mm256_movemask_pd(vv_abs) != 15)
+	      simde__m256d vv = simde_mm256_load_pd(&v[l]);
+	      simde__m256d vv_abs = simde_mm256_and_pd(vv,absMask_AVX.m);
+	      vv_abs = simde_mm256_cmp_pd(vv_abs,minlikelihood_avx,SIMDE_CMP_LT_OS);
+	      if(simde_mm256_movemask_pd(vv_abs) != 15)
 		scale = 0;	     
 	    }
 
 	  if(scale) 
 	    {		     	      
-	      __m256d twotothe256v = _mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
+	      simde__m256d twotothe256v = simde_mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
 	      for(l = 0; l < 80; l += 4) 
 		{
-		  __m256d vv = _mm256_load_pd(&v[l]);
-		  _mm256_store_pd(&v[l],_mm256_mul_pd(vv,twotothe256v));
+		  simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		  simde_mm256_store_pd(&v[l],simde_mm256_mul_pd(vv,twotothe256v));
 		}
 	      if(useFastScaling)
 		addScale += wgt[i];					
@@ -1337,11 +1335,11 @@
 
 
 #if GCC_VERSION < 40500
-   __m256d
-    bitmask = _mm256_set_pd(0,0,0,-1);
+   simde__m256d
+    bitmask = simde_mm256_set_pd(0,0,0,-1);
 #else
-  __m256i
-    bitmask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+  simde__m256i
+    bitmask = simde_mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
 #endif 
   
   switch(tipCase) 
@@ -1363,28 +1361,28 @@
 		  *ll =  &left[k * 20],
 		  *rr =  &right[k * 20];
 		
-		__m256d 
-		  umpX1v = _mm256_setzero_pd(),
-		  umpX2v = _mm256_setzero_pd();
+		simde__m256d 
+		  umpX1v = simde_mm256_setzero_pd(),
+		  umpX2v = simde_mm256_setzero_pd();
 		
 		for(l = 0; l < 20; l+=4) 
 		  {
-		    __m256d vv = _mm256_load_pd(&v[l]);
+		    simde__m256d vv = simde_mm256_load_pd(&v[l]);
 #ifdef _FMA
-		    __m256d llv = _mm256_load_pd(&ll[l]);
+		    simde__m256d llv = simde_mm256_load_pd(&ll[l]);
 		    umpX1v = FMAMACC(umpX1v,vv,llv);
-		    __m256d rrv = _mm256_load_pd(&rr[l]);
+		    simde__m256d rrv = simde_mm256_load_pd(&rr[l]);
 		    umpX2v = FMAMACC(umpX2v,vv,rrv);
 #else		    
-		    umpX1v = _mm256_add_pd(umpX1v,_mm256_mul_pd(vv,_mm256_load_pd(&ll[l])));
-		    umpX2v = _mm256_add_pd(umpX2v,_mm256_mul_pd(vv,_mm256_load_pd(&rr[l])));
+		    umpX1v = simde_mm256_add_pd(umpX1v,simde_mm256_mul_pd(vv,simde_mm256_load_pd(&ll[l])));
+		    umpX2v = simde_mm256_add_pd(umpX2v,simde_mm256_mul_pd(vv,simde_mm256_load_pd(&rr[l])));
 #endif
 		  }
 		
 		umpX1v = hadd3(umpX1v);
 		umpX2v = hadd3(umpX2v);
-		_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
-		_mm256_maskstore_pd(&umpX2[80 * i + k], bitmask, umpX2v);
+		simde_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
+		simde_mm256_maskstore_pd(&umpX2[80 * i + k], bitmask, umpX2v);
 	      } 
 	  }
 
@@ -1395,61 +1393,61 @@
 	   
 	    for(j = 0; j < 4; j++) 
 	      {     	
-		__m256d vv[5];  
+		simde__m256d vv[5];  
 
 		v = &x3[i * 80 + j * 20];
 			
-		vv[0] = _mm256_setzero_pd();
-		vv[1] = _mm256_setzero_pd();
-		vv[2] = _mm256_setzero_pd();
-		vv[3] = _mm256_setzero_pd();
-		vv[4] = _mm256_setzero_pd();
+		vv[0] = simde_mm256_setzero_pd();
+		vv[1] = simde_mm256_setzero_pd();
+		vv[2] = simde_mm256_setzero_pd();
+		vv[3] = simde_mm256_setzero_pd();
+		vv[4] = simde_mm256_setzero_pd();
 
 		for(k = 0; k < 20; k++) 
 		  {			 
 		    x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
 
-		    __m256d x1px2v = _mm256_set1_pd(x1px2);		    
+		    simde__m256d x1px2v = simde_mm256_set1_pd(x1px2);		    
 		    
-		    __m256d extEvv = _mm256_load_pd(&extEV[20 * k]);
+		    simde__m256d extEvv = simde_mm256_load_pd(&extEV[20 * k]);
 #ifdef _FMA
 		    vv[0] = FMAMACC(vv[0],x1px2v,extEvv);
 #else
-		    vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v,extEvv));
+		    vv[0] = simde_mm256_add_pd(vv[0],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		    _mm256_store_pd(&v[0],vv[0]);
+		    simde_mm256_store_pd(&v[0],vv[0]);
 		    
-		    extEvv = _mm256_load_pd(&extEV[20 * k + 4]);
+		    extEvv = simde_mm256_load_pd(&extEV[20 * k + 4]);
 #ifdef _FMA
 		    vv[1] = FMAMACC(vv[1],x1px2v,extEvv);
 #else
-		    vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v,extEvv));
+		    vv[1] = simde_mm256_add_pd(vv[1],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		    _mm256_store_pd(&v[4],vv[1]);
+		    simde_mm256_store_pd(&v[4],vv[1]);
 
-		    extEvv = _mm256_load_pd(&extEV[20 * k + 8]);
+		    extEvv = simde_mm256_load_pd(&extEV[20 * k + 8]);
 #ifdef _FMA
 		    vv[2] = FMAMACC(vv[2],x1px2v,extEvv);
 #else
-		    vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v,extEvv));
+		    vv[2] = simde_mm256_add_pd(vv[2],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		    _mm256_store_pd(&v[8],vv[2]);
+		    simde_mm256_store_pd(&v[8],vv[2]);
 
-		    extEvv = _mm256_load_pd(&extEV[20 * k + 12]);
+		    extEvv = simde_mm256_load_pd(&extEV[20 * k + 12]);
 #ifdef _FMA
 		    vv[3] = FMAMACC(vv[3],x1px2v,extEvv);
 #else
-		    vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v,extEvv));
+		    vv[3] = simde_mm256_add_pd(vv[3],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		    _mm256_store_pd(&v[12],vv[3]);
+		    simde_mm256_store_pd(&v[12],vv[3]);
 
-		    extEvv = _mm256_load_pd(&extEV[20 * k + 16]);
+		    extEvv = simde_mm256_load_pd(&extEV[20 * k + 16]);
 #ifdef _FMA
 		    vv[4] = FMAMACC(vv[4],x1px2v,extEvv);
 #else
-		    vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v,extEvv));
+		    vv[4] = simde_mm256_add_pd(vv[4],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		    _mm256_store_pd(&v[16],vv[4]);
+		    simde_mm256_store_pd(&v[16],vv[4]);
 		  } 
 	      } 
 	  } 
@@ -1468,20 +1466,20 @@
 
 	    for(k = 0; k < 80; k++) 
 	      {
-		__m256d umpX1v = _mm256_setzero_pd();
+		simde__m256d umpX1v = simde_mm256_setzero_pd();
 		for(l = 0; l < 20; l+=4) 
 		  {
-		    __m256d vv = _mm256_load_pd(&v[l]);
-		    __m256d leftv = _mm256_load_pd(&left[k * 20 + l]);
+		    simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		    simde__m256d leftv = simde_mm256_load_pd(&left[k * 20 + l]);
 #ifdef _FMA
 		   
 		    umpX1v = FMAMACC(umpX1v, vv, leftv);
 #else
-		    umpX1v = _mm256_add_pd(umpX1v, _mm256_mul_pd(vv, leftv));
+		    umpX1v = simde_mm256_add_pd(umpX1v, simde_mm256_mul_pd(vv, leftv));
 #endif
 		  }
 		umpX1v = hadd3(umpX1v);
-		_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
+		simde_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
 	      } 
 	  }
 	
@@ -1495,131 +1493,131 @@
 		
 		for(l = 0; l < 20; l++) 
 		  {
-		    __m256d ump_x2v = _mm256_setzero_pd();
+		    simde__m256d ump_x2v = simde_mm256_setzero_pd();
 		    		  
-		    __m256d vv = _mm256_load_pd(&v[0]);
-		    __m256d rightv = _mm256_load_pd(&right[k*400+l*20+0]);
+		    simde__m256d vv = simde_mm256_load_pd(&v[0]);
+		    simde__m256d rightv = simde_mm256_load_pd(&right[k*400+l*20+0]);
 #ifdef _FMA
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 		    
-		    vv = _mm256_load_pd(&v[4]);
-		    rightv = _mm256_load_pd(&right[k*400+l*20+4]);
+		    vv = simde_mm256_load_pd(&v[4]);
+		    rightv = simde_mm256_load_pd(&right[k*400+l*20+4]);
 #ifdef _FMA
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 
-		    vv = _mm256_load_pd(&v[8]);
-		    rightv = _mm256_load_pd(&right[k*400+l*20+8]);
+		    vv = simde_mm256_load_pd(&v[8]);
+		    rightv = simde_mm256_load_pd(&right[k*400+l*20+8]);
 #ifdef _FMA
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 
-		    vv = _mm256_load_pd(&v[12]);
-		    rightv = _mm256_load_pd(&right[k*400+l*20+12]);
+		    vv = simde_mm256_load_pd(&v[12]);
+		    rightv = simde_mm256_load_pd(&right[k*400+l*20+12]);
 #ifdef _FMA
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 
-		    vv = _mm256_load_pd(&v[16]);
-		    rightv = _mm256_load_pd(&right[k*400+l*20+16]);
+		    vv = simde_mm256_load_pd(&v[16]);
+		    rightv = simde_mm256_load_pd(&right[k*400+l*20+16]);
 #ifdef _FMA
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 		    
 		    ump_x2v = hadd3(ump_x2v);
-		    _mm256_maskstore_pd(&ump_x2[l], bitmask, ump_x2v);
+		    simde_mm256_maskstore_pd(&ump_x2[l], bitmask, ump_x2v);
 		  }
 		
 		v = &(x3[80 * i + 20 * k]);
 	
 
-		__m256d vv[5]; 
+		simde__m256d vv[5]; 
 
-		vv[0] = _mm256_setzero_pd();
-		vv[1] = _mm256_setzero_pd();
-		vv[2] = _mm256_setzero_pd();
-		vv[3] = _mm256_setzero_pd();
-		vv[4] = _mm256_setzero_pd();
+		vv[0] = simde_mm256_setzero_pd();
+		vv[1] = simde_mm256_setzero_pd();
+		vv[2] = simde_mm256_setzero_pd();
+		vv[3] = simde_mm256_setzero_pd();
+		vv[4] = simde_mm256_setzero_pd();
 		
 		for(l = 0; l < 20; l++) 
 		  {
 		    x1px2 = uX1[k * 20 + l]	* ump_x2[l];
-		    __m256d x1px2v = _mm256_set1_pd(x1px2);	
+		    simde__m256d x1px2v = simde_mm256_set1_pd(x1px2);	
 	    		 
 #ifdef _FMA
-		    __m256d ev = _mm256_load_pd(&extEV[l * 20 + 0]);
+		    simde__m256d ev = simde_mm256_load_pd(&extEV[l * 20 + 0]);
 		    vv[0] = FMAMACC(vv[0],x1px2v, ev);
 #else
-		    vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 0])));
+		    vv[0] = simde_mm256_add_pd(vv[0],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 0])));
 #endif
-		    _mm256_store_pd(&v[0],vv[0]);
+		    simde_mm256_store_pd(&v[0],vv[0]);
 
 #ifdef _FMA
-		    ev = _mm256_load_pd(&extEV[l * 20 + 4]);
+		    ev = simde_mm256_load_pd(&extEV[l * 20 + 4]);
 		    vv[1] = FMAMACC(vv[1],x1px2v, ev);
 #else
-		    vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 4])));
+		    vv[1] = simde_mm256_add_pd(vv[1],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 4])));
 #endif
-		    _mm256_store_pd(&v[4],vv[1]);
+		    simde_mm256_store_pd(&v[4],vv[1]);
 
 #ifdef _FMA
-		    ev = _mm256_load_pd(&extEV[l * 20 + 8]);
+		    ev = simde_mm256_load_pd(&extEV[l * 20 + 8]);
 		    vv[2] = FMAMACC(vv[2],x1px2v, ev);
 #else
-		    vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 8])));
+		    vv[2] = simde_mm256_add_pd(vv[2],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 8])));
 #endif
-		    _mm256_store_pd(&v[8],vv[2]);
+		    simde_mm256_store_pd(&v[8],vv[2]);
 		    
 #ifdef _FMA
-		    ev = _mm256_load_pd(&extEV[l * 20 + 12]);
+		    ev = simde_mm256_load_pd(&extEV[l * 20 + 12]);
 		    vv[3] = FMAMACC(vv[3],x1px2v, ev);
 #else
-		    vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 12])));
+		    vv[3] = simde_mm256_add_pd(vv[3],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 12])));
 #endif
-		    _mm256_store_pd(&v[12],vv[3]);
+		    simde_mm256_store_pd(&v[12],vv[3]);
 
 
 #ifdef _FMA
-		    ev = _mm256_load_pd(&extEV[l * 20 + 16]);
+		    ev = simde_mm256_load_pd(&extEV[l * 20 + 16]);
 		    vv[4] = FMAMACC(vv[4],x1px2v, ev);
 #else
-		    vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 16])));
+		    vv[4] = simde_mm256_add_pd(vv[4],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 16])));
 #endif
-		    _mm256_store_pd(&v[16],vv[4]);
+		    simde_mm256_store_pd(&v[16],vv[4]);
 
 		  } 
 	      }
 	   
 	    v = &x3[80 * i];
-	    __m256d minlikelihood_avx = _mm256_set1_pd(minlikelihood);
+	    simde__m256d minlikelihood_avx = simde_mm256_set1_pd(minlikelihood);
 	    scale = 1;
 	    for(l = 0; scale && (l < 80); l += 4) 
 	      {
-		__m256d vv = _mm256_load_pd(&v[l]);
-		__m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
-		vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
-		if(_mm256_movemask_pd(vv_abs) != 15)
+		simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		simde__m256d vv_abs = simde_mm256_and_pd(vv,absMask_AVX.m);
+		vv_abs = simde_mm256_cmp_pd(vv_abs,minlikelihood_avx,SIMDE_CMP_LT_OS);
+		if(simde_mm256_movemask_pd(vv_abs) != 15)
 		  scale = 0;
 	      }
 	    
 	    if(scale) 
 	      {		
-		__m256d twotothe256v = _mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
+		simde__m256d twotothe256v = simde_mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
 		for(l = 0; l < 80; l += 4) 
 		  {
-		    __m256d vv = _mm256_load_pd(&v[l]);
-		    _mm256_store_pd(&v[l],_mm256_mul_pd(vv,twotothe256v));
+		    simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		    simde_mm256_store_pd(&v[l],simde_mm256_mul_pd(vv,twotothe256v));
 		  }
 	
 		addScale += wgt[i];				
@@ -1639,153 +1637,153 @@
 	      vr = &(x2[80 * i + 20 * k]);
 	      v  = &(x3[80 * i + 20 * k]);	      	   
 
-	      __m256d vv[5]; 
+	      simde__m256d vv[5]; 
 	      
-	      vv[0] = _mm256_setzero_pd();
-	      vv[1] = _mm256_setzero_pd();
-	      vv[2] = _mm256_setzero_pd();
-	      vv[3] = _mm256_setzero_pd();
-	      vv[4] = _mm256_setzero_pd();
+	      vv[0] = simde_mm256_setzero_pd();
+	      vv[1] = simde_mm256_setzero_pd();
+	      vv[2] = simde_mm256_setzero_pd();
+	      vv[3] = simde_mm256_setzero_pd();
+	      vv[4] = simde_mm256_setzero_pd();
 	      
 	      for(l = 0; l < 20; l++) 
 		{		  
-		  __m256d al = _mm256_setzero_pd();
-		  __m256d ar = _mm256_setzero_pd();
+		  simde__m256d al = simde_mm256_setzero_pd();
+		  simde__m256d ar = simde_mm256_setzero_pd();
        		  
-		  __m256d leftv  = _mm256_load_pd(&left[k * 400 + l * 20 + 0]);
-		  __m256d rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 0]);
-		  __m256d vlv = _mm256_load_pd(&vl[0]);
-		  __m256d vrv = _mm256_load_pd(&vr[0]);
+		  simde__m256d leftv  = simde_mm256_load_pd(&left[k * 400 + l * 20 + 0]);
+		  simde__m256d rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 0]);
+		  simde__m256d vlv = simde_mm256_load_pd(&vl[0]);
+		  simde__m256d vrv = simde_mm256_load_pd(&vr[0]);
 		  
 #ifdef _FMA
 		    
 		  al = FMAMACC(al, vlv, leftv);
 		  ar = FMAMACC(ar, vrv, rightv);
 #else
-		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));		  
+		  al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		  ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));		  
 #endif
 
-		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 4]);
-		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 4]);
-		  vlv = _mm256_load_pd(&vl[4]);
-		  vrv = _mm256_load_pd(&vr[4]);
+		  leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 4]);
+		  rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 4]);
+		  vlv = simde_mm256_load_pd(&vl[4]);
+		  vrv = simde_mm256_load_pd(&vr[4]);
 #ifdef _FMA
 		    
 		  al = FMAMACC(al, vlv, leftv);
 		  ar = FMAMACC(ar, vrv, rightv);
 #else
-		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+		  al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		  ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 
-		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 8]);
-		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 8]);
-		  vlv = _mm256_load_pd(&vl[8]);
-		  vrv = _mm256_load_pd(&vr[8]);
+		  leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 8]);
+		  rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 8]);
+		  vlv = simde_mm256_load_pd(&vl[8]);
+		  vrv = simde_mm256_load_pd(&vr[8]);
 #ifdef _FMA
 		    
 		  al = FMAMACC(al, vlv, leftv);
 		  ar = FMAMACC(ar, vrv, rightv);
 #else
-		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+		  al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		  ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 
-		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 12]);
-		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 12]);
-		  vlv = _mm256_load_pd(&vl[12]);
-		  vrv = _mm256_load_pd(&vr[12]);
+		  leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 12]);
+		  rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 12]);
+		  vlv = simde_mm256_load_pd(&vl[12]);
+		  vrv = simde_mm256_load_pd(&vr[12]);
 #ifdef _FMA
 		    
 		  al = FMAMACC(al, vlv, leftv);
 		  ar = FMAMACC(ar, vrv, rightv);
 #else
-		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+		  al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		  ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 
-		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 16]);
-		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 16]);
-		  vlv = _mm256_load_pd(&vl[16]);
-		  vrv = _mm256_load_pd(&vr[16]);
+		  leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 16]);
+		  rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 16]);
+		  vlv = simde_mm256_load_pd(&vl[16]);
+		  vrv = simde_mm256_load_pd(&vr[16]);
 
 #ifdef _FMA		    
 		  al = FMAMACC(al, vlv, leftv);
 		  ar = FMAMACC(ar, vrv, rightv);
 #else
-		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+		  al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		  ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 
 		  /**************************************************************************************************************/
 
 		  al = hadd3(al);
 		  ar = hadd3(ar);
-		  al = _mm256_mul_pd(ar,al);
+		  al = simde_mm256_mul_pd(ar,al);
 		  
 		  /************************************************************************************************************/
 #ifdef _FMA		    
-		  __m256d ev =  _mm256_load_pd(&extEV[20 * l + 0]);
+		  simde__m256d ev =  simde_mm256_load_pd(&extEV[20 * l + 0]);
 		  vv[0] = FMAMACC(vv[0], al, ev);		 
 #else
-		  vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 0])));			  		 		  
+		  vv[0] = simde_mm256_add_pd(vv[0],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 0])));			  		 		  
 #endif
-		  _mm256_store_pd(&v[0],vv[0]);
+		  simde_mm256_store_pd(&v[0],vv[0]);
 
 #ifdef _FMA		    
-		  ev =  _mm256_load_pd(&extEV[20 * l + 4]);
+		  ev =  simde_mm256_load_pd(&extEV[20 * l + 4]);
 		  vv[1] = FMAMACC(vv[1], al, ev);		 
 #else
-		  vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 4])));		  		 
+		  vv[1] = simde_mm256_add_pd(vv[1],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 4])));		  		 
 #endif
-		  _mm256_store_pd(&v[4],vv[1]);
+		  simde_mm256_store_pd(&v[4],vv[1]);
 
 #ifdef _FMA		    
-		  ev =  _mm256_load_pd(&extEV[20 * l + 8]);
+		  ev =  simde_mm256_load_pd(&extEV[20 * l + 8]);
 		  vv[2] = FMAMACC(vv[2], al, ev);		 
 #else
-		  vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 8])));		  		 
+		  vv[2] = simde_mm256_add_pd(vv[2],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 8])));		  		 
 #endif
-		  _mm256_store_pd(&v[8],vv[2]);
+		  simde_mm256_store_pd(&v[8],vv[2]);
 
 #ifdef _FMA		    
-		  ev =  _mm256_load_pd(&extEV[20 * l + 12]);
+		  ev =  simde_mm256_load_pd(&extEV[20 * l + 12]);
 		  vv[3] = FMAMACC(vv[3], al, ev);		 
 #else
-		  vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 12])));		  		 
+		  vv[3] = simde_mm256_add_pd(vv[3],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 12])));		  		 
 #endif
-		  _mm256_store_pd(&v[12],vv[3]);
+		  simde_mm256_store_pd(&v[12],vv[3]);
 
 #ifdef _FMA		    
-		  ev =  _mm256_load_pd(&extEV[20 * l + 16]);
+		  ev =  simde_mm256_load_pd(&extEV[20 * l + 16]);
 		  vv[4] = FMAMACC(vv[4], al, ev);		 
 #else
-		  vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 16])));			 	  
+		  vv[4] = simde_mm256_add_pd(vv[4],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 16])));			 	  
 #endif
-		  _mm256_store_pd(&v[16],vv[4]);		 
+		  simde_mm256_store_pd(&v[16],vv[4]);		 
 		} 
 	    }
 	  v = &(x3[80 * i]);
 	  scale = 1;
-	  __m256d minlikelihood_avx = _mm256_set1_pd(minlikelihood);	 
+	  simde__m256d minlikelihood_avx = simde_mm256_set1_pd(minlikelihood);	 
 
 	  for(l = 0; scale && (l < 80); l += 4) 
 	    {
-	      __m256d vv = _mm256_load_pd(&v[l]);
-	      __m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
-	      vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
-	      if(_mm256_movemask_pd(vv_abs) != 15)
+	      simde__m256d vv = simde_mm256_load_pd(&v[l]);
+	      simde__m256d vv_abs = simde_mm256_and_pd(vv,absMask_AVX.m);
+	      vv_abs = simde_mm256_cmp_pd(vv_abs,minlikelihood_avx,SIMDE_CMP_LT_OS);
+	      if(simde_mm256_movemask_pd(vv_abs) != 15)
 		scale = 0;	     
 	    }
 
 	  if(scale) 
 	    {		     	      
-	      __m256d twotothe256v = _mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
+	      simde__m256d twotothe256v = simde_mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
 	      for(l = 0; l < 80; l += 4) 
 		{
-		  __m256d vv = _mm256_load_pd(&v[l]);
-		  _mm256_store_pd(&v[l],_mm256_mul_pd(vv,twotothe256v));
+		  simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		  simde_mm256_store_pd(&v[l],simde_mm256_mul_pd(vv,twotothe256v));
 		}
 	     
 	      addScale += wgt[i];						    
@@ -1820,9 +1818,9 @@
     scaleGap,
     addScale = 0;
  
-  __m256d 
-    minlikelihood_avx = _mm256_set1_pd( minlikelihood ),
-    twoto = _mm256_set1_pd(twotothe256);
+  simde__m256d 
+    minlikelihood_avx = simde_mm256_set1_pd( minlikelihood ),
+    twoto = simde_mm256_set1_pd(twotothe256);
  
   double
     *x1,
@@ -1843,8 +1841,8 @@
 
 	for (i = 1; i < 16; i++)
 	  {
-	    __m256d 
-	      tv = _mm256_load_pd(&(tipVector[i * 4]));
+	    simde__m256d 
+	      tv = simde_mm256_load_pd(&(tipVector[i * 4]));
 
 	    int 
 	      j;
@@ -1852,25 +1850,25 @@
 	    for (j = 0; j < 4; j++)
 	      for (k = 0; k < 4; k++)
 		{		 
-		  __m256d 
-		    left1 = _mm256_load_pd(&left[j * 16 + k * 4]);		  		  		  
+		  simde__m256d 
+		    left1 = simde_mm256_load_pd(&left[j * 16 + k * 4]);		  		  		  
 
-		  left1 = _mm256_mul_pd(left1, tv);		  
+		  left1 = simde_mm256_mul_pd(left1, tv);		  
 		  left1 = hadd3(left1);
 		  		  		  
-		  _mm256_store_pd(&umpX1[i * 64 + j * 16 + k * 4], left1);
+		  simde_mm256_store_pd(&umpX1[i * 64 + j * 16 + k * 4], left1);
 		}
 	  
 	    for (j = 0; j < 4; j++)
 	      for (k = 0; k < 4; k++)
 		{		 
-		  __m256d 
-		    left1 = _mm256_load_pd(&right[j * 16 + k * 4]);		  		  		  
+		  simde__m256d 
+		    left1 = simde_mm256_load_pd(&right[j * 16 + k * 4]);		  		  		  
 
-		  left1 = _mm256_mul_pd(left1, tv);		  
+		  left1 = simde_mm256_mul_pd(left1, tv);		  
 		  left1 = hadd3(left1);
 		  		  		  
-		  _mm256_store_pd(&umpX2[i * 64 + j * 16 + k * 4], left1);
+		  simde_mm256_store_pd(&umpX2[i * 64 + j * 16 + k * 4], left1);
 		}	    
 	  }   	
 	  
@@ -1882,27 +1880,27 @@
 	  
 	  for(k = 0; k < 4; k++)
 	    {
-	      __m256d	   
-		xv = _mm256_setzero_pd();
+	      simde__m256d	   
+		xv = simde_mm256_setzero_pd();
 	      
 	      int 
 		l;
 	      
 	      for(l = 0; l < 4; l++)
 		{	       	     				      	      																	   
-		  __m256d
-		    x1v =  _mm256_mul_pd(_mm256_load_pd(&uX1[k * 16 + l * 4]), _mm256_load_pd(&uX2[k * 16 + l * 4]));
+		  simde__m256d
+		    x1v =  simde_mm256_mul_pd(simde_mm256_load_pd(&uX1[k * 16 + l * 4]), simde_mm256_load_pd(&uX2[k * 16 + l * 4]));
 		  
-		  __m256d 
-		    evv = _mm256_load_pd(&extEV[l * 4]);
+		  simde__m256d 
+		    evv = simde_mm256_load_pd(&extEV[l * 4]);
 #ifdef _FMA
 		  xv = FMAMACC(xv,x1v,evv);
 #else						  
-		  xv = _mm256_add_pd(xv, _mm256_mul_pd(x1v, evv));
+		  xv = simde_mm256_add_pd(xv, simde_mm256_mul_pd(x1v, evv));
 #endif
 		}
 		    
-	      _mm256_store_pd(&x3[4 * k], xv);
+	      simde_mm256_store_pd(&x3[4 * k], xv);
 	    }
 	}
 	
@@ -1917,27 +1915,27 @@
 	    
 		for(k = 0; k < 4; k++)
 		  {
-		    __m256d	   
-		      xv = _mm256_setzero_pd();
+		    simde__m256d	   
+		      xv = simde_mm256_setzero_pd();
 	       
 		    int 
 		      l;
 		
 		    for(l = 0; l < 4; l++)
 		      {	       	     				      	      																	   
-			__m256d
-			  x1v =  _mm256_mul_pd(_mm256_load_pd(&uX1[k * 16 + l * 4]), _mm256_load_pd(&uX2[k * 16 + l * 4]));
+			simde__m256d
+			  x1v =  simde_mm256_mul_pd(simde_mm256_load_pd(&uX1[k * 16 + l * 4]), simde_mm256_load_pd(&uX2[k * 16 + l * 4]));
 			
-			__m256d 
-			  evv = _mm256_load_pd(&extEV[l * 4]);
+			simde__m256d 
+			  evv = simde_mm256_load_pd(&extEV[l * 4]);
 #ifdef _FMA
 			xv = FMAMACC(xv,x1v,evv);
 #else						  
-			xv = _mm256_add_pd(xv, _mm256_mul_pd(x1v, evv));
+			xv = simde_mm256_add_pd(xv, simde_mm256_mul_pd(x1v, evv));
 #endif
 		      }
 		    
-		    _mm256_store_pd(&x3[4 * k], xv);
+		    simde_mm256_store_pd(&x3[4 * k], xv);
 		  }
 
 		x3 += 16;
@@ -1953,8 +1951,8 @@
        
 	for (i = 1; i < 16; i++)
 	  {
-	    __m256d 
-	      tv = _mm256_load_pd(&(tipVector[i*4]));
+	    simde__m256d 
+	      tv = simde_mm256_load_pd(&(tipVector[i*4]));
 
 	    int 
 	      j;
@@ -1962,18 +1960,18 @@
 	    for (j = 0; j < 4; j++)
 	      for (k = 0; k < 4; k++)
 		{		 
-		  __m256d 
-		    left1 = _mm256_load_pd(&left[j * 16 + k * 4]);		  		  		  
+		  simde__m256d 
+		    left1 = simde_mm256_load_pd(&left[j * 16 + k * 4]);		  		  		  
 
-		  left1 = _mm256_mul_pd(left1, tv);		  
+		  left1 = simde_mm256_mul_pd(left1, tv);		  
 		  left1 = hadd3(left1);
 		  		  		  
-		  _mm256_store_pd(&umpX1[i * 64 + j * 16 + k * 4], left1);
+		  simde_mm256_store_pd(&umpX1[i * 64 + j * 16 + k * 4], left1);
 		}	 	   
 	  }	
 
 	{ 
-	  __m256d
+	  simde__m256d
 	    xv[4];
 	  
 	  scaleGap = 1;
@@ -1984,57 +1982,57 @@
 
 	  for(k = 0; k < 4; k++)
 	    {
-	      __m256d	   		 
-		xvr = _mm256_load_pd(&(x2[k * 4]));
+	      simde__m256d	   		 
+		xvr = simde_mm256_load_pd(&(x2[k * 4]));
 
 	      int 
 		l;
 
-	      xv[k]  = _mm256_setzero_pd();
+	      xv[k]  = simde_mm256_setzero_pd();
 		  
 	      for(l = 0; l < 4; l++)
 		{	       	     				      	      															
-		  __m256d  
-		    x1v = _mm256_load_pd(&uX1[k * 16 + l * 4]),		     
-		    x2v = _mm256_mul_pd(xvr, _mm256_load_pd(&right[k * 16 + l * 4]));			    
+		  simde__m256d  
+		    x1v = simde_mm256_load_pd(&uX1[k * 16 + l * 4]),		     
+		    x2v = simde_mm256_mul_pd(xvr, simde_mm256_load_pd(&right[k * 16 + l * 4]));			    
 			
 		  x2v = hadd3(x2v);
-		  x1v = _mm256_mul_pd(x1v, x2v);			
+		  x1v = simde_mm256_mul_pd(x1v, x2v);			
 		
-		  __m256d 
-		    evv = _mm256_load_pd(&extEV[l * 4]);
+		  simde__m256d 
+		    evv = simde_mm256_load_pd(&extEV[l * 4]);
 			
 #ifdef _FMA
 		  xv[k] = FMAMACC(xv[k],x1v,evv);
 #else			  
-		  xv[k] = _mm256_add_pd(xv[k], _mm256_mul_pd(x1v, evv));
+		  xv[k] = simde_mm256_add_pd(xv[k], simde_mm256_mul_pd(x1v, evv));
 #endif
 		}
 		    
 	      if(scaleGap)
 		{
-		  __m256d 	     
-		    v1 = _mm256_and_pd(xv[k], absMask_AVX.m);
+		  simde__m256d 	     
+		    v1 = simde_mm256_and_pd(xv[k], absMask_AVX.m);
 		  
-		  v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+		  v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 		    
-		  if(_mm256_movemask_pd( v1 ) != 15)
+		  if(simde_mm256_movemask_pd( v1 ) != 15)
 		    scaleGap = 0;
 		}
 	    }
 	
 	  if(scaleGap)
 	    {
-	      xv[0] = _mm256_mul_pd(xv[0], twoto);
-	      xv[1] = _mm256_mul_pd(xv[1], twoto);
-	      xv[2] = _mm256_mul_pd(xv[2], twoto);
-	      xv[3] = _mm256_mul_pd(xv[3], twoto);	    
+	      xv[0] = simde_mm256_mul_pd(xv[0], twoto);
+	      xv[1] = simde_mm256_mul_pd(xv[1], twoto);
+	      xv[2] = simde_mm256_mul_pd(xv[2], twoto);
+	      xv[3] = simde_mm256_mul_pd(xv[3], twoto);	    
 	    }
 
-	  _mm256_store_pd(&x3[0],      xv[0]);
-	  _mm256_store_pd(&x3[4],  xv[1]);
-	  _mm256_store_pd(&x3[8],  xv[2]);
-	  _mm256_store_pd(&x3[12], xv[3]);
+	  simde_mm256_store_pd(&x3[0],      xv[0]);
+	  simde_mm256_store_pd(&x3[4],  xv[1]);
+	  simde_mm256_store_pd(&x3[8],  xv[2]);
+	  simde_mm256_store_pd(&x3[12], xv[3]);
 	}
 	
 	x3 = x3_start;
@@ -2061,7 +2059,7 @@
 		    x2_ptr += 16;
 		  }
 		
-		__m256d
+		simde__m256d
 		  xv[4];	    	   
 		
 		scale = 1;
@@ -2069,51 +2067,51 @@
 		
 		for(k = 0; k < 4; k++)
 		  {
-		    __m256d	   		 
-		      xvr = _mm256_load_pd(&(x2[k * 4]));
+		    simde__m256d	   		 
+		      xvr = simde_mm256_load_pd(&(x2[k * 4]));
 		    
 		    int 
 		      l;
 		    
-		    xv[k]  = _mm256_setzero_pd();
+		    xv[k]  = simde_mm256_setzero_pd();
 		    
 		    for(l = 0; l < 4; l++)
 		      {	       	     				      	      															
-			__m256d  
-			  x1v = _mm256_load_pd(&uX1[k * 16 + l * 4]),		     
-			  x2v = _mm256_mul_pd(xvr, _mm256_load_pd(&right[k * 16 + l * 4]));			    
+			simde__m256d  
+			  x1v = simde_mm256_load_pd(&uX1[k * 16 + l * 4]),		     
+			  x2v = simde_mm256_mul_pd(xvr, simde_mm256_load_pd(&right[k * 16 + l * 4]));			    
 			
 			x2v = hadd3(x2v);
-			x1v = _mm256_mul_pd(x1v, x2v);			
+			x1v = simde_mm256_mul_pd(x1v, x2v);			
 			
-			__m256d 
-			  evv = _mm256_load_pd(&extEV[l * 4]);
+			simde__m256d 
+			  evv = simde_mm256_load_pd(&extEV[l * 4]);
 			
 #ifdef _FMA
 			xv[k] = FMAMACC(xv[k],x1v,evv);
 #else			  
-			xv[k] = _mm256_add_pd(xv[k], _mm256_mul_pd(x1v, evv));
+			xv[k] = simde_mm256_add_pd(xv[k], simde_mm256_mul_pd(x1v, evv));
 #endif
 		      }
 		    
 		    if(scale)
 		      {
-			__m256d 	     
-			  v1 = _mm256_and_pd(xv[k], absMask_AVX.m);
+			simde__m256d 	     
+			  v1 = simde_mm256_and_pd(xv[k], absMask_AVX.m);
 			
-			v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+			v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 			
-			if(_mm256_movemask_pd( v1 ) != 15)
+			if(simde_mm256_movemask_pd( v1 ) != 15)
 			  scale = 0;
 		      }
 		  }	    
 	      
 		if(scale)
 		  {
-		    xv[0] = _mm256_mul_pd(xv[0], twoto);
-		    xv[1] = _mm256_mul_pd(xv[1], twoto);
-		    xv[2] = _mm256_mul_pd(xv[2], twoto);
-		    xv[3] = _mm256_mul_pd(xv[3], twoto);
+		    xv[0] = simde_mm256_mul_pd(xv[0], twoto);
+		    xv[1] = simde_mm256_mul_pd(xv[1], twoto);
+		    xv[2] = simde_mm256_mul_pd(xv[2], twoto);
+		    xv[3] = simde_mm256_mul_pd(xv[3], twoto);
 
 		    if(useFastScaling)
 		      addScale += wgt[i];
@@ -2121,10 +2119,10 @@
 		      ex3[i] += 1;		   
 		  }
 	      
-		_mm256_store_pd(&x3[0],      xv[0]);
-		_mm256_store_pd(&x3[4],  xv[1]);
-		_mm256_store_pd(&x3[8],  xv[2]);
-		_mm256_store_pd(&x3[12], xv[3]);
+		simde_mm256_store_pd(&x3[0],      xv[0]);
+		simde_mm256_store_pd(&x3[4],  xv[1]);
+		simde_mm256_store_pd(&x3[8],  xv[2]);
+		simde_mm256_store_pd(&x3[12], xv[3]);
 	      
 		x3 += 16;
 	      }
@@ -2138,61 +2136,61 @@
 	  x2 = x2_gapColumn;	    
 	  x3 = x3_gapColumn;
 
-	  __m256d
+	  simde__m256d
 	    xv[4];
 	    
 	  scaleGap = 1;
 
 	  for(k = 0; k < 4; k++)
 	    {
-	      __m256d	   
+	      simde__m256d	   
 		
-		xvl = _mm256_load_pd(&(x1[k * 4])),
-		xvr = _mm256_load_pd(&(x2[k * 4]));
+		xvl = simde_mm256_load_pd(&(x1[k * 4])),
+		xvr = simde_mm256_load_pd(&(x2[k * 4]));
 
 	      int 
 		l;
 
-	      xv[k] = _mm256_setzero_pd();
+	      xv[k] = simde_mm256_setzero_pd();
 
 	      for(l = 0; l < 4; l++)
 		{	       	     				      	      															
-		  __m256d 
-		    x1v = _mm256_mul_pd(xvl, _mm256_load_pd(&left[k * 16 + l * 4])),
-		    x2v = _mm256_mul_pd(xvr, _mm256_load_pd(&right[k * 16 + l * 4]));			    
+		  simde__m256d 
+		    x1v = simde_mm256_mul_pd(xvl, simde_mm256_load_pd(&left[k * 16 + l * 4])),
+		    x2v = simde_mm256_mul_pd(xvr, simde_mm256_load_pd(&right[k * 16 + l * 4]));			    
 		  
 		  x1v = hadd4(x1v, x2v);			
 		  
-		  __m256d 
-		    evv = _mm256_load_pd(&extEV[l * 4]);
+		  simde__m256d 
+		    evv = simde_mm256_load_pd(&extEV[l * 4]);
 		  
-		  xv[k] = _mm256_add_pd(xv[k], _mm256_mul_pd(x1v, evv));
+		  xv[k] = simde_mm256_add_pd(xv[k], simde_mm256_mul_pd(x1v, evv));
 		}
 		
 	      if(scaleGap)
 		  {
-		    __m256d 	     
-		      v1 = _mm256_and_pd(xv[k], absMask_AVX.m);
+		    simde__m256d 	     
+		      v1 = simde_mm256_and_pd(xv[k], absMask_AVX.m);
 
-		    v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+		    v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 		    
-		    if(_mm256_movemask_pd( v1 ) != 15)
+		    if(simde_mm256_movemask_pd( v1 ) != 15)
 		      scaleGap = 0;
 		  }
 	    }
 
 	  if(scaleGap)
 	    {
-	      xv[0] = _mm256_mul_pd(xv[0], twoto);
-	      xv[1] = _mm256_mul_pd(xv[1], twoto);
-	      xv[2] = _mm256_mul_pd(xv[2], twoto);
-	      xv[3] = _mm256_mul_pd(xv[3], twoto);	       
+	      xv[0] = simde_mm256_mul_pd(xv[0], twoto);
+	      xv[1] = simde_mm256_mul_pd(xv[1], twoto);
+	      xv[2] = simde_mm256_mul_pd(xv[2], twoto);
+	      xv[3] = simde_mm256_mul_pd(xv[3], twoto);	       
 	    }
 		
-	  _mm256_store_pd(&x3[0],  xv[0]);
-	  _mm256_store_pd(&x3[4],  xv[1]);
-	  _mm256_store_pd(&x3[8],  xv[2]);
-	  _mm256_store_pd(&x3[12], xv[3]);
+	  simde_mm256_store_pd(&x3[0],  xv[0]);
+	  simde_mm256_store_pd(&x3[4],  xv[1]);
+	  simde_mm256_store_pd(&x3[8],  xv[2]);
+	  simde_mm256_store_pd(&x3[12], xv[3]);
 	}	  
       
 	x3 = x3_start;
@@ -2227,55 +2225,55 @@
 		    x2_ptr += 16;
 		  }
 
-		__m256d
+		simde__m256d
 		  xv[4];
 	    
 		scale = 1;
 
 		for(k = 0; k < 4; k++)
 		  {
-		    __m256d	   
+		    simde__m256d	   
 		      
-		      xvl = _mm256_load_pd(&(x1[k * 4])),
-		      xvr = _mm256_load_pd(&(x2[k * 4]));
+		      xvl = simde_mm256_load_pd(&(x1[k * 4])),
+		      xvr = simde_mm256_load_pd(&(x2[k * 4]));
 		    
 		    int 
 		      l;
 		    
-		    xv[k] = _mm256_setzero_pd();
+		    xv[k] = simde_mm256_setzero_pd();
 		    
 		    for(l = 0; l < 4; l++)
 		      {	       	     				      	      															
-			__m256d 
-			  x1v = _mm256_mul_pd(xvl, _mm256_load_pd(&left[k * 16 + l * 4])),
-			  x2v = _mm256_mul_pd(xvr, _mm256_load_pd(&right[k * 16 + l * 4]));			    
+			simde__m256d 
+			  x1v = simde_mm256_mul_pd(xvl, simde_mm256_load_pd(&left[k * 16 + l * 4])),
+			  x2v = simde_mm256_mul_pd(xvr, simde_mm256_load_pd(&right[k * 16 + l * 4]));			    
 			
 			x1v = hadd4(x1v, x2v);			
 			
-			__m256d 
-			  evv = _mm256_load_pd(&extEV[l * 4]);
+			simde__m256d 
+			  evv = simde_mm256_load_pd(&extEV[l * 4]);
 			
-			xv[k] = _mm256_add_pd(xv[k], _mm256_mul_pd(x1v, evv));
+			xv[k] = simde_mm256_add_pd(xv[k], simde_mm256_mul_pd(x1v, evv));
 		      }
 		    
 		    if(scale)
 		      {
-			__m256d 	     
-			  v1 = _mm256_and_pd(xv[k], absMask_AVX.m);
+			simde__m256d 	     
+			  v1 = simde_mm256_and_pd(xv[k], absMask_AVX.m);
 			
-			v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+			v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 			
-			if(_mm256_movemask_pd( v1 ) != 15)
+			if(simde_mm256_movemask_pd( v1 ) != 15)
 			  scale = 0;
 		      }
 		  }
 
 		if(scale)
 		  {
-		    xv[0] = _mm256_mul_pd(xv[0], twoto);
-		    xv[1] = _mm256_mul_pd(xv[1], twoto);
-		    xv[2] = _mm256_mul_pd(xv[2], twoto);
-		    xv[3] = _mm256_mul_pd(xv[3], twoto);
+		    xv[0] = simde_mm256_mul_pd(xv[0], twoto);
+		    xv[1] = simde_mm256_mul_pd(xv[1], twoto);
+		    xv[2] = simde_mm256_mul_pd(xv[2], twoto);
+		    xv[3] = simde_mm256_mul_pd(xv[3], twoto);
 		    
 		    if(useFastScaling)
 		      addScale += wgt[i];
@@ -2283,10 +2281,10 @@
 		      ex3[i] += 1;
 		  }
 		
-		_mm256_store_pd(&x3[0],      xv[0]);
-		_mm256_store_pd(&x3[4],  xv[1]);
-		_mm256_store_pd(&x3[8],  xv[2]);
-		_mm256_store_pd(&x3[12], xv[3]);
+		simde_mm256_store_pd(&x3[0],      xv[0]);
+		simde_mm256_store_pd(&x3[4],  xv[1]);
+		simde_mm256_store_pd(&x3[8],  xv[2]);
+		simde_mm256_store_pd(&x3[12], xv[3]);
 	      
 		x3 += 16;
 	      }
@@ -2325,9 +2323,9 @@
     scaleGap = 0,
     addScale = 0;
    
-  __m256d 
-    minlikelihood_avx = _mm256_set1_pd( minlikelihood ),
-    twoto = _mm256_set1_pd(twotothe256);
+  simde__m256d 
+    minlikelihood_avx = simde_mm256_set1_pd( minlikelihood ),
+    twoto = simde_mm256_set1_pd(twotothe256);
   
 
   {
@@ -2341,41 +2339,41 @@
     le =  &left[maxCats * 16];
     ri =  &right[maxCats * 16];
 
-    __m256d	   
-      vv = _mm256_setzero_pd();
+    simde__m256d	   
+      vv = simde_mm256_setzero_pd();
 	  
     for(l = 0; l < 4; l++)
       {	       	     				      	      															
-	__m256d 
-	  x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
-	  x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+	simde__m256d 
+	  x1v = simde_mm256_mul_pd(simde_mm256_load_pd(x1), simde_mm256_load_pd(&le[l * 4])),
+	  x2v = simde_mm256_mul_pd(simde_mm256_load_pd(x2), simde_mm256_load_pd(&ri[l * 4]));			    
 	
 	x1v = hadd4(x1v, x2v);			
 	
-	__m256d 
-	  evv = _mm256_load_pd(&EV[l * 4]);
+	simde__m256d 
+	  evv = simde_mm256_load_pd(&EV[l * 4]);
 #ifdef _FMA
 	vv = FMAMACC(vv,x1v,evv);
 #else						
-	vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));						      	
+	vv = simde_mm256_add_pd(vv, simde_mm256_mul_pd(x1v, evv));						      	
 #endif
       }	  		  
 
     if(tipCase != TIP_TIP)
       {
-	__m256d 	     
-	  v1 = _mm256_and_pd(vv, absMask_AVX.m);
+	simde__m256d 	     
+	  v1 = simde_mm256_and_pd(vv, absMask_AVX.m);
     
-	v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
     
-	if(_mm256_movemask_pd( v1 ) == 15)
+	if(simde_mm256_movemask_pd( v1 ) == 15)
 	  {
-	    vv = _mm256_mul_pd(vv, twoto);	      	 
+	    vv = simde_mm256_mul_pd(vv, twoto);	      	 
 	    scaleGap = 1;
 	  }
       }
     
-    _mm256_store_pd(x3, vv);    
+    simde_mm256_store_pd(x3, vv);    
   }
 
   switch(tipCase)
@@ -2403,27 +2401,27 @@
 	      else	 	  
 		ri =  &right[cptr[i] * 16];
 	  	  
-	      __m256d	   
-		vv = _mm256_setzero_pd();
+	      simde__m256d	   
+		vv = simde_mm256_setzero_pd();
 	      
 	      for(l = 0; l < 4; l++)
 		{	       	     				      	      															
-		  __m256d 
-		    x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
-		    x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+		  simde__m256d 
+		    x1v = simde_mm256_mul_pd(simde_mm256_load_pd(x1), simde_mm256_load_pd(&le[l * 4])),
+		    x2v = simde_mm256_mul_pd(simde_mm256_load_pd(x2), simde_mm256_load_pd(&ri[l * 4]));			    
 		  
 		  x1v = hadd4(x1v, x2v);			
 		  
-		  __m256d 
-		    evv = _mm256_load_pd(&EV[l * 4]);
+		  simde__m256d 
+		    evv = simde_mm256_load_pd(&EV[l * 4]);
 #ifdef _FMA
 		  vv = FMAMACC(vv,x1v,evv);
 #else				
-		  vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));						      	
+		  vv = simde_mm256_add_pd(vv, simde_mm256_mul_pd(x1v, evv));						      	
 #endif
 		}	  		  
 
-	      _mm256_store_pd(x3, vv);	 
+	      simde_mm256_store_pd(x3, vv);	 
 	      
 	      x3_ptr += 4;
 	    }
@@ -2467,36 +2465,36 @@
 		  x2_ptr += 4;
 		}	  	 
 
-	      __m256d	   
-		vv = _mm256_setzero_pd();
+	      simde__m256d	   
+		vv = simde_mm256_setzero_pd();
 	      
 	      for(l = 0; l < 4; l++)
 		{	       	     				      	      															
-		  __m256d 
-		    x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
-		    x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+		  simde__m256d 
+		    x1v = simde_mm256_mul_pd(simde_mm256_load_pd(x1), simde_mm256_load_pd(&le[l * 4])),
+		    x2v = simde_mm256_mul_pd(simde_mm256_load_pd(x2), simde_mm256_load_pd(&ri[l * 4]));			    
 		  
 		  x1v = hadd4(x1v, x2v);			
 		  
-		  __m256d 
-		    evv = _mm256_load_pd(&EV[l * 4]);
+		  simde__m256d 
+		    evv = simde_mm256_load_pd(&EV[l * 4]);
 		  
 #ifdef _FMA
 		  vv = FMAMACC(vv,x1v,evv);
 #else	      
-		  vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));
+		  vv = simde_mm256_add_pd(vv, simde_mm256_mul_pd(x1v, evv));
 #endif
 		}	  		  
 	  
 	  
-	      __m256d 	     
-		v1 = _mm256_and_pd(vv, absMask_AVX.m);
+	      simde__m256d 	     
+		v1 = simde_mm256_and_pd(vv, absMask_AVX.m);
 	      
-	      v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	      v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 	      
-	      if(_mm256_movemask_pd( v1 ) == 15)
+	      if(simde_mm256_movemask_pd( v1 ) == 15)
 		{	     	      
-		  vv = _mm256_mul_pd(vv, twoto);	      
+		  vv = simde_mm256_mul_pd(vv, twoto);	      
 		  
 		  if(useFastScaling)
 		    addScale += wgt[i];
@@ -2504,7 +2502,7 @@
 		    ex3[i] += 1;		 
 		}       
 	  
-	      _mm256_store_pd(x3, vv);	 	  	  
+	      simde_mm256_store_pd(x3, vv);	 	  	  
 
 	      x3_ptr += 4;
 	    }
@@ -2554,35 +2552,35 @@
 		  x2_ptr += 4;
 		}	 	  	  	  
 	  
-	      __m256d	   
-		vv = _mm256_setzero_pd();
+	      simde__m256d	   
+		vv = simde_mm256_setzero_pd();
 	      
 	      for(l = 0; l < 4; l++)
 		{	       	     				      	      															
-		  __m256d 
-		    x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
-		    x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+		  simde__m256d 
+		    x1v = simde_mm256_mul_pd(simde_mm256_load_pd(x1), simde_mm256_load_pd(&le[l * 4])),
+		    x2v = simde_mm256_mul_pd(simde_mm256_load_pd(x2), simde_mm256_load_pd(&ri[l * 4]));			    
 		  
 		  x1v = hadd4(x1v, x2v);			
 		  
-		  __m256d 
-		    evv = _mm256_load_pd(&EV[l * 4]);
+		  simde__m256d 
+		    evv = simde_mm256_load_pd(&EV[l * 4]);
 #ifdef _FMA
 		  vv = FMAMACC(vv,x1v,evv);
 #else						
-		  vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));						      	
+		  vv = simde_mm256_add_pd(vv, simde_mm256_mul_pd(x1v, evv));						      	
 #endif
 		}	  		  
 	      
 	      
-	      __m256d 	     
-		v1 = _mm256_and_pd(vv, absMask_AVX.m);
+	      simde__m256d 	     
+		v1 = simde_mm256_and_pd(vv, absMask_AVX.m);
 	      
-	      v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	      v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 	      
-	      if(_mm256_movemask_pd( v1 ) == 15)
+	      if(simde_mm256_movemask_pd( v1 ) == 15)
 		{	
-		  vv = _mm256_mul_pd(vv, twoto);	      
+		  vv = simde_mm256_mul_pd(vv, twoto);	      
 		  
 		  if(useFastScaling)
 		    addScale += wgt[i];
@@ -2590,7 +2588,7 @@
 		    ex3[i] += 1;		
 		}	
 	      
-	      _mm256_store_pd(x3, vv);
+	      simde_mm256_store_pd(x3, vv);
 	      
 	      x3_ptr += 4;
 	    }	  	  
@@ -2641,96 +2639,96 @@
     vr = x2_gapColumn;
     v  = x3_gapColumn;
 
-    __m256d vv[5];
+    simde__m256d vv[5];
     
-    vv[0] = _mm256_setzero_pd();
-    vv[1] = _mm256_setzero_pd();
-    vv[2] = _mm256_setzero_pd();
-    vv[3] = _mm256_setzero_pd();
-    vv[4] = _mm256_setzero_pd();
+    vv[0] = simde_mm256_setzero_pd();
+    vv[1] = simde_mm256_setzero_pd();
+    vv[2] = simde_mm256_setzero_pd();
+    vv[3] = simde_mm256_setzero_pd();
+    vv[4] = simde_mm256_setzero_pd();
     
     for(l = 0; l < 20; l++)
       {	       
-	__m256d 
-	  x1v = _mm256_setzero_pd(),
-	  x2v = _mm256_setzero_pd();	
+	simde__m256d 
+	  x1v = simde_mm256_setzero_pd(),
+	  x2v = simde_mm256_setzero_pd();	
 	
 	double 
 	  *ev = &extEV[l * 20],
 	  *lv = &le[l * 20],
 	  *rv = &ri[l * 20];														
 	
-	x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
-	x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
-	x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
-	x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
-	x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
-	
-	x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
-	x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
-	x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
-	x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
-	x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));
+	x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[0]), simde_mm256_load_pd(&lv[0])));
+	x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[4]), simde_mm256_load_pd(&lv[4])));
+	x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[8]), simde_mm256_load_pd(&lv[8])));
+	x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[12]), simde_mm256_load_pd(&lv[12])));
+	x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[16]), simde_mm256_load_pd(&lv[16])));
+	
+	x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[0]), simde_mm256_load_pd(&rv[0])));			    
+	x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[4]), simde_mm256_load_pd(&rv[4])));				    
+	x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[8]), simde_mm256_load_pd(&rv[8])));			    
+	x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[12]), simde_mm256_load_pd(&rv[12])));				    
+	x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[16]), simde_mm256_load_pd(&rv[16])));
 	
 	x1v = hadd4(x1v, x2v);			
 #ifdef _FMA
 	for(k = 0; k < 5; k++) 
 	  {
-	    __m256d evv = _mm256_load_pd(&ev[k*4]);
+	    simde__m256d evv = simde_mm256_load_pd(&ev[k*4]);
 	    vv[k] = FMAMACC(vv[k],x1v,evv);
 	  }
 #else	      
-	__m256d 
+	simde__m256d 
 	  evv[5];
 	
-	evv[0] = _mm256_load_pd(&ev[0]);
-	evv[1] = _mm256_load_pd(&ev[4]);
-	evv[2] = _mm256_load_pd(&ev[8]);
-	evv[3] = _mm256_load_pd(&ev[12]);
-	evv[4] = _mm256_load_pd(&ev[16]);		
-	
-	vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
-	vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
-	vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
-	vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
-	vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      	
+	evv[0] = simde_mm256_load_pd(&ev[0]);
+	evv[1] = simde_mm256_load_pd(&ev[4]);
+	evv[2] = simde_mm256_load_pd(&ev[8]);
+	evv[3] = simde_mm256_load_pd(&ev[12]);
+	evv[4] = simde_mm256_load_pd(&ev[16]);		
+	
+	vv[0] = simde_mm256_add_pd(vv[0], simde_mm256_mul_pd(x1v, evv[0]));
+	vv[1] = simde_mm256_add_pd(vv[1], simde_mm256_mul_pd(x1v, evv[1]));
+	vv[2] = simde_mm256_add_pd(vv[2], simde_mm256_mul_pd(x1v, evv[2]));
+	vv[3] = simde_mm256_add_pd(vv[3], simde_mm256_mul_pd(x1v, evv[3]));
+	vv[4] = simde_mm256_add_pd(vv[4], simde_mm256_mul_pd(x1v, evv[4]));				      	
 #endif
       }	  
 
 
      if(tipCase != TIP_TIP)
        {
-	 __m256d minlikelihood_avx = _mm256_set1_pd( minlikelihood );
+	 simde__m256d minlikelihood_avx = simde_mm256_set1_pd( minlikelihood );
 	  
 	 scale = 1;
 	  
 	 for(l = 0; scale && (l < 20); l += 4)
 	   {	       
-	     __m256d 
-	       v1 = _mm256_and_pd(vv[l / 4], absMask_AVX.m);
-	     v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	     simde__m256d 
+	       v1 = simde_mm256_and_pd(vv[l / 4], absMask_AVX.m);
+	     v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 	     
-	     if(_mm256_movemask_pd( v1 ) != 15)
+	     if(simde_mm256_movemask_pd( v1 ) != 15)
 	       scale = 0;
 	   }	    	  	  
 
 	 if(scale)
 	   {
-	      __m256d 
-		twoto = _mm256_set1_pd(twotothe256);
+	      simde__m256d 
+		twoto = simde_mm256_set1_pd(twotothe256);
 	      
 	      for(l = 0; l < 20; l += 4)
-		vv[l / 4] = _mm256_mul_pd(vv[l / 4] , twoto);		    		 	      	     	      
+		vv[l / 4] = simde_mm256_mul_pd(vv[l / 4] , twoto);		    		 	      	     	      
 	   
 	      scaleGap = 1;
 	   }
        }
 
-     _mm256_store_pd(&v[0], vv[0]);
-     _mm256_store_pd(&v[4], vv[1]);
-     _mm256_store_pd(&v[8], vv[2]);
-     _mm256_store_pd(&v[12], vv[3]);
-     _mm256_store_pd(&v[16], vv[4]);     
+     simde_mm256_store_pd(&v[0], vv[0]);
+     simde_mm256_store_pd(&v[4], vv[1]);
+     simde_mm256_store_pd(&v[8], vv[2]);
+     simde_mm256_store_pd(&v[12], vv[3]);
+     simde_mm256_store_pd(&v[16], vv[4]);     
   }
 
 
@@ -2757,19 +2755,19 @@
 		else	 	  
 		  ri =  &right[cptr[i] * 400];
 
-		__m256d vv[5];
+		simde__m256d vv[5];
 		
-		vv[0] = _mm256_setzero_pd();
-		vv[1] = _mm256_setzero_pd();
-		vv[2] = _mm256_setzero_pd();
-		vv[3] = _mm256_setzero_pd();
-		vv[4] = _mm256_setzero_pd();	   	    
+		vv[0] = simde_mm256_setzero_pd();
+		vv[1] = simde_mm256_setzero_pd();
+		vv[2] = simde_mm256_setzero_pd();
+		vv[3] = simde_mm256_setzero_pd();
+		vv[4] = simde_mm256_setzero_pd();	   	    
 		
 		for(l = 0; l < 20; l++)
 		  {	       
-		    __m256d 
-		      x1v = _mm256_setzero_pd(),
-		      x2v = _mm256_setzero_pd();	
+		    simde__m256d 
+		      x1v = simde_mm256_setzero_pd(),
+		      x2v = simde_mm256_setzero_pd();	
 		    
 		    double 
 		      *ev = &extEV[l * 20],
@@ -2779,57 +2777,57 @@
 #ifdef _FMA		
 		    for(k = 0; k < 20; k += 4) 
 		      {
-			__m256d vlv = _mm256_load_pd(&vl[k]);
-			__m256d lvv = _mm256_load_pd(&lv[k]);
+			simde__m256d vlv = simde_mm256_load_pd(&vl[k]);
+			simde__m256d lvv = simde_mm256_load_pd(&lv[k]);
 			x1v = FMAMACC(x1v,vlv,lvv);
-			__m256d vrv = _mm256_load_pd(&vr[k]);
-			__m256d rvv = _mm256_load_pd(&rv[k]);
+			simde__m256d vrv = simde_mm256_load_pd(&vr[k]);
+			simde__m256d rvv = simde_mm256_load_pd(&rv[k]);
 			x2v = FMAMACC(x2v,vrv,rvv);
 		      }
 #else		
-		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
-		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
-		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
-		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
-		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
-		    
-		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
-		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
-		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
-		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
-		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));	
+		    x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[0]), simde_mm256_load_pd(&lv[0])));
+		    x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[4]), simde_mm256_load_pd(&lv[4])));
+		    x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[8]), simde_mm256_load_pd(&lv[8])));
+		    x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[12]), simde_mm256_load_pd(&lv[12])));
+		    x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[16]), simde_mm256_load_pd(&lv[16])));
+		    
+		    x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[0]), simde_mm256_load_pd(&rv[0])));			    
+		    x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[4]), simde_mm256_load_pd(&rv[4])));				    
+		    x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[8]), simde_mm256_load_pd(&rv[8])));			    
+		    x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[12]), simde_mm256_load_pd(&rv[12])));				    
+		    x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[16]), simde_mm256_load_pd(&rv[16])));	
 #endif
 		    
 		    x1v = hadd4(x1v, x2v);			
 #ifdef _FMA
 		    for(k = 0; k < 5; k++) 
 		      {
-			__m256d evv = _mm256_load_pd(&ev[k*4]);
+			simde__m256d evv = simde_mm256_load_pd(&ev[k*4]);
 			vv[k] = FMAMACC(vv[k],x1v,evv);
 		      }	  
 #else		
-		    __m256d 
+		    simde__m256d 
 		      evv[5];
 		    
-		    evv[0] = _mm256_load_pd(&ev[0]);
-		    evv[1] = _mm256_load_pd(&ev[4]);
-		    evv[2] = _mm256_load_pd(&ev[8]);
-		    evv[3] = _mm256_load_pd(&ev[12]);
-		    evv[4] = _mm256_load_pd(&ev[16]);		
-		    
-		    vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
-		    vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
-		    vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
-		    vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
-		    vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      		      	  
+		    evv[0] = simde_mm256_load_pd(&ev[0]);
+		    evv[1] = simde_mm256_load_pd(&ev[4]);
+		    evv[2] = simde_mm256_load_pd(&ev[8]);
+		    evv[3] = simde_mm256_load_pd(&ev[12]);
+		    evv[4] = simde_mm256_load_pd(&ev[16]);		
+		    
+		    vv[0] = simde_mm256_add_pd(vv[0], simde_mm256_mul_pd(x1v, evv[0]));
+		    vv[1] = simde_mm256_add_pd(vv[1], simde_mm256_mul_pd(x1v, evv[1]));
+		    vv[2] = simde_mm256_add_pd(vv[2], simde_mm256_mul_pd(x1v, evv[2]));
+		    vv[3] = simde_mm256_add_pd(vv[3], simde_mm256_mul_pd(x1v, evv[3]));
+		    vv[4] = simde_mm256_add_pd(vv[4], simde_mm256_mul_pd(x1v, evv[4]));				      		      	  
 #endif
 		  }
 		
-		_mm256_store_pd(&v[0], vv[0]);
-		_mm256_store_pd(&v[4], vv[1]);
-		_mm256_store_pd(&v[8], vv[2]);
-		_mm256_store_pd(&v[12], vv[3]);
-		_mm256_store_pd(&v[16], vv[4]);
+		simde_mm256_store_pd(&v[0], vv[0]);
+		simde_mm256_store_pd(&v[4], vv[1]);
+		simde_mm256_store_pd(&v[8], vv[2]);
+		simde_mm256_store_pd(&v[12], vv[3]);
+		simde_mm256_store_pd(&v[16], vv[4]);
 
 		x3_ptr += 20;
 	      }
@@ -2873,19 +2871,19 @@
 		  x2_ptr += 20;
 		}	  	  
 	  
-	      __m256d vv[5];
+	      simde__m256d vv[5];
 	      
-	      vv[0] = _mm256_setzero_pd();
-	      vv[1] = _mm256_setzero_pd();
-	      vv[2] = _mm256_setzero_pd();
-	      vv[3] = _mm256_setzero_pd();
-	      vv[4] = _mm256_setzero_pd();
+	      vv[0] = simde_mm256_setzero_pd();
+	      vv[1] = simde_mm256_setzero_pd();
+	      vv[2] = simde_mm256_setzero_pd();
+	      vv[3] = simde_mm256_setzero_pd();
+	      vv[4] = simde_mm256_setzero_pd();
 	      	      	      
 	      for(l = 0; l < 20; l++)
 		{	       
-		  __m256d 
-		    x1v = _mm256_setzero_pd(),
-		    x2v = _mm256_setzero_pd();	
+		  simde__m256d 
+		    x1v = simde_mm256_setzero_pd(),
+		    x2v = simde_mm256_setzero_pd();	
 		  
 		  double 
 		    *ev = &extEV[l * 20],
@@ -2894,72 +2892,72 @@
 #ifdef _FMA
 		  for(k = 0; k < 20; k += 4) 
 		    {
-		      __m256d vlv = _mm256_load_pd(&vl[k]);
-		      __m256d lvv = _mm256_load_pd(&lv[k]);
+		      simde__m256d vlv = simde_mm256_load_pd(&vl[k]);
+		      simde__m256d lvv = simde_mm256_load_pd(&lv[k]);
 		      x1v = FMAMACC(x1v,vlv,lvv);
-		      __m256d vrv = _mm256_load_pd(&vr[k]);
-		      __m256d rvv = _mm256_load_pd(&rv[k]);
+		      simde__m256d vrv = simde_mm256_load_pd(&vr[k]);
+		      simde__m256d rvv = simde_mm256_load_pd(&rv[k]);
 		      x2v = FMAMACC(x2v,vrv,rvv);
 		    }
 #else	      
-		  x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
-		  x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
-		  x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
-		  x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
-		  x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
-		  
-		  x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
-		  x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
-		  x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
-		  x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
-		  x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));
+		  x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[0]), simde_mm256_load_pd(&lv[0])));
+		  x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[4]), simde_mm256_load_pd(&lv[4])));
+		  x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[8]), simde_mm256_load_pd(&lv[8])));
+		  x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[12]), simde_mm256_load_pd(&lv[12])));
+		  x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[16]), simde_mm256_load_pd(&lv[16])));
+		  
+		  x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[0]), simde_mm256_load_pd(&rv[0])));			    
+		  x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[4]), simde_mm256_load_pd(&rv[4])));				    
+		  x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[8]), simde_mm256_load_pd(&rv[8])));			    
+		  x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[12]), simde_mm256_load_pd(&rv[12])));				    
+		  x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[16]), simde_mm256_load_pd(&rv[16])));
 #endif
 		  
 		  x1v = hadd4(x1v, x2v);			
 		  
-		  __m256d 
+		  simde__m256d 
 		    evv[5];
 		  
-		  evv[0] = _mm256_load_pd(&ev[0]);
-		  evv[1] = _mm256_load_pd(&ev[4]);
-		  evv[2] = _mm256_load_pd(&ev[8]);
-		  evv[3] = _mm256_load_pd(&ev[12]);
-		  evv[4] = _mm256_load_pd(&ev[16]);		
+		  evv[0] = simde_mm256_load_pd(&ev[0]);
+		  evv[1] = simde_mm256_load_pd(&ev[4]);
+		  evv[2] = simde_mm256_load_pd(&ev[8]);
+		  evv[3] = simde_mm256_load_pd(&ev[12]);
+		  evv[4] = simde_mm256_load_pd(&ev[16]);		
 		  
 #ifdef _FMA
 		  for(k = 0; k < 5; k++)
 		    vv[k] = FMAMACC(vv[k],x1v,evv[k]);		 
 #else	      
-		  vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
-		  vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
-		  vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
-		  vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
-		  vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      	
+		  vv[0] = simde_mm256_add_pd(vv[0], simde_mm256_mul_pd(x1v, evv[0]));
+		  vv[1] = simde_mm256_add_pd(vv[1], simde_mm256_mul_pd(x1v, evv[1]));
+		  vv[2] = simde_mm256_add_pd(vv[2], simde_mm256_mul_pd(x1v, evv[2]));
+		  vv[3] = simde_mm256_add_pd(vv[3], simde_mm256_mul_pd(x1v, evv[3]));
+		  vv[4] = simde_mm256_add_pd(vv[4], simde_mm256_mul_pd(x1v, evv[4]));				      	
 #endif
 		}	  
 
 	   	     
-	      __m256d minlikelihood_avx = _mm256_set1_pd( minlikelihood );
+	      simde__m256d minlikelihood_avx = simde_mm256_set1_pd( minlikelihood );
 	  
 	      scale = 1;
 	      
 	      for(l = 0; scale && (l < 20); l += 4)
 		{	       
-		  __m256d 
-		    v1 = _mm256_and_pd(vv[l / 4], absMask_AVX.m);
-		  v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+		  simde__m256d 
+		    v1 = simde_mm256_and_pd(vv[l / 4], absMask_AVX.m);
+		  v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 		  
-		  if(_mm256_movemask_pd( v1 ) != 15)
+		  if(simde_mm256_movemask_pd( v1 ) != 15)
 		    scale = 0;
 		}	    	  	  
 	 
 	      if(scale)
 		{
-		  __m256d 
-		    twoto = _mm256_set1_pd(twotothe256);
+		  simde__m256d 
+		    twoto = simde_mm256_set1_pd(twotothe256);
 		  
 		  for(l = 0; l < 20; l += 4)
-		    vv[l / 4] = _mm256_mul_pd(vv[l / 4] , twoto);		    		 
+		    vv[l / 4] = simde_mm256_mul_pd(vv[l / 4] , twoto);		    		 
 		  
 		  if(useFastScaling)
 		    addScale += wgt[i];
@@ -2967,11 +2965,11 @@
 		    ex3[i]  += 1;	      
 		}
 
-	      _mm256_store_pd(&v[0], vv[0]);
-	      _mm256_store_pd(&v[4], vv[1]);
-	      _mm256_store_pd(&v[8], vv[2]);
-	      _mm256_store_pd(&v[12], vv[3]);
-	      _mm256_store_pd(&v[16], vv[4]);	       
+	      simde_mm256_store_pd(&v[0], vv[0]);
+	      simde_mm256_store_pd(&v[4], vv[1]);
+	      simde_mm256_store_pd(&v[8], vv[2]);
+	      simde_mm256_store_pd(&v[12], vv[3]);
+	      simde_mm256_store_pd(&v[16], vv[4]);	       
 	      
 	      x3_ptr += 20;
 	    }
@@ -3019,84 +3017,84 @@
 		    x2_ptr += 20;
 		  }	 	  	 
 		
-		__m256d vv[5];
+		simde__m256d vv[5];
 		
-		vv[0] = _mm256_setzero_pd();
-		vv[1] = _mm256_setzero_pd();
-		vv[2] = _mm256_setzero_pd();
-		vv[3] = _mm256_setzero_pd();
-		vv[4] = _mm256_setzero_pd();
+		vv[0] = simde_mm256_setzero_pd();
+		vv[1] = simde_mm256_setzero_pd();
+		vv[2] = simde_mm256_setzero_pd();
+		vv[3] = simde_mm256_setzero_pd();
+		vv[4] = simde_mm256_setzero_pd();
 		
 		for(l = 0; l < 20; l++)
 		  {	       
-		    __m256d 
-		      x1v = _mm256_setzero_pd(),
-		      x2v = _mm256_setzero_pd();	
+		    simde__m256d 
+		      x1v = simde_mm256_setzero_pd(),
+		      x2v = simde_mm256_setzero_pd();	
 		    
 		    double 
 		      *ev = &extEV[l * 20],
 		      *lv = &le[l * 20],
 		      *rv = &ri[l * 20];														
 		    
-		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
-		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
-		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
-		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
-		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
-		    
-		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
-		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
-		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
-		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
-		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));
+		    x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[0]), simde_mm256_load_pd(&lv[0])));
+		    x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[4]), simde_mm256_load_pd(&lv[4])));
+		    x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[8]), simde_mm256_load_pd(&lv[8])));
+		    x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[12]), simde_mm256_load_pd(&lv[12])));
+		    x1v = simde_mm256_add_pd(x1v, simde_mm256_mul_pd(simde_mm256_load_pd(&vl[16]), simde_mm256_load_pd(&lv[16])));
+		    
+		    x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[0]), simde_mm256_load_pd(&rv[0])));			    
+		    x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[4]), simde_mm256_load_pd(&rv[4])));				    
+		    x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[8]), simde_mm256_load_pd(&rv[8])));			    
+		    x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[12]), simde_mm256_load_pd(&rv[12])));				    
+		    x2v = simde_mm256_add_pd(x2v,  simde_mm256_mul_pd(simde_mm256_load_pd(&vr[16]), simde_mm256_load_pd(&rv[16])));
 		    
 		    x1v = hadd4(x1v, x2v);			
 #ifdef _FMA
 		    for(k = 0; k < 5; k++) 
 		      {
-			__m256d evv = _mm256_load_pd(&ev[k*4]);
+			simde__m256d evv = simde_mm256_load_pd(&ev[k*4]);
 			vv[k] = FMAMACC(vv[k],x1v,evv);
 		      }
 #else	      
-		    __m256d 
+		    simde__m256d 
 		      evv[5];
 		    
-		    evv[0] = _mm256_load_pd(&ev[0]);
-		    evv[1] = _mm256_load_pd(&ev[4]);
-		    evv[2] = _mm256_load_pd(&ev[8]);
-		    evv[3] = _mm256_load_pd(&ev[12]);
-		    evv[4] = _mm256_load_pd(&ev[16]);		
-		    
-		    vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
-		    vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
-		    vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
-		    vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
-		    vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      	
+		    evv[0] = simde_mm256_load_pd(&ev[0]);
+		    evv[1] = simde_mm256_load_pd(&ev[4]);
+		    evv[2] = simde_mm256_load_pd(&ev[8]);
+		    evv[3] = simde_mm256_load_pd(&ev[12]);
+		    evv[4] = simde_mm256_load_pd(&ev[16]);		
+		    
+		    vv[0] = simde_mm256_add_pd(vv[0], simde_mm256_mul_pd(x1v, evv[0]));
+		    vv[1] = simde_mm256_add_pd(vv[1], simde_mm256_mul_pd(x1v, evv[1]));
+		    vv[2] = simde_mm256_add_pd(vv[2], simde_mm256_mul_pd(x1v, evv[2]));
+		    vv[3] = simde_mm256_add_pd(vv[3], simde_mm256_mul_pd(x1v, evv[3]));
+		    vv[4] = simde_mm256_add_pd(vv[4], simde_mm256_mul_pd(x1v, evv[4]));				      	
 #endif
 		  }	  
 
 	   	     
-		__m256d minlikelihood_avx = _mm256_set1_pd( minlikelihood );
+		simde__m256d minlikelihood_avx = simde_mm256_set1_pd( minlikelihood );
 		
 		scale = 1;
 		
 		for(l = 0; scale && (l < 20); l += 4)
 		  {	       
-		    __m256d 
-		      v1 = _mm256_and_pd(vv[l / 4], absMask_AVX.m);
-		    v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+		    simde__m256d 
+		      v1 = simde_mm256_and_pd(vv[l / 4], absMask_AVX.m);
+		    v1 = simde_mm256_cmp_pd(v1,  minlikelihood_avx, SIMDE_CMP_LT_OS);
 		    
-		    if(_mm256_movemask_pd( v1 ) != 15)
+		    if(simde_mm256_movemask_pd( v1 ) != 15)
 		      scale = 0;
 		  }	    	  	  
 		
 		if(scale)
 		  {
-		    __m256d 
-		      twoto = _mm256_set1_pd(twotothe256);
+		    simde__m256d 
+		      twoto = simde_mm256_set1_pd(twotothe256);
 		    
 		    for(l = 0; l < 20; l += 4)
-		      vv[l / 4] = _mm256_mul_pd(vv[l / 4] , twoto);		    		 
+		      vv[l / 4] = simde_mm256_mul_pd(vv[l / 4] , twoto);		    		 
 		    
 		    if(useFastScaling)
 		      addScale += wgt[i];
@@ -3104,11 +3102,11 @@
 		      ex3[i]  += 1;	      
 		  }
 
-		_mm256_store_pd(&v[0], vv[0]);
-		_mm256_store_pd(&v[4], vv[1]);
-		_mm256_store_pd(&v[8], vv[2]);
-		_mm256_store_pd(&v[12], vv[3]);
-		_mm256_store_pd(&v[16], vv[4]);
+		simde_mm256_store_pd(&v[0], vv[0]);
+		simde_mm256_store_pd(&v[4], vv[1]);
+		simde_mm256_store_pd(&v[8], vv[2]);
+		simde_mm256_store_pd(&v[12], vv[3]);
+		simde_mm256_store_pd(&v[16], vv[4]);
 
 		 x3_ptr += 20;
 	     }
@@ -3158,11 +3156,11 @@
 
 
 #if GCC_VERSION < 40500
-   __m256d
-    bitmask = _mm256_set_pd(0,0,0,-1);
+   simde__m256i
+    bitmask = simde_mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); /* simde_mm256_maskstore_pd takes an integer mask */
 #else
-  __m256i
-    bitmask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+  simde__m256i
+    bitmask = simde_mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
 #endif 
   
   switch(tipCase) 
@@ -3185,28 +3183,28 @@
 		  *ll =  &left[k * 20],
 		  *rr =  &right[k * 20];
 		
-		__m256d 
-		  umpX1v = _mm256_setzero_pd(),
-		  umpX2v = _mm256_setzero_pd();
+		simde__m256d 
+		  umpX1v = simde_mm256_setzero_pd(),
+		  umpX2v = simde_mm256_setzero_pd();
 		
 		for(l = 0; l < 20; l+=4) 
 		  {
-		    __m256d vv = _mm256_load_pd(&v[l]);
+		    simde__m256d vv = simde_mm256_load_pd(&v[l]);
 #ifdef _FMA
-		    __m256d llv = _mm256_load_pd(&ll[l]);
+		    simde__m256d llv = simde_mm256_load_pd(&ll[l]);
 		    umpX1v = FMAMACC(umpX1v,vv,llv);
-		    __m256d rrv = _mm256_load_pd(&rr[l]);
+		    simde__m256d rrv = simde_mm256_load_pd(&rr[l]);
 		    umpX2v = FMAMACC(umpX2v,vv,rrv);
 #else		    
-		    umpX1v = _mm256_add_pd(umpX1v,_mm256_mul_pd(vv,_mm256_load_pd(&ll[l])));
-		    umpX2v = _mm256_add_pd(umpX2v,_mm256_mul_pd(vv,_mm256_load_pd(&rr[l])));
+		    umpX1v = simde_mm256_add_pd(umpX1v,simde_mm256_mul_pd(vv,simde_mm256_load_pd(&ll[l])));
+		    umpX2v = simde_mm256_add_pd(umpX2v,simde_mm256_mul_pd(vv,simde_mm256_load_pd(&rr[l])));
 #endif
 		  }
 		
 		umpX1v = hadd3(umpX1v);
 		umpX2v = hadd3(umpX2v);
-		_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
-		_mm256_maskstore_pd(&umpX2[80 * i + k], bitmask, umpX2v);
+		simde_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
+		simde_mm256_maskstore_pd(&umpX2[80 * i + k], bitmask, umpX2v);
 	      } 
 	  }
 
@@ -3217,61 +3215,61 @@
 	  
 	  for(j = 0; j < 4; j++) 
 	    {     	
-	      __m256d vv[5];  
+	      simde__m256d vv[5];  
 	      
 	      v = &x3_gapColumn[j * 20];
 	      
-	      vv[0] = _mm256_setzero_pd();
-	      vv[1] = _mm256_setzero_pd();
-	      vv[2] = _mm256_setzero_pd();
-	      vv[3] = _mm256_setzero_pd();
-	      vv[4] = _mm256_setzero_pd();
+	      vv[0] = simde_mm256_setzero_pd();
+	      vv[1] = simde_mm256_setzero_pd();
+	      vv[2] = simde_mm256_setzero_pd();
+	      vv[3] = simde_mm256_setzero_pd();
+	      vv[4] = simde_mm256_setzero_pd();
 	      
 	      for(k = 0; k < 20; k++) 
 		{			 
 		  x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
 		  
-		  __m256d x1px2v = _mm256_set1_pd(x1px2);		    
+		  simde__m256d x1px2v = simde_mm256_set1_pd(x1px2);		    
 		  
-		  __m256d extEvv = _mm256_load_pd(&extEV[20 * k]);
+		  simde__m256d extEvv = simde_mm256_load_pd(&extEV[20 * k]);
 #ifdef _FMA
 		  vv[0] = FMAMACC(vv[0],x1px2v,extEvv);
 #else
-		  vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v,extEvv));
+		  vv[0] = simde_mm256_add_pd(vv[0],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		  _mm256_store_pd(&v[0],vv[0]);
+		  simde_mm256_store_pd(&v[0],vv[0]);
 		  
-		  extEvv = _mm256_load_pd(&extEV[20 * k + 4]);
+		  extEvv = simde_mm256_load_pd(&extEV[20 * k + 4]);
 #ifdef _FMA
 		  vv[1] = FMAMACC(vv[1],x1px2v,extEvv);
 #else
-		  vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v,extEvv));
+		  vv[1] = simde_mm256_add_pd(vv[1],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		  _mm256_store_pd(&v[4],vv[1]);
+		  simde_mm256_store_pd(&v[4],vv[1]);
 		  
-		  extEvv = _mm256_load_pd(&extEV[20 * k + 8]);
+		  extEvv = simde_mm256_load_pd(&extEV[20 * k + 8]);
 #ifdef _FMA
 		  vv[2] = FMAMACC(vv[2],x1px2v,extEvv);
 #else
-		  vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v,extEvv));
+		  vv[2] = simde_mm256_add_pd(vv[2],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		  _mm256_store_pd(&v[8],vv[2]);
+		  simde_mm256_store_pd(&v[8],vv[2]);
 		  
-		  extEvv = _mm256_load_pd(&extEV[20 * k + 12]);
+		  extEvv = simde_mm256_load_pd(&extEV[20 * k + 12]);
 #ifdef _FMA
 		  vv[3] = FMAMACC(vv[3],x1px2v,extEvv);
 #else
-		  vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v,extEvv));
+		  vv[3] = simde_mm256_add_pd(vv[3],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		  _mm256_store_pd(&v[12],vv[3]);
+		  simde_mm256_store_pd(&v[12],vv[3]);
 		  
-		  extEvv = _mm256_load_pd(&extEV[20 * k + 16]);
+		  extEvv = simde_mm256_load_pd(&extEV[20 * k + 16]);
 #ifdef _FMA
 		  vv[4] = FMAMACC(vv[4],x1px2v,extEvv);
 #else
-		  vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v,extEvv));
+		  vv[4] = simde_mm256_add_pd(vv[4],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-		  _mm256_store_pd(&v[16],vv[4]);
+		  simde_mm256_store_pd(&v[16],vv[4]);
 		} 
 	    } 
 	}
@@ -3286,61 +3284,61 @@
 	   
 		for(j = 0; j < 4; j++) 
 		  {     	
-		    __m256d vv[5];  
+		    simde__m256d vv[5];  
 		    
 		    v = &x3_ptr[j * 20];
 			
-		    vv[0] = _mm256_setzero_pd();
-		    vv[1] = _mm256_setzero_pd();
-		    vv[2] = _mm256_setzero_pd();
-		    vv[3] = _mm256_setzero_pd();
-		    vv[4] = _mm256_setzero_pd();
+		    vv[0] = simde_mm256_setzero_pd();
+		    vv[1] = simde_mm256_setzero_pd();
+		    vv[2] = simde_mm256_setzero_pd();
+		    vv[3] = simde_mm256_setzero_pd();
+		    vv[4] = simde_mm256_setzero_pd();
 
 		    for(k = 0; k < 20; k++) 
 		      {			 
 			x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
 			
-			__m256d x1px2v = _mm256_set1_pd(x1px2);		    
+			simde__m256d x1px2v = simde_mm256_set1_pd(x1px2);		    
 			
-			__m256d extEvv = _mm256_load_pd(&extEV[20 * k]);
+			simde__m256d extEvv = simde_mm256_load_pd(&extEV[20 * k]);
 #ifdef _FMA
 			vv[0] = FMAMACC(vv[0],x1px2v,extEvv);
 #else
-			vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v,extEvv));
+			vv[0] = simde_mm256_add_pd(vv[0],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-			_mm256_store_pd(&v[0],vv[0]);
+			simde_mm256_store_pd(&v[0],vv[0]);
 			
-			extEvv = _mm256_load_pd(&extEV[20 * k + 4]);
+			extEvv = simde_mm256_load_pd(&extEV[20 * k + 4]);
 #ifdef _FMA
 			vv[1] = FMAMACC(vv[1],x1px2v,extEvv);
 #else
-			vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v,extEvv));
+			vv[1] = simde_mm256_add_pd(vv[1],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-			_mm256_store_pd(&v[4],vv[1]);
+			simde_mm256_store_pd(&v[4],vv[1]);
 			
-			extEvv = _mm256_load_pd(&extEV[20 * k + 8]);
+			extEvv = simde_mm256_load_pd(&extEV[20 * k + 8]);
 #ifdef _FMA
 			vv[2] = FMAMACC(vv[2],x1px2v,extEvv);
 #else
-			vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v,extEvv));
+			vv[2] = simde_mm256_add_pd(vv[2],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-			_mm256_store_pd(&v[8],vv[2]);
+			simde_mm256_store_pd(&v[8],vv[2]);
 			
-			extEvv = _mm256_load_pd(&extEV[20 * k + 12]);
+			extEvv = simde_mm256_load_pd(&extEV[20 * k + 12]);
 #ifdef _FMA
 			vv[3] = FMAMACC(vv[3],x1px2v,extEvv);
 #else
-			vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v,extEvv));
+			vv[3] = simde_mm256_add_pd(vv[3],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-			_mm256_store_pd(&v[12],vv[3]);
+			simde_mm256_store_pd(&v[12],vv[3]);
 			
-			extEvv = _mm256_load_pd(&extEV[20 * k + 16]);
+			extEvv = simde_mm256_load_pd(&extEV[20 * k + 16]);
 #ifdef _FMA
 			vv[4] = FMAMACC(vv[4],x1px2v,extEvv);
 #else
-			vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v,extEvv));
+			vv[4] = simde_mm256_add_pd(vv[4],simde_mm256_mul_pd(x1px2v,extEvv));
 #endif
-			_mm256_store_pd(&v[16],vv[4]);
+			simde_mm256_store_pd(&v[16],vv[4]);
 		      } 
 		  }
 		x3_ptr += 80;		  
@@ -3362,20 +3360,20 @@
 
 	    for(k = 0; k < 80; k++) 
 	      {
-		__m256d umpX1v = _mm256_setzero_pd();
+		simde__m256d umpX1v = simde_mm256_setzero_pd();
 		for(l = 0; l < 20; l+=4) 
 		  {
-		    __m256d vv = _mm256_load_pd(&v[l]);
-		    __m256d leftv = _mm256_load_pd(&left[k * 20 + l]);
+		    simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		    simde__m256d leftv = simde_mm256_load_pd(&left[k * 20 + l]);
 #ifdef _FMA
 		   
 		    umpX1v = FMAMACC(umpX1v, vv, leftv);
 #else
-		    umpX1v = _mm256_add_pd(umpX1v, _mm256_mul_pd(vv, leftv));
+		    umpX1v = simde_mm256_add_pd(umpX1v, simde_mm256_mul_pd(vv, leftv));
 #endif
 		  }
 		umpX1v = hadd3(umpX1v);
-		_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
+		simde_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
 	      } 
 	  }
 
@@ -3388,132 +3386,132 @@
 		
 		for(l = 0; l < 20; l++) 
 		  {
-		    __m256d ump_x2v = _mm256_setzero_pd();
+		    simde__m256d ump_x2v = simde_mm256_setzero_pd();
 		    		  
-		    __m256d vv = _mm256_load_pd(&v[0]);
-		    __m256d rightv = _mm256_load_pd(&right[k*400+l*20+0]);
+		    simde__m256d vv = simde_mm256_load_pd(&v[0]);
+		    simde__m256d rightv = simde_mm256_load_pd(&right[k*400+l*20+0]);
 #ifdef _FMA
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 		    
-		    vv = _mm256_load_pd(&v[4]);
-		    rightv = _mm256_load_pd(&right[k*400+l*20+4]);
+		    vv = simde_mm256_load_pd(&v[4]);
+		    rightv = simde_mm256_load_pd(&right[k*400+l*20+4]);
 #ifdef _FMA
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 
-		    vv = _mm256_load_pd(&v[8]);
-		    rightv = _mm256_load_pd(&right[k*400+l*20+8]);
+		    vv = simde_mm256_load_pd(&v[8]);
+		    rightv = simde_mm256_load_pd(&right[k*400+l*20+8]);
 #ifdef _FMA
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 
-		    vv = _mm256_load_pd(&v[12]);
-		    rightv = _mm256_load_pd(&right[k*400+l*20+12]);
+		    vv = simde_mm256_load_pd(&v[12]);
+		    rightv = simde_mm256_load_pd(&right[k*400+l*20+12]);
 #ifdef _FMA
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 
-		    vv = _mm256_load_pd(&v[16]);
-		    rightv = _mm256_load_pd(&right[k*400+l*20+16]);
+		    vv = simde_mm256_load_pd(&v[16]);
+		    rightv = simde_mm256_load_pd(&right[k*400+l*20+16]);
 #ifdef _FMA
 		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+		    ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 		    
 		    ump_x2v = hadd3(ump_x2v);
-		    _mm256_maskstore_pd(&ump_x2[l], bitmask, ump_x2v);
+		    simde_mm256_maskstore_pd(&ump_x2[l], bitmask, ump_x2v);
 		  }
 		
 		v = &x3_gapColumn[20 * k];
 	
-		__m256d vv[5]; 
+		simde__m256d vv[5]; 
 
-		vv[0] = _mm256_setzero_pd();
-		vv[1] = _mm256_setzero_pd();
-		vv[2] = _mm256_setzero_pd();
-		vv[3] = _mm256_setzero_pd();
-		vv[4] = _mm256_setzero_pd();
+		vv[0] = simde_mm256_setzero_pd();
+		vv[1] = simde_mm256_setzero_pd();
+		vv[2] = simde_mm256_setzero_pd();
+		vv[3] = simde_mm256_setzero_pd();
+		vv[4] = simde_mm256_setzero_pd();
 		
 		for(l = 0; l < 20; l++) 
 		  {
 		    x1px2 = uX1[k * 20 + l]	* ump_x2[l];
-		    __m256d x1px2v = _mm256_set1_pd(x1px2);	
+		    simde__m256d x1px2v = simde_mm256_set1_pd(x1px2);	
 	    		 
 #ifdef _FMA
-		    __m256d ev = _mm256_load_pd(&extEV[l * 20 + 0]);
+		    simde__m256d ev = simde_mm256_load_pd(&extEV[l * 20 + 0]);
 		    vv[0] = FMAMACC(vv[0],x1px2v, ev);
 #else
-		    vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 0])));
+		    vv[0] = simde_mm256_add_pd(vv[0],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 0])));
 #endif
-		    _mm256_store_pd(&v[0],vv[0]);
+		    simde_mm256_store_pd(&v[0],vv[0]);
 
 #ifdef _FMA
-		    ev = _mm256_load_pd(&extEV[l * 20 + 4]);
+		    ev = simde_mm256_load_pd(&extEV[l * 20 + 4]);
 		    vv[1] = FMAMACC(vv[1],x1px2v, ev);
 #else
-		    vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 4])));
+		    vv[1] = simde_mm256_add_pd(vv[1],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 4])));
 #endif
-		    _mm256_store_pd(&v[4],vv[1]);
+		    simde_mm256_store_pd(&v[4],vv[1]);
 
 #ifdef _FMA
-		    ev = _mm256_load_pd(&extEV[l * 20 + 8]);
+		    ev = simde_mm256_load_pd(&extEV[l * 20 + 8]);
 		    vv[2] = FMAMACC(vv[2],x1px2v, ev);
 #else
-		    vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 8])));
+		    vv[2] = simde_mm256_add_pd(vv[2],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 8])));
 #endif
-		    _mm256_store_pd(&v[8],vv[2]);
+		    simde_mm256_store_pd(&v[8],vv[2]);
 		    
 #ifdef _FMA
-		    ev = _mm256_load_pd(&extEV[l * 20 + 12]);
+		    ev = simde_mm256_load_pd(&extEV[l * 20 + 12]);
 		    vv[3] = FMAMACC(vv[3],x1px2v, ev);
 #else
-		    vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 12])));
+		    vv[3] = simde_mm256_add_pd(vv[3],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 12])));
 #endif
-		    _mm256_store_pd(&v[12],vv[3]);
+		    simde_mm256_store_pd(&v[12],vv[3]);
 
 
 #ifdef _FMA
-		    ev = _mm256_load_pd(&extEV[l * 20 + 16]);
+		    ev = simde_mm256_load_pd(&extEV[l * 20 + 16]);
 		    vv[4] = FMAMACC(vv[4],x1px2v, ev);
 #else
-		    vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 16])));
+		    vv[4] = simde_mm256_add_pd(vv[4],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 16])));
 #endif
-		    _mm256_store_pd(&v[16],vv[4]);
+		    simde_mm256_store_pd(&v[16],vv[4]);
 
 		  } 
 	      }
 	   
 	    v = x3_gapColumn;
-	    __m256d minlikelihood_avx = _mm256_set1_pd(minlikelihood);
+	    simde__m256d minlikelihood_avx = simde_mm256_set1_pd(minlikelihood);
 	    scale = 1;
 	    for(l = 0; scale && (l < 80); l += 4) 
 	      {
-		__m256d vv = _mm256_load_pd(&v[l]);
-		__m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
-		vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
-		if(_mm256_movemask_pd(vv_abs) != 15)
+		simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		simde__m256d vv_abs = simde_mm256_and_pd(vv,absMask_AVX.m);
+		vv_abs = simde_mm256_cmp_pd(vv_abs,minlikelihood_avx,SIMDE_CMP_LT_OS);
+		if(simde_mm256_movemask_pd(vv_abs) != 15)
 		  scale = 0;
 	      }
 	    
 	    if(scale) 
 	      {		
-		__m256d twotothe256v = _mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
+		simde__m256d twotothe256v = simde_mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
 		gapScaling = 1;
 
 		for(l = 0; l < 80; l += 4) 
 		  {
-		    __m256d vv = _mm256_load_pd(&v[l]);
-		    _mm256_store_pd(&v[l],_mm256_mul_pd(vv,twotothe256v));
+		    simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		    simde_mm256_store_pd(&v[l],simde_mm256_mul_pd(vv,twotothe256v));
 		  }	
 	      } 
 	}       
@@ -3548,131 +3546,131 @@
 		    
 		    for(l = 0; l < 20; l++) 
 		      {
-			__m256d ump_x2v = _mm256_setzero_pd();
+			simde__m256d ump_x2v = simde_mm256_setzero_pd();
 		    	
-			__m256d vv = _mm256_load_pd(&v[0]);
-			__m256d rightv = _mm256_load_pd(&right[k*400+l*20+0]);
+			simde__m256d vv = simde_mm256_load_pd(&v[0]);
+			simde__m256d rightv = simde_mm256_load_pd(&right[k*400+l*20+0]);
 #ifdef _FMA
 			ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-			ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+			ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 			
-			vv = _mm256_load_pd(&v[4]);
-			rightv = _mm256_load_pd(&right[k*400+l*20+4]);
+			vv = simde_mm256_load_pd(&v[4]);
+			rightv = simde_mm256_load_pd(&right[k*400+l*20+4]);
 #ifdef _FMA
 			ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-			ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+			ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 			
-			vv = _mm256_load_pd(&v[8]);
-			rightv = _mm256_load_pd(&right[k*400+l*20+8]);
+			vv = simde_mm256_load_pd(&v[8]);
+			rightv = simde_mm256_load_pd(&right[k*400+l*20+8]);
 #ifdef _FMA
 			ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-			ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+			ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 			
-			vv = _mm256_load_pd(&v[12]);
-			rightv = _mm256_load_pd(&right[k*400+l*20+12]);
+			vv = simde_mm256_load_pd(&v[12]);
+			rightv = simde_mm256_load_pd(&right[k*400+l*20+12]);
 #ifdef _FMA
 			ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-			ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+			ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 			
-			vv = _mm256_load_pd(&v[16]);
-			rightv = _mm256_load_pd(&right[k*400+l*20+16]);
+			vv = simde_mm256_load_pd(&v[16]);
+			rightv = simde_mm256_load_pd(&right[k*400+l*20+16]);
 #ifdef _FMA
 			ump_x2v = FMAMACC(ump_x2v,vv,rightv);
 #else
-			ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+			ump_x2v = simde_mm256_add_pd(ump_x2v, simde_mm256_mul_pd(vv, rightv));
 #endif
 			
 			ump_x2v = hadd3(ump_x2v);
-			_mm256_maskstore_pd(&ump_x2[l], bitmask, ump_x2v);
+			simde_mm256_maskstore_pd(&ump_x2[l], bitmask, ump_x2v);
 		      }
 		  
 		    
 		    v = &x3_ptr[k * 20];
 		    
-		    __m256d vv[5]; 
+		    simde__m256d vv[5]; 
 		    
-		    vv[0] = _mm256_setzero_pd();
-		    vv[1] = _mm256_setzero_pd();
-		    vv[2] = _mm256_setzero_pd();
-		    vv[3] = _mm256_setzero_pd();
-		    vv[4] = _mm256_setzero_pd();
+		    vv[0] = simde_mm256_setzero_pd();
+		    vv[1] = simde_mm256_setzero_pd();
+		    vv[2] = simde_mm256_setzero_pd();
+		    vv[3] = simde_mm256_setzero_pd();
+		    vv[4] = simde_mm256_setzero_pd();
 		    
 		    for(l = 0; l < 20; l++) 
 		      {
 			x1px2 = uX1[k * 20 + l]	* ump_x2[l];
-			__m256d x1px2v = _mm256_set1_pd(x1px2);	
+			simde__m256d x1px2v = simde_mm256_set1_pd(x1px2);	
 			
 #ifdef _FMA
-			__m256d ev = _mm256_load_pd(&extEV[l * 20 + 0]);
+			simde__m256d ev = simde_mm256_load_pd(&extEV[l * 20 + 0]);
 			vv[0] = FMAMACC(vv[0],x1px2v, ev);
 #else
-			vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 0])));
+			vv[0] = simde_mm256_add_pd(vv[0],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 0])));
 #endif
-			_mm256_store_pd(&v[0],vv[0]);
+			simde_mm256_store_pd(&v[0],vv[0]);
 			
 #ifdef _FMA
-			ev = _mm256_load_pd(&extEV[l * 20 + 4]);
+			ev = simde_mm256_load_pd(&extEV[l * 20 + 4]);
 			vv[1] = FMAMACC(vv[1],x1px2v, ev);
 #else
-			vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 4])));
+			vv[1] = simde_mm256_add_pd(vv[1],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 4])));
 #endif
-			_mm256_store_pd(&v[4],vv[1]);
+			simde_mm256_store_pd(&v[4],vv[1]);
 			
 #ifdef _FMA
-			ev = _mm256_load_pd(&extEV[l * 20 + 8]);
+			ev = simde_mm256_load_pd(&extEV[l * 20 + 8]);
 			vv[2] = FMAMACC(vv[2],x1px2v, ev);
 #else
-			vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 8])));
+			vv[2] = simde_mm256_add_pd(vv[2],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 8])));
 #endif
-			_mm256_store_pd(&v[8],vv[2]);
+			simde_mm256_store_pd(&v[8],vv[2]);
 			
 #ifdef _FMA
-			ev = _mm256_load_pd(&extEV[l * 20 + 12]);
+			ev = simde_mm256_load_pd(&extEV[l * 20 + 12]);
 			vv[3] = FMAMACC(vv[3],x1px2v, ev);
 #else
-			vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 12])));
+			vv[3] = simde_mm256_add_pd(vv[3],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 12])));
 #endif
-			_mm256_store_pd(&v[12],vv[3]);
+			simde_mm256_store_pd(&v[12],vv[3]);
 			
 			
 #ifdef _FMA
-			ev = _mm256_load_pd(&extEV[l * 20 + 16]);
+			ev = simde_mm256_load_pd(&extEV[l * 20 + 16]);
 			vv[4] = FMAMACC(vv[4],x1px2v, ev);
 #else
-			vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 16])));
+			vv[4] = simde_mm256_add_pd(vv[4],simde_mm256_mul_pd(x1px2v, simde_mm256_load_pd(&extEV[l * 20 + 16])));
 #endif
-			_mm256_store_pd(&v[16],vv[4]);
+			simde_mm256_store_pd(&v[16],vv[4]);
 			
 		      } 
 		  }
 		
 		v = x3_ptr;
-		__m256d minlikelihood_avx = _mm256_set1_pd(minlikelihood);
+		simde__m256d minlikelihood_avx = simde_mm256_set1_pd(minlikelihood);
 		scale = 1;
 		for(l = 0; scale && (l < 80); l += 4) 
 		  {
-		    __m256d vv = _mm256_load_pd(&v[l]);
-		    __m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
-		    vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
-		    if(_mm256_movemask_pd(vv_abs) != 15)
+		    simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		    simde__m256d vv_abs = simde_mm256_and_pd(vv,absMask_AVX.m);
+		    vv_abs = simde_mm256_cmp_pd(vv_abs,minlikelihood_avx,SIMDE_CMP_LT_OS);
+		    if(simde_mm256_movemask_pd(vv_abs) != 15)
 		      scale = 0;
 		  }
 	    
 		if(scale) 
 		  {		
-		    __m256d twotothe256v = _mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
+		    simde__m256d twotothe256v = simde_mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
 		    for(l = 0; l < 80; l += 4) 
 		      {
-			__m256d vv = _mm256_load_pd(&v[l]);
-			_mm256_store_pd(&v[l],_mm256_mul_pd(vv,twotothe256v));
+			simde__m256d vv = simde_mm256_load_pd(&v[l]);
+			simde_mm256_store_pd(&v[l],simde_mm256_mul_pd(vv,twotothe256v));
 		      }
 		    if(useFastScaling)
 		      addScale += wgt[i];				
@@ -3691,156 +3689,156 @@
 	  vr = &(x2_gapColumn[20 * k]);
 	  v  = &(x3_gapColumn[20 * k]);	      	   
 
-	  __m256d vv[5]; 
+	  simde__m256d vv[5]; 
 	  
-	  vv[0] = _mm256_setzero_pd();
-	  vv[1] = _mm256_setzero_pd();
-	  vv[2] = _mm256_setzero_pd();
-	  vv[3] = _mm256_setzero_pd();
-	  vv[4] = _mm256_setzero_pd();
+	  vv[0] = simde_mm256_setzero_pd();
+	  vv[1] = simde_mm256_setzero_pd();
+	  vv[2] = simde_mm256_setzero_pd();
+	  vv[3] = simde_mm256_setzero_pd();
+	  vv[4] = simde_mm256_setzero_pd();
 	  
 	  for(l = 0; l < 20; l++) 
 	    {		  
-	      __m256d al = _mm256_setzero_pd();
-	      __m256d ar = _mm256_setzero_pd();
+	      simde__m256d al = simde_mm256_setzero_pd();
+	      simde__m256d ar = simde_mm256_setzero_pd();
 	      
-	      __m256d leftv  = _mm256_load_pd(&left[k * 400 + l * 20 + 0]);
-	      __m256d rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 0]);
-	      __m256d vlv = _mm256_load_pd(&vl[0]);
-	      __m256d vrv = _mm256_load_pd(&vr[0]);
+	      simde__m256d leftv  = simde_mm256_load_pd(&left[k * 400 + l * 20 + 0]);
+	      simde__m256d rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 0]);
+	      simde__m256d vlv = simde_mm256_load_pd(&vl[0]);
+	      simde__m256d vrv = simde_mm256_load_pd(&vr[0]);
 	      
 #ifdef _FMA
 	      
 	      al = FMAMACC(al, vlv, leftv);
 	      ar = FMAMACC(ar, vrv, rightv);
 #else
-	      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-	      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));		  
+	      al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+	      ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));		  
 #endif
 	      
-	      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 4]);
-	      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 4]);
-	      vlv = _mm256_load_pd(&vl[4]);
-	      vrv = _mm256_load_pd(&vr[4]);
+	      leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 4]);
+	      rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 4]);
+	      vlv = simde_mm256_load_pd(&vl[4]);
+	      vrv = simde_mm256_load_pd(&vr[4]);
 #ifdef _FMA
 	      
 	      al = FMAMACC(al, vlv, leftv);
 	      ar = FMAMACC(ar, vrv, rightv);
 #else
-	      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-	      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+	      al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+	      ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 	      
-	      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 8]);
-	      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 8]);
-	      vlv = _mm256_load_pd(&vl[8]);
-	      vrv = _mm256_load_pd(&vr[8]);
+	      leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 8]);
+	      rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 8]);
+	      vlv = simde_mm256_load_pd(&vl[8]);
+	      vrv = simde_mm256_load_pd(&vr[8]);
 #ifdef _FMA
 	      
 	      al = FMAMACC(al, vlv, leftv);
 	      ar = FMAMACC(ar, vrv, rightv);
 #else
-	      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-	      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+	      al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+	      ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 	      
-	      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 12]);
-	      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 12]);
-	      vlv = _mm256_load_pd(&vl[12]);
-	      vrv = _mm256_load_pd(&vr[12]);
+	      leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 12]);
+	      rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 12]);
+	      vlv = simde_mm256_load_pd(&vl[12]);
+	      vrv = simde_mm256_load_pd(&vr[12]);
 #ifdef _FMA
 	      
 	      al = FMAMACC(al, vlv, leftv);
 	      ar = FMAMACC(ar, vrv, rightv);
 #else
-	      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-	      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+	      al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+	      ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 	      
-	      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 16]);
-	      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 16]);
-	      vlv = _mm256_load_pd(&vl[16]);
-	      vrv = _mm256_load_pd(&vr[16]);
+	      leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 16]);
+	      rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 16]);
+	      vlv = simde_mm256_load_pd(&vl[16]);
+	      vrv = simde_mm256_load_pd(&vr[16]);
 	      
 #ifdef _FMA		    
 	      al = FMAMACC(al, vlv, leftv);
 	      ar = FMAMACC(ar, vrv, rightv);
 #else
-	      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-	      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+	      al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+	      ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 	      
 	      /**************************************************************************************************************/
 	      
 	      al = hadd3(al);
 	      ar = hadd3(ar);
-	      al = _mm256_mul_pd(ar,al);
+	      al = simde_mm256_mul_pd(ar,al);
 	      
 	      /************************************************************************************************************/
 #ifdef _FMA		    
-	      __m256d ev =  _mm256_load_pd(&extEV[20 * l + 0]);
+	      simde__m256d ev =  simde_mm256_load_pd(&extEV[20 * l + 0]);
 	      vv[0] = FMAMACC(vv[0], al, ev);		 
 #else
-	      vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 0])));			  		 		  
+	      vv[0] = simde_mm256_add_pd(vv[0],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 0])));			  		 		  
 #endif
-	      _mm256_store_pd(&v[0],vv[0]);
+	      simde_mm256_store_pd(&v[0],vv[0]);
 	      
 #ifdef _FMA		    
-	      ev =  _mm256_load_pd(&extEV[20 * l + 4]);
+	      ev =  simde_mm256_load_pd(&extEV[20 * l + 4]);
 	      vv[1] = FMAMACC(vv[1], al, ev);		 
 #else
-	      vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 4])));		  		 
+	      vv[1] = simde_mm256_add_pd(vv[1],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 4])));		  		 
 #endif
-	      _mm256_store_pd(&v[4],vv[1]);
+	      simde_mm256_store_pd(&v[4],vv[1]);
 	      
 #ifdef _FMA		    
-	      ev =  _mm256_load_pd(&extEV[20 * l + 8]);
+	      ev =  simde_mm256_load_pd(&extEV[20 * l + 8]);
 	      vv[2] = FMAMACC(vv[2], al, ev);		 
 #else
-	      vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 8])));		  		 
+	      vv[2] = simde_mm256_add_pd(vv[2],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 8])));		  		 
 #endif
-	      _mm256_store_pd(&v[8],vv[2]);
+	      simde_mm256_store_pd(&v[8],vv[2]);
 	      
 #ifdef _FMA		    
-	      ev =  _mm256_load_pd(&extEV[20 * l + 12]);
+	      ev =  simde_mm256_load_pd(&extEV[20 * l + 12]);
 	      vv[3] = FMAMACC(vv[3], al, ev);		 
 #else
-	      vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 12])));		  		 
+	      vv[3] = simde_mm256_add_pd(vv[3],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 12])));		  		 
 #endif
-	      _mm256_store_pd(&v[12],vv[3]);
+	      simde_mm256_store_pd(&v[12],vv[3]);
 	      
 #ifdef _FMA		    
-	      ev =  _mm256_load_pd(&extEV[20 * l + 16]);
+	      ev =  simde_mm256_load_pd(&extEV[20 * l + 16]);
 	      vv[4] = FMAMACC(vv[4], al, ev);		 
 #else
-	      vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 16])));			 	  
+	      vv[4] = simde_mm256_add_pd(vv[4],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 16])));			 	  
 #endif
-	      _mm256_store_pd(&v[16],vv[4]);		 
+	      simde_mm256_store_pd(&v[16],vv[4]);		 
 	    } 
 	}
 	
       v = x3_gapColumn;
       scale = 1;
-      __m256d minlikelihood_avx = _mm256_set1_pd(minlikelihood);	 
+      simde__m256d minlikelihood_avx = simde_mm256_set1_pd(minlikelihood);	 
       
       for(l = 0; scale && (l < 80); l += 4) 
 	{
-	  __m256d vv = _mm256_load_pd(&v[l]);
-	  __m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
-	  vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
-	  if(_mm256_movemask_pd(vv_abs) != 15)
+	  simde__m256d vv = simde_mm256_load_pd(&v[l]);
+	  simde__m256d vv_abs = simde_mm256_and_pd(vv,absMask_AVX.m);
+	  vv_abs = simde_mm256_cmp_pd(vv_abs,minlikelihood_avx,SIMDE_CMP_LT_OS);
+	  if(simde_mm256_movemask_pd(vv_abs) != 15)
 	    scale = 0;	     
 	}
 
       if(scale) 
 	{		     	      
-	  __m256d twotothe256v = _mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
+	  simde__m256d twotothe256v = simde_mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
 	  gapScaling = 1;
 
 	  for(l = 0; l < 80; l += 4) 
 	    {
-	      __m256d vv = _mm256_load_pd(&v[l]);
-	      _mm256_store_pd(&v[l],_mm256_mul_pd(vv,twotothe256v));
+	      simde__m256d vv = simde_mm256_load_pd(&v[l]);
+	      simde_mm256_store_pd(&v[l],simde_mm256_mul_pd(vv,twotothe256v));
 	    }
 	  
 	} 
@@ -3884,155 +3882,155 @@
 		  vr = &(x2[20 * k]);
 		  v  = &(x3_ptr[20 * k]);	      	   
 		  
-		  __m256d vv[5]; 
+		  simde__m256d vv[5]; 
 		  
-		  vv[0] = _mm256_setzero_pd();
-		  vv[1] = _mm256_setzero_pd();
-		  vv[2] = _mm256_setzero_pd();
-		  vv[3] = _mm256_setzero_pd();
-		  vv[4] = _mm256_setzero_pd();
+		  vv[0] = simde_mm256_setzero_pd();
+		  vv[1] = simde_mm256_setzero_pd();
+		  vv[2] = simde_mm256_setzero_pd();
+		  vv[3] = simde_mm256_setzero_pd();
+		  vv[4] = simde_mm256_setzero_pd();
 		  
 		  for(l = 0; l < 20; l++) 
 		    {		  
-		      __m256d al = _mm256_setzero_pd();
-		      __m256d ar = _mm256_setzero_pd();
+		      simde__m256d al = simde_mm256_setzero_pd();
+		      simde__m256d ar = simde_mm256_setzero_pd();
 		      
-		      __m256d leftv  = _mm256_load_pd(&left[k * 400 + l * 20 + 0]);
-		      __m256d rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 0]);
-		      __m256d vlv = _mm256_load_pd(&vl[0]);
-		      __m256d vrv = _mm256_load_pd(&vr[0]);
+		      simde__m256d leftv  = simde_mm256_load_pd(&left[k * 400 + l * 20 + 0]);
+		      simde__m256d rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 0]);
+		      simde__m256d vlv = simde_mm256_load_pd(&vl[0]);
+		      simde__m256d vrv = simde_mm256_load_pd(&vr[0]);
 		      
 #ifdef _FMA
 		      
 		      al = FMAMACC(al, vlv, leftv);
 		      ar = FMAMACC(ar, vrv, rightv);
 #else
-		      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));		  
+		      al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		      ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));		  
 #endif
 		      
-		      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 4]);
-		      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 4]);
-		      vlv = _mm256_load_pd(&vl[4]);
-		      vrv = _mm256_load_pd(&vr[4]);
+		      leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 4]);
+		      rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 4]);
+		      vlv = simde_mm256_load_pd(&vl[4]);
+		      vrv = simde_mm256_load_pd(&vr[4]);
 #ifdef _FMA
 		      
 		      al = FMAMACC(al, vlv, leftv);
 		      ar = FMAMACC(ar, vrv, rightv);
 #else
-		      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+		      al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		      ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 		      
-		      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 8]);
-		      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 8]);
-		      vlv = _mm256_load_pd(&vl[8]);
-		      vrv = _mm256_load_pd(&vr[8]);
+		      leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 8]);
+		      rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 8]);
+		      vlv = simde_mm256_load_pd(&vl[8]);
+		      vrv = simde_mm256_load_pd(&vr[8]);
 #ifdef _FMA
 		      
 		      al = FMAMACC(al, vlv, leftv);
 		      ar = FMAMACC(ar, vrv, rightv);
 #else
-		      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+		      al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		      ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 		      
-		      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 12]);
-		      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 12]);
-		      vlv = _mm256_load_pd(&vl[12]);
-		      vrv = _mm256_load_pd(&vr[12]);
+		      leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 12]);
+		      rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 12]);
+		      vlv = simde_mm256_load_pd(&vl[12]);
+		      vrv = simde_mm256_load_pd(&vr[12]);
 #ifdef _FMA
 		      
 		      al = FMAMACC(al, vlv, leftv);
 		      ar = FMAMACC(ar, vrv, rightv);
 #else
-		      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+		      al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		      ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 		      
-		      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 16]);
-		      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 16]);
-		      vlv = _mm256_load_pd(&vl[16]);
-		      vrv = _mm256_load_pd(&vr[16]);
+		      leftv = simde_mm256_load_pd(&left[k * 400 + l * 20 + 16]);
+		      rightv = simde_mm256_load_pd(&right[k * 400 + l * 20 + 16]);
+		      vlv = simde_mm256_load_pd(&vl[16]);
+		      vrv = simde_mm256_load_pd(&vr[16]);
 		      
 #ifdef _FMA		    
 		      al = FMAMACC(al, vlv, leftv);
 		      ar = FMAMACC(ar, vrv, rightv);
 #else
-		      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
-		      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+		      al = simde_mm256_add_pd(al,simde_mm256_mul_pd(vlv,leftv));
+		      ar = simde_mm256_add_pd(ar,simde_mm256_mul_pd(vrv,rightv));
 #endif
 		      
 		      /**************************************************************************************************************/
 		      
 		      al = hadd3(al);
 		      ar = hadd3(ar);
-		      al = _mm256_mul_pd(ar,al);
+		      al = simde_mm256_mul_pd(ar,al);
 		      
 		      /************************************************************************************************************/
 #ifdef _FMA		    
-		      __m256d ev =  _mm256_load_pd(&extEV[20 * l + 0]);
+		      simde__m256d ev =  simde_mm256_load_pd(&extEV[20 * l + 0]);
 		      vv[0] = FMAMACC(vv[0], al, ev);		 
 #else
-		      vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 0])));			  		 		  
+		      vv[0] = simde_mm256_add_pd(vv[0],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 0])));			  		 		  
 #endif
-		      _mm256_store_pd(&v[0],vv[0]);
+		      simde_mm256_store_pd(&v[0],vv[0]);
 		      
 #ifdef _FMA		    
-		      ev =  _mm256_load_pd(&extEV[20 * l + 4]);
+		      ev =  simde_mm256_load_pd(&extEV[20 * l + 4]);
 		      vv[1] = FMAMACC(vv[1], al, ev);		 
 #else
-		      vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 4])));		  		 
+		      vv[1] = simde_mm256_add_pd(vv[1],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 4])));		  		 
 #endif
-		      _mm256_store_pd(&v[4],vv[1]);
+		      simde_mm256_store_pd(&v[4],vv[1]);
 		      
 #ifdef _FMA		    
-		      ev =  _mm256_load_pd(&extEV[20 * l + 8]);
+		      ev =  simde_mm256_load_pd(&extEV[20 * l + 8]);
 		      vv[2] = FMAMACC(vv[2], al, ev);		 
 #else
-		      vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 8])));		  		 
+		      vv[2] = simde_mm256_add_pd(vv[2],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 8])));		  		 
 #endif
-		      _mm256_store_pd(&v[8],vv[2]);
+		      simde_mm256_store_pd(&v[8],vv[2]);
 		      
 #ifdef _FMA		    
-		      ev =  _mm256_load_pd(&extEV[20 * l + 12]);
+		      ev =  simde_mm256_load_pd(&extEV[20 * l + 12]);
 		      vv[3] = FMAMACC(vv[3], al, ev);		 
 #else
-		      vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 12])));		  		 
+		      vv[3] = simde_mm256_add_pd(vv[3],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 12])));		  		 
 #endif
-		      _mm256_store_pd(&v[12],vv[3]);
+		      simde_mm256_store_pd(&v[12],vv[3]);
 		      
 #ifdef _FMA		    
-		      ev =  _mm256_load_pd(&extEV[20 * l + 16]);
+		      ev =  simde_mm256_load_pd(&extEV[20 * l + 16]);
 		      vv[4] = FMAMACC(vv[4], al, ev);		 
 #else
-		      vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 16])));			 	  
+		      vv[4] = simde_mm256_add_pd(vv[4],simde_mm256_mul_pd(al, simde_mm256_load_pd(&extEV[20 * l + 16])));			 	  
 #endif
-		      _mm256_store_pd(&v[16],vv[4]);		 
+		      simde_mm256_store_pd(&v[16],vv[4]);		 
 		    }
 		}
 	      
 	      v = x3_ptr;
 	      scale = 1;
 	      
-	      __m256d minlikelihood_avx = _mm256_set1_pd(minlikelihood);	 
+	      simde__m256d minlikelihood_avx = simde_mm256_set1_pd(minlikelihood);	 
 	      
 	      for(l = 0; scale && (l < 80); l += 4) 
 		{
-		  __m256d vv = _mm256_load_pd(&v[l]);
-		  __m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
-		  vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
-		  if(_mm256_movemask_pd(vv_abs) != 15)
+		  simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		  simde__m256d vv_abs = simde_mm256_and_pd(vv,absMask_AVX.m);
+		  vv_abs = simde_mm256_cmp_pd(vv_abs,minlikelihood_avx,SIMDE_CMP_LT_OS);
+		  if(simde_mm256_movemask_pd(vv_abs) != 15)
 		    scale = 0;	     
 		}
 	      
 	      if(scale) 
 		{		     	      
-		  __m256d twotothe256v = _mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
+		  simde__m256d twotothe256v = simde_mm256_set_pd(twotothe256,twotothe256,twotothe256,twotothe256);
 		  for(l = 0; l < 80; l += 4) 
 		    {
-		      __m256d vv = _mm256_load_pd(&v[l]);
-		      _mm256_store_pd(&v[l],_mm256_mul_pd(vv,twotothe256v));
+		      simde__m256d vv = simde_mm256_load_pd(&v[l]);
+		      simde_mm256_store_pd(&v[l],simde_mm256_mul_pd(vv,twotothe256v));
 		    }
 		  if(useFastScaling)
 		    addScale += wgt[i];					
--- examl.orig/examl/evaluateGenericSpecial.c
+++ examl/examl/evaluateGenericSpecial.c
@@ -43,11 +43,7 @@
 
 /* includes for using SSE3 intrinsics */
 
-#ifdef __SIM_SSE3
-#include <xmmintrin.h>
-#include <pmmintrin.h>
-/*#include <tmmintrin.h>*/
-#endif
+#include "../debian/include/simde/x86/sse3.h"
 
 #ifdef __MIC_NATIVE
 #include "mic_native.h"
@@ -1033,7 +1029,7 @@
           diagptable = &(diagptable_start[2 * cptr[i]]);                          
         
 
-          _mm_store_pd(t, _mm_mul_pd(_mm_load_pd(x1), _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(diagptable))));
+          simde_mm_store_pd(t, simde_mm_mul_pd(simde_mm_load_pd(x1), simde_mm_mul_pd(simde_mm_load_pd(x2), simde_mm_load_pd(diagptable))));
           
           if(fastScaling)
             term = log(fabs(t[0] + t[1]));
@@ -1055,7 +1051,7 @@
           
           diagptable = &diagptable_start[2 * cptr[i]];            
 
-          _mm_store_pd(t, _mm_mul_pd(_mm_load_pd(x1), _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(diagptable))));
+          simde_mm_store_pd(t, simde_mm_mul_pd(simde_mm_load_pd(x1), simde_mm_mul_pd(simde_mm_load_pd(x2), simde_mm_load_pd(diagptable))));
           
           if(fastScaling)
             term = log(fabs(t[0] + t[1]));
@@ -1085,26 +1081,26 @@
       for (i = 0; i < n; i++)
         {
           double t[2] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-          __m128d termv, x1v, x2v, dv;
+          simde__m128d termv, x1v, x2v, dv;
 	  
           x1 = &(tipVector[2 * tipX1[i]]);       
           x2 = &x2_start[8 * i];                                
 
-          termv = _mm_set1_pd(0.0);                
+          termv = simde_mm_set1_pd(0.0);                
           
           for(j = 0; j < 4; j++)
             {
-              x1v = _mm_load_pd(&x1[0]);
-              x2v = _mm_load_pd(&x2[j * 2]);
-              dv   = _mm_load_pd(&diagptable[j * 2]);
+              x1v = simde_mm_load_pd(&x1[0]);
+              x2v = simde_mm_load_pd(&x2[j * 2]);
+              dv   = simde_mm_load_pd(&diagptable[j * 2]);
               
-              x1v = _mm_mul_pd(x1v, x2v);
-              x1v = _mm_mul_pd(x1v, dv);
+              x1v = simde_mm_mul_pd(x1v, x2v);
+              x1v = simde_mm_mul_pd(x1v, dv);
               
-              termv = _mm_add_pd(termv, x1v);                 
+              termv = simde_mm_add_pd(termv, x1v);                 
             }
           
-          _mm_store_pd(t, termv);               
+          simde_mm_store_pd(t, termv);               
           
           if(fastScaling)
             term = log(0.25 * (fabs(t[0] + t[1])));
@@ -1121,27 +1117,27 @@
         {
 
           double t[2] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-          __m128d termv, x1v, x2v, dv;
+          simde__m128d termv, x1v, x2v, dv;
                         
           x1 = &x1_start[8 * i];
           x2 = &x2_start[8 * i];
                   
 
-          termv = _mm_set1_pd(0.0);                
+          termv = simde_mm_set1_pd(0.0);                
           
           for(j = 0; j < 4; j++)
             {
-              x1v = _mm_load_pd(&x1[j * 2]);
-              x2v = _mm_load_pd(&x2[j * 2]);
-              dv   = _mm_load_pd(&diagptable[j * 2]);
+              x1v = simde_mm_load_pd(&x1[j * 2]);
+              x2v = simde_mm_load_pd(&x2[j * 2]);
+              dv   = simde_mm_load_pd(&diagptable[j * 2]);
               
-              x1v = _mm_mul_pd(x1v, x2v);
-              x1v = _mm_mul_pd(x1v, dv);
+              x1v = simde_mm_mul_pd(x1v, x2v);
+              x1v = simde_mm_mul_pd(x1v, dv);
               
-              termv = _mm_add_pd(termv, x1v);                 
+              termv = simde_mm_add_pd(termv, x1v);                 
             }
           
-          _mm_store_pd(t, termv);
+          simde_mm_store_pd(t, termv);
           
           
           if(fastScaling)
@@ -1174,18 +1170,17 @@
     {               
       for (i = 0; i < n; i++) 
 	{
-#ifdef __SIM_SSE3  	  
-	  __m128d 
-	    tv = _mm_setzero_pd();
+	  simde__m128d 
+	    tv = simde_mm_setzero_pd();
 	      	 	  	 
 	  for(j = 0, term = 0.0; j < 4; j++)
 	    {
 	      double 
 		*d = &diagptable[j * 20];
 
-	      __m128d 
-		t = _mm_setzero_pd(),
-		w = _mm_set1_pd(weights[j]);
+	      simde__m128d 
+		t = simde_mm_setzero_pd(),
+		w = simde_mm_set1_pd(weights[j]);
 	      
 	      
 	      left = &(tipVector[j][20 * tipX1[i]]);
@@ -1193,30 +1188,16 @@
 	      
 	      for(l = 0; l < 20; l+=2)
 		{
-		  __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
-		  t = _mm_add_pd(t, _mm_mul_pd(mul, _mm_load_pd(&d[l])));		   
+		  simde__m128d mul = simde_mm_mul_pd(simde_mm_load_pd(&left[l]), simde_mm_load_pd(&right[l]));
+		  t = simde_mm_add_pd(t, simde_mm_mul_pd(mul, simde_mm_load_pd(&d[l])));		   
 		}
 	      
-	      tv = _mm_add_pd(tv, _mm_mul_pd(t, w));	      	      	     
+	      tv = simde_mm_add_pd(tv, simde_mm_mul_pd(t, w));	      	      	     
 	    }
 	  
-	  tv = _mm_hadd_pd(tv, tv);
-	  _mm_storel_pd(&term, tv);
+	  tv = simde_mm_hadd_pd(tv, tv);
+	  simde_mm_storel_pd(&term, tv);
 	  
-#else	  	  	  	  
-	  for(j = 0, term = 0.0; j < 4; j++)
-	    {
-	      double 
-		t = 0.0;
-	      
-	      left = &(tipVector[j][20 * tipX1[i]]);
-	      right = &(x2[80 * i + 20 * j]);
-	      for(l = 0; l < 20; l++)
-		t += left[l] * right[l] * diagptable[j * 20 + l];	      
-
-	      term += weights[j] * t;
-	    }	  
-#endif
 	  
 	  if(fastScaling)
 	    term = LOG(FABS(term));
@@ -1230,49 +1211,32 @@
     {
       for (i = 0; i < n; i++) 
 	{	  	 	             
-#ifdef __SIM_SSE3        
-	  __m128d 
-	    tv = _mm_setzero_pd();	 	  	  
+	  simde__m128d 
+	    tv = simde_mm_setzero_pd();	 	  	  
 	      
 	  for(j = 0, term = 0.0; j < 4; j++)
 	    {
 	      double 
 		*d = &diagptable[j * 20];
 
-	      __m128d 
-		t = _mm_setzero_pd(),
-		w = _mm_set1_pd(weights[j]);
+	      simde__m128d 
+		t = simde_mm_setzero_pd(),
+		w = simde_mm_set1_pd(weights[j]);
 	      
 	      left  = &(x1[80 * i + 20 * j]);
 	      right = &(x2[80 * i + 20 * j]);
 	      
 	      for(l = 0; l < 20; l+=2)
 		{
-		  __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
-		  t = _mm_add_pd(t, _mm_mul_pd(mul, _mm_load_pd(&d[l])));		   
+		  simde__m128d mul = simde_mm_mul_pd(simde_mm_load_pd(&left[l]), simde_mm_load_pd(&right[l]));
+		  t = simde_mm_add_pd(t, simde_mm_mul_pd(mul, simde_mm_load_pd(&d[l])));		   
 		}		 
 
-	       tv = _mm_add_pd(tv, _mm_mul_pd(t, w));
+	       tv = simde_mm_add_pd(tv, simde_mm_mul_pd(t, w));
 	    }
 	  
-	  tv = _mm_hadd_pd(tv, tv);
-	  _mm_storel_pd(&term, tv);	  
-	  
-#else
-	  for(j = 0, term = 0.0; j < 4; j++)
-	    {
-	      double
-		t = 0.0;
-	      
-	      left  = &(x1[80 * i + 20 * j]);
-	      right = &(x2[80 * i + 20 * j]);	    
-	      
-	      for(l = 0; l < 20; l++)
-		t += left[l] * right[l] * diagptable[j * 20 + l];	
-
-	      term += weights[j] * t;
-	    }
-#endif
+	  tv = simde_mm_hadd_pd(tv, tv);
+	  simde_mm_storel_pd(&term, tv);	  
 	  
 	  if(fastScaling)
 	    term = LOG(FABS(term));
@@ -1316,7 +1280,7 @@
 	      x2_ptr += 80;
 	    }
 
-	  __m128d tv = _mm_setzero_pd();
+	  simde__m128d tv = simde_mm_setzero_pd();
 	  left = &(tipVector[20 * tipX1[i]]);	  	  
 	  
 	  for(j = 0, term = 0.0; j < 4; j++)
@@ -1325,13 +1289,13 @@
 	      right = &(x2v[20 * j]);
 	      for(l = 0; l < 20; l+=2)
 		{
-		  __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
-		  tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));		   
+		  simde__m128d mul = simde_mm_mul_pd(simde_mm_load_pd(&left[l]), simde_mm_load_pd(&right[l]));
+		  tv = simde_mm_add_pd(tv, simde_mm_mul_pd(mul, simde_mm_load_pd(&d[l])));		   
 		}		 		
 	    }
 
-	  tv = _mm_hadd_pd(tv, tv);
-	  _mm_storel_pd(&term, tv);
+	  tv = simde_mm_hadd_pd(tv, tv);
+	  simde_mm_storel_pd(&term, tv);
 	  
 
 	  
@@ -1360,7 +1324,7 @@
 	      x2_ptr += 80;
 	    }
 	  	 	             
-	  __m128d tv = _mm_setzero_pd();	 	  	  
+	  simde__m128d tv = simde_mm_setzero_pd();	 	  	  
 	      
 	  for(j = 0, term = 0.0; j < 4; j++)
 	    {
@@ -1370,12 +1334,12 @@
 	      
 	      for(l = 0; l < 20; l+=2)
 		{
-		  __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
-		  tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));		   
+		  simde__m128d mul = simde_mm_mul_pd(simde_mm_load_pd(&left[l]), simde_mm_load_pd(&right[l]));
+		  tv = simde_mm_add_pd(tv, simde_mm_mul_pd(mul, simde_mm_load_pd(&d[l])));		   
 		}		 		
 	    }
-	  tv = _mm_hadd_pd(tv, tv);
-	  _mm_storel_pd(&term, tv);	  
+	  tv = simde_mm_hadd_pd(tv, tv);
+	  simde_mm_storel_pd(&term, tv);	  
 	  
 	 
 	  term = LOG(0.25 * FABS(term));
@@ -1404,7 +1368,7 @@
       for (i = 0; i < n; i++) 
 	{
 
-	  __m128d tv = _mm_setzero_pd();
+	  simde__m128d tv = simde_mm_setzero_pd();
 	  left = &(tipVector[20 * tipX1[i]]);	  	  
 	  
 	  for(j = 0, term = 0.0; j < 4; j++)
@@ -1413,12 +1377,12 @@
 	      right = &(x2[80 * i + 20 * j]);
 	      for(l = 0; l < 20; l+=2)
 		{
-		  __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
-		  tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));		   
+		  simde__m128d mul = simde_mm_mul_pd(simde_mm_load_pd(&left[l]), simde_mm_load_pd(&right[l]));
+		  tv = simde_mm_add_pd(tv, simde_mm_mul_pd(mul, simde_mm_load_pd(&d[l])));		   
 		}		 		
 	    }
-	  tv = _mm_hadd_pd(tv, tv);
-	  _mm_storel_pd(&term, tv);
+	  tv = simde_mm_hadd_pd(tv, tv);
+	  simde_mm_storel_pd(&term, tv);
 	  
 	  
 	 
@@ -1432,7 +1396,7 @@
     {
       for (i = 0; i < n; i++) 
 	{	  	 	             
-	  __m128d tv = _mm_setzero_pd();	 	  	  
+	  simde__m128d tv = simde_mm_setzero_pd();	 	  	  
 	      
 	  for(j = 0, term = 0.0; j < 4; j++)
 	    {
@@ -1442,12 +1406,12 @@
 	      
 	      for(l = 0; l < 20; l+=2)
 		{
-		  __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
-		  tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));		   
+		  simde__m128d mul = simde_mm_mul_pd(simde_mm_load_pd(&left[l]), simde_mm_load_pd(&right[l]));
+		  tv = simde_mm_add_pd(tv, simde_mm_mul_pd(mul, simde_mm_load_pd(&d[l])));		   
 		}		 		
 	    }
-	  tv = _mm_hadd_pd(tv, tv);
-	  _mm_storel_pd(&term, tv);	  
+	  tv = simde_mm_hadd_pd(tv, tv);
+	  simde_mm_storel_pd(&term, tv);	  
 	  
 	
 	  term = LOG(0.25 * FABS(term));
@@ -1478,20 +1442,20 @@
 	  
 	  diagptable = &diagptable_start[20 * cptr[i]];	           	 
 
-	  __m128d tv = _mm_setzero_pd();	    
+	  simde__m128d tv = simde_mm_setzero_pd();	    
 	  
 	  for(l = 0; l < 20; l+=2)
 	    {
-	      __m128d lv = _mm_load_pd(&left[l]);
-	      __m128d rv = _mm_load_pd(&right[l]);
-	      __m128d mul = _mm_mul_pd(lv, rv);
-	      __m128d dv = _mm_load_pd(&diagptable[l]);
+	      simde__m128d lv = simde_mm_load_pd(&left[l]);
+	      simde__m128d rv = simde_mm_load_pd(&right[l]);
+	      simde__m128d mul = simde_mm_mul_pd(lv, rv);
+	      simde__m128d dv = simde_mm_load_pd(&diagptable[l]);
 	      
-	      tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));		   
+	      tv = simde_mm_add_pd(tv, simde_mm_mul_pd(mul, dv));		   
 	    }		 		
 	  
-	  tv = _mm_hadd_pd(tv, tv);
-	  _mm_storel_pd(&term, tv);
+	  tv = simde_mm_hadd_pd(tv, tv);
+	  simde_mm_storel_pd(&term, tv);
   
 	  
 	  term = LOG(FABS(term));
@@ -1509,20 +1473,20 @@
 	  
 	  diagptable = &diagptable_start[20 * cptr[i]];	  	
 
-	  __m128d tv = _mm_setzero_pd();	    
+	  simde__m128d tv = simde_mm_setzero_pd();	    
 	      	    
 	  for(l = 0; l < 20; l+=2)
 	    {
-	      __m128d lv = _mm_load_pd(&left[l]);
-	      __m128d rv = _mm_load_pd(&right[l]);
-	      __m128d mul = _mm_mul_pd(lv, rv);
-	      __m128d dv = _mm_load_pd(&diagptable[l]);
+	      simde__m128d lv = simde_mm_load_pd(&left[l]);
+	      simde__m128d rv = simde_mm_load_pd(&right[l]);
+	      simde__m128d mul = simde_mm_mul_pd(lv, rv);
+	      simde__m128d dv = simde_mm_load_pd(&diagptable[l]);
 	      
-	      tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));		   
+	      tv = simde_mm_add_pd(tv, simde_mm_mul_pd(mul, dv));		   
 	    }		 		
 	  
-	  tv = _mm_hadd_pd(tv, tv);
-	  _mm_storel_pd(&term, tv);
+	  tv = simde_mm_hadd_pd(tv, tv);
+	  simde_mm_storel_pd(&term, tv);
 	  	  
 	  term = LOG(FABS(term));	 
 	  
@@ -1568,20 +1532,20 @@
 	  
 	  diagptable = &diagptable_start[20 * cptr[i]];	           	 
 
-	  __m128d tv = _mm_setzero_pd();	    
+	  simde__m128d tv = simde_mm_setzero_pd();	    
 	  
 	  for(l = 0; l < 20; l+=2)
 	    {
-	      __m128d lv = _mm_load_pd(&left[l]);
-	      __m128d rv = _mm_load_pd(&right[l]);
-	      __m128d mul = _mm_mul_pd(lv, rv);
-	      __m128d dv = _mm_load_pd(&diagptable[l]);
+	      simde__m128d lv = simde_mm_load_pd(&left[l]);
+	      simde__m128d rv = simde_mm_load_pd(&right[l]);
+	      simde__m128d mul = simde_mm_mul_pd(lv, rv);
+	      simde__m128d dv = simde_mm_load_pd(&diagptable[l]);
 	      
-	      tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));		   
+	      tv = simde_mm_add_pd(tv, simde_mm_mul_pd(mul, dv));		   
 	    }		 		
 	  
-	  tv = _mm_hadd_pd(tv, tv);
-	  _mm_storel_pd(&term, tv);
+	  tv = simde_mm_hadd_pd(tv, tv);
+	  simde_mm_storel_pd(&term, tv);
     
 	  
 	  term = LOG(FABS(term));
@@ -1612,20 +1576,20 @@
 	  
 	  diagptable = &diagptable_start[20 * cptr[i]];	  	
 
-	  __m128d tv = _mm_setzero_pd();	    
+	  simde__m128d tv = simde_mm_setzero_pd();	    
 	  
 	  for(l = 0; l < 20; l+=2)
 	    {
-	      __m128d lv = _mm_load_pd(&left[l]);
-	      __m128d rv = _mm_load_pd(&right[l]);
-	      __m128d mul = _mm_mul_pd(lv, rv);
-	      __m128d dv = _mm_load_pd(&diagptable[l]);
+	      simde__m128d lv = simde_mm_load_pd(&left[l]);
+	      simde__m128d rv = simde_mm_load_pd(&right[l]);
+	      simde__m128d mul = simde_mm_mul_pd(lv, rv);
+	      simde__m128d dv = simde_mm_load_pd(&diagptable[l]);
 	      
-	      tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));		   
+	      tv = simde_mm_add_pd(tv, simde_mm_mul_pd(mul, dv));		   
 	    }		 		
 	  
-	  tv = _mm_hadd_pd(tv, tv);
-	  _mm_storel_pd(&term, tv);
+	  tv = simde_mm_hadd_pd(tv, tv);
+	  simde_mm_storel_pd(&term, tv);
 	  	  
 	  term = LOG(FABS(term));	 
 	  
@@ -1656,7 +1620,7 @@
       for (i = 0; i < n; i++) 
 	{	
 	  double t[2] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-	  __m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
+	  simde__m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
 
 	  x1 = &(tipVector[4 * tipX1[i]]);
 
@@ -1670,22 +1634,22 @@
 	  
 	  diagptable = &diagptable_start[4 * cptr[i]];
 	  	    	  
-	  x1v1 =  _mm_load_pd(&x1[0]);
-	  x1v2 =  _mm_load_pd(&x1[2]);
-	  x2v1 =  _mm_load_pd(&x2[0]);
-	  x2v2 =  _mm_load_pd(&x2[2]);
-	  dv1  =  _mm_load_pd(&diagptable[0]);
-	  dv2  =  _mm_load_pd(&diagptable[2]);
-	  
-	  x1v1 = _mm_mul_pd(x1v1, x2v1);
-	  x1v1 = _mm_mul_pd(x1v1, dv1);
+	  x1v1 =  simde_mm_load_pd(&x1[0]);
+	  x1v2 =  simde_mm_load_pd(&x1[2]);
+	  x2v1 =  simde_mm_load_pd(&x2[0]);
+	  x2v2 =  simde_mm_load_pd(&x2[2]);
+	  dv1  =  simde_mm_load_pd(&diagptable[0]);
+	  dv2  =  simde_mm_load_pd(&diagptable[2]);
+	  
+	  x1v1 = simde_mm_mul_pd(x1v1, x2v1);
+	  x1v1 = simde_mm_mul_pd(x1v1, dv1);
 	  
-	  x1v2 = _mm_mul_pd(x1v2, x2v2);
-	  x1v2 = _mm_mul_pd(x1v2, dv2);
+	  x1v2 = simde_mm_mul_pd(x1v2, x2v2);
+	  x1v2 = simde_mm_mul_pd(x1v2, dv2);
 	  
-	  x1v1 = _mm_add_pd(x1v1, x1v2);
+	  x1v1 = simde_mm_add_pd(x1v1, x1v2);
 	  
-	  _mm_store_pd(t, x1v1);
+	  simde_mm_store_pd(t, x1v1);
 	  	  
 	  term = LOG(FABS(t[0] + t[1]));
 	      
@@ -1699,7 +1663,7 @@
       for (i = 0; i < n; i++) 
 	{ 
 	  double t[2] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-	  __m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
+	  simde__m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
 	   
 	  if(isGap(x1_gap, i))
 	    x1 = x1_gapColumn;
@@ -1719,22 +1683,22 @@
 	  
 	  diagptable = &diagptable_start[4 * cptr[i]];	
 	  
-	  x1v1 =  _mm_load_pd(&x1[0]);
-	  x1v2 =  _mm_load_pd(&x1[2]);
-	  x2v1 =  _mm_load_pd(&x2[0]);
-	  x2v2 =  _mm_load_pd(&x2[2]);
-	  dv1  =  _mm_load_pd(&diagptable[0]);
-	  dv2  =  _mm_load_pd(&diagptable[2]);
-	  
-	  x1v1 = _mm_mul_pd(x1v1, x2v1);
-	  x1v1 = _mm_mul_pd(x1v1, dv1);
+	  x1v1 =  simde_mm_load_pd(&x1[0]);
+	  x1v2 =  simde_mm_load_pd(&x1[2]);
+	  x2v1 =  simde_mm_load_pd(&x2[0]);
+	  x2v2 =  simde_mm_load_pd(&x2[2]);
+	  dv1  =  simde_mm_load_pd(&diagptable[0]);
+	  dv2  =  simde_mm_load_pd(&diagptable[2]);
+	  
+	  x1v1 = simde_mm_mul_pd(x1v1, x2v1);
+	  x1v1 = simde_mm_mul_pd(x1v1, dv1);
 	  
-	  x1v2 = _mm_mul_pd(x1v2, x2v2);
-	  x1v2 = _mm_mul_pd(x1v2, dv2);
+	  x1v2 = simde_mm_mul_pd(x1v2, x2v2);
+	  x1v2 = simde_mm_mul_pd(x1v2, dv2);
 	  
-	  x1v1 = _mm_add_pd(x1v1, x1v2);
+	  x1v1 = simde_mm_add_pd(x1v1, x1v2);
 	  
-	  _mm_store_pd(t, x1v1);
+	  simde_mm_store_pd(t, x1v1);
 	  
 	 
 	  term = LOG(FABS(t[0] + t[1]));
@@ -1770,7 +1734,7 @@
       for (i = 0; i < n; i++)
 	{
 	  double t[2] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-	  __m128d termv, x1v, x2v, dv;
+	  simde__m128d termv, x1v, x2v, dv;
 
 	  x1 = &(tipVector[4 * tipX1[i]]);	 
 	  if(x2_gap[i / 32] & mask32[i % 32])
@@ -1782,30 +1746,30 @@
 	    }
 	  
 	
-	  termv = _mm_set1_pd(0.0);	    	   
+	  termv = simde_mm_set1_pd(0.0);	    	   
 	  
 	  for(j = 0; j < 4; j++)
 	    {
-	      x1v = _mm_load_pd(&x1[0]);
-	      x2v = _mm_load_pd(&x2[j * 4]);
-	      dv   = _mm_load_pd(&diagptable[j * 4]);
+	      x1v = simde_mm_load_pd(&x1[0]);
+	      x2v = simde_mm_load_pd(&x2[j * 4]);
+	      dv   = simde_mm_load_pd(&diagptable[j * 4]);
 	      
-	      x1v = _mm_mul_pd(x1v, x2v);
-	      x1v = _mm_mul_pd(x1v, dv);
+	      x1v = simde_mm_mul_pd(x1v, x2v);
+	      x1v = simde_mm_mul_pd(x1v, dv);
 	      
-	      termv = _mm_add_pd(termv, x1v);
+	      termv = simde_mm_add_pd(termv, x1v);
 	      
-	      x1v = _mm_load_pd(&x1[2]);
-	      x2v = _mm_load_pd(&x2[j * 4 + 2]);
-	      dv   = _mm_load_pd(&diagptable[j * 4 + 2]);
+	      x1v = simde_mm_load_pd(&x1[2]);
+	      x2v = simde_mm_load_pd(&x2[j * 4 + 2]);
+	      dv   = simde_mm_load_pd(&diagptable[j * 4 + 2]);
 	      
-	      x1v = _mm_mul_pd(x1v, x2v);
-	      x1v = _mm_mul_pd(x1v, dv);
+	      x1v = simde_mm_mul_pd(x1v, x2v);
+	      x1v = simde_mm_mul_pd(x1v, dv);
 	      
-	      termv = _mm_add_pd(termv, x1v);
+	      termv = simde_mm_add_pd(termv, x1v);
 	    }
 	  
-	  _mm_store_pd(t, termv);	  	 
+	  simde_mm_store_pd(t, termv);	  	 
 
 	 
 	  term = LOG(0.25 * FABS(t[0] + t[1]));
@@ -1821,7 +1785,7 @@
 	{
 
 	  double t[2] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-	  __m128d termv, x1v, x2v, dv;
+	  simde__m128d termv, x1v, x2v, dv;
 
 	  if(x1_gap[i / 32] & mask32[i % 32])
 	    x1 = x1_gapColumn;
@@ -1839,30 +1803,30 @@
 	      x2_ptr += 16;
 	    }
 	
-	  termv = _mm_set1_pd(0.0);	  	 
+	  termv = simde_mm_set1_pd(0.0);	  	 
 	  
 	  for(j = 0; j < 4; j++)
 	    {
-	      x1v = _mm_load_pd(&x1[j * 4]);
-	      x2v = _mm_load_pd(&x2[j * 4]);
-	      dv   = _mm_load_pd(&diagptable[j * 4]);
+	      x1v = simde_mm_load_pd(&x1[j * 4]);
+	      x2v = simde_mm_load_pd(&x2[j * 4]);
+	      dv   = simde_mm_load_pd(&diagptable[j * 4]);
 	      
-	      x1v = _mm_mul_pd(x1v, x2v);
-	      x1v = _mm_mul_pd(x1v, dv);
+	      x1v = simde_mm_mul_pd(x1v, x2v);
+	      x1v = simde_mm_mul_pd(x1v, dv);
 	      
-	      termv = _mm_add_pd(termv, x1v);
+	      termv = simde_mm_add_pd(termv, x1v);
 	      
-	      x1v = _mm_load_pd(&x1[j * 4 + 2]);
-	      x2v = _mm_load_pd(&x2[j * 4 + 2]);
-	      dv   = _mm_load_pd(&diagptable[j * 4 + 2]);
+	      x1v = simde_mm_load_pd(&x1[j * 4 + 2]);
+	      x2v = simde_mm_load_pd(&x2[j * 4 + 2]);
+	      dv   = simde_mm_load_pd(&diagptable[j * 4 + 2]);
 	      
-	      x1v = _mm_mul_pd(x1v, x2v);
-	      x1v = _mm_mul_pd(x1v, dv);
+	      x1v = simde_mm_mul_pd(x1v, x2v);
+	      x1v = simde_mm_mul_pd(x1v, dv);
 	      
-	      termv = _mm_add_pd(termv, x1v);
+	      termv = simde_mm_add_pd(termv, x1v);
 	    }
 	  
-	  _mm_store_pd(t, termv);
+	  simde_mm_store_pd(t, termv);
 
 	 
 	  term = LOG(0.25 * FABS(t[0] + t[1]));
@@ -1894,36 +1858,36 @@
 	{
 
 	  double t[2] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-	  __m128d termv, x1v, x2v, dv;
+	  simde__m128d termv, x1v, x2v, dv;
 
 	  x1 = &(tipVector[4 * tipX1[i]]);	 
 	  x2 = &x2_start[16 * i];	 
 	  
 	
-	  termv = _mm_set1_pd(0.0);	    	   
+	  termv = simde_mm_set1_pd(0.0);	    	   
 	  
 	  for(j = 0; j < 4; j++)
 	    {
-	      x1v = _mm_load_pd(&x1[0]);
-	      x2v = _mm_load_pd(&x2[j * 4]);
-	      dv   = _mm_load_pd(&diagptable[j * 4]);
+	      x1v = simde_mm_load_pd(&x1[0]);
+	      x2v = simde_mm_load_pd(&x2[j * 4]);
+	      dv   = simde_mm_load_pd(&diagptable[j * 4]);
 	      
-	      x1v = _mm_mul_pd(x1v, x2v);
-	      x1v = _mm_mul_pd(x1v, dv);
+	      x1v = simde_mm_mul_pd(x1v, x2v);
+	      x1v = simde_mm_mul_pd(x1v, dv);
 	      
-	      termv = _mm_add_pd(termv, x1v);
+	      termv = simde_mm_add_pd(termv, x1v);
 	      
-	      x1v = _mm_load_pd(&x1[2]);
-	      x2v = _mm_load_pd(&x2[j * 4 + 2]);
-	      dv   = _mm_load_pd(&diagptable[j * 4 + 2]);
+	      x1v = simde_mm_load_pd(&x1[2]);
+	      x2v = simde_mm_load_pd(&x2[j * 4 + 2]);
+	      dv   = simde_mm_load_pd(&diagptable[j * 4 + 2]);
 	      
-	      x1v = _mm_mul_pd(x1v, x2v);
-	      x1v = _mm_mul_pd(x1v, dv);
+	      x1v = simde_mm_mul_pd(x1v, x2v);
+	      x1v = simde_mm_mul_pd(x1v, dv);
 	      
-	      termv = _mm_add_pd(termv, x1v);
+	      termv = simde_mm_add_pd(termv, x1v);
 	    }
 	  
-	  _mm_store_pd(t, termv);
+	  simde_mm_store_pd(t, termv);
 	  
 	  
 	
@@ -1940,37 +1904,37 @@
 	{
 
 	  double t[2] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-	  __m128d termv, x1v, x2v, dv;
+	  simde__m128d termv, x1v, x2v, dv;
 
 	  	 	  	  
 	  x1 = &x1_start[16 * i];
 	  x2 = &x2_start[16 * i];	  	  
 	
 	
-	  termv = _mm_set1_pd(0.0);	  	 
+	  termv = simde_mm_set1_pd(0.0);	  	 
 	  
 	  for(j = 0; j < 4; j++)
 	    {
-	      x1v = _mm_load_pd(&x1[j * 4]);
-	      x2v = _mm_load_pd(&x2[j * 4]);
-	      dv   = _mm_load_pd(&diagptable[j * 4]);
+	      x1v = simde_mm_load_pd(&x1[j * 4]);
+	      x2v = simde_mm_load_pd(&x2[j * 4]);
+	      dv   = simde_mm_load_pd(&diagptable[j * 4]);
 	      
-	      x1v = _mm_mul_pd(x1v, x2v);
-	      x1v = _mm_mul_pd(x1v, dv);
+	      x1v = simde_mm_mul_pd(x1v, x2v);
+	      x1v = simde_mm_mul_pd(x1v, dv);
 	      
-	      termv = _mm_add_pd(termv, x1v);
+	      termv = simde_mm_add_pd(termv, x1v);
 	      
-	      x1v = _mm_load_pd(&x1[j * 4 + 2]);
-	      x2v = _mm_load_pd(&x2[j * 4 + 2]);
-	      dv   = _mm_load_pd(&diagptable[j * 4 + 2]);
+	      x1v = simde_mm_load_pd(&x1[j * 4 + 2]);
+	      x2v = simde_mm_load_pd(&x2[j * 4 + 2]);
+	      dv   = simde_mm_load_pd(&diagptable[j * 4 + 2]);
 	      
-	      x1v = _mm_mul_pd(x1v, x2v);
-	      x1v = _mm_mul_pd(x1v, dv);
+	      x1v = simde_mm_mul_pd(x1v, x2v);
+	      x1v = simde_mm_mul_pd(x1v, dv);
 	      
-	      termv = _mm_add_pd(termv, x1v);
+	      termv = simde_mm_add_pd(termv, x1v);
 	    }
 	  
-	  _mm_store_pd(t, termv);
+	  simde_mm_store_pd(t, termv);
 
 	  
 	    term = LOG(0.25 * FABS(t[0] + t[1]));
@@ -2000,7 +1964,7 @@
 	{	
 
 	  double t[2] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-	  __m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
+	  simde__m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
 
 	  x1 = &(tipVector[4 * tipX1[i]]);
 	  x2 = &x2_start[4 * i];
@@ -2008,22 +1972,22 @@
 	  diagptable = &diagptable_start[4 * cptr[i]];
 	  
 	    	  
-	  x1v1 =  _mm_load_pd(&x1[0]);
-	  x1v2 =  _mm_load_pd(&x1[2]);
-	  x2v1 =  _mm_load_pd(&x2[0]);
-	  x2v2 =  _mm_load_pd(&x2[2]);
-	  dv1  =  _mm_load_pd(&diagptable[0]);
-	  dv2  =  _mm_load_pd(&diagptable[2]);
-	  
-	  x1v1 = _mm_mul_pd(x1v1, x2v1);
-	  x1v1 = _mm_mul_pd(x1v1, dv1);
+	  x1v1 =  simde_mm_load_pd(&x1[0]);
+	  x1v2 =  simde_mm_load_pd(&x1[2]);
+	  x2v1 =  simde_mm_load_pd(&x2[0]);
+	  x2v2 =  simde_mm_load_pd(&x2[2]);
+	  dv1  =  simde_mm_load_pd(&diagptable[0]);
+	  dv2  =  simde_mm_load_pd(&diagptable[2]);
+	  
+	  x1v1 = simde_mm_mul_pd(x1v1, x2v1);
+	  x1v1 = simde_mm_mul_pd(x1v1, dv1);
 	  
-	  x1v2 = _mm_mul_pd(x1v2, x2v2);
-	  x1v2 = _mm_mul_pd(x1v2, dv2);
+	  x1v2 = simde_mm_mul_pd(x1v2, x2v2);
+	  x1v2 = simde_mm_mul_pd(x1v2, dv2);
 	  
-	  x1v1 = _mm_add_pd(x1v1, x1v2);
+	  x1v1 = simde_mm_add_pd(x1v1, x1v2);
 	  
-	  _mm_store_pd(t, x1v1);
+	  simde_mm_store_pd(t, x1v1);
 	  
 	  
 	  term = LOG(FABS(t[0] + t[1]));
@@ -2038,7 +2002,7 @@
 	{ 
 
 	  double t[2] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-	   __m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
+	   simde__m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
 
 	  x1 = &x1_start[4 * i];
 	  x2 = &x2_start[4 * i];
@@ -2046,22 +2010,22 @@
 	  diagptable = &diagptable_start[4 * cptr[i]];	
 	  
   
-	  x1v1 =  _mm_load_pd(&x1[0]);
-	  x1v2 =  _mm_load_pd(&x1[2]);
-	  x2v1 =  _mm_load_pd(&x2[0]);
-	  x2v2 =  _mm_load_pd(&x2[2]);
-	  dv1  =  _mm_load_pd(&diagptable[0]);
-	  dv2  =  _mm_load_pd(&diagptable[2]);
-	  
-	  x1v1 = _mm_mul_pd(x1v1, x2v1);
-	  x1v1 = _mm_mul_pd(x1v1, dv1);
+	  x1v1 =  simde_mm_load_pd(&x1[0]);
+	  x1v2 =  simde_mm_load_pd(&x1[2]);
+	  x2v1 =  simde_mm_load_pd(&x2[0]);
+	  x2v2 =  simde_mm_load_pd(&x2[2]);
+	  dv1  =  simde_mm_load_pd(&diagptable[0]);
+	  dv2  =  simde_mm_load_pd(&diagptable[2]);
+	  
+	  x1v1 = simde_mm_mul_pd(x1v1, x2v1);
+	  x1v1 = simde_mm_mul_pd(x1v1, dv1);
 	  
-	  x1v2 = _mm_mul_pd(x1v2, x2v2);
-	  x1v2 = _mm_mul_pd(x1v2, dv2);
+	  x1v2 = simde_mm_mul_pd(x1v2, x2v2);
+	  x1v2 = simde_mm_mul_pd(x1v2, dv2);
 	  
-	  x1v1 = _mm_add_pd(x1v1, x1v2);
+	  x1v1 = simde_mm_add_pd(x1v1, x1v2);
 	  
-	  _mm_store_pd(t, x1v1);
+	  simde_mm_store_pd(t, x1v1);
 	  
 	 
 	  term = LOG(FABS(t[0] + t[1]));
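
[Editor's note, not part of the patch; patch(1) skips text between file
sections, so this commentary does not affect application. Every hunk above
follows one mechanical rule: prefix the Intel type and intrinsic names with
"simde" (__m128d -> simde__m128d, _mm_hadd_pd -> simde_mm_hadd_pd, and so
on) so SIMDe can emit the native instruction on x86 and a portable fallback
everywhere else. Below is a minimal standalone sketch of the
load/multiply/hadd/storel idiom the evaluate() hunks rewrite. It assumes a
system-wide SIMDe install (<simde/x86/sse3.h>) rather than the copy this
patch bundles under debian/include, and it uses the unaligned
simde_mm_loadu_pd so the example needs no aligned allocation, unlike the
BYTE_ALIGNMENT buffers in ExaML itself.]

#include <stdio.h>
#include <simde/x86/sse3.h>          /* assumed install location */

/* Dot product of two length-n arrays (n even), mirroring the inner
   loops rewritten above: accumulate pairwise products in a 2-lane
   vector, then reduce with a horizontal add. */
static double dot_pd(const double *a, const double *b, int n)
{
  simde__m128d acc = simde_mm_setzero_pd();
  int l;

  for (l = 0; l < n; l += 2)
    acc = simde_mm_add_pd(acc,
                          simde_mm_mul_pd(simde_mm_loadu_pd(&a[l]),
                                          simde_mm_loadu_pd(&b[l])));

  acc = simde_mm_hadd_pd(acc, acc);  /* horizontal add (SSE3) */

  double term;
  simde_mm_storel_pd(&term, acc);    /* extract the low lane */
  return term;
}

int main(void)
{
  double a[4] = { 1.0, 2.0, 3.0, 4.0 };
  double b[4] = { 5.0, 6.0, 7.0, 8.0 };
  printf("%f\n", dot_pd(a, b, 4));   /* prints 70.000000 */
  return 0;
}
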
--- examl.orig/examl/evaluatePartialGenericSpecial.c
+++ examl/examl/evaluatePartialGenericSpecial.c
@@ -40,14 +40,9 @@
 #include <string.h>
 #include "axml.h"
 
-#ifdef __SIM_SSE3
-#include <xmmintrin.h>
-#include <pmmintrin.h>
-#endif
-
+#include "../debian/include/simde/x86/sse3.h"
 
-
-#if defined(_OPTIMIZED_FUNCTIONS) && !defined(__MIC_NATIVE)
+#if !defined(__MIC_NATIVE)
 static inline void computeVectorGTRCATPROT(double *lVector, int *eVector, double ki, int i, double qz, double rz,
 					   traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, 
 					   unsigned  char **yVector, int mxtips);
@@ -85,175 +80,6 @@
 					   int w, double *EIGN, double *EI, double *EV,
 					   double *tipVector, unsigned  char **yVector, 
 					   int branchReference, int mxtips);
-
-
-#else
-
-static inline void computeVectorCAT_FLEX(double *lVector, int *eVector, double ki, int i, double qz, double rz,
-					 traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, 
-					 unsigned char **yVector, int mxtips, const int states)
-{       
-  double  
-    *d1 =    (double *)malloc(sizeof(double) * states), 
-    *d2 =    (double *)malloc(sizeof(double) * states),  
-    *x1px2 = (double *)malloc(sizeof(double) * states), 
-    ump_x1, 
-    ump_x2,    
-    lz1, 
-    lz2,
-    *x1, 
-    *x2, 
-    *x3;
-  
-  int 
-    scale,
-    j, 
-    k,
-    pNumber = ti->pNumber,
-    rNumber = ti->rNumber,
-    qNumber = ti->qNumber;
- 
-  x3  = &lVector[states * (pNumber  - mxtips)];  
- 
-  switch(ti->tipCase)
-    {
-    case TIP_TIP:     
-      x1 = &(tipVector[states * yVector[qNumber][i]]);
-      x2 = &(tipVector[states * yVector[rNumber][i]]);    
-      break;
-    case TIP_INNER:     
-      x1 = &(tipVector[states * yVector[qNumber][i]]);
-      x2 = &(lVector[states * (rNumber - mxtips)]);           
-      break;
-    case INNER_INNER:            
-      x1 = &(lVector[states * (qNumber - mxtips)]);
-      x2 = &(lVector[states * (rNumber - mxtips)]);     
-      break;
-    default:
-      assert(0);
-    }
-     
-  lz1 = qz * ki;  
-  lz2 = rz * ki;
-  
-  d1[0] = x1[0];
-  d2[0] = x2[0];
-
-
-  for(j = 1; j < states; j++)
-    {
-      d1[j] = x1[j] * EXP(EIGN[j] * lz1);
-      d2[j] = x2[j] * EXP(EIGN[j] * lz2);	    
-    }
- 
- 
-  for(j = 0; j < states; j++)
-    {         
-      ump_x1 = 0.0;
-      ump_x2 = 0.0;
-
-      for(k = 0; k < states; k++)
-	{
-	  ump_x1 += d1[k] * EI[j * states + k];
-	  ump_x2 += d2[k] * EI[j * states + k];
-	}
-      
-      x1px2[j] = ump_x1 * ump_x2;
-    }
-  
-  for(j = 0; j < states; j++)
-    x3[j] = 0.0;
-
-  for(j = 0; j < states; j++)          
-    for(k = 0; k < states; k++)	
-      x3[k] +=  x1px2[j] *  EV[states * j + k];	   
-      
-  scale = 1;
-  for(j = 0; scale && (j < states); j++)
-    scale = ((x3[j] < minlikelihood) && (x3[j] > minusminlikelihood));
-  
-  if(scale)
-    {
-      for(j = 0; j < states; j++)
-	x3[j] *= twotothe256;       
-      *eVector = *eVector + 1;
-    }	              
-
-  free(d1);
-  free(d2);
-  free(x1px2);
-       
-  return;
-}
-
-
-static double evaluatePartialCAT_FLEX(int i, double ki, int counter,  traversalInfo *ti, double qz,
-				      int w, double *EIGN, double *EI, double *EV,
-				      double *tipVector, unsigned  char **yVector, 
-				      int branchReference, int mxtips, const int states)
-{
-  int 
-    scale = 0, 
-    k;
-  
-  double 
-    *lVector = (double *)malloc_aligned(sizeof(double) * states * mxtips),
-    *d = (double *)malloc_aligned(sizeof(double) * states),
-    lz, 
-    term, 
-    *x1, 
-    *x2; 
-
-  traversalInfo 
-    *trav = &ti[0];
- 
-  assert(isTip(trav->pNumber, mxtips));
-     
-  x1 = &(tipVector[states *  yVector[trav->pNumber][i]]);   
-
-  for(k = 1; k < counter; k++)    
-    {
-      double 
-	qz = ti[k].qz[branchReference],
-	rz = ti[k].rz[branchReference];
-      
-      qz = (qz > zmin) ? log(qz) : log(zmin);
-      rz = (rz > zmin) ? log(rz) : log(zmin);
-
-      computeVectorCAT_FLEX(lVector, &scale, ki, i, qz, rz, &ti[k], 
-			    EIGN, EI, EV, 
-			    tipVector, yVector, mxtips, states);       
-    }
-   
-  x2 = &lVector[states * (trav->qNumber - mxtips)]; 
-
-  assert(0 <=  (trav->qNumber - mxtips) && (trav->qNumber - mxtips) < mxtips);  
-       
-  if(qz < zmin) 
-    lz = zmin;
-  lz  = log(qz); 
-  lz *= ki;  
-  
-  d[0] = 1.0;
-
-  for(k = 1; k < states; k++)
-    d[k] = EXP (EIGN[k] * lz);
-  
-  term = 0.0;
-
-  for(k = 0; k < states; k++) 
-    term += x1[k] * x2[k] * d[k];       
-
-  term = LOG(FABS(term)) + (scale * LOG(minlikelihood));   
-
-  term = term * w;
-
-  free(lVector);  
-  free(d);
-
-  return  term;
-}
-
 #endif
 
 double evaluatePartialGeneric (tree *tr, int i, double ki, int _model)
@@ -274,25 +100,7 @@
   else
     branchReference = 0;
 
-#ifndef _OPTIMIZED_FUNCTIONS
-  if(tr->rateHetModel == CAT)
-    result = evaluatePartialCAT_FLEX(index, ki, tr->td[0].count, tr->td[0].ti, tr->td[0].ti[0].qz[branchReference], 
-				     tr->partitionData[_model].wgt[index],
-				     tr->partitionData[_model].EIGN, 
-				     tr->partitionData[_model].EI, 
-				     tr->partitionData[_model].EV,
-				     tr->partitionData[_model].tipVector,
-				     tr->partitionData[_model].yVector, branchReference, tr->mxtips, states);
-  else
-    /* 
-       the per-site site likelihood function should only be called for the CAT model
-       under the GAMMA model this is required only for estimating per-site protein models 
-       which has however been removed in this version of the code
-    */
-    assert(0); 
-  
- 
-#elif defined(__MIC_NATIVE)
+#if defined(__MIC_NATIVE)
 if (tr->rateHetModel == CAT)
     result = evaluatePartialCAT_FLEX(index, ki, tr->td[0].count, tr->td[0].ti, tr->td[0].ti[0].qz[branchReference],
                      tr->partitionData[_model].wgt[index],
@@ -357,13 +165,9 @@
       assert(0);
     }
   #endif
- 
-
   return result;
 }
 
-#ifdef _OPTIMIZED_FUNCTIONS
-
 
 static inline void computeVectorGTRCAT_BINARY(double *lVector, int *eVector, double ki, int i, double qz, double rz,
 					      traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, 
@@ -548,46 +352,46 @@
 
     for(l = 0; l < 20; l+=2)
       {
-	__m128d d1v = _mm_mul_pd(_mm_load_pd(&x1[l]), _mm_load_pd(&e1[l]));
-	__m128d d2v = _mm_mul_pd(_mm_load_pd(&x2[l]), _mm_load_pd(&e2[l]));
+	simde__m128d d1v = simde_mm_mul_pd(simde_mm_load_pd(&x1[l]), simde_mm_load_pd(&e1[l]));
+	simde__m128d d2v = simde_mm_mul_pd(simde_mm_load_pd(&x2[l]), simde_mm_load_pd(&e2[l]));
 	
-	_mm_store_pd(&d1[l], d1v);
-	_mm_store_pd(&d2[l], d2v);	
+	simde_mm_store_pd(&d1[l], d1v);
+	simde_mm_store_pd(&d2[l], d2v);	
       }
 
-    __m128d zero = _mm_setzero_pd();
+    simde__m128d zero = simde_mm_setzero_pd();
 
     for(l = 0; l < 20; l+=2)
-      _mm_store_pd(&x3[l], zero);
+      simde_mm_store_pd(&x3[l], zero);
                 
     for(l = 0; l < 20; l++)
       { 	      
 	double *ev = &EV[l * 20];
-	__m128d ump_x1v = _mm_setzero_pd();
-	__m128d ump_x2v = _mm_setzero_pd();
-	__m128d x1px2v;
+	simde__m128d ump_x1v = simde_mm_setzero_pd();
+	simde__m128d ump_x2v = simde_mm_setzero_pd();
+	simde__m128d x1px2v;
 
 	for(k = 0; k < 20; k+=2)
 	  {       
-	    __m128d eiv = _mm_load_pd(&EI[20 * l + k]);
-	    __m128d d1v = _mm_load_pd(&d1[k]);
-	    __m128d d2v = _mm_load_pd(&d2[k]);
-	    ump_x1v = _mm_add_pd(ump_x1v, _mm_mul_pd(d1v, eiv));
-	    ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(d2v, eiv));	  
+	    simde__m128d eiv = simde_mm_load_pd(&EI[20 * l + k]);
+	    simde__m128d d1v = simde_mm_load_pd(&d1[k]);
+	    simde__m128d d2v = simde_mm_load_pd(&d2[k]);
+	    ump_x1v = simde_mm_add_pd(ump_x1v, simde_mm_mul_pd(d1v, eiv));
+	    ump_x2v = simde_mm_add_pd(ump_x2v, simde_mm_mul_pd(d2v, eiv));	  
 	  }
 
-	ump_x1v = _mm_hadd_pd(ump_x1v, ump_x1v);
-	ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
+	ump_x1v = simde_mm_hadd_pd(ump_x1v, ump_x1v);
+	ump_x2v = simde_mm_hadd_pd(ump_x2v, ump_x2v);
 
-	x1px2v = _mm_mul_pd(ump_x1v, ump_x2v);
+	x1px2v = simde_mm_mul_pd(ump_x1v, ump_x2v);
 
 	for(k = 0; k < 20; k+=2)
 	  {
-	    __m128d ex3v = _mm_load_pd(&x3[k]);
-	    __m128d EVV  = _mm_load_pd(&ev[k]);
-	    ex3v = _mm_add_pd(ex3v, _mm_mul_pd(x1px2v, EVV));
+	    simde__m128d ex3v = simde_mm_load_pd(&x3[k]);
+	    simde__m128d EVV  = simde_mm_load_pd(&ev[k]);
+	    ex3v = simde_mm_add_pd(ex3v, simde_mm_mul_pd(x1px2v, EVV));
 	    
-	    _mm_store_pd(&x3[k], ex3v);	   	   
+	    simde_mm_store_pd(&x3[k], ex3v);	   	   
 	  }
       }                      
     
@@ -597,12 +401,12 @@
     
     if(scale)
       {	      
-	__m128d twoto = _mm_set_pd(twotothe256, twotothe256);
+	simde__m128d twoto = simde_mm_set_pd(twotothe256, twotothe256);
 
 	for(l = 0; l < 20; l+=2)
 	  {
-	    __m128d ex3v = _mm_mul_pd(_mm_load_pd(&x3[l]),twoto);
-	    _mm_store_pd(&x3[l], ex3v);	
+	    simde__m128d ex3v = simde_mm_mul_pd(simde_mm_load_pd(&x3[l]),twoto);
+	    simde_mm_store_pd(&x3[l], ex3v);	
 	  }
  	
 
@@ -766,46 +570,46 @@
 
 	for(l = 0; l < 20; l+=2)
 	  {
-	    __m128d d1v = _mm_mul_pd(_mm_load_pd(&x1[20 * index1[j] + l]), _mm_load_pd(&e1[l]));
-	    __m128d d2v = _mm_mul_pd(_mm_load_pd(&x2[20 * index2[j] + l]), _mm_load_pd(&e2[l]));
+	    simde__m128d d1v = simde_mm_mul_pd(simde_mm_load_pd(&x1[20 * index1[j] + l]), simde_mm_load_pd(&e1[l]));
+	    simde__m128d d2v = simde_mm_mul_pd(simde_mm_load_pd(&x2[20 * index2[j] + l]), simde_mm_load_pd(&e2[l]));
 	    
-	    _mm_store_pd(&d1[l], d1v);
-	    _mm_store_pd(&d2[l], d2v);	
+	    simde_mm_store_pd(&d1[l], d1v);
+	    simde_mm_store_pd(&d2[l], d2v);	
 	  }
 
-	__m128d zero = _mm_setzero_pd();
+	simde__m128d zero = simde_mm_setzero_pd();
 
 	for(l = 0; l < 20; l+=2)
-	  _mm_store_pd(&x3[j * 20 + l], zero);
+	  simde_mm_store_pd(&x3[j * 20 + l], zero);
                 
 	for(l = 0; l < 20; l++)
 	  { 	      
 	    double *ev = &EV[l * 20];
-	    __m128d ump_x1v = _mm_setzero_pd();
-	    __m128d ump_x2v = _mm_setzero_pd();
-	    __m128d x1px2v;
+	    simde__m128d ump_x1v = simde_mm_setzero_pd();
+	    simde__m128d ump_x2v = simde_mm_setzero_pd();
+	    simde__m128d x1px2v;
 	    
 	    for(k = 0; k < 20; k+=2)
 	      {       
-		__m128d eiv = _mm_load_pd(&EI[20 * l + k]);
-		__m128d d1v = _mm_load_pd(&d1[k]);
-		__m128d d2v = _mm_load_pd(&d2[k]);
-		ump_x1v = _mm_add_pd(ump_x1v, _mm_mul_pd(d1v, eiv));
-		ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(d2v, eiv));	  
+		simde__m128d eiv = simde_mm_load_pd(&EI[20 * l + k]);
+		simde__m128d d1v = simde_mm_load_pd(&d1[k]);
+		simde__m128d d2v = simde_mm_load_pd(&d2[k]);
+		ump_x1v = simde_mm_add_pd(ump_x1v, simde_mm_mul_pd(d1v, eiv));
+		ump_x2v = simde_mm_add_pd(ump_x2v, simde_mm_mul_pd(d2v, eiv));	  
 	      }
 
-	    ump_x1v = _mm_hadd_pd(ump_x1v, ump_x1v);
-	    ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
+	    ump_x1v = simde_mm_hadd_pd(ump_x1v, ump_x1v);
+	    ump_x2v = simde_mm_hadd_pd(ump_x2v, ump_x2v);
 
-	    x1px2v = _mm_mul_pd(ump_x1v, ump_x2v);
+	    x1px2v = simde_mm_mul_pd(ump_x1v, ump_x2v);
 
 	    for(k = 0; k < 20; k+=2)
 	      {
-		__m128d ex3v = _mm_load_pd(&x3[j * 20 + k]);
-		__m128d EVV  = _mm_load_pd(&ev[k]);
-		ex3v = _mm_add_pd(ex3v, _mm_mul_pd(x1px2v, EVV));
+		simde__m128d ex3v = simde_mm_load_pd(&x3[j * 20 + k]);
+		simde__m128d EVV  = simde_mm_load_pd(&ev[k]);
+		ex3v = simde_mm_add_pd(ex3v, simde_mm_mul_pd(x1px2v, EVV));
 		
-		_mm_store_pd(&x3[j * 20 + k], ex3v);	   	   
+		simde_mm_store_pd(&x3[j * 20 + k], ex3v);	   	   
 	      }
 	  }        
       }
@@ -816,12 +620,12 @@
     
     if(scale)
       {	      
-	__m128d twoto = _mm_set_pd(twotothe256, twotothe256);
+	simde__m128d twoto = simde_mm_set_pd(twotothe256, twotothe256);
 
 	for(l = 0; l < 80; l+=2)
 	  {
-	    __m128d ex3v = _mm_mul_pd(_mm_load_pd(&x3[l]),twoto);
-	    _mm_store_pd(&x3[l], ex3v);	
+	    simde__m128d ex3v = simde_mm_mul_pd(simde_mm_load_pd(&x3[l]),twoto);
+	    simde_mm_store_pd(&x3[l], ex3v);	
 	  }
 
 	*eVector = *eVector + 1;
@@ -1055,4 +859,3 @@
 
 
 
-#endif
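
[Editor's note, not part of the patch. SIMDe also offers
SIMDE_ENABLE_NATIVE_ALIASES, which exposes the original intrinsic names so
that, in principle, only the includes would need to change. This patch
instead prefixes every call, which is the more robust choice: the aliases
can collide with the real <xmmintrin.h>/<pmmintrin.h> declarations if a
translation unit pulls those in on x86. A short sketch of the alias style,
again assuming a system-wide SIMDe install:]

#define SIMDE_ENABLE_NATIVE_ALIASES
#include <simde/x86/sse3.h>          /* assumed install location */

/* Returns x[0] + x[1], written with the unprefixed names that the
   native aliases expose. */
static double pair_sum(const double *x)
{
  __m128d v = _mm_loadu_pd(x);       /* alias for simde_mm_loadu_pd */
  v = _mm_hadd_pd(v, v);             /* both lanes now hold x[0]+x[1] */

  double s;
  _mm_storel_pd(&s, v);
  return s;
}
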
--- examl.orig/examl/makenewzGenericSpecial.c
+++ examl/examl/makenewzGenericSpecial.c
@@ -43,11 +43,7 @@
 #include <string.h>
 #include "axml.h"
 
-#ifdef __SIM_SSE3
-#include <xmmintrin.h>
-#include <pmmintrin.h>
-/*#include <tmmintrin.h>*/
-#endif
+#include "../debian/include/simde/x86/sse3.h"
 
 /* includes MIC-optimized functions */
 
@@ -164,167 +160,10 @@
    So if we want to do a Newton-Rpahson optimization we only execute this function once in the beginning for each new branch we are considering !
 */
 
-#ifndef _OPTIMIZED_FUNCTIONS
-
-static void sumCAT_FLEX(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
-			unsigned char *tipX1, unsigned char *tipX2, int n, const int states)
-{
-  int 
-    i, 
-    l;
-  
-  double 
-    *sum, 
-    *left, 
-    *right;
-
-  switch(tipCase)
-    {
-      
-      /* switch over possible configurations of the nodes p and q defining the branch */
-
-    case TIP_TIP:
-      for (i = 0; i < n; i++)
-	{
-	  left  = &(tipVector[states * tipX1[i]]);
-	  right = &(tipVector[states * tipX2[i]]);
-	  sum = &sumtable[states * i];
-
-	  /* just multiply the values with each other for each site, note the similarity with evaluate() 
-	     we precompute the product which will remain constant and then just multiply this pre-computed 
-	     product with the changing P matrix exponentaions that depend on the branch lengths */
-
-	  for(l = 0; l < states; l++)
-	    sum[l] = left[l] * right[l];
-	}
-      break;
-    case TIP_INNER:
-
-      /* same as for TIP_TIP only that 
-	 we now access on tip vector and one 
-	 inner vector. 
-
-	 You may also observe that we do not consider using scaling vectors anywhere here.
-
-	 This is because we are interested in the first and second derivatives of the likelihood and 
-	 hence the addition of the log() of the scaling factor times the number of scaling events
-	 becomes obsolete through the derivative */
-
-      for (i = 0; i < n; i++)
-	{
-	  left = &(tipVector[states * tipX1[i]]);
-	  right = &x2[states * i];
-	  sum = &sumtable[states * i];
-
-	  for(l = 0; l < states; l++)
-	    sum[l] = left[l] * right[l];
-	}
-      break;
-    case INNER_INNER:
-      for (i = 0; i < n; i++)
-	{
-	  left  = &x1[states * i];
-	  right = &x2[states * i];
-	  sum = &sumtable[states * i];
-
-	  for(l = 0; l < states; l++)
-	    sum[l] = left[l] * right[l];
-	}
-      break;
-    default:
-      assert(0);
-    }
-}
-
-
-/* same thing for GAMMA models. The only noteworthy thing here is that we have an additional inner loop over the 
-   number of discrete gamma rates. The data access pattern is also different since for tip vector accesses through our 
-   lookup table, we do not distnguish between rates 
-
-   Note the different access pattern in TIP_INNER:
-
-   left = &(tipVector[states * tipX1[i]]);	  
-   right = &(x2[span * i + l * states]);
-
-*/
-
-static void sumGAMMA_FLEX(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
-			  unsigned char *tipX1, unsigned char *tipX2, int n, const int states)
-{
-  int 
-    i, 
-    l, 
-    k;
-  
-  const int 
-    span = 4 * states;
-
-  double 
-    *left, 
-    *right, 
-    *sum;
-
-  switch(tipCase)
-    {
-    case TIP_TIP:
-      for(i = 0; i < n; i++)
-	{
-	  left  = &(tipVector[states * tipX1[i]]);
-	  right = &(tipVector[states * tipX2[i]]);
-
-	  for(l = 0; l < 4; l++)
-	    {
-	      sum = &sumtable[i * span + l * states];
-
-	      for(k = 0; k < states; k++)
-		sum[k] = left[k] * right[k];
-
-	    }
-	}
-      break;
-    case TIP_INNER:
-      for(i = 0; i < n; i++)
-	{
-	  left = &(tipVector[states * tipX1[i]]);
-
-	  for(l = 0; l < 4; l++)
-	    {
-	      right = &(x2[span * i + l * states]);
-	      sum = &sumtable[i * span + l * states];
-
-	      for(k = 0; k < states; k++)
-		sum[k] = left[k] * right[k];
-
-	    }
-	}
-      break;
-    case INNER_INNER:
-      for(i = 0; i < n; i++)
-	{
-	  for(l = 0; l < 4; l++)
-	    {
-	      left  = &(x1[span * i + l * states]);
-	      right = &(x2[span * i + l * states]);
-	      sum   = &(sumtable[i * span + l * states]);
-
-
-	      for(k = 0; k < states; k++)
-		sum[k] = left[k] * right[k];
-	    }
-	}
-      break;
-    default:
-      assert(0);
-    }
-}
-
-#endif
 
 /* optimized functions for branch length optimization */
 
 
-#ifdef _OPTIMIZED_FUNCTIONS
-
 static void sumGAMMA_BINARY(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector,
                             unsigned char *tipX1, unsigned char *tipX2, int n);
 static void coreGTRGAMMA_BINARY(const int upper, double *sumtable,
@@ -382,243 +221,6 @@
 static void coreGTRCATPROT(double *EIGN, double lz, int numberOfCategories, double *rptr, int *cptr, int upper,
 			   int *wgt,  volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double *sumtable);
 
-#endif
-
-
-#ifndef _OPTIMIZED_FUNCTIONS
-
-/* now this is the core function of the newton-Raphson based branch length optimization that actually computes 
-   the first and second derivative of the likelihood given a new proposed branch length lz */
-
-
-static void coreCAT_FLEX(int upper, int numberOfCategories, double *sum,
-			 volatile double *d1, volatile double *d2, int *wgt,
-			 double *rptr, double *EIGN, int *cptr, double lz, const int states)
-{
-  int 
-    i, 
-    l;
-  
-  double 
-    *d, 
-    
-    /* arrays to store stuff we can pre-compute */
-
-    *d_start = (double *)malloc_aligned(numberOfCategories * states * sizeof(double)),
-    *e =(double *)malloc_aligned(states * sizeof(double)),
-    *s = (double *)malloc_aligned(states * sizeof(double)),
-    *dd = (double *)malloc_aligned(states * sizeof(double)),
-    inv_Li, 
-    dlnLidlz, 
-    d2lnLidlz2,
-    dlnLdlz = 0.0,
-    d2lnLdlz2 = 0.0;
-
-  d = d_start;
-  
-  e[0] = 0.0;
-  s[0] = 0.0; 
-  dd[0] = 0.0;
-
-
-  /* we are pre-computing values for computing the first and second derivative of P(lz)
-     since this requires an exponetial that the only thing we really have to derive here */
-
-  for(l = 1; l < states; l++)
-    { 
-      s[l]  = EIGN[l];
-      e[l]  = EIGN[l] * EIGN[l];     
-      dd[l] = s[l] * lz;
-    }
-
-  /* compute the P matrices and their derivatives for 
-     all per-site rate categories */
-
-  for(i = 0; i < numberOfCategories; i++)
-    {      
-      d[states * i] = 1.0;
-      for(l = 1; l < states; l++)
-	d[states * i + l] = EXP(dd[l] * rptr[i]);
-    }
-
-
-  /* now loop over the sites in this partition to obtain the per-site 1st and 2nd derivatives */
-
-  for (i = 0; i < upper; i++)
-    {    
-      double 
-	r = rptr[cptr[i]],
-	wr1 = r * wgt[i],
-	wr2 = r * r * wgt[i];
-
-      /* get the correct p matrix for the rate at the current site i */
-      
-      d = &d_start[states * cptr[i]];      
-          
-      /* this is the likelihood at site i, NOT the log likelihood, we don't need the log 
-	 likelihood to compute derivatives ! */
-
-      inv_Li     = sum[states * i]; 
-      
-      /* those are for storing the first and second derivative of the Likelihood at site i */
-
-      dlnLidlz   = 0.0;
-      d2lnLidlz2 = 0.0;
-
-      /* now multiply the likelihood and the first and second derivative with the 
-	 appropriate derivatives of P(lz) */
-
-      for(l = 1; l < states; l++)
-	{
-	  double
-	    tmpv = d[l] * sum[states * i + l];
-	  
-	  inv_Li     += tmpv;	 	  
-	  dlnLidlz   += tmpv * s[l];       
-	  d2lnLidlz2 += tmpv * e[l];
-	}     
-      
-      /* below we are implementing the other mathematical operations that are required 
-	 to obtain the deirivatives */
-
-      inv_Li = 1.0/ FABS(inv_Li);
-
-      dlnLidlz   *= inv_Li;
-      d2lnLidlz2 *= inv_Li;
-
-      /* under the CAT model, wrptr[] and wr2ptr[] are pre-computed extension sof the weight pointer:
-	 wrptr[i]  = wgt[i] * rptr[cptr[i]].
-	 and 
-	 wr2ptr[i]  = wgt[i] * rptr[cptr[i]] * rptr[cptr[i]] 
-
-	 this is also something that is required for the derivatives because when computing the 
-	 derivative of the exponential() the rate must be multiplied with the 
-	 exponential 
-
-	 wgt is just the pattern site wieght 
-      */
-
-      /* compute the accumulated first and second derivatives of this site */
-
-      dlnLdlz  += wr1 * dlnLidlz;
-      d2lnLdlz2 += wr2 * (d2lnLidlz2 - dlnLidlz * dlnLidlz);
-    }
-
-  /* 
-     set the result values, i.e., the sum of the per-site first and second derivatives of the likelihood function 
-     for this partition. 
-   */
-
-  *d1  = dlnLdlz;
-  *d2 = d2lnLdlz2;
-
-  /* free the temporary arrays */
-
-  free(d_start);
-  free(e);
-  free(s);
-  free(dd);
-}
-
-static void coreGAMMA_FLEX(int upper, double *sumtable, volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, 
-			   double *EIGN, double *gammaRates, double lz, int *wgt, const int states)
-{
-   double  
-    *sum, 
-     diagptable[1024], /* TODO make this dynamic */
-    dlnLdlz = 0.0,
-    d2lnLdlz2 = 0.0,
-    ki, 
-    kisqr,
-    tmp,
-    inv_Li, 
-    dlnLidlz, 
-    d2lnLidlz2;
-
-  int     
-    i, 
-    j, 
-    l;  
-
-  const int 
-    gammaStates = 4 * states;
-
-  /* pre-compute the derivatives of the P matrix for all discrete GAMMA rates */
-
-  for(i = 0; i < 4; i++)
-    {
-      ki = gammaRates[i];
-      kisqr = ki * ki;
-
-      for(l = 1; l < states; l++)
-	{
-	  diagptable[i * gammaStates + l * 4]     = EXP(EIGN[l] * ki * lz);
-	  diagptable[i * gammaStates + l * 4 + 1] = EIGN[l] * ki;
-	  diagptable[i * gammaStates + l * 4 + 2] = EIGN[l] * EIGN[l] * kisqr;
-	}
-    }
-
-  /* loop over sites in this partition */
-
-  for (i = 0; i < upper; i++)
-    {
-      double 
-	r = rptr[cptr[i]],
-	wr1 = r * wgt[i],
-	wr2 = r * r * wgt[i];
-
-      /* access the array with pre-computed values */
-      sum = &sumtable[i * gammaStates];
-
-      /* initial per-site likelihood and 1st and 2nd derivatives */
-
-      inv_Li   = 0.0;
-      dlnLidlz = 0.0;
-      d2lnLidlz2 = 0.0;
-
-      /* loop over discrete GAMMA rates */
-
-      for(j = 0; j < 4; j++)
-	{
-	  inv_Li += sum[j * states];
-
-	  for(l = 1; l < states; l++)
-	    {
-	      inv_Li     += (tmp = diagptable[j * gammaStates + l * 4] * sum[j * states + l]);
-	      dlnLidlz   +=  tmp * diagptable[j * gammaStates + l * 4 + 1];
-	      d2lnLidlz2 +=  tmp * diagptable[j * gammaStates + l * 4 + 2];
-	    }
-	}
-
-      /* finalize derivative computation */
-      /* note that wrptr[] here unlike in CAT above is the 
-	 integer weight vector of the current site 
-
-	 The operations:
-
-	 EIGN[l] * ki;
-	 EIGN[l] * EIGN[l] * kisqr;
-
-	 that are hidden in CAT in wrptr (at least the * ki and * ki *ki part of them 
-	 are done explicitely here 
-
-      */
-
-      inv_Li = 1.0 / FABS(inv_Li);
-
-      dlnLidlz   *= inv_Li;
-      d2lnLidlz2 *= inv_Li;
-
-      dlnLdlz   += wr1 * dlnLidlz;
-      d2lnLdlz2 += wr2 * (d2lnLidlz2 - dlnLidlz * dlnLidlz);
-    }
-
-  *ext_dlnLdlz   = dlnLdlz;
-  *ext_d2lnLdlz2 = d2lnLdlz2;
-  
-}
-
-#endif
 
 /* the function below is called only once at the very beginning of each Newton-Raphson procedure for optimizing barnch lengths.
    It initially invokes an iterative newview call to get a consistent pair of vectors at the left and the right end of the 
@@ -721,15 +323,6 @@
 	  double
 	    *sumBuffer = tr->partitionData[model].sumBuffer + x_offset;
 	 
-#ifndef _OPTIMIZED_FUNCTIONS
-	  assert(!tr->saveMemory);
-	  if(tr->rateHetModel == CAT)
-	    sumCAT_FLEX(tipCase, sumBuffer, x1_start, x2_start, tr->partitionData[model].tipVector, tipX1, tipX2,
-			width, states);
-	  else
-	    sumGAMMA_FLEX(tipCase, sumBuffer, x1_start, x2_start, tr->partitionData[model].tipVector, tipX1, tipX2,
-			  width, states);
-#else
 	  switch(states)
 	    {
 	    case 2:
@@ -834,7 +427,6 @@
 	    default:
 	      assert(0);
 	    }
-#endif
 	}
     }  // for model
   }  // omp parallel region
@@ -977,19 +569,6 @@
 	    dlnLdlz   = 0.0,
 	    d2lnLdlz2 = 0.0;
 
-  #ifndef _OPTIMIZED_FUNCTIONS
-
-	    /* compute first and second derivatives with the slow generic functions */
-
-	    if(tr->rateHetModel == CAT)
-	      coreCAT_FLEX(width, tr->partitionData[model].numberOfCategories, sumBuffer,
-			   &dlnLdlz, &d2lnLdlz2, wgt,
-			   tr->partitionData[model].perSiteRates, tr->partitionData[model].EIGN, rateCategory, lz, states);
-	    else
-	      coreGAMMA_FLEX(width, sumBuffer,
-			     &dlnLdlz, &d2lnLdlz2, tr->partitionData[model].EIGN, tr->partitionData[model].gammaRates, lz,
-			     wgt, states);
-  #else
 	    switch(states)
 	      {
 	      case 2:
@@ -1068,7 +647,6 @@
 	      default:
 		assert(0);
 	      }
-  #endif
 
 	    /* store first and second derivative */
 
@@ -1402,8 +980,6 @@
 
 /* below are, once again the optimized functions */
 
-#ifdef _OPTIMIZED_FUNCTIONS
-
 /**** binary ***/
 static void coreGTRCAT_BINARY(int upper, int numberOfCategories, double *sum,
                               volatile double *d1, volatile double *d2, 
@@ -1497,9 +1073,9 @@
 
   for (i = 0; i < upper; i++)
     { 
-      __m128d a0 = _mm_setzero_pd();
-      __m128d a1 = _mm_setzero_pd();
-      __m128d a2 = _mm_setzero_pd();
+      simde__m128d a0 = simde_mm_setzero_pd();
+      simde__m128d a1 = simde_mm_setzero_pd();
+      simde__m128d a2 = simde_mm_setzero_pd();
 
       sum = &sumtable[i * 8];         
 
@@ -1510,20 +1086,20 @@
             *d1 = &diagptable1[j * 2],
             *d2 = &diagptable2[j * 2];
                          
-          __m128d tmpv = _mm_mul_pd(_mm_load_pd(d0), _mm_load_pd(&sum[j * 2]));
-          a0 = _mm_add_pd(a0, tmpv);
-          a1 = _mm_add_pd(a1, _mm_mul_pd(tmpv, _mm_load_pd(d1)));
-          a2 = _mm_add_pd(a2, _mm_mul_pd(tmpv, _mm_load_pd(d2)));
+          simde__m128d tmpv = simde_mm_mul_pd(simde_mm_load_pd(d0), simde_mm_load_pd(&sum[j * 2]));
+          a0 = simde_mm_add_pd(a0, tmpv);
+          a1 = simde_mm_add_pd(a1, simde_mm_mul_pd(tmpv, simde_mm_load_pd(d1)));
+          a2 = simde_mm_add_pd(a2, simde_mm_mul_pd(tmpv, simde_mm_load_pd(d2)));
                           
         }
 
-      a0 = _mm_hadd_pd(a0, a0);
-      a1 = _mm_hadd_pd(a1, a1);
-      a2 = _mm_hadd_pd(a2, a2);
-
-      _mm_storel_pd(&inv_Li, a0);     
-      _mm_storel_pd(&dlnLidlz, a1);
-      _mm_storel_pd(&d2lnLidlz2, a2); 
+      a0 = simde_mm_hadd_pd(a0, a0);
+      a1 = simde_mm_hadd_pd(a1, a1);
+      a2 = simde_mm_hadd_pd(a2, a2);
+
+      simde_mm_storel_pd(&inv_Li, a0);     
+      simde_mm_storel_pd(&dlnLidlz, a1);
+      simde_mm_storel_pd(&d2lnLidlz2, a2); 
 
       inv_Li = 1.0 / fabs(inv_Li);
      
@@ -1565,7 +1141,7 @@
           sum = &sumtable[i * 8];
 
           for(j = 0; j < 4; j++)
-            _mm_store_pd( &sum[j*2], _mm_mul_pd( _mm_load_pd( &x1[0] ), _mm_load_pd( &x2[0] )));         
+            simde_mm_store_pd( &sum[j*2], simde_mm_mul_pd( simde_mm_load_pd( &x1[0] ), simde_mm_load_pd( &x2[0] )));         
         }
       break;
     case TIP_INNER:
@@ -1576,7 +1152,7 @@
           sum = &sumtable[8 * i];
 
           for(j = 0; j < 4; j++)
-            _mm_store_pd( &sum[j*2], _mm_mul_pd( _mm_load_pd( &x1[0] ), _mm_load_pd( &x2[j * 2] )));
+            simde_mm_store_pd( &sum[j*2], simde_mm_mul_pd( simde_mm_load_pd( &x1[0] ), simde_mm_load_pd( &x2[j * 2] )));
         }
       break;
     case INNER_INNER:
@@ -1587,7 +1163,7 @@
           sum = &sumtable[8 * i];
 
           for(j = 0; j < 4; j++)
-            _mm_store_pd( &sum[j*2], _mm_mul_pd( _mm_load_pd( &x1[j * 2] ), _mm_load_pd( &x2[j * 2] )));
+            simde_mm_store_pd( &sum[j*2], simde_mm_mul_pd( simde_mm_load_pd( &x1[j * 2] ), simde_mm_load_pd( &x2[j * 2] )));
         }
       break;
     default:
@@ -1615,7 +1191,7 @@
           x1 = &(tipVector[2 * tipX1[i]]);
           x2 = &(tipVector[2 * tipX2[i]]);
 
-          _mm_store_pd(&sum[i * 2], _mm_mul_pd( _mm_load_pd(x1), _mm_load_pd(x2)));
+          simde_mm_store_pd(&sum[i * 2], simde_mm_mul_pd( simde_mm_load_pd(x1), simde_mm_load_pd(x2)));
         }
       break;
     case TIP_INNER:
@@ -1624,7 +1200,7 @@
           x1 = &(tipVector[2 * tipX1[i]]);
           x2 = &x2_start[2 * i];
 
-          _mm_store_pd(&sum[i * 2], _mm_mul_pd( _mm_load_pd(x1), _mm_load_pd(x2)));  
+          simde_mm_store_pd(&sum[i * 2], simde_mm_mul_pd( simde_mm_load_pd(x1), simde_mm_load_pd(x2)));  
         }
       break;
     case INNER_INNER:
@@ -1633,7 +1209,7 @@
           x1 = &x1_start[2 * i];
           x2 = &x2_start[2 * i];
 
-          _mm_store_pd(&sum[i * 2], _mm_mul_pd( _mm_load_pd(x1), _mm_load_pd(x2)));   
+          simde_mm_store_pd(&sum[i * 2], simde_mm_mul_pd( simde_mm_load_pd(x1), simde_mm_load_pd(x2)));   
         }
       break;
     default:
@@ -1663,8 +1239,8 @@
 	  x1 = &(tipVector[4 * tipX1[i]]);
 	  x2 = &(tipVector[4 * tipX2[i]]);
 
-	  _mm_store_pd( &sum[i*4 + 0], _mm_mul_pd( _mm_load_pd( &x1[0] ), _mm_load_pd( &x2[0] )));
-	  _mm_store_pd( &sum[i*4 + 2], _mm_mul_pd( _mm_load_pd( &x1[2] ), _mm_load_pd( &x2[2] )));
+	  simde_mm_store_pd( &sum[i*4 + 0], simde_mm_mul_pd( simde_mm_load_pd( &x1[0] ), simde_mm_load_pd( &x2[0] )));
+	  simde_mm_store_pd( &sum[i*4 + 2], simde_mm_mul_pd( simde_mm_load_pd( &x1[2] ), simde_mm_load_pd( &x2[2] )));
 	}
       break;
     case TIP_INNER:
@@ -1679,8 +1255,8 @@
 	      x2_ptr += 4;
 	    }
 
-	  _mm_store_pd( &sum[i*4 + 0], _mm_mul_pd( _mm_load_pd( &x1[0] ), _mm_load_pd( &x2[0] )));
-	  _mm_store_pd( &sum[i*4 + 2], _mm_mul_pd( _mm_load_pd( &x1[2] ), _mm_load_pd( &x2[2] )));
+	  simde_mm_store_pd( &sum[i*4 + 0], simde_mm_mul_pd( simde_mm_load_pd( &x1[0] ), simde_mm_load_pd( &x2[0] )));
+	  simde_mm_store_pd( &sum[i*4 + 2], simde_mm_mul_pd( simde_mm_load_pd( &x1[2] ), simde_mm_load_pd( &x2[2] )));
 	}
       break;
     case INNER_INNER:
@@ -1702,8 +1278,8 @@
 	      x2_ptr += 4;
 	    }
 
-	  _mm_store_pd( &sum[i*4 + 0], _mm_mul_pd( _mm_load_pd( &x1[0] ), _mm_load_pd( &x2[0] )));
-	  _mm_store_pd( &sum[i*4 + 2], _mm_mul_pd( _mm_load_pd( &x1[2] ), _mm_load_pd( &x2[2] )));
+	  simde_mm_store_pd( &sum[i*4 + 0], simde_mm_mul_pd( simde_mm_load_pd( &x1[0] ), simde_mm_load_pd( &x2[0] )));
+	  simde_mm_store_pd( &sum[i*4 + 2], simde_mm_mul_pd( simde_mm_load_pd( &x1[2] ), simde_mm_load_pd( &x2[2] )));
 
 	}    
       break;
@@ -1737,7 +1313,7 @@
 
 	  for(j = 0; j < 4; j++)	    
 	    for(k = 0; k < 4; k+=2)
-	      _mm_store_pd( &sum[j*4 + k], _mm_mul_pd( _mm_load_pd( &x1[k] ), _mm_load_pd( &x2[k] )));
+	      simde_mm_store_pd( &sum[j*4 + k], simde_mm_mul_pd( simde_mm_load_pd( &x1[k] ), simde_mm_load_pd( &x2[k] )));
 	}
       break;
     case TIP_INNER:
@@ -1757,7 +1333,7 @@
 
 	  for(j = 0; j < 4; j++)	    
 	    for(k = 0; k < 4; k+=2)
-	      _mm_store_pd( &sum[j*4 + k], _mm_mul_pd( _mm_load_pd( &x1[k] ), _mm_load_pd( &x2[j * 4 + k] )));
+	      simde_mm_store_pd( &sum[j*4 + k], simde_mm_mul_pd( simde_mm_load_pd( &x1[k] ), simde_mm_load_pd( &x2[j * 4 + k] )));
 	}
       break;
     case INNER_INNER:
@@ -1784,7 +1360,7 @@
 
 	   for(j = 0; j < 4; j++)	    
 	    for(k = 0; k < 4; k+=2)
-	      _mm_store_pd( &sum[j*4 + k], _mm_mul_pd( _mm_load_pd( &x1[j * 4 + k] ), _mm_load_pd( &x2[j * 4 + k] )));
+	      simde_mm_store_pd( &sum[j*4 + k], simde_mm_mul_pd( simde_mm_load_pd( &x1[j * 4 + k] ), simde_mm_load_pd( &x2[j * 4 + k] )));
 	}
       break;
     default:
@@ -1815,7 +1391,7 @@
 
 	  for(j = 0; j < 4; j++)	    
 	    for(k = 0; k < 4; k+=2)
-	      _mm_store_pd( &sum[j*4 + k], _mm_mul_pd( _mm_load_pd( &x1[k] ), _mm_load_pd( &x2[k] )));
+	      simde_mm_store_pd( &sum[j*4 + k], simde_mm_mul_pd( simde_mm_load_pd( &x1[k] ), simde_mm_load_pd( &x2[k] )));
 	}
       break;
     case TIP_INNER:
@@ -1827,7 +1403,7 @@
 
 	  for(j = 0; j < 4; j++)	    
 	    for(k = 0; k < 4; k+=2)
-	      _mm_store_pd( &sum[j*4 + k], _mm_mul_pd( _mm_load_pd( &x1[k] ), _mm_load_pd( &x2[j * 4 + k] )));
+	      simde_mm_store_pd( &sum[j*4 + k], simde_mm_mul_pd( simde_mm_load_pd( &x1[k] ), simde_mm_load_pd( &x2[j * 4 + k] )));
 	}
       break;
     case INNER_INNER:
@@ -1839,7 +1415,7 @@
 
 	   for(j = 0; j < 4; j++)	    
 	    for(k = 0; k < 4; k+=2)
-	      _mm_store_pd( &sum[j*4 + k], _mm_mul_pd( _mm_load_pd( &x1[j * 4 + k] ), _mm_load_pd( &x2[j * 4 + k] )));
+	      simde_mm_store_pd( &sum[j*4 + k], simde_mm_mul_pd( simde_mm_load_pd( &x1[j * 4 + k] ), simde_mm_load_pd( &x2[j * 4 + k] )));
 	}
       break;
     default:
@@ -1864,8 +1440,8 @@
 	  x1 = &(tipVector[4 * tipX1[i]]);
 	  x2 = &(tipVector[4 * tipX2[i]]);
 
-	  _mm_store_pd( &sum[i*4 + 0], _mm_mul_pd( _mm_load_pd( &x1[0] ), _mm_load_pd( &x2[0] )));
-	  _mm_store_pd( &sum[i*4 + 2], _mm_mul_pd( _mm_load_pd( &x1[2] ), _mm_load_pd( &x2[2] )));
+	  simde_mm_store_pd( &sum[i*4 + 0], simde_mm_mul_pd( simde_mm_load_pd( &x1[0] ), simde_mm_load_pd( &x2[0] )));
+	  simde_mm_store_pd( &sum[i*4 + 2], simde_mm_mul_pd( simde_mm_load_pd( &x1[2] ), simde_mm_load_pd( &x2[2] )));
 	}
       break;
     case TIP_INNER:
@@ -1874,8 +1450,8 @@
 	  x1 = &(tipVector[4 * tipX1[i]]);
 	  x2 = &x2_start[4 * i];
 
-	  _mm_store_pd( &sum[i*4 + 0], _mm_mul_pd( _mm_load_pd( &x1[0] ), _mm_load_pd( &x2[0] )));
-	  _mm_store_pd( &sum[i*4 + 2], _mm_mul_pd( _mm_load_pd( &x1[2] ), _mm_load_pd( &x2[2] )));
+	  simde_mm_store_pd( &sum[i*4 + 0], simde_mm_mul_pd( simde_mm_load_pd( &x1[0] ), simde_mm_load_pd( &x2[0] )));
+	  simde_mm_store_pd( &sum[i*4 + 2], simde_mm_mul_pd( simde_mm_load_pd( &x1[2] ), simde_mm_load_pd( &x2[2] )));
 	}
       break;
     case INNER_INNER:
@@ -1884,8 +1460,8 @@
 	  x1 = &x1_start[4 * i];
 	  x2 = &x2_start[4 * i];
 
-	  _mm_store_pd( &sum[i*4 + 0], _mm_mul_pd( _mm_load_pd( &x1[0] ), _mm_load_pd( &x2[0] )));
-	  _mm_store_pd( &sum[i*4 + 2], _mm_mul_pd( _mm_load_pd( &x1[2] ), _mm_load_pd( &x2[2] )));
+	  simde_mm_store_pd( &sum[i*4 + 0], simde_mm_mul_pd( simde_mm_load_pd( &x1[0] ), simde_mm_load_pd( &x2[0] )));
+	  simde_mm_store_pd( &sum[i*4 + 2], simde_mm_mul_pd( simde_mm_load_pd( &x1[2] ), simde_mm_load_pd( &x2[2] )));
 
 	}    
       break;
@@ -1921,9 +1497,9 @@
 
 	      for(k = 0; k < 20; k+=2)
 		{
-		  __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+		  simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[k]), simde_mm_load_pd(&right[k]));
 		  
-		  _mm_store_pd(&sum[k], sumv);		 
+		  simde_mm_store_pd(&sum[k], sumv);		 
 		}
 
 	    }
@@ -1949,9 +1525,9 @@
 
 	      for(k = 0; k < 20; k+=2)
 		{
-		  __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+		  simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[k]), simde_mm_load_pd(&right[k]));
 		  
-		  _mm_store_pd(&sum[k], sumv);		 
+		  simde_mm_store_pd(&sum[k], sumv);		 
 		}
 	    }
 	}
@@ -1983,9 +1559,9 @@
 
 	      for(k = 0; k < 20; k+=2)
 		{
-		  __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+		  simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[k]), simde_mm_load_pd(&right[k]));
 		  
-		  _mm_store_pd(&sum[k], sumv);		 
+		  simde_mm_store_pd(&sum[k], sumv);		 
 		}
 	    }
 	}
@@ -2013,17 +1589,12 @@
 	      right = &(tipVector[l][20 * tipX2[i]]);
 
 	      sum = &sumtable[i * 80 + l * 20];
-#ifdef __SIM_SSE3
 	      for(k = 0; k < 20; k+=2)
 		{
-		  __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+		  simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[k]), simde_mm_load_pd(&right[k]));
 		  
-		  _mm_store_pd(&sum[k], sumv);		 
+		  simde_mm_store_pd(&sum[k], sumv);		 
 		}
-#else
-	      for(k = 0; k < 20; k++)
-		sum[k] = left[k] * right[k];
-#endif
 	    }
 	}
       break;
@@ -2037,17 +1608,12 @@
 	      left = &(tipVector[l][20 * tipX1[i]]);
 	      right = &(x2[80 * i + l * 20]);
 	      sum = &sumtable[i * 80 + l * 20];
-#ifdef __SIM_SSE3
 	      for(k = 0; k < 20; k+=2)
 		{
-		  __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+		  simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[k]), simde_mm_load_pd(&right[k]));
 		  
-		  _mm_store_pd(&sum[k], sumv);		 
+		  simde_mm_store_pd(&sum[k], sumv);		 
 		}
-#else
-	      for(k = 0; k < 20; k++)
-		sum[k] = left[k] * right[k];
-#endif
 	    }
 	}
       break;
@@ -2060,17 +1626,12 @@
 	      right = &(x2[80 * i + l * 20]);
 	      sum   = &(sumtable[i * 80 + l * 20]);
 
-#ifdef __SIM_SSE3
 	      for(k = 0; k < 20; k+=2)
 		{
-		  __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+		  simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[k]), simde_mm_load_pd(&right[k]));
 		  
-		  _mm_store_pd(&sum[k], sumv);		 
+		  simde_mm_store_pd(&sum[k], sumv);		 
 		}
-#else
-	      for(k = 0; k < 20; k++)
-		sum[k] = left[k] * right[k];
-#endif
 	    }
 	}
       break;
@@ -2100,9 +1661,9 @@
 
 	      for(k = 0; k < 20; k+=2)
 		{
-		  __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+		  simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[k]), simde_mm_load_pd(&right[k]));
 		  
-		  _mm_store_pd(&sum[k], sumv);		 
+		  simde_mm_store_pd(&sum[k], sumv);		 
 		}
 
 	    }
@@ -2120,9 +1681,9 @@
 
 	      for(k = 0; k < 20; k+=2)
 		{
-		  __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+		  simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[k]), simde_mm_load_pd(&right[k]));
 		  
-		  _mm_store_pd(&sum[k], sumv);		 
+		  simde_mm_store_pd(&sum[k], sumv);		 
 		}
 
 	    }
@@ -2140,9 +1701,9 @@
 
 	      for(k = 0; k < 20; k+=2)
 		{
-		  __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+		  simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[k]), simde_mm_load_pd(&right[k]));
 		  
-		  _mm_store_pd(&sum[k], sumv);		 
+		  simde_mm_store_pd(&sum[k], sumv);		 
 		}
 	    }
 	}
@@ -2170,9 +1731,9 @@
 
 	  for(l = 0; l < 20; l+=2)
 	    {
-	      __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+	      simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[l]), simde_mm_load_pd(&right[l]));
 	      
-	      _mm_store_pd(&sum[l], sumv);		 
+	      simde_mm_store_pd(&sum[l], sumv);		 
 	    }
 
 	}
@@ -2186,9 +1747,9 @@
 
 	  for(l = 0; l < 20; l+=2)
 	    {
-	      __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+	      simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[l]), simde_mm_load_pd(&right[l]));
 	      
-	      _mm_store_pd(&sum[l], sumv);		 
+	      simde_mm_store_pd(&sum[l], sumv);		 
 	    }
 
 	}
@@ -2202,9 +1763,9 @@
 
 	  for(l = 0; l < 20; l+=2)
 	    {
-	      __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+	      simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[l]), simde_mm_load_pd(&right[l]));
 	      
-	      _mm_store_pd(&sum[l], sumv);		 
+	      simde_mm_store_pd(&sum[l], sumv);		 
 	    }
 
 	}
@@ -2241,9 +1802,9 @@
 
 	  for(l = 0; l < 20; l+=2)
 	    {
-	      __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+	      simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[l]), simde_mm_load_pd(&right[l]));
 	      
-	      _mm_store_pd(&sum[l], sumv);		 
+	      simde_mm_store_pd(&sum[l], sumv);		 
 	    }
 
 	}
@@ -2265,9 +1826,9 @@
 
 	  for(l = 0; l < 20; l+=2)
 	    {
-	      __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+	      simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[l]), simde_mm_load_pd(&right[l]));
 	      
-	      _mm_store_pd(&sum[l], sumv);		 
+	      simde_mm_store_pd(&sum[l], sumv);		 
 	    }
 
 	}
@@ -2295,9 +1856,9 @@
 
 	  for(l = 0; l < 20; l+=2)
 	    {
-	      __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+	      simde__m128d sumv = simde_mm_mul_pd(simde_mm_load_pd(&left[l]), simde_mm_load_pd(&right[l]));
 	      
-	      _mm_store_pd(&sum[l], sumv);		 
+	      simde_mm_store_pd(&sum[l], sumv);		 
 	    }
 	}
       break;
@@ -2346,9 +1907,9 @@
 
   for (i = 0; i < upper; i++)
     { 
-      __m128d a0 = _mm_setzero_pd();
-      __m128d a1 = _mm_setzero_pd();
-      __m128d a2 = _mm_setzero_pd();
+      simde__m128d a0 = simde_mm_setzero_pd();
+      simde__m128d a1 = simde_mm_setzero_pd();
+      simde__m128d a2 = simde_mm_setzero_pd();
       
       
       
@@ -2363,20 +1924,20 @@
   	 	 
 	  for(l = 0; l < 4; l+=2)
 	    {
-	      __m128d tmpv = _mm_mul_pd(_mm_load_pd(&d0[l]), _mm_load_pd(&sum[j * 4 + l]));
-	      a0 = _mm_add_pd(a0, tmpv);
-	      a1 = _mm_add_pd(a1, _mm_mul_pd(tmpv, _mm_load_pd(&d1[l])));
-	      a2 = _mm_add_pd(a2, _mm_mul_pd(tmpv, _mm_load_pd(&d2[l])));
+	      simde__m128d tmpv = simde_mm_mul_pd(simde_mm_load_pd(&d0[l]), simde_mm_load_pd(&sum[j * 4 + l]));
+	      a0 = simde_mm_add_pd(a0, tmpv);
+	      a1 = simde_mm_add_pd(a1, simde_mm_mul_pd(tmpv, simde_mm_load_pd(&d1[l])));
+	      a2 = simde_mm_add_pd(a2, simde_mm_mul_pd(tmpv, simde_mm_load_pd(&d2[l])));
 	    }	 	  
 	}
 
-      a0 = _mm_hadd_pd(a0, a0);
-      a1 = _mm_hadd_pd(a1, a1);
-      a2 = _mm_hadd_pd(a2, a2);
-
-      _mm_storel_pd(&inv_Li, a0);     
-      _mm_storel_pd(&dlnLidlz, a1);
-      _mm_storel_pd(&d2lnLidlz2, a2); 
+      a0 = simde_mm_hadd_pd(a0, a0);
+      a1 = simde_mm_hadd_pd(a1, a1);
+      a2 = simde_mm_hadd_pd(a2, a2);
+
+      simde_mm_storel_pd(&inv_Li, a0);     
+      simde_mm_storel_pd(&dlnLidlz, a1);
+      simde_mm_storel_pd(&d2lnLidlz2, a2); 
 
       inv_Li = 1.0 / FABS(inv_Li);
      
@@ -2408,7 +1969,7 @@
   double e2[4] __attribute__ ((aligned (BYTE_ALIGNMENT)));
   double dd1, dd2, dd3;
 
-  __m128d
+  simde__m128d
     e1v[2],
     e2v[2];
 
@@ -2421,11 +1982,11 @@
   e1[3] = EIGN[3];
   e2[3] = EIGN[3] * EIGN[3];
 
-  e1v[0]= _mm_load_pd(&e1[0]);
-  e1v[1]= _mm_load_pd(&e1[2]);
+  e1v[0]= simde_mm_load_pd(&e1[0]);
+  e1v[1]= simde_mm_load_pd(&e1[2]);
 
-  e2v[0]= _mm_load_pd(&e2[0]);
-  e2v[1]= _mm_load_pd(&e2[2]);
+  e2v[0]= simde_mm_load_pd(&e2[0]);
+  e2v[1]= simde_mm_load_pd(&e2[2]);
 
   d = d_start = (double *)malloc_aligned(numberOfCategories * 4 * sizeof(double));
 
@@ -2453,22 +2014,22 @@
 
       d = &d_start[4 * cptr[i]];  
       
-      __m128d tmp_0v =_mm_mul_pd(_mm_load_pd(&d[0]),_mm_load_pd(&s[0]));
-      __m128d tmp_1v =_mm_mul_pd(_mm_load_pd(&d[2]),_mm_load_pd(&s[2]));
+      simde__m128d tmp_0v =simde_mm_mul_pd(simde_mm_load_pd(&d[0]),simde_mm_load_pd(&s[0]));
+      simde__m128d tmp_1v =simde_mm_mul_pd(simde_mm_load_pd(&d[2]),simde_mm_load_pd(&s[2]));
 
-      __m128d inv_Liv    = _mm_add_pd(tmp_0v, tmp_1v);      
+      simde__m128d inv_Liv    = simde_mm_add_pd(tmp_0v, tmp_1v);      
             	  
-      __m128d dlnLidlzv   = _mm_add_pd(_mm_mul_pd(tmp_0v, e1v[0]), _mm_mul_pd(tmp_1v, e1v[1]));	  
-      __m128d d2lnLidlz2v = _mm_add_pd(_mm_mul_pd(tmp_0v, e2v[0]), _mm_mul_pd(tmp_1v, e2v[1]));
+      simde__m128d dlnLidlzv   = simde_mm_add_pd(simde_mm_mul_pd(tmp_0v, e1v[0]), simde_mm_mul_pd(tmp_1v, e1v[1]));	  
+      simde__m128d d2lnLidlz2v = simde_mm_add_pd(simde_mm_mul_pd(tmp_0v, e2v[0]), simde_mm_mul_pd(tmp_1v, e2v[1]));
 
 
-      inv_Liv   = _mm_hadd_pd(inv_Liv, inv_Liv);
-      dlnLidlzv = _mm_hadd_pd(dlnLidlzv, dlnLidlzv);
-      d2lnLidlz2v = _mm_hadd_pd(d2lnLidlz2v, d2lnLidlz2v);                 
+      inv_Liv   = simde_mm_hadd_pd(inv_Liv, inv_Liv);
+      dlnLidlzv = simde_mm_hadd_pd(dlnLidlzv, dlnLidlzv);
+      d2lnLidlz2v = simde_mm_hadd_pd(d2lnLidlz2v, d2lnLidlz2v);                 
  
-      _mm_storel_pd(&inv_Li, inv_Liv);     
-      _mm_storel_pd(&dlnLidlz, dlnLidlzv);                 
-      _mm_storel_pd(&d2lnLidlz2, d2lnLidlz2v);      
+      simde_mm_storel_pd(&inv_Li, inv_Liv);     
+      simde_mm_storel_pd(&dlnLidlz, dlnLidlzv);                 
+      simde_mm_storel_pd(&d2lnLidlz2, d2lnLidlz2v);      
 
       inv_Li = 1.0/FABS(inv_Li);
 
@@ -2536,26 +2097,26 @@
 	    *d1 = &diagptable1[j * 20],
 	    *d2 = &diagptable2[j * 20];
 
-	  __m128d 
-	    a0 = _mm_setzero_pd(),
-	    a1 = _mm_setzero_pd(),
-	    a2 = _mm_setzero_pd();
+	  simde__m128d 
+	    a0 = simde_mm_setzero_pd(),
+	    a1 = simde_mm_setzero_pd(),
+	    a2 = simde_mm_setzero_pd();
 	  
 	  for(l = 0; l < 20; l+=2)
 	    {
-	      __m128d tmpv = _mm_mul_pd(_mm_load_pd(&d0[l]), _mm_load_pd(&sum[j * 20 +l]));
-	      a0 = _mm_add_pd(a0, tmpv);
-	      a1 = _mm_add_pd(a1, _mm_mul_pd(tmpv, _mm_load_pd(&d1[l])));
-	      a2 = _mm_add_pd(a2, _mm_mul_pd(tmpv, _mm_load_pd(&d2[l])));
+	      simde__m128d tmpv = simde_mm_mul_pd(simde_mm_load_pd(&d0[l]), simde_mm_load_pd(&sum[j * 20 +l]));
+	      a0 = simde_mm_add_pd(a0, tmpv);
+	      a1 = simde_mm_add_pd(a1, simde_mm_mul_pd(tmpv, simde_mm_load_pd(&d1[l])));
+	      a2 = simde_mm_add_pd(a2, simde_mm_mul_pd(tmpv, simde_mm_load_pd(&d2[l])));
 	    }
 
-	  a0 = _mm_hadd_pd(a0, a0);
-	  a1 = _mm_hadd_pd(a1, a1);
-	  a2 = _mm_hadd_pd(a2, a2);
-
-	  _mm_storel_pd(&l0, a0);
-	  _mm_storel_pd(&l1, a1);
-	  _mm_storel_pd(&l2, a2);
+	  a0 = simde_mm_hadd_pd(a0, a0);
+	  a1 = simde_mm_hadd_pd(a1, a1);
+	  a2 = simde_mm_hadd_pd(a2, a2);
+
+	  simde_mm_storel_pd(&l0, a0);
+	  simde_mm_storel_pd(&l1, a1);
+	  simde_mm_storel_pd(&l2, a2);
 	  
 	  inv_Li     += weights[j] * l0;
 	  dlnLidlz   += weights[j] * l1;
@@ -2610,9 +2171,9 @@
 
   for (i = 0; i < upper; i++)
     { 
-      __m128d a0 = _mm_setzero_pd();
-      __m128d a1 = _mm_setzero_pd();
-      __m128d a2 = _mm_setzero_pd();
+      simde__m128d a0 = simde_mm_setzero_pd();
+      simde__m128d a1 = simde_mm_setzero_pd();
+      simde__m128d a2 = simde_mm_setzero_pd();
 
      
       sum = &sumtable[i * 80];         
@@ -2626,20 +2187,20 @@
   	 	 
 	  for(l = 0; l < 20; l+=2)
 	    {
-	      __m128d tmpv = _mm_mul_pd(_mm_load_pd(&d0[l]), _mm_load_pd(&sum[j * 20 +l]));
-	      a0 = _mm_add_pd(a0, tmpv);
-	      a1 = _mm_add_pd(a1, _mm_mul_pd(tmpv, _mm_load_pd(&d1[l])));
-	      a2 = _mm_add_pd(a2, _mm_mul_pd(tmpv, _mm_load_pd(&d2[l])));
+	      simde__m128d tmpv = simde_mm_mul_pd(simde_mm_load_pd(&d0[l]), simde_mm_load_pd(&sum[j * 20 +l]));
+	      a0 = simde_mm_add_pd(a0, tmpv);
+	      a1 = simde_mm_add_pd(a1, simde_mm_mul_pd(tmpv, simde_mm_load_pd(&d1[l])));
+	      a2 = simde_mm_add_pd(a2, simde_mm_mul_pd(tmpv, simde_mm_load_pd(&d2[l])));
 	    }	 	  
 	}
 
-      a0 = _mm_hadd_pd(a0, a0);
-      a1 = _mm_hadd_pd(a1, a1);
-      a2 = _mm_hadd_pd(a2, a2);
-
-      _mm_storel_pd(&inv_Li, a0);
-      _mm_storel_pd(&dlnLidlz, a1);
-      _mm_storel_pd(&d2lnLidlz2, a2);
+      a0 = simde_mm_hadd_pd(a0, a0);
+      a1 = simde_mm_hadd_pd(a1, a1);
+      a2 = simde_mm_hadd_pd(a2, a2);
+
+      simde_mm_storel_pd(&inv_Li, a0);
+      simde_mm_storel_pd(&dlnLidlz, a1);
+      simde_mm_storel_pd(&d2lnLidlz2, a2);
 
       inv_Li = 1.0 / FABS(inv_Li);
 
@@ -2690,9 +2251,9 @@
 
   for (i = 0; i < upper; i++)
     {
-      __m128d a0 = _mm_setzero_pd();
-      __m128d a1 = _mm_setzero_pd();
-      __m128d a2 = _mm_setzero_pd();
+      simde__m128d a0 = simde_mm_setzero_pd();
+      simde__m128d a1 = simde_mm_setzero_pd();
+      simde__m128d a2 = simde_mm_setzero_pd();
 
        double 
 	r = rptr[cptr[i]],
@@ -2704,24 +2265,24 @@
           
       for(l = 0; l < 20; l+=2)
 	{	  
-	  __m128d tmpv = _mm_mul_pd(_mm_load_pd(&d1[l]), _mm_load_pd(&sum[l]));
+	  simde__m128d tmpv = simde_mm_mul_pd(simde_mm_load_pd(&d1[l]), simde_mm_load_pd(&sum[l]));
 	  
-	  a0 = _mm_add_pd(a0, tmpv);
-	  __m128d sv = _mm_load_pd(&s[l]);	  
+	  a0 = simde_mm_add_pd(a0, tmpv);
+	  simde__m128d sv = simde_mm_load_pd(&s[l]);	  
 	  
-	  a1 = _mm_add_pd(a1, _mm_mul_pd(tmpv, sv));
-	  __m128d ev = _mm_load_pd(&e[l]);	  
+	  a1 = simde_mm_add_pd(a1, simde_mm_mul_pd(tmpv, sv));
+	  simde__m128d ev = simde_mm_load_pd(&e[l]);	  
 
-	  a2 = _mm_add_pd(a2, _mm_mul_pd(tmpv, ev));
+	  a2 = simde_mm_add_pd(a2, simde_mm_mul_pd(tmpv, ev));
 	}
 
-      a0 = _mm_hadd_pd(a0, a0);
-      a1 = _mm_hadd_pd(a1, a1);
-      a2 = _mm_hadd_pd(a2, a2);
+      a0 = simde_mm_hadd_pd(a0, a0);
+      a1 = simde_mm_hadd_pd(a1, a1);
+      a2 = simde_mm_hadd_pd(a2, a2);
 
-      _mm_storel_pd(&inv_Li, a0);     
-      _mm_storel_pd(&dlnLidlz, a1);                 
-      _mm_storel_pd(&d2lnLidlz2, a2);
+      simde_mm_storel_pd(&inv_Li, a0);     
+      simde_mm_storel_pd(&dlnLidlz, a1);                 
+      simde_mm_storel_pd(&d2lnLidlz2, a2);
       
       inv_Li = 1.0/FABS(inv_Li);
 
@@ -2737,11 +2298,3 @@
 
   free(d_start);
 }
-
-
-
-
-#endif
-
-
-
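
A note on the mechanical substitution applied throughout avxLikelihood.c
above: each SSE/AVX type and intrinsic is renamed to its simde_-prefixed
equivalent with identical semantics. A minimal sketch of the recurring
hadd/storel idiom (used above to extract inv_Li, dlnLidlz and d2lnLidlz2),
assuming the simde headers ship under debian/include as in this patch and
that both inputs are 16-byte aligned, as malloc_aligned() guarantees;
dot2 is a hypothetical helper name:

    #include "../debian/include/simde/x86/avx2.h"

    /* dot product of two aligned pairs of doubles */
    static double dot2(const double *a, const double *b)
    {
      simde__m128d v = simde_mm_mul_pd(simde_mm_load_pd(a),
                                       simde_mm_load_pd(b));
      double r;
      v = simde_mm_hadd_pd(v, v);  /* both lanes now hold a[0]*b[0] + a[1]*b[1] */
      simde_mm_storel_pd(&r, v);   /* write out the lower lane */
      return r;
    }
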
--- examl.orig/examl/mic_native_aa.c
+++ examl/examl/mic_native_aa.c
@@ -1,4 +1,4 @@
-#include <immintrin.h>
+#include "../debian/include/simde/x86/avx2.h"
 #include <string.h>
 #include <math.h>
 
@@ -215,7 +215,7 @@
             {
                 for (int j = 0; j < span; j += 8)
                 {
-                    _mm_prefetch((const char *)&aEV[span*(k+1) + j], _MM_HINT_T0);
+                    simde_mm_prefetch((const char *)&aEV[span*(k+1) + j], _MM_HINT_T0);
                 }
 
                 mic_fma4x80(&uX[k], v3, &aEV[k * span]);
@@ -231,8 +231,8 @@
             #pragma unroll(10)
             for (int j = 0; j < span; j += 8)
             {
-                _mm_prefetch((const char *)&x2[span*(i+1) + j], _MM_HINT_T1);
-//                _mm_prefetch((const char *)&x2[span*(i+1) + j], _MM_HINT_T0);
+                simde_mm_prefetch((const char *)&x2[span*(i+1) + j], _MM_HINT_T1);
+//                simde_mm_prefetch((const char *)&x2[span*(i+1) + j], _MM_HINT_T0);
             }
 
             /* access pre-computed value based on the raw sequence data tipX1 that is used as an index */
@@ -255,7 +255,7 @@
 		#pragma unroll(10)
             	for (int j = 0; j < span; j += 8)
                 {
-                    _mm_prefetch((const char *)&aRight[span*(k+1) + j], _MM_HINT_T0);
+                    simde_mm_prefetch((const char *)&aRight[span*(k+1) + j], _MM_HINT_T0);
                 }
 
                 mic_fma4x80(&v2[k], uX2, &aRight[k * span]);
@@ -274,7 +274,7 @@
 		#pragma unroll(10)
             	for (int j = 0; j < span; j += 8)
                 {
-                    _mm_prefetch((const char *)&aEV[span*(k+1) + j], _MM_HINT_T0);
+                    simde_mm_prefetch((const char *)&aEV[span*(k+1) + j], _MM_HINT_T0);
                 }
 
                 mic_fma4x80(&uX[k], v3, &aEV[k * span]);
@@ -314,10 +314,10 @@
             #pragma unroll(10)
             for (int j = 0; j < span; j += 8)
             {
-                _mm_prefetch((const char *)&x1[span*(i+1) + j], _MM_HINT_T1);
-                _mm_prefetch((const char *)&x2[span*(i+1) + j], _MM_HINT_T1);
-//                _mm_prefetch((const char *)&x1[span*(i+1) + j], _MM_HINT_T0);
-//                _mm_prefetch((const char *)&x2[span*(i+1) + j], _MM_HINT_T0);
+                simde_mm_prefetch((const char *)&x1[span*(i+1) + j], _MM_HINT_T1);
+                simde_mm_prefetch((const char *)&x2[span*(i+1) + j], _MM_HINT_T1);
+//                simde_mm_prefetch((const char *)&x1[span*(i+1) + j], _MM_HINT_T0);
+//                simde_mm_prefetch((const char *)&x2[span*(i+1) + j], _MM_HINT_T0);
             }
 
 
@@ -342,8 +342,8 @@
 		#pragma unroll(10)
             	for (int j = 0; j < span; j += 8)
                 {
-                    _mm_prefetch((const char *)&aRight[span*(k+1) + j], _MM_HINT_T0);
-                    _mm_prefetch((const char *)&aLeft[span*(k+1) + j], _MM_HINT_T0);
+                    simde_mm_prefetch((const char *)&aRight[span*(k+1) + j], _MM_HINT_T0);
+                    simde_mm_prefetch((const char *)&aLeft[span*(k+1) + j], _MM_HINT_T0);
                 }
 
                 mic_fma4x80(&v1[k], uX1, &aLeft[k * span]);
@@ -363,7 +363,7 @@
 		#pragma unroll(10)
             	for (int j = 0; j < span; j += 8)
                 {
-                    _mm_prefetch((const char *)&aEV[span*(k+1) + j], _MM_HINT_T0);
+                    simde_mm_prefetch((const char *)&aEV[span*(k+1) + j], _MM_HINT_T0);
                 }
 
                 mic_fma4x80(&uX[k], v3, &aEV[k * span]);
@@ -423,8 +423,8 @@
 	    #pragma unroll(10)
 	    for (int k = 0; k < span; k += 8)
 	    {
-		    _mm_prefetch((const char *) &x2_start[span*(i+2) + k], _MM_HINT_T1);
-		    _mm_prefetch((const char *) &x2_start[span*(i+1) + k], _MM_HINT_T0);
+		    simde_mm_prefetch((const char *) &x2_start[span*(i+2) + k], _MM_HINT_T1);
+		    simde_mm_prefetch((const char *) &x2_start[span*(i+1) + k], _MM_HINT_T0);
 	    }
 
 	    double term = 0.;
@@ -448,11 +448,11 @@
 	    #pragma unroll(10)
 	    for (int k = 0; k < span; k += 8)
 	    {
-	      _mm_prefetch((const char *) &x1_start[span*(i+2) + k], _MM_HINT_T1);
-	      _mm_prefetch((const char *) &x1_start[span*(i+1) + k], _MM_HINT_T0);
+	      simde_mm_prefetch((const char *) &x1_start[span*(i+2) + k], _MM_HINT_T1);
+	      simde_mm_prefetch((const char *) &x1_start[span*(i+1) + k], _MM_HINT_T0);
 
-	      _mm_prefetch((const char *) &x2_start[span*(i+2) + k], _MM_HINT_T1);
-	      _mm_prefetch((const char *) &x2_start[span*(i+1) + k], _MM_HINT_T0);
+	      simde_mm_prefetch((const char *) &x2_start[span*(i+2) + k], _MM_HINT_T1);
+	      simde_mm_prefetch((const char *) &x2_start[span*(i+1) + k], _MM_HINT_T0);
 	    }
 
 	    const double *x1 = &(x1_start[span * i]);
@@ -505,8 +505,8 @@
 	  #pragma unroll(10)
 	  for (int k = 0; k < span; k += 8)
 	    {
-	      _mm_prefetch((const char *) &x2_start[span*(i+2) + k], _MM_HINT_T1);
-	      _mm_prefetch((const char *) &x2_start[span*(i+1) + k], _MM_HINT_T0);
+	      simde_mm_prefetch((const char *) &x2_start[span*(i+2) + k], _MM_HINT_T1);
+	      simde_mm_prefetch((const char *) &x2_start[span*(i+1) + k], _MM_HINT_T0);
 	    }
 
           const double *left = &(aTipVec[span * tipX1[i]]);
@@ -528,11 +528,11 @@
 	    #pragma unroll(10)
 	    for (int k = 0; k < span; k += 8)
 	      {
-		_mm_prefetch((const char *) &x1_start[span*(i+2) + k], _MM_HINT_T1);
-		_mm_prefetch((const char *) &x1_start[span*(i+1) + k], _MM_HINT_T0);
+		simde_mm_prefetch((const char *) &x1_start[span*(i+2) + k], _MM_HINT_T1);
+		simde_mm_prefetch((const char *) &x1_start[span*(i+1) + k], _MM_HINT_T0);
 
-		_mm_prefetch((const char *) &x2_start[span*(i+2) + k], _MM_HINT_T1);
-		_mm_prefetch((const char *) &x2_start[span*(i+1) + k], _MM_HINT_T0);
+		simde_mm_prefetch((const char *) &x2_start[span*(i+2) + k], _MM_HINT_T1);
+		simde_mm_prefetch((const char *) &x2_start[span*(i+1) + k], _MM_HINT_T0);
 	      }
 
             const double *left  = &(x1_start[span * i]);
@@ -627,8 +627,8 @@
 	    #pragma unroll(10)
 	    for (int k = 0; k < span; k += 8)
 	      {
-		_mm_prefetch((const char *) &sum[span*(j+2) + k], _MM_HINT_T1);
-		_mm_prefetch((const char *) &sum[span*(j+1) + k], _MM_HINT_T0);
+		simde_mm_prefetch((const char *) &sum[span*(j+2) + k], _MM_HINT_T1);
+		simde_mm_prefetch((const char *) &sum[span*(j+1) + k], _MM_HINT_T0);
 	      }
 
             __m512d inv_1 = _mm512_setzero_pd();
@@ -846,7 +846,7 @@
             {
                 for (int j = 0; j < span; j += 8)
                 {
-                    _mm_prefetch((const char *)&aEV[span*(k+1) + j], _MM_HINT_T0);
+                    simde_mm_prefetch((const char *)&aEV[span*(k+1) + j], _MM_HINT_T0);
                 }
 
                 mic_fma4x80(&uX[k], v3, &aEV[k * span]);
@@ -862,8 +862,8 @@
             #pragma unroll(10)
             for (int j = 0; j < span; j += 8)
             {
-                _mm_prefetch((const char *)&x2[span*(i+1) + j], _MM_HINT_T1);
-//                _mm_prefetch((const char *)&x2[span*(i+1) + j], _MM_HINT_T0);
+                simde_mm_prefetch((const char *)&x2[span*(i+1) + j], _MM_HINT_T1);
+//                simde_mm_prefetch((const char *)&x2[span*(i+1) + j], _MM_HINT_T0);
             }
 
             /* access pre-computed value based on the raw sequence data tipX1 that is used as an index */
@@ -886,7 +886,7 @@
 				#pragma unroll(10)
             	for (int j = 0; j < span; j += 8)
                 {
-                    _mm_prefetch((const char *)&aRight[span*(k+1) + j], _MM_HINT_T0);
+                    simde_mm_prefetch((const char *)&aRight[span*(k+1) + j], _MM_HINT_T0);
                 }
 
                 mic_fma4x80(&v2[k], uX2, &aRight[k * span]);
@@ -905,7 +905,7 @@
 		#pragma unroll(10)
             	for (int j = 0; j < span; j += 8)
                 {
-                    _mm_prefetch((const char *)&aEV[span*(k+1) + j], _MM_HINT_T0);
+                    simde_mm_prefetch((const char *)&aEV[span*(k+1) + j], _MM_HINT_T0);
                 }
 
                 mic_fma4x80(&uX[k], v3, &aEV[k * span]);
@@ -943,10 +943,10 @@
             #pragma unroll(10)
             for (int j = 0; j < span; j += 8)
             {
-                _mm_prefetch((const char *)&x1[span*(i+1) + j], _MM_HINT_T1);
-                _mm_prefetch((const char *)&x2[span*(i+1) + j], _MM_HINT_T1);
-//                _mm_prefetch((const char *)&x1[span*(i+1) + j], _MM_HINT_T0);
-//                _mm_prefetch((const char *)&x2[span*(i+1) + j], _MM_HINT_T0);
+                simde_mm_prefetch((const char *)&x1[span*(i+1) + j], _MM_HINT_T1);
+                simde_mm_prefetch((const char *)&x2[span*(i+1) + j], _MM_HINT_T1);
+//                simde_mm_prefetch((const char *)&x1[span*(i+1) + j], _MM_HINT_T0);
+//                simde_mm_prefetch((const char *)&x2[span*(i+1) + j], _MM_HINT_T0);
             }
 
 
@@ -971,8 +971,8 @@
 		#pragma unroll(10)
             	for (int j = 0; j < span; j += 8)
                 {
-                    _mm_prefetch((const char *)&aRight[span*(k+1) + j], _MM_HINT_T0);
-                    _mm_prefetch((const char *)&aLeft[span*(k+1) + j], _MM_HINT_T0);
+                    simde_mm_prefetch((const char *)&aRight[span*(k+1) + j], _MM_HINT_T0);
+                    simde_mm_prefetch((const char *)&aLeft[span*(k+1) + j], _MM_HINT_T0);
                 }
 
                 mic_fma4x80(&v1[k], uX1, &aLeft[k * span]);
@@ -992,7 +992,7 @@
 		#pragma unroll(10)
             	for (int j = 0; j < span; j += 8)
                 {
-                    _mm_prefetch((const char *)&aEV[span*(k+1) + j], _MM_HINT_T0);
+                    simde_mm_prefetch((const char *)&aEV[span*(k+1) + j], _MM_HINT_T0);
                 }
 
                 mic_fma4x80(&uX[k], v3, &aEV[k * span]);
@@ -1062,8 +1062,8 @@
 	  #pragma unroll(10)
 	  for (int k = 0; k < span; k += 8)
 	    {
-	      _mm_prefetch((const char *) &x2_start[span*(i+2) + k], _MM_HINT_T1);
-	      _mm_prefetch((const char *) &x2_start[span*(i+1) + k], _MM_HINT_T0);
+	      simde_mm_prefetch((const char *) &x2_start[span*(i+2) + k], _MM_HINT_T1);
+	      simde_mm_prefetch((const char *) &x2_start[span*(i+1) + k], _MM_HINT_T0);
 	    }
 
 	  double term = 0.;
@@ -1088,11 +1088,11 @@
 	  #pragma unroll(10)
 	  for (int k = 0; k < span; k += 8)
 	    {
-	      _mm_prefetch((const char *) &x1_start[span*(i+2) + k], _MM_HINT_T1);
-	      _mm_prefetch((const char *) &x1_start[span*(i+1) + k], _MM_HINT_T0);
+	      simde_mm_prefetch((const char *) &x1_start[span*(i+2) + k], _MM_HINT_T1);
+	      simde_mm_prefetch((const char *) &x1_start[span*(i+1) + k], _MM_HINT_T0);
 
-	      _mm_prefetch((const char *) &x2_start[span*(i+2) + k], _MM_HINT_T1);
-	      _mm_prefetch((const char *) &x2_start[span*(i+1) + k], _MM_HINT_T0);
+	      simde_mm_prefetch((const char *) &x2_start[span*(i+2) + k], _MM_HINT_T1);
+	      simde_mm_prefetch((const char *) &x2_start[span*(i+1) + k], _MM_HINT_T0);
 	    }
 
 	  const double *x1 = &(x1_start[span * i]);
@@ -1145,8 +1145,8 @@
 	  #pragma unroll(10)
 	  for (int k = 0; k < span; k += 8)
 	    {
-	      _mm_prefetch((const char *) &x2_start[span*(i+2) + k], _MM_HINT_T1);
-	      _mm_prefetch((const char *) &x2_start[span*(i+1) + k], _MM_HINT_T0);
+	      simde_mm_prefetch((const char *) &x2_start[span*(i+2) + k], _MM_HINT_T1);
+	      simde_mm_prefetch((const char *) &x2_start[span*(i+1) + k], _MM_HINT_T0);
 	    }
 
           const double *left = &(aTipVec[span * tipX1[i]]);
@@ -1168,11 +1168,11 @@
 	      #pragma unroll(10)
 	      for (int k = 0; k < span; k += 8)
 	      {
-		_mm_prefetch((const char *) &x1_start[span*(i+2) + k], _MM_HINT_T1);
-		_mm_prefetch((const char *) &x1_start[span*(i+1) + k], _MM_HINT_T0);
+		simde_mm_prefetch((const char *) &x1_start[span*(i+2) + k], _MM_HINT_T1);
+		simde_mm_prefetch((const char *) &x1_start[span*(i+1) + k], _MM_HINT_T0);
 
-		_mm_prefetch((const char *) &x2_start[span*(i+2) + k], _MM_HINT_T1);
-		_mm_prefetch((const char *) &x2_start[span*(i+1) + k], _MM_HINT_T0);
+		simde_mm_prefetch((const char *) &x2_start[span*(i+2) + k], _MM_HINT_T1);
+		simde_mm_prefetch((const char *) &x2_start[span*(i+1) + k], _MM_HINT_T0);
 	      }
 
 	      const double *left  = &(x1_start[span * i]);
@@ -1260,8 +1260,8 @@
 	    #pragma unroll(10)
 	    for (int k = 0; k < span; k += 8)
 	    {
-		    _mm_prefetch((const char *) &sum[span*(j+2) + k], _MM_HINT_T1);
-		    _mm_prefetch((const char *) &sum[span*(j+1) + k], _MM_HINT_T0);
+		    simde_mm_prefetch((const char *) &sum[span*(j+2) + k], _MM_HINT_T1);
+		    simde_mm_prefetch((const char *) &sum[span*(j+1) + k], _MM_HINT_T0);
 	    }
 
             __m512d inv_1 = _mm512_setzero_pd();
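
The prefetch conversions in mic_native_aa.c keep the native _MM_HINT_T0 and
_MM_HINT_T1 hint constants, which resolve here because this file is only
built for Intel MIC targets (its __m512d/_mm512_* code is likewise left
native). A small sketch of the fully prefixed form that simde also offers,
assuming a simde version providing the SIMDE_MM_HINT_* constants;
prefetch_row is a hypothetical helper name:

    #include "../debian/include/simde/x86/sse.h"

    /* warm the cache for one row of a likelihood array, eight
       doubles at a time, matching the loops patched above */
    static void prefetch_row(const double *row, int span)
    {
      int j;
      for (j = 0; j < span; j += 8)
        simde_mm_prefetch((const char *)&row[j], SIMDE_MM_HINT_T0);
    }
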
--- examl.orig/examl/newviewGenericSpecial.c
+++ examl/examl/newviewGenericSpecial.c
@@ -41,23 +41,19 @@
 #include <limits.h>
 #include "axml.h"
 
-#ifdef __SIM_SSE3
-
 #include <stdint.h>
-#include <xmmintrin.h>
-#include <pmmintrin.h>
+#include "../debian/include/simde/x86/avx2.h"
 
 /* required to compute the absolute values of double precision numbers with SSE3 */
 
 const union __attribute__ ((aligned (BYTE_ALIGNMENT)))
 {
        uint64_t i[2];
-       __m128d m;
+       simde__m128d m;
 } absMask = {{0x7fffffffffffffffULL , 0x7fffffffffffffffULL }};
 
 
 
-#endif
 
 /* includes MIC-optimized functions */
 
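
A sketch of what the absMask union above is for: AND-ing a vector with
0x7fffffffffffffff clears the sign bit of each lane, yielding absolute
values for the scaling-threshold checks further down. vec_fabs is a
hypothetical helper name; absMask is the union defined above:

    static simde__m128d vec_fabs(simde__m128d v)
    {
      /* clear the sign bit of both double lanes */
      return simde_mm_and_pd(v, absMask.m);
    }
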
@@ -212,478 +208,6 @@
    conditional likelihood arrays at p, given child nodes q and r. Once again we need 
    two generic function implementations, one for CAT and one for GAMMA */
 
-#ifndef _OPTIMIZED_FUNCTIONS
-
-static void newviewCAT_FLEX(int tipCase, double *extEV,
-			    int *cptr,
-			    double *x1, double *x2, double *x3, double *tipVector,
-			    unsigned char *tipX1, unsigned char *tipX2,
-			    int n, double *left, double *right, int *wgt, int *scalerIncrement, const int states)
-{
-  double
-    *le, 
-    *ri, 
-    *v, 
-    *vl, 
-    *vr,
-    ump_x1, 
-    ump_x2, 
-    x1px2;
-
-  int 
-    i, 
-    l, 
-    j, 
-    scale, 
-    addScale = 0;
-
-  const int 
-    statesSquare = states * states;
-
-
-  /* here we switch over the different cases for efficiency, but also because 
-     each case accesses different data types.
-
-     We consider three cases: q and r are both tips, only one of q and r is a tip, or q and r are both 
-     inner nodes.
-  */
-     
-
-  switch(tipCase)
-    {
-      
-      /* both child nodes of p, where we want to update the conditional likelihood, are tips */
-    case TIP_TIP:     
-      /* loop over sites */
-      for (i = 0; i < n; i++)
-	{
-	  /* set a pointer to the P-Matrices for the rate category of this site */
-	  le = &left[cptr[i] * statesSquare];
-	  ri = &right[cptr[i] * statesSquare];
-	  
-	  /* pointers to the likelihood entries of the tips q (vl) and r (vr).
-	     We will only perform read accesses on these values.
-	   */
-	  vl = &(tipVector[states * tipX1[i]]);
-	  vr = &(tipVector[states * tipX2[i]]);
-	  
-	  /* address of the conditional likelihood array entries at site i. This is 
-	     a writing access to v */
-	  v  = &x3[states * i];
-	  
-	  /* initialize v */
-	  for(l = 0; l < states; l++)
-	    v[l] = 0.0;
-	  	  
-	  /* loop over states to compute the cond likelihoods at p (v) */
-
-	  for(l = 0; l < states; l++)
-	    {	      
-	      ump_x1 = 0.0;
-	      ump_x2 = 0.0;
-	      
-	      /* le and ri are the P-matrices */
-
-	      for(j = 0; j < states; j++)
-		{
-		  ump_x1 += vl[j] * le[l * states + j];
-		  ump_x2 += vr[j] * ri[l * states + j];
-		}
-	      
-	      x1px2 = ump_x1 * ump_x2;
-	      
-	      /* multiply with matrix of eigenvectors extEV */
-
-	      for(j = 0; j < states; j++)
-		v[j] += x1px2 * extEV[l * states + j];
-	    }	   
-	}    
-      break;
-    case TIP_INNER:      
-
-      /* same as above, only that now vl is a tip and vr is the conditional probability vector 
-	 at an inner node. Note that, if we have the case that either q or r is a tip, the 
-	 nodes will be flipped to ensure that tipX1 always points to the sequence at the tip.
-      */
-
-      for (i = 0; i < n; i++)
-	{
-	  le = &left[cptr[i] * statesSquare];
-	  ri = &right[cptr[i] * statesSquare];
-	  
-	  /* access tip vector lookup table */
-	  vl = &(tipVector[states * tipX1[i]]);
-
-	  /* access conditional likelihood arrays */
-	  /* again, vl and vr are reading accesses, while v is a writing access */
-	  vr = &x2[states * i];
-	  v  = &x3[states * i];
-	  
-	  /* same as in the loop above */
-
-	  for(l = 0; l < states; l++)
-	    v[l] = 0.0;
-	  
-	  for(l = 0; l < states; l++)
-	    {
-	      ump_x1 = 0.0;
-	      ump_x2 = 0.0;
-	      
-	      for(j = 0; j < states; j++)
-		{
-		  ump_x1 += vl[j] * le[l * states + j];
-		  ump_x2 += vr[j] * ri[l * states + j];
-		}
-	      
-	      x1px2 = ump_x1 * ump_x2;
-	      
-	      for(j = 0; j < states; j++)
-		v[j] += x1px2 * extEV[l * states + j];
-	    }
-	  
-	  /* now let's check for numerical scaling. 
-	     The maths in RAxML are a bit non-standard to avoid/economize on arithmetic operations 
-	     at the virtual root and for branch length optimization and hence values stored 
-	     in the conditional likelihood vectors can become negative.
-	     Below we check if all absolute values stored at position i of v are smaller 
-	     than a pre-defined value in axml.h. If they are all smaller we can then safely 
-	     multiply them by a large, constant number twotothe256 (without numerical overflow) 
-	     that is also specified in axml.h */
-
-	  scale = 1;
-	  for(l = 0; scale && (l < states); l++)
-	    scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood));	   
-	  
-	  if(scale)
-	    {
-	      for(l = 0; l < states; l++)
-		v[l] *= twotothe256;
-	      
-	      /* if we have scaled the entries to prevent underflow, we need to keep track of how many scaling 
-		 multiplications we did per node so as to undo them at the virtual root, e.g., in 
-		 evaluateGeneric(). 
-		 Note that, if we scaled the site, we need to increment the scaling counter by the weight, i.e., 
-		 the number of sites this potentially compressed pattern represents! */ 
-
-	      addScale += wgt[i];	  
-	    }
-	}   
-      break;
-    case INNER_INNER:
-      
-      /* same as above, only that the two child nodes q and r are now inner nodes */
-
-      for(i = 0; i < n; i++)
-	{
-	  le = &left[cptr[i] * statesSquare];
-	  ri = &right[cptr[i] * statesSquare];
-
-	  /* index conditional likelihood vectors of inner nodes */
-
-	  vl = &x1[states * i];
-	  vr = &x2[states * i];
-	  v = &x3[states * i];
-
-	  for(l = 0; l < states; l++)
-	    v[l] = 0.0;
-	 
-	  for(l = 0; l < states; l++)
-	    {
-	      ump_x1 = 0.0;
-	      ump_x2 = 0.0;
-
-	      for(j = 0; j < states; j++)
-		{
-		  ump_x1 += vl[j] * le[l * states + j];
-		  ump_x2 += vr[j] * ri[l * states + j];
-		}
-
-	      x1px2 =  ump_x1 * ump_x2;
-
-	      for(j = 0; j < states; j++)
-		v[j] += x1px2 * extEV[l * states + j];	      
-	    }
-
-	   scale = 1;
-	   for(l = 0; scale && (l < states); l++)
-	     scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood));
-  
-	   if(scale)
-	     {
-	       for(l = 0; l < states; l++)
-		 v[l] *= twotothe256;
-
-	       addScale += wgt[i];	   
-	     }
-	}
-      break;
-    default:
-      assert(0);
-    }
-   
-  /* increment the scaling counter by the additional scalings done at node p */
-
-  *scalerIncrement = addScale;
-}
-
-
-static void newviewGAMMA_FLEX(int tipCase,
-			      double *x1, double *x2, double *x3, double *extEV, double *tipVector,
-			      unsigned char *tipX1, unsigned char *tipX2,
-			      int n, double *left, double *right, int *wgt, int *scalerIncrement, const int states, const int maxStateValue)
-{
-  double  
-    *uX1, 
-    *uX2, 
-    *v, 
-    x1px2, 
-    *vl, 
-    *vr, 
-    al, 
-    ar;
-  
-  int  
-    i, 
-    j, 
-    l, 
-    k, 
-    scale, 
-    addScale = 0;
-
-  const int     
-    statesSquare = states * states,
-    span = states * 4,
-    /* this is required for doing some pre-computations that help to save 
-       numerical operations. What we are actually computing here are additional lookup tables 
-       for each possible state a certain data-type can assume.
-       For DNA with ambiguity coding this is 15, for proteins it is 22 or 23, since there 
-       also exist one or two ambiguity codes for protein data.
-       Essentially this is very similar to the tip vectors which we also use as lookup tables */
-    precomputeLength = maxStateValue * span;
-
-  switch(tipCase)
-    {
-    case TIP_TIP:
-      {
-	/* allocate pre-compute memory space */
-
-	double 
-	  *umpX1 = (double*)malloc(sizeof(double) * precomputeLength),
-	  *umpX2 = (double*)malloc(sizeof(double) * precomputeLength);
-
-	/* multiply all possible tip state vectors with the respective P-matrices 
-	 */
-
-	for(i = 0; i < maxStateValue; i++)
-	  {
-	    v = &(tipVector[states * i]);
-
-	    for(k = 0; k < span; k++)
-	      {
-
-		umpX1[span * i + k] = 0.0;
-		umpX2[span * i + k] = 0.0;
-
-		for(l = 0; l < states; l++)
-		  {
-		    umpX1[span * i + k] +=  v[l] *  left[k * states + l];
-		    umpX2[span * i + k] +=  v[l] * right[k * states + l];
-		  }
-
-	      }
-	  }
-
-	for(i = 0; i < n; i++)
-	  {
-	    /* access the precomputed arrays (pre-computed multiplication of conditional with the tip state) 
-	     */
-
-	    uX1 = &umpX1[span * tipX1[i]];
-	    uX2 = &umpX2[span * tipX2[i]];
-
-	    /* loop over discrete GAMMA rates */
-
-	    for(j = 0; j < 4; j++)
-	      {
-		/* the rest is the same as for CAT */
-		v = &x3[i * span + j * states];
-
-		for(k = 0; k < states; k++)
-		  v[k] = 0.0;
-
-		for(k = 0; k < states; k++)
-		  {		   
-		    x1px2 = uX1[j * states + k] * uX2[j * states + k];
-		   
-		    for(l = 0; l < states; l++)		      					
-		      v[l] += x1px2 * extEV[states * k + l];		     
-		  }
-
-	      }	   
-	  }
-	
-	/* free precomputed vectors */
-
-	free(umpX1);
-	free(umpX2);
-      }
-      break;
-    case TIP_INNER:
-      {
-	/* we do analogous pre-computations as above, with the only difference that we now do them 
-	   only for one tip vector */
-
-	double 
-	  *umpX1 = (double*)malloc(sizeof(double) * precomputeLength),
-	  *ump_x2 = (double*)malloc(sizeof(double) * states);
-
-	/* precompute P and left tip vector product */
-
-	for(i = 0; i < maxStateValue; i++)
-	  {
-	    v = &(tipVector[states * i]);
-
-	    for(k = 0; k < span; k++)
-	      {
-  
-		umpX1[span * i + k] = 0.0;
-
-		for(l = 0; l < states; l++)
-		  umpX1[span * i + k] +=  v[l] * left[k * states + l];
-
-
-	      }
-	  }
-
-	for (i = 0; i < n; i++)
-	  {
-	    /* access pre-computed value based on the raw sequence data tipX1 that is used as an index */
-
-	    uX1 = &umpX1[span * tipX1[i]];
-
-	    /* loop over discrete GAMMA rates */
-
-	    for(k = 0; k < 4; k++)
-	      {
-		v = &(x2[span * i + k * states]);
-
-		for(l = 0; l < states; l++)
-		  {
-		    ump_x2[l] = 0.0;
-
-		    for(j = 0; j < states; j++)
-		      ump_x2[l] += v[j] * right[k * statesSquare + l * states + j];
-		  }
-
-		v = &(x3[span * i + states * k]);
-
-		for(l = 0; l < states; l++)
-		  v[l] = 0;
-
-		for(l = 0; l < states; l++)
-		  {
-		    x1px2 = uX1[k * states + l]  * ump_x2[l];
-		    for(j = 0; j < states; j++)
-		      v[j] += x1px2 * extEV[l * states  + j];
-		  }
-	      }
-	   
-	    /* also do numerical scaling as above. Note that here we need to scale 
-	       4 * 4 values for DNA or 4 * 20 values for protein data.
-	       If they are ALL smaller than our threshold, we scale. Note that
-	       this can cause numerical problems with GAMMA if the values generated 
-	       by the four discrete GAMMA rates are too different.
-
-	       For details, see: 
-	       
-	       F. Izquierdo-Carrasco, S.A. Smith, A. Stamatakis: "Algorithms, Data Structures, and Numerics for Likelihood-based Phylogenetic Inference of Huge Trees"
-
-	    */
-	    
-
-	    v = &x3[span * i];
-	    scale = 1;
-	    for(l = 0; scale && (l < span); l++)
-	      scale = (ABS(v[l]) <  minlikelihood);
-
-
-	    if (scale)
-	      {
-		for(l = 0; l < span; l++)
-		  v[l] *= twotothe256;
-	
-		addScale += wgt[i];		    
-	      }
-	  }
-
-	free(umpX1);
-	free(ump_x2);
-      }
-      break;
-    case INNER_INNER:
-
-      /* same as above, without pre-computations */
-
-      for (i = 0; i < n; i++)
-       {
-	 for(k = 0; k < 4; k++)
-	   {
-	     vl = &(x1[span * i + states * k]);
-	     vr = &(x2[span * i + states * k]);
-	     v =  &(x3[span * i + states * k]);
-
-
-	     for(l = 0; l < states; l++)
-	       v[l] = 0;
-
-
-	     for(l = 0; l < states; l++)
-	       {		 
-
-		 al = 0.0;
-		 ar = 0.0;
-
-		 for(j = 0; j < states; j++)
-		   {
-		     al += vl[j] * left[k * statesSquare + l * states + j];
-		     ar += vr[j] * right[k * statesSquare + l * states + j];
-		   }
-
-		 x1px2 = al * ar;
-
-		 for(j = 0; j < states; j++)
-		   v[j] += x1px2 * extEV[states * l + j];
-
-	       }
-	   }
-	 
-	 v = &(x3[span * i]);
-	 scale = 1;
-	 for(l = 0; scale && (l < span); l++)
-	   scale = ((ABS(v[l]) <  minlikelihood));
-
-	 if(scale)
-	   {  
-	     for(l = 0; l < span; l++)
-	       v[l] *= twotothe256;
-	     
-	     addScale += wgt[i];	    	  
-	   }
-       }
-      break;
-    default:
-      assert(0);
-    }
-
-  /* as above, increment the global counter that counts scaling multiplications by the scaling multiplications 
-     carried out for computing the likelihood array at node p */
-
-  *scalerIncrement = addScale;
-}
-
-#endif
-
-
     
 /* The function below computes partial traversals only down to the point/node in the tree where the 
   conditional likelihood vector summarizing a subtree is already oriented in the correct direction */
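
The FLEX kernels removed above all share the scaling rule that their
SIMDE-backed AVX replacements implement as well. A compact sketch of that
rule, with ABS, minlikelihood and twotothe256 (2^256) taken from axml.h as
the original comments state; rescale_site is a hypothetical helper name:

    /* if every entry of a site's vector is below the threshold in
       absolute value, multiply the whole span by 2^256 and report the
       site's pattern weight so the correction can be undone at the
       virtual root */
    static int rescale_site(double *v, int span, int wgt_i)
    {
      int l, scale = 1;

      for(l = 0; scale && (l < span); l++)
        scale = (ABS(v[l]) < minlikelihood);

      if(scale)
        for(l = 0; l < span; l++)
          v[l] *= twotothe256;

      return scale ? wgt_i : 0; /* add this to the scaler increment */
    }
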
@@ -817,71 +341,6 @@
    file.
 */
 
-#if (defined(_OPTIMIZED_FUNCTIONS) && !defined(__AVX))
-
-static void newviewGTRGAMMAPROT_LG4(int tipCase,
-				    double *x1, double *x2, double *x3, double *extEV[4], double *tipVector[4],
-				    int *ex3, unsigned char *tipX1, unsigned char *tipX2,
-				    int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling);
-
-static void newviewGTRGAMMA_GAPPED_SAVE(int tipCase,
-					double *x1_start, double *x2_start, double *x3_start,
-					double *EV, double *tipVector,
-					unsigned char *tipX1, unsigned char *tipX2,
-					const int n, double *left, double *right, int *wgt, int *scalerIncrement, 
-					unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap, 
-					double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn);
-
-static void newviewGTRGAMMA(int tipCase,
-			    double *x1_start, double *x2_start, double *x3_start,
-			    double *EV, double *tipVector,
-			    unsigned char *tipX1, unsigned char *tipX2,
-			    const int n, double *left, double *right, int *wgt, int *scalerIncrement
-			    );
-
-static void newviewGTRCAT( int tipCase,  double *EV,  int *cptr,
-			   double *x1_start, double *x2_start,  double *x3_start, double *tipVector,
-			   unsigned char *tipX1, unsigned char *tipX2,
-			   int n,  double *left, double *right, int *wgt, int *scalerIncrement);
-
-
-static void newviewGTRCAT_SAVE( int tipCase,  double *EV,  int *cptr,
-				double *x1_start, double *x2_start,  double *x3_start, double *tipVector,
-				unsigned char *tipX1, unsigned char *tipX2,
-				int n,  double *left, double *right, int *wgt, int *scalerIncrement,
-				unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
-				double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats);
-
-static void newviewGTRGAMMAPROT_GAPPED_SAVE(int tipCase,
-					    double *x1, double *x2, double *x3, double *extEV, double *tipVector,
-					    unsigned char *tipX1, unsigned char *tipX2,
-					    int n, double *left, double *right, int *wgt, int *scalerIncrement, 
-					    unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,  
-					    double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn
-					    );
-
-static void newviewGTRGAMMAPROT(int tipCase,
-				double *x1, double *x2, double *x3, double *extEV, double *tipVector,
-				unsigned char *tipX1, unsigned char *tipX2,
-				int n, double *left, double *right, int *wgt, int *scalerIncrement);
-static void newviewGTRCATPROT(int tipCase, double *extEV,
-			      int *cptr,
-			      double *x1, double *x2, double *x3, double *tipVector,
-			      unsigned char *tipX1, unsigned char *tipX2,
-			      int n, double *left, double *right, int *wgt, int *scalerIncrement );
-
-static void newviewGTRCATPROT_SAVE(int tipCase, double *extEV,
-				   int *cptr,
-				   double *x1, double *x2, double *x3, double *tipVector,
-				   unsigned char *tipX1, unsigned char *tipX2,
-				   int n, double *left, double *right, int *wgt, int *scalerIncrement,
-				   unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
-				   double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats);
-
-#endif
-
-#ifdef _OPTIMIZED_FUNCTIONS
-
 static void newviewGTRCAT_BINARY( int tipCase,  double *EV,  int *cptr,
                                   double *x1_start,  double *x2_start,  double *x3_start,  double *tipVector,
                                   int *ex3, unsigned char *tipX1, unsigned char *tipX2,
@@ -894,8 +353,6 @@
 				   const int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling
 				   );
 
-#endif
-
 boolean isGap(unsigned int *x, int pos)
 {
   return (x[pos / 32] & mask32[pos % 32]);
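
isGap() above reads a bitvector that packs one flag per alignment column.
A self-contained sketch of the indexing, assuming mask32[k] equals
(1u << k) as defined in axml.h; testGapBit is a hypothetical name:

    /* non-zero when column pos is flagged as an all-gap column */
    static int testGapBit(const unsigned int *gapVector, int pos)
    {
      return (gapVector[pos / 32] & (1u << (pos % 32))) != 0;
    }
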
@@ -1260,26 +717,6 @@
 		    assert(0);
 		  }
 		
-#ifndef _OPTIMIZED_FUNCTIONS
-
-	      /* memory saving not implemented */
-
-	      assert(!tr->saveMemory);
-
-	      /* figure out if we need to compute the CAT or GAMMA model of rate heterogeneity */
-
-	      if(tr->rateHetModel == CAT)
-		newviewCAT_FLEX(tInfo->tipCase,  tr->partitionData[model].EV, rateCategory,
-				x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
-				tipX1, tipX2,
-				width, left, right, wgt, &scalerIncrement, states);
-	      else
-		newviewGAMMA_FLEX(tInfo->tipCase,
-				  x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector,
-				  tipX1, tipX2,
-				  width, left, right, wgt, &scalerIncrement, states, getUndetermined(tr->partitionData[model].dataType) + 1);
-
-#else
 	      /* dedicated highly optimized functions. Analogously to the functions in evaluateGeneric() 
 		 we also switch over the state number */
 
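
The hunk above and the ones that follow all apply the same transformation:
because the SIMDE-backed AVX kernels now build on every architecture, the
scalar fallbacks are deleted and each "#elif __AVX" guard collapses to
"#else", leaving only the Intel MIC special case. A compilable sketch of
the resulting dispatch shape, with hypothetical stand-in names:

    #include <assert.h>

    static void kernel_avx(void) { /* SIMDE-backed kernel, now portable */ }

    static void dispatch(void)
    {
    #ifdef __MIC_NATIVE
      assert(0 && "not implemented on Intel MIC");
    #else
      kernel_avx(); /* previously reached only under #elif __AVX */
    #endif
    }
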
@@ -1309,32 +746,20 @@
 		      if(tr->saveMemory)
 #ifdef __MIC_NATIVE
 		     assert(0 && "Neither CAT model of rate heterogeneity nor memory saving are implemented on Intel MIC");
-#elif __AVX
+#else
 			newviewGTRCAT_AVX_GAPPED_SAVE(tInfo->tipCase,  tr->partitionData[model].EV, rateCategory,
 						      x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
 						      (int*)NULL, tipX1, tipX2,
 						      width, left, right, wgt, &scalerIncrement, TRUE, x1_gap, x2_gap, x3_gap,
 						      x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories);
-#else
-			newviewGTRCAT_SAVE(tInfo->tipCase,  tr->partitionData[model].EV, rateCategory,
-					   x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
-					   tipX1, tipX2,
-					   width, left, right, wgt, &scalerIncrement, x1_gap, x2_gap, x3_gap,
-					   x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories);
-#endif
 		      else
 #ifdef __MIC_NATIVE
 		     assert(0 && "CAT model of rate heterogeneity is not implemented on Intel MIC");
-#elif __AVX
+#else
 			newviewGTRCAT_AVX(tInfo->tipCase,  tr->partitionData[model].EV, rateCategory,
 					  x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
 					  tipX1, tipX2,
 					  width, left, right, wgt, &scalerIncrement);
-#else
-			newviewGTRCAT(tInfo->tipCase,  tr->partitionData[model].EV, rateCategory,
-				      x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
-				      tipX1, tipX2,
-				      width, left, right, wgt, &scalerIncrement);
 #endif
 		    }
 		  else
@@ -1344,20 +769,13 @@
 		       if(tr->saveMemory)
 #ifdef __MIC_NATIVE
 		     assert(0 && "Memory saving is not implemented on Intel MIC");
-#elif __AVX
+#else
 			 newviewGTRGAMMA_AVX_GAPPED_SAVE(tInfo->tipCase,
 							 x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector, (int*)NULL,
 							 tipX1, tipX2,
 							 width, left, right, wgt, &scalerIncrement, TRUE,
 							 x1_gap, x2_gap, x3_gap, 
 							 x1_gapColumn, x2_gapColumn, x3_gapColumn);
-#else
-		       newviewGTRGAMMA_GAPPED_SAVE(tInfo->tipCase,
-						   x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector,
-						   tipX1, tipX2,
-						   width, left, right, wgt, &scalerIncrement, 
-						   x1_gap, x2_gap, x3_gap, 
-						   x1_gapColumn, x2_gapColumn, x3_gapColumn);
 #endif
 		       else
 #ifdef __MIC_NATIVE
@@ -1366,16 +784,11 @@
 				  tipX1, tipX2,
 				  width, left, right, wgt, &scalerIncrement,
 				  tr->partitionData[model].mic_umpLeft, tr->partitionData[model].mic_umpRight);
-#elif __AVX
+#else
 			 newviewGTRGAMMA_AVX(tInfo->tipCase,
 					     x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector,
 					     tipX1, tipX2,
 					     width, left, right, wgt, &scalerIncrement);
-#else
-		       newviewGTRGAMMA(tInfo->tipCase,
-					 x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector,
-					 tipX1, tipX2,
-					 width, left, right, wgt, &scalerIncrement);
 #endif
 		    }
 		
@@ -1388,30 +801,21 @@
 			{
 #ifdef __MIC_NATIVE
 		     assert(0 && "Neither CAT model of rate heterogeneity nor memory saving are implemented on Intel MIC");
-#elif __AVX
+#else
 			  newviewGTRCATPROT_AVX_GAPPED_SAVE(tInfo->tipCase,  tr->partitionData[model].EV, rateCategory,
 							    x1_start, x2_start, x3_start, tr->partitionData[model].tipVector, (int*)NULL,
 							    tipX1, tipX2, width, left, right, wgt, &scalerIncrement, TRUE, x1_gap, x2_gap, x3_gap,
 							    x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories);
-#else
-			  newviewGTRCATPROT_SAVE(tInfo->tipCase,  tr->partitionData[model].EV, rateCategory,
-						 x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
-						 tipX1, tipX2, width, left, right, wgt, &scalerIncrement, x1_gap, x2_gap, x3_gap,
-						 x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories);
 #endif
 			}
 		      else
 			{			 			
 #ifdef __MIC_NATIVE
 		     assert(0 && "CAT model of rate heterogeneity is not implemented on Intel MIC");
-#elif __AVX
+#else
 			  newviewGTRCATPROT_AVX(tInfo->tipCase,  tr->partitionData[model].EV, rateCategory,
 						x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
 						tipX1, tipX2, width, left, right, wgt, &scalerIncrement);
-#else
-			  newviewGTRCATPROT(tInfo->tipCase,  tr->partitionData[model].EV, rateCategory,
-					    x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
-					    tipX1, tipX2, width, left, right, wgt, &scalerIncrement);			
 #endif
 			}
 		    }
@@ -1421,7 +825,7 @@
 			{
 #ifdef __MIC_NATIVE
 		     assert(0 && "Memory saving is not implemented on Intel MIC");
-#elif __AVX
+#else
 			  newviewGTRGAMMAPROT_AVX_GAPPED_SAVE(tInfo->tipCase,
 							      x1_start, x2_start, x3_start,
 							      tr->partitionData[model].EV,
@@ -1430,15 +834,6 @@
 							      width, left, right, wgt, &scalerIncrement, TRUE,
 							      x1_gap, x2_gap, x3_gap,
 							      x1_gapColumn, x2_gapColumn, x3_gapColumn);
-#else
-			  newviewGTRGAMMAPROT_GAPPED_SAVE(tInfo->tipCase,
-							  x1_start, x2_start, x3_start,
-							  tr->partitionData[model].EV,
-							  tr->partitionData[model].tipVector,
-							  tipX1, tipX2,
-							  width, left, right, wgt, &scalerIncrement,
-							  x1_gap, x2_gap, x3_gap,
-							  x1_gapColumn, x2_gapColumn, x3_gapColumn);
 #endif
 			}
 		      else
@@ -1451,21 +846,13 @@
 							tipX1, tipX2,
 							width, left, right, wgt, &scalerIncrement,
 							tr->partitionData[model].mic_umpLeft, tr->partitionData[model].mic_umpRight);
-#elif __AVX
+#else
 			      newviewGTRGAMMAPROT_AVX_LG4(tInfo->tipCase,
 							  x1_start, x2_start, x3_start,
 							  tr->partitionData[model].EV_LG4,
 							  tr->partitionData[model].tipVector_LG4,
 							  (int*)NULL, tipX1, tipX2,
 							  width, left, right, wgt, &scalerIncrement, TRUE);
-#else
-			      newviewGTRGAMMAPROT_LG4(tInfo->tipCase,
-						      x1_start, x2_start, x3_start,
-						      tr->partitionData[model].EV_LG4,
-						      tr->partitionData[model].tipVector_LG4,
-						      (int*)NULL, tipX1, tipX2,
-						      width, left, right, 
-						      wgt, &scalerIncrement, TRUE);
 #endif			    
 			    }
 			  else
@@ -1476,16 +863,11 @@
 							tipX1, tipX2,
 							width, left, right, wgt, &scalerIncrement,
 							tr->partitionData[model].mic_umpLeft, tr->partitionData[model].mic_umpRight);
-#elif __AVX
+#else
 			      newviewGTRGAMMAPROT_AVX(tInfo->tipCase,
 						      x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector,
 						      tipX1, tipX2,
 						      width, left, right, wgt, &scalerIncrement);
-#else
-			      newviewGTRGAMMAPROT(tInfo->tipCase,
-						  x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector,
-						  tipX1, tipX2,
-						  width, left, right, wgt, &scalerIncrement);
 #endif
 			    }
 			}
@@ -1595,4277 +977,6 @@
 
 /* optimized function implementations */
 
-#if (defined(_OPTIMIZED_FUNCTIONS) && !defined(__AVX))
-
-static void newviewGTRGAMMA_GAPPED_SAVE(int tipCase,
-					double *x1_start, double *x2_start, double *x3_start,
-					double *EV, double *tipVector,
-					unsigned char *tipX1, unsigned char *tipX2,
-					const int n, double *left, double *right, int *wgt, int *scalerIncrement, 
-					unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap, 
-					double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn)
-{
-  int     
-    i, 
-    j, 
-    k, 
-    l,
-    addScale = 0, 
-    scaleGap = 0;
-  
-  double
-    *x1,
-    *x2,
-    *x3,
-    *x1_ptr = x1_start,
-    *x2_ptr = x2_start,       
-    max,
-    maxima[2] __attribute__ ((aligned (BYTE_ALIGNMENT))),        
-    EV_t[16] __attribute__ ((aligned (BYTE_ALIGNMENT)));      
-    
-  __m128d 
-    values[8],
-    EVV[8];  
-
-  for(k = 0; k < 4; k++)
-    for (l=0; l < 4; l++)
-      EV_t[4 * l + k] = EV[4 * k + l];
-
-  for(k = 0; k < 8; k++)
-    EVV[k] = _mm_load_pd(&EV_t[k * 2]);      
- 
-  
-
-  switch(tipCase)
-    {
-    case TIP_TIP:
-      {
-	double *uX1, umpX1[256] __attribute__ ((aligned (BYTE_ALIGNMENT))), *uX2, umpX2[256] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-
-
-	for (i = 1; i < 16; i++)
-	  {	    
-	    __m128d x1_1 = _mm_load_pd(&(tipVector[i*4]));
-	    __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2]));	   
-
-	    for (j = 0; j < 4; j++)
-	      for (k = 0; k < 4; k++)
-		{			 	 
-		  __m128d left1 = _mm_load_pd(&left[j*16 + k*4]);
-		  __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]);
-		  
-		  __m128d acc = _mm_setzero_pd();
-
-		  acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
-		  acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
-		  		  
-		  acc = _mm_hadd_pd(acc, acc);
-		  _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);
-		}
-	  
-	    for (j = 0; j < 4; j++)
-	      for (k = 0; k < 4; k++)
-		{
-		  __m128d left1 = _mm_load_pd(&right[j*16 + k*4]);
-		  __m128d left2 = _mm_load_pd(&right[j*16 + k*4 + 2]);
-		  
-		  __m128d acc = _mm_setzero_pd();
-
-		  acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
-		  acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
-		  		  
-		  acc = _mm_hadd_pd(acc, acc);
-		  _mm_storel_pd(&umpX2[i*16 + j*4 + k], acc);
-		 
-		}
-	  }   		  
-	
-	uX1 = &umpX1[240];
-	uX2 = &umpX2[240];	   	    	    
-	
-	for (j = 0; j < 4; j++)
-	  {				 		  		  		   
-	    __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
-	    __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
-	    	    
-	    __m128d uX2_k0_sse = _mm_load_pd( &uX2[j * 4] );
-	    __m128d uX2_k2_sse = _mm_load_pd( &uX2[j * 4 + 2] );
-	    
-	    __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, uX2_k0_sse );
-	    __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, uX2_k2_sse );		    		    		   
-	    
-	    __m128d EV_t_l0_k0 = EVV[0];
-	    __m128d EV_t_l0_k2 = EVV[1];
-	    __m128d EV_t_l1_k0 = EVV[2];
-	    __m128d EV_t_l1_k2 = EVV[3];
-	    __m128d EV_t_l2_k0 = EVV[4];
-	    __m128d EV_t_l2_k2 = EVV[5];
-	    __m128d EV_t_l3_k0 = EVV[6]; 
-	    __m128d EV_t_l3_k2 = EVV[7];
-	    
-	    EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	    EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-	    
-	    EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	    EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-	    
-	    EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-	    
-	    EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	    EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-	    
-	    EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	    EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	    EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-	    
-	    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
-	    	  
-	    _mm_store_pd( &x3_gapColumn[j * 4 + 0], EV_t_l0_k0 );
-	    _mm_store_pd( &x3_gapColumn[j * 4 + 2], EV_t_l2_k0 );	   
-	  }  
-	
-       
-	x3 = x3_start;
-	
-	for (i = 0; i < n; i++)
-	  {	    
-	    if(!(x3_gap[i / 32] & mask32[i % 32]))	     
-	      {
-		uX1 = &umpX1[16 * tipX1[i]];
-		uX2 = &umpX2[16 * tipX2[i]];	   	    	    		
-		
-		for (j = 0; j < 4; j++)
-		  {				 		  		  		   
-		    __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
-		    __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
-		    
-		    
-		    __m128d uX2_k0_sse = _mm_load_pd( &uX2[j * 4] );
-		    __m128d uX2_k2_sse = _mm_load_pd( &uX2[j * 4 + 2] );
-		    
-		    
-		    //
-		    // multiply left * right
-		    //
-		    
-		    __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, uX2_k0_sse );
-		    __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, uX2_k2_sse );
-		    
-		    
-		    //
-		    // multiply with EV matrix (!?)
-		    //
-		    
-		    __m128d EV_t_l0_k0 = EVV[0];
-		    __m128d EV_t_l0_k2 = EVV[1];
-		    __m128d EV_t_l1_k0 = EVV[2];
-		    __m128d EV_t_l1_k2 = EVV[3];
-		    __m128d EV_t_l2_k0 = EVV[4];
-		    __m128d EV_t_l2_k2 = EVV[5];
-		    __m128d EV_t_l3_k0 = EVV[6]; 
-		    __m128d EV_t_l3_k2 = EVV[7];
-		    
-		    EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-		    EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-		    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-		    
-		    EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-		    EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-		    
-		    EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-		    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-		    
-		    EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-		    EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-		    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-		    
-		    EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-		    EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-		    EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-		    
-		    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
-		    
-		    _mm_store_pd( &x3[j * 4 + 0], EV_t_l0_k0 );
-		    _mm_store_pd( &x3[j * 4 + 2], EV_t_l2_k0 );
-		  }
-		
-		x3 += 16;
-	      }
-	  }
-      }
-      break;
-    case TIP_INNER:
-      {	
-	double 
-	  *uX1, 
-	  umpX1[256] __attribute__ ((aligned (BYTE_ALIGNMENT)));		 
-
-	for (i = 1; i < 16; i++)
-	  {
-	    __m128d x1_1 = _mm_load_pd(&(tipVector[i*4]));
-	    __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2]));	   
-
-	    for (j = 0; j < 4; j++)
-	      for (k = 0; k < 4; k++)
-		{		 
-		  __m128d left1 = _mm_load_pd(&left[j*16 + k*4]);
-		  __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]);
-		  
-		  __m128d acc = _mm_setzero_pd();
-
-		  acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
-		  acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
-		  		  
-		  acc = _mm_hadd_pd(acc, acc);
-		  _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);		 
-		}
-	  }
-
-	{
-	  __m128d maxv =_mm_setzero_pd();
-	  
-	  scaleGap = 0;
-	  
-	  x2 = x2_gapColumn;			 
-	  x3 = x3_gapColumn;
-	  
-	  uX1 = &umpX1[240];	     
-	  
-	  for (j = 0; j < 4; j++)
-	    {		     		   
-	      double *x2_p = &x2[j*4];
-	      double *right_k0_p = &right[j*16];
-	      double *right_k1_p = &right[j*16 + 1*4];
-	      double *right_k2_p = &right[j*16 + 2*4];
-	      double *right_k3_p = &right[j*16 + 3*4];
-	      __m128d x2_0 = _mm_load_pd( &x2_p[0] );
-	      __m128d x2_2 = _mm_load_pd( &x2_p[2] );
-	      
-	      __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
-	      __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
-	      __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
-	      __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
-	      __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
-	      __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
-	      __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
-	      __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
-	      	      
-	      right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-	      right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-	      
-	      right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-	      right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-	      
-	      right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-	      right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-	      right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-	      	       
-	      right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-	      right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-	      	       
-	      right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-	      right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-	      
-	      right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-	      right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-	      right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
-	      
-	      __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
-	      __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
-	      
-	      __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, right_k0_0 );
-	      __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, right_k2_0 );
-	      
-	      __m128d EV_t_l0_k0 = EVV[0];
-	      __m128d EV_t_l0_k2 = EVV[1];
-	      __m128d EV_t_l1_k0 = EVV[2];
-	      __m128d EV_t_l1_k2 = EVV[3];
-	      __m128d EV_t_l2_k0 = EVV[4];
-	      __m128d EV_t_l2_k2 = EVV[5];
-	      __m128d EV_t_l3_k0 = EVV[6]; 
-	      __m128d EV_t_l3_k2 = EVV[7];
-	      
-	      EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	      EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	      EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-	      
-	      EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	      EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-	      
-	      EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	      EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-	      
-	      EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	      EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	      EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-	      
-	      EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	      EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	      EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-	      
-	      EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
-	      
-	      values[j * 2]     = EV_t_l0_k0;
-	      values[j * 2 + 1] = EV_t_l2_k0;		   		   
-	      
-	      maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
-	      maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));		   	     		   
-	    }
-
-	  
-	  _mm_store_pd(maxima, maxv);
-		 
-	  max = MAX(maxima[0], maxima[1]);
-	  
-	  if(max < minlikelihood)
-	    {
-	      scaleGap = 1;
-	      
-	      __m128d sv = _mm_set1_pd(twotothe256);
-	      
-	      _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));	   
-	      _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
-	      _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
-	      _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
-	      _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));	   
-	      _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
-	      _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
-	      _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));	     	      	     
-	    }
-	  else
-	    {
-	      _mm_store_pd(&x3[0], values[0]);	   
-	      _mm_store_pd(&x3[2], values[1]);
-	      _mm_store_pd(&x3[4], values[2]);
-	      _mm_store_pd(&x3[6], values[3]);
-	      _mm_store_pd(&x3[8], values[4]);	   
-	      _mm_store_pd(&x3[10], values[5]);
-	      _mm_store_pd(&x3[12], values[6]);
-	      _mm_store_pd(&x3[14], values[7]);
-	    }
-	}		       	
-      	
-	x3 = x3_start;
-
-	for (i = 0; i < n; i++)
-	   {
-	     if((x3_gap[i / 32] & mask32[i % 32]))
-	       {	       
-		 if(scaleGap)
-		   {		    
-		       addScale += wgt[i];		     
-		   }
-	       }
-	     else
-	       {				 
-		 __m128d maxv =_mm_setzero_pd();		 
-		 
-		 if(x2_gap[i / 32] & mask32[i % 32])
-		   x2 = x2_gapColumn;
-		 else
-		   {
-		     x2 = x2_ptr;
-		     x2_ptr += 16;
-		   }
-		 		 		 
-		 uX1 = &umpX1[16 * tipX1[i]];	     
-		 
-		 
-		 for (j = 0; j < 4; j++)
-		   {		     		   
-		     double *x2_p = &x2[j*4];
-		     double *right_k0_p = &right[j*16];
-		     double *right_k1_p = &right[j*16 + 1*4];
-		     double *right_k2_p = &right[j*16 + 2*4];
-		     double *right_k3_p = &right[j*16 + 3*4];
-		     __m128d x2_0 = _mm_load_pd( &x2_p[0] );
-		     __m128d x2_2 = _mm_load_pd( &x2_p[2] );
-		     
-		     __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
-		     __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
-		     __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
-		     __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
-		     __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
-		     __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
-		     __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
-		     __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
-		     
-		     		     
-		     right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-		     right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-		     
-		     right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-		     right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-		     
-		     right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-		     right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-		     right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-		     
-		     
-		     right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-		     right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-		     
-		     
-		     right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-		     right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-		     
-		     right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-		     right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-		     right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
-		     
-		     {
-		       //
-		       // load left side from tip vector
-		       //
-		       
-		       __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
-		       __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
-		       
-		       
-		       //
-		       // multiply left * right
-			   //
-		       
-		       __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, right_k0_0 );
-		       __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, right_k2_0 );
-		       
-		       
-		       //
-		       // multiply with the transposed eigenvector matrix EV_t
-		       //		   		  
-		       
-		       __m128d EV_t_l0_k0 = EVV[0];
-		       __m128d EV_t_l0_k2 = EVV[1];
-		       __m128d EV_t_l1_k0 = EVV[2];
-		       __m128d EV_t_l1_k2 = EVV[3];
-		       __m128d EV_t_l2_k0 = EVV[4];
-		       __m128d EV_t_l2_k2 = EVV[5];
-		       __m128d EV_t_l3_k0 = EVV[6]; 
-		       __m128d EV_t_l3_k2 = EVV[7];
-		       
-		       
-		       EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-		       EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-		       EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-		       
-		       EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-		       EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-		       
-		       EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-		       EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-		       
-		       EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-		       EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-		       EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-		       
-		       EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-		       EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-		       EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-		       
-		       EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
-		       
-		       values[j * 2]     = EV_t_l0_k0;
-		       values[j * 2 + 1] = EV_t_l2_k0;		   		   
-			   
-		       maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
-		       maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));		   
-		     }		   
-		   }
-
-	     
-		 _mm_store_pd(maxima, maxv);
-		 
-		 max = MAX(maxima[0], maxima[1]);
-		 
-		 if(max < minlikelihood)
-		   {
-		     __m128d sv = _mm_set1_pd(twotothe256);
-		     
-		     _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));	   
-		     _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
-		     _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
-		     _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
-		     _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));	   
-		     _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
-		     _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
-		     _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));	     
-		     
-		     
-		     addScale += wgt[i];
-		    
-		   }
-		 else
-		   {
-		     _mm_store_pd(&x3[0], values[0]);	   
-		     _mm_store_pd(&x3[2], values[1]);
-		     _mm_store_pd(&x3[4], values[2]);
-		     _mm_store_pd(&x3[6], values[3]);
-		     _mm_store_pd(&x3[8], values[4]);	   
-		     _mm_store_pd(&x3[10], values[5]);
-		     _mm_store_pd(&x3[12], values[6]);
-		     _mm_store_pd(&x3[14], values[7]);
-		   }		 
-		 
-		 x3 += 16;
-	       }
-	   }
-      }
-      break;
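/*
 * The x?_gap tests above (x3_gap[i / 32] & mask32[i % 32]) treat each gap
 * vector as a packed bitset: bit (i % 32) of word (i / 32) is set when
 * alignment column i is an all-gap column, so its likelihood entries live
 * once in the shared x?_gapColumn buffer instead of the per-site array.
 * A minimal sketch of that membership test, assuming mask32 is the usual
 * table of single-bit constants; column_is_gap() is a hypothetical helper,
 * not ExaML API.
 */
static int column_is_gap(const unsigned int *gap, int i)
{
  /* same value as (gap[i / 32] & mask32[i % 32]) != 0 */
  return (gap[i / 32] >> (i % 32)) & 1u;
}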
-    case INNER_INNER:         
-      {
-	__m128d maxv =_mm_setzero_pd();
-	
-	scaleGap = 0;
-	
-	x1 = x1_gapColumn;	     	    
-	x2 = x2_gapColumn;	    
-	x3 = x3_gapColumn;
-	
-	for (j = 0; j < 4; j++)
-	  {
-	    
-	    double *x1_p = &x1[j*4];
-	    double *left_k0_p = &left[j*16];
-	    double *left_k1_p = &left[j*16 + 1*4];
-	    double *left_k2_p = &left[j*16 + 2*4];
-	    double *left_k3_p = &left[j*16 + 3*4];
-	    
-	    __m128d x1_0 = _mm_load_pd( &x1_p[0] );
-	    __m128d x1_2 = _mm_load_pd( &x1_p[2] );
-	    
-	    __m128d left_k0_0 = _mm_load_pd( &left_k0_p[0] );
-	    __m128d left_k0_2 = _mm_load_pd( &left_k0_p[2] );
-	    __m128d left_k1_0 = _mm_load_pd( &left_k1_p[0] );
-	    __m128d left_k1_2 = _mm_load_pd( &left_k1_p[2] );
-	    __m128d left_k2_0 = _mm_load_pd( &left_k2_p[0] );
-	    __m128d left_k2_2 = _mm_load_pd( &left_k2_p[2] );
-	    __m128d left_k3_0 = _mm_load_pd( &left_k3_p[0] );
-	    __m128d left_k3_2 = _mm_load_pd( &left_k3_p[2] );
-	    
-	    left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-	    left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-	    
-	    left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-	    left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-	    
-	    left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-	    left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-	    left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-	    
-	    left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-	    left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-	    
-	    left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-	    left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-	    
-	    left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-	    left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-	    left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-	    
-	    
-	    double *x2_p = &x2[j*4];
-	    double *right_k0_p = &right[j*16];
-	    double *right_k1_p = &right[j*16 + 1*4];
-	    double *right_k2_p = &right[j*16 + 2*4];
-	    double *right_k3_p = &right[j*16 + 3*4];
-	    __m128d x2_0 = _mm_load_pd( &x2_p[0] );
-	    __m128d x2_2 = _mm_load_pd( &x2_p[2] );
-	    
-	    __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
-	    __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
-	    __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
-	    __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
-	    __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
-	    __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
-	    __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
-	    __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
-	    
-	    right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-	    right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-	    
-	    right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-	    right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-	    
-	    right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-	    right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-	    right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-	    
-	    right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-	    right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-	    	    
-	    right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-	    right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-	    
-	    right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-	    right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-	    right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   		 		
-	    
-	    __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-	    __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );		 		 	   
-	    
-	    __m128d EV_t_l0_k0 = EVV[0];
-	    __m128d EV_t_l0_k2 = EVV[1];
-	    __m128d EV_t_l1_k0 = EVV[2];
-	    __m128d EV_t_l1_k2 = EVV[3];
-	    __m128d EV_t_l2_k0 = EVV[4];
-	    __m128d EV_t_l2_k2 = EVV[5];
-	    __m128d EV_t_l3_k0 = EVV[6]; 
-	    __m128d EV_t_l3_k2 = EVV[7];
-	    
-	    EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	    EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-	    
-	    EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	    EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-	    
-	    EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-	    
-	    EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	    EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-	    
-	    EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	    EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	    EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-	    
-	    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
-	    
-	    
-	    values[j * 2] = EV_t_l0_k0;
-	    values[j * 2 + 1] = EV_t_l2_k0;            	   	    
-	    
-	    maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
-	    maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));
-	  }
-		     
-	_mm_store_pd(maxima, maxv);
-	
-	max = MAX(maxima[0], maxima[1]);
-	
-	if(max < minlikelihood)
-	  {
-	    __m128d sv = _mm_set1_pd(twotothe256);
-	    
-	    scaleGap = 1;
-	    
-	    _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));	   
-	    _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
-	    _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
-	    _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
-	    _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));	   
-	    _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
-	    _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
-	    _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));	     	    	 
-	  }
-	else
-	  {
-	    _mm_store_pd(&x3[0], values[0]);	   
-	    _mm_store_pd(&x3[2], values[1]);
-	    _mm_store_pd(&x3[4], values[2]);
-	    _mm_store_pd(&x3[6], values[3]);
-	    _mm_store_pd(&x3[8], values[4]);	   
-	    _mm_store_pd(&x3[10], values[5]);
-	    _mm_store_pd(&x3[12], values[6]);
-	    _mm_store_pd(&x3[14], values[7]);
-	  }
-      }
-
-     
-      x3 = x3_start;
-
-     for (i = 0; i < n; i++)
-       { 
-	 if(x3_gap[i / 32] & mask32[i % 32])
-	   {	     
-	     if(scaleGap)
-	       {		 
-		 addScale += wgt[i];		 	       
-	       }
-	   }
-	 else
-	   {
-	     __m128d maxv =_mm_setzero_pd();	     	    
-	     
-	     if(x1_gap[i / 32] & mask32[i % 32])
-	       x1 = x1_gapColumn;
-	     else
-	       {
-		 x1 = x1_ptr;
-		 x1_ptr += 16;
-	       }
-	     
-	     if(x2_gap[i / 32] & mask32[i % 32])
-	       x2 = x2_gapColumn;
-	     else
-	       {
-		 x2 = x2_ptr;
-		 x2_ptr += 16;
-	       }
-	     
-	     
-	     for (j = 0; j < 4; j++)
-	       {
-		 
-		 double *x1_p = &x1[j*4];
-		 double *left_k0_p = &left[j*16];
-		 double *left_k1_p = &left[j*16 + 1*4];
-		 double *left_k2_p = &left[j*16 + 2*4];
-		 double *left_k3_p = &left[j*16 + 3*4];
-		 
-		 __m128d x1_0 = _mm_load_pd( &x1_p[0] );
-		 __m128d x1_2 = _mm_load_pd( &x1_p[2] );
-		 
-		 __m128d left_k0_0 = _mm_load_pd( &left_k0_p[0] );
-		 __m128d left_k0_2 = _mm_load_pd( &left_k0_p[2] );
-		 __m128d left_k1_0 = _mm_load_pd( &left_k1_p[0] );
-		 __m128d left_k1_2 = _mm_load_pd( &left_k1_p[2] );
-		 __m128d left_k2_0 = _mm_load_pd( &left_k2_p[0] );
-		 __m128d left_k2_2 = _mm_load_pd( &left_k2_p[2] );
-		 __m128d left_k3_0 = _mm_load_pd( &left_k3_p[0] );
-		 __m128d left_k3_2 = _mm_load_pd( &left_k3_p[2] );
-		 
-		 left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-		 left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-		 
-		 left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-		 left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-		 
-		 left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-		 left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-		 left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-		 
-		 left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-		 left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-		 
-		 left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-		 left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-		 
-		 left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-		 left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-		 left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-		 
-		 
-		 //
-		 // multiply/add right side
-		 //
-		 double *x2_p = &x2[j*4];
-		 double *right_k0_p = &right[j*16];
-		 double *right_k1_p = &right[j*16 + 1*4];
-		 double *right_k2_p = &right[j*16 + 2*4];
-		 double *right_k3_p = &right[j*16 + 3*4];
-		 __m128d x2_0 = _mm_load_pd( &x2_p[0] );
-		 __m128d x2_2 = _mm_load_pd( &x2_p[2] );
-		 
-		 __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
-		 __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
-		 __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
-		 __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
-		 __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
-		 __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
-		 __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
-		 __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
-		 
-		 right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-		 right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-		 
-		 right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-		 right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-		 
-		 right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-		 right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-		 right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-		 
-		 right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-		 right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-		 
-		 
-		 right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-		 right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-		 
-		 right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-		 right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-		 right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
-		 
-		 //
-		 // multiply left * right
-		 //
-		 
-		 __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-		 __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
-		 
-		 
-		 //
-		 // multiply with the transposed eigenvector matrix EV_t
-		 //	     
-		 
-		 __m128d EV_t_l0_k0 = EVV[0];
-		 __m128d EV_t_l0_k2 = EVV[1];
-		 __m128d EV_t_l1_k0 = EVV[2];
-		 __m128d EV_t_l1_k2 = EVV[3];
-		 __m128d EV_t_l2_k0 = EVV[4];
-		 __m128d EV_t_l2_k2 = EVV[5];
-		 __m128d EV_t_l3_k0 = EVV[6]; 
-		 __m128d EV_t_l3_k2 = EVV[7];
-		 
-		 
-		 EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-		 EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-		 EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-		 
-		 EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-		 EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-		 
-		 EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-		 EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-		 
-		 EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-		 EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-		 EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-		 
-		 
-		 EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-		 EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-		 EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-		 
-		 EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
-		 
-		 
-		 values[j * 2] = EV_t_l0_k0;
-		 values[j * 2 + 1] = EV_t_l2_k0;            	   	    
-		 
-		 maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
-		 maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));
-	       }
-	     
-	     
-	     _mm_store_pd(maxima, maxv);
-	     
-	     max = MAX(maxima[0], maxima[1]);
-	     
-	     if(max < minlikelihood)
-	       {
-		 __m128d sv = _mm_set1_pd(twotothe256);
-		 
-		 _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));	   
-		 _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
-		 _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
-		 _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
-		 _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));	   
-		 _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
-		 _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
-		 _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));	     
-		 
-		 
-		 addScale += wgt[i];
-		
-	       }
-	     else
-	       {
-		 _mm_store_pd(&x3[0], values[0]);	   
-		 _mm_store_pd(&x3[2], values[1]);
-		 _mm_store_pd(&x3[4], values[2]);
-		 _mm_store_pd(&x3[6], values[3]);
-		 _mm_store_pd(&x3[8], values[4]);	   
-		 _mm_store_pd(&x3[10], values[5]);
-		 _mm_store_pd(&x3[12], values[6]);
-		 _mm_store_pd(&x3[14], values[7]);
-	       }	 
-
-	    
-		 
-	     x3 += 16;
-
-	   }
-       }
-     break;
-    default:
-      assert(0);
-    }
-  
- 
-  *scalerIncrement = addScale;
-}
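/*
 * The kernels above reduce a 4-entry likelihood vector against the rows of
 * a 4x4 transition matrix with the same mul/hadd idiom over and over.  A
 * condensed, self-contained sketch of that idiom in the SIMDe spelling this
 * patch switches to: dot4x4() is a hypothetical helper, the
 * <simde/x86/sse3.h> include assumes a system-wide SIMDe install (the patch
 * itself points at the vendored copy under debian/include), and unaligned
 * loads keep the sketch free of the BYTE_ALIGNMENT requirement.
 */
#include <simde/x86/sse3.h>

/* d[r] = dot(m[4r .. 4r+3], x) for the four rows of a row-major 4x4 matrix.
   simde_mm_hadd_pd(a, b) yields { a[0]+a[1], b[0]+b[1] }, so two hadd
   levels collapse each row's four partial products into one lane. */
static void dot4x4(const double m[16], const double x[4], double d[4])
{
  simde__m128d x_0 = simde_mm_loadu_pd(&x[0]);
  simde__m128d x_2 = simde_mm_loadu_pd(&x[2]);

  simde__m128d r0_0 = simde_mm_mul_pd(x_0, simde_mm_loadu_pd(&m[0]));
  simde__m128d r0_2 = simde_mm_mul_pd(x_2, simde_mm_loadu_pd(&m[2]));
  simde__m128d r1_0 = simde_mm_mul_pd(x_0, simde_mm_loadu_pd(&m[4]));
  simde__m128d r1_2 = simde_mm_mul_pd(x_2, simde_mm_loadu_pd(&m[6]));
  simde__m128d r2_0 = simde_mm_mul_pd(x_0, simde_mm_loadu_pd(&m[8]));
  simde__m128d r2_2 = simde_mm_mul_pd(x_2, simde_mm_loadu_pd(&m[10]));
  simde__m128d r3_0 = simde_mm_mul_pd(x_0, simde_mm_loadu_pd(&m[12]));
  simde__m128d r3_2 = simde_mm_mul_pd(x_2, simde_mm_loadu_pd(&m[14]));

  simde_mm_storeu_pd(&d[0],
    simde_mm_hadd_pd(simde_mm_hadd_pd(r0_0, r0_2),
                     simde_mm_hadd_pd(r1_0, r1_2)));
  simde_mm_storeu_pd(&d[2],
    simde_mm_hadd_pd(simde_mm_hadd_pd(r2_0, r2_2),
                     simde_mm_hadd_pd(r3_0, r3_2)));
}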
-
-
-static void newviewGTRGAMMA(int tipCase,
-			    double *x1_start, double *x2_start, double *x3_start,
-			    double *EV, double *tipVector,
-			    unsigned char *tipX1, unsigned char *tipX2,
-			    const int n, double *left, double *right, int *wgt, int *scalerIncrement
-			    )
-{
-  int 
-    i, 
-    j, 
-    k, 
-    l,
-    addScale = 0;
-  
-  double
-    *x1,
-    *x2,
-    *x3,
-    max,
-    maxima[2] __attribute__ ((aligned (BYTE_ALIGNMENT))),       
-    EV_t[16] __attribute__ ((aligned (BYTE_ALIGNMENT)));      
-    
-  __m128d 
-    values[8],
-    EVV[8];  
-
-  for(k = 0; k < 4; k++)
-    for (l=0; l < 4; l++)
-      EV_t[4 * l + k] = EV[4 * k + l];
-
-  for(k = 0; k < 8; k++)
-    EVV[k] = _mm_load_pd(&EV_t[k * 2]);
-   
-  switch(tipCase)
-    {
-    case TIP_TIP:
-      {
-	double *uX1, umpX1[256] __attribute__ ((aligned (BYTE_ALIGNMENT))), *uX2, umpX2[256] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-
-
-	for (i = 1; i < 16; i++)
-	  {
-	    __m128d x1_1 = _mm_load_pd(&(tipVector[i*4]));
-	    __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2]));	   
-
-	    for (j = 0; j < 4; j++)
-	      for (k = 0; k < 4; k++)
-		{		 
-		  __m128d left1 = _mm_load_pd(&left[j*16 + k*4]);
-		  __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]);
-		  
-		  __m128d acc = _mm_setzero_pd();
-
-		  acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
-		  acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
-		  		  
-		  acc = _mm_hadd_pd(acc, acc);
-		  _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);
-		}
-	  
-	    for (j = 0; j < 4; j++)
-	      for (k = 0; k < 4; k++)
-		{
-		  __m128d left1 = _mm_load_pd(&right[j*16 + k*4]);
-		  __m128d left2 = _mm_load_pd(&right[j*16 + k*4 + 2]);
-		  
-		  __m128d acc = _mm_setzero_pd();
-
-		  acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
-		  acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
-		  		  
-		  acc = _mm_hadd_pd(acc, acc);
-		  _mm_storel_pd(&umpX2[i*16 + j*4 + k], acc);
-		 
-		}
-	  }   	
-	  
-	for (i = 0; i < n; i++)
-	  {
-	    x3 = &x3_start[i * 16];
-
-	    
-	    uX1 = &umpX1[16 * tipX1[i]];
-	    uX2 = &umpX2[16 * tipX2[i]];	   	    	    
-	    
-	    for (j = 0; j < 4; j++)
-	       {				 		  		  		   
-		 __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
-		 __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
-		 				  
-		   
-		 __m128d uX2_k0_sse = _mm_load_pd( &uX2[j * 4] );
-		 __m128d uX2_k2_sse = _mm_load_pd( &uX2[j * 4 + 2] );
- 		 
-
-		 //
-		 // multiply left * right
-		 //
-		 
-		 __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, uX2_k0_sse );
-		 __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, uX2_k2_sse );
-		 
-		 
-		 //
-		 // multiply with the transposed eigenvector matrix EV_t
-		 //
-		 
-		 __m128d EV_t_l0_k0 = EVV[0];
-		 __m128d EV_t_l0_k2 = EVV[1];
-		 __m128d EV_t_l1_k0 = EVV[2];
-		 __m128d EV_t_l1_k2 = EVV[3];
-		 __m128d EV_t_l2_k0 = EVV[4];
-		 __m128d EV_t_l2_k2 = EVV[5];
-		 __m128d EV_t_l3_k0 = EVV[6]; 
-		 __m128d EV_t_l3_k2 = EVV[7];
-		 
-		 EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-		 EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-		 EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-		 
-		 EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-		 EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-		 
-		 EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-		 EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-		 
-		 EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-		 EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-		 EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-		 
-		 EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-		 EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-		 EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-		 
-		 EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
-		 
-		 _mm_store_pd( &x3[j * 4 + 0], EV_t_l0_k0 );
-		 _mm_store_pd( &x3[j * 4 + 2], EV_t_l2_k0 );
-	       }
-	  }
-      }
-      break;
-    case TIP_INNER:
-      {	
-	double *uX1, umpX1[256] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-
-
-	for (i = 1; i < 16; i++)
-	  {
-	    __m128d x1_1 = _mm_load_pd(&(tipVector[i*4]));
-	    __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2]));	   
-
-	    for (j = 0; j < 4; j++)
-	      for (k = 0; k < 4; k++)
-		{		 
-		  __m128d left1 = _mm_load_pd(&left[j*16 + k*4]);
-		  __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]);
-		  
-		  __m128d acc = _mm_setzero_pd();
-
-		  acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
-		  acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
-		  		  
-		  acc = _mm_hadd_pd(acc, acc);
-		  _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);		 
-		}
-	  }
-
-	 for (i = 0; i < n; i++)
-	   {
-	     __m128d maxv =_mm_setzero_pd();
-	     
-	     x2 = &x2_start[i * 16];
-	     x3 = &x3_start[i * 16];
-
-	     uX1 = &umpX1[16 * tipX1[i]];	     
-
-	     for (j = 0; j < 4; j++)
-	       {
-
-		 //
-		 // multiply/add right side
-		 //
-		 double *x2_p = &x2[j*4];
-		 double *right_k0_p = &right[j*16];
-		 double *right_k1_p = &right[j*16 + 1*4];
-		 double *right_k2_p = &right[j*16 + 2*4];
-		 double *right_k3_p = &right[j*16 + 3*4];
-		 __m128d x2_0 = _mm_load_pd( &x2_p[0] );
-		 __m128d x2_2 = _mm_load_pd( &x2_p[2] );
-
-		 __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
-		 __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
-		 __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
-		 __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
-		 __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
-		 __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
-		 __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
-		 __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
-
-
-
-		 right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-		 right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-
-		 right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-		 right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-
-		 right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-		 right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-		 right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-
-
-		 right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-		 right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-
-
-		 right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-		 right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-
-		 right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-		 right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-		 right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
-
-		 {
-		   //
-		   // load left side from tip vector
-		   //
-		   
-		   __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
-		   __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
-		 
-		 
-		   //
-		   // multiply left * right
-		   //
-		   
-		   __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, right_k0_0 );
-		   __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, right_k2_0 );
-		   
-		   
-		   //
-		   // multiply with the transposed eigenvector matrix EV_t
-		   //		   		  
-
-		   __m128d EV_t_l0_k0 = EVV[0];
-		   __m128d EV_t_l0_k2 = EVV[1];
-		   __m128d EV_t_l1_k0 = EVV[2];
-		   __m128d EV_t_l1_k2 = EVV[3];
-		   __m128d EV_t_l2_k0 = EVV[4];
-		   __m128d EV_t_l2_k2 = EVV[5];
-		   __m128d EV_t_l3_k0 = EVV[6]; 
-		   __m128d EV_t_l3_k2 = EVV[7];
-
-		   
-		   EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-		   EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-		   EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-		   
-		   EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-		   EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-		   
-		   EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-		   EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-		   
-		   EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-		   EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-		   EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-		   		   
-		   EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-		   EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-		   EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-		   
-		   EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
-		   
-		   values[j * 2]     = EV_t_l0_k0;
-		   values[j * 2 + 1] = EV_t_l2_k0;		   		   
-		   
-		   maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
-		   maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));		   
-		 }
-	       }
-
-	     
-	     _mm_store_pd(maxima, maxv);
-
-	     max = MAX(maxima[0], maxima[1]);
-
-	     if(max < minlikelihood)
-	       {
-		 __m128d sv = _mm_set1_pd(twotothe256);
-	       		       	   	 	     
-		 _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));	   
-		 _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
-		 _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
-		 _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
-		 _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));	   
-		 _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
-		 _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
-		 _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));	     
-		 
-		 
-		 addScale += wgt[i];
-		 
-	       }
-	     else
-	       {
-		 _mm_store_pd(&x3[0], values[0]);	   
-		 _mm_store_pd(&x3[2], values[1]);
-		 _mm_store_pd(&x3[4], values[2]);
-		 _mm_store_pd(&x3[6], values[3]);
-		 _mm_store_pd(&x3[8], values[4]);	   
-		 _mm_store_pd(&x3[10], values[5]);
-		 _mm_store_pd(&x3[12], values[6]);
-		 _mm_store_pd(&x3[14], values[7]);
-	       }
-	   }
-      }
-      break;
-    case INNER_INNER:     
-     for (i = 0; i < n; i++)
-       {
-	 __m128d maxv =_mm_setzero_pd();
-	 
-
-	 x1 = &x1_start[i * 16];
-	 x2 = &x2_start[i * 16];
-	 x3 = &x3_start[i * 16];
-	 
-	 for (j = 0; j < 4; j++)
-	   {
-	     
-	     double *x1_p = &x1[j*4];
-	     double *left_k0_p = &left[j*16];
-	     double *left_k1_p = &left[j*16 + 1*4];
-	     double *left_k2_p = &left[j*16 + 2*4];
-	     double *left_k3_p = &left[j*16 + 3*4];
-	     
-	     __m128d x1_0 = _mm_load_pd( &x1_p[0] );
-	     __m128d x1_2 = _mm_load_pd( &x1_p[2] );
-	     
-	     __m128d left_k0_0 = _mm_load_pd( &left_k0_p[0] );
-	     __m128d left_k0_2 = _mm_load_pd( &left_k0_p[2] );
-	     __m128d left_k1_0 = _mm_load_pd( &left_k1_p[0] );
-	     __m128d left_k1_2 = _mm_load_pd( &left_k1_p[2] );
-	     __m128d left_k2_0 = _mm_load_pd( &left_k2_p[0] );
-	     __m128d left_k2_2 = _mm_load_pd( &left_k2_p[2] );
-	     __m128d left_k3_0 = _mm_load_pd( &left_k3_p[0] );
-	     __m128d left_k3_2 = _mm_load_pd( &left_k3_p[2] );
-	     
-	     left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-	     left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-	     
-	     left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-	     left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-	     
-	     left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-	     left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-	     left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-	     
-	     left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-	     left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-	     
-	     left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-	     left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-	     
-	     left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-	     left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-	     left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-	     
-	     
-	     //
-	     // multiply/add right side
-	     //
-	     double *x2_p = &x2[j*4];
-	     double *right_k0_p = &right[j*16];
-	     double *right_k1_p = &right[j*16 + 1*4];
-	     double *right_k2_p = &right[j*16 + 2*4];
-	     double *right_k3_p = &right[j*16 + 3*4];
-	     __m128d x2_0 = _mm_load_pd( &x2_p[0] );
-	     __m128d x2_2 = _mm_load_pd( &x2_p[2] );
-	     
-	     __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
-	     __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
-	     __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
-	     __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
-	     __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
-	     __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
-	     __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
-	     __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
-	     
-	     right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-	     right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-	     
-	     right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-	     right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-	     
-	     right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-	     right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-	     right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-	     
-	     right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-	     right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-	     
-	     
-	     right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-	     right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-	     
-	     right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-	     right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-	     right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
-
-	     //
-	     // multiply left * right
-	     //
-
-	     __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-	     __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
-
-	     //
-	     // multiply with the transposed eigenvector matrix EV_t
-	     //
-
-	     __m128d EV_t_l0_k0 = EVV[0];
-	     __m128d EV_t_l0_k2 = EVV[1];
-	     __m128d EV_t_l1_k0 = EVV[2];
-	     __m128d EV_t_l1_k2 = EVV[3];
-	     __m128d EV_t_l2_k0 = EVV[4];
-	     __m128d EV_t_l2_k2 = EVV[5];
-	     __m128d EV_t_l3_k0 = EVV[6]; 
-	     __m128d EV_t_l3_k2 = EVV[7];
-
-
-	    EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	    EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-
-	    EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	    EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-
-	    EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-
-	    EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	    EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-
-
-	    EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	    EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	    EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-
-	    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
-
-	    values[j * 2] = EV_t_l0_k0;
-	    values[j * 2 + 1] = EV_t_l2_k0;
-
-	    maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
-	    maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));
-	   }
-	 	 
-	 
-	 _mm_store_pd(maxima, maxv);
-	 
-	 max = MAX(maxima[0], maxima[1]);
-	 
-	 if(max < minlikelihood)
-	   {
-	     __m128d sv = _mm_set1_pd(twotothe256);
-	       		       	   	 	     
-	     _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));	   
-	     _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
-	     _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
-	     _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
-	     _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));	   
-	     _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
-	     _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
-	     _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));	     
-	     
-	    
-	     addScale += wgt[i];
-	    
-	   }
-	 else
-	   {
-	     _mm_store_pd(&x3[0], values[0]);	   
-	     _mm_store_pd(&x3[2], values[1]);
-	     _mm_store_pd(&x3[4], values[2]);
-	     _mm_store_pd(&x3[6], values[3]);
-	     _mm_store_pd(&x3[8], values[4]);	   
-	     _mm_store_pd(&x3[10], values[5]);
-	     _mm_store_pd(&x3[12], values[6]);
-	     _mm_store_pd(&x3[14], values[7]);
-	   }	 
-       }
-   
-     break;
-    default:
-      assert(0);
-    }
-  
- 
-  *scalerIncrement = addScale;
-
-}
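/*
 * Every GAMMA kernel above ends the same way: AND each freshly computed
 * entry with absMask to clear its sign bit, fold the running maximum down
 * to a scalar, and if even the largest magnitude dropped below
 * minlikelihood, multiply the whole 16-double site vector by twotothe256
 * and credit wgt[i] scaling events.  A minimal sketch of that epilogue in
 * the SIMDe spelling; rescale_site() is a hypothetical stand-in, not ExaML
 * API, and the include path assumes a system-wide SIMDe install.
 */
#include <stdint.h>
#include <simde/x86/sse3.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

/* Returns 1 (and rescales v in place) iff all 16 entries underflowed. */
static int rescale_site(double v[16], double minlik, double twoto)
{
  const union { uint64_t u[2]; simde__m128d m; } absmask =
    {{ 0x7fffffffffffffffULL, 0x7fffffffffffffffULL }};

  simde__m128d maxv = simde_mm_setzero_pd();
  double maxima[2];
  int k;

  for(k = 0; k < 16; k += 2)               /* running max over |v[0..15]| */
    maxv = simde_mm_max_pd(maxv,
             simde_mm_and_pd(simde_mm_loadu_pd(&v[k]), absmask.m));

  simde_mm_storeu_pd(maxima, maxv);

  if(MAX(maxima[0], maxima[1]) >= minlik)
    return 0;                              /* no underflow, nothing to do */

  {
    simde__m128d sv = simde_mm_set1_pd(twoto);

    for(k = 0; k < 16; k += 2)             /* pull the site back into range */
      simde_mm_storeu_pd(&v[k],
        simde_mm_mul_pd(simde_mm_loadu_pd(&v[k]), sv));
  }
  return 1;                                /* caller adds wgt[i] to addScale */
}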
-static void newviewGTRCAT( int tipCase,  double *EV,  int *cptr,
-			   double *x1_start, double *x2_start,  double *x3_start, double *tipVector,
-			   unsigned char *tipX1, unsigned char *tipX2,
-			   int n,  double *left, double *right, int *wgt, int *scalerIncrement)
-{
-  double
-    *le,
-    *ri,
-    *x1,
-    *x2, 
-    *x3, 
-    EV_t[16] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-    
-  int 
-    i, 
-    j, 
-    scale, 
-    addScale = 0;
-   
-  __m128d
-    minlikelihood_sse = _mm_set1_pd( minlikelihood ),
-    sc = _mm_set1_pd(twotothe256),
-    EVV[8];  
-  
-  for(i = 0; i < 4; i++)
-    for (j=0; j < 4; j++)
-      EV_t[4 * j + i] = EV[4 * i + j];
-  
-  for(i = 0; i < 8; i++)
-    EVV[i] = _mm_load_pd(&EV_t[i * 2]);
-  
-  switch(tipCase)
-    {
-    case TIP_TIP:      
-      for (i = 0; i < n; i++)
-	{	 
-	  x1 = &(tipVector[4 * tipX1[i]]);
-	  x2 = &(tipVector[4 * tipX2[i]]);
-	  
-	  x3 = &x3_start[i * 4];
-	  
-	  le =  &left[cptr[i] * 16];
-	  ri =  &right[cptr[i] * 16];
-	  
-	  __m128d x1_0 = _mm_load_pd( &x1[0] );
-	  __m128d x1_2 = _mm_load_pd( &x1[2] );
-	  
-	  __m128d left_k0_0 = _mm_load_pd( &le[0] );
-	  __m128d left_k0_2 = _mm_load_pd( &le[2] );
-	  __m128d left_k1_0 = _mm_load_pd( &le[4] );
-	  __m128d left_k1_2 = _mm_load_pd( &le[6] );
-	  __m128d left_k2_0 = _mm_load_pd( &le[8] );
-	  __m128d left_k2_2 = _mm_load_pd( &le[10] );
-	  __m128d left_k3_0 = _mm_load_pd( &le[12] );
-	  __m128d left_k3_2 = _mm_load_pd( &le[14] );
-	  
-	  left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-	  left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-	  
-	  left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-	  left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-	  
-	  left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-	  left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-	  left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-	  
-	  left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-	  left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-	  
-	  left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-	  left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-	  
-	  left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-	  left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-	  left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-	  
-	  __m128d x2_0 = _mm_load_pd( &x2[0] );
-	  __m128d x2_2 = _mm_load_pd( &x2[2] );
-	  
-	  __m128d right_k0_0 = _mm_load_pd( &ri[0] );
-	  __m128d right_k0_2 = _mm_load_pd( &ri[2] );
-	  __m128d right_k1_0 = _mm_load_pd( &ri[4] );
-	  __m128d right_k1_2 = _mm_load_pd( &ri[6] );
-	  __m128d right_k2_0 = _mm_load_pd( &ri[8] );
-	  __m128d right_k2_2 = _mm_load_pd( &ri[10] );
-	  __m128d right_k3_0 = _mm_load_pd( &ri[12] );
-	  __m128d right_k3_2 = _mm_load_pd( &ri[14] );
-	  
-	  right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-	  right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-	  
-	  right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-	  right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-	  
-	  right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-	  right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-	  right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-	  
-	  right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-	  right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-	  
-	  right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-	  right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-	  
-	  right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-	  right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-	  right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
-	  
-	  __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-	  __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );	  	  
-
-	  __m128d EV_t_l0_k0 = EVV[0];
-	  __m128d EV_t_l0_k2 = EVV[1];
-	  __m128d EV_t_l1_k0 = EVV[2];
-	  __m128d EV_t_l1_k2 = EVV[3];
-	  __m128d EV_t_l2_k0 = EVV[4];
-	  __m128d EV_t_l2_k2 = EVV[5];
-	  __m128d EV_t_l3_k0 = EVV[6];
-	  __m128d EV_t_l3_k2 = EVV[7];
-	  
-	  EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	  EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	  EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-	  
-	  EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	  EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-	  
-	  EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	  EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-	  
-	  EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	  EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	  EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-	  	  
-	  EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	  EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	  EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-	  
-	  EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	 
-	  	  
-	  _mm_store_pd(x3, EV_t_l0_k0);
-	  _mm_store_pd(&x3[2], EV_t_l2_k0);	  	 	   	    
-	}
-      break;
-    case TIP_INNER:      
-      for (i = 0; i < n; i++)
-	{
-	  x1 = &(tipVector[4 * tipX1[i]]);
-	  x2 = &x2_start[4 * i];
-	  x3 = &x3_start[4 * i];
-	  
-	  le =  &left[cptr[i] * 16];
-	  ri =  &right[cptr[i] * 16];
-
-	  __m128d x1_0 = _mm_load_pd( &x1[0] );
-	  __m128d x1_2 = _mm_load_pd( &x1[2] );
-	  
-	  __m128d left_k0_0 = _mm_load_pd( &le[0] );
-	  __m128d left_k0_2 = _mm_load_pd( &le[2] );
-	  __m128d left_k1_0 = _mm_load_pd( &le[4] );
-	  __m128d left_k1_2 = _mm_load_pd( &le[6] );
-	  __m128d left_k2_0 = _mm_load_pd( &le[8] );
-	  __m128d left_k2_2 = _mm_load_pd( &le[10] );
-	  __m128d left_k3_0 = _mm_load_pd( &le[12] );
-	  __m128d left_k3_2 = _mm_load_pd( &le[14] );
-	  
-	  left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-	  left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-	  
-	  left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-	  left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-	  
-	  left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-	  left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-	  left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-	  
-	  left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-	  left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-	  
-	  left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-	  left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-	  
-	  left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-	  left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-	  left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-	  
-	  __m128d x2_0 = _mm_load_pd( &x2[0] );
-	  __m128d x2_2 = _mm_load_pd( &x2[2] );
-	  
-	  __m128d right_k0_0 = _mm_load_pd( &ri[0] );
-	  __m128d right_k0_2 = _mm_load_pd( &ri[2] );
-	  __m128d right_k1_0 = _mm_load_pd( &ri[4] );
-	  __m128d right_k1_2 = _mm_load_pd( &ri[6] );
-	  __m128d right_k2_0 = _mm_load_pd( &ri[8] );
-	  __m128d right_k2_2 = _mm_load_pd( &ri[10] );
-	  __m128d right_k3_0 = _mm_load_pd( &ri[12] );
-	  __m128d right_k3_2 = _mm_load_pd( &ri[14] );
-	  
-	  right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-	  right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-	  
-	  right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-	  right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-	  
-	  right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-	  right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-	  right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-	  
-	  right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-	  right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-	  
-	  right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-	  right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-	  
-	  right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-	  right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-	  right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
-	  
-	  __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-	  __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
-	  
-	  __m128d EV_t_l0_k0 = EVV[0];
-	  __m128d EV_t_l0_k2 = EVV[1];
-	  __m128d EV_t_l1_k0 = EVV[2];
-	  __m128d EV_t_l1_k2 = EVV[3];
-	  __m128d EV_t_l2_k0 = EVV[4];
-	  __m128d EV_t_l2_k2 = EVV[5];
-	  __m128d EV_t_l3_k0 = EVV[6];
-	  __m128d EV_t_l3_k2 = EVV[7];
-	 
-	  
-	  EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	  EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	  EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-	  
-	  EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	  EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-	  
-	  EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	  EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-	  
-	  EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	  EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	  EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-	  	  
-	  EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	  EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	  EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-	  
-	  EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	  	 	    		  
-	 
-	  scale = 1;
-	  	  	  	    
-	  __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
-	  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	  if(_mm_movemask_pd( v1 ) != 3)
-	    scale = 0;
-	  else
-	    {
-	      v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
-	      v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	      if(_mm_movemask_pd( v1 ) != 3)
-		scale = 0;
-	    }
-	  	  
-	  if(scale)
-	    {		      
-	      _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
-	      _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));	      	      
-	      
-	      
-	      addScale += wgt[i];	  
-	    }	
-	  else
-	    {
-	      _mm_store_pd(x3, EV_t_l0_k0);
-	      _mm_store_pd(&x3[2], EV_t_l2_k0);
-	    }
-	 
-	  	  
-	}
-      break;
-    case INNER_INNER:
-      for (i = 0; i < n; i++)
-	{
-	  x1 = &x1_start[4 * i];
-	  x2 = &x2_start[4 * i];
-	  x3 = &x3_start[4 * i];
-	  
-	  le =  &left[cptr[i] * 16];
-	  ri =  &right[cptr[i] * 16];
-
-	  __m128d x1_0 = _mm_load_pd( &x1[0] );
-	  __m128d x1_2 = _mm_load_pd( &x1[2] );
-	  
-	  __m128d left_k0_0 = _mm_load_pd( &le[0] );
-	  __m128d left_k0_2 = _mm_load_pd( &le[2] );
-	  __m128d left_k1_0 = _mm_load_pd( &le[4] );
-	  __m128d left_k1_2 = _mm_load_pd( &le[6] );
-	  __m128d left_k2_0 = _mm_load_pd( &le[8] );
-	  __m128d left_k2_2 = _mm_load_pd( &le[10] );
-	  __m128d left_k3_0 = _mm_load_pd( &le[12] );
-	  __m128d left_k3_2 = _mm_load_pd( &le[14] );
-	  
-	  left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-	  left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-	  
-	  left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-	  left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-	  
-	  left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-	  left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-	  left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-	  
-	  left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-	  left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-	  
-	  left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-	  left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-	  
-	  left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-	  left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-	  left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-	  
-	  __m128d x2_0 = _mm_load_pd( &x2[0] );
-	  __m128d x2_2 = _mm_load_pd( &x2[2] );
-	  
-	  __m128d right_k0_0 = _mm_load_pd( &ri[0] );
-	  __m128d right_k0_2 = _mm_load_pd( &ri[2] );
-	  __m128d right_k1_0 = _mm_load_pd( &ri[4] );
-	  __m128d right_k1_2 = _mm_load_pd( &ri[6] );
-	  __m128d right_k2_0 = _mm_load_pd( &ri[8] );
-	  __m128d right_k2_2 = _mm_load_pd( &ri[10] );
-	  __m128d right_k3_0 = _mm_load_pd( &ri[12] );
-	  __m128d right_k3_2 = _mm_load_pd( &ri[14] );
-	  
-	  right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-	  right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-	  
-	  right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-	  right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-	  
-	  right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-	  right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-	  right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-	  
-	  right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-	  right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-	  
-	  right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-	  right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-	  
-	  right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-	  right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-	  right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
-	  
-	  __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-	  __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
-	  
-	  __m128d EV_t_l0_k0 = EVV[0];
-	  __m128d EV_t_l0_k2 = EVV[1];
-	  __m128d EV_t_l1_k0 = EVV[2];
-	  __m128d EV_t_l1_k2 = EVV[3];
-	  __m128d EV_t_l2_k0 = EVV[4];
-	  __m128d EV_t_l2_k2 = EVV[5];
-	  __m128d EV_t_l3_k0 = EVV[6];
-	  __m128d EV_t_l3_k2 = EVV[7];
-	 
-	  
-	  EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	  EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	  EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-	  
-	  EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	  EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-	  
-	  EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	  EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-	  
-	  EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	  EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	  EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-	  	  
-	  EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	  EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	  EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-	  
-	  EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	  	 	    		  	 
-
-	  scale = 1;
-	  	  
-	  __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
-	  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	  if(_mm_movemask_pd( v1 ) != 3)
-	    scale = 0;
-	  else
-	    {
-	      v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
-	      v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	      if(_mm_movemask_pd( v1 ) != 3)
-		scale = 0;
-	    }
-	  	  
-	  if(scale)
-	    {		      
-	      _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
-	      _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));	      	      
-	      
-	      
-	      addScale += wgt[i];	  
-	    }	
-	  else
-	    {
-	      _mm_store_pd(x3, EV_t_l0_k0);
-	      _mm_store_pd(&x3[2], EV_t_l2_k0);
-	    }
-	  	  
-	}
-      break;
-    default:
-      assert(0);
-    }
-
-  
-  *scalerIncrement = addScale;
-}
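/*
 * The CAT kernels decide whether to scale with a branch test instead of a
 * max-reduction: AND away the sign bits, compare all four lanes against
 * minlikelihood with _mm_cmplt_pd, and scale only when _mm_movemask_pd
 * reports both lanes true (== 3) for both register halves.  A sketch of
 * that test in the SIMDe spelling; needs_scaling() is a hypothetical
 * helper, not ExaML API.
 */
#include <simde/x86/sse2.h>

/* Non-zero iff all four doubles in lo/hi are smaller in magnitude than
   minlik; absmask holds the sign-clearing constant used above. */
static int needs_scaling(simde__m128d lo, simde__m128d hi,
                         simde__m128d absmask, simde__m128d minlik)
{
  simde__m128d v = simde_mm_cmplt_pd(simde_mm_and_pd(lo, absmask), minlik);

  if(simde_mm_movemask_pd(v) != 3)     /* some |entry| >= minlikelihood */
    return 0;

  v = simde_mm_cmplt_pd(simde_mm_and_pd(hi, absmask), minlik);
  return simde_mm_movemask_pd(v) == 3; /* all four entries underflowed */
}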
-
-
-
-static void newviewGTRCAT_SAVE( int tipCase,  double *EV,  int *cptr,
-				double *x1_start, double *x2_start,  double *x3_start, double *tipVector,
-				unsigned char *tipX1, unsigned char *tipX2,
-				int n,  double *left, double *right, int *wgt, int *scalerIncrement,
-				unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
-				double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats)
-{
-  double
-    *le,
-    *ri,
-    *x1,
-    *x2,
-    *x3,
-    *x1_ptr = x1_start,
-    *x2_ptr = x2_start, 
-    *x3_ptr = x3_start, 
-    EV_t[16] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-    
-  int 
-    i, 
-    j, 
-    scale, 
-    scaleGap = 0,
-    addScale = 0;
-   
-  __m128d
-    minlikelihood_sse = _mm_set1_pd( minlikelihood ),
-    sc = _mm_set1_pd(twotothe256),
-    EVV[8];  
-  
-  for(i = 0; i < 4; i++)
-    for (j=0; j < 4; j++)
-      EV_t[4 * j + i] = EV[4 * i + j];
-  
-  for(i = 0; i < 8; i++)
-    EVV[i] = _mm_load_pd(&EV_t[i * 2]);
-  
-  {
-    x1 = x1_gapColumn;	      
-    x2 = x2_gapColumn;
-    x3 = x3_gapColumn;
-    
-    le =  &left[maxCats * 16];	     	 
-    ri =  &right[maxCats * 16];		   	  	  	  	         
-
-    __m128d x1_0 = _mm_load_pd( &x1[0] );
-    __m128d x1_2 = _mm_load_pd( &x1[2] );
-    
-    __m128d left_k0_0 = _mm_load_pd( &le[0] );
-    __m128d left_k0_2 = _mm_load_pd( &le[2] );
-    __m128d left_k1_0 = _mm_load_pd( &le[4] );
-    __m128d left_k1_2 = _mm_load_pd( &le[6] );
-    __m128d left_k2_0 = _mm_load_pd( &le[8] );
-    __m128d left_k2_2 = _mm_load_pd( &le[10] );
-    __m128d left_k3_0 = _mm_load_pd( &le[12] );
-    __m128d left_k3_2 = _mm_load_pd( &le[14] );
-    
-    left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-    left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-    
-    left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-    left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-    
-    left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-    left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-    left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-    
-    left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-    left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-    
-    left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-    left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-    
-    left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-    left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-    left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-    
-    __m128d x2_0 = _mm_load_pd( &x2[0] );
-    __m128d x2_2 = _mm_load_pd( &x2[2] );
-    
-    __m128d right_k0_0 = _mm_load_pd( &ri[0] );
-    __m128d right_k0_2 = _mm_load_pd( &ri[2] );
-    __m128d right_k1_0 = _mm_load_pd( &ri[4] );
-    __m128d right_k1_2 = _mm_load_pd( &ri[6] );
-    __m128d right_k2_0 = _mm_load_pd( &ri[8] );
-    __m128d right_k2_2 = _mm_load_pd( &ri[10] );
-    __m128d right_k3_0 = _mm_load_pd( &ri[12] );
-    __m128d right_k3_2 = _mm_load_pd( &ri[14] );
-    
-    right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-    right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-    
-    right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-    right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-    
-    right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-    right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-    right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-    
-    right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-    right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-    
-    right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-    right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-    
-    right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-    right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-    right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
-    
-    __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-    __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
-    
-    __m128d EV_t_l0_k0 = EVV[0];
-    __m128d EV_t_l0_k2 = EVV[1];
-    __m128d EV_t_l1_k0 = EVV[2];
-    __m128d EV_t_l1_k2 = EVV[3];
-    __m128d EV_t_l2_k0 = EVV[4];
-    __m128d EV_t_l2_k2 = EVV[5];
-    __m128d EV_t_l3_k0 = EVV[6];
-    __m128d EV_t_l3_k2 = EVV[7];
-        
-    EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-    EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-    
-    EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-    EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-    
-    EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-    
-    EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-    EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-    
-    EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-    EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-    EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-    
-    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	  	 	    		  
-	
-    if(tipCase != TIP_TIP)
-      {    
-	scale = 1;
-	      
-	__m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
-	v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	if(_mm_movemask_pd( v1 ) != 3)
-	  scale = 0;
-	else
-	  {
-	    v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
-	    v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	    if(_mm_movemask_pd( v1 ) != 3)
-	      scale = 0;
-	  }
-	
-	if(scale)
-	  {		      
-	    _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
-	    _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));	      	      
-	    
-	    scaleGap = TRUE;	   
-	  }	
-	else
-	  {
-	    _mm_store_pd(x3, EV_t_l0_k0);
-	    _mm_store_pd(&x3[2], EV_t_l2_k0);
-	  }
-      }
-    else
-      {
-	_mm_store_pd(x3, EV_t_l0_k0);
-	_mm_store_pd(&x3[2], EV_t_l2_k0);
-      }
-  }
-  
-
-  switch(tipCase)
-    {
-    case TIP_TIP:      
-      for (i = 0; i < n; i++)
-	{
-	  if(noGap(x3_gap, i))
-	    {
-	      x1 = &(tipVector[4 * tipX1[i]]);
-	      x2 = &(tipVector[4 * tipX2[i]]);
-	  
-	      x3 = x3_ptr;
-	  
-	      if(isGap(x1_gap, i))
-		le =  &left[maxCats * 16];
-	      else	  	  
-		le =  &left[cptr[i] * 16];	  
-	  
-	      if(isGap(x2_gap, i))
-		ri =  &right[maxCats * 16];
-	      else	 	  
-		ri =  &right[cptr[i] * 16];
-	  
-	      __m128d x1_0 = _mm_load_pd( &x1[0] );
-	      __m128d x1_2 = _mm_load_pd( &x1[2] );
-	      
-	      __m128d left_k0_0 = _mm_load_pd( &le[0] );
-	      __m128d left_k0_2 = _mm_load_pd( &le[2] );
-	      __m128d left_k1_0 = _mm_load_pd( &le[4] );
-	      __m128d left_k1_2 = _mm_load_pd( &le[6] );
-	      __m128d left_k2_0 = _mm_load_pd( &le[8] );
-	      __m128d left_k2_2 = _mm_load_pd( &le[10] );
-	      __m128d left_k3_0 = _mm_load_pd( &le[12] );
-	      __m128d left_k3_2 = _mm_load_pd( &le[14] );
-	  
-	      left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-	      left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-	      
-	      left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-	      left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-	      
-	      left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-	      left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-	      left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-	      
-	      left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-	      left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-	      
-	      left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-	      left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-	      
-	      left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-	      left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-	      left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-	      
-	      __m128d x2_0 = _mm_load_pd( &x2[0] );
-	      __m128d x2_2 = _mm_load_pd( &x2[2] );
-	      
-	      __m128d right_k0_0 = _mm_load_pd( &ri[0] );
-	      __m128d right_k0_2 = _mm_load_pd( &ri[2] );
-	      __m128d right_k1_0 = _mm_load_pd( &ri[4] );
-	      __m128d right_k1_2 = _mm_load_pd( &ri[6] );
-	      __m128d right_k2_0 = _mm_load_pd( &ri[8] );
-	      __m128d right_k2_2 = _mm_load_pd( &ri[10] );
-	      __m128d right_k3_0 = _mm_load_pd( &ri[12] );
-	      __m128d right_k3_2 = _mm_load_pd( &ri[14] );
-	      
-	      right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-	      right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-	      
-	      right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-	      right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-	      
-	      right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-	      right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-	      right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-	      
-	      right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-	      right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-	      
-	      right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-	      right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-	      
-	      right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-	      right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-	      right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
-	      
-	      __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-	      __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );	  	  
-	      
-	      __m128d EV_t_l0_k0 = EVV[0];
-	      __m128d EV_t_l0_k2 = EVV[1];
-	      __m128d EV_t_l1_k0 = EVV[2];
-	      __m128d EV_t_l1_k2 = EVV[3];
-	      __m128d EV_t_l2_k0 = EVV[4];
-	      __m128d EV_t_l2_k2 = EVV[5];
-	      __m128d EV_t_l3_k0 = EVV[6];
-	      __m128d EV_t_l3_k2 = EVV[7];
-	      
-	      EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	      EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	      EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-	      
-	      EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	      EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-	      
-	      EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	      EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-	      
-	      EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	      EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	      EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-	      
-	      EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	      EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	      EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-	      
-	      EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	 
-	  	  
-	      _mm_store_pd(x3, EV_t_l0_k0);
-	      _mm_store_pd(&x3[2], EV_t_l2_k0);	  	 	   	    
-
-	      x3_ptr += 4;
-	    }
-	}
-      break;
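/*
 * The _SAVE variants keep x3 compressed: a fresh 4-double vector is written
 * and x3_ptr advanced only for columns that are not all-gap (the noGap()
 * test above), so the dense x3 array plus the gap bitvector recover the
 * logical per-column layout.  A schematic of that walk; compute_site() is
 * a purely hypothetical stand-in for the arithmetic above.
 */
static double *store_compressed(double *x3_ptr, const unsigned int *x3_gap,
                                int n,
                                void (*compute_site)(int site, double *out))
{
  int i;

  for(i = 0; i < n; i++)
    if(!((x3_gap[i / 32] >> (i % 32)) & 1u))  /* non-gap column only */
      {
        compute_site(i, x3_ptr);   /* fill the 4 doubles for column i */
        x3_ptr += 4;               /* advance only when a column is stored */
      }

  return x3_ptr;                   /* one past the last stored vector */
}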
-    case TIP_INNER:      
-      for (i = 0; i < n; i++)
-	{ 
-	  if(isGap(x3_gap, i))
-	    {
-	      if(scaleGap)		   		    
-		addScale += wgt[i];
-	    }
-	  else
-	    {	      
-	      x1 = &(tipVector[4 * tipX1[i]]);
-	      
-	      x2 = x2_ptr;
-	      x3 = x3_ptr;
-
-	      if(isGap(x1_gap, i))
-		le =  &left[maxCats * 16];
-	      else
-		le =  &left[cptr[i] * 16];
-
-	      if(isGap(x2_gap, i))
-		{		 
-		  ri =  &right[maxCats * 16];
-		  x2 = x2_gapColumn;
-		}
-	      else
-		{
-		  ri =  &right[cptr[i] * 16];
-		  x2 = x2_ptr;
-		  x2_ptr += 4;
-		}	  	  	  	  
-
-	      __m128d x1_0 = _mm_load_pd( &x1[0] );
-	      __m128d x1_2 = _mm_load_pd( &x1[2] );
-	      
-	      __m128d left_k0_0 = _mm_load_pd( &le[0] );
-	      __m128d left_k0_2 = _mm_load_pd( &le[2] );
-	      __m128d left_k1_0 = _mm_load_pd( &le[4] );
-	      __m128d left_k1_2 = _mm_load_pd( &le[6] );
-	      __m128d left_k2_0 = _mm_load_pd( &le[8] );
-	      __m128d left_k2_2 = _mm_load_pd( &le[10] );
-	      __m128d left_k3_0 = _mm_load_pd( &le[12] );
-	      __m128d left_k3_2 = _mm_load_pd( &le[14] );
-	      
-	      left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-	      left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-	      
-	      left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-	      left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-	      
-	      left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-	      left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-	      left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-	      
-	      left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-	      left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-	      
-	      left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-	      left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-	      
-	      left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-	      left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-	      left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-	      
-	      __m128d x2_0 = _mm_load_pd( &x2[0] );
-	      __m128d x2_2 = _mm_load_pd( &x2[2] );
-	      
-	      __m128d right_k0_0 = _mm_load_pd( &ri[0] );
-	      __m128d right_k0_2 = _mm_load_pd( &ri[2] );
-	      __m128d right_k1_0 = _mm_load_pd( &ri[4] );
-	      __m128d right_k1_2 = _mm_load_pd( &ri[6] );
-	      __m128d right_k2_0 = _mm_load_pd( &ri[8] );
-	      __m128d right_k2_2 = _mm_load_pd( &ri[10] );
-	      __m128d right_k3_0 = _mm_load_pd( &ri[12] );
-	      __m128d right_k3_2 = _mm_load_pd( &ri[14] );
-	      
-	      right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-	      right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-	  
-	      right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-	      right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-	      
-	      right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-	      right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-	      right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-	      
-	      right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-	      right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-	      
-	      right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-	      right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-	      
-	      right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-	      right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-	      right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
-	      
-	      __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-	      __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
-	      
-	      __m128d EV_t_l0_k0 = EVV[0];
-	      __m128d EV_t_l0_k2 = EVV[1];
-	      __m128d EV_t_l1_k0 = EVV[2];
-	      __m128d EV_t_l1_k2 = EVV[3];
-	      __m128d EV_t_l2_k0 = EVV[4];
-	      __m128d EV_t_l2_k2 = EVV[5];
-	      __m128d EV_t_l3_k0 = EVV[6];
-	      __m128d EV_t_l3_k2 = EVV[7];
-	      
-	      
-	      EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	      EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	      EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-	      
-	      EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	      EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-	      
-	      EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	      EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-	      
-	      EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	      EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	      EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-	      
-	      EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	      EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	      EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-	      
-	      EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	  	 	    		  
-	      
-	      scale = 1;
-	      
-	      __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
-	      v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	      if(_mm_movemask_pd( v1 ) != 3)
-		scale = 0;
-	      else
-		{
-		  v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
-		  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		  if(_mm_movemask_pd( v1 ) != 3)
-		    scale = 0;
-		}
-	  	  
-	      if(scale)
-		{		      
-		  _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
-		  _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));	      	      
-		  		  
-		  addScale += wgt[i];	  
-		}	
-	      else
-		{
-		  _mm_store_pd(x3, EV_t_l0_k0);
-		  _mm_store_pd(&x3[2], EV_t_l2_k0);
-		}
-
-	      x3_ptr += 4;
-	    }
-	  	  
-	}
-      break;
-    case INNER_INNER:
-      for (i = 0; i < n; i++)
-	{ 
-	  if(isGap(x3_gap, i))
-	    {
-	      if(scaleGap)		   		    
-		addScale += wgt[i];
-	    }
-	  else
-	    {	     
-	      x3 = x3_ptr;
-	  	  
-	      if(isGap(x1_gap, i))
-		{
-		  x1 = x1_gapColumn;
-		  le =  &left[maxCats * 16];
-		}
-	      else
-		{
-		  le =  &left[cptr[i] * 16];
-		  x1 = x1_ptr;
-		  x1_ptr += 4;
-		}
-
-	      if(isGap(x2_gap, i))	
-		{
-		  x2 = x2_gapColumn;
-		  ri =  &right[maxCats * 16];	    
-		}
-	      else
-		{
-		  ri =  &right[cptr[i] * 16];
-		  x2 = x2_ptr;
-		  x2_ptr += 4;
-		}	 	  	  	  
-
-	      __m128d x1_0 = _mm_load_pd( &x1[0] );
-	      __m128d x1_2 = _mm_load_pd( &x1[2] );
-	      
-	      __m128d left_k0_0 = _mm_load_pd( &le[0] );
-	      __m128d left_k0_2 = _mm_load_pd( &le[2] );
-	      __m128d left_k1_0 = _mm_load_pd( &le[4] );
-	      __m128d left_k1_2 = _mm_load_pd( &le[6] );
-	      __m128d left_k2_0 = _mm_load_pd( &le[8] );
-	      __m128d left_k2_2 = _mm_load_pd( &le[10] );
-	      __m128d left_k3_0 = _mm_load_pd( &le[12] );
-	      __m128d left_k3_2 = _mm_load_pd( &le[14] );
-	      
-	      left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-	      left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-	      
-	      left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-	      left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-	      
-	      left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-	      left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-	      left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-	      
-	      left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-	      left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-	      
-	      left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-	      left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-	      
-	      left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-	      left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-	      left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-	      
-	      __m128d x2_0 = _mm_load_pd( &x2[0] );
-	      __m128d x2_2 = _mm_load_pd( &x2[2] );
-	      
-	      __m128d right_k0_0 = _mm_load_pd( &ri[0] );
-	      __m128d right_k0_2 = _mm_load_pd( &ri[2] );
-	      __m128d right_k1_0 = _mm_load_pd( &ri[4] );
-	      __m128d right_k1_2 = _mm_load_pd( &ri[6] );
-	      __m128d right_k2_0 = _mm_load_pd( &ri[8] );
-	      __m128d right_k2_2 = _mm_load_pd( &ri[10] );
-	      __m128d right_k3_0 = _mm_load_pd( &ri[12] );
-	      __m128d right_k3_2 = _mm_load_pd( &ri[14] );
-	      
-	      right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-	      right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-	      
-	      right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-	      right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-	      
-	      right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-	      right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-	      right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-	      
-	      right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-	      right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-	      
-	      right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-	      right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-	      
-	      right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-	      right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-	      right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
-	      
-	      __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-	      __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
-	      
-	      __m128d EV_t_l0_k0 = EVV[0];
-	      __m128d EV_t_l0_k2 = EVV[1];
-	      __m128d EV_t_l1_k0 = EVV[2];
-	      __m128d EV_t_l1_k2 = EVV[3];
-	      __m128d EV_t_l2_k0 = EVV[4];
-	      __m128d EV_t_l2_k2 = EVV[5];
-	      __m128d EV_t_l3_k0 = EVV[6];
-	      __m128d EV_t_l3_k2 = EVV[7];
-	      
-	      
-	      EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	      EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	      EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-	      
-	      EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	      EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-	      
-	      EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	      EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-	      
-	      EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	      EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	      EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-	      
-	      EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	      EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	      EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-	      
-	      EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	  	 	    		  	 
-	      
-	      scale = 1;
-	      
-	      __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
-	      v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	      if(_mm_movemask_pd( v1 ) != 3)
-		scale = 0;
-	      else
-		{
-		  v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
-		  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		  if(_mm_movemask_pd( v1 ) != 3)
-		    scale = 0;
-		}
-	  	  
-	      if(scale)
-		{		      
-		  _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
-		  _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));	      	      
-		  	      
-		  addScale += wgt[i];	  
-		}	
-	      else
-		{
-		  _mm_store_pd(x3, EV_t_l0_k0);
-		  _mm_store_pd(&x3[2], EV_t_l2_k0);
-		}
-	     
-	      x3_ptr += 4;
-	    }
-	}
-      break;
-    default:
-      assert(0);
-    }
-
-  
-  *scalerIncrement = addScale;
-}
-
-static void newviewGTRGAMMAPROT_GAPPED_SAVE(int tipCase,
-					    double *x1, double *x2, double *x3, double *extEV, double *tipVector,
-					    unsigned char *tipX1, unsigned char *tipX2,
-					    int n, double *left, double *right, int *wgt, int *scalerIncrement, 
-					    unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,  
-					    double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn
-					    )
-{
-  double  *uX1, *uX2, *v;
-  double x1px2;
-  int  i, j, l, k, scale, addScale = 0,   
-    gapScaling = 0;
-  double 
-    *vl, *vr, *x1v, *x2v,
-    *x1_ptr = x1,
-    *x2_ptr = x2,
-    *x3_ptr = x3;
-
-  
-
-  switch(tipCase)
-    {
-    case TIP_TIP:
-      {
-	double umpX1[1840], umpX2[1840];
-
-	for(i = 0; i < 23; i++)
-	  {
-	    v = &(tipVector[20 * i]);
-
-	    for(k = 0; k < 80; k++)
-	      {
-		double *ll =  &left[k * 20];
-		double *rr =  &right[k * 20];
-		
-		__m128d umpX1v = _mm_setzero_pd();
-		__m128d umpX2v = _mm_setzero_pd();
-
-		for(l = 0; l < 20; l+=2)
-		  {
-		    __m128d vv = _mm_load_pd(&v[l]);
-		    umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));
-		    umpX2v = _mm_add_pd(umpX2v, _mm_mul_pd(vv, _mm_load_pd(&rr[l])));					
-		  }
-		
-		umpX1v = _mm_hadd_pd(umpX1v, umpX1v);
-		umpX2v = _mm_hadd_pd(umpX2v, umpX2v);
-		
-		_mm_storel_pd(&umpX1[80 * i + k], umpX1v);
-		_mm_storel_pd(&umpX2[80 * i + k], umpX2v);
-	      }
-	  }
-
-	{
-	  uX1 = &umpX1[1760];
-	  uX2 = &umpX2[1760];
-
-	  for(j = 0; j < 4; j++)
-	    {
-	      v = &x3_gapColumn[j * 20];
-
-	      __m128d zero =  _mm_setzero_pd();
-	      for(k = 0; k < 20; k+=2)		  		    
-		_mm_store_pd(&v[k], zero);
-
-	      for(k = 0; k < 20; k++)
-		{ 
-		  double *eev = &extEV[k * 20];
-		  x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
-		  __m128d x1px2v = _mm_set1_pd(x1px2);
-		  
-		  for(l = 0; l < 20; l+=2)
-		    {
-		      __m128d vv = _mm_load_pd(&v[l]);
-		      __m128d ee = _mm_load_pd(&eev[l]);
-		      
-		      vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
-		      
-		      _mm_store_pd(&v[l], vv);
-		    }
-		}
-	    }	   
-	}	
-
-	for(i = 0; i < n; i++)
-	  {
-	    if(!(x3_gap[i / 32] & mask32[i % 32]))
-	      {
-		uX1 = &umpX1[80 * tipX1[i]];
-		uX2 = &umpX2[80 * tipX2[i]];
-		
-		for(j = 0; j < 4; j++)
-		  {
-		    v = &x3_ptr[j * 20];
-		    
-		    
-		    __m128d zero =  _mm_setzero_pd();
-		    for(k = 0; k < 20; k+=2)		  		    
-		      _mm_store_pd(&v[k], zero);
-		    
-		    for(k = 0; k < 20; k++)
-		      { 
-			double *eev = &extEV[k * 20];
-			x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
-			__m128d x1px2v = _mm_set1_pd(x1px2);
-			
-			for(l = 0; l < 20; l+=2)
-			  {
-			    __m128d vv = _mm_load_pd(&v[l]);
-			    __m128d ee = _mm_load_pd(&eev[l]);
-			    
-			    vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
-			    
-			    _mm_store_pd(&v[l], vv);
-			  }
-		      }
-		  }	   
-		x3_ptr += 80;
-	      }
-	  }
-      }
-      break;
-    case TIP_INNER:
-      {
-	double umpX1[1840], ump_x2[20];
-
-
-	for(i = 0; i < 23; i++)
-	  {
-	    v = &(tipVector[20 * i]);
-
-	    for(k = 0; k < 80; k++)
-	      {
-		double *ll =  &left[k * 20];
-				
-		__m128d umpX1v = _mm_setzero_pd();
-		
-		for(l = 0; l < 20; l+=2)
-		  {
-		    __m128d vv = _mm_load_pd(&v[l]);
-		    umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));		    					
-		  }
-		
-		umpX1v = _mm_hadd_pd(umpX1v, umpX1v);				
-		_mm_storel_pd(&umpX1[80 * i + k], umpX1v);		
-
-	      }
-	  }
-
-	{
-	  uX1 = &umpX1[1760];
-
-	  for(k = 0; k < 4; k++)
-	    {
-	      v = &(x2_gapColumn[k * 20]);
-	       
-	      for(l = 0; l < 20; l++)
-		{		   
-		  double *r =  &right[k * 400 + l * 20];
-		  __m128d ump_x2v = _mm_setzero_pd();	    
-		  
-		  for(j = 0; j < 20; j+= 2)
-		    {
-		      __m128d vv = _mm_load_pd(&v[j]);
-		      __m128d rr = _mm_load_pd(&r[j]);
-		      ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr));
-		    }
-		  
-		  ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
-		  
-		  _mm_storel_pd(&ump_x2[l], ump_x2v);		   		     
-		}
-
-	      v = &(x3_gapColumn[20 * k]);
-
-	      __m128d zero =  _mm_setzero_pd();
-	      for(l = 0; l < 20; l+=2)		  		    
-		_mm_store_pd(&v[l], zero);
-		  
-	      for(l = 0; l < 20; l++)
-		{
-		  double *eev = &extEV[l * 20];
-		  x1px2 = uX1[k * 20 + l]  * ump_x2[l];
-		  __m128d x1px2v = _mm_set1_pd(x1px2);
-		  
-		  for(j = 0; j < 20; j+=2)
-		    {
-		      __m128d vv = _mm_load_pd(&v[j]);
-		      __m128d ee = _mm_load_pd(&eev[j]);
-		      
-		      vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
-		      
-		      _mm_store_pd(&v[j], vv);
-		    }		     		    
-		}			
-	      
-	    }
-	  
-	  { 
-	    v = x3_gapColumn;
-	    __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-	    
-	    scale = 1;
-	    for(l = 0; scale && (l < 80); l += 2)
-	      {
-		__m128d vv = _mm_load_pd(&v[l]);
-		__m128d v1 = _mm_and_pd(vv, absMask.m);
-		v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		if(_mm_movemask_pd( v1 ) != 3)
-		  scale = 0;
-	      }	    	  
-	  }
-
-
-	  if (scale)
-	    {
-	      gapScaling = 1;
-	      __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-	      
-	      for(l = 0; l < 80; l+=2)
-		{
-		  __m128d ex3v = _mm_load_pd(&v[l]);		  
-		  _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
-		}		   		  	      	    	       
-	    }
-	}
-
-	for (i = 0; i < n; i++)
-	  {	    
-	    if((x3_gap[i / 32] & mask32[i % 32]))
-	       {	       
-		 if(gapScaling)
-		   {		     
-		     addScale += wgt[i];		     
-		   }
-	       }
-	     else
-	       {
-		 uX1 = &umpX1[80 * tipX1[i]];
-
-		  if(x2_gap[i / 32] & mask32[i % 32])
-		   x2v = x2_gapColumn;
-		  else
-		    {
-		      x2v = x2_ptr;
-		      x2_ptr += 80;
-		    }
-		 
-		 for(k = 0; k < 4; k++)
-		   {
-		     v = &(x2v[k * 20]);
-		     
-		     for(l = 0; l < 20; l++)
-		       {		   
-			 double *r =  &right[k * 400 + l * 20];
-			 __m128d ump_x2v = _mm_setzero_pd();	    
-			 
-			 for(j = 0; j < 20; j+= 2)
-			   {
-			     __m128d vv = _mm_load_pd(&v[j]);
-			     __m128d rr = _mm_load_pd(&r[j]);
-			     ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr));
-			   }
-			 
-			 ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
-			 
-			 _mm_storel_pd(&ump_x2[l], ump_x2v);		   		     
-		       }
-		     
-		     v = &x3_ptr[20 * k];
-		     
-		     __m128d zero =  _mm_setzero_pd();
-		     for(l = 0; l < 20; l+=2)		  		    
-		       _mm_store_pd(&v[l], zero);
-		     
-		     for(l = 0; l < 20; l++)
-		       {
-			 double *eev = &extEV[l * 20];
-			 x1px2 = uX1[k * 20 + l]  * ump_x2[l];
-			 __m128d x1px2v = _mm_set1_pd(x1px2);
-			 
-			 for(j = 0; j < 20; j+=2)
-			   {
-			     __m128d vv = _mm_load_pd(&v[j]);
-			     __m128d ee = _mm_load_pd(&eev[j]);
-			     
-			     vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
-			     
-			     _mm_store_pd(&v[j], vv);
-			   }		     		    
-		       }			
-		     
-		   }
-		 
-		 
-		 { 
-		   v = x3_ptr;
-		   __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-		   
-		   scale = 1;
-		   for(l = 0; scale && (l < 80); l += 2)
-		     {
-		       __m128d vv = _mm_load_pd(&v[l]);
-		       __m128d v1 = _mm_and_pd(vv, absMask.m);
-		       v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		       if(_mm_movemask_pd( v1 ) != 3)
-			 scale = 0;
-		     }	    	  
-		 }
-		 
-		 
-		 if (scale)
-		   {
-		     __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-		     
-		     for(l = 0; l < 80; l+=2)
-		       {
-			 __m128d ex3v = _mm_load_pd(&v[l]);		  
-			 _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
-		       }		   		  
-		     		    
-		     addScale += wgt[i];		      
-		   }
-		 
-		 x3_ptr += 80;
-	       }
-	  }
-      }
-      break;
-    case INNER_INNER:
-      {
-	for(k = 0; k < 4; k++)
-	   {
-	     vl = &(x1_gapColumn[20 * k]);
-	     vr = &(x2_gapColumn[20 * k]);
-	     v =  &(x3_gapColumn[20 * k]);
-
-	     __m128d zero =  _mm_setzero_pd();
-	     for(l = 0; l < 20; l+=2)		  		    
-	       _mm_store_pd(&v[l], zero);
-	     
-	     for(l = 0; l < 20; l++)
-	       {		 
-		 {
-		   __m128d al = _mm_setzero_pd();
-		   __m128d ar = _mm_setzero_pd();
-
-		   double *ll   = &left[k * 400 + l * 20];
-		   double *rr   = &right[k * 400 + l * 20];
-		   double *EVEV = &extEV[20 * l];
-		   
-		   for(j = 0; j < 20; j+=2)
-		     {
-		       __m128d lv  = _mm_load_pd(&ll[j]);
-		       __m128d rv  = _mm_load_pd(&rr[j]);
-		       __m128d vll = _mm_load_pd(&vl[j]);
-		       __m128d vrr = _mm_load_pd(&vr[j]);
-		       
-		       al = _mm_add_pd(al, _mm_mul_pd(vll, lv));
-		       ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv));
-		     }  		 
-		       
-		   al = _mm_hadd_pd(al, al);
-		   ar = _mm_hadd_pd(ar, ar);
-		   
-		   al = _mm_mul_pd(al, ar);
-
-		   for(j = 0; j < 20; j+=2)
-		     {
-		       __m128d vv  = _mm_load_pd(&v[j]);
-		       __m128d EVV = _mm_load_pd(&EVEV[j]);
-
-		       vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
-
-		       _mm_store_pd(&v[j], vv);
-		     }		  		   		  
-		 }		 
-
-	       }
-	   }
-	 
-
-	{ 
-	   v = x3_gapColumn;
-	   __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-	   
-	   scale = 1;
-	   for(l = 0; scale && (l < 80); l += 2)
-	     {
-	       __m128d vv = _mm_load_pd(&v[l]);
-	       __m128d v1 = _mm_and_pd(vv, absMask.m);
-	       v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	       if(_mm_movemask_pd( v1 ) != 3)
-		 scale = 0;
-	     }	    	  
-	 }
-
-	 if (scale)
-	   {
-	     gapScaling = 1;
-	     __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-	     
-	     for(l = 0; l < 80; l+=2)
-	       {
-		 __m128d ex3v = _mm_load_pd(&v[l]);		  
-		 _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
-	       }		   		  
-	     
-	    	  
-	   }
-      }
-
-      for (i = 0; i < n; i++)
-       {
-	  if(x3_gap[i / 32] & mask32[i % 32])
-	   {	     
-	     if(gapScaling)
-	       {		
-		 addScale += wgt[i];			       
-	       }
-	   }
-	 else
-	   {
-	     if(x1_gap[i / 32] & mask32[i % 32])
-	       x1v = x1_gapColumn;
-	     else
-	       {
-		 x1v = x1_ptr;
-		 x1_ptr += 80;
-	       }
-
-	     if(x2_gap[i / 32] & mask32[i % 32])
-	       x2v = x2_gapColumn;
-	     else
-	       {
-		 x2v = x2_ptr;
-		 x2_ptr += 80;
-	       }
-
-	     for(k = 0; k < 4; k++)
-	       {
-		 vl = &(x1v[20 * k]);
-		 vr = &(x2v[20 * k]);
-		 v =  &x3_ptr[20 * k];
-		 		 
-		 __m128d zero =  _mm_setzero_pd();
-		 for(l = 0; l < 20; l+=2)		  		    
-		   _mm_store_pd(&v[l], zero);
-		 		 
-		 for(l = 0; l < 20; l++)
-		   {		 
-		     {
-		       __m128d al = _mm_setzero_pd();
-		       __m128d ar = _mm_setzero_pd();
-		       
-		       double *ll   = &left[k * 400 + l * 20];
-		       double *rr   = &right[k * 400 + l * 20];
-		       double *EVEV = &extEV[20 * l];
-		       
-		       for(j = 0; j < 20; j+=2)
-			 {
-			   __m128d lv  = _mm_load_pd(&ll[j]);
-			   __m128d rv  = _mm_load_pd(&rr[j]);
-			   __m128d vll = _mm_load_pd(&vl[j]);
-			   __m128d vrr = _mm_load_pd(&vr[j]);
-			   
-			   al = _mm_add_pd(al, _mm_mul_pd(vll, lv));
-			   ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv));
-			 }  		 
-		       
-		       al = _mm_hadd_pd(al, al);
-		       ar = _mm_hadd_pd(ar, ar);
-		       
-		       al = _mm_mul_pd(al, ar);
-		       
-		       for(j = 0; j < 20; j+=2)
-			 {
-			   __m128d vv  = _mm_load_pd(&v[j]);
-			   __m128d EVV = _mm_load_pd(&EVEV[j]);
-			   
-			   vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
-			   
-			   _mm_store_pd(&v[j], vv);
-			 }		  		   		  
-		     }		 
-		     
-		   }
-	       }
-	     
-
-	     
-	     { 
-	       v = x3_ptr;
-	       __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-	       
-	       scale = 1;
-	       for(l = 0; scale && (l < 80); l += 2)
-		 {
-		   __m128d vv = _mm_load_pd(&v[l]);
-		   __m128d v1 = _mm_and_pd(vv, absMask.m);
-		   v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		   if(_mm_movemask_pd( v1 ) != 3)
-		     scale = 0;
-		 }	    	  
-	     }
-	     
-	     
-	     if (scale)
-	       {
-		 __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-		 
-		 for(l = 0; l < 80; l+=2)
-		   {
-		     __m128d ex3v = _mm_load_pd(&v[l]);		  
-		     _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
-		   }		   		  
-		 		
-		 addScale += wgt[i];		 	  
-	       }
-	     x3_ptr += 80;
-	   }
-       }
-      break;
-    default:
-      assert(0);
-    }
-
- 
-  *scalerIncrement = addScale;  
-}
-
-
-
-static void newviewGTRGAMMAPROT(int tipCase,
-				double *x1, double *x2, double *x3, double *extEV, double *tipVector,
-				unsigned char *tipX1, unsigned char *tipX2,
-				int n, double *left, double *right, int *wgt, int *scalerIncrement)
-{
-  double  *uX1, *uX2, *v;
-  double x1px2;
-  int  i, j, l, k, scale, addScale = 0;
-  double *vl, *vr;
-
-
-
-  switch(tipCase)
-    {
-    case TIP_TIP:
-      {
-	double umpX1[1840], umpX2[1840];
-
-	for(i = 0; i < 23; i++)
-	  {
-	    v = &(tipVector[20 * i]);
-
-	    for(k = 0; k < 80; k++)
-	      {
-		double *ll =  &left[k * 20];
-		double *rr =  &right[k * 20];
-		
-		__m128d umpX1v = _mm_setzero_pd();
-		__m128d umpX2v = _mm_setzero_pd();
-
-		for(l = 0; l < 20; l+=2)
-		  {
-		    __m128d vv = _mm_load_pd(&v[l]);
-		    umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));
-		    umpX2v = _mm_add_pd(umpX2v, _mm_mul_pd(vv, _mm_load_pd(&rr[l])));					
-		  }
-		
-		umpX1v = _mm_hadd_pd(umpX1v, umpX1v);
-		umpX2v = _mm_hadd_pd(umpX2v, umpX2v);
-		
-		_mm_storel_pd(&umpX1[80 * i + k], umpX1v);
-		_mm_storel_pd(&umpX2[80 * i + k], umpX2v);
-
-	      }
-	  }
-
-	for(i = 0; i < n; i++)
-	  {
-	    uX1 = &umpX1[80 * tipX1[i]];
-	    uX2 = &umpX2[80 * tipX2[i]];
-
-	    for(j = 0; j < 4; j++)
-	      {
-		v = &x3[i * 80 + j * 20];
-
-
-		__m128d zero =  _mm_setzero_pd();
-		for(k = 0; k < 20; k+=2)		  		    
-		  _mm_store_pd(&v[k], zero);
-
-		for(k = 0; k < 20; k++)
-		  { 
-		    double *eev = &extEV[k * 20];
-		    x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
-		    __m128d x1px2v = _mm_set1_pd(x1px2);
-
-		    for(l = 0; l < 20; l+=2)
-		      {
-		      	__m128d vv = _mm_load_pd(&v[l]);
-			__m128d ee = _mm_load_pd(&eev[l]);
-
-			vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
-			
-			_mm_store_pd(&v[l], vv);
-		      }
-		  }
-
-
-	      }	   
-	  }
-      }
-      break;
-    case TIP_INNER:
-      {
-	double umpX1[1840], ump_x2[20];
-
-
-	for(i = 0; i < 23; i++)
-	  {
-	    v = &(tipVector[20 * i]);
-
-	    for(k = 0; k < 80; k++)
-	      {
-		double *ll =  &left[k * 20];
-				
-		__m128d umpX1v = _mm_setzero_pd();
-		
-		for(l = 0; l < 20; l+=2)
-		  {
-		    __m128d vv = _mm_load_pd(&v[l]);
-		    umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));		    					
-		  }
-		
-		umpX1v = _mm_hadd_pd(umpX1v, umpX1v);				
-		_mm_storel_pd(&umpX1[80 * i + k], umpX1v);		
-
-
-	      }
-	  }
-
-	for (i = 0; i < n; i++)
-	  {
-	    uX1 = &umpX1[80 * tipX1[i]];
-
-	    for(k = 0; k < 4; k++)
-	      {
-		v = &(x2[80 * i + k * 20]);
-	       
-		for(l = 0; l < 20; l++)
-		  {		   
-		    double *r =  &right[k * 400 + l * 20];
-		    __m128d ump_x2v = _mm_setzero_pd();	    
-		    
-		    for(j = 0; j < 20; j+= 2)
-		      {
-			__m128d vv = _mm_load_pd(&v[j]);
-			__m128d rr = _mm_load_pd(&r[j]);
-			ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr));
-		      }
-		     
-		    ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
-		    
-		    _mm_storel_pd(&ump_x2[l], ump_x2v);		   		     
-		  }
-
-		v = &(x3[80 * i + 20 * k]);
-
-		__m128d zero =  _mm_setzero_pd();
-		for(l = 0; l < 20; l+=2)		  		    
-		  _mm_store_pd(&v[l], zero);
-		  
-		for(l = 0; l < 20; l++)
-		  {
-		    double *eev = &extEV[l * 20];
-		    x1px2 = uX1[k * 20 + l]  * ump_x2[l];
-		    __m128d x1px2v = _mm_set1_pd(x1px2);
-		  
-		    for(j = 0; j < 20; j+=2)
-		      {
-			__m128d vv = _mm_load_pd(&v[j]);
-			__m128d ee = _mm_load_pd(&eev[j]);
-			
-			vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
-			
-			_mm_store_pd(&v[j], vv);
-		      }		     		    
-		  }			
-
-	      }
-	   
-
-	    { 
-	      v = &(x3[80 * i]);
-	      __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-	      
-	      scale = 1;
-	      for(l = 0; scale && (l < 80); l += 2)
-		{
-		  __m128d vv = _mm_load_pd(&v[l]);
-		  __m128d v1 = _mm_and_pd(vv, absMask.m);
-		  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		  if(_mm_movemask_pd( v1 ) != 3)
-		    scale = 0;
-		}	    	  
-	    }
-
-
-	    if (scale)
-	      {
-
-	       __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-	       
-	       for(l = 0; l < 80; l+=2)
-		 {
-		   __m128d ex3v = _mm_load_pd(&v[l]);		  
-		   _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
-		 }		   		  
-
-
-	
-		addScale += wgt[i];
-		       
-	      }
-	  }
-      }
-      break;
-    case INNER_INNER:
-      for (i = 0; i < n; i++)
-       {
-	 for(k = 0; k < 4; k++)
-	   {
-	     vl = &(x1[80 * i + 20 * k]);
-	     vr = &(x2[80 * i + 20 * k]);
-	     v =  &(x3[80 * i + 20 * k]);
-
-
-	     __m128d zero =  _mm_setzero_pd();
-	     for(l = 0; l < 20; l+=2)		  		    
-	       _mm_store_pd(&v[l], zero);
-
-
-	     for(l = 0; l < 20; l++)
-	       {		 
-
-		 {
-		   __m128d al = _mm_setzero_pd();
-		   __m128d ar = _mm_setzero_pd();
-
-		   double *ll   = &left[k * 400 + l * 20];
-		   double *rr   = &right[k * 400 + l * 20];
-		   double *EVEV = &extEV[20 * l];
-		   
-		   for(j = 0; j < 20; j+=2)
-		     {
-		       __m128d lv  = _mm_load_pd(&ll[j]);
-		       __m128d rv  = _mm_load_pd(&rr[j]);
-		       __m128d vll = _mm_load_pd(&vl[j]);
-		       __m128d vrr = _mm_load_pd(&vr[j]);
-		       
-		       al = _mm_add_pd(al, _mm_mul_pd(vll, lv));
-		       ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv));
-		     }  		 
-		       
-		   al = _mm_hadd_pd(al, al);
-		   ar = _mm_hadd_pd(ar, ar);
-		   
-		   al = _mm_mul_pd(al, ar);
-
-		   for(j = 0; j < 20; j+=2)
-		     {
-		       __m128d vv  = _mm_load_pd(&v[j]);
-		       __m128d EVV = _mm_load_pd(&EVEV[j]);
-
-		       vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
-
-		       _mm_store_pd(&v[j], vv);
-		     }		  		   		  
-		 }		 
-
-	       }
-	   }
-	 
-
-
-	 { 
-	   v = &(x3[80 * i]);
-	   __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-	   
-	   scale = 1;
-	   for(l = 0; scale && (l < 80); l += 2)
-	     {
-	       __m128d vv = _mm_load_pd(&v[l]);
-	       __m128d v1 = _mm_and_pd(vv, absMask.m);
-	       v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	       if(_mm_movemask_pd( v1 ) != 3)
-		 scale = 0;
-	     }	    	  
-	 }
-
-
-	 if (scale)
-	   {
-
-	       __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-	       
-	       for(l = 0; l < 80; l+=2)
-		 {
-		   __m128d ex3v = _mm_load_pd(&v[l]);		  
-		   _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
-		 }		   		  
-
-
-	    
-	     addScale += wgt[i];
-	      
-	   }
-       }
-      break;
-    default:
-      assert(0);
-    }
-
-  
-  *scalerIncrement = addScale;
-
-}
-
-
-     
-static void newviewGTRCATPROT(int tipCase, double *extEV,
-			      int *cptr,
-			      double *x1, double *x2, double *x3, double *tipVector,
-			      unsigned char *tipX1, unsigned char *tipX2,
-			      int n, double *left, double *right, int *wgt, int *scalerIncrement )
-{
-  double
-    *le, *ri, *v, *vl, *vr;
-
-  int i, l, j, scale, addScale = 0;
-
-  switch(tipCase)
-    {
-    case TIP_TIP:
-      {
-	for (i = 0; i < n; i++)
-	  {
-	    le = &left[cptr[i] * 400];
-	    ri = &right[cptr[i] * 400];
-
-	    vl = &(tipVector[20 * tipX1[i]]);
-	    vr = &(tipVector[20 * tipX2[i]]);
-	    v  = &x3[20 * i];
-
-	    for(l = 0; l < 20; l+=2)
-	      _mm_store_pd(&v[l], _mm_setzero_pd());	      		
-
-
-	    for(l = 0; l < 20; l++)
-	      {
-		__m128d x1v = _mm_setzero_pd();
-		__m128d x2v = _mm_setzero_pd();	 
-		double 
-		  *ev = &extEV[l * 20],
-		  *lv = &le[l * 20],
-		  *rv = &ri[l * 20];
-
-		for(j = 0; j < 20; j+=2)
-		  {
-		    x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));		    
-		    x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
-		  }
-
-		x1v = _mm_hadd_pd(x1v, x1v);
-		x2v = _mm_hadd_pd(x2v, x2v);
-
-		x1v = _mm_mul_pd(x1v, x2v);
-		
-		for(j = 0; j < 20; j+=2)
-		  {
-		    __m128d vv = _mm_load_pd(&v[j]);
-		    vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
-		    _mm_store_pd(&v[j], vv);
-		  }		    
-
-	      }	   
-	  }
-      }
-      break;
-    case TIP_INNER:
-      {
-	for (i = 0; i < n; i++)
-	  {
-	    le = &left[cptr[i] * 400];
-	    ri = &right[cptr[i] * 400];
-
-	    vl = &(tipVector[20 * tipX1[i]]);
-	    vr = &x2[20 * i];
-	    v  = &x3[20 * i];
-
-	    for(l = 0; l < 20; l+=2)
-	      _mm_store_pd(&v[l], _mm_setzero_pd());	      		
-
-	   
-
-	    for(l = 0; l < 20; l++)
-	      {
-
-		__m128d x1v = _mm_setzero_pd();
-		__m128d x2v = _mm_setzero_pd();	
-		double 
-		  *ev = &extEV[l * 20],
-		  *lv = &le[l * 20],
-		  *rv = &ri[l * 20];
-
-		for(j = 0; j < 20; j+=2)
-		  {
-		    x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));		    
-		    x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
-		  }
-
-		x1v = _mm_hadd_pd(x1v, x1v);
-		x2v = _mm_hadd_pd(x2v, x2v);
-
-		x1v = _mm_mul_pd(x1v, x2v);
-		
-		for(j = 0; j < 20; j+=2)
-		  {
-		    __m128d vv = _mm_load_pd(&v[j]);
-		    vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
-		    _mm_store_pd(&v[j], vv);
-		  }		    
-
-	      }
-
-	    { 	    
-	      __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-	      
-	      scale = 1;
-	      for(l = 0; scale && (l < 20); l += 2)
-		{
-		  __m128d vv = _mm_load_pd(&v[l]);
-		  __m128d v1 = _mm_and_pd(vv, absMask.m);
-		  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		  if(_mm_movemask_pd( v1 ) != 3)
-		    scale = 0;
-		}	    	  
-	    }
-
-
-	    if(scale)
-	      {
-
-		__m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-
-		for(l = 0; l < 20; l+=2)
-		  {
-		    __m128d ex3v = _mm_load_pd(&v[l]);
-		    _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));		    
-		  }
-	
-		addScale += wgt[i];	  
-	      }
-	  }
-      }
-      break;
-    case INNER_INNER:
-      for(i = 0; i < n; i++)
-	{
-	  le = &left[cptr[i] * 400];
-	  ri = &right[cptr[i] * 400];
-
-	  vl = &x1[20 * i];
-	  vr = &x2[20 * i];
-	  v = &x3[20 * i];
-
-
-	    for(l = 0; l < 20; l+=2)
-	      _mm_store_pd(&v[l], _mm_setzero_pd());	      		
-
-	 
-	  for(l = 0; l < 20; l++)
-	    {
-
-		__m128d x1v = _mm_setzero_pd();
-		__m128d x2v = _mm_setzero_pd();
-		double 
-		  *ev = &extEV[l * 20],
-		  *lv = &le[l * 20],
-		  *rv = &ri[l * 20];
-
-
-		for(j = 0; j < 20; j+=2)
-		  {
-		    x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));		    
-		    x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
-		  }
-
-		x1v = _mm_hadd_pd(x1v, x1v);
-		x2v = _mm_hadd_pd(x2v, x2v);
-
-		x1v = _mm_mul_pd(x1v, x2v);
-		
-		for(j = 0; j < 20; j+=2)
-		  {
-		    __m128d vv = _mm_load_pd(&v[j]);
-		    vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
-		    _mm_store_pd(&v[j], vv);
-		  }		    
-
-	    }
-
-	    { 	    
-	      __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-	      
-	      scale = 1;
-	      for(l = 0; scale && (l < 20); l += 2)
-		{
-		  __m128d vv = _mm_load_pd(&v[l]);
-		  __m128d v1 = _mm_and_pd(vv, absMask.m);
-		  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		  if(_mm_movemask_pd( v1 ) != 3)
-		    scale = 0;
-		}	    	  
-	    }
-   
-
-	   if(scale)
-	     {
-
-	       __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-	       
-	       for(l = 0; l < 20; l+=2)
-		 {
-		   __m128d ex3v = _mm_load_pd(&v[l]);		  
-		   _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
-		 }		   		  
-
-
-	       
-	       addScale += wgt[i];	   
-	     }
-	}
-      break;
-    default:
-      assert(0);
-    }
-  
- 
-  *scalerIncrement = addScale;
-
-}
-
-static void newviewGTRCATPROT_SAVE(int tipCase, double *extEV,
-				   int *cptr,
-				   double *x1, double *x2, double *x3, double *tipVector,
-				   unsigned char *tipX1, unsigned char *tipX2,
-				   int n, double *left, double *right, int *wgt, int *scalerIncrement,
-				   unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
-				   double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats)
-{
-  double
-    *le, 
-    *ri, 
-    *v, 
-    *vl, 
-    *vr,
-    *x1_ptr = x1,
-    *x2_ptr = x2, 
-    *x3_ptr = x3;
-
-  int 
-    i, 
-    l, 
-    j, 
-    scale, 
-    scaleGap = 0,
-    addScale = 0;
-
-  {
-    vl = x1_gapColumn;	      
-    vr = x2_gapColumn;
-    v = x3_gapColumn;
-
-    le = &left[maxCats * 400];
-    ri = &right[maxCats * 400];	  
-
-    for(l = 0; l < 20; l+=2)
-      _mm_store_pd(&v[l], _mm_setzero_pd());	      		
-	 
-    for(l = 0; l < 20; l++)
-      {
-	__m128d x1v = _mm_setzero_pd();
-	__m128d x2v = _mm_setzero_pd();
-	double 
-	  *ev = &extEV[l * 20],
-	  *lv = &le[l * 20],
-	  *rv = &ri[l * 20];
-
-
-	for(j = 0; j < 20; j+=2)
-	  {
-	    x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));		    
-	    x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
-	  }
-	
-	x1v = _mm_hadd_pd(x1v, x1v);
-	x2v = _mm_hadd_pd(x2v, x2v);
-	
-	x1v = _mm_mul_pd(x1v, x2v);
-	
-	for(j = 0; j < 20; j+=2)
-	  {
-	    __m128d vv = _mm_load_pd(&v[j]);
-	    vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
-	    _mm_store_pd(&v[j], vv);
-	  }		    	
-      }
-    
-    if(tipCase != TIP_TIP)
-      { 	    
-	__m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-	      
-	scale = 1;
-	for(l = 0; scale && (l < 20); l += 2)
-	  {
-	    __m128d vv = _mm_load_pd(&v[l]);
-	    __m128d v1 = _mm_and_pd(vv, absMask.m);
-	    v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	    if(_mm_movemask_pd( v1 ) != 3)
-	      scale = 0;
-	  }	    	        
-  
-	if(scale)
-	  {
-	    __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-	    
-	    for(l = 0; l < 20; l+=2)
-	      {
-		__m128d ex3v = _mm_load_pd(&v[l]);		  
-		_mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
-	      }		   		  
-	       
-	    scaleGap = TRUE;	   
-	  }
-      }
-  }
-  
-  switch(tipCase)
-    {
-    case TIP_TIP:
-      {
-	for (i = 0; i < n; i++)
-	  {
-	    if(noGap(x3_gap, i))
-	      {		
-		vl = &(tipVector[20 * tipX1[i]]);
-		vr = &(tipVector[20 * tipX2[i]]);
-		v  = x3_ptr;
-
-		if(isGap(x1_gap, i))
-		  le =  &left[maxCats * 400];
-		else	  	  
-		  le =  &left[cptr[i] * 400];	  
-	  
-		if(isGap(x2_gap, i))
-		  ri =  &right[maxCats * 400];
-		else	 	  
-		  ri =  &right[cptr[i] * 400];
-
-		for(l = 0; l < 20; l+=2)
-		  _mm_store_pd(&v[l], _mm_setzero_pd());	      		
-		
-		for(l = 0; l < 20; l++)
-		  {
-		    __m128d x1v = _mm_setzero_pd();
-		    __m128d x2v = _mm_setzero_pd();	 
-		    double 
-		      *ev = &extEV[l * 20],
-		      *lv = &le[l * 20],
-		      *rv = &ri[l * 20];
-		    
-		    for(j = 0; j < 20; j+=2)
-		      {
-			x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));		    
-			x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
-		      }
-		    
-		    x1v = _mm_hadd_pd(x1v, x1v);
-		    x2v = _mm_hadd_pd(x2v, x2v);
-		    
-		    x1v = _mm_mul_pd(x1v, x2v);
-		    
-		    for(j = 0; j < 20; j+=2)
-		      {
-			__m128d vv = _mm_load_pd(&v[j]);
-			vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
-			_mm_store_pd(&v[j], vv);
-		      }		   
-		  }
-
-		x3_ptr += 20;
-
-	      }   
-	  }
-      }
-      break;
-    case TIP_INNER:
-      {
-	for (i = 0; i < n; i++)
-	  {
-	    if(isGap(x3_gap, i))
-	      {
-		if(scaleGap)		   		    
-		  addScale += wgt[i];
-	      }
-	    else
-	      {	 
-		vl = &(tipVector[20 * tipX1[i]]);
-	      
-		vr = x2_ptr;
-		v = x3_ptr;
-
-		if(isGap(x1_gap, i))
-		  le =  &left[maxCats * 400];
-		else
-		  le =  &left[cptr[i] * 400];
-
-		if(isGap(x2_gap, i))
-		  {		 
-		    ri =  &right[maxCats * 400];
-		    vr = x2_gapColumn;
-		  }
-		else
-		  {
-		    ri =  &right[cptr[i] * 400];
-		    vr = x2_ptr;
-		    x2_ptr += 20;
-		  }	  	  	  	  		  
-
-		for(l = 0; l < 20; l+=2)
-		  _mm_store_pd(&v[l], _mm_setzero_pd());	      			   
-
-		for(l = 0; l < 20; l++)
-		  {
-		    __m128d x1v = _mm_setzero_pd();
-		    __m128d x2v = _mm_setzero_pd();	
-		    double 
-		      *ev = &extEV[l * 20],
-		      *lv = &le[l * 20],
-		      *rv = &ri[l * 20];
-		    
-		    for(j = 0; j < 20; j+=2)
-		      {
-			x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));		    
-			x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
-		      }
-		    
-		    x1v = _mm_hadd_pd(x1v, x1v);
-		    x2v = _mm_hadd_pd(x2v, x2v);
-		    
-		    x1v = _mm_mul_pd(x1v, x2v);
-		    
-		    for(j = 0; j < 20; j+=2)
-		      {
-			__m128d vv = _mm_load_pd(&v[j]);
-			vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
-			_mm_store_pd(&v[j], vv);
-		      }		    
-		  }
-		
-		{ 	    
-		  __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-		  
-		  scale = 1;
-		  for(l = 0; scale && (l < 20); l += 2)
-		    {
-		      __m128d vv = _mm_load_pd(&v[l]);
-		      __m128d v1 = _mm_and_pd(vv, absMask.m);
-		      v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		      if(_mm_movemask_pd( v1 ) != 3)
-			scale = 0;
-		    }	    	  
-		}
-		
-		
-		if(scale)
-		  {
-		    __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-		    
-		    for(l = 0; l < 20; l+=2)
-		      {
-			__m128d ex3v = _mm_load_pd(&v[l]);
-			_mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));		    
-		      }
-		    
-		    addScale += wgt[i];	  
-		  }
-		x3_ptr += 20;
-	      }
-	  }
-      }
-      break;
-    case INNER_INNER:
-      for(i = 0; i < n; i++)
-	{ 
-	  if(isGap(x3_gap, i))
-	    {
-	      if(scaleGap)		   		    
-		addScale += wgt[i];
-	    }
-	  else
-	    {	  	     
-	      v = x3_ptr;
-	  	  
-	      if(isGap(x1_gap, i))
-		{
-		  vl = x1_gapColumn;
-		  le =  &left[maxCats * 400];
-		}
-	      else
-		{
-		  le =  &left[cptr[i] * 400];
-		  vl = x1_ptr;
-		  x1_ptr += 20;
-		}
-
-	      if(isGap(x2_gap, i))	
-		{
-		  vr = x2_gapColumn;
-		  ri =  &right[maxCats * 400];	    
-		}
-	      else
-		{
-		  ri =  &right[cptr[i] * 400];
-		  vr = x2_ptr;
-		  x2_ptr += 20;
-		}	 	  	  	  
-
-	      for(l = 0; l < 20; l+=2)
-		_mm_store_pd(&v[l], _mm_setzero_pd());	      		
-	 
-	      for(l = 0; l < 20; l++)
-		{
-		  __m128d x1v = _mm_setzero_pd();
-		  __m128d x2v = _mm_setzero_pd();
-		  double 
-		    *ev = &extEV[l * 20],
-		    *lv = &le[l * 20],
-		    *rv = &ri[l * 20];
-		  		  
-		  for(j = 0; j < 20; j+=2)
-		    {
-		      x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));		    
-		      x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
-		    }
-		  
-		  x1v = _mm_hadd_pd(x1v, x1v);
-		  x2v = _mm_hadd_pd(x2v, x2v);
-		  
-		  x1v = _mm_mul_pd(x1v, x2v);
-		  
-		  for(j = 0; j < 20; j+=2)
-		    {
-		      __m128d vv = _mm_load_pd(&v[j]);
-		      vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
-		      _mm_store_pd(&v[j], vv);
-		    }		    
-		  
-		}
-	      
-	      { 	    
-		__m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-		
-		scale = 1;
-		for(l = 0; scale && (l < 20); l += 2)
-		  {
-		    __m128d vv = _mm_load_pd(&v[l]);
-		    __m128d v1 = _mm_and_pd(vv, absMask.m);
-		    v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		    if(_mm_movemask_pd( v1 ) != 3)
-		      scale = 0;
-		  }	    	  
-	      }
-  
-	      if(scale)
-		{
-		  __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-		  
-		  for(l = 0; l < 20; l+=2)
-		    {
-		      __m128d ex3v = _mm_load_pd(&v[l]);		  
-		      _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
-		    }		   		  
-		  
-		  addScale += wgt[i];	   
-		}
-	      x3_ptr += 20;
-	    }
-	}
-      break;
-    default:
-      assert(0);
-    }
-  
- 
-  *scalerIncrement = addScale;
-
-}
-
-static void newviewGTRGAMMAPROT_LG4(int tipCase,
-				    double *x1, double *x2, double *x3, double *extEV[4], double *tipVector[4],
-				    int *ex3, unsigned char *tipX1, unsigned char *tipX2,
-				    int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling)
-{
-  double  *uX1, *uX2, *v;
-  double x1px2;
-  int  i, j, l, k, scale, addScale = 0;
-  double *vl, *vr;
-#ifndef __SIM_SSE3
-  double al, ar;
-#endif
-
-
-
-  switch(tipCase)
-    {
-    case TIP_TIP:
-      {
-	double umpX1[1840], umpX2[1840];
-
-	for(i = 0; i < 23; i++)
-	  {
-	   
-
-	    for(k = 0; k < 80; k++)
-	      {
-		
-		v = &(tipVector[k / 20][20 * i]);
-#ifdef __SIM_SSE3
-		double *ll =  &left[k * 20];
-		double *rr =  &right[k * 20];
-		
-		__m128d umpX1v = _mm_setzero_pd();
-		__m128d umpX2v = _mm_setzero_pd();
-
-		for(l = 0; l < 20; l+=2)
-		  {
-		    __m128d vv = _mm_load_pd(&v[l]);
-		    umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));
-		    umpX2v = _mm_add_pd(umpX2v, _mm_mul_pd(vv, _mm_load_pd(&rr[l])));					
-		  }
-		
-		umpX1v = _mm_hadd_pd(umpX1v, umpX1v);
-		umpX2v = _mm_hadd_pd(umpX2v, umpX2v);
-		
-		_mm_storel_pd(&umpX1[80 * i + k], umpX1v);
-		_mm_storel_pd(&umpX2[80 * i + k], umpX2v);
-#else
-		umpX1[80 * i + k] = 0.0;
-		umpX2[80 * i + k] = 0.0;
-
-		for(l = 0; l < 20; l++)
-		  {
-		    umpX1[80 * i + k] +=  v[l] *  left[k * 20 + l];
-		    umpX2[80 * i + k] +=  v[l] * right[k * 20 + l];
-		  }
-#endif
-	      }
-	  }
-
-	for(i = 0; i < n; i++)
-	  {
-	    uX1 = &umpX1[80 * tipX1[i]];
-	    uX2 = &umpX2[80 * tipX2[i]];
-
-	    for(j = 0; j < 4; j++)
-	      {
-		v = &x3[i * 80 + j * 20];
-
-#ifdef __SIM_SSE3
-		__m128d zero =  _mm_setzero_pd();
-		for(k = 0; k < 20; k+=2)		  		    
-		  _mm_store_pd(&v[k], zero);
-
-		for(k = 0; k < 20; k++)
-		  { 
-		    double *eev = &extEV[j][k * 20];
-		    x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
-		    __m128d x1px2v = _mm_set1_pd(x1px2);
-
-		    for(l = 0; l < 20; l+=2)
-		      {
-		      	__m128d vv = _mm_load_pd(&v[l]);
-			__m128d ee = _mm_load_pd(&eev[l]);
-
-			vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
-			
-			_mm_store_pd(&v[l], vv);
-		      }
-		  }
-
-#else
-
-		for(k = 0; k < 20; k++)
-		  v[k] = 0.0;
-
-		for(k = 0; k < 20; k++)
-		  {		   
-		    x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
-		   
-		    for(l = 0; l < 20; l++)		      					
-		      v[l] += x1px2 * extEV[j][20 * k + l];		     
-		  }
-#endif
-	      }	   
-	  }
-      }
-      break;
-    case TIP_INNER:
-      {
-	double umpX1[1840], ump_x2[20];
-
-
-	for(i = 0; i < 23; i++)
-	  {
-	   
-
-	    for(k = 0; k < 80; k++)
-	      { 
-		v = &(tipVector[k / 20][20 * i]);
-#ifdef __SIM_SSE3
-		double *ll =  &left[k * 20];
-				
-		__m128d umpX1v = _mm_setzero_pd();
-		
-		for(l = 0; l < 20; l+=2)
-		  {
-		    __m128d vv = _mm_load_pd(&v[l]);
-		    umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));		    					
-		  }
-		
-		umpX1v = _mm_hadd_pd(umpX1v, umpX1v);				
-		_mm_storel_pd(&umpX1[80 * i + k], umpX1v);		
-#else	    
-		umpX1[80 * i + k] = 0.0;
-
-		for(l = 0; l < 20; l++)
-		  umpX1[80 * i + k] +=  v[l] * left[k * 20 + l];
-#endif
-
-	      }
-	  }
-
-	for (i = 0; i < n; i++)
-	  {
-	    uX1 = &umpX1[80 * tipX1[i]];
-
-	    for(k = 0; k < 4; k++)
-	      {
-		v = &(x2[80 * i + k * 20]);
-#ifdef __SIM_SSE3	       
-		for(l = 0; l < 20; l++)
-		  {		   
-		    double *r =  &right[k * 400 + l * 20];
-		    __m128d ump_x2v = _mm_setzero_pd();	    
-		    
-		    for(j = 0; j < 20; j+= 2)
-		      {
-			__m128d vv = _mm_load_pd(&v[j]);
-			__m128d rr = _mm_load_pd(&r[j]);
-			ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr));
-		      }
-		     
-		    ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
-		    
-		    _mm_storel_pd(&ump_x2[l], ump_x2v);		   		     
-		  }
-
-		v = &(x3[80 * i + 20 * k]);
-
-		__m128d zero =  _mm_setzero_pd();
-		for(l = 0; l < 20; l+=2)		  		    
-		  _mm_store_pd(&v[l], zero);
-		  
-		for(l = 0; l < 20; l++)
-		  {
-		    double *eev = &extEV[k][l * 20];
-		    x1px2 = uX1[k * 20 + l]  * ump_x2[l];
-		    __m128d x1px2v = _mm_set1_pd(x1px2);
-		  
-		    for(j = 0; j < 20; j+=2)
-		      {
-			__m128d vv = _mm_load_pd(&v[j]);
-			__m128d ee = _mm_load_pd(&eev[j]);
-			
-			vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
-			
-			_mm_store_pd(&v[j], vv);
-		      }		     		    
-		  }			
-#else
-		for(l = 0; l < 20; l++)
-		  {
-		    ump_x2[l] = 0.0;
-
-		    for(j = 0; j < 20; j++)
-		      ump_x2[l] += v[j] * right[k * 400 + l * 20 + j];
-		  }
-
-		v = &(x3[80 * i + 20 * k]);
-
-		for(l = 0; l < 20; l++)
-		  v[l] = 0;
-
-		for(l = 0; l < 20; l++)
-		  {
-		    x1px2 = uX1[k * 20 + l]  * ump_x2[l];
-		    for(j = 0; j < 20; j++)
-		      v[j] += x1px2 * extEV[k][l * 20  + j];
-		  }
-#endif
-	      }
-	   
-#ifdef __SIM_SSE3
-	    { 
-	      v = &(x3[80 * i]);
-	      __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-	      
-	      scale = 1;
-	      for(l = 0; scale && (l < 80); l += 2)
-		{
-		  __m128d vv = _mm_load_pd(&v[l]);
-		  __m128d v1 = _mm_and_pd(vv, absMask.m);
-		  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		  if(_mm_movemask_pd( v1 ) != 3)
-		    scale = 0;
-		}	    	  
-	    }
-#else
-	    v = &x3[80 * i];
-	    scale = 1;
-	    for(l = 0; scale && (l < 80); l++)
-	      scale = (ABS(v[l]) <  minlikelihood);
-#endif
-
-	    if (scale)
-	      {
-#ifdef __SIM_SSE3
-	       __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-	       
-	       for(l = 0; l < 80; l+=2)
-		 {
-		   __m128d ex3v = _mm_load_pd(&v[l]);		  
-		   _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
-		 }		   		  
-#else
-		for(l = 0; l < 80; l++)
-		  v[l] *= twotothe256;
-#endif
-
-		if(useFastScaling)
-		  addScale += wgt[i];
-		else
-		  ex3[i]  += 1;	       
-	      }
-	  }
-      }
-      break;
-    case INNER_INNER:
-      for (i = 0; i < n; i++)
-       {
-	 for(k = 0; k < 4; k++)
-	   {
-	     vl = &(x1[80 * i + 20 * k]);
-	     vr = &(x2[80 * i + 20 * k]);
-	     v =  &(x3[80 * i + 20 * k]);
-
-#ifdef __SIM_SSE3
-	     __m128d zero =  _mm_setzero_pd();
-	     for(l = 0; l < 20; l+=2)		  		    
-	       _mm_store_pd(&v[l], zero);
-#else
-	     for(l = 0; l < 20; l++)
-	       v[l] = 0;
-#endif
-
-	     for(l = 0; l < 20; l++)
-	       {		 
-#ifdef __SIM_SSE3
-		 {
-		   __m128d al = _mm_setzero_pd();
-		   __m128d ar = _mm_setzero_pd();
-
-		   double *ll   = &left[k * 400 + l * 20];
-		   double *rr   = &right[k * 400 + l * 20];
-		   double *EVEV = &extEV[k][20 * l];
-		   
-		   for(j = 0; j < 20; j+=2)
-		     {
-		       __m128d lv  = _mm_load_pd(&ll[j]);
-		       __m128d rv  = _mm_load_pd(&rr[j]);
-		       __m128d vll = _mm_load_pd(&vl[j]);
-		       __m128d vrr = _mm_load_pd(&vr[j]);
-		       
-		       al = _mm_add_pd(al, _mm_mul_pd(vll, lv));
-		       ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv));
-		     }  		 
-		       
-		   al = _mm_hadd_pd(al, al);
-		   ar = _mm_hadd_pd(ar, ar);
-		   
-		   al = _mm_mul_pd(al, ar);
-
-		   for(j = 0; j < 20; j+=2)
-		     {
-		       __m128d vv  = _mm_load_pd(&v[j]);
-		       __m128d EVV = _mm_load_pd(&EVEV[j]);
-
-		       vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
-
-		       _mm_store_pd(&v[j], vv);
-		     }		  		   		  
-		 }		 
-#else
-		 al = 0.0;
-		 ar = 0.0;
-
-		 for(j = 0; j < 20; j++)
-		   {
-		     al += vl[j] * left[k * 400 + l * 20 + j];
-		     ar += vr[j] * right[k * 400 + l * 20 + j];
-		   }
-
-		 x1px2 = al * ar;
-
-		 for(j = 0; j < 20; j++)
-		   v[j] += x1px2 * extEV[k][20 * l + j];
-#endif
-	       }
-	   }
-	 
-
-#ifdef __SIM_SSE3
-	 { 
-	   v = &(x3[80 * i]);
-	   __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-	   
-	   scale = 1;
-	   for(l = 0; scale && (l < 80); l += 2)
-	     {
-	       __m128d vv = _mm_load_pd(&v[l]);
-	       __m128d v1 = _mm_and_pd(vv, absMask.m);
-	       v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	       if(_mm_movemask_pd( v1 ) != 3)
-		 scale = 0;
-	     }	    	  
-	 }
-#else
-	 v = &(x3[80 * i]);
-	 scale = 1;
-	 for(l = 0; scale && (l < 80); l++)
-	   scale = ((ABS(v[l]) <  minlikelihood));
-#endif
-
-	 if (scale)
-	   {
-#ifdef __SIM_SSE3
-	       __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-	       
-	       for(l = 0; l < 80; l+=2)
-		 {
-		   __m128d ex3v = _mm_load_pd(&v[l]);		  
-		   _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
-		 }		   		  
-#else	     
-	     for(l = 0; l < 80; l++)
-	       v[l] *= twotothe256;
-#endif
-
-	     if(useFastScaling)
-	       addScale += wgt[i];
-	     else
-	       ex3[i]  += 1;	  
-	   }
-       }
-      break;
-    default:
-      assert(0);
-    }
-
-  if(useFastScaling)
-    *scalerIncrement = addScale;
-
-}
-
-#endif
-
-#ifdef _OPTIMIZED_FUNCTIONS
-
 /*** BINARY DATA functions *****/
 
 static void newviewGTRCAT_BINARY( int tipCase,  double *EV,  int *cptr,
@@ -5892,24 +1003,24 @@
             le =  &left[cptr[i] * 4];
             ri =  &right[cptr[i] * 4];
 
-            _mm_store_pd(x3, _mm_setzero_pd());     
+            simde_mm_store_pd(x3, simde_mm_setzero_pd());     
                      
             for(l = 0; l < 2; l++)
               {                                                                                                                          
-                __m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&le[l * 2]));
-                __m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&ri[l * 2]));
+                simde__m128d al = simde_mm_mul_pd(simde_mm_load_pd(x1), simde_mm_load_pd(&le[l * 2]));
+                simde__m128d ar = simde_mm_mul_pd(simde_mm_load_pd(x2), simde_mm_load_pd(&ri[l * 2]));
                 
-                al = _mm_hadd_pd(al, al);
-                ar = _mm_hadd_pd(ar, ar);
+                al = simde_mm_hadd_pd(al, al);
+                ar = simde_mm_hadd_pd(ar, ar);
                 
-                al = _mm_mul_pd(al, ar);
+                al = simde_mm_mul_pd(al, ar);
                 
-                __m128d vv  = _mm_load_pd(x3);
-                __m128d EVV = _mm_load_pd(&EV[2 * l]);
+                simde__m128d vv  = simde_mm_load_pd(x3);
+                simde__m128d EVV = simde_mm_load_pd(&EV[2 * l]);
                 
-                vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+                vv = simde_mm_add_pd(vv, simde_mm_mul_pd(al, EVV));
                 
-                _mm_store_pd(x3, vv);                                                     
+                simde_mm_store_pd(x3, vv);                                                     
               }            
           }
       }
@@ -5925,41 +1036,41 @@
             le =  &left[cptr[i] * 4];
             ri =  &right[cptr[i] * 4];
 
-            _mm_store_pd(x3, _mm_setzero_pd());     
+            simde_mm_store_pd(x3, simde_mm_setzero_pd());     
                      
             for(l = 0; l < 2; l++)
               {                                                                                                                          
-                __m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&le[l * 2]));
-                __m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&ri[l * 2]));
+                simde__m128d al = simde_mm_mul_pd(simde_mm_load_pd(x1), simde_mm_load_pd(&le[l * 2]));
+                simde__m128d ar = simde_mm_mul_pd(simde_mm_load_pd(x2), simde_mm_load_pd(&ri[l * 2]));
                 
-                al = _mm_hadd_pd(al, al);
-                ar = _mm_hadd_pd(ar, ar);
+                al = simde_mm_hadd_pd(al, al);
+                ar = simde_mm_hadd_pd(ar, ar);
                 
-                al = _mm_mul_pd(al, ar);
+                al = simde_mm_mul_pd(al, ar);
                 
-                __m128d vv  = _mm_load_pd(x3);
-                __m128d EVV = _mm_load_pd(&EV[2 * l]);
+                simde__m128d vv  = simde_mm_load_pd(x3);
+                simde__m128d EVV = simde_mm_load_pd(&EV[2 * l]);
                 
-                vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+                vv = simde_mm_add_pd(vv, simde_mm_mul_pd(al, EVV));
                 
-                _mm_store_pd(x3, vv);                                                     
+                simde_mm_store_pd(x3, vv);                                                     
               }  
             
-            __m128d minlikelihood_sse = _mm_set1_pd(minlikelihood);
+            simde__m128d minlikelihood_sse = simde_mm_set1_pd(minlikelihood);
          
             scale = 1;
             
-            __m128d v1 = _mm_and_pd(_mm_load_pd(x3), absMask.m);
-            v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-            if(_mm_movemask_pd( v1 ) != 3)
+            simde__m128d v1 = simde_mm_and_pd(simde_mm_load_pd(x3), absMask.m);
+            v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+            if(simde_mm_movemask_pd( v1 ) != 3)
               scale = 0;                         
             
             if(scale)
               {
-                __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
+                simde__m128d twoto = simde_mm_set_pd(twotothe256, twotothe256);
                 
-                __m128d ex3v = _mm_load_pd(x3);           
-                _mm_store_pd(x3, _mm_mul_pd(ex3v,twoto));                                                 
+                simde__m128d ex3v = simde_mm_load_pd(x3);           
+                simde_mm_store_pd(x3, simde_mm_mul_pd(ex3v,twoto));                                                 
                 
                 if(useFastScaling)
                   addScale += wgt[i];
@@ -5979,41 +1090,41 @@
           le = &left[cptr[i] * 4];
           ri = &right[cptr[i] * 4];
 
-          _mm_store_pd(x3, _mm_setzero_pd());       
+          simde_mm_store_pd(x3, simde_mm_setzero_pd());       
           
           for(l = 0; l < 2; l++)
             {                                                                                                                            
-              __m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&le[l * 2]));
-              __m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&ri[l * 2]));
+              simde__m128d al = simde_mm_mul_pd(simde_mm_load_pd(x1), simde_mm_load_pd(&le[l * 2]));
+              simde__m128d ar = simde_mm_mul_pd(simde_mm_load_pd(x2), simde_mm_load_pd(&ri[l * 2]));
               
-              al = _mm_hadd_pd(al, al);
-              ar = _mm_hadd_pd(ar, ar);
+              al = simde_mm_hadd_pd(al, al);
+              ar = simde_mm_hadd_pd(ar, ar);
               
-              al = _mm_mul_pd(al, ar);
+              al = simde_mm_mul_pd(al, ar);
               
-              __m128d vv  = _mm_load_pd(x3);
-              __m128d EVV = _mm_load_pd(&EV[2 * l]);
+              simde__m128d vv  = simde_mm_load_pd(x3);
+              simde__m128d EVV = simde_mm_load_pd(&EV[2 * l]);
               
-              vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+              vv = simde_mm_add_pd(vv, simde_mm_mul_pd(al, EVV));
               
-              _mm_store_pd(x3, vv);                                                       
+              simde_mm_store_pd(x3, vv);                                                       
             }                             
 
-          __m128d minlikelihood_sse = _mm_set1_pd(minlikelihood);
+          simde__m128d minlikelihood_sse = simde_mm_set1_pd(minlikelihood);
          
           scale = 1;
                   
-          __m128d v1 = _mm_and_pd(_mm_load_pd(x3), absMask.m);
-          v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-          if(_mm_movemask_pd( v1 ) != 3)
+          simde__m128d v1 = simde_mm_and_pd(simde_mm_load_pd(x3), absMask.m);
+          v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+          if(simde_mm_movemask_pd( v1 ) != 3)
             scale = 0;                   
          
           if(scale)
             {
-              __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
+              simde__m128d twoto = simde_mm_set_pd(twotothe256, twotothe256);
                     
-              __m128d ex3v = _mm_load_pd(x3);             
-              _mm_store_pd(x3, _mm_mul_pd(ex3v,twoto));                                           
+              simde__m128d ex3v = simde_mm_load_pd(x3);             
+              simde_mm_store_pd(x3, simde_mm_mul_pd(ex3v,twoto));                                           
              
               if(useFastScaling)
                 addScale += wgt[i];
@@ -6055,24 +1166,24 @@
 	   {	     	     	    
 	     x3 = &(x3_start[8 * i + 2 * k]);	     
 	    	         
-	     _mm_store_pd(x3, _mm_setzero_pd());	    
+	     simde_mm_store_pd(x3, simde_mm_setzero_pd());	    
 	    	     
 	     for(l = 0; l < 2; l++)
 	       {		 		 						   		  		 		 
-		 __m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&left[k * 4 + l * 2]));
-		 __m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&right[k * 4 + l * 2]));
+		 simde__m128d al = simde_mm_mul_pd(simde_mm_load_pd(x1), simde_mm_load_pd(&left[k * 4 + l * 2]));
+		 simde__m128d ar = simde_mm_mul_pd(simde_mm_load_pd(x2), simde_mm_load_pd(&right[k * 4 + l * 2]));
 		 		       
-		 al = _mm_hadd_pd(al, al);
-		 ar = _mm_hadd_pd(ar, ar);
+		 al = simde_mm_hadd_pd(al, al);
+		 ar = simde_mm_hadd_pd(ar, ar);
 		   
-		 al = _mm_mul_pd(al, ar);
+		 al = simde_mm_mul_pd(al, ar);
 		   
-		 __m128d vv  = _mm_load_pd(x3);
-		 __m128d EVV = _mm_load_pd(&EV[2 * l]);
+		 simde__m128d vv  = simde_mm_load_pd(x3);
+		 simde__m128d EVV = simde_mm_load_pd(&EV[2 * l]);
 		 
-		 vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+		 vv = simde_mm_add_pd(vv, simde_mm_mul_pd(al, EVV));
 		 
-		 _mm_store_pd(x3, vv);		     	  		   		  
+		 simde_mm_store_pd(x3, vv);		     	  		   		  
 	       }	     	    
 	   }
        }
@@ -6087,48 +1198,48 @@
 	     x2 = &(x2_start[8 * i + 2 * k]);
 	     x3 = &(x3_start[8 * i + 2 * k]);	     
 	    	         
-	     _mm_store_pd(x3, _mm_setzero_pd());	    
+	     simde_mm_store_pd(x3, simde_mm_setzero_pd());	    
 	    	     
 	     for(l = 0; l < 2; l++)
 	       {		 		 						   		  		 		 
-		 __m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&left[k * 4 + l * 2]));
-		 __m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&right[k * 4 + l * 2]));
+		 simde__m128d al = simde_mm_mul_pd(simde_mm_load_pd(x1), simde_mm_load_pd(&left[k * 4 + l * 2]));
+		 simde__m128d ar = simde_mm_mul_pd(simde_mm_load_pd(x2), simde_mm_load_pd(&right[k * 4 + l * 2]));
 		 		       
-		 al = _mm_hadd_pd(al, al);
-		 ar = _mm_hadd_pd(ar, ar);
+		 al = simde_mm_hadd_pd(al, al);
+		 ar = simde_mm_hadd_pd(ar, ar);
 		   
-		 al = _mm_mul_pd(al, ar);
+		 al = simde_mm_mul_pd(al, ar);
 		   
-		 __m128d vv  = _mm_load_pd(x3);
-		 __m128d EVV = _mm_load_pd(&EV[2 * l]);
+		 simde__m128d vv  = simde_mm_load_pd(x3);
+		 simde__m128d EVV = simde_mm_load_pd(&EV[2 * l]);
 		 
-		 vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+		 vv = simde_mm_add_pd(vv, simde_mm_mul_pd(al, EVV));
 		 
-		 _mm_store_pd(x3, vv);		     	  		   		  
+		 simde_mm_store_pd(x3, vv);		     	  		   		  
 	       }	     	    
 	   }
 	
 	 x3 = &(x3_start[8 * i]);
-	 __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
+	 simde__m128d minlikelihood_sse = simde_mm_set1_pd( minlikelihood );
 	 
 	 scale = 1;
 	 for(l = 0; scale && (l < 8); l += 2)
 	   {
-	     __m128d vv = _mm_load_pd(&x3[l]);
-	     __m128d v1 = _mm_and_pd(vv, absMask.m);
-	     v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	     if(_mm_movemask_pd( v1 ) != 3)
+	     simde__m128d vv = simde_mm_load_pd(&x3[l]);
+	     simde__m128d v1 = simde_mm_and_pd(vv, absMask.m);
+	     v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+	     if(simde_mm_movemask_pd( v1 ) != 3)
 	       scale = 0;
 	   }	    	         
 	 
 	 if(scale)
 	   {
-	     __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
+	     simde__m128d twoto = simde_mm_set_pd(twotothe256, twotothe256);
 	     
 	     for(l = 0; l < 8; l+=2)
 	       {
-		 __m128d ex3v = _mm_load_pd(&x3[l]);		  
-		 _mm_store_pd(&x3[l], _mm_mul_pd(ex3v,twoto));	
+		 simde__m128d ex3v = simde_mm_load_pd(&x3[l]);		  
+		 simde_mm_store_pd(&x3[l], simde_mm_mul_pd(ex3v,twoto));	
 	       }		   		  
 	     
 	     if(useFastScaling)
@@ -6147,48 +1258,48 @@
 	     x2 = &(x2_start[8 * i + 2 * k]);
 	     x3 = &(x3_start[8 * i + 2 * k]);	     
 	    	         
-	     _mm_store_pd(x3, _mm_setzero_pd());	    
+	     simde_mm_store_pd(x3, simde_mm_setzero_pd());	    
 	    	     
 	     for(l = 0; l < 2; l++)
 	       {		 		 						   		  		 		 
-		 __m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&left[k * 4 + l * 2]));
-		 __m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&right[k * 4 + l * 2]));
+		 simde__m128d al = simde_mm_mul_pd(simde_mm_load_pd(x1), simde_mm_load_pd(&left[k * 4 + l * 2]));
+		 simde__m128d ar = simde_mm_mul_pd(simde_mm_load_pd(x2), simde_mm_load_pd(&right[k * 4 + l * 2]));
 		 		       
-		 al = _mm_hadd_pd(al, al);
-		 ar = _mm_hadd_pd(ar, ar);
+		 al = simde_mm_hadd_pd(al, al);
+		 ar = simde_mm_hadd_pd(ar, ar);
 		   
-		 al = _mm_mul_pd(al, ar);
+		 al = simde_mm_mul_pd(al, ar);
 		   
-		 __m128d vv  = _mm_load_pd(x3);
-		 __m128d EVV = _mm_load_pd(&EV[2 * l]);
+		 simde__m128d vv  = simde_mm_load_pd(x3);
+		 simde__m128d EVV = simde_mm_load_pd(&EV[2 * l]);
 		 
-		 vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+		 vv = simde_mm_add_pd(vv, simde_mm_mul_pd(al, EVV));
 		 
-		 _mm_store_pd(x3, vv);		     	  		   		  
+		 simde_mm_store_pd(x3, vv);		     	  		   		  
 	       }	     	    
 	   }
 	
 	 x3 = &(x3_start[8 * i]);
-	 __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
+	 simde__m128d minlikelihood_sse = simde_mm_set1_pd( minlikelihood );
 	 
 	 scale = 1;
 	 for(l = 0; scale && (l < 8); l += 2)
 	   {
-	     __m128d vv = _mm_load_pd(&x3[l]);
-	     __m128d v1 = _mm_and_pd(vv, absMask.m);
-	     v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	     if(_mm_movemask_pd( v1 ) != 3)
+	     simde__m128d vv = simde_mm_load_pd(&x3[l]);
+	     simde__m128d v1 = simde_mm_and_pd(vv, absMask.m);
+	     v1 = simde_mm_cmplt_pd(v1,  minlikelihood_sse);
+	     if(simde_mm_movemask_pd( v1 ) != 3)
 	       scale = 0;
 	   }	    	         
 	 
 	 if(scale)
 	   {
-	     __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
+	     simde__m128d twoto = simde_mm_set_pd(twotothe256, twotothe256);
 	     
 	     for(l = 0; l < 8; l+=2)
 	       {
-		 __m128d ex3v = _mm_load_pd(&x3[l]);		  
-		 _mm_store_pd(&x3[l], _mm_mul_pd(ex3v,twoto));	
+		 simde__m128d ex3v = simde_mm_load_pd(&x3[l]);		  
+		 simde_mm_store_pd(&x3[l], simde_mm_mul_pd(ex3v,twoto));	
 	       }		   		  
 	     
 	     if(useFastScaling)
@@ -6210,9 +1321,3 @@
 
 
 /**** BINARY DATA functions end ****/
-
-
-
-#endif
-
-
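
The hunks above apply the same mechanical translation shown for the AVX code
at the top of this patch to the __m128d/SSE arms of the likelihood kernels:
every vector type and intrinsic gains a simde_ prefix, and SIMDe resolves each
name to the native instruction on x86 or to a portable fallback elsewhere.
The numerical-scaling idiom that recurs in these hunks looks like this in
isolation (a minimal sketch against a system-installed SIMDe, so the include
path differs from the vendored ../debian/include copy; rescale2 is
illustrative, only the minlikelihood/twotothe256 threshold scheme is ExaML's):

    #include "simde/x86/sse2.h"

    /* Rescale a 2-lane likelihood vector when both entries have dropped
       below the threshold -- x3 must be 16-byte aligned for the load. */
    static void rescale2(double x3[2], double minlikelihood,
                         double twotothe256)
    {
      simde__m128d v  = simde_mm_load_pd(x3);
      /* clear the sign bits to get |x|, like the absMask constant above */
      simde__m128d ab = simde_mm_andnot_pd(simde_mm_set1_pd(-0.0), v);
      simde__m128d lt = simde_mm_cmplt_pd(ab, simde_mm_set1_pd(minlikelihood));
      if (simde_mm_movemask_pd(lt) == 3)  /* both sign bits set */
        simde_mm_store_pd(x3, simde_mm_mul_pd(v, simde_mm_set1_pd(twotothe256)));
    }

The movemask test is the portable way to read a lane-wise comparison back as
an integer: bit k of the result is the sign bit of lane k, so a value of 3
means both entries fell below the threshold and the vector gets multiplied by
2^256, with the shift recorded separately through addScale/wgt as above.
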
--- examl.orig/parser/axml.c
+++ examl/parser/axml.c
@@ -60,7 +60,9 @@
 
 #endif
 
-#if ! (defined(__ppc) || defined(__powerpc__) || defined(PPC))
+#include "../debian/include/simde/x86/sse.h"
+
+#if defined(SIMDE_SSE_NATIVE)
 #include <xmmintrin.h>
 /*
   special bug fix, enforces denormalized numbers to be flushed to zero,
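
parser/axml.c used to assume that anything which is not PowerPC has SSE; the
patch replaces that guess with a feature test. Including SIMDe's sse.h
unconditionally is cheap, and it defines SIMDE_SSE_NATIVE (the spelling used
by the SIMDe copy vendored under debian/include; current SIMDe releases spell
it SIMDE_X86_SSE_NATIVE) exactly when real SSE is present, so the raw
<xmmintrin.h> include and the MXCSR flush-to-zero tweak that follows it are
only compiled where they exist. examl/axml.c receives the same gate around
its _mm_setcsr call further down. The shape of the pattern, as a sketch
(enable_flush_to_zero is illustrative, not an ExaML function):

    #include "simde/x86/sse.h"  /* defines SIMDE_SSE_NATIVE on x86/SSE builds */

    static void enable_flush_to_zero(void)
    {
    #if defined(SIMDE_SSE_NATIVE)
      /* x86 only: flush denormals to zero for a small speed gain */
      _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON);
    #endif
    }

On every other architecture the function body is empty, which is the right
behaviour here: denormal flushing is an x86 MXCSR feature, and other CPUs
control it by other means, if at all.
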
--- examl.orig/examl/mic_native_dna.c
+++ examl/examl/mic_native_dna.c
@@ -219,11 +219,11 @@
 	#pragma noprefetch umpX1,umpX2
 	for (int i = 0; i < n; i++)
         {
-            _mm_prefetch((const char *)&x3[span*(i+8)], _MM_HINT_ET1);
-            _mm_prefetch((const char *)&x3[span*(i+8) + 8], _MM_HINT_ET1);
+            simde_mm_prefetch((const char *)&x3[span*(i+8)], _MM_HINT_ET1);
+            simde_mm_prefetch((const char *)&x3[span*(i+8) + 8], _MM_HINT_ET1);
 
-            _mm_prefetch((const char *)&x3[span*(i+1)], _MM_HINT_ET0);
-            _mm_prefetch((const char *)&x3[span*(i+1) + 8], _MM_HINT_ET0);
+            simde_mm_prefetch((const char *)&x3[span*(i+1)], _MM_HINT_ET0);
+            simde_mm_prefetch((const char *)&x3[span*(i+1) + 8], _MM_HINT_ET0);
 
             const double *uX1 = &umpX1[16 * tipX1[i]];
             const double *uX2 = &umpX2[16 * tipX2[i]];
@@ -248,15 +248,15 @@
         #pragma noprefetch umpX1
 	for (int i = 0; i < n; i++)
         {
-            _mm_prefetch((const char *)&x2[span*(i+16)], _MM_HINT_T1);
-            _mm_prefetch((const char *)&x2[span*(i+16) + 8], _MM_HINT_T1);
-            _mm_prefetch((const char *)&x3[span*(i+16)], _MM_HINT_ET1);
-            _mm_prefetch((const char *)&x3[span*(i+16) + 8], _MM_HINT_ET1);
-
-            _mm_prefetch((const char *)&x2[span*(i+1)], _MM_HINT_T0);
-            _mm_prefetch((const char *)&x2[span*(i+1) + 8], _MM_HINT_T0);
-            _mm_prefetch((const char *)&x3[span*(i+1)], _MM_HINT_ET0);
-            _mm_prefetch((const char *)&x3[span*(i+1) + 8], _MM_HINT_ET0);
+            simde_mm_prefetch((const char *)&x2[span*(i+16)], _MM_HINT_T1);
+            simde_mm_prefetch((const char *)&x2[span*(i+16) + 8], _MM_HINT_T1);
+            simde_mm_prefetch((const char *)&x3[span*(i+16)], _MM_HINT_ET1);
+            simde_mm_prefetch((const char *)&x3[span*(i+16) + 8], _MM_HINT_ET1);
+
+            simde_mm_prefetch((const char *)&x2[span*(i+1)], _MM_HINT_T0);
+            simde_mm_prefetch((const char *)&x2[span*(i+1) + 8], _MM_HINT_T0);
+            simde_mm_prefetch((const char *)&x3[span*(i+1)], _MM_HINT_ET0);
+            simde_mm_prefetch((const char *)&x3[span*(i+1) + 8], _MM_HINT_ET0);
 
             /* access pre-computed value based on the raw sequence data tipX1 that is used as an index */
             double* uX1 = &umpX1[span * tipX1[i]];
@@ -305,19 +305,19 @@
     {
         for (int i = 0; i < n; i++)
         {
-            _mm_prefetch((const char *) &x1[span*(i+8)], _MM_HINT_T1);
-            _mm_prefetch((const char *) &x1[span*(i+8) + 8], _MM_HINT_T1);
-            _mm_prefetch((const char *) &x2[span*(i+8)], _MM_HINT_T1);
-            _mm_prefetch((const char *) &x2[span*(i+8) + 8], _MM_HINT_T1);
-            _mm_prefetch((const char *) &x3[span*(i+8)], _MM_HINT_ET1);
-            _mm_prefetch((const char *) &x3[span*(i+8) + 8], _MM_HINT_ET1);
-
-            _mm_prefetch((const char *) &x1[span*(i+1)], _MM_HINT_T0);
-            _mm_prefetch((const char *) &x1[span*(i+1) + 8], _MM_HINT_T0);
-            _mm_prefetch((const char *) &x2[span*(i+1)], _MM_HINT_T0);
-            _mm_prefetch((const char *) &x2[span*(i+1) + 8], _MM_HINT_T0);
-            _mm_prefetch((const char *) &x3[span*(i+1)], _MM_HINT_ET0);
-            _mm_prefetch((const char *) &x3[span*(i+1) + 8], _MM_HINT_ET0);
+            simde_mm_prefetch((const char *) &x1[span*(i+8)], _MM_HINT_T1);
+            simde_mm_prefetch((const char *) &x1[span*(i+8) + 8], _MM_HINT_T1);
+            simde_mm_prefetch((const char *) &x2[span*(i+8)], _MM_HINT_T1);
+            simde_mm_prefetch((const char *) &x2[span*(i+8) + 8], _MM_HINT_T1);
+            simde_mm_prefetch((const char *) &x3[span*(i+8)], _MM_HINT_ET1);
+            simde_mm_prefetch((const char *) &x3[span*(i+8) + 8], _MM_HINT_ET1);
+
+            simde_mm_prefetch((const char *) &x1[span*(i+1)], _MM_HINT_T0);
+            simde_mm_prefetch((const char *) &x1[span*(i+1) + 8], _MM_HINT_T0);
+            simde_mm_prefetch((const char *) &x2[span*(i+1)], _MM_HINT_T0);
+            simde_mm_prefetch((const char *) &x2[span*(i+1) + 8], _MM_HINT_T0);
+            simde_mm_prefetch((const char *) &x3[span*(i+1)], _MM_HINT_ET0);
+            simde_mm_prefetch((const char *) &x3[span*(i+1) + 8], _MM_HINT_ET0);
 
             double uX1[16] __attribute__((aligned(64)));
             double uX2[16] __attribute__((aligned(64)));
@@ -407,15 +407,15 @@
     {
         for (int i = 0; i < n; i++)
         {
-            _mm_prefetch((const char *) &x1_start[span*(i+8)], _MM_HINT_T1);
-            _mm_prefetch((const char *) &x1_start[span*(i+8) + 8], _MM_HINT_T1);
-            _mm_prefetch((const char *) &x2_start[span*(i+8)], _MM_HINT_T1);
-            _mm_prefetch((const char *) &x2_start[span*(i+8) + 8], _MM_HINT_T1);
-
-            _mm_prefetch((const char *) &x1_start[span*(i+1)], _MM_HINT_T0);
-            _mm_prefetch((const char *) &x1_start[span*(i+1) + 8], _MM_HINT_T0);
-            _mm_prefetch((const char *) &x2_start[span*(i+1)], _MM_HINT_T0);
-            _mm_prefetch((const char *) &x2_start[span*(i+1) + 8], _MM_HINT_T0);
+            simde_mm_prefetch((const char *) &x1_start[span*(i+8)], _MM_HINT_T1);
+            simde_mm_prefetch((const char *) &x1_start[span*(i+8) + 8], _MM_HINT_T1);
+            simde_mm_prefetch((const char *) &x2_start[span*(i+8)], _MM_HINT_T1);
+            simde_mm_prefetch((const char *) &x2_start[span*(i+8) + 8], _MM_HINT_T1);
+
+            simde_mm_prefetch((const char *) &x1_start[span*(i+1)], _MM_HINT_T0);
+            simde_mm_prefetch((const char *) &x1_start[span*(i+1) + 8], _MM_HINT_T0);
+            simde_mm_prefetch((const char *) &x2_start[span*(i+1)], _MM_HINT_T0);
+            simde_mm_prefetch((const char *) &x2_start[span*(i+1) + 8], _MM_HINT_T0);
 
           const double *x1 = &(x1_start[span * i]);
           const double *x2 = &(x2_start[span * i]);
@@ -465,11 +465,11 @@
 	#pragma unroll(8)
 	for(int i = 0; i < n; i++)
         {
-          _mm_prefetch((const char *) &x2_start[span*(i+32)], _MM_HINT_T1);
-          _mm_prefetch((const char *) &x2_start[span*(i+32) + 8], _MM_HINT_T1);
+          simde_mm_prefetch((const char *) &x2_start[span*(i+32)], _MM_HINT_T1);
+          simde_mm_prefetch((const char *) &x2_start[span*(i+32) + 8], _MM_HINT_T1);
 
-          _mm_prefetch((const char *) &x2_start[span*(i+8)], _MM_HINT_T0);
-          _mm_prefetch((const char *) &x2_start[span*(i+8) + 8], _MM_HINT_T0);
+          simde_mm_prefetch((const char *) &x2_start[span*(i+8)], _MM_HINT_T0);
+          simde_mm_prefetch((const char *) &x2_start[span*(i+8) + 8], _MM_HINT_T0);
 
           const double *left = &(aTipVec[16 * tipX1[i]]);
           const double *right = &(x2_start[span * i]);
@@ -487,15 +487,15 @@
 	#pragma unroll(8)
         for(int i = 0; i < n; i++)
         {
-            _mm_prefetch((const char *) &x1_start[span*(i+16)], _MM_HINT_T1);
-            _mm_prefetch((const char *) &x1_start[span*(i+16) + 8], _MM_HINT_T1);
-            _mm_prefetch((const char *) &x2_start[span*(i+16)], _MM_HINT_T1);
-            _mm_prefetch((const char *) &x2_start[span*(i+16) + 8], _MM_HINT_T1);
-
-            _mm_prefetch((const char *) &x1_start[span*(i+4)], _MM_HINT_T0);
-            _mm_prefetch((const char *) &x1_start[span*(i+4) + 8], _MM_HINT_T0);
-            _mm_prefetch((const char *) &x2_start[span*(i+4)], _MM_HINT_T0);
-            _mm_prefetch((const char *) &x2_start[span*(i+4) + 8], _MM_HINT_T0);
+            simde_mm_prefetch((const char *) &x1_start[span*(i+16)], _MM_HINT_T1);
+            simde_mm_prefetch((const char *) &x1_start[span*(i+16) + 8], _MM_HINT_T1);
+            simde_mm_prefetch((const char *) &x2_start[span*(i+16)], _MM_HINT_T1);
+            simde_mm_prefetch((const char *) &x2_start[span*(i+16) + 8], _MM_HINT_T1);
+
+            simde_mm_prefetch((const char *) &x1_start[span*(i+4)], _MM_HINT_T0);
+            simde_mm_prefetch((const char *) &x1_start[span*(i+4) + 8], _MM_HINT_T0);
+            simde_mm_prefetch((const char *) &x2_start[span*(i+4)], _MM_HINT_T0);
+            simde_mm_prefetch((const char *) &x2_start[span*(i+4) + 8], _MM_HINT_T0);
 
             const double *left  = &(x1_start[span * i]);
             const double *right = &(x2_start[span * i]);
@@ -564,8 +564,8 @@
 
     for (int i = 0; i < aligned_width; i++)
     {
-        _mm_prefetch((const char *) &sumtable[i * span * 8], _MM_HINT_T0);
-        _mm_prefetch((const char *) &sumtable[i * span * 8 + 8], _MM_HINT_T0);
+        simde_mm_prefetch((const char *) &sumtable[i * span * 8], _MM_HINT_T0);
+        simde_mm_prefetch((const char *) &sumtable[i * span * 8 + 8], _MM_HINT_T0);
 
         /* access the array with pre-computed values */
         const double *sum = &sumtable[i * span * 8];
@@ -585,11 +585,11 @@
         #pragma unroll(8)
         for(int j = 0; j < 8; j++)
         {
-            _mm_prefetch((const char *) &sum[span*(j+8)], _MM_HINT_T1);
-            _mm_prefetch((const char *) &sum[span*(j+8) + 8], _MM_HINT_T1);
+            simde_mm_prefetch((const char *) &sum[span*(j+8)], _MM_HINT_T1);
+            simde_mm_prefetch((const char *) &sum[span*(j+8) + 8], _MM_HINT_T1);
 
-            _mm_prefetch((const char *) &sum[span*(j+1)], _MM_HINT_T0);
-            _mm_prefetch((const char *) &sum[span*(j+1) + 8], _MM_HINT_T0);
+            simde_mm_prefetch((const char *) &sum[span*(j+1)], _MM_HINT_T0);
+            simde_mm_prefetch((const char *) &sum[span*(j+1) + 8], _MM_HINT_T0);
 
             __m512d d0_1 = _mm512_load_pd(&diagptable0[0]);
             __m512d d0_2 = _mm512_load_pd(&diagptable0[8]);
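
Prefetches are pure hints, which makes this file the easiest part of the
port: simde_mm_prefetch becomes the native instruction on x86 and otherwise
degrades to the compiler's __builtin_prefetch where available, or to nothing,
so correctness never depends on it. The _MM_HINT_* constants are left
untouched because mic_native_dna.c is only built for Intel MIC targets, whose
compiler supplies them (note the __m512d lines above stay native as well);
fully portable code would use SIMDe's SIMDE_MM_HINT_* spellings instead, as
in this sketch (sum_with_prefetch is illustrative, not an ExaML function):

    #include "simde/x86/sse.h"

    /* Sum an array while hinting the line eight elements ahead into cache.
       Only throughput can depend on the hint, never the result. */
    static double sum_with_prefetch(const double *a, int n)
    {
      double s = 0.0;
      for (int i = 0; i < n; i++) {
        simde_mm_prefetch((const char *)&a[i + 8], SIMDE_MM_HINT_T0);
        s += a[i];
      }
      return s;
    }

Reaching past the end of the array with the i + 8 lookahead is safe, as it is
in the hunks above: a prefetch never faults, it is at worst wasted work.
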
--- examl.orig/examl/Makefile.OMP.AVX.gcc
+++ examl/examl/Makefile.OMP.AVX.gcc
@@ -3,28 +3,30 @@
 
 CC = mpicc
 
-COMMON_FLAGS = -D__SIM_SSE3 -D__AVX -D_USE_OMP -fopenmp -D_OPTIMIZED_FUNCTIONS -msse3 -D_GNU_SOURCE -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE  -Wall #  -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast    -Wno-unused-parameter
+COMMON_FLAGS = -D_USE_OMP -fopenmp -D_OPTIMIZED_FUNCTIONS -D_GNU_SOURCE -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE  -Wall #  -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast    -Wno-unused-parameter
 
 OPT_FLAG_1 = -O1
 OPT_FLAG_2 = -O2
 
 CFLAGS += $(COMMON_FLAGS) $(OPT_FLAG_2)
 
-LIBRARIES = -lm -mavx -fopenmp
+LIBRARIES = -lm -fopenmp
 
 RM = rm -f
 
 objs    = axml.o optimizeModel.o trash.o searchAlgo.o topologies.o treeIO.o models.o evaluatePartialGenericSpecial.o evaluateGenericSpecial.o newviewGenericSpecial.o makenewzGenericSpecial.o bipartitionList.o restartHashTable.o avxLikelihood.o byteFile.o partitionAssignment.o communication.o quartets.o
 
-all : clean examl-OMP-AVX
+SFX :=""
+
+all: examl-OMP$(SFX)
 
 GLOBAL_DEPS = axml.h globalVariables.h ../versionHeader/version.h
 
-examl-OMP-AVX : $(objs)
-	$(CC) -o examl-OMP-AVX $(objs) $(LIBRARIES) $(LDFLAGS)
+examl-OMP$(SFX) : $(objs)
+	$(CC) -o $@ $(objs) $(LIBRARIES) $(LDFLAGS)
 
 avxLikelihood.o : avxLikelihood.c $(GLOBAL_DEPS)
-	$(CC) $(CFLAGS) -mavx -c -o avxLikelihood.o avxLikelihood.c
+	$(CC) $(CFLAGS) -c -o avxLikelihood.o avxLikelihood.c
 
 models.o : models.c $(GLOBAL_DEPS)
 	 $(CC) $(COMMON_FLAGS) $(OPT_FLAG_1) -c -o models.o models.c
@@ -49,6 +51,6 @@
 
 
 clean : 
-	$(RM) *.o examl-OMP-AVX
+	$(RM) *.o examl-OMP$(SFX)
 
-dev : examl-OMP-AVX
+dev : examl-OMP$(SFX)
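
The Makefile hunks are the build-system half of the port. The ISA flags
(-msse3, -mavx) and the __SIM_SSE3/__AVX feature macros disappear from the
AVX flavours: with SIMDe the intrinsics in avxLikelihood.c compile under
whatever ISA the distribution's default flags select, and the sources no
longer branch on __AVX (see the axml.h hunks below; the renumbering in the
hunk headers earlier in the patch suggests the __SIM_SSE3 branches were
removed wholesale as well). Dropping -mavx from LIBRARIES loses nothing,
since a code-generation flag had no effect on the link line anyway. The new
SFX variable parameterizes the binary name so the packaging can build several
flavours from one Makefile, e.g. "make SFX=-generic" (a hypothetical
invocation), with the unsuffixed name as the default; and "all" no longer
depends on "clean", which previously forced a full rebuild and raced with
compilation under parallel make.
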
--- examl.orig/parser/Makefile.SSE3.gcc
+++ examl/parser/Makefile.SSE3.gcc
@@ -2,7 +2,7 @@
 # Makefile cleanup October 2006, Courtesy of Peter Cordes <peter@cordes.ca>
 
 CC = gcc 
-CFLAGS += -fomit-frame-pointer -O2 -D_GNU_SOURCE -msse -funroll-loops  -Wall -Wunused-parameter -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes   -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast
+CFLAGS += -fomit-frame-pointer -O2 -D_GNU_SOURCE -funroll-loops  -Wall -Wunused-parameter -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes   -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast
 
 
 LIBRARIES = -lm
@@ -11,19 +11,21 @@
 
 objs    = axml.o parsePartitions.o
 
-all : clean parse-examl
+SFX := ""
+
+all : clean parse-examl$(SFX)
 
 GLOBAL_DEPS = axml.h globalVariables.h ../versionHeader/version.h 
 
-parse-examl : $(objs)
-	$(CC) -o parse-examl $(objs) $(LIBRARIES) $(LDFLAGS)
+parse-examl$(SFX) : $(objs)
+	$(CC) -o $@ $(objs) $(LIBRARIES) $(LDFLAGS)
 
 
 axml.o : axml.c $(GLOBAL_DEPS)
 parsePartitions.o : parsePartitions.c $(GLOBAL_DEPS)
 
 clean : 
-	$(RM) *.o parse-examl
+	$(RM) *.o parse-examl$(SFX)
 
 
-dev : parse-examl
+dev : parse-examl$(SFX)
--- examl.orig/examl/axml.c
+++ examl/examl/axml.c
@@ -53,7 +53,7 @@
 #include <mpi.h>
 
 #if ! (defined(__ppc) || defined(__powerpc__) || defined(PPC))
-#include <xmmintrin.h>
+#include "../debian/include/simde/x86/sse.h"
 /*
   special bug fix, enforces denormalized numbers to be flushed to zero,
   without this program is a tiny bit faster though.
@@ -2591,7 +2591,8 @@
        substantial run-time differences for vectors of equal length.
     */
     
-#if ! (defined(__ppc) || defined(__powerpc__) || defined(PPC))
+#if defined(SIMDE_SSE_NATIVE)
+# include <xmmintrin.h>
     _mm_setcsr( _mm_getcsr() | _MM_FLUSH_ZERO_ON);
 #endif   
 
--- examl.orig/examl/Makefile.AVX.gcc
+++ examl/examl/Makefile.AVX.gcc
@@ -3,28 +3,30 @@
 
 CC = mpicc
 
-COMMON_FLAGS = -D__SIM_SSE3 -D__AVX -D_OPTIMIZED_FUNCTIONS -msse3 -D_GNU_SOURCE -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE #-Wall   -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast    -Wno-unused-parameter
+COMMON_FLAGS = -D_OPTIMIZED_FUNCTIONS -D_GNU_SOURCE -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE #-Wall   -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast    -Wno-unused-parameter
 
 OPT_FLAG_1 = -O1
 OPT_FLAG_2 = -O2
 
 CFLAGS += $(COMMON_FLAGS) $(OPT_FLAG_2)
 
-LIBRARIES = -lm -mavx
+LIBRARIES = -lm
 
 RM = rm -f
 
 objs    = axml.o optimizeModel.o trash.o searchAlgo.o topologies.o treeIO.o models.o evaluatePartialGenericSpecial.o evaluateGenericSpecial.o newviewGenericSpecial.o makenewzGenericSpecial.o bipartitionList.o restartHashTable.o avxLikelihood.o byteFile.o partitionAssignment.o communication.o quartets.o
 
-all : clean examl-AVX
+SFX :=""
+
+all: examl$(SFX)
 
 GLOBAL_DEPS = axml.h globalVariables.h ../versionHeader/version.h
 
-examl-AVX : $(objs)
-	$(CC) -o examl-AVX $(objs) $(LIBRARIES) $(LDFLAGS)
+examl$(SFX) : $(objs)
+	$(CC) -o $@ $(objs) $(LIBRARIES) $(LDFLAGS)
 
 avxLikelihood.o : avxLikelihood.c $(GLOBAL_DEPS)
-	$(CC) $(CFLAGS) -mavx -c -o avxLikelihood.o avxLikelihood.c
+	$(CC) $(CFLAGS) -c -o avxLikelihood.o avxLikelihood.c
 
 models.o : models.c $(GLOBAL_DEPS)
 	 $(CC) $(COMMON_FLAGS) $(OPT_FLAG_1) -c -o models.o models.c
@@ -49,6 +51,6 @@
 
 
 clean : 
-	$(RM) *.o examl-AVX
+	$(RM) *.o examl$(SFX)
 
-dev : examl-AVX
+dev : examl$(SFX)
--- examl.orig/examl/Makefile.KNL.icc
+++ examl/examl/Makefile.KNL.icc
@@ -7,7 +7,7 @@
 
 KNLFLAGS=-xMIC-AVX512 -fma -align -finline-functions -D__MIC_NATIVE -qopenmp -D_USE_OMP
 
-COMMON_FLAGS = -std=c99 -D__SIM_SSE3 -D_OPTIMIZED_FUNCTIONS -D_GNU_SOURCE -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE  $(KNLFLAGS) # -Wall   -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast    -Wno-unused-parameter
+COMMON_FLAGS = -std=c99 -D_OPTIMIZED_FUNCTIONS -D_GNU_SOURCE -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE  $(KNLFLAGS) # -Wall   -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast    -Wno-unused-parameter
 
 OPT_FLAG_1 = -O1
 OPT_FLAG_2 = -O2
--- examl.orig/examl/Makefile.MIC.icc
+++ examl/examl/Makefile.MIC.icc
@@ -4,7 +4,7 @@
 CC = mpicc
 
 MICFLAGS = -D__MIC_NATIVE -mmic -qopt-streaming-cache-evict=0 -qopenmp -D_USE_OMP #-D_PROFILE_MPI
-COMMON_FLAGS = -std=c99 -D__SIM_SSE3 -D_OPTIMIZED_FUNCTIONS -D_GNU_SOURCE -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE  $(MICFLAGS) # -Wall   -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast    -Wno-unused-parameter
+COMMON_FLAGS = -std=c99 -D_OPTIMIZED_FUNCTIONS -D_GNU_SOURCE -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE  $(MICFLAGS) # -Wall   -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast    -Wno-unused-parameter
 
 OPT_FLAG_1 = -O1
 OPT_FLAG_2 = -O2
--- examl.orig/examl/Makefile.OMP.SSE3.gcc
+++ examl/examl/Makefile.OMP.SSE3.gcc
@@ -3,7 +3,7 @@
 
 CC = mpicc
 
-COMMON_FLAGS = -D_USE_OMP -fopenmp -D_GNU_SOURCE -D__SIM_SSE3  -msse3 -fomit-frame-pointer -funroll-loops -D_OPTIMIZED_FUNCTIONS -D_USE_ALLREDUCE -Wall #-Wunused-parameter -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast -Wno-unused-parameter
+COMMON_FLAGS = -D_USE_OMP -fopenmp -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D_OPTIMIZED_FUNCTIONS -D_USE_ALLREDUCE -Wall #-Wunused-parameter -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast -Wno-unused-parameter
 
 OPT_FLAG_1 = -O1 
 OPT_FLAG_2 = -O2
--- examl.orig/examl/Makefile.SSE3.gcc
+++ examl/examl/Makefile.SSE3.gcc
@@ -4,7 +4,7 @@
 CC = mpicc
 
 
-COMMON_FLAGS = -D_GNU_SOURCE -D__SIM_SSE3  -msse3 -fomit-frame-pointer -funroll-loops -D_OPTIMIZED_FUNCTIONS -D_USE_ALLREDUCE #-Wall -Wunused-parameter -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast -Wno-unused-parameter
+COMMON_FLAGS = -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D_OPTIMIZED_FUNCTIONS -D_USE_ALLREDUCE #-Wall -Wunused-parameter -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast -Wno-unused-parameter
 
 OPT_FLAG_1 = -O1 
 OPT_FLAG_2 = -O2
--- examl.orig/examl/axml.h
+++ examl/examl/axml.h
@@ -42,11 +42,8 @@
 #ifdef __MIC_NATIVE
 #define BYTE_ALIGNMENT 64
 #define VECTOR_PADDING 8
-#elif defined __AVX
-#define BYTE_ALIGNMENT 32
-#define VECTOR_PADDING 1
 #else
-#define BYTE_ALIGNMENT 16
+#define BYTE_ALIGNMENT 32
 #define VECTOR_PADDING 1
 #endif
 
@@ -1337,8 +1334,6 @@
 extern void myBinFwrite(void *ptr, size_t size, size_t nmemb, FILE *byteFile);
 extern void myBinFread(void *ptr, size_t size, size_t nmemb, FILE *byteFile);
 
-#ifdef __AVX
-
 extern void newviewGTRGAMMAPROT_AVX_LG4(int tipCase,
 					double *x1, double *x2, double *x3, double *extEV[4], double *tipVector[4],
 					int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, 
@@ -1401,7 +1396,6 @@
 					 double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling,
 					 unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap, 
 					 double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn); 
-#endif
 
 
 
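
With the AVX kernels now built unconditionally, the non-MIC alignment floor
in examl/axml.h rises from 16 to 32 bytes (previously only __AVX builds got
32; MIC keeps 64), and the *_AVX prototypes lose their #ifdef __AVX fence.
parser/axml.h below is simplified the same way. The stricter constant matters
on real AVX hardware, where simde_mm256_load_pd lowers to an aligned 256-bit
load that faults on a merely 16-byte-aligned pointer; SIMDe's portable
fallbacks are more forgiving, but honouring the constant keeps both paths
identical in behaviour. What BYTE_ALIGNMENT has to guarantee, as a sketch
(alloc_likelihood_vector is illustrative, not an ExaML helper):

    #include <stdlib.h>

    #define BYTE_ALIGNMENT 32   /* mirrors the unified definition above */

    /* Return memory whose address is valid for aligned 256-bit loads. */
    static double *alloc_likelihood_vector(size_t n)
    {
      void *p = NULL;
      if (posix_memalign(&p, BYTE_ALIGNMENT, n * sizeof(double)) != 0)
        return NULL;
      return (double *)p;
    }
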
--- examl.orig/parser/axml.h
+++ examl/parser/axml.h
@@ -34,11 +34,7 @@
 #include "../versionHeader/version.h"
 
 
-#ifdef __AVX
 #define BYTE_ALIGNMENT 32
-#else
-#define BYTE_ALIGNMENT 16
-#endif
 
 
 
