INTEL SSE指令集优化 By Liang Zhu 2015-12-09 更新日期:2015-12-09 文章目录 1. SSE1.1. SSE概述1.2. SSE DEMO SSESSE概述SSE是指令集的简称,它包括70条指令,其中包含单指令多数据浮点计算、以及额外的SIMD整数和高速缓存控制指令。其优势包括:更高分辨率的图像浏览和处理、高质量音频、MPEG2视频、同时MPEG2加解密码以及语音识别占用更少CPU资源;更高精度和更快响应速度。 大部分涉及到128位内存变量操作的,内存变量首地址必须要对齐16字节,也就是内存地址低4位为0,否则会引起CPU异常,导致指令执行失败,此错误编译器不检查. SSE DEMO1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980#include <stdio.h>#include <vector>#include <iostream>#include <unistd.h>#include <fcntl.h>#include <sys/types.h>#include <time.h>#include <stdlib.h>#include <xmmintrin.h>// test sse vector calc commandusing namespace std;void ComputeArrayCPlusPlusSSE( float* pArray1, // [输入] 源数组1 float* pArray2, // [输入] 源数组2 float* pResult, // [输出] 用来存放结果的数组 int nSize) // [输入] 数组的大小{ int nLoop = nSize / 4; __m128 m1, m2, m3, m4; __m128* pSrc1 = (__m128*) pArray1; __m128* pSrc2 = (__m128*) pArray2; __m128* pDest = (__m128*) pResult; __m128 m0_5 = _mm_set_ps1(0.5f); // m0_5[0, 1, 2, 3] = 0.5 for ( int i = 0; i < nLoop; i++ ) { m1 = _mm_mul_ps(*pSrc1, *pSrc1); // m1 = *pSrc1 * *pSrc1 m2 = _mm_mul_ps(*pSrc2, *pSrc2); // m2 = *pSrc2 * *pSrc2 m3 = _mm_add_ps(m1, m2); // m3 = m1 + m2 m4 = _mm_sqrt_ps(m3); // m4 = sqrt(m3) *pDest = _mm_add_ps(m4, m0_5); // *pDest = m4 + 0.5 pSrc1++; pSrc2++; pDest++; }}void test() { float *va = new float[64]; float *vb = new float[64]; float *vo = new float[64]; for (int i=0; i<64; ++i) { va[i] = i/10.0; vb[i] = (i+1)/10.0; } ComputeArrayCPlusPlusSSE(va, vb, vo, 64); for (int i=0; i<64; ++i) { printf("%.3f ", vo[i]); } printf("\n"); delete [] va; delete [] vb; delete [] vo;}// test new alignmentvoid test2() { float *a, *b; for (int i=0; i<10; ++i) { a = new float[9]; b = new float[1]; printf("a:%p b:%p\n", a, b); }}int main(int argc, char *argv[]){ test(); //test2(); //test1(); return 0;}