这是一个对数组中的元素进行归一化的程序,两段程序代码是一样的,但是第二段的就多了一个mulps ,而速度却慢了很多。请高手帮忙解答一下,谢谢 #include "stdio.h" #include "stdlib.h" #include "conio.h" #include "math.h" #include "time.h" #include "windows.h" #define NUM 10000000 typedef _declspec(align (16)) float vec3_t[3]; vec3_t vec[NUM]; int main() { int i, j, index; int s, e; float len; srand(time(NULL)); for (i = 0; i < 5; i++) { for (j = 0; j < NUM; j++) { vec[j][0] = (rand()%100)/100.0f; vec[j][1] = (rand()%100)/100.0f; vec[j][2] = (rand()%100)/100.0f; } s = clock(); for (j = 0; j < NUM; j++) { /* len = vec[j][0]*vec[j][0] + vec[j][1]*vec[j][1] + vec[j][2]*vec[j][2]; len = (float)sqrt(len); len = 1.0f/len; vec[j][0] *= len; vec[j][1] *= len; vec[j][2] *= len;*/ _asm { lea ecx, vec[j] movups xmm0, [ecx] movaps xmm1, xmm0 // mulps xmm1, xmm1 // movaps xmm2, xmm1 // movaps xmm3, xmm1 // shufps xmm2, xmm2, 0xc9 // shufps xmm3, xmm3, 0xd2 // addps xmm1, xmm2 // addps xmm1, xmm3 // rsqrtps xmm1, xmm1 // mulps xmm0, xmm1 // movups [ecx], xmm0 } } e = clock(); printf("%d\n", e - s); for (j = 0; j < NUM; j++) { vec[j][0] = (rand()%100)/100.0f; vec[j][1] = (rand()%100)/100.0f; vec[j][2] = (rand()%100)/100.0f; } s = clock(); for (j = 0; j < NUM; j++) { /* len = vec[j][0]*vec[j][0] + vec[j][1]*vec[j][1] + vec[j][2]*vec[j][2]; len = (float)sqrt(len); len = 1.0f/len; vec[j][0] *= len; vec[j][1] *= len; vec[j][2] *= len;*/ _asm { lea ecx, vec[j] movups xmm0, [ecx] movaps xmm1, xmm0 mulps xmm1, xmm1 //这个语句很慢,why // movaps xmm2, xmm1 // movaps xmm3, xmm1 // shufps xmm2, xmm2, 0xc9 // shufps xmm3, xmm3, 0xd2 // addps xmm1, xmm2 // addps xmm1, xmm3 // rsqrtps xmm1, xmm1 // mulps xmm0, xmm1 // movups [ecx], xmm0 } } e = clock(); printf("%d\n\n", e - s); } getch(); return 0; }
既然会用 SSE2,难道不知道各类指令大致的周期?费解。 - 回复 | (1363) | aaaa | 2007-01-07 12:07:22