返回
SSE居然比C还慢——这是一段把向量归一化为单位向量的代码,发现C的版本比SSE的版本要快得多!!!真是费解,不知道是程序写得有问题还是其他什么原因。
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <conio.h>
/* Three-component float vector (indices 0, 1, 2). The MSVC-specific
 * _declspec(align(16)) requests 16-byte alignment for objects of this type;
 * the array itself still declares only 3 floats. */
typedef _declspec(align(16)) float vec3_t[3];
/*
 * Normalize a 3-component vector in place using SSE (MSVC x86 inline asm).
 *
 * vec: pointer to three consecutive floats (x, y, z); overwritten with
 *      x/len, y/len, z/len where len = sqrt(x*x + y*y + z*z).
 *
 * Only the 12 bytes that belong to the vector are read and written. The
 * fourth SSE lane is forced to zero instead of being loaded from whatever
 * follows the array, so no memory past vec[2] is touched and no
 * uninitialised value enters the arithmetic.
 *
 * A zero-length vector is not special-cased (the division is by zero).
 */
inline void vec_normalize_sse(vec3_t vec)
{
    _asm {
        mov     esi, vec

        // Build xmm0 = [x, y, z, 0] from exactly three floats.
        movss   xmm0, dword ptr [esi+8]     // xmm0 = [z, 0, 0, 0]
        movlhps xmm0, xmm0                  // xmm0 = [z, 0, z, 0]
        movlps  xmm0, qword ptr [esi]       // xmm0 = [x, y, z, 0]

        // Squares of every lane.
        movaps  xmm1, xmm0
        mulps   xmm1, xmm1                  // xmm1 = [xx, yy, zz, 0]

        // Horizontal sum into lane 0 of xmm1.
        movaps  xmm2, xmm1
        shufps  xmm2, xmm1, 0xe1            // xmm2 = [yy, xx, zz, 0]
        movaps  xmm3, xmm1
        shufps  xmm3, xmm1, 0xc6            // xmm3 = [zz, yy, xx, 0]
        addps   xmm1, xmm2
        addps   xmm1, xmm3                  // lane 0 = xx + yy + zz

        // Broadcast the squared length, take the root, divide.
        shufps  xmm1, xmm1, 0x00
        sqrtps  xmm1, xmm1
        divps   xmm0, xmm1                  // xmm0 = [x/len, y/len, z/len, -]

        // Store exactly three floats back.
        movlps  qword ptr [esi], xmm0       // vec[0], vec[1]
        movhlps xmm0, xmm0                  // lane 0 = z/len
        movss   dword ptr [esi+8], xmm0     // vec[2]
    }
}
/*
 * Normalize a 3-component vector in place using plain scalar C.
 *
 * vec: three floats (x, y, z); each is multiplied by 1 / sqrt(x*x + y*y + z*z).
 * A zero-length vector is not special-cased.
 */
inline void vec_normalize_c(vec3_t vec)
{
    float sq_sum;
    float norm;
    float scale;

    /* Squared length, summed left to right. */
    sq_sum = vec[0]*vec[0] + vec[1]*vec[1] + vec[2]*vec[2];

    /* One square root and one division, then three multiplies. */
    norm = (float)sqrt(sq_sum);
    scale = 1.0f/norm;

    vec[0] = vec[0] * scale;
    vec[1] = vec[1] * scale;
    vec[2] = vec[2] * scale;
}
/*
 * Benchmark driver: normalizes the vector (1, 2, 3) ten million times with
 * vec_normalize_sse and then with vec_normalize_c, printing for each run the
 * elapsed clock() ticks and the resulting vector. Waits for a key press
 * (getch) before returning 0.
 */
int main()
{
    int iter, t_start, t_stop, iterations;
    vec3_t v;

    iterations = 10000000;

    /* Timed run of the SSE implementation. */
    v[0] = 1.0f;
    v[1] = 2.0f;
    v[2] = 3.0f;
    t_start = clock();
    iter = 0;
    while (iter < iterations) {
        vec_normalize_sse(v);
        iter++;
    }
    t_stop = clock();
    printf("sse = %d, %f, %f, %f\n", t_stop - t_start, v[0], v[1], v[2]);

    /* Timed run of the scalar C implementation, from the same start vector. */
    v[0] = 1.0f;
    v[1] = 2.0f;
    v[2] = 3.0f;
    t_start = clock();
    iter = 0;
    while (iter < iterations) {
        vec_normalize_c(v);
        iter++;
    }
    t_stop = clock();
    printf("c = %d, %f, %f, %f\n", t_stop - t_start, v[0], v[1], v[2]);

    /* Keep the console window open until a key is pressed. */
    getch();
    return 0;
}
回复 |
(1054) |
leo1981816 | 2006-04-11 09:03:28
SIMD 指令重在同一条指令计算多组数据。单组运算的速度当然不如 fpu, 否则要 fpu 干什么?你只利用 sse 同时多计算了两个乘法。但是指令却多了很多。 - 回复 | (1055) | 云风 | 2006-04-11 12:53:18