SSE指令集的介绍网上已经有很多, 这里贴一个在VS2008环境下编写的SSE测试程序, 分别用纯C++代码、C++内联汇编、SSE Intrinsics三种方式对同一个数组做转换计算(对每个元素求 sqrt(x * 2.8)), 并显示各自的耗时。这是一个win32控制台程序。
主文件的代码一览:
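// Test_SSE.cpp : Defines the entry point for the console application.
// Converts an array of floats (sqrt(x * 2.8f)) with plain C++, inline SSE
// assembly and SSE intrinsics, and shows the time each variant takes.
//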
#include "stdafx.h"
#include <ctype.h>
#include <float.h>
#include <math.h>
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
#include <xmmintrin.h> // __m128 data type and SSE functions
#include <Windows.h> // Support odprintf
#include "MMX_SSESupport.h"
#include "TimeCounter.h"
// Number of floats in each test array. The SSE variants below process
// ARRAY_SIZE/4 groups of four floats, so this should stay a multiple of 4.
#define ARRAY_SIZE 100000
// Silences MSVC warning C4324 (structure was padded due to an alignment
// specifier), which the __declspec(align(16)) declarations can trigger.
#pragma warning(disable : 4324)
// Arrays processed by SSE should have 16 bytes alignment:
// m_fInitialArray is the input filled by InitArray(); m_fResultArray receives
// the converted values written by the three conversion functions.
__declspec(align(16)) float m_fInitialArray[ARRAY_SIZE];
__declspec(align(16)) float m_fResultArray[ARRAY_SIZE];
// Minimum and maximum values found by the most recent pass: InitArray() stores
// the extremes of the initial array here, the conversion functions store the
// extremes of the result array.
float m_fMin;
float m_fMax;
// Timing helpers, used as a pair inside one function body.
// TIME_START creates a heap-allocated CTimeCounter named pT in the current
// scope. TIME_END passes pT->GetExecutionTime() to ShowTime() and then deletes
// the counter, so that a measurement does not leak one CTimeCounter per call.
// TIME_END is wrapped in do { } while (0) so that "TIME_END;" is one statement.
#define TIME_START CTimeCounter* pT = new CTimeCounter()
#define TIME_END do { ShowTime(pT->GetExecutionTime()); delete pT; } while (0)
//
// odprintf -- printf-style debug trace.
//
// Formats the arguments into a fixed-size stack buffer, strips trailing
// whitespace, appends "\r\n" and passes the text to OutputDebugStringA().
// Text that does not fit in the buffer is truncated.
//
// fmt - printf-style format string; the variadic arguments must match it.
void __cdecl odprintf(const char* fmt, ...)
{
    char buf[4096];
    // Keep two bytes in reserve so that the "\r\n" appended below always fits,
    // even when the formatted text fills the buffer.
    const size_t cbText = sizeof(buf) - 2;

    buf[0] = '\0';
    va_list args;
    va_start(args, fmt);
    int nLen = vsnprintf_s(buf, cbText, _TRUNCATE, fmt, args);
    va_end(args);

    // With _TRUNCATE, vsnprintf_s returns -1 when the output was cut short.
    // Recover the real length from the (always terminated) buffer instead of
    // stepping the write pointer backwards past the start of buf.
    if (nLen < 0)
        nLen = (int) strlen(buf);

    char* p = buf + nLen;
    // isspace() is undefined for negative values, so go through unsigned char.
    while (p > buf && isspace((unsigned char) p[-1]))
        *--p = '\0';
    *p++ = '\r';
    *p++ = '\n';
    *p = '\0';
    OutputDebugStringA(buf); // output as ANSI string
}
//
// ShowTime -- print an elapsed time to stdout.
//
// nTime - the value to print; callers pass CTimeCounter::GetExecutionTime()
//         (milliseconds according to the original comment -- TODO confirm
//         against TimeCounter.h).
void ShowTime(__int64 nTime)
{
    // %I64d is the MSVC format for __int64. With g++ the matching type is
    // int64_t from <stdint.h>, which should be printed with %lld.
    const __int64 elapsed = nTime;
    printf("usage time: %I64d\n\n", elapsed);
}
//
// ShowArray -- print a sample of an array to stdout.
//
// pArray - array holding at least ARRAY_SIZE floats. A NULL pointer is
//          accepted and prints nothing.
//
// Only every 500th element is printed (not the whole array), all on one line,
// followed by a blank line.
void ShowArray(float* pArray)
{
    // Test the pointer itself, not the first element: an array that happens
    // to start with 0.0f must still be shown, and NULL must not be read.
    if (pArray == NULL)
        return;

    // The index only takes multiples of 500, so no per-element line break
    // is emitted inside the loop.
    for (int i = 0; i < ARRAY_SIZE; i += 500)
        printf("%f ", pArray[i]);

    printf("\n\n");
}
//
// InitArray -- fill the initial array and record its extremes.
//
// Writes sin(i * 6.29 / ARRAY_SIZE) + 2 into m_fInitialArray[i] for every
// index, stores the smallest and largest generated value in m_fMin / m_fMax,
// and prints a sample of the array with ShowArray().
void InitArray()
{
    m_fMin = FLT_MAX;
    // Seed the maximum with the lowest finite float. FLT_MIN would be wrong
    // here: it is the smallest *positive* normalized value, not a lower bound.
    m_fMax = -FLT_MAX;

    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        // Roughly one sine cycle (6.29 is a little over 2*pi), shifted up by 2
        // so every value is positive and sqrt can be used in the conversion.
        const float f = (float) sin(((double) i * 6.29 / ARRAY_SIZE)) + 2.0f;

        if (f < m_fMin)
            m_fMin = f;
        if (f > m_fMax)
            m_fMax = f;

        m_fInitialArray[i] = f;
    }

    ShowArray(m_fInitialArray);
}
//
// OnCplusplus -- run the conversion with plain C++ code.
//
// Each member of m_fInitialArray is converted to a member of m_fResultArray
// with the formula sqrt(x * 2.8f) (chosen only to demonstrate SSE features).
// The minimum and maximum result values are stored in m_fMin / m_fMax.
//
// The time taken by the loop is reported through TIME_START / TIME_END, and a
// sample of the result array is printed with ShowArray().
//
void OnCplusplus()
{
    TIME_START;

    m_fMin = FLT_MAX;
    // Lowest finite float; FLT_MIN is the smallest positive normalized value
    // and is not a valid starting point for a running maximum.
    m_fMax = -FLT_MAX;

    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        const float fResult = sqrt(m_fInitialArray[i] * 2.8f);
        m_fResultArray[i] = fResult;

        if (fResult < m_fMin)
            m_fMin = fResult;
        if (fResult > m_fMax)
            m_fMax = fResult;
    }

    TIME_END;
    ShowArray(m_fResultArray);
}
//
//OnSseAssembly, Make conversion using C++ code with inline Assembly
void OnSseAssembly()
{
TIME_START;
float* pIn = m_fInitialArray;
float* pOut = m_fResultArray;
float f = 2.8f;
float flt_min = FLT_MIN;
float flt_max = FLT_MAX;
__m128 min128;
__m128 max128;
// using additional registers:
// xmm2 - multiplication coefficient
// xmm3 - minimum
// xmm4 - maximum
_asm
{
movss xmm2, f // xmm2[0] = 2.8
shufps xmm2, xmm2, 0 // xmm2[1, 2, 3] = xmm2[0]
movss xmm3, flt_max // xmm3 = FLT_MAX
shufps xmm3, xmm3, 0 // xmm3[1, 2, 3] = xmm3[0]
movss xmm4, flt_min // xmm4 = FLT_MIN
shufps xmm4, xmm4, 0 // xmm3[1, 2, 3] = xmm3[0]
mov esi, pIn // input pointer
mov edi, pOut // output pointer
mov ecx, ARRAY_SIZE/4 // loop counter
start_loop:
movaps xmm1, [esi] // xmm1 = [esi]
mulps xmm1, xmm2 // xmm1 = xmm1 * xmm2
sqrtps xmm1, xmm1 // xmm1 = sqrt(xmm1)
movaps [edi], xmm1 // [edi] = xmm1
minps xmm3, xmm1
maxps xmm4, xmm1
add esi, 16
add edi, 16
dec ecx
jnz start_loop
movaps min128, xmm3
movaps max128, xmm4
}
// extract minimum and maximum values from min128 and max128
union u
{
__m128 m;
float f[4];
} x;
x.m = min128;
m_fMin = min(x.f[0], min(x.f[1], min(x.f[2], x.f[3])));
x.m = max128;
m_fMax = max(x.f[0], max(x.f[1], max(x.f[2], x.f[3])));
TIME_END;
ShowArray(m_fResultArray);
}
//
// OnSseCpp, Make conversion using C++ code with SSE Intrinsics
void OnSseCpp()
{
TIME_START;
__m128 coeff = _mm_set_ps1(2.8f); // coeff[0, 1, 2, 3] = 2.8
__m128 tmp;
__m128 min128 = _mm_set_ps1(FLT_MAX); // min128[0, 1, 2, 3] = FLT_MAX
__m128 max128 = _mm_set_ps1(FLT_MIN); // max128[0, 1, 2, 3] = FLT_MIN
__m128* pSource = (__m128*) m_fInitialArray;
__m128* pDest = (__m128*) m_fResultArray;
for ( int i = 0; i < ARRAY_SIZE/4; i++ )
{
tmp = _mm_mul_ps(*pSource, coeff); // tmp = *pSource * coeff
*pDest = _mm_sqrt_ps(tmp); // *pDest = sqrt(tmp)
min128 = _mm_min_ps(*pDest, min128);
max128 = _mm_max_ps(*pDest, max128);
pSource++;
pDest++;
}
// extract minimum and maximum values from min128 and max128
union u
{
__m128 m;
float f[4];
} x;
x.m = min128;
m_fMin = min(x.f[0], min(x.f[1], min(x.f[2], x.f[3])));
x.m = max128;
m_fMax = max(x.f[0], max(x.f[1], max(x.f[2], x.f[3])));
TIME_END;
ShowArray(m_fResultArray);
}
// Program entry point.
//
// Checks for SSE support, fills the initial array, then runs the three
// conversion variants (plain C++, inline assembly, SSE intrinsics) one after
// another, waiting for a key press (getchar) before each of them and before
// exiting.
//
// argc, argv - command line; not used.
// Returns 0 on success, 1 when SSE is reported as unsupported.
int _tmain(int argc, _TCHAR* argv[])
{
    // Test SSE support. The flags start out false so that they hold a defined
    // value even if TestFeatures() leaves one of them untouched.
    // NOTE(review): TestFeatures() comes from MMX_SSESupport.h; presumably it
    // sets *bSSE when the CPU supports SSE -- confirm against that header.
    bool bMMX = false;
    bool bSSE = false;
    TestFeatures(&bMMX, &bSSE);
    if ( !bSSE )
    {
        // Do not support SSE. odprintf() only reaches an attached debugger, so
        // also tell the console user and signal the failure in the exit code.
        odprintf("Do not support SSE.\n");
        fprintf(stderr, "Do not support SSE.\n");
        return 1;
    }
    odprintf("everything is ok...");

    // first, prepare data
    printf("program generate %d floating point(Not all data are displayed)...\n\n", ARRAY_SIZE);
    InitArray();

    // second, make conversion using C++ code
    getchar();
    printf("Make conversion using C++ code\n\n");
    OnCplusplus();

    // third, make conversion using C++ code with inline Assembly
    getchar();
    printf("Make conversion using C++ code with inline Assembly\n\n");
    OnSseAssembly();

    // finally, make conversion using C++ code with SSE Intrinsics
    getchar();
    printf("Make conversion using C++ code with SSE Intrinsics\n\n");
    OnSseCpp();

    getchar();
    return 0;
}