SSE指令集

原创

mb63982c735c3d9 2022-12-13 15:56:50 ©著作权

文章标签 c++ alignment float assembly arrays 文章分类 运维

©著作权归作者所有：来自51CTO博客作者mb63982c735c3d9的原创作品，请联系作者获取转载授权，否则将追究法律责任

SSE指令集的介绍网上已经有很多, 这里贴一个在VS2008环境下的SSE测试程序: 分别用C++代码、C++内联汇编和C++的SSE Intrinsics三种方式, 对同一个浮点数组做相同的逐元素转换(conversion, 即 sqrt(x * 2.8)), 并分别输出各自的耗时。这是一个win32控制台程序。

主文件的代码一览:

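// Test_SSE.cpp : Defines the entry point for the console application.
// Runs the same element-wise array conversion in plain C++, inline SSE
// assembly and SSE intrinsics, and prints the time taken by each.
//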

#include "stdafx.h"
#include <xmmintrin.h>      // __m128 data type and SSE functions
#include <float.h>
#include <math.h>
#include <Windows.h>    // Support odprintf
#include <stdarg.h>
#include <ctype.h>
#include "MMX_SSESupport.h"
#include "TimeCounter.h"

// Number of floats in each test array. The SSE code paths below process
// ARRAY_SIZE/4 groups of four floats, so this should stay a multiple of 4.
#define ARRAY_SIZE 100000

// Silence MSVC warning C4324 (structure was padded due to alignment specifier).
#pragma warning(disable : 4324)

// Arrays processed by SSE must be 16-byte aligned: the code below accesses
// them with movaps and through __m128 pointers.
__declspec(align(16)) float m_fInitialArray[ARRAY_SIZE];
__declspec(align(16)) float m_fResultArray[ARRAY_SIZE];


// Minimum and maximum values found by the most recent pass over an array
// (written by InitArray and by each of the conversion routines).
float m_fMin;
float m_fMax;

// Timing helpers.
// TIME_START creates a heap-allocated CTimeCounter named pT in the current
// scope. TIME_END passes pT->GetExecutionTime() to ShowTime() and then
// deletes the counter so it is not leaked. Each TIME_START must be paired
// with exactly one TIME_END in the same scope.
#define TIME_START CTimeCounter* pT = new CTimeCounter()
#define TIME_END   do { ShowTime(pT->GetExecutionTime()); delete pT; pT = NULL; } while (0)

//
// odprintf -- printf-style debug output helper.
//
// fmt, ...: printf-style format string and arguments.
//
// Formats the message into a local 4096-byte buffer (over-long messages are
// truncated), strips trailing whitespace, appends "\r\n" and passes the
// result to OutputDebugStringA as an ANSI string.
void __cdecl odprintf(const char* fmt, ...)
{
  char buf[4096];
  char* p = buf;
  va_list args;

  // Pre-terminate so the buffer holds a valid empty string even if
  // formatting fails before anything is written.
  buf[0] = '\0';

  va_start(args, fmt);
  // Keep two bytes in reserve for the "\r\n" appended below; the formatter
  // itself accounts for the terminating '\0'.
  int nWritten = vsnprintf_s(buf, sizeof(buf) - 2, _TRUNCATE, fmt, args);
  va_end(args);

  if ( nWritten >= 0 )
  {
    p += nWritten;
  }
  else
  {
    // vsnprintf_s returns -1 when the output was truncated (and on error),
    // so the return value cannot be used as a length. Walk to the
    // terminator instead of stepping the pointer backwards.
    while ( *p != '\0' )
      ++p;
  }

  // Strip trailing whitespace. The cast avoids passing a negative char
  // value to isspace.
  while ( p > buf  &&  isspace((unsigned char) p[-1]) )
    *--p = '\0';

  *p++ = '\r';
  *p++ = '\n';
  *p   = '\0';

  OutputDebugStringA(buf);  // output as ANSI string
}

//
// ShowTime -- print an elapsed time on stdout.
//
// nTime: value to print. TIME_END passes CTimeCounter::GetExecutionTime()
//        here; the original comment describes the unit as milliseconds.
void ShowTime(__int64 nTime)
{
  // %I64d is the MSVC length modifier for __int64. With g++ the matching
  // type is int64_t from <stdint.h>, printed with %lld.
  static const char kTimeFormat[] = "usage time: %I64d\n\n";

  printf(kTimeFormat, nTime);
}

//
// ShowArray -- print a sample of an array on stdout.
//
// pArray: array of at least ARRAY_SIZE floats. A NULL pointer is ignored.
//
// Only every 500th element is printed (indices 0, 500, 1000, ...), all on
// one line and each followed by two spaces; the line is then terminated
// and followed by a blank line.
void ShowArray(float* pArray)
{
  // Check the pointer itself. Testing the first element instead would
  // crash on NULL and would silently skip any array that starts with 0.0f.
  if ( pArray == NULL )
    return;

  const int kSampleStride = 500;   // not all data is displayed

  for ( int i = 0; i < ARRAY_SIZE; i += kSampleStride )
    printf("%f  ", pArray[i]);

  printf("\n\n");
}

//
// InitArray -- fill m_fInitialArray with the test data.
//
// Element i is sin(i * 6.29 / ARRAY_SIZE) + 2, i.e. roughly one sine cycle
// (6.29 is slightly more than 2*pi) shifted up so that every value is
// positive and the later conversions can take its square root.
// m_fMin and m_fMax receive the smallest and largest generated value, and
// a sample of the array is printed with ShowArray.
void InitArray()
{
  // Seed the running maximum with the most negative finite float.
  // FLT_MIN is the smallest *positive* float and is not a valid lower seed.
  m_fMin = FLT_MAX;
  m_fMax = -FLT_MAX;

  for ( int i = 0; i < ARRAY_SIZE; i++ )
  {
    const float fValue = (float) sin(((double)i * 6.29 / ARRAY_SIZE)) + 2.0f;

    if ( fValue < m_fMin )
      m_fMin = fValue;

    if ( fValue > m_fMax )
      m_fMax = fValue;

    m_fInitialArray[i] = fValue;
  }

  ShowArray(m_fInitialArray);
}

//
// OnCplusplus -- make the conversion using plain C++ code.
//
// Each element of m_fInitialArray is converted to the matching element of
// m_fResultArray as sqrt(x * 2.8f) (an arbitrary formula, chosen only to
// demonstrate the SSE features in the other variants).
// The minimum and maximum results are stored in m_fMin and m_fMax.
//
// The conversion is timed with TIME_START/TIME_END, which prints the
// elapsed time; a sample of the result array is printed afterwards.
//
void OnCplusplus()
{
  TIME_START;

  // Seed the running maximum with the most negative finite float.
  // FLT_MIN is the smallest *positive* float and is not a valid lower seed.
  m_fMin = FLT_MAX;
  m_fMax = -FLT_MAX;

  for ( int i = 0; i < ARRAY_SIZE; i++ )
  {
    const float fResult = sqrt(m_fInitialArray[i] * 2.8f);

    m_fResultArray[i] = fResult;

    if ( fResult < m_fMin )
      m_fMin = fResult;

    if ( fResult > m_fMax )
      m_fMax = fResult;
  }

  TIME_END;

  ShowArray(m_fResultArray);
}

//
//OnSseAssembly, Make conversion using C++ code with inline Assembly
void OnSseAssembly()
{
  TIME_START;

  float* pIn = m_fInitialArray;
  float* pOut = m_fResultArray;

  float f = 2.8f;
  float flt_min = FLT_MIN;
  float flt_max = FLT_MAX;

  __m128 min128;
  __m128 max128;

  // using additional registers:
  // xmm2 - multiplication coefficient
  // xmm3 - minimum
  // xmm4 - maximum

  _asm
  {
    movss   xmm2, f                         // xmm2[0] = 2.8
      shufps  xmm2, xmm2, 0                   // xmm2[1, 2, 3] = xmm2[0]

      movss   xmm3, flt_max                   // xmm3 = FLT_MAX
      shufps  xmm3, xmm3, 0                   // xmm3[1, 2, 3] = xmm3[0]

      movss   xmm4, flt_min                   // xmm4 = FLT_MIN
      shufps  xmm4, xmm4, 0                   // xmm3[1, 2, 3] = xmm3[0]

      mov         esi, pIn                    // input pointer
      mov         edi, pOut                   // output pointer
      mov         ecx, ARRAY_SIZE/4           // loop counter

start_loop:
    movaps      xmm1, [esi]                 // xmm1 = [esi]
    mulps       xmm1, xmm2                  // xmm1 = xmm1 * xmm2
      sqrtps      xmm1, xmm1                  // xmm1 = sqrt(xmm1)
      movaps      [edi], xmm1                 // [edi] = xmm1

      minps       xmm3, xmm1
      maxps       xmm4, xmm1

      add         esi, 16
      add         edi, 16

      dec         ecx
      jnz         start_loop


      movaps      min128, xmm3
      movaps      max128, xmm4
  }

  // extract minimum and maximum values from min128 and max128
  union u
  {
    __m128 m;
    float f[4];
  } x;

  x.m = min128;
  m_fMin = min(x.f[0], min(x.f[1], min(x.f[2], x.f[3])));

  x.m = max128;
  m_fMax = max(x.f[0], max(x.f[1], max(x.f[2], x.f[3])));


  TIME_END;

  ShowArray(m_fResultArray);
}

//
// OnSseCpp -- make the conversion using C++ code with SSE intrinsics.
//
// Computes m_fResultArray[i] = sqrt(m_fInitialArray[i] * 2.8f), four floats
// per loop iteration, and stores the smallest and largest result in
// m_fMin and m_fMax. Both arrays are accessed through __m128 pointers.
//
// The conversion is timed with TIME_START/TIME_END, which prints the
// elapsed time; a sample of the result array is printed afterwards.
void OnSseCpp()
{
  // The loop handles exactly ARRAY_SIZE/4 groups of four floats; reject at
  // compile time a size that would leave a tail unprocessed.
  typedef char ARRAY_SIZE_must_be_a_positive_multiple_of_4
      [(ARRAY_SIZE >= 4 && ARRAY_SIZE % 4 == 0) ? 1 : -1];

  TIME_START;

  const __m128 coeff = _mm_set_ps1(2.8f);   // coeff[0, 1, 2, 3] = 2.8

  // Seed the running maximum with the most negative finite float.
  // FLT_MIN is the smallest *positive* float and is not a valid lower seed.
  __m128 min128 = _mm_set_ps1(FLT_MAX);     // min128[0, 1, 2, 3] = FLT_MAX
  __m128 max128 = _mm_set_ps1(-FLT_MAX);    // max128[0, 1, 2, 3] = -FLT_MAX

  const __m128* pSource = (const __m128*) m_fInitialArray;
  __m128* pDest = (__m128*) m_fResultArray;

  for ( int i = 0; i < ARRAY_SIZE/4; i++, pSource++, pDest++ )
  {
    // result = sqrt(*pSource * coeff), four lanes at once
    const __m128 result = _mm_sqrt_ps(_mm_mul_ps(*pSource, coeff));
    *pDest = result;

    min128 = _mm_min_ps(result, min128);
    max128 = _mm_max_ps(result, max128);
  }

  // Each of the four lanes holds the minimum/maximum of every fourth
  // element; reduce the lanes to a single value.
  union u
  {
    __m128 m;
    float f[4];
  } x;

  x.m = min128;
  m_fMin = min(x.f[0], min(x.f[1], min(x.f[2], x.f[3])));

  x.m = max128;
  m_fMax = max(x.f[0], max(x.f[1], max(x.f[2], x.f[3])));

  TIME_END;

  ShowArray(m_fResultArray);
}



// _tmain -- console entry point.
//
// Queries MMX/SSE support through TestFeatures. Without SSE a message is
// sent to the debug output and the program ends. Otherwise the input array
// is generated and the three conversion variants are run one after another,
// each one started by a key press on stdin. Always returns 0.
int _tmain(int argc, _TCHAR* argv[])
{
  bool bMMX, bSSE;
  TestFeatures(&bMMX, &bSSE);

  if ( bSSE )
  {
    odprintf("everything is ok...");

    // Prepare the input data.
    printf("program generate %d floating point(Not all data are displayed)...\n\n", ARRAY_SIZE);
    InitArray();

    // The conversion variants, in the order they are demonstrated.
    struct Step
    {
      const char* banner;     // text printed before the variant runs
      void (*run)();          // the variant itself
    };

    static const Step steps[] =
    {
      { "Make conversion using C++ code\n\n",                      OnCplusplus   },
      { "Make conversion using C++ code with inline Assembly\n\n", OnSseAssembly },
      { "Make conversion using C++ code with SSE Intrinsics\n\n",  OnSseCpp      },
    };

    for ( size_t k = 0; k < sizeof(steps) / sizeof(steps[0]); ++k )
    {
      getchar();                        // wait for the user before each step
      printf("%s", steps[k].banner);
      steps[k].run();
    }

    getchar();                          // keep the console open until a key press
  }
  else
  {
    // SSE is not supported.
    odprintf("Do not support SSE.\n");
  }

  return 0;
}