Inner Product Experiment: CPU, FPU vs. SSE*

Chesnokov Yuriy

Rate me:

3.57/5 (16 votes)

20 May 2008GPL32 min read

69.5K

824

The article demonstrating inner product operation performed with shorts, ints, floats and doubles with CPU/FPU and SSE for comparison.

inner_src.zip
- src
  - inner.cpp
  - inner.vcproj
  - ReadMe.txt
  - stdafx.cpp
  - stdafx.h
inner_demo.zip
- bin
  - inner.exe
conv_src.zip
- conv.cpp
- conv.vcproj
- ReadMe.txt
- stdafx.cpp
- stdafx.h
conv_demo.zip
- conv.exe

// inner.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"


LARGE_INTEGER m_nFreq;
LARGE_INTEGER m_nBeginTime;

void tic()
{
        QueryPerformanceFrequency(&m_nFreq);
        QueryPerformanceCounter(&m_nBeginTime);
}
__int64 toc()
{
        LARGE_INTEGER nEndTime;
        __int64 nCalcTime;

        QueryPerformanceCounter(&nEndTime);
        nCalcTime = (nEndTime.QuadPart - m_nBeginTime.QuadPart) * 1000 / m_nFreq.QuadPart;

        wprintf(L" processing time: %d ms\n", nCalcTime);
        return nCalcTime;
}

unsigned int size = 1000000;

float sse_inner(const float* a, const float* b, unsigned int size)
{
        float z = 0.0f, fres = 0.0f;
        __declspec(align(16)) float ftmp[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
        __m128 mres;

        if ((size / 4) != 0) {
                mres = _mm_load_ss(&z);
                for (unsigned int i = 0; i < size / 4; i++)
                        mres = _mm_add_ps(mres, _mm_mul_ps(_mm_loadu_ps(&a[4*i]), _mm_loadu_ps(&b[4*i])));

                //mres = a,b,c,d
                __m128 mv1 = _mm_movelh_ps(mres, mres);     //a,b,a,b
                __m128 mv2 = _mm_movehl_ps(mres, mres);     //c,d,c,d
                mres = _mm_add_ps(mv1, mv2);                //res[0],res[1]

                _mm_store_ps(ftmp, mres);                

                fres = ftmp[0] + ftmp[1];
        }

        if ((size % 4) != 0) {
                for (unsigned int i = size - size % 4; i < size; i++)
                        fres += a[i] * b[i];
        }

        return fres;
}

double sse_inner_d(const double* a, const double* b, unsigned int size)
{
        double z = 0.0, fres = 0.0;
        __declspec(align(16)) double ftmp[2] = { 0.0, 0.0 };
        __m128d mres;
        
        if ((size / 2) != 0) {
                mres = _mm_load_sd(&z);
                for (unsigned int i = 0; i < size / 2; i++)
                        mres = _mm_add_pd(mres, _mm_mul_pd(_mm_loadu_pd(&a[2*i]), _mm_loadu_pd(&b[2*i])));                

                _mm_store_pd(ftmp, mres);                

                fres = ftmp[0] + ftmp[1];
        }

        if ((size % 2) != 0) {
                for (unsigned int i = size - size % 2; i < size; i++)
                        fres += a[i] * b[i];
        }

        return fres;
}


float sse3_inner(const float* a, const float* b, unsigned int size)
{
        float z = 0.0f, fres = 0.0f;
        
        if ((size / 4) != 0) {
                const float* pa = a;
                const float* pb = b;
                __asm {
                        movss   xmm0, xmmword ptr[z]
                }
                for (unsigned int i = 0; i < size / 4; i++) {
                        __asm {
                                mov     eax, dword ptr[pa]
                                mov     ebx, dword ptr[pb]
                                movups  xmm1, [eax]
                                movups  xmm2, [ebx]
                                mulps   xmm1, xmm2
                                addps   xmm0, xmm1
                        }
                        pa += 4;
                        pb += 4;
                }  
                __asm {
                        haddps  xmm0, xmm0
                        haddps  xmm0, xmm0
                        movss   dword ptr[fres], xmm0                        
                }                
        }

        return fres;
}

short sse2_inner_s(const short* p1, const short* p2, unsigned int size)
{
        __m128i* mp1 = (__m128i *)p1;
        __m128i* mp2 = (__m128i *)p2;
        __m128i mres = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0);
        
        for(unsigned int i = 0; i < size/8; i++) {                 
                __m128i mtmp = _mm_mullo_epi16(_mm_loadu_si128(mp1), _mm_loadu_si128(mp2)); 
                mres = _mm_add_epi16(mres, mtmp);
                mp1++;
                mp2++;
        }

        short res[8];
        __m128i* pmres = (__m128i *)res;
        _mm_storeu_si128(pmres, mres);

        return res[0]+res[1]+res[2]+res[3]+res[4]+res[5]+res[6]+res[7];
}

void vchar()
{
        char* pv1;
        char* pv2;
        std::vector<char> v1;
        std::vector<char> v2;
        v1.resize(size);
        v2.resize(size);
        pv1 = &v1[0];
        pv2 = &v2[0];

        for (unsigned int i = 0; i < size; i++) {
                pv1[i] = char(rand() % 64 - 32);
                pv2[i] = char(rand() % 64 - 32);
        }

        char sum = 0;
        wprintf(L" chars\t\t");
        tic();
        for (unsigned int i = 0; i < size; i++)
                sum += pv1[i] * pv2[i];
        toc();

        wprintf(L" %d\n", sum);
}

void vshort()
{
        short* pv1;
        short* pv2;
        std::vector<short> v1;
        std::vector<short> v2;
        v1.resize(size);
        v2.resize(size);
        pv1 = &v1[0];
        pv2 = &v2[0];

        for (unsigned int i = 0; i < size; i++) {
                pv1[i] = short(rand() % 64 - 32);
                pv2[i] = short(rand() % 64 - 32);
        }

        short sum = 0;
        wprintf(L" shorts\t\t");
        tic();
        for (unsigned int i = 0; i < size; i++)
                sum += pv1[i] * pv2[i];
        toc();
        wprintf(L" %d\n", sum);

        wprintf(L" shorts sse2\t");
        tic();
        sum = sse2_inner_s(pv1, pv2, size);
        toc();
        wprintf(L" %d\n", sum);
}

void vint()
{
        int* pv1;
        int* pv2;
        std::vector<int> v1;
        std::vector<int> v2;
        v1.resize(size);
        v2.resize(size);
        pv1 = &v1[0];
        pv2 = &v2[0];

        for (unsigned int i = 0; i < size; i++) {
                pv1[i] = int(rand() % 64 - 32);
                pv2[i] = int(rand() % 64 - 32);
        }

        int sum = 0;
        wprintf(L" ints\t\t");
        tic();
        for (unsigned int i = 0; i < size; i++)
                sum += pv1[i] * pv2[i];
        toc();

        wprintf(L" %d\n", sum);
}

void vfloat()
{
        float* pv1;
        float* pv2;
        std::vector<float> v1;
        std::vector<float> v2;
        v1.resize(size);
        v2.resize(size);
        pv1 = &v1[0];
        pv2 = &v2[0];

        for (unsigned int i = 0; i < size; i++) {
                pv1[i] = float(rand() % 64 - 32);
                pv2[i] = float(rand() % 64 - 32);
        }

        float sum = 0;
        wprintf(L" floats\t\t");
        tic();
        for (unsigned int i = 0; i < size; i++)
                sum += pv1[i] * pv2[i];
        toc();
        wprintf(L" %d\n", (int)sum);

        wprintf(L" sse2 intrin\t");
        tic();
        sum = sse_inner(pv1, pv2, size);
        toc();
        wprintf(L" %d\n", (int)sum);

        wprintf(L" sse3 assembly\t");
        tic();
        sum = sse3_inner(pv1, pv2, size);
        toc();
        wprintf(L" %d\n", (int)sum);        
}

void vdouble()
{
        double* pv1;
        double* pv2;
        std::vector<double> v1;
        std::vector<double> v2;
        v1.resize(size);
        v2.resize(size);
        pv1 = &v1[0];
        pv2 = &v2[0];

        for (unsigned int i = 0; i < size; i++) {
                pv1[i] = double(rand() % 64 - 32);
                pv2[i] = double(rand() % 64 - 32);
        }

        double sum = 0;
        wprintf(L" doubles\t");
        tic();
        for (unsigned int i = 0; i < size; i++)
                sum += pv1[i] * pv2[i];
        toc();
        wprintf(L" %d\n", (int)sum);

        wprintf(L" doubles sse2\t");
        tic();
        sum = sse_inner_d(pv1, pv2, size);
        toc();
        wprintf(L" %d\n", (int)sum);

}

int _tmain(int argc, _TCHAR* argv[])
{
        if (argc >= 1 + 1)
                size = _wtoi(argv[1]);

        srand(time(0));
        
        vchar();
        __try {
                vshort();
        }
        __except(EXCEPTION_EXECUTE_HANDLER) {
                wprintf(L" seh failure!\n");
        }
        vint();
        __try {
                vfloat();
        }
        __except(EXCEPTION_EXECUTE_HANDLER) {
                wprintf(L" seh failure!\n");
        }
        __try {
                vdouble();
        }
        __except(EXCEPTION_EXECUTE_HANDLER) {
                wprintf(L" seh failure!\n");
        }

        return 0;
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The GNU General Public License (GPLv3)

Written By

Chesnokov Yuriy

Engineer

Russian Federation

Highly skilled Engineer with 14 years of experience in academia, R&D and commercial product development supporting full software life-cycle from idea to implementation and further support. During my academic career I was able to succeed in MIT Computers in Cardiology 2006 international challenge, as a R&D and SW engineer gain CodeProject MVP, find algorithmic solutions to quickly resolve tough customer problems to pass product requirements in tight deadlines. My key areas of expertise involve Object-Oriented
Analysis and Design OOAD, OOP, machine learning, natural language processing, face recognition, computer vision and image processing, wavelet analysis, digital signal processing in cardiology.

Inner Product Experiment: CPU, FPU vs. SSE*

License

Comments and Discussions