Click here to Skip to main content
15,885,201 members
Articles / Desktop Programming / MFC

Fast SIMD Prototyping

Rate me:
Please Sign up or sign in to vote.
4.97/5 (51 votes)
30 Oct 2016 | Ms-PL | 9 min read | 74.3K | 1.4K | 99
Prototype SIMD vectorized code effortlessly.
#include <dvec.h>
#include <cassert>
#include <cstddef>
#include <new>

class UInt128Array
{
public:
	//! Constructor
	UInt128Array(int size32)
		:
		arr(NULL),
		size128(0),
		size32(0),
		remainder(0)
	{
		// divide by 4, to get the size in 128bit.
		size128 = (size32) >> 2;
		// calculate the remainder
		remainder = size32%4; 

#ifdef _DEBUG
		assert(remainder<4);
#endif // _DEBUG

		// if there is remainder, 
		// add 1 to the 128bit m_arraysize
		if(remainder!=0) 
			++size128;

		this->size32 = size32;

		size_t fullsize = (size128) * 16;

		arr = (Iu32vec4 *)(_aligned_malloc((size_t)(fullsize), 16));

		Iu32vec4 initialised(1,1,1,1);

		for(size_t i=0; i<size128; ++i)
		{
			arr[i] = initialised;
		}
	}

	//! Constructor
	UInt128Array(int size32, unsigned int initialize)
		:
		arr(NULL),
		size128(0),
		size32(0),
		remainder(0)
	{
		// divide by 4, to get the size in 128bit.
		size128 = (size32) >> 2;
		// calculate the remainder
		remainder = size32%4; 

#ifdef _DEBUG
		assert(remainder<4);
#endif // _DEBUG

		// if there is remainder, 
		// add 1 to the 128bit m_arraysize
		if(remainder!=0) 
			++size128;

		this->size32 = size32;

		size_t fullsize = (size128) * 16;

		arr = (Iu32vec4 *)(_aligned_malloc((size_t)(fullsize), 16));

		Iu32vec4 initialised(initialize,initialize,initialize,initialize);

		for(size_t i=0; i<size128; ++i)
		{
			arr[i] = initialised;
		}
	}

	//! Destructor
	~UInt128Array()
	{
		if(arr!=NULL)
		{
			_aligned_free(arr);
			arr=NULL;
		}
	}

	//! overloaded operator to return 32 bit integer
	unsigned int operator()(size_t i)
	{
#ifdef _DEBUG
		assert(i<size32);
#endif // _DEBUG

		int a = i >> 2; // divide by 4
		int b = a << 2; // multiply by 4 again

		int rem = i - b; // compute the remainder

		return arr[a][rem];
	}

	//! overloaded operator to return 128 bit vector
	Iu32vec4& operator[](size_t i)
	{
#ifdef _DEBUG
		assert(i<size128);
#endif // _DEBUG

		return arr[i];
	}

	void setUnused(int num)
	{
		if(size128==0||remainder==0)
			return;

		size_t unused = 4 - remainder;

		if(unused>=1)
			arr[size128-1][3] = num;
		if(unused>=2)
			arr[size128-1][2] = num;
		if(unused>=3)
			arr[size128-1][1] = num;
	}

	size_t GetSize128() { return size128; }
	size_t GetSize32() { return size32; }
	size_t GetRemainder() { return remainder; }

private:
	//! point to array of 128 bit elements
	Iu32vec4* arr;
	//! size of array in 128 bit chunks
	size_t size128;
	//! size of array in 32 bit chunks (might be (32 * x) < (128/4 * x)
	size_t size32;
	//! Remainder of 32 bit element at the last 128 bit element.
	size_t remainder;
};

//! Lane-wise multiplication of two vectors of four unsigned 32 bit integers.
//! Each result lane holds the low 32 bits of the corresponding product.
static inline Iu32vec4 operator * (Iu32vec4 const & a,
	Iu32vec4 const & b) {
		// _mm_mul_epu32 only multiplies lanes 0 and 2, so copy the odd
		// lanes (1 and 3) down into those positions first.
		const __m128i oddA = _mm_shuffle_epi32(a, 0xF5); // (-,a3,-,a1)
		const __m128i oddB = _mm_shuffle_epi32(b, 0xF5); // (-,b3,-,b1)

		// Two 64 bit products per call; the low halves are what we keep.
		const __m128i evenProducts = _mm_mul_epu32(a, b);       // (-,a2*b2,-,a0*b0)
		const __m128i oddProducts = _mm_mul_epu32(oddA, oddB);  // (-,a3*b3,-,a1*b1)

		// Interleave the low halves back into lane order.
		const __m128i lanes01 = _mm_unpacklo_epi32(evenProducts, oddProducts); // (-,-,a1*b1,a0*b0)
		const __m128i lanes23 = _mm_unpackhi_epi32(evenProducts, oddProducts); // (-,-,a3*b3,a2*b2)

		return _mm_unpacklo_epi64(lanes01, lanes23); // (ab3,ab2,ab1,ab0)
}

//! Lane-wise unsigned division a / b, computed one lane at a time with
//! scalar division. As with scalar division, every lane of b must be non-zero.
static inline Iu32vec4 operator / (Iu32vec4 const & a,
	Iu32vec4 const & b) {
		Iu32vec4 quotient;
		for (int lane = 0; lane < 4; ++lane) {
			quotient[lane] = a[lane] / b[lane];
		}
		return quotient;
}

//! Lane-wise unsigned remainder a % b, computed one lane at a time with
//! the scalar operator. As with the scalar operator, every lane of b must be non-zero.
static inline Iu32vec4 operator % (Iu32vec4 const & a,
	Iu32vec4 const & b) {
		Iu32vec4 rest;
		for (int lane = 0; lane < 4; ++lane) {
			rest[lane] = a[lane] % b[lane];
		}
		return rest;
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Microsoft Public License (Ms-PL)


Written By
Software Developer (Senior)
Singapore Singapore
Shao Voon is from Singapore. His interest lies primarily in computer graphics, software optimization, concurrency, security, and Agile methodologies.

In recent years, he shifted focus to software safety research. His hobby is writing a free C++ DirectX photo slideshow application which can be viewed here.

Comments and Discussions