Click here to Skip to main content
15,888,816 members
Please Sign up or sign in to vote.
0.00/5 (No votes)
See more:
I have a speed test code like below, Use this test the serial process use time is 187ms and parallel use time is 5834ms, why parallel is so slow.

C++
#include <windows.h>
#include <ppl.h>
#include <vector>
using namespace concurrency;
using namespace std;

void DumpDataToFile(LPCTSTR lpszFile, LPVOID lpData, LONGLONG llDataLen, BOOL bAppend = FALSE);
static void calc(const double b[120], const double x[163970], double y[163970])
{
	int k, j;
	memset(&y[0], 0, 163970U * sizeof(double));
	for (k = 0; k < 120; k++) {
		for (j = k; j + 1 < 163971; j++) {
			y[j] += b[k] * x[j - k];
		}
	}
}

static void calc_ex(const double b[120], const double x[163970], double y[163970])
{
	int k, j;
	memset(&y[0], 0, 163970U * sizeof(double));
	for (k = 0; k < 120; k++) {
		parallel_for(int(k), 163970, [&](int j) {
			y[j] += b[k] * x[j - k];
		});
	}
}

template <class Function>
__int64 time_call(Function&& f)
{
	__int64 begin = GetTickCount();
	f();
	return GetTickCount() - begin;
}


int _tmain(int argc, _TCHAR* argv[])
{
	double *pb = new double[120];
	double *px = new double[163970];
	double *py = new double[163970];

	auto InitData = [&]() {
		for (int i = 0; i < 120; ++i) {
			pb[i] = i;
		}

		for (int i = 0; i < 163970; ++i) {
			px[i] = i;
		}
	};

	
	InitData();
	printf("serial time:%d\n", time_call([&](){		
		calc(pb, px, py);
	}));
	
	DumpDataToFile(_T("E:\\1.dat"), py, 163970 * sizeof(double));
	
	InitData();
	printf("parallel time:%d\n", time_call([&](){
		calc_ex(pb, px, py);
	}));
	DumpDataToFile(_T("E:\\2.dat"), py, 163970 * sizeof(double));

	delete[]pb;
	delete[]px;
	delete[]py;

	system("pause>nul");

	return 0;
}

void DumpDataToFile(LPCTSTR lpszFile, LPVOID lpData, LONGLONG llDataLen, BOOL bAppend)
{
	HANDLE hFile = CreateFile(lpszFile, GENERIC_WRITE, FILE_SHARE_READ, NULL, bAppend ? OPEN_ALWAYS : CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
	if (hFile == INVALID_HANDLE_VALUE) return;

	DWORD		dwWriteLen = 2 * 1024 * 1024;
	DWORD		dwWritten = 0;
	LONGLONG	llOffset = 0;

	if (llDataLen < (LONGLONG)dwWriteLen) {
		dwWriteLen = (DWORD)llDataLen;
	}

	if (bAppend) {
		SetFilePointer(hFile, 0, 0, FILE_END);
	}

	while (llDataLen>0)
	{
		if (!WriteFile(hFile, (byte*)lpData + llOffset, dwWriteLen, &dwWritten, NULL)) {
			printf("%d\n", GetLastError());
		}
		llOffset += dwWritten;
		llDataLen -= dwWritten;

		if (llDataLen < (LONGLONG)dwWriteLen) {
			dwWriteLen = (DWORD)llDataLen;
		}
	}

	CloseHandle(hFile);
}
Posted

1 solution

Because the parallel version is is taking nanoseconds to execute each iteration:
C#
y[j] += b[k] * x[j - k];
But has the overhead of setting up and scheduling a thread each time it has to execute one: and that means memory allocation, thread constriction, system scheduling or processes, and the tidy up time when it finishes.

It's like putting each individual word in a book in a separate chapter: it seems like it might make things easier, but it means the book is three foot thick!

Parallel processing is only useful when the task if long (so the setup overhead become irrelevant and there are sufficient space cores to execute the code on so that it can be parallelised properly.
In your case you are trying to set off 163970 separate processes - and there isn't a PC on the planet that can execute that many processes simultaneously!
 
Share this answer
 
Comments
Hongjun Ge 28-Jun-14 21:09pm    
Thanks for your answer. Do you have another way to parallel do this.

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)



CodeProject, 20 Bay Street, 11th Floor Toronto, Ontario, Canada M5J 2N8 +1 (416) 849-8900