With *nv3*'s suggested 'flat approach' using pointers, I obtained a somewhat surprising result (about a 3x speed improvement):

register int i, j, k;
LARGE_INTEGER t[4];
QueryPerformanceCounter(&t[0]);
for (i=0; i<128; i++)
{
for (j=0; j<5; j++)
{
for (k=0; k<9; k++)
m_tControlMatrix[i][j][k].cCtrlVal = 255;
}
}
QueryPerformanceCounter(&t[1]);
QueryPerformanceCounter(&t[2]);
register BYTE * p = &m_tControlMatrix[0][0][0].cCtrlVal;
register BYTE * q = p + 5760 * sizeof(typControlMatrix);
while (p < q)
{
*p = 255;
p += sizeof(typControlMatrix);
}
QueryPerformanceCounter(&t[3]);
CString s;
s.Format("3 loops: %I64d flat pointers: %I64d speed ratio %g ", (t[1].QuadPart-t[0].QuadPart), (t[3].QuadPart-t[2].QuadPart), ((double)(t[1].QuadPart-t[0].QuadPart))/(t[3].QuadPart-t[2].QuadPart));
MessageBox(s, "Test");

The output:

3 loops: 195327 flat pointers: 67743 speed ratio 2.88335