#include "stdafx.h"
/*
* -- SuperLU MT routine (version 1.0) --
* Univ. of California Berkeley, Xerox Palo Alto Research Center,
* and Lawrence Berkeley National Lab.
* August 15, 1997
*
*/
#ifndef _MSC_VER
#include <unistd.h>
#endif
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include "hnum_pcsp_defs.h"
#include "hnum_pdsp_defs.h"
#include "hnum_pssp_defs.h"
#include "hnum_pzsp_defs.h"
namespace harlinn
{
namespace numerics
{
namespace SuperLU
{
void superlu_abort_and_exit(char* msg)
{
fprintf(stderr, msg);
exit (-1);
}
void *superlu_malloc(size_t size)
{
void *buf;
buf = (void *) malloc(size);
return (buf);
}
void superlu_free(void *addr)
{
free (addr);
}
/* Deallocate the structure pointing to the actual storage of the matrix. */
void
Destroy_SuperMatrix_Store(SuperMatrix *A)
{
SUPERLU_FREE ( A->Store );
#if ( PRNTlevel==1 )
printf(".. Destroy_SuperMatrix_Store ...\n");
#endif
}
/* A is of type Stype==NC */
void
Destroy_CompCol_Matrix(SuperMatrix *A)
{
SUPERLU_FREE( ((NCformat *)A->Store)->rowind );
SUPERLU_FREE( ((NCformat *)A->Store)->colptr );
SUPERLU_FREE( ((NCformat *)A->Store)->nzval );
SUPERLU_FREE( A->Store );
#if ( PRNTlevel==1 )
printf(".. Destroy_CompCol_Matrix ...\n");
#endif
}
/* A is of type Stype==NCP */
void
Destroy_CompCol_Permuted(SuperMatrix *A)
{
SUPERLU_FREE ( ((NCPformat *)A->Store)->colbeg );
SUPERLU_FREE ( ((NCPformat *)A->Store)->colend );
SUPERLU_FREE ( A->Store );
#if ( PRNTlevel==1 )
printf(".. Destroy_CompCol_Permuted ...\n");
#endif
}
/* A is of type Stype==NCP */
void
Destroy_CompCol_NCP(SuperMatrix *A)
{
SUPERLU_FREE ( ((NCPformat *)A->Store)->nzval );
SUPERLU_FREE ( ((NCPformat *)A->Store)->rowind );
SUPERLU_FREE ( ((NCPformat *)A->Store)->colbeg );
SUPERLU_FREE ( ((NCPformat *)A->Store)->colend );
SUPERLU_FREE ( A->Store );
#if ( PRNTlevel==1 )
printf(".. Destroy_CompCol_NCP ...\n");
#endif
}
/* A is of type Stype==SC */
void
Destroy_SuperNode_Matrix(SuperMatrix *A)
{
SUPERLU_FREE ( ((SCformat *)A->Store)->rowind );
SUPERLU_FREE ( ((SCformat *)A->Store)->rowind_colptr );
SUPERLU_FREE ( ((SCformat *)A->Store)->nzval );
SUPERLU_FREE ( ((SCformat *)A->Store)->nzval_colptr );
SUPERLU_FREE ( ((SCformat *)A->Store)->col_to_sup );
SUPERLU_FREE ( ((SCformat *)A->Store)->sup_to_col );
SUPERLU_FREE( A->Store );
#if ( PRNTlevel==1 )
printf(".. Destroy_SuperNode_Matrix ...\n");
#endif
}
/* A is of type Stype==SCP */
void
Destroy_SuperNode_SCP(SuperMatrix *A)
{
SUPERLU_FREE ( ((SCPformat *)A->Store)->rowind );
SUPERLU_FREE ( ((SCPformat *)A->Store)->rowind_colbeg );
SUPERLU_FREE ( ((SCPformat *)A->Store)->rowind_colend );
SUPERLU_FREE ( ((SCPformat *)A->Store)->nzval );
SUPERLU_FREE ( ((SCPformat *)A->Store)->nzval_colbeg );
SUPERLU_FREE ( ((SCPformat *)A->Store)->nzval_colend );
SUPERLU_FREE ( ((SCPformat *)A->Store)->col_to_sup );
SUPERLU_FREE ( ((SCPformat *)A->Store)->sup_to_colbeg );
SUPERLU_FREE ( ((SCPformat *)A->Store)->sup_to_colend );
SUPERLU_FREE( A->Store );
#if ( PRNTlevel==1 )
printf(".. Destroy_SuperNode_SCP ...\n");
#endif
}
/*
* Reset repfnz[*] for the current column
*/
void pxgstrf_resetrep_col(const int nseg, const int *segrep, int *repfnz)
{
register int i, irep;
for (i = 0; i < nseg; ++i) {
irep = segrep[i];
repfnz[irep] = EMPTY;
}
}
/*
* Count the total number of nonzeros in factors L and U, and in the
* symmetrically reduced L.
*/
void countnz(const int n, int *xprune, int *nnzL, int *nnzU, GlobalLUBase_t *Glu)
{
register int nsuper, fsupc, i, j, nnzL0, jlen, irep;
register int nnzsup = 0;
register int *xsup, *xsup_end, *xlsub, *xlsub_end, *supno;
xsup = Glu->xsup;
xsup_end = Glu->xsup_end;
xlsub = Glu->xlsub;
xlsub_end = Glu->xlsub_end;
supno = Glu->supno;
*nnzU = Glu->nextu;
nnzL0 = 0;
*nnzL = 0;
nsuper = supno[n];
if ( n <= 0 ) return;
/*
* For each supernode ...
*/
for (i = 0; i <= nsuper; i++) {
fsupc = xsup[i];
jlen = xlsub_end[fsupc] - xlsub[fsupc];
nnzsup += jlen * (xsup_end[i] - fsupc);
for (j = fsupc; j < xsup_end[i]; j++) {
*nnzL += jlen;
*nnzU += j - fsupc + 1;
jlen--;
}
irep = SUPER_REP(i);
if ( SINGLETON(supno[irep]) )
nnzL0 += xprune[irep] - xlsub_end[irep];
else
nnzL0 += xprune[irep] - xlsub[irep];
}
#if ( PRNTlevel==1 )
printf(".. # supernodes = %d\n", nsuper+1);
printf(".. # edges in symm-reduced L = %d\n", nnzL0);
if ( Glu->dynamic_snode_bound )
printf(".. # NZ in LUSUP %d, dynamic bound %d, utilization %.2f\n",
nnzsup, Glu->nextlu, (float)nnzsup/Glu->nextlu);
else
printf(".. # NNZ in LUSUP %d, static bound %d, utilization %.2f\n",
nnzsup, Glu->nzlumax, (float)nnzsup/Glu->nzlumax);
#endif
}
/*
* Fix up the data storage lsub for L-subscripts. It reclaims the
* storage for the adjancency lists of the pruned graph, and applies
* row permuation to the row subscripts of matrix $L$.
*/
void fixupL(const int n, const int *perm_r, GlobalLUBase_t *Glu)
{
register int nsuper, fsupc, nextl, i, j, jstrt;
register int *xsup, *xsup_end, *lsub, *xlsub, *xlsub_end;
if ( n <= 1 ) return;
xsup = Glu->xsup;
xsup_end = Glu->xsup_end;
lsub = Glu->lsub;
xlsub = Glu->xlsub;
xlsub_end = Glu->xlsub_end;
nsuper = Glu->supno[n];
nextl = 0;
/*
* For each supernode ...
*/
for (i = 0; i <= nsuper; i++) {
fsupc = xsup[i];
jstrt = xlsub[fsupc];
xlsub[fsupc] = nextl;
for (j = jstrt; j < xlsub_end[fsupc]; j++) {
lsub[nextl] = perm_r[lsub[j]]; /* Now indexed into P*A */
nextl++;
}
xlsub_end[fsupc] = nextl;
}
xlsub[n] = nextl;
#if ( PRNTlevel==1 )
printf(".. # edges in supernodal graph of L = %d\n", nextl);
fflush(stdout);
#endif
}
/*
* Print all definitions to be used by CPP.
*/
int cpp_defs()
{
printf("CPP Defs:\n");
#ifdef PRNTlevel
printf("\tPRNTlevel=%d\n", PRNTlevel);
#endif
#ifdef DEBUGlevel
printf("\tDEBUGlevel=%d\n", DEBUGlevel);
#endif
#ifdef PROFILE
printf("\tPROFILE\n");
#endif
#ifdef PREDICT_OPT
printf("\tPREDICT_OPT\n");
#endif
#ifdef USE_VENDOR_BLAS
printf("\tUSE_VENDOR_BLAS\n");
#endif
#ifdef GEMV2
printf("\tGEMV2\n");
#endif
#ifdef SCATTER_FOUND
printf("\tSCATTER_FOUND\n");
#endif
return 0;
}
/*
* Compress the data storage LUSUP[*] for supernodes. It removes the
* memory holes due to untightness of the upper bounds by A'A-supernode.
*/
namespace Complex
{
void compressSUP(const int n, GlobalLU_t *Glu)
{
register int nextlu, i, j, jstrt;
int *xlusup, *xlusup_end;
complex *lusup;
if ( n <= 1 ) return;
lusup = Glu->lusup;
xlusup = Glu->xlusup;
xlusup_end= Glu->xlusup_end;
nextlu = 0;
for (j = 0; j < n; ++j) {
jstrt = xlusup[j];
xlusup[j] = nextlu;
for (i = jstrt; i < xlusup_end[j]; ++i, ++nextlu)
lusup[nextlu] = lusup[i];
xlusup_end[j] = nextlu;
}
xlusup[n] = nextlu;
printf("\tcompressSUP() nextlu %d\n", nextlu);
}
}
namespace Double
{
void compressSUP(const int n, GlobalLU_t *Glu)
{
register int nextlu, i, j, jstrt;
int *xlusup, *xlusup_end;
double *lusup;
if ( n <= 1 ) return;
lusup = Glu->lusup;
xlusup = Glu->xlusup;
xlusup_end= Glu->xlusup_end;
nextlu = 0;
for (j = 0; j < n; ++j) {
jstrt = xlusup[j];
xlusup[j] = nextlu;
for (i = jstrt; i < xlusup_end[j]; ++i, ++nextlu)
lusup[nextlu] = lusup[i];
xlusup_end[j] = nextlu;
}
xlusup[n] = nextlu;
printf("\tcompressSUP() nextlu %d\n", nextlu);
}
}
namespace DoubleComplex
{
void compressSUP(const int n, GlobalLU_t *Glu)
{
register int nextlu, i, j, jstrt;
int *xlusup, *xlusup_end;
doublecomplex *lusup;
if ( n <= 1 ) return;
lusup = Glu->lusup;
xlusup = Glu->xlusup;
xlusup_end= Glu->xlusup_end;
nextlu = 0;
for (j = 0; j < n; ++j) {
jstrt = xlusup[j];
xlusup[j] = nextlu;
for (i = jstrt; i < xlusup_end[j]; ++i, ++nextlu)
lusup[nextlu] = lusup[i];
xlusup_end[j] = nextlu;
}
xlusup[n] = nextlu;
printf("\tcompressSUP() nextlu %d\n", nextlu);
}
}
namespace Single
{
void compressSUP(const int n, GlobalLU_t *Glu)
{
register int nextlu, i, j, jstrt;
int *xlusup, *xlusup_end;
float *lusup;
if ( n <= 1 ) return;
lusup = Glu->lusup;
xlusup = Glu->xlusup;
xlusup_end= Glu->xlusup_end;
nextlu = 0;
for (j = 0; j < n; ++j) {
jstrt = xlusup[j];
xlusup[j] = nextlu;
for (i = jstrt; i < xlusup_end[j]; ++i, ++nextlu)
lusup[nextlu] = lusup[i];
xlusup_end[j] = nextlu;
}
xlusup[n] = nextlu;
printf("\tcompressSUP() nextlu %d\n", nextlu);
}
}
int check_mem_leak(char *where)
{
#ifndef _MSC_VER
void *addr;
addr = (void *)sbrk(0);
printf("\tsbrk(0) %s: addr = %u\n", where, addr);
#endif
return 0;
}
/*
* Diagnostic print of segment info after pdgstrf_panel_dfs().
*/
void print_panel_seg(int n, int w, int jcol, int nseg,
int *segrep, int *repfnz)
{
int j, k;
for (j = jcol; j < jcol+w; j++) {
printf("\tcol %d:\n", j);
for (k = 0; k < nseg; k++)
printf("\t\tseg %d, segrep %d, repfnz %d\n", k,
segrep[k], repfnz[(j-jcol)*n + segrep[k]]);
}
}
/*
* Allocate storage for various statistics.
*/
void
StatAlloc(const int n, const int nprocs, const int panel_size,
const int relax, Gstat_t *Gstat)
{
register int w;
w = SUPERLU_MAX( panel_size, relax ) + 1;
Gstat->panel_histo = intCalloc(w);
Gstat->utime = (double *) Double::doubleMalloc(int(PhaseType::NPHASES));
Gstat->ops = (flops_t *) SUPERLU_MALLOC(int(PhaseType::NPHASES) * sizeof(flops_t));
if ( !(Gstat->procstat =
(procstat_t *) SUPERLU_MALLOC(nprocs*sizeof(procstat_t))) )
SUPERLU_ABORT( "SUPERLU_MALLOC failed for procstat[]" );
#if (PRNTlevel==1)
printf(".. StatAlloc(): n %d, nprocs %d, panel_size %d, relax %d\n",
n, nprocs, panel_size, relax);
#endif
#ifdef PROFILE
if ( !(Gstat->panstat =
(panstat_t*) SUPERLU_MALLOC(n * sizeof(panstat_t))) )
SUPERLU_ABORT( "SUPERLU_MALLOC failed for panstat[]" );
Gstat->panhows = intCalloc(3);
Gstat->height = intCalloc(n+1);
if ( !(Gstat->flops_by_height =
(float *) SUPERLU_MALLOC(n * sizeof(float))) )
SUPERLU_ABORT("SUPERLU_MALLOC failed for flops_by_height[]");
#endif
#ifdef PREDICT_OPT
if ( !(cp_panel = (cp_panel_t *) SUPERLU_MALLOC(n * sizeof(cp_panel_t))) )
SUPERLU_ABORT( "SUPERLU_MALLOC failed for cp_panel[]" );
if ( !(desc_eft = (desc_eft_t *) SUPERLU_MALLOC(n * sizeof(desc_eft_t))) )
SUPERLU_ABORT( "SUPERLU_MALLOC failed for desc_eft[]" );
cp_firstkid = intMalloc(n+1);
cp_nextkid = intMalloc(n+1);
#endif
}
/*
* Initialize various statistics variables.
*/
void
StatInit(const int n, const int nprocs, Gstat_t *Gstat)
{
register int i;
for (i = 0; i < int(PhaseType::NPHASES); ++i) {
Gstat->utime[i] = 0;
Gstat->ops[i] = 0;
}
for (i = 0; i < nprocs; ++i) {
Gstat->procstat[i].panels = 0;
Gstat->procstat[i].fcops = 0.0;
Gstat->procstat[i].skedwaits = 0;
Gstat->procstat[i].skedtime = 0.0;
Gstat->procstat[i].cs_time = 0.0;
Gstat->procstat[i].spintime = 0.0;
Gstat->procstat[i].pruned = 0;
Gstat->procstat[i].unpruned = 0;
}
#ifdef PROFILE
for (i = 0; i < n; ++i) {
Gstat->panstat[i].fctime = 0.0;
Gstat->panstat[i].flopcnt = 0.0;
Gstat->panstat[i].pipewaits = 0;
Gstat->panstat[i].spintime = 0.0;
Gstat->flops_by_height[i] = 0.0;
}
for (i = 0; i < 3; ++i) Gstat->panhows[i] = 0;
Gstat->dom_flopcnt = 0.;
Gstat->flops_last_P_panels = 0;
#endif
#ifdef PREDICT_OPT
for (i = 0; i < n; ++i)
cp_panel[i].est = cp_panel[i].pdiv = 0;
#endif
#if ( PRNTlevel==1 )
printf(".. StatInit(): n %d, nprocs %d\n", n, nprocs);
#endif
}
/* Print timings used in factorization and solve. */
void
PrintStat(Gstat_t *Gstat)
{
double *utime;
flops_t *ops;
utime = Gstat->utime;
ops = Gstat->ops;
printf("Factor time = %8.2f\n", utime[int(PhaseType::FACT)]);
if ( utime[int(PhaseType::FACT)] != 0.0 )
printf("Factor flops = %e\tMflops = %8.2f\n", ops[int(PhaseType::FACT)],
ops[int(PhaseType::FACT)]*1e-6/utime[int(PhaseType::FACT)]);
printf("Solve time = %8.2f\n", utime[int(PhaseType::SOLVE)]);
if ( utime[int(PhaseType::SOLVE)] != 0.0 )
printf("Solve flops = %e\tMflops = %8.2f\n", ops[int(PhaseType::SOLVE)],
ops[int(PhaseType::SOLVE)]*1e-6/utime[int(PhaseType::SOLVE)]);
}
void
StatFree(Gstat_t *Gstat)
{
SUPERLU_FREE (Gstat->panel_histo);
SUPERLU_FREE (Gstat->utime);
SUPERLU_FREE (Gstat->ops);
SUPERLU_FREE (Gstat->procstat);
#ifdef PROFILE
SUPERLU_FREE (Gstat->panstat);
SUPERLU_FREE (Gstat->panhows);
SUPERLU_FREE (Gstat->height);
SUPERLU_FREE (Gstat->flops_by_height);
#endif
#ifdef PREDICT_OPT
SUPERLU_FREE (Gstat->cp_panel);
SUPERLU_FREE (Gstat->desc_eft);
SUPERLU_FREE (Gstat->cp_firstkid);
SUPERLU_FREE (Gstat->cp_nextkid);
#endif
#if (PRNTlevel==1)
printf(".. StatFree(): Free Stat variables.\n");
#endif
}
flops_t
LUFactFlops(Gstat_t *Gstat)
{
return (Gstat->ops[int(PhaseType::FACT)]);
}
flops_t
LUSolveFlops(Gstat_t *Gstat)
{
return (Gstat->ops[int(PhaseType::SOLVE)]);
}
/*
* Fills an integer array with a given value.
*/
void ifill(int *a, int alen, int ival)
{
register int i;
for (i = 0; i < alen; i++) a[i] = ival;
}
/*
* Get the statistics of the supernodes
*/
#define NBUCKS 10
static int max_sup_size;
void super_stats(int nsuper, int *xsup, int *xsup_end)
{
register int nsup1 = 0;
int i, isize, whichb, bl, bh;
int bucket[NBUCKS];
max_sup_size = 0;
/* Histogram of the supernode sizes */
ifill (bucket, NBUCKS, 0);
for (i = 0; i <= nsuper; i++) {
isize = xsup_end[i] - xsup[i];
if ( isize == 1 ) nsup1++;
if ( max_sup_size < isize ) max_sup_size = isize;
whichb = (float) isize / max_sup_size * NBUCKS;
if (whichb >= NBUCKS) whichb = NBUCKS - 1;
bucket[whichb]++;
}
printf("** Supernode statistics:\n\tno of supernodes = %d\n", nsuper+1);
printf("\tmax supernode size = %d\n", max_sup_size);
printf("\tno of size 1 supernodes = %d\n", nsup1);
printf("\tHistogram of supernode size:\n");
for (i = 0; i < NBUCKS; i++) {
bl = (float) i * max_sup_size / NBUCKS;
bh = (float) (i+1) * max_sup_size / NBUCKS;
printf("\t%3d-%3d\t\t%d\n", bl+1, bh, bucket[i]);
}
}
void panel_stats(int n, int max_w, int* in_domain, Gstat_t *Gstat)
{
register int i, w;
float *histo_flops, total;
histo_flops = (float *) SUPERLU_MALLOC( max_w * sizeof(float) );
for (i = 0; i < max_w; ++i) histo_flops[i] = 0;
total = 0;
for (i = 0; i < n; i += w) {
w = Gstat->panstat[i].size;
if ( in_domain[i] != TREE_DOMAIN ) {
histo_flops[w - 1] += Gstat->panstat[i].flopcnt;
total += Gstat->panstat[i].flopcnt;
}
}
if ( total != 0.0 ) {
printf("** Panel & flops distribution: nondomain flopcnt %e\n", total);
for (i = 1; i <= max_w; i++)
printf("\t%d\t%d\t%e (%.2f)\n", i, Gstat->panel_histo[i],
histo_flops[i-1], histo_flops[i-1]/total);
}
SUPERLU_FREE (histo_flops);
}
float SpaSize(int n, int np, float sum_npw)
{
return (sum_npw*8 + np*8 + n*4)/1024.;
}
float DenseSize(int n, float sum_nw)
{
return (sum_nw*8 + n*8)/1024.;;
}
/*
* Check whether repfnz[] == EMPTY after reset.
*/
void check_repfnz(int n, int w, int jcol, int *repfnz)
{
int jj, k;
for (jj = jcol; jj < jcol+w; jj++)
for (k = 0; k < n; k++)
if ( repfnz[(jj-jcol)*n + k] != EMPTY ) {
fprintf(stderr, "col %d, repfnz_col[%d] = %d\n", jj,
k, repfnz[(jj-jcol)*n + k]);
SUPERLU_ABORT("repfnz[] not empty.");
}
}
int PrintInt10(char *name, int len, int *x)
{
register int i;
printf("(len=%d) %s:", len, name);
for (i = 0; i < len; ++i) {
if ( i % 10 == 0 ) printf("\n[%4d-%4d]", i, i+9);
printf("%6d", x[i]);
}
printf("\n");
return 0;
}
/* Print a summary of the testing results. */
void
PrintSumm(char *type, int nfail, int nrun, int nerrs)
{
if ( nfail > 0 )
printf("%3s driver: %d out of %d tests failed to pass the threshold\n",
type, nfail, nrun);
else
printf("All tests for %3s driver passed the threshold (%6d tests run)\n", type, nrun);
if ( nerrs > 0 )
printf("%6d error messages recorded\n", nerrs);
}
/* Print the adjacency list for graph of L, including the pruned graph,
graph of U, and L supernodes partition */
int PrintGLGU(int n, int *xprune, GlobalLUBase_t *Glu)
{
register int nsuper = Glu->nsuper;
PrintInt10("LSUB", Glu->xlsub_end[n-1], Glu->lsub);
PrintInt10("XLSUB", n, Glu->xlsub);
PrintInt10("XLSUB_END", n, Glu->xlsub_end);
PrintInt10("XPRUNE", n, xprune);
PrintInt10("USUB", Glu->xusub_end[n-1], Glu->usub);
PrintInt10("XUSUB", n, Glu->xusub);
PrintInt10("XUSUB_END", n, Glu->xusub_end);
PrintInt10("SUPNO", n, Glu->supno);
PrintInt10("XSUP", nsuper+1, Glu->xsup);
PrintInt10("XSUP_END", nsuper+1, Glu->xsup_end);
return 0;
}
#if 0
/*
* Print the statistics of the relaxed snodes for matlab process
*/
void relax_stats(int start, int end, int step)
{
FILE *fp;
int i;
fp = fopen("relax.m", "w");
fprintf(fp,"relax = [\n");
for (i = start; i <= end; i += step) fprintf(fp, "%d ", i);
fprintf(fp, "];\n");
fprintf(fp, "fctime = [\n");
for (i = start; i <= end; i += step)
fprintf(fp, "%15.8e\n ", stat_relax[i].fctime);
fprintf(fp, "];\n");
fprintf(fp, "mflops = [\n");
for (i = start; i <= end; i += step)
fprintf(fp, "%15.8e\n ", (float)stat_relax[i].flops / 1e6);
fprintf(fp, "];\n");
fprintf(fp, "mnzs = [\n");
for (i = start; i <= end; i += step)
fprintf(fp, "%15.8e\n ", stat_relax[i].nzs / 1e6);
fprintf(fp, "];\n");
fclose(fp);
}
/*
* Obtain the distribution of time/flops/nzs on the snode size.
*/
void snode_profile(int nsuper, int *xsup)
{
FILE *fp;
int i, j;
int ssize;
if ( !(stat_snode = (stat_snode_t *) SUPERLU_MALLOC((max_sup_size+1) *
sizeof(stat_snode_t))) ) ABORT("SUPERLU_MALLOC fails for stat_snode[].");
for (i = 0; i <= max_sup_size; i++) {
stat_snode[i].ncols = 0;
stat_snode[i].flops = 0;
stat_snode[i].nzs = 0;
stat_snode[i].fctime = 0.0;
}
for (i = 0; i <= nsuper; i++) {
ssize = xsup[i+1] - xsup[i];
stat_snode[ssize].ncols += ssize;
for (j=xsup[i]; j<xsup[i+1]; j++) {
stat_snode[ssize].flops += stat_col[j].flops;
stat_snode[ssize].nzs += stat_col[j].nzs;
stat_snode[ssize].fctime += stat_col[j].fctime;
}
}
fp = fopen("snode.m", "w");
fprintf(fp, "max_sup_size = %d;\n", max_sup_size);
fprintf(fp,"ncols = [");
for (i = 1; i <= max_sup_size; i++)
fprintf(fp, "%d ", stat_snode[i].ncols);
fprintf(fp, "];\n");
fprintf(fp, "fctime = [");
for (i = 1; i <= max_sup_size; i++)
fprintf(fp, "%15.8e\n", stat_snode[i].fctime);
fprintf(fp, "];\n");
fprintf(fp, "mflops = [");
for (i = 1; i <= max_sup_size; i++)
fprintf(fp, "%15.8e\n", (float) stat_snode[i].flops / 1e6);
fprintf(fp, "];\n");
fprintf(fp, "mnzs = [");
for (i = 1; i <= max_sup_size; i++)
fprintf(fp, "%15.8e\n", (float) stat_snode[i].nzs / 1e6);
fprintf(fp, "];\n");
fclose(fp);
SUPERLU_FREE (stat_snode);
}
#endif
int print_int_vec(char *what, int n, int *vec)
{
int i;
printf("%s\n", what);
for (i = 0; i < n; ++i) printf("%d\t%d\n", i, vec[i]);
return 0;
}
/*
* Print the parallel execution statistics.
*/
int ParallelProfile(const int n, const int supers, const int panels,
const int procs, Gstat_t *Gstat)
{
register int i, imax, pruned, unpruned, waits, itemp, cs_numbers;
register float loadmax, loadtot, temp, thresh, loadprint;
register float waittime, cs_time;
double *utime = Gstat->utime;
procstat_t *pstat;
panstat_t *pan;
void print_flops_by_height(int, panstat_t *, int *, float *);
printf("\n---- Parallel Profile Per Processor ----\n");
printf("%4s%16s%8s%10s%10s%10s%10s%8s\n", "proc", "factops",
"seconds", "skedwaits", "skedtime", "CS-time",
"spin-time", "[%tot]");
for (i = 0; i < procs; ++i) {
pstat = &(Gstat->procstat[i]);
if ( pstat->fctime != 0 ) {
temp = pstat->spintime/pstat->fctime*100.;
printf("%4d%16e%8.2f%10d%10.3f%10.3f%10.3f%8.1f\n",
i, pstat->fcops, pstat->fctime, pstat->skedwaits,
pstat->skedtime, pstat->cs_time, pstat->spintime, temp);
}
}
printf("%4s%8s%12s%14s\n",
"proc", "#panels", "dfs_pruned","dfs_unpruned");
pruned = unpruned = 0;
cs_time = 0.0;
for (i = 0; i < procs; ++i) {
pstat = &(Gstat->procstat[i]);
printf("%4d%8d%12d%14d\n", i, pstat->panels,
pstat->pruned, pstat->unpruned);
pruned += Gstat->procstat[i].pruned;
unpruned += Gstat->procstat[i].unpruned;
cs_time += Gstat->procstat[i].cs_time;
}
temp = pruned + unpruned;
if ( temp != 0 ) {
printf("%12s%26s\n", "", "--------------------");
printf("%12s%12d%14d%14.0f\n", "total", pruned, unpruned, temp);
printf("%12s%12.2f%14.2f\n", "frac.", pruned/temp, unpruned/temp);
}
printf("%16s%16d\n", "piped-panels", Gstat->panhows[PIPE]);
printf("%16s%16d\n", "nonpiped-DADs", Gstat->panhows[DADPAN]);
printf("%16s%16d\n", "nonpiped-panels", Gstat->panhows[NOPIPE]);
/* work load distribution */
loadmax = loadtot = Gstat->procstat[0].fcops;
imax = 0;
for (i = 1; i < procs; ++i) {
temp = Gstat->procstat[i].fcops;
loadtot += temp;
if ( temp > loadmax ) {
loadmax = temp;
imax = i;
}
}
printf("%25s%8.2f\n", "Load balance [mean/max]", loadtot/loadmax/procs);
/* Delays due to pipelining. */
waits = waittime = 0;
for (i = 0; i < n; i += Gstat->panstat[i].size) { /* add up all panels */
waits += Gstat->panstat[i].pipewaits;
waittime += Gstat->panstat[i].spintime;
}
printf("%25s%8d,\tper-panel %.1f\n", "total #delays in pipeline",
waits, (float)waits/panels);
temp = waittime / procs;
printf("%25s%8.2f\t[%.1f%]\n", "mean spin time per-proc",
temp, temp/utime[int(PhaseType::FACT)]*100);
/* Delays due to scheduling. */
waits = waittime = 0;
for (i = 0; i < procs; ++i) {
waits += Gstat->procstat[i].skedwaits;
waittime += Gstat->procstat[i].skedtime;
}
printf("%25s%8d\n", "total #delays in schedule", waits);
temp = waittime / procs;
printf("%25s%8.2f\t[%.1f%]\n", "mean sched. time per-proc",
temp, temp/utime[int(PhaseType::FACT)]*100);
/* estimated overhead in spin-locks */
#if ( MACH==CRAY_PVP ) /* measured for mutex lock/unlock on 4 cpus */
#define TMUTEX 4.42e-6
#define FLOPS_PER_LOCK 221
#elif ( MACH==SUN )
#define TMUTEX 4.36e-6
#define FLOPS_PER_LOCK 109
#elif ( MACH==SGI || MACH==ORIGIN )
#define TMUTEX 2.02e-6
#define FLOPS_PER_LOCK 364
#elif ( MACH==DEC || PTHREAD )
#define TMUTEX 2.71e-6
#define FLOPS_PER_LOCK 407
#endif
cs_numbers = n + 3*supers + panels + procs;
itemp = cs_numbers * FLOPS_PER_LOCK; /* translated to flops */
temp = cs_numbers * TMUTEX;
printf("mutex-lock overhead (est.) %8.2f, #locks %d, equiv. flops %e\n",
temp, cs_numbers, (float) itemp);
printf("time in critical section %8.2f\t[%.1f%]\n",
cs_time/procs, cs_time/procs/utime[int(PhaseType::FACT)]*100);
printf("\n---- Parallel Profile Per Panel ----\n");
printf("%8s%8s%16s%8s%8s%12s%8s\n", "panel", "height",
"factops", "[tot%]", "msec", "spin(msec)", "Mflops");
thresh = 0.005 * loadtot;
loadprint = 0;
itemp = 0;
for (i = 0; i < n; i += Gstat->panstat[i].size) {
pan = &(Gstat->panstat[i]);
if ( pan->flopcnt > thresh ) {
loadprint += pan->flopcnt;
++itemp;
if ( pan->fctime != 0 ) temp = pan->flopcnt/pan->fctime*1e-6;
printf("%4d%4d%8d%16e%8.1f%8.2f%12.2f%8.2f\n", i, pan->size,
Gstat->height[i], pan->flopcnt, pan->flopcnt/loadtot*100.,
pan->fctime*1e3, pan->spintime*1e3, temp);
}
}
printf("Total panels %d, height(T) %d, height(T)/n= %.4f\n",
panels, Gstat->height[n], (float)Gstat->height[n]/n);
printf("Printed flops %e [%.1f], printed panels %d [%.1f]\n",
loadprint, loadprint/loadtot*100.,
itemp, (float)itemp/panels);
/* print_flops_by_height(n, panstat, height, flops_by_height);*/
printf("---- End ParallelProfile().\n\n");
fflush(stdout);
return 0;
}
/*
* Print the distribution of flops by the height of etree.
*/
void
print_flops_by_height(int n, panstat_t *panstat,
int *height, float *flops_by_height)
{
register int i, w, ht;
register float flops;
for (i = 0; i < n; i += w) {
w = panstat[i].size;
ht = height[i];
flops_by_height[ht] += panstat[i].flopcnt;
}
printf("\n%8s\t%8s\n", "height", "flops");
ht = height[n-1]; /* root */
for (i = 0; i <= ht; ++i) {
flops = flops_by_height[i];
if ( flops != 0.0 )
printf("%8d\t%e\n", i, flops);
}
}
/*
* Print the analysis of the optimal runtime.
*/
int
CPprofile(const int n, cp_panel_t *cp_panel, pxgstrf_shared_Base_t *pxgstrf_shared)
{
Gstat_t *Gstat = pxgstrf_shared->Gstat;
register int maxpan, i, j, treecnt;
register float eft, maxeft; /* earliest (possible) finish time */
flops_t *ops;
/* Find the longest (weighted) path in the elimination forest. */
treecnt = 0;
maxeft = 0;
for (i = Gstat->cp_firstkid[n]; i != EMPTY; i = Gstat->cp_nextkid[i]) {
/* printf("Root %d, height %d\n", i, height[i]);*/
j = (pxgstrf_shared->pan_status[i].size > 0) ?
i : (i + pxgstrf_shared->pan_status[i].size);
eft = cp_panel[j].est + cp_panel[j].pdiv;
if ( eft > maxeft ) {
maxeft = eft;
maxpan = j;
}
++treecnt;
}
ops = Gstat->ops;
printf("\n** Runtime prediction model: #trees %d\n", treecnt);
printf("Last panel %d, seq-time %e, EFT %e, ideal-speedup %.2f\n",
maxpan, ops[int(PhaseType::FACT)], maxeft, ops[int(PhaseType::FACT)]/maxeft);
#if ( DEBUGlevel>=2 )
printf("last panel %d\n", maxpan);
for (i = 0; i < n; i += pxgstrf_shared->pan_status[i].size)
printf("%6d %8s%e\t%8s%8.0f\n", i, "est ", cp_panel[i].est,
"pdiv ", cp_panel[i].pdiv);
#endif
return 0;
}
/***************************************************************
* Utilities to print the supermatrix.
***************************************************************/
#define PERLINE 10
#define FMT "%7.4f "
/* A is of type Stype==SCP */
void
Print_SuperNode_SCP(SuperMatrix *A)
{
int i, j, c;
int n = A->ncol;
SCPformat *Astore = (SCPformat *)A->Store;
double *nzval = (double *)Astore->nzval;
int *colbeg = Astore->nzval_colbeg, *colend = Astore->nzval_colend;
printf("SuperNode_SCP: nnz %d, nsuper %d\n", Astore->nnz, Astore->nsuper);
printf("valL=[\n");
for (c = 0, j = 0; j < n; ++j) {
for (i = colbeg[j]; i < colend[j]; ++i) {
if (c == PERLINE) { printf("\n"); c = 0; }
printf(FMT, nzval[i]);
++c;
}
}
printf("];\n");
fflush(stdout);
/* SUPERLU_FREE ( ((SCPformat *)A->Store)->rowind );
SUPERLU_FREE ( ((SCPformat *)A->Store)->rowind_colbeg );
SUPERLU_FREE ( ((SCPformat *)A->Store)->rowind_colend );
SUPERLU_FREE ( ((SCPformat *)A->Store)->col_to_sup );
SUPERLU_FREE ( ((SCPformat *)A->Store)->sup_to_colbeg );
SUPERLU_FREE ( ((SCPformat *)A->Store)->sup_to_colend );*/
}
/* A is of type NCP */
void
Print_CompCol_NC(SuperMatrix *A)
{
int i, j, c;
int n = A->ncol;
NCformat *Astore = (NCformat *)A->Store;
double *nzval = (double *)Astore->nzval;
int *colptr = Astore->colptr;
printf("CompCol_NC: nnz %d\n", Astore->nnz);
printf("valA=[\n");
for (c = 0, j = 0; j < n; ++j) {
for (i = colptr[j]; i < colptr[j+1]; ++i, ++c) {
if (c == PERLINE) { printf("\n"); c = 0; }
printf(FMT, nzval[i]);
}
}
printf("];\n");
fflush(stdout);
}
/* A is of type NCP */
void
Print_CompCol_NCP(SuperMatrix *A)
{
int i, j, c;
int n = A->ncol;
NCPformat *Astore = (NCPformat *)A->Store;
double *nzval = (double *)Astore->nzval;
int *colbeg = Astore->colbeg, *colend = Astore->colend;
printf("SuperNode_NCP: nnz %d\n", Astore->nnz);
printf("nzval[U]\n");
for (c = 0, j = 0; j < n; ++j) {
for (i = colbeg[j]; i < colend[j]; ++i, ++c) {
if (c == PERLINE) { printf("\n"); c = 0; }
printf(FMT, nzval[i]);
}
}
printf("\n");
fflush(stdout);
}
/* A is of type DN */
void
Print_Dense(SuperMatrix *A)
{
int i, j, c;
int m = A->nrow, n = A->ncol;
DNformat *Astore = (DNformat *)A->Store;
int lda = Astore->lda;
double *nzval = (double *)Astore->nzval;
printf("Dense: lda %d\n", lda);
printf("val=[\n");
for (c = 0, j = 0; j < n; ++j) {
for (i = 0; i < m; ++i, ++c) {
if (c == PERLINE) { printf("\n"); c = 0; }
printf(FMT, nzval[i + j*lda]);
}
}
printf("];\n");
fflush(stdout);
}
};
};
};