#include "stdafx.h"
#include "hnum_pssp_defs.h"
namespace harlinn
{
namespace numerics
{
namespace SuperLU
{
namespace Single
{
void
*psgstrf_thread(void *arg)
{
/*
* -- SuperLU MT routine (version 2.0) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley,
* and Xerox Palo Alto Research Center.
* September 10, 2007
*
*
* Purpose
* =======
*
* This is the slave process, representing the main scheduling loop to
* perform the factorization. Each process executes a copy of the
* following code ... (SPMD paradigm)
*
* Working arrays local to each process
* ======================================
* marker[0:3*m-1]: marker[i] == j means node i has been reached when
* working on column j.
* Storage: relative to original row subscripts
*
* THERE ARE 3 OF THEM:
* marker[0 : m-1]: used by psgstrf_factor_snode() and
* psgstrf_panel_dfs();
* marker[m : 2m-1]: used by psgstrf_panel_dfs() and
* pxgstrf_super_bnd_dfs();
* values in [0 : n-1] when used by psgstrf_panel_dfs()
* values in [n : 2n-1] when used by pxgstrf_super_bnd_dfs()
* marker[2m : 3m-1]: used by psgstrf_column_dfs() in inner-factor
*
* parent[0:n-1]: parent vector used during dfs
* Storage: relative to new row subscripts
*
* xplore[0:2m-1]: xplore[i] gives the location of the next (dfs)
* unexplored neighbor of i in lsub[*]; xplore[n+i] gives the
* location of the last unexplored neighbor of i in lsub[*].
*
* segrep[0:nseg-1]: contains the list of supernodal representatives
* in topological order of the dfs. A supernode representative is the
* last column of a supernode.
*
* repfnz[0:m-1]: for a nonzero segment U[*,j] that ends at a
* supernodal representative r, repfnz[r] is the location of the first
* nonzero in this segment. It is also used during the dfs:
* repfnz[r]>0 indicates that supernode r has been explored.
* NOTE: There are w of them, each used for one column of a panel.
*
* panel_lsub[0:w*m-1]: temporary for the nonzero row indices below
* the panel diagonal. These are filled in during psgstrf_panel_dfs(),
* and are used later in the inner LU factorization.
* panel_lsub[]/dense[] pair forms the SPA data structure.
* NOTE: There are w of them.
*
* dense[0:w*m-1]: sparse accumulator (SPA) for intermediate values;
* NOTE: there are w of them.
*
* tempv[0:m-1]: real temporary used for dense numeric kernels;
*
*
* Scheduling algorithm (For each process ...)
* ====================
* Shared task Q <-- { relaxed s-nodes (CANGO) };
*
* WHILE (not finished)
*
* panel = Scheduler(Q); (see pxgstrf_scheduler.c for policy)
*
* IF (panel == RELAXED_SNODE)
* factor_relax_snode(panel);
* ELSE
* * psgstrf_panel_dfs()
* - skip all BUSY s-nodes (or panels)
*
* * dpanel_bmod()
* - updates from DONE s-nodes
* - wait for BUSY s-nodes to become DONE
*
* * inner-factor()
* - identical as it is in the sequential algorithm,
* except that pruning() will interact with the
* psgstrf_panel_dfs() of other panels.
* ENDIF
*
* END WHILE;
*
*/
#if ( MACH==SGI || MACH==ORIGIN )
#if ( MACH==SGI )
int pnum = mpc_my_threadnum();
#elif ( MACH==ORIGIN )
int pnum = mp_my_threadnum();
#endif
psgstrf_threadarg_t *thr_arg = &((psgstrf_threadarg_t *)arg)[pnum];
#else
psgstrf_threadarg_t *thr_arg = (psgstrf_threadarg_t *)arg;
int pnum = thr_arg->pnum;
#endif
/* Unpack the options argument */
superlumt_options_t *superlumt_options = thr_arg->superlumt_options;
pxgstrf_shared_t *pxgstrf_shared= thr_arg->pxgstrf_shared;
int panel_size = superlumt_options->panel_size;
float diag_pivot_thresh = superlumt_options->diag_pivot_thresh;
yes_no_t *usepr = &superlumt_options->usepr; /* may be modified */
int *etree = superlumt_options->etree;
int *super_bnd = superlumt_options->part_super_h;
int *perm_r = superlumt_options->perm_r;
int *inv_perm_c= pxgstrf_shared->inv_perm_c;
int *inv_perm_r= pxgstrf_shared->inv_perm_r;
int *xprune = pxgstrf_shared->xprune;
int *ispruned = pxgstrf_shared->ispruned;
SuperMatrix *A = pxgstrf_shared->A;
GlobalLU_t *Glu = pxgstrf_shared->Glu;
Gstat_t *Gstat = pxgstrf_shared->Gstat;
int *info = &thr_arg->info;
/* Local working arrays */
int *iwork;
float *dwork;
int *segrep, *repfnz, *parent, *xplore;
int *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */
int *marker, *marker1, *marker2;
int *lbusy; /* "Local busy" array, indicates which descendants
were busy when this panel's computation began.
Those columns (s-nodes) are treated specially
during psgstrf_panel_dfs() and dpanel_bmod(). */
int *spa_marker; /* size n-by-w */
int *w_lsub_end; /* record the end of each column in panel_lsub */
float *dense, *tempv;
int *lsub, *xlsub, *xlsub_end;
/* Local scalars */
register int m, n, k, jj, jcolm1, itemp, singular;
int pivrow; /* pivotal row number in the original matrix A */
int nseg1; /* no of segments in U-column above panel row jcol */
int nseg; /* no of segments in each U-column */
int w, bcol, jcol;
#ifdef PROFILE
double *utime = Gstat->utime;
double t1, t2, t, stime;
register float flopcnt;
#endif
#ifdef PREDICT_OPT
flops_t *ops = Gstat->ops;
register float pdiv;
#endif
#if ( DEBUGlevel>=1 )
printf("(%d) thr_arg-> pnum %d, info %d\n", pnum, thr_arg->pnum, thr_arg->info);
#endif
singular = 0;
m = A->nrow;
n = A->ncol;
lsub = Glu->lsub;
xlsub = Glu->xlsub;
xlsub_end = Glu->xlsub_end;
/* Allocate and initialize the per-process working storage. */
if ( (*info = psgstrf_WorkInit(m, panel_size, &iwork, &dwork)) ) {
*info += psgstrf_memory_use(Glu->nzlmax, Glu->nzumax, Glu->nzlumax);
return 0;
}
pxgstrf_SetIWork(m, panel_size, iwork, &segrep, &parent, &xplore,
&repfnz, &panel_lsub, &marker, &lbusy);
psgstrf_SetRWork(m, panel_size, dwork, &dense, &tempv);
/* New data structures to facilitate parallel algorithm */
spa_marker = intMalloc(m * panel_size);
w_lsub_end = intMalloc(panel_size);
ifill (spa_marker, m * panel_size, EMPTY);
ifill (marker, m * NO_MARKER, EMPTY);
ifill (lbusy, m, EMPTY);
jcol = EMPTY;
marker1 = marker + m;
marker2 = marker + 2*m;
#ifdef PROFILE
stime = SuperLU_timer_();
#endif
/* -------------------------
Main loop: repeatedly ...
------------------------- */
while ( pxgstrf_shared->tasks_remain > 0 ) {
#ifdef PROFILE
TIC(t);
#endif
/* Get a panel from the scheduler. */
pxgstrf_scheduler(pnum, n, etree, &jcol, &bcol, pxgstrf_shared);
#if ( DEBUGlevel>=1 )
if ( jcol>=LOCOL && jcol<=HICOL ) {
printf("(%d) Scheduler(): jcol %d, bcol %d, tasks_remain %d\n",
pnum, jcol, bcol, pxgstrf_shared->tasks_remain);
fflush(stdout);
}
#endif
#ifdef PROFILE
TOC(t2, t);
Gstat->procstat[pnum].skedtime += t2;
#endif
if ( jcol != EMPTY ) {
w = pxgstrf_shared->pan_status[jcol].size;
#if ( DEBUGlevel>=3 )
printf("P%2d got panel %5d-%5d\ttime %.4f\tpanels_left %d\n",
pnum, jcol, jcol+w-1, SuperLU_timer_(),
pxgstrf_shared->tasks_remain);
fflush(stdout);
#endif
/* Nondomain panels */
#ifdef PROFILE
flopcnt = Gstat->procstat[pnum].fcops;
Gstat->panstat[jcol].pnum = pnum;
TIC(t1);
Gstat->panstat[jcol].starttime = t1;
#endif
if ( pxgstrf_shared->pan_status[jcol].type == RELAXED_SNODE ) {
#ifdef PREDICT_OPT
pdiv = Gstat->procstat[pnum].fcops;
#endif
/* A relaxed supernode at the bottom of the etree */
psgstrf_factor_snode
(pnum, jcol, A, diag_pivot_thresh, usepr,
perm_r, inv_perm_r, inv_perm_c, xprune, marker,
panel_lsub, dense, tempv, pxgstrf_shared, info);
if ( *info ) {
if ( *info > n ) return 0;
else if ( singular == 0 || *info < singular )
singular = *info;
#if ( DEBUGlevel>=1 )
printf("(%d) After psgstrf_factor_snode(): singular=%d\n", pnum, singular);
#endif
}
/* Release the whole relaxed supernode */
for (jj = jcol; jj < jcol + w; ++jj)
pxgstrf_shared->spin_locks[jj] = 0;
#ifdef PREDICT_OPT
pdiv = Gstat->procstat[pnum].fcops - pdiv;
cp_panel[jcol].pdiv = pdiv;
#endif
} else { /* Regular panel */
#ifdef PROFILE
TIC(t);
#endif
psgstrf_mark_busy_descends(pnum, jcol, etree, pxgstrf_shared,
&bcol, lbusy);
/* Symbolic factor on a panel of columns */
psgstrf_panel_dfs
(pnum, m, w, jcol, A, perm_r, xprune,ispruned,lbusy,
&nseg1, panel_lsub, w_lsub_end, segrep, repfnz,
marker, spa_marker, parent, xplore, dense, Glu);
#if ( DEBUGlevel>=2 )
if ( jcol==BADPAN )
printf("(%d) After psgstrf_panel_dfs(): nseg1 %d, w_lsub_end %d\n",
pnum, nseg1, w_lsub_end[0]);
#endif
#ifdef PROFILE
TOC(t2, t);
utime[DFS] += t2;
#endif
/* Numeric sup-panel updates in topological order.
* On return, the update values are temporarily stored in
* the n-by-w SPA dense[m,w].
*/
psgstrf_panel_bmod
(pnum, m, w, jcol, bcol, inv_perm_r, etree,
&nseg1, segrep, repfnz, panel_lsub, w_lsub_end,
spa_marker, dense, tempv, pxgstrf_shared);
/*
* All "busy" descendants are "done" now --
* Find the set of row subscripts in the preceeding column
* "jcol-1" of the current panel. Column "jcol-1" is
* usually taken by a process other than myself.
* This row-subscripts information will be used by myself
* during column dfs to detect whether "jcol" belongs
* to the same supernode as "jcol-1".
*
* ACCORDING TO PROFILE, THE AMOUNT OF TIME SPENT HERE
* IS NEGLIGIBLE.
*/
jcolm1 = jcol - 1;
itemp = xlsub_end[jcolm1];
for (k = xlsub[jcolm1]; k < itemp; ++k)
marker2[lsub[k]] = jcolm1;
#ifdef PREDICT_OPT
pdiv = Gstat->procstat[pnum].fcops;
#endif
/* Inner-factorization, using sup-col algorithm */
for ( jj = jcol; jj < jcol + w; jj++) {
k = (jj - jcol) * m; /* index into w-wide arrays */
nseg = nseg1; /* begin after all the panel segments */
#ifdef PROFILE
TIC(t);
#endif
/* Allocate storage for the current H-supernode. */
if ( Glu->dynamic_snode_bound && super_bnd[jj] ) {
/* jj starts a supernode in H */
pxgstrf_super_bnd_dfs
(pnum, m, n, jj, super_bnd[jj], A, perm_r,
inv_perm_r, xprune, ispruned, marker1, parent,
xplore, pxgstrf_shared->Glu);
}
if ( (*info = psgstrf_column_dfs
(pnum, m, jj, jcol, perm_r, ispruned,
&panel_lsub[k],w_lsub_end[jj-jcol],
super_bnd, &nseg, segrep,
&repfnz[k], xprune, marker2,
parent, xplore, pxgstrf_shared)) )
return 0;
#ifdef PROFILE
TOC(t2, t);
utime[DFS] += t2;
#endif
/* On return, the L supernode is gathered into the
global storage. */
if ( (*info = psgstrf_column_bmod
(pnum, jj, jcol, (nseg - nseg1),
&segrep[nseg1], &repfnz[k],
&dense[k], tempv, pxgstrf_shared, Gstat)) )
return 0;
if ( (*info = psgstrf_pivotL
(pnum, jj, diag_pivot_thresh, usepr,
perm_r, inv_perm_r, inv_perm_c,
&pivrow, Glu, Gstat)) )
if ( singular == 0 || *info < singular ) {
singular = *info;
#if ( DEBUGlevel>=1 )
printf("(%d) After psgstrf_pivotL(): singular=%d\n", pnum, singular);
#endif
}
/* release column "jj", so that the other processes
waiting for this column can proceed */
pxgstrf_shared->spin_locks[jj] = 0;
/* copy the U-segments to ucol[*] */
if ( (*info = psgstrf_copy_to_ucol
(pnum,jj,nseg,segrep,&repfnz[k],
perm_r, &dense[k], pxgstrf_shared)) )
return 0;
/* Prune columns [0:jj-1] using column jj */
psgstrf_pruneL(jj, perm_r, pivrow, nseg, segrep,
&repfnz[k], xprune, ispruned, Glu);
/* Reset repfnz[] for this column */
pxgstrf_resetrep_col (nseg, segrep, &repfnz[k]);
#if ( DEBUGlevel>=2 )
/* if (jj >= LOCOL && jj <= HICOL) {*/
if ( jj==BADCOL ) {
dprint_lu_col(pnum, "panel:", jcol, jj, w, pivrow, xprune, Glu);
dcheck_zero_vec(pnum, "after psgstrf_copy_to_ucol() dense_col[]", n, &dense[k]);
}
#endif
} /* for jj ... */
#ifdef PREDICT_OPT
pdiv = Gstat->procstat[pnum].fcops - pdiv;
cp_panel[jcol].pdiv = pdiv;
#endif
} /* else regular panel ... */
STATE( jcol ) = DONE; /* Release panel jcol. */
#ifdef PROFILE
TOC(Gstat->panstat[jcol].fctime, t1);
Gstat->panstat[jcol].flopcnt += Gstat->procstat[pnum].fcops - flopcnt;
/*if ( Glu->tasks_remain < P ) {
flops_last_P_panels += Gstat->panstat[jcol].flopcnt;
printf("Panel %d, flops %e\n", jcol, Gstat->panstat[jcol].flopcnt);
fflush(stdout);
} */
#endif
}
#ifdef PROFILE
else { /* No panel from the task queue - wait and try again */
Gstat->procstat[pnum].skedwaits++;
}
#endif
} /* while there are more panels */
*info = singular;
/* Free work space and compress storage */
psgstrf_WorkFree(iwork, dwork, Glu);
SUPERLU_FREE (spa_marker);
SUPERLU_FREE (w_lsub_end);
#ifdef PROFILE
Gstat->procstat[pnum].fctime = SuperLU_timer_() - stime;
#endif
return 0;
}
};
};
};
};