#include "stdafx.h"
#include <stdlib.h> /* for getenv and atoi */
#include "hnum_pssp_defs.h"
namespace harlinn
{
namespace numerics
{
namespace SuperLU
{
namespace Single
{
void
psgstrf(superlumt_options_t *superlumt_options, SuperMatrix *A, int *perm_r,
SuperMatrix *L, SuperMatrix *U, Gstat_t *Gstat, int *info)
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 * Purpose
 * =======
 *
 * PSGSTRF computes an LU factorization of a general sparse nrow-by-ncol
 * matrix A using partial pivoting with row interchanges. The factorization
 * has the form
 *     Pr * A = L * U
 * where Pr is a row permutation matrix, L is lower triangular with unit
 * diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is
 * upper triangular (upper trapezoidal if A->nrow < A->ncol).
 *
 * Arguments
 * =========
 *
 * superlumt_options (input) superlumt_options_t*
 *        The structure defines the parameters to control how the sparse
 *        LU factorization is performed. The following fields must be set
 *        by the user:
 *
 *        o nprocs (int)
 *          Number of processes to be spawned and used for factorization.
 *
 *        o refact (yes_no_t)
 *          Specifies whether this is first time or subsequent factorization.
 *          = NO:  this factorization is treated as the first one;
 *          = YES: it means that a factorization was performed prior to this
 *                 one. Therefore, this factorization will re-use some
 *                 existing data structures, such as L and U storage, column
 *                 elimination tree, and the symbolic information of the
 *                 Householder matrix.
 *
 *        o panel_size (int)
 *          A panel consists of at most panel_size consecutive columns.
 *
 *        o relax (int)
 *          Degree of relaxing supernodes. If the number of nodes (columns)
 *          in a subtree of the elimination tree is less than relax, this
 *          subtree is considered as one supernode, regardless of the row
 *          structures of those columns.
 *
 *        o diag_pivot_thresh (float)
 *          Diagonal pivoting threshold. At step j of Gaussian elimination,
 *          if abs(A_jj) >= diag_pivot_thresh * (max_(i>=j) abs(A_ij)),
 *          use A_jj as pivot. 0 <= diag_pivot_thresh <= 1. The default
 *          value is 1.0, corresponding to partial pivoting.
 *
 *        o usepr (yes_no_t)
 *          Whether the pivoting will use perm_r specified by the user.
 *          = YES: use perm_r; perm_r is input, unchanged on exit.
 *          = NO:  perm_r is determined by partial pivoting, and is output.
 *
 *        o drop_tol (double) (NOT IMPLEMENTED)
 *          Drop tolerance parameter. At step j of the Gaussian elimination,
 *          if abs(A_ij)/(max_i abs(A_ij)) < drop_tol, drop entry A_ij.
 *          0 <= drop_tol <= 1. The default value of drop_tol is 0,
 *          corresponding to not dropping any entry.
 *
 *        o perm_c (int*)
 *          Column permutation vector of size A->ncol, which defines the
 *          permutation matrix Pc; perm_c[i] = j means column i of A is
 *          in position j in A*Pc.
 *
 *        o perm_r (int*)
 *          Row permutation vector of size A->nrow.
 *          If superlumt_options->usepr = NO, this is an output argument.
 *
 *        o work (void*) of size lwork
 *          User-supplied work space and space for the output data structures.
 *          Not referenced if lwork = 0;
 *
 *        o lwork (int)
 *          Specifies the length of work array.
 *            = 0:  allocate space internally by system malloc;
 *            > 0:  use user-supplied work array of length lwork in bytes,
 *                  returns error if space runs out.
 *            = -1: the routine guesses the amount of space needed without
 *                  performing the factorization, and returns it in
 *                  superlu_memusage->total_needed; no other side effects.
 *
 * A      (input) SuperMatrix*
 *        Original matrix A, permuted by columns, of dimension
 *        (A->nrow, A->ncol). The type of A can be:
 *        Stype = NCP; Dtype = _S; Mtype = GE.
 *
 * perm_r (input/output) int*, dimension A->nrow
 *        Row permutation vector which defines the permutation matrix Pr,
 *        perm_r[i] = j means row i of A is in position j in Pr*A.
 *        If superlumt_options->usepr = NO, perm_r is output argument;
 *        If superlumt_options->usepr = YES, the pivoting routine will try
 *           to use the input perm_r, unless a certain threshold criterion
 *           is violated. In that case, perm_r is overwritten by a new
 *           permutation determined by partial pivoting or diagonal
 *           threshold pivoting.
 *
 * L      (output) SuperMatrix*
 *        The factor L from the factorization Pr*A=L*U; use compressed row
 *        subscripts storage for supernodes, i.e., L has type:
 *        Stype = SCP, Dtype = _S, Mtype = TRLU.
 *
 * U      (output) SuperMatrix*
 *        The factor U from the factorization Pr*A*Pc=L*U. Use column-wise
 *        storage scheme, i.e., U has types: Stype = NCP, Dtype = _S,
 *        Mtype = TRU.
 *
 * Gstat  (output) Gstat_t*
 *        Record all the statistics about the factorization;
 *        See Gstat_t structure defined in slu_mt_util.h.
 *
 * info   (output) int*
 *        = 0: successful exit
 *        < 0: if info = -i, the i-th argument had an illegal value
 *        > 0: if info = i, and i is
 *             <= A->ncol: U(i,i) is exactly zero. The factorization has
 *                been completed, but the factor U is exactly singular,
 *                and division by zero will occur if it is used to solve a
 *                system of equations.
 *             > A->ncol: number of bytes allocated when memory allocation
 *                failure occurred, plus A->ncol.
 *
 */
psgstrf_threadarg_t *psgstrf_threadarg;
pxgstrf_shared_t pxgstrf_shared;
register int nprocs = superlumt_options->nprocs;
register int i, iinfo;
double *utime = Gstat->utime;
double usrtime, wtime;
#if ( MACH==SUN )
thread_t *thread_id;
#elif ( MACH==DEC || MACH==PTHREAD )
pthread_t *thread_id;
void *status;
#endif
void *psgstrf_thread(void *);
/* --------------------------------------------------------------
Initializes the parallel data structures for psgstrf_thread().
--------------------------------------------------------------*/
psgstrf_threadarg = psgstrf_thread_init(A, L, U, superlumt_options,
&pxgstrf_shared, Gstat, info);
if ( *info ) return;
/* Start timing factorization. */
usrtime = usertimer_();
wtime = SuperLU_timer_();
/* ------------------------------------------------------------
On a SUN multiprocessor system, use Solaris thread.
------------------------------------------------------------*/
#if ( MACH==SUN )
/* Create nproc threads for concurrent factorization. */
thread_id = (thread_t *) SUPERLU_MALLOC(nprocs * sizeof(thread_t));
for (i = 1; i < nprocs; ++i) {
#if ( PRNTlevel==1 )
printf(".. Create unbound threads: i %d, nprocs %d\n", i, nprocs);
#endif
if ( (iinfo = thr_create(NULL, 0,
psgstrf_thread, &(psgstrf_threadarg[i]),
0, &thread_id[i])) )
{
fprintf(stderr, "thr_create: %d\n", iinfo);
SUPERLU_ABORT("thr_create()");
}
}
/* The calling thread also participates as worker 0. */
psgstrf_thread( &(psgstrf_threadarg[0]) );
/* Wait for all threads to terminate. */
for (i = 1; i < nprocs; i++) thr_join(thread_id[i], 0, 0);
SUPERLU_FREE (thread_id);
/* _SOLARIS_2 */
/* ------------------------------------------------------------
On a DEC multiprocessor system, use pthread.
------------------------------------------------------------*/
#elif ( MACH==DEC ) /* Use DECthreads ... */
/* Create nproc threads for concurrent factorization. */
thread_id = (pthread_t *) SUPERLU_MALLOC(nprocs * sizeof(pthread_t));
for (i = 0; i < nprocs; ++i) {
if ( (iinfo = pthread_create(&thread_id[i],
NULL,
psgstrf_thread,
&(psgstrf_threadarg[i]))) ) {
fprintf(stderr, "pthread_create: %d\n", iinfo);
SUPERLU_ABORT("pthread_create()");
}
/* pthread_bind_to_cpu_np(thread_id[i], i);*/
}
/* Wait for all threads to terminate. */
for (i = 0; i < nprocs; i++)
pthread_join(thread_id[i], &status);
SUPERLU_FREE (thread_id);
/* _DEC */
/* ------------------------------------------------------------
On a SGI Power Challenge or Origin multiprocessor system,
use parallel C.
------------------------------------------------------------*/
#elif ( MACH==SGI || MACH==ORIGIN ) /* Use parallel C ... */
/* Honor the environment's thread cap, if it is set. */
if ( getenv("MP_SET_NUMTHREADS") ) {
i = atoi(getenv("MP_SET_NUMTHREADS"));
if ( nprocs > i ) {
printf("nprocs=%d > environment allowed: MP_SET_NUMTHREADS=%d\n",
nprocs, i);
exit(-1);
}
}
#pragma parallel
#pragma shared (psgstrf_threadarg)
/*#pragma numthreads (max = nprocs)*/
#pragma numthreads (nprocs)
{
psgstrf_thread( psgstrf_threadarg );
}
/* _SGI or _ORIGIN */
/* ------------------------------------------------------------
On a Cray PVP multiprocessor system, use microtasking.
------------------------------------------------------------*/
#elif ( MACH==CRAY_PVP ) /* Use C microtasking. */
/* Honor the environment's thread cap, if it is set. */
if ( getenv("NCPUS") ) {
i = atoi(getenv("NCPUS"));
if ( nprocs > i ) {
printf("nprocs=%d > environment allowed: NCPUS=%d\n",
nprocs, i);
exit(-1);
}
}
#pragma _CRI taskloop private (i,nprocs) shared (psgstrf_threadarg)
/* Stand-alone task loop */
for (i = 0; i < nprocs; ++i) {
psgstrf_thread( &(psgstrf_threadarg[i]) );
}
/* _CRAY_PVP */
/* ------------------------------------------------------------
Use POSIX threads.
------------------------------------------------------------*/
#elif ( MACH==PTHREAD ) /* Use pthread ... */
/* Create nproc threads for concurrent factorization. */
thread_id = (pthread_t *) SUPERLU_MALLOC(nprocs * sizeof(pthread_t));
for (i = 0; i < nprocs; ++i) {
if ( (iinfo = pthread_create(&thread_id[i],
NULL,
psgstrf_thread,
&(psgstrf_threadarg[i]))) ) {
fprintf(stderr, "pthread_create: %d\n", iinfo);
SUPERLU_ABORT("pthread_create()");
}
}
/* Wait for all threads to terminate. */
for (i = 0; i < nprocs; i++)
pthread_join(thread_id[i], &status);
SUPERLU_FREE (thread_id);
/* _PTHREAD */
/* ------------------------------------------------------------
Use openMP.
------------------------------------------------------------*/
#elif ( MACH==OPENMP ) /* Use openMP ... */
#pragma omp parallel for shared (psgstrf_threadarg) private (i)
/* Stand-alone task loop */
for (i = 0; i < nprocs; ++i) {
psgstrf_thread( &(psgstrf_threadarg[i]) );
}
/* _OPENMP */
/* ------------------------------------------------------------
On all other systems, use single processor.
------------------------------------------------------------*/
#else
printf("psgstrf() is not parallelized on this machine.\n");
printf("psgstrf() will be run on single processor.\n");
psgstrf_thread( &(psgstrf_threadarg[0]) );
#endif
/* Stop timing; wall-clock time is recorded in the FACT phase slot.
usrtime is reported only under PRNTlevel diagnostics. */
wtime = SuperLU_timer_() - wtime;
usrtime = usertimer_() - usrtime;
utime[int(PhaseType::FACT)] = wtime;
#if ( PRNTlevel==1 )
printf(".. psgstrf_thread() returns info %d, usrtime %.2f, wtime %.2f\n",
*info, usrtime, wtime);
#endif
/* check_mem_leak("after psgstrf_thread()"); */
/* ------------------------------------------------------------
Clean up and free storage after multithreaded factorization.
------------------------------------------------------------*/
psgstrf_thread_finalize(psgstrf_threadarg, &pxgstrf_shared,
A, perm_r, L, U);
}
};
};
};
};