// #include "hnumslucmn.h"
#pragma once
#ifndef __HNUMSLUCMN_H__
#define __HNUMSLUCMN_H__
#include "hnum_slu_mt_machines.h"
/*
* File name: pssp_defs.h
* Purpose: Sparse matrix types and function prototypes
* History:
*/
/****************************
Include thread header file
***************************/
#if defined ( _SOLARIS )
#include <thread.h>
#include <sched.h>
#elif defined( _DEC )
#include <pthread.h>
#include <unistd.h>
#include <sys/mman.h>
#elif defined ( _OPENMP )
#include <omp.h>
#elif defined ( _PTHREAD )
#include <pthread.h>
#elif defined ( _CRAY )
#include <fortran.h>
#include <string.h>
#endif
/* Define my integer type int_t */
typedef int int_t; /* default */
#include "hnum_slu_mt_Cnames.h"
#include "hnum_supermatrix.h"
#include "hnum_slu_mt_util.h"
#include "hnum_pxgstrf_synch.h"
#include "hnum_cblas.h"
namespace harlinn
{
namespace numerics
{
namespace SuperLU
{
/*
* *************************************************
* Global data structures used in LU factorization
* *************************************************
*
* nsuper: number of supernodes = nsuper+1, numbered between 0 and nsuper.
*
* (supno, xsup, xsup_end):
* supno[i] is the supernode number to which column i belongs;
* xsup[s] points to the first column of supernode s;
* xsup_end[s] points to one past the last column of supernode s.
* Example: supno 0 1 2 2 3 3 3 4 4 4 4 4 (n=12)
* xsup 0 1 2 4 7
* xsup_end 1 2 4 7 12
* Note: dfs will be performed on supernode rep. relative to the new
* row pivoting ordering
*
* (lsub, xlsub, xlsub_end):
* lsub[*] contains the compressed subscripts of the supernodes;
* xlsub[j] points to the starting location of the j-th column in
* lsub[*];
* xlsub_end[j] points to one past the ending location of the j-th
* column in lsub[*].
* Storage: original row subscripts in A.
*
* During the course of sparse LU factorization, we also use
* (lsub, xlsub, xlsub_end, xprune) to represent symmetrically
* pruned graph. Contention will occur when one processor is
* performing DFS on supernode S, while another processor is pruning
* supernode S. We use the following data structure to deal with
* this problem. Suppose each supernode contains columns {s,s+1,...,t},
* with first column s and last column t.
*
* (1) if t > s, only the subscript sets for column s and column t
* are stored. Column t represents pruned adjacency structure.
*
* --------------------------------------------
* lsub[*] ... | col s | col t | ...
* --------------------------------------------
* ^ ^ ^
* xlsub[s] xlsub_end[s] xlsub_end[s+1]
* xlsub[s+1] :
* : :
* : xlsub_end[t]
* xlsub[t] xprune[t]
* xprune[s]
*
* (2) if t == s, i.e., a singleton supernode, the subscript set
* is stored twice:
*
* --------------------------------------
* lsub[*] ... | s | s | ...
* --------------------------------------
* ^ ^ ^
* xlsub[s] xlsub_end[s] xprune[s]
*
* There are two subscript sets for each supernode, the last column
* structures (for pruning) will be removed after the numerical LU
* factorization phase:
* o lsub[j], j = xlsub[s], ..., xlsub_end[s]-1
* is the structure of column s (i.e. structure of this supernode).
* It is used for the storage of numerical values.
* o lsub[j], j = xlsub[t], ..., xlsub_end[t]-1
* is the structure of the last column t of this supernode.
* It is for the purpose of symmetric pruning. Therefore, the
* structural subscripts can be rearranged without making physical
* interchanges among the numerical values.
*
* DFS will traverse the first subscript set if the supernode
* has not been pruned; otherwise it will traverse the second
* subscript list, i.e., the part of the pruned graph.
*
* (lusup, xlusup, xlusup_end):
* lusup[*] contains the numerical values of the supernodes;
* xlusup[j] points to the starting location of the j-th column in
* storage vector lusup[*];
* xlusup_end[j] points to one past the ending location of the j-th
* column in lusup[*].
* Each supernode is stored in column-major, consistent with Fortran
* two-dimensional array storage.
*
* (ucol, usub, xusub, xusub_end):
* ucol[*] stores the numerical values of the U-columns above the
* supernodes.
* usub[k] stores the row subscripts of nonzeros ucol[k];
* xusub[j] points to the starting location of column j in ucol/usub[];
* xusub_end[j] points to one past the ending location column j in
* ucol/usub[].
* Storage: new row subscripts; that is indexed intp PA.
*
*/
struct GlobalLUBase_t
{
int *xsup; // supernode and column mapping
int *xsup_end;
int *supno;
int *lsub; // compressed L subscripts
int *xlsub;
int *xlsub_end;
int *xlusup;
int *xlusup_end;
int *usub;
int *xusub;
int *xusub_end;
int nsuper; // current supernode number
int nextl; // next position in lsub[]
int nextu; // next position in usub[]/ucol[]
int nextlu; // next position in lusup[]
int nzlmax; // current max size of lsub[]
int nzumax; // " " " ucol[]
int nzlumax; // " " " lusup[]
// ---------------------------------------------------------------
// Memory managemant for L supernodes
//
int *map_in_sup; // size n+1 - the address offset of each column
// in lusup[*], which is divided into regions
// by the supernodes of Householder matrix H.
// If column k starts a supernode in H,
// map_in_sup[k] is the next open position in
// lusup[*]; otherwise map_in_sup[k] gives the
// offset (negative) to the leading column
// of the supernode in H.
//
int dynamic_snode_bound;
/* --------------------------------------------------------------- */
~GlobalLUBase_t()
{
if(map_in_sup)
{
delete [] map_in_sup;
}
}
};
template <typename T>
struct GlobalLU_T : public GlobalLUBase_t
{
T *lusup; // L supernodes
T *ucol; // U columns
};
/*
* *********************************************************************
* The pxgstrf_shared_t structure contains the shared task queue and
* the synchronization variables to facilitate parallel factorization.
* It also contains the shared L and U data structures.
* *********************************************************************
*/
struct pxgstrf_shared_Base_t
{
// ----------------------------------------------------------------
// Global variables introduced in parallel code for synchronization.
//
volatile int tasks_remain; // number of untaken panels
int num_splits; // number of panels split at the top
queue_t taskq; // size ncol - shared work queue
mutex_t *lu_locks; // 5 named mutual exclusive locks
volatile int *spin_locks; // size ncol - mark every busy column
pan_status_t *pan_status; // size ncol - panel status
int *fb_cols; // size ncol - mark farthest busy column
// ----------------------------------------------------------------
int *inv_perm_c;
int *inv_perm_r;
int *xprune;
int *ispruned;
SuperMatrix *A;
Gstat_t *Gstat;
int *info;
};
template <typename T>
struct pxgstrf_shared_T : public pxgstrf_shared_Base_t
{
T *Glu;
};
/* Arguments passed to each thread. */
struct pxgstrf_threadarg_t
{
int pnum; /* process number */
int info; /* error code returned from each thread */
superlumt_options_t *superlumt_options;
};
template <typename T>
struct pxgstrf_threadarg_T : public pxgstrf_threadarg_t
{
T *pxgstrf_shared; /* shared for LU factorization */
};
int *intMalloc (int);
int *intCalloc (int);
void Destroy_SuperMatrix_Store(SuperMatrix *);
void Destroy_CompCol_Matrix(SuperMatrix *);
void Destroy_CompCol_Permuted(SuperMatrix *);
void Destroy_CompCol_NCP(SuperMatrix *);
void Destroy_SuperNode_Matrix(SuperMatrix *);
void Destroy_SuperNode_SCP(SuperMatrix *);
void pxgstrf_finalize(superlumt_options_t *, SuperMatrix *);
void pxgstrf_relax_snode( const int n, superlumt_options_t *superlumt_options, pxgstrf_relax_t *pxgstrf_relax );
void pxgstrf_super_bnd_dfs(
const int pnum, /* process number */
const int m, /* number of rows in the matrix */
const int n, /* number of columns in the matrix */
const int jcol, /* first column of the H-supernode */
const int w, /* size of the H-supernode */
SuperMatrix *A, /* original matrix */
int *perm_r, /* in */
int *iperm_r, /* in; inverse of perm_r */
int *xprune, /* in */
int *ispruned, /* in */
int *marker, /* modified */
int *parent, /* working array */
int *xplore, /* working array */
GlobalLUBase_t *Glu /* modified */
);
void copy_mem_int(int howmany, void *old, void *new_);
double dlangs(char *norm, SuperMatrix *A);
float clangs(char *norm, SuperMatrix *A);
float slangs(char *norm, SuperMatrix *A);
double zlangs(char *norm, SuperMatrix *A);
/*
* Dynamically set up storage image in lusup[*], using the supernode
* boundaries in H.
*/
int DynamicSetMap(
const int pnum, /* process number */
const int jcol, /* leading column of the s-node in H */
const int num, /* number of elements requested */
GlobalLUBase_t *Glu
);
double SuperLU_timer_();
double usertimer_();
int sp_ienv(int);
double slamch_(const char*);
double dlamch_(const char*);
float slangs(char *, SuperMatrix *);
void superlu_abort_and_exit(char *);
void *superlu_malloc (size_t);
void superlu_free (void*);
void PrintStat(Gstat_t *);
int ParallelProfile(const int, const int, const int,const int procs, Gstat_t *);
void ifill(int *, int, int);
int sp_coletree (int *, int *, int *, int, int, int *);
void sp_colorder(SuperMatrix *A, int *perm_c, superlumt_options_t *options,SuperMatrix *AC);
int *TreePostorder (int, int *);
int qrnzcnt(int neqns, int adjlen, int *xadj, int *adjncy, int *zfdperm,
int *perm, int *invp, int *etpar, int *colcnt_h,
int *nlnz, int *part_super_ata, int *part_super_h);
//
int await(volatile int *status);
// Factorization related
void pxgstrf_scheduler(const int pnum, const int n, const int *etree, int *cur_pan, int *bcol, pxgstrf_shared_Base_t *pxgstrf_shared);
int ParallelFinalize (pxgstrf_shared_Base_t *pxgstrf_shared);
int ParallelInit(int n, pxgstrf_relax_t *pxgstrf_relax, superlumt_options_t *superlumt_options, pxgstrf_shared_Base_t *pxgstrf_shared);
int queue_init (queue_t *, int);
int queue_destroy (queue_t *);
int EnqueueRelaxSnode (queue_t *, int, pxgstrf_relax_t *, pxgstrf_shared_Base_t *);
int EnqueueDomains(queue_t *, struct Branch *, pxgstrf_shared_Base_t *);
int Enqueue (queue_t *, qitem_t);
int Dequeue (queue_t *, qitem_t *);
int NewNsuper (const int, pxgstrf_shared_Base_t *, int *);
void pxgstrf_resetrep_col(const int nseg, const int *segrep, int *repfnz);
void countnz(const int n, int *xprune, int *nnzL, int *nnzU, GlobalLUBase_t *Glu);
void fixupL(const int n, const int *perm_r, GlobalLUBase_t *Glu);
void StatAlloc (const int, const int, const int, const int, Gstat_t*);
void StatInit (const int, const int, Gstat_t*);
void StatFree (Gstat_t*);
// Memory related
void pxgstrf_SetIWork (int, int, int *, int **, int **, int **,int **, int **, int **, int **);
};
};
};
#endif //__HNUMSLUCMN_H__