Click here to Skip to main content
15,894,291 members
Articles / Desktop Programming / Win32

Stopwatch

Rate me:
Please Sign up or sign in to vote.
4.97/5 (29 votes)
3 Jan 2015CPOL6 min read 66.3K   1.5K   43  
Benchmark C++ std::vector vs raw arrays, move assignable/constructable & copy assignable/constructable
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 */

#ifndef __SUPERLU_UTIL /* allow multiple inclusions */
#define __SUPERLU_UTIL

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "hnum_slu_mt_machines.h"

/* Macros */
#ifndef USER_ABORT
#define USER_ABORT(msg) superlu_abort_and_exit(msg)
#endif

#define SUPERLU_ABORT(err_msg) \
 { char msg[256];\
   sprintf_s(msg,"%s at line %d in file %s\n",err_msg,__LINE__, __FILE__);\
   USER_ABORT(msg); }


#ifndef USER_MALLOC
#define USER_MALLOC(size) superlu_malloc(size)
#endif

#define SUPERLU_MALLOC(size) USER_MALLOC(size)

#ifndef USER_FREE
#define USER_FREE(addr) superlu_free(addr)
#endif

#define SUPERLU_FREE(addr) USER_FREE(addr)

#define MAX(x, y) 	   ( (x) > (y) ? (x) : (y) )
#define MIN(x, y) 	   ( (x) < (y) ? (x) : (y) )
#define SUPERLU_MAX(x, y)  ( (x) > (y) ? (x) : (y) )
#define SUPERLU_MIN(x, y)  ( (x) < (y) ? (x) : (y) )

/*********************************************************
 * Macros used for easy access of sparse matrix entries. *
 *********************************************************/
#define L_SUB_START(col)     ( Lstore->rowind_colbeg[col] )
#define L_SUB_END(col)       ( Lstore->rowind_colend[col] )
#define L_SUB(ptr)           ( Lstore->rowind[ptr] )
#define L_NZ_START(col)      ( Lstore->nzval_colbeg[col] )
#define L_NZ_END(col)        ( Lstore->nzval_colend[col] )
#define L_FST_SUPC(superno)  ( Lstore->sup_to_colbeg[superno] )
#define L_LAST_SUPC(superno) ( Lstore->sup_to_colend[superno] )
#define U_NZ_START(col)      ( Ustore->colbeg[col] )
#define U_NZ_END(col)        ( Ustore->colend[col] )
#define U_SUB(ptr)           ( Ustore->rowind[ptr] )

#define SUPER_REP(s)    ( xsup_end[s]-1 )
#define SUPER_FSUPC(s)  ( xsup[s] )
#define SINGLETON(s)    ( (xsup_end[s] - xsup[s]) == 1 )
#define ISPRUNED(j)     ( ispruned[j] )
#define STATE(j)        ( pxgstrf_shared->pan_status[j].state )
#define DADPANEL(j)     ( etree[j + pxgstrf_shared->pan_status[j].size-1] )

#ifdef PROFILE
#define TIC(t)          t = SuperLU_timer_()
#define TOC(t2, t1)     t2 = SuperLU_timer_() - t1
#else
#define TIC(t)
#define TOC(t2, t1)
#endif

/* 
 * Constants 
 */
#define EMPTY	(-1)
#define FALSE	0
#define TRUE	1

/**********************
  Enumerated constants
  *********************/
typedef enum {NO, YES}                       yes_no_t;
typedef enum {NOTRANS, TRANS, CONJ}          trans_t;
typedef enum {DOFACT, EQUILIBRATE, FACTORED} fact_t;
typedef enum {NATURAL, MMD_ATA, MMD_AT_PLUS_A, COLAMD,
	      METIS_AT_PLUS_A, PARMETIS, MY_PERMC} colperm_t;
typedef enum {NOEQUIL, ROW, COL, BOTH}       equed_t;
typedef enum {LUSUP, UCOL, LSUB, USUB}       MemType;

/* Number of marker arrays used in the symbolic factorization, 
   each of size nrow. */
#define NO_MARKER     3

#define LOCOL    70
#define HICOL    78
#define BADROW   44
#define BADCOL   35
#define BADPAN   BADCOL
#define BADREP   35

/*
 * Type definitions
 */
typedef float    flops_t;
typedef unsigned char Logical;

#if ( MACH==DEC || MACH==PTHREAD )
#include <pthread.h>
typedef pthread_mutex_t mutex_t;
#elif ( MACH==SGI || MACH==ORIGIN )
typedef int mutex_t;
#elif ( MACH==CRAY_PVP || MACH==OPENMP )
typedef int mutex_t;
#endif


enum class PhaseType : int
{
    RELAX,
    COLPERM,
    ETREE,
    EQUIL,
    FINDDOMAIN,
    FACT,
    DFS,
    FLOAT,
    TRSV,
    GEMV,
    RCOND,
    TRISOLVE,
    SOLVE,
    REFINE,
    FERR,
    NPHASES
};

/* 
 * *********************************************************************
 * The superlumt_options_t structure contains the shared variables used 
 * for factorization, which are passed to each thread.
 * *********************************************************************
 * 
 * nprocs (int)
 *        Number of processes (or threads) to be spawned and used to perform
 *        the LU factorization by pdgstrf().
 *
 * fact   (fact_t)
 *        Specifies whether or not the factored form of the matrix
 *        A is supplied on entry, and if not, whether the matrix A should
 *        be equilibrated before it is factored.
 *        = DOFACT: The matrix A will be factored, and the factors will be
 *              stored in L and U.
 *        = EQUILIBRATE: The matrix A will be equilibrated if necessary, then
 *              factored into L and U.
 *
 * trans  (trans_t)
 *        Specifies the form of the system of equations:
 *        = NOTRANS: A * X = B        (No transpose)
 *        = TRANS:   A**T * X = B     (Transpose)
 *        = CONJ:    A**H * X = B     (Transpose)
 *
 * refact (yes_no_t)
 *        Specifies whether this is first time or subsequent factorization.
 *        = NO:  this factorization is treated as the first one;
 *        = YES: it means that a factorization was performed prior to this
 *               one. Therefore, this factorization will re-use some
 *               existing data structures, such as L and U storage, column
 *               elimination tree, and the symbolic information of the
 *               Householder matrix.
 *
 * panel_size (int)
 *        A panel consists of at most panel_size consecutive columns.
 *
 * relax  (int)
 *        To control degree of relaxing supernodes. If the number
 *        of nodes (columns) in a subtree of the elimination tree is less
 *        than relax, this subtree is considered as one supernode,
 *        regardless of the row structures of those columns.
 *
 * diag_pivot_thresh (double)
 *        Diagonal pivoting threshold. At step j of the Gaussian elimination,
 *        if abs(A_jj) >= diag_pivot_thresh * (max_(i>=j) abs(A_ij)),
 *        use A_jj as pivot, else use A_ij with maximum magnitude. 
 *        0 <= diag_pivot_thresh <= 1. The default value is 1, 
 *        corresponding to partial pivoting.
 *
 * drop_tol (double) (NOT IMPLEMENTED)
 *	  Drop tolerance parameter. At step j of the Gaussian elimination,
 *        if abs(A_ij)/(max_i abs(A_ij)) < drop_tol, drop entry A_ij.
 *        0 <= drop_tol <= 1. The default value of drop_tol is 0,
 *        corresponding to not dropping any entry.
 *
 * usepr  (yes_no_t)
 *        Whether the pivoting will use perm_r specified by the user.
 *        = YES: use perm_r; perm_r is input, unchanged on exit.
 *        = NO:  perm_r is determined by partial pivoting, and is output.
 *
 * SymmetricMode (yest_no_t)
 *        Specifies whether to use symmetric mode.
 *
 * PrintStat (yes_no_t)
 *        Specifies whether to print solver's statistics.
 *
 * perm_c (int*) dimension A->ncol
 *	  Column permutation vector, which defines the 
 *        permutation matrix Pc; perm_c[i] = j means column i of A is 
 *        in position j in A*Pc.
 *        When search for diagonal, perm_c[*] is applied to the
 *        row subscripts of A, so that diagonal threshold pivoting
 *        can find the diagonal of A, instead of that of A*Pc.
 *
 * perm_r (int*) dimension A->nrow
 *        Row permutation vector which defines the permutation matrix Pr,
 *        perm_r[i] = j means row i of A is in position j in Pr*A.
 *        If usepr = NO, perm_r is output argument;
 *        If usepr = YES, the pivoting routine will try to use the input
 *           perm_r, unless a certain threshold criterion is violated.
 *           In that case, perm_r is overwritten by a new permutation
 *           determined by partial pivoting or diagonal threshold pivoting.
 *
 * work   (void*) of size lwork
 *        User-supplied work space and space for the output data structures.
 *        Not referenced if lwork = 0;
 *
 * lwork  (int)
 *        Specifies the length of work array.
 *        = 0:  allocate space internally by system malloc;
 *        > 0:  use user-supplied work array of length lwork in bytes,
 *              returns error if space runs out.
 *        = -1: the routine guesses the amount of space needed without
 *              performing the factorization, and returns it in
 *              superlu_memusage->total_needed; no other side effects.
 *
 * etree  (int*)
 *        Elimination tree of A'*A, dimension A->ncol.
 *        Note: etree is a vector of parent pointers for a forest whose
 *        vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
 *        On input, the columns of A should be permutated so that the
 *        etree is in a certain postorder.
 *
 * colcnt_h (int*)
 *        Column colunts of the Householder matrix.
 *
 * part_super_h (int*)
 *        Partition of the supernodes in the Householder matrix.
 *	  part_super_h[k] = size of the supernode beginning at column k;
 * 	                  = 0, elsewhere.
 *
 *
 */
typedef struct {
    int        nprocs;
    fact_t     fact;
    trans_t    trans;
    yes_no_t   refact;
    int        panel_size;
    int        relax;
    double     diag_pivot_thresh;
    double     drop_tol;
    colperm_t  ColPerm;
    yes_no_t   usepr;
    yes_no_t   SymmetricMode;
    yes_no_t   PrintStat;

    /* The following arrays are persistent during repeated factorizations. */
    int  *perm_c;
    int  *perm_r;
    void *work;
    int  lwork;

    /* The following structural arrays are computed internally by 
       dsp_colorder(), so the user does not provide them on input.
       These 3 arrays are computed in the first factorization, and are 
       re-used in the subsequent factors of the matrices with the same
       nonzero structure. */
    int  *etree;
    int  *colcnt_h;
    int  *part_super_h;
} superlumt_options_t;

/* ----------------------------------------------
    The definitions below are used for profiling.
   ---------------------------------------------- */

/* The statistics to be kept by each processor. */
typedef struct {
    int	    panels;    /* number of panels taken */
    float   fcops;     /* factor floating-point operations */
    double  fctime;    /* factor time */
    int     skedwaits; /* how many times the processor fails to get a task */
    double  skedtime;  /* time spent in the scheduler */
    double  cs_time;   /* time spent in the critical sections */
    double  spintime;  /* spin-wait time */
    int     pruned;
    int     unpruned;
} procstat_t;


/* Statistics about each panel. */

typedef struct {
    int    size;      /* size of the panel */
    int    pnum;      /* which processor grabs this panel */
    double starttime; /* at waht time this panel is assigned to a proc */
    double fctime;    /* factorization time */
    float  flopcnt;   /* floating-point operations */
    int    pipewaits; /* how many times the panel waited during pipelining */
    double spintime;  /* spin waiting time during pipelining */
} panstat_t;

/* How was a panel selected by the scheduler */
typedef enum {NOPIPE, DADPAN, PIPE} how_selected_t;

/* Headers for 4 types of dynamatically managed memory */
typedef struct e_node {
    int size;      /* length of the memory that has been used */
    void *mem;     /* pointer to the new malloc'd store */
} ExpHeader;

/* The structure to keep track of memory usage. */
typedef struct {
    float for_lu;
    float total_needed;
    int   expansions;
} superlu_memusage_t;

typedef struct {
     flops_t flops;
     int     nzs;
     double  fctime;
} stat_relax_t;

typedef struct {
     flops_t flops;
     int nzs;
     double fctime;
} stat_col_t;

typedef struct {
     int ncols;
     flops_t flops;
     int nzs;
     double fctime;
} stat_snode_t;

/* -------------------------------------------------------------------
   The definitions below are used to simulate parallel execution time.
   ------------------------------------------------------------------- */
typedef struct {
    float est;  /* earliest (possible) start time of the panel */
    float pdiv; /* time in flops spent in the (inner) panel factorization */
} cp_panel_t;

typedef struct {
    float eft;  /* earliest finishing time */
    float pmod; /* pmod update to the ancestor panel */
} desc_eft_t;
		   
/* All statistics. */
typedef struct {
    int     	*panel_histo;	/* Panel size distribution */
    double  	*utime;
    flops_t 	*ops;
    procstat_t 	*procstat;
    panstat_t	*panstat;
    int      	num_panels;
    float     	dom_flopcnt;
    float     	flops_last_P_panels;
    /**/
    stat_relax_t *stat_relax;
    stat_col_t *stat_col; 
    stat_snode_t *stat_snode; 
    int *panhows;
    cp_panel_t *cp_panel; /* panels on the critical path */
    desc_eft_t *desc_eft; /* all we need to know from descendants */
    int        *cp_firstkid, *cp_nextkid; /* linked list of children */
    int        *height;
    float      *flops_by_height;
} Gstat_t;

struct Branch {
    int root, first_desc, which_bin;
    struct Branch *next;
};


#if 0

/* Statistics for supernode and panel size */
int 	no_panels;
float   sum_w;          /* Sum (Wi) */
float 	sum_np_w;       /* Sum (Npi*Wi) */
int 	max_np;          
int     no_sups;
float   sum_sup;        /* Sum (Supi) */
int     max_sup;     
flops_t reuse_flops;    /* Triangular solve and matrix vector multiply */
float   reuse_data;     /* Doubles in updating supernode */

/* Statistics for blas operations */
int     num_blas;       /* no of BLAS2 operations, including trsv/gemv */
int     max_blas_n;     /* max dimension n in tri-solve and mat-vec */
int     min_blas_n;     /* min dimension n in tri-solve and mat-vec */
float   sum_blas_n;     /* sum of "        "        " */
int     max_gemv_m;     /* max dimension m in mat-vec */
int     min_gemv_m;     /* max dimension m in mat-vec */
float   sum_gemv_m;     /* sum of "        "        " */
int     lda_blas_m;
int     lda_blas_n;
flops_t *gemv_ops;      /* flops distribution on (m,n) */
flops_t *trsv_ops;      /* flops distribution on n */

#define i_trsv_ops(i)      trsv_ops[i]
#define ij_gemv_ops(i,j)   gemv_ops[j*lda_blas_m + i]

#endif


/* *********************
   Function prototypes
   *********************/





#endif /* __SUPERLU_UTIL */

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Architect Sea Surveillance AS
Norway Norway
Chief Architect - Sea Surveillance AS.

Specializing in integrated operations and high performance computing solutions.

I’ve been fooling around with computers since the early eighties, I’ve even done work on CP/M and MP/M.

Wrote my first “real” program on a BBC micro model B based on a series in a magazine at that time. It was fun and I got hooked on this thing called programming ...

A few Highlights:

  • High performance application server development
  • Model Driven Architecture and Code generators
  • Real-Time Distributed Solutions
  • C, C++, C#, Java, TSQL, PL/SQL, Delphi, ActionScript, Perl, Rexx
  • Microsoft SQL Server, Oracle RDBMS, IBM DB2, PostGreSQL
  • AMQP, Apache qpid, RabbitMQ, Microsoft Message Queuing, IBM WebSphereMQ, Oracle TuxidoMQ
  • Oracle WebLogic, IBM WebSphere
  • Corba, COM, DCE, WCF
  • AspenTech InfoPlus.21(IP21), OsiSoft PI


More information about what I do for a living can be found at: harlinn.com or LinkedIn

You can contact me at espen@harlinn.no

Comments and Discussions