Click here to Skip to main content
15,891,721 members
Articles / Desktop Programming / Win32

Stopwatch

Rate me:
Please Sign up or sign in to vote.
4.97/5 (29 votes)
3 Jan 2015CPOL6 min read 66.3K   1.5K   43  
Benchmark C++ std::vector vs raw arrays, move assignable/constructable & copy assignable/constructable
#include "stdafx.h"

/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 */
#include <stdio.h>
#include <stdlib.h>
#include "hnum_pcsp_defs.h"
namespace harlinn
{
    namespace numerics
    {
        namespace SuperLU
        {
            namespace Complex
            {
                /* ------------------
                   Constants & Macros
                   ------------------ */
                #define EXPAND      1.5
                #define NO_MEMTYPE  4      /* 0: lusup;
			                      1: ucol;
			                      2: lsub;
			                      3: usub */
                #define GluIntArray(n)   (9 * (n) + 5)

                /* -------------------
                   Internal prototypes
                   ------------------- */
                void    *pcgstrf_expand (int *, MemType,int, int, GlobalLU_t *);
                void    copy_mem_complex (int, void *, void *);
                void    pcgstrf_StackCompress(GlobalLU_t *);
                void    pcgstrf_SetupSpace (void *, int);
                void    *cuser_malloc   (int, int);
                void    cuser_free      (int, int);

                /* ----------------------------------------------
                   External prototypes (in memory.c - prec-indep)
                   ---------------------------------------------- */
                
                namespace
                {
                    inline void user_bcopy(const void *src, void *dest, size_t n)
                    {
                        memcpy(dest,src,n);
                    }

                }
                typedef struct 
                {
                    int  size;
                    int  used;
                    int  top1;  /* grow upward, relative to &array[0] */
                    int  top2;  /* grow downward */
                    void *array;
                } LU_stack_t;

                typedef enum {HEAD, TAIL}   stack_end_t;
                typedef enum {SYSTEM, USER} LU_space_t;

                // Array of pointers to 4 types of memory 
                ExpHeader *cexpanders = 0; 
                static LU_stack_t stack;
                static int        no_expand;
                static int        ndim;
                static LU_space_t whichspace; /* 0 - system malloc'd; 1 - user provided */

                /* Macros to manipulate stack */
                #define StackFull(x)         ( x + stack.used >= stack.size )
                #define NotDoubleAlign(addr) ( (long int)addr & 7 )
                #define DoubleAlign(addr)    ( ((long int)addr + 7) & ~7L )

                #define Reduce(alpha)        ((alpha + 1) / 2)     /* i.e. (alpha-1)/2 + 1 */

                /* temporary space used by BLAS calls */
                #define NUM_TEMPV(n,w,t,b)  (SUPERLU_MAX( 2*n, (t + b)*w ))

                /*
                 * Setup the memory model to be used for factorization.
                 *    lwork = 0: use system malloc;
                 *    lwork > 0: use user-supplied work[] space.
                 */
                void pcgstrf_SetupSpace(void *work, int lwork)
                {
                    if ( lwork == 0 ) 
                    {
                        whichspace = SYSTEM; /* malloc/free */
                    } 
                    else if ( lwork > 0 ) 
                    {
                        whichspace = USER;   /* user provided space */
                        stack.size = lwork;
                        stack.used = 0;
                        stack.top1 = 0;
                        stack.top2 = lwork;
                        stack.array = (void *) work;
                    }
                }


                void *cuser_malloc(int bytes, int which_end)
                {
                    void *buf;
    
                    if ( StackFull(bytes) ) return (NULL);

                    if ( which_end == HEAD ) 
                    {
	                    buf = (char*) stack.array + stack.top1;
	                    stack.top1 += bytes;
                    } 
                    else 
                    {
	                    stack.top2 -= bytes;
	                    buf = (char*) stack.array + stack.top2;
                    }
    
                    stack.used += bytes;
                    return buf;
                }


                void cuser_free(int bytes, int which_end)
                {
                    if ( which_end == HEAD ) 
                    {
	                    stack.top1 -= bytes;
                    } 
                    else 
                    {
	                    stack.top2 += bytes;
                    }
                    stack.used -= bytes;
                }


                // Returns the working storage used during factorization 
                int superlu_cTempSpace(int n, int w, int p)
                {
                    float tmp;
                    float ptmp;
                    int iword = sizeof(int); 
                    int dword = sizeof(complex);
                    int maxsuper = sp_ienv(3);
                    int rowblk   = sp_ienv(4);

                    // globally shared 
                    tmp = 14 * n * iword;

                    // local to each processor
                    ptmp = (2 * w + 5 + NO_MARKER) * n * iword;
                    ptmp += (n * w + NUM_TEMPV(n,w,maxsuper,rowblk)) * dword;

                #if ( PRNTlevel>=1 )
                    printf("Per-processor work[] %.0f MB\n", ptmp/1024/1024);
                #endif

                    ptmp *= p;

                    return tmp + ptmp;
                }

                /*
                 * superlu_memusage consists of the following fields:
                 *    o for_lu (float)
                 *      The amount of space used in bytes for L\U data structures.
                 *    o total_needed (float)
                 *      The amount of space needed in bytes to perform factorization.
                 *    o expansions (int)
                 *      The number of memory expansions during the LU factorization.
                 */
                int superlu_cQuerySpace(int P, SuperMatrix *L, SuperMatrix *U, int panel_size,superlu_memusage_t *superlu_memusage)
                {
                    SCPformat *Lstore;
                    NCPformat *Ustore;
                    register int n, iword, dword, lwork;

                    Lstore = (SCPformat *)L->Store;
                    Ustore = (NCPformat *)U->Store;
                    n = L->ncol;
                    iword = sizeof(int);
                    dword = sizeof(complex);

                    /* L supernodes of type SCP */
                    superlu_memusage->for_lu = (float) (7*n + 3) * iword
                                             + (float) Lstore->nzval_colend[n-1] * dword
                                             + (float) Lstore->rowind_colend[n-1] * iword;

                    /* U columns of type NCP */
                    superlu_memusage->for_lu += (2*n + 1) * iword
                        + (float) Ustore->colend[n-1] * (dword + iword);

                    /* Working storage to support factorization */
                    lwork = superlu_cTempSpace(n, panel_size, P);
                    superlu_memusage->total_needed = superlu_memusage->for_lu + lwork;

                    superlu_memusage->expansions = --no_expand;

                    return 0;
                }

                float pcgstrf_memory_use(const int nzlmax, const int nzumax, const int nzlumax)
                {
                    register float iword, dword, t;

                    iword   = sizeof(int);
                    dword   = sizeof(complex);

                    t = 10. * ndim * iword + nzlmax * iword + nzumax * (iword + dword)
	                + nzlumax * dword;
                    return t;
                }


                /*
                 * Allocate storage for the data structures common to all factor routines.
                 * For those unpredictable size, make a guess as FILL * nnz(A).
                 * Return value:
                 *     If lwork = -1, return the estimated amount of space required;
                 *     otherwise, return the amount of space actually allocated when
                 *     memory allocation failure occurred.
                 */
                float pcgstrf_MemInit(int n, int annz, superlumt_options_t *superlumt_options, SuperMatrix *L, SuperMatrix *U, GlobalLU_t *Glu)
                {
                    register int nprocs = superlumt_options->nprocs;
                    yes_no_t refact = superlumt_options->refact;
                    register int panel_size = superlumt_options->panel_size;
                    register int lwork = superlumt_options->lwork;
                    void     *work = superlumt_options->work;
                    int      iword, dword, retries = 0;
                    SCPformat *Lstore;
                    NCPformat *Ustore;
                    int      *xsup, *xsup_end, *supno;
                    int      *lsub, *xlsub, *xlsub_end;
                    complex   *lusup;
                    int      *xlusup, *xlusup_end;
                    complex   *ucol;
                    int      *usub, *xusub, *xusub_end;
                    int      nzlmax, nzumax, nzlumax;
                    // Guess the fill-in growth for LUSUP 
                    int      FILL_LUSUP = sp_ienv(6);
                    // Guess the fill-in growth for UCOL
                    int      FILL_UCOL = sp_ienv(7); 
                    // Guess the fill-in growth for LSUB 
                    int      FILL_LSUB = sp_ienv(8); 
    
                    no_expand = 0;
                    ndim      = n;
                    iword     = sizeof(int);
                    dword     = sizeof(complex);

                    if ( !cexpanders )
                    {
                        cexpanders = (ExpHeader *) SUPERLU_MALLOC(NO_MEMTYPE * sizeof(ExpHeader));
                    }

                    if ( refact == NO ) 
                    {

	                    // Guess amount of storage needed by L\U factors. 
                        if ( FILL_UCOL < 0 ) 
                        {
                            nzumax = -FILL_UCOL * annz;
                        }
	                    else
                        {
                            nzumax = FILL_UCOL;
                        }
	                    if ( FILL_LSUB < 0 )
                        {    
                            nzlmax = -FILL_LSUB * annz;
                        }
	                    else 
                        {
                            nzlmax = FILL_LSUB;
                        }

	                    if ( Glu->dynamic_snode_bound == YES ) 
                        {
	                        if ( FILL_LUSUP < 0 ) 
                            {
                                nzlumax = -FILL_LUSUP * annz;
                            }
	                        else 
                            {
                                // estimate an upper bound 
                                nzlumax = FILL_LUSUP; 
                            }
	                    } 
                        else 
                        {
                            // preset as static upper bound
    	                    nzlumax = Glu->nzlumax; 
	                    }

	                    if ( lwork == -1 ) 
                        {
	                        return (GluIntArray(n) * iword +  superlu_cTempSpace(n, panel_size, nprocs) + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword);
                        } 
                        else 
                        {
	                        pcgstrf_SetupSpace(work, lwork);
	                    }
	
	                    // Integer pointers for L\U factors
	                    if ( whichspace == SYSTEM ) 
                        {
	                        xsup       = intMalloc(n+1);
	                        xsup_end   = intMalloc(n);
	                        supno      = intMalloc(n+1);
	                        xlsub      = intMalloc(n+1);
	                        xlsub_end  = intMalloc(n);
	                        xlusup     = intMalloc(n+1);
	                        xlusup_end = intMalloc(n);
	                        xusub      = intMalloc(n+1);
	                        xusub_end  = intMalloc(n);
	                    } 
                        else 
                        {
	                        xsup       = (int *)cuser_malloc((n+1) * iword, HEAD);
	                        xsup_end   = (int *)cuser_malloc((n) * iword, HEAD);
	                        supno      = (int *)cuser_malloc((n+1) * iword, HEAD);
	                        xlsub      = (int *)cuser_malloc((n+1) * iword, HEAD);
	                        xlsub_end  = (int *)cuser_malloc((n) * iword, HEAD);
	                        xlusup     = (int *)cuser_malloc((n+1) * iword, HEAD);
	                        xlusup_end = (int *)cuser_malloc((n) * iword, HEAD);
	                        xusub      = (int *)cuser_malloc((n+1) * iword, HEAD);
	                        xusub_end  = (int *)cuser_malloc((n) * iword, HEAD);
	                    }

	                    lusup = (complex *) pcgstrf_expand( &nzlumax, LUSUP, 0, 0, Glu );
	                    ucol  = (complex *) pcgstrf_expand( &nzumax, UCOL, 0, 0, Glu );
	                    lsub  = (int *)    pcgstrf_expand( &nzlmax, LSUB, 0, 0, Glu );
	                    usub  = (int *)    pcgstrf_expand( &nzumax, USUB, 0, 1, Glu );

	                    while ( !ucol || !lsub || !usub ) 
                        {
	                        /*SUPERLU_ABORT("Not enough core in LUMemInit()");*/
                #if (PRNTlevel==1)
	                        printf(".. pcgstrf_MemInit(): #retries %d\n", ++retries);
                #endif
	                        if ( whichspace == SYSTEM ) 
                            {
		                        SUPERLU_FREE(ucol);
		                        SUPERLU_FREE(lsub);
		                        SUPERLU_FREE(usub);
	                        } 
                            else 
                            {
		                        cuser_free(nzumax*dword+(nzlmax+nzumax)*iword, HEAD);
	                        }
	                        nzumax /= 2;    /* reduce request */
	                        nzlmax /= 2;
	                        if ( nzumax < annz/2 ) 
                            {
		                        printf("Not enough memory to perform factorization.\n");
		                        return (pcgstrf_memory_use(nzlmax, nzumax, nzlumax) + n);
	                        }
	                        ucol  = (complex *) pcgstrf_expand( &nzumax, UCOL, 0, 0, Glu );
	                        lsub  = (int *)    pcgstrf_expand( &nzlmax, LSUB, 0, 0, Glu );
	                        usub  = (int *)    pcgstrf_expand( &nzumax, USUB, 0, 1, Glu );
	                    }
	
	                    if ( !lusup )  
                        {
	                        float t = pcgstrf_memory_use(nzlmax, nzumax, nzlumax) + n;
	                        printf("Not enough memory to perform factorization .. " "need %.1f GBytes\n", t*1e-9);
	                        fflush(stdout);
	                        return (t);
	                    }
	
                    } 
                    else 
                    { 
                        /* refact == YES */
	                    Lstore   = (SCPformat*)L->Store;
	                    Ustore   = (NCPformat*)U->Store;
	                    xsup     = Lstore->sup_to_colbeg;
	                    xsup_end = Lstore->sup_to_colend;
	                    supno    = Lstore->col_to_sup;
	                    xlsub    = Lstore->rowind_colbeg;
	                    xlsub_end= Lstore->rowind_colend;
	                    xlusup   = Lstore->nzval_colbeg;
	                    xlusup_end= Lstore->nzval_colend;
	                    xusub    = Ustore->colbeg;
	                    xusub_end= Ustore->colend;
	                    nzlmax   = Glu->nzlmax;    /* max from previous factorization */
	                    nzumax   = Glu->nzumax;
	                    nzlumax  = Glu->nzlumax;
	
	                    if ( lwork == -1 ) 
                        {
	                        return (GluIntArray(n) * iword + superlu_cTempSpace(n, panel_size, nprocs) + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword);
                        } 
                        else if ( lwork == 0 ) 
                        {
	                        whichspace = SYSTEM;
	                    } 
                        else 
                        {
	                        whichspace = USER;
	                        stack.size = lwork;
	                        stack.top2 = lwork;
	                    }
	
                        cexpanders[LSUB].mem  = Lstore->rowind;
	                    lsub  = (int*)cexpanders[LSUB].mem;

                        cexpanders[LUSUP].mem = Lstore->nzval;
	                    lusup = (complex*)cexpanders[LUSUP].mem;

                        cexpanders[USUB].mem  = Ustore->rowind;
	                    usub  = (int*)cexpanders[USUB].mem;

                        cexpanders[UCOL].mem  = Ustore->nzval;
	                    ucol  = (complex*)cexpanders[UCOL].mem;

	                    cexpanders[LSUB].size         = nzlmax;
	                    cexpanders[LUSUP].size        = nzlumax;
	                    cexpanders[USUB].size         = nzumax;
	                    cexpanders[UCOL].size         = nzumax;	
                    }

                    Glu->xsup       = xsup;
                    Glu->xsup_end   = xsup_end;
                    Glu->supno      = supno;
                    Glu->lsub       = lsub;
                    Glu->xlsub      = xlsub;
                    Glu->xlsub_end  = xlsub_end;
                    Glu->lusup      = lusup;
                    Glu->xlusup     = xlusup;
                    Glu->xlusup_end = xlusup_end;
                    Glu->ucol       = ucol;
                    Glu->usub       = usub;
                    Glu->xusub      = xusub;
                    Glu->xusub_end  = xusub_end;
                    Glu->nzlmax     = nzlmax;
                    Glu->nzumax     = nzumax;
                    Glu->nzlumax    = nzlumax;
                    ++no_expand;

                #if ( PRNTlevel>=1 )
                    printf(".. pcgstrf_MemInit() refact %d, space? %d, nzlumax %d, nzumax %d, nzlmax %d\n",
	                refact, whichspace, nzlumax, nzumax, nzlmax);
                    printf(".. pcgstrf_MemInit() FILL_LUSUP %d, FILL_UCOL %d, FILL_LSUB %d\n",
	                FILL_LUSUP, FILL_UCOL, FILL_LSUB);
                    fflush(stdout);
                #endif

                    return 0;
    
                } /* pcgstrf_MemInit */

                /* 
                 * Allocate known working storage. Returns 0 if success, otherwise
                 * returns the number of bytes allocated so far when failure occurred.
                 */
                int
                pcgstrf_WorkInit(int n, int panel_size, int **iworkptr, complex **dworkptr)
                {
                    size_t  isize, dsize, extra;
                    complex *old_ptr;
                    int maxsuper = sp_ienv(3);
                    int rowblk   = sp_ienv(4);

                    isize = (2*panel_size + 5 + NO_MARKER) * n * sizeof(int);
                    dsize = (n * panel_size + NUM_TEMPV(n,panel_size,maxsuper,rowblk)) * sizeof(complex);
    
                    if ( whichspace == SYSTEM ) 
                    {
	                    *iworkptr = (int *) intCalloc(isize/sizeof(int));
                    }
                    else
                    {
	                    *iworkptr = (int *) cuser_malloc(isize, TAIL);
                    }
                    if ( ! *iworkptr ) 
                    {
	                    fprintf(stderr, "pcgstrf_WorkInit: malloc fails for local iworkptr[]\n");
	                    return (isize + n);
                    }

                    if ( whichspace == SYSTEM )
                    {
	                    *dworkptr = (complex *) SUPERLU_MALLOC(dsize);
                    }
                    else 
                    {
	                    *dworkptr = (complex *) cuser_malloc(dsize, TAIL);
	                    if ( NotDoubleAlign(*dworkptr) ) 
                        {
	                        old_ptr = *dworkptr;
	                        *dworkptr = (complex*) DoubleAlign(*dworkptr);
	                        *dworkptr = (complex*) ((double*)*dworkptr - 1);
	                        extra = (char*)old_ptr - (char*)*dworkptr;
                #ifdef CHK_EXPAND	    
	                    printf("pcgstrf_WorkInit: not aligned, extra %d\n", extra);
                #endif	    
	                        stack.top2 -= extra;
	                        stack.used += extra;
	                    }
                    }
                    if ( ! *dworkptr ) 
                    {
	                    fprintf(stderr, "malloc fails for local dworkptr[].");
	                    return (isize + dsize + n);
                    }
	
                    return 0;
                }


                /*
                 * Set up pointers for real working arrays.
                 */
                void
                pcgstrf_SetRWork(int n, int panel_size, complex *dworkptr,complex **dense, complex **tempv)
                {
                    complex zero = {0.0, 0.0};

                    int maxsuper = sp_ienv(3);
                    int rowblk   = sp_ienv(4);
                    *dense = dworkptr;
                    *tempv = *dense + panel_size*n;
                    cfill (*dense, n * panel_size, zero);
                    cfill (*tempv, NUM_TEMPV(n,panel_size,maxsuper,rowblk), zero);     
                }
	
                /*
                 * Free the working storage used by factor routines.
                 */
                void pcgstrf_WorkFree(int *iwork, complex *dwork, GlobalLU_t *Glu)
                {
                    if ( whichspace == SYSTEM ) 
                    {
	                    SUPERLU_FREE (iwork);
	                    SUPERLU_FREE (dwork);
                    } 
                    else 
                    {
	                    stack.used -= (stack.size - stack.top2);
	                    stack.top2 = stack.size;
                    }
                }

                /* 
                 * Expand the data structures for L and U during the factorization.
                 * Return value:   0 - successful return
                 *               > 0 - number of bytes allocated when run out of space
                 */
                int pcgstrf_MemXpand(
		                 int jcol,
		                 int next, /* number of elements currently in the factors */
		                 MemType mem_type,/* which type of memory to expand  */
		                 int *maxlen, /* modified - max. length of a data structure */
		                 GlobalLU_t *Glu /* modified - global LU data structures */
		                 )
                {
                    void   *new_mem;
    
                #ifdef CHK_EXPAND    
                    printf("pcgstrf_MemXpand(): jcol %d, next %d, maxlen %d, MemType %d\n", jcol, next, *maxlen, mem_type);
                #endif    

                    if (mem_type == USUB) 
                    {
    	                new_mem = pcgstrf_expand(maxlen, mem_type, next, 1, Glu);
                    }
                    else
                    {
	                    new_mem = pcgstrf_expand(maxlen, mem_type, next, 0, Glu);
                    }
    
                    if ( !new_mem ) 
                    {
	                    int nzlmax  = Glu->nzlmax;
	                    int nzumax  = Glu->nzumax;
	                    int nzlumax = Glu->nzlumax;
    	                fprintf(stderr, "Can't expand MemType %d: jcol %d\n", mem_type, jcol);
    	                return (pcgstrf_memory_use(nzlmax, nzumax, nzlumax) + ndim);
                    }

                    switch ( mem_type ) 
                    {
                        case LUSUP:
	                        Glu->lusup   = (complex *) new_mem;
	                        Glu->nzlumax = *maxlen;
	                        break;
                        case UCOL:
	                        Glu->ucol   = (complex *) new_mem;
	                        Glu->nzumax = *maxlen;
	                        break;
                        case LSUB:
	                        Glu->lsub   = (int *) new_mem;
	                        Glu->nzlmax = *maxlen;
	                        break;
                        case USUB:
	                        Glu->usub   = (int *) new_mem;
	                        Glu->nzumax = *maxlen;
	                        break;
                    }
    
                    return 0;
    
                }


                void copy_mem_complex(int howmany, void *old, void *new_)
                {
                    register int i;
                    complex *dold = (complex *)old;
                    complex *dnew = (complex *)new_;
                    for (i = 0; i < howmany; i++) 
                    {
                        dnew[i] = dold[i];
                    }
                }


                /*
                 * Expand the existing storage to accommodate more fill-ins.
                 */
                void* pcgstrf_expand(
                                int *prev_len,   /* length used from previous call */
                                MemType type,    /* which part of the memory to expand */
                                int len_to_copy, /* size of memory to be copied to new store */
                                int keep_prev,   /* = 1: use prev_len;
                                                    = 0: compute new_len to expand */
                                GlobalLU_t *Glu  /* modified - global LU data structures */
                                )
                {
                    double   alpha = EXPAND;
                    void     *new_mem, *old_mem;
                    int      new_len, tries, lword, extra, bytes_to_copy;

                    /* First time allocate requested */
                    if ( no_expand == 0 || keep_prev ) 
                    {
                        new_len = *prev_len;
                    }
                    else 
                    {
                        new_len = alpha * *prev_len;
                    }

                    if ( type == LSUB || type == USUB ) 
                    {
                        lword = sizeof(int);
                    }
                    else 
                    {
                        lword = sizeof(complex);
                    }

                    if ( whichspace == SYSTEM ) 
                    {
                        new_mem = (void *) SUPERLU_MALLOC( (size_t) new_len * lword );

                        if ( no_expand != 0 ) {
                            tries = 0;
                            if ( keep_prev ) {
                                if ( !new_mem ) return (NULL);
                            } else {
                                while ( !new_mem ) {
                                    if ( ++tries > 10 ) return (NULL);
                                    alpha = Reduce(alpha);
                                    new_len = alpha * *prev_len;
                                    new_mem = (void *) SUPERLU_MALLOC((size_t) new_len * lword);
                                }
                            }
                            if ( type == LSUB || type == USUB ) {
                                copy_mem_int(len_to_copy, cexpanders[type].mem, new_mem);
                            } else {
                                copy_mem_complex(len_to_copy, cexpanders[type].mem, new_mem);
                            }
                            SUPERLU_FREE (cexpanders[type].mem);
                        }
                        cexpanders[type].mem = (void *) new_mem;
                #if ( MACH==SGI || MACH==ORIGIN )
                /*      bzero(new_mem, new_len*lword);*/
                #endif

                    } else { /* whichspace == USER */
                        if ( no_expand == 0 ) {
                            new_mem = cuser_malloc(new_len * lword, HEAD);
                            if ( NotDoubleAlign(new_mem) &&
                                (type == LUSUP || type == UCOL) ) {
                                old_mem = new_mem;
                                new_mem = (void *)DoubleAlign(new_mem);
                                extra = (char*)new_mem - (char*)old_mem;
                #ifdef CHK_EXPAND
                                printf("expand(): not aligned, extra %d\n", extra);
                #endif
                                stack.top1 += extra;
                                stack.used += extra;
                            }
                            cexpanders[type].mem = (void *) new_mem;
                        }
                        else 
                        {
                            tries = 0;
                            extra = (new_len - *prev_len) * lword;
                            if ( keep_prev ) 
                            {
                                if ( StackFull(extra) ) 
                                {
                                        return (NULL);
                                }
                            } else 
                            {
                                while ( StackFull(extra) ) 
                                {
                                    if ( ++tries > 10 ) 
                                    {
                                        return (NULL);
                                    }
                                    alpha = Reduce(alpha);
                                    new_len = alpha * *prev_len;
                                    extra = (new_len - *prev_len) * lword;
                                }
                            }

                            if ( type != USUB ) 
                            {
                                new_mem = (void*)((char*)cexpanders[type + 1].mem + extra);
                                bytes_to_copy = (char*)stack.array + stack.top1 - (char*)cexpanders[type + 1].mem;

                                user_bcopy(cexpanders[type+1].mem, new_mem, bytes_to_copy);

                                if ( type < USUB ) 
                                {
                                    cexpanders[USUB].mem = (void*)((char*)cexpanders[USUB].mem + extra);
                                    Glu->usub = (int*)cexpanders[USUB].mem;
                                }
                                if ( type < LSUB ) 
                                {
                                    cexpanders[LSUB].mem = (void*)((char*)cexpanders[LSUB].mem + extra);
                                    Glu->lsub = (int*)cexpanders[LSUB].mem;
                                }
                                if ( type < UCOL ) 
                                {
                                    cexpanders[UCOL].mem = (void*)((char*)cexpanders[UCOL].mem + extra);
                                    Glu->ucol = (complex*)cexpanders[UCOL].mem;
                                }
                                stack.top1 += extra;
                                stack.used += extra;
                                if ( type == UCOL ) 
                                {
                                    /* Add same amount for USUB */
                                    stack.top1 += extra;
                                    stack.used += extra;
                                }

                            } /* if ... */

                        } /* else ... */
                    }
                #ifdef DEBUG
                    printf("pcgstrf_expand[type %d]\n", type);
                #endif
                    cexpanders[type].size = new_len;
                    *prev_len = new_len;
                    if ( no_expand )
                    {
                        ++no_expand;
                    }

                    return (void *) cexpanders[type].mem;

                } /* expand */


                /*
                 * Compress the work[] array to remove fragmentation.
                 */
                void
                pcgstrf_StackCompress(GlobalLU_t *Glu)
                {
                    register int iword, dword;
                    char     *last, *fragment;
                    int      *ifrom, *ito;
                    complex   *dfrom, *dto;
                    int      *xlsub, *lsub, *xusub_end, *usub, *xlusup;
                    complex   *ucol, *lusup;
    
                    iword = sizeof(int);
                    dword = sizeof(complex);

                    xlsub  = Glu->xlsub;
                    lsub   = Glu->lsub;
                    xusub_end  = Glu->xusub_end;
                    usub   = Glu->usub;
                    xlusup = Glu->xlusup;
                    ucol   = Glu->ucol;
                    lusup  = Glu->lusup;
    
                    dfrom = ucol;
                    dto = (complex *)((char*)lusup + xlusup[ndim] * dword);
                    copy_mem_complex(xusub_end[ndim-1], dfrom, dto);
                    ucol = dto;

                    ifrom = lsub;
                    ito = (int *) ((char*)ucol + xusub_end[ndim-1] * iword);
                    copy_mem_int(xlsub[ndim], ifrom, ito);
                    lsub = ito;
    
                    ifrom = usub;
                    ito = (int *) ((char*)lsub + xlsub[ndim] * iword);
                    copy_mem_int(xusub_end[ndim-1], ifrom, ito);
                    usub = ito;
    
                    last = (char*)usub + xusub_end[ndim-1] * iword;
                    fragment = (char*) ((char*)stack.array + stack.top1 - last);
                    stack.used -= (long int) fragment;
                    stack.top1 -= (long int) fragment;

                    Glu->ucol = ucol;
                    Glu->lsub = lsub;
                    Glu->usub = usub;
    
                #ifdef CHK_EXPAND
                    printf("pcgstrf_StackCompress: fragment %d\n", fragment);
                    /* PrintStack("After compress", Glu);
                    for (last = 0; last < ndim; ++last)
	                print_lu_col("After compress:", last, 0);*/
                #endif    
    
                }

                /*
                 * Allocate storage for original matrix A
                 */
                void
                callocateA(int n, int nnz, complex **a, int **asub, int **xa)
                {
                    *a    = (complex *) complexMalloc(nnz);
                    *asub = (int *) intMalloc(nnz);
                    *xa   = (int *) intMalloc(n+1);
                }

                complex *complexMalloc(int n)
                {
                    complex *buf;
                    buf = (complex *) SUPERLU_MALLOC( (size_t) n * sizeof(complex) ); 
                    if ( !buf ) {
	                fprintf(stderr, "SUPERLU_MALLOC failed for buf in complexMalloc()");
	                exit (1);
                    }
                    return (buf);
                }

                complex *complexCalloc(int n)
                {
                    complex *buf;
                    register int i;
                    complex zero = {0.0, 0.0};
                    buf = (complex *) SUPERLU_MALLOC( (size_t) n * sizeof(complex) );
                    if ( !buf ) {
	                fprintf(stderr, "SUPERLU_MALLOC failed for buf in complexCalloc()");
	                exit (1);
                    }
                    for (i = 0; i < n; ++i) buf[i] = zero;
                    return (buf);
                }

                /*
                 * Set up memory image in lusup[*], using the supernode boundaries in 
                 * the Householder matrix.
                 * 
                 * In both static and dynamic scheme, the relaxed supernodes (leaves) 
                 * are stored in the beginning of lusup[*]. In the static scheme, the
                 * memory is also set aside for the internal supernodes using upper
                 * bound information from H. In the dynamic scheme, however, the memory
                 * for the internal supernodes is not allocated by this routine.
                 *
                 * Return value
                 *   o Static scheme: number of nonzeros of all the supernodes in H.
                 *   o Dynamic scheme: number of nonzeros of the relaxed supernodes. 
                 */
                int
                cPresetMap(
	                  const int n,
	                  SuperMatrix *A, /* original matrix permuted by columns */
	                  pxgstrf_relax_t *pxgstrf_relax, /* relaxed supernodes */
	                  superlumt_options_t *superlumt_options, /* input */
	                  GlobalLU_t *Glu /* modified */
	                  )
                {
                    register int i, j, k, w, rs, rs_lastcol, krow, kmark, maxsup, nextpos;
                    register int rs_nrow; /* number of nonzero rows in a relaxed supernode */
                    int          *marker, *asub, *xa_begin, *xa_end;
                    NCPformat    *Astore;
                    int *map_in_sup; /* memory mapping function; values irrelevant on entry. */
                    int *colcnt;     /* column count of Lc or H */
                    int *super_bnd;  /* supernodes partition in H */
                    char *snode_env;

                    snode_env = getenv("SuperLU_DYNAMIC_SNODE_STORE");
                    if ( snode_env != NULL ) {
	                Glu->dynamic_snode_bound = YES;
                #if ( PRNTlevel>=1 )
	                printf(".. Use dynamic alg. to allocate storage for L supernodes.\n");
                #endif
                    } else  Glu->dynamic_snode_bound = NO;

                    Astore   = (NCPformat*)A->Store;
                    asub     = Astore->rowind;
                    xa_begin = Astore->colbeg;
                    xa_end   = Astore->colend;
                    rs       = 1;
                    marker   = intMalloc(n);
                    ifill(marker, n, EMPTY);
                    map_in_sup = Glu->map_in_sup = intCalloc(n+1);
                    colcnt = superlumt_options->colcnt_h;
                    super_bnd = superlumt_options->part_super_h;
                    nextpos = 0;

                    /* Split large supernode into smaller pieces */
                    maxsup = sp_ienv(3);
                    for (j = 0; j < n; ) {
	                w = super_bnd[j];
	                k = j + w;
	                if ( w > maxsup ) {
	                    w = w % maxsup;
	                    if ( w == 0 ) w = maxsup;
	                    while ( j < k ) {
		                super_bnd[j] = w;
		                j += w;
		                w = maxsup;
	                    }
	                }
	                j = k;
                    }
    
                    for (j = 0; j < n; j += w) {
                        if ( Glu->dynamic_snode_bound == NO ) map_in_sup[j] = nextpos;

	                if ( pxgstrf_relax[rs].fcol == j ) {
	                    /* Column j starts a relaxed supernode. */
	                    map_in_sup[j] = nextpos;
	                    rs_nrow = 0;
	                    w = pxgstrf_relax[rs++].size;
	                    rs_lastcol = j + w;
	                    for (i = j; i < rs_lastcol; ++i) {
		                /* for each nonzero in A[*,i] */
		                for (k = xa_begin[i]; k < xa_end[i]; k++) {	
		                    krow = asub[k];
		                    kmark = marker[krow];
		                    if ( kmark != j ) { /* first time visit krow */
			                marker[krow] = j;
			                ++rs_nrow;
		                    }
		                }
	                    }
	                    nextpos += w * rs_nrow;
	    
	                    /* Find the next H-supernode, with leading column i, which is
	                       outside the relaxed supernode, rs. */
	                    for (i = j; i < rs_lastcol; k = i, i += super_bnd[i]);
	                    if ( i > rs_lastcol ) {
		                /* The w columns [rs_lastcol, i) may join in the
		                   preceeding relaxed supernode; make sure we leave
		                   enough room for the combined supernode. */
		                w = i - rs_lastcol;
		                nextpos += w * SUPERLU_MAX( rs_nrow, colcnt[k] );
	                    }
	                    w = i - j;
	                } else { /* Column j starts a supernode in H */
	                    w = super_bnd[j];
	                    if ( Glu->dynamic_snode_bound == NO ) nextpos += w * colcnt[j];
	                }

	                /* Set up the offset (negative) to the leading column j of a
	                   supernode in H */ 
	                for (i = 1; i < w; ++i) map_in_sup[j + i] = -i;
	
                    } /* for j ... */

                    if ( Glu->dynamic_snode_bound == YES ) Glu->nextlu = nextpos;
                    else map_in_sup[n] = nextpos;

                #if ( PRNTlevel>=1 )
                    printf("** PresetMap() allocates %d reals to lusup[*]....\n", nextpos);
                #endif

                    free (marker);
                    return nextpos;
                }

            };
        };
    };
};

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Architect Sea Surveillance AS
Norway Norway
Chief Architect - Sea Surveillance AS.

Specializing in integrated operations and high performance computing solutions.

I’ve been fooling around with computers since the early eighties, I’ve even done work on CP/M and MP/M.

Wrote my first “real” program on a BBC micro model B based on a series in a magazine at that time. It was fun and I got hooked on this thing called programming ...

A few Highlights:

  • High performance application server development
  • Model Driven Architecture and Code generators
  • Real-Time Distributed Solutions
  • C, C++, C#, Java, TSQL, PL/SQL, Delphi, ActionScript, Perl, Rexx
  • Microsoft SQL Server, Oracle RDBMS, IBM DB2, PostGreSQL
  • AMQP, Apache qpid, RabbitMQ, Microsoft Message Queuing, IBM WebSphereMQ, Oracle TuxidoMQ
  • Oracle WebLogic, IBM WebSphere
  • Corba, COM, DCE, WCF
  • AspenTech InfoPlus.21(IP21), OsiSoft PI


More information about what I do for a living can be found at: harlinn.com or LinkedIn

You can contact me at espen@harlinn.no

Comments and Discussions