Click here to Skip to main content
15,885,278 members
Articles / High Performance Computing / Vectorization

A C++ String Class

Rate me:
Please Sign up or sign in to vote.
4.96/5 (29 votes)
3 Jan 2015CPOL13 min read 120.7K   2.6K   93  
A fast, reference counted, copy-on-write string class
#include "stdafx.h"
/*
 * -- SuperLU MT routine (version 1.0) --
 * Univ. of California Berkeley, Xerox Palo Alto Research Center,
 * and Lawrence Berkeley National Lab.
 * August 15, 1997
 *
 */
#include <stdio.h>
#include <math.h>
#include "hnum_slucmn.h"

namespace harlinn
{
    namespace numerics
    {
        namespace SuperLU
        {

            #define SPLIT_TOP

            int
            ParallelInit(int n, pxgstrf_relax_t *pxgstrf_relax, 
	                 superlumt_options_t *superlumt_options, 
	                 pxgstrf_shared_Base_t *pxgstrf_shared)
            {
                int      *etree = superlumt_options->etree;
                register int w, dad, ukids, i, j, k, rs, panel_size, relax;
                register int P, w_top, do_split = 0;
                panel_t panel_type;
                int      *panel_histo = pxgstrf_shared->Gstat->panel_histo;
                register int nthr, concurrency, info;
                Gstat_t  *Gstat = pxgstrf_shared->Gstat;

            #if ( MACH==SUN )
                register int sync_type = USYNC_THREAD;
    
                /* Set concurrency level. */
                nthr = sysconf(_SC_NPROCESSORS_ONLN);
                thr_setconcurrency(nthr);            /* number of LWPs */
                concurrency = thr_getconcurrency();

            #if ( PRNTlevel==1 )    
                printf(".. CPUs %d, concurrency (#LWP) %d, P %d\n",
	               nthr, concurrency, P);
            #endif

                /* Initialize mutex variables. */
                pxgstrf_shared->lu_locks = (mutex_t *) 
                    SUPERLU_MALLOC(NO_GLU_LOCKS * sizeof(mutex_t));
                for (i = 0; i < NO_GLU_LOCKS; ++i)
	            mutex_init(&pxgstrf_shared->lu_locks[i], sync_type, 0);

            #elif ( MACH==DEC || MACH==PTHREAD )
                pxgstrf_shared->lu_locks = (pthread_mutex_t *) 
                    SUPERLU_MALLOC(NO_GLU_LOCKS * sizeof(pthread_mutex_t));
                for (i = 0; i < NO_GLU_LOCKS; ++i)
	            pthread_mutex_init(&pxgstrf_shared->lu_locks[i], NULL);
            #else

                pxgstrf_shared->lu_locks = (mutex_t *) SUPERLU_MALLOC(NO_GLU_LOCKS * sizeof(mutex_t));

            #endif    
    
            #if ( PRNTlevel==1 )
                printf(".. ParallelInit() ... nprocs %2d\n", superlumt_options->nprocs);
            #endif

                pxgstrf_shared->spin_locks = intCalloc(n);
                pxgstrf_shared->pan_status = 
                    (pan_status_t *) SUPERLU_MALLOC((n+1)*sizeof(pan_status_t));
                pxgstrf_shared->fb_cols    = intMalloc(n+1);

                panel_size = superlumt_options->panel_size;
                relax = superlumt_options->relax;
                w = SUPERLU_MAX(panel_size, relax) + 1;
                for (i = 0; i < w; ++i) panel_histo[i] = 0;
                pxgstrf_shared->num_splits = 0;
    
                if ( (info = queue_init(&pxgstrf_shared->taskq, n)) ) {
	            fprintf(stderr, "ParallelInit(): %d\n", info);
	            SUPERLU_ABORT("queue_init fails.");
                }

                /* Count children of each node in the etree. */
                for (i = 0; i <= n; ++i) pxgstrf_shared->pan_status[i].ukids = 0;
                for (i = 0; i < n; ++i) {
	            dad = etree[i];
	            ++pxgstrf_shared->pan_status[dad].ukids;
                }

    
                /* Find the panel partitions and initialize each panel's status */

            #ifdef PROFILE
                Gstat->num_panels = 0;
            #endif

                pxgstrf_shared->tasks_remain = 0;
                rs = 1;   /* index for the next relaxed s-node */
                w_top = panel_size/2;
                if ( w_top == 0 ) w_top = 1;
                P = 12;

                for (i = 0; i < n; ) {
	            if ( pxgstrf_relax[rs].fcol == i ) {
	                w = pxgstrf_relax[rs++].size;
	                panel_type = RELAXED_SNODE;
	                pxgstrf_shared->pan_status[i].state = CANGO;
	            } else {
	                /* Adjust panel_size so that a panel won't overlap with
	                   the next relaxed snode.     */
            #if 0
	                /* Only works when etree is postordered. */
	                w = SUPERLU_MIN(panel_size, pxgstrf_relax[rs].fcol - i);
            #else
	                w = panel_size;
	                for (k = i + 1; k < SUPERLU_MIN(i + panel_size, n); ++k)
		            if ( k == pxgstrf_relax[rs].fcol ) {
		                w = k - i;  /* panel stops at column k-1 */
		                break;
		            }
	                if ( k == n ) w = n - i;
            #endif

            #ifdef SPLIT_TOP
	                if ( !do_split ) {
	  	            if ( (n-i) < panel_size * P ) do_split = 1;
	                }
	                if ( do_split && w > w_top ) { /* split large panel */
	    	            w = w_top;
	    	            ++pxgstrf_shared->num_splits;
	                }
            #endif
	                for (j = i+1; j < i + w; ++j) 
		            /* Do not allow panel to cross a branch point in the etree. */
		            if ( pxgstrf_shared->pan_status[j].ukids > 1 ) break;
	                w = j - i;    /* j should start a new panel */
	                panel_type = REGULAR_PANEL;
	                pxgstrf_shared->pan_status[i].state = UNREADY;
            #ifdef DOMAINS
	                if ( in_domain[i] == TREE_DOMAIN ) panel_type = TREE_DOMAIN;
            #endif
	            }

	            if ( panel_type == REGULAR_PANEL ) {
	                ++pxgstrf_shared->tasks_remain;
	                /*printf("nondomain panel %6d -- %6d\n", i, i+w-1);
	                fflush(stdout);*/
	            }

	            ukids = k = 0;
	            for (j = i; j < i + w; ++j) {
	                pxgstrf_shared->pan_status[j].size = k--;
	                pxgstrf_shared->pan_status[j].type = panel_type;
	                ukids += pxgstrf_shared->pan_status[j].ukids;
	            }
	            pxgstrf_shared->pan_status[i].size = w; /* leading column */
	            /* only count those kids outside the panel */
	            pxgstrf_shared->pan_status[i].ukids = ukids - (w-1);
	            panel_histo[w]++;
	
            #ifdef PROFILE
	            Gstat->panstat[i].size = w;
	            ++Gstat->num_panels;
            #endif
	
	            pxgstrf_shared->fb_cols[i] = i;
	            i += w;    /* move to the next panel */

                } /* for i ... */
    
                /* Dummy root */
                pxgstrf_shared->pan_status[n].size = 1;
                pxgstrf_shared->pan_status[n].state = UNREADY;

            #if ( PRNTlevel==1 )
                printf(".. Split: P %d, #nondomain panels %d\n", P, pxgstrf_shared->tasks_remain);
            #endif
            #ifdef DOMAINS
                EnqueueDomains(&pxgstrf_shared->taskq, list_head, pxgstrf_shared);
            #else
                EnqueueRelaxSnode(&pxgstrf_shared->taskq, n, pxgstrf_relax, pxgstrf_shared);
            #endif
            #if ( PRNTlevel==1 )
                printf(".. # tasks %d\n", pxgstrf_shared->tasks_remain);
                fflush(stdout);
            #endif

            #ifdef PREDICT_OPT
                /* Set up structure describing children */
                for (i = 0; i <= n; cp_firstkid[i++] = EMPTY);
                for (i = n-1; i >= 0; i--) {
	            dad = etree[i];
	            cp_nextkid[i] = cp_firstkid[dad];
	            cp_firstkid[dad] = i;
                }
            #endif

                return 0;
            } /* ParallelInit */


            /*
             * Free the storage used by the parallel scheduling algorithm.
             */
            int ParallelFinalize(pxgstrf_shared_Base_t *pxgstrf_shared)
            {
                /* Destroy mutexes */
            #if ( MACH==SUN )
                register int i;
                for (i = 0; i < NO_GLU_LOCKS; ++i)
                    mutex_destroy( &pxgstrf_shared->lu_locks[i] );
            #elif ( MACH==DEC || MACH==PTHREAD )
                register int i;
                for (i = 0; i < NO_GLU_LOCKS; ++i) 
                    pthread_mutex_destroy( &pxgstrf_shared->lu_locks[i] );
            #endif    
    
                SUPERLU_FREE ((void*)pxgstrf_shared->lu_locks);
                SUPERLU_FREE ((int*)pxgstrf_shared->spin_locks);
                SUPERLU_FREE (pxgstrf_shared->pan_status);
                SUPERLU_FREE (pxgstrf_shared->fb_cols);
                //SUPERLU_FREE (pxgstrf_shared->Glu->map_in_sup); TODO: Ensure that map_in_sup gets deleted 
                queue_destroy(&pxgstrf_shared->taskq);

            #if ( PRNTlevel==1 )
                printf(".. # panel splittings %d\n", pxgstrf_shared->num_splits);
            #endif

                return 0;
            }

            int queue_init(queue_t *q, int n)
            {
                if ( n < 1 ) return (-1);

                q->queue = (qitem_t *) SUPERLU_MALLOC(n*sizeof(qitem_t));
                q->count = 0;
                q->head = 0;
                q->tail = 0;

                return 0;
            }

            int queue_destroy(queue_t *q)
            {
                SUPERLU_FREE( q->queue );
                return 0;
            }

            /*
             * Return value: number of items in the queue
             */
            int Enqueue(queue_t *q, qitem_t item)
            {
                q->queue[q->tail++] = item;
                ++q->count;
                return (q->count);
            }

            /*
             * Return value: >= 0 number of items in the queue
             *               = -1 queue is empty
             */
            int Dequeue(queue_t *q, qitem_t *item)
            {
                if ( q->count <= 0 ) return EMPTY;
    
                *item = q->queue[q->head++];
                --q->count;
                return (q->count);
            }

            int QueryQueue(queue_t *q)
            {
                register int     i;
                printf("Queue count: %d\n", q->count);
                for (i = q->head; i < q->tail; ++i)
	            printf("%8d\titem %8d\n", i, q->queue[i]);

                return 0;
            }

            int EnqueueRelaxSnode(queue_t *q, int n, pxgstrf_relax_t *pxgstrf_relax,
		                  pxgstrf_shared_Base_t *pxgstrf_shared)
            {
                register int rs, j, m;

                m = pxgstrf_relax[0].size;
                for (rs = 1; rs <= m; ++rs) {
	            j = pxgstrf_relax[rs].fcol;
	            q->queue[q->tail++] = j;
	            q->count++;
	            ++pxgstrf_shared->tasks_remain;
                }
            #if ( PRNTlevel==1 )    
                printf(".. EnqueueRelaxSnode(): count %d\n", q->count);
            #endif
                return 0;
            }

            /*
             * Enqueue the initial independent domains.
             * A pair of two numbers {root, fst_desc} is added in the queue.
             */
            /*int EnqueueDomains(int P, queue_t *q, struct Branch **proc_domains_h)*/
            int EnqueueDomains(queue_t *q, struct Branch *list_head,
		               pxgstrf_shared_Base_t *pxgstrf_shared)
            {
                struct Branch *b, *thrash;

            /*    for (pnum = 0; pnum < P; ++pnum) {
	            for (b = proc_domains_h[pnum]; b != NULL; ) {*/
                b = list_head;
                while ( b ) {
	            thrash = b;
	            q->queue[q->tail++] = b->root;
	            q->queue[q->tail++] = b->first_desc;
	            q->count = q->count + 2;
	            STATE ( b->root ) = CANGO;
	            ++pxgstrf_shared->tasks_remain;
	            b = b->next;
	            SUPERLU_FREE (thrash);
                }
                printf("EnqueueDomains(): count %d\n", q->count);
                return 0;
            }

            int NewNsuper(const int pnum, pxgstrf_shared_Base_t *pxgstrf_shared, int *data)
            {
                register int i;
                mutex_t *lock = &pxgstrf_shared->lu_locks[NSUPER_LOCK];
                Gstat_t *Gstat = pxgstrf_shared->Gstat;

            #ifdef PROFILE
                double t = SuperLU_timer_();
            #endif    
	
            #if ( MACH==SUN )
                mutex_lock(lock);
            #elif ( MACH==DEC || MACH==PTHREAD )
                pthread_mutex_lock(lock);
            #elif ( MACH==SGI || MACH==ORIGIN )
            #pragma critical lock(lock)
            #elif  ( MACH==CRAY_PVP )
            #pragma _CRI guard (*lock)
            #endif    
                {
                  i = ++(*data);
                }
            #if ( MACH==SUN )
                mutex_unlock(lock);
            #elif ( MACH==DEC || MACH==PTHREAD )
                pthread_mutex_unlock(lock);
            #elif ( MACH==CRAY_PVP )
            #pragma _CRI endguard (*lock)
            #endif    

            #ifdef PROFILE
                Gstat->procstat[pnum].cs_time += SuperLU_timer_() - t;
            #endif
	
                return i;
            }



        };
    };
};

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Architect Sea Surveillance AS
Norway Norway
Chief Architect - Sea Surveillance AS.

Specializing in integrated operations and high performance computing solutions.

I’ve been fooling around with computers since the early eighties, I’ve even done work on CP/M and MP/M.

Wrote my first “real” program on a BBC micro model B based on a series in a magazine at that time. It was fun and I got hooked on this thing called programming ...

A few Highlights:

  • High performance application server development
  • Model Driven Architecture and Code generators
  • Real-Time Distributed Solutions
  • C, C++, C#, Java, TSQL, PL/SQL, Delphi, ActionScript, Perl, Rexx
  • Microsoft SQL Server, Oracle RDBMS, IBM DB2, PostGreSQL
  • AMQP, Apache qpid, RabbitMQ, Microsoft Message Queuing, IBM WebSphereMQ, Oracle TuxidoMQ
  • Oracle WebLogic, IBM WebSphere
  • Corba, COM, DCE, WCF
  • AspenTech InfoPlus.21(IP21), OsiSoft PI


More information about what I do for a living can be found at: harlinn.com or LinkedIn

You can contact me at espen@harlinn.no

Comments and Discussions