// This is version 2 of the multithreaded GC Bench. // Heap expansion is handled differently from version 1, in an attempt // to make scalability measurements more meaningful. The version with // N threads now immediately expands the heap to N*32MB. // // To run this with BDWGC versions 6 and later with thread local allocation, // define GC and LOCAL. Without thread-local allocation, define just GC. // To run it with the University of Tokyo scalable GC, // define SGC. To run it with malloc and explicit deallocation, define // none of these. (This should also work for Hoard.) // // Note that defining GC or SGC removes the explicit deallocation passes, // which seems fair. // // This is adapted from a benchmark written by John Ellis and Pete Kovac // of Post Communications. // It was modified by Hans Boehm of Silicon Graphics. // Translated to C++ 30 May 1997 by William D Clinger of Northeastern Univ. // Translated to C 15 March 2000 by Hans Boehm, now at HP Labs. // Adapted to run NTHREADS client threads concurrently. Each // thread executes the original benchmark. 12 June 2000 by Hans Boehm. // Changed heap expansion rule, and made the number of threads run-time // configurable. 25 Oct 2000 by Hans Boehm. // // This is no substitute for real applications. No actual application // is likely to behave in exactly this way. However, this benchmark was // designed to be more representative of real applications than other // Java GC benchmarks of which we were aware at the time. // It still doesn't seem too bad for something this small. // It attempts to model those properties of allocation requests that // are important to current GC techniques. // It is designed to be used either to obtain a single overall performance // number, or to give a more detailed estimate of how collector // performance varies with object lifetimes. It prints the time // required to allocate and collect balanced binary trees of various // sizes. Smaller trees result in shorter object lifetimes. Each cycle // allocates roughly the same amount of memory. // Two data structures are kept around during the entire process, so // that the measured performance is representative of applications // that maintain some live in-memory data. One of these is a tree // containing many pointers. The other is a large array containing // double precision floating point numbers. Both should be of comparable // size. // // The results are only really meaningful together with a specification // of how much memory was used. This versions of the benchmark tries // to preallocate a sufficiently large heap that expansion should not be // needed. // // Unlike the original Ellis and Kovac benchmark, we do not attempt // measure pause times. This facility should eventually be added back // in. There are several reasons for omitting it for now. The original // implementation depended on assumptions about the thread scheduler // that don't hold uniformly. The results really measure both the // scheduler and GC. Pause time measurements tend to not fit well with // current benchmark suites. As far as we know, none of the current // commercial Java implementations seriously attempt to minimize GC pause // times. // // Since this benchmark has recently been more widely used, some // anomalous behavious has been uncovered. The user should be aware // of this: // 1) Nearly all objects are of the same size. This benchmark is // not useful for analyzing fragmentation behavior. It is unclear // whether this is an issue for well-designed allocators. // 2) Unless HOLES is defined, it tends to drop consecutively allocated // memory at the same time. Many real applications do exhibit this // phenomenon, but probably not to this extent. (Defining HOLES tends // to move the benchmark to the opposite extreme.) // 3) It appears harder to predict object lifetimes than for most real // Java programs (see T. Harris, "Dynamic adptive pre-tenuring", // ISMM '00). #include #include #include #include #ifdef GC # ifndef LINUX_THREADS # define LINUX_THREADS # endif # ifndef _REENTRANT # define _REENTRANT # endif # ifdef LOCAL # define GC_REDIRECT_TO_LOCAL # include "gc_local_alloc.h" # endif # include "gc.h" #endif #ifdef SGC # include "sgc.h" # define GC # define pthread_create GC_pthread_create # define pthread_join GC_pthread_join #endif #define MAX_NTHREADS 1024 int nthreads = 0; #ifdef PROFIL extern void init_profiling(); extern dump_profile(); #endif // These macros were a quick hack for the Macintosh. // // #define currentTime() clock() // #define elapsedTime(x) ((1000*(x))/CLOCKS_PER_SEC) #define currentTime() stats_rtclock() #define elapsedTime(x) (x) /* Get the current time in milliseconds */ unsigned stats_rtclock( void ) { struct timeval t; struct timezone tz; if (gettimeofday( &t, &tz ) == -1) return 0; return (t.tv_sec * 1000 + t.tv_usec / 1000); } static const int kStretchTreeDepth = 18; // about 16Mb static const int kLongLivedTreeDepth = 16; // about 4Mb static const int kArraySize = 500000; // about 4Mb static const int kMinTreeDepth = 4; static const int kMaxTreeDepth = 16; typedef struct Node0_struct { struct Node0_struct * left; struct Node0_struct * right; int i, j; } Node0; #ifdef HOLES # define HOLE() GC_NEW(Node0); #else # define HOLE() #endif typedef Node0 *Node; void init_Node(Node me, Node l, Node r) { me -> left = l; me -> right = r; } #ifndef GC void destroy_Node(Node me) { if (me -> left) { destroy_Node(me -> left); } if (me -> right) { destroy_Node(me -> right); } free(me); } #endif // Nodes used by a tree of a given size static int TreeSize(int i) { return ((1 << (i + 1)) - 1); } // Number of iterations to use for a given tree depth static int NumIters(int i) { return 2 * TreeSize(kStretchTreeDepth) / TreeSize(i); } // Build tree top down, assigning to older objects. static void Populate(int iDepth, Node thisNode) { if (iDepth<=0) { return; } else { iDepth--; # ifdef GC thisNode->left = GC_NEW(Node0); HOLE(); thisNode->right = GC_NEW(Node0); HOLE(); # else thisNode->left = calloc(1, sizeof(Node0)); thisNode->right = calloc(1, sizeof(Node0)); # endif Populate (iDepth, thisNode->left); Populate (iDepth, thisNode->right); } } // Build tree bottom-up static Node MakeTree(int iDepth) { Node result; if (iDepth<=0) { # ifndef GC result = calloc(1, sizeof(Node0)); # else result = GC_NEW(Node0); HOLE(); # endif /* result is implicitly initialized in both cases. */ return result; } else { Node left = MakeTree(iDepth-1); Node right = MakeTree(iDepth-1); # ifndef GC result = malloc(sizeof(Node0)); # else result = GC_NEW(Node0); HOLE(); # endif init_Node(result, left, right); return result; } } static void PrintDiagnostics() { #if 0 long lFreeMemory = Runtime.getRuntime().freeMemory(); long lTotalMemory = Runtime.getRuntime().totalMemory(); System.out.print(" Total memory available=" + lTotalMemory + " bytes"); System.out.println(" Free memory=" + lFreeMemory + " bytes"); #endif } static void TimeConstruction(int depth) { long tStart, tFinish; int iNumIters = NumIters(depth); Node tempTree; int i; printf("0x%x: Creating %d trees of depth %d\n", pthread_self(), iNumIters, depth); tStart = currentTime(); for (i = 0; i < iNumIters; ++i) { # ifndef GC tempTree = calloc(1, sizeof(Node0)); # else tempTree = GC_NEW(Node0); # endif Populate(depth, tempTree); # ifndef GC destroy_Node(tempTree); # endif tempTree = 0; } tFinish = currentTime(); printf("\t0x%x: Top down construction took %d msec\n", pthread_self(), elapsedTime(tFinish - tStart)); tStart = currentTime(); for (i = 0; i < iNumIters; ++i) { tempTree = MakeTree(depth); # ifndef GC destroy_Node(tempTree); # endif tempTree = 0; } tFinish = currentTime(); printf("\t0x%x: Bottom up construction took %d msec\n", pthread_self(), elapsedTime(tFinish - tStart)); } void * run_one_test(void * arg) { int d, i; Node longLivedTree; double *array; /* size_t initial_bytes = GC_get_total_bytes(); */ // Create a long lived object printf(" Creating a long-lived binary tree of depth %d\n", kLongLivedTreeDepth); # ifndef GC longLivedTree = calloc(1, sizeof(Node0)); # else longLivedTree = GC_NEW(Node0); # endif Populate(kLongLivedTreeDepth, longLivedTree); // Create long-lived array, filling half of it printf(" Creating a long-lived array of %d doubles\n", kArraySize); # ifndef GC array = malloc(kArraySize * sizeof(double)); # else # ifndef NO_PTRFREE array = GC_MALLOC_ATOMIC(sizeof(double) * kArraySize); # else array = GC_MALLOC(sizeof(double) * kArraySize); # endif # endif for (i = 0; i < kArraySize/2; ++i) { array[i] = 1.0/i; } for (d = kMinTreeDepth; d <= kMaxTreeDepth; d += 2) { TimeConstruction(d); } /* printf("Allocated %ld bytes before start, %ld after\n", initial_bytes, GC_get_total_bytes() - initial_bytes); */ if (longLivedTree->left -> right == 0 || array[1000] != 1.0/1000) fprintf(stderr, "Failed\n"); // fake reference to LongLivedTree // and array // to keep them from being optimized away } int main(int argc, char **argv) { Node root; Node tempTree[MAX_NTHREADS]; long tStart, tFinish; long tElapsed; int i; # ifdef SGC SGC_attr_t attr; # endif if (1 == argc) { nthreads = 1; } else if (2 == argc) { nthreads = atoi(argv[1]); if (nthreads < 1 || nthreads > MAX_NTHREADS) { fprintf(stderr, "Invalid # of threads argument\n"); exit(1); } } else { fprintf(stderr, "Usage: %s [# of threads]\n"); exit(1); } # if defined(SGC) /* The University of Tokyo collector needs explicit */ /* initialization. */ SGC_attr_init(&attr); SGC_init(nthreads, &attr); # endif #ifdef GC // GC_full_freq = 30; // GC_free_space_divisor = 16; // GC_enable_incremental(); #endif printf("Garbage Collector Test\n"); printf(" Live storage will peak at %d bytes or less .\n\n", 2 * sizeof(Node0) * nthreads * (TreeSize(kLongLivedTreeDepth) + TreeSize(kMaxTreeDepth)) + sizeof(double) * kArraySize); PrintDiagnostics(); # ifdef GC /* GC_expand_hp fails with empty heap */ GC_malloc(1); GC_expand_hp(32*1024*1024*nthreads); # endif # ifdef PROFIL init_profiling(); # endif tStart = currentTime(); { pthread_t thread[MAX_NTHREADS]; for (i = 1; i < nthreads; ++i) { int code; if ((code = pthread_create(thread+i, 0, run_one_test, 0)) != 0) { fprintf(stderr, "Thread creation failed %u\n", code); exit(1); } } /* We use the main thread to run one test. This allows */ /* profiling to work, for example. */ run_one_test(0); for (i = 1; i < nthreads; ++i) { int code; if ((code = pthread_join(thread[i], 0)) != 0) { fprintf(stderr, "Thread join failed %u\n", code); } } } PrintDiagnostics(); tFinish = currentTime(); tElapsed = elapsedTime(tFinish-tStart); PrintDiagnostics(); printf("Completed in %d msec\n", tElapsed); # ifdef GC printf("Completed %d collections\n", GC_gc_no); printf("Heap size is %d\n", GC_get_heap_size()); # endif # ifdef PROFIL dump_profile(); # endif return 0; }