<tb@panthema.net>
<http://www.gnu.org/licenses/>
#include <iostream>
#include <sstream>
#include <fstream>
#include <iomanip>
#include <vector>
#include <stdlib.h>
#include <inttypes.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
#include <assert.h>
#include <unistd.h>
#include <time.h>
#include <pthread.h>
#include <malloc.h>
// --- Command line options (filled in by main() from getopt).

// Substrings given via -f; a test function is run only if its name contains
// at least one of them. Empty means "run everything".
std::vector<const char*> gopt_funcfilter;
// Largest array size to test, set via -s (default 4 GiB). 0 disables the limit.
uint64_t gopt_sizelimit = 4*1024*1024*1024LLU;
// Upper bound on the memory considered for allocation, set via -M. 0 means
// no limit is applied.
uint64_t gopt_memlimit = 0;
// Lower (-p) and upper (-P) thread counts. 0 means "not set": testfunc()
// then starts at 1 thread and stops at g_physical_cpus + 2.
int gopt_nthreads_min = 0, gopt_nthreads_max = 0;
// Set via -Q: the thread count is doubled between runs instead of
// incremented by one.
bool gopt_nthreads_quadratic = false;
// When true, make_cyclic_permutation() walks the generated pointer chain and
// asserts its length. No option in the getopt string of main() sets it.
bool gopt_testcycle = false;
// ERR(x): stream the expression x to std::cerr followed by std::endl
// (newline plus flush). x may be a chain of << operands.
#define ERR(x) do { std::cerr << x << std::endl; } while(0)
// ERRX(x): like ERR but without the trailing newline; std::cerr is flushed
// so partial lines show up immediately.
#define ERRX(x) do { (std::cerr << x).flush(); } while(0)
// Start of the memory area all tests operate on; allocated in main() with
// posix_memalign() (32-byte alignment) and released there with free().
char* g_memarea = NULL;
// Size of g_memarea in bytes.
size_t g_memsize = 0;
// Test function currently being benchmarked; set by testfunc() before the
// threads are started.
const struct TestFunction* g_func = NULL;
// Number of online processors as reported by sysconf() in main().
int g_physical_cpus;
// Signature of a benchmark kernel: it is called with the start of the
// thread's memory region, the region size in bytes and a repeat count.
typedef void (*testfunc_type)(char* memarea, size_t size, size_t repeats);
// Description of one registered benchmark kernel. Constructing an instance
// appends it to g_testlist (see the constructor definition).
struct TestFunction
{
// Identifier of the kernel; matched against the -f filters and printed in
// the results.
const char* name;
// The kernel itself.
testfunc_type func;
// CPU feature the kernel needs ("mmx", "sse" or "avx"), or NULL if it has
// no requirement; evaluated by is_supported().
const char* cpufeat;
// Bytes transferred per access; used to size the per-thread area and to
// compute the tested volume.
unsigned int bytes_per_access;
// Divisor applied to the tested size when counting accesses and volume.
unsigned int access_offset;
// If true, the area is filled with a cyclic pointer permutation before
// the kernel runs.
bool make_permutation;
// Stores all fields and registers the object in g_testlist.
TestFunction(const char* n, testfunc_type f, const char* cf,
unsigned int bpa, unsigned int ao, bool mp);
// Returns true if the CPU feature named by cpufeat was detected (or if
// cpufeat is NULL); unknown feature names yield false.
bool is_supported() const;
};
// All registered test functions, in construction order. Every TestFunction
// constructor appends itself here; main() deletes the entries at exit.
std::vector<TestFunction*> g_testlist;
// Copies the kernel description into the members and appends the new object
// to the global g_testlist, so every constructed TestFunction is registered.
TestFunction::TestFunction(const char* n, testfunc_type f, const char* cf,
                           unsigned int bpa, unsigned int ao, bool mp)
    : name(n),
      func(f),
      cpufeat(cf),
      bytes_per_access(bpa),
      access_offset(ao),
      make_permutation(mp)
{
    // Self-registration: the list stores the raw pointer to this object.
    g_testlist.push_back(this);
}
// Registration macros. Each one defines a file-static pointer named
// _<func>_register that is initialized with a heap-allocated TestFunction;
// the TestFunction constructor adds it to g_testlist. The kernel's name is
// the stringified function identifier. Note that the expansions already end
// with a semicolon.

// REGISTER: kernel without CPU feature requirement and without pointer
// permutation; bytes/offset become bytes_per_access/access_offset.
#define REGISTER(func, bytes, offset) \
static const class TestFunction* _##func##_register = \
new TestFunction(#func,func,NULL,bytes,offset,false);
// REGISTER_CPUFEAT: like REGISTER, but the kernel is only considered
// supported if the CPU feature string cpufeat is detected.
#define REGISTER_CPUFEAT(func, cpufeat, bytes, offset) \
static const class TestFunction* _##func##_register = \
new TestFunction(#func,func,cpufeat,bytes,offset,false);
// REGISTER_PERM: kernel that runs on a cyclic pointer permutation
// (make_permutation = true); bytes is used for both bytes_per_access and
// access_offset.
#define REGISTER_PERM(func, bytes) \
static const class TestFunction* _##func##_register = \
new TestFunction(#func,func,NULL,bytes,bytes,true);
#if __x86_64__
#include "funcs_x86_64.h"
#elif __arm__
#include "funcs_arm.h"
#else
#include "funcs_x86_32.h"
#endif
// Executes the x86 CPUID instruction with EAX = op and stores the resulting
// EAX, EBX, ECX and EDX values in out[0..3], in that order.
// NOTE(review): ECX is not set before the instruction, so this looks suitable
// only for leaves without sub-leaves (leaf 1 is the only one queried in this
// file) - confirm before reusing it for other leaves.
static inline void cpuid(int op, int out[4])
{
asm volatile("cpuid"
: "=a" (out[0]), "=b" (out[1]), "=c" (out[2]), "=d" (out[3])
: "a" (op)
);
}
// Register values returned by CPUID leaf 1, in the order EAX, EBX, ECX, EDX
// (filled by cpuid_detect()).
int g_cpuid_op1[4];

// Tests a single bit of one of the cached CPUID leaf 1 registers.
static inline bool cpuid_op1_has(int reg, int bit)
{
    const int mask = 1 << bit;
    return (g_cpuid_op1[reg] & mask) != 0;
}

// MMX: EDX bit 23.
static bool cpuid_mmx()
{
    return cpuid_op1_has(3, 23);
}

// SSE: EDX bit 25.
static bool cpuid_sse()
{
    return cpuid_op1_has(3, 25);
}

// AVX: ECX bit 28.
// NOTE(review): only the CPU feature bit is checked here, not whether the
// operating system enabled AVX state saving - confirm this is sufficient.
static bool cpuid_avx()
{
    return cpuid_op1_has(2, 28);
}
// Queries CPUID leaf 1 into g_cpuid_op1 and prints one line to stderr that
// lists the detected features among mmx, sse and avx.
static void cpuid_detect()
{
    struct Feature
    {
        const char* label;
        bool (*present)();
    };
    static const Feature features[] = {
        { " mmx", cpuid_mmx },
        { " sse", cpuid_sse },
        { " avx", cpuid_avx }
    };
    const size_t nfeatures = sizeof(features) / sizeof(features[0]);

    ERRX("CPUID:");
    cpuid(1, g_cpuid_op1);

    for (size_t k = 0; k < nfeatures; ++k)
    {
        if (features[k].present())
            ERRX(features[k].label);
    }

    // terminate the line
    ERR("");
}
// Returns true if this kernel can run on the detected CPU: kernels without a
// feature requirement always can; "mmx", "sse" and "avx" are looked up via
// the cached CPUID bits; any other feature name is treated as unsupported.
bool TestFunction::is_supported() const
{
    if (cpufeat == NULL)
        return true;

    struct Probe
    {
        const char* feature;
        bool (*detect)();
    };
    static const Probe probes[] = {
        { "mmx", cpuid_mmx },
        { "sse", cpuid_sse },
        { "avx", cpuid_avx }
    };

    for (size_t k = 0; k < sizeof(probes) / sizeof(probes[0]); ++k)
    {
        if (strcmp(cpufeat, probes[k].feature) == 0)
            return probes[k].detect();
    }
    return false;
}
// Parses a complete decimal string into an unsigned 64-bit integer.
//
// value: NUL-terminated text; leading whitespace and a leading '+' are
//        accepted as by strtoull().
// out:   receives the parsed number on success and is left untouched on
//        failure.
// Returns true only if the whole string is a valid number that fits; empty
// strings, trailing garbage, negative numbers and out-of-range values are
// rejected.
static inline bool
parse_uint64t(const char* value, uint64_t& out)
{
    if (value == NULL || *value == 0)
        return false;

    // strtoull() accepts "-N" and returns the negated value wrapped around,
    // which is never what an unsigned option means.
    if (strchr(value, '-') != NULL)
        return false;

    char* endp = NULL;
    errno = 0;
    const unsigned long long parsed = strtoull(value, &endp, 10);

    if (errno == ERANGE)
        return false;                   // does not fit
    if (endp == value || *endp != 0)
        return false;                   // no digits or trailing characters

    out = parsed;
    return true;
}
// Parses a complete decimal string into an int.
//
// value: NUL-terminated text; leading whitespace and a sign are accepted as
//        by strtol().
// out:   receives the parsed number on success and is left untouched on
//        failure.
// Returns true only if the whole string is a valid number within the range
// of int; empty strings, trailing garbage and out-of-range values are
// rejected instead of being silently truncated.
static inline bool
parse_int(const char* value, int& out)
{
    if (value == NULL || *value == 0)
        return false;

    char* endp = NULL;
    errno = 0;
    const long parsed = strtol(value, &endp, 10);

    if (errno == ERANGE || parsed < INT_MIN || parsed > INT_MAX)
        return false;                   // does not fit into an int
    if (endp == value || *endp != 0)
        return false;                   // no digits or trailing characters

    out = static_cast<int>(parsed);
    return true;
}
// Minimal 64-bit linear congruential generator. The state xn is advanced as
// xn = multiplier * xn + increment (modulo 2^64 through unsigned wrap-around)
// and the new state is returned as the random number.
struct LCGRandom
{
    // current state / last value returned
    uint64_t xn;

    // Starts the sequence from the given seed.
    inline LCGRandom(uint64_t seed) : xn(seed) { }

    // Advances the generator by one step and returns the new state.
    inline uint64_t operator()()
    {
        const uint64_t multiplier = 0x27BB2EE687B0B0FDLLU;
        const uint64_t increment = 0xB504F32DLU;
        return xn = multiplier * xn + increment;
    }
};
// Returns the current CLOCK_MONOTONIC time in seconds as a double
// (whole seconds plus the nanosecond part scaled by 1e-9).
static inline double timestamp()
{
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    const double nanos_as_seconds = now.tv_nsec * 1e-9;
    return now.tv_sec + nanos_as_seconds;
}
// Decides whether a test function should run given the -f filters: with no
// filters everything matches; otherwise funcname must contain at least one
// of the filter strings as a substring.
static inline bool match_funcfilter(const char* funcname)
{
    if (gopt_funcfilter.empty())
        return true;

    typedef std::vector<const char*>::const_iterator filter_iterator;
    for (filter_iterator filter = gopt_funcfilter.begin();
         filter != gopt_funcfilter.end(); ++filter)
    {
        if (strstr(funcname, *filter))
            return true;
    }
    return false;
}
// Array sizes (in bytes) that are benchmarked, in increasing order from
// 1 KiB up to 1 TiB. The list is terminated by a 0 entry, which is what the
// loop in thread_master() stops on. Entries from 1 GiB upwards carry an LLU
// suffix so the product is computed in 64-bit arithmetic.
const uint64_t areasize_list[] = {
// 1 KiB .. 768 KiB
1 * 1024,
2 * 1024,
3 * 1024,
4 * 1024,
6 * 1024,
8 * 1024,
12 * 1024,
16 * 1024,
20 * 1024,
24 * 1024,
28 * 1024,
32 * 1024,
40 * 1024,
48 * 1024,
64 * 1024,
96 * 1024,
128 * 1024,
192 * 1024,
256 * 1024,
384 * 1024,
512 * 1024,
768 * 1024,
// 1 MiB .. 2.75 MiB in 256 KiB steps
1024 * 1024,
(1024 + 256) * 1024,
(1024 + 512) * 1024,
(1024 + 768) * 1024,
2048 * 1024,
(2048 + 256) * 1024,
(2048 + 512) * 1024,
(2048 + 768) * 1024,
// 3 MiB .. 512 MiB
3 * 1024 * 1024,
4 * 1024 * 1024,
5 * 1024 * 1024,
6 * 1024 * 1024,
7 * 1024 * 1024,
8 * 1024 * 1024,
9 * 1024 * 1024,
10 * 1024 * 1024,
12 * 1024 * 1024,
14 * 1024 * 1024,
16 * 1024 * 1024,
20 * 1024 * 1024,
24 * 1024 * 1024,
28 * 1024 * 1024,
32 * 1024 * 1024,
64 * 1024 * 1024,
128 * 1024 * 1024,
256 * 1024 * 1024,
512 * 1024 * 1024,
// 1 GiB .. 1 TiB, doubling
1 * 1024 * 1024 * 1024LLU,
2 * 1024 * 1024 * 1024LLU,
4 * 1024 * 1024 * 1024LLU,
8 * 1024 * 1024 * 1024LLU,
16 * 1024 * 1024 * 1024LLU,
32 * 1024 * 1024 * 1024LLU,
64 * 1024 * 1024 * 1024LLU,
128 * 1024 * 1024 * 1024LLU,
256 * 1024 * 1024 * 1024LLU,
512 * 1024 * 1024 * 1024LLU,
1024 * 1024 * 1024 * 1024LLU,
// end marker
0
};
// --- State shared between thread_master() and thread_worker(). The master
// writes these before a barrier and the workers read them after it.

// Set to true by the master once all sizes are finished; workers leave
// their loop when they see it.
bool g_done;
// Number of threads of the current run (set by testfunc()).
int g_nthreads = 0;
// Barrier initialized for the threads of the current run; used to
// line up the phases of each run.
pthread_barrier_t g_barrier;
// Bytes each thread works on in the current run.
uint64_t g_thrsize;
// Distance in bytes between the start of consecutive threads' regions
// inside g_memarea.
uint64_t g_thrsize_spaced;
// Repeat count handed to the test function in the current run.
uint64_t g_repeats;
void make_cyclic_permutation(int thread_num, void* memarea, size_t bytesize)
{
void** ptrarray = (void**)memarea;
size_t size = bytesize / sizeof(void*);
if (thread_num == 0)
(std::cout << "Make permutation:").flush();
pthread_barrier_wait(&g_barrier);
(std::cout << " filling").flush();
for (size_t i = 0; i < size; ++i)
{
ptrarray[i] = &ptrarray[i];
}
(std::cout << " permuting").flush();
LCGRandom srnd((size_t)ptrarray + 233349568);
for (size_t n = size; n > 1; --n)
{
size_t i = srnd() % (n-1);
std::swap( ptrarray[i], ptrarray[n-1] );
}
if (gopt_testcycle)
{
(std::cout << " testing").flush();
void* ptr = ptrarray[0];
size_t steps = 1;
while ( ptr != &ptrarray[0] && steps < size*2 )
{
ptr = *(void**)ptr;
++steps;
}
(std::cout << " cycle=" << steps).flush();
assert(steps == size);
}
else
{
(std::cout << " cycle=" << size).flush();
}
pthread_barrier_wait(&g_barrier);
if (thread_num == 0)
std::cout << std::endl;
}
// Entry point of thread 0. Drives the benchmark of g_func over all sizes in
// areasize_list: for each size it publishes the run parameters (g_thrsize,
// g_thrsize_spaced, g_repeats, g_done), synchronizes with the workers via
// g_barrier, runs its own share of the work and measures the wall time.
// Results are printed to stdout and appended to "stats.txt".
//
// cookie: heap-allocated int holding this thread's index; freed here.
// Always returns NULL.
void* thread_master(void* cookie)
{
int thread_num = *((int*)cookie);
delete (int*)cookie;
// Number of bytes each thread should cover per run (area size times
// repeats); re-estimated from the measured time after every run.
uint64_t factor = 1024*1024*1024;
for (const uint64_t* areasize = areasize_list; *areasize; ++areasize)
{
if (*areasize > gopt_sizelimit && gopt_sizelimit != 0) {
ERR("Skipping " << g_func->name << " test with " << *areasize
<< " array size due to -s <size limit>.");
continue;
}
// Single-pass loop that exists so a too-short run can be repeated: see
// the --round below.
for (unsigned int round = 0; round < 1; ++round)
{
// split the array evenly among the threads
g_thrsize = *areasize / g_nthreads;
// Round the per-thread size up to a multiple of 16 accesses. Note that a
// size that already is a multiple is still increased by one unrollsize.
uint64_t unrollsize = 16 * g_func->bytes_per_access;
g_thrsize = ((g_thrsize + unrollsize) / unrollsize) * unrollsize;
uint64_t testsize = g_thrsize * g_nthreads;
// skip sizes that do not fit into the allocated area
if (g_memsize < testsize) continue;
// Thread regions start at least 4 MiB + 16 KiB apart, even for small sizes.
g_thrsize_spaced = std::max<uint64_t>(g_thrsize, 4*1024*1024 + 16*1024);
if (g_memsize < g_thrsize_spaced * g_nthreads) continue;
// repeats = ceil(factor / g_thrsize)
g_repeats = (factor + g_thrsize-1) / g_thrsize;
// total bytes transferred and total number of accesses over all threads
uint64_t testvol = testsize * g_repeats * g_func->bytes_per_access / g_func->access_offset;
uint64_t testaccess = testsize * g_repeats / g_func->access_offset;
ERR("Running"
<< " nthreads=" << g_nthreads
<< " factor=" << factor
<< " areasize=" << *areasize
<< " thrsize=" << g_thrsize
<< " testsize=" << testsize
<< " repeats=" << g_repeats
<< " testvol=" << testvol
<< " testaccess=" << testaccess);
g_done = false;
double runtime;
{
// barrier 1: release the workers into this run; they check g_done
// right after it
pthread_barrier_wait(&g_barrier);
assert(!g_done);
if (g_func->make_permutation)
make_cyclic_permutation(thread_num, g_memarea + thread_num * g_thrsize_spaced, g_thrsize);
// barrier 2: all threads start the kernel together; timing starts here
pthread_barrier_wait(&g_barrier);
double ts1 = timestamp();
g_func->func(g_memarea + thread_num * g_thrsize_spaced, g_thrsize, g_repeats);
// barrier 3: wait until every thread has finished before stopping the clock
pthread_barrier_wait(&g_barrier);
double ts2 = timestamp();
runtime = ts2 - ts1;
}
if ( runtime < 1.0 )
{
// Too short to be reported: scale factor so that the run would take about
// 1.5 seconds at the measured speed, then repeat the same size.
factor = g_thrsize * g_repeats * 3/2 / runtime;
ERR("run time = " << runtime << " -> rerunning test with repeat factor=" << factor);
// round is unsigned: 0 wraps around here and the loop's ++round brings it
// back to 0, so the loop body runs once more.
--round;
}
else
{
// same 1.5 second target for the next size
factor = g_thrsize * g_repeats * 3/2 / runtime;
ERR("run time = " << runtime << " -> next test with repeat factor=" << factor);
// assemble one tab-separated RESULT line of key=value pairs
std::ostringstream result;
result << "RESULT\t";
char datetime[64];
time_t tnow = time(NULL);
strftime(datetime, sizeof(datetime), "%Y-%m-%d %H:%M:%S", localtime(&tnow));
result << "datetime=" << datetime << '\t';
char hostname[256];
gethostname(hostname, sizeof(hostname));
result << "host=" << hostname << '\t';
// bandwidth is bytes per second; rate is seconds per access
result << "version=" << PACKAGE_VERSION << '\t'
<< "funcname=" << g_func->name << '\t'
<< "nthreads=" << g_nthreads << '\t'
<< "areasize=" << *areasize << '\t'
<< "threadsize=" << g_thrsize << '\t'
<< "testsize=" << testsize << '\t'
<< "repeats=" << g_repeats << '\t'
<< "testvol=" << testvol << '\t'
<< "testaccess=" << testaccess << '\t'
<< "time=" << std::setprecision(20) << runtime << '\t'
<< "bandwidth=" << testvol / runtime << '\t'
<< "rate=" << runtime / testaccess;
std::cout << result.str() << std::endl;
// the result is also appended to stats.txt in the current directory
std::ofstream resultfile("stats.txt", std::ios::app);
resultfile << result.str() << std::endl;
}
}
}
// tell the workers (waiting at their first barrier) to terminate
g_done = true;
pthread_barrier_wait(&g_barrier);
return NULL;
}
// Entry point of every thread except thread 0. Follows the three-barrier
// protocol of thread_master(): wait for the run parameters, optionally build
// the pointer permutation, run the kernel on this thread's region, and wait
// for all threads to finish. Leaves the loop once the master sets g_done.
//
// cookie: heap-allocated int holding this thread's index; freed here.
// Always returns NULL.
void* thread_worker(void* cookie)
{
    int* const index_ptr = static_cast<int*>(cookie);
    const int thread_num = *index_ptr;
    delete index_ptr;

    for (;;)
    {
        // wait until the master has published the next run (or the end)
        pthread_barrier_wait(&g_barrier);
        if (g_done)
            return NULL;

        // this thread's region for the current run
        char* const region = g_memarea + thread_num * g_thrsize_spaced;

        if (g_func->make_permutation)
            make_cyclic_permutation(thread_num, region, g_thrsize);

        // start together with all other threads
        pthread_barrier_wait(&g_barrier);
        g_func->func(region, g_thrsize, g_repeats);

        // signal completion of this run
        pthread_barrier_wait(&g_barrier);
    }
}
void testfunc(const TestFunction* func)
{
if (!match_funcfilter(func->name)) {
ERR("Skipping " << func->name << " tests");
return;
}
int nthreads = 1;
if (gopt_nthreads_min != 0)
nthreads = gopt_nthreads_min;
if (gopt_nthreads_max == 0)
gopt_nthreads_max = g_physical_cpus + 2;
while (1)
{
g_func = func;
g_nthreads = nthreads;
pthread_barrier_init(&g_barrier, NULL, nthreads);
pthread_t thr[nthreads];
pthread_create(&thr[0], NULL, thread_master, new int(0));
for (int p = 1; p < nthreads; ++p)
pthread_create(&thr[p], NULL, thread_worker, new int(p));
for (int p = 0; p < nthreads; ++p)
pthread_join(thr[p], NULL);
pthread_barrier_destroy(&g_barrier);
if (nthreads >= gopt_nthreads_max) break;
if (gopt_nthreads_quadratic)
nthreads = 2 * nthreads;
else
nthreads++;
if (nthreads > gopt_nthreads_max)
nthreads = gopt_nthreads_max;
}
}
// Rounds v up to the next power of two; powers of two are returned
// unchanged. An input of 0 yields 1, and so does any input above 2^63,
// where the result would not fit into 64 bits.
static inline uint64_t round_up_power2(uint64_t v)
{
    // set every bit below the highest set bit of v-1 ...
    uint64_t filled = v - 1;
    for (unsigned int shift = 1; shift <= 32; shift *= 2)
        filled |= filled >> shift;

    // ... so that adding one gives the power of two (or wraps to 0)
    const uint64_t rounded = filled + 1;
    return (rounded == 0) ? 1 : rounded;
}
// Prints the command line help to stderr.
//
// prog: program name shown in the "Usage:" line (argv[0]).
void print_usage(const char* prog)
{
    ERR("Usage: " << prog << " [options]" << std::endl
        << "Options:" << std::endl
        << " -f <match> Run only benchmarks containing this substring, can be used multiple times. Try \"list\"." << std::endl
        << " -M <size> Limit the maximum amount of memory allocated at startup." << std::endl
        << " -p <nthrs> Run benchmarks with at least this thread count." << std::endl
        << " -P <nthrs> Run benchmarks with at most this thread count (overrides detected processor count)." << std::endl
        << " -Q Run benchmarks with quadratically increasing thread count." << std::endl
        << " -s <size> Limit the maximum test array size. Set to 0 for no limit." << std::endl
        );
}
int main(int argc, char* argv[])
{
int opt;
while ( (opt = getopt(argc, argv, "hf:M:p:P:Qs:")) != -1 )
{
switch (opt) {
default:
case 'h':
print_usage(argv[0]);
return EXIT_FAILURE;
case 'f':
if (strcmp(optarg,"list") == 0)
{
cpuid_detect();
std::cout << "Test Function List" << std::endl;
for (size_t i = 0; i < g_testlist.size(); ++i)
{
if (!g_testlist[i]->is_supported()) continue;
if (!match_funcfilter(g_testlist[i]->name)) continue;
std::cout << " " << g_testlist[i]->name << std::endl;
}
return 0;
}
gopt_funcfilter.push_back(optarg);
ERR("Running only functions containing '" << optarg << "'");
break;
case 'M':
if (!parse_uint64t(optarg, gopt_memlimit)) {
ERR("Invalid parameter for -M <memory limit>.");
exit(EXIT_FAILURE);
}
else if (gopt_memlimit == 0) {
ERR("Lifting memory limit: allocating highest power of two fitting into RAM.");
}
else {
ERR("Setting memory limit to " << gopt_memlimit << ".");
}
break;
case 'Q':
ERR("Running benchmarks with quadratically increasing thread counts.");
gopt_nthreads_quadratic = true;
break;
case 'p':
if (!parse_int(optarg, gopt_nthreads_min)) {
ERR("Invalid parameter for -p <lower nthreads limit>.");
exit(EXIT_FAILURE);
}
else {
ERR("Running benchmarks with at least " << gopt_nthreads_min << " threads.");
}
break;
case 'P':
if (!parse_int(optarg, gopt_nthreads_max)) {
ERR("Invalid parameter for -p <upper nthreads limit>.");
exit(EXIT_FAILURE);
}
else {
ERR("Running benchmarks with up to " << gopt_nthreads_max << " threads.");
}
break;
case 's':
if (!parse_uint64t(optarg, gopt_sizelimit)) {
ERR("Invalid parameter for -s <size limit>.");
exit(EXIT_FAILURE);
}
else if (gopt_sizelimit == 0) {
ERR("Running benchmarks with no array size limit.");
}
else {
ERR("Running benchmarks with array size up to " << gopt_sizelimit << ".");
}
break;
}
}
cpuid_detect();
size_t physical_mem = sysconf(_SC_PHYS_PAGES) * (size_t)sysconf(_SC_PAGESIZE);
g_physical_cpus = sysconf(_SC_NPROCESSORS_ONLN);
if (gopt_memlimit && gopt_memlimit < physical_mem)
physical_mem = gopt_memlimit;
g_memsize = round_up_power2(physical_mem) / 2;
g_memsize += g_physical_cpus * 256;
ERR("Detected " << physical_mem / 1024/1024 << " MiB physical RAM and " << g_physical_cpus << " CPUs. " << std::endl
<< "Allocating " << g_memsize / 1024/1024 << " MiB for testing.");
if (posix_memalign((void**)&g_memarea, 32, g_memsize) != 0) {
ERR("Error allocating memory.");
return -1;
}
memset(g_memarea, 1, g_memsize);
unlink("stats.txt");
for (size_t i = 0; i < g_testlist.size(); ++i)
{
TestFunction* tf = g_testlist[i];
if (!tf->is_supported())
{
ERR("Skipping " << tf->name << " test "
<< "due to missing CPU feature '" << tf->cpufeat << "'.");
continue;
}
testfunc(tf);
}
free(g_memarea);
for (size_t i = 0; i < g_testlist.size(); ++i)
delete g_testlist[i];
return 0;
}