ref: a1c8da91f18c8bd3cdc7ce36c70538969e0c5602
dir: /bench.c/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include <unistd.h>
#include "bench.h"
#define Nsec 1000000000ULL
#define nelem(x) (sizeof(x)/sizeof((x)[0]))
#define BENCHTIME (Nsec) /* 1s in ns */
typedef struct Hist Hist;
struct Hist {
int min;
int max;
int bktmax;
int bkt[64];
};
int NPROC;
int showhist;
/*
* nsec() is wallclock and can be adjusted by timesync
* so need to use cycles() instead, but fall back to
* nsec() in case we can't
*/
uvlong
nanosec(void)
{
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (uvlong)ts.tv_sec * Nsec + (uvlong)ts.tv_nsec;
}
static void
cycles(uvlong *x)
{
#if defined(__x86_64__) || defined(__i386__)
unsigned int lo, hi;
__asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));
*x = ((uvlong)hi << 32) | lo;
#else
*x = 0; // No cycle counter support on this architecture
#endif
}
static int
min(int x, int y)
{
if(x > y) {
return y;
}
return x;
}
static int
max(int x, int y)
{
if(x < y) {
return y;
}
return x;
}
void
histrecord(Hist *h, B *b)
{
vlong ns;
int i;
if(h == NULL)
return;
ns = b->ns;
for(i = 0; i < nelem(h->bkt) - 1 && ns > 1; i++)
ns >>= 1;
h->bkt[i]++;
if(h->bkt[i] > h->bktmax)
h->bktmax = h->bkt[i];
if(i < h->min || h->min == -1)
h->min = i;
if(i >= h->max || h->max == -1)
h->max = i+1;
}
// run the benchmarking function once, looping n times
static void
benchrunn(B *b, int n, Hist *h)
{
vlong now;
// reset
b->ns = 0;
b->start = nanosec();
cycles(&b->scycles);
b->N = n;
b->item.fn(b);
// count
cycles(&b->ecycles);
now = nanosec();
b->ns += nanosec() - b->start - b->overheadns;
b->bcycles += b->ecycles - b->scycles - b->overheadcy;
histrecord(h, b);
}
static vlong
nsperop(B *b)
{
if(b->N <= 0)
return 0;
return b->ns / (vlong)b->N;
}
static uvlong
cyperop(B *b)
{
if(b->N <= 0)
return 0;
return b->bcycles / (uvlong)b->N;
}
static int
rounddown10(vlong n)
{
int tens, result, i;
tens = 0;
while(n >= 10) {
n = n / 10;
tens++;
}
result = 1;
for(i = 0; i < tens; i++) {
result *= 10;
}
return result;
}
static int
roundup(vlong ns, vlong div)
{
vlong n;
int base;
if(div == 0 || ns/div >= BENCHTIME)
return BENCHTIME;
n = ns / div;
base = rounddown10(n);
if(n <= 5)
return 5;
if(n <= base)
return base;
if(n <= 2*base)
return 2*base;
if(n <= 5*base)
return 5*base;
return 10*base;
}
// run the benchmark for one function
static void
benchrun(B *b, Hist *h)
{
int i, n;
b->overheadns = 0;
b->overheadcy = 0;
/* warm caches */
benchrunn(b, 0, NULL);
/* measure overhead */
for(i = 0; i < 20; i++){
b->ns = 0;
b->bcycles = 0;
benchrunn(b, 0, NULL);
if(i == 0 || b->ns < b->overheadns)
b->overheadns = b->ns;
if(i == 0 || b->bcycles < b->overheadcy)
b->overheadcy = b->bcycles;
}
/* estimate */
benchrunn(b, 1, h);
n = roundup(BENCHTIME, b->ns);
printf("%10d N ", n);
if(h != NULL){
for(i = 0; i < n; i++)
benchrunn(b, 1, h);
}else
benchrunn(b, n, NULL);
}
double
scaletime(vlong ns, vlong n, char **unit)
{
static const struct {
char *name;
vlong div;
} units[] = {
{"ns", 1},
{"μs", 1000},
{"ms", 1000*1000},
{"s", 1000*1000*1000},
{"m", 60LL*1000*1000*1000},
{"h", 3600LL*1000*1000*1000},
};
int i;
for(i = 0; i < nelem(units)-1; i++)
if(ns / (n * units[i].div) < 1000)
break;
*unit = units[i].name;
return (double)ns / (double)(n*units[i].div);
}
static void
benchres(B *res, Hist *h)
{
char *unit;
char bar[64];
char tmop[64];
char cyop[32];
int i, j, lim;
double nsperop;
uvlong cyperop;
if(res->N <= 0) {
printf("skipped\n");
return;
}
nsperop = scaletime(res->ns, (vlong)res->N, &unit);
snprintf(tmop, sizeof(tmop), "%12.2f %s/op", nsperop, unit);
cyperop = res->bcycles / (uvlong)res->N;
if(cyperop < 10)
snprintf(cyop, sizeof(cyop), "%13.2f cy/op", (double)res->bcycles / (double)res->N);
else if(cyperop < 100)
snprintf(cyop, sizeof(cyop), "%12.1f cy/op", (double)res->bcycles / (double)res->N);
else
snprintf(cyop, sizeof(cyop), "%10lu cy/op", cyperop);
printf("%s\t%s\n", tmop, cyop);
if(h != NULL){
for(i = h->min; i < h->max; i++){
if(h->bktmax < 30)
lim = h->bkt[i];
else
lim = 30*h->bkt[i] / h->bktmax;
if(lim == 0 && h->bkt[i] > 0)
lim = 1;
for(j = 0; j < lim; j++)
bar[j] = '*';
bar[j] = 0;
printf("\t%12lld | %8d %s\n", 1LL<<i, h->bkt[i], bar);
}
}
}
static void
usage(void)
{
fprintf(stderr, "usage: benchmark [-h]\n");
exit(1);
}
void
benchinit(int argc, char **argv)
{
char *e;
if((e = getenv("NPROC")) == NULL)
NPROC = sysconf(_SC_NPROCESSORS_ONLN);
else
NPROC = atoi(e);
if(e != NULL)
free(e);
int c;
while((c = getopt(argc, argv, "h")) != -1) {
switch(c) {
case 'h':
showhist++;
break;
default:
usage();
}
}
}
// bench a single function
void
bench(char *name, void (*fn)(B*))
{
Hist h;
B b;
memset(&b, 0, sizeof(B));
memset(&h, 0, sizeof(Hist));
b.item.name = name;
b.item.fn = fn;
if(strncmp(name, "bench", 5) == 0)
name += 5;
printf("%24s\t", name);
h.min = -1;
h.max = -1;
benchrun(&b, showhist ? &h : NULL);
benchres(&b, showhist ? &h : NULL);
}
// bench an array of functions
void
benchitems(BItem items[], int len)
{
int i;
for(i = 0; i < len; i++) {
bench(items[i].name, items[i].fn);
}
}