/* Write to memory at different strides to test its speed. */
/* E.g.:

    : kragen@inexorable:~/devel/inexorable-misc ; time ./memstride 500000 456 10000
    10860264

    real	0m0.118s
    user	0m0.108s
    sys	0m0.004s

This implies that in a half-megabyte chunk, reading and writing a byte
10.9 million times at 456-byte strides took 118ms, so about 11ns each
time. That’s fast because the chunk is the size of my L2 cache. Compare:

    : kragen@inexorable:~/devel/inexorable-misc ; time ./memstride 500000000 456 1; time ./memstride 500000000 456 1; time ./memstride 500000000 456 1
    1096492

    real	0m0.704s
    user	0m0.036s
    sys	0m0.640s ...

That’s only 1.1 million reads+writes, but it took 700ms, or 636ns each
time, dramatically slower. But wait! Only a tiny fraction of that is
user time. Most of the time seems to have been spent in the system
clearing out memory pages! If we go over the memory 1000 times
instead, we get 51ns or so per access instead:

    : kragen@inexorable:~/devel/inexorable-misc ; time ./memstride 500000000 456 1000
    1096490664

    real	0m55.586s
    user	0m53.083s
    sys	0m0.736s

But suppose instead we access this half-gigabyte sequentially, instead
of at 456-byte strides:

    : kragen@inexorable:~/devel/inexorable-misc ; time ./memstride 500000000 1 10
    4999999955

    real	0m12.715s
    user	0m11.653s
    sys	0m0.644s

Now we’re down to 2.5ns per byte.

On another machine, a largish SMP box:

    $ time ./memstride.macos 500000000 456 1000
    1096490664

    real	0m30.640s
    user	0m30.175s
    sys	0m0.459s

    $ time ./memstride.macos 500000000 1 10
    4999999955

    real	0m6.413s
    user	0m5.972s
    sys	0m0.435s

So that’s 28ns per strided access and 1.2ns per sequential byte, or 0.87ns to
clear each byte. A similar ratio.

 */

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Report the expected command line on stderr, naming the program as argv0.
   The result is always 1, so a caller can write `return usage(...)` to
   print the message and fail in one step. */
int usage(char *argv0) {
  const int failure_status = 1;

  fprintf(stderr,
          "%s: usage: %s totalsize stride nreps\n",
          argv0,
          argv0);

  return failure_status;
}

/* Parse text as a base-10 integer that is at least min and store it in *out.
   Returns 0 on success, or -1 if text has no digits, has trailing
   characters, does not fit in a long, or is smaller than min. */
static int parse_count(const char *text, long min, long *out) {
  char *end;
  long value;

  errno = 0;
  value = strtol(text, &end, 10);
  if (end == text || *end != '\0' || errno == ERANGE || value < min)
    return -1;
  *out = value;
  return 0;
}

/* Allocate totalsize bytes and sweep over them nreps times, doing a
   read-modify-write on one byte every stride bytes; sweep number i starts
   at offset i.  Prints the total number of bytes touched on stdout.

   Arguments (argv[1..3]): totalsize >= 1, stride >= 1, nreps >= 0.
   Exit status is 0 on success, 1 on a usage error or allocation failure. */
int main(int argc, char **argv) {
  long totalsize, stride, nreps;
  unsigned long size, step, reps;
  unsigned long i, j;
  char *thing;
  long long n = 0;

  if (argc != 4) return usage(argv[0]);

  /* Reject bad values up front: a zero stride would never terminate the
     inner loop, a negative stride would index before the buffer, and a
     negative size would turn into an enormous allocation request. */
  if (parse_count(argv[1], 1, &totalsize) != 0 ||
      parse_count(argv[2], 1, &stride) != 0 ||
      parse_count(argv[3], 0, &nreps) != 0)
    return usage(argv[0]);

  size = (unsigned long)totalsize;
  step = (unsigned long)stride;
  reps = (unsigned long)nreps;

  /* Any stride >= size touches at most one byte per sweep, so clamping it
     to size changes nothing about which bytes are accessed.  It does keep
     j + step below 2 * size, so the offset arithmetic below cannot
     overflow the way a signed int offset could. */
  if (step > size) step = size;

  thing = malloc(size);
  if (!thing) {
    perror("malloc");
    return 1;
  }

  for (i = 0; i < reps; i++)
    for (j = i; j < size; j += step) {
      thing[j] += 'x';  /* both reads and writes the byte */
      n++;
    }

  printf("%lld\n", n);

  free(thing);
  return 0;
}

