
/*
 *
 * fsx is a File System eXerciser.  It is a benchmark that attempts to
 * quantify the performance of several filesystem operations that have been
 * observed to be bottlenecks in I/O-intensive applications, specifically
 * the text database work done in connection with the New Oxford English
 * Dictionary Project at the University of Waterloo.
 * 
 * It performs a series of tests on a file of known size.  By default, that
 * size is 50 MB.  For each test, fsx reports the bytes processed per
 * elapsed second, per CPU second, and the % CPU usage (user and system).
 * 
 * In each case, some care is taken to keep optimizers, no matter how
 * clever, from noticing it's all bogus.  The idea is to make sure that
 * these are real transfers from user space to the physical disk.  The
 * tests are:
 * 
 * 1. Sequential Output
 * 
 * 1.1 Per-Character.  The file is written using the putc() stdio macro.
 * The loop that does the writing should be small enough to fit into any
 * reasonable I-cache.  The CPU exercised here is that required to do the
 * stdio code plus the OS file space allocation.
 * 
 * 1.2 Block.  The file is created using write(2).  The CPU consumption
 * should be just the OS file space allocation.
 * 
 * 1.3 Rewrite.  Each BUFSIZ of the file is read with read(2), dirtied, and
 * rewritten with write(2), requiring an lseek(2).  Since no space
 * allocation is done, and the I/O is well-localized, this should test the
 * effectiveness of the filesystem cache and the speed of data transfer.
 * 
 * 2. Sequential Input
 * 
 * 2.1 Per-Character.  The file is read using the getc() stdio macro.  Once
 * again, the inner loop is small.  This should exercise only stdio and
 * sequential input.
 * 
 * 2.2 Block.  The file is read using read(2).  This should be a very pure
 * test of sequential input performance.
 * 
 * 3. Random Seeks
 * 
 * This test seeks 1000 times to locations in the file chosen by random()
 * on BSD systems, or by nrand48() on System V systems.  In each case, the
 * block containing the chosen location is read with read(2).  In 10% of
 * cases, it is dirtied and written back with write(2).
 * 
 * AXIOM: For any Unix filesystem, the effective number of lseek(2) calls
 * per second declines asymptotically to near 30, once the effect of
 * caching is defeated.  (Thirty per second is roughly one random access
 * every 33 ms, which is about the seek-plus-rotation time of a typical
 * disk.)
 * 
 * The size of the file has a strong nonlinear effect on the results of
 * this test.  Many Unix systems that have the memory available will make
 * aggressive efforts to cache the whole thing, and report random I/O rates
 * in the thousands per second, which is ridiculous.  As an extreme
 * example, an IBM RISC System/6000 with 64 MB of memory reported 3,722
 * per second on a 50 MB file.
 *
 * COPYRIGHT NOTICE: 
 * Copyright (c) Tim Bray, 1990.
 * Everybody is hereby granted rights to use, copy, and modify this program, 
 *  provided only that the copyright notice above and the disclaimer below
 *  are preserved without change.
 * DISCLAIMER:
 * This program is provided AS IS with no warranty of any kind, and
 * The author makes no representation with respect to the adequacy of this
 *  program for any particular purpose or with respect to its adequacy to 
 *  produce any particular result, and
 * The author shall not be liable for loss or damage arising out of
 *  the use of this program regardless of how sustained, and
 * In no event shall the author be liable for special, direct, indirect
 *  or consequential damage, loss, costs or fees or expenses of any
 *  nature or kind.
 */
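
/*
 * Building and running: the program is self-contained, so on most systems
 *  something like the following should do (exact compiler flags and the
 *  choice of scratch directory are, of course, system-dependent):
 *
 *      cc -O -o fsx fsx.c
 *      ./fsx -d /usr/tmp -s 100
 *
 *  -d names the directory holding the scratch file; -s gives its size in
 *  megabytes (default 50).
 */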

#include <stdio.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/time.h>
#ifdef i386
#include <limits.h>
#include <sys/times.h>
#else
#include <sys/resource.h>
#endif

#define IntSize    (4)      /* bytes per int; the code assumes 4-byte ints */
#define Elapsed    (0)      /* index into delta[][]: elapsed (wall-clock) seconds */
#define CPU        (1)      /* index into delta[][]: CPU seconds */
#define Searches   (1000)   /* number of seeks in the random-I/O test */
#define UpdateSeek (10)     /* one seek in every UpdateSeek is a rewrite (10%) */

static double cpu_so_far();
static void   doseek();
static void   get_delta_t();
static void   io_error();
static void   newfile();
static void   report();
static double time_so_far();
static void   timestamp();
static void   usage();
#ifdef i386
static long   random();
#endif

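/*
 * One slot per test, in the order the tests are run; used to index the
 *  timing table below.  Binary is the random-seek test, named for the
 *  binary searches it originally performed.
 */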
typedef enum
{
  Putc,
  ReWrite,
  FastWrite,
  Getc,
  FastRead,
  Binary,
  TestCount
} tests_t;

static int    basetime;                   /* program start time, subtracted in time_so_far() */
static double delta[(int) TestCount][2];  /* [test][Elapsed or CPU] seconds */
static char * myname;                     /* argv[0], for messages */
static double last_cpustamp = 0.0;        /* CPU seconds at the last timestamp() */
static double last_timestamp = 0.0;       /* elapsed seconds at the last timestamp() */
static int    lseek_count = 0;            /* seeks performed in the random-I/O test */

main(argc, argv)
  int    argc;
  char * argv[];
{
  int    buf[BUFSIZ / IntSize];
  int    bufindex;
  int    chars[256];
  char * dir;
  int    fd;
  char   name[BUFSIZ];
  int    next;
  int    size;
  FILE * stream;
  int    words;

  myname = argv[0];
  fd = -1;
  basetime = (int) time((time_t *) NULL);
  size = 50;
  dir = ".";

  for (next = 1; next < argc - 1; next++)
    if (argv[next][0] == '-')
    { /* option? */
      if (strcmp(argv[next] + 1, "d") == 0)
	dir = argv[next + 1];
      else if (strcmp(argv[next] + 1, "s") == 0)
	size = atoi(argv[next + 1]);
      else
	usage();
      next++;
    } /* option? */
    else
      usage();

  if (size < 1)
    usage();
  size *= (1024 * 1024);
  sprintf(name, "%s/fsx.%d", dir, getpid());
  printf("File '%s', size: %d\n", name, size);
  fflush(stdout);

  /*
   * Fill up a file, writing it a char at a time with the stdio putc()
   *  facility.  This models the activity of a negligible-CPU filter
   *  and exercises kernel file space allocation.  Should be measured with
   *  disks of varying degrees of fullness.
   */
  printf("Writing with putc()...");
  newfile(name, &fd, &stream, 1);
  fflush(stdout);
  timestamp();
  for (words = 0; words < size; words++)
    if (putc(words & 0x7f, stream) == EOF)
      io_error("putc");
  
  /*
   * note that we always close the file before measuring time, in an
   *  effort to force as much of the I/O out as we can
   */
  if (fclose(stream) == EOF)
    io_error("fclose after putc");
  get_delta_t((int) Putc);
  printf("done\n");
  fflush(stdout);

  /*
   * Now read & rewrite it, a block at a time.  Change one word in each block
   *  in case somebody's being clever about rewritten dirty blocks.
   * This exercises sequential read-write performance and hence the cache,
   *  but avoids any space-allocation work.
   */
  newfile(name, &fd, &stream, 0);
  if (lseek(fd, (off_t) 0, 0) == (off_t) -1)
    io_error("lseek(2) before rewrite");
  printf("Rewriting...");
  fflush(stdout);
  timestamp();
  bufindex = 0;
  do
  { /* while we can read a block */
    if ((words = read(fd, (char *) buf, BUFSIZ)) == -1)
      io_error("read(2)");
    if (bufindex == words / IntSize)
      bufindex = 0;
    buf[bufindex++]++;
    if (lseek(fd, (off_t) -words, 1) == -1)
      io_error("relative lseek(2)");
    if (write(fd, (char *) buf, words) == -1)
      io_error("re write(2)");
  } /* while we can read a block */
  while (words == BUFSIZ);
  if (close(fd) == -1)
    io_error("close after rewrite");
  get_delta_t((int) ReWrite);
  printf("done\n");
  fflush(stdout);

  /*
   * Now rewrite the whole file from scratch, but a block at a time rather
   *  than with stdio.  Exercises space allocation a little more stringently,
   *  and by comparison with the putc test, quantifies stdio overhead a bit.
   * Once again, dirty each block.
   */
  newfile(name, &fd, &stream, 1);
  printf("Writing intelligently...");
  for (words = 0; words < BUFSIZ / IntSize; words++)
    buf[words] = 0;
  fflush(stdout);
  timestamp();
  for (words = bufindex = 0; words < (size / BUFSIZ); words++)
  { /* for each block */
    if (bufindex == (BUFSIZ/IntSize))
      bufindex = 0;
    buf[bufindex++]++;
    if (write(fd, (char *) buf, BUFSIZ) == -1)
      io_error("write(2)");
  } /* for each block */
  if (close(fd) == -1)
    io_error("close after fast write");
  get_delta_t((int) FastWrite);
  printf("done\n");
  fflush(stdout);

  /*
   * Now read them all back with getc(), exercising default character-
   *  at-a-time input.  Do a character frequency count just to fool
   *  any optimizers that may notice that they're not being used.
   */
  newfile(name, &fd, &stream, 0);
  for (words = 0; words < 256; words++)
    chars[words] = 0;
  printf("Reading with getc()...");
  fflush(stdout);
  timestamp();
  for (words = 0; words < size; words++)
  { /* for each byte */
    if ((next = getc(stream)) == EOF)
      io_error("getc(3)");
    chars[next]++;
  } /* for each byte */
  if (fclose(stream) == EOF)
    io_error("fclose after getc");
  get_delta_t((int) Getc);
  printf("done\n");
  fflush(stdout);

  /* use the frequency count */
  for (words = 0; words < 256; words++)
    sprintf((char *) buf, "%d", chars[words]);

  /*
   * Now suck it in, BUFSIZ at a time, as fast as we can.
   */
  newfile(name, &fd, &stream, 0);
  if (lseek(fd, 0L, 0) == -1)
    io_error("lseek before read");
  printf("Reading intelligently...");
  fflush(stdout);
  timestamp();
  do
    if ((words = read(fd, (char *) buf, BUFSIZ)) == -1)
      io_error("read(2)");
  while (words);
  if (close(fd) == -1)
    io_error("close after read");
  get_delta_t((int) FastRead);
  printf("done\n");
  fflush(stdout);

  /*
   * Now do some random I/O.  Originally these were binary searches, but
   *  that is well-behaved, since the first few seeks are always the same.
   *  Probably an application that was doing this kind of thing would keep
   *  its own cache of the top few levels of the tree or whatever - we're
   *  just interested in 'How much random I/O can be done?'
   */
  newfile(name, &fd, &stream, 0);
  timestamp();
  printf("Seeking...");
  fflush(stdout);
  for (lseek_count = 0; lseek_count < Searches; lseek_count++)
    doseek(random() % size, fd, ((lseek_count % UpdateSeek) == 0));
  if (close(fd) == -1)
    io_error("close after read");
  get_delta_t((int) Binary);
  printf("done\n");
  fflush(stdout);

  report(size);
  unlink(name);
}

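/*
 * report() -- print the summary: for each test, bytes (or seeks) per
 *  elapsed second, per CPU second, and the %CPU consumed.
 */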
static void
report(size)
  int   size;
{
  printf("\n");
  printf("Times reported are elapsed / cpu / %%cpu usage.\n");

  printf("\nSequential output\n");
  printf("putc() bytes/sec: %d / %d / %.1f%%\n",
    (int) (((double) size) / delta[(int) Putc][Elapsed]),
    (int) (((double) size) / delta[(int) Putc][CPU]),
    delta[(int) Putc][CPU] / delta[(int) Putc][Elapsed] * 100.0);
  printf("write() bytes/sec: %d / %d / %.1f%%\n",
    (int) (((double) size) / delta[(int) FastWrite][Elapsed]),
    (int) (((double) size) / delta[(int) FastWrite][CPU]),
    delta[(int) FastWrite][CPU] / delta[(int) FastWrite][Elapsed] * 100.0);
  printf("putc() multiplier: %.1f / %.1f\n",
    delta[(int) Putc][Elapsed] / delta[(int) FastWrite][Elapsed],
    delta[(int) Putc][CPU] / delta[(int) FastWrite][CPU]);
  printf("Sequential output time: %.1f / %.1f / %.1f%%\n\n",
    delta[(int) FastWrite][Elapsed] + delta[(int) Putc][Elapsed],
    delta[(int) FastWrite][CPU] + delta[(int) Putc][CPU],
    (delta[(int) FastWrite][CPU] + delta[(int) Putc][CPU])
    / (delta[(int) FastWrite][Elapsed] + delta[(int) Putc][Elapsed])
    * 100.0);

  printf("\nSequential input\n");
  printf("getc() bytes/sec: %d / %d / %.1f%%\n",
    (int) (((double) size) / delta[(int) Getc][Elapsed]),
    (int) (((double) size) / delta[(int) Getc][CPU]),
    delta[(int) Getc][CPU] / delta[(int) Getc][Elapsed] * 100.0);
  printf("read() bytes/sec: %d / %d / %.1f%%\n",
    (int) (((double) size) / delta[(int) FastRead][Elapsed]),
    (int) (((double) size) / delta[(int) FastRead][CPU]),
    delta[(int) FastRead][CPU] / delta[(int) FastRead][Elapsed] * 100.0);
  printf("getc() multiplier: %.1f / %.1f\n",
    delta[(int) Getc][Elapsed] / delta[(int) FastRead][Elapsed],
    delta[(int) Getc][CPU] / delta[(int) FastRead][CPU]);
  printf("Sequential input time: %.1f / %.1f / %.1f%%\n\n",
    delta[(int) Getc][Elapsed] + delta[(int) FastRead][Elapsed],
    delta[(int) Getc][CPU] + delta[(int) FastRead][CPU],
    (delta[(int) Getc][CPU] + delta[(int) FastRead][CPU])
    / (delta[(int) Getc][Elapsed] + delta[(int) FastRead][Elapsed])
    * 100.0);

  printf("\nSequential rewrite\n");
  printf("Sequential rewrite bytes/sec: %d / %d / %.1f%%\n",
    (int) (((double) size) / delta[(int) ReWrite][Elapsed]),
    (int) (((double) size) / delta[(int) ReWrite][CPU]),
    delta[(int) ReWrite][CPU] / delta[(int) ReWrite][Elapsed] * 100.0);
  printf("Sequential rewrite time: %.1f / %.1f / %.1f%%\n",
    delta[(int) ReWrite][Elapsed], delta[(int) ReWrite][CPU],
    delta[(int) ReWrite][CPU] / delta[(int) ReWrite][Elapsed] * 100.0);

  printf("\nRandom I/O\n");
  printf("Random reads/sec: %.1f / %.1f / %.1f%%\n",
    ((double) lseek_count) / delta[(int) Binary][Elapsed],
    ((double) lseek_count) / delta[(int) Binary][CPU],
    delta[(int) Binary][CPU] / delta[(int) Binary][Elapsed] * 100.0);
}

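/*
 * newfile() -- (re)open the scratch file, returning both a file descriptor
 *  and a stdio stream on that descriptor.  If create is set, any existing
 *  copy is unlinked first; a failed unlink only matters once the file has
 *  actually been created, i.e. when *fd is no longer -1.
 */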
static void
newfile(name, fd, stream, create)
  char *   name;
  int *    fd;
  FILE * * stream;
  int      create;
{
  if (create)
  { /* create from scratch */
    if (unlink(name) == -1 && *fd != -1)
      io_error("unlink");
    *fd = open(name, O_RDWR | O_CREAT | O_EXCL, 0777);
  } /* create from scratch */
  else
    *fd = open(name, O_RDWR, 0777);

  if (*fd == -1)
    io_error(name);
  *stream = fdopen(*fd, "r+");
  if (*stream == NULL)
    io_error("fdopen");
}

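/*
 * usage() -- print the command synopsis and give up.
 */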
static void
usage()
{
  fprintf(stderr, "usage: %s [-d scratch-directory] [-s size-in-megabytes]\n",
    myname);
  exit(1);
}

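/*
 * timestamp() -- record the current elapsed and CPU times, to be picked up
 *  later by get_delta_t().
 */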
static void
timestamp()
{
  last_timestamp = time_so_far();
  last_cpustamp = cpu_so_far();
}

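/*
 * get_delta_t() -- store the elapsed and CPU time consumed since the last
 *  timestamp() call into the slot for the given test.
 */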
static void 
get_delta_t(which)
  int which;
{
  delta[which][Elapsed] = time_so_far() - last_timestamp;
  delta[which][CPU] = cpu_so_far() - last_cpustamp;
}

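/*
 * cpu_so_far() -- user plus system CPU time consumed so far, in seconds:
 *  times(2) on the System V side, getrusage(2) on the BSD side.
 */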
static double 
cpu_so_far()
{
#ifdef i386
  struct tms tms;

  if (times(&tms) == -1)
    io_error("times");
  return ((double) tms.tms_utime) / ((double) CLK_TCK) +
    ((double) tms.tms_stime) / ((double) CLK_TCK);

#else
  struct rusage rusage;

  getrusage(RUSAGE_SELF, &rusage);
  return
    ((double) rusage.ru_utime.tv_sec) +
      (((double) rusage.ru_utime.tv_usec) / 1000000.0) +
        ((double) rusage.ru_stime.tv_sec) +
          (((double) rusage.ru_stime.tv_usec) / 1000000.0);
#endif
}

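/*
 * time_so_far() -- wall-clock time in seconds, from times(2) on the
 *  System V side or gettimeofday(2) on the BSD side; only differences
 *  between calls are ever used, so the origin does not matter.
 */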
static double
time_so_far()
{
#ifdef i386
  int        val;
  struct tms tms;

  if ((val = times(&tms)) == -1)
    io_error("times");

  return ((double) val) / ((double) CLK_TCK);

#else
  struct timeval tp;

  if (gettimeofday(&tp, (struct timezone *) NULL) == -1)
    io_error("gettimeofday");
  return ((double) (tp.tv_sec - basetime)) +
    (((double) tp.tv_usec) / 1000000.0);
#endif
}

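/*
 * io_error() -- report a fatal I/O error via perror() and exit.
 */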
static void
io_error(message)
  char * message;
{
  char buf[BUFSIZ];

  sprintf(buf, "%s: drastic I/O error (%s)", myname, message);
  perror(buf);
  exit(1);
}

/*
 * Do a typical-of-something random I/O.  Any serious application that
 *  has a random I/O bottleneck is going to be smart enough to operate
 *  in a page mode, and not stupidly pull individual words out at
 *  odd offsets.  To keep the cache from getting too clever, some
 *  pages must be updated; on the other hand, an application that updated
 *  every one of the many random pages it looked at is hard to imagine.
 * It would be wrong to make the update percentage a parameter, though -
 *  the effect is too nonlinear.  What is really needed is a profile of
 *  what Oracle or Ingres or some such actually does.
 * Be warned - there is a *sharp* elbow in this curve: on a 1 MB file,
 *  most substantial Unix systems show >2000 random I/Os per second -
 *  obviously they have cached the whole thing and are just doing buffer
 *  copies.
 */
static void 
doseek(where, fd, update)
  long where;
  int  fd;
  int  update;
{
  int  buf[BUFSIZ / IntSize];
  long probe;
  int  size;

  probe = (where / BUFSIZ) * BUFSIZ;
  if (lseek(fd, probe, 0) != probe)
    io_error("lseek in doseek");
  if ((size = read(fd, (char *) buf, BUFSIZ)) == -1)
    io_error("read in doseek");

  /* every so often, update a block */
  if (update)
  { /* update this block */

    /* touch a word */
    buf[((int) random() % (size/IntSize - 2)) + 1]--;
    if (lseek(fd, (long) probe, 0) != probe)
      io_error("lseek in doseek update");
    if (write(fd, (char *) buf, size) == -1)
      io_error("write in doseek");
  } /* update this block */
}
  
#ifdef i386
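/*
 * These System V targets lack random(3); fake it with nrand48(3), using
 *  six bytes of text as the 48-bit seed.
 */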
static char randseed[] = "ioioio";

static long
random()
{
  return nrand48((unsigned short *) randseed);
}
#endif