tags:

views:

149

answers:

3

What is the fastest way to count lines and words in a text file in pure ANSI C?

A word is terminated by a space or a period. A line is terminated by '\n'.

This seems to be in C++.

+2  A: 
  • Read the file in
  • Iterate over the characters, incrementing the character counter for each one
  • If the character is a space or an end of line, increment the word counter
  • Repeat second and third steps until EOF
Alan Haggai Alavi
That kind of approach is widely available on the net.
JMSA
JMSA: Yes, but it is also (almost, except for buffering) the approach that is used in the C++ example you link to, so it sounds like what you want.
gspr
+1  A: 

Here is an explicit answer that counts the number of lines (extension to the number of words is trivial à la the C++ version linked to in OP). This version is buffered. Another answer suggests reading the entire file in first, which is simpler, but the below is more in line with what your C++ example does.

#include <stdio.h>
#include <string.h>

/* Size of the chunk buffer handed to fgets. Lines longer than
   BUFSIZE-1 characters are read in several chunks. */
#define BUFSIZE 1024

/* Count the lines of the file named by argv[1] and print the result.

   A line is a run of characters ended by '\n'. A final run of characters
   that is not followed by '\n' is counted as one more line.

   Returns 0 on success, 1 when the argument count is wrong, the file
   cannot be opened, or a read error occurs. */
int main(int argc, char** argv)
{
  unsigned long lines = 0;
  int pending = 0;   /* non-zero while the last chunk read had no '\n' */
  char buf[BUFSIZE];
  FILE* file;

  if (argc != 2)
    return 1;

  file = fopen(argv[1], "r");
  if (file == NULL)
  {
    /* Without this check fgets would be called with a null stream. */
    perror(argv[1]);
    return 1;
  }

  while (fgets(buf, sizeof buf, file))
  {
    size_t len = strlen(buf);

    if (len > 0 && buf[len - 1] == '\n')
    {
      /* The chunk completes a line, however many chunks it took. */
      lines++;
      pending = 0;
    }
    else
    {
      /* Either a long line continues in the next chunk, or this is an
         unterminated last line; decided once the loop ends. */
      pending = 1;
    }
  }

  if (ferror(file))
  {
    perror(argv[1]);
    fclose(file);
    return 1;
  }
  fclose(file);

  /* Trailing text without a final '\n' still forms a line. */
  if (pending)
    lines++;

  printf("Number of lines in %s: %lu\n", argv[1], lines);

  return 0;
}

The BUFSIZE macro can be tweaked to maximize performance (since you say you want the fastest way). 1024 is simply a guess. Another possibility is probably to memory-map the file, but I didn't try that since mmap is not ANSI C.

gspr
+2  A: 

Maybe take a look at the source code of the GNU wc utility as this utility does exactly what you want.

#include <ctype.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

/* Unsigned integer type used for every counter in this program. */
typedef unsigned long count_t;

/* Counters for the current file: characters, words, lines. */
count_t ccount, wcount, lcount;

/* Totals over all files: characters, words, lines. */
count_t total_ccount = 0, total_wcount = 0, total_lcount = 0;

/* Write the formatted message described by FMT and AP to stderr, then
   terminate the program with exit status 1. When PERR is non-zero the
   message is followed by the text perror produces for the current errno;
   otherwise it is followed by a newline. This function does not return. */
static void
error_print (int perr, char *fmt, va_list ap)
{
  vfprintf (stderr, fmt, ap);

  if (!perr)
    fputs ("\n", stderr);
  else
    perror (" ");

  exit (1);
}

/* Report a printf-style error message on stderr and exit with a
   failure status. No errno text is appended. */
static void
errf (char *fmt, ...)
{
  va_list args;

  va_start (args, fmt);
  /* First argument 0: plain message, no errno description. */
  error_print (0, fmt, args);
  va_end (args);
}

/* Report a printf-style error message on stderr, followed by the
   description of the current errno value, and exit with a failure
   status. */
static void
perrf (char *fmt, ...)
{
  va_list args;

  va_start (args, fmt);
  /* First argument 1: append the errno description. */
  error_print (1, fmt, args);
  va_end (args);
}

/* Print one result line for FILE on standard output: the line, word and
   character counts (in that order, each right-aligned in at least six
   columns) followed by the file name. */
void
report (char *file, count_t ccount, count_t wcount, count_t lcount)
{
  fprintf (stdout, "%6lu %6lu %6lu %s\n", lcount, wcount, ccount, file);
}

/* Return 1 if C is a valid word constituent, 0 otherwise.
   A word constituent is any character for which isalpha (declared in
   <ctype.h>) reports true. The parameter is an unsigned char, so the
   value handed to isalpha is always in the range it accepts. The result
   is normalised to exactly 0 or 1, because isalpha only promises
   "non-zero" for a match. */
static int
isword (unsigned char c)
{
  return isalpha (c) != 0;
}

/* Increase character and, if necessary, line counters.
   Adds one to ccount and, when C is '\n', one to lcount. C is evaluated
   exactly once. The body is wrapped in do { } while (0) so that
   `COUNT (c);' behaves as a single statement, including when it is the
   unbraced body of an if/else or a loop. */
#define COUNT(c)           \
  do                       \
    {                      \
      ccount++;            \
      if ((c) == '\n')     \
        lcount++;          \
    }                      \
  while (0)

/* Get next word from the input stream FP, updating the global
   counters ccount, wcount and lcount as characters are consumed.
   Return 0 on end of file or error condition. Return 1 otherwise. */
int
getword (FILE *fp)
{
  int c;

  if (feof (fp))
    return 0;

  /* Phase 1: consume and count characters up to the first word
     constituent. That character bumps the word counter but is not
     counted as a character here; phase 2 counts it. */
  while ((c = getc (fp)) != EOF)
    {
      if (isword (c))
        {
          wcount++;
          break;
        }
      COUNT (c);
    }

  /* Phase 2: count the word's characters, starting with the one that
     ended phase 1, and stop after counting the first character that is
     not a word constituent. Skipped entirely if phase 1 hit EOF. */
  for (; c != EOF; c = getc (fp))
    {
      COUNT (c);
      if (!isword (c))
        break;
    }

  return c != EOF;
}

/* Count the characters, words and lines of the file named FILE, print
   its counts, and add them to the running totals. If the file cannot be
   opened, an error is reported through perrf. */
void
counter (char *file)
{
  FILE *stream;

  stream = fopen (file, "r");
  if (stream == NULL)
    perrf ("cannot open file `%s'", file);

  /* Start this file from zero. */
  ccount = 0;
  wcount = 0;
  lcount = 0;

  /* Consume the stream one word at a time until getword reports
     that nothing is left. */
  for (;;)
    {
      if (!getword (stream))
        break;
    }
  fclose (stream);

  report (file, ccount, wcount, lcount);

  total_ccount = total_ccount + ccount;
  total_wcount = total_wcount + wcount;
  total_lcount = total_lcount + lcount;
}

/* Entry point: process every file named on the command line in order.
   With no file arguments a usage message is reported through errf.
   When more than one file is given, a final "total" line is printed. */
int
main (int argc, char **argv)
{
  int arg = 1;

  if (argc < 2)
    errf ("usage: wc FILE [FILE...]");

  while (arg < argc)
    {
      counter (argv[arg]);
      arg++;
    }

  /* A summary line only makes sense for two or more files. */
  if (argc > 2)
    report ("total", total_ccount, total_wcount, total_lcount);

  return 0;
}

Found at: http://www.gnu.org/software/cflow/manual/html_node/Source-of-wc-command.html

Gary Willoughby