Contrary to popular belief, you probably do not need a parser to accomplish at least the first part of what you've asked for. The only possible exception is for counting "useful strings" -- since I'm not sure what that means, I can't say what you need to do to count them.
To count if
statements, for
statements and comments, you mostly need a lexer, not a parser. Technically, you could need a preprocessor as well, but my guess is that if a macro expands to a for
loop (for example) you probably don't want to count it anyway.
Parsing C is difficult and C++ considerably more so. Fortunately, lexing them (at least close to) properly is a lot easier. It's still harder than with most languages (at least if you want to deal correctly with trigraphs and digraphs), but still the sort of thing you can write in a day or two at most:
/* get_src.h */
#ifndef GET_SRC_INCLUDED
#define GET_SRC_INCLUDED
#include <stdio.h>
#ifdef __cplusplus
extern "C" {
#endif
/* This is the size of the largest token we'll attempt to deal with. If
* you want to deal with bigger tokens, change this, and recompile
* get_src.c. Note that an entire comment is treated as a single token,
* so long comments could overflow this. In case of an overflow, the
* entire comment will be read as a single token, but the part larger
* than this will not be stored.
*/
#define MAX_TOKEN_SIZE 8192
/* `last_token' will contain the text of the most recently read token.
*/
extern char last_token[];
/* This is the maximum number of characters that can be put back into a
* file opened with parse_fopen or parse_fdopen.
*/
#define MAX_UNGETS 5
#include <limits.h>
#include <stdio.h>
typedef struct {
FILE *file;
char peeks[MAX_UNGETS];
int last_peek;
} PFILE;
/* Some codes we return to indicate having found various items in the
* source code. ERROR is returned to indicate a newline found in the
* middle of a character or string literal or if a file ends inside a
* comment, or if a character literal contains more than two characters.
*
* Note that this starts at INT_MIN, the most negative number available
* in an int. This keeps these symbols from conflicting with any
* characters read from the file. However, one of these could
* theoretically conflict with EOF. EOF usually -1, and these are far
* more negative than that. However, officially EOF can be any value
* less than 0...
*/
enum {
ERROR = INT_MIN,
COMMENT,
CHAR_LIT,
STR_LIT,
IDENT, /* This is for any word except those below... */
CASE,
DEFAULT,
DO,
ELSE,
IF,
SWITCH,
WHILE,
INCLUDE,
DEFINE
};
/* Opens a file for parsing and returns a pointer to a structure which
* can be passed to the other functions in the parser/lexer to identify
* the file being worked with.
*/
PFILE *parse_fopen(char const *name);
/* This corresponds closely to fdopen - it takes a FILE * as its
* only parameter, creates a PFILE structure identifying that file, and
* returns a pointer to that structure.
*/
PFILE *parse_fdopen(FILE *stream);
/* Corresponds to fclose.
*/
int parse_fclose(PFILE *stream);
/* returns characters from `stream' read as C source code. String
* literals, characters literals and comments are each returned as a
* single code from those above. All strings of any kind of whitespace
* are returned as a single space character.
*/
int get_source(PFILE *stream);
/* As above, but adds classification of some C keywords as well. The
* keywords recognized are mostly for flow control and are those listed
* in the enumeration above, following IDENT. All identifiers and
* unrecognized keywords are returned as IDENT.
*/
int get_token(PFILE *stream);
/* If called with a value of 0, turns off recognition of C++ digraphs
* and single line comments. If called with a non-zero value, turns
* on recognition of same. Default is 1.
*/
void read_CPP(int status);
/* Basically, these two work just like the normal versions of the same,
* with the minor exception that unget_character can unget more than one
* character.
*/
int get_character(PFILE *stream);
void unget_character(int ch, PFILE *stream);
#ifdef __cplusplus
}
#endif
#endif
Then the implementation:
/* get_src.c */
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#define GET_SOURCE
#include "get_src.h"
/* These are the keywords we recognize - those that involve flow control.
* recognition of all keywords simply involves adding them to this list,
* and adding matching identifiers to the enumeration in get_src.h. The
* matching enumerators for these keywords start after IDENT in the
* enumeration, but MUST be maintained in the same order as the keywords
* appear here. E.g. if "case" remains the first keyword here, `CASE'
* must follow immediately after IDENT in the enumeration. Any types
* added to the enumeration that do not have matching keywords should
* precede `IDENT'.
*/
static char *keys[] = {
"case",
"default",
"do",
"else",
"if",
"switch",
"while",
"#include",
"#define"
};
#define elems(x) (sizeof(x) / sizeof(x[0]))
static size_t current = 0;
static int CPP = 1;
char last_token[MAX_TOKEN_SIZE];
PFILE *parse_fopen(char const *name) {
PFILE *temp = malloc(sizeof(PFILE));
if ( NULL != temp ) {
temp->file = fopen(name, "r");
memset(temp->peeks, 0, sizeof(temp->peeks));
temp->last_peek = 0;
}
return temp;
}
PFILE *parse_fdopen(FILE *file) {
PFILE *temp = malloc(sizeof(PFILE));
if ( NULL != temp) {
temp->file = file;
memset(temp->peeks, 0, sizeof(temp->peeks));
temp->last_peek = 0;
}
return temp;
}
int parse_fclose(PFILE *stream) {
int retval = fclose(stream->file);
free(stream);
return retval;
}
/* If this is called with a non-zero value, we deal with C++ digraphs and
* single line comments. If it's called with 0, it turns off
* recognition of digraphs and single line comments.
*/
void read_CPP(int status) { CPP = status; }
static void addchar(int ch) {
/* adds the passed character to the end of `last_token' */
if ( current < sizeof(last_token) -1 )
last_token[current++] = (char)ch;
if ( current == sizeof(last_token)-1 )
last_token[current] = '\0';
}
static void clear(void) {
/* clears the previous token and starts building a new one. */
current = 0;
}
static int read_char(PFILE *stream) {
if ( stream->last_peek > 0 )
return stream->peeks[--stream->last_peek];
return fgetc(stream->file);
}
void unget_character(int ch, PFILE * stream) {
if ( stream->last_peek < sizeof(stream->peeks) )
stream->peeks[stream->last_peek++] = ch;
}
/* Here's where we start getting into sort of sophisticated stuff.
*/
static int check_trigraph(PFILE *stream) {
/* Checks for trigraphs and returns the equivalant character if there
* is one. Expects that the leading '?' of the trigraph has already
* been read before this is called.
*/
int ch;
if ( '?' != (ch=read_char(stream))) {
unget_character(ch, stream);
return '?';
}
ch = read_char(stream);
switch( ch ) {
case '(': return '[';
case ')': return ']';
case '/': return '\\';
case '\'': return '^';
case '<': return '{';
case '>': return '}';
case '!': return '|';
case '-': return '~';
case '=': return '#';
default:
unget_character('?', stream);
unget_character(ch, stream);
return '?';
}
}
static int check_digraph(PFILE *stream, int first) {
/* Checks for a digraph. The first character of the digraph is
* transmitted as the second parameter, as there are several possible
* first characters of a digraph.
*/
int ch = read_char(stream);
switch(first) {
case '<':
if ( '%' == ch )
return '{';
if ( ':' == ch )
return '[';
break;
case ':':
if ( '>' == ch )
return ']';
break;
case '%':
if ( '>' == ch )
return '}';
if ( ':' == ch )
return '#';
break;
}
/* If it's not one of the specific combos above, return the characters
* separately and unchanged by putting the second one back into the
* stream, and returning the first one as-is.
*/
unget_character(ch, stream);
return first;
}
static int get_char(PFILE *stream) {
/* Gets a single character from the stream with any trigraphs ( and if
* C++ support is turned on, digraphs ) converted to the single character
* represented.
*/
int ch = read_char(stream);
if ( ch == '?' )
return check_trigraph(stream);
if ( CPP && ( ch == '<' || ch == ':' || ch == '%' ))
return check_digraph(stream, ch);
return ch;
}
int get_character(PFILE *stream) {
/* get's a chacter from `stream'. Any amount of any kind of whitespace
* is returned as a single space.
*/
int ch;
if ( !isspace(ch=get_char(stream)))
return ch;
/* If it's a space, skip over consecutive white-space */
while ( isspace(ch) && '\n' != ch )
ch = get_char(stream);
if ( '\n' == ch )
return ch;
/* Then put the non-ws character back */
unget_character(ch, stream);
/* and return a single space character... */
return ' ';
}
static int skip_char_lit(PFILE *stream) {
/* This is used internally by `get_source' (below) - it expects the
* opening quote of a character literal to have already been read and
* returns CHAR_LIT or ERROR if there's a newline before a close
* quote is found, or if the character literal contains more than two
* characters after escapes are taken into account.
*/
int ch;
int i;
clear();
addchar('\'');
for (i=0; i<2 && ('\'' != ( ch = read_char(stream))); i++) {
addchar(ch);
if ( ch == '\n' )
return ERROR;
if (ch == '\\' ) {
ch = get_char(stream);
addchar(ch);
}
}
addchar('\'');
addchar('\0');
if ( i > 2 )
return ERROR;
return CHAR_LIT;
}
static int skip_str_lit(PFILE *stream) {
/* Used internally by get_source. Expects the opening quote of a string
* literal to have already been read. Returns STR_LIT or ERROR if a
* unescaped newline is found before the close quote.
*/
int ch;
clear();
addchar('"');
while ( '"' != ( ch = get_char(stream))) {
if ( '\n' == ch || EOF == ch )
return ERROR;
addchar(ch);
if( ch == '\\' ) {
ch = read_char(stream);
addchar(ch);
}
}
addchar('"');
addchar('\0');
return STR_LIT;
}
static int skip_comment(PFILE *stream) {
/* Skips over a comment in stream. Assumes the leading '/' has already
* been read and skips over the body. If we're reading C++ source, skips
* C++ single line comments as well as normal C comments.
*/
int ch;
clear();
ch = get_char(stream);
/* Handle a C++ single line comment.
*/
if ( CPP && '/' == ch ) {
addchar('/');
addchar('/');
while ( '\n' != ( ch = get_char(stream))) {
if ( '\\' == ch ) {
/* We might have an escaped newline inside the C++
* comment - simply treat the next character as part
* of the comment whether it's a newline or not.
*/
addchar(ch);
addchar(get_char(stream));
}
else
addchar(ch);
}
addchar('\0');
return COMMENT;
}
if ('*' != ch ) {
unget_character(ch, stream);
return '/';
}
addchar('/');
do {
addchar(ch);
while ( '*' != ( ch = get_char(stream)))
if ( EOF == ch )
return ERROR;
else
addchar(ch);
addchar(ch);
} while ( '/' != (ch=get_char(stream)));
addchar('/');
addchar('\0');
return COMMENT;
}
int get_source(PFILE *stream) {
/* reads and returns a single "item" from the stream. An "item" is a
* comment, a literal or a single character after trigraph and possible
* digraph substitution has taken place.
*/
int ch = get_character(stream);
switch(ch) {
case '\'':
return skip_char_lit(stream);
case '"':
return skip_str_lit(stream);
case '/':
return skip_comment(stream);
default:
return ch;
}
}
int get_token(PFILE *stream) {
/* This gets a single token from the input stream and places the text
* of the token in last_token, and returns an identifier of the type of
* the token. Only flow control keywords are recognized individually.
* All other keywords are simply returned as IDENT's, just like other
* identifiers.
*/
int ch;
int i;
ch = get_source(stream);
/* If we've got an identifier, read as many characters as can
* possibly constitute the identifier ( maximal munch ) and build
* up the complete identifier in `last_token'
*/
if ( ch > 0 && ('_' == ch || isalpha(ch))) {
clear();
while(ch > 0 && (isalpha(ch) || isdigit(ch) || '_' == ch )) {
addchar(ch);
ch = get_source(stream);
}
unget_character(ch,stream);
addchar('\0');
/* Now we look in our table to see if we've got a keyword
* we recognize, or some random identifier.
*/
for (i=0;i<elems(keys);i++) {
if ( 0 == strcmp(last_token, keys[i]))
return IDENT+i+1;
}
/* we didn't recognize it - it must be a normal identifier. */
return IDENT;
}
/* it's not an identifier - just return it as a character. */
return ch;
}
With those, it becomes a fairly trivial matter of calling get_token
repeatedly to get tokens, and counting how many times you get IF, COMMENT or (probably) STR_LIT.