tags:

views:

89

answers:

3
#include<stdio.h>
#include<ctype.h>
#include<string.h>

/* This is a lexer which recognizes constants, variables, symbols, identifiers, functions, comments and also header files. It stores the lexemes in 3 different files: one file contains all the headers and the comments, another contains all the variables, and another contains all the symbols. */

/*
 * Reads source.txt one character at a time and writes a description of each
 * recognised token to lext.txt. Letters are accumulated in d; a space or a
 * newline triggers a comparison of d against a fixed list of keywords, and
 * `count` (printed at the end) is incremented for every keyword branch except
 * "continue".
 *
 * NOTE(review): this is the code under discussion in the question, kept as
 * posted. Problems visible in the code:
 *   - the braces do not balance as posted: one closing brace appears to be
 *     missing, so fclose()/return end up inside the while loop's braces;
 *   - when fopen() fails only perror() is called (the return is commented
 *     out), so the loop still runs on a NULL stream; fp2 is never checked;
 *   - every symbol case resets i to 0, which discards the word collected in d
 *     before it has been compared (e.g. "a" followed by "+");
 *   - the terminator is written to d[i+1], not d[i];
 *   - the first keyword is compared against "if " (trailing space), but spaces
 *     are never stored in d;
 *   - digits are not alphabetic, so they go to the switch, match no case and
 *     are never stored in d;
 *   - j, b and c are never used for anything.
 */
int main()
{
    int i=0,j,k,count=0;   /* i: next write position in d; count: keywords matched */
    char a,b[100],c[10000],d[100];   /* a: current character; d: word being collected */
  memset ( d, 0, 100 );
    j=30;



    FILE *fp1,*fp2;


    fp1=fopen("source.txt","r"); //the source file is opened in read only mode which will passed through the lexer
    fp2=fopen("lext.txt","w");
    //now lets remove all the white spaces and store the rest of the words in a file


    if(fp1==NULL)
    {
        perror("failed to open source.txt");
        //return EXIT_FAILURE;
        /* NOTE(review): execution continues with fp1 == NULL */
    }
    i=0;
    k=0;
    /* NOTE(review): feof() only becomes true after a read has failed, so the
       body also runs once for the EOF value returned by fgetc(); that value is
       additionally narrowed to char by the assignment to a. */
    while(!feof(fp1))
    {


        a=fgetc(fp1);


        if(a!=' '&&a!='\n')
        {
                if (!isalpha(a))
                    {

                    /* Symbols: write the token name and restart the word buffer index. */
                    switch(a)
                        {

                        case '+':{fprintf(fp2,"+ ---->  PLUS \n");
                                i=0;break;}
                        case '-':{fprintf(fp2,"- ---> MINUS \n");
                                i=0;break;}
                        case '*':{fprintf(fp2,  "* --->MULT \n");
                                i=0;break;}
                        case '/':{fprintf(fp2,  "/ --->DIV \n");
                                i=0;break;}
                        //case '+=':fprintf(fp2, "%.20s\n", "ADD_ASSIGN");
                        //case '-=':fprintf(fp2, "%.20s\n", "SUB_ASSIGN");
                        case '=':{fprintf(fp2,  "= ---> ASSIGN \n");
                                i=0;break;}
                        /* NOTE(review): a bare '%' in a printf format string starts a
                           conversion specification; a literal percent sign is "%%". */
                        case '%':{fprintf(fp2,  "% ---> MOD \n");
                                i=0;break;}
                        case '<':{fprintf(fp2,  "< ---> LESSER_THAN \n");
                                i=0;break;}
                        case '>':{fprintf(fp2,  "> --> GREATER_THAN \n");
                                i=0;break;}
                        //case '++':fprintf(fp2, "%.20s\n", "INCREMENT");
                        //case '--':fprintf(fp2, "%.20s\n", "DECREMENT");
                        //case '==':fprintf(fp2, "%.20s\n", "ASSIGNMENT");
                        case ';':{fprintf(fp2,  "; --->SEMI_COLUMN \n");
                                i=0;break;}
                        case ':':{fprintf(fp2,  ": --->COLUMN \n");
                                i=0;break;}
                        case '(':{fprintf(fp2,  "( --->LPAR \n");
                                i=0;break;}
                        case ')':{fprintf(fp2,  ") --->RPAR \n");
                                i=0;break;}
                        case '{':{fprintf(fp2,  "{ --->LBRACE \n");
                                i=0;break;}
                        case '}':{fprintf(fp2,  "} ---> RBRACE \n");
                                i=0;break;}
                        }
                    }
            else
                {

                    /* Letter: append to the current word. There is no check that i
                       stays below the size of d (100). */
                    d[i]=a;
                    //printf("%c\n",d[i]);
                    i=i+1;


                }
                        //}
                        /* we can make the lexer more complex by including even more depths of checks for the symbols*/









        }
        else
            {
            /* Space or newline: the collected word is terminated and compared
               against the keyword list. */


            /* NOTE(review): writes one position past the end of the word; d[i]
               itself keeps whatever it held before. */
            d[i+1]='\0';


        printf("\n");

            /* Only this first branch clears d after a match. */
            if((strcmp(d,"if ")==0)){fprintf(fp2,"if ---->  IDENTIFIER \n");
                        //printf("%s \n",d);
                         memset ( d, 0, 100 );
                        //printf("%s \n",d);
                        count=count+1;}

            else if(strcmp(d,"then")==0){fprintf(fp2,"then ---->  IDENTIFIER \n");
                        count=count+1;}

                else if(strcmp(d,"else")==0){fprintf(fp2,"else ----> IDENTIFIER  \n");
                            count=count+1;}

                else if(strcmp(d,"switch")==0){fprintf(fp2,"switch ----> IDENTIFIER  \n");
                            count=count+1;}
                else if(strcmp(d,"printf")==0){fprintf(fp2,"prtintf ----> IDENTIFIER  \n");
                            count=count+1;}
                else if(strcmp(d,"scanf")==0){fprintf(fp2,"scanf ----> IDENTIFIER  \n");
                            count=count+1;}
                else if(strcmp(d,"NULL")==0){fprintf(fp2,"NULL ----> IDENTIFIER  \n");
                            count=count+1;}
                else if(strcmp(d,"int")==0){fprintf(fp2,"INT ----> IDENTIFIER  \n");
                            count=count+1;}
                else if(strcmp(d,"char")==0){fprintf(fp2,"char ----> IDENTIFIER  \n");
                            count=count+1;}
                else if(strcmp(d,"float")==0){fprintf(fp2,"float ----> IDENTIFIER  \n");
                            count=count+1;}
                else if(strcmp(d,"long")==0){fprintf(fp2,"long ----> IDENTIFIER \n");
                            count=count+1;}
                else if(strcmp(d,"double")==0){fprintf(fp2,"double ----> IDENTIFIER \n");
                            count=count+1;}
                else if(strcmp(d,"const")==0){fprintf(fp2,"const ----> IDENTIFIER \n");
                            count=count+1;}
                /* This branch does not increment count. */
                else if(strcmp(d,"continue")==0)fprintf(fp2,"continue ----> IDENTIFIER \n");

                else if(strcmp(d,"size of")==0){fprintf(fp2,"size of ----> IDENTIFIER \n");
                            count=count+1;}
                else if(strcmp(d,"register")==0){fprintf(fp2,"register ----> IDENTIFIER \n");
                            count=count+1;}
                else if(strcmp(d,"short")==0){fprintf(fp2,"short ----> IDENTIFIER \n");
                            count=count+1;}
            else if(strcmp(d,"auto")==0){fprintf(fp2,"auto ----> IDENTIFIER \n");
                            count=count+1;}
                else if(strcmp(d,"while")==0){fprintf(fp2,"while ----> IDENTIFIER \n");
                            count=count+1;}
                else if(strcmp(d,"do")==0){fprintf(fp2,"do ----> IDENTIFIER \n");
                            count=count+1;}
                else if(strcmp(d,"case")==0){fprintf(fp2,"case ----> IDENTIFIER \n");
                            count=count+1;}
        /* NOTE(review): d[i] is the position after the last stored letter, not
           the first character of the word. */
        else if (isdigit(d[i]))
            {
                fprintf(fp2,"%s ---->NUMBER",d);
            }
        /* NOTE(review): in this branch a is a space or a newline, so isalpha(a)
           cannot be true here; the "Variable" output and the i=0 reset below are
           therefore never reached. */
        else if (isalpha(a))
            {
                fprintf(fp2,"%s ----> Variable",d);
                //printf("%s",d);
                // memset ( d, 0, 100 );}
                //fprintf(fp2, "s\n", b);
                i=0;
        k=k+1;

                continue;
            }

        /* i is advanced here rather than reset, so the next word is appended
           after the terminator position instead of starting at d[0]. */
        i=i+1;
    k=k+1;


    }
fclose(fp1);
fclose(fp2);
printf("%d",count);
return 0;
}

In this code, my source.txt contains `if (a+b)`. But only `(`, `+` and `)` are getting written to lext.txt, not the identifier `if` or the variables `a` and `b`. Is there any particular reason why?

+1  A: 

The comparison for the "if" keyword is comparing against "if " (with a space), but the code does not copy the space into the buffer d. Your best bet is to step through it with a debugger and see what is happening.

Mark Wilkins
+1  A: 

There are quite a few problems:

Once you find a space or a newline you try and compare what is in string d using a series of strcmp. But you are resetting the value of i in the body of else if (isalpha(a)) which will never execute because a is space or newline. You should unconditionally set the value of i to 0 right after

d[i+1]='\0';

Spaces in the string matter, so

if((strcmp(d,"if ")==0)) // d will never have a space as you never stuff it with one.

should be

if((strcmp(d,"if")==0))
codaddict
A: 
/*
ref: http://msdn2.microsoft.com/en-us/library/y39145bk(vs.80).aspx

C language Tokens
token: 
keyword 
identifier 
constant 
string-literal 
operator 
punctuator


operator: one of 
[ ]   ( )   .   ->   ++   --   &   *   +   -   ~   !   sizeof   /   %   <<   >>   <   >   <=   >=   ==   !=   ^   |   &&   ||   ?   :   =   *=   /=   %=   +=   -=   <<=   >>=   &=   ^=   |=   ,   #   ##

assignment-operator: one of
=   *=   /=   %=   +=   -=   <<=   >>=   &=   ^=   |=

punctuator: one of
[ ]   ( )   { }   *   ,   :   =   ;   ...   #




 * This is a generalized program that works for any input C program.
 * It fails to recognise stdio.h in "#include<stdio.h>" as one token.
 * Instead it identifies stdio as an Identifier, . as the dot operator, and h as an Identifier.
 * This is because I treated . (dot) as a separator between the tokens of structure variables.

ex:
struct book
{
int price;
}b1;

b1.price;  //here dot is used as seperator b/w b1 token and price token.........

This program is not exactly correct.
It parses neither the comments nor the header files.

If at first you get an error saying "Abnormal program termination",
simply reduce the TableSize macro value, because TC cannot allocate more global memory.
*/


#include<stdio.h>
#include<string.h>
#include<conio.h>

//===============================================================================================

//Boolean constants used as the return values of FileOpen
#define true 1
#define false 0

//Simple macros for moving forward and backward in the file.
//Both expand to an fseek on the file-level stream fp relative to the current
//position (SEEK_CUR); units is the distance to move.
#define MoveFront(units)  fseek(fp,+(units),SEEK_CUR)
#define MoveBack(units)   fseek(fp,-(units),SEEK_CUR)

//Token table size: number of entries in the global tokenTable array
#define TableSize       9144

//===============================================================================================

//Opens the file at path for reading into the global stream fp.
//Returns true on success, false if the file could not be opened.
int FileOpen(const char * path);

//Core function which splits the given program into tokens.
//Fills tokenTable and returns non-zero while the table still has room.
int CreateTokens();

//Forms the token string in the global buf from the starting (start) file
//position and the ending (end) file position of the token.
void GetToken(int start,int end);

//Decides whether the token currently in buf is a keyword, identifier,
//preprocessor directive, string literal or numeric constant, and stores the
//result in tokenTable[index].type.
void IdentifyTokenType(int index);

//Returns 1 once the end-of-file indicator is set on fp, otherwise 0.
int EOFReached();
//===============================================================================================
//C keywords recognised by IdentifyTokenType, which compares each token
//against this list in order with strcmp.
//Keywords are arranged so that the most frequently used ones come first,
//so that the comparison between retrieved tokens and keywords finishes sooner.
const char *keywords[]=
{
    "int",      "char",     "double",   "float",
    "if",       "else",     "for",      "while",
    "return",   "switch",   "case",     "break",
    "do",       "default",  "void",     "struct",
    "long",     "const",    "static",   "union",
    "enum",     "register", "short",    "unsigned",
    "continue", "goto",     "sizeof",   "signed",
    "auto",     "volatile", "typedef",  "extern",
};
//===============================================================================================
//Preprocessor directives recognised by IdentifyTokenType (exact strcmp match
//against the whole token, including the leading '#').
const char *preprocess[]=
{
    "#define", "#include",   // the rest of the preprocessor directives can be added here too
};

//===============================================================================================
//One entry per token found in the input file. The token text itself is not
//stored; GetToken re-reads it from the file using start and end.
struct TokenEntry
{
    int start,  //file position (from ftell) of the token's first character
        end;    //file position one past the token's last character
    char * type;    //Name of the token type: a DelimTable type name, "Unknown", or a name set by IdentifyTokenType
}tokenTable[TableSize]; //global token table; TableSize is limited by what TC can allocate

//===============================================================================================
//Pairs a delimiter/operator spelling with the type name reported for it.
struct DelimEntry
{
    char * delim;   //delimiter text, e.g. "[", "{", "<<="
    char * type;    //delimiter's name, e.g. "LSquare", "LBrace", "LeftShiftEquals"
};

//===============================================================================================
//These are the delimiters in the C language (not all of them are strictly called delimiters).
//The table is grouped into four sets and the ORDER MATTERS: CreateTokens
//addresses the sets by hard-coded index ranges (0-11, 12-23, 24-42, 43-44),
//so adding, removing or moving an entry requires updating those loops.
struct DelimEntry const DelimTable[]=
{
    //Single Character Delimiters   //Set 1 (indices 0-11)
    {"[","LSquare"},                //0
    {"]","RSquare"},                //1
    {"(","LParen"},
    {")","RParen"},
    {"{","LBrace"},
    {"}","RBrace"},
    {",","Comma"},
    {";","SemiColon"},
    {":","Colon"},
    {"?","QuestionMark"},
    {"~","BitwiseNOT"},
    {".","Dot"},                    //11

    //Single characters that may begin a two- or three-character delimiter   //Set 2 (indices 12-23)
    //Whenever you encounter a Set 2 character
    //you cannot yet conclude that it is a complete token;
    //you need to check its next character too.
    //Similarly, whenever you encounter a Set 3 sequence
    //you need to check its next character too.
    //
    {"<","LessThan"},               //12
    {">","GreaterThan"},
    {"&","BitwiseAND"},
    {"|","BitwiseOR"},
    {"^","XOR"},
    {"=","Assignment"},
    {"!","Not"},
    {"%","Remainder"},
    {"-","Minus"},
    {"+","Plus"},
    {"*","Multiply"},
    {"/","DividedBy"},          //23

    //Two-character delimiters   //Set 3 (indices 24-42)
    {"<<","LeftShift"},         //24
    {"<=","LessThanOrEqual"},
    {">>","RightShift"},
    {">=","GreaterThanOrEqual"},
    {"&&","LogicalAND"},
    {"&=","BitwiseANDEqual"},
    {"||","LogicalOR"},
    {"|=","BitwiseOREqual"},
    {"^=","XOREqual"},
    {"==","LogicalEqual"},
    {"!=","LogicalNotEqual"},
    {"%=","RemainderEquals"},
    {"-=","MinusEquals"},
    {"->","PointerArrow"},
    {"--","DecrementOperator"},
    {"++","IncrementOperator"},
    {"+=","PlusEquals"},
    {"*=","MultiplyEquals"},
    {"/=","DividedByEquals"},       //42

    //Three-character delimiters   //Set 4 (indices 43-44)
    {"<<=","LeftShiftEquals"},  //43
    {">>=","RightShiftEquals"}, //44

};
//===============================================================================================
FILE * fp = NULL;     //Input stream shared by all functions; set by FileOpen
int index;        //Number of tokens stored so far; also the next free slot in the tokenTable array
char buf[200];        //Temporary buffer holding the current token string (filled by GetToken)
//===============================================================================================
main(int argc , char *argv[] )
{
    int i;
    if(!FileOpen(argv[1])){
        printf("\nUnable to Open the File : %s",argv[1]);
        return;
    }
    if(!CreateTokens()){
        //It is the problem of TC ..... :(
        printf("\nUnable to Create Tokens - May be the given program contains tokens more than the maximum token table size");
        return;
    }

    //Printing the Created Tokens..........
    printf("\n%-5s %-16s %-18s %-8s %-8s\n","No","Token","Token Type","Begin","End");
    printf("============================================================");
    for(i=0 ; i<index;i++){
        GetToken(tokenTable[i].start,tokenTable[i].end);
        IdentifyTokenType(i);
        printf("\n%-5d %-16s %-18s %-6d %-6d",i,buf,tokenTable[i].type,tokenTable[i].start,tokenTable[i].end);
    }
}
//===============================================================================================
/*
 * Classifies the token text currently held in the global buf and stores the
 * resulting type name in tokenTable[index].type.
 *
 * Only entries whose type is "Unknown" are examined; any other type is left
 * untouched. The checks run in this order: keyword, identifier, preprocessor
 * directive, string literal, numeric constant (decimal digits only). A token
 * matching none of them, or an empty token, stays "Unknown".
 *
 * index: position of the entry in tokenTable to update.
 */
void IdentifyTokenType(int index)
{
    size_t count,i,len;

    if(strcmp(tokenTable[index].type ,"Unknown")!=0)
        return;

    //An empty token cannot be classified; without this guard the string-literal
    //test below would read buf[-1] and the digit loop would accept it as a number
    len = strlen(buf);
    if(len == 0)
        return;

    //determining keywords present in tokens
    //(divide by the element size: the array holds pointers, not ints)
    count = sizeof(keywords)/sizeof(keywords[0]);
    for(i = 0;i<count;i++)
        if(strcmp(buf,keywords[i]) == 0){
            tokenTable[index].type = "Keyword";
            return;
        }

    //determining identifiers present in tokens (only the first character is checked)
    if((buf[0]>='a'&&buf[0]<='z')|| (buf[0]>='A'&&buf[0]<='Z')||buf[0] == '_'){
        tokenTable[index].type = "Identifier";
        return;
    }

    //determining preprocessor directives
    count = sizeof(preprocess)/sizeof(preprocess[0]);
    for(i = 0;i<count;i++)
        if(strcmp(buf,preprocess[i]) == 0){
            tokenTable[index].type = "Preprocessor";
            return;
        }

    //determining string literals: needs an opening AND a closing quote,
    //so a lone '"' does not qualify
    if(len >= 2 && buf[0] == '"' && buf[len-1] == '"'){
        tokenTable[index].type = "String Literal";
        return;
    }

    //determining numeric constants: every character must be a decimal digit
    for(i=0;i<len;i++)
        if(!(buf[i]>='0'&&buf[i]<='9'))return;
    tokenTable[index].type = "Numeric Constant";
}
//===============================================================================================
/*
 * Loads the text of one token into the global buf.
 *
 * Seeks fp to file position start and copies up to end-start characters,
 * stopping early at end of file or when buf is full, then NUL-terminates.
 * Tokens longer than the buffer are truncated rather than overflowing it.
 *
 * start: file position of the token's first character.
 * end:   file position one past the token's last character.
 */
void GetToken(int start,int end)
{
    int i=0;
    int ch;
    const int limit = (int)sizeof(buf) - 1;   //keep one byte for the terminator

    fseek(fp,start,SEEK_SET);
    while(i<end-start && i<limit){
        ch = fgetc(fp);
        if(ch == EOF)       //do not store the EOF marker as if it were a character
            break;
        buf[i++] = (char)ch;
    }

    buf[i] = '\0';
    //Replace every newline with a terminator, scanning backwards; the visible
    //string therefore ends at the first newline in the token
    for(i--;i>=0;i--)
        if(buf[i] == '\n')buf[i] = '\0';
}
//===============================================================================================
//Reports whether the end-of-file indicator is set on the global stream fp.
//Returns 1 when it is set, 0 otherwise.
int EOFReached()
{
    if(feof(fp))
        return 1;
    return 0;
}

//===============================================================================================
//Implements the state machine for splitting the given program (input) into tokens.
//
//Reads the global stream fp character by character and appends one entry to
//tokenTable for each token, recording its start/end file positions and a type:
//a DelimTable type name for delimiters/operators, or "Unknown" for everything
//else (string literals included), which IdentifyTokenType refines later.
//The global index is advanced once per stored token.
//
//Returns non-zero if the table still had room when scanning stopped
//(index < TableSize), zero if the table filled up.
//
//NOTE(review): the result of fgetc (an int) is stored in a char element of c,
//so EOF is detected only through EOFReached()/feof(), never by value.
int CreateTokens()
{
    int  i=0,j=0,k=0;
    //c holds the current candidate delimiter as a NUL-terminated string so it
    //can be compared to DelimTable entries with strcmp; it is cleared at End.
    char c[4]={'\0','\0','\0','\0'}; //Array holding temporary characters.....
    do{
        //state1
        c[0] = fgetc(fp);

        //--------                                                  //Path A
        //Whitespace between tokens (or end of file): nothing to record
        if(c[0] == ' ' || c[0] == '\t' || c[0] == '\n' || EOFReached()) goto End;

        /*
        //-------- Skipping the comments.....
        if(c[0] == '/')
        {
        c[1] = fgetc(fp);
        //Skipping // type of comments..............
        if(c[1] == '/')
        {
        do 
        c[2] = fgetc(fp);
        while(c[2]!='\n'&&!EOFReached());
        goto End;
        }
        //Skipping /* type of comments..............
        else if(c[1] == '*')
        {
        do{
        c[1] = fgetc(fp);
        c[2] = fgetc(fp);
        ungetc(c[2],fp);
        }while((c[1] != '*' || c[2] != '/')&&!EOFReached());
        goto End;
        }
        MoveBack(1);
        }
        */
        //Step back over the character just read to capture the token's start
        //position, then step forward again.
        MoveBack(1);
        tokenTable[index].start = ftell(fp);
        MoveFront(1);

        //String literals
        if(c[0] == '"'){
            do{
                c[1] = fgetc(fp);
                //NOTE(review): after a backslash one character is discarded and the
                //following one is read into c[1] without being checked for a
                //backslash itself, so two consecutive escapes (e.g. \n\") look like
                //they could end the literal early - confirm.
                if(c[1] =='\\') { fgetc(fp); c[1] = fgetc(fp);}     //skip \" character in b/w a string literal
            }while(c[1] != '"'&&!EOFReached());
            tokenTable[index].type = "Unknown";
            tokenTable[index++].end = ftell(fp);
            goto End;
        }   
        //--------
        //Set 1 of DelimTable (indices 0-11): single-character delimiters that
        //are always complete tokens on their own
        for(i=0;i<12;i++)
        {
            if(strcmp(DelimTable[i].delim,c) == 0)                  //Path B
            {
                tokenTable[index].type = DelimTable[i].type;
                tokenTable[index++].end = ftell(fp);
                goto End;
            }
        }

        //--------
        //Set 2 (indices 12-23): characters that may start a longer operator;
        //read ahead and try the two-character (24-42) and then the
        //three-character (43-44) entries
        for(i=12;i<24;i++)
        {
            //Checking for single char in triples
            if(strcmp(DelimTable[i].delim,c) == 0)                  //Path C
            {
                c[1] = fgetc(fp);                                   //State 2
                for(j=24;j<43&&!EOFReached();j++)
                {
                    //Checking for double char in triples
                    if(strcmp(DelimTable[j].delim,c) == 0)          //Path E
                    {
                        c[2] = fgetc(fp);                           //State 3
                        for(k=43;k<45&& !EOFReached();k++)
                        {   //Checking for triplets in triples
                            if(strcmp(DelimTable[k].delim,c) == 0)  //Path G
                            {
                                tokenTable[index].type = DelimTable[k].type;
                                tokenTable[index++].end = ftell(fp);
                                goto End;
                            }
                        }
                        //Path F
                        //No three-character match: un-read the third character
                        //and record the two-character operator
                        if(!EOFReached())MoveBack(1);
                        tokenTable[index].type = DelimTable[j].type;
                        tokenTable[index++].end = ftell(fp);
                        goto End;
                    }
                }
                //Path D
                //No two-character match: un-read the second character and
                //record the single-character operator
                if(!EOFReached())MoveBack(1);
                tokenTable[index].type = DelimTable[i].type;
                tokenTable[index++].end = ftell(fp);
                goto End;
            }
        }
        //---------
        //Anything else is a word-like token (identifier, keyword, number, ...):
        //consume characters until whitespace, end of file, or a Set 1/Set 2
        //delimiter character, and record it as "Unknown"
        do{
            //Path H
            c[0] = fgetc(fp);                                       //State 4
            //Path I
            //Checking for White Spaces
            if(c[0] == ' ' || c[0] == '\t' || c[0] == '\n' ||EOFReached())
            {
                if(!EOFReached())   MoveBack(1);
                tokenTable[index].type = "Unknown";
                tokenTable[index++].end = ftell(fp);
                goto End;
            }
            //Checking for Single char Delims  //Checking for Single char Delims in triplets
            for(i=0;i<24;i++)
                if(strcmp(DelimTable[i].delim,c) == 0)
                {
                    //The delimiter belongs to the next token: un-read it
                    MoveBack(1);
                    tokenTable[index].type = "Unknown";
                    tokenTable[index++].end = ftell(fp);
                    goto End;
                }

        }while(!feof(fp));

End:
        //Reset the scratch buffer so the next strcmp sees a clean string
        c[0] = c[1] = c[2] = c[3] = '\0';
    }while((!feof(fp))&&index<TableSize);

    //Tokens May(Not) be Ready
    return index < TableSize;
}

//===============================================================================================
/*
 * Opens the file at path for reading and stores the stream in the global fp.
 *
 * path: name of the file to open; a NULL path is rejected without calling
 *       fopen (main passes argv[1], which is NULL when no argument is given).
 *
 * Returns true when the file was opened, false otherwise.
 */
int FileOpen(const char *path)
{
    if(path == NULL) return false;
    fp = fopen(path,"r");
    if(fp == NULL) return false;
    return true;
}
//===============================================================================================

Hope this helps you........

Vineel Kumar Reddy
`#` and `##` are only operators inside a preprocessor macro definition. Also, I would be amazed if this answer helped anyone. Too much code, too little explanation.
Chris Lutz