I am trying to write a grammar which will be able to consume the following input:
begin #this is a example
x = 56;
while x > 0 do
begin
point 15.6 78.96;
end;
end;
Here is the lexer.l file:
%option noyywrap
%{
/*
 * C prologue copied verbatim into the generated scanner.
 * parser.h is the header written by `bison -d`; the rules below take their
 * TOKEN_* return codes and the yylval members from it.
 */
#include "parser.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h> /* declares strdup(), which the identifier rule calls */

/* Text printed by the catch-all rule for a character no other rule matches. */
const char * const unrecognizedToken = "Unrecognized token";
%}
/* Named patterns; a rule refers to one of them by writing {Name}. */
NewLine \n
/* One or more of CR, tab, form feed, vertical tab or space (newline excluded). */
WhiteSpaces [\r\t\f\v ]+
/* No rule in the rules section of this scanner references Semicolon. */
Semicolon ;
digit [0-9]
/* Digits, an optional trailing dot, an optional exponent; also matches plain integers. */
number1 {digit}+\.?([eE][-+]?{digit}+)?
/* Optional digits, a dot, at least one digit, an optional exponent. */
number2 {digit}*\.{digit}+([eE][-+]?{digit}+)?
double_number {number1}|{number2}
/* Keyword spellings. */
BEGIN "begin"
END "end"
WHILE "while"
DO "do"
POINT "point"
/* Exclusive start condition: while it is active only rules tagged with it apply. */
%x POINT_DEFINITIONS
%%
{WhiteSpaces} {
    /* Run of blanks outside POINT_DEFINITIONS: trace it, hand no token to the parser. */
    puts("WhiteSpaces");
}
{NewLine} {
    /* Single line feed: trace it, hand no token to the parser. */
    puts("NewLine");
}
{WHILE} {
    /* Keyword "while": trace it and return its token code. */
    puts("While");
    return TOKEN_WHILE;
}
{BEGIN} {
    /* Keyword "begin" (the pattern name, not the start-condition macro). */
    puts("TOKEN_BEGIN");
    return TOKEN_BEGIN;
}
{END} {
    /* Keyword "end". */
    puts("TOKEN_END");
    return TOKEN_END;
}
{DO} {
    /* Keyword "do". */
    puts("DO");
    return TOKEN_DO;
}
{POINT} {
    /* Keyword "point": trace it, then switch the scanner into the
       POINT_DEFINITIONS start condition before returning the token. */
    puts("POINT_START");
    BEGIN POINT_DEFINITIONS;
    return TOKEN_POINT;
}
<POINT_DEFINITIONS>{double_number} {
    /* Coordinate literal: trace the lexeme and pass its value in yylval.dval. */
    printf("POINT_DEFINITIONS %s", yytext);
    putchar('\n');
    yylval.dval = atof(yytext);
    return TOKEN_DOUBLE;
}
<POINT_DEFINITIONS>{WhiteSpaces} {
    /* Blanks between coordinates: trace them, return no token. */
    puts("WhiteSpaces");
}
[a-zA-Z_][a-zA-Z0-9_]* {
    /* Identifier: a letter or underscore followed by letters, digits or
       underscores. The lexeme is duplicated on the heap because yytext is
       reused by the scanner; nothing in this scanner frees the copy. */
    puts("TOKEN_IDENTIFIER");
    yylval.name = strdup(yytext);
    return TOKEN_IDENTIFIER;
}
[()=;] {
    /* Punctuation is returned as its own character code. */
    printf("yytext = %s", yytext);
    putchar('\n');
    return yytext[0];
}
[*/+\-<>] {
    /* Any arithmetic or comparison operator shares one token code;
       the operator character itself travels in yylval.op. */
    puts("TOKEN_OPERATOR");
    yylval.op = yytext[0];
    return TOKEN_OPERATOR;
}
[-]?[0-9]+ {
    /* Integer literal with an optional leading minus sign; the converted
       value travels in yylval.val. */
    puts("TOKEN_VALUE");
    yylval.val = atoi(yytext);
    return TOKEN_VALUE;
}
#.* {
    /* A comment runs from the hash sign to the end of the line:
       trace it and return no token. */
    puts("COMMENT");
}
. {
    /* Any other single character: print the diagnostic text, return no token. */
    fputs(unrecognizedToken, stdout);
}
And here is the parser.y file:
/* Request detailed syntax-error messages that name the unexpected and expected tokens. */
%error-verbose
%{
/* NOTE(review): presumably turns on Bison's debug-trace support; the build line also passes -DYYDEBUG=1. */
#define YYDEBUG 1
%}
/* Semantic value attached to a token; the scanner fills one member per token kind. */
%union {
int val; /* integer literal, set for TOKEN_VALUE */
double dval; /* floating-point literal, set for TOKEN_DOUBLE */
char op; /* operator character, set for TOKEN_OPERATOR */
char* name; /* heap copy of the lexeme, set for TOKEN_IDENTIFIER */
}
/* Token kinds are declared without a <member> tag; no grammar action reads a semantic value. */
%token TOKEN_BEGIN TOKEN_END TOKEN_WHILE TOKEN_DO TOKEN_POINT TOKEN_OPERATOR TOKEN_VALUE TOKEN_IDENTIFIER TOKEN_DOUBLE
/* Parsing starts at the `program` nonterminal. */
%start program
%{
/*
 * Declarations needed by the generated parser and by the grammar actions.
 * printf() is called from the actions, so <stdio.h> must be visible before
 * them; yylex() is called by the generated parser and is defined in the
 * Flex-generated scanner, so it needs a prototype here.
 */
#include <stdio.h>

int yylex(void);
void yyerror(const char* const message);
%}
%%
/* A program is exactly one statement followed by ';'. */
program: statement';';
/* A block is a statement list enclosed in TOKEN_BEGIN ... TOKEN_END. */
block: TOKEN_BEGIN statements TOKEN_END { printf("rule block\n"); };
/* Right-recursive, possibly empty list; every statement is terminated by ';'. */
statements:
statement';' statements { printf("rule statements\n"); }
|;
/* The first alternative is empty, so a statement may consist of nothing at all. */
statement:
| assignment
| command
| whileStmt
| block;
/* Assignment accepts only an integer literal on the right-hand side. */
assignment: TOKEN_IDENTIFIER '=' TOKEN_VALUE {
printf("rule Assignment\n");
} ;
/* The loop body must be a block, not an arbitrary statement. */
whileStmt: TOKEN_WHILE condition TOKEN_DO block {printf("rule While\n");};
/*
 * A condition is an identifier, an integer literal, or two conditions joined
 * by an operator. The binary alternative recurses on both sides and the
 * declarations section gives TOKEN_OPERATOR no precedence or associativity.
 * NOTE(review): this looks ambiguous; check the report written by `bison -v`
 * for a shift/reduce conflict.
 */
condition: TOKEN_IDENTIFIER { printf("rule token_identifier\n"); }
| TOKEN_VALUE { printf("rule token_value\n"); }
| condition TOKEN_OPERATOR condition { printf("rule condition TOKEN_OPERATOR condition\n"); };
/* The point command takes exactly two floating-point literals. */
command: TOKEN_POINT TOKEN_DOUBLE TOKEN_DOUBLE { printf("rule Command\n"); };
%%
#include <stdio.h>
#include <stdlib.h>
/*
 * Error callback for the generated parser.
 *
 * Writes the diagnostic to stderr and terminates the process with a failure
 * status, so parsing never resumes after the first reported error.
 *
 * message: text describing the error, supplied by the caller.
 */
void yyerror(const char* const message)
{
    /* Flush buffered trace output first so the diagnostic appears after it. */
    fflush(stdout);
    fprintf(stderr, "Parse error:%s\n", message);
    exit(EXIT_FAILURE);
}
/*
 * Entry point: runs the parser once and uses the value returned by yyparse()
 * as the process exit status, so a non-zero parser result is visible to the
 * caller instead of being discarded.
 */
int main(void)
{
    return yyparse();
}
But I am getting the following error message:
Parse error:syntax error, unexpected $end, expecting ';'
Compiled like this:
flex -o lexer.c lexer.l
bison -v -d -o parser.c parser.y
gcc parser.c lexer.c -o parser -g -DYYDEBUG=1
To run the parser:
./parser < example
Could you please help me find out what the problem is? Why can't the grammar accept the above example as input?
Your problem is in your lexical analyzer (independently of whether there's a problem in your grammar — I've not analyzed the grammar, because I hit on the problem in the lexical analyzer first, and it was sufficient to prevent the grammar working).
I added a test `main()` to `lexer.l`:
%%
/* Definition of the semantic-value variable the scanner actions write to;
   needed here because the scanner is built on its own in this test. */
YYSTYPE yylval;

/*
 * Test driver: pulls tokens from the scanner until it returns 0 and prints
 * each token code together with the text that was matched.
 */
int main(void)
{
    for (;;) {
        int token = yylex();
        if (token == 0)
            break;
        printf("Token: %d (%s)\n", token, yytext);
    }
    return 0;
}
I then ran it on your sample code to see whether the token stream is generated correctly. The output I got is:
TOKEN_BEGIN
Token: 258 (begin)
WhiteSpaces
COMMENT
NewLine
WhiteSpaces
TOKEN_IDENTIFIER
Token: 265 (x)
WhiteSpaces
yytext = =
Token: 61 (=)
WhiteSpaces
TOKEN_VALUE
Token: 264 (56)
yytext = ;
Token: 59 (;)
NewLine
NewLine
WhiteSpaces
While
Token: 260 (while)
WhiteSpaces
TOKEN_IDENTIFIER
Token: 265 (x)
WhiteSpaces
TOKEN_OPERATOR
Token: 263 (>)
WhiteSpaces
TOKEN_VALUE
Token: 264 (0)
WhiteSpaces
DO
Token: 261 (do)
NewLine
WhiteSpaces
TOKEN_BEGIN
Token: 258 (begin)
NewLine
WhiteSpaces
POINT_START
Token: 262 (point)
WhiteSpaces
POINT_DEFINITIONS 15.6
Token: 266 (15.6)
WhiteSpaces
POINT_DEFINITIONS 78.96
Token: 266 (78.96)
;
WhiteSpaces
end;
end;
As you can see, the last token returned to the main program is the 78.96.
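When you recognize a POINT, you start the state POINT_DEFINITIONS. However, once in that state, you're in that state forever; you never go back to the INITIAL state. Because POINT_DEFINITIONS is an exclusive (`%x`) start condition, only the rules tagged `<POINT_DEFINITIONS>` are active in it, so the `;`, the newlines and the `end;` text that follow match no rule and are simply echoed by flex's default rule — which is why they appear verbatim at the end of the output above instead of as tokens. You probably need to add a rule to the POINT_DEFINITIONS start state that recognizes a semicolon and executes `BEGIN INITIAL;`: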
<POINT_DEFINITIONS>{Semicolon} {
    /* The semicolon ends a point command: switch the scanner back to the
       INITIAL start condition, trace the event, and return the character
       itself as the token code. */
    BEGIN INITIAL;
    fputs("Semicolon in POINT_DEFINITION state\n", stdout);
    return yytext[0];
}
With this in place, the tail end of the output is:
...
TOKEN_BEGIN
Token: 258 (begin)
NewLine
WhiteSpaces
POINT_START
Token: 262 (point)
WhiteSpaces
POINT_DEFINITIONS 15.6
Token: 266 (15.6)
WhiteSpaces
POINT_DEFINITIONS 78.96
Token: 266 (78.96)
Semicolon in POINT_DEFINITION state
Token: 59 (;)
NewLine
WhiteSpaces
TOKEN_END
Token: 259 (end)
punctuation: yytext = ;
Token: 59 (;)
NewLine
TOKEN_END
Token: 259 (end)
punctuation: yytext = ;
Token: 59 (;)
NewLine