Is there a better way to specify optional elements in rules of a CFG? - grammar

Consider a language and a compiler to design and develop for it.
In this language there is a particular statement that is part of the grammar: (=<identifier>). This piece can be recognized by the compiler. But spaces are allowed between the brackets and the equal sign and the identifier. So I have these possibilities:
(=<identifier>)
( = <identifier> )
(=identifier )
( =identifier )
...
Without considering the whole grammar but just the rules to handle this language feature, I have something like this (in a Bison-like syntax for grammar rules):
statement: OBRCKT EQ ID CBRCKT
| OBRCKT S EQ S ID S CBRCKT
| OBRCKT S EQ ID S CBRCKT
| OBRCKT S EQ S ID CBRCKT
| OBRCKT S EQ ID CBRCKT
| OBRCKT EQ S ID S CBRCKT
| OBRCKT EQ ID S CBRCKT
| OBRCKT EQ S ID CBRCKT
| ...
The space terminal S can appear or not. But the way rules are, I need to specify all possible combinations... Is there a better way to achieve this result?

As Jim commented, use your lexical tool to handle these cases instead of writing them into the productions of your grammar.
For example, I commonly use Flex for lexical analysis and Bison to define my grammar (probably as you have done).
You can achieve the result that you want with something like the following (this is just an example so it's pretty simple and cannot do much):
lexicalAnalyzer.l
/* lexicalAnalyzer.l
Specifications of tokens for some language.
*/
%{
%}
/*
* Definitions of regular expressions
* Note: You capture whitespace here...
*/
/* Runs of blanks, tabs and carriage returns; the matching rule discards them. */
WSPACE [ \t\r]+
/*
* Tokens
*/
%%
"=" {
printf("TOKEN: EQ LEXEME: %s\n", yytext);
return T_EQ;
}
"(" {
printf("TOKEN: OBRCKT LEXEME: %s\n", yytext);
return T_OBRCKT;
}
")" {
printf("TOKEN: CBRCKT LEXEME: %s\n", yytext);
return T_CBRCKT;
}
"<" {
printf("TOKEN: LT LEXEME: %s\n", yytext);
return T_LT;
}
">" {
printf("TOKEN: GT LEXEME: %s\n", yytext);
return T_GT;
}
"identifier" {
printf("TOKEN: IDENT LEXEME: %s\n", yytext);
return T_IDENT;
}
{WSPACE} { }
. {
printf("TOKEN: UNKNOWN LEXEME: %s\n", yytext);
return T_UNKNOWN;
}
%%
syntaxAnalyzer.y
/*
syntaxAnalyzer.y
To create syntax analyzer:
flex file.l
bison file.y
g++ file.tab.c -o file_parser
file_parser < inputFileName
*/
/*
* Declaration section.
*/
%{
#include <stdio.h>
void printRule(const char *lhs, const char *rhs);
/*
 * Error callback: writes the message `s` to stderr.
 * Always returns 0 so the declared int result is well defined.
 */
int yyerror(const char *s) {
    fprintf(stderr, "Error: %s\n", s);
    return 0;
}
/* Parser/scanner entry points, declared with C linkage (the build
 * instructions in the header comment compile this file with g++). */
extern "C" {
int yyparse(void);
int yylex(void);
/* Always returns 1. */
int yywrap() {return 1;}
}
%}
/*
* Token declarations
*/
%token T_OBRCKT T_CBRCKT
%token T_LT T_GT T_EQ
%token T_IDENT T_UNKNOWN
/*
* Starting point.
*/
%start N_START
/*
* Translation rules.
*/
%%
N_START : N_STATEMENT
{
printRule("START", "STATEMENT");
printf("\n---- Completed parsing ----\n\n");
return 0;
}
;
N_STATEMENT : T_OBRCKT T_EQ T_LT T_IDENT T_GT T_CBRCKT
{
printRule("EXPR", "T_OBRCKT T_EQ T_LT T_IDENT T_GT T_CBRCKT");
}
;
%%
#include "lex.yy.c"
extern FILE *yyin;
/*
 * Writes one production to stdout as "<lhs> -> <rhs>" followed by a newline.
 *   lhs: text printed left of the arrow
 *   rhs: text printed right of the arrow
 */
void printRule(const char *lhs, const char *rhs) {
    fputs(lhs, stdout);
    fputs(" -> ", stdout);
    fputs(rhs, stdout);
    fputc('\n', stdout);
}
/*
 * Entry point.  The N_START action returns from yyparse() after a single
 * statement, so the parser is invoked again until yyin reaches end-of-file.
 * Returns 1 if any yyparse() call reported a failure, 0 otherwise.
 */
int main() {
    int status = 0;

    do {
        if (yyparse() != 0) {
            status = 1;
        }
    } while (!feof(yyin));

    return status;
}

Related

Yacc %define parse.error verbose generates error

When I try to get more information out of my "syntax error", I follow the approach described on many websites, but each variant seems to produce errors of its own for some reason.
I was getting the standard "syntax error" on line 5 of the input file, so I wanted to add better error handling to see exactly what the issue is. To do that I added:
%define parse.error verbose
However, it gives me this:
error: %define variable 'parse.error' is not used
Below are my files. As long as you keep it constructive, feel free to comment on more than just the error parts; any help is welcome :)
(As long as the errors get fixed as well :P )
Thanks in advance!
lex file;
%option nounput yylineno
%{
#include "yaccTest.tab.h"
void InvalidToken();
void extern yyerror (char *s);
%}
whitespace [ \t\r\v\f]
linefeed \n
%%
";" {return SEMICOLON;}
"=" {return EQ;}
"+" {return PLUS;}
"-" {return MINUS;}
"*" {return MULTIPLY;}
"/" {return DEVIDE;}
"(" {return BO;}
")" {return BC;}
"^" {return POWER;}
"print" {return PRINT;}
[a-zA-Z][a-zA-Z0-9]* {yylval.charValue = yytext[0]; return IDENTIFIER;}
[0-9]+ {yylval.intValue = atoi(yytext); return NUMBER;}
{whitespace} {;}
. {InvalidToken();}
%%
/*
 * Reports the error message `s` together with the current value of yylineno
 * on stderr, then terminates the program with a non-zero (failure) status.
 */
void yyerror(char *s) {
    fprintf(stderr, "\nERROR ON LINE %d : \n %s\n", yylineno, s);
    exit(1);
}
void InvalidToken(){
printf("ERROR ON LINE %d : \n Invalid Token %s\n", yylineno,yytext);
exit(0);
}
/* Always returns 1; flex treats a non-zero yywrap() result as "no more input". */
int yywrap (void)
{
    const int no_more_input = 1;
    return no_more_input;
}
yacc file;
%{
#include <stdio.h>
#include <stdlib.h>
int getVariableValue(char varID);
extern int yylineno;
int varIDs[52] = {0};
int varValues[52] = {0};
%}
%define parse.lac full
%define parse.error verbose
%union YYSTYPE {int intValue; char charValue;}
%token COLON SEMICOLON ST SE EQ GE GT PLUS MINUS MULTIPLY DEVIDE BO BC CBO CBC POWER LOOP PRINT
%token <intValue> NUMBER
%token <charValue> IDENTIFIER CHAR
%type <charValue> declaration expression
%type <intValue> numval
%right EQ
%left PLUS MINUS
%left MULTIPLY DEVIDE
%left POWER
%%
declaration : IDENTIFIER EQ expression
| declaration IDENTIFIER EQ expression
;
expression : numval SEMICOLON
| PRINT BO numval BC SEMICOLON {printf("Printing");}
;
numval : NUMBER {$$ = $1;}
| NUMBER PLUS NUMBER {$$ = $1 + $3;}
| NUMBER MINUS NUMBER {$$ = $1 - $3;}
| NUMBER MULTIPLY NUMBER {$$ = $1 * $3;}
| NUMBER DEVIDE NUMBER {$$ = $1 / $3;}
| NUMBER POWER NUMBER {int i;int j = $1;for(i = 1; i < $3; i++){j=j*$1;};$$ = j;}
;
%%
/*
 * Looks up the value stored for identifier `varID`.
 * Every slot of varIDs is scanned; if several slots hold `varID`, the value
 * of the last matching slot is returned.  Returns 0 when no slot matches
 * (previously the result was an uninitialized local in that case).
 */
int getVariableValue(char varID) {
    const size_t slotCount = sizeof(varIDs) / sizeof(varIDs[0]);
    int value = 0;
    size_t i;

    for (i = 0; i < slotCount; i++) {
        if (varID == varIDs[i]) {
            value = varValues[i];
        }
    }
    return value;
}
/*
 * Fatal error: a slot without an identifier already holds a value.
 *   id:  NUL-terminated identifier name being stored
 *   val: value that was being stored
 * Prints a diagnostic with the current yylineno and exits with status 1.
 */
int missingVarIDError(char *id, int val){
    printf("\nERROR ON LINE %d : \nIdentifier '%s' not found, but assigned location DOES have a value; %d",yylineno,id,val);
    exit(1);
}

/*
 * Fatal error: no free slot is left in the storage arrays.
 *   id:  NUL-terminated identifier name being stored
 *   val: value that could not be stored
 * Prints a diagnostic with the current yylineno and exits with status 1.
 */
int notEnoughStorageError(char *id, int val){
    printf("\nERROR ON LINE %d : \nIdentifier '%s' did not fit in StorageArray, '%d' not stored!",yylineno,id,val);
    exit(1);
}

/*
 * Stores `varValue` under identifier `varID`.
 * An existing entry for `varID` is overwritten; otherwise the first slot
 * whose identifier is still 0 (the arrays' initial value) is claimed.
 * Returns 0 on success.  Does not return when the table is full or when a
 * slot without an identifier already holds a non-zero value: the matching
 * error helper terminates the program.
 */
int setVariableValue(char varID, int varValue) {
    const size_t slotCount = sizeof(varIDs) / sizeof(varIDs[0]);
    /* The error helpers take a C string, so wrap the single character. */
    char name[2] = { varID, '\0' };
    size_t i;

    /* Known identifier: overwrite its value. */
    for (i = 0; i < slotCount; i++) {
        if (varIDs[i] == varID) {
            varValues[i] = varValue;
            return 0;
        }
    }

    /* New identifier: take the first unused slot. */
    for (i = 0; i < slotCount; i++) {
        if (varIDs[i] != 0) {
            continue;
        }
        if (varValues[i] != 0) {
            missingVarIDError(name, varValue);
        }
        varIDs[i] = varID;
        varValues[i] = varValue;
        return 0;
    }

    notEnoughStorageError(name, varValue);
    return -1; /* not reached: the helper above exits */
}
/* Program entry point: the exit status is whatever yyparse() returns. */
int main (void) {
    int parseStatus = yyparse ( );
    return parseStatus;
}
Input file;
x=4;
y=2+6;
X=2;
z=5;
print(4);

Simple Lex/Yacc Calculator not printing output

I'm trying to understand how compilers and programming languages are made. And to do so I thought about creating a simple calculator which does just addition and subtraction. Below are the Lex and Yacc files which I wrote.
calc.yacc file:
%{
#include <stdio.h>
#include <stdlib.h>
extern int yylex();
void yyerror(char *);
%}
%union { int number; }
%start line
%token <number> NUM
%type <number> expression
%%
line: expression { printf("%d\n", $1); };
expression: expression '+' NUM { $$ = $1 + $3; };
expression: expression '-' NUM { $$ = $1 - $3; };
expression: NUM { $$ = $1; };
%%
/* Writes the message `s` to stderr (no newline is appended) and exits with status 1. */
void yyerror(char *s) {
    fputs(s, stderr);
    exit(1);
}
/* Program entry point: runs the parser once and returns its status
 * (0 when yyparse() succeeds) instead of discarding it. */
int main() {
    return yyparse();
}
calc.lex file:
%{
#include <stdio.h>
#include <stdlib.h>
#include "y.tab.h"
%}
%%
[0-9]+ {
yylval.number = atoi(yytext);
return NUM;
}
[-+] { return yytext[0]; }
[ \t\f\v\n] { ; }
%%
/* Always returns 1; flex treats a non-zero yywrap() result as "no more input". */
int yywrap()
{
    const int no_more_input = 1;
    return no_more_input;
}
It compiles nicely but when I run it and type something like 2 + 4 then it gets stuck and doesn't print the answer. Can somebody explain why? My guess is that my grammar is not correct (but I don't know how).
I came to the same idea as rici and changed your samples accordingly:
file calc.l:
%{
#include <stdio.h>
#include <stdlib.h>
#include "calc.y.h"
%}
%%
[0-9]+ {
yylval.number = atoi(yytext);
return NUM;
}
[-+] { return yytext[0]; }
"\n" { return EOL; }
[ \t\f\v\n] { ; }
%%
/* Always returns 1; flex treats a non-zero yywrap() result as "no more input". */
int yywrap()
{
    const int no_more_input = 1;
    return no_more_input;
}
file calc.y:
%{
#include <stdio.h>
#include <stdlib.h>
extern int yylex();
void yyerror(char *);
%}
%union { int number; }
%start input
%token EOL
%token <number> NUM
%type <number> expression
%%
/* One or more lines.  Written left-recursively so each line is reduced as it
   is read instead of being stacked until the end of the input. */
input: line | input line ;
line: expression EOL { printf("%d\n", $1); };
expression: expression '+' NUM { $$ = $1 + $3; };
expression: expression '-' NUM { $$ = $1 - $3; };
expression: NUM { $$ = $1; };
%%
/* Writes the message `s` to stderr (no newline is appended) and exits with status 1. */
void yyerror(char *s) {
    fputs(s, stderr);
    exit(1);
}
/* Program entry point: runs the parser once and returns its status
 * (0 when yyparse() succeeds) instead of discarding it. */
int main() {
    return yyparse();
}
Compiled & tested in cygwin on Windows 10 (64 bit):
$ flex -o calc.l.c calc.l
$ bison -o calc.y.c -d calc.y
$ gcc -o calc calc.l.c calc.y.c
$ ./calc
2 + 4
6
2 - 4
-2
234 + 432
666
Notes:
Minor issue: According to the build commands, I had to change the #include for the generated token table. (A matter of taste.)
I introduced the EOL token in the lex source as well as in the line rule of the parser.
While testing, I noticed that the second input always ended in a syntax error. It took me a while to realize that the grammar was now limited to accepting precisely one line. Thus, I inserted the recursive input rule in the parser source.

lex and yacc to parse trigonometric expression

I have the following code for lex and yacc. I am getting extra values in the printed output. Can anyone tell me what's wrong with the code?
Lex code:
%{
#include <stdio.h>
#include "y.tab.h"
%}
%%
[ \t] ;
[+-] { yylval=yytext; return Sym;}
(s|c|t)..x { yylval=yytext; return Str;}
[a-zA-Z]+ { printf("Invalid");}
%%
/* Always returns 1; flex treats a non-zero yywrap() result as "no more input". */
int yywrap()
{
    const int no_more_input = 1;
    return no_more_input;
}
yacc code:
%{
#include<stdio.h>
%}
%start exps
%token Sym Str
%%
exps: exps exp
| exp
;
exp : Str Sym Str {printf("%s",$1); printf("%s",$2); printf("%s",$3);}
;
%%
/* Program entry point: runs the parser once; its result is the exit status. */
int main (void)
{
    int parseStatus = yyparse();
    return parseStatus;
}
/*
 * Error callback: writes the message `err` and a newline to stderr.
 * Returns 0.  The return type is now spelled out (implicit int is not valid
 * in C99 and later) and a value is actually returned.
 */
int yyerror(char *err) {
    fprintf(stderr, "%s\n", err);
    return 0;
}
Input:
sinx+cosx
output:
sinx+cosx+cosxcosx
look at the output of the code!!!
yytext is a pointer into flex's internal scanning buffer, so its contents will be modified when the next token is read. If you want to return it to the parser, you need to make a copy:
[+-] { yylval=strdup(yytext); return Sym;}
(s|c|t)..x { yylval=strdup(yytext); return Str;}
Where symbols are a single character, it might make more sense to return that character directly in the scanner:
[-+] { return *yytext; }
in which case, your yacc rules should use the character directly, in single quotes:
exp : Str '+' Str {printf("%s + %s",$1, $3); free($1); free($3); }
| Str '-' Str {printf("%s - %s",$1, $3); free($1); free($3); }

Print tokens properly using Lex and Yacc

I'm having difficulties printing a sequence of tokens that behaves recursively. To better explain, I will show the sections of the corresponding codes: First, the code on Lex:
%{
#include <stdio.h>
#include "y.tab.h"
/*
 * Placeholder hook called from the {id} scanner rule; it currently does
 * nothing.  Returns 0.  The return type is now spelled out (implicit int is
 * not valid in C99 and later) and a value is actually returned.
 */
int installID(){
    return 0;
}
%}
abreparentese "("
fechaparentese ")"
pontoevirgula ";"
virgula ","
id {letra}(({letra}|{digito})|({letra}|{digito}|{underline}))*
digito [0-9]
letra [a-z|A-Z]
porreal "%real"
portexto "%texto"
porinteiro "%inteiro"
leia "leia"
%%
{abreparentese} { return ABREPARENTESE; }
{fechaparentese} { return FECHAPARENTESE; }
{pontoevirgula} { return PONTOEVIRGULA; }
{virgula} { return VIRGULA; }
{id} { installID();
return ID; }
{porinteiro} { return PORINTEIRO; }
{porreal} { return PORREAL; }
{portexto} { return PORTEXTO; }
{leia} { return LEIA;}
%%
/* Always returns 1; flex treats a non-zero yywrap() result as "no more input". */
int yywrap()
{
    const int no_more_input = 1;
    return no_more_input;
}
Now, the code on Yacc:
%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#define YYSTYPE char*
int yylex(void);
void yyerror(char *);
extern FILE *yyin, *yyout;
extern char* yytext;
%}
%token ABREPARENTESE FECHAPARENTESE PONTOEVIRGULA VIRGULA ID PORREAL PORTEXTO PORINTEIRO LEIA
%%
programs : programs program
| program
| ABREPARENTESE {fprintf(yyout,"%s",yytext);}
| FECHAPARENTESE {fprintf(yyout,"%s",yytext);}
;
program:
leia
;
leia:
LEIA ABREPARENTESE entradas ids FECHAPARENTESE PONTOEVIRGULA
{
fprintf(yyout,"scanf(\"%s\",%s);",$3,$4);
}
;
entradas:
tipo_entrada VIRGULA entradas {fprintf(yyout,"%s,",$1);}
| tipo_entrada VIRGULA {fprintf(yyout,"%s", $1); }
;
tipo_entrada:
| PORREAL {$$ = "%f";}
| PORTEXTO {$$ = "%c";}
| PORINTEIRO {$$ = "%d";}
;
ids:
id VIRGULA ids {fprintf(yyout,"&%s,",$1);}
| id {fprintf(yyout,"&%s",$1);}
;
id:
ID {$$ = strdup(yytext);}
;
%%
/* Writes the message `s` followed by a newline to stderr. */
void yyerror(char *s) {
    fputs(s, stderr);
    fputc('\n', stderr);
}
/*
 * Entry point.  Usage: <program> <input-file> <output-file>
 * Opens argv[1] for reading as yyin and argv[2] for writing as yyout, runs
 * the parser, and closes both streams.
 * Returns the yyparse() status, or 1 when the arguments are missing, a file
 * cannot be opened, or the output file cannot be closed cleanly.
 */
int main(int argc, char *argv[]){
    int status;

    if (argc < 3) {
        fprintf(stderr, "usage: %s <input-file> <output-file>\n",
                argc > 0 ? argv[0] : "parser");
        return 1;
    }

    /* Open the input first so a bad input path does not truncate the output. */
    yyin = fopen(argv[1], "r");
    if (yyin == NULL) {
        perror(argv[1]);
        return 1;
    }

    yyout = fopen(argv[2], "w");
    if (yyout == NULL) {
        perror(argv[2]);
        fclose(yyin);
        return 1;
    }

    status = yyparse();

    fclose(yyin);
    /* fclose flushes buffered output; a failure here means lost data. */
    if (fclose(yyout) != 0) {
        perror(argv[2]);
        status = 1;
    }
    return status;
}
I believe I have copied all the relevant part of my problem on the code (some things maybe I forgot to copy and paste), however my problem is this part of the code:
leia: LEIA ABREPARENTESE entradas ids FECHAPARENTESE PONTOEVIRGULA
{
fprintf(yyout,"scanf(\"%s\",%s);",$3,$4);
}
;
In the input file, I have the following line:
leia (%real, %inteiro, id1, id2);
The expectation was this on the output file:
scanf("%f,%d",&id1,&id2);
But actually this is the result in the output file:
%d%f,&id2&id1,scanf("%f",id1);
Can you help me solve this problem? How do I print the tokens in the right place?
Normally, with bottom-up parsing, we use left-recursive productions, which has the result that the productions are reduced from left to right.
When you use right recursion, then productions are stacked up until the end, and then popped off the stack and therefore reductions are executed right-to-left.
So for example, it would be more usual to write:
ids: id
| ids ',' id
and then the semantic rules will execute in the expected order.

Syntax error in Bison after one token is processed

I am trying to come up to speed on Flex and Bison. I can parse one token with a very simple "language" but it fails on the second, even though the token is legitimate.
test.l:
%{
#include <stdio.h>
#include "test.hpp"
%}
%%
[0-9]+ {printf("Number entered\n"); return INTEGER_NUMBER;}
[a-zA-Z]+ {printf("plain text entered: '%s'\n",yytext); return PLAIN_TEXT;}
[ \t] ;
. ;
%%
test.y
%{
#include <stdio.h>
/* Parser/scanner symbols used by this file, declared with C linkage. */
extern "C" {
int yyparse(void);
int yylex(void);
/* Always returns 1. */
int yywrap() { return 1; }
extern int yylineno;
extern char* yytext;
/* Semantic value is declared as a plain int here. */
extern int yylval;
}
/* #define YYSTYPE char * */
void yyerror(const char *message)
{
fprintf(stderr, "%d: error: '%s' at '%s', yylval=%u\n", yylineno, message, yytext, yylval);
}
/* Program entry point: runs the parser once and returns its status
 * (0 when yyparse() succeeds).  The int return type is now explicit. */
int main()
{
    return yyparse();
}
%}
%token PLAIN_TEXT INTEGER_NUMBER
%%
test : text | number;
text : PLAIN_TEXT
{
/*printf("plain text\n");*/
};
number : INTEGER_NUMBER
{
/*printf("number\n");*/
};
%%
Results:
$ ./test
cat
plain text entered: 'cat'
dog
plain text entered: 'dog'
1: error: 'syntax error' at 'dog', yylval=0
$ ./test
34
Number entered
34
Number entered
1: error: 'syntax error' at '34', yylval=0
Why am I getting this syntax error?
Your test.y lacks a grammar rule for input that contains more than one test,
so the parser reports a syntax error as soon as it sees the second token.
So, how about adding the grammar like the following?
%%
tests : test | tests test; /* added */
test : text | number;
...