semantic phase of c compiler - semantics

If I write 1=a in the sample C program, it isn't detected as an error. How do I solve this problem? Also, how do I implement global and local scope for variables? Thanks to anyone who can help.
clexer.lex source code
/* Named character classes referenced by the token patterns in the rules section. */
D [0-9]
L [a-zA-Z_]
H [a-fA-F0-9]
E [Ee][+-]?{D}+
FS (f|F|l|L)
IS (u|U|l|L)*
%{
#include <stdio.h>
#include "y.tab.h"
/* Column counter: token actions add yyleng to it and the newline rule resets it to 1. */
int cnt=1;
/* Line counter: incremented each time a newline is consumed. */
int line=1;
/* Receives a copy of yytext whenever the identifier rule matches. */
char tempid[100];
%}
%%
"/*" {comment();}
"auto" { cnt+=yyleng;ECHO; return(AUTO); }
"break" { cnt+=yyleng;ECHO; return(BREAK); }
"case" { cnt+=yyleng;ECHO; return(CASE); }
"char" { cnt+=yyleng;ECHO; return(CHAR); }
"const" { cnt+=yyleng;ECHO; return(CONST); }
"continue" { cnt+=yyleng;ECHO; return(CONTINUE); }
"default" { cnt+=yyleng;ECHO; return(DEFAULT); }
"do" { cnt+=yyleng;ECHO; return(DO); }
"double" { cnt+=yyleng;ECHO; return(DOUBLE); }
"else" { cnt+=yyleng;ECHO; return(ELSE); }
"enum" { cnt+=yyleng;ECHO; return(ENUM); }
"extern" { cnt+=yyleng;ECHO; return(EXTERN); }
"float" { cnt+=yyleng;ECHO; return(FLOAT); }
"for" { cnt+=yyleng;ECHO; return(FOR); }
"goto" { cnt+=yyleng;ECHO; return(GOTO); }
"if" { cnt+=yyleng;ECHO; return(IF); }
"int" { cnt+=yyleng;ECHO; return(INT); }
"long" { cnt+=yyleng;ECHO; return(LONG); }
"register" { cnt+=yyleng;ECHO; return(REGISTER); }
"return" { cnt+=yyleng;ECHO; return(RETURN); }
"short" { cnt+=yyleng;ECHO; return(SHORT); }
"signed" { cnt+=yyleng;ECHO; return(SIGNED); }
"sizeof" { cnt+=yyleng;ECHO; return(SIZEOF); }
"static" { cnt+=yyleng;ECHO; return(STATIC); }
"struct" { cnt+=yyleng;ECHO; return(STRUCT); }
"switch" { cnt+=yyleng;ECHO; return(SWITCH); }
"typedef" { cnt+=yyleng;ECHO; return(TYPEDEF); }
"union" { cnt+=yyleng;ECHO; return(UNION); }
"unsigned" { cnt+=yyleng;ECHO; return(UNSIGNED); }
"void" { cnt+=yyleng;ECHO; return(VOID); }
"volatile" { cnt+=yyleng;ECHO; return(VOLATILE); }
"while" { cnt+=yyleng;ECHO; return(WHILE); }
(['])+({L}|{D})+([']) { cnt+=yyleng;ECHO; return(SINGLE); }
{L}({L}|{D})* {
    /* tempid is a fixed 100-byte buffer; copy with a bound so an overlong
       identifier is truncated instead of overflowing it. */
    cnt+=yyleng;ECHO; snprintf(tempid,sizeof tempid,"%s",yytext); return(IDENTIFIER); }
0[xX]{H}+{IS}? { cnt+=yyleng;ECHO; return(CONSTANT); }
0{D}+{IS}? { cnt+=yyleng;ECHO; return(CONSTANT); }
{D}+{IS}? { cnt+=yyleng;ECHO; return(CONSTANT); }
L?'(\\.|[^\\'])+' { cnt+=yyleng;ECHO; return(CONSTANT); }
{D}+{E}{FS}? { cnt+=yyleng;ECHO; return(CONSTANT); }
{D}*"."{D}+({E})?{FS}? { cnt+=yyleng;ECHO; return(CONSTANT); }
{D}+"."{D}*({E})?{FS}? { cnt+=yyleng;ECHO; return(CONSTANT); }
L?\"(\\.|[^\\"])*\" { cnt+=yyleng;ECHO; return(STRING_LITERAL); }
"..." { cnt+=yyleng;ECHO; return(ELLIPSIS); }
">>=" { cnt+=yyleng;ECHO; return(RIGHT_ASSIGN); }
"<<=" { cnt+=yyleng;ECHO; return(LEFT_ASSIGN); }
"+=" { cnt+=yyleng;ECHO; return(ADD_ASSIGN); }
"-=" { cnt+=yyleng;ECHO; return(SUB_ASSIGN); }
"*=" { cnt+=yyleng;ECHO; return(MUL_ASSIGN); }
"/=" { cnt+=yyleng;ECHO; return(DIV_ASSIGN); }
"%=" { cnt+=yyleng;ECHO; return(MOD_ASSIGN); }
"&=" { cnt+=yyleng;ECHO; return(AND_ASSIGN); }
"^=" { cnt+=yyleng;ECHO; return(XOR_ASSIGN); }
"|=" { cnt+=yyleng;ECHO; return(OR_ASSIGN); }
">>" { cnt+=yyleng;ECHO; return(RIGHT_OP); }
"<<" { cnt+=yyleng;ECHO; return(LEFT_OP); }
"++" { cnt+=yyleng;ECHO; return(INC_OP); }
"--" { cnt+=yyleng;ECHO; return(DEC_OP); }
"->" { cnt+=yyleng;ECHO; return(PTR_OP); }
"&&" { cnt+=yyleng;ECHO; return(AND_OP); }
"||" { cnt+=yyleng;ECHO; return(OR_OP); }
"<=" { cnt+=yyleng;ECHO; return(LE_OP); }
">=" { cnt+=yyleng;ECHO; return(GE_OP); }
"==" { cnt+=yyleng;ECHO; return(EQ_OP); }
"!=" { cnt+=yyleng;ECHO; return(NE_OP); }
";" { cnt+=yyleng;ECHO; return(';'); }
("{"|"<%") { cnt+=yyleng;ECHO; return('{'); }
("}"|"%>") { cnt+=yyleng;ECHO; return('}'); }
"," { cnt+=yyleng;ECHO; return(','); }
":" { cnt+=yyleng;ECHO; return(':'); }
"=" { cnt+=yyleng;ECHO; return('='); }
"(" { cnt+=yyleng;ECHO; return('('); }
")" { cnt+=yyleng;ECHO; return(')'); }
("["|"<:") { cnt+=yyleng;ECHO; return('['); }
("]"|":>") { cnt+=yyleng;ECHO; return(']'); }
"." { cnt+=yyleng;ECHO; return('.'); }
"&" { cnt+=yyleng;ECHO; return('&'); }
"!" { cnt+=yyleng;ECHO; return('!'); }
"~" { cnt+=yyleng;ECHO; return('~'); }
"-" { cnt+=yyleng;ECHO; return('-'); }
"+" { cnt+=yyleng;ECHO; return('+'); }
"*" { cnt+=yyleng;ECHO; return('*'); }
"/" { cnt+=yyleng;ECHO; return('/'); }
"%" { cnt+=yyleng;ECHO; return('%'); }
"<" { cnt+=yyleng;ECHO; return('<'); }
">" { cnt+=yyleng;ECHO; return('>'); }
"^" { cnt+=yyleng;ECHO; return('^'); }
"|" { cnt+=yyleng;ECHO; return('|'); }
"?" { cnt+=yyleng;ECHO; return('?'); }
[ ] {cnt+=yyleng;ECHO;}
[\t\v\f] { cnt+=yyleng; }
[\n] {line++;cnt=1;}
. { /* ignore bad characters */ }
%%
/* End-of-input hook for the scanner: always returns 1, so scanning stops
   at the end of the current input instead of continuing with another file. */
int yywrap(void)
{
    return 1;
}
/*
 * Skips the body of a block comment, up to and including the closing
 * star-slash pair. Invoked from the rule that matches the opening delimiter.
 *
 * line and cnt are kept up to date for the characters skipped while
 * searching for a '*'.
 *
 * The character is held in an int and end of input is recognised both as 0
 * and as EOF: scanners differ in what input() returns there, and a plain
 * char cannot represent EOF reliably, which could make an unterminated
 * comment loop forever.
 *
 * Returns 0 in all cases (the value is not used by the caller).
 */
int comment(void)
{
    int c;

    for (;;) {
        /* Advance to the next '*', tracking the position as we go. */
        while ((c = input()) != '*' && c != 0 && c != EOF) {
            if (c == '\n') {
                line++;
                cnt = 1;
            } else {
                cnt++;
            }
        }
        if (c == 0 || c == EOF)
            return 0;               /* unterminated comment: stop at end of input */

        /* A '*' only ends the comment when it is followed by '/'. */
        c = input();
        if (c == '/' || c == 0 || c == EOF)
            return 0;
        unput(c);                   /* re-examine it: it may itself be a '*' */
    }
}
cparser.yacc source code
%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "symbol_table.h"
extern FILE *yyin;
extern FILE *yyout;
extern int column;
extern int line;
extern int cnt;
extern char *yytext,tempid[100];
int temp,err,err1=0;
/*
 * Adds the identifier currently held in tempid to the symbol table, using
 * the type code currently held in temp.
 *
 * If the name is already in the table, prints the type-code legend and a
 * redefinition message, then terminates the program with a failure status.
 * On success sets err1 to 1.
 */
void install(void)
{
    symrec *s = getsym(tempid);

    if (s != 0) {
        printf(" VOID=1 ");
        printf(" CHAR=2 ");
        printf(" INT=3 ");
        printf(" FLOAT=4 ");
        printf(" DOUBLE=4 ");
        printf( "\n\nThere is a Semantic error at Pos : %d : %d : %s is already defined as %d\n\n",line,cnt,s->name,s->type );
        exit(EXIT_FAILURE);     /* an error must not exit with a success status */
    }

    putsym(tempid, temp);
    err1 = 1;
}
int context_check()
{
symrec *s;
s = getsym(tempid);
if (s == 0 )
{printf( "\n\nThere is a Semantic error at Pos : %d : %d : %s is an undeclared identifier\n\n",line,cnt,tempid);exit(0);return 0;}
else
return(s->type);
err1=1;
}
/*
 * Reports a type mismatch between two type codes and terminates the program
 * with a failure status.
 *
 * Nothing is reported unless both t1 and t2 are non-zero.
 * NOTE(review): presumably 0 means "no type recorded" for an operand --
 * confirm against the grammar actions that produce these values.
 */
void type_err(int t1, int t2)
{
    if (!(t1 && t2))
        return;

    printf(" VOID=1 ");
    printf(" CHAR=2 ");
    printf(" INT=3 ");
    printf(" FLOAT=4 ");
    printf(" DOUBLE=4 ");
    printf( "\n\nThere is a Semantic error at Pos : %d : %d : Type mismatch for %s between %d and %d \n\n",line,cnt,tempid,t1,t2);
    err1 = 1;
    exit(EXIT_FAILURE);         /* an error must not exit with a success status */
}
%}
%token IDENTIFIER CONSTANT STRING_LITERAL SIZEOF
%token PTR_OP INC_OP DEC_OP LEFT_OP RIGHT_OP LE_OP GE_OP EQ_OP NE_OP
%token AND_OP OR_OP MUL_ASSIGN DIV_ASSIGN MOD_ASSIGN ADD_ASSIGN
%token SUB_ASSIGN LEFT_ASSIGN RIGHT_ASSIGN AND_ASSIGN
%token XOR_ASSIGN OR_ASSIGN TYPE_NAME SINGLE
%token TYPEDEF EXTERN STATIC AUTO REGISTER
%token CHAR SHORT INT LONG SIGNED UNSIGNED FLOAT DOUBLE CONST VOLATILE VOID
%token STRUCT UNION ENUM ELLIPSIS
%token CASE DEFAULT IF ELSE SWITCH WHILE DO FOR GOTO CONTINUE BREAK RETURN
%nonassoc LOWER_THAN_ELSE
%nonassoc ELSE
%start translation_unit
%%
primary_expression
: IDENTIFIER {$$=context_check();}
| CONSTANT
| STRING_LITERAL
| '(' expression ')' {$$= $2;}
;
postfix_expression
: primary_expression {$$=$1;}
| postfix_expression '[' expression ']'
| postfix_expression '(' ')'
| postfix_expression '(' argument_expression_list ')'
| postfix_expression '.' IDENTIFIER
| postfix_expression PTR_OP IDENTIFIER
| postfix_expression INC_OP
| postfix_expression DEC_OP
;
argument_expression_list
: assignment_expression
| argument_expression_list ',' assignment_expression
;
unary_expression
: postfix_expression {$$=$1;}
| INC_OP unary_expression
| DEC_OP unary_expression
| unary_operator cast_expression
| SIZEOF unary_expression
| SIZEOF '(' type_name ')'
;
unary_operator
: '&'
| '*'
| '+'
| '-'
| '~'
| '!'
;
cast_expression
: unary_expression {$$=$1;}
| '(' type_name ')' cast_expression
;
multiplicative_expression
: cast_expression {$$=$1;}
| multiplicative_expression '*' cast_expression
| multiplicative_expression '/' cast_expression
| multiplicative_expression '%' cast_expression
;
additive_expression
: multiplicative_expression {$$=$1;}
| additive_expression '+' multiplicative_expression
| additive_expression '-' multiplicative_expression
;
shift_expression
: additive_expression {$$=$1;}
| shift_expression LEFT_OP additive_expression
| shift_expression RIGHT_OP additive_expression
;
relational_expression
: shift_expression {$$=$1;}
| relational_expression '<' shift_expression
| relational_expression '>' shift_expression
| relational_expression LE_OP shift_expression
| relational_expression GE_OP shift_expression
;
equality_expression
: relational_expression {$$=$1;}
| equality_expression EQ_OP relational_expression
| equality_expression NE_OP relational_expression
;
and_expression
: equality_expression {$$=$1;}
| and_expression '&' equality_expression
;
exclusive_or_expression
: and_expression {$$=$1;}
| exclusive_or_expression '^' and_expression
;
inclusive_or_expression
: exclusive_or_expression {$$=$1;}
| inclusive_or_expression '|' exclusive_or_expression
;
logical_and_expression
: inclusive_or_expression {$$=$1;}
| logical_and_expression AND_OP inclusive_or_expression
;
logical_or_expression
: logical_and_expression {$$=$1;}
| logical_or_expression OR_OP logical_and_expression
;
conditional_expression
: logical_or_expression {$$=$1;}
| logical_or_expression '?' expression ':' conditional_expression
;
assignment_expression
: conditional_expression {$$=$1;}
| unary_expression assignment_operator assignment_expression {if($1!=$3){type_err($1,$3);}}
;
assignment_operator
: '='
| MUL_ASSIGN
| DIV_ASSIGN
| MOD_ASSIGN
| ADD_ASSIGN
| SUB_ASSIGN
| LEFT_ASSIGN
| RIGHT_ASSIGN
| AND_ASSIGN
| XOR_ASSIGN
| OR_ASSIGN
;
expression
: assignment_expression {$$=$1;}
| expression ',' assignment_expression
;
constant_expression
: conditional_expression
;
declaration
: declaration_specifiers ';'
| declaration_specifiers init_declarator_list ';'
;
declaration_specifiers
: storage_class_specifier
| storage_class_specifier declaration_specifiers
| type_specifier
| type_specifier declaration_specifiers
| type_qualifier
| type_qualifier declaration_specifiers
;
init_declarator_list
: init_declarator
| init_declarator_list ',' init_declarator
;
init_declarator
: declarator
| declarator '=' initializer
;
storage_class_specifier
: TYPEDEF
| EXTERN
| STATIC
| AUTO
| REGISTER
;
type_specifier
: VOID {temp=1;}
| CHAR {temp=2;}
| SHORT {temp=3;}
| INT {temp=3;}
| LONG {temp=3;}
| FLOAT {temp=4;}
| DOUBLE {temp=4;}
| SIGNED
| UNSIGNED
| struct_or_union_specifier
| enum_specifier
| TYPE_NAME
;
struct_or_union_specifier
: struct_or_union IDENTIFIER '{' struct_declaration_list '}' {install();}
| struct_or_union '{' struct_declaration_list '}'
| struct_or_union IDENTIFIER {install();}
;
struct_or_union
: STRUCT
| UNION
;
struct_declaration_list
: struct_declaration
| struct_declaration_list struct_declaration
;
struct_declaration
: specifier_qualifier_list struct_declarator_list ';'
;
specifier_qualifier_list
: type_specifier specifier_qualifier_list
| type_specifier
| type_qualifier specifier_qualifier_list
| type_qualifier
;
struct_declarator_list
: struct_declarator
| struct_declarator_list ',' struct_declarator
;
struct_declarator
: declarator
| ':' constant_expression
| declarator ':' constant_expression
;
enum_specifier
: ENUM '{' enumerator_list '}'
| ENUM IDENTIFIER '{' enumerator_list '}'
| ENUM IDENTIFIER
;
enumerator_list
: enumerator
| enumerator_list ',' enumerator
;
enumerator
: IDENTIFIER {context_check();}
| IDENTIFIER '=' constant_expression //{context_check();}
;
type_qualifier
: CONST
| VOLATILE
;
declarator
: pointer direct_declarator
| direct_declarator
;
direct_declarator
: IDENTIFIER {install();}
| '(' declarator ')'
| direct_declarator '[' constant_expression ']'
| direct_declarator '[' ']'
| direct_declarator '(' parameter_type_list ')'
| direct_declarator '(' identifier_list ')'
| direct_declarator '(' ')'
;
pointer
: '*'
| '*' type_qualifier_list
| '*' pointer
| '*' type_qualifier_list pointer
;
type_qualifier_list
: type_qualifier
| type_qualifier_list type_qualifier
;
parameter_type_list
: parameter_list
| parameter_list ',' ELLIPSIS
;
parameter_list
: parameter_declaration
| parameter_list ',' parameter_declaration
;
parameter_declaration
: declaration_specifiers declarator
| declaration_specifiers abstract_declarator
| declaration_specifiers
;
identifier_list
: IDENTIFIER {install();}
| identifier_list ',' IDENTIFIER {install();}
;
type_name
: specifier_qualifier_list
| specifier_qualifier_list abstract_declarator
;
abstract_declarator
: pointer
| direct_abstract_declarator
| pointer direct_abstract_declarator
;
direct_abstract_declarator
: '(' abstract_declarator ')'
| '[' ']'
| '[' constant_expression ']'
| direct_abstract_declarator '[' ']'
| direct_abstract_declarator '[' constant_expression ']'
| '(' ')'
| '(' parameter_type_list ')'
| direct_abstract_declarator '(' ')'
| direct_abstract_declarator '(' parameter_type_list ')'
;
initializer
: assignment_expression {$$=$1;}
| '{' initializer_list '}'
| '{' initializer_list ',' '}'
;
initializer_list
: initializer
| initializer_list ',' initializer
;
statement
: labeled_statement
| compound_statement
| expression_statement
| selection_statement
| iteration_statement
| jump_statement
;
labeled_statement
: IDENTIFIER ':' statement //{context_check();}
| CASE constant_expression ':' statement
| DEFAULT ':' statement
;
compound_statement
: '{' '}'
| '{' statement_list '}'
| '{' declaration_list '}'
| '{' declaration_list statement_list '}'
;
declaration_list
: declaration
| declaration_list declaration
;
statement_list
: statement
| statement_list statement
;
expression_statement
: ';'
| expression ';'
;
/* if / if-else / switch. The %prec on the else-less form, together with the
   LOWER_THAN_ELSE and ELSE precedence declarations, resolves the dangling
   else. The alternatives form one rule, terminated by a single ';'. */
selection_statement
: IF '(' expression ')' statement %prec LOWER_THAN_ELSE
| IF '(' expression ')' statement ELSE statement
| SWITCH '(' expression ')' statement
;
iteration_statement
: WHILE '(' expression ')' statement
| DO statement WHILE '(' expression ')' ';'
| FOR '(' expression_statement expression_statement ')' statement
| FOR '(' expression_statement expression_statement expression ')' statement
;
jump_statement
: GOTO IDENTIFIER ';' //{context_check();}
| CONTINUE ';'
| BREAK ';'
| RETURN ';'
| RETURN expression ';'
;
translation_unit
: external_declaration
| translation_unit external_declaration
;
external_declaration
: function_definition
| declaration
;
function_definition
: declaration_specifiers declarator declaration_list compound_statement
| declaration_specifiers declarator compound_statement
| declarator declaration_list compound_statement
| declarator compound_statement
;
%%
/*
 * Parser error callback: prints the current line/column position of the
 * scanner and terminates the program with a failure status.
 *
 * s is the message supplied by the parser; it is not printed.
 */
int yyerror(char *s)
{
    (void)s;
    fflush(stdout);
    err = 1;
    printf("Syntax error at Pos : %d : %d\n",line,cnt);
    exit(EXIT_FAILURE);         /* an error must not exit with a success status */
}
main(argc,argv)
int argc;
char **argv;
{
char *fname;
++argv,--argc;/*skip program name*/
if(argc>0)
{
yyin=fopen(argv[0],"r");
fname=argv[0];
strcat(fname,"_output");
yyout=fopen(fname,"w");
}
else
{
printf("Please give the c filename as an argument.\n");
}
yyparse();
if(err==0)
printf("No Syntax errors found!\n");
fname=argv[0];strcat(fname,"_symbol-table");
FILE *sym_tab=fopen(fname,"w");
fprintf(sym_tab,"Type\tSymbol\n");
symrec *ptr;
for(ptr=sym_table;ptr!=(symrec *)0;ptr=(symrec *)ptr->next)
{
fprintf(sym_tab,"%d\t%s\n",ptr->type,ptr->name);
}
fclose(sym_tab);
}
Symbol table.h source code
#ifndef SYMBOL_TABLE_H
#define SYMBOL_TABLE_H

/* malloc/free and strlen/strcmp/memcpy are used below; include their
   headers here so the functions are properly declared wherever this
   header is included. */
#include <stdlib.h>
#include <string.h>

/* Type codes stored in symrec.type. */
#define t_void 1
#define t_char 2
#define t_int 3
#define t_float 4

/* One symbol-table entry; entries form a singly linked list. */
struct symrec
{
    char *name;             /* heap-allocated copy of the symbol name */
    int type;               /* type code given when the symbol was added */
    struct symrec *next;    /* next (older) entry, or NULL */
};
typedef struct symrec symrec;

/* Head of the list; the most recently added symbol comes first. */
symrec *sym_table = NULL;

symrec *putsym(const char *sym_name, int sym_type);
symrec *getsym(const char *sym_name);

/*
 * Prepends a new entry for sym_name with type code sym_type to the table.
 * The name is copied, so the caller's buffer may be reused afterwards.
 * No check for an existing entry with the same name is made here.
 *
 * Returns the new entry, or NULL if memory could not be allocated (the
 * table is left unchanged in that case).
 */
symrec *putsym(const char *sym_name, int sym_type)
{
    size_t len = strlen(sym_name) + 1;
    symrec *ptr = malloc(sizeof *ptr);

    if (ptr == NULL)
        return NULL;

    ptr->name = malloc(len);
    if (ptr->name == NULL) {
        free(ptr);
        return NULL;
    }
    memcpy(ptr->name, sym_name, len);
    ptr->type = sym_type;
    ptr->next = sym_table;
    sym_table = ptr;
    return ptr;
}

/*
 * Searches the table for an entry whose name equals sym_name.
 * Returns the first match starting from the head of the list, or NULL if
 * there is none.
 */
symrec *getsym(const char *sym_name)
{
    symrec *ptr;

    for (ptr = sym_table; ptr != NULL; ptr = ptr->next) {
        if (strcmp(ptr->name, sym_name) == 0)
            return ptr;
    }
    return NULL;
}

#endif /* SYMBOL_TABLE_H */

In general terms, when you have an assignment operation, you need to check the left operand to make sure it's an lvalue and issue an error if it's not. This is most commonly done as part of type checking -- you keep attributes about values (e.g., whether it is an lvalue or not) along with the type, and check that those attributes are correct for each use of a value.
So what you might do is use %union to define a parser value object that can hold this info:
%union {
struct {
Type *type;
int is_lvalue;
} valinfo;
}
%type<valinfo> assignment_expression unary_expression
Then, your rule for assignments would check this along with the type:
assignment_expression:
unary_expression assignment_operator assignment_expression {
if (!$1.is_lvalue)
error("assigning to non-lvalue");
if ($1.type != $3.type && !type_is_implicitly_convertable($3.type, $1.type))
error("type mismatch in assignment");
$$.type = $1. type;
$$.is_lvalue = 0; }
Note that you need to make sure to set $$ properly in EVERY rule action whose value might be used by some other rule action; your code fails to do this, so it likely won't do anything useful as is.

Related

ANTLR v3 Treewalker class. How to evaluate right associative function such as Factorial

I'm trying to build an expression evaluator with ANTLR v3, but I can't get the factorial function to work because it is right associative.
This is the code:
class ExpressionParser extends Parser;
options { buildAST=true; }
imaginaryTokenDefinitions :
SIGN_MINUS
SIGN_PLUS;
expr : LPAREN^ sumExpr RPAREN! ;
sumExpr : prodExpr ((PLUS^|MINUS^) prodExpr)* ;
prodExpr : powExpr ((MUL^|DIV^|MOD^) powExpr)* ;
powExpr : runary (POW^ runary)? ;
runary : unary (FAT)?;
unary : (SIN^|COS^|TAN^|LOG^|LN^|RAD^)* signExpr;
signExpr : (
m:MINUS^ {#m.setType(SIGN_MINUS);}
| p:PLUS^ {#p.setType(SIGN_PLUS);}
)? atom ;
atom : NUMBER | expr ;
class ExpressionLexer extends Lexer;
PLUS : '+' ;
MINUS : '-' ;
MUL : '*' ;
DIV : '/' ;
MOD : '%' ;
POW : '^' ;
SIN : 's' ;
COS : 'c' ;
TAN : 't' ;
LOG : 'l' ;
LN : 'n' ;
RAD : 'r' ;
FAT : 'f' ;
LPAREN: '(' ;
RPAREN: ')' ;
SEMI : ';' ;
protected DIGIT : '0'..'9' ;
NUMBER : (DIGIT)+ ('.' (DIGIT)+)?;
{import java.lang.Math;}
class ExpressionTreeWalker extends TreeParser;
// Evaluates an expression tree bottom-up and returns its numeric value.
// The factorial accumulator f is a double so that results beyond 12! do not
// overflow an int, and unary plus leaves its operand unchanged.
expr returns [double r]
{ double a,b,f=1; int i; r=0; }
: #(PLUS a=expr b=expr) { r=a+b; }
| #(MINUS a=expr b=expr) { r=a-b; }
| #(MUL a=expr b=expr) { r=a*b; }
| #(DIV a=expr b=expr) { r=a/b; }
| #(MOD a=expr b=expr) { r=a%b; }
| #(POW a=expr b=expr) { r=Math.pow(a,b); }
| #(SIN a=expr ) { r=Math.sin(a); }
| #(COS a=expr ) { r=Math.cos(a); }
| #(TAN a=expr ) { r=Math.tan(a); }
| #(LOG a=expr ) { r=Math.log10(a); }
| #(LN a=expr ) { r=Math.log(a); }
| #(RAD a=expr ) { r=Math.sqrt(a); }
| #(FAT a=expr ) { for(i=1; i<=a; i++){f=f*i;}; r=f;}
| #(LPAREN a=expr) { r=a; }
| #(SIGN_MINUS a=expr) { r=-1*a; }
| #(SIGN_PLUS a=expr) { r=a; }
| d:NUMBER { r=Double.parseDouble(d.getText()); } ;
if I change FAT matching case in class TreeWalker with something like this:
| #(a=expr FAT ) { for(i=1; i<=a; i++){f=f*i;}; r=(double)f;}
I get this errors:
Expression.g:56:7: rule classDef trapped:
Expression.g:56:7: unexpected token: a
error: aborting grammar 'ExpressionTreeWalker' due to errors
Exiting due to errors.
Your tree walker (the original one) is fine, as far as I can see.
However, you probably need to mark FAT in the grammar:
runary : unary (FAT^)?;
(Note the hat ^, as in all the other productions.)
Edit:
As explained in the Antlr3 wiki, the hat operator is needed to make the node the "root of subtree created for entire enclosing rule even if nested in a subrule". In this case, the ! operator is nested in a conditional subrule ((FAT)?). That's independent of whether the operator is prefix or postfix.
Note that in your grammar the ! operator is not right-associative since a!! is not valid at all. But I would say that associativity is only meaningful for infix operators.

Antlr grammar with EBNF

I'm trying to generate a Lexer/Parser through a simple grammar using ANTLR. What I've done right now :
grammar ExprV2;
// Named actions are introduced with '@' in ANTLR 3; these place the
// generated parser and lexer in the mypack.parte2 package.
@header {
package mypack.parte2;
}
@lexer::header {
package mypack.parte2;
}
start
: expr EOF { System.out.println($expr.val);}
;
expr returns [int val]
: term e=exprP[$term.val] { $val = $e.val; }
;
exprP[int i] returns [int val]
: { $val = $i; }
| '+' term e=exprP[$i + $term.val] { $val = $e.val; }
| '-' term e=exprP[$i - $term.val] { $val = $e.val; }
;
term returns [int val]
: fact e=termP[$fact.val] { $val = $e.val; }
;
termP[int i] returns [int val]
: {$val = $i;}
| '*' fact e=termP[$i * $fact.val] {$val = $e.val; }
| '/' fact e=termP[$i / $fact.val] {$val = $e.val; }
;
fact returns [int val]
: '(' expr ')' { $val = $expr.val; }
| NUM { $val=Integer.parseInt($NUM.text); }
;
NUM : '0'..'9'+ ;
WS : (' ' | '\t' |'\r' | '\n')+ { skip(); };
What I would like to do is generate a Lexer/Parser from a grammar written using EBNF, but I'm stuck and I don't know how to proceed. I looked on the internet but did not succeed in finding anything clear. Thanks to all!

Simplified smalltalk grammar using antlr - unary minus and message chaining

I am writing simple smalltalk-like grammar using antlr. It is simplified version of smalltalk, but basic ideas are the same (message passing for example).
Here is my grammar so far:
grammar GAL;
options {
//k=2;
backtrack=true;
}
ID : ('a'..'z'|'A'..'Z'|'_') ('a'..'z'|'A'..'Z'|'0'..'9'|'_')*
;
INT : '0'..'9'+
;
FLOAT
: ('0'..'9')+ '.' ('0'..'9')* EXPONENT?
| '.' ('0'..'9')+ EXPONENT?
| ('0'..'9')+ EXPONENT
;
COMMENT
: '"' ( options {greedy=false;} : . )* '"' {$channel=HIDDEN;}
;
WS : ( ' '
| '\t'
) {$channel=HIDDEN;}
;
NEW_LINE
: ('\r'?'\n')
;
STRING
: '\'' ( ESC_SEQ | ~('\\'|'\'') )* '\''
;
fragment
EXPONENT : ('e'|'E') ('+'|'-')? ('0'..'9')+ ;
fragment
HEX_DIGIT : ('0'..'9'|'a'..'f'|'A'..'F') ;
fragment
ESC_SEQ
: '\\' ('b'|'t'|'n'|'f'|'r'|'\"'|'\''|'\\')
| UNICODE_ESC
| OCTAL_ESC
;
fragment
OCTAL_ESC
: '\\' ('0'..'3') ('0'..'7') ('0'..'7')
| '\\' ('0'..'7') ('0'..'7')
| '\\' ('0'..'7')
;
fragment
UNICODE_ESC
: '\\' 'u' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
;
BINARY_MESSAGE_CHAR
: ('~' | '!' | '#' | '%' | '&' | '*' | '-' | '+' | '=' | '|' | '\\' | '<' | '>' | ',' | '?' | '/')
('~' | '!' | '#' | '%' | '&' | '*' | '-' | '+' | '=' | '|' | '\\' | '<' | '>' | ',' | '?' | '/')?
;
// parser
program
: NEW_LINE* (statement (NEW_LINE+ | EOF))*
;
statement
: message_sending
| return_statement
| assignment
| temp_variables
;
return_statement
: '^' statement
;
assignment
: identifier ':=' statement
;
temp_variables
: '|' identifier+ '|'
;
object
: raw_object
;
raw_object
: number
| string
| identifier
| literal
| block
| '(' message_sending ')'
;
message_sending
: keyword_message_sending
;
keyword_message_sending
: binary_message_sending keyword_message?
;
binary_message_sending
: unary_message_sending binary_message*
;
unary_message_sending
: object (unary_message)*
;
unary_message
: unary_message_selector
;
binary_message
: binary_message_selector unary_message_sending
;
keyword_message
: (NEW_LINE? single_keyword_message_selector NEW_LINE? binary_message_sending)+
;
block
:
'[' (block_signiture
)? NEW_LINE*
block_body
NEW_LINE* ']'
;
block_body
: (statement
)?
(NEW_LINE+ statement
)*
;
block_signiture
:
(':' identifier
)+ '|'
;
unary_message_selector
: identifier
;
binary_message_selector
: BINARY_MESSAGE_CHAR
;
single_keyword_message_selector
: identifier ':'
;
keyword_message_selector
: single_keyword_message_selector+
;
symbol
: '#' (string | identifier | binary_message_selector | keyword_message_selector)
;
literal
: symbol block? // if there is block then this is method
;
number
: /*'-'?*/
( INT | FLOAT )
;
string
: STRING
;
identifier
: ID
;
1. Unary Minus
I have a problem with unary minus for numbers (the commented-out part of the number rule). The problem is that minus is a valid binary message. To make things worse, two minus signs are also a valid binary message. What I need is unary minus in cases where there is no object to send a binary message to (for example, in -3+4 the minus should be unary because there is nothing in front of -3). Also, (-3) should be unary minus too. It would be great if 1 -- -2 were the binary message '--' with parameter -2, but I can live without that. How can I do this?
If I uncomment unary minus I get error MismatchedSetException(0!=null) when parsing something like 1-2.
2. Message chaining
What would be the best way to implement message chaining like in Smalltalk? What I mean by this is something like this:
obj message1 + 3;
message2;
+ 3;
keyword: 2+3
where every message would be sent to the same object, in this case obj. Message precedence should be kept (unary > binary > keyword).
3. Backtrack
Most of this grammar can be parsed with k=2, but when input is something like this:
1 + 2
Obj message:
1 + 2
message2: 'string'
the parser tries to match Obj as single_keyword_message_selector and raises UnwantedTokenException on the token message. If I remove k=2 and set backtrack=true (as I did), everything works as it should. How can I remove backtracking and still get the desired behaviour?
Also, most of the grammar can be parsed using k=1, so I tried to set k=2 only for rules that require it, but that is ignored. I did something like this:
rule
options { k = 2; }
: // rule definition
;
but it doesn't work until I set k in global options. What am I missing here?
Update:
Rewriting the grammar from scratch is not an ideal solution, because I have a lot of code that depends on it. Also, some Smalltalk features that are missing are missing by design. This is not intended to be another Smalltalk implementation; Smalltalk was just an inspiration.
I would be more than happy to have unary minus working in cases like this: -1+2 or 2+(-1). Cases like 2 -- -1 are just not so important.
Also, message chaining should be done as simply as possible. That means that I don't like the idea of changing the AST I am generating.
About backtrack - I can live with it, just asked here out of personal curiosity.
This is little modified grammar that generates AST - maybe it will help to better understand what I don't want to change. (temp_variables are probably going to be deleted, I havent made that decision).
grammar GAL;
options {
//k=2;
backtrack=true;
language=CSharp3;
output=AST;
}
tokens {
HASH = '#';
COLON = ':';
DOT = '.';
CARET = '^';
PIPE = '|';
LBRACKET = '[';
RBRACKET = ']';
LPAREN = '(';
RPAREN = ')';
ASSIGN = ':=';
}
// generated files options
// Named actions are introduced with '@' in ANTLR 3. The '#pragma' lines
// inside the header actions are C# directives and keep their '#'.
@namespace { GAL.Compiler }
@lexer::namespace { GAL.Compiler}
// this will disable CLSComplaint warning in ANTLR generated code
@parser::header {
// Do not bug me about [System.CLSCompliant(false)]
#pragma warning disable 3021
}
@lexer::header {
// Do not bug me about [System.CLSCompliant(false)]
#pragma warning disable 3021
}
ID : ('a'..'z'|'A'..'Z'|'_') ('a'..'z'|'A'..'Z'|'0'..'9'|'_')*
;
INT : '0'..'9'+
;
FLOAT
: ('0'..'9')+ '.' ('0'..'9')* EXPONENT?
| '.' ('0'..'9')+ EXPONENT?
| ('0'..'9')+ EXPONENT
;
COMMENT
: '"' ( options {greedy=false;} : . )* '"' {$channel=Hidden;}
;
WS : ( ' '
| '\t'
) {$channel=Hidden;}
;
NEW_LINE
: ('\r'?'\n')
;
STRING
: '\'' ( ESC_SEQ | ~('\\'|'\'') )* '\''
;
fragment
EXPONENT : ('e'|'E') ('+'|'-')? ('0'..'9')+ ;
fragment
HEX_DIGIT : ('0'..'9'|'a'..'f'|'A'..'F') ;
fragment
ESC_SEQ
: '\\' ('b'|'t'|'n'|'f'|'r'|'\"'|'\''|'\\')
| UNICODE_ESC
| OCTAL_ESC
;
fragment
OCTAL_ESC
: '\\' ('0'..'3') ('0'..'7') ('0'..'7')
| '\\' ('0'..'7') ('0'..'7')
| '\\' ('0'..'7')
;
fragment
UNICODE_ESC
: '\\' 'u' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
;
BINARY_MESSAGE_CHAR
: ('~' | '!' | '#' | '%' | '&' | '*' | '-' | '+' | '=' | '|' | '\\' | '<' | '>' | ',' | '?' | '/')
('~' | '!' | '#' | '%' | '&' | '*' | '-' | '+' | '=' | '|' | '\\' | '<' | '>' | ',' | '?' | '/')?
;
// parser
public program returns [ AstProgram program ]
: { $program = new AstProgram(); }
NEW_LINE*
( statement (NEW_LINE+ | EOF)
{ $program.AddStatement($statement.stmt); }
)*
;
statement returns [ AstNode stmt ]
: message_sending
{ $stmt = $message_sending.messageSending; }
| return_statement
{ $stmt = $return_statement.ret; }
| assignment
{ $stmt = $assignment.assignment; }
| temp_variables
{ $stmt = $temp_variables.tempVars; }
;
return_statement returns [ AstReturn ret ]
: CARET statement
{ $ret = new AstReturn($CARET, $statement.stmt); }
;
assignment returns [ AstAssignment assignment ]
: dotted_expression ASSIGN statement
{ $assignment = new AstAssignment($dotted_expression.dottedExpression, $ASSIGN, $statement.stmt); }
;
temp_variables returns [ AstTempVariables tempVars ]
: p1=PIPE
{ $tempVars = new AstTempVariables($p1); }
( identifier
{ $tempVars.AddVar($identifier.identifier); }
)+
p2=PIPE
{ $tempVars.EndToken = $p2; }
;
object returns [ AstNode obj ]
: number
{ $obj = $number.number; }
| string
{ $obj = $string.str; }
| dotted_expression
{ $obj = $dotted_expression.dottedExpression; }
| literal
{ $obj = $literal.literal; }
| block
{ $obj = $block.block; }
| LPAREN message_sending RPAREN
{ $obj = $message_sending.messageSending; }
;
message_sending returns [ AstKeywordMessageSending messageSending ]
: keyword_message_sending
{ $messageSending = $keyword_message_sending.keywordMessageSending; }
;
keyword_message_sending returns [ AstKeywordMessageSending keywordMessageSending ]
: binary_message_sending
{ $keywordMessageSending = new AstKeywordMessageSending($binary_message_sending.binaryMessageSending); }
( keyword_message
{ $keywordMessageSending = $keywordMessageSending.NewMessage($keyword_message.keywordMessage); }
)?
;
binary_message_sending returns [ AstBinaryMessageSending binaryMessageSending ]
: unary_message_sending
{ $binaryMessageSending = new AstBinaryMessageSending($unary_message_sending.unaryMessageSending); }
( binary_message
{ $binaryMessageSending = $binaryMessageSending.NewMessage($binary_message.binaryMessage); }
)*
;
unary_message_sending returns [ AstUnaryMessageSending unaryMessageSending ]
: object
{ $unaryMessageSending = new AstUnaryMessageSending($object.obj); }
(
unary_message
{ $unaryMessageSending = $unaryMessageSending.NewMessage($unary_message.unaryMessage); }
)*
;
unary_message returns [ AstUnaryMessage unaryMessage ]
: unary_message_selector
{ $unaryMessage = new AstUnaryMessage($unary_message_selector.unarySelector); }
;
binary_message returns [ AstBinaryMessage binaryMessage ]
: binary_message_selector unary_message_sending
{ $binaryMessage = new AstBinaryMessage($binary_message_selector.binarySelector, $unary_message_sending.unaryMessageSending); }
;
keyword_message returns [ AstKeywordMessage keywordMessage ]
:
{ $keywordMessage = new AstKeywordMessage(); }
(
NEW_LINE?
single_keyword_message_selector
NEW_LINE?
binary_message_sending
{ $keywordMessage.AddMessagePart($single_keyword_message_selector.singleKwSelector, $binary_message_sending.binaryMessageSending); }
)+
;
block returns [ AstBlock block ]
: LBRACKET
{ $block = new AstBlock($LBRACKET); }
(
block_signiture
{ $block.Signiture = $block_signiture.blkSigniture; }
)? NEW_LINE*
block_body
{ $block.Body = $block_body.blkBody; }
NEW_LINE*
RBRACKET
{ $block.SetEndToken($RBRACKET); }
;
// Collects the statements of a block body, in source order, into a list.
// The rule-level init action is introduced with '@init' in ANTLR 3.
block_body returns [ IList<AstNode> blkBody ]
@init { $blkBody = new List<AstNode>(); }
:
( s1=statement
{ $blkBody.Add($s1.stmt); }
)?
( NEW_LINE+ s2=statement
{ $blkBody.Add($s2.stmt); }
)*
;
// Parses the ':identifier' parameters of a block up to the closing PIPE.
// The rule-level init action is introduced with '@init' in ANTLR 3.
block_signiture returns [ AstBlockSigniture blkSigniture ]
@init { $blkSigniture = new AstBlockSigniture(); }
:
( COLON identifier
{ $blkSigniture.AddIdentifier($COLON, $identifier.identifier); }
)+ PIPE
{ $blkSigniture.SetEndToken($PIPE); }
;
unary_message_selector returns [ AstUnaryMessageSelector unarySelector ]
: identifier
{ $unarySelector = new AstUnaryMessageSelector($identifier.identifier); }
;
binary_message_selector returns [ AstBinaryMessageSelector binarySelector ]
: BINARY_MESSAGE_CHAR
{ $binarySelector = new AstBinaryMessageSelector($BINARY_MESSAGE_CHAR); }
;
single_keyword_message_selector returns [ AstIdentifier singleKwSelector ]
: identifier COLON
{ $singleKwSelector = $identifier.identifier; }
;
// Gathers one or more 'identifier:' parts into a single keyword selector.
// The rule-level init action is introduced with '@init' in ANTLR 3.
keyword_message_selector returns [ AstKeywordMessageSelector keywordSelector ]
@init { $keywordSelector = new AstKeywordMessageSelector(); }
:
( single_keyword_message_selector
{ $keywordSelector.AddIdentifier($single_keyword_message_selector.singleKwSelector); }
)+
;
// Parses a symbol: HASH followed by exactly one of a string, an identifier,
// a binary selector or a keyword selector. Each alternative builds an
// AstSymbol from the HASH token and the matched value.
symbol returns [ AstSymbol symbol ]
    : hash=HASH
      ( text=string
        { $symbol = new AstSymbol($hash, $text.str); }
      | ident=identifier
        { $symbol = new AstSymbol($hash, $ident.identifier); }
      | binSel=binary_message_selector
        { $symbol = new AstSymbol($hash, $binSel.binarySelector); }
      | kwSel=keyword_message_selector
        { $symbol = new AstSymbol($hash, $kwSel.keywordSelector); }
      )
    ;
// Parses a literal that starts with a symbol. On its own the symbol is the
// result; when a block follows, the result is replaced by an AstMethod built
// from the symbol and the block.
literal returns [ AstNode literal ]
    : sym=symbol
      { $literal = $sym.symbol; }
      ( blk=block
        // A symbol followed by a block is a method.
        { $literal = new AstMethod($sym.symbol, $blk.block); }
      )?
    ;
// Parses a numeric literal from a single INT or FLOAT token.
// The optional leading '-' is commented out, so no sign is accepted here.
// NOTE(review): the FLOAT alternative also constructs an AstInt; confirm
// whether a dedicated float node was intended.
number returns [ AstNode number ]
    : /*'-'?*/
      ( intTok=INT
        { $number = new AstInt($intTok); }
      | floatTok=FLOAT
        { $number = new AstInt($floatTok); }
      )
    ;
// Wraps a single STRING token in an AstString node.
string returns [ AstString str ]
    : strTok=STRING
      { $str = new AstString($strTok); }
    ;
// Parses identifiers joined by DOT. The node is created from the first
// identifier; each further DOT-prefixed identifier is appended to it.
dotted_expression returns [ AstDottedExpression dottedExpression ]
    : head=identifier
      { $dottedExpression = new AstDottedExpression($head.identifier); }
      ( DOT member=identifier
        { $dottedExpression.AddIdentifier($member.identifier); }
      )*
    ;
// Wraps a single ID token in an AstIdentifier node.
identifier returns [ AstIdentifier identifier ]
    : idTok=ID
      { $identifier = new AstIdentifier($idTok); }
    ;
Hi Smalltalk Grammar writer,
First, to get a Smalltalk grammar to parse expressions such as (1 -- -2) correctly, and to support the optional '.' after the last statement, you should treat whitespace as significant: don't put it on the hidden channel.
Second, the grammar so far does not break the rules down into small enough fragments. This will keep causing problems like the ones you have already seen with K=2 and backtracking.
I suggest you check out a working Smalltalk grammar in ANTLR as defined by the Redline Smalltalk project http://redline.st & https://github.com/redline-smalltalk/redline-smalltalk
Rgs, James.

ANTLR not matching unicode escaped character

I'm writing a parser/interpreter for a C-like language and I need to interpret escaped characters. One of them is the unicode-escaped sequence with this pattern "\uXXXX" where X is some hex number.
My ANTLR rules look like this:
// Maps one escape sequence, or one other character, to a char result.
// NOTE(review): the rule name is lower-case, which ANTLR treats as a parser
// rule; the quoted literals below are then token references rather than
// character matches. Confirm whether a lexer rule was intended.
public char returns [char c]
: '\\"' { $c = '"'; }
| '\\\\' { $c = '\\'; }
| '\\/' { $c = '/'; }
| '\\b' { $c = '\b'; }
| '\\f' { $c = '\f'; }
| '\\n' { $c = '\n'; }
| '\\r' { $c = '\r'; }
| '\\t' { $c = '\t'; }
// Unicode escape: the result is the fixed character 'e', not the decoded value.
| '\\u' HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT { $c = 'e'; }
// Fallback for anything that is neither a backslash nor a double quote: yields '/'.
| ~('\\' | '"') { $c = '/'; }
;
// Fragment rule matching exactly one hexadecimal digit (0-9, a-f, A-F).
// As a fragment it does not produce a token of its own.
fragment HEXDIGIT
    : ('0'..'9'|'a'..'f'|'A'..'F')
    ;
I'm feeding it the string "\u1234", for which I expect an 'e', but I'm getting a '/' instead, which is the result of the fallback alternative for everything else.
Is there some magic juju going on with fragments and rules or something that I'm not aware of?
As mentioned by Adam, char is a parser rule at the moment, but should be made a lexer rule instead, in which case you can't let it return a char (lexer rules always return an instance of a Token!).
You can adjust the inner-text of a token using its setText(...) method like this (assuming Java is the target language):
// lexer rules start with a capital!
// Lexer rule for one character: either a recognised escape sequence or any
// character other than a backslash or a double quote. For escapes, the token
// text is replaced via setText(...).
Char
    : '\\"'   { setText("\""); }
    | '\\\\'  { setText("\\"); }
    | '\\/'   { setText("/"); }
    | '\\b'   { setText("\b"); }
    | '\\f'   { setText("\f"); }
    | '\\n'   { setText("\n"); }
    | '\\r'   { setText("\r"); }
    | '\\t'   { setText("\t"); }
    // Unicode escape: drop the two-character prefix, parse the four hex
    // digits, and set the text to the original escape plus its decimal value.
    | '\\u' HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT
      {
        String escape = getText();
        int value = Integer.parseInt(escape.substring(2), 16);
        setText(escape + " base 10 = " + value);
      }
    // Any other character keeps its matched text unchanged.
    | ~('\\' | '"')
    ;
// Fragment rule matching exactly one hexadecimal digit in either case.
// Fragments are only usable from other lexer rules and emit no token themselves.
fragment HEXDIGIT
    : '0'..'9' | 'a'..'f' | 'A'..'F'
    ;

Why do I get a syntax error in my program made with flex and yacc?

I made a program that is supposed to recognize a simple grammar. When I input what I think is supposed to be a valid statement, I get an error. Specifically, if I type
int a;
int b;
it doesn't work. After I type "int a;" the program echoes ";" for some reason. Then, when I type "int b;", I get a syntax error.
The lex file:
%{
/* C prologue copied into the generated scanner.
   y.tab.h is presumably the yacc-generated header defining the token codes
   returned by the rules below. */
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include "y.tab.h"
%}
/* Named patterns; {name} in the rules section expands to the text on the right.
   NOTE(review): each keyword name expands to its UPPER-CASE spelling only;
   `int` is the only one that also accepts the lower-case spelling. Confirm
   that this is intended. */
else ELSE
if IF
int INT|int
return RETURN
void VOID
while WHILE
/* NOTE(review): '*' allows id and num to match the empty string; confirm
   that '+' (one or more) was not intended. */
id [a-zA-Z]*
num [0-9]*
lte <=
gte >=
equal ==
notequal !=
%%
 /* One rule per named token; earlier rules win when two patterns match the
    same text, so the keyword rules are listed before {id}.
    NOTE(review): no rule here matches ';', brackets, arithmetic operators or
    whitespace. Such input is handled by lex's default action instead of
    being returned to the parser as a token. */
{else} { return ELSE; }
{if} { return IF; }
{int} { return INT; }
{return} { return RETURN; }
{void} { return VOID; }
{while} { return WHILE; }
{id} { return ID; }
{num} { return NUM; }
{lte} { return LTE; }
{gte} { return GTE; }
{equal} { return EQUAL; }
{notequal} { return NOTEQUAL; }
%%
The yacc file:
/* C-Minus BNF Grammar */
/* Named tokens. The scanner is expected to return these codes; every other
   terminal in the rules below is a quoted character literal, so the scanner
   must return that character's code for it to be recognised. */
%token ELSE
%token IF
%token INT
%token RETURN
%token VOID
%token WHILE
%token ID
%token NUM
%token LTE
%token GTE
%token EQUAL
%token NOTEQUAL
%%
/* A program is a non-empty sequence of variable and function declarations. */
program : declaration_list ;
declaration_list : declaration_list declaration | declaration ;
declaration : var_declaration | fun_declaration ;
/* A variable is either a scalar or an array with a NUM size. */
var_declaration : type_specifier ID ';'
| type_specifier ID '[' NUM ']' ';' ;
type_specifier : INT | VOID ;
/* A function declaration always carries a body (compound_stmt). */
fun_declaration : type_specifier ID '(' params ')' compound_stmt ;
/* Parameters are a comma-separated list or the single token VOID. */
params : param_list | VOID ;
param_list : param_list ',' param
| param ;
/* An array parameter is written with empty brackets. */
param : type_specifier ID | type_specifier ID '[' ']' ;
/* A block lists its local declarations before its statements; both may be empty. */
compound_stmt : '{' local_declarations statement_list '}' ;
local_declarations : local_declarations var_declaration
| /* empty */ ;
statement_list : statement_list statement
| /* empty */ ;
statement : expression_stmt
| compound_stmt
| selection_stmt
| iteration_stmt
| return_stmt ;
/* A lone ';' is the empty statement. */
expression_stmt : expression ';'
| ';' ;
/* The two alternatives share a prefix (the dangling-else ambiguity). yacc
   reports a shift/reduce conflict here and by default resolves it by
   shifting, which attaches ELSE to the nearest IF. */
selection_stmt : IF '(' expression ')' statement
| IF '(' expression ')' statement ELSE statement ;
iteration_stmt : WHILE '(' expression ')' statement ;
return_stmt : RETURN ';' | RETURN expression ';' ;
/* Assignment is right-recursive, so a = b = c groups as a = (b = c). */
expression : var '=' expression | simple_expression ;
var : ID | ID '[' expression ']' ;
/* At most one relational operator per simple_expression; comparisons do not chain. */
simple_expression : additive_expression relop additive_expression
| additive_expression ;
relop : LTE | '<' | '>' | GTE | EQUAL | NOTEQUAL ;
/* Left-recursive: '+' and '-' group to the left. */
additive_expression : additive_expression addop term | term ;
addop : '+' | '-' ;
/* term sits below additive_expression, so '*' and '/' bind tighter than '+' and '-'. */
term : term mulop factor | factor ;
mulop : '*' | '/' ;
factor : '(' expression ')' | var | call | NUM ;
/* A call takes zero or more comma-separated argument expressions. */
call : ID '(' args ')' ;
args : arg_list | /* empty */ ;
arg_list : arg_list ',' expression | expression ;
OK, you need to add the semicolon as a token in your language spec as well. As an FYI, a web search will turn up several lex/yacc files for the C programming language, and there are plenty of tutorials on the subject. flex and bison are not exactly forgiving of errors in the program spec, so you really need to understand how they work. Also look for Jack Crenshaw's famous tutorial on how to build a compiler.
Lex:
id [a-zA-Z]*
num [0-9]*
Both patterns can match the empty string, because '*' means "zero or more"; use '+' ("one or more") instead.