Grammar to parse sql statements delimited by semicolon in antlr4 - grammar

I'm looking to separate sql statements that could have comments /* */ or strings 'test' or line comments --line comment (sql style) separated by semicolons. An example would be:
Blah blah 'string ; ' ;
More text /* semicolon(;) inside comment */
Some more text
in multiple lines
the text above should retrieve only two statements since the semicolon inside the string ' ' and the comment /* */ should not count as a delimiter.
The current grammar I have is :
grammar SqlStatements;
sql_stmts:
text (';' text)* EOF
;
text:
SINGLE_LINE_COMMENT*
| STRING*
| TEXT*
;
TEXT:
~['--';\''/*']*
;
STRING
:
'\'' ('\'\''|~'\'')* '\''
;
SINGLE_LINE_COMMENT
: '--' ~[\r\n]*
;
MULTILINE_COMMENT
: '/*' .*? ( '*/' | EOF )
;
The code above crashes when typing *.

The common approach for parsing SQL is to first split the individual statements. That might involve handling of delimiter switches, which is needed e.g. when you have a stored procedure in the dump which must be handled as a whole but needs the semicolon as internal statement delimiter.
This can be done very quickly with an optimized loop that jumps over comments and strings. Here's code how this is handled in MySQL Workbench:
/**
* A statement splitter to take a list of sql statements and split them into individual statements,
* return their position and length in the original string (instead the copied strings).
*/
size_t MySQLParserServicesImpl::determineStatementRanges(const char *sql, size_t length,
const std::string &initial_delimiter,
const std::string &line_break)
{
// One-pass scan: `head` marks the start of the current statement and `tail`
// is the scan position. Comments, quoted strings/identifiers and DELIMITER
// directives are skipped so that delimiters inside them never split a statement.
_stop = false;
std::string delimiter = initial_delimiter.empty() ? ";" : initial_delimiter;
const unsigned char *delimiter_head = (unsigned char*)delimiter.c_str();
const unsigned char keyword[] = "delimiter";
const unsigned char *head = (unsigned char *)sql;
const unsigned char *tail = head;
const unsigned char *end = head + length;
const unsigned char *new_line = (unsigned char*)line_break.c_str();
bool have_content = false; // Set when anything else but comments were found for the current statement.
while (!_stop && tail < end)
{
switch (*tail)
{
case '/': // Possible multi line comment or hidden (conditional) command.
// NOTE(review): reads *(tail + 1); when tail is the last byte this touches
// one char past `length` -- assumes the buffer is NUL-terminated. Confirm callers.
if (*(tail + 1) == '*')
{
tail += 2;
bool is_hidden_command = (*tail == '!'); // MySQL version comment /*!...*/
while (true)
{
while (tail < end && *tail != '*')
tail++;
if (tail == end) // Unfinished comment.
break;
else
{
if (*++tail == '/')
{
tail++; // Skip the slash too.
break;
}
}
}
if (!is_hidden_command && !have_content)
head = tail; // Skip over the comment.
}
else
tail++;
break;
case '-': // Possible single line comment.
{
// "--" only starts a comment when followed by whitespace or a line break
// (MySQL rule), hence the end_char check.
const unsigned char *end_char = tail + 2;
if (*(tail + 1) == '-' && (*end_char == ' ' || *end_char == '\t' || is_line_break(end_char, new_line)))
{
// Skip everything until the end of the line.
tail += 2;
while (tail < end && !is_line_break(tail, new_line))
tail++;
if (!have_content)
head = tail;
}
else
tail++;
break;
}
case '#': // MySQL single line comment.
while (tail < end && !is_line_break(tail, new_line))
tail++;
if (!have_content)
head = tail;
break;
case '"':
case '\'':
case '`': // Quoted string/id. Skip this in a local loop.
{
have_content = true;
char quote = *tail++;
while (tail < end && *tail != quote)
{
// Skip any escaped character too.
if (*tail == '\\')
tail++;
tail++;
}
if (*tail == quote)
tail++; // Skip trailing quote char to if one was there.
break;
}
case 'd':
case 'D':
{
have_content = true;
// Possible start of the keyword DELIMITER. Must be at the start of the text or a character,
// which is not part of a regular MySQL identifier (0-9, A-Z, a-z, _, $, \u0080-\uffff).
unsigned char previous = tail > (unsigned char *)sql ? *(tail - 1) : 0;
bool is_identifier_char = previous >= 0x80
|| (previous >= '0' && previous <= '9')
|| ((previous | 0x20) >= 'a' && (previous | 0x20) <= 'z')
|| previous == '$'
|| previous == '_';
if (tail == (unsigned char *)sql || !is_identifier_char)
{
const unsigned char *run = tail + 1;
const unsigned char *kw = keyword + 1;
int count = 9;
// count reaches 0 only if all 8 remaining chars of "delimiter" matched
// case-insensitively (| 0x20 lower-cases ASCII letters).
while (count-- > 1 && (*run++ | 0x20) == *kw++)
;
if (count == 0 && *run == ' ')
{
// Delimiter keyword found. Get the new delimiter (everything until the end of the line).
tail = run++;
while (run < end && !is_line_break(run, new_line))
run++;
delimiter = base::trim(std::string((char *)tail, run - tail));
delimiter_head = (unsigned char*)delimiter.c_str();
// Skip over the delimiter statement and any following line breaks.
while (is_line_break(run, new_line))
run++;
tail = run;
head = tail;
}
else
tail++;
}
else
tail++;
break;
}
default:
if (*tail > ' ')
have_content = true;
tail++;
break;
}
// After consuming the token, check whether the (possibly multi-char)
// statement delimiter starts here.
// NOTE(review): when tail == end this dereferences the byte just past the
// scanned range; relies on a terminating NUL -- confirm.
if (*tail == *delimiter_head)
{
// Found possible start of the delimiter. Check if it really is.
size_t count = delimiter.size();
if (count == 1)
{
// Most common case. Trim the statement and check if it is not empty before adding the range.
head = skip_leading_whitespace(head, tail);
if (head < tail)
ranges.push_back(std::make_pair<size_t, size_t>(head - (unsigned char *)sql, tail - head));
head = ++tail;
have_content = false;
}
else
{
const unsigned char *run = tail + 1;
const unsigned char *del = delimiter_head + 1;
while (count-- > 1 && (*run++ == *del++))
;
if (count == 0)
{
// Multi char delimiter is complete. Tail still points to the start of the delimiter.
// Run points to the first character after the delimiter.
head = skip_leading_whitespace(head, tail);
if (head < tail)
ranges.push_back(std::make_pair<size_t, size_t>(head - (unsigned char *)sql, tail - head));
tail = run;
head = run;
have_content = false;
}
}
}
}
// Add remaining text to the range list.
head = skip_leading_whitespace(head, tail);
if (head < tail)
ranges.push_back(std::make_pair<size_t, size_t>(head - (unsigned char *)sql, tail - head));
// Return code is currently always 0; the result is delivered via `ranges`.
return 0;
}
This works well also for large sql scripts and can split a dump containing 1 million lines in about 1 second (depends of course on the box you run this on). The var _stop is a flag used to allow breaking the split process. The code is handling MySQL code, so it properly handles hidden commands (version comments).
With the start and length info per query you can now go to your parser.

Even when Mike's answer was fine, I needed to create the grammar in antlr. The following grammar worked for me:
// Statements are TEXT runs separated by one or more ';' tokens.
sql_stmts:
sql_stmt (';'+ sql_stmt)*
;
sql_stmt:
TEXT*
;
// Any single character except the quote, or a complete quoted string.
// Quoted strings are matched atomically so a ';' inside them never
// reaches the parser as a delimiter.
TEXT:
~[']
| STRING
;
// Comments and whitespace go to the hidden channel, so a ';' inside
// a comment is never seen by the parser either.
BLOCK_COMMENT
: '/*' .*? ( '*/' | EOF ) -> channel(HIDDEN)
;
LINE_COMMENT
: '--' ~[\r\n]* -> channel(HIDDEN)
;
SPACES
: [ \u000B\t\r\n] -> channel(HIDDEN)
;
// SQL-style string: '' inside a string is an escaped quote.
STRING
:
'\'' ('\'\''|~'\'')* '\''
;

First, don't ignore the warning and error messages generated when compiling the grammar.
Second, the TEXT rule does not do what you think it does -- quotes don't work there. See the doc.
Third, your first line of input is actually TEXT STRING TEXT SEMI. That second TEXT is the space before your SEMI rule, yet your rule only allows for a single non-consecutive occurrence of TEXT before the SEMI.

Related

ESP32: fgets() to read from Serial input does not wait for input

Using ESP-IDF and a ESP32S2 microcontroller, I want my program to wait for input using fgets(), then after pressing Enter printing my input using printf().
void app_main(void)
{
char command[64] = {};
printf("Welcome!\n");
fgets(command, sizeof(command), stdin);
printf("command: %s\n", command);
}
However, my output shows:
Welcome!
command:
I haven't got the chance to type anything, fgets() returns immediately with an empty string.
How can I wait for user input and Enter before continuing to printf()?
I've tried countless things found on StackOverflow and other platforms, including waiting using
while ( (c = getchar()) != EOF && c != '\n') { } and such, without success.
I wrote this function to accept a string input from the serial monitor
mainly for esp32. Hope it helps you.
It'll wait till either an EOL character is detected (or) max length characters are input, a null terminator will also be inserted appropriately
// Blocking line input for the ESP32 serial console: fills buf (capacity len)
// until a newline arrives or len-1 characters are stored, always leaving a
// terminating '\0'. Handles backspace; ignores NUL, 0xFF and the CR of CRLF.
void getLineInput(char buf[], size_t len)
{
memset(buf, 0, len);
fpurge(stdin); //clears any junk in stdin
char *bufp;
bufp = buf;
while(true)
{
// FreeRTOS delay: poll getchar() roughly every 100 ms instead of busy-waiting.
vTaskDelay(100/portTICK_PERIOD_MS);
*bufp = getchar();
// NOTE(review): comparing a (possibly signed) char against the int 255 --
// on toolchains where char is signed, *bufp != 0xFF is always true and the
// 0xFF/EOF filter may not work as intended. Verify on the ESP32 toolchain.
if(*bufp != '\0' && *bufp != 0xFF && *bufp != '\r') //ignores null input, 0xFF, CR in CRLF
{
//'enter' (EOL) handler
if(*bufp == '\n'){
*bufp = '\0';
break;
} //backspace handler
else if (*bufp == '\b'){
// Step back so the next character overwrites the erased one.
if(bufp-buf >= 1)
bufp--;
}
else{
//pointer to next character
bufp++;
}
}
//only accept len-1 characters, (len) character being null terminator.
if(bufp-buf > (len)-2){
bufp = buf + (len -1);
*bufp = '\0';
break;
}
}
}

Arduino convert constant char to unsigned long

I'm asking you to know how to convert a constant char variable[] to a unsigned long variable!
The problem doesn't exist if not for :
I've to convert this value for example "0x20DF10EF" if I convert it to long it return me back "551489775".
What i want is to receive back "0x20DF10EF"!
Hope i've explained well enough my problem!
Best regards D.Tibe!
---- Edit ----
while(O != 'I'){
if(reciver.decode(&results)){
CMD[i] = "0x" + String(results.value, HEX);
CMD[i].toUpperCase();
Val[0] = CMD[i].c_str();
//Vil[0] = CMD[i].c_str();
//for(int i = 0; i < sizeof(Val[0])-1 ;i++)
//{
//}
Byte = String(results.bits, DEC);
delay(1000);
O = 'I';
reciver.resume();
}
This is my code!
I have to convert my Val[0] (that is a Constant char) to Unsigned long variable.
Like said before i'll have a value like this 0x20DF10EF in my constant char and i want to get exactly the same on my unsigned long variable, SO :
Val[0] will be = to 0x20DF10EF and i want to get back the same value but into the unsigned long variable like this
unsigned long Var will be = to 0x20DF10EF
If I understood correctly, you want to parse a const char * string with an hex number and put it into a variable.
If this is correct, there are two ways: using the sscanf function or converting it by hand.
Method 1:
// Parse the hex text in Val[0] (e.g. "0x20DF10EF") into an unsigned long.
// Fix: plain "%x" expects an unsigned int*; passing an unsigned long* is
// undefined behavior -- the "l" length modifier ("%lx") is required.
unsigned long result;
if (sscanf(Val[0], "0x%lx", &result) != 1)
{
    Serial.println("Val[0] is not a valid hex value");
}
Method 2:
// Hand-rolled hex parser: walk Val[0] starting at index 2 (skipping the
// leading "0x") and fold each hex digit into result, one nibble at a time.
unsigned long result = 0;
byte i;
// NOTE(review): strlen(Val[0]) is re-evaluated on every iteration; could be
// hoisted, though correctness is unaffected.
for (i = 2; i < strlen(Val[0]); i++)
{
if ((Val[0][i] >= '0') && (Val[0][i] <= '9'))
{
// Shift the accumulator left one nibble and add the digit value.
result = (result << 4) + Val[0][i] - '0';
}
else if ((Val[0][i] >= 'A') && (Val[0][i] <= 'F'))
{
result = (result << 4) + 10 + Val[0][i] - 'A';
}
else if ((Val[0][i] >= 'a') && (Val[0][i] <= 'f'))
{
result = (result << 4) + 10 + Val[0][i] - 'a';
}
else
{
// Any non-hex character aborts the conversion.
Serial.println("Val[0] is not a valid hex value");
break;
}
}
By the way, adding 0x in front of the string is useless for this conversion. If you can, remove it and then replace "0x%x" with "%x" in the sscanf solution, or i = 2 with i = 0 in the hand-made one.

Reading .hex file in VHDL

I'm trying to read an intel .hex file using the following VHDL code snippet. My synthesizer is having a problem with the part of the code that is supposed to check for and discard the ':' character at the start of a line. The synthesis tool gives this error "Call to procedure without body" (line marked with comment). I have never seen this error and don't know what it means. Is there a solution for this error (or an alternate way to discard the ':' character)?
-- Loads ROM contents from the Intel HEX file "IIU_Code.hex" at elaboration
-- time and returns the populated ROM_Data array.
-- NOTE(review): parameter x is never referenced in the body -- confirm intent.
function Load_Data(constant x: in integer) return ROM_Data is
use std.textio.all;
use ieee.std_logic_textio.all;
file ROMFILE: TEXT open READ_MODE is "IIU_Code.hex";
variable newline: line;
variable newchar: character;
variable newbyte: std_logic_vector(7 downto 0);
variable newword: std_logic_vector(15 downto 0);
variable NextAddr, ByteCount: integer;
variable NewROM: ROM_Data := (others => (others => '0'));
variable valid: boolean := True;
begin
while (valid) loop
readline(ROMFILE, newline);
-- Consume the leading ':' record mark of an Intel HEX record.
read(newline,newchar,valid); --ERROR HERE!!!
if (newchar = ':') and (valid = True) then
hread(newline,newbyte); -- record byte count
ByteCount := to_integer(unsigned(newbyte));
hread(newline,newword); -- 16-bit load address
NextAddr := to_integer(unsigned(newword));
hread(newline,newbyte); -- record type
if newbyte = X"01" then --check for EOF marker
valid := False;
end if;
-- Copy the record's data bytes into consecutive ROM addresses.
for i in 1 to ByteCount loop
hread(newline,newbyte);
NewROM(NextAddr) := newbyte;
NextAddr := NextAddr + 1;
end loop;
end if;
end loop;
file_close(ROMFILE);
return NewROM;
end;
In lieu of trying to force synthesis to initialize ROM from a file I've been known to write C programs that convert data for models to constants, in this case by generating entity/architecture pairs:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#define MAX_VECTOR 512
/*
 * rom_header: emit the VHDL entity/architecture preamble for a byte-wide
 * ROM named rom_name holding array_size * 8 bytes, up to and including the
 * opening of the `constant byte_array` initializer. The caller prints the
 * element list and then calls rom_tail() to close the architecture.
 *
 * Fix: converted the K&R identifier-list definition to an ANSI prototype
 * (identifier-list definitions perform no argument type checking and are
 * removed in C23); rom_name is only read, so it is now const.
 */
void rom_header(const char *rom_name, int array_size)
{
    printf("library ieee;\nuse ieee.std_logic_1164.all;\n");
    printf("\nentity %s is\n port (\n",rom_name);
    /* Eight bytes per vector, hence the *8 in the index range. */
    printf("\tindex:\t\tin integer range 0 to %d;\n",array_size*8-1);
    printf("\tOE:\t\tin std_logic;\n");
    printf("\toutput:\t\tout std_logic_vector (7 downto 0)\n");
    printf(" );\nend ;\n");
    printf("\narchitecture behave of %s is\n\n",rom_name);
    printf(" subtype bytestring is bit_vector( 7 downto 0);\n");
    printf(" type bytestream is array (0 to %d) of bytestring;\n\n",
           array_size*8-1);
    printf(" constant byte_array:\tbytestream := (\n\t ");
}
/*
 * rom_tail: close the VHDL architecture opened by rom_header(), emitting
 * the tri-state output assignment ('Z' when OE='0', 'X' when undriven).
 *
 * Fix: in C, `rom_tail()` declares an unspecified parameter list; `(void)`
 * is the correct prototype form (and matches the fixed rom_header style).
 */
void rom_tail(void) {
    printf(" begin\n\n");
    printf(" output <= To_StdLogicVector(byte_array(index)) ");
    printf("when OE = '1' else\n");
    printf(" (others => 'Z') ");
    printf("when OE = '0' else\n");
    printf(" (others => 'X');\n");
    printf("\n\nend behave;\n\n");
}
/*
 * Reads DES test vectors from stdin (or from -i infile via freopen),
 * validates them (three 16-hex-digit fields per vector line), then writes
 * four VHDL sources: key_vector.vhdl, plain_vector.vhdl, cipher_vector.vhdl
 * and encrypt_vector.vhdl.
 */
int main (argc,argv)
int argc;
char *argv[];
{
extern char *optarg;
extern int optind, opterr;
extern int getopt();
char *infile; /* NOTE(review): assigned in the 'i' case but never read afterwards. */
char key_vector[MAX_VECTOR][16];
char plain_vector[MAX_VECTOR][16];
char cipher_vector[MAX_VECTOR][16];
char testinput[2047];
char testkey[17];
char testplain[17];
char testcipher[17];
int encrypt[MAX_VECTOR];
int i;
int len;
int testcount = 0;
int totalcount = 0;
int linenumber = 0;
int vector = 0;
int encode = 1;
/* Option handling: -i redirects stdin to the given file. */
while ( (i=getopt(argc,argv,"i:")) != -1 ) {
switch (i) {
case 'i':
infile = optarg;
if((freopen(optarg,"r",stdin)) == NULL) {
fprintf(stderr,"ERROR:%s, can't open %s for input\n",
argv[0],optarg);
exit(-1);
}
break;
case '?':
fprintf(stderr,"usage: %s [-i infile] \n",argv[0]);
fprintf(stderr,"\ngenerates VHDL arrays for DES test vectors:\n");
fprintf(stderr,"\tcipher_vector.vhdl\n");
fprintf(stderr,"\tencrypt_vector.vhdl\n");
fprintf(stderr,"\tkey_vector.vhdl\n");
fprintf(stderr,"\tplain_vector.vhdl\n");
exit (-1);
break;
}
}
/* Input phase: classify each line as mode switch, vector line (leading
 * space) or eyewash, collecting vectors into the three arrays. */
while (fgets(testinput,(sizeof testinput) -1, stdin) != NULL ) {
linenumber++;
if ( strncmp(testinput,"encrypt",7) == 0) { /* mode = encode */
encode = 1;
fprintf(stderr,"%s",testinput);
}
else
if ( strncmp(testinput,"decrypt",7) == 0) { /* mode = decode */
fprintf(stderr,"%s",testinput);
encode = 0;
}
else
if ( strncmp(testinput," ",1) == 0) { /* key, plain & cipher */
testcount++;
/* NOTE(review): the trailing '*' in the format is a literal match after
 * the three %s conversions, not a suppression -- probably a typo. */
len = sscanf(testinput,"%s%s%s*", testkey, testplain, testcipher);
if (len != 3) {
fprintf(stderr,"ERROR: %s, wrong vector count, line %d\n",
argv[0], linenumber);
exit(-1);
}
else if (strlen(testkey) != 16) {
fprintf(stderr,"ERROR: %s wrong byte count testkey, line %d\n",
argv[0],linenumber);
exit(-1);
}
else if (strlen(testplain) != 16) {
fprintf(stderr,"ERROR: %s wrong byte count testplain, line %d\n",
argv[0],linenumber);
exit(-1);
}
else if (strlen(testcipher) != 16) {
fprintf(stderr,"ERROR: %s wrong byte count testcipher, line %d\n",
argv[0],linenumber);
exit(-1);
}
else {
encrypt[vector] = encode;
strncpy( key_vector[vector], testkey,16);
strncpy( plain_vector[vector], testplain,16);
strncpy(cipher_vector[vector],testcipher,16);
/* Reject vectors containing anything but hex digits. */
for ( i = 0; i < 16; i++) {
if ( !isxdigit(key_vector[vector][i]) ||
!isxdigit(plain_vector[vector][i]) ||
!isxdigit(cipher_vector[vector][i]) ) {
fprintf(stderr,"ERROR: %s, Vector: %d contains nonhex\n",
argv[0], vector+1);
fprintf(stderr,"\t%s\n",testinput);
exit(-1);
}
}
}
vector++;
if (vector == MAX_VECTOR) {
fprintf(stderr,"%s: Maximum number of vectors = %d\n",
argv[0],MAX_VECTOR);
exit(0);
}
}
else { /* nothing but eyewash */
if ( testcount ) {
fprintf(stderr," %d test vectors\n",testcount);
totalcount +=testcount;
testcount = 0;
}
}
}
fprintf(stderr," Total: %d test vectors\n",totalcount);
/* Output phase: each vector file is produced by redirecting stdout. */
if (freopen("key_vector.vhdl","w",stdout) == NULL){
/* NOTE(review): message reads "can write" but the branch means the open
 * failed -- presumably meant "can't write". Same in the copies below. */
fprintf(stderr,"ERROR: %s can write to key_vector.vhdl\n",argv[0]);
exit (-1);
}
rom_header("key_vector",totalcount);
/* Emit bytes as x"HH" literals: even i opens a byte, odd i closes it. */
for(vector = 0; vector < totalcount; vector++) {
for ( i = 0; i <= 15; i++) {
if ( !(i & 1)) {
printf("x\"%c",key_vector[vector][i]);
}
else {
if ( i < 15) {
printf("%c\",",key_vector[vector][i]);
}
else {
printf("%c\"",key_vector[vector][i]); // no comma
}
}
}
if (vector != totalcount-1)
printf(",\n\t ");
else
printf("\n\t);\n");
}
rom_tail();
if (freopen("plain_vector.vhdl","w",stdout) == NULL){
fprintf(stderr,"ERROR: %s can write to plain_vector.vhdl\n",argv[0]);
exit (-1);
}
rom_header("plain_vector",totalcount);
for(vector = 0; vector < totalcount; vector++) {
for ( i = 0; i <= 15; i++) {
if ( !(i & 1)) {
printf("x\"%c",plain_vector[vector][i]);
}
else {
if ( i < 15) {
printf("%c\",",plain_vector[vector][i]);
}
else {
printf("%c\"",plain_vector[vector][i]); // no comma
}
}
}
if (vector != totalcount-1)
printf(",\n\t ");
else
printf("\n\t);\n");
}
rom_tail();
if (freopen("cipher_vector.vhdl","w",stdout) == NULL){
fprintf(stderr,"ERROR: %s can write to cipher_vector.vhdl\n",argv[0]);
exit (-1);
}
rom_header("cipher_vector",totalcount);
for(vector = 0; vector < totalcount; vector++) {
for ( i = 0; i <= 15; i++) {
if ( !(i & 1)) {
printf("x\"%c",cipher_vector[vector][i]);
}
else {
if ( i < 15) {
printf("%c\",",cipher_vector[vector][i]);
}
else {
printf("%c\"",cipher_vector[vector][i]); // no comma
}
}
}
if (vector != totalcount-1)
printf(",\n\t ");
else
printf("\n\t);\n");
}
rom_tail();
if (freopen("encrypt_vector.vhdl","w",stdout) == NULL){
fprintf(stderr,"ERROR: %s can write to encrypt_vector.vhdl\n",argv[0]);
exit (-1);
}
/* The encrypt/decrypt flags form a single std_logic_vector, emitted
 * inline here (no OE port, so rom_header/rom_tail are not reused). */
printf("library ieee;\nuse ieee.std_logic_1164.all;\n");
printf("\nentity encrypt_vector is\n port (\n");
printf("\tindex:\t\tin integer range 0 to %d;\n",totalcount-1);
printf("\toutput:\t\tout std_logic\n");
printf(" );\nend ;\n");
printf("\narchitecture behave of encrypt_vector is\n\n");
printf(" constant bit_array:\tstd_logic_vector(0 to %d) := (\n\t ",
totalcount-1);
i = 0;
/* 16 bits per output line; i counts bits on the current line. */
for(vector = 0; vector < totalcount; vector++) {
printf("'%1d'",encrypt[vector]);i++;
if ((i == 16) && (vector != totalcount-1)) {
printf(",\n\t ");
i = 0;
}
else if (vector == totalcount-1)
printf("\n\t);\n");
else
printf(",");
}
printf(" begin\n\n");
printf(" output <= bit_array(index);");
printf("\n\nend behave;\n\n");
exit (0);
}
You could also do this for packages or even subprograms.
This particular conversion software uses a form of valid vectors preceded by an encryption mode switch and having a first column space, providing hex values of the right string length:
#
encrypt
#
0101010101010101 95F8A5E5DD31D900 8000000000000000
0101010101010101 DD7F121CA5015619 4000000000000000
0101010101010101 2E8653104F3834EA 2000000000000000
0101010101010101 4BD388FF6CD81D4F 1000000000000000
0101010101010101 20B9E767B2FB1456 0800000000000000
0101010101010101 55579380D77138EF 0400000000000000
0101010101010101 6CC5DEFAAF04512F 0200000000000000
#
It's the test vectors for a byte wide interfaced DES chip, and in this case only used in a test bench. There's nothing stopping you from embedding something like you want.
This little C program is quite old but I believe I updated it recently enough it would compile and run, it spits out several different 'vector' files for the test bench based on what the values are used for. It wants the input file to be concluded with a comment line ('#' in the first column), followed by a newline.
So the message here is don't count directly on your synthesis tools to initialize data (unless they handle it with explicitly supported routines).
See How to synthesis a rom and load initial data into it ?, for a hint thread in Xilinx, otherwise noting you haven't specified target platform.
addendum
The questioner has been forthcoming with additional information in comments, wherein automated software has exhorted us to Please avoid extended discussions in comments.
The target is a Microsemi ProASIC3, which also prompted another look at the provided Load_Data function, whose input argument x doesn't show up in the function body. That suggests the author may have been battling tool restrictions while trying to read a file.
Looking at Microsemi's web site we see that a ProASIC3 can have an embedded 1K bit FLASHROM, which may or may not be the ROM in question. I'm an ASIC designer from way back and can appreciate the size range of these devices, intended for among other uses System on Chip applications. You'd expect the vendor would be able to supply information on how to use the FLASHROM.
For other ROM purposes in lieu of vendor supplied method of loading ROM it would seem that creating a synthesis compatible method of embedding an array of constants is in order (analogous to what's shown in the C programming example).
One characteristic of Read Only Memory in programmable devices is that the values are typically included as part of device programming.

Validate user input - Objective-C

I am trying to validate user input. Here's the code:
do{
NSLog(#"Please select from the following options: D/ W/ T/ Q");
res = scanf("%c", &s1);
if(res ==0) {
NSLog(#"Invalid entry.");
}
}while (res ==0);
I want to improve the above code such that it will not allow the user to input anything (such as a number, a string, or any negative number) but only one single character (to be specific, only one of the option given in the prompt).
The current code doesn't do that.
// Loop until the user enters one of the four menu characters.
// Fixes vs. the original: `boolean` is not a C/Objective-C type (BOOL is);
// scanf() returns the number of converted items, so the entered character
// must be tested in s1, not in res; the leading space in " %c" skips the
// newline left over from the previous entry; and NSLog takes an @"..."
// literal (the '#' was a rendering artifact).
BOOL bValid = YES;
do {
    NSLog(@"Please select from the following options: D/ W/ T/ Q");
    res = scanf(" %c", &s1);
    if (res == 1 && (s1 == 'D' || s1 == 'W' || s1 == 'T' || s1 == 'Q')) {
        bValid = NO;
    }
    else {
        //Error message
    }
} while (bValid == YES);
You can use this code.
Just check it out.
Well one option is first to read the keyboard as a string
char buffer[128];
fgets( buffer, sizeof(buffer), stdin );
once you have the line, then check whether it is one of the options, seems only the first letter is significant in your case:
switch( toupper( buffer[0] ) )
{
case 'D': {...} ; // do whatever u need to do
case 'W': {...} ;
case 'T': {...} ;
case 'Q': {...} ;
default: {...} ;
}

ANTLR What is simpliest way to realize python like indent-depending grammar?

I am trying to realize a Python-like indent-dependent grammar.
Source example:
ABC QWE
CDE EFG
EFG CDE
ABC
QWE ZXC
As i see, what i need is to realize two tokens INDENT and DEDENT, so i could write something like:
grammar mygrammar;
text: (ID | block)+;
block: INDENT (ID|block)+ DEDENT;
INDENT: ????;
DEDENT: ????;
Is there any simple way to realize this using ANTLR?
(I'd prefer, if it's possible, to use standard ANTLR lexer.)
I don't know what the easiest way to handle it is, but the following is a relatively easy way. Whenever you match a line break in your lexer, optionally match one or more spaces. If there are spaces after the line break, compare the length of these spaces with the current indent-size. If it's more than the current indent size, emit an Indent token, if it's less than the current indent-size, emit a Dedent token and if it's the same, don't do anything.
You'll also want to emit a number of Dedent tokens at the end of the file to let every Indent have a matching Dedent token.
For this to work properly, you must add a leading and trailing line break to your input source file!
ANTRL3
A quick demo:
// Fix: ANTLR named actions and Java annotations are introduced with '@'
// (@lexer::members, @Override); the '#' in the original post is a
// markdown-rendering artifact and would not compile.
grammar PyEsque;
options {
output=AST;
}
tokens {
BLOCK;
}
@lexer::members {
// Indent width seen on the previous logical line (-1 = before first line).
private int previousIndents = -1;
// Current nesting depth of emitted Indent tokens.
private int indentLevel = 0;
// Queue that lets a single lexer rule emit more than one token.
java.util.Queue<Token> tokens = new java.util.LinkedList<Token>();
@Override
public void emit(Token t) {
state.token = t;
tokens.offer(t);
}
@Override
public Token nextToken() {
super.nextToken();
return tokens.isEmpty() ? Token.EOF_TOKEN : tokens.poll();
}
// Emit an Indent or Dedent token and track the nesting depth.
private void jump(int ttype) {
indentLevel += (ttype == Dedent ? -1 : 1);
emit(new CommonToken(ttype, "level=" + indentLevel));
}
}
parse
: block EOF -> block
;
block
: Indent block_atoms Dedent -> ^(BLOCK block_atoms)
;
block_atoms
: (Id | block)+
;
// A line break plus optional leading spaces; compares the indent width with
// the previous line and emits Indent/Dedent tokens accordingly.
NewLine
: NL SP?
{
int n = $SP.text == null ? 0 : $SP.text.length();
if(n > previousIndents) {
jump(Indent);
previousIndents = n;
}
else if(n < previousIndents) {
jump(Dedent);
previousIndents = n;
}
else if(input.LA(1) == EOF) {
// Balance all remaining Indents at end of input.
while(indentLevel > 0) {
jump(Dedent);
}
}
else {
skip();
}
}
;
Id
: ('a'..'z' | 'A'..'Z')+
;
SpaceChars
: SP {skip();}
;
fragment NL : '\r'? '\n' | '\r';
fragment SP : (' ' | '\t')+;
// Imaginary tokens, only ever emitted from the NewLine action.
fragment Indent : ;
fragment Dedent : ;
You can test the parser with the class:
import org.antlr.runtime.*;
import org.antlr.runtime.tree.*;
import org.antlr.stringtemplate.*;
/**
 * Driver: lexes and parses "in.txt" with the PyEsque grammar and prints the
 * resulting AST as a DOT graph (pipe the output into Graphviz to view it).
 */
public class Main {
public static void main(String[] args) throws Exception {
PyEsqueLexer lexer = new PyEsqueLexer(new ANTLRFileStream("in.txt"));
PyEsqueParser parser = new PyEsqueParser(new CommonTokenStream(lexer));
// parse() is the grammar's entry rule; getTree() yields the AST root.
CommonTree tree = (CommonTree)parser.parse().getTree();
DOTTreeGenerator gen = new DOTTreeGenerator();
StringTemplate st = gen.toDOT(tree);
System.out.println(st);
}
}
If you now put the following in a file called in.txt:
AAA AAAAA
BBB BB B
BB BBBBB BB
CCCCCC C CC
BB BBBBBB
C CCC
DDD DD D
DDD D DDD
(Note the leading and trailing line breaks!)
then you'll see output that corresponds to the following AST:
Note that my demo wouldn't produce enough dedents in succession, like dedenting from ccc to aaa (2 dedent tokens are needed):
aaa
bbb
ccc
aaa
You would need to adjust the code inside else if(n < previousIndents) { ... } to possibly emit more than 1 dedent token based on the difference between n and previousIndents. Off the top of my head, that could look like this:
else if(n < previousIndents) {
// Note: assuming indent-size is 2. Jumping from previousIndents=6
// to n=2 will result in emitting 2 `Dedent` tokens
int numDedents = (previousIndents - n) / 2;
while(numDedents-- > 0) {
jump(Dedent);
}
previousIndents = n;
}
ANTLR4
For ANTLR4, do something like this:
grammar Python3;
tokens { INDENT, DEDENT }
#lexer::members {
// A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
private java.util.LinkedList<Token> tokens = new java.util.LinkedList<>();
// The stack that keeps track of the indentation level.
private java.util.Stack<Integer> indents = new java.util.Stack<>();
// The amount of opened braces, brackets and parenthesis.
private int opened = 0;
// The most recently produced token.
private Token lastToken = null;
#Override
public void emit(Token t) {
super.setToken(t);
tokens.offer(t);
}
#Override
public Token nextToken() {
// Check if the end-of-file is ahead and there are still some DEDENTS expected.
if (_input.LA(1) == EOF && !this.indents.isEmpty()) {
// Remove any trailing EOF tokens from our buffer.
for (int i = tokens.size() - 1; i >= 0; i--) {
if (tokens.get(i).getType() == EOF) {
tokens.remove(i);
}
}
// First emit an extra line break that serves as the end of the statement.
this.emit(commonToken(Python3Parser.NEWLINE, "\n"));
// Now emit as much DEDENT tokens as needed.
while (!indents.isEmpty()) {
this.emit(createDedent());
indents.pop();
}
// Put the EOF back on the token stream.
this.emit(commonToken(Python3Parser.EOF, "<EOF>"));
}
Token next = super.nextToken();
if (next.getChannel() == Token.DEFAULT_CHANNEL) {
// Keep track of the last token on the default channel.
this.lastToken = next;
}
return tokens.isEmpty() ? next : tokens.poll();
}
private Token createDedent() {
CommonToken dedent = commonToken(Python3Parser.DEDENT, "");
dedent.setLine(this.lastToken.getLine());
return dedent;
}
private CommonToken commonToken(int type, String text) {
int stop = this.getCharIndex() - 1;
int start = text.isEmpty() ? stop : stop - text.length() + 1;
return new CommonToken(this._tokenFactorySourcePair, type, DEFAULT_TOKEN_CHANNEL, start, stop);
}
// Calculates the indentation of the provided spaces, taking the
// following rules into account:
//
// "Tabs are replaced (from left to right) by one to eight spaces
// such that the total number of characters up to and including
// the replacement is a multiple of eight [...]"
//
// -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation
static int getIndentationCount(String spaces) {
int count = 0;
for (char ch : spaces.toCharArray()) {
switch (ch) {
case '\t':
count += 8 - (count % 8);
break;
default:
// A normal space char.
count++;
}
}
return count;
}
boolean atStartOfInput() {
return super.getCharPositionInLine() == 0 && super.getLine() == 1;
}
}
single_input
: NEWLINE
| simple_stmt
| compound_stmt NEWLINE
;
// more parser rules
NEWLINE
: ( {atStartOfInput()}? SPACES
| ( '\r'? '\n' | '\r' ) SPACES?
)
{
String newLine = getText().replaceAll("[^\r\n]+", "");
String spaces = getText().replaceAll("[\r\n]+", "");
int next = _input.LA(1);
if (opened > 0 || next == '\r' || next == '\n' || next == '#') {
// If we're inside a list or on a blank line, ignore all indents,
// dedents and line breaks.
skip();
}
else {
emit(commonToken(NEWLINE, newLine));
int indent = getIndentationCount(spaces);
int previous = indents.isEmpty() ? 0 : indents.peek();
if (indent == previous) {
// skip indents of the same size as the present indent-size
skip();
}
else if (indent > previous) {
indents.push(indent);
emit(commonToken(Python3Parser.INDENT, spaces));
}
else {
// Possibly emit more than 1 DEDENT token.
while(!indents.isEmpty() && indents.peek() > indent) {
this.emit(createDedent());
indents.pop();
}
}
}
}
;
// more lexer rules
Taken from: https://github.com/antlr/grammars-v4/blob/master/python3/Python3.g4
There is an open-source library antlr-denter for ANTLR v4 that helps parse indents and dedents for you. Check out its README for how to use it.
Since it is a library, rather than code snippets to copy-and-paste into your grammar, its indentation-handling can be updated separately from the rest of your grammar.
There is a relatively simple way to do this ANTLR, which I wrote as an experiment: DentLexer.g4. This solution is different from the others mentioned on this page that were written by Kiers and Shavit. It integrates with the runtime solely via an override of the Lexer's nextToken() method. It does its work by examining tokens: (1) a NEWLINE token triggers the start of a "keep track of indentation" phase; (2) whitespace and comments, both set to channel HIDDEN, are counted and ignored, respectively, during that phase; and, (3) any non-HIDDEN token ends the phase. Thus controlling the indentation logic is a simple matter of setting a token's channel.
Both of the solutions mentioned on this page require a NEWLINE token to also grab all the subsequent whitespace, but in doing so can't handle multi-line comments interrupting that whitespace. Dent, instead, keeps NEWLINE and whitespace tokens separate and can handle multi-line comments.
Your grammar would be set up something like below. Note that the NEWLINE and WS lexer rules have actions that control the pendingDent state and keep track of indentation level with the indentCount variable.
grammar MyGrammar;
tokens { INDENT, DEDENT }
#lexer::members {
// override of nextToken(), see Dent.g4 grammar on github
// https://github.com/wevrem/wry/blob/master/grammars/Dent.g4
}
script : ( NEWLINE | statement )* EOF ;
statement
: simpleStatement
| blockStatements
;
simpleStatement : LEGIT+ NEWLINE ;
blockStatements : LEGIT+ NEWLINE INDENT statement+ DEDENT ;
NEWLINE : ( '\r'? '\n' | '\r' ) {
if (pendingDent) { setChannel(HIDDEN); }
pendingDent = true;
indentCount = 0;
initialIndentToken = null;
} ;
WS : [ \t]+ {
setChannel(HIDDEN);
if (pendingDent) { indentCount += getText().length(); }
} ;
BlockComment : '/*' ( BlockComment | . )*? '*/' -> channel(HIDDEN) ; // allow nesting comments
LineComment : '//' ~[\r\n]* -> channel(HIDDEN) ;
LEGIT : ~[ \t\r\n]+ ~[\r\n]*; // Replace with your language-specific rules...
Have you looked at the Python ANTLR grammar?
Edit: Added pseudo Python code for creating INDENT/DEDENT tokens
UNKNOWN_TOKEN = 0
INDENT_TOKEN = 1
DEDENT_TOKEN = 2
# filestream has already been processed so that each character is a newline and
# every tab outside of quotations is converted to 8 spaces.
def GetIndentationTokens(filestream):
# Stores (indentation_token, line, character_index)
indentation_record = list()
line = 0
character_index = 0
column = 0
counting_whitespace = true
indentations = list()
for c in filestream:
if IsNewLine(c):
character_index = 0
column = 0
line += 1
counting_whitespace = true
elif c != ' ' and counting_whitespace:
counting_whitespace = false
if(len(indentations) == 0):
indentation_record.append((INDENT_TOKEN, line, character_index))
else:
while(len(indentations) > 0 and indentations[-1] != column):
if(column < indentations[-1]):
indentations.pop()
indentation_record.append((
DEDENT, line, character_index))
elif(column > indentations[-1]):
indentations.append(column)
indentation_record.append((
INDENT, line, character_index))
if not IsNewLine(c):
column += 1
character_index += 1
while(len(indentations) > 0):
indentations.pop()
indentation_record.append((DEDENT_TOKEN, line, character_index))
return indentation_record