tree-sitter match for similar structures - grammar

I'm trying to create a tree-sitter grammar for the Minecraft function (mcfunction) language.
The structure of the language looks like this:
command #e[key=value] other args
I'm having an issue with the value part of the second argument (the target selector) in the example above. This value can be many things: strings, numbers, booleans, and two similar object structures (NBT and scoreboard objects).
Here are examples of each:
NBT
{key:value}
Scoreboard Object
{key=number} // where number is: N, ..N, N.., or N..N
My grammar file contains the following code:
// unrelated code removed
module.exports = grammar({
  name: "mcfunction",

  rules: {
    root: $ => repeat(
      choice(
        $.command
      )
    ),

    command: $ => prec.right(seq(
      field("command_name", $.identifier),
      repeat(
        choice(
          $.selector
        )
      ),
      "\n"
    )),

    identifier: $ => /[A-Za-z][\w-]+/,

    number: $ => prec(1, /-?\d+(\.\d+)?/),

    boolean: $ => choice(
      "true",
      "false"
    ),

    string: $ => seq(
      "\"",
      repeat(
        choice(
          $._escape_sequence,
          /[^"]/
        )
      ),
      "\""
    ),

    _escape_sequence: $ => seq("\\", "\""),

    selector: $ => seq(
      token(
        seq(
          "#",
          choice(
            "p", "a", "e", "s", "r"
          )
        )
      ),
      optional(
        seq(
          token.immediate("["),
          optional(
            repeat(
              seq(
                $.selector_option,
                optional(",")
              )
            )
          ),
          "]"
        )
      )
    ),

    selector_option: $ => seq(
      $.selector_key,
      "=",
      $.selector_value
    ),

    selector_key: $ => /[a-z_-]+/,

    selector_value: $ => choice(
      $.item,
      $.path,
      $.selector_key,
      $.selector_number,
      $.number,
      $.boolean,
      $.selector_object
    ),

    selector_number: $ => prec.right(1, choice(
      seq(
        "..",
        $.number
      ),
      seq(
        $.number,
        "..",
        $.number
      ),
      seq(
        $.number,
        ".."
      ),
      $.number
    )),

    selector_object: $ => choice(
      seq(
        "{",
        repeat(
          seq(
            $.selector_score,
            optional(",")
          )
        ),
        "}"
      ),
      seq(
        "{",
        repeat(
          seq(
            $.selector_nbt,
            optional(",")
          )
        ),
        "}"
      )
    ),

    selector_nbt: $ => seq(
      $.nbt_object_key,
      ":",
      $.nbt_object_value
    ),

    selector_score: $ => seq(
      field("selector_score_key", $.selector_key),
      "=",
      field("selector_score_value", $.selector_number)
    ),

    _namespace: $ => /[a-z_-]+:/,

    item: $ => seq(
      $._namespace,
      $.selector_key
    ),

    path: $ => seq(
      choice($.item, /[a-z_]+/),
      repeat1(
        token("/", /[a-z_]/)
      )
    ),

    nbt: $ => choice(
      $.nbt_array,
      $.nbt_object
    ),

    nbt_object: $ => seq(
      "{",
      repeat(
        seq(
          $.nbt_object_key,
          ":",
          $.nbt_object_value,
          optional(",")
        )
      ),
      "}"
    ),

    nbt_array: $ => seq(
      "[",
      repeat(
        seq(
          $.nbt_object_value,
          optional(",")
        )
      ),
      "]"
    ),

    nbt_object_key: $ => choice(
      $.string,
      $.number,
      $.identifier
    ),

    nbt_object_value: $ => choice(
      $.string,
      $.nbt_number,
      $.boolean,
      $.nbt
    ),

    nbt_number: $ => seq(
      $.number,
      field("nbt_number_suffix", optional(choice("l", "s", "d", "f", "b")))
    )
  }
});
However, if I compile the grammar and parse the command test #e[scores={example=1..}], I get:
(root [0, 0] - [6, 0]
  (command [0, 0] - [1, 0]
    command_name: (identifier [0, 0] - [0, 4])
    (selector [0, 5] - [0, 29]
      (selector_option [0, 8] - [0, 28]
        (selector_key [0, 8] - [0, 14])
        (selector_value [0, 15] - [0, 28]
          (selector_object [0, 15] - [0, 28]
            (ERROR [0, 16] - [0, 27]
              (nbt_object_key [0, 16] - [0, 23]
                (identifier [0, 16] - [0, 23]))))))))
tests/test.mcfunction 0 ms (ERROR [0, 16] - [0, 27])
Expected: instead of ERROR there should be a selector_score node containing a selector_score_key and a selector_score_value.
The error disappears if I remove the selector_nbt branch from selector_object. Conversely, if I run the parser (with both branches, or with only selector_nbt) on a command that uses NBT data, there are no errors.
What am I doing wrong?

I solved this by using a choice of the two conflicting keys, something like this:
choice(
  alias($.key_1, $.key_2),
  $.key_2
)
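The post does not say which two keys were aliased. Given the error tree above, one plausible reading (a sketch of mine, not the author's exact code) is to let selector_score also accept an identifier token and expose it in the tree as a selector_key:

selector_score: $ => seq(
  field("selector_score_key", choice(
    // the lexer may emit an identifier token here; accept it,
    // but surface it in the parse tree as a selector_key
    alias($.identifier, $.selector_key),
    $.selector_key
  )),
  "=",
  field("selector_score_value", $.selector_number)
),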
ahlinc on GitHub answered:
You can fix the error in the above grammar by giving the selector_key terminal lexical precedence over the identifier terminal, like:
selector_key: $ => token(prec(1, /[a-z_-]+/)),
But note that you are using regexps that clash:
identifier: $ => /[A-Za-z][\w-]+/,
selector_key: $ => token(prec(1, /[a-z_-]+/)),
If it's impossible to rewrite these regexps so that they don't conflict, you may need the workaround described here: #1287 (reply in thread)
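For example, a key like scores matches both /[A-Za-z][\w-]+/ and /[a-z_-]+/, so without an explicit precedence the lexer is free to emit an identifier token in a position where the parser expects a selector_key.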

Related

How can I group elements of a list in Raku?

Is there some method in Raku which, when you pass it a "getter", groups together items from the original list for which the getter returns the same value?
I am looking for something like groupBy in Scala:
# (1 until 10).groupBy(_ % 3)
res0: Map[Int, IndexedSeq[Int]] = HashMap(0 -> Vector(3, 6, 9), 1 -> Vector(1, 4, 7), 2 -> Vector(2, 5, 8))
Or groupBy from Lodash (JavaScript):
> groupBy(range(1, 10), x => x % 3)
{"0": [3,6,9], "1": [1,4,7], "2": [2,5,8]}
It's called classify in Raku:
$ raku -e 'say (1..10).classify(* % 3)'
{0 => [3 6 9], 1 => [1 4 7 10], 2 => [2 5 8]}
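As a further illustration of the "getter" aspect (my example, not part of the original answer), classify accepts any callable:
$ raku -e 'say <apple pear fig>.classify(*.chars)'
{3 => [fig], 4 => [pear], 5 => [apple]}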

Convert PyTorch AutoTokenizer to TensorFlow TextVectorization

I have a PyTorch encoder loaded on my PC with transformers.
I saved it as JSON with tokenizer.save_pretrained(...) and now I need to load it on another PC with TensorFlow TextVectorization, as I don't have access to the transformers library there.
How can I convert it? I read about tf.keras.preprocessing.text.tokenizer_from_json but it does not work.
In the PyTorch JSON I have:
{
  "version": "1.0",
  "truncation": null,
  "padding": null,
  "added_tokens": [...],
  "normalizer": {...},
  "pre_tokenizer": {...},
  "post_processor": {...},
  "decoder": {...},
  "model": {...}
}
and TensorFlow is expecting, with TextVectorization:
def __init__(
    self,
    max_tokens=None,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    ngrams=None,
    output_mode="int",
    output_sequence_length=None,
    pad_to_max_tokens=False,
    vocabulary=None,
    idf_weights=None,
    sparse=False,
    ragged=False,
    **kwargs,
):
or, with tokenizer_from_json, these kinds of fields:
config = tokenizer_config.get("config")
word_counts = json.loads(config.pop("word_counts"))
word_docs = json.loads(config.pop("word_docs"))
index_docs = json.loads(config.pop("index_docs"))
# Integer indexing gets converted to strings with json.dumps()
index_docs = {int(k): v for k, v in index_docs.items()}
index_word = json.loads(config.pop("index_word"))
index_word = {int(k): v for k, v in index_word.items()}
word_index = json.loads(config.pop("word_index"))
tokenizer = Tokenizer(**config)
You can simply use tf.keras.preprocessing.text.tokenizer_from_json(), but you may need to correct the format of the JSON first.
Sample: the example below uses "I love cats", tokenizing it with Tokenizer and also mapping it character by character with StringLookup.
import tensorflow as tf
text = "I love cats"
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000, oov_token='<oov>')
tokenizer.fit_on_texts([text])
# input
vocab = [ "a", "b", "c", "d", "e", "f", "g", "h", "I", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "_" ]
data = tf.constant([["_", "_", "_", "I"], ["l", "o", "v", "e"], ["c", "a", "t", "s"]])
layer = tf.keras.layers.StringLookup(vocabulary=vocab)
sequences_mapping_string = layer(data)
sequences_mapping_string = tf.constant( sequences_mapping_string, shape=(1,12) )
print( 'result: ' + str( sequences_mapping_string ) )
print( 'tokenizer.to_json(): ' + str( tokenizer.to_json() ) )
new_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(tokenizer.to_json())
print( 'new_tokenizer.to_json(): ' + str( new_tokenizer.to_json() ) )
Output:
result: tf.Tensor([[27 27 27 9 12 15 22 5 3 1 20 19]], shape=(1, 12), dtype=int64)
tokenizer.to_json(): {"class_name": "Tokenizer", "config": {"num_words": 10000, "filters": "!\"#$%&()*+,-./:;<=>?#[\\]^_`{|}~\t\n", "lower": true, "split": " ", "char_level": false, "oov_token": "<oov>", "document_count": 1, "word_counts": "{\"i\": 1, \"love\": 1, \"cats\": 1}", "word_docs": "{\"cats\": 1, \"love\": 1, \"i\": 1}", "index_docs": "{\"4\": 1, \"3\": 1, \"2\": 1}", "index_word": "{\"1\": \"<oov>\", \"2\": \"i\", \"3\": \"love\", \"4\": \"cats\"}", "word_index": "{\"<oov>\": 1, \"i\": 2, \"love\": 3, \"cats\": 4}"}}
new_tokenizer.to_json(): {"class_name": "Tokenizer", "config": {"num_words": 10000, "filters": "!\"#$%&()*+,-./:;<=>?#[\\]^_`{|}~\t\n", "lower": true, "split": " ", "char_level": false, "oov_token": "<oov>", "document_count": 1, "word_counts": "{\"i\": 1, \"love\": 1, \"cats\": 1}", "word_docs": "{\"cats\": 1, \"love\": 1, \"i\": 1}", "index_docs": "{\"4\": 1, \"3\": 1, \"2\": 1}", "index_word": "{\"1\": \"<oov>\", \"2\": \"i\", \"3\": \"love\", \"4\": \"cats\"}", "word_index": "{\"<oov>\": 1, \"i\": 2, \"love\": 3, \"cats\": 4}"}}
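For the original question (reusing a Hugging Face tokenizer's vocabulary without the transformers library), a minimal sketch follows. It assumes a WordPiece-style tokenizer.json whose "model" -> "vocab" field maps tokens to integer ids; the file name tokenizer.json and the special-token names are assumptions, and note that TextVectorization only does whitespace splitting over the vocabulary, so it will not reproduce WordPiece subword splitting or the original token ids exactly.

import json
import tensorflow as tf

# Load the file written by tokenizer.save_pretrained(...).
# Assumption: a WordPiece-style layout where data["model"]["vocab"] is {token: id}.
with open("tokenizer.json", encoding="utf-8") as f:
    data = json.load(f)

vocab = data["model"]["vocab"]

# Order tokens by id, and drop tokens that would collide with the
# mask/OOV slots TextVectorization reserves for itself (assumed names).
special = {"", "[PAD]", "[UNK]"}
tokens = [t for t, _ in sorted(vocab.items(), key=lambda kv: kv[1]) if t not in special]

# Indices shift because TextVectorization reserves 0 (padding) and 1 (OOV).
vectorizer = tf.keras.layers.TextVectorization(vocabulary=tokens)
print(vectorizer(tf.constant(["i love cats"])))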

Alternative to Expect.all using elm-test?

I'm new to Elm and I have a question about elm-test. I'm trying to have multiple expectations in the same test but didn't find how, so here is what I've done for now; it's not really expressive:
suite : Test
suite =
    describe "2048-elm"
        [ test "moveLeftWithZero" <|
            \_ ->
                let
                    expectedCases =
                        [ ( [ 2, 0, 0, 2 ], [ 4, 0, 0, 0 ] )
                        , ( [ 2, 2, 0, 4 ], [ 4, 4, 0, 0 ] )
                        , ( [ 0, 0, 0, 4 ], [ 4, 0, 0, 0 ] )
                        , ( [ 0, 0, 2, 4 ], [ 2, 4, 0, 0 ] )
                        , ( [ 2, 4, 2, 4 ], [ 2, 4, 2, 4 ] )
                        , ( [ 2, 2, 2, 2 ], [ 4, 4, 0, 0 ] )
                        ]

                    toTest =
                        List.map (\expected -> ( Tuple.first expected, Main.moveLeftWithZero (Tuple.first expected) )) expectedCases
                in
                Expect.equal expectedCases toTest
        ]
I tried Expect.all but it does not seem to do what I want.
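One common alternative (a sketch of mine, not an answer from the original thread; it assumes the same Main.moveLeftWithZero) is to generate one test per case with List.map, so each failing input is reported individually:

import Expect
import Main
import Test exposing (Test, describe, test)

suite : Test
suite =
    describe "2048-elm moveLeftWithZero"
        (List.map
            (\( input, expected ) ->
                -- one named test per case, so a failure pinpoints its input
                test (Debug.toString input) <|
                    \_ -> Main.moveLeftWithZero input |> Expect.equal expected
            )
            [ ( [ 2, 0, 0, 2 ], [ 4, 0, 0, 0 ] )
            , ( [ 2, 2, 0, 4 ], [ 4, 4, 0, 0 ] )
            , ( [ 0, 0, 0, 4 ], [ 4, 0, 0, 0 ] )
            ]
        )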

Mapping set of keys to a matching list of lists

What is an idiomatic way to map keys to a matching list of lists? An example: given
val s = listOf(1, 9)
val u = listOf(listOf(1, 2, 3), listOf(1, 4, 7), listOf(1, 5, 9))
I would like to have a Map<Int, List<List<Int>>> such that every key in s is mapped to a list of lists containing that key:
{1=[ [1, 2, 3], [1, 4, 7], [1, 5, 9] ], 9=[ [1, 5, 9] ]}
The following:
s.groupBy({ it }, { x -> u.filter { it.contains(x) } })
produces:
{1=[[[1, 2, 3], [1, 4, 7], [1, 5, 9]]], 9=[[[1, 5, 9]]]}
which is not quite right and it isn't clear how to flatten the result to the expected shape.
I would recommend associateWith and use it like this:
s.associateWith { num -> u.filter { list -> num in list } }
Output:
{1=[[1, 2, 3], [1, 4, 7], [1, 5, 9]], 9=[[1, 5, 9]]}
I recommended associate at first, but you can shorten the code even further if you use associateWith. Thanks to Abhay Agarwal who recommended it.
Update
You just need to flatten the values of the result Map.
val w = s.groupBy({ it }, { x -> u.filter { it.contains(x) } })
    .mapValues { it.value.flatten() }
My solution maps the first collection to pairs from each element to the lists where it appears, and then groups the resulting pairs.
Example
val w = s.map { elem -> Pair(elem, u.filter { list -> elem in list }) }
    .groupBy({ it.first }, { it.second })
    .mapValues { it.value.flatten() }
check(w[1] == listOf(listOf(1, 2, 3), listOf(1, 4, 7), listOf(1, 5, 9)))
check(w[9] == listOf(listOf(1, 5, 9)))
println(w)
Output
{1=[[1, 2, 3], [1, 4, 7], [1, 5, 9]], 9=[[1, 5, 9]]}
Idiomatic to me would be s.groupBy(....). The answer by @Omar Mainegra, s.groupBy(...).mapValues(flatten), absolutely works, but it looks like a hack where the initial result needs some extra massaging.
The issue is with the implementation of groupBy and more specifically with groupByTo:
public inline fun <T, K, V, M : MutableMap<in K, MutableList<V>>> Iterable<T>.groupByTo(destination: M, keySelector: (T) -> K, valueTransform: (T) -> V): M {
    for (element in this) {
        val key = keySelector(element)
        val list = destination.getOrPut(key) { ArrayList<V>() }
        list.add(valueTransform(element))
    }
    return destination
}
The implementation wraps the values associated with a key in a list because, in general, multiple values can be associated with the same key. That is not the case here, where the values in s are unique, which means that groupBy is the wrong function to use. The right function is associateWith:
s.associateWith { x -> u.filter { it.contains(x) } }
produces:
{1=[[1, 2, 3], [1, 4, 7], [1, 5, 9]], 9=[[1, 5, 9]]}

Accumulator not reset between 2 consecutive calls to R.reduce in a R.pipe

Considering this code, using Ramda 0.21.0:
var iteratee = (acc, [k, v]) => {
  acc[k] = ++v;
  return acc
}
var foo = R.pipe(
  R.toPairs,
  R.reduce(iteratee, {})
)
console.log(foo({ a: 1, b: 2})) // { a: 2, b: 3 }
console.log(foo({ c: 3, d: 4})) // { a: 2, b: 3, c: 4, d: 5 }
Why does the second call to foo display { a: 2, b: 3, c: 4, d: 5 } instead of { c: 4, d: 5 }?
Is there some kind of memoization going on? I would expect the initial value of acc to be reset to {} each time foo is applied.
This answer mostly expands on the comments by @iofjuupasli.
The problem is the mutation of the accumulator object. You create one in the definition of foo, which is reused on every call, and then you update it in iteratee (horrible name, IMHO. Call it bar or something. :-) ). There are several ways you could fix this. One might be to make sure that you pass a new accumulator on each call to foo:
var iteratee = (acc, [k, v]) => {
  acc[k] = ++v;
  return acc
}
var foo = R.pipe(
  R.toPairs,
  list => R.reduce(iteratee, {}, list)
)
foo({ a: 1, b: 2}); //=> {"a": 2, "b": 3}
foo({ c: 3, d: 4}); //=> {"c": 4, "d": 5}
This works, but feels unsatisfying. Perhaps more helpful would be to avoid mutating the accumulator object on each pass. assoc will create a new object that reuses as much of the previous one as possible:
var iteratee = (acc, [k, v]) => R.assoc(k, v + 1, acc)
var foo = R.pipe(
  R.toPairs,
  R.reduce(iteratee, {})
);
foo({ a: 1, b: 2}); //=> {"a": 2, "b": 3}
foo({ c: 3, d: 4}); //=> {"c": 4, "d": 5}
This seems cleaner. But in fact Ramda has a much simpler solution. The map function treats objects as functors to be mapped over. Combining this with inc, which simply increments a value, we can just do this:
var foo = R.map(R.inc);
foo({ a: 1, b: 2}); //=> {"a": 2, "b": 3}
foo({ c: 3, d: 4}); //=> {"c": 4, "d": 5}
And that feels really clean!