KQL: Remove an item from an array if the item matches a contains/equals condition

I have a log source which has the following format:
//file1.png is the filename, the parenthesis hold the filesize in bytes.
// Each item is separated by a semicolon.
file1.png (445b); file2.pdf (2345b); file3.jpg (343b); file4.docx (3243b);
I have this regex to split the items into a list of lists, with 0 being the filename, 1 the extension and 2 the size in bytes. This can be copied as a test:
datatable (item:string ) [
'file1.png (445b); file2.pdf (2345b); file3.jpg (343b); file4.docx (3243b);',
'file1.src (3243b); file2.ps2 (24b); file3.jpg (300b); file4.jpg (326b);'
]
| extend list = extract_all(#'([^<>:"\/\\\|\?\*;]+)\.([a-zA-Z0-9]+) \((\d+)b\);', item)
This outputs the following:
[["file1", ".png", "343"], ["file2", ".pdf", "232"],...]
I want to remove items from the list if the sublist contains certain filetypes (such as .png). How would I go about doing this? KQL has no iteration features as far as I know.
I have tried using a regex to exclude the filetypes I do not want to match. It was the following: (?:^|; =?)([^<>:"\/\\\|\?\*;]+\.(?!jpg\b|png\b)\w+ \(\d+b\))(?=;|$). Unfortunately, KQL does not support negative lookaheads.

mv-apply operator
datatable (item:string ) [
'file1.png (445b); file2.pdf (2345b); file3.jpg (343b); file4.docx (3243b);',
'file1.src (3243b); file2.ps2 (24b); file3.jpg (300b); file4.jpg (326b);'
]
| mv-apply f = extract_all(#'([^<>:"\/\\\|\?\*;]+)\.([a-zA-Z0-9]+) \((\d+)b\);', item) on
(
where f[1] !in~ ("png", "jpg")
| summarize make_list(pack_array(f))
)
item                                                                       | list_
file1.png (445b); file2.pdf (2345b); file3.jpg (343b); file4.docx (3243b); | [["file2","pdf","2345","b"],["file4","docx","3243","b"]]
file1.src (3243b); file2.ps2 (24b); file3.jpg (300b); file4.jpg (326b);    | [["file1","src","3243","b"],["file2","ps2","24","b"]]

replace_regex()
datatable (item:string ) [
'file1.png (445b); file2.pdf (2345b); file3.jpg (343b); file4.docx (3243b);',
'file1.src (3243b); file2.ps2 (24b); file3.jpg (300b); file4.jpg (326b);'
]
| extend clean_item = replace_regex(item, #'([^<>:"\/\\\|\?\*;]+)\.(?i:png|jpg) \((\d+)b\);', "")
| extend extract_all(#'([^<>:"\/\\\|\?\*;]+)\.([a-zA-Z0-9]+) \((\d+)b\);', clean_item)
item                                                                       | clean_item                             | Column1
file1.png (445b); file2.pdf (2345b); file3.jpg (343b); file4.docx (3243b); | file2.pdf (2345b); file4.docx (3243b); | [[" file2","pdf","2345"],[" file4","docx","3243"]]
file1.src (3243b); file2.ps2 (24b); file3.jpg (300b); file4.jpg (326b);    | file1.src (3243b); file2.ps2 (24b);    | [["file1","src","3243"],[" file2","ps2","24"]]

And another option that doesn't rely on (arguably) hard to read regular expressions:
datatable (item:string ) [
'file1.png (445b); file2.pdf (2345b); file3.jpg (343b); file4.docx (3243b);',
'file1.src (3243b); file2.ps2 (24b); file3.jpg (300b); file4.jpg (326b);'
]
| extend s=split(item, "; ")
| mv-expand s
| parse s with fileName "." extension " (" fileSize "b)" rest
| where extension !in ("png", "jpg")
| extend image = bag_pack("fileName", fileName, "extension", extension, "fileSizeBytes", fileSize)
| summarize make_list(image) by item
item                                                                       | list_image
file1.png (445b); file2.pdf (2345b); file3.jpg (343b); file4.docx (3243b); | [ { "fileName": "file2", "extension": "pdf", "fileSizeBytes": "2345" }, { "fileName": "file4", "extension": "docx", "fileSizeBytes": "3243" } ]
file1.src (3243b); file2.ps2 (24b); file3.jpg (300b); file4.jpg (326b);    | [ { "fileName": "file1", "extension": "src", "fileSizeBytes": "3243" }, { "fileName": "file2", "extension": "ps2", "fileSizeBytes": "24" } ]
According to the original regular expression, ; isn't part of the filename, which means simply splitting on "; " should be safe.

Related

KQL: How to put unique values from a column as names of columns

I want to gather unique entries from one column and pass them as column names to display their count values.
let sales = datatable (store: string, category: string, product: string)
[
"StoreA", "Food", "Steak",
"StoreB", "Drink", "Cola",
"StoreB", "Food", "Fries",
"StoreA", "Sweets", "Cake",
"StoreB", "Food", "Hotdog",
"StoreB", "Food", "Salad",
"StoreA", "Sweets", "Chocolate",
"StoreC", "Food", "Steak"
];
sales
| summarize Food=countif(category=="Food"), Drink=countif(category=="Drink"), Sweets=countif(category=="Sweets") by store
It can be done manually, but I want to make this query universal, so it doesn't need to be changed when new categories are added.
Converting rows to columns is called "pivoting".
See KQL pivot plugin
let sales = datatable (store: string, category: string, product: string)
[
"StoreA", "Food", "Steak",
"StoreB", "Drink", "Cola",
"StoreB", "Food", "Fries",
"StoreA", "Sweets", "Cake",
"StoreB", "Food", "Hotdog",
"StoreB", "Food", "Salad",
"StoreA", "Sweets", "Chocolate",
"StoreC", "Food", "Steak"
];
sales
| evaluate pivot(category, count(), store)
store  | Drink | Food | Sweets
StoreA | 0     | 1    | 2
StoreB | 1     | 3    | 0
StoreC | 0     | 1    | 0

How to pass column name to toscalar function?

I have a kusto table with one of the columns as dynamic type with nested json.
If I use ColumnName in the toscalar() function, I get the error shown in the comment below. Is there any way to do this, or is it impossible?
let T = datatable(ColumnName:dynamic)
[
dynamic({"OtherField": "Unknown","First": [{"Id": "","Second": [{"ConfidenceLevel": "Low","Count": 3}]},{"Id": "","Second":[{"ConfidenceLevel": "High","Count": 0}]}]}),
dynamic({"OtherField": "Unknown","First": [{"Id": "","Second": [{"ConfidenceLevel": "Low","Count": 3}]},{"Id": "","Second":[{"ConfidenceLevel": "High","Count": 2}]}]})
];
let result = T
// The following line works, but regex is not allowed during review.
// | where tostring(ColumnName) matches regex '"ConfidenceLevel":"High","Count":[^0]'
| where isnotnull(toscalar(
// print s = '{"OtherField": "Unknown","First": [{"Id": "","Second": [{"ConfidenceLevel": "Low","Count": 3}]},{"Id": "","Second":[{"ConfidenceLevel": "High","Count": 0}]}]}'
print s = tostring(ColumnName) // Error here: The name 'ColumnName' does not refer to any column, table, variable or function.
| project obj0 = parse_json(s)
| mv-expand obj1 = obj0.First
| mv-expand obj2 = obj1.Second
| where obj2.ConfidenceLevel == "High" and obj2.Count > 0)
)
;
result
I tried and confirmed it was caused by the toscalar() function.
Expected result (The second row will be selected):
ColumnName
{"OtherField":"Unknown","First":[{"Id":"","Second":[{"ConfidenceLevel":"Low","Count":3}]},{"Id":"","Second":[{"ConfidenceLevel":"High","Count":2}]}]}
The documentation is very clear about this:
"The toscalar() function can't be applied on row-level (for-each-row scenario)."
Nested mv-apply, to deal with the nested arrays
let T = datatable(ColumnName:dynamic)
[
dynamic({"OtherField": "Unknown","First": [{"Id": "","Second": [{"ConfidenceLevel": "Low","Count": 3}]},{"Id": "","Second":[{"ConfidenceLevel": "High","Count": 0}]}]}),
dynamic({"OtherField": "Unknown","First": [{"Id": "","Second": [{"ConfidenceLevel": "Low","Count": 3}]},{"Id": "","Second":[{"ConfidenceLevel": "High","Count": 2}]}]})
];
T
| mv-apply ColumnName.First on
(
mv-apply ColumnName_First.Second on
(
where ColumnName_First_Second.ConfidenceLevel == "High"
and ColumnName_First_Second.Count > 0
)
)
| project ColumnName
ColumnName
{"OtherField":"Unknown","First":[{"Id":"","Second":[{"ConfidenceLevel":"Low","Count":3}]},{"Id":"","Second":[{"ConfidenceLevel":"High","Count":2}]}]}

How to convert row to column on specific condition using awk?

I have a file which I want to convert from rows to columns on a specific condition.
Input file:
cat f
"0/35","0eij8401c
"0/35","59ij41015
"0/35","21ij3e01c
"0/35","dbije401b
"1/35","dbij8a015
"1/35","67ijb9011
"1/35","b5ije001b
"1/35","bdij3701d
"2/35","abij3b011
"2/35","7fij70018
"2/35","77ijf9010
"2/35","e5ij64015
"3/35","59ij41015
"3/35","f6ijae01e
"3/35","c4ij5801c
"3/35","dbij98012
"4/35","edij6801e
"4/35","pdij6801e
"4/35","kdij6801e
"4/35","8cij57018
NOTE: here I am taking the second column of the 1st, 5th, 9th, 13th and 17th rows as the first column in the output below. Likewise, the second column of the 2nd, 6th, 10th, 14th and 18th rows becomes the second column of the output, and the same for the rest of the rows.
There are two expected outputs:
Expected output 1: (To see it in a report format)
"0eij8401c "dbij8a015 "abij3b011 "59ij41015 "edij6801e
"59ij41015 "67ijb9011 "7fij70018 "f6ijae01e "pdij6801e
"21ij3e01c "b5ije001b "77ijf9010 "c4ij5801c "kdij6801e
"dbije401b "bdij3701d "e5ij64015 "dbij98012 "8cij57018
Expected output 2:
This is expected output 1 converted into a single column, to perform some operations on:
0eij8401c
dbij8a015
abij3b011
59ij41015
edij6801e
59ij41015
67ijb9011
7fij70018
f6ijae01e
pdij6801e
21ij3e01c
b5ije001b
77ijf9010
c4ij5801c
kdij6801e
dbije401b
bdij3701d
e5ij64015
dbij98012
8cij57018
I tried a combination of awk and paste, but I am trying to achieve both with awk alone.
This is what I tried -
cat f | awk -v batchNo=1 -v Num=4 '{print $1 > "batch_" batchNo ".txt";if(NR%Num==0) {batchNo++}}'
to generate 5 files like below -
ls batch_*
batch_1.txt batch_2.txt batch_3.txt batch_4.txt batch_5.txt
and then combined with paste like below -
paste batch_1.txt batch_2.txt batch_3.txt batch_4.txt batch_5.txt
"0eij8401c "dbij8a015 "abij3b011 "59ij41015 "edij6801e
"59ij41015 "67ijb9011 "7fij70018 "f6ijae01e "pdij6801e
"21ij3e01c "b5ije001b "77ijf9010 "c4ij5801c "kdij6801e
"dbije401b "bdij3701d "e5ij64015 "dbij98012 "8cij57018
I also tried something like this to get the desired result but didn't get it.
awk '{a[$1]++; b[$2]++;c[$3]++;d[$4]++;e[$5]++} END {for (k in a) print k > "out.txt"; for (j in b) print j > "out.txt";for (k in c) print j > "out.txt";for(l in d) print l> "out.txt"; for (m in e) print m> "out.txt";}' batch_*
Any suggestions, please?
In addition to the other two good answers, there is yet another simplified way to approach each of your separate output problems. In the first case, you can simply save the values from the second column in an indexed array and then output in rows by groups of 5, e.g.
awk -F, '
{ a[++n] = $2 }
END {
for (i=1; i<=(n/5); i++)
printf "%s %s %s %s %s\n", a[i], a[i+4], a[i+8], a[i+12], a[i+16]
}
' f
Output
"0eij8401c "dbij8a015 "abij3b011 "59ij41015 "edij6801e
"59ij41015 "67ijb9011 "7fij70018 "f6ijae01e "pdij6801e
"21ij3e01c "b5ije001b "77ijf9010 "c4ij5801c "kdij6801e
"dbije401b "bdij3701d "e5ij64015 "dbij98012 "8cij57018
If you need the column output in the specific order shown, you can use the same approach of saving to an indexed array, but output with '\n' separators instead and trim the first character with substr(), e.g.
awk -F, '
{ a[++n]=$2 }
END {
for (i=1; i<=(n/5); i++)
printf "%s\n%s\n%s\n%s\n%s\n", substr(a[i],2), substr(a[i+4],2),
substr(a[i+8],2), substr(a[i+12],2), substr(a[i+16],2)
}
' f
Output
0eij8401c
dbij8a015
abij3b011
59ij41015
edij6801e
59ij41015
67ijb9011
7fij70018
f6ijae01e
pdij6801e
21ij3e01c
b5ije001b
77ijf9010
c4ij5801c
kdij6801e
dbije401b
bdij3701d
e5ij64015
dbij98012
8cij57018
If you just need a column of output of the 2nd field, regardless of order, you can simply use substring to output all but the first character, e.g.
awk -F, '{ print substr($2,2) }' f
Output
0eij8401c
59ij41015
21ij3e01c
dbije401b
dbij8a015
67ijb9011
b5ije001b
bdij3701d
abij3b011
7fij70018
77ijf9010
e5ij64015
59ij41015
f6ijae01e
c4ij5801c
dbij98012
edij6801e
pdij6801e
kdij6801e
8cij57018
About the solutions: all 3 of them print a continuous view (the details one by one, following the file's order) AND a report view, where the values are laid out horizontally. The 1st solution assumes that your Input_file is sorted by the "digit/digits" format, the 2nd solution sorts the Input_file first and then does the job, and the 3rd solution prints both styles and also creates the output batch files.
1st solution (assumes that your Input_file is sorted by the "digit/digits" format): with your shown samples, please try the following awk code. It prints the output directly in the sort order of the 1st field, e.g. "0/35", "1/35" and so on.
awk -v count=0 -v s1="\"" -F'^"|","' '
prev!=$2{
countFile++
max=(max>count?max:count)
count=1
}
{
arr[countFile,count++]=$3
prev=$2
}
END{
print "Printing continous view from here..."
for(i=1;i<=max;i++){
for(j=1;j<countFile;j++){
print(arr[i,j])
}
}
print "Printing REPORT view from here......"
for(i=1;i<countFile;i++){
for(j=1;j<=max;j++){
printf("%s%s",s1 arr[j,i],j==max?ORS:OFS)
}
}
}
' Input_file
2nd solution: in case your Input_file is NOT sorted by the "digit/digits" format, then try this code.
awk -F'^"|","' '{print $2,$0}' Input_file | sort -t/ -nk1 -k2 | cut -d' ' -f2 |
awk -v count=0 -v s1="\"" -F'^"|","' '
prev!=$2{
countFile++
max=(max>count?max:count)
count=1
}
{
arr[countFile,count++]=$3
prev=$2
}
END{
print "Printing continous view from here..."
for(i=1;i<=max;i++){
for(j=1;j<countFile;j++){
print(arr[i,j])
}
}
print "Printing REPORT view from here......"
for(i=1;i<countFile;i++){
for(j=1;j<=max;j++){
printf("%s%s",s1 arr[j,i],j==max?ORS:OFS)
}
}
}
'
3rd solution: in case you want to print the data on screen as well as create the output batch files within the same awk program, then try the following:
awk -v count=0 -v s1="\"" -F'^"|","' '
prev!=$2{
close(outputFile)
countFile++
outputFile="batch_"countFile".txt"
max=(max>count?max:count)
count=1
}
{
arr[countFile,count++]=$3
prev=$2
print (s1 $3) > (outputFile)
}
END{
print "Printing continous view from here..."
for(i=1;i<=max;i++){
for(j=1;j<countFile;j++){
print(arr[i,j])
}
}
print "Printing REPORT view from here......"
for(i=1;i<countFile;i++){
for(j=1;j<=max;j++){
printf("%s%s",s1 arr[j,i],j==max?ORS:OFS)
}
}
}
' Input_file
The output it prints will be as follows:
"0eij8401c "dbij8a015 "abij3b011 "59ij41015 "edij6801e
"59ij41015 "67ijb9011 "7fij70018 "f6ijae01e "pdij6801e
"21ij3e01c "b5ije001b "77ijf9010 "c4ij5801c "kdij6801e
"dbije401b "bdij3701d "e5ij64015 "dbij98012 "8cij57018
As your shown input is already sorted on the first field, you may use this solution:
awk -F, '{gsub(/^"|\/[0-9]+"/, ""); print $2 > "batch_" ($1+1) ".txt"}' f
paste batch_1.txt batch_2.txt batch_3.txt batch_4.txt batch_5.txt
"0eij8401c "dbij8a015 "abij3b011 "59ij41015 "edij6801e
"59ij41015 "67ijb9011 "7fij70018 "f6ijae01e "pdij6801e
"21ij3e01c "b5ije001b "77ijf9010 "c4ij5801c "kdij6801e
"dbije401b "bdij3701d "e5ij64015 "dbij98012 "8cij57018
For output 2, as per the edited question, use:
awk '{
a[FNR] = a[FNR] substr($0,2) "\n"
}
END {
for (i=1; i<=FNR; ++i) printf "%s", a[i]
}' batch_1.txt batch_2.txt batch_3.txt batch_4.txt batch_5.txt
0eij8401c
dbij8a015
abij3b011
59ij41015
edij6801e
59ij41015
67ijb9011
7fij70018
f6ijae01e
pdij6801e
21ij3e01c
b5ije001b
77ijf9010
c4ij5801c
kdij6801e
dbije401b
bdij3701d
e5ij64015
dbij98012
8cij57018
Using any awk:
$ cat tst.awk
BEGIN { FS="\"" }
{ vals[++numVals] = $NF }
END {
numValsPerBatch = int(numVals / numBatches) + ( numVals % numBatches ? 1 : 0 )
for ( batchNr=1; batchNr<=numBatches; batchNr++ ) {
for ( valNr=1; valNr<=numValsPerBatch; valNr++ ) {
valIdx = batchNr + (valNr - 1) * numBatches
printf "%s%s", vals[valIdx], (valNr<numValsPerBatch ? OFS : ORS) > "out1.txt"
print vals[valIdx] > "out2.txt"
}
}
}
$ awk -v numBatches=4 -f tst.awk f
$ head -100 out?.txt
==> out1.txt <==
0eij8401c dbij8a015 abij3b011 59ij41015 edij6801e
59ij41015 67ijb9011 7fij70018 f6ijae01e pdij6801e
21ij3e01c b5ije001b 77ijf9010 c4ij5801c kdij6801e
dbije401b bdij3701d e5ij64015 dbij98012 8cij57018
==> out2.txt <==
0eij8401c
dbij8a015
abij3b011
59ij41015
edij6801e
59ij41015
67ijb9011
7fij70018
f6ijae01e
pdij6801e
21ij3e01c
b5ije001b
77ijf9010
c4ij5801c
kdij6801e
dbije401b
bdij3701d
e5ij64015
dbij98012
8cij57018
or, if you want the number of batches to be calculated from the key values (YMMV if there are different numbers of values per key in your input):
$ cat tst.awk
BEGIN { FS="\"" }
!seen[$2]++ { numKeys++ }
{ vals[++numVals] = $NF }
END {
numBatches = int(numVals / numKeys) + (numVals % numKeys ? 1 : 0)
numValsPerBatch = int(numVals / numBatches) + (numVals % numBatches ? 1 : 0)
for ( batchNr=1; batchNr<=numBatches; batchNr++ ) {
for ( valNr=1; valNr<=numValsPerBatch; valNr++ ) {
valIdx = batchNr + (valNr - 1) * numBatches
printf "%s%s", vals[valIdx], (valNr<numValsPerBatch ? OFS : ORS) > "out1.txt"
print vals[valIdx] > "out2.txt"
}
}
}
$ awk -f tst.awk f
$ head -100 out?.txt
==> out1.txt <==
0eij8401c dbij8a015 abij3b011 59ij41015 edij6801e
59ij41015 67ijb9011 7fij70018 f6ijae01e pdij6801e
21ij3e01c b5ije001b 77ijf9010 c4ij5801c kdij6801e
dbije401b bdij3701d e5ij64015 dbij98012 8cij57018
==> out2.txt <==
0eij8401c
dbij8a015
abij3b011
59ij41015
edij6801e
59ij41015
67ijb9011
7fij70018
f6ijae01e
pdij6801e
21ij3e01c
b5ije001b
77ijf9010
c4ij5801c
kdij6801e
dbije401b
bdij3701d
e5ij64015
dbij98012
8cij57018
TXR solution:
#(collect)
# (all)
"#id/#nil
# (and)
# (collect :gap 0)
"#id/#nil","#data
# (bind qdata `"#data`)
# (end)
# (end)
#(end)
#(bind tdata #(transpose qdata))
#(bind fdata #(flatten (transpose data)))
#(output)
# (repeat)
#{tdata " "}
# (end)
# (repeat)
#fdata
# (end)
#(end)
$ txr soln.txr data
"0eij8401c "dbij8a015 "abij3b011 "59ij41015 "edij6801e
"59ij41015 "67ijb9011 "7fij70018 "f6ijae01e "pdij6801e
"21ij3e01c "b5ije001b "77ijf9010 "c4ij5801c "kdij6801e
"dbije401b "bdij3701d "e5ij64015 "dbij98012 "8cij57018
0eij8401c
dbij8a015
abij3b011
59ij41015
edij6801e
59ij41015
67ijb9011
7fij70018
f6ijae01e
pdij6801e
21ij3e01c
b5ije001b
77ijf9010
c4ij5801c
kdij6801e
dbije401b
bdij3701d
e5ij64015
dbij98012
8cij57018

Using COPY to import a .json file into a PostgreSQL table

I want to import some weather data (temperature, wind speed, ...) that is all formatted in a JSON file into a PostgreSQL 11 table so I can then make queries on that data.
I've been able to manually insert some data into a table but that's only OK because it's a small amount of data and I'm planning on using a LOT more data afterwards. Here is what I've found using the INSERT function: https://datavirtuality.com/blog-json-in-postgresql/.
That's why I've been trying to use the COPY function but no luck so far, even after having read a lot of stuff on different sources on the Internet ...
The JSON file is downloadable here: https://queueresults.meteoblue.com/F2637B90-45BB-4E7A-B47C-C34CD56674B3 (let me know if the file doesn't exist anymore).
I've been able to import the JSON file as text into a table with:
create table temp_json (values text);
copy temp_json from '/home/cae/test.json';
But I don't think that's the best approach to be able to make efficient queries later on ...
I usually run into the following error during my tests:
ERROR: invalid input syntax for type json
DETAIL: The input string ended unexpectedly.
CONTEXT: JSON data, line 1: [
as if I'm not able to parse the JSON file and the array properly within PostgreSQL ...
Thanks for your help !
Edit: Here is the content of the JSON file:
[
{
"geometry": {
"type": "MultiPoint",
"locationNames": [
"59.4°N/24.7°E31.7m",
"59.4°N/24.8°E36.4m"
],
"coordinates": [
[
24.7,
59.4,
31.73
],
[
24.8,
59.4,
36.445
]
]
},
"domain": "NEMS12",
"codes": [
{
"unit": "°C",
"dataPerTimeInterval": [
{
"data": [
[
-0.395,
-0.195,
-0.099999994,
-0.030000001,
-0.060000002,
-0.099999994,
-0.099999994,
0.005,
-0.055,
0.19,
0.48,
0.725,
1.88,
1.88,
1.855,
1.935,
2.1950002,
2.595,
3.3049998,
4.115,
3.37,
2.97,
3.32,
3.5149999,
3.56,
3.44,
3.355,
3.3600001,
3.32,
3.32,
3.4250002,
3.42,
3.3899999,
3.445,
3.3200002,
3.0549998,
4.58,
4.01,
3.02,
2.79,
2.75,
2.76,
2.855,
2.99,
2.96,
2.775,
2.595,
2.4250002
],
[
-0.49,
-0.26,
-0.16,
-0.09,
-0.1,
-0.13,
-0.12,
0.01,
-0.07,
0.17,
0.44,
0.66,
1.84,
1.85,
1.83,
1.9,
2.15,
2.55,
3.27,
4.11,
3.46,
2.96,
3.31,
3.5,
3.55,
3.42,
3.33,
3.34,
3.29,
3.29,
3.43,
3.44,
3.42,
3.52,
3.41,
3.11,
4.53,
4,
3.01,
2.79,
2.76,
2.77,
2.87,
3,
2.93,
2.71,
2.53,
2.38
]
],
"gapFillRatio": 0
}
],
"level": "2 m above gnd",
"aggregation": "none",
"code": 11,
"variable": "Temperature"
}
],
"timeIntervals": [
[
"20180101T0000",
"20180101T0100",
"20180101T0200",
"20180101T0300",
"20180101T0400",
"20180101T0500",
"20180101T0600",
"20180101T0700",
"20180101T0800",
"20180101T0900",
"20180101T1000",
"20180101T1100",
"20180101T1200",
"20180101T1300",
"20180101T1400",
"20180101T1500",
"20180101T1600",
"20180101T1700",
"20180101T1800",
"20180101T1900",
"20180101T2000",
"20180101T2100",
"20180101T2200",
"20180101T2300",
"20180102T0000",
"20180102T0100",
"20180102T0200",
"20180102T0300",
"20180102T0400",
"20180102T0500",
"20180102T0600",
"20180102T0700",
"20180102T0800",
"20180102T0900",
"20180102T1000",
"20180102T1100",
"20180102T1200",
"20180102T1300",
"20180102T1400",
"20180102T1500",
"20180102T1600",
"20180102T1700",
"20180102T1800",
"20180102T1900",
"20180102T2000",
"20180102T2100",
"20180102T2200",
"20180102T2300"
]
],
"timeResolution": "hourly"
},
{
"geometry": {
"coordinates": [
[
24.7,
59.4,
31.73
],
[
24.8,
59.4,
36.445
]
],
"locationNames": [
"59.4°N/24.7°E31.7m",
"59.4°N/24.8°E36.4m"
],
"type": "MultiPoint"
},
"domain": "NEMS12",
"codes": [
{
"unit": "°C",
"aggregation": "none",
"code": 11,
"level": "1000 mb",
"dataPerTimeInterval": [
{
"data": [
[
-0.585,
-0.265,
-0.055,
0.04,
0.044999998,
0.08,
0.11,
0.205,
0.13499999,
0.43,
0.84000003,
1.2,
2.1,
2.33,
2.5,
2.72,
3.1750002,
3.775,
4.915,
5.37,
4.16,
3.795,
4.1949997,
4.41,
4.415,
4.275,
4.1800003,
4.16,
4.0950003,
4.08,
4.185,
4.1,
3.98,
3.575,
3.22,
2.92,
4.395,
3.7649999,
2.895,
2.66,
2.6550002,
2.72,
2.845,
2.955,
2.89,
2.685,
2.54,
2.355
],
[
-0.64,
-0.29,
-0.08,
0.01,
0.03,
0.08,
0.12,
0.24,
0.14,
0.4,
0.8,
1.13,
2.11,
2.34,
2.52,
2.74,
3.19,
3.82,
4.91,
5.45,
4.29,
3.81,
4.19,
4.42,
4.43,
4.28,
4.17,
4.15,
4.08,
4.06,
4.18,
4.12,
4.01,
3.66,
3.31,
2.97,
4.38,
3.79,
2.9,
2.68,
2.68,
2.75,
2.89,
2.99,
2.88,
2.64,
2.43,
2.27
]
],
"gapFillRatio": 0
}
],
"variable": "Temperature"
}
],
"timeIntervals": [
[
"20180101T0000",
"20180101T0100",
"20180101T0200",
"20180101T0300",
"20180101T0400",
"20180101T0500",
"20180101T0600",
"20180101T0700",
"20180101T0800",
"20180101T0900",
"20180101T1000",
"20180101T1100",
"20180101T1200",
"20180101T1300",
"20180101T1400",
"20180101T1500",
"20180101T1600",
"20180101T1700",
"20180101T1800",
"20180101T1900",
"20180101T2000",
"20180101T2100",
"20180101T2200",
"20180101T2300",
"20180102T0000",
"20180102T0100",
"20180102T0200",
"20180102T0300",
"20180102T0400",
"20180102T0500",
"20180102T0600",
"20180102T0700",
"20180102T0800",
"20180102T0900",
"20180102T1000",
"20180102T1100",
"20180102T1200",
"20180102T1300",
"20180102T1400",
"20180102T1500",
"20180102T1600",
"20180102T1700",
"20180102T1800",
"20180102T1900",
"20180102T2000",
"20180102T2100",
"20180102T2200",
"20180102T2300"
]
],
"timeResolution": "hourly"
},
{
"geometry": {
"type": "MultiPoint",
"locationNames": [
"59.4°N/24.7°E31.7m",
"59.4°N/24.8°E36.4m"
],
"coordinates": [
[
24.7,
59.4,
31.73
],
[
24.8,
59.4,
36.445
]
]
},
"domain": "NEMS12",
"codes": [
{
"unit": "°C",
"dataPerTimeInterval": [
{
"data": [
[
-7.0950003,
-6.615,
-4.815,
-3.55,
-2.6750002,
-2.1950002,
-2.695,
-2.87,
-2.1399999,
-0.995,
0.1,
1,
0.335,
0.38,
-0.030000001,
-0.8,
-0.18,
0.575,
1.11,
-0.32999998,
-1.03,
-2.31,
-3.09,
-3.7350001,
-3.93,
-3.905,
-3.92,
-3.71,
-3.625,
-3.195,
-3.7,
-3.32,
-3.72,
-3.915,
-3.93,
-3.605,
-4.315,
-3.8899999,
-3.815,
-3.38,
-3.2150002,
-3.27,
-3.435,
-3.47,
-3.43,
-3.37,
-3.44,
-3.51
],
[
-7.11,
-6.73,
-4.94,
-3.57,
-2.7,
-2.15,
-2.62,
-2.91,
-2.22,
-1.1,
0.03,
0.9,
0.36,
0.37,
0.11,
-0.74,
-0.13,
0.59,
1.19,
-0.19,
-0.95,
-2.18,
-3.08,
-3.68,
-3.97,
-3.94,
-3.93,
-3.69,
-3.63,
-3.27,
-3.7,
-3.32,
-3.68,
-3.9,
-3.97,
-3.6,
-4.29,
-3.92,
-3.8,
-3.37,
-3.24,
-3.28,
-3.42,
-3.44,
-3.39,
-3.35,
-3.37,
-3.44
]
],
"gapFillRatio": 0
}
],
"level": "850 mb",
"code": 11,
"aggregation": "none",
"variable": "Temperature"
}
],
"timeResolution": "hourly",
"timeIntervals": [
[
"20180101T0000",
"20180101T0100",
"20180101T0200",
"20180101T0300",
"20180101T0400",
"20180101T0500",
"20180101T0600",
"20180101T0700",
"20180101T0800",
"20180101T0900",
"20180101T1000",
"20180101T1100",
"20180101T1200",
"20180101T1300",
"20180101T1400",
"20180101T1500",
"20180101T1600",
"20180101T1700",
"20180101T1800",
"20180101T1900",
"20180101T2000",
"20180101T2100",
"20180101T2200",
"20180101T2300",
"20180102T0000",
"20180102T0100",
"20180102T0200",
"20180102T0300",
"20180102T0400",
"20180102T0500",
"20180102T0600",
"20180102T0700",
"20180102T0800",
"20180102T0900",
"20180102T1000",
"20180102T1100",
"20180102T1200",
"20180102T1300",
"20180102T1400",
"20180102T1500",
"20180102T1600",
"20180102T1700",
"20180102T1800",
"20180102T1900",
"20180102T2000",
"20180102T2100",
"20180102T2200",
"20180102T2300"
]
]
},
{
"geometry": {
"type": "MultiPoint",
"locationNames": [
"59.4°N/24.7°E31.7m",
"59.4°N/24.8°E36.4m"
],
"coordinates": [
[
24.7,
59.4,
31.73
],
[
24.8,
59.4,
36.445
]
]
},
"domain": "NEMS12",
"codes": [
{
"unit": "°C",
"dataPerTimeInterval": [
{
"data": [
[
-10.84,
-12,
-10.280001,
-8.865,
-8.5,
-7.7,
-7.5699997,
-7.655,
-8.434999,
-8.844999,
-8.700001,
-7.1549997,
-9.555,
-10.004999,
-7.885,
-8.32,
-8.370001,
-8.915,
-9.53,
-10.225,
-10.934999,
-11.12,
-11.434999,
-11.575,
-11.965,
-11.64,
-12.12,
-12.345,
-12.34,
-12.48,
-12.844999,
-13.174999,
-13.18,
-13.219999,
-13.434999,
-13.305,
-12.775,
-12.745,
-12.79,
-12.75,
-12.690001,
-12.77,
-12.77,
-12.76,
-12.67,
-12.605,
-12.635,
-12.695
],
[
-10.74,
-11.94,
-10.54,
-8.77,
-8.56,
-7.75,
-7.52,
-7.53,
-8.24,
-8.95,
-8.77,
-7.15,
-9.48,
-10.03,
-7.88,
-8.24,
-8.35,
-8.82,
-9.4,
-10.08,
-10.84,
-11.04,
-11.3,
-11.5,
-11.9,
-11.6,
-12.09,
-12.31,
-12.39,
-12.48,
-12.83,
-13.16,
-13.2,
-13.19,
-13.4,
-13.3,
-12.77,
-12.7,
-12.78,
-12.71,
-12.66,
-12.73,
-12.73,
-12.72,
-12.62,
-12.57,
-12.6,
-12.67
]
],
"gapFillRatio": 0
}
],
"code": 11,
"level": "700 mb",
"aggregation": "none",
"variable": "Temperature"
}
],
"timeResolution": "hourly",
"timeIntervals": [
[
"20180101T0000",
"20180101T0100",
"20180101T0200",
"20180101T0300",
"20180101T0400",
"20180101T0500",
"20180101T0600",
"20180101T0700",
"20180101T0800",
"20180101T0900",
"20180101T1000",
"20180101T1100",
"20180101T1200",
"20180101T1300",
"20180101T1400",
"20180101T1500",
"20180101T1600",
"20180101T1700",
"20180101T1800",
"20180101T1900",
"20180101T2000",
"20180101T2100",
"20180101T2200",
"20180101T2300",
"20180102T0000",
"20180102T0100",
"20180102T0200",
"20180102T0300",
"20180102T0400",
"20180102T0500",
"20180102T0600",
"20180102T0700",
"20180102T0800",
"20180102T0900",
"20180102T1000",
"20180102T1100",
"20180102T1200",
"20180102T1300",
"20180102T1400",
"20180102T1500",
"20180102T1600",
"20180102T1700",
"20180102T1800",
"20180102T1900",
"20180102T2000",
"20180102T2100",
"20180102T2200",
"20180102T2300"
]
]
},
{
"geometry": {
"type": "MultiPoint",
"locationNames": [
"59.4°N/24.7°E",
"59.4°N/24.8°E"
],
"coordinates": [
[
24.7,
59.4,
"NaN"
],
[
24.8,
59.4,
"NaN"
]
]
},
"domain": "CAMSGLOBAL",
"codes": [
{
"unit": "",
"dataPerTimeInterval": [
{
"data": [
[
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN"
],
[
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN"
]
],
"gapFillRatio": 0
}
],
"code": 706,
"level": "sfc",
"aggregation": "none",
"variable": "Air Quality Index"
}
],
"timeResolution": "hourly",
"timeIntervals": [
[
"20180101T0000",
"20180101T0100",
"20180101T0200",
"20180101T0300",
"20180101T0400",
"20180101T0500",
"20180101T0600",
"20180101T0700",
"20180101T0800",
"20180101T0900",
"20180101T1000",
"20180101T1100",
"20180101T1200",
"20180101T1300",
"20180101T1400",
"20180101T1500",
"20180101T1600",
"20180101T1700",
"20180101T1800",
"20180101T1900",
"20180101T2000",
"20180101T2100",
"20180101T2200",
"20180101T2300",
"20180102T0000",
"20180102T0100",
"20180102T0200",
"20180102T0300",
"20180102T0400",
"20180102T0500",
"20180102T0600",
"20180102T0700",
"20180102T0800",
"20180102T0900",
"20180102T1000",
"20180102T1100",
"20180102T1200",
"20180102T1300",
"20180102T1400",
"20180102T1500",
"20180102T1600",
"20180102T1700",
"20180102T1800",
"20180102T1900",
"20180102T2000",
"20180102T2100",
"20180102T2200",
"20180102T2300"
]
]
}
]
Given your first example, you could then process it like this to separate the json array into individual objects and stuff them into a table as separate rows:
create table real_json as select value::jsonb from temp_json join lateral json_array_elements(values::json) on true;
However, this depends on the large single json object always being small enough to fit comfortably into an amount of memory you are willing to use, which seems like a dubious proposition. You need a library which does incremental or streaming parsing of the JSON object, returning one 2nd-level object at a time and then clearing it from memory once returned. I don't think that PostgreSQL provides such a facility. If you let us know what your favorite programming language is, perhaps someone can propose a specific library.
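For example (an illustrative sketch, not part of the original answer): in Python, the ijson library does this kind of incremental parsing, yielding one top-level array element at a time; the table name, file path and connection string below are assumptions.
import json
import ijson      # pip install ijson  -- incremental/streaming JSON parser (assumed available)
import psycopg2   # pip install psycopg2 -- PostgreSQL driver (assumed available)

conn = psycopg2.connect("dbname=weather")          # hypothetical connection string
cur = conn.cursor()
cur.execute("CREATE TABLE IF NOT EXISTS real_json (value jsonb)")

with open('/home/cae/test.json', 'rb') as f:
    # ijson.items(f, 'item') yields the elements of the top-level array one by
    # one, so only a single record is held in memory at any time.
    for record in ijson.items(f, 'item'):
        # ijson returns numbers as Decimal; convert them so json.dumps can serialize.
        cur.execute("INSERT INTO real_json (value) VALUES (%s::jsonb)",
                    [json.dumps(record, default=float)])

conn.commit()
cur.close()
conn.close()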
Alternatively, you could whip up a quick and dirty script that divides the JSON into lines for separate records based on the assumption that the indenting of the "pretty" file is always correct, and so using "^ [{}]" as markers, and then strips out the newlines to reverse the "pretty" formatting so that each record is a single line. If you had such a script, you could then do:
\copy real_json FROM PROGRAM 'unnest_top_array_and_depretty /home/cae/test_without_new_lines.json';
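The unnest_top_array_and_depretty program isn't shown; purely as an illustration, a quick-and-dirty Python sketch along the lines described above might look like this (it assumes the top-level records open with a line matching "^ {" and close with one matching "^ }", per the "^ [{}]" markers; adjust the regexes to your file's actual indentation):
#!/usr/bin/env python3
# Hypothetical sketch of "unnest_top_array_and_depretty": emit one top-level
# JSON record per output line, relying on the pretty-printer's indentation.
import re
import sys

record = []
for line in open(sys.argv[1]):
    if re.match(r'^ \{', line):            # start of a top-level record
        record = ['{']
    elif re.match(r'^ \},?', line):        # end of a top-level record
        record.append('}')
        print(' '.join(part.strip() for part in record))   # one record, one line
        record = []
    elif record:                           # any line inside the current record
        record.append(line.rstrip('\n'))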
Same approach as @jjanes's answer, with a real, working command-line tool:
\copy json_table FROM PROGRAM 'jq -c ".[]" myfile.json';
Removing the "pretty format" from the file helped in using the COPY function, but it puts the whole content of the file in one row, making it impossible to run a simple SELECT query on an existing column ...
Here is what I used:
CREATE TEMP TABLE target(data jsonb);
copy target from '/home/cae/test_without_new_lines.json';

Perform a coordinate transformation of a 4th-order tensor with np.einsum and np.tensordot

The equation is
$C'_{ijkl} = Q_{im} Q_{jn} C_{mnop} (Q^{-1})_{ok} (Q^{-1})_{pl}$
I was able to use
np.einsum('im,jn,mnop,ok,pl', Q, Q, C, Q_inv, Q_inv)
to do the job, and I also expected
np.tensordot(np.tensordot(np.tensordot(Q, np.tensordot(Q, C, axes=[1,1]), axes=[1,0]), Q_inv, axes=[2,0]), Q_inv, axes=[3,0])
to work, but it doesn't.
Specifics:
C is a 4th-order elastic tensor:
array([[[[ 552.62389047, -0.28689554, -0.32194701],
[ -0.28689554, 118.89168597, -0.65559912],
[ -0.32194701, -0.65559912, 130.21758722]],
[[ -0.28689554, 166.02923119, -0.00000123],
[ 166.02923119, 0.49494431, -0.00000127],
[ -0.00000123, -0.00000127, -0.57156702]],
[[ -0.32194701, -0.00000123, 165.99413061],
[ -0.00000123, -0.64666809, -0.0000013 ],
[ 165.99413061, -0.0000013 , 0.42997465]]],
[[[ -0.28689554, 166.02923119, -0.00000123],
[ 166.02923119, 0.49494431, -0.00000127],
[ -0.00000123, -0.00000127, -0.57156702]],
[[ 118.89168597, 0.49494431, -0.64666809],
[ 0.49494431, 516.15898907, -0.33132485],
[ -0.64666809, -0.33132485, 140.09010389]],
[[ -0.65559912, -0.00000127, -0.0000013 ],
[ -0.00000127, -0.33132485, 165.98553869],
[ -0.0000013 , 165.98553869, 0.41913346]]],
[[[ -0.32194701, -0.00000123, 165.99413061],
[ -0.00000123, -0.64666809, -0.0000013 ],
[ 165.99413061, -0.0000013 , 0.42997465]],
[[ -0.65559912, -0.00000127, -0.0000013 ],
[ -0.00000127, -0.33132485, 165.98553869],
[ -0.0000013 , 165.98553869, 0.41913346]],
[[ 130.21758722, -0.57156702, 0.42997465],
[ -0.57156702, 140.09010389, 0.41913346],
[ 0.42997465, 0.41913346, 486.62412063]]]])
Q is a rotation matrix changing x and y coords.
array([[ 0, 1, 0],
[-1, 0, 0],
[ 0, 0, 1]])
Q_inv is
array([[-0., -1., -0.],
[ 1., 0., 0.],
[ 0., 0., 1.]])
np.einsum leads to
array([[[[ 516.15898907, -0.49494431, -0.33132485],
[ -0.49494431, 118.89168597, 0.64666809],
[ -0.33132485, 0.64666809, 140.09010389]],
[[ -0.49494431, 166.02923119, 0.00000127],
[ 166.02923119, 0.28689554, -0.00000123],
[ 0.00000127, -0.00000123, 0.57156702]],
[[ -0.33132485, 0.00000127, 165.98553869],
[ 0.00000127, -0.65559912, 0.0000013 ],
[ 165.98553869, 0.0000013 , 0.41913346]]],
[[[ -0.49494431, 166.02923119, 0.00000127],
[ 166.02923119, 0.28689554, -0.00000123],
[ 0.00000127, -0.00000123, 0.57156702]],
[[ 118.89168597, 0.28689554, -0.65559912],
[ 0.28689554, 552.62389047, 0.32194701],
[ -0.65559912, 0.32194701, 130.21758722]],
[[ 0.64666809, -0.00000123, 0.0000013 ],
[ -0.00000123, 0.32194701, 165.99413061],
[ 0.0000013 , 165.99413061, -0.42997465]]],
[[[ -0.33132485, 0.00000127, 165.98553869],
[ 0.00000127, -0.65559912, 0.0000013 ],
[ 165.98553869, 0.0000013 , 0.41913346]],
[[ 0.64666809, -0.00000123, 0.0000013 ],
[ -0.00000123, 0.32194701, 165.99413061],
[ 0.0000013 , 165.99413061, -0.42997465]],
[[ 140.09010389, 0.57156702, 0.41913346],
[ 0.57156702, 130.21758722, -0.42997465],
[ 0.41913346, -0.42997465, 486.62412063]]]])
which I believe is correct, while the chain of four np.tensordot calls leads to
array([[[[ 552.62389047, -0.28689554, 0.32194701],
[ -0.28689554, 118.89168597, 0.65559912],
[ -0.32194701, -0.65559912, -130.21758722]],
[[ -0.28689554, 166.02923119, 0.00000123],
[ 166.02923119, 0.49494431, 0.00000127],
[ -0.00000123, -0.00000127, 0.57156702]],
[[ -0.32194701, -0.00000123, -165.99413061],
[ -0.00000123, -0.64666809, 0.0000013 ],
[ 165.99413061, -0.0000013 , -0.42997465]]],
[[[ -0.28689554, 166.02923119, 0.00000123],
[ 166.02923119, 0.49494431, 0.00000127],
[ -0.00000123, -0.00000127, 0.57156702]],
[[ 118.89168597, 0.49494431, 0.64666809],
[ 0.49494431, 516.15898907, 0.33132485],
[ -0.64666809, -0.33132485, -140.09010389]],
[[ -0.65559912, -0.00000127, 0.0000013 ],
[ -0.00000127, -0.33132485, -165.98553869],
[ -0.0000013 , 165.98553869, -0.41913346]]],
[[[ 0.32194701, 0.00000123, 165.99413061],
[ 0.00000123, 0.64666809, -0.0000013 ],
[-165.99413061, 0.0000013 , 0.42997465]],
[[ 0.65559912, 0.00000127, -0.0000013 ],
[ 0.00000127, 0.33132485, 165.98553869],
[ 0.0000013 , -165.98553869, 0.41913346]],
[[-130.21758722, 0.57156702, 0.42997465],
[ 0.57156702, -140.09010389, 0.41913346],
[ -0.42997465, -0.41913346, 486.62412063]]]])
Notice the big negative numbers.
Approach #1
One way would be to use np.tensordot to get the same result as with np.einsum though not in a single step and with some help from the trusty broadcasting -
# Get broadcasted elementwise multiplication between two versions of Q.
# This corresponds to "np.einsum('im,jn,..', Q, Q)" producing "'ijmn""
# broadcasted version of elementwise multiplications between Q's.
Q_ext = Q[:,None,:,None]*Q[:,None,:]
# Similarly for Q_inv : For "np.einsum('..ok,pl', Q_inv, Q_inv)" get "'opkl'"
# broadcasted version of elementwise multiplications between Q_inv's.
Q_inv_ext = Q_inv[:,None,:,None]*Q_inv[:,None,:]
# Perform "np.einsum('im,jn,mnop,ok,pl', Q, Q, C)" with "np.tensordot".
# Notice that we are using the last two axes from 'Q_ext', so "axes=[2,3]"
# and first two from 'C', so "axes=[0,1]" for it.
# These axes would be reduced by the dot-product, leaving us with 'ijop'.
parte1 = np.tensordot(Q_ext,C,axes=([2,3],[0,1]))
# Do it one more time to perform "np.einsum('ijop,ok,pl', parte1,Q_inv,Q_inv)"
# to reduce dimensions represented by 'o,p', leaving us with 'ijkl'.
# To confirm, compare the following against original einsum approach :
# "np.einsum('im,jn,mnop,ok,pl->ijkl', Q, Q, C, Q_inv, Q_inv)"
out = np.tensordot(parte1,Q_inv_ext,axes=([2,3],[0,1]))
Approach #2
If you wish to avoid broadcasting in favour of using two more instances of np.tensordot, you could do -
# Perform "np.einsum('jn,mnop', Q, C). Notice how, Q is represented by 'jn'
# and C by 'mnop'. We need to reduce the 'm' dimension, i.e. reduce 'axes=1'
# from Q and `axes=1` from C corresponding to `n' in each of the inputs.
# Thus, 'jn' + 'mnop' => 'jmop' after 'n' is reduced and order is maintained.
Q_C1 = np.tensordot(Q,C,axes=([1],[1]))
# Perform "np.einsum('im,jn,mnop', Q, Q, C). We need to use Q and Q_C1.
# Q is 'im' and Q_C1 is 'jmop'. Thus, again we need to reduce 'axes=1'
# from Q and `axes=1` from Q_C1 corresponding to `m' in each of the inputs.
# Thus, 'im' + 'jmop' => 'ijop' after 'm' is reduced and order is maintained.
parte1 = np.tensordot(Q,Q_C1,axes=([1],[1]))
# Use the same philosophy to get the rest of the einsum equivalent,
# but use parte1 and go right and use Q_inv
out = np.tensordot(np.tensordot(parte1,Q_inv,axes=([2],[0])),Q_inv,axes=([2],[0]))
The trick with np.tensordot is to keep track of the dimensions that are reduced by the axes parameter and how the collapsed dimensions align against the remaining inputs' dimensions.
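As a quick sanity check (an illustrative snippet, not from the original answer), both tensordot formulations can be compared against the einsum reference with np.allclose, using a small random C in place of the elastic tensor above:
import numpy as np

rng = np.random.default_rng(0)
C = rng.standard_normal((3, 3, 3, 3))            # stand-in for the elastic tensor
Q = np.array([[0., 1., 0.], [-1., 0., 0.], [0., 0., 1.]])
Q_inv = np.linalg.inv(Q)

# Reference result
ref = np.einsum('im,jn,mnop,ok,pl->ijkl', Q, Q, C, Q_inv, Q_inv)

# Approach #1: broadcasting + two tensordot calls
Q_ext = Q[:, None, :, None] * Q[:, None, :]
Q_inv_ext = Q_inv[:, None, :, None] * Q_inv[:, None, :]
out1 = np.tensordot(np.tensordot(Q_ext, C, axes=([2, 3], [0, 1])),
                    Q_inv_ext, axes=([2, 3], [0, 1]))

# Approach #2: four tensordot calls
Q_C1 = np.tensordot(Q, C, axes=([1], [1]))
parte1 = np.tensordot(Q, Q_C1, axes=([1], [1]))
out2 = np.tensordot(np.tensordot(parte1, Q_inv, axes=([2], [0])),
                    Q_inv, axes=([2], [0]))

print(np.allclose(ref, out1), np.allclose(ref, out2))   # expect: True True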