BigQuery GIS table to GeoJSON type Feature Collection with SQL and/or Python - sql

I have some BigQuery tables with spatial data (lon, lat, point, and linestring) representing boat tracks.
I'm trying to get them into a GeoJSON for an API output.
I have one method that uses Python to query my table into a pandas dataframe, which I can then use a function to create a feature collection.
But, there's so much data/day that it'd be ideal if I could perform some kind of efficient linestring union and simplify.
My current method:
# Query BigQuery table
sql = """SELECT
lon, lat, lead_lon, lead_lat,
CAST(CAST(timestamp AS DATE) as string) as date,
UNIX_SECONDS(timestamp) as unix_secs,
CAST(ship_num AS STRING) AS ship_num,
op,
CAST(knots AS FLOAT64) AS knots,
point,
linestring
FROM `ship_segments`
WHERE timestamp BETWEEN '2020-04-16' AND '2020-04-17';"""
# Make into pandas dataframe
df = client.query(sql).to_dataframe()
#df to geojson fn
def data2geojson(df):
features = []
insert_features = lambda X: features.append(
geojson.Feature(geometry=geojson.LineString(([X["lead_lon"], X["lead_lat"], X["knots"], X["unix_secs"]],
[X["lon"], X["lat"], X["knots"], X["unix_secs"]])),
properties=dict(date=X["date"],
mmsi=X["ship_num"],
operator=X["op"]
)))
df.apply(insert_features, axis=1)
geojson_obj = geojson.dumps(geojson.FeatureCollection(features, indent=2, sort_keys=True), sort_keys=True, ensure_ascii=False)
return(geojson_obj)
results = data2geojson(df)
This returns a GeoJSON:
{"features": [{"geometry": {"coordinates": [[-119.049945, 33.983277, 10.5502, 1587104709], [-119.034677, 33.975823, 10.5502, 1587104709]], "type": "LineString"}, "properties": {"date": "2020-04-17", "mmsi": "235098383", "operator": "Evergreen Marine Corp"}, "type": "Feature"}, {"geometry": {"coordinates": [[-120.176933, 34.282107, 22.7005, 1587114969], [-120.144453, 34.275147, 22.7005, 1587114969]], "type": "LineString"}, "properties": {"date": "2020-04-17", "mmsi": "235098383", "operator": "Evergreen Marine Corp"}, "type": "Feature"}, {"geometry": {"coordinates": [[-118.361737, 33.64647, 11.3283, 1587096305], [-118.356308, 33.643713, 11.3283, 1587096305]], "type": "LineString"}, "properties": {"date": "2020-04-17", "mmsi": "538005412", "operator": "Scorpio MR Pool Ltd"}, "type": "Feature"}, {"geometry": {"coordinates": [[-118.414667, 33.673013, 12.7684, 1587097278], [-118.411707, 33.671493, 12.7684, 1587097278]], "type": "LineString"}, "properties": {"date": "2020-04-17", "mmsi": "538005412", "operator": "Scorpio MR Pool Ltd"}, "type": "Feature"}, {"geometry": {"coordinates": [[-119.377783, 34.062612, 10.5456, 1587102119], [-119.384212, 34.064217, 10.5456, 1587102119]], "type": "LineString"}, "properties": {"date": "2020-04-17", "mmsi": "636018225", "operator": "Ocean Network Express Pte Ltd"}, "type": "Feature"}], "indent": 2, "sort_keys": true, "type": "FeatureCollection"}
But I'm trying something like:
select
ship_num,
date(timestamp) as date,
AVG(speed_knots) as avg_speed_knots,
st_union_agg(linestring) as multiline
from(
SELECT
*,
row_number() OVER w AS num,
ST_GeogPoint(lon,lat) as geom,
LEAD(ST_GeogPoint(lon,lat)) OVER w AS geom2,
ST_MAKELINE((ST_GeogPoint(lon,lat)), (LEAD(ST_GeogPoint(lon,lat)) OVER w)) AS linestring,
LEAD(STRING(timestamp), 0) OVER w AS t1,
LEAD(STRING(timestamp), 1) OVER w AS t2,
FROM
`ship_data`
where timestamp >= '2020-04-10'
WINDOW w AS (PARTITION BY ship_num ORDER BY timestamp)) AS q
group by
ship_num, date(timestamp);
This gives me multilinestrings in a table, but then I need to simplify and get them in a GeoJSON FeatureCollection output.
Any ideas that don't use PostGIS?

Related

BigQuery string-formatting to json

Is the following a full list of all value types as they're passed to json in BigQuery? I've gotten this by trial and error but haven't been able to find this in the documentation:
select
NULL as NullValue,
FALSE as BoolValue,
DATE '2014-01-01' as DateValue,
INTERVAL 1 year as IntervalValue,
DATETIME '2014-01-01 01:02:03' as DatetimeValue,
TIMESTAMP '2014-01-01 01:02:03' as TimestampValue,
"Hello" as StringValue,
B"abc" as BytesValue,
123 as IntegerValue,
NUMERIC '3.14' as NumericValue,
3.14 as FloatValue,
TIME '12:30:00.45' as TimeValue,
[1,2,3] as ArrayValue,
STRUCT('Mark' as first, 'Thomas' as last) as StructValue,
[STRUCT(1 as x, 2 as y), STRUCT(5 as x, 6 as y)] as ArrayStructValue,
STRUCT(1 as x, [1,2,3] as y, ('a','b','c') as z) as StructNestedValue
{
"NullValue": null,
"BoolValue": "false", // why not just false without quotes?
"DateValue": "2014-01-01",
"IntervalValue": "1-0 0 0:0:0",
"DatetimeValue": "2014-01-01T01:02:03",
"TimestampValue": "2014-01-01T01:02:03Z",
"StringValue": "Hello",
"BytesValue": "YWJj",
"IntegerValue": "123",
"NumericValue": "3.14",
"FloatValue": "3.14",
"TimeValue": "12:30:00.450000",
"ArrayValue": ["1", "2", "3"],
"StructValue": {
"first": "Mark",
"last": "Thomas"
},
"ArrayStructValue": [
{"x": "1", "y": "2"},
{"x": "5", "y": "6"}
],
"StructNestedValue": {
"x": "1",
"y": ["1", 2", "3"],
"z": {"a": "a", b": "b", "c": "c"}
}
}
Honestly, it seems to me that other than the null value and the array [] or struct {} container, everything is string-enclosed, which seems a bit odd.
According to this document, json is built on two structures:
A collection of name/value pairs. In various languages, this is
realized as an object, record, struct, dictionary, hash table, keyed
list, or associative array.
An ordered list of values. In most
languages, this is realized as an array, vector, list, or sequence.
The result of the SELECT query is in json format, wherein [] depicts an array datatype, {} depicts an object datatype and double quotes(" ") depicts a string value as in the query itself.

is there's function to extract value from dictionary in list?

kindly need to extract name value, even it's Action or adventure from this column in new column
in pandas
'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'
You want from_records:
import pandas as pd
data = [{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]
df = pd.DataFrame.from_records(data)
df
you get
id name
0 28 Action
1 12 Adventure
2 14 Fantasy
3 878 Science Fiction

How to select a field of a JSON object coming from the WHERE condition

I have this table
id name json
1 alex {"type": "user", "items": [ {"name": "banana", "color": "yellow"}, {"name": "apple", "color": "red"} ] }
2 peter {"type": "user", "items": [ {"name": "watermelon", "color": "green"}, {"name": "pepper", "color": "red"} ] }
3 john {"type": "user", "items": [ {"name": "tomato", "color": "red"} ] }
4 carl {"type": "user", "items": [ {"name": "orange", "color": "orange"}, {"name": "nut", "color": "brown"} ] }
Important, each json object can have different number of "items", but what I need is the "product name" of JUST the object that matched in the WHERE condition.
My desired output would be the two first columns and just the name of the item, WHERE the color is like %red%:
id name fruit
1 alex apple
2 peter pepper
3 john tomato
select id, name, ***** (this is what I don't know) FROM table
where JSON_EXTRACT(json, "$.items[*].color") like '%red%'
I would recommend json_table(), if you are running MySQL 8.0:
select t.id, t.name, x.name as fruit
from mytable t
cross join json_table(
t.js,
'$.items[*]' columns (name varchar(50) path '$.name', color varchar(50) path '$.color')
) x
where x.color = 'red'
This function is not implemented in MariaDB. We can unnest manually with the help of a numbers table:
select t.id, t.name,
json_unquote(json_extract(t.js, concat('$.items[', x.num, '].name'))) as fruit
from mytable t
inner join (select 0 as num union all select 1 union all select 2 ...) x(num)
on x.num < json_length(t.js, '$.items')
where json_unquote(json_extract(t.js, concat('$.items[', x.num, '].color'))) = 'red'
You can use JSON_EXTRACT() function along with Recursive Common Table Expression in order to generate rows dynamically such as
WITH RECURSIVE cte AS
(
SELECT 1 AS n
UNION ALL
SELECT n + 1
FROM cte
WHERE cte.n < (SELECT MAX(JSON_LENGTH(json)) FROM t )
)
SELECT id, name,
JSON_UNQUOTE(JSON_EXTRACT(json,CONCAT('$.items[',n-1,'].name'))) AS fruit
FROM cte
JOIN t
WHERE JSON_EXTRACT(json,CONCAT('$.items[',n-1,'].color')) = "red"
Demo

Combining separate temporal measurement series

I have a data set that combines two temporal measurement series with one row per measurement
time: 1, measurement: a, value: 5
time: 2, measurement: b, value: false
time: 10, measurement: a, value: 2
time: 13, measurement: b, value: true
time: 20, measurement: a, value: 4
time: 24, measurement: b, value: true
time: 30, measurement: a, value: 6
time: 32, measurement: b, value: false
in a visualization using Vega lite, I'd like to combine the measurement series and encode measurement a and b in a single visualization without simply layering their representation on a temporal axis but representing their value in a single encoding spec.
either measurement a values need to be interpolated and added as a new value to rows of measurement b
eg:
time: 2, measurement: b, value: false, interpolatedMeasurementA: 4.6667
or the other way around, which leaves the question of how to interpolate a boolean. maybe closest value by time, or simpler: last value
eg:
time: 30, measurement: a, value: 6, lastValueMeasurementB: true
I suppose this could be done either query side in which case this question would be regarding indexDB Flux query language
or this could be done on the visualization side in which case this would be regarding vega-lite
There's not any true linear interpolation schemes built-in to Vega-Lite (though the loess transform comes close), but you can achieve roughly what you wish with a window transform.
Here is an example (view in editor):
{
"data": {
"values": [
{"time": 1, "measurement": "a", "value": 5},
{"time": 2, "measurement": "b", "value": false},
{"time": 10, "measurement": "a", "value": 2},
{"time": 13, "measurement": "b", "value": true},
{"time": 20, "measurement": "a", "value": 4},
{"time": 24, "measurement": "b", "value": true},
{"time": 30, "measurement": "a", "value": 6},
{"time": 32, "measurement": "b", "value": false}
]
},
"transform": [
{
"calculate": "datum.measurement == 'a' ? datum.value : null",
"as": "measurement_a"
},
{
"window": [
{"op": "mean", "field": "measurement_a", "as": "interpolated"}
],
"sort": [{"field": "time"}],
"frame": [1, 1]
},
{"filter": "datum.measurement == 'b'"}
],
"mark": "line",
"encoding": {
"x": {"field": "time"},
"y": {"field": "interpolated"},
"color": {"field": "value"}
}
}
This first uses a calculate transform to isolate the values to be interpolated, then a window transform that computes the mean over adjacent values (frame: [1, 1]), then a filter transform to isolate interpolated rows.
If you wanted to go the other route, you could do a similar sequence of transforms targeting the boolean value instead.

PLSQL: remove unwanted double quotes in a Json

I have a Json like this (it is contained in a clob variable):
{"id": "33", "type": "abc", "val": "2", "cod": "", "sg1": "1", "sg2": "1"}
{"id": "359", "type": "abcef", "val": "52", "cod": "aa", "sg1": "", "sg2": "0"}
…
I need to remove " from values of: id, val, sg1, sg2
Is it possibile?
For example, I need to obtain this:
{"id": 33, "type": "abc", "val": 2, "cod": "", "sg1": 1, "sg2": 1}
{"id": 359, "type": "abcef", "val": 52, "cod": "aa", "sg1": , "sg2": 0}
…
If you are using Oracle 12 (R2?) or later then you can convert your JSON to the appropriate data types and then convert it back to JSON.
Oracle 18 Setup:
CREATE TABLE test_data ( value CLOB );
INSERT INTO test_data ( value )
VALUES ( '{"id": "33", "type": "abc", "val": "2", "cod": "", "sg1": "1", "sg2": "1"}' );
INSERT INTO test_data ( value )
VALUES ( '{"id": "359", "type": "abcef", "val": "52", "cod": "aa", "sg1": "", "sg2": "0"}' );
Query:
SELECT JSON_OBJECT(
'id' IS j.id,
'type' IS j.typ,
'val' IS j.val,
'cod' IS j.cod,
'sg1' IS j.sg1,
'sg2' IS j.sg2
) AS JSON
FROM test_data t
CROSS JOIN
JSON_TABLE(
t.value,
'$'
COLUMNS
id NUMBER(5,0) PATH '$.id',
typ VARCHAR2(10) PATH '$.type',
val NUMBER(5,0) PATH '$.val',
cod VARCHAR2(10) PATH '$.cod',
sg1 NUMBER(5,0) PATH '$.sg1',
sg2 NUMBER(5,0) PATH '$.sg2'
) j
Output:
| JSON |
| :--------------------------------------------------------------- |
| {"id":33,"type":"abc","val":2,"cod":null,"sg1":1,"sg2":1} |
| {"id":359,"type":"abcef","val":52,"cod":"aa","sg1":null,"sg2":0} |
Or, if you want to use regular expressions (you shouldn't if you have the choice and should use a proper JSON parser instead) then:
Query 2:
SELECT REGEXP_REPLACE(
REGEXP_REPLACE(
value,
'"(id|val|sg1|sg2)": ""',
'"\1": "null"'
),
'"(id|val|sg1|sg2)": "(\d+|null)"',
'"\1": \2'
) AS JSON
FROM test_data
Output:
| JSON |
| :-------------------------------------------------------------------------- |
| {"id": 33, "type": "abc", "val": 2, "cod": "", "sg1": 1, "sg2": 1} |
| {"id": 359, "type": "abcef", "val": 52, "cod": "aa", "sg1": null, "sg2": 0} |
db<>fiddle here