Using Array(Tuple(LowCardinality(String), Int32)) in ClickHouse

I have a table
CREATE TABLE table (
id Int32,
values Array(Tuple(LowCardinality(String), Int32)),
date Date
) ENGINE = MergeTree()
PARTITION BY toYYYYMM(date)
ORDER BY (id, date)
but when I execute the query
SELECT count(*)
FROM table
WHERE (arrayExists(x -> ((x.1) = toLowCardinality('pattern')), values) = 1)
I get an error
Code: 49. DB::Exception: Received from clickhouse:9000. DB::Exception: Cannot capture column 3 because it has incompatible type: got String, but LowCardinality(String) is expected..
If I replace the column 'values' with
values Array(Tuple(String, Int32))
then the query executes without errors.
What could be the problem when using Array(Tuple(LowCardinality(String), Int32))?

Until it is fixed (see bug 7815), this workaround can be used; since ARRAY JOIN expands each row once per array element, uniqExact((id, date)) counts each matching source row exactly once:
SELECT uniqExact((id, date)) AS count
FROM table
ARRAY JOIN values
WHERE values.1 = 'pattern'
When there is more than one Array column, this approach can be used:
SELECT uniqExact((id, date)) AS count
FROM
(
SELECT
id,
date,
arrayJoin(values) AS v,
arrayJoin(values2) AS v2
FROM table
WHERE v.1 = 'pattern' AND v2.1 = 'pattern2'
)

values Array(Tuple(LowCardinality(String), Int32)),
Do not use Tuple here. It brings only downsides:
It is still stored as two files on disk.
It roughly doubles query time when you extract only one tuple element (benchmark: https://gist.github.com/den-crane/f20a2dce94a2926a1e7cfec7cdd12f6d).
Use two parallel arrays instead:
valuesS Array(LowCardinality(String)),
valuesI Array(Int32)
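With this schema, the filter no longer needs a tuple or a lambda capture at all. A minimal sketch, assuming valuesS and valuesI stay index-aligned:
SELECT count(*)
FROM table
WHERE has(valuesS, 'pattern')
If the paired Int32 is needed, the arrays can be realigned by position, e.g. valuesI[indexOf(valuesS, 'pattern')] (indexOf is 1-based and returns 0 when the value is absent).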

Related

SparkSQL regexp_extract function java error

I am trying to extract the IDs starting with srsa from the table structure below
id reason_text_field
34394 {"initial_customer":"sda_WWyfr4AXY1fIAS", "customer_result":"srsa_CAkAaAvNKL2OSD"}
in order to get the following output:
id srsa_id
34394 srsa_CAkAaAvNKL2OSD
but when I use the following SparkSQL function
REGEXP_EXTRACT(reason_text_field, 'srsa[^"]*') as srsa_id
I get this error:
java.lang.IndexOutOfBoundsException: No group
You need to specify the group to capture. Try this:
SELECT id,
REGEXP_EXTRACT(reason_text_field, '\"(srsa[^"]*)\"', 1) as srsa_id
-- or REGEXP_EXTRACT(reason_text_field, 'srsa[^"]*', 0) as srsa_id
FROM tb
Note, however, that you can also parse the text column reason_text_field into a map or struct using from_json and then extract the field customer_result:
SELECT id,
from_json(reason_text_field, 'map<string,string>')['customer_result'] as srsa_id
FROM tb
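If the JSON shape is fixed, a struct schema works as well and avoids the map lookup. A minimal sketch, assuming the fields are always initial_customer and customer_result:
SELECT id,
from_json(reason_text_field, 'struct<initial_customer:string,customer_result:string>').customer_result as srsa_id
FROM tb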

BigQuery UPDATE of column within nest from GoogleAnalytics export

I have a website's Google Analytics tracking data exported to BigQuery in the ga_sessions_* tables. It's the standard setup of any GA export to BQ I've come across.
For simplicity's sake I'll refer only to ga_sessions_20220801.
This table has the following fields:
- visitorId INTEGER NULLABLE
- visitNumber INTEGER NULLABLE
- visitId INTEGER NULLABLE
- visitStartTime INTEGER NULLABLE
- date STRING NULLABLE
- totals RECORD NULLABLE
- trafficSource RECORD NULLABLE
- device RECORD NULLABLE
- geoNetwork RECORD NULLABLE
- customDimensions RECORD REPEATED
- hits RECORD REPEATED
- fullVisitorId STRING NULLABLE
- userId STRING NULLABLE
- clientId STRING NULLABLE
- channelGrouping STRING NULLABLE
- socialEngagementType STRING NULLABLE
- privacyInfo RECORD NULLABLE
Field hits is a repeated field, i.e. it contains multiple records for each table row.
My question
How can I execute an update statements on field hits.sourcePropertyInfo.sourcePropertyTrackingId, where sourcePropertyInfo is another nest inside hits (but not a repeated record)?
My attempts
update `my_project.my_dataset.ga_sessions_20220801`
set hits.sourcePropertyInfo.sourcePropertyTrackingId = 'some value'
where 1=1
Cannot access field sourcePropertyInfo on a value with type ARRAY<STRUCT<hitNumber INT64, time INT64, hour INT64, ...>> at [2:10]
update `my_project.my_dataset.ga_sessions_20220801`
set (select sourcePropertyInfo.sourcePropertyDisplayName from unnest(hits)) = 'some value'
where 1=1
Syntax error: Expected keyword DELETE or keyword INSERT or keyword UPDATE but got keyword SELECT at [2:6]
The following is my only attempt that was not stopped by a syntax error. I ended up trying to re-create the whole nest by listing each field, but it still fails at runtime because the record is repeated.
update `my_project.my_dataset.ga_sessions_20220801`
set hits = ARRAY(
SELECT AS STRUCT
hits.hitNumber, hits.time, hits.hour, hits.minute, hits.isSecure, hits.isInteraction, hits.isEntrance, hits.isExit, hits.referer, hits.page,
(SELECT AS STRUCT hits.transaction.transactionId, hits.transaction.transactionRevenue, hits.transaction.transactionTax, hits.transaction.transactionShipping, hits.transaction.affiliation,
hits.transaction.currencyCode,
hits.transaction.localTransactionRevenue, hits.transaction.localTransactionTax, hits.transaction.localTransactionShipping, hits.transaction.transactionCoupon)
as transaction,
hits.item, hits.contentInfo, hits.appInfo, hits.exceptionInfo, hits.eventInfo, hits.product, hits.promotion, hits.promotionActionInfo, hits.refund, hits.eCommerceAction, hits.experiment, hits.publisher, hits.customVariables, hits.customDimensions, hits.customMetrics, hits.type, hits.social, hits.latencyTracking,
(SELECT AS STRUCT 'some value' as sourcePropertyDisplayName, hits.sourcePropertyInfo.sourcePropertyTrackingId)
as sourcePropertyInfo,
hits.contentGroup, hits.dataSource, hits.publisher_infos, hits.uses_transient_token
FROM gas.hits
)
where 1=1
Scalar subquery produced more than one element
You're so close with the third query. You should recreate the hits column, but preserve the original data structure.
In the query below, I take all the hit rows and replace their sourcePropertyInfo field in the struct.
Then, since the hits were unnested, I use array_agg to gather them back into an array.
update `my_project.my_dataset.ga_sessions_20220801`
set hits =
(
select array_agg(t)
from (
select
hit.* replace(
struct(
hit.sourcePropertyInfo.sourcePropertyDisplayName,
'some value' as sourcePropertyTrackingId
) as sourcePropertyInfo
)
from unnest(hits) as hit
) as t
)
where true
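Before running the UPDATE, the same rewrite can be previewed with a plain SELECT. A minimal sketch using the names from the question:
select
(
select array_agg(t)
from (
select
hit.* replace(
struct(
hit.sourcePropertyInfo.sourcePropertyDisplayName,
'some value' as sourcePropertyTrackingId
) as sourcePropertyInfo
)
from unnest(hits) as hit
) as t
) as new_hits
from `my_project.my_dataset.ga_sessions_20220801`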

Cannot have map type columns in DataFrame which calls set operations

org.apache.spark.sql.AnalysisException: Cannot have map type columns in DataFrame which calls set operations(intersect, except, etc.), but the type of column map_col is map
I have a Hive table with a column of type MAP<Float, Float>. I get the above error when I try to insert into this table in a Spark context. The insert works fine without the 'distinct'.
create table test_insert2(`test_col` string, `map_col` MAP<INT,INT>)
location 's3://mybucket/test_insert2';
insert into test_insert2
select distinct 'a' as test_col, map(0,0) as map_col
Try converting the DataFrame to an RDD, applying .distinct there, and collecting the result.
Example:
spark.sql("""select 'a' test_col, map(0,0) map_col
union all
select 'a' test_col, map(0,0) map_col""").rdd.distinct.collect
Result:
Array[org.apache.spark.sql.Row] = Array([a,Map(0 -> 0)])
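If staying in SQL is preferable, one possible workaround (a sketch, not from the original answer) is to avoid the set operation entirely: group by the non-map columns and keep one map per group with first(). Note this picks an arbitrary map per group, so it is only equivalent when duplicate rows carry the same map:
insert into test_insert2
select test_col, first(map_col) as map_col
from (
select 'a' test_col, map(0,0) map_col
union all
select 'a' test_col, map(0,0) map_col
) t
group by test_col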

Regex extract in BigQuery issue

I'm trying to simplify a column in BigQuery by using REGEXP_EXTRACT on it, but I am having a bit of an issue.
Here are two examples of the data I'm extracting from:
dc_pre=CLXk_aigyOMCFQb2dwod4dYCZw;gtm=2wg7f1;gcldc=;gclaw=;gac=UA-5815571-8:;auiddc=;u1=OVERDRFT;u2=undefined;u3=undefined;u4=undefined;u5=SSA;u6=undefined;u7=na;u8=undefined;u9=undefined;u10=undefined;u11=undefined;~oref=https://www.online.bank.co.za/onlineContent/ga_bridge.html
dc_pre=COztt4-tyOMCFcji7Qod440PCw;gtm=2wg7f1;gcldc=;gclaw=;gac=UA-5815571-8:;auiddc=;u1=DDA13;u2=undefined;u3=undefined;u4=undefined;u5=SSA;u6=undefined;u7=na;u8=undefined;u9=undefined;u10=undefined;u11=undefined;~oref=https://www.online.support.co.za/onlineContent/ga_bridge.html
I want to extract the portion between ;u1= and ;u2
Running the following legacy SQL Query
SELECT
Date(Event_Time),
Activity_ID,
REGEXP_EXTRACT(Other_Data, r'(?<=u1=)(.*\n?)(?=;u2)')
FROM
[sprt-data-transfer:dtftv2_sprt.p_activity_166401]
WHERE
Activity_ID in ('8179851')
AND Site_ID_DCM NOT IN ('2134603','2136502','2539719','2136304','2134604','2134602','2136701','2378406')
AND Event_Time BETWEEN 1563746400000000 AND 1563832799000000
I get the error...
Failed to parse regular expression "(?<=u1=)(.*\n?)(?=;u2)": invalid
perl operator: (?<
And this is where my talent runs out: is the error caused by my use of legacy SQL, or is this an unsupported regex format?
Just tried this, and it worked, but with Standard SQL enabled. The lookbehind and lookahead in your pattern are the problem: BigQuery's regex functions use RE2, which supports neither (?<= nor (?=, so use a lazy capturing group instead:
select
other_data,
regexp_extract(other_data, ';u1=(.+?);u2') as some_part
from
unnest([
'dc_pre=CLXk_aigyOMCFQb2dwod4dYCZw;gtm=2wg7f1;gcldc=;gclaw=;gac=UA-5815571-8:;auiddc=;u1=OVERDRFT;u2=undefined;u3=undefined;u4=undefined;u5=SSA;u6=undefined;u7=na;u8=undefined;u9=undefined;u10=undefined;u11=undefined;~oref=https://www.online.bank.co.za/onlineContent/ga_bridge.html',
'dc_pre=COztt4-tyOMCFcji7Qod440PCw;gtm=2wg7f1;gcldc=;gclaw=;gac=UA-5815571-8:;auiddc=;u1=DDA13;u2=undefined;u3=undefined;u4=undefined;u5=SSA;u6=undefined;u7=na;u8=undefined;u9=undefined;u10=undefined;u11=undefined;~oref=https://www.online.support.co.za/onlineContent/ga_bridge.html'
]) as other_data
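If switching dialects is not an option, the same capture-group idea should also work in the original legacy SQL query, since legacy REGEXP_EXTRACT likewise uses RE2 and returns the first capturing group. A sketch based on the query from the question:
SELECT
Date(Event_Time),
Activity_ID,
REGEXP_EXTRACT(Other_Data, r';u1=(.*?);u2')
FROM
[sprt-data-transfer:dtftv2_sprt.p_activity_166401]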
This alternative doesn't use regex, but it still works...
with test as (
select 1 as id, 'dc_pre=CLXk_aigyOMCFQb2dwod4dYCZw;gtm=2wg7f1;gcldc=;gclaw=;gac=UA-5815571-8:;auiddc=;u1=OVERDRFT;u2=undefined;u3=undefined;u4=undefined;u5=SSA;u6=undefined;u7=na;u8=undefined;u9=undefined;u10=undefined;u11=undefined;~oref=https://www.online.bank.co.za/onlineContent/ga_bridge.html' as my_str UNION ALL
select 2 as id, 'dc_pre=COztt4-tyOMCFcji7Qod440PCw;gtm=2wg7f1;gcldc=;gclaw=;gac=UA-5815571-8:;auiddc=;u1=DDA13;u2=undefined;u3=undefined;u4=undefined;u5=SSA;u6=undefined;u7=na;u8=undefined;u9=undefined;u10=undefined;u11=undefined;~oref=https://www.online.support.co.za/onlineContent/ga_bridge.html'
),
temp as (
select
id,
split(my_str,';') as items
from test
),
flattened as (
select
id,
split(i,'=')[SAFE_OFFSET(0)] as left_side,
split(i,'=')[SAFE_OFFSET(1)] as right_side
from temp
left join unnest(items) i
)
select * from flattened
where left_side = 'u1'

Select rows from table with jsonb column based on arbitrary jsonb filter expression

Test data
DROP TABLE t;
CREATE TABLE t(_id serial PRIMARY KEY, data jsonb);
INSERT INTO t(data) VALUES
('{"a":1,"b":2, "c":3}')
, ('{"a":11,"b":12, "c":13}')
, ('{"a":21,"b":22, "c":23}')
Problem statement: I want to receive an arbitrary JSONB parameter which acts as a filter on column t.data, such as
{ "b":{ "from":0, "to":20 }, "c":13 }
and use this to select matching rows from my test table t.
In this example, I want rows where b is between 0 and 20 and c = 13.
No error is required if the filter specifies a "column" (or "tag") which does not exist in t.data - it just fails to find a match.
I've used numeric values for simplicity but would like an approach which generalises to text as well.
What I have tried so far. I looked at the containment approach, which works for equality conditions, but am stumped on a generic way of handling range conditions:
select * from t
where t.data @> '{"c":13}'::jsonb;
Background: This problem arose when building a generic table-preview page on a website (for Admin users).
The page displays a filter based on various columns in whichever table is selected for preview.
The filter is then passed to a function in Postgres DB which applies this dynamic filter condition to the table.
It returns a jsonb array of the rows matching the filter specified by the user.
This jsonb array is then used to populate the Preview resultset.
The columns which make up the filter may change.
My Postgres version is 9.6 - thanks.
If you want to parse { "b":{ "from":0, "to":20 }, "c":13 } generically, you need a parser. That is out of scope for the JSON functions, but you can write a "generic" query using AND and OR to filter by such JSON, e.g.:
https://www.db-fiddle.com/f/jAPBQggG3p7CxqbKLMbPKw/0
with filt(f) as (values('{ "b":{ "from":0, "to":20 }, "c":13 }'::json))
select *
from t
join filt on
(f->'b'->>'from')::int < (data->>'b')::int
and
(f->'b'->>'to')::int > (data->>'b')::int
and
(data->>'c')::int = (f->>'c')::int
;
Thanks for the comments/suggestions.
I will definitely look at GraphQL when I have more time - I'm working under a tight deadline at the moment.
It seems the consensus is that a fully generic solution is not achievable without a parser.
However, I got a workable first draft - it's far from ideal but we can work with it. Any comments/improvements are welcome ...
Test data (expanded to include dates & text fields)
DROP TABLE t;
CREATE TABLE t(_id serial PRIMARY KEY, data jsonb);
INSERT INTO t(data) VALUES
('{"a":1,"b":2, "c":3, "d":"2018-03-10", "e":"2018-03-10", "f":"Blah blah" }')
, ('{"a":11,"b":12, "c":13, "d":"2018-03-14", "e":"2018-03-14", "f":"Howzat!"}')
, ('{"a":21,"b":22, "c":23, "d":"2018-03-14", "e":"2018-03-14", "f":"Blah blah"}')
First draft of code to apply a jsonb filter dynamically, but with restrictions on what syntax is supported.
Also, it just fails silently if the syntax supplied does not match what it expects.
The timestamp handling is a bit kludgy, too.
-- Handle timestamp & text types as well as int
-- See is_timestamp(text) function at bottom
with cte as (
select t.data, f.filt, fk.key
from t
, ( values ('{ "a":11, "b":{ "from":0, "to":20 }, "c":13, "d":"2018-03-14", "e":{ "from":"2018-03-11", "to": "2018-03-14" }, "f":"Howzat!" }'::jsonb ) ) as f(filt) -- equiv to cross join
, lateral (select * from jsonb_each(f.filt)) as fk
)
select data, filt --, key, jsonb_typeof(filt->key), jsonb_typeof(filt->key->'from'), is_timestamp((filt->key)::text), is_timestamp((filt->key->'from')::text)
from cte
where
case when (filt->key->>'from') is null then
case jsonb_typeof(filt->key)
when 'number' then (data->>key)::numeric = (filt->>key)::numeric
when 'string' then
case is_timestamp( (filt->key)::text )
when true then (data->>key)::timestamp = (filt->>key)::timestamp
else (data->>key)::text = (filt->>key)::text
end
when 'boolean' then (data->>key)::boolean = (filt->>key)::boolean
else false
end
else
case jsonb_typeof(filt->key->'from')
when 'number' then (data->>key)::numeric between (filt->key->>'from')::numeric and (filt->key->>'to')::numeric
when 'string' then
case is_timestamp( (filt->key->'from')::text )
when true then (data->>key)::timestamp between (filt->key->>'from')::timestamp and (filt->key->>'to')::timestamp
else (data->>key)::text between (filt->key->>'from')::text and (filt->key->>'to')::text
end
when 'boolean' then false
else false
end
end
group by data, filt
having count(*) = ( select count(distinct key) from cte ) -- must match on all filter elements
;
create or replace function is_timestamp(s text) returns boolean as $$
begin
perform s::timestamp;
return true;
exception when others then
return false;
end;
$$ strict language plpgsql immutable;
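A quick sanity check of the helper against the test data above:
select is_timestamp('2018-03-14'); -- true
select is_timestamp('Howzat!'); -- false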