I have a ClickHouse database with a simple table with two fields (pagePath String, pageviews Int).
I want to sum pageviews for each filter + value, to find out which filters users use most. Filters are separated by commas.
Example data
pagePath                                             pageviews
/url1/filter1.value1                                 1
/url1/filter1.value2                                 2
/url1/filter1.value3,filter1.value2                  3
/url1/filter1.value4,filter3.value2                  4
/url1/filter1.value5,filter3.value2,filter1.value2   5
/url1/filter2.value1,filter3.value2,filter1.value2   6
/url1/filter2.value2                                 7
/url-2/filter2.value3                                8
/url-2/filter2.value4                                9
/url-11/filter3.value1                               10
/url-21/filter3.value1                               11
/url1/filter3.value2                                 12
/url1/filter3.value3                                 13
/url1/filter3.value4                                 14
create table T (pagePath String , pageviews Int64) Engine=Memory;
insert into T values ('/url1/filter1.value1',1);
insert into T values ('/url1/filter1.value2',2);
insert into T values ('/url1/filter1.value3,filter1.value2',3);
insert into T values ('/url1/filter1.value4,filter3.value2',4);
insert into T values ('/url1/filter1.value5,filter3.value2,filter1.value2',5);
insert into T values ('/url1/filter2.value1,filter3.value2,filter1.value2',6);
insert into T values ('/url1/filter2.value2',7);
insert into T values ('/url-2/filter2.value3',8);
insert into T values ('/url-2/filter2.value4',9);
insert into T values ('/url-11/filter3.value1',10);
insert into T values ('/url-21/filter3.value1',11);
insert into T values ('/url1/filter3.value2',12);
insert into T values ('/url1/filter3.value3',13);
insert into T values ('/url1/filter3.value4',14);
And I want to get the sum of pageviews for each filter + value:
Filters          pageviews
filter1.value1   1
filter1.value2   16 (2 + 3 + 5 + 6)
filter1.value3   3
filter1.value4   4
filter1.value5   5
filter2.value1   6
filter2.value2   7
filter2.value3   8
filter2.value4   9
filter3.value1   10
filter3.value1   11
filter3.value2   12 (4 + 5 + 6 + 12)
filter3.value3   13
filter3.value4   14
And I also want to get the sum of pageviews for each filter (without values):
Filters   pageviews
filter1   15 (1 + 2 + 3 + 4 + 5)
filter2   30 (6 + 7 + 8 + 9)
filter3   60 (10 + 11 + 12 + 13 + 14)
I tried with:
select
    arrJoinFilters,
    sum(PV) as totales
from
(
    SELECT
        arrJoinFilters,
        splitByChar(',', replaceRegexpAll(pagePath, '^/url.*/(.*\..*)$', '\\1')) arrFilter,
        pageviews as PV
    FROM T
    ARRAY JOIN arrFilter AS arrJoinFilters
    GROUP by arrFilter, PV, arrJoinFilters
)
group by arrJoinFilters
order by arrJoinFilters
But I think there is something wrong, and I don't get the second desired result.
Thanks!
Note that 4 + 5 + 6 + 12 = 27, not 12.
SELECT
arrayJoin(splitByChar(',',replaceRegexpAll(pagePath,'^/url.*/(.*\..*)$','\\1'))) f,
sum(pageviews) as PV
FROM T
GROUP by f
order by f
┌─f──────────────┬─PV─┐
│ filter1.value1 │ 1 │
│ filter1.value2 │ 16 │
│ filter1.value3 │ 3 │
│ filter1.value4 │ 4 │
│ filter1.value5 │ 5 │
│ filter2.value1 │ 6 │
│ filter2.value2 │ 7 │
│ filter2.value3 │ 8 │
│ filter2.value4 │ 9 │
│ filter3.value1 │ 21 │
│ filter3.value2 │ 27 │
│ filter3.value3 │ 13 │
│ filter3.value4 │ 14 │
└────────────────┴────┘
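To see what the extraction step produces for a single row, here is a quick standalone check (an illustration only, not part of the original answer):
SELECT splitByChar(',', replaceRegexpAll('/url1/filter2.value1,filter3.value2,filter1.value2', '^/url.*/(.*\..*)$', '\\1')) AS parts
-- returns ['filter2.value1','filter3.value2','filter1.value2']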
select splitByChar('.', f)[1] x, sum(PV), groupArray(PV), groupArray(f)
from
(
    SELECT
        arrayJoin(splitByChar(',', replaceRegexpAll(pagePath, '^/url.*/(.*\..*)$', '\\1'))) f,
        sum(pageviews) as PV
    FROM T
    GROUP by f
    order by f
)
group by x
┌─x───────┬─sum(PV)─┬─groupArray(PV)─┬─groupArray(f)──────────────────────────────────────────────────────────────────────────┐
│ filter2 │ 30 │ [6,7,8,9] │ ['filter2.value1','filter2.value2','filter2.value3','filter2.value4'] │
│ filter3 │ 75 │ [21,27,13,14] │ ['filter3.value1','filter3.value2','filter3.value3','filter3.value4'] │
│ filter1 │ 29 │ [1,16,3,4,5] │ ['filter1.value1','filter1.value2','filter1.value3','filter1.value4','filter1.value5'] │
└─────────┴─────────┴────────────────┴────────────────────────────────────────────────────────────────────────────────────────┘
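The per-filter totals can also be computed in a single pass by taking the part before the dot directly; a sketch, which on this data should give the same 29 / 30 / 75 totals as the two-level query above:
SELECT
    splitByChar('.', arrayJoin(splitByChar(',', replaceRegexpAll(pagePath, '^/url.*/(.*\..*)$', '\\1'))))[1] AS filter,
    sum(pageviews) AS PV
FROM T
GROUP BY filter
ORDER BY filter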
I'm using Grafana to show some data from Clickhouse. The data comes from a table containing itime, count and some other columns.
id method count itime
1 aaa 12 2021-07-20 00:07:06
2 bbb 9 2021-07-20 00:07:06
3 ccc 7 2021-07-20 00:07:07
...
Now I can execute the following SQL to get the sum of count between two itimes:
SELECT toUnixTimestamp(toStartOfMinute(itime)) * 1000 as t,
method,
sum(count) as c
FROM me.my_table
WHERE itime BETWEEN toDateTime(1631870605) AND toDateTime(1631874205)
and method like 'a%'
GROUP BY method, t
HAVING c > 500
ORDER BY t
It works as expected.
Now I want to select sum(count) based on the difference between sum(count) and the sum(count) from 7 days ago. Something like SELECT ... FROM ... WHERE ... HAVING c - c_7_days_ago >= 100. But I don't know how.
create table test(D Date, Key Int64, Val Int64) Engine=Memory;
insert into test select today(), number, 100 from numbers(5);
insert into test select today()-7, number, 110 from numbers(5);
-- For each (D, Key) daily sum s, emit it twice: once at its own date and once shifted
-- 7 days forward, so every date sees both its own sum (s) and the sum from a week ago (s1).
select sx.2 d1, Key, sumIf(sx.1, D = sx.2) s, sumIf(sx.1, D != sx.2) s1
from
(
    select D, Key, arrayJoin([(s, D), (s, D + interval 7 day)]) sx
    from (select D, Key, sum(Val) s from test group by D, Key)
)
group by d1, Key
order by d1, Key;
┌─────────d1─┬─Key─┬───s─┬──s1─┐
│ 2021-09-10 │ 0 │ 110 │ 0 │
│ 2021-09-10 │ 1 │ 110 │ 0 │
│ 2021-09-10 │ 2 │ 110 │ 0 │
│ 2021-09-10 │ 3 │ 110 │ 0 │
│ 2021-09-10 │ 4 │ 110 │ 0 │
│ 2021-09-17 │ 0 │ 100 │ 110 │
│ 2021-09-17 │ 1 │ 100 │ 110 │
│ 2021-09-17 │ 2 │ 100 │ 110 │
│ 2021-09-17 │ 3 │ 100 │ 110 │
│ 2021-09-17 │ 4 │ 100 │ 110 │
│ 2021-09-24 │ 0 │ 0 │ 100 │
│ 2021-09-24 │ 1 │ 0 │ 100 │
│ 2021-09-24 │ 2 │ 0 │ 100 │
│ 2021-09-24 │ 3 │ 0 │ 100 │
│ 2021-09-24 │ 4 │ 0 │ 100 │
└────────────┴─────┴─────┴─────┘
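If the goal is the filter described in the question (c - c_7_days_ago >= 100), the same grouping can be filtered directly; a sketch on the test table above, where 100 is just the placeholder threshold from the question:
select sx.2 d1, Key, sumIf(sx.1, D = sx.2) s, sumIf(sx.1, D != sx.2) s1
from
(
    select D, Key, arrayJoin([(s, D), (s, D + interval 7 day)]) sx
    from (select D, Key, sum(Val) s from test group by D, Key)
)
group by d1, Key
having s - s1 >= 100
order by d1, Key;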
SELECT
D,
Key,
Val,
any(Val) OVER (PARTITION BY Key ORDER BY D ASC RANGE BETWEEN 7 PRECEDING AND 7 PRECEDING) Val1
FROM test
┌──────────D─┬─Key─┬─Val─┬─Val1─┐
│ 2021-09-10 │ 0 │ 110 │ 0 │
│ 2021-09-17 │ 0 │ 100 │ 110 │
│ 2021-09-10 │ 1 │ 110 │ 0 │
│ 2021-09-17 │ 1 │ 100 │ 110 │
│ 2021-09-10 │ 2 │ 110 │ 0 │
│ 2021-09-17 │ 2 │ 100 │ 110 │
│ 2021-09-10 │ 3 │ 110 │ 0 │
│ 2021-09-17 │ 3 │ 100 │ 110 │
│ 2021-09-10 │ 4 │ 110 │ 0 │
│ 2021-09-17 │ 4 │ 100 │ 110 │
└────────────┴─────┴─────┴──────┘
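Window functions cannot be used directly in WHERE or HAVING, so to filter on the difference you can wrap the window query in a subquery; a sketch on the same test table (the 100 threshold is the example value from the question):
SELECT D, Key, Val, Val1, Val - Val1 AS diff
FROM
(
    SELECT
        D, Key, Val,
        any(Val) OVER (PARTITION BY Key ORDER BY D ASC RANGE BETWEEN 7 PRECEDING AND 7 PRECEDING) AS Val1
    FROM test
)
WHERE Val - Val1 >= 100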
I had a similar problem a while ago.
Please check the SQLfiddle.
To see the result, press the buttons: first "Build Schema", then "Run SQL".
Naming
I assumed that for your selected period A you want to compare against a period B seven days earlier (you need to be more specific about what you are really looking for).
period A = your selected time period (between from and to)
period B = your selected time period one week in the past
Problem
This is a really delicate question, if I understood it correctly.
Your example is grouped by minute inside period A. This means you really need data in period A for every minute you have data in period B; otherwise you will ignore period B data inside your chosen period.
As you can see in the SQLfiddle, I made two query strings. The first one works but ignores B data. The second one does a RIGHT JOIN (sadly MySQL does not support FULL OUTER JOIN to show everything in one table) and shows 2 ignored entries.
Grouping by method as well makes it even worse.
(In this case, for the fiddle, you have to change the last line of the join and add:)
as b on a.unix_itime = b.unix_itime and a.method = b.method
This means you need minute-wise data for every selected method and period.
It would be better to group only by method and not by time, since you already use a time condition (period A) to keep the result small, or to make the stepping bigger, by hour or day.
This code should fit your environment (MySQL does not support toUnixTimestamp, toStartOfMinute, or toDateTime):
SELECT
a.unix_itime * 1000 as t,
a.method,
a.sum AS c,
b.sum AS c2,
ifnull(a.sum,0) - ifnull(b.sum,0) as diff
FROM (select method, sum(count) as sum, toUnixTimestamp(toStartOfMinute(itime)) as unix_itime
from my_table
WHERE method like 'a%' and
itime BETWEEN toDateTime(1631870605)
AND toDateTime(1631874205)
GROUP BY method, unix_itime)
as a
LEFT JOIN (select method, sum(count) as sum, toUnixTimestamp(toStartOfMinute(itime + INTERVAL 7 DAY)) as unix_itime
from my_table
WHERE method like 'a%' and
itime BETWEEN toDateTime(1631870605)- INTERVAL 7 DAY
AND toDateTime(1631874205)- INTERVAL 7 DAY
GROUP BY method, unix_itime)
as b on a.unix_itime = b.unix_itime and a.method = b.method
ORDER BY a.unix_itime;
The logic is slightly ambiguous, but this could produce one possible meaning of the above. If you still want to return overall SUM(count), just add that to the select list.
SELECT toUnixTimestamp(toStartOfMinute(itime)) * 1000 AS t
, method
, SUM(count) AS c
, SUM(count) - SUM(CASE WHEN itime < current_date - INTERVAL 7 DAY THEN count END) AS c2
FROM me.my_table
WHERE method like 'a%'
GROUP BY method, t
HAVING c2 >= 100
ORDER BY t
;
Adjust as needed.
Maybe you didn't want to return the difference, just filter the groups returned. If so, try this:
SELECT toUnixTimestamp(toStartOfMinute(itime)) * 1000 AS t
, method
, SUM(count) AS c
FROM me.my_table
WHERE method like 'a%'
GROUP BY method, t
HAVING SUM(count) - SUM(CASE WHEN itime < current_date - INTERVAL 7 DAY THEN count END) >= 100
ORDER BY t
;
I understand if I want to filter a column between two numbers I can use BETWEEN:
SELECT a
FROM table
WHERE a BETWEEN 1 AND 5
Is there a way of mapping the filtering to an array of values, for instance, if the array was [1, 10, ... , N]:
SELECT a
FROM table
WHERE (a BETWEEN 1 AND 1+4) OR (a BETWEEN 10 AND 10+4) OR ... OR (a BETWEEN N AND N+4)
Try this query:
WITH
[1, 10, 75] AS starts_from,
4 AS step,
arrayMap(x -> (x, x + step), starts_from) AS intervals
SELECT number
FROM numbers(100)
WHERE arrayFirstIndex(x -> number >= x.1 AND number <= x.2, intervals) != 0
/*
┌─number─┐
│ 1 │
│ 2 │
│ 3 │
│ 4 │
│ 5 │
│ 10 │
│ 11 │
│ 12 │
│ 13 │
│ 14 │
│ 75 │
│ 76 │
│ 77 │
│ 78 │
│ 79 │
└────────┘
*/
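Applied to the table and column from the question (table and a are the names used there), the same idea can also be written with arrayExists; a sketch:
WITH
    [1, 10, 75] AS starts_from,
    4 AS step,
    arrayMap(x -> (x, x + step), starts_from) AS intervals
SELECT a
FROM table
WHERE arrayExists(x -> a BETWEEN x.1 AND x.2, intervals)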
I'm trying to figure out how to produce, in ClickHouse, the column named "What I want" in the table below:
Category   Row Number   What I have   What I want
A          1            0             0
A          2            1             1
B          3            0             1
B          4            0             1
A          5            3             3
B          6            0             3
B          7            0             3
A          8            2             2
B          9            0             2
There are two categories, A and B.
And I want the B category to 'remember' the latest value from the A category.
There's a column by which all records are ordered: Row Number.
I've found a function, arrayFill, which looks promising, but unfortunately it isn't supported by my server version (19.14.11.16) and there's no chance it will be updated soon.
I guess there should be some trick with ClickHouse arrays, but I didn't manage to find a way. Is there any ClickHouse ninja who could give me a hint how to deal with this?
P.S. In fact the B category isn't zero-filled; I just used zeros to simplify the problem a little.
create table z(c String, rn Int64, hv Int64) Engine=Memory;
insert into z values ('A',1,0)('A',2,1)('B',3,0)('B',4,0)('A',5,3)('B',6,0)('B',7,0)('A',8,2)('B',9,0);
-- Split the rows (ordered by rn) into consecutive same-category runs, then rewrite each 'B' row
-- so that it carries the last value of the preceding run.
select (arrayJoin(flatten(arrayMap(
        j -> arrayMap(m -> if(m.1 = 'B', (m.1, m.2, ga1[j - 1][-1].3), m), ga1[j]),
        arrayEnumerate(arraySplit((k, i) -> ga[i].1 <> ga[i - 1].1,
            (groupArray((c, rn, hv)) as ga), arrayEnumerate(ga)) as ga1)))) as r).1 _c,
    r.2 _rn, r.3 _n
from (select * from z order by rn)
┌─_c─┬─_rn─┬─_n─┐
│ A │ 1 │ 0 │
│ A │ 2 │ 1 │
│ B │ 3 │ 1 │
│ B │ 4 │ 1 │
│ A │ 5 │ 3 │
│ B │ 6 │ 3 │
│ B │ 7 │ 3 │
│ A │ 8 │ 2 │
│ B │ 9 │ 2 │
└────┴─────┴────┘
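For reference, on a server new enough to support arrayFill (which the asker's 19.14 is not), the carry-forward can be expressed more directly; an untested sketch on the same table z:
SELECT _c, _rn, _n
FROM
(
    SELECT
        groupArray(c) AS ga_c,
        groupArray(rn) AS ga_rn,
        -- keep a value when its row is 'A', otherwise copy the previous (already filled) value
        arrayFill((v, cat) -> cat = 'A', groupArray(hv), groupArray(c)) AS filled
    FROM (SELECT * FROM z ORDER BY rn)
)
ARRAY JOIN ga_c AS _c, ga_rn AS _rn, filled AS _n
ORDER BY _rn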
Let us say that I have a table with user_id of Int32 type and login_time as DateTime in UTC format. user_id is not unique, so SELECT user_id, login_time FROM some_table; gives following result:
┌─user_id─┬──login_time─┐
│ 1 │ 2021-03-01 │
│ 1 │ 2021-03-01 │
│ 1 │ 2021-03-02 │
│ 2 │ 2021-03-02 │
│ 2 │ 2021-03-03 │
└─────────┴─────────────┘
If I run SELECT COUNT(*) as count, toDate(login_time) as l FROM some_table GROUP BY l I get the following result:
┌─count───┬──login_time─┐
│ 2 │ 2021-03-01 │
│ 2 │ 2021-03-02 │
│ 1 │ 2021-03-03 │
└─────────┴─────────────┘
I would like to reformat the result to show COUNT on a weekly level, instead of every day, as I currently do.
My result for the above example could look something like this:
┌──count──┬──year─┬──month──┬─week ordinal┐
│ 5 │ 2021 │ 03 │ 1 │
│ 0 │ 2021 │ 03 │ 2 │
│ 0 │ 2021 │ 03 │ 3 │
│ 0 │ 2021 │ 03 │ 4 │
└─────────┴───────┴─────────┴─────────────┘
I have gone through the documentation, found some interesting functions, but did not manage to make them solve my problem.
I have never worked with clickhouse before and am not very experienced with SQL, which is why I ask here for help.
Try this query:
select count() count, toYear(start_of_month) year, toMonth(start_of_month) month,
toWeek(start_of_week) - toWeek(start_of_month) + 1 AS "week ordinal"
from (
select *, toStartOfMonth(login_time) start_of_month,
toStartOfWeek(login_time) start_of_week
from (
/* emulate test dataset */
select data.1 user_id, toDate(data.2) login_time
from (
select arrayJoin([
(1, '2021-02-27'),
(1, '2021-02-28'),
(1, '2021-03-01'),
(1, '2021-03-01'),
(1, '2021-03-02'),
(2, '2021-03-02'),
(2, '2021-03-03'),
(2, '2021-03-08'),
(2, '2021-03-16'),
(2, '2021-04-01')]) data)
)
)
group by start_of_month, start_of_week
order by start_of_month, start_of_week
/*
┌─count─┬─year─┬─month─┬─week ordinal─┐
│ 1 │ 2021 │ 2 │ 4 │
│ 1 │ 2021 │ 2 │ 5 │
│ 5 │ 2021 │ 3 │ 1 │
│ 1 │ 2021 │ 3 │ 2 │
│ 1 │ 2021 │ 3 │ 3 │
│ 1 │ 2021 │ 4 │ 1 │
└───────┴──────┴───────┴──────────────┘
*/
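If the week-within-month ordinal is not actually required, a plainer weekly rollup over the question's table could look like this (a sketch; some_table and login_time are the names from the question):
SELECT
    count() AS count,
    toStartOfWeek(login_time) AS week_start
FROM some_table
GROUP BY week_start
ORDER BY week_start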
I need a little help with an SQL query. I'm using ClickHouse, but maybe standard SQL syntax is enough for this task.
I've got the following table:
event_time; Text; ID
2021-03-16 09:00:48; Example_1; 1
2021-03-16 09:00:49; Example_2; 1
2021-03-16 09:00:50; Example_3; 1
2021-03-16 09:15:48; Example_1_1; 1
2021-03-16 09:15:49; Example_2_2; 1
2021-03-16 09:15:50; Example_3_3; 1
What I want to have at the end for this example is 2 rows:
Example_1Example_2Example_3
Example_1_1Example_2_2Example_3_3
That is, a concatenation of the Text field based on ID. The problem is that this ID is not unique over a longer time interval; it's only unique for about a minute, for example. So I want to concatenate only strings where the difference between the first and last row is less than a minute.
Right now I've got a query like:
SELECT arrayStringConcat(groupArray(Text))
FROM (SELECT event_time, Text, ID
FROM Test_Table
ORDER by event_time asc)
GROUP BY ID;
What kind of condition should I add here?
Here is an example
create table X(event_time DateTime, Text String, ID Int64) Engine=Memory;
insert into X values ('2021-03-16 09:00:48','Example_1', 1), ('2021-03-16 09:00:49','Example_2', 1), ('2021-03-16 09:00:50','Example_3', 1), ('2021-03-16 09:01:48','Example_4', 1), ('2021-03-16 09:01:49','Example_5', 1), ('2021-03-16 09:15:48','Example_1_1', 1), ('2021-03-16 09:15:49','Example_2_2', 1),('2021-03-16 09:15:50','Example_3_3', 1);
SELECT * FROM X
┌──────────event_time─┬─Text────────┬─ID─┐
│ 2021-03-16 09:00:48 │ Example_1 │ 1 │
│ 2021-03-16 09:00:49 │ Example_2 │ 1 │
│ 2021-03-16 09:00:50 │ Example_3 │ 1 │
│ 2021-03-16 09:01:48 │ Example_4 │ 1 │
│ 2021-03-16 09:01:49 │ Example_5 │ 1 │
│ 2021-03-16 09:15:48 │ Example_1_1 │ 1 │
│ 2021-03-16 09:15:49 │ Example_2_2 │ 1 │
│ 2021-03-16 09:15:50 │ Example_3_3 │ 1 │
└─────────────────────┴─────────────┴────┘
What result is expected in this case?
ClickHouse 21.3:
set allow_experimental_window_functions = 1;
SELECT
ID,
y,
groupArray(event_time),
groupArray(Text)
FROM
(
SELECT
ID,
event_time,
Text,
max(event_time) OVER (PARTITION BY ID ORDER BY event_time ASC RANGE BETWEEN CURRENT ROW AND 60 FOLLOWING) AS y
FROM X
)
GROUP BY
ID,
y
ORDER BY
ID ASC,
y ASC
┌─ID─┬───────────────────y─┬─groupArray(event_time)────────────────────────────────────────────────────────────────────┬─groupArray(Text)──────────────────────────────────┐
│ 1 │ 2021-03-16 09:01:48 │ ['2021-03-16 09:00:48'] │ ['Example_1'] │
│ 1 │ 2021-03-16 09:01:49 │ ['2021-03-16 09:00:49','2021-03-16 09:00:50','2021-03-16 09:01:48','2021-03-16 09:01:49'] │ ['Example_2','Example_3','Example_4','Example_5'] │
│ 1 │ 2021-03-16 09:15:50 │ ['2021-03-16 09:15:48','2021-03-16 09:15:49','2021-03-16 09:15:50'] │ ['Example_1_1','Example_2_2','Example_3_3'] │
└────┴─────────────────────┴───────────────────────────────────────────────────────────────────────────────────────────┴───────────────────────────────────────────────────┘
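To get the concatenated strings from the question, the same grouping can simply be wrapped with arrayStringConcat; a sketch reusing the 60-second window logic above:
SELECT
    ID,
    arrayStringConcat(groupArray(Text)) AS concatenated
FROM
(
    SELECT
        ID,
        event_time,
        Text,
        max(event_time) OVER (PARTITION BY ID ORDER BY event_time ASC RANGE BETWEEN CURRENT ROW AND 60 FOLLOWING) AS y
    FROM X
)
GROUP BY ID, y
ORDER BY ID, y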