Join timeseries with distinct values in ClickHouse - sql

I've got the following issue that I can't solve. The main purpose is to show graphs in Grafana. The first SQL request gives me:
SELECT toStartOfMinute(date_time) as t, COUNT(1) as count, service_name
FROM SB_STAT.SBCommonJournal
WHERE t BETWEEN toDateTime('2019-06-04 00:00:00') AND toDateTime('2019-06-05 00:00:00')
GROUP BY t, service_name
t;count;service_name
2019-06-04 15:43:00;1;test3
2019-06-04 15:35:00;1;test3
2019-06-04 15:12:00;1;test
2019-06-04 14:57:00;1;test
2019-06-04 15:32:00;1;test3
2019-06-04 16:36:00;1;test3
2019-06-04 15:21:00;1;test
And the second one:
SELECT arrayJoin(
arrayMap(
x -> toStartOfMinute(addMinutes(toDateTime('2019-06-04 00:00:00'), x)),
range(toUInt64(dateDiff('minute', toDateTime('2019-06-04 00:00:00'), toDateTime('2019-06-05 00:00:00')) + 1)))) AS t,
0 AS count;
t;count
2019-06-04 00:00:00;0
2019-06-04 00:01:00;0
2019-06-04 00:02:00;0
2019-06-04 00:03:00;0
2019-06-04 00:04:00;0
2019-06-04 00:05:00;0
2019-06-04 00:06:00;0
2019-06-04 00:07:00;0
2019-06-04 00:08:00;0
2019-06-04 00:09:00;0
2019-06-04 00:10:00;0
etc..
How can I join these two requests to get a counter for each service_name per minute? So I would have something like this:
t;count;service_name
2019-06-04 15:12:00;1;test
2019-06-04 15:12:00;0;test3
2019-06-04 15:13:00;0;test
2019-06-04 15:13:00;0;test3
etc...

Grafana actually has a zero-fill option. The only thing you should have to do on the ClickHouse side is perhaps use groupArray on a tuple of key/value pairs per timestamp. Grafana normally pulls the returned JSON data apart and will use the first element of the tuple as a series name.
SELECT
t,
groupArray((service_name, cnt)) AS series
FROM (
SELECT
service_name,
toStartOfMinute(date_time) AS t,
count() AS cnt
FROM SBCommonJournal
WHERE (date_time >= toDateTime('2019-06-04 00:00:00')) AND (date_time <= toDateTime('2019-06-05 00:00:00'))
GROUP BY
service_name,
t
)
GROUP BY t
ORDER BY t
Failing that, use WITH FILL:
SELECT
t,
groupArray((service_name, cnt)) AS series
FROM (
SELECT
service_name,
toStartOfMinute(date_time) AS t,
count() AS cnt
FROM SBCommonJournal
WHERE (date_time >= toDateTime('2019-06-04 00:00:00')) AND (date_time <= toDateTime('2019-06-05 00:00:00'))
GROUP BY
service_name,
t
)
GROUP BY t
ORDER BY t WITH FILL STEP 60
If that still doesn't work for you, the following should work (use the Grafana $to and $from macros).
Create some sample data with some generated service_names and metrics:
DROP TABLE IF EXISTS SBCommonJournal;
CREATE TEMPORARY TABLE SBCommonJournal AS
WITH
(
SELECT arrayMap(x -> arrayStringConcat(arrayMap(i -> char(65 + (rand((i + x) + 1000) % 26)), range(16))), range(10))
) AS service_names
SELECT
service_names[1 + (rand() % length(service_names))] AS service_name,
toDateTime('2019-06-04 00:00:00') + toIntervalSecond(rand() % 86400) AS date_time
FROM numbers_mt(1000000)
Query:
SELECT
service_name,
t,
sum(cnt) AS cnt
FROM
(
SELECT
arrayJoin(groupUniqArray(service_name)) AS service_name,
arrayJoin(
(
SELECT groupArray(d)
FROM
(
SELECT arrayJoin([toDateTime('2019-06-04 00:00:00'), toDateTime('2019-06-05 00:00:00')]) AS d
GROUP BY d
ORDER BY d ASC WITH FILL STEP 60
)
)) AS t,
0 AS cnt
FROM SBCommonJournal
WHERE (date_time >= toDateTime('2019-06-04 00:00:00')) AND (date_time <= toDateTime('2019-06-05 00:00:00'))
UNION ALL
SELECT
service_name,
toStartOfMinute(date_time) AS t,
count() AS cnt
FROM SBCommonJournal
WHERE (date_time >= toDateTime('2019-06-04 00:00:00')) AND (date_time <= toDateTime('2019-06-05 00:00:00'))
GROUP BY
service_name,
t
)
GROUP BY
service_name,
t
ORDER BY
t ASC,
service_name ASC

Try this query:
SELECT stub_data.time_tick tick, stub_data.service_name service_name, source_data.count > stub_data.count ? source_data.count : stub_data.count AS count
FROM (
SELECT toStartOfMinute(date_time) as time_tick, COUNT() as count, service_name
FROM (
/* test data */
SELECT test_data.1 date_time, test_data.3 service_name, test_data.2 count
FROM (
SELECT arrayJoin([
(toDateTime('2019-06-04 15:43:01'), 1, 'test3'),
(toDateTime('2019-06-04 15:43:51'), 1, 'test4'),
(toDateTime('2019-06-04 15:43:52'), 1, 'test4'),
(toDateTime('2019-06-04 15:43:53'), 1, 'test4'),
(toDateTime('2019-06-04 15:35:02'), 1, 'test3'),
(toDateTime('2019-06-04 15:30:03'), 1, 'test'),
(toDateTime('2019-06-04 15:31:04'), 1, 'test'),
(toDateTime('2019-06-04 15:32:05'), 1, 'test3'),
(toDateTime('2019-06-04 15:36:06'), 1, 'test3'),
(toDateTime('2019-06-04 15:36:07'), 1, 'test3'),
(toDateTime('2019-06-04 15:36:46'), 1, 'test4'),
(toDateTime('2019-06-04 15:38:07'), 1, 'test')
]) test_data)
)
WHERE time_tick BETWEEN toDateTime('2019-06-04 00:00:00') AND toDateTime('2019-06-05 00:00:00')
GROUP BY time_tick, service_name) source_data
RIGHT JOIN (
/* Cartesian product: [ticks * service_names] */
SELECT time_tick, service_name, 0 as count
FROM (
SELECT arrayJoin(
arrayMap(
x -> addMinutes(toDateTime('2019-06-04 15:30:00'), x),
range(toUInt64(dateDiff('minute', toDateTime('2019-06-04 15:30:00'), toDateTime('2019-06-04 15:43:00')) + 1)))) AS time_tick)
CROSS JOIN (
SELECT arrayJoin(groupUniqArray(test_data.3)) service_name
FROM (
/* test data */
SELECT arrayJoin([
(toDateTime('2019-06-04 15:43:01'), 1, 'test3'),
(toDateTime('2019-06-04 15:43:51'), 1, 'test4'),
(toDateTime('2019-06-04 15:43:52'), 1, 'test4'),
(toDateTime('2019-06-04 15:43:53'), 1, 'test4'),
(toDateTime('2019-06-04 15:35:02'), 1, 'test3'),
(toDateTime('2019-06-04 15:30:03'), 1, 'test'),
(toDateTime('2019-06-04 15:31:04'), 1, 'test'),
(toDateTime('2019-06-04 15:32:05'), 1, 'test3'),
(toDateTime('2019-06-04 15:36:06'), 1, 'test3'),
(toDateTime('2019-06-04 15:36:07'), 1, 'test3'),
(toDateTime('2019-06-04 15:36:46'), 1, 'test4'),
(toDateTime('2019-06-04 15:38:07'), 1, 'test')
]) test_data))) stub_data
ON source_data.time_tick = stub_data.time_tick AND source_data.service_name = stub_data.service_name
ORDER BY tick, service_name;
/* Result:
┌────────────────tick─┬─service_name─┬─count─┐
│ 2019-06-04 15:30:00 │ test │ 1 │
│ 2019-06-04 15:30:00 │ test3 │ 0 │
│ 2019-06-04 15:30:00 │ test4 │ 0 │
│ 2019-06-04 15:31:00 │ test │ 1 │
│ 2019-06-04 15:31:00 │ test3 │ 0 │
│ 2019-06-04 15:31:00 │ test4 │ 0 │
│ 2019-06-04 15:32:00 │ test │ 0 │
│ 2019-06-04 15:32:00 │ test3 │ 1 │
│ 2019-06-04 15:32:00 │ test4 │ 0 │
│ 2019-06-04 15:33:00 │ test │ 0 │
│ 2019-06-04 15:33:00 │ test3 │ 0 │
│ 2019-06-04 15:33:00 │ test4 │ 0 │
│ 2019-06-04 15:34:00 │ test │ 0 │
│ 2019-06-04 15:34:00 │ test3 │ 0 │
│ 2019-06-04 15:34:00 │ test4 │ 0 │
│ 2019-06-04 15:35:00 │ test │ 0 │
│ 2019-06-04 15:35:00 │ test3 │ 1 │
│ 2019-06-04 15:35:00 │ test4 │ 0 │
│ 2019-06-04 15:36:00 │ test │ 0 │
│ 2019-06-04 15:36:00 │ test3 │ 2 │
│ 2019-06-04 15:36:00 │ test4 │ 1 │
│ 2019-06-04 15:37:00 │ test │ 0 │
│ 2019-06-04 15:37:00 │ test3 │ 0 │
│ 2019-06-04 15:37:00 │ test4 │ 0 │
│ 2019-06-04 15:38:00 │ test │ 1 │
│ 2019-06-04 15:38:00 │ test3 │ 0 │
│ 2019-06-04 15:38:00 │ test4 │ 0 │
│ 2019-06-04 15:39:00 │ test │ 0 │
│ 2019-06-04 15:39:00 │ test3 │ 0 │
│ 2019-06-04 15:39:00 │ test4 │ 0 │
│ 2019-06-04 15:40:00 │ test │ 0 │
│ 2019-06-04 15:40:00 │ test3 │ 0 │
│ 2019-06-04 15:40:00 │ test4 │ 0 │
│ 2019-06-04 15:41:00 │ test │ 0 │
│ 2019-06-04 15:41:00 │ test3 │ 0 │
│ 2019-06-04 15:41:00 │ test4 │ 0 │
│ 2019-06-04 15:42:00 │ test │ 0 │
│ 2019-06-04 15:42:00 │ test3 │ 0 │
│ 2019-06-04 15:42:00 │ test4 │ 0 │
│ 2019-06-04 15:43:00 │ test │ 0 │
│ 2019-06-04 15:43:00 │ test3 │ 1 │
│ 2019-06-04 15:43:00 │ test4 │ 3 │
└─────────────────────┴──────────────┴───────┘
*/

Related

How can I get the max and the min value for a specific timeframe grouped by day from a timeseries data

I am using DuckDB and want to process some time-series data that has the following format:
┌─────────────────────┬─────────┬─────────┬─────────┬─────────┬────────┬────────────┐
│ Timestamps │ Open │ High │ Low │ Close │ Volume │ CustomDate │
│ timestamp │ double │ double │ double │ double │ int32 │ varchar │
├─────────────────────┼─────────┼─────────┼─────────┼─────────┼────────┼────────────┤
│ 2006-04-11 12:00:00 │ 1.21245 │ 1.21275 │ 1.21235 │ 1.21275 │ 0 │ 2006-04-11 │
│ 2006-04-11 12:05:00 │ 1.21275 │ 1.21275 │ 1.21225 │ 1.21235 │ 0 │ 2006-04-11 │
│ 2006-04-11 12:10:00 │ 1.21235 │ 1.21235 │ 1.21205 │ 1.21225 │ 0 │ 2006-04-11 │
│ · │ · │ · │ · │ · │ · │ · │
│ · │ · │ · │ · │ · │ · │ · │
│ · │ · │ · │ · │ · │ · │ · │
│ 2023-01-31 22:55:00 │ 1.08705 │ 1.0873 │ 1.08705 │ 1.08725 │ 0 │ 2023-01-31 │
│ 2023-01-31 23:00:00 │ 1.08725 │ 1.08735 │ 1.087 │ 1.08705 │ 0 │ 2023-01-31 │
│ 2023-01-31 23:05:00 │ 1.08705 │ 1.0871 │ 1.08695 │ 1.0871 │ 0 │ 2023-01-31 │
└─────────────────────┴─────────┴─────────┴─────────┴─────────┴────────┴────────────┘
I am looking for a "complex" SQL query that can accomplish the following:
Select a specific time frame in a day (e.g. 10:25:00 - 13:40:00)
In this timeframe I want to get the MAX value from e.g. High and the MIN value from Low
I also need the corresponding timestamps so that I know when the MAX and MIN values occurred
I want the result grouped by day
I want to further analyze and query the result
This is how the result should ideally look like:
Day | HighMAX | HighMAXTime | LowMIN | LowMINTime
--------------------------------------------------------------------------
2023-01-29 | 1.07545 | 2023-01-29 04:10:00 | 1.0726 | 2023-01-29 18:05:00
2023-01-30 | 1.08465 | 2023-01-30 23:55:00 | 1.08015 | 2023-01-30 15:35:00
2023-01-31 ...
...
This is the SQL query I currently have:
WITH mySession AS (
SELECT *, strftime(Timestamps, '%Y-%m-%d') AS CustomDate,
FROM EURUSD,
WHERE (Timestamps BETWEEN CONCAT(CustomDate, ' 12:00:00')::timestamp AND CONCAT(CustomDate, ' 15:30:00')::timestamp)
),
getSpecificData AS (
SELECT
CustomDate,
MIN(Low) AS LowOfSession,
MAX(High) AS HighOfSession
FROM mySession
GROUP BY CustomDate
ORDER BY CustomDate DESC
)
SELECT * FROM getSpecificData;
Current result:
┌────────────┬──────────────┬───────────────┐
│ CustomDate │ LowOfSession │ HighOfSession │
│ varchar │ double │ double │
├────────────┼──────────────┼───────────────┤
│ 2023-01-26 │ 1.08505 │ 1.0906 │
│ 2023-01-25 │ 1.0874 │ 1.0925 │
│ 2023-01-24 │ 1.0835 │ 1.08905 │
│ · │ · │ · │
│ · │ · │ · │
│ · │ · │ · │
│ 2006-04-13 │ 1.20945 │ 1.21175 │
│ 2006-04-12 │ 1.2094 │ 1.21145 │
│ 2006-04-11 │ 1.21205 │ 1.21415 │
└────────────┴──────────────┴───────────────┘
Currently I get the MIN Lows and MAX Highs but I don't know how to also retrieve the corresponding timestamps of these values.
You can do it by inner joining your aggregated select back to the session rows from EURUSD to get the timestamps you need:
WITH mySession AS (
SELECT *
FROM EURUSD
WHERE (Timestamps BETWEEN CONCAT(CustomDate, ' 12:00:00')::timestamp AND CONCAT(CustomDate, ' 15:30:00')::timestamp)
),
getSpecificData AS (
SELECT
CustomDate,
MIN(Low) AS LowOfSession,
MAX(High) AS HighOfSession
FROM mySession
GROUP BY CustomDate
ORDER BY CustomDate DESC
),
getDetails As (
select s.*, l.Timestamps as TimestampsOfLow, h.Timestamps as TimestampsOfHigh
from getSpecificData as s
inner join mySession as l on s.LowOfSession = l.low and s.CustomDate = l.CustomDate
inner join mySession as h on s.HighOfSession = h.High and s.CustomDate = h.CustomDate
)
SELECT customdate, lowofsession, highofsession, max(timestampsoflow), max(TimestampsOfHigh) FROM getDetails
group by customdate, lowofsession, highofsession;
You want the window functions first_value and last_value. (DuckDB doc)
WITH mySession AS (
SELECT *, strftime(Timestamps, '%Y-%m-%d') AS CustomDate,
FROM EURUSD,
WHERE (Timestamps BETWEEN CONCAT(CustomDate, ' 12:00:00')::timestamp AND CONCAT(CustomDate, ' 15:30:00')::timestamp)
),
getSpecificData AS (
SELECT DISTINCT
CustomDate,
FIRST_VALUE(timestamps) over (PARTITION BY CustomDate order by Low) as LowOfSession_timestamp,
LAST_VALUE(timestamps) over (PARTITION BY CustomDate order by High ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as HighOfSession_timestamp,
FIRST_VALUE(Low) over (PARTITION BY CustomDate order by Low) as LowOfSession_value,
LAST_VALUE(High) over (PARTITION BY CustomDate order by High ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as HighOfSession_value,
FROM mySession
ORDER BY CustomDate DESC
)
SELECT * FROM getSpecificData;
You could try this query
WITH mySession AS (
SELECT *, strftime(Timestamps, '%Y-%m-%d') AS CustomDate,
FROM EURUSD,
WHERE (Timestamps BETWEEN CONCAT(CustomDate, ' 12:00:00')::timestamp AND CONCAT(CustomDate, ' 15:30:00')::timestamp)
),
minMaxData AS (
SELECT
CustomDate,
MIN(Low) AS LowOfSession,
MAX(High) AS HighOfSession
FROM mySession
GROUP BY CustomDate
),
getSpecificData AS (
SELECT
m.CustomDate,
m.HighOfSession AS HighMAX,
s1.Timestamps AS HighMAXTime,
m.LowOfSession AS LowMIN,
s2.Timestamps AS LowMINTime
FROM minMaxData m
INNER JOIN mySession s1 ON m.CustomDate = s1.CustomDate AND m.HighOfSession = s1.High
INNER JOIN mySession s2 ON m.CustomDate = s2.CustomDate AND m.LowOfSession = s2.Low
ORDER BY m.CustomDate DESC
)
SELECT * FROM getSpecificData;

Calculate percentage group by outputs incorrect results

I'm trying to get percentages from a table in a ClickHouse DB. I'm having difficulty writing a query that calculates the percentage of each type within each timestamp group.
SELECT
(intDiv(toUInt32(toDateTime(atime)), 120) * 120) * 1000 AS timestamp,
if(dateDiff('second', toDateTime(t1.atime), toDateTime(t2.unixdsn)) <= 5, 'sec5', if((dateDiff('second', toDateTime(t1.atime), toDateTime(t2.unixdsn)) > 5) AND (dateDiff('second', toDateTime(t1.atime), toDateTime(t2.unixdsn)) <= 30), 'sec30', if((dateDiff('second', toDateTime(t1.atime), toDateTime(t2.unixdsn)) > 30) AND (dateDiff('second', toDateTime(t1.atime), toDateTime(t2.unixdsn)) <= 60), 'sec60', 'secgt60'))) AS type,
count() AS total_count,
(total_count * 100) /
(
SELECT count()
FROM sess_logs.logs_view
WHERE (status IN (0, 1)) AND (toDateTime(atime) >= toDateTime(1621410625)) AND (toDateTime(atime) <= toDateTime(1621421425))
) AS percentage_cnt
FROM sess_logs.logs_view AS t1
INNER JOIN
(
SELECT
trid,
atime,
unixdsn,
status
FROM sess_logs.logs_view
WHERE (status = 1) AND (toDate(date) >= toDate(1621410625)) AND if('all' = 'all', 1, userid =
(
SELECT userid
FROM sess_logs.user_details
WHERE (username != 'all') AND (username = 'all')
))
) AS t2 ON t1.trid = t2.trid
WHERE (t1.status = 0) AND (t2.status = 1) AND ((toDate(atime) >= toDate(1621410625)) AND (toDate(atime) <= toDate(1621421425))) AND (toDateTime(atime) >= toDateTime(1621410625)) AND (toDateTime(atime) <= toDateTime(1621421425)) AND if('all' = 'all', 1, userid =
(
SELECT userid
FROM sess_logs.user_details
WHERE (username != 'all') AND (username = 'all')
))
GROUP BY
timestamp,
type
ORDER BY timestamp ASC
Output
┌─────timestamp─┬─type────┬─total_count─┬─────────percentage_cnt─┐
│ 1621410600000 │ sec5 │ 15190 │ 0.9650982602181922 │
│ 1621410600000 │ sec30 │ 1525 │ 0.09689103665785011 │
│ 1621410600000 │ sec60 │ 33 │ 0.002096658498169871 │
│ 1621410600000 │ secgt60 │ 61 │ 0.0038756414663140043 │
│ 1621410720000 │ secgt60 │ 67 │ 0.004256852102344891 │
│ 1621410720000 │ sec30 │ 2082 │ 0.13228009070271735 │
│ 1621410720000 │ sec60 │ 65 │ 0.004129781890334595 │
│ 1621410720000 │ sec5 │ 20101 │ 1.2771191658094723 │
│ 1621410840000 │ sec30 │ 4598 │ 0.29213441741166873 │
│ 1621410840000 │ sec60 │ 36 │ 0.002287263816185314 │
│ 1621410840000 │ secgt60 │ 61 │ 0.0038756414663140043 │
│ 1621410840000 │ sec5 │ 17709 │ 1.1251431922451591 │
│ 1621410960000 │ sec60 │ 17 │ 0.0010800968020875095 │
│ 1621410960000 │ secgt60 │ 81 │ 0.005146343586416957 │
│ 1621410960000 │ sec30 │ 2057 │ 0.13069171305258864 │
│ 1621410960000 │ sec5 │ 18989 │ 1.206468127931748 │
│ 1621411080000 │ sec60 │ 9 │ 0.0005718159540463285 │
│ 1621411080000 │ sec30 │ 3292 │ 0.20915756896894594 │
│ 1621411080000 │ sec5 │ 15276 │ 0.9705622793346349 │
│ 1621411080000 │ secgt60 │ 78 │ 0.004955738268401514 │
└───────────────┴─────────┴─────────────┴────────────────────────┘
It returns a percentage for each row, but when I sum the percentage_cnt column the total does not come to 100%; instead it comes to about 80%.
Please help me correct my query. I know the query is huge; a simpler example for my use case would also be fine. Thanks.
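A likely cause, worth checking: the denominator subquery counts every logs_view row with status IN (0, 1) in the time range, while the grouped numerator only counts status = 0 rows that join to a status = 1 row, so the numerators sum to less than the denominator; the percentages are also computed against the global total rather than per timestamp group. Below is a hedged sketch of the "percentage within each timestamp group" idea, using a window function over the grouped result (requires a ClickHouse version with window-function support; the inline test rows are purely illustrative stand-ins, not the original tables):
SELECT
    (intDiv(toUInt32(atime), 120) * 120) * 1000 AS timestamp,
    type,
    count() AS total_count,
    total_count * 100 / sum(total_count) OVER (PARTITION BY timestamp) AS percentage_cnt
FROM
(
    /* stand-in rows for the joined logs_view data */
    SELECT rec.1 AS atime, rec.2 AS type
    FROM
    (
        SELECT arrayJoin([
            (toDateTime('2021-05-19 08:30:10'), 'sec5'),
            (toDateTime('2021-05-19 08:30:20'), 'sec5'),
            (toDateTime('2021-05-19 08:30:40'), 'sec30'),
            (toDateTime('2021-05-19 08:32:15'), 'sec5'),
            (toDateTime('2021-05-19 08:32:50'), 'secgt60')
        ]) AS rec
    )
)
GROUP BY timestamp, type
ORDER BY timestamp, type
With PARTITION BY timestamp the percentages add up to 100% within each two-minute bucket; drop the PARTITION BY to make them add up to 100% across the whole selected range.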

Display COUNT(*) for every week instead of every day

Let us say that I have a table with a user_id column of type Int32 and a login_time column of type DateTime in UTC. user_id is not unique, so SELECT user_id, login_time FROM some_table; gives the following result:
┌─user_id─┬──login_time─┐
│ 1 │ 2021-03-01 │
│ 1 │ 2021-03-01 │
│ 1 │ 2021-03-02 │
│ 2 │ 2021-03-02 │
│ 2 │ 2021-03-03 │
└─────────┴─────────────┘
If I run SELECT COUNT(*) as count, toDate(login_time) as l FROM some_table GROUP BY l I get the following result:
┌─count───┬──login_time─┐
│ 2 │ 2021-03-01 │
│ 2 │ 2021-03-02 │
│ 1 │ 2021-03-03 │
└─────────┴─────────────┘
I would like to reformat the result to show the COUNT at a weekly level instead of daily, as I currently do.
My result for the above example could look something like this:
┌──count──┬──year─┬──month──┬─week ordinal┐
│ 5 │ 2021 │ 03 │ 1 │
│ 0 │ 2021 │ 03 │ 2 │
│ 0 │ 2021 │ 03 │ 3 │
│ 0 │ 2021 │ 03 │ 4 │
└─────────┴───────┴─────────┴─────────────┘
I have gone through the documentation and found some interesting functions, but did not manage to make them solve my problem.
I have never worked with ClickHouse before and am not very experienced with SQL, which is why I am asking here for help.
Try this query:
select count() count, toYear(start_of_month) year, toMonth(start_of_month) month,
toWeek(start_of_week) - toWeek(start_of_month) + 1 AS "week ordinal"
from (
select *, toStartOfMonth(login_time) start_of_month,
toStartOfWeek(login_time) start_of_week
from (
/* emulate test dataset */
select data.1 user_id, toDate(data.2) login_time
from (
select arrayJoin([
(1, '2021-02-27'),
(1, '2021-02-28'),
(1, '2021-03-01'),
(1, '2021-03-01'),
(1, '2021-03-02'),
(2, '2021-03-02'),
(2, '2021-03-03'),
(2, '2021-03-08'),
(2, '2021-03-16'),
(2, '2021-04-01')]) data)
)
)
group by start_of_month, start_of_week
order by start_of_month, start_of_week
/*
┌─count─┬─year─┬─month─┬─week ordinal─┐
│ 1 │ 2021 │ 2 │ 4 │
│ 1 │ 2021 │ 2 │ 5 │
│ 5 │ 2021 │ 3 │ 1 │
│ 1 │ 2021 │ 3 │ 2 │
│ 1 │ 2021 │ 3 │ 3 │
│ 1 │ 2021 │ 4 │ 1 │
└───────┴──────┴───────┴──────────────┘
*/

SQL INNER JOIN duplicates to be remove based on criteria

I have the below two tables:
Table 1
┌──────────┬────────────┬───────────────┐
│ account1 │ Fruit_name │ First_harvest │
├──────────┼────────────┼───────────────┤
│ 567 │ Apple │ 201805 │
│ 432 │ Mango │ 201809 │
│ 567 │ Apple │ 201836 │
└──────────┴────────────┴───────────────┘
Table 2
┌──────────┬─────────────┬──────────────┬───────────────┬──────────────┬─────────────┐
│ account1 │ Fruit_name │ Current_Farm │ Previous_Farm │ FirstHarvest │ LastHarvest │
├──────────┼─────────────┼──────────────┼───────────────┼──────────────┼─────────────┤
│ 567 │ Apple │ APFarm │ AppleYard │ 201801 │ 201810 │
│ 567 │ Apple │ APFarm │ FruitFarm │ 201805 │ 201830 │
│ 567 │ Apple │ APFarm │ FruitMarket │ 201831 │ 999999 │
│ 567 │ Royal Gala │ APFarm │ GrocerWorld │ 201815 │ 999999 │
└──────────┴─────────────┴──────────────┴───────────────┴──────────────┴─────────────┘
My code:
SELECT DISTINCT a.account1,a.fruit_name,Max(a.first_harvest) first_harvest,b.current_farm,b.previous_farm,b.firstharvest,b.lastharvest
FROM fruit_harvest_data a
INNER JOIN fruit_farm_data b
ON a.account1 = b.account1
AND CASE WHEN b.fruit_name = 'Apple' THEN 'Royal Gala'
ELSE b.fruit_name END =
CASE WHEN a.fruit_name = 'Apple' THEN 'Royal gala'
ELSE a.fruit_name END
WHERE a.first_harvest BETWEEN b.firstharvest AND b.lastharvest
GROUP BY a.account1,a.fruit_name,b.current_farm,b.previous_farm,b.firstharvest,b.lastharvest
HAVING Max(a.first_harvest) >= 201801
Result:
┌──────────┬────────────┬───────────────┬──────────────┬───────────────┬──────────────┬─────────────┐
│ account1 │ Fruit_name │ First_harvest │ Current_Farm │ Previous_Farm │ FirstHarvest │ LastHarvest │
├──────────┼────────────┼───────────────┼──────────────┼───────────────┼──────────────┼─────────────┤
│ 567 │ Apple │ 201836 │ APFarm │ FruitMarket │ 201831 │ 999999 │
│ 567 │ Royal Gala │ 201836 │ APFarm │ GrocerWorld │ 201815 │ 999999 │
└──────────┴────────────┴───────────────┴──────────────┴───────────────┴──────────────┴─────────────┘
Request:
I get duplicate data due to the way we have this stored. Is there a
way to show only one row when account1 has both Apple and Royal Gala, i.e. select only the Royal Gala row?
Please note: an account1 (e.g. 567) can have multiple fruits such as Apple, Royal Gala, Mango and Orange, but it should only select Royal Gala when the account exists with both Apple and Royal Gala.
I think the query below should work:
select distinct T.* from
(SELECT DISTINCT a.account1,
case when a.fruit_name='Apple' or a.fruit_name='Royal Gala' then
'Apple' else a.fruit_name end as fruit_name ,Max(a.first_harvest) first_harvest,b.current_farm,b.previous_farm,b.firstharvest,b.lastharvest
FROM fruit_harvest_data a
INNER JOIN fruit_farm_data b
ON a.account1 = b.account1
AND CASE WHEN b.fruit_name = 'Apple' THEN 'Royal Gala'
ELSE b.fruit_name END =
CASE WHEN a.fruit_name = 'Apple' THEN 'Royal gala'
ELSE a.fruit_name END
WHERE a.first_harvest BETWEEN b.firstharvest AND b.lastharvest
GROUP BY a.account1,a.fruit_name,b.current_farm,b.previous_farm,b.firstharvest,b.lastharvest
HAVING Max(a.first_harvest) >= 201801
) as T
It's still unclear what you want in your result set - a more complete desired result would help - but to answer the question of how to do it:
Since you have mentioned that Apple/Gala is an example, I would create a new table to contain these pairs:
create table replace_list(oldfruit varchar(20), newfruit varchar(20))
insert replace_list values ('Apple','Royal Gala')
Then in your query add this:
left join replace_list r on r.oldfruit=b.fruit_name
left join fruit_farm_data n on n.account1=a.account1 and n.fruit_name=r.newfruit
and in your WHERE clause, check that either the fruit name does not have a replacement (r.oldfruit is null), or it does have a replacement but the farm doesn't have that fruit (n.fruit_name is null):
where r.oldfruit is null or n.fruit_name is null
The rest of the query you can work out for yourself.
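For illustration only, here is a hedged sketch of how those pieces might slot into the original query, keeping the CASE-based join from the question (this is one possible assembly, not necessarily the exact query the answer has in mind):
SELECT a.account1, a.fruit_name, MAX(a.first_harvest) first_harvest,
       b.current_farm, b.previous_farm, b.firstharvest, b.lastharvest
FROM fruit_harvest_data a
INNER JOIN fruit_farm_data b
    ON a.account1 = b.account1
   AND CASE WHEN b.fruit_name = 'Apple' THEN 'Royal Gala' ELSE b.fruit_name END =
       CASE WHEN a.fruit_name = 'Apple' THEN 'Royal Gala' ELSE a.fruit_name END
LEFT JOIN replace_list r
    ON r.oldfruit = b.fruit_name
LEFT JOIN fruit_farm_data n
    ON n.account1 = a.account1 AND n.fruit_name = r.newfruit
WHERE (r.oldfruit IS NULL OR n.fruit_name IS NULL)
  AND a.first_harvest BETWEEN b.firstharvest AND b.lastharvest
GROUP BY a.account1, a.fruit_name, b.current_farm, b.previous_farm, b.firstharvest, b.lastharvest
HAVING MAX(a.first_harvest) >= 201801
With the pairs table in place, the Apple farm rows for account 567 are filtered out (because a Royal Gala row exists for that account) and only the GrocerWorld row remains.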

Oracle 11g: Unpivot multiple columns and include column name

I'm trying to unpivot multiple columns in my dataset. Here's what my data looks like.
CREATE TABLE T5 (idnum NUMBER,f1 NUMBER(10,5),f2 NUMBER(10,5),f3 NUMBER(10,5)
,e1 NUMBER(10,5),e2 NUMBER(10,5)
,h1 NUMBER(10,5),h2 NUMBER(10,5));
INSERT INTO T5 (IDNUM,F1,F2,F3,E1,E2,H1,H2)
VALUES (1,'10.2004','5.009','7.330','9.008','8.003','.99383','1.43243');
INSERT INTO T5 (IDNUM,F1,F2,F3,E1,E2,H1,H2)
VALUES (2,'4.2004','6.009','9.330','4.7008','4.60333','1.993','3.3243');
INSERT INTO T5 (IDNUM,F1,F2,F3,E1,E2,H1,H2)
VALUES (3,'10.2040','52.6009','67.330','9.5008','8.003','.99383','1.43243');
INSERT INTO T5 (IDNUM,F1,F2,F3,E1,E2,H1,H2)
VALUES (4,'9.20704','45.009','17.330','29.008','5.003','3.9583','1.243');
COMMIT;
select * from t5;
IDNUM F1 F2 F3 E1 E2 H1 H2
1 10.2004 5.009 7.33 9.008 8.003 0.99383 1.43243
2 4.2004 6.009 9.33 4.7008 4.60333 1.993 3.3243
3 10.204 52.6009 67.33 9.5008 8.003 0.99383 1.43243
4 9.20704 45.009 17.33 29.008 5.003 3.9583 1.243
I'm unpivoting like so...
select *
from (select IDNUM,F1,F2,F3,E1,E2,H1,H2,
null as E3,null as H3
from T5)
UnPivot((F,E,H) for sk in ((F1,E1,H1) as 1,
(F2,E2,H2) as 2,
(F3,E3,H3) as 3))
order by IDNUM,SK;
IDNUM SK F E H
----- -- ------- ------- -------
1 1 10.2004 9.008 .99383
1 2 5.009 8.003 1.43243
1 3 7.33 null null
2 1 4.2004 4.7008 1.993
2 2 6.009 4.60333 3.3243
2 3 9.33 null null
3 1 10.204 9.5008 .99383
3 2 52.6009 8.003 1.43243
3 3 67.33 null null
4 1 9.20704 29.008 3.9583
4 2 45.009 5.003 1.243
4 3 17.33 null null
But what I really need is as follows...
IDNUM SK F E H F_COL_NAME
----- -- ------- ------- ------- ----------
1 1 10.2004 9.008 .99383 F1
1 2 5.009 8.003 1.43243 F2
1 3 7.33 null null F3
2 1 4.2004 4.7008 1.993 F1
2 2 6.009 4.60333 3.3243 F2
2 3 9.33 null null F3
3 1 10.204 9.5008 .99383 F1
3 2 52.6009 8.003 1.43243 F2
3 3 67.33 null null F3
4 1 9.20704 29.008 3.9583 F1
4 2 45.009 5.003 1.243 F2
4 3 17.33 null null F3
How can I do this?
Change your UNPIVOT to be like this
select *
from (
select IDNUM,F1,F2,F3,E1,E2,H1,H2,
null as E3,null as H3
from T5
) A
UnPivot(
(F,E,H) for sk in (
(F1,E1,H1) as 'F1',
(F2,E2,H2) as 'F2',
(F3,E3,H3) as 'F3')
)
order by IDNUM,SK
This should do the trick
Just select idnum, sk, f, e, h, 'F'||SK as col_name ... You need to specify all the columns instead of an asterisk.
Like this http://sqlfiddle.com/#!4/12446/21
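A minimal sketch of that suggestion applied to the query from the question (keeping its numeric sk aliases, and naming the derived column F_COL_NAME to match the desired output):
select idnum, sk, f, e, h, 'F' || sk as f_col_name
from (select IDNUM,F1,F2,F3,E1,E2,H1,H2,
             null as E3, null as H3
      from T5)
unpivot((F,E,H) for sk in ((F1,E1,H1) as 1,
                           (F2,E2,H2) as 2,
                           (F3,E3,H3) as 3))
order by IDNUM, SK;
Oracle implicitly converts the numeric sk to a string in the concatenation, so the column comes out as F1, F2, F3.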
If you need to store result of UNPIVOT you could use INSERT ALL:
CREATE TABLE T5_unpiv(IDNUM NUMBER,SK NUMBER,F NUMBER,E NUMBER,H NUMBER
,F_COL_NAME VARCHAR2(100));
INSERT ALL
INTO T5_unpiv(IDNUM,SK,F,E,H,F_COL_NAME) VALUES(idnum,1,f1,e1,h1,'F1')
INTO T5_unpiv(IDNUM,SK,F,E,H,F_COL_NAME) VALUES(idnum,2,f2,e2,h2,'F2')
INTO T5_unpiv(IDNUM,SK,F,E,H,F_COL_NAME) VALUES(idnum,3,f3,NULL,NULL,'F3')
SELECT * FROM T5;
SELECT * FROM T5_unpiv;
DBFiddle Demo
Output:
┌───────┬────┬─────────┬─────────┬─────────┬────────────┐
│ IDNUM │ SK │ F │ E │ H │ F_COL_NAME │
├───────┼────┼─────────┼─────────┼─────────┼────────────┤
│ 1 │ 1 │ 10.2004 │ 9.008 │ .99383 │ F1 │
│ 1 │ 2 │ 5.009 │ 8.003 │ 1.43243 │ F2 │
│ 1 │ 3 │ 7.33 │ null │ null │ F3 │
│ 2 │ 1 │ 4.2004 │ 4.7008 │ 1.993 │ F1 │
│ 2 │ 2 │ 6.009 │ 4.60333 │ 3.3243 │ F2 │
│ 2 │ 3 │ 9.33 │ null │ null │ F3 │
│ 3 │ 1 │ 10.204 │ 9.5008 │ .99383 │ F1 │
│ 3 │ 2 │ 52.6009 │ 8.003 │ 1.43243 │ F2 │
│ 3 │ 3 │ 67.33 │ null │ null │ F3 │
│ 4 │ 1 │ 9.20704 │ 29.008 │ 3.9583 │ F1 │
│ 4 │ 2 │ 45.009 │ 5.003 │ 1.243 │ F2 │
│ 4 │ 3 │ 17.33 │ null │ null │ F3 │
└───────┴────┴─────────┴─────────┴─────────┴────────────┘
Try this:
select *
from (select IDNUM,F1,F2,F3,E1,E2,H1,H2, null as E3, null as H3 from T5)
UnPivot((F,E,H) for sk in ((F1,E1,H1) as 'F1',
(F2,E2,H2) as 'F2',
(F3,E3,H3) as 'F3'))
order by IDNUM,SK;