How to convert the following dictionary format column into different format in Hive or Presto? - sql

I have a Hive table as below:
event_name
attendees_per_countries
a
{'US':5}
b
{'US':4, 'UK': 3, 'CA': 2}
c
{'UK':2, 'CA': 1}
And I want to get a new table like below:
country
number_of_people
US
9
UK
5
CA
4
How to write a query in Hive or Presto?

You may use the following:
If the column type for attendees_per_countries is a string, you may use the following:
-- Hive: turn the dict-like STRING column into a MAP<STRING,STRING>, then total attendees per country.
WITH sample_data AS (
    SELECT
        event_name,
        -- Strip braces, single quotes and spaces in one pass so that both the
        -- keys ('US') and the values (' 3') come out of str_to_map already clean.
        str_to_map(
            regexp_replace(attendees_per_countries, "[{}' ]", ''),
            ',',   -- pair delimiter
            ':'    -- key/value delimiter
        ) AS attendees_per_countries
    FROM raw_data
)
SELECT
    cm.key AS country,
    -- Map values are strings; cast explicitly so the total is an integer (9)
    -- instead of the DOUBLE (9.0) produced by Hive's implicit string-to-double conversion.
    SUM(CAST(cm.value AS INT)) AS no_of_people
FROM sample_data
LATERAL VIEW explode(attendees_per_countries) cm AS key, value
GROUP BY cm.key
ORDER BY no_of_people DESC
However, if the column type for attendees_per_countries is already a map then you may use the following
-- Hive: one output row per country, summing the map values across all events.
-- NOTE(review): sample_data is not defined in this standalone snippet; presumably it
-- stands for whichever table/CTE exposes attendees_per_countries as a MAP - confirm.
SELECT
    regexp_replace(cm.key, "[' ]", "") AS country,
    SUM(cm.value) AS no_of_people
FROM sample_data
LATERAL VIEW explode(attendees_per_countries) cm AS key, value
GROUP BY
    regexp_replace(cm.key, "[' ]", "")
ORDER BY
    no_of_people DESC
Full reproducible example below
-- Self-contained Hive demo: raw_data mimics the source table with the dict stored as a STRING.
WITH raw_data AS (
    SELECT 'a' AS event_name, "{'US':5}" AS attendees_per_countries
    UNION ALL
    SELECT 'b', "{'US':4, 'UK': 3, 'CA': 2}"
    UNION ALL
    SELECT 'c', "{'UK':2, 'CA': 1}"
),
sample_data AS (
    SELECT
        event_name,
        -- Strip braces, single quotes and spaces in one pass so that both the
        -- keys ('US') and the values (' 3') come out of str_to_map already clean.
        str_to_map(
            regexp_replace(attendees_per_countries, "[{}' ]", ''),
            ',',   -- pair delimiter
            ':'    -- key/value delimiter
        ) AS attendees_per_countries
    FROM raw_data
)
SELECT
    cm.key AS country,
    -- Map values are strings; cast explicitly so the total is an integer (9)
    -- instead of the DOUBLE (9.0) produced by Hive's implicit string-to-double conversion.
    SUM(CAST(cm.value AS INT)) AS no_of_people
FROM sample_data
LATERAL VIEW explode(attendees_per_countries) cm AS key, value
GROUP BY cm.key
ORDER BY no_of_people DESC
Let me know if this works for you

In Presto, if attendees_per_countries is already a map, you can use map_values and then sum the values with array_sum or reduce (I had to use the latter because Athena does not support the former). If it is not a map, you can treat your data as JSON, cast it to MAP(VARCHAR, INTEGER), and then use the functions mentioned above:
-- Presto/Athena: cast the JSON object to a map and add up its values with reduce().
-- NOTE(review): this yields one total per event (event_name is merely aliased as
-- "country"); it does not aggregate across events per country key.
WITH dataset (event_name, attendees_per_countries) AS (
    VALUES
        ('a', JSON '{"US":5}'),
        ('b', JSON '{"US":4, "UK": 3, "CA": 2}'),
        ('c', JSON '{"UK":2, "CA": 1}')
),
per_event AS (
    -- Array of the head counts found in each event's map
    SELECT
        event_name,
        map_values(CAST(attendees_per_countries AS MAP(VARCHAR, INTEGER))) AS head_counts
    FROM dataset
)
SELECT
    event_name AS country,
    reduce(head_counts, 0, (agg, curr) -> agg + curr, s -> s) AS number_of_people
FROM per_event
ORDER BY number_of_people DESC
Output:
country
number_of_people
b
9
a
5
c
3

Related

Generate CASE WHEN statement using another table

I would like to create a query that does the following:
Using a regex_mapping table, find all rows in the sample data that REGEXP_MATCH on x
WITH sample_data AS (
SELECT x, y
FROM (SELECT "asd rmkt asdf" AS x, true AS y UNION ALL -- should map to remekartier
SELECT "as asdf", true UNION ALL -- should map to ali sneider
SELECT "asdafsd", false) -- should map to NULL
),
regex_mapping AS (
SELECT regex, map
FROM (SELECT "as" AS regex, "ali sneider" AS map UNION ALL
SELECT "rmkt" AS regex, "remekartier" AS map )
)
SELECT sample_data.*, mapped_item
FROM sample_data
-- but here, use multiple REGEXP_MATCH with CASE WHEN looping over the regex_mappings.
-- e.g. CASE WHEN REGEXP_MATCH(x, "as") THEN "ali sneider"
WHEN REGEXP_MATCH(x, "rmkt") THEN "remakrtier" END AS mapped_item)
Try this -
-- BigQuery: pair every sample row with every mapping and keep the pairs whose
-- regex occurs in x as a whole word.
WITH sample_data AS (
  SELECT "asd rmkt asdf" AS x, true AS y UNION ALL  -- matches "rmkt" -> remekartier
  SELECT "as asdf", true UNION ALL                   -- matches "as"   -> ali sneider
  SELECT "asdafsd", false                            -- matches nothing, so the filter drops this row
),
regex_mapping AS (
  SELECT "as" AS regex, "ali sneider" AS map UNION ALL
  SELECT "rmkt", "remekartier"
)
SELECT
  s.*,
  r.map
FROM sample_data AS s
CROSS JOIN regex_mapping AS r
WHERE REGEXP_CONTAINS(s.x, CONCAT('\\b', r.regex, '\\b'))
The results ->
Second way: Instead of cross-join, use a scalar subquery. I have used limit so that the subquery doesn't return more than 1 row and if multiple regexp matches, then it will return only one of them
--- same WITH clause as above query ---
-- Correlated scalar subquery: every sample row is kept, and map is NULL when no regex matches.
-- LIMIT 1 has no ORDER BY, so when several regexes match, which one is returned is arbitrary.
SELECT
  s.*,
  (
    SELECT r.map
    FROM regex_mapping AS r
    WHERE REGEXP_CONTAINS(s.x, CONCAT('\\b', r.regex, '\\b'))
    LIMIT 1
  ) AS map
FROM sample_data AS s
The results ->
Third way: Deduplicated Data
-- BigQuery: one output row per campaign_name, carrying the matching mapping with the
-- lowest priority number, or NULL when no regex matches.
WITH sample_data AS (
  SELECT "as_rmkt_asdf" AS campaign_name, "xdd" AS placement_name UNION ALL  -- matches both; rmkt wins (priority 1) -> remekartier
  SELECT "as_asdf", "sdfsdf" UNION ALL                                       -- matches "as" only -> ali sneider
  SELECT "as_rmkt_dafsd", "sdfg" UNION ALL                                   -- matches both; rmkt wins -> remekartier
  SELECT "asf_adsdf", "gdf"                                                  -- no token equals a regex -> NULL
),
regex_mapping AS (
  SELECT "rmkt" AS regex, "remekartier" AS map, 1 AS priority UNION ALL
  SELECT "as", "ali sneider", 2
),
candidates AS (
  -- Every (row, mapping) pair, flagged by whether the regex appears as an
  -- underscore-delimited token of campaign_name.
  SELECT
    s.campaign_name,
    s.placement_name,
    r.map AS candidate_map,
    r.priority,
    REGEXP_CONTAINS(s.campaign_name, CONCAT('(^|_)', r.regex, '($|_)')) AS is_match
  FROM sample_data AS s
  CROSS JOIN regex_mapping AS r
),
ranked AS (
  -- Matching pairs sort first (TRUE before FALSE under DESC), then by priority.
  SELECT
    campaign_name,
    placement_name,
    IF(is_match, candidate_map, NULL) AS map,
    ROW_NUMBER() OVER (PARTITION BY campaign_name ORDER BY is_match DESC, priority) AS rn
  FROM candidates
)
SELECT * EXCEPT (rn)
FROM ranked
WHERE rn = 1

Query to find if a aggregate string contains certain numbers

I am working on Big Query Standard SQL. I have a data table like shown below (using ; as separator):
id;operation
107327;-1,-1,-1,-1,5,-1,0,2,-1
108296;-1,6,2,-1,-1,-1
690481;0,-1,-1,-1,5
102643;5,-1,-1,-1,-1,-2,2,3,-1,0,-1,-1,-1,-1,-1,-1
103171;0,5
789481;0,-1,5
I would like to take id that only contains operation 0,5 or 0,-1,5 so the result will show:
690481
103171
789481
Below is for BigQuery Standard SQL
#standardSQL
-- Keep the rows whose comma-separated operation list contains no token other than 0, -1 or 5.
SELECT *
FROM `project.dataset.table`
WHERE NOT EXISTS (
  SELECT 1
  FROM UNNEST(SPLIT(operation)) AS op
  WHERE op NOT IN ('0', '-1', '5')
)
You can test and play with the above using the sample data from your question, as in the example below
#standardSQL
-- Inline copy of the question's data, so the filter can be run as-is.
WITH `project.dataset.table` AS (
  SELECT 107327 AS id, '-1,-1,-1,-1,5,-1,0,2,-1' AS operation UNION ALL
  SELECT 108296, '-1,6,2,-1,-1,-1' UNION ALL
  SELECT 690481, '0,-1,-1,-1,5' UNION ALL
  SELECT 102643, '5,-1,-1,-1,-1,-2,2,3,-1,0,-1,-1,-1,-1,-1,-1' UNION ALL
  SELECT 103171, '0,5' UNION ALL
  SELECT 789481, '0,-1,5'
)
-- Keep the rows whose operation list contains no token other than 0, -1 or 5.
SELECT *
FROM `project.dataset.table`
WHERE NOT EXISTS (
  SELECT 1
  FROM UNNEST(SPLIT(operation)) AS op
  WHERE op NOT IN ('0', '-1', '5')
)
with output
I think regular expression does what you want:
-- Anchored pattern: the list must start with 0, end with 5, and have only -1 entries in between.
SELECT t.*
FROM t
WHERE REGEXP_CONTAINS(operation, '^0,(-1,)*5$');
If you want matches to rows that contain only 0, -1, or 5, you would use:
where regexp_contains(operation, '^((0|-1|5),)*(0|-1|5)$');

Convert String to Tuple in BigQuery

I have a variable passed as an argument in BigQuery which is in the format "('a','b','c')"
with vars as (
select "{0}" as var1,
)
-- where, {0} = "('a','b','c')"
To use it in BigQuery I need to make it a tuple ('a','b','c').
How can it be done?
Any alternate approach is also welcome.
Example:
with vars as (
select "('a','b','c')" as index
)
select * from `<some_other_db>.table` where index in (
select index from vars)
-- gives me empty results because index is now a string
Present output:
select * from <db_name>.table where index in "('a','b','c')"
Required output:
select * from <db_name>.table where index in ('a','b','c')
Below is for BigQuery Standard SQL
#standardSQL
-- var holds the tuple-looking string. Removing "(", ")" and "'" and splitting on commas
-- gives an ARRAY<STRING> that can be used with IN UNNEST.
WITH vars AS (
  SELECT "('a','b','c')" AS var
),
allowed AS (
  SELECT SPLIT(REGEXP_REPLACE(var, r'[()\']', '')) AS index_values
  FROM vars
)
SELECT *
FROM `<some_other_db>.table`
WHERE index IN UNNEST((SELECT index_values FROM allowed))
You can test, play with above using some dummy data as in below example
#standardSQL
-- Same technique with an inline stand-in for the target table.
WITH vars AS (
  SELECT "('a','b','c')" AS var
),
`<some_other_db>.table` AS (
  SELECT 1 AS id, 'a' AS index UNION ALL
  SELECT 2, 'd' UNION ALL
  SELECT 3, 'c' UNION ALL
  SELECT 4, 'e'
),
allowed AS (
  -- Strip "(", ")" and "'" then split on commas -> ARRAY<STRING>
  SELECT SPLIT(REGEXP_REPLACE(var, r'[()\']', '')) AS index_values
  FROM vars
)
SELECT *
FROM `<some_other_db>.table`
WHERE index IN UNNEST((SELECT index_values FROM allowed))
with output
Row id index
1 1 a
2 3 c
I think this does what you are asking for:
-- Split the string on commas and pivot the first three pieces into a STRUCT (f1, f2, f3).
-- Only the parentheses are removed, so each piece still carries its single quotes.
WITH vars AS (
  SELECT "('a','b','c')" AS var1
)
SELECT AS STRUCT
  MAX(IF(n = 0, var, NULL)) AS f1,
  MAX(IF(n = 1, var, NULL)) AS f2,
  MAX(IF(n = 2, var, NULL)) AS f3
FROM vars AS v
CROSS JOIN UNNEST(SPLIT(REPLACE(REPLACE(var1, '(', ''), ')', ''), ',')) AS var WITH OFFSET AS n;

How to implement generic Oracle DECODE function in BigQuery?

I'm looking into implementing the Oracle DECODE function as a UDF.
Below is the outward functionality (see the Oracle documentation):
https://docs.oracle.com/cd/B19306_01/server.102/b14200/functions040.htm
Below is the outward functionality and syntax of a decode in Oracle:
Oracle:
DECODE( <expr> , <search1> , <result1> [ , <search2> , <result2> ... ] [ , <default> ] )
SELECT product_id,
DECODE (warehouse_id, 1, 'Southlake',
2, 'San Francisco',
3, 'New Jersey',
4, 'Seattle',
'Non domestic')
"Location of inventory" FROM inventories;
The main obstacle is that with BigQuery UDFs, whether SQL or JavaScript, you need to know the number and types of the parameters when you define the function. A SQL UDF can also accept an array of any type, but I am not sure whether a SQL UDF can do what we want with such an array. Based on the JavaScript UDF documentation, it seems all parameters are named, typed, and known up front.
Is there a way to accomplish this using a BigQuery UDF? It has to be dynamic like Oracle's DECODE and fit any scenario you put in front of it, rather than being static and knowing in advance what you are decoding.
Below is for BigQuery Standard SQL
-- Emulates Oracle's DECODE(expr, search1, result1, ..., default) for BigQuery.
--   expr      value to look up
--   map       ARRAY of STRUCTs with fields named `search` and `result`
--   `default` value returned when no search equals expr (or the matched result is NULL)
-- The pairs are scanned in array order and the first match wins, as in Oracle; without
-- ORDER BY pos LIMIT 1 a repeated search key would make the scalar subquery fail with
-- "more than one element".
CREATE TEMP FUNCTION DECODE(expr ANY TYPE, map ANY TYPE, `default` ANY TYPE ) AS ((
  IFNULL(
    (
      SELECT result
      FROM UNNEST(map) WITH OFFSET AS pos
      WHERE search = expr
      ORDER BY pos
      LIMIT 1
    ),
    `default`
  )
));
You can see how it works using below example
#standardSQL
-- Emulates Oracle's DECODE(expr, search1, result1, ..., default) for BigQuery.
--   map is an ARRAY of STRUCTs with fields named `search` and `result`.
-- The pairs are scanned in array order and the first match wins, as in Oracle; without
-- ORDER BY pos LIMIT 1 a repeated search key would make the scalar subquery fail with
-- "more than one element".
CREATE TEMP FUNCTION DECODE(expr ANY TYPE, map ANY TYPE, `default` ANY TYPE ) AS ((
  IFNULL(
    (
      SELECT result
      FROM UNNEST(map) WITH OFFSET AS pos
      WHERE search = expr
      ORDER BY pos
      LIMIT 1
    ),
    `default`
  )
));
-- Inline stand-in for the inventories table.
WITH `project.dataset.inventories` AS (
  SELECT 1 product_id, 4 warehouse_id UNION ALL
  SELECT 2, 2 UNION ALL
  SELECT 3, 5
)
SELECT product_id, warehouse_id,
  DECODE(warehouse_id,
    -- Only the first element needs the explicit STRUCT type; the rest are coerced to it.
    [STRUCT<search INT64, result STRING>
      (1,'Southlake'),
      (2,'San Francisco'),
      (3,'New Jersey'),
      (4,'Seattle')
    ], 'Non domestic') AS `Location_of_inventory`
FROM `project.dataset.inventories`;
with result
Row product_id warehouse_id Location_of_inventory
1 1 4 Seattle
2 2 2 San Francisco
3 3 5 Non domestic
Another example of use is:
#standardSQL
-- Emulates Oracle's DECODE(expr, search1, result1, ..., default) for BigQuery.
--   map is an ARRAY of STRUCTs with fields named `search` and `result`.
-- The pairs are scanned in array order and the first match wins, as in Oracle; without
-- ORDER BY pos LIMIT 1 a repeated search key would make the scalar subquery fail with
-- "more than one element".
CREATE TEMP FUNCTION DECODE(expr ANY TYPE, map ANY TYPE, `default` ANY TYPE ) AS ((
  IFNULL(
    (
      SELECT result
      FROM UNNEST(map) WITH OFFSET AS pos
      WHERE search = expr
      ORDER BY pos
      LIMIT 1
    ),
    `default`
  )
));
-- Inline stand-ins for the inventories table and for a lookup table of search/result pairs.
WITH `project.dataset.inventories` AS (
  SELECT 1 product_id, 4 warehouse_id UNION ALL
  SELECT 2, 2 UNION ALL
  SELECT 3, 5
), map AS (
  SELECT 1 search, 'Southlake' result UNION ALL
  SELECT 2, 'San Francisco' UNION ALL
  SELECT 3, 'New Jersey' UNION ALL
  SELECT 4, 'Seattle'
)
SELECT product_id, warehouse_id,
  DECODE(warehouse_id, kv, 'Non domestic') AS `Location_of_inventory`
FROM `project.dataset.inventories`
-- arr is a single row holding the whole lookup table as one array, so this join adds no rows.
CROSS JOIN (SELECT ARRAY_AGG(STRUCT(search, result)) AS kv FROM map) arr;
with the same output
Update to address - "for a reusable UDF, not having to name the fields makes it closer to Oracle's implementation."
-- DECODE variant that accepts an array of two-field structs whose fields are not named.
-- Returns the result paired with the search value equal to expr, or `default` when the
-- lookup yields NULL.
CREATE TEMP FUNCTION DECODE(expr ANY TYPE, map ANY TYPE, `default` ANY TYPE ) AS (
IFNULL((
SELECT result FROM (
-- The leading all-NULL row only supplies the column names search/result for the
-- positional UNION ALL; its NULL search can never satisfy search = expr.
SELECT NULL AS search, NULL AS result UNION ALL SELECT * FROM UNNEST(map)
)
WHERE search = expr
), `default`)
);
So now - previous examples can be used w/o explicit naming fields as in example below
#standardSQL
-- DECODE variant that accepts an array of two-field structs whose fields are not named.
-- Returns the result paired with the search value equal to expr, or `default` when the
-- lookup yields NULL.
CREATE TEMP FUNCTION DECODE(expr ANY TYPE, map ANY TYPE, `default` ANY TYPE ) AS (
IFNULL((
SELECT result FROM (
-- The leading all-NULL row only supplies the column names search/result for the
-- positional UNION ALL; its NULL search can never satisfy search = expr.
SELECT NULL AS search, NULL AS result UNION ALL SELECT * FROM UNNEST(map)
)
WHERE search = expr
), `default`)
);
-- Inline stand-in for the inventories table.
WITH `project.dataset.inventories` AS (
SELECT 1 product_id, 4 warehouse_id UNION ALL
SELECT 2, 2 UNION ALL
SELECT 3, 5
)
SELECT product_id, warehouse_id,
DECODE(warehouse_id,
-- Plain (search, result) tuples: no field names are needed with this variant.
[ (1,'Southlake'),
(2,'San Francisco'),
(3,'New Jersey'),
(4,'Seattle')
], 'Non domestic') AS `Location_of_inventory`
FROM `project.dataset.inventories`
still with same output as before
```
-- CASE equivalent of the DECODE lookup: map warehouse_id to a location name.
-- NOTE(review): the double-quoted alias is Oracle/ANSI identifier quoting; BigQuery
-- reads double quotes as a string literal and would need backticks instead.
SELECT
    product_id,
    CASE warehouse_id
        WHEN 1 THEN 'Southlake'
        WHEN 2 THEN 'San Francisco'
        WHEN 3 THEN 'New Jersey'
        WHEN 4 THEN 'Seattle'
        ELSE 'Non domestic'
    END AS "Location of inventory"
FROM inventories;
```

PL/SQL: Invalid Number error

I am creating a procedure for which I collect data by repeatedly running the following query.
SELECT ATTRIBUTE_VALUE,
COUNT(src1) CNT1,
COUNT(src2) CNT2
FROM (
SELECT a.ATTRIBUTE_VALUE,
1 src1,
TO_NUMBER(NULL) src2
FROM (
SELECT DECODE(
L,
1, IP_ADDRESS,
DECODE(
L,
2, IP_SUBNET_MASK,
DECODE(
L,
3, IP_DEFAULT_GATEWAY
)
)
) ATTRIBUTE_VALUE
FROM ( SELECT LEVEL L FROM DUAL X CONNECT BY LEVEL <= 3 ),
REUT_LOAD_IP_ADDRESSES
WHERE LIP_IPT_NAME = 'CE'
AND IP_LNT_ID IN (
SELECT LNT_ID
FROM REUT_LOAD_NTN
WHERE LNT_ID IN (
SELECT RLPN.LPN_LNT_ID
FROM REUT_LOAD_PI_NTN RLPN
WHERE LPN_LPI_ID IN (
SELECT RLPI.LPI_ID
FROM REUT_LOAD_PAC_INS RLPI
WHERE RLPI.LPI_DATE_ADDED IN (
SELECT MAX(RLPI2.LPI_DATE_ADDED)
FROM REUT_LOAD_PAC_INS RLPI2
WHERE RLPI2.PI_JOB_ID = P_ORDER_ID
)
)
)
AND IP_CEASE_DATE IS NULL
AND LNT_SERVICE_INSTANCE = 'PRIMARY'
)
It is running fine in SQL developer but when executing it as a procedure, I am getting INVALID NUMBER ERROR (ORA-01722: invalid number) at
AND IP_LNT_ID IN (
SELECT LNT_ID, in the code.
Can I get any help?
The error is pretty clear. You're comparing a number to another type of value.
Example:
-- Minimal reproduction of ORA-01722: the number 1 is compared with the string 'a',
-- which cannot be converted to a number.
SELECT 'x'
FROM dual
WHERE 1 IN (
    SELECT 'a'
    FROM dual
)
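This means that each pair of columns compared with IN must have matching types: IP_LNT_ID and LNT_ID, LNT_ID and LPN_LNT_ID, and LPN_LPI_ID and LPI_ID should all be NUMBER, and both LPI_DATE_ADDED values (RLPI and RLPI2) should be DATE or TIMESTAMP.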
If this is not possible you could compare everything as char:
SELECT ATTRIBUTE_VALUE, COUNT(src1) CNT1, COUNT(src2) CNT2
FROM (SELECT a.ATTRIBUTE_VALUE, 1 src1, TO_NUMBER(NULL) src2
FROM (SELECT
DECODE(L,1,IP_ADDRESS,DECODE(L,2,IP_SUBNET_MASK,DECODE(L,3,IP_DEFAULT_GATEWAY) ) ) ATTRIBUTE_VALUE
FROM
(
SELECT LEVEL L FROM DUAL X CONNECT BY LEVEL <= 3
),
REUT_LOAD_IP_ADDRESSES
WHERE LIP_IPT_NAME = 'CE'
AND to_char(IP_LNT_ID) IN (
SELECT LNT_ID
FROM REUT_LOAD_NTN
WHERE to_char(LNT_ID) IN (
SELECT RLPN.LPN_LNT_ID
FROM REUT_LOAD_PI_NTN RLPN
WHERE to_char(LPN_LPI_ID) IN (
SELECT RLPI.LPI_ID
FROM REUT_LOAD_PAC_INS RLPI
WHERE to_char(RLPI.LPI_DATE_ADDED) IN (
SELECT MAX(RLPI2.LPI_DATE_ADDED)
FROM REUT_LOAD_PAC_INS RLPI2
WHERE RLPI2.PI_JOB_ID = P_ORDER_ID
)
)
)
AND IP_CEASE_DATE IS NULL
AND LNT_SERVICE_INSTANCE = 'PRIMARY'
)
But this should be avoided at all costs. Unfortunately, sometimes we have to cheat a little to work with our existing infrastructure ;-)
You need to make sure:
REUT_LOAD_IP_ADDRESSES.IP_LNT_ID
and
REUT_LOAD_NTN.LNT_ID
Have the same data type or cast/convert one or other so that they have the same data type.
There are multiple other issues:
You have aggregated and non-aggregated values:
SELECT ATTRIBUTE_VALUE,
COUNT(src1) CNT1,
COUNT(src2) CNT2
FROM ( ... )
Without a GROUP BY clause.
src2 is TO_NUMBER(NULL) which is just NULL and COUNT(NULL) will always be 0 so your query is:
SELECT ATTRIBUTE_VALUE,
COUNT(src1) CNT1,
0 CNT2
...
This code:
SELECT DECODE(
L,
1, IP_ADDRESS,
DECODE(
L,
2, IP_SUBNET_MASK,
DECODE(
L,
3, IP_DEFAULT_GATEWAY
)
)
) ATTRIBUTE_VALUE
FROM ( SELECT LEVEL L FROM DUAL X CONNECT BY LEVEL <= 3 ),
REUT_LOAD_IP_ADDRESSES
Can be rewritten as:
-- A single DECODE with three search/result pairs replaces the three nested calls.
-- The row generator yields L = 1..3, so each address row is emitted three times,
-- once per source column.
SELECT
    DECODE(lvl.L, 1, IP_ADDRESS, 2, IP_SUBNET_MASK, 3, IP_DEFAULT_GATEWAY) AS ATTRIBUTE_VALUE
FROM (SELECT LEVEL AS L FROM DUAL CONNECT BY LEVEL <= 3) lvl
CROSS JOIN REUT_LOAD_IP_ADDRESSES
Or, without the join as:
-- Same rows as the DECODE/row-generator form, without the join: each of the three
-- columns becomes its own row, tagged with L = 1, 2 or 3.
-- INCLUDE NULLS is required for equivalence: UNPIVOT drops NULL values by default,
-- whereas the DECODE form emits a row for every column even when its value is NULL.
SELECT attribute_value
FROM REUT_LOAD_IP_ADDRESSES
UNPIVOT INCLUDE NULLS (
    attribute_value FOR L IN (
        IP_ADDRESS AS 1,
        IP_SUBNET_MASK AS 2,
        IP_DEFAULT_GATEWAY AS 3
    )
)
The innermost query:
SELECT RLPI.LPI_ID
FROM REUT_LOAD_PAC_INS RLPI
WHERE RLPI.LPI_DATE_ADDED IN (
SELECT MAX(RLPI2.LPI_DATE_ADDED)
FROM REUT_LOAD_PAC_INS RLPI2
WHERE RLPI2.PI_JOB_ID = P_ORDER_ID
)
The inner query is restricted to have RLPI2.PI_JOB_ID = P_ORDER_ID but there is no correlation between the outer query so you can retrieve results that do not match P_ORDER_ID but just happen to have the same date as a matching row.