Convert an array into a Map - sql

I have a table with a column like
[{"key":"e","value":["253","203","204"]},{"key":"st","value":["mi"]},{"key":"k2","value":["1","2"]}]
Which is of the format array<struct<key:string,value:array<string>>>
I want to convert the column into below format :
{"e":["253","203","204"],"st":["mi"],"k2":["1","2"]}
which is of the type map<string,array<string>>
I have tried exploding the array but that does not work. Any ideas how I can do this in hive.

Without use of external libraries it's impossible. Please refer to brickhouse or create your own UDAF.
Note: the code below first reproduces the problem and then shows the part of it that Hive's built-in functions can solve, i.e. producing map<string,string> rather than map<string,array<string>>.
-- reproducing the problem
-- Reproduction fixture: a table whose `input` column is an array of
-- (key, value-array) structs, loaded with the single sample row from the question.
CREATE TABLE test_table (
    id    INT,
    input ARRAY<STRUCT<key:STRING, value:ARRAY<STRING>>>
);

-- The sample row is built from constants with a FROM-less SELECT.
INSERT INTO TABLE test_table
SELECT
    1 AS id,
    ARRAY(
        named_struct('key', 'e',  'value', ARRAY('253', '203', '204')),
        named_struct('key', 'st', 'value', ARRAY('mi')),
        named_struct('key', 'k2', 'value', ARRAY('1', '2'))
    ) AS input;

-- Show what was stored.
SELECT
    id,
    input
FROM test_table;
+-----+-------------------------------------------------------------------------------------------------------+--+
| id | input |
+-----+-------------------------------------------------------------------------------------------------------+--+
| 1 | [{"key":"e","value":["253","203","204"]},{"key":"st","value":["mi"]},{"key":"k2","value":["1","2"]}] |
+-----+-------------------------------------------------------------------------------------------------------+--+
With exploding and using STRUCT features, we can split the keys and values.
-- One output row per struct in `input`, with the struct's key and value
-- array exposed as separate columns.
SELECT
    t.id,
    kv.key,
    kv.value
FROM test_table t
LATERAL VIEW explode(t.input) d AS kv;
+-----+------+----------------------+--+
| id | key | value |
+-----+------+----------------------+--+
| 1 | e | ["253","203","204"] |
| 1 | st | ["mi"] |
| 1 | k2 | ["1","2"] |
+-----+------+----------------------+--+
The idea is to use your UDAF to "collect" a map while aggregating on id.
What Hive can solve with built in function is generating map<string,string> by converting rows to strings with a special delimiter, aggregate rows via another special delimiter and use str_to_map built-in function on the delimiters to generate map<string, string>.
-- Builds a map<string,string> per id using only built-in functions:
-- each struct becomes "key:v1,v2,...", the strings of one id are joined
-- with '#', and str_to_map splits the result back into key/value pairs.
WITH exploded AS (
    -- one row per struct of the input array
    SELECT
        t.id,
        kv
    FROM test_table t
    LATERAL VIEW explode(t.input) d AS kv
),
key_value_strings AS (
    -- outputs 3 rows: (e:253,203,204), (st:mi), (k2:1,2)
    SELECT
        id,
        CONCAT(kv.key, ':', CONCAT_WS(',', kv.value)) AS list_to_string
    FROM exploded
)
SELECT
    id,
    str_to_map(
        -- e.g. e:253,203,204#st:mi#k2:1,2
        concat_ws('#', collect_list(list_to_string)),
        '#',  -- separates one key/value pair from the next
        ':'   -- separates a key from its value
    ) AS mapped_output
FROM key_value_strings
GROUP BY id;
Which outputs a string to string map like:
+-----+-------------------------------------------+--+
| id | mapped_output |
+-----+-------------------------------------------+--+
| 1 | {"e":"253,203,204","st":"mi","k2":"1,2"} |
+-----+-------------------------------------------+--+

-- Builds the sample array inline, explodes it into one row per struct and
-- wraps each struct in its own single-entry map.
-- Nothing aggregates the rows, so the result is one map per array element
-- (three rows for the sample), not a single combined map.
WITH input_set AS (
    SELECT
        array(
            named_struct('key', 'e',  'value', array('253', '203', '204')),
            named_struct('key', 'st', 'value', array('mi')),
            named_struct('key', 'k2', 'value', array('1', '2'))
        ) AS input_array
),
break_input_set AS (
    SELECT e.col_value AS y_col_value
    FROM input_set s
    LATERAL VIEW explode(s.input_array) e AS col_value
),
create_map AS (
    SELECT map(y_col_value.key, y_col_value.value) AS final_map
    FROM break_input_set
)
SELECT final_map
FROM create_map;

// Source list of {key, value} pairs. It is named `pairs` rather than `Array`
// so that the built-in Array constructor is not shadowed in this scope.
var pairs = [{"key":"e","value":["253","203","204"]},{"key":"st","value":["mi"]},{"key":"k2","value":["1","2"]}];

// Result object: each pair's key mapped to that pair's value array.
var obj = {};
for (var i = 0; i < pairs.length; i++) {
    obj[pairs[i].key] = pairs[i].value;
}
obj will be in the required format

Related

Postgres: SQL Error [42883]: ERROR: operator does not exist: uuid = text

Below I have a Postgres query that reverts data in the main table to a specific point in time with the data in the audit table based on INSERT or UPDATE audit_operation.
-- fun(test_val1_input, test_val3_input)
--
-- Walks the audit rows in test_table whose val1 equals test_val1_input
-- (cast to uuid) and whose val3 is later than test_val3_input (parsed as
-- 'YYYY-MM-DD HH24:MI:SS'), newest first, and undoes each one in main_table:
--   * val4 = 'INSERT': the main_table row with the audited id is deleted;
--   * val4 = 'UPDATE': the row is deleted, then main_table rows sharing the
--     audited val1 are overwritten from the most recent earlier audit row.
-- Audit rows with any other val4 are ignored.
--
-- Parameters stay text so existing callers are unaffected. The loop variable
-- is a record, so its fields carry the column types (uuid, timestamp) and
-- comparisons against uuid columns no longer raise "uuid = text".
create or replace function fun(test_val1_input text, test_val3_input text)
returns void
as $functions$
declare
    audit_row record;
begin
    for audit_row in
        select a.id, a.val1, a.val2, a.val3, a.val4
        from test_table as a
        where a.val1 = cast(test_val1_input as uuid)
          and a.val3 > to_timestamp(test_val3_input, 'YYYY-MM-DD HH24:MI:SS')
        order by a.val3 desc
    loop
        if audit_row.val4 = 'INSERT' then
            -- The row to remove is identified by the audited id; comparing
            -- main_table.id with the val1 value can never match the row.
            delete from main_table as mt
            where mt.id = audit_row.id;
        elsif audit_row.val4 = 'UPDATE' then
            delete from main_table as mt
            where mt.id = audit_row.id;

            -- NOTE(review): the lookup and the match below both use val1, as
            -- the original join did, so every main_table row sharing that val1
            -- is overwritten. Confirm val1 (not id) is the intended key.
            with previous_version as (
                select a.id, a.val1, a.val2
                from test_table as a
                where a.val1 = audit_row.val1
                  and a.val3 < audit_row.val3
                order by a.val3 desc
                limit 1
            )
            update main_table as mt
            set id   = pv.id,
                val1 = pv.val1,
                val2 = pv.val2
            from previous_version as pv
            where pv.val1 = mt.val1;
        end if;
    end loop;
end;
$functions$ language plpgsql;
For reference-
Main table
id | val1 | val2
--------------+------------------------+---------
31cc5a4f-7a23 | 4d87-ad12-2f78c1c52b7a | data_1
12da6b6a-8b12 | 4d87-ad12-2f78c1c52b7a | data_2
82na1q1a-1b45 | 4d87-ad12-2f78c1c52b7a | data_3
Type of columns in the main_table
id: uuid
val1: uuid
val2: text
Audit table
id | val1 | val2 | val3 | val4
--------------+------------------------+---------+---------------------+------------------
31cc5a4f-7a23 | 4d87-ad12-2f78c1c52b7a | data_1 | 2001-09-10 12:02:20 | INSERT
12da6b6a-8b12 | 4d87-ad12-2f78c1c52b7a | data_2 | 2001-09-10 12:02:20 | INSERT
82na1q1a-1b45 | 4d87-ad12-2f78c1c52b7a | data_3 | 2001-09-12 15:12:54 | INSERT
Type of columns in the audit_table
id: uuid
val1: uuid
val2: text
val3: timestamp
val4: text
On executing the above SQL function for the following inputs-
select fun('4d87-ad12-2f78c1c52b7a', '2001-09-10 12:02:20')
I'm getting the following error:
SQL Error [42883]: ERROR: operator does not exist: uuid = text
Hint: No operator matches the given name and argument types. You might need to add explicit type casts.
Where: PL/pgSQL function revert_business_rule_days(text,text) line 23 at SQL statement
Instead, I was expecting data in the main table to be reverted at the specified time:
Main table
id | val1 | val2
--------------+------------------------+---------
31cc5a4f-7a23 | 4d87-ad12-2f78c1c52b7a | data_1
12da6b6a-8b12 | 4d87-ad12-2f78c1c52b7a | data_2
Kindly help me and let me know where I'm making mistake, I would be really thankful!
Also, do let me know if you need more understanding on this.
From Postgresql documentation:
A UUID is written as a sequence of lower-case hexadecimal digits, in several groups separated by hyphens, specifically a group of 8 digits followed by three groups of 4 digits followed by a group of 12 digits, for a total of 32 digits representing the 128 bits.
PostgreSQL also accepts the following alternative forms for input: use of upper-case digits, the standard format surrounded by braces, omitting some or all hyphens, adding a hyphen after any group of four digits.
You should correct:
When you create the function its name is fun and not revert_business_rule_days.
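The variables that hold the id and val1 values (test_id and test_val1 in the function above; v_id and v_pk_id in this answer's naming) should be of type UUID, not TEXT. The error occurs in the WHERE clause of the DELETE statement, because the id column is of type UUID while the variable it is compared with is of type TEXT.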
Your UPDATE query should be something like (your syntax is not correct):
-- Corrected form of the UPDATE inside fun(): the target table is named once,
-- SET columns are left unqualified, and the CTE is joined through
-- FROM ... WHERE. Column names follow the main_table / audit table layouts
-- shown in the question (id, val1, val2).
-- `cte` is the WITH query that precedes this statement in the function.
UPDATE main_table mt
SET id   = cte.id,
    val1 = cte.val1,
    val2 = cte.val2
FROM cte
WHERE cte.val1 = mt.val1;

Split SQL string with specific string instead of separator?

I have table that looks like:
|ID | String
|546 | 1,2,1,5,7,8
|486 | 2,4,8,1,5,1
|465 | 18,11,20,1,4,18,11
|484 | 11,10,11,12,50,11
I want to split the string to this:
|ID | String
|546 | 1,2
|546 | 1,5
|486 | 1,5,1
|486 | 1
|465 | 1,4
My goal is to show ID and all the strings starting with 1 with just the next number after them.
I filtered all rows without '%1,%' and I don't know how to continue.
If you use SQL Server 2016+, you may try to use a JSON-based approach. You need to transform the data into a valid JSON array and parse the JSON array with OPENJSON(). Note that STRING_SPLIT() is not an option here, because as is mentioned in the documentation, the output rows might be in any order and the order is not guaranteed to match the order of the substrings in the input string.
Table:
-- Sample table from the question: an ID plus a comma-separated list of numbers.
CREATE TABLE Data (
    ID       int,
    [String] varchar(100)
);

INSERT INTO Data (ID, [String])
VALUES
    (546, '1,2,1,5,7,8'),
    (486, '2,4,8,1,5,1'),
    (465, '18,11,20,1,4,18,11'),
    (484, '11,10,11,12,50,11');
Statement:
-- For every list element equal to '1', return "1,<next element>".
-- The list is parsed as a JSON array so that each element's position
-- (OPENJSON's [key]) is available for ordering; LEAD then fetches the
-- following element within the same ID.
SELECT
    pairs.ID,
    CONCAT(pairs.FirstValue, ',', pairs.SecondValue) AS [String]
FROM (
    SELECT
        src.ID,
        elem.[value] AS FirstValue,
        LEAD(elem.[value]) OVER (
            PARTITION BY src.ID
            ORDER BY CONVERT(int, elem.[key])
        ) AS SecondValue
    FROM Data AS src
    CROSS APPLY OPENJSON(CONCAT('[', src.[String], ']')) AS elem
) AS pairs
WHERE pairs.FirstValue = '1';
Result:
----------
ID String
----------
465 1,4
486 1,5
486 1,
546 1,2
546 1,5
Something like :
-- Insert a '#' before every ",1," boundary (a leading comma is prepended so
-- that a list starting with 1 also matches), split on '#', and keep the
-- pieces that begin with "1,".
SELECT
    d.ID,
    parts.value
FROM Data AS d
CROSS APPLY STRING_SPLIT(REPLACE(',' + d.[String], ',1,', '#1,'), '#') AS parts
WHERE parts.value LIKE '1,%';
?

How to use REGEXP_SUBSTR properly?

Currently in my select statement I have id and value. The value is json which looks like this:
{"layerId":"nameOfLayer","layerParams":{some unnecessary data}}
I would like to have in my select id and nameOfLayer so the output would be for example:
1, layerName
2, layerName2
etc.
The JSON always has the same structure, so layerId is always the first key.
Could you tell me how can I use REGEXP_SUBSTR properly in my select query which looks like this now?
select
id,
value
from
...
where
table1.id = table2.bookmark_id
and ...;
In Oracle 11g, you can extract the layerId using the following regular expression, where js is the name of your JSON column:
-- Group 1 captures the run of non-quote characters that follows "layerId":"
-- and the leading ^.* and trailing .*$ consume everything around it, so the
-- replacement \1 leaves only the captured text.
regexp_replace(js, '^.*"layerId":"([^"]+).*$', '\1')
This basically extracts the string between double quotes after "layerId":.
In more recent versions, you would add a check constraint on the table to ensure that the document is valid JSON, and then use the dot notation to access the object attribute as follows:
-- Demo table: the check constraint only admits rows whose js value is JSON.
create table mytable (
    id int primary key,
    js varchar2(200),
    constraint ensure_js_is_json check (js is json)
);

insert into mytable (id, js)
values (1, '{"layerId":"nameOfLayer","layerParams":{} }');

-- Dot notation reads the layerId attribute straight from the js column.
select
    t.id,
    t.js.layerId
from mytable t;
Demo on DB Fiddle:
ID | LAYERID
-: | :----------
1 | nameOfLayer
Don't use regular expressions; use a JSON_TABLE or JSON_VALUE to parse JSON:
Oracle 18c Setup:
-- Three JSON samples: layerId first, layerId after a nested layerId, and a
-- layerId whose value contains escaped double quotes.
CREATE TABLE test_data (
    id    INTEGER,
    value VARCHAR2(4000)
);

INSERT INTO test_data (id, value)
VALUES (1, '{"layerId":"nameOfLayer","layerParams":{"some":"unnecessary data"}}');

INSERT INTO test_data (id, value)
VALUES (2, '{"layerParams":{"layerId":"NOT THIS ONE!"},"layerId":"nameOfLayer"}');

INSERT INTO test_data (id, value)
VALUES (3, '{"layerId":"Name with \"Quotes\"","layerParams":{"layerId":"NOT THIS ONE!"}}');
Query 1:
-- Extract the top-level layerId of each row by projecting the JSON document
-- through JSON_TABLE.
SELECT
    t.id,
    j.layerId
FROM test_data t
CROSS JOIN JSON_TABLE(
    t.value, '$'
    COLUMNS (layerId VARCHAR2(50) PATH '$.layerId')
) j;
Query 2:
If you only want a single value you could, alternatively, use JSON_VALUE:
-- Single-value alternative: read the top-level layerId with JSON_VALUE.
SELECT
    t.id,
    JSON_VALUE(t.value, '$.layerId') AS layerId
FROM test_data t;
Output:
Both output:
ID | LAYERID
-: | :-----------------
1 | nameOfLayer
2 | nameOfLayer
3 | Name with "Quotes"
Query 3:
You can try regular expressions but they do not always work as expected:
-- Regex-based extraction of layerId, shown for comparison with the JSON
-- functions. It takes the first "layerId" found anywhere in the text, so a
-- nested layerId that appears earlier wins (row 2 in the output below).
SELECT id,
-- Turn the escaped \" sequences in the captured text back into plain ".
REPLACE(
-- Match "layerId":"..." preceded by { or , and return capture group 1 (the
-- last argument): a run of \" pairs or non-quote characters.
REGEXP_SUBSTR( value, '[{,]"layerId":"((\\"|[^"])*)"', 1, 1, NULL, 1 ),
'\"',
'"'
) AS layerID
FROM test_data
Output:
ID | LAYERID
-: | :-----------------
1 | nameOfLayer
2 | NOT THIS ONE!
3 | Name with "Quotes"
So if you can guarantee that no-one is going to put data into the database where the JSON is in a different order then this may work; however the JSON specification allows key-value pairs to be in any order so regular expressions are not a general solution that will parse every JSON string. You should be using a proper JSON parser and there are 3rd party solutions available for Oracle 11g or you can upgrade to Oracle 12c where there is a native solution.
db<>fiddle here
I think you can use regexp_substr like this:
-- [^"]+ matches a run of characters containing no double quote, so successive
-- occurrences are the pieces of str that lie between quotes. For a value that
-- starts {"layerId":"nameOfLayer",... occurrence 2 is layerId and
-- occurrence 4 is nameOfLayer.
regexp_substr(str, '[^"]+',1,2) as layer_id,
regexp_substr(str, '[^"]+',1,4) as layername
Db<>fiddle demo
Cheers!!

PostgreSQL query on text array value

I have a table where one column has an array - but stored in a text format:
mytable
id ids
-- -------
1 '[3,4]'
2 '[3,5]'
3 '[3]'
etc ...
I want to find all records that have the value 5 as an array element in the ids column.
I was trying to achieve this by using the "string to array" function and removing the [ symbols with the translate function, but couldn't find a way.
You can do this: http://www.sqlfiddle.com/#!1/5c148/12
-- Rewrite the '[..]' text as a PostgreSQL array literal '{..}', cast it to
-- int[], and keep the rows whose array has 5 among its elements.
SELECT *
FROM tbl
WHERE 5 = ANY (CAST(translate(ids, '[]', '{}') AS int[]));
Output:
| ID | IDS |
--------------
| 2 | [3,5] |
You can also use bool_or: http://www.sqlfiddle.com/#!1/5c148/11
-- Expand each row's text array into one row per element, then keep the ids
-- for which at least one element equals 5.
WITH exploded AS (
    SELECT
        id,
        unnest(translate(ids, '[]', '{}')::int[]) AS elem
    FROM tbl
)
SELECT id
FROM exploded
GROUP BY id
HAVING bool_or(elem = 5);
To see the original elements:
-- Same filter as the bool_or query, but the matching rows' elements are
-- re-assembled into the original '[a,b,...]' text form.
WITH exploded AS (
    SELECT
        id,
        unnest(translate(ids, '[]', '{}')::int[]) AS elem
    FROM tbl
)
SELECT
    id,
    '[' || array_to_string(array_agg(elem), ',') || ']' AS ids
FROM exploded
GROUP BY id
HAVING bool_or(elem = 5);
Output:
| ID | IDS |
--------------
| 2 | [3,5] |
PostgreSQL DDL is transactional, so if it is not too late in your project, just convert your stringly-typed array into a real array: http://www.sqlfiddle.com/#!1/6e18c/2
-- Replace the text column `ids` ('[3,5]') with a real int[] column.
-- The three steps run in one transaction: if any value fails the cast in the
-- UPDATE, the whole change is rolled back and the original `ids` column is
-- not dropped.
begin;

alter table tbl
    add column id_array int[];

-- Intentionally no WHERE clause: every row is converted.
update tbl
set id_array = translate(ids, '[]', '{}')::int[];

alter table tbl
    drop column ids;

commit;
Query:
-- Rows whose id_array has 5 among its elements.
SELECT *
FROM tbl
WHERE 5 = ANY (id_array);
Output:
| ID | ID_ARRAY |
-----------------
| 2 | 3,5 |
You can also use contains operator: http://www.sqlfiddle.com/#!1/6e18c/6
-- Rows whose id_array contains every element of array[5], i.e. contains 5.
-- The array containment operator is @> (not #>).
select *
from tbl
where id_array @> array[5];
I prefer the && syntax though, it directly connotes intersection. It reflects that you are detecting if there's an intersection between two sets(array is a set)
http://www.postgresql.org/docs/8.2/static/functions-array.html
If you store the string representation of your arrays slightly differently, you can cast to array of integer directly:
-- Store the ids in PostgreSQL's own array-literal form ('{...}') so the text
-- can be cast to int[] directly.
INSERT INTO mytable (id, ids)
VALUES
    (1, '{3,4}'),
    (2, '{3,5}'),
    (3, '{3}');

SELECT
    id,
    CAST(ids AS int[])
FROM mytable;
Else, you have to put in one more step:
-- For the '[...]' text form: swap the brackets for braces first, then cast.
SELECT CAST(translate(ids, '[]', '{}') AS int[])
FROM mytable;
I would consider making the column an array type to begin with.
Either way, you can find your row like this:
-- Find the rows whose ids contain 5, whichever text form is stored:
-- translate() turns '[3,5]' into '{3,5}' and leaves '{3,5}' untouched, so the
-- cast to int[] succeeds for both representations.
SELECT id, ids
FROM (
    SELECT
        id,
        ids,
        unnest(translate(ids, '[]', '{}')::int[]) AS elem
    FROM mytable
) x
WHERE elem = 5;

Performing a complex self-referential query using the Django ORM

I have the following model:
class Message(Model):
    """A message record: a URL, an e-mail address and a contacted flag."""

    # Each field's label is passed explicitly as its verbose name.
    url = URLField(verbose_name="URL")
    email = EmailField(verbose_name="E-Mail")
    # New rows start out as not contacted.
    contacted = BooleanField(verbose_name="Contacted", default=False)
With example data like:
| url | email | contacted |
+-----+-----------------+-----------+
| foo | foo#example.com | N |
| bar | bar#example.com | N |
| baz | foo#example.com | Y |
I would like to select all distinct rows (by e-mail address) whose e-mail addresses have never been contacted. With this example data, the bar#example.com row would be the only one returned.
This will return the records you want:
# E-mail addresses that appear on at least one contacted message.
contacted_emails = Message.objects.filter(contacted=True).values('email')
# Every message whose address is not among them.
not_contacted = Message.objects.exclude(email__in=contacted_emails)
This has the advantage of only running one query. Your query will look something like this:
-- Approximate SQL produced by the exclude() call: every message whose e-mail
-- is not among the e-mails of contacted messages. The FROM table is
-- messages_message, the same table the column qualifiers and the subquery use.
SELECT
    messages_message.id,
    messages_message.url,
    messages_message.email,
    messages_message.contacted
FROM
    messages_message
WHERE NOT (
    messages_message.email IN (
        SELECT U0.email
        FROM messages_message U0
        WHERE U0.contacted = True
    )
)
Note that for many, many records this query may not be optimal, but it will probably work for most uses.
-- Scratch schema with the sample data from the question.
-- IF EXISTS makes the script re-runnable: a plain DROP SCHEMA raises an error
-- when the tmp schema has not been created yet.
DROP SCHEMA IF EXISTS tmp CASCADE;
CREATE SCHEMA tmp;
SET search_path = tmp;

CREATE TABLE massage (
    zurl      varchar NOT NULL,
    zemail    varchar NOT NULL,
    contacted boolean
);

INSERT INTO massage (zurl, zemail, contacted)
VALUES
    ('foo', 'foo#example.com', False),
    ('bar', 'bar#example.com', False),
    ('baz', 'foo#example.com', True);
-- One row per e-mail address that has no contacted message, showing the
-- lowest URL recorded for that address.
SELECT
    msg.zemail      AS zemail,
    MIN(msg.zurl)   AS zurl
FROM massage msg
WHERE NOT EXISTS (
    SELECT 1
    FROM massage other
    WHERE other.zemail = msg.zemail
      AND other.contacted = True
)
GROUP BY msg.zemail;
If there are multiple records for a given email address, the above one picks the one with the "lowest" URL. If you want them all, the query would be even simpler:
-- Every message whose e-mail address has no contacted message at all.
SELECT
    msg.zurl,
    msg.zemail
FROM massage msg
WHERE NOT EXISTS (
    SELECT 1
    FROM massage other
    WHERE other.zemail = msg.zemail
      AND other.contacted = True
);