Need to replace non-alphanumeric characters (punctuation) with a space and create a new column - SQL

I need to convert the following
Column 1 Column 2
ABC, Company ABC Company
TA. Comp TA Comp
How can I derive Column 2 in SQL by replacing every ',' and '.' in Column 1 with a space, leaving a single space between words?

How about:
-- Sample rows standing in for the real table.
WITH testdata (col1) AS (
    SELECT 'ABC, Company Inc.' FROM dual
    UNION ALL
    SELECT 'TA. Comp' FROM dual
),
-- Step 1: turn every punctuation character into a single space.
depunctuated AS (
    SELECT regexp_replace(col1, '[[:punct:]]',' ') AS spaced
    FROM testdata
)
-- Step 2: collapse runs of two or more spaces into one, then strip the
-- leading and trailing spaces left behind by punctuation at the edges.
SELECT trim(regexp_replace(spaced, ' {2,}', ' ')) AS col2
FROM depunctuated;
Output:
ABC Company Inc
TA Comp
Assuming punctuation is what you're trying to blank out.

You can try to use:
-- Strip every character that is not a letter, a digit or a space.
-- column1 belongs to your own table: DUAL has no such column, so selecting
-- it FROM DUAL fails with ORA-00904 (invalid identifier). Replace
-- your_table with the table that actually holds column1.
SELECT REGEXP_REPLACE(column1, '[^a-zA-Z0-9 ]+', '') AS column2
FROM your_table

-- Sample values containing commas and full stops.
WITH t (val) AS (
    SELECT 'ABC,. Cmpany' FROM dual
    UNION ALL
    SELECT 'A, VC' FROM dual
    UNION ALL
    SELECT 'A,, BC...com' FROM dual
)
SELECT
    val,
    -- Option 1: nested REPLACE, one call per character to delete.
    REPLACE(REPLACE(val, ',', ''), '.', '') AS x,
    -- Option 2: a single regular expression with a character class.
    REGEXP_REPLACE(val, '[,.]', '') AS y
FROM t;
VAL X Y
--------------- ---------- ----------
ABC,. Cmpany ABC Cmpany ABC Cmpany
A, VC A VC A VC
A,, BC...com A BCcom A BCcom

Related

How to get ids which does not exist in another listagg | Oracle 19c |

I have written a query to get values in comma separated format from both the table
Table 1 :
-- Comma-separated list of every employee id, returned as a CLOB.
-- XMLELEMENT appends ',' after each id; REGEXP_REPLACE (note the spelling:
-- Oracle has no REGEX_REPLACE function) then strips the trailing comma and
-- any whitespace around it from the aggregated text.
SELECT
    REGEXP_REPLACE(
        XMLCAST(XMLAGG(XMLELEMENT(empid, empid, ',')) AS CLOB),
        '\s*,\s*$',
        ''
    ) AS str1
FROM (
    SELECT empid
    FROM employee
);
Table 2:
-- Comma-separated list of every department id, returned as a CLOB.
-- XMLELEMENT appends ',' after each id; REGEXP_REPLACE (note the spelling:
-- Oracle has no REGEX_REPLACE function) then strips the trailing comma and
-- any whitespace around it from the aggregated text.
SELECT
    REGEXP_REPLACE(
        XMLCAST(XMLAGG(XMLELEMENT(depid, depid, ',')) AS CLOB),
        '\s*,\s*$',
        ''
    ) AS str2
FROM (
    SELECT depid
    FROM department
);
Output of both queries:
str1 = 1,4,5,6,8
str2 = 1,5,6
How do I compare both the str1 and str2 and get ids which are in str1 but not in str2
Expected output: 4,8
You do not need to compare the delimited strings, you can simply use NOT IN (or NOT EXISTS) and compare the table values:
-- Ids present in employee but absent from department, as one
-- comma-separated CLOB.
-- NOT EXISTS is used rather than NOT IN: a single NULL depid would make
-- NOT IN evaluate to UNKNOWN for every row and return an empty list.
-- The explicit IS NOT NULL keeps NULL employee ids out of the list, exactly
-- as NOT IN did. ORDER BY inside XMLAGG makes the list order deterministic.
SELECT REGEXP_REPLACE(
           XMLCAST(
               XMLAGG(XMLELEMENT(empid, empid, ',') ORDER BY empid) AS CLOB
           ),
           '\s*,\s*$',
           ''
       ) AS str1
FROM employee e
WHERE e.empid IS NOT NULL
  AND NOT EXISTS (
      SELECT 1
      FROM department d
      WHERE d.depid = e.empid
  );
However, you should consider whether it makes sense to compare the IDs for employees to the IDs for departments as that does not appear to make logical sense.
For the sample data:
CREATE TABLE employee (empid) AS
SELECT 1 FROM DUAL UNION ALL
SELECT 4 FROM DUAL UNION ALL
SELECT 5 FROM DUAL UNION ALL
SELECT 6 FROM DUAL UNION ALL
SELECT 8 FROM DUAL;
CREATE TABLE department (depid) AS
SELECT 1 FROM DUAL UNION ALL
SELECT 5 FROM DUAL UNION ALL
SELECT 6 FROM DUAL;
The query outputs:
STR1
4,8
db<>fiddle here
Here is another approach, assuming that str1 and str2 come from different tables. That said, I agree with the comments that this kind of comparison should not be done on delimited strings.
-- Split each comma-separated list into one row per element, anti-join the
-- two row sets, and re-aggregate whatever is left of the first list.
WITH x AS (
    -- One row per element of str1.
    SELECT REGEXP_SUBSTR(src.str1, '[^,]+', 1, LEVEL) AS str1_spit
    FROM (SELECT '1,4,5,6,8' AS str1 FROM dual) src
    CONNECT BY LEVEL <= REGEXP_COUNT(src.str1, '[,]') + 1
),
y AS (
    -- One row per element of str2.
    SELECT REGEXP_SUBSTR(src.str2, '[^,]+', 1, LEVEL) AS str2_spit
    FROM (SELECT '1,5,6' AS str2 FROM dual) src
    CONNECT BY LEVEL <= REGEXP_COUNT(src.str2, '[,]') + 1
)
SELECT LISTAGG(str1_spit, ',') WITHIN GROUP (ORDER BY str1_spit) AS final_value
FROM (
    -- Keep only the str1 elements that found no partner in str2.
    SELECT x.str1_spit, y.str2_spit
    FROM x
    LEFT JOIN y
        ON x.str1_spit = y.str2_spit
    WHERE y.str2_spit IS NULL
    ORDER BY x.str1_spit
)
Demo
SQL> with x as
2 (
select regexp_substr(x.str1,'[^,]+',1,level) as str1_spit
from ( select '1,4,5,6,8' as str1 from dual ) x
CONNECT BY LEVEL <=REGEXP_COUNT(x.str1 ,'[,]') + 1
),
y as
( select regexp_substr(y.str2,'[^,]+',1,level) as str2_spit
from ( select '1,5,6' as str2 from dual ) y
CONNECT BY LEVEL <=REGEXP_COUNT(y.str2 ,'[,]') + 1
)
select LISTAGG(str1_spit, ',') WITHIN GROUP (order by str1_spit) as final_value
from
(
select x.str1_spit , y.str2_spit
from x left join y on x.str1_spit = y.str2_spit
where y.str2_spit is null
order by x.str1_spit
) 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ;
FINAL_VALUE
--------------------------------------------------------------------------------
4,8
SQL>
If you must do it by comparing strings (don't, use NOT IN or NOT EXISTS and compare the tables) then you can do it by only splitting one of the two strings and using simple string functions (rather than regular expressions, which are an order-of-magnitude slower):
-- Lists every item of str1 that does not occur in str2, splitting only str1
-- and using INSTR/SUBSTR/LIKE instead of regular expressions.
-- data: the two comma-separated lists, both converted to CLOB.
WITH data (str1, str2) AS (
SELECT TO_CLOB('1,4,5,6,8'),
TO_CLOB('1,5,6')
FROM DUAL
),
-- bounds: recursive walk over str1. spos is the start position of the
-- current item and epos the position of the comma that ends it.
bounds (str1, str2, spos, epos) AS (
-- Anchor: the first item starts at position 1.
SELECT str1,
str2,
1,
INSTR(str1, ',', 1)
FROM data
UNION ALL
-- Step: the next item starts right after the previous comma.
SELECT str1,
str2,
epos + 1,
INSTR(str1, ',', epos + 1)
FROM bounds
-- Stop recursing once the previous row found no further comma.
WHERE epos > 0
),
-- items: the text of each item, paired with str2 wrapped in commas so that
-- every str2 element is delimited on both sides.
items (item, str2) AS (
SELECT CASE epos
-- epos = 0 marks the last item: take everything from spos to the end.
WHEN 0
THEN SUBSTR(str1, spos)
ELSE SUBSTR(str1, spos, epos - spos)
END,
',' || str2 || ','
FROM bounds
ORDER BY spos
)
-- Re-aggregate the surviving items with a ',' after each one, then strip the
-- trailing comma (the third REGEXP_REPLACE argument is omitted).
SELECT regexp_replace(
xmlcast(Xmlagg(XMLELEMENT(item,item,',')) as clob),
'\s*,\s*$'
) AS str3
FROM items
-- Keep an item only when ',item,' does not appear inside ',str2,'.
WHERE str2 NOT LIKE '%,' || item || ',%';
Which outputs:
STR3
4,8
db<>fiddle here

How to match a string after a token (using regex)?

I'm trying to extract mail addresses after a token 'eaddr:'. So it would match the all occurrences in line entries, first consecutive string without spaces after that token: I tried:
SELECT regexp_substr(tab.entry, 'eaddr:\(.*?\)',1,1,'e',1)
from (
select 'String, email#domain.com' as entry
union
select 'eaddr:mail1#domain.com eaddr:mail2#domain.com sometext eaddr: mail3#domain.com some4354% text' as entry
union
select 'eaddr:mail5#domain.org' as entry
union
select 'Just a string' as entry
) tab
;
but it does not work. The correct result set is:
null
mail1#domain.com mail2#domain.com mail3#domain.com
mail5#domain.org
null
First of all, I suggest using a better regex to verify the email format. I am inspired by Gordon's SPLIT_TO_TABLE + LATERAL approach, and wrote some sample queries to fetch those emails from the entries.
If you want to get all the emails together, you can use this one:
-- Sample entries; only text that follows an 'eaddr:' token should be harvested.
with t as (
    select 'String, email#domain.com' as entry
    union
    select 'eaddr:mail1#domain.com eaddr:mail2#domain.com sometext eaddr: mail3#domain.com some4354% text' as entry
    union
    select 'eaddr:mail5#domain.org' as entry
    union
    select 'Just a string' as entry
)
-- Split each entry on the 'eaddr:' token and pull one address out of every
-- piece. Piece 1 (s.index = 1) is the text BEFORE the first token, so it is
-- skipped. The filter must use s.index (position of the piece within its
-- entry), not s.seq: seq only identifies the input row and is not guaranteed
-- to be ordered, so "seq > 1" would drop an arbitrary whole entry instead.
select listagg(
           regexp_substr(s.value, '[A-Z0-9a-z._%+-]+#[A-Za-z0-9.-]+\\.[A-Za-z]{2,64}'),
           ' '
       ) as emails
from t,
     lateral split_to_table(t.entry, 'eaddr:') s
where s.index > 1;
+---------------------------------------------------------------------+
| EMAILS |
+---------------------------------------------------------------------+
| mail1#domain.com mail2#domain.com mail3#domain.com mail5#domain.org |
+---------------------------------------------------------------------+
To get the exact result in your question, you can use the following query:
-- Returns one EMAILS value per input entry (see the result grid below).
-- Sample entries to search.
with t as (
select 'String, email#domain.com' as entry
union
select 'eaddr:mail1#domain.com eaddr:mail2#domain.com sometext eaddr: mail3#domain.com some4354% text' as entry
union
select 'eaddr:mail5#domain.org' as entry
union
select 'Just a string' as entry
)
select emails from
(
-- Each entry is split on single spaces; the address pattern is applied to
-- every token and the hits are concatenated per s.seq partition.
-- NOTE(review): IFF(s.seq = 1, '', ...) blanks every token of the row whose
-- seq is 1 - presumably meant to suppress 'String, email#domain.com', which
-- has no 'eaddr:' token. This assumes that row is always assigned seq 1;
-- TODO confirm, or filter on the 'eaddr:' token explicitly instead.
Select t.entry, s.*,
LISTAGG( regexp_substr( IFF(s.seq = 1, '', s.value ), '[A-Z0-9a-z._%+-]+#[A-Za-z0-9.-]+\\.[A-Za-z]{2,64}' ) ,' ' )
OVER ( PARTITION BY s.seq ) emails
from t,
lateral SPLIT_TO_TABLE(t.entry, ' ') s )
-- Keep a single output row per entry: the one carrying the first token.
where index = 1;
+----------------------------------------------------+
| EMAILS |
+----------------------------------------------------+
| NULL |
| mail1#domain.com mail2#domain.com mail3#domain.com |
| NULL |
| mail5#domain.org |
+----------------------------------------------------+
As far as I know, you can return only one match at a time from REGEXP_SUBSTR. The code below:
-- Sample entries to search.
with tab(entry) as (
    select 'String, email#domain.com' from dual
    union
    select 'eaddr:mail1#domain.com eaddr:mail2#domain.com sometext eaddr: mail3#domain.com some4354% text' from dual
    union
    select 'eaddr:mail5#domain.org' from dual
    union
    select 'Just a string' from dual
)
SELECT
    -- REGEXP_SUBSTR returns one occurrence per call, so occurrences 1..3 of
    -- the captured address are fetched separately and joined with spaces.
    -- In Oracle ' ' || NULL yields ' ' (not NULL), so the COALESCE fallbacks
    -- never fire and missing occurrences leave stray blanks behind. RTRIM
    -- removes them, and turns an all-blank result into NULL for entries that
    -- contain no 'eaddr:' token at all.
    rtrim(
        regexp_substr(entry, 'eaddr:\s*(\S*)\s*',1,1,'i',1)
        || coalesce(' ' || regexp_substr(entry, 'eaddr:\s*(\S*)\s*',1,2,'i', 1), '')
        || coalesce(' ' || regexp_substr(entry, 'eaddr:\s*(\S*)\s*',1,3,'i', 1), '')
    ) as match,
    -- Same case-insensitive matching as the REGEXP_SUBSTR calls above.
    regexp_count(entry, 'eaddr:\s*(\S*)\s*', 1, 'i') as nmatches
from tab
gives the result below (using Oracle). You can use REGEXP_COUNT as shown to get the number of matches. If there are more than 3 email addresses, you can add more || coalesce( lines as needed.
P.S. I'm not sure what the 'e' flag does in your example. I'm guessing that is a Snowflake-specific value.
You need to split the strings, extract the emails, and then reaggregate. I don't have Snowflake on hand, but this or something similar should do:
select t.*, s.emails
from t left join lateral
(select list_agg(split(s.value, ' ')), ' ') as emails
from table(string_split_to_table(t.entry, 'eaddr:')) as s
) s;
I'm not 100% sure that Snowflake supports multiple-character delimiters, for instance. If that is the case, you can use:
select t.*, s.emails
from t left join lateral
(select list_agg(substr(s.value, 7), ' ') as emails
from table(string_split_to_table(t.entry, ' ')) as s
where value like 'eaddr:%'
) s;
Using Javascript UDF
-- ext_mail(col): returns the space-separated list of addresses that follow
-- an 'eaddr:' token in col (optional whitespace after the token is skipped;
-- an address is the next run of non-whitespace characters).
-- Returns the literal string 'NULL' when col contains no such token or is
-- itself NULL.
create or replace function ext_mail(col VARCHAR)
returns varchar
language javascript
as
$$
// A SQL NULL argument reaches JavaScript as undefined; treat it as "no match"
// instead of throwing on .match()/.exec().
if (COL === undefined || COL === null) {
    return 'NULL';
}
// Anchor on the full 'eaddr:' token and capture everything up to the next
// whitespace, so the domain suffix ('.com', '.org', ...) is kept.
var tokenRe = /eaddr:\s*(\S+)/g;
var found = [];
var m;
while ((m = tokenRe.exec(COL)) !== null) {
    found.push(m[1]);
}
return found.length > 0 ? found.join(' ') : 'NULL';
$$
;
with t as (
select 'String, email#domain.com' as entry
union
select 'eaddr:mail1#domain.com eaddr:mail2#domain.com sometext eaddr: mail3#domain.com some4354% text' as entry
union
select 'eaddr:mail5#domain.org' as entry
union
select 'Just a string' as entry
) select ext_mail(ENTRY) from t;

Concatenating clob column values in sql query

I am using this statement in my SQL query to concatenate large CLOB column values, but the output contains extra "," (commas) and I am not able to figure out what is going wrong.
SELECT RTRIM(
XMLAGG(
XMLELEMENT(
E,
CASE WHEN UNIQ_ID IN ( SELECT VAL
FROM SOME_TABLE
WHERE VAL_NM = 'SOME_TEXT' )
THEN TABLE1.COL_NAME
ELSE NULL
END,
', '
).EXTRACT('//text()')
ORDER BY TABLE1.UNIQ_ID
).GETCLOBVAL(),
','
) COMBINED_VAL
If you are asking about the trailing commas, then you are concatenating using comma then space so the trailing character is a space and not a comma.
If you are asking about adjacent separators with no value in between then when the WHEN UNIQ_ID IN ( ... ) part of your CASE statement is not matched you will have a NULL value; this is concatenated into the aggregated output and then you will find that you have two adjacent comma-space separators with no text in between.
For example:
WITH test_data ( id, value ) AS (
SELECT 1, 'a' FROM DUAL UNION ALL
SELECT 2, NULL FROM DUAL UNION ALL
SELECT 3, 'b' FROM DUAL
)
SELECT RTRIM(
XMLAGG(
XMLELEMENT(
E,
value,
', '
).EXTRACT('//text()')
ORDER BY id
).GETCLOBVAL(),
','
) AS COMBINED_VAL
FROM test_data;
Outputs:
| COMBINED_VAL |
| :----------- |
| a, , b, |
The trailing comma-space isn't trimmed as the last character is a space and the values are a then NULL then b and the NULL is represented as a zero-width substring.
db<>fiddle here
That's pretty easy:
do not aggregate rows which you don't want to get. To do that you just need to generate xmlelement only for required rows, and just return null for others.
Just put all characters you want to trim from your result into second parameter of rtrim:
-- Concatenates COL_NAME for the selected rows into one CLOB, separated by
-- ', ', without empty entries and without a trailing separator.
SELECT RTRIM(
XMLAGG(
-- Build an element only for rows that should appear in the output; the CASE
-- has no ELSE, so every other row contributes NULL to the aggregate.
CASE WHEN UNIQ_ID IN ( SELECT VAL
FROM SOME_TABLE
WHERE VAL_NM = 'SOME_TEXT' )
and COL_NAME is not null
THEN XMLELEMENT(
E,
-- The separator is appended to the value inside the element.
TABLE1.COL_NAME||', '
)
END
ORDER BY TABLE1.UNIQ_ID
).extract('//text()').GETCLOBVAL(),
-- Trim set: both ',' and ' ' are stripped from the right-hand end.
', '
) COMBINED_VAL
from table1;
Full test case with sample data and results: https://dbfiddle.uk/?rdbms=oracle_11.2&fiddle=452c715247e8edda8735014ff2fb34f4
with
SOME_TABLE(VAL, VAL_NM) as (
select level*2, 'SOME_TEXT' from dual connect by level<=10
)
,TABLE1(UNIQ_ID, COL_NAME) as (
select level UNIQ_ID
, to_clob(level) COL_NAME
from dual
connect by level<=20
)
SELECT RTRIM(
XMLAGG(
CASE WHEN UNIQ_ID IN ( SELECT VAL
FROM SOME_TABLE
WHERE VAL_NM = 'SOME_TEXT' )
and COL_NAME is not null
THEN XMLELEMENT(
E,
TABLE1.COL_NAME||', '
)
END
ORDER BY TABLE1.UNIQ_ID
).extract('//text()').GETCLOBVAL(),
', '
) COMBINED_VAL
from TABLE1;
Results:
COMBINED_VAL
----------------------------------------
2, 4, 6, 8, 10, 12, 14, 16, 18, 20

distinct and sum if like

I have a table as the following
name
-----------
1#apple#1
2#apple#2
3#apple#4
4#box#4
5#box#5
and I want to get the result as:
name
--------------
apple 3
box 2
Thanks in advance for your help
This is what you need.
-- Counts rows per middle token of values shaped like 'prefix#token#suffix'.
-- The token is extracted once in CROSS APPLY (instead of being repeated in
-- SELECT and GROUP BY) and both result columns are given explicit names.
select
    parts.item_name as name,
    count(1)        as item_count
from
    tbl
cross apply (
    -- Start just after the first '#'; the length is the full length minus
    -- the characters up to and including the first '#' and the characters
    -- from the last '#' onwards.
    select SUBSTRING(
        tbl.name,
        CHARINDEX('#', tbl.name) + 1,
        LEN(tbl.name) - (
            CHARINDEX('#', REVERSE(tbl.name)) + CHARINDEX('#', tbl.name)
        )
    )
) as parts (item_name)
group by
    parts.item_name
If your data does not contain any full stops (or periods depending on your vernacular), and the length of your string is less than 128 characters, then you can use PARSENAME to effectively split your string into parts, and extract the 2nd part:
-- Table variables must be named with a leading '@' ('#' names are temporary
-- tables and cannot be used with DECLARE ... TABLE).
DECLARE @T TABLE (Val VARCHAR(20));
INSERT INTO @T (Val)
VALUES ('1#apple#1'), ('2#apple#2'), ('3#apple#4'),
       ('4#box#4'), ('5#box#5');
-- Turn '#' into '.' so PARSENAME can treat the value as a dotted name;
-- part 2 (counted from the right) is the middle token.
SELECT Val = PARSENAME(REPLACE(t.Val, '#', '.'), 2),
       [Count] = COUNT(*)
FROM @T AS t
GROUP BY PARSENAME(REPLACE(t.Val, '#', '.'), 2);
Otherwise you will need to use CHARINDEX to find the first and last occurrence of # within your string (REVERSE is also needed to get the last position), then use SUBSTRING to extract the text between these positions:
-- Table variables must be named with a leading '@' ('#' names are temporary
-- tables and cannot be used with DECLARE ... TABLE).
DECLARE @T TABLE (Val VARCHAR(20));
INSERT INTO @T (Val)
VALUES ('1#apple#1'), ('2#apple#2'), ('3#apple#4'),
       ('4#box#4'), ('5#box#5');
SELECT Val = SUBSTRING(t.Val, x.FirstPosition + 1, x.LastPosition - x.FirstPosition),
       [Count] = COUNT(*)
FROM @T AS t
CROSS APPLY
    (   -- FirstPosition: index of the first '#'.
        -- LastPosition: index of the character just before the last '#'.
        SELECT CHARINDEX('#', t.Val),
               LEN(t.Val) - CHARINDEX('#', REVERSE(t.Val))
    ) AS x (FirstPosition, LastPosition)
GROUP BY SUBSTRING(t.Val, x.FirstPosition + 1, x.LastPosition - x.FirstPosition);
use case when
-- Classify each row by the keyword its name contains and count per class.
-- Rows matching neither keyword fall into a NULL group (no ELSE branch).
select case when name like '%apple%' then 'apple'
            when name like '%box%' then 'box' end item_name,
       count(*)
from tbl
group by case when name like '%apple%' then 'apple'
              when name like '%box%' then 'box' end
No DBMS specified, so here is a postgres variant. The query does use regexps to simplify things a bit.
-- Sample rows shaped like '<digits>#<name>#<digits>'.
WITH t0 (value) AS (
    VALUES
        ('1#apple#1'),
        ('2#apple#2'),
        ('3#apple#4'),
        ('4#box#4'),
        ('5#box#5')
),
-- Keep only the text captured between the two '#' characters.
trimmed AS (
    SELECT regexp_replace(value,'[0-9]*#(.+?)#[0-9]*','\1') AS name
    FROM t0
)
SELECT
    name,
    count(*)
FROM trimmed
GROUP BY name
ORDER BY name
DB Fiddle
Update
For Oracle DBMS, the query stays basically the same:
-- Sample rows shaped like '<digits>#<name>#<digits>'.
WITH t0 AS (
    SELECT '1#apple#1' AS value FROM dual
    UNION ALL
    SELECT '2#apple#2' FROM dual
    UNION ALL
    SELECT '3#apple#4' FROM dual
    UNION ALL
    SELECT '4#box#4' FROM dual
    UNION ALL
    SELECT '5#box#5' FROM dual
)
-- Extract the text between the two '#' characters and count rows per name.
SELECT
    REGEXP_REPLACE(value,'[0-9]*#(.+?)#[0-9]*','\1') AS name,
    COUNT(*)
FROM t0
GROUP BY REGEXP_REPLACE(value,'[0-9]*#(.+?)#[0-9]*','\1')
ORDER BY name
NAME | COUNT(*)
:---- | -------:
apple | 3
box | 2
db<>fiddle here
Update
MySQL 8.0
-- Sample rows shaped like '<digits>#<name>#<digits>'.
with t0 (value) as (
    select '1#apple#1'
    union all
    select '2#apple#2'
    union all
    select '3#apple#4'
    union all
    select '4#box#4'
    union all
    select '5#box#5'
),
-- Keep only the text captured between the two '#' characters
-- ('$1' is the back-reference to the first capture group).
trimmed as (
    select regexp_replace(value,'[0-9]*#(.+?)#[0-9]*','$1') as name
    from t0
)
select
    name,
    count(*)
from trimmed
group by name
order by name
name | count(*)
:---- | -------:
apple | 3
box | 2
db<>fiddle here
You can use case and group by to do the same.
-- Classify each row by keyword, then count rows per class.
select new_col, count(new_col)
from
(
    select case when col_name like '%apple%' then 'apple'
                when col_name like '%box%' then 'box'
                else 'others' end as new_col
    from table_name
) categorized  -- derived tables need an alias on most engines; written
               -- without AS so the statement also remains valid on Oracle
group by new_col
;

In SQL Count number of distinct values corresponding to a same Id and DISPLAY in a same row

I am stuck on a SQL query. I have results as below in a table and have to display the final result in a report as below:
id Question
-------------
13 ABC
13 ABC
13 QWE
13 ABC
13 QWE
13 ABC
Expected result:
id Result
--------------------
13 4 ABC, 2 QWE
Can somebody please help me out? Thank you.
This requires pre-aggregation and string aggregation.
-- Step 1: count how often each question occurs per id.
-- Every non-aggregated column (id, question) must be in GROUP BY, and the
-- CTE gets its own name so it does not shadow / self-reference base table t.
with question_counts as (
    select id, question, count(*) as cnt
    from t
    group by id, question
)
-- Step 2: per id, concatenate '<cnt> <question>' fragments with ', ' and
-- strip the leading separator (two characters) with STUFF.
select i.id,
       stuff( (select ', ' + convert(varchar(255), qc.cnt) + ' ' + qc.question
               from question_counts qc
               where qc.id = i.id
               for xml path ('')
              ), 1, 2, ''
            ) as result
from (select distinct id from question_counts) i;
--testdata-begin
-- Drop any leftover copy of the temp table from an earlier run.
IF OBJECT_ID(N'Tempdb..#T') IS NOT NULL
    DROP TABLE #T;
GO
CREATE TABLE #T (
    [id]       int,
    [Question] nvarchar(23)
);
-- Six sample answers for id 13: four 'ABC' and two 'QWE'.
INSERT INTO #T ([id], [Question])
VALUES
    (13, N'ABC'),
    (13, N'ABC'),
    (13, N'QWE'),
    (13, N'ABC'),
    (13, N'QWE'),
    (13, N'ABC');
GO
--testdata-end
-- Occurrences of each question per id.
WITH cte AS (
    SELECT id, Question, COUNT(1) AS num
    FROM #T
    GROUP BY id, Question
)
-- One row per id with its '<num> <Question>' fragments joined by ', '.
SELECT a.id,
       STUFF(
           (
               SELECT ', ' + CAST(b.num AS varchar(12)) + ' ' + b.Question
               FROM cte b
               WHERE a.id = b.id
               -- Most frequent question first, ties broken alphabetically,
               -- so the output is deterministic (e.g. '4 ABC, 2 QWE').
               ORDER BY b.num DESC, b.Question
               -- TYPE + .value() returns the text un-entitized, so characters
               -- such as '&' or '<' in a question are not XML-escaped.
               FOR XML PATH(''), TYPE
           ).value('.', 'nvarchar(max)'),
           1,
           2,   -- length of the leading ', ' separator being removed
           ''
       ) AS Result
FROM cte a
GROUP BY a.id;