oracle sql split text into columns based on each occurrence of a certain character set - sql

In our database (Oracle), there is a field named CONVERSATION containing speech to text records (formatted as CLOB).
After some pre-processing and replacement of unnecessary characters, currently this field has a format as the example below.
I want to split the texts of agents and customers into separate columns, with the parts that start with "a:" or "c:" separated by commas within each column.
How can I do that?
"a:" stands for agent and "c:" stands for customer
CREATE TABLE TEXT_RECORDS (
CONVERSATION CLOB
);
INSERT INTO TEXT_RECORDS
(CONVERSATION)
VALUES
('a:some text 1 c:some text 2 a:some text 3 c:some text 4 a:some text 5 c:some text 6');
--EDITED (previously it was 'a:some_text_1 c:some_text_2 a:some_text_3 c:some_text_4 a:some_text_5 c:some_text_6')
Desired output as two separate fields:
CONV_AGENT CONV_CUSTOMER
some text 1 ,some text 3, some text 5 some text 2 ,some text 4, some text 6

You can just remove the sub-strings which do not have the correct prefix:
SQL Fiddle
Oracle 11g R2 Schema Setup:
CREATE TABLE TEXT_RECORDS (
CONVERSATION CLOB
);
INSERT INTO TEXT_RECORDS(CONVERSATION)
SELECT 'a:some_text_1 c:some_text_2 a:some_text_3 c:some_text_4 a:some_text_5 c:some_text_6' FROM DUAL UNION ALL
SELECT 'a:some_text_1 a:some_text_2 a:some_text_3' FROM DUAL UNION ALL
SELECT 'c:some_text_1 a:some_text_2 a:some_text_3 c:some_text_4' FROM DUAL;
Query 1:
SELECT REGEXP_REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
conversation,
'.*?(a:(\S+))?(\s|$)', -- Find each word starting with "a:"
'\2, ' -- replace with just that part without prefix
),
'(, ){2,}', -- Replace multiple delimiters
', ' -- With a single delimiter
),
'^, |, $' -- Remove leading and trailing delimiters
) AS conv_agent,
REGEXP_REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
conversation,
'.*?(c:(\S+))?(\s|$)', -- Find each word starting with "c:"
'\2, ' -- replace with just that part without prefix
),
'(, ){2,}', -- Replace multiple delimiters
', ' -- With a single delimiter
),
'^, |, $' -- Remove leading and trailing delimiters
) AS conv_customer
FROM text_records
Results:
| CONV_AGENT | CONV_CUSTOMER |
|---------------------------------------|---------------------------------------|
| some_text_1, some_text_3, some_text_5 | some_text_2, some_text_4, some_text_6 |
| some_text_1, some_text_2, some_text_3 | |
| some_text_2, some_text_3 | some_text_1, some_text_4 |
Updated - Spaces in conversation sentences
SQL Fiddle
Oracle 11g R2 Schema Setup:
CREATE TABLE TEXT_RECORDS (
CONVERSATION CLOB
);
INSERT INTO TEXT_RECORDS(CONVERSATION)
SELECT 'a:some text 1 c:some text 2 a:some text 3 c:some text 4 a:some text 5 c:some text 6' FROM DUAL UNION ALL
SELECT 'a:some text 1 a:some text 2 a:some text 3' FROM DUAL UNION ALL
SELECT 'c:some text 1 a:some text 2 a:some text 3 c:some text 4' FROM DUAL;
Query 1:
-- Builds two comma-separated lists per row: the texts prefixed with "a:"
-- (conv_agent) and the texts prefixed with "c:" (conv_customer).
-- Each column uses three nested REGEXP_REPLACE passes, innermost first.
SELECT REGEXP_REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
conversation,
'.*?(a:([^:]*))?(\s|$)', -- Optional "a:" followed by non-colon text, ending at whitespace or end of string
'\2, ' -- Keep only group 2 (the text after "a:", empty if the match had no "a:") plus a delimiter
),
'(, ){2,}', -- Runs of two or more delimiters left behind by matches without "a:"
', ' -- Collapse each run to a single delimiter
),
'^, |, $' -- Strip a leading or trailing delimiter (no replacement argument, so it is removed)
) AS conv_agent,
REGEXP_REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
conversation,
'.*?(c:([^:]*))?(\s|$)', -- Same pattern as above, but for the "c:" prefix
'\2, ' -- Keep only group 2 (the text after "c:") plus a delimiter
),
'(, ){2,}', -- Runs of two or more delimiters
', ' -- Collapse each run to a single delimiter
),
'^, |, $' -- Strip a leading or trailing delimiter
) AS conv_customer
FROM text_records
Results:
| CONV_AGENT | CONV_CUSTOMER |
|---------------------------------------|---------------------------------------|
| some text 1, some text 3, some text 5 | some text 2, some text 4, some text 6 |
| some text 1, some text 2, some text 3 | |
| some text 2, some text 3 | some text 1, some text 4 |

You can create two functions: one that gets the agent conversation and another that gets the customer conversation. The function below gets the agent conversation.
CREATE OR REPLACE FUNCTION get_agent_conv(p_text CLOB) RETURN clob
IS
  -- Returns the agent parts of a conversation as one ', '-separated CLOB.
  --
  -- p_text : conversation text in which every agent part is prefixed with
  --          'a:' and every customer part with 'c:'.
  -- returns: the agent parts in their original order, each trimmed of
  --          surrounding spaces; NULL when p_text is NULL or has no 'a:' part.
  --
  -- An agent part runs from just after its 'a:' tag to the next 'a:' or 'c:'
  -- tag, or to the end of the text. Agent and customer turns therefore do
  -- not have to alternate, and the agent may speak last.
  --
  -- Limitation: tags are matched as plain substrings, so an 'a:' or 'c:'
  -- occurring inside ordinary words (for example 'data:') is treated as a tag.
  c_tag_len    CONSTANT PLS_INTEGER := 2;   -- length of 'a:' and 'c:'
  v_text_len   INTEGER := DBMS_LOB.GETLENGTH(p_text);
  v_occur      INTEGER := 0;                -- which 'a:' occurrence is being processed
  v_start      INTEGER;                     -- position of the current 'a:' tag
  v_next_agent INTEGER;                     -- position of the next 'a:' tag, 0 if none
  v_next_cust  INTEGER;                     -- position of the next 'c:' tag, 0 if none
  v_end        INTEGER;                     -- first position after the current part
  v_piece      CLOB;
  v_agent_conv CLOB;
BEGIN
  LOOP
    v_occur := v_occur + 1;
    v_start := DBMS_LOB.INSTR(p_text, 'a:', 1, v_occur);

    -- Stop before appending anything: 0 means no further 'a:' tag and NULL
    -- means p_text itself was NULL.
    EXIT WHEN v_start IS NULL OR v_start = 0;

    v_next_agent := DBMS_LOB.INSTR(p_text, 'a:', v_start + c_tag_len, 1);
    v_next_cust  := DBMS_LOB.INSTR(p_text, 'c:', v_start + c_tag_len, 1);

    -- The part ends at the nearest following tag, or at the end of the text.
    v_end := v_text_len + 1;
    IF v_next_agent > 0 THEN
      v_end := LEAST(v_end, v_next_agent);
    END IF;
    IF v_next_cust > 0 THEN
      v_end := LEAST(v_end, v_next_cust);
    END IF;

    v_piece := TRIM(SUBSTR(p_text, v_start + c_tag_len, v_end - v_start - c_tag_len));

    -- Skip empty parts so the result never contains doubled, leading or
    -- trailing delimiters (TRIM only accepts a single trim character, so a
    -- ', ' delimiter cannot be stripped afterwards with TRIM).
    IF DBMS_LOB.GETLENGTH(v_piece) > 0 THEN
      IF v_agent_conv IS NULL THEN
        v_agent_conv := v_piece;
      ELSE
        v_agent_conv := v_agent_conv || ', ' || v_piece;
      END IF;
    END IF;
  END LOOP;

  RETURN v_agent_conv;
END;
/
SELECT GET_AGENT_CONV(conversation) agent_conversation
FROM text_records;
AGENT_CONVERSATION
-------------------------------------
some_text_1, some_text_3, some_text_5

Related

We need to mask data for the String up to fixed length in Oracle

I am trying to mask the data for the below String :
This is the new ADHAR NUMBER 123456789989 this is the string 3456798983 from Customer Name like 345678 to a String .
In above data I want to mask data starting from ADHAR NUMBER to length up to 60 characters.
OUTPUT :
This is the new *********************************************************Customer Name like 345678 to a String .
Can anyone please help
A little bit of substr + instr does the job (sample data in the first 2 lines; query begins at line #3):
SQL> with test (col) as
2 (select 'This is the new ADHAR NUMBER 123456789989 this is the string 3456798983 from Customer Name like 345678 to a String .' from dual)
3 select substr(col, 1, instr(col, 'ADHAR NUMBER') - 1) ||
4 lpad('*', 60, '*') ||
5 substr(col, instr(col, 'ADHAR NUMBER') + 60) result
6 from test;
RESULT
--------------------------------------------------------------------------------
This is the new ************************************************************ Cus
tomer Name like 345678 to a String .
SQL>
Here is a solution that covers all possibilities (I think). Notice the different inputs in the WITH clause (which is not part of the solution - remove it, and use your actual table and column names in the query). This is how one should test their solutions - consider all possible cases, including NULL input, non-NULL input string that doesn't contain the "magic words", string that has the "magic words" right at the beginning, etc.
There is one important situation the solution does NOT address, namely when the exact substring 'ADHAR NUMBER' is not two full words, but it is part of longer words - for example 'BHADHAR NUMBERS'. In this case the output will look like 'BH****************' masking ADHAR NUMBER and the S after NUMBER and more characters, up to 60 total.
Note that the output string has the same length as the input. This is generally part of the definition of "masking".
-- Masks up to 60 characters starting at the first occurrence of
-- 'ADHAR NUMBER', replacing them with '*' so the output keeps the
-- length of the input.
with
-- Sample inputs for testing only; use the real table and column instead.
test (col) as (
select 'This is the new ADHAR NUMBER 123456789989 this is the string ' ||
'3456798983 from Customer Name like 345678 to a String.'
from dual union all
select 'This string does not contain the magic words' from dual union all
select 'ADHAR NUMBER 12345' from dual union all
select 'Blah blah ADHAR NUMBER 1234' from dual union all
select null from dual union all
select 'Another blah ADHAR NUMBER' from dual
)
select case when pos > 0
then
substr(col, 1, pos - 1) || -- text before the match, unchanged
rpad('*', least(60, length(col) - pos + 1), '*') || -- 60 stars, or fewer when the string ends sooner
substr(col, pos + 60) -- whatever follows the masked span
else col end as masked -- no match (pos = 0) or NULL input: col is returned as is
from (
select col, instr(col, 'ADHAR NUMBER') as pos -- position of the first match, 0 when absent
from test
)
;
MASKED
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
This is the new ************************************************************ Customer Name like 345678 to a String.
This string does not contain the magic words
******************
Blah blah *****************
Another blah ************

Merging tags to values separated by new line character in Oracle SQL

I have a database field with several values separated by newline.
Eg-(can be more than 3 also)
A
B
C
I want to perform an operation to modify these values by adding tags from front and end.
i.e the previous 3 values should need to be turned into
<Test>A</Test>
<Test>B</Test>
<Test>C</Test>
Is there any possible query operation in Oracle SQL to perform such an operation?
Just replace the start and end of each string with the XML tags using a multi-line match parameter of the regular expression:
SELECT REGEXP_REPLACE(
REGEXP_REPLACE( value, '^', '<Test>', 1, 0, 'm' ),
'$', '</Test>', 1, 0, 'm'
) AS replaced_value
FROM table_name;
Which, for the sample data:
CREATE TABLE table_name ( value ) AS
SELECT 'A
B
C' FROM DUAL;
Outputs:
| REPLACED_VALUE |
| :------------- |
| <Test>A</Test> |
| <Test>B</Test> |
| <Test>C</Test> |
db<>fiddle here
You can use normal replace function as follows:
Select '<test>'
|| replace(your_column,chr(10),'</test>'||chr(10)||'<test>')
|| '</test>'
From your_table;
It will be faster than its regexp_replace function.
Db<>fiddle

How to count length of line break as 2 characters in oracle sql

I have data stored in table's column and it has a line break in the data. When I count the length of the string it returns me the count just fine. I want to make some changes and take the line break as 2 characters so if the data in table is something like this.
This
That
This should return a length of 10; instead it currently returns 9, which is understandable, but I want to count each line break as 2 characters. So if there are 2 line breaks in the data, they should count as 4 characters.
How can I achieve this ?
I want to use this in SUBSTR(COL, 1, 7)
By counting line break as 2 character it should return data like this
This
T
Hope someone can help
Just replace new line in the string with 2 characters, for example 'xx', before counting string length. More info on how to replace new lines in Oracle: Oracle REPLACE() function isn't handling carriage-returns & line-feeds
Update your value to have a carriage return character before each line feed character.
So if you have the table:
CREATE TABLE test_data ( value VARCHAR2(20) );
INSERT INTO test_data ( value ) VALUES ( 'This
That' );
Then you can insert the CR before the LF:
-- Turn every line break into CR+LF so it counts as two characters.
-- Existing CR+LF pairs are first reduced to a bare LF, which keeps the
-- statement idempotent: running it again, or on data that already uses
-- CR+LF, does not produce CR+CR+LF.
-- NOTE: each bare LF grows the value by one character, so the column must
-- be wide enough to hold the result.
UPDATE test_data
SET value = REPLACE(
              REPLACE( value, CHR(13) || CHR(10), CHR(10) ),
              CHR(10),
              CHR(13) || CHR(10)
            )
WHERE INSTR( value, CHR(10) ) > 0;
Then your query:
SELECT SUBSTR( value, 1, 7 ) FROM test_data;
Outputs:
| SUBSTR(VALUE,1,7) |
| :---------------- |
| This |
| T |
db<>fiddle here

Split string by space and character as delimiter in Oracle with regexp_substr

I'm trying to split a string with regexp_substr, but I can't make it work.
So, first, i have this query
select regexp_substr('Helloworld - test!' ,'[[:space:]]-[[:space:]]') from dual
which very nicely extracts my delimiter - blank-blank
But then, when i try to split the string with this option, it just doesn't work.
select regexp_substr('Helloworld - test!' ,'[^[[:space:]]-[[:space:]]]+')from dual
The query returns nothing.
Help will be much appreciated!
Thanks
SQL Fiddle
Oracle 11g R2 Schema Setup:
CREATE TABLE TEST( str ) AS
SELECT 'Hello world - test-test! - test' FROM DUAL
UNION ALL SELECT 'Hello world2 - test2 - test-test2' FROM DUAL;
Query 1:
SELECT Str,
COLUMN_VALUE AS Occurrence,
REGEXP_SUBSTR( str ,'(.*?)([[:space:]]-[[:space:]]|$)', 1, COLUMN_VALUE, NULL, 1 ) AS split_value
FROM TEST,
TABLE(
CAST(
MULTISET(
SELECT LEVEL
FROM DUAL
CONNECT BY LEVEL < REGEXP_COUNT( str ,'(.*?)([[:space:]]-[[:space:]]|$)' )
)
AS SYS.ODCINUMBERLIST
)
)
Results:
| STR | OCCURRENCE | SPLIT_VALUE |
|-----------------------------------|------------|--------------|
| Hello world - test-test! - test | 1 | Hello world |
| Hello world - test-test! - test | 2 | test-test! |
| Hello world - test-test! - test | 3 | test |
| Hello world2 - test2 - test-test2 | 1 | Hello world2 |
| Hello world2 - test2 - test-test2 | 2 | test2 |
| Hello world2 - test2 - test-test2 | 3 | test-test2 |
If i understood correctly, this will help you. Currently you are getting output as Helloworld(with space at the end). So i assume u don't want to have space at the end. If so you can simply use the space in the delimiter also like.
select regexp_substr('Helloworld - test!' ,'[^ - ]+',1,1)from dual;
OUTPUT
Helloworld(No space at the end)
As u mentioned in ur comment if u want two columns output with Helloworld and test!. you can do the following.
select regexp_substr('Helloworld - test!' ,'[^ - ]+',1,1),
regexp_substr('Helloworld - test!' ,'[^ - ]+',1,3) from dual;
OUTPUT
col1 col2
Helloworld test!
Trying to negate the match string '[[:space:]]-[[:space:]]' by putting it in a character class with a circumflex (^) will not work. Everything between a pair of square brackets is treated as a list of optional single characters, except for named character classes, which expand out to a list of optional characters. However, due to the way character classes nest, it's very likely that your outer brackets are being interpreted as follows:
[^[[:space:]] A single non space non left square bracket character
- followed by a single hyphen
[[:space:]] followed by a single space character
]+ followed by 1 or more closing square brackets.
It may be easier to convert your multi-character separator to a single character with regexp_replace, then use regexp_substr to find your individual pieces:
select regexp_substr(regexp_replace('Helloworld - test!'
,'[[:space:]]-[[:space:]]'
,chr(11))
,'([^'||chr(11)||']*)('||chr(11)||'|$)'
,1 -- Start here
,2 -- return 1st, 2nd, 3rd, etc. match
,null
,1 -- return 1st sub exp
)
from dual;
In this code I first changed - to chr(11). That's the ASCII vertical tab (VT) character which is unlikely to appear in most text strings. Then the match expression of the regexp_substr matches all non VT characters followed by either a VT character or the end of line. Only the non VT characters are returned (the first subexpression).
Slight improvement on MT0's answer. Dynamic count using regexp_count and proves it handles nulls where the format of [^delimiter]+ as a pattern does NOT handle NULL list elements. More info on that here: Split comma seperated values to columns
SQL> with tbl(str) as (
2 select ' - Hello world - test-test! - - test - ' from dual
3 )
4 SELECT LEVEL AS Occurrence,
5 REGEXP_SUBSTR( str ,'(.*?)([[:space:]]-[[:space:]]|$)', 1, LEVEL, NULL, 1 ) AS split_value
6 FROM tbl
7 CONNECT BY LEVEL <= regexp_count(str, '[[:space:]]-[[:space:]]')+1;
OCCURRENCE SPLIT_VALUE
---------- ----------------------------------------
1
2 Hello world
3 test-test!
4
5 test
6
6 rows selected.
SQL>
CREATE OR REPLACE FUNCTION field(i_string            VARCHAR2
                                ,i_delimiter         VARCHAR2
                                ,i_occurance         NUMBER
                                ,i_return_number     NUMBER DEFAULT 0
                                ,i_replace_delimiter VARCHAR2) RETURN VARCHAR2 IS
  -----------------------------------------------------------------------
  -- Function Name.......: FIELD
  -- Author..............: Dan Simson
  -- Date................: 05/06/2016
  -- Description.........: This function is similar to the one I used from
  --                       long ago by Prime Computer. You can easily
  --                       parse a delimited string.
  -- Parameters..........:
  --   i_string            : the delimited string to parse
  --   i_delimiter         : delimiter character(s); placed inside a regex
  --                         character class, so each character is treated
  --                         as a delimiter on its own and regex-special
  --                         class characters (']', '^', '\') are not
  --                         supported
  --   i_occurance         : 1-based index of the first field to return
  --   i_return_number     : maximum number of consecutive fields to return
  --                         (0 or NULL returns NULL)
  --   i_replace_delimiter : text placed between the returned fields;
  --                         NULL keeps i_delimiter
  -- Returns.............: the requested fields joined by the replacement
  --                       delimiter; fewer fields, with no trailing
  --                       delimiter, when the string runs out first
  -- Example.............:
  --   String.............: This is a cool function
  --   Delimiter..........: ' '
  --   Occurance..........: 2
  --   Return Number......: 3
  --   Replace Delimiter..: '/'
  --   Return Value.......: is/a/cool
  -----------------------------------------------------------------------
  v_return_string VARCHAR2(32767);
  v_field         VARCHAR2(32767);
  -- Sized for multi-character delimiters; a VARCHAR2(1) would raise
  -- ORA-06502 for a replacement such as ' - '.
  v_separator     VARCHAR2(32767) := nvl(i_replace_delimiter, i_delimiter);
  n_start         NUMBER := i_occurance;
BEGIN
  FOR a IN 1 .. nvl(i_return_number, 0) LOOP
    v_field := regexp_substr(i_string, '[^' || i_delimiter || ']+', 1, n_start);

    -- The pattern never matches an empty field, so NULL means the string
    -- has no more fields; stopping here avoids trailing delimiters.
    EXIT WHEN v_field IS NULL;

    IF a > 1 THEN
      v_return_string := v_return_string || v_separator;
    END IF;
    v_return_string := v_return_string || v_field;
    n_start := n_start + 1;
  END LOOP;
  RETURN(v_return_string);
END field;
/
SELECT field('This is a cool function',' ',2,3,'/') FROM dual;
SELECT regexp_substr('This is a cool function', '[^ ]+', 1, 1) Word1
,regexp_substr('This is a cool function', '[^ ]+', 1, 2) Word2
,regexp_substr('This is a cool function', '[^ ]+', 1, 3) Word3
,regexp_substr('This is a cool function', '[^ ]+', 1, 4) Word4
,regexp_substr('This is a cool function', '[^ ]+', 1, 5) Word5
FROM dual;

SQL How to extract numbers from a string?

I am working on a query in SQL that should be able to extract numbers of different/random length from the beginning of the text string.
Text string: 666 devils number is not 8888.
Text string: 12345 devils number is my PIN, that is 6666.
I want to get in a column
666
12345
Use a combination of Substr & instr
SELECT Substr (textstring, 1,instr(textstring,' ') - 1) AS Output
FROM yourtable
Result:
OUTPUT
666
12345
Use this if you have text at the beginning e.g. aa12345 devils number is my PIN, that is 6666. as it utilises the REGEXP_REPLACE function.
SELECT REGEXP_REPLACE(Substr (textstring, 1,instr(textstring,' ') - 1), '[[:alpha:]]','') AS Output
FROM yourtable
SQL Fiddle: http://sqlfiddle.com/#!4/8edc9/1/0
This version utilizes a regular expression which gives you the first number whether or not it's preceded by text and does not use the ghastly nested instr/substr calls:
SQL> with tbl(data) as (
select '666 devils number is not 8888' from dual
union
select '12345 devils number is my PIN, that is 6666' from dual
union
select 'aa12345 devils number is my PIN, that is 6666' from dual
)
select regexp_substr(data, '^\D*(\d+) ', 1, 1, null, 1) first_nbr
from tbl;
FIRST_NBR
---------------------------------------------
12345
666
12345
SQL>