SQL : extract next character from string where multiple separators exist - sql

Azure MSSQL Database
I have a column that contains values stored per transaction. The string can contain up to 7 values, separated by a '-'.
I need to be able to extract the value that is stored after the 3rd '-'. The issue is that the length of this column (and the characters that come before the 3rd '-') can vary.
For example:
DIM VALUE
1. NHL--WA-S-MOSG-SER-
2. VDS----HAST-SER-
3. ---D---SER
Row 1 needs to return 'S'
Row 2 needs to return '-'
Row 3 needs to return 'D'

This is by no means an optimal solution, but it works in SQL Server. 😊
A temp table is added for testing purposes. Maybe it gives you a hint as to where to start.
Edit: added reference for string_split function (works from SQL Server 2016 up).
-- Demo: for every string return the element that follows the 3rd '-'
-- (i.e. the 4th '-'-separated element); an empty element is reported as '-'.
--
-- STRING_SPLIT on its own does not guarantee the order of the rows it returns,
-- so numbering them with ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) is not
-- reliable. The optional third argument (enable_ordinal = 1) adds an "ordinal"
-- column holding the 1-based position of each element; it is available on
-- Azure SQL Database / Managed Instance and SQL Server 2022+.
-- Splitting per row with CROSS APPLY also keeps duplicate input strings
-- independent of each other.
CREATE TABLE #tempStrings (
    VAL VARCHAR(30)
);

INSERT INTO #tempStrings (VAL)
VALUES
    ('NHL--WA-S-MOSG-SER-'),
    ('VDS----HAST-SER-'),
    ('---D---SER'),
    ('A-V-D-C--SER');

SELECT
    s.VAL,
    CASE parts.[value] WHEN '' THEN '-' ELSE parts.[value] END AS PART
FROM #tempStrings AS s
CROSS APPLY STRING_SPLIT(s.VAL, '-', 1) AS parts
WHERE parts.ordinal = 4; -- dash count + 1

DROP TABLE #tempStrings;
Output is...
VAL PART
---D---SER D
A-V-D-C--SER C
NHL--WA-S-MOSG-SER- S
VDS----HAST-SER- -

If you always want the fourth element then using CHARINDEX is relatively straightforward:
-- Demo: extract the 4th '-'-separated element of xval using CHARINDEX only.
-- An empty 4th element is reported as '-'.
-- Assumes every value contains at least three dashes; results are not
-- meaningful for strings with fewer.
DROP TABLE IF EXISTS #tmp;

CREATE TABLE #tmp (
    rowId INT IDENTITY PRIMARY KEY,
    xval VARCHAR(30) NOT NULL
);

INSERT INTO #tmp (xval)
VALUES
    ('NHL--WA-S-MOSG-SER-'),
    ('VDS----HAST-SER-'),
    ('---D---SER'),
    ('A-V-D-C--SER');

WITH cte AS
( -- xstart = position of the first character after the 3rd dash
    SELECT
        rowId,
        xval,
        CHARINDEX('-', xval, CHARINDEX('-', xval, CHARINDEX('-', xval) + 1) + 1) + 1 AS xstart
    FROM #tmp
), cte2 AS
( -- xlen = length of the 4th element.
  -- A '-' is appended before searching for the closing dash so that a 4th
  -- element that runs to the end of the string (e.g. 'A-B-C-D') still gets a
  -- non-negative length; without it CHARINDEX returns 0, xlen becomes negative
  -- and SUBSTRING raises "Invalid length parameter".
    SELECT
        rowId,
        xval,
        xstart,
        CHARINDEX('-', xval + '-', xstart) - xstart AS xlen
    FROM cte
)
SELECT
    rowId,
    ISNULL(NULLIF(SUBSTRING(xval, xstart, xlen), ''), '-') AS xpart
FROM cte2;
I also did a volume test at 1 million rows and this was by far the fastest method compared with STRING_SPLIT, OPENJSON, recursive CTE (the worst at high volume). As a downside this method is less extensible, say you want the second or fifth items for example.

Related

SQL get average of a list in sql select

We have this column in the table named "pricehistory"
1634913730;48.38,1634916509;48.38,1635162352;37.96,1635177904;49.14,1635337722;1219.98,1635340811;27.17
that is an example data.
First comes the timestamp; the value after the ';' is the price at that timestamp.
But I want the average of all the prices in a SELECT... is that possible?
I can't find any similar examples anywhere, and my attempts at a SELECT don't work... I am not so good with SQL.
So I want the average of all prices that come after a ';' and before a ','.
The ',' separates the timestamp/price pairs.
Some test data :
-- Test fixture: one row per id; pricehistory is a comma-separated list of
-- "timestamp;price" pairs.
CREATE TABLE test (
    id INT NOT NULL,
    pricehistory TEXT NOT NULL
);

INSERT INTO test (id, pricehistory)
VALUES
    (1, '1634913730;48.38,1634916509;48.38,1635162352;37.96,1635177904;49.14,1635337722;1219.98,1635340811;27.17'),
    (2, '1634913731;42.42,1634916609;21.21');
If your RDBMS has some splitting function
Then it's quite easy, just split and use AVG. Here is an example using PostgreSQL :
-- Average price per id (PostgreSQL).
-- pricehistory is split on ',' into "timestamp;price" entries; the price is
-- whatever follows the ';'. Locating the ';' with STRPOS (instead of a fixed
-- offset of 12) keeps the query correct when a timestamp is not exactly
-- 10 characters long.
SELECT
    t.id,
    AVG(SUBSTRING(p.entry FROM STRPOS(p.entry, ';') + 1)::decimal) AS average
FROM test AS t
CROSS JOIN LATERAL regexp_split_to_table(t.pricehistory, ',') AS p(entry)
GROUP BY t.id;
Then you get:
id | average
----+----------------------
2 | 31.8150000000000000
1 | 238.5016666666666667
(2 rows)
Otherwise
You can use a CTE to split the values manually. This is a bit more involved. Here is an example using PostgreSQL again :
-- Average price per id without a split function (PostgreSQL).
-- Each recursion step peels the first "timestamp;price" entry off the list:
--   oneprice  = the entry before the first ','
--   remaining = everything after that ',' ('' once the list is exhausted)
-- Starting from the first entry (rather than pre-computing the last one)
-- also handles a history that contains a single entry and no ',' at all,
-- which previously produced an empty string and a failing ::decimal cast.
WITH RECURSIVE t AS (
    SELECT
        id,
        CASE WHEN STRPOS(pricehistory, ',') > 0
             THEN LEFT(pricehistory, STRPOS(pricehistory, ',') - 1)
             ELSE pricehistory
        END AS oneprice,
        CASE WHEN STRPOS(pricehistory, ',') > 0
             THEN SUBSTRING(pricehistory FROM STRPOS(pricehistory, ',') + 1)
             ELSE ''
        END AS remaining
    FROM test
    UNION ALL
    SELECT
        id,
        CASE WHEN STRPOS(remaining, ',') > 0
             THEN LEFT(remaining, STRPOS(remaining, ',') - 1)
             ELSE remaining
        END,
        CASE WHEN STRPOS(remaining, ',') > 0
             THEN SUBSTRING(remaining FROM STRPOS(remaining, ',') + 1)
             ELSE ''
        END
    FROM t
    WHERE remaining <> ''
)
SELECT
    id,
    -- The price is the text after the ';' (no fixed timestamp width assumed).
    AVG(SUBSTRING(oneprice FROM STRPOS(oneprice, ';') + 1)::decimal) AS average
FROM t
WHERE oneprice <> ''  -- skip empty entries (empty history, doubled commas)
GROUP BY id;
Then you get:
id | average
----+----------------------
2 | 31.8150000000000000
1 | 238.5016666666666667
(2 rows)
MySql >= 8.0
I used Recursive Common Table Expressions (cte) to split pricehistory string by ','. Then I split price from timestamp by ';', cast price as decimal(10,2) and group by id to get average price by id.
-- Average price per id (MySQL >= 8.0).
-- split_entries walks each pricehistory list one "timestamp;price" entry at a
-- time: entry is the text before the first ',', remainder is the text after it
-- (NULL when no ',' is left, which ends the recursion for that id).
WITH RECURSIVE
    split_entries AS (
        SELECT
            id,
            SUBSTRING_INDEX(pricehistory, ',', 1) AS entry,
            CASE
                WHEN LOCATE(',', pricehistory) > 0
                    THEN SUBSTRING(pricehistory, LOCATE(',', pricehistory) + 1)
            END AS remainder
        FROM t

        UNION ALL

        SELECT
            id,
            SUBSTRING_INDEX(remainder, ',', 1) AS entry,
            CASE
                WHEN LOCATE(',', remainder) > 0
                    THEN SUBSTRING(remainder, LOCATE(',', remainder) + 1)
            END AS remainder
        FROM split_entries
        WHERE remainder IS NOT NULL
    )
SELECT
    id,
    -- Price = text after the ';' of each entry.
    AVG(CAST(SUBSTRING(entry, LOCATE(';', entry) + 1) AS DECIMAL(10,2))) AS price_average
FROM split_entries
GROUP BY id;
A similar way to do the same (using regular expressions functions):
-- Average price per id using REGEXP_SUBSTR (MySQL >= 8.0).
-- padded   : appends a ',' so that every price, including the last one, is
--            followed by a comma and matches the pattern ';<digits/dots>,'.
-- matches  : nth = 1, 2, 3, ... fetches the nth pattern match of each list and
--            stops as soon as the next match does not exist.
WITH RECURSIVE
    padded AS (
        SELECT
            Id,
            CONCAT(pricehistory, ',') AS pricehistory
        FROM t
    ),
    matches AS (
        SELECT
            id,
            pricehistory,
            1 AS nth,
            REGEXP_SUBSTR(pricehistory, ';[0-9.]*,', 1, 1) AS price
        FROM padded

        UNION ALL

        SELECT
            id,
            pricehistory,
            nth + 1,
            REGEXP_SUBSTR(pricehistory, ';[0-9.]*,', 1, nth + 1)
        FROM matches
        WHERE REGEXP_SUBSTR(pricehistory, ';[0-9.]*,', 1, nth + 1) IS NOT NULL
    )
SELECT
    id,
    -- Strip the leading ';' and the trailing ',' from each match before casting.
    AVG(CAST(SUBSTRING(price, 2, LENGTH(price) - 2) AS DECIMAL(10,2))) AS price_average
FROM matches
GROUP BY id;
You don't say which DBMS you are using.
In MS SQL-SERVER you can write something like this.
Create a function to convert string to multiple rows, and then use that in the query.
-- dbo.BreakStringIntoRows
-- Splits @CommadelimitedString on @Separator and returns one row per piece.
--
-- Parameters:
--   @CommadelimitedString  the delimited text to split. Declared VARCHAR(MAX)
--                          so long inputs are not silently truncated.
--   @Separator             the single-character delimiter.
-- Returns:
--   @Result (Column1 VARCHAR(MAX)) - each piece with leading/trailing blanks
--   removed. Empty pieces (e.g. from a trailing separator) are returned as
--   empty strings; an input without any separator yields a single row.
--
-- T-SQL variables and parameters must start with '@' (a leading '#' denotes a
-- temporary object and is not valid here).
CREATE OR ALTER FUNCTION dbo.BreakStringIntoRows (@CommadelimitedString VARCHAR(MAX), @Separator VARCHAR(1))
RETURNS @Result TABLE (Column1 VARCHAR(MAX))
AS
BEGIN
    -- Position of the next separator in what is left of the input (0 = none).
    DECLARE @IntLocation INT = CHARINDEX(@Separator, @CommadelimitedString);

    WHILE @IntLocation > 0
    BEGIN
        -- Emit the text in front of the separator, trimmed of blanks.
        INSERT INTO @Result (Column1)
        SELECT RTRIM(LTRIM(LEFT(@CommadelimitedString, @IntLocation - 1)));

        -- Drop the emitted piece together with its separator.
        SET @CommadelimitedString = STUFF(@CommadelimitedString, 1, @IntLocation, '');
        SET @IntLocation = CHARINDEX(@Separator, @CommadelimitedString);
    END;

    -- Whatever remains after the last separator is the final piece.
    INSERT INTO @Result (Column1)
    SELECT RTRIM(LTRIM(@CommadelimitedString));

    RETURN;
END;
-- Fixture: pricehistory is a comma-separated list of "timestamp;price" pairs.
CREATE TABLE test1 (
    id INT NOT NULL,
    pricehistory VARCHAR(MAX) NOT NULL
);

INSERT INTO test1 (id, pricehistory)
VALUES
    (1, '1634913730;48.38,1634916509;48.38,1635162352;37.96,1635177904;49.14,1635337722;1219.98,1635340811;27.17'),
    (2, '1634913731;42.42,1634916609;21.21');

-- Average price per row.
-- Each piece returned by dbo.BreakStringIntoRows is "timestamp;price"; the
-- price is the text AFTER the ';' (taking the text before it would average the
-- timestamps). The cast uses an explicit scale because a bare DECIMAL has
-- scale 0 and would round every price to a whole number.
SELECT
    t.*,
    (
        SELECT AVG(CAST(LTRIM(RTRIM(SUBSTRING(p.Column1, CHARINDEX(';', p.Column1) + 1, LEN(p.Column1)))) AS DECIMAL(18, 4)))
        FROM dbo.BreakStringIntoRows(t.pricehistory, ',') AS p
    ) AS [AVG]
FROM test1 AS t;
sample output:

How to split a string in a SQL column and store it in other columns based on string length

I have declared a temporary table inside a stored procedure with four address columns,
e.g. AddressLine1, AddressLine2, AddressLine3 and AddressLine4, each of which is VARCHAR(50).
I want to insert data into the temp table from an existing table, where the whole address is stored in AddressLine1.
So I want to copy the address from AddressLine1 of the existing table into the temporary table, but if the address (including spaces) is longer than 50 characters, the remainder should go into AddressLine2, and so on.
In short, I want to split the address into pieces of at most 50 characters (the column length) and store them across
AddressLine1, AddressLine2, AddressLine3 and AddressLine4 in the temporary table.
-- Report the DATALENGTH of every ADDRESSLINE1 value whose DATALENGTH exceeds 50.
SELECT DATALENGTH(p.ADDRESSLINE1)
FROM PASSENGER AS p
WHERE DATALENGTH(p.ADDRESSLINE1) > 50;
You can achieve this by some recursive cte and substring... following a quick example which can for sure be optimized / shortened but which should make clear the point:
-- Splits @x into consecutive pieces that each end at a space, using
-- 50-character windows to decide where to break.
-- T-SQL variables must start with '@'; a leading '#' is not a valid variable name.
--
-- NOTE(review): a piece starts right after the last space of the previous
-- window and ends at the last space of the current window, so a piece can be
-- longer than 50 characters (e.g. last spaces at positions 30 and 99 give a
-- 69-character piece). Each piece except the last also keeps its trailing
-- space. Verify the output lengths before inserting into VARCHAR(50) columns.
DECLARE @x NVARCHAR(200) = N'This is some test and this still is the test and yet here the test continues and so on until the test is finished';

WITH cte1 AS (
    -- One row per space in the text: idx is the position of that space.
    SELECT @x AS txt, CHARINDEX(' ', @x) AS idx
    UNION ALL
    SELECT txt, CHARINDEX(' ', txt, idx + 1) AS idx
    FROM cte1
    WHERE CHARINDEX(' ', txt, idx + 1) > 0
),
cte2 AS (
    -- Assign each space to a 50-character window:
    -- positions 1-50 -> 0, 51-100 -> 1, and so on.
    SELECT txt, idx, idx / 50 - CASE WHEN idx % 50 = 0 THEN 1 ELSE 0 END AS dividx
    FROM cte1
),
cte3 AS (
    -- Last space position inside each window.
    SELECT txt, dividx, MAX(idx) AS maxIdx
    FROM cte2
    GROUP BY txt, dividx
),
cte4 AS (
    -- Start and end position of each piece: a piece starts right after the
    -- previous window's last space; the final piece runs to the end of the text.
    SELECT
        txt,
        dividx,
        ISNULL(LAG(maxIdx) OVER (PARTITION BY txt ORDER BY dividx) + 1, 1) AS minIdx,
        CASE
            WHEN LEAD(maxIdx) OVER (PARTITION BY txt ORDER BY dividx) IS NULL THEN LEN(txt)
            ELSE maxIdx
        END AS maxIdx
    FROM cte3
)
-- Cut the pieces out of the text.
SELECT SUBSTRING(txt, minIdx, maxIdx - minIdx + 1) AS txt
FROM cte4
OPTION (MAXRECURSION 0);

Generate a comma-separated list of numbers in a single string

Is there a way to generate a comma-separated string of a series of numbers where the "begin" and "end" numbers are provided?
For example, provide the numbers 1 and 10 and the output would be a single value of: 1,2,3,4,5,6,7,8,9,10
10/10/2019 edit explaining why I'm interested in this:
My workplace writes queries with several columns in the SELECT statement plus aggregate functions. Then a GROUP BY clause using the column numbers. I figured using a macro that creates a comma-separated list to copy/paste in would save some time.
-- Example of the query shape in question: ten plain columns followed by three
-- SUM aggregates.
SELECT t.colA
, t.colB
, t.colC
, t.colD
, t.colE
, t.colF
, t.colG
, t.colH
, t.colI
, t.colJ
, sum(t.colK) as sumK
, sum(t.colL) as sumL
, sum(t.colM) as sumM
FROM t
-- Positional grouping: 1..10 refer to the first ten select-list items
-- (colA..colJ). This is the comma-separated list the question wants to generate.
GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
;
You can use a recursive CTE to generate your numbers, and xml_agg to generate your string:
-- Builds the string '1,2,...,10' from a recursively generated number series.
with recursive nums (counter) as
-- Seed row: counter = 1, wrapped in a derived table.
( select * from (select cast(1 as bigint) as counter) t
union all
-- Recursive step: emit counter + 1 for every row whose counter is 1..9,
-- so the series ends at 10.
select
counter + 1
from nums
where counter between 1 and 9
)
select
-- Concatenate 'n,' for every number in counter order, then strip the trailing comma.
trim(trailing ',' from cast(xmlagg(cast(counter as varchar(2)) || ',' order by counter) as varchar(100)))
from nums
Check these methods in SQL Server-
-- Builds a comma-separated list of the numbers 1..@n in two ways.
-- #Sample is a temp table (leading '#'); @n is a variable (leading '@').
IF OBJECT_ID('TEMPDB..#Sample') IS NOT NULL
    DROP TABLE #Sample;

CREATE TABLE #Sample
(
    NUM INT
);

DECLARE @n INT = 10;

-- Number the rows of sys.columns and keep the first @n numbers.
-- sys.columns is only used as a convenient row source.
INSERT INTO #Sample (NUM)
SELECT A.NUM
FROM (
    SELECT ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) AS NUM
    FROM sys.columns
) AS A
WHERE A.NUM <= @n;

-- Method 1 (SQL Server 2017+): STRING_AGG.
-- WITHIN GROUP (ORDER BY ...) makes the order of the list deterministic.
SELECT STRING_AGG(NUM, ',') WITHIN GROUP (ORDER BY NUM) AS EXPECTED_RESULT
FROM #Sample;

-- Method 2 (older versions): FOR XML PATH concatenation.
-- Each number is prefixed with a single ',' and STUFF removes the first one,
-- which yields '1,2,3' (prefixing ' ,' and removing two characters produced
-- '1 ,2 ,3'). ORDER BY fixes the order of the list, and no outer FROM/DISTINCT
-- is needed because the subquery already returns a single value.
SELECT STUFF(CAST((
        SELECT ',' + CAST(c.NUM AS VARCHAR(MAX))
        FROM #Sample AS c
        ORDER BY c.NUM
        FOR XML PATH(''), TYPE) AS VARCHAR(MAX)), 1, 1, '') AS EXPECTED_RESULT;
While loop seems appropriate
-- Builds a comma-separated list of the integers @begin..@end with a WHILE loop.
-- T-SQL variables must start with '@' (a leading '#' is not a valid variable name).
DECLARE @begin INT = 1;
DECLARE @end INT = 11;
DECLARE @list VARCHAR(500);   -- widen if the list can grow beyond 500 characters
DECLARE @current INT;         -- loop counter, so the input @begin is left untouched

-- Guard: an inverted range returns a one-row error message and stops the batch.
IF @begin > @end
BEGIN
    SELECT 'error, beginning number ' + CONVERT(VARCHAR(500), @begin)
         + ' must not be greater than ending number '
         + CONVERT(VARCHAR(500), @end) + '.' AS err;
    RETURN;
END;

-- The list starts with the first number; every later number is appended
-- with a leading comma.
SET @current = @begin;
SET @list = CONVERT(VARCHAR(500), @current);

WHILE @current < @end
BEGIN
    SET @current += 1;
    SET @list = @list + ',' + CONVERT(VARCHAR(500), @current);
END;

SELECT @list;
You might want to use varchar(5000) or something depending on how big you want it to get.
disclaimer -- I don't know if this works with teradata
I'm not sure there is a good direct way to generate a series in Teradata. You can fake it a few different ways though. Here's a comma separated list of numbers from 5 to 15, for example:
-- Comma-separated list of the numbers 5..15 (Teradata).
-- The derived table numbers the calendar rows and shifts them by 4, keeping
-- the values up to 15. ORDER BY inside XMLAGG makes the order of the
-- concatenated list deterministic; TRIM removes the trailing comma.
SELECT TRIM(TRAILING ',' FROM (XMLAGG(TRIM(rn) || ',' ORDER BY rn) (VARCHAR(10000))))
FROM (
    SELECT 4 + ROW_NUMBER() OVER (ORDER BY Sys_Calendar."CALENDAR".day_of_calendar) AS rn
    FROM Sys_Calendar."CALENDAR"
    QUALIFY rn <= 15
) t
I've only used sys_calendar.calendar here because it's a big table. Any big table would do here though.
Here's one way to do it in Teradata:
-- Template: <begin_num> and <end_num> are placeholders that must be replaced
-- with the desired range before running.
-- Aggregates the row numbers in that range into a single array value.
-- NOTE(review): ROW_NUMBER() is used without an ORDER BY and ARRAY_AGG without
-- an array type argument - confirm both are accepted by the target Teradata release.
SELECT ARRAY_AGG(src.RowNum)
FROM (
SELECT ROW_NUMBER() OVER() AS RowNum
FROM sys_calendar.calendar
QUALIFY RowNum BETWEEN <begin_num> AND <end_num>
) src
This will give you the output as an ARRAY data type, which you can probably cast as a VARCHAR. It also assumes begin_num > 0 and <end_num> is less than the number of rows in the sys_calendar.calendar view. You can always fiddle with this to fit your required range of values.
There are also DelimitedBuild UDFs out there (if you can find one) that can be used to convert row values into delimited strings.
The cheapest way to achieve your goal is this one (no functions, or joins to tables required):
-- Builds '1,2,...,10' with a recursive query only (no number/calendar table).
-- TheNumber is the last number added; TheString is the list built so far.
WITH RECURSIVE NumberRanges(TheNumber,TheString) AS
(
-- Seed row: number 1 and the string '1'. The nested derived table only
-- supplies a single row to select from.
SELECT 1 AS TheNumber,casT(1 as VARCHAR(500)) as TheString
FROM
(
SELECT * FROM (SELECT NULL AS X) X
) DUMMYTABLE
UNION ALL
-- Recursive step: append ',<next number>' while the current number is below 10.
SELECT
TheNumber + 1 AS TheNumber,
TheString ||',' || TRIM(TheNumber+1)
FROM NumberRanges
WHERE
TheNumber < 10
)
SELECT TheString
FROM NumberRanges
-- Keep only the row with the highest number, i.e. the complete list.
QUALIFY ROW_NUMBER() OVER ( ORDER BY TheNumber DESC) = 1;
Result String: 1,2,3,4,5,6,7,8,9,10

Query SQL with similar values

I have to query a database, comparing against a string like 12345678, but the stored value is formatted this way: 12.345.678. If I run the following query it does not return anything.
-- Attempt 1: exact comparison against the undotted digits.
SELECT * FROM TABLA WHERE CAMPO = '12345678'
Where CAMPO would have the value of (12.345.678), if I replace = with a like, it does not return the data either
-- Attempts 2-4: LIKE with the undotted digits as prefix, suffix and substring.
-- All three patterns still require the eight digits to appear contiguously.
SELECT * FROM TABLA WHERE CAMPO like '12345678%'
SELECT * FROM TABLA WHERE CAMPO like '%12345678'
SELECT * FROM TABLA WHERE CAMPO like '%12345678%'
None of the 3 previous consultations works for me, how can I make this query?
The value can have 7, 8 or 9 digits, and the '.' has to appear every 3 digits, counting from the end towards the beginning.
Use REPLACE() function to replace all the dots '.' as
-- Self-contained demo: strip the dots from CAMPO before comparing it with the
-- undotted target value.
WITH samples (CAMPO) AS (
    SELECT v.CAMPO
    FROM (
        VALUES
            ('12.345.678'),
            ('23.456.789')
    ) AS v (CAMPO)
)
SELECT s.CAMPO
FROM samples AS s
WHERE REPLACE(s.CAMPO, '.', '') = '12345678';
Your query should be
-- Compare CAMPO with its dots removed against the undotted target value.
SELECT t.*
FROM TABLA AS t
WHERE REPLACE(t.CAMPO, '.', '') = '12345678';
You can compare the string without the dots to a REPLACE(StringWithDots, '.','')
I recommend you to convert the number to numeric
So you can use < and > operators and all functions that require you to have a number...
The best way to achieve this is to remove any unnecessary dots and convert the commas to dots, like this:
-- Expression (not a full statement): turns the text '7.000,45' into a
-- NUMERIC(10, 2) value.
CONVERT(NUMERIC(10, 2),
-- Outer REPLACE: turn the ',' into a '.'.
REPLACE(
-- Inner REPLACE: drop every '.' first.
REPLACE('7.000,45', '.', ''),
',', '.'
)
)
I hope this will help you out.
A SARGABLE solution would be to write a function that takes your target value ('12345678') and inserts the separators ('.') every third character from right to left. The result ('12.345.678') can then be used in a where clause and benefit from an index on CAMPO.
The following code demonstrates an approach without creating a user-defined function (UDF). Instead, a recursive common table expression (CTE) is used to process the input string three characters at a time to build the dotted target string. The result is used in a query against a sample table.
To see the results from the recursive CTE replace the final select statement with the commented select immediately above it.
-- Sample data.
-- Builds the dotted form of @Target (e.g. '12345678' -> '12.345.678') with a
-- recursive CTE and uses it in a plain equality predicate, so the column
-- itself is not wrapped in a function.
-- Table variables and scalar variables must start with '@'; a leading '#'
-- is not valid in a DECLARE.

-- Sample data.
DECLARE @Samples AS TABLE (SampleId INT IDENTITY, DottedDigits VARCHAR(20));

INSERT INTO @Samples (DottedDigits)
VALUES
    ('1'), ('12'), ('123'), ('1.234'), ('12.345'),
    ('123.456'), ('1.234.567'), ('12.345.678'), ('123.456.789');

SELECT SampleId, DottedDigits
FROM @Samples;

-- Query the data.
DECLARE @Target AS VARCHAR(15) = '12345678';

WITH Target AS (
    -- Anchor: take the last group of up to three characters; Remainder holds
    -- whatever is left in front of it ('' when nothing is left).
    SELECT
        CAST(RIGHT(@Target, 3) AS VARCHAR(20)) AS TargetString,
        CAST(LEFT(@Target, CASE WHEN LEN(@Target) > 3 THEN LEN(@Target) - 3 ELSE 0 END) AS VARCHAR(20)) AS Remainder
    UNION ALL
    -- Recursive step: prepend the next group of up to three characters and a dot.
    SELECT
        CAST(RIGHT(Remainder, 3) + '.' + TargetString AS VARCHAR(20)),
        CAST(LEFT(Remainder, CASE WHEN LEN(Remainder) > 3 THEN LEN(Remainder) - 3 ELSE 0 END) AS VARCHAR(20))
    FROM Target
    WHERE Remainder != ''
)
-- To see the intermediate results replace the final select with the line commented out below:
--select TargetString from Target;
SELECT SampleId, DottedDigits
FROM @Samples
-- The row with an empty Remainder holds the fully dotted target string.
WHERE DottedDigits = (SELECT TargetString FROM Target WHERE Remainder = '');
An alternative approach would be to add a indexed computed column to the table that contains Replace( CAMPO, '.', '' ).
If the table containing IDs like 12.345.678 is big (contains many records), I would add a computed column that removes the dots (and, if this ID never contains any characters other than digits and dots and has no significant leading zeros, also cast it to INT or BIGINT), persist it and put an index on it. That way you lose a little time when inserting a record, but you query it at maximum speed and therefore save processor power.

How to extract the n-th value from the text field with delimitors

In SQL table I have a text column with value 'Yellow|Green|Blue' and another column with numeric value. This numeric value defines which part of the text column to be extracted. Values in the text column are separated with '|' separator.
For example:
If numeric value is 0, 1st part of the text field should be extracted: Yellow
If numeric value is 1, 2nd part of the text field should be extracted: Green
And so on.
Is there a way to extract it dynamically?
Meaning without using CASE statement like:
-- Hard-coded mapping of the zero-based index u.UD_2 to a category name
-- (0 = 1st part, 1 = 2nd part, 2 = 3rd part).
case when u.UD_2 = 0 then 'Yellow' when u.UD_2 = 1 then 'Green' when u.UD_2 = 2 then 'Blue' end Kategorie
UPDATE: We are using SQL Server 2016
This should work for you, in the subquery extract each category to separate columns and after it, use a case statement to choose the needed category.
-- Returns the part of val selected by the zero-based index sep
-- (0 = 1st, 1 = 2nd, 2 = 3rd part; any other index yields NULL).
-- Only valid for values made of exactly three parts separated by '|'.
--
-- The pipe positions are computed once and every part is cut relative to
-- them. Cutting the third part at an offset derived from the first pipe only
-- works when the first two parts happen to have matching lengths
-- (e.g. 'Yellow|Green|Blue') and breaks for values such as 'Red|Green|Blue'.
SELECT
    CASE t.sep
        WHEN 0 THEN LEFT(t.val, p1.first_pipe - 1)
        WHEN 1 THEN SUBSTRING(t.val, p1.first_pipe + 1, p2.second_pipe - p1.first_pipe - 1)
        WHEN 2 THEN SUBSTRING(t.val, p2.second_pipe + 1, LEN(t.val))
    END AS Kategorie
FROM #test AS t
CROSS APPLY (SELECT CHARINDEX('|', t.val) AS first_pipe) AS p1
CROSS APPLY (SELECT CHARINDEX('|', t.val, p1.first_pipe + 1) AS second_pipe) AS p2;
Sample data:
-- Fixture: val is a '|'-separated list, sep is the zero-based index of the
-- part to extract.
CREATE TABLE #test (
    val NVARCHAR(500),
    sep INT
);

INSERT INTO #test (val, sep)
VALUES
    ('Yellow|Green|Blue', 0),
    ('Yellow|Green|Blue', 1),
    ('Yellow|Green|Blue', 2);
Note: this only works if there are exact 3 values separated with |
UPDATE
And this is a dynamic way to achieve it, doesn't matter how many categories will be separated:
-- Returns, for every row of #test, the part of val selected by the zero-based
-- index sep, for any number of '|'-separated parts.
--
-- Each val is converted to XML (<M>part</M><M>part</M>...) and the wanted
-- element is addressed by its position inside that row's own XML, so the
-- result does not depend on row-numbering order and rows with different val
-- lists cannot be mixed up. '&', '<' and '>' are escaped first so that such
-- characters do not break the XML cast.
-- An index outside the list yields NULL for that row.
SELECT
    s.parts.value('(/M[position() = sql:column("s.idx")])[1]', 'NVARCHAR(500)') AS Kategorie
FROM (
    SELECT
        sep + 1 AS idx,  -- XQuery positions are 1-based
        CAST(
            '<M>'
            + REPLACE(
                  REPLACE(REPLACE(REPLACE(val, '&', '&amp;'), '<', '&lt;'), '>', '&gt;'),
                  '|', '</M><M>')
            + '</M>' AS XML
        ) AS parts
    FROM #test
) AS s;
One possible approach is to split your text data into substrings and get each substring position.
Starting with SQL Server 2016 you may use STRING_SPLIT() to split a string, but in your case this is not an option, because this function returns a table with all substrings, but they are not ordered and the order of substrings is not guaranteed.
Again, if you use SQL Server 2016+, you may try to transform the text data into a valid JSON array using REPLACE() ('Yellow|Green|Blue' is transformed into '["Yellow","Green","Blue"]') and after that to use OPENJSON() with default schema to retrieve this JSON array as table, which has columns key, value and type (key column contains the index of the element in the specified array).
Input:
-- Fixture: TextValue is a '|'-separated list, IndexValue the zero-based index
-- of the element to look up.
CREATE TABLE #Data
(
    TextValue  NVARCHAR(MAX),
    IndexValue INT
);

INSERT INTO #Data (TextValue, IndexValue)
VALUES ('Yellow|Green|Blue', 0),
       ('Yellow|Green|Blue', 1);
T-SQL:
-- Returns the element of TextValue at the zero-based position IndexValue.
-- The '|'-separated text is rewritten as a JSON array of strings
-- ('a|b' -> '["a","b"]'); OPENJSON then exposes each element with its array
-- index in [key].
-- STRING_ESCAPE(..., 'json') escapes characters such as '"' and '\' first, so
-- values containing them still form valid JSON ('|' itself is not affected).
SELECT
    d.TextValue,
    d.IndexValue,
    j.[value] AS [Value]
FROM #Data AS d
CROSS APPLY OPENJSON(
    CONCAT(N'["', REPLACE(STRING_ESCAPE(d.TextValue, 'json'), N'|', N'","'), N'"]')
) AS j
WHERE CAST(j.[key] AS INT) = d.IndexValue;
Output:
---------------------------------------
TextValue IndexValue Value
---------------------------------------
Yellow|Green|Blue 0 Yellow
Yellow|Green|Blue 1 Green