How to transform path/string by collapsing repeated elements? - google-bigquery

There is a filed in my table that represents pathways like below:
Item1->Item1->Item2-> Item3->Item3->Item3->Item1
In most cases this is quite looong sequence with many instances of same consecutive Items.
How I can shorted above path to something like below? in BigQuery!
Item1(x2)->Item2->Item3(x3)->Item1

I wanted to convince myself that this was possible just through array manipulation (using standard SQL), and I came up with a solution. An alternate way to solve the problem would be to use analytic functions, where you could detect changes in item along the path.
CREATE TEMPORARY FUNCTION PartsToString(
parts_and_offsets ARRAY<STRUCT<part STRING, off INT64>>) AS ((
SELECT
STRING_AGG(
CONCAT(part_and_offset.part,
IF(parts_and_offsets[OFFSET(off + 1)].off - part_and_offset.off = 1,
"",
CONCAT("(x", CAST(parts_and_offsets[OFFSET(off + 1)].off - part_and_offset.off AS STRING), ")"))))
FROM UNNEST(parts_and_offsets) AS part_and_offset WITH OFFSET off
WHERE off + 1 < ARRAY_LENGTH(parts_and_offsets)
));
CREATE TEMPORARY FUNCTION PathwayToParts(pathway STRING) AS ((
SELECT
ARRAY_CONCAT(
ARRAY_AGG(
STRUCT(part, off)),
[STRUCT("" AS part, ARRAY_LENGTH(ANY_VALUE(parts)) AS off)]) AS parts_and_offsets
FROM (SELECT SPLIT(pathway, "->") AS parts),
UNNEST(parts) AS part WITH OFFSET off
WHERE off = 0 OR part != parts[OFFSET(off - 1)]
));
WITH YourTable AS (
SELECT "Item1->Item2->Item2->Item2->Item3->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item2->Item3->Item3->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item4" AS pathway
UNION ALL SELECT "Item1->Item2->Item2->Item3->Item1->Item1->Item1->Item2->Item3->Item3->Item2->Item2->Item2->Item1->Item4" AS pathway
UNION ALL SELECT "Item1->Item1->Item1" AS pathway
UNION ALL SELECT "Item1->Item2->Item2" AS pathway
UNION ALL SELECT "Item1->Item1->Item2" AS pathway
UNION ALL SELECT "Item1->Item2->Item3" AS pathway
)
SELECT PartsToString(PathwayToParts(pathway)) AS parts_string
FROM YourTable;

Using Scalar JS UDF (Standard SQL) <-- would be my choice
CREATE TEMPORARY FUNCTION collapse_repeated(pathway STRING)
RETURNS STRING LANGUAGE js AS """
var items = pathway.split('->');
short = ''; elem = items[0]; count = 0;
for (var i = 0; i < items.length; i++) {
if (items[i] !== elem) {
if (short.length > 0) {short += '->'}
short += elem; if (count > 1) {short += '(x' + count.toString() + ')';}
elem = items[i]; count = 1;
} else {
count++;
}
}
if (short.length > 0) {short += '->'}
short += elem; if (count > 1) {short += '(x' + count.toString() + ')';}
return short;
""";
WITH YourTable AS (
SELECT "Item1->Item2->Item2->Item2->Item3->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item2->Item3->Item3->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item4" AS pathway
UNION ALL SELECT "Item1->Item2->Item2->Item3->Item1->Item1->Item1->Item2->Item3->Item3->Item2->Item2->Item2->Item1->Item4" AS pathway
UNION ALL SELECT "Item1->Item1->Item1" AS pathway
UNION ALL SELECT "Item1->Item2->Item2" AS pathway
)
SELECT collapse_repeated(pathway) AS shorten_pathway, pathway
FROM YourTable
Note: Same JS can be easily “translated” to JS UDF in Legacy SQL
Using Window Functions (Legacy SQL)
SELECT GROUP_CONCAT_UNQUOTED(IF(repeats=1, item, CONCAT(item, "(x", STRING(repeats), ")")), "->"), pathway
FROM (
SELECT MIN(pos) AS ord, MIN(item) AS item, COUNT(1) AS repeats, pathway
FROM (
SELECT item, pos, IFNULL(grp, 0)AS grp, pathway FROM (
SELECT item, pos, SUM(change) OVER(PARTITION BY pathway ORDER BY pos ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) AS grp, pathway
FROM (
SELECT item, pos, IF(item=next_item, 0, 1) AS change, pathway FROM (
SELECT item, pos, LEAD(item) OVER(PARTITION BY pathway ORDER BY pos) AS next_item, pathway
FROM (
SELECT item, POSITION(item) AS pos, pathway FROM (
SELECT SPLIT(pathway, "->") AS item, pathway FROM
(SELECT "Item1->Item2->Item2->Item2->Item3->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item2->Item3->Item3->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item4" AS pathway),
(SELECT "Item1->Item2->Item2->Item3->Item1->Item1->Item1->Item2->Item3->Item3->Item2->Item2->Item2->Item1->Item4" AS pathway),
(SELECT "Item1->Item1->Item1" AS pathway),
(SELECT "Item1->Item2->Item2" AS pathway)
)
)
)
)
)
)
GROUP BY grp, pathway
ORDER BY ord
)
GROUP BY pathway

Related

sql server, replace chars in string with values in table

how can i replace values in string with values that are in a table?
for example
select *
into #t
from
(
select 'bla'c1,'' c2 union all
select 'table'c1,'TABLE' c2 union all
select 'value'c1,'000' c2 union all
select '...'c1,'' c2
)t1
declare #s nvarchaR(max)='this my string and i want to replace all values that are in table #t'
i have some values in my table and i want to replace C1 with C2 in my string.
the results should be
this my string and i want to replace all 000 that are in TABLE #t
UPDATE:
i solved with a CLR
using System;
using Microsoft.SqlServer.Server;
using System.Data.SqlTypes;
using System.Data.Linq;
namespace ReplaceValues
{
public partial class Functions
{
[SqlFunction
(
//DataAccess = DataAccessKind.Read,
SystemDataAccess = SystemDataAccessKind.Read
)
]
public static string ReplaceValues(string row, string delimitator, string values, string replace/*, bool CaseSensitive*/)
{
//return row;
string[] tmp_values = values.Split(new string[] { delimitator }, StringSplitOptions.None);
string[] tmp_replace = replace.Split(new string[] { delimitator }, StringSplitOptions.None);
row = row.ToUpper();
for (int i = 0; i < Math.Min(tmp_values.Length, tmp_replace.Length); i++)
{
row = row.Replace(tmp_values[i].ToUpper(), tmp_replace[i]);
}
return row;
}
}
}
and then
select *
into #t
from
(
select 'value1'OldValue,'one'NewValue union all
select 'value2'OldValue,'two'NewValue union all
select 'value3'OldValue,'three'NewValue union all
select 'value4'OldValue,'four'NewValue
)t1
select dbo.ReplaceValues(t1.column,'|',t2.v,t2.r)
from MyTable t1
cross apply
(
select dbo.inlineaggr(i1.OldValue,'|',1,1)v,
dbo.inlineaggr(i1.NewValue,'|',1,1)r
from #t i1
)t2
i have to improved it to manage better the case sensitive, but performance are not bad.
(also 'inlineaggr' is a CLR i wrote years ago)
You can do this via recursion. Assuming you have a table of find-replace pairs, you can number the rows and then use recursive cte:
create table #t(c1 nvarchar(100), c2 nvarchar(100));
insert into #t(c1, c2) values
('bla', ''),
('table', 'table'),
('value', '000'),
('...', '');
declare #s nvarchar(max) = 'this my string and i want to replace all values that are in table #t';
with ncte as (
select row_number() over (order by (select null)) as rn, *
from #t
), rcte as (
select rn, replace(#s, c1, c2) as newstr
from ncte
where rn = 1
union all
select ncte.rn, replace(rcte.newstr, ncte.c1, ncte.c2)
from ncte
join rcte on ncte.rn = rcte.rn + 1
)
select *
from rcte
where rn = 4

Applying BigQuery javascript UDF to groups

I am trying to apply a javascript user defined function to groups. In the following code, --group by my_group within tuple is commented out. I want to apply the temp function test on every my_group within test_data. The code runs if group by is commented out. If I try to include group by, it produces a "scalar subquery produced more than one element". What change should I make so that I can output an array per group (my_group)?
#standardSQL create function
CREATE TEMP FUNCTION test(a ARRAY<STRING>)
RETURNS ARRAY< STRING >
LANGUAGE js AS '''
var combine = function(a) {
var fn = function(n, src, got, all) {
if (n == 0) {
if (got.length > 0) {
all[all.length] = got;
} return;
}
for (var j = 0; j < src.length; j++) {
fn(n - 1, src.slice(j + 1), got.concat([src[j]]), all);
} return;
}
var all = [];
for (var i = 1; i < a.length; i++) {
fn(i, a, [], all);
}
all.push(a);
return all;
}
return combine(a)
''';
WITH test_data AS (
SELECT 'Shirt' item, 'Cashless' my_group UNION ALL
SELECT 'Jeans', 'Cashless' UNION ALL
SELECT 'Jeans', 'Cash' UNION ALL
SELECT 'Cap', 'Cash' UNION ALL
SELECT 'Shirt', 'Cash' UNION ALL
SELECT 'Cap', 'Cashless'
),
tuple as (
SELECT ARRAY_AGG(DISTINCT item) items
FROM test_data
--group by my_group (uncommenting it creates error)
)
select * from unnest(test((select items from tuple)))
I am looking for an output like the following:
my_group Item
Cash Shirt
Cash Jeans
Cash Cap
Cash Shirt,Jeans
Cash Shirt,Cap
Cash Jeans,Cap
Cash Shirt,Jeans,Cap
Cashless Shirt
Cashless Jeans
Cashless Cap
Cashless Shirt,Jeans
Cashless Shirt,Cap
Cashless Jeans,Cap
Cashless Shirt,Jeans,Cap
Consider below (omitting the function piece to keep answer compact enough ...)
WITH test_data AS (
SELECT 'Shirt' item, 'Cashless' my_group UNION ALL
SELECT 'Jeans', 'Cashless' UNION ALL
SELECT 'Jeans', 'Cash' UNION ALL
SELECT 'Cap', 'Cash' UNION ALL
SELECT 'Shirt', 'Cash' UNION ALL
SELECT 'Cap', 'Cashless'
), tuple as (
SELECT my_group, ARRAY_AGG(DISTINCT item) items
FROM test_data
group by my_group
)
select my_group, item
from tuple,
unnest(test(items)) item
with output

DAX expression for ROW_NUMBER() PARTITION BY ORDER BY equivalent

I have a SQL statement like this:
(ROW_NUMBER() OVER (PARTITION BY a.[market], [MEASURE_TYPE]
ORDER BY AM, REP, ORDER_KEY)) AS ORDER_KEY
I want to write a DAX to implement the above SQL statement.
This is not as simple in DAX as in SQL. Here is an example:
Order Key Within Partition =
VAR CurrentMarket = [Market]
VAR CurrentMeasureType = [MeasureType]
VAR CurrentAM = [AM]
VAR CurrentREP = [REP]
VAR CurrentOrderKey = [OrderKey]
VAR CurrentPartition = FILTER (
a, -- the table name
[Market] = CurrentMarket
&& [MeasureType] = CurrentMeasureType
)
RETURN SUMX (
CurrentPartition,
IF (
ISONORAFTER (
CurrentAM, [AM], ASC,
CurrentREP, [REP], ASC,
CurrentOrderKey, [OrderKey], ASC
),
1
)
)
EDIT: Power Query would be better to achieve this.
let
/* Steps so far */
Source = ...,
...
a = ...,
/* End of steps so far */
/* Add steps below to add Order Key Within Partition column */
Partitions = Table.Group(
a,
{"Market", "MeasureType"}, {"Partition", each _}
)[Partition],
AddedOrderKeys = List.Transform(
Partitions,
each Table.AddIndexColumn(
Table.Sort(_, {"AM", "REP", "OrderKey"}),
"Order Key Within Partition",
1
)
),
Result = Table.Combine(AddedOrderKeys)
in
Result
I contribute with an alternative solution to the RANKX. The answer containing the Power Query is the correct one because avoid using calculated columns.
Sales[Sequence by Customer] =
VAR CurrentDate = Sales[Date]
VAR CurrentTime = Sales[Time]
RETURN COUNTROWS (
    FILTER (
        CALCULATETABLE (
            Sales,
            ALLEXCEPT ( Sales, Sales[Customer] )
        ),
        Sales[Date] < CurrentDate
          || ( Sales[Date] = CurrentDate
               && Sales[Time] <= CurrentTime )
    )
)
Source

Convert SQL to Linq select in where

I have three table below:
TABLE_PRODUCT (IdProduct, ProductName, ProductUnit)
TABLE_STORE_HOUSE (IdContain, IdProduct, ProductNumber, TimeInput)
TABLE_SELL (IdSell, IdContain, ProductNumberSell, TimeSell)
Current, How to using LinQ query get TABLE_STORE_HOUSE.IdProduct witch condition TABLE_STORE_HOUSE.ProductNumber - Sum(TABLE_SELL.ProductNumberSell) > 0 and TABLE_STORE_HOUSE.TimeInput is smallest
Help me convert Sql to Linq..............
select top 1 IdContain
from
TABLE_STORE_HOUSE
where IdProduct = '6'
and
ProductNumber - (select sum(ProductNumber)
from TABLE_SELL
Where TABLE_SELL.IdContain = IdContain)> 0
order by TimeInput desc;
Can you try this?
from t in TABLE_STORE_HOUSEs
let TSell = (
from s in TABLE_SELLs
where s.IdContain == t.IdContain
orderby s.ProductNumber
select new {
s.ProductNumber
}
)
where t.IdProduct == 6 && (t.ProductNumber - TSell.Sum(si => si.ProductNumber)) > 0
select new { t.IdContain }
for top 1 you can use Take() function.

Flatten association table to multi-value column?

I have a table with just product ID's and category ID's (products can be in more than one category). How can I flatten the category ID's into a product column so I end us with this:
id | name | desc | categories
1 | test1 | lorem | 1,3,4,23
2 | test2 | ipsom | 4,6,24
It is like I need to loop into a separate table for the categories column. How can I do this or is there a better way?
I created an CLR aggregate function that takes a varchar column and returns all its values separated by commas. In other words, it joins several strings into a comma-separated list. I am sure its performance is way better than any T-Sql trick.
As any aggregate function, it can be used in combination with group by. For example:
SELECT id, name, desc, JoinStrings(CONVERT(VARCHAR(20), category_id))
FROM product p
INNER JOIN category_products c ON p.category_id = c.category_id
GROUP BY id, name, desc
Here's the C# code to create the CLR assembly into Sql Server 2008:
using System;
using System.Data;
using System.Data.SqlClient;
using System.Data.SqlTypes;
using Microsoft.SqlServer.Server;
[Serializable]
[Microsoft.SqlServer.Server.SqlUserDefinedAggregate(Format.UserDefined, IsInvariantToDuplicates=false, IsInvariantToOrder=false, IsInvariantToNulls=true, MaxByteSize=-1)]
public struct JoinStrings : IBinarySerialize
{
private char[] sb;
private int pos;
public void Init()
{
sb = new char[512000];
pos = 0;
}
public void Accumulate(SqlString Value)
{
if (Value.IsNull) return;
char[] src = Value.ToString().ToCharArray();
Array.Copy(src, 0, sb, pos, src.Length);
pos += src.Length;
sb[pos] = ',';
pos++;
}
public void Merge(JoinStrings Group)
{
Accumulate(Group.Terminate());
}
public SqlString Terminate()
{
if (pos <= 0)
return new SqlString();
else
return new SqlString(new String(sb, 0, pos-1));
}
public void Read(System.IO.BinaryReader r)
{
this.Init();
pos = r.ReadInt32();
r.Read(sb, 0, pos);
}
public void Write(System.IO.BinaryWriter w)
{
w.Write(pos);
w.Write(sb, 0, pos);
}
}
Here's the code to create the function (although deploying from Visual Studio should do it automatically):
CREATE AGGREGATE [dbo].[JoinStrings]
(#s [nvarchar](4000))
RETURNS[nvarchar](max)
EXTERNAL NAME [YouAssemblyName].[JoinStrings]
There's no in-built way to do it in MSSQL.
Simulating group_concat MySQL function in Microsoft SQL Server 2005? has a good description of how to go about implementing a workaround.
I would suggest using a Recursive CTE. I believe that it would be something like this:
select productid, categoryid,
row_number() over (partition by id order by categoryid) as rownum
into #tabletorecurse
from TABLENAME
with finaloutput as
(
select productid as id, name, desc, categoryid as categories, rownum
from #tabletorecurse
join PRODUCTTABLE
on PRODUCTTABLE.id = #tabletorecurse.productid
where rownum = 1
union all
select tr.id, tr.name, tr.desc,
finaloutput.categories + ', ' + tr.categoryid, tr.rownum
from #tabletorecurse as tr
join finaloutput
on finaloutput.rownum + 1 = tr.rownum
and finaloutput.id = tr.productid
)
select id, name, desc, categories
from finaloutput
join
(
select max(rownum) as maxrow, id
from finaloutput
group by id
) as maxvalues
on maxvalues.id = finaloutput.id
and maxvalues.maxrow = finaloutput.rownum
Use a function.
This does a lookup to text so you will need to adapt.
The COALESCE is just to put a ,.
This is from a large scale production application - it works and it fast.
Function was questioned by JustinPony as function is slow
I am hitting some tables of million of records but only returning 100 rows.
The function is only applied to the hundred rows.
usage:
select top 5 sID, ( select [dbo].[JoinMVEnum](docSVsys.sID, '140') ) as [Flag Issue]
from docSVsys
function
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE FUNCTION [dbo].[JoinMVText]
(
#sID int,
#fieldID tinyint
)
RETURNS VARCHAR(MAX)
AS
BEGIN
DECLARE #MVtextList varchar(max)
SELECT #MVtextList = COALESCE(#MVtextList + '; ', '') + docMVtext.value
FROM docMVtext with (nolock)
WHERE docMVtext.sID = #sID and fieldID = #fieldID
RETURN #MVtextList
END
GO