Create 10 Million products in SQL - sql

I'm trying to add 10 million products to my products table.
That was my attempt so far:
-- Inserts ONE Artikel row per batch; every column value comes from its own scalar subquery.
INSERT INTO Artikel (Hersteller, Artikelnummer, Artikelnamen, Artikelbeschreibung, Preis)
VALUES (
-- Hersteller: copied from an existing Artikel row picked by ordering on NEWID().
(SELECT TOP 1 Hersteller FROM Artikel ORDER BY NEWID()) ,
-- Artikelnummer: FLOOR(RAND() * 99999999900) + 101.
(SELECT FLOOR(RAND() * (100000000000-101 + 1)) + 101 ) ,
-- Artikelnamen / Artikelbeschreibung: a NEWID() value with the dashes stripped.
(SELECT REPLACE(NEWID(),'-','')),
(SELECT REPLACE(NEWID(),'-','')),
-- Preis: RAND seeded with CHECKSUM(NEWID()), scaled by 9999 and rounded to 2 decimals.
(SELECT ROUND(RAND(CHECKSUM(NEWID())) * (9999), 2))
)
-- Re-runs the batch above 10,000,000 times, i.e. ten million separate single-row inserts.
GO 10000000
But this takes forever. After ~45 minutes my query was nowhere near 200K values. Are there any faster/more efficient solutions?

What you actually want is unclear; however, you could likely get very good performance by doing this in only a couple of batches. I don't understand why you are reading a value (Hersteller) from your table (Artikel) only to insert it into the same table again, but I've incorporated that anyway.
This does the INSERT in 2 batches of 5,000,000:
-- Generates Artikel rows set-based instead of one INSERT per batch.
-- Each execution inserts 5,000,000 rows; GO 2 runs the batch twice (10,000,000 rows in total).
WITH N AS(
    -- Ten placeholder rows; only the row count matters.
    SELECT N
    FROM (VALUES(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL))N(N)),
Tally AS(
    SELECT ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) AS I
    FROM N N1
         CROSS JOIN N N2
         CROSS JOIN N N3
         CROSS JOIN N N4
         CROSS JOIN N N5
         CROSS JOIN N N6
         CROSS JOIN N N7), --10,000,000 rows
Dataset AS(
    -- Multiplies the existing Hersteller values by the tally and keeps 5,000,000 of them.
    SELECT TOP (5000000)
           A.Hersteller
    FROM dbo.Artikel A
         CROSS JOIN Tally T)
-- Explicit column list so the statement does not depend on the table's column order.
INSERT INTO dbo.Artikel (Hersteller, Artikelnummer, Artikelnamen, Artikelbeschreibung, Preis)
SELECT D.Hersteller,
       FLOOR(RAND() * (100000000000-101 + 1)) + 101, --This'll be the same for every row, is that intended?
       REPLACE(NEWID(),'-',''),
       REPLACE(NEWID(),'-',''),
       ROUND(RAND(CHECKSUM(NEWID())) * (9999), 2) --Seeded from NEWID(), so this one does differ per row
FROM Dataset D;
GO 2
Note my comment about RAND, and that it'll produce the same value on every row (within the batch). If that isn't desired then see this post about making a random number per row: How do I generate a random number for each row in a T-SQL select?

Related

Count Similar Substrings SQL query

I've tried a few scenarios and googled a lot, but still can't find a solution.
I have a table of user names with entries something like the below:
UserName
Cakes420
18Jack01
18Jack04
16Jack22
22Jack16
Mapple7609
Chrom44
chrom22
chrom77
013Cake
016Cake
122Cake
123Cake87
So I need a query that checks for all records that share 4 or more (in sequence) characters in the table.
So I need to return something like :
Characters
Times Used
Names Sharing
Cake
5
Cakes420, 013Cake, 016Cake, 122Cake, 123Cake87
Chro
3
Chrom44, chrom22, chrom77
or anything similar as I'd prefer not to repeat patterns, but hey, at this stage if it returns the values properly, I don't mind.
The shared characters can naturally appear in any place in the string, which is what makes this so difficult.
Should you do this in T-SQL? Probably not.
Can you do this in T-SQL? Yes.
Sample data
-- Sample user names used to demonstrate the shared-substring search.
CREATE TABLE Names
(
    Name NVARCHAR(20)
);

INSERT INTO Names (Name)
VALUES ('Cakes420'),
       ('18Jack01'),
       ('18Jack04'),
       ('16Jack22'),
       ('22Jack16'),
       ('Mapple7609'),
       ('Chrom44'),
       ('chrom22'),
       ('chrom77'),
       ('013Cake'),
       ('016Cake'),
       ('122Cake'),
       ('123Cake87');
Solution
Using STRING_AGG() for easy concatenation. Available from SQL Server 2017. Alternatives available for older SQL versions (use the search box on this site, there are many examples).
-- Lists every 4-character part shared by more than one name, with the names that contain it.
with rcte as
(
    -- Anchor: the first 4-character window of every name that is long enough.
    select n.Name,
           convert(nvarchar(4), substring(n.Name, 1, 4)) as Part,
           1 as PartFrom
    from Names n
    where len(n.Name) >= 4
    union all
    -- Recursive step: slide the window one character to the right.
    -- SUBSTRING's third argument is a length, not an end position, so it stays 4.
    select r.Name,
           convert(nvarchar(4), substring(r.Name, r.PartFrom+1, 4)),
           r.PartFrom+1
    from rcte r
    where len(r.Name) >= r.PartFrom+4 -- stop once the next window would run past the end
),
cte_count as
(
    -- Parts that occur more than once across all windows.
    select r.Part,
           count(1) as PartCount
    from rcte r
    where r.Part not like '%[0-9]%' -- exclude parts with numbers in them
    group by r.Part
    having count(1) > 1
)
select c.Part,
       c.PartCount,
       string_agg(r.Name, ', ') as Names
from cte_count c
inner join rcte r
    on r.Part = c.Part
group by c.Part,
         c.PartCount
order by c.Part;
Result
Part PartCount Names
---- --------- ----------------------------------------------
Cake 5 Cakes420, 123Cake87, 122Cake, 016Cake, 013Cake
Chro 3 Chrom44, chrom22, chrom77
hrom 3 chrom77, chrom22, Chrom44
Jack 4 22Jack16, 16Jack22, 18Jack04, 18Jack01
Fiddle to see it in action with the intermediate CTE results.
Let's use Itzik Ben-Gan's Tally Function to break out a list of substrings, then group them. This is called N-Gram, after the more common Trigram which is 3-character substrings.
I've removed one extra cross-join from the function to speed it up slightly, it's now good for up to varchar(65536):
-- Inline tally function: returns the integers 1..@num as column rn.
-- L0 has 16 rows, L1 = 16*16 = 256, L2 = 256*256 = 65,536, so at most 65,536 numbers are available.
CREATE OR ALTER FUNCTION dbo.GetNums(@num AS BIGINT)
RETURNS TABLE
AS
RETURN
    WITH
    L0 AS ( SELECT 1 AS c
            FROM (VALUES(1),(1),(1),(1),(1),(1),(1),(1),
                        (1),(1),(1),(1),(1),(1),(1),(1)) AS D(c) ),
    L1 AS ( SELECT 1 AS c FROM L0 AS A CROSS JOIN L0 AS B ),
    L2 AS ( SELECT 1 AS c FROM L1 AS A CROSS JOIN L1 AS B ),
    Nums AS ( SELECT ROW_NUMBER() OVER(ORDER BY (SELECT NULL)) AS rownum
              FROM L2 )
    -- T-SQL parameters are prefixed with @ (a # prefix is not valid for a parameter).
    SELECT TOP(@num)
           rownum AS rn
    FROM Nums
    ORDER BY rownum;
GO
-- Groups user names by every substring of length @substringLen that they share.
DECLARE @substringLen int = 4;
SELECT
    Characters,
    [Times Used] = COUNT(*),
    [Names Sharing] = STRING_AGG(Username, ', ')
FROM (
    SELECT DISTINCT
        -- remove DISTINCT if you want to know about multiple in a single username
        t.Username,
        Characters = SUBSTRING(t.Username, n.rn, @substringLen)
    FROM myTable t
    -- One row per window start position. Names shorter than the window get 0 rows;
    -- without the CASE the TOP inside GetNums would receive a negative value and fail.
    CROSS APPLY dbo.GetNums(
        CASE WHEN LEN(t.UserName) >= @substringLen
             THEN LEN(t.UserName) - @substringLen + 1
             ELSE 0
        END) n
) t
GROUP BY t.Characters
HAVING COUNT(*) > 1;

Generate group code between two values of the group with SQL

I have a big issue: I need to generate a code for every value in the range between two existing columns (CodeFrom / CodeTo), as in the following screenshots:
Input :
estimated Output :
Any ideas would surely help. Thanks
In SQL Server, you can use a recursive CTE:
-- Expands each (codefrom, codeto) range of t into one row per code.
WITH code_range AS (
    -- Anchor: every range starts at its own codefrom.
    SELECT t.codefrom,
           t.codeto,
           t.town,
           t.codefrom AS code
    FROM t
    UNION ALL
    -- Step: emit the next code until codeto has been reached.
    SELECT cr.codefrom,
           cr.codeto,
           cr.town,
           cr.code + 1
    FROM code_range AS cr
    WHERE cr.code < cr.codeto
)
SELECT *
FROM code_range;
SQL Server has a built-in default recursion limit of 100. So, if you might be generating more than 100 codes, then add option (maxrecursion 0).
As I mentioned in the comments under Gordon's answer, use a Tally for this. Tallies are faster by far (especially with larger datasets) and don't suffer from the max recursion error, as they aren't recursive:
-- Expands each (CodeFrom, CodeTo) range into one row per code using a tally instead of recursion.
WITH Digits AS (
    -- Ten placeholder rows; only the row count matters.
    SELECT D.N
    FROM (VALUES (NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL)) AS D(N)
),
Tally AS (
    SELECT ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) AS I
    FROM Digits AS D1
         CROSS JOIN Digits AS D2
         CROSS JOIN Digits AS D3 --1,000 rows, Add more N for more rows
)
SELECT YT.CodeFrom,
       YT.CodeTo,
       YT.Town,
       T.I AS Code
FROM (VALUES (1, 7, 'Paris'),
             (14, 17, 'Sao Paulo')) AS YT(CodeFrom, CodeTo, Town)
     INNER JOIN Tally AS T
         ON T.I BETWEEN YT.CodeFrom AND YT.CodeTo;

Is there a better way to retrieve a random row from an Oracle table?

Not so long ago I needed to fetch a random row from a table in an Oracle database. The most widespread solution that I've found was this:
-- Sorts every matching row by a random key and keeps the first one.
-- (tabela / warunek look like placeholders for the table and the filter condition.)
SELECT * FROM
( SELECT * FROM tabela WHERE warunek
ORDER BY dbms_random.value )
WHERE rownum = 1
However, this is very performance heavy for large tables, as it sorts the table in random order first, then grabs the first row.
Today, one of my colleagues suggested a different way:
-- Keeps the first N rows of MAIN_PRODUCT, where N = COUNT(*) * dbms_random.value cast to INTEGER,
-- orders them by ROWNUM descending and returns the top one (the N-th row).
-- NOTE(review): N can come out as 0, in which case no row is returned — confirm that is acceptable.
SELECT * FROM (
SELECT * FROM MAIN_PRODUCT
WHERE ROWNUM <= CAST((SELECT COUNT(*) FROM MAIN_PRODUCT)*dbms_random.value AS INTEGER)
ORDER BY ROWNUM DESC
) WHERE ROWNUM = 1;
It works way faster and seems to return random values, but does it really? Could someone give an insight into whether it is really random and behaves the way as expected? I'm really curious why I haven't found this approach anywhere else while looking, and if it is indeed random and way better performance wise, why isn't it more widespread?
This is (possibly) the simplest query that gets the result.
But the SELECT COUNT(*) FROM MAIN_PRODUCT will scan the table; I doubt you can get a query which does not do that.
P.S. This query assumes there are no deleted records.
Query
-- Picks one random row of MAIN_PRODUCT by position.
-- ROWNUM is first exposed as column rn in an inline view: a direct "WHERE ROWNUM = n"
-- predicate is never true for n > 1, because ROWNUM only advances for rows that
-- have already passed the filter.
-- Note: the result carries the extra rn column.
SELECT *
FROM
(SELECT mp.*, ROWNUM AS rn FROM MAIN_PRODUCT mp)
WHERE
rn = (
-- Random position between 1 and COUNT(*). Kept in a single uncorrelated scalar
-- subquery so it is presumably evaluated once per statement — verify in the plan.
SELECT FLOOR(dbms_random.value * COUNT(*)) + 1 FROM MAIN_PRODUCT
)
FLOOR(
(dbms_random.value * (SELECT COUNT(*) FROM MAIN_PRODUCT)) + 1
)
This will generate a number between 1 and the row count of the table; see the demo for how that works when you refresh it.
Oracle12c+ Query
-- Oracle 12c+ variant: keeps the first N rows (N = FLOOR(dbms_random.value * COUNT(*) + 1)),
-- orders them by ROWNUM descending and fetches only the first row of that ordering.
-- NOTE(review): dbms_random.value sits in a row-level predicate; confirm it is not re-evaluated per row.
SELECT *
FROM
MAIN_PRODUCT
WHERE
ROWNUM <= FLOOR(
(dbms_random.value * (SELECT COUNT(*) FROM MAIN_PRODUCT)) + 1
)
ORDER BY
ROWNUM DESC
FETCH FIRST ROW ONLY
The second code you have
-- Quoted from the question: limit to the first N rows (N = COUNT(*) * dbms_random.value
-- cast to INTEGER), then return the one with the highest ROWNUM.
SELECT * FROM (
SELECT * FROM MAIN_PRODUCT
WHERE ROWNUM <= CAST((SELECT COUNT(*) FROM MAIN_PRODUCT)*dbms_random.value AS INTEGER)
ORDER BY ROWNUM DESC
) WHERE ROWNUM = 1;
is excellent, except that it will get subsequent elements. dbms_random.value returns a real number between 0 and 1. Multiplying it by the number of rows gives you a genuinely random position, and the bottleneck here is counting the rows rather than generating a random value for each row.
Proof
Consider the
0 <= x < 1
number. If we multiply it with n, we get
0 <= n * x < n
which is exactly what you need if you want to load a single element. The reason this is not widespread is that in many cases the performance issues are not felt due to only a few thousands of records.
EDIT
If you would need k number of records, not just the first one, then it would be slightly difficult, however, still solvable. The algorithm would be something like this (I do not have Oracle installed to test it, so I only describe the algorithm):
// Sketch only (untested): draws k values, one per loop iteration, each from a range
// that shrinks by one; the commented steps describe shifting each draw past values
// chosen earlier. The steps that store newValue into randomized are not spelled out.
randomize(n, k)
randomized <- empty_set
while (k > 0) do
newValue <- random(n)
n <- n - 1
k <- k - 1
//find out how many elements are lower than newValue
//increase newValue with that amount
//find out if newValue became larger than some values which were larger than new value
//increase newValue with that amount
//repeat until there is no need to increase newValue
while end
randomize end
If you randomize k elements from n, then you will be able to use those values in your filter.
The key to improving performance is to lessen the load of the ORDER BY.
If you know about how many rows match the conditions, then you can filter before the sort. For instance, the following takes about 1% of the rows:
-- Keeps roughly 1% of the matching rows (dbms_random.value < 0.01) before sorting,
-- so the random ORDER BY works on a much smaller set; returns the first row of that sort.
SELECT *
FROM (SELECT *
FROM tabela
WHERE warunek AND dbms_random.value < 0.01
ORDER BY dbms_random.value
)
WHERE rownum = 1 ;
A variation is to calculate the number of matching values. Then randomly select a smaller sample. The following gets about 100 matching rows and then sorts them for the random selection:
-- Counts the matching rows (cnt), keeps each with probability 100 / cnt
-- (about 100 rows survive), sorts that sample randomly and returns its first row.
SELECT picked.*
FROM (SELECT *
FROM (SELECT src.*, COUNT(*) OVER () as cnt
FROM tabela src
WHERE warunek
) counted
WHERE dbms_random.value < 100 / cnt
ORDER BY dbms_random.value
) picked
WHERE rownum = 1 ;

SQL Server 2008: duplicate a row n-times, where n is a value in a field

In SQL Server 2008 I have three tables:
T1 (idService, dateStart, dateStop)
T2 (idService, totalCostOfService)
T3 (idService, companyName)
Using joins, I created a view:
V1 (idService, dateStart, dateStop, totalCostOfService, companyName)
And we are fine. I can do my selects on the view and obtain the list of services done.
What I would like to do now is to duplicate every row of the view n times, where n=dateStart-dateStop; every row should have a "new" totalCostOfService = totalCostOfService/n.
I can do that using a temporary table, declaring variables, insert in temp using some while etc. etc. Let's call it "the procedure"
But what I would like to understand is:
Is it possible to do that directly with a select on V1? If not, is it possible to save "the procedure" as a view so that I can use it as an easy select?
Sorry if my question looks somewhat stupid, but I'm totally new to SQL. I tried searching here and on Google, but I couldn't find an answer to my questions.
Thank you!
Rather than an rCTE (which is RBAR), you could use a Tally Table:
-- Repeats every row of YourTable once per tally value between 0 and (dateStop - dateStart).
WITH N AS (
    -- Ten placeholder rows; only the row count matters.
    SELECT N
    FROM (VALUES(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL)) N(N)),
Tally AS(
    -- Zero-based sequence: ROW_NUMBER() - 1.
    SELECT ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) -1 AS I
    FROM N N1
    CROSS JOIN N N2 --100
    CROSS JOIN N N3 --1000
    CROSS JOIN N N4) --10000
SELECT *
FROM YourTable
-- Stop minus start: the reverse is negative whenever the stop is later, which would match no tally rows.
JOIN Tally T ON T.I <= dateStop-dateStart --Assumes dateStart and dateStop are integer values, even though their name implies otherwise
--If they are dates, then use DATEDIFF(DAY, dateStart, dateStop)
That tally will generate numbers up to 10000 (which is over 27 years' worth of days; that should be far more than enough).
I will assume the existence of a numbers table which has the column val for the individual value numbers. If you don't, you will find plenty by searching around.
Add this in the end of the FROM clause of your view:
cross apply (select datediff(day,T1.dateStart,T1.dateStop)+1 as n_days) q1 -- number of days INCLUDING start
-- One row per charged day. DATEADD takes (datepart, number, date), so the offset n.val comes before the date.
cross apply (select dateadd(day,n.val,T1.dateStart) as day_of_charge from numbers n where n.val between 0 and q1.n_days-1) q2
Then you will be able to have the following field on your SELECT:
T2.totalCostOfService/n_days as totalCostOfService
I'll add a numbers table solution shortly.
You can use a recursive CTE:
-- Expands every service of v1 into one row per day from dateStart to dateStop (inclusive),
-- each carrying an equal share of totalCostOfService.
with cte as (
    -- Anchor: first day of each service. DATEDIFF takes (datepart, start, end),
    -- so start comes first to get a positive day count.
    -- NOTE(review): if totalCostOfService is an integer type the division truncates — confirm the column type.
    select idService, dateStart, dateStop,
           totalCostOfService / (datediff(day, dateStart, dateStop) + 1) as dailyCostOfService,
           companyName
    from v1
    union all
    -- Step: advance one day at a time.
    select idService,
           dateadd(day, 1, dateStart),
           dateStop,
           dailyCostOfService,
           companyName
    from cte
    where dateStart < dateStop -- terminate once the last day has been emitted
)
select idservice, dateStart as dateOfService,
       dailyCostOfService, companyName
from cte;
Note that if there are more than 100 days in any row, then you will need to add OPTION (MAXRECURSION 0).

Find the longest sequence of a value in a table

This is an SQL Question, I think it is difficult one - I'm not sure it is possible to achieve in a simple SQL sentence or a stored procedure:
I want to find the number of the longest sequence of the same (known) number in a column in a table:
example:
TABLE:
DATE SALEDITEMS
1/1/09 4
1/2/09 3
1/3/09 3
1/4/09 4
1/5/09 3
Calling the sp/sentence for 4 will give 1; calling the sp/sentence for 3 will give 2,
as there was 2 times in a row number 3.
I'm running SQL server 2008.
UPDATE: I generated a million rows of random data, and abandoned the recursive CTE solution, as its query plan didn't make good use of indexes in the optimizer.
But the non-recursive solution I originally posted turned out to work great, as long as there was an additional non-clustered index on (SALEDITEMS, [DATE]). This makes sense, since the query needs to filter in both directions (both by date and by SALEDITEMS). With this additional index, queries on a million rows return in under 2 seconds on my (not very beefy) desktop machine. Without this index, the query was dog-slow.
BTW, this is a great example of how SQL Server's cost-based query optimization totally breaks down in some cases. The recursive CTE solution has a cost (on my PC) of 42 and takes at least several minutes to finish. The non-recursive solution has a cost of 15,446 (!!!) and completes in 1.5 seconds. Moral of the story: when comparing SQL Server query plans, don't assume that cost necessarily correlates to query performance!
Anyway, here's the solution I'd recommend (the same non-recursive CTE I posted earlier) :
-- Returns the length of the longest run of consecutive dates whose SALEDITEMS equals @SALEDITEMS.
DECLARE @SALEDITEMS INT = 3;
WITH SalesNoMatch ([DATE], SALEDITEMS, NoMatchDate)
AS
(
    -- For every row: the first later date whose value differs from the target
    -- (NULL when every later row matches).
    SELECT [DATE], SALEDITEMS,
           (SELECT MIN([DATE]) FROM Sales s2 WHERE s2.SALEDITEMS <> @SALEDITEMS
            AND s2.[DATE] > s1.[DATE]) as NoMatchDate
    FROM Sales s1
)
, SalesMatchCount ([DATE], ConsecutiveCount) AS
(
    -- For every matching row: itself plus the rows between it and the next non-match.
    -- A NULL NoMatchDate means the run reaches the end of the table, so all later rows count;
    -- comparing against NULL directly would count none of them.
    SELECT [DATE], 1+(SELECT COUNT(1) FROM Sales s2 WHERE s2.[DATE] > s1.[DATE]
                      AND (s1.NoMatchDate IS NULL OR s2.[DATE] < s1.NoMatchDate))
    FROM SalesNoMatch s1
    WHERE s1.SALEDITEMS = @SALEDITEMS
)
SELECT MAX(ConsecutiveCount)
FROM SalesMatchCount;
Here's the DDL I used to test this, including indexes you'll need:
-- One row per calendar date with the number of items sold on that date.
CREATE TABLE [Sales]
(
    [DATE]       date NOT NULL,
    [SALEDITEMS] int  NOT NULL
);

-- Clustered on the date: supports the "later rows" range lookups.
CREATE UNIQUE CLUSTERED INDEX IX_Sales
    ON Sales ([DATE]);

-- Value first, then date: supports filtering by SALEDITEMS and by date together.
CREATE UNIQUE NONCLUSTERED INDEX IX_Sales2
    ON Sales (SALEDITEMS, [DATE]);
And here's how I created my test data-- 1,000,001 rows with ascending dates with SALEDITEMS randomly set between 1 and 10.
-- Seeds Sales with one starting row, then appends 1,000,000 rows, each dated one day
-- after the current maximum date with SALEDITEMS = ABS(CHECKSUM(NEWID())) % 10 + 1 (1..10).
INSERT INTO Sales ([DATE], SALEDITEMS)
VALUES ('1/1/09', 5);

DECLARE @i int = 0;
WHILE (@i < 1000000)
BEGIN
    INSERT INTO Sales ([DATE], SALEDITEMS)
    SELECT DATEADD (d, 1, (SELECT MAX ([DATE]) FROM Sales)), ABS(CHECKSUM(NEWID())) % 10 + 1;

    SET @i = @i + 1;
END;
Here's the recursive-CTE solution that I abandoned:
DECLARE @SALEDITEMS INT = 3;
-- Recursive CTE solution: numbers the rows by date, then extends a run one row at a
-- time while the next row also matches @SALEDITEMS.
WITH SalesRowNum ([DATE], SALEDITEMS, RowNum)
AS
(
    SELECT [DATE], SALEDITEMS, ROW_NUMBER() OVER (ORDER BY s1.[DATE]) as RowNum
    FROM Sales s1
)
, SalesCTE (RowNum, [DATE], ConsecutiveCount)
AS
(
    -- Anchor: every matching row starts a run of length 1.
    SELECT s1.RowNum, s1.[DATE], 1 AS ConsecutiveCount
    FROM SalesRowNum s1
    WHERE s1.SALEDITEMS = @SALEDITEMS
    UNION ALL
    -- Step: the directly following row continues the run if it matches too.
    SELECT s1.RowNum, s1.[DATE], s2.ConsecutiveCount + 1 AS ConsecutiveCount
    FROM SalesRowNum s1
    INNER JOIN SalesCTE s2 ON s1.RowNum = s2.RowNum + 1
    WHERE s1.SALEDITEMS = @SALEDITEMS
)
SELECT MAX(ConsecutiveCount)
FROM SalesCTE
OPTION (MAXRECURSION 0); -- runs longer than the default limit of 100 would otherwise raise an error
Untested, because you did not provide DDL and sample data:
-- Longest run of consecutive dates whose SALEDITEMS equals @SALEDITEMS.
DECLARE @SALEDITEMS INT;
SET @SALEDITEMS=3;
SELECT MAX(runs.cnt) FROM(
    -- One group per (d1, d2) pair; cnt is the number of rows inside that date span.
    SELECT COUNT(*) AS cnt
    FROM YourTable AS yt
    JOIN (
        -- All pairs of matching rows with no non-matching row strictly between them.
        SELECT y1.[Date] AS d1, y2.[Date] AS d2
        FROM YourTable AS y1 JOIN YourTable AS y2
        ON y1.SALEDITEMS=@SALEDITEMS AND y2.SALEDITEMS=@SALEDITEMS
        AND y1.[Date] <= y2.[Date] -- reversed pairs span no rows, so skip them up front
        AND NOT EXISTS(SELECT 1 FROM YourTable AS y
                       WHERE y.SALEDITEMS<>@SALEDITEMS
                       AND y1.[Date] < y.[Date] AND y.[Date] < y2.[Date])
    ) AS t
    ON yt.[Date] BETWEEN t.d1 AND t.d2
    GROUP BY t.d1, t.d2
) AS runs;