Getting datetime count range in SQL Server

I am trying to get the date range between data changes in SQL Server.
My query is:
select count(1) as qty, Info, convert(char,dFError,100) dErr
from TableData
group by Info, convert(char,dFError,100)
order by dErr asc
This gives me the result below. qty is the number of requests to a server, Info is the server IP, and dErr is the date when a request is sent to another server.
qty  Info  dErr
1    1.97  Aug 11 2021 9:01AM
1    1.97  Aug 11 2021 9:06AM
88   1.33  Dec 21 2021 2:04PM
1    1.95  Dec 22 2021 9:44PM
9    1.95  Dec 22 2021 9:45PM
1    1.33  Dec 22 2021 9:51PM
19   1.33  Dec 22 2021 9:52PM
3    1.33  Dec 22 2021 9:53PM
6    1.33  Dec 27 2021 7:10PM
17   1.33  Dec 27 2021 7:11PM
15   1.95  Dec 27 2021 7:17PM
8    1.95  Dec 27 2021 7:18PM
And this is what I want: on Aug 11 at 9:06AM all requests are going to 1.97, then from Dec 21 at 2:04PM they are all going to 1.33, and so on - that is, the date and the Info for each consecutive run.
qty  Info  dErr
2    1.97  Aug 11 2021 9:06AM
88   1.33  Dec 21 2021 2:04PM
10   1.95  Dec 22 2021 9:45PM
46   1.33  Dec 27 2021 7:11PM
23   1.95  Dec 27 2021 7:18PM
Within the same day, the same group of numbers can occur again at a distinct hour:
qty  Info  dErr
1    1.97  Jan 24 2022 9:39AM
1    1.97  Jan 24 2022 9:51AM
1    1.97  Jan 24 2022 9:58AM
4    1.97  Jan 24 2022 10:08AM
1    1.97  Jan 24 2022 10:12AM
8    1.95  Jan 24 2022 10:24AM
2    1.95  Jan 24 2022 10:32AM
10   1.33  Jan 24 2022 10:33AM
1    1.33  Jan 24 2022 11:37AM
8    1.95  Jan 24 2022 11:59AM
1    1.95  Jan 24 2022 12:00PM
2    1.95  Jan 24 2022 12:08PM
and it needs to be displayed like this:
qty  Info  dErr
8    1.97  Jan 24 2022 10:12AM
10   1.95  Jan 24 2022 10:32AM
11   1.33  Jan 24 2022 11:37AM
11   1.95  Jan 24 2022 12:08PM

A double row_number can be used to calculate a ranking.
Then the ranking can be used in the aggregation to solve this Gaps-And-Islands type of problem.
select sum(qty) as qty, Info, max(dFError) as dErr
from (
select Info, dFError, qty
, convert(date, dFError) as dErrorDate
, Rnk = row_number() over (order by dFError)
+ row_number() over (partition by Info order by dFError desc)
from TableData
) q
group by Info, Rnk
order by dErr;
qty  Info  dErr
2    1.97  2021-08-11 09:06:00.000
88   1.33  2021-12-21 14:04:00.000
10   1.95  2021-12-22 21:45:00.000
46   1.33  2021-12-27 19:11:00.000
23   1.95  2021-12-27 19:18:00.000
8    1.97  2022-01-24 10:12:00.000
10   1.95  2022-01-24 10:32:00.000
11   1.33  2022-01-24 11:37:00.000
11   1.95  2022-01-24 12:08:00.000
Demo on db<>fiddle here
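To see why the double row_number identifies the islands, it can help to expose both row numbers next to the raw rows (a sketch against the same TableData used above): within one consecutive run of the same Info value the overall ascending row number grows by 1 per row while the per-Info descending row number shrinks by 1 per row, so their sum (Rnk) stays constant for the whole run, and a later run of the same Info starts at a strictly larger sum.
select Info, dFError
, row_number() over (order by dFError) as rn_all  -- position over all rows
, row_number() over (partition by Info order by dFError desc) as rn_info_desc  -- position within this Info, newest first
, row_number() over (order by dFError)
  + row_number() over (partition by Info order by dFError desc) as Rnk  -- constant per island
from TableData
order by dFError;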

select
SUM(P_COUNT) as "COUNT",
P_DATA as "DATA",
MAX(FECHA) as "FECHA"
from
TABLEA
GROUP BY
P_DATA, CONVERT(DATE, FECHA)
ORDER BY "FECHA"

Your expected results don't match the given data - in the first set you have rows for 12/22 with both 1.33 and 1.95, but they are not all included in your expected results.
It seems to me you want to group by either the date or the date\hour. Here is an example of both:
Declare @testTable table (qty int, Info numeric(3,2), dErr datetime);
Insert Into @testTable (qty, Info, dErr)
Values ( 1, 1.97, 'Aug 11 2021 9:01AM')
, ( 1, 1.97, 'Aug 11 2021 9:06AM')
, (88, 1.33, 'Dec 21 2021 2:04PM')
, ( 1, 1.95, 'Dec 22 2021 9:44PM')
, ( 9, 1.95, 'Dec 22 2021 9:45PM')
, ( 1, 1.33, 'Dec 22 2021 9:51PM')
, (19, 1.33, 'Dec 22 2021 9:52PM')
, ( 3, 1.33, 'Dec 22 2021 9:53PM')
, ( 6, 1.33, 'Dec 27 2021 7:10PM')
, (17, 1.33, 'Dec 27 2021 7:11PM')
, (15, 1.95, 'Dec 27 2021 7:17PM')
, ( 8, 1.95, 'Dec 27 2021 7:18PM')
, ( 1, 1.97, 'Jan 24 2022 9:39AM')
, ( 1, 1.97, 'Jan 24 2022 9:51AM')
, ( 1, 1.97, 'Jan 24 2022 9:58AM')
, ( 4, 1.97, 'Jan 24 2022 10:08AM')
, ( 1, 1.97, 'Jan 24 2022 10:12AM')
, ( 8, 1.95, 'Jan 24 2022 10:24AM')
, ( 2, 1.95, 'Jan 24 2022 10:32AM')
, (10, 1.33, 'Jan 24 2022 10:33AM')
, ( 1, 1.33, 'Jan 24 2022 11:37AM')
, ( 8, 1.95, 'Jan 24 2022 11:59AM')
, ( 1, 1.95, 'Jan 24 2022 12:00PM')
, ( 2, 1.95, 'Jan 24 2022 12:08PM');
--==== Grouped by date
Select total_qty = sum(tt.qty)
, tt.Info
, latest_date = max(tt.dErr)
From @testTable tt
Group By
tt.Info
, cast(tt.dErr As date)
Order By
cast(tt.dErr As date);
--==== Grouped by date\hour
Select total_qty = sum(tt.qty)
, tt.Info
, latest_date = max(tt.dErr)
From @testTable tt
Group By
tt.Info
, cast(tt.dErr As date)
, datepart(Hour, tt.dErr)
Order By
cast(tt.dErr As date)
, datepart(Hour, tt.dErr);

Related

Get the last 4 weeks prior to the current week and the same 4 weeks of last year

I have a list of date, fiscal week, and fiscal year:
DATE_VALUE FISCAL_WEEK FISCAL_YEAR_VALUE
14-Dec-20 51 2020
15-Dec-20 51 2020
16-Dec-20 51 2020
17-Dec-20 51 2020
18-Dec-20 51 2020
19-Dec-20 51 2020
20-Dec-20 51 2020
21-Dec-20 52 2020
22-Dec-20 52 2020
23-Dec-20 52 2020
24-Dec-20 52 2020
25-Dec-20 52 2020
26-Dec-20 52 2020
27-Dec-20 52 2020
28-Dec-20 1 2021
29-Dec-20 1 2021
30-Dec-20 1 2021
31-Dec-20 1 2021
1-Jan-21 1 2021
2-Jan-21 1 2021
3-Jan-21 1 2021
4-Jan-21 2 2021
5-Jan-21 2 2021
6-Jan-21 2 2021
7-Jan-21 2 2021
8-Jan-21 2 2021
9-Jan-21 2 2021
10-Jan-21 2 2021
11-Jan-21 3 2021
12-Jan-21 3 2021
13-Jan-21 3 2021
14-Jan-21 3 2021
15-Jan-21 3 2021
16-Jan-21 3 2021
17-Jan-21 3 2021
18-Jan-21 4 2021
19-Jan-21 4 2021
20-Jan-21 4 2021
21-Jan-21 4 2021
22-Jan-21 4 2021
23-Jan-21 4 2021
24-Jan-21 4 2021
20-Dec-21 52 2021
21-Dec-21 52 2021
22-Dec-21 52 2021
23-Dec-21 52 2021
24-Dec-21 52 2021
25-Dec-21 52 2021
26-Dec-21 52 2021
27-Dec-21 53 2021
28-Dec-21 53 2021
29-Dec-21 53 2021
30-Dec-21 53 2021
31-Dec-21 53 2021
1-Jan-22 53 2021
2-Jan-22 53 2021
3-Jan-22 1 2022
4-Jan-22 1 2022
5-Jan-22 1 2022
6-Jan-22 1 2022
7-Jan-22 1 2022
8-Jan-22 1 2022
9-Jan-22 1 2022
10-Jan-22 2 2022
11-Jan-22 2 2022
12-Jan-22 2 2022
13-Jan-22 2 2022
14-Jan-22 2 2022
15-Jan-22 2 2022
16-Jan-22 2 2022
17-Jan-22 3 2022
18-Jan-22 3 2022
19-Jan-22 3 2022
20-Jan-22 3 2022
21-Jan-22 3 2022
22-Jan-22 3 2022
23-Jan-22 3 2022
24-Jan-22 4 2022
25-Jan-22 4 2022
26-Jan-22 4 2022
27-Jan-22 4 2022
28-Jan-22 4 2022
29-Jan-22 4 2022
30-Jan-22 4 2022
I want to pull the last 4 weeks prior to the current week AND the same 4 weeks of the year before. Please see example 1. This works fine when all 4 weeks are within the same year. But when it comes to the beginning of a year, when 1 or more weeks are in the current year but the others are in the previous year, I am not able to get the desired output below:
FISCAL_YEAR_VALUE FISCAL_WEEK
2020 51
2020 52
2021 2
2021 1
2021 52
2021 53
2022 1
2022 2
The code I have is below. I am using the date of 21-JAN-22 as an example:
SELECT
FISCAL_YEAR_VALUE,
FISCAL_WEEK
FROM TABLE_NAME
WHERE FISCAL_YEAR_VALUE IN (SELECT *
FROM (WITH T AS (
SELECT DISTINCT FISCAL_YEAR_VALUE
FROM TABLE_NAME
WHERE TRUNC(DATE_VALUE) <= TRUNC(TO_DATE('21-JAN-22'))--TEST DATE
ORDER BY FISCAL_YEAR_VALUE DESC
FETCH NEXT 2 ROWS ONLY
)
SELECT FISCAL_YEAR_VALUE
FROM T ORDER BY FISCAL_YEAR_VALUE
)
)
AND FISCAL_WEEK IN (SELECT *
FROM (WITH T AS (
SELECT DISTINCT FISCAL_WEEK, FISCAL_YEAR_VALUE
FROM TABLE_NAME
WHERE TRUNC(DATE_VALUE) <= TRUNC(TO_DATE('21-JAN-22'))--TEST DATE
ORDER BY FISCAL_YEAR_VALUE DESC, FISCAL_WEEK DESC
OFFSET 1 ROWS
FETCH NEXT 4 ROWS ONLY
)
SELECT FISCAL_WEEK
FROM T ORDER BY FISCAL_YEAR_VALUE, FISCAL_WEEK
)
)
GROUP BY FISCAL_YEAR_VALUE, FISCAL_WEEK
ORDER BY FISCAL_YEAR_VALUE, FISCAL_WEEK
Output of the code is:
FISCAL_YEAR_VALUE FISCAL_WEEK
2021 2
2021 1
2021 52
2021 53
2022 1
2022 2
As you can see, the last 2 weeks of year 2020 are not included. Please see example 2. How can I also include this exception in the code to make it dynamic? Any help would be greatly appreciated!
To find the values this year, you can use:
SELECT DISTINCT fiscal_year_value, fiscal_week
FROM table_name
WHERE date_value < TRUNC(SYSDATE, 'IW')
AND date_value >= TRUNC(SYSDATE, 'IW') - INTERVAL '28' DAY
To find the values from the previous year, you can take the maximum fiscal week from this year, subtract 1 from the year, use that to find the upper bound of date_value for the last fiscal year, and then use a similar 28-day range for last year:
WITH this_year (fiscal_year_value, fiscal_week) AS (
SELECT fiscal_year_value, fiscal_week
FROM table_name
WHERE date_value < TRUNC(SYSDATE, 'IW')
AND date_value >= TRUNC(SYSDATE, 'IW') - INTERVAL '28' DAY
),
max_last_year (max_date_value) AS (
SELECT MAX(date_value) + INTERVAL '1' DAY
FROM table_name
WHERE (fiscal_year_value, fiscal_week) IN (
SELECT fiscal_year_value - 1, fiscal_week
FROM this_year
ORDER BY fiscal_year_value DESC, fiscal_week DESC
FETCH FIRST ROW ONLY
)
)
SELECT fiscal_year_value, fiscal_week
FROM this_year
UNION
SELECT t.fiscal_year_value, t.fiscal_week
FROM table_name t
INNER JOIN max_last_year m
ON ( t.date_value < m.max_date_value
AND t.date_value >= m.max_date_value - INTERVAL '28' DAY);
Which, for the sample data:
Create Table table_name(DATE_VALUE DATE, FISCAL_WEEK INT, FISCAL_YEAR_VALUE INT);
INSERT INTO table_name (date_value, fiscal_week, fiscal_year_value)
SELECT DATE '2019-12-30' + LEVEL - 1, CEIL(LEVEL/7), 2020
FROM DUAL
CONNECT BY LEVEL <= 7 * 52
UNION ALL
SELECT DATE '2020-12-28' + LEVEL - 1, CEIL(LEVEL/7), 2021
FROM DUAL
CONNECT BY LEVEL <= 7 * 53
UNION ALL
SELECT DATE '2022-01-03' + LEVEL - 1, CEIL(LEVEL/7), 2022
FROM DUAL
CONNECT BY LEVEL <= 7 * 52;
Outputs:
FISCAL_YEAR_VALUE FISCAL_WEEK
2022 38
2022 39
2022 40
2022 41
2021 38
2021 39
2021 40
2021 41
And if today's date was 2022-01-01, would output:
FISCAL_YEAR_VALUE FISCAL_WEEK
2021 52
2021 53
2022 1
2022 2
2020 51
2020 52
2021 1
2021 2
There may be a simpler method, but without knowing how you calculate a fiscal year that is not immediately possible.
fiddle

SQL Custom unique Ordering with repeated sequence

I have a datetime column (data type of timestamp without time zone) named time. I can best explain my issue with an example.
Example: I have the following data in this column (prettifying the timestamps for this example):
ID TIME
1 1 Mar 2022 - 1PM
2 1 Mar 2022 - 2PM
3 1 Mar 2022 - 1PM
4 1 Mar 2022 - 3PM
5 1 Mar 2022 - 2PM
6 2 Mar 2022 - 2PM
7 2 Mar 2022 - 1PM
8 2 Mar 2022 - 3PM
9 2 Mar 2022 - 1PM
10 1 Mar 2022 - 3PM
11 2 Mar 2022 - 2PM
12 2 Mar 2022 - 3PM
13 3 Mar 2022 - 4PM
14 3 Mar 2022 - 3PM
15 3 Mar 2022 - 3PM
16 3 Mar 2022 - 4PM
If I do ORDER BY time, I get the following result:
ID TIME
1 1 Mar 2022 - 1PM
3 1 Mar 2022 - 1PM
2 1 Mar 2022 - 2PM
5 1 Mar 2022 - 2PM
4 1 Mar 2022 - 3PM
10 1 Mar 2022 - 3PM
7 2 Mar 2022 - 1PM
9 2 Mar 2022 - 1PM
6 2 Mar 2022 - 2PM
11 2 Mar 2022 - 2PM
8 2 Mar 2022 - 3PM
12 2 Mar 2022 - 3PM
14 3 Mar 2022 - 3PM
15 3 Mar 2022 - 3PM
13 3 Mar 2022 - 4PM
16 3 Mar 2022 - 4PM
But I want the result in this way:
ID TIME
1 1 Mar 2022 - 1PM
2 1 Mar 2022 - 2PM
4 1 Mar 2022 - 3PM
13 3 Mar 2022 - 4PM
3 1 Mar 2022 - 1PM
5 1 Mar 2022 - 2PM
10 1 Mar 2022 - 3PM
16 3 Mar 2022 - 4PM
7 2 Mar 2022 - 1PM
6 2 Mar 2022 - 2PM
8 2 Mar 2022 - 3PM
9 2 Mar 2022 - 1PM
11 2 Mar 2022 - 2PM
12 2 Mar 2022 - 3PM
14 3 Mar 2022 - 3PM
13 3 Mar 2022 - 4PM
As you can see, the first 4 rows have unique timestamps, and the sequence should repeat based on time (1PM, 2PM, 3PM).
How can we do this in SQL? I'm using PostgreSQL as my DB and Rails for my backend.
EDIT:
I have added more context to the example to explain my scenario.
One way is to try the ROW_NUMBER window function together with the REPLACE function:
SELECT time
FROM (
SELECT *,REPLACE(time,'PM','') val,
ROW_NUMBER() OVER(PARTITION BY REPLACE(time,'PM','')) rn
FROM T
) t1
ORDER BY rn,val
For example, here is the same idea with a generic sequence on column a:
with tbl(a, othercol) as
(
SELECT 1,1 UNION ALL
SELECT 1,2 UNION ALL
SELECT 1,3 UNION ALL
SELECT 2,4 UNION ALL
SELECT 2,5 UNION ALL
SELECT 2,6 UNION ALL
SELECT 3,7 UNION ALL
SELECT 3,8 UNION ALL
SELECT 3,9
),
cte as (
SELECT *, row_number() over(partition by a order by a) rn
from tbl
)
select a, othercol
from cte
order by rn, a
The problem you have at hand is a direct result of not choosing the correct data type for the values you store.
To get the sorting correct, you need to convert the string to a proper time value. There is no to_time() function in Postgres, but you can convert it to a timestamp then cast it to a time:
order by to_timestamp("time", 'hham')::time
You should fix your database design and convert that column to a proper time type, which will also prevent invalid values ('3 in the afternoon' or '128foo') from being stored in that column.
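If the column really does contain text such as '1PM', the type change could look roughly like this (a sketch only; it assumes a table named events and that every stored value parses with the same hour/AM-PM pattern used above):
alter table events
    alter column "time" type time
    using to_timestamp("time", 'hham')::time;  -- reuses the conversion expression shown above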

Query to find three instances the same in one column, but which must have three different results in another column

I have a table like the following:
InspectDate | Serial Number | Reference | Error | PartNumber
I need to find the data of errors that occurred in the last 10 days. I can get that, but then I need to find only those problems that occurred on the same reference, but only if they happen to be on three or more different serial numbers.
Please let me know if I need to provide any more info. I have tried using count and filtering by those with more than 3, but that only shows me any one serial number that has more than three errors on that reference.
Sample Data:
InspectDate SerialNumber Reference Error PartNumber
Oct 12 2021 1:58PM 50012 A21 1 PL2-001
Oct 12 2021 3:22PM 50013 A21 1 PL2-001
Oct 12 2021 5:59PM 50062 A21 1 PL2-001
Oct 18 2021 11:24AM 50071 A21 1 PL2-001
Oct 18 2021 12:20PM 50071 A21 2 PL2-001
Oct 18 2021 12:36PM 50071 A21 3 PL2-001
Oct 12 2021 5:59PM 50055 B44 5 AL1-440
Oct 18 2021 11:19AM 50062 B72 1 AL1-660
Oct 18 2021 11:22AM 50071 B72 2 AL1-660
Oct 12 2021 5:39PM 50047 B83 5 AL1-550
Oct 12 2021 3:03PM 50013 V310 2 PL3-010
Oct 18 2021 12:00PM 50071 V310 2 PL3-010
Oct 18 2021 12:37PM 50098 V310 4 PL3-010
Expected Results:
InspectDate SerialNumber Reference Error PartNumber
Oct 12 2021 1:58PM 50012 A21 1 PL2-001
Oct 12 2021 3:22PM 50013 A21 1 PL2-001
Oct 12 2021 5:59PM 50062 A21 1 PL2-001
Oct 18 2021 11:24AM 50071 A21 1 PL2-001
Oct 12 2021 3:03PM 50013 V310 2 PL3-010
Oct 18 2021 12:00PM 50071 V310 2 PL3-010
Oct 18 2021 12:37PM 50098 V310 4 PL3-010
Attempted code:
Select (all columns), COUNT(*) AS Instances From (Table)
Where InspectDate >= DATEADD(day, -10, GETDATE())
GROUP BY (all columns)
HAVING COUNT(*) >= 3
Order by CAST(inspectdate as datetime) DESC
What you need here is a windowed COUNT(DISTINCT ...). Unfortunately, SQL Server does not allow COUNT(DISTINCT ...) as a window function.
But we can simulate it using DENSE_RANK and MAX, both as window functions:
WITH Ranked AS (
SELECT *,
rn = DENSE_RANK() OVER (PARTITION BY Reference ORDER BY SerialNumber)
FROM [Table]
WHERE InspectDate >= DATEADD(day, -10, GETDATE())
),
DistinctCount AS (
SELECT *,
maxrn = MAX(rn) OVER (PARTITION BY Reference)
FROM Ranked
)
SELECT *
FROM DistinctCount
WHERE maxrn >= 3;
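As a non-window alternative (COUNT(DISTINCT ...) is allowed in an ordinary GROUP BY), you could first find the qualifying references and then join back for the detail rows. This is only a sketch, reusing the same placeholder table name and 10-day filter as above:
WITH Qualifying AS (
    -- references with errors on 3 or more distinct serial numbers in the last 10 days
    SELECT Reference
    FROM [Table]
    WHERE InspectDate >= DATEADD(day, -10, GETDATE())
    GROUP BY Reference
    HAVING COUNT(DISTINCT SerialNumber) >= 3
)
SELECT t.*
FROM [Table] t
INNER JOIN Qualifying q
    ON q.Reference = t.Reference
WHERE t.InspectDate >= DATEADD(day, -10, GETDATE());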

Error building a SQL query while trying to get order by a day for a period of week

I have an ASP.NET Core application; through a controller endpoint I pass #by and #period string values to the SQL query.
#by takes one of the following values: day, week
#period takes one of the following values: week, month, year
When the #period is month or year, then #by is a week, else it's a day.
I have the following working query when the #period is a month or a year:
SELECT
l.region_id AS region_id,
'Region ' + r.region_desc AS region_name,
MIN(DATEADD(D, -(DATEPART(WEEKDAY, s.pos_date) - 1), s.pos_date)) AS date_pos,
CONVERT(VARCHAR(20), MIN(DATEADD(D, -(DATEPART(WEEKDAY, s.pos_date) - 1), s.pos_date)), 107) AS display_date_pos
FROM
incent_summary s
INNER JOIN
location l ON s.store_num = l.store_num
INNER JOIN
region r ON l.region_id = r.region_id
WHERE
s.pos_date >= DATEADD(day, #period , CONVERT(date, GETDATE()))
AND s.pos_date <= GETDATE()
GROUP BY
DATEPART (#by, s.pos_date),
l.region_id, r.region_desc
ORDER BY
DATEPART (#by, pos_date),
l.region_id, r.region_desc
The issue is when the #period is a week, #by is day, and the statement
MIN(DATEADD(D, -(DATEPART(WEEKDAY, s.pos_date) - 1), s.pos_date)) AS date_pos
returns the same day for all the 7 days.
Sample output when #period = year and #by = week:
region_id region_name date_pos display_date_pos
---------------------------------------------------------------------
34 Region 43 2019-12-29 00:00:00.000 Dec 29, 2019
50 Region 22 2019-12-29 00:00:00.000 Dec 29, 2019
34 Region 43 2020-01-05 00:00:00.000 Jan 05, 2020
50 Region 22 2020-01-05 00:00:00.000 Jan 05, 2020
34 Region 43 2020-01-12 00:00:00.000 Jan 12, 2020
50 Region 22 2020-01-12 00:00:00.000 Jan 12, 2020
34 Region 43 2020-01-19 00:00:00.000 Jan 19, 2020
50 Region 22 2020-01-19 00:00:00.000 Jan 19, 2020
34 Region 43 2020-01-26 00:00:00.000 Jan 26, 2020
50 Region 22 2020-01-26 00:00:00.000 Jan 26, 2020
Sample output when #period = week and #by = day:
region_id region_name date_pos display_date_pos
--------------------------------------------------------------------
34 Region 43 2020-07-12 00:00:00.000 Jul 12, 2020
50 Region 22 2020-07-12 00:00:00.000 Jul 12, 2020
34 Region 43 2020-07-12 00:00:00.000 Jul 12, 2020
50 Region 22 2020-07-12 00:00:00.000 Jul 12, 2020
34 Region 43 2020-07-19 00:00:00.000 Jul 19, 2020
50 Region 22 2020-07-19 00:00:00.000 Jul 19, 2020
34 Region 43 2020-07-19 00:00:00.000 Jul 19, 2020
50 Region 22 2020-07-19 00:00:00.000 Jul 19, 2020
34 Region 43 2020-07-19 00:00:00.000 Jul 19, 2020
50 Region 22 2020-07-19 00:00:00.000 Jul 19, 2020
How can I fix this?
SELECT
DATEADD(D, -(DATEPART(WEEKDAY, s.pos_date) - 1), s.pos_date)
will always return the first day of the week, because the logic is: "subtract from my date the number of days from Sunday and add 1."
Sunday: 1 - 1 + 1 = 1 = Sunday
Monday: 2 - 2 + 1 = 1 = Sunday
...
Saturday: 7 - 7 + 1 = 1 = Sunday
That's fine when you want the first Sunday of the year/month/whatever. But the first Sunday of every week is always... Sunday. In this case you really just need to take the MIN(s.pos_date) if #period is week.
There's probably some crazy way to do this in a single statement using quaternions or something else super mathy, but it's easiest to just use a case statement:
MIN
(
CASE
WHEN '#by' = 'day' THEN s.pos_date
ELSE DATEADD(D, -(DATEPART(WEEKDAY, s.pos_date) - 1), s.pos_date)
END
)
I'm not a C# programmer so I can't tell you the exact way to make sure the string DAY is passed to the query as "DAY" but I'm sure you can handle that part.
ALSO IMPORTANT: The datepart "day" is day of month, so if you're going to possibly have a span greater than one month (but under a year), use dayofyear.
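For example (illustrative only), day gives the day of the month, so Jan 15 and Feb 15 would collapse into the same group, while dayofyear keeps them apart:
select datepart(day, convert(date, '2020-01-15')) as jan_day_of_month       -- 15
     , datepart(day, convert(date, '2020-02-15')) as feb_day_of_month       -- also 15
     , datepart(dayofyear, convert(date, '2020-01-15')) as jan_day_of_year  -- 15
     , datepart(dayofyear, convert(date, '2020-02-15')) as feb_day_of_year; -- 46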

T-SQL - Partition a running total

I've written a query that returns the size of my individual records in mb. These records contain Blob data.
I would like to partition the records in 50mb batches.
SELECT SourceId, Title, Description,
SUM(DATALENGTH(VersionData) * 0.000001) OVER (PARTITION BY DATALENGTH(SourceId) ORDER BY SourceId) AS RunningTotal,
RANK() OVER(ORDER BY SourceId) AS RowNo
FROM TargetContentVersion WITH(NOLOCK)
The data returned from this query currently looks like this, where RunningTotal is the running total in mb of the records:
SourceId Title RunningTotal RowNo
00Pf4000006gna3EAA 001f400000ZP5yUAAT_3 Oct 2018 (14_37_32).pdf 5.242880 1
00Pf4000006gna8EAA 001f400000ZP5yUAAT_3 Oct 2018 (14_37_38).doc 6.291456 2
00Pf4000006gnacEAA 001f400000ZP5yUAAT_3 Oct 2018 (14_38_44).pdf 7.340032 3
00Pf4000006gnaDEAQ 001f400000ZP5yUAAT_3 Oct 2018 (14_37_41).doc 12.582912 4
00Pf4000006gnahEAA 001f400000ZP5yUAAT_3 Oct 2018 (14_38_47).pdf 17.825792 5
00Pf4000006gnaIEAQ 001f400000ZP5yUAAT_3 Oct 2018 (14_37_46).doc 23.068672 6
00Pf4000006gnamEAA 001f400000ZP5yUAAT_3 Oct 2018 (14_38_54).pdf 33.554432 7
00Pf4000006gnaNEAQ 001f400000ZP5yUAAT_3 Oct 2018 (14_37_52).txt 34.603008 8
00Pf4000006gnarEAA 001f400000ZP5yUAAT_3 Oct 2018 (14_39_20).doc 35.651584 9
00Pf4000006gnaSEAQ 001f400000ZP5yUAAT_3 Oct 2018 (14_37_55).txt 40.894464 10
00Pf4000006gnawEAA 001f400000ZP5yUAAT_3 Oct 2018 (14_39_24).doc 46.137344 11
00Pf4000006gnaXEAQ 001f400000ZP5yUAAT_3 Oct 2018 (14_38_0).txt 51.380224 12
00Pf4000006gnb1EAA 001f400000ZP5yUAAT_3 Oct 2018 (14_39_30).doc 61.865984 13
00Pf4000006gnb6EAA 001f400000ZP5yUAAT_3 Oct 2018 (14_39_50).txt 62.914560 14
00Pf4000006gnbaEAA 001f400000ZP5yVAAT_3 Oct 2018 (14_40_29).doc 68.157440 15
00Pf4000006gnbBEAQ 001f400000ZP5yUAAT_3 Oct 2018 (14_39_58).txt 78.643200 16
00Pf4000006gnbfEAA 001f400000ZP5yVAAT_3 Oct 2018 (14_40_34).doc 89.128960 17
00Pf4000006gnbGEAQ 001f400000ZP5yVAAT_3 Oct 2018 (14_40_7).pdf 90.177536 18
00Pf4000006gnbkEAA 001f400000ZP5yVAAT_3 Oct 2018 (14_40_43).txt 91.226112 19
00Pf4000006gnbLEAQ 001f400000ZP5yVAAT_3 Oct 2018 (14_40_12).pdf 96.468992 20
00Pf4000006gnbpEAA 001f400000ZP5yVAAT_3 Oct 2018 (14_40_46).txt 101.711872 21
00Pf4000006gnbQEAQ 001f400000ZP5yVAAT_3 Oct 2018 (14_40_17).pdf 112.197632 22
00Pf4000006gnbuEAA 001f400000ZP5yVAAT_3 Oct 2018 (14_40_52).txt 122.683392 23
00Pf4000006gnbVEAQ 001f400000ZP5yVAAT_3 Oct 2018 (14_40_26).doc 123.731968 24
00Pf4000006gnbzEAA 001f400000ZP5yWAAT_3 Oct 2018 (14_41_0).pdf 124.780544 25
00Pf4000006gnc4EAA 001f400000ZP5yWAAT_3 Oct 2018 (14_41_5).pdf 130.023424 26
00Pf4000006gnc9EAA 001f400000ZP5yWAAT_3 Oct 2018 (14_41_11).pdf 140.509184 27
00Pf4000006gncdEAA 001f400000ZP5yWAAT_3 Oct 2018 (14_41_56).txt 145.752064 28
00Pf4000006gncEEAQ 001f400000ZP5yWAAT_3 Oct 2018 (14_41_30).doc 146.800640 29
00Pf4000006gnciEAA 001f400000ZP5yWAAT_3 Oct 2018 (14_42_3).txt 157.286400 30
00Pf4000006gncJEAQ 001f400000ZP5yWAAT_3 Oct 2018 (14_41_33).doc 162.529280 31
00Pf4000006gncKEAQ 001f400000ZP5ycAAD_3 Oct 2018 (14_48_11).txt 173.015040 32
00Pf4000006gncnEAA 001f400000ZP5yXAAT_3 Oct 2018 (14_42_12).pdf 174.063616 33
00Pf4000006gncsEAA 001f400000ZP5yXAAT_3 Oct 2018 (14_42_15).pdf 179.306496 34
00Pf4000006gncTEAQ 001f400000ZP5yWAAT_3 Oct 2018 (14_41_44).doc 189.792256 35
00Pf4000006gncxEAA 001f400000ZP5yXAAT_3 Oct 2018 (14_42_30).pdf 200.278016 36
00Pf4000006gncYEAQ 001f400000ZP5yWAAT_3 Oct 2018 (14_41_53).txt 201.326592 37
00Pf4000006gnd2EAA 001f400000ZP5yXAAT_3 Oct 2018 (14_42_46).doc 202.375168 38
00Pf4000006gnd7EAA 001f400000ZP5yXAAT_3 Oct 2018 (14_42_49).doc 207.618048 39
00Pf4000006gndbEAA 001f400000ZP5yYAAT_3 Oct 2018 (14_43_23).pdf 212.860928 40
00Pf4000006gndCEAQ 001f400000ZP5yXAAT_3 Oct 2018 (14_42_54).doc 223.346688 41
00Pf4000006gndgEAA 001f400000ZP5yYAAT_3 Oct 2018 (14_43_30).pdf 233.832448 42
00Pf4000006gnDhEAI Snake_River_(5mb).jpg 239.077777 43
00Pf4000006gndHEAQ 001f400000ZP5yXAAT_3 Oct 2018 (14_43_3).txt 240.126353 44
00Pf4000006gndlEAA 001f400000ZP5yYAAT_3 Oct 2018 (14_43_39).doc 241.174929 45
00Pf4000006gndMEAQ 001f400000ZP5yXAAT_3 Oct 2018 (14_43_6).txt 246.417809 46
00Pf4000006gndqEAA 001f400000ZP5yYAAT_3 Oct 2018 (14_43_41).doc 251.660689 47
00Pf4000006gnDrEAI Pizigani_1367_Chart_10MB.jpg 261.835395 48
00Pf4000006gndREAQ 001f400000ZP5yXAAT_3 Oct 2018 (14_43_11).txt 272.321155 49
00Pf4000006gndvEAA 001f400000ZP5yYAAT_3 Oct 2018 (14_43_47).doc 282.806915 50
00Pf4000006gnDwEAI Spinner_Dolphin_Indian_Ocean_07-2017.jpg 284.109019 51
00Pf4000006gndWEAQ 001f400000ZP5yYAAT_3 Oct 2018 (14_43_20).pdf 285.157595 52
00Pf4000006gnDXEAY 440 Kb.jpg 285.609143 53
00Pf4000006gne0EAA 001f400000ZP5yYAAT_3 Oct 2018 (14_43_59).txt 286.657719 54
00Pf4000006gne5EAA 001f400000ZP5yYAAT_3 Oct 2018 (14_44_2).txt 291.900599 55
00Pf4000006gneaEAA 001f400000ZP5yZAAT_3 Oct 2018 (14_44_59).txt 302.386359 56
00Pf4000006gneAEAQ 001f400000ZP5yYAAT_3 Oct 2018 (14_44_7).txt 312.872119 57
00Pf4000006gneeEAA 001f400000ZP5yZAAT_3 Oct 2018 (14_44_40).doc 323.357879 58
I would like the results to look like this where they are partitioned in 50mb batches:
SourceId Title RunningTotal RowNo Batch
00Pf4000006gna3EAA 001f400000ZP5yUAAT_3 Oct 2018 (14_37_32).pdf 5.242880 1 1
00Pf4000006gna8EAA 001f400000ZP5yUAAT_3 Oct 2018 (14_37_38).doc 6.291456 2 1
00Pf4000006gnacEAA 001f400000ZP5yUAAT_3 Oct 2018 (14_38_44).pdf 7.340032 3 1
00Pf4000006gnaDEAQ 001f400000ZP5yUAAT_3 Oct 2018 (14_37_41).doc 12.582912 4 1
00Pf4000006gnahEAA 001f400000ZP5yUAAT_3 Oct 2018 (14_38_47).pdf 17.825792 5 1
00Pf4000006gnaIEAQ 001f400000ZP5yUAAT_3 Oct 2018 (14_37_46).doc 23.068672 6 1
00Pf4000006gnamEAA 001f400000ZP5yUAAT_3 Oct 2018 (14_38_54).pdf 33.554432 7 1
00Pf4000006gnaNEAQ 001f400000ZP5yUAAT_3 Oct 2018 (14_37_52).txt 34.603008 8 1
00Pf4000006gnarEAA 001f400000ZP5yUAAT_3 Oct 2018 (14_39_20).doc 35.651584 9 1
00Pf4000006gnaSEAQ 001f400000ZP5yUAAT_3 Oct 2018 (14_37_55).txt 40.894464 10 1
00Pf4000006gnawEAA 001f400000ZP5yUAAT_3 Oct 2018 (14_39_24).doc 46.137344 11 1
00Pf4000006gnaXEAQ 001f400000ZP5yUAAT_3 Oct 2018 (14_38_0).txt 51.380224 12 1
00Pf4000006gnb1EAA 001f400000ZP5yUAAT_3 Oct 2018 (14_39_30).doc 61.865984 13 2
00Pf4000006gnb6EAA 001f400000ZP5yUAAT_3 Oct 2018 (14_39_50).txt 62.914560 14 2
00Pf4000006gnbaEAA 001f400000ZP5yVAAT_3 Oct 2018 (14_40_29).doc 68.157440 15 2
00Pf4000006gnbBEAQ 001f400000ZP5yUAAT_3 Oct 2018 (14_39_58).txt 78.643200 16 2
00Pf4000006gnbfEAA 001f400000ZP5yVAAT_3 Oct 2018 (14_40_34).doc 89.128960 17 2
00Pf4000006gnbGEAQ 001f400000ZP5yVAAT_3 Oct 2018 (14_40_7).pdf 90.177536 18 2
00Pf4000006gnbkEAA 001f400000ZP5yVAAT_3 Oct 2018 (14_40_43).txt 91.226112 19 2
00Pf4000006gnbLEAQ 001f400000ZP5yVAAT_3 Oct 2018 (14_40_12).pdf 96.468992 20 2
00Pf4000006gnbpEAA 001f400000ZP5yVAAT_3 Oct 2018 (14_40_46).txt 101.711872 21 3
00Pf4000006gnbQEAQ 001f400000ZP5yVAAT_3 Oct 2018 (14_40_17).pdf 112.197632 22 3
00Pf4000006gnbuEAA 001f400000ZP5yVAAT_3 Oct 2018 (14_40_52).txt 122.683392 23 3
00Pf4000006gnbVEAQ 001f400000ZP5yVAAT_3 Oct 2018 (14_40_26).doc 123.731968 24 3
00Pf4000006gnbzEAA 001f400000ZP5yWAAT_3 Oct 2018 (14_41_0).pdf 124.780544 25 3
00Pf4000006gnc4EAA 001f400000ZP5yWAAT_3 Oct 2018 (14_41_5).pdf 130.023424 26 3
00Pf4000006gnc9EAA 001f400000ZP5yWAAT_3 Oct 2018 (14_41_11).pdf 140.509184 27 3
00Pf4000006gncdEAA 001f400000ZP5yWAAT_3 Oct 2018 (14_41_56).txt 145.752064 28 3
00Pf4000006gncEEAQ 001f400000ZP5yWAAT_3 Oct 2018 (14_41_30).doc 146.800640 29 3
00Pf4000006gnciEAA 001f400000ZP5yWAAT_3 Oct 2018 (14_42_3).txt 157.286400 30 4
00Pf4000006gncJEAQ 001f400000ZP5yWAAT_3 Oct 2018 (14_41_33).doc 162.529280 31 4
00Pf4000006gncKEAQ 001f400000ZP5ycAAD_3 Oct 2018 (14_48_11).txt 173.015040 32 4
00Pf4000006gncnEAA 001f400000ZP5yXAAT_3 Oct 2018 (14_42_12).pdf 174.063616 33 4
00Pf4000006gncsEAA 001f400000ZP5yXAAT_3 Oct 2018 (14_42_15).pdf 179.306496 34 4
00Pf4000006gncTEAQ 001f400000ZP5yWAAT_3 Oct 2018 (14_41_44).doc 189.792256 35 4
00Pf4000006gncxEAA 001f400000ZP5yXAAT_3 Oct 2018 (14_42_30).pdf 200.278016 36 5
00Pf4000006gncYEAQ 001f400000ZP5yWAAT_3 Oct 2018 (14_41_53).txt 201.326592 37 5
00Pf4000006gnd2EAA 001f400000ZP5yXAAT_3 Oct 2018 (14_42_46).doc 202.375168 38 5
00Pf4000006gnd7EAA 001f400000ZP5yXAAT_3 Oct 2018 (14_42_49).doc 207.618048 39 5
00Pf4000006gndbEAA 001f400000ZP5yYAAT_3 Oct 2018 (14_43_23).pdf 212.860928 40 5
00Pf4000006gndCEAQ 001f400000ZP5yXAAT_3 Oct 2018 (14_42_54).doc 223.346688 41 5
00Pf4000006gndgEAA 001f400000ZP5yYAAT_3 Oct 2018 (14_43_30).pdf 233.832448 42 5
00Pf4000006gnDhEAI Snake_River_(5mb).jpg 239.077777 43 5
00Pf4000006gndHEAQ 001f400000ZP5yXAAT_3 Oct 2018 (14_43_3).txt 240.126353 44 5
00Pf4000006gndlEAA 001f400000ZP5yYAAT_3 Oct 2018 (14_43_39).doc 241.174929 45 5
00Pf4000006gndMEAQ 001f400000ZP5yXAAT_3 Oct 2018 (14_43_6).txt 246.417809 46 5
00Pf4000006gndqEAA 001f400000ZP5yYAAT_3 Oct 2018 (14_43_41).doc 251.660689 47 6
00Pf4000006gnDrEAI Pizigani_1367_Chart_10MB.jpg 261.835395 48 6
00Pf4000006gndREAQ 001f400000ZP5yXAAT_3 Oct 2018 (14_43_11).txt 272.321155 49 6
00Pf4000006gndvEAA 001f400000ZP5yYAAT_3 Oct 2018 (14_43_47).doc 282.806915 50 6
00Pf4000006gnDwEAI Spinner_Dolphin_Indian_Ocean_07-2017.jpg 284.109019 51 6
00Pf4000006gndWEAQ 001f400000ZP5yYAAT_3 Oct 2018 (14_43_20).pdf 285.157595 52 6
00Pf4000006gnDXEAY 440 Kb.jpg 285.609143 53 6
00Pf4000006gne0EAA 001f400000ZP5yYAAT_3 Oct 2018 (14_43_59).txt 286.657719 54 6
00Pf4000006gne5EAA 001f400000ZP5yYAAT_3 Oct 2018 (14_44_2).txt 291.900599 55 6
00Pf4000006gneaEAA 001f400000ZP5yZAAT_3 Oct 2018 (14_44_59).txt 302.386359 56 7
00Pf4000006gneAEAQ 001f400000ZP5yYAAT_3 Oct 2018 (14_44_7).txt 312.872119 57 7
00Pf4000006gneeEAA 001f400000ZP5yZAAT_3 Oct 2018 (14_44_40).doc 323.357879 58 7
Help would be much appreciated, thank you.
You can use integer division:
SELECT ( CAST ( SUM(Datalength(versiondata) * 0.000001)
OVER (
partition BY Datalength(sourceid)
ORDER BY sourceid) AS INT) / 50 ) + 1 AS Batch
FROM TargetContentVersion
Here's a quick sample that demonstrates how it works:
CREATE TABLE #t (id INT IDENTITY(1,1), size NUMERIC(8,6))
GO
INSERT INTO #t
SELECT RAND() * 20
GO 20 -- Create 20 sample rows with random sizes between 0 and 20
SELECT id, SUM(size) OVER (ORDER BY id) AS RunningTotal,
(CAST(SUM(size) OVER (ORDER BY id) AS INT) / 50) + 1 AS Batch
FROM #t
id RunningTotal Batch
1 2.303367 1
2 4.049776 1
3 19.177784 1
4 28.637981 1
5 29.675840 1
6 32.781603 1
7 33.859586 1
8 36.633733 1
9 39.413363 1
10 58.004502 2
11 70.363837 2
12 82.897268 2
13 83.946657 2
14 85.623044 2
15 87.432670 2
16 103.304830 3
17 103.709745 3
18 122.165664 3
19 126.554616 3
20 128.019929 3
I've worked it out.
Script below for those interested.
WITH cte1 AS (
SELECT SourceId, Title, DATALENGTH(VersionData) * 0.000001 AS RecordSize,
CAST(SUM(DATALENGTH(VersionData) * 0.000001) OVER (PARTITION BY
DATALENGTH(SourceId) ORDER BY SourceId) AS INT) AS RunningTotal,
RANK() OVER(ORDER BY SourceId) AS RowNo
FROM TargetContentVersion WITH(NOLOCK)
)
SELECT SourceId, Title, RecordSize, RunningTotal, RowNo,
SUM(RunningTotal) OVER (PARTITION BY SourceId ORDER BY SourceId) / 50 AS Batch
FROM cte1
Another option would be to use dense_rank:
WITH CTE AS
(
SELECT SourceId, Title, Description,
SUM(DATALENGTH(VersionData) * 0.000001) OVER (PARTITION BY DATALENGTH(SourceId) ORDER BY SourceId) AS RunningTotal,
RANK() OVER(ORDER BY SourceId) AS RowNo
FROM TargetContentVersion WITH(NOLOCK)
)
SELECT SourceId, Title, Description, RunningTotal, RowNo,
DENSE_RANK() OVER(ORDER BY CAST(RunningTotal as int) / 50) As Batch
from CTE
Note the casting of RunningTotal to int.
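The cast matters because RunningTotal is a decimal: dividing the decimal directly by 50 yields a fractional result, whereas casting to int first makes / perform integer division and produce whole batch numbers. A quick illustration:
select 123.731968 / 50 as decimal_division              -- about 2.4746
     , cast(123.731968 as int) / 50 as integer_division; -- 2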