Hive Subquery with Group By

Hive Subquery with Group By - hive

I'm new to Hive but consider myself intermediate at SQL. I'm getting the error shown below. I've tried changing, adding, removing parentheses and their locations as well as adding & removing subquery aliases. Nothing seems to clear the error. Your help is appreciated!
org.apache.hive.service.cli.HiveSQLException: Error while compiling statement:
FAILED: ParseException line 16:11 missing ) at 'testing' near ')' in expression specification
-- (Re)define the external table over the comma-delimited files under
-- /user/maria_dev/hivetest/batting.
DROP TABLE IF EXISTS batting;
CREATE EXTERNAL TABLE IF NOT EXISTS batting(id STRING, year INT, team STRING, league STRING, games INT, ab INT, runs INT, hits INT, doubles INT, triples INT, homeruns INT, rbi INT, sb INT, cs INT, walks INT, strikeouts INT, ibb INT, hbp INT, sh INT, sf INT, gidp INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '/user/maria_dev/hivetest/batting';
-- (Re)define the external table over the comma-delimited files under
-- /user/maria_dev/hivetest/master.
DROP TABLE IF EXISTS master;
CREATE EXTERNAL TABLE IF NOT EXISTS master(id STRING, byear INT, bmonth INT, bday INT, bcountry STRING, bstate STRING, bcity STRING, dyear INT, dmonth INT, dday INT, dcountry STRING, dstate STRING, dcity STRING, fname STRING, lname STRING, name STRING, weight INT, height INT, bats STRING, throws STRING, debut STRING, finalgame STRING, retro STRING, bbref STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '/user/maria_dev/hivetest/master';
-- Intent: join batting to master on id, sum doubles + triples per
-- (bcity, bstate), dense-rank those sums in descending order and keep ranks 1-5.
-- This statement is the one that fails with the ParseException quoted above;
-- note that the GROUP BY names `bstat`, while the column is `bstate`.
SELECT
bcity, bstate
FROM(
SELECT bcity, bstate, DENSE_RANK() OVER (ORDER BY total_double_triples DESC) as ranked
FROM
( SELECT bcity, bstate, SUM(doubles+triples) as total_double_triples
FROM
(
((SELECT id as b_id, doubles, triples
FROM batting) batting_data
JOIN
(SELECT id, bcity, bstate
FROM master
WHERE NOT ISNULL(bcity) AND NOT ISNULL(bstate)) master_join
ON master_join.id = batting_data.b_id)
) testing GROUP BY (bcity, bstat)
) total_group_by
) subquery
WHERE subquery.ranked <= 5
;

Missing SELECT in the subquery and extra (). See comments marked -- 1st, 2nd and 3rd in the code:
-- Annotated copy of the failing query: the numbered comments mark the edits
-- to make; this text is not meant to be run as-is.
SELECT bcity, bstate FROM(
SELECT bcity, bstate, DENSE_RANK() OVER (ORDER BY total_double_triples DESC) as ranked
FROM
( SELECT bcity, bstate, SUM(doubles+triples) as total_double_triples
FROM
(--There should be SELECT ... FROM ---------------------------------- 1st
( ----------Remove this line ------------------------------------ 2nd
(SELECT id as b_id, doubles, triples
FROM batting) batting_data
JOIN
(SELECT id, bcity, bstate
FROM master
WHERE NOT ISNULL(bcity) AND NOT ISNULL(bstate)
) master_join
ON master_join.id = batting_data.b_id ) --Remove extra)------3rd
) testing GROUP BY (bcity, bstate) -- column is bstate (was misspelled bstat)
) total_group_by
) subquery WHERE subquery.ranked <= 5 ;

I had to remove the alias before the GROUP BY clause and fix a typo in the bstate field name. Thanks to all who proposed solutions!
-- (bcity, bstate) pairs whose summed doubles + triples fall in dense ranks 1-5.
-- DENSE_RANK leaves no gaps after ties, so more than five rows can be returned.
SELECT
    bcity,
    bstate
FROM (
    SELECT
        bcity,
        bstate,
        DENSE_RANK() OVER (ORDER BY total_double_triples DESC) AS ranked
    FROM (
        -- One row per (bcity, bstate) with its summed doubles + triples.
        SELECT
            bcity,
            bstate,
            SUM(doubles + triples) AS total_double_triples
        FROM (
            -- Parenthesised join (not a subquery), so it carries no alias.
            (
                SELECT id AS b_id, doubles, triples
                FROM batting
            ) batting_data
            INNER JOIN (
                SELECT id, bcity, bstate
                FROM master
                WHERE bcity IS NOT NULL
                  AND bstate IS NOT NULL
            ) master_join
                ON master_join.id = batting_data.b_id
        )
        GROUP BY bcity, bstate
    ) city_totals
) ranked_cities
WHERE ranked_cities.ranked <= 5;

Related

Rewrite SQL with LEFT JOIN INSTEAD OF OUTER APPLY

-- Session temp table holding the ledger transaction rows queried below.
CREATE TABLE #ledgertxn (
txnno int,
lid int,
flid int,
txndate date,
lname varchar(50),
debit int,
credit int,
ledgername varchar(50),
drcr varchar(2),
txntype varchar(30)
)
-- For each distinct txnno, return the single #ledgertxn row with the highest
-- lid (TOP 1 ... ORDER BY lid DESC inside the OUTER APPLY), exposing
-- ledgername under the name Particulars.
SELECT
Limit1.Txnno,
Limit1.Txndate,
Limit1.Particulars,
Limit1.Debit,
Limit1.Credit
FROM
-- Driving set: one row per transaction number.
(SELECT DISTINCT
txnno
FROM
#ledgertxn ) Distinct1
OUTER APPLY
(SELECT TOP 1
Project2.Txnno,
Project2.Txndate,
Project2.Particulars,
Project2.Debit,
Project2.Credit
FROM
( SELECT
Extent2.txnno,
Extent2.txndate,
Extent2.ledgername AS Particulars,
Extent2.debit,
Extent2.credit,
Extent2.lid
FROM
#ledgertxn Extent2
WHERE
-- Correlation to the outer row: only rows of the current txnno.
Distinct1.txnno = Extent2.txnno ) Project2
ORDER BY
Project2.Lid desc ) AS Limit1

Trying to create a temp table in Microsoft SQL Server but keep getting hit with an error

-- Temp table for the per-location vaccination percentages (7 columns).
-- NOTE(review): if a #PercentofPopulationVaccinated with a different column
-- set already exists in the session, the insert below would be checked
-- against that older definition — confirm before blaming row_number().
create table #PercentofPopulationVaccinated
(
continent nvarchar(255),
location nvarchar(255),
date datetime,
Population numeric,
people_fully_vaccinated numeric,
[%_of_pop_vaxxxed] numeric,
rn int
)
-- No column list is given, so the 7 selected values are matched to the table
-- columns by position.
insert into #PercentofPopulationVaccinated
select
cd.continent, cd.location, cd.date, cd.population,
vac.people_fully_vaccinated,
-- The percentage casts people_fully_vaccinated to int here, while the
-- ORDER BY expression in the next line uses the uncast column.
(cast(vac.people_fully_vaccinated as int) / cd.population) * 100 as [%_of_pop_vaxxxed],
-- rn numbers the rows of each location, highest percentage first, then by date.
rn = row_number() over (partition by cd.Location order by (vac.people_fully_vaccinated / cd.population) * 100 desc, cd.Date)
from
coviddeaths as cd
join
covidvaccinations vac on cd.location = vac.location
and cd.date = vac.date
where
cd.continent is not null
-- Inspect what was loaded.
select *
from #PercentofPopulationVaccinated
Error
Column name or number of supplied values does not match table definition
This error is odd; I'm sure it has to do with row number

First, since this is a temporary table, check whether it already exists and drop it if it does.
-- Drop the temp table when it already exists, so it can be recreated below.
IF OBJECT_ID('tempdb..#PercentofPopulationVaccinated') IS NOT NULL
    DROP TABLE #PercentofPopulationVaccinated
After that table definition should be:
-- Temp table for the per-location vaccination percentages.
-- Every column needs a data type; rn receives the row_number() value, so it is an int.
create table #PercentofPopulationVaccinated
(
    continent nvarchar(20),
    data_location nvarchar(255),
    data_date datetime,
    total_population float,
    people_fully_vaccinated float,
    [%_of_pop_vaxxxed] decimal(8,4),
    rn int
)
and your select:
-- Load the temp table. The explicit column list names the target columns only;
-- data types belong in CREATE TABLE, not here.
insert into #PercentofPopulationVaccinated (
    continent,
    data_location,
    data_date,
    total_population,
    people_fully_vaccinated,
    [%_of_pop_vaxxxed],
    rn
)
select
    cd.continent,
    cd.location,
    cd.date,
    cd.population,
    vac.people_fully_vaccinated,
    (vac.people_fully_vaccinated / cd.population) * 100 as [%_of_pop_vaxxxed],
    -- Number the rows of each location: highest percentage first, then by date.
    rn = row_number() over (
        partition by cd.Location
        order by (vac.people_fully_vaccinated / cd.population) * 100 desc, cd.Date
    )
from
    coviddeaths as cd
join
    covidvaccinations vac
    on cd.location = vac.location
    and cd.date = vac.date
where
    cd.continent is not null

-- Inspect what was loaded.
select *
from #PercentofPopulationVaccinated

How to get the Values of recently inserted columns from Temporary Table?

I am not able to use the recently inserted Quantity value of #ACCT table in the Select statement to be used like
ser.ServiceRate * Quantity
Every time I get the ERROR
Cannot insert the value NULL into column 'Amount'
I am just a beginner so any pointers to solve this would help.
-- Loop over every ReservationId value from the smallest to the largest and
-- insert one dbo.[Transaction] row per dbo.[Service] row, with a random
-- Quantity between 1 and 3.
-- Local variables use the @ prefix; # is reserved for temp tables such as #Acct.
DECLARE @MinReservationId INT = (SELECT MIN(f.ReservationId) FROM dbo.Reservation AS f)
DECLARE @MaxReservationId INT = (SELECT MAX(f.ReservationId) FROM dbo.Reservation AS f)
DECLARE @QuantityNew INT
WHILE @MinReservationId <= @MaxReservationId
BEGIN
    CREATE TABLE #Acct
    (
        ServiceId INT,
        Quantity INT --> I WANT THIS VALUE TO BE USED BELOW
    )
    INSERT INTO dbo.[Transaction]
    (
        ReservationId,
        ServiceId,
        Rate,
        Quantity,
        Amount
    )
    OUTPUT inserted.ServiceId, Inserted.Quantity
    INTO #Acct
    (
        ServiceId,
        Quantity
    )
    SELECT
        @MinReservationId,
        ser.ServiceId,
        ser.ServiceRate,
        ABS(CHECKSUM(NEWID())%3) + 1,
        -- QUANTITY from #ACCT should be used here.
        -- This is the problem being asked about: #Acct was created just above
        -- and is only filled by the OUTPUT clause of this same INSERT, so the
        -- subquery finds no row while the SELECT runs and Amount comes out NULL.
        ser.ServiceRate * (SELECT acc.Quantity from #Acct as acc)
    FROM dbo.[Service] AS ser
    SELECT @MinReservationId = @MinReservationId + 1
    Drop table #Acct
END

You can use a CTE in order to capture the NEWID() value created for every record of table dbo.[Service]. Then use this CTE to do the INSERT:
-- Compute the random Quantity once per dbo.[Service] row inside the CTE, so the
-- same value feeds both the Quantity column and the Amount calculation.
-- @MinReservationId is the loop variable declared by the surrounding batch.
;WITH ToInsert AS
(
    SELECT ServiceId,
           ServiceRate,
           ABS(CHECKSUM(NEWID())%3) + 1 AS Quantity
    FROM dbo.[Service]
)
INSERT INTO dbo.[Transaction]
(
    ReservationId,
    ServiceId,
    Rate,
    Quantity,
    Amount
)
SELECT @MinReservationId,
       ServiceId,
       ServiceRate,
       Quantity,
       ServiceRate * Quantity
FROM ToInsert

Insert where not exists Violation of PRIMARY KEY

I'm having troubles with an Insert where not exists and I'm not sure if a MERGE statement would be more efficient or what's wrong with my statement.
I have en existing View and need to insert the new records of this View into a Table.
The Table looks like:
-- Target table for the rows copied from the view; Serialnumber is the primary
-- key, so each serial number can be stored only once.
CREATE TABLE [dbo].[ser_number_all]
(Serialnumber nvarchar(100) PRIMARY KEY,
TypeName nvarchar(max),
Date datetime,
Parent_Serialnumber nvarchar(100),
JobNumber nvarchar(30),
ProductNode hierarchyid,
);
The Insert statement looks like this:
-- Attempt to copy only the new view rows into dbo.ser_number_all.
-- NOTE(review): the outer query aliases the VIEW as ser_number_all, and the
-- NOT EXISTS subquery also reads the view (alias hv). The target table
-- dbo.ser_number_all is therefore never consulted, so rows already stored in it
-- are not excluded.
insert into [dbo].[ser_number_all]
( Serialnumber
, TypeName
, Date
, Parent_Serialnumber
, JobNumber
, ProductNode)
select Serialnumber
, TypeName
, Date
, Parent_Serialnumber
, JobNumber
, ProductNode
from dbo.Hierachical_View_with_Jobnumbers as ser_number_all
where not exists (select 1
from Hierachical_View_with_Jobnumbers as hv
-- Each predicate uses '=', which is never true when either side is NULL.
where hv. Serialnumber = ser_number_all.Serialnumber
and hv. TypeName = ser_number_all.TypeName
and hv. Date = ser_number_all.Date
and hv. Parent_Serialnumber = ser_number_all.Parent_Serialnumber
and hv. JobNumber = ser_number_all.JobNumber
and hv. ProductNode = ser_number_all.ProductNode);
As long the View has not any new records, it looks ok and I'm not getting any error, the output is 0 records as it should be.
When I add a new record to the origin table and the view has 1 record more, I'm always getting this error:
Msg 2627, Level 14, State 1, Line 4
Violation of PRIMARY KEY constraint 'PK__ser_numb__F2753A12C4ABA976'. Cannot insert duplicate key in object 'dbo.ser_number_all'. The duplicate key value is (.x3666AB05).
The statement has been terminated.
I don't get it why it will insert a duplicate value in the primary key column because in my WHERE clause I can't see any mistake.
I have also tried with IS NULL instead = ser_number_all.TypeName and for all other columns where it could have a NULL value, but still the same.
Again, I'm coming from Oracle and it looks like I have to learn many diversities with MS SQL compared to Oracle.
Appreciate any suggestion :-)
Thx
EDIT:
Here the code of the View:
-- View that walks the Products parent/child tree with a recursive CTE and
-- exposes each product with its type name and a hierarchyid path.
CREATE VIEW [dbo].[Hierachical_View_with_Jobnumbers]
AS
WITH ProductList
AS
(
-- Anchor member: products without a parent; path is '/<row number>/'.
SELECT p.Serialnumber,
p.Type_Id,
p.Parent_Serialnumber,
p.ActiveJob_Jobnumber as JobNumber,
N'/' + CONVERT(NVARCHAR(4000), ROW_NUMBER() OVER (ORDER BY p.Serialnumber)) + N'/' AS ProductNode_AsChar
FROM Products AS p
WHERE p.Parent_Serialnumber IS NULL
UNION ALL
-- Recursive member: children of rows already in ProductList; the child's row
-- number is appended to the parent's path.
SELECT p.Serialnumber,
p.Type_Id,
p.Parent_Serialnumber,
-- NOTE(review): JobNumber is unqualified; presumably it resolves to
-- pl.JobNumber (the parent's value) — confirm Products has no JobNumber column.
JobNumber,
pl.ProductNode_AsChar + CONVERT(NVARCHAR(4000), ROW_NUMBER() OVER (ORDER BY p.Serialnumber)) + N'/'
FROM Products AS p
INNER JOIN ProductList AS pl ON p.Parent_Serialnumber = pl.Serialnumber
)
-- Final projection: resolve the type name and convert the text path to hierarchyid.
SELECT Serialnumber,
pt.Name as TypeName,
Parent_Serialnumber,
JobNumber,
CONVERT(HIERARCHYID, ProductNode_AsChar) AS ProductNode
FROM ProductList as pl
INNER JOIN ProductTypes as pt on pl.Type_Id = pt.Id;

@TheGameiswar
Sorry, now I got it what you meant ;-) Stupid me...
Here the solution which works now with correctly correlating:
-- Copy into dbo.ser_number_all every view row whose Serialnumber is not yet
-- present in that table.
INSERT INTO [dbo].[ser_number_all]
    (Serialnumber, TypeName, Date, Parent_Serialnumber, JobNumber, ProductNode)
SELECT
    src.Serialnumber,
    src.TypeName,
    src.Date,
    src.Parent_Serialnumber,
    src.JobNumber,
    src.ProductNode
FROM dbo.Hierachical_View_with_Jobnumbers AS src
WHERE NOT EXISTS (
    -- Correlate against the target table, not the view itself.
    SELECT 1
    FROM ser_number_all AS tgt
    WHERE tgt.Serialnumber = src.Serialnumber
);
Thank you all for your time and guiding me to the right direction :-)

SQL Select Best practice

The following works, I'm just wondering if this is the correct approach to finding the latest value for each audit field.
-- Demo setup: an audit table with two audit fields (10 and 20), two dated
-- values each.
USE tempdb
CREATE Table Tbl(
TblID Int,
AuditFieldID Int,
AuditValue Int,
AuditDate Date
)
GO
-- NOTE(review): how '1/1/2001'-style literals are parsed presumably depends on
-- the session's date format settings — confirm if day and month ever differ.
INSERT INTO Tbl(TblID,AuditFieldID,AuditValue,AuditDate) VALUES(1,10,101,'1/1/2001')
INSERT INTO Tbl(TblID,AuditFieldID,AuditValue,AuditDate) VALUES(2,10,102,'1/1/2002')
INSERT INTO Tbl(TblID,AuditFieldID,AuditValue,AuditDate) VALUES(3,20,201,'1/1/2001')
INSERT INTO Tbl(TblID,AuditFieldID,AuditValue,AuditDate) VALUES(4,20,202,'1/1/2009')
-- Latest value per audit field: keep the row whose TblID equals the TblID of
-- the most recent AuditDate within the same AuditFieldID.
SELECT AuditFieldID,AuditValue,AuditDate
FROM Tbl A
WHERE TblID=
-- Correlated subquery, evaluated against the outer row's AuditFieldID.
(SELECT TOP 1 TblID
FROM Tbl
WHERE AuditFieldID=A.AuditFieldID
ORDER BY AuditDate DESC
)

Aggregate/ranking to get key and latest date, join back to get value.
This assumes SQL Server 2005+
-- Same demo data held in a table variable. Table variables are declared and
-- referenced with the @ prefix; # denotes temp tables and is invalid in DECLARE.
DECLARE @tbl TABLE (
    TblID Int,
    AuditFieldID Int,
    AuditValue Int,
    AuditDate Date
)
INSERT INTO @tbl(TblID,AuditFieldID,AuditValue,AuditDate) VALUES(1,10,101,'1/1/2001')
INSERT INTO @tbl(TblID,AuditFieldID,AuditValue,AuditDate) VALUES(2,10,102,'1/1/2002')
INSERT INTO @tbl(TblID,AuditFieldID,AuditValue,AuditDate) VALUES(3,20,201,'1/1/2001')
INSERT INTO @tbl(TblID,AuditFieldID,AuditValue,AuditDate) VALUES(4,20,202,'1/1/2009')

-- cLatest ranks the rows of each AuditFieldID by AuditDate, newest first.
;WITH cLatest AS
(
    SELECT
        ROW_NUMBER() OVER (PARTITION BY AuditFieldID ORDER BY AuditDate DESC) AS Ranking,
        AuditFieldID,
        AuditDate
    FROM
        @tbl
)
-- Join back on (AuditFieldID, AuditDate) to fetch the value of the newest row.
SELECT
    A.AuditFieldID,
    A.AuditValue,
    A.AuditDate
FROM
    @tbl A
JOIN
    cLatest C
    ON A.AuditFieldID = C.AuditFieldID
    AND A.AuditDate = C.AuditDate
WHERE
    C.Ranking = 1

Simpler:
-- Returns the single most recent row of the whole table (not one row per
-- AuditFieldID). The sort keyword is DESC.
SELECT TOP 1 AuditFieldID, AuditValue, AuditDate
FROM Tbl
ORDER BY AuditDate DESC

There are various methods for doing this. Different methods perform differently. I encourage you to look at this blog which explains the various methods.
Including an Aggregated Column's Related Values

you don't need the where statement as you are already selecting from tbl A AND selecting on the same field.

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

Hive Subquery with Group By - hive

Related

Rewrite SQL with LEFT JOIN INSTEAD OF OUTER APPLY

Trying to create a temp table in Microsoft SQL Server but keep getting hit with an error

How to get the Values of recently inserted columns from Temporary Table?

Insert where not exists Violation of PRIMARY KEY

SQL Select Best practice

Categories

Resources