SQL Query giving wrong output - sql

I have written this code to find "For each year, count the number of movies in that year that had only female actors".
-- Counts, per release year, the movies that are not in the exclusion list `k`.
-- `k` holds the MID of every movie having at least one cast member whose
-- trimmed gender is 'Male' or 'None'.
-- NOTE(review): cast rows whose PID is NULL never satisfy the JOIN, so their
-- movies are not added to `k`.
-- NOTE(review): `k` returns MC.MID untrimmed while the outer query compares
-- TRIM(MID) against it — confirm the stored MIDs carry no padding.
-- NOTE(review): `UNASSIGNED` is not a standard type name (possibly meant to be
-- UNSIGNED) — confirm the target engine accepts it.
WITH
k AS
(SELECT MC.MID a
FROM M_CAST MC
JOIN PERSON P ON TRIM(P.PID) = TRIM(MC.PID)
WHERE TRIM(P.GENDER) IN ('Male', 'None'))
SELECT CAST(SUBSTR(M.year,-4) AS UNASSIGNED) Year, COUNT(DISTINCT TRIM(MID)) number_of_movies
FROM MOVIE M
WHERE TRIM(MID) NOT IN (SELECT a FROM k)
GROUP BY CAST(SUBSTR(M.year,-4) AS UNASSIGNED)
ORDER BY Year
I am getting this output -
Year Female_Movie_Count
1939 1
1999 1
2000 1
2009 1
2012 1
2018 2
When I submitted this code, I got the response: "Your output is wrong. When selecting non-female movies you should also include MIDs which have null PIDs in the M_Cast table as non-female movies too." As I am a beginner in SQL, I cannot see where I went wrong. Please suggest where I need to modify this code.
Here is the schema - https://i.stack.imgur.com/sWRSN.png

You could use a UNION ALL with M_CAST where PID is NULL as per your requirement
-- For each release year, count the movies whose cast has no non-female member.
-- `k` lists the (trimmed) MID of every movie that must be excluded.
WITH
k AS
(
-- Movies with at least one cast member whose gender is 'Male' or 'None'.
-- NOTE(review): a PERSON row whose GENDER is SQL NULL is not matched by this
-- IN list — confirm whether missing genders are stored as the text 'None'.
SELECT TRIM(MC.MID) a
FROM M_CAST MC
JOIN PERSON P ON TRIM(P.PID) = TRIM(MC.PID)
WHERE TRIM(P.GENDER) IN ('Male', 'None')
UNION ALL
-- Movies with a cast row that has no PID: the performer is unknown, so the
-- movie is treated as non-female.
SELECT TRIM(MC.MID) a
FROM M_CAST MC
WHERE MC.PID IS NULL
)
-- The last four characters of MOVIE.year are taken as the numeric year.
SELECT CAST(SUBSTR(M.year,-4) AS INTEGER) Year,
COUNT(DISTINCT TRIM(M.MID)) number_of_movies
FROM MOVIE M
-- NOT EXISTS rather than NOT IN: a single NULL MID in `k` would make NOT IN
-- reject every movie. Both sides are trimmed so padded keys still match.
WHERE NOT EXISTS (SELECT 1 FROM k WHERE k.a = TRIM(M.MID))
GROUP BY CAST(SUBSTR(M.year,-4) AS INTEGER)
ORDER BY Year

Related

Sql query returning empty table

I am trying to solve 2 queries
Find all the actors that made more movies with Yash Chopra than any other director
-- Innermost query `a`: number of rows per (director PID, actor PID) pair.
-- Middle query `b`: the largest such count per actor.
-- Outer query: keeps the rows of `b` whose director equals the PID stored for
-- the person named 'Yash Chopra'.
-- NOTE(review): `b` selects a.director while grouping only by a.actor, so which
-- director value is returned with MAX(a.count) is engine-dependent — confirm.
-- NOTE(review): b.director comes from md.PID untrimmed and is compared with an
-- untrimmed person.PID, although the joins above trim their keys — confirm.
Select b.number,b.actor,b.director from (select MAX(a.count) as number,a.director,a.actor from
(select count(p.PID) as count ,p.PID as actor,md.PID as director from person as p left join m_cast
as
mc on trim(p.PID)=trim(mc.PID) inner join m_director as md on trim(md.MID)=trim(mc.MID) group by
md.PID ,p.PID) as a group by a.actor) as b where b.director=(select PID from person where
Name='Yash Chopra')
report for each year the percentage of movies in that year with only female actors, and the total number of movies made that year. For example, one answer will be: 1990 31.81 13522 meaning that in 1990 there were 13,522 movies, and 31.81% had only female actors. You do not need to round your answer.
-- `female_count`: per year, the number of movies for which no cast row joins to
-- a person whose gender is 'Male'.
-- `total_count`: per year, the number of movies.
-- The outer query pairs both by year and outputs the year and the percentage;
-- the yearly total itself is not part of the select list.
-- NOTE(review): the keys are compared without TRIM here, unlike other queries
-- against these tables — confirm whether the stored ids are padded.
SELECT female_count.year Year,
((female_count.Total_movies_with_only_female_leads)*100)/total_count.Total Percentage FROM ((SELECT
movie.year Year,count(*) Total_movies_with_only_female_leads FROM movie WHERE NOT EXISTS ( SELECT *
FROM M_Cast,person WHERE M_Cast.mid = movie.MID and M_Cast.PID = person.PID AND person.gender='Male'
) GROUP BY movie.year) female_count, (SELECT movie.year,count(*) as Total FROM movie group by
movie.year) total_count) WHERE female_count.year=total_count.year
Unfortunately for both the queries, I am getting empty table. Can someone help me in solving these 2 queries
I wrote it using CTEs so it is more readable.
First Question:
-- Actors whose highest per-director movie count is the one with 'Yash Chopra'.
-- The CTE yields one row per (actor, director) pair with the number of cast
-- rows they share.
WITH HowManyMoviesPerActorDirector AS
(select mc.pid as actorpid
,pa.name as actorname
,md.pid as directorpid
-- column name kept for compatibility; it holds the director's name
,pd.name as producername
,count(mc.MID) as numberofmovies
from m_cast as mc
-- keys are trimmed on both sides, as the other queries on these tables do;
-- presumably the stored ids carry padding — confirm
inner join m_director md on trim(md.MID)=trim(mc.MID)
inner join person pa ON trim(mc.PID)=trim(pa.PID)
inner join person pd ON trim(md.PID)=trim(pd.PID)
-- GROUP BY takes plain expressions (no aliases) and lists every
-- non-aggregated select column
group by mc.pid,pa.name,md.pid,pd.name
)
select h.actorname
,h.producername
,h.numberofmovies
from HowManyMoviesPerActorDirector h
-- keep a pair only when its count equals the actor's maximum over all directors
-- (an actor tied between Yash Chopra and another director is still returned)
WHERE h.numberofmovies = (select MAX(h2.numberofmovies)
from HowManyMoviesPerActorDirector h2
where h2.actorpid=h.actorpid
group by h2.actorpid)
AND h.producername='Yash Chopra'
The second one:
-- Per year: total number of movies and the percentage whose cast is female only.
WITH MoviesIncludingGendeflag AS
-- One row per movie that has cast rows; genderflag counts the cast rows that
-- are not known to be female (0 means every cast member is female).
( select m.mid
,m.year
,sum(case when lower(trim(p.gender))='female' then 0 else 1 end) as genderflag
from movie m
inner join m_cast mc on trim(mc.mid)=trim(m.mid)
-- left join: a cast row without a matching person must still count as
-- "not female" instead of disappearing from the sum
left join person p on trim(p.pid)=trim(mc.pid)
group by m.mid,m.year
), FemaleOnlyMovies AS
( select m.year,count(m.mid) as Total
from MoviesIncludingGendeflag m
where m.genderflag=0
group by m.year
), TotalMovies AS
(
select m.year,count(m.mid) as Total
from movie m
group by m.year
)
-- left join + COALESCE so that years without any female-only movie report 0
select TM.year,TM.Total,(COALESCE(FOM.Total,0)*100.0/TM.Total) as percentage
from TotalMovies TM
left join FemaleOnlyMovies FOM ON FOM.year=TM.year

SQL query: To find an actors who did more films with Quentin Tarantino

I'm supposed to find all the actors that made more movies with Quentin Tarantino than with any other director.
I wrote the following SQL query, to find the number films an actor did with a particular director
-- t1: distinct (director name, movie title) pairs.
-- t2: distinct (actor name, movie title) pairs.
-- The two are joined on the movie title and the rows are counted per
-- (director name, actor name), largest count first.
-- NOTE(review): the join key is the title, not the MID, so different movies
-- sharing a title are treated as one.
-- NOTE(review): grouping is by name, so different people with the same name
-- are merged — confirm this is acceptable.
SELECT Director, Actor, count(*) As total_films FROM (
SELECT a.name as Director, c.title as Movie
FROM Person a
Inner Join M_director b
ON TRIM(b.PID) = a.PID
Inner Join Movie c
ON TRIM(b.MID) = c.MID
GROUP BY a.name, c.title
) t1
Inner Join
(
Select x.name as Actor, z.title as Movie
FROM Person x
Inner Join M_cast y
ON TRIM(y.PID) = x.PID
Inner Join Movie z
ON TRIM(y.MID) = z.MID
GROUP BY x.name, z.title
) t2
ON t1.movie = t2.movie
GROUP BY t1.director, t2.Actor
ORDER BY total_films DESC
I got the following output:
Director Actor total_films
0 David Dhawan Shashi Kiran 23
1 David Dhawan Shashi Kiran 23
2 David Dhawan Kader Khan 20
3 David Dhawan Shakti Kapoor 20
4 David Dhawan Kader Khan 20
5 David Dhawan Shakti Kapoor 20
-
-
-
-
-
39000 Zunaid Memon Satyendra Kapoor 1
139001 Zunaid Memon Sergio Kato 1
139002 Zunaid Memon Sulabha Deshpande 1
139003 Zunaid Memon Vaibhav Jhalani 1
139004 Zunaid Memon Vivek Madan 1
Would appreciate your insights on this issue.
First, your correct query for the number of movies between an actor and director is:
-- Number of joined cast/director rows for every (director name, actor name) pair.
SELECT
    pd.name AS Director,
    pa.name AS actor,
    COUNT(*)
FROM M_cast AS c
INNER JOIN M_director AS d
    ON d.MID = c.MID
INNER JOIN Person AS pa
    ON pa.PID = c.PID
INNER JOIN Person AS pd
    ON pd.pid = d.pid
GROUP BY
    pd.name,
    pa.name
Note the use of table aliases that are abbreviations for the table names. This is important if you are learning SQL.
The "more films with" is tricky. The simplest method is window functions, but this is tricky because you don't want ties either. So:
-- Actors whose single highest per-director movie count is with 'Yash Chopra'.
-- Inner query: one row per (director, actor) pair with
--   cnt          - number of joined rows for the pair
--   seqnum       - rank of that count among the actor's directors (1 = highest)
--   num_with_cnt - how many of the actor's directors share exactly that count
SELECT da.*
FROM (SELECT pd.pid as director_pid, pd.name as Director, pa.pid as actor_pid, pa.name as actor, COUNT(*) as cnt,
RANK() OVER (PARTITION BY pa.pid ORDER BY COUNT(*) DESC) as seqnum,
COUNT(*) OVER (PARTITION BY pa.pid, COUNT(*)) as num_with_cnt
FROM M_director d JOIN
Person pd
ON pd.pid = d.pid JOIN
M_cast c
ON c.MID = d.MID JOIN
Person pa
ON pa.PID = c.PID
GROUP BY pd.pid, pd.name, pa.pid, pa.name
) da
WHERE director = 'Yash Chopra' AND
seqnum = 1 AND
-- exactly one director holds the top count, i.e. no tie
num_with_cnt = 1;
If ties were allowed, then you can remove the num_with_cnt logic.
I tried joining the M_Cast table with the M_Director table, but M_Cast holds a lot of data, because it lists the whole cast (the person ids of the actors/actresses) for every movie. So joining any table with M_Cast is a very time-consuming task.
So I have come up with a little optimization to this simple join.
Lets go back to our question - > We need to find actors/actress who have done more movies with director - "Yash Chopra".
Steps:
1. We will first find Person who have done atleast one movie with "Yash Chopra"
-- Trimmed PID of every cast row belonging to a movie directed by the person
-- whose name contains 'Yash Chopra'.
-- NOTE(review): the innermost subquery is used as a scalar (`=`); if several
-- names match the LIKE pattern the result is engine-dependent — confirm.
SELECT Trim(pid)
FROM m_cast
WHERE Trim(mid) IN (SELECT Trim(mid)
FROM m_director
WHERE Trim(pid) = (SELECT Trim(pid)
FROM person
WHERE NAME LIKE '%Yash Chopra%'
)
)
We can be sure that, the final output will contain Person from above query only, there won't be any other Person, which is not present in the above query.
It's true that the above query gives every Person who has done movies with "Yash Chopra", but we need only the Persons who have done more movies with "Yash Chopra" than with any other director. So our final output will be a subset of the above query result, and we do not need to join the entire M_Cast table: we can add a condition that keeps only the Person PIDs returned by the above query.
-- All m_cast rows (for any movie) of the people who appear in at least one
-- movie directed by the person whose name contains 'Yash Chopra'.
SELECT *
FROM m_cast
WHERE Trim(pid) IN (SELECT Trim(pid)
FROM m_cast
WHERE Trim(mid) IN (SELECT Trim(mid)
FROM m_director
WHERE Trim(pid) = (SELECT Trim(pid)
FROM person
WHERE NAME LIKE '%Yash Chopra%'
)
)
)
Now we will Join above M_Cast table with M_Director and get the count of Movies done by each Person with each Director
-- Number of movies per (actor, director) pair, restricted to the actors who
-- appear in at least one movie directed by the person matching 'Yash Chopra'.
-- NOTE(review): GROUP BY refers to the select-list aliases actor_pid and
-- director_pid, which not every engine accepts — confirm the target engine.
SELECT Count(mc.mid) AS Movies,
md.pid AS Director_PID,
mc.pid AS Actor_PID
FROM m_director md
INNER JOIN (SELECT *
FROM m_cast
WHERE Trim(pid) IN (SELECT Trim(pid)
FROM m_cast
WHERE Trim(mid) IN (SELECT Trim(mid)
FROM m_director
WHERE Trim(pid) = (SELECT Trim(pid)
FROM person
WHERE NAME LIKE '%Yash Chopra%'
)
)
)
)as mc
ON Trim(md.mid) = Trim(mc.mid)
GROUP BY actor_pid, director_pid
Now we have Person with count of movies with each director. We will find the max movie count for each Person, and if the max no. of movies is done by Yash Chopra, we will consider that Person.
We also have to make sure, we don't include Yash Chopra as an actor into this.
So, Now our final query will be as below,
-- Names of the actors whose largest per-director movie count is attributed to
-- the person matching 'Yash Chopra', with that count, largest first.
-- Innermost level: movies per (actor, director) pair for the candidate actors.
-- Middle level: Max(movies) per actor together with a director_pid.
-- Outer level: joins to person for the name, keeps rows whose director_pid is
-- the Yash Chopra PID, and excludes Yash Chopra himself as an actor.
-- NOTE(review): director_pid is selected next to Max(movies) while grouping
-- only by actor_pid; this presumably relies on the engine returning the
-- director of the row that holds the maximum — confirm for the target engine,
-- and note that ties between directors are resolved arbitrarily.
-- NOTE(review): director_pid (md.pid, untrimmed) is compared with a trimmed
-- pid — confirm m_director.pid carries no padding.
SELECT p.NAME,
max_movies
FROM person p
INNER JOIN (SELECT director_pid,
Max(movies) AS max_movies,
actor_pid
FROM (SELECT Count(mc.mid) AS Movies,
md.pid AS Director_PID,
mc.pid AS Actor_PID
FROM m_director md
INNER JOIN (SELECT *
FROM m_cast
WHERE Trim(pid) IN (SELECT Trim(pid)
FROM m_cast
WHERE Trim(mid) IN (SELECT Trim(mid)
FROM m_director
WHERE Trim(pid) = (SELECT Trim(pid)
FROM person
WHERE NAME LIKE '%Yash Chopra%'
)
)
)
)as mc
ON Trim(md.mid) = Trim(mc.mid)
GROUP BY actor_pid, director_pid
)
GROUP BY actor_pid
)
ON Trim(p.pid) = Trim(actor_pid)
WHERE director_pid = (SELECT Trim(pid)
FROM person
WHERE NAME LIKE '%Yash Chopra%'
)
AND Trim(p.pid) <> (SELECT Trim(pid)
FROM person
WHERE NAME LIKE '%Yash Chopra%'
)
ORDER BY max_movies DESC
Output will be as below (227 records)
Name max_movies
Jagdish Raj 11
Manmohan Krishna 10
Iftekhar 9
Shashi Kapoor 7
Waheeda Rehman 5
... ... ...
-- Query 1: per actor, `tc` is half the number of distinct directors the actor
-- worked with, and `yash_director_count` is the number of distinct movies the
-- actor made with a director whose name contains 'Yash Chopra' (0 if none).
-- Rows are kept when yash_director_count exceeds tc.
-- NOTE(review): this compares a movie count with half a director count; it is
-- not evident that this implements "more movies than with any other
-- director" — confirm the intent.
-- NOTE(review): the WHERE clause refers to select-list aliases, which not every
-- engine accepts — confirm the target engine.
Select tadc.actorID as actorID, tadc.actorName as actorName, (tadc.total_director_count*1.0) /2 as
tc,
ifnull((select count(Distinct mc2.MID) from
Person P3 join M_Cast mc2 on P3.PID=mc2.PID
join M_Director md2 on md2.MID=mc2.MID
join Person P4 on P4.PID=md2.PID
where P4.Name like '%Yash Chopra%'
and P3.PID = tadc.actorID
group by P3.PID, P4.PID), 0
) as yash_director_count
from (
select adc.actorID, adc.actorName, count(DISTINCT adc.directorID) as total_director_count
from
(select P1.PID as actorID, P1.Name as actorName, P2.PID as directorID, P2.Name as directorName, count(Distinct mc1.MID) as movie_count
from
Person P1 join M_Cast mc1 on P1.PID=mc1.PID
join M_Director md1 on md1.MID=mc1.MID
join Person P2 on P2.PID=md1.PID
group by P1.PID, P2.PID
order by P1.Name, P2.Name) as adc
group by adc.actorID
order by total_director_count ASC
) as tadc
where yash_director_count > tc
-- Query 2 (quoted string): movies per actor with directors named 'Yash%'.
-- Fixed: alias `mb` -> `md`, alias `a` -> `p1`, and the cast-to-movie join now
-- matches on m2.mid instead of m2.title.
"select Director, Actor, Count(*) as Movies_with_YashChopra from
(select p1.name as Director, m1.title as Movie from Person p1 Inner Join M_Director md on TRIM(md.pid)=p1.pid
Inner Join Movie m1 on TRIM(md.mid)=m1.mid and p1.name LIKE 'Yash%' Group By p1.name, m1.title) t1 Inner Join
(select p2.name as Actor,m2.title as Movie from Person p2 Inner Join M_Cast mc on TRIM(mc.pid)=p2.pid
Inner Join Movie m2 on TRIM(mc.mid)=m2.mid Group By p2.name,m2.title)t2
on t1.Movie=t2.Movie Group By t1.Director, t2.Actor Order By Movies_with_YashChopra DESC"
I think it should be like this, counting only the movies made with Yash Chopra.

Remove duplicate rows from answer of below query

**List all directors who directed 5000 movies or more, in descending order of the number of movies they directed
The use of DISTINCT before d.name does not help.
# Directors (grouped by Pid and name) having at least 10 rows in M_DIRECTOR,
# ordered by that row count, largest first.
# The SQL spans several lines, so it must be a triple-quoted string literal.
result = pd.read_sql_query("""SELECT d.name,count(*) as num
FROM PERSON d, M_DIRECTOR md
WHERE d.Pid = md.Pid
GROUP BY d.Pid,d.name
HAVING COUNT(*) >= 10
order by count(*) desc
""",conn)
You must use proper explicit joins between the tables and count on distinct movies:
-- Directors with at least 10 distinct movies present in the movie table,
-- most prolific first.
SELECT
    p.name,
    COUNT(DISTINCT d.mid) AS num
FROM m_director AS d
INNER JOIN person AS p
    ON p.pid = d.pid
INNER JOIN movie AS m
    ON m.mid = d.mid
GROUP BY
    p.pid,
    p.name
HAVING COUNT(DISTINCT d.mid) >= 10
ORDER BY COUNT(DISTINCT d.mid) DESC
You probably have duplicate records in the Person table - people with the same name but different ids. Try grouping by name only, not by id:
# Directors grouped by name only, so that Person rows sharing a name are
# counted together; keeps names with at least 10 M_DIRECTOR rows.
# The SQL spans several lines, so it must be a triple-quoted string literal.
result = pd.read_sql_query("""SELECT d.name,count(*) as num
FROM PERSON d, M_DIRECTOR md
WHERE d.Pid = md.Pid
GROUP BY d.name
HAVING COUNT(*) >= 10
order by count(*) desc
""",conn)

Shorten a query

I have to write a query that would calculate number of tickets purchased consisting only of movie genre of that type. At the end, I have to return movie genre and number of tickets bought for that genre. I have written a query but I was wondering if it can be made shorter and more compact?
Following is the database scheme:
movies(movieId, movieGenre, moviePrice)
tickets(ticketId, ticketDate, customerId)
details(ticketId, movieId, numOfTickets)
Here is my query:
-- Innermost query: distinct (genre, ticketId) pairs.
-- Next level: ticketIds that occur with exactly one genre.
-- Next level: those ticketIds paired with their genre.
-- Outer query: number of such tickets per genre.
-- NOTE(review): the query refers to m.genre, while the schema given for the
-- movies table lists the column as movieGenre — confirm the column name.
select m.genre, count(*)
from(select t.ticketId, m.genre
from(select d.ticketId
from(select m.genre, t.ticketId
from tickets t join details d on t.ticketId =
d.ticketId join movies m on d.movieId = m.movieId
group by m.genre, t.ticketId) d
group by d.ticketId
having count(*) = 1) as t join details d on t.ticketId =
d.ticketId join movies m on d.movieId = m.movieId
group by t.ticketId, m.genre) m
group by m.genre;
This runs on a database so I am only able to post sample output:
comedy 29821
action 27857
rom-com 19663
I see no reason to use the table tickets, because the results do not filter or aggregate by ticketDate or customerID. Thus, a shorter sql is
-- Total numOfTickets per movie genre, keeping only genres with a positive sum.
SELECT
    m.moviegenre,
    SUM(d.numoftickets) AS SumNum
FROM details AS d
LEFT JOIN movies AS m
    ON m.movieid = d.movieid
GROUP BY m.moviegenre
HAVING SUM(d.numoftickets) > 0
ORDER BY m.moviegenre
added 3/28 am
I am not sure what is meant by Duplicates?? In table = details(ticketId, movieId, numOfTickets) ??
I would expect that ticketId is unique, so what would explain duplicates?
Is the same ticketId being printed twice, repeatedly??
Determine what number of ticketId are duplicates--
-- ticketIds that occur on more than one details row, with their row count.
SELECT
    d.ticketId,
    COUNT(*) AS cnt
FROM details AS d
GROUP BY d.ticketId
HAVING COUNT(*) > 1
Determine what number of "details" rows are duplicates--
-- details rows that are repeated in full, with the number of repetitions.
SELECT
    d.ticketId,
    d.movieId,
    d.numOfTickets,
    COUNT(*) AS cnt
FROM details AS d
GROUP BY
    d.ticketId,
    d.movieId,
    d.numOfTickets
HAVING COUNT(*) > 1
Then again, it may be that table = movies(movieId, movieGenre, moviePrice) is the one with duplicates??
Determine what number of movieId are duplicates--
-- movieIds that occur on more than one movies row, with their row count.
SELECT
    m.movieId,
    COUNT(*) AS cnt
FROM movies AS m
GROUP BY m.movieId
HAVING COUNT(*) > 1
Remove duplicates from details--
-- Total numOfTickets per movie genre, computed after collapsing fully
-- identical details rows into one.
WITH unique_details AS (
    SELECT DISTINCT *
    FROM details
)
SELECT
    m.moviegenre,
    SUM(d.numoftickets) AS SumNum
FROM unique_details AS d
LEFT JOIN movies AS m
    ON m.movieid = d.movieid
GROUP BY m.moviegenre
ORDER BY m.moviegenre

Return 1 result per left join

Currently I am performing a left join on two tables. The first table has an id and a persons name, the second table has an id, the id of a person from table 1, and then a timestamp (of a flight).
People Flights
id | name id | person_id | time
------------ ---------------------------
1 Dave 1 1 1284762115
2 Becky 2 1 1284787352
3 2 1284772629
4 2 1286432934
5 1 1283239480
When I perform my left join, I get a list of people and their flight times, but what I would like is just the list of people with the flight time with the highest ID
I have been using
-- Per person, the largest flight time (NULL when the person has no flights).
-- This is the greatest timestamp, not the time of the highest flight id.
-- A comma is required between p.name and the aggregate.
SELECT p.id, p.name, max(f.time)
FROM People p
LEFT JOIN Flights f ON p.id = f.person_id
GROUP BY p.id, p.name
However, this just gives me the LAST flight time, rather than the last flight time uploaded into the system (ie, highest ID).
1 Dave 1284787352
2 Becky 1286432934
So to reiterate, I would like to see the name of the person, along with the flight time of their last UPLOADED (highest ID) flight time.
1 Dave 1283239480
2 Becky 1286432934
Use:
-- For every person with flights: the time of the flight with the highest id.
WITH latest_upload AS (
    -- highest flight id per person
    SELECT
        person_id,
        MAX(id) AS max_id
    FROM FLIGHTS
    GROUP BY person_id
)
SELECT
    p.id,
    p.name,
    f.time
FROM PEOPLE AS p
INNER JOIN FLIGHTS AS f
    ON f.person_id = p.id
INNER JOIN latest_upload AS x
    ON x.person_id = f.person_id
    AND x.max_id = f.id
If you are using a database that supports analytics:
-- For every person with flights: the time of the flight with the highest id,
-- found by numbering each person's flights from highest id downwards.
WITH ranked_flights AS (
    SELECT
        person_id,
        time,
        ROW_NUMBER() OVER (PARTITION BY person_id ORDER BY id DESC) AS rk
    FROM FLIGHTS
)
SELECT
    p.id,
    p.name,
    x.time
FROM PEOPLE AS p
INNER JOIN ranked_flights AS x
    ON x.person_id = p.id
WHERE x.rk = 1
If you want people, including those without flights:
-- Every person, with the time of their highest-id flight; people without any
-- flight are returned with a NULL time.
SELECT p.id,
p.name,
f.time
FROM PEOPLE p
-- Both joins must be outer joins: an inner join on the flight columns would
-- discard the people who have no flights.
LEFT JOIN (SELECT f.person_id,
MAX(f.id) AS max_id
FROM FLIGHTS f
GROUP BY f.person_id) x ON x.person_id = p.id
LEFT JOIN FLIGHTS f ON f.id = x.max_id
AND f.person_id = x.person_id
...and the analytic version:
-- Every person, with the time of their highest-id flight (NULL if none).
WITH ranked_flights AS (
    SELECT
        person_id,
        time,
        ROW_NUMBER() OVER (PARTITION BY person_id ORDER BY id DESC) AS rk
    FROM FLIGHTS
)
SELECT
    p.id,
    p.name,
    x.time
FROM PEOPLE AS p
-- rk = 1 stays in the ON clause so that people without flights are kept
LEFT JOIN ranked_flights AS x
    ON x.person_id = p.id
    AND x.rk = 1
I think you are looking for something like the query below: group by person_id and select the max id, then use that list to select from the flights. This is my first thought; there may be a more efficient way.
EDITED:
-- One row per person with the time of their highest-id flight
-- (NULL time when the person has no flights).
SELECT p.id, p.name, f.time
FROM People p
LEFT JOIN Flights f ON p.id = f.person_id
-- The max-id filter belongs to the join condition: placed in WHERE it would
-- remove the people without flights. No aggregate is needed in the select
-- list because at most one flight per person survives the filter.
AND f.id IN (SELECT MAX(id) FROM Flights GROUP BY person_id)