How to optimize this query with large data - SQL

SELECT ticket_type, f_rows.remaining_uses, t.source, count(t.id) AS total
FROM (
    -- Filter rows to get those where remaining_uses > 0 and status = 1
    SELECT * FROM (
        -- Get the latest row for each ticket
        SELECT ticket_id, final_remaining_uses AS remaining_uses, final_status AS status, action_when
        FROM TicketHistory th
        INNER JOIN (SELECT max(id) AS id FROM TicketHistory GROUP BY ticket_id) maxid ON th.id = maxid.id
    ) latest_rows
    WHERE remaining_uses > 0 AND status = 1 --and (action_when < current_date and action_when > current_date - 30)
) f_rows
INNER JOIN Ticket t ON f_rows.ticket_id = t.id
WHERE t.expiry_date >= current_date - 1
  AND t.source IN (0,1,2,6,7,8)
  AND (t.created_date < current_date AND t.created_date > current_date - 30)
GROUP BY ticket_type, f_rows.remaining_uses, t.source
ORDER BY source, ticket_type, remaining_uses;
What I'm doing here is getting the latest row for each ticket from the history table,
then keeping only the tickets that are still active (status = 1) and still have usage left (remaining_uses > 0),
then filtering the data by expiry date and the other checks.
Is there a way to optimize this query? Currently it takes a very long time, and PostgreSQL crashes before returning any data.
Both the ticket and ticket history tables have more than 11M rows each.
EDIT
CREATE TABLE ticket
(
    id serial NOT NULL,
    source integer NOT NULL,
    status integer NOT NULL,
    ticket_type integer NOT NULL,
    remaining_uses integer NOT NULL,
    expiry_date timestamp with time zone NOT NULL,
    price numeric(20,2) NOT NULL,
    created_date timestamp with time zone NOT NULL,
    pax_type integer NOT NULL,
    last_updated timestamp with time zone NOT NULL,
    service integer,
    client_id character varying(50),
    CONSTRAINT skybus_ticket_pkey PRIMARY KEY (id),
    CONSTRAINT skybus_ticket_sale_id_fkey FOREIGN KEY (sale_id)
        REFERENCES skybus_sale (id) MATCH SIMPLE
        ON UPDATE NO ACTION ON DELETE NO ACTION DEFERRABLE INITIALLY DEFERRED
)
WITH (
    OIDS=FALSE
);
ALTER TABLE ticket
    OWNER TO umd;
-- Index: ticket_client_id_idx
-- DROP INDEX ticket_client_id_idx;
CREATE INDEX ticket_client_id_idx
ON ticket
USING btree
(client_id COLLATE pg_catalog."default");
-- Index: ticket_profile_id_idx
-- DROP INDEX ticket_profile_id_idx;
CREATE INDEX ticket_profile_id_idx
ON ticket
USING btree
(profile_id);
-- Index: ticket_sale_id
-- DROP INDEX ticket_sale_id;
CREATE INDEX skybus_ticket_sale_id
ON ticket
USING btree
(sale_id);
-- Index: ticket_ticket_number
-- DROP INDEX ticket_ticket_number;
CREATE INDEX ticket_ticket_number
ON ticket
USING btree
(ticket_number COLLATE pg_catalog."default");
-- Index: ticket_ticket_number_like
-- DROP INDEX ticket_ticket_number_like;
CREATE INDEX ticket_ticket_number_like
ON ticket
USING btree
(ticket_number COLLATE pg_catalog."default" varchar_pattern_ops);
-- Index: ticket_topup_for_idx
-- DROP INDEX ticket_topup_for_idx;
CREATE INDEX ticket_topup_for_idx
ON ticket
USING btree
(topup_for COLLATE pg_catalog."default");
--===============================
CREATE TABLE tickethistory
(
    id serial NOT NULL,
    ticket_id integer,
    action integer NOT NULL,
    action_result integer NOT NULL,
    initial_status integer NOT NULL,
    final_status integer NOT NULL,
    final_remaining_uses integer NOT NULL,
    ticket_type integer NOT NULL,
    action_when timestamp with time zone NOT NULL,
    last_updated timestamp with time zone NOT NULL,
    service integer,
    CONSTRAINT tickethistory_pkey PRIMARY KEY (id),
    CONSTRAINT tickethistory_ticket_id_fkey FOREIGN KEY (ticket_id)
        REFERENCES ticket (id) MATCH SIMPLE
        ON UPDATE NO ACTION ON DELETE NO ACTION DEFERRABLE INITIALLY DEFERRED
)
WITH (
    OIDS=FALSE
);
ALTER TABLE tickethistory
    OWNER TO umd;
-- Index: tickethistory_ticket_id
-- DROP INDEX tickethistory_ticket_id;
CREATE INDEX tickethistory_ticket_id
ON tickethistory
USING btree
(ticket_id);
--===== Execution plan -- this is with the row_number() change
"HashAggregate (cost=4526158.63..4526158.64 rows=1 width=16) (actual time=382849.323..382849.376 rows=41 loops=1)"
" -> Nested Loop (cost=3880592.94..4526158.62 rows=1 width=16) (actual time=380338.613..382825.688 rows=11745 loops=1)"
" -> Subquery Scan on sub (cost=3880592.94..4463424.47 rows=6563 width=8) (actual time=126346.043..258837.523 rows=293717 loops=1)"
" Filter: ((sub.remaining_uses > 0) AND (sub.rn = 1) AND (sub.status = 1))"
" Rows Removed by Filter: 15244064"
" -> WindowAgg (cost=3880592.94..4191436.42 rows=15542174 width=203) (actual time=126345.775..237172.180 rows=15537781 loops=1)"
" -> Sort (cost=3880592.94..3919448.38 rows=15542174 width=203) (actual time=126345.757..180461.191 rows=15537781 loops=1)"
" Sort Key: th.ticket_id, th.*"
" Sort Method: external merge Disk: 3050616kB"
" -> Seq Scan on skybus_tickethistory th (cost=0.00..483544.74 rows=15542174 width=203) (actual time=14.091..53312.782 rows=15537781 loops=1)"
" -> Index Scan using skybus_ticket_pkey on skybus_ticket t (cost=0.00..9.55 rows=1 width=12) (actual time=0.418..0.418 rows=0 loops=293717)"
" Index Cond: (id = sub.ticket_id)"
" Filter: ((source = ANY ('{0,1,2,6,7,8}'::integer[])) AND (created_date < ('now'::cstring)::date) AND (expiry_date >= (('now'::cstring)::date - 1)) AND (created_date > (('now'::cstring)::date - 30)) AND (ticket_type = ANY ('{2,3,4,5,6,7,16,17, (...)"
" Rows Removed by Filter: 1"
"Total runtime: 383045.381 ms"

You could use row_number() to get the latest row for each ticket in a single pass:
with last_history as
(
    select *
    from (
        select row_number() over (partition by ticket_id
                                  order by id desc) as rn,
               *
        from TicketHistory
    ) sub
    where rn = 1 -- latest history row only
)
select *
from ticket t
join last_history lh
  on t.id = lh.ticket_id
where lh.final_remaining_uses > 0
  and <... other conditions ...>

distinct on () is typically the fastest way of solving greatest-n-per-group problems in Postgres:
select ticket_type, f_rows.remaining_uses, t.source, count(t.id) as total
FROM (
    -- Filter rows to get those where remaining_uses > 0 and status = 1
    SELECT *
    FROM (
        -- Get the latest row for each ticket
        SELECT distinct on (ticket_id)
               ticket_id,
               final_remaining_uses AS remaining_uses,
               final_status AS status,
               action_when
        FROM TicketHistory th
        order by ticket_id, id desc
    ) latest_rows
    WHERE remaining_uses > 0
      AND status = 1 --and (action_when < current_date and action_when > current_date - 30)
) f_rows
JOIN Ticket t ON f_rows.ticket_id = t.id
WHERE t.expiry_date >= current_date - 1
  and t.source in (0,1,2,6,7,8)
  and (t.created_date < current_date and t.created_date > current_date - 30)
GROUP BY ticket_type, f_rows.remaining_uses, t.source
order by source, ticket_type, remaining_uses;
distinct on() together with the order by returns the row with the highest value of tickethistory.id for each ticket_id.
An index on tickethistory (ticket_id, id desc) would probably help. Maybe even one on tickethistory (ticket_id, id desc, final_remaining_uses, final_status, action_when) to enable an index-only scan.
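For example, sketches of those two indexes (the names are illustrative choices, not from the original answer):
CREATE INDEX tickethistory_ticket_id_id_idx
    ON tickethistory (ticket_id, id DESC);
-- covering variant, to allow an index-only scan:
CREATE INDEX tickethistory_latest_covering_idx
    ON tickethistory (ticket_id, id DESC, final_remaining_uses, final_status, action_when);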
However, a timestamp column that stores the moment of creation might be more accurate: if tickethistory.id is generated from a sequence (because it's a serial), those values might not reflect the actual order of insertion. A variant ordered by action_when is sketched below.
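A minimal sketch of that variant, assuming action_when is the trustworthy ordering (id desc kept only as a tie-breaker):
SELECT DISTINCT ON (ticket_id)
       ticket_id,
       final_remaining_uses AS remaining_uses,
       final_status AS status,
       action_when
FROM TicketHistory
ORDER BY ticket_id, action_when DESC, id DESC;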

Related

How can I optimize Postgresql ARRAY_AGG queries for large tables?

I am using PostgreSQL for its array functionality. Here's my schema:
CREATE TABLE questions (
id INTEGER PRIMARY KEY,
product_id INTEGER UNIQUE NOT NULL,
body VARCHAR(1000) NOT NULL,
date_written DATE NOT NULL DEFAULT current_date,
asker_name VARCHAR(60) NOT NULL,
asker_email VARCHAR(60) NOT NULL,
reported BOOLEAN DEFAULT FALSE,
helpful INTEGER NOT NULL DEFAULT 0
);
CREATE TABLE answers (
id INTEGER PRIMARY KEY NOT NULL,
question_id INTEGER NOT NULL,
body VARCHAR(1000) NOT NULL,
date_written DATE NOT NULL DEFAULT current_date,
answerer_name VARCHAR(60) NOT NULL,
answerer_email VARCHAR(60) NOT NULL,
reported BOOLEAN DEFAULT FALSE,
helpful INTEGER NOT NULL DEFAULT 0
);
CREATE TABLE photos (
id INTEGER UNIQUE,
answer_id INTEGER NOT NULL,
photo VARCHAR(200)
);
I am trying to query my answers table to get a list of all the answers for a given question id and include an array of all photos that exist for that given answer_id. The results should be sorted in descending order of helpfulness. So far, I have a massive query that displays the results I'm looking for, but the execution time is 729.595 ms. I am trying to optimize to get the query's time down to 200 ms. I have the following indexes to try and optimize my query times:
indexname | indexdef
-----------------+---------------------------------------------------------------------------
answer_id | CREATE UNIQUE INDEX answer_id ON public.answers USING btree (id)
question_id | CREATE INDEX question_id ON public.answers USING btree (question_id)
idx_reported_id | CREATE INDEX idx_reported_id ON public.answers USING btree (reported, id)
answers_pkey | CREATE UNIQUE INDEX answers_pkey ON public.answers USING btree (id)
indexname | indexdef
----------------+----------------------------------------------------------------------------
id | CREATE UNIQUE INDEX id ON public.questions USING btree (id)
idx_q_reported | CREATE INDEX idx_q_reported ON public.questions USING btree (id, reported)
questions_pkey | CREATE UNIQUE INDEX questions_pkey ON public.questions USING btree (id)
indexname | indexdef
---------------+---------------------------------------------------------------------
photos_id_key | CREATE UNIQUE INDEX photos_id_key ON public.photos USING btree (id)
p_links | CREATE INDEX p_links ON public.photos USING btree (photo)
In my analysis, I noticed that the GroupAggregate is time-consuming: GroupAggregate (cost=126222.21..126222.71 rows=25 width=129) (actual time=729.497..729.506 rows=5 loops=1)
Group Key: answers.id
Is there a way I can avoid the time-consuming GROUP BY? Am I missing something with the indexes? Here is the query itself:
SELECT answers.id,
question_id,
body,
date_written,
answerer_name,
answerer_email,
reported,
helpful,
ARRAY_AGG(photo) as photos
FROM answers
LEFT JOIN photos ON answers.id = photos.answer_id
WHERE reported IS false
  AND answers.id IN (SELECT id
                     FROM answers
                     WHERE question_id = 20012)
GROUP BY answers.id
ORDER BY helpful DESC;
Thanks!
I think you can skip the subquery:
SELECT answers.id, question_id, body, date_written, answerer_name, answerer_email, reported, helpful, ARRAY_AGG(photo) as photos
FROM answers
LEFT JOIN photos ON answers.id = photos.answer_id
WHERE reported IS false AND question_id = 20012
GROUP BY answers.id, question_id, body, date_written, answerer_name, answerer_email, reported, helpful
ORDER BY helpful DESC;
You were missing some fields in the GROUP BY clause.
You can also add a btree index on photos.answer_id, because this field is used in the join clause.
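A sketch of that index (the name is an illustrative choice):
CREATE INDEX photos_answer_id_idx ON photos USING btree (answer_id);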
One way that often works is to aggregate first, then join on the result (rather than aggregating the full result). And you don't really need the IN condition either:
SELECT a.id,
a.question_id,
a.body,
a.date_written,
a.answerer_name,
a.answerer_email,
a.reported,
a.helpful,
p.photos
FROM answers a
LEFT JOIN (
select answer_id, array_agg(photo) as photos
from photos
group by answer_id
) p ON a.id = p.answer_id
WHERE reported IS false
AND a.question_id = 20012
ORDER BY a.helpful DESC;
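If you go this route, a covering index could let the aggregate subquery be computed from the index alone. This is an assumption on my part rather than something from the answer above, and the name is illustrative:
-- (answer_id, photo) covers the aggregate subquery, enabling an index-only scan:
CREATE INDEX photos_answer_id_photo_idx ON photos USING btree (answer_id, photo);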

Missing table access in PostgreSQL query plan

I have two identical tables, one with 10k rows and the other with 1M rows. I use the following script to populate them.
CREATE TABLE Table1 (
id int NOT NULL primary key,
groupby int NOT NULL,
orderby int NOT NULL,
local_search int NOT NULL,
global_search int NOT NULL,
padding varchar(100) NOT NULL
);
CREATE TABLE Table2 (
id int NOT NULL primary key,
groupby int NOT NULL,
orderby int NOT NULL,
local_search int NOT NULL,
global_search int NOT NULL,
padding varchar(100) NOT NULL
);
INSERT
INTO Table1
WITH t1 AS
(
SELECT id
FROM generate_series(1, 10000) id
), t2 AS
(
SELECT id,
id % 100 groupby
FROM t1
), t3 AS
(
SELECT b.id, b.groupby, row_number() over (partition by groupby order by id) orderby
FROM t2 b
)
SELECT id,
groupby,
orderby,
orderby % 50 local_search,
id % 1000 global_search,
RPAD('Value ' || id || ' ' , 100, '*') as padding
FROM t3;
INSERT
INTO Table2
WITH t1 AS
(
SELECT id
FROM generate_series(1, 1000000) id
), t2 AS
(
SELECT id,
id % 100 groupby
FROM t1
), t3 AS
(
SELECT b.id, b.groupby, row_number() over (partition by groupby order by id) orderby
FROM t2 b
)
SELECT id,
groupby,
orderby,
orderby % 50 local_search,
id % 1000 global_search,
RPAD('Value ' || id || ' ' , 100, '*') as padding
FROM t3;
I created also secondary index on table2
CREATE INDEX ix_Table2_groupby_orderby ON Table2 (groupby, orderby);
Now, I have the following query
select b.id, b.groupby, b.orderby, b.local_search, b.global_search, b.padding
from Table2 b
join Table1 a on b.orderby = a.id
where a.global_search = 1 and b.groupby < 10;
which leads to the following query plan using explain(analyze)
"Nested Loop (cost=0.42..17787.05 rows=100 width=121) (actual time=0.056..34.722 rows=100 loops=1)"
" -> Seq Scan on table1 a (cost=0.00..318.00 rows=10 width=4) (actual time=0.033..1.313 rows=10 loops=1)"
" Filter: (global_search = 1)"
" Rows Removed by Filter: 9990"
" -> Index Scan using ix_table2_groupby_orderby on table2 b (cost=0.42..1746.81 rows=10 width=121) (actual time=0.159..3.337 rows=10 loops=10)"
" Index Cond: ((groupby < 10) AND (orderby = a.id))"
"Planning time: 0.296 ms"
"Execution time: 34.775 ms"
and my question is: how come table2 itself is not accessed in the query plan? It uses just ix_table2_groupby_orderby, which contains only the groupby, orderby and maybe id columns. How does it get the remaining columns of Table2, and why is that access not in the query plan?
EDIT
I have tried explain(verbose) as suggested by @laurenzalbe. This is the result:
"Nested Loop (cost=0.42..17787.05 rows=100 width=121) (actual time=0.070..35.678 rows=100 loops=1)"
" Output: b.id, b.groupby, b.orderby, b.local_search, b.global_search, b.padding"
" -> Seq Scan on public.table1 a (cost=0.00..318.00 rows=10 width=4) (actual time=0.031..1.642 rows=10 loops=1)"
" Output: a.id, a.groupby, a.orderby, a.local_search, a.global_search, a.padding"
" Filter: (a.global_search = 1)"
" Rows Removed by Filter: 9990"
" -> Index Scan using ix_table2_groupby_orderby on public.table2 b (cost=0.42..1746.81 rows=10 width=121) (actual time=0.159..3.398 rows=10 loops=10)"
" Output: b.id, b.groupby, b.orderby, b.local_search, b.global_search, b.padding"
" Index Cond: ((b.groupby < 10) AND (b.orderby = a.id))"
"Planning time: 16.201 ms"
"Execution time: 35.754 ms"
Actually, I do not fully understand why the heap access for table2 does not show up, but I accept this as an answer.
An index scan in PostgreSQL accesses not only the index, but also the table. This is not explicitly shown in the execution plan and is necessary to find out if a row is visible to the transaction or not.
Try EXPLAIN (VERBOSE) to see what columns are returned.
See the documentation for details:
All indexes in PostgreSQL are secondary indexes, meaning that each index is stored separately from the table's main data area (which is called the table's heap in PostgreSQL terminology). This means that in an ordinary index scan, each row retrieval requires fetching data from both the index and the heap.
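A quick way to see the difference, assuming the tables from the question (a sketch; whether you actually get an Index Only Scan depends on the visibility map, hence the VACUUM first):
VACUUM ANALYZE table2;
-- all requested columns are in ix_table2_groupby_orderby, so this can use an Index Only Scan:
EXPLAIN (ANALYZE) SELECT groupby, orderby FROM table2 WHERE groupby < 10 AND orderby = 42;
-- asking for padding as well forces heap fetches, giving a plain Index Scan:
EXPLAIN (ANALYZE) SELECT groupby, orderby, padding FROM table2 WHERE groupby < 10 AND orderby = 42;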

Acquiring row level locks in a order

I have a table where I am updating multiple rows inside a transaction.
DROP SCHEMA IF EXISTS s CASCADE;
CREATE SCHEMA s;
CREATE TABLE s.t1 (
"id1" Bigint,
"id2" Bigint,
CONSTRAINT "pk1" PRIMARY KEY (id1)
)
WITH(OIDS=FALSE);
INSERT INTO s.t1( id1, id2 )
SELECT x, x * 100
FROM generate_series( 1,10 ) x;
END TRANSACTION;
BEGIN TRANSACTION;
SELECT id1 FROM s.t1 WHERE id1 > 3 and id1 < 6 ORDER BY id1 FOR UPDATE; /* row lock */
I am assuming this will take row-level locks in id1 order.
Is my assumption correct?
If so, I will be able to run multiple transactions without ever worrying about deadlocks due to the order of locks on rows.
END TRANSACTION;
BEGIN TRANSACTION;
SELECT id1,id2 FROM s.t1 order by id1;
DROP SCHEMA s CASCADE;
I did an EXPLAIN:
EXPLAIN SELECT id1 FROM s.t1 WHERE id1 > 3 and id1 < 6 ORDER BY id1 FOR UPDATE;
QUERY PLAN
------------------------------------------------------------------------------
LockRows (cost=15.05..15.16 rows=9 width=14)
-> Sort (cost=15.05..15.07 rows=9 width=14)
Sort Key: id1
-> Bitmap Heap Scan on t1 (cost=4.34..14.91 rows=9 width=14)
Recheck Cond: ((id1 > 3) AND (id1 < 6))
-> Bitmap Index Scan on pk1 (cost=0.00..4.34 rows=9 width=0)
Index Cond: ((id1 > 3) AND (id1 < 6))
(7 rows)
Answer: This is correct. In the plan, the LockRows node sits above the Sort node, so rows are locked only after they have been sorted by id1.
Thanks
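As a sanity check, a sketch of two concurrent sessions against the table above; because both lock rows in id1 order, the second session blocks on the first contested row instead of deadlocking:
-- session 1:
BEGIN;
SELECT id1 FROM s.t1 WHERE id1 > 3 AND id1 < 6 ORDER BY id1 FOR UPDATE;
-- session 2, run concurrently: blocks on the row with id1 = 4 until session 1 finishes
BEGIN;
SELECT id1 FROM s.t1 WHERE id1 > 3 AND id1 < 6 ORDER BY id1 FOR UPDATE;
-- session 1:
COMMIT; -- session 2 now acquires its locks and proceeds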

Would a partial index be used on a query?

Given this partial index:
CREATE INDEX orders_id_created_at_index
ON orders(id) WHERE created_at < '2013-12-31';
Would this query use the index?
SELECT *
FROM orders
WHERE id = 123 AND created_at = '2013-10-12';
As per the documentation, "a partial index can be used in a query only if the system can recognize that the WHERE condition of the query mathematically implies the predicate of the index".
Does that mean that the index will or will not be used?
You can check, and yes, it would be used. I've created an SQL Fiddle to check it with a setup like this:
create table orders(id int, created_at date);
CREATE INDEX orders_id_created_at_index ON orders(id) WHERE created_at < '2013-12-31';
insert into orders
select
    (random()*500)::int, '2013-01-01'::date + ((random() * 200)::int || ' day')::interval
from generate_series(1, 10000) as g;
SELECT * FROM orders WHERE id = 123 AND created_at = '2013-10-12';
SELECT * FROM orders WHERE id = 123 AND created_at = '2014-10-12';
sql fiddle demo
If you check the execution plans for these queries, you'll see for the first query:
Bitmap Heap Scan on orders  (cost=4.39..40.06 rows=1 width=8)
  Recheck Cond: ((id = 123) AND (created_at < '2013-12-31'::date))
  Filter: (created_at = '2013-10-12'::date)
  ->  Bitmap Index Scan on orders_id_created_at_index  (cost=0.00..4.39 rows=19 width=0)
        Index Cond: (id = 123)
and for the second query:
Seq Scan on orders  (cost=0.00..195.00 rows=1 width=8)
  Filter: ((id = 123) AND (created_at = '2014-10-12'::date))
The second query cannot use the partial index because created_at = '2014-10-12' does not imply the index predicate created_at < '2013-12-31', so the planner falls back to a sequential scan.

How to efficiently select rows having a MIN date in postgres

I need to quickly select a value ( baz ) from the "earliest" ( MIN(save_date) ) rows grouped by an their foo_id. The following query returns the correct rows (well almost, it can return multiples for each foo_id when there are duplicate save_dates).
The foos table contains about 55k rows and the samples table contains about 25 million rows.
CREATE TABLE foos (
    foo_id int,
    val varchar(40),
    -- ref_id is a FK, constraint omitted for brevity
    ref_id int
);
CREATE TABLE samples (
    sample_id int,
    save_date date,
    baz smallint,
    -- foo_id is a FK, constraint omitted for brevity
    foo_id int
);
WITH foo ( foo_id, val ) AS (
    SELECT foo_id, val
    FROM foos
    WHERE foos.ref_id = 1
    ORDER BY foos.val ASC
    LIMIT 25 OFFSET 0
)
SELECT foo.val, firsts.baz
FROM foo
LEFT JOIN (
    SELECT A.baz, A.foo_id
    FROM samples A
    INNER JOIN (
        SELECT foo_id, MIN( save_date ) AS save_date
        FROM samples
        GROUP BY foo_id
    ) B USING ( foo_id, save_date )
) firsts USING ( foo_id );
This query currently takes over 100 seconds; I'd like to see this return in ~1 second (or less!).
How can I write this query to be optimal?
Updated; adding explains:
Obviously the actual query I'm using isn't using tables foo, baz, etc.
The "dumbed down" example query's (from above) explain:
Hash Right Join (cost=337.69..635.47 rows=3 width=100)
Hash Cond: (a.foo_id = foo.foo_id)
CTE foo
-> Limit (cost=71.52..71.53 rows=3 width=102)
-> Sort (cost=71.52..71.53 rows=3 width=102)
Sort Key: foos.val
-> Seq Scan on foos (cost=0.00..71.50 rows=3 width=102)
Filter: (ref_id = 1)
-> Hash Join (cost=265.25..562.90 rows=9 width=6)
Hash Cond: ((a.foo_id = samples.foo_id) AND (a.save_date = (min(samples.save_date))))
-> Seq Scan on samples a (cost=0.00..195.00 rows=1850 width=10)
-> Hash (cost=244.25..244.25 rows=200 width=8)
-> HashAggregate (cost=204.25..224.25 rows=200 width=8)
-> Seq Scan on samples (cost=0.00..195.00 rows=1850 width=8)
-> Hash (cost=0.60..0.60 rows=3 width=102)
-> CTE Scan on foo (cost=0.00..0.60 rows=3 width=102)
If I understand the question, you want windowing.
WITH find_first AS (
SELECT foo_id, baz,
row_number()
OVER (PARTITION BY foo_id ORDER BY foo_id, save_date) AS rnum
FROM samples
)
SELECT foo_id, baz FROM find_first WHERE rnum = 1;
Using row_number instead of rank eliminates duplicates and guarantees only one baz per foo. If you also need the foos that have no bazzes, just LEFT JOIN the foos table to this query.
With an index on (foo_id, save_date), the optimizer should be smart enough to do the grouping, keeping only one baz and skipping merrily along.
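A sketch of such an index (the name is an illustrative choice):
CREATE INDEX samples_foo_id_save_date_idx ON samples (foo_id, save_date);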
row_number() is a beautiful beast, but DISTINCT ON is simpler here.
WITH foo AS (
SELECT foo_id
FROM foos
WHERE ref_id = 1
ORDER BY val
LIMIT 25 OFFSET 0
)
SELECT DISTINCT ON (1) f.foo_id, s.baz
FROM foo f
LEFT JOIN samples s USING (foo_id)
ORDER BY f.foo_id, s.save_date, s.baz;
This is assuming you want exactly 1 row per foo_id. If there are multiple rows in samples sharing the same earliest save_date, baz serves as tie-breaker.
The case is very similar to this question from yesterday.
More advice:
Don't select val in the CTE, you only need it in ORDER BY.
To avoid expensive sequential scans on foos:
If you are always after rows from foos with ref_id = 1, create a partial multi-column index:
CREATE INDEX foos_val_part_idx ON foos (val)
WHERE ref_id = 1;
If ref_id is variable:
CREATE INDEX foos_ref_id_val_idx ON foos (ref_id, val);
The other index that would help best on samples:
CREATE INDEX samples_foo_id_save_date_baz_idx
ON samples (foo_id, save_date, baz);
These indexes become even more effective with the new "index-only scans" in version 9.2. Details and links here.