Using CTE and aggregate functions with UPDATE - sql

In PostgreSQL 10.6 I am trying to store results of 2 aggregate function calls into the avg_score and avg_time columns of the words_users table, but unfortunately get the syntax error:
WITH last_week_moves AS (
SELECT
m.gid,
m.uid,
m.played - LAG(m.played) OVER(PARTITION BY m.gid ORDER BY played) AS diff
FROM words_moves m
JOIN words_games g ON (m.gid = g.gid AND 5 IN (g.player1, g.player2))
WHERE m.played > CURRENT_TIMESTAMP - INTERVAL '1 week'
)
UPDATE words_users SET
avg_score = (SELECT ROUND(AVG(score), 1) FROM words_moves WHERE uid = 5),
avg_time = TO_CHAR(AVG(diff), 'HH24:MI')
FROM last_week_moves
WHERE uid = 5
GROUP BY uid;
(I am using a hardcoded uid = 5 in the above statement, but in real life the statement is wrapped inside a PL/pgSQL stored function and uses a parameter: uid = in_uid).
ERROR: 42601: syntax error at or near "GROUP"
LINE 15: GROUP BY uid
^
LOCATION: scanner_yyerror, scan.l:1128
The database seems to be unhappy with the GROUP BY, but I need it for AVG(diff), because the CTE delivers the times between moves always for 2 players in a game:
SELECT
m.gid,
m.uid,
m.played - LAG(m.played) OVER(PARTITION BY m.gid ORDER BY played) AS diff
FROM words_moves m
JOIN words_games g ON (m.gid = g.gid AND 5 IN (g.player1, g.player2))
WHERE m.played > CURRENT_TIMESTAMP - INTERVAL '1 week';
gid | uid | diff
-------+-------+-----------------------
50399 | 774 | ¤
50608 | 8977 | ¤
50608 | 5 | 00:39:48.121149
50608 | 8977 | 00:09:46.221235
50608 | 5 | 01:35:23.524209
50608 | 8977 | 09:26:40.794061
50697 | 5 | ¤
50697 | 10322 | 02:13:16.502079
50697 | 5 | 01:47:44.681788
50697 | 10322 | 00:01:31.597973
50697 | 5 | 12:11:24.54716
50697 | 10322 | 12:01:15.078243
50697 | 5 | 11:52:39.60056
50697 | 10322 | 00:11:30.491137
50697 | 5 | 00:14:53.612513
50697 | 10322 | 01:45:23.940957
...
52469 | 5 | 02:46:29.768655
52469 | 8550 | 01:16:45.169882
52469 | 5 | 08:38:00.691552
(59 rows)
Does anybody please know, how to change my UPDATE query?
Below are the 3 tables in question:
# \d words_users
Table "public.words_users"
Column | Type | Collation | Nullable | Default
---------------+--------------------------+-----------+----------+------------------------------------------
uid | integer | | not null | nextval('words_users_uid_seq'::regclass)
created | timestamp with time zone | | not null |
visited | timestamp with time zone | | not null |
ip | inet | | not null |
fcm | text | | |
apns | text | | |
adm | text | | |
motto | text | | |
vip_until | timestamp with time zone | | |
grand_until | timestamp with time zone | | |
banned_until | timestamp with time zone | | |
banned_reason | text | | |
elo | integer | | not null |
medals | integer | | not null |
coins | integer | | not null |
avg_score | double precision | | |
avg_time | text | | |
Indexes:
"words_users_pkey" PRIMARY KEY, btree (uid)
Check constraints:
"words_users_banned_reason_check" CHECK (length(banned_reason) > 0)
"words_users_elo_check" CHECK (elo >= 0)
"words_users_medals_check" CHECK (medals >= 0)
Referenced by:
TABLE "words_chat" CONSTRAINT "words_chat_uid_fkey" FOREIGN KEY (uid) REFERENCES words_users(uid) ON DELETE CASCADE
TABLE "words_games" CONSTRAINT "words_games_player1_fkey" FOREIGN KEY (player1) REFERENCES words_users(uid) ON DELETE CASCADE
TABLE "words_games" CONSTRAINT "words_games_player2_fkey" FOREIGN KEY (player2) REFERENCES words_users(uid) ON DELETE CASCADE
TABLE "words_moves" CONSTRAINT "words_moves_uid_fkey" FOREIGN KEY (uid) REFERENCES words_users(uid) ON DELETE CASCADE
TABLE "words_reviews" CONSTRAINT "words_reviews_author_fkey" FOREIGN KEY (author) REFERENCES words_users(uid) ON DELETE CASCADE
TABLE "words_reviews" CONSTRAINT "words_reviews_uid_fkey" FOREIGN KEY (uid) REFERENCES words_users(uid) ON DELETE CASCADE
TABLE "words_scores" CONSTRAINT "words_scores_uid_fkey" FOREIGN KEY (uid) REFERENCES words_users(uid) ON DELETE CASCADE
TABLE "words_social" CONSTRAINT "words_social_uid_fkey" FOREIGN KEY (uid) REFERENCES words_users(uid) ON DELETE CASCADE
TABLE "words_stats" CONSTRAINT "words_stats_uid_fkey" FOREIGN KEY (uid) REFERENCES words_users(uid) ON DELETE CASCADE
# \d words_moves
Table "public.words_moves"
Column | Type | Collation | Nullable | Default
---------+--------------------------+-----------+----------+------------------------------------------
mid | bigint | | not null | nextval('words_moves_mid_seq'::regclass)
action | text | | not null |
gid | integer | | not null |
uid | integer | | not null |
played | timestamp with time zone | | not null |
tiles | jsonb | | |
score | integer | | |
letters | text | | |
hand | text | | |
puzzle | boolean | | not null | false
Indexes:
"words_moves_pkey" PRIMARY KEY, btree (mid)
"words_moves_gid_played_idx" btree (gid, played DESC)
"words_moves_uid_idx" btree (uid)
Check constraints:
"words_moves_score_check" CHECK (score >= 0)
Foreign-key constraints:
"words_moves_gid_fkey" FOREIGN KEY (gid) REFERENCES words_games(gid) ON DELETE CASCADE
"words_moves_uid_fkey" FOREIGN KEY (uid) REFERENCES words_users(uid) ON DELETE CASCADE
Referenced by:
TABLE "words_scores" CONSTRAINT "words_scores_mid_fkey" FOREIGN KEY (mid) REFERENCES words_moves(mid) ON DELETE CASCADE
# \d words_games
Table "public.words_games"
Column | Type | Collation | Nullable | Default
----------+--------------------------+-----------+----------+------------------------------------------
gid | integer | | not null | nextval('words_games_gid_seq'::regclass)
created | timestamp with time zone | | not null |
finished | timestamp with time zone | | |
player1 | integer | | not null |
player2 | integer | | |
played1 | timestamp with time zone | | |
played2 | timestamp with time zone | | |
state1 | text | | |
state2 | text | | |
reason | text | | |
hint1 | text | | |
hint2 | text | | |
score1 | integer | | not null |
score2 | integer | | not null |
chat1 | integer | | not null |
chat2 | integer | | not null |
hand1 | character(1)[] | | not null |
hand2 | character(1)[] | | not null |
pile | character(1)[] | | not null |
letters | character(1)[] | | not null |
values | integer[] | | not null |
bid | integer | | not null |
friendly | boolean | | |
Indexes:
"words_games_pkey" PRIMARY KEY, btree (gid)
"words_games_player1_coalesce_idx" btree (player1, COALESCE(finished, 'infinity'::timestamp with time zone))
"words_games_player2_coalesce_idx" btree (player2, COALESCE(finished, 'infinity'::timestamp with time zone))
Check constraints:
"words_games_chat1_check" CHECK (chat1 >= 0)
"words_games_chat2_check" CHECK (chat2 >= 0)
"words_games_check" CHECK (player1 <> player2)
"words_games_score1_check" CHECK (score1 >= 0)
"words_games_score2_check" CHECK (score2 >= 0)
Foreign-key constraints:
"words_games_bid_fkey" FOREIGN KEY (bid) REFERENCES words_boards(bid) ON DELETE CASCADE
"words_games_player1_fkey" FOREIGN KEY (player1) REFERENCES words_users(uid) ON DELETE CASCADE
"words_games_player2_fkey" FOREIGN KEY (player2) REFERENCES words_users(uid) ON DELETE CASCADE
Referenced by:
TABLE "words_chat" CONSTRAINT "words_chat_gid_fkey" FOREIGN KEY (gid) REFERENCES words_games(gid) ON DELETE CASCADE
TABLE "words_moves" CONSTRAINT "words_moves_gid_fkey" FOREIGN KEY (gid) REFERENCES words_games(gid) ON DELETE CASCADE
TABLE "words_scores" CONSTRAINT "words_scores_gid_fkey" FOREIGN KEY (gid) REFERENCES words_games(gid) ON DELETE CASCADE
UPDATE:
As suggested by @lau, I have tried moving the AVG into the CTE, but I get another error:
WITH last_week_moves AS (
SELECT
m.gid,
m.uid,
TO_CHAR(AVG(m.played - LAG(m.played) OVER(PARTITION BY m.gid ORDER BY played)), 'HH24:MI') AS diff
FROM words_moves m
JOIN words_games g ON (m.gid = g.gid AND 5 IN (g.player1, g.player2))
WHERE m.played > CURRENT_TIMESTAMP - INTERVAL '1 week'
GROUP BY uid
)
UPDATE words_users SET
avg_score = (SELECT ROUND(AVG(score), 1) FROM words_moves WHERE uid = 5),
avg_time = diff
FROM last_week_moves
WHERE uid = 5;
ERROR: 42803: aggregate function calls cannot contain window function calls
LINE 5: TO_CHAR(AVG(m.played - LAG(m.played)...
^
LOCATION: check_agg_arguments_walker, parse_agg.c:728

You seem to want:
WITH last_week_moves AS (
SELECT m.uid,
(MAX(m.played) - MIN(m.played)) / NULLIF(COUNT(*) - 1, 0) as avg_diff,
AVG(score) as avg_score
FROM words_moves m JOIN
words_games g
ON m.gid = g.gid AND 5 IN (g.player1, g.player2)
WHERE m.played > CURRENT_TIMESTAMP - INTERVAL '1 week'
GROUP BY m.uid
)
UPDATE words_users wu
SET avg_score = lwm.avg_score,
avg_time = TO_CHAR(avg_diff, 'HH24:MI')
FROM last_week_moves lwm
WHERE wu.uid = lwm.uid AND
wu.uid = 5;
Note that this simplifies the diff calculation, so LAG() is not needed.
You can see that these are equivalent:
value diff
1
4 3
9 5
The average of diff is obviously 4. This is ((4 - 1) + (9 - 4)) / 2. You see that the "4"s cancel, so it is really (9 - 1) / 2. This observation generalizes.

Related

Selecting two columns gives duplicates in Postgres

I have the following structure:
ChatMessages table:
Column | Type | Collation | Nullable | Default
-----------+-----------------------------+-----------+----------+---------
Id | text | | not null |
SenderId | text | | |
Message | text | | |
ChatId | text | | |
CreatedAt | timestamp without time zone | | not null |
Indexes:
"PK_ChatMessages" PRIMARY KEY, btree ("Id")
"IX_ChatMessages_ChatId" btree ("ChatId")
"IX_ChatMessages_SenderId" btree ("SenderId")
Foreign-key constraints:
"FK_ChatMessages_ChatUsers_SenderId" FOREIGN KEY ("SenderId") REFERENCES "ChatUsers"("Id") ON DELETE RESTRICT
"FK_ChatMessages_Chats_ChatId" FOREIGN KEY ("ChatId") REFERENCES "Chats"("Id") ON DELETE RESTRICT
ChatUsers table:
Column | Type | Collation | Nullable | Default
-----------+-----------------------------+-----------+----------+---------
Id | text | | not null |
Username | text | | |
Email | text | | |
CreatedAt | timestamp without time zone | | not null |
Indexes:
"PK_ChatUsers" PRIMARY KEY, btree ("Id")
Referenced by:
TABLE ""ChatMessages"" CONSTRAINT "FK_ChatMessages_ChatUsers_SenderId" FOREIGN KEY ("SenderId") REFERENCES "ChatUsers"("Id") ON DELETE RESTRICT
TABLE ""Chats"" CONSTRAINT "FK_Chats_ChatUsers_User1Id" FOREIGN KEY ("User1Id") REFERENCES "ChatUsers"("Id") ON DELETE RESTRICT
TABLE ""Chats"" CONSTRAINT "FK_Chats_ChatUsers_User2Id" FOREIGN KEY ("User2Id") REFERENCES "ChatUsers"("Id") ON DELETE RESTRICT
If I execute select "SenderId", "Email", "Message" from "ChatMessages", "ChatUsers"; I receive a list with all messages, however each message is multiplied by the amount of users' email addresses:
SenderId | Email | Message
--------------------------------------+---------------------+-------------------------------------------------------
2f5aca90-0599-400c-8455-31a22c1d4dcd | user1@example.com | This is a test message.
2f5aca90-0599-400c-8455-31a22c1d4dcd | user1@example.com | This is also a test message.
b3e218e6-dd41-4223-978b-a54f46fc465e | user1@example.com | okay ...
2f5aca90-0599-400c-8455-31a22c1d4dcd | user2@example.com | This is a test message.
2f5aca90-0599-400c-8455-31a22c1d4dcd | user2@example.com | This is also a test message.
b3e218e6-dd41-4223-978b-a54f46fc465e | user2@example.com | okay ...
However, I want it to be displayed like the following:
SenderId | Email | Message
--------------------------------------+---------------------+-------------------------------------------------------
2f5aca90-0599-400c-8455-31a22c1d4dcd | user1@example.com | This is a test message.
2f5aca90-0599-400c-8455-31a22c1d4dcd | user1@example.com | This is also a test message.
b3e218e6-dd41-4223-978b-a54f46fc465e | user2@example.com | okay ...
What would be the correct query for that?
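Listing both tables without a join condition produces a cross join (every message paired with every user). Join them on the sender id instead. Try this (Postgres syntax; note that mixed-case identifiers such as "ChatMessages" and "SenderId" must be double-quoted in Postgres, exactly as in the question):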
SELECT cm.SenderId, cu.Email, cm.Message FROM ChatMessages cm JOIN ChatUsers cu ON cm.SenderId = cu.Id

SQL - Using COUNT with alias shorthand

I'm learning SQL and having trouble performing a query that uses COUNT.
I have a movies database with three tables:
Table public.movies:
Column | Type | Collation | Nullable | Default
--------------+---------+-----------+----------+------------------------------------
id | integer | | not null | nextval('movies_id_seq'::regclass)
title | text | | not null |
release_year | integer | | not null |
runtime | integer | | not null |
rating | text | | not null |
studio_id | integer | | |
Table public.stars:
Column | Type | Collation | Nullable | Default
------------+---------+-----------+----------+-----------------------------------
id | integer | | not null | nextval('stars_id_seq'::regclass)
first_name | text | | not null |
last_name | text | | |
birth_date | date | | not null |
Indexes:
"stars_pkey" PRIMARY KEY, btree (id)
Referenced by:
TABLE "roles" CONSTRAINT "roles_star_id_fkey" FOREIGN KEY (star_id) REFERENCES stars(id) ON DELETE CASCADE
Table public.roles:
Column | Type | Collation | Nullable | Default
----------+---------+-----------+----------+-----------------------------------
id | integer | | not null | nextval('roles_id_seq'::regclass)
movie_id | integer | | |
star_id | integer | | |
Indexes:
"roles_pkey" PRIMARY KEY, btree (id)
Foreign-key constraints:
"roles_movie_id_fkey" FOREIGN KEY (movie_id) REFERENCES movies(id) ON DELETE CASCADE
"roles_star_id_fkey" FOREIGN KEY (star_id) REFERENCES stars(id) ON DELETE CASCADE
I'm trying to select the first and last names of every star along with the number of movies they have been in, and group everything by first and last name (so as not to have any duplicate stars). This is what I tried:
SELECT s.first_name, s.last_name, m.COUNT(*)
FROM movies m
JOIN roles r
ON m.id = r.movie_id
JOIN stars s
ON r.star_id = s.id
GROUP BY s.first_name, s.last_name;
I keep getting ERROR: schema "m" does not exist.
I'd appreciate any pointers.
The correct syntax just uses COUNT(*):
SELECT s.first_name, s.last_name, COUNT(*)
FROM movies m JOIN
roles r
ON m.id = r.movie_id JOIN
stars s
ON r.star_id = s.id
GROUP BY s.id, s.first_name, s.last_name;
To avoid duplicates, use the primary key in the GROUP BY.
Note that you don't need movies in the query:
SELECT s.first_name, s.last_name, COUNT(*)
FROM roles r JOIN
stars s
ON r.star_id = s.id
GROUP BY s.id, s.first_name, s.last_name;
Finally, if a star can play more than one role in a movie, you will want to use COUNT(DISTINCT r.movie_id) in the SELECT.

MariaDB - insert record or update timestamp if record exists

Objective: cronjob runs a task; when completed successfully, insert new host record. If record exists, update timestamp to reflect this status.
# Table layout
> describe hosts_completed;
+-----------+---------------------+------+-----+-------------------+-----------------------------+
| Field | Type | Null | Key | Default | Extra |
+-----------+---------------------+------+-----+-------------------+-----------------------------+
| id | bigint(20) unsigned | NO | PRI | NULL | auto_increment |
| timestamp | timestamp | NO | | CURRENT_TIMESTAMP | on update CURRENT_TIMESTAMP |
| hostname | varchar(32) | YES | MUL | NULL | |
+-----------+---------------------+------+-----+-------------------+-----------------------------+
# Current inventory
> select * from hosts_completed;
+----+---------------------+----------+
| id | timestamp | hostname |
+----+---------------------+----------+
| 10 | 2020-11-02 12:51:08 | myHost1 |
| 11 | 2020-11-02 14:32:16 | MyHost2 |
+----+---------------------+----------+
I want to update the status for myHost1 and my best shot would be like
> insert into hosts_completed(hostname) values("myHost1") ON DUPLICATE KEY UPDATE timestamp=now();
and it runs but adds a new record, it does not update the myHost1 record.
Where is the glitch?
The on duplicate key syntax requires a unique constraint on the column that is used to detect the conflict. Create it first:
alter table hosts_completed
add constraint unique_hostname
unique (hostname);
Note that this requires that the column contains no duplicates (otherwise you need to clean up your data before you can create the constraint).
Then you can use your current query:
insert into hosts_completed(hostname)
values('myHost1')
on duplicate key update timestamp = now();

How to choose indexes

I'm trying to execute the following query several times:
SELECT st2.stop_id AS to_stop_id,
TIME_TO_SEC(
ADDTIME(TIMEDIFF(MIN(st1.time), %time),
TIMEDIFF(st2.time, st2.time))) AS duration
FROM stop_times st1,
stop_times st2,
trips tr,
calendar cal
WHERE tr.service_id = cal.service_id
AND tr.trip_id = st1.trip_id
AND st1.trip_id = st2.trip_id
AND st1.stop_id = %sid
AND st1.stop_seq +1 = st2.stop_seq
AND st1.time > %time
AND DATE(NOW()) BETWEEN cal.start_date AND
cal.end_date
GROUP BY st2.stop_id
However, it runs extremely slowly. I indexed the following attributes:
+------------+------------+------------+--------------+-------------+-----------+-------------+----------+--------+------+------------+---------+---------------+
| Table | Non_unique | Key_name | Seq_in_index | Column_name | Collation | Cardinality | Sub_part | Packed | Null | Index_type | Comment | Index_comment |
+------------+------------+------------+--------------+-------------+-----------+-------------+----------+--------+------+------------+---------+---------------+
| stop_times | 0 | st_id | 1 | st_id | A | 11431583 | NULL | NULL | | BTREE | | |
| stop_times | 1 | fk_tid_s | 1 | trip_id | A | 1039234 | NULL | NULL | YES | BTREE | | |
| stop_times | 1 | st_per_sid | 1 | stop_id | A | 33135 | NULL | NULL | YES | BTREE | | |
| calendar | 0 | PRIMARY | 1 | service_id | A | 5206 | NULL | NULL | | BTREE | | |
| calendar | 0 | PRIMARY | 1 | service_id | A | 5206 | NULL | NULL | | BTREE | | |
| trips | 0 | PRIMARY | 1 | trip_id | A | 449489 | NULL | NULL | | BTREE | | |
| trips | 1 | fk_rid | 1 | route_id | A | 1937 | NULL | NULL | YES | BTREE | | |
| trips | 1 | fk_sid | 1 | service_id | A | 7749 | NULL | NULL | YES | BTREE | | |
+------------+------------+------------+--------------+-------------+-----------+-------------+----------+--------+------+------------+---------+---------------+
(For some reason, st_id is not shown as a PRIMARY KEY, but it is one; I don't know if it's important, but just in case.)
I ran SQL EXPLAIN on this query and it gave me the following answer :
+------+-------------+-------+--------+-------------------------------------------------+---------------------+---------+------------------------------+------+---------------------------------------------------------------------+
| id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra |
+------+-------------+-------+--------+-------------------------------------------------+---------------------+---------+------------------------------+------+---------------------------------------------------------------------+
| 1 | SIMPLE | st1 | range | comp_uniq_st_seq,st_per_sid,comp_uniq_stid_time | comp_uniq_stid_time | 9 | NULL | 1396 | Using index condition; Using where; Using temporary; Using filesort |
| 1 | SIMPLE | tr | eq_ref | PRIMARY,fk_sid | PRIMARY | 8 | reseau_ratp.st1.trip_id | 1 | Using where |
| 1 | SIMPLE | cal | eq_ref | PRIMARY,comp_sid_date_en,comp_sid_date_st | PRIMARY | 4 | reseau_ratp.tr.service_id | 1 | Using where |
| 1 | SIMPLE | st2 | ref | comp_uniq_st_seq | comp_uniq_st_seq | 14 | reseau_ratp.st1.trip_id,func | 1 | Using index condition |
+------+-------------+-------+--------+-------------------------------------------------+---------------------+---------+------------------------------+------+---------------------------------------------------------------------+
What should I do to get this query running faster?
EDIT :
Query using the requested syntax :
SELECT st2.stop_id AS to_stop_id,
TIME_TO_SEC(
ADDTIME(TIMEDIFF(MIN(st1.time), %time),
TIMEDIFF(st2.time, st2.time))) AS duration
FROM stop_times st1
INNER JOIN stop_times st2
ON st1.trip_id = st2.trip_id AND st1.stop_seq + 1 = st2.stop_seq
INNER JOIN trips tr
ON tr.trip_id = st1.trip_id
INNER JOIN calendar cal
ON tr.service_id = cal.service_id
WHERE st1.stop_id = %sid
AND st1.time > %time
AND cal.start_date <= NOW()
AND cal.end_date >= NOW()
GROUP BY st2.stop_id
Here SHOW CREATE TABLE stop_times:
CREATE TABLE `stop_times` (
`trip_id` bigint(10) unsigned DEFAULT NULL,
`stop_id` int(10) DEFAULT NULL,
`time` time DEFAULT NULL,
`stop_seq` int(10) unsigned DEFAULT NULL,
UNIQUE KEY `comp_uniq_st_seq` (`trip_id`,`stop_seq`),
KEY `comp_uniq_stid_time` (`stop_id`,`time`),
CONSTRAINT `fk_sid_s` FOREIGN KEY (`stop_id`) REFERENCES `stops` (`stop_id`),
CONSTRAINT `fk_tid_s` FOREIGN KEY (`trip_id`) REFERENCES `trips` (`trip_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8
For calendar :
CREATE TABLE `calendar` (
`service_id` int(10) unsigned NOT NULL,
`start_date` date DEFAULT NULL,
`end_date` date DEFAULT NULL,
PRIMARY KEY (`service_id`),
KEY `comp_sid_date_en` (`service_id`,`end_date`),
KEY `comp_sid_date_st` (`service_id`,`start_date`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8
And for trips :
CREATE TABLE `trips` (
`trip_id` bigint(10) unsigned NOT NULL DEFAULT '0',
`route_id` int(10) unsigned DEFAULT NULL,
`service_id` int(10) unsigned DEFAULT NULL,
`trip_headsign` varchar(15) DEFAULT NULL,
`trip_short_name` varchar(15) DEFAULT NULL,
`direction_id` tinyint(1) DEFAULT NULL,
PRIMARY KEY (`trip_id`),
KEY `fk_rid` (`route_id`),
KEY `fk_sid` (`service_id`),
CONSTRAINT `fk_rid` FOREIGN KEY (`route_id`) REFERENCES `routes` (`route_id`),
CONSTRAINT `fk_sid` FOREIGN KEY (`service_id`) REFERENCES `calendar` (`service_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8
st1 needs this composite index: INDEX(stop_id, time)
Please use the JOIN ... ON syntax.
Please provide SHOW CREATE TABLE.
Here is a Cookbook on creating INDEXes from a SELECT.
(Edit)
Calendar is trickier to handle, and there is no "good" index. These may help:
INDEX(service_id, start_date)
INDEX(service_id, end_date)
plus, reformulate AND DATE(NOW()) BETWEEN cal.start_date AND cal.end_date into
AND cal.start_date <= NOW()
AND cal.end_date >= NOW()
(Edit 2)
Wherever practical, say NOT NULL. This is probably especially important in stop_times which does not have a PRIMARY KEY. Change the two columns in UNIQUE KEY comp_uniq_st_seq (trip_id,stop_seq) to be NOT NULL and turn it into PRIMARY KEY (trip_id, stop_seq). This will allow the performance benefits of "the PK is clustered with the data" to kick in.
Now that I see the CREATE TABLE for Calendar, and that service_id is the PRIMARY KEY, the two indexes I suggested for it are probably useless. (Again, this relates to "clustering".)
My Cookbook for building indexes may come in handy.

Integer comparison in mysql using an index

I need to compare integers in a mysql table. Pretty simple, but this table is fairly large... so queries take a long time. No problem, I can use an index. According to MySQL documentation, I should be able to use an index for comparison operators:
"A B-tree index can be used for column comparisons in expressions that use the =, >, >=, <, <=, or BETWEEN"
However, when I try this it has no effect on performance and the index is not used according to explain :(
SELECT * FROM Node n WHERE n.X < 800000
That results in extremely poor performance, and calling explain shows our "Rectangle_Index" as being one of the possible_keys, but a NULL key was actually used... Here's our create table statement:
CREATE TABLE `Visual_Node` (
`Id` bigint(20) NOT NULL AUTO_INCREMENT,
`X` bigint(20) NOT NULL,
`Y` bigint(20) NOT NULL,
`X_plus_Width` bigint(20) DEFAULT NULL,
`Y_plus_Height` bigint(20) DEFAULT NULL,
PRIMARY KEY (`Id`),
KEY `Rectangle_Index` (`X`,`X_plus_Width`,`Y`,`Y_plus_Height`)
) ENGINE=InnoDB AUTO_INCREMENT=4340743 DEFAULT CHARSET=latin1
Can anyone help this query? The actual query I want to run is the following:
SELECT * FROM Node n WHERE 800000 BETWEEN n.X and n.X_plus_Width AND 1234567 BETWEEN n.Y and n.Y_plus_Height
Update (asked in one of the answers below)
Here's the output of the explain for the basic query:
Altering the table structure is very difficult for me. Here's the output of my explain:
mysql> explain select * from Node n where n.X < 800000;
+----+-------------+-------+------+-----------------+------+---------+------+--------+-------------+
| id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra |
+----+-------------+-------+------+-----------------+------+---------+------+--------+-------------+
| 1 | SIMPLE | n | ALL | Rectangle_Index | NULL | NULL | NULL | 173952 | Using where |
+----+-------------+-------+------+-----------------+------+---------+------+--------+-------------+
1 row in set (0.02 sec)
If you rewrite your query as
SELECT *
FROM Node n
WHERE
n.X <= 800000 AND
n.X_plus_Width >= 800000 AND
n.Y <= 1234567 AND
n.Y_plus_Height >= 1234567
MySQL could use the index for one column (it can't use an index for more than one range condition, and you have four of them).
I suggest you to take a look at Spatial extensions
Have you checked the details of multiple-column indexes - specifically, the part about how the optimizer is (or is not) able to use them? Here's a quote from this page:
If the table has a multiple-column
index, any leftmost prefix of the
index can be used by the optimizer to
find rows. For example, if you have a
three-column index on (col1, col2,
col3), you have indexed search
capabilities on (col1), (col1, col2),
and (col1, col2, col3).
Perhaps you could try creating multiple single-column indexes, rather than one multiple-column index?
EDIT 1:
I put together a simple test on my copy of MySQL (version 5.0.51a-24+lenny3). It shows that when using both your proper query, and your test query, your Rectangle_Index is being used. However, when using the proper query, the key_len is 8, suggesting that not all the parts of the multi-column index are being used. Perhaps the output from your version of MySQL differs in this respect.
As you'll see from the output below, even when additional indexes are added, the Rectangle_Index index is still chosen in all cases, except when only the Y column is referenced in the query:
CREATE TABLE `Visual_Node` (
`Id` bigint(20) NOT NULL AUTO_INCREMENT,
`X` bigint(20) NOT NULL,
`Y` bigint(20) NOT NULL,
`X_plus_Width` bigint(20) DEFAULT NULL,
`Y_plus_Height` bigint(20) DEFAULT NULL,
PRIMARY KEY (`Id`),
KEY `Rectangle_Index` (`X`,`X_plus_Width`,`Y`,`Y_plus_Height`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1;
INSERT INTO `Visual_Node` VALUES
(1, 100000, 1000000, 3000000, 3000000),
(2, 200000, 2000000, 4000000, 4000000),
(3, 300000, 3000000, 5000000, 5000000),
(4, 400000, 4000000, 6000000, 6000000),
(5, 500000, 5000000, 7000000, 7000000),
(6, 600000, 6000000, 8000000, 8000000),
(7, 700000, 7000000, 9000000, 9000000),
(8, 800000, 8000000, 10000000, 10000000),
(9, 900000, 9000000, 11000000, 11000000),
(10, 1000000, 10000000, 12000000, 12000000);
EXPLAIN SELECT * FROM Visual_Node n WHERE n.X < 800000;
+----+-------------+-------+-------+-----------------+-----------------+---------+------+------+--------------------------+
| id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra |
+----+-------------+-------+-------+-----------------+-----------------+---------+------+------+--------------------------+
| 1 | SIMPLE | n | range | Rectangle_Index | Rectangle_Index | 8 | NULL | 5 | Using where; Using index |
+----+-------------+-------+-------+-----------------+-----------------+---------+------+------+--------------------------+
EXPLAIN SELECT * FROM Visual_Node n WHERE n.Y < 800000;
+----+-------------+-------+-------+---------------+-----------------+---------+------+------+--------------------------+
| id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra |
+----+-------------+-------+-------+---------------+-----------------+---------+------+------+--------------------------+
| 1 | SIMPLE | n | index | NULL | Rectangle_Index | 34 | NULL | 10 | Using where; Using index |
+----+-------------+-------+-------+---------------+-----------------+---------+------+------+--------------------------+
EXPLAIN SELECT * FROM Visual_Node n
WHERE 800000 BETWEEN n.X and n.X_plus_Width
AND 1234567 BETWEEN n.Y and n.Y_plus_Height;
+----+-------------+-------+-------+-----------------+-----------------+---------+------+------+--------------------------+
| id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra |
+----+-------------+-------+-------+-----------------+-----------------+---------+------+------+--------------------------+
| 1 | SIMPLE | n | range | Rectangle_Index | Rectangle_Index | 8 | NULL | 5 | Using where; Using index |
+----+-------------+-------+-------+-----------------+-----------------+---------+------+------+--------------------------+
ALTER TABLE `Visual_Node` ADD INDEX `X_Index` (`X`,`X_plus_Width`);
ALTER TABLE `Visual_Node` ADD INDEX `Y_Index` (`Y`,`Y_plus_Height`);
EXPLAIN SELECT * FROM Visual_Node n WHERE n.X < 800000;
+----+-------------+-------+-------+-------------------------+-----------------+---------+------+------+--------------------------+
| id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra |
+----+-------------+-------+-------+-------------------------+-----------------+---------+------+------+--------------------------+
| 1 | SIMPLE | n | range | Rectangle_Index,X_Index | Rectangle_Index | 8 | NULL | 5 | Using where; Using index |
+----+-------------+-------+-------+-------------------------+-----------------+---------+------+------+--------------------------+
EXPLAIN SELECT * FROM Visual_Node n WHERE n.Y < 800000;
+----+-------------+-------+-------+---------------+---------+---------+------+------+-------------+
| id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra |
+----+-------------+-------+-------+---------------+---------+---------+------+------+-------------+
| 1 | SIMPLE | n | range | Y_Index | Y_Index | 8 | NULL | 1 | Using where |
+----+-------------+-------+-------+---------------+---------+---------+------+------+-------------+
EXPLAIN SELECT * FROM Visual_Node n
WHERE 800000 BETWEEN n.X and n.X_plus_Width
AND 1234567 BETWEEN n.Y and n.Y_plus_Height;
+----+-------------+-------+-------+---------------------------------+-----------------+---------+------+------+--------------------------+
| id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra |
+----+-------------+-------+-------+---------------------------------+-----------------+---------+------+------+--------------------------+
| 1 | SIMPLE | n | range | Rectangle_Index,X_Index,Y_Index | Rectangle_Index | 8 | NULL | 5 | Using where; Using index |
+----+-------------+-------+-------+---------------------------------+-----------------+---------+------+------+--------------------------+
ALTER TABLE `Visual_Node` ADD INDEX `X` (`X`,`X_plus_Width`);
ALTER TABLE `Visual_Node` ADD INDEX `X_plus_Width` (`X_plus_Width`);
ALTER TABLE `Visual_Node` ADD INDEX `Y` (`Y`);
ALTER TABLE `Visual_Node` ADD INDEX `Y_plus_Height` (`Y_plus_Height`);
EXPLAIN SELECT * FROM Visual_Node n WHERE n.X < 800000;
+----+-------------+-------+-------+---------------------------+-----------------+---------+------+------+--------------------------+
| id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra |
+----+-------------+-------+-------+---------------------------+-----------------+---------+------+------+--------------------------+
| 1 | SIMPLE | n | range | Rectangle_Index,X_Index,X | Rectangle_Index | 8 | NULL | 5 | Using where; Using index |
+----+-------------+-------+-------+---------------------------+-----------------+---------+------+------+--------------------------+
EXPLAIN SELECT * FROM Visual_Node n WHERE n.Y < 800000;
+----+-------------+-------+-------+---------------+---------+---------+------+------+-------------+
| id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra |
+----+-------------+-------+-------+---------------+---------+---------+------+------+-------------+
| 1 | SIMPLE | n | range | Y_Index,Y | Y_Index | 8 | NULL | 1 | Using where |
+----+-------------+-------+-------+---------------+---------+---------+------+------+-------------+
EXPLAIN SELECT * FROM Visual_Node n
WHERE 800000 BETWEEN n.X and n.X_plus_Width
AND 1234567 BETWEEN n.Y and n.Y_plus_Height;
+----+-------------+-------+-------+----------------------------------------------------------------+-----------------+---------+------+------+--------------------------+
| id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra |
+----+-------------+-------+-------+----------------------------------------------------------------+-----------------+---------+------+------+--------------------------+
| 1 | SIMPLE | n | range | Rectangle_Index,X_Index,Y_Index,X,X_plus_Width,Y,Y_plus_Height | Rectangle_Index | 8 | NULL | 5 | Using where; Using index |
+----+-------------+-------+-------+----------------------------------------------------------------+-----------------+---------+------+------+--------------------------+
Can you post the output from your EXPLAIN query?
What version of MySQL are you using?
EDIT 2:
The Spatial Extensions, as suggested by Naktibalda, are really cool. I'd not used these before, but if you are able to alter your table structure to use them, they may solve your problem.
Curious, I did a little research, and here's the result of my test scripts:
CREATE TABLE `Spatial_Node` (
`Id` bigint(20) NOT NULL AUTO_INCREMENT,
`Rectangle` POLYGON NOT NULL,
PRIMARY KEY (`Id`),
SPATIAL KEY `Rectangle` (`Rectangle`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1;
INSERT INTO `Spatial_Node` (`Rectangle`)
SELECT Polygon(LineString(
Point(X, Y),
Point(X_plus_Width, Y),
Point(X_plus_Width, Y_plus_Height),
Point(X, Y_plus_Height),
Point(X, Y)
))
FROM Visual_Node;
SELECT AsText(`Rectangle`) FROM Spatial_Node
WHERE MBRContains(Rectangle, Point(100001, 1000001));
+-----------------------------------------------------------------------------------------+
| AsText(`Rectangle`) |
+-----------------------------------------------------------------------------------------+
| POLYGON((100000 1000000,3000000 1000000,3000000 3000000,100000 3000000,100000 1000000)) |
+-----------------------------------------------------------------------------------------+
EXPLAIN SELECT AsText(`Rectangle`) FROM Spatial_Node
WHERE MBRContains(Rectangle, Point(100001, 1000001));
+----+-------------+--------------+-------+---------------+-----------+---------+------+------+-------------+
| id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra |
+----+-------------+--------------+-------+---------------+-----------+---------+------+------+-------------+
| 1 | SIMPLE | Spatial_Node | range | Rectangle | Rectangle | 32 | NULL | 1 | Using where |
+----+-------------+--------------+-------+---------------+-----------+---------+------+------+-------------+
I have no idea how the speed will compare, but I've definitely learned something new and exciting today. Thanks Naktibalda :-)
Have you tried changing the index to:
CREATE TABLE `Visual_Node` (
`Id` bigint(20) NOT NULL AUTO_INCREMENT,
`X` bigint(20) NOT NULL,
`Y` bigint(20) NOT NULL,
`X_plus_Width` bigint(20) DEFAULT NULL,
`Y_plus_Height` bigint(20) DEFAULT NULL,
PRIMARY KEY (`Id`),
KEY `X_Index` (`X`),
KEY `Y_Index` (`Y`),
KEY `X_Width_Index` (`X_plus_Width`),
KEY `Y_Height_Index` (`Y_plus_Height`)
) ENGINE=InnoDB AUTO_INCREMENT=4340743 DEFAULT CHARSET=latin1
Judging by your AUTO_INCREMENT value, you'll probably want to test this with a smaller set of data.