There is code that has been running in production for six months; it loops over a given list of tables and runs a Redshift COPY for each. It ran successfully until 31st October, but from 1st November onward it has been failing for one particular table (it still runs fine for the others).
## Truncate and execute Copy command.
def ExecuteCopyCommand(TableList):
    QueryIdDict = {}
    for TableName in TableList:
        SourcePath = f's3://{BucketName}/{prefix}'
        query = f"truncate table {TableName}; \
                   copy {TableName} \
                   from '{SourcePath}' \
                   iam_role 'abcd' \
                   delimiter as '.' \
                   ignoreheader 1 \
                   dateformat as 'auto' \
                   timeformat as 'auto' \
                   Null as 'NULL';"
        ## Executing truncate and copy command on redshift cluster
        try:
            response = client.execute_statement(
                ClusterIdentifier='redshift-abc',
                Database='abc',
                SecretArn='arn:aws:secretsmanager:abcd',
                Sql=query
            )
            print(TableName + ": Copy command executed")
            print('Query', query)
            print('Response', response)
            QueryId = response['Id']
            QueryIdDict[QueryId] = TableName
            DataDict = {'Level': 'Info',
                        'SourceLocation': SourcePath,
                        'TargetDatabaseName': 'redshift-abc',
                        'TargetSchemaName': str(TableName.split('.')[0]),
                        'TargetTableName': str(TableName.split('.')[1]),
                        'ExecutedQuery': query.strip(),
                        'ExecutedQueryId': str(QueryId),
                        'Description': 'Copy command executed on redshift and query is in progress.',
                        'Status': 'Succeeded'
                        }
            DataList.append(DataDict)
            time.sleep(1)
        except Exception as e:
            DataDict = {'Level': 'Error',
                        'SourceLocation': SourcePath,
                        'TargetDatabaseName': 'redshift-abc',
                        'TargetSchemaName': str(TableName.split('.')[0]),
                        'TargetTableName': str(TableName.split('.')[1]),
                        'ExecutedQuery': query.strip(),
                        'ExecutedQueryId': '',
                        'Description': f'Fail to execute copy command. Error : {str(e)}',
                        'Status': 'Failed'
                        }
            DataList.append(DataDict)
            print('Error occur in ExecuteCopyCommand block.')
            print('Error occur while executing copy command.')
            print('TableName : ' + TableName)
            print(e)
            raise
    print('Query dict', QueryIdDict)
    return QueryIdDict
The below code fails with the following error:
Main error: Exception: ERROR: could not open relation with OID 591927
Traceback:
test_table: Copy command executed
Query truncate table test_table; copy test_table from 's3://bucket_test/pipeline/test_table/year=2022/month=02/day=28/' iam_role 'arn:aws:iam::xyz:role/Account-B-Glue-Redshift-Cloudwatch' delimiter as '.' ignoreheader 1 dateformat as 'auto' timeformat as 'auto' Null as 'NULL';
Response {'ClusterIdentifier': 'redshift-abc', 'CreatedAt': datetime.datetime(2022, 11, 10, 6, 21, 42, 363000, tzinfo=tzlocal()), 'Database': 'abc', 'Id': 'abcdcs-4878-446b-80e9-8d544860847a', 'SecretArn': 'arn:aws:secretsmanager:abcd', 'ResponseMetadata': {'RequestId': '690f6542-4e33-4d84-afb8-2f9ebc9af62e', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '690f6542-4e33-4d84-afb8-2f9ebc9af62e', 'content-type': 'application/x-amz-json-1.1', 'content-length': '231', 'date': 'Thu, 10 Nov 2022 06:21:42 GMT'}, 'RetryAttempts': 0}}
Query dict {'abcdcs-4878-446b-80e9-8d544860847a': 'test_table'}
QueryId of executed copy command
{'abcdcs-4878-446b-80e9-8d544860847a': 'test_table'}
Checking executed query status for each table.
test_table: Copy command failed
{'ClusterIdentifier': 'redshift-abc', 'CreatedAt': datetime.datetime(2022, 11, 10, 6, 21, 42, 363000, tzinfo=tzlocal()), 'Duration': -1, 'Error': 'ERROR: could not open relation with OID 591927', 'HasResultSet': False, 'Id': '9c6cb33c-4878-446b-80e9-8d544860847a', 'QueryString': " truncate table test_table; copy test_table from 's3://bucket_test/pipeline/test_table/year=2022/month=02/day=28/' iam_role '' delimiter as '\x01' ignoreheader 1 dateformat as 'auto' timeformat as 'auto' Null as 'NULL';", 'RedshiftPid': 1073775000, 'RedshiftQueryId': 6553022, 'ResultRows': -1, 'ResultSize': -1, 'SecretArn': 'arn:aws:secretsmanager:abcd', 'Status': 'FAILED', 'UpdatedAt': datetime.datetime(2022, 11, 10, 6, 21, 42, 937000, tzinfo=tzlocal()), 'ResponseMetadata': {'RequestId': 'c77cb319-14d3-42fd-8c34-611dbd5a17b4', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'c77cb319-14d3-42fd-8c34-611dbd5a17b4', 'content-type': 'application/x-amz-json-1.1', 'content-length': '890', 'date': 'Thu, 10 Nov 2022 06:22:13 GMT'}, 'RetryAttempts': 0}}
Error occur in CheckQueryStatus block
ERROR: could not open relation with OID 591927
Error occur in main block.
Fail to refresh table in redshift.
{'MessageId': 'eb6338b8-cd1d-5d47-8a63-635e57fee266', 'ResponseMetadata': {'RequestId': '60766afd-c861-5c1d-9d61-311b5282333c', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '60766afd-c861-5c1d-9d61-311b5282333c', 'content-type': 'text/xml', 'content-length': '294', 'date': 'Thu, 10 Nov 2022 06:22:26 GMT'}, 'RetryAttempts': 0}}
Email Notification sent to respective e-mail id.
ERROR: could not open relation with OID 591927
The error is raised from the CheckQueryStatus function, which is as follows:
## Check executed query status.
def CheckQueryStatus(QueryIdDict):
    InprogressQueryIdList = [key for key in QueryIdDict.keys()]
    SucceedTableList = []
    ## Expected Status of running query
    FailStatus = ['ABORTED', 'FAILED']
    InprogressStatus = ['SUBMITTED', 'PICKED', 'STARTED']
    SucceedStatus = ['FINISHED']
    try:
        while len(InprogressQueryIdList):
            for QueryId in InprogressQueryIdList:
                response = client.describe_statement(
                    Id=QueryId
                )
                if response['Status'] in SucceedStatus:
                    SucceedTableList.append(QueryIdDict[QueryId])
                    InprogressQueryIdList.remove(QueryId)
                    print('Query Executed Sucessfully : ' + QueryIdDict[QueryId])
                    SourcePath = f's3://{BucketName}/pipeline/{QueryIdDict[QueryId]}/{PathPrefix}/'
                    DataDict = {'Level': 'Info',
                                'SourceLocation': SourcePath,
                                'TargetDatabaseName': 'abc',
                                'TargetSchemaName': str(QueryIdDict[QueryId].split('.')[0]),
                                'TargetTableName': str(QueryIdDict[QueryId].split('.')[1]),
                                'ExecutedQuery': '',
                                'ExecutedQueryId': str(QueryId),
                                'Description': 'Data loaded successfully in staging table',
                                'Status': 'Succeed'
                                }
                    DataList.append(DataDict)
                elif response['Status'] in InprogressStatus:
                    time.sleep(30)
                else:
                    print(QueryIdDict[QueryId] + ': Copy command failed\n')
                    print(response)
                    raise Exception(str(response['Error']))
        print('Table refreshed successfully\n')
        print(SucceedTableList)
    except Exception as e:
        SourcePath = f's3://{BucketName}/pipeline/{QueryIdDict[QueryId]}/{PathPrefix}/'
        DataDict = {'Level': 'Error',
                    'SourceLocation': SourcePath,
                    'TargetDatabaseName': 'abc',
                    'TargetSchemaName': str(QueryIdDict[QueryId].split('.')[0]),
                    'TargetTableName': str(QueryIdDict[QueryId].split('.')[1]),
                    'ExecutedQuery': '',
                    'ExecutedQueryId': str(QueryId),
                    'Description': f'Copy command failed.{response["Error"]}',
                    'Status': 'Failed'
                    }
        DataList.append(DataDict)
        print('Error occur in CheckQueryStatus block')
        print(e)
        raise
Now:
When I run the same copy command from DBeaver or some other query tool, it works perfectly fine.
When I run this exact same code for other tables, it works fine. It only fails for this one table.
I created a test table to check whether this is the typical Postgres OID issue, but the error could still be replicated.
This has brought me to a state of confusion. Any help?
This error is usually caused by stale table metadata: some other process is dropping the target table (and possibly recreating a new table with the same name), so the OID that the statement resolved no longer exists. See similar questions / answers - tracing the cause of "could not open relation with OID" error
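One way to confirm this is to log the table's OID on every run; if the OID changes between runs, something is dropping and recreating the table rather than just truncating it. Below is a minimal diagnostic sketch, assuming client is the same boto3 redshift-data client used in the question and reusing the question's placeholder cluster, database and secret identifiers:
import time

def log_table_oid(client, table_name):
    ## pg_class keeps one row per relation; if the OID printed here changes between
    ## runs, the table is being dropped and recreated rather than just truncated.
    relname = table_name.split('.')[-1]
    response = client.execute_statement(
        ClusterIdentifier='redshift-abc',
        Database='abc',
        SecretArn='arn:aws:secretsmanager:abcd',
        Sql=f"select oid, relname from pg_class where relname = '{relname}';"
    )
    ## Poll until the statement reaches a terminal state, then fetch the rows.
    while client.describe_statement(Id=response['Id'])['Status'] not in ('FINISHED', 'FAILED', 'ABORTED'):
        time.sleep(1)
    result = client.get_statement_result(Id=response['Id'])
    print('Current OID for', relname, ':', result['Records'])
Comparing this output just before a failing run with the output after a successful manual run from DBeaver should show whether the OID is moving.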
I have a problem with an Ecto query. I have this function:
def get_critials() do
  critical_time = DateTime.to_naive(Timex.shift(Timex.now, seconds: -600))
  query = "SELECT d.*"
    <> " FROM sc_devices AS d"
    <> " INNER JOIN log_device_commands AS ldc ON ldc.device_id = d.id"
    <> " WHERE ldc.inserted_at < timestamp '#{critical_time}'"

  {:ok, result} = Ecto.Adapters.SQL.query(Repo, query, [], [:rows])
  result.rows
end
What I want is to get all records from the sc_devices table where the updated_at column in log_device_commands is older than 600 seconds, but I receive this output:
[
[1, "LAMP 1XX_1", "1.st Lamp on the corner", 1,
"6c7572e1-460f-43dd-b137-90c21d33525b", "XCA190SS2020DE", 3, 1, 1, 46.55472,
15.64667, true, nil, ~N[2020-11-12 20:32:22.000000],
~N[2020-11-12 20:32:22.000000], 2],
[1, "LAMP 1XX_1", "1.st Lamp on the corner", 1,
"6c7572e1-460f-43dd-b137-90c21d33525b", "XCA190SS2020DE", 3, 1, 1, 46.55472,
15.64667, true, nil, ~N[2020-11-12 20:32:22.000000],
~N[2020-11-12 20:32:22.000000], 2],
[1, "LAMP 1XX_1", "1.st Lamp on the corner", 1,
"6c7572e1-460f-43dd-b137-90c21d33525b", "XCA190SS2020DE", 3, 1, 1, 46.55472,
15.64667, true, nil, ~N[2020-11-12 20:32:22.000000],
~N[2020-11-12 20:32:22.000000], 2]
]
Any ideas how I can solve this?
You could use Postgres's CURRENT_TIMESTAMP - INTERVAL '600 seconds' instead of interpolating an Elixir variable into the query.
Also, I see you commented that you want to filter by updated_at, but your query is actually filtering by inserted_at.
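A minimal sketch of the rewritten query with both suggestions applied; this assumes your Repo module and the table/column names shown in the question, so treat it as illustrative rather than a drop-in replacement:
def get_criticals() do
  query = """
  SELECT d.*
  FROM sc_devices AS d
  INNER JOIN log_device_commands AS ldc ON ldc.device_id = d.id
  WHERE ldc.updated_at < CURRENT_TIMESTAMP - INTERVAL '600 seconds'
  """

  # No parameters are needed, since the time window is computed in the database.
  {:ok, result} = Ecto.Adapters.SQL.query(Repo, query, [])
  result.rows
end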
I have two array outputs where I need to iterate over each struct and compare the counts where the sources match. The comparison needs to be less than or equal to. My output sources look like this:
output_1: [%{source: "facebook", count: 3}, %{count: 1, source: "linkedin"}]
output_2: [%{source: "facebook", count: 2}, %{count: 1, source: "linkedin"}]
What's the best data structure to implement in order to make the Enumerables easiest and most efficient to compare?
If your order isn't guaranteed, my preferred way is to turn the reference list into a map and compare things by source.
iex> output_1 = [%{source: "facebook", count: 3}, %{count: 1, source: "linkedin"}]
[%{count: 3, source: "facebook"}, %{count: 1, source: "linkedin"}]
iex> output_2 = [%{source: "facebook", count: 2}, %{count: 1, source: "linkedin"}]
[%{count: 2, source: "facebook"}, %{count: 1, source: "linkedin"}]
iex> limits = Map.new(output_1, &{&1.source, &1.count})
%{"facebook" => 3, "linkedin" => 1}
iex> Enum.all?(output_2, & &1.count <= limits[&1.source])
true
Your current output format should be very efficient with the following code. You didn't say what you expected your output to be, nor in which direction the comparison should be done: output2 <= output1 or output1 <= output2, so I'm assuming a list of booleans and output1 <= output2:
defmodule A do
  def compare([%{count: count1} | maps1], [%{count: count2} | maps2]) do
    [count1 <= count2 | compare(maps1, maps2)]
  end

  def compare([], []), do: []
end
The following does the same thing and is easier to come up with and understand:
defmodule A do
  def compare(list1, list2), do: _compare(list1, list2, [])

  defp _compare([%{count: count1} | maps1], [%{count: count2} | maps2], acc) do
    _compare(maps1, maps2, [count1 <= count2 | acc])
  end

  defp _compare([], [], acc) do
    Enum.reverse(acc)
  end
end
In iex:
~/elixir_programs$ iex a.ex
Erlang/OTP 20 [erts-9.3] [source] [64-bit] [smp:4:4] [ds:4:4:10] [async-threads:10] [hipe] [kernel-poll:false]
Interactive Elixir (1.8.2) - press Ctrl+C to exit (type h() ENTER for help)
iex(1)> out1 = [%{source: "facebook", count: 3}, %{count: 1, source: "linkedin"}]
[
%{count: 3, source: "facebook"},
%{count: 1, source: "linkedin"}
]
iex(2)> out2 = [%{source: "facebook", count: 2}, %{count: 1, source: "linkedin"}]
[
%{count: 2, source: "facebook"},
%{count: 1, source: "linkedin"}
]
iex(3)> A.compare(out1, out2)
[false, true]
If instead, you need the result to be a single boolean, i.e. the facebook count is less than or equal to AND the linkedin count is less than or equal to, you can change the accumulator:
defmodule A do
  def compare(list1, list2), do: _compare(list1, list2, true)

  defp _compare([%{count: count1} | maps1], [%{count: count2} | maps2], true) do
    _compare(maps1, maps2, count1 <= count2)
  end

  defp _compare(_, _, false), do: false  # If you find a false comparison, stop and return false
  defp _compare([], [], _), do: true
end
In iex:
iex(22)> c "a.ex"
warning: redefining module A (current version defined in memory)
a.ex:1
[A]
iex(23)> A.compare(out1, out2)
false
This also works:
defmodule A do
  def compare(list1, list2) do
    List.first(list1)[:count] <= List.first(list2)[:count] and
      List.last(list1)[:count] <= List.last(list2)[:count]
  end
end
What's the best data structure to implement in order to make the Enumerables easiest and most efficient to compare?
Otherwise, I would nominate a keyword list like this:
[facebook: 3, linkedin: 1]
[facebook: 2, linkedin: 1]
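A small sketch of how the comparison could look with keyword lists, assuming (as in the first answer) that the first list holds the upper limits; the variable names here are purely illustrative:
limits = [facebook: 3, linkedin: 1]
counts = [facebook: 2, linkedin: 1]

# Keyword lists implement Access, so limits[source] looks up the count for that atom key.
Enum.all?(counts, fn {source, count} -> count <= limits[source] end)
#=> true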
The easiest would probably be to use Enum.zip/2 with Enum.all?/2. Something like the following should work; it compares output_1's counts against output_2's, so swap the zip order if you want the other direction:
output_1 = Enum.sort(output_1, fn a, b -> a.source <= b.source end)
output_2 = Enum.sort(output_2, fn a, b -> a.source <= b.source end)

output_1
|> Enum.zip(output_2)
|> Enum.all?(fn {a, b} -> a.count <= b.count end)
I just want to monitor my running spider's stats. I got the latest scrapy-plugins/scrapy-jsonrpc and configured the spider as follows:
EXTENSIONS = {
    'scrapy_jsonrpc.webservice.WebService': 500,
}
JSONRPC_ENABLED = True
JSONRPC_PORT = [60853]
but when I browse http://localhost:60853/, it just returns
{"resources": ["crawler"]}
and I can only get the running spider's name, not the stats.
Can anyone tell me where I went wrong? Thanks!
http://localhost:60853/ returns the resources available, /crawler being the only top-level one.
If you want to get stats for a spider, you'll need to query the /crawler/stats endpoint and call get_stats().
Here's an example using python-jsonrpc (here I configured the web service to listen on localhost and port 6024):
>>> import pyjsonrpc
>>> http_client = pyjsonrpc.HttpClient('http://localhost:6024/crawler/stats')
>>> http_client.call('get_stats', 'httpbin')
{u'log_count/DEBUG': 4, u'scheduler/dequeued': 4, u'log_count/INFO': 9, u'downloader/response_count': 2, u'downloader/response_status_count/200': 2, u'log_count/WARNING': 1, u'scheduler/enqueued/memory': 4, u'downloader/response_bytes': 639, u'start_time': u'2016-09-28 08:49:57', u'scheduler/dequeued/memory': 4, u'scheduler/enqueued': 4, u'downloader/request_bytes': 862, u'response_received_count': 2, u'downloader/request_method_count/GET': 4, u'downloader/request_count': 4}
>>> http_client.call('get_stats')
{u'log_count/DEBUG': 4, u'scheduler/dequeued': 4, u'log_count/INFO': 9, u'downloader/response_count': 2, u'downloader/response_status_count/200': 2, u'log_count/WARNING': 1, u'scheduler/enqueued/memory': 4, u'downloader/response_bytes': 639, u'start_time': u'2016-09-28 08:49:57', u'scheduler/dequeued/memory': 4, u'scheduler/enqueued': 4, u'downloader/request_bytes': 862, u'response_received_count': 2, u'downloader/request_method_count/GET': 4, u'downloader/request_count': 4}
>>> from pprint import pprint
>>> pprint(http_client.call('get_stats'))
{u'downloader/request_bytes': 862,
u'downloader/request_count': 4,
u'downloader/request_method_count/GET': 4,
u'downloader/response_bytes': 639,
u'downloader/response_count': 2,
u'downloader/response_status_count/200': 2,
u'log_count/DEBUG': 4,
u'log_count/INFO': 9,
u'log_count/WARNING': 1,
u'response_received_count': 2,
u'scheduler/dequeued': 4,
u'scheduler/dequeued/memory': 4,
u'scheduler/enqueued': 4,
u'scheduler/enqueued/memory': 4,
u'start_time': u'2016-09-28 08:49:57'}
>>>
You can also use jsonrpc_client_call from scrapy_jsonrpc.jsonrpc.
>>> from scrapy_jsonrpc.jsonrpc import jsonrpc_client_call
>>> jsonrpc_client_call('http://localhost:6024/crawler/stats', 'get_stats', 'httpbin')
{u'log_count/DEBUG': 5, u'scheduler/dequeued': 4, u'log_count/INFO': 11, u'downloader/response_count': 3, u'downloader/response_status_count/200': 3, u'log_count/WARNING': 1, u'scheduler/enqueued/memory': 4, u'downloader/response_bytes': 870, u'start_time': u'2016-09-28 09:01:47', u'scheduler/dequeued/memory': 4, u'scheduler/enqueued': 4, u'downloader/request_bytes': 862, u'response_received_count': 3, u'downloader/request_method_count/GET': 4, u'downloader/request_count': 4}
This is what you get "on the wire" for a request made with a modified example-client.py (see the code a bit further below; the example in https://github.com/scrapy-plugins/scrapy-jsonrpc is outdated as I write these lines):
POST /crawler/stats HTTP/1.1
Accept-Encoding: identity
Content-Length: 73
Host: localhost:6024
Content-Type: application/x-www-form-urlencoded
Connection: close
User-Agent: Python-urllib/2.7
{"params": ["httpbin"], "jsonrpc": "2.0", "method": "get_stats", "id": 1}
And the response
HTTP/1.1 200 OK
Content-Length: 504
Access-Control-Allow-Headers: X-Requested-With
Server: TwistedWeb/16.4.1
Connection: close
Date: Tue, 27 Sep 2016 11:21:43 GMT
Access-Control-Allow-Origin: *
Access-Control-Allow-Methods: GET, POST, PATCH, PUT, DELETE
Content-Type: application/json
{"jsonrpc": "2.0", "result": {"log_count/DEBUG": 5, "scheduler/dequeued": 4, "log_count/INFO": 11, "downloader/response_count": 3, "downloader/response_status_count/200": 3, "log_count/WARNING": 3, "scheduler/enqueued/memory": 4, "downloader/response_bytes": 870, "start_time": "2016-09-27 11:16:25", "scheduler/dequeued/memory": 4, "scheduler/enqueued": 4, "downloader/request_bytes": 862, "response_received_count": 3, "downloader/request_method_count/GET": 4, "downloader/request_count": 4}, "id": 1}
Here's the modified client to query /crawler/stats, which I called with ./example-client.py -H localhost -P 6024 get-spider-stats httpbin (for a running "httpbin" spider, JSONRPC_PORT being 6024 for me)
#!/usr/bin/env python
"""
Example script to control a Scrapy server using its JSON-RPC web service.

It only provides a reduced functionality as its main purpose is to illustrate
how to write a web service client. Feel free to improve or write you own.

Also, keep in mind that the JSON-RPC API is not stable. The recommended way for
controlling a Scrapy server is through the execution queue (see the "queue"
command).
"""

from __future__ import print_function

import sys, optparse, urllib, json
from six.moves.urllib.parse import urljoin

from scrapy_jsonrpc.jsonrpc import jsonrpc_client_call, JsonRpcError


def get_commands():
    return {
        'help': cmd_help,
        'stop': cmd_stop,
        'list-available': cmd_list_available,
        'list-running': cmd_list_running,
        'list-resources': cmd_list_resources,
        'get-global-stats': cmd_get_global_stats,
        'get-spider-stats': cmd_get_spider_stats,
    }

def cmd_help(args, opts):
    """help - list available commands"""
    print("Available commands:")
    for _, func in sorted(get_commands().items()):
        print("  ", func.__doc__)

def cmd_stop(args, opts):
    """stop <spider> - stop a running spider"""
    jsonrpc_call(opts, 'crawler/engine', 'close_spider', args[0])

def cmd_list_running(args, opts):
    """list-running - list running spiders"""
    for x in json_get(opts, 'crawler/engine/open_spiders'):
        print(x)

def cmd_list_available(args, opts):
    """list-available - list name of available spiders"""
    for x in jsonrpc_call(opts, 'crawler/spiders', 'list'):
        print(x)

def cmd_list_resources(args, opts):
    """list-resources - list available web service resources"""
    for x in json_get(opts, '')['resources']:
        print(x)

def cmd_get_spider_stats(args, opts):
    """get-spider-stats <spider> - get stats of a running spider"""
    stats = jsonrpc_call(opts, 'crawler/stats', 'get_stats', args[0])
    for name, value in stats.items():
        print("%-40s %s" % (name, value))

def cmd_get_global_stats(args, opts):
    """get-global-stats - get global stats"""
    stats = jsonrpc_call(opts, 'crawler/stats', 'get_stats')
    for name, value in stats.items():
        print("%-40s %s" % (name, value))

def get_wsurl(opts, path):
    return urljoin("http://%s:%s/" % (opts.host, opts.port), path)

def jsonrpc_call(opts, path, method, *args, **kwargs):
    url = get_wsurl(opts, path)
    return jsonrpc_client_call(url, method, *args, **kwargs)

def json_get(opts, path):
    url = get_wsurl(opts, path)
    return json.loads(urllib.urlopen(url).read())

def parse_opts():
    usage = "%prog [options] <command> [arg] ..."
    description = "Scrapy web service control script. Use '%prog help' " \
        "to see the list of available commands."
    op = optparse.OptionParser(usage=usage, description=description)
    op.add_option("-H", dest="host", default="localhost",
        help="Scrapy host to connect to")
    op.add_option("-P", dest="port", type="int", default=6080,
        help="Scrapy port to connect to")
    opts, args = op.parse_args()
    if not args:
        op.print_help()
        sys.exit(2)
    cmdname, cmdargs, opts = args[0], args[1:], opts
    commands = get_commands()
    if cmdname not in commands:
        sys.stderr.write("Unknown command: %s\n\n" % cmdname)
        cmd_help(None, None)
        sys.exit(1)
    return commands[cmdname], cmdargs, opts

def main():
    cmd, args, opts = parse_opts()
    try:
        cmd(args, opts)
    except IndexError:
        print(cmd.__doc__)
    except JsonRpcError as e:
        print(str(e))
        if e.data:
            print("Server Traceback below:")
            print(e.data)

if __name__ == '__main__':
    main()
In the example command above, I got this:
log_count/DEBUG 5
scheduler/dequeued 4
log_count/INFO 11
downloader/response_count 3
downloader/response_status_count/200 3
log_count/WARNING 3
scheduler/enqueued/memory 4
downloader/response_bytes 870
start_time 2016-09-27 11:16:25
scheduler/dequeued/memory 4
scheduler/enqueued 4
downloader/request_bytes 862
response_received_count 3
downloader/request_method_count/GET 4
downloader/request_count 4
User has_many Plans. I'm trying to find the IDs of all Users that do NOT have a Plan with status of "canceled". I'd love to know what explains the behavior below.
For context, what should be returned is this:
User.select { |u| u.plans.select { |p| p.status != "canceled" }.count > 0 }.map(&:id)
# => [27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61, 62, 63]
Here's what I'm getting:
# statement 1
User.joins(:plans).where.not("plans.status" => "canceled").map(&:id)
# User Load (0.3ms) SELECT "users".* FROM "users" INNER JOIN "plans" ON "plans"."user_id" = "users"."id" WHERE ("plans"."status" != 'canceled')
# => [44]
# statement 2
User.joins(:plans).where("plans.status != ?", "canceled").map(&:id)
# User Load (0.3ms) SELECT "users".* FROM "users" INNER JOIN "plans" ON "plans"."user_id" = "users"."id" WHERE (plans.status != 'canceled')
# => [44]
# statement 3
User.joins(:plans).where("plans.status == ?", nil).map(&:id)
# User Load (0.3ms) SELECT "users".* FROM "users" INNER JOIN "plans" ON "plans"."user_id" = "users"."id" WHERE (plans.status == NULL)
# => []
# statement 4
User.joins(:plans).where("plans.status" => nil).map(&:id)
# User Load (0.7ms) SELECT "users".* FROM "users" INNER JOIN "plans" ON "plans"."user_id" = "users"."id" WHERE "plans"."status" IS NULL
# => [27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 41, 44, 42, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 61, 60, 62, 63]
Questions:
Why are statements 3 and 4 not returning the same result?
Why are statements 1 and 2 (fortunately these are equivalent and return the same result) not returning the same result as statement 4? For context, I'd rather not search on nil but on "canceled", and I can confirm that every plan has a status of either nil or "canceled".
UPDATE PER REQUEST
# plan with nil status
Plan.where(status: nil).first
# => <Plan id: 1, zipcode: "94282", selected_plan: 1, meal_type: "Chef's choice (mixed)", most_favorite: "", least_favorite: "", allergies: "", start_date: "2015-05-27 00:00:00", delivery_address: "d", delivery_instructions: "", phone1: "10", phone2: "222", phone3: "2222", agree_tos: true, user_id: 20, created_at: "2015-05-24 05:18:40", updated_at: "2015-06-21 04:54:31", stripe_subscription_id: nil, stripe_invoice_number: nil, cancel_reason: nil, cancel_reason_other: nil, nps: nil, nps_open: nil, cancel_open: nil, status: nil, referred_by_code: nil>
# plan with canceled status
Plan.where(status: "canceled").first
# => <Plan id: 20, zipcode: "12345", selected_plan: 5, meal_type: "Meat (with veggies)", most_favorite: "", least_favorite: "", allergies: "", start_date: "2015-06-08 00:00:00", delivery_address: "asdf", delivery_instructions: "", phone1: "333", phone2: "333", phone3: "3333", agree_tos: true, user_id: 38, created_at: "2015-06-01 21:39:54", updated_at: "2015-06-23 06:23:10", stripe_subscription_id: "sub_6OKkJoNx2u8ZXZ", stripe_invoice_number: 0, cancel_reason: nil, cancel_reason_other: "", nps: 6, nps_open: "", cancel_open: "", status: "canceled", referred_by_code: nil>
Answer to Question 1:
With string conditions, the arguments that follow are substituted into the SQL in place of the ? placeholders; they are not "compared to" in Ruby. Statement 3 therefore produces the SQL plans.status == NULL, and in SQL any comparison against NULL with = (or ==) evaluates to NULL rather than true, so it never matches a row. The hash form in statement 4 is translated by Rails into plans.status IS NULL, which is the correct way to test for NULL. To get the same result with a string condition, use IS NULL explicitly:
User.joins(:plans).where("plans.status IS NULL").map(&:id)