My Redis server is running on Ubuntu 16.04, and I have RQ Dashboard running to monitor the queue. The Redis server has a password, which I supply for the initial connection. Here's my code:
from rq import Queue, Connection, Worker
from redis import Redis
from dblogger import DbLogger

def _redisCon():
    redis_host = "192.168.1.169"
    redis_port = "6379"
    redis_password = "SecretPassword"
    return Redis(host=redis_host, port=redis_port, password=redis_password)

rcon = _redisCon()
if rcon is not None:
    with Connection(rcon):
        DbLogger.log("rqworker", 0, "Launching Worker", "launching an RQ Worker - default Queue")
        worker = Worker(list(map(Queue, ['default'])))  # this works - I see the worker registered in RQ Dashboard
        worker.work()  # this eventually fails with the connection error:
"""
16:28:49 RQ worker 'rq:worker:steve-imac.95379' started, version 0.12.0
16:28:49 *** Listening on default...
16:28:49 Cleaning registries for queue: default
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/redis/connection.py", line 177, in _read_from_socket
    raise socket.error(SERVER_CLOSED_CONNECTION_ERROR)
OSError: Connection closed by server.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/redis/client.py", line 668, in execute_command
    return self.parse_response(connection, command_name, **options)
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/redis/client.py", line 680, in parse_response
    response = connection.read_response()
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/redis/connection.py", line 624, in read_response
    response = self._parser.read_response()
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/redis/connection.py", line 284, in read_response
    response = self._buffer.readline()
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/redis/connection.py", line 216, in readline
    self._read_from_socket()
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/redis/connection.py", line 191, in _read_from_socket
    (e.args,))
redis.exceptions.ConnectionError: Error while reading from socket: ('Connection closed by server.',)

  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/redis/connection.py", line 489, in connect
    raise ConnectionError(self._error_message(e))
redis.exceptions.ConnectionError: Error 61 connecting to 192.168.1.169:6379. Connection refused.
"""
I've tried removing the password and enabling the unixsocket in redis.conf; neither seemed to help. This looks like some sort of timeout, since in other testing the worker actually loads a task and executes it before eventually dying with this error.
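One mitigation worth trying for idle workers being disconnected is enabling TCP keepalive, and on newer clients a periodic health check, on the Redis connection. A minimal sketch, reusing the host/port/password above; the values are illustrative, and health_check_interval requires redis-py 3.3 or later:

from redis import Redis

def _redisCon():
    # socket_keepalive asks the OS to keep the idle TCP connection alive;
    # health_check_interval (redis-py >= 3.3) pings the server when the
    # connection has been idle for the given number of seconds.
    return Redis(
        host="192.168.1.169",
        port=6379,
        password="SecretPassword",
        socket_keepalive=True,
        health_check_interval=30,
    )

It may also be worth checking the timeout directive in redis.conf on the server: a non-zero value makes Redis drop idle client connections, which would match the "Connection closed by server." message above.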
I am using Celery with SQS as a broker, and I am trying to renew my credentials ("AWS_ACCESS_KEY_ID" and "AWS_SECRET_ACCESS_KEY") before they expire. The first time I run the task it succeeds, but after 15 minutes it expires even though the credentials have been renewed. The function that updates the credentials is as follows:
import os

import boto3
from celery import Celery
from kombu.utils.url import safequote

def update_aws_credentials():
    # Assume the execution role; the resulting STS token lives for 900 seconds.
    role_info = {
        'RoleArn': f"arn:aws:iam::{os.environ['AWS_ACCOUNT_NUMER']}:role/my_role_execution",
        'RoleSessionName': 'roleExecution',
        'DurationSeconds': 900
    }
    sts_client = boto3.client('sts', region_name='eu-central-1')
    credentials = sts_client.assume_role(**role_info)

    aws_access_key_id = credentials["Credentials"]['AccessKeyId']
    aws_secret_access_key = credentials["Credentials"]['SecretAccessKey']
    aws_session_token = credentials["Credentials"]["SessionToken"]

    os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
    os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key
    os.environ["AWS_DEFAULT_REGION"] = 'eu-central-1'
    os.environ["AWS_SESSION_TOKEN"] = aws_session_token
    return aws_access_key_id, aws_secret_access_key

def get_celery(aws_access_key_id, aws_secret_access_key):
    broker = f"sqs://{safequote(aws_access_key_id)}:{safequote(aws_secret_access_key)}@"
    backend = 'redis://redis-service:6379/0'
    celery = Celery("my_task", broker=broker, backend=backend)
    celery.conf["broker_transport_options"] = {
        'polling_interval': 30,
        'region': 'eu-central-1',
        'predefined_queues': {
            "my_queue": {
                'url': f"https://sqs.eu-central-1.amazonaws.com/{os.environ['AWS_ACCOUNT_NUMER']}/my_queue"
            }
        }
    }
    celery.conf["task_default_queue"] = "my_queue"
    return celery

def refresh_sqs_credentials():
    access, secret = update_aws_credentials()
    return get_celery(access, secret)
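Since get_celery bakes the credentials into the broker URL when the app is constructed, the app itself has to be rebuilt before the 900-second STS token ages out; refreshing the environment variables alone does not change an already-created app. A hedged sketch of one way to do that, reusing refresh_sqs_credentials above (the 780-second margin is an assumption, not from the original code):

import time

_celery_app = None
_refreshed_at = 0.0

def get_fresh_celery():
    # Rebuild the Celery app well before DurationSeconds=900 elapses
    # (the 780 s margin is illustrative).
    global _celery_app, _refreshed_at
    if _celery_app is None or time.time() - _refreshed_at > 780:
        _celery_app = refresh_sqs_credentials()
        _refreshed_at = time.time()
    return _celery_app

Note too that the sqs:// URL carries only the key id and secret; temporary STS credentials also need their session token, so it is worth verifying that the transport actually picks AWS_SESSION_TOKEN up from the environment.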
Running refresh_sqs_credentials, new credentials are created:
celery = worker.refresh_sqs_credentials()
And then I run my task with celery:
task = celery.send_task('my_task.code_of_my_task', args=[content], task_id=task_id)
All tasks that I run within the first 15 minutes finish successfully, but after 15 minutes I get the following error:
[2021-12-14 14:08:15,637] ERROR in app: Exception on /tasks/run [POST]
Traceback (most recent call last):
  File "/api/app.py", line 87, in post
    task = celery.send_task('glgt_ap35080_dev_sqs_runalgo.allocation_alg_task', args=[content], task_id=task_id)
  File "/usr/local/lib/python3.6/site-packages/celery/app/base.py", line 717, in send_task
    amqp.send_task_message(P, name, message, **options)
  File "/usr/local/lib/python3.6/site-packages/celery/app/amqp.py", line 547, in send_task_message
    **properties
  File "/usr/local/lib/python3.6/site-packages/kombu/messaging.py", line 178, in publish
    exchange_name, declare,
  File "/usr/local/lib/python3.6/site-packages/kombu/connection.py", line 525, in _ensured
    return fun(*args, **kwargs)
  File "/usr/local/lib/python3.6/site-packages/kombu/messaging.py", line 200, in _publish
    mandatory=mandatory, immediate=immediate,
  File "/usr/local/lib/python3.6/site-packages/kombu/transport/virtual/base.py", line 605, in basic_publish
    return self._put(routing_key, message, **kwargs)
  File "/usr/local/lib/python3.6/site-packages/kombu/transport/SQS.py", line 294, in _put
    c.send_message(**kwargs)
  File "/usr/local/lib/python3.6/site-packages/botocore/client.py", line 337, in _api_call
  File "/usr/local/lib/python3.6/site-packages/botocore/client.py", line 656, in _make_api_call
    raise error_class(parsed_response, operation_name)
botocore.exceptions.ClientError: An error occurred (ExpiredToken) when calling the SendMessage operation: The security token included in the request is expired

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.6/site-packages/flask/app.py", line 1813, in full_dispatch_request
    rv = self.dispatch_request()
  File "/usr/local/lib/python3.6/site-packages/flask/app.py", line 1799, in dispatch_request
    return self.view_functions[rule.endpoint](**req.view_args)
  File "/usr/local/lib/python3.6/site-packages/flask_restplus/api.py", line 325, in wrapper
    resp = resource(*args, **kwargs)
  File "/usr/local/lib/python3.6/site-packages/flask/views.py", line 88, in view
    return self.dispatch_request(*args, **kwargs)
  File "/usr/local/lib/python3.6/site-packages/flask_restplus/resource.py", line 44, in dispatch_request
    resp = meth(*args, **kwargs)
  File "/api/app.py", line 90, in post
    abort(500)
  File "/usr/local/lib/python3.6/site-packages/werkzeug/exceptions.py", line 774, in abort
    return _aborter(status, *args, **kwargs)
  File "/usr/local/lib/python3.6/site-packages/werkzeug/exceptions.py", line 755, in __call__
    raise self.mapping[code](*args, **kwargs)
werkzeug.exceptions.InternalServerError: 500 Internal Server Error: The server encountered an internal error and was unable to complete your request. Either the server is overloaded or there is an error in the application.
10.142.95.217 - - [14/Dec/2021 14:08:15] "POST /tasks/run HTTP/1.1" 500 -
I'm storing the credentials in environment variables; I don't understand why they expire after 15 minutes. Can someone help me, please?
The versions of the packages used are:
boto3==1.14.54
celery==5.0.0
kombu==5.0.2
pycurl==7.43.0.6
Thank you
I'm running into a problem with tensorflow-data-validation using the direct runner to generate statistics from some large datasets (over 400 GB).
All workers stopped after the error message "Keepalive watchdog fired. Closing transport.", which appears to be a gRPC keepalive timeout.
E0804 17:49:07.419950276 44806 chttp2_transport.cc:2881] ipv6:[::1]:40823: Keepalive watchdog fired. Closing transport.
2020-08-04 17:49:07 local_job_service.py : INFO Worker: severity: ERROR timestamp { seconds: 1596563347 nanos: 420487403 } message: "Python sdk harness failed: \nTraceback (most recent call last):\n File \"/home/ec2-user/lib64/python3.7/site-packages/apache_beam/runners/worker/sdk_worker_main.py\", line 158, in main\n sdk_pipeline_options.view_as(ProfilingOptions))).run()\n File \"/home/ec2-user/lib64/python3.7/site-packages/apache_beam/runners/worker/sdk_worker.py\", line 213, in run\n for work_request in self._control_stub.Control(get_responses()):\n File \"/home/ec2-user/lib64/python3.7/site-packages/grpc/_channel.py\", line 416, in __next__\n return self._next()\n File \"/home/ec2-user/lib64/python3.7/site-packages/grpc/_channel.py\", line 706, in _next\n raise self\ngrpc._channel._MultiThreadedRendezvous: <_MultiThreadedRendezvous of RPC that terminated with:\n\tstatus = StatusCode.UNAVAILABLE\n\tdetails = \"keepalive watchdog timeout\"\n\tdebug_error_string = \"{\"created\":\"#1596563347.420024732\",\"description\":\"Error received from peer ipv6:[::1]:40823\",\"file\":\"src/core/lib/surface/call.cc\",\"file_line\":1055,\"grpc_message\":\"keepalive watchdog timeout\",\"grpc_status\":14}\"\n>" trace: "Traceback (most recent call last):\n File \"/home/ec2-user/lib64/python3.7/site-packages/apache_beam/runners/worker/sdk_worker_main.py\", line 158, in main\n sdk_pipeline_options.view_as(ProfilingOptions))).run()\n File \"/home/ec2-user/lib64/python3.7/site-packages/apache_beam/runners/worker/sdk_worker.py\", line 213, in run\n for work_request in self._control_stub.Control(get_responses()):\n File \"/home/ec2-user/lib64/python3.7/site-packages/grpc/_channel.py\", line 416, in __next__\n return self._next()\n File \"/home/ec2-user/lib64/python3.7/site-packages/grpc/_channel.py\", line 706, in _next\n raise self\ngrpc._channel._MultiThreadedRendezvous: <_MultiThreadedRendezvous of RPC that terminated with:\n\tstatus = StatusCode.UNAVAILABLE\n\tdetails = \"keepalive watchdog timeout\"\n\tdebug_error_string = \"{\"created\":\"#1596563347.420024732\",\"description\":\"Error received from peer ipv6:[::1]:40823\",\"file\":\"src/core/lib/surface/call.cc\",\"file_line\":1055,\"grpc_message\":\"keepalive watchdog timeout\",\"grpc_status\":14}\"\n>\n" log_location: "/home/ec2-user/lib64/python3.7/site-packages/apache_beam/runners/worker/sdk_worker_main.py:161" thread: "MainThread"
Traceback (most recent call last):
  File "/usr/lib64/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib64/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/ec2-user/lib64/python3.7/site-packages/apache_beam/runners/worker/sdk_worker_main.py", line 248, in <module>
    main(sys.argv)
  File "/home/ec2-user/lib64/python3.7/site-packages/apache_beam/runners/worker/sdk_worker_main.py", line 158, in main
    sdk_pipeline_options.view_as(ProfilingOptions))).run()
  File "/home/ec2-user/lib64/python3.7/site-packages/apache_beam/runners/worker/sdk_worker.py", line 213, in run
    for work_request in self._control_stub.Control(get_responses()):
  File "/home/ec2-user/lib64/python3.7/site-packages/grpc/_channel.py", line 416, in __next__
    return self._next()
  File "/home/ec2-user/lib64/python3.7/site-packages/grpc/_channel.py", line 706, in _next
    raise self
grpc._channel._MultiThreadedRendezvous: <_MultiThreadedRendezvous of RPC that terminated with:
	status = StatusCode.UNAVAILABLE
	details = "keepalive watchdog timeout"
	debug_error_string = "{"created":"@1596563347.420024732","description":"Error received from peer ipv6:[::1]:40823","file":"src/core/lib/surface/call.cc","file_line":1055,"grpc_message":"keepalive watchdog timeout","grpc_status":14}"
>
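For context, this is roughly how such a statistics job is launched with the direct runner; the paths below are placeholders, and switching the direct runner to multi-processing with an explicit worker count is one commonly suggested knob when a single overloaded harness process starts missing gRPC keepalives:

import tensorflow_data_validation as tfdv
from apache_beam.options.pipeline_options import PipelineOptions

# Hypothetical paths; running mode and worker count are illustrative.
options = PipelineOptions([
    '--direct_running_mode=multi_processing',
    '--direct_num_workers=8',
])

stats = tfdv.generate_statistics_from_tfrecord(
    data_location='/data/examples-*.tfrecord',
    output_path='/data/stats.tfrecord',
    pipeline_options=options,
)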
I tried:
- configuring the S3 connection from the values.yaml:
connections:
  - id: aws_default
    type: aws
    login: xxxaws_access_key_idxxx
    password: xxxxxxxxxxxxxaws_secret_access_keyxxxxxxxxxxxxxxxxxx
  - id: my_s3
    type: s3
    login: xxxaws_access_key_idxxx
    password: xxxxxxxxxxxxxaws_secret_access_keyxxxxxxxxxxxxxxxxxx
- writing a DAG that uses the S3Hook to write a string to S3 (a sketch of such a DAG follows the args below).
- running a test from the scheduler pod:
/entrypoint airflow test dag_id task_id date_before_the_start_date_of_DAG
The file is created and its content is OK. But when I activate the DAG from the Airflow UI and run it, it is queued and then fails.
Any suggestions?
BTW, the DAG args:
args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(0),
    'trigger_rule': 'dummy',
    #'pool': 'my_workers_pool',
    'catchup': False,  # don't backfill
}
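For reference, a minimal sketch of the kind of DAG being described, assuming Airflow 1.10-style imports, the args above, and the my_s3 connection from values.yaml (the dag id, task id, bucket, and key are placeholders):

from airflow import DAG
from airflow.hooks.S3_hook import S3Hook
from airflow.operators.python_operator import PythonOperator

def upload_string():
    # Writes a test string through the my_s3 connection defined in values.yaml.
    hook = S3Hook(aws_conn_id='my_s3')
    hook.load_string('hello from airflow', key='test/hello.txt',
                     bucket_name='my-bucket', replace=True)

with DAG(dag_id='s3_write_example', default_args=args, schedule_interval=None) as dag:
    PythonOperator(task_id='write_to_s3', python_callable=upload_string)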
In addition, I added a 5-minute sleep to the task that causes the DAG to fail and watched pod creation with kubectl, but the task pod started, ran for a few seconds, and disappeared. Any ideas how to debug this issue?
Logs from the task pod:

kubectl logs postgressexamplesababaegozimpostgress-7322d44cb2684a09bef95ad3080b9505 -n airflow-research-p-8482f --tail=200
[2020-08-24 12:04:44,262] {{settings.py:252}} INFO - settings.configure_orm(): Using pool settings. pool_size=5, max_overflow=10, pool_recycle=1800, pid=1
/usr/local/lib/python3.7/site-packages/psycopg2/__init__.py:144: UserWarning: The psycopg2 wheel package will be renamed from release 2.8; in order to keep installing from binary please use "pip install psycopg2-binary" instead. For details see: http://initd.org/psycopg/docs/install.html#binary-install-from-pypi.
  """)
[2020-08-24 12:04:44,833] {{__init__.py:51}} INFO - Using executor LocalExecutor
[2020-08-24 12:04:44,834] {{dagbag.py:92}} INFO - Filling up the DagBag from /usr/local/airflow/dags/postgress_example.py
[2020-08-24 12:04:44,841] {{dagbag.py:207}} ERROR - Failed to import: /usr/local/airflow/dags/postgress_example.py
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/airflow/models/dagbag.py", line 204, in process_file
    m = imp.load_source(mod_name, filepath)
  File "/usr/local/lib/python3.7/imp.py", line 171, in load_source
    module = _load(spec)
  File "<frozen importlib._bootstrap>", line 696, in _load
  File "<frozen importlib._bootstrap>", line 677, in _load_unlocked
  File "<frozen importlib._bootstrap>", line 728, in exec_module
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "/usr/local/airflow/dags/postgress_example.py", line 33, in <module>
    import boto3
ModuleNotFoundError: No module named 'boto3'
Traceback (most recent call last):
  File "/usr/local/bin/airflow", line 37, in <module>
    args.func(args)
  File "/usr/local/lib/python3.7/site-packages/airflow/utils/cli.py", line 74, in wrapper
    return f(*args, **kwargs)
  File "/usr/local/lib/python3.7/site-packages/airflow/bin/cli.py", line 529, in run
    dag = get_dag(args)
  File "/usr/local/lib/python3.7/site-packages/airflow/bin/cli.py", line 148, in get_dag
    'parse.'.format(args.dag_id))
airflow.exceptions.AirflowException: dag_id could not be found: postgress_example. Either the dag did not exist or it failed to parse.
s3fs seems to fail from time to time when reading from an S3 bucket using an AWS Lambda function inside a VPC. I am using s3fs==0.4.0 and pandas==1.0.1.
import s3fs
import pandas as pd

def lambda_handler(event, context):
    bucket = event['Records'][0]['s3']['bucket']['name']
    s3_file = event['Records'][0]['s3']['object']['key']

    s3fs.S3FileSystem.connect_timeout = 1800
    s3fs.S3FileSystem.read_timeout = 1800

    with s3fs.S3FileSystem(anon=False).open(f"s3://{bucket}/{s3_file}", 'rb') as f:
        data = pd.read_json(f)
    return data
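As an aside, the timeouts can be passed per instance through botocore's Config instead of patching class attributes; a sketch, relying on s3fs forwarding config_kwargs to botocore.client.Config (worth confirming for your s3fs version):

import s3fs

# The retry policy here is illustrative, not from the original snippet.
fs = s3fs.S3FileSystem(
    anon=False,
    config_kwargs={
        'connect_timeout': 1800,
        'read_timeout': 1800,
        'retries': {'max_attempts': 3},
    },
)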
The stacktrace is the following:
Traceback (most recent call last):
File "/var/task/urllib3/connection.py", line 157, in _new_conn
(self._dns_host, self.port), self.timeout, **extra_kw
File "/var/task/urllib3/util/connection.py", line 84, in create_connection
raise err
File "/var/task/urllib3/util/connection.py", line 74, in create_connection
sock.connect(sa)
TimeoutError: [Errno 110] Connection timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/var/task/botocore/httpsession.py", line 263, in send
chunked=self._chunked(request.headers),
File "/var/task/urllib3/connectionpool.py", line 720, in urlopen
method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
File "/var/task/urllib3/util/retry.py", line 376, in increment
raise six.reraise(type(error), error, _stacktrace)
File "/var/task/urllib3/packages/six.py", line 735, in reraise
raise value
File "/var/task/urllib3/connectionpool.py", line 672, in urlopen
chunked=chunked,
File "/var/task/urllib3/connectionpool.py", line 376, in _make_request
self._validate_conn(conn)
File "/var/task/urllib3/connectionpool.py", line 994, in _validate_conn
conn.connect()
File "/var/task/urllib3/connection.py", line 300, in connect
conn = self._new_conn()
File "/var/task/urllib3/connection.py", line 169, in _new_conn
self, "Failed to establish a new connection: %s" % e
urllib3.exceptions.NewConnectionError: <botocore.awsrequest.AWSHTTPSConnection object at 0x7f4d578e3ed0>: Failed to establish a new connection: [Errno 110] Connection timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/var/task/botocore/endpoint.py", line 200, in _do_get_response
http_response = self._send(request)
File "/var/task/botocore/endpoint.py", line 244, in _send
return self.http_session.send(request)
File "/var/task/botocore/httpsession.py", line 283, in send
raise EndpointConnectionError(endpoint_url=request.url, error=e)
botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "https://my_bucket.s3.eu-west-1.amazonaws.com/?list-type=2&prefix=my_folder%2Fsomething%2F&delimiter=%2F&encoding-type=url"
Has anyone faced this same issue? Why would it fail only sometimes? Is there an s3fs configuration that could help with this specific issue?
Actually, there was no problem at all with s3fs. It turned out we were using a Lambda function with two subnets within the VPC: one worked normally, but the other wasn't allowed to access S3 resources, so whenever a Lambda instance was spawned in the second subnet it couldn't connect at all.
Fixing this was as easy as removing the second subnet.
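More generally, a Lambda function inside a VPC has no route to S3 unless its subnet provides one, typically a NAT gateway or an S3 gateway endpoint attached to the subnet's route table. A sketch of creating a gateway endpoint with boto3; every id below is a placeholder:

import boto3

# Region matches the bucket endpoint in the traceback; ids are placeholders.
ec2 = boto3.client('ec2', region_name='eu-west-1')
ec2.create_vpc_endpoint(
    VpcId='vpc-0123456789abcdef0',
    ServiceName='com.amazonaws.eu-west-1.s3',
    RouteTableIds=['rtb-0123456789abcdef0'],
)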
You could also use boto3, which is supported by AWS, to get the JSON from S3:
import json
import boto3

def lambda_handler(event, context):
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = event['Records'][0]['s3']['object']['key']

    s3_resource = boto3.resource('s3')
    file_object = s3_resource.Object(bucket, key)
    json_content = json.loads(file_object.get()['Body'].read())
    return json_content
I have tried two solutions:
1)
REDIS_HOST = '111.111.111.111'
REDIS_PORT = 12000
REDIS_PASSWORD = 'aaaaaaaa'
but it raises:
2017-11-23 15:03:13 [twisted] CRITICAL:
Traceback (most recent call last):
  File "/home/yuyanggo/.local/lib/python3.6/site-packages/twisted/internet/defer.py", line 1386, in _inlineCallbacks
    result = g.send(result)
  File "/home/yuyanggo/.local/lib/python3.6/site-packages/scrapy/crawler.py", line 79, in crawl
    yield self.engine.open_spider(self.spider, start_requests)
redis.exceptions.ResponseError: NOAUTH Authentication required.
2)
REDIS_URL = 'redis://:aaaaaaaa@111.111.111.111:12000/0'
but I find that the Redis data is saved to localhost, not to the remote Redis server.
Try putting the following in your settings.py file:
REDIS_URL = 'redis://:{psw}@{host}:{port}'.format(
    host='xx.xx.xx.xx',  # your server IP
    port='xxx',
    psw='xxxx',
)
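To confirm the resulting URL is well-formed and actually reaches the remote server before wiring it into scrapy-redis, a quick check with redis-py (credentials are placeholders):

import redis

# Mirror whatever ends up in REDIS_URL.
r = redis.Redis.from_url('redis://:xxxx@xx.xx.xx.xx:6379/0')
print(r.ping())  # True means auth and connectivity are OK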