Catch Scrapy exception when crawling from Airflow - scrapy

I'm trying to catch the exception that occurs in my spider so that I can mark the task instance as failed. Currently the task finishes and is marked as succeeded. I'm calling crawl() from a PythonOperator in Airflow, as follows:
with DAG(
        'MySpider',
        default_args=default_args,
        schedule_interval=None) as dag:

    t1 = python_task = PythonOperator(
        task_id="crawler_task",
        python_callable=run_crawler,
        op_kwargs=dag_kwargs
    )
Here is my run_crawler() method:
def run_crawler(**kwargs):
    project_settings = set_project_settings({
        'FEEDS': {
            f'{kwargs["bucket"]}%(time)s.{kwargs["format"]}': {
                'format': kwargs["format"],
                'encoding': 'utf8',
                'store_empty': kwargs["store_empty"]
            }
        }
    })
    print("Project settings: ")
    pprint(project_settings.attributes.items())
    set_connection("airflow", kwargs["gcs_connection_id"])
    process = CrawlerProcess(project_settings)
    process.crawl(spider.MySpider)
    print("Starting crawler...")
    process.start()
When running, I'm having problems with GCS credentials, which leads to an exception, as follows:
google.auth.exceptions.DefaultCredentialsError: The file /tmp/file_my_credentials.json does not have a valid type. Type is None, expected one of ('authorized_user', 'service_account', 'external_account', 'external_account_authorized_user', 'impersonated_service_account', 'gdch_service_account').
{logging_mixin.py:115} WARNING - [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 21087,
'downloader/request_count': 68,
'downloader/request_method_count/GET': 68,
'downloader/response_bytes': 1863876,
'downloader/response_count': 68,
'downloader/response_status_count/200': 68,
'elapsed_time_seconds': 25.647386,
'feedexport/failed_count/GCSFeedStorage': 1,
'httpcompression/response_bytes': 9212776,
'httpcompression/response_count': 68,
'item_scraped_count': 66,
'log_count/DEBUG': 136,
'log_count/ERROR': 1,
'log_count/INFO': 10,
'log_count/WARNING': 3,
'memusage/max': 264441856,
'memusage/startup': 264441856,
'request_depth_max': 1,
'response_received_count': 68,
'scheduler/dequeued': 68,
'scheduler/dequeued/memory': 68,
'scheduler/enqueued': 68,
'scheduler/enqueued/memory': 68,
[2032-13-13, 09:04:28 UTC] {engine.py:389} INFO - Spider closed (finished)
[2032-13-13, 09:04:28 UTC] {logging_mixin.py:115} WARNING -
[scrapy.core.engine] INFO: Spider closed (finished)
[2032-13-13, 09:04:28 UTC] {python.py:173} INFO - Done. Returned value was: None
[2032-13-13, 09:04:28 UTC] {taskinstance.py:1408} INFO - Marking task as SUCCESS. dag_id=MySpider, task_id=crawler_task, execution_date=2032-13-13, start_date=2032-13-13, end_date=2032-13-13
[2032-13-13, 09:04:28 UTC] {local_task_job.py:156} INFO - Task exited with return code 0
[2032-13-13, 09:04:28 UTC] {local_task_job.py:279} INFO - 0 downstream tasks scheduled from follow-on schedule check
As you can see, even with this exception, the task itself is marked as "SUCCESS". Is it possible to catch the exception and mark the task as FAILED, so we can follow it in the Airflow (Composer) interface?
Thank you

I don't understand why the exception doesn't break the task in this case.
You can add a try/except in the run_crawler method and then raise your own exception in the except block:
import logging

def run_crawler(**kwargs):
    class CustomException(Exception):
        pass

    try:
        project_settings = set_project_settings({
            'FEEDS': {
                f'{kwargs["bucket"]}%(time)s.{kwargs["format"]}': {
                    'format': kwargs["format"],
                    'encoding': 'utf8',
                    'store_empty': kwargs["store_empty"]
                }
            }
        })
        print("Project settings: ")
        pprint(project_settings.attributes.items())
        set_connection("airflow", kwargs["gcs_connection_id"])
        process = CrawlerProcess(project_settings)
        process.crawl(spider.MySpider)
        print("Starting crawler...")
        process.start()
    except Exception as err:
        logging.error("Error in the Airflow task: %s", err)
        raise CustomException("Error in the Airflow task") from err
When your custom exception is raised, it will break the Airflow task and mark it as failed.
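Note that a try/except around process.start() may not be enough on its own: Scrapy often logs errors such as the failed feed export without letting an exception escape the reactor, so process.start() can return normally. A minimal sketch of an alternative, reusing the same run_crawler setup as above: inspect the crawler stats after the run and raise if anything went wrong, so Airflow marks the task as failed. The exact stats keys checked here are an assumption based on the log above.

from scrapy.crawler import CrawlerProcess

def run_crawler(**kwargs):
    # ... same settings/connection setup as above ...
    process = CrawlerProcess(project_settings)
    # create_crawler() gives us a handle to the crawler so we can read
    # its stats after the run finishes.
    crawler = process.create_crawler(spider.MySpider)
    process.crawl(crawler)
    process.start()
    stats = crawler.stats.get_stats()
    # Keys taken from the stats dump above (assumption: these are the
    # failure signals you care about).
    if stats.get("log_count/ERROR", 0) or stats.get("feedexport/failed_count/GCSFeedStorage", 0):
        raise RuntimeError(f"Crawl finished with errors: {stats}")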

Related

Scrapy Pagination - Works for 2 pages but not after that

I'm crawling the cdw.com website. For a given URL, there are around 17 pages. The script that I have written is able to fetch data from Page 1 and Page 2. The spider closes on its own after giving the results of the first 2 pages. Please let me know how I can fetch data for the remaining 15 pages.
TIA.
import scrapy
from cdwfinal.items import CdwfinalItem
from scrapy.selector import Selector
import datetime
import pandas as pd
import time

class CdwSpider(scrapy.Spider):
    name = 'cdw'
    allowed_domains = ['www.cdw.com']
    start_urls = ['http://www.cdw.com/']
    base_url = 'http://www.cdw.com'

    def start_requests(self):
        yield scrapy.Request(url='https://www.cdw.com/search/?key=axiom', callback=self.parse)

    def parse(self, response):
        item = []
        hxs = Selector(response)
        item = CdwfinalItem()
        abc = hxs.xpath('//*[@id="main"]//*[@class="grid-row"]')
        for i in range(len(abc)):
            try:
                item['mpn'] = hxs.xpath("//div[contains(@class,'search-results')]/div[contains(@class,'search-result')][" + str(i+1) + "]//*[@class='mfg-code']/text()").extract()
            except:
                item['mpn'] = 'NA'
            try:
                item['part_no'] = hxs.xpath("//div[contains(@class,'search-results')]/div[contains(@class,'search-result')][" + str(i+1) + "]//*[@class='cdw-code']/text()").extract()
            except:
                item['part_no'] = 'NA'
            yield item

        next_page = hxs.xpath('//*[@id="main"]//*[@class="no-hover" and @aria-label="Next Page"]').extract()
        if next_page:
            new_page_href = hxs.xpath('//*[@id="main"]//*[@class="no-hover" and @aria-label="Next Page"]/@href').extract_first()
            new_page_url = response.urljoin(new_page_href)
            yield scrapy.Request(new_page_url, callback=self.parse, meta={"searchword": '123'})
LOG:
2023-02-11 15:39:55 [scrapy_user_agents.middlewares] DEBUG: Assigned User-Agent Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36
2023-02-11 15:39:55 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.cdw.com/search/?key=axiom&pcurrent=3> (referer: https://www.cdw.com/search/?key=axiom&pcurrent=2) ['cached']
2023-02-11 15:39:55 [scrapy.core.engine] INFO: Closing spider (finished)
2023-02-11 15:39:55 [scrapy.extensions.feedexport] INFO: Stored csv feed (48 items) in: Test5.csv
2023-02-11 15:39:55 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 2178,
'downloader/request_count': 3,
'downloader/request_method_count/GET': 3,
'downloader/response_bytes': 68059,
'downloader/response_count': 3,
'downloader/response_status_count/200': 3,
'elapsed_time_seconds': 1.30903,
'feedexport/success_count/FileFeedStorage': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2023, 2, 11, 10, 9, 55, 327740),
'httpcache/hit': 3,
'httpcompression/response_bytes': 384267,
'httpcompression/response_count': 3,
'item_scraped_count': 48,
'log_count/DEBUG': 62,
'log_count/INFO': 11,
'log_count/WARNING': 45,
'request_depth_max': 2,
'response_received_count': 3,
'scheduler/dequeued': 3,
'scheduler/dequeued/memory': 3,
'scheduler/enqueued': 3,
'scheduler/enqueued/memory': 3,
'start_time': datetime.datetime(2023, 2, 11, 10, 9, 54, 18710)}
Your next_page selector is failing to extract the information for the next page. In general your selectors are more complicated than they need to be; for example, you should be using relative XPath expressions in your for loop.
Here is an example that replicates the same behaviour as your spider except using much simpler selectors, and successfully extracts the results from all of the pages.
import scrapy

class CdwSpider(scrapy.Spider):
    name = 'cdw'
    allowed_domains = ['www.cdw.com']
    custom_settings = {
        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
    }

    def start_requests(self):
        yield scrapy.Request(url='https://www.cdw.com/search/?key=axiom', callback=self.parse)

    def parse(self, response):
        for row in response.xpath('//div[@class="grid-row"]'):
            mpn = row.xpath(".//span[@class='mfg-code']/text()").get()
            cdw = row.xpath('.//span[@class="cdw-code"]/text()').get()
            yield {"mpn": mpn, "part_no": cdw}

        current = response.css("div.search-pagination-active")
        next_page = current.xpath('./following-sibling::a/@href').get()
        if next_page:
            new_page_url = response.urljoin(next_page)
            yield scrapy.Request(new_page_url, callback=self.parse)
EDIT:
The only non-default setting I am using is the user agent. I have made adjustments in the example above to reflect that.
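To try this locally, run the spider from inside the Scrapy project (the output file name here is just an example; with a recent Scrapy, -O overwrites the file on each run while -o appends):
scrapy crawl cdw -O axiom.csv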
Partial output:
2023-02-11 22:10:58 [scrapy.core.engine] INFO: Closing spider (finished)
2023-02-11 22:10:58 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 106555,
'downloader/request_count': 41,
'downloader/request_method_count/GET': 41,
'downloader/response_bytes': 1099256,
'downloader/response_count': 41,
'downloader/response_status_count/200': 41,
'elapsed_time_seconds': 22.968986,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2023, 2, 12, 6, 10, 58, 700080),
'httpcompression/response_bytes': 7962149,
'httpcompression/response_count': 41,
'item_scraped_count': 984,
'log_count/DEBUG': 1028,
'log_count/INFO': 10,
'request_depth_max': 40,
'response_received_count': 41,
'scheduler/dequeued': 41,
'scheduler/dequeued/memory': 41,
'scheduler/enqueued': 41,
'scheduler/enqueued/memory': 41,
'start_time': datetime.datetime(2023, 2, 12, 6, 10, 35, 731094)}
2023-02-11 22:10:58 [scrapy.core.engine] INFO: Spider closed (finished)

Reading shared global dataframe in dask

I'm parallelizing a function over 32 cores and am having some trouble accessing a shared dataframe dask_paths. All the code works correctly when I get rid of the line (and the lines that depend on it) dask_paths[dask_paths['od'] == (o,d)].compute(). Incredibly, if I compute this for some fixed o, d outside of the distributed code, then use that result, I get what I want (for that o, d). This means it really is just the actual accessing of dask_paths that is failing in the parallel computation. I am using the logic given here for "embarrassingly parallelizable for loops" in the dask docs. Moreover, I used to use get_group on a global pd.DataFrame grouped for this logic, and that suffered from the same problem of global access (even though this runs serially in a couple of seconds, the parallel computation stalls before giving a cryptic error message, shown at the bottom). I don't know why this is.
Note that dask_paths is a Dask.dataframe. This is the most basic of logic in parallellizing with dask, so not sure why it's failing. I am working on a Vertex AI jupyter notebook on Google Cloud. There is no error trace, because the program simply stalls. All the data/dataframes have been defined in the global environment of the jupyter notebook in the cells above, and are working fine. The vertex AI notebook has 16 vCPUs and 100GB RAM and is run on Google Cloud's VM. There is no reading or writing to any files, so that's not the issue.
# dask_paths['od'] takes on values like (100, 150)
# popular takes the form [[20, 25], [100, 150], [67, 83], ...]
# and has 2000 elements; every element is a list of length 2
def main():
    def pop2unique(pop):
        df = dask_paths[dask_paths['od'] == (pop[0], pop[1])].compute()
        return df['last'].sum()

    lzs = []
    ncores = 32
    dask_client.cluster.scale(10)
    futures = dask_client.map(pop2unique, popular[:10])  # this stalls
    results = dask_client.gather(futures)
And dask_paths takes the form:
index      (o, d)      last
605096     (22, 25)    10
999336     (103, 88)   31
1304512    (101, 33)   9
1432383    (768, 21)   16
The client being used everywhere is given by
from dask.distributed import Client, progress
dask_client = Client(threads_per_worker=4, n_workers=8)
The error message I get is long and cryptic:
distributed.diskutils - INFO - Found stale lock file and directory '/home/jupyter/dask-worker-space/worker-9y17gy_r', purging
distributed.scheduler - WARNING - Worker tried to connect with a duplicate name: 33
distributed.scheduler - WARNING - Worker tried to connect with a duplicate name: 35
distributed.nanny - ERROR - Failed to start worker
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 884, in run
await worker
File "/opt/conda/lib/python3.7/site-packages/distributed/core.py", line 279, in _
await self.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1584, in start
await self._register_with_scheduler()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1299, in _register_with_scheduler
raise ValueError(f"Unexpected response from register: {response!r}")
ValueError: Unexpected response from register: {'status': 'error', 'message': 'name taken, 33', 'time': 1658860560.8544912}
distributed.scheduler - WARNING - Worker tried to connect with a duplicate name: 36
distributed.scheduler - WARNING - Worker tried to connect with a duplicate name: 31
distributed.nanny - ERROR - Failed to start worker
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 884, in run
await worker
File "/opt/conda/lib/python3.7/site-packages/distributed/core.py", line 279, in _
await self.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1584, in start
await self._register_with_scheduler()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1299, in _register_with_scheduler
raise ValueError(f"Unexpected response from register: {response!r}")
ValueError: Unexpected response from register: {'status': 'error', 'message': 'name taken, 35', 'time': 1658860560.8563268}
distributed.nanny - ERROR - Failed to start worker
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 884, in run
await worker
File "/opt/conda/lib/python3.7/site-packages/distributed/core.py", line 279, in _
await self.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1584, in start
await self._register_with_scheduler()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1299, in _register_with_scheduler
raise ValueError(f"Unexpected response from register: {response!r}")
ValueError: Unexpected response from register: {'status': 'error', 'message': 'name taken, 36', 'time': 1658860560.8576422}
distributed.scheduler - WARNING - Worker tried to connect with a duplicate name: 34
distributed.nanny - ERROR - Failed to start worker
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 884, in run
await worker
File "/opt/conda/lib/python3.7/site-packages/distributed/core.py", line 279, in _
await self.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1584, in start
await self._register_with_scheduler()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1299, in _register_with_scheduler
raise ValueError(f"Unexpected response from register: {response!r}")
ValueError: Unexpected response from register: {'status': 'error', 'message': 'name taken, 31', 'time': 1658860560.8595085}
distributed.scheduler - WARNING - Worker tried to connect with a duplicate name: 32
distributed.nanny - ERROR - Failed to start worker
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 884, in run
await worker
File "/opt/conda/lib/python3.7/site-packages/distributed/core.py", line 279, in _
await self.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1584, in start
await self._register_with_scheduler()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1299, in _register_with_scheduler
raise ValueError(f"Unexpected response from register: {response!r}")
ValueError: Unexpected response from register: {'status': 'error', 'message': 'name taken, 34', 'time': 1658860560.8609138}
distributed.nanny - ERROR - Failed to start worker
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 884, in run
await worker
File "/opt/conda/lib/python3.7/site-packages/distributed/core.py", line 279, in _
await self.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1584, in start
await self._register_with_scheduler()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1299, in _register_with_scheduler
raise ValueError(f"Unexpected response from register: {response!r}")
ValueError: Unexpected response from register: {'status': 'error', 'message': 'name taken, 32', 'time': 1658860560.862359}
distributed.nanny - ERROR - Failed while trying to start worker process: Unexpected response from register: {'status': 'error', 'message': 'name taken, 36', 'time': 1658860560.8576422}
distributed.nanny - ERROR - Failed while trying to start worker process: Unexpected response from register: {'status': 'error', 'message': 'name taken, 31', 'time': 1658860560.8595085}
distributed.nanny - ERROR - Failed while trying to start worker process: Unexpected response from register: {'status': 'error', 'message': 'name taken, 35', 'time': 1658860560.8563268}
distributed.nanny - ERROR - Failed while trying to start worker process: Unexpected response from register: {'status': 'error', 'message': 'name taken, 33', 'time': 1658860560.8544912}
distributed.nanny - ERROR - Failed while trying to start worker process: Unexpected response from register: {'status': 'error', 'message': 'name taken, 34', 'time': 1658860560.8609138}
distributed.nanny - ERROR - Failed while trying to start worker process: Unexpected response from register: {'status': 'error', 'message': 'name taken, 32', 'time': 1658860560.862359}
Task exception was never retrieved
future: <Task finished coro=<_wrap_awaitable() done, defined at /opt/conda/lib/python3.7/asyncio/tasks.py:623> exception=ValueError("Unexpected response from register: {'status': 'error', 'message': 'name taken, 32', 'time': 1658860560.862359}")>
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/asyncio/tasks.py", line 630, in _wrap_awaitable
return (yield from awaitable.__await__())
File "/opt/conda/lib/python3.7/site-packages/distributed/core.py", line 279, in _
await self.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 334, in start
response = await self.instantiate()
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 417, in instantiate
result = await self.process.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 694, in start
msg = await self._wait_until_connected(uid)
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 812, in _wait_until_connected
raise msg["exception"]
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 884, in run
await worker
File "/opt/conda/lib/python3.7/site-packages/distributed/core.py", line 279, in _
await self.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1584, in start
await self._register_with_scheduler()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1299, in _register_with_scheduler
raise ValueError(f"Unexpected response from register: {response!r}")
ValueError: Unexpected response from register: {'status': 'error', 'message': 'name taken, 32', 'time': 1658860560.862359}
Task exception was never retrieved
future: <Task finished coro=<_wrap_awaitable() done, defined at /opt/conda/lib/python3.7/asyncio/tasks.py:623> exception=ValueError("Unexpected response from register: {'status': 'error', 'message': 'name taken, 36', 'time': 1658860560.8576422}")>
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/asyncio/tasks.py", line 630, in _wrap_awaitable
return (yield from awaitable.__await__())
File "/opt/conda/lib/python3.7/site-packages/distributed/core.py", line 279, in _
await self.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 334, in start
response = await self.instantiate()
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 417, in instantiate
result = await self.process.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 694, in start
msg = await self._wait_until_connected(uid)
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 812, in _wait_until_connected
raise msg["exception"]
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 884, in run
await worker
File "/opt/conda/lib/python3.7/site-packages/distributed/core.py", line 279, in _
await self.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1584, in start
await self._register_with_scheduler()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1299, in _register_with_scheduler
raise ValueError(f"Unexpected response from register: {response!r}")
ValueError: Unexpected response from register: {'status': 'error', 'message': 'name taken, 36', 'time': 1658860560.8576422}
Task exception was never retrieved
future: <Task finished coro=<_wrap_awaitable() done, defined at /opt/conda/lib/python3.7/asyncio/tasks.py:623> exception=ValueError("Unexpected response from register: {'status': 'error', 'message': 'name taken, 35', 'time': 1658860560.8563268}")>
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/asyncio/tasks.py", line 630, in _wrap_awaitable
return (yield from awaitable.__await__())
File "/opt/conda/lib/python3.7/site-packages/distributed/core.py", line 279, in _
await self.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 334, in start
response = await self.instantiate()
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 417, in instantiate
result = await self.process.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 694, in start
msg = await self._wait_until_connected(uid)
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 812, in _wait_until_connected
raise msg["exception"]
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 884, in run
await worker
File "/opt/conda/lib/python3.7/site-packages/distributed/core.py", line 279, in _
await self.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1584, in start
await self._register_with_scheduler()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1299, in _register_with_scheduler
raise ValueError(f"Unexpected response from register: {response!r}")
ValueError: Unexpected response from register: {'status': 'error', 'message': 'name taken, 35', 'time': 1658860560.8563268}
Task exception was never retrieved
future: <Task finished coro=<_wrap_awaitable() done, defined at /opt/conda/lib/python3.7/asyncio/tasks.py:623> exception=ValueError("Unexpected response from register: {'status': 'error', 'message': 'name taken, 34', 'time': 1658860560.8609138}")>
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/asyncio/tasks.py", line 630, in _wrap_awaitable
return (yield from awaitable.__await__())
File "/opt/conda/lib/python3.7/site-packages/distributed/core.py", line 279, in _
await self.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 334, in start
response = await self.instantiate()
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 417, in instantiate
result = await self.process.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 694, in start
msg = await self._wait_until_connected(uid)
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 812, in _wait_until_connected
raise msg["exception"]
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 884, in run
await worker
File "/opt/conda/lib/python3.7/site-packages/distributed/core.py", line 279, in _
await self.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1584, in start
await self._register_with_scheduler()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1299, in _register_with_scheduler
raise ValueError(f"Unexpected response from register: {response!r}")
ValueError: Unexpected response from register: {'status': 'error', 'message': 'name taken, 34', 'time': 1658860560.8609138}
Task exception was never retrieved
future: <Task finished coro=<_wrap_awaitable() done, defined at /opt/conda/lib/python3.7/asyncio/tasks.py:623> exception=ValueError("Unexpected response from register: {'status': 'error', 'message': 'name taken, 33', 'time': 1658860560.8544912}")>
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/asyncio/tasks.py", line 630, in _wrap_awaitable
return (yield from awaitable.__await__())
File "/opt/conda/lib/python3.7/site-packages/distributed/core.py", line 279, in _
await self.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 334, in start
response = await self.instantiate()
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 417, in instantiate
result = await self.process.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 694, in start
msg = await self._wait_until_connected(uid)
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 812, in _wait_until_connected
raise msg["exception"]
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 884, in run
await worker
File "/opt/conda/lib/python3.7/site-packages/distributed/core.py", line 279, in _
await self.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1584, in start
await self._register_with_scheduler()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1299, in _register_with_scheduler
raise ValueError(f"Unexpected response from register: {response!r}")
ValueError: Unexpected response from register: {'status': 'error', 'message': 'name taken, 33', 'time': 1658860560.8544912}
Task exception was never retrieved
future: <Task finished coro=<_wrap_awaitable() done, defined at /opt/conda/lib/python3.7/asyncio/tasks.py:623> exception=ValueError("Unexpected response from register: {'status': 'error', 'message': 'name taken, 31', 'time': 1658860560.8595085}")>
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/asyncio/tasks.py", line 630, in _wrap_awaitable
return (yield from awaitable.__await__())
File "/opt/conda/lib/python3.7/site-packages/distributed/core.py", line 279, in _
await self.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 334, in start
response = await self.instantiate()
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 417, in instantiate
result = await self.process.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 694, in start
msg = await self._wait_until_connected(uid)
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 812, in _wait_until_connected
raise msg["exception"]
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 884, in run
await worker
File "/opt/conda/lib/python3.7/site-packages/distributed/core.py", line 279, in _
await self.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1584, in start
await self._register_with_scheduler()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1299, in _register_with_scheduler
raise ValueError(f"Unexpected response from register: {response!r}")
ValueError: Unexpected response from register: {'status': 'error', 'message': 'name taken, 31', 'time': 1658860560.8595085}
distributed.diskutils - INFO - Found stale lock file and directory '/home/jupyter/dask-worker-space/worker-xd5jxrin', purging
distributed.diskutils - INFO - Found stale lock file and directory '/home/jupyter/dask-worker-space/worker-w_fmefrs', purging
distributed.diskutils - INFO - Found stale lock file and directory '/home/jupyter/dask-worker-space/worker-djg8ki4m', purging
distributed.diskutils - INFO - Found stale lock file and directory '/home/jupyter/dask-worker-space/worker-ho1hw10b', purging
distributed.diskutils - INFO - Found stale lock file and directory '/home/jupyter/dask-worker-space/worker-mbdw10vg', purging
distributed.diskutils - INFO - Found stale lock file and directory '/home/jupyter/dask-worker-space/worker-whk890cp', purging
distributed.scheduler - WARNING - Worker tried to connect with a duplicate name: 32
distributed.nanny - ERROR - Failed to start worker
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 884, in run
await worker
File "/opt/conda/lib/python3.7/site-packages/distributed/core.py", line 279, in _
await self.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1584, in start
await self._register_with_scheduler()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1299, in _register_with_scheduler
raise ValueError(f"Unexpected response from register: {response!r}")
ValueError: Unexpected response from register: {'status': 'error', 'message': 'name taken, 32', 'time': 1658860564.692771}
distributed.nanny - ERROR - Failed while trying to start worker process: Unexpected response from register: {'status': 'error', 'message': 'name taken, 32', 'time': 1658860564.692771}
tornado.application - ERROR - Exception in callback functools.partial(<bound method IOLoop._discard_future_result of <zmq.eventloop.ioloop.ZMQIOLoop object at 0x7f82cf7eba10>>, <Task finished coro=<SpecCluster._correct_state_internal() done, defined at /opt/conda/lib/python3.7/site-packages/distributed/deploy/spec.py:310> exception=ValueError("Unexpected response from register: {'status': 'error', 'message': 'name taken, 32', 'time': 1658860564.692771}")>)
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/site-packages/tornado/ioloop.py", line 741, in _run_callback
ret = callback()
File "/opt/conda/lib/python3.7/site-packages/tornado/ioloop.py", line 765, in _discard_future_result
future.result()
File "/opt/conda/lib/python3.7/site-packages/distributed/deploy/spec.py", line 348, in _correct_state_internal
await w # for tornado gen.coroutine support
File "/opt/conda/lib/python3.7/site-packages/distributed/core.py", line 279, in _
await self.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 334, in start
response = await self.instantiate()
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 417, in instantiate
result = await self.process.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 694, in start
msg = await self._wait_until_connected(uid)
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 812, in _wait_until_connected
raise msg["exception"]
File "/opt/conda/lib/python3.7/site-packages/distributed/nanny.py", line 884, in run
await worker
File "/opt/conda/lib/python3.7/site-packages/distributed/core.py", line 279, in _
await self.start()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1584, in start
await self._register_with_scheduler()
File "/opt/conda/lib/python3.7/site-packages/distributed/worker.py", line 1299, in _register_with_scheduler
raise ValueError(f"Unexpected response from register: {response!r}")
ValueError: Unexpected response from register: {'status': 'error', 'message': 'name taken, 32', 'time': 1658860564.692771}
The errors you are seeing might not be related to your workflow - maybe a version conflict or similar.
However, you are mixing dask API paradigms. You have created a dask dataframe - which understands how to partition operations for dask to compute - but then chosen to create tasks manually yourself. This is a bad idea. Dask tasks should generally operate on one partition of a normal data structure (in this case, a pandas dataframe), not on a dask collection (in this case, a dask dataframe). It may be (I am not sure) that the attempt to serialise the dask dataframe and deserialise it on the workers is what is causing them to fail to start properly.
Your workflow at first glance looks like a full shuffle, but in fact it parallelises OK, because you can groupby in each partition and sum the results.
def per_partition_op(df):
    return df.groupby("od")["last"].sum()

df2 = df.map_partitions(per_partition_op)
At this point, you can just compute and work with the partials series, since it should already be of a manageable size:
partials = df2.compute()
results = partials.groupby(level=0).sum()
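A minimal end-to-end sketch of this pattern, with toy data standing in for the real dask_paths (column names taken from the question; values are made up):

import pandas as pd
import dask.dataframe as dd

# Toy stand-in for dask_paths: an 'od' column of tuples and a 'last' column.
pdf = pd.DataFrame({
    "od": [(22, 25), (103, 88), (22, 25), (101, 33)],
    "last": [10, 31, 5, 9],
})
ddf = dd.from_pandas(pdf, npartitions=2)

def per_partition_op(df):
    # Runs on a plain pandas DataFrame, one partition at a time.
    return df.groupby("od")["last"].sum()

partials = ddf.map_partitions(per_partition_op).compute()
# Different partitions may each contribute a partial sum for the same key,
# so combine them with a second, in-memory groupby.
results = partials.groupby(level=0).sum()
print(results)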

Airflow 1.10.6 kubernetes executor over EKS using s3 connection, task pass test but dag run fail

I tried:
- configuring the s3 connection from the value.yaml:

connections:
  - id: aws_default
    type: aws
    login: xxxaws_access_key_idxxx
    password: xxxxxxxxxxxxxaws_secret_access_keyxxxxxxxxxxxxxxxxxx
  - id: my_s3
    type: s3
    login: xxxaws_access_key_idxxx
    password: xxxxxxxxxxxxxaws_secret_access_keyxxxxxxxxxxxxxxxxxx

- writing a DAG that uses the S3Hook and writes a string to S3.
- running a test from the scheduler pod:

/entrypoint airflow test dag_id task_id date_before_the_start_date_of_DAG

The file is created and its content is OK.
- activating the DAG in the Airflow UI and running it: it is queued and fails.
Any suggestions?
BTW, the DAG args:

args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(0),
    'trigger_rule': 'dummy',
    # 'pool': 'my_workers_pool',
    'catchup': False,  # don't backfill
}
In addition, I added a 5-minute sleep to the task that causes the DAG to fail, and watched the pod creation with kubectl, but the task pod started for a few seconds and then disappeared. Any ideas how to debug this issue?
Logs from the task pod:

kubectl logs postgressexamplesababaegozimpostgress-7322d44cb2684a09bef95ad3080b9505 -n airflow-research-p-8482f --tail=200
[2020-08-24 12:04:44,262] {{settings.py:252}} INFO - settings.configure_orm(): Using pool settings. pool_size=5, max_overflow=10, pool_recycle=1800, pid=1
/usr/local/lib/python3.7/site-packages/psycopg2/__init__.py:144: UserWarning: The psycopg2 wheel package will be renamed from release 2.8; in order to keep installing from binary please use "pip install psycopg2-binary" instead. For details see: http://initd.org/psycopg/docs/install.html#binary-install-from-pypi.
[2020-08-24 12:04:44,833] {{__init__.py:51}} INFO - Using executor LocalExecutor
[2020-08-24 12:04:44,834] {{dagbag.py:92}} INFO - Filling up the DagBag from /usr/local/airflow/dags/postgress_example.py
[2020-08-24 12:04:44,841] {{dagbag.py:207}} ERROR - Failed to import: /usr/local/airflow/dags/postgress_example.py
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/airflow/models/dagbag.py", line 204, in process_file
    m = imp.load_source(mod_name, filepath)
  File "/usr/local/lib/python3.7/imp.py", line 171, in load_source
    module = _load(spec)
  File "", line 696, in _load
  File "", line 677, in _load_unlocked
  File "", line 728, in exec_module
  File "", line 219, in _call_with_frames_removed
  File "/usr/local/airflow/dags/postgress_example.py", line 33, in
    import boto3
ModuleNotFoundError: No module named 'boto3'
Traceback (most recent call last):
  File "/usr/local/bin/airflow", line 37, in
    args.func(args)
  File "/usr/local/lib/python3.7/site-packages/airflow/utils/cli.py", line 74, in wrapper
    return f(*args, **kwargs)
  File "/usr/local/lib/python3.7/site-packages/airflow/bin/cli.py", line 529, in run
    dag = get_dag(args)
  File "/usr/local/lib/python3.7/site-packages/airflow/bin/cli.py", line 148, in get_dag
    'parse.'.format(args.dag_id))
airflow.exceptions.AirflowException: dag_id could not be found: postgress_example. Either the dag did not exist or it failed to parse.

Scrapy + Splash = Connection Refused

I installed Splash using this link and followed all of the installation steps, but Splash doesn't work.
My settings.py file:
BOT_NAME = 'Teste'

SPIDER_MODULES = ['Test.spiders']
NEWSPIDER_MODULE = 'Test.spiders'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

SPLASH_URL = 'http://127.0.0.1:8050/'
When I run scrapy crawl TestSpider:
[scrapy.core.engine] INFO: Spider opened
[scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
[scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET http://www.google.com.br via http://127.0.0.1:8050/render.html> (failed 1 times): Connection was refused by other side: 111: Connection refused.
[scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET http://www.google.com.br via http://127.0.0.1:8050/render.html> (failed 2 times): Connection was refused by other side: 111: Connection refused.
[scrapy.downloadermiddlewares.retry] DEBUG: Gave up retrying <GET http://www.google.com.br via http://127.0.0.1:8050/render.html> (failed 3 times): Connection was refused by other side: 111: Connection refused.
[scrapy.core.scraper] ERROR: Error downloading <GET http://www.google.com.br via http://127.0.0.1:8050/render.html>
Traceback (most recent call last):
File "/home/ricardo/scrapy/lib/python3.5/site-packages/twisted/internet/defer.py", line 1126, in _inlineCallbacks
result = result.throwExceptionIntoGenerator(g)
File "/home/ricardo/scrapy/lib/python3.5/site-packages/twisted/python/failure.py", line 389, in throwExceptionIntoGenerator
return g.throw(self.type, self.value, self.tb)
File "/home/ricardo/scrapy/lib/python3.5/site-packages/scrapy/core/downloader/middleware.py", line 43, in process_request
defer.returnValue((yield
download_func(request=request,spider=spider)))
twisted.internet.error.ConnectionRefusedError: Connection was refused
by other side: 111: Connection refused.
[scrapy.core.engine] INFO: Closing spider (finished)
[scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/exception_count': 3, 'downloader/exception_type_count/twisted.internet.error.ConnectionRefusedError': 3,
'downloader/request_bytes': 1476,
'downloader/request_count': 3,
'downloader/request_method_count/POST': 3,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2017, 6, 29, 21, 36, 16, 72916),
'log_count/DEBUG': 3,
'log_count/ERROR': 1,
'log_count/INFO': 7,
'memusage/max': 47468544,
'memusage/startup': 47468544,
'retry/count': 2,
'retry/max_reached': 1,
'retry/reason_count/twisted.internet.error.ConnectionRefusedError': 2,
'scheduler/dequeued': 4,
'scheduler/dequeued/memory': 4,
'scheduler/enqueued': 4,
'scheduler/enqueued/memory': 4,
'splash/render.html/request_count': 1,
'start_time': datetime.datetime(2017, 6, 29, 21, 36, 15, 851593)}
[scrapy.core.engine] INFO: Spider closed (finished)
Here is my spider:
import scrapy
from scrapy_splash import SplashRequest
class TesteSpider(scrapy.Spider):
name="Teste"
def start_requests(self):
yield SplashRequest("http://www.google.com.br", self.parse, meta={"splash": {"endpoint":"render.html",}})
def parse(self, response):
self.log('Hello World')
I tried to run this in the terminal: curl "http://localhost:8050/render.html?url=http://www.google.com/"
Output:
curl: (7) Failed to connect to localhost port 8050: Connection Refused
You need to run Splash via the command line:
sudo docker run -p 8050:8050 scrapinghub/splash
and set settings.py as:
SPLASH_URL = 'http://localhost:8050'
Please make sure your Splash server is up and running before calling the spider:
sudo docker run -p 5023:5023 -p 8050:8050 -p 8051:8051 scrapinghub/splash
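As a minimal sanity-check sketch, you can ping Splash's HTTP API from Python before starting the crawl (this assumes the default port 8050 and Splash's /_ping status endpoint):

import requests

try:
    # Splash answers /_ping with a small JSON status payload when it is up.
    resp = requests.get("http://127.0.0.1:8050/_ping", timeout=5)
    resp.raise_for_status()
    print("Splash is up:", resp.json())
except requests.exceptions.RequestException:
    print("Splash is not reachable; start it with "
          "`sudo docker run -p 8050:8050 scrapinghub/splash`")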

Auth Issue with AirFlow and BigQuery?

I'm trying to get a simple dummy job going in Airflow for BigQuery, but I'm running into what I think might be auth issues, though I'm not quite sure.
My DAG:
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta
from airflow.contrib.hooks.bigquery_hook import BigQueryHook
from airflow.contrib.operators.bigquery_operator import BigQueryOperator

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2017, 1, 1),
    'email': ['airflow@airflow.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}
with DAG('my_bq_dag', schedule_interval=timedelta(days=1),
         default_args=default_args) as dag:

    bq_extract_one_day = BigQueryOperator(
        task_id='my_bq_task1',
        bql='SELECT 666 as msg',
        destination_dataset_table='airflow.msg',
        write_disposition='WRITE_TRUNCATE',
        bigquery_conn_id='bigquery_default'
    )
Then when I try to test with:
airflow-server:~/$ airflow test my_bq_dag my_bq_task1 2017-01-01
I get:
[2017-03-09 17:06:05,629] {__init__.py:36} INFO - Using executor LocalExecutor
[2017-03-09 17:06:05,735] {driver.py:120} INFO - Generating grammar tables from /usr/lib/python2.7/lib2to3/Grammar.txt
[2017-03-09 17:06:05,764] {driver.py:120} INFO - Generating grammar tables from /usr/lib/python2.7/lib2to3/PatternGrammar.txt
[2017-03-09 17:06:06,091] {models.py:154} INFO - Filling up the DagBag from /home/user/myproject/airflow/dags
[2017-03-09 17:06:06,385] {models.py:1196} INFO -
--------------------------------------------------------------------------------
Starting attempt 1 of 2
--------------------------------------------------------------------------------
[2017-03-09 17:06:06,386] {models.py:1219} INFO - Executing <Task(BigQueryOperator): my_bq_task1> on 2017-01-01 00:00:00
[2017-03-09 17:06:06,396] {bigquery_operator.py:55} INFO - Executing: SELECT 666 as msg
[2017-03-09 17:06:06,425] {discovery.py:810} INFO - URL being requested: POST https://www.googleapis.com/bigquery/v2/projects/myproject/jobs?alt=json
[2017-03-09 17:06:06,425] {client.py:570} INFO - Attempting refresh to obtain initial access_token
[2017-03-09 17:06:06,426] {models.py:1286} ERROR - []
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/airflow/models.py", line 1245, in run
result = task_copy.execute(context=context)
File "/usr/local/lib/python2.7/dist-packages/airflow/contrib/operators/bigquery_operator.py", line 59, in execute
cursor.run_query(self.bql, self.destination_dataset_table, self.write_disposition, self.allow_large_results, self.udf_config)
File "/usr/local/lib/python2.7/dist-packages/airflow/contrib/hooks/bigquery_hook.py", line 207, in run_query
return self.run_with_configuration(configuration)
File "/usr/local/lib/python2.7/dist-packages/airflow/contrib/hooks/bigquery_hook.py", line 437, in run_with_configuration
.insert(projectId=self.project_id, body=job_data) \
File "/usr/local/lib/python2.7/dist-packages/oauth2client/util.py", line 140, in positional_wrapper
return wrapped(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/googleapiclient/http.py", line 722, in execute
body=self.body, headers=self.headers)
File "/usr/local/lib/python2.7/dist-packages/oauth2client/client.py", line 572, in new_request
self._refresh(request_orig)
File "/usr/local/lib/python2.7/dist-packages/oauth2client/client.py", line 842, in _refresh
self._do_refresh_request(http_request)
File "/usr/local/lib/python2.7/dist-packages/oauth2client/client.py", line 869, in _do_refresh_request
body = self._generate_refresh_request_body()
File "/usr/local/lib/python2.7/dist-packages/oauth2client/client.py", line 1549, in _generate_refresh_request_body
assertion = self._generate_assertion()
File "/usr/local/lib/python2.7/dist-packages/oauth2client/client.py", line 1677, in _generate_assertion
private_key, self.private_key_password), payload)
File "/usr/local/lib/python2.7/dist-packages/oauth2client/_openssl_crypt.py", line 117, in from_string
pkey = crypto.load_privatekey(crypto.FILETYPE_PEM, parsed_pem_key)
File "/usr/local/lib/python2.7/dist-packages/OpenSSL/crypto.py", line 2583, in load_privatekey
_raise_current_error()
File "/usr/local/lib/python2.7/dist-packages/OpenSSL/_util.py", line 48, in exception_from_error_queue
raise exception_type(errors)
Error: []
[2017-03-09 17:06:06,428] {models.py:1298} INFO - Marking task as UP_FOR_RETRY
[2017-03-09 17:06:06,428] {models.py:1327} ERROR - []
Traceback (most recent call last):
File "/usr/local/bin/airflow", line 15, in <module>
args.func(args)
File "/usr/local/lib/python2.7/dist-packages/airflow/bin/cli.py", line 352, in test
ti.run(force=True, ignore_dependencies=True, test_mode=True)
File "/usr/local/lib/python2.7/dist-packages/airflow/utils/db.py", line 53, in wrapper
result = func(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/airflow/models.py", line 1245, in run
result = task_copy.execute(context=context)
File "/usr/local/lib/python2.7/dist-packages/airflow/contrib/operators/bigquery_operator.py", line 59, in execute
cursor.run_query(self.bql, self.destination_dataset_table, self.write_disposition, self.allow_large_results, self.udf_config)
File "/usr/local/lib/python2.7/dist-packages/airflow/contrib/hooks/bigquery_hook.py", line 207, in run_query
return self.run_with_configuration(configuration)
File "/usr/local/lib/python2.7/dist-packages/airflow/contrib/hooks/bigquery_hook.py", line 437, in run_with_configuration
.insert(projectId=self.project_id, body=job_data) \
File "/usr/local/lib/python2.7/dist-packages/oauth2client/util.py", line 140, in positional_wrapper
return wrapped(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/googleapiclient/http.py", line 722, in execute
body=self.body, headers=self.headers)
File "/usr/local/lib/python2.7/dist-packages/oauth2client/client.py", line 572, in new_request
self._refresh(request_orig)
File "/usr/local/lib/python2.7/dist-packages/oauth2client/client.py", line 842, in _refresh
self._do_refresh_request(http_request)
File "/usr/local/lib/python2.7/dist-packages/oauth2client/client.py", line 869, in _do_refresh_request
body = self._generate_refresh_request_body()
File "/usr/local/lib/python2.7/dist-packages/oauth2client/client.py", line 1549, in _generate_refresh_request_body
assertion = self._generate_assertion()
File "/usr/local/lib/python2.7/dist-packages/oauth2client/client.py", line 1677, in _generate_assertion
private_key, self.private_key_password), payload)
File "/usr/local/lib/python2.7/dist-packages/oauth2client/_openssl_crypt.py", line 117, in from_string
pkey = crypto.load_privatekey(crypto.FILETYPE_PEM, parsed_pem_key)
File "/usr/local/lib/python2.7/dist-packages/OpenSSL/crypto.py", line 2583, in load_privatekey
_raise_current_error()
File "/usr/local/lib/python2.7/dist-packages/OpenSSL/_util.py", line 48, in exception_from_error_queue
raise exception_type(errors)
OpenSSL.crypto.Error: []
I've been trying to just get a simple job to write to a table in my BigQuery project, partly using this post as a guide: https://medium.com/google-cloud/airflow-for-google-cloud-part-1-d7da9a048aa4#.5qclla82t