Trying to pass variable from pandas to sql - sql

I am trying to pass a simple variable (string, list whatever I can get working) in to sql (using jaydebapi to AWS Redshift cluster.
The python/pandas code looks like this at moment. I think the %s is part of the problem:
id = (261894)
df = pd.read_sql_query(
'''
select * from purchases where customer_id = %s
''',
conn, params=[id])
exception is
ava.sql.SQLExceptionPyRaisable Traceback (most recent call last)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\sql.py in execute(self, *args, **kwargs)
1377 else:
-> 1378 cur.execute(*args)
1379 return cur
C:\ProgramData\Anaconda3\lib\site-packages\jaydebeapi\__init__.py in execute(self, operation, parameters)
497 self._close_last()
--> 498 self._prep = self._connection.jconn.prepareStatement(operation)
499 self._set_stmt_parms(self._prep, parameters)
java.sql.SQLExceptionPyRaisable: java.sql.SQLException: [Amazon](500310) Invalid operation: syntax error at or near "%"
Position: 70;
During handling of the above exception, another exception occurred:
java.sql.SQLExceptionPyRaisable Traceback (most recent call last)
C:\ProgramData\Anaconda3\lib\site-packages\jaydebeapi\__init__.py in rollback(self)
416 try:
--> 417 self.jconn.rollback()
418 except:
java.sql.SQLExceptionPyRaisable: java.sql.SQLException: [Amazon][JDBC](11680) Cannot use rollback while Connection is in auto-commit mode.
During handling of the above exception, another exception occurred:
DatabaseError Traceback (most recent call last)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\sql.py in execute(self, *args, **kwargs)
1381 try:
-> 1382 self.con.rollback()
1383 except Exception: # pragma: no cover
C:\ProgramData\Anaconda3\lib\site-packages\jaydebeapi\__init__.py in rollback(self)
418 except:
--> 419 _handle_sql_exception()
420
C:\ProgramData\Anaconda3\lib\site-packages\jaydebeapi\__init__.py in _handle_sql_exception_jpype()
155 exc_type = InterfaceError
--> 156 reraise(exc_type, exc_info[1], exc_info[2])
157
C:\ProgramData\Anaconda3\lib\site-packages\jaydebeapi\__init__.py in reraise(tp, value, tb)
56 if tb:
---> 57 raise value.with_traceback(tb)
58 raise value
C:\ProgramData\Anaconda3\lib\site-packages\jaydebeapi\__init__.py in rollback(self)
416 try:
--> 417 self.jconn.rollback()
418 except:
DatabaseError: java.sql.SQLException: [Amazon][JDBC](11680) Cannot use rollback while Connection is in auto-commit mode.
During handling of the above exception, another exception occurred:
DatabaseError Traceback (most recent call last)
in
5 select * from purchases where customer_id = %s
6 ''',
----> 7 conn, params=[id])
8
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\sql.py in read_sql_query(sql, con, index_col, coerce_float, params, parse_dates, chunksize)
312 return pandas_sql.read_query(
313 sql, index_col=index_col, params=params, coerce_float=coerce_float,
--> 314 parse_dates=parse_dates, chunksize=chunksize)
315
316
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\sql.py in read_query(self, sql, index_col, coerce_float, params, parse_dates, chunksize)
1411
1412 args = _convert_params(sql, params)
-> 1413 cursor = self.execute(*args)
1414 columns = [col_desc[0] for col_desc in cursor.description]
1415
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\sql.py in execute(self, *args, **kwargs)
1384 ex = DatabaseError("Execution failed on sql: %s\n%s\nunable"
1385 " to rollback" % (args[0], exc))
-> 1386 raise_with_traceback(ex)
1387
1388 ex = DatabaseError(
C:\ProgramData\Anaconda3\lib\site-packages\pandas\compat\__init__.py in raise_with_traceback(exc, traceback)
402 if traceback == Ellipsis:
403 _, _, traceback = sys.exc_info()
--> 404 raise exc.with_traceback(traceback)
405 else:
406 # this version of raise is a syntax error in Python 3
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\sql.py in execute(self, *args, **kwargs)
1380 except Exception as exc:
1381 try:
-> 1382 self.con.rollback()
1383 except Exception: # pragma: no cover
1384 ex = DatabaseError("Execution failed on sql: %s\n%s\nunable"
C:\ProgramData\Anaconda3\lib\site-packages\jaydebeapi\__init__.py in rollback(self)
417 self.jconn.rollback()
418 except:
--> 419 _handle_sql_exception()
420
421 def cursor(self):
C:\ProgramData\Anaconda3\lib\site-packages\jaydebeapi\__init__.py in _handle_sql_exception_jpype()
154 else:
155 exc_type = InterfaceError
--> 156 reraise(exc_type, exc_info[1], exc_info[2])
157
158 def _jdbc_connect_jpype(jclassname, url, driver_args, jars, libs):
C:\ProgramData\Anaconda3\lib\site-packages\jaydebeapi\__init__.py in reraise(tp, value, tb)
55 value = tp(value)
56 if tb:
---> 57 raise value.with_traceback(tb)
58 raise value
59
C:\ProgramData\Anaconda3\lib\site-packages\jaydebeapi\__init__.py in rollback(self)
415 def rollback(self):
416 try:
--> 417 self.jconn.rollback()
418 except:
419 _handle_sql_exception()
DatabaseError: Execution failed on sql:
select * from purchases where customer_id = %s
java.sql.SQLException: [Amazon](500310) Invalid operation: syntax error at or near "%"
Position: 70;
unable to rollback
[39]
id = (261894)
[44]
Any thoughts on how I can get this to work? Do I need an alternative to %s?

try this:
id = (261894)
df = pd.read_sql_query( ''' select * from purchases where customer_id = ? ''', conn, params=[id])
it should work

Related

I am unable to retrieve data from pndas_datareader. how will it work for yahoo data

PG = wb.DataReader('PG',data_source = 'yahoo',start = '2000-1-1', end = '2001-1-1')
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[34], line 2
1 # !pip install pandas_datareader
----> 2 PG = wb.DataReader('PG',data_source = 'yahoo',start = '2000-1-1', end = '2001-1-1')
File c:\Users\intiz\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\util\_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs)
209 else:
210 kwargs[new_arg_name] = new_arg_value
--> 211 return func(*args, **kwargs)
File c:\Users\intiz\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas_datareader\data.py:379, in DataReader(name, data_source, start, end, retry_count, pause, session, api_key)
367 raise NotImplementedError(msg)
369 if data_source == "yahoo":
370 return YahooDailyReader(
371 symbols=name,
372 start=start,
373 end=end,
374 adjust_price=False,
375 chunksize=25,
376 retry_count=retry_count,
377 pause=pause,
378 session=session,
--> 379 ).read()
381 elif data_source == "iex":
...
--> 153 data = j["context"]["dispatcher"]["stores"]["HistoricalPriceStore"]
154 except KeyError:
155 msg = "No data fetched for symbol {} using {}"
TypeError: string indices must be integers
I need PG stock index information datewise

tight_layout KeyError default, matplotlib widget

Using jupyterlab, i receive a KeyError: 'Default' when using plt.tight_layout() in combination with %matplotlib widget. The following code reproduces the issue:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib widget
x=np.linspace(0,10)
y=x**2
plt.plot(x,y)
plt.tight_layout()
The complete error message is the following:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~/anaconda3/lib/python3.8/site-packages/matplotlib/backend_bases.py in _wait_cursor_for_draw_cm(self)
3024 try:
-> 3025 self.canvas.set_cursor(tools.Cursors.WAIT)
3026 yield
~/anaconda3/lib/python3.8/site-packages/matplotlib/backends/backend_webagg_core.py in set_cursor(self, cursor)
209 }, cursor=cursor)
--> 210 self.send_event('cursor', cursor=cursor)
211
~/anaconda3/lib/python3.8/site-packages/matplotlib/backends/backend_webagg_core.py in send_event(self, event_type, **kwargs)
391 if self.manager:
--> 392 self.manager._send_event(event_type, **kwargs)
393
~/anaconda3/lib/python3.8/site-packages/matplotlib/backends/backend_webagg_core.py in _send_event(self, event_type, **kwargs)
540 for s in self.web_sockets:
--> 541 s.send_json(payload)
542
~/anaconda3/lib/python3.8/site-packages/ipympl/backend_nbagg.py in send_json(self, content)
180 if content['type'] == 'cursor':
--> 181 self._cursor = cursors_str[content['cursor']]
182
KeyError: 'wait'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
/tmp/ipykernel_119035/3466922198.py in <module>
7 y=x**2
8 plt.plot(x,y)
----> 9 plt.tight_layout()
~/anaconda3/lib/python3.8/site-packages/matplotlib/pyplot.py in tight_layout(pad, h_pad, w_pad, rect)
2300 #_copy_docstring_and_deprecators(Figure.tight_layout)
2301 def tight_layout(*, pad=1.08, h_pad=None, w_pad=None, rect=None):
-> 2302 return gcf().tight_layout(pad=pad, h_pad=h_pad, w_pad=w_pad, rect=rect)
2303
2304
~/anaconda3/lib/python3.8/site-packages/matplotlib/figure.py in tight_layout(self, pad, h_pad, w_pad, rect)
3186 "compatible with tight_layout, so results "
3187 "might be incorrect.")
-> 3188 renderer = _get_renderer(self)
3189 with getattr(renderer, "_draw_disabled", nullcontext)():
3190 kwargs = get_tight_layout_figure(
~/anaconda3/lib/python3.8/site-packages/matplotlib/backend_bases.py in _get_renderer(figure, print_method)
1542 figure.canvas._get_output_canvas(None, fmt), f"print_{fmt}")
1543 try:
-> 1544 print_method(io.BytesIO())
1545 except Done as exc:
1546 renderer, = figure._cachedRenderer, = exc.args
~/anaconda3/lib/python3.8/site-packages/matplotlib/backend_bases.py in wrapper(*args, **kwargs)
1641 kwargs.pop(arg)
1642
-> 1643 return func(*args, **kwargs)
1644
1645 return wrapper
~/anaconda3/lib/python3.8/site-packages/matplotlib/_api/deprecation.py in wrapper(*inner_args, **inner_kwargs)
410 else deprecation_addendum,
411 **kwargs)
--> 412 return func(*inner_args, **inner_kwargs)
413
414 DECORATORS[wrapper] = decorator
~/anaconda3/lib/python3.8/site-packages/matplotlib/backends/backend_agg.py in print_png(self, filename_or_obj, metadata, pil_kwargs, *args)
538 *metadata*, including the default 'Software' key.
539 """
--> 540 FigureCanvasAgg.draw(self)
541 mpl.image.imsave(
542 filename_or_obj, self.buffer_rgba(), format="png", origin="upper",
~/anaconda3/lib/python3.8/site-packages/matplotlib/backends/backend_agg.py in draw(self)
431 self.renderer = self.get_renderer(cleared=True)
432 # Acquire a lock on the shared font cache.
--> 433 with RendererAgg.lock, \
434 (self.toolbar._wait_cursor_for_draw_cm() if self.toolbar
435 else nullcontext()):
~/anaconda3/lib/python3.8/contextlib.py in __enter__(self)
111 del self.args, self.kwds, self.func
112 try:
--> 113 return next(self.gen)
114 except StopIteration:
115 raise RuntimeError("generator didn't yield") from None
~/anaconda3/lib/python3.8/site-packages/matplotlib/backend_bases.py in _wait_cursor_for_draw_cm(self)
3026 yield
3027 finally:
-> 3028 self.canvas.set_cursor(self._lastCursor)
3029 else:
3030 yield
~/anaconda3/lib/python3.8/site-packages/matplotlib/backends/backend_webagg_core.py in set_cursor(self, cursor)
208 backend_tools.Cursors.RESIZE_VERTICAL: 'ns-resize',
209 }, cursor=cursor)
--> 210 self.send_event('cursor', cursor=cursor)
211
212 def set_image_mode(self, mode):
~/anaconda3/lib/python3.8/site-packages/matplotlib/backends/backend_webagg_core.py in send_event(self, event_type, **kwargs)
390 def send_event(self, event_type, **kwargs):
391 if self.manager:
--> 392 self.manager._send_event(event_type, **kwargs)
393
394
~/anaconda3/lib/python3.8/site-packages/matplotlib/backends/backend_webagg_core.py in _send_event(self, event_type, **kwargs)
539 payload = {'type': event_type, **kwargs}
540 for s in self.web_sockets:
--> 541 s.send_json(payload)
542
543
~/anaconda3/lib/python3.8/site-packages/ipympl/backend_nbagg.py in send_json(self, content)
179 # Change in the widget state?
180 if content['type'] == 'cursor':
--> 181 self._cursor = cursors_str[content['cursor']]
182
183 elif content['type'] == 'message':
KeyError: 'default'

Google Sheets API - Timeout

I regularly use gspread and lately I've been getting these read timeout errors but only on certain sheets. Sometimes I'll be able to read & write to a sheet for a bit and then it'll hang until I get this read timeout error and then all subsequent attempts to read from that sheet will hang immediately and again give me this error.
---------------------------------------------------------------------------
timeout Traceback (most recent call last)
~/envs/jupyter/lib/python3.6/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
444 # Otherwise it looks like a bug in the code.
--> 445 six.raise_from(e, None)
446 except (SocketTimeout, BaseSSLError, SocketError) as e:
~/envs/jupyter/lib/python3.6/site-packages/urllib3/packages/six.py in raise_from(value, from_value)
~/envs/jupyter/lib/python3.6/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
439 try:
--> 440 httplib_response = conn.getresponse()
441 except BaseException as e:
/usr/lib/python3.6/http/client.py in getresponse(self)
1372 try:
-> 1373 response.begin()
1374 except ConnectionError:
/usr/lib/python3.6/http/client.py in begin(self)
310 while True:
--> 311 version, status, reason = self._read_status()
312 if status != CONTINUE:
/usr/lib/python3.6/http/client.py in _read_status(self)
271 def _read_status(self):
--> 272 line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
273 if len(line) > _MAXLINE:
/usr/lib/python3.6/socket.py in readinto(self, b)
585 try:
--> 586 return self._sock.recv_into(b)
587 except timeout:
/usr/lib/python3.6/ssl.py in recv_into(self, buffer, nbytes, flags)
1011 self.__class__)
-> 1012 return self.read(nbytes, buffer)
1013 else:
/usr/lib/python3.6/ssl.py in read(self, len, buffer)
873 try:
--> 874 return self._sslobj.read(len, buffer)
875 except SSLError as x:
/usr/lib/python3.6/ssl.py in read(self, len, buffer)
630 if buffer is not None:
--> 631 v = self._sslobj.read(len, buffer)
632 else:
timeout: The read operation timed out
During handling of the above exception, another exception occurred:
ReadTimeoutError Traceback (most recent call last)
~/envs/jupyter/lib/python3.6/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
448 retries=self.max_retries,
--> 449 timeout=timeout
450 )
~/envs/jupyter/lib/python3.6/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
755 retries = retries.increment(
--> 756 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
757 )
~/envs/jupyter/lib/python3.6/site-packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
531 if read is False or not self._is_method_retryable(method):
--> 532 raise six.reraise(type(error), error, _stacktrace)
533 elif read is not None:
~/envs/jupyter/lib/python3.6/site-packages/urllib3/packages/six.py in reraise(tp, value, tb)
734 raise value.with_traceback(tb)
--> 735 raise value
736 finally:
~/envs/jupyter/lib/python3.6/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
705 headers=headers,
--> 706 chunked=chunked,
707 )
~/envs/jupyter/lib/python3.6/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
446 except (SocketTimeout, BaseSSLError, SocketError) as e:
--> 447 self._raise_timeout(err=e, url=url, timeout_value=read_timeout)
448 raise
~/envs/jupyter/lib/python3.6/site-packages/urllib3/connectionpool.py in _raise_timeout(self, err, url, timeout_value)
336 raise ReadTimeoutError(
--> 337 self, url, "Read timed out. (read timeout=%s)" % timeout_value
338 )
ReadTimeoutError: HTTPSConnectionPool(host='sheets.googleapis.com', port=443): Read timed out. (read timeout=120)
During handling of the above exception, another exception occurred:
ReadTimeout Traceback (most recent call last)
<ipython-input-70-dc10e6e8ee1a> in <module>
----> 1 max_sheet.range('A1:B1')
~/envs/jupyter/lib/python3.6/site-packages/gspread/utils.py in wrapper(self, *args, **kwargs)
396 pass
397
--> 398 return method(self, *args, **kwargs)
399
400 return wrapper
~/envs/jupyter/lib/python3.6/site-packages/gspread/models.py in range(self, name)
686 range_label = absolute_range_name(self.title, name)
687
--> 688 data = self.spreadsheet.values_get(range_label)
689
690 start, end = name.split(':')
~/envs/jupyter/lib/python3.6/site-packages/gspread/models.py in values_get(self, range, params)
190 """
191 url = SPREADSHEET_VALUES_URL % (self.id, quote(range))
--> 192 r = self.client.request('get', url, params=params)
193 return r.json()
194
~/envs/jupyter/lib/python3.6/site-packages/gspread/client.py in request(self, method, endpoint, params, data, json, files, headers)
68 data=data,
69 files=files,
---> 70 headers=headers,
71 )
72
~/envs/jupyter/lib/python3.6/site-packages/requests/sessions.py in get(self, url, **kwargs)
553
554 kwargs.setdefault('allow_redirects', True)
--> 555 return self.request('GET', url, **kwargs)
556
557 def options(self, url, **kwargs):
~/envs/jupyter/lib/python3.6/site-packages/google/auth/transport/requests.py in request(self, method, url, data, headers, max_allowed_time, timeout, **kwargs)
486 headers=request_headers,
487 timeout=timeout,
--> 488 **kwargs
489 )
490 remaining_time = guard.remaining_timeout
~/envs/jupyter/lib/python3.6/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
540 }
541 send_kwargs.update(settings)
--> 542 resp = self.send(prep, **send_kwargs)
543
544 return resp
~/envs/jupyter/lib/python3.6/site-packages/requests/sessions.py in send(self, request, **kwargs)
653
654 # Send the request
--> 655 r = adapter.send(request, **kwargs)
656
657 # Total elapsed time of the request (approximately)
~/envs/jupyter/lib/python3.6/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
527 raise SSLError(e, request=request)
528 elif isinstance(e, ReadTimeoutError):
--> 529 raise ReadTimeout(e, request=request)
530 else:
531 raise
ReadTimeout: HTTPSConnectionPool(host='sheets.googleapis.com', port=443): Read timed out. (read timeout=120)
I tried taking gspread out of it for troubleshooting but it still gives me the same issue even when I'm using code from the python quickstart example from https://developers.google.com/sheets/api/quickstart/python (below). The timeout issue is the same whether I do it from my local machine or my jupyter server on digitalocean.
from googleapiclient.discovery import build
from google.oauth2.service_account import Credentials
creds = Credentials.from_service_account_file('XXXXXXX', scopes=SCOPES)
service = build('sheets', 'v4', credentials=creds)
sheet = service.spreadsheets()
result = sheet.values().get(spreadsheetId='XXXXXXX',
range='Active Accounts - working copy!A1:A2').execute()
values = result.get('values', [])
The above code results in this:
---------------------------------------------------------------------------
timeout Traceback (most recent call last)
<ipython-input-95-5c352599e283> in <module>
1 sheet = service.spreadsheets()
2 result = sheet.values().get(spreadsheetId='XXXXXX',
----> 3 range='Active Accounts - working copy!A1:A2').execute()
4 values = result.get('values', [])
~/envs/jupyter/lib/python3.6/site-packages/googleapiclient/_helpers.py in positional_wrapper(*args, **kwargs)
132 elif positional_parameters_enforcement == POSITIONAL_WARNING:
133 logger.warning(message)
--> 134 return wrapped(*args, **kwargs)
135
136 return positional_wrapper
~/envs/jupyter/lib/python3.6/site-packages/googleapiclient/http.py in execute(self, http, num_retries)
899 method=str(self.method),
900 body=self.body,
--> 901 headers=self.headers,
902 )
903
~/envs/jupyter/lib/python3.6/site-packages/googleapiclient/http.py in _retry_request(http, num_retries, req_type, sleep, rand, uri, method, *args, **kwargs)
202 if exception:
203 if retry_num == num_retries:
--> 204 raise exception
205 else:
206 continue
~/envs/jupyter/lib/python3.6/site-packages/googleapiclient/http.py in _retry_request(http, num_retries, req_type, sleep, rand, uri, method, *args, **kwargs)
175 try:
176 exception = None
--> 177 resp, content = http.request(uri, method, *args, **kwargs)
178 # Retry on SSL errors and socket timeout errors.
179 except _ssl_SSLError as ssl_error:
~/envs/jupyter/lib/python3.6/site-packages/google_auth_httplib2.py in request(self, uri, method, body, headers, **kwargs)
196 # Make the request.
197 response, content = self.http.request(
--> 198 uri, method, body=body, headers=request_headers, **kwargs)
199
200 # If the response indicated that the credentials needed to be
~/envs/jupyter/lib/python3.6/site-packages/httplib2/__init__.py in request(self, uri, method, body, headers, redirections, connection_type)
1989 headers,
1990 redirections,
-> 1991 cachekey,
1992 )
1993 except Exception as e:
~/envs/jupyter/lib/python3.6/site-packages/httplib2/__init__.py in _request(self, conn, host, absolute_uri, request_uri, method, body, headers, redirections, cachekey)
1649
1650 (response, content) = self._conn_request(
-> 1651 conn, request_uri, method, body, headers
1652 )
1653
~/envs/jupyter/lib/python3.6/site-packages/httplib2/__init__.py in _conn_request(self, conn, request_uri, method, body, headers)
1587 pass
1588 try:
-> 1589 response = conn.getresponse()
1590 except (http.client.BadStatusLine, http.client.ResponseNotReady):
1591 # If we get a BadStatusLine on the first try then that means
/usr/lib/python3.6/http/client.py in getresponse(self)
1371 try:
1372 try:
-> 1373 response.begin()
1374 except ConnectionError:
1375 self.close()
/usr/lib/python3.6/http/client.py in begin(self)
309 # read until we get a non-100 response
310 while True:
--> 311 version, status, reason = self._read_status()
312 if status != CONTINUE:
313 break
/usr/lib/python3.6/http/client.py in _read_status(self)
270
271 def _read_status(self):
--> 272 line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
273 if len(line) > _MAXLINE:
274 raise LineTooLong("status line")
/usr/lib/python3.6/socket.py in readinto(self, b)
584 while True:
585 try:
--> 586 return self._sock.recv_into(b)
587 except timeout:
588 self._timeout_occurred = True
/usr/lib/python3.6/ssl.py in recv_into(self, buffer, nbytes, flags)
1010 "non-zero flags not allowed in calls to recv_into() on %s" %
1011 self.__class__)
-> 1012 return self.read(nbytes, buffer)
1013 else:
1014 return socket.recv_into(self, buffer, nbytes, flags)
/usr/lib/python3.6/ssl.py in read(self, len, buffer)
872 raise ValueError("Read on closed or unwrapped SSL socket.")
873 try:
--> 874 return self._sslobj.read(len, buffer)
875 except SSLError as x:
876 if x.args[0] == SSL_ERROR_EOF and self.suppress_ragged_eofs:
/usr/lib/python3.6/ssl.py in read(self, len, buffer)
629 """
630 if buffer is not None:
--> 631 v = self._sslobj.read(len, buffer)
632 else:
633 v = self._sslobj.read(len)
timeout: The read operation timed out

Dask Cluster: AttributeError: 'DataFrame' object has no attribute '_data'

I'm working with a Dask Cluster on GCP. I'm using this code to deploy it:
from dask_cloudprovider.gcp import GCPCluster
from dask.distributed import Client
enviroment_vars = {
'EXTRA_PIP_PACKAGES': '"gcsfs"'
}
cluster = GCPCluster(
n_workers=32,
docker_image='daskdev/dask:2021.2.0',
env_vars=enviroment_vars,
network='my-network',
#filesystem_size=150,
machine_type='e2-standard-16',
projectid='my-project-id',
zone='us-central1-a',
on_host_maintenance="MIGRATE"
client = Client(cluster)
Then I read csv files, with the following code:
import dask.dataframe as dd
import csv
col_dtypes = {
'var1': 'float64',
'var2': 'object',
'var3': 'object',
'var4': 'float64'
}
df = dd.read_csv('gs://my_bucket/files-*.csv', blocksize=None, dtype= col_dtypes)
df = df.persist()
Everything works fine, but when I try to do some queries, or calculation, I get an error. For instance this piece of code:
df.var1.value_counts().compute()
This is the output:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-14-711a7c21ed42> in <module>
----> 1 df.var1.value_counts().compute()
/opt/conda/lib/python3.8/site-packages/dask/base.py in compute(self, **kwargs)
279 dask.base.compute
280 """
--> 281 (result,) = compute(self, traverse=False, **kwargs)
282 return result
283
/opt/conda/lib/python3.8/site-packages/dask/base.py in compute(*args, **kwargs)
561 postcomputes.append(x.__dask_postcompute__())
562
--> 563 results = schedule(dsk, keys, **kwargs)
564 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
565
/opt/conda/lib/python3.8/site-packages/distributed/client.py in get(self, dsk, keys, workers, allow_other_workers, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2653 should_rejoin = False
2654 try:
-> 2655 results = self.gather(packed, asynchronous=asynchronous, direct=direct)
2656 finally:
2657 for f in futures.values():
/opt/conda/lib/python3.8/site-packages/distributed/client.py in gather(self, futures, errors, direct, asynchronous)
1962 else:
1963 local_worker = None
-> 1964 return self.sync(
1965 self._gather,
1966 futures,
/opt/conda/lib/python3.8/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
836 return future
837 else:
--> 838 return sync(
839 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
840 )
/opt/conda/lib/python3.8/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
338 if error[0]:
339 typ, exc, tb = error[0]
--> 340 raise exc.with_traceback(tb)
341 else:
342 return result[0]
/opt/conda/lib/python3.8/site-packages/distributed/utils.py in f()
322 if callback_timeout is not None:
323 future = asyncio.wait_for(future, callback_timeout)
--> 324 result[0] = yield future
325 except Exception as exc:
326 error[0] = sys.exc_info()
/opt/conda/lib/python3.8/site-packages/tornado/gen.py in run(self)
760
761 try:
--> 762 value = future.result()
763 except Exception:
764 exc_info = sys.exc_info()
/opt/conda/lib/python3.8/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
1827 exc = CancelledError(key)
1828 else:
-> 1829 raise exception.with_traceback(traceback)
1830 raise exc
1831 if errors == "skip":
/opt/conda/lib/python3.8/site-packages/dask/optimization.py in __call__()
961 if not len(args) == len(self.inkeys):
962 raise ValueError("Expected %d args, got %d" % (len(self.inkeys), len(args)))
--> 963 return core.get(self.dsk, self.outkey, dict(zip(self.inkeys, args)))
964
965 def __reduce__(self):
/opt/conda/lib/python3.8/site-packages/dask/core.py in get()
149 for key in toposort(dsk):
150 task = dsk[key]
--> 151 result = _execute_task(task, cache)
152 cache[key] = result
153 result = _execute_task(out, cache)
/opt/conda/lib/python3.8/site-packages/dask/core.py in _execute_task()
119 # temporaries by their reference count and can execute certain
120 # operations in-place.
--> 121 return func(*(_execute_task(a, cache) for a in args))
122 elif not ishashable(arg):
123 return arg
/opt/conda/lib/python3.8/site-packages/dask/utils.py in apply()
33 def apply(func, args, kwargs=None):
34 if kwargs:
---> 35 return func(*args, **kwargs)
36 else:
37 return func(*args)
/opt/conda/lib/python3.8/site-packages/dask/dataframe/core.py in apply_and_enforce()
5474 return meta
5475 if is_dataframe_like(df):
-> 5476 check_matching_columns(meta, df)
5477 c = meta.columns
5478 else:
/opt/conda/lib/python3.8/site-packages/dask/dataframe/utils.py in check_matching_columns()
690 def check_matching_columns(meta, actual):
691 # Need nan_to_num otherwise nan comparison gives False
--> 692 if not np.array_equal(np.nan_to_num(meta.columns), np.nan_to_num(actual.columns)):
693 extra = methods.tolist(actual.columns.difference(meta.columns))
694 missing = methods.tolist(meta.columns.difference(actual.columns))
/opt/conda/lib/python3.8/site-packages/pandas/core/generic.py in __getattr__()
5268 or name in self._accessors
5269 ):
-> 5270 return object.__getattribute__(self, name)
5271 else:
5272 if self._info_axis._can_hold_identifiers_and_holds_name(name):
pandas/_libs/properties.pyx in pandas._libs.properties.AxisProperty.__get__()
/opt/conda/lib/python3.8/site-packages/pandas/core/generic.py in __getattr__()
5268 or name in self._accessors
5269 ):
-> 5270 return object.__getattribute__(self, name)
5271 else:
5272 if self._info_axis._can_hold_identifiers_and_holds_name(name):
AttributeError: 'DataFrame' object has no attribute '_data'
The version of Pandas in my docker file is 1.0.1, so I already try upgrading Pandas (to version 1.2.2), but it didn't work, what am I doing wrong?
My guess is that you have a version mismatch somewhere. What does client.get_versions(check=True) say?

There is already an object named ""; in the database

I am using pyspark in databricks to append data to a sql table via ADF pipelines. Code is sown below:
log_status_df_all = spark.createDataFrame(log_status_df_all)
log_status_df_all.write.format("com.microsoft.sqlserver.jdbc.spark").mode(write_mode).option("url", url).option("dbtable", 'Logs_Collection_Status').option("user", username).option("password", password).save()
log_status_df_all.show()
On some days I am get the error message:
com.microsoft.sqlserver.jdbc.SQLServerException: There is already an object named '<table_name>' in the database.
Upon simply re-running the pipeline the table is updated with no issues; therefore the code is working. How can I prevent this from happening again? Is it an error when multiple pipelines try writing to the same table at the same time?
The rest of the error message is shown below:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<command-3143827225825384> in <module>
8
9 log_collection_df = spark.createDataFrame(log_collection_df)
---> 10 write_df_sql(log_collection_df, 'Logs_Collection_Status', 'overwrite')
11
<command-1421348210166948> in write_df_sql(df, table, write_mode)
14
15
---> 16 spark_df.write.format("com.microsoft.sqlserver.jdbc.spark").mode(write_mode).option("url", url).option("dbtable", table_name).option("user", username).option("password", password).save()
17
18 #backup table
/databricks/spark/python/pyspark/sql/readwriter.py in save(self, path, format, mode, partitionBy, **options)
735 self.format(format)
736 if path is None:
--> 737 self._jwrite.save()
738 else:
739 self._jwrite.save(path)
/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args) 1255 answer = self.gateway_client.send_command(command) 1256 return_value
= get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name) 1258 1259 for temp_arg in temp_args:
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o661.save. : com.microsoft.sqlserver.jdbc.SQLServerException: There is already an object named 'Logs_Collection_Status' in the database. at com.microsoft.sqlserver.jdbc.SQLServerException.makeFromDatabaseError(SQLServerException.java:258) at com.microsoft.sqlserver.jdbc.SQLServerStatement.getNextResult(SQLServerStatement.java:1535) at com.microsoft.sqlserver.jdbc.SQLServerStatement.doExecuteStatement(SQLServerStatement.java:845) at com.microsoft.sqlserver.jdbc.SQLServerStatement$StmtExecCmd.doExecute(SQLServerStatement.java:752) at com.microsoft.sqlserver.jdbc.TDSCommand.execute(IOBuffer.java:7151) at com.microsoft.sqlserver.jdbc.SQLServerConnection.executeCommand(SQLServerConnection.java:2478) at com.microsoft.sqlserver.jdbc.SQLServerStatement.executeCommand(SQLServerStatement.java:219) at com.microsoft.sqlserver.jdbc.SQLServerStatement.executeStatement(SQLServerStatement.java:199) at com.microsoft.sqlserver.jdbc.SQLServerStatement.executeUpdate(SQLServerStatement.java:680) at com.microsoft.sqlserver.jdbc.spark.BulkCopyUtils$.executeUpdate(BulkCopyUtils.scala:456) at com.microsoft.sqlserver.jdbc.spark.BulkCopyUtils$.mssqlCreateTable(BulkCopyUtils.scala:495) at com.microsoft.sqlserver.jdbc.spark.SingleInstanceConnector$.createTable(SingleInstanceConnector.scala:33) at com.microsoft.sqlserver.jdbc.spark.Connector.write(Connector.scala:60) at com.microsoft.sqlserver.jdbc.spark.DefaultSource.createRelation(DefaultSource.scala:51) at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45) at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70) at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68) at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86) at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:152) at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:140) at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$5.apply(SparkPlan.scala:193) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:189) at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:140) at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:117) at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:115) at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:711) at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:711) at org.apache.spark.sql.execution.SQLExecution$$anonfun$withCustomExecutionEnv$1.apply(SQLExecution.scala:113) at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:243) at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:99) at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:173) at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:711) at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:307) at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:293) at sun.reflect.GeneratedMethodAccessor841.invoke(Unknown Source) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380) at py4j.Gateway.invoke(Gateway.java:295) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.GatewayConnection.run(GatewayConnection.java:251) at java.lang.Thread.run(Thread.java:748)
The issue has now been resolved.
Previous code:
if condition1==True:
write_mode = 'overwrite'
elif condition2==True:
write_mode = 'append'
log_status_df_all = spark.createDataFrame(log_status_df_all)
log_status_df_all.write.format("com.microsoft.sqlserver.jdbc.spark").mode(write_mode).option("url", url).option("dbtable", 'Logs_Collection_Status').option("user", username).option("password", password).save()
log_status_df_all.show()
Current code:
write_mode = 'append'
log_status_df_all = spark.createDataFrame(log_status_df_all)
log_status_df_all.write.format("com.microsoft.sqlserver.jdbc.spark").mode(write_mode).option("url", url).option("dbtable", 'Logs_Collection_Status').option("user", username).option("password", password).save()
log_status_df_all.show()