plot no longer works after upgrade - pandas

I recently upgraded pandas to 1.1.5, using Python 3.6.4 and I can no longer plot any charts with a datetime index column.
See the example below, where I import a time series from a CSV file. I have also tried registering the matplotlib converters in case this was the issue. I get the error message shown below. Incidentally, seaborn also no longer works, but I'm not sure if that's relevant.
Thanks
import matplotlib.pyplot as plt
import pandas as pd
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
df = pd.read_csv('example.csv', parse_dates=True, index_col=0, dayfirst=True)
df.head()
Click here to see output for df.head()
df.plot()
I get the following error when I try to plot:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-37-848b80e64df8> in <module>()
----> 1 df.plot()
C:\Program Files\Anaconda3\lib\site-packages\pandas\plotting\_core.py in __call__(self, *args, **kwargs)
947 data.columns = label_name
948
--> 949 return plot_backend.plot(data, kind=kind, **kwargs)
950
951 __call__.__doc__ = __doc__
C:\Program Files\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\__init__.py in plot(data, kind, **kwargs)
59 kwargs["ax"] = getattr(ax, "left_ax", ax)
60 plot_obj = PLOT_CLASSES[kind](data, **kwargs)
---> 61 plot_obj.generate()
62 plot_obj.draw()
63 return plot_obj.result
C:\Program Files\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\core.py in generate(self)
269 self._compute_plot_data()
270 self._setup_subplots()
--> 271 self._make_plot()
272 self._add_table()
273 self._make_legend()
C:\Program Files\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\core.py in _make_plot(self)
1124 stacking_id=stacking_id,
1125 is_errorbar=is_errorbar,
-> 1126 **kwds,
1127 )
1128 self._add_legend_handle(newlines[0], label, index=i)
C:\Program Files\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\core.py in _plot(cls, ax, x, y, style, column_num, stacking_id, **kwds)
1143 cls._initialize_stacker(ax, stacking_id, len(y))
1144 y_values = cls._get_stacked_values(ax, stacking_id, y, kwds["label"])
-> 1145 lines = MPLPlot._plot(ax, x, y_values, style=style, **kwds)
1146 cls._update_stacker(ax, stacking_id, y)
1147 return lines
C:\Program Files\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\converter.py in wrapper(*args, **kwargs)
63 def wrapper(*args, **kwargs):
64 with pandas_converters():
---> 65 return func(*args, **kwargs)
66
67 return wrapper
C:\Program Files\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\core.py in _plot(cls, ax, x, y, style, is_errorbar, **kwds)
666 else:
667 args = (x, y)
--> 668 return ax.plot(*args, **kwds)
669
670 def _get_index_name(self):
C:\Program Files\Anaconda3\lib\site-packages\matplotlib\__init__.py in inner(ax, *args, **kwargs)
1715 warnings.warn(msg % (label_namer, func.__name__),
1716 RuntimeWarning, stacklevel=2)
-> 1717 return func(ax, *args, **kwargs)
1718 pre_doc = inner.__doc__
1719 if pre_doc is None:
C:\Program Files\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py in plot(self, *args, **kwargs)
1371
1372 for line in self._get_lines(*args, **kwargs):
-> 1373 self.add_line(line)
1374 lines.append(line)
1375
C:\Program Files\Anaconda3\lib\site-packages\matplotlib\axes\_base.py in add_line(self, line)
1777 line.set_clip_path(self.patch)
1778
-> 1779 self._update_line_limits(line)
1780 if not line.get_label():
1781 line.set_label('_line%d' % len(self.lines))
C:\Program Files\Anaconda3\lib\site-packages\matplotlib\axes\_base.py in _update_line_limits(self, line)
1799 Figures out the data limit of the given line, updating self.dataLim.
1800 """
-> 1801 path = line.get_path()
1802 if path.vertices.size == 0:
1803 return
C:\Program Files\Anaconda3\lib\site-packages\matplotlib\lines.py in get_path(self)
955 """
956 if self._invalidy or self._invalidx:
--> 957 self.recache()
958 return self._path
959
C:\Program Files\Anaconda3\lib\site-packages\matplotlib\lines.py in recache(self, always)
655 def recache(self, always=False):
656 if always or self._invalidx:
--> 657 xconv = self.convert_xunits(self._xorig)
658 x = _to_unmasked_float_array(xconv).ravel()
659 else:
C:\Program Files\Anaconda3\lib\site-packages\matplotlib\artist.py in convert_xunits(self, x)
189 if ax is None or ax.xaxis is None:
190 return x
--> 191 return ax.xaxis.convert_units(x)
192
193 def convert_yunits(self, y):
C:\Program Files\Anaconda3\lib\site-packages\matplotlib\axis.py in convert_units(self, x)
1489 return x
1490
-> 1491 ret = self.converter.convert(x, self.units, self)
1492 return ret
1493
C:\Program Files\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\converter.py in convert(values, unit, axis)
254 values = [DatetimeConverter._convert_1d(v, unit, axis) for v in values]
255 else:
--> 256 values = DatetimeConverter._convert_1d(values, unit, axis)
257 return values
258
C:\Program Files\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\converter.py in _convert_1d(values, unit, axis)
289 pass
290
--> 291 values = dates.date2num(values)
292
293 return values
C:\Program Files\Anaconda3\lib\site-packages\matplotlib\dates.py in date2num(d)
394 if not d.size:
395 return d
--> 396 return _to_ordinalf_np_vectorized(d)
397
398
C:\Program Files\Anaconda3\lib\site-packages\numpy\lib\function_base.py in __call__(self, *args, **kwargs)
2106 vargs.extend([kwargs[_n] for _n in names])
2107
-> 2108 return self._vectorize_call(func=func, args=vargs)
2109
2110 def _get_ufunc_and_otypes(self, func, args):
C:\Program Files\Anaconda3\lib\site-packages\numpy\lib\function_base.py in _vectorize_call(self, func, args)
2184 res = func()
2185 else:
-> 2186 ufunc, otypes = self._get_ufunc_and_otypes(func=func, args=args)
2187
2188 # Convert args to object arrays first
C:\Program Files\Anaconda3\lib\site-packages\numpy\lib\function_base.py in _get_ufunc_and_otypes(self, func, args)
2144
2145 inputs = [arg.flat[0] for arg in args]
-> 2146 outputs = func(*inputs)
2147
2148 # Performance note: profiling indicates that -- for simple
C:\Program Files\Anaconda3\lib\site-packages\matplotlib\dates.py in _to_ordinalf(dt)
243 tzi = UTC
244
--> 245 base = float(dt.toordinal())
246
247 # If it's sufficiently datetime-like, it will have a `date()` method
AttributeError: 'numpy.datetime64' object has no attribute 'toordinal'

The installed version of matplotlib (2.1.2) is out of date and is not compatible with the newer version of pandas (1.1.5). Upgrading matplotlib to 3.3.4 solves this issue, as discussed in the comments.

Related

s3fs FileNotFoundError

I am only able to gain limited, top-level access to my AWS S3 storage. I can see the buckets, but not their contents: neither subfolders nor files. I'm running everything from inside a conda environment. I've tried accessing files in both private and public buckets without success. What am I doing wrong?
This block of code works as expected
>>> import s3fs
>>> AKEY = 'XXXX'
>>> SKEY = 'XXXX'
>>> fs = s3fs.S3FileSystem(key=AKEY,secret=SKEY)
>>> fs.ls('s3://')
['my-bucket-1',
'my-bucket-2',
'my-bucket-3']
This block doesn't
>>> fs.ls('s3://my-bucket-1')
[]
what I expect
>>> fs.ls('s3://my-bucket-1')
['my-bucket-1/test.txt',
'my-bucket-1/test.csv']
When I try to open a file I get a FileNotFoundError
import pandas as pd
pd.read_csv(
's3://my-bucket-1/test.csv',
storage_options={'key':AKEY,'secret':SKEY}
)
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[8], line 2
1 import pandas as pd
----> 2 pd.read_csv(
3 's3://my-bucket-1/test.csv',
4 storage_options={'key':AKEY,'secret':SKEY}
5 )
File ~\anaconda3\envs\env-2\lib\site-packages\pandas\util\_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs)
209 else:
210 kwargs[new_arg_name] = new_arg_value
--> 211 return func(*args, **kwargs)
File ~\anaconda3\envs\env-2\lib\site-packages\pandas\util\_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
325 if len(args) > num_allow_args:
326 warnings.warn(
327 msg.format(arguments=_format_argument_list(allow_args)),
328 FutureWarning,
329 stacklevel=find_stack_level(),
330 )
--> 331 return func(*args, **kwargs)
File ~\anaconda3\envs\env-2\lib\site-packages\pandas\io\parsers\readers.py:950, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
935 kwds_defaults = _refine_defaults_read(
936 dialect,
937 delimiter,
(...)
946 defaults={"delimiter": ","},
947 )
948 kwds.update(kwds_defaults)
--> 950 return _read(filepath_or_buffer, kwds)
File ~\anaconda3\envs\env-2\lib\site-packages\pandas\io\parsers\readers.py:605, in _read(filepath_or_buffer, kwds)
602 _validate_names(kwds.get("names", None))
604 # Create the parser.
--> 605 parser = TextFileReader(filepath_or_buffer, **kwds)
607 if chunksize or iterator:
608 return parser
File ~\anaconda3\envs\env-2\lib\site-packages\pandas\io\parsers\readers.py:1442, in TextFileReader.__init__(self, f, engine, **kwds)
1439 self.options["has_index_names"] = kwds["has_index_names"]
1441 self.handles: IOHandles | None = None
-> 1442 self._engine = self._make_engine(f, self.engine)
File ~\anaconda3\envs\env-2\lib\site-packages\pandas\io\parsers\readers.py:1735, in TextFileReader._make_engine(self, f, engine)
1733 if "b" not in mode:
1734 mode += "b"
-> 1735 self.handles = get_handle(
1736 f,
1737 mode,
1738 encoding=self.options.get("encoding", None),
1739 compression=self.options.get("compression", None),
1740 memory_map=self.options.get("memory_map", False),
1741 is_text=is_text,
1742 errors=self.options.get("encoding_errors", "strict"),
1743 storage_options=self.options.get("storage_options", None),
1744 )
1745 assert self.handles is not None
1746 f = self.handles.handle
File ~\anaconda3\envs\env-2\lib\site-packages\pandas\io\common.py:713, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
710 codecs.lookup_error(errors)
712 # open URLs
--> 713 ioargs = _get_filepath_or_buffer(
714 path_or_buf,
715 encoding=encoding,
716 compression=compression,
717 mode=mode,
718 storage_options=storage_options,
719 )
721 handle = ioargs.filepath_or_buffer
722 handles: list[BaseBuffer]
File ~\anaconda3\envs\env-2\lib\site-packages\pandas\io\common.py:409, in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
406 pass
408 try:
--> 409 file_obj = fsspec.open(
410 filepath_or_buffer, mode=fsspec_mode, **(storage_options or {})
411 ).open()
412 # GH 34626 Reads from Public Buckets without Credentials needs anon=True
413 except tuple(err_types_to_retry_with_anon):
File ~\anaconda3\envs\env-2\lib\site-packages\fsspec\core.py:135, in OpenFile.open(self)
128 def open(self):
129 """Materialise this as a real open file without context
130
131 The OpenFile object should be explicitly closed to avoid enclosed file
132 instances persisting. You must, therefore, keep a reference to the OpenFile
133 during the life of the file-like it generates.
134 """
--> 135 return self.__enter__()
File ~\anaconda3\envs\env-2\lib\site-packages\fsspec\core.py:103, in OpenFile.__enter__(self)
100 def __enter__(self):
101 mode = self.mode.replace("t", "").replace("b", "") + "b"
--> 103 f = self.fs.open(self.path, mode=mode)
105 self.fobjects = [f]
107 if self.compression is not None:
File ~\anaconda3\envs\env-2\lib\site-packages\fsspec\spec.py:1106, in AbstractFileSystem.open(self, path, mode, block_size, cache_options, compression, **kwargs)
1104 else:
1105 ac = kwargs.pop("autocommit", not self._intrans)
-> 1106 f = self._open(
1107 path,
1108 mode=mode,
1109 block_size=block_size,
1110 autocommit=ac,
1111 cache_options=cache_options,
1112 **kwargs,
1113 )
1114 if compression is not None:
1115 from fsspec.compression import compr
File ~\anaconda3\envs\env-2\lib\site-packages\s3fs\core.py:640, in S3FileSystem._open(self, path, mode, block_size, acl, version_id, fill_cache, cache_type, autocommit, requester_pays, cache_options, **kwargs)
637 if cache_type is None:
638 cache_type = self.default_cache_type
--> 640 return S3File(
641 self,
642 path,
643 mode,
644 block_size=block_size,
645 acl=acl,
646 version_id=version_id,
647 fill_cache=fill_cache,
648 s3_additional_kwargs=kw,
649 cache_type=cache_type,
650 autocommit=autocommit,
651 requester_pays=requester_pays,
652 cache_options=cache_options,
653 )
File ~\anaconda3\envs\env-2\lib\site-packages\s3fs\core.py:1989, in S3File.__init__(self, s3, path, mode, block_size, acl, version_id, fill_cache, s3_additional_kwargs, autocommit, cache_type, requester_pays, cache_options)
1987 self.details = s3.info(path)
1988 self.version_id = self.details.get("VersionId")
-> 1989 super().__init__(
1990 s3,
1991 path,
1992 mode,
1993 block_size,
1994 autocommit=autocommit,
1995 cache_type=cache_type,
1996 cache_options=cache_options,
1997 )
1998 self.s3 = self.fs # compatibility
2000 # when not using autocommit we want to have transactional state to manage
File ~\anaconda3\envs\env-2\lib\site-packages\fsspec\spec.py:1462, in AbstractBufferedFile.__init__(self, fs, path, mode, block_size, autocommit, cache_type, cache_options, size, **kwargs)
1460 self.size = size
1461 else:
-> 1462 self.size = self.details["size"]
1463 self.cache = caches[cache_type](
1464 self.blocksize, self._fetch_range, self.size, **cache_options
1465 )
1466 else:
File ~\anaconda3\envs\env-2\lib\site-packages\fsspec\spec.py:1475, in AbstractBufferedFile.details(self)
1472 @property
1473 def details(self):
1474 if self._details is None:
-> 1475 self._details = self.fs.info(self.path)
1476 return self._details
File ~\anaconda3\envs\env-2\lib\site-packages\fsspec\asyn.py:113, in sync_wrapper.<locals>.wrapper(*args, **kwargs)
110 @functools.wraps(func)
111 def wrapper(*args, **kwargs):
112 self = obj or args[0]
--> 113 return sync(self.loop, func, *args, **kwargs)
File ~\anaconda3\envs\env-2\lib\site-packages\fsspec\asyn.py:98, in sync(loop, func, timeout, *args, **kwargs)
96 raise FSTimeoutError from return_result
97 elif isinstance(return_result, BaseException):
---> 98 raise return_result
99 else:
100 return return_result
File ~\anaconda3\envs\env-2\lib\site-packages\fsspec\asyn.py:53, in _runner(event, coro, result, timeout)
51 coro = asyncio.wait_for(coro, timeout=timeout)
52 try:
---> 53 result[0] = await coro
54 except Exception as ex:
55 result[0] = ex
File ~\anaconda3\envs\env-2\lib\site-packages\s3fs\core.py:1257, in S3FileSystem._info(self, path, bucket, key, refresh, version_id)
1245 if (
1246 out.get("KeyCount", 0) > 0
1247 or out.get("Contents", [])
1248 or out.get("CommonPrefixes", [])
1249 ):
1250 return {
1251 "name": "/".join([bucket, key]),
1252 "type": "directory",
1253 "size": 0,
1254 "StorageClass": "DIRECTORY",
1255 }
-> 1257 raise FileNotFoundError(path)
1258 except ClientError as e:
1259 raise translate_boto_error(e, set_cause=False)
FileNotFoundError: my-bucket-1/test.csv
s3fs-2022.11.0, aiobotocore-2.4.0, botocore-1.27.59
fs = s3fs.S3FileSystem(anon=True)
fs.ls('s3://dask-data/nyc-taxi/2015')
ParseError
Check the bucket policy / IAM role that gives you permissions to access the bucket. It should have /* after the name of the resource:
"Action": "s3:GetObject",
"Resource": "arn:aws:s3:::my-bucket-1/*"
to allow you access the objects in the bucket, not just the bucket itself.
Have you tried boto3? s3fs is no longer supported.

tight_layout KeyError default, matplotlib widget

Using JupyterLab, I receive a KeyError: 'default' when using plt.tight_layout() in combination with %matplotlib widget. The following code reproduces the issue:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib widget
x=np.linspace(0,10)
y=x**2
plt.plot(x,y)
plt.tight_layout()
The complete error message is the following:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~/anaconda3/lib/python3.8/site-packages/matplotlib/backend_bases.py in _wait_cursor_for_draw_cm(self)
3024 try:
-> 3025 self.canvas.set_cursor(tools.Cursors.WAIT)
3026 yield
~/anaconda3/lib/python3.8/site-packages/matplotlib/backends/backend_webagg_core.py in set_cursor(self, cursor)
209 }, cursor=cursor)
--> 210 self.send_event('cursor', cursor=cursor)
211
~/anaconda3/lib/python3.8/site-packages/matplotlib/backends/backend_webagg_core.py in send_event(self, event_type, **kwargs)
391 if self.manager:
--> 392 self.manager._send_event(event_type, **kwargs)
393
~/anaconda3/lib/python3.8/site-packages/matplotlib/backends/backend_webagg_core.py in _send_event(self, event_type, **kwargs)
540 for s in self.web_sockets:
--> 541 s.send_json(payload)
542
~/anaconda3/lib/python3.8/site-packages/ipympl/backend_nbagg.py in send_json(self, content)
180 if content['type'] == 'cursor':
--> 181 self._cursor = cursors_str[content['cursor']]
182
KeyError: 'wait'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
/tmp/ipykernel_119035/3466922198.py in <module>
7 y=x**2
8 plt.plot(x,y)
----> 9 plt.tight_layout()
~/anaconda3/lib/python3.8/site-packages/matplotlib/pyplot.py in tight_layout(pad, h_pad, w_pad, rect)
2300 @_copy_docstring_and_deprecators(Figure.tight_layout)
2301 def tight_layout(*, pad=1.08, h_pad=None, w_pad=None, rect=None):
-> 2302 return gcf().tight_layout(pad=pad, h_pad=h_pad, w_pad=w_pad, rect=rect)
2303
2304
~/anaconda3/lib/python3.8/site-packages/matplotlib/figure.py in tight_layout(self, pad, h_pad, w_pad, rect)
3186 "compatible with tight_layout, so results "
3187 "might be incorrect.")
-> 3188 renderer = _get_renderer(self)
3189 with getattr(renderer, "_draw_disabled", nullcontext)():
3190 kwargs = get_tight_layout_figure(
~/anaconda3/lib/python3.8/site-packages/matplotlib/backend_bases.py in _get_renderer(figure, print_method)
1542 figure.canvas._get_output_canvas(None, fmt), f"print_{fmt}")
1543 try:
-> 1544 print_method(io.BytesIO())
1545 except Done as exc:
1546 renderer, = figure._cachedRenderer, = exc.args
~/anaconda3/lib/python3.8/site-packages/matplotlib/backend_bases.py in wrapper(*args, **kwargs)
1641 kwargs.pop(arg)
1642
-> 1643 return func(*args, **kwargs)
1644
1645 return wrapper
~/anaconda3/lib/python3.8/site-packages/matplotlib/_api/deprecation.py in wrapper(*inner_args, **inner_kwargs)
410 else deprecation_addendum,
411 **kwargs)
--> 412 return func(*inner_args, **inner_kwargs)
413
414 DECORATORS[wrapper] = decorator
~/anaconda3/lib/python3.8/site-packages/matplotlib/backends/backend_agg.py in print_png(self, filename_or_obj, metadata, pil_kwargs, *args)
538 *metadata*, including the default 'Software' key.
539 """
--> 540 FigureCanvasAgg.draw(self)
541 mpl.image.imsave(
542 filename_or_obj, self.buffer_rgba(), format="png", origin="upper",
~/anaconda3/lib/python3.8/site-packages/matplotlib/backends/backend_agg.py in draw(self)
431 self.renderer = self.get_renderer(cleared=True)
432 # Acquire a lock on the shared font cache.
--> 433 with RendererAgg.lock, \
434 (self.toolbar._wait_cursor_for_draw_cm() if self.toolbar
435 else nullcontext()):
~/anaconda3/lib/python3.8/contextlib.py in __enter__(self)
111 del self.args, self.kwds, self.func
112 try:
--> 113 return next(self.gen)
114 except StopIteration:
115 raise RuntimeError("generator didn't yield") from None
~/anaconda3/lib/python3.8/site-packages/matplotlib/backend_bases.py in _wait_cursor_for_draw_cm(self)
3026 yield
3027 finally:
-> 3028 self.canvas.set_cursor(self._lastCursor)
3029 else:
3030 yield
~/anaconda3/lib/python3.8/site-packages/matplotlib/backends/backend_webagg_core.py in set_cursor(self, cursor)
208 backend_tools.Cursors.RESIZE_VERTICAL: 'ns-resize',
209 }, cursor=cursor)
--> 210 self.send_event('cursor', cursor=cursor)
211
212 def set_image_mode(self, mode):
~/anaconda3/lib/python3.8/site-packages/matplotlib/backends/backend_webagg_core.py in send_event(self, event_type, **kwargs)
390 def send_event(self, event_type, **kwargs):
391 if self.manager:
--> 392 self.manager._send_event(event_type, **kwargs)
393
394
~/anaconda3/lib/python3.8/site-packages/matplotlib/backends/backend_webagg_core.py in _send_event(self, event_type, **kwargs)
539 payload = {'type': event_type, **kwargs}
540 for s in self.web_sockets:
--> 541 s.send_json(payload)
542
543
~/anaconda3/lib/python3.8/site-packages/ipympl/backend_nbagg.py in send_json(self, content)
179 # Change in the widget state?
180 if content['type'] == 'cursor':
--> 181 self._cursor = cursors_str[content['cursor']]
182
183 elif content['type'] == 'message':
KeyError: 'default'

Dask Cluster: AttributeError: 'DataFrame' object has no attribute '_data'

I'm working with a Dask Cluster on GCP. I'm using this code to deploy it:
from dask_cloudprovider.gcp import GCPCluster
from dask.distributed import Client
enviroment_vars = {
'EXTRA_PIP_PACKAGES': '"gcsfs"'
}
cluster = GCPCluster(
n_workers=32,
docker_image='daskdev/dask:2021.2.0',
env_vars=enviroment_vars,
network='my-network',
#filesystem_size=150,
machine_type='e2-standard-16',
projectid='my-project-id',
zone='us-central1-a',
on_host_maintenance="MIGRATE"
client = Client(cluster)
Then I read csv files, with the following code:
import dask.dataframe as dd
import csv
col_dtypes = {
'var1': 'float64',
'var2': 'object',
'var3': 'object',
'var4': 'float64'
}
df = dd.read_csv('gs://my_bucket/files-*.csv', blocksize=None, dtype= col_dtypes)
df = df.persist()
Everything works fine, but when I try to run some queries or calculations, I get an error. For instance, this piece of code:
df.var1.value_counts().compute()
This is the output:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-14-711a7c21ed42> in <module>
----> 1 df.var1.value_counts().compute()
/opt/conda/lib/python3.8/site-packages/dask/base.py in compute(self, **kwargs)
279 dask.base.compute
280 """
--> 281 (result,) = compute(self, traverse=False, **kwargs)
282 return result
283
/opt/conda/lib/python3.8/site-packages/dask/base.py in compute(*args, **kwargs)
561 postcomputes.append(x.__dask_postcompute__())
562
--> 563 results = schedule(dsk, keys, **kwargs)
564 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
565
/opt/conda/lib/python3.8/site-packages/distributed/client.py in get(self, dsk, keys, workers, allow_other_workers, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2653 should_rejoin = False
2654 try:
-> 2655 results = self.gather(packed, asynchronous=asynchronous, direct=direct)
2656 finally:
2657 for f in futures.values():
/opt/conda/lib/python3.8/site-packages/distributed/client.py in gather(self, futures, errors, direct, asynchronous)
1962 else:
1963 local_worker = None
-> 1964 return self.sync(
1965 self._gather,
1966 futures,
/opt/conda/lib/python3.8/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
836 return future
837 else:
--> 838 return sync(
839 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
840 )
/opt/conda/lib/python3.8/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
338 if error[0]:
339 typ, exc, tb = error[0]
--> 340 raise exc.with_traceback(tb)
341 else:
342 return result[0]
/opt/conda/lib/python3.8/site-packages/distributed/utils.py in f()
322 if callback_timeout is not None:
323 future = asyncio.wait_for(future, callback_timeout)
--> 324 result[0] = yield future
325 except Exception as exc:
326 error[0] = sys.exc_info()
/opt/conda/lib/python3.8/site-packages/tornado/gen.py in run(self)
760
761 try:
--> 762 value = future.result()
763 except Exception:
764 exc_info = sys.exc_info()
/opt/conda/lib/python3.8/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
1827 exc = CancelledError(key)
1828 else:
-> 1829 raise exception.with_traceback(traceback)
1830 raise exc
1831 if errors == "skip":
/opt/conda/lib/python3.8/site-packages/dask/optimization.py in __call__()
961 if not len(args) == len(self.inkeys):
962 raise ValueError("Expected %d args, got %d" % (len(self.inkeys), len(args)))
--> 963 return core.get(self.dsk, self.outkey, dict(zip(self.inkeys, args)))
964
965 def __reduce__(self):
/opt/conda/lib/python3.8/site-packages/dask/core.py in get()
149 for key in toposort(dsk):
150 task = dsk[key]
--> 151 result = _execute_task(task, cache)
152 cache[key] = result
153 result = _execute_task(out, cache)
/opt/conda/lib/python3.8/site-packages/dask/core.py in _execute_task()
119 # temporaries by their reference count and can execute certain
120 # operations in-place.
--> 121 return func(*(_execute_task(a, cache) for a in args))
122 elif not ishashable(arg):
123 return arg
/opt/conda/lib/python3.8/site-packages/dask/utils.py in apply()
33 def apply(func, args, kwargs=None):
34 if kwargs:
---> 35 return func(*args, **kwargs)
36 else:
37 return func(*args)
/opt/conda/lib/python3.8/site-packages/dask/dataframe/core.py in apply_and_enforce()
5474 return meta
5475 if is_dataframe_like(df):
-> 5476 check_matching_columns(meta, df)
5477 c = meta.columns
5478 else:
/opt/conda/lib/python3.8/site-packages/dask/dataframe/utils.py in check_matching_columns()
690 def check_matching_columns(meta, actual):
691 # Need nan_to_num otherwise nan comparison gives False
--> 692 if not np.array_equal(np.nan_to_num(meta.columns), np.nan_to_num(actual.columns)):
693 extra = methods.tolist(actual.columns.difference(meta.columns))
694 missing = methods.tolist(meta.columns.difference(actual.columns))
/opt/conda/lib/python3.8/site-packages/pandas/core/generic.py in __getattr__()
5268 or name in self._accessors
5269 ):
-> 5270 return object.__getattribute__(self, name)
5271 else:
5272 if self._info_axis._can_hold_identifiers_and_holds_name(name):
pandas/_libs/properties.pyx in pandas._libs.properties.AxisProperty.__get__()
/opt/conda/lib/python3.8/site-packages/pandas/core/generic.py in __getattr__()
5268 or name in self._accessors
5269 ):
-> 5270 return object.__getattribute__(self, name)
5271 else:
5272 if self._info_axis._can_hold_identifiers_and_holds_name(name):
AttributeError: 'DataFrame' object has no attribute '_data'
The version of pandas in my Docker image is 1.0.1, so I already tried upgrading pandas (to version 1.2.2), but it didn't work. What am I doing wrong?
My guess is that you have a version mismatch somewhere. What does client.get_versions(check=True) say?

How to convert coordinate columns to Point column with Shapely and Dask?

I have the following problem. My data is a huge dataframe that looks like this (this is the head of the dataframe):
import pandas
import dask.dataframe as dd
data = dd.read_csv(data_path)
data.persist()
print(data.head())
Gitter_ID_100m x_mp_100m y_mp_100m Einwohner
0 100mN26840E43341 4334150 2684050 -1
1 100mN26840E43342 4334250 2684050 -1
2 100mN26840E43343 4334350 2684050 -1
3 100mN26840E43344 4334450 2684050 -1
4 100mN26840E43345 4334550 2684050 -1
I am using Dask to handle it. I now want to create a new column where the 'x_mp_100m' and 'y_mp_100m' are converted into a Shapely Point. For a single row, it would look like this:
from shapely.geometry import Point
test_df = data.head(1)
test_df = test_df.assign(geom=lambda k: Point(k.x_mp_100m,k.y_mp_100m))
print(test_df)
Gitter_ID_100m x_mp_100m y_mp_100m Einwohner geom
0 100mN26840E43341 4334150 2684050 -1 POINT (4334150 2684050)
I already tried the following code with Dask:
data_out = data.map_partitions(lambda df: df.assign(geom= lambda k: Point(k.x_mp_100m,k.y_mp_100m)), meta=pd.DataFrame)
When doing that, I get the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-17-b8de11d9b9b3> in <module>
----> 1 data_out.compute()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(self, **kwargs)
154 dask.base.compute
155 """
--> 156 (result,) = compute(self, traverse=False, **kwargs)
157 return result
158
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(*args, **kwargs)
395 keys = [x.__dask_keys__() for x in collections]
396 postcomputes = [x.__dask_postcompute__() for x in collections]
--> 397 results = schedule(dsk, keys, **kwargs)
398 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
399
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2319 try:
2320 results = self.gather(packed, asynchronous=asynchronous,
-> 2321 direct=direct)
2322 finally:
2323 for f in futures.values():
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in gather(self, futures, errors, maxsize, direct, asynchronous)
1653 return self.sync(self._gather, futures, errors=errors,
1654 direct=direct, local_worker=local_worker,
-> 1655 asynchronous=asynchronous)
1656
1657 @gen.coroutine
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in sync(self, func, *args, **kwargs)
671 return future
672 else:
--> 673 return sync(self.loop, func, *args, **kwargs)
674
675 def __repr__(self):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\utils.py in sync(loop, func, *args, **kwargs)
275 e.wait(10)
276 if error[0]:
--> 277 six.reraise(*error[0])
278 else:
279 return result[0]
~\AppData\Local\Continuum\anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\utils.py in f()
260 if timeout is not None:
261 future = gen.with_timeout(timedelta(seconds=timeout), future)
--> 262 result[0] = yield future
263 except Exception as exc:
264 error[0] = sys.exc_info()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py in run(self)
1131
1132 try:
-> 1133 value = future.result()
1134 except Exception:
1135 self.had_exception = True
~\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py in run(self)
1139 if exc_info is not None:
1140 try:
-> 1141 yielded = self.gen.throw(*exc_info)
1142 finally:
1143 # Break up a reference to itself
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in _gather(self, futures, errors, direct, local_worker)
1498 six.reraise(type(exception),
1499 exception,
-> 1500 traceback)
1501 if errors == 'skip':
1502 bad_keys.add(key)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
690 value = tp()
691 if value.__traceback__ is not tb:
--> 692 raise value.with_traceback(tb)
693 raise value
694 finally:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\dataframe\core.py in apply_and_enforce()
3682
3683 Ensures the output has the same columns, even if empty."""
-> 3684 df = func(*args, **kwargs)
3685 if isinstance(df, (pd.DataFrame, pd.Series, pd.Index)):
3686 if len(df) == 0:
<ipython-input-16-d5710cb00158> in <lambda>()
----> 1 data_out = data.map_partitions(lambda df: df.assign(geom= lambda k: Point(k.x_mp_100m,k.y_mp_100m)), meta=pd.DataFrame)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in assign()
3549 if PY36:
3550 for k, v in kwargs.items():
-> 3551 data[k] = com.apply_if_callable(v, data)
3552 else:
3553 # <= 3.5: do all calculations first...
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\common.py in apply_if_callable()
327
328 if callable(maybe_callable):
--> 329 return maybe_callable(obj, **kwargs)
330
331 return maybe_callable
<ipython-input-16-d5710cb00158> in <lambda>()
----> 1 data_out = data.map_partitions(lambda df: df.assign(geom= lambda k: Point(k.x_mp_100m,k.y_mp_100m)), meta=pd.DataFrame)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\shapely\geometry\point.py in __init__()
47 BaseGeometry.__init__(self)
48 if len(args) > 0:
---> 49 self._set_coords(*args)
50
51 # Coordinate getters and setters
~\AppData\Local\Continuum\anaconda3\lib\site-packages\shapely\geometry\point.py in _set_coords()
130 self._geom, self._ndim = geos_point_from_py(args[0])
131 else:
--> 132 self._geom, self._ndim = geos_point_from_py(tuple(args))
133
134 coords = property(BaseGeometry._get_coords, _set_coords)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\shapely\geometry\point.py in geos_point_from_py()
207 coords = ob
208 n = len(coords)
--> 209 dx = c_double(coords[0])
210 dy = c_double(coords[1])
211 dz = None
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\series.py in wrapper()
91 return converter(self.iloc[0])
92 raise TypeError("cannot convert the series to "
---> 93 "{0}".format(str(converter)))
94
95 wrapper.__name__ = "__{name}__".format(name=converter.__name__)
TypeError: cannot convert the series to <class 'float'>
So I think I am using the pandas assign() function in the wrong way, or there may be a better-suited function; I just cannot seem to wrap my head around it. Do you know a better way to handle this?
I also found this way:
data_out = data.map_partitions(lambda df: df.apply(lambda row: Point(row['x_mp_100m'],row['y_mp_100m']), axis=1))
But is that the most efficient way?
What you're doing seems fine. I would either find a function that works well on a single row and then use the apply method, or find a function that works well on a single pandas dataframe and then use the map_partitions method.
For the error that you're getting, I would first verify that your function works on a plain pandas dataframe.

Network Plot Error Using Python / iPython

The following starter code errors out (AttributeError: incompatible shape for a non-contiguous array) when I try to use networkx on OSX 10.8.2, in an IPython notebook.
import networkx as nx
g = nx.Graph()
g.add_node(1)
g.add_node(2)
g.add_edge(1,2) # no error if this line is omitted
nx.draw(g)
I do not see the error if I do not add the edge to this graph. Matplotlib, Numpy, and Scipy were all installed as suggested here.
Below is the error message:
AttributeError Traceback (most recent call last)
<ipython-input-33-df7dfeff6452> in <module>()
----> 1 nx.draw(g)
/usr/local/lib/python2.7/site-packages/networkx/drawing/nx_pylab.pyc in draw(G, pos, ax, hold, **kwds)
131 pylab.hold(h)
132 try:
--> 133 draw_networkx(G,pos=pos,ax=ax,**kwds)
134 ax.set_axis_off()
135 pylab.draw_if_interactive()
/usr/local/lib/python2.7/site-packages/networkx/drawing/nx_pylab.pyc in draw_networkx(G, pos, with_labels, **kwds)
265
266 node_collection=draw_networkx_nodes(G, pos, **kwds)
--> 267 edge_collection=draw_networkx_edges(G, pos, **kwds)
268 if with_labels:
269 draw_networkx_labels(G, pos, **kwds)
/usr/local/lib/python2.7/site-packages/networkx/drawing/nx_pylab.pyc in draw_networkx_edges(G, pos, edgelist, width, edge_color, style, alpha, edge_cmap, edge_vmin, edge_vmax, ax, arrows, label, **kwds)
544 edge_collection.set_zorder(1) # edges go behind nodes
545 edge_collection.set_label(label)
--> 546 ax.add_collection(edge_collection)
547
548 # Note: there was a bug in mpl regarding the handling of alpha values for
/usr/local/lib/python2.7/site-packages/matplotlib/axes.pyc in add_collection(self, collection, autolim)
1443 if autolim:
1444 if collection._paths and len(collection._paths):
-> 1445 self.update_datalim(collection.get_datalim(self.transData))
1446
1447 collection._remove_method = lambda h: self.collections.remove(h)
/usr/local/lib/python2.7/site-packages/matplotlib/collections.pyc in get_datalim(self, transData)
165 offsets = offsets.filled(np.nan)
166 # get_path_collection_extents handles nan but not masked arrays
--> 167 offsets.shape = (-1, 2) # Make it Nx2
168
169 result = mpath.get_path_collection_extents(
AttributeError: incompatible shape for a non-contiguous array
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/usr/local/lib/python2.7/site-packages/IPython/zmq/pylab/backend_inline.pyc in show(close)
100 try:
101 for figure_manager in Gcf.get_all_fig_managers():
--> 102 send_figure(figure_manager.canvas.figure)
103 finally:
104 show._to_draw = []
/usr/local/lib/python2.7/site-packages/IPython/zmq/pylab/backend_inline.pyc in send_figure(fig)
209 """
210 fmt = InlineBackend.instance().figure_format
--> 211 data = print_figure(fig, fmt)
212 # print_figure will return None if there's nothing to draw:
213 if data is None:
/usr/local/lib/python2.7/site-packages/IPython/core/pylabtools.pyc in print_figure(fig, fmt)
102 try:
103 bytes_io = BytesIO()
--> 104 fig.canvas.print_figure(bytes_io, format=fmt, bbox_inches='tight')
105 data = bytes_io.getvalue()
106 finally:
/usr/local/lib/python2.7/site-packages/matplotlib/backend_bases.pyc in print_figure(self, filename, dpi, facecolor, edgecolor, orientation, format, **kwargs)
1981 orientation=orientation,
1982 dryrun=True,
-> 1983 **kwargs)
1984 renderer = self.figure._cachedRenderer
1985 bbox_inches = self.figure.get_tightbbox(renderer)
/usr/local/lib/python2.7/site-packages/matplotlib/backends/backend_agg.pyc in print_png(self, filename_or_obj, *args, **kwargs)
467
468 def print_png(self, filename_or_obj, *args, **kwargs):
--> 469 FigureCanvasAgg.draw(self)
470 renderer = self.get_renderer()
471 original_dpi = renderer.dpi
/usr/local/lib/python2.7/site-packages/matplotlib/backends/backend_agg.pyc in draw(self)
419
420 try:
--> 421 self.figure.draw(self.renderer)
422 finally:
423 RendererAgg.lock.release()
/usr/local/lib/python2.7/site-packages/matplotlib/artist.pyc in draw_wrapper(artist, renderer, *args, **kwargs)
53 def draw_wrapper(artist, renderer, *args, **kwargs):
54 before(artist, renderer)
---> 55 draw(artist, renderer, *args, **kwargs)
56 after(artist, renderer)
57
/usr/local/lib/python2.7/site-packages/matplotlib/figure.pyc in draw(self, renderer)
896 dsu.sort(key=itemgetter(0))
897 for zorder, a, func, args in dsu:
--> 898 func(*args)
899
900 renderer.close_group('figure')
/usr/local/lib/python2.7/site-packages/matplotlib/artist.pyc in draw_wrapper(artist, renderer, *args, **kwargs)
53 def draw_wrapper(artist, renderer, *args, **kwargs):
54 before(artist, renderer)
---> 55 draw(artist, renderer, *args, **kwargs)
56 after(artist, renderer)
57
/usr/local/lib/python2.7/site-packages/matplotlib/axes.pyc in draw(self, renderer, inframe)
1995
1996 for zorder, a in dsu:
-> 1997 a.draw(renderer)
1998
1999 renderer.close_group('axes')
/usr/local/lib/python2.7/site-packages/matplotlib/artist.pyc in draw_wrapper(artist, renderer, *args, **kwargs)
53 def draw_wrapper(artist, renderer, *args, **kwargs):
54 before(artist, renderer)
---> 55 draw(artist, renderer, *args, **kwargs)
56 after(artist, renderer)
57
/usr/local/lib/python2.7/site-packages/matplotlib/collections.pyc in draw(self, renderer)
227 self.update_scalarmappable()
228
--> 229 transform, transOffset, offsets, paths = self._prepare_points()
230
231 gc = renderer.new_gc()
/usr/local/lib/python2.7/site-packages/matplotlib/collections.pyc in _prepare_points(self)
203
204 offsets = np.asanyarray(offsets, np.float_)
--> 205 offsets.shape = (-1, 2) # Make it Nx2
206
207 if not transform.is_affine:
AttributeError: incompatible shape for a non-contiguous array
This was reported earlier in the question "Draw a graph with edges using Networkx". It is likely a bug/feature in the development version of the NumPy library, as mentioned above (numpy issue 2700).