pandas error when reading file

I am new to Jupyter and only started trying to plot with it last week. I have huge Excel sheets in .csv format that I want to read and plot. I am doing this by converting the .csv files to .dat format.
My code looks like this:
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
file1 = 'king2.dat'
file2 = 'iso.dat'
data1 = pd.read_csv(file1, delimiter='\s+', header=None, engine='python')
data1.columns = ['no_plt', 'Op_RA_plt', 'Op_DE_plt', 'Vmag_plt', 'B-V_plt', '(B-V)o_plt', 'no_2_plt', 'FUV_mag_plt','FUV_magerr_plt',
'(FUV-V)_plt', '(V-I)_plt', '(FUV-I)_plt',
'no_op_2M','Op_RA_2M','Op_DE_2M','Vmag_2M','(B-V)o_2M',
'FUV_mag_2M','FUV-V_2M','no_2M','j_m_2M','h_m_2M','k_m_2M','j-h_2M',
'h-k_2M','j-k_2M','(V-I)_2M','(FUV-I)_2M',
'no_MS','Op_RA_MS','Op_DE_MS','Vmag_MS','(B-V)o_MS',
'FUV_mag_MS','FUV-V_MS','(V-I)_MS','(FUV-I)_MS',
'no_508','Op_RA_508','Op_DE_508','Vmag_508','(B-V)o_508',
'FUV_mag_508','FUV-V_508', '(V-I)_508','(FUV-I)_508',
'no_RG','Op_RA_RG','Op_DE_RG','Vmag_RG','(B-V)o_RG','FUV_mag_RG',
'FUV-V_RG','(V-I)_RG','(FUV-I)_RG','no_RG609','Op_RA_RG609',
'Op_DE_RG609','Vmag_RG609','(B-V)o_RG609','FUV_mag_RG609',
'FUV-V_RG609', '(V-I)_RG609', '(FUV-I)_RG609',
'no_TF621','Op_RA_TF621','Op_DE_TF621','Vmag_TF621','(B-V)o_TF621',
'FUV_mag_TF621','FUV-V_TF621','(V-I)_TF621','(FUV-I)_TF621',
'no_onBSS','Op_RA_onBSS','Op_DE_onBSS','Vmag_onBSS','(B-V)o_onBSS',
'FUV_mag_onBSS','FUV-V_onBSS','(V-I)_onBSs','(FUV-I)_onBSS',
'no_BSSreg','Op_RA_BSSreg','Op_DE_BSSreg','Vmag_BSSreg',
'(B-V)o_BSSreg','FUV_mag_BSSreg','FUV-V_BSSreg','(V-I)_BSSreg',
'(FUV-I)_BSSreg','no_BSSreg558', 'Op_RA_BSSreg558' , 'Op_DE_BSSreg558','Vmag_BSSreg558','(B-V)o_BSSreg558',
'FUV_mag_BSSreg558','FUV-V_BSSreg558','(V-I)_BSSreg558','(FUV-I)_BSSreg558','no_aMS','Op_RA_aMS',
'Op_DE_aMS','Vmag_aMS','(B-V)o_aMS','FUV_mag_aMS','FUV-V_aMS','(V-I)_aMS','(FUV-I)_aMS','no_bMS',
'Op_RA_bMS','Op_DE_bMS','Vmag_bMS','(B-V)o_bMS','FUV_mag_bMS','FUV-V_bMS','(V-I)_bMS',
'(FUV-I)_bMS','no _SED','Op_RA _SED','Op_DE _SED','Vmag _SED','(B-V)o _SED','FUV_mag _SED',
'FUV-V _SED','(V-I)_SED','(FUV-I)_SED']
data2 = pd.read_csv(file2, delimiter='\s+', header=None, engine='python')
data2.columns =['age','log(Z)','mass','logl','logt','logg',
'FUVCa_14.76','NUVB15_14.76', '(FUV-NUV)14.76','V14.76','B14.76',
'B-V14.76','(FUV-V)14.76','(NUV-V)14.76','GA NUV14.76',
'(Uf-Gn)14.76','I14.76','(V-I)14.76','(FUV-I)14.76']
def fit_data():
    # overlay the two datasets on one scatter plot, with both axes inverted
    fig = plt.figure(1, figsize=(8, 8))
    plt.subplot(111)
    plt.scatter(data1['(B-V)o_plt'], data1['Vmag_plt'], marker='.', color='r', s=5)
    plt.scatter(data2['B-V14.76'], data2['V14.76'], marker='o', color='g', s=6)
    plt.xlabel('RA_F', size=20)
    plt.ylabel('DEC_F', size=20)
    plt.gca().invert_xaxis()
    plt.gca().invert_yaxis()
    plt.show()
    plt.close()
fit_data()
When my sheets had 4 columns, it would read and plot without any errors. But when I increase the number of columns, it gives me this error:
ParserError Traceback (most recent call last)
<ipython-input-5-fe66f2a2aac9> in <module>()
13 data1.columns = ['age','FUVCa','NUVB15','(FUV-NUV)','V','B','B-V','(FUV-V)','(NUV-V)','I','(V-I)','(FUV-I)']
14
---> 15 data2 = pd.read_csv(file2, delimiter='\s+', header=None, engine='python')
16 data2.columns = ['no','Op_RA','Op_DE','Vmag','(B-V)o','FUV_mag','(FUV-V)','(V-I)','(FUV-I)', 'no_MS','Op_RA_MS','Op_DE_MS','Vmag_MS','(B-V)o_MS',
17 'FUV_mag_MS','FUV-V_MS','(V-I)_MS','(FUV-I)_MS','no_508','Op_RA_508','Op_DE_508','Vmag_508',
~/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
707 skip_blank_lines=skip_blank_lines)
708
--> 709 return _read(filepath_or_buffer, kwds)
710
711 parser_f.__name__ = name
~/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
453
454 try:
--> 455 data = parser.read(nrows)
456 finally:
457 parser.close()
~/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py in read(self, nrows)
1067 raise ValueError('skipfooter not supported for iteration')
1068
-> 1069 ret = self._engine.read(nrows)
1070
1071 if self.options.get('as_recarray'):
~/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py in read(self, rows)
2261 content = content[1:]
2262
-> 2263 alldata = self._rows_to_cols(content)
2264 data = self._exclude_implicit_index(alldata)
2265
~/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py in _rows_to_cols(self, content)
2916 msg += '. ' + reason
2917
-> 2918 self._alert_malformed(msg, row_num + 1)
2919
2920 # see gh-13320
~/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py in _alert_malformed(self, msg, row_num)
2683
2684 if self.error_bad_lines:
-> 2685 raise ParserError(msg)
2686 elif self.warn_bad_lines:
2687 base = 'Skipping line {row_num}: '.format(row_num=row_num)
ParserError: Expected 30 fields in line 21, saw 45. Error could possibly be due to quotes being ignored when a multi-char delimiter is used.
I wasn't able to understand what it means. I don't know where I am going wrong or if I am giving too many columns for it to handle.
Line 20
508 12.76968 58.18559 18.97 0.96 0.65 1371 22.925 0.343 3.955 508 12.76968 58.18559 18.97 0.65 22.925 3.955 32 16.111 15.777 15.253 0.334 0.524 0.858 508 12.76968 58.18559 18.97 0.65 22.925 3.955 508 12.76968 58.18559 18.97 0.65 22.925 3.955
Line 21
508 12.76968 58.18559 18.97 0.96 0.65 1371 22.925 0.343 3.955 508 12.76968 58.18559 18.97 0.6522.925 3.955 32 16.111 15.777 15.253 0.334 0.524 0.858 508 12.76968 58.18559 18.97 0.65 22.925 3.955 508 12.76968 58.18559 18.97 0.65 22.925 3.955 508 12.76968 58.18559 18.97 0.65 22.925 3.955
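Compare the two rows closely: in line 21 the values 0.65 and 22.925 have fused into 0.6522.925, and the row ends with an extra repeated block, so its whitespace-split field count no longer matches the other rows. A minimal sketch (my own, assuming iso.dat is plain whitespace-delimited text) to locate every row whose field count differs from the first row:
# count whitespace-separated fields per line and flag the odd ones out
with open('iso.dat') as f:
    counts = [len(line.split()) for line in f]
expected = counts[0]
print([i + 1 for i, c in enumerate(counts) if c != expected])
Once you know which physical lines are malformed, you can fix them in the source file rather than fighting the parser.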

Related

unable to read CSV file in Jupyter notebook; getting the following errors

import pandas as pd
df = pd.read_csv('D:\Tableau\codebasics_files\Weather_data.csv.xlsx')
df
UnicodeDecodeError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_18872\1985582496.py in
1 import pandas as pd
----> 2 df = pd.read_csv('D:\Tableau\codebasics_files\Weather_data.csv.xlsx')
3 df
C:\ProgramData\Anaconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
676 kwds.update(kwds_defaults)
677
--> 678 return _read(filepath_or_buffer, kwds)
679
680
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py in _read(filepath_or_buffer, kwds)
573
574 # Create the parser.
--> 575 parser = TextFileReader(filepath_or_buffer, **kwds)
576
577 if chunksize or iterator:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py in __init__(self, f, engine, **kwds)
930
931 self.handles: IOHandles | None = None
--> 932 self._engine = self._make_engine(f, self.engine)
933
934 def close(self):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py in _make_engine(self, f, engine)
1232
1233 try:
-> 1234 return mapping[engine](f, **self.options)
1235 except Exception:
1236 if self.handles is not None:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\c_parser_wrapper.py in __init__(self, src, **kwds)
73
74 kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
---> 75 self._reader = parsers.TextReader(src, **kwds)
76
77 self.unnamed_cols = self._reader.unnamed_cols
C:\ProgramData\Anaconda3\lib\site-packages\pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()
C:\ProgramData\Anaconda3\lib\site-packages\pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._get_header()
C:\ProgramData\Anaconda3\lib\site-packages\pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._tokenize_rows()
C:\ProgramData\Anaconda3\lib\site-packages\pandas\_libs\parsers.pyx in pandas._libs.parsers.raise_parser_error()
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xaa in position 14: invalid start byte
I tried some options from YouTube, but they are not working.
Looking at the very end of the file name, you're importing a .xlsx file, not a CSV file.
Try opening the file in Excel and exporting it as a CSV. Make sure that .csv is the final extension of the file name.
I think you need to use pandas' Excel reader to load and read the .xlsx file.
For that, use this line of code:
import pandas as pd
df = pd.read_excel('D:\Tableau\codebasics_files\Weather_data.csv.xlsx')
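A small caveat I would add (not part of the original answer): a plain string literal can mangle a Windows path when a backslash happens to start an escape sequence, so a raw string is safer, and reading .xlsx files assumes an Excel engine such as openpyxl is installed:
import pandas as pd

# raw string (r'...') keeps the backslashes in the Windows path literal;
# engine='openpyxl' assumes the openpyxl package is installed
df = pd.read_excel(r'D:\Tableau\codebasics_files\Weather_data.csv.xlsx', engine='openpyxl')
print(df.head())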

plot no longer works after upgrade

I recently upgraded pandas to 1.1.5, using Python 3.6.4, and I can no longer plot any charts with a datetime index column.
See the example below, where I import a time series from a CSV file. I have also tried registering matplotlib converters in case that was the issue. I get the error message shown below. Incidentally, seaborn also no longer works, but I'm not sure if that's relevant.
Thanks
import matplotlib.pyplot as plt
import pandas as pd
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
df = pd.read_csv('example.csv', parse_dates=True, index_col=0, dayfirst=True)
df.head()
df.plot()
I get the following error when I try to plot:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-37-848b80e64df8> in <module>()
----> 1 df.plot()
C:\Program Files\Anaconda3\lib\site-packages\pandas\plotting\_core.py in __call__(self, *args, **kwargs)
947 data.columns = label_name
948
--> 949 return plot_backend.plot(data, kind=kind, **kwargs)
950
951 __call__.__doc__ = __doc__
C:\Program Files\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\__init__.py in plot(data, kind, **kwargs)
59 kwargs["ax"] = getattr(ax, "left_ax", ax)
60 plot_obj = PLOT_CLASSES[kind](data, **kwargs)
---> 61 plot_obj.generate()
62 plot_obj.draw()
63 return plot_obj.result
C:\Program Files\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\core.py in generate(self)
269 self._compute_plot_data()
270 self._setup_subplots()
--> 271 self._make_plot()
272 self._add_table()
273 self._make_legend()
C:\Program Files\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\core.py in _make_plot(self)
1124 stacking_id=stacking_id,
1125 is_errorbar=is_errorbar,
-> 1126 **kwds,
1127 )
1128 self._add_legend_handle(newlines[0], label, index=i)
C:\Program Files\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\core.py in _plot(cls, ax, x, y, style, column_num, stacking_id, **kwds)
1143 cls._initialize_stacker(ax, stacking_id, len(y))
1144 y_values = cls._get_stacked_values(ax, stacking_id, y, kwds["label"])
-> 1145 lines = MPLPlot._plot(ax, x, y_values, style=style, **kwds)
1146 cls._update_stacker(ax, stacking_id, y)
1147 return lines
C:\Program Files\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\converter.py in wrapper(*args, **kwargs)
63 def wrapper(*args, **kwargs):
64 with pandas_converters():
---> 65 return func(*args, **kwargs)
66
67 return wrapper
C:\Program Files\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\core.py in _plot(cls, ax, x, y, style, is_errorbar, **kwds)
666 else:
667 args = (x, y)
--> 668 return ax.plot(*args, **kwds)
669
670 def _get_index_name(self):
C:\Program Files\Anaconda3\lib\site-packages\matplotlib\__init__.py in inner(ax, *args, **kwargs)
1715 warnings.warn(msg % (label_namer, func.__name__),
1716 RuntimeWarning, stacklevel=2)
-> 1717 return func(ax, *args, **kwargs)
1718 pre_doc = inner.__doc__
1719 if pre_doc is None:
C:\Program Files\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py in plot(self, *args, **kwargs)
1371
1372 for line in self._get_lines(*args, **kwargs):
-> 1373 self.add_line(line)
1374 lines.append(line)
1375
C:\Program Files\Anaconda3\lib\site-packages\matplotlib\axes\_base.py in add_line(self, line)
1777 line.set_clip_path(self.patch)
1778
-> 1779 self._update_line_limits(line)
1780 if not line.get_label():
1781 line.set_label('_line%d' % len(self.lines))
C:\Program Files\Anaconda3\lib\site-packages\matplotlib\axes\_base.py in _update_line_limits(self, line)
1799 Figures out the data limit of the given line, updating self.dataLim.
1800 """
-> 1801 path = line.get_path()
1802 if path.vertices.size == 0:
1803 return
C:\Program Files\Anaconda3\lib\site-packages\matplotlib\lines.py in get_path(self)
955 """
956 if self._invalidy or self._invalidx:
--> 957 self.recache()
958 return self._path
959
C:\Program Files\Anaconda3\lib\site-packages\matplotlib\lines.py in recache(self, always)
655 def recache(self, always=False):
656 if always or self._invalidx:
--> 657 xconv = self.convert_xunits(self._xorig)
658 x = _to_unmasked_float_array(xconv).ravel()
659 else:
C:\Program Files\Anaconda3\lib\site-packages\matplotlib\artist.py in convert_xunits(self, x)
189 if ax is None or ax.xaxis is None:
190 return x
--> 191 return ax.xaxis.convert_units(x)
192
193 def convert_yunits(self, y):
C:\Program Files\Anaconda3\lib\site-packages\matplotlib\axis.py in convert_units(self, x)
1489 return x
1490
-> 1491 ret = self.converter.convert(x, self.units, self)
1492 return ret
1493
C:\Program Files\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\converter.py in convert(values, unit, axis)
254 values = [DatetimeConverter._convert_1d(v, unit, axis) for v in values]
255 else:
--> 256 values = DatetimeConverter._convert_1d(values, unit, axis)
257 return values
258
C:\Program Files\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\converter.py in _convert_1d(values, unit, axis)
289 pass
290
--> 291 values = dates.date2num(values)
292
293 return values
C:\Program Files\Anaconda3\lib\site-packages\matplotlib\dates.py in date2num(d)
394 if not d.size:
395 return d
--> 396 return _to_ordinalf_np_vectorized(d)
397
398
C:\Program Files\Anaconda3\lib\site-packages\numpy\lib\function_base.py in __call__(self, *args, **kwargs)
2106 vargs.extend([kwargs[_n] for _n in names])
2107
-> 2108 return self._vectorize_call(func=func, args=vargs)
2109
2110 def _get_ufunc_and_otypes(self, func, args):
C:\Program Files\Anaconda3\lib\site-packages\numpy\lib\function_base.py in _vectorize_call(self, func, args)
2184 res = func()
2185 else:
-> 2186 ufunc, otypes = self._get_ufunc_and_otypes(func=func, args=args)
2187
2188 # Convert args to object arrays first
C:\Program Files\Anaconda3\lib\site-packages\numpy\lib\function_base.py in _get_ufunc_and_otypes(self, func, args)
2144
2145 inputs = [arg.flat[0] for arg in args]
-> 2146 outputs = func(*inputs)
2147
2148 # Performance note: profiling indicates that -- for simple
C:\Program Files\Anaconda3\lib\site-packages\matplotlib\dates.py in _to_ordinalf(dt)
243 tzi = UTC
244
--> 245 base = float(dt.toordinal())
246
247 # If it's sufficiently datetime-like, it will have a `date()` method
AttributeError: 'numpy.datetime64' object has no attribute 'toordinal'
The older version of matplotlib (2.1.2) is out of date and no longer compatible with the newer version of pandas (1.1.5). An upgrade to matplotlib 3.3.4 solves this issue - as discussed in the comments.
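For example, assuming a pip-based environment (in a conda environment the equivalent would be conda install matplotlib=3.3.4):
pip install matplotlib==3.3.4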

Why are the limits of the histogram data autodetected as [nan, nan] instead of discarding NaNs?

The following code generates an error
print(g['resp'])
par = {'hist': True, 'kde': False, 'fit': scipy.stats.norm, 'bins': 'auto'}
sns.distplot(g['resp'], color='blue', **par)
31 23.0
32 28.0
33 29.0
34 31.0
35 32.0
36 35.0
37 35.0
38 36.0
39 37.0
40 38.0
41 38.0
42 38.0
43 41.0
44 42.0
45 42.0
46 42.0
47 42.0
48 46.0
49 48.0
50 49.0
51 50.0
52 52.0
53 55.0
54 56.0
55 60.0
56 60.0
57 100.0
58 NaN
59 NaN
60 NaN
61 NaN
Name: resp, dtype: float64
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-23-42944bf1e405> in <module>
1 print(g['resp'])
2 par = {'hist': True, 'kde': False, 'fit': scipy.stats.norm, 'bins': 'auto'}
----> 3 sns.distplot(g['resp'], color='blue', **par)
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py in distplot(a, bins, hist, kde, rug, fit, hist_kws, kde_kws, rug_kws, fit_kws, color, vertical, norm_hist, axlabel, label, ax)
223 hist_color = hist_kws.pop("color", color)
224 ax.hist(a, bins, orientation=orientation,
--> 225 color=hist_color, **hist_kws)
226 if hist_color != color:
227 hist_kws["color"] = hist_color
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py in inner(ax, data, *args, **kwargs)
1808 "the Matplotlib list!)" % (label_namer, func.__name__),
1809 RuntimeWarning, stacklevel=2)
-> 1810 return func(ax, *args, **kwargs)
1811
1812 inner.__doc__ = _add_data_doc(inner.__doc__,
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py in hist(self, x, bins, range, density, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, normed, **kwargs)
6589 # this will automatically overwrite bins,
6590 # so that each histogram uses the same bins
-> 6591 m, bins = np.histogram(x[i], bins, weights=w[i], **hist_kwargs)
6592 m = m.astype(float) # causes problems later if it's an int
6593 if mlast is None:
C:\ProgramData\Anaconda3\lib\site-packages\numpy\lib\histograms.py in histogram(a, bins, range, normed, weights, density)
708 a, weights = _ravel_and_check_weights(a, weights)
709
--> 710 bin_edges, uniform_bins = _get_bin_edges(a, bins, range, weights)
711
712 # Histogram is an integer or a float array depending on the weights.
C:\ProgramData\Anaconda3\lib\site-packages\numpy\lib\histograms.py in _get_bin_edges(a, bins, range, weights)
331 "bins is not supported for weighted data")
332
--> 333 first_edge, last_edge = _get_outer_edges(a, range)
334
335 # truncate the range if needed
C:\ProgramData\Anaconda3\lib\site-packages\numpy\lib\histograms.py in _get_outer_edges(a, range)
259 if not (np.isfinite(first_edge) and np.isfinite(last_edge)):
260 raise ValueError(
--> 261 "autodetected range of [{}, {}] is not finite".format(first_edge, last_edge))
262
263 # expand empty range to avoid divide by zero
ValueError: autodetected range of [nan, nan] is not finite
It looks like the NaN values are causing trouble. How can I discard them?
The NaN values are indeed the problem, so a possible solution is Series.dropna to remove the missing values:
sns.distplot(g['resp'].dropna(), color='blue', **par)
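An equivalent alternative, if you prefer boolean indexing (my suggestion, not part of the original answer), is Series.notna:
sns.distplot(g['resp'][g['resp'].notna()], color='blue', **par)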

index value vs. flight (data range A row & E row )

I want to make a scatter plot of the sum of the flight field per minute. My data is as follows:
http://python2018.byethost10.com/flights.csv
My code is as follows:
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib
matplotlib.rcParams['font.sans-serif'] = ['Noto Serif CJK TC']
matplotlib.rcParams['font.family']='sans-serif'
df = pd.read_csv('flights.csv')
df["time_hour"] = pd.to_datetime(df['time_hour'])
grp = df.groupby(by=[df.time_hour.map(lambda x : (x.hour, x.minute))])
a=grp.sum()
plt.scatter(a.index, a['flight'], c='b', marker='o')
plt.xlabel('index value', fontsize=16)
plt.ylabel('flight', fontsize=16)
plt.title('scatter plot - index value vs. flight (data range A row & E row )', fontsize=20)
plt.show()
It produced the following error:
Traceback (most recent call last):
File "I:/PycharmProjects/1223/raise1/char3.py", line 10, in <module>
plt.scatter(a.index, a['flight'], c='b', marker='o')
File "C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py", line 3470, in scatter
edgecolors=edgecolors, data=data, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py", line 1855, in inner
return func(ax, *args, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py", line 4320, in scatter
alpha=alpha
File "C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\collections.py", line 927, in __init__
Collection.__init__(self, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\collections.py", line 159, in __init__
offsets = np.asanyarray(offsets, float)
File "C:\ProgramData\Anaconda3\lib\site-packages\numpy\core\numeric.py", line 544, in asanyarray
return array(a, dtype, copy=False, order=order, subok=True)
ValueError: setting an array element with a sequence.
How can I produce the following results? Thank you.
http://python2018.byethost10.com/image.png
The problem is in the aggregation: in your code, it returns tuples in the index.
The solution is to convert the time_hour column to HH:MM strings with Series.dt.strftime:
a = df.groupby(by=[df.time_hour.dt.strftime('%H:%M')]).sum()
All together:
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib
matplotlib.rcParams['font.sans-serif'] = ['Noto Serif CJK TC']
matplotlib.rcParams['font.family']='sans-serif'
# first column is the index and the second column is parsed to datetimes
df=pd.read_csv('flights.csv', index_col=[0], parse_dates=[1])
a = df.groupby(by=[df.time_hour.dt.strftime('%H:%M')]).sum()
print (a)
year sched_dep_time flight air_time distance hour minute
time_hour
05:00 122793 37856 87445 11282.0 72838 366 1256
05:01 120780 44810 82113 11115.0 71168 435 1310
05:02 122793 52989 99975 11165.0 72068 515 1489
05:03 120780 57653 98323 10366.0 65137 561 1553
05:04 122793 67706 110230 10026.0 63118 661 1606
05:05 122793 75807 126426 9161.0 55371 742 1607
05:06 120780 82010 120753 10804.0 67827 799 2110
05:07 122793 90684 130339 8408.0 52945 890 1684
05:08 120780 93687 114415 10299.0 63271 922 1487
05:09 122793 101571 99526 11525.0 72915 1002 1371
05:10 122793 107252 107961 10383.0 70137 1056 1652
05:11 120780 111351 120261 10949.0 73350 1098 1551
05:12 122793 120575 135930 8661.0 57406 1190 1575
05:13 120780 118272 104763 7784.0 55886 1166 1672
05:14 122793 37289 109300 9838.0 63582 364 889
05:15 122793 42374 67193 11480.0 78183 409 1474
05:16 58377 22321 53424 4271.0 27527 216 721
plt.scatter(a.index, a['flight'], c='b', marker='o')
#rotate labels of x axis
plt.xticks(rotation=90)
plt.xlabel('index value', fontsize=16)
plt.ylabel('flight', fontsize=16)
plt.title('scatter plot - index value vs. flight (data range A row & E row )', fontsize=20)
plt.show()
Another solution is to convert the datetimes to times:
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib
matplotlib.rcParams['font.sans-serif'] = 'Noto Serif CJK TC'
matplotlib.rcParams['font.family']='sans-serif'
df=pd.read_csv('flights.csv', index_col=[0], parse_dates=[1])
a = df.groupby(by=[df.time_hour.dt.time]).sum()
print (a)
year sched_dep_time flight air_time distance hour minute
time_hour
05:00:00 122793 37856 87445 11282.0 72838 366 1256
05:01:00 120780 44810 82113 11115.0 71168 435 1310
05:02:00 122793 52989 99975 11165.0 72068 515 1489
05:03:00 120780 57653 98323 10366.0 65137 561 1553
05:04:00 122793 67706 110230 10026.0 63118 661 1606
05:05:00 122793 75807 126426 9161.0 55371 742 1607
05:06:00 120780 82010 120753 10804.0 67827 799 2110
05:07:00 122793 90684 130339 8408.0 52945 890 1684
05:08:00 120780 93687 114415 10299.0 63271 922 1487
05:09:00 122793 101571 99526 11525.0 72915 1002 1371
05:10:00 122793 107252 107961 10383.0 70137 1056 1652
05:11:00 120780 111351 120261 10949.0 73350 1098 1551
05:12:00 122793 120575 135930 8661.0 57406 1190 1575
05:13:00 120780 118272 104763 7784.0 55886 1166 1672
05:14:00 122793 37289 109300 9838.0 63582 364 889
05:15:00 122793 42374 67193 11480.0 78183 409 1474
05:16:00 58377 22321 53424 4271.0 27527 216 721
plt.scatter(a.index, a['flight'], c='b', marker='o')
plt.xticks(rotation=90)
plt.xlabel('index value', fontsize=16)
plt.ylabel('flight', fontsize=16)
plt.title('scatter plot - index value vs. flight (data range A row & E row )', fontsize=20)
plt.show()

xarray: mean of data stored via OPeNDAP

I'm using xarray's very cool pydap back-end (http://xarray.pydata.org/en/stable/io.html#opendap) to read data stored via OPeNDAP at IRI:
import xarray as xr
remote_data = xr.open_dataarray('http://iridl.ldeo.columbia.edu/SOURCES/.Models/.SubX/.RSMAS/.CCSM4/.hindcast/.zg/dods')
print(remote_data)
#<xarray.DataArray 'zg' (P: 2, S: 6569, M: 3, L: 45, Y: 181, X: 360)>
#[115569730800 values with dtype=float32]
#Coordinates:
# * L (L) timedelta64[ns] 0 days 12:00:00 1 days 12:00:00 ...
# * Y (Y) float32 -90.0 -89.0 -88.0 -87.0 -86.0 -85.0 -84.0 -83.0 ...
# * S (S) datetime64[ns] 1999-01-07 1999-01-08 1999-01-09 1999-01-10 ...
# * M (M) float32 1.0 2.0 3.0
# * X (X) float32 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 ...
# * P (P) int32 500 200
#Attributes:
# level_type: pressure level
# standard_name: geopotential_height
# long_name: Geopotential Height
# units: m
For reference, it's sub-seasonal forecast data, where L is the lead time (45-day forecasts), S is the initialization date, and M is the ensemble member.
I would like to compute an ensemble mean, and I'm only interested in the 500 hPa level. However, it crashes and gives a RuntimeError: NetCDF: Access failure:
da = remote_data.sel(P=500)
da_ensmean = da.mean(dim='M')
RuntimeError Traceback (most recent call last)
<ipython-input-46-eca488e9def5> in <module>()
1 remote_data = xr.open_dataarray('http://iridl.ldeo.columbia.edu/SOURCES/.Models' '/.SubX/.RSMAS/.CCSM4/.hindcast/.zg/dods')
2 da = remote_data.sel(P=500)
----> 3 da_ensmean = da.mean(dim='M')
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/common.py in wrapped_func(self, dim, axis, skipna, keep_attrs, **kwargs)
20 keep_attrs=False, **kwargs):
21 return self.reduce(func, dim, axis, keep_attrs=keep_attrs,
---> 22 skipna=skipna, allow_lazy=True, **kwargs)
23 else:
24 def wrapped_func(self, dim=None, axis=None, keep_attrs=False,
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/dataarray.py in reduce(self, func, dim, axis, keep_attrs, **kwargs)
1359 summarized data and the indicated dimension(s) removed.
1360 """
-> 1361 var = self.variable.reduce(func, dim, axis, keep_attrs, **kwargs)
1362 return self._replace_maybe_drop_dims(var)
1363
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/variable.py in reduce(self, func, dim, axis, keep_attrs, allow_lazy, **kwargs)
1264 if dim is not None:
1265 axis = self.get_axis_num(dim)
-> 1266 data = func(self.data if allow_lazy else self.values,
1267 axis=axis, **kwargs)
1268
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/variable.py in data(self)
293 return self._data
294 else:
--> 295 return self.values
296
297 @data.setter
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/variable.py in values(self)
385 def values(self):
386 """The variable's data as a numpy.ndarray"""
--> 387 return _as_array_or_item(self._data)
388
389 @values.setter
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/variable.py in _as_array_or_item(data)
209 TODO: remove this (replace with np.asarray) once these issues are fixed
210 """
--> 211 data = np.asarray(data)
212 if data.ndim == 0:
213 if data.dtype.kind == 'M':
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
490
491 """
--> 492 return array(a, dtype, copy=False, order=order)
493
494
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/indexing.py in __array__(self, dtype)
622
623 def __array__(self, dtype=None):
--> 624 self._ensure_cached()
625 return np.asarray(self.array, dtype=dtype)
626
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/indexing.py in _ensure_cached(self)
619 def _ensure_cached(self):
620 if not isinstance(self.array, NumpyIndexingAdapter):
--> 621 self.array = NumpyIndexingAdapter(np.asarray(self.array))
622
623 def __array__(self, dtype=None):
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
490
491 """
--> 492 return array(a, dtype, copy=False, order=order)
493
494
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/indexing.py in __array__(self, dtype)
600
601 def __array__(self, dtype=None):
--> 602 return np.asarray(self.array, dtype=dtype)
603
604 def __getitem__(self, key):
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
490
491 """
--> 492 return array(a, dtype, copy=False, order=order)
493
494
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/indexing.py in __array__(self, dtype)
506 def __array__(self, dtype=None):
507 array = as_indexable(self.array)
--> 508 return np.asarray(array[self.key], dtype=None)
509
510 def transpose(self, order):
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/coding/variables.py in __getitem__(self, key)
64
65 def __getitem__(self, key):
---> 66 return self.func(self.array[key])
67
68 def __repr__(self):
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/coding/variables.py in _apply_mask(data, encoded_fill_values, decoded_fill_value, dtype)
133 for fv in encoded_fill_values:
134 condition |= data == fv
--> 135 data = np.asarray(data, dtype=dtype)
136 return np.where(condition, decoded_fill_value, data)
137
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
490
491 """
--> 492 return array(a, dtype, copy=False, order=order)
493
494
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/indexing.py in __array__(self, dtype)
506 def __array__(self, dtype=None):
507 array = as_indexable(self.array)
--> 508 return np.asarray(array[self.key], dtype=None)
509
510 def transpose(self, order):
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/backends/netCDF4_.py in __getitem__(self, key)
63 with self.datastore.ensure_open(autoclose=True):
64 try:
---> 65 array = getitem(self.get_array(), key.tuple)
66 except IndexError:
67 # Catch IndexError in netCDF4 and return a more informative
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/backends/common.py in robust_getitem(array, key, catch, max_retries, initial_delay)
114 for n in range(max_retries + 1):
115 try:
--> 116 return array[key]
117 except catch:
118 if n == max_retries:
netCDF4/_netCDF4.pyx in netCDF4._netCDF4.Variable.__getitem__()
netCDF4/_netCDF4.pyx in netCDF4._netCDF4.Variable._get()
netCDF4/_netCDF4.pyx in netCDF4._netCDF4._ensure_nc_success()
RuntimeError: NetCDF: Access failure
Breaking the calculation down removes the RuntimeError. I guess it was just too hefty a request with all the start times included. It shouldn't be too difficult to put this in a loop over S (see the sketch after the output below):
da = remote_data.isel(P=0,S=0)
da_ensmean = da.mean(dim='M')
print(da_ensmean)
<xarray.DataArray 'zg' (L: 45, Y: 181, X: 360)>
array([[[5231.1445, 5231.1445, ..., 5231.1445, 5231.1445],
[5231.1445, 5231.1445, ..., 5231.1445, 5231.1445],
...,
[5056.2383, 5056.2383, ..., 5056.2383, 5056.2383],
[5056.2383, 5056.2383, ..., 5056.2383, 5056.2383]],
[[5211.346 , 5211.346 , ..., 5211.346 , 5211.346 ],
[5211.346 , 5211.346 , ..., 5211.346 , 5211.346 ],
...,
[5082.062 , 5082.062 , ..., 5082.062 , 5082.062 ],
[5082.062 , 5082.062 , ..., 5082.062 , 5082.062 ]],
...,
[[5108.8247, 5108.8247, ..., 5108.8247, 5108.8247],
[5108.8247, 5108.8247, ..., 5108.8247, 5108.8247],
...,
[5154.2173, 5154.2173, ..., 5154.2173, 5154.2173],
[5154.2173, 5154.2173, ..., 5154.2173, 5154.2173]],
[[5106.4893, 5106.4893, ..., 5106.4893, 5106.4893],
[5106.4893, 5106.4893, ..., 5106.4893, 5106.4893],
...,
[5226.0063, 5226.0063, ..., 5226.0063, 5226.0063],
[5226.0063, 5226.0063, ..., 5226.0063, 5226.0063]]], dtype=float32)
Coordinates:
* L (L) timedelta64[ns] 0 days 12:00:00 1 days 12:00:00 ...
* Y (Y) float32 -90.0 -89.0 -88.0 -87.0 -86.0 -85.0 -84.0 -83.0 ...
S datetime64[ns] 1999-01-07
* X (X) float32 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 ...
P int32 500
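A minimal sketch of that loop over S (my own, under the assumption that the server tolerates one start date per request); it stacks the per-start-date ensemble means back together with xr.concat:
import xarray as xr

url = 'http://iridl.ldeo.columbia.edu/SOURCES/.Models/.SubX/.RSMAS/.CCSM4/.hindcast/.zg/dods'
remote_data = xr.open_dataarray(url)

means = []
for s in range(3):  # use range(remote_data.sizes['S']) for the full record
    da = remote_data.isel(P=0, S=s)     # one pressure level, one start date
    means.append(da.mean(dim='M'))      # ensemble mean for this start date
da_ensmean = xr.concat(means, dim='S')  # reassemble along the S dimension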
This is a good use-case for chunking with dask, e.g.,
import xarray as xr
url = 'http://iridl.ldeo.columbia.edu/SOURCES/.Models/.SubX/.RSMAS/.CCSM4/.hindcast/.zg/dods'
remote_data = xr.open_dataarray(url, chunks={'S': 1, 'L': 1})
da = remote_data.sel(P=500)
da_ensmean = da.mean(dim='M')
This version will access the data server in parallel, using many smaller chunks. It will still be slow to download 231 GB of data, but your request will have much better odds of success.
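One follow-up worth noting (my addition): the chunked mean is lazy, so nothing is downloaded until you ask for the values; dask's diagnostics can show progress while it computes:
from dask.diagnostics import ProgressBar

with ProgressBar():                # prints a progress bar while chunks download
    result = da_ensmean.compute()  # triggers the actual OPeNDAP requests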