Pandas Error Handling - "day is out of range for month"

How do I spot and handle data that is in the wrong format in pandas?
I tried to convert strings into pandas datetimes; however,
somewhere in the middle of the data a value is not in the format that was expected (I assume), and an error message popped up.
I want to print what that data is,
and I want to fix it into the right format so that I can keep moving on.
How do I handle this issue?
My code is as below.
def date_format(df):
    target = pd.to_datetime(df['Issue Date'])
    return target
The error message is:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
~/anaconda3/lib/python3.8/site-packages/pandas/core/arrays/datetimes.py in objects_to_datetime64ns(data, dayfirst, yearfirst, utc, errors, require_iso8601, allow_object)
   2053 try:
-> 2054     values, tz_parsed = conversion.datetime_to_datetime64(data)
   2055     # If tzaware, these values represent unix timestamps, so we

pandas/_libs/tslibs/conversion.pyx in pandas._libs.tslibs.conversion.datetime_to_datetime64()

TypeError: Unrecognized value type: <class 'str'>

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
in <module>
----> 1 seg0 = generateDb(csv, 2017, 1000000, 0)

in generateDb(csv, year, segment, index)
      2
      3 targetSeg0 = segment_df(csv, segment, index)
----> 4 targetSeg0['Issue Date'] = date_format(targetSeg0)
      5 targetSeg0 = remove_etc(targetSeg0)
      6 filter_date(targetSeg0, year)

in date_format(df)
      1 def date_format(df):
----> 2     target = pd.to_datetime(df['Issue Date'])
      3     return target

~/anaconda3/lib/python3.8/site-packages/pandas/core/tools/datetimes.py in to_datetime(arg, errors, dayfirst, yearfirst, utc, format, exact, unit, infer_datetime_format, origin, cache)
    797 result = result.tz_localize(tz)
    798 elif isinstance(arg, ABCSeries):
--> 799     cache_array = _maybe_cache(arg, format, cache, convert_listlike)
    800     if not cache_array.empty:
    801         result = arg.map(cache_array)

~/anaconda3/lib/python3.8/site-packages/pandas/core/tools/datetimes.py in _maybe_cache(arg, format, cache, convert_listlike)
    168 unique_dates = unique(arg)
    169 if len(unique_dates) < len(arg):
--> 170     cache_dates = convert_listlike(unique_dates, format)
    171     cache_array = Series(cache_dates, index=unique_dates)
    172 return cache_array

~/anaconda3/lib/python3.8/site-packages/pandas/core/tools/datetimes.py in _convert_listlike_datetimes(arg, format, name, tz, unit, errors, infer_datetime_format, dayfirst, yearfirst, exact)
    457 assert format is None or infer_datetime_format
    458 utc = tz == "utc"
--> 459 result, tz_parsed = objects_to_datetime64ns(
    460     arg,
    461     dayfirst=dayfirst,

~/anaconda3/lib/python3.8/site-packages/pandas/core/arrays/datetimes.py in objects_to_datetime64ns(data, dayfirst, yearfirst, utc, errors, require_iso8601, allow_object)
   2057     return values.view("i8"), tz_parsed
   2058 except (ValueError, TypeError):
-> 2059     raise e
   2060
   2061 if tz_parsed is not None:

~/anaconda3/lib/python3.8/site-packages/pandas/core/arrays/datetimes.py in objects_to_datetime64ns(data, dayfirst, yearfirst, utc, errors, require_iso8601, allow_object)
   2042
   2043 try:
-> 2044     result, tz_parsed = tslib.array_to_datetime(
   2045         data,
   2046         errors=errors,

pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime()
pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime()
pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime_object()
pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime_object()
pandas/_libs/tslibs/parsing.pyx in pandas._libs.tslibs.parsing.parse_datetime_string()
pandas/_libs/tslibs/parsing.pyx in pandas._libs.tslibs.parsing._parse_delimited_date()
datetime.pxd in cpython.datetime.datetime_new()

ValueError: day is out of range for month
I'm eventually going to filter the data as below.
def filter_date(df, year):
    start = pd.to_datetime(f"{year}-01-01")
    end = pd.to_datetime(f"{year}-12-31")
    return df[(df['Issue Date'] >= start) & (df['Issue Date'] <= end)]
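For reference, an equivalent range test in one step (a hedged aside, not from the original post) uses Series.between, which is inclusive on both ends by default:

# Same year filter as above, using between instead of two comparisons
mask = df['Issue Date'].between(pd.to_datetime(f"{year}-01-01"),
                                pd.to_datetime(f"{year}-12-31"))
filtered = df[mask]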
A sample of the data is below.
The error occurs because of bad rows like the one below (a 51st month doesn't exist); I guess this is where the error came from.

Based on your comments, I assume this will help.
Using this as an example (99/99/9999 is the incorrect value):
df = pd.DataFrame(["09/26/2016", "06/14/2017", "09/05/2018", "06/16/2017", "05/09/2018", "99/99/9999"], columns=['Issue Date'])
You mean something like this:
pd.to_datetime(df["Issue Date"], errors="coerce")
The invalid value becomes NaT in the output (see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html).
If you then want to drop the rows with NaTs, assign the coerced result back first:
df['Issue Date'] = pd.to_datetime(df['Issue Date'], errors="coerce")
df = df.dropna(axis=0, subset=['Issue Date'])
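To also answer the "print what the data is" part of the question, here is a minimal sketch (reusing the toy df above) that flags the unparseable rows before fixing or dropping them:

# Parse once with coercion; failures become NaT while the raw strings survive
parsed = pd.to_datetime(df['Issue Date'], errors="coerce")

# Offending raw values: parsed to NaT although the original cell wasn't empty
bad = df.loc[parsed.isna() & df['Issue Date'].notna(), 'Issue Date']
print(bad)        # 5    99/99/9999

df['Issue Date'] = parsed   # keep the parsed dates and move on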

Related

Pandas Apply function on Groupby behaviour

I am trying to create a column that computes the ratio of consumption on a monthly basis. I have created the function, but when I run it pandas raises a TypeError.
Below are the function, the call site, and the error stack.
The consumption ratio function is:
def consumption_ratio(row):
    c_consumption = row["consumption"].iloc[0]
    month = row["month"].iloc[0]
    year = row["year"].iloc[0]
    house = row["houseid-meterid"].iloc[0]
    if month == 2 and year == 2019:
        return 0
    else:
        if month == 1:
            # print(f"This is the {month} month")
            prevyear = year - 1
            prevmonth = 12
            prev_record = water_df.query("`houseid-meterid` == @house and year == @prevyear and month == @prevmonth")
            try:
                ratio = c_consumption / prev_record["consumption"]
            except ZeroDivisionError:
                ratio = 0
            # print(f"Non regular ratios {ratio}")
            return ratio
        else:
            prevmonth = month - 1
            prev_record = water_df.query("`houseid-meterid` == @house and year == @year and month == @prevmonth")
            # print(prev_record)
            try:
                ratio = c_consumption / prev_record["consumption"]
            except ZeroDivisionError:
                ratio = 0
            # ratio = c_consumption / prev_record["consumption"]
            # print(f"Regular ratios {ratio}")
            return ratio
The code executes here:
water_df["consumption_ratio"] = water_df.groupby(['Datetime', 'houseid-meterid']).apply(consumption_ratio)
The error stack looks like this:
ValueError                                Traceback (most recent call last)
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\frame.py:12017, in _reindex_for_setitem(value, index)
  12016 try:
> 12017     reindexed_value = value.reindex(index)._values
  12018 except ValueError as err:
  12019     # raised in MultiIndex.from_tuples, see test_insert_error_msmgs

File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\series.py:5094, in Series.reindex(self, *args, **kwargs)
   5093 kwargs.update({"index": index})
-> 5094 return super().reindex(**kwargs)

File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\generic.py:5289, in NDFrame.reindex(self, *args, **kwargs)
   5288 # perform the reindex on the axes
-> 5289 return self._reindex_axes(
   5290     axes, level, limit, tolerance, method, fill_value, copy
   5291 ).__finalize__(self, method="reindex")

File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\generic.py:5304, in NDFrame._reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
   5303 ax = self._get_axis(a)
-> 5304 new_index, indexer = ax.reindex(
   5305     labels, level=level, limit=limit, tolerance=tolerance, method=method
   5306 )
   5308 axis = self._get_axis_number(a)

File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\indexes\base.py:4477, in Index.reindex(self, target, method, level, limit, tolerance)
   4470 warnings.warn(
   4471     "reindexing with a non-unique Index is deprecated and "
   4472     "will raise in a future version.",
   4473     FutureWarning,
   4474     stacklevel=find_stack_level(),
   4475 )
-> 4477 target = self._wrap_reindex_result(target, indexer, preserve_names)
   4478 return target, indexer

File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\indexes\multi.py:2556, in MultiIndex._wrap_reindex_result(self, target, indexer, preserve_names)
   2555 try:
-> 2556     target = MultiIndex.from_tuples(target)
   2557 except TypeError:
   2558     # not all tuples, see test_constructor_dict_multiindex_reindex_flat

File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\indexes\multi.py:205, in names_compat.<locals>.new_meth(self_or_cls, *args, **kwargs)
    203 kwargs["names"] = kwargs.pop("name")
--> 205 return meth(self_or_cls, *args, **kwargs)

File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\indexes\multi.py:573, in MultiIndex.from_tuples(cls, tuples, sortorder, names)
    571 tuples = np.asarray(tuples._values)
--> 573 arrays = list(lib.tuples_to_object_array(tuples).T)
    574 elif isinstance(tuples, list):

File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\_libs\lib.pyx:2978, in pandas._libs.lib.tuples_to_object_array()

ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long long'

The above exception was the direct cause of the following exception:

TypeError                                 Traceback (most recent call last)
Cell In[34], line 1
----> 1 water_df["consumption_ratio"] = water_df.groupby(['Datetime', 'houseid-meterid']).apply(consumption_ratio)

File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\frame.py:3978, in DataFrame.__setitem__(self, key, value)
   3975 self._setitem_array([key], value)
   3976 else:
   3977     # set column
-> 3978     self._set_item(key, value)

File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\frame.py:4172, in DataFrame._set_item(self, key, value)
   4162 def _set_item(self, key, value) -> None:
   4163     """
   4164     Add series to DataFrame in specified column.
   4165
   (...)
   4170     ensure homogeneity.
   4171     """
-> 4172     value = self._sanitize_column(value)
   4174     if (
   4175         key in self.columns
   4176         and value.ndim == 1
   4177         and not is_extension_array_dtype(value)
   4178     ):
   4179         # broadcast across multiple columns if necessary
   4180         if not self.columns.is_unique or isinstance(self.columns, MultiIndex):

File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\frame.py:4909, in DataFrame._sanitize_column(self, value)
   4907     return _reindex_for_setitem(value, self.index)
   4908 elif is_dict_like(value):
-> 4909     return _reindex_for_setitem(Series(value), self.index)
   4911 if is_list_like(value):
   4912     com.require_length_match(value, self.index)

File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\frame.py:12024, in _reindex_for_setitem(value, index)
  12020 if not value.index.is_unique:
  12021     # duplicate axis
  12022     raise err
> 12024 raise TypeError(
  12025     "incompatible index of inserted column with frame index"
  12026 ) from err
  12027 return reindexed_value

TypeError: incompatible index of inserted column with frame index
The dataset is of the form:

year  month  houseid-meterid  Datetime    cleaned_quantity
2019  2      m5               2019-02-01  7.0
2019  3      m5               2019-03-01  23.0
2019  4      m5               2019-04-01  14.0
2019  4      m5               2019-05-01  22.0

The expected output should be:

year  month  houseid-meterid  Datetime    consumption  consumption_ratio
2019  2      m5               2019-02-01  7.0          0
2019  3      m5               2019-03-01  23.0         3.285
2019  4      m5               2019-04-01  14.0         0.608
2019  4      m5               2019-05-01  22.0         1.571
What am I doing wrong?
Change your function to take the first previous consumption with next and iter, falling back to 0 if none exists; then assign the ratio to a consumption_ratio column and return the row instead of the bare ratio or 0. Finally, remove the water_df["consumption_ratio"] = assignment from the groupby in the last row of code:
def consumption_ratio(row):
    c_consumption = row["consumption"].iloc[0]
    # print(c_consumption)
    month = row["month"].iloc[0]
    year = row["year"].iloc[0]
    house = row["houseid-meterid"].iloc[0]
    if month == 2 and year == 2019:
        ratio = 0
    else:
        if month == 1:
            # print(f"This is the {month} month")
            prevyear = year - 1
            prevmonth = 12
            prev_record = water_df.query("`houseid-meterid` == @house and year == @prevyear and month == @prevmonth")
            try:
                ratio = c_consumption / next(iter(prev_record["consumption"]), 0)
            except ZeroDivisionError:
                ratio = 0
            # print(f"Non regular ratios {ratio}")
        else:
            prevmonth = month - 1
            prev_record = water_df.query("`houseid-meterid` == @house and year == @year and month == @prevmonth")
            # print(prev_record)
            try:
                ratio = c_consumption / next(iter(prev_record["consumption"]), 0)
            except ZeroDivisionError:
                ratio = 0
            # ratio = c_consumption / prev_record["consumption"]
            # print(f"Regular ratios {ratio}")
    row['consumption_ratio'] = ratio
    return row

df = water_df.groupby(['Datetime', 'houseid-meterid']).apply(consumption_ratio)
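As a side note, here is a hedged, vectorized alternative (my own sketch, not part of the answer above): if every meter has exactly one row per consecutive month, as in the sample, a groupby shift avoids the per-row query lookups entirely:

# Sort chronologically within each meter, then divide by the previous row
water_df = water_df.sort_values(["houseid-meterid", "year", "month"])
prev = water_df.groupby("houseid-meterid")["consumption"].shift(1)
water_df["consumption_ratio"] = (water_df["consumption"] / prev).fillna(0)

Unlike the query-based lookup, this treats the previous row as the previous month, so it would silently produce wrong ratios if some months are missing.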

Can't plot data frame column, I want to plot a scatter graph

My data:
I want to make a scatter graph of two columns, but it's not working. I have confirmed the column exists when I list the columns, yet it doesn't exist according to the code at the bottom. Help would be appreciated.
df = data
x = Birth rate (object)
y = CO₂ (per capita) (float64)
I tried:
data.plot(x='Birth_rate', y='CO₂_(per capita)')
plt.title('1960 - C02 emissions vs. Crude birth rates')
It resulted in KeyError: 'Birth_rate'
File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/plotting/_core.py:920, in PlotAccessor.__call__(self, *args, **kwargs)
918 if is_integer(x) and not data.columns.holds_integer():
919 x = data_cols[x]
--> 920 elif not isinstance(data[x], ABCSeries):
921 raise ValueError("x must be a label or position")
922 data = data.set_index(x)
File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/frame.py:3024, in DataFrame.__getitem__(self, key)
3022 if self.columns.nlevels > 1:
3023 return self._getitem_multilevel(key)
-> 3024 indexer = self.columns.get_loc(key)
3025 if is_integer(indexer):
3026 indexer = [indexer]
File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/indexes/base.py:3083, in Index.get_loc(self, key, method, tolerance)
3081 return self._engine.get_loc(casted_key)
3082 except KeyError as err:
-> 3083 raise KeyError(key) from err
3085 if tolerance is not None:
3086 tolerance = self._convert_tolerance(tolerance, np.asarray(key))
KeyError: 'Birth_rate'
I tried an index as well and got this error:
KeyError: "None of ['Birth_rate'] are in the columns"

Pandas groupy "aggregate" does not see column

I am working on a huge database where I used a pandas apply to categorize the type of client based on the type of product consumed:
Sample DF:
import pandas as pd
import numpy as np
from datetime import datetime

num_variables = 1000
rng = np.random.default_rng()
data = pd.DataFrame({
    'id': np.random.randint(1, 999999999, num_variables),
    'date': [np.random.choice(pd.date_range(datetime(2021, 1, 1), datetime(2022, 12, 31))) for i in range(num_variables)],
    'product': [np.random.choice(['giftcards', 'afiliates']) for i in range(num_variables)],
    'brand': [np.random.choice(['brand_1', 'brand_2', 'brand_4', 'brand_6']) for i in range(num_variables)],
    'gmv': rng.random(num_variables) * 100,
    'revenue': rng.random(num_variables) * 100,
})
data = data.astype({'product': 'category', 'brand': 'category'})
base = data.groupby(['id', 'product']).aggregate({'product': 'count'})
base = base.unstack()
Now I need to group clients by the "type" column and just count how many there are in each group.
First, the categorization function and its application:
def setup(row):
    if row[('product', 'afiliates')] >= 1 and row[('product', 'giftcards')] == 0:
        return 'afiliates'
    if row[('product', 'afiliates')] == 0 and row[('product', 'giftcards')] >= 1:
        return 'gift'
    if row[('product', 'afiliates')] >= 1 and row[('product', 'giftcards')] >= 1:
        return 'both'

base['type'] = base.apply(setup, axis=1)
base.reset_index(inplace=True)
So far, so good. If I run a groupby.agg, I get these results:
results = base[['type','id']].groupby(['type'], dropna=False).agg('count')
but if instead of agg I try aggregate, it does not work:
results = base[['type','id']].groupby(['type']).aggregate({'id': 'count'})
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Cell In[10], line 2
1 #results = base[['type','id']].groupby(['type'], dropna=False).agg('count')
----> 2 results = base[['type','id']].groupby(['type']).aggregate({'id': 'count'})
File c:\Users\fabio\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\groupby\generic.py:894, in DataFrameGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
891 func = maybe_mangle_lambdas(func)
893 op = GroupByApply(self, func, args, kwargs)
--> 894 result = op.agg()
895 if not is_dict_like(func) and result is not None:
896 return result
File c:\Users\fabio\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\apply.py:169, in Apply.agg(self)
166 return self.apply_str()
168 if is_dict_like(arg):
--> 169 return self.agg_dict_like()
170 elif is_list_like(arg):
171 # we require a list, but not a 'str'
172 return self.agg_list_like()
File c:\Users\fabio\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\apply.py:478, in Apply.agg_dict_like(self)
475 selected_obj = obj._selected_obj
476 selection = obj._selection
--> 478 arg = self.normalize_dictlike_arg("agg", selected_obj, arg)
...
606 # eg. {'A' : ['mean']}, normalize all to
607 # be list-likes
608 # Cannot use func.values() because arg may be a Series
KeyError: "Column(s) ['id'] do not exist"
What am I missing?
I've asked the same question on the pandas GitHub. They helped me; I will reproduce the answer here.
You can see how to access your columns using:
print(base.columns.tolist())
[('id', ''), ('product', 'afiliates'), ('product', 'giftcards'), ('type', '')]
When you have a MultiIndex for columns, you need to specify each level as a tuple. So you can do:
base[['type','id']].groupby(['type']).aggregate({('id', ''): 'count'})
Regarding the title of this issue - agg and aggregate are aliases, they do not behave differently.
I suppose there is a bit of an oddity here - why can you do base[['id']] but not specify {'id': ...} in agg? The reason is because column selection can return multiple columns (e.g. in the example here, base[['product']] returns a DataFrame with two columns), whereas agg must have one column and one column only. Thus, it is necessary to specify all levels in agg.
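Alternatively, here is a hedged sketch (my own suggestion, not part of the GitHub answer): flatten the MultiIndex columns once after unstacking, and plain string keys work again:

# Join the non-empty parts of each column tuple into a single string label
base.columns = ['_'.join(str(part) for part in col if part) for col in base.columns]
# columns are now ['id', 'product_afiliates', 'product_giftcards', 'type']
results = base[['type', 'id']].groupby(['type']).aggregate({'id': 'count'})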

TypeError: Wrong number or type of arguments for overloaded function 'new_Date'

I am new to Python. I am getting an error when running the code below. The issue seems to be with the date; can someone help me correct it, please? I have tried changing the date format in the Excel file, but it does not solve the issue. The Excel file has a list of several bonds, and I want to generate the coupon dates of the different bonds.
BondData = pd.read_excel(r'C:\Users\Avishen\Desktop\Python\BONDDATA.xlsx')
Data = pd.DataFrame(BondData)

def scheduledates():
    tenor = ql.Period(ql.Semiannual)
    day_count = ql.Thirty360
    calendar = ql.UnitedStates()
    businessConvention = ql.Unadjusted
    dateGeneration = ql.DateGeneration.Backward
    monthEnd = False
    # Dates in Bond Period
    return ql.Schedule(issueDate, maturityDate, tenor, calendar, businessConvention,
                       businessConvention, dateGeneration, monthEnd)

new_df["Dates"] = Data.apply(lambda x: scheduledates(), axis=1)
new_df["ISIN"] = Data.ISIN
new_df
Error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-4-877415e9cf83> in <module>
21 businessConvention , dateGeneration, monthEnd)
22
---> 23 new_df["Dates"]= Data.apply(lambda x: scheduledates(),axis = 1)
24 new_df["ISIN"] = Data.ISIN
25 new_df
~\anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
7546 kwds=kwds,
7547 )
-> 7548 return op.get_result()
7549
7550 def applymap(self, func) -> "DataFrame":
~\anaconda3\lib\site-packages\pandas\core\apply.py in get_result(self)
178 return self.apply_raw()
179
--> 180 return self.apply_standard()
181
182 def apply_empty_result(self):
~\anaconda3\lib\site-packages\pandas\core\apply.py in apply_standard(self)
269
270 def apply_standard(self):
--> 271 results, res_index = self.apply_series_generator()
272
273 # wrap results
~\anaconda3\lib\site-packages\pandas\core\apply.py in apply_series_generator(self)
298 for i, v in enumerate(series_gen):
299 # ignore SettingWithCopy here in case the user mutates
--> 300 results[i] = self.f(v)
301 if isinstance(results[i], ABCSeries):
302 # If we have a view on v, we need to make a copy because
<ipython-input-4-877415e9cf83> in <lambda>(x)
21 businessConvention , dateGeneration, monthEnd)
22
---> 23 new_df["Dates"]= Data.apply(lambda x: scheduledates(),axis = 1)
24 new_df["ISIN"] = Data.ISIN
25 new_df
<ipython-input-4-877415e9cf83> in scheduledates()
8
9 def scheduledates():
---> 10 issueDate = ql.Date(Data.issuedate)
11 maturityDate = ql.Date(Data.maturitydate)
12 tenor = ql.Period(ql.Semiannual)
~\anaconda3\lib\site-packages\QuantLib\QuantLib.py in __init__(self, *args)
425
426 def __init__(self, *args):
--> 427 _QuantLib.Date_swiginit(self, _QuantLib.new_Date(*args))
428
429 def weekdayNumber(self):
TypeError: Wrong number or type of arguments for overloaded function 'new_Date'.
Possible C/C++ prototypes are:
Date::Date()
Date::Date(Day,Month,Year)
Date::Date(Day,Month,Year,Hour,Minute,Second,Millisecond,Microsecond)
Date::Date(Day,Month,Year,Hour,Minute,Second,Millisecond)
Date::Date(Day,Month,Year,Hour,Minute,Second)
Date::Date(BigInteger)
Date::Date(std::string const &,std::string)
---------------------------------------------------------------------------
Data = pd.DataFrame(BondData)
Fields from Bond Data
ISIN
issuedate
maturitydate
coupon
Tradeyield
Bond_Price
MarketPrice
Nominal_Amount
From the traceback, the problem is the line:
issueDate = ql.Date(Data.issuedate)
(which for some reason is not in the code you pasted). Coming from Excel, issuedate should be an integer and thus compatible with the ql.Date constructor, but it's possible that pandas is reading it as a string or some other type. You should examine the data frame and check the type of the column. If it's not what you expect, you'll have to figure out whether there are data in that column that pandas can't interpret as integers, and either clean them up or force the conversion somehow before passing them to ql.Date.
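For instance, a minimal sketch (assuming the dates come back from read_excel as strings or pandas Timestamps; the to_ql_date helper is my own, not from the original answer, and the ql.UnitedStates()/ql.Schedule calls mirror the question's code): inspect the dtypes, then build a ql.Date per row instead of passing the whole column:

import pandas as pd
import QuantLib as ql

print(Data.dtypes)  # check what pandas actually read for issuedate / maturitydate

def to_ql_date(value):
    # Assumes strings or Timestamps; Excel serial integers would need
    # a different conversion (e.g. via an origin date).
    ts = pd.Timestamp(value)
    return ql.Date(ts.day, ts.month, ts.year)   # Date::Date(Day, Month, Year)

def scheduledates(row):
    issueDate = to_ql_date(row['issuedate'])
    maturityDate = to_ql_date(row['maturitydate'])
    tenor = ql.Period(ql.Semiannual)
    calendar = ql.UnitedStates()
    businessConvention = ql.Unadjusted
    dateGeneration = ql.DateGeneration.Backward
    monthEnd = False
    return ql.Schedule(issueDate, maturityDate, tenor, calendar, businessConvention,
                       businessConvention, dateGeneration, monthEnd)

new_df["Dates"] = Data.apply(scheduledates, axis=1)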

trimming column names is generating ValueError

I have a table which I run through a function to trim its column names down to 128 characters (I know that's really long; there isn't anything I can do about that) so that to_sql can create a database table from it.
def truncate_column_names(df, length):
    rename = {}
    for col in df.columns:
        if len(col) > length:
            new_col = col[:length-3] + "..."
            rename[col] = new_col
    result = df.rename(columns=rename)
    return result
This function works fine and I get a table out, but the problem comes when I try to save the file; I get the error
ValueError: Buffer has wrong number of dimensions (expected 1, got 2)
The method that does some housekeeping before saving to a file includes dropping duplicates, and that is where this error is raised. I tested this by saving the original DataFrame, loading it, running the truncate function, and then calling drop_duplicates on the result, and I got the same error.
The headers for the file before I truncate look like this:
http://pastebin.com/WXmvwHDg
I trimmed the file down to 1 record and still have the problem.
This was a result of the truncating causing some columns to have non-unique names.
To confirm this was an issue I did a short test:
In [113]: df = pd.DataFrame(columns=["ab", "ac", "ad"])
In [114]: df
Out[114]:
Empty DataFrame
Columns: [ab, ac, ad]
Index: []
In [115]: df.drop_duplicates()
Out[115]:
Empty DataFrame
Columns: [ab, ac, ad]
Index: []
In [116]: df.columns
Out[116]: Index([u'ab', u'ac', u'ad'], dtype='object')
In [117]: df.columns = df.columns.str[:1]
In [118]: df
Out[118]:
Empty DataFrame
Columns: [a, a, a]
Index: []
In [119]: df.drop_duplicates()
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-119-daf275b6788b> in <module>()
----> 1 df.drop_duplicates()

C:\Miniconda\lib\site-packages\pandas\util\decorators.pyc in wrapper(*args, **kwargs)
     86     else:
     87         kwargs[new_arg_name] = new_arg_value
---> 88     return func(*args, **kwargs)
     89     return wrapper
     90 return _deprecate_kwarg

C:\Miniconda\lib\site-packages\pandas\core\frame.pyc in drop_duplicates(self, subset, take_last, inplace)
   2826     deduplicated : DataFrame
   2827     """
-> 2828     duplicated = self.duplicated(subset, take_last=take_last)
   2829
   2830     if inplace:

C:\Miniconda\lib\site-packages\pandas\util\decorators.pyc in wrapper(*args, **kwargs)
     86     else:
     87         kwargs[new_arg_name] = new_arg_value
---> 88     return func(*args, **kwargs)
     89     return wrapper
     90 return _deprecate_kwarg

C:\Miniconda\lib\site-packages\pandas\core\frame.pyc in duplicated(self, subset, take_last)
   2871
   2872     vals = (self[col].values for col in subset)
-> 2873     labels, shape = map(list, zip(*map(f, vals)))
   2874
   2875     ids = get_group_index(labels, shape, sort=False, xnull=False)

C:\Miniconda\lib\site-packages\pandas\core\frame.pyc in f(vals)
   2860
   2861 def f(vals):
-> 2862     labels, shape = factorize(vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))
   2863     return labels.astype('i8', copy=False), len(shape)
   2864

C:\Miniconda\lib\site-packages\pandas\core\algorithms.pyc in factorize(values, sort, order, na_sentinel, size_hint)
    133 table = hash_klass(size_hint or len(vals))
    134 uniques = vec_klass()
--> 135 labels = table.get_labels(vals, uniques, 0, na_sentinel)
    136
    137 labels = com._ensure_platform_int(labels)

pandas\hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_labels (pandas\hashtable.c:13946)()

ValueError: Buffer has wrong number of dimensions (expected 1, got 2)
and got the same result. Using df.columns.unique() after the truncation, I found ~200 duplicate column names.
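A possible fix, as a hedged sketch (this de-duplicating helper is my own, not from the original post): truncate, then append a counter to repeated names, so drop_duplicates and to_sql both see unique column names:

def truncate_unique_column_names(df, length=128):
    seen = {}
    new_cols = []
    for col in df.columns:
        new_col = col[:length-3] + "..." if len(col) > length else col
        if new_col in seen:
            seen[new_col] += 1
            # reserve room for the suffix so the name stays within the limit
            new_col = new_col[:length-4] + "_" + str(seen[new_col])
        else:
            seen[new_col] = 0
        new_cols.append(new_col)
    return df.set_axis(new_cols, axis=1)

This keeps the truncation behaviour of the original function but guarantees uniqueness for typical cases (it does not re-check that a suffixed name is itself unused, so treat it as a sketch rather than a hardened implementation).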