Can pandas df have cell values of numpy array - pandas

I want to store NumPy arrays as cell values in my DataFrame. Is there any way to do this?
Basically, I have pixel data, a (512, 512) NumPy array, that I want to save as the value of the pixel_data column corresponding to its particular id in the ID column of my DataFrame. How can I do this?
Here's what I tried:
for f in train_files[:10]:
    id_tmp = f.split('/')[4].split('.')[0]
    first_dcm = pydicom.read_file(f)
    img = first_dcm.pixel_array
    window = get_windowing(first_dcm)
    image = window_image(img, *window)
    train.loc[train.Image == id_tmp, 'img_before_w'] = img
    train.loc[train.Image == id_tmp, 'img_after_w'] = image
The error I got:
ValueError Traceback (most recent call last)
<ipython-input-47-32236f8c9ccc> in <module>
5 window = get_windowing(first_dcm)
6 image = window_image(img, *window)
----> 7 train.loc[train.Image == id_tmp, 'img_before_w'] = img
8 train.loc[train.Image == id_tmp, 'img_after_w'] = image
9
/opt/conda/lib/python3.6/site-packages/pandas/core/indexing.py in __setitem__(self, key, value)
203 key = com.apply_if_callable(key, self.obj)
204 indexer = self._get_setitem_indexer(key)
--> 205 self._setitem_with_indexer(indexer, value)
206
207 def _validate_key(self, key, axis: int):
/opt/conda/lib/python3.6/site-packages/pandas/core/indexing.py in _setitem_with_indexer(self, indexer, value)
525 if len(labels) != value.shape[1]:
526 raise ValueError(
--> 527 "Must have equal len keys and value "
528 "when setting with an ndarray"
529 )
ValueError: Must have equal len keys and value when setting with an ndarray

The error occurs because pandas treats an ndarray on the right-hand side of .loc as a block of values to spread across the selection, not as a single cell value; wrapping the data in a Series aligned to the target index gets around this. Taking a sample dataframe as below:
train=pd.DataFrame({'Image':[1,2,3,2],'img_before_w':[np.nan, np.nan, np.nan,np.nan]})
print(train) gives
Image img_before_w
0 1 NaN
1 2 NaN
2 3 NaN
3 2 NaN
Now, for example, if you want to insert pixel data where train.Image == 2, it can be achieved with the code below:
mask = train.Image == 2 # contains True for desired rows
target_index=mask[mask==True].index # gives index of rows, wherever condition is met
train.loc[mask, 'img_before_w'] = pd.Series([[512,512]]*len(target_index), index=target_index) # inserts [512,512] array in rows wherever condition is met, in given column
Now, print(train) gives the desired output:
Image img_before_w
0 1 NaN
1 2 [512, 512]
2 3 NaN
3 2 [512, 512]
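Applied back to the original problem, the same trick works for storing the actual (512, 512) arrays, provided the target column holds objects. A minimal sketch with placeholder data (behaviour around array-valued cells has varied between pandas versions, so treat this as an illustration rather than the asker's exact pipeline):
import numpy as np
import pandas as pd

train = pd.DataFrame({'Image': ['id_1', 'id_2', 'id_3']})
train['img_before_w'] = None                                  # create the column up front
train['img_before_w'] = train['img_before_w'].astype(object)  # so it can hold arrays

img = np.zeros((512, 512))                                    # stand-in for the DICOM pixel array
mask = train.Image == 'id_2'
# Wrap the array in a Series aligned to the matched rows, one array per row
train.loc[mask, 'img_before_w'] = pd.Series([img] * int(mask.sum()), index=train.index[mask])

print(train.loc[1, 'img_before_w'].shape)                     # (512, 512)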

Related

How to efficiently filter a specific number of entries and concatenate them into a single tf.data.Dataset?

I have a huge TFRecord file with more than 4M entries. It is a very unbalanced dataset, containing many entries for some labels and few for others compared to the whole dataset. I want to filter a limited number of entries for some of these labels in order to have a balanced dataset. Below you can see my attempt, but it takes more than 24 hours to filter 1k from each label (33 different labels).
import tensorflow as tf

tf.compat.as_str(
    bytes_or_text='str', encoding='utf-8'
)

try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    print("Device:", tpu.master())
    strategy = tf.distribute.TPUStrategy(tpu)
except:
    strategy = tf.distribute.get_strategy()
print("Number of replicas:", strategy.num_replicas_in_sync)

ignore_order = tf.data.Options()
ignore_order.experimental_deterministic = False
dataset = tf.data.TFRecordDataset('/test.tfrecord')
dataset = dataset.with_options(ignore_order)
features, feature_lists = detect_schema(dataset)

# Decoding TFRecord serialized data
def decode_data(serialized):
    X, y = tf.io.parse_single_sequence_example(
        serialized,
        context_features=features,
        sequence_features=feature_lists)
    return X['title'], y['subject']

dataset = dataset.map(lambda x: tf.py_function(func=decode_data, inp=[x], Tout=(tf.string, tf.string)))

# Filtering and concatenating the samples
def balanced_dataset(dataset, labels_list, sample_size=1000):
    datasets_list = []
    for label in labels_list:
        # Filtering the chosen labels
        locals()[label] = dataset.filter(
            lambda x, y: tf.greater(
                tf.reduce_sum(tf.cast(tf.equal(tf.constant(label, dtype=tf.int64), y), tf.float32)),
                tf.constant(0.)))
        # Appending a limited sample
        datasets_list.append(locals()[label].take(sample_size))
    concat_dataset = datasets_list[0]
    # Concatenating the datasets
    for dset in datasets_list[1:]:
        concat_dataset = concat_dataset.concatenate(dset)
    return concat_dataset

balanced_data = balanced_dataset(tabledataset, labels_list=list(decod_dic.values()), sample_size=1000)
One way to solve this is by using the group_by_window method, where window_size would be the sample size of each class (in your case 1k).
ds = ds.group_by_window(
    # Use label as key
    key_func=lambda _, l: l,
    # Convert each window to a sample_size
    reduce_func=lambda _, window: window.batch(sample_size),
    # Use window size as sample_size
    window_size=sample_size)
This will form batches of a single class, each of size sample_size. But there is one problem: there will be multiple batches of the same class, and you only need one of those batches per class.
To solve this, we add a count to each batch and then keep only the batches with count==0, which fetches the first batch of every class.
Let's define an example:
labels = np.array(sum([[label]*repeat for label, repeat in zip([0, 1, 2], [100, 200, 15])], []))
features = np.arange(len(labels))
np.unique(labels, return_counts=True)
#(array([0, 1, 2]), array([100, 200, 15]))
# There are 3 labels chosen for simplicity and each of their counts are shown along.
sample_size = 15 # we choose to pick sample of 15 from each class
We create a dataset from the above inputs:
ds = tf.data.Dataset.from_tensor_slices((features, labels))
In the window function above we modify reduce_func to maintain the counter, so each batch will have 3 elements (X_batch, y_batch, label_counter):
def reduce_func(x, y):
    # class_count[y] += 1
    z = table.lookup(x)
    table.insert(x, z + 1)
    return y.batch(sample_size).map(lambda a, b: (a, b, z))

# Group by window
ds = tf.data.Dataset.from_tensor_slices((features, labels))
ds = ds.group_by_window(
    # Use label as key
    key_func=lambda _, l: l,
    # Convert each window to a sample_size
    reduce_func=reduce_func,
    # Use window size as sample_size
    window_size=sample_size)
The counter logic in reduce_func is implemented as a lookup table: the counter is read from the table and then updated. It is initialized as shown below:
n_classes = 3
keys = tf.range(0, n_classes, dtype=tf.int64)
vals = tf.zeros_like(keys, dtype=tf.int64)
table = tf.lookup.experimental.MutableHashTable(key_dtype=tf.int64,
                                                value_dtype=tf.int64,
                                                default_value=-1)
table.insert(keys, vals)
Now we keep only the batches where count==0 (the first batch of each class) and remove the count element to form (X, y) batch pairs:
ds = ds.filter(lambda x, y, count: count==0)
ds = ds.map(lambda x, y, count: (x, y))
Output:
for x, y in ds:
    print(x.numpy(), y.numpy())
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14] [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[100 101 102 103 104 105 106 107 108 109 110 111 112 113 114] [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[300 301 302 303 304 305 306 307 308 309 310 311 312 313 314] [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
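Putting the pieces above together, a self-contained sketch of the whole approach might look like this (synthetic features and labels stand in for the decoded TFRecord dataset; it assumes a TF 2.x version where Dataset.group_by_window is available as a method):
import numpy as np
import tensorflow as tf

n_classes = 3
sample_size = 15

# Synthetic, imbalanced data: 100 / 200 / 15 examples of labels 0 / 1 / 2
labels = np.array(sum([[label] * repeat
                       for label, repeat in zip([0, 1, 2], [100, 200, 15])], []))
features = np.arange(len(labels))

# Mutable lookup table used as a per-class batch counter
keys = tf.range(0, n_classes, dtype=tf.int64)
vals = tf.zeros_like(keys, dtype=tf.int64)
table = tf.lookup.experimental.MutableHashTable(key_dtype=tf.int64,
                                                value_dtype=tf.int64,
                                                default_value=-1)
table.insert(keys, vals)

def reduce_func(key, window):
    # Read the current batch count for this class, bump it, and attach
    # the pre-bump count to every batch produced from this window
    count = table.lookup(key)
    table.insert(key, count + 1)
    return window.batch(sample_size).map(lambda x, y: (x, y, count))

ds = tf.data.Dataset.from_tensor_slices((features, labels))
ds = ds.group_by_window(key_func=lambda _, l: l,
                        reduce_func=reduce_func,
                        window_size=sample_size)

# Keep only the first batch of each class, then drop the counter again
ds = ds.filter(lambda x, y, count: count == 0)
ds = ds.map(lambda x, y, count: (x, y))

for x, y in ds:
    print(x.numpy(), y.numpy())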

Merging GeoDataFrames - TypeError: float() argument must be a string or a number, not 'Point'

I have a dataframe in which one of the columns holds a Series of shapely Points, and another dataframe in which I have a Series of Polygons.
df.head()
hash number street unit \
2024459 283e04eca5c4932a SN AVENIDA DOUTOR SEVERIANO DE ALMEIDA NaN
2024460 1a92a1c3cba7941a 485 AVENIDA DOUTOR SEVERIANO DE ALMEIDA NaN
2024461 837341c45de519a3 475 AVENIDA DOUTOR SEVERIANO DE ALMEIDA NaN
city district region postcode id geometry
2024459 Jaguari NaN RS 97760-000 NaN POINT (-54.69445 -29.49421)
2024460 Jaguari NaN RS 97760-000 NaN POINT (-54.69445 -29.49421)
2024461 Jaguari NaN RS 97760-000 NaN POINT (-54.69445 -29.49421)
poly_df.head()
centroids geometry
0 POINT (-29.31067315122428 -54.64176359828149) POLYGON ((-54.64069 -29.31161, -54.64069 -29.3...
1 POINT (-29.31067315122428 -54.63961783106958) POLYGON ((-54.63854 -29.31161, -54.63854 -29.3...
2 POINT (-29.31067315122428 -54.637472063857665) POLYGON ((-54.63640 -29.31161, -54.63640 -29.3...
I'm checking if the Point belongs to the Polygon and inserting the Point object into the cell of the second dataframe. However, I'm getting the following error:
Traceback (most recent call last):
File "/tmp/ipykernel_4771/1967309101.py", line 1, in <module>
df.loc[idx, 'centroids'] = poly_mun.loc[ix, 'centroids']
File ".local/lib/python3.8/site-packages/pandas/core/indexing.py", line 692, in __setitem__
iloc._setitem_with_indexer(indexer, value, self.name)
File ".local/lib/python3.8/site-packages/pandas/core/indexing.py", line 1599, in _setitem_with_indexer
self.obj[key] = infer_fill_value(value)
File ".local/lib/python3.8/site-packages/pandas/core/dtypes/missing.py", line 516, in infer_fill_value
val = np.array(val, copy=False)
TypeError: float() argument must be a string or a number, not 'Point'
I'm using the following line:
df.loc[idx, 'centroids'] = poly_df.loc[ix, 'centroids']
I have already tried .at as well.
Thanks
You can't create a new column in pandas with a shapely geometry using loc:
In [1]: import pandas as pd, shapely.geometry
In [2]: df = pd.DataFrame({'mycol': [1, 2, 3]})
In [3]: df.loc[0, "centroid"] = shapely.geometry.Point([0, 0])
/Users/mikedelgado/opt/miniconda3/envs/rhodium-env/lib/python3.10/site-packages/pandas/core/indexing.py:1642: ShapelyDeprecationWarning: The array interface is deprecated and will no longer work in Shapely 2.0. Convert the '.coords' to a numpy array instead.
self.obj[key] = infer_fill_value(value)
/Users/mikedelgado/opt/miniconda3/envs/rhodium-env/lib/python3.10/site-packages/pandas/core/dtypes/missing.py:550: FutureWarning: The input object of type 'Point' is an array-like implementing one of the corresponding protocols (`__array__`, `__array_interface__` or `__array_struct__`); but not a sequence (or 0-D). In the future, this object will be coerced as if it was first converted using `np.array(obj)`. To retain the old behaviour, you have to either modify the type 'Point', or assign to an empty array created with `np.empty(correct_shape, dtype=object)`.
val = np.array(val, copy=False)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Input In [3], in <cell line: 1>()
----> 1 df.loc[0, "centroid"] = shapely.geometry.Point([0, 0])
File ~/opt/miniconda3/envs/rhodium-env/lib/python3.10/site-packages/pandas/core/indexing.py:716, in _LocationIndexer.__setitem__(self, key, value)
713 self._has_valid_setitem_indexer(key)
715 iloc = self if self.name == "iloc" else self.obj.iloc
--> 716 iloc._setitem_with_indexer(indexer, value, self.name)
File ~/opt/miniconda3/envs/rhodium-env/lib/python3.10/site-packages/pandas/core/indexing.py:1642, in _iLocIndexer._setitem_with_indexer(self, indexer, value, name)
1639 self.obj[key] = empty_value
1641 else:
-> 1642 self.obj[key] = infer_fill_value(value)
1644 new_indexer = convert_from_missing_indexer_tuple(
1645 indexer, self.obj.axes
1646 )
1647 self._setitem_with_indexer(new_indexer, value, name)
File ~/opt/miniconda3/envs/rhodium-env/lib/python3.10/site-packages/pandas/core/dtypes/missing.py:550, in infer_fill_value(val)
548 if not is_list_like(val):
549 val = [val]
--> 550 val = np.array(val, copy=False)
551 if needs_i8_conversion(val.dtype):
552 return np.array("NaT", dtype=val.dtype)
TypeError: float() argument must be a string or a real number, not 'Point'
Essentially, pandas doesn't know how to interpret a point object, and so creates a float column with NaNs, and then can't handle the point. This might get fixed in the future, but you're best off explicitly defining the column as object dtype:
In [27]: df['centroid'] = None
In [28]: df['centroid'] = df['centroid'].astype(object)
In [29]: df
Out[29]:
mycol centroid
0 1 None
1 2 None
2 3 None
In [30]: df.loc[0, "centroid"] = shapely.geometry.Point([0, 0])
/Users/mikedelgado/opt/miniconda3/envs/rhodium-env/lib/python3.10/site-packages/pandas/core/internals/managers.py:304: ShapelyDeprecationWarning: The array interface is deprecated and will no longer work in Shapely 2.0. Convert the '.coords' to a numpy array instead.
applied = getattr(b, f)(**kwargs)
In [31]: df
Out[31]:
mycol centroid
0 1 POINT (0 0)
1 2 None
2 3 None
That said, joining two GeoDataFrames with polygons and points based on whether the points are in the polygons certainly sounds like a job for geopandas.sjoin:
union = gpd.sjoin(polygon_df, points_df, op='contains')
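A minimal, self-contained sketch of that approach with made-up data (note that newer geopandas versions spell the keyword predicate rather than op):
import geopandas as gpd
from shapely.geometry import Point, Polygon

points_df = gpd.GeoDataFrame({'point_id': [1, 2]},
                             geometry=[Point(0.5, 0.5), Point(5, 5)],
                             crs='EPSG:4326')
polygon_df = gpd.GeoDataFrame({'poly_id': ['a']},
                              geometry=[Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])],
                              crs='EPSG:4326')

# One row per (polygon, point) pair where the polygon contains the point;
# the point attributes are joined onto the matching polygon rows
union = gpd.sjoin(polygon_df, points_df, predicate='contains')
print(union)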

"ValueError: arrays must all be same length"

In a machine learning project, suppose I have 3 cat images and 2 dog images, and I make a dataframe for the training data.
#pre-processing train data
filenames = os.listdir('/content/train')
categories = []
for filename in os.listdir('/content/train'):
    category = filename.split('.')[0]
    if category == 'dog':
        categories.append(1)  # 1 for dog and 0 for cat
    else:
        categories.append(0)

#make a dictionary
df = pd.DataFrame(
    {
        'filename': filenames,
        'category': categories
    }
)
It gives an error because I don't have the same number of dog and cat images.
ValueError Traceback (most recent call last)
<ipython-input-28-2d4e2440ba41> in <module>()
12 {
13 'filename' : filenames,
---> 14 'category' : categories
15 }
16 )
3 frames
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/construction.py in extract_index(data)
395 lengths = list(set(raw_lengths))
396 if len(lengths) > 1:
--> 397 raise ValueError("arrays must all be same length")
398
399 if have_dicts:
ValueError: arrays must all be same length
Is there any way to fix it without adding any image to the training dataset?
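For reference, pandas raises this error whenever the lists in the dict passed to the DataFrame constructor have different lengths; a minimal sketch with made-up filenames reproduces it:
import pandas as pd

filenames = ['cat.0.jpg', 'cat.1.jpg', 'cat.2.jpg', 'dog.0.jpg', 'dog.1.jpg']
categories = [0, 0, 0, 1]  # hypothetical: one label missing, so lengths are 5 and 4

# Raises "ValueError: arrays must all be same length" because the dict
# values have different lengths
df = pd.DataFrame({'filename': filenames, 'category': categories})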

After renaming a column I get a KeyError

I have df:
df = pd.DataFrame({'a':[7,8,9],
'b':[1,3,5],
'c':[5,3,6]})
print (df)
a b c
0 7 1 5
1 8 3 3
2 9 5 6
Then I rename the first value like this:
df.columns.values[0] = 'f'
All seems very nice:
print (df)
f b c
0 7 1 5
1 8 3 3
2 9 5 6
print (df.columns)
Index(['f', 'b', 'c'], dtype='object')
print (df.columns.values)
['f' 'b' 'c']
If I select b it works fine:
print (df['b'])
0 1
1 3
2 5
Name: b, dtype: int64
But if I select a it returns column f:
print (df['a'])
0 7
1 8
2 9
Name: f, dtype: int64
And if I select f I get a KeyError.
print (df['f'])
#KeyError: 'f'
print (df.info())
#KeyError: 'f'
What is the problem? Can somebody explain it? Or is it a bug?
You aren't expected to alter the values attribute.
Try df.columns.values = ['a', 'b', 'c'] and you get:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-61-e7e440adc404> in <module>()
----> 1 df.columns.values = ['a', 'b', 'c']
AttributeError: can't set attribute
That's because pandas detects that you are trying to set the attribute and stops you.
However, it can't stop you from changing the underlying values object itself.
When you use rename, pandas follows up with a bunch of clean up stuff. I've pasted the source below.
Ultimately, what you've done is alter the values without initiating the clean-up. You can initiate it yourself with a follow-up call to _data.rename_axis (an example can be seen in the source below). This will force the clean-up to be run, and then you can access ['f'].
df._data = df._data.rename_axis(lambda x: x, 0, True)
df['f']
0 7
1 8
2 9
Name: f, dtype: int64
Moral of the story: probably not a great idea to rename a column this way.
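If the goal is simply to rename a column, the supported routes avoid touching .values entirely; a minimal sketch:
import pandas as pd

df = pd.DataFrame({'a': [7, 8, 9], 'b': [1, 3, 5], 'c': [5, 3, 6]})

# Either rename through a mapping ...
df = df.rename(columns={'a': 'f'})

# ... or assign a whole new list of column labels at once
df.columns = ['f', 'b', 'c']

print(df['f'])  # works as expected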
but this story gets weirder
This is fine
df = pd.DataFrame({'a':[7,8,9],
'b':[1,3,5],
'c':[5,3,6]})
df.columns.values[0] = 'f'
df['f']
0 7
1 8
2 9
Name: f, dtype: int64
This is not fine
df = pd.DataFrame({'a':[7,8,9],
'b':[1,3,5],
'c':[5,3,6]})
print(df)
df.columns.values[0] = 'f'
df['f']
KeyError:
Turns out, we can modify the values attribute prior to displaying df and it will apparently run all the initialization upon the first display. If you display it prior to changing the values attribute, it will error out.
weirder still
df = pd.DataFrame({'a':[7,8,9],
'b':[1,3,5],
'c':[5,3,6]})
print(df)
df.columns.values[0] = 'f'
df['f'] = 1
df['f']
f f
0 7 1
1 8 1
2 9 1
As if we didn't already know that this was a bad idea...
source for rename
def rename(self, *args, **kwargs):
    axes, kwargs = self._construct_axes_from_arguments(args, kwargs)
    copy = kwargs.pop('copy', True)
    inplace = kwargs.pop('inplace', False)

    if kwargs:
        raise TypeError('rename() got an unexpected keyword '
                        'argument "{0}"'.format(list(kwargs.keys())[0]))

    if com._count_not_none(*axes.values()) == 0:
        raise TypeError('must pass an index to rename')

    # renamer function if passed a dict
    def _get_rename_function(mapper):
        if isinstance(mapper, (dict, ABCSeries)):
            def f(x):
                if x in mapper:
                    return mapper[x]
                else:
                    return x
        else:
            f = mapper
        return f

    self._consolidate_inplace()
    result = self if inplace else self.copy(deep=copy)

    # start in the axis order to eliminate too many copies
    for axis in lrange(self._AXIS_LEN):
        v = axes.get(self._AXIS_NAMES[axis])
        if v is None:
            continue
        f = _get_rename_function(v)
        baxis = self._get_block_manager_axis(axis)
        result._data = result._data.rename_axis(f, axis=baxis, copy=copy)
        result._clear_item_cache()

    if inplace:
        self._update_inplace(result._data)
    else:
        return result.__finalize__(self)

Trimming column names is generating ValueError

I have a table which I run through a function to trim its column names down to 128 characters (I know that's really long, there isn't anything I can do about that) so I can use to_sql to create a database from it.
def truncate_column_names(df, length):
    rename = {}
    for col in df.columns:
        if len(col) > length:
            new_col = col[:length-3] + "..."
            rename[col] = new_col
    result = df.rename(columns=rename)
    return result
This function works fine and I get a table out just fine, but the problem comes when I try to save the file; I get the error
ValueError: Buffer has wrong number of dimensions (expected 1, got 2)
The method I have doing some housekeeping before saving to a file includes dropping duplicates, and that is where this error is being spat out. I tested this by saving the original DataFrame, then loading it, running the truncate function, and then trying drop_duplicates on the result, and I get the same error.
The headers for the file before I try truncating look like this:
http://pastebin.com/WXmvwHDg
I trimmed the file down to 1 record and still have the problem.
This was a result of the truncating causing some columns to have non-unique names.
To confirm this was the issue, I did a short test:
In [113]: df = pd.DataFrame(columns=["ab", "ac", "ad"])
In [114]: df
Out[114]:
Empty DataFrame
Columns: [ab, ac, ad]
Index: []
In [115]: df.drop_duplicates()
Out[115]:
Empty DataFrame
Columns: [ab, ac, ad]
Index: []
In [116]: df.columns
Out[116]: Index([u'ab', u'ac', u'ad'], dtype='object')
In [117]: df.columns = df.columns.str[:1]
In [118]: df
Out[118]:
Empty DataFrame
Columns: [a, a, a]
Index: []
In [119]: df.drop_duplicates()
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-119-daf275b6788b> in <module>()
----> 1 df.drop_duplicates()
C:\Miniconda\lib\site-packages\pandas\util\decorators.pyc in wrapper(*args, **kwargs)
86 else:
87 kwargs[new_arg_name] = new_arg_value
---> 88 return func(*args, **kwargs)
89 return wrapper
90 return _deprecate_kwarg
C:\Miniconda\lib\site-packages\pandas\core\frame.pyc in drop_duplicates(self, subset, take_last, inplace)
2826 deduplicated : DataFrame
2827 """
-> 2828 duplicated = self.duplicated(subset, take_last=take_last)
2829
2830 if inplace:
C:\Miniconda\lib\site-packages\pandas\util\decorators.pyc in wrapper(*args, **kwargs)
86 else:
87 kwargs[new_arg_name] = new_arg_value
---> 88 return func(*args, **kwargs)
89 return wrapper
90 return _deprecate_kwarg
C:\Miniconda\lib\site-packages\pandas\core\frame.pyc in duplicated(self, subset, take_last)
2871
2872 vals = (self[col].values for col in subset)
-> 2873 labels, shape = map(list, zip( * map(f, vals)))
2874
2875 ids = get_group_index(labels, shape, sort=False, xnull=False)
C:\Miniconda\lib\site-packages\pandas\core\frame.pyc in f(vals)
2860
2861 def f(vals):
-> 2862 labels, shape = factorize(vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))
2863 return labels.astype('i8',copy=False), len(shape)
2864
C:\Miniconda\lib\site-packages\pandas\core\algorithms.pyc in factorize(values, sort, order, na_sentinel, size_hint)
133 table = hash_klass(size_hint or len(vals))
134 uniques = vec_klass()
--> 135 labels = table.get_labels(vals, uniques, 0, na_sentinel)
136
137 labels = com._ensure_platform_int(labels)
pandas\hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_labels (pandas\hashtable.c:13946)()
ValueError: Buffer has wrong number of dimensions (expected 1, got 2)
and got the same result. Using df.columns.unique() after the truncation, I found I had ~200 duplicate columns.
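If the truncation itself is unavoidable, one workaround (a sketch, not the original function) is to make the truncated names unique again, for example by appending a counter whenever a truncated name has already been seen:
def truncate_column_names_unique(df, length):
    seen = {}
    new_cols = []
    for col in df.columns:
        new_col = col if len(col) <= length else col[:length - 3] + "..."
        if new_col in seen:
            # Append a counter so truncated names stay unique
            # (note: the suffix can push a name slightly past the limit)
            seen[new_col] += 1
            new_col = "{}_{}".format(new_col, seen[new_col])
        else:
            seen[new_col] = 0
        new_cols.append(new_col)
    out = df.copy()
    out.columns = new_cols
    return out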