How to safely subclass ndarray and get behavior consistent with ndarray - odd nanmin/max results? - numpy

I'm trying to subclass an ndarray so that I can add some additional fields. When I do this, however, I get odd behavior in a variety of numpy functions. For example, nanmin now returns an object of the type of my new array class, whereas previously I'd get a float64. Why? Is this a bug with nanmin or my class?
import numpy as np
class NDArrayWithColumns(np.ndarray):
    """An ``ndarray`` view that additionally carries a tuple of column names.

    Parameters
    ----------
    obj : array_like
        The data to view as an ``NDArrayWithColumns``. It is passed through
        ``np.asarray`` first, so plain sequences are accepted as well.
    columns : iterable, optional
        Column labels, stored as a tuple in the ``columns`` attribute.
        When omitted, ``columns`` is ``None``.
    """

    def __new__(cls, obj, columns=None):
        obj = np.asarray(obj).view(cls)
        # ``tuple(None)`` raises TypeError, so the default needs its own branch.
        obj.columns = None if columns is None else tuple(columns)
        return obj

    def __array_finalize__(self, obj):
        # Nothing to inherit from when there is no source object.
        if obj is None:
            return
        # Views and slices inherit the labels of the array they came from;
        # a plain ndarray source has no ``columns`` and yields ``None``.
        self.columns = getattr(obj, 'columns', None)
NAN = float("nan")
r = np.array([1.,0.,1.,0.,1.,0.,1.,0.,NAN, 1., 1.])
print "MIN", np.nanmin(r), type(np.nanmin(r))
MIN 0.0 <type 'numpy.float64'>
>>> r = NDArrayWithColumns(r, ["a"])
>>> print "MIN", np.nanmin(r), type(np.nanmin(r))
MIN 0.0 <class '__main__.NDArrayWithColumns'>
>>> print r.shape
Note the change in type, and also that str(np.nanmin(r)) shows 1 field, not 11.
In case you're interested, I'm subclassing because I'd like to track column names in matrices of a single type (structured arrays and record arrays allow for varying types).

You need to implement the __array_wrap__ method that gets called at the end of ufuncs, per the docs:
def __array_wrap__(self, out_arr, context=None):
    """Log the wrap step, then hand over to ``ndarray.__array_wrap__``.

    Prints ``self`` and the raw output array so it is visible when and with
    what arguments the hook is invoked.
    """
    print('In __array_wrap__:')
    print(' self is {!r}'.format(self))
    print(' arr is {!r}'.format(out_arr))
    # Delegate to the base class so the returned object is unchanged.
    wrapped = np.ndarray.__array_wrap__(self, out_arr, context)
    return wrapped


alternative way to define a function inside a class method [closed]

I have the following class:
class Analysis():
    """Load a measurement table from a CSV file and summarise or style it."""

    def __init__(self, file_dir):
        self.path = file_dir  # file path directory

    def getData(self):
        """Read ``self.path`` with ``pd.read_csv`` (the file is re-read on every call)."""
        return pd.read_csv(self.path)  # create a pandas dataframe

    def getStd(self):
        """Return the standard deviation of every column over rows '1' to '5'."""
        return self.getData().loc['1':'5'].apply(lambda x: x.std())

    def getHighlight(self):
        """Return the data as a Styler with rows '1'..'5' colour-coded.

        Cells outside the 'LSL'/'USL' limits are red, cells exactly on a
        limit are yellow, and every other cell in those rows is green.
        """
        # a function to highlight df based on the given condition
        def highlight(x):
            c1 = 'background-color:red'
            c2 = 'background-color:yellow'
            c3 = 'background-color:green'
            # rows over which the highlighting function should apply
            r = ['1', '2', '3', '4', '5']
            # first boolean mask: values outside the limits
            m1 = (x.loc[r] > x.loc['USL']) | (x.loc[r] < x.loc['LSL'])
            # second boolean mask: values exactly on a limit
            m2 = (x.loc[r] == x.loc['USL']) | (x.loc[r] == x.loc['LSL'])
            # DataFrame with the same index and columns, filled with empty strings
            df1 = pd.DataFrame('', index=x.index, columns=x.columns)
            # np.select picks the style of the first matching mask, else c3
            df1.loc[r, :] = np.select([m1, m2], [c1, c2], default=c3)
            return df1

        # axis=None hands the whole frame to ``highlight`` in one call
        return self.getData().style.apply(highlight, axis=None)
getData method returns the df like this:
my_analysis = Analysis(path_to_file)
Tg 0.37 10.24 5.02 0.63 20.30
USL 0.39 10.26 5.04 0.65 20.32
LSL 0.35 10.22 5.00 0.63 20.28
1 0.35 10.23 5.05 0.65 20.45
2 0.36 10.19 5.07 0.67 20.25
3 0.34 10.25 5.03 0.66 20.33
4 0.35 10.20 5.08 0.69 20.22
5 0.33 10.17 5.05 0.62 20.40
Max 0.36 10.25 5.08 0.69 20.45
Min 0.33 10.17 5.03 0.62 20.22
The getHighlight method has an inner function which is applied to the df in order to highlight the df elements based on the given masks, and it would output something like this:
My question is what is the pythonic or elegant way of defining the inner function inside the class method?
Disclaimer: the following remarks represent my opinion about the topic of pythonic code.
Avoid Inner Functions
You should avoid inner functions at all cost. Sometimes they're necessary, but most of the time they're an indication that you might want to refactor your code.
Avoid re-reading multiple times
I would also avoid calling pd.read_csv every time I want to perform some operation in the data. Unless there's a good reason to read the file over and over again, It's more performant to read it once and store it in a class attribute, or property.
PEP-8 Naming Conventions
Another important thing to consider, if you're trying to make your code more pythonic, is to try to follow the PEP8 naming conventions, unless you're working on legacy code that does not follow PEP-8.
Class Overkill
Finally, I think that creating a class for what you're doing seems a little overkill. Most of your methods are simply transformations that could be easily converted to functions. Aside from making your code less complex, It would improve its reusability.
How I would write the Analysis class
from __future__ import absolute_import, annotations
from pathlib import Path
from typing import Any, Collection, Iterable, Type, Union
import numpy as np
import pandas as pd
from pandas.core.dtypes.dtypes import ExtensionDtype # type: ignore
# Custom types for type hinting
# Anything usable as an index or column collection.
Axes = Collection[Any]
# Dtype specifications understood by numpy: a name, a dtype, or a Python type.
NpDtype = Union[
    str, np.dtype, Type[Union[str, float, int, complex, bool, object]]
]
# Either a pandas extension dtype or a numpy-style dtype specification.
Dtype = Union["ExtensionDtype", NpDtype]
# Auxiliary functions
def is_iterable_not_string(iterable: Any) -> bool:
    """Return True, if `iterable` is an iterable object, and not a string.

    Parameters
    ----------
    iterable : Any
        The object to check whether it's an iterable except for strings,
        or not.

    Returns
    -------
    bool
        True, if object is iterable, but not a string (``str`` or ``bytes``).
        Otherwise, if object isn't an iterable, or if it's a string, return
        False.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> class FakeIterable(int):
    ...     def __iter__(self): pass
    >>> print(is_iterable_not_string('abcde'))
    False
    >>> print(is_iterable_not_string(bytes(12345)))
    False
    >>> print(is_iterable_not_string(12345))
    False
    >>> print(is_iterable_not_string(123.45))
    False
    >>> print(is_iterable_not_string(type))
    False
    >>> print(is_iterable_not_string(list))  # Type list isn't iterable
    False
    >>> print(is_iterable_not_string(object))
    False
    >>> print(is_iterable_not_string(None))
    False
    >>> print(is_iterable_not_string(list()))  # Empty list is still iterable
    True
    >>> # `FakeIterable` has a method `__iter__`, therefore it's considered
    >>> # iterable, even though it isn't.
    >>> print(is_iterable_not_string(FakeIterable(10)))
    True
    >>> print(is_iterable_not_string(list('abcde')))
    True
    >>> print(is_iterable_not_string(tuple('abcde')))
    True
    >>> print(is_iterable_not_string(set('abcde')))
    True
    >>> print(is_iterable_not_string(np.array(list('abcdef'))))
    True
    >>> print(is_iterable_not_string({col: [1, 2, 3, 4] for col in 'abcde'}))
    True
    >>> print(is_iterable_not_string(
    ...     pd.DataFrame({col: [1, 2, 3, 4] for col in 'abcde'}))
    ... )
    True
    >>> print(is_iterable_not_string(pd.DataFrame()))
    True

    Notes
    -----
    In python, any object that contains a method called `__iter__` is
    considered an "iterable". This means that you can, in theory, fake an
    "iterable" object, by creating a method called `__iter__` that doesn't
    contain any real implementation. For a concrete case, see the examples
    section.

    Python common iterable objects:

    - strings
    - bytes
    - lists
    - tuples
    - sets
    - dictionaries

    Python common non-iterable objects:

    - integers
    - floats
    - None
    - types
    - objects
    """
    # Strings and bytes are iterable too, so they are ruled out first.
    if isinstance(iterable, (bytes, str)):
        return False
    return isinstance(iterable, Iterable)
def prepare_dict(data: dict) -> dict:
    """Transform non-iterable dictionary values into lists.

    Parameters
    ----------
    data : dict
        The dictionary to convert non-iterable values into lists.

    Returns
    -------
    dict
        Dictionary with non-iterable values converted to lists. Strings and
        bytes count as non-iterable here and are wrapped as well.

    Examples
    --------
    >>> import pandas as pd
    >>> d = {'a': '1', 'b': 2}
    >>> prepare_dict(d)
    {'a': ['1'], 'b': [2]}
    >>> pd.DataFrame(d)  # doctest: +ELLIPSIS
    Traceback (most recent call last):
    ...
    ValueError: If using all scalar values, you must pass an index
    >>> pd.DataFrame(prepare_dict(d))
       a  b
    0  1  2

    Notes
    -----
    Use this function to prepare dictionaries, before calling
    `pandas.DataFrame`, to make sure all values have the correct format.
    """
    return {
        key: value if is_iterable_not_string(value) else [value]
        for key, value in data.items()
    }
def check_dict_value_lens(data: dict) -> bool:
    """Check whether all values from dictionary have the same length.

    Parameters
    ----------
    data : dict
        The dictionary to check the values lengths. Every value must
        support ``len()``.

    Returns
    -------
    bool
        True, if all `data` values have the same length. False otherwise.
        An empty dictionary has no mismatching values and yields True.
    """
    # At most one distinct length means every value agrees; this also covers
    # the empty dictionary, where ``min()`` over no lengths would raise.
    return len({len(value) for value in data.values()}) <= 1
def read_file(path: Path | str, **kwargs: Any) -> pd.DataFrame:
    """Read a DataFrame from a file.

    The reader is picked from the file suffix, compared case-insensitively.
    Supported file types are:

    - `.csv`, `.txt`
    - `.xlsx`, `.xls`, `.xlsm`, `.xlsb` (Excel files)
    - `.json`
    - `.pickle`
    - `.html` (the first table found in the document)
    - `.feather`
    - `.parquet`, `.pq`

    Parameters
    ----------
    path : Path | str
        The path to the file.
    kwargs : Any
        Keyword arguments to pass to pandas io functions.

    Returns
    -------
    pd.DataFrame
        The DataFrame read from the file.

    Raises
    ------
    ValueError
        If the file type not supported.
    FileNotFoundError
        If the file doesn't exist.
    """
    _path = Path(path)
    path = str(path)
    if not _path.is_file():
        raise FileNotFoundError(f"File {path} does not exist.")

    # Lower-case once so that e.g. "DATA.CSV" is recognised as well.
    suffix = _path.suffix.lower()
    if suffix in (".csv", ".txt"):
        return pd.read_csv(path, **kwargs)
    if ".xls" in suffix:  # matches .xls, .xlsx, .xlsm and .xlsb
        return pd.read_excel(path, **kwargs)
    if suffix == ".json":
        return pd.read_json(path, **kwargs)
    if suffix == ".pickle":
        # NOTE(security): unpickling can execute arbitrary code; only load
        # pickle files that come from a trusted source.
        return pd.read_pickle(path, **kwargs)
    if suffix == ".html":
        # read_html returns a list with one DataFrame per table; take the
        # first one so the function keeps its promise of a single DataFrame.
        return pd.read_html(path, **kwargs)[0]
    if suffix == ".feather":
        return pd.read_feather(path, **kwargs)
    if suffix in (".parquet", ".pq"):
        return pd.read_parquet(path, **kwargs)
    raise ValueError(f"File {path} has an unknown extension.")
def highlight(df: pd.DataFrame) -> pd.DataFrame:
    """Highlight a DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to highlight. Required indexes:

        - ["USL", "LSL", "1", "2", "3", "4", "5"]

    Returns
    -------
    pd.DataFrame
        A frame of CSS strings with the same index and columns as `df`.
        Rows "1" to "5" are red where the value is outside the LSL/USL
        limits, yellow where it equals a limit and green otherwise; all
        other rows hold empty strings.
    """
    # The dataframe cells background colors.
    c1: str = "background-color:red"
    c2: str = "background-color:yellow"
    c3: str = "background-color:green"
    # Rows over which the highlighting function should apply
    rows: list[str] = ["1", "2", "3", "4", "5"]
    # First boolean mask: values outside the limits
    m1 = (df.loc[rows] > df.loc["USL"]) | (df.loc[rows] < df.loc["LSL"])
    # Second boolean mask: values exactly on a limit
    m2 = (df.loc[rows] == df.loc["USL"]) | (df.loc[rows] == df.loc["LSL"])
    # DataFrame with same index, and column names as the original,
    # but with filled empty strings.
    df_highlight = pd.DataFrame("", index=df.index, columns=df.columns)
    # np.select takes the style of the first mask that matches, else c3.
    df_highlight.loc[rows, :] = np.select(
        [m1, m2], [c1, c2], default=c3
    )
    return df_highlight
class Analysis:
    """Read a dataframe, and help performing some analysis in the data.

    Parameters
    ----------
    path_or_data : str | Path | pd.DataFrame
        The path to a file, or a dataframe to analyze.

    Attributes
    ----------
    _data : pd.DataFrame
        The data read from the file.
    _path : str | Path
        The path to the file.

    Examples
    --------
    >>> data = {
    ...     'A-A': [
    ...         0.37, 0.39, 0.35, 0.35, 0.36, 0.34, 0.35, 0.33, 0.36, 0.33,
    ...     ],
    ...     'A-B': [
    ...         10.24, 10.26, 10.22, 10.23, 10.19, 10.25, 10.2, 10.17, 10.25,
    ...         10.17,
    ...     ],
    ...     'A-C': [
    ...         5.02, 5.04, 5.0, 5.05, 5.07, 5.03, 5.08, 5.05, 5.08, 5.03,
    ...     ],
    ...     'A-D': [
    ...         0.63, 0.65, 0.63, 0.65, 0.67, 0.66, 0.69, 0.62, 0.69, 0.62,
    ...     ],
    ...     'A-E': [
    ...         20.3, 20.32, 20.28, 20.45, 20.25, 20.33, 20.22, 20.4,
    ...         20.45, 20.22,
    ...     ],
    ... }
    >>> index = ['Tg', 'USL', 'LSL', '1', '2', '3', '4', '5', 'Max', 'Min']
    >>> analysis = Analysis.from_dict(data, index=index)
    >>> analysis.get_std()
    A-A    0.011402
    A-B    0.031937
    A-C    0.019494
    A-D    0.025884
    A-E    0.097211
    dtype: float64
    """

    _path: Path | str | None = None
    _data: pd.DataFrame | None = None

    @property
    def path(self) -> str | Path:
        """Get the path to the file.

        Returns
        -------
        str | Path
            The path to the file.

        Raises
        ------
        ValueError
            If `_path` is `None`.
        """
        if self._path is None:
            raise ValueError("Path not set.")
        return str(self._path)

    @path.setter
    def path(self, path: str | Path):
        """Set the path of the file to analyze.

        Parameters
        ----------
        path : str | Path
            The path of the file to analyze. It must point to an existing
            file.

        Raises
        ------
        FileNotFoundError
            If the path not found.
        """
        _path = Path(path)
        # Guard clause: only an existing file may be stored.
        if not _path.is_file():
            raise FileNotFoundError(f"Path {path} does not exist.")
        self._path = str(path)

    @property
    def data(self) -> pd.DataFrame:
        """Dataframe read from `path`.

        The file is read on first access and the result is cached.

        Returns
        -------
        pd.DataFrame
            The dataframe read from `path`.
        """
        if self._data is None:
            self._data = self.get_data()
        return self._data

    @data.setter
    def data(self, data: pd.DataFrame):
        """Set the data to analyze.

        Parameters
        ----------
        data : pd.DataFrame
            The data to analyze.
        """
        self._data = data

    def __init__(self, path_or_data: str | Path | pd.DataFrame):
        """Initialize the Analyzer.

        Parameters
        ----------
        path_or_data : str | Path | pd.DataFrame
            The path to a file, or a dataframe to analyze.

        Raises
        ------
        ValueError
            If `path_or_data` not a `str`, `Path`, or `pd.DataFrame`.
        """
        if isinstance(path_or_data, (str, Path)):
            self.path = path_or_data
        elif isinstance(path_or_data, pd.DataFrame):
            self.data = path_or_data
        else:
            raise ValueError(f"Invalid type {type(path_or_data)}.")

    def get_data(self) -> pd.DataFrame:
        """Read the data from the file.

        Returns
        -------
        pd.DataFrame
            The dataframe read from the `path` property.
        """
        return read_file(self.path)

    def get_std(self) -> pd.Series:
        """Calculate the standard deviation of every column.

        Only the rows labelled "1" through "5" are taken into account.

        Returns
        -------
        pd.Series
            The standard deviation of every column.
        """
        return self.data.loc["1":"5"].apply(lambda x: x.std())  # type: ignore

    def highlight_frame(
        self, round_values: int | None = None
    ) -> pd.io.formats.style.Styler:  # type: ignore
        """Highlight dataframe, based on some condition.

        Parameters
        ----------
        round_values: int | None
            If defined, sets the precision of the Styler object with the
            highlighted dataframe. Negative values are ignored.

        Returns
        -------
        pd.io.formats.style.Styler
            The Styler object with the highlighted dataframe.
        """
        # axis=None passes the whole frame to ``highlight`` in a single call.
        highlight_df = self.data.style.apply(highlight, axis=None)
        if isinstance(round_values, int) and round_values >= 0:
            return highlight_df.format(precision=round_values)
        return highlight_df

    @classmethod
    def from_dict(
        cls,
        data: dict,
        index: Axes | None = None,
        columns: Axes | None = None,
        dtype: Dtype | None = None,
    ) -> Analysis:
        """Create an Analysis object from a dictionary.

        Parameters
        ----------
        data : dict
            The dictionary to create the Analysis object from.
        index : Index or array-like
            Index to use for resulting frame. Defaults to RangeIndex, if
            no indexing information part of input data and no index provided.
        columns : Index or array-like
            Column labels to use for resulting frame when data doesn't have
            them, defaulting to RangeIndex(0, 1, 2, ..., n).
            If data contains column labels, will perform column selection
            instead.
        dtype : dtype, default None
            Data type to force. Only a single dtype allowed. If None, infer.

        Returns
        -------
        Analysis
            An instance of the `Analysis` class.

        Raises
        ------
        ValueError
            If dictionary values have different lengths.
        """
        # Scalars are wrapped into lists so pandas accepts them as columns.
        data = prepare_dict(data)
        if check_dict_value_lens(data):
            return cls(
                pd.DataFrame(data, index=index, columns=columns, dtype=dtype)
            )
        raise ValueError(
            f"Dictionary values don't have the same lenghts.\nData: {data}"
        )
if __name__ == "__main__":
    import doctest

    # Run the examples embedded in the docstrings of this module.
    doctest.testmod()

Keras custom layer on ragged tensor to reduce dimensionality

I'm trying to write a custom layer that will handle variable-length vectors, and reduce them to the same length vector.
The length is known in advance because the reason for the variable lengths is that I have several different data types that I encode using a different number of features.
In a sense, it is similar to Embedding only for numerical values.
I've tried using padding, but the results were bad, so I'm trying this approach instead.
So, for example let's say I have 3 data types, which I encode with 3, 4, 6 length vectors.
arr = [
# example one (data type 1 [len()==3], datat type 3[len()==6]) - force values as floats
# example two (data type 2 [len()==4], datat type 3len()==6]) - force values as floats
I tried implementing a custom layer like:
class DimensionReducer(tf.keras.layers.Layer):
    """Project variable-length vectors to a common length.

    One weight matrix ``w_<l>`` and one bias ``b_<l>`` is created for every
    length ``l`` in ``expected_lengths``; ``call`` picks the pair that
    matches the length of each incoming vector.
    """

    def __init__(self, output_dim, expected_lengths):
        super(DimensionReducer, self).__init__()
        self._supports_ragged_inputs = True
        self.output_dim = output_dim
        for l in expected_lengths:
            # Weights are stored under a name derived from the input length.
            setattr(self, f'w_{l}', self.add_weight(shape=(l, self.output_dim), initializer='random_normal', trainable=True))
            setattr(self, f'b_{l}', self.add_weight(shape=(self.output_dim,), initializer='random_normal', trainable=True))

    def call(self, inputs):
        # batch
        # NOTE(review): only rank-3 input is handled; any other rank falls
        # through and returns None.
        if len(inputs.shape) == 3:
            result = []
            for i, x in enumerate(inputs):
                _result = []
                for v in x:
                    # NOTE(review): the weight lookup needs a concrete Python
                    # int here; with a symbolic length the attribute name
                    # cannot be resolved.
                    l = len(v)
                    w = getattr(self, f'w_{l}')
                    b = getattr(self, f'b_{l}')
                    out = tf.matmul([v], w) + b
                    # Collect the projection; otherwise tf.concat below
                    # would receive an empty list.
                    _result.append(out)
                result.append(tf.concat(_result, 0))
            r = tf.stack(result)
            print("batch output:", r.shape)
            return r
Which seems to be working when called directly:
dim = DimensionReducer(3, [3,4,6])
But when I try to incorporate it into a model, it fails:
import tensorflow as tf
val_ragged = tf.ragged.constant(arr)
inputs_ragged = tf.keras.layers.Input(shape=(None,None), ragged=True)
outputs_ragged = DimensionReducer(3, [3,4,6])(inputs_ragged)
model_ragged = tf.keras.Model(inputs=inputs_ragged, outputs=outputs_ragged)
# this one with RaggedTensor doesn't
AttributeError: 'DimensionReducer' object has no attribute 'w_Tensor("dimension_reducer_98/strided_slice:0", shape=(), dtype=int32)'
I'm not sure how to implement such a layer, or what I'm doing wrong.

numpy.nanmean() on a subclass of numpy.ndarray returns unexpected type

starting from a popular example
import numpy as np
class TestArray(np.ndarray):
    """Minimal ``ndarray`` subclass that forwards construction to ``ndarray.__new__``."""

    def __new__(subtype, shape, dtype=float, buffer=None, offset=0,
                strides=None, order=None):
        # Every constructor argument, including ``order``, is passed through
        # unchanged to the base class.
        obj = np.ndarray.__new__(subtype, shape, dtype, buffer, offset, strides,
                                 order)
        return obj
obj = TestArray(shape=(3,))
obj[:] = [1, 2, 3]
print type(np.nanmean(obj))
print type(np.nanmean(numpy.array(obj)))
gives the output
<class '__main__.TestArray'>
<type 'numpy.float64'>
I'd rather like numpy.nanmean(obj) to return also a numpy.float64.
Now obviously I can cast to numpy.array, but I don't want to do this.
What do I need to modify in the class definition such that numpy.nanmean() (and probably others) return always the same type as if called with a numpy.ndarray directly?

numpy matrix trace behaviour

If X is a NumPy matrix object, why does np.trace(X) return a scalar (as expected) but X.trace() return a 1x1 matrix object?
>>> X = np.matrix([[1, 2], [3, 4]])
>>> np.trace(X)
>>> X.trace()
matrix([[5]]) # Why not just 5, which would be more useful?
I'm using NumPy 1.7.1, but don't see anything in the release notes of 1.8 to suggest anything's changed.
def __array_finalize__(self, obj):
    """Force every newly created matrix to be two-dimensional."""
    self._getitem = False
    # While indexing is in progress the shape is left alone.
    if (isinstance(obj, matrix) and obj._getitem): return
    ndim = self.ndim
    if (ndim == 2):
        return
    if (ndim > 2):
        # Drop length-1 axes and see whether two dimensions remain.
        newshape = tuple([x for x in self.shape if x > 1])
        ndim = len(newshape)
        if ndim == 2:
            self.shape = newshape
            return
        elif (ndim > 2):
            raise ValueError("shape too large to be a matrix.")
    else:
        newshape = self.shape
    # Fewer than two dimensions: pad up to a 1x1 or 1xN matrix.
    if ndim == 0:
        self.shape = (1, 1)
    elif ndim == 1:
        self.shape = (1, newshape[0])
    return
This is from the matrix definition, subclassing ndarray. The trace function is not changed so it is actually the same function getting called.
This function is getting called every time a matrix object is created. The problem is that if ndims is less than 2, it is forced to be larger.
Then here comes some educated guess work, which i think should be true, but i'm not familiar enough with numpy codebase to figure it out exactly.
np.trace and ndarray.trace are two different functions.
np.trace is defined in "core/fromnumeric.py"
ndarray.trace is defined in "core/src/multiarray/methods.c or calculation.c"
np.trace converts the object to a ndarray
ndarray.trace will try to keep the object as the subclassed object.
Unsure about this, i did not understand squat of that code tbh
both trace functions, will keep the result as an array object (subclassed or not). If it's a single value, it will return that single value, or else it returns the array object.
Since the result is kept as a matrix object, it will be forced to be two dimensional by the function above here. And because of this, it will not be returned as a single value, but as the matrix object.
This conclusion is further backed up by editing the __array_finalize__ function like this:
def __array_finalize__(self, obj):
    """Experimental variant: skip the padding of 0-d and 1-d results to 2-D."""
    self._getitem = False
    if (isinstance(obj, matrix) and obj._getitem): return
    ndim = self.ndim
    if (ndim == 2):
        return
    if (ndim > 2):
        newshape = tuple([x for x in self.shape if x > 1])
        ndim = len(newshape)
        if ndim == 2:
            self.shape = newshape
            return
        elif (ndim > 2):
            raise ValueError("shape too large to be a matrix.")
    else:
        newshape = self.shape
    # Demonstration only: returning here leaves results with fewer than two
    # dimensions untouched, so the reshaping below is never reached.
    return
    if ndim == 0:
        self.shape = (1, 1)
    elif ndim == 1:
        self.shape = (1, newshape[0])
notice the new return before the last if-else check. Now the result of X.trace() is a single value.
THIS IS NOT A FIX, revert the change if you try this yourself.
They have good reasons for doing this
np.trace does not have this problem, since it converts the object to an array directly.
The code for np.trace is (without the docstring):
def trace(a, offset=0, axis1=0, axis2=1, dtype=None, out=None):
    """Call ``ndarray.trace`` on ``a`` after passing it through ``asarray``."""
    as_array = asarray(a)
    return as_array.trace(offset, axis1, axis2, dtype, out)
From the docstring of asarray
Array interpretation of a. No copy is performed if the input
is already an ndarray. If a is a subclass of ndarray, a base
class ndarray is returned.
Because X.trace is coded that way! The matrix documentation says:
A matrix is a specialized 2-D array that retains its 2-D nature
through operations.
np.trace is coded as (using ndarray.trace):
return asarray(a).trace(offset, axis1, axis2, dtype, out)
It's harder to follow how the matrix trace is evaluated. But looking at
I suspect it is equivalent to:
In that same file, sum is defined as:
return N.ndarray.sum(self, axis, dtype, out, keepdims=True)._collapse(axis)
mean, prod etc do the same. _collapse returns a scalar if axis is None. There isn't an explicit definition for a matrix trace, so it probably uses __array_finalize__. In other words, trace returns the default matrix type.
Several constructs that return the scalar are:

Itertools for containers

Consider the following interactive example
>>> l=imap(str,xrange(1,4))
>>> list(l)
['1', '2', '3']
>>> list(l)
Does anyone know if there is already an implementation somewhere out there with a version of imap (and the other itertools functions) such that the second time list(l) is executed you get the same as the first. And I don't want the regular map because building the entire output in memory can be a waste of memory if you use larger ranges.
I want something that basically does something like
class cmap(object):
    """Lazy ``map`` over containers that can be iterated any number of times.

    The function and the source containers are stored; each iteration
    builds a fresh lazy mapping over them, so nothing is materialised.
    """

    def __init__(self, function, *iterators):
        self._function = function
        self._iterators = iterators

    def __iter__(self):
        # ``itertools.imap`` exists only on Python 2; on Python 3 the
        # built-in ``map`` is already lazy and takes its place.
        lazy_map = getattr(itertools, "imap", map)
        return lazy_map(self._function, *self._iterators)

    def __len__(self):
        # The mapping stops at the shortest input.
        return min(map(len, self._iterators))
But it would be a waste of time to do this manually for all itertools if someone already did this.
Do you think containers are more zen than iterators, since for an iterator something like
for i in iterator:
do something
implicitly empties the iterator while a container you explicitly need to remove elements.
You do not have to build such an object for each type of container. Basically, you have the following:
mkimap = lambda: imap(str,xrange(1,4))
Now you only need a nice wrapping object to prevent the "ugly" function calls. This could work this way:
class MultiIter(object):
def __init__(self, f, *a, **k):
if a or k:
self.create = lambda: f(*a, **k)
else: # optimize
self.create = f
def __iter__(self):
return self.create()
l = MultiIter(lambda: imap(str, xrange(1,4)))
# or
l = MultiIter(imap, str, xrange(1,4))
# or even
def l():
return imap(str, xrange(1,4))
# and then
print list(l)
print list(l)
(untested, hope it works, but you should get the idea)
For your 2nd question: Iterators and containers both have their uses. You should take whatever best fits your needs.
You may be looking for itertools.tee()
Iterators are my favorite topic ;)
try:
    from itertools import imap
except ImportError:  # Python 3: the built-in map is already lazy
    imap = map


class imap2(object):
    """Lazy ``imap`` that remembers its results so it can be iterated again.

    Values are pulled from the underlying iterator on demand and stored in
    ``lst``; later iterations replay the stored values first and only then
    continue with the underlying iterator.
    """

    def __init__(self, f, *args):
        self.g = imap(f, *args)
        self.lst = []
        self.done = False

    def __iter__(self):
        position = 0
        while True:
            if position < len(self.lst):
                # give the old values
                x = self.lst[position]
            else:
                if self.done:
                    return
                try:  # try to get something from g
                    x = next(self.g)
                except StopIteration:
                    # g was consumed for the first time
                    self.done = True
                    return
                self.lst.append(x)
            # Tracking a position per iteration keeps a pass complete even
            # if an earlier pass was abandoned half-way.
            position += 1
            yield x
print list(l)
print list(l)