Converting financial information to a pandas DataFrame

I am trying to get stock data such as the balance sheet, income statement and cash flow for multiple stocks and convert it to a DataFrame for manipulation.
Here is the data-fetching part of the code:
import yahoo_fin.stock_info as yfs

tickers = ['AMZN', 'AAPL', 'MSFT', 'DIS', 'GOOG']
balance_sheet = []
income_statement = []
cash_flow = []
balance_sheet.append({ticker: yfs.get_balance_sheet(ticker) for ticker in tickers})
income_statement.append({ticker: yfs.get_income_statement(ticker) for ticker in tickers})
cash_flow.append({ticker: yfs.get_cash_flow(ticker) for ticker in tickers})
This part works well and returns a dictionary for each category. I then do this:
my_dict = cash_flow + balance_sheet + income_statement
dff = pd.DataFrame.from_dict(my_dict, orient='columns', dtype=None, columns=None)
Note that when I try orient='index', I get the following error message:
AttributeError                            Traceback (most recent call last)
<ipython-input-...> in <module>
      1 my_dict=cash_flow+balance_sheet+income_statement
----> 2 dff=pd.DataFrame.from_dict(my_dict, orient='index', dtype=None, columns=None)
      3 # dff=dff.set_index('endDate')
      4 dff
      5 # cash_flow

/opt/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py in from_dict(cls, data, orient, dtype, columns)
   1361         if len(data) > 0:
   1362             # TODO speed up Series case
-> 1363             if isinstance(list(data.values())[0], (Series, dict)):
   1364                 data = _from_nested_dict(data)
   1365             else:

AttributeError: 'list' object has no attribute 'values'
If someone could let me know what I'm doing wrong, that would be much appreciated! :)
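Going by the traceback, the likely cause is that cash_flow, balance_sheet and income_statement are lists (each holding a single dict), so my_dict = cash_flow + balance_sheet + income_statement concatenates them into one list of three dicts, and DataFrame.from_dict, which expects a mapping, fails on my_dict.values(). A minimal sketch of one way around it, assuming each yahoo_fin call returns a per-ticker DataFrame:

import pandas as pd
import yahoo_fin.stock_info as yfs

tickers = ['AMZN', 'AAPL', 'MSFT', 'DIS', 'GOOG']

# Build plain dicts keyed by ticker -- no wrapping list, no .append()
balance_sheet = {t: yfs.get_balance_sheet(t) for t in tickers}
income_statement = {t: yfs.get_income_statement(t) for t in tickers}
cash_flow = {t: yfs.get_cash_flow(t) for t in tickers}

# pd.concat on a dict of DataFrames stacks them, with the ticker
# becoming the outer level of a MultiIndex
dff = pd.concat(cash_flow)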

Related

ufunc 'divide' not supported for the input types... error while trying to get the NumPy average

I am new to NumPy and I have been trying to get the average of an array I derived from another array.
This is the code that has been giving me the error: "ufunc 'divide' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''"
import numpy as np
import pandas as pd

cars = pd.read_csv('data/co2_emissions_canada.csv')
cars_makes = cars['Make'].to_numpy()
cars_models = cars['Model'].to_numpy()
cars_classes = cars['Vehicle Class'].to_numpy()
cars_engine_sizes = cars['Engine Size(L)'].to_numpy()
cars_cylinders = cars['Cylinders'].to_numpy()
cars_transmissions = cars['Transmission'].to_numpy()
cars_fuel_types = cars['Fuel Type'].to_numpy()
cars_fuel_consumption = cars['Fuel Consumption Comb (L/100 km)'].to_numpy()
cars_co2_emissions = cars['CO2 Emissions(g/km)'].to_numpy()
# the median of the cars_engine_sizes
print(np.median(cars_engine_sizes))
# the average fuel consumption for regular gasoline (Fuel Type = X), premium gasoline (Z), ethanol (E), and diesel (D)?
fuel_typesx=np.array(cars_fuel_types[cars_fuel_types=='X'])
print(np.average(fuel_typesx))
fuel_typesz=np.array(cars_fuel_types[cars_fuel_types=='Z'])
print(np.average(fuel_typesz))
fuel_typese=np.array(cars_fuel_types[cars_fuel_types=='E'])
print(np.average(fuel_typese))
Please, what am I missing?
I'm guessing the FULL error message looks something like this:
In [753]: np.average(np.array(['A','B','C','A'],dtype=object))
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Input In [753], in <cell line: 1>()
----> 1 np.average(np.array(['A','B','C','A'],dtype=object))
File <__array_function__ internals>:5, in average(*args, **kwargs)
File ~\anaconda3\lib\site-packages\numpy\lib\function_base.py:380, in average(a, axis, weights, returned)
377 a = np.asanyarray(a)
379 if weights is None:
--> 380 avg = a.mean(axis)
381 scl = avg.dtype.type(a.size/avg.size)
382 else:
File ~\anaconda3\lib\site-packages\numpy\core\_methods.py:191, in _mean(a, axis, dtype, out, keepdims, where)
189 ret = ret.dtype.type(ret / rcount)
190 else:
--> 191 ret = ret / rcount
193 return ret
TypeError: ufunc 'true_divide' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''
cars_fuel_types comes from a dataframe and evidently contains strings like 'E', so it is object dtype. Even if you select like values, you can't take an 'average' of them.
average takes the sum of the values and divides by the count; sum for Python strings is concatenation, not some sort of math.
In [754]: np.sum(np.array(['A','B','C','A'],dtype=object))
Out[754]: 'ABCA'
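If the goal is the average fuel consumption per fuel type (rather than an average of the fuel-type labels themselves), one approach is to mask the numeric consumption column with the fuel-type column. A sketch, reusing the arrays defined in the question:

# Average the numeric consumption values for each fuel type;
# the boolean mask selects the matching rows
for fuel in ['X', 'Z', 'E', 'D']:
    mask = cars_fuel_types == fuel
    # astype(float) is a precaution in case the column was read as object dtype
    print(fuel, np.average(cars_fuel_consumption[mask].astype(float)))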

ValueError: Must pass 2-d input when creating a pandas dataframe using for loop

The neg_ctl_df dataframe contains the negative controls and the coding_gene_df contains my genes of interest.
I want to create a dataframe norm_df that stores the normalized output.
import pandas as pd

# Median of the NEGATIVE controls
neg_ctl_median = neg_ctl_df.iloc[:,-29:].median()
# Median of the POSITIVE controls
pos_ctl_median = pos_ctl_df.iloc[:,-29:].median()
# Mean of the PROBESET controls
probeset_norm = qnorm.quantile_normalize(probe_ctl_df.iloc[:,-29:], axis=1)

# Normalize the samples
i = []
for gene, sample in coding_gene_df.iloc[:,-29:].astype(float).iterrows():
    norm_val = sample - neg_ctl_median    # subtract the median of the NEGATIVE controls within the patient sample
    norm_val = norm_val / pos_ctl_median  # divide by the median of the POSITIVE controls (the sample value has already been normalized against the negative controls)
    norm_val = norm_val / probeset_norm   # probeset normalization (quantile normalization)
    i.append(norm_val)
pd.DataFrame(i)
Traceback:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-63-85108013a3d8> in <module>()
15 norm_val = norm_val / probeset_norm # Probeset normalization (quantile normalization)
16 i.append(norm_val)
---> 17 pd.DataFrame(i)
/usr/local/lib/python3.7/dist-packages/pandas/core/internals/construction.py in _prep_ndarray(values, copy)
553 values = values.reshape((values.shape[0], 1))
554 elif values.ndim != 2:
--> 555 raise ValueError(f"Must pass 2-d input. shape={values.shape}")
556
557 return values
ValueError: Must pass 2-d input. shape=(1, 102, 29)
Samples:
coding_gene_df.iloc[1:10,-29:-27].to_dict()
{'12h_P1_T4_TimeC2_PIDC4_Non-Survivor': {'CNTN2': '6.35',
'KCNA2': '5.29',
'LOC79160': '5.99',
'PTGIS': '5.66',
'TTTY11': '3.91',
'VPS4B': '9.68',
'XRCC1': '9.09',
'ZC3HC1': '7.19',
'ZFAS1': '8.68'},
'48h_P1_T6_TimeC3_PIDC1_Non-Survivor': {'CNTN2': '6.6',
'KCNA2': '5.36',
'LOC79160': '6.18',
'PTGIS': '5.54',
'TTTY11': '3.92',
'VPS4B': '9.51',
'XRCC1': '9.15',
'ZC3HC1': '7.05',
'ZFAS1': '8.46'}}
Negative controls:
neg_ctl_df.iloc[1:10,-29:-27].to_dict()
{'12h_P1_T4_TimeC2_PIDC4_Non-Survivor': {'---': '8.45'},
'48h_P1_T6_TimeC3_PIDC1_Non-Survivor': {'---': '8.16'}}
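A likely reading of shape=(1, 102, 29) in the traceback: dividing the 1-D sample Series by the 2-D probeset_norm DataFrame broadcasts the result up to a DataFrame, so each norm_val appended to i is itself 2-D, and pd.DataFrame(i) then sees a 3-D block. A sketch of one possible repair, assuming the per-row results are meant to be stacked:

# Concatenate the 2-D pieces instead of wrapping the list in a new frame
norm_df = pd.concat(i)

If each norm_val was instead intended to stay 1-D, the division by probeset_norm would need to use a matching Series (e.g. a single row or column of it) rather than the whole DataFrame.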

How do I filter by date and count these dates using relational algebra (Reframe)?

I'm really stuck. I read the Reframe documentation (https://reframe.readthedocs.io/en/latest/), which is based on pandas, and I tried multiple things on my own, but it still doesn't work. I have a CSV file called weddings that looks like this:
Weddings, Date
Wedding1,20181107
Wedding2,20181107
And many more rows. As you can see, there are duplicates in the date column, but I don't think that matters. I want to count the number of weddings filtered by date, for example the number of weddings after 5 October 2016 (20161005). So first I tried this:
Weddings = Relation('weddings.csv')
Weddings.sort(['Date']).query('Date > 20161005').project(['Weddings', 'Date'])
This seems logical to me, but I get a KeyError: 'Date' and don't know why. So I tried something simpler:
Weddings = Relation('weddings.csv')
Weddings.groupby(['Date']).count()
And this doesn't work either; I still get a KeyError: 'Date'. Can someone help me?
Traceback:
KeyError Traceback (most recent call last)
<ipython-input-44-b358cdf55fdb> in <module>()
1
2 Weddings = Relation('weddings.csv')
----> 3 weddings.sort(['Date']).query('Date > 20161005').project(['Weddings', 'Date'])
4
5
~\Documents\Reframe.py in sort(self, *args, **kwargs)
110 """
111
--> 112 return Relation(super().sort_values(*args, **kwargs))
113
114 def intersect(self, other):
~\Anaconda3\lib\site-packages\pandas\core\frame.py in sort_values(self, by, axis, ascending, inplace, kind, na_position)
4416 by = by[0]
4417 k = self._get_label_or_level_values(by, axis=axis,
-> 4418 stacklevel=stacklevel)
4419
4420 if isinstance(ascending, (tuple, list)):
~\Anaconda3\lib\site-packages\pandas\core\generic.py in _get_label_or_level_values(self, key, axis, stacklevel)
1377 values = self.axes[axis].get_level_values(key)._values
1378 else:
-> 1379 raise KeyError(key)
1380
1381 # Check for duplicates
KeyError: 'Date'
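One likely culprit, given the header row shown (Weddings, Date): the space after the comma means the second column is parsed as ' Date', with a leading space, which would produce exactly this KeyError. A sketch of a check and fix, assuming Relation behaves like a DataFrame subclass (which the super().sort_values call in the traceback suggests):

Weddings = Relation('weddings.csv')
print(list(Weddings.columns))  # inspect the actual parsed column names

# Strip stray whitespace from the headers so 'Date' matches
Weddings.columns = [c.strip() for c in Weddings.columns]
Weddings.query('Date > 20161005').project(['Weddings', 'Date'])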

TypeError: scatter() got multiple values for argument 'c'

I am trying to do hierarchical clustering on my MFCC array 'signal_mfcc', which is an ndarray with dimensions (198, 12): 198 audio frames/observations and 12 coefficients/dimensions.
I am using an arbitrary threshold of 250 with 'distance' as the criterion, as shown below:
thresh = 250
print(signal_mfcc.shape)
clusters = hcluster.fclusterdata(signal_mfcc, thresh, criterion="distance")
With the specified threshold, the output variable 'clusters' is a sequence [1 1 1 ... 1] of length 198, i.e. shape (198,), which I assume assigns all the data to a single cluster.
Then, I am using pyplot to plot scatter() with the following code:
# plotting
print(*(signal_mfcc.T).shape)
plt.scatter(*np.transpose(signal_mfcc), c=clusters)
plt.axis("equal")
title = "threshold: %f, number of clusters: %d" % (thresh) len(set(clusters)))
plt.title(title)
plt.show()
The output is:
plt.scatter(*np.transpose(signal_mfcc), c=clusters)
TypeError: scatter() got multiple values for argument 'c'
The scatter plot will not show. Any clues as to what may have gone wrong?
Thanks in advance!
From this SO thread, you can see why you have this error.
From the scatter documentation, c is the 2nd optional argument and the 4th argument overall. The error means that unpacking np.transpose(signal_mfcc) supplies at least four positional arguments (here, twelve), so one of them lands in c's position. Since you also pass c by keyword, c receives two values and Python cannot choose which one is correct.
Example:
def temp(n, c=0):
    pass

temp(*[1, 2], c=1)
# Traceback (most recent call last):
# File "<stdin>", line 1, in <module>
# TypeError: temp() got multiple values for argument 'c'
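Applied to the question's code: np.transpose(signal_mfcc) has shape (12, 198), so the star unpacks twelve positional arguments into scatter. A sketch that plots just the first two MFCC coefficients instead:

# Pass exactly two positional arrays (x and y), so c is
# supplied only once, via the keyword
plt.scatter(signal_mfcc[:, 0], signal_mfcc[:, 1], c=clusters)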

DataFrame.ix() in pandas - is there an option to catch situations when requested columns do not exist?

My code reads a CSV file into a pandas DataFrame and processes it.
The code relies on column names and uses df.ix[,] to get the columns.
Recently some column names in the CSV file were changed (without notice).
But the code was not complaining and was silently producing wrong results.
The ix[,] construct doesn't check whether a column exists.
If it doesn't, it simply creates it and populates it with NaN.
Here is the main idea of what was going on.
df1=DataFrame({'a':[1,2,3],'b':[4,5,6]}) # columns 'a' & 'b'
df2=df1.ix[:,['a','c']] # trying to get 'a' & 'c'
print df2
   a   c
0  1 NaN
1  2 NaN
2  3 NaN
So it doesn't produce an error or a warning.
Is there an alternative way to select specific columns with extra check that columns exist?
My current workaround is to use my own small utility function, something like this:
import sys, inspect

def validate_cols_or_exit(df, cols):
    """
    Exits with an error message if the pandas DataFrame object df
    doesn't have all columns from the provided list of columns.
    Example of usage:
        validate_cols_or_exit(mydf, ['col1', 'col2'])
    """
    dfcols = list(df.columns)
    valid_flag = True
    for c in cols:
        if c not in dfcols:
            print "Error, non-existent DataFrame column found - ", c
            valid_flag = False
    if not valid_flag:
        print "Error, non-existent DataFrame column(s) found in function ", inspect.stack()[1][3]
        print "valid column names are:"
        print "\n".join(df.columns)
        sys.exit(1)
How about:
In [3]: df1[['a', 'c']]
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/home/wesm/code/pandas/<ipython-input-3-2349e89f1bb5> in <module>()
----> 1 df1[['a', 'c']]
/home/wesm/code/pandas/pandas/core/frame.py in __getitem__(self, key)
1582 if com._is_bool_indexer(key):
1583 key = np.asarray(key, dtype=bool)
-> 1584 return self._getitem_array(key)
1585 elif isinstance(self.columns, MultiIndex):
1586 return self._getitem_multilevel(key)
/home/wesm/code/pandas/pandas/core/frame.py in _getitem_array(self, key)
1609 mask = indexer == -1
1610 if mask.any():
-> 1611 raise KeyError("No column(s) named: %s" % str(key[mask]))
1612 result = self.reindex(columns=key)
1613 if result.columns.name is None:
KeyError: 'No column(s) named: [c]'
Not sure you can constrain a DataFrame, but your helper function could be a lot simpler, something like:
mismatch = set(cols).difference(set(dfcols))
if mismatch:
    raise SystemExit('Unknown column(s): {}'.format(','.join(mismatch)))
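For completeness, a self-contained version of that check (validate_cols is a hypothetical name, not part of pandas):

def validate_cols(df, cols):
    # Columns that were requested but are not in the frame
    mismatch = set(cols).difference(df.columns)
    if mismatch:
        raise SystemExit('Unknown column(s): {}'.format(','.join(sorted(mismatch))))

validate_cols(df1, ['a', 'c'])  # SystemExit: Unknown column(s): c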