Why I can't loop xmltodict? - pandas

Ive'been trying to transform all my logs in a dict through xmltodict.parse function
The thing is, when I try to convert a single row to a variable it works fine
a = xmltodict.parse(df['CONFIG'][0])
Same to
parsed[1] = xmltodict.parse(df['CONFIG'][1])
But when I try to iterate the entire dataframe and store it on a dictionaire I get the following
for ind in df['CONFIG'].index:
parsed[ind] = xmltodict.parse(df['CONFIG'][ind])
---------------------------------------------------------------------------
ExpatError Traceback (most recent call last)
/tmp/ipykernel_31/1871123186.py in <module>
1 for ind in df['CONFIG'].index:
----> 2 parsed[ind] = xmltodict.parse(df['CONFIG'][ind])
/opt/conda/lib/python3.9/site-packages/xmltodict.py in parse(xml_input, encoding, expat, process_namespaces, namespace_separator, disable_entities, **kwargs)
325 parser.ParseFile(xml_input)
326 else:
--> 327 parser.Parse(xml_input, True)
328 return handler.item
329
ExpatError: syntax error: line 1, column 0

Can you try this?
for ind in range(len(df['CONFIG'])):
parsed[ind] = xmltodict.parse(df['CONFIG'][ind])

Related

AttributeError: 'ArrayView' object has no attribute 'A1'

I have to import a processed h5ad file, but it seems that X has been passed as a numpy array instead of a numpy matrix. See below:
# Read the data
data_path = "/home/bbb5130/snOMICS/maria/msrna.h5ad"
adata = sn.pp.read_h5ad(data_path, pr_process="Yes")
adata
But the output was:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In [15], line 3
1 # Read the data
2 data_path = "/home/bbb5130/snOMICS/maria/msrna.h5ad"
----> 3 adata = sn.pp.read_h5ad(data_path, pr_process="Yes")
4 adata
File ~/miniconda3/envs/snOMICS/lib/python3.9/site-packages/scanet/preprocessing.py:54, in Preprocessing.read_h5ad(cls, filename, pr_process)
51 return sc.read_h5ad(filename)
52 else:
53 # initial preprocessing as it is required later
---> 54 return cls._intial(adata)
File ~/miniconda3/envs/snOMICS/lib/python3.9/site-packages/scanet/preprocessing.py:35, in Preprocessing._intial(adata)
33 adata.var['mt'] = adata.var_names.str.startswith('MT-')
34 mito_genes = adata.var_names.str.startswith('MT-')
---> 35 adata.obs['percent_mito'] = np.sum(adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
36 sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, inplace=True)
37 sc.pp.filter_cells(adata, min_genes=0)
AttributeError: 'ArrayView' object has no attribute 'A1'
Is there anyway I can change the format, so the file can be read?
Thanks in advance.

TypeError: Wrong number or type of arguments for overloaded function 'new_Date'

I am new to python. I am getting an error when running below code. The issue seems to be with date. can someone help me to correct i please. I have tried changing the date format in the excel but it does not solve the issue. The excel have a list of several bonds. I want to generate the coupon dates of the different bonds
BondData = pd.read_excel (r'C:\Users\Avishen\Desktop\Python\BONDDATA.xlsx')
Data = pd.DataFrame(BondData)
def scheduledates():
tenor = ql.Period(ql.Semiannual)
day_count = ql.Thirty360
calendar = ql.UnitedStates()
businessConvention = ql.Unadjusted
dateGeneration = ql.DateGeneration.Backward
monthEnd = False
# Dates in Bond Period
return ql.Schedule (issueDate, maturityDate, tenor, calendar, businessConvention,
businessConvention , dateGeneration, monthEnd)
new_df["Dates"]= Data.apply(lambda x: scheduledates(),axis = 1)
new_df["ISIN"] = Data.ISIN
new_df
Error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-4-877415e9cf83> in <module>
21 businessConvention , dateGeneration, monthEnd)
22
---> 23 new_df["Dates"]= Data.apply(lambda x: scheduledates(),axis = 1)
24 new_df["ISIN"] = Data.ISIN
25 new_df
~\anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
7546 kwds=kwds,
7547 )
-> 7548 return op.get_result()
7549
7550 def applymap(self, func) -> "DataFrame":
~\anaconda3\lib\site-packages\pandas\core\apply.py in get_result(self)
178 return self.apply_raw()
179
--> 180 return self.apply_standard()
181
182 def apply_empty_result(self):
~\anaconda3\lib\site-packages\pandas\core\apply.py in apply_standard(self)
269
270 def apply_standard(self):
--> 271 results, res_index = self.apply_series_generator()
272
273 # wrap results
~\anaconda3\lib\site-packages\pandas\core\apply.py in apply_series_generator(self)
298 for i, v in enumerate(series_gen):
299 # ignore SettingWithCopy here in case the user mutates
--> 300 results[i] = self.f(v)
301 if isinstance(results[i], ABCSeries):
302 # If we have a view on v, we need to make a copy because
<ipython-input-4-877415e9cf83> in <lambda>(x)
21 businessConvention , dateGeneration, monthEnd)
22
---> 23 new_df["Dates"]= Data.apply(lambda x: scheduledates(),axis = 1)
24 new_df["ISIN"] = Data.ISIN
25 new_df
<ipython-input-4-877415e9cf83> in scheduledates()
8
9 def scheduledates():
---> 10 issueDate = ql.Date(Data.issuedate)
11 maturityDate = ql.Date(Data.maturitydate)
12 tenor = ql.Period(ql.Semiannual)
~\anaconda3\lib\site-packages\QuantLib\QuantLib.py in __init__(self, *args)
425
426 def __init__(self, *args):
--> 427 _QuantLib.Date_swiginit(self, _QuantLib.new_Date(*args))
428
429 def weekdayNumber(self):
TypeError: Wrong number or type of arguments for overloaded function 'new_Date'.
Possible C/C++ prototypes are:
Date::Date()
Date::Date(Day,Month,Year)
Date::Date(Day,Month,Year,Hour,Minute,Second,Millisecond,Microsecond)
Date::Date(Day,Month,Year,Hour,Minute,Second,Millisecond)
Date::Date(Day,Month,Year,Hour,Minute,Second)
Date::Date(BigInteger)
Date::Date(std::string const &,std::string)
---------------------------------------------------------------------------
Data = pd.DataFrame(BondData)
Fields from Bond Data
ISIN
issuedate
maturitydate
coupon
Tradeyield
Bond_Price
MarketPrice
Nominal_Amount
From the traceback, the problem is the line:
issueDate = ql.Date(Data.issuedate)
(which for some reason is not in the code you pasted). Coming from Excel, issuedate should be an integer and thus compatible with the ql.Date constructor, but it's possible that pandas is reading it as a string or some other type. You should examine the data frame and check the type of the column. If it's not what you expect, you'll have to figure out if there are data in that column that pandas can't interpret as integers, and either clean them up of force the conversion somehow before passing them to ql.Date.

While removing html text from column, object of type 'float' has no len() error is occuring

I am using an amazon dataset to do sentiment analysis. Dataset content is
https://i.stack.imgur.com/qcKZp.png
dataset con be found on:
https://www.kaggle.com/PromptCloudHQ/amazon-reviews-unlocked-mobile-phones
I am trying to remove html from Review column.
This is what I am doing. Note: dataset is assigned to df.
df_removedNoise = []
def removingHTML(text):
soup = BeautifulSoup(text, 'lxml').get_text()
return soup
def removingNoise(text):
html_removed = removingHTML(text)
return html_removed
for i in df["Reviews"]:
text = removingNoise(i)
df_removedNoise.append(text)
Even though Reviews column has object as a datatype, I am still getting an error like.
TypeError Traceback (most recent call last)
<ipython-input-83-3591f5d7a54f> in <module>
9
10 for i in df["Reviews"]:
---> 11 df_removedNoise.append(removingNoise(i))
<ipython-input-83-3591f5d7a54f> in removingNoise(text)
5
6 def removingNoise(text):
----> 7 html_removed = removingHTML(text)
8 return html_removed
9
<ipython-input-83-3591f5d7a54f> in removingHTML(text)
1 df_removedNoise = []
2 def removingHTML(text):
----> 3 soup = BeautifulSoup(text, 'lxml').get_text()
4 return soup
5
~/anaconda3/lib/python3.7/site-packages/bs4/__init__.py in __init__(self, markup, features, builder, parse_only, from_encoding, exclude_encodings, **kwargs)
244 if hasattr(markup, 'read'): # It's a file-type object.
245 markup = markup.read()
--> 246 elif len(markup) <= 256 and (
247 (isinstance(markup, bytes) and not b'<' in markup)
248 or (isinstance(markup, str) and not '<' in markup)
TypeError: object of type 'float' has no len()
Any help will be appreciated!
Check for NaN with df[df['Reviews'].isnull()], if you find any try to dropna first

How to get rid of "AttributeError: 'float' object has no attribute 'log2' "

Say I have a data frame with columns of min value =36884326.0, and max value =6619162563.0, which I need to plot as box plot, so I tried to log transform the values, as follows,
diff["values"] = diff['value'].apply(lambda x: (x+1))
diff["log_values"] = diff['values'].apply(lambda x: x.log2(x))
However, the above lines are throwing the error as follows,
AttributeError Traceback (most recent call last)
<ipython-input-28-fe4e1d2286b0> in <module>
1 diff['value'].max()
2 diff["values"] = diff['value'].apply(lambda x: (x+1))
----> 3 diff["log_values"] = diff['values'].apply(lambda x: x.log2(x))
~/software/anaconda/lib/python3.7/site-packages/pandas/core/series.py in apply(self, func, convert_dtype, args, **kwds)
3192 else:
3193 values = self.astype(object).values
-> 3194 mapped = lib.map_infer(values, f, convert=convert_dtype)
3195
3196 if len(mapped) and isinstance(mapped[0], Series):
pandas/_libs/src/inference.pyx in pandas._libs.lib.map_infer()
<ipython-input-28-fe4e1d2286b0> in <lambda>(x)
1 diff['value'].max()
2 diff["values"] = diff['value'].apply(lambda x: (x+1))
----> 3 diff["log_values"] = diff['values'].apply(lambda x: x.log2(x))
AttributeError: 'float' object has no attribute 'log2'
Any suggestions would be great. Thanks
You need numpy.log2 function to aplly, please, check sintaxis here.

trimming column named is generating ValueError

I have a table which I run through a function to trim its columns down to length 128 (I know it's really long, there isn't anything I can do about that) characters so it can use to_sql to create a database from it.
def truncate_column_names(df, length):
rename = {}
for col in df.columns:
if len(col) > length:
new_col = col[:length-3]+"..."
rename[col] = new_col
result = df.rename(columns=rename)
return result
This function works fine and I get a table out just fine but the problem comes when I tried to save the file I get the error
ValueError: Buffer has wrong number of dimensions (expected 1, got 2)
The method I have doing some housekeeping before saving to a file included dropping duplicates and that is where this error is being spit out. I tested this by saving the original dataFrame and then just loading it, running the truncate function, and then trying drop_duplicates on the result and I get the same error.
The headers for the file before I try truncating looks like this:
http://pastebin.com/WXmvwHDg
I trimmed the file down to 1 record and still have the problem.
This was a result of the truncating causing some columns to have non-unique names.
To confirm this was an issue I did a short test:
In [113]: df = pd.DataFrame(columns=["ab", "ac", "ad"])
In [114]: df
Out[114]:
Empty DataFrame
Columns: [ab, ac, ad]
Index: []
In [115]: df.drop_duplicates()
Out[115]:
Empty DataFrame
Columns: [ab, ac, ad]
Index: []
In [116]: df.columns
Out[116]: Index([u'ab', u'ac', u'ad'], dtype='object')
In [117]: df.columns = df.columns.str[:1]
In [118]: df
Out[118]:
Empty DataFrame
Columns: [a, a, a]
Index: []
In [119]: df.drop_duplicates()
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-119-daf275b6788b> in <module>()
----> 1 df.drop_duplicates()
C:\Miniconda\lib\site-packages\pandas\util\decorators.pyc in wrapper(*args, **kw
args)
86 else:
87 kwargs[new_arg_name] = new_arg_value
---> 88 return func(*args, **kwargs)
89 return wrapper
90 return _deprecate_kwarg
C:\Miniconda\lib\site-packages\pandas\core\frame.pyc in drop_duplicates(self, su
bset, take_last, inplace)
2826 deduplicated : DataFrame
2827 """
-> 2828 duplicated = self.duplicated(subset, take_last=take_last)
2829
2830 if inplace:
C:\Miniconda\lib\site-packages\pandas\util\decorators.pyc in wrapper(*args, **kw
args)
86 else:
87 kwargs[new_arg_name] = new_arg_value
---> 88 return func(*args, **kwargs)
89 return wrapper
90 return _deprecate_kwarg
C:\Miniconda\lib\site-packages\pandas\core\frame.pyc in duplicated(self, subset,
take_last)
2871
2872 vals = (self[col].values for col in subset)
-> 2873 labels, shape = map(list, zip( * map(f, vals)))
2874
2875 ids = get_group_index(labels, shape, sort=False, xnull=False)
C:\Miniconda\lib\site-packages\pandas\core\frame.pyc in f(vals)
2860
2861 def f(vals):
-> 2862 labels, shape = factorize(vals, size_hint=min(len(self), _SI
ZE_HINT_LIMIT))
2863 return labels.astype('i8',copy=False), len(shape)
2864
C:\Miniconda\lib\site-packages\pandas\core\algorithms.pyc in factorize(values, s
ort, order, na_sentinel, size_hint)
133 table = hash_klass(size_hint or len(vals))
134 uniques = vec_klass()
--> 135 labels = table.get_labels(vals, uniques, 0, na_sentinel)
136
137 labels = com._ensure_platform_int(labels)
pandas\hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_labels (pandas\ha
shtable.c:13946)()
ValueError: Buffer has wrong number of dimensions (expected 1, got 2)
and got the same result. using df.columns.unique() after the truncation i had ~200 duplicate columns after the truncation