UnicodeDecodeError: While reading a csv file using .csvreader - pandas

Here's my code:
import csv
path = "/home/Downloads/sample_email.csv"
with open(path) as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
print(row['first_name'], row['last_name'])
The error is:
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-47-d89ea200a227> in <module>
3 with open(path) as csvfile:
4 reader = csv.DictReader(csvfile)
----> 5 for i in reader:
6 print(i['first_name'], i['last_name'])
/usr/lib/python3.6/csv.py in __next__(self)
109 if self.line_num == 0:
110 # Used only for its side effect.
--> 111 self.fieldnames
112 row = next(self.reader)
113 self.line_num = self.reader.line_num
/usr/lib/python3.6/csv.py in fieldnames(self)
96 if self._fieldnames is None:
97 try:
---> 98 self._fieldnames = next(self.reader)
99 except StopIteration:
100 pass
/usr/lib/python3.6/codecs.py in decode(self, input, final)
319 # decode input (taking the buffer into account)
320 data = self.buffer + input
--> 321 (result, consumed) = self._buffer_decode(data, self.errors, final)
322 # keep undecoded input until the next call
323 self.buffer = data[consumed:]
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa0 in position 596: invalid start byte
When i want to convert this file to df, i have to use :
df = pd.read_csv(path, sep=',', engine = 'python')
Any help?
My aim is , extract the data, and a create a email template. Any guide about this matter is also appriciated.

Related

AttributeError: 'ArrayView' object has no attribute 'A1'

I have to import a processed h5ad file, but it seems that X has been passed as a numpy array instead of a numpy matrix. See below:
# Read the data
data_path = "/home/bbb5130/snOMICS/maria/msrna.h5ad"
adata = sn.pp.read_h5ad(data_path, pr_process="Yes")
adata
But the output was:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In [15], line 3
1 # Read the data
2 data_path = "/home/bbb5130/snOMICS/maria/msrna.h5ad"
----> 3 adata = sn.pp.read_h5ad(data_path, pr_process="Yes")
4 adata
File ~/miniconda3/envs/snOMICS/lib/python3.9/site-packages/scanet/preprocessing.py:54, in Preprocessing.read_h5ad(cls, filename, pr_process)
51 return sc.read_h5ad(filename)
52 else:
53 # initial preprocessing as it is required later
---> 54 return cls._intial(adata)
File ~/miniconda3/envs/snOMICS/lib/python3.9/site-packages/scanet/preprocessing.py:35, in Preprocessing._intial(adata)
33 adata.var['mt'] = adata.var_names.str.startswith('MT-')
34 mito_genes = adata.var_names.str.startswith('MT-')
---> 35 adata.obs['percent_mito'] = np.sum(adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
36 sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, inplace=True)
37 sc.pp.filter_cells(adata, min_genes=0)
AttributeError: 'ArrayView' object has no attribute 'A1'
Is there anyway I can change the format, so the file can be read?
Thanks in advance.

Why I can't loop xmltodict?

Ive'been trying to transform all my logs in a dict through xmltodict.parse function
The thing is, when I try to convert a single row to a variable it works fine
a = xmltodict.parse(df['CONFIG'][0])
Same to
parsed[1] = xmltodict.parse(df['CONFIG'][1])
But when I try to iterate the entire dataframe and store it on a dictionaire I get the following
for ind in df['CONFIG'].index:
parsed[ind] = xmltodict.parse(df['CONFIG'][ind])
---------------------------------------------------------------------------
ExpatError Traceback (most recent call last)
/tmp/ipykernel_31/1871123186.py in <module>
1 for ind in df['CONFIG'].index:
----> 2 parsed[ind] = xmltodict.parse(df['CONFIG'][ind])
/opt/conda/lib/python3.9/site-packages/xmltodict.py in parse(xml_input, encoding, expat, process_namespaces, namespace_separator, disable_entities, **kwargs)
325 parser.ParseFile(xml_input)
326 else:
--> 327 parser.Parse(xml_input, True)
328 return handler.item
329
ExpatError: syntax error: line 1, column 0
Can you try this?
for ind in range(len(df['CONFIG'])):
parsed[ind] = xmltodict.parse(df['CONFIG'][ind])

While removing html text from column, object of type 'float' has no len() error is occuring

I am using an amazon dataset to do sentiment analysis. Dataset content is
https://i.stack.imgur.com/qcKZp.png
dataset con be found on:
https://www.kaggle.com/PromptCloudHQ/amazon-reviews-unlocked-mobile-phones
I am trying to remove html from Review column.
This is what I am doing. Note: dataset is assigned to df.
df_removedNoise = []
def removingHTML(text):
soup = BeautifulSoup(text, 'lxml').get_text()
return soup
def removingNoise(text):
html_removed = removingHTML(text)
return html_removed
for i in df["Reviews"]:
text = removingNoise(i)
df_removedNoise.append(text)
Even though Reviews column has object as a datatype, I am still getting an error like.
TypeError Traceback (most recent call last)
<ipython-input-83-3591f5d7a54f> in <module>
9
10 for i in df["Reviews"]:
---> 11 df_removedNoise.append(removingNoise(i))
<ipython-input-83-3591f5d7a54f> in removingNoise(text)
5
6 def removingNoise(text):
----> 7 html_removed = removingHTML(text)
8 return html_removed
9
<ipython-input-83-3591f5d7a54f> in removingHTML(text)
1 df_removedNoise = []
2 def removingHTML(text):
----> 3 soup = BeautifulSoup(text, 'lxml').get_text()
4 return soup
5
~/anaconda3/lib/python3.7/site-packages/bs4/__init__.py in __init__(self, markup, features, builder, parse_only, from_encoding, exclude_encodings, **kwargs)
244 if hasattr(markup, 'read'): # It's a file-type object.
245 markup = markup.read()
--> 246 elif len(markup) <= 256 and (
247 (isinstance(markup, bytes) and not b'<' in markup)
248 or (isinstance(markup, str) and not '<' in markup)
TypeError: object of type 'float' has no len()
Any help will be appreciated!
Check for NaN with df[df['Reviews'].isnull()], if you find any try to dropna first

pyspark: creating a k-means clustering model using spark-ml with spark data frame

I am using the following code to create a clustering model:
import pandas as pd
pandas_df = pd.read_pickle('df_features.pickle')
spark_df = sqlContext.createDataFrame(pandas_df)
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans
kmeans = KMeans(k=2, seed=1.0)
modela = kmeans.fit(spark_df)
Then I got errors:
AnalysisException Traceback (most recent call last)
<ipython-input-26-00e1e2ba1983> in <module>()
3
4 kmeans = KMeans(k=2, seed=1.0)
----> 5 modela = kmeans.fit(spark_df)
/home/edamame/spark/spark-2.0.0-bin-hadoop2.6/python/pyspark/ml/base.pyc in fit(self, dataset, params)
62 return self.copy(params)._fit(dataset)
63 else:
---> 64 return self._fit(dataset)
65 else:
66 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
/home/edamame/spark/spark-2.0.0-bin-hadoop2.6/python/pyspark/ml/wrapper.pyc in _fit(self, dataset)
211
212 def _fit(self, dataset):
--> 213 java_model = self._fit_java(dataset)
214 return self._create_model(java_model)
215
/home/edamame/spark/spark-2.0.0-bin-hadoop2.6/python/pyspark/ml/wrapper.pyc in _fit_java(self, dataset)
208 """
209 self._transfer_params_to_java()
--> 210 return self._java_obj.fit(dataset._jdf)
211
212 def _fit(self, dataset):
/home/edamame/spark/spark-2.0.0-bin-hadoop2.6/python/lib/py4j-0.10.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
931 answer = self.gateway_client.send_command(command)
932 return_value = get_return_value(
--> 933 answer, self.gateway_client, self.target_id, self.name)
934
935 for temp_arg in temp_args:
/home/edamame/spark/spark-2.0.0-bin-hadoop2.6/python/pyspark/sql/utils.pyc in deco(*a, **kw)
67 e.java_exception.getStackTrace()))
68 if s.startswith('org.apache.spark.sql.AnalysisException: '):
---> 69 raise AnalysisException(s.split(': ', 1)[1], stackTrace)
70 if s.startswith('org.apache.spark.sql.catalyst.analysis'):
71 raise AnalysisException(s.split(': ', 1)[1], stackTrace)
AnalysisException: u"cannot resolve '`features`' given input columns: [field_1, field_2, field_3, field_4, field_5, field_6, field_7];"
Did I create the data frame wrong? Does anyone know what I missed? Thanks!
You need to use VectorAssembler
http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.VectorAssembler
from pyspark.ml.feature import VectorAssembler
vecAssembler = VectorAssembler(inputCols=spark_df.columns, outputCol="features")
vector_df = vecAssembler.transform(spark_df)
kmeans = KMeans().setK(n_clusters).setSeed(1)
model = kmeans.fit(vector_df )
For kmeans, it requires an rdd of DenseVectors. So you need to create a rdd of DenseVectors, where each vector corresponds to one row of your dataframe. So supposing that your dataframe has three columns you are feeding into the K Means model, I would refactor it to be along the lines of:
spark_rdd = spark_df.rdd.sortByKey()
modelInput = spark_rdd.map(lambda x: Vectors.dense(x[0],x[1],x[2])).sortByKey()
modelObject = Kmeans.train(modelInput,2)
Then if you want to get the results back from an RDD into a dataframe, I would do something like:
labels = modelInput.map(lambda x: model.predict(x))
results = labels.zip(spark_rdd)
resultFrame = results.map(lambda x: Row(Label = x[0], Column1 = x[0][1], Column2 = x[1][1],Column3 = x[1][2]).toDF()
data = [(Vectors.dense( [x[0], x[1]]),) for x in pandas_df.iloc[0:,2:4].values]
spark_df = spark.createDataFrame(data, ["features"])
kmeans = KMeans(k=2, seed=1.0)
modela = kmeans.fit(spark_df)
for more details refer to the official manual

trimming column named is generating ValueError

I have a table which I run through a function to trim its columns down to length 128 (I know it's really long, there isn't anything I can do about that) characters so it can use to_sql to create a database from it.
def truncate_column_names(df, length):
rename = {}
for col in df.columns:
if len(col) > length:
new_col = col[:length-3]+"..."
rename[col] = new_col
result = df.rename(columns=rename)
return result
This function works fine and I get a table out just fine but the problem comes when I tried to save the file I get the error
ValueError: Buffer has wrong number of dimensions (expected 1, got 2)
The method I have doing some housekeeping before saving to a file included dropping duplicates and that is where this error is being spit out. I tested this by saving the original dataFrame and then just loading it, running the truncate function, and then trying drop_duplicates on the result and I get the same error.
The headers for the file before I try truncating looks like this:
http://pastebin.com/WXmvwHDg
I trimmed the file down to 1 record and still have the problem.
This was a result of the truncating causing some columns to have non-unique names.
To confirm this was an issue I did a short test:
In [113]: df = pd.DataFrame(columns=["ab", "ac", "ad"])
In [114]: df
Out[114]:
Empty DataFrame
Columns: [ab, ac, ad]
Index: []
In [115]: df.drop_duplicates()
Out[115]:
Empty DataFrame
Columns: [ab, ac, ad]
Index: []
In [116]: df.columns
Out[116]: Index([u'ab', u'ac', u'ad'], dtype='object')
In [117]: df.columns = df.columns.str[:1]
In [118]: df
Out[118]:
Empty DataFrame
Columns: [a, a, a]
Index: []
In [119]: df.drop_duplicates()
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-119-daf275b6788b> in <module>()
----> 1 df.drop_duplicates()
C:\Miniconda\lib\site-packages\pandas\util\decorators.pyc in wrapper(*args, **kw
args)
86 else:
87 kwargs[new_arg_name] = new_arg_value
---> 88 return func(*args, **kwargs)
89 return wrapper
90 return _deprecate_kwarg
C:\Miniconda\lib\site-packages\pandas\core\frame.pyc in drop_duplicates(self, su
bset, take_last, inplace)
2826 deduplicated : DataFrame
2827 """
-> 2828 duplicated = self.duplicated(subset, take_last=take_last)
2829
2830 if inplace:
C:\Miniconda\lib\site-packages\pandas\util\decorators.pyc in wrapper(*args, **kw
args)
86 else:
87 kwargs[new_arg_name] = new_arg_value
---> 88 return func(*args, **kwargs)
89 return wrapper
90 return _deprecate_kwarg
C:\Miniconda\lib\site-packages\pandas\core\frame.pyc in duplicated(self, subset,
take_last)
2871
2872 vals = (self[col].values for col in subset)
-> 2873 labels, shape = map(list, zip( * map(f, vals)))
2874
2875 ids = get_group_index(labels, shape, sort=False, xnull=False)
C:\Miniconda\lib\site-packages\pandas\core\frame.pyc in f(vals)
2860
2861 def f(vals):
-> 2862 labels, shape = factorize(vals, size_hint=min(len(self), _SI
ZE_HINT_LIMIT))
2863 return labels.astype('i8',copy=False), len(shape)
2864
C:\Miniconda\lib\site-packages\pandas\core\algorithms.pyc in factorize(values, s
ort, order, na_sentinel, size_hint)
133 table = hash_klass(size_hint or len(vals))
134 uniques = vec_klass()
--> 135 labels = table.get_labels(vals, uniques, 0, na_sentinel)
136
137 labels = com._ensure_platform_int(labels)
pandas\hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_labels (pandas\ha
shtable.c:13946)()
ValueError: Buffer has wrong number of dimensions (expected 1, got 2)
and got the same result. using df.columns.unique() after the truncation i had ~200 duplicate columns after the truncation