IPython Notebook: What is the default encoding? - pandas

I have created a package that uses UTF-8 encoding.
When I call one of its functions, it returns a DataFrame with a column encoded in UTF-8.
When using IPython at the command line, I have no problem displaying the contents of this table. When using the Notebook, it crashes with the error 'utf8' codec can't decode byte 0xe7. I've attached the full traceback below.
What is the proper encoding to work with the Notebook?
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-13-92c0011919e7> in <module>()
3 ver = verif.VerificacaoNA()
4 comp, total = ver.executarCompRealFisica(DT_INI, DT_FIN)
----> 5 comp
c:\Python27-32\lib\site-packages\ipython-0.13.1-py2.7.egg\IPython\core\displayhook.pyc in __call__(self, result)
240 self.update_user_ns(result)
241 self.log_output(format_dict)
--> 242 self.finish_displayhook()
243
244 def flush(self):
c:\Python27-32\lib\site-packages\ipython-0.13.1-py2.7.egg\IPython\zmq\displayhook.pyc in finish_displayhook(self)
59 sys.stdout.flush()
60 sys.stderr.flush()
---> 61 self.session.send(self.pub_socket, self.msg, ident=self.topic)
62 self.msg = None
63
c:\Python27-32\lib\site-packages\ipython-0.13.1-py2.7.egg\IPython\zmq\session.pyc in send(self, stream, msg_or_type, content, parent, ident, buffers, subheader, track, header)
557
558 buffers = [] if buffers is None else buffers
--> 559 to_send = self.serialize(msg, ident)
560 flag = 0
561 if buffers:
c:\Python27-32\lib\site-packages\ipython-0.13.1-py2.7.egg\IPython\zmq\session.pyc in serialize(self, msg, ident)
461 content = self.none
462 elif isinstance(content, dict):
--> 463 content = self.pack(content)
464 elif isinstance(content, bytes):
465 # content is already packed, as in a relayed message
c:\Python27-32\lib\site-packages\ipython-0.13.1-py2.7.egg\IPython\zmq\session.pyc in <lambda>(obj)
76
77 # ISO8601-ify datetime objects
---> 78 json_packer = lambda obj: jsonapi.dumps(obj, default=date_default)
79 json_unpacker = lambda s: extract_dates(jsonapi.loads(s))
80
c:\Python27-32\lib\site-packages\pyzmq-13.0.0-py2.7-win32.egg\zmq\utils\jsonapi.pyc in dumps(o, **kwargs)
70 kwargs['separators'] = (',', ':')
71
---> 72 return _squash_unicode(jsonmod.dumps(o, **kwargs))
73
74 def loads(s, **kwargs):
c:\Python27-32\lib\json\__init__.pyc in dumps(obj, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, encoding, default, **kw)
236 check_circular=check_circular, allow_nan=allow_nan, indent=indent,
237 separators=separators, encoding=encoding, default=default,
--> 238 **kw).encode(obj)
239
240
c:\Python27-32\lib\json\encoder.pyc in encode(self, o)
199 # exceptions aren't as detailed. The list call should be roughly
200 # equivalent to the PySequence_Fast that ''.join() would do.
--> 201 chunks = self.iterencode(o, _one_shot=True)
202 if not isinstance(chunks, (list, tuple)):
203 chunks = list(chunks)
c:\Python27-32\lib\json\encoder.pyc in iterencode(self, o, _one_shot)
262 self.key_separator, self.item_separator, self.sort_keys,
263 self.skipkeys, _one_shot)
--> 264 return _iterencode(o, 0)
265
266 def _make_iterencode(markers, _default, _encoder, _indent, _floatstr,
UnicodeDecodeError: 'utf8' codec can't decode byte 0xe7 in position 199: invalid continuation byte

I had the same problem recently, and indeed setting the default encoding to UTF-8 did the trick:
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
Running sys.getdefaultencoding() yielded 'ascii' on my environment (Python 2.7.3), so I guess that's the default.
Also see this related question and Ian Bicking's blog post on the subject.
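For reference, a minimal sketch (Python 2 only) of checking the default encoding before and after the workaround; sys.setdefaultencoding is removed from the sys module by site.py at startup, which is why the reload(sys) call is needed to get it back:
import sys
print(sys.getdefaultencoding())   # typically 'ascii' on Python 2

reload(sys)                       # restores sys.setdefaultencoding, hidden by site.py
sys.setdefaultencoding("utf-8")
print(sys.getdefaultencoding())   # now 'utf-8'
It is safest to run this at the very top of the notebook, before importing the package that produces the DataFrame.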

Related

s3fs FileNotFoundError

I am only able to gain limited, top-level access to my AWS S3 storage: I can see the buckets, but not their contents, neither subfolders nor files. I'm running everything from inside a conda environment. I've tried accessing files in both private and public buckets without success. What am I doing wrong?
This block of code works as expected
>>> import s3fs
>>> AKEY = 'XXXX'
>>> SKEY = 'XXXX'
>>> fs = s3fs.S3FileSystem(key=AKEY,secret=SKEY)
>>> fs.ls('s3://')
['my-bucket-1',
'my-bucket-2',
'my-bucket-3']
This block doesn't
>>> fs.ls('s3://my-bucket-1')
[]
What I expect:
>>> fs.ls('s3://my-bucket-1')
['my-bucket-1/test.txt',
'my-bucket-1/test.csv']
When I try to open a file, I get a FileNotFoundError:
import pandas as pd
pd.read_csv(
's3://my-bucket-1/test.csv',
storage_options={'key':AKEY,'secret':SKEY}
)
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[8], line 2
1 import pandas as pd
----> 2 pd.read_csv(
3 's3://my-bucket-1/test.csv',
4 storage_options={'key':AKEY,'secret':SKEY}
5 )
File ~\anaconda3\envs\env-2\lib\site-packages\pandas\util\_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs)
209 else:
210 kwargs[new_arg_name] = new_arg_value
--> 211 return func(*args, **kwargs)
File ~\anaconda3\envs\env-2\lib\site-packages\pandas\util\_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
325 if len(args) > num_allow_args:
326 warnings.warn(
327 msg.format(arguments=_format_argument_list(allow_args)),
328 FutureWarning,
329 stacklevel=find_stack_level(),
330 )
--> 331 return func(*args, **kwargs)
File ~\anaconda3\envs\env-2\lib\site-packages\pandas\io\parsers\readers.py:950, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
935 kwds_defaults = _refine_defaults_read(
936 dialect,
937 delimiter,
(...)
946 defaults={"delimiter": ","},
947 )
948 kwds.update(kwds_defaults)
--> 950 return _read(filepath_or_buffer, kwds)
File ~\anaconda3\envs\env-2\lib\site-packages\pandas\io\parsers\readers.py:605, in _read(filepath_or_buffer, kwds)
602 _validate_names(kwds.get("names", None))
604 # Create the parser.
--> 605 parser = TextFileReader(filepath_or_buffer, **kwds)
607 if chunksize or iterator:
608 return parser
File ~\anaconda3\envs\env-2\lib\site-packages\pandas\io\parsers\readers.py:1442, in TextFileReader.__init__(self, f, engine, **kwds)
1439 self.options["has_index_names"] = kwds["has_index_names"]
1441 self.handles: IOHandles | None = None
-> 1442 self._engine = self._make_engine(f, self.engine)
File ~\anaconda3\envs\env-2\lib\site-packages\pandas\io\parsers\readers.py:1735, in TextFileReader._make_engine(self, f, engine)
1733 if "b" not in mode:
1734 mode += "b"
-> 1735 self.handles = get_handle(
1736 f,
1737 mode,
1738 encoding=self.options.get("encoding", None),
1739 compression=self.options.get("compression", None),
1740 memory_map=self.options.get("memory_map", False),
1741 is_text=is_text,
1742 errors=self.options.get("encoding_errors", "strict"),
1743 storage_options=self.options.get("storage_options", None),
1744 )
1745 assert self.handles is not None
1746 f = self.handles.handle
File ~\anaconda3\envs\env-2\lib\site-packages\pandas\io\common.py:713, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
710 codecs.lookup_error(errors)
712 # open URLs
--> 713 ioargs = _get_filepath_or_buffer(
714 path_or_buf,
715 encoding=encoding,
716 compression=compression,
717 mode=mode,
718 storage_options=storage_options,
719 )
721 handle = ioargs.filepath_or_buffer
722 handles: list[BaseBuffer]
File ~\anaconda3\envs\env-2\lib\site-packages\pandas\io\common.py:409, in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
406 pass
408 try:
--> 409 file_obj = fsspec.open(
410 filepath_or_buffer, mode=fsspec_mode, **(storage_options or {})
411 ).open()
412 # GH 34626 Reads from Public Buckets without Credentials needs anon=True
413 except tuple(err_types_to_retry_with_anon):
File ~\anaconda3\envs\env-2\lib\site-packages\fsspec\core.py:135, in OpenFile.open(self)
128 def open(self):
129 """Materialise this as a real open file without context
130
131 The OpenFile object should be explicitly closed to avoid enclosed file
132 instances persisting. You must, therefore, keep a reference to the OpenFile
133 during the life of the file-like it generates.
134 """
--> 135 return self.__enter__()
File ~\anaconda3\envs\env-2\lib\site-packages\fsspec\core.py:103, in OpenFile.__enter__(self)
100 def __enter__(self):
101 mode = self.mode.replace("t", "").replace("b", "") + "b"
--> 103 f = self.fs.open(self.path, mode=mode)
105 self.fobjects = [f]
107 if self.compression is not None:
File ~\anaconda3\envs\env-2\lib\site-packages\fsspec\spec.py:1106, in AbstractFileSystem.open(self, path, mode, block_size, cache_options, compression, **kwargs)
1104 else:
1105 ac = kwargs.pop("autocommit", not self._intrans)
-> 1106 f = self._open(
1107 path,
1108 mode=mode,
1109 block_size=block_size,
1110 autocommit=ac,
1111 cache_options=cache_options,
1112 **kwargs,
1113 )
1114 if compression is not None:
1115 from fsspec.compression import compr
File ~\anaconda3\envs\env-2\lib\site-packages\s3fs\core.py:640, in S3FileSystem._open(self, path, mode, block_size, acl, version_id, fill_cache, cache_type, autocommit, requester_pays, cache_options, **kwargs)
637 if cache_type is None:
638 cache_type = self.default_cache_type
--> 640 return S3File(
641 self,
642 path,
643 mode,
644 block_size=block_size,
645 acl=acl,
646 version_id=version_id,
647 fill_cache=fill_cache,
648 s3_additional_kwargs=kw,
649 cache_type=cache_type,
650 autocommit=autocommit,
651 requester_pays=requester_pays,
652 cache_options=cache_options,
653 )
File ~\anaconda3\envs\env-2\lib\site-packages\s3fs\core.py:1989, in S3File.__init__(self, s3, path, mode, block_size, acl, version_id, fill_cache, s3_additional_kwargs, autocommit, cache_type, requester_pays, cache_options)
1987 self.details = s3.info(path)
1988 self.version_id = self.details.get("VersionId")
-> 1989 super().__init__(
1990 s3,
1991 path,
1992 mode,
1993 block_size,
1994 autocommit=autocommit,
1995 cache_type=cache_type,
1996 cache_options=cache_options,
1997 )
1998 self.s3 = self.fs # compatibility
2000 # when not using autocommit we want to have transactional state to manage
File ~\anaconda3\envs\env-2\lib\site-packages\fsspec\spec.py:1462, in AbstractBufferedFile.__init__(self, fs, path, mode, block_size, autocommit, cache_type, cache_options, size, **kwargs)
1460 self.size = size
1461 else:
-> 1462 self.size = self.details["size"]
1463 self.cache = caches[cache_type](
1464 self.blocksize, self._fetch_range, self.size, **cache_options
1465 )
1466 else:
File ~\anaconda3\envs\env-2\lib\site-packages\fsspec\spec.py:1475, in AbstractBufferedFile.details(self)
1472 @property
1473 def details(self):
1474 if self._details is None:
-> 1475 self._details = self.fs.info(self.path)
1476 return self._details
File ~\anaconda3\envs\env-2\lib\site-packages\fsspec\asyn.py:113, in sync_wrapper.<locals>.wrapper(*args, **kwargs)
110 @functools.wraps(func)
111 def wrapper(*args, **kwargs):
112 self = obj or args[0]
--> 113 return sync(self.loop, func, *args, **kwargs)
File ~\anaconda3\envs\env-2\lib\site-packages\fsspec\asyn.py:98, in sync(loop, func, timeout, *args, **kwargs)
96 raise FSTimeoutError from return_result
97 elif isinstance(return_result, BaseException):
---> 98 raise return_result
99 else:
100 return return_result
File ~\anaconda3\envs\env-2\lib\site-packages\fsspec\asyn.py:53, in _runner(event, coro, result, timeout)
51 coro = asyncio.wait_for(coro, timeout=timeout)
52 try:
---> 53 result[0] = await coro
54 except Exception as ex:
55 result[0] = ex
File ~\anaconda3\envs\env-2\lib\site-packages\s3fs\core.py:1257, in S3FileSystem._info(self, path, bucket, key, refresh, version_id)
1245 if (
1246 out.get("KeyCount", 0) > 0
1247 or out.get("Contents", [])
1248 or out.get("CommonPrefixes", [])
1249 ):
1250 return {
1251 "name": "/".join([bucket, key]),
1252 "type": "directory",
1253 "size": 0,
1254 "StorageClass": "DIRECTORY",
1255 }
-> 1257 raise FileNotFoundError(path)
1258 except ClientError as e:
1259 raise translate_boto_error(e, set_cause=False)
FileNotFoundError: my-bucket-1/test.csv
Versions: s3fs 2022.11.0, aiobotocore 2.4.0, botocore 1.27.59.
Anonymous access to a public bucket also fails, this time with a ParseError:
>>> fs = s3fs.S3FileSystem(anon=True)
>>> fs.ls('s3://dask-data/nyc-taxi/2015')
ParseError
Check the bucket policy / IAM role that gives you permissions to access the bucket. It should have /* after the name of the resource:
"Action": "s3:GetObject",
"Resource": "arn:aws:s3:::my-bucket-1/*"
to allow you to access the objects in the bucket, not just the bucket itself.
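To tell the two permissions apart, here is a small diagnostic sketch using the same s3fs handle as above (the key name is the example one from the question): with s3:GetObject granted but s3:ListBucket missing, a direct open of a known key can still succeed even though ls() comes back empty.
import s3fs
fs = s3fs.S3FileSystem(key=AKEY, secret=SKEY)
# Listing needs s3:ListBucket on arn:aws:s3:::my-bucket-1;
# reading an object needs s3:GetObject on arn:aws:s3:::my-bucket-1/*.
print(fs.ls('s3://my-bucket-1'))                  # [] suggests the listing permission is missing
try:
    with fs.open('s3://my-bucket-1/test.csv', 'rb') as f:
        print(f.read(100))                        # works if object-level access is in place
except (FileNotFoundError, PermissionError) as exc:
    print('no object-level access, or the key does not exist:', exc)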
Have you tried boto3? s3fs is no longer supported.
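For completeness, a minimal boto3 sketch of the equivalent listing (this only swaps the library; it does not change the permissions the call needs):
import boto3
s3 = boto3.client('s3', aws_access_key_id=AKEY, aws_secret_access_key=SKEY)
resp = s3.list_objects_v2(Bucket='my-bucket-1')
print([obj['Key'] for obj in resp.get('Contents', [])])   # empty again if s3:ListBucket is not granted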

AttributeError: 'Adam' object has no attribute 'get_weights'

I am pretty new to TensorFlow/Keras, and there is a problem running cross-validation that I could not fix. It all worked before I installed featurewiz (conda install -c conda-forge featurewiz).
from sklearn.model_selection import KFold, cross_validate, cross_val_score
from scikeras.wrappers import KerasClassifier
estimator = KerasClassifier(model, epochs=500, batch_size=10) #, verbose = 0
kfold = KFold(n_splits=5, shuffle=True)
results = cross_validate(estimator, X, y, cv=kfold, scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'], return_train_score=True)
print(results)
Error:
WARNING:absl:Found untraced functions such as _update_step_xla while saving (showing 1 of 1). These functions will not be directly callable after loading.
INFO:tensorflow:Assets written to: ram:///var/folders/c4/ywdtx99d1vl0ptsg1fy494_40000gn/T/tmpsuvxkjb9/assets
INFO:tensorflow:Assets written to: ram:///var/folders/c4/ywdtx99d1vl0ptsg1fy494_40000gn/T/tmpsuvxkjb9/assets
---------------------------------------------------------------------------
Empty Traceback (most recent call last)
File ~/tensorflow-test/env/lib/python3.8/site-packages/joblib/parallel.py:862, in Parallel.dispatch_one_batch(self, iterator)
861 try:
--> 862 tasks = self._ready_batches.get(block=False)
863 except queue.Empty:
864 # slice the iterator n_jobs * batchsize items at a time. If the
865 # slice returns less than that, then the current batchsize puts
(...)
868 # accordingly to distribute evenly the last items between all
869 # workers.
File ~/tensorflow-test/env/lib/python3.8/queue.py:167, in Queue.get(self, block, timeout)
166 if not self._qsize():
--> 167 raise Empty
168 elif timeout is None:
Empty:
During handling of the above exception, another exception occurred:
AttributeError Traceback (most recent call last)
Cell In[5], line 6
4 estimator = KerasClassifier(model, epochs=500, batch_size=10) #, verbose = 0
5 kfold = KFold(n_splits=5, shuffle=True) # seed so the shuffle stays the same, random_state=1337
----> 6 results = cross_validate(estimator, X, y, cv=kfold, scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'], return_train_score=True)
8 print(results)
File ~/tensorflow-test/env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:266, in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
263 # We clone the estimator to make sure that all the folds are
264 # independent, and that it is pickle-able.
265 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
--> 266 results = parallel(
267 delayed(_fit_and_score)(
268 clone(estimator),
269 X,
270 y,
271 scorers,
272 train,
273 test,
274 verbose,
275 None,
276 fit_params,
277 return_train_score=return_train_score,
278 return_times=True,
279 return_estimator=return_estimator,
280 error_score=error_score,
281 )
282 for train, test in cv.split(X, y, groups)
283 )
285 _warn_or_raise_about_fit_failures(results, error_score)
287 # For callabe scoring, the return type is only know after calling. If the
288 # return type is a dictionary, the error scores can now be inserted with
289 # the correct key.
File ~/tensorflow-test/env/lib/python3.8/site-packages/joblib/parallel.py:1085, in Parallel.__call__(self, iterable)
1076 try:
1077 # Only set self._iterating to True if at least a batch
1078 # was dispatched. In particular this covers the edge
(...)
1082 # was very quick and its callback already dispatched all the
1083 # remaining jobs.
1084 self._iterating = False
-> 1085 if self.dispatch_one_batch(iterator):
1086 self._iterating = self._original_iterator is not None
1088 while self.dispatch_one_batch(iterator):
File ~/tensorflow-test/env/lib/python3.8/site-packages/joblib/parallel.py:873, in Parallel.dispatch_one_batch(self, iterator)
870 n_jobs = self._cached_effective_n_jobs
871 big_batch_size = batch_size * n_jobs
--> 873 islice = list(itertools.islice(iterator, big_batch_size))
874 if len(islice) == 0:
875 return False
File ~/tensorflow-test/env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:268, in <genexpr>(.0)
263 # We clone the estimator to make sure that all the folds are
264 # independent, and that it is pickle-able.
265 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
266 results = parallel(
267 delayed(_fit_and_score)(
--> 268 clone(estimator),
269 X,
270 y,
271 scorers,
272 train,
273 test,
274 verbose,
275 None,
276 fit_params,
277 return_train_score=return_train_score,
278 return_times=True,
279 return_estimator=return_estimator,
280 error_score=error_score,
281 )
282 for train, test in cv.split(X, y, groups)
283 )
285 _warn_or_raise_about_fit_failures(results, error_score)
287 # For callabe scoring, the return type is only know after calling. If the
288 # return type is a dictionary, the error scores can now be inserted with
289 # the correct key.
File ~/tensorflow-test/env/lib/python3.8/site-packages/sklearn/base.py:89, in clone(estimator, safe)
87 new_object_params = estimator.get_params(deep=False)
88 for name, param in new_object_params.items():
---> 89 new_object_params[name] = clone(param, safe=False)
90 new_object = klass(**new_object_params)
91 params_set = new_object.get_params(deep=False)
File ~/tensorflow-test/env/lib/python3.8/site-packages/sklearn/base.py:70, in clone(estimator, safe)
68 elif not hasattr(estimator, "get_params") or isinstance(estimator, type):
69 if not safe:
---> 70 return copy.deepcopy(estimator)
71 else:
72 if isinstance(estimator, type):
File ~/tensorflow-test/env/lib/python3.8/copy.py:153, in deepcopy(x, memo, _nil)
151 copier = getattr(x, "__deepcopy__", None)
152 if copier is not None:
--> 153 y = copier(memo)
154 else:
155 reductor = dispatch_table.get(cls)
File ~/tensorflow-test/env/lib/python3.8/site-packages/scikeras/_saving_utils.py:117, in deepcopy_model(model, memo)
116 def deepcopy_model(model: keras.Model, memo: Dict[Hashable, Any]) -> keras.Model:
--> 117 _, (model_bytes, optimizer_weights) = pack_keras_model(model)
118 new_model = unpack_keras_model(model_bytes, optimizer_weights)
119 memo[model] = new_model
File ~/tensorflow-test/env/lib/python3.8/site-packages/scikeras/_saving_utils.py:108, in pack_keras_model(model)
106 optimizer_weights = None
107 if model.optimizer is not None:
--> 108 optimizer_weights = model.optimizer.get_weights()
109 model_bytes = np.asarray(memoryview(b.read()))
110 return (
111 unpack_keras_model,
112 (model_bytes, optimizer_weights),
113 )
AttributeError: 'Adam' object has no attribute 'get_weights'
I created a TensorFlow environment on my M1 MacBook following https://github.com/mrdbourke/m1-machine-learning-test.
It all worked, and I got the following results:
TensorFlow has access to the following devices:
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
TensorFlow version: 2.11.0
I also installed featurewiz; I am not sure whether there were problems installing it (I used conda install -c conda-forge featurewiz).
SciKeras doesn't work with TensorFlow 2.11. The TensorFlow team released a breaking change in a minor version bump (they removed the get_weights() method). It will be fixed in SciKeras soon: https://github.com/adriangb/scikeras/pull/287
Edit: that PR has been merged, so the new version of SciKeras (v0.10.0) should solve this issue.
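In practice that means either upgrading SciKeras or pinning TensorFlow; a sketch of both options, run inside the same environment the notebook uses:
pip install --upgrade "scikeras>=0.10.0"   # picks up the fix from the PR above
# or, alternatively, stay on the older optimizer API:
pip install "tensorflow<2.11"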

Translate Tweets using googletrans - AttributeError: 'NoneType' object has no attribute 'group'

I have a pandas DataFrame with some German tweets. I want to translate these tweets to English using googletrans, applying it via a lambda function to my DataFrame.
I use this code:
from googletrans import Translator
df1['translated_tweet'] = df1['tweet'].apply(lambda x: Translator().translate(x, dest='en').text)
And get this error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-16-ed0b3a6e6dd8> in <module>
----> 1 df1['translated_tweet'] = df1['tweet'].apply(lambda x: Translator().translate(x, dest='en').text)
~\Downloads\WPy64-3920\python-3.9.2.amd64\lib\site-packages\pandas\core\series.py in apply(self, func, convert_dtype, args, **kwds)
4136 else:
4137 values = self.astype(object)._values
-> 4138 mapped = lib.map_infer(values, f, convert=convert_dtype)
4139
4140 if len(mapped) and isinstance(mapped[0], Series):
pandas\_libs\lib.pyx in pandas._libs.lib.map_infer()
<ipython-input-16-ed0b3a6e6dd8> in <lambda>(x)
----> 1 df1['translated_tweet'] = df1['tweet'].apply(lambda x: Translator().translate(x, dest='en').text)
~\Downloads\WPy64-3920\python-3.9.2.amd64\lib\site-packages\googletrans\client.py in translate(self, text, dest, src, **kwargs)
180
181 origin = text
--> 182 data = self._translate(text, dest, src, kwargs)
183
184 # this code will be updated when the format is changed.
~\Downloads\WPy64-3920\python-3.9.2.amd64\lib\site-packages\googletrans\client.py in _translate(self, text, dest, src, override)
76
77 def _translate(self, text, dest, src, override):
---> 78 token = self.token_acquirer.do(text)
79 params = utils.build_params(query=text, src=src, dest=dest,
80 token=token, override=override)
~\Downloads\WPy64-3920\python-3.9.2.amd64\lib\site-packages\googletrans\gtoken.py in do(self, text)
192
193 def do(self, text):
--> 194 self._update()
195 tk = self.acquire(text)
196 return tk
~\Downloads\WPy64-3920\python-3.9.2.amd64\lib\site-packages\googletrans\gtoken.py in _update(self)
60
61 # this will be the same as python code after stripping out a reserved word 'var'
---> 62 code = self.RE_TKK.search(r.text).group(1).replace('var ', '')
63 # unescape special ascii characters such like a \x3d(=)
64 code = code.encode().decode('unicode-escape')
AttributeError: 'NoneType' object has no attribute 'group'
What do I have to change here?
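One way to narrow this down (a sketch for isolating the failure, not a fix; it reuses df1 from above) is to create a single Translator and call it on one tweet outside of apply, which shows whether googletrans raises the same AttributeError independently of pandas:
from googletrans import Translator
translator = Translator()                 # one instance, reused instead of one per row
sample = df1['tweet'].iloc[0]
print(translator.translate(sample, dest='en').text)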

Pandas can't read in excel file

Something is wrong with my pandas module. I tried to read in an Excel file using the following code, which works on my classmate's computer but gives me an error on my computer:
FFT1=pd.read_excel('FFT1.xlsx', sheet_name='sheet1')
The file named 'FFT1.xlsx' is in the same directory as my jupyter notebook. The error message says:
XLRDError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_7436/2793485739.py in <module>
----> 1 FFT1=pd.read_excel('FFT1.xlsx', sheet_name='sheet1')
D:\Softwares\Anaconda\lib\site-packages\pandas\io\excel\_base.py in read_excel(io, sheet_name, header, names, index_col, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, verbose, parse_dates, date_parser, thousands, comment, skipfooter, convert_float, mangle_dupe_cols, **kwds)
302
303 if not isinstance(io, ExcelFile):
--> 304 io = ExcelFile(io, engine=engine)
305 elif engine and engine != io.engine:
306 raise ValueError(
D:\Softwares\Anaconda\lib\site-packages\pandas\io\excel\_base.py in __init__(self, io, engine)
819 self._io = stringify_path(io)
820
--> 821 self._reader = self._engines[engine](self._io)
822
823 def __fspath__(self):
D:\Softwares\Anaconda\lib\site-packages\pandas\io\excel\_xlrd.py in __init__(self, filepath_or_buffer)
19 err_msg = "Install xlrd >= 1.0.0 for Excel support"
20 import_optional_dependency("xlrd", extra=err_msg)
---> 21 super().__init__(filepath_or_buffer)
22
23 @property
D:\Softwares\Anaconda\lib\site-packages\pandas\io\excel\_base.py in __init__(self, filepath_or_buffer)
351 self.book = self.load_workbook(filepath_or_buffer)
352 elif isinstance(filepath_or_buffer, str):
--> 353 self.book = self.load_workbook(filepath_or_buffer)
354 elif isinstance(filepath_or_buffer, bytes):
355 self.book = self.load_workbook(BytesIO(filepath_or_buffer))
D:\Softwares\Anaconda\lib\site-packages\pandas\io\excel\_xlrd.py in load_workbook(self, filepath_or_buffer)
34 return open_workbook(file_contents=data)
35 else:
---> 36 return open_workbook(filepath_or_buffer)
37
38 @property
D:\Softwares\Anaconda\lib\site-packages\xlrd\__init__.py in open_workbook(filename, logfile, verbosity, use_mmap, file_contents, encoding_override, formatting_info, on_demand, ragged_rows, ignore_workbook_corruption)
168 # files that xlrd can parse don't start with the expected signature.
169 if file_format and file_format != 'xls':
--> 170 raise XLRDError(FILE_FORMAT_DESCRIPTIONS[file_format]+'; not supported')
171
172 bk = open_workbook_xls(
XLRDError: Excel xlsx file; not supported
How should I fix this?
Make sure that you have openpyxl installed; if you don't, try
pip install openpyxl
Then change your code to
FFT1=pd.read_excel('FFT1.xlsx', sheet_name='sheet1', engine='openpyxl')
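For background: xlrd 2.0 and later only read the legacy .xls format, which is why pandas needs openpyxl for .xlsx files. A quick sketch to confirm what is installed before reading (same file and sheet name as above):
import pandas as pd
import openpyxl
print(pd.__version__, openpyxl.__version__)   # openpyxl present means .xlsx can be read
FFT1 = pd.read_excel('FFT1.xlsx', sheet_name='sheet1', engine='openpyxl')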

pandas to_parquet fails on large datasets

I'm trying to save a very large dataset using pandas to_parquet, and it seems to fail when exceeding a certain limit, both with 'pyarrow' and 'fastparquet'. I reproduced the errors I am getting with the following code, and would be happy to hear ideas on how to overcome that issue:
Using Pyarrow:
import numpy as np
import pandas as pd
from time import time

# tmp_file is a path to a temporary .parquet file, defined elsewhere
low = 3
high = 8
for n in np.logspace(low, high, high-low+1):
    t0 = time()
    df = pd.DataFrame.from_records([(f'ind_{x}', ''.join(['x']*50)) for x in range(int(n))], columns=['a', 'b']).set_index('a')
    df.to_parquet(tmp_file, engine='pyarrow', compression='gzip')
    pd.read_parquet(tmp_file, engine='pyarrow')
    print(f'10^{np.log10(int(n))} read-write took {time()-t0} seconds')
10^3.0 read-write took 0.012851715087890625 seconds
10^4.0 read-write took 0.05722832679748535 seconds
10^5.0 read-write took 0.46846866607666016 seconds
10^6.0 read-write took 4.4494054317474365 seconds
10^7.0 read-write took 43.0602171421051 seconds
---------------------------------------------------------------------------
ArrowIOError Traceback (most recent call last)
<ipython-input-51-cad917a26b91> in <module>()
5 df = pd.DataFrame.from_records([(f'ind_{x}', ''.join(['x']*50)) for x in range(int(n))], columns=['a', 'b']).set_index('a')
6 df.to_parquet(tmp_file, engine='pyarrow', compression='gzip')
----> 7 pd.read_parquet(tmp_file, engine='pyarrow')
8 print(f'10^{np.log10(int(n))} read-write took {time()-t0} seconds')
~/.conda/envs/anaconda3/lib/python3.6/site-packages/pandas/io/parquet.py in read_parquet(path, engine, columns, **kwargs)
255
256 impl = get_engine(engine)
--> 257 return impl.read(path, columns=columns, **kwargs)
~/.conda/envs/anaconda3/lib/python3.6/site-packages/pandas/io/parquet.py in read(self, path, columns, **kwargs)
128 kwargs['use_pandas_metadata'] = True
129 return self.api.parquet.read_table(path, columns=columns,
--> 130 **kwargs).to_pandas()
131
132 def _validate_write_lt_070(self, df):
~/.conda/envs/anaconda3/lib/python3.6/site-packages/pyarrow/parquet.py in read_table(source, columns, nthreads, metadata, use_pandas_metadata)
939 pf = ParquetFile(source, metadata=metadata)
940 return pf.read(columns=columns, nthreads=nthreads,
--> 941 use_pandas_metadata=use_pandas_metadata)
942
943
~/.conda/envs/anaconda3/lib/python3.6/site-packages/pyarrow/parquet.py in read(self, columns, nthreads, use_pandas_metadata)
148 columns, use_pandas_metadata=use_pandas_metadata)
149 return self.reader.read_all(column_indices=column_indices,
--> 150 nthreads=nthreads)
151
152 def scan_contents(self, columns=None, batch_size=65536):
_parquet.pyx in pyarrow._parquet.ParquetReader.read_all()
error.pxi in pyarrow.lib.check_status()
ArrowIOError: Arrow error: Invalid: BinaryArray cannot contain more than 2147483646 bytes, have 2147483650
Using fastparquet:
# same imports and tmp_file as in the pyarrow example above
low = 3
high = 8
for n in np.logspace(low, high, high-low+1):
    t0 = time()
    df = pd.DataFrame.from_records([(f'ind_{x}', ''.join(['x']*50)) for x in range(int(n))], columns=['a', 'b']).set_index('a')
    df.to_parquet(tmp_file, engine='fastparquet', compression='gzip')
    pd.read_parquet(tmp_file, engine='fastparquet')
    print(f'10^{np.log10(int(n))} read-write took {time()-t0} seconds')
10^3.0 read-write took 0.17770028114318848 seconds
10^4.0 read-write took 0.06351733207702637 seconds
10^5.0 read-write took 0.46896958351135254 seconds
10^6.0 read-write took 5.464379549026489 seconds
10^7.0 read-write took 50.26520347595215 seconds
---------------------------------------------------------------------------
OverflowError Traceback (most recent call last)
<ipython-input-49-234a889ae790> in <module>()
4 t0 = time()
5 df = pd.DataFrame.from_records([(f'ind_{x}', ''.join(['x']*50)) for x in range(int(n))], columns=['a', 'b']).set_index('a')
----> 6 df.to_parquet(tmp_file, engine='fastparquet', compression='gzip')
7 pd.read_parquet(tmp_file, engine='fastparquet')
8 print(f'10^{np.log10(int(n))} read-write took {time()-t0} seconds')
~/.conda/envs/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in to_parquet(self, fname, engine, compression, **kwargs)
1647 from pandas.io.parquet import to_parquet
1648 to_parquet(self, fname, engine,
-> 1649 compression=compression, **kwargs)
1650
1651 @Substitution(header='Write out the column names. If a list of strings '
~/.conda/envs/anaconda3/lib/python3.6/site-packages/pandas/io/parquet.py in to_parquet(df, path, engine, compression, **kwargs)
225 """
226 impl = get_engine(engine)
--> 227 return impl.write(df, path, compression=compression, **kwargs)
228
229
~/.conda/envs/anaconda3/lib/python3.6/site-packages/pandas/io/parquet.py in write(self, df, path, compression, **kwargs)
198 with catch_warnings(record=True):
199 self.api.write(path, df,
--> 200 compression=compression, **kwargs)
201
202 def read(self, path, columns=None, **kwargs):
~/.conda/envs/anaconda3/lib/python3.6/site-packages/fastparquet/writer.py in write(filename, data, row_group_offsets, compression, file_scheme, open_with, mkdirs, has_nulls, write_index, partition_on, fixed_text, append, object_encoding, times)
846 if file_scheme == 'simple':
847 write_simple(filename, data, fmd, row_group_offsets,
--> 848 compression, open_with, has_nulls, append)
849 elif file_scheme in ['hive', 'drill']:
850 if append:
~/.conda/envs/anaconda3/lib/python3.6/site-packages/fastparquet/writer.py in write_simple(fn, data, fmd, row_group_offsets, compression, open_with, has_nulls, append)
715 else None)
716 rg = make_row_group(f, data[start:end], fmd.schema,
--> 717 compression=compression)
718 if rg is not None:
719 fmd.row_groups.append(rg)
~/.conda/envs/anaconda3/lib/python3.6/site-packages/fastparquet/writer.py in make_row_group(f, data, schema, compression)
612 comp = compression
613 chunk = write_column(f, data[column.name], column,
--> 614 compression=comp)
615 rg.columns.append(chunk)
616 rg.total_byte_size = sum([c.meta_data.total_uncompressed_size for c in
~/.conda/envs/anaconda3/lib/python3.6/site-packages/fastparquet/writer.py in write_column(f, data, selement, compression)
545 data_page_header=dph, crc=None)
546
--> 547 write_thrift(f, ph)
548 f.write(bdata)
549
~/.conda/envs/anaconda3/lib/python3.6/site-packages/fastparquet/thrift_structures.py in write_thrift(fobj, thrift)
49 pout = TCompactProtocol(fobj)
50 try:
---> 51 thrift.write(pout)
52 fail = False
53 except TProtocolException as e:
~/.conda/envs/anaconda3/lib/python3.6/site-packages/fastparquet/parquet_thrift/parquet/ttypes.py in write(self, oprot)
1028 def write(self, oprot):
1029 if oprot._fast_encode is not None and self.thrift_spec is not None:
-> 1030 oprot.trans.write(oprot._fast_encode(self, [self.__class__, self.thrift_spec]))
1031 return
1032 oprot.writeStructBegin('PageHeader')
OverflowError: int out of range
It seems you succeeded in writing with pyarrow but not in reading it back, and you failed to write with fastparquet, so you never got to the read. I suggest you write the data with pyarrow and read it with fastparquet in chunks, iterating through the row groups:
from fastparquet import ParquetFile

df.to_parquet(tmp_file, engine='pyarrow', compression='gzip')
pf = ParquetFile(tmp_file)
for df in pf.iter_row_groups():
    print(df.head(n=10))   # process one row group at a time instead of the whole file
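Another knob on the write side (a sketch, assuming pandas forwards extra keyword arguments to pyarrow's write_table, which accepts row_group_size): smaller row groups keep each binary column chunk below the 2 GiB limit from the error above, which can make the plain pd.read_parquet call work as well.
df.to_parquet(tmp_file, engine='pyarrow', compression='gzip',
              row_group_size=1_000_000)   # rows per row group; tune to your column widths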
I had a similar issue; upgrading to pyarrow 0.12 worked for me and let me read the file in one go (instead of in chunks).