Exporting from Access Database (.mdb) via Python to Pandas Dataframe - pandas

I'm looking for a method to programmatically access an access database file (.mdb) using python in an Ubuntu environment (hosted on a Windows 10 machine) to export tables to pandas dataframes.
I've attempted the solutions previously posted, such as using pandas access, pyodbc, or mdbtools, however all have failed due to errors relating to missing tables or files in the packages.
Any feedback or insights would be much appreciated!
pyodc failure:
OperationalError Traceback (most recent call last)
Cell In[6], line 2
1 pypyodbc.lowercase = False
----> 2 conn = pypyodbc.connect(
3 r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};" +
4 r"/home/kevin/scratch/HDINEWTON.mdb;")
5 cur = conn.cursor()
6 # cur.execute("SELECT CreatureID, Name_EN, Name_JP FROM Creatures");
7 # while True:
8 # row = cur.fetchone()
(...)
11 # print(u"Creature with ID {0} is {1} ({2})".format(
12 # row.get("CreatureID"), row.get("Name_EN"), row.get("Name_JP")))
File ~/miniconda3/envs/mdb_access/lib/python3.11/site-packages/pypyodbc.py:2454, in Connection.__init__(self, connectString, autocommit, ansi, timeout, unicode_results, readonly, **kargs)
2450 if self.connection_timeout != 0:
2451 self.set_connection_timeout(connection_timeout)
-> 2454 self.connect(connectString, autocommit, ansi, timeout, unicode_results, readonly)
File ~/miniconda3/envs/mdb_access/lib/python3.11/site-packages/pypyodbc.py:2507, in Connection.connect(self, connectString, autocommit, ansi, timeout, unicode_results, readonly)
2505 else:
2506 ret = odbc_func(self.dbc_h, 0, c_connectString, len(self.connectString), None, 0, None, SQL_DRIVER_NOPROMPT)
-> 2507 check_success(self, ret)
2510 # Set the connection's attribute of "autocommit"
2511 #
2512 self.autocommit = autocommit
File ~/miniconda3/envs/mdb_access/lib/python3.11/site-packages/pypyodbc.py:1009, in check_success(ODBC_obj, ret)
1007 ctrl_err(SQL_HANDLE_STMT, ODBC_obj.stmt_h, ret, ODBC_obj.ansi)
1008 elif isinstance(ODBC_obj, Connection):
-> 1009 ctrl_err(SQL_HANDLE_DBC, ODBC_obj.dbc_h, ret, ODBC_obj.ansi)
1010 else:
1011 ctrl_err(SQL_HANDLE_ENV, ODBC_obj, ret, False)
File ~/miniconda3/envs/mdb_access/lib/python3.11/site-packages/pypyodbc.py:983, in ctrl_err(ht, h, val_ret, ansi)
981 raise NotSupportedError(state,err_text)
982 elif state in (raw_s('HYT00'),raw_s('HYT01'),raw_s('01000')):
--> 983 raise OperationalError(state,err_text)
984 elif state[:2] in (raw_s('IM'),raw_s('HY')):
985 raise Error(state,err_text)
OperationalError: ('01000', "[01000] [unixODBC][Driver Manager]Can't open lib 'Microsoft Access Driver (*.mdb, *.accdb)' : file not found")
pandas access failure:
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[4], line 2
1 path_to_mdb_database = '/home/kevin/scratch/HDINEWTON.mdb'
----> 2 mdb.list_tables(path_to_mdb_database)
File ~/miniconda3/envs/mdb_access/lib/python3.11/site-packages/pandas_access/__init__.py:25, in list_tables(rdb_file, encoding)
17 def list_tables(rdb_file, encoding="latin-1"):
18 """
19 :param rdb_file: The MS Access database file.
20 :param encoding: The content encoding of the output. I assume `latin-1`
(...)
23 :return: A list of the tables in a given database.
24 """
---> 25 tables = subprocess.check_output(['mdb-tables', rdb_file]).decode(encoding)
26 return tables.strip().split(" ")
File ~/miniconda3/envs/mdb_access/lib/python3.11/subprocess.py:465, in check_output(timeout, *popenargs, **kwargs)
462 empty = b''
463 kwargs['input'] = empty
--> 465 return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,
466 **kwargs).stdout
File ~/miniconda3/envs/mdb_access/lib/python3.11/subprocess.py:546, in run(input, capture_output, timeout, check, *popenargs, **kwargs)
543 kwargs['stdout'] = PIPE
544 kwargs['stderr'] = PIPE
--> 546 with Popen(*popenargs, **kwargs) as process:
547 try:
548 stdout, stderr = process.communicate(input, timeout=timeout)
File ~/miniconda3/envs/mdb_access/lib/python3.11/subprocess.py:1022, in Popen.__init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, user, group, extra_groups, encoding, errors, text, umask, pipesize, process_group)
1018 if self.text_mode:
1019 self.stderr = io.TextIOWrapper(self.stderr,
1020 encoding=encoding, errors=errors)
-> 1022 self._execute_child(args, executable, preexec_fn, close_fds,
1023 pass_fds, cwd, env,
1024 startupinfo, creationflags, shell,
1025 p2cread, p2cwrite,
1026 c2pread, c2pwrite,
1027 errread, errwrite,
1028 restore_signals,
1029 gid, gids, uid, umask,
1030 start_new_session, process_group)
1031 except:
1032 # Cleanup if the child failed starting.
1033 for f in filter(None, (self.stdin, self.stdout, self.stderr)):
File ~/miniconda3/envs/mdb_access/lib/python3.11/subprocess.py:1899, in Popen._execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, gid, gids, uid, umask, start_new_session, process_group)
1897 if errno_num != 0:
1898 err_msg = os.strerror(errno_num)
-> 1899 raise child_exception_type(errno_num, err_msg, err_filename)
1900 raise child_exception_type(err_msg)
FileNotFoundError: [Errno 2] No such file or directory: 'mdb-tables'
mdb tools failure:
ModuleNotFoundError Traceback (most recent call last)
Cell In[2], line 5
3 import os
4 import pandas_access as mdb
----> 5 import mdbtools
7 print('Imports Successful')
File ~/miniconda3/envs/mdb/lib/python3.11/site-packages/mdbtools/__init__.py:1
----> 1 from .MDBcli import *
2 from .MDBnetrc import *
3 from .MDBtool import *
File ~/miniconda3/envs/mdb/lib/python3.11/site-packages/mdbtools/MDBcli.py:21
19 import sys
20 import argparse
---> 21 from MDButils import *
23 # Stop Python from complaining when I/O pipes are closed
24 from signal import signal, SIGPIPE, SIG_DFL
ModuleNotFoundError: No module named 'MDButils'

Related

s3fs FileNotFoundError

I am only able to gain limited/top-level access to my aws s3. I can see the buckets, but not their contents; neither subfolders nor files. I'm running everything from inside a conda environment. I've tried accessing files in private and public buckets without success. What am I doing wrong?
This block of code works as expected
>>> import s3fs
>>> AKEY = 'XXXX'
>>> SKEY = 'XXXX'
>>> fs = s3fs.S3FileSystem(key=AKEY,secret=SKEY)
>>> fs.ls('s3://')
['my-bucket-1',
'my-bucket-2',
'my-bucket-3']
This block doesn't
>>> fs.ls('s3://my-bucket-1')
[]
what I expect
>>> fs.ls('s3://my-bucket-1')
['my-bucket-1/test.txt',
'my-bucket-1/test.csv']
When I try to open a file I get a FileNotFoundError
import pandas as pd
pd.read_csv(
's3://my-bucket-1/test.csv',
storage_options={'key':AKEY,'secret':SKEY}
)
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[8], line 2
1 import pandas as pd
----> 2 pd.read_csv(
3 's3://my-bucket-1/test.csv'',
4 storage_options={'key':AKEY,'secret':SKEY}
5 )
File ~\anaconda3\envs\env-2\lib\site-packages\pandas\util\_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs)
209 else:
210 kwargs[new_arg_name] = new_arg_value
--> 211 return func(*args, **kwargs)
File ~\anaconda3\envs\env-2\lib\site-packages\pandas\util\_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
325 if len(args) > num_allow_args:
326 warnings.warn(
327 msg.format(arguments=_format_argument_list(allow_args)),
328 FutureWarning,
329 stacklevel=find_stack_level(),
330 )
--> 331 return func(*args, **kwargs)
File ~\anaconda3\envs\env-2\lib\site-packages\pandas\io\parsers\readers.py:950, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
935 kwds_defaults = _refine_defaults_read(
936 dialect,
937 delimiter,
(...)
946 defaults={"delimiter": ","},
947 )
948 kwds.update(kwds_defaults)
--> 950 return _read(filepath_or_buffer, kwds)
File ~\anaconda3\envs\env-2\lib\site-packages\pandas\io\parsers\readers.py:605, in _read(filepath_or_buffer, kwds)
602 _validate_names(kwds.get("names", None))
604 # Create the parser.
--> 605 parser = TextFileReader(filepath_or_buffer, **kwds)
607 if chunksize or iterator:
608 return parser
File ~\anaconda3\envs\env-2\lib\site-packages\pandas\io\parsers\readers.py:1442, in TextFileReader.__init__(self, f, engine, **kwds)
1439 self.options["has_index_names"] = kwds["has_index_names"]
1441 self.handles: IOHandles | None = None
-> 1442 self._engine = self._make_engine(f, self.engine)
File ~\anaconda3\envs\env-2\lib\site-packages\pandas\io\parsers\readers.py:1735, in TextFileReader._make_engine(self, f, engine)
1733 if "b" not in mode:
1734 mode += "b"
-> 1735 self.handles = get_handle(
1736 f,
1737 mode,
1738 encoding=self.options.get("encoding", None),
1739 compression=self.options.get("compression", None),
1740 memory_map=self.options.get("memory_map", False),
1741 is_text=is_text,
1742 errors=self.options.get("encoding_errors", "strict"),
1743 storage_options=self.options.get("storage_options", None),
1744 )
1745 assert self.handles is not None
1746 f = self.handles.handle
File ~\anaconda3\envs\env-2\lib\site-packages\pandas\io\common.py:713, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
710 codecs.lookup_error(errors)
712 # open URLs
--> 713 ioargs = _get_filepath_or_buffer(
714 path_or_buf,
715 encoding=encoding,
716 compression=compression,
717 mode=mode,
718 storage_options=storage_options,
719 )
721 handle = ioargs.filepath_or_buffer
722 handles: list[BaseBuffer]
File ~\anaconda3\envs\env-2\lib\site-packages\pandas\io\common.py:409, in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
406 pass
408 try:
--> 409 file_obj = fsspec.open(
410 filepath_or_buffer, mode=fsspec_mode, **(storage_options or {})
411 ).open()
412 # GH 34626 Reads from Public Buckets without Credentials needs anon=True
413 except tuple(err_types_to_retry_with_anon):
File ~\anaconda3\envs\env-2\lib\site-packages\fsspec\core.py:135, in OpenFile.open(self)
128 def open(self):
129 """Materialise this as a real open file without context
130
131 The OpenFile object should be explicitly closed to avoid enclosed file
132 instances persisting. You must, therefore, keep a reference to the OpenFile
133 during the life of the file-like it generates.
134 """
--> 135 return self.__enter__()
File ~\anaconda3\envs\env-2\lib\site-packages\fsspec\core.py:103, in OpenFile.__enter__(self)
100 def __enter__(self):
101 mode = self.mode.replace("t", "").replace("b", "") + "b"
--> 103 f = self.fs.open(self.path, mode=mode)
105 self.fobjects = [f]
107 if self.compression is not None:
File ~\anaconda3\envs\env-2\lib\site-packages\fsspec\spec.py:1106, in AbstractFileSystem.open(self, path, mode, block_size, cache_options, compression, **kwargs)
1104 else:
1105 ac = kwargs.pop("autocommit", not self._intrans)
-> 1106 f = self._open(
1107 path,
1108 mode=mode,
1109 block_size=block_size,
1110 autocommit=ac,
1111 cache_options=cache_options,
1112 **kwargs,
1113 )
1114 if compression is not None:
1115 from fsspec.compression import compr
File ~\anaconda3\envs\env-2\lib\site-packages\s3fs\core.py:640, in S3FileSystem._open(self, path, mode, block_size, acl, version_id, fill_cache, cache_type, autocommit, requester_pays, cache_options, **kwargs)
637 if cache_type is None:
638 cache_type = self.default_cache_type
--> 640 return S3File(
641 self,
642 path,
643 mode,
644 block_size=block_size,
645 acl=acl,
646 version_id=version_id,
647 fill_cache=fill_cache,
648 s3_additional_kwargs=kw,
649 cache_type=cache_type,
650 autocommit=autocommit,
651 requester_pays=requester_pays,
652 cache_options=cache_options,
653 )
File ~\anaconda3\envs\env-2\lib\site-packages\s3fs\core.py:1989, in S3File.__init__(self, s3, path, mode, block_size, acl, version_id, fill_cache, s3_additional_kwargs, autocommit, cache_type, requester_pays, cache_options)
1987 self.details = s3.info(path)
1988 self.version_id = self.details.get("VersionId")
-> 1989 super().__init__(
1990 s3,
1991 path,
1992 mode,
1993 block_size,
1994 autocommit=autocommit,
1995 cache_type=cache_type,
1996 cache_options=cache_options,
1997 )
1998 self.s3 = self.fs # compatibility
2000 # when not using autocommit we want to have transactional state to manage
File ~\anaconda3\envs\env-2\lib\site-packages\fsspec\spec.py:1462, in AbstractBufferedFile.__init__(self, fs, path, mode, block_size, autocommit, cache_type, cache_options, size, **kwargs)
1460 self.size = size
1461 else:
-> 1462 self.size = self.details["size"]
1463 self.cache = caches[cache_type](
1464 self.blocksize, self._fetch_range, self.size, **cache_options
1465 )
1466 else:
File ~\anaconda3\envs\env-2\lib\site-packages\fsspec\spec.py:1475, in AbstractBufferedFile.details(self)
1472 #property
1473 def details(self):
1474 if self._details is None:
-> 1475 self._details = self.fs.info(self.path)
1476 return self._details
File ~\anaconda3\envs\env-2\lib\site-packages\fsspec\asyn.py:113, in sync_wrapper.<locals>.wrapper(*args, **kwargs)
110 #functools.wraps(func)
111 def wrapper(*args, **kwargs):
112 self = obj or args[0]
--> 113 return sync(self.loop, func, *args, **kwargs)
File ~\anaconda3\envs\env-2\lib\site-packages\fsspec\asyn.py:98, in sync(loop, func, timeout, *args, **kwargs)
96 raise FSTimeoutError from return_result
97 elif isinstance(return_result, BaseException):
---> 98 raise return_result
99 else:
100 return return_result
File ~\anaconda3\envs\env-2\lib\site-packages\fsspec\asyn.py:53, in _runner(event, coro, result, timeout)
51 coro = asyncio.wait_for(coro, timeout=timeout)
52 try:
---> 53 result[0] = await coro
54 except Exception as ex:
55 result[0] = ex
File ~\anaconda3\envs\env-2\lib\site-packages\s3fs\core.py:1257, in S3FileSystem._info(self, path, bucket, key, refresh, version_id)
1245 if (
1246 out.get("KeyCount", 0) > 0
1247 or out.get("Contents", [])
1248 or out.get("CommonPrefixes", [])
1249 ):
1250 return {
1251 "name": "/".join([bucket, key]),
1252 "type": "directory",
1253 "size": 0,
1254 "StorageClass": "DIRECTORY",
1255 }
-> 1257 raise FileNotFoundError(path)
1258 except ClientError as e:
1259 raise translate_boto_error(e, set_cause=False)
FileNotFoundError: my-bucket-1/test.csv
s3fs-2022.11.0, aiobotocore-2.4.0, botocore-1.27.59
fs = s3fs.S3FileSystem(anon=True)
fs.ls('s3://dask-data/nyc-taxi/2015')
ParseError
Check the bucket policy / IAM role that gives you permissions to access the bucket. It should have /* after the name of the resource:
"Action": "s3:GetObject",
"Resource": "arn:aws:s3:::my-bucket-1/*"
to allow you access the objects in the bucket, not just the bucket itself.
Have you tried boto3? s3fs is no longer supported.

unable to read csv file in jupyter notebook and following errors coming

import pandas as pd
df = pd.read_csv('D:\Tableau\codebasics_files\Weather_data.csv.xlsx')
df
​
UnicodeDecodeError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_18872\1985582496.py in
1 import pandas as pd
----> 2 df = pd.read_csv('D:\Tableau\codebasics_files\Weather_data.csv.xlsx')
3 df
C:\ProgramData\Anaconda3\lib\site-packages\pandas\util_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
676 kwds.update(kwds_defaults)
677
--> 678 return _read(filepath_or_buffer, kwds)
679
680
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py in _read(filepath_or_buffer, kwds)
573
574 # Create the parser.
--> 575 parser = TextFileReader(filepath_or_buffer, **kwds)
576
577 if chunksize or iterator:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py in init(self, f, engine, **kwds)
930
931 self.handles: IOHandles | None = None
--> 932 self._engine = self._make_engine(f, self.engine)
933
934 def close(self):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py in _make_engine(self, f, engine)
1232
1233 try:
-> 1234 return mapping[engine](f, **self.options)
1235 except Exception:
1236 if self.handles is not None:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\c_parser_wrapper.py in init(self, src, **kwds)
73
74 kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
---> 75 self._reader = parsers.TextReader(src, **kwds)
76
77 self.unnamed_cols = self._reader.unnamed_cols
C:\ProgramData\Anaconda3\lib\site-packages\pandas_libs\parsers.pyx in pandas._libs.parsers.TextReader.cinit()
C:\ProgramData\Anaconda3\lib\site-packages\pandas_libs\parsers.pyx in pandas._libs.parsers.TextReader._get_header()
C:\ProgramData\Anaconda3\lib\site-packages\pandas_libs\parsers.pyx in pandas._libs.parsers.TextReader._tokenize_rows()
C:\ProgramData\Anaconda3\lib\site-packages\pandas_libs\parsers.pyx in pandas._libs.parsers.raise_parser_error()
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xaa in position 14: invalid start byte
​
i tried some options using youtube but not working
Looking at the very end of the file extension, you're importing a .xlsx file, not a CSV file.
Try opening the file on Excel and export it as a CSV. You need to make sure that .csv is the last characters of the file.
I think you need to use XLSX function for load and read the XLSX file inside pandas.
For that you need to use this line of code inside your code:
import pandas as pd
df = pd.read_excel('D:\Tableau\codebasics_files\Weather_data.csv.xlsx')

Pandas can't read in excel file

Something is wrong with my pandas module. I tried to read in an excel file using the following code, which works on my classmate's computer, but it's giving me an error on my computer:
FFT1=pd.read_excel('FFT1.xlsx', sheet_name='sheet1')
The file named 'FFT1.xlsx' is in the same directory as my jupyter notebook. The error message says:
XLRDError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_7436/2793485739.py in <module>
----> 1 FFT1=pd.read_excel('FFT1.xlsx', sheet_name='sheet1')
D:\Softwares\Anaconda\lib\site-packages\pandas\io\excel\_base.py in read_excel(io, sheet_name, header, names, index_col, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, verbose, parse_dates, date_parser, thousands, comment, skipfooter, convert_float, mangle_dupe_cols, **kwds)
302
303 if not isinstance(io, ExcelFile):
--> 304 io = ExcelFile(io, engine=engine)
305 elif engine and engine != io.engine:
306 raise ValueError(
D:\Softwares\Anaconda\lib\site-packages\pandas\io\excel\_base.py in __init__(self, io, engine)
819 self._io = stringify_path(io)
820
--> 821 self._reader = self._engines[engine](self._io)
822
823 def __fspath__(self):
D:\Softwares\Anaconda\lib\site-packages\pandas\io\excel\_xlrd.py in __init__(self, filepath_or_buffer)
19 err_msg = "Install xlrd >= 1.0.0 for Excel support"
20 import_optional_dependency("xlrd", extra=err_msg)
---> 21 super().__init__(filepath_or_buffer)
22
23 #property
D:\Softwares\Anaconda\lib\site-packages\pandas\io\excel\_base.py in __init__(self, filepath_or_buffer)
351 self.book = self.load_workbook(filepath_or_buffer)
352 elif isinstance(filepath_or_buffer, str):
--> 353 self.book = self.load_workbook(filepath_or_buffer)
354 elif isinstance(filepath_or_buffer, bytes):
355 self.book = self.load_workbook(BytesIO(filepath_or_buffer))
D:\Softwares\Anaconda\lib\site-packages\pandas\io\excel\_xlrd.py in load_workbook(self, filepath_or_buffer)
34 return open_workbook(file_contents=data)
35 else:
---> 36 return open_workbook(filepath_or_buffer)
37
38 #property
D:\Softwares\Anaconda\lib\site-packages\xlrd\__init__.py in open_workbook(filename, logfile, verbosity, use_mmap, file_contents, encoding_override, formatting_info, on_demand, ragged_rows, ignore_workbook_corruption)
168 # files that xlrd can parse don't start with the expected signature.
169 if file_format and file_format != 'xls':
--> 170 raise XLRDError(FILE_FORMAT_DESCRIPTIONS[file_format]+'; not supported')
171
172 bk = open_workbook_xls(
XLRDError: Excel xlsx file; not supported
How should I fix this?
Make sure that you already install openpyxl, if you don't try
pip install openpyxl
Change your code to
FFT1=pd.read_excel('FFT1.xlsx', sheet_name='sheet1',engine='openpyxl')

How change the value in a koalas dataframe based in a condition

I am using Koalas and I want to change the value of a column based on a condition.
In pandas I can do that using:
import pandas as pd
df_test = pd.DataFrame({
'a': [1,2,3]
,'b': ['one','two','three']})
df_test2 = pd.DataFrame({
'c': [2,1,3]
,'d': ['one','two','three']})
df_test.loc[df_test.a.isin(df_test2['c']),'b'] = 'four'
df_test.head()
a b
0 1 four
1 2 four
2 3 four
I am trying to use the same in Koalas, but I have this error:
---------------------------------------------------------------------------
PandasNotImplementedError Traceback (most recent call last)
<ipython-input-15-814219258adb> in <module>
5 new_loans['write_offs'] = 0
6
----> 7 new_loans.loc[(new_loans['ID'].isin(userinput_write_offs['id'])),'write_offs'] = 1
8 new_loans.loc[new_loans['write_offs']==1,'is_active'] = 0
9 new_loans = new_loans.sort_values(by = ['ZOHOID','Disb Date'])
/usr/local/lib/python3.7/dist-packages/databricks/koalas/base.py in isin(self, values)
894 )
895
--> 896 return self._with_new_scol(self.spark.column.isin(list(values)))
897
898 def isnull(self) -> Union["Series", "Index"]:
/usr/local/lib/python3.7/dist-packages/databricks/koalas/series.py in __iter__(self)
5871
5872 def __iter__(self):
-> 5873 return MissingPandasLikeSeries.__iter__(self)
5874
5875 if sys.version_info >= (3, 7):
/usr/local/lib/python3.7/dist-packages/databricks/koalas/missing/__init__.py in unsupported_function(*args, **kwargs)
21 def unsupported_function(*args, **kwargs):
22 raise PandasNotImplementedError(
---> 23 class_name=class_name, method_name=method_name, reason=reason
24 )
25
PandasNotImplementedError: The method `pd.Series.__iter__()` is not implemented. If you want to collect your data as an NumPy array, use 'to_numpy()' instead.
How could I do the same operation in Koalas?
UPDATE
Following this question: Assign Koalas Column from Numpy Result I have done:
df_test.loc[df_test.a.isin(df_test2['c'].to_list()),'b'] = 'four'
But now I have this error:
---------------------------------------------------------------------------
PythonException Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/IPython/core/formatters.py in __call__(self, obj)
700 type_pprinters=self.type_printers,
701 deferred_pprinters=self.deferred_printers)
--> 702 printer.pretty(obj)
703 printer.flush()
704 return stream.getvalue()
/usr/local/lib/python3.7/dist-packages/IPython/lib/pretty.py in pretty(self, obj)
392 if cls is not object \
393 and callable(cls.__dict__.get('__repr__')):
--> 394 return _repr_pprint(obj, self, cycle)
395
396 return _default_pprint(obj, self, cycle)
/usr/local/lib/python3.7/dist-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
698 """A pprint that just redirects to the normal repr function."""
699 # Find newlines and replace them with p.break_()
--> 700 output = repr(obj)
701 lines = output.splitlines()
702 with p.group():
/usr/local/lib/python3.7/dist-packages/databricks/koalas/frame.py in __repr__(self)
10614 return self._to_internal_pandas().to_string()
10615
> 10616 pdf = self._get_or_create_repr_pandas_cache(max_display_count)
10617 pdf_length = len(pdf)
10618 pdf = pdf.iloc[:max_display_count]
/usr/local/lib/python3.7/dist-packages/databricks/koalas/frame.py in _get_or_create_repr_pandas_cache(self, n)
10606 def _get_or_create_repr_pandas_cache(self, n):
10607 if not hasattr(self, "_repr_pandas_cache") or n not in self._repr_pandas_cache:
> 10608 self._repr_pandas_cache = {n: self.head(n + 1)._to_internal_pandas()}
10609 return self._repr_pandas_cache[n]
10610
/usr/local/lib/python3.7/dist-packages/databricks/koalas/frame.py in _to_internal_pandas(self)
10602 This method is for internal use only.
10603 """
> 10604 return self._internal.to_pandas_frame
10605
10606 def _get_or_create_repr_pandas_cache(self, n):
/usr/local/lib/python3.7/dist-packages/databricks/koalas/utils.py in wrapped_lazy_property(self)
514 def wrapped_lazy_property(self):
515 if not hasattr(self, attr_name):
--> 516 setattr(self, attr_name, fn(self))
517 return getattr(self, attr_name)
518
/usr/local/lib/python3.7/dist-packages/databricks/koalas/internal.py in to_pandas_frame(self)
807 """ Return as pandas DataFrame. """
808 sdf = self.to_internal_spark_frame
--> 809 pdf = sdf.toPandas()
810 if len(pdf) == 0 and len(sdf.schema) > 0:
811 pdf = pdf.astype(
/usr/local/spark/python/pyspark/sql/pandas/conversion.py in toPandas(self)
136
137 # Below is toPandas without Arrow optimization.
--> 138 pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
139 column_counter = Counter(self.columns)
140
/usr/local/spark/python/pyspark/sql/dataframe.py in collect(self)
594 """
595 with SCCallSiteSync(self._sc) as css:
--> 596 sock_info = self._jdf.collectToPython()
597 return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer())))
598
/usr/local/lib/python3.7/dist-packages/py4j/java_gateway.py in __call__(self, *args)
1303 answer = self.gateway_client.send_command(command)
1304 return_value = get_return_value(
-> 1305 answer, self.gateway_client, self.target_id, self.name)
1306
1307 for temp_arg in temp_args:
/usr/local/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
132 # Hide where the exception came from that shows a non-Pythonic
133 # JVM exception message.
--> 134 raise_from(converted)
135 else:
136 raise
/usr/local/spark/python/pyspark/sql/utils.py in raise_from(e)
PythonException:
An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 589, in main
func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 447, in read_udfs
udfs.append(read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=i))
File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 254, in read_single_udf
f, return_type = read_command(pickleSer, infile)
File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 74, in read_command
command = serializer._read_with_length(file)
File "/opt/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 172, in _read_with_length
return self.loads(obj)
File "/opt/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 458, in loads
return pickle.loads(obj, encoding=encoding)
File "/opt/spark/python/lib/pyspark.zip/pyspark/cloudpickle.py", line 1110, in subimport
__import__(name)
ModuleNotFoundError: No module named 'pandas'
Why is trying to use pandas?
Koalas package exposes Pandas Like APIs on high level for the users but under the hood implementation is done using PySpark APIs.
I observed that within the stack track log you have pasted, a pandas dataframe is being created from sdf spark Dataframe using toPandas() method and assigned to pdf.
In the implementation of toPandas() function, pandas and numpy are being imported.
check line numbers 809 & 138.
/usr/local/lib/python3.7/dist-packages/databricks/koalas/internal.py in to_pandas_frame(self)
807 """ Return as pandas DataFrame. """
808 sdf = self.to_internal_spark_frame
--> 809 pdf = sdf.toPandas()
810 if len(pdf) == 0 and len(sdf.schema) > 0:
811 pdf = pdf.astype(
/usr/local/spark/python/pyspark/sql/pandas/conversion.py in toPandas(self)
136
137 # Below is toPandas without Arrow optimization.
--> 138 pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
139 column_counter = Counter(self.columns)
140
/usr/local/spark/python/pyspark/sql/dataframe.py in collect(self)
594 """
595 with SCCallSiteSync(self._sc) as css:
--> 596 sock_info = self._jdf.collectToPython()
597 return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer())))
598
you can check out the implementation of toPandas() function at the following link:
https://github.com/apache/spark/blob/master/python/pyspark/sql/pandas/conversion.py

Error with matplotlib

When I try the magic command %pylab I get the following error:
In [1]: %pylab
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-1-5c1faa999e5b> in <module>()
----> 1 get_ipython().magic(u'pylab')
/Users/rafaelrodrigues/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in magic(self, arg_s)
2203 magic_name, _, magic_arg_s = arg_s.partition(' ')
2204 magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2205 return self.run_line_magic(magic_name, magic_arg_s)
2206
2207 #-------------------------------------------------------------------------
/Users/rafaelrodrigues/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in run_line_magic(self, magic_name, line)
2124 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
2125 with self.builtin_trap:
-> 2126 result = fn(*args,**kwargs)
2127 return result
2128
/Users/rafaelrodrigues/anaconda/lib/python2.7/site-packages/IPython/core/magics/pylab.pyc in pylab(self, line)
/Users/rafaelrodrigues/anaconda/lib/python2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
191 # but it's overkill for just that one bit of state.
192 def magic_deco(arg):
--> 193 call = lambda f, *a, **k: f(*a, **k)
194
195 if callable(arg):
/Users/rafaelrodrigues/anaconda/lib/python2.7/site-packages/IPython/core/magics/pylab.pyc in pylab(self, line)
134 import_all = not args.no_import_all
135
--> 136 gui, backend, clobbered = self.shell.enable_pylab(args.gui, import_all=import_all)
137 self._show_matplotlib_backend(args.gui, backend)
138 print ("Populating the interactive namespace from numpy and matplotlib")
/Users/rafaelrodrigues/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in enable_pylab(self, gui, import_all, welcome_message)
2980 from IPython.core.pylabtools import import_pylab
2981
-> 2982 gui, backend = self.enable_matplotlib(gui)
2983
2984 # We want to prevent the loading of pylab to pollute the user's
/Users/rafaelrodrigues/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in enable_matplotlib(self, gui)
2941 gui, backend = pt.find_gui_and_backend(self.pylab_gui_select)
2942
-> 2943 pt.activate_matplotlib(backend)
2944 pt.configure_inline_support(self, backend)
2945
/Users/rafaelrodrigues/anaconda/lib/python2.7/site-packages/IPython/core/pylabtools.pyc in activate_matplotlib(backend)
287 matplotlib.rcParams['backend'] = backend
288
--> 289 import matplotlib.pyplot
290 matplotlib.pyplot.switch_backend(backend)
291
/Users/rafaelrodrigues/anaconda/lib/python2.7/site-packages/matplotlib/pyplot.p in <module>()
25
26 import matplotlib
---> 27 import matplotlib.colorbar
28 from matplotlib import style
29 from matplotlib import _pylab_helpers, interactive
/Users/rafaelrodrigues/anaconda/lib/python2.7/site-packages/matplotlib/colorbar.py in <module>()
32 import matplotlib.artist as martist
33 import matplotlib.cbook as cbook
---> 34 import matplotlib.collections as collections
35 import matplotlib.colors as colors
36 import matplotlib.contour as contour
/Users/rafaelrodrigues/anaconda/lib/python2.7/site-packages/matplotlib/collections.py in <module>()
25 import matplotlib.artist as artist
26 from matplotlib.artist import allow_rasterization
---> 27 import matplotlib.backend_bases as backend_bases
28 import matplotlib.path as mpath
29 from matplotlib import _path
/Users/rafaelrodrigues/anaconda/lib/python2.7/site-packages/matplotlib/backend_bases.py in <module>()
54
55 import matplotlib.tight_bbox as tight_bbox
---> 56 import matplotlib.textpath as textpath
57 from matplotlib.path import Path
58 from matplotlib.cbook import mplDeprecation
/Users/rafaelrodrigues/anaconda/lib/python2.7/site-packages/matplotlib/textpath.py in <module>()
17 from matplotlib.path import Path
18 from matplotlib import rcParams
---> 19 import matplotlib.font_manager as font_manager
20 from matplotlib.ft2font import FT2Font, KERNING_DEFAULT, LOAD_NO_HINTING
21 from matplotlib.ft2font import LOAD_TARGET_LIGHT
/Users/rafaelrodrigues/anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py in <module>()
1410 verbose.report("Using fontManager instance from %s" % _fmcache)
1411 except:
-> 1412 _rebuild()
1413 else:
1414 _rebuild()
/Users/rafaelrodrigues/anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py in _rebuild()
1395 def _rebuild():
1396 global fontManager
-> 1397 fontManager = FontManager()
1398 if _fmcache:
1399 pickle_dump(fontManager, _fmcache)
/Users/rafaelrodrigues/anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py in __init__(self, size, weight)
1035 # Load TrueType fonts and create font dictionary.
1036
-> 1037 self.ttffiles = findSystemFonts(paths) + findSystemFonts()
1038 self.defaultFamily = {
1039 'ttf': 'Bitstream Vera Sans',
/Users/rafaelrodrigues/anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py in findSystemFonts(fontpaths, fontext)
320 fontfiles[f] = 1
321
--> 322 for f in get_fontconfig_fonts(fontext):
323 fontfiles[f] = 1
324
/Users/rafaelrodrigues/anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py in get_fontconfig_fonts(fontext)
272 pipe = subprocess.Popen(['fc-list', '--format=%{file}\\n'],
273 stdout=subprocess.PIPE,
--> 274 stderr=subprocess.PIPE)
275 output = pipe.communicate()[0]
276 except (OSError, IOError):
/Users/rafaelrodrigues/anaconda/lib/python2.7/subprocess.pyc in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags)
708 p2cread, p2cwrite,
709 c2pread, c2pwrite,
--> 710 errread, errwrite)
711 except Exception:
712 # Preserve original exception in case os.close raises.
/Users/rafaelrodrigues/anaconda/lib/python2.7/subprocess.pyc in _execute_child(self, args, executable, preexec_fn, close_fds, cwd, env, universal_newlines, startupinfo, creationflags, shell, to_close, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite)
1325 raise
1326 child_exception = pickle.loads(data)
-> 1327 raise child_exception
1328
1329
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 0: ordinal not in range(128)
I have uninstalled and installed matplotlib again many times. I am using the latest versions of IPython and Matplotlib in my Mac OS Mavericks.
What else should I try?
Tks
activate your conda environment if you're not using the default environment, then do conda install matplotlib
#Clarinetist:
try with fresh official Python installation.
setup ipython notebook server according to this guide:
http://ipython.org/ipython-doc/3/notebook/notebook.html
pylab is discouraged:
http://matplotlib.org/faq/usage_faq.html#matplotlib-pyplot-and-pylab-how-are-they-related