Cannot download datasets files from Kaggle to Google Colab - google-colaboratory

def load_kaggle_csv(dataset_name, file_name):
dataset_url = f"https://www.kaggle.com/{dataset_name}/download"
!kaggle datasets download -d $dataset_url -f $file_name
return pd.read_csv(file_name)
dataset_name = "datasets/diegosilvadefrana/2023-data-scientists-jobs-descriptions"
file_name = "Jobs.csv"
Error:
Invalid dataset specification https://www.kaggle.com/datasets/diegosilvadefrana/2023-data-scientists-jobs-descriptions/download
FileNotFoundError Traceback (most recent call last)
in
9 file_name = "Jobs.csv"
10
---> 11 df = load_kaggle_csv(dataset_name, file_name)
8 frames
/usr/local/lib/python3.8/dist-packages/pandas/io/common.py in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
700 if ioargs.encoding and "b" not in ioargs.mode:
701 # Encoding
--> 702 handle = open(
703 handle,
704 ioargs.mode,
FileNotFoundError: [Errno 2] No such file or directory: 'Jobs.csv'

Here's a workaround:
! kaggle datasets download diegosilvadefrana/2023-data-scientists-jobs-descriptions

Related

error in building model "ner_ontonotes_bert_mult" in DeepPavlov

I'm trying to build model "ner_ontonotes_bert_mult" via GoogleColab using example in documentation:
from deeppavlov import build_model, configs
ner_model = build_model(configs.ner.ner_ontonotes_bert_mult, download=True)
but get error:
TypeError: init() got an unexpected keyword argument 'num_tags'
p.s. if I try to load another model (e.g. "ner_rus_bert"), the error does not appear
Full error (* maybe the error is related to directoty /packages/deeppavlov/models/torch_bert/crf.py* ):
2022-12-17 13:08:23.235 INFO in 'deeppavlov.download'['download'] at line 138: Skipped http://files.deeppavlov.ai/v1/ner/ner_ontonotes_bert_mult_torch_crf.tar.gz download because of matching hashes
INFO:deeppavlov.download:Skipped http://files.deeppavlov.ai/v1/ner/ner_ontonotes_bert_mult_torch_crf.tar.gz download because of matching hashes
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2022-12-17 13:08:30.1 ERROR in 'deeppavlov.core.common.params'['params'] at line 108: Exception in <class 'deeppavlov.models.torch_bert.torch_transformers_sequence_tagger.TorchTransformersSequenceTagger'>
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/deeppavlov/core/common/params.py", line 102, in from_params
component = obj(**dict(config_params, **kwargs))
File "/usr/local/lib/python3.8/dist-packages/deeppavlov/models/torch_bert/torch_transformers_sequence_tagger.py", line 182, in __init__
super().__init__(optimizer=optimizer,
File "/usr/local/lib/python3.8/dist-packages/deeppavlov/core/models/torch_model.py", line 98, in __init__
self.load()
File "/usr/local/lib/python3.8/dist-packages/deeppavlov/models/torch_bert/torch_transformers_sequence_tagger.py", line 295, in load
self.crf = CRF(self.n_classes).to(self.device)
File "/usr/local/lib/python3.8/dist-packages/deeppavlov/models/torch_bert/crf.py", line 13, in __init__
super().__init__(num_tags=num_tags, batch_first=batch_first)
TypeError: __init__() got an unexpected keyword argument 'num_tags'
ERROR:deeppavlov.core.common.params:Exception in <class 'deeppavlov.models.torch_bert.torch_transformers_sequence_tagger.TorchTransformersSequenceTagger'>
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/deeppavlov/core/common/params.py", line 102, in from_params
component = obj(**dict(config_params, **kwargs))
File "/usr/local/lib/python3.8/dist-packages/deeppavlov/models/torch_bert/torch_transformers_sequence_tagger.py", line 182, in __init__
super().__init__(optimizer=optimizer,
File "/usr/local/lib/python3.8/dist-packages/deeppavlov/core/models/torch_model.py", line 98, in __init__
self.load()
File "/usr/local/lib/python3.8/dist-packages/deeppavlov/models/torch_bert/torch_transformers_sequence_tagger.py", line 295, in load
self.crf = CRF(self.n_classes).to(self.device)
File "/usr/local/lib/python3.8/dist-packages/deeppavlov/models/torch_bert/crf.py", line 13, in __init__
super().__init__(num_tags=num_tags, batch_first=batch_first)
TypeError: __init__() got an unexpected keyword argument 'num_tags'
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-75-5e156706e7e4> in <module>
1 # from deeppavlov import configs, build_model
2
----> 3 ner_model = build_model(configs.ner.ner_ontonotes_bert_mult, download=True)
5 frames
/usr/local/lib/python3.8/dist-packages/deeppavlov/core/commands/infer.py in build_model(config, mode, load_trained, install, download)
53 .format(component_config.get('class_name', component_config.get('ref', 'UNKNOWN'))))
54
---> 55 component = from_params(component_config, mode=mode)
56
57 if 'id' in component_config:
/usr/local/lib/python3.8/dist-packages/deeppavlov/core/common/params.py in from_params(params, mode, **kwargs)
100 kwargs['mode'] = mode
101
--> 102 component = obj(**dict(config_params, **kwargs))
103 try:
104 _refs[config_params['id']] = component
/usr/local/lib/python3.8/dist-packages/deeppavlov/models/torch_bert/torch_transformers_sequence_tagger.py in __init__(self, n_tags, pretrained_bert, bert_config_file, attention_probs_keep_prob, hidden_keep_prob, optimizer, optimizer_parameters, learning_rate_drop_patience, learning_rate_drop_div, load_before_drop, clip_norm, min_learning_rate, use_crf, **kwargs)
180 self.use_crf = use_crf
181
--> 182 super().__init__(optimizer=optimizer,
183 optimizer_parameters=optimizer_parameters,
184 learning_rate_drop_patience=learning_rate_drop_patience,
/usr/local/lib/python3.8/dist-packages/deeppavlov/core/models/torch_model.py in __init__(self, device, optimizer, optimizer_parameters, lr_scheduler, lr_scheduler_parameters, learning_rate_drop_patience, learning_rate_drop_div, load_before_drop, min_learning_rate, *args, **kwargs)
96 self.opt = deepcopy(kwargs)
97
---> 98 self.load()
99 # we need to switch to eval mode here because by default it's in `train` mode.
100 # But in case of `interact/build_model` usage, we need to have model in eval mode.
/usr/local/lib/python3.8/dist-packages/deeppavlov/models/torch_bert/torch_transformers_sequence_tagger.py in load(self, fname)
293 self.model.to(self.device)
294 if self.use_crf:
--> 295 self.crf = CRF(self.n_classes).to(self.device)
296
297 self.optimizer = getattr(torch.optim, self.optimizer_name)(
/usr/local/lib/python3.8/dist-packages/deeppavlov/models/torch_bert/crf.py in __init__(self, num_tags, batch_first)
11
12 def __init__(self, num_tags: int, batch_first: bool = False) -> None:
---> 13 super().__init__(num_tags=num_tags, batch_first=batch_first)
14 nn.init.zeros_(self.transitions)
15 nn.init.zeros_(self.start_transitions)
TypeError: __init__() got an unexpected keyword argument 'num_tags'
Make sure that you are using the latest version of DeepPavlov by running:
!pip install deeppavlov
Then import all the required packages:
from deeppavlov import configs, build_model
Install the model's requirements and download the pretrained model:
ner_model = build_model(configs.ner.ner_ontonotes_bert_mult, download=True, install=True)
You can get more information about this model and many others in our recent Medium article.

OSError: [Errno -128] NetCDF: Attempt to use feature that was not turned on when netCDF was built.: b'/content/test.hdf'

I downloaded MODIS data from its http server and was trying to load it in xarray on google colab. I added the netrc file and the file is not corrupted since gdalinfo on that gave no error.
URL = 'https://e4ftl01.cr.usgs.gov/MOLT/MOD09GA.061/2019.02.24/MOD09GA.A2019055.h09v07.061.2020288120208.hdf'
result = requests.get(URL)
filename = 'test.hdf'
with open(filename, 'wb') as f:
f.write(result.content)
when I run
xr.open_dataset('test.hdf',engine = 'netcdf4' )
this is the error
KeyError: [<class 'netCDF4._netCDF4.Dataset'>, ('/content/test1.hdf',), 'r', (('clobber', True), ('diskless', False), ('format', 'NETCDF4'), ('persist', False))]
During handling of the above exception, another exception occurred:
OSError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/xarray/backends/file_manager.py in _acquire_with_cache_info(self, needs_lock)
203 kwargs = kwargs.copy()
204 kwargs["mode"] = self._mode
--> 205 file = self._opener(*self._args, **kwargs)
206 if self._mode == "w":
207 # ensure file doesn't get overriden when opened again
src/netCDF4/_netCDF4.pyx in netCDF4._netCDF4.Dataset.__init__()
src/netCDF4/_netCDF4.pyx in netCDF4._netCDF4._ensure_nc_success()
OSError: [Errno -128] NetCDF: Attempt to use feature that was not turned on when netCDF was built.: b'/content/test.hdf'
What is the error about and how to resolve it?
netcdf4 version 1.5.8
xarray version 0.18.1

Pandas can't read in excel file

Something is wrong with my pandas module. I tried to read in an excel file using the following code, which works on my classmate's computer, but it's giving me an error on my computer:
FFT1=pd.read_excel('FFT1.xlsx', sheet_name='sheet1')
The file named 'FFT1.xlsx' is in the same directory as my jupyter notebook. The error message says:
XLRDError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_7436/2793485739.py in <module>
----> 1 FFT1=pd.read_excel('FFT1.xlsx', sheet_name='sheet1')
D:\Softwares\Anaconda\lib\site-packages\pandas\io\excel\_base.py in read_excel(io, sheet_name, header, names, index_col, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, verbose, parse_dates, date_parser, thousands, comment, skipfooter, convert_float, mangle_dupe_cols, **kwds)
302
303 if not isinstance(io, ExcelFile):
--> 304 io = ExcelFile(io, engine=engine)
305 elif engine and engine != io.engine:
306 raise ValueError(
D:\Softwares\Anaconda\lib\site-packages\pandas\io\excel\_base.py in __init__(self, io, engine)
819 self._io = stringify_path(io)
820
--> 821 self._reader = self._engines[engine](self._io)
822
823 def __fspath__(self):
D:\Softwares\Anaconda\lib\site-packages\pandas\io\excel\_xlrd.py in __init__(self, filepath_or_buffer)
19 err_msg = "Install xlrd >= 1.0.0 for Excel support"
20 import_optional_dependency("xlrd", extra=err_msg)
---> 21 super().__init__(filepath_or_buffer)
22
23 #property
D:\Softwares\Anaconda\lib\site-packages\pandas\io\excel\_base.py in __init__(self, filepath_or_buffer)
351 self.book = self.load_workbook(filepath_or_buffer)
352 elif isinstance(filepath_or_buffer, str):
--> 353 self.book = self.load_workbook(filepath_or_buffer)
354 elif isinstance(filepath_or_buffer, bytes):
355 self.book = self.load_workbook(BytesIO(filepath_or_buffer))
D:\Softwares\Anaconda\lib\site-packages\pandas\io\excel\_xlrd.py in load_workbook(self, filepath_or_buffer)
34 return open_workbook(file_contents=data)
35 else:
---> 36 return open_workbook(filepath_or_buffer)
37
38 #property
D:\Softwares\Anaconda\lib\site-packages\xlrd\__init__.py in open_workbook(filename, logfile, verbosity, use_mmap, file_contents, encoding_override, formatting_info, on_demand, ragged_rows, ignore_workbook_corruption)
168 # files that xlrd can parse don't start with the expected signature.
169 if file_format and file_format != 'xls':
--> 170 raise XLRDError(FILE_FORMAT_DESCRIPTIONS[file_format]+'; not supported')
171
172 bk = open_workbook_xls(
XLRDError: Excel xlsx file; not supported
How should I fix this?
Make sure that you already install openpyxl, if you don't try
pip install openpyxl
Change your code to
FFT1=pd.read_excel('FFT1.xlsx', sheet_name='sheet1',engine='openpyxl')

Cannot load lvis via tfds

I am trying to load built-in dataset lvis. It turns out that the tfds and lvis should be imported and installed respectively, however, I did possible all, it still does not work.
import os
import tensorflow as tf
from matplotlib import pyplot as plt
%matplotlib inline
!pip install lvis
!pip install tfds-nightly
import tensorflow_datasets as tfds
train_data, info = tfds.load('lvis', split='train', as_supervised=True, with_info=True)
validation_data = tfds.load('lvis', split='validation', as_supervised=True)
test_data = tfds.load('lvis', split='test', as_supervised=True)
There are some odd outputs after running upon codes in colab.
otFoundError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/tensorflow_datasets/core/utils/py_utils.py in try_reraise(*args, **kwargs)
391 try:
--> 392 yield
393 except Exception as e: # pylint: disable=broad-except
15 frames
/usr/local/lib/python3.7/dist-packages/tensorflow_datasets/core/load.py in builder(name, try_gcs, **builder_kwargs)
167 with py_utils.try_reraise(prefix=f'Failed to construct dataset {name}: '):
--> 168 return cls(**builder_kwargs) # pytype: disable=not-instantiable
169
/usr/local/lib/python3.7/dist-packages/tensorflow_datasets/core/dataset_builder.py in __init__(self, file_format, **kwargs)
917 """
--> 918 super().__init__(**kwargs)
919 self.info.set_file_format(file_format)
/usr/local/lib/python3.7/dist-packages/tensorflow_datasets/core/dataset_builder.py in __init__(self, data_dir, config, version)
184 else: # Use the code version (do not restore data)
--> 185 self.info.initialize_from_bucket()
186
/usr/local/lib/python3.7/dist-packages/tensorflow_datasets/core/utils/py_utils.py in __get__(self, obj, objtype)
145 if cached is None:
--> 146 cached = self.fget(obj) # pytype: disable=attribute-error
147 setattr(obj, attr, cached)
/usr/local/lib/python3.7/dist-packages/tensorflow_datasets/core/dataset_builder.py in info(self)
328 "the restored dataset.")
--> 329 info = self._info()
330 if not isinstance(info, dataset_info.DatasetInfo):
/usr/local/lib/python3.7/dist-packages/tensorflow_datasets/object_detection/lvis/lvis.py in _info(self)
94 names_file=tfds.core.tfds_path(
---> 95 'object_detection/lvis/lvis_classes.txt'))
96 return tfds.core.DatasetInfo(
/usr/local/lib/python3.7/dist-packages/tensorflow_datasets/core/features/class_label_feature.py in __init__(self, num_classes, names, names_file)
67 else:
---> 68 self.names = _load_names_from_file(names_file)
69
/usr/local/lib/python3.7/dist-packages/tensorflow_datasets/core/features/class_label_feature.py in _load_names_from_file(names_filepath)
198 name.strip()
--> 199 for name in tf.compat.as_text(f.read()).split("\n")
200 if name.strip() # Filter empty names
/usr/local/lib/python3.7/dist-packages/tensorflow/python/lib/io/file_io.py in read(self, n)
116 """
--> 117 self._preread_check()
118 if n == -1:
/usr/local/lib/python3.7/dist-packages/tensorflow/python/lib/io/file_io.py in _preread_check(self)
79 self._read_buf = _pywrap_file_io.BufferedInputStream(
---> 80 compat.path_to_str(self.__name), 1024 * 512)
81
NotFoundError: /usr/local/lib/python3.7/dist-packages/tensorflow_datasets/object_detection/lvis/lvis_classes.txt; No such file or directory
The above exception was the direct cause of the following exception:
RuntimeError Traceback (most recent call last)
<ipython-input-4-b8c819fe5c62> in <module>()
----> 1 train_data, info = tfds.load('lvis', split='train', as_supervised=True, with_info=True)
2 validation_data = tfds.load('lvis', split='validation', as_supervised=True)
3 test_data = tfds.load('lvis', split='test', as_supervised=True)
/usr/local/lib/python3.7/dist-packages/tensorflow_datasets/core/load.py in load(name, split, data_dir, batch_size, shuffle_files, download, as_supervised, decoders, read_config, with_info, builder_kwargs, download_and_prepare_kwargs, as_dataset_kwargs, try_gcs)
315 builder_kwargs = {}
316
--> 317 dbuilder = builder(name, data_dir=data_dir, try_gcs=try_gcs, **builder_kwargs)
318 if download:
319 download_and_prepare_kwargs = download_and_prepare_kwargs or {}
/usr/local/lib/python3.7/dist-packages/tensorflow_datasets/core/load.py in builder(name, try_gcs, **builder_kwargs)
166 if cls:
167 with py_utils.try_reraise(prefix=f'Failed to construct dataset {name}: '):
--> 168 return cls(**builder_kwargs) # pytype: disable=not-instantiable
169
170 # If neither the code nor the files are found, raise DatasetNotFoundError
/usr/lib/python3.7/contextlib.py in __exit__(self, type, value, traceback)
128 value = type()
129 try:
--> 130 self.gen.throw(type, value, traceback)
131 except StopIteration as exc:
132 # Suppress StopIteration *unless* it's the same exception that
/usr/local/lib/python3.7/dist-packages/tensorflow_datasets/core/utils/py_utils.py in try_reraise(*args, **kwargs)
392 yield
393 except Exception as e: # pylint: disable=broad-except
--> 394 reraise(e, *args, **kwargs)
395
396
/usr/local/lib/python3.7/dist-packages/tensorflow_datasets/core/utils/py_utils.py in reraise(e, prefix, suffix)
359 else:
360 exception = RuntimeError(f'{type(e).__name__}: {msg}')
--> 361 raise exception from e
362 # Otherwise, modify the exception in-place
363 elif len(e.args) <= 1:
RuntimeError: NotFoundError: Failed to construct dataset lvis: /usr/local/lib/python3.7/dist-packages/tensorflow_datasets/object_detection/lvis/lvis_classes.txt; No such file or directory
This is what I did to get it to work on Colab Notebook:
!pip install -q tfds-nightly tensorflow tensorflow-datasets matplotlib lvis pycocotools apache_beam
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
Since the tfds object detection lvis folder isn't up to date, I deleted that folder then redownloaded it from the tfds github page.
First install github-clone so we can download specific repo subfolders
!pip install github-clone
Then remove the lvis folder and redownload it from github:
!rm -rf ../usr/local/lib/python3.7/dist-packages/tensorflow_datasets/object_detection/lvis
!ghclone https://github.com/tensorflow/datasets/tree/master/tensorflow_datasets/object_detection/lvis
!mv ./lvis ../usr/local/lib/python3.7/dist-packages/tensorflow_datasets/object_detection/
After that I could get it to work, this next chunk of code worked for me:
ds, info = tfds.load('lvis', split='train[:25%]', with_info=True,
data_dir= '../content/tensorflow_datasets/',
decoders=tfds.decode.PartialDecoding({
'image': True,
'features': tfds.features.FeaturesDict({'image/id':True,
'objects':tfds.features.Sequence({
'id': True,
'bbox': True,
'label': tfds.features.ClassLabel(names=['skateboard','shoe'])
})
})
})
)

ParserError: Error tokenizing data. C error: Expected x fields in line y, saw z

I am new to python. Ran across below error when trying to read in dozens of zip files into a signle df. Each zip file contains a csv file. I searched extensively on stack overflow and haven't find a solution yet. my suspicion is that this csv contains Chinese characters as well as urls.
my code:
import os
import zipfile
import pandas as pd
import glob
file_path = os.getcwd() #obtain file path
allFiles = glob.glob(file_path + "/*.zip") #return list of all file names
list_ = []
for file in allFiles:
with zipfile.ZipFile(file) as f:
df = pd.read_csv(f.open(f.namelist()[0]),encoding='latin-1',header=None)
list_.append(df)
df = pd.concat(list_)
df
and here is the error message:
ParserError Traceback (most recent call last)
<ipython-input-25-471e9d0ab709> in <module>()
6 for file in allFiles:
7 with zipfile.ZipFile(file) as f:
----> 8 df = pd.read_csv(f.open(f.namelist()[0]),encoding='latin-1',skiprows=[0, 1],header=None)
9 list_.append(df)
10 df = pd.concat(list_)
D:\anaconda\lib\site-packages\pandas\io\parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
653 skip_blank_lines=skip_blank_lines)
654
--> 655 return _read(filepath_or_buffer, kwds)
656
657 parser_f.__name__ = name
D:\anaconda\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
409
410 try:
--> 411 data = parser.read(nrows)
412 finally:
413 parser.close()
D:\anaconda\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
1003 raise ValueError('skipfooter not supported for iteration')
1004
-> 1005 ret = self._engine.read(nrows)
1006
1007 if self.options.get('as_recarray'):
D:\anaconda\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
1746 def read(self, nrows=None):
1747 try:
-> 1748 data = self._reader.read(nrows)
1749 except StopIteration:
1750 if self._first_chunk:
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.read (pandas\_libs\parsers.c:10862)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._read_low_memory (pandas\_libs\parsers.c:11138)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._read_rows (pandas\_libs\parsers.c:11884)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._tokenize_rows (pandas\_libs\parsers.c:11755)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.raise_parser_error (pandas\_libs\parsers.c:28765)()
ParserError: Error tokenizing data. C error: Expected 1 fields in line 9, saw 2
I tried using "error_bad_line=False", yet it returns another error.
Eventually, I managed to by pass this problem by renaming both the zip file titles and the CSV file titles, keeping only numbers for my own reference. And changed the read_csv encoding to "latin-1".