KeyBERT package is not working on Google Colab

I'm using KeyBERT on Google Colab to extract keywords from the text.
from keybert import KeyBERT
model = KeyBERT('distilbert-base-nli-mean-tokens')
text_keywords = model.extract_keywords(my_long_text)
But I get the following error:
OSError: Model name 'distilbert-base-nli-mean-token' was not found in model name list (distilbert-base-uncased, distilbert-base-uncased-distilled-squad). We assumed 'distilbert-base-nli-mean-token' was a path or url to a configuration file named config.json or a directory containing such a file but couldn't find any such file at this path or url.
Any idea how to fix this?
Thanks
Exception when trying to download http://sbert.net/models/distilbert-base-nli-mean-token.zip. Response 404
SentenceTransformer-Model http://sbert.net/models/distilbert-base-nli-mean-token.zip not found. Try to create it from scratch
Try to create Transformer Model distilbert-base-nli-mean-token with mean pooling
---------------------------------------------------------------------------
HTTPError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/sentence_transformers/SentenceTransformer.py in __init__(self, model_name_or_path, modules, device)
78 zip_save_path = os.path.join(model_path_tmp, 'model.zip')
---> 79 http_get(model_url, zip_save_path)
80 with ZipFile(zip_save_path, 'r') as zip:
11 frames
/usr/local/lib/python3.7/dist-packages/sentence_transformers/util.py in http_get(url, path)
241 print("Exception when trying to download {}. Response {}".format(url, req.status_code), file=sys.stderr)
--> 242 req.raise_for_status()
243 return
/usr/local/lib/python3.7/dist-packages/requests/models.py in raise_for_status(self)
940 if http_error_msg:
--> 941 raise HTTPError(http_error_msg, response=self)
942
HTTPError: 404 Client Error: Not Found for url: https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/distilbert-base-nli-mean-token.zip
During handling of the above exception, another exception occurred:
OSError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/transformers/configuration_utils.py in from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
133 that will be used by default in the :obj:`generate` method of the model. In order to get the tokens of the
--> 134 words that should not appear in the generated text, use :obj:`tokenizer.encode(bad_word,
135 add_prefix_space=True)`.
/usr/local/lib/python3.7/dist-packages/transformers/file_utils.py in cached_path(url_or_filename, cache_dir, force_download, proxies)
181 except importlib_metadata.PackageNotFoundError:
--> 182 _timm_available = False
183
OSError: file distilbert-base-nli-mean-token not found
During handling of the above exception, another exception occurred:
OSError Traceback (most recent call last)
<ipython-input-59-d0fa7b6b7cd1> in <module>()
1 doc = full_text
----> 2 model = KeyBERT('distilbert-base-nli-mean-token')
/usr/local/lib/python3.7/dist-packages/keybert/model.py in __init__(self, model)
46 * https://www.sbert.net/docs/pretrained_models.html
47 """
---> 48 self.model = select_backend(model)
49
50 def extract_keywords(self,
/usr/local/lib/python3.7/dist-packages/keybert/backend/_utils.py in select_backend(embedding_model)
40 # Create a Sentence Transformer model based on a string
41 if isinstance(embedding_model, str):
---> 42 return SentenceTransformerBackend(embedding_model)
43
44 return SentenceTransformerBackend("xlm-r-bert-base-nli-stsb-mean-tokens")
/usr/local/lib/python3.7/dist-packages/keybert/backend/_sentencetransformers.py in __init__(self, embedding_model)
33 self.embedding_model = embedding_model
34 elif isinstance(embedding_model, str):
---> 35 self.embedding_model = SentenceTransformer(embedding_model)
36 else:
37 raise ValueError("Please select a correct SentenceTransformers model: \n"
/usr/local/lib/python3.7/dist-packages/sentence_transformers/SentenceTransformer.py in __init__(self, model_name_or_path, modules, device)
93 save_model_to = model_path
94 model_path = None
---> 95 transformer_model = Transformer(model_name_or_path)
96 pooling_model = Pooling(transformer_model.get_word_embedding_dimension())
97 modules = [transformer_model, pooling_model]
/usr/local/lib/python3.7/dist-packages/sentence_transformers/models/Transformer.py in __init__(self, model_name_or_path, max_seq_length, model_args, cache_dir, tokenizer_args, do_lower_case)
25 self.do_lower_case = do_lower_case
26
---> 27 config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir)
28 self.auto_model = AutoModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir)
29 self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir, **tokenizer_args)
/usr/local/lib/python3.7/dist-packages/transformers/configuration_auto.py in from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
/usr/local/lib/python3.7/dist-packages/transformers/configuration_utils.py in from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
144 after the :obj:`decoder_start_token_id`. Useful for multilingual models like :doc:`mBART
145 <../model_doc/mbart>` where the first generated token needs to be the target language token.
--> 146 - **forced_eos_token_id** (:obj:`int`, `optional`) -- The id of the token to force as the last generated token
147 when :obj:`max_length` is reached.
148 - **remove_invalid_values** (:obj:`bool`, `optional`) -- Whether to remove possible `nan` and `inf` outputs of
OSError: Model name 'distilbert-base-nli-mean-token' was not found in model name list (distilbert-base-uncased, distilbert-base-uncased-distilled-squad). We assumed 'distilbert-base-nli-mean-token' was a path or url to a configuration file named config.json or a directory containing such a file but couldn't find any such file at this path or url.

I couldn't reproduce this issue with the code you provided, but from the error message I believe you're just missing an 's' in the model name. Make sure the model name is:
distilbert-base-nli-mean-tokens
and not
distilbert-base-nli-mean-token
Also refer to https://www.sbert.net/docs/pretrained_models.html for all models available for use.
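For reference, a minimal corrected snippet (assuming my_long_text is already defined, as in the question):
from keybert import KeyBERT

# Note the trailing 's' in the model name.
model = KeyBERT('distilbert-base-nli-mean-tokens')
text_keywords = model.extract_keywords(my_long_text)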

Related

AttributeError: 'ArrayView' object has no attribute 'A1'

I have to import a processed h5ad file, but it seems that X has been passed as a numpy array instead of a numpy matrix. See below:
# Read the data
data_path = "/home/bbb5130/snOMICS/maria/msrna.h5ad"
adata = sn.pp.read_h5ad(data_path, pr_process="Yes")
adata
But the output was:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In [15], line 3
1 # Read the data
2 data_path = "/home/bbb5130/snOMICS/maria/msrna.h5ad"
----> 3 adata = sn.pp.read_h5ad(data_path, pr_process="Yes")
4 adata
File ~/miniconda3/envs/snOMICS/lib/python3.9/site-packages/scanet/preprocessing.py:54, in Preprocessing.read_h5ad(cls, filename, pr_process)
51 return sc.read_h5ad(filename)
52 else:
53 # initial preprocessing as it is required later
---> 54 return cls._intial(adata)
File ~/miniconda3/envs/snOMICS/lib/python3.9/site-packages/scanet/preprocessing.py:35, in Preprocessing._intial(adata)
33 adata.var['mt'] = adata.var_names.str.startswith('MT-')
34 mito_genes = adata.var_names.str.startswith('MT-')
---> 35 adata.obs['percent_mito'] = np.sum(adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
36 sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, inplace=True)
37 sc.pp.filter_cells(adata, min_genes=0)
AttributeError: 'ArrayView' object has no attribute 'A1'
Is there any way I can change the format so the file can be read?
Thanks in advance.
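One possible workaround, sketched under the assumption that the sparse sum path is what provides .A1: re-save the .h5ad with X stored as a scipy sparse matrix and point scanet at the new file (the output filename below is just an example).
import scanpy as sc
from scipy import sparse

# Convert X from a dense array to a CSR sparse matrix and write a new .h5ad,
# so downstream np.sum(...) calls return a matrix that has the .A1 attribute.
adata = sc.read_h5ad("/home/bbb5130/snOMICS/maria/msrna.h5ad")
adata.X = sparse.csr_matrix(adata.X)
adata.write_h5ad("/home/bbb5130/snOMICS/maria/msrna_sparse.h5ad")  # example output name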

Problem with 'Unknown image file format' error for GCS image in Tensorflow style transfer demo

I want to use my own images with this TensorFlow Style Transfer demo, which I've copied to my own Colab notebook:
https://www.tensorflow.org/hub/tutorials/tf2_arbitrary_image_stylization
I have images stored in a GCS bucket but have been getting image format errors. To test this, I took one of the images from the TensorFlow demo, downloaded it, put it in my GCS bucket, and added its link to the "Let's try it on more images" section of my demo code. I am getting the same file format error message I was previously getting with my own images.
Here's where I've inserted the GCS version of the image:
content_urls = dict(
    tueblingen02='https://storage.cloud.google.com/01_bucket-02/Tuebingen_Neckarfront-vox.jpeg',
    sea_turtle='https://upload.wikimedia.org/wikipedia/commons/d/d7/Green_Sea_Turtle_grazing_seagrass.jpg',
    tuebingen='https://upload.wikimedia.org/wikipedia/commons/0/00/Tuebingen_Neckarfront.jpg',
    grace_hopper='https://storage.googleapis.com/download.tensorflow.org/example_images/grace_hopper.jpg',
)
style_urls = dict(
    kanagawa_great_wave='https://upload.wikimedia.org/wikipedia/commons/0/0a/The_Great_Wave_off_Kanagawa.jpg',
    kandinsky_composition_7='https://upload.wikimedia.org/wikipedia/commons/b/b4/Vassily_Kandinsky%2C_1913_-_Composition_7.jpg',
    etc ...
The resulting error message:
InvalidArgumentError: Unknown image file format. One of JPEG, PNG, GIF, BMP required. [Op:DecodeImage]
Full message:
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
<ipython-input-16-3ded16359898> in <module>()
26 content_image_size = 384
27 style_image_size = 256
---> 28 content_images = {k: load_image(v, (content_image_size, content_image_size)) for k, v in content_urls.items()}
29 style_images = {k: load_image(v, (style_image_size, style_image_size)) for k, v in style_urls.items()}
30 style_images = {k: tf.nn.avg_pool(style_image, ksize=[3,3], strides=[1,1], padding='SAME') for k, style_image in style_images.items()}
3 frames
<ipython-input-16-3ded16359898> in <dictcomp>(.0)
26 content_image_size = 384
27 style_image_size = 256
---> 28 content_images = {k: load_image(v, (content_image_size, content_image_size)) for k, v in content_urls.items()}
29 style_images = {k: load_image(v, (style_image_size, style_image_size)) for k, v in style_urls.items()}
30 style_images = {k: tf.nn.avg_pool(style_image, ksize=[3,3], strides=[1,1], padding='SAME') for k, style_image in style_images.items()}
<ipython-input-2-1485a3082999> in load_image(image_url, image_size, preserve_aspect_ratio)
19 img = tf.io.decode_image(
20 tf.io.read_file(image_path),
---> 21 channels=3, dtype=tf.float32)[tf.newaxis, ...]
22 img = crop_center(img)
23 img = tf.image.resize(img, image_size, preserve_aspect_ratio=True)
/usr/local/lib/python3.7/dist-packages/tensorflow/python/util/traceback_utils.py in error_handler(*args, **kwargs)
151 except Exception as e:
152 filtered_tb = _process_traceback_frames(e.__traceback__)
--> 153 raise e.with_traceback(filtered_tb) from None
154 finally:
155 del filtered_tb
/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py in raise_from_not_ok_status(e, name)
7184 def raise_from_not_ok_status(e, name):
7185 e.message += (" name: " + name if name is not None else "")
-> 7186 raise core._status_to_exception(e) from None # pylint: disable=protected-access
7187
7188
InvalidArgumentError: Unknown image file format. One of JPEG, PNG, GIF, BMP required. [Op:DecodeImage]
That confuses me, because the same image works fine when it's hosted elsewhere, which leads me to believe it's not an image format issue but something else.
I'd greatly appreciate any input or suggestions on what might be happening here.
Thanks
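One way to narrow this down is to check what the GCS URL actually serves; a minimal diagnostic sketch, assuming requests is available in the notebook (a storage.cloud.google.com link that is not publicly readable typically returns an HTML sign-in page, which DecodeImage cannot parse):
import requests

# See what the URL actually returns. A real JPEG starts with the bytes FF D8 FF;
# an HTML sign-in or error page starts with '<'.
url = 'https://storage.cloud.google.com/01_bucket-02/Tuebingen_Neckarfront-vox.jpeg'
resp = requests.get(url)
print(resp.status_code, resp.headers.get('Content-Type'))
print(resp.content[:16])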

Data wrangling using CPU workers and training xgboost using GPU workers with dask

I am trying to read 200 parquet files from HDFS and then train a model using 4 GPUs. I also have 48 vcores available on the machine. If I start the cluster with just the GPU workers, the reading part is very slow, since it only uses the 4 CPU workers assigned to the GPU workers, and you can't really run more workers than the number of GPUs you have unless you run them in separate shells, which gets nasty because you are then on your own for memory management. I would like to read the files using CPU workers, wrangle the data with the CPU workers, and then train an XGBoost model using GPU workers. I read the documentation here about how to start workers with different resources and assign them to different tasks. I have also seen this question, but I am a bit confused.
Here is the code I am trying to run to read the .parquet files:
import dask.dataframe as dd

df = dd \
    .read_parquet(
        "hdfs://address/to/the/*.parquet",
        storage_options={
            "user": user,
            "kerb_ticket": kerb_ticket},
        engine='pyarrow') \
    .persist()
This will automatically use all the CPU and GPU workers, which is fine. After this I need to create my training data and labels. Let's say I have X_train, y_train, and params. Here I convert them to dask_cudf:
X_train = dask_cudf.from_dask_dataframe(X_train)
y_train = dask_cudf.from_dask_dataframe(y_train)
Here is the part where I need to use just the GPU workers:
Xy = dxgb.DaskDMatrix(client, X_train, y_train)
In order to follow the documentation, I should convert it to this:
Xy = client.submit(dxgb.DaskDMatrix, client, X_train, y_train, resources={'GPU': 1})
But then I'll get this error:
distributed.protocol.pickle - INFO - Failed to serialize (<Client: 'tcp://169.68.236.35:8786' processes=52 threads=52, memory=1.97 TiB>, <dask_cudf.DataFrame | 19200 tasks | 200 npartitions>, <dask_cudf.Series | 600 tasks | 200 npartitions>). Exception: cannot pickle 'socket' object
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/envs/dask/lib/python3.8/site-packages/distributed/protocol/pickle.py in dumps(x, buffer_callback, protocol)
48 buffers.clear()
---> 49 result = pickle.dumps(x, **dump_kwargs)
50 if len(result) < 1000:
/envs/dask/lib/python3.8/socket.py in __getstate__(self)
271 def __getstate__(self):
--> 272 raise TypeError(f"cannot pickle {self.__class__.__name__!r} object")
273
TypeError: cannot pickle 'socket' object
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-12-0d6a943365a9> in <module>
1 # Xy = dxgb.DaskDMatrix(client, X_train, y_train)
2 # Xy = dxgb.DaskDeviceQuantileDMatrix(client, X_train, y_train)
----> 3 Xy = client.submit(dxgb.DaskDMatrix, client, X_train, y_train, resources={'GPU': 1})
4 # Xy_valid = dxgb.DaskDMatrix(client, X_valid, y_valid)
/envs/dask/lib/python3.8/site-packages/distributed/client.py in submit(self, func, key, workers, resources, retries, priority, fifo_timeout, allow_other_workers, actor, actors, pure, *args, **kwargs)
1629 dsk = {skey: (func,) + tuple(args)}
1630
-> 1631 futures = self._graph_to_futures(
1632 dsk,
1633 [skey],
/envs/dask/lib/python3.8/site-packages/distributed/client.py in _graph_to_futures(self, dsk, keys, workers, allow_other_workers, priority, user_priority, resources, retries, fifo_timeout, actors)
2646 # Pack the high level graph before sending it to the scheduler
2647 keyset = set(keys)
-> 2648 dsk = dsk.__dask_distributed_pack__(self, keyset, annotations)
2649
2650 # Create futures before sending graph (helps avoid contention)
/envs/dask/lib/python3.8/site-packages/dask/highlevelgraph.py in __dask_distributed_pack__(self, client, client_keys, annotations)
1045 "__module__": layer.__module__,
1046 "__name__": type(layer).__name__,
-> 1047 "state": layer.__dask_distributed_pack__(
1048 self.get_all_external_keys(),
1049 self.key_dependencies,
/envs/dask/lib/python3.8/site-packages/dask/highlevelgraph.py in __dask_distributed_pack__(self, all_hlg_keys, known_key_dependencies, client, client_keys)
424 for k, v in dsk.items()
425 }
--> 426 dsk = toolz.valmap(dumps_task, dsk)
427 return {"dsk": dsk, "dependencies": dependencies}
428
/envs/dask/lib/python3.8/site-packages/cytoolz/dicttoolz.pyx in cytoolz.dicttoolz.valmap()
/envs/dask/lib/python3.8/site-packages/cytoolz/dicttoolz.pyx in cytoolz.dicttoolz.valmap()
/envs/dask/lib/python3.8/site-packages/distributed/worker.py in dumps_task(task)
3784 return d
3785 elif not any(map(_maybe_complex, task[1:])):
-> 3786 return {"function": dumps_function(task[0]), "args": warn_dumps(task[1:])}
3787 return to_serialize(task)
3788
/envs/dask/lib/python3.8/site-packages/distributed/worker.py in warn_dumps(obj, dumps, limit)
3793 def warn_dumps(obj, dumps=pickle.dumps, limit=1e6):
3794 """Dump an object to bytes, warn if those bytes are large"""
-> 3795 b = dumps(obj, protocol=4)
3796 if not _warn_dumps_warned[0] and len(b) > limit:
3797 _warn_dumps_warned[0] = True
/envs/dask/lib/python3.8/site-packages/distributed/protocol/pickle.py in dumps(x, buffer_callback, protocol)
58 try:
59 buffers.clear()
---> 60 result = cloudpickle.dumps(x, **dump_kwargs)
61 except Exception as e:
62 logger.info("Failed to serialize %s. Exception: %s", x, e)
/envs/dask/lib/python3.8/site-packages/cloudpickle/cloudpickle_fast.py in dumps(obj, protocol, buffer_callback)
71 file, protocol=protocol, buffer_callback=buffer_callback
72 )
---> 73 cp.dump(obj)
74 return file.getvalue()
75
/envs/dask/lib/python3.8/site-packages/cloudpickle/cloudpickle_fast.py in dump(self, obj)
561 def dump(self, obj):
562 try:
--> 563 return Pickler.dump(self, obj)
564 except RuntimeError as e:
565 if "recursion" in e.args[0]:
/envs/dask/lib/python3.8/socket.py in __getstate__(self)
270
271 def __getstate__(self):
--> 272 raise TypeError(f"cannot pickle {self.__class__.__name__!r} object")
273
274 def dup(self):
TypeError: cannot pickle 'socket' object
Does anyone know how to fix this issue?
The problem is that dask.Client is not serializable, so you can't submit it.
You can work around this by accessing the client within the task itself, using dask.distributed.get_client:
from dask.distributed import get_client

def create_dmatrix(X_train, y_train):
    client = get_client()
    return dxgb.DaskDMatrix(client, X_train, y_train)

Xy = client.submit(create_dmatrix, X_train, y_train, resources={'GPU': 1})
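If the training step should also run on the GPU workers, the same pattern can be applied; a sketch, assuming dxgb is xgboost.dask and that the DaskDMatrix built in the first task can be passed on to a follow-up task:
def train_model(Xy, params):
    # Fetch the worker-side client, same as in create_dmatrix above.
    client = get_client()
    return dxgb.train(client, params, Xy, num_boost_round=100)

booster_future = client.submit(train_model, Xy, params, resources={'GPU': 1})
output = booster_future.result()  # dict with the trained 'booster' and its 'history'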

Train image classification models with Colab

I followed the template and changed the link, but it doesn't work.
https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/lite/g3doc/tutorials/model_maker_image_classification.ipynb#scrollTo=3jz5x0JoskPv
This is my dataset:
https://firebasestorage.googleapis.com/v0/b/lol-fypproject.appspot.com/o/lol.tgz?alt=media&token=d07b81bd-442f-4ebe-920e-3772598fbb20
Original code:
image_path = tf.keras.utils.get_file(
    'flower_photos.tgz',
    'https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz',
    extract=True)
image_path = os.path.join(os.path.dirname(image_path), 'flower_photos')
I changed it to this:
image_path = tf.keras.utils.get_file(
    'lol.tgz',
    'https://firebasestorage.googleapis.com/v0/b/lol-fypproject.appspot.com/o/lol.tgz?alt=media&token=d07b81bd-442f-4ebe-920e-3772598fbb20',
    extract=True)
image_path = os.path.join(os.path.dirname(image_path), 'lol')
This is the line that fails, and the error message it shows:
data = ImageClassifierDataLoader.from_folder(image_path)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-15-a5e7646aca55> in <module>()
----> 1 data = ImageClassifierDataLoader.from_folder(image_path)
2 train_data, test_data = data.split(0.9)
/usr/local/lib/python3.7/dist-packages/tensorflow_examples/lite/model_maker/core/data_util/image_dataloader.py in from_folder(cls, filename, shuffle)
69 all_image_size = len(all_image_paths)
70 if all_image_size == 0:
---> 71 raise ValueError('Image size is zero')
72
73 if shuffle:
ValueError: Image size is zero
I have found the problem: the folder structure inside my zip file does not match the structure of the sample dataset.
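A quick way to see what the archive actually unpacks to (a sketch; from_folder expects image_path to directly contain one subfolder per class, each holding that class's images):
import os

# List what get_file(..., extract=True) actually produced, then point image_path
# at the folder that directly contains the per-class subfolders.
extract_dir = os.path.dirname(image_path)
print(os.listdir(extract_dir))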

While removing HTML text from a column, an "object of type 'float' has no len()" error occurs

I am using an Amazon dataset to do sentiment analysis. The dataset content is:
https://i.stack.imgur.com/qcKZp.png
The dataset can be found at:
https://www.kaggle.com/PromptCloudHQ/amazon-reviews-unlocked-mobile-phones
I am trying to remove HTML from the Reviews column.
This is what I am doing (note: the dataset is assigned to df):
from bs4 import BeautifulSoup

df_removedNoise = []

def removingHTML(text):
    soup = BeautifulSoup(text, 'lxml').get_text()
    return soup

def removingNoise(text):
    html_removed = removingHTML(text)
    return html_removed

for i in df["Reviews"]:
    text = removingNoise(i)
    df_removedNoise.append(text)
Even though the Reviews column has object as its datatype, I am still getting an error like this:
TypeError Traceback (most recent call last)
<ipython-input-83-3591f5d7a54f> in <module>
9
10 for i in df["Reviews"]:
---> 11 df_removedNoise.append(removingNoise(i))
<ipython-input-83-3591f5d7a54f> in removingNoise(text)
5
6 def removingNoise(text):
----> 7 html_removed = removingHTML(text)
8 return html_removed
9
<ipython-input-83-3591f5d7a54f> in removingHTML(text)
1 df_removedNoise = []
2 def removingHTML(text):
----> 3 soup = BeautifulSoup(text, 'lxml').get_text()
4 return soup
5
~/anaconda3/lib/python3.7/site-packages/bs4/__init__.py in __init__(self, markup, features, builder, parse_only, from_encoding, exclude_encodings, **kwargs)
244 if hasattr(markup, 'read'): # It's a file-type object.
245 markup = markup.read()
--> 246 elif len(markup) <= 256 and (
247 (isinstance(markup, bytes) and not b'<' in markup)
248 or (isinstance(markup, str) and not '<' in markup)
TypeError: object of type 'float' has no len()
Any help will be appreciated!
Check for NaN values with df[df['Reviews'].isnull()]; if you find any, try dropna first.
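For example, a minimal sketch of that check and cleanup:
# Inspect rows whose review is missing, then drop (or fill) them before the HTML cleanup.
print(df[df['Reviews'].isnull()])
df = df.dropna(subset=['Reviews'])
# alternatively: df['Reviews'] = df['Reviews'].fillna('')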