How to run inference of a PyTorch model on a PySpark DataFrame (create a new column with predictions) using pandas_udf?

Is there a way to run inference of a PyTorch model over a PySpark DataFrame in a vectorized way (using pandas_udf)?
A one-row udf is pretty slow, since the model state_dict() needs to be loaded for each row. I'm trying to use pandas_udf to speed this up, since all the operations can be vectorized efficiently in pandas/PyTorch.
I've looked at this Databricks post for inspiration, but it doesn't correspond exactly to my use case, since I want to run prediction on an existing PySpark DataFrame.
I can get it to work using a one-row udf in this simple example:
import torch
import torch.nn as nn
from pyspark.sql.functions import col, pandas_udf, PandasUDFType, udf
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, FloatType, DoubleType
import pandas as pd
import numpy as np
spark = SparkSession.builder.master('local[*]') \
    .appName("model_training") \
    .getOrCreate()
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.w = nn.Linear(5, 1)

    def forward(self, x):
        return self.w(x)
net = Net()
bc_model_state = spark.sparkContext.broadcast(net.state_dict())
df = spark.sparkContext.parallelize([[np.random.rand() for i in range(5)] for j in range(10)]).toDF()
df = df.withColumn('features', F.array([F.col(f"_{i}") for i in range(1, 6)]))
def get_model_for_eval():
    # Load the broadcast state_dict into the local model instance
    net.load_state_dict(bc_model_state.value)
    net.eval()
    return net
def one_row_predict(x):
    model = get_model_for_eval()
    t = torch.tensor(x, dtype=torch.float32)
    prediction = model(t).cpu().detach().item()
    return prediction
one_row_udf = udf(one_row_predict, FloatType())
df = df.withColumn('pred_one_row', one_row_udf(col('features')))
df.show()
Output:
+--------------------+-------------------+-------------------+-------------------+-------------------+--------------------+------------+
| _1| _2| _3| _4| _5| features|pred_one_row|
+--------------------+-------------------+-------------------+-------------------+-------------------+--------------------+------------+
| 0.8447505355266759| 0.3938414671838497|0.46347383092447003| 0.7694022276208854| 0.6152606009215115|[0.84475053552667...| 0.025048971|
|0.023782157504950607| 0.6434186254505012| 0.4090423037706754| 0.5466917794921007| 0.7855157903802007|[0.02378215750495...| 0.19694215|
| 0.5057589877333257| 0.7186078182786649| 0.9123361330966105| 0.601837718628886| 0.0773272396167538|[0.50575898773332...| 0.278222|
| 0.2815336141913932| 0.5196112020157087| 0.9646444599173869|0.04844988843812004|0.35445251642633047|[0.28153361419139...| 0.10699606|
| 0.3896101050146765|0.38732747821339863| 0.8516864705178889| 0.2500977280156421| 0.7781221754566505|[0.38961010501467...| -0.08206403|
| 0.8223344715797269| 0.9089425281658239|0.10088026161623431| 0.9920995834835098|0.40665125930441104|[0.82233447157972...| 0.3565607|
| 0.31167413110257425| 0.9778009876605741| 0.4717549025588036|0.24563879994222826| 0.7594244867194454|[0.31167413110257...| 0.18897778|
| 0.5667657426129576| 0.5383639427018171| 0.2983527299596511|0.18914810241640534|0.47854422807435326|[0.56676574261295...| 0.17796803|
| 0.6419824467244137|0.03992370080139418|0.38462617679839173| 0.709487894249459|0.23020927682221126|[0.64198244672441...| 0.15635887|
| 0.7972928622000178| 0.7700992684264264| 0.4387404431803098| 0.1340696629092989| 0.7072213018683782|[0.79729286220001...| 0.0500246|
+--------------------+-------------------+-------------------+-------------------+-------------------+--------------------+------------+
Trying to do the same thing in a vectorized way, this works:
def batch_predict(x):
    model = get_model_for_eval()
    xp = np.vstack(x)
    t = torch.tensor(xp, dtype=torch.float32)
    prediction = model(t).cpu().detach().numpy().flatten()
    return pd.Series(prediction)
df_pd = df.toPandas()
x = df_pd['features']
print(batch_predict(x))
But running it inside a pandas_udf fails:
batch_udf = pandas_udf(batch_predict, FloatType())
df = df.withColumn('pred_batch', batch_udf(col('features')))
df.show()
with:
20/02/11 10:13:01 ERROR Executor: Exception in task 2.0 in stage 1.0 (TID 3)
java.lang.IllegalArgumentException
at java.nio.ByteBuffer.allocate(ByteBuffer.java:334)
at org.apache.arrow.vector.ipc.message.MessageSerializer.readMessage(MessageSerializer.java:543)
at org.apache.arrow.vector.ipc.message.MessageChannelReader.readNext(MessageChannelReader.java:58)
at org.apache.arrow.vector.ipc.ArrowStreamReader.readSchema(ArrowStreamReader.java:132)
at org.apache.arrow.vector.ipc.ArrowReader.initialize(ArrowReader.java:181)
at org.apache.arrow.vector.ipc.ArrowReader.ensureInitialized(ArrowReader.java:172)
at org.apache.arrow.vector.ipc.ArrowReader.getVectorSchemaRoot(ArrowReader.java:65)
at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:162)
at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:122)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at org.apache.spark.sql.execution.python.ArrowEvalPythonExec$$anon$2.<init>(ArrowEvalPythonExec.scala:98)
at org.apache.spark.sql.execution.python.ArrowEvalPythonExec.evaluate(ArrowEvalPythonExec.scala:96)
at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:127)
at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:89)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Thanks for the help.

So apparently this issue is due to an incompatibility between Spark 2.4.x and PyArrow >= 0.15. See here:
https://issues.apache.org/jira/browse/SPARK-29367
https://arrow.apache.org/blog/2019/10/06/0.15.0-release/
https://spark.apache.org/docs/3.0.0-preview/sql-pyspark-pandas-with-arrow.html#usage-notes
How I fixed it: call this code before creating the Spark session:
import os
os.environ['ARROW_PRE_0_15_IPC_FORMAT'] = '1'
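Note that the Python workers on the executors need to see this variable too, not just the driver. A sketch of one way to do that (spark.executorEnv.* is a standard Spark config key, but I haven't re-run this exact job with it):
import os
from pyspark.sql import SparkSession

# Driver side: set before the Spark session (and Arrow) is initialized
os.environ['ARROW_PRE_0_15_IPC_FORMAT'] = '1'

# Executor side: spark.executorEnv.* propagates the variable to the Python workers
spark = SparkSession.builder.master('local[*]') \
    .appName("model_training") \
    .config("spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT", "1") \
    .getOrCreate()
Alternatively, pinning PyArrow below 0.15 (pip install 'pyarrow<0.15') avoids the IPC format change entirely; on a real cluster the variable can also be exported in conf/spark-env.sh, per the usage notes linked above.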

Related

ThreadPoolExecutor DataFrame

I am dealing with a simple loop.
I have a somewhat large dataframe and I would like to make better use of the processor (currently at 2%).
I tried this:
import pandas as pd
import numpy as np
import time
from concurrent.futures import ThreadPoolExecutor
scan = pd.DataFrame([[0,2,3,5],[4,2,7,7], [5,6,2,3]], columns=['st1','nd1','st2','nd2'])
def task(value):
    calc_all = pd.DataFrame()
    for i in range(0, 3, 2):
        j = i + 1
        calc = pd.concat([pd.DataFrame(scan.iloc[:, i]), pd.DataFrame(scan.iloc[:, j])], axis=1)
        calc['th'] = calc.iloc[:, 0] + calc.iloc[:, 1]
        calc_all = pd.concat([calc_all, calc], axis=1)
        time.sleep(1)  # tested time
    return calc_all
if __name__ == '__main__':
    with ThreadPoolExecutor(2) as exe:
        for result in exe.map(task, range(2)):
            print(result)
It's not faster. What did I do wrong?
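One thing to note: CPython threads share the GIL, so ThreadPoolExecutor generally cannot speed up CPU-bound pandas work like this (and task ignores its value argument, so both workers compute exactly the same thing). A sketch of a process-based variant, assuming task and scan stay at module level; whether it helps depends on how much of the runtime is real computation versus the sleep:
from concurrent.futures import ProcessPoolExecutor

if __name__ == '__main__':
    # Processes side-step the GIL, so CPU-bound pandas code can run in parallel;
    # 'task' and 'scan' must be defined at module level to be picklable.
    with ProcessPoolExecutor(2) as exe:
        for result in exe.map(task, range(2)):
            print(result)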

WebGL disabled when embedding Plotly in PyQt

I am trying to embed Plotly code in a PyQt GUI. My Plotly code needs to receive a huge amount of data, which was not a problem at all when it was running on its own. But when I try to put it in a PyQt app, the performance is not good and I receive a warning about WebGL. Apparently WebGL is not activated. I tried to correct that by adding this line:
self.browser.page().settings().setAttribute(QtWebEngineWidgets.QWebEngineSettings.WebGLEnabled, True)
but it didn't change anything.
Here is my code:
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
import numpy as np
import pandas as pd
import plotly.express as px
class Widget(QtWidgets.QWidget):
    def __init__(self, df, parent=None):
        super().__init__(parent)
        self.browser = QtWebEngineWidgets.QWebEngineView(self)
        # WebGL enabled: this line appears to be useless
        self.browser.page().settings().setAttribute(QtWebEngineWidgets.QWebEngineSettings.WebGLEnabled, True)
        vlayout = QtWidgets.QVBoxLayout(self)
        vlayout.addWidget(self.browser)
        fig = px.scatter(df, x='x', y='y', color='lambda', log_x=True, log_y=True, opacity=0.10)
        self.browser.setHtml(fig.to_html(include_plotlyjs='cdn'))
        self.resize(1000, 800)
if __name__ == "__main__":
    N = 20000
    col0 = 400 + 400 * np.random.random(N)
    col1 = np.linspace(1, N, N) * np.random.normal(1, 0.5, N)
    col2 = np.linspace(1, N, N) * np.random.normal(1, 5, N)
    result = np.array([col0, col1, col2]).T
    df = pd.DataFrame(result, columns=['lambda', 'x', 'y'])
    app = QtWidgets.QApplication([])
    widget = Widget(df)
    widget.show()
    app.exec()
Here is what I get if I put in a value that is too big for N:
[1784:7704:1202/161033.541:ERROR:gles2_cmd_decoder.cc(10941)] [.WebGL-000001AB7779E4D0]PERFORMANCE WARNING: Attribute 0 is disabled. This has significant performance penalty
js: [.WebGL-000001AB7779E4D0]PERFORMANCE WARNING: Attribute 0 is disabled. This has significant performance penalty
I have to say that even with this warning the code does not crash... but it takes far too long when N is huge.
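Two knobs that may be worth trying here (both are assumptions about this setup, not verified fixes): plotly.express can be forced onto its WebGL trace type with render_mode='webgl', and QtWebEngine reads Chromium flags from the QTWEBENGINE_CHROMIUM_FLAGS environment variable, which must be set before the QApplication is created. A sketch:
import os
# Must run before the QApplication is constructed; whether this Chromium flag
# unblocks the GPU depends on the installed QtWebEngine build (assumption)
os.environ['QTWEBENGINE_CHROMIUM_FLAGS'] = '--ignore-gpu-blacklist'

import numpy as np
import pandas as pd
import plotly.express as px

N = 20000
df = pd.DataFrame({'lambda': 400 + 400 * np.random.random(N),
                   'x': np.linspace(1, N, N),
                   'y': np.linspace(1, N, N)})
# render_mode='webgl' makes px.scatter emit a Scattergl trace instead of SVG
fig = px.scatter(df, x='x', y='y', color='lambda',
                 log_x=True, log_y=True, opacity=0.10,
                 render_mode='webgl')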

Dask: Access published dataset from within a submitted job

# Init
import time
import pandas as pd
import numpy as np
from dask.distributed import Client
client = Client()
# Publish data
dataset_name = 'my_dataset'
df_my_dataset = pd.DataFrame(np.ones((2,3)), dtype=np.float32)
client.publish_dataset(df_my_dataset, name=dataset_name)
It's there:
In [13]: client.list_datasets()
Out[13]: ('my_dataset',)
Create a submit function for Dask. Here I would like to access the published dataset by name:
# submit function
def get_gate1_rows(df_from_submit):
    return df_from_submit.mean()
    # return df.mean() + my_dataset.mean() #### <<<<<<< How to do this?
And finally the submit:
# Submit code
df_zeros = np.zeros((2,3), dtype=np.float32)
future = client.submit(get_gate1_rows, df_zeros)
time.sleep(2)
result = future.result()
This yields the following, but it should be 0.5:
In [41]: result
Out[41]: 0.0
So how can I access the published dataset from within the dask job?
To access the published datasets within a task, you need get_client:
import distributed

def get_gate1_rows(df_from_submit):
    # get_client() is available inside a task running on a worker
    client = distributed.get_client()
    my_dataset = client.get_dataset('my_dataset')
    return df_from_submit.mean() + my_dataset.mean()
(The answer is three 1s, since df_zeros.mean() -> 0 and df_my_dataset.mean() -> 1, 1, 1.)
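An alternative pattern that avoids the lookup inside the task is to scatter the data once and pass the resulting future as an argument; Dask resolves it to the underlying dataframe before the function runs. A sketch using the same session as above:
# Scatter the dataframe to the workers once, then hand the future to the task
df_scattered = client.scatter(df_my_dataset)

def get_gate1_rows2(df_from_submit, df_published):
    return df_from_submit.mean() + df_published.mean()

future = client.submit(get_gate1_rows2, df_zeros, df_scattered)
print(future.result())  # three 1.0s: 0 (zeros) + 1 (ones) per column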

"No space left on device" error and SIGTERM signal when using Spark LSH in EMR

Spark version-2.3.2
EMR - 5.19.0
8 Executors
Each executor - 5 core
What I am trying to do:
1. Millions of numpy feature vectors (.npy files) are stored in S3.
2. Download all the numpy feature vectors using Spark.
3. Convert them to Spark DataFrames, where the DataFrame schema is feature-name (string) and feature-vector (VectorUDT).
4. Then find the LSH between the DataFrames using Spark.
Code:
import numpy as np
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.sql import SparkSession
from pyspark import SQLContext
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pyspark.sql.types import StructField
from pyspark.sql.types import StructType
from pyspark.ml.linalg import VectorUDT
spark = SparkSession \
    .builder \
    .appName('test-app') \
    .getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)
features_rdd = sc. \
    binaryFiles("s3://input_path/"). \
    map(
        lambda x: (
            x[0].split("/")[-1].split(".")[0],
            Vectors.dense(np.fromstring(x[1], dtype='<f4'))
        )
    )
FEATURES_RAW_SCHEMA = StructType([
    StructField("image_id", StringType(), True),
    StructField("feature", VectorUDT(), True)
])
features_df = sqlContext.createDataFrame(features_rdd, FEATURES_RAW_SCHEMA)
brp = BucketedRandomProjectionLSH(inputCol="feature", outputCol="hashes", seed=12345,
                                  numHashTables=30, bucketLength=1000)
model = brp.fit(features_df)
all_pairs_df = model.approxSimilarityJoin(features_df, features_df, 50.0, distCol="similarity_score").select(
    F.col("datasetA.image_id").alias("image_id1"),
    F.col("datasetB.image_id").alias("image_id2"),
    F.col("similarity_score"))
all_pairs_df.write.mode("overwrite").json("s3a://output")
Error:
19/01/10 11:20:11 WARN TaskSetManager: Lost task 120.0 in stage 3.0 (TID 11, ip-ip1.ec2.internal, executor 5): java.io.IOException: No space left on device
at java.io.FileOutputStream.writeBytes(Native Method)
at java.io.FileOutputStream.write(FileOutputStream.java:326)
at org.apache.spark.storage.TimeTrackingOutputStream.write(TimeTrackingOutputStream.java:58)
at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
at net.jpountz.lz4.LZ4BlockOutputStream.flushBufferedData(LZ4BlockOutputStream.java:220)
at net.jpountz.lz4.LZ4BlockOutputStream.write(LZ4BlockOutputStream.java:173)
at org.apache.spark.storage.DiskBlockObjectWriter.write(DiskBlockObjectWriter.scala:252)
at org.apache.spark.util.collection.unsafe.sort.UnsafeSorterSpillWriter.write(UnsafeSorterSpillWriter.java:133)
at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.spillIterator(UnsafeExternalSorter.java:498)
at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.spill(UnsafeExternalSorter.java:222)
at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.createWithExistingInMemorySorter(UnsafeExternalSorter.java:111)
at org.apache.spark.sql.execution.UnsafeKVExternalSorter.<init>(UnsafeKVExternalSorter.java:156)
at org.apache.spark.sql.execution.UnsafeFixedWidthAggregationMap.destructAndCreateExternalSorter(UnsafeFixedWidthAggregationMap.java:248)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage7.agg_doConsume_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage7.agg_doAggregateWithKeys_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage7.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$12$$anon$2.hasNext(WholeStageCodegenExec.scala:633)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Submit script:
sudo -i spark-submit --verbose --deploy-mode cluster --num-executors 11 --executor-cores 5 --executor-memory 8G --driver-cores 5 --driver-memory 8G --conf spark.dynamicAllocation.enabled=false --conf spark.yarn.executor.memoryOverhead=2048 /home/hadoop/filename.py
In some cases I can see in the log files that the executors are dying with a SIGTERM signal. What am I doing wrong here?
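The stack trace shows the failure while spilling shuffle data to local disk (UnsafeExternalSorter -> DiskBlockObjectWriter), i.e. the executors' local volumes are filling up, not S3. approxSimilarityJoin explodes each row once per hash table, so numHashTables=30 shuffles roughly 30x the input, and a 50.0 distance threshold keeps a huge number of candidate pairs. Besides attaching more local/EBS storage to the core nodes, one direction is to shrink that amplification; the values below are illustrative, not tuned for this job:
# Fewer hash tables => less exploded/shuffled data (at the cost of recall);
# a tighter distance threshold keeps fewer candidate pairs
brp = BucketedRandomProjectionLSH(inputCol="feature", outputCol="hashes",
                                  seed=12345, numHashTables=10,
                                  bucketLength=1000)
features_df = features_df.cache()  # reused by fit() and the self-join
model = brp.fit(features_df)
all_pairs_df = model.approxSimilarityJoin(features_df, features_df, 10.0,
                                          distCol="similarity_score")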

small test_set xgb predict

I would like to ask a question about a problem that I have had for the last couple of days.
First of all, I am a beginner in machine learning and this is my first time using the XGBoost algorithm, so excuse me for any mistakes I have made.
I trained my model to predict whether a log file is malicious or not. After I save and reload my model in a different session, I use the predict function, which seems to work normally (with a few deviations in probabilities, but that is another topic; I have seen it discussed elsewhere).
The problem is this: sometimes when I try to predict a "small" csv file after loading the model, predict seems to be broken, returning only the zero label, even for rows that were categorized correctly before.
For example, I load a dataset containing 20,000 values, and predict() works. I keep only the first 5 of these values using pandas drop; again it works. If I save the 5 values to a different csv and reload it, it does not work. The same error happens if I just remove all the other rows (19,995) by hand and save the file with only the 5 remaining.
I would bet it is a file-size problem, but when I drop the rows in the dataframe through pandas it seems to work.
Also, the number 5 (of rows) is just for example purposes; the same happens if I delete a large portion of the dataset.
I first came across this problem when trying to verify some completely new logs by hand, which seem to be classified correctly if thrown into the big csv file but not in a new file on their own.
Here is my load-and-predict code:
##IMPORTS
import os
import pandas as pd
from pandas.compat import StringIO
from datetime import datetime
from langid.langid import LanguageIdentifier, model
import langid
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.externals import joblib
from ggplot import ggplot, aes, geom_line
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import average_precision_score
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from collections import defaultdict
import pickle
df = pd.read_csv('big_test.csv')
df3 = pd.read_csv('small_test.csv')
# This one is necessary for the loaded_model
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column_list):
        self.column_list = column_list

    def fit(self, x, y=None):
        return self

    def transform(self, x):
        if len(self.column_list) == 1:
            return x[self.column_list[0]].values
        else:
            return x[self.column_list].to_dict(orient='records')
loaded_model = joblib.load('finalized_model.sav')
result = loaded_model.predict(df)
print(result)
df2 = df[:5]
result2 = loaded_model.predict(df2)
print(result2)
result3 = loaded_model.predict(df3)
print(result3)
The results I get are these:
[1 0 1 ... 0 0 0]
[1 0 1 0 1]
[0 0 0 0 0]
I can provide any code even from training or my dataset if necessary.
*EDIT: I use a pipeline for my data. I tried to reproduce the error after using xgb to fit the iris data and I could not. Maybe there is something wrong with my pipeline? The code is below:
df = pd.read_csv('big_test.csv')
# df.info()
# Split Dataset
attributes = ['uri','code','r_size','DT_sec','Method','http_version','PenTool','has_referer', 'Lang','LangProb','GibberFlag' ]
x_train, x_test, y_train, y_test = train_test_split(df[attributes], df['Scan'], test_size=0.2,
                                                    stratify=df['Scan'], random_state=0)
x_train, x_dev, y_train, y_dev = train_test_split(x_train, y_train, test_size=0.2,
                                                  stratify=y_train, random_state=0)
# print('Train:', len(y_train), 'Dev:', len(y_dev), 'Test:', len(y_test))
# set up graph function
def plot_precision_recall_curve(y_true, y_pred_scores):
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred_scores)
    return ggplot(aes(x='recall', y='precision'),
                  data=pd.DataFrame({"precision": precision, "recall": recall})) + geom_line()
# XGBClassifier
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column_list):
        self.column_list = column_list

    def fit(self, x, y=None):
        return self

    def transform(self, x):
        if len(self.column_list) == 1:
            return x[self.column_list[0]].values
        else:
            return x[self.column_list].to_dict(orient='records')
count_vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 2), min_df=10)
dict_vectorizer = DictVectorizer()
xgb = XGBClassifier(seed=0)
pipeline = Pipeline([
    ("feature_union", FeatureUnion([
        ('text_features', Pipeline([
            ('selector', ColumnSelector(['uri'])),
            ('count_vectorizer', count_vectorizer)
        ])),
        ('categorical_features', Pipeline([
            ('selector', ColumnSelector(['code', 'r_size', 'DT_sec', 'Method', 'http_version', 'PenTool', 'has_referer', 'Lang', 'LangProb', 'GibberFlag'])),
            ('dict_vectorizer', dict_vectorizer)
        ]))
    ])),
    ('xgb', xgb)
])
pipeline.fit(x_train, y_train)
filename = 'finalized_model.sav'
joblib.dump(pipeline, filename)
That's due to different dtypes in the big and small files.
When you do:
df = pd.read_csv('big_test.csv')
The dtypes are these:
print(df.dtypes)
# Output
uri object
code object # <== Observe this
r_size object # <== Observe this
Scan int64
...
...
...
Now when you do:
df3 = pd.read_csv('small_test.csv')
the dtypes are changed:
print(df3.dtypes)
# Output
uri object
code int64 # <== Now this has changed
r_size int64 # <== Now this has changed
Scan int64
...
...
You see, pandas tries to determine the dtypes of the columns by itself. When you load big_test.csv, there are some values in the code and r_size columns which are strings; because of this, each whole column's dtype is changed to object (string), which does not happen with small_test.csv.
Due to this change, the DictVectorizer encodes the data in a different way than before, the features change, and hence the results change too.
If you do this:
df3[['code', 'r_size']] = df3[['code', 'r_size']].astype(str)
and then call predict(), the results are the same again.
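An equivalent fix, if you prefer to pin things at load time, is to tell pandas the dtypes explicitly so both files parse identically (a sketch, assuming the column names shown above):
# Force the ambiguous columns to strings while parsing, so the small file
# gets the same dtypes as the big one regardless of its contents
df3 = pd.read_csv('small_test.csv', dtype={'code': str, 'r_size': str})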