My training data looks like
df = pd.DataFrame({'A' : [2, 5], 'B' : [1, 7]})
I have trained a model in AWS Sagemaker and I deployed the model behind an endpoint.
The endpoint accepts the payload as "text/csv".
To invoke the endpoint using boto3, you can do:
import boto3
client = boto3.client('sagemaker-runtime')
# EndpointName is a required argument of invoke_endpoint; replace the
# placeholder with the name of the deployed endpoint.
response = client.invoke_endpoint(
    EndpointName='<endpoint-name>',
    Body=my_payload_as_csv,
    ContentType='text/csv',
)
How do I construct the payload "my_payload_as_csv" from my DataFrame in order to invoke the SageMaker endpoint correctly?
If you start from the DataFrame example:
df = pd.DataFrame({'A' : [2, 5], 'B' : [1, 7]})
you take a row
df_1_record = df[:1]
and convert df_1_record to a csv like this:
import io
from io import StringIO
# By default SageMaker expects comma-separated values, so the record is
# serialised with "," and without a header row or index column.
payload_buffer = StringIO()
df_1_record.to_csv(payload_buffer, header=False, index=False, sep=",")
my_payload_as_csv = payload_buffer.getvalue()
my_payload_as_csv looks like '2,1\n' (the first row as one comma-separated line, terminated by a newline).
then you can invoke the sagemaker endpoint
import boto3
client = boto3.client('sagemaker-runtime')
# EndpointName is a required argument of invoke_endpoint; replace the
# placeholder with the name of the deployed endpoint.
response = client.invoke_endpoint(
    EndpointName='<endpoint-name>',
    Body=my_payload_as_csv,
    ContentType='text/csv',
)
@VincentCleas's answer was good. But if you want to construct the CSV payload without installing pandas, do this:
import boto3
# Read the whole CSV file as text to use as the request body. The context
# manager closes the file handle as soon as the payload has been read.
with open('<file-name>.csv') as csv_buffer:
    my_payload_as_csv = csv_buffer.read()

client = boto3.client('sagemaker-runtime')
# EndpointName is a required argument of invoke_endpoint; replace the
# placeholder with the name of the deployed endpoint.
response = client.invoke_endpoint(
    EndpointName='<endpoint-name>',
    Body=my_payload_as_csv,
    ContentType='text/csv',
)
I am trying to read data from the following link to a data frame without saving locally (this is important). I figured out a way (below), but is there an efficient way to do this?
from urllib.request import urlopen
import pandas as pd
from io import StringIO
from matplotlib.dates import DateFormatter
from datetime import datetime
# NOTE(review): the source URL is blank in this listing; supply the CSV
# endpoint before running.
uri = ''
data = urlopen(uri, timeout=300).read().decode("utf-8")


def dateparse(x):
    """Parse a 'YYYY-MM-DD HH:MM' timestamp, ignoring surrounding whitespace."""
    return datetime.strptime(x.strip(), '%Y-%m-%d %H:%M')


str1 = data.split('\n')
dfList = []
# The first line (index 0) is skipped; every remaining non-empty line is
# parsed on its own as a one-row CSV.
for line in str1[1:]:
    if len(line) > 0:
        # Read each string into a dataframe
        df1 = pd.read_csv(StringIO(line), parse_dates=[1], date_parser=dateparse, header=None)
        if not df1.empty:
            df2 = df1.iloc[:, 0:3]  # Keep the first three columns
            if df2.iloc[0, -1] != 'M':  # Don't append the ones with missing data
                dfList.append(df2)

df = pd.concat(dfList, axis=0, ignore_index=True)
df.columns = ['Station', 'Date', 'Temp']
ax1 = df.plot(x=1, y=2)
Using requests, pandas and io:
from io import StringIO
import pandas as pd
import requests
# NOTE(review): the URL is missing from this listing; replace the placeholder
# with the address of the CSV resource.
url = '<csv-url>'

with requests.Session() as request:
    response = request.get(url, timeout=30)

# Stop here on anything other than 200 so that an error page is never
# handed to the CSV parser.
if response.status_code != 200:
    raise requests.HTTPError(
        f"Unexpected status {response.status_code} for {url}",
        response=response,
    )

# Parse the response body in memory; nothing is written to disk.
df = pd.read_csv(StringIO(response.text), sep=",")
I'm using awswrangler to write Parquet files to my S3 bucket, and I usually add tags to all my objects for access and cost control, but I didn't find a way to do that directly with awswrangler. I'm currently using the code below to test:
import awswrangler as wr
import boto3
import pandas as pd
# Session bound to a named AWS credentials profile
session = boto3.Session(profile_name='my_profile')

# Small two-column frame used as sample data
d = {'col1': [1, 2], 'col2': [3, 4]}
df_pandas = pd.DataFrame(d)

# Write the frame as Parquet under the given S3 prefix
wr.s3.to_parquet(
    df=df_pandas,
    path='s3://my-bucket/path/',
    boto3_session=session,
)
Is there a way to add tags to the objects that .to_parquet will write to my S3 bucket?
I just figured out that awswrangler has a parameter called s3_additional_kwargs that lets you pass additional arguments to the S3 requests that awswrangler makes for you. You can send tags the same way as in boto3: 'Key1=value1&Key2=value2'.
Below is an example how to add tags to your objects:
import awswrangler as wr
import boto3
import pandas as pd
# Object tags, written as '&'-joined key=value pairs
tag_set = 'Key1=value1&Key2=value2'

# Session bound to a named AWS credentials profile
session = boto3.Session(profile_name='my_profile')

# Small two-column frame used as sample data
d = {'col1': [1, 2], 'col2': [3, 4]}
df_pandas = pd.DataFrame(d)

# The tag string is forwarded to the underlying S3 requests through
# s3_additional_kwargs.
wr.s3.to_parquet(
    df=df_pandas,
    path='s3://my-bucket/path/',
    s3_additional_kwargs={'Tagging': tag_set},
    boto3_session=session,
)
I use Google Cloud Composer to build a pipeline that ingests data from an API:
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago
from datetime import timedelta
import pymysql.cursors
import pandas as pd
import requests
def get_data_from_api():
    """Download the conversion-rate JSON and save it as a CSV file.

    The JSON payload is loaded into a DataFrame, its index is moved into a
    ``date`` column parsed as datetimes, and the result is written to
    ``/home/airflow/gcs/data/conversion_rate_from_api.csv`` without the index.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    # NOTE(review): the URL is blank in this listing; fill in the API endpoint.
    url = ""
    # A timeout keeps the task from hanging forever on an unresponsive API.
    response = requests.get(url, timeout=30)
    # Surface the HTTP status instead of an opaque JSON decode error when the
    # server returns an error page.
    response.raise_for_status()
    result_conversion_rate = response.json()

    conversion_rate = pd.DataFrame.from_dict(result_conversion_rate)
    conversion_rate = conversion_rate.reset_index().rename(columns={"index": "date"})
    conversion_rate['date'] = pd.to_datetime(conversion_rate['date'])
    conversion_rate.to_csv("/home/airflow/gcs/data/conversion_rate_from_api.csv", index=False)
def covid_api():
    """Download the JSON payload and save its ``Data`` records as a CSV file.

    The list under the ``Data`` key is loaded into a DataFrame, the ``Date``
    column is parsed as datetimes, and the result is written to
    ``/home/airflow/gcs/data/result.csv`` without the index.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    # NOTE(review): the URL is blank in this listing; fill in the API endpoint.
    url = ""
    # A timeout keeps the task from hanging forever on an unresponsive API.
    response = requests.get(url, timeout=30)
    # Surface the HTTP status instead of an opaque JSON decode error when the
    # server returns an error page.
    response.raise_for_status()
    payload = response.json()

    df = pd.DataFrame.from_dict(payload['Data'])
    df['Date'] = pd.to_datetime(df['Date'])
    df.to_csv("/home/airflow/gcs/data/result.csv", index=False)
# Defaults applied to every operator in the DAG.
default_args = {
    'owner': 'datath',
    'depends_on_past': False,
    'start_date': days_ago(2),
    'email': [''],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# NOTE(review): the DAG id, the task ids and the python_callable arguments are
# missing from this listing. The values below are placeholders inferred from
# the two functions defined in the script — confirm against the real DAG.
dag = DAG(
    'online_retail_pipeline',
    default_args=default_args,
    description='Pipeline for ETL online_retail data',
    # schedule_interval is a DAG-level argument, not an operator default;
    # '@once' is the preset for a single run.
    schedule_interval='@once',
)

t1 = PythonOperator(
    task_id='get_data_from_api',
    python_callable=get_data_from_api,
    dag=dag,
)

t2 = PythonOperator(
    task_id='covid_api',
    python_callable=covid_api,
    dag=dag,
)

# t2 runs only after t1 has finished.
t1 >> t2
The first task works fine, but the second task fails. When I run the second task's code in Jupyter, it works fine. Please help; I don't know what to do.
It appears it is failing on the response.json() step.
There are a couple things you can do to troubleshoot:
1. Output the raw body of the response, e.g. with response.text. I think this will show you where the error is.
2. If you are still uncertain where the error is after step 1, load the raw text and try to deserialise it using Python's native json module.
# Init
import time
import pandas as pd
import numpy as np
from dask.distributed import Client
client = Client()

# Publish a 2x3 float32 frame of ones under a fixed name
dataset_name = 'my_dataset'
df_my_dataset = pd.DataFrame(data=np.ones((2, 3)), dtype=np.float32)
client.publish_dataset(df_my_dataset, name=dataset_name)
It's there:
In [13]: client.list_datasets()
Out[13]: ('my_dataset',)
Create submit function for dask. Here I would like to access the published dataset by name:
# submit function
def get_gate1_rows(df_from_submit):
    """Return the mean of the object passed in through ``client.submit``.

    Only ``df_from_submit.mean()`` is used; the published dataset is not
    accessed here yet.
    """
    return df_from_submit.mean()
    # return df.mean() + my_dataset.mean() #### <<<<<<< How to do this?
And finally the submit:
# Submit code
# 2x3 float32 array of zeros, sent to the cluster as the task argument
df_zeros = np.zeros(shape=(2, 3), dtype=np.float32)
future = client.submit(get_gate1_rows, df_zeros)
# Block until the task has finished and fetch its return value
result = future.result()
This yields - but should be 0.5:
In [41]: result
Out[41]: 0.0
So how can I access the published dataset from within the dask job?
To access the published datasets within a task, you need get_client:
def get_gate1_rows(df_from_submit):
    """Return the mean of the input plus the mean of the published dataset.

    The dataset published under the name ``'my_dataset'`` is fetched inside
    the task through the client returned by ``get_client``.
    """
    # Imported locally: the script only imports ``Client`` from
    # dask.distributed, so no ``distributed`` name is in scope.
    from dask.distributed import get_client

    client = get_client()
    my_dataset = client.get_dataset('my_dataset')
    return df_from_submit.mean() + my_dataset.mean()
(the answer is three 1s, since df_zeros.mean()->0, df_my_dataset.mean()->1,1,1)
I'm trying to deploy a simple ML model on SageMaker to get the hang of it, and I am not having any luck because I get the following error:
ValueError: could not convert string to float: '6.320000000000000097e-03 1.800000000000000000e+01 2.310000000000000053e+00 0.000000000000000000e+00 5.380000000000000338e-01 6.575000000000000178e+00 6.520000000000000284e+01 4.089999999999999858e+00 1.000000000000000000e+00 2.960000000000000000e+02 1.530000000000000071e+01 3.968999999999999773e+02 4.980000000000000426e+00 2.400000000000000000e+01'
This is the first row of my dataframe.
This is the code in my notebook that I'm using right now:
from sagemaker import get_execution_role, Session
from sagemaker.sklearn.estimator import SKLearn
work_dir = 'data'
session = Session()
role = get_execution_role()

# Upload the local data directory to S3; the returned location is passed to
# fit() as the 'train' channel.
train_input = session.upload_data(work_dir)

# NOTE(review): the entry-point file name is blank in this listing; replace
# the placeholder with the training script's file name.
script = '<script-name>.py'

model = SKLearn(
    entry_point=script,
    train_instance_type='ml.c4.xlarge',
    role=role,
    sagemaker_session=session,
    hyperparameters={'alpha': 10},
)

# Start the training job with the uploaded data as the 'train' channel.
model.fit({'train': train_input})
My training script looks like this:
import argparse
import pandas as pd
import os
from sklearn.linear_model import Ridge
from sklearn.externals import joblib
from sklearn.preprocessing import StandardScaler
import numpy as np
if __name__ == '__main__':
    # Hyperparameter and directory arguments; the directory defaults are read
    # from the SM_* environment variables.
    parser = argparse.ArgumentParser()
    parser.add_argument('--alpha', type=int, default=1)
    parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    args = parser.parse_args()

    # Every file found in the training channel directory is treated as input.
    input_files = [os.path.join(args.train, name) for name in os.listdir(args.train)]
    if not input_files:
        raise ValueError(('There are no files in {}.\n' +
                          'This usually indicates that the channel ({}) was incorrectly specified,\n' +
                          'the data specification in S3 was incorrectly specified or the role specified\n' +
                          'does not have permission to access the data.').format(args.train, "train"))

    # NOTE(review): each file is parsed as comma-separated with no header row;
    # a whitespace-delimited file would presumably load as one string column.
    # Confirm the delimiter of the uploaded files.
    raw_data = [pd.read_csv(path, header=None, engine="python") for path in input_files]
    df = pd.concat(raw_data)

    # Target is the last column; features are the first five columns.
    y_train = df.iloc[:, -1]
    X_train = df.iloc[:, :5]

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)

    alpha = args.alpha
    clf = Ridge(alpha=alpha)
    clf = clf.fit(X_train, y_train)

    # Save the fitted model in the model directory.
    joblib.dump(clf, os.path.join(args.model_dir, "model.joblib"))
def model_fn(model_dir):
    """Load and return the model stored as ``model.joblib`` in *model_dir*."""
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf
The line that's giving the problem is this one:
X_train = scaler.fit_transform(X_train)
I tried df = df.astype(np.float) after I loaded in the df, but that didn't work either.
This file loads in without a problem when I'm not in SageMaker.