How can I speed up getting object sizes with Python and boto3? - amazon-s3

I use Cloudian Storage on premises with the S3 API.
I need to monitor the used size of a bucket without Cloudian admin access.
With the AWS CLI I use:
./aws --endpoint-url=https://s3-edc.emea.svc.corpintra.net:443 s3api list-objects --bucket edcs3mposdocifyb --output json --query "{\"size\": sum(Contents[].Size), \"objects\": length(Contents[])}"
This takes around 3 minutes and produces the following result:
{
    "size": 216317367311,
    "objects": 756771
}
I tried to get the same information with the following Python 3 script using boto3.
import boto3

total_bucket_size = 0
total_bucket_objects = 0

s3 = boto3.resource('s3', aws_access_key_id="****", aws_secret_access_key="***",
                    endpoint_url="https://my.cloudian.fqdn:443", verify="MyChain.cer")
bucket = s3.Bucket("mybucketname")
bucket_name = bucket.name

for obj in bucket.objects.all():
    obj_key = obj.key
    bucket_object = s3.Object(bucket_name, obj_key)
    obj_size = int(bucket_object.content_length)
    total_bucket_size += obj_size
    total_bucket_objects += 1
    print("%010d %s -> %d" % (total_bucket_objects, obj_key, obj_size))

print("Total size: %d" % total_bucket_size)
But this code runs for several hours.
The goal is to write the result to an InfluxDB, which is quite easy with the InfluxDBClient for Python.
Any idea why my boto3 code takes so long?
What can I change to speed up the code?

I found a way to reduce the runtime of the Python script to about 4 minutes.
total_bucket_size = 0
total_bucket_objects = 0

s3 = boto3.resource('s3', aws_access_key_id="****", aws_secret_access_key="****",
                    endpoint_url="https://s3-edc.emea.svc.corpintra.net:443", verify="DaimlerChain.cer")
bucket = s3.Bucket("edcs3mposdocifyb")
bucket_name = bucket.name

for obj in bucket.objects.all():
    obj_key = obj.key
    #bucket_object = s3.Object(bucket_name, obj_key)
    #obj_size = int(bucket_object.content_length)
    obj_size = obj.size
    total_bucket_size += obj_size
    total_bucket_objects += 1
    print("%010d %s -> %d" % (total_bucket_objects, obj_key, obj_size))

print("Total size: %d" % total_bucket_size)

Related

Error getting a file from Amazon S3 using Python (3.8)

I am trying to get data from Amazon S3 and store it in a variable (a file such as a .pkl file),
and I am getting the following error:
ERROR:
expected str, bytes or os.PathLike object, not _io.BytesIO
Code for S3:
class S3Mgr:
    def __init__(self, bucketName):
        self.aws_access_key_id = CONFIG.S3[0]
        self.aws_secret_access_key = CONFIG.S3[1]
        self.region_name = CONFIG.S3[2]
        self.bucketName = bucketName

    def __connect(self):
        self.s3 = boto3.client(
            's3',
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_secret_access_key,
            region_name=self.region_name
        )

    def retrieveModel(self, fileName):
        self.__connect()
        a = self.s3.download_fileobj(Bucket=self.bucketName, Key="fcm/project/"+str(fileName))
        return a['Body'].read()
Code for pickle:
import pickle
from io import BytesIO

S3obj = S3Mgr("mybucket")
model = S3obj.retrieveModel("model.pkl")
data = BytesIO(model)
model = pickle.load(data)
prediction = model.predict(inputArray)
Above, inputArray is the array of inputs.
Try this: replace download_fileobj with get_object in your S3Mgr class's retrieveModel method.
Something like this:
def retrieveModel(self, fileName):
    self.__connect()
    a = self.s3.get_object(Bucket=self.bucketName, Key="fcm/project/"+str(fileName))
    return a['Body'].read()
I hope it works. Ping me with any progress or if you need any help.
Cheers 👍!
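If you would rather keep download_fileobj, that also works; it does not return a response dict but instead writes into a file-like target you supply. A minimal sketch under that assumption (bucket, key prefix and credential handling are placeholders, not your exact S3Mgr class):

import pickle
from io import BytesIO
import boto3

s3 = boto3.client('s3')  # placeholder: credentials/region resolved via your CONFIG as before

def retrieve_model(bucket_name, file_name):
    # download_fileobj streams the object's bytes into the given file-like object
    buf = BytesIO()
    s3.download_fileobj(Bucket=bucket_name, Key="fcm/project/" + str(file_name), Fileobj=buf)
    buf.seek(0)  # rewind before unpickling
    return pickle.load(buf)

Usage would then be model = retrieve_model("mybucket", "model.pkl") followed by model.predict(inputArray) as before.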

Increasing the volume of a recording in real time or after saving it?

I used Python to make a prototype that increases the volume of an audio signal in real time. It works by calling new_data = audioop.mul(data, 4, 4), where data is a chunk from the PyAudio stream.
Now I have to do something similar in Objective-C, and even after searching I am unable to find how. Can it be done in Objective-C? Do we have such control over the data flow in Objective-C, and if not, is there any way that a recorded sample's volume can be increased?
import pyaudio
import wave
import audioop
import sys

FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
CHUNK = 1024
RECORD_SECONDS = 7
WAVE_OUTPUT_FILENAME1 = sys.argv[1]
WAVE_OUTPUT_FILENAME2 = sys.argv[2]
device_index = 2

print("----------------------record device list---------------------")
audio = pyaudio.PyAudio()
print(audio)
info = audio.get_host_api_info_by_index(0)
numdevices = info.get('deviceCount')
for i in range(0, numdevices):
    if (audio.get_device_info_by_host_api_device_index(0, i).get('maxInputChannels')) > 0:
        print("Input Device id ", i, " - ", audio.get_device_info_by_host_api_device_index(0, i).get('name'))
print("-------------------------------------------------------------")
index = int(input())
print(type(index))
print("recording via index " + str(index))

stream = audio.open(format=FORMAT, channels=CHANNELS,
                    rate=RATE, input=True, input_device_index=index,
                    frames_per_buffer=CHUNK)
print("recording started")
Recordframes = []
Recordframes2 = []
print(int(RATE / CHUNK * RECORD_SECONDS))
for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
    data = stream.read(CHUNK)
    new_data = audioop.mul(data, 4, 4)
    print("hshsh")
    Recordframes.append(data)
    Recordframes2.append(new_data)
    # data = stream.read(CHUNK)
    # print("hshsh")
    # Recordframes.append(data)
# print("recording stopped")
stream.stop_stream()
stream.close()
audio.terminate()

waveFile = wave.open(WAVE_OUTPUT_FILENAME1, 'wb')
waveFile.setnchannels(CHANNELS)
waveFile.setsampwidth(audio.get_sample_size(FORMAT))
waveFile.setframerate(RATE)
waveFile.writeframes(b''.join(Recordframes))
waveFile2 = wave.open(WAVE_OUTPUT_FILENAME2, 'wb')
waveFile2.setnchannels(CHANNELS)
waveFile2.setsampwidth(audio.get_sample_size(FORMAT))
waveFile2.setframerate(RATE)
waveFile2.writeframes(b''.join(Recordframes2))
waveFile.close()
waveFile2.close()
You can use AVAudioEngine (link) to tap into the raw audio data. Alternatively, still using AVAudioEngine, you could add an AVAudioUnitEQ (link) node to your audio graph and use that to boost the gain.
Using either method, you can then write out to a file using AVAudioFile (link).
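For the "after saving it" half of the question, a minimal Python sketch (same approach as the prototype, not Objective-C) can boost the gain of an already-saved 16-bit mono WAV file with audioop.mul; the file names are placeholders, and note that the audioop module is deprecated in recent Python versions:

import audioop
import wave

# Read the recorded file (16-bit samples, as written with paInt16)
with wave.open('recorded.wav', 'rb') as src:
    params = src.getparams()
    frames = src.readframes(src.getnframes())

# width=2 matches 16-bit audio; factor 4 quadruples the amplitude (may clip)
louder = audioop.mul(frames, 2, 4)

with wave.open('recorded_louder.wav', 'wb') as dst:
    dst.setparams(params)
    dst.writeframes(louder)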

Process a CSV line by line from S3 using Python on Lambda

I am trying to process a .csv file (30 MB) that is in an S3 bucket using AWS Lambda (Python). I wrote my Python code locally to process the file and am now trying to execute it with Lambda, but I am having a hard time reading the file line by line.
Please let me know how I can traverse the file line by line using boto3 or S3 methods. Thanks.
In Lambda:
s3 = boto3.client("s3")
file_obj = event["Records"][0]
filename=str(file_obj['s3']['object']['key'])
#print('file name is :', filename)
fileObj = s3.get_object(Bucket=<mybucket>, Key=filename)
file_content = fileObj["Body"].read().decode('utf-8')
My original code:
import csv
import pandas as pd
import datetime
#from datetime import datetime,timedelta
import numpy as np

with open('sample.csv', 'r') as file_name:
    csv_reader = csv.reader(file_name, delimiter=',')
    Time = []
    Latitude = []
    Longitude = []
    Org_Units = []
    Org_Unit_Type = []
    Variable_Name = []
    # New columns
    Year = []
    Month = []
    Day = []
    Celsius = []
    Far = []
    Conv_Units = []
    Conv_Unit_Type = []
    header = ['Time', 'Latitude', 'Longitude', 'Org_Units', 'Org_Unit_Type', 'Conv_Units', 'Conv_Unit_Type', 'Variable_Name']
    out_filename = 'Write' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # need to rename based on the org file name
    with open(out_filename + '.csv', 'w') as csvFile:
        outputwriter = csv.writer(csvFile, delimiter=',')
        outputwriter.writerow(header)
        next(csv_reader, None)  # skip header
        for row in csv_reader:
            # print(row)
            Time = row[0]
            Org_Lat = row[1]
            Org_Long = row[2]
            Org_Units = row[3]
            Org_Unit_Type = row[4]
            Variable_Name = row[5]
            # print(Time, Org_Lat, Org_Long, Org_Units, Org_Unit_Type, Variable_Name)
            if Org_Unit_Type == 'm s-1':
                Conv_Units = round(float(Org_Units) * 1.151, 2)
                Conv_Unit_Type = 'miles'
            if Org_Unit_Type == 'm':
                Conv_Units = round(float(Org_Units) / 1609.344, 2)
                # print(Org_Units, Conv_Units)
                Conv_Unit_Type = 'miles'
            if Org_Unit_Type == 'Pa':
                Conv_Units = round(float(Org_Units) / 6894.757, 2)
                Conv_Unit_Type = 'Psi'
            # print(type(Time))
            date_time_obj = datetime.datetime.strptime(Time, '%m-%d-%Y, %H:%M')
            # Year = time.strptime(date_time_obj, "%B")
            # print(date_time_obj)
            f_row = [Time, Latitude, Longitude, Org_Units, Org_Unit_Type, Conv_Units, Conv_Unit_Type, Variable_Name]
            outputwriter.writerow(f_row)
    csvFile.close()
print("done")
I think this should work. The only thing you need to check is that your Lambda has a role with a policy granting read access to the S3 bucket.
Initially, for testing, I would give the Lambda full access to S3 (AmazonS3FullAccess):
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": "s3:*",
            "Resource": "*"
        }
    ]
}
Python code:
import boto3

s3 = boto3.client('s3')

def lambda_handler(event, context):
    # Get the object referenced by the event and read its contents
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = event['Records'][0]['s3']['object']['key']
    obj = s3.get_object(Bucket=bucket, Key=key)
    rows = obj['Body'].read().decode('utf-8').split('\n')
    print("rows", rows)
Rather than using .read() to read the object as a stream, you might find it easier to download the object to local storage:
s3_client = boto3.client('s3', region_name='ap-southeast-2')
s3_client.download_file(bucket, key, '/tmp/local_file.csv')
You can then use your original program to process the file.
Once you have finished, be sure to delete the temporary file, because the AWS Lambda container might be reused and there is only 500 MB of disk space available.
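If you do want to stay in memory and still process the file line by line, the object's body can be iterated without loading the whole 30 MB at once. A minimal sketch, assuming a reasonably recent botocore (StreamingBody.iter_lines()) and UTF-8 encoded CSV; bucket and key come from the event as above:

import csv
import boto3

s3 = boto3.client('s3')

def iter_csv_rows(bucket, key):
    body = s3.get_object(Bucket=bucket, Key=key)['Body']
    # Decode each streamed line and hand the lines to csv.reader lazily
    lines = (line.decode('utf-8') for line in body.iter_lines())
    for row in csv.reader(lines):
        yield row

Inside lambda_handler you could then do for row in iter_csv_rows(bucket, filename): ... and apply the original per-row conversion logic.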

Luigi overwrite to S3

I have a routine task to upload and share a dump on an S3 bucket. While the code below works, for some reason it does not want to overwrite the file.
From the docs, I need to:
1) define a solution for two parallel executions:
path = luigi.Parameter(default=glob(DATA_DIR)[-2], batch_method=max)
2) add resources = {'overwrite_resource': 1}
While this works for local files, it does not for S3.
class report_to_S3(luigi.Task):
    client = S3Client()
    path = luigi.Parameter(default=glob(DATA_DIR)[-2], batch_method=max)
    local_dump_path = '../../../data/local_db_dump.csv'
    resources = {'overwrite_resource': 1}

    def requires(self):
        return upload_tweets(path=self.path)

    def output(self):
        self.s3_path = "s3://qclm-nyc-ct/dump/dump.csv"
        return S3Target(self.s3_path, client=self.client)

    def run(self):
        c = sqa.create_engine('postgresql:///qc_soc_media')
        df = pd.read_sql_query('SELECT id, user_id, timestamp, lat, lon, ct FROM tweets WHERE ct IS NOT NULL', c)
        N = len(df)
        df.to_csv(self.local_dump_path, index=None)
        self.client.put(self.local_dump_path, self.output().path,
                        headers={'Content-Type': 'application/csv'})
        send_S3_report(N)

if __name__ == '__main__':
    luigi.run(local_scheduler=True, main_task_cls=report_to_S3)
If the target specified in the output() method already exists, the run() method will not execute. You may want to work the timestamp into the filename, or create another sentinel/flag that indicates the work is done.
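If you go the timestamp route, one way (a sketch, not the exact task; the import path assumes luigi.contrib.s3) is to make the key depend on a run parameter so each execution targets a path that does not exist yet:

from datetime import datetime
import luigi
from luigi.contrib.s3 import S3Client, S3Target

class report_to_S3(luigi.Task):
    client = S3Client()
    # Hypothetical parameter: stamp the key per run so output() never pre-exists
    run_ts = luigi.Parameter(default=datetime.utcnow().strftime('%Y%m%d_%H%M%S'))

    # requires()/run() stay as in the original task
    def output(self):
        return S3Target("s3://qclm-nyc-ct/dump/dump_{}.csv".format(self.run_ts),
                        client=self.client)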

PySpark Streaming application gets stuck during batch processing

I have a PySpark application that loads data from Kinesis and saves it to S3.
Each batch's processing time is quite stable, but then it can get stuck.
How can I figure out why this happens?
Code sample:
columns = [x.name for x in schema]
Event = Row(*[x[0] for x in columns])

def get_spark_session_instance(sparkConf):
    if ("sparkSessionSingletonInstance" not in globals()):
        globals()["sparkSessionSingletonInstance"] = SparkSession \
            .builder \
            .config(conf=sparkConf) \
            .getOrCreate()
    return globals()["sparkSessionSingletonInstance"]

def creating_func():
    def timing(message):
        print('timing', str(datetime.utcnow()), message)

    def process_game(df, game, time_part):
        # s3
        df.write.json("{}/{}/{}/{}".format(path_prefix, game, 'group_1', time_part),
                      compression="gzip", timestampFormat="yyyy-MM-dd'T'HH:mm:ss.SSS")
        timing('{}_grop_1'.format(game))
        df[df['group'] == 2] \
            .write.json("{}/{}/{}/{}".format(path_prefix, game, 'group_2', time_part),
                        compression="gzip", timestampFormat="yyyy-MM-dd'T'HH:mm:ss.SSS")
        timing('{}_grop_2'.format(game))
        # database
        df[df['group'] == 3].select(*db_columns) \
            .write.jdbc(db_connection_string, table="test.{}group_3".format(game), mode='append',
                        properties=db_connection_propetries)
        timing('{}_db'.format(game))

    def event_to_row(event):
        event_dict = json.loads(event)
        event_dict['json_data'] = event_dict.get('json_data') and json.dumps(
            event_dict.get('json_data'))
        return Event(*[event_dict.get(x) for x in columns])

    def process(rdd):
        if not rdd.isEmpty():
            spark_time = datetime.utcnow().strftime('%Y/%m/%d/%H/%M%S_%f')
            rows_rdd = rdd.map(event_to_row)
            spark = get_spark_session_instance(rdd.context.getConf())
            df = spark.createDataFrame(data=rows_rdd, schema=schema)
            df = df.withColumn("ts", df["ts"].cast(TimestampType())) \
                .withColumn("processing_time", lit(datetime.utcnow()))
            df.cache()
            print('timing -----------------------------')
            process_game(df[df['app_id'] == 1], 'app_1', spark_time)
            process_game(df[df['app_id'] == 2], 'app_2', spark_time)

    sc = SparkContext.getOrCreate()
    ssc = StreamingContext(sc, 240)
    kinesis_stream = KinesisUtils.createStream(
        ssc, sys.argv[2], 'My-stream-name', "kinesis.us-east-1.amazonaws.com",
        'us-east-1', InitialPositionInStream.TRIM_HORIZON, 240, StorageLevel.MEMORY_AND_DISK_2)
    kinesis_stream.repartition(16 * 3).foreachRDD(process)
    ssc.checkpoint(checkpoint_prefix + sys.argv[1])
    return ssc

if __name__ == '__main__':
    print('timing', 'cast ts', str(datetime.utcnow()))
    ssc = StreamingContext.getActiveOrCreate(checkpoint_prefix + sys.argv[1], creating_func)
    ssc.start()
    ssc.awaitTermination()
[Screenshots: Spark Streaming web UI and batch info]
Identify the process taking the time, and use kill -QUIT or jstack to get a stack trace. Look in the source for possible delays, and consider where you can increase log4j logging for more information.
Does the delay increase with the amount of data written? If so, that's the usual "rename is really a copy" problem S3 has.