convert TB to GB on specific index - pandas

I wrote the code below. I wanted to strip the TB and GB suffixes from every column and keep only the number; for example, if a cell contains 2 TB, the code removes TB and keeps 2. The program works fine. What I want to do now is convert 2 TB to 2048 GB so that I can sum all column values. Is there a way to remove the TB suffix and apply the conversion to the affected rows at the same time?
def removeend():
    df = pd.read_csv('ExportList.csv')
    if df["Used Space"].str.contains(r"GB|TB").any() or df["Memory Size"].str.contains(r"GB|TB").any() or df["Host CPU"].str.contains(r"Hz|MHz|GHz").any():
        df['Used Space'] = df['Used Space'].str.replace(r'GB|TB', '', regex=True)
        df["Memory Size"] = df["Memory Size"].str.replace(r'GB|TB', '', regex=True)
        df['Host CPU'] = df['Host CPU'].str.replace(r'MHz|Hz|GHz', '', regex=True)
        df = df.convert_dtypes()
        df["Used Space"] = pd.to_numeric(df["Used Space"])
        df["Memory Size"] = pd.to_numeric(df["Memory Size"])
        df["Host CPU"] = pd.to_numeric(df["Host CPU"])
    else:
        print("Error occurred!!!")
    return df

Define/create a custom function:
def converter(x):
    try:
        return pd.eval(x)
    except Exception:
        return x
Finally:
cols = ["Used Space", "Memory Size"]
df[cols] = df[cols].replace({'GB': '', 'TB': '*1024'}, regex=True).applymap(converter)
df["Host CPU"] = df["Host CPU"].replace({'MHz': '', 'GHz': '*0.001', 'Hz': '*0.000001'}, regex=True).map(converter)

Related

New column with word at nth position of string from other column pandas

import numpy as np
import pandas as pd
d = {'ABSTRACT_ID': [14145090, 1900667, 8157202, 6784974],
     'TEXT': [
         "velvet antlers vas are commonly used in tradit",
         "we have taken a basic biologic RPA to elucidat4",
         "ceftobiprole bpr is an investigational cephalo",
         "lipoperoxidationderived aldehydes for example",],
     'LOCATION': [1, 4, 2, 1]}
df = pd.DataFrame(data=d)
df
def word_at_pos(x, y):
    pos = x
    string = y
    count = 0
    res = ""
    for word in string:
        if word == ' ':
            count = count + 1
            if count == pos:
                break
            res = ""
        else:
            res = res + word
    print(res)
word_at_pos(df.iloc[0, 2], df.iloc[0, 1])
For this df I want to create a new column WORD that contains the word from TEXT at the position indicated by LOCATION, e.g. the first line would be "velvet".
I can do this for a single row with the isolated function word_at_pos(x, y), but I can't work out how to apply it to the whole column. I have created new columns with lambda functions before, but can't work out how to fit this function into a lambda.
Looping over TEXT and LOCATION could be the best idea because splitting creates a jagged array, so filtering using numpy advanced indexing won't be possible.
df["WORDS"] = [txt.split()[loc] for txt, loc in zip(df["TEXT"], df["LOCATION"]-1)]
print(df)
ABSTRACT_ID ... WORDS
0 14145090 ... velvet
1 1900667 ... a
2 8157202 ... bpr
3 6784974 ... lipoperoxidationderived
[4 rows x 4 columns]
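Since the question mentions lambdas specifically, an equivalent sketch with apply over rows produces the same WORDS column (just a bit slower than the list comprehension):
df["WORDS"] = df.apply(lambda row: row["TEXT"].split()[row["LOCATION"] - 1], axis=1)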

Pandas Data frame column condition check based on length of the value

I have a pandas data frame that gets created by reading an Excel file. The Excel file has a column called Serial Number. I then pass each serial number to another function that connects to an API and fetches the result set for that serial number.
My code:
def create_excel(filename):
    try:
        data = pd.read_excel(filename, usecols=[4,18,19,20,26,27,28], converters={'Serial Number': '{:0>32}'.format})
    except Exception as e:
        sys.exit("Error reading %s: %s" % (filename, e))
    data["Subject Organization"].fillna("N/A", inplace=True)
    df = data[data['Subject Organization'].str.contains("Fannie", case=False)]
    #df['Serial Number'].apply(lambda x: '000'+x if len(x) == 29 else '00'+x if len(x) == 30 else '0'+x if len(x) == 31 else x)
    print(df)
    df.to_excel(r'Data.xlsx', index=False)
    output = df['Serial Number'].apply(lambda x: fetch_by_ser_no(x))
    df2 = pd.DataFrame(output)
    df2.columns = ['Output']
    df5 = pd.concat([df, df2], axis=1)
The problem I am facing: I want to check whether the result returned by fetch_by_ser_no() is blank, and if it is, pad the serial number to 34 characters by adding two more leading zeros and call the function again.
How can I do this without creating multiple dataframes?
Any help is appreciated.
Thanks
You can try to use if ... else ...:
output = df['Serial Number'].apply(lambda x: 'ok' if fetch_by_ser_no(x) else 'badly')
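If the goal is to pad and retry whenever the lookup comes back empty, another option is to wrap the retry in a small helper and apply that instead. This is only a sketch: fetch_with_retry is a made-up name, and it assumes fetch_by_ser_no() returns an empty/falsy value when nothing is found:
def fetch_with_retry(ser_no):
    result = fetch_by_ser_no(ser_no)
    if not result:                          # blank result: pad to 34 chars and try once more
        result = fetch_by_ser_no('00' + ser_no)
    return result

df['Output'] = df['Serial Number'].apply(fetch_with_retry)   # no extra dataframes needed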

Recursively update the dataframe

I have a dataframe called datafe in which I want to combine the hyphenated words.
For example, the input dataframe looks like this:
,author_ex
0,Marios
1,Christodoulou
2,Intro-
3,duction
4,Simone
5,Speziale
6,Exper-
7,iment
And the output dataframe should be like:
,author_ex
0,Marios
1,Christodoulou
2,Introduction
3,Simone
4,Speziale
5,Experiment
I have written sample code to achieve this, but I am not able to get out of the recursion safely.
def rm_actual(datafe, index):
    stem1 = datafe.iloc[index]['author_ex']
    stem2 = datafe.iloc[index + 1]['author_ex']
    fixed_token = stem1[:-1] + stem2
    datafe.drop(index=index + 1, inplace=True, axis=0)
    newdf = datafe.reset_index(drop=True)
    newdf.iloc[index]['author_ex'] = fixed_token
    return newdf

def remove_hyphens(datafe):
    for index, row in datafe.iterrows():
        flag = False
        token = row['author_ex']
        if token[-1:] == '-':
            datafe = rm_actual(datafe, index)
            flag = True
            break
    if flag == True:
        datafe = remove_hyphens(datafe)
    if flag == False:
        return datafe

datafe = remove_hyphens(datafe)
print(datafe)
Is there any way I can get out of this recursion and still produce the expected output?
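One simple way to avoid the recursion entirely is a single pass over the column that merges any token ending in '-' with the token that follows it. A minimal sketch, reusing the datafe from the question:
import pandas as pd

merged = []
skip_next = False
for token in datafe['author_ex']:
    if skip_next:
        merged[-1] += token           # attach the second half to the previous entry
        skip_next = False
    elif token.endswith('-'):
        merged.append(token[:-1])     # drop the hyphen and wait for the next token
        skip_next = True
    else:
        merged.append(token)
datafe = pd.DataFrame({'author_ex': merged})
print(datafe)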
Another option:
Given/Input:
author_ex
0 Marios
1 Christodoulou
2 Intro-
3 duction
4 Simone
5 Speziale
6 Exper-
7 iment
Code:
import pandas as pd
# read/open file or create dataframe
df = pd.DataFrame({'author_ex':['Marios', 'Christodoulou', 'Intro-', \
'duction', 'Simone', 'Speziale', 'Exper-', 'iment']})
# check input format
print(df)
# create new column 'Ending': True if the previous row's 'author_ex' ends with '-'
# (i.e. this row holds the second half of a split word)
df['Ending'] = df['author_ex'].shift(1).str.contains('-$', na=False, regex=True)
# remove the trailing '-' from the 'author_ex' column
df['author_ex'] = df['author_ex'].str.replace('-$', '', regex=True)
# create new column with values of 'author_ex' and shifted 'author_ex' concatenated together
df['author_ex_combined'] = df['author_ex'] + df.shift(-1)['author_ex']
# create a series true/false but shifted up
index = (df['Ending'] == True).shift(-1)
# set the last row to 'False' after it was shifted
index.iloc[-1] = False
# replace 'author_ex' with 'author_ex_combined' based on true/false of index series
df.loc[index,'author_ex'] = df['author_ex_combined']
# remove rows that have the 2nd part of the 'author_ex' string and are no longer required
df = df[~df.Ending]
# remove the extra columns
df.drop(['Ending', 'author_ex_combined'], axis = 1, inplace=True)
# output final dataframe
print('\n\n')
print(df)
# notice index 3 and 7 are missing
Outputs:
author_ex
0 Marios
1 Christodoulou
2 Introduction
4 Simone
5 Speziale
6 Experiment

Boto3 s3 Select CSV to Pandas Dataframe-- trouble delimiting

I am trying to use Boto3 to 'query' a .CSV within an S3 bucket and spit the data into a Pandas DataFrame object. It is 'working', but with (almost) all of the data in a single column.
Here is the Python (thanks to 20 Chrome tabs and Stack Overflow threads):
import pandas as pd
import boto3
import io

s3 = boto3.client(service_name='s3',
                  aws_access_key_id='redacted',
                  aws_secret_access_key='redacted')

# just selecting everything until I get this proof of concept finished
query = """SELECT *
           FROM S3Object"""

obj = s3.select_object_content(
    Bucket='redacted',
    Key='redacted',
    ExpressionType='SQL',
    Expression=query,
    InputSerialization={'CSV': {'FileHeaderInfo': 'Use',
                                'RecordDelimiter': '|'}},
    OutputSerialization={'CSV': {}})

records = []
for event in obj['Payload']:
    if 'Records' in event:
        records.append(event['Records']['Payload'])
    elif 'Stats' in event:
        stats = event['Stats']['Details']

file_str = ''.join(r.decode('utf-8') for r in records)
df = pd.read_csv(io.StringIO(file_str))
This is what the .CSV in the s3 bucket looks like:
Field_1
"HeaderA""|""HeaderB""|""HeaderC""|""HeaderD"
"valueA1""|""valueB1""|""valueC1""|""valueD1"
"valueA2""|""valueB2""|""valueC2""|""valueD2"
"valueA3""|""valueB3""|""valueC3""|""valueD3"
.
.
.
"valueAn""|""valueBn""|""valueCn""|""valueDn"
And here is my current Dataframe output:
HeaderB
------------
HeaderC
HeaderD
valueA1
valueB1
valueC1
valueD1
valueA2
valueB2
valueC2
valueD2
...
valueDn
Desired output is 4 columns by n rows (plus headers)
Any ideas on how to fix this?
Edit:
InputSerialization={'CSV': {'FileHeaderInfo': 'None',
                            'FieldDelimiter': '"',
                            'AllowQuotedRecordDelimiter': True
                            }}
That got me 95% of the way there. The pipes were added as columns in the dataframe. Solution:
for col in df.columns:
    if col[0] == '|':
        df = df.drop(col, axis=1)
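For what it's worth, the same clean-up can likely be done in one step with a column mask instead of the loop:
df = df.loc[:, ~df.columns.str.startswith('|')]   # keep only columns that do not start with a pipe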
Edit 2:
This solution works when pulling the entire CSV with SELECT *.
Now that this works, I've moved on to the next proof of concept, which is using a more specific query. There were some discrepancies with what was returned vs. what I could verify by looking directly at the CSV. I think this is due to the first line of the CSV being Field_1, followed by the actual header fields and record values. My current theory is that with this first line removed from the original input, I will be able to field-delimit on the quoted pipe and record-delimit on the newline and get the results I want. I am reaching out to the team responsible for these s3 dumps to see if the first line can be removed.
New csv file
Field_1
"HeaderA""|""HeaderB""|""HeaderC""|""HeaderD"
"a_val1""|""bv3""|""1""|""10"
"a_val2""|""bv4""|""1""|""20"
"a_val3""|""bv4""|""3""|""40"
"a_val4""|""bv6""|""4""|""40"
def get_results(query):
    obj = s3.select_object_content(
        Bucket=bucket,
        Key=key,
        ExpressionType='SQL',
        Expression=query,
        InputSerialization={'CSV': {'FileHeaderInfo': 'IGNORE',
                                    'FieldDelimiter': '"',
                                    'AllowQuotedRecordDelimiter': True
                                    }},
        OutputSerialization={'CSV': {}})
    # print(list(obj['Payload']))
    records = []
    for event in obj['Payload']:
        if 'Records' in event:
            records.append(event['Records']['Payload'])
        elif 'Stats' in event:
            stats = event['Stats']['Details']
    file_str = ''.join(r.decode('utf-8') for r in records)
    df = pd.read_csv(io.StringIO(file_str))
    # df = df.filter(regex='Header')
    return df
To get this to work, ignore the headers (the first line of the file) and then search for the header row explicitly in the WHERE clause. Figuring out the column positions is the time-consuming part.
query = '''SELECT s._2, s._6, s._10, s._14 FROM S3Object s where s._6 = 'bv4' or s._6 = 'HeaderB' '''
query = '''SELECT s._2, s._6 FROM S3Object s where s._6 = 'bv4' or s._6 = 'HeaderB' '''
get_results(query)
Here are the outputs of the two queries
HeaderA HeaderB HeaderC HeaderD
0 a_val2 bv4 1 20
1 a_val3 bv4 3 40
HeaderA HeaderB
0 a_val2 bv4
1 a_val3 bv4

Pandas saving in text format

I am trying to save the output, which is a single number, to a text file after working on the dataset in pandas.
import pandas as pd
df = pd.read_csv("sales.csv")

def HighestSales():
    df.drop(['index', "month"], axis=1, inplace=True)
    df2 = df.groupby("year").sum()
    df2 = df2.sort_values(by='sales', ascending=True).reset_index()
    df3 = df2.loc[11, 'year']
    df4 = pd.Series(df3)
    df5 = df4.iloc[0]
    # the output here is 1964, which alone needs to be saved in the text file
    df5.to_csv("modified.txt")

HighestSales()
But I get this error: 'numpy.int64' object has no attribute 'to_csv'. Is there a way to save just one single value to the text file?
You can do:
# open a file named modified.txt
with open('modified.txt', 'w') as f:
    # df5 is just the integer 1964; convert it to str before writing, then add a line break
    f.write(str(df5) + '\n')
You cannot save a single value to CSV with pd.to_csv; in your case you should wrap it in a DataFrame again and then save that. If you want only the number to appear in the .txt file, also drop the index and header:
result = pd.DataFrame([df5])
result.to_csv('modified.txt', index=False, header=False)
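For completeness, if no pandas machinery is needed at all, a one-liner sketch with the standard library writes the same thing:
from pathlib import Path
Path('modified.txt').write_text(str(df5))   # writes just "1964"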