Adding new column with values from two other columns - added conditionally - pandas

I have a data frame like this:
Short sample of data:
import pandas as pd

# Short sample of the data: one (longitude, latitude) pair per row.
df = pd.DataFrame({
    'longitude': (-122.05, -118.30, -117.81),
    'latitude': (37.37, 34.26, 33.78),
})
I need to add one more column "coordinates" where cell value is equal to:
[lon]-122.05[lon] \n [lat] 37.37 [lat]
if there is longitude and latitude (sometimes there are None or "empty" values)
[lon]-122.05[lon]
if there is no latitude value
[B] No coordinates [B]
if there are no longitude and latitude values.
All new cells must be strings.
My code is here:
def prepare_coords(df, no_coords="<B>No coordinates</B>"):
    """Add a string column "coordinates" built from "longitude" and "latitude".

    For every row the new cell is:

    * ``<lon>LON</lon>\\n<lat>LAT</lat>`` when both values are present,
    * ``<lon>LON</lon>`` when only the longitude is present,
    * ``no_coords`` when the longitude is missing.

    A value counts as missing when it is None/NaN or a blank string.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with "longitude" and "latitude" columns. It is modified in place.
    no_coords : str
        Placeholder written when a row has no usable longitude.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, with the "coordinates" column added.
    """
    def is_missing(value):
        # Blank strings are the "empty" values; pd.isna covers None and NaN.
        if isinstance(value, str):
            return not value.strip()
        return bool(pd.isna(value))

    def custom_edit(long, lat):
        if is_missing(long):  # No longitude
            return no_coords
        lon_tag = "<lon>" + str(long).strip() + "</lon>"
        if is_missing(lat):  # Only longitude
            return lon_tag
        # strip() also drops a trailing newline from string input
        return lon_tag + "\n" + "<lat>" + str(lat).strip() + "</lat>"

    # Evaluate the rule row by row: custom_edit works on scalars, not on
    # whole columns, so the two columns are walked in lockstep.
    df["coordinates"] = [
        custom_edit(long, lat)
        for long, lat in zip(df["longitude"], df["latitude"])
    ]
    return df
df = prepare_coords(df)
But it gives me an AttributeError and an "is not a valid function for 'Series' object" error.
How can I fix it?

Related

Increment a time and add it in data frame column

Hi I am new to python and I am looking for below result.
I have From_Curr(3), To_Curr(3) and making currency pairs and adding new column in my data frame as time.
3*3 = 9 currency pairs are created, so I want the same time for all nine pairs, and then the time incremented by 1 hour for the same nine pairs again, as shown below.
The problem is that the time gets incremented after every row.
Actual df:
Expected df:
Thanks for any help and appreciate your time.
`
import pandas as pd
import datetime
from datetime import timedelta

data = pd.DataFrame({'From': ["EUR", "GBP", 'USD'],
                     'To': ["INR", "SGD", 'HKD'],
                     'time': ''})
init_date = datetime.datetime(1, 1, 1)
for index in data.index:
    # Write through the frame itself: rows handed out by iterrows() may be
    # copies, in which case assigning to them never reaches `data`.
    data.at[index, 'time'] = init_date.time().isoformat()
    # The time advances on every row, which is the behaviour described above.
    init_date = init_date + timedelta(hours=1.0)
`
I don't understand why you are repeating the combinations and adding one hour in the second half.
But for this case, you can do something like this:
import pandas as pd

data = pd.DataFrame({'From': ["EUR", "GBP", 'USD'],
                     'To': ["INR", "SGD", 'HKD'],
                     'time': ''})
outlist = [(i, j) for i in data["From"] for j in data["To"]] * 2  # Create double combinations
data = pd.DataFrame(data=outlist, columns=["From", "To"])
data["time"] = "00:00:00"
half = len(data) // 2
# Assign 1 hour to last half. A single .loc call writes into `data` itself;
# chained `data["time"].iloc[...] = ...` may only modify a temporary copy.
data.loc[data.index[half:], "time"] = "01:00:00"
data["time"] = pd.to_datetime(data["time"], format="%H:%M:%S").dt.time
Update: After some clarifications
import pandas as pd

data = pd.DataFrame(
    {"From": ["EUR", "GBP", "USD"], "To": ["INR", "SGD", "HKD"], "time": ""}
)
outlist = [
    (i, j) for i in data["From"] for j in data["To"]
] * 2  # Create double combinations, i think that for your case it would be 24 instead of two
data = pd.DataFrame(data=outlist, columns=["From", "To"])
data["time"] = data.groupby(["From", "To"]).cumcount()  # Get counts of pairs values
# Turn each occurrence count into that many hours past midnight. Going through
# timedeltas keeps the arithmetic small; multiplying by an epoch timestamp
# overflows int64 once a pair repeats more than a handful of times.
data["time"] = (pd.to_timedelta(data["time"], unit="h") + pd.Timestamp(0)).dt.time  # Pass to time
I think this script covers all your needs, happy coding :)
Regards,

Better way to iterate through rows in a data frame and conditionally assign a group

I have created a function that assigns the latitude & longitude category each row falls into. However, it is far too slow. How can I improve the performance?
Here is my code.
def assign_segment(use_df: pd.DataFrame,
                   lat_categories: pd.core.indexes.interval.IntervalIndex,
                   lng_categories: pd.core.indexes.interval.IntervalIndex) -> pd.DataFrame:
    """
    Assign segments based on the latitude and longitude columns of "use_df".

    A segment number is the 1-based position of the latitude interval followed
    by the 1-based position of the longitude interval; the longitude part
    occupies as many decimal digits as ``len(lng_categories)`` has.

    Parameters
    ----------
    use_df : pd.DataFrame
        Use DataFrame with "use_lat" and "use_lng" columns. Modified in place.
    lat_categories : pd.core.indexes.interval.IntervalIndex
        Latitude interval categories.
        (ex.) IntervalIndex([(35.809, 35.816], (35.816, 35.824],
        (35.824, 35.832], (35.832, 35.84], (35.84, 35.848]])
    lng_categories : pd.core.indexes.interval.IntervalIndex
        Longitude interval categories.
        (ex.) IntervalIndex([(128.668, 128.685], (128.685, 128.703],
        (128.703, 128.72], (128.72, 128.737]])

    Returns
    -------
    use_df : pd.DataFrame
        "use_df" with segments assigned.

    Raises
    ------
    ValueError
        If a latitude or longitude is not contained in any interval.
    """
    def locate(value, categories, column, row_label):
        """Return the 1-based position of the first interval containing value."""
        for position, category in enumerate(categories, start=1):
            if value in category:
                return position
        # Fail loudly rather than silently reusing the previous row's segment.
        raise ValueError(
            f"{column}={value!r} in row {row_label!r} is outside every interval"
        )

    # Digits reserved for the longitude part; this must be derived from the
    # longitude grid, otherwise a wide longitude grid spills into the latitude part.
    lng_num_digits = len(str(len(lng_categories)))
    lat_weight = 10 ** lng_num_digits

    segment = []
    # iterate each row and get the segment according to latitude and longitude
    for idx, row in use_df.iterrows():
        lat_segment = locate(row['use_lat'], lat_categories, 'use_lat', idx)
        lng_segment = locate(row['use_lng'], lng_categories, 'use_lng', idx)
        segment.append(lat_segment * lat_weight + lng_segment)
    # create the segment column with the segment list that we created in this function
    use_df['segment'] = segment
    return use_df
iterrows() is very slow when iterating over rows; some developers think it should never be used. Ideally the code should be vectorized, but as a quick win you can iterate over use_df.to_dict('records') instead of iterrows(), and wrap the loop in tqdm to get a progress bar.
from tqdm import tqdm
def assign_segment(use_df: pd.DataFrame,
                   lat_categories: pd.core.indexes.interval.IntervalIndex,
                   lng_categories: pd.core.indexes.interval.IntervalIndex) -> pd.DataFrame:
    """
    Assign segments based on the latitude and longitude columns of "use_df".

    Rows are walked as plain dicts (``to_dict('records')``) with a tqdm
    progress bar. A segment number is the 1-based position of the latitude
    interval followed by the 1-based position of the longitude interval; the
    longitude part occupies as many decimal digits as ``len(lng_categories)`` has.

    Parameters
    ----------
    use_df : pd.DataFrame
        Use DataFrame with "use_lat" and "use_lng" columns. Modified in place.
    lat_categories : pd.core.indexes.interval.IntervalIndex
        Latitude interval categories.
        (ex.) IntervalIndex([(35.809, 35.816], (35.816, 35.824],
        (35.824, 35.832], (35.832, 35.84], (35.84, 35.848]])
    lng_categories : pd.core.indexes.interval.IntervalIndex
        Longitude interval categories.
        (ex.) IntervalIndex([(128.668, 128.685], (128.685, 128.703],
        (128.703, 128.72], (128.72, 128.737]])

    Returns
    -------
    use_df : pd.DataFrame
        "use_df" with segments assigned.

    Raises
    ------
    ValueError
        If a latitude or longitude is not contained in any interval.
    """
    def locate(value, categories, column, row_number):
        """Return the 1-based position of the first interval containing value."""
        for position, category in enumerate(categories, start=1):
            if value in category:
                return position
        # Fail loudly rather than silently reusing the previous row's segment.
        raise ValueError(
            f"{column}={value!r} in row number {row_number} is outside every interval"
        )

    # Digits reserved for the longitude part; this must be derived from the
    # longitude grid, otherwise a wide longitude grid spills into the latitude part.
    lng_num_digits = len(str(len(lng_categories)))
    lat_weight = 10 ** lng_num_digits

    segment = []
    # iterate each row and get the segment according to latitude and longitude
    for row_number, row in enumerate(tqdm(use_df.to_dict('records'))):
        lat_segment = locate(row['use_lat'], lat_categories, 'use_lat', row_number)
        lng_segment = locate(row['use_lng'], lng_categories, 'use_lng', row_number)
        segment.append(lat_segment * lat_weight + lng_segment)
    # create the segment column with the segment list that we created in this function
    use_df['segment'] = segment
    return use_df
It is worth reading these articles:
How To Make Your Pandas Loop 71803 Times Faster
Stop Using iterrows()
This is all you need to do here...
# get_indexer returns the 0-based position of the interval containing each value
# (-1 when none does). Adding 1 gives the same 1-based numbering as the loop
# version; a value outside every interval therefore contributes 0.
lat_segments = lat_categories.get_indexer(df['use_lat']) + 1
lng_segments = lng_categories.get_indexer(df['use_lng']) + 1
# Digits reserved for the longitude part come from the longitude grid size.
num_lng_grid = len(lng_categories)
lng_num_digits = len(str(num_lng_grid))
df['segment'] = (lat_segments*10**lng_num_digits)+lng_segments

How to convert the results of a for loop into pandas data frame?

Using the Haversine formula for distance calculation on a great circle, I use the following code to calculate the coordinates of any point between a known start location (with lat1/lon1) and a known destination (with lat2/lon2):
Here's the complete code:
from math import radians, sin, cos, acos, atan2, sqrt, pi

import numpy as np

#enter the following numbers in the corresponding input fields:
#lat1 = starting latitude = 33.95
#lon1 = starting longitude = -118.40
#lat2 = destination latitude = 40.6333
#lon2= destination longitude = -73.7833
lat1 = radians(float(input("Starting latitude: ")))
lon1 = radians(float(input("Starting longitude: ")))
lat2 = radians(float(input("Destination latitude: ")))
lon2 = radians(float(input("Destination longitude: ")))
#Haversine formula to calculate the distance, in radians, between starting point and destination:
d = ((6371.01 * acos(sin(lat1)*sin(lat2) + cos(lat1)*cos(lat2)*cos(lon1 - lon2)))/1.852)/(180*60/pi)
# Fractions of the route to evaluate: 0 is the starting point, 1 the destination.
# Kept under its own name so the Cartesian x below does not overwrite it.
fractions = np.arange(0, 1, 0.2)
for f in fractions:
    # Weights of the start and destination vectors for this fraction.
    A = sin((1-f)*d)/sin(d)
    B = sin(f*d)/sin(d)
    # Intermediate point in Cartesian coordinates on the unit sphere.
    x = A*cos(lat1)*cos(lon1) + B*cos(lat2)*cos(lon2)
    y = A*cos(lat1)*sin(lon1) + B*cos(lat2)*sin(lon2)
    z = A*sin(lat1) + B*sin(lat2)
    # Back to latitude/longitude, first in radians and then in degrees.
    lat_rad = atan2(z, sqrt(x**2+y**2))
    lon_rad = atan2(y, x)
    lat_deg = lat_rad*180/pi
    lon_deg = lon_rad*180/pi
    print('%.2f' %f, '%.4f' %lat_deg, '%.4f' %lon_deg)
I use the np.arange() function to do a fractional iteration, f, between 0 (the starting point) and 1 (the destination).
The output of the for loop is:
0.00 33.9500 -118.4000
0.20 36.6040 -110.2685
0.40 38.6695 -101.6259
0.60 40.0658 -92.5570
0.80 40.7311 -83.2103
Where, the first number is the fraction (f); the second number is the latitude (lat_deg) and the third number is the longitude (lon_deg).
My question is: how do I convert the output of my code into a pandas (3x6) data frame with the data arranged in 3 columns with header Fraction (col1), Latitude (col2), Longitude (col3)?
Once the output is in a pandas data frame I can then easily write the data into a CSV file.
You're almost there. With the following modifications, you will be able to get your CSV:
Append your values to a list instead of printing them.
Convert the result to a dataframe
Below is your code with the required updates. I have now tested this and it works all the way to the final CSV.
import numpy as np
import pandas as pd
from math import radians, sin, cos, acos, atan2, sqrt, pi

# Numbers per your instructions
lat1 = radians(float(33.95))
lon1 = radians(float(-118.40))
lat2 = radians(float(40.6333))
lon2 = radians(float(-73.7833))
#Haversine formula to calculate the distance, in radians, between starting point and destination:
d = ((6371.01 * acos(sin(lat1)*sin(lat2) + cos(lat1)*cos(lat2)*cos(lon1 - lon2)))/1.852)/(180*60/pi)
# Fractions of the route to evaluate: 0 is the starting point, 1 the destination.
# Kept under its own name so the Cartesian x below does not overwrite it.
fractions = np.arange(0, 1, 0.2)
# An empty list into which we'll append each list of values
res = []
for f in fractions:
    # Weights of the start and destination vectors for this fraction.
    A = sin((1-f)*d)/sin(d)
    B = sin(f*d)/sin(d)
    # Intermediate point in Cartesian coordinates on the unit sphere.
    x = A*cos(lat1)*cos(lon1) + B*cos(lat2)*cos(lon2)
    y = A*cos(lat1)*sin(lon1) + B*cos(lat2)*sin(lon2)
    z = A*sin(lat1) + B*sin(lat2)
    lat_rad = atan2(z, sqrt(x**2+y**2))
    lon_rad = atan2(y, x)
    lat_deg = lat_rad*180/pi
    lon_deg = lon_rad*180/pi
    # Add the desired values, creating a list of lists
    res.append([f, lat_deg, lon_deg])
# Convert the result to a dataframe
res_df = pd.DataFrame(res, columns=['Fraction', 'Latitude', 'Longitude'])
# Voila! You can now save to CSV
res_df.to_csv('coordinates.csv', index=False)

pandas histogram plot error: ValueError: num must be 1 <= num <= 0, not 1

I am drawing a histogram of a column from pandas data frame:
%matplotlib notebook
import matplotlib.pyplot as plt
import matplotlib
df.hist(column='column_A', bins = 100)
but got the following errors:
62 raise ValueError(
63 "num must be 1 <= num <= {maxn}, not {num}".format(
---> 64 maxn=rows*cols, num=num))
65 self._subplotspec = GridSpec(rows, cols)[int(num) - 1]
66 # num - 1 for converting from MATLAB to python indexing
ValueError: num must be 1 <= num <= 0, not 1
Does anyone know what this error means? Thanks!
Problem
The problem you encounter arises when column_A does not contain numeric data. As you can see in the excerpt from pandas.plotting._core below, the numeric data is essential to make the function hist_frame (which you call by DataFrame.hist()) work correctly.
def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None,
xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
sharey=False, figsize=None, layout=None, bins=10, **kwds):
# skipping part of the code
# ...
if column is not None:
if not isinstance(column, (list, np.ndarray, Index)):
column = [column]
data = data[column]
data = data._get_numeric_data() # there is no numeric data in the column
naxes = len(data.columns) # so the number of axes becomes 0
# naxes is passed to the subplot generating function as 0 and later determines the number of columns as 0
fig, axes = _subplots(naxes=naxes, ax=ax, squeeze=False,
sharex=sharex, sharey=sharey, figsize=figsize,
layout=layout)
# skipping the rest of the code
# ...
Solution
If your problem is to represent numeric data (but not of numeric dtype yet) with a histogram, you need to cast your data to numeric, either with pd.to_numeric or df.astype(a_selected_numeric_dtype), e.g. 'float64', and then proceed with your code.
If your problem is to represent non-numeric data in one column with a histogram, you can call the function hist_series with the following line: df['column_A'].hist(bins=100).
If your problem is to represent non-numeric data in many columns with a histogram, you may resort to a handful of options:
Use matplotlib and create subplots and histograms directly
Update pandas at least to version 0.25
usually is 0
mta['penn'] = [mta_bystation[mta_bystation.STATION == "34 ST-PENN STA"], 'Penn Station']
mta['grdcntrl'] = [mta_bystation[mta_bystation.STATION == "GRD CNTRL-42 ST"], 'Grand Central']
mta['heraldsq'] = [mta_bystation[mta_bystation.STATION == "34 ST-HERALD SQ"], 'Herald Sq']
mta['23rd'] = [mta_bystation[mta_bystation.STATION == "23 ST"], '23rd St']
#mta['portauth'] = [mta_bystation[mta_bystation.STATION == "42 ST-PORT AUTH"], 'Port Auth']
#mta['unionsq'] = [mta_bystation[mta_bystation.STATION == "14 ST-UNION SQ"], 'Union Sq']
mta['timessq'] = [mta_bystation[mta_bystation.STATION == "TIMES SQ-42 ST"], 'Ti

Assigning values to dataframe columns

In the below code, the dataframe df5 is not getting populated. I am just assigning the values to dataframe's columns and I have specified the column beforehand. When I print the dataframe, it returns an empty dataframe. Not sure whether I am missing something.
Any help would be appreciated.
import math
import pandas as pd
columns = ['ClosestLat','ClosestLong']
df5 = pd.DataFrame(columns=columns)
def distance(pt1, pt2):
    """Return the Euclidean distance between two points.

    Only the first two coordinates of each point are used.
    """
    # hypot avoids overflow/underflow in the intermediate squares.
    return math.hypot(pt1[0] - pt2[0], pt1[1] - pt2[1])
# For every point in df1, scan df2 for the point with the smallest distance().
# NOTE(review): assumes df1 and df2 are sequences of (lat, long) pairs — TODO confirm.
# NOTE(review): nesting reconstructed from a paste that lost its indentation — TODO confirm.
for pt1 in df1:
    closestPoints = [pt1, df2[0]]  # start from the first candidate in df2
    for pt2 in df2:
        if distance(pt1, pt2) < distance(closestPoints[0], closestPoints[1]):
            closestPoints = [pt1, pt2]
    # Each pass assigns a single scalar to the whole column, so nothing is
    # accumulated across iterations; the second 'ClosestLat' line repeats the first.
    df5['ClosestLat'] = closestPoints[1][0]
    df5['ClosestLat'] = closestPoints[1][0]
    df5['ClosestLong'] = closestPoints[1][1]
    print ("Point: " + str(closestPoints[0]) + " is closest to " + str(closestPoints[1]))
From the look of your code, you're trying to populate df5 with a list of latitudes and longitudes. However, you're making a couple mistakes.
The columns of pandas dataframes are Series, and hold some type of sequential data. So df5['ClosestLat'] = closestPoints[1][0] attempts to assign the entire column a single numerical value, and results in an empty column.
Even if the dataframe wasn't ignoring your attempts to assign a real number to the column, you would lose data because you are overwriting the column with each loop.
The Solution: Build a list of lats and longs, then insert into the dataframe.
import math
import pandas as pd
columns = ['ClosestLat','ClosestLong']
df5 = pd.DataFrame(columns=columns)
def distance(pt1, pt2):
    """Return the Euclidean distance between two points.

    Only the first two coordinates of each point are used.
    """
    # hypot avoids overflow/underflow in the intermediate squares.
    return math.hypot(pt1[0] - pt2[0], pt1[1] - pt2[1])
# Collect the nearest df2 point for every df1 point, then fill df5 in one go.
# NOTE(review): assumes df1 and df2 are sequences of (lat, long) pairs — TODO confirm.
lats, lngs = [], []
for pt1 in df1:
    closest = df2[0]  # start from the first candidate in df2
    closest_distance = distance(pt1, closest)
    for pt2 in df2:
        candidate_distance = distance(pt1, pt2)
        # Strict '<' keeps the earliest candidate when distances tie.
        if candidate_distance < closest_distance:
            closest, closest_distance = pt2, candidate_distance
    lats.append(closest[0])
    lngs.append(closest[1])
# Write into df5, the frame declared with these two columns.
df5['ClosestLat'] = pd.Series(lats)
df5['ClosestLong'] = pd.Series(lngs)