How to escape single quote in sparkSQL - apache-spark-sql

I am new to PySpark and SQL. I am working on the query below:
sqlContext.sql("Select Crime_type, substring(Location,11,100) as Location_where_crime_happened, count(*) as Count\
From street_SQLTB\
where LSOA_name = 'City of London 001F' and \
group by Location_where_crime_happened, Crime_type\
having Location_where_crime_happened = 'Alderman'S Walk'")
I am struggling with the single quote: I need to filter on Alderman'S Walk. It is probably an easy one, but I am unable to figure it out.
Your help is much appreciated.

Try this
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
simpleData = [("James","Sales","NY",90000,34,10000), \
("Michael","Sales","NY",86000,56,20000), \
("Robert","Sales","CA",81000,30,23000), \
("Maria","Alderman'S Walk","CA",90000,24,23000) \
]
columns= ["employee_name","department","state","salary","age","bonus"]
df1 = spark.createDataFrame(data = simpleData, schema = columns)
df1.createOrReplaceTempView('temp')
df = sqlContext.sql("""select * from temp where department = "Alderman'S Walk" """)
display(df)
or
df = sqlContext.sql("select * from temp where department = 'Alderman\\'S Walk' ")
display(df)
Filtered output:
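If escaping inside the SQL string gets awkward, you can also sidestep it entirely with the DataFrame API, where the value is just an ordinary Python string; standard SQL's doubled quote ('Alderman''S Walk') is generally accepted by Spark SQL as well. A minimal sketch, reusing the df1 DataFrame created above:
from pyspark.sql import functions as F

# Sketch: filter through the DataFrame API, so no SQL-string escaping is needed.
df = df1.filter(F.col("department") == "Alderman'S Walk")
df.show()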

Related

Change DataType of multiple columns with pyspark

I'm trying to change the datatype of multiple columns (100 columns) with PySpark. I'm trying to write a loop or something else that can help change the 100 columns.
Any help will be appreciated. This is the syntax that helped me change 3 columns:
from pyspark.sql.types import IntegerType

dfcontract2 = dfcontract \
    .withColumn("Offre durable",
                dfcontract["Offre durable"]
                .cast(IntegerType())) \
    .withColumn("Offre non durable",
                dfcontract["Offre non durable"]
                .cast(IntegerType())) \
    .withColumn("Total",
                dfcontract["Total"]
                .cast(IntegerType()))

dfcontract2.printSchema()
You can employ reduce in conjunction with withColumn to do this -
DataFrame - Reduce
from functools import reduce
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

schema = {col: col_type for col, col_type in sparkDF.dtypes}

cast_cols = [col for col, col_type in schema.items() if col_type in ["bigint"]]

sparkDF = reduce(
    lambda df, x: df.withColumn(x, F.col(x).cast(IntegerType())),
    cast_cols,
    sparkDF,
)
You can use a list comprehension.
from pyspark.sql import functions as func

list_of_cols_to_update = ['col2', 'col3', 'col4']  # specify the columns that need casting

data_sdf. \
    select(*[func.col(k).cast('int').alias(k) if k in list_of_cols_to_update else k for k in data_sdf.columns])
Let's print the list from the comprehension to see how it looks:
print([func.col(k).cast('int').alias(k) if k in list_of_cols_to_update else k for k in data_sdf.columns])
# ['col1', Column<'CAST(col2 AS INT) AS `col2`'>, Column<'CAST(col3 AS INT) AS `col3`'>, Column<'CAST(col4 AS INT) AS `col4`'>]
The list_of_cols_to_update list can be generated using a list comprehension as well
list_of_cols_to_update = ['col'+str(i) for i in range(2, 5)]
print(list_of_cols_to_update)
# ['col2', 'col3', 'col4']
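If the ~100 target columns are not known up front, the list can also be derived from the DataFrame's schema instead of being hard-coded. A small sketch, assuming you want to cast every bigint column:
# Sketch: pick every column whose current type is bigint (an assumption),
# then apply the same cast-in-select as above.
list_of_cols_to_update = [c for c, t in data_sdf.dtypes if t == 'bigint']

data_sdf = data_sdf.select(
    *[func.col(k).cast('int').alias(k) if k in list_of_cols_to_update else k
      for k in data_sdf.columns]
)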

How to create variable PySpark Dataframes by Dropping Null columns

I have 2 JSON files in a relative folder named 'source_data'
"source_data/data1.json"
{
"name": "John Doe",
"age": 32,
"address": "ZYZ - Heaven"
}
"source_data/data2.json"
{
"userName": "jdoe",
"password": "password",
"salary": "123456789"
}
Using the following PySpark code I have created a DataFrame:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

df = spark.read.json("source_data")
print(df.head())
Output:
df.head(10)
[Row(name='John Doe', age=32, address='ZYZ - Heaven', userName=None, password=None, salary=None),
Row(name=None, age=None, address=None, userName='jdoe', password='password', salary='123456789')]
Now I want to create a variable number of DataFrames by dropping the None-valued columns, like this:
df1.head()
[Row(name='John Doe', age=32, address='ZYZ - Heaven')]
and,
df2.head()
[Row(userName='jdoe', password='password', salary='123456789')]
I am only finding solutions for dropping entire rows based on all or any column(s).
Is there any way to achieve what I am looking for?
TIA
You can just select the columns that you require in a different dataframe and filter that based on the condition.
//source data
val df = spark.read.json("path")
//select and filter
val df1 = df.select("address","age","name")
.filter($"address".isNotNull || $"age".isNotNull || $"name".isNotNull)
val df2 = df.select("password","salary","userName")
.filter($"password".isNotNull || $"salary".isNotNull || $"userName".isNotNull)
//see the output as dataframe or using head as you want
println(df1.head)
df2.head
Output for both head commands (df1 and df2):
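Since the question itself is about PySpark, a rough equivalent of the same select-and-filter idea in PySpark (a sketch, not part of the original answer) would be:
from pyspark.sql import functions as F

# Sketch: keep only the columns of interest, then drop the rows where all of them are null.
df1 = df.select("address", "age", "name") \
    .filter(F.col("address").isNotNull() | F.col("age").isNotNull() | F.col("name").isNotNull())

df2 = df.select("password", "salary", "userName") \
    .filter(F.col("password").isNotNull() | F.col("salary").isNotNull() | F.col("userName").isNotNull())

df1.show()
df2.show()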

Pyspark: How to convert array of strings in a dataframe to array of timestamps

I run a simple query using PySpark SQL to get the cookie as a string and the timestamps as an array.
I want to pass them to my user-defined function, but the array of timestamps is passed as an array of unicode strings.
Can someone help me figure this out? Thanks.
from datetime import datetime
import os
import sys

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, lit
from pyspark.sql.types import StringType

@udf(returnType=StringType())
def PrintDetails(cookie, timestamps, current_day, current_hourly_threshold, current_daily_threshold):
    print(type(timestamps[0]))

def main(argv):
    spark = SparkSession \
        .builder \
        .appName("parquet_test") \
        .config("spark.debug.maxToStringFields", "100") \
        .getOrCreate()

    inputPath = r'D:\Hadoop\Spark\parquet_input_files'
    inputFiles = os.path.join(inputPath, '*.parquet')

    impressionDate = datetime.strptime("2019_12_31", '%Y_%m_%d')
    current_hourly_threshold = 40
    current_daily_threshold = 200

    parquetFile = spark.read.parquet(inputFiles)
    parquetFile.createOrReplaceTempView("parquetFile")

    cookie_and_time = spark.sql("SELECT cookie, collect_list(date_format(from_unixtime(ts), 'YYYY-MM-dd-hh:mm:ss')) as imp_times FROM parquetFile group by 1 ")

    cookie_df = cookie_and_time.withColumn("cookies", PrintDetails(cookie_and_time['cookie'], cookie_and_time['imp_times'], lit(impressionDate), lit(current_hourly_threshold), lit(current_daily_threshold)))

    cookie_df.show()

if __name__ == "__main__":
    main(sys.argv)
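The thread has no posted answer. One possible approach (a sketch, not from the original thread, assuming ts holds Unix epoch seconds) is to have Spark build an array<timestamp> instead of an array of formatted strings, since timestamp elements arrive in a Python UDF as datetime.datetime objects; alternatively, the strings could be parsed inside the UDF with datetime.strptime.
# Sketch: collect real timestamps instead of date_format() strings, so the
# UDF receives a list of datetime.datetime objects rather than unicode strings.
cookie_and_time = spark.sql("""
    SELECT cookie,
           collect_list(to_timestamp(from_unixtime(ts))) AS imp_times
    FROM parquetFile
    GROUP BY cookie
""")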

df.groupby('columns').apply(''.join()), join all the cells to a string

df.groupby('columns').apply(''.join()), joining all the cells into one string.
This is for a junior data processor. In the past, I've tried many ways.
import pandas as pd

data = {'key': ['a','b','c','a','b','c','a'],
        'profit': [12,3,4,5,6,7,9],
        'income': ['j','d','d','g','d','t','d']}
df = pd.DataFrame(data)
df = df.set_index('key')

# df2 is the expected result: one joined string per key
data2 = {'key': ['a','b','c'],
         'profit_income': ['12j5g9d','3d6d','4d7t']}
df2 = pd.DataFrame(data2)
df2 = df2.set_index('key')
Here's a simple solution: first convert the integers to strings and concatenate profit and income, then concatenate all the strings under the same key:
import pandas as pd

data = {'key': ['a','b','c','a','b','c','a'],
        'profit': [12,3,4,5,6,7,9],
        'income': ['j','d','d','g','d','t','d']}
df = pd.DataFrame(data)

df['profit_income'] = df['profit'].apply(str) + df['income']
res = df.groupby('key')['profit_income'].agg(''.join)
print(res)
output:
key
a 12j5g9d
b 3d6d
c 4d7t
Name: profit_income, dtype: object
This question can be solved in a couple of different ways.
First add an extra column by concatenating the profit and income columns:
import pandas as pd

data = {'key': ['a','b','c','a','b','c','a'],
        'profit': [12,3,4,5,6,7,9],
        'income': ['j','d','d','g','d','t','d']}
df = pd.DataFrame(data)
df = df.set_index('key')

df['profinc'] = df['profit'].astype(str) + df['income']
1) Using sum
df2=df.groupby('key').profinc.sum()
2) Using apply and join
df2=df.groupby('key').profinc.apply(''.join)
Results from both of the above would be the same:
key
a 12j5g9d
b 3d6d
c 4d7t
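Since the thread is tagged apache-spark-sql, the same grouping-and-joining can be done in PySpark with concat, collect_list, and concat_ws. A sketch, not part of the original answers, assuming an active SparkSession named spark and the data dict from above:
from pyspark.sql import functions as F

# Sketch: build the toy data as a Spark DataFrame, then group and join.
sdf = spark.createDataFrame(
    list(zip(data['key'], data['profit'], data['income'])),
    ["key", "profit", "income"]
)

res = (sdf
       .withColumn("profit_income",
                   F.concat(F.col("profit").cast("string"), F.col("income")))
       .groupBy("key")
       # Note: collect_list does not guarantee row order within a group.
       .agg(F.concat_ws("", F.collect_list("profit_income")).alias("profit_income")))
res.show()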

How can I populate a pandas DataFrame with the result of a Snowflake sql query?

Using the Python Connector I can query Snowflake:
import snowflake.connector

# Gets the version
ctx = snowflake.connector.connect(
    user=USER,
    password=PASSWORD,
    account=ACCOUNT,
    authenticator='https://XXXX.okta.com',
)
ctx.cursor().execute('USE warehouse MY_WH')
ctx.cursor().execute('USE MYDB.MYSCHEMA')

query = '''
select * from MYDB.MYSCHEMA.MYTABLE
LIMIT 10;
'''
cur = ctx.cursor().execute(query)
The result is a snowflake.connector.cursor.SnowflakeCursor. How can I convert that to a pandas DataFrame?
You can use DataFrame.from_records() or pandas.read_sql() with snowflake-sqlalchemy. The snowflake-sqlalchemy option has a simpler API.
pd.DataFrame.from_records(iter(cur), columns=[x[0] for x in cur.description])
will return a DataFrame with proper column names taken from the SQL result. The iter(cur) will convert the cursor into an iterator and cur.description gives the names and types of the columns.
So the complete code will be
import snowflake.connector
import pandas as pd

# Gets the version
ctx = snowflake.connector.connect(
    user=USER,
    password=PASSWORD,
    account=ACCOUNT,
    authenticator='https://XXXX.okta.com',
)
ctx.cursor().execute('USE warehouse MY_WH')
ctx.cursor().execute('USE MYDB.MYSCHEMA')

query = '''
select * from MYDB.MYSCHEMA.MYTABLE
LIMIT 10;
'''
cur = ctx.cursor().execute(query)
df = pd.DataFrame.from_records(iter(cur), columns=[x[0] for x in cur.description])
If you prefer using pandas.read_sql, then you can do the following:
import pandas as pd
from sqlalchemy import create_engine
from snowflake.sqlalchemy import URL

url = URL(
    account='xxxx',
    user='xxxx',
    password='xxxx',
    database='xxx',
    schema='xxxx',
    warehouse='xxx',
    role='xxxxx',
    authenticator='https://xxxxx.okta.com',
)
engine = create_engine(url)
connection = engine.connect()

query = '''
select * from MYDB.MYSCHEMA.MYTABLE
LIMIT 10;
'''
df = pd.read_sql(query, connection)
There is now a method .fetch_pandas_all() for this, so there is no need for SQLAlchemy anymore.
Note that you need to install the Snowflake connector with pandas support by doing this:
pip install snowflake-connector-python[pandas]
Full documentation here
import pandas as pd
import snowflake.connector

conn = snowflake.connector.connect(
    user="xxx",
    password="xxx",
    account="xxx",
    warehouse="xxx",
    database="MYDB",
    schema="MYSCHEMA"
)
cur = conn.cursor()

# Execute a statement that will generate a result set.
sql = "select * from MYTABLE limit 10"
cur.execute(sql)

# Fetch the result set from the cursor and deliver it as the pandas DataFrame.
df = cur.fetch_pandas_all()
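For result sets too large to hold in memory at once, the connector also provides fetch_pandas_batches(), which yields the result as a sequence of smaller DataFrames. A brief sketch reusing the cursor above:
# Sketch: stream the result in chunks instead of one big DataFrame.
cur.execute("select * from MYTABLE")
for chunk_df in cur.fetch_pandas_batches():
    # Each chunk is a regular pandas DataFrame; process it incrementally.
    print(chunk_df.shape)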
I just want to leave here a small change made to the code to ensure that the columns have correct names (in my case the fetch call returned long column names that included information beyond the name itself). I leave it here, in case someone needs it:
import snowflake.connector
import pandas as pd

def fetch_pandas(cur, sql, n=100000):
    # Fetch the result set in chunks of n rows and build a single DataFrame,
    # taking the column names from the cursor description.
    cur.execute(sql)
    cols = [desc[0] for desc in cur.description]
    frames = []
    rows = 0
    while True:
        dat = cur.fetchmany(n)
        if not dat:
            break
        frames.append(pd.DataFrame(dat, columns=cols))
        rows += len(dat)
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=cols)

conn = snowflake.connector.connect(
    user='xxxxx',
    password='yyyyyy',
    account='zzzzz',
    warehouse='wwwww',
    database='mmmmmm',
    schema='nnnnn'
)
cursor = conn.cursor()
fetch_pandas(cursor, 'select * from "mmmmmm"."wwwww"."table"')