How to extend the query if the SQL query is parametrized?

Below is a function I created to generate counts from a table, but I want to add a GROUP BY on a column 'xyz' to the query string. How can I do this?
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *

db = 'database'
schema = 'Schema'

def getCount(table):
    string = f"select count(*) as ct from {db}.{schema}." + table
    df = spark.read.format(snowflake_name)\
        .options(**sfOptions)\
        .option('query', string).load()
    return df

Well, one way would be to alter the f-string slightly:
string = f"select some_column, count(*) as ct from {db}.{schema}.{table} group by some_column"

Related

Converting Python code to pyspark environment

How can I have the same functions as shift() and cumsum() from pandas in pyspark?
import pandas as pd
temp = pd.DataFrame(data=[['a',0],['a',0],['a',0],['b',0],['b',1],['b',1],['c',1],['c',0],['c',0]], columns=['ID','X'])
temp['transformed'] = temp.groupby('ID').apply(lambda x: (x["X"].shift() != x["X"]).cumsum()).reset_index()['X']
print(temp)
My question is how to achieve this in PySpark.
PySpark handles these kinds of queries with Window functions.
You can read the documentation here.
Your PySpark code would be something like this:
from pyspark.sql import functions as F
from pyspark.sql import Window as W

# order by whatever column defines the row order within each ID
# ('time' is a placeholder; the sample data has no such column)
window = W.partitionBy('ID').orderBy('time')
new_df = (
    df
    .withColumn('shifted', F.lag('X').over(window))
    # 1 when the value differs from the previous row; the first row of each
    # ID has no previous value, so count it as a change, like pandas does
    .withColumn('changed',
                F.coalesce((F.col('shifted') != F.col('X')).cast('int'), F.lit(1)))
    .withColumn('cumsum', F.sum('changed').over(window))
)
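For reference, here is a minimal way to build df from the same rows as the pandas example; the row_order column is an assumption added only because the window needs some ordering column, which the sample data does not have:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

# same rows as the pandas example
df = spark.createDataFrame(
    [('a', 0), ('a', 0), ('a', 0), ('b', 0), ('b', 1),
     ('b', 1), ('c', 1), ('c', 0), ('c', 0)],
    ['ID', 'X'])

# surrogate ordering column; with real data you would order by a timestamp
df = df.withColumn('row_order', F.monotonically_increasing_id())
With this df, the snippet above runs as written once the window is ordered by row_order instead of the placeholder column.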

Transfer a df to a new one and change the context of a column

I have a dataframe df_test and I want to copy all of its columns into a new dataframe.
I also want to modify one column's contents with an if/else condition.
Tried this:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
df_cast= df_test.withColumn('account_id', when(col("account_id") == 8, "teo").when(col("account_id") == 9, "liza").otherwise(' '))
But it gives me this error:
NameError: name 'when' is not defined
Thanks in advance
At the start of your code, you should import the pyspark sql functions. The following, for example, would work:
import pyspark.sql.functions as F
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
df_cast = df_test.withColumn('account_id', F.when(F.col("account_id") == 8, "teo").when(F.col("account_id") == 9, "liza").otherwise(' '))
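As a quick self-contained check, here is a minimal sketch with a made-up df_test (the account_id values 8 and 9 come from the question; the sample rows themselves are illustrative):
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# hypothetical stand-in for df_test, just to exercise the expression
df_test = spark.createDataFrame([(8,), (9,), (10,)], ['account_id'])

df_cast = df_test.withColumn(
    'account_id',
    F.when(F.col('account_id') == 8, 'teo')
     .when(F.col('account_id') == 9, 'liza')
     .otherwise(' '))
df_cast.show()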

How to remove diacritics in pyspark dataframes?

I am wondering how to remove diacritics in a PySpark DataFrame with Python 2. I would need something like
from pyspark.sql.session import SparkSession
from pyspark import SparkContext
import pyspark.sql.functions as sf
from pyspark.sql.types import StringType
import unidecode  # the module that turns out not to be available on our cluster

df = sc.parallelize([(u'pádlo', 1), (u'dřez', 4)]).toDF(['text', 'num'])

def remove_diacritics(s):
    return unidecode.unidecode(s)

rem_udf = sf.udf(remove_diacritics, StringType())
df.select(rem_udf('text'))
Unfortunately, the unidecode module is not available on our cluster.
Is there some natural solution that I am missing, apart from manually replacing all possible characters? Note that the expected result is [padlo, drez].
You can use the analog of SQL's translate to replace characters based on two "dictionaries":
from pyspark.sql.session import SparkSession
from pyspark import SparkContext
import pyspark.sql.functions as sf
from pyspark.sql.types import StringType
charsFrom = 'řá'  # fill this string with all the diacritics
charsTo = 'ra'    # and this one with the corresponding Latin characters
df = sc.parallelize([(u'pádlo', 1), (u'dřez', 4)]).toDF(['text', 'num'])
df = df.select(sf.translate('text', charsFrom, charsTo),'num')
It will replace every occurrence of each character in the first string with the corresponding character from the second string.
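If you need to cover more than the two sample characters, one possible sketch (the mapping below lists common Czech lowercase diacritics and is an assumption about which characters occur in your data; extend it, for example with uppercase variants, as needed):
import pyspark.sql.functions as sf

# assumed mapping: the two strings must stay aligned position by position
charsFrom = u'áéěíóúůýčďňřšťž'
charsTo = u'aeeiouuycdnrstz'

def remove_diacritics(col_name):
    # returns a Column expression, so no extra modules or UDFs are needed
    return sf.translate(col_name, charsFrom, charsTo)

df = df.select(remove_diacritics('text').alias('text'), 'num')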

How to do GROUP BY on exploded field in Spark SQL's?

Zeppelin 0.6
Spark 1.6
SQL
I am trying to find the top 20 occurring words in some tweets. filtered contains an array of words for each tweet. The following:
select explode(filtered) AS words from tweettable
lists each word as you would expect, but what I want is to get a count of each word over all tweets and then display the top 20 of these. The following works but I need to do this in SQL:
df.select(explode($"filtered").as("value"))
.groupBy("value")
.count()
.sort(desc("count"))
.show(20, false)
I tried GROUP BY on words, filtered, and explode(filtered) but all gave errors.
You can use subqueries in the FROM statement:
SELECT value, count(*) AS count
FROM (SELECT explode(filtered) AS value
FROM tweettable) AS temp
GROUP BY value
ORDER BY count DESC
The following code will give you a complete idea of how to achieve what you are expecting. Tested in Spark 1.6.
val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc)
import hiveContext.implicits._
val lst = List(Seq("Hello","Hyd","Hello","Mumbai"),Seq("Hello","Mumbai"),Seq("Hello","Delhi","Hello","Banglore"))
case class Tweets(filtered: Seq[String])
val df = sc.parallelize(lst).map(x=>Tweets(x)).toDF
import org.apache.spark.sql.functions.{explode}
import org.apache.spark.sql.functions.count
df.select(explode($"filtered").as("value")).groupBy("value").agg(count("*").alias("cnt")).orderBy('cnt.desc).show(20,false)
Alternatively, you can use a window function.
val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc)
import hiveContext.implicits._
val lst = List(Seq("Hello","Hyd","Hello","Mumbai"),Seq("Hello","Mumbai"),Seq("Hello","Delhi","Hello","Banglore"))
case class Tweets(filtered: Seq[String])
val df = sc.parallelize(lst).map(x=>Tweets(x)).toDF
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
val w = org.apache.spark.sql.expressions.Window.orderBy('cnt.desc)
df.select(explode($"filtered").as("value")).groupBy("value").agg(count("*").alias("cnt")).withColumn("filteredrank", rank.over(w)).filter(col("filteredrank") <= 20).show()

Spark sql pivot table generation

I have a Spark dataframe that looks like this:
from pyspark.sql import SQLContext, Row
sqlContext = SQLContext(sc)
from pyspark.sql.types import StringType, IntegerType, StructType, StructField,LongType
from pyspark.sql.functions import sum, mean
rdd = sc.parallelize([('retail', 'food'),
                      ('retail', 'food'),
                      ('retail', 'auto'),
                      ('retail', 'shoes'),
                      ('wholesale', 'healthsupply'),
                      ('wholesale', 'foodsupply'),
                      ('wholesale', 'foodsupply'),
                      ('retail', 'toy'),
                      ('retail', 'toy'),
                      ('wholesale', 'foodsupply')])
schema = StructType([StructField('division', StringType(), True),
                     StructField('category', StringType(), True)])
df = sqlContext.createDataFrame(rdd, schema)
I want to generate a table like this: the division name, the division's total record count, and the top 1 and top 2 categories within each division along with their record counts:
division   division_total  cat_top_1   top1_cnt  cat_top_2     top2_cnt
retail     5               food        2         toy           2
wholesale  4               foodsupply  3         healthsupply  1
I can generate cat_top_1 and cat_top_2 using window functions in Spark, but I could not work out how to pivot them into a single row and also add a division_total column. Here is my attempt:
df_by_div = df.groupby('division', 'revenue').sort(asc("division"), desc("count"))
windowSpec = Window().partitionBy("division").orderBy(col("count").desc())
df_list = df_by_div.withColumn("rn", rowNumber()\
                               .over(windowSpec).cast('int'))\
                   .where(col("rn") <= 2)\
                   .orderBy("division", desc("count"))