How to remove diacritics in pyspark dataframes? - apache-spark-sql

I am wondering how to remove diacritics from a PySpark DataFrame with Python 2. I would need something like:
from pyspark.sql.session import SparkSession
from pyspark import SparkContext
import pyspark.sql.functions as sf
from pyspark.sql.types import StringType
import unidecode

df = sc.parallelize([(u'pádlo', 1), (u'dřez', 4)]).toDF(['text', 'num'])

def remove_diacritics(s):
    return unidecode.unidecode(s)

rem_udf = sf.udf(remove_diacritics, StringType())
df.select(rem_udf('text'))
Unfortunately, the unidecode module is not available on our cluster.
Is there any natural solution that I am missing, other than manually replacing all possible characters? Note that the expected result is [padlo, drez].

You can use the analog of SQL translate to replace characters based on two "dictionary" strings:
from pyspark.sql.session import SparkSession
from pyspark import SparkContext
import pyspark.sql.functions as sf
from pyspark.sql.types import StringType

charsFrom = 'řá'  # fill this string with all the diacritics
charsTo = 'ra'    # and the corresponding Latin characters

df = sc.parallelize([(u'pádlo', 1), (u'dřez', 4)]).toDF(['text', 'num'])
df = df.select(sf.translate('text', charsFrom, charsTo), 'num')
It will replace every occurrence of each character from the first string with the corresponding character from the second string.
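To illustrate, here is a minimal sketch of what the translate approach produces on the sample data, reusing the sc and sf names from the question; a real job would need charsFrom/charsTo extended to cover every diacritic you expect:
charsFrom = 'řá'
charsTo = 'ra'
df = sc.parallelize([(u'pádlo', 1), (u'dřez', 4)]).toDF(['text', 'num'])
df.select(sf.translate('text', charsFrom, charsTo).alias('text'), 'num').show()
# +-----+---+
# | text|num|
# +-----+---+
# |padlo|  1|
# | drez|  4|
# +-----+---+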

Related

How to extend the query if the sql query is parametrized?

Below is the function I created to generate counts from a table, but in the query string I want to add a 'group by' on a column 'xyz'. How can I do that?
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *

db = 'database'
schema = 'Schema'

def getCount(table):
    string = f"select count(*) as ct from {db}.{schema}." + table
    df = spark.read.format(snowflake_name)\
        .options(**sfOptions)\
        .option('query', string).load()
    return df
Well, one way would be to alter the f-string slightly:
string = f"select some_column, count(*) as ct from {db}.{schema}.{table} group by some_column"

Converting Python code to pyspark environment

How can I have the same functions as shift() and cumsum() from pandas in pyspark?
import pandas as pd
temp = pd.DataFrame(data=[['a',0],['a',0],['a',0],['b',0],['b',1],['b',1],['c',1],['c',0],['c',0]], columns=['ID','X'])
temp['transformed'] = temp.groupby('ID').apply(lambda x: (x["X"].shift() != x["X"]).cumsum()).reset_index()['X']
print(temp)
My question is how to achieve this in PySpark.
PySpark handles these kinds of queries with Window utility functions.
You can read the documentation here.
Your PySpark code would look something like this:
from pyspark.sql import functions as F
from pyspark.sql import Window as W

# A window needs an explicit ordering; replace 'time' with whatever column
# defines the row order in your data (the pandas example relies on the index).
window = W.partitionBy('ID').orderBy('time')

new_df = (
    df
    .withColumn('shifted', F.lag('X').over(window))                # pandas shift()
    .withColumn('isNotEqualToPrev',
                (F.col('shifted') != F.col('X')).cast('int'))      # shift() != X
    .withColumn('cumsum', F.sum('isNotEqualToPrev').over(window))  # cumsum()
)
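A minimal, self-contained way to try this out; the 'time' column is invented here purely to give the window a deterministic order (it is not part of the original pandas sample), and the comparison is wrapped in coalesce so the first row of each group counts as a change, matching pandas where NaN != value is True:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window as W

spark = SparkSession.builder.getOrCreate()
data = [('a', 0, 1), ('a', 0, 2), ('a', 0, 3),
        ('b', 0, 1), ('b', 1, 2), ('b', 1, 3),
        ('c', 1, 1), ('c', 0, 2), ('c', 0, 3)]
df = spark.createDataFrame(data, ['ID', 'X', 'time'])

window = W.partitionBy('ID').orderBy('time')
new_df = (
    df
    .withColumn('shifted', F.lag('X').over(window))
    .withColumn('isNotEqualToPrev',
                F.coalesce((F.col('shifted') != F.col('X')).cast('int'), F.lit(1)))
    .withColumn('cumsum', F.sum('isNotEqualToPrev').over(window))
)
new_df.show()
# cumsum per group: a -> 1, 1, 1; b -> 1, 2, 2; c -> 1, 2, 2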

Add column with the first IP address of the subnet

I have a PySpark dataframe with a column named "subnet". I want to add a column with the first IP of that subnet. I've tried many solutions, including:
import ipaddress

def get_first_ip(prefix):
    n = ipaddress.IPv4Network(prefix)
    first, last = n[0], n[-1]
    return first

df.withColumn("first_ip", get_first_ip(F.col("subnet")))
But I am getting this error:
AddressValueError: Expected 4 octets in "Column<'subnet'>"
I do understand that this is a Column object and I cannot use it as a simple string here, but how do I solve my problem with PySpark?
I could do the same in pandas and then convert to PySpark, but I'm wondering if there's a more elegant way.
It's hard to tell what the issue is when we don't know what the input dataframe looks like, but something is wrong with the column values, as @samkart suggested.
Here's an example that I tested:
import ipaddress
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

def get_first_ip(x):
    n = ipaddress.IPv4Network(x)
    return str(n[0])

def get_last_ip(x):
    n = ipaddress.IPv4Network(x)
    return str(n[-1])

first_ip_udf = F.udf(lambda x: get_first_ip(x), StringType())
last_ip_udf = F.udf(lambda x: get_last_ip(x), StringType())

spark = SparkSession.builder.getOrCreate()

data = [
    {"IP": "10.10.128.123"},
    {"IP": "10.10.128.0/17"},
]
df = spark.createDataFrame(data=data)
df = df.withColumn("first_ip", first_ip_udf(F.col("IP")))
df = df.withColumn("last_ip", last_ip_udf(F.col("IP")))
Outputs:
+--------------+-------------+-------------+
|IP |first_ip |last_ip |
+--------------+-------------+-------------+
|10.10.128.123 |10.10.128.123|10.10.128.123|
|10.10.128.0/17|10.10.128.0 |10.10.255.255|
+--------------+-------------+-------------+
You cannot directly apply a Python native function to a Spark dataframe column. As demonstrated in this answer, you could create a udf from your function.
Since a udf is slow for big dataframes, you can use a pandas_udf, which is a lot faster.
Input:
import ipaddress
import pandas as pd
from pyspark.sql import functions as F
df = spark.createDataFrame([("10.10.128.123",), ("10.10.128.0/17",)], ["subnet"])
Script:
@F.pandas_udf('string')
def get_first_ip(prefix: pd.Series) -> pd.Series:
    return prefix.apply(lambda s: str(ipaddress.IPv4Network(s)[0]))
df = df.withColumn("first_ip", get_first_ip("subnet"))
df.show()
# +--------------+-------------+
# | subnet| first_ip|
# +--------------+-------------+
# | 10.10.128.123|10.10.128.123|
# |10.10.128.0/17| 10.10.128.0|
# +--------------+-------------+
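If you also need the last address of each subnet, the same pandas_udf pattern extends naturally; this is a sketch added here for illustration, not part of the original answer:
@F.pandas_udf('string')
def get_last_ip(prefix: pd.Series) -> pd.Series:
    # network[-1] is the last address of the network
    return prefix.apply(lambda s: str(ipaddress.IPv4Network(s)[-1]))

df = df.withColumn("last_ip", get_last_ip("subnet"))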

Transfer a df to a new one and change the contents of a column

I have one dataframe, df_test, and I want to copy all of its columns into a new df.
I also want to modify one column's contents with an if-else statement.
Tried this:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
df_cast= df_test.withColumn('account_id', when(col("account_id") == 8, "teo").when(col("account_id") == 9, "liza").otherwise(' '))
But it gives me this error:
NameError: name 'when' is not defined
Thanks in advance
At the start of your code, you should import the pyspark sql functions. The following, for example, would work:
import pyspark.sql.functions as F
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
df_cast = df_test.withColumn('account_id', F.when(F.col("account_id") == 8, "teo").when(F.col("account_id") == 9, "liza").otherwise(' '))
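For reference, a minimal self-contained sketch of the same pattern (the sample rows are invented here just to demonstrate the chaining; note that the second when() is chained directly on the first, without repeating the F. prefix):
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df_test = spark.createDataFrame([(8,), (9,), (10,)], ['account_id'])

df_cast = df_test.withColumn(
    'account_id',
    F.when(F.col('account_id') == 8, 'teo')
     .when(F.col('account_id') == 9, 'liza')
     .otherwise(' ')
)
df_cast.show()
# +----------+
# |account_id|
# +----------+
# |       teo|
# |      liza|
# |          |
# +----------+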

Parse list and create DataFrame

I have been given a list called data which has the following content
data=[b'Name,Age,Occupation,Salary\r\nRam,37,Plumber,1769\r\nMohan,49,Elecrician,3974\r\nRahim,39,Teacher,4559\r\n']
I want a pandas dataframe that looks like the linked example (Expected Dataframe). How can I achieve this?
You can try this:
import pandas as pd

data = [b'Name,Age,Occupation,Salary\r\nRam,37,Plumber,1769\r\nMohan,49,Elecrician,3974\r\nRahim,39,Teacher,4559\r\n']
processed_data = [x.split(',') for x in data[0].decode().replace('\r', '').strip().split('\n')]
df = pd.DataFrame(columns=processed_data[0], data=processed_data[1:])
Hope it helps.
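One follow-up worth noting: built this way, every column (including Age and Salary) holds strings, so you may want to cast the numeric columns afterwards; this is an extra suggestion, not part of the original answer:
# convert the numeric columns from strings to integers
df = df.astype({'Age': int, 'Salary': int})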
I would recommend converting this list to a string, since the list has only one element:
str1 = data[0].decode()
Then use the solution provided here:
import sys
if sys.version_info[0] < 3:
    from StringIO import StringIO
else:
    from io import StringIO
import pandas as pd

TESTDATA = StringIO(str1)
df = pd.read_csv(TESTDATA, sep=",")
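On Python 3 you can also skip the intermediate string entirely and feed the raw bytes straight to pandas via a BytesIO buffer; a minimal sketch, not part of the original answers:
import io
import pandas as pd

data = [b'Name,Age,Occupation,Salary\r\nRam,37,Plumber,1769\r\nMohan,49,Elecrician,3974\r\nRahim,39,Teacher,4559\r\n']
df = pd.read_csv(io.BytesIO(data[0]))
# df now has three rows with columns Name, Age, Occupation, Salary,
# and Age/Salary are parsed as integers by read_csv.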