data frame parsing column scala - dataframe

I have some problem with parsing Dataframe
val result = df_app_clickstream.withColumn(
"attributes",
explode(expr(raw"transform(attributes, x -> str_to_map(regexp_replace(x, '{\\}',''), ' '))"))
).select(
col("userId"),
col("attributes").getField("campaign_id").alias("app_campaign_id"),
col("attributes").getField("channel_id").alias("app_channel_id")
)
result.show()
I have input like this :
-------------------------------------------------------------------------------
| userId | attributes |
-------------------------------------------------------------------------------
| f6e8252f-b5cc-48a4-b348-29d89ee4fa9e |{'campaign_id':082,'channel_id':'Chnl'}|
-------------------------------------------------------------------------------
and need to get output like this :
--------------------------------------------------------------------
| userId | campaign_id | channel_id|
--------------------------------------------------------------------
| f6e8252f-b5cc-48a4-b348-29d89ee4fa9e | 082 | Facebook |
--------------------------------------------------------------------
but have error

you can try below solution
import org.apache.spark.sql.functions._
val data = Seq(("f6e8252f-b5cc-48a4-b348-29d89ee4fa9e", """{'campaign_id':082, 'channel_id':'Chnl'}""")).toDF("user_id", "attributes")
val out_df = data.withColumn("splitted_col", split(regexp_replace(col("attributes"),"'|\\}|\\{", ""), ","))
.withColumn("campaign_id", split(element_at(col("splitted_col"), 1), ":")(1))
.withColumn("channel_id", split(element_at(col("splitted_col"), 2), ":")(1))
out_df.show(truncate = false)
+------------------------------------+----------------------------------------+-----------------------------------+-----------+----------+
|user_id |attributes |splitted_col |campaign_id|channel_id|
+------------------------------------+----------------------------------------+-----------------------------------+-----------+----------+
|f6e8252f-b5cc-48a4-b348-29d89ee4fa9e|{'campaign_id':082, 'channel_id':'Chnl'}|[campaign_id:082, channel_id:Chnl]|082 |Chnl |
+------------------------------------+----------------------------------------+-----------------------------------+-----------+----------+

Related

Telegram User Adder

Hi recently I made telegram scrapper that scrap users from telegram groups.
Now I am trying make user adder to it.
#!/bin/env python3
from telethon.sync import TelegramClient
from telethon.tl.functions.messages import GetDialogsRequest
from telethon.tl.types import InputPeerEmpty, InputPeerChannel, InputPeerUser
from telethon.errors.rpcerrorlist import PeerFloodError, UserPrivacyRestrictedError
from telethon.tl.functions.channels import InviteToChannelRequest
import configparser
import os, sys
import csv
import traceback
import time
import random
re="\033[1;31m"
gr="\033[1;32m"
cy="\033[1;36m"
def banner():
print(f"""
_____ __ ____ ____ ____ ___ ____ _____ __ ____ ____ ____ ___ ____
.----------------. .----------------. .----------------. .----------------. .----------------.
| .--------------. || .--------------. || .--------------. || .--------------. || .--------------. |
| | __ | || | ________ | || | ________ | || | _________ | || | _______ | |
| | / \ | || | |_ ___ `. | || | |_ ___ `. | || | |_ ___ | | || | |_ __ \ | |
| | / /\ \ | || | | | `. \ | || | | | `. \ | || | | |_ \_| | || | | |__) | | |
| | / ____ \ | || | | | | | | || | | | | | | || | | _| _ | || | | __ / | |
| | _/ / \ \_ | || | _| |___.' / | || | _| |___.' / | || | _| |___/ | | || | _| | \ \_ | |
| ||____| |____|| || | |________.' | || | |________.' | || | |_________| | || | |____| |___| | |
| | | || | | || | | || | | || | | |
| '--------------' || '--------------' || '--------------' || '--------------' || '--------------' |
'----------------' '----------------' '----------------' '----------------' '----------------'
_____ __ ____ ____ ____ ___ ____ _____ __ ____ ____ ____ ___ ____
version : 2.0
""")
cpass = configparser.RawConfigParser()
cpass.read('config.data')
try:
api_id = cpass['cred']['id']
api_hash = cpass['cred']['hash']
phone = cpass['cred']['phone']
client = TelegramClient(phone, api_id, api_hash)
except KeyError:
os.system('clear')
banner()
print(re+"[!] run python3 setup.py first !!\n")
sys.exit(1)
client.connect()
if not client.is_user_authorized():
client.send_code_request(phone)
os.system('clear')
banner()
client.sign_in(phone, input(gr+'[+] Enter the code: '+re))
os.system('clear')
banner()
input_file = sys.argv[1]
users = []
with open(input_file, encoding='UTF-8') as f:
rows = csv.reader(f,delimiter=",",lineterminator="\n")
next(rows, None)
for row in rows:
user = {}
user['username'] = row[0]
user['id'] = int(row[1])
user['access_hash'] = int(row[2])
user['name'] = row[3]
users.append(user)
chats = []
last_date = None
chunk_size = 200
groups=[]
result = client(GetDialogsRequest(
offset_date=last_date,
offset_id=0,
offset_peer=InputPeerEmpty(),
limit=chunk_size,
hash = 0
))
chats.extend(result.chats)
for chat in chats:
try:
if chat.megagroup== False:
groups.append(chat)
except:
continue
i=0
for group in groups:
print(gr+'['+cy+str(i)+gr+']'+cy+' - '+group.title)
i+=1
print(gr+'[+] Choose a group to add members')
g_index = input(gr+"[+] Enter a Number : "+re)
target_group=groups[int(g_index)]
target_group_entity = InputPeerChannel(target_group.id,target_group.access_hash)
print(gr+"[1] add member by user ID\n[2] add member by username ")
mode = int(input(gr+"Input : "+re))
n = 0
for user in users:
n += 1
if n % 50 == 0:
time.sleep(1)
try:
print ("Adding {}".format(user['id']))
if mode == 1:
if user['username'] == "":
continue
user_to_add = client.get_input_entity(user['username'])
elif mode == 2:
user_to_add = InputPeerUser(user['id'], user['access_hash'])
else:
sys.exit(re+"[!] Invalid Mode Selected. Please Try Again.")
client(InviteToChannelRequest(target_group_entity,[user_to_add]))
print(gr+"[+] Waiting for 2-10 Seconds...")
time.sleep(random.randrange(2, 10))
except FloodWaitError:
print(re+"[!] Getting Flood Error from telegram. \n[!] Script is stopping now. \n[!] Please try again after some time.")
except UserPrivacyRestrictedError:
print(re+"[!] The user's privacy settings do not allow you to do this. Skipping.")
except:
traceback.print_exc()
print(re+"[!] Unexpected Error")
continue
It works but partly I can hardly add 1-10 user at a time and I shows errors some of adding proccess
Kindly I tried most thing command says it needs much time but timer doesnt seem effect on it even I add some.Any suggestions any helps ?
Adding 1456428294
[!] Getting FloodWaitError from telegram.
[!] Script is stopping now.
[!] Please try again after some time.
FloodWaitError (420)
the same request was repeated many times. Must wait .seconds (you can access this attribute). For example:
from telethon import errors
try:
messages = await client.get_messages(chat)
print(messages[0].text)
except errors.FloodWaitError as e:
print('Have to sleep', e.seconds, 'seconds')
time.sleep(e.seconds)
Read the documentation:
https://docs.telethon.dev/en/latest/concepts/errors.html

How to split a column by using length split and MaxSplit in Pyspark dataframe?

For Example
If I have a Column as given below by calling and showing the CSV in Pyspark
+--------+
| Names|
+--------+
|Rahul |
|Ravi |
|Raghu |
|Romeo |
+--------+
if I specify in my functions as Such
Length = 2
Maxsplit = 3
Then I have to get the results as
+----------+-----------+----------+
|Col_1 |Col_2 |Col_3 |
+----------+-----------+----------+
| Ra | hu | l |
| Ra | vi | Null |
| Ra | gh | u |
| Ro | me | o |
+----------+-----------+----------+
Simirarly in Pyspark
Length = 3
Max split = 2 it should provide me the output such as
+----------+-----------+
|Col_1 |Col_2 |
+----------+-----------+
| Rah | ul |
| Rav | i |
| Rag | hu |
| Rom | eo |
+----------+-----------+
This is how it should look like, Thank you
Another way to go about this. Should be faster than any looping or udf solution.
from pyspark.sql import functions as F
def split(df,length,maxsplit):
return df.withColumn('Names',F.split("Names","(?<=\\G{})".format('.'*length)))\
.select(*((F.col("Names")[x]).alias("Col_"+str(x+1)) for x in range(0,maxsplit)))
split(df,3,2).show()
#+-----+-----+
#|Col_1|Col_2|
#+-----+-----+
#| Rah| ul|
#| Rav| i|
#| Rag| hu|
#| Rom| eo|
#+-----+-----+
split(df,2,3).show()
#+-----+-----+-----+
#|col_1|col_2|col_3|
#+-----+-----+-----+
#| Ra| hu| l|
#| Ra| vi| |
#| Ra| gh| u|
#| Ro| me| o|
#+-----+-----+-----+
Try this,
import pyspark.sql.functions as F
tst = sqlContext.createDataFrame([("Raghu",1),("Ravi",2),("Rahul",3)],schema=["Name","val"])
def fn (split,max_n,tst):
for i in range(max_n):
tst_loop=tst.withColumn("coln"+str(i),F.substring(F.col("Name"),(i*split)+1,split))
tst=tst_loop
return(tst)
tst_res = fn(3,2,tst)
The for loop can also replaced by a list comprehension or reduce, but i felt in you case, a for loop looked neater. they have the same physical plan anyway.
The results
+-----+---+-----+-----+
| Name|val|coln0|coln1|
+-----+---+-----+-----+
|Raghu| 1| Rag| hu|
| Ravi| 2| Rav| i|
|Rahul| 3| Rah| ul|
+-----+---+-----+-----+
Try this
def split(data,length,maxSplit):
start=1
for i in range(0,maxSplit):
data = data.withColumn(f'col_{start}-{start+length-1}',f.substring('channel',start,length))
start=length+1
return data
df = split(data,3,2)
df.show()
+--------+----+-------+-------+
| channel|type|col_1-3|col_4-6|
+--------+----+-------+-------+
| web| 0| web| |
| web| 1| web| |
| web| 2| web| |
| twitter| 0| twi| tte|
| twitter| 1| twi| tte|
|facebook| 0| fac| ebo|
|facebook| 1| fac| ebo|
|facebook| 2| fac| ebo|
+--------+----+-------+-------+
Perhaps this is useful-
Load the test data
Note: written in scala
val Length = 2
val Maxsplit = 3
val df = Seq("Rahul", "Ravi", "Raghu", "Romeo").toDF("Names")
df.show(false)
/**
* +-----+
* |Names|
* +-----+
* |Rahul|
* |Ravi |
* |Raghu|
* |Romeo|
* +-----+
*/
split the string col as per the length and offset
val schema = StructType(Range(1, Maxsplit + 1).map(f => StructField(s"Col_$f", StringType)))
val split = udf((str:String, length: Int, maxSplit: Int) =>{
val splits = str.toCharArray.grouped(length).map(_.mkString).toArray
RowFactory.create(splits ++ Array.fill(maxSplit-splits.length)(null): _*)
}, schema)
val p = df
.withColumn("x", split($"Names", lit(Length), lit(Maxsplit)))
.selectExpr("x.*")
p.show(false)
p.printSchema()
/**
* +-----+-----+-----+
* |Col_1|Col_2|Col_3|
* +-----+-----+-----+
* |Ra |hu |l |
* |Ra |vi |null |
* |Ra |gh |u |
* |Ro |me |o |
* +-----+-----+-----+
*
* root
* |-- Col_1: string (nullable = true)
* |-- Col_2: string (nullable = true)
* |-- Col_3: string (nullable = true)
*/
Dataset[Row] -> Dataset[Array[String]]
val x = df.map(r => {
val splits = r.getString(0).toCharArray.grouped(Length).map(_.mkString).toArray
splits ++ Array.fill(Maxsplit-splits.length)(null)
})
x.show(false)
x.printSchema()
/**
* +-----------+
* |value |
* +-----------+
* |[Ra, hu, l]|
* |[Ra, vi,] |
* |[Ra, gh, u]|
* |[Ro, me, o]|
* +-----------+
*
* root
* |-- value: array (nullable = true)
* | |-- element: string (containsNull = true)
*/

Spark dataframe inner join without duplicate match

I want to join two dataframes based on certain condition is spark scala. However the catch is if row in df1 matches any row in df2, it should not try to match same row of df1 with any other row in df2. Below is sample data and outcome I am trying to get.
DF1
--------------------------------
Emp_id | Emp_Name | Address_id
1 | ABC | 1
2 | DEF | 2
3 | PQR | 3
4 | XYZ | 1
DF2
-----------------------
Address_id | City
1 | City_1
1 | City_2
2 | City_3
REST | Some_City
Output DF
----------------------------------------
Emp_id | Emp_Name | Address_id | City
1 | ABC | 1 | City_1
2 | DEF | 2 | City_3
3 | PQR | 3 | Some_City
4 | XYZ | 1 | City_1
Note:- REST is like wild card. Any value can be equal to REST.
So in above sample emp_name "ABC" can match with City_1, City_2 or Some_City. Output DF contains only City_1 because it finds it first.
You seem to have a custom logic for your join. Basically I've been to come up with the below UDF.
Note that you may want to change the logic for the UDF as per your requirement.
import spark.implicits._
import org.apache.spark.sql.functions.to_timestamp
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.functions.first
//dataframe 1
val df_1 = Seq(("1", "ABC", "1"), ("2", "DEF", "2"), ("3", "PQR", "3"), ("4", "XYZ", "1")).toDF("Emp_Id", "Emp_Name", "Address_Id")
//dataframe 2
val df_2 = Seq(("1", "City_1"), ("1", "City_2"), ("2", "City_3"), ("REST","Some_City")).toDF("Address_Id", "City_Name")
// UDF logic
val join_udf = udf((a: String, b: String) => {
(a,b) match {
case ("1", "1") => true
case ("1", _) => false
case ("2", "2") => true
case ("2", _) => false
case(_, "REST") => true
case(_, _) => false
}})
val dataframe_join = df_1.join(df_2, join_udf(df_1("Address_Id"), df_2("Address_Id")), "inner").drop(df_2("Address_Id"))
.orderBy($"City_Name")
.groupBy($"Emp_Id", $"Emp_Name", $"Address_Id")
.agg(first($"City_Name"))
.orderBy($"Emp_Id")
dataframe_join.show(false)
Basically post applying UDF, what you get is all possible combinations of the matches.
Post that when you apply groupBy and make use of first function of agg, you would only get the filtered values as what you are looking for.
+------+--------+----------+-----------------------+
|Emp_Id|Emp_Name|Address_Id|first(City_Name, false)|
+------+--------+----------+-----------------------+
|1 |ABC |1 |City_1 |
|2 |DEF |2 |City_3 |
|3 |PQR |3 |Some_City |
|4 |XYZ |1 |City_1 |
+------+--------+----------+-----------------------+
Note that I've made use of Spark 2.3 and hope this helps!
{
import org.apache.spark.sql.{SparkSession}
import org.apache.spark.sql.functions._
object JoinTwoDataFrame extends App {
val spark = SparkSession.builder()
.master("local")
.appName("DataFrame-example")
.getOrCreate()
import spark.implicits._
val df1 = Seq(
(1, "ABC", "1"),
(2, "DEF", "2"),
(3, "PQR", "3"),
(4, "XYZ", "1")
).toDF("Emp_id", "Emp_Name", "Address_id")
val df2 = Seq(
("1", "City_1"),
("1", "City_2"),
("2", "City_3"),
("REST", "Some_City")
).toDF("Address_id", "City")
val restCity: Option[String] = Some(df2.filter('Address_id.equalTo("REST")).select('City).first()(0).toString)
val res = df1.join(df2, df1.col("Address_id") === df2.col("Address_id") , "left_outer")
.select(
df1.col("Emp_id"),
df1.col("Emp_Name"),
df1.col("Address_id"),
df2.col("City")
)
.withColumn("city2", when('City.isNotNull, 'City).otherwise(restCity.getOrElse("")))
.drop("City")
.withColumnRenamed("city2", "City")
.orderBy("Address_id", "City")
.groupBy("Emp_id", "Emp_Name", "Address_id")
.agg(collect_list("City").alias("cityList"))
.withColumn("City", 'cityList.getItem(0))
.drop("cityList")
.orderBy("Emp_id")
res.show(false)
// +------+--------+----------+---------+
// |Emp_id|Emp_Name|Address_id|City |
// +------+--------+----------+---------+
// |1 |ABC |1 |City_1 |
// |2 |DEF |2 |City_3 |
// |3 |PQR |3 |Some_City|
// |4 |XYZ |1 |City_1 |
// +------+--------+----------+---------+
}
}

How to extract values from key value map?

I have a column of type map, where the key and value changes. I am trying to extract the value and create a new column.
Input:
----------------+
|symbols |
+---------------+
|[3pea -> 3PEA] |
|[barello -> BA]|
|[] |
|[] |
+---------------+
Expected output:
--------+
|symbols|
+-------+
|3PEA |
|BA |
| |
| |
+-------+
Here is what I tried so far using a udf:
def map_value=udf((inputMap:Map[String,String])=> {inputMap.map(x=>x._2)
})
java.lang.UnsupportedOperationException: Schema for type scala.collection.immutable.Iterable[String] is not supported
Since Spark scala v2.3 api, sql v2.3 api, or pyspark v2.4 api you can use the spark sql function map_values
The following is in pyspark, scala would be very similar.
Setup (assuming working SparkSession as spark):
from pyspark.sql import functions as F
df = (
spark.read.json(sc.parallelize(["""[
{"key": ["3pea"], "value": ["3PEA"] },
{"key": ["barello"], "value": ["BA"] }
]"""]))
.select(F.map_from_arrays(F.col("key"), F.col("value")).alias("symbols") )
)
df.printSchema()
df.show()
root
|-- symbols: map (nullable = true)
| |-- key: string
| |-- value: string (valueContainsNull = true)
+---------------+
| symbols|
+---------------+
| [3pea -> 3PEA]|
|[barello -> BA]|
+---------------+
df.select((F.map_values(F.col("symbols"))[0]).alias("map_vals")).show()
+--------+
|map_vals|
+--------+
| 3PEA|
| BA|
+--------+
import org.apache.spark.sql.functions._
import spark.implicits._
val m = Seq(Array("A -> abc"), Array("B -> 0.11856755943424617"), Array("C -> kqcams"))
val df = m.toDF("map_data")
df.show
// Simulate your data I think.
val df2 = df.withColumn("xxx", split(concat_ws("",$"map_data"), "-> ")).select($"xxx".getItem(1).as("map_val")).drop("xxx")
df2.show(false)
results in:
+--------------------+
| map_data|
+--------------------+
| [A -> abc]|
|[B -> 0.118567559...|
| [C -> kqcams]|
+--------------------+
+-------------------+
|map_val |
+-------------------+
|abc |
|0.11856755943424617|
|kqcams |
+-------------------+

Convert Pyspark Dataframe column from array to new columns

I've a Pyspark Dataframe with this structure:
root
|-- Id: string (nullable = true)
|-- Q: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- pr: string (nullable = true)
| | |-- qt: double (nullable = true)
Something similar to:
+----+--------------------- ... --+
| Id | Q |
+----+---------------------- ... -+
| 001| [ [pr1,1.9], [pr3,2.0]...] |
| 002| [ [pr2,1.0], [pr9,3.9]...] |
| 003| [ [pr2,9.0], ... ] |
...
I wold like to convert Q array into columns (name pr value qt).
Also I would like to avoid duplicated columns by merging (add) same columns.
+----+-----+-----+------+ ... ----+
| Id | pr1 | pr2 | pr3 | ... prn |
+----+-----+-----+------+ ... ----+
| 001| 1.9 | 0.0 | 2.0 | ... |
| 002| 0.0 | 1.0 | 0 | ... |
| 003| 0.0 | 9.0 | ... | ... |
...
How can I perform this transformation?.
Thakyou in advance!!.
Julián.
You can do this with a combination of explode and pivot:
import pyspark.sql.functions as F
# explode to get "long" format
df=df.withColumn('exploded', F.explode('Q'))
# get the name and the name in separate columns
df=df.withColumn('name', F.col('exploded').getItem(0))
df=df.withColumn('value', F.col('exploded').getItem(1))
# now pivot
df.groupby('Id').pivot('name').agg(F.max('value')).na.fill(0)
Very interesting question. This is how I approached it.
test.csv
001,pr1:0.9,pr3:1.2,pr2:2.0
002,pr3:5.2,pr4:0.99
Pyspark
file = sc.textFile("file:///test2.csv")
//get it in (key,value)
//[(u'001', u'pr1:0.9')...]
//rdd1 = file.map(lambda r: r.replace(",","\t",1)).map(lambda r: r.split("\t")).map(lambda r: (r[0],r[1])).flatMapValues(lambda r: r.split(','))
rdd1 = file.map(lambda r: r.split(",")[0]).map(lambda r: (r[0],r[1])).flatMapValues(lambda r: r.split(','))
//create a DF with 3 columns
//[(u'001', u'pr1', u'0.9')...)]
+---+---+----+
| _1| _2| _3|
+---+---+----+
|001|pr1| 0.9|
|001|pr3| 1.2|
|001|pr2| 2.0|
|002|pr3| 5.2|
|002|pr4|0.99|
+---+---+----+
rdd2 = rdd1.map(lambda r: (r[0],r[1].split(":"))).map(lambda r: (r[0],r[1][0],r[1][1]))
df = rdd2.toDF()
//Perform the magic
df.groupBy("_1").pivot("_2").agg(expr("coalesce(first(_3),0)"))
+---+---+---+---+----+
| _1|pr1|pr2|pr3| pr4|
+---+---+---+---+----+
|001|0.9|2.0|1.2| 0|
|002| 0| 0|5.2|0.99|
+---+---+---+---+----+